package better.files

import java.nio.{BufferOverflowException, ByteBuffer, CharBuffer}
import java.nio.charset._
import scala.jdk.CollectionConverters._

/** A Unicode charset that handles byte-order markers
  *
  * @param underlyingCharset     Use this charset if no known byte-order marker is detected; use this for encoding too
  * @param writeByteOrderMarkers If set, write BOMs while encoding
  */
class UnicodeCharset(underlyingCharset: Charset, writeByteOrderMarkers: Boolean)
    extends Charset(underlyingCharset.name(), underlyingCharset.aliases().asScala.toArray) {
  override def newDecoder() = new UnicodeDecoder(underlyingCharset)
  override def newEncoder() =
    if (writeByteOrderMarkers) new BomEncoder(underlyingCharset) else underlyingCharset.newEncoder()
  override def contains(cs: Charset) = underlyingCharset.contains(cs)
}
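
// Usage sketch (illustrative only, not part of the original file): wrapping a charset so that reads
// honour a leading BOM. The file name is a made-up placeholder; the reader decodes UTF-16/UTF-32
// input when the file starts with a recognised BOM and falls back to UTF-8 otherwise.
//
//   import java.io.{FileInputStream, InputStreamReader}
//   val bomAware = new UnicodeCharset(Charset.forName("UTF-8"), writeByteOrderMarkers = false)
//   val reader   = new InputStreamReader(new FileInputStream("input.txt"), bomAware.newDecoder())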

/** A Unicode decoder that uses the Unicode byte-order marker (BOM) to auto-detect the encoding
  * (if none detected, falls back on the defaultCharset). This also gets around a bug in the JDK
  * (http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058) where BOM is not consumed for UTF-8.
  * See: https://github.com/pathikrit/better-files/issues/107
  *
  * @param defaultCharset Use this charset if no known byte-order marker is detected
  */
class UnicodeDecoder(defaultCharset: Charset) extends CharsetDecoder(defaultCharset, 1, 1) {
  import UnicodeCharset.bomTable

  private[this] var inferredCharset: Option[Charset] = None

  @annotation.tailrec
  private[this] def decode(
      in: ByteBuffer,
      out: CharBuffer,
      candidates: Set[Charset] = Set.empty,
      firstCall: Boolean
  ): CoderResult = {
    if (isCharsetDetected) {
      // Charset is already known: delegate to that charset's own decoder
      detectedCharset().newDecoder().decode(in, out, false)
    } else if (firstCall && in.position() != 0) {
      // Do not attempt BOM detection when the buffer does not start at position 0
      // See: https://github.com/pathikrit/better-files/pull/384
      inferredCharset = Some(defaultCharset)
      decode(in, out, firstCall = false)
    } else if (candidates.isEmpty || !in.hasRemaining) {
      // No BOM matched (or input ran out mid-BOM): fall back to the default charset and replay the bytes read so far
      inferredCharset = Some(defaultCharset)
      in.rewind()
      decode(in, out, firstCall = false)
    } else if (candidates.forall(c => bomTable(c).length == in.position())) {
      // Exactly one BOM has been fully consumed: that charset is the detected one
      inferredCharset = candidates.headOption.ensuring(candidates.size == 1, "Ambiguous BOMs found")
      decode(in, out, firstCall = false)
    } else {
      // Consume one more byte and keep only the candidate BOMs that still match
      val idx  = in.position()
      val byte = in.get()
      def isPossible(charset: Charset) = bomTable(charset).lift(idx).contains(byte)
      decode(in, out, candidates.filter(isPossible), firstCall = false)
    }
  }

  override def decodeLoop(in: ByteBuffer, out: CharBuffer) =
    decode(in = in, out = out, candidates = bomTable.keySet, firstCall = true)

  override def isCharsetDetected = inferredCharset.isDefined

  override def isAutoDetecting = true

  override def implReset() = inferredCharset = None

  override def detectedCharset() =
    inferredCharset.getOrElse(throw new IllegalStateException("Insufficient bytes read to determine charset"))
}
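
// Detection sketch (illustrative only, not part of the original file): the bytes below are the
// UTF-16BE BOM (0xfe 0xff) followed by the UTF-16BE encoding of 'A'. The decoder consumes the BOM,
// decodes the remaining bytes as UTF-16BE, and reports the detected charset.
//
//   val bytes   = ByteBuffer.wrap(Array(0xfe, 0xff, 0x00, 0x41).map(_.toByte))
//   val decoder = new UnicodeDecoder(Charset.forName("UTF-8"))
//   decoder.decode(bytes).toString  // "A"
//   decoder.detectedCharset()       // UTF-16BE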

/** Encoder that writes the BOM for this charset */
class BomEncoder(charset: Charset) extends CharsetEncoder(charset, 1, 1) {
  private[this] val bom = UnicodeCharset.bomTable
    .getOrElse(charset, throw new IllegalArgumentException(s"$charset does not support BOMs"))
    .toArray

  private[this] var isBomWritten = false

  override def encodeLoop(in: CharBuffer, out: ByteBuffer): CoderResult = {
    if (!isBomWritten) {
      try {
        out.put(bom): Unit
      } catch {
        case _: BufferOverflowException => return CoderResult.OVERFLOW
      } finally {
        isBomWritten = true
      }
    }
    charset.newEncoder().encode(in, out, true)
  }

  override def implReset() = isBomWritten = false
}
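
// Encoding sketch (illustrative only, not part of the original file): writing through a charset
// obtained from UnicodeCharset(..., writeByteOrderMarkers = true) routes output through BomEncoder,
// so the stream starts with the UTF-8 BOM (0xef 0xbb 0xbf) followed by the UTF-8 bytes of the text.
//
//   import java.io.{ByteArrayOutputStream, OutputStreamWriter}
//   val out    = new ByteArrayOutputStream()
//   val writer = new OutputStreamWriter(out, UnicodeCharset(Charset.forName("UTF-8"), writeByteOrderMarkers = true))
//   writer.write("hi")
//   writer.close()
//   out.toByteArray  // 0xef 0xbb 0xbf 0x68 0x69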

object UnicodeCharset {
  /** Charset -> BOM bytes, restricted to the charsets supported by this JVM */
  private[files] val bomTable: Map[Charset, IndexedSeq[Byte]] = Map(
    "UTF-8"    -> IndexedSeq(0xef, 0xbb, 0xbf),
    "UTF-16BE" -> IndexedSeq(0xfe, 0xff),
    "UTF-16LE" -> IndexedSeq(0xff, 0xfe),
    "UTF-32BE" -> IndexedSeq(0x00, 0x00, 0xfe, 0xff),
    "UTF-32LE" -> IndexedSeq(0xff, 0xfe, 0x00, 0x00)
  ).collect {
    case (charset, bytes) if Charset.isSupported(charset) => Charset.forName(charset) -> bytes.map(_.toByte)
  }.ensuring(_.nonEmpty, "No unicode charset detected")

  def isValid(charset: Charset): Boolean = bomTable.contains(charset)

  def apply(charset: Charset, writeByteOrderMarkers: Boolean = false): Charset =
    if (isValid(charset)) new UnicodeCharset(charset, writeByteOrderMarkers)
    else charset
}
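
// Factory sketch (illustrative only, not part of the original file): UnicodeCharset(...) only wraps
// charsets that actually have a BOM; anything else is returned unchanged.
//
//   UnicodeCharset(Charset.forName("UTF-8"))      // BOM-aware wrapper around UTF-8
//   UnicodeCharset(Charset.forName("US-ASCII"))   // returned as-is: US-ASCII has no BOM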