1/*  Title:      Pure/General/utf8.scala
2    Author:     Makarius
3
4Variations on UTF-8.
5*/
6
7package isabelle
8
9
10import java.nio.charset.Charset
11import scala.io.Codec
12
13
14object UTF8
15{
16  /* charset */
17
18  val charset_name: String = "UTF-8"
19  val charset: Charset = Charset.forName(charset_name)
20  def codec(): Codec = Codec(charset)
21
22  def bytes(s: String): Array[Byte] = s.getBytes(charset)
23
24
25  /* permissive UTF-8 decoding */
26
27  // see also https://en.wikipedia.org/wiki/UTF-8#Description
28  // overlong encodings enable byte-stuffing of low-ASCII
29
30  def decode_permissive(text: CharSequence): String =
31  {
32    val buf = new java.lang.StringBuilder(text.length)
33    var code = -1
34    var rest = 0
35    def flush()
36    {
37      if (code != -1) {
38        if (rest == 0 && Character.isValidCodePoint(code))
39          buf.appendCodePoint(code)
40        else buf.append('\uFFFD')
41        code = -1
42        rest = 0
43      }
44    }
45    def init(x: Int, n: Int)
46    {
47      flush()
48      code = x
49      rest = n
50    }
51    def push(x: Int)
52    {
53      if (rest <= 0) init(x, -1)
54      else {
55        code <<= 6
56        code += x
57        rest -= 1
58      }
59    }
60    for (i <- 0 until text.length) {
61      val c = text.charAt(i)
62      if (c < 128) { flush(); buf.append(c) }
63      else if ((c & 0xC0) == 0x80) push(c & 0x3F)
64      else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1)
65      else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2)
66      else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3)
67    }
68    flush()
69    buf.toString
70  }
71
72  private class Decode_Chars(decode: String => String,
73    buffer: Array[Byte], start: Int, end: Int) extends CharSequence
74  {
75    def length: Int = end - start
76    def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char]
77    def subSequence(i: Int, j: Int): CharSequence =
78      new Decode_Chars(decode, buffer, start + i, start + j)
79
80    // toString with adhoc decoding: abuse of CharSequence interface
81    override def toString: String = decode(decode_permissive(this))
82  }
83
84  def decode_chars(decode: String => String,
85    buffer: Array[Byte], start: Int, end: Int): CharSequence =
86  {
87    require(0 <= start && start <= end && end <= buffer.length)
88    new Decode_Chars(decode, buffer, start, end)
89  }
90}
91