1/* Title: Pure/General/utf8.scala 2 Author: Makarius 3 4Variations on UTF-8. 5*/ 6 7package isabelle 8 9 10import java.nio.charset.Charset 11import scala.io.Codec 12 13 14object UTF8 15{ 16 /* charset */ 17 18 val charset_name: String = "UTF-8" 19 val charset: Charset = Charset.forName(charset_name) 20 def codec(): Codec = Codec(charset) 21 22 def bytes(s: String): Array[Byte] = s.getBytes(charset) 23 24 25 /* permissive UTF-8 decoding */ 26 27 // see also https://en.wikipedia.org/wiki/UTF-8#Description 28 // overlong encodings enable byte-stuffing of low-ASCII 29 30 def decode_permissive(text: CharSequence): String = 31 { 32 val buf = new java.lang.StringBuilder(text.length) 33 var code = -1 34 var rest = 0 35 def flush() 36 { 37 if (code != -1) { 38 if (rest == 0 && Character.isValidCodePoint(code)) 39 buf.appendCodePoint(code) 40 else buf.append('\uFFFD') 41 code = -1 42 rest = 0 43 } 44 } 45 def init(x: Int, n: Int) 46 { 47 flush() 48 code = x 49 rest = n 50 } 51 def push(x: Int) 52 { 53 if (rest <= 0) init(x, -1) 54 else { 55 code <<= 6 56 code += x 57 rest -= 1 58 } 59 } 60 for (i <- 0 until text.length) { 61 val c = text.charAt(i) 62 if (c < 128) { flush(); buf.append(c) } 63 else if ((c & 0xC0) == 0x80) push(c & 0x3F) 64 else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) 65 else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) 66 else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) 67 } 68 flush() 69 buf.toString 70 } 71 72 private class Decode_Chars(decode: String => String, 73 buffer: Array[Byte], start: Int, end: Int) extends CharSequence 74 { 75 def length: Int = end - start 76 def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char] 77 def subSequence(i: Int, j: Int): CharSequence = 78 new Decode_Chars(decode, buffer, start + i, start + j) 79 80 // toString with adhoc decoding: abuse of CharSequence interface 81 override def toString: String = decode(decode_permissive(this)) 82 } 83 84 def decode_chars(decode: String => String, 85 buffer: Array[Byte], start: Int, end: Int): CharSequence = 86 { 87 require(0 <= start && start <= end && end <= buffer.length) 88 new Decode_Chars(decode, buffer, start, end) 89 } 90} 91