1structure UTF8 :> UTF8 =
2struct
3
4exception BadUTF8 of string
5
6val two11 = 2048  (* 2 ^ 11 *)
7val two16 = 65536 (* 2 ^ 16 *)
8val Umax = 0x10FFFF (* maximum Unicode code point *)
9
10fun chr i =
11    if i < 0 then raise Chr
12    else if i < 128 then str (Char.chr i)
13    else if i < two11 then let
14        val w = Word.fromInt i
15        val byte2 = 128 + Word.toInt (Word.andb(w, 0wx3F))
16        val byte1 = 0xC0 + Word.toInt (Word.>>(w,0w6))
17      in
18        String.implode [Char.chr byte1, Char.chr byte2]
19      end
20    else if i < two16 then let
21        val w = Word.fromInt i
22        val byte3 = 128 + Word.toInt (Word.andb(w, 0wx3F))  (* 3F = 6 bits *)
23        val w = Word.>>(w,0w6)
24        val byte2 = 128 + Word.toInt (Word.andb(w, 0wx3F))  (* 3F = 6 bits *)
25        val w = Word.>>(w,0w6)
26        val byte1 = 0xE0 + Word.toInt (Word.andb(w, 0wxF))
27           (* inital E says there are 3 bytes, and with F to extract 4 bits *)
28      in
29        String.implode (map Char.chr [byte1, byte2, byte3])
30      end
31    else if i <= Umax then let
32        val w = Word.fromInt i
33        val byte4 = 128 + Word.toInt (Word.andb(w, 0wx3F))  (* 3F = 6 bits *)
34        val w = Word.>>(w,0w6)
35        val byte3 = 128 + Word.toInt (Word.andb(w, 0wx3F))  (* 3F = 6 bits *)
36        val w = Word.>>(w,0w6)
37        val byte2 = 128 + Word.toInt (Word.andb(w, 0wx3F))  (* 3F = 6 bits *)
38        val w = Word.>>(w,0w6)
39        val byte1 = 0xF0 + Word.toInt (Word.andb(w, 0wx7))
40           (* inital F says there are 4 bytes, and with 7 to extract 3 bits *)
41      in
42        String.implode (map Char.chr [byte1, byte2, byte3, byte4])
43      end
44    else raise Chr
45
46fun byte1_count c = let
47  fun recurse acc b = if b > 0w127 then recurse (acc + 1) (Word8.<<(b,0w1))
48                      else acc
49in
50  recurse 0 (Word8.fromInt (Char.ord c))
51end
52
53fun isCont_char c = let val i = Char.ord c in 128 <= i andalso i < 192 end
54
55fun pow2 i = Word.toInt (Word.<<(0w1, Word.fromInt i))
56
57fun getChar s = let
58  fun rangeCheck cnt (res as ((_, i), _)) =
59    if case cnt of 2 => 0x80 <= i
60                 | 3 => 0x800 <= i
61                 | 4 => 0x10000 <= i andalso i <= Umax
62                 | _ => false
63    then res
64    else raise BadUTF8 s
65  open Substring
66  fun ucontinue acc pos limit ss =
67      if pos = limit then let
68          val (p,s) = splitAt (ss, limit)
69        in
70          SOME((string p, acc), string s)
71        end
72      else let
73          val pos_c = sub(ss, pos)
74              handle Subscript => raise BadUTF8 (string (slice(ss,0,SOME pos)))
75        in
76          if isCont_char pos_c then
77            ucontinue (acc * 64 + Char.ord pos_c - 128) (pos + 1) limit ss
78          else raise BadUTF8 (string (slice(ss,0,SOME (pos + 1))))
79        end
80  fun recurse ss =
81      case getc ss of
82        NONE => NONE
83      | SOME (c, ss') => let
84          val i = Char.ord c
85        in
86          if i < 128 then SOME((str c, i), string ss')
87          else let
88              val cnt = byte1_count c
89            in
90              if cnt = 1 then raise BadUTF8 (str c)
91              else
92                Option.map
93                    (rangeCheck cnt)
94                    (ucontinue (i + pow2 (8 - cnt) - 256) 1 cnt ss)
95            end
96        end
97in
98  recurse (full s)
99end
100
101fun size s = let
102  open Substring
103  val ss = full s
104  val sz = size ss
105  fun recurse acc pos =
106      if pos = sz then acc
107      else let
108          val c = sub(ss,pos)
109        in
110          if Char.ord c < 128 then recurse (acc + 1) (pos + 1)
111          else let
112              val bc = byte1_count c
113            in
114              check acc (pos + 1) pos (bc - 1)
115            end
116        end
117  and check acc pos start cnt =
118      if cnt = 0 then recurse (acc + 1) pos
119      else if pos = sz then
120        raise BadUTF8 (string (slice(ss,start,SOME(pos-start))))
121      else let
122          val c = sub(ss,pos)
123        in
124          if isCont_char c then check acc (pos + 1) start (cnt - 1)
125          else raise BadUTF8 (string (slice(ss,start,SOME(pos-start))))
126        end
127in
128  recurse 0 0
129end
130
131fun lastChar s = let
132  open Substring
133  val ss = full s
134  val lastpos = size ss - 1
135  fun goback pos =
136      if pos < 0 then raise BadUTF8 (str (sub(ss,0)))
137      else let
138          val c = sub(ss,pos)
139        in
140          if Char.ord c >= 192 then let
141              val bc = byte1_count c
142            in
143              if lastpos - pos + 1 = bc then string (slice(ss,pos,NONE))
144              else raise BadUTF8 (string (slice(ss,pos+bc,NONE)))
145            end
146          else if isCont_char c then goback (pos - 1)
147          else raise BadUTF8 (string (slice(ss,pos+1,NONE)))
148        end
149in
150  if lastpos < 0 then NONE
151  else let
152      val c = sub(ss, lastpos)
153    in
154      if Char.ord c < 128 then SOME(str c, Char.ord c)
155      else Option.map #1 (getChar (goback (lastpos - 1)))
156    end
157end
158
159fun translate f s = let
160  fun recurse i changed acc ustr =
161      case getChar ustr of
162        NONE => if changed then String.concat (List.rev acc)
163                else s
164      | SOME ((c,code), rest) => let
165          val c' = f c
166        in
167          if c' = c andalso not changed then
168            recurse (i + 1) changed acc rest
169          else if not changed then
170            recurse i true (c' :: String.extract(s,0,SOME i)::acc) rest
171          else
172            recurse i true (c' :: acc) rest
173        end
174in
175  recurse 0 false [] s
176end
177
178fun padRight c len s =
179  let
180    val slen = size s
181  in
182    if slen > len then s
183    else s ^ CharVector.tabulate(len - slen, fn _ => c)
184  end
185
186fun substring (s,start,finish) =
187  let
188    fun recurse acc i s =
189      if i >= start andalso i < finish then
190        let
191          val ((c, _), rest) =
192              valOf (getChar s)
193              handle Option => raise Fail "Malformed UTF8 string"
194        in
195          recurse (c::acc) (i + 1) rest
196        end
197      else
198        String.concat (List.rev acc)
199  in
200    recurse [] 0 s
201  end
202
203end (* struct *)
204