1structure UTF8 :> UTF8 = 2struct 3 4exception BadUTF8 of string 5 6val two11 = 2048 (* 2 ^ 11 *) 7val two16 = 65536 (* 2 ^ 16 *) 8val Umax = 0x10FFFF (* maximum Unicode code point *) 9 10fun chr i = 11 if i < 0 then raise Chr 12 else if i < 128 then str (Char.chr i) 13 else if i < two11 then let 14 val w = Word.fromInt i 15 val byte2 = 128 + Word.toInt (Word.andb(w, 0wx3F)) 16 val byte1 = 0xC0 + Word.toInt (Word.>>(w,0w6)) 17 in 18 String.implode [Char.chr byte1, Char.chr byte2] 19 end 20 else if i < two16 then let 21 val w = Word.fromInt i 22 val byte3 = 128 + Word.toInt (Word.andb(w, 0wx3F)) (* 3F = 6 bits *) 23 val w = Word.>>(w,0w6) 24 val byte2 = 128 + Word.toInt (Word.andb(w, 0wx3F)) (* 3F = 6 bits *) 25 val w = Word.>>(w,0w6) 26 val byte1 = 0xE0 + Word.toInt (Word.andb(w, 0wxF)) 27 (* inital E says there are 3 bytes, and with F to extract 4 bits *) 28 in 29 String.implode (map Char.chr [byte1, byte2, byte3]) 30 end 31 else if i <= Umax then let 32 val w = Word.fromInt i 33 val byte4 = 128 + Word.toInt (Word.andb(w, 0wx3F)) (* 3F = 6 bits *) 34 val w = Word.>>(w,0w6) 35 val byte3 = 128 + Word.toInt (Word.andb(w, 0wx3F)) (* 3F = 6 bits *) 36 val w = Word.>>(w,0w6) 37 val byte2 = 128 + Word.toInt (Word.andb(w, 0wx3F)) (* 3F = 6 bits *) 38 val w = Word.>>(w,0w6) 39 val byte1 = 0xF0 + Word.toInt (Word.andb(w, 0wx7)) 40 (* inital F says there are 4 bytes, and with 7 to extract 3 bits *) 41 in 42 String.implode (map Char.chr [byte1, byte2, byte3, byte4]) 43 end 44 else raise Chr 45 46fun byte1_count c = let 47 fun recurse acc b = if b > 0w127 then recurse (acc + 1) (Word8.<<(b,0w1)) 48 else acc 49in 50 recurse 0 (Word8.fromInt (Char.ord c)) 51end 52 53fun isCont_char c = let val i = Char.ord c in 128 <= i andalso i < 192 end 54 55fun pow2 i = Word.toInt (Word.<<(0w1, Word.fromInt i)) 56 57fun getChar s = let 58 fun rangeCheck cnt (res as ((_, i), _)) = 59 if case cnt of 2 => 0x80 <= i 60 | 3 => 0x800 <= i 61 | 4 => 0x10000 <= i andalso i <= Umax 62 | _ => false 63 then res 64 else raise BadUTF8 s 65 open Substring 66 fun ucontinue acc pos limit ss = 67 if pos = limit then let 68 val (p,s) = splitAt (ss, limit) 69 in 70 SOME((string p, acc), string s) 71 end 72 else let 73 val pos_c = sub(ss, pos) 74 handle Subscript => raise BadUTF8 (string (slice(ss,0,SOME pos))) 75 in 76 if isCont_char pos_c then 77 ucontinue (acc * 64 + Char.ord pos_c - 128) (pos + 1) limit ss 78 else raise BadUTF8 (string (slice(ss,0,SOME (pos + 1)))) 79 end 80 fun recurse ss = 81 case getc ss of 82 NONE => NONE 83 | SOME (c, ss') => let 84 val i = Char.ord c 85 in 86 if i < 128 then SOME((str c, i), string ss') 87 else let 88 val cnt = byte1_count c 89 in 90 if cnt = 1 then raise BadUTF8 (str c) 91 else 92 Option.map 93 (rangeCheck cnt) 94 (ucontinue (i + pow2 (8 - cnt) - 256) 1 cnt ss) 95 end 96 end 97in 98 recurse (full s) 99end 100 101fun size s = let 102 open Substring 103 val ss = full s 104 val sz = size ss 105 fun recurse acc pos = 106 if pos = sz then acc 107 else let 108 val c = sub(ss,pos) 109 in 110 if Char.ord c < 128 then recurse (acc + 1) (pos + 1) 111 else let 112 val bc = byte1_count c 113 in 114 check acc (pos + 1) pos (bc - 1) 115 end 116 end 117 and check acc pos start cnt = 118 if cnt = 0 then recurse (acc + 1) pos 119 else if pos = sz then 120 raise BadUTF8 (string (slice(ss,start,SOME(pos-start)))) 121 else let 122 val c = sub(ss,pos) 123 in 124 if isCont_char c then check acc (pos + 1) start (cnt - 1) 125 else raise BadUTF8 (string (slice(ss,start,SOME(pos-start)))) 126 end 127in 128 recurse 0 0 129end 130 131fun lastChar s = let 132 open Substring 133 val ss = full s 134 val lastpos = size ss - 1 135 fun goback pos = 136 if pos < 0 then raise BadUTF8 (str (sub(ss,0))) 137 else let 138 val c = sub(ss,pos) 139 in 140 if Char.ord c >= 192 then let 141 val bc = byte1_count c 142 in 143 if lastpos - pos + 1 = bc then string (slice(ss,pos,NONE)) 144 else raise BadUTF8 (string (slice(ss,pos+bc,NONE))) 145 end 146 else if isCont_char c then goback (pos - 1) 147 else raise BadUTF8 (string (slice(ss,pos+1,NONE))) 148 end 149in 150 if lastpos < 0 then NONE 151 else let 152 val c = sub(ss, lastpos) 153 in 154 if Char.ord c < 128 then SOME(str c, Char.ord c) 155 else Option.map #1 (getChar (goback (lastpos - 1))) 156 end 157end 158 159fun translate f s = let 160 fun recurse i changed acc ustr = 161 case getChar ustr of 162 NONE => if changed then String.concat (List.rev acc) 163 else s 164 | SOME ((c,code), rest) => let 165 val c' = f c 166 in 167 if c' = c andalso not changed then 168 recurse (i + 1) changed acc rest 169 else if not changed then 170 recurse i true (c' :: String.extract(s,0,SOME i)::acc) rest 171 else 172 recurse i true (c' :: acc) rest 173 end 174in 175 recurse 0 false [] s 176end 177 178fun padRight c len s = 179 let 180 val slen = size s 181 in 182 if slen > len then s 183 else s ^ CharVector.tabulate(len - slen, fn _ => c) 184 end 185 186fun substring (s,start,finish) = 187 let 188 fun recurse acc i s = 189 if i >= start andalso i < finish then 190 let 191 val ((c, _), rest) = 192 valOf (getChar s) 193 handle Option => raise Fail "Malformed UTF8 string" 194 in 195 recurse (c::acc) (i + 1) rest 196 end 197 else 198 String.concat (List.rev acc) 199 in 200 recurse [] 0 s 201 end 202 203end (* struct *) 204