1------------------------------------------------------------------------------ 2-- -- 3-- GNAT RUN-TIME COMPONENTS -- 4-- -- 5-- S Y S T E M . W C H _ C N V -- 6-- -- 7-- B o d y -- 8-- -- 9-- Copyright (C) 1992-2013, Free Software Foundation, Inc. -- 10-- -- 11-- GNAT is free software; you can redistribute it and/or modify it under -- 12-- terms of the GNU General Public License as published by the Free Soft- -- 13-- ware Foundation; either version 3, or (at your option) any later ver- -- 14-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 15-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 16-- or FITNESS FOR A PARTICULAR PURPOSE. -- 17-- -- 18-- As a special exception under Section 7 of GPL version 3, you are granted -- 19-- additional permissions described in the GCC Runtime Library Exception, -- 20-- version 3.1, as published by the Free Software Foundation. -- 21-- -- 22-- You should have received a copy of the GNU General Public License and -- 23-- a copy of the GCC Runtime Library Exception along with this program; -- 24-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 25-- <http://www.gnu.org/licenses/>. -- 26-- -- 27-- GNAT was originally developed by the GNAT team at New York University. -- 28-- Extensive contributions were provided by Ada Core Technologies Inc. -- 29-- -- 30------------------------------------------------------------------------------ 31 32pragma Compiler_Unit_Warning; 33 34with Interfaces; use Interfaces; 35with System.WCh_Con; use System.WCh_Con; 36with System.WCh_JIS; use System.WCh_JIS; 37 38package body System.WCh_Cnv is 39 40 ----------------------------- 41 -- Char_Sequence_To_UTF_32 -- 42 ----------------------------- 43 44 function Char_Sequence_To_UTF_32 45 (C : Character; 46 EM : System.WCh_Con.WC_Encoding_Method) return UTF_32_Code 47 is 48 B1 : Unsigned_32; 49 C1 : Character; 50 U : Unsigned_32; 51 W : Unsigned_32; 52 53 procedure Get_Hex (N : Character); 54 -- If N is a hex character, then set B1 to 16 * B1 + character N. 55 -- Raise Constraint_Error if character N is not a hex character. 56 57 procedure Get_UTF_Byte; 58 pragma Inline (Get_UTF_Byte); 59 -- Used to interpret a 2#10xxxxxx# continuation byte in UTF-8 mode. 60 -- Reads a byte, and raises CE if the first two bits are not 10. 61 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits. 62 63 ------------- 64 -- Get_Hex -- 65 ------------- 66 67 procedure Get_Hex (N : Character) is 68 B2 : constant Unsigned_32 := Character'Pos (N); 69 begin 70 if B2 in Character'Pos ('0') .. Character'Pos ('9') then 71 B1 := B1 * 16 + B2 - Character'Pos ('0'); 72 elsif B2 in Character'Pos ('A') .. Character'Pos ('F') then 73 B1 := B1 * 16 + B2 - (Character'Pos ('A') - 10); 74 elsif B2 in Character'Pos ('a') .. Character'Pos ('f') then 75 B1 := B1 * 16 + B2 - (Character'Pos ('a') - 10); 76 else 77 raise Constraint_Error; 78 end if; 79 end Get_Hex; 80 81 ------------------ 82 -- Get_UTF_Byte -- 83 ------------------ 84 85 procedure Get_UTF_Byte is 86 begin 87 U := Unsigned_32 (Character'Pos (In_Char)); 88 89 if (U and 2#11000000#) /= 2#10_000000# then 90 raise Constraint_Error; 91 end if; 92 93 W := Shift_Left (W, 6) or (U and 2#00111111#); 94 end Get_UTF_Byte; 95 96 -- Start of processing for Char_Sequence_To_Wide 97 98 begin 99 case EM is 100 101 when WCEM_Hex => 102 if C /= ASCII.ESC then 103 return Character'Pos (C); 104 105 else 106 B1 := 0; 107 Get_Hex (In_Char); 108 Get_Hex (In_Char); 109 Get_Hex (In_Char); 110 Get_Hex (In_Char); 111 112 return UTF_32_Code (B1); 113 end if; 114 115 when WCEM_Upper => 116 if C > ASCII.DEL then 117 return 256 * Character'Pos (C) + Character'Pos (In_Char); 118 else 119 return Character'Pos (C); 120 end if; 121 122 when WCEM_Shift_JIS => 123 if C > ASCII.DEL then 124 return Wide_Character'Pos (Shift_JIS_To_JIS (C, In_Char)); 125 else 126 return Character'Pos (C); 127 end if; 128 129 when WCEM_EUC => 130 if C > ASCII.DEL then 131 return Wide_Character'Pos (EUC_To_JIS (C, In_Char)); 132 else 133 return Character'Pos (C); 134 end if; 135 136 when WCEM_UTF8 => 137 138 -- Note: for details of UTF8 encoding see RFC 3629 139 140 U := Unsigned_32 (Character'Pos (C)); 141 142 -- 16#00_0000#-16#00_007F#: 0xxxxxxx 143 144 if (U and 2#10000000#) = 2#00000000# then 145 return Character'Pos (C); 146 147 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx 148 149 elsif (U and 2#11100000#) = 2#110_00000# then 150 W := U and 2#00011111#; 151 Get_UTF_Byte; 152 return UTF_32_Code (W); 153 154 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx 155 156 elsif (U and 2#11110000#) = 2#1110_0000# then 157 W := U and 2#00001111#; 158 Get_UTF_Byte; 159 Get_UTF_Byte; 160 return UTF_32_Code (W); 161 162 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 163 164 elsif (U and 2#11111000#) = 2#11110_000# then 165 W := U and 2#00000111#; 166 167 for K in 1 .. 3 loop 168 Get_UTF_Byte; 169 end loop; 170 171 return UTF_32_Code (W); 172 173 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx 174 -- 10xxxxxx 10xxxxxx 175 176 elsif (U and 2#11111100#) = 2#111110_00# then 177 W := U and 2#00000011#; 178 179 for K in 1 .. 4 loop 180 Get_UTF_Byte; 181 end loop; 182 183 return UTF_32_Code (W); 184 185 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx 186 -- 10xxxxxx 10xxxxxx 10xxxxxx 187 188 elsif (U and 2#11111110#) = 2#1111110_0# then 189 W := U and 2#00000001#; 190 191 for K in 1 .. 5 loop 192 Get_UTF_Byte; 193 end loop; 194 195 return UTF_32_Code (W); 196 197 else 198 raise Constraint_Error; 199 end if; 200 201 when WCEM_Brackets => 202 if C /= '[' then 203 return Character'Pos (C); 204 end if; 205 206 if In_Char /= '"' then 207 raise Constraint_Error; 208 end if; 209 210 B1 := 0; 211 Get_Hex (In_Char); 212 Get_Hex (In_Char); 213 214 C1 := In_Char; 215 216 if C1 /= '"' then 217 Get_Hex (C1); 218 Get_Hex (In_Char); 219 220 C1 := In_Char; 221 222 if C1 /= '"' then 223 Get_Hex (C1); 224 Get_Hex (In_Char); 225 226 C1 := In_Char; 227 228 if C1 /= '"' then 229 Get_Hex (C1); 230 Get_Hex (In_Char); 231 232 if B1 > Unsigned_32 (UTF_32_Code'Last) then 233 raise Constraint_Error; 234 end if; 235 236 if In_Char /= '"' then 237 raise Constraint_Error; 238 end if; 239 end if; 240 end if; 241 end if; 242 243 if In_Char /= ']' then 244 raise Constraint_Error; 245 end if; 246 247 return UTF_32_Code (B1); 248 249 end case; 250 end Char_Sequence_To_UTF_32; 251 252 -------------------------------- 253 -- Char_Sequence_To_Wide_Char -- 254 -------------------------------- 255 256 function Char_Sequence_To_Wide_Char 257 (C : Character; 258 EM : System.WCh_Con.WC_Encoding_Method) return Wide_Character 259 is 260 function Char_Sequence_To_UTF is new Char_Sequence_To_UTF_32 (In_Char); 261 262 U : constant UTF_32_Code := Char_Sequence_To_UTF (C, EM); 263 264 begin 265 if U > 16#FFFF# then 266 raise Constraint_Error; 267 else 268 return Wide_Character'Val (U); 269 end if; 270 end Char_Sequence_To_Wide_Char; 271 272 ----------------------------- 273 -- UTF_32_To_Char_Sequence -- 274 ----------------------------- 275 276 procedure UTF_32_To_Char_Sequence 277 (Val : UTF_32_Code; 278 EM : System.WCh_Con.WC_Encoding_Method) 279 is 280 Hexc : constant array (UTF_32_Code range 0 .. 15) of Character := 281 "0123456789ABCDEF"; 282 283 C1, C2 : Character; 284 U : Unsigned_32; 285 286 begin 287 -- Raise CE for invalid UTF_32_Code 288 289 if not Val'Valid then 290 raise Constraint_Error; 291 end if; 292 293 -- Processing depends on encoding mode 294 295 case EM is 296 297 when WCEM_Hex => 298 if Val < 256 then 299 Out_Char (Character'Val (Val)); 300 elsif Val <= 16#FFFF# then 301 Out_Char (ASCII.ESC); 302 Out_Char (Hexc (Val / (16**3))); 303 Out_Char (Hexc ((Val / (16**2)) mod 16)); 304 Out_Char (Hexc ((Val / 16) mod 16)); 305 Out_Char (Hexc (Val mod 16)); 306 else 307 raise Constraint_Error; 308 end if; 309 310 when WCEM_Upper => 311 if Val < 128 then 312 Out_Char (Character'Val (Val)); 313 elsif Val < 16#8000# or else Val > 16#FFFF# then 314 raise Constraint_Error; 315 else 316 Out_Char (Character'Val (Val / 256)); 317 Out_Char (Character'Val (Val mod 256)); 318 end if; 319 320 when WCEM_Shift_JIS => 321 if Val < 128 then 322 Out_Char (Character'Val (Val)); 323 elsif Val <= 16#FFFF# then 324 JIS_To_Shift_JIS (Wide_Character'Val (Val), C1, C2); 325 Out_Char (C1); 326 Out_Char (C2); 327 else 328 raise Constraint_Error; 329 end if; 330 331 when WCEM_EUC => 332 if Val < 128 then 333 Out_Char (Character'Val (Val)); 334 elsif Val <= 16#FFFF# then 335 JIS_To_EUC (Wide_Character'Val (Val), C1, C2); 336 Out_Char (C1); 337 Out_Char (C2); 338 else 339 raise Constraint_Error; 340 end if; 341 342 when WCEM_UTF8 => 343 344 -- Note: for details of UTF8 encoding see RFC 3629 345 346 U := Unsigned_32 (Val); 347 348 -- 16#00_0000#-16#00_007F#: 0xxxxxxx 349 350 if U <= 16#00_007F# then 351 Out_Char (Character'Val (U)); 352 353 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx 354 355 elsif U <= 16#00_07FF# then 356 Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6))); 357 Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); 358 359 -- 16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx 360 361 elsif U <= 16#00_FFFF# then 362 Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12))); 363 Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) 364 and 2#00111111#))); 365 Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); 366 367 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 368 369 elsif U <= 16#10_FFFF# then 370 Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18))); 371 Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12) 372 and 2#00111111#))); 373 Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) 374 and 2#00111111#))); 375 Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); 376 377 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx 378 -- 10xxxxxx 10xxxxxx 379 380 elsif U <= 16#03FF_FFFF# then 381 Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24))); 382 Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18) 383 and 2#00111111#))); 384 Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12) 385 and 2#00111111#))); 386 Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) 387 and 2#00111111#))); 388 Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); 389 390 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx 391 -- 10xxxxxx 10xxxxxx 10xxxxxx 392 393 elsif U <= 16#7FFF_FFFF# then 394 Out_Char (Character'Val (2#11111100# or Shift_Right (U, 30))); 395 Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 24) 396 and 2#00111111#))); 397 Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18) 398 and 2#00111111#))); 399 Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12) 400 and 2#00111111#))); 401 Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) 402 and 2#00111111#))); 403 Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); 404 405 else 406 raise Constraint_Error; 407 end if; 408 409 when WCEM_Brackets => 410 411 -- Values in the range 0-255 are directly output. Note that there 412 -- is some issue with [ (16#5B#] since this will cause confusion 413 -- if the resulting string is interpreted using brackets encoding. 414 415 -- One possibility would be to always output [ as ["5B"] but in 416 -- practice this is undesirable, since for example normal use of 417 -- Wide_Text_IO for output (much more common than input), really 418 -- does want to be able to say something like 419 420 -- Put_Line ("Start of output [first run]"); 421 422 -- and have it come out as intended, rather than contaminated by 423 -- a ["5B"] sequence in place of the left bracket. 424 425 if Val < 256 then 426 Out_Char (Character'Val (Val)); 427 428 -- Otherwise use brackets notation for vales greater than 255 429 430 else 431 Out_Char ('['); 432 Out_Char ('"'); 433 434 if Val > 16#FFFF# then 435 if Val > 16#00FF_FFFF# then 436 Out_Char (Hexc (Val / 16 ** 7)); 437 Out_Char (Hexc ((Val / 16 ** 6) mod 16)); 438 end if; 439 440 Out_Char (Hexc ((Val / 16 ** 5) mod 16)); 441 Out_Char (Hexc ((Val / 16 ** 4) mod 16)); 442 end if; 443 444 Out_Char (Hexc ((Val / 16 ** 3) mod 16)); 445 Out_Char (Hexc ((Val / 16 ** 2) mod 16)); 446 Out_Char (Hexc ((Val / 16) mod 16)); 447 Out_Char (Hexc (Val mod 16)); 448 449 Out_Char ('"'); 450 Out_Char (']'); 451 end if; 452 end case; 453 end UTF_32_To_Char_Sequence; 454 455 -------------------------------- 456 -- Wide_Char_To_Char_Sequence -- 457 -------------------------------- 458 459 procedure Wide_Char_To_Char_Sequence 460 (WC : Wide_Character; 461 EM : System.WCh_Con.WC_Encoding_Method) 462 is 463 procedure UTF_To_Char_Sequence is new UTF_32_To_Char_Sequence (Out_Char); 464 begin 465 UTF_To_Char_Sequence (Wide_Character'Pos (WC), EM); 466 end Wide_Char_To_Char_Sequence; 467 468end System.WCh_Cnv; 469