// Written in the D programming language.

/**
Classes and functions for handling and transcoding between various encodings.

For cases where the _encoding is known at compile-time, functions are provided
for arbitrary _encoding and decoding of characters, arbitrary transcoding
between strings of different type, as well as validation and sanitization.

Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
(also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250 and WINDOWS-1252.

$(SCRIPT inhibitQuickIndex = 1;)
$(BOOKTABLE,
$(TR $(TH Category) $(TH Functions))
$(TR $(TD Decode) $(TD
    $(LREF codePoints)
    $(LREF decode)
    $(LREF decodeReverse)
    $(LREF safeDecode)
))
$(TR $(TD Conversion) $(TD
    $(LREF codeUnits)
    $(LREF sanitize)
    $(LREF transcode)
))
$(TR $(TD Classification) $(TD
    $(LREF canEncode)
    $(LREF isValid)
    $(LREF isValidCodePoint)
    $(LREF isValidCodeUnit)
))
$(TR $(TD BOM) $(TD
    $(LREF BOM)
    $(LREF BOMSeq)
    $(LREF getBOM)
    $(LREF utfBOM)
))
$(TR $(TD Length & Index) $(TD
    $(LREF firstSequence)
    $(LREF encodedLength)
    $(LREF index)
    $(LREF lastSequence)
    $(LREF validLength)
))
$(TR $(TD Encoding schemes) $(TD
    $(LREF encodingName)
    $(LREF EncodingScheme)
    $(LREF EncodingSchemeASCII)
    $(LREF EncodingSchemeLatin1)
    $(LREF EncodingSchemeLatin2)
    $(LREF EncodingSchemeUtf16Native)
    $(LREF EncodingSchemeUtf32Native)
    $(LREF EncodingSchemeUtf8)
    $(LREF EncodingSchemeWindows1250)
    $(LREF EncodingSchemeWindows1252)
))
$(TR $(TD Representation) $(TD
    $(LREF AsciiChar)
    $(LREF AsciiString)
    $(LREF Latin1Char)
    $(LREF Latin1String)
    $(LREF Latin2Char)
    $(LREF Latin2String)
    $(LREF Windows1250Char)
    $(LREF Windows1250String)
    $(LREF Windows1252Char)
    $(LREF Windows1252String)
))
$(TR $(TD Exceptions) $(TD
    $(LREF INVALID_SEQUENCE)
    $(LREF EncodingException)
))
)

For cases where the _encoding is not known at compile-time, but is
known at run-time, the abstract class $(LREF EncodingScheme)
and its subclasses are provided. To construct a run-time encoder/decoder,
one does e.g.

----------------------------------------------------
auto e = EncodingScheme.create("utf-8");
----------------------------------------------------

This library supplies $(LREF EncodingScheme) subclasses for ASCII,
ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250,
WINDOWS-1252, UTF-8, and (on little-endian architectures) UTF-16LE and
UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE.

This library provides a mechanism whereby other modules may add $(LREF
EncodingScheme) subclasses for any other _encoding.

Copyright: Copyright Janice Caron 2008 - 2009.
License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
Authors:   Janice Caron
Source:    $(PHOBOSSRC std/_encoding.d)
*/
/*
         Copyright Janice Caron 2008 - 2009.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
*/
module std.encoding;

import std.range.primitives;
import std.traits;
import std.typecons;

// Module-level regression test: validates UTF-8 acceptance/rejection,
// sanitization, round-trip transcoding between UTF-8/16/32 (forward and
// reverse), the single-byte encodings, length counting and in-place encoding.
@system unittest
{
    static ubyte[][] validStrings =
    [
        // Plain ASCII
        cast(ubyte[])"hello",

        // First possible sequence of a certain length
        [ 0x00 ],                       // U+00000000   one byte
        [ 0xC2, 0x80 ],                 // U+00000080   two bytes
        [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
        [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

        // Last possible sequence of a certain length
        [ 0x7F ],                       // U+0000007F   one byte
        [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

        // Other boundary conditions
        [ 0xED, 0x9F, 0xBF ],
        // U+0000D7FF   Last character before surrogates
        [ 0xEE, 0x80, 0x80 ],
        // U+0000E000   First character after surrogates
        [ 0xEF, 0xBF, 0xBD ],
        // U+0000FFFD   Unicode replacement character
        [ 0xF4, 0x8F, 0xBF, 0xBF ],
        // U+0010FFFF   Very last character

        // Non-character code points
        /*  NOTE: These are legal in UTF, and may be converted from
            one UTF to another, however they do not represent Unicode
            characters. These code points have been reserved by
            Unicode as non-character code points. They are permissible
            for data exchange within an application, but they are
            not permitted to be used as characters. Since this module
            deals with UTF, and not with Unicode per se, we choose to
            accept them here. */
        // Both are three-byte sequences (lead byte 0xEF); the two-byte forms
        // 0xDF 0xBE / 0xDF 0xBF encode U+07FE / U+07FF, not the non-characters.
        [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
    ];

    static ubyte[][] invalidStrings =
    [
        // First possible sequence of a certain length, but greater
        // than U+10FFFF
        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

        // Last possible sequence of a certain length, but greater than U+10FFFF
        [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

        // Other boundary conditions
        [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                    // First code
                                                    // point after
                                                    // last character

        // Unexpected continuation bytes
        [ 0x80 ],
        [ 0xBF ],
        [ 0x20, 0x80, 0x20 ],
        [ 0x20, 0xBF, 0x20 ],
        [ 0x80, 0x9F, 0xA0 ],

        // Lonely start bytes
        [ 0xC0 ],
        [ 0xCF ],
        [ 0x20, 0xC0, 0x20 ],
        [ 0x20, 0xCF, 0x20 ],
        [ 0xD0 ],
        [ 0xDF ],
        [ 0x20, 0xD0, 0x20 ],
        [ 0x20, 0xDF, 0x20 ],
        [ 0xE0 ],
        [ 0xEF ],
        [ 0x20, 0xE0, 0x20 ],
        [ 0x20, 0xEF, 0x20 ],
        [ 0xF0 ],
        [ 0xF1 ],
        [ 0xF2 ],
        [ 0xF3 ],
        [ 0xF4 ],
        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

        // Impossible bytes
        [ 0xF8 ],
        [ 0xF9 ],
        [ 0xFA ],
        [ 0xFB ],
        [ 0xFC ],
        [ 0xFD ],
        [ 0xFE ],
        [ 0xFF ],
        [ 0x20, 0xF8, 0x20 ],
        [ 0x20, 0xF9, 0x20 ],
        [ 0x20, 0xFA, 0x20 ],
        [ 0x20, 0xFB, 0x20 ],
        [ 0x20, 0xFC, 0x20 ],
        [ 0x20, 0xFD, 0x20 ],
        [ 0x20, 0xFE, 0x20 ],
        [ 0x20, 0xFF, 0x20 ],

        // Overlong sequences, all representing U+002F
        /*  With a safe UTF-8 decoder, all of the following five overlong
            representations of the ASCII character slash ("/") should be
            rejected like a malformed UTF-8 sequence */
        [ 0xC0, 0xAF ],
        [ 0xE0, 0x80, 0xAF ],
        [ 0xF0, 0x80, 0x80, 0xAF ],
        [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

        // Maximum overlong sequences
        /*  Below you see the highest Unicode value that is still resulting in
            an overlong sequence if represented with the given number of bytes.
            This is a boundary test for safe UTF-8 decoders. All five
            characters should be rejected like malformed UTF-8 sequences. */
        [ 0xC1, 0xBF ],                             // U+0000007F
        [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
        [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
        [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
        [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

        // Overlong representation of the NUL character
        /*  The following five sequences should also be rejected like malformed
            UTF-8 sequences and should not be treated like the ASCII NUL
            character. */
        [ 0xC0, 0x80 ],
        [ 0xE0, 0x80, 0x80 ],
        [ 0xF0, 0x80, 0x80, 0x80 ],
        [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

        // Illegal code positions
        /*  The following UTF-8 sequences should be rejected like malformed
            sequences, because they never represent valid ISO 10646 characters
            and a UTF-8 decoder that accepts them might introduce security
            problems comparable to overlong UTF-8 sequences. */
        [ 0xED, 0xA0, 0x80 ],       // U+D800
        [ 0xED, 0xAD, 0xBF ],       // U+DB7F
        [ 0xED, 0xAE, 0x80 ],       // U+DB80
        [ 0xED, 0xAF, 0xBF ],       // U+DBFF
        [ 0xED, 0xB0, 0x80 ],       // U+DC00
        [ 0xED, 0xBE, 0x80 ],       // U+DF80
        [ 0xED, 0xBF, 0xBF ],       // U+DFFF
    ];

    // Expected result of sanitize() for each entry of invalidStrings,
    // in the same order (the lengths are asserted equal below).
    static string[] sanitizedStrings =
    [
        "\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
        "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
    ];

    // Make sure everything that should be valid, is
    foreach (a;validStrings)
    {
        string s = cast(string) a;
        assert(isValid(s),"Failed to validate: "~makeReadable(s));
    }

    // Make sure everything that shouldn't be valid, isn't
    foreach (a;invalidStrings)
    {
        string s = cast(string) a;
        assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
    }

    // Make sure we can sanitize everything bad
    assert(invalidStrings.length == sanitizedStrings.length);
    // The index is a size_t supplied by foreach, so it cannot be narrower than
    // the array length. Only validStrings is appended to inside the loop.
    foreach (i, bad; invalidStrings)
    {
        string s = cast(string) bad;
        string t = sanitize(s);
        assert(isValid(t));
        assert(t == sanitizedStrings[i]);
        ubyte[] u = cast(ubyte[]) t;
        validStrings ~= u;
    }

    // Make sure all transcodings work in both directions, using both forward
    // and reverse iteration
    foreach (a; validStrings)
    {
        string s = cast(string) a;
        string s2;
        wstring ws, ws2;
        dstring ds, ds2;

        transcode(s,ws);
        assert(isValid(ws));
        transcode(ws,s2);
        assert(s == s2);

        transcode(s,ds);
        assert(isValid(ds));
        transcode(ds,s2);
        assert(s == s2);

        transcode(ws,s);
        assert(isValid(s));
        transcode(s,ws2);
        assert(ws == ws2);

        transcode(ws,ds);
        assert(isValid(ds));
        transcode(ds,ws2);
        assert(ws == ws2);

        transcode(ds,s);
        assert(isValid(s));
        transcode(s,ds2);
        assert(ds == ds2);

        transcode(ds,ws);
        assert(isValid(ws));
        transcode(ws,ds2);
        assert(ds == ds2);

        transcodeReverse(s,ws);
        assert(isValid(ws));
        transcodeReverse(ws,s2);
        assert(s == s2);

        transcodeReverse(s,ds);
        assert(isValid(ds));
        transcodeReverse(ds,s2);
        assert(s == s2);

        transcodeReverse(ws,s);
        assert(isValid(s));
        transcodeReverse(s,ws2);
        assert(ws == ws2);

        transcodeReverse(ws,ds);
        assert(isValid(ds));
        transcodeReverse(ds,ws2);
        assert(ws == ws2);

        transcodeReverse(ds,s);
        assert(isValid(s));
        transcodeReverse(s,ds2);
        assert(ds == ds2);

        transcodeReverse(ds,ws);
        assert(isValid(ws));
        transcodeReverse(ws,ds2);
        assert(ds == ds2);
    }

    // Make sure the non-UTF encodings work too
    {
        auto s = "\u20AC100";
        Windows1252String t;
        transcode(s,t);
        assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
        string u;
        transcode(s,u);
        assert(s == u);
        Latin1String v;
        transcode(s,v);
        assert(cast(string) v == "?100");
        AsciiString w;
        transcode(v,w);
        assert(cast(string) w == "?100");
        s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
        Latin2String x;
        transcode(s,x);
        assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        Windows1250String y;
        transcode(s,y);
        assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
    }

    // Make sure we can count properly
    {
        assert(encodedLength!(char)('A') == 1);
        assert(encodedLength!(char)('\u00E3') == 2);
        assert(encodedLength!(char)('\u2028') == 3);
        assert(encodedLength!(char)('\U0010FFF0') == 4);
        assert(encodedLength!(wchar)('A') == 1);
        assert(encodedLength!(wchar)('\U0010FFF0') == 2);
    }

    // Make sure we can write into mutable arrays
    {
        char[4] buffer;
        auto n = encode(cast(dchar)'\u00E3',buffer);
        assert(n == 2);
        assert(buffer[0] == 0xC3);
        assert(buffer[1] == 0xA3);
    }
}

//=============================================================================

/** Special value returned by $(D safeDecode) */
enum dchar INVALID_SEQUENCE = cast(dchar) 0xFFFFFFFF;

// Mixin template shared by every EncoderInstance specialisation in this file.
// It expects the enclosing scope to provide the code-unit type `E` and the
// function templates encodeViaWrite, skipViaRead, decodeViaRead,
// safeDecodeViaRead and decodeReverseViaRead. Those are written against
// abstract read()/peek()/canRead/write() primitives; the small templates below
// bind the primitives to a concrete source (`s`) or sink (`s`, `array`, `dg`)
// that is a local or parameter of the public function doing the mixin.
template EncoderFunctions()
{
    // Various forms of read

    // Forward reader over a slice named `s`; read() consumes the first unit.
    template ReadFromString()
    {
        @property bool canRead() { return s.length != 0; }
        E peek() @safe pure @nogc nothrow { return s[0]; }
        E read() @safe pure @nogc nothrow { E t = s[0]; s = s[1..$]; return t; }
    }

    // Backward reader over a slice named `s`; read() consumes the last unit.
    template ReverseReadFromString()
    {
        @property bool canRead() { return s.length != 0; }
        E peek() @safe pure @nogc nothrow { return s[$-1]; }
        E read() @safe pure @nogc nothrow { E t = s[$-1]; s = s[0..$-1]; return t; }
    }

    // Various forms of Write

    // Sink that appends to a freshly declared array `s`.
    template WriteToString()
    {
        E[] s;
        void write(E c) @safe pure nothrow { s ~= c; }
    }

    // Sink that stores into the front of `array` and then advances the slice.
    // No bounds handling beyond the language's own array indexing.
    template WriteToArray()
    {
        void write(E c) @safe pure @nogc nothrow { array[0] = c; array = array[1..$]; }
    }

    // Sink that forwards every code unit to the delegate `dg`.
    template WriteToDelegate()
    {
        void write(E c) { dg(c); }
    }

    // Functions we will export

    template EncodeViaWrite()
    {
        mixin encodeViaWrite;
        void encode(dchar c) { encodeViaWrite(c); }
    }

    template SkipViaRead()
    {
        mixin skipViaRead;
        void skip() @safe pure @nogc nothrow { skipViaRead(); }
    }

    template DecodeViaRead()
    {
        mixin decodeViaRead;
        dchar decode() @safe pure @nogc nothrow { return decodeViaRead(); }
    }

    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;
        dchar safeDecode() @safe pure @nogc nothrow { return safeDecodeViaRead(); }
    }

    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;
        dchar decodeReverse() @safe pure @nogc nothrow { return decodeReverseViaRead(); }
    }

    // Encoding to different destinations

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    // Decoding functions

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================

    // Below are the functions we will ultimately expose to the user

    // Encodes c and returns the code units in a newly built array.
    E[] encode(dchar c) @safe pure nothrow
    {
        mixin EncodeToString e;
        e.encode(c);
        return e.s;
    }

    // Encodes c into the front of array; array is advanced past what was written.
    void encode(dchar c, ref E[] array) @safe pure nothrow
    {
        mixin EncodeToArray e;
        e.encode(c);
    }

    // Encodes c, passing each resulting code unit to dg.
    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate e;
        e.encode(c);
    }

    // Advances s past one encoded character without decoding it.
    void skip(ref const(E)[] s) @safe pure nothrow
    {
        mixin SkipFromString e;
        e.skip();
    }

    // Decodes one character from the front of s and advances s past it.
    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString e;
        return e.decode();
    }

    // As decode, but via the encoding's safeDecodeViaRead, which (in the
    // encoders visible in this file) yields INVALID_SEQUENCE for bad input.
    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString e;
        return e.safeDecode();
    }

    // Decodes one character from the back of s and shortens s accordingly.
    dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
    {
        mixin DecodeReverseFromString e;
        return e.decodeReverse();
    }
}

//=========================================================================

// Foreach-able view that yields the code points of an encoded string.
// Iteration consumes the stored slice `s`, so a given instance can only be
// walked once. The constructor's in-contract requires the input to be valid.
struct CodePoints(E)
{
    const(E)[] s;

    this(const(E)[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        this.s = s;
    }

    // Forward iteration: foreach (dchar c; ...)
    int opApply(scope int delegate(ref dchar) dg)
    {
        int result = 0;
        while (s.length != 0)
        {
            dchar c = decode(s);
            result = dg(c);
            if (result != 0) break;
        }
        return result;
    }

    // Forward iteration with index: the index passed to dg is the offset, in
    // code units, of the start of the character just decoded.
    int opApply(scope int delegate(ref size_t, ref dchar) dg)
    {
        size_t i = 0;
        int result = 0;
        while (s.length != 0)
        {
            immutable len = s.length;
            dchar c = decode(s);
            size_t j = i; // We don't want the delegate corrupting i
            result = dg(j,c);
            if (result != 0) break;
            i += len - s.length;
        }
        return result;
    }

    // Reverse iteration: foreach_reverse (dchar c; ...)
    int opApplyReverse(scope int delegate(ref dchar) dg)
    {
        int result = 0;
        while (s.length != 0)
        {
            dchar c = decodeReverse(s);
            result = dg(c);
            if (result != 0) break;
        }
        return result;
    }

    // Reverse iteration with index: after decodeReverse has trimmed s, its
    // remaining length is the offset at which the decoded character started.
    int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
    {
        int result = 0;
        while (s.length != 0)
        {
            dchar c = decodeReverse(s);
            size_t i = s.length;
            result = dg(i,c);
            if (result != 0) break;
        }
        return result;
    }
}

// Foreach-able view that yields the code units encoding a single code point.
// The code point is encoded once, in the constructor, into `s`.
struct CodeUnits(E)
{
    E[] s;

    this(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    body
    {
        s = encode!(E)(d);
    }

    int opApply(scope int delegate(ref E) dg)
    {
        int result = 0;
        foreach (E c;s)
        {
            result = dg(c);
            if (result != 0) break;
        }
        return result;
    }

    int opApplyReverse(scope int delegate(ref E) dg)
    {
        int result = 0;
        foreach_reverse (E c;s)
        {
            result = dg(c);
            if (result != 0) break;
        }
        return result;
    }
}

//=============================================================================

// Fallback: any code-unit type without a dedicated specialisation below is
// rejected at compile time.
template EncoderInstance(E)
{
    static assert(false,"Cannot instantiate EncoderInstance for type "
        ~ E.stringof);
}

// Shared implementation for single-byte encodings that differ from Latin-1
// only inside the code-unit range [m_charMapStart, m_charMapEnd]. The
// mixing-in scope must supply E, EString, m_charMapStart, m_charMapEnd,
// charMap (code unit -> code point for that range, 0xFFFD meaning
// "unassigned") and bstMap (code point -> code unit, stored as an implicit
// binary search tree: the children of index i live at 2*i+1 and 2*i+2).
private template GenericEncoder()
{
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        // Outside the remapped range the encoding is the identity below 0x100.
        if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100)) return true;
        if (c >= 0xFFFD) return false;

        auto idx = 0;
        while (idx < bstMap.length)
        {
            if (bstMap[idx][0] == c) return true;
            idx = bstMap[idx][0] > c ? 2 * idx + 1 : 2 * idx + 2; // next BST index
        }

        return false;
    }

    bool isValidCodeUnit(E c) @safe pure @nogc nothrow
    {
        if (c < m_charMapStart || c > m_charMapEnd) return true;
        return charMap[c-m_charMapStart] != 0xFFFD;
    }

    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes exactly one code unit; anything unencodable becomes '?'.
    void encodeViaWrite()(dchar c)
    {
        if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100)) {}
        else if (c >= 0xFFFD) { c = '?'; }
        else
        {
            auto idx = 0;
            while (idx < bstMap.length)
            {
                if (bstMap[idx][0] == c)
                {
                    write(cast(E) bstMap[idx][1]);
                    return;
                }
                idx = bstMap[idx][0] > c ? 2 * idx + 1 : 2 * idx + 2; // next BST index
            }
            c = '?';
        }
        write(cast(E) c);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        E c = read();
        return (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
    }

    // Like decodeViaRead, but unassigned code units (mapped to 0xFFFD in
    // charMap) are reported as INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable E c = read();
        immutable d = (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
        return d == 0xFFFD ? INVALID_SEQUENCE : d;
    }

    dchar decodeReverseViaRead()()
    {
        E c = read();
        return (c >= m_charMapStart && c <= m_charMapEnd) ? charMap[c-m_charMapStart] : c;
    }

    @property EString replacementSequence() @safe pure @nogc nothrow
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}

//=============================================================================
//          ASCII
//=============================================================================

/** Defines various character sets. */
enum AsciiChar : ubyte { init }
/// Ditto
alias AsciiString = immutable(AsciiChar)[];

// ASCII: one code unit per character, only code points below 0x80 are
// representable; everything else is encoded as '?'.
template EncoderInstance(CharType : AsciiChar)
{
    alias E = AsciiChar;
    alias EString = AsciiString;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c < 0x80;
    }

    bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
    {
        return c < 0x80;
    }

    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Variant of encodeViaWrite that targets any object exposing write().
    void encodeX(Range)(dchar c, Range r)
    {
        if (!canEncode(c)) c = '?';
        r.write(cast(AsciiChar) c);
    }

    void encodeViaWrite()(dchar c)
    {
        if (!canEncode(c)) c = '?';
        write(cast(AsciiChar) c);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return read();
    }

    dchar safeDecodeViaRead()()
    {
        immutable c = read();
        return canEncode(c) ? c : INVALID_SEQUENCE;
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}

//=============================================================================
//          ISO-8859-1
//=============================================================================

/** Defines a Latin1-encoded character. */
enum Latin1Char : ubyte { init }
/**
Defines a Latin1-encoded string (as an array of $(D
immutable(Latin1Char))).
 */
alias Latin1String = immutable(Latin1Char)[];

// ISO-8859-1: code units map one-to-one onto code points below 0x100, so
// every code unit is valid and decoding is the identity.
template EncoderInstance(CharType : Latin1Char)
{
    alias E = Latin1Char;
    alias EString = Latin1String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c < 0x100;
    }

    bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
    {
        return true;
    }

    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    void encodeViaWrite()(dchar c)
    {
        if (!canEncode(c)) c = '?';
        write(cast(Latin1Char) c);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return read();
    }

    dchar safeDecodeViaRead()()
    {
        return read();
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}

//=============================================================================
//          ISO-8859-2
//=============================================================================

/// Defines a Latin2-encoded character.
enum Latin2Char : ubyte { init }

/**
 * Defines a Latin2-encoded string (as an array of $(D
 * immutable(Latin2Char))).
 */
alias Latin2String = immutable(Latin2Char)[];

// ISO-8859-2 encoder. All behaviour comes from GenericEncoder (mixed in at the
// end); this template only supplies the name and the two lookup tables.
private template EncoderInstance(CharType : Latin2Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Latin2Char;
    alias EString = Latin2String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // Inclusive range of code units that do not map to the identical code
    // point; code units outside it are passed through unchanged.
    private static immutable dchar m_charMapStart = 0xa1;
    private static immutable dchar m_charMapEnd = 0xff;

    // Decode table: GenericEncoder indexes it as charMap[c - m_charMapStart],
    // so entry 0 is the code point for code unit 0xA1.
    private immutable wstring charMap =
        "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
        "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
        "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
        "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
        "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
        "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
        "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
        "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
        "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
        "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
        "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
        "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Encode table of (code point, code unit) pairs. The array order IS the
    // search structure: GenericEncoder starts at index 0 and steps to
    // 2*idx+1 when the stored code point is greater than the target,
    // otherwise to 2*idx+2. Do not sort or reorder the entries.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
        tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
        tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
        tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
        tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
        tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
        tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
        tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
        tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
        tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
        tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
        tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
        tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
        tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
        tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
        tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
        tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
        tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
        tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
        tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
        tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
        tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
        tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
        tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
        tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
        tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
        tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
        tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
        tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
        tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
        tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
        tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
    ];

    mixin GenericEncoder!();
}

//=============================================================================
//          WINDOWS-1250
//=============================================================================

/// Defines a Windows1250-encoded character.
enum Windows1250Char : ubyte { init }

/**
 * Defines a Windows1250-encoded string (as an array of $(D
 * immutable(Windows1250Char))).
 */
alias Windows1250String = immutable(Windows1250Char)[];

// WINDOWS-1250 encoder. All behaviour comes from GenericEncoder (mixed in at
// the end); this template only supplies the name and the two lookup tables.
private template EncoderInstance(CharType : Windows1250Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1250Char;
    alias EString = Windows1250String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    // Inclusive range of code units translated through charMap.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xff;

    // Decode table indexed by (code unit - m_charMapStart). Entries equal to
    // \uFFFD mark unassigned code units: GenericEncoder.isValidCodeUnit
    // rejects them and safeDecodeViaRead reports INVALID_SEQUENCE for them.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
        "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
        "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
        "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
        "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
        "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
        "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
        "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
        "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
        "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
        "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
        "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
        "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
        "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Encode table of (code point, code unit) pairs laid out as an implicit
    // binary search tree (children of index i at 2*i+1 and 2*i+2, as walked
    // by GenericEncoder). The array order is significant; do not reorder.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
        tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
        tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
        tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
        tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
        tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
        tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
        tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
        tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
        tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
        tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
        tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
        tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
        tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
        tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
        tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
        tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
        tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
        tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
        tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
        tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
        tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
        tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
        tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
        tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
        tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
        tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
        tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
        tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
        tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
        tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
        tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
        tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
        tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
        tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
        tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}

//=============================================================================
//          WINDOWS-1252
//=============================================================================

/// Defines a Windows1252-encoded character.
enum Windows1252Char : ubyte { init }

/**
 * Defines a Windows1252-encoded string (as an array of $(D
 * immutable(Windows1252Char))).
 */
alias Windows1252String = immutable(Windows1252Char)[];

// WINDOWS-1252 encoder. Only code units 0x80..0x9F differ from the identity
// mapping, so the tables are much smaller than for WINDOWS-1250.
template EncoderInstance(CharType : Windows1252Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1252Char;
    alias EString = Windows1252String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // Inclusive range of code units translated through charMap.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0x9f;

    // Decode table indexed by (code unit - m_charMapStart); \uFFFD marks an
    // unassigned code unit.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"~
        "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";

    // Encode table as an implicit binary search tree (children of index i at
    // 2*i+1 and 2*i+2, as walked by GenericEncoder); order is significant.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u201C','\x93'), tuple('\u0192','\x83'), tuple('\u2039','\x8B'),
        tuple('\u0161','\x9A'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u0153','\x9C'), tuple('\u017D','\x8E'),
        tuple('\u02DC','\x98'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u0152','\x8C'), tuple('\u0160','\x8A'), tuple('\u0178','\x9F'),
        tuple('\u017E','\x9E'), tuple('\u02C6','\x88'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}

//=============================================================================
//          UTF-8
//=============================================================================

// NOTE(review): this template continues beyond the end of the visible chunk;
// only the members that are fully visible are annotated below.
template EncoderInstance(CharType : char)
{
    alias E = char;
    alias EString = immutable(char)[];

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Rejects 0xC0, 0xC1 and everything from 0xF5 upwards.
    bool isValidCodeUnit(char c) @safe pure nothrow @nogc
    {
        return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
    }

    // Number of trail bytes implied by a byte >= 0x80, indexed by (c - 0x80).
    // Continuation bytes 0x80..0xBF and 0xFF map to 0; 0xFE maps to 6.
    immutable ubyte[128] tailTable =
    [
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
    ];

    private int tails(char c) @safe pure nothrow @nogc
    in
    {
        assert(c >= 0x80);
    }
    body
    {
        return tailTable[c-0x80];
    }

    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c < 0x80) return 1;
        if (c < 0x800) return 2;
        if (c < 0x10000) return 3;
        return 4;
    }

    // Emits 1 to 4 code units depending on the magnitude of c. No validity
    // check is performed here.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80)
        {
            write(cast(char) c);
        }
        else if (c < 0x800)
        {
            write(cast(char)((c >> 6) + 0xC0));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else if (c < 0x10000)
        {
            write(cast(char)((c >> 12) + 0xE0));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else
        {
            write(cast(char)((c >> 18) + 0xF0));
            write(cast(char)(((c >> 12) & 0x3F) + 0x80));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
    }

    // Consumes the lead byte plus the number of trail bytes it announces.
    // Performs no validation and no canRead check.
    void skipViaRead()()
    {
        auto c = read();
        if (c < 0xC0) return;
        int n = tails(cast(char) c);
        for (size_t i=0; i<n; ++i)
        {
            read();
        }
    }

    // Unchecked decode: performs no validation and no canRead check.
    dchar decodeViaRead()()
    {
        dchar c = read();
        if (c < 0xC0) return c;
        int n = tails(cast(char) c);
        c &= (1 << (6 - n)) - 1;    // keep only the payload bits of the lead byte
        for (size_t i=0; i<n; ++i)
        {
            c = (c << 6) + (read() & 0x3F);
        }
        return c;
    }

    // Checked decode: returns INVALID_SEQUENCE for stray continuation bytes,
    // truncated sequences, non-continuation trail bytes, and the lead/second
    // byte combinations listed in `err`. When `err` is set the well-formed
    // trail bytes are still consumed before INVALID_SEQUENCE is returned.
    dchar safeDecodeViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        int n = tails(cast(char) c);
        if (n == 0) return INVALID_SEQUENCE;

        if (!canRead) return INVALID_SEQUENCE;
        size_t d = peek();
        immutable err =
        (
            (c < 0xC2)                              // fail overlong 2-byte sequences
        ||  (c > 0xF4)                              // fail lead bytes 0xF5 and above (> U+10FFFF, 5/6-byte forms)
        ||  (c == 0xE0 && ((d & 0xE0) == 0x80))     // fail overlong 3-byte sequences
        ||  (c == 0xED && ((d & 0xE0) == 0xA0))     // fail surrogates
        ||  (c == 0xF0 && ((d & 0xF0) == 0x80))     // fail overlong 4-byte sequences
        ||  (c == 0xF4 && ((d & 0xF0) >= 0x90))     // fail code points > 0x10FFFF
        );

        c &= (1 << (6 - n)) - 1;
        for (size_t i=0; i<n; ++i)
        {
            if (!canRead) return INVALID_SEQUENCE;
            d = peek();
            if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE;
            c = (c << 6) + (read() & 0x3F);
        }

        return err ? INVALID_SEQUENCE : c;
    }

    dchar decodeReverseViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        size_t shift = 0;
        c &= 0x3F;
        for (size_t i=0; i<4; ++i)
        {
            shift += 6;
            auto d = read();
            size_t n = tails(cast(char) d);
            immutable mask = n == 0 ?
0x3F : (1 << (6 - n)) - 1; 1318 c += ((d & mask) << shift); 1319 if (n != 0) break; 1320 } 1321 return c; 1322 } 1323 1324 @property EString replacementSequence() @safe pure nothrow @nogc 1325 { 1326 return "\uFFFD"; 1327 } 1328 1329 mixin EncoderFunctions; 1330} 1331 1332//============================================================================= 1333// UTF-16 1334//============================================================================= 1335 1336template EncoderInstance(CharType : wchar) 1337{ 1338 alias E = wchar; 1339 alias EString = immutable(wchar)[]; 1340 1341 @property string encodingName() @safe pure nothrow @nogc 1342 { 1343 return "UTF-16"; 1344 } 1345 1346 bool canEncode(dchar c) @safe pure nothrow @nogc 1347 { 1348 return isValidCodePoint(c); 1349 } 1350 1351 bool isValidCodeUnit(wchar c) @safe pure nothrow @nogc 1352 { 1353 return true; 1354 } 1355 1356 size_t encodedLength(dchar c) @safe pure nothrow @nogc 1357 in 1358 { 1359 assert(canEncode(c)); 1360 } 1361 body 1362 { 1363 return (c < 0x10000) ? 
1 : 2; 1364 } 1365 1366 void encodeViaWrite()(dchar c) 1367 { 1368 if (c < 0x10000) 1369 { 1370 write(cast(wchar) c); 1371 } 1372 else 1373 { 1374 size_t n = c - 0x10000; 1375 write(cast(wchar)(0xD800 + (n >> 10))); 1376 write(cast(wchar)(0xDC00 + (n & 0x3FF))); 1377 } 1378 } 1379 1380 void skipViaRead()() 1381 { 1382 immutable c = read(); 1383 if (c < 0xD800 || c >= 0xE000) return; 1384 read(); 1385 } 1386 1387 dchar decodeViaRead()() 1388 { 1389 wchar c = read(); 1390 if (c < 0xD800 || c >= 0xE000) return cast(dchar) c; 1391 wchar d = read(); 1392 c &= 0x3FF; 1393 d &= 0x3FF; 1394 return 0x10000 + (c << 10) + d; 1395 } 1396 1397 dchar safeDecodeViaRead()() 1398 { 1399 wchar c = read(); 1400 if (c < 0xD800 || c >= 0xE000) return cast(dchar) c; 1401 if (c >= 0xDC00) return INVALID_SEQUENCE; 1402 if (!canRead) return INVALID_SEQUENCE; 1403 wchar d = peek(); 1404 if (d < 0xDC00 || d >= 0xE000) return INVALID_SEQUENCE; 1405 d = read(); 1406 c &= 0x3FF; 1407 d &= 0x3FF; 1408 return 0x10000 + (c << 10) + d; 1409 } 1410 1411 dchar decodeReverseViaRead()() 1412 { 1413 wchar c = read(); 1414 if (c < 0xD800 || c >= 0xE000) return cast(dchar) c; 1415 wchar d = read(); 1416 c &= 0x3FF; 1417 d &= 0x3FF; 1418 return 0x10000 + (d << 10) + c; 1419 } 1420 1421 @property EString replacementSequence() @safe pure nothrow @nogc 1422 { 1423 return "\uFFFD"w; 1424 } 1425 1426 mixin EncoderFunctions; 1427} 1428 1429//============================================================================= 1430// UTF-32 1431//============================================================================= 1432 1433template EncoderInstance(CharType : dchar) 1434{ 1435 alias E = dchar; 1436 alias EString = immutable(dchar)[]; 1437 1438 @property string encodingName() @safe pure nothrow @nogc 1439 { 1440 return "UTF-32"; 1441 } 1442 1443 bool canEncode(dchar c) @safe pure @nogc nothrow 1444 { 1445 return isValidCodePoint(c); 1446 } 1447 1448 bool isValidCodeUnit(dchar c) @safe pure @nogc nothrow 1449 { 
1450 return isValidCodePoint(c); 1451 } 1452 1453 size_t encodedLength(dchar c) @safe pure @nogc nothrow 1454 in 1455 { 1456 assert(canEncode(c)); 1457 } 1458 body 1459 { 1460 return 1; 1461 } 1462 1463 void encodeViaWrite()(dchar c) 1464 { 1465 write(c); 1466 } 1467 1468 void skipViaRead()() 1469 { 1470 read(); 1471 } 1472 1473 dchar decodeViaRead()() 1474 { 1475 return cast(dchar) read(); 1476 } 1477 1478 dchar safeDecodeViaRead()() 1479 { 1480 immutable c = read(); 1481 return isValidCodePoint(c) ? c : INVALID_SEQUENCE; 1482 } 1483 1484 dchar decodeReverseViaRead()() 1485 { 1486 return cast(dchar) read(); 1487 } 1488 1489 @property EString replacementSequence() @safe pure nothrow @nogc 1490 { 1491 return "\uFFFD"d; 1492 } 1493 1494 mixin EncoderFunctions; 1495} 1496 1497//============================================================================= 1498// Below are forwarding functions which expose the function to the user 1499 1500/** 1501Returns true if c is a valid code point 1502 1503 Note that this includes the non-character code points U+FFFE and U+FFFF, 1504 since these are valid code points (even though they are not valid 1505 characters). 1506 1507 Supersedes: 1508 This function supersedes $(D std.utf.startsValidDchar()). 1509 1510 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1511 WINDOWS-1252 1512 1513 Params: 1514 c = the code point to be tested 1515 */ 1516bool isValidCodePoint(dchar c) @safe pure nothrow @nogc 1517{ 1518 return c < 0xD800 || (c >= 0xE000 && c < 0x110000); 1519} 1520 1521/** 1522 Returns the name of an encoding. 1523 1524 The type of encoding cannot be deduced. Therefore, it is necessary to 1525 explicitly specify the encoding type. 
1526 1527 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1528 WINDOWS-1252 1529 */ 1530@property string encodingName(T)() 1531{ 1532 return EncoderInstance!(T).encodingName; 1533} 1534 1535/// 1536@safe unittest 1537{ 1538 assert(encodingName!(char) == "UTF-8"); 1539 assert(encodingName!(wchar) == "UTF-16"); 1540 assert(encodingName!(dchar) == "UTF-32"); 1541 assert(encodingName!(AsciiChar) == "ASCII"); 1542 assert(encodingName!(Latin1Char) == "ISO-8859-1"); 1543 assert(encodingName!(Latin2Char) == "ISO-8859-2"); 1544 assert(encodingName!(Windows1250Char) == "windows-1250"); 1545 assert(encodingName!(Windows1252Char) == "windows-1252"); 1546} 1547 1548/** 1549 Returns true iff it is possible to represent the specified codepoint 1550 in the encoding. 1551 1552 The type of encoding cannot be deduced. Therefore, it is necessary to 1553 explicitly specify the encoding type. 1554 1555 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1556 WINDOWS-1252 1557 */ 1558bool canEncode(E)(dchar c) 1559{ 1560 return EncoderInstance!(E).canEncode(c); 1561} 1562 1563/// 1564@safe pure unittest 1565{ 1566 assert( canEncode!(Latin1Char)('A')); 1567 assert( canEncode!(Latin2Char)('A')); 1568 assert(!canEncode!(AsciiChar)('\u00A0')); 1569 assert( canEncode!(Latin1Char)('\u00A0')); 1570 assert( canEncode!(Latin2Char)('\u00A0')); 1571 assert( canEncode!(Windows1250Char)('\u20AC')); 1572 assert(!canEncode!(Windows1250Char)('\u20AD')); 1573 assert(!canEncode!(Windows1250Char)('\uFFFD')); 1574 assert( canEncode!(Windows1252Char)('\u20AC')); 1575 assert(!canEncode!(Windows1252Char)('\u20AD')); 1576 assert(!canEncode!(Windows1252Char)('\uFFFD')); 1577 assert(!canEncode!(char)(cast(dchar) 0x110000)); 1578} 1579 1580/// How to check an entire string 1581@safe pure unittest 1582{ 1583 import std.algorithm.searching : find; 1584 import std.utf : byDchar; 1585 1586 assert("The quick brown fox" 1587 .byDchar 1588 .find!(x => !canEncode!AsciiChar(x)) 1589 
.empty); 1590} 1591 1592/** 1593 Returns true if the code unit is legal. For example, the byte 0x80 would 1594 not be legal in ASCII, because ASCII code units must always be in the range 1595 0x00 to 0x7F. 1596 1597 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1598 WINDOWS-1252 1599 1600 Params: 1601 c = the code unit to be tested 1602 */ 1603bool isValidCodeUnit(E)(E c) 1604{ 1605 return EncoderInstance!(E).isValidCodeUnit(c); 1606} 1607 1608/// 1609@system pure unittest 1610{ 1611 assert(!isValidCodeUnit(cast(char) 0xC0)); 1612 assert(!isValidCodeUnit(cast(char) 0xFF)); 1613 assert( isValidCodeUnit(cast(wchar) 0xD800)); 1614 assert(!isValidCodeUnit(cast(dchar) 0xD800)); 1615 assert(!isValidCodeUnit(cast(AsciiChar) 0xA0)); 1616 assert( isValidCodeUnit(cast(Windows1250Char) 0x80)); 1617 assert(!isValidCodeUnit(cast(Windows1250Char) 0x81)); 1618 assert( isValidCodeUnit(cast(Windows1252Char) 0x80)); 1619 assert(!isValidCodeUnit(cast(Windows1252Char) 0x81)); 1620} 1621 1622/** 1623 Returns true if the string is encoded correctly 1624 1625 Supersedes: 1626 This function supersedes std.utf.validate(), however note that this 1627 function returns a bool indicating whether the input was valid or not, 1628 whereas the older function would throw an exception. 1629 1630 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1631 WINDOWS-1252 1632 1633 Params: 1634 s = the string to be tested 1635 */ 1636bool isValid(E)(const(E)[] s) 1637{ 1638 return s.length == validLength(s); 1639} 1640 1641/// 1642@system pure unittest 1643{ 1644 assert( isValid("\u20AC100")); 1645 assert(!isValid(cast(char[3])[167, 133, 175])); 1646} 1647 1648/** 1649 Returns the length of the longest possible substring, starting from 1650 the first code unit, which is validly encoded. 
1651 1652 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1653 WINDOWS-1252 1654 1655 Params: 1656 s = the string to be tested 1657 */ 1658size_t validLength(E)(const(E)[] s) 1659{ 1660 size_t result, before = void; 1661 while ((before = s.length) > 0) 1662 { 1663 if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE) 1664 break; 1665 result += before - s.length; 1666 } 1667 return result; 1668} 1669 1670/** 1671 Sanitizes a string by replacing malformed code unit sequences with valid 1672 code unit sequences. The result is guaranteed to be valid for this encoding. 1673 1674 If the input string is already valid, this function returns the original, 1675 otherwise it constructs a new string by replacing all illegal code unit 1676 sequences with the encoding's replacement character, Invalid sequences will 1677 be replaced with the Unicode replacement character (U+FFFD) if the 1678 character repertoire contains it, otherwise invalid sequences will be 1679 replaced with '?'. 1680 1681 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1682 WINDOWS-1252 1683 1684 Params: 1685 s = the string to be sanitized 1686 */ 1687immutable(E)[] sanitize(E)(immutable(E)[] s) 1688{ 1689 size_t n = validLength(s); 1690 if (n == s.length) return s; 1691 1692 auto repSeq = EncoderInstance!(E).replacementSequence; 1693 1694 // Count how long the string needs to be. 1695 // Overestimating is not a problem 1696 size_t len = s.length; 1697 const(E)[] t = s[n..$]; 1698 while (t.length != 0) 1699 { 1700 immutable c = EncoderInstance!(E).safeDecode(t); 1701 assert(c == INVALID_SEQUENCE); 1702 len += repSeq.length; 1703 t = t[validLength(t)..$]; 1704 } 1705 1706 // Now do the write 1707 E[] array = new E[len]; 1708 array[0 .. n] = s[0 .. n]; 1709 size_t offset = n; 1710 1711 t = s[n..$]; 1712 while (t.length != 0) 1713 { 1714 immutable c = EncoderInstance!(E).safeDecode(t); 1715 assert(c == INVALID_SEQUENCE); 1716 array[offset .. 
offset+repSeq.length] = repSeq[]; 1717 offset += repSeq.length; 1718 n = validLength(t); 1719 array[offset .. offset+n] = t[0 .. n]; 1720 offset += n; 1721 t = t[n..$]; 1722 } 1723 return cast(immutable(E)[])array[0 .. offset]; 1724} 1725 1726/// 1727@system pure unittest 1728{ 1729 assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld"); 1730} 1731 1732/** 1733 Returns the length of the first encoded sequence. 1734 1735 The input to this function MUST be validly encoded. 1736 This is enforced by the function's in-contract. 1737 1738 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1739 WINDOWS-1252 1740 1741 Params: 1742 s = the string to be sliced 1743 */ 1744size_t firstSequence(E)(const(E)[] s) 1745in 1746{ 1747 assert(s.length != 0); 1748 const(E)[] u = s; 1749 assert(safeDecode(u) != INVALID_SEQUENCE); 1750} 1751body 1752{ 1753 auto before = s.length; 1754 EncoderInstance!(E).skip(s); 1755 return before - s.length; 1756} 1757 1758/// 1759@system pure unittest 1760{ 1761 assert(firstSequence("\u20AC1000") == "\u20AC".length); 1762 assert(firstSequence("hel") == "h".length); 1763} 1764 1765/** 1766 Returns the length of the last encoded sequence. 1767 1768 The input to this function MUST be validly encoded. 1769 This is enforced by the function's in-contract. 1770 1771 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1772 WINDOWS-1252 1773 1774 Params: 1775 s = the string to be sliced 1776 */ 1777size_t lastSequence(E)(const(E)[] s) 1778in 1779{ 1780 assert(s.length != 0); 1781 assert(isValid(s)); 1782} 1783body 1784{ 1785 const(E)[] t = s; 1786 EncoderInstance!(E).decodeReverse(s); 1787 return t.length - s.length; 1788} 1789 1790/// 1791@system pure unittest 1792{ 1793 assert(lastSequence("1000\u20AC") == "\u20AC".length); 1794 assert(lastSequence("hell��") == "��".length); 1795} 1796 1797/** 1798 Returns the array index at which the (n+1)th code point begins. 
1799 1800 The input to this function MUST be validly encoded. 1801 This is enforced by the function's in-contract. 1802 1803 Supersedes: 1804 This function supersedes std.utf.toUTFindex(). 1805 1806 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1807 WINDOWS-1252 1808 1809 Params: 1810 s = the string to be counted 1811 n = the current code point index 1812 */ 1813ptrdiff_t index(E)(const(E)[] s,int n) 1814in 1815{ 1816 assert(isValid(s)); 1817 assert(n >= 0); 1818} 1819body 1820{ 1821 const(E)[] t = s; 1822 for (size_t i=0; i<n; ++i) EncoderInstance!(E).skip(s); 1823 return t.length - s.length; 1824} 1825 1826/// 1827@system pure unittest 1828{ 1829 assert(index("\u20AC100",1) == 3); 1830 assert(index("h��llo",2) == 3); 1831} 1832 1833/** 1834 Decodes a single code point. 1835 1836 This function removes one or more code units from the start of a string, 1837 and returns the decoded code point which those code units represent. 1838 1839 The input to this function MUST be validly encoded. 1840 This is enforced by the function's in-contract. 1841 1842 Supersedes: 1843 This function supersedes std.utf.decode(), however, note that the 1844 function codePoints() supersedes it more conveniently. 1845 1846 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 1847 WINDOWS-1252 1848 1849 Params: 1850 s = the string whose first code point is to be decoded 1851 */ 1852dchar decode(S)(ref S s) 1853in 1854{ 1855 assert(s.length != 0); 1856 auto u = s; 1857 assert(safeDecode(u) != INVALID_SEQUENCE); 1858} 1859body 1860{ 1861 return EncoderInstance!(typeof(s[0])).decode(s); 1862} 1863 1864/** 1865 Decodes a single code point from the end of a string. 1866 1867 This function removes one or more code units from the end of a string, 1868 and returns the decoded code point which those code units represent. 1869 1870 The input to this function MUST be validly encoded. 1871 This is enforced by the function's in-contract. 
 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string whose last code point is to be decoded
 */
dchar decodeReverse(E)(ref const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    return EncoderInstance!(E).decodeReverse(s);
}

/**
 Decodes a single code point. The input does not have to be valid.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 This function will accept an invalidly encoded string as input.
 If an invalid sequence is found at the start of the string, this
 function will remove it, and return the value INVALID_SEQUENCE.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded. It must not be
        empty; this is enforced by the function's in-contract.
 */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
body
{
    return EncoderInstance!(typeof(s[0])).safeDecode(s);
}

/**
 Returns the number of code units required to encode a single code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    return EncoderInstance!(E).encodedLength(c);
}

/**
 Encodes a single code point.

 This function encodes a single code point into one or more code units.
 It returns a string containing those code units.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    return EncoderInstance!(E).encode(c);
}

/**
 Encodes a single code point into an array.

 This function encodes a single code point into one or more code units.
 The code units are stored in a user-supplied fixed-size array,
 which must be passed by reference.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.
 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be encoded
    array = the destination array

 Returns:
    the number of code units written to the array
 */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // Encode through a second slice of the same memory; the encoder shrinks
    // t from the front, so the length difference is the amount written.
    E[] t = array;
    EncoderInstance!(E).encode(c,t);
    return array.length - t.length;
}

/*
Encodes $(D c) in units of type $(D E) and writes the result to the
output range $(D R). Returns the number of $(D E)s written.

Only char, wchar and dchar are supported for $(D E); any other type fails
the final static assert. Unlike the other overloads, this one has no
in-contract: the char branch hits assert(0) for c > 0x10FFFF, while the
wchar and dchar branches perform no range check at all.
 */
size_t encode(E, R)(dchar c, auto ref R range)
if (isNativeOutputRange!(R, E))
{
    static if (is(Unqual!E == char))
    {
        if (c <= 0x7F)
        {
            put(range, cast(char) c);
            return 1;
        }
        if (c <= 0x7FF)
        {
            put(range, cast(char)(0xC0 | (c >> 6)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 2;
        }
        if (c <= 0xFFFF)
        {
            put(range, cast(char)(0xE0 | (c >> 12)));
            put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 3;
        }
        if (c <= 0x10FFFF)
        {
            put(range, cast(char)(0xF0 | (c >> 18)));
            put(range, cast(char)(0x80 | ((c >> 12) & 0x3F)));
            put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 4;
        }
        else
        {
            assert(0);
        }
    }
    else static if (is(Unqual!E == wchar))
    {
        if (c <= 0xFFFF)
        {
            range.put(cast(wchar) c);
            return 1;
        }
        // Surrogate pair: high ten bits first, then low ten bits.
        range.put(cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800));
        range.put(cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00));
        return 2;
    }
    else static if (is(Unqual!E == dchar))
    {
        range.put(c);
        return 1;
    }
    else
    {
        static assert(0);
    }
}

@safe pure unittest
{
    import std.array;
    Appender!(char[]) r;
    assert(encode!(char)('T', r) == 1);
    assert(encode!(wchar)('T', r) == 1);
    assert(encode!(dchar)('T', r) == 1);
}

/**
 Encodes a single code point to a delegate.

 This function encodes a single code point into one or more code units.
 The code units are passed one at a time to the supplied delegate.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be encoded
    dg = the delegate to invoke for each code unit
 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
body
{
    EncoderInstance!(E).encode(c,dg);
}

/**
Encodes the contents of $(D s) in units of type $(D Tgt), writing the result to an
output range.

Returns: The number of $(D Tgt) elements written.
Params:
Tgt = Element type of $(D range).
s = Input array.
range = Output range.
 */
size_t encode(Tgt, Src, R)(in Src[] s, R range)
{
    size_t result;
    // NOTE(review): this loop passes each element of s to encode as if it
    // were a code point. For a multi-unit source encoding (e.g. Src = char
    // with non-ASCII text) the elements look like code units rather than
    // decoded code points - confirm whether callers are expected to pass
    // already-decoded input.
    foreach (c; s)
    {
        result += encode!(Tgt)(c, range);
    }
    return result;
}

/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code points in a string.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 You can foreach either
 with or without an index.
 If an index is specified, it will be initialized
 at each iteration with the offset into the string at which the code point
 begins.

 Supersedes:
 This function supersedes std.utf.decode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be decoded

 Example:
 --------------------------------------------------------
 string s = "hello world";
 foreach (c;codePoints(s))
 {
     // do something with c (which will always be a dchar)
 }
 --------------------------------------------------------

 Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s)
 in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    assert(isValid(s));
}
body
{
    return CodePoints!(E)(s);
}

///
@system unittest
{
    string s = "hello";
    string t;
    foreach (c;codePoints(s))
    {
        t ~= cast(char) c;
    }
    assert(s == t);
}

/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code units in a code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding type in the template parameter.

 Supersedes:
 This function supersedes std.utf.encode().
2203 2204 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 2205 WINDOWS-1252 2206 2207 Params: 2208 c = the code point to be encoded 2209 */ 2210CodeUnits!(E) codeUnits(E)(dchar c) 2211in 2212{ 2213 assert(isValidCodePoint(c)); 2214} 2215body 2216{ 2217 return CodeUnits!(E)(c); 2218} 2219 2220/// 2221@system unittest 2222{ 2223 char[] a; 2224 foreach (c;codeUnits!(char)(cast(dchar)'\u20AC')) 2225 { 2226 a ~= c; 2227 } 2228 assert(a.length == 3); 2229 assert(a[0] == 0xE2); 2230 assert(a[1] == 0x82); 2231 assert(a[2] == 0xAC); 2232} 2233 2234/** 2235 Convert a string from one encoding to another. 2236 2237 Supersedes: 2238 This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and 2239 std.utf.toUTF32() 2240 (but note that to!() supersedes it more conveniently). 2241 2242 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, 2243 WINDOWS-1252 2244 2245 Params: 2246 s = Source string. $(B Must) be validly encoded. 2247 This is enforced by the function's in-contract. 2248 r = Destination string 2249 2250 See_Also: 2251 $(REF to, std,conv) 2252 */ 2253void transcode(Src, Dst)(Src[] s, out Dst[] r) 2254in 2255{ 2256 assert(isValid(s)); 2257} 2258body 2259{ 2260 static if (is(Src == Dst) && is(Src == immutable)) 2261 { 2262 r = s; 2263 } 2264 else static if (is(Unqual!Src == AsciiChar)) 2265 { 2266 transcode(cast(const(char)[])s, r); 2267 } 2268 else 2269 { 2270 static if (is(Unqual!Dst == wchar)) 2271 { 2272 immutable minReservePlace = 2; 2273 } 2274 else static if (is(Unqual!Dst == dchar)) 2275 { 2276 immutable minReservePlace = 1; 2277 } 2278 else 2279 { 2280 immutable minReservePlace = 6; 2281 } 2282 2283 auto buffer = new Unqual!Dst[s.length]; 2284 auto tmpBuffer = buffer; 2285 2286 while (s.length != 0) 2287 { 2288 if (tmpBuffer.length < minReservePlace) 2289 { 2290 size_t prevLength = buffer.length; 2291 buffer.length += s.length + minReservePlace; 2292 tmpBuffer = buffer[prevLength - tmpBuffer.length .. 
$]; 2293 } 2294 EncoderInstance!(Unqual!Dst).encode(decode(s), tmpBuffer); 2295 } 2296 2297 r = cast(Dst[]) buffer[0 .. buffer.length - tmpBuffer.length]; 2298 } 2299} 2300 2301/// 2302@system pure unittest 2303{ 2304 wstring ws; 2305 // transcode from UTF-8 to UTF-16 2306 transcode("hello world",ws); 2307 assert(ws == "hello world"w); 2308 2309 Latin1String ls; 2310 // transcode from UTF-16 to ISO-8859-1 2311 transcode(ws, ls); 2312 assert(ws == "hello world"); 2313} 2314 2315@system pure unittest 2316{ 2317 import std.meta; 2318 import std.range; 2319 { 2320 import std.conv : to; 2321 2322 string asciiCharString = to!string(iota(0, 128, 1)); 2323 2324 alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString, 2325 Windows1250String, Windows1252String, dstring, wstring); 2326 foreach (S; Types) 2327 foreach (D; Types) 2328 { 2329 string str; 2330 S sStr; 2331 D dStr; 2332 transcode(asciiCharString, sStr); 2333 transcode(sStr, dStr); 2334 transcode(dStr, str); 2335 assert(asciiCharString == str); 2336 } 2337 } 2338 { 2339 string czechChars = "P����li�� ��lu��ou��k�� k���� ��p��l ����belsk�� ��dy."; 2340 alias Types = AliasSeq!(string, dstring, wstring); 2341 foreach (S; Types) 2342 foreach (D; Types) 2343 { 2344 string str; 2345 S sStr; 2346 D dStr; 2347 transcode(czechChars, sStr); 2348 transcode(sStr, dStr); 2349 transcode(dStr, str); 2350 assert(czechChars == str); 2351 } 2352 } 2353} 2354 2355@system unittest // mutable/const input/output 2356{ 2357 import std.meta : AliasSeq; 2358 2359 foreach (O; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char)) 2360 { 2361 O[] output; 2362 2363 char[] mutableInput = "��bc".dup; 2364 transcode(mutableInput, output); 2365 assert(output == [0xE4, 'b', 'c']); 2366 2367 const char[] constInput = "��bc"; 2368 transcode(constInput, output); 2369 assert(output == [0xF6, 'b', 'c']); 2370 2371 immutable char[] immutInput = "��bc"; 2372 transcode(immutInput, output); 2373 assert(output == [0xFC, 'b', 'c']); 
2374 } 2375 2376 // Make sure that const/mutable input is copied. 2377 foreach (C; AliasSeq!(char, const char)) 2378 { 2379 C[] input = "foo".dup; 2380 C[] output; 2381 transcode(input, output); 2382 assert(input == output); 2383 assert(input !is output); 2384 } 2385 2386 // But immutable input should not be copied. 2387 string input = "foo"; 2388 string output; 2389 transcode(input, output); 2390 assert(input is output); 2391} 2392 2393//============================================================================= 2394 2395/** The base class for exceptions thrown by this module */ 2396class EncodingException : Exception { this(string msg) @safe pure { super(msg); } } 2397 2398class UnrecognizedEncodingException : EncodingException 2399{ 2400 private this(string msg) @safe pure { super(msg); } 2401} 2402 2403/** Abstract base class of all encoding schemes */ 2404abstract class EncodingScheme 2405{ 2406 import std.uni : toLower; 2407 2408 /** 2409 * Registers a subclass of EncodingScheme. 2410 * 2411 * This function allows user-defined subclasses of EncodingScheme to 2412 * be declared in other modules. 2413 * 2414 * Params: 2415 * Klass = The subclass of EncodingScheme to register. 
2416 * 2417 * Example: 2418 * ---------------------------------------------- 2419 * class Amiga1251 : EncodingScheme 2420 * { 2421 * shared static this() 2422 * { 2423 * EncodingScheme.register!Amiga1251; 2424 * } 2425 * } 2426 * ---------------------------------------------- 2427 */ 2428 static void register(Klass:EncodingScheme)() 2429 { 2430 scope scheme = new Klass(); 2431 foreach (encodingName;scheme.names()) 2432 { 2433 supported[toLower(encodingName)] = () => new Klass(); 2434 } 2435 } 2436 2437 deprecated("Please pass the EncodingScheme subclass as template argument instead.") 2438 static void register(string className) 2439 { 2440 auto scheme = cast(EncodingScheme) ClassInfo.find(className).create(); 2441 if (scheme is null) 2442 throw new EncodingException("Unable to create class "~className); 2443 foreach (encodingName;scheme.names()) 2444 { 2445 supportedFactories[toLower(encodingName)] = className; 2446 } 2447 } 2448 2449 /** 2450 * Obtains a subclass of EncodingScheme which is capable of encoding 2451 * and decoding the named encoding scheme. 2452 * 2453 * This function is only aware of EncodingSchemes which have been 2454 * registered with the register() function. 
2455 * 2456 * Example: 2457 * --------------------------------------------------- 2458 * auto scheme = EncodingScheme.create("Amiga-1251"); 2459 * --------------------------------------------------- 2460 */ 2461 static EncodingScheme create(string encodingName) 2462 { 2463 static bool registerDefaultEncodings() 2464 { 2465 EncodingScheme.register!EncodingSchemeASCII; 2466 EncodingScheme.register!EncodingSchemeLatin1; 2467 EncodingScheme.register!EncodingSchemeLatin2; 2468 EncodingScheme.register!EncodingSchemeWindows1250; 2469 EncodingScheme.register!EncodingSchemeWindows1252; 2470 EncodingScheme.register!EncodingSchemeUtf8; 2471 EncodingScheme.register!EncodingSchemeUtf16Native; 2472 EncodingScheme.register!EncodingSchemeUtf32Native; 2473 return true; 2474 } 2475 2476 static shared bool initialized; 2477 import std.concurrency : initOnce; 2478 initOnce!initialized(registerDefaultEncodings()); 2479 encodingName = toLower(encodingName); 2480 2481 if (auto p = encodingName in supported) 2482 return (*p)(); 2483 2484 auto p = encodingName in supportedFactories; 2485 if (p is null) 2486 throw new EncodingException("Unrecognized Encoding: "~encodingName); 2487 string className = *p; 2488 auto scheme = cast(EncodingScheme) ClassInfo.find(className).create(); 2489 if (scheme is null) throw new EncodingException("Unable to create class "~className); 2490 return scheme; 2491 } 2492 2493 const 2494 { 2495 /** 2496 * Returns the standard name of the encoding scheme 2497 */ 2498 abstract override string toString(); 2499 2500 /** 2501 * Returns an array of all known names for this encoding scheme 2502 */ 2503 abstract string[] names(); 2504 2505 /** 2506 * Returns true if the character c can be represented 2507 * in this encoding scheme. 2508 */ 2509 abstract bool canEncode(dchar c); 2510 2511 /** 2512 * Returns the number of ubytes required to encode this code point. 2513 * 2514 * The input to this function MUST be a valid code point. 
2515 * 2516 * Params: 2517 * c = the code point to be encoded 2518 * 2519 * Returns: 2520 * the number of ubytes required. 2521 */ 2522 abstract size_t encodedLength(dchar c); 2523 2524 /** 2525 * Encodes a single code point into a user-supplied, fixed-size buffer. 2526 * 2527 * This function encodes a single code point into one or more ubytes. 2528 * The supplied buffer must be code unit aligned. 2529 * (For example, UTF-16LE or UTF-16BE must be wchar-aligned, 2530 * UTF-32LE or UTF-32BE must be dchar-aligned, etc.) 2531 * 2532 * The input to this function MUST be a valid code point. 2533 * 2534 * Params: 2535 * c = the code point to be encoded 2536 * buffer = the destination array 2537 * 2538 * Returns: 2539 * the number of ubytes written. 2540 */ 2541 abstract size_t encode(dchar c, ubyte[] buffer); 2542 2543 /** 2544 * Decodes a single code point. 2545 * 2546 * This function removes one or more ubytes from the start of an array, 2547 * and returns the decoded code point which those ubytes represent. 2548 * 2549 * The input to this function MUST be validly encoded. 2550 * 2551 * Params: 2552 * s = the array whose first code point is to be decoded 2553 */ 2554 abstract dchar decode(ref const(ubyte)[] s); 2555 2556 /** 2557 * Decodes a single code point. The input does not have to be valid. 2558 * 2559 * This function removes one or more ubytes from the start of an array, 2560 * and returns the decoded code point which those ubytes represent. 2561 * 2562 * This function will accept an invalidly encoded array as input. 2563 * If an invalid sequence is found at the start of the string, this 2564 * function will remove it, and return the value INVALID_SEQUENCE. 2565 * 2566 * Params: 2567 * s = the array whose first code point is to be decoded 2568 */ 2569 abstract dchar safeDecode(ref const(ubyte)[] s); 2570 2571 /** 2572 * Returns the sequence of ubytes to be used to represent 2573 * any character which cannot be represented in the encoding scheme. 
2574 * 2575 * Normally this will be a representation of some substitution 2576 * character, such as U+FFFD or '?'. 2577 */ 2578 abstract @property immutable(ubyte)[] replacementSequence(); 2579 } 2580 2581 /** 2582 * Returns true if the array is encoded correctly 2583 * 2584 * Params: 2585 * s = the array to be tested 2586 */ 2587 bool isValid(const(ubyte)[] s) 2588 { 2589 while (s.length != 0) 2590 { 2591 if (safeDecode(s) == INVALID_SEQUENCE) 2592 return false; 2593 } 2594 return true; 2595 } 2596 2597 /** 2598 * Returns the length of the longest possible substring, starting from 2599 * the first element, which is validly encoded. 2600 * 2601 * Params: 2602 * s = the array to be tested 2603 */ 2604 size_t validLength()(const(ubyte)[] s) 2605 { 2606 const(ubyte)[] r = s; 2607 const(ubyte)[] t = s; 2608 while (s.length != 0) 2609 { 2610 if (safeDecode(s) == INVALID_SEQUENCE) break; 2611 t = s; 2612 } 2613 return r.length - t.length; 2614 } 2615 2616 /** 2617 * Sanitizes an array by replacing malformed ubyte sequences with valid 2618 * ubyte sequences. The result is guaranteed to be valid for this 2619 * encoding scheme. 2620 * 2621 * If the input array is already valid, this function returns the 2622 * original, otherwise it constructs a new array by replacing all illegal 2623 * sequences with the encoding scheme's replacement sequence. 2624 * 2625 * Params: 2626 * s = the string to be sanitized 2627 */ 2628 immutable(ubyte)[] sanitize()(immutable(ubyte)[] s) 2629 { 2630 auto n = validLength(s); 2631 if (n == s.length) return s; 2632 2633 auto repSeq = replacementSequence; 2634 2635 // Count how long the string needs to be. 
2636 // Overestimating is not a problem 2637 auto len = s.length; 2638 const(ubyte)[] t = s[n..$]; 2639 while (t.length != 0) 2640 { 2641 immutable c = safeDecode(t); 2642 assert(c == INVALID_SEQUENCE); 2643 len += repSeq.length; 2644 t = t[validLength(t)..$]; 2645 } 2646 2647 // Now do the write 2648 ubyte[] array = new ubyte[len]; 2649 array[0 .. n] = s[0 .. n]; 2650 auto offset = n; 2651 2652 t = s[n..$]; 2653 while (t.length != 0) 2654 { 2655 immutable c = safeDecode(t); 2656 assert(c == INVALID_SEQUENCE); 2657 array[offset .. offset+repSeq.length] = repSeq[]; 2658 offset += repSeq.length; 2659 n = validLength(t); 2660 array[offset .. offset+n] = t[0 .. n]; 2661 offset += n; 2662 t = t[n..$]; 2663 } 2664 return cast(immutable(ubyte)[])array[0 .. offset]; 2665 } 2666 2667 /** 2668 * Returns the length of the first encoded sequence. 2669 * 2670 * The input to this function MUST be validly encoded. 2671 * This is enforced by the function's in-contract. 2672 * 2673 * Params: 2674 * s = the array to be sliced 2675 */ 2676 size_t firstSequence()(const(ubyte)[] s) 2677 in 2678 { 2679 assert(s.length != 0); 2680 const(ubyte)[] u = s; 2681 assert(safeDecode(u) != INVALID_SEQUENCE); 2682 } 2683 body 2684 { 2685 const(ubyte)[] t = s; 2686 decode(s); 2687 return t.length - s.length; 2688 } 2689 2690 /** 2691 * Returns the total number of code points encoded in a ubyte array. 2692 * 2693 * The input to this function MUST be validly encoded. 2694 * This is enforced by the function's in-contract. 2695 * 2696 * Params: 2697 * s = the string to be counted 2698 */ 2699 size_t count()(const(ubyte)[] s) 2700 in 2701 { 2702 assert(isValid(s)); 2703 } 2704 body 2705 { 2706 size_t n = 0; 2707 while (s.length != 0) 2708 { 2709 decode(s); 2710 ++n; 2711 } 2712 return n; 2713 } 2714 2715 /** 2716 * Returns the array index at which the (n+1)th code point begins. 2717 * 2718 * The input to this function MUST be validly encoded. 2719 * This is enforced by the function's in-contract. 
2720 * 2721 * Params: 2722 * s = the string to be counted 2723 * n = the current code point index 2724 */ 2725 ptrdiff_t index()(const(ubyte)[] s, size_t n) 2726 in 2727 { 2728 assert(isValid(s)); 2729 assert(n >= 0); 2730 } 2731 body 2732 { 2733 const(ubyte)[] t = s; 2734 for (size_t i=0; i<n; ++i) decode(s); 2735 return t.length - s.length; 2736 } 2737 2738 __gshared EncodingScheme function()[string] supported; 2739 __gshared string[string] supportedFactories; 2740} 2741 2742/** 2743 EncodingScheme to handle ASCII 2744 2745 This scheme recognises the following names: 2746 "ANSI_X3.4-1968", 2747 "ANSI_X3.4-1986", 2748 "ASCII", 2749 "IBM367", 2750 "ISO646-US", 2751 "ISO_646.irv:1991", 2752 "US-ASCII", 2753 "cp367", 2754 "csASCII" 2755 "iso-ir-6", 2756 "us" 2757 */ 2758class EncodingSchemeASCII : EncodingScheme 2759{ 2760 /* // moved to std.internal.phobosinit 2761 shared static this() 2762 { 2763 EncodingScheme.register("std.encoding.EncodingSchemeASCII"); 2764 }*/ 2765 2766 const 2767 { 2768 override string[] names() @safe pure nothrow 2769 { 2770 return 2771 [ 2772 "ANSI_X3.4-1968", 2773 "ANSI_X3.4-1986", 2774 "ASCII", 2775 "IBM367", 2776 "ISO646-US", 2777 "ISO_646.irv:1991", 2778 "US-ASCII", 2779 "cp367", 2780 "csASCII", 2781 "iso-ir-6", 2782 "us" 2783 ]; 2784 } 2785 2786 override string toString() @safe pure nothrow @nogc 2787 { 2788 return "ASCII"; 2789 } 2790 2791 override bool canEncode(dchar c) @safe pure nothrow @nogc 2792 { 2793 return std.encoding.canEncode!(AsciiChar)(c); 2794 } 2795 2796 override size_t encodedLength(dchar c) @safe pure nothrow @nogc 2797 { 2798 return std.encoding.encodedLength!(AsciiChar)(c); 2799 } 2800 2801 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc 2802 { 2803 auto r = cast(AsciiChar[]) buffer; 2804 return std.encoding.encode(c,r); 2805 } 2806 2807 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc 2808 { 2809 auto t = cast(const(AsciiChar)[]) s; 2810 dchar c = 
std.encoding.decode(t); 2811 s = s[$-t.length..$]; 2812 return c; 2813 } 2814 2815 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc 2816 { 2817 auto t = cast(const(AsciiChar)[]) s; 2818 dchar c = std.encoding.safeDecode(t); 2819 s = s[$-t.length..$]; 2820 return c; 2821 } 2822 2823 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc 2824 { 2825 return cast(immutable(ubyte)[])"?"; 2826 } 2827 } 2828} 2829 2830/** 2831 EncodingScheme to handle Latin-1 2832 2833 This scheme recognises the following names: 2834 "CP819", 2835 "IBM819", 2836 "ISO-8859-1", 2837 "ISO_8859-1", 2838 "ISO_8859-1:1987", 2839 "csISOLatin1", 2840 "iso-ir-100", 2841 "l1", 2842 "latin1" 2843 */ 2844class EncodingSchemeLatin1 : EncodingScheme 2845{ 2846 /* // moved to std.internal.phobosinit 2847 shared static this() 2848 { 2849 EncodingScheme.register("std.encoding.EncodingSchemeLatin1"); 2850 }*/ 2851 2852 const 2853 { 2854 override string[] names() @safe pure nothrow 2855 { 2856 return 2857 [ 2858 "CP819", 2859 "IBM819", 2860 "ISO-8859-1", 2861 "ISO_8859-1", 2862 "ISO_8859-1:1987", 2863 "csISOLatin1", 2864 "iso-ir-100", 2865 "l1", 2866 "latin1" 2867 ]; 2868 } 2869 2870 override string toString() @safe pure nothrow @nogc 2871 { 2872 return "ISO-8859-1"; 2873 } 2874 2875 override bool canEncode(dchar c) @safe pure nothrow @nogc 2876 { 2877 return std.encoding.canEncode!(Latin1Char)(c); 2878 } 2879 2880 override size_t encodedLength(dchar c) @safe pure nothrow @nogc 2881 { 2882 return std.encoding.encodedLength!(Latin1Char)(c); 2883 } 2884 2885 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc 2886 { 2887 auto r = cast(Latin1Char[]) buffer; 2888 return std.encoding.encode(c,r); 2889 } 2890 2891 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc 2892 { 2893 auto t = cast(const(Latin1Char)[]) s; 2894 dchar c = std.encoding.decode(t); 2895 s = s[$-t.length..$]; 2896 return c; 2897 } 2898 2899 override 
dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc 2900 { 2901 auto t = cast(const(Latin1Char)[]) s; 2902 dchar c = std.encoding.safeDecode(t); 2903 s = s[$-t.length..$]; 2904 return c; 2905 } 2906 2907 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc 2908 { 2909 return cast(immutable(ubyte)[])"?"; 2910 } 2911 } 2912} 2913 2914/** 2915 EncodingScheme to handle Latin-2 2916 2917 This scheme recognises the following names: 2918 "Latin 2", 2919 "ISO-8859-2", 2920 "ISO_8859-2", 2921 "ISO_8859-2:1999", 2922 "Windows-28592" 2923 */ 2924class EncodingSchemeLatin2 : EncodingScheme 2925{ 2926 /* // moved to std.internal.phobosinit 2927 shared static this() 2928 { 2929 EncodingScheme.register("std.encoding.EncodingSchemeLatin2"); 2930 }*/ 2931 2932 const 2933 { 2934 override string[] names() @safe pure nothrow 2935 { 2936 return 2937 [ 2938 "Latin 2", 2939 "ISO-8859-2", 2940 "ISO_8859-2", 2941 "ISO_8859-2:1999", 2942 "windows-28592" 2943 ]; 2944 } 2945 2946 override string toString() @safe pure nothrow @nogc 2947 { 2948 return "ISO-8859-2"; 2949 } 2950 2951 override bool canEncode(dchar c) @safe pure nothrow @nogc 2952 { 2953 return std.encoding.canEncode!(Latin2Char)(c); 2954 } 2955 2956 override size_t encodedLength(dchar c) @safe pure nothrow @nogc 2957 { 2958 return std.encoding.encodedLength!(Latin2Char)(c); 2959 } 2960 2961 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc 2962 { 2963 auto r = cast(Latin2Char[]) buffer; 2964 return std.encoding.encode(c,r); 2965 } 2966 2967 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc 2968 { 2969 auto t = cast(const(Latin2Char)[]) s; 2970 dchar c = std.encoding.decode(t); 2971 s = s[$-t.length..$]; 2972 return c; 2973 } 2974 2975 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc 2976 { 2977 auto t = cast(const(Latin2Char)[]) s; 2978 dchar c = std.encoding.safeDecode(t); 2979 s = s[$-t.length..$]; 2980 return c; 2981 } 
2982 2983 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc 2984 { 2985 return cast(immutable(ubyte)[])"?"; 2986 } 2987 } 2988} 2989 2990/** 2991 EncodingScheme to handle Windows-1250 2992 2993 This scheme recognises the following names: 2994 "windows-1250" 2995 */ 2996class EncodingSchemeWindows1250 : EncodingScheme 2997{ 2998 /* // moved to std.internal.phobosinit 2999 shared static this() 3000 { 3001 EncodingScheme.register("std.encoding.EncodingSchemeWindows1250"); 3002 }*/ 3003 3004 const 3005 { 3006 override string[] names() @safe pure nothrow 3007 { 3008 return 3009 [ 3010 "windows-1250" 3011 ]; 3012 } 3013 3014 override string toString() @safe pure nothrow @nogc 3015 { 3016 return "windows-1250"; 3017 } 3018 3019 override bool canEncode(dchar c) @safe pure nothrow @nogc 3020 { 3021 return std.encoding.canEncode!(Windows1250Char)(c); 3022 } 3023 3024 override size_t encodedLength(dchar c) @safe pure nothrow @nogc 3025 { 3026 return std.encoding.encodedLength!(Windows1250Char)(c); 3027 } 3028 3029 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc 3030 { 3031 auto r = cast(Windows1250Char[]) buffer; 3032 return std.encoding.encode(c,r); 3033 } 3034 3035 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc 3036 { 3037 auto t = cast(const(Windows1250Char)[]) s; 3038 dchar c = std.encoding.decode(t); 3039 s = s[$-t.length..$]; 3040 return c; 3041 } 3042 3043 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc 3044 { 3045 auto t = cast(const(Windows1250Char)[]) s; 3046 dchar c = std.encoding.safeDecode(t); 3047 s = s[$-t.length..$]; 3048 return c; 3049 } 3050 3051 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc 3052 { 3053 return cast(immutable(ubyte)[])"?"; 3054 } 3055 } 3056} 3057 3058/** 3059 EncodingScheme to handle Windows-1252 3060 3061 This scheme recognises the following names: 3062 "windows-1252" 3063 */ 3064class 
EncodingSchemeWindows1252 : EncodingScheme 3065{ 3066 /* // moved to std.internal.phobosinit 3067 shared static this() 3068 { 3069 EncodingScheme.register("std.encoding.EncodingSchemeWindows1252"); 3070 }*/ 3071 3072 const 3073 { 3074 override string[] names() @safe pure nothrow 3075 { 3076 return 3077 [ 3078 "windows-1252" 3079 ]; 3080 } 3081 3082 override string toString() @safe pure nothrow @nogc 3083 { 3084 return "windows-1252"; 3085 } 3086 3087 override bool canEncode(dchar c) @safe pure nothrow @nogc 3088 { 3089 return std.encoding.canEncode!(Windows1252Char)(c); 3090 } 3091 3092 override size_t encodedLength(dchar c) @safe pure nothrow @nogc 3093 { 3094 return std.encoding.encodedLength!(Windows1252Char)(c); 3095 } 3096 3097 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc 3098 { 3099 auto r = cast(Windows1252Char[]) buffer; 3100 return std.encoding.encode(c,r); 3101 } 3102 3103 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc 3104 { 3105 auto t = cast(const(Windows1252Char)[]) s; 3106 dchar c = std.encoding.decode(t); 3107 s = s[$-t.length..$]; 3108 return c; 3109 } 3110 3111 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc 3112 { 3113 auto t = cast(const(Windows1252Char)[]) s; 3114 dchar c = std.encoding.safeDecode(t); 3115 s = s[$-t.length..$]; 3116 return c; 3117 } 3118 3119 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc 3120 { 3121 return cast(immutable(ubyte)[])"?"; 3122 } 3123 } 3124} 3125 3126/** 3127 EncodingScheme to handle UTF-8 3128 3129 This scheme recognises the following names: 3130 "UTF-8" 3131 */ 3132class EncodingSchemeUtf8 : EncodingScheme 3133{ 3134 /* // moved to std.internal.phobosinit 3135 shared static this() 3136 { 3137 EncodingScheme.register("std.encoding.EncodingSchemeUtf8"); 3138 }*/ 3139 3140 const 3141 { 3142 override string[] names() @safe pure nothrow 3143 { 3144 return 3145 [ 3146 "UTF-8" 3147 ]; 3148 } 3149 3150 
override string toString() @safe pure nothrow @nogc 3151 { 3152 return "UTF-8"; 3153 } 3154 3155 override bool canEncode(dchar c) @safe pure nothrow @nogc 3156 { 3157 return std.encoding.canEncode!(char)(c); 3158 } 3159 3160 override size_t encodedLength(dchar c) @safe pure nothrow @nogc 3161 { 3162 return std.encoding.encodedLength!(char)(c); 3163 } 3164 3165 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc 3166 { 3167 auto r = cast(char[]) buffer; 3168 return std.encoding.encode(c,r); 3169 } 3170 3171 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc 3172 { 3173 auto t = cast(const(char)[]) s; 3174 dchar c = std.encoding.decode(t); 3175 s = s[$-t.length..$]; 3176 return c; 3177 } 3178 3179 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc 3180 { 3181 auto t = cast(const(char)[]) s; 3182 dchar c = std.encoding.safeDecode(t); 3183 s = s[$-t.length..$]; 3184 return c; 3185 } 3186 3187 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc 3188 { 3189 return cast(immutable(ubyte)[])"\uFFFD"; 3190 } 3191 } 3192} 3193 3194/** 3195 EncodingScheme to handle UTF-16 in native byte order 3196 3197 This scheme recognises the following names: 3198 "UTF-16LE" (little-endian architecture only) 3199 "UTF-16BE" (big-endian architecture only) 3200 */ 3201class EncodingSchemeUtf16Native : EncodingScheme 3202{ 3203 /* // moved to std.internal.phobosinit 3204 shared static this() 3205 { 3206 EncodingScheme.register("std.encoding.EncodingSchemeUtf16Native"); 3207 }*/ 3208 3209 const 3210 { 3211 version (LittleEndian) { enum string NAME = "UTF-16LE"; } 3212 version (BigEndian) { enum string NAME = "UTF-16BE"; } 3213 3214 override string[] names() @safe pure nothrow 3215 { 3216 return [ NAME ]; 3217 } 3218 3219 override string toString() @safe pure nothrow @nogc 3220 { 3221 return NAME; 3222 } 3223 3224 override bool canEncode(dchar c) @safe pure nothrow @nogc 3225 { 3226 return 
std.encoding.canEncode!(wchar)(c); 3227 } 3228 3229 override size_t encodedLength(dchar c) @safe pure nothrow @nogc 3230 { 3231 return std.encoding.encodedLength!(wchar)(c); 3232 } 3233 3234 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc 3235 { 3236 auto r = cast(wchar[]) buffer; 3237 return wchar.sizeof * std.encoding.encode(c,r); 3238 } 3239 3240 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc 3241 in 3242 { 3243 assert((s.length & 1) == 0); 3244 } 3245 body 3246 { 3247 auto t = cast(const(wchar)[]) s; 3248 dchar c = std.encoding.decode(t); 3249 s = s[$-t.length * wchar.sizeof..$]; 3250 return c; 3251 } 3252 3253 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc 3254 in 3255 { 3256 assert((s.length & 1) == 0); 3257 } 3258 body 3259 { 3260 auto t = cast(const(wchar)[]) s; 3261 dchar c = std.encoding.safeDecode(t); 3262 s = s[$-t.length * wchar.sizeof..$]; 3263 return c; 3264 } 3265 3266 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc 3267 { 3268 return cast(immutable(ubyte)[])"\uFFFD"w; 3269 } 3270 } 3271} 3272@system unittest 3273{ 3274 version (LittleEndian) 3275 { 3276 auto efrom = EncodingScheme.create("utf-16le"); 3277 ubyte[6] sample = [154,1, 155,1, 156,1]; 3278 } 3279 version (BigEndian) 3280 { 3281 auto efrom = EncodingScheme.create("utf-16be"); 3282 ubyte[6] sample = [1,154, 1,155, 1,156]; 3283 } 3284 const(ubyte)[] ub = cast(const(ubyte)[])sample; 3285 dchar dc = efrom.safeDecode(ub); 3286 assert(dc == 410); 3287 assert(ub.length == 4); 3288} 3289 3290/** 3291 EncodingScheme to handle UTF-32 in native byte order 3292 3293 This scheme recognises the following names: 3294 "UTF-32LE" (little-endian architecture only) 3295 "UTF-32BE" (big-endian architecture only) 3296 */ 3297class EncodingSchemeUtf32Native : EncodingScheme 3298{ 3299 /* // moved to std.internal.phobosinit 3300 shared static this() 3301 { 3302 
EncodingScheme.register("std.encoding.EncodingSchemeUtf32Native"); 3303 }*/ 3304 3305 const 3306 { 3307 version (LittleEndian) { enum string NAME = "UTF-32LE"; } 3308 version (BigEndian) { enum string NAME = "UTF-32BE"; } 3309 3310 override string[] names() @safe pure nothrow 3311 { 3312 return [ NAME ]; 3313 } 3314 3315 override string toString() @safe pure nothrow @nogc 3316 { 3317 return NAME; 3318 } 3319 3320 override bool canEncode(dchar c) @safe pure nothrow @nogc 3321 { 3322 return std.encoding.canEncode!(dchar)(c); 3323 } 3324 3325 override size_t encodedLength(dchar c) @safe pure nothrow @nogc 3326 { 3327 return std.encoding.encodedLength!(dchar)(c); 3328 } 3329 3330 override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc 3331 { 3332 auto r = cast(dchar[]) buffer; 3333 return dchar.sizeof * std.encoding.encode(c,r); 3334 } 3335 3336 override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc 3337 in 3338 { 3339 assert((s.length & 3) == 0); 3340 } 3341 body 3342 { 3343 auto t = cast(const(dchar)[]) s; 3344 dchar c = std.encoding.decode(t); 3345 s = s[$-t.length * dchar.sizeof..$]; 3346 return c; 3347 } 3348 3349 override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc 3350 in 3351 { 3352 assert((s.length & 3) == 0); 3353 } 3354 body 3355 { 3356 auto t = cast(const(dchar)[]) s; 3357 dchar c = std.encoding.safeDecode(t); 3358 s = s[$-t.length * dchar.sizeof..$]; 3359 return c; 3360 } 3361 3362 override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc 3363 { 3364 return cast(immutable(ubyte)[])"\uFFFD"d; 3365 } 3366 } 3367} 3368@system unittest 3369{ 3370 version (LittleEndian) 3371 { 3372 auto efrom = EncodingScheme.create("utf-32le"); 3373 ubyte[12] sample = [154,1,0,0, 155,1,0,0, 156,1,0,0]; 3374 } 3375 version (BigEndian) 3376 { 3377 auto efrom = EncodingScheme.create("utf-32be"); 3378 ubyte[12] sample = [0,0,1,154, 0,0,1,155, 0,0,1,156]; 3379 } 3380 const(ubyte)[] ub = 
cast(const(ubyte)[])sample; 3381 dchar dc = efrom.safeDecode(ub); 3382 assert(dc == 410); 3383 assert(ub.length == 8); 3384} 3385 3386//============================================================================= 3387 3388 3389// Helper functions 3390version (unittest) 3391{ 3392 void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r) 3393 { 3394 static if (is(Src == Dst)) 3395 { 3396 return s; 3397 } 3398 else static if (is(Src == AsciiChar)) 3399 { 3400 transcodeReverse!(char,Dst)(cast(string) s,r); 3401 } 3402 else 3403 { 3404 foreach_reverse (d;codePoints(s)) 3405 { 3406 foreach_reverse (c;codeUnits!(Dst)(d)) 3407 { 3408 r = c ~ r; 3409 } 3410 } 3411 } 3412 } 3413 3414 string makeReadable(string s) 3415 { 3416 string r = "\""; 3417 foreach (char c;s) 3418 { 3419 if (c >= 0x20 && c < 0x80) 3420 { 3421 r ~= c; 3422 } 3423 else 3424 { 3425 r ~= "\\x"; 3426 r ~= toHexDigit(c >> 4); 3427 r ~= toHexDigit(c); 3428 } 3429 } 3430 r ~= "\""; 3431 return r; 3432 } 3433 3434 string makeReadable(wstring s) 3435 { 3436 string r = "\""; 3437 foreach (wchar c;s) 3438 { 3439 if (c >= 0x20 && c < 0x80) 3440 { 3441 r ~= cast(char) c; 3442 } 3443 else 3444 { 3445 r ~= "\\u"; 3446 r ~= toHexDigit(c >> 12); 3447 r ~= toHexDigit(c >> 8); 3448 r ~= toHexDigit(c >> 4); 3449 r ~= toHexDigit(c); 3450 } 3451 } 3452 r ~= "\"w"; 3453 return r; 3454 } 3455 3456 string makeReadable(dstring s) 3457 { 3458 string r = "\""; 3459 foreach (dchar c; s) 3460 { 3461 if (c >= 0x20 && c < 0x80) 3462 { 3463 r ~= cast(char) c; 3464 } 3465 else if (c < 0x10000) 3466 { 3467 r ~= "\\u"; 3468 r ~= toHexDigit(c >> 12); 3469 r ~= toHexDigit(c >> 8); 3470 r ~= toHexDigit(c >> 4); 3471 r ~= toHexDigit(c); 3472 } 3473 else 3474 { 3475 r ~= "\\U00"; 3476 r ~= toHexDigit(c >> 20); 3477 r ~= toHexDigit(c >> 16); 3478 r ~= toHexDigit(c >> 12); 3479 r ~= toHexDigit(c >> 8); 3480 r ~= toHexDigit(c >> 4); 3481 r ~= toHexDigit(c); 3482 } 3483 } 3484 r ~= "\"d"; 3485 return r; 3486 } 3487 3488 char 
toHexDigit(int n) 3489 { 3490 return "0123456789ABCDEF"[n & 0xF]; 3491 } 3492} 3493 3494/** Definitions of common Byte Order Marks. 3495The elements of the $(D enum) can used as indices into $(D bomTable) to get 3496matching $(D BOMSeq). 3497*/ 3498enum BOM 3499{ 3500 none = 0, /// no BOM was found 3501 utf32be = 1, /// [0x00, 0x00, 0xFE, 0xFF] 3502 utf32le = 2, /// [0xFF, 0xFE, 0x00, 0x00] 3503 utf7 = 3, /* [0x2B, 0x2F, 0x76, 0x38] 3504 [0x2B, 0x2F, 0x76, 0x39], 3505 [0x2B, 0x2F, 0x76, 0x2B], 3506 [0x2B, 0x2F, 0x76, 0x2F], 3507 [0x2B, 0x2F, 0x76, 0x38, 0x2D] 3508 */ 3509 utf1 = 8, /// [0xF7, 0x64, 0x4C] 3510 utfebcdic = 9, /// [0xDD, 0x73, 0x66, 0x73] 3511 scsu = 10, /// [0x0E, 0xFE, 0xFF] 3512 bocu1 = 11, /// [0xFB, 0xEE, 0x28] 3513 gb18030 = 12, /// [0x84, 0x31, 0x95, 0x33] 3514 utf8 = 13, /// [0xEF, 0xBB, 0xBF] 3515 utf16be = 14, /// [0xFE, 0xFF] 3516 utf16le = 15 /// [0xFF, 0xFE] 3517} 3518 3519/// The type stored inside $(D bomTable). 3520alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence"); 3521 3522/** Mapping of a byte sequence to $(B Byte Order Mark (BOM)) 3523*/ 3524immutable bomTable = [ 3525 BOMSeq(BOM.none, null), 3526 BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])), 3527 BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])), 3528 BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])), 3529 BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])), 3530 BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])), 3531 BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])), 3532 BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])), 3533 BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])), 3534 BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])), 3535 BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])), 3536 BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])), 3537 BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])), 3538 BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])), 3539 
BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])), 3540 BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE])) 3541]; 3542 3543/** Returns a $(D BOMSeq) for a given $(D input). 3544If no $(D BOM) is present the $(D BOMSeq) for $(D BOM.none) is 3545returned. The $(D BOM) sequence at the beginning of the range will 3546not be comsumed from the passed range. If you pass a reference type 3547range make sure that $(D save) creates a deep copy. 3548 3549Params: 3550 input = The sequence to check for the $(D BOM) 3551 3552Returns: 3553 the found $(D BOMSeq) corresponding to the passed $(D input). 3554*/ 3555immutable(BOMSeq) getBOM(Range)(Range input) 3556if (isForwardRange!Range && is(Unqual!(ElementType!Range) == ubyte)) 3557{ 3558 import std.algorithm.searching : startsWith; 3559 foreach (it; bomTable[1 .. $]) 3560 { 3561 if (startsWith(input.save, it.sequence)) 3562 { 3563 return it; 3564 } 3565 } 3566 3567 return bomTable[0]; 3568} 3569 3570/// 3571@system unittest 3572{ 3573 import std.format : format; 3574 3575 auto ts = dchar(0x0000FEFF) ~ "Hello World"d; 3576 3577 auto entry = getBOM(cast(ubyte[]) ts); 3578 version (BigEndian) 3579 { 3580 assert(entry.schema == BOM.utf32be, format("%s", entry.schema)); 3581 } 3582 else 3583 { 3584 assert(entry.schema == BOM.utf32le, format("%s", entry.schema)); 3585 } 3586} 3587 3588@system unittest 3589{ 3590 import std.format : format; 3591 3592 foreach (idx, it; bomTable) 3593 { 3594 auto s = it[1] ~ cast(ubyte[])"hello world"; 3595 auto i = getBOM(s); 3596 assert(i[0] == bomTable[idx][0]); 3597 3598 if (idx < 4 || idx > 7) // get around the multiple utf7 bom's 3599 { 3600 assert(i[0] == BOM.init + idx); 3601 assert(i[1] == it[1]); 3602 } 3603 } 3604} 3605 3606@safe pure unittest 3607{ 3608 struct BOMInputRange 3609 { 3610 ubyte[] arr; 3611 3612 @property ubyte front() 3613 { 3614 return this.arr.front; 3615 } 3616 3617 @property bool empty() 3618 { 3619 return this.arr.empty; 3620 } 3621 3622 void popFront() 3623 { 3624 this.arr = 
this.arr[1 .. $]; 3625 } 3626 3627 @property typeof(this) save() 3628 { 3629 return this; 3630 } 3631 } 3632 3633 static assert( isInputRange!BOMInputRange); 3634 static assert(!isArray!BOMInputRange); 3635 3636 ubyte[] dummyEnd = [0,0,0,0]; 3637 3638 foreach (idx, it; bomTable[1 .. $]) 3639 { 3640 { 3641 auto ir = BOMInputRange(it.sequence.dup); 3642 3643 auto b = getBOM(ir); 3644 assert(b.schema == it.schema); 3645 assert(ir.arr == it.sequence); 3646 } 3647 3648 { 3649 auto noBom = it.sequence[0 .. 1].dup ~ dummyEnd; 3650 size_t oldLen = noBom.length; 3651 assert(oldLen - 4 < it.sequence.length); 3652 3653 auto ir = BOMInputRange(noBom.dup); 3654 auto b = getBOM(ir); 3655 assert(b.schema == BOM.none); 3656 assert(noBom.length == oldLen); 3657 } 3658 } 3659} 3660 3661/** Constant defining a fully decoded BOM */ 3662enum dchar utfBOM = 0xfeff; 3663