codecvt.cc revision 1.4
1// Locale support (codecvt) -*- C++ -*- 2 3// Copyright (C) 2015-2018 Free Software Foundation, Inc. 4// 5// This file is part of the GNU ISO C++ Library. This library is free 6// software; you can redistribute it and/or modify it under the 7// terms of the GNU General Public License as published by the 8// Free Software Foundation; either version 3, or (at your option) 9// any later version. 10 11// This library is distributed in the hope that it will be useful, 12// but WITHOUT ANY WARRANTY; without even the implied warranty of 13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14// GNU General Public License for more details. 15 16// Under Section 7 of GPL version 3, you are granted additional 17// permissions described in the GCC Runtime Library Exception, version 18// 3.1, as published by the Free Software Foundation. 19 20// You should have received a copy of the GNU General Public License and 21// a copy of the GCC Runtime Library Exception along with this program; 22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23// <http://www.gnu.org/licenses/>. 24 25#include <codecvt> 26#include <cstring> // std::memcpy, std::memcmp 27#include <bits/stl_algobase.h> // std::min 28 29#ifdef _GLIBCXX_USE_C99_STDINT_TR1 30namespace std _GLIBCXX_VISIBILITY(default) 31{ 32_GLIBCXX_BEGIN_NAMESPACE_VERSION 33 34 // The standard doesn't define these operators, which is annoying. 35 static underlying_type<codecvt_mode>::type 36 to_integer(codecvt_mode m) 37 { return static_cast<underlying_type<codecvt_mode>::type>(m); } 38 39 static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n) 40 { return m = codecvt_mode(to_integer(m) & to_integer(n)); } 41 42 static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n) 43 { return m = codecvt_mode(to_integer(m) | to_integer(n)); } 44 45 static codecvt_mode operator~(codecvt_mode m) 46 { return codecvt_mode(~to_integer(m)); } 47 48namespace 49{ 50 // Largest code point that fits in a single UTF-16 code unit. 51 const char32_t max_single_utf16_unit = 0xFFFF; 52 53 const char32_t max_code_point = 0x10FFFF; 54 55 // The functions below rely on maxcode < incomplete_mb_character 56 // (which is enforced by the codecvt_utf* classes on construction). 57 const char32_t incomplete_mb_character = char32_t(-2); 58 const char32_t invalid_mb_sequence = char32_t(-1); 59 60 // Utility type for reading and writing code units of type Elem from 61 // a range defined by a pair of pointers. 62 template<typename Elem, bool Aligned = true> 63 struct range 64 { 65 Elem* next; 66 Elem* end; 67 68 // Write a code unit. 69 range& operator=(Elem e) 70 { 71 *next++ = e; 72 return *this; 73 } 74 75 // Read the next code unit. 76 Elem operator*() const { return *next; } 77 78 // Read the Nth code unit. 79 Elem operator[](size_t n) const { return next[n]; } 80 81 // Move to the next code unit. 82 range& operator++() 83 { 84 ++next; 85 return *this; 86 } 87 88 // Move to the Nth code unit. 89 range& operator+=(size_t n) 90 { 91 next += n; 92 return *this; 93 } 94 95 // The number of code units remaining. 96 size_t size() const { return end - next; } 97 98 // The number of bytes remaining. 99 size_t nbytes() const { return (const char*)end - (const char*)next; } 100 }; 101 102 // This specialization is used when accessing char16_t values through 103 // pointers to char, which might not be correctly aligned for char16_t. 104 template<typename Elem> 105 struct range<Elem, false> 106 { 107 using value_type = typename remove_const<Elem>::type; 108 109 using char_pointer = typename 110 conditional<is_const<Elem>::value, const char*, char*>::type; 111 112 char_pointer next; 113 char_pointer end; 114 115 // Write a code unit. 116 range& operator=(Elem e) 117 { 118 memcpy(next, &e, sizeof(Elem)); 119 ++*this; 120 return *this; 121 } 122 123 // Read the next code unit. 124 Elem operator*() const 125 { 126 value_type e; 127 memcpy(&e, next, sizeof(Elem)); 128 return e; 129 } 130 131 // Read the Nth code unit. 132 Elem operator[](size_t n) const 133 { 134 value_type e; 135 memcpy(&e, next + n * sizeof(Elem), sizeof(Elem)); 136 return e; 137 } 138 139 // Move to the next code unit. 140 range& operator++() 141 { 142 next += sizeof(Elem); 143 return *this; 144 } 145 146 // Move to the Nth code unit. 147 range& operator+=(size_t n) 148 { 149 next += n * sizeof(Elem); 150 return *this; 151 } 152 153 // The number of code units remaining. 154 size_t size() const { return nbytes() / sizeof(Elem); } 155 156 // The number of bytes remaining. 157 size_t nbytes() const { return end - next; } 158 }; 159 160 // Multibyte sequences can have "header" consisting of Byte Order Mark 161 const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF }; 162 const unsigned char utf16_bom[2] = { 0xFE, 0xFF }; 163 const unsigned char utf16le_bom[2] = { 0xFF, 0xFE }; 164 165 // Write a BOM (space permitting). 166 template<typename C, bool A, size_t N> 167 bool 168 write_bom(range<C, A>& to, const unsigned char (&bom)[N]) 169 { 170 static_assert( (N / sizeof(C)) != 0, "" ); 171 static_assert( (N % sizeof(C)) == 0, "" ); 172 173 if (to.nbytes() < N) 174 return false; 175 memcpy(to.next, bom, N); 176 to += (N / sizeof(C)); 177 return true; 178 } 179 180 // Try to read a BOM. 181 template<typename C, bool A, size_t N> 182 bool 183 read_bom(range<C, A>& from, const unsigned char (&bom)[N]) 184 { 185 static_assert( (N / sizeof(C)) != 0, "" ); 186 static_assert( (N % sizeof(C)) == 0, "" ); 187 188 if (from.nbytes() >= N && !memcmp(from.next, bom, N)) 189 { 190 from += (N / sizeof(C)); 191 return true; 192 } 193 return false; 194 } 195 196 // If generate_header is set in mode write out UTF-8 BOM. 197 bool 198 write_utf8_bom(range<char>& to, codecvt_mode mode) 199 { 200 if (mode & generate_header) 201 return write_bom(to, utf8_bom); 202 return true; 203 } 204 205 // If generate_header is set in mode write out the UTF-16 BOM indicated 206 // by whether little_endian is set in mode. 207 template<bool Aligned> 208 bool 209 write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode) 210 { 211 if (mode & generate_header) 212 { 213 if (mode & little_endian) 214 return write_bom(to, utf16le_bom); 215 else 216 return write_bom(to, utf16_bom); 217 } 218 return true; 219 } 220 221 // If consume_header is set in mode update from.next to after any BOM. 222 void 223 read_utf8_bom(range<const char>& from, codecvt_mode mode) 224 { 225 if (mode & consume_header) 226 read_bom(from, utf8_bom); 227 } 228 229 // If consume_header is not set in mode, no effects. 230 // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then: 231 // - if the UTF-16BE BOM was found unset little_endian in mode, or 232 // - if the UTF-16LE BOM was found set little_endian in mode. 233 template<bool Aligned> 234 void 235 read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode) 236 { 237 if (mode & consume_header) 238 { 239 if (read_bom(from, utf16_bom)) 240 mode &= ~little_endian; 241 else if (read_bom(from, utf16le_bom)) 242 mode |= little_endian; 243 } 244 } 245 246 // Read a codepoint from a UTF-8 multibyte sequence. 247 // Updates from.next if the codepoint is not greater than maxcode. 248 // Returns invalid_mb_sequence, incomplete_mb_character or the code point. 249 char32_t 250 read_utf8_code_point(range<const char>& from, unsigned long maxcode) 251 { 252 const size_t avail = from.size(); 253 if (avail == 0) 254 return incomplete_mb_character; 255 unsigned char c1 = from[0]; 256 // https://en.wikipedia.org/wiki/UTF-8#Sample_code 257 if (c1 < 0x80) 258 { 259 ++from; 260 return c1; 261 } 262 else if (c1 < 0xC2) // continuation or overlong 2-byte sequence 263 return invalid_mb_sequence; 264 else if (c1 < 0xE0) // 2-byte sequence 265 { 266 if (avail < 2) 267 return incomplete_mb_character; 268 unsigned char c2 = from[1]; 269 if ((c2 & 0xC0) != 0x80) 270 return invalid_mb_sequence; 271 char32_t c = (c1 << 6) + c2 - 0x3080; 272 if (c <= maxcode) 273 from += 2; 274 return c; 275 } 276 else if (c1 < 0xF0) // 3-byte sequence 277 { 278 if (avail < 3) 279 return incomplete_mb_character; 280 unsigned char c2 = from[1]; 281 if ((c2 & 0xC0) != 0x80) 282 return invalid_mb_sequence; 283 if (c1 == 0xE0 && c2 < 0xA0) // overlong 284 return invalid_mb_sequence; 285 unsigned char c3 = from[2]; 286 if ((c3 & 0xC0) != 0x80) 287 return invalid_mb_sequence; 288 char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080; 289 if (c <= maxcode) 290 from += 3; 291 return c; 292 } 293 else if (c1 < 0xF5) // 4-byte sequence 294 { 295 if (avail < 4) 296 return incomplete_mb_character; 297 unsigned char c2 = from[1]; 298 if ((c2 & 0xC0) != 0x80) 299 return invalid_mb_sequence; 300 if (c1 == 0xF0 && c2 < 0x90) // overlong 301 return invalid_mb_sequence; 302 if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF 303 return invalid_mb_sequence; 304 unsigned char c3 = from[2]; 305 if ((c3 & 0xC0) != 0x80) 306 return invalid_mb_sequence; 307 unsigned char c4 = from[3]; 308 if ((c4 & 0xC0) != 0x80) 309 return invalid_mb_sequence; 310 char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080; 311 if (c <= maxcode) 312 from += 4; 313 return c; 314 } 315 else // > U+10FFFF 316 return invalid_mb_sequence; 317 } 318 319 bool 320 write_utf8_code_point(range<char>& to, char32_t code_point) 321 { 322 if (code_point < 0x80) 323 { 324 if (to.size() < 1) 325 return false; 326 to = code_point; 327 } 328 else if (code_point <= 0x7FF) 329 { 330 if (to.size() < 2) 331 return false; 332 to = (code_point >> 6) + 0xC0; 333 to = (code_point & 0x3F) + 0x80; 334 } 335 else if (code_point <= 0xFFFF) 336 { 337 if (to.size() < 3) 338 return false; 339 to = (code_point >> 12) + 0xE0; 340 to = ((code_point >> 6) & 0x3F) + 0x80; 341 to = (code_point & 0x3F) + 0x80; 342 } 343 else if (code_point <= 0x10FFFF) 344 { 345 if (to.size() < 4) 346 return false; 347 to = (code_point >> 18) + 0xF0; 348 to = ((code_point >> 12) & 0x3F) + 0x80; 349 to = ((code_point >> 6) & 0x3F) + 0x80; 350 to = (code_point & 0x3F) + 0x80; 351 } 352 else 353 return false; 354 return true; 355 } 356 357 inline char16_t 358 adjust_byte_order(char16_t c, codecvt_mode mode) 359 { 360#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 361 return (mode & little_endian) ? __builtin_bswap16(c) : c; 362#else 363 return (mode & little_endian) ? c : __builtin_bswap16(c); 364#endif 365 } 366 367 // Return true if c is a high-surrogate (aka leading) code point. 368 inline bool 369 is_high_surrogate(char32_t c) 370 { 371 return c >= 0xD800 && c <= 0xDBFF; 372 } 373 374 // Return true if c is a low-surrogate (aka trailing) code point. 375 inline bool 376 is_low_surrogate(char32_t c) 377 { 378 return c >= 0xDC00 && c <= 0xDFFF; 379 } 380 381 inline char32_t 382 surrogate_pair_to_code_point(char32_t high, char32_t low) 383 { 384 return (high << 10) + low - 0x35FDC00; 385 } 386 387 // Read a codepoint from a UTF-16 multibyte sequence. 388 // The sequence's endianness is indicated by (mode & little_endian). 389 // Updates from.next if the codepoint is not greater than maxcode. 390 // Returns invalid_mb_sequence, incomplete_mb_character or the code point. 391 template<bool Aligned> 392 char32_t 393 read_utf16_code_point(range<const char16_t, Aligned>& from, 394 unsigned long maxcode, codecvt_mode mode) 395 { 396 const size_t avail = from.size(); 397 if (avail == 0) 398 return incomplete_mb_character; 399 int inc = 1; 400 char32_t c = adjust_byte_order(from[0], mode); 401 if (is_high_surrogate(c)) 402 { 403 if (avail < 2) 404 return incomplete_mb_character; 405 const char16_t c2 = adjust_byte_order(from[1], mode); 406 if (is_low_surrogate(c2)) 407 { 408 c = surrogate_pair_to_code_point(c, c2); 409 inc = 2; 410 } 411 else 412 return invalid_mb_sequence; 413 } 414 else if (is_low_surrogate(c)) 415 return invalid_mb_sequence; 416 if (c <= maxcode) 417 from += inc; 418 return c; 419 } 420 421 template<typename C, bool A> 422 bool 423 write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode) 424 { 425 static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit"); 426 427 if (codepoint <= max_single_utf16_unit) 428 { 429 if (to.size() > 0) 430 { 431 to = adjust_byte_order(codepoint, mode); 432 return true; 433 } 434 } 435 else if (to.size() > 1) 436 { 437 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4 438 const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10); 439 char16_t lead = LEAD_OFFSET + (codepoint >> 10); 440 char16_t trail = 0xDC00 + (codepoint & 0x3FF); 441 to = adjust_byte_order(lead, mode); 442 to = adjust_byte_order(trail, mode); 443 return true; 444 } 445 return false; 446 } 447 448 // utf8 -> ucs4 449 codecvt_base::result 450 ucs4_in(range<const char>& from, range<char32_t>& to, 451 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 452 { 453 read_utf8_bom(from, mode); 454 while (from.size() && to.size()) 455 { 456 const char32_t codepoint = read_utf8_code_point(from, maxcode); 457 if (codepoint == incomplete_mb_character) 458 return codecvt_base::partial; 459 if (codepoint > maxcode) 460 return codecvt_base::error; 461 to = codepoint; 462 } 463 return from.size() ? codecvt_base::partial : codecvt_base::ok; 464 } 465 466 // ucs4 -> utf8 467 codecvt_base::result 468 ucs4_out(range<const char32_t>& from, range<char>& to, 469 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 470 { 471 if (!write_utf8_bom(to, mode)) 472 return codecvt_base::partial; 473 while (from.size()) 474 { 475 const char32_t c = from[0]; 476 if (c > maxcode) 477 return codecvt_base::error; 478 if (!write_utf8_code_point(to, c)) 479 return codecvt_base::partial; 480 ++from; 481 } 482 return codecvt_base::ok; 483 } 484 485 // utf16 -> ucs4 486 codecvt_base::result 487 ucs4_in(range<const char16_t, false>& from, range<char32_t>& to, 488 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 489 { 490 read_utf16_bom(from, mode); 491 while (from.size() && to.size()) 492 { 493 const char32_t codepoint = read_utf16_code_point(from, maxcode, mode); 494 if (codepoint == incomplete_mb_character) 495 return codecvt_base::partial; 496 if (codepoint > maxcode) 497 return codecvt_base::error; 498 to = codepoint; 499 } 500 return from.size() ? codecvt_base::partial : codecvt_base::ok; 501 } 502 503 // ucs4 -> utf16 504 codecvt_base::result 505 ucs4_out(range<const char32_t>& from, range<char16_t, false>& to, 506 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 507 { 508 if (!write_utf16_bom(to, mode)) 509 return codecvt_base::partial; 510 while (from.size()) 511 { 512 const char32_t c = from[0]; 513 if (c > maxcode) 514 return codecvt_base::error; 515 if (!write_utf16_code_point(to, c, mode)) 516 return codecvt_base::partial; 517 ++from; 518 } 519 return codecvt_base::ok; 520 } 521 522 // Flag indicating whether to process UTF-16 or UCS2 523 enum class surrogates { allowed, disallowed }; 524 525 // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed) 526 template<typename C> 527 codecvt_base::result 528 utf16_in(range<const char>& from, range<C>& to, 529 unsigned long maxcode = max_code_point, codecvt_mode mode = {}, 530 surrogates s = surrogates::allowed) 531 { 532 read_utf8_bom(from, mode); 533 while (from.size() && to.size()) 534 { 535 auto orig = from; 536 const char32_t codepoint = read_utf8_code_point(from, maxcode); 537 if (codepoint == incomplete_mb_character) 538 { 539 if (s == surrogates::allowed) 540 return codecvt_base::partial; 541 else 542 return codecvt_base::error; // No surrogates in UCS2 543 } 544 if (codepoint > maxcode) 545 return codecvt_base::error; 546 if (!write_utf16_code_point(to, codepoint, mode)) 547 { 548 from = orig; // rewind to previous position 549 return codecvt_base::partial; 550 } 551 } 552 return codecvt_base::ok; 553 } 554 555 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed) 556 template<typename C> 557 codecvt_base::result 558 utf16_out(range<const C>& from, range<char>& to, 559 unsigned long maxcode = max_code_point, codecvt_mode mode = {}, 560 surrogates s = surrogates::allowed) 561 { 562 if (!write_utf8_bom(to, mode)) 563 return codecvt_base::partial; 564 while (from.size()) 565 { 566 char32_t c = from[0]; 567 int inc = 1; 568 if (is_high_surrogate(c)) 569 { 570 if (s == surrogates::disallowed) 571 return codecvt_base::error; // No surrogates in UCS-2 572 573 if (from.size() < 2) 574 return codecvt_base::ok; // stop converting at this point 575 576 const char32_t c2 = from[1]; 577 if (is_low_surrogate(c2)) 578 { 579 c = surrogate_pair_to_code_point(c, c2); 580 inc = 2; 581 } 582 else 583 return codecvt_base::error; 584 } 585 else if (is_low_surrogate(c)) 586 return codecvt_base::error; 587 if (c > maxcode) 588 return codecvt_base::error; 589 if (!write_utf8_code_point(to, c)) 590 return codecvt_base::partial; 591 from += inc; 592 } 593 return codecvt_base::ok; 594 } 595 596 // return pos such that [begin,pos) is valid UTF-16 string no longer than max 597 const char* 598 utf16_span(const char* begin, const char* end, size_t max, 599 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 600 { 601 range<const char> from{ begin, end }; 602 read_utf8_bom(from, mode); 603 size_t count = 0; 604 while (count+1 < max) 605 { 606 char32_t c = read_utf8_code_point(from, maxcode); 607 if (c > maxcode) 608 return from.next; 609 else if (c > max_single_utf16_unit) 610 ++count; 611 ++count; 612 } 613 if (count+1 == max) // take one more character if it fits in a single unit 614 read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode)); 615 return from.next; 616 } 617 618 // utf8 -> ucs2 619 codecvt_base::result 620 ucs2_in(range<const char>& from, range<char16_t>& to, 621 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 622 { 623 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 624 maxcode = std::min(max_single_utf16_unit, maxcode); 625 return utf16_in(from, to, maxcode, mode, surrogates::disallowed); 626 } 627 628 // ucs2 -> utf8 629 codecvt_base::result 630 ucs2_out(range<const char16_t>& from, range<char>& to, 631 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 632 { 633 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 634 maxcode = std::min(max_single_utf16_unit, maxcode); 635 return utf16_out(from, to, maxcode, mode, surrogates::disallowed); 636 } 637 638 // ucs2 -> utf16 639 codecvt_base::result 640 ucs2_out(range<const char16_t>& from, range<char16_t, false>& to, 641 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 642 { 643 if (!write_utf16_bom(to, mode)) 644 return codecvt_base::partial; 645 while (from.size() && to.size()) 646 { 647 char16_t c = from[0]; 648 if (is_high_surrogate(c)) 649 return codecvt_base::error; 650 if (c > maxcode) 651 return codecvt_base::error; 652 to = adjust_byte_order(c, mode); 653 ++from; 654 } 655 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial; 656 } 657 658 // utf16 -> ucs2 659 codecvt_base::result 660 ucs2_in(range<const char16_t, false>& from, range<char16_t>& to, 661 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 662 { 663 read_utf16_bom(from, mode); 664 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 665 maxcode = std::min(max_single_utf16_unit, maxcode); 666 while (from.size() && to.size()) 667 { 668 const char32_t c = read_utf16_code_point(from, maxcode, mode); 669 if (c == incomplete_mb_character) 670 return codecvt_base::error; // UCS-2 only supports single units. 671 if (c > maxcode) 672 return codecvt_base::error; 673 to = c; 674 } 675 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial; 676 } 677 678 const char16_t* 679 ucs2_span(range<const char16_t, false>& from, size_t max, 680 char32_t maxcode, codecvt_mode mode) 681 { 682 read_utf16_bom(from, mode); 683 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 684 maxcode = std::min(max_single_utf16_unit, maxcode); 685 char32_t c = 0; 686 while (max-- && c <= maxcode) 687 c = read_utf16_code_point(from, maxcode, mode); 688 return reinterpret_cast<const char16_t*>(from.next); 689 } 690 691 const char* 692 ucs2_span(const char* begin, const char* end, size_t max, 693 char32_t maxcode, codecvt_mode mode) 694 { 695 range<const char> from{ begin, end }; 696 read_utf8_bom(from, mode); 697 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 698 maxcode = std::min(max_single_utf16_unit, maxcode); 699 char32_t c = 0; 700 while (max-- && c <= maxcode) 701 c = read_utf8_code_point(from, maxcode); 702 return from.next; 703 } 704 705 // return pos such that [begin,pos) is valid UCS-4 string no longer than max 706 const char* 707 ucs4_span(const char* begin, const char* end, size_t max, 708 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 709 { 710 range<const char> from{ begin, end }; 711 read_utf8_bom(from, mode); 712 char32_t c = 0; 713 while (max-- && c <= maxcode) 714 c = read_utf8_code_point(from, maxcode); 715 return from.next; 716 } 717 718 // return pos such that [begin,pos) is valid UCS-4 string no longer than max 719 const char16_t* 720 ucs4_span(range<const char16_t, false>& from, size_t max, 721 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 722 { 723 read_utf16_bom(from, mode); 724 char32_t c = 0; 725 while (max-- && c <= maxcode) 726 c = read_utf16_code_point(from, maxcode, mode); 727 return reinterpret_cast<const char16_t*>(from.next); 728 } 729} 730 731// Define members of codecvt<char16_t, char, mbstate_t> specialization. 732// Converts from UTF-8 to UTF-16. 733 734locale::id codecvt<char16_t, char, mbstate_t>::id; 735 736codecvt<char16_t, char, mbstate_t>::~codecvt() { } 737 738codecvt_base::result 739codecvt<char16_t, char, mbstate_t>:: 740do_out(state_type&, 741 const intern_type* __from, 742 const intern_type* __from_end, const intern_type*& __from_next, 743 extern_type* __to, extern_type* __to_end, 744 extern_type*& __to_next) const 745{ 746 range<const char16_t> from{ __from, __from_end }; 747 range<char> to{ __to, __to_end }; 748 auto res = utf16_out(from, to); 749 __from_next = from.next; 750 __to_next = to.next; 751 return res; 752} 753 754codecvt_base::result 755codecvt<char16_t, char, mbstate_t>:: 756do_unshift(state_type&, extern_type* __to, extern_type*, 757 extern_type*& __to_next) const 758{ 759 __to_next = __to; 760 return noconv; // we don't use mbstate_t for the unicode facets 761} 762 763codecvt_base::result 764codecvt<char16_t, char, mbstate_t>:: 765do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 766 const extern_type*& __from_next, 767 intern_type* __to, intern_type* __to_end, 768 intern_type*& __to_next) const 769{ 770 range<const char> from{ __from, __from_end }; 771 range<char16_t> to{ __to, __to_end }; 772#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 773 codecvt_mode mode = {}; 774#else 775 codecvt_mode mode = little_endian; 776#endif 777 auto res = utf16_in(from, to, max_code_point, mode); 778 __from_next = from.next; 779 __to_next = to.next; 780 return res; 781} 782 783int 784codecvt<char16_t, char, mbstate_t>::do_encoding() const throw() 785{ return 0; } // UTF-8 is not a fixed-width encoding 786 787bool 788codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw() 789{ return false; } 790 791int 792codecvt<char16_t, char, mbstate_t>:: 793do_length(state_type&, const extern_type* __from, 794 const extern_type* __end, size_t __max) const 795{ 796 __end = utf16_span(__from, __end, __max); 797 return __end - __from; 798} 799 800int 801codecvt<char16_t, char, mbstate_t>::do_max_length() const throw() 802{ 803 // A single character (one or two UTF-16 code units) requires 804 // up to four UTF-8 code units. 805 return 4; 806} 807 808// Define members of codecvt<char32_t, char, mbstate_t> specialization. 809// Converts from UTF-8 to UTF-32 (aka UCS-4). 810 811locale::id codecvt<char32_t, char, mbstate_t>::id; 812 813codecvt<char32_t, char, mbstate_t>::~codecvt() { } 814 815codecvt_base::result 816codecvt<char32_t, char, mbstate_t>:: 817do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 818 const intern_type*& __from_next, 819 extern_type* __to, extern_type* __to_end, 820 extern_type*& __to_next) const 821{ 822 range<const char32_t> from{ __from, __from_end }; 823 range<char> to{ __to, __to_end }; 824 auto res = ucs4_out(from, to); 825 __from_next = from.next; 826 __to_next = to.next; 827 return res; 828} 829 830codecvt_base::result 831codecvt<char32_t, char, mbstate_t>:: 832do_unshift(state_type&, extern_type* __to, extern_type*, 833 extern_type*& __to_next) const 834{ 835 __to_next = __to; 836 return noconv; 837} 838 839codecvt_base::result 840codecvt<char32_t, char, mbstate_t>:: 841do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 842 const extern_type*& __from_next, 843 intern_type* __to, intern_type* __to_end, 844 intern_type*& __to_next) const 845{ 846 range<const char> from{ __from, __from_end }; 847 range<char32_t> to{ __to, __to_end }; 848 auto res = ucs4_in(from, to); 849 __from_next = from.next; 850 __to_next = to.next; 851 return res; 852} 853 854int 855codecvt<char32_t, char, mbstate_t>::do_encoding() const throw() 856{ return 0; } // UTF-8 is not a fixed-width encoding 857 858bool 859codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw() 860{ return false; } 861 862int 863codecvt<char32_t, char, mbstate_t>:: 864do_length(state_type&, const extern_type* __from, 865 const extern_type* __end, size_t __max) const 866{ 867 __end = ucs4_span(__from, __end, __max); 868 return __end - __from; 869} 870 871int 872codecvt<char32_t, char, mbstate_t>::do_max_length() const throw() 873{ 874 // A single character (one UTF-32 code unit) requires 875 // up to 4 UTF-8 code units. 876 return 4; 877} 878 879// Define members of codecvt_utf8<char16_t> base class implementation. 880// Converts from UTF-8 to UCS-2. 881 882__codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { } 883 884codecvt_base::result 885__codecvt_utf8_base<char16_t>:: 886do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 887 const intern_type*& __from_next, 888 extern_type* __to, extern_type* __to_end, 889 extern_type*& __to_next) const 890{ 891 range<const char16_t> from{ __from, __from_end }; 892 range<char> to{ __to, __to_end }; 893 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 894 __from_next = from.next; 895 __to_next = to.next; 896 return res; 897} 898 899codecvt_base::result 900__codecvt_utf8_base<char16_t>:: 901do_unshift(state_type&, extern_type* __to, extern_type*, 902 extern_type*& __to_next) const 903{ 904 __to_next = __to; 905 return noconv; 906} 907 908codecvt_base::result 909__codecvt_utf8_base<char16_t>:: 910do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 911 const extern_type*& __from_next, 912 intern_type* __to, intern_type* __to_end, 913 intern_type*& __to_next) const 914{ 915 range<const char> from{ __from, __from_end }; 916 range<char16_t> to{ __to, __to_end }; 917 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 918#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 919 mode = codecvt_mode(mode | little_endian); 920#endif 921 auto res = ucs2_in(from, to, _M_maxcode, mode); 922 __from_next = from.next; 923 __to_next = to.next; 924 return res; 925} 926 927int 928__codecvt_utf8_base<char16_t>::do_encoding() const throw() 929{ return 0; } // UTF-8 is not a fixed-width encoding 930 931bool 932__codecvt_utf8_base<char16_t>::do_always_noconv() const throw() 933{ return false; } 934 935int 936__codecvt_utf8_base<char16_t>:: 937do_length(state_type&, const extern_type* __from, 938 const extern_type* __end, size_t __max) const 939{ 940 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode); 941 return __end - __from; 942} 943 944int 945__codecvt_utf8_base<char16_t>::do_max_length() const throw() 946{ 947 // A single UCS-2 character requires up to three UTF-8 code units. 948 // (UCS-2 cannot represent characters that use four UTF-8 code units). 949 int max = 3; 950 if (_M_mode & consume_header) 951 max += sizeof(utf8_bom); 952 return max; 953} 954 955// Define members of codecvt_utf8<char32_t> base class implementation. 956// Converts from UTF-8 to UTF-32 (aka UCS-4). 957 958__codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { } 959 960codecvt_base::result 961__codecvt_utf8_base<char32_t>:: 962do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 963 const intern_type*& __from_next, 964 extern_type* __to, extern_type* __to_end, 965 extern_type*& __to_next) const 966{ 967 range<const char32_t> from{ __from, __from_end }; 968 range<char> to{ __to, __to_end }; 969 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 970 __from_next = from.next; 971 __to_next = to.next; 972 return res; 973} 974 975codecvt_base::result 976__codecvt_utf8_base<char32_t>:: 977do_unshift(state_type&, extern_type* __to, extern_type*, 978 extern_type*& __to_next) const 979{ 980 __to_next = __to; 981 return noconv; 982} 983 984codecvt_base::result 985__codecvt_utf8_base<char32_t>:: 986do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 987 const extern_type*& __from_next, 988 intern_type* __to, intern_type* __to_end, 989 intern_type*& __to_next) const 990{ 991 range<const char> from{ __from, __from_end }; 992 range<char32_t> to{ __to, __to_end }; 993 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 994 __from_next = from.next; 995 __to_next = to.next; 996 return res; 997} 998 999int 1000__codecvt_utf8_base<char32_t>::do_encoding() const throw() 1001{ return 0; } // UTF-8 is not a fixed-width encoding 1002 1003bool 1004__codecvt_utf8_base<char32_t>::do_always_noconv() const throw() 1005{ return false; } 1006 1007int 1008__codecvt_utf8_base<char32_t>:: 1009do_length(state_type&, const extern_type* __from, 1010 const extern_type* __end, size_t __max) const 1011{ 1012 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode); 1013 return __end - __from; 1014} 1015 1016int 1017__codecvt_utf8_base<char32_t>::do_max_length() const throw() 1018{ 1019 // A single UCS-4 character requires up to four UTF-8 code units. 1020 int max = 4; 1021 if (_M_mode & consume_header) 1022 max += sizeof(utf8_bom); 1023 return max; 1024} 1025 1026#ifdef _GLIBCXX_USE_WCHAR_T 1027 1028#if __SIZEOF_WCHAR_T__ == 2 1029static_assert(sizeof(wchar_t) == sizeof(char16_t), ""); 1030#elif __SIZEOF_WCHAR_T__ == 4 1031static_assert(sizeof(wchar_t) == sizeof(char32_t), ""); 1032#endif 1033 1034// Define members of codecvt_utf8<wchar_t> base class implementation. 1035// Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t). 1036 1037__codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { } 1038 1039codecvt_base::result 1040__codecvt_utf8_base<wchar_t>:: 1041do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1042 const intern_type*& __from_next, 1043 extern_type* __to, extern_type* __to_end, 1044 extern_type*& __to_next) const 1045{ 1046 range<char> to{ __to, __to_end }; 1047#if __SIZEOF_WCHAR_T__ == 2 1048 range<const char16_t> from{ 1049 reinterpret_cast<const char16_t*>(__from), 1050 reinterpret_cast<const char16_t*>(__from_end) 1051 }; 1052 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 1053#elif __SIZEOF_WCHAR_T__ == 4 1054 range<const char32_t> from{ 1055 reinterpret_cast<const char32_t*>(__from), 1056 reinterpret_cast<const char32_t*>(__from_end) 1057 }; 1058 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 1059#else 1060 return codecvt_base::error; 1061#endif 1062 __from_next = reinterpret_cast<const wchar_t*>(from.next); 1063 __to_next = to.next; 1064 return res; 1065} 1066 1067codecvt_base::result 1068__codecvt_utf8_base<wchar_t>:: 1069do_unshift(state_type&, extern_type* __to, extern_type*, 1070 extern_type*& __to_next) const 1071{ 1072 __to_next = __to; 1073 return noconv; 1074} 1075 1076codecvt_base::result 1077__codecvt_utf8_base<wchar_t>:: 1078do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1079 const extern_type*& __from_next, 1080 intern_type* __to, intern_type* __to_end, 1081 intern_type*& __to_next) const 1082{ 1083 range<const char> from{ __from, __from_end }; 1084#if __SIZEOF_WCHAR_T__ == 2 1085 range<char16_t> to{ 1086 reinterpret_cast<char16_t*>(__to), 1087 reinterpret_cast<char16_t*>(__to_end) 1088 }; 1089#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 1090 codecvt_mode mode = {}; 1091#else 1092 codecvt_mode mode = little_endian; 1093#endif 1094 auto res = ucs2_in(from, to, _M_maxcode, mode); 1095#elif __SIZEOF_WCHAR_T__ == 4 1096 range<char32_t> to{ 1097 reinterpret_cast<char32_t*>(__to), 1098 reinterpret_cast<char32_t*>(__to_end) 1099 }; 1100 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 1101#else 1102 return codecvt_base::error; 1103#endif 1104 __from_next = from.next; 1105 __to_next = reinterpret_cast<wchar_t*>(to.next); 1106 return res; 1107} 1108 1109int 1110__codecvt_utf8_base<wchar_t>::do_encoding() const throw() 1111{ return 0; } // UTF-8 is not a fixed-width encoding 1112 1113bool 1114__codecvt_utf8_base<wchar_t>::do_always_noconv() const throw() 1115{ return false; } 1116 1117int 1118__codecvt_utf8_base<wchar_t>:: 1119do_length(state_type&, const extern_type* __from, 1120 const extern_type* __end, size_t __max) const 1121{ 1122#if __SIZEOF_WCHAR_T__ == 2 1123 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode); 1124#elif __SIZEOF_WCHAR_T__ == 4 1125 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode); 1126#else 1127 __end = __from; 1128#endif 1129 return __end - __from; 1130} 1131 1132int 1133__codecvt_utf8_base<wchar_t>::do_max_length() const throw() 1134{ 1135#if __SIZEOF_WCHAR_T__ == 2 1136 int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length() 1137#else 1138 int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length() 1139#endif 1140 if (_M_mode & consume_header) 1141 max += sizeof(utf8_bom); 1142 return max; 1143} 1144#endif 1145 1146// Define members of codecvt_utf16<char16_t> base class implementation. 1147// Converts from UTF-16 to UCS-2. 1148 1149__codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { } 1150 1151codecvt_base::result 1152__codecvt_utf16_base<char16_t>:: 1153do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1154 const intern_type*& __from_next, 1155 extern_type* __to, extern_type* __to_end, 1156 extern_type*& __to_next) const 1157{ 1158 range<const char16_t> from{ __from, __from_end }; 1159 range<char16_t, false> to{ __to, __to_end }; 1160 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 1161 __from_next = from.next; 1162 __to_next = reinterpret_cast<char*>(to.next); 1163 return res; 1164} 1165 1166codecvt_base::result 1167__codecvt_utf16_base<char16_t>:: 1168do_unshift(state_type&, extern_type* __to, extern_type*, 1169 extern_type*& __to_next) const 1170{ 1171 __to_next = __to; 1172 return noconv; 1173} 1174 1175codecvt_base::result 1176__codecvt_utf16_base<char16_t>:: 1177do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1178 const extern_type*& __from_next, 1179 intern_type* __to, intern_type* __to_end, 1180 intern_type*& __to_next) const 1181{ 1182 range<const char16_t, false> from{ __from, __from_end }; 1183 range<char16_t> to{ __to, __to_end }; 1184 auto res = ucs2_in(from, to, _M_maxcode, _M_mode); 1185 __from_next = reinterpret_cast<const char*>(from.next); 1186 __to_next = to.next; 1187 if (res == codecvt_base::ok && __from_next != __from_end) 1188 res = codecvt_base::error; 1189 return res; 1190} 1191 1192int 1193__codecvt_utf16_base<char16_t>::do_encoding() const throw() 1194{ return 0; } // UTF-16 is not a fixed-width encoding 1195 1196bool 1197__codecvt_utf16_base<char16_t>::do_always_noconv() const throw() 1198{ return false; } 1199 1200int 1201__codecvt_utf16_base<char16_t>:: 1202do_length(state_type&, const extern_type* __from, 1203 const extern_type* __end, size_t __max) const 1204{ 1205 range<const char16_t, false> from{ __from, __end }; 1206 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); 1207 return reinterpret_cast<const char*>(next) - __from; 1208} 1209 1210int 1211__codecvt_utf16_base<char16_t>::do_max_length() const throw() 1212{ 1213 // A single UCS-2 character requires one UTF-16 code unit (so two chars). 1214 // (UCS-2 cannot represent characters that use multiple UTF-16 code units). 1215 int max = 2; 1216 if (_M_mode & consume_header) 1217 max += sizeof(utf16_bom); 1218 return max; 1219} 1220 1221// Define members of codecvt_utf16<char32_t> base class implementation. 1222// Converts from UTF-16 to UTF-32 (aka UCS-4). 1223 1224__codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { } 1225 1226codecvt_base::result 1227__codecvt_utf16_base<char32_t>:: 1228do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1229 const intern_type*& __from_next, 1230 extern_type* __to, extern_type* __to_end, 1231 extern_type*& __to_next) const 1232{ 1233 range<const char32_t> from{ __from, __from_end }; 1234 range<char16_t, false> to{ __to, __to_end }; 1235 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 1236 __from_next = from.next; 1237 __to_next = reinterpret_cast<char*>(to.next); 1238 return res; 1239} 1240 1241codecvt_base::result 1242__codecvt_utf16_base<char32_t>:: 1243do_unshift(state_type&, extern_type* __to, extern_type*, 1244 extern_type*& __to_next) const 1245{ 1246 __to_next = __to; 1247 return noconv; 1248} 1249 1250codecvt_base::result 1251__codecvt_utf16_base<char32_t>:: 1252do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1253 const extern_type*& __from_next, 1254 intern_type* __to, intern_type* __to_end, 1255 intern_type*& __to_next) const 1256{ 1257 range<const char16_t, false> from{ __from, __from_end }; 1258 range<char32_t> to{ __to, __to_end }; 1259 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 1260 __from_next = reinterpret_cast<const char*>(from.next); 1261 __to_next = to.next; 1262 if (res == codecvt_base::ok && __from_next != __from_end) 1263 res = codecvt_base::error; 1264 return res; 1265} 1266 1267int 1268__codecvt_utf16_base<char32_t>::do_encoding() const throw() 1269{ return 0; } // UTF-16 is not a fixed-width encoding 1270 1271bool 1272__codecvt_utf16_base<char32_t>::do_always_noconv() const throw() 1273{ return false; } 1274 1275int 1276__codecvt_utf16_base<char32_t>:: 1277do_length(state_type&, const extern_type* __from, 1278 const extern_type* __end, size_t __max) const 1279{ 1280 range<const char16_t, false> from{ __from, __end }; 1281 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); 1282 return reinterpret_cast<const char*>(next) - __from; 1283} 1284 1285int 1286__codecvt_utf16_base<char32_t>::do_max_length() const throw() 1287{ 1288 // A single UCS-4 character requires one or two UTF-16 code units 1289 // (so up to four chars). 1290 int max = 4; 1291 if (_M_mode & consume_header) 1292 max += sizeof(utf16_bom); 1293 return max; 1294} 1295 1296#ifdef _GLIBCXX_USE_WCHAR_T 1297// Define members of codecvt_utf16<wchar_t> base class implementation. 1298// Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t). 1299 1300__codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { } 1301 1302codecvt_base::result 1303__codecvt_utf16_base<wchar_t>:: 1304do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1305 const intern_type*& __from_next, 1306 extern_type* __to, extern_type* __to_end, 1307 extern_type*& __to_next) const 1308{ 1309 range<char16_t, false> to{ __to, __to_end }; 1310#if __SIZEOF_WCHAR_T__ == 2 1311 range<const char16_t> from{ 1312 reinterpret_cast<const char16_t*>(__from), 1313 reinterpret_cast<const char16_t*>(__from_end), 1314 }; 1315 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 1316#elif __SIZEOF_WCHAR_T__ == 4 1317 range<const char32_t> from{ 1318 reinterpret_cast<const char32_t*>(__from), 1319 reinterpret_cast<const char32_t*>(__from_end), 1320 }; 1321 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 1322#else 1323 return codecvt_base::error; 1324#endif 1325 __from_next = reinterpret_cast<const wchar_t*>(from.next); 1326 __to_next = reinterpret_cast<char*>(to.next); 1327 return res; 1328} 1329 1330codecvt_base::result 1331__codecvt_utf16_base<wchar_t>:: 1332do_unshift(state_type&, extern_type* __to, extern_type*, 1333 extern_type*& __to_next) const 1334{ 1335 __to_next = __to; 1336 return noconv; 1337} 1338 1339codecvt_base::result 1340__codecvt_utf16_base<wchar_t>:: 1341do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1342 const extern_type*& __from_next, 1343 intern_type* __to, intern_type* __to_end, 1344 intern_type*& __to_next) const 1345{ 1346 range<const char16_t, false> from{ __from, __from_end }; 1347#if __SIZEOF_WCHAR_T__ == 2 1348 range<char16_t> to{ 1349 reinterpret_cast<char16_t*>(__to), 1350 reinterpret_cast<char16_t*>(__to_end), 1351 }; 1352 auto res = ucs2_in(from, to, _M_maxcode, _M_mode); 1353#elif __SIZEOF_WCHAR_T__ == 4 1354 range<char32_t> to{ 1355 reinterpret_cast<char32_t*>(__to), 1356 reinterpret_cast<char32_t*>(__to_end), 1357 }; 1358 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 1359#else 1360 return codecvt_base::error; 1361#endif 1362 __from_next = reinterpret_cast<const char*>(from.next); 1363 __to_next = reinterpret_cast<wchar_t*>(to.next); 1364 if (res == codecvt_base::ok && __from_next != __from_end) 1365 res = codecvt_base::error; 1366 return res; 1367} 1368 1369int 1370__codecvt_utf16_base<wchar_t>::do_encoding() const throw() 1371{ return 0; } // UTF-16 is not a fixed-width encoding 1372 1373bool 1374__codecvt_utf16_base<wchar_t>::do_always_noconv() const throw() 1375{ return false; } 1376 1377int 1378__codecvt_utf16_base<wchar_t>:: 1379do_length(state_type&, const extern_type* __from, 1380 const extern_type* __end, size_t __max) const 1381{ 1382 range<const char16_t, false> from{ __from, __end }; 1383#if __SIZEOF_WCHAR_T__ == 2 1384 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); 1385#elif __SIZEOF_WCHAR_T__ == 4 1386 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); 1387#endif 1388 return reinterpret_cast<const char*>(next) - __from; 1389} 1390 1391int 1392__codecvt_utf16_base<wchar_t>::do_max_length() const throw() 1393{ 1394#if __SIZEOF_WCHAR_T__ == 2 1395 int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length() 1396#else 1397 int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length() 1398#endif 1399 if (_M_mode & consume_header) 1400 max += sizeof(utf16_bom); 1401 return max; 1402} 1403#endif 1404 1405// Define members of codecvt_utf8_utf16<char16_t> base class implementation. 1406// Converts from UTF-8 to UTF-16. 1407 1408__codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { } 1409 1410codecvt_base::result 1411__codecvt_utf8_utf16_base<char16_t>:: 1412do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1413 const intern_type*& __from_next, 1414 extern_type* __to, extern_type* __to_end, 1415 extern_type*& __to_next) const 1416{ 1417 range<const char16_t> from{ __from, __from_end }; 1418 range<char> to{ __to, __to_end }; 1419 auto res = utf16_out(from, to, _M_maxcode, _M_mode); 1420 __from_next = from.next; 1421 __to_next = to.next; 1422 return res; 1423} 1424 1425codecvt_base::result 1426__codecvt_utf8_utf16_base<char16_t>:: 1427do_unshift(state_type&, extern_type* __to, extern_type*, 1428 extern_type*& __to_next) const 1429{ 1430 __to_next = __to; 1431 return noconv; 1432} 1433 1434codecvt_base::result 1435__codecvt_utf8_utf16_base<char16_t>:: 1436do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1437 const extern_type*& __from_next, 1438 intern_type* __to, intern_type* __to_end, 1439 intern_type*& __to_next) const 1440{ 1441 range<const char> from{ __from, __from_end }; 1442 range<char16_t> to{ __to, __to_end }; 1443 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 1444#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 1445 mode = codecvt_mode(mode | little_endian); 1446#endif 1447 auto res = utf16_in(from, to, _M_maxcode, mode); 1448 __from_next = from.next; 1449 __to_next = to.next; 1450 return res; 1451} 1452 1453int 1454__codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw() 1455{ return 0; } // UTF-8 is not a fixed-width encoding 1456 1457bool 1458__codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw() 1459{ return false; } 1460 1461int 1462__codecvt_utf8_utf16_base<char16_t>:: 1463do_length(state_type&, const extern_type* __from, 1464 const extern_type* __end, size_t __max) const 1465{ 1466 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); 1467 return __end - __from; 1468} 1469 1470int 1471__codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw() 1472{ 1473 // A single character can be 1 or 2 UTF-16 code units, 1474 // requiring up to 4 UTF-8 code units. 1475 int max = 4; 1476 if (_M_mode & consume_header) 1477 max += sizeof(utf8_bom); 1478 return max; 1479} 1480 1481// Define members of codecvt_utf8_utf16<char32_t> base class implementation. 1482// Converts from UTF-8 to UTF-16. 1483 1484__codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { } 1485 1486codecvt_base::result 1487__codecvt_utf8_utf16_base<char32_t>:: 1488do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1489 const intern_type*& __from_next, 1490 extern_type* __to, extern_type* __to_end, 1491 extern_type*& __to_next) const 1492{ 1493 range<const char32_t> from{ __from, __from_end }; 1494 range<char> to{ __to, __to_end }; 1495 auto res = utf16_out(from, to, _M_maxcode, _M_mode); 1496 __from_next = from.next; 1497 __to_next = to.next; 1498 return res; 1499} 1500 1501codecvt_base::result 1502__codecvt_utf8_utf16_base<char32_t>:: 1503do_unshift(state_type&, extern_type* __to, extern_type*, 1504 extern_type*& __to_next) const 1505{ 1506 __to_next = __to; 1507 return noconv; 1508} 1509 1510codecvt_base::result 1511__codecvt_utf8_utf16_base<char32_t>:: 1512do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1513 const extern_type*& __from_next, 1514 intern_type* __to, intern_type* __to_end, 1515 intern_type*& __to_next) const 1516{ 1517 range<const char> from{ __from, __from_end }; 1518 range<char32_t> to{ __to, __to_end }; 1519 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 1520#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 1521 mode = codecvt_mode(mode | little_endian); 1522#endif 1523 auto res = utf16_in(from, to, _M_maxcode, mode); 1524 __from_next = from.next; 1525 __to_next = to.next; 1526 return res; 1527} 1528 1529int 1530__codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw() 1531{ return 0; } // UTF-8 is not a fixed-width encoding 1532 1533bool 1534__codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw() 1535{ return false; } 1536 1537int 1538__codecvt_utf8_utf16_base<char32_t>:: 1539do_length(state_type&, const extern_type* __from, 1540 const extern_type* __end, size_t __max) const 1541{ 1542 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); 1543 return __end - __from; 1544} 1545 1546int 1547__codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw() 1548{ 1549 // A single character can be 1 or 2 UTF-16 code units, 1550 // requiring up to 4 UTF-8 code units. 1551 int max = 4; 1552 if (_M_mode & consume_header) 1553 max += sizeof(utf8_bom); 1554 return max; 1555} 1556 1557#ifdef _GLIBCXX_USE_WCHAR_T 1558// Define members of codecvt_utf8_utf16<wchar_t> base class implementation. 1559// Converts from UTF-8 to UTF-16. 1560 1561__codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { } 1562 1563codecvt_base::result 1564__codecvt_utf8_utf16_base<wchar_t>:: 1565do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1566 const intern_type*& __from_next, 1567 extern_type* __to, extern_type* __to_end, 1568 extern_type*& __to_next) const 1569{ 1570 range<const wchar_t> from{ __from, __from_end }; 1571 range<char> to{ __to, __to_end }; 1572 auto res = utf16_out(from, to, _M_maxcode, _M_mode); 1573 __from_next = from.next; 1574 __to_next = to.next; 1575 return res; 1576} 1577 1578codecvt_base::result 1579__codecvt_utf8_utf16_base<wchar_t>:: 1580do_unshift(state_type&, extern_type* __to, extern_type*, 1581 extern_type*& __to_next) const 1582{ 1583 __to_next = __to; 1584 return noconv; 1585} 1586 1587codecvt_base::result 1588__codecvt_utf8_utf16_base<wchar_t>:: 1589do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1590 const extern_type*& __from_next, 1591 intern_type* __to, intern_type* __to_end, 1592 intern_type*& __to_next) const 1593{ 1594 range<const char> from{ __from, __from_end }; 1595 range<wchar_t> to{ __to, __to_end }; 1596 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 1597#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 1598 mode = codecvt_mode(mode | little_endian); 1599#endif 1600 auto res = utf16_in(from, to, _M_maxcode, mode); 1601 __from_next = from.next; 1602 __to_next = to.next; 1603 return res; 1604} 1605 1606int 1607__codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw() 1608{ return 0; } // UTF-8 is not a fixed-width encoding 1609 1610bool 1611__codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw() 1612{ return false; } 1613 1614int 1615__codecvt_utf8_utf16_base<wchar_t>:: 1616do_length(state_type&, const extern_type* __from, 1617 const extern_type* __end, size_t __max) const 1618{ 1619 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); 1620 return __end - __from; 1621} 1622 1623int 1624__codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw() 1625{ 1626 // A single character can be 1 or 2 UTF-16 code units, 1627 // requiring up to 4 UTF-8 code units. 1628 int max = 4; 1629 if (_M_mode & consume_header) 1630 max += sizeof(utf8_bom); 1631 return max; 1632} 1633#endif 1634 1635inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>; 1636inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>; 1637template class codecvt_byname<char16_t, char, mbstate_t>; 1638template class codecvt_byname<char32_t, char, mbstate_t>; 1639 1640_GLIBCXX_END_NAMESPACE_VERSION 1641} 1642#endif // _GLIBCXX_USE_C99_STDINT_TR1 1643