1// Locale support (codecvt) -*- C++ -*- 2 3// Copyright (C) 2015-2022 Free Software Foundation, Inc. 4// 5// This file is part of the GNU ISO C++ Library. This library is free 6// software; you can redistribute it and/or modify it under the 7// terms of the GNU General Public License as published by the 8// Free Software Foundation; either version 3, or (at your option) 9// any later version. 10 11// This library is distributed in the hope that it will be useful, 12// but WITHOUT ANY WARRANTY; without even the implied warranty of 13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14// GNU General Public License for more details. 15 16// Under Section 7 of GPL version 3, you are granted additional 17// permissions described in the GCC Runtime Library Exception, version 18// 3.1, as published by the Free Software Foundation. 19 20// You should have received a copy of the GNU General Public License and 21// a copy of the GCC Runtime Library Exception along with this program; 22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23// <http://www.gnu.org/licenses/>. 24 25#include <codecvt> 26#include <cstring> // std::memcpy, std::memcmp 27#include <bits/stl_algobase.h> // std::min 28 29namespace std _GLIBCXX_VISIBILITY(default) 30{ 31_GLIBCXX_BEGIN_NAMESPACE_VERSION 32 33 // The standard doesn't define these operators, which is annoying. 34 static underlying_type<codecvt_mode>::type 35 to_integer(codecvt_mode m) 36 { return static_cast<underlying_type<codecvt_mode>::type>(m); } 37 38 static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n) 39 { return m = codecvt_mode(to_integer(m) & to_integer(n)); } 40 41 static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n) 42 { return m = codecvt_mode(to_integer(m) | to_integer(n)); } 43 44 static codecvt_mode operator~(codecvt_mode m) 45 { return codecvt_mode(~to_integer(m)); } 46 47namespace 48{ 49 // Largest code point that fits in a single UTF-16 code unit. 
const char32_t max_single_utf16_unit = 0xFFFF;

const char32_t max_code_point = 0x10FFFF;

// The functions below rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction).
const char32_t incomplete_mb_character = char32_t(-2);
const char32_t invalid_mb_sequence = char32_t(-1);

// Cursor over the code units of type Elem that lie between two
// pointers.  Used both for reading input and for writing output.
template<typename Elem, bool Aligned = true>
struct range
{
  Elem* next;
  Elem* end;

  // Store one code unit at the cursor, then step past it.
  range& operator=(Elem e)
  {
    *next = e;
    ++next;
    return *this;
  }

  // The code unit at the cursor.
  Elem operator*() const { return next[0]; }

  // The code unit n positions after the cursor.
  Elem operator[](size_t n) const { return *(next + n); }

  // Step the cursor forward by one code unit.
  range& operator++()
  {
    next += 1;
    return *this;
  }

  // Step the cursor forward by n code units.
  range& operator+=(size_t n)
  {
    next = next + n;
    return *this;
  }

  // How many code units are left before end.
  size_t size() const { return end - next; }

  // How many bytes are left before end.
  size_t nbytes() const
  {
    return reinterpret_cast<const char*>(end)
           - reinterpret_cast<const char*>(next);
  }
};

// Variant for code units that are reached through pointers to char and
// therefore might not be suitably aligned for Elem (e.g. char16_t).
// Every access goes through memcpy instead of a typed dereference.
template<typename Elem>
struct range<Elem, false>
{
  using value_type = typename remove_const<Elem>::type;

  using char_pointer = typename
    conditional<is_const<Elem>::value, const char*, char*>::type;

  char_pointer next;
  char_pointer end;

  // Store one code unit at the cursor, then step past it.
  range& operator=(Elem e)
  {
    memcpy(next, &e, sizeof(Elem));
    next += sizeof(Elem);
    return *this;
  }

  // The code unit at the cursor.
  Elem operator*() const { return (*this)[0]; }

  // The code unit n positions after the cursor.
  Elem operator[](size_t n) const
  {
    value_type unit;
    const char_pointer where = next + n * sizeof(Elem);
    memcpy(&unit, where, sizeof(Elem));
    return unit;
  }

  // Step the cursor forward by one code unit.
  range& operator++() { return *this += 1; }

  // Step the cursor forward by n code units.
  range& operator+=(size_t n)
  {
    next += n * sizeof(Elem);
    return *this;
  }

  // How many whole code units are left before end.
  size_t size() const { return nbytes() / sizeof(Elem); }

  // How many bytes are left before end.
  size_t nbytes() const { return end - next; }
};

// Multibyte sequences can have "header" consisting of Byte Order Mark
const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };

// Copy the N bytes of bom to the output and advance past them.
// Returns false, writing nothing, if fewer than N bytes are free.
template<typename C, bool A, size_t N>
bool
write_bom(range<C, A>& to, const unsigned char (&bom)[N])
{
  static_assert( (N / sizeof(C)) != 0, "" );
  static_assert( (N % sizeof(C)) == 0, "" );

  const bool fits = to.nbytes() >= N;
  if (fits)
    {
      memcpy(to.next, bom, N);
      to += (N / sizeof(C));
    }
  return fits;
}

// If the input starts with the N bytes of bom, skip them and return
// true.  Otherwise leave the input untouched and return false.
template<typename C, bool A, size_t N>
bool
read_bom(range<C, A>& from, const unsigned char (&bom)[N])
{
  static_assert( (N / sizeof(C)) != 0, "" );
  static_assert( (N % sizeof(C)) == 0, "" );

  if (from.nbytes() < N || memcmp(from.next, bom, N) != 0)
    return false;
  from += (N / sizeof(C));
  return true;
}

// Write the UTF-8 BOM when generate_header is set in mode.
// Returns false only if the BOM was requested and did not fit.
template<typename C>
bool
write_utf8_bom(range<C>& to, codecvt_mode mode)
{
  return (mode & generate_header) ? write_bom(to, utf8_bom) : true;
}

// If generate_header is set in mode write out the UTF-16 BOM indicated
// by whether little_endian is set in mode.
207 template<bool Aligned> 208 bool 209 write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode) 210 { 211 if (mode & generate_header) 212 { 213 if (mode & little_endian) 214 return write_bom(to, utf16le_bom); 215 else 216 return write_bom(to, utf16_bom); 217 } 218 return true; 219 } 220 221 // If consume_header is set in mode update from.next to after any BOM. 222 template<typename C> 223 void 224 read_utf8_bom(range<const C>& from, codecvt_mode mode) 225 { 226 if (mode & consume_header) 227 read_bom(from, utf8_bom); 228 } 229 230 // If consume_header is not set in mode, no effects. 231 // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then: 232 // - if the UTF-16BE BOM was found unset little_endian in mode, or 233 // - if the UTF-16LE BOM was found set little_endian in mode. 234 template<bool Aligned> 235 void 236 read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode) 237 { 238 if (mode & consume_header) 239 { 240 if (read_bom(from, utf16_bom)) 241 mode &= ~little_endian; 242 else if (read_bom(from, utf16le_bom)) 243 mode |= little_endian; 244 } 245 } 246 247 // Read a codepoint from a UTF-8 multibyte sequence. 248 // Updates from.next if the codepoint is not greater than maxcode. 249 // Returns invalid_mb_sequence, incomplete_mb_character or the code point. 
250 template<typename C> 251 char32_t 252 read_utf8_code_point(range<const C>& from, unsigned long maxcode) 253 { 254 const size_t avail = from.size(); 255 if (avail == 0) 256 return incomplete_mb_character; 257 char32_t c1 = (unsigned char) from[0]; 258 // https://en.wikipedia.org/wiki/UTF-8#Sample_code 259 if (c1 < 0x80) 260 { 261 ++from; 262 return c1; 263 } 264 else if (c1 < 0xC2) // continuation or overlong 2-byte sequence 265 return invalid_mb_sequence; 266 else if (c1 < 0xE0) // 2-byte sequence 267 { 268 if (avail < 2) 269 return incomplete_mb_character; 270 char32_t c2 = (unsigned char) from[1]; 271 if ((c2 & 0xC0) != 0x80) 272 return invalid_mb_sequence; 273 char32_t c = (c1 << 6) + c2 - 0x3080; 274 if (c <= maxcode) 275 from += 2; 276 return c; 277 } 278 else if (c1 < 0xF0) // 3-byte sequence 279 { 280 if (avail < 3) 281 return incomplete_mb_character; 282 char32_t c2 = (unsigned char) from[1]; 283 if ((c2 & 0xC0) != 0x80) 284 return invalid_mb_sequence; 285 if (c1 == 0xE0 && c2 < 0xA0) // overlong 286 return invalid_mb_sequence; 287 char32_t c3 = (unsigned char) from[2]; 288 if ((c3 & 0xC0) != 0x80) 289 return invalid_mb_sequence; 290 char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080; 291 if (c <= maxcode) 292 from += 3; 293 return c; 294 } 295 else if (c1 < 0xF5) // 4-byte sequence 296 { 297 if (avail < 4) 298 return incomplete_mb_character; 299 char32_t c2 = (unsigned char) from[1]; 300 if ((c2 & 0xC0) != 0x80) 301 return invalid_mb_sequence; 302 if (c1 == 0xF0 && c2 < 0x90) // overlong 303 return invalid_mb_sequence; 304 if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF 305 return invalid_mb_sequence; 306 char32_t c3 = (unsigned char) from[2]; 307 if ((c3 & 0xC0) != 0x80) 308 return invalid_mb_sequence; 309 char32_t c4 = (unsigned char) from[3]; 310 if ((c4 & 0xC0) != 0x80) 311 return invalid_mb_sequence; 312 char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080; 313 if (c <= maxcode) 314 from += 4; 315 return c; 316 } 317 else // > U+10FFFF 
318 return invalid_mb_sequence; 319 } 320 321 template<typename C> 322 bool 323 write_utf8_code_point(range<C>& to, char32_t code_point) 324 { 325 if (code_point < 0x80) 326 { 327 if (to.size() < 1) 328 return false; 329 to = code_point; 330 } 331 else if (code_point <= 0x7FF) 332 { 333 if (to.size() < 2) 334 return false; 335 to = (code_point >> 6) + 0xC0; 336 to = (code_point & 0x3F) + 0x80; 337 } 338 else if (code_point <= 0xFFFF) 339 { 340 if (to.size() < 3) 341 return false; 342 to = (code_point >> 12) + 0xE0; 343 to = ((code_point >> 6) & 0x3F) + 0x80; 344 to = (code_point & 0x3F) + 0x80; 345 } 346 else if (code_point <= 0x10FFFF) 347 { 348 if (to.size() < 4) 349 return false; 350 to = (code_point >> 18) + 0xF0; 351 to = ((code_point >> 12) & 0x3F) + 0x80; 352 to = ((code_point >> 6) & 0x3F) + 0x80; 353 to = (code_point & 0x3F) + 0x80; 354 } 355 else 356 return false; 357 return true; 358 } 359 360 inline char16_t 361 adjust_byte_order(char16_t c, codecvt_mode mode) 362 { 363#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 364 return (mode & little_endian) ? __builtin_bswap16(c) : c; 365#else 366 return (mode & little_endian) ? c : __builtin_bswap16(c); 367#endif 368 } 369 370 // Return true if c is a high-surrogate (aka leading) code point. 371 inline bool 372 is_high_surrogate(char32_t c) 373 { 374 return c >= 0xD800 && c <= 0xDBFF; 375 } 376 377 // Return true if c is a low-surrogate (aka trailing) code point. 378 inline bool 379 is_low_surrogate(char32_t c) 380 { 381 return c >= 0xDC00 && c <= 0xDFFF; 382 } 383 384 inline char32_t 385 surrogate_pair_to_code_point(char32_t high, char32_t low) 386 { 387 return (high << 10) + low - 0x35FDC00; 388 } 389 390 // Read a codepoint from a UTF-16 multibyte sequence. 391 // The sequence's endianness is indicated by (mode & little_endian). 392 // Updates from.next if the codepoint is not greater than maxcode. 393 // Returns invalid_mb_sequence, incomplete_mb_character or the code point. 
394 template<bool Aligned> 395 char32_t 396 read_utf16_code_point(range<const char16_t, Aligned>& from, 397 unsigned long maxcode, codecvt_mode mode) 398 { 399 const size_t avail = from.size(); 400 if (avail == 0) 401 return incomplete_mb_character; 402 int inc = 1; 403 char32_t c = adjust_byte_order(from[0], mode); 404 if (is_high_surrogate(c)) 405 { 406 if (avail < 2) 407 return incomplete_mb_character; 408 const char16_t c2 = adjust_byte_order(from[1], mode); 409 if (is_low_surrogate(c2)) 410 { 411 c = surrogate_pair_to_code_point(c, c2); 412 inc = 2; 413 } 414 else 415 return invalid_mb_sequence; 416 } 417 else if (is_low_surrogate(c)) 418 return invalid_mb_sequence; 419 if (c <= maxcode) 420 from += inc; 421 return c; 422 } 423 424 template<typename C, bool A> 425 bool 426 write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode) 427 { 428 static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit"); 429 430 if (codepoint <= max_single_utf16_unit) 431 { 432 if (to.size() > 0) 433 { 434 to = adjust_byte_order(codepoint, mode); 435 return true; 436 } 437 } 438 else if (to.size() > 1) 439 { 440 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4 441 const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10); 442 char16_t lead = LEAD_OFFSET + (codepoint >> 10); 443 char16_t trail = 0xDC00 + (codepoint & 0x3FF); 444 to = adjust_byte_order(lead, mode); 445 to = adjust_byte_order(trail, mode); 446 return true; 447 } 448 return false; 449 } 450 451 // utf8 -> ucs4 452 template<typename C> 453 codecvt_base::result 454 ucs4_in(range<const C>& from, range<char32_t>& to, 455 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 456 { 457 read_utf8_bom(from, mode); 458 while (from.size() && to.size()) 459 { 460 const char32_t codepoint = read_utf8_code_point(from, maxcode); 461 if (codepoint == incomplete_mb_character) 462 return codecvt_base::partial; 463 if (codepoint > maxcode) 464 return codecvt_base::error; 465 to = 
codepoint; 466 } 467 return from.size() ? codecvt_base::partial : codecvt_base::ok; 468 } 469 470 // ucs4 -> utf8 471 template<typename C> 472 codecvt_base::result 473 ucs4_out(range<const char32_t>& from, range<C>& to, 474 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 475 { 476 if (!write_utf8_bom(to, mode)) 477 return codecvt_base::partial; 478 while (from.size()) 479 { 480 const char32_t c = from[0]; 481 if (c > maxcode) 482 return codecvt_base::error; 483 if (!write_utf8_code_point(to, c)) 484 return codecvt_base::partial; 485 ++from; 486 } 487 return codecvt_base::ok; 488 } 489 490 // utf16 -> ucs4 491 codecvt_base::result 492 ucs4_in(range<const char16_t, false>& from, range<char32_t>& to, 493 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 494 { 495 read_utf16_bom(from, mode); 496 while (from.size() && to.size()) 497 { 498 const char32_t codepoint = read_utf16_code_point(from, maxcode, mode); 499 if (codepoint == incomplete_mb_character) 500 return codecvt_base::partial; 501 if (codepoint > maxcode) 502 return codecvt_base::error; 503 to = codepoint; 504 } 505 return from.size() ? 
codecvt_base::partial : codecvt_base::ok; 506 } 507 508 // ucs4 -> utf16 509 codecvt_base::result 510 ucs4_out(range<const char32_t>& from, range<char16_t, false>& to, 511 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 512 { 513 if (!write_utf16_bom(to, mode)) 514 return codecvt_base::partial; 515 while (from.size()) 516 { 517 const char32_t c = from[0]; 518 if (c > maxcode) 519 return codecvt_base::error; 520 if (!write_utf16_code_point(to, c, mode)) 521 return codecvt_base::partial; 522 ++from; 523 } 524 return codecvt_base::ok; 525 } 526 527 // Flag indicating whether to process UTF-16 or UCS2 528 enum class surrogates { allowed, disallowed }; 529 530 // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed) 531 template<typename C8, typename C16> 532 codecvt_base::result 533 utf16_in(range<const C8>& from, range<C16>& to, 534 unsigned long maxcode = max_code_point, codecvt_mode mode = {}, 535 surrogates s = surrogates::allowed) 536 { 537 read_utf8_bom(from, mode); 538 while (from.size() && to.size()) 539 { 540 auto orig = from; 541 const char32_t codepoint = read_utf8_code_point(from, maxcode); 542 if (codepoint == incomplete_mb_character) 543 { 544 if (s == surrogates::allowed) 545 return codecvt_base::partial; 546 else 547 return codecvt_base::error; // No surrogates in UCS2 548 } 549 if (codepoint > maxcode) 550 return codecvt_base::error; 551 if (!write_utf16_code_point(to, codepoint, mode)) 552 { 553 from = orig; // rewind to previous position 554 return codecvt_base::partial; 555 } 556 } 557 return codecvt_base::ok; 558 } 559 560 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed) 561 template<typename C16, typename C8> 562 codecvt_base::result 563 utf16_out(range<const C16>& from, range<C8>& to, 564 unsigned long maxcode = max_code_point, codecvt_mode mode = {}, 565 surrogates s = surrogates::allowed) 566 { 567 if (!write_utf8_bom(to, mode)) 568 return codecvt_base::partial; 569 while (from.size()) 570 { 571 
char32_t c = from[0]; 572 int inc = 1; 573 if (is_high_surrogate(c)) 574 { 575 if (s == surrogates::disallowed) 576 return codecvt_base::error; // No surrogates in UCS-2 577 578 if (from.size() < 2) 579 return codecvt_base::ok; // stop converting at this point 580 581 const char32_t c2 = from[1]; 582 if (is_low_surrogate(c2)) 583 { 584 c = surrogate_pair_to_code_point(c, c2); 585 inc = 2; 586 } 587 else 588 return codecvt_base::error; 589 } 590 else if (is_low_surrogate(c)) 591 return codecvt_base::error; 592 if (c > maxcode) 593 return codecvt_base::error; 594 if (!write_utf8_code_point(to, c)) 595 return codecvt_base::partial; 596 from += inc; 597 } 598 return codecvt_base::ok; 599 } 600 601 // return pos such that [begin,pos) is valid UTF-16 string no longer than max 602 template<typename C> 603 const C* 604 utf16_span(const C* begin, const C* end, size_t max, 605 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 606 { 607 range<const C> from{ begin, end }; 608 read_utf8_bom(from, mode); 609 size_t count = 0; 610 while (count+1 < max) 611 { 612 char32_t c = read_utf8_code_point(from, maxcode); 613 if (c > maxcode) 614 return from.next; 615 else if (c > max_single_utf16_unit) 616 ++count; 617 ++count; 618 } 619 if (count+1 == max) // take one more character if it fits in a single unit 620 read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode)); 621 return from.next; 622 } 623 624 // utf8 -> ucs2 625 template<typename C> 626 codecvt_base::result 627 ucs2_in(range<const C>& from, range<char16_t>& to, 628 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 629 { 630 // UCS-2 only supports characters in the BMP, i.e. 
one UTF-16 code unit: 631 maxcode = std::min(max_single_utf16_unit, maxcode); 632 return utf16_in(from, to, maxcode, mode, surrogates::disallowed); 633 } 634 635 // ucs2 -> utf8 636 template<typename C> 637 codecvt_base::result 638 ucs2_out(range<const char16_t>& from, range<C>& to, 639 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 640 { 641 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 642 maxcode = std::min(max_single_utf16_unit, maxcode); 643 return utf16_out(from, to, maxcode, mode, surrogates::disallowed); 644 } 645 646 // ucs2 -> utf16 647 codecvt_base::result 648 ucs2_out(range<const char16_t>& from, range<char16_t, false>& to, 649 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 650 { 651 if (!write_utf16_bom(to, mode)) 652 return codecvt_base::partial; 653 while (from.size() && to.size()) 654 { 655 char16_t c = from[0]; 656 if (is_high_surrogate(c)) 657 return codecvt_base::error; 658 if (c > maxcode) 659 return codecvt_base::error; 660 to = adjust_byte_order(c, mode); 661 ++from; 662 } 663 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial; 664 } 665 666 // utf16 -> ucs2 667 codecvt_base::result 668 ucs2_in(range<const char16_t, false>& from, range<char16_t>& to, 669 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 670 { 671 read_utf16_bom(from, mode); 672 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 673 maxcode = std::min(max_single_utf16_unit, maxcode); 674 while (from.size() && to.size()) 675 { 676 const char32_t c = read_utf16_code_point(from, maxcode, mode); 677 if (c == incomplete_mb_character) 678 return codecvt_base::error; // UCS-2 only supports single units. 679 if (c > maxcode) 680 return codecvt_base::error; 681 to = c; 682 } 683 return from.size() == 0 ? 
codecvt_base::ok : codecvt_base::partial; 684 } 685 686 const char16_t* 687 ucs2_span(range<const char16_t, false>& from, size_t max, 688 char32_t maxcode, codecvt_mode mode) 689 { 690 read_utf16_bom(from, mode); 691 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 692 maxcode = std::min(max_single_utf16_unit, maxcode); 693 char32_t c = 0; 694 while (max-- && c <= maxcode) 695 c = read_utf16_code_point(from, maxcode, mode); 696 return reinterpret_cast<const char16_t*>(from.next); 697 } 698 699 template<typename C> 700 const C* 701 ucs2_span(const C* begin, const C* end, size_t max, 702 char32_t maxcode, codecvt_mode mode) 703 { 704 range<const C> from{ begin, end }; 705 read_utf8_bom(from, mode); 706 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 707 maxcode = std::min(max_single_utf16_unit, maxcode); 708 char32_t c = 0; 709 while (max-- && c <= maxcode) 710 c = read_utf8_code_point(from, maxcode); 711 return from.next; 712 } 713 714 // return pos such that [begin,pos) is valid UCS-4 string no longer than max 715 template<typename C> 716 const C* 717 ucs4_span(const C* begin, const C* end, size_t max, 718 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 719 { 720 range<const C> from{ begin, end }; 721 read_utf8_bom(from, mode); 722 char32_t c = 0; 723 while (max-- && c <= maxcode) 724 c = read_utf8_code_point(from, maxcode); 725 return from.next; 726 } 727 728 // return pos such that [begin,pos) is valid UCS-4 string no longer than max 729 const char16_t* 730 ucs4_span(range<const char16_t, false>& from, size_t max, 731 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 732 { 733 read_utf16_bom(from, mode); 734 char32_t c = 0; 735 while (max-- && c <= maxcode) 736 c = read_utf16_code_point(from, maxcode, mode); 737 return reinterpret_cast<const char16_t*>(from.next); 738 } 739} 740 741// Define members of codecvt<char16_t, char, mbstate_t> specialization. 742// Converts from UTF-8 to UTF-16. 
743 744locale::id codecvt<char16_t, char, mbstate_t>::id; 745 746codecvt<char16_t, char, mbstate_t>::~codecvt() { } 747 748codecvt_base::result 749codecvt<char16_t, char, mbstate_t>:: 750do_out(state_type&, 751 const intern_type* __from, 752 const intern_type* __from_end, const intern_type*& __from_next, 753 extern_type* __to, extern_type* __to_end, 754 extern_type*& __to_next) const 755{ 756 range<const char16_t> from{ __from, __from_end }; 757 range<char> to{ __to, __to_end }; 758 auto res = utf16_out(from, to); 759 __from_next = from.next; 760 __to_next = to.next; 761 return res; 762} 763 764codecvt_base::result 765codecvt<char16_t, char, mbstate_t>:: 766do_unshift(state_type&, extern_type* __to, extern_type*, 767 extern_type*& __to_next) const 768{ 769 __to_next = __to; 770 return noconv; // we don't use mbstate_t for the unicode facets 771} 772 773codecvt_base::result 774codecvt<char16_t, char, mbstate_t>:: 775do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 776 const extern_type*& __from_next, 777 intern_type* __to, intern_type* __to_end, 778 intern_type*& __to_next) const 779{ 780 range<const char> from{ __from, __from_end }; 781 range<char16_t> to{ __to, __to_end }; 782#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 783 codecvt_mode mode = {}; 784#else 785 codecvt_mode mode = little_endian; 786#endif 787 auto res = utf16_in(from, to, max_code_point, mode); 788 __from_next = from.next; 789 __to_next = to.next; 790 return res; 791} 792 793int 794codecvt<char16_t, char, mbstate_t>::do_encoding() const throw() 795{ return 0; } // UTF-8 is not a fixed-width encoding 796 797bool 798codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw() 799{ return false; } 800 801int 802codecvt<char16_t, char, mbstate_t>:: 803do_length(state_type&, const extern_type* __from, 804 const extern_type* __end, size_t __max) const 805{ 806 __end = utf16_span(__from, __end, __max); 807 return __end - __from; 808} 809 810int 811codecvt<char16_t, char, 
mbstate_t>::do_max_length() const throw() 812{ 813 // A single character (one or two UTF-16 code units) requires 814 // up to four UTF-8 code units. 815 return 4; 816} 817 818// Define members of codecvt<char32_t, char, mbstate_t> specialization. 819// Converts from UTF-8 to UTF-32 (aka UCS-4). 820 821locale::id codecvt<char32_t, char, mbstate_t>::id; 822 823codecvt<char32_t, char, mbstate_t>::~codecvt() { } 824 825codecvt_base::result 826codecvt<char32_t, char, mbstate_t>:: 827do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 828 const intern_type*& __from_next, 829 extern_type* __to, extern_type* __to_end, 830 extern_type*& __to_next) const 831{ 832 range<const char32_t> from{ __from, __from_end }; 833 range<char> to{ __to, __to_end }; 834 auto res = ucs4_out(from, to); 835 __from_next = from.next; 836 __to_next = to.next; 837 return res; 838} 839 840codecvt_base::result 841codecvt<char32_t, char, mbstate_t>:: 842do_unshift(state_type&, extern_type* __to, extern_type*, 843 extern_type*& __to_next) const 844{ 845 __to_next = __to; 846 return noconv; 847} 848 849codecvt_base::result 850codecvt<char32_t, char, mbstate_t>:: 851do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 852 const extern_type*& __from_next, 853 intern_type* __to, intern_type* __to_end, 854 intern_type*& __to_next) const 855{ 856 range<const char> from{ __from, __from_end }; 857 range<char32_t> to{ __to, __to_end }; 858 auto res = ucs4_in(from, to); 859 __from_next = from.next; 860 __to_next = to.next; 861 return res; 862} 863 864int 865codecvt<char32_t, char, mbstate_t>::do_encoding() const throw() 866{ return 0; } // UTF-8 is not a fixed-width encoding 867 868bool 869codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw() 870{ return false; } 871 872int 873codecvt<char32_t, char, mbstate_t>:: 874do_length(state_type&, const extern_type* __from, 875 const extern_type* __end, size_t __max) const 876{ 877 __end = ucs4_span(__from, 
__end, __max); 878 return __end - __from; 879} 880 881int 882codecvt<char32_t, char, mbstate_t>::do_max_length() const throw() 883{ 884 // A single character (one UTF-32 code unit) requires 885 // up to 4 UTF-8 code units. 886 return 4; 887} 888 889#if defined(_GLIBCXX_USE_CHAR8_T) 890// Define members of codecvt<char16_t, char8_t, mbstate_t> specialization. 891// Converts from UTF-8 to UTF-16. 892 893locale::id codecvt<char16_t, char8_t, mbstate_t>::id; 894 895codecvt<char16_t, char8_t, mbstate_t>::~codecvt() { } 896 897codecvt_base::result 898codecvt<char16_t, char8_t, mbstate_t>:: 899do_out(state_type&, 900 const intern_type* __from, 901 const intern_type* __from_end, const intern_type*& __from_next, 902 extern_type* __to, extern_type* __to_end, 903 extern_type*& __to_next) const 904{ 905 range<const char16_t> from{ __from, __from_end }; 906 range<char8_t> to{ __to, __to_end }; 907 auto res = utf16_out(from, to); 908 __from_next = from.next; 909 __to_next = to.next; 910 return res; 911} 912 913codecvt_base::result 914codecvt<char16_t, char8_t, mbstate_t>:: 915do_unshift(state_type&, extern_type* __to, extern_type*, 916 extern_type*& __to_next) const 917{ 918 __to_next = __to; 919 return noconv; // we don't use mbstate_t for the unicode facets 920} 921 922codecvt_base::result 923codecvt<char16_t, char8_t, mbstate_t>:: 924do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 925 const extern_type*& __from_next, 926 intern_type* __to, intern_type* __to_end, 927 intern_type*& __to_next) const 928{ 929 range<const char8_t> from{ __from, __from_end }; 930 range<char16_t> to{ __to, __to_end }; 931#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 932 codecvt_mode mode = {}; 933#else 934 codecvt_mode mode = little_endian; 935#endif 936 auto res = utf16_in(from, to, max_code_point, mode); 937 __from_next = from.next; 938 __to_next = to.next; 939 return res; 940} 941 942int 943codecvt<char16_t, char8_t, mbstate_t>::do_encoding() const throw() 944{ return 0; 
} // UTF-8 is not a fixed-width encoding 945 946bool 947codecvt<char16_t, char8_t, mbstate_t>::do_always_noconv() const throw() 948{ return false; } 949 950int 951codecvt<char16_t, char8_t, mbstate_t>:: 952do_length(state_type&, const extern_type* __from, 953 const extern_type* __end, size_t __max) const 954{ 955 __end = utf16_span(__from, __end, __max); 956 return __end - __from; 957} 958 959int 960codecvt<char16_t, char8_t, mbstate_t>::do_max_length() const throw() 961{ 962 // A single character (one or two UTF-16 code units) requires 963 // up to four UTF-8 code units. 964 return 4; 965} 966 967// Define members of codecvt<char32_t, char8_t, mbstate_t> specialization. 968// Converts from UTF-8 to UTF-32 (aka UCS-4). 969 970locale::id codecvt<char32_t, char8_t, mbstate_t>::id; 971 972codecvt<char32_t, char8_t, mbstate_t>::~codecvt() { } 973 974codecvt_base::result 975codecvt<char32_t, char8_t, mbstate_t>:: 976do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 977 const intern_type*& __from_next, 978 extern_type* __to, extern_type* __to_end, 979 extern_type*& __to_next) const 980{ 981 range<const char32_t> from{ __from, __from_end }; 982 range<char8_t> to{ __to, __to_end }; 983 auto res = ucs4_out(from, to); 984 __from_next = from.next; 985 __to_next = to.next; 986 return res; 987} 988 989codecvt_base::result 990codecvt<char32_t, char8_t, mbstate_t>:: 991do_unshift(state_type&, extern_type* __to, extern_type*, 992 extern_type*& __to_next) const 993{ 994 __to_next = __to; 995 return noconv; 996} 997 998codecvt_base::result 999codecvt<char32_t, char8_t, mbstate_t>:: 1000do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1001 const extern_type*& __from_next, 1002 intern_type* __to, intern_type* __to_end, 1003 intern_type*& __to_next) const 1004{ 1005 range<const char8_t> from{ __from, __from_end }; 1006 range<char32_t> to{ __to, __to_end }; 1007 auto res = ucs4_in(from, to); 1008 __from_next = from.next; 1009 
__to_next = to.next; 1010 return res; 1011} 1012 1013int 1014codecvt<char32_t, char8_t, mbstate_t>::do_encoding() const throw() 1015{ return 0; } // UTF-8 is not a fixed-width encoding 1016 1017bool 1018codecvt<char32_t, char8_t, mbstate_t>::do_always_noconv() const throw() 1019{ return false; } 1020 1021int 1022codecvt<char32_t, char8_t, mbstate_t>:: 1023do_length(state_type&, const extern_type* __from, 1024 const extern_type* __end, size_t __max) const 1025{ 1026 __end = ucs4_span(__from, __end, __max); 1027 return __end - __from; 1028} 1029 1030int 1031codecvt<char32_t, char8_t, mbstate_t>::do_max_length() const throw() 1032{ 1033 // A single character (one UTF-32 code unit) requires 1034 // up to 4 UTF-8 code units. 1035 return 4; 1036} 1037#endif // _GLIBCXX_USE_CHAR8_T 1038 1039// Define members of codecvt_utf8<char16_t> base class implementation. 1040// Converts from UTF-8 to UCS-2. 1041 1042__codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { } 1043 1044codecvt_base::result 1045__codecvt_utf8_base<char16_t>:: 1046do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1047 const intern_type*& __from_next, 1048 extern_type* __to, extern_type* __to_end, 1049 extern_type*& __to_next) const 1050{ 1051 range<const char16_t> from{ __from, __from_end }; 1052 range<char> to{ __to, __to_end }; 1053 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 1054 __from_next = from.next; 1055 __to_next = to.next; 1056 return res; 1057} 1058 1059codecvt_base::result 1060__codecvt_utf8_base<char16_t>:: 1061do_unshift(state_type&, extern_type* __to, extern_type*, 1062 extern_type*& __to_next) const 1063{ 1064 __to_next = __to; 1065 return noconv; 1066} 1067 1068codecvt_base::result 1069__codecvt_utf8_base<char16_t>:: 1070do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1071 const extern_type*& __from_next, 1072 intern_type* __to, intern_type* __to_end, 1073 intern_type*& __to_next) const 1074{ 1075 range<const char> from{ 
__from, __from_end }; 1076 range<char16_t> to{ __to, __to_end }; 1077 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 1078#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 1079 mode = codecvt_mode(mode | little_endian); 1080#endif 1081 auto res = ucs2_in(from, to, _M_maxcode, mode); 1082 __from_next = from.next; 1083 __to_next = to.next; 1084 return res; 1085} 1086 1087int 1088__codecvt_utf8_base<char16_t>::do_encoding() const throw() 1089{ return 0; } // UTF-8 is not a fixed-width encoding 1090 1091bool 1092__codecvt_utf8_base<char16_t>::do_always_noconv() const throw() 1093{ return false; } 1094 1095int 1096__codecvt_utf8_base<char16_t>:: 1097do_length(state_type&, const extern_type* __from, 1098 const extern_type* __end, size_t __max) const 1099{ 1100 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode); 1101 return __end - __from; 1102} 1103 1104int 1105__codecvt_utf8_base<char16_t>::do_max_length() const throw() 1106{ 1107 // A single UCS-2 character requires up to three UTF-8 code units. 1108 // (UCS-2 cannot represent characters that use four UTF-8 code units). 1109 int max = 3; 1110 if (_M_mode & consume_header) 1111 max += sizeof(utf8_bom); 1112 return max; 1113} 1114 1115// Define members of codecvt_utf8<char32_t> base class implementation. 1116// Converts from UTF-8 to UTF-32 (aka UCS-4). 
1117 1118__codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { } 1119 1120codecvt_base::result 1121__codecvt_utf8_base<char32_t>:: 1122do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1123 const intern_type*& __from_next, 1124 extern_type* __to, extern_type* __to_end, 1125 extern_type*& __to_next) const 1126{ 1127 range<const char32_t> from{ __from, __from_end }; 1128 range<char> to{ __to, __to_end }; 1129 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 1130 __from_next = from.next; 1131 __to_next = to.next; 1132 return res; 1133} 1134 1135codecvt_base::result 1136__codecvt_utf8_base<char32_t>:: 1137do_unshift(state_type&, extern_type* __to, extern_type*, 1138 extern_type*& __to_next) const 1139{ 1140 __to_next = __to; 1141 return noconv; 1142} 1143 1144codecvt_base::result 1145__codecvt_utf8_base<char32_t>:: 1146do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1147 const extern_type*& __from_next, 1148 intern_type* __to, intern_type* __to_end, 1149 intern_type*& __to_next) const 1150{ 1151 range<const char> from{ __from, __from_end }; 1152 range<char32_t> to{ __to, __to_end }; 1153 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 1154 __from_next = from.next; 1155 __to_next = to.next; 1156 return res; 1157} 1158 1159int 1160__codecvt_utf8_base<char32_t>::do_encoding() const throw() 1161{ return 0; } // UTF-8 is not a fixed-width encoding 1162 1163bool 1164__codecvt_utf8_base<char32_t>::do_always_noconv() const throw() 1165{ return false; } 1166 1167int 1168__codecvt_utf8_base<char32_t>:: 1169do_length(state_type&, const extern_type* __from, 1170 const extern_type* __end, size_t __max) const 1171{ 1172 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode); 1173 return __end - __from; 1174} 1175 1176int 1177__codecvt_utf8_base<char32_t>::do_max_length() const throw() 1178{ 1179 // A single UCS-4 character requires up to four UTF-8 code units. 
1180 int max = 4; 1181 if (_M_mode & consume_header) 1182 max += sizeof(utf8_bom); 1183 return max; 1184} 1185 1186#ifdef _GLIBCXX_USE_WCHAR_T 1187 1188#if __SIZEOF_WCHAR_T__ == 2 1189static_assert(sizeof(wchar_t) == sizeof(char16_t), ""); 1190#elif __SIZEOF_WCHAR_T__ == 4 1191static_assert(sizeof(wchar_t) == sizeof(char32_t), ""); 1192#endif 1193 1194// Define members of codecvt_utf8<wchar_t> base class implementation. 1195// Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t). 1196 1197__codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { } 1198 1199codecvt_base::result 1200__codecvt_utf8_base<wchar_t>:: 1201do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1202 const intern_type*& __from_next, 1203 extern_type* __to, extern_type* __to_end, 1204 extern_type*& __to_next) const 1205{ 1206 range<char> to{ __to, __to_end }; 1207#if __SIZEOF_WCHAR_T__ == 2 1208 range<const char16_t> from{ 1209 reinterpret_cast<const char16_t*>(__from), 1210 reinterpret_cast<const char16_t*>(__from_end) 1211 }; 1212 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 1213#elif __SIZEOF_WCHAR_T__ == 4 1214 range<const char32_t> from{ 1215 reinterpret_cast<const char32_t*>(__from), 1216 reinterpret_cast<const char32_t*>(__from_end) 1217 }; 1218 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 1219#else 1220 return codecvt_base::error; 1221#endif 1222 __from_next = reinterpret_cast<const wchar_t*>(from.next); 1223 __to_next = to.next; 1224 return res; 1225} 1226 1227codecvt_base::result 1228__codecvt_utf8_base<wchar_t>:: 1229do_unshift(state_type&, extern_type* __to, extern_type*, 1230 extern_type*& __to_next) const 1231{ 1232 __to_next = __to; 1233 return noconv; 1234} 1235 1236codecvt_base::result 1237__codecvt_utf8_base<wchar_t>:: 1238do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1239 const extern_type*& __from_next, 1240 intern_type* __to, intern_type* __to_end, 1241 intern_type*& __to_next) const 1242{ 1243 
range<const char> from{ __from, __from_end }; 1244#if __SIZEOF_WCHAR_T__ == 2 1245 range<char16_t> to{ 1246 reinterpret_cast<char16_t*>(__to), 1247 reinterpret_cast<char16_t*>(__to_end) 1248 }; 1249#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 1250 codecvt_mode mode = {}; 1251#else 1252 codecvt_mode mode = little_endian; 1253#endif 1254 auto res = ucs2_in(from, to, _M_maxcode, mode); 1255#elif __SIZEOF_WCHAR_T__ == 4 1256 range<char32_t> to{ 1257 reinterpret_cast<char32_t*>(__to), 1258 reinterpret_cast<char32_t*>(__to_end) 1259 }; 1260 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 1261#else 1262 return codecvt_base::error; 1263#endif 1264 __from_next = from.next; 1265 __to_next = reinterpret_cast<wchar_t*>(to.next); 1266 return res; 1267} 1268 1269int 1270__codecvt_utf8_base<wchar_t>::do_encoding() const throw() 1271{ return 0; } // UTF-8 is not a fixed-width encoding 1272 1273bool 1274__codecvt_utf8_base<wchar_t>::do_always_noconv() const throw() 1275{ return false; } 1276 1277int 1278__codecvt_utf8_base<wchar_t>:: 1279do_length(state_type&, const extern_type* __from, 1280 const extern_type* __end, size_t __max) const 1281{ 1282#if __SIZEOF_WCHAR_T__ == 2 1283 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode); 1284#elif __SIZEOF_WCHAR_T__ == 4 1285 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode); 1286#else 1287 __end = __from; 1288#endif 1289 return __end - __from; 1290} 1291 1292int 1293__codecvt_utf8_base<wchar_t>::do_max_length() const throw() 1294{ 1295#if __SIZEOF_WCHAR_T__ == 2 1296 int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length() 1297#else 1298 int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length() 1299#endif 1300 if (_M_mode & consume_header) 1301 max += sizeof(utf8_bom); 1302 return max; 1303} 1304#endif 1305 1306// Define members of codecvt_utf16<char16_t> base class implementation. 1307// Converts from UTF-16 to UCS-2. 
1308 1309__codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { } 1310 1311codecvt_base::result 1312__codecvt_utf16_base<char16_t>:: 1313do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1314 const intern_type*& __from_next, 1315 extern_type* __to, extern_type* __to_end, 1316 extern_type*& __to_next) const 1317{ 1318 range<const char16_t> from{ __from, __from_end }; 1319 range<char16_t, false> to{ __to, __to_end }; 1320 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 1321 __from_next = from.next; 1322 __to_next = reinterpret_cast<char*>(to.next); 1323 return res; 1324} 1325 1326codecvt_base::result 1327__codecvt_utf16_base<char16_t>:: 1328do_unshift(state_type&, extern_type* __to, extern_type*, 1329 extern_type*& __to_next) const 1330{ 1331 __to_next = __to; 1332 return noconv; 1333} 1334 1335codecvt_base::result 1336__codecvt_utf16_base<char16_t>:: 1337do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1338 const extern_type*& __from_next, 1339 intern_type* __to, intern_type* __to_end, 1340 intern_type*& __to_next) const 1341{ 1342 range<const char16_t, false> from{ __from, __from_end }; 1343 range<char16_t> to{ __to, __to_end }; 1344 auto res = ucs2_in(from, to, _M_maxcode, _M_mode); 1345 __from_next = reinterpret_cast<const char*>(from.next); 1346 __to_next = to.next; 1347 if (res == codecvt_base::ok && __from_next != __from_end) 1348 res = codecvt_base::error; 1349 return res; 1350} 1351 1352int 1353__codecvt_utf16_base<char16_t>::do_encoding() const throw() 1354{ return 0; } // UTF-16 is not a fixed-width encoding 1355 1356bool 1357__codecvt_utf16_base<char16_t>::do_always_noconv() const throw() 1358{ return false; } 1359 1360int 1361__codecvt_utf16_base<char16_t>:: 1362do_length(state_type&, const extern_type* __from, 1363 const extern_type* __end, size_t __max) const 1364{ 1365 range<const char16_t, false> from{ __from, __end }; 1366 const char16_t* next = ucs2_span(from, __max, _M_maxcode, 
_M_mode); 1367 return reinterpret_cast<const char*>(next) - __from; 1368} 1369 1370int 1371__codecvt_utf16_base<char16_t>::do_max_length() const throw() 1372{ 1373 // A single UCS-2 character requires one UTF-16 code unit (so two chars). 1374 // (UCS-2 cannot represent characters that use multiple UTF-16 code units). 1375 int max = 2; 1376 if (_M_mode & consume_header) 1377 max += sizeof(utf16_bom); 1378 return max; 1379} 1380 1381// Define members of codecvt_utf16<char32_t> base class implementation. 1382// Converts from UTF-16 to UTF-32 (aka UCS-4). 1383 1384__codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { } 1385 1386codecvt_base::result 1387__codecvt_utf16_base<char32_t>:: 1388do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1389 const intern_type*& __from_next, 1390 extern_type* __to, extern_type* __to_end, 1391 extern_type*& __to_next) const 1392{ 1393 range<const char32_t> from{ __from, __from_end }; 1394 range<char16_t, false> to{ __to, __to_end }; 1395 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 1396 __from_next = from.next; 1397 __to_next = reinterpret_cast<char*>(to.next); 1398 return res; 1399} 1400 1401codecvt_base::result 1402__codecvt_utf16_base<char32_t>:: 1403do_unshift(state_type&, extern_type* __to, extern_type*, 1404 extern_type*& __to_next) const 1405{ 1406 __to_next = __to; 1407 return noconv; 1408} 1409 1410codecvt_base::result 1411__codecvt_utf16_base<char32_t>:: 1412do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1413 const extern_type*& __from_next, 1414 intern_type* __to, intern_type* __to_end, 1415 intern_type*& __to_next) const 1416{ 1417 range<const char16_t, false> from{ __from, __from_end }; 1418 range<char32_t> to{ __to, __to_end }; 1419 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 1420 __from_next = reinterpret_cast<const char*>(from.next); 1421 __to_next = to.next; 1422 if (res == codecvt_base::ok && __from_next != __from_end) 1423 res = 
codecvt_base::error; 1424 return res; 1425} 1426 1427int 1428__codecvt_utf16_base<char32_t>::do_encoding() const throw() 1429{ return 0; } // UTF-16 is not a fixed-width encoding 1430 1431bool 1432__codecvt_utf16_base<char32_t>::do_always_noconv() const throw() 1433{ return false; } 1434 1435int 1436__codecvt_utf16_base<char32_t>:: 1437do_length(state_type&, const extern_type* __from, 1438 const extern_type* __end, size_t __max) const 1439{ 1440 range<const char16_t, false> from{ __from, __end }; 1441 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); 1442 return reinterpret_cast<const char*>(next) - __from; 1443} 1444 1445int 1446__codecvt_utf16_base<char32_t>::do_max_length() const throw() 1447{ 1448 // A single UCS-4 character requires one or two UTF-16 code units 1449 // (so up to four chars). 1450 int max = 4; 1451 if (_M_mode & consume_header) 1452 max += sizeof(utf16_bom); 1453 return max; 1454} 1455 1456#ifdef _GLIBCXX_USE_WCHAR_T 1457// Define members of codecvt_utf16<wchar_t> base class implementation. 1458// Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t). 
1459 1460__codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { } 1461 1462codecvt_base::result 1463__codecvt_utf16_base<wchar_t>:: 1464do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1465 const intern_type*& __from_next, 1466 extern_type* __to, extern_type* __to_end, 1467 extern_type*& __to_next) const 1468{ 1469 range<char16_t, false> to{ __to, __to_end }; 1470#if __SIZEOF_WCHAR_T__ == 2 1471 range<const char16_t> from{ 1472 reinterpret_cast<const char16_t*>(__from), 1473 reinterpret_cast<const char16_t*>(__from_end), 1474 }; 1475 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 1476#elif __SIZEOF_WCHAR_T__ == 4 1477 range<const char32_t> from{ 1478 reinterpret_cast<const char32_t*>(__from), 1479 reinterpret_cast<const char32_t*>(__from_end), 1480 }; 1481 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 1482#else 1483 return codecvt_base::error; 1484#endif 1485 __from_next = reinterpret_cast<const wchar_t*>(from.next); 1486 __to_next = reinterpret_cast<char*>(to.next); 1487 return res; 1488} 1489 1490codecvt_base::result 1491__codecvt_utf16_base<wchar_t>:: 1492do_unshift(state_type&, extern_type* __to, extern_type*, 1493 extern_type*& __to_next) const 1494{ 1495 __to_next = __to; 1496 return noconv; 1497} 1498 1499codecvt_base::result 1500__codecvt_utf16_base<wchar_t>:: 1501do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1502 const extern_type*& __from_next, 1503 intern_type* __to, intern_type* __to_end, 1504 intern_type*& __to_next) const 1505{ 1506 range<const char16_t, false> from{ __from, __from_end }; 1507#if __SIZEOF_WCHAR_T__ == 2 1508 range<char16_t> to{ 1509 reinterpret_cast<char16_t*>(__to), 1510 reinterpret_cast<char16_t*>(__to_end), 1511 }; 1512 auto res = ucs2_in(from, to, _M_maxcode, _M_mode); 1513#elif __SIZEOF_WCHAR_T__ == 4 1514 range<char32_t> to{ 1515 reinterpret_cast<char32_t*>(__to), 1516 reinterpret_cast<char32_t*>(__to_end), 1517 }; 1518 auto res = ucs4_in(from, to, 
_M_maxcode, _M_mode); 1519#else 1520 return codecvt_base::error; 1521#endif 1522 __from_next = reinterpret_cast<const char*>(from.next); 1523 __to_next = reinterpret_cast<wchar_t*>(to.next); 1524 if (res == codecvt_base::ok && __from_next != __from_end) 1525 res = codecvt_base::error; 1526 return res; 1527} 1528 1529int 1530__codecvt_utf16_base<wchar_t>::do_encoding() const throw() 1531{ return 0; } // UTF-16 is not a fixed-width encoding 1532 1533bool 1534__codecvt_utf16_base<wchar_t>::do_always_noconv() const throw() 1535{ return false; } 1536 1537int 1538__codecvt_utf16_base<wchar_t>:: 1539do_length(state_type&, const extern_type* __from, 1540 const extern_type* __end, size_t __max) const 1541{ 1542 range<const char16_t, false> from{ __from, __end }; 1543#if __SIZEOF_WCHAR_T__ == 2 1544 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); 1545#elif __SIZEOF_WCHAR_T__ == 4 1546 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); 1547#endif 1548 return reinterpret_cast<const char*>(next) - __from; 1549} 1550 1551int 1552__codecvt_utf16_base<wchar_t>::do_max_length() const throw() 1553{ 1554#if __SIZEOF_WCHAR_T__ == 2 1555 int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length() 1556#else 1557 int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length() 1558#endif 1559 if (_M_mode & consume_header) 1560 max += sizeof(utf16_bom); 1561 return max; 1562} 1563#endif 1564 1565// Define members of codecvt_utf8_utf16<char16_t> base class implementation. 1566// Converts from UTF-8 to UTF-16. 
1567 1568__codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { } 1569 1570codecvt_base::result 1571__codecvt_utf8_utf16_base<char16_t>:: 1572do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1573 const intern_type*& __from_next, 1574 extern_type* __to, extern_type* __to_end, 1575 extern_type*& __to_next) const 1576{ 1577 range<const char16_t> from{ __from, __from_end }; 1578 range<char> to{ __to, __to_end }; 1579 auto res = utf16_out(from, to, _M_maxcode, _M_mode); 1580 __from_next = from.next; 1581 __to_next = to.next; 1582 return res; 1583} 1584 1585codecvt_base::result 1586__codecvt_utf8_utf16_base<char16_t>:: 1587do_unshift(state_type&, extern_type* __to, extern_type*, 1588 extern_type*& __to_next) const 1589{ 1590 __to_next = __to; 1591 return noconv; 1592} 1593 1594codecvt_base::result 1595__codecvt_utf8_utf16_base<char16_t>:: 1596do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1597 const extern_type*& __from_next, 1598 intern_type* __to, intern_type* __to_end, 1599 intern_type*& __to_next) const 1600{ 1601 range<const char> from{ __from, __from_end }; 1602 range<char16_t> to{ __to, __to_end }; 1603 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 1604#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 1605 mode = codecvt_mode(mode | little_endian); 1606#endif 1607 auto res = utf16_in(from, to, _M_maxcode, mode); 1608 __from_next = from.next; 1609 __to_next = to.next; 1610 return res; 1611} 1612 1613int 1614__codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw() 1615{ return 0; } // UTF-8 is not a fixed-width encoding 1616 1617bool 1618__codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw() 1619{ return false; } 1620 1621int 1622__codecvt_utf8_utf16_base<char16_t>:: 1623do_length(state_type&, const extern_type* __from, 1624 const extern_type* __end, size_t __max) const 1625{ 1626 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); 1627 
return __end - __from; 1628} 1629 1630int 1631__codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw() 1632{ 1633 // A single character can be 1 or 2 UTF-16 code units, 1634 // requiring up to 4 UTF-8 code units. 1635 int max = 4; 1636 if (_M_mode & consume_header) 1637 max += sizeof(utf8_bom); 1638 return max; 1639} 1640 1641// Define members of codecvt_utf8_utf16<char32_t> base class implementation. 1642// Converts from UTF-8 to UTF-16. 1643 1644__codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { } 1645 1646codecvt_base::result 1647__codecvt_utf8_utf16_base<char32_t>:: 1648do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1649 const intern_type*& __from_next, 1650 extern_type* __to, extern_type* __to_end, 1651 extern_type*& __to_next) const 1652{ 1653 range<const char32_t> from{ __from, __from_end }; 1654 range<char> to{ __to, __to_end }; 1655 auto res = utf16_out(from, to, _M_maxcode, _M_mode); 1656 __from_next = from.next; 1657 __to_next = to.next; 1658 return res; 1659} 1660 1661codecvt_base::result 1662__codecvt_utf8_utf16_base<char32_t>:: 1663do_unshift(state_type&, extern_type* __to, extern_type*, 1664 extern_type*& __to_next) const 1665{ 1666 __to_next = __to; 1667 return noconv; 1668} 1669 1670codecvt_base::result 1671__codecvt_utf8_utf16_base<char32_t>:: 1672do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1673 const extern_type*& __from_next, 1674 intern_type* __to, intern_type* __to_end, 1675 intern_type*& __to_next) const 1676{ 1677 range<const char> from{ __from, __from_end }; 1678 range<char32_t> to{ __to, __to_end }; 1679 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 1680#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 1681 mode = codecvt_mode(mode | little_endian); 1682#endif 1683 auto res = utf16_in(from, to, _M_maxcode, mode); 1684 __from_next = from.next; 1685 __to_next = to.next; 1686 return res; 1687} 1688 1689int 
1690__codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw() 1691{ return 0; } // UTF-8 is not a fixed-width encoding 1692 1693bool 1694__codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw() 1695{ return false; } 1696 1697int 1698__codecvt_utf8_utf16_base<char32_t>:: 1699do_length(state_type&, const extern_type* __from, 1700 const extern_type* __end, size_t __max) const 1701{ 1702 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); 1703 return __end - __from; 1704} 1705 1706int 1707__codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw() 1708{ 1709 // A single character can be 1 or 2 UTF-16 code units, 1710 // requiring up to 4 UTF-8 code units. 1711 int max = 4; 1712 if (_M_mode & consume_header) 1713 max += sizeof(utf8_bom); 1714 return max; 1715} 1716 1717#ifdef _GLIBCXX_USE_WCHAR_T 1718// Define members of codecvt_utf8_utf16<wchar_t> base class implementation. 1719// Converts from UTF-8 to UTF-16. 1720 1721__codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { } 1722 1723codecvt_base::result 1724__codecvt_utf8_utf16_base<wchar_t>:: 1725do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1726 const intern_type*& __from_next, 1727 extern_type* __to, extern_type* __to_end, 1728 extern_type*& __to_next) const 1729{ 1730 range<const wchar_t> from{ __from, __from_end }; 1731 range<char> to{ __to, __to_end }; 1732 auto res = utf16_out(from, to, _M_maxcode, _M_mode); 1733 __from_next = from.next; 1734 __to_next = to.next; 1735 return res; 1736} 1737 1738codecvt_base::result 1739__codecvt_utf8_utf16_base<wchar_t>:: 1740do_unshift(state_type&, extern_type* __to, extern_type*, 1741 extern_type*& __to_next) const 1742{ 1743 __to_next = __to; 1744 return noconv; 1745} 1746 1747codecvt_base::result 1748__codecvt_utf8_utf16_base<wchar_t>:: 1749do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1750 const extern_type*& __from_next, 1751 intern_type* __to, intern_type* 
__to_end, 1752 intern_type*& __to_next) const 1753{ 1754 range<const char> from{ __from, __from_end }; 1755 range<wchar_t> to{ __to, __to_end }; 1756 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 1757#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 1758 mode = codecvt_mode(mode | little_endian); 1759#endif 1760 auto res = utf16_in(from, to, _M_maxcode, mode); 1761 __from_next = from.next; 1762 __to_next = to.next; 1763 return res; 1764} 1765 1766int 1767__codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw() 1768{ return 0; } // UTF-8 is not a fixed-width encoding 1769 1770bool 1771__codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw() 1772{ return false; } 1773 1774int 1775__codecvt_utf8_utf16_base<wchar_t>:: 1776do_length(state_type&, const extern_type* __from, 1777 const extern_type* __end, size_t __max) const 1778{ 1779 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); 1780 return __end - __from; 1781} 1782 1783int 1784__codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw() 1785{ 1786 // A single character can be 1 or 2 UTF-16 code units, 1787 // requiring up to 4 UTF-8 code units. 1788 int max = 4; 1789 if (_M_mode & consume_header) 1790 max += sizeof(utf8_bom); 1791 return max; 1792} 1793#endif 1794 1795inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>; 1796inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>; 1797template class codecvt_byname<char16_t, char, mbstate_t>; 1798template class codecvt_byname<char32_t, char, mbstate_t>; 1799 1800#if defined(_GLIBCXX_USE_CHAR8_T) 1801inline template class __codecvt_abstract_base<char16_t, char8_t, mbstate_t>; 1802inline template class __codecvt_abstract_base<char32_t, char8_t, mbstate_t>; 1803template class codecvt_byname<char16_t, char8_t, mbstate_t>; 1804template class codecvt_byname<char32_t, char8_t, mbstate_t>; 1805#endif 1806 1807_GLIBCXX_END_NAMESPACE_VERSION 1808} 1809