1// Locale support (codecvt) -*- C++ -*-
2
3// Copyright (C) 2015-2022 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library.  This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23// <http://www.gnu.org/licenses/>.
24
25#include <codecvt>
26#include <cstring>		// std::memcpy, std::memcmp
27#include <bits/stl_algobase.h>	// std::min
28
29namespace std _GLIBCXX_VISIBILITY(default)
30{
31_GLIBCXX_BEGIN_NAMESPACE_VERSION
32
33  // The standard doesn't define these operators, which is annoying.
34  static underlying_type<codecvt_mode>::type
35  to_integer(codecvt_mode m)
36  { return static_cast<underlying_type<codecvt_mode>::type>(m); }
37
38  static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
39  { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
40
41  static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
42  { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
43
44  static codecvt_mode operator~(codecvt_mode m)
45  { return codecvt_mode(~to_integer(m)); }
46
47namespace
48{
49  // Largest code point that fits in a single UTF-16 code unit.
50  const char32_t max_single_utf16_unit = 0xFFFF;
51
52  const char32_t max_code_point = 0x10FFFF;
53
54  // The functions below rely on maxcode < incomplete_mb_character
55  // (which is enforced by the codecvt_utf* classes on construction).
56  const char32_t incomplete_mb_character = char32_t(-2);
57  const char32_t invalid_mb_sequence = char32_t(-1);
58
59  // Utility type for reading and writing code units of type Elem from
60  // a range defined by a pair of pointers.
61  template<typename Elem, bool Aligned = true>
62    struct range
63    {
64      Elem* next;
65      Elem* end;
66
67      // Write a code unit.
68      range& operator=(Elem e)
69      {
70	*next++ = e;
71	return *this;
72      }
73
74      // Read the next code unit.
75      Elem operator*() const { return *next; }
76
77      // Read the Nth code unit.
78      Elem operator[](size_t n) const { return next[n]; }
79
80      // Move to the next code unit.
81      range& operator++()
82      {
83	++next;
84	return *this;
85      }
86
87      // Move to the Nth code unit.
88      range& operator+=(size_t n)
89      {
90	next += n;
91	return *this;
92      }
93
94      // The number of code units remaining.
95      size_t size() const { return end - next; }
96
97      // The number of bytes remaining.
98      size_t nbytes() const { return (const char*)end - (const char*)next; }
99    };
100
101  // This specialization is used when accessing char16_t values through
102  // pointers to char, which might not be correctly aligned for char16_t.
103  template<typename Elem>
104    struct range<Elem, false>
105    {
106      using value_type = typename remove_const<Elem>::type;
107
108      using char_pointer = typename
109	conditional<is_const<Elem>::value, const char*, char*>::type;
110
111      char_pointer next;
112      char_pointer end;
113
114      // Write a code unit.
115      range& operator=(Elem e)
116      {
117	memcpy(next, &e, sizeof(Elem));
118	++*this;
119	return *this;
120      }
121
122      // Read the next code unit.
123      Elem operator*() const
124      {
125	value_type e;
126	memcpy(&e, next, sizeof(Elem));
127	return e;
128      }
129
130      // Read the Nth code unit.
131      Elem operator[](size_t n) const
132      {
133	value_type e;
134	memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
135	return e;
136      }
137
138      // Move to the next code unit.
139      range& operator++()
140      {
141	next += sizeof(Elem);
142	return *this;
143      }
144
145      // Move to the Nth code unit.
146      range& operator+=(size_t n)
147      {
148	next += n * sizeof(Elem);
149	return *this;
150      }
151
152      // The number of code units remaining.
153      size_t size() const { return nbytes() / sizeof(Elem); }
154
155      // The number of bytes remaining.
156      size_t nbytes() const { return end - next; }
157    };
158
159  // Multibyte sequences can have "header" consisting of Byte Order Mark
160  const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
161  const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
162  const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
163
164  // Write a BOM (space permitting).
165  template<typename C, bool A, size_t N>
166    bool
167    write_bom(range<C, A>& to, const unsigned char (&bom)[N])
168    {
169      static_assert( (N / sizeof(C)) != 0, "" );
170      static_assert( (N % sizeof(C)) == 0, "" );
171
172      if (to.nbytes() < N)
173	return false;
174      memcpy(to.next, bom, N);
175      to += (N / sizeof(C));
176      return true;
177    }
178
179  // Try to read a BOM.
180  template<typename C, bool A, size_t N>
181    bool
182    read_bom(range<C, A>& from, const unsigned char (&bom)[N])
183    {
184      static_assert( (N / sizeof(C)) != 0, "" );
185      static_assert( (N % sizeof(C)) == 0, "" );
186
187      if (from.nbytes() >= N && !memcmp(from.next, bom, N))
188	{
189	  from += (N / sizeof(C));
190	  return true;
191	}
192      return false;
193    }
194
195  // If generate_header is set in mode write out UTF-8 BOM.
196  template<typename C>
197  bool
198  write_utf8_bom(range<C>& to, codecvt_mode mode)
199  {
200    if (mode & generate_header)
201      return write_bom(to, utf8_bom);
202    return true;
203  }
204
205  // If generate_header is set in mode write out the UTF-16 BOM indicated
206  // by whether little_endian is set in mode.
207  template<bool Aligned>
208  bool
209  write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
210  {
211    if (mode & generate_header)
212    {
213      if (mode & little_endian)
214	return write_bom(to, utf16le_bom);
215      else
216	return write_bom(to, utf16_bom);
217    }
218    return true;
219  }
220
221  // If consume_header is set in mode update from.next to after any BOM.
222  template<typename C>
223  void
224  read_utf8_bom(range<const C>& from, codecvt_mode mode)
225  {
226    if (mode & consume_header)
227      read_bom(from, utf8_bom);
228  }
229
230  // If consume_header is not set in mode, no effects.
231  // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
232  // - if the UTF-16BE BOM was found unset little_endian in mode, or
233  // - if the UTF-16LE BOM was found set little_endian in mode.
234  template<bool Aligned>
235  void
236  read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
237  {
238    if (mode & consume_header)
239      {
240	if (read_bom(from, utf16_bom))
241	  mode &= ~little_endian;
242	else if (read_bom(from, utf16le_bom))
243	  mode |= little_endian;
244      }
245  }
246
247  // Read a codepoint from a UTF-8 multibyte sequence.
248  // Updates from.next if the codepoint is not greater than maxcode.
249  // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
250  template<typename C>
251  char32_t
252  read_utf8_code_point(range<const C>& from, unsigned long maxcode)
253  {
254    const size_t avail = from.size();
255    if (avail == 0)
256      return incomplete_mb_character;
257    char32_t c1 = (unsigned char) from[0];
258    // https://en.wikipedia.org/wiki/UTF-8#Sample_code
259    if (c1 < 0x80)
260    {
261      ++from;
262      return c1;
263    }
264    else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
265      return invalid_mb_sequence;
266    else if (c1 < 0xE0) // 2-byte sequence
267    {
268      if (avail < 2)
269	return incomplete_mb_character;
270      char32_t c2 = (unsigned char) from[1];
271      if ((c2 & 0xC0) != 0x80)
272	return invalid_mb_sequence;
273      char32_t c = (c1 << 6) + c2 - 0x3080;
274      if (c <= maxcode)
275	from += 2;
276      return c;
277    }
278    else if (c1 < 0xF0) // 3-byte sequence
279    {
280      if (avail < 3)
281	return incomplete_mb_character;
282      char32_t c2 = (unsigned char) from[1];
283      if ((c2 & 0xC0) != 0x80)
284	return invalid_mb_sequence;
285      if (c1 == 0xE0 && c2 < 0xA0) // overlong
286	return invalid_mb_sequence;
287      char32_t c3 = (unsigned char) from[2];
288      if ((c3 & 0xC0) != 0x80)
289	return invalid_mb_sequence;
290      char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
291      if (c <= maxcode)
292	from += 3;
293      return c;
294    }
295    else if (c1 < 0xF5) // 4-byte sequence
296    {
297      if (avail < 4)
298	return incomplete_mb_character;
299      char32_t c2 = (unsigned char) from[1];
300      if ((c2 & 0xC0) != 0x80)
301	return invalid_mb_sequence;
302      if (c1 == 0xF0 && c2 < 0x90) // overlong
303	return invalid_mb_sequence;
304      if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
305      return invalid_mb_sequence;
306      char32_t c3 = (unsigned char) from[2];
307      if ((c3 & 0xC0) != 0x80)
308	return invalid_mb_sequence;
309      char32_t c4 = (unsigned char) from[3];
310      if ((c4 & 0xC0) != 0x80)
311	return invalid_mb_sequence;
312      char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
313      if (c <= maxcode)
314	from += 4;
315      return c;
316    }
317    else // > U+10FFFF
318      return invalid_mb_sequence;
319  }
320
321  template<typename C>
322  bool
323  write_utf8_code_point(range<C>& to, char32_t code_point)
324  {
325    if (code_point < 0x80)
326      {
327	if (to.size() < 1)
328	  return false;
329	to = code_point;
330      }
331    else if (code_point <= 0x7FF)
332      {
333	if (to.size() < 2)
334	  return false;
335	to = (code_point >> 6) + 0xC0;
336	to = (code_point & 0x3F) + 0x80;
337      }
338    else if (code_point <= 0xFFFF)
339      {
340	if (to.size() < 3)
341	  return false;
342	to = (code_point >> 12) + 0xE0;
343	to = ((code_point >> 6) & 0x3F) + 0x80;
344	to = (code_point & 0x3F) + 0x80;
345      }
346    else if (code_point <= 0x10FFFF)
347      {
348	if (to.size() < 4)
349	  return false;
350	to = (code_point >> 18) + 0xF0;
351	to = ((code_point >> 12) & 0x3F) + 0x80;
352	to = ((code_point >> 6) & 0x3F) + 0x80;
353	to = (code_point & 0x3F) + 0x80;
354      }
355    else
356      return false;
357    return true;
358  }
359
360  inline char16_t
361  adjust_byte_order(char16_t c, codecvt_mode mode)
362  {
363#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
364    return (mode & little_endian) ? __builtin_bswap16(c) : c;
365#else
366    return (mode & little_endian) ? c : __builtin_bswap16(c);
367#endif
368  }
369
370  // Return true if c is a high-surrogate (aka leading) code point.
371  inline bool
372  is_high_surrogate(char32_t c)
373  {
374    return c >= 0xD800 && c <= 0xDBFF;
375  }
376
377  // Return true if c is a low-surrogate (aka trailing) code point.
378  inline bool
379  is_low_surrogate(char32_t c)
380  {
381    return c >= 0xDC00 && c <= 0xDFFF;
382  }
383
384  inline char32_t
385  surrogate_pair_to_code_point(char32_t high, char32_t low)
386  {
387    return (high << 10) + low - 0x35FDC00;
388  }
389
390  // Read a codepoint from a UTF-16 multibyte sequence.
391  // The sequence's endianness is indicated by (mode & little_endian).
392  // Updates from.next if the codepoint is not greater than maxcode.
393  // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
394  template<bool Aligned>
395    char32_t
396    read_utf16_code_point(range<const char16_t, Aligned>& from,
397			  unsigned long maxcode, codecvt_mode mode)
398    {
399      const size_t avail = from.size();
400      if (avail == 0)
401	return incomplete_mb_character;
402      int inc = 1;
403      char32_t c = adjust_byte_order(from[0], mode);
404      if (is_high_surrogate(c))
405	{
406	  if (avail < 2)
407	    return incomplete_mb_character;
408	  const char16_t c2 = adjust_byte_order(from[1], mode);
409	  if (is_low_surrogate(c2))
410	    {
411	      c = surrogate_pair_to_code_point(c, c2);
412	      inc = 2;
413	    }
414	  else
415	    return invalid_mb_sequence;
416	}
417      else if (is_low_surrogate(c))
418	return invalid_mb_sequence;
419      if (c <= maxcode)
420	from += inc;
421      return c;
422    }
423
424  template<typename C, bool A>
425  bool
426  write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
427  {
428    static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
429
430    if (codepoint <= max_single_utf16_unit)
431      {
432	if (to.size() > 0)
433	  {
434	    to = adjust_byte_order(codepoint, mode);
435	    return true;
436	  }
437      }
438    else if (to.size() > 1)
439      {
440	// Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
441	const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
442	char16_t lead = LEAD_OFFSET + (codepoint >> 10);
443	char16_t trail = 0xDC00 + (codepoint & 0x3FF);
444	to = adjust_byte_order(lead, mode);
445	to = adjust_byte_order(trail, mode);
446	return true;
447      }
448    return false;
449  }
450
451  // utf8 -> ucs4
452  template<typename C>
453  codecvt_base::result
454  ucs4_in(range<const C>& from, range<char32_t>& to,
455          unsigned long maxcode = max_code_point, codecvt_mode mode = {})
456  {
457    read_utf8_bom(from, mode);
458    while (from.size() && to.size())
459      {
460	const char32_t codepoint = read_utf8_code_point(from, maxcode);
461	if (codepoint == incomplete_mb_character)
462	  return codecvt_base::partial;
463	if (codepoint > maxcode)
464	  return codecvt_base::error;
465	to = codepoint;
466      }
467    return from.size() ? codecvt_base::partial : codecvt_base::ok;
468  }
469
470  // ucs4 -> utf8
471  template<typename C>
472  codecvt_base::result
473  ucs4_out(range<const char32_t>& from, range<C>& to,
474           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
475  {
476    if (!write_utf8_bom(to, mode))
477      return codecvt_base::partial;
478    while (from.size())
479      {
480	const char32_t c = from[0];
481	if (c > maxcode)
482	  return codecvt_base::error;
483	if (!write_utf8_code_point(to, c))
484	  return codecvt_base::partial;
485	++from;
486      }
487    return codecvt_base::ok;
488  }
489
490  // utf16 -> ucs4
491  codecvt_base::result
492  ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
493          unsigned long maxcode = max_code_point, codecvt_mode mode = {})
494  {
495    read_utf16_bom(from, mode);
496    while (from.size() && to.size())
497      {
498	const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
499	if (codepoint == incomplete_mb_character)
500	  return codecvt_base::partial;
501	if (codepoint > maxcode)
502	  return codecvt_base::error;
503	to = codepoint;
504      }
505    return from.size() ? codecvt_base::partial : codecvt_base::ok;
506  }
507
508  // ucs4 -> utf16
509  codecvt_base::result
510  ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
511           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
512  {
513    if (!write_utf16_bom(to, mode))
514      return codecvt_base::partial;
515    while (from.size())
516      {
517	const char32_t c = from[0];
518	if (c > maxcode)
519	  return codecvt_base::error;
520	if (!write_utf16_code_point(to, c, mode))
521	  return codecvt_base::partial;
522	++from;
523      }
524    return codecvt_base::ok;
525  }
526
527  // Flag indicating whether to process UTF-16 or UCS2
528  enum class surrogates { allowed, disallowed };
529
530  // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
531  template<typename C8, typename C16>
532  codecvt_base::result
533  utf16_in(range<const C8>& from, range<C16>& to,
534	   unsigned long maxcode = max_code_point, codecvt_mode mode = {},
535	   surrogates s = surrogates::allowed)
536  {
537    read_utf8_bom(from, mode);
538    while (from.size() && to.size())
539      {
540	auto orig = from;
541	const char32_t codepoint = read_utf8_code_point(from, maxcode);
542	if (codepoint == incomplete_mb_character)
543	  {
544	    if (s == surrogates::allowed)
545	      return codecvt_base::partial;
546	    else
547	      return codecvt_base::error; // No surrogates in UCS2
548	  }
549	if (codepoint > maxcode)
550	  return codecvt_base::error;
551	if (!write_utf16_code_point(to, codepoint, mode))
552	  {
553	    from = orig; // rewind to previous position
554	    return codecvt_base::partial;
555	  }
556      }
557    return codecvt_base::ok;
558  }
559
560  // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
561  template<typename C16, typename C8>
562  codecvt_base::result
563  utf16_out(range<const C16>& from, range<C8>& to,
564	    unsigned long maxcode = max_code_point, codecvt_mode mode = {},
565	    surrogates s = surrogates::allowed)
566  {
567    if (!write_utf8_bom(to, mode))
568      return codecvt_base::partial;
569    while (from.size())
570      {
571	char32_t c = from[0];
572	int inc = 1;
573	if (is_high_surrogate(c))
574	  {
575	    if (s == surrogates::disallowed)
576	      return codecvt_base::error; // No surrogates in UCS-2
577
578	    if (from.size() < 2)
579	      return codecvt_base::ok; // stop converting at this point
580
581	    const char32_t c2 = from[1];
582	    if (is_low_surrogate(c2))
583	      {
584		c = surrogate_pair_to_code_point(c, c2);
585		inc = 2;
586	      }
587	    else
588	      return codecvt_base::error;
589	  }
590	else if (is_low_surrogate(c))
591	  return codecvt_base::error;
592	if (c > maxcode)
593	  return codecvt_base::error;
594	if (!write_utf8_code_point(to, c))
595	  return codecvt_base::partial;
596	from += inc;
597      }
598    return codecvt_base::ok;
599  }
600
601  // return pos such that [begin,pos) is valid UTF-16 string no longer than max
602  template<typename C>
603  const C*
604  utf16_span(const C* begin, const C* end, size_t max,
605	     char32_t maxcode = max_code_point, codecvt_mode mode = {})
606  {
607    range<const C> from{ begin, end };
608    read_utf8_bom(from, mode);
609    size_t count = 0;
610    while (count+1 < max)
611      {
612	char32_t c = read_utf8_code_point(from, maxcode);
613	if (c > maxcode)
614	  return from.next;
615	else if (c > max_single_utf16_unit)
616	  ++count;
617	++count;
618      }
619    if (count+1 == max) // take one more character if it fits in a single unit
620      read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
621    return from.next;
622  }
623
624  // utf8 -> ucs2
625  template<typename C>
626  codecvt_base::result
627  ucs2_in(range<const C>& from, range<char16_t>& to,
628	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
629  {
630    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
631    maxcode = std::min(max_single_utf16_unit, maxcode);
632    return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
633  }
634
635  // ucs2 -> utf8
636  template<typename C>
637  codecvt_base::result
638  ucs2_out(range<const char16_t>& from, range<C>& to,
639	   char32_t maxcode = max_code_point, codecvt_mode mode = {})
640  {
641    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
642    maxcode = std::min(max_single_utf16_unit, maxcode);
643    return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
644  }
645
646  // ucs2 -> utf16
647  codecvt_base::result
648  ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
649	   char32_t maxcode = max_code_point, codecvt_mode mode = {})
650  {
651    if (!write_utf16_bom(to, mode))
652      return codecvt_base::partial;
653    while (from.size() && to.size())
654      {
655	char16_t c = from[0];
656	if (is_high_surrogate(c))
657	  return codecvt_base::error;
658	if (c > maxcode)
659	  return codecvt_base::error;
660	to = adjust_byte_order(c, mode);
661	++from;
662      }
663    return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
664  }
665
666  // utf16 -> ucs2
667  codecvt_base::result
668  ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
669	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
670  {
671    read_utf16_bom(from, mode);
672    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
673    maxcode = std::min(max_single_utf16_unit, maxcode);
674    while (from.size() && to.size())
675      {
676	const char32_t c = read_utf16_code_point(from, maxcode, mode);
677	if (c == incomplete_mb_character)
678	  return codecvt_base::error; // UCS-2 only supports single units.
679	if (c > maxcode)
680	  return codecvt_base::error;
681	to = c;
682      }
683    return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
684  }
685
686  const char16_t*
687  ucs2_span(range<const char16_t, false>& from, size_t max,
688            char32_t maxcode, codecvt_mode mode)
689  {
690    read_utf16_bom(from, mode);
691    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
692    maxcode = std::min(max_single_utf16_unit, maxcode);
693    char32_t c = 0;
694    while (max-- && c <= maxcode)
695      c = read_utf16_code_point(from, maxcode, mode);
696    return reinterpret_cast<const char16_t*>(from.next);
697  }
698
699  template<typename C>
700  const C*
701  ucs2_span(const C* begin, const C* end, size_t max,
702            char32_t maxcode, codecvt_mode mode)
703  {
704    range<const C> from{ begin, end };
705    read_utf8_bom(from, mode);
706    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
707    maxcode = std::min(max_single_utf16_unit, maxcode);
708    char32_t c = 0;
709    while (max-- && c <= maxcode)
710      c = read_utf8_code_point(from, maxcode);
711    return from.next;
712  }
713
714  // return pos such that [begin,pos) is valid UCS-4 string no longer than max
715  template<typename C>
716  const C*
717  ucs4_span(const C* begin, const C* end, size_t max,
718            char32_t maxcode = max_code_point, codecvt_mode mode = {})
719  {
720    range<const C> from{ begin, end };
721    read_utf8_bom(from, mode);
722    char32_t c = 0;
723    while (max-- && c <= maxcode)
724      c = read_utf8_code_point(from, maxcode);
725    return from.next;
726  }
727
728  // return pos such that [begin,pos) is valid UCS-4 string no longer than max
729  const char16_t*
730  ucs4_span(range<const char16_t, false>& from, size_t max,
731            char32_t maxcode = max_code_point, codecvt_mode mode = {})
732  {
733    read_utf16_bom(from, mode);
734    char32_t c = 0;
735    while (max-- && c <= maxcode)
736      c = read_utf16_code_point(from, maxcode, mode);
737    return reinterpret_cast<const char16_t*>(from.next);
738  }
739}
740
741// Define members of codecvt<char16_t, char, mbstate_t> specialization.
742// Converts from UTF-8 to UTF-16.
743
// Definition of the locale::id member of this specialization.
locale::id codecvt<char16_t, char, mbstate_t>::id;

// Out-of-line destructor definition; the body is empty.
codecvt<char16_t, char, mbstate_t>::~codecvt() { }
747
748codecvt_base::result
749codecvt<char16_t, char, mbstate_t>::
750do_out(state_type&,
751       const intern_type* __from,
752       const intern_type* __from_end, const intern_type*& __from_next,
753       extern_type* __to, extern_type* __to_end,
754       extern_type*& __to_next) const
755{
756  range<const char16_t> from{ __from, __from_end };
757  range<char> to{ __to, __to_end };
758  auto res = utf16_out(from, to);
759  __from_next = from.next;
760  __to_next = to.next;
761  return res;
762}
763
764codecvt_base::result
765codecvt<char16_t, char, mbstate_t>::
766do_unshift(state_type&, extern_type* __to, extern_type*,
767	   extern_type*& __to_next) const
768{
769  __to_next = __to;
770  return noconv; // we don't use mbstate_t for the unicode facets
771}
772
773codecvt_base::result
774codecvt<char16_t, char, mbstate_t>::
775do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
776      const extern_type*& __from_next,
777      intern_type* __to, intern_type* __to_end,
778      intern_type*& __to_next) const
779{
780  range<const char> from{ __from, __from_end };
781  range<char16_t> to{ __to, __to_end };
782#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
783  codecvt_mode mode = {};
784#else
785  codecvt_mode mode = little_endian;
786#endif
787  auto res = utf16_in(from, to, max_code_point, mode);
788  __from_next = from.next;
789  __to_next = to.next;
790  return res;
791}
792
793int
794codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
795{ return 0; } // UTF-8 is not a fixed-width encoding
796
797bool
798codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
799{ return false; }
800
801int
802codecvt<char16_t, char, mbstate_t>::
803do_length(state_type&, const extern_type* __from,
804	  const extern_type* __end, size_t __max) const
805{
806  __end = utf16_span(__from, __end, __max);
807  return __end - __from;
808}
809
810int
811codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
812{
813  // A single character (one or two UTF-16 code units) requires
814  // up to four UTF-8 code units.
815  return 4;
816}
817
818// Define members of codecvt<char32_t, char, mbstate_t> specialization.
819// Converts from UTF-8 to UTF-32 (aka UCS-4).
820
// Definition of the locale::id member of this specialization.
locale::id codecvt<char32_t, char, mbstate_t>::id;

// Out-of-line destructor definition; the body is empty.
codecvt<char32_t, char, mbstate_t>::~codecvt() { }
824
825codecvt_base::result
826codecvt<char32_t, char, mbstate_t>::
827do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
828       const intern_type*& __from_next,
829       extern_type* __to, extern_type* __to_end,
830       extern_type*& __to_next) const
831{
832  range<const char32_t> from{ __from, __from_end };
833  range<char> to{ __to, __to_end };
834  auto res = ucs4_out(from, to);
835  __from_next = from.next;
836  __to_next = to.next;
837  return res;
838}
839
840codecvt_base::result
841codecvt<char32_t, char, mbstate_t>::
842do_unshift(state_type&, extern_type* __to, extern_type*,
843	   extern_type*& __to_next) const
844{
845  __to_next = __to;
846  return noconv;
847}
848
849codecvt_base::result
850codecvt<char32_t, char, mbstate_t>::
851do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
852      const extern_type*& __from_next,
853      intern_type* __to, intern_type* __to_end,
854      intern_type*& __to_next) const
855{
856  range<const char> from{ __from, __from_end };
857  range<char32_t> to{ __to, __to_end };
858  auto res = ucs4_in(from, to);
859  __from_next = from.next;
860  __to_next = to.next;
861  return res;
862}
863
864int
865codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
866{ return 0; } // UTF-8 is not a fixed-width encoding
867
868bool
869codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
870{ return false; }
871
872int
873codecvt<char32_t, char, mbstate_t>::
874do_length(state_type&, const extern_type* __from,
875	  const extern_type* __end, size_t __max) const
876{
877  __end = ucs4_span(__from, __end, __max);
878  return __end - __from;
879}
880
881int
882codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
883{
884  // A single character (one UTF-32 code unit) requires
885  // up to 4 UTF-8 code units.
886  return 4;
887}
888
889#if defined(_GLIBCXX_USE_CHAR8_T)
890// Define members of codecvt<char16_t, char8_t, mbstate_t> specialization.
891// Converts from UTF-8 to UTF-16.
892
// Definition of the locale::id member of this specialization.
locale::id codecvt<char16_t, char8_t, mbstate_t>::id;

// Out-of-line destructor definition; the body is empty.
codecvt<char16_t, char8_t, mbstate_t>::~codecvt() { }
896
897codecvt_base::result
898codecvt<char16_t, char8_t, mbstate_t>::
899do_out(state_type&,
900       const intern_type* __from,
901       const intern_type* __from_end, const intern_type*& __from_next,
902       extern_type* __to, extern_type* __to_end,
903       extern_type*& __to_next) const
904{
905  range<const char16_t> from{ __from, __from_end };
906  range<char8_t> to{ __to, __to_end };
907  auto res = utf16_out(from, to);
908  __from_next = from.next;
909  __to_next = to.next;
910  return res;
911}
912
913codecvt_base::result
914codecvt<char16_t, char8_t, mbstate_t>::
915do_unshift(state_type&, extern_type* __to, extern_type*,
916	   extern_type*& __to_next) const
917{
918  __to_next = __to;
919  return noconv; // we don't use mbstate_t for the unicode facets
920}
921
922codecvt_base::result
923codecvt<char16_t, char8_t, mbstate_t>::
924do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
925      const extern_type*& __from_next,
926      intern_type* __to, intern_type* __to_end,
927      intern_type*& __to_next) const
928{
929  range<const char8_t> from{ __from, __from_end };
930  range<char16_t> to{ __to, __to_end };
931#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
932  codecvt_mode mode = {};
933#else
934  codecvt_mode mode = little_endian;
935#endif
936  auto res = utf16_in(from, to, max_code_point, mode);
937  __from_next = from.next;
938  __to_next = to.next;
939  return res;
940}
941
942int
943codecvt<char16_t, char8_t, mbstate_t>::do_encoding() const throw()
944{ return 0; } // UTF-8 is not a fixed-width encoding
945
946bool
947codecvt<char16_t, char8_t, mbstate_t>::do_always_noconv() const throw()
948{ return false; }
949
950int
951codecvt<char16_t, char8_t, mbstate_t>::
952do_length(state_type&, const extern_type* __from,
953	  const extern_type* __end, size_t __max) const
954{
955  __end = utf16_span(__from, __end, __max);
956  return __end - __from;
957}
958
959int
960codecvt<char16_t, char8_t, mbstate_t>::do_max_length() const throw()
961{
962  // A single character (one or two UTF-16 code units) requires
963  // up to four UTF-8 code units.
964  return 4;
965}
966
967// Define members of codecvt<char32_t, char8_t, mbstate_t> specialization.
968// Converts from UTF-8 to UTF-32 (aka UCS-4).
969
// Definition of the locale::id member of this specialization.
locale::id codecvt<char32_t, char8_t, mbstate_t>::id;

// Out-of-line destructor definition; the body is empty.
codecvt<char32_t, char8_t, mbstate_t>::~codecvt() { }
973
974codecvt_base::result
975codecvt<char32_t, char8_t, mbstate_t>::
976do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
977       const intern_type*& __from_next,
978       extern_type* __to, extern_type* __to_end,
979       extern_type*& __to_next) const
980{
981  range<const char32_t> from{ __from, __from_end };
982  range<char8_t> to{ __to, __to_end };
983  auto res = ucs4_out(from, to);
984  __from_next = from.next;
985  __to_next = to.next;
986  return res;
987}
988
989codecvt_base::result
990codecvt<char32_t, char8_t, mbstate_t>::
991do_unshift(state_type&, extern_type* __to, extern_type*,
992	   extern_type*& __to_next) const
993{
994  __to_next = __to;
995  return noconv;
996}
997
998codecvt_base::result
999codecvt<char32_t, char8_t, mbstate_t>::
1000do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1001      const extern_type*& __from_next,
1002      intern_type* __to, intern_type* __to_end,
1003      intern_type*& __to_next) const
1004{
1005  range<const char8_t> from{ __from, __from_end };
1006  range<char32_t> to{ __to, __to_end };
1007  auto res = ucs4_in(from, to);
1008  __from_next = from.next;
1009  __to_next = to.next;
1010  return res;
1011}
1012
1013int
1014codecvt<char32_t, char8_t, mbstate_t>::do_encoding() const throw()
1015{ return 0; } // UTF-8 is not a fixed-width encoding
1016
1017bool
1018codecvt<char32_t, char8_t, mbstate_t>::do_always_noconv() const throw()
1019{ return false; }
1020
1021int
1022codecvt<char32_t, char8_t, mbstate_t>::
1023do_length(state_type&, const extern_type* __from,
1024	  const extern_type* __end, size_t __max) const
1025{
1026  __end = ucs4_span(__from, __end, __max);
1027  return __end - __from;
1028}
1029
1030int
1031codecvt<char32_t, char8_t, mbstate_t>::do_max_length() const throw()
1032{
1033  // A single character (one UTF-32 code unit) requires
1034  // up to 4 UTF-8 code units.
1035  return 4;
1036}
1037#endif // _GLIBCXX_USE_CHAR8_T
1038
1039// Define members of codecvt_utf8<char16_t> base class implementation.
1040// Converts from UTF-8 to UCS-2.
1041
1042__codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
1043
1044codecvt_base::result
1045__codecvt_utf8_base<char16_t>::
1046do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1047       const intern_type*& __from_next,
1048       extern_type* __to, extern_type* __to_end,
1049       extern_type*& __to_next) const
1050{
1051  range<const char16_t> from{ __from, __from_end };
1052  range<char> to{ __to, __to_end };
1053  auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1054  __from_next = from.next;
1055  __to_next = to.next;
1056  return res;
1057}
1058
1059codecvt_base::result
1060__codecvt_utf8_base<char16_t>::
1061do_unshift(state_type&, extern_type* __to, extern_type*,
1062	   extern_type*& __to_next) const
1063{
1064  __to_next = __to;
1065  return noconv;
1066}
1067
1068codecvt_base::result
1069__codecvt_utf8_base<char16_t>::
1070do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1071      const extern_type*& __from_next,
1072      intern_type* __to, intern_type* __to_end,
1073      intern_type*& __to_next) const
1074{
1075  range<const char> from{ __from, __from_end };
1076  range<char16_t> to{ __to, __to_end };
1077  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1078#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1079  mode = codecvt_mode(mode | little_endian);
1080#endif
1081  auto res = ucs2_in(from, to, _M_maxcode, mode);
1082  __from_next = from.next;
1083  __to_next = to.next;
1084  return res;
1085}
1086
1087int
1088__codecvt_utf8_base<char16_t>::do_encoding() const throw()
1089{ return 0; } // UTF-8 is not a fixed-width encoding
1090
1091bool
1092__codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
1093{ return false; }
1094
1095int
1096__codecvt_utf8_base<char16_t>::
1097do_length(state_type&, const extern_type* __from,
1098	  const extern_type* __end, size_t __max) const
1099{
1100  __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1101  return __end - __from;
1102}
1103
1104int
1105__codecvt_utf8_base<char16_t>::do_max_length() const throw()
1106{
1107  // A single UCS-2 character requires up to three UTF-8 code units.
1108  // (UCS-2 cannot represent characters that use four UTF-8 code units).
1109  int max = 3;
1110  if (_M_mode & consume_header)
1111    max += sizeof(utf8_bom);
1112  return max;
1113}
1114
1115// Define members of codecvt_utf8<char32_t> base class implementation.
1116// Converts from UTF-8 to UTF-32 (aka UCS-4).
1117
1118__codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
1119
1120codecvt_base::result
1121__codecvt_utf8_base<char32_t>::
1122do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1123       const intern_type*& __from_next,
1124       extern_type* __to, extern_type* __to_end,
1125       extern_type*& __to_next) const
1126{
1127  range<const char32_t> from{ __from, __from_end };
1128  range<char> to{ __to, __to_end };
1129  auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1130  __from_next = from.next;
1131  __to_next = to.next;
1132  return res;
1133}
1134
1135codecvt_base::result
1136__codecvt_utf8_base<char32_t>::
1137do_unshift(state_type&, extern_type* __to, extern_type*,
1138	   extern_type*& __to_next) const
1139{
1140  __to_next = __to;
1141  return noconv;
1142}
1143
1144codecvt_base::result
1145__codecvt_utf8_base<char32_t>::
1146do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1147      const extern_type*& __from_next,
1148      intern_type* __to, intern_type* __to_end,
1149      intern_type*& __to_next) const
1150{
1151  range<const char> from{ __from, __from_end };
1152  range<char32_t> to{ __to, __to_end };
1153  auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1154  __from_next = from.next;
1155  __to_next = to.next;
1156  return res;
1157}
1158
1159int
1160__codecvt_utf8_base<char32_t>::do_encoding() const throw()
1161{ return 0; } // UTF-8 is not a fixed-width encoding
1162
1163bool
1164__codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1165{ return false; }
1166
1167int
1168__codecvt_utf8_base<char32_t>::
1169do_length(state_type&, const extern_type* __from,
1170	  const extern_type* __end, size_t __max) const
1171{
1172  __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1173  return __end - __from;
1174}
1175
1176int
1177__codecvt_utf8_base<char32_t>::do_max_length() const throw()
1178{
1179  // A single UCS-4 character requires up to four UTF-8 code units.
1180  int max = 4;
1181  if (_M_mode & consume_header)
1182    max += sizeof(utf8_bom);
1183  return max;
1184}
1185
1186#ifdef _GLIBCXX_USE_WCHAR_T
1187
// The wchar_t facets below reinterpret wchar_t buffers as char16_t or
// char32_t buffers, so the element sizes must match.
#if __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(char16_t) == sizeof(wchar_t), "");
#elif __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(char32_t) == sizeof(wchar_t), "");
#endif
1193
1194// Define members of codecvt_utf8<wchar_t> base class implementation.
1195// Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1196
1197__codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1198
1199codecvt_base::result
1200__codecvt_utf8_base<wchar_t>::
1201do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1202       const intern_type*& __from_next,
1203       extern_type* __to, extern_type* __to_end,
1204       extern_type*& __to_next) const
1205{
1206  range<char> to{ __to, __to_end };
1207#if __SIZEOF_WCHAR_T__ == 2
1208  range<const char16_t> from{
1209    reinterpret_cast<const char16_t*>(__from),
1210    reinterpret_cast<const char16_t*>(__from_end)
1211  };
1212  auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1213#elif __SIZEOF_WCHAR_T__ == 4
1214  range<const char32_t> from{
1215    reinterpret_cast<const char32_t*>(__from),
1216    reinterpret_cast<const char32_t*>(__from_end)
1217  };
1218  auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1219#else
1220  return codecvt_base::error;
1221#endif
1222  __from_next = reinterpret_cast<const wchar_t*>(from.next);
1223  __to_next = to.next;
1224  return res;
1225}
1226
1227codecvt_base::result
1228__codecvt_utf8_base<wchar_t>::
1229do_unshift(state_type&, extern_type* __to, extern_type*,
1230	   extern_type*& __to_next) const
1231{
1232  __to_next = __to;
1233  return noconv;
1234}
1235
1236codecvt_base::result
1237__codecvt_utf8_base<wchar_t>::
1238do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1239      const extern_type*& __from_next,
1240      intern_type* __to, intern_type* __to_end,
1241      intern_type*& __to_next) const
1242{
1243  range<const char> from{ __from, __from_end };
1244#if __SIZEOF_WCHAR_T__ == 2
1245  range<char16_t> to{
1246    reinterpret_cast<char16_t*>(__to),
1247    reinterpret_cast<char16_t*>(__to_end)
1248  };
1249#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1250  codecvt_mode mode = {};
1251#else
1252  codecvt_mode mode = little_endian;
1253#endif
1254  auto res = ucs2_in(from, to, _M_maxcode, mode);
1255#elif __SIZEOF_WCHAR_T__ == 4
1256  range<char32_t> to{
1257    reinterpret_cast<char32_t*>(__to),
1258    reinterpret_cast<char32_t*>(__to_end)
1259  };
1260  auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1261#else
1262  return codecvt_base::error;
1263#endif
1264  __from_next = from.next;
1265  __to_next = reinterpret_cast<wchar_t*>(to.next);
1266  return res;
1267}
1268
1269int
1270__codecvt_utf8_base<wchar_t>::do_encoding() const throw()
1271{ return 0; } // UTF-8 is not a fixed-width encoding
1272
1273bool
1274__codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1275{ return false; }
1276
1277int
1278__codecvt_utf8_base<wchar_t>::
1279do_length(state_type&, const extern_type* __from,
1280	  const extern_type* __end, size_t __max) const
1281{
1282#if __SIZEOF_WCHAR_T__ == 2
1283  __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1284#elif __SIZEOF_WCHAR_T__ == 4
1285  __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1286#else
1287  __end = __from;
1288#endif
1289  return __end - __from;
1290}
1291
1292int
1293__codecvt_utf8_base<wchar_t>::do_max_length() const throw()
1294{
1295#if __SIZEOF_WCHAR_T__ == 2
1296  int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1297#else
1298  int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1299#endif
1300  if (_M_mode & consume_header)
1301    max += sizeof(utf8_bom);
1302  return max;
1303}
1304#endif
1305
1306// Define members of codecvt_utf16<char16_t> base class implementation.
1307// Converts from UTF-16 to UCS-2.
1308
1309__codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1310
1311codecvt_base::result
1312__codecvt_utf16_base<char16_t>::
1313do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1314       const intern_type*& __from_next,
1315       extern_type* __to, extern_type* __to_end,
1316       extern_type*& __to_next) const
1317{
1318  range<const char16_t> from{ __from, __from_end };
1319  range<char16_t, false> to{ __to, __to_end };
1320  auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1321  __from_next = from.next;
1322  __to_next = reinterpret_cast<char*>(to.next);
1323  return res;
1324}
1325
1326codecvt_base::result
1327__codecvt_utf16_base<char16_t>::
1328do_unshift(state_type&, extern_type* __to, extern_type*,
1329	   extern_type*& __to_next) const
1330{
1331  __to_next = __to;
1332  return noconv;
1333}
1334
1335codecvt_base::result
1336__codecvt_utf16_base<char16_t>::
1337do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1338      const extern_type*& __from_next,
1339      intern_type* __to, intern_type* __to_end,
1340      intern_type*& __to_next) const
1341{
1342  range<const char16_t, false> from{ __from, __from_end };
1343  range<char16_t> to{ __to, __to_end };
1344  auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1345  __from_next = reinterpret_cast<const char*>(from.next);
1346  __to_next = to.next;
1347  if (res == codecvt_base::ok && __from_next != __from_end)
1348    res = codecvt_base::error;
1349  return res;
1350}
1351
1352int
1353__codecvt_utf16_base<char16_t>::do_encoding() const throw()
1354{ return 0; } // UTF-16 is not a fixed-width encoding
1355
1356bool
1357__codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1358{ return false; }
1359
1360int
1361__codecvt_utf16_base<char16_t>::
1362do_length(state_type&, const extern_type* __from,
1363	  const extern_type* __end, size_t __max) const
1364{
1365  range<const char16_t, false> from{ __from, __end };
1366  const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1367  return reinterpret_cast<const char*>(next) - __from;
1368}
1369
1370int
1371__codecvt_utf16_base<char16_t>::do_max_length() const throw()
1372{
1373  // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1374  // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1375  int max = 2;
1376  if (_M_mode & consume_header)
1377    max += sizeof(utf16_bom);
1378  return max;
1379}
1380
1381// Define members of codecvt_utf16<char32_t> base class implementation.
1382// Converts from UTF-16 to UTF-32 (aka UCS-4).
1383
1384__codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1385
1386codecvt_base::result
1387__codecvt_utf16_base<char32_t>::
1388do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1389       const intern_type*& __from_next,
1390       extern_type* __to, extern_type* __to_end,
1391       extern_type*& __to_next) const
1392{
1393  range<const char32_t> from{ __from, __from_end };
1394  range<char16_t, false> to{ __to, __to_end };
1395  auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1396  __from_next = from.next;
1397  __to_next = reinterpret_cast<char*>(to.next);
1398  return res;
1399}
1400
1401codecvt_base::result
1402__codecvt_utf16_base<char32_t>::
1403do_unshift(state_type&, extern_type* __to, extern_type*,
1404	   extern_type*& __to_next) const
1405{
1406  __to_next = __to;
1407  return noconv;
1408}
1409
1410codecvt_base::result
1411__codecvt_utf16_base<char32_t>::
1412do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1413      const extern_type*& __from_next,
1414      intern_type* __to, intern_type* __to_end,
1415      intern_type*& __to_next) const
1416{
1417  range<const char16_t, false> from{ __from, __from_end };
1418  range<char32_t> to{ __to, __to_end };
1419  auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1420  __from_next = reinterpret_cast<const char*>(from.next);
1421  __to_next = to.next;
1422  if (res == codecvt_base::ok && __from_next != __from_end)
1423    res = codecvt_base::error;
1424  return res;
1425}
1426
1427int
1428__codecvt_utf16_base<char32_t>::do_encoding() const throw()
1429{ return 0; } // UTF-16 is not a fixed-width encoding
1430
1431bool
1432__codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1433{ return false; }
1434
1435int
1436__codecvt_utf16_base<char32_t>::
1437do_length(state_type&, const extern_type* __from,
1438	  const extern_type* __end, size_t __max) const
1439{
1440  range<const char16_t, false> from{ __from, __end };
1441  const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1442  return reinterpret_cast<const char*>(next) - __from;
1443}
1444
1445int
1446__codecvt_utf16_base<char32_t>::do_max_length() const throw()
1447{
1448  // A single UCS-4 character requires one or two UTF-16 code units
1449  // (so up to four chars).
1450  int max = 4;
1451  if (_M_mode & consume_header)
1452    max += sizeof(utf16_bom);
1453  return max;
1454}
1455
1456#ifdef _GLIBCXX_USE_WCHAR_T
1457// Define members of codecvt_utf16<wchar_t> base class implementation.
// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1459
1460__codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1461
1462codecvt_base::result
1463__codecvt_utf16_base<wchar_t>::
1464do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1465       const intern_type*& __from_next,
1466       extern_type* __to, extern_type* __to_end,
1467       extern_type*& __to_next) const
1468{
1469  range<char16_t, false> to{ __to, __to_end };
1470#if __SIZEOF_WCHAR_T__ == 2
1471  range<const char16_t> from{
1472    reinterpret_cast<const char16_t*>(__from),
1473    reinterpret_cast<const char16_t*>(__from_end),
1474  };
1475  auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1476#elif __SIZEOF_WCHAR_T__ == 4
1477  range<const char32_t> from{
1478    reinterpret_cast<const char32_t*>(__from),
1479    reinterpret_cast<const char32_t*>(__from_end),
1480  };
1481  auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1482#else
1483  return codecvt_base::error;
1484#endif
1485  __from_next = reinterpret_cast<const wchar_t*>(from.next);
1486  __to_next = reinterpret_cast<char*>(to.next);
1487  return res;
1488}
1489
1490codecvt_base::result
1491__codecvt_utf16_base<wchar_t>::
1492do_unshift(state_type&, extern_type* __to, extern_type*,
1493	   extern_type*& __to_next) const
1494{
1495  __to_next = __to;
1496  return noconv;
1497}
1498
1499codecvt_base::result
1500__codecvt_utf16_base<wchar_t>::
1501do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1502      const extern_type*& __from_next,
1503      intern_type* __to, intern_type* __to_end,
1504      intern_type*& __to_next) const
1505{
1506  range<const char16_t, false> from{ __from, __from_end };
1507#if __SIZEOF_WCHAR_T__ == 2
1508  range<char16_t> to{
1509    reinterpret_cast<char16_t*>(__to),
1510    reinterpret_cast<char16_t*>(__to_end),
1511  };
1512  auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1513#elif __SIZEOF_WCHAR_T__ == 4
1514  range<char32_t> to{
1515    reinterpret_cast<char32_t*>(__to),
1516    reinterpret_cast<char32_t*>(__to_end),
1517  };
1518  auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1519#else
1520  return codecvt_base::error;
1521#endif
1522  __from_next = reinterpret_cast<const char*>(from.next);
1523  __to_next = reinterpret_cast<wchar_t*>(to.next);
1524  if (res == codecvt_base::ok && __from_next != __from_end)
1525    res = codecvt_base::error;
1526  return res;
1527}
1528
1529int
1530__codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1531{ return 0; } // UTF-16 is not a fixed-width encoding
1532
1533bool
1534__codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1535{ return false; }
1536
1537int
1538__codecvt_utf16_base<wchar_t>::
1539do_length(state_type&, const extern_type* __from,
1540	  const extern_type* __end, size_t __max) const
1541{
1542  range<const char16_t, false> from{ __from, __end };
1543#if __SIZEOF_WCHAR_T__ == 2
1544  const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1545#elif __SIZEOF_WCHAR_T__ == 4
1546  const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1547#endif
1548  return reinterpret_cast<const char*>(next) - __from;
1549}
1550
1551int
1552__codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1553{
1554#if __SIZEOF_WCHAR_T__ == 2
1555  int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1556#else
1557  int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1558#endif
1559  if (_M_mode & consume_header)
1560    max += sizeof(utf16_bom);
1561  return max;
1562}
1563#endif
1564
1565// Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1566// Converts from UTF-8 to UTF-16.
1567
1568__codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1569
1570codecvt_base::result
1571__codecvt_utf8_utf16_base<char16_t>::
1572do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1573       const intern_type*& __from_next,
1574       extern_type* __to, extern_type* __to_end,
1575       extern_type*& __to_next) const
1576{
1577  range<const char16_t> from{ __from, __from_end };
1578  range<char> to{ __to, __to_end };
1579  auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1580  __from_next = from.next;
1581  __to_next = to.next;
1582  return res;
1583}
1584
1585codecvt_base::result
1586__codecvt_utf8_utf16_base<char16_t>::
1587do_unshift(state_type&, extern_type* __to, extern_type*,
1588	   extern_type*& __to_next) const
1589{
1590  __to_next = __to;
1591  return noconv;
1592}
1593
1594codecvt_base::result
1595__codecvt_utf8_utf16_base<char16_t>::
1596do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1597      const extern_type*& __from_next,
1598      intern_type* __to, intern_type* __to_end,
1599      intern_type*& __to_next) const
1600{
1601  range<const char> from{ __from, __from_end };
1602  range<char16_t> to{ __to, __to_end };
1603  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1604#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1605  mode = codecvt_mode(mode | little_endian);
1606#endif
1607  auto res = utf16_in(from, to, _M_maxcode, mode);
1608  __from_next = from.next;
1609  __to_next = to.next;
1610  return res;
1611}
1612
1613int
1614__codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
1615{ return 0; } // UTF-8 is not a fixed-width encoding
1616
1617bool
1618__codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1619{ return false; }
1620
1621int
1622__codecvt_utf8_utf16_base<char16_t>::
1623do_length(state_type&, const extern_type* __from,
1624	  const extern_type* __end, size_t __max) const
1625{
1626  __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1627  return __end - __from;
1628}
1629
1630int
1631__codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1632{
1633  // A single character can be 1 or 2 UTF-16 code units,
1634  // requiring up to 4 UTF-8 code units.
1635  int max = 4;
1636  if (_M_mode & consume_header)
1637    max += sizeof(utf8_bom);
1638  return max;
1639}
1640
1641// Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1642// Converts from UTF-8 to UTF-16.
1643
1644__codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1645
1646codecvt_base::result
1647__codecvt_utf8_utf16_base<char32_t>::
1648do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1649       const intern_type*& __from_next,
1650       extern_type* __to, extern_type* __to_end,
1651       extern_type*& __to_next) const
1652{
1653  range<const char32_t> from{ __from, __from_end };
1654  range<char> to{ __to, __to_end };
1655  auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1656  __from_next = from.next;
1657  __to_next = to.next;
1658  return res;
1659}
1660
1661codecvt_base::result
1662__codecvt_utf8_utf16_base<char32_t>::
1663do_unshift(state_type&, extern_type* __to, extern_type*,
1664	   extern_type*& __to_next) const
1665{
1666  __to_next = __to;
1667  return noconv;
1668}
1669
1670codecvt_base::result
1671__codecvt_utf8_utf16_base<char32_t>::
1672do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1673      const extern_type*& __from_next,
1674      intern_type* __to, intern_type* __to_end,
1675      intern_type*& __to_next) const
1676{
1677  range<const char> from{ __from, __from_end };
1678  range<char32_t> to{ __to, __to_end };
1679  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1680#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1681  mode = codecvt_mode(mode | little_endian);
1682#endif
1683  auto res = utf16_in(from, to, _M_maxcode, mode);
1684  __from_next = from.next;
1685  __to_next = to.next;
1686  return res;
1687}
1688
1689int
1690__codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
1691{ return 0; } // UTF-8 is not a fixed-width encoding
1692
1693bool
1694__codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1695{ return false; }
1696
1697int
1698__codecvt_utf8_utf16_base<char32_t>::
1699do_length(state_type&, const extern_type* __from,
1700	  const extern_type* __end, size_t __max) const
1701{
1702  __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1703  return __end - __from;
1704}
1705
1706int
1707__codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1708{
1709  // A single character can be 1 or 2 UTF-16 code units,
1710  // requiring up to 4 UTF-8 code units.
1711  int max = 4;
1712  if (_M_mode & consume_header)
1713    max += sizeof(utf8_bom);
1714  return max;
1715}
1716
1717#ifdef _GLIBCXX_USE_WCHAR_T
1718// Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1719// Converts from UTF-8 to UTF-16.
1720
1721__codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1722
1723codecvt_base::result
1724__codecvt_utf8_utf16_base<wchar_t>::
1725do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1726       const intern_type*& __from_next,
1727       extern_type* __to, extern_type* __to_end,
1728       extern_type*& __to_next) const
1729{
1730  range<const wchar_t> from{ __from, __from_end };
1731  range<char> to{ __to, __to_end };
1732  auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1733  __from_next = from.next;
1734  __to_next = to.next;
1735  return res;
1736}
1737
1738codecvt_base::result
1739__codecvt_utf8_utf16_base<wchar_t>::
1740do_unshift(state_type&, extern_type* __to, extern_type*,
1741	   extern_type*& __to_next) const
1742{
1743  __to_next = __to;
1744  return noconv;
1745}
1746
1747codecvt_base::result
1748__codecvt_utf8_utf16_base<wchar_t>::
1749do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1750      const extern_type*& __from_next,
1751      intern_type* __to, intern_type* __to_end,
1752      intern_type*& __to_next) const
1753{
1754  range<const char> from{ __from, __from_end };
1755  range<wchar_t> to{ __to, __to_end };
1756  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1757#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1758  mode = codecvt_mode(mode | little_endian);
1759#endif
1760  auto res = utf16_in(from, to, _M_maxcode, mode);
1761  __from_next = from.next;
1762  __to_next = to.next;
1763  return res;
1764}
1765
1766int
1767__codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
1768{ return 0; } // UTF-8 is not a fixed-width encoding
1769
1770bool
1771__codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1772{ return false; }
1773
1774int
1775__codecvt_utf8_utf16_base<wchar_t>::
1776do_length(state_type&, const extern_type* __from,
1777	  const extern_type* __end, size_t __max) const
1778{
1779  __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1780  return __end - __from;
1781}
1782
1783int
1784__codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1785{
1786  // A single character can be 1 or 2 UTF-16 code units,
1787  // requiring up to 4 UTF-8 code units.
1788  int max = 4;
1789  if (_M_mode & consume_header)
1790    max += sizeof(utf8_bom);
1791  return max;
1792}
1793#endif
1794
1795inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1796inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
1797template class codecvt_byname<char16_t, char, mbstate_t>;
1798template class codecvt_byname<char32_t, char, mbstate_t>;
1799
1800#if defined(_GLIBCXX_USE_CHAR8_T)
1801inline template class __codecvt_abstract_base<char16_t, char8_t, mbstate_t>;
1802inline template class __codecvt_abstract_base<char32_t, char8_t, mbstate_t>;
1803template class codecvt_byname<char16_t, char8_t, mbstate_t>;
1804template class codecvt_byname<char32_t, char8_t, mbstate_t>;
1805#endif
1806
1807_GLIBCXX_END_NAMESPACE_VERSION
1808}
1809