codecvt.cc revision 1.4
1// Locale support (codecvt) -*- C++ -*-
2
3// Copyright (C) 2015-2018 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library.  This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23// <http://www.gnu.org/licenses/>.
24
25#include <codecvt>
26#include <cstring>		// std::memcpy, std::memcmp
27#include <bits/stl_algobase.h>	// std::min
28
29#ifdef _GLIBCXX_USE_C99_STDINT_TR1
30namespace std _GLIBCXX_VISIBILITY(default)
31{
32_GLIBCXX_BEGIN_NAMESPACE_VERSION
33
// The standard defines no bitwise operators for codecvt_mode, so the
// operators in this file work on the underlying integer representation.
// Convert a codecvt_mode value to that underlying integer type.
static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)
{
  using int_type = underlying_type<codecvt_mode>::type;
  return static_cast<int_type>(m);
}
38
// Bitwise-AND assignment for codecvt_mode: keeps only the bits of m that
// are also set in n, and returns a reference to the updated m.
static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
{
  using int_type = underlying_type<codecvt_mode>::type;
  const int_type lhs = static_cast<int_type>(m);
  const int_type rhs = static_cast<int_type>(n);
  m = static_cast<codecvt_mode>(lhs & rhs);
  return m;
}
41
// Bitwise-OR assignment for codecvt_mode: adds the bits of n to m and
// returns a reference to the updated m.
static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
{
  using int_type = underlying_type<codecvt_mode>::type;
  const int_type lhs = static_cast<int_type>(m);
  const int_type rhs = static_cast<int_type>(n);
  m = static_cast<codecvt_mode>(lhs | rhs);
  return m;
}
44
// Bitwise complement for codecvt_mode, computed on the underlying integer.
static codecvt_mode operator~(codecvt_mode m)
{
  using int_type = underlying_type<codecvt_mode>::type;
  const int_type bits = static_cast<int_type>(m);
  return static_cast<codecvt_mode>(~bits);
}
47
48namespace
49{
// Largest code point that fits in a single UTF-16 code unit.
constexpr char32_t max_single_utf16_unit = 0xFFFF;

// Default upper limit for code points accepted by the conversions below.
constexpr char32_t max_code_point = 0x10FFFF;

// Sentinel results of the read_*_code_point functions.
// The functions below rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction).
constexpr char32_t incomplete_mb_character = char32_t(-2);
constexpr char32_t invalid_mb_sequence = char32_t(-1);
59
// Cursor over the half-open range [next, end) of code units of type Elem.
// The same type is used for reading (Elem is const-qualified) and for
// writing; every access goes through properly typed Elem pointers.
template<typename Elem, bool Aligned = true>
  struct range
  {
    Elem* next;	// current position
    Elem* end;	// one past the last code unit

    // Write a code unit at the current position and step past it.
    range& operator=(Elem e)
    {
      *next = e;
      ++next;
      return *this;
    }

    // Read the code unit at the current position.
    Elem operator*() const
    { return next[0]; }

    // Read the code unit n positions after the current one.
    Elem operator[](size_t n) const
    { return *(next + n); }

    // Advance by one code unit.
    range& operator++()
    {
      next += 1;
      return *this;
    }

    // Advance by n code units.
    range& operator+=(size_t n)
    {
      next = next + n;
      return *this;
    }

    // The number of code units remaining.
    size_t size() const
    { return static_cast<size_t>(end - next); }

    // The number of bytes remaining.
    size_t nbytes() const
    { return size() * sizeof(Elem); }
  };
101
102  // This specialization is used when accessing char16_t values through
103  // pointers to char, which might not be correctly aligned for char16_t.
104  template<typename Elem>
105    struct range<Elem, false>
106    {
107      using value_type = typename remove_const<Elem>::type;
108
109      using char_pointer = typename
110	conditional<is_const<Elem>::value, const char*, char*>::type;
111
112      char_pointer next;
113      char_pointer end;
114
115      // Write a code unit.
116      range& operator=(Elem e)
117      {
118	memcpy(next, &e, sizeof(Elem));
119	++*this;
120	return *this;
121      }
122
123      // Read the next code unit.
124      Elem operator*() const
125      {
126	value_type e;
127	memcpy(&e, next, sizeof(Elem));
128	return e;
129      }
130
131      // Read the Nth code unit.
132      Elem operator[](size_t n) const
133      {
134	value_type e;
135	memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
136	return e;
137      }
138
139      // Move to the next code unit.
140      range& operator++()
141      {
142	next += sizeof(Elem);
143	return *this;
144      }
145
146      // Move to the Nth code unit.
147      range& operator+=(size_t n)
148      {
149	next += n * sizeof(Elem);
150	return *this;
151      }
152
153      // The number of code units remaining.
154      size_t size() const { return nbytes() / sizeof(Elem); }
155
156      // The number of bytes remaining.
157      size_t nbytes() const { return end - next; }
158    };
159
// Multibyte sequences can start with a "header" consisting of a
// Byte Order Mark.  These are the byte sequences of the supported BOMs.
constexpr unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };	// UTF-8
constexpr unsigned char utf16_bom[2] = { 0xFE, 0xFF };		// UTF-16BE
constexpr unsigned char utf16le_bom[2] = { 0xFF, 0xFE };	// UTF-16LE
164
165  // Write a BOM (space permitting).
166  template<typename C, bool A, size_t N>
167    bool
168    write_bom(range<C, A>& to, const unsigned char (&bom)[N])
169    {
170      static_assert( (N / sizeof(C)) != 0, "" );
171      static_assert( (N % sizeof(C)) == 0, "" );
172
173      if (to.nbytes() < N)
174	return false;
175      memcpy(to.next, bom, N);
176      to += (N / sizeof(C));
177      return true;
178    }
179
180  // Try to read a BOM.
181  template<typename C, bool A, size_t N>
182    bool
183    read_bom(range<C, A>& from, const unsigned char (&bom)[N])
184    {
185      static_assert( (N / sizeof(C)) != 0, "" );
186      static_assert( (N % sizeof(C)) == 0, "" );
187
188      if (from.nbytes() >= N && !memcmp(from.next, bom, N))
189	{
190	  from += (N / sizeof(C));
191	  return true;
192	}
193      return false;
194    }
195
196  // If generate_header is set in mode write out UTF-8 BOM.
197  bool
198  write_utf8_bom(range<char>& to, codecvt_mode mode)
199  {
200    if (mode & generate_header)
201      return write_bom(to, utf8_bom);
202    return true;
203  }
204
205  // If generate_header is set in mode write out the UTF-16 BOM indicated
206  // by whether little_endian is set in mode.
207  template<bool Aligned>
208  bool
209  write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
210  {
211    if (mode & generate_header)
212    {
213      if (mode & little_endian)
214	return write_bom(to, utf16le_bom);
215      else
216	return write_bom(to, utf16_bom);
217    }
218    return true;
219  }
220
221  // If consume_header is set in mode update from.next to after any BOM.
222  void
223  read_utf8_bom(range<const char>& from, codecvt_mode mode)
224  {
225    if (mode & consume_header)
226      read_bom(from, utf8_bom);
227  }
228
229  // If consume_header is not set in mode, no effects.
230  // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
231  // - if the UTF-16BE BOM was found unset little_endian in mode, or
232  // - if the UTF-16LE BOM was found set little_endian in mode.
233  template<bool Aligned>
234  void
235  read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
236  {
237    if (mode & consume_header)
238      {
239	if (read_bom(from, utf16_bom))
240	  mode &= ~little_endian;
241	else if (read_bom(from, utf16le_bom))
242	  mode |= little_endian;
243      }
244  }
245
246  // Read a codepoint from a UTF-8 multibyte sequence.
247  // Updates from.next if the codepoint is not greater than maxcode.
248  // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
249  char32_t
250  read_utf8_code_point(range<const char>& from, unsigned long maxcode)
251  {
252    const size_t avail = from.size();
253    if (avail == 0)
254      return incomplete_mb_character;
255    unsigned char c1 = from[0];
256    // https://en.wikipedia.org/wiki/UTF-8#Sample_code
257    if (c1 < 0x80)
258    {
259      ++from;
260      return c1;
261    }
262    else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
263      return invalid_mb_sequence;
264    else if (c1 < 0xE0) // 2-byte sequence
265    {
266      if (avail < 2)
267	return incomplete_mb_character;
268      unsigned char c2 = from[1];
269      if ((c2 & 0xC0) != 0x80)
270	return invalid_mb_sequence;
271      char32_t c = (c1 << 6) + c2 - 0x3080;
272      if (c <= maxcode)
273	from += 2;
274      return c;
275    }
276    else if (c1 < 0xF0) // 3-byte sequence
277    {
278      if (avail < 3)
279	return incomplete_mb_character;
280      unsigned char c2 = from[1];
281      if ((c2 & 0xC0) != 0x80)
282	return invalid_mb_sequence;
283      if (c1 == 0xE0 && c2 < 0xA0) // overlong
284	return invalid_mb_sequence;
285      unsigned char c3 = from[2];
286      if ((c3 & 0xC0) != 0x80)
287	return invalid_mb_sequence;
288      char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
289      if (c <= maxcode)
290	from += 3;
291      return c;
292    }
293    else if (c1 < 0xF5) // 4-byte sequence
294    {
295      if (avail < 4)
296	return incomplete_mb_character;
297      unsigned char c2 = from[1];
298      if ((c2 & 0xC0) != 0x80)
299	return invalid_mb_sequence;
300      if (c1 == 0xF0 && c2 < 0x90) // overlong
301	return invalid_mb_sequence;
302      if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
303      return invalid_mb_sequence;
304      unsigned char c3 = from[2];
305      if ((c3 & 0xC0) != 0x80)
306	return invalid_mb_sequence;
307      unsigned char c4 = from[3];
308      if ((c4 & 0xC0) != 0x80)
309	return invalid_mb_sequence;
310      char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
311      if (c <= maxcode)
312	from += 4;
313      return c;
314    }
315    else // > U+10FFFF
316      return invalid_mb_sequence;
317  }
318
319  bool
320  write_utf8_code_point(range<char>& to, char32_t code_point)
321  {
322    if (code_point < 0x80)
323      {
324	if (to.size() < 1)
325	  return false;
326	to = code_point;
327      }
328    else if (code_point <= 0x7FF)
329      {
330	if (to.size() < 2)
331	  return false;
332	to = (code_point >> 6) + 0xC0;
333	to = (code_point & 0x3F) + 0x80;
334      }
335    else if (code_point <= 0xFFFF)
336      {
337	if (to.size() < 3)
338	  return false;
339	to = (code_point >> 12) + 0xE0;
340	to = ((code_point >> 6) & 0x3F) + 0x80;
341	to = (code_point & 0x3F) + 0x80;
342      }
343    else if (code_point <= 0x10FFFF)
344      {
345	if (to.size() < 4)
346	  return false;
347	to = (code_point >> 18) + 0xF0;
348	to = ((code_point >> 12) & 0x3F) + 0x80;
349	to = ((code_point >> 6) & 0x3F) + 0x80;
350	to = (code_point & 0x3F) + 0x80;
351      }
352    else
353      return false;
354    return true;
355  }
356
// Convert a UTF-16 code unit between the host byte order and the byte
// order selected by mode (little-endian if little_endian is set,
// big-endian otherwise).  The bytes are swapped only when the two differ.
inline char16_t
adjust_byte_order(char16_t c, codecvt_mode mode)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const bool swap_needed = (mode & little_endian) != 0;
#else
  const bool swap_needed = (mode & little_endian) == 0;
#endif
  if (swap_needed)
    return __builtin_bswap16(c);
  return c;
}
366
// Return true if c is a high-surrogate (aka leading) code point,
// i.e. in the range [0xD800, 0xDBFF].
inline bool
is_high_surrogate(char32_t c)
{
  // All 0x400 high surrogates share the same bits above the low ten.
  return (c >> 10) == (0xD800 >> 10);
}
373
// Return true if c is a low-surrogate (aka trailing) code point,
// i.e. in the range [0xDC00, 0xDFFF].
inline bool
is_low_surrogate(char32_t c)
{
  // All 0x400 low surrogates share the same bits above the low ten.
  return (c >> 10) == (0xDC00 >> 10);
}
380
// Combine a high (leading) and a low (trailing) surrogate into the code
// point they encode.  The arguments are not validated here.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  // Combined bias of both surrogates, less the 0x10000 base of the
  // supplementary planes: (0xD800 << 10) + 0xDC00 - 0x10000.
  const char32_t surrogate_offset = 0x35FDC00;
  return (high << 10) + low - surrogate_offset;
}
386
387  // Read a codepoint from a UTF-16 multibyte sequence.
388  // The sequence's endianness is indicated by (mode & little_endian).
389  // Updates from.next if the codepoint is not greater than maxcode.
390  // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
391  template<bool Aligned>
392    char32_t
393    read_utf16_code_point(range<const char16_t, Aligned>& from,
394			  unsigned long maxcode, codecvt_mode mode)
395    {
396      const size_t avail = from.size();
397      if (avail == 0)
398	return incomplete_mb_character;
399      int inc = 1;
400      char32_t c = adjust_byte_order(from[0], mode);
401      if (is_high_surrogate(c))
402	{
403	  if (avail < 2)
404	    return incomplete_mb_character;
405	  const char16_t c2 = adjust_byte_order(from[1], mode);
406	  if (is_low_surrogate(c2))
407	    {
408	      c = surrogate_pair_to_code_point(c, c2);
409	      inc = 2;
410	    }
411	  else
412	    return invalid_mb_sequence;
413	}
414      else if (is_low_surrogate(c))
415	return invalid_mb_sequence;
416      if (c <= maxcode)
417	from += inc;
418      return c;
419    }
420
421  template<typename C, bool A>
422  bool
423  write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
424  {
425    static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
426
427    if (codepoint <= max_single_utf16_unit)
428      {
429	if (to.size() > 0)
430	  {
431	    to = adjust_byte_order(codepoint, mode);
432	    return true;
433	  }
434      }
435    else if (to.size() > 1)
436      {
437	// Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
438	const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
439	char16_t lead = LEAD_OFFSET + (codepoint >> 10);
440	char16_t trail = 0xDC00 + (codepoint & 0x3FF);
441	to = adjust_byte_order(lead, mode);
442	to = adjust_byte_order(trail, mode);
443	return true;
444      }
445    return false;
446  }
447
448  // utf8 -> ucs4
449  codecvt_base::result
450  ucs4_in(range<const char>& from, range<char32_t>& to,
451          unsigned long maxcode = max_code_point, codecvt_mode mode = {})
452  {
453    read_utf8_bom(from, mode);
454    while (from.size() && to.size())
455      {
456	const char32_t codepoint = read_utf8_code_point(from, maxcode);
457	if (codepoint == incomplete_mb_character)
458	  return codecvt_base::partial;
459	if (codepoint > maxcode)
460	  return codecvt_base::error;
461	to = codepoint;
462      }
463    return from.size() ? codecvt_base::partial : codecvt_base::ok;
464  }
465
466  // ucs4 -> utf8
467  codecvt_base::result
468  ucs4_out(range<const char32_t>& from, range<char>& to,
469           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
470  {
471    if (!write_utf8_bom(to, mode))
472      return codecvt_base::partial;
473    while (from.size())
474      {
475	const char32_t c = from[0];
476	if (c > maxcode)
477	  return codecvt_base::error;
478	if (!write_utf8_code_point(to, c))
479	  return codecvt_base::partial;
480	++from;
481      }
482    return codecvt_base::ok;
483  }
484
485  // utf16 -> ucs4
486  codecvt_base::result
487  ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
488          unsigned long maxcode = max_code_point, codecvt_mode mode = {})
489  {
490    read_utf16_bom(from, mode);
491    while (from.size() && to.size())
492      {
493	const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
494	if (codepoint == incomplete_mb_character)
495	  return codecvt_base::partial;
496	if (codepoint > maxcode)
497	  return codecvt_base::error;
498	to = codepoint;
499      }
500    return from.size() ? codecvt_base::partial : codecvt_base::ok;
501  }
502
503  // ucs4 -> utf16
504  codecvt_base::result
505  ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
506           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
507  {
508    if (!write_utf16_bom(to, mode))
509      return codecvt_base::partial;
510    while (from.size())
511      {
512	const char32_t c = from[0];
513	if (c > maxcode)
514	  return codecvt_base::error;
515	if (!write_utf16_code_point(to, c, mode))
516	  return codecvt_base::partial;
517	++from;
518      }
519    return codecvt_base::ok;
520  }
521
// Flag indicating whether to process UTF-16 (surrogates allowed)
// or UCS2 (surrogates disallowed).
enum class surrogates
{
  allowed,
  disallowed
};
524
525  // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
526  template<typename C>
527  codecvt_base::result
528  utf16_in(range<const char>& from, range<C>& to,
529	   unsigned long maxcode = max_code_point, codecvt_mode mode = {},
530	   surrogates s = surrogates::allowed)
531  {
532    read_utf8_bom(from, mode);
533    while (from.size() && to.size())
534      {
535	auto orig = from;
536	const char32_t codepoint = read_utf8_code_point(from, maxcode);
537	if (codepoint == incomplete_mb_character)
538	  {
539	    if (s == surrogates::allowed)
540	      return codecvt_base::partial;
541	    else
542	      return codecvt_base::error; // No surrogates in UCS2
543	  }
544	if (codepoint > maxcode)
545	  return codecvt_base::error;
546	if (!write_utf16_code_point(to, codepoint, mode))
547	  {
548	    from = orig; // rewind to previous position
549	    return codecvt_base::partial;
550	  }
551      }
552    return codecvt_base::ok;
553  }
554
555  // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
556  template<typename C>
557  codecvt_base::result
558  utf16_out(range<const C>& from, range<char>& to,
559	    unsigned long maxcode = max_code_point, codecvt_mode mode = {},
560	    surrogates s = surrogates::allowed)
561  {
562    if (!write_utf8_bom(to, mode))
563      return codecvt_base::partial;
564    while (from.size())
565      {
566	char32_t c = from[0];
567	int inc = 1;
568	if (is_high_surrogate(c))
569	  {
570	    if (s == surrogates::disallowed)
571	      return codecvt_base::error; // No surrogates in UCS-2
572
573	    if (from.size() < 2)
574	      return codecvt_base::ok; // stop converting at this point
575
576	    const char32_t c2 = from[1];
577	    if (is_low_surrogate(c2))
578	      {
579		c = surrogate_pair_to_code_point(c, c2);
580		inc = 2;
581	      }
582	    else
583	      return codecvt_base::error;
584	  }
585	else if (is_low_surrogate(c))
586	  return codecvt_base::error;
587	if (c > maxcode)
588	  return codecvt_base::error;
589	if (!write_utf8_code_point(to, c))
590	  return codecvt_base::partial;
591	from += inc;
592      }
593    return codecvt_base::ok;
594  }
595
596  // return pos such that [begin,pos) is valid UTF-16 string no longer than max
597  const char*
598  utf16_span(const char* begin, const char* end, size_t max,
599	     char32_t maxcode = max_code_point, codecvt_mode mode = {})
600  {
601    range<const char> from{ begin, end };
602    read_utf8_bom(from, mode);
603    size_t count = 0;
604    while (count+1 < max)
605      {
606	char32_t c = read_utf8_code_point(from, maxcode);
607	if (c > maxcode)
608	  return from.next;
609	else if (c > max_single_utf16_unit)
610	  ++count;
611	++count;
612      }
613    if (count+1 == max) // take one more character if it fits in a single unit
614      read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
615    return from.next;
616  }
617
618  // utf8 -> ucs2
619  codecvt_base::result
620  ucs2_in(range<const char>& from, range<char16_t>& to,
621	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
622  {
623    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
624    maxcode = std::min(max_single_utf16_unit, maxcode);
625    return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
626  }
627
628  // ucs2 -> utf8
629  codecvt_base::result
630  ucs2_out(range<const char16_t>& from, range<char>& to,
631	   char32_t maxcode = max_code_point, codecvt_mode mode = {})
632  {
633    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
634    maxcode = std::min(max_single_utf16_unit, maxcode);
635    return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
636  }
637
638  // ucs2 -> utf16
639  codecvt_base::result
640  ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
641	   char32_t maxcode = max_code_point, codecvt_mode mode = {})
642  {
643    if (!write_utf16_bom(to, mode))
644      return codecvt_base::partial;
645    while (from.size() && to.size())
646      {
647	char16_t c = from[0];
648	if (is_high_surrogate(c))
649	  return codecvt_base::error;
650	if (c > maxcode)
651	  return codecvt_base::error;
652	to = adjust_byte_order(c, mode);
653	++from;
654      }
655    return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
656  }
657
658  // utf16 -> ucs2
659  codecvt_base::result
660  ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
661	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
662  {
663    read_utf16_bom(from, mode);
664    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
665    maxcode = std::min(max_single_utf16_unit, maxcode);
666    while (from.size() && to.size())
667      {
668	const char32_t c = read_utf16_code_point(from, maxcode, mode);
669	if (c == incomplete_mb_character)
670	  return codecvt_base::error; // UCS-2 only supports single units.
671	if (c > maxcode)
672	  return codecvt_base::error;
673	to = c;
674      }
675    return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
676  }
677
678  const char16_t*
679  ucs2_span(range<const char16_t, false>& from, size_t max,
680            char32_t maxcode, codecvt_mode mode)
681  {
682    read_utf16_bom(from, mode);
683    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
684    maxcode = std::min(max_single_utf16_unit, maxcode);
685    char32_t c = 0;
686    while (max-- && c <= maxcode)
687      c = read_utf16_code_point(from, maxcode, mode);
688    return reinterpret_cast<const char16_t*>(from.next);
689  }
690
691  const char*
692  ucs2_span(const char* begin, const char* end, size_t max,
693            char32_t maxcode, codecvt_mode mode)
694  {
695    range<const char> from{ begin, end };
696    read_utf8_bom(from, mode);
697    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
698    maxcode = std::min(max_single_utf16_unit, maxcode);
699    char32_t c = 0;
700    while (max-- && c <= maxcode)
701      c = read_utf8_code_point(from, maxcode);
702    return from.next;
703  }
704
705  // return pos such that [begin,pos) is valid UCS-4 string no longer than max
706  const char*
707  ucs4_span(const char* begin, const char* end, size_t max,
708            char32_t maxcode = max_code_point, codecvt_mode mode = {})
709  {
710    range<const char> from{ begin, end };
711    read_utf8_bom(from, mode);
712    char32_t c = 0;
713    while (max-- && c <= maxcode)
714      c = read_utf8_code_point(from, maxcode);
715    return from.next;
716  }
717
718  // return pos such that [begin,pos) is valid UCS-4 string no longer than max
719  const char16_t*
720  ucs4_span(range<const char16_t, false>& from, size_t max,
721            char32_t maxcode = max_code_point, codecvt_mode mode = {})
722  {
723    read_utf16_bom(from, mode);
724    char32_t c = 0;
725    while (max-- && c <= maxcode)
726      c = read_utf16_code_point(from, maxcode, mode);
727    return reinterpret_cast<const char16_t*>(from.next);
728  }
729}
730
731// Define members of codecvt<char16_t, char, mbstate_t> specialization.
732// Converts from UTF-8 to UTF-16.
733
734locale::id codecvt<char16_t, char, mbstate_t>::id;
735
736codecvt<char16_t, char, mbstate_t>::~codecvt() { }
737
738codecvt_base::result
739codecvt<char16_t, char, mbstate_t>::
740do_out(state_type&,
741       const intern_type* __from,
742       const intern_type* __from_end, const intern_type*& __from_next,
743       extern_type* __to, extern_type* __to_end,
744       extern_type*& __to_next) const
745{
746  range<const char16_t> from{ __from, __from_end };
747  range<char> to{ __to, __to_end };
748  auto res = utf16_out(from, to);
749  __from_next = from.next;
750  __to_next = to.next;
751  return res;
752}
753
754codecvt_base::result
755codecvt<char16_t, char, mbstate_t>::
756do_unshift(state_type&, extern_type* __to, extern_type*,
757	   extern_type*& __to_next) const
758{
759  __to_next = __to;
760  return noconv; // we don't use mbstate_t for the unicode facets
761}
762
763codecvt_base::result
764codecvt<char16_t, char, mbstate_t>::
765do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
766      const extern_type*& __from_next,
767      intern_type* __to, intern_type* __to_end,
768      intern_type*& __to_next) const
769{
770  range<const char> from{ __from, __from_end };
771  range<char16_t> to{ __to, __to_end };
772#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
773  codecvt_mode mode = {};
774#else
775  codecvt_mode mode = little_endian;
776#endif
777  auto res = utf16_in(from, to, max_code_point, mode);
778  __from_next = from.next;
779  __to_next = to.next;
780  return res;
781}
782
783int
784codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
785{ return 0; } // UTF-8 is not a fixed-width encoding
786
787bool
788codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
789{ return false; }
790
791int
792codecvt<char16_t, char, mbstate_t>::
793do_length(state_type&, const extern_type* __from,
794	  const extern_type* __end, size_t __max) const
795{
796  __end = utf16_span(__from, __end, __max);
797  return __end - __from;
798}
799
800int
801codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
802{
803  // A single character (one or two UTF-16 code units) requires
804  // up to four UTF-8 code units.
805  return 4;
806}
807
808// Define members of codecvt<char32_t, char, mbstate_t> specialization.
809// Converts from UTF-8 to UTF-32 (aka UCS-4).
810
811locale::id codecvt<char32_t, char, mbstate_t>::id;
812
813codecvt<char32_t, char, mbstate_t>::~codecvt() { }
814
815codecvt_base::result
816codecvt<char32_t, char, mbstate_t>::
817do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
818       const intern_type*& __from_next,
819       extern_type* __to, extern_type* __to_end,
820       extern_type*& __to_next) const
821{
822  range<const char32_t> from{ __from, __from_end };
823  range<char> to{ __to, __to_end };
824  auto res = ucs4_out(from, to);
825  __from_next = from.next;
826  __to_next = to.next;
827  return res;
828}
829
830codecvt_base::result
831codecvt<char32_t, char, mbstate_t>::
832do_unshift(state_type&, extern_type* __to, extern_type*,
833	   extern_type*& __to_next) const
834{
835  __to_next = __to;
836  return noconv;
837}
838
839codecvt_base::result
840codecvt<char32_t, char, mbstate_t>::
841do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
842      const extern_type*& __from_next,
843      intern_type* __to, intern_type* __to_end,
844      intern_type*& __to_next) const
845{
846  range<const char> from{ __from, __from_end };
847  range<char32_t> to{ __to, __to_end };
848  auto res = ucs4_in(from, to);
849  __from_next = from.next;
850  __to_next = to.next;
851  return res;
852}
853
854int
855codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
856{ return 0; } // UTF-8 is not a fixed-width encoding
857
858bool
859codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
860{ return false; }
861
862int
863codecvt<char32_t, char, mbstate_t>::
864do_length(state_type&, const extern_type* __from,
865	  const extern_type* __end, size_t __max) const
866{
867  __end = ucs4_span(__from, __end, __max);
868  return __end - __from;
869}
870
871int
872codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
873{
874  // A single character (one UTF-32 code unit) requires
875  // up to 4 UTF-8 code units.
876  return 4;
877}
878
879// Define members of codecvt_utf8<char16_t> base class implementation.
880// Converts from UTF-8 to UCS-2.
881
882__codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
883
884codecvt_base::result
885__codecvt_utf8_base<char16_t>::
886do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
887       const intern_type*& __from_next,
888       extern_type* __to, extern_type* __to_end,
889       extern_type*& __to_next) const
890{
891  range<const char16_t> from{ __from, __from_end };
892  range<char> to{ __to, __to_end };
893  auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
894  __from_next = from.next;
895  __to_next = to.next;
896  return res;
897}
898
899codecvt_base::result
900__codecvt_utf8_base<char16_t>::
901do_unshift(state_type&, extern_type* __to, extern_type*,
902	   extern_type*& __to_next) const
903{
904  __to_next = __to;
905  return noconv;
906}
907
908codecvt_base::result
909__codecvt_utf8_base<char16_t>::
910do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
911      const extern_type*& __from_next,
912      intern_type* __to, intern_type* __to_end,
913      intern_type*& __to_next) const
914{
915  range<const char> from{ __from, __from_end };
916  range<char16_t> to{ __to, __to_end };
917  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
918#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
919  mode = codecvt_mode(mode | little_endian);
920#endif
921  auto res = ucs2_in(from, to, _M_maxcode, mode);
922  __from_next = from.next;
923  __to_next = to.next;
924  return res;
925}
926
927int
928__codecvt_utf8_base<char16_t>::do_encoding() const throw()
929{ return 0; } // UTF-8 is not a fixed-width encoding
930
931bool
932__codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
933{ return false; }
934
935int
936__codecvt_utf8_base<char16_t>::
937do_length(state_type&, const extern_type* __from,
938	  const extern_type* __end, size_t __max) const
939{
940  __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
941  return __end - __from;
942}
943
944int
945__codecvt_utf8_base<char16_t>::do_max_length() const throw()
946{
947  // A single UCS-2 character requires up to three UTF-8 code units.
948  // (UCS-2 cannot represent characters that use four UTF-8 code units).
949  int max = 3;
950  if (_M_mode & consume_header)
951    max += sizeof(utf8_bom);
952  return max;
953}
954
955// Define members of codecvt_utf8<char32_t> base class implementation.
956// Converts from UTF-8 to UTF-32 (aka UCS-4).
957
958__codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
959
960codecvt_base::result
961__codecvt_utf8_base<char32_t>::
962do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
963       const intern_type*& __from_next,
964       extern_type* __to, extern_type* __to_end,
965       extern_type*& __to_next) const
966{
967  range<const char32_t> from{ __from, __from_end };
968  range<char> to{ __to, __to_end };
969  auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
970  __from_next = from.next;
971  __to_next = to.next;
972  return res;
973}
974
975codecvt_base::result
976__codecvt_utf8_base<char32_t>::
977do_unshift(state_type&, extern_type* __to, extern_type*,
978	   extern_type*& __to_next) const
979{
980  __to_next = __to;
981  return noconv;
982}
983
984codecvt_base::result
985__codecvt_utf8_base<char32_t>::
986do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
987      const extern_type*& __from_next,
988      intern_type* __to, intern_type* __to_end,
989      intern_type*& __to_next) const
990{
991  range<const char> from{ __from, __from_end };
992  range<char32_t> to{ __to, __to_end };
993  auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
994  __from_next = from.next;
995  __to_next = to.next;
996  return res;
997}
998
999int
1000__codecvt_utf8_base<char32_t>::do_encoding() const throw()
1001{ return 0; } // UTF-8 is not a fixed-width encoding
1002
1003bool
1004__codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1005{ return false; }
1006
1007int
1008__codecvt_utf8_base<char32_t>::
1009do_length(state_type&, const extern_type* __from,
1010	  const extern_type* __end, size_t __max) const
1011{
1012  __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1013  return __end - __from;
1014}
1015
1016int
1017__codecvt_utf8_base<char32_t>::do_max_length() const throw()
1018{
1019  // A single UCS-4 character requires up to four UTF-8 code units.
1020  int max = 4;
1021  if (_M_mode & consume_header)
1022    max += sizeof(utf8_bom);
1023  return max;
1024}
1025
1026#ifdef _GLIBCXX_USE_WCHAR_T
1027
// The wchar_t specializations in this file reinterpret wchar_t buffers
// as char16_t or char32_t buffers, so the sizes must agree.
#if __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(wchar_t) == sizeof(char16_t),
	      "wchar_t must be the same size as char16_t");
#elif __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(wchar_t) == sizeof(char32_t),
	      "wchar_t must be the same size as char32_t");
#endif
1033
1034// Define members of codecvt_utf8<wchar_t> base class implementation.
1035// Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1036
1037__codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1038
1039codecvt_base::result
1040__codecvt_utf8_base<wchar_t>::
1041do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1042       const intern_type*& __from_next,
1043       extern_type* __to, extern_type* __to_end,
1044       extern_type*& __to_next) const
1045{
1046  range<char> to{ __to, __to_end };
1047#if __SIZEOF_WCHAR_T__ == 2
1048  range<const char16_t> from{
1049    reinterpret_cast<const char16_t*>(__from),
1050    reinterpret_cast<const char16_t*>(__from_end)
1051  };
1052  auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1053#elif __SIZEOF_WCHAR_T__ == 4
1054  range<const char32_t> from{
1055    reinterpret_cast<const char32_t*>(__from),
1056    reinterpret_cast<const char32_t*>(__from_end)
1057  };
1058  auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1059#else
1060  return codecvt_base::error;
1061#endif
1062  __from_next = reinterpret_cast<const wchar_t*>(from.next);
1063  __to_next = to.next;
1064  return res;
1065}
1066
1067codecvt_base::result
1068__codecvt_utf8_base<wchar_t>::
1069do_unshift(state_type&, extern_type* __to, extern_type*,
1070	   extern_type*& __to_next) const
1071{
1072  __to_next = __to;
1073  return noconv;
1074}
1075
1076codecvt_base::result
1077__codecvt_utf8_base<wchar_t>::
1078do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1079      const extern_type*& __from_next,
1080      intern_type* __to, intern_type* __to_end,
1081      intern_type*& __to_next) const
1082{
1083  range<const char> from{ __from, __from_end };
1084#if __SIZEOF_WCHAR_T__ == 2
1085  range<char16_t> to{
1086    reinterpret_cast<char16_t*>(__to),
1087    reinterpret_cast<char16_t*>(__to_end)
1088  };
1089#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1090  codecvt_mode mode = {};
1091#else
1092  codecvt_mode mode = little_endian;
1093#endif
1094  auto res = ucs2_in(from, to, _M_maxcode, mode);
1095#elif __SIZEOF_WCHAR_T__ == 4
1096  range<char32_t> to{
1097    reinterpret_cast<char32_t*>(__to),
1098    reinterpret_cast<char32_t*>(__to_end)
1099  };
1100  auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1101#else
1102  return codecvt_base::error;
1103#endif
1104  __from_next = from.next;
1105  __to_next = reinterpret_cast<wchar_t*>(to.next);
1106  return res;
1107}
1108
1109int
1110__codecvt_utf8_base<wchar_t>::do_encoding() const throw()
1111{ return 0; } // UTF-8 is not a fixed-width encoding
1112
1113bool
1114__codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1115{ return false; }
1116
1117int
1118__codecvt_utf8_base<wchar_t>::
1119do_length(state_type&, const extern_type* __from,
1120	  const extern_type* __end, size_t __max) const
1121{
1122#if __SIZEOF_WCHAR_T__ == 2
1123  __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1124#elif __SIZEOF_WCHAR_T__ == 4
1125  __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1126#else
1127  __end = __from;
1128#endif
1129  return __end - __from;
1130}
1131
1132int
1133__codecvt_utf8_base<wchar_t>::do_max_length() const throw()
1134{
1135#if __SIZEOF_WCHAR_T__ == 2
1136  int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1137#else
1138  int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1139#endif
1140  if (_M_mode & consume_header)
1141    max += sizeof(utf8_bom);
1142  return max;
1143}
1144#endif
1145
1146// Define members of codecvt_utf16<char16_t> base class implementation.
1147// Converts from UTF-16 to UCS-2.
1148
1149__codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1150
1151codecvt_base::result
1152__codecvt_utf16_base<char16_t>::
1153do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1154       const intern_type*& __from_next,
1155       extern_type* __to, extern_type* __to_end,
1156       extern_type*& __to_next) const
1157{
1158  range<const char16_t> from{ __from, __from_end };
1159  range<char16_t, false> to{ __to, __to_end };
1160  auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1161  __from_next = from.next;
1162  __to_next = reinterpret_cast<char*>(to.next);
1163  return res;
1164}
1165
1166codecvt_base::result
1167__codecvt_utf16_base<char16_t>::
1168do_unshift(state_type&, extern_type* __to, extern_type*,
1169	   extern_type*& __to_next) const
1170{
1171  __to_next = __to;
1172  return noconv;
1173}
1174
1175codecvt_base::result
1176__codecvt_utf16_base<char16_t>::
1177do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1178      const extern_type*& __from_next,
1179      intern_type* __to, intern_type* __to_end,
1180      intern_type*& __to_next) const
1181{
1182  range<const char16_t, false> from{ __from, __from_end };
1183  range<char16_t> to{ __to, __to_end };
1184  auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1185  __from_next = reinterpret_cast<const char*>(from.next);
1186  __to_next = to.next;
1187  if (res == codecvt_base::ok && __from_next != __from_end)
1188    res = codecvt_base::error;
1189  return res;
1190}
1191
1192int
1193__codecvt_utf16_base<char16_t>::do_encoding() const throw()
1194{ return 0; } // UTF-16 is not a fixed-width encoding
1195
1196bool
1197__codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1198{ return false; }
1199
1200int
1201__codecvt_utf16_base<char16_t>::
1202do_length(state_type&, const extern_type* __from,
1203	  const extern_type* __end, size_t __max) const
1204{
1205  range<const char16_t, false> from{ __from, __end };
1206  const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1207  return reinterpret_cast<const char*>(next) - __from;
1208}
1209
1210int
1211__codecvt_utf16_base<char16_t>::do_max_length() const throw()
1212{
1213  // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1214  // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1215  int max = 2;
1216  if (_M_mode & consume_header)
1217    max += sizeof(utf16_bom);
1218  return max;
1219}
1220
1221// Define members of codecvt_utf16<char32_t> base class implementation.
1222// Converts from UTF-16 to UTF-32 (aka UCS-4).
1223
1224__codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1225
1226codecvt_base::result
1227__codecvt_utf16_base<char32_t>::
1228do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1229       const intern_type*& __from_next,
1230       extern_type* __to, extern_type* __to_end,
1231       extern_type*& __to_next) const
1232{
1233  range<const char32_t> from{ __from, __from_end };
1234  range<char16_t, false> to{ __to, __to_end };
1235  auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1236  __from_next = from.next;
1237  __to_next = reinterpret_cast<char*>(to.next);
1238  return res;
1239}
1240
1241codecvt_base::result
1242__codecvt_utf16_base<char32_t>::
1243do_unshift(state_type&, extern_type* __to, extern_type*,
1244	   extern_type*& __to_next) const
1245{
1246  __to_next = __to;
1247  return noconv;
1248}
1249
1250codecvt_base::result
1251__codecvt_utf16_base<char32_t>::
1252do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1253      const extern_type*& __from_next,
1254      intern_type* __to, intern_type* __to_end,
1255      intern_type*& __to_next) const
1256{
1257  range<const char16_t, false> from{ __from, __from_end };
1258  range<char32_t> to{ __to, __to_end };
1259  auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1260  __from_next = reinterpret_cast<const char*>(from.next);
1261  __to_next = to.next;
1262  if (res == codecvt_base::ok && __from_next != __from_end)
1263    res = codecvt_base::error;
1264  return res;
1265}
1266
1267int
1268__codecvt_utf16_base<char32_t>::do_encoding() const throw()
1269{ return 0; } // UTF-16 is not a fixed-width encoding
1270
1271bool
1272__codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1273{ return false; }
1274
1275int
1276__codecvt_utf16_base<char32_t>::
1277do_length(state_type&, const extern_type* __from,
1278	  const extern_type* __end, size_t __max) const
1279{
1280  range<const char16_t, false> from{ __from, __end };
1281  const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1282  return reinterpret_cast<const char*>(next) - __from;
1283}
1284
1285int
1286__codecvt_utf16_base<char32_t>::do_max_length() const throw()
1287{
1288  // A single UCS-4 character requires one or two UTF-16 code units
1289  // (so up to four chars).
1290  int max = 4;
1291  if (_M_mode & consume_header)
1292    max += sizeof(utf16_bom);
1293  return max;
1294}
1295
1296#ifdef _GLIBCXX_USE_WCHAR_T
1297// Define members of codecvt_utf16<wchar_t> base class implementation.
// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1299
1300__codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1301
1302codecvt_base::result
1303__codecvt_utf16_base<wchar_t>::
1304do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1305       const intern_type*& __from_next,
1306       extern_type* __to, extern_type* __to_end,
1307       extern_type*& __to_next) const
1308{
1309  range<char16_t, false> to{ __to, __to_end };
1310#if __SIZEOF_WCHAR_T__ == 2
1311  range<const char16_t> from{
1312    reinterpret_cast<const char16_t*>(__from),
1313    reinterpret_cast<const char16_t*>(__from_end),
1314  };
1315  auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1316#elif __SIZEOF_WCHAR_T__ == 4
1317  range<const char32_t> from{
1318    reinterpret_cast<const char32_t*>(__from),
1319    reinterpret_cast<const char32_t*>(__from_end),
1320  };
1321  auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1322#else
1323  return codecvt_base::error;
1324#endif
1325  __from_next = reinterpret_cast<const wchar_t*>(from.next);
1326  __to_next = reinterpret_cast<char*>(to.next);
1327  return res;
1328}
1329
1330codecvt_base::result
1331__codecvt_utf16_base<wchar_t>::
1332do_unshift(state_type&, extern_type* __to, extern_type*,
1333	   extern_type*& __to_next) const
1334{
1335  __to_next = __to;
1336  return noconv;
1337}
1338
1339codecvt_base::result
1340__codecvt_utf16_base<wchar_t>::
1341do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1342      const extern_type*& __from_next,
1343      intern_type* __to, intern_type* __to_end,
1344      intern_type*& __to_next) const
1345{
1346  range<const char16_t, false> from{ __from, __from_end };
1347#if __SIZEOF_WCHAR_T__ == 2
1348  range<char16_t> to{
1349    reinterpret_cast<char16_t*>(__to),
1350    reinterpret_cast<char16_t*>(__to_end),
1351  };
1352  auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1353#elif __SIZEOF_WCHAR_T__ == 4
1354  range<char32_t> to{
1355    reinterpret_cast<char32_t*>(__to),
1356    reinterpret_cast<char32_t*>(__to_end),
1357  };
1358  auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1359#else
1360  return codecvt_base::error;
1361#endif
1362  __from_next = reinterpret_cast<const char*>(from.next);
1363  __to_next = reinterpret_cast<wchar_t*>(to.next);
1364  if (res == codecvt_base::ok && __from_next != __from_end)
1365    res = codecvt_base::error;
1366  return res;
1367}
1368
1369int
1370__codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1371{ return 0; } // UTF-16 is not a fixed-width encoding
1372
1373bool
1374__codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1375{ return false; }
1376
1377int
1378__codecvt_utf16_base<wchar_t>::
1379do_length(state_type&, const extern_type* __from,
1380	  const extern_type* __end, size_t __max) const
1381{
1382  range<const char16_t, false> from{ __from, __end };
1383#if __SIZEOF_WCHAR_T__ == 2
1384  const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1385#elif __SIZEOF_WCHAR_T__ == 4
1386  const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1387#endif
1388  return reinterpret_cast<const char*>(next) - __from;
1389}
1390
1391int
1392__codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1393{
1394#if __SIZEOF_WCHAR_T__ == 2
1395  int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1396#else
1397  int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1398#endif
1399  if (_M_mode & consume_header)
1400    max += sizeof(utf16_bom);
1401  return max;
1402}
1403#endif
1404
1405// Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1406// Converts from UTF-8 to UTF-16.
1407
1408__codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1409
1410codecvt_base::result
1411__codecvt_utf8_utf16_base<char16_t>::
1412do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1413       const intern_type*& __from_next,
1414       extern_type* __to, extern_type* __to_end,
1415       extern_type*& __to_next) const
1416{
1417  range<const char16_t> from{ __from, __from_end };
1418  range<char> to{ __to, __to_end };
1419  auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1420  __from_next = from.next;
1421  __to_next = to.next;
1422  return res;
1423}
1424
1425codecvt_base::result
1426__codecvt_utf8_utf16_base<char16_t>::
1427do_unshift(state_type&, extern_type* __to, extern_type*,
1428	   extern_type*& __to_next) const
1429{
1430  __to_next = __to;
1431  return noconv;
1432}
1433
1434codecvt_base::result
1435__codecvt_utf8_utf16_base<char16_t>::
1436do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1437      const extern_type*& __from_next,
1438      intern_type* __to, intern_type* __to_end,
1439      intern_type*& __to_next) const
1440{
1441  range<const char> from{ __from, __from_end };
1442  range<char16_t> to{ __to, __to_end };
1443  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1444#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1445  mode = codecvt_mode(mode | little_endian);
1446#endif
1447  auto res = utf16_in(from, to, _M_maxcode, mode);
1448  __from_next = from.next;
1449  __to_next = to.next;
1450  return res;
1451}
1452
1453int
1454__codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
1455{ return 0; } // UTF-8 is not a fixed-width encoding
1456
1457bool
1458__codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1459{ return false; }
1460
1461int
1462__codecvt_utf8_utf16_base<char16_t>::
1463do_length(state_type&, const extern_type* __from,
1464	  const extern_type* __end, size_t __max) const
1465{
1466  __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1467  return __end - __from;
1468}
1469
1470int
1471__codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1472{
1473  // A single character can be 1 or 2 UTF-16 code units,
1474  // requiring up to 4 UTF-8 code units.
1475  int max = 4;
1476  if (_M_mode & consume_header)
1477    max += sizeof(utf8_bom);
1478  return max;
1479}
1480
1481// Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1482// Converts from UTF-8 to UTF-16.
1483
1484__codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1485
1486codecvt_base::result
1487__codecvt_utf8_utf16_base<char32_t>::
1488do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1489       const intern_type*& __from_next,
1490       extern_type* __to, extern_type* __to_end,
1491       extern_type*& __to_next) const
1492{
1493  range<const char32_t> from{ __from, __from_end };
1494  range<char> to{ __to, __to_end };
1495  auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1496  __from_next = from.next;
1497  __to_next = to.next;
1498  return res;
1499}
1500
1501codecvt_base::result
1502__codecvt_utf8_utf16_base<char32_t>::
1503do_unshift(state_type&, extern_type* __to, extern_type*,
1504	   extern_type*& __to_next) const
1505{
1506  __to_next = __to;
1507  return noconv;
1508}
1509
1510codecvt_base::result
1511__codecvt_utf8_utf16_base<char32_t>::
1512do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1513      const extern_type*& __from_next,
1514      intern_type* __to, intern_type* __to_end,
1515      intern_type*& __to_next) const
1516{
1517  range<const char> from{ __from, __from_end };
1518  range<char32_t> to{ __to, __to_end };
1519  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1520#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1521  mode = codecvt_mode(mode | little_endian);
1522#endif
1523  auto res = utf16_in(from, to, _M_maxcode, mode);
1524  __from_next = from.next;
1525  __to_next = to.next;
1526  return res;
1527}
1528
1529int
1530__codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
1531{ return 0; } // UTF-8 is not a fixed-width encoding
1532
1533bool
1534__codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1535{ return false; }
1536
1537int
1538__codecvt_utf8_utf16_base<char32_t>::
1539do_length(state_type&, const extern_type* __from,
1540	  const extern_type* __end, size_t __max) const
1541{
1542  __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1543  return __end - __from;
1544}
1545
1546int
1547__codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1548{
1549  // A single character can be 1 or 2 UTF-16 code units,
1550  // requiring up to 4 UTF-8 code units.
1551  int max = 4;
1552  if (_M_mode & consume_header)
1553    max += sizeof(utf8_bom);
1554  return max;
1555}
1556
1557#ifdef _GLIBCXX_USE_WCHAR_T
1558// Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1559// Converts from UTF-8 to UTF-16.
1560
1561__codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1562
1563codecvt_base::result
1564__codecvt_utf8_utf16_base<wchar_t>::
1565do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1566       const intern_type*& __from_next,
1567       extern_type* __to, extern_type* __to_end,
1568       extern_type*& __to_next) const
1569{
1570  range<const wchar_t> from{ __from, __from_end };
1571  range<char> to{ __to, __to_end };
1572  auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1573  __from_next = from.next;
1574  __to_next = to.next;
1575  return res;
1576}
1577
1578codecvt_base::result
1579__codecvt_utf8_utf16_base<wchar_t>::
1580do_unshift(state_type&, extern_type* __to, extern_type*,
1581	   extern_type*& __to_next) const
1582{
1583  __to_next = __to;
1584  return noconv;
1585}
1586
1587codecvt_base::result
1588__codecvt_utf8_utf16_base<wchar_t>::
1589do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1590      const extern_type*& __from_next,
1591      intern_type* __to, intern_type* __to_end,
1592      intern_type*& __to_next) const
1593{
1594  range<const char> from{ __from, __from_end };
1595  range<wchar_t> to{ __to, __to_end };
1596  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1597#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1598  mode = codecvt_mode(mode | little_endian);
1599#endif
1600  auto res = utf16_in(from, to, _M_maxcode, mode);
1601  __from_next = from.next;
1602  __to_next = to.next;
1603  return res;
1604}
1605
1606int
1607__codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
1608{ return 0; } // UTF-8 is not a fixed-width encoding
1609
1610bool
1611__codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1612{ return false; }
1613
1614int
1615__codecvt_utf8_utf16_base<wchar_t>::
1616do_length(state_type&, const extern_type* __from,
1617	  const extern_type* __end, size_t __max) const
1618{
1619  __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1620  return __end - __from;
1621}
1622
1623int
1624__codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1625{
1626  // A single character can be 1 or 2 UTF-16 code units,
1627  // requiring up to 4 UTF-8 code units.
1628  int max = 4;
1629  if (_M_mode & consume_header)
1630    max += sizeof(utf8_bom);
1631  return max;
1632}
1633#endif
1634
// Explicit instantiations for the char16_t and char32_t facets.
// "inline template class" is a GNU extension: per the GCC manual it
// instantiates only the compiler-generated support data for the class
// (i.e. the vtable), without instantiating any of its members.
inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
// Ordinary explicit instantiations of the by-name facets.
template class codecvt_byname<char16_t, char, mbstate_t>;
template class codecvt_byname<char32_t, char, mbstate_t>;
1639
1640_GLIBCXX_END_NAMESPACE_VERSION
1641}
1642#endif // _GLIBCXX_USE_C99_STDINT_TR1
1643