1// Locale support (codecvt) -*- C++ -*-
2
3// Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006
4//  Free Software Foundation, Inc.
5//
6// This file is part of the GNU ISO C++ Library.  This library is free
7// software; you can redistribute it and/or modify it under the
8// terms of the GNU General Public License as published by the
9// Free Software Foundation; either version 2, or (at your option)
10// any later version.
11
12// This library is distributed in the hope that it will be useful,
13// but WITHOUT ANY WARRANTY; without even the implied warranty of
14// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15// GNU General Public License for more details.
16
17// You should have received a copy of the GNU General Public License along
18// with this library; see the file COPYING.  If not, write to the Free
19// Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
20// USA.
21
22// As a special exception, you may use this file as part of a free software
23// library without restriction.  Specifically, if other files instantiate
24// templates or use macros or inline functions from this file, or you compile
25// this file and link it with other files to produce an executable, this
26// file does not by itself cause the resulting executable to be covered by
27// the GNU General Public License.  This exception does not however
28// invalidate any other reasons why the executable file might be covered by
29// the GNU General Public License.
30
31//
32// ISO C++ 14882: 22.2.1.5 Template class codecvt
33//
34
35// Written by Benjamin Kosnik <bkoz@redhat.com>
36
37/** @file ext/codecvt_specializations.h
38 *  This file is a GNU extension to the Standard C++ Library.
39 */
40
41#ifndef _EXT_CODECVT_SPECIALIZATIONS_H
42#define _EXT_CODECVT_SPECIALIZATIONS_H 1
43
44#include <bits/c++config.h>
45
46#ifdef _GLIBCXX_USE_ICONV
47
48#include <locale>
49#include <iconv.h>
50
51  // XXX
52  // Define this here so codecvt.cc can have _S_max_size definition.
53#define _GLIBCXX_USE_ENCODING_STATE 1
54
55_GLIBCXX_BEGIN_NAMESPACE(__gnu_cxx)
56
57  /// @brief  Extension to use icov for dealing with character encodings.
58  // This includes conversions and comparisons between various character
59  // sets.  This object encapsulates data that may need to be shared between
60  // char_traits, codecvt and ctype.
61  class encoding_state
62  {
63  public:
64    // Types:
65    // NB: A conversion descriptor subsumes and enhances the
66    // functionality of a simple state type such as mbstate_t.
67    typedef iconv_t	descriptor_type;
68
69  protected:
70    // Name of internal character set encoding.
71    std::string	       	_M_int_enc;
72
73    // Name of external character set encoding.
74    std::string  	_M_ext_enc;
75
76    // Conversion descriptor between external encoding to internal encoding.
77    descriptor_type	_M_in_desc;
78
79    // Conversion descriptor between internal encoding to external encoding.
80    descriptor_type	_M_out_desc;
81
82    // The byte-order marker for the external encoding, if necessary.
83    int			_M_ext_bom;
84
85    // The byte-order marker for the internal encoding, if necessary.
86    int			_M_int_bom;
87
88    // Number of external bytes needed to construct one complete
89    // character in the internal encoding.
90    // NB: -1 indicates variable, or stateful, encodings.
91    int 		_M_bytes;
92
93  public:
94    explicit
95    encoding_state()
96    : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
97    { }
98
99    explicit
100    encoding_state(const char* __int, const char* __ext,
101		   int __ibom = 0, int __ebom = 0, int __bytes = 1)
102    : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
103      _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
104    { init(); }
105
106    // 21.1.2 traits typedefs
107    // p4
108    // typedef STATE_T state_type
109    // requires: state_type shall meet the requirements of
110    // CopyConstructible types (20.1.3)
111    // NB: This does not preseve the actual state of the conversion
112    // descriptor member, but it does duplicate the encoding
113    // information.
114    encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
115    { construct(__obj); }
116
117    // Need assignment operator as well.
118    encoding_state&
119    operator=(const encoding_state& __obj)
120    {
121      construct(__obj);
122      return *this;
123    }
124
125    ~encoding_state()
126    { destroy(); }
127
128    bool
129    good() const throw()
130    {
131      const descriptor_type __err = reinterpret_cast<iconv_t>(-1);
132      bool __test = _M_in_desc && _M_in_desc != __err;
133      __test &=  _M_out_desc && _M_out_desc != __err;
134      return __test;
135    }
136
137    int
138    character_ratio() const
139    { return _M_bytes; }
140
141    const std::string
142    internal_encoding() const
143    { return _M_int_enc; }
144
145    int
146    internal_bom() const
147    { return _M_int_bom; }
148
149    const std::string
150    external_encoding() const
151    { return _M_ext_enc; }
152
153    int
154    external_bom() const
155    { return _M_ext_bom; }
156
157    const descriptor_type&
158    in_descriptor() const
159    { return _M_in_desc; }
160
161    const descriptor_type&
162    out_descriptor() const
163    { return _M_out_desc; }
164
165  protected:
166    void
167    init()
168    {
169      const descriptor_type __err = reinterpret_cast<iconv_t>(-1);
170      const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
171      if (!_M_in_desc && __have_encodings)
172	{
173	  _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
174	  if (_M_in_desc == __err)
175	    std::__throw_runtime_error(__N("encoding_state::_M_init "
176				    "creating iconv input descriptor failed"));
177	}
178      if (!_M_out_desc && __have_encodings)
179	{
180	  _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
181	  if (_M_out_desc == __err)
182	    std::__throw_runtime_error(__N("encoding_state::_M_init "
183				  "creating iconv output descriptor failed"));
184	}
185    }
186
187    void
188    construct(const encoding_state& __obj)
189    {
190      destroy();
191      _M_int_enc = __obj._M_int_enc;
192      _M_ext_enc = __obj._M_ext_enc;
193      _M_ext_bom = __obj._M_ext_bom;
194      _M_int_bom = __obj._M_int_bom;
195      _M_bytes = __obj._M_bytes;
196      init();
197    }
198
199    void
200    destroy() throw()
201    {
202      const descriptor_type __err = reinterpret_cast<iconv_t>(-1);
203      if (_M_in_desc && _M_in_desc != __err)
204	{
205	  iconv_close(_M_in_desc);
206	  _M_in_desc = 0;
207	}
208      if (_M_out_desc && _M_out_desc != __err)
209	{
210	  iconv_close(_M_out_desc);
211	  _M_out_desc = 0;
212	}
213    }
214  };
215
216  /// @brief  encoding_char_traits.
217  // Custom traits type with encoding_state for the state type, and the
218  // associated fpos<encoding_state> for the position type, all other
219  // bits equivalent to the required char_traits instantiations.
220  template<typename _CharT>
221    struct encoding_char_traits : public std::char_traits<_CharT>
222    {
223      typedef encoding_state				state_type;
224      typedef typename std::fpos<state_type>		pos_type;
225    };
226
227_GLIBCXX_END_NAMESPACE
228
229
230_GLIBCXX_BEGIN_NAMESPACE(std)
231
232  using __gnu_cxx::encoding_state;
233
234  /// @brief  codecvt<InternT, _ExternT, encoding_state> specialization.
235  // This partial specialization takes advantage of iconv to provide
236  // code conversions between a large number of character encodings.
237  template<typename _InternT, typename _ExternT>
238    class codecvt<_InternT, _ExternT, encoding_state>
239    : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
240    {
241    public:
242      // Types:
243      typedef codecvt_base::result			result;
244      typedef _InternT 					intern_type;
245      typedef _ExternT 					extern_type;
246      typedef __gnu_cxx::encoding_state 		state_type;
247      typedef state_type::descriptor_type 		descriptor_type;
248
249      // Data Members:
250      static locale::id 		id;
251
252      explicit
253      codecvt(size_t __refs = 0)
254      : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
255      { }
256
257      explicit
258      codecvt(state_type& __enc, size_t __refs = 0)
259      : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
260      { }
261
262     protected:
263      virtual
264      ~codecvt() { }
265
266      virtual result
267      do_out(state_type& __state, const intern_type* __from,
268	     const intern_type* __from_end, const intern_type*& __from_next,
269	     extern_type* __to, extern_type* __to_end,
270	     extern_type*& __to_next) const;
271
272      virtual result
273      do_unshift(state_type& __state, extern_type* __to,
274		 extern_type* __to_end, extern_type*& __to_next) const;
275
276      virtual result
277      do_in(state_type& __state, const extern_type* __from,
278	    const extern_type* __from_end, const extern_type*& __from_next,
279	    intern_type* __to, intern_type* __to_end,
280	    intern_type*& __to_next) const;
281
282      virtual int
283      do_encoding() const throw();
284
285      virtual bool
286      do_always_noconv() const throw();
287
288      virtual int
289      do_length(state_type&, const extern_type* __from,
290		const extern_type* __end, size_t __max) const;
291
292      virtual int
293      do_max_length() const throw();
294    };
295
296  template<typename _InternT, typename _ExternT>
297    locale::id
298    codecvt<_InternT, _ExternT, encoding_state>::id;
299
300  // This adaptor works around the signature problems of the second
301  // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
302  // uses 'char**', which matches the POSIX 1003.1-2001 standard.
303  // Using this adaptor, g++ will do the work for us.
304  template<typename _Tp>
305    inline size_t
306    __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
307                    iconv_t __cd, char** __inbuf, size_t* __inbytes,
308                    char** __outbuf, size_t* __outbytes)
309    { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
310
311  template<typename _InternT, typename _ExternT>
312    codecvt_base::result
313    codecvt<_InternT, _ExternT, encoding_state>::
314    do_out(state_type& __state, const intern_type* __from,
315	   const intern_type* __from_end, const intern_type*& __from_next,
316	   extern_type* __to, extern_type* __to_end,
317	   extern_type*& __to_next) const
318    {
319      result __ret = codecvt_base::error;
320      if (__state.good())
321	{
322	  const descriptor_type& __desc = __state.out_descriptor();
323	  const size_t __fmultiple = sizeof(intern_type);
324	  size_t __fbytes = __fmultiple * (__from_end - __from);
325	  const size_t __tmultiple = sizeof(extern_type);
326	  size_t __tbytes = __tmultiple * (__to_end - __to);
327
328	  // Argument list for iconv specifies a byte sequence. Thus,
329	  // all to/from arrays must be brutally casted to char*.
330	  char* __cto = reinterpret_cast<char*>(__to);
331	  char* __cfrom;
332	  size_t __conv;
333
334	  // Some encodings need a byte order marker as the first item
335	  // in the byte stream, to designate endian-ness. The default
336	  // value for the byte order marker is NULL, so if this is
337	  // the case, it's not necessary and we can just go on our
338	  // merry way.
339	  int __int_bom = __state.internal_bom();
340	  if (__int_bom)
341	    {
342	      size_t __size = __from_end - __from;
343	      intern_type* __cfixed = static_cast<intern_type*>
344		(__builtin_alloca(sizeof(intern_type) * (__size + 1)));
345	      __cfixed[0] = static_cast<intern_type>(__int_bom);
346	      char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
347	      __cfrom = reinterpret_cast<char*>(__cfixed);
348	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
349                                        &__fbytes, &__cto, &__tbytes);
350	    }
351	  else
352	    {
353	      intern_type* __cfixed = const_cast<intern_type*>(__from);
354	      __cfrom = reinterpret_cast<char*>(__cfixed);
355	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
356				       &__cto, &__tbytes);
357	    }
358
359	  if (__conv != size_t(-1))
360	    {
361	      __from_next = reinterpret_cast<const intern_type*>(__cfrom);
362	      __to_next = reinterpret_cast<extern_type*>(__cto);
363	      __ret = codecvt_base::ok;
364	    }
365	  else
366	    {
367	      if (__fbytes < __fmultiple * (__from_end - __from))
368		{
369		  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
370		  __to_next = reinterpret_cast<extern_type*>(__cto);
371		  __ret = codecvt_base::partial;
372		}
373	      else
374		__ret = codecvt_base::error;
375	    }
376	}
377      return __ret;
378    }
379
380  template<typename _InternT, typename _ExternT>
381    codecvt_base::result
382    codecvt<_InternT, _ExternT, encoding_state>::
383    do_unshift(state_type& __state, extern_type* __to,
384	       extern_type* __to_end, extern_type*& __to_next) const
385    {
386      result __ret = codecvt_base::error;
387      if (__state.good())
388	{
389	  const descriptor_type& __desc = __state.in_descriptor();
390	  const size_t __tmultiple = sizeof(intern_type);
391	  size_t __tlen = __tmultiple * (__to_end - __to);
392
393	  // Argument list for iconv specifies a byte sequence. Thus,
394	  // all to/from arrays must be brutally casted to char*.
395	  char* __cto = reinterpret_cast<char*>(__to);
396	  size_t __conv = __iconv_adaptor(iconv,__desc, NULL, NULL,
397                                          &__cto, &__tlen);
398
399	  if (__conv != size_t(-1))
400	    {
401	      __to_next = reinterpret_cast<extern_type*>(__cto);
402	      if (__tlen == __tmultiple * (__to_end - __to))
403		__ret = codecvt_base::noconv;
404	      else if (__tlen == 0)
405		__ret = codecvt_base::ok;
406	      else
407		__ret = codecvt_base::partial;
408	    }
409	  else
410	    __ret = codecvt_base::error;
411	}
412      return __ret;
413    }
414
415  template<typename _InternT, typename _ExternT>
416    codecvt_base::result
417    codecvt<_InternT, _ExternT, encoding_state>::
418    do_in(state_type& __state, const extern_type* __from,
419	  const extern_type* __from_end, const extern_type*& __from_next,
420	  intern_type* __to, intern_type* __to_end,
421	  intern_type*& __to_next) const
422    {
423      result __ret = codecvt_base::error;
424      if (__state.good())
425	{
426	  const descriptor_type& __desc = __state.in_descriptor();
427	  const size_t __fmultiple = sizeof(extern_type);
428	  size_t __flen = __fmultiple * (__from_end - __from);
429	  const size_t __tmultiple = sizeof(intern_type);
430	  size_t __tlen = __tmultiple * (__to_end - __to);
431
432	  // Argument list for iconv specifies a byte sequence. Thus,
433	  // all to/from arrays must be brutally casted to char*.
434	  char* __cto = reinterpret_cast<char*>(__to);
435	  char* __cfrom;
436	  size_t __conv;
437
438	  // Some encodings need a byte order marker as the first item
439	  // in the byte stream, to designate endian-ness. The default
440	  // value for the byte order marker is NULL, so if this is
441	  // the case, it's not necessary and we can just go on our
442	  // merry way.
443	  int __ext_bom = __state.external_bom();
444	  if (__ext_bom)
445	    {
446	      size_t __size = __from_end - __from;
447	      extern_type* __cfixed =  static_cast<extern_type*>
448		(__builtin_alloca(sizeof(extern_type) * (__size + 1)));
449	      __cfixed[0] = static_cast<extern_type>(__ext_bom);
450	      char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
451	      __cfrom = reinterpret_cast<char*>(__cfixed);
452	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
453                                       &__flen, &__cto, &__tlen);
454	    }
455	  else
456	    {
457	      extern_type* __cfixed = const_cast<extern_type*>(__from);
458	      __cfrom = reinterpret_cast<char*>(__cfixed);
459	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
460                                       &__flen, &__cto, &__tlen);
461	    }
462
463
464	  if (__conv != size_t(-1))
465	    {
466	      __from_next = reinterpret_cast<const extern_type*>(__cfrom);
467	      __to_next = reinterpret_cast<intern_type*>(__cto);
468	      __ret = codecvt_base::ok;
469	    }
470	  else
471	    {
472	      if (__flen < static_cast<size_t>(__from_end - __from))
473		{
474		  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
475		  __to_next = reinterpret_cast<intern_type*>(__cto);
476		  __ret = codecvt_base::partial;
477		}
478	      else
479		__ret = codecvt_base::error;
480	    }
481	}
482      return __ret;
483    }
484
485  template<typename _InternT, typename _ExternT>
486    int
487    codecvt<_InternT, _ExternT, encoding_state>::
488    do_encoding() const throw()
489    {
490      int __ret = 0;
491      if (sizeof(_ExternT) <= sizeof(_InternT))
492	__ret = sizeof(_InternT) / sizeof(_ExternT);
493      return __ret;
494    }
495
496  template<typename _InternT, typename _ExternT>
497    bool
498    codecvt<_InternT, _ExternT, encoding_state>::
499    do_always_noconv() const throw()
500    { return false; }
501
502  template<typename _InternT, typename _ExternT>
503    int
504    codecvt<_InternT, _ExternT, encoding_state>::
505    do_length(state_type&, const extern_type* __from,
506	      const extern_type* __end, size_t __max) const
507    { return std::min(__max, static_cast<size_t>(__end - __from)); }
508
509  // _GLIBCXX_RESOLVE_LIB_DEFECTS
510  // 74.  Garbled text for codecvt::do_max_length
511  template<typename _InternT, typename _ExternT>
512    int
513    codecvt<_InternT, _ExternT, encoding_state>::
514    do_max_length() const throw()
515    { return 1; }
516
517_GLIBCXX_END_NAMESPACE
518
519#endif
520
521#endif
522