1///////////////////////////////////////////////////////////////////////////////
2// Name:        src/common/convauto.cpp
3// Purpose:     implementation of wxConvAuto
4// Author:      Vadim Zeitlin
5// Created:     2006-04-04
6// RCS-ID:      $Id: convauto.cpp 38570 2006-04-05 14:37:47Z VZ $
7// Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
8// Licence:     wxWindows licence
9///////////////////////////////////////////////////////////////////////////////
10
11// ============================================================================
12// declarations
13// ============================================================================
14
15// ----------------------------------------------------------------------------
16// headers
17// ----------------------------------------------------------------------------
18
19// for compilers that support precompilation, includes "wx.h".
20#include "wx/wxprec.h"
21
22#ifdef __BORLANDC__
23    #pragma hdrstop
24#endif
25
26#if wxUSE_WCHAR_T
27
28#ifndef WX_PRECOMP
29#endif //WX_PRECOMP
30
31#include "wx/convauto.h"
32
33// ============================================================================
34// implementation
35// ============================================================================
36
37/* static */
38wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
39{
40    if ( srcLen < 2 )
41    {
42        // minimal BOM is 2 bytes so bail out immediately and simplify the code
43        // below which wouldn't need to check for length for UTF-16 cases
44        return BOM_None;
45    }
46
47    // examine the buffer for BOM presence
48    //
49    // see http://www.unicode.org/faq/utf_bom.html#BOM
50    switch ( *src++ )
51    {
52        case '\0':
53            // could only be big endian UTF-32 (00 00 FE FF)
54            if ( srcLen >= 4 &&
55                    src[0] == '\0' &&
56                        src[1] == '\xfe' &&
57                            src[2] == '\xff' )
58            {
59                return BOM_UTF32BE;
60            }
61            break;
62
63        case '\xfe':
64            // could only be big endian UTF-16 (FE FF)
65            if ( *src++ == '\xff' )
66            {
67                return BOM_UTF16BE;
68            }
69            break;
70
71        case '\xff':
72            // could be either little endian UTF-16 or UTF-32, both start
73            // with FF FE
74            if ( *src++ == '\xfe' )
75            {
76                return srcLen >= 4 && src[0] == '\0' && src[1] == '\0'
77                            ? BOM_UTF32LE
78                            : BOM_UTF16LE;
79            }
80            break;
81
82        case '\xef':
83            // is this UTF-8 BOM (EF BB BF)?
84            if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' )
85            {
86                return BOM_UTF8;
87            }
88            break;
89    }
90
91    return BOM_None;
92}
93
94void wxConvAuto::InitFromBOM(BOMType bomType)
95{
96    m_consumedBOM = false;
97
98    switch ( bomType )
99    {
100        case BOM_UTF32BE:
101            m_conv = new wxMBConvUTF32BE;
102            m_ownsConv = true;
103            break;
104
105        case BOM_UTF32LE:
106            m_conv = new wxMBConvUTF32LE;
107            m_ownsConv = true;
108            break;
109
110        case BOM_UTF16BE:
111            m_conv = new wxMBConvUTF16BE;
112            m_ownsConv = true;
113            break;
114
115        case BOM_UTF16LE:
116            m_conv = new wxMBConvUTF16LE;
117            m_ownsConv = true;
118            break;
119
120        case BOM_UTF8:
121            m_conv = &wxConvUTF8;
122            m_ownsConv = false;
123            break;
124
125        default:
126            wxFAIL_MSG( _T("unexpected BOM type") );
127            // fall through: still need to create something
128
129        case BOM_None:
130            InitWithDefault();
131            m_consumedBOM = true; // as there is nothing to consume
132    }
133}
134
135void wxConvAuto::SkipBOM(const char **src, size_t *len) const
136{
137    int ofs;
138    switch ( m_bomType )
139    {
140        case BOM_UTF32BE:
141        case BOM_UTF32LE:
142            ofs = 4;
143            break;
144
145        case BOM_UTF16BE:
146        case BOM_UTF16LE:
147            ofs = 2;
148            break;
149
150        case BOM_UTF8:
151            ofs = 3;
152            break;
153
154        default:
155            wxFAIL_MSG( _T("unexpected BOM type") );
156            // fall through: still need to create something
157
158        case BOM_None:
159            ofs = 0;
160    }
161
162    *src += ofs;
163    if ( *len != (size_t)-1 )
164        *len -= ofs;
165}
166
167void wxConvAuto::InitFromInput(const char **src, size_t *len)
168{
169    m_bomType = DetectBOM(*src, *len);
170    InitFromBOM(m_bomType);
171    SkipBOM(src, len);
172}
173
174size_t
175wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
176                    const char *src, size_t srcLen) const
177{
178    // we check BOM and create the appropriate conversion the first time we're
179    // called but we also need to ensure that the BOM is skipped not only
180    // during this initial call but also during the first call with non-NULL
181    // dst as typically we're first called with NULL dst to calculate the
182    // needed buffer size
183    wxConvAuto *self = wx_const_cast(wxConvAuto *, this);
184    if ( !m_conv )
185    {
186        self->InitFromInput(&src, &srcLen);
187        if ( dst )
188            self->m_consumedBOM = true;
189    }
190
191    if ( !m_consumedBOM && dst )
192    {
193        self->m_consumedBOM = true;
194        SkipBOM(&src, &srcLen);
195    }
196
197    return m_conv->ToWChar(dst, dstLen, src, srcLen);
198}
199
200size_t
201wxConvAuto::FromWChar(char *dst, size_t dstLen,
202                      const wchar_t *src, size_t srcLen) const
203{
204    if ( !m_conv )
205    {
206        // default to UTF-8 for the multibyte output
207        wx_const_cast(wxConvAuto *, this)->InitWithDefault();
208    }
209
210    return m_conv->FromWChar(dst, dstLen, src, srcLen);
211}
212
213#endif // wxUSE_WCHAR_T
214
215