1/*
2 * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "TextCodecMac.h"
29
30#include "CharsetData.h"
31#include "ThreadGlobalData.h"
32#include <wtf/Assertions.h>
33#include <wtf/PassOwnPtr.h>
34#include <wtf/RetainPtr.h>
35#include <wtf/Threading.h>
36#include <wtf/text/CString.h>
37#include <wtf/text/WTFString.h>
38#include <wtf/unicode/CharacterNames.h>
39
40using namespace std;
41
42namespace WebCore {
43
// We need to keep this TEC-based codec because ICU doesn't support some of the encodings that we need:
// <http://bugs.webkit.org/show_bug.cgi?id=4195>.
46
// Element count of the on-stack UniChar buffer that TextCodecMac::decode() converts into.
const size_t ConversionBufferSize = 16 * 1024;
48
49static TECConverterWrapper& cachedConverterTEC()
50{
51    return threadGlobalData().cachedConverterTEC();
52}
53
54void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar)
55{
56    TECTextEncodingID lastEncoding = invalidEncoding;
57    const char* lastName = 0;
58
59    for (size_t i = 0; CharsetTable[i].name; ++i) {
60        if (CharsetTable[i].encoding != lastEncoding) {
61            lastEncoding = CharsetTable[i].encoding;
62            lastName = CharsetTable[i].name;
63        }
64        registrar(CharsetTable[i].name, lastName);
65    }
66}
67
68static PassOwnPtr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData)
69{
70    return adoptPtr(new TextCodecMac(*static_cast<const TECTextEncodingID*>(additionalData)));
71}
72
73void TextCodecMac::registerCodecs(TextCodecRegistrar registrar)
74{
75    TECTextEncodingID lastEncoding = invalidEncoding;
76
77    for (size_t i = 0; CharsetTable[i].name; ++i)
78        if (CharsetTable[i].encoding != lastEncoding) {
79            registrar(CharsetTable[i].name, newTextCodecMac, &CharsetTable[i].encoding);
80            lastEncoding = CharsetTable[i].encoding;
81        }
82}
83
// Creates a codec for |encoding| with an empty partial-character buffer.
// No TEC converter is acquired here; decode() obtains one on first use.
TextCodecMac::TextCodecMac(TECTextEncodingID encoding)
    : m_encoding(encoding)
    , m_numBufferedBytes(0)
    , m_converterTEC(0)
{
}
90
// Gives up the TEC converter, if one was acquired, via releaseTECConverter().
TextCodecMac::~TextCodecMac()
{
    releaseTECConverter();
}
95
96void TextCodecMac::releaseTECConverter() const
97{
98    if (m_converterTEC) {
99        TECConverterWrapper& cachedConverter = cachedConverterTEC();
100        if (cachedConverter.converter)
101            TECDisposeConverter(cachedConverter.converter);
102        cachedConverter.converter = m_converterTEC;
103        cachedConverter.encoding = m_encoding;
104        m_converterTEC = 0;
105    }
106}
107
108OSStatus TextCodecMac::createTECConverter() const
109{
110    TECConverterWrapper& cachedConverter = cachedConverterTEC();
111
112    bool cachedEncodingEqual = cachedConverter.encoding == m_encoding;
113    cachedConverter.encoding = invalidEncoding;
114
115    if (cachedEncodingEqual && cachedConverter.converter) {
116        m_converterTEC = cachedConverter.converter;
117        cachedConverter.converter = 0;
118
119        TECClearConverterContextInfo(m_converterTEC);
120    } else {
121        OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding,
122            CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat));
123        if (status)
124            return status;
125
126        TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask);
127    }
128
129    return noErr;
130}
131
132OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength,
133    void *outputBuffer, int outputBufferLength, int& outputLength)
134{
135    OSStatus status;
136    unsigned long bytesRead = 0;
137    unsigned long bytesWritten = 0;
138
139    if (m_numBufferedBytes != 0) {
140        // Finish converting a partial character that's in our buffer.
141
142        // First, fill the partial character buffer with as many bytes as are available.
143        ASSERT_WITH_SECURITY_IMPLICATION(m_numBufferedBytes < sizeof(m_bufferedBytes));
144        const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes;
145        const int bytesToPutInBuffer = min(spaceInBuffer, inputBufferLength);
146        ASSERT(bytesToPutInBuffer != 0);
147        memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer);
148
149        // Now, do a conversion on the buffer.
150        status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead,
151            reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
152        ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer);
153
154        if (status == kTECPartialCharErr && bytesRead == 0) {
155            // Handle the case where the partial character was not converted.
156            if (bytesToPutInBuffer >= spaceInBuffer) {
157                LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes));
158                m_numBufferedBytes = 0;
159                status = kTECUnmappableElementErr; // should never happen, but use this error code
160            } else {
161                // Tell the caller we read all the source bytes and keep them in the buffer.
162                m_numBufferedBytes += bytesToPutInBuffer;
163                bytesRead = bytesToPutInBuffer;
164                status = noErr;
165            }
166        } else {
167            // We are done with the partial character buffer.
168            // Also, we have read some of the bytes from the main buffer.
169            if (bytesRead > m_numBufferedBytes) {
170                bytesRead -= m_numBufferedBytes;
171            } else {
172                LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr");
173                bytesRead = 0;
174            }
175            m_numBufferedBytes = 0;
176            if (status == kTECPartialCharErr) {
177                // While there may be a partial character problem in the small buffer,
178                // we have to try again and not get confused and think there is a partial
179                // character problem in the large buffer.
180                status = noErr;
181            }
182        }
183    } else {
184        status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead,
185            static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
186        ASSERT(static_cast<int>(bytesRead) <= inputBufferLength);
187    }
188
189    // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus.
190    if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0)
191        status = kTECOutputBufferFullStatus;
192
193    inputLength = bytesRead;
194    outputLength = bytesWritten;
195    return status;
196}
197
198String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
199{
200    // Get a converter for the passed-in encoding.
201    if (!m_converterTEC && createTECConverter() != noErr)
202        return String();
203
204    Vector<UChar> result;
205
206    const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes);
207    int sourceLength = length;
208    bool bufferWasFull = false;
209    UniChar buffer[ConversionBufferSize];
210
211    while ((sourceLength || bufferWasFull) && !sawError) {
212        int bytesRead = 0;
213        int bytesWritten = 0;
214        OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
215        ASSERT(bytesRead <= sourceLength);
216        sourcePointer += bytesRead;
217        sourceLength -= bytesRead;
218
219        switch (status) {
220            case noErr:
221            case kTECOutputBufferFullStatus:
222                break;
223            case kTextMalformedInputErr:
224            case kTextUndefinedElementErr:
225                // FIXME: Put FFFD character into the output string in this case?
226                TECClearConverterContextInfo(m_converterTEC);
227                if (stopOnError) {
228                    sawError = true;
229                    break;
230                }
231                if (sourceLength) {
232                    sourcePointer += 1;
233                    sourceLength -= 1;
234                }
235                break;
236            case kTECPartialCharErr: {
237                // Put the partial character into the buffer.
238                ASSERT(m_numBufferedBytes == 0);
239                const int bufferSize = sizeof(m_numBufferedBytes);
240                if (sourceLength < bufferSize) {
241                    memcpy(m_bufferedBytes, sourcePointer, sourceLength);
242                    m_numBufferedBytes = sourceLength;
243                } else {
244                    LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength);
245                }
246                sourceLength = 0;
247                break;
248            }
249            default:
250                sawError = true;
251                return String();
252        }
253
254        ASSERT(!(bytesWritten % sizeof(UChar)));
255        result.append(buffer, bytesWritten / sizeof(UChar));
256
257        bufferWasFull = status == kTECOutputBufferFullStatus;
258    }
259
260    if (flush) {
261        unsigned long bytesWritten = 0;
262        TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
263        ASSERT(!(bytesWritten % sizeof(UChar)));
264        result.append(buffer, bytesWritten / sizeof(UChar));
265    }
266
267    String resultString = String::adopt(result);
268
269    // <rdar://problem/3225472>
270    // Simplified Chinese pages use the code A3A0 to mean "full-width space".
271    // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
272    // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space).
273    if (m_encoding == kCFStringEncodingGB_18030_2000)
274        resultString.replace(0xE5E5, ideographicSpace);
275
276    return resultString;
277}
278
279CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling)
280{
281    // FIXME: We should really use TEC here instead of CFString for consistency with the other direction.
282
283    // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
284    // Encoding will change the yen sign back into a backslash.
285    String copy(characters, length);
286    copy.replace('\\', m_backslashAsCurrencySymbol);
287    RetainPtr<CFStringRef> cfs = copy.createCFString();
288
289    CFIndex startPos = 0;
290    CFIndex charactersLeft = CFStringGetLength(cfs.get());
291    Vector<char> result;
292    size_t size = 0;
293    UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' : 0;
294    while (charactersLeft > 0) {
295        CFRange range = CFRangeMake(startPos, charactersLeft);
296        CFIndex bufferLength;
297        CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength);
298
299        result.grow(size + bufferLength);
300        unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size);
301        CFIndex charactersConverted = CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength);
302        size += bufferLength;
303
304        if (charactersConverted != charactersLeft) {
305            unsigned badChar = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
306            ++charactersConverted;
307            if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate
308                UniChar low = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
309                if ((low & 0xFC00) == 0xDC00) { // is low surrogate
310                    badChar <<= 10;
311                    badChar += low;
312                    badChar += 0x10000 - (0xD800 << 10) - 0xDC00;
313                    ++charactersConverted;
314                }
315            }
316            UnencodableReplacementArray entity;
317            int entityLength = getUnencodableReplacement(badChar, handling, entity);
318            result.grow(size + entityLength);
319            memcpy(result.data() + size, entity, entityLength);
320            size += entityLength;
321        }
322
323        startPos += charactersConverted;
324        charactersLeft -= charactersConverted;
325    }
326    return CString(result.data(), size);
327}
328
329} // namespace WebCore
330