1/*
2 * Copyright (C) 2003, 2006, 2008, 2009, 2010, 2011 Apple Inc. All rights reserved.
3 * Copyright (C) 2008 Holger Hans Peter Freyther
4 * Copyright (C) Research In Motion Limited 2011. All rights reserved.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB.  If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 *
21 */
22
23#include "config.h"
24#include "SurrogatePairAwareTextIterator.h"
25
26#if USE(ICU_UNICODE)
27#include <unicode/unorm.h>
28#endif
29
30using namespace WTF;
31using namespace Unicode;
32
33namespace WebCore {
34
35SurrogatePairAwareTextIterator::SurrogatePairAwareTextIterator(const UChar* characters, int currentCharacter, int lastCharacter, int endCharacter)
36    : m_characters(characters)
37    , m_currentCharacter(currentCharacter)
38    , m_lastCharacter(lastCharacter)
39    , m_endCharacter(endCharacter)
40{
41}
42
43bool SurrogatePairAwareTextIterator::consumeSlowCase(UChar32& character, unsigned& clusterLength)
44{
45    if (character <= 0x30FE) {
46        // Deal with Hiragana and Katakana voiced and semi-voiced syllables.
47        // Normalize into composed form, and then look for glyph with base + combined mark.
48        // Check above for character range to minimize performance impact.
49        if (UChar32 normalized = normalizeVoicingMarks()) {
50            character = normalized;
51            clusterLength = 2;
52        }
53        return true;
54    }
55
56    if (!U16_IS_SURROGATE(character))
57        return true;
58
59    // If we have a surrogate pair, make sure it starts with the high part.
60    if (!U16_IS_SURROGATE_LEAD(character))
61        return false;
62
63    // Do we have a surrogate pair? If so, determine the full Unicode (32 bit) code point before glyph lookup.
64    // Make sure we have another character and it's a low surrogate.
65    if (m_currentCharacter + 1 >= m_endCharacter)
66        return false;
67
68    UChar low = m_characters[1];
69    if (!U16_IS_TRAIL(low))
70        return false;
71
72    character = U16_GET_SUPPLEMENTARY(character, low);
73    clusterLength = 2;
74    return true;
75}
76
77UChar32 SurrogatePairAwareTextIterator::normalizeVoicingMarks()
78{
79    // According to http://www.unicode.org/Public/UNIDATA/UCD.html#Canonical_Combining_Class_Values
80    static const uint8_t hiraganaKatakanaVoicingMarksCombiningClass = 8;
81
82    if (m_currentCharacter + 1 >= m_endCharacter)
83        return 0;
84
85    if (combiningClass(m_characters[1]) == hiraganaKatakanaVoicingMarksCombiningClass) {
86#if USE(ICU_UNICODE)
87        // Normalize into composed form using 3.2 rules.
88        UChar normalizedCharacters[2] = { 0, 0 };
89        UErrorCode uStatus = U_ZERO_ERROR;
90        int32_t resultLength = unorm_normalize(m_characters, 2, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], 2, &uStatus);
91        if (resultLength == 1 && !uStatus)
92            return normalizedCharacters[0];
93#endif
94    }
95
96    return 0;
97}
98
99}
100