1/* 2 * Copyright (C) 2003, 2006, 2008, 2009, 2010, 2011 Apple Inc. All rights reserved. 3 * Copyright (C) 2008 Holger Hans Peter Freyther 4 * Copyright (C) Research In Motion Limited 2011. All rights reserved. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Library General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Library General Public License for more details. 15 * 16 * You should have received a copy of the GNU Library General Public License 17 * along with this library; see the file COPYING.LIB. If not, write to 18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 * Boston, MA 02110-1301, USA. 20 * 21 */ 22 23#include "config.h" 24#include "SurrogatePairAwareTextIterator.h" 25 26#if USE(ICU_UNICODE) 27#include <unicode/unorm.h> 28#endif 29 30using namespace WTF; 31using namespace Unicode; 32 33namespace WebCore { 34 35SurrogatePairAwareTextIterator::SurrogatePairAwareTextIterator(const UChar* characters, int currentCharacter, int lastCharacter, int endCharacter) 36 : m_characters(characters) 37 , m_currentCharacter(currentCharacter) 38 , m_lastCharacter(lastCharacter) 39 , m_endCharacter(endCharacter) 40{ 41} 42 43bool SurrogatePairAwareTextIterator::consumeSlowCase(UChar32& character, unsigned& clusterLength) 44{ 45 if (character <= 0x30FE) { 46 // Deal with Hiragana and Katakana voiced and semi-voiced syllables. 47 // Normalize into composed form, and then look for glyph with base + combined mark. 48 // Check above for character range to minimize performance impact. 49 if (UChar32 normalized = normalizeVoicingMarks()) { 50 character = normalized; 51 clusterLength = 2; 52 } 53 return true; 54 } 55 56 if (!U16_IS_SURROGATE(character)) 57 return true; 58 59 // If we have a surrogate pair, make sure it starts with the high part. 60 if (!U16_IS_SURROGATE_LEAD(character)) 61 return false; 62 63 // Do we have a surrogate pair? If so, determine the full Unicode (32 bit) code point before glyph lookup. 64 // Make sure we have another character and it's a low surrogate. 65 if (m_currentCharacter + 1 >= m_endCharacter) 66 return false; 67 68 UChar low = m_characters[1]; 69 if (!U16_IS_TRAIL(low)) 70 return false; 71 72 character = U16_GET_SUPPLEMENTARY(character, low); 73 clusterLength = 2; 74 return true; 75} 76 77UChar32 SurrogatePairAwareTextIterator::normalizeVoicingMarks() 78{ 79 // According to http://www.unicode.org/Public/UNIDATA/UCD.html#Canonical_Combining_Class_Values 80 static const uint8_t hiraganaKatakanaVoicingMarksCombiningClass = 8; 81 82 if (m_currentCharacter + 1 >= m_endCharacter) 83 return 0; 84 85 if (combiningClass(m_characters[1]) == hiraganaKatakanaVoicingMarksCombiningClass) { 86#if USE(ICU_UNICODE) 87 // Normalize into composed form using 3.2 rules. 88 UChar normalizedCharacters[2] = { 0, 0 }; 89 UErrorCode uStatus = U_ZERO_ERROR; 90 int32_t resultLength = unorm_normalize(m_characters, 2, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], 2, &uStatus); 91 if (resultLength == 1 && !uStatus) 92 return normalizedCharacters[0]; 93#endif 94 } 95 96 return 0; 97} 98 99} 100