/*
 * Copyright (C) 2012 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// See ES 5.1, 15.10.2.8
//
// Maps a UTF-16 code unit to the code unit used for case-insensitive
// comparison.
//   ch      - numeric code unit to canonicalize.
//   returns - the code unit of the upper-cased character, or ch itself when
//             upper-casing is not usable (see the two guards below).
function canonicalize(ch)
{
    var upper = String.fromCharCode(ch).toUpperCase();

    // Upper-casing expanded to several code units (e.g. U+00DF -> "SS"):
    // keep the character as it is.
    if (upper.length > 1)
        return ch;

    var upperCode = upper.charCodeAt(0);

    // A character at or above 128 must not canonicalize to one below 128.
    var crossesBelow128 = ch >= 128 && upperCode < 128;
    return crossesBelow128 ? ch : upperCode;
}

// Largest code unit processed by the passes below, and the largest code unit
// treated as Latin when building the Latin table.
var MAX_UCS2 = 0xFFFF;
var MAX_LATIN = 0xFF;

// Sparse array indexed by canonical value; each populated slot holds an array
// of the character codes that canonicalize to that value.
var groupedCanonically = [];
// Pass 1: populate groupedCanonically - this is a mapping from canonicalized
// values back to the set of character codes that canonicalize to them.
for (var i = 0; i <= MAX_UCS2; ++i) {
    var ch = canonicalize(i);
    if (!groupedCanonically[ch])
        groupedCanonically[ch] = [];
    groupedCanonically[ch].push(i);
}

// Per-character classification tables, indexed by character code. Every entry
// is a string of the form "TypeName:payload"; it is split on ':' when the
// tables are printed. The type name is emitted verbatim as an identifier in
// the generated C++ (presumably enumerators declared in YarrCanonicalizeUCS2.h
// - confirm against that header).
var typeInfo = [];
var latinTypeInfo = [];
// Each entry is the sorted array of character codes forming a canonical
// equivalence set with more than two members.
var characterSetInfo = [];

// Helper: store value in info[c] for every character code c in characters.
function markAll(info, characters, value)
{
    for (var k = 0; k < characters.length; ++k)
        info[characters[k]] = value;
}

// Pass 2: populate typeInfo, latinTypeInfo & characterSetInfo. For every
// character calculate a type and a value payload.
//
// groupedCanonically is sparse, so empty slots are skipped. An indexed loop is
// used rather than for...in so that keys are numbers, inherited enumerable
// properties are never visited, and no undeclared loop variable is created.
// Populated slots are still visited in ascending order, which determines the
// numbering of the character sets.
for (var canonical = 0; canonical < groupedCanonically.length; ++canonical) {
    // The set of characters that canonicalize to `canonical`.
    var characters = groupedCanonically[canonical];
    if (!characters)
        continue;

    // If there is only one, it is unique.
    if (characters.length === 1) {
        var only = characters[0];
        typeInfo[only] = "CanonicalizeUnique:0";
        latinTypeInfo[only] = only <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
        continue;
    }

    // Sort the array numerically (in place).
    characters.sort(function(x, y) { return x - y; });

    // If there are more than two characters, create an entry in characterSetInfo.
    if (characters.length > 2) {
        // The payload is the index this set is about to receive in characterSetInfo.
        markAll(typeInfo, characters, "CanonicalizeSet:" + characterSetInfo.length);
        characterSetInfo.push(characters);

        // After sorting, a second member <= MAX_LATIN means the set holds
        // more than one Latin character, which the Latin table cannot express.
        if (characters[1] <= MAX_LATIN)
            throw new Error("sets with more than one latin character not supported!");
        if (characters[0] <= MAX_LATIN) {
            // Exactly one Latin member: the others refer to it, it refers to itself.
            markAll(latinTypeInfo, characters, "CanonicalizeLatinOther:" + characters[0]);
            latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
        } else
            markAll(latinTypeInfo, characters, "CanonicalizeLatinInvalid:0");

        continue;
    }

    // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
    var lo = characters[0];
    var hi = characters[1];
    var delta = hi - lo;
    if (delta === 1) {
        // Adjacent partners: record whether the lower one is odd or even.
        var pairType = (lo & 1) ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
        typeInfo[lo] = pairType;
        typeInfo[hi] = pairType;
    } else {
        // The payload is the distance between the two partners.
        typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
        typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
    }

    if (lo > MAX_LATIN) {
        latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
        latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
    } else if (hi > MAX_LATIN) {
        latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
        latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
    } else {
        // Both partners are Latin: they must differ by exactly the 0x20 bit,
        // with that bit clear in the lower one.
        if (delta !== 0x20 || (lo & 0x20))
            throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
        latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
        latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
    }
}

// Helper: coalesce a per-character table (indexed 0..MAX_UCS2) into an array
// of {begin, end, type} records, one per maximal run of consecutive characters
// sharing the same type string. begin and end are inclusive.
function coalesceRanges(info)
{
    var ranges = [];
    for (var end = 0; end <= MAX_UCS2; ++end) {
        var begin = end;
        var type = info[end];
        while (end < MAX_UCS2 && info[end + 1] === type)
            ++end;
        ranges.push({begin: begin, end: end, type: type});
    }
    return ranges;
}

// Pass 3: coalesce types into ranges.
var rangeInfo = coalesceRanges(typeInfo);

// Pass 4: coalesce latin-1 types into ranges.
var latinRangeInfo = coalesceRanges(latinTypeInfo);


// Helper function to convert a number to a fixed width hex representation of a C uint16_t.
// Accepts a number or a numeric string; the result is "0x", at least four
// lower-case hex digits (zero padded), then a "u" suffix.
function hex(x)
{
    var s = Number(x).toString(16);
    while (s.length < 4)
        s = "0" + s;
    return "0x" + s + "u";
}

// Helper: print one initializer row per range: begin, end, payload, type name.
function printRangeTable(ranges)
{
    for (var k = 0; k < ranges.length; ++k) {
        var info = ranges[k];
        // type is "TypeName:payload".
        var typeAndValue = info.type.split(':');
        print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
    }
}

// License header emitted at the top of the generated file.
var copyright = [
    "/*",
    " * Copyright (C) 2012 Apple Inc. All rights reserved.",
    " *",
    " * Redistribution and use in source and binary forms, with or without",
    " * modification, are permitted provided that the following conditions",
    " * are met:",
    " * 1. Redistributions of source code must retain the above copyright",
    " * notice, this list of conditions and the following disclaimer.",
    " * 2. Redistributions in binary form must reproduce the above copyright",
    " * notice, this list of conditions and the following disclaimer in the",
    " * documentation and/or other materials provided with the distribution.",
    " *",
    " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY",
    " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE",
    " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR",
    " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR",
    " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,",
    " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,",
    " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR",
    " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY",
    " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT",
    " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE",
    " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ",
    " */"
].join("\n");

// Emit the generated C++ source through the shell's print().
print(copyright);
print();
print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
print();
print('#include "config.h"');
print('#include "YarrCanonicalizeUCS2.h"');
print();
print("namespace JSC { namespace Yarr {");
print();
// NOTE(review): this #include is emitted after the namespace has been opened;
// it is left in place so the generated output stays unchanged.
print("#include <stdint.h>");
print();

// One zero-terminated array per canonical equivalence set.
for (var setIndex = 0; setIndex < characterSetInfo.length; ++setIndex) {
    var set = characterSetInfo[setIndex];
    var members = "";
    for (var j = 0; j < set.length; ++j)
        members += hex(set[j]) + ", ";
    print("uint16_t ucs2CharacterSet" + setIndex + "[] = { " + members + "0 };");
}
print();
print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
for (var setIndex = 0; setIndex < characterSetInfo.length; ++setIndex)
    print(" ucs2CharacterSet" + setIndex + ",");
print("};");
print();
print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
printRangeTable(rangeInfo);
print("};");
print();
print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
printRangeTable(latinRangeInfo);
print("};");
print();
print("} } // JSC::Yarr");
print();