1/*
2 * Copyright (C) 2012 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
// Canonicalize a character code for case-insensitive matching, following the
// steps of the Canonicalize operation in ES 5.1, 15.10.2.8.
//
// ch: a numeric character code.
// Returns the character code of the upper-cased character, or ch unchanged
// when upper-casing yields more than one character, or when it would turn a
// character code >= 128 into one < 128.
function canonicalize(ch)
{
    var upper = String.fromCharCode(ch).toUpperCase();

    // Upper-casing expanded to several characters: keep the original.
    if (upper.length > 1)
        return ch;

    var upperCode = upper.charCodeAt(0);

    // A character outside the 0..127 range must not canonicalize into it.
    var crossesInto7Bit = ch >= 128 && upperCode < 128;
    return crossesInto7Bit ? ch : upperCode;
}
37
// Upper bounds (inclusive) of the UCS2 and latin-1 character code ranges.
var MAX_UCS2 = 0xFFFF;
var MAX_LATIN = 0xFF;

// Pass 1: populate groupedCanonically - a mapping from each canonicalized
// value back to the list of character codes that canonicalize to it. Codes are
// appended in ascending order, and entries only exist for values that at least
// one character canonicalizes to.
var groupedCanonically = [];
for (var i = 0; i <= MAX_UCS2; ++i) {
    var canonical = canonicalize(i);
    if (groupedCanonically[canonical])
        groupedCanonically[canonical].push(i);
    else
        groupedCanonically[canonical] = [i];
}
50
var typeInfo = [];
var latinTypeInfo = [];
var characterSetInfo = [];
// Pass 2: populate typeInfo, latinTypeInfo & characterSetInfo. Every character
// is given a "<type>:<value>" string in both typeInfo and latinTypeInfo. The
// value payload is 0 except for:
//   CanonicalizeSet:<n>          - n indexes the character's set in characterSetInfo.
//   CanonicalizeRangeLo:<delta>,
//   CanonicalizeRangeHi:<delta>  - delta is the distance between the two partners of a pair.
//   CanonicalizeLatinOther:<c>   - c is the only latin-1 member of the character's group.
//
// The canonical values are visited with an explicit, declared index in
// ascending order. This keeps the numbering of the character sets independent
// of for-in enumeration order, ignores any enumerable properties inherited by
// arrays, and avoids assigning to an undeclared loop variable.
for (var cu = 0; cu <= MAX_UCS2; ++cu) {
    // The set of characters that canonicalize to cu (absent if there are none).
    var characters = groupedCanonically[cu];
    if (!characters)
        continue;

    // If there is only one, it is unique.
    if (characters.length == 1) {
        var only = characters[0];
        typeInfo[only] = "CanonicalizeUnique:0";
        latinTypeInfo[only] = only <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
        continue;
    }

    // Sort the array numerically, so characters[0] is the lowest member.
    characters.sort(function(x,y){return x-y;});

    // If there are more than two characters, create an entry in characterSetInfo.
    if (characters.length > 2) {
        var setType = "CanonicalizeSet:" + characterSetInfo.length;
        for (var k = 0; k < characters.length; ++k)
            typeInfo[characters[k]] = setType;
        characterSetInfo.push(characters);

        // Only the lowest member of a set may be a latin-1 character.
        if (characters[1] <= MAX_LATIN)
            throw new Error("sets with more than one latin character not supported!");
        if (characters[0] <= MAX_LATIN) {
            // The latin-1 member maps to itself; every other member maps to it.
            latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
            for (k = 1; k < characters.length; ++k)
                latinTypeInfo[characters[k]] = "CanonicalizeLatinOther:" + characters[0];
        } else {
            for (k = 0; k < characters.length; ++k)
                latinTypeInfo[characters[k]] = "CanonicalizeLatinInvalid:0";
        }

        continue;
    }

    // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
    var lo = characters[0];
    var hi = characters[1];
    var delta = hi - lo;
    if (delta == 1) {
        // Adjacent partners: "Aligned" when the low partner is even, "Unaligned" when it is odd.
        var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
        typeInfo[lo] = type;
        typeInfo[hi] = type;
    } else {
        typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
        typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
    }

    if (lo > MAX_LATIN) {
        // Neither partner is a latin-1 character.
        latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
        latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
    } else if (hi > MAX_LATIN) {
        // Only the low partner is a latin-1 character.
        latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
        latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
    } else {
        // Both partners are latin-1: they must differ exactly by bit 0x20,
        // with that bit clear in the low partner.
        if (delta != 0x20 || (lo & 0x20))
            throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
        latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
        latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
    }
}
116
// Coalesce a per-character table of type strings into ranges.
//
// info: array indexed by character code (0..MAX_UCS2) holding a type string
//       for each character.
// Returns an array of {begin, end, type} records in ascending order, one for
// each maximal run of consecutive character codes (begin..end, inclusive)
// whose entries in info compare equal.
function coalesceRanges(info)
{
    var ranges = [];
    for (var end = 0; end <= MAX_UCS2; ++end) {
        var begin = end;
        var type = info[end];
        // Extend the run while the next character has the same type.
        while (end < MAX_UCS2 && info[end + 1] == type)
            ++end;
        ranges.push({begin:begin, end:end, type:type});
    }
    return ranges;
}

// Pass 3: coalesce types into ranges.
var rangeInfo = coalesceRanges(typeInfo);

// Pass 4: coalesce latin-1 types into ranges.
var latinRangeInfo = coalesceRanges(latinTypeInfo);
136
137
// Helper function to format a value as a C uint16_t hex literal: the value is
// converted with Number(), written in lower-case hex, left-padded with zeros
// to at least four digits, prefixed with "0x" and suffixed with "u".
function hex(x)
{
    var digits = Number(x).toString(16);
    var padding = "";
    for (var width = digits.length; width < 4; ++width)
        padding += "0";
    return "0x" + padding + digits + "u";
}
146
// License header text for the generated file: the lines below joined with
// newlines, with no newline after the final line.
var copyright = [
    "/*",
    " * Copyright (C) 2012 Apple Inc. All rights reserved.",
    " *",
    " * Redistribution and use in source and binary forms, with or without",
    " * modification, are permitted provided that the following conditions",
    " * are met:",
    " * 1. Redistributions of source code must retain the above copyright",
    " *    notice, this list of conditions and the following disclaimer.",
    " * 2. Redistributions in binary form must reproduce the above copyright",
    " *    notice, this list of conditions and the following disclaimer in the",
    " *    documentation and/or other materials provided with the distribution.",
    " *",
    " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY",
    " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE",
    " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR",
    " * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR",
    " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,",
    " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,",
    " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR",
    " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY",
    " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT",
    " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE",
    " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ",
    " */"
].join("\n");
172
// Emit the generated C++ source, one print() call per output line.

// Print one "{ begin, end, value, type }," initializer row for each record in
// ranges. Each record's type string has the form "<type>:<value>"; the value
// goes through hex() and the type name is printed as-is.
function printRangeRows(ranges)
{
    for (var r = 0; r < ranges.length; ++r) {
        var range = ranges[r];
        var parts = range.type.split(':');
        print("    { " + hex(range.begin) + ", " + hex(range.end) + ", " + hex(parts[1]) + ", " + parts[0] + " },");
    }
}

// File preamble.
print(copyright);
print();
print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
print();
print('#include "config.h"');
print('#include "YarrCanonicalizeUCS2.h"');
print();
print("namespace JSC { namespace Yarr {");
print();
print("#include <stdint.h>");
print();

// One array per character set, listing its members followed by a trailing 0.
for (var s = 0; s < characterSetInfo.length; ++s) {
    var members = characterSetInfo[s];
    var initializers = [];
    for (var m = 0; m < members.length; ++m)
        initializers.push(hex(members[m]));
    initializers.push("0");
    print("uint16_t ucs2CharacterSet" + s + "[] = { " + initializers.join(", ") + " };");
}
print();

// Table of pointers to the per-set arrays, in set-index order.
print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
for (s = 0; s < characterSetInfo.length; ++s)
    print("    ucs2CharacterSet" + s + ",");
print("};");
print();

// Range table built from rangeInfo.
print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
printRangeRows(rangeInfo);
print("};");
print();

// Range table built from latinRangeInfo.
print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
printRangeRows(latinRangeInfo);
print("};");
print();
print("} } // JSC::Yarr");
print();
219
220