1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2013, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#include "cmemory.h"
11
12#if !UCONFIG_NO_CONVERSION
13#include "csrsbcs.h"
14#include "csmatch.h"
15
/* Number of bytes that make up one n-gram. */
#define N_GRAM_SIZE 3
/* Keeps only the low 24 bits (three bytes) of the rolling n-gram value. */
#define N_GRAM_MASK 0xFFFFFF
/*
 * Element count of a true array (not a pointer).  The argument is
 * parenthesized so that any array-valued expression expands safely.
 */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
19
20U_NAMESPACE_BEGIN
21
22NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
23 : ngram(0), byteIndex(0)
24{
25    ngramList = theNgramList;
26    charMap   = theCharMap;
27
28    ngramCount = hitCount = 0;
29}
30
31/*
32 * Binary search for value in table, which must have exactly 64 entries.
33 */
34
35int32_t NGramParser::search(const int32_t *table, int32_t value)
36{
37    int32_t index = 0;
38
39    if (table[index + 32] <= value) {
40        index += 32;
41    }
42
43    if (table[index + 16] <= value) {
44        index += 16;
45    }
46
47    if (table[index + 8] <= value) {
48        index += 8;
49    }
50
51    if (table[index + 4] <= value) {
52        index += 4;
53    }
54
55    if (table[index + 2] <= value) {
56        index += 2;
57    }
58
59    if (table[index + 1] <= value) {
60        index += 1;
61    }
62
63    if (table[index] > value) {
64        index -= 1;
65    }
66
67    if (index < 0 || table[index] != value) {
68        return -1;
69    }
70
71    return index;
72}
73
74void NGramParser::lookup(int32_t thisNgram)
75{
76    ngramCount += 1;
77
78    if (search(ngramList, thisNgram) >= 0) {
79        hitCount += 1;
80    }
81
82}
83
84void NGramParser::addByte(int32_t b)
85{
86    ngram = ((ngram << 8) + b) & N_GRAM_MASK;
87    lookup(ngram);
88}
89
90int32_t NGramParser::nextByte(InputText *det)
91{
92    if (byteIndex >= det->fInputLen) {
93        return -1;
94    }
95
96    return det->fInputBytes[byteIndex++];
97}
98
99void NGramParser::parseCharacters(InputText *det)
100{
101    int32_t b;
102    bool ignoreSpace = FALSE;
103
104    while ((b = nextByte(det)) >= 0) {
105        uint8_t mb = charMap[b];
106
107        // TODO: 0x20 might not be a space in all character sets...
108        if (mb != 0) {
109            if (!(mb == 0x20 && ignoreSpace)) {
110                addByte(mb);
111            }
112
113            ignoreSpace = (mb == 0x20);
114        }
115    }
116}
117
118int32_t NGramParser::parse(InputText *det)
119{
120    parseCharacters(det);
121
122    // TODO: Is this OK? The buffer could have ended in the middle of a word...
123    addByte(0x20);
124
125    double rawPercent = (double) hitCount / (double) ngramCount;
126
127    //            if (rawPercent <= 2.0) {
128    //                return 0;
129    //            }
130
131    // TODO - This is a bit of a hack to take care of a case
132    // were we were getting a confidence of 135...
133    if (rawPercent > 0.33) {
134        return 98;
135    }
136
137    return (int32_t) (rawPercent * 300.0);
138}
139
// 256-entry byte-to-byte table used by NGramParser_IBM420::nextByte().
// Each row holds the 16 entries for the index range named at its end.
static const uint8_t unshapeMap_IBM420[] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00-0x0F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10-0x1F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x20-0x2F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x30-0x3F
    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, // 0x40-0x4F
    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, // 0x50-0x5F
    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, // 0x70-0x7F
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, // 0x80-0x8F
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, // 0x90-0x9F
    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, // 0xA0-0xAF
    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, // 0xB0-0xBF
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, // 0xC0-0xCF
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xE0-0xEF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, // 0xF0-0xFF
};
159
160NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap)
161{
162	alef = 0x00;
163}
164
165
166int32_t NGramParser_IBM420::isLamAlef(int32_t b)
167{
168	if(b == 0xB2 || b == 0xB3){
169         	return 0x47;
170        }else if(b == 0xB4 || b == 0xB5){
171         	return 0x49;
172        }else if(b == 0xB8 || b == 0xB9){
173         	return 0x56;
174        }else
175         	return 0x00;
176}
177
178/*
179* Arabic shaping needs to be done manually. Cannot call ArabicShaping class
180* because CharsetDetector is dealing with bytes not Unicode code points. We could
181* convert the bytes to Unicode code points but that would leave us dependent
182* on CharsetICU which we try to avoid. IBM420 converter amongst different versions
183* of JDK can produce different results and therefore is also avoided.
184*/
185int32_t NGramParser_IBM420::nextByte(InputText *det)
186{
187
188    if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
189        return -1;
190    }
191    int next;
192
193    alef = isLamAlef(det->fInputBytes[byteIndex]);
194    if(alef != 0x00)
195        next = 0xB1 & 0xFF;
196    else
197        next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF;
198
199    byteIndex++;
200
201    return next;
202}
203
204void NGramParser_IBM420::parseCharacters(InputText *det)
205{
206	int32_t b;
207    bool ignoreSpace = FALSE;
208
209    while ((b = nextByte(det)) >= 0) {
210        uint8_t mb = charMap[b];
211
212        // TODO: 0x20 might not be a space in all character sets...
213        if (mb != 0) {
214            if (!(mb == 0x20 && ignoreSpace)) {
215                addByte(mb);
216            }
217            ignoreSpace = (mb == 0x20);
218        }
219
220		if(alef != 0x00){
221            mb = charMap[alef & 0xFF];
222
223            // TODO: 0x20 might not be a space in all character sets...
224            if (mb != 0) {
225                if (!(mb == 0x20 && ignoreSpace)) {
226                    addByte(mb);
227                }
228
229                ignoreSpace = (mb == 0x20);
230            }
231
232        }
233    }
234}
235
236CharsetRecog_sbcs::CharsetRecog_sbcs()
237{
238    // nothing else to do
239}
240
241CharsetRecog_sbcs::~CharsetRecog_sbcs()
242{
243    // nothing to do
244}
245
246int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[],  const uint8_t byteMap[]) const
247{
248    NGramParser parser(ngrams, byteMap);
249    int32_t result;
250
251    result = parser.parse(det);
252
253    return result;
254}
255
// 256-entry byte map, one row of 16 entries per index range.
// Presumably handed to the n-gram parser as its charMap, which drops
// bytes mapped to 0x00 and treats 0x20 as a space.
static const uint8_t charMap_8859_1[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x00-0x0F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x10-0x1F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x20-0x2F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x30-0x3F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x40-0x4F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x50-0x5F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x70-0x7F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x80-0x8F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x90-0x9F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xA0-0xAF
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xB0-0xBF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xC0-0xCF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xE0-0xEF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, // 0xF0-0xFF
};
290
// 256-entry byte map, one row of 16 entries per index range.
// Presumably handed to the n-gram parser as its charMap, which drops
// bytes mapped to 0x00 and treats 0x20 as a space.
static const uint8_t charMap_8859_2[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x00-0x0F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x10-0x1F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x20-0x2F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x30-0x3F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x40-0x4F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x50-0x5F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x70-0x7F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x80-0x8F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x90-0x9F
    0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20, 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF, // 0xA0-0xAF
    0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7, 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF, // 0xB0-0xBF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xC0-0xCF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xE0-0xEF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20, // 0xF0-0xFF
};
325
// 256-entry byte map, one row of 16 entries per index range.
// Presumably handed to the n-gram parser as its charMap, which drops
// bytes mapped to 0x00 and treats 0x20 as a space.
static const uint8_t charMap_8859_5[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x00-0x0F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x10-0x1F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x20-0x2F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x30-0x3F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x40-0x4F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x50-0x5F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x70-0x7F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x80-0x8F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x90-0x9F
    0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF, // 0xA0-0xAF
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, // 0xB0-0xBF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xC0-0xCF
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xE0-0xEF
    0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF, // 0xF0-0xFF
};
360
// 256-entry byte map, one row of 16 entries per index range.
// Presumably handed to the n-gram parser as its charMap, which drops
// bytes mapped to 0x00 and treats 0x20 as a space.
static const uint8_t charMap_8859_6[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x00-0x0F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x10-0x1F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x20-0x2F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x30-0x3F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x40-0x4F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x50-0x5F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x70-0x7F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x80-0x8F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x90-0x9F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xA0-0xAF
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xB0-0xBF
    0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, // 0xC0-0xCF
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xE0-0xEF
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xF0-0xFF
};
395
// 256-entry byte map, one row of 16 entries per index range.
// Presumably handed to the n-gram parser as its charMap, which drops
// bytes mapped to 0x00 and treats 0x20 as a space.
static const uint8_t charMap_8859_7[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x00-0x0F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x10-0x1F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x20-0x2F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x30-0x3F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x40-0x4F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x50-0x5F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x70-0x7F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x80-0x8F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x90-0x9F
    0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xA0-0xAF
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20, 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE, // 0xB0-0xBF
    0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xC0-0xCF
    0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xE0-0xEF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20, // 0xF0-0xFF
};
430
// 256-entry byte map, one row of 16 entries per index range.
// Presumably handed to the n-gram parser as its charMap, which drops
// bytes mapped to 0x00 and treats 0x20 as a space.
static const uint8_t charMap_8859_8[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x00-0x0F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x10-0x1F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x20-0x2F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x30-0x3F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x40-0x4F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x50-0x5F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x70-0x7F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x80-0x8F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x90-0x9F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xA0-0xAF
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xB0-0xBF
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xC0-0xCF
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xE0-0xEF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xF0-0xFF
};
465
// 256-entry byte map, one row of 16 entries per index range.
// Presumably handed to the n-gram parser as its charMap, which drops
// bytes mapped to 0x00 and treats 0x20 as a space.
static const uint8_t charMap_8859_9[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x00-0x0F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x10-0x1F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x20-0x2F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x30-0x3F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x40-0x4F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x50-0x5F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x70-0x7F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x80-0x8F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x90-0x9F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xA0-0xAF
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xB0-0xBF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xC0-0xCF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xE0-0xEF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, // 0xF0-0xFF
};
500
// 64 three-byte n-grams packed as 24-bit values, in ascending order
// (the 64-entry binary search in NGramParser::search needs that shape).
static const int32_t ngrams_windows_1251[] = {
    0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0,
    0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
    0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2,
    0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
    0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF,
    0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
    0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2,
    0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
};
507
// 256-entry byte map, one row of 16 entries per index range.
// Presumably handed to the n-gram parser as its charMap, which drops
// bytes mapped to 0x00 and treats 0x20 as a space.
static const uint8_t charMap_windows_1251[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x00-0x0F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x10-0x1F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x20-0x2F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x30-0x3F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x40-0x4F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x50-0x5F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x70-0x7F
    0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F, // 0x80-0x8F
    0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F, // 0x90-0x9F
    0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20, 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF, // 0xA0-0xAF
    0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20, 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF, // 0xB0-0xBF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xC0-0xCF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xE0-0xEF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, // 0xF0-0xFF
};
542
// 64 three-byte n-grams packed as 24-bit values, in ascending order
// (the 64-entry binary search in NGramParser::search needs that shape).
static const int32_t ngrams_windows_1256[] = {
    0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7,
    0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
    0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3,
    0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
    0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920,
    0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
    0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1,
    0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
};
549
// 256-entry byte map, one row of 16 entries per index range.
// Presumably handed to the n-gram parser as its charMap, which drops
// bytes mapped to 0x00 and treats 0x20 as a space.
static const uint8_t charMap_windows_1256[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x00-0x0F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x10-0x1F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x20-0x2F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x30-0x3F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x40-0x4F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x50-0x5F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x70-0x7F
    0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F, // 0x80-0x8F
    0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F, // 0x90-0x9F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xA0-0xAF
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xB0-0xBF
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, // 0xC0-0xCF
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xE0-0xEF
    0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20, 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF, // 0xF0-0xFF
};
584
// 64 three-byte n-grams packed as 24-bit values, in ascending order
// (the 64-entry binary search in NGramParser::search needs that shape).
static const int32_t ngrams_KOI8_R[] = {
    0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF,
    0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
    0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420,
    0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
    0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3,
    0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
    0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1,
    0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
};
591
// 256-entry byte map, one row of 16 entries per index range.
// Presumably handed to the n-gram parser as its charMap, which drops
// bytes mapped to 0x00 and treats 0x20 as a space.
static const uint8_t charMap_KOI8_R[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x00-0x0F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x10-0x1F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x20-0x2F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x30-0x3F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x40-0x4F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x50-0x5F
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x70-0x7F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x80-0x8F
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0x90-0x9F
    0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xA0-0xAF
    0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xB0-0xBF
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, // 0xC0-0xCF
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, // 0xD0-0xDF
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, // 0xE0-0xEF
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, // 0xF0-0xFF
};
626
// 64 three-byte n-grams packed as 24-bit values, in ascending order
// (the 64-entry binary search in NGramParser::search needs that shape).
static const int32_t ngrams_IBM424_he_rtl[] = {
    0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546,
    0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
    0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056,
    0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
    0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041,
    0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
    0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045,
    0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
};
633
// 64 three-byte n-grams packed as 24-bit values, in ascending order
// (the 64-entry binary search in NGramParser::search needs that shape).
static const int32_t ngrams_IBM424_he_ltr[] = {
    0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462,
    0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
    0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645,
    0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
    0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140,
    0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
    0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540,
    0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
};
640
// 256-entry byte map, one row of 16 entries per index range.
// Most entries are 0x40; the single 0x00 entry sits at index 0x7D.
static const uint8_t charMap_IBM424_he[] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00-0x0F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10-0x1F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x20-0x2F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x30-0x3F
    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x40-0x4F
    0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x50-0x5F
    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x60-0x6F
    0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40, // 0x70-0x7F
    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x80-0x8F
    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x90-0x9F
    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0xA0-0xAF
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0xB0-0xBF
    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0xC0-0xCF
    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0xD0-0xDF
    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0xE0-0xEF
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0xF0-0xFF
};
660
// 64 three-byte n-grams packed as 24-bit values, in ascending order
// (the 64-entry binary search in NGramParser::search needs that shape).
static const int32_t ngrams_IBM420_ar_rtl[] = {
    0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56,
    0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
    0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB,
    0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
    0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240,
    0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
    0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1,
    0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
};
667
/*
 * N-gram table used by CharsetRecog_IBM420_ar_ltr::match().
 * Each entry holds three byte values packed into the low 24 bits.
 * The table has exactly 64 entries in ascending order; the fixed-depth
 * binary search in NGramParser::search() depends on both properties, so
 * keep the count and the ordering intact when editing.
 */
static const int32_t ngrams_IBM420_ar_ltr[] = {
    0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
    0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
    0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
    0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
};
674
/*
 * 256-entry byte map passed (as byteMap) to NGramParser_IBM420 together with
 * the IBM420 n-gram tables. The row label is the high nibble of the index
 * and the column header is the low nibble.
 * Many positions map to 0x40. Rows C-, D- and E- reuse the values of rows
 * 8-, 9- and A- in columns 1-9.
 * NOTE(review): presumably 0x40 is the EBCDIC space and the C-/D-/E- rows fold
 * upper-case Latin letters onto lower case -- confirm against the parser.
 */
static const uint8_t charMap_IBM420_ar[]= {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4- */    0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 5- */    0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 6- */    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
/* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
/* A- */    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
/* B- */    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
/* C- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
/* D- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
/* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
/* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
};
694
// N-gram tables for ISO-8859-1, -2, -5, -6, -7, -8 and -9
696
/*
 * Pairs one n-gram table with the language string it stands for.
 * Used for charsets that are tried against several languages
 * (ngrams_8859_1 and ngrams_8859_2).
 */
struct NGramsPlusLang {
    const int32_t ngrams[64];   // 64 entries, the size NGramParser::search() requires
    const char *  lang;         // language string handed to CharsetMatch::set()
};
701
/*
 * Per-language n-gram tables tried by CharsetRecog_8859_1::match().
 * Each table has 64 entries in ascending order (three byte values packed
 * into 24 bits); keep the count and the ordering intact when editing.
 */
static const NGramsPlusLang ngrams_8859_1[] =  {
  {
    {
    0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
    0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
    0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
    0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
    },
    "en"
  },
  {
    {
    0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
    0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
    0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
    0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
    },
    "da"
  },
  {
    {
    0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
    0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
    0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
    0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
    },
    "de"
  },
  {
    {
    0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
    0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
    0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
    0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
    },
    "es"
  },
  {
    {
    0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
    0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
    0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
    0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
    },
    "fr"
  },
  {
    {
    0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
    0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
    0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
    0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
    },
    "it"
  },
  {
    {
    0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
    0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
    0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
    0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
    },
    "nl"
  },
  {
    {
    0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
    0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
    0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
    0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
    },
    "no"
  },
  {
    {
    0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
    0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
    0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
    0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
    },
    "pt"
  },
  {
    {
    0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
    0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
    0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
    0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
    },
    "sv"
  }
};
794
795
/*
 * Per-language n-gram tables tried by CharsetRecog_8859_2::match().
 * Each table has 64 entries in ascending order (three byte values packed
 * into 24 bits); keep the count and the ordering intact when editing.
 */
static const NGramsPlusLang ngrams_8859_2[] =  {
  {
    {
    0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
    0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
    0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
    0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
    },
    "cs"
  },
  {
    {
    0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
    0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
    0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
    0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
    },
    "hu"
  },
  {
    {
    0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
    0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
    0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
    0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
    },
    "pl"
  },
  {
    {
    0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
    0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
    0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
    0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
    },
    "ro"
  }
};
834
/*
 * N-gram table used by CharsetRecog_8859_5_ru::match().
 * 64 entries in ascending order, each packing three byte values into
 * 24 bits; keep the count and the ordering intact when editing.
 */
static const int32_t ngrams_8859_5_ru[] = {
    0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
    0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
    0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
    0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
};
841
/*
 * N-gram table used by CharsetRecog_8859_6_ar::match().
 * 64 entries in ascending order, each packing three byte values into
 * 24 bits; keep the count and the ordering intact when editing.
 */
static const int32_t ngrams_8859_6_ar[] = {
    0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
    0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
    0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
    0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
};
848
/*
 * N-gram table used by CharsetRecog_8859_7_el::match().
 * 64 entries in ascending order, each packing three byte values into
 * 24 bits; keep the count and the ordering intact when editing.
 */
static const int32_t ngrams_8859_7_el[] = {
    0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
    0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
    0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
    0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
};
855
/*
 * N-gram table used by CharsetRecog_8859_8_I_he::match().
 * 64 entries in ascending order, each packing three byte values into
 * 24 bits; keep the count and the ordering intact when editing.
 */
static const int32_t ngrams_8859_8_I_he[] = {
    0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
    0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
    0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
    0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
};
862
/*
 * N-gram table used by CharsetRecog_8859_8_he::match().
 * 64 entries in ascending order, each packing three byte values into
 * 24 bits; keep the count and the ordering intact when editing.
 */
static const int32_t ngrams_8859_8_he[] = {
    0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
    0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
    0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
    0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
};
869
/*
 * N-gram table used by CharsetRecog_8859_9_tr::match().
 * 64 entries in ascending order, each packing three byte values into
 * 24 bits; keep the count and the ordering intact when editing.
 */
static const int32_t ngrams_8859_9_tr[] = {
    0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
    0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
    0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
    0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
};
876
877CharsetRecog_8859_1::~CharsetRecog_8859_1()
878{
879    // nothing to do
880}
881
882UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const {
883    const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
884    uint32_t i;
885    int32_t bestConfidenceSoFar = -1;
886    for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) {
887        const int32_t *ngrams = ngrams_8859_1[i].ngrams;
888        const char    *lang   = ngrams_8859_1[i].lang;
889        int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
890        if (confidence > bestConfidenceSoFar) {
891            results->set(textIn, this, confidence, name, lang);
892            bestConfidenceSoFar = confidence;
893        }
894    }
895    return (bestConfidenceSoFar > 0);
896}
897
898const char *CharsetRecog_8859_1::getName() const
899{
900    return "ISO-8859-1";
901}
902
903
904CharsetRecog_8859_2::~CharsetRecog_8859_2()
905{
906    // nothing to do
907}
908
909UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const {
910    const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
911    uint32_t i;
912    int32_t bestConfidenceSoFar = -1;
913    for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) {
914        const int32_t *ngrams = ngrams_8859_2[i].ngrams;
915        const char    *lang   = ngrams_8859_2[i].lang;
916        int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
917        if (confidence > bestConfidenceSoFar) {
918            results->set(textIn, this, confidence, name, lang);
919            bestConfidenceSoFar = confidence;
920        }
921    }
922    return (bestConfidenceSoFar > 0);
923}
924
925const char *CharsetRecog_8859_2::getName() const
926{
927    return "ISO-8859-2";
928}
929
930
931CharsetRecog_8859_5::~CharsetRecog_8859_5()
932{
933    // nothing to do
934}
935
936const char *CharsetRecog_8859_5::getName() const
937{
938    return "ISO-8859-5";
939}
940
941CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
942{
943    // nothing to do
944}
945
946const char *CharsetRecog_8859_5_ru::getLanguage() const
947{
948    return "ru";
949}
950
951UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
952{
953    int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
954    results->set(textIn, this, confidence);
955    return (confidence > 0);
956}
957
958CharsetRecog_8859_6::~CharsetRecog_8859_6()
959{
960    // nothing to do
961}
962
963const char *CharsetRecog_8859_6::getName() const
964{
965    return "ISO-8859-6";
966}
967
968CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
969{
970    // nothing to do
971}
972
973const char *CharsetRecog_8859_6_ar::getLanguage() const
974{
975    return "ar";
976}
977
978UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
979{
980    int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
981    results->set(textIn, this, confidence);
982    return (confidence > 0);
983}
984
985CharsetRecog_8859_7::~CharsetRecog_8859_7()
986{
987    // nothing to do
988}
989
990const char *CharsetRecog_8859_7::getName() const
991{
992    return "ISO-8859-7";
993}
994
995CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
996{
997    // nothing to do
998}
999
1000const char *CharsetRecog_8859_7_el::getLanguage() const
1001{
1002    return "el";
1003}
1004
1005UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
1006{
1007    const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
1008    int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
1009    results->set(textIn, this, confidence, name, "el");
1010    return (confidence > 0);
1011}
1012
1013CharsetRecog_8859_8::~CharsetRecog_8859_8()
1014{
1015    // nothing to do
1016}
1017
1018const char *CharsetRecog_8859_8::getName() const
1019{
1020    return "ISO-8859-8";
1021}
1022
1023CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
1024{
1025    // nothing to do
1026}
1027
1028const char *CharsetRecog_8859_8_I_he::getName() const
1029{
1030    return "ISO-8859-8-I";
1031}
1032
1033const char *CharsetRecog_8859_8_I_he::getLanguage() const
1034{
1035    return "he";
1036}
1037
1038UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
1039{
1040    const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
1041    int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
1042    results->set(textIn, this, confidence, name, "he");
1043    return (confidence > 0);
1044}
1045
1046CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
1047{
1048    // od ot gnihton
1049}
1050
1051const char *CharsetRecog_8859_8_he::getLanguage() const
1052{
1053    return "he";
1054}
1055
1056UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
1057{
1058    const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
1059    int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
1060    results->set(textIn, this, confidence, name, "he");
1061    return (confidence > 0);
1062}
1063
1064CharsetRecog_8859_9::~CharsetRecog_8859_9()
1065{
1066    // nothing to do
1067}
1068
1069const char *CharsetRecog_8859_9::getName() const
1070{
1071    return "ISO-8859-9";
1072}
1073
1074CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
1075{
1076    // nothing to do
1077}
1078
1079const char *CharsetRecog_8859_9_tr::getLanguage() const
1080{
1081    return "tr";
1082}
1083
1084UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
1085{
1086    const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
1087    int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
1088    results->set(textIn, this, confidence, name, "tr");
1089    return (confidence > 0);
1090}
1091
1092CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
1093{
1094    // nothing to do
1095}
1096
1097const char *CharsetRecog_windows_1256::getName() const
1098{
1099    return  "windows-1256";
1100}
1101
1102const char *CharsetRecog_windows_1256::getLanguage() const
1103{
1104    return "ar";
1105}
1106
1107UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
1108{
1109    int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
1110    results->set(textIn, this, confidence);
1111    return (confidence > 0);
1112}
1113
1114CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1115{
1116    // nothing to do
1117}
1118
1119const char *CharsetRecog_windows_1251::getName() const
1120{
1121    return  "windows-1251";
1122}
1123
1124const char *CharsetRecog_windows_1251::getLanguage() const
1125{
1126    return "ru";
1127}
1128
1129UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
1130{
1131    int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
1132    results->set(textIn, this, confidence);
1133    return (confidence > 0);
1134}
1135
1136CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1137{
1138    // nothing to do
1139}
1140
1141const char *CharsetRecog_KOI8_R::getName() const
1142{
1143    return  "KOI8-R";
1144}
1145
1146const char *CharsetRecog_KOI8_R::getLanguage() const
1147{
1148    return "ru";
1149}
1150
1151UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
1152{
1153    int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
1154    results->set(textIn, this, confidence);
1155    return (confidence > 0);
1156}
1157
1158CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1159{
1160    // nothing to do
1161}
1162
1163const char *CharsetRecog_IBM424_he::getLanguage() const
1164{
1165    return "he";
1166}
1167
1168CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1169{
1170    // nothing to do
1171}
1172
1173const char *CharsetRecog_IBM424_he_rtl::getName() const
1174{
1175    return  "IBM424_rtl";
1176}
1177
1178UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
1179{
1180    int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
1181    results->set(textIn, this, confidence);
1182    return (confidence > 0);
1183}
1184
1185CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1186{
1187    // nothing to do
1188}
1189
1190const char *CharsetRecog_IBM424_he_ltr::getName() const
1191{
1192    return  "IBM424_ltr";
1193}
1194
1195UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
1196{
1197    int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
1198    results->set(textIn, this, confidence);
1199    return (confidence > 0);
1200}
1201
1202CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1203{
1204    // nothing to do
1205}
1206
1207const char *CharsetRecog_IBM420_ar::getLanguage() const
1208{
1209    return "ar";
1210}
1211
1212
1213int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[],  const uint8_t byteMap[]) const
1214{
1215    NGramParser_IBM420 parser(ngrams, byteMap);
1216    int32_t result;
1217
1218    result = parser.parse(det);
1219
1220    return result;
1221}
1222
1223CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1224{
1225    // nothing to do
1226}
1227
1228const char *CharsetRecog_IBM420_ar_rtl::getName() const
1229{
1230    return  "IBM420_rtl";
1231}
1232
1233UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
1234{
1235    int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
1236    results->set(textIn, this, confidence);
1237    return (confidence > 0);
1238}
1239
1240CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1241{
1242    // nothing to do
1243}
1244
1245const char *CharsetRecog_IBM420_ar_ltr::getName() const
1246{
1247    return  "IBM420_ltr";
1248}
1249
1250UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
1251{
1252    int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
1253    results->set(textIn, this, confidence);
1254    return (confidence > 0);
1255}
1256
1257U_NAMESPACE_END
1258#endif
1259
1260