1/*
2*******************************************************************************
3*
4*   Copyright (C) 2001-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucol_bld.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created 02/22/2001
14*   created by: Vladimir Weinstein
15*
16* This module builds a collator based on the rule set.
17*
18*/
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_COLLATION
23
24#include "unicode/ucoleitr.h"
25#include "unicode/udata.h"
26#include "unicode/uchar.h"
27#include "unicode/uniset.h"
28#include "unicode/uscript.h"
29#include "unicode/ustring.h"
30#include "unicode/utf16.h"
31#include "normalizer2impl.h"
32#include "ucol_bld.h"
33#include "ucol_elm.h"
34#include "ucol_cnt.h"
35#include "ucln_in.h"
36#include "umutex.h"
37#include "cmemory.h"
38#include "cstring.h"
39
40#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
41
42static const InverseUCATableHeader* _staticInvUCA = NULL;
43static UDataMemory* invUCA_DATA_MEM = NULL;
44
45U_CDECL_BEGIN
46static UBool U_CALLCONV
47isAcceptableInvUCA(void * /*context*/,
48                   const char * /*type*/, const char * /*name*/,
49                   const UDataInfo *pInfo)
50{
51    /* context, type & name are intentionally not used */
52    if( pInfo->size>=20 &&
53        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
54        pInfo->charsetFamily==U_CHARSET_FAMILY &&
55        pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
56        pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
57        pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
58        pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
59        pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
60        pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
61        //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
62        //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
63        //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
64        )
65    {
66        UVersionInfo UCDVersion;
67        u_getUnicodeVersion(UCDVersion);
68        return (pInfo->dataVersion[0]==UCDVersion[0] &&
69            pInfo->dataVersion[1]==UCDVersion[1]);
70            //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
71            //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
72            //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
73    } else {
74        return FALSE;
75    }
76}
77U_CDECL_END
78
79/*
80* Takes two CEs (lead and continuation) and
81* compares them as CEs should be compared:
82* primary vs. primary, secondary vs. secondary
83* tertiary vs. tertiary
84*/
85static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
86    uint32_t s1 = source0, s2, t1 = target0, t2;
87    if(isContinuation(source1)) {
88        s2 = source1;
89    } else {
90        s2 = 0;
91    }
92    if(isContinuation(target1)) {
93        t2 = target1;
94    } else {
95        t2 = 0;
96    }
97
98    uint32_t s = 0, t = 0;
99    if(s1 == t1 && s2 == t2) {
100        return 0;
101    }
102    s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
103    t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
104    if(s < t) {
105        return -1;
106    } else if(s > t) {
107        return 1;
108    } else {
109        s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
110        t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
111        if(s < t) {
112            return -1;
113        } else if(s > t) {
114            return 1;
115        } else {
116            s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
117            t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
118            if(s < t) {
119                return -1;
120            } else {
121                return 1;
122            }
123        }
124    }
125}
126
127static
128int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
129    uint32_t bottom = 0, top = src->invUCA->tableSize;
130    uint32_t i = 0;
131    uint32_t first = 0, second = 0;
132    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
133    int32_t res = 0;
134
135    while(bottom < top-1) {
136        i = (top+bottom)/2;
137        first = *(CETable+3*i);
138        second = *(CETable+3*i+1);
139        res = compareCEs(first, second, CE, SecondCE);
140        if(res > 0) {
141            top = i;
142        } else if(res < 0) {
143            bottom = i;
144        } else {
145            break;
146        }
147    }
148
149    /* weiv:                                                  */
150    /* in searching for elements, I have removed the failure  */
151    /* The reason for this is that the builder does not rely  */
152    /* on search mechanism telling it that it didn't find an  */
153    /* element. However, indirect positioning relies on being */
154    /* able to find the elements around any CE, even if it is */
155    /* not defined in the UCA. */
156    return i;
157    /*
158    if((first == CE && second == SecondCE)) {
159    return i;
160    } else {
161    return -1;
162    }
163    */
164}
165
166static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
167    0xFFFF0000,
168    0xFFFFFF00,
169    0xFFFFFFFF
170};
171
172U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
173                                            uint32_t CE, uint32_t contCE,
174                                            uint32_t *nextCE, uint32_t *nextContCE,
175                                            uint32_t strength)
176{
177    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
178    int32_t iCE;
179
180    iCE = ucol_inv_findCE(src, CE, contCE);
181
182    if(iCE<0) {
183        *nextCE = UCOL_NOT_FOUND;
184        return -1;
185    }
186
187    CE &= strengthMask[strength];
188    contCE &= strengthMask[strength];
189
190    *nextCE = CE;
191    *nextContCE = contCE;
192
193    while((*nextCE  & strengthMask[strength]) == CE
194        && (*nextContCE  & strengthMask[strength]) == contCE)
195    {
196        *nextCE = (*(CETable+3*(++iCE)));
197        *nextContCE = (*(CETable+3*(iCE)+1));
198    }
199
200    return iCE;
201}
202
203U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
204                                            uint32_t CE, uint32_t contCE,
205                                            uint32_t *prevCE, uint32_t *prevContCE,
206                                            uint32_t strength)
207{
208    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
209    int32_t iCE;
210
211    iCE = ucol_inv_findCE(src, CE, contCE);
212
213    if(iCE<0) {
214        *prevCE = UCOL_NOT_FOUND;
215        return -1;
216    }
217
218    CE &= strengthMask[strength];
219    contCE &= strengthMask[strength];
220
221    *prevCE = CE;
222    *prevContCE = contCE;
223
224    while((*prevCE  & strengthMask[strength]) == CE
225        && (*prevContCE  & strengthMask[strength])== contCE
226        && iCE > 0) /* this condition should prevent falling off the edge of the world */
227    {
228        /* here, we end up in a singularity - zero */
229        *prevCE = (*(CETable+3*(--iCE)));
230        *prevContCE = (*(CETable+3*(iCE)+1));
231    }
232
233    return iCE;
234}
235
236U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
237                                                       uint32_t prevCE, uint32_t prevContCE)
238{
239    if(prevCE == CE && prevContCE == contCE) {
240        return UCOL_IDENTICAL;
241    }
242    if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
243        || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
244    {
245        return UCOL_PRIMARY;
246    }
247    if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
248        || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
249    {
250        return UCOL_SECONDARY;
251    }
252    return UCOL_TERTIARY;
253}
254
255
256/*static
257inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
258
259    uint32_t CE = lh->baseCE;
260    uint32_t SecondCE = lh->baseContCE;
261
262    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
263    uint32_t previousCE, previousContCE;
264    int32_t iCE;
265
266    iCE = ucol_inv_findCE(src, CE, SecondCE);
267
268    if(iCE<0) {
269        return -1;
270    }
271
272    CE &= strengthMask[strength];
273    SecondCE &= strengthMask[strength];
274
275    previousCE = CE;
276    previousContCE = SecondCE;
277
278    while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
279        previousCE = (*(CETable+3*(--iCE)));
280        previousContCE = (*(CETable+3*(iCE)+1));
281    }
282    lh->previousCE = previousCE;
283    lh->previousContCE = previousContCE;
284
285    return iCE;
286}*/
287
288static
289inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
290    uint32_t CE = lh->baseCE;
291    uint32_t SecondCE = lh->baseContCE;
292
293    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
294    uint32_t nextCE, nextContCE;
295    int32_t iCE;
296
297    iCE = ucol_inv_findCE(src, CE, SecondCE);
298
299    if(iCE<0) {
300        return -1;
301    }
302
303    CE &= strengthMask[strength];
304    SecondCE &= strengthMask[strength];
305
306    nextCE = CE;
307    nextContCE = SecondCE;
308
309    while((nextCE  & strengthMask[strength]) == CE
310        && (nextContCE  & strengthMask[strength]) == SecondCE)
311    {
312        nextCE = (*(CETable+3*(++iCE)));
313        nextContCE = (*(CETable+3*(iCE)+1));
314    }
315
316    lh->nextCE = nextCE;
317    lh->nextContCE = nextContCE;
318
319    return iCE;
320}
321
322static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
323    /* reset all the gaps */
324    int32_t i = 0;
325    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
326    uint32_t st = 0;
327    uint32_t t1, t2;
328    int32_t pos;
329
330    UColToken *tok = lh->first;
331    uint32_t tokStrength = tok->strength;
332
333    for(i = 0; i<3; i++) {
334        lh->gapsHi[3*i] = 0;
335        lh->gapsHi[3*i+1] = 0;
336        lh->gapsHi[3*i+2] = 0;
337        lh->gapsLo[3*i] = 0;
338        lh->gapsLo[3*i+1] = 0;
339        lh->gapsLo[3*i+2] = 0;
340        lh->numStr[i] = 0;
341        lh->fStrToken[i] = NULL;
342        lh->lStrToken[i] = NULL;
343        lh->pos[i] = -1;
344    }
345
346    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
347
348    if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
349        //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
350        lh->pos[0] = 0;
351        t1 = lh->baseCE;
352        t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
353        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
354        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
355        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
356        uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
357        primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
358
359        t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
360        t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
361
362        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
363        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
364        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
365    } else if(lh->indirect == TRUE && lh->nextCE != 0) {
366        //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
367        lh->pos[0] = 0;
368        t1 = lh->baseCE;
369        t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
370        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
371        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
372        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
373        t1 = lh->nextCE;
374        t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
375        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
376        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
377        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
378    } else {
379        for(;;) {
380            if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
381                if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
382                    lh->fStrToken[tokStrength] = tok;
383                } else { /* The CE must be implicit, since it's not in the table */
384                    /* Error */
385                    *status = U_INTERNAL_PROGRAM_ERROR;
386                }
387            }
388
389            while(tok != NULL && tok->strength >= tokStrength) {
390                if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
391                    lh->lStrToken[tokStrength] = tok;
392                }
393                tok = tok->next;
394            }
395            if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
396                /* check if previous interval is the same and merge the intervals if it is so */
397                if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
398                    lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
399                    lh->fStrToken[tokStrength+1] = NULL;
400                    lh->lStrToken[tokStrength+1] = NULL;
401                    lh->pos[tokStrength+1] = -1;
402                }
403            }
404            if(tok != NULL) {
405                tokStrength = tok->strength;
406            } else {
407                break;
408            }
409        }
410        for(st = 0; st < 3; st++) {
411            if((pos = lh->pos[st]) >= 0) {
412                t1 = *(CETable+3*(pos));
413                t2 = *(CETable+3*(pos)+1);
414                lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
415                lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
416                //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
417                lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
418                //pos--;
419                //t1 = *(CETable+3*(pos));
420                //t2 = *(CETable+3*(pos)+1);
421                t1 = lh->baseCE;
422                t2 = lh->baseContCE;
423                lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
424                lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
425                lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
426            }
427        }
428    }
429}
430
431
432#define ucol_countBytes(value, noOfBytes)   \
433{                               \
434    uint32_t mask = 0xFFFFFFFF;   \
435    (noOfBytes) = 0;              \
436    while(mask != 0) {            \
437    if(((value) & mask) != 0) { \
438    (noOfBytes)++;            \
439    }                           \
440    mask >>= 8;                 \
441    }                             \
442}
443
444static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
445    if(U_SUCCESS(*status)) {
446        g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
447    }
448    return g->current;
449}
450
451static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
452    /* TODO: rename to enum names */
453    uint32_t high, low, count=1;
454    uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
455
456    if(strength == UCOL_SECONDARY) {
457        low = UCOL_COMMON_TOP2<<24;
458        high = 0xFFFFFFFF;
459        count = 0xFF - UCOL_COMMON_TOP2;
460    } else {
461        low = UCOL_BYTE_COMMON << 24; //0x05000000;
462        high = 0x40000000;
463        count = 0x40 - UCOL_BYTE_COMMON;
464    }
465
466    if(tok->next != NULL && tok->next->strength == strength) {
467        count = tok->next->toInsert;
468    }
469
470    g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
471    g->current = UCOL_BYTE_COMMON<<24;
472
473    if(g->noOfRanges == 0) {
474        *status = U_INTERNAL_PROGRAM_ERROR;
475    }
476    return g->current;
477}
478
479static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
480    uint32_t strength = tok->strength;
481    uint32_t low = lows[fStrength*3+strength];
482    uint32_t high = highs[fStrength*3+strength];
483    uint32_t maxByte = 0;
484    if(strength == UCOL_TERTIARY) {
485        maxByte = 0x3F;
486    } else if(strength == UCOL_PRIMARY) {
487        maxByte = 0xFE;
488    } else {
489        maxByte = 0xFF;
490    }
491
492    uint32_t count = tok->toInsert;
493
494    if(low >= high && strength > UCOL_PRIMARY) {
495        int32_t s = strength;
496        for(;;) {
497            s--;
498            if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
499                if(strength == UCOL_SECONDARY) {
500                    if (low < UCOL_COMMON_TOP2<<24 ) {
501                       // Override if low range is less than UCOL_COMMON_TOP2.
502		        low = UCOL_COMMON_TOP2<<24;
503                    }
504                    high = 0xFFFFFFFF;
505                } else {
506                    // Override if low range is less than UCOL_COMMON_BOT3.
507		    if ( low < UCOL_COMMON_BOT3<<24 ) {
508                        low = UCOL_COMMON_BOT3<<24;
509		    }
510                    high = 0x40000000;
511                }
512                break;
513            }
514            if(s<0) {
515                *status = U_INTERNAL_PROGRAM_ERROR;
516                return 0;
517            }
518        }
519    }
520
521    if(low < 0x02000000) {
522        // We must not use CE weight byte 02, so we set it as the minimum lower bound.
523        // See http://site.icu-project.org/design/collation/bytes
524        low = 0x02000000;
525    }
526
527    if(strength == UCOL_SECONDARY) { /* similar as simple */
528        if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
529            low = UCOL_COMMON_TOP2<<24;
530        }
531        if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
532            high = UCOL_COMMON_TOP2<<24;
533        }
534        if(low < (UCOL_COMMON_BOT2<<24)) {
535            g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
536            g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
537            //g->current = UCOL_COMMON_BOT2<<24;
538            return g->current;
539        }
540    }
541
542    g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
543    if(g->noOfRanges == 0) {
544        *status = U_INTERNAL_PROGRAM_ERROR;
545    }
546    g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
547    return g->current;
548}
549
550static
551uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
552    uint32_t i = 0;
553    UChar c;
554
555    if(U_FAILURE(*status)) {
556        return 0;
557    }
558
559    if(sourceLen > resLen) {
560        *status = U_MEMORY_ALLOCATION_ERROR;
561        return 0;
562    }
563
564    for(i = 0; i < sourceLen; i++) {
565        c = source[i];
566        if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
567            switch(c - 0x3000) {
568            case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
569            case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
570                c++;
571                break;
572            case 0xF5:
573                c = 0x30AB;
574                break;
575            case 0xF6:
576                c = 0x30B1;
577                break;
578            }
579        }
580        resBuf[i] = c;
581    }
582    return sourceLen;
583}
584
585static
586uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
587    uint32_t i = 0;
588    UChar c;
589
590    if(U_FAILURE(*status)) {
591        return 0;
592    }
593
594    if(sourceLen > resLen) {
595        *status = U_MEMORY_ALLOCATION_ERROR;
596        return 0;
597    }
598
599    for(i = 0; i < sourceLen; i++) {
600        c = source[i];
601        if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
602            switch(c - 0x3000) {
603            case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
604            case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
605                c--;
606                break;
607            case 0xAB:
608                c = 0x30F5;
609                break;
610            case 0xB1:
611                c = 0x30F6;
612                break;
613            }
614        }
615        resBuf[i] = c;
616    }
617    return sourceLen;
618}
619
620U_NAMESPACE_BEGIN
621
622static
623uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
624    uint32_t i = 0;
625    UChar n[128];
626    uint32_t nLen = 0;
627    uint32_t uCount = 0, lCount = 0;
628
629    collIterate s;
630    uint32_t order = 0;
631
632    if(U_FAILURE(*status)) {
633        return UCOL_LOWER_CASE;
634    }
635
636    nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
637    if(U_SUCCESS(*status)) {
638        for(i = 0; i < nLen; i++) {
639            uprv_init_collIterate(UCA, &n[i], 1, &s, status);
640            order = ucol_getNextCE(UCA, &s, status);
641            if(isContinuation(order)) {
642                *status = U_INTERNAL_PROGRAM_ERROR;
643                return UCOL_LOWER_CASE;
644            }
645            if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
646                uCount++;
647            } else {
648                if(u_islower(n[i])) {
649                    lCount++;
650                } else if(U_SUCCESS(*status)) {
651                    UChar sk[1], lk[1];
652                    u_toSmallKana(&n[i], 1, sk, 1, status);
653                    u_toLargeKana(&n[i], 1, lk, 1, status);
654                    if(sk[0] == n[i] && lk[0] != n[i]) {
655                        lCount++;
656                    }
657                }
658            }
659        }
660    }
661
662    if(uCount != 0 && lCount != 0) {
663        return UCOL_MIXED_CASE;
664    } else if(uCount != 0) {
665        return UCOL_UPPER_CASE;
666    } else {
667        return UCOL_LOWER_CASE;
668    }
669}
670
671
672U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
673    /* this one makes the table and stuff */
674    uint32_t noOfBytes[3];
675    uint32_t i;
676
677    for(i = 0; i<3; i++) {
678        ucol_countBytes(CEparts[i], noOfBytes[i]);
679    }
680
681    /* Here we have to pack CEs from parts */
682
683    uint32_t CEi = 0;
684    uint32_t value = 0;
685
686    while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
687        if(CEi > 0) {
688            value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
689        } else {
690            value = 0;
691        }
692
693        if(2*CEi<noOfBytes[0]) {
694            value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
695        }
696        if(CEi<noOfBytes[1]) {
697            value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
698        }
699        if(CEi<noOfBytes[2]) {
700            value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
701        }
702        tok->CEs[CEi] = value;
703        CEi++;
704    }
705    if(CEi == 0) { /* totally ignorable */
706        tok->noOfCEs = 1;
707        tok->CEs[0] = 0;
708    } else { /* there is at least something */
709        tok->noOfCEs = CEi;
710    }
711
712
713    // we want to set case bits here and now, not later.
714    // Case bits handling
715    if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
716        tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
717        int32_t cSize = (tok->source & 0xFF000000) >> 24;
718        UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
719
720        if(cSize > 1) {
721            // Do it manually
722            tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
723        } else {
724            // Copy it from the UCA
725            uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
726            tok->CEs[0] |= (caseCE & 0xC0);
727        }
728    }
729
730#if UCOL_DEBUG==2
731    fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
732    for(i = 0; i<tok->noOfCEs; i++) {
733        fprintf(stderr, "%08X ", tok->CEs[i]);
734    }
735    fprintf(stderr, "\n");
736#endif
737}
738
739U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
740    ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
741    uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
742
743    UColToken *tok = lh->last;
744    uint32_t t[UCOL_STRENGTH_LIMIT];
745
746    uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
747
748    /* must initialize ranges to avoid memory check warnings */
749    for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
750        uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
751    }
752
753    tok->toInsert = 1;
754    t[tok->strength] = 1;
755
756    while(tok->previous != NULL) {
757        if(tok->previous->strength < tok->strength) { /* going up */
758            t[tok->strength] = 0;
759            t[tok->previous->strength]++;
760        } else if(tok->previous->strength > tok->strength) { /* going down */
761            t[tok->previous->strength] = 1;
762        } else {
763            t[tok->strength]++;
764        }
765        tok=tok->previous;
766        tok->toInsert = t[tok->strength];
767    }
768
769    tok->toInsert = t[tok->strength];
770    ucol_inv_getGapPositions(src, lh, status);
771
772#if UCOL_DEBUG
773    fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
774    int32_t j = 2;
775    for(j = 2; j >= 0; j--) {
776        fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
777        fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
778    }
779    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
780
781    do {
782        fprintf(stderr,"%i", tok->strength);
783        tok = tok->next;
784    } while(tok != NULL);
785    fprintf(stderr, "\n");
786
787    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
788
789    do {
790        fprintf(stderr,"%i", tok->toInsert);
791        tok = tok->next;
792    } while(tok != NULL);
793#endif
794
795    tok = lh->first;
796    uint32_t fStrength = UCOL_IDENTICAL;
797    uint32_t initStrength = UCOL_IDENTICAL;
798
799
800    CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
801    CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
802    CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
803
804    while (tok != NULL && U_SUCCESS(*status)) {
805        fStrength = tok->strength;
806        if(fStrength < initStrength) {
807            initStrength = fStrength;
808            if(lh->pos[fStrength] == -1) {
809                while(lh->pos[fStrength] == -1 && fStrength > 0) {
810                    fStrength--;
811                }
812                if(lh->pos[fStrength] == -1) {
813                    *status = U_INTERNAL_PROGRAM_ERROR;
814                    return;
815                }
816            }
817            if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
818                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
819                CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
820                /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
821                CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
822            } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
823                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
824                /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
825                CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
826                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
827            } else { /* primaries */
828                /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
829                CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
830                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
831                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
832            }
833        } else {
834            if(tok->strength == UCOL_TERTIARY) {
835                CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
836            } else if(tok->strength == UCOL_SECONDARY) {
837                CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
838                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
839            } else if(tok->strength == UCOL_PRIMARY) {
840                CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
841                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
842                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
843            }
844        }
845        ucol_doCE(src, CEparts, tok, status);
846        tok = tok->next;
847    }
848}
849
850U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
851    UCAElements el;
852    UColToken *tok = lh->first;
853    UColToken *expt = NULL;
854    uint32_t i = 0, j = 0;
855    const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
856
857    while(tok != NULL && U_SUCCESS(*status)) {
858        /* first, check if there are any expansions */
859        /* if there are expansions, we need to do a little bit more processing */
860        /* since parts of expansion can be tailored, while others are not */
861        if(tok->expansion != 0) {
862            uint32_t len = tok->expansion >> 24;
863            uint32_t currentSequenceLen = len;
864            uint32_t expOffset = tok->expansion & 0x00FFFFFF;
865            //uint32_t exp = currentSequenceLen | expOffset;
866            UColToken exp;
867            exp.source = currentSequenceLen | expOffset;
868            exp.rulesToParseHdl = &(src->source);
869
870            while(len > 0) {
871                currentSequenceLen = len;
872                while(currentSequenceLen > 0) {
873                    exp.source = (currentSequenceLen << 24) | expOffset;
874                    if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
875                        uint32_t noOfCEsToCopy = expt->noOfCEs;
876                        for(j = 0; j<noOfCEsToCopy; j++) {
877                            tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
878                        }
879                        tok->noOfExpCEs += noOfCEsToCopy;
880                        // Smart people never try to add codepoints and CEs.
881                        // For some odd reason, it won't work.
882                        expOffset += currentSequenceLen; //noOfCEsToCopy;
883                        len -= currentSequenceLen; //noOfCEsToCopy;
884                        break;
885                    } else {
886                        currentSequenceLen--;
887                    }
888                }
889                if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
890                    /* will have to get one from UCA */
891                    /* first, get the UChars from the rules */
892                    /* then pick CEs out until there is no more and stuff them into expansion */
893                    collIterate s;
894                    uint32_t order = 0;
895                    uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
896
897                    for(;;) {
898                        order = ucol_getNextCE(src->UCA, &s, status);
899                        if(order == UCOL_NO_MORE_CES) {
900                            break;
901                        }
902                        tok->expCEs[tok->noOfExpCEs++] = order;
903                    }
904                    expOffset++;
905                    len--;
906                }
907            }
908        } else {
909            tok->noOfExpCEs = 0;
910        }
911
912        /* set the ucaelement with obtained values */
913        el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
914        /* copy CEs */
915        for(i = 0; i<tok->noOfCEs; i++) {
916            el.CEs[i] = tok->CEs[i];
917        }
918        for(i = 0; i<tok->noOfExpCEs; i++) {
919            el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
920        }
921
922        /* copy UChars */
923        // We kept prefix and source kind of together, as it is a kind of a contraction.
924        // However, now we have to slice the prefix off the main thing -
925        el.prefix = el.prefixChars;
926        el.cPoints = el.uchars;
927        if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
928            // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
929            // decomposed elements to the unsaf table.
930            el.prefixSize = tok->prefix>>24;
931            uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
932
933            el.cSize = (tok->source >> 24)-(tok->prefix>>24);
934            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
935        } else {
936            el.prefixSize = 0;
937            *el.prefix = 0;
938
939            el.cSize = (tok->source >> 24);
940            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
941        }
942        if(src->UCA != NULL) {
943            for(i = 0; i<el.cSize; i++) {
944                if(UCOL_ISJAMO(el.cPoints[i])) {
945                    t->image->jamoSpecial = TRUE;
946                }
947            }
948            if (!src->buildCCTabFlag && el.cSize > 0) {
949                // Check the trailing canonical combining class (tccc) of the last character.
950                const UChar *s = el.cPoints + el.cSize;
951                uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
952                if ((fcd & 0xff) != 0) {
953                    src->buildCCTabFlag = TRUE;
954                }
955            }
956        }
957
958        /* and then, add it */
959#if UCOL_DEBUG==2
960        fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
961#endif
962        uprv_uca_addAnElement(t, &el, status);
963
964#if UCOL_DEBUG_DUPLICATES
965        if(*status != U_ZERO_ERROR) {
966            fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
967            *status = U_ZERO_ERROR;
968        }
969#endif
970
971        tok = tok->next;
972    }
973}
974
975U_CDECL_BEGIN
976static UBool U_CALLCONV
977_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
978    UErrorCode status = U_ZERO_ERROR;
979    tempUCATable *t = (tempUCATable *)context;
980    if(value == 0) {
981        while(start < limit) {
982            uint32_t CE = utrie_get32(t->mapping, start, NULL);
983            if(CE == UCOL_NOT_FOUND) {
984                UCAElements el;
985                el.isThai = FALSE;
986                el.prefixSize = 0;
987                el.prefixChars[0] = 0;
988                el.prefix = el.prefixChars;
989                el.cPoints = el.uchars;
990
991                el.cSize = 0;
992                U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
993
994                el.noOfCEs = 1;
995                el.CEs[0] = 0;
996                uprv_uca_addAnElement(t, &el, &status);
997
998            }
999            start++;
1000        }
1001    }
1002    if(U_FAILURE(status)) {
1003        return FALSE;
1004    } else {
1005        return TRUE;
1006    }
1007}
1008U_CDECL_END
1009
1010static void
1011ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1012                               UChar32 start, UChar32 end,
1013                               UErrorCode *status)
1014{
1015    //UChar decomp[256];
1016    uint32_t CE = UCOL_NOT_FOUND;
1017    UChar32 u = 0;
1018    UCAElements el;
1019    el.isThai = FALSE;
1020    el.prefixSize = 0;
1021    el.prefixChars[0] = 0;
1022    collIterate colIt;
1023
1024    if(U_SUCCESS(*status)) {
1025        for(u = start; u<=end; u++) {
1026            if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1027                /* this test is for contractions that are missing the starting element. */
1028                || ((isCntTableElement(CE)) &&
1029                (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1030                )
1031            {
1032                el.cSize = 0;
1033                U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1034                //decomp[0] = (UChar)u;
1035                //el.uchars[0] = (UChar)u;
1036                el.cPoints = el.uchars;
1037                //el.cSize = 1;
1038                el.noOfCEs = 0;
1039                el.prefix = el.prefixChars;
1040                el.prefixSize = 0;
1041                //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1042                // We actually want to check whether this element is a special
1043                // If it is an implicit element (hangul, CJK - we want to copy the
1044                // special, not the resolved CEs) - for hangul, copying resolved
1045                // would just make things the same (there is an expansion and it
1046                // takes approximately the same amount of time to resolve as
1047                // falling back to the UCA).
1048                /*
1049                UTRIE_GET32(src->UCA->mapping, u, CE);
1050                tag = getCETag(CE);
1051                if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1052                || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1053                || tag == LEAD_SURROGATE_TAG) {
1054                el.CEs[el.noOfCEs++] = CE;
1055                } else {
1056                */
1057                // It turns out that it does not make sense to keep implicits
1058                // unresolved. The cost of resolving them is big enough so that
1059                // it doesn't make any difference whether we have to go to the UCA
1060                // or not.
1061                {
1062                    uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1063                    while(CE != UCOL_NO_MORE_CES) {
1064                        CE = ucol_getNextCE(src->UCA, &colIt, status);
1065                        if(CE != UCOL_NO_MORE_CES) {
1066                            el.CEs[el.noOfCEs++] = CE;
1067                        }
1068                    }
1069                }
1070                uprv_uca_addAnElement(t, &el, status);
1071            }
1072        }
1073    }
1074}
1075
1076U_NAMESPACE_END
1077
1078U_CFUNC UCATableHeader *
1079ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1080    U_NAMESPACE_USE
1081
1082    uint32_t i = 0;
1083    if(U_FAILURE(*status)) {
1084        return NULL;
1085    }
1086    /*
1087    2.  Eliminate the negative lists by doing the following for each non-null negative list:
1088    o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1089    create new ListHeader X
1090    o   reverse the list, add to the end of X's positive list. Reset the strength of the
1091    first item you add, based on the stronger strength levels of the two lists.
1092    */
1093    /*
1094    3.  For each ListHeader with a non-null positive list:
1095    */
1096    /*
1097    o   Find all character strings with CEs between the baseCE and the
1098    next/previous CE, at the strength of the first token. Add these to the
1099    tailoring.
1100    ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1101    tailoring has & x < z...
1102    ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1103    */
1104    /* It is possible that this part should be done even while constructing list */
1105    /* The problem is that it is unknown what is going to be the strongest weight */
1106    /* So we might as well do it here */
1107
1108    /*
1109    o   Allocate CEs for each token in the list, based on the total number N of the
1110    largest level difference, and the gap G between baseCE and nextCE at that
1111    level. The relation * between the last item and nextCE is the same as the
1112    strongest strength.
1113    o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1114    ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1115    Then fit b and c into the secondary gap between a and d, then fit q
1116    into the tertiary gap between b and c.
1117
1118    o   Example: baseCE << b <<< q << c * nextCE(X,2)
1119    ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1120    Then fit q into the tertiary gap between b and c.
1121    o   When incrementing primary values, we will not cross high byte
1122    boundaries except where there is only a single-byte primary. That is to
1123    ensure that the script reordering will continue to work.
1124    */
1125    UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1126    /* test for NULL */
1127    if (image == NULL) {
1128        *status = U_MEMORY_ALLOCATION_ERROR;
1129        return NULL;
1130    }
1131    uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1132
1133    for(i = 0; i<src->resultLen; i++) {
1134        /* now we need to generate the CEs */
1135        /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1136        /* According to strength                                                          */
1137        if(U_SUCCESS(*status)) {
1138            if(src->lh[i].first) { // if there are any elements
1139                // due to the way parser works, subsequent tailorings
1140                // may remove all the elements from a sequence, therefore
1141                // leaving an empty tailoring sequence.
1142                ucol_initBuffers(src, &src->lh[i], status);
1143            }
1144        }
1145        if(U_FAILURE(*status)) {
1146            uprv_free(image);
1147            return NULL;
1148        }
1149    }
1150
1151    if(src->varTop != NULL) { /* stuff the variable top value */
1152        src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1153        /* remove it from the list */
1154        if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1155            src->varTop->listHeader->first = src->varTop->next;
1156        }
1157        if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1158            src->varTop->listHeader->last = src->varTop->previous;
1159        }
1160        if(src->varTop->next != NULL) {
1161            src->varTop->next->previous = src->varTop->previous;
1162        }
1163        if(src->varTop->previous != NULL) {
1164            src->varTop->previous->next = src->varTop->next;
1165        }
1166    }
1167
1168
1169    tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1170    if(U_FAILURE(*status)) {
1171        uprv_free(image);
1172        return NULL;
1173    }
1174
1175
1176    /* After this, we have assigned CE values to all regular CEs      */
1177    /* now we will go through list once more and resolve expansions,  */
1178    /* make UCAElements structs and add them to table                 */
1179    for(i = 0; i<src->resultLen; i++) {
1180        /* now we need to generate the CEs */
1181        /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1182        /* According to strength                                                          */
1183        if(U_SUCCESS(*status)) {
1184            ucol_createElements(src, t, &src->lh[i], status);
1185        }
1186    }
1187
1188    UCAElements el;
1189    el.isThai = FALSE;
1190    el.prefixSize = 0;
1191    el.prefixChars[0] = 0;
1192
1193    /* add latin-1 stuff */
1194    ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1195
1196    /* add stuff for copying */
1197    if(src->copySet != NULL) {
1198        int32_t i = 0;
1199        UnicodeSet *set = (UnicodeSet *)src->copySet;
1200        for(i = 0; i < set->getRangeCount(); i++) {
1201            ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1202        }
1203    }
1204
1205    if(U_SUCCESS(*status)) {
1206        /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1207
1208        uint32_t tailoredCE = UCOL_NOT_FOUND;
1209        UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1210        int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
1211        UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1212        // Check for null pointer
1213        if (ucaEl == NULL) {
1214            *status = U_MEMORY_ALLOCATION_ERROR;
1215            return NULL;
1216        }
1217        while(*conts != 0) {
1218            // A continuation is NUL-terminated and NUL-padded
1219            // except if it has the maximum length.
1220            int32_t contractionLength = maxUCAContractionLength;
1221            while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1222                --contractionLength;
1223            }
1224            UChar32 first;
1225            int32_t firstLength = 0;
1226            U16_NEXT(conts, firstLength, contractionLength, first);
1227            tailoredCE = utrie_get32(t->mapping, first, NULL);
1228            if(tailoredCE != UCOL_NOT_FOUND) {
1229                UBool needToAdd = TRUE;
1230                if(isCntTableElement(tailoredCE)) {
1231                    if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
1232                        needToAdd = FALSE;
1233                    }
1234                }
1235                if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1236                    UCAElements elm;
1237                    elm.cPoints = el.uchars;
1238                    elm.noOfCEs = 0;
1239                    elm.uchars[0] = *conts;
1240                    elm.uchars[1] = 0;
1241                    elm.cSize = 1;
1242                    elm.prefixChars[0] = *(conts+2);
1243                    elm.isThai = FALSE;
1244                    elm.prefix = elm.prefixChars;
1245                    elm.prefixSize = 1;
1246                    UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1247                    if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1248                        needToAdd = TRUE;
1249                    }
1250                }
1251                if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1252                    needToAdd = FALSE;
1253                }
1254
1255                if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1256                    if (*(conts+1) != 0) {  // contractions
1257                        el.prefix = el.prefixChars;
1258                        el.prefixSize = 0;
1259                        el.cPoints = el.uchars;
1260                        el.noOfCEs = 0;
1261                        u_memcpy(el.uchars, conts, contractionLength);
1262                        el.cSize = contractionLength;
1263                        ucol_setText(ucaEl, el.uchars, el.cSize, status);
1264                    }
1265                    else { // pre-context character
1266                        UChar str[4] = { 0 };
1267                        int32_t len=0;
1268                        int32_t preKeyLen=0;
1269
1270                        el.cPoints = el.uchars;
1271                        el.noOfCEs = 0;
1272                        el.uchars[0] = *conts;
1273                        el.uchars[1] = 0;
1274                        el.cSize = 1;
1275                        el.prefixChars[0] = *(conts+2);
1276                        el.prefix = el.prefixChars;
1277                        el.prefixSize = 1;
1278                        if (el.prefixChars[0]!=0) {
1279                            // get CE of prefix character first
1280                            str[0]=el.prefixChars[0];
1281                            str[1]=0;
1282                            ucol_setText(ucaEl, str, 1, status);
1283                            while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1284                                    != UCOL_NULLORDER) {
1285                                preKeyLen++;  // count number of keys for prefix character
1286                            }
1287                            str[len++] = el.prefixChars[0];
1288                        }
1289
1290                        str[len++] = el.uchars[0];
1291                        str[len]=0;
1292                        ucol_setText(ucaEl, str, len, status);
1293                        // Skip the keys for prefix character, then copy the rest to el.
1294                        while ((preKeyLen-->0) &&
1295                               (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1296                            continue;
1297                        }
1298
1299                    }
1300                    while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1301                        el.noOfCEs++;
1302                    }
1303                    uprv_uca_addAnElement(t, &el, status);
1304                }
1305
1306            } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1307                ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
1308            }
1309            conts+=maxUCAContractionLength;
1310        }
1311        ucol_closeElements(ucaEl);
1312    }
1313
1314    // Add completely ignorable elements
1315    utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1316
1317    // add tailoring characters related canonical closures
1318    uprv_uca_canonicalClosure(t, src, NULL, status);
1319
1320    /* still need to produce compatibility closure */
1321
1322    UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1323
1324    uprv_uca_closeTempTable(t);
1325    uprv_free(image);
1326
1327    return myData;
1328}
1329
1330U_CDECL_BEGIN
1331static UBool U_CALLCONV
1332ucol_bld_cleanup(void)
1333{
1334    udata_close(invUCA_DATA_MEM);
1335    invUCA_DATA_MEM = NULL;
1336    _staticInvUCA = NULL;
1337    return TRUE;
1338}
1339U_CDECL_END
1340
1341U_CAPI const InverseUCATableHeader * U_EXPORT2
1342ucol_initInverseUCA(UErrorCode *status)
1343{
1344    if(U_FAILURE(*status)) return NULL;
1345
1346    UBool needsInit;
1347    UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1348
1349    if(needsInit) {
1350        InverseUCATableHeader *newInvUCA = NULL;
1351        UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1352
1353        if(U_FAILURE(*status)) {
1354            if (result) {
1355                udata_close(result);
1356            }
1357            // This is not needed, as we are talking about
1358            // memory we got from UData
1359            //uprv_free(newInvUCA);
1360        }
1361
1362        if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1363            newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1364            UCollator *UCA = ucol_initUCA(status);
1365            // UCA versions of UCA and inverse UCA should match
1366            if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1367                *status = U_INVALID_FORMAT_ERROR;
1368                udata_close(result);
1369                return NULL;
1370            }
1371
1372            umtx_lock(NULL);
1373            if(_staticInvUCA == NULL) {
1374                invUCA_DATA_MEM = result;
1375                _staticInvUCA = newInvUCA;
1376                result = NULL;
1377                newInvUCA = NULL;
1378            }
1379            umtx_unlock(NULL);
1380
1381            if(newInvUCA != NULL) {
1382                udata_close(result);
1383                // This is not needed, as we are talking about
1384                // memory we got from UData
1385                //uprv_free(newInvUCA);
1386            }
1387            else {
1388                ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1389            }
1390        }
1391    }
1392    return _staticInvUCA;
1393}
1394
1395/* This is the data that is used for non-script reordering codes. These _must_ be kept
1396 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
1397 */
1398static const char * const ReorderingTokenNames[] = {
1399    "SPACE",
1400    "PUNCT",
1401    "SYMBOL",
1402    "CURRENCY",
1403    "DIGIT"
1404};
1405
1406static void toUpper(const char* src, char* dst, uint32_t length) {
1407   for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1408       *dst = uprv_toupper(*src);
1409   }
1410   *dst = '\0';
1411}
1412
1413U_INTERNAL int32_t U_EXPORT2
1414ucol_findReorderingEntry(const char* name) {
1415    char buffer[32];
1416    toUpper(name, buffer, 32);
1417    for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
1418        if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1419            return entry + UCOL_REORDER_CODE_FIRST;
1420        }
1421    }
1422    return USCRIPT_INVALID_CODE;
1423}
1424
1425#endif /* #if !UCONFIG_NO_COLLATION */
1426