1/*
2*******************************************************************************
3*
4*   Copyright (C) 2009-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  n2builder.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2009nov25
14*   created by: Markus W. Scherer
15*
16* Builds Normalizer2 data and writes a binary .nrm file.
17* For the file format see source/common/normalizer2impl.h.
18*/
19
20#include "unicode/utypes.h"
21#include "n2builder.h"
22
23#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#if U_HAVE_STD_STRING
27#include <vector>
28#endif
29#include "unicode/errorcode.h"
30#include "unicode/localpointer.h"
31#include "unicode/putil.h"
32#include "unicode/udata.h"
33#include "unicode/uniset.h"
34#include "unicode/unistr.h"
35#include "unicode/ustring.h"
36#include "hash.h"
37#include "normalizer2impl.h"
38#include "toolutil.h"
39#include "unewdata.h"
40#include "utrie2.h"
41#include "uvectr32.h"
42
// Number of elements in a fixed-size array, as an int32_t.
// The whole expansion is parenthesized so that the cast cannot be split off
// by an operator at the use site (for example "sizeof LENGTHOF(a)").
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
44
45#if !UCONFIG_NO_NORMALIZATION
46
47/* UDataInfo cf. udata.h */
48static UDataInfo dataInfo={
49    sizeof(UDataInfo),
50    0,
51
52    U_IS_BIG_ENDIAN,
53    U_CHARSET_FAMILY,
54    U_SIZEOF_UCHAR,
55    0,
56
57    { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
58    { 2, 0, 0, 0 },             /* formatVersion */
59    { 5, 2, 0, 0 }              /* dataVersion (Unicode version) */
60};
61
62U_NAMESPACE_BEGIN
63
64class HangulIterator {
65public:
66    struct Range {
67        UChar32 start, limit;
68        uint16_t norm16;
69    };
70
71    HangulIterator() : rangeIndex(0) {}
72    const Range *nextRange() {
73        if(rangeIndex<LENGTHOF(ranges)) {
74            return ranges+rangeIndex++;
75        } else {
76            return NULL;
77        }
78    }
79    void reset() { rangeIndex=0; }
80private:
81    static const Range ranges[4];
82    int32_t rangeIndex;
83};
84
85const HangulIterator::Range HangulIterator::ranges[4]={
86    { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
87    { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
88    // JAMO_T_BASE+1: not U+11A7
89    { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
90    { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
91};
92
93struct CompositionPair {
94    CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
95    UChar32 trail, composite;
96};
97
98struct Norm {
99    enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
100
101    UBool hasMapping() const { return mappingType>REMOVED; }
102
103    // Requires hasMapping() and well-formed mapping.
104    void setMappingCP() {
105        UChar32 c;
106        if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
107            mappingCP=c;
108        } else {
109            mappingCP=U_SENTINEL;
110        }
111    }
112
113    const CompositionPair *getCompositionPairs(int32_t &length) const {
114        if(compositions==NULL) {
115            length=0;
116            return NULL;
117        } else {
118            length=compositions->size()/2;
119            return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
120        }
121    }
122
123    UnicodeString *mapping;
124    UnicodeString *rawMapping;  // non-NULL if the mapping is further decomposed
125    UChar32 mappingCP;  // >=0 if mapping to 1 code point
126    int32_t mappingPhase;
127    MappingType mappingType;
128
129    UVector32 *compositions;  // (trail, composite) pairs
130    uint8_t cc;
131    UBool combinesBack;
132    UBool hasNoCompBoundaryAfter;
133
134    enum OffsetType {
135        OFFSET_NONE,
136        // Composition for back-combining character. Allowed, but not normally used.
137        OFFSET_MAYBE_YES,
138        // Composition for a starter that does not have a decomposition mapping.
139        OFFSET_YES_YES,
140        // Round-trip mapping & composition for a starter.
141        OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
142        // Round-trip mapping for a starter that itself does not combine-forward.
143        OFFSET_YES_NO_MAPPING_ONLY,
144        // One-way mapping.
145        OFFSET_NO_NO,
146        // Delta for an algorithmic one-way mapping.
147        OFFSET_DELTA
148    };
149    enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
150    int32_t offset;
151};
152
153class Normalizer2DBEnumerator {
154public:
155    Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}
156    virtual ~Normalizer2DBEnumerator() {}
157    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;
158    Normalizer2DBEnumerator *ptr() { return this; }
159protected:
160    Normalizer2DataBuilder &builder;
161};
162
163U_CDECL_BEGIN
164
165static UBool U_CALLCONV
166enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
167    return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);
168}
169
170U_CDECL_END
171
172Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
173        phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) {
174    memset(unicodeVersion, 0, sizeof(unicodeVersion));
175    normTrie=utrie2_open(0, 0, &errorCode);
176    normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
177    norms=allocNorm();  // unused Norm struct at index 0
178    memset(indexes, 0, sizeof(indexes));
179    memset(smallFCD, 0, sizeof(smallFCD));
180}
181
182Normalizer2DataBuilder::~Normalizer2DataBuilder() {
183    utrie2_close(normTrie);
184    int32_t normsLength=utm_countItems(normMem);
185    for(int32_t i=1; i<normsLength; ++i) {
186        delete norms[i].mapping;
187        delete norms[i].rawMapping;
188        delete norms[i].compositions;
189    }
190    utm_close(normMem);
191    utrie2_close(norm16Trie);
192}
193
194void
195Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
196    UVersionInfo nullVersion={ 0, 0, 0, 0 };
197    UVersionInfo version;
198    u_versionFromString(version, v);
199    if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
200        0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
201    ) {
202        char buffer[U_MAX_VERSION_STRING_LENGTH];
203        u_versionToString(unicodeVersion, buffer);
204        fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
205                buffer, v);
206        exit(U_ILLEGAL_ARGUMENT_ERROR);
207    }
208    memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
209}
210
211Norm *Normalizer2DataBuilder::allocNorm() {
212    Norm *p=(Norm *)utm_alloc(normMem);
213    norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
214    return p;
215}
216
217/* get an existing Norm unit */
218Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {
219    uint32_t i=utrie2_get32(normTrie, c);
220    if(i==0) {
221        return NULL;
222    }
223    return norms+i;
224}
225
226const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
227    return norms[utrie2_get32(normTrie, c)];
228}
229
230/*
231 * get or create a Norm unit;
232 * get or create the intermediate trie entries for it as well
233 */
234Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {
235    uint32_t i=utrie2_get32(normTrie, c);
236    if(i!=0) {
237        return norms+i;
238    } else {
239        /* allocate Norm */
240        Norm *p=allocNorm();
241        IcuToolErrorCode errorCode("gennorm2/createNorm()");
242        utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
243        return p;
244    }
245}
246
247Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
248    if(p!=NULL) {
249        if(p->mappingType!=Norm::NONE) {
250            if( overrideHandling==OVERRIDE_NONE ||
251                (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
252            ) {
253                fprintf(stderr,
254                        "error in gennorm2 phase %d: "
255                        "not permitted to override mapping for U+%04lX from phase %d\n",
256                        (int)phase, (long)c, (int)p->mappingPhase);
257                exit(U_INVALID_FORMAT_ERROR);
258            }
259            delete p->mapping;
260            p->mapping=NULL;
261        }
262        p->mappingPhase=phase;
263    }
264    return p;
265}
266
267void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
268    overrideHandling=oh;
269    ++phase;
270}
271
272void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
273    createNorm(c)->cc=cc;
274}
275
276uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {
277    return getNormRef(c).cc;
278}
279
280static UBool isWellFormed(const UnicodeString &s) {
281    UErrorCode errorCode=U_ZERO_ERROR;
282    u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode);
283    return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
284}
285
286void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
287    if(!isWellFormed(m)) {
288        fprintf(stderr,
289                "error in gennorm2 phase %d: "
290                "illegal one-way mapping from U+%04lX to malformed string\n",
291                (int)phase, (long)c);
292        exit(U_INVALID_FORMAT_ERROR);
293    }
294    Norm *p=checkNormForMapping(createNorm(c), c);
295    p->mapping=new UnicodeString(m);
296    p->mappingType=Norm::ONE_WAY;
297    p->setMappingCP();
298}
299
300void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
301    if(U_IS_SURROGATE(c)) {
302        fprintf(stderr,
303                "error in gennorm2 phase %d: "
304                "illegal round-trip mapping from surrogate code point U+%04lX\n",
305                (int)phase, (long)c);
306        exit(U_INVALID_FORMAT_ERROR);
307    }
308    if(!isWellFormed(m)) {
309        fprintf(stderr,
310                "error in gennorm2 phase %d: "
311                "illegal round-trip mapping from U+%04lX to malformed string\n",
312                (int)phase, (long)c);
313        exit(U_INVALID_FORMAT_ERROR);
314    }
315    int32_t numCP=u_countChar32(m.getBuffer(), m.length());
316    if(numCP!=2) {
317        fprintf(stderr,
318                "error in gennorm2 phase %d: "
319                "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
320                (int)phase, (long)c, (int)numCP);
321        exit(U_INVALID_FORMAT_ERROR);
322    }
323    Norm *p=checkNormForMapping(createNorm(c), c);
324    p->mapping=new UnicodeString(m);
325    p->mappingType=Norm::ROUND_TRIP;
326    p->mappingCP=U_SENTINEL;
327}
328
329void Normalizer2DataBuilder::removeMapping(UChar32 c) {
330    Norm *p=checkNormForMapping(getNorm(c), c);
331    if(p!=NULL) {
332        p->mappingType=Norm::REMOVED;
333    }
334}
335
336class CompositionBuilder : public Normalizer2DBEnumerator {
337public:
338    CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
339    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
340        builder.addComposition(start, end, value);
341        return TRUE;
342    }
343};
344
345void
346Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
347    if(norms[value].mappingType==Norm::ROUND_TRIP) {
348        if(start!=end) {
349            fprintf(stderr,
350                    "gennorm2 error: same round-trip mapping for "
351                    "more than 1 code point U+%04lX..U+%04lX\n",
352                    (long)start, (long)end);
353            exit(U_INVALID_FORMAT_ERROR);
354        }
355        if(norms[value].cc!=0) {
356            fprintf(stderr,
357                    "gennorm2 error: "
358                    "U+%04lX has a round-trip mapping and ccc!=0, "
359                    "not possible in Unicode normalization\n",
360                    (long)start);
361            exit(U_INVALID_FORMAT_ERROR);
362        }
363        // setRoundTripMapping() ensured that there are exactly two code points.
364        const UnicodeString &m=*norms[value].mapping;
365        UChar32 lead=m.char32At(0);
366        UChar32 trail=m.char32At(m.length()-1);
367        if(getCC(lead)!=0) {
368            fprintf(stderr,
369                    "gennorm2 error: "
370                    "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
371                    "not possible in Unicode normalization\n",
372                    (long)start, (long)lead);
373            exit(U_INVALID_FORMAT_ERROR);
374        }
375        // Flag for trailing character.
376        createNorm(trail)->combinesBack=TRUE;
377        // Insert (trail, composite) pair into compositions list for the lead character.
378        IcuToolErrorCode errorCode("gennorm2/addComposition()");
379        Norm *leadNorm=createNorm(lead);
380        UVector32 *compositions=leadNorm->compositions;
381        int32_t i;
382        if(compositions==NULL) {
383            compositions=leadNorm->compositions=new UVector32(errorCode);
384            i=0;  // "insert" the first pair at index 0
385        } else {
386            // Insertion sort, and check for duplicate trail characters.
387            int32_t length;
388            const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
389            for(i=0; i<length; ++i) {
390                if(trail==pairs[i].trail) {
391                    fprintf(stderr,
392                            "gennorm2 error: same round-trip mapping for "
393                            "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
394                            (long)start, (long)lead, (long)trail);
395                    exit(U_INVALID_FORMAT_ERROR);
396                }
397                if(trail<pairs[i].trail) {
398                    break;
399                }
400            }
401        }
402        compositions->insertElementAt(trail, 2*i, errorCode);
403        compositions->insertElementAt(start, 2*i+1, errorCode);
404    }
405}
406
407UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,
408                                                    uint8_t lowCC, uint8_t highCC) const {
409    if((highCC-lowCC)>=2) {
410        int32_t length;
411        const CompositionPair *pairs=norm.getCompositionPairs(length);
412        for(int32_t i=0; i<length; ++i) {
413            uint8_t trailCC=getCC(pairs[i].trail);
414            if(lowCC<trailCC && trailCC<highCC) {
415                return TRUE;
416            }
417        }
418    }
419    return FALSE;
420}
421
422UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {
423    int32_t length;
424    const CompositionPair *pairs=norm.getCompositionPairs(length);
425    for(int32_t i=0; i<length; ++i) {
426        if(trail==pairs[i].trail) {
427            return pairs[i].composite;
428        }
429        if(trail<pairs[i].trail) {
430            break;
431        }
432    }
433    return U_SENTINEL;
434}
435
436class Decomposer : public Normalizer2DBEnumerator {
437public:
438    Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
439    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
440        didDecompose|=builder.decompose(start, end, value);
441        return TRUE;
442    }
443    UBool didDecompose;
444};
445
446UBool
447Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
448    if(norms[value].hasMapping()) {
449        Norm &norm=norms[value];
450        const UnicodeString &m=*norm.mapping;
451        UnicodeString *decomposed=NULL;
452        const UChar *s=m.getBuffer();
453        int32_t length=m.length();
454        int32_t prev, i=0;
455        UChar32 c;
456        while(i<length) {
457            prev=i;
458            U16_NEXT(s, i, length, c);
459            if(start<=c && c<=end) {
460                fprintf(stderr,
461                        "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
462                        (long)c);
463                exit(U_INVALID_FORMAT_ERROR);
464            }
465            const Norm &cNorm=getNormRef(c);
466            if(cNorm.hasMapping()) {
467                if(norm.mappingType==Norm::ROUND_TRIP) {
468                    if(prev==0) {
469                        if(cNorm.mappingType!=Norm::ROUND_TRIP) {
470                            fprintf(stderr,
471                                    "gennorm2 error: "
472                                    "U+%04lX's round-trip mapping's starter "
473                                    "U+%04lX one-way-decomposes, "
474                                    "not possible in Unicode normalization\n",
475                                    (long)start, (long)c);
476                            exit(U_INVALID_FORMAT_ERROR);
477                        }
478                        uint8_t myTrailCC=getCC(m.char32At(i));
479                        UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
480                        uint8_t cTrailCC=getCC(cTrailChar);
481                        if(cTrailCC>myTrailCC) {
482                            fprintf(stderr,
483                                    "gennorm2 error: "
484                                    "U+%04lX's round-trip mapping's starter "
485                                    "U+%04lX decomposes and the "
486                                    "inner/earlier tccc=%hu > outer/following tccc=%hu, "
487                                    "not possible in Unicode normalization\n",
488                                    (long)start, (long)c,
489                                    (short)cTrailCC, (short)myTrailCC);
490                            exit(U_INVALID_FORMAT_ERROR);
491                        }
492                    } else {
493                        fprintf(stderr,
494                                "gennorm2 error: "
495                                "U+%04lX's round-trip mapping's non-starter "
496                                "U+%04lX decomposes, "
497                                "not possible in Unicode normalization\n",
498                                (long)start, (long)c);
499                        exit(U_INVALID_FORMAT_ERROR);
500                    }
501                }
502                if(decomposed==NULL) {
503                    decomposed=new UnicodeString(m, 0, prev);
504                }
505                decomposed->append(*cNorm.mapping);
506            } else if(Hangul::isHangul(c)) {
507                UChar buffer[3];
508                int32_t hangulLength=Hangul::decompose(c, buffer);
509                if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
510                    fprintf(stderr,
511                            "gennorm2 error: "
512                            "U+%04lX's round-trip mapping's non-starter "
513                            "U+%04lX decomposes, "
514                            "not possible in Unicode normalization\n",
515                            (long)start, (long)c);
516                    exit(U_INVALID_FORMAT_ERROR);
517                }
518                if(decomposed==NULL) {
519                    decomposed=new UnicodeString(m, 0, prev);
520                }
521                decomposed->append(buffer, hangulLength);
522            } else if(decomposed!=NULL) {
523                decomposed->append(m, prev, i-prev);
524            }
525        }
526        if(decomposed!=NULL) {
527            if(norm.rawMapping==NULL) {
528                // Remember the original mapping when decomposing recursively.
529                norm.rawMapping=norm.mapping;
530            } else {
531                delete norm.mapping;
532            }
533            norm.mapping=decomposed;
534            // Not  norm.setMappingCP();  because the original mapping
535            // is most likely to be encodable as a delta.
536            return TRUE;
537        }
538    }
539    return FALSE;
540}
541
542class BuilderReorderingBuffer {
543public:
544    BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
545    void reset() {
546        fLength=0;
547        fLastStarterIndex=-1;
548        fDidReorder=FALSE;
549    }
550    int32_t length() const { return fLength; }
551    UBool isEmpty() const { return fLength==0; }
552    int32_t lastStarterIndex() const { return fLastStarterIndex; }
553    UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
554    uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
555    UBool didReorder() const { return fDidReorder; }
556    void append(UChar32 c, uint8_t cc) {
557        if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
558            if(cc==0) {
559                fLastStarterIndex=fLength;
560            }
561            fArray[fLength++]=(c<<8)|cc;
562            return;
563        }
564        // Let this character bubble back to its canonical order.
565        int32_t i=fLength-1;
566        while(i>fLastStarterIndex && ccAt(i)>cc) {
567            --i;
568        }
569        ++i;  // after the last starter or prevCC<=cc
570        // Move this and the following characters forward one to make space.
571        for(int32_t j=fLength; i<j; --j) {
572            fArray[j]=fArray[j-1];
573        }
574        fArray[i]=(c<<8)|cc;
575        ++fLength;
576        fDidReorder=TRUE;
577    }
578    void toString(UnicodeString &dest) {
579        dest.remove();
580        for(int32_t i=0; i<fLength; ++i) {
581            dest.append(charAt(i));
582        }
583    }
584    void setComposite(UChar32 composite, int32_t combMarkIndex) {
585        fArray[fLastStarterIndex]=composite<<8;
586        // Remove the combining mark that contributed to the composite.
587        --fLength;
588        while(combMarkIndex<fLength) {
589            fArray[combMarkIndex]=fArray[combMarkIndex+1];
590            ++combMarkIndex;
591        }
592    }
593private:
594    int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
595    int32_t fLength;
596    int32_t fLastStarterIndex;
597    UBool fDidReorder;
598};
599
600void
601Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
602    UnicodeString &m=*p->mapping;
603    int32_t length=m.length();
604    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
605        return;  // writeMapping() will complain about it and print the code point.
606    }
607    const UChar *s=m.getBuffer();
608    int32_t i=0;
609    UChar32 c;
610    while(i<length) {
611        U16_NEXT(s, i, length, c);
612        buffer.append(c, getCC(c));
613    }
614    if(buffer.didReorder()) {
615        buffer.toString(m);
616    }
617}
618
619/*
620 * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
621 * A starter character with a mapping does not have a composition boundary after it
622 * if the character itself combines-forward (which is tested by the caller of this function),
623 * or it is deleted (mapped to the empty string),
624 * or its mapping contains no starter,
625 * or the last starter combines-forward.
626 */
627UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
628    if(buffer.isEmpty()) {
629        return TRUE;  // maps-to-empty-string is no boundary of any kind
630    }
631    int32_t lastStarterIndex=buffer.lastStarterIndex();
632    if(lastStarterIndex<0) {
633        return TRUE;  // no starter
634    }
635    UChar32 starter=buffer.charAt(lastStarterIndex);
636    if( Hangul::isJamoL(starter) ||
637        (Hangul::isJamoV(starter) &&
638         0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))
639    ) {
640        // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
641        // otherwise it is blocked.
642        return lastStarterIndex==buffer.length()-1;
643    }
644    // Note: There can be no Hangul syllable in the fully decomposed mapping.
645    const Norm *starterNorm=&getNormRef(starter);
646    if(starterNorm->compositions==NULL) {
647        return FALSE;  // the last starter does not combine forward
648    }
649    // Compose as far as possible, and see if further compositions are possible.
650    uint8_t prevCC=0;
651    for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
652        uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
653        if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {
654            return TRUE;
655        }
656        if( prevCC<cc &&
657            (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0
658        ) {
659            buffer.setComposite(starter, combMarkIndex);
660            starterNorm=&getNormRef(starter);
661            if(starterNorm->compositions==NULL) {
662                return FALSE;  // the composite does not combine further
663            }
664        } else {
665            prevCC=cc;
666            ++combMarkIndex;
667        }
668    }
669    // TRUE if the final, forward-combining starter is at the end.
670    return prevCC==0;
671}
672
673// Requires p->hasMapping().
674// Returns the offset of the "first unit" from the beginning of the extraData for c.
675// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
676int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
677    UnicodeString &m=*p->mapping;
678    int32_t length=m.length();
679    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
680        fprintf(stderr,
681                "gennorm2 error: "
682                "mapping for U+%04lX longer than maximum of %d\n",
683                (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
684        exit(U_INVALID_FORMAT_ERROR);
685    }
686    int32_t leadCC, trailCC;
687    if(length==0) {
688        leadCC=trailCC=0;
689    } else {
690        leadCC=getCC(m.char32At(0));
691        trailCC=getCC(m.char32At(length-1));
692    }
693    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
694        fprintf(stderr,
695                "gennorm2 error: "
696                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
697                (long)c);
698        exit(U_INVALID_FORMAT_ERROR);
699    }
700    // Write small-FCD data.
701    if((leadCC|trailCC)!=0) {
702        UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
703        smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
704    }
705    // Write the mapping & raw mapping extraData.
706    int32_t firstUnit=length|(trailCC<<8);
707    int32_t preMappingLength=0;
708    if(p->rawMapping!=NULL) {
709        UnicodeString &rm=*p->rawMapping;
710        int32_t rmLength=rm.length();
711        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
712            fprintf(stderr,
713                    "gennorm2 error: "
714                    "raw mapping for U+%04lX longer than maximum of %d\n",
715                    (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
716            exit(U_INVALID_FORMAT_ERROR);
717        }
718        UChar rm0=rm.charAt(0);
719        if( rmLength==length-1 &&
720            // 99: overlong substring lengths get pinned to remainder lengths anyway
721            0==rm.compare(1, 99, m, 2, 99) &&
722            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
723        ) {
724            // Compression:
725            // rawMapping=rm0+mapping.substring(2) -> store only rm0
726            //
727            // The raw mapping is the same as the final mapping after replacing
728            // the final mapping's first two code units with the raw mapping's first one.
729            // In this case, we store only that first unit, rm0.
730            // This helps with a few hundred mappings.
731            dataString.append(rm0);
732            preMappingLength=1;
733        } else {
734            // Store the raw mapping with its length.
735            dataString.append(rm);
736            dataString.append((UChar)rmLength);
737            preMappingLength=rmLength+1;
738        }
739        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
740    }
741    int32_t cccLccc=p->cc|(leadCC<<8);
742    if(cccLccc!=0) {
743        dataString.append((UChar)cccLccc);
744        ++preMappingLength;
745        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
746    }
747    if(p->hasNoCompBoundaryAfter) {
748        firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
749    }
750    dataString.append((UChar)firstUnit);
751    dataString.append(m);
752    return preMappingLength;
753}
754
755// Requires p->compositions!=NULL.
756void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
757    if(p->cc!=0) {
758        fprintf(stderr,
759                "gennorm2 error: "
760                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
761                (long)c);
762        exit(U_INVALID_FORMAT_ERROR);
763    }
764    int32_t length;
765    const CompositionPair *pairs=p->getCompositionPairs(length);
766    for(int32_t i=0; i<length; ++i) {
767        const CompositionPair &pair=pairs[i];
768        // 22 bits for the composite character and whether it combines forward.
769        UChar32 compositeAndFwd=pair.composite<<1;
770        if(getNormRef(pair.composite).compositions!=NULL) {
771            compositeAndFwd|=1;  // The composite character also combines-forward.
772        }
773        // Encode most pairs in two units and some in three.
774        int32_t firstUnit, secondUnit, thirdUnit;
775        if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
776            if(compositeAndFwd<=0xffff) {
777                firstUnit=pair.trail<<1;
778                secondUnit=compositeAndFwd;
779                thirdUnit=-1;
780            } else {
781                firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
782                secondUnit=compositeAndFwd>>16;
783                thirdUnit=compositeAndFwd;
784            }
785        } else {
786            firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
787                       (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
788                      Normalizer2Impl::COMP_1_TRIPLE;
789            secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
790                       (compositeAndFwd>>16);
791            thirdUnit=compositeAndFwd;
792        }
793        // Set the high bit of the first unit if this is the last composition pair.
794        if(i==(length-1)) {
795            firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
796        }
797        dataString.append((UChar)firstUnit).append((UChar)secondUnit);
798        if(thirdUnit>=0) {
799            dataString.append((UChar)thirdUnit);
800        }
801    }
802}
803
804class ExtraDataWriter : public Normalizer2DBEnumerator {
805public:
806    ExtraDataWriter(Normalizer2DataBuilder &b) :
807        Normalizer2DBEnumerator(b),
808        yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
809        yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
810    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
811        if(value!=0) {
812            if(start!=end) {
813                fprintf(stderr,
814                        "gennorm2 error: unexpected shared data for "
815                        "multiple code points U+%04lX..U+%04lX\n",
816                        (long)start, (long)end);
817                exit(U_INTERNAL_PROGRAM_ERROR);
818            }
819            builder.writeExtraData(start, value, *this);
820        }
821        return TRUE;
822    }
823    UnicodeString maybeYesCompositions;
824    UnicodeString yesYesCompositions;
825    UnicodeString yesNoMappingsAndCompositions;
826    UnicodeString yesNoMappingsOnly;
827    UnicodeString noNoMappings;
828    Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
829};
830
831void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
832    Norm *p=norms+value;
833    if(!p->hasMapping()) {
834        // Write small-FCD data.
835        // There is similar code in writeMapping() for characters that do have a mapping.
836        if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
837            fprintf(stderr,
838                    "gennorm2 error: "
839                    "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
840                    (long)c);
841            exit(U_INVALID_FORMAT_ERROR);
842        }
843        if(p->cc!=0) {
844            UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
845            smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
846        }
847    }
848    if(p->combinesBack) {
849        if(p->hasMapping()) {
850            fprintf(stderr,
851                    "gennorm2 error: "
852                    "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
853                    (long)c);
854            exit(U_INVALID_FORMAT_ERROR);
855        }
856        if(p->compositions!=NULL) {
857            p->offset=
858                (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
859                Norm::OFFSET_MAYBE_YES;
860            writeCompositions(c, p, writer.maybeYesCompositions);
861        }
862    } else if(!p->hasMapping()) {
863        if(p->compositions!=NULL) {
864            p->offset=
865                (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
866                Norm::OFFSET_YES_YES;
867            writeCompositions(c, p, writer.yesYesCompositions);
868        }
869    } else if(p->mappingType==Norm::ROUND_TRIP) {
870        if(p->compositions!=NULL) {
871            int32_t offset=writer.yesNoMappingsAndCompositions.length()+
872                           writeMapping(c, p, writer.yesNoMappingsAndCompositions);
873            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
874            writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
875        } else {
876            int32_t offset=writer.yesNoMappingsOnly.length()+
877                           writeMapping(c, p, writer.yesNoMappingsOnly);
878            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
879        }
880    } else /* one-way */ {
881        if(p->compositions!=NULL) {
882            fprintf(stderr,
883                    "gennorm2 error: "
884                    "U+%04lX combines-forward and has a one-way mapping, "
885                    "not possible in Unicode normalization\n",
886                    (long)c);
887            exit(U_INVALID_FORMAT_ERROR);
888        }
889        if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
890            // Try a compact, algorithmic encoding.
891            // Only for ccc=0, because we can't store additional information
892            // and we do not recursively follow an algorithmic encoding for access to the ccc.
893            //
894            // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
895            // if the mappingCP decomposes further, to ensure that there is a place to store it.
896            // We want to see that the final mapping does not have exactly 1 code point,
897            // or else we would have to recursively ensure that the final mapping is stored
898            // in normal extraData.
899            if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
900                int32_t delta=p->mappingCP-c;
901                if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
902                    p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
903                }
904            }
905        }
906        if(p->offset==0) {
907            int32_t oldNoNoLength=writer.noNoMappings.length();
908            int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
909            UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
910            int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
911            if(previousOffset!=0) {
912                // Duplicate, remove the new units and point to the old ones.
913                writer.noNoMappings.truncate(oldNoNoLength);
914                p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
915            } else {
916                // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
917                IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
918                writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
919                p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
920            }
921        }
922    }
923}
924
925class Norm16Writer : public Normalizer2DBEnumerator {
926public:
927    Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
928    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
929        builder.writeNorm16(start, end, value);
930        return TRUE;
931    }
932};
933
934void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
935    if(value!=0) {
936        const Norm *p=norms+value;
937        int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
938        int32_t norm16=0;
939        UBool isDecompNo=FALSE;
940        UBool isCompNoMaybe=FALSE;
941        switch(p->offset&Norm::OFFSET_MASK) {
942        case Norm::OFFSET_NONE:
943            // No mapping, no compositions list.
944            if(p->combinesBack) {
945                norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
946                isDecompNo=(UBool)(p->cc!=0);
947                isCompNoMaybe=TRUE;
948            } else if(p->cc!=0) {
949                norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
950                isDecompNo=isCompNoMaybe=TRUE;
951            }
952            break;
953        case Norm::OFFSET_MAYBE_YES:
954            norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
955            isCompNoMaybe=TRUE;
956            break;
957        case Norm::OFFSET_YES_YES:
958            norm16=offset;
959            break;
960        case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
961            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
962            isDecompNo=TRUE;
963            break;
964        case Norm::OFFSET_YES_NO_MAPPING_ONLY:
965            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
966            isDecompNo=TRUE;
967            break;
968        case Norm::OFFSET_NO_NO:
969            norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
970            isDecompNo=isCompNoMaybe=TRUE;
971            break;
972        case Norm::OFFSET_DELTA:
973            norm16=getCenterNoNoDelta()+offset;
974            isDecompNo=isCompNoMaybe=TRUE;
975            break;
976        default:  // Should not occur.
977            exit(U_INTERNAL_PROGRAM_ERROR);
978        }
979        IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
980        utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
981        if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
982            indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
983        }
984        if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
985            indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
986        }
987    }
988}
989
990void Normalizer2DataBuilder::setHangulData() {
991    HangulIterator hi;
992    const HangulIterator::Range *range;
993    // Check that none of the Hangul/Jamo code points have data.
994    while((range=hi.nextRange())!=NULL) {
995        for(UChar32 c=range->start; c<range->limit; ++c) {
996            if(utrie2_get32(norm16Trie, c)!=0) {
997                fprintf(stderr,
998                        "gennorm2 error: "
999                        "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
1000                        (long)c);
1001                exit(U_INVALID_FORMAT_ERROR);
1002            }
1003        }
1004    }
1005    // Set data for algorithmic runtime handling.
1006    IcuToolErrorCode errorCode("gennorm2/setHangulData()");
1007    hi.reset();
1008    while((range=hi.nextRange())!=NULL) {
1009        uint16_t norm16=range->norm16;
1010        if(norm16==0) {
1011            norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
1012            if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
1013                indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
1014            }
1015        } else {
1016            if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
1017                indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
1018            }
1019        }
1020        utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
1021        errorCode.assertSuccess();
1022    }
1023}
1024
1025U_CDECL_BEGIN
1026
1027static UBool U_CALLCONV
1028enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1029    uint32_t *pMaxValue=(uint32_t *)context;
1030    if(value>*pMaxValue) {
1031        *pMaxValue=value;
1032    }
1033    return TRUE;
1034}
1035
1036U_CDECL_END
1037
1038void Normalizer2DataBuilder::processData() {
1039    IcuToolErrorCode errorCode("gennorm2/processData()");
1040    norm16Trie=utrie2_open(0, 0, errorCode);
1041    errorCode.assertSuccess();
1042
1043    utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());
1044
1045    Decomposer decomposer(*this);
1046    do {
1047        decomposer.didDecompose=FALSE;
1048        utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
1049    } while(decomposer.didDecompose);
1050
1051    BuilderReorderingBuffer buffer;
1052    int32_t normsLength=utm_countItems(normMem);
1053    for(int32_t i=1; i<normsLength; ++i) {
1054        // Set the hasNoCompBoundaryAfter flag for use by the last code branch
1055        // in Normalizer2Impl::hasCompBoundaryAfter().
1056        // For details see the comments on hasNoCompBoundaryAfter(buffer).
1057        const Norm &norm=norms[i];
1058        if(norm.hasMapping()) {
1059            if(norm.compositions!=NULL) {
1060                norms[i].hasNoCompBoundaryAfter=TRUE;
1061            } else {
1062                buffer.reset();
1063                reorder(norms+i, buffer);
1064                norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
1065            }
1066        }
1067    }
1068
1069    indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
1070    indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
1071
1072    ExtraDataWriter extraDataWriter(*this);
1073    utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);
1074
1075    extraData=extraDataWriter.maybeYesCompositions;
1076    extraData.append(extraDataWriter.yesYesCompositions).
1077              append(extraDataWriter.yesNoMappingsAndCompositions).
1078              append(extraDataWriter.yesNoMappingsOnly).
1079              append(extraDataWriter.noNoMappings);
1080    // Pad to even length for 4-byte alignment of following data.
1081    if(extraData.length()&1) {
1082        extraData.append((UChar)0);
1083    }
1084
1085    indexes[Normalizer2Impl::IX_MIN_YES_NO]=
1086        extraDataWriter.yesYesCompositions.length();
1087    indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
1088        indexes[Normalizer2Impl::IX_MIN_YES_NO]+
1089        extraDataWriter.yesNoMappingsAndCompositions.length();
1090    indexes[Normalizer2Impl::IX_MIN_NO_NO]=
1091        indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
1092        extraDataWriter.yesNoMappingsOnly.length();
1093    indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
1094        indexes[Normalizer2Impl::IX_MIN_NO_NO]+
1095        extraDataWriter.noNoMappings.length();
1096    indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
1097        Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
1098        extraDataWriter.maybeYesCompositions.length();
1099
1100    int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
1101    if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
1102        fprintf(stderr,
1103                "gennorm2 error: "
1104                "data structure overflow, too much mapping composition data\n");
1105        exit(U_BUFFER_OVERFLOW_ERROR);
1106    }
1107
1108    utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());
1109
1110    setHangulData();
1111
1112    // Look for the "worst" norm16 value of any supplementary code point
1113    // corresponding to a lead surrogate, and set it as that surrogate's value.
1114    // Enables quick check inner loops to look at only code units.
1115    //
1116    // We could be more sophisticated:
1117    // We could collect a bit set for whether there are values in the different
1118    // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
1119    // and select the best value that only breaks the composition and/or decomposition
1120    // inner loops if necessary.
1121    // However, that seems like overkill for an optimization for supplementary characters.
1122    for(UChar lead=0xd800; lead<0xdc00; ++lead) {
1123        uint32_t maxValue=utrie2_get32(norm16Trie, lead);
1124        utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
1125        if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
1126            maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
1127        ) {
1128            // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
1129            // Otherwise it might end up at something like JAMO_VT which stays in
1130            // the inner decomposition quick check loop.
1131            maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
1132        }
1133        utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
1134    }
1135
1136    // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
1137    // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
1138    // which is harmless.
1139    // As a result, the minimum code points are always BMP code points.
1140    int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
1141    if(minCP>=0x10000) {
1142        indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
1143    }
1144    minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
1145    if(minCP>=0x10000) {
1146        indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
1147    }
1148}
1149
1150void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
1151    processData();
1152
1153    IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
1154    utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
1155    int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
1156    if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
1157        fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
1158                errorCode.errorName());
1159        exit(errorCode.reset());
1160    }
1161    errorCode.reset();
1162    LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
1163    utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
1164    errorCode.assertSuccess();
1165
1166    int32_t offset=(int32_t)sizeof(indexes);
1167    indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
1168    offset+=norm16TrieLength;
1169    indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
1170    offset+=extraData.length()*2;
1171    indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
1172    offset+=sizeof(smallFCD);
1173    int32_t totalSize=offset;
1174    for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
1175        indexes[i]=totalSize;
1176    }
1177
1178    if(beVerbose) {
1179        printf("size of normalization trie:         %5ld bytes\n", (long)norm16TrieLength);
1180        printf("size of 16-bit extra data:          %5ld uint16_t\n", (long)extraData.length());
1181        printf("size of small-FCD data:             %5ld bytes\n", (long)sizeof(smallFCD));
1182        printf("size of binary data file contents:  %5ld bytes\n", (long)totalSize);
1183        printf("minDecompNoCodePoint:              U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
1184        printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
1185        printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
1186        printf("minYesNoMappingsOnly:              0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
1187        printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
1188        printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
1189        printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
1190    }
1191
1192    UVersionInfo nullVersion={ 0, 0, 0, 0 };
1193    if(0==memcmp(nullVersion, unicodeVersion, 4)) {
1194        u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
1195    }
1196    memcpy(dataInfo.dataVersion, unicodeVersion, 4);
1197    UNewDataMemory *pData=
1198        udata_create(NULL, NULL, filename, &dataInfo,
1199                     haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
1200    if(errorCode.isFailure()) {
1201        fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
1202                filename, errorCode.errorName());
1203        exit(errorCode.reset());
1204    }
1205    udata_writeBlock(pData, indexes, sizeof(indexes));
1206    udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
1207    udata_writeUString(pData, extraData.getBuffer(), extraData.length());
1208    udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
1209    int32_t writtenSize=udata_finish(pData, errorCode);
1210    if(errorCode.isFailure()) {
1211        fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
1212        exit(errorCode.reset());
1213    }
1214    if(writtenSize!=totalSize) {
1215        fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
1216            (long)writtenSize, (long)totalSize);
1217        exit(U_INTERNAL_PROGRAM_ERROR);
1218    }
1219}
1220
1221U_NAMESPACE_END
1222
1223#endif /* #if !UCONFIG_NO_NORMALIZATION */
1224
1225/*
1226 * Hey, Emacs, please set the following:
1227 *
1228 * Local Variables:
1229 * indent-tabs-mode: nil
1230 * End:
1231 */
1232