1/*
2******************************************************************************
3*
4*   Copyright (C) 1999-2011, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  unames.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 1999oct04
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/putil.h"
19#include "unicode/uchar.h"
20#include "unicode/udata.h"
21#include "unicode/utf.h"
22#include "unicode/utf16.h"
23#include "ustr_imp.h"
24#include "umutex.h"
25#include "cmemory.h"
26#include "cstring.h"
27#include "ucln_cmn.h"
28#include "udataswp.h"
29#include "uprops.h"
30
31/* prototypes ------------------------------------------------------------- */
32
/* Number of elements of a real array (not of a pointer), as an int32_t. */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* Name and type of the data item passed to udata_openChoice(). */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of LINES_PER_GROUP (1<<GROUP_SHIFT) lines;
 * code>>GROUP_SHIFT selects the group and code&GROUP_MASK the line in it.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1L<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
41
42/*
43 * This struct was replaced by explicitly accessing equivalent
44 * fields from triples of uint16_t.
45 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
46 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
47 * would advance by 6 bytes (3 uint16_t).
48 *
49 * We can't just change the data structure because it's loaded from a data file,
50 * and we don't want to make it less compact, so we changed the access code.
51 *
52 * For details see ICU tickets 6331 and 6008.
53typedef struct {
54    uint16_t groupMSB,
55             offsetHigh, offsetLow; / * avoid padding * /
56} Group;
57 */
/* Indexes of the fields inside one group triple of uint16_t. */
enum {
    GROUP_MSB = 0,          /* code point >> GROUP_SHIFT of this group */
    GROUP_OFFSET_HIGH = 1,  /* upper 16 bits of the group offset */
    GROUP_OFFSET_LOW = 2,   /* lower 16 bits of the group offset */
    GROUP_LENGTH = 3        /* number of uint16_t per group */
};
64
/*
 * Get the 32-bit group offset.
 * The two 16-bit halves are combined as unsigned values so that a high half
 * of 0x8000 or more does not left-shift into the sign bit of a signed int
 * (which would be undefined behavior); only the final result is converted.
 *
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)(((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16)|(uint32_t)(group)[GROUP_OFFSET_LOW]))

/* Step forward/backward by one group triple. */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
74
/*
 * Header of one algorithmic-names range.
 * getAlgName() reads the range-specific data (prefix string, factors)
 * from the bytes that directly follow this struct.
 */
typedef struct AlgorithmicRange {
    uint32_t start;     /* first code point of the range */
    uint32_t end;       /* last code point of the range */
    uint8_t type;       /* selects the naming algorithm, see getAlgName() */
    uint8_t variant;    /* type 0: number of hex digits; type 1: number of factors */
    uint16_t size;
} AlgorithmicRange;
80
/*
 * Header of the loaded names data: four byte offsets, each relative to the
 * start of this struct.
 */
typedef struct UCharNames {
    uint32_t tokenStringOffset; /* start of the token strings */
    uint32_t groupsOffset;      /* start of the groups table, see GET_GROUPS() */
    uint32_t groupStringOffset; /* start of the group strings */
    uint32_t algNamesOffset;    /* start of the algorithmic-names data */
} UCharNames;
84
/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * can be used with any pointer expression and can be indexed or dereferenced
 * directly, e.g. GET_GROUPS(names)[0]. The argument is evaluated twice.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) \
    ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
95
96typedef struct {
97    const char *otherName;
98    UChar32 code;
99} FindName;
100
101#define DO_FIND_NAME NULL
102
103static UDataMemory *uCharNamesData=NULL;
104static UCharNames *uCharNames=NULL;
105static UErrorCode gLoadErrorCode=U_ZERO_ERROR;
106
107/*
108 * Maximum length of character names (regular & 1.0).
109 */
110static int32_t gMaxNameLength=0;
111
112/*
113 * Set of chars used in character names (regular & 1.0).
114 * Chars are platform-dependent (can be EBCDIC).
115 */
116static uint32_t gNameSet[8]={ 0 };
117
/*
 * Extended character categories, numbered directly after the regular
 * categories. getCharCat() returns them and they index charCatNames[].
 * Every expansion is parenthesized so that the macros stay correct inside
 * larger expressions (for example when multiplied or cast).
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

/* Number of regular plus extended categories. */
#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
123
124static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
125    "unassigned",
126    "uppercase letter",
127    "lowercase letter",
128    "titlecase letter",
129    "modifier letter",
130    "other letter",
131    "non spacing mark",
132    "enclosing mark",
133    "combining spacing mark",
134    "decimal digit number",
135    "letter number",
136    "other number",
137    "space separator",
138    "line separator",
139    "paragraph separator",
140    "control",
141    "format",
142    "private use area",
143    "surrogate",
144    "dash punctuation",
145    "start punctuation",
146    "end punctuation",
147    "connector punctuation",
148    "other punctuation",
149    "math symbol",
150    "currency symbol",
151    "modifier symbol",
152    "other symbol",
153    "initial punctuation",
154    "final punctuation",
155    "noncharacter",
156    "lead surrogate",
157    "trail surrogate"
158};
159
160/* implementation ----------------------------------------------------------- */
161
162static UBool U_CALLCONV unames_cleanup(void)
163{
164    if(uCharNamesData) {
165        udata_close(uCharNamesData);
166        uCharNamesData = NULL;
167    }
168    if(uCharNames) {
169        uCharNames = NULL;
170    }
171    gMaxNameLength=0;
172    return TRUE;
173}
174
175static UBool U_CALLCONV
176isAcceptable(void * /*context*/,
177             const char * /*type*/, const char * /*name*/,
178             const UDataInfo *pInfo) {
179    return (UBool)(
180        pInfo->size>=20 &&
181        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
182        pInfo->charsetFamily==U_CHARSET_FAMILY &&
183        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
184        pInfo->dataFormat[1]==0x6e &&
185        pInfo->dataFormat[2]==0x61 &&
186        pInfo->dataFormat[3]==0x6d &&
187        pInfo->formatVersion[0]==1);
188}
189
190static UBool
191isDataLoaded(UErrorCode *pErrorCode) {
192    /* load UCharNames from file if necessary */
193    UBool isCached;
194
195    /* do this because double-checked locking is broken */
196    UMTX_CHECK(NULL, (uCharNames!=NULL), isCached);
197
198    if(!isCached) {
199        UCharNames *names;
200        UDataMemory *data;
201
202        /* check error code from previous attempt */
203        if(U_FAILURE(gLoadErrorCode)) {
204            *pErrorCode=gLoadErrorCode;
205            return FALSE;
206        }
207
208        /* open the data outside the mutex block */
209        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
210        if(U_FAILURE(*pErrorCode)) {
211            gLoadErrorCode=*pErrorCode;
212            return FALSE;
213        }
214
215        names=(UCharNames *)udata_getMemory(data);
216
217        /* in the mutex block, set the data for this process */
218        {
219            umtx_lock(NULL);
220            if(uCharNames==NULL) {
221                uCharNamesData=data;
222                uCharNames=names;
223                data=NULL;
224                names=NULL;
225                ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
226            }
227            umtx_unlock(NULL);
228        }
229
230        /* if a different thread set it first, then close the extra data */
231        if(data!=NULL) {
232            udata_close(data); /* NULL if it was set correctly */
233        }
234    }
235    return TRUE;
236}
237
/*
 * Append one char to an output buffer with preflighting:
 * c is stored and the buffer pointer advanced only while bufferLength>0,
 * but bufferPos is always incremented so that it ends up as the full length.
 * buffer, bufferLength and bufferPos must be modifiable lvalues.
 * c is evaluated at most once, before bufferPos is incremented.
 *
 * The body is wrapped in do { } while(0) so that the macro behaves like a
 * single statement, also as the unbraced body of an if with an else.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) do { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} while(0)

/* Internal name choice for ISO comments, numbered after the public choices. */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
247
248/*
249 * Important: expandName() and compareName() are almost the same -
250 * apply fixes to both.
251 *
252 * UnicodeData.txt uses ';' as a field separator, so no
253 * field can contain ';' as part of its contents.
254 * In unames.dat, it is marked as token[';']==-1 only if the
255 * semicolon is used in the data file - which is iff we
256 * have Unicode 1.0 names or ISO comments or aliases.
257 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
258 * although we know that it will never be part of a name.
259 */
260static uint16_t
261expandName(UCharNames *names,
262           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
263           char *buffer, uint16_t bufferLength) {
264    uint16_t *tokens=(uint16_t *)names+8;
265    uint16_t token, tokenCount=*tokens++, bufferPos=0;
266    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
267    uint8_t c;
268
269    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
270        /*
271         * skip the modern name if it is not requested _and_
272         * if the semicolon byte value is a character, not a token number
273         */
274        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
275            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
276            do {
277                while(nameLength>0) {
278                    --nameLength;
279                    if(*name++==';') {
280                        break;
281                    }
282                }
283            } while(--fieldIndex>0);
284        } else {
285            /*
286             * the semicolon byte value is a token number, therefore
287             * only modern names are stored in unames.dat and there is no
288             * such requested alternate name here
289             */
290            nameLength=0;
291        }
292    }
293
294    /* write each letter directly, and write a token word per token */
295    while(nameLength>0) {
296        --nameLength;
297        c=*name++;
298
299        if(c>=tokenCount) {
300            if(c!=';') {
301                /* implicit letter */
302                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
303            } else {
304                /* finished */
305                break;
306            }
307        } else {
308            token=tokens[c];
309            if(token==(uint16_t)(-2)) {
310                /* this is a lead byte for a double-byte token */
311                token=tokens[c<<8|*name++];
312                --nameLength;
313            }
314            if(token==(uint16_t)(-1)) {
315                if(c!=';') {
316                    /* explicit letter */
317                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
318                } else {
319                    /* stop, but skip the semicolon if we are seeking
320                       extended names and there was no 2.0 name but there
321                       is a 1.0 name. */
322                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
323                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
324                            continue;
325                        }
326                    }
327                    /* finished */
328                    break;
329                }
330            } else {
331                /* write token word */
332                uint8_t *tokenString=tokenStrings+token;
333                while((c=*tokenString++)!=0) {
334                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
335                }
336            }
337        }
338    }
339
340    /* zero-terminate */
341    if(bufferLength>0) {
342        *buffer=0;
343    }
344
345    return bufferPos;
346}
347
348/*
349 * compareName() is almost the same as expandName() except that it compares
350 * the currently expanded name to an input name.
351 * It returns the match/no match result as soon as possible.
352 */
353static UBool
354compareName(UCharNames *names,
355            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
356            const char *otherName) {
357    uint16_t *tokens=(uint16_t *)names+8;
358    uint16_t token, tokenCount=*tokens++;
359    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
360    uint8_t c;
361    const char *origOtherName = otherName;
362
363    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
364        /*
365         * skip the modern name if it is not requested _and_
366         * if the semicolon byte value is a character, not a token number
367         */
368        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
369            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
370            do {
371                while(nameLength>0) {
372                    --nameLength;
373                    if(*name++==';') {
374                        break;
375                    }
376                }
377            } while(--fieldIndex>0);
378        } else {
379            /*
380             * the semicolon byte value is a token number, therefore
381             * only modern names are stored in unames.dat and there is no
382             * such requested alternate name here
383             */
384            nameLength=0;
385        }
386    }
387
388    /* compare each letter directly, and compare a token word per token */
389    while(nameLength>0) {
390        --nameLength;
391        c=*name++;
392
393        if(c>=tokenCount) {
394            if(c!=';') {
395                /* implicit letter */
396                if((char)c!=*otherName++) {
397                    return FALSE;
398                }
399            } else {
400                /* finished */
401                break;
402            }
403        } else {
404            token=tokens[c];
405            if(token==(uint16_t)(-2)) {
406                /* this is a lead byte for a double-byte token */
407                token=tokens[c<<8|*name++];
408                --nameLength;
409            }
410            if(token==(uint16_t)(-1)) {
411                if(c!=';') {
412                    /* explicit letter */
413                    if((char)c!=*otherName++) {
414                        return FALSE;
415                    }
416                } else {
417                    /* stop, but skip the semicolon if we are seeking
418                       extended names and there was no 2.0 name but there
419                       is a 1.0 name. */
420                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
421                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
422                            continue;
423                        }
424                    }
425                    /* finished */
426                    break;
427                }
428            } else {
429                /* write token word */
430                uint8_t *tokenString=tokenStrings+token;
431                while((c=*tokenString++)!=0) {
432                    if((char)c!=*otherName++) {
433                        return FALSE;
434                    }
435                }
436            }
437        }
438    }
439
440    /* complete match? */
441    return (UBool)(*otherName==0);
442}
443
444static uint8_t getCharCat(UChar32 cp) {
445    uint8_t cat;
446
447    if (U_IS_UNICODE_NONCHAR(cp)) {
448        return U_NONCHARACTER_CODE_POINT;
449    }
450
451    if ((cat = u_charType(cp)) == U_SURROGATE) {
452        cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
453    }
454
455    return cat;
456}
457
458static const char *getCharCatName(UChar32 cp) {
459    uint8_t cat = getCharCat(cp);
460
461    /* Return unknown if the table of names above is not up to
462       date. */
463
464    if (cat >= LENGTHOF(charCatNames)) {
465        return "unknown";
466    } else {
467        return charCatNames[cat];
468    }
469}
470
471static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
472    const char *catname = getCharCatName(code);
473    uint16_t length = 0;
474
475    UChar32 cp;
476    int ndigits, i;
477
478    WRITE_CHAR(buffer, bufferLength, length, '<');
479    while (catname[length - 1]) {
480        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
481    }
482    WRITE_CHAR(buffer, bufferLength, length, '-');
483    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
484        ;
485    if (ndigits < 4)
486        ndigits = 4;
487    for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
488        uint8_t v = (uint8_t)(cp & 0xf);
489        buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
490    }
491    buffer += ndigits;
492    length += ndigits;
493    WRITE_CHAR(buffer, bufferLength, length, '>');
494
495    return length;
496}
497
498/*
499 * getGroup() does a binary search for the group that contains the
500 * Unicode code point "code".
501 * The return value is always a valid Group* that may contain "code"
502 * or else is the highest group before "code".
503 * If the lowest group is after "code", then that one is returned.
504 */
505static const uint16_t *
506getGroup(UCharNames *names, uint32_t code) {
507    const uint16_t *groups=GET_GROUPS(names);
508    uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
509             start=0,
510             limit=*groups++,
511             number;
512
513    /* binary search for the group of names that contains the one for code */
514    while(start<limit-1) {
515        number=(uint16_t)((start+limit)/2);
516        if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
517            limit=number;
518        } else {
519            start=number;
520        }
521    }
522
523    /* return this regardless of whether it is an exact match */
524    return groups+start*GROUP_LENGTH;
525}
526
527/*
528 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
529 * expands them into offsets and lengths for each string.
530 * Lengths are stored with a variable-width encoding in consecutive nibbles:
531 * If a nibble<0xc, then it is the length itself (0=empty string).
532 * If a nibble>=0xc, then it forms a length value with the following nibble.
533 * Calculation see below.
534 * The offsets and lengths arrays must be at least 33 (one more) long because
535 * there is no check here at the end if the last nibble is still used.
536 */
537static const uint8_t *
538expandGroupLengths(const uint8_t *s,
539                   uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
540    /* read the lengths of the 32 strings in this group and get each string's offset */
541    uint16_t i=0, offset=0, length=0;
542    uint8_t lengthByte;
543
544    /* all 32 lengths must be read to get the offset of the first group string */
545    while(i<LINES_PER_GROUP) {
546        lengthByte=*s++;
547
548        /* read even nibble - MSBs of lengthByte */
549        if(length>=12) {
550            /* double-nibble length spread across two bytes */
551            length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
552            lengthByte&=0xf;
553        } else if((lengthByte /* &0xf0 */)>=0xc0) {
554            /* double-nibble length spread across this one byte */
555            length=(uint16_t)((lengthByte&0x3f)+12);
556        } else {
557            /* single-nibble length in MSBs */
558            length=(uint16_t)(lengthByte>>4);
559            lengthByte&=0xf;
560        }
561
562        *offsets++=offset;
563        *lengths++=length;
564
565        offset+=length;
566        ++i;
567
568        /* read odd nibble - LSBs of lengthByte */
569        if((lengthByte&0xf0)==0) {
570            /* this nibble was not consumed for a double-nibble length above */
571            length=lengthByte;
572            if(length<12) {
573                /* single-nibble length in LSBs */
574                *offsets++=offset;
575                *lengths++=length;
576
577                offset+=length;
578                ++i;
579            }
580        } else {
581            length=0;   /* prevent double-nibble detection in the next iteration */
582        }
583    }
584
585    /* now, s is at the first group string */
586    return s;
587}
588
589static uint16_t
590expandGroupName(UCharNames *names, const uint16_t *group,
591                uint16_t lineNumber, UCharNameChoice nameChoice,
592                char *buffer, uint16_t bufferLength) {
593    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
594    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
595    s=expandGroupLengths(s, offsets, lengths);
596    return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
597                      buffer, bufferLength);
598}
599
600static uint16_t
601getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
602        char *buffer, uint16_t bufferLength) {
603    const uint16_t *group=getGroup(names, code);
604    if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
605        return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
606                               buffer, bufferLength);
607    } else {
608        /* group not found */
609        /* zero-terminate */
610        if(bufferLength>0) {
611            *buffer=0;
612        }
613        return 0;
614    }
615}
616
617/*
618 * enumGroupNames() enumerates all the names in a 32-group
619 * and either calls the enumerator function or finds a given input name.
620 */
621static UBool
622enumGroupNames(UCharNames *names, const uint16_t *group,
623               UChar32 start, UChar32 end,
624               UEnumCharNamesFn *fn, void *context,
625               UCharNameChoice nameChoice) {
626    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
627    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
628
629    s=expandGroupLengths(s, offsets, lengths);
630    if(fn!=DO_FIND_NAME) {
631        char buffer[200];
632        uint16_t length;
633
634        while(start<=end) {
635            length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
636            if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
637                buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
638            }
639            /* here, we assume that the buffer is large enough */
640            if(length>0) {
641                if(!fn(context, start, nameChoice, buffer, length)) {
642                    return FALSE;
643                }
644            }
645            ++start;
646        }
647    } else {
648        const char *otherName=((FindName *)context)->otherName;
649        while(start<=end) {
650            if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
651                ((FindName *)context)->code=start;
652                return FALSE;
653            }
654            ++start;
655        }
656    }
657    return TRUE;
658}
659
660/*
661 * enumExtNames enumerate extended names.
662 * It only needs to do it if it is called with a real function and not
663 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
664 * for extended names by itself.
665 */
666static UBool
667enumExtNames(UChar32 start, UChar32 end,
668             UEnumCharNamesFn *fn, void *context)
669{
670    if(fn!=DO_FIND_NAME) {
671        char buffer[200];
672        uint16_t length;
673
674        while(start<=end) {
675            buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
676            /* here, we assume that the buffer is large enough */
677            if(length>0) {
678                if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
679                    return FALSE;
680                }
681            }
682            ++start;
683        }
684    }
685
686    return TRUE;
687}
688
689static UBool
690enumNames(UCharNames *names,
691          UChar32 start, UChar32 limit,
692          UEnumCharNamesFn *fn, void *context,
693          UCharNameChoice nameChoice) {
694    uint16_t startGroupMSB, endGroupMSB, groupCount;
695    const uint16_t *group, *groupLimit;
696
697    startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
698    endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
699
700    /* find the group that contains start, or the highest before it */
701    group=getGroup(names, start);
702
703    if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
704        /* enumerate synthetic names between start and the group start */
705        UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
706        if(extLimit>limit) {
707            extLimit=limit;
708        }
709        if(!enumExtNames(start, extLimit-1, fn, context)) {
710            return FALSE;
711        }
712        start=extLimit;
713    }
714
715    if(startGroupMSB==endGroupMSB) {
716        if(startGroupMSB==group[GROUP_MSB]) {
717            /* if start and limit-1 are in the same group, then enumerate only in that one */
718            return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
719        }
720    } else {
721        const uint16_t *groups=GET_GROUPS(names);
722        groupCount=*groups++;
723        groupLimit=groups+groupCount*GROUP_LENGTH;
724
725        if(startGroupMSB==group[GROUP_MSB]) {
726            /* enumerate characters in the partial start group */
727            if((start&GROUP_MASK)!=0) {
728                if(!enumGroupNames(names, group,
729                                   start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
730                                   fn, context, nameChoice)) {
731                    return FALSE;
732                }
733                group=NEXT_GROUP(group); /* continue with the next group */
734            }
735        } else if(startGroupMSB>group[GROUP_MSB]) {
736            /* make sure that we start enumerating with the first group after start */
737            const uint16_t *nextGroup=NEXT_GROUP(group);
738            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
739                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
740                if (end > limit) {
741                    end = limit;
742                }
743                if (!enumExtNames(start, end - 1, fn, context)) {
744                    return FALSE;
745                }
746            }
747            group=nextGroup;
748        }
749
750        /* enumerate entire groups between the start- and end-groups */
751        while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
752            const uint16_t *nextGroup;
753            start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
754            if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
755                return FALSE;
756            }
757            nextGroup=NEXT_GROUP(group);
758            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
759                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
760                if (end > limit) {
761                    end = limit;
762                }
763                if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
764                    return FALSE;
765                }
766            }
767            group=nextGroup;
768        }
769
770        /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
771        if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
772            return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
773        } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
774            UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
775            if (next > start) {
776                start = next;
777            }
778        } else {
779            return TRUE;
780        }
781    }
782
783    /* we have not found a group, which means everything is made of
784       extended names. */
785    if (nameChoice == U_EXTENDED_CHAR_NAME) {
786        if (limit > UCHAR_MAX_VALUE + 1) {
787            limit = UCHAR_MAX_VALUE + 1;
788        }
789        return enumExtNames(start, limit - 1, fn, context);
790    }
791
792    return TRUE;
793}
794
795static uint16_t
796writeFactorSuffix(const uint16_t *factors, uint16_t count,
797                  const char *s, /* suffix elements */
798                  uint32_t code,
799                  uint16_t indexes[8], /* output fields from here */
800                  const char *elementBases[8], const char *elements[8],
801                  char *buffer, uint16_t bufferLength) {
802    uint16_t i, factor, bufferPos=0;
803    char c;
804
805    /* write elements according to the factors */
806
807    /*
808     * the factorized elements are determined by modulo arithmetic
809     * with the factors of this algorithm
810     *
811     * note that for fewer operations, count is decremented here
812     */
813    --count;
814    for(i=count; i>0; --i) {
815        factor=factors[i];
816        indexes[i]=(uint16_t)(code%factor);
817        code/=factor;
818    }
819    /*
820     * we don't need to calculate the last modulus because start<=code<=end
821     * guarantees here that code<=factors[0]
822     */
823    indexes[0]=(uint16_t)code;
824
825    /* write each element */
826    for(;;) {
827        if(elementBases!=NULL) {
828            *elementBases++=s;
829        }
830
831        /* skip indexes[i] strings */
832        factor=indexes[i];
833        while(factor>0) {
834            while(*s++!=0) {}
835            --factor;
836        }
837        if(elements!=NULL) {
838            *elements++=s;
839        }
840
841        /* write element */
842        while((c=*s++)!=0) {
843            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
844        }
845
846        /* we do not need to perform the rest of this loop for i==count - break here */
847        if(i>=count) {
848            break;
849        }
850
851        /* skip the rest of the strings for this factors[i] */
852        factor=(uint16_t)(factors[i]-indexes[i]-1);
853        while(factor>0) {
854            while(*s++!=0) {}
855            --factor;
856        }
857
858        ++i;
859    }
860
861    /* zero-terminate */
862    if(bufferLength>0) {
863        *buffer=0;
864    }
865
866    return bufferPos;
867}
868
869/*
870 * Important:
871 * Parts of findAlgName() are almost the same as some of getAlgName().
872 * Fixes must be applied to both.
873 */
874static uint16_t
875getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
876        char *buffer, uint16_t bufferLength) {
877    uint16_t bufferPos=0;
878
879    /* Only the normative character name can be algorithmic. */
880    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
881        /* zero-terminate */
882        if(bufferLength>0) {
883            *buffer=0;
884        }
885        return 0;
886    }
887
888    switch(range->type) {
889    case 0: {
890        /* name = prefix hex-digits */
891        const char *s=(const char *)(range+1);
892        char c;
893
894        uint16_t i, count;
895
896        /* copy prefix */
897        while((c=*s++)!=0) {
898            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
899        }
900
901        /* write hexadecimal code point value */
902        count=range->variant;
903
904        /* zero-terminate */
905        if(count<bufferLength) {
906            buffer[count]=0;
907        }
908
909        for(i=count; i>0;) {
910            if(--i<bufferLength) {
911                c=(char)(code&0xf);
912                if(c<10) {
913                    c+='0';
914                } else {
915                    c+='A'-10;
916                }
917                buffer[i]=c;
918            }
919            code>>=4;
920        }
921
922        bufferPos+=count;
923        break;
924    }
925    case 1: {
926        /* name = prefix factorized-elements */
927        uint16_t indexes[8];
928        const uint16_t *factors=(const uint16_t *)(range+1);
929        uint16_t count=range->variant;
930        const char *s=(const char *)(factors+count);
931        char c;
932
933        /* copy prefix */
934        while((c=*s++)!=0) {
935            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
936        }
937
938        bufferPos+=writeFactorSuffix(factors, count,
939                                     s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
940        break;
941    }
942    default:
943        /* undefined type */
944        /* zero-terminate */
945        if(bufferLength>0) {
946            *buffer=0;
947        }
948        break;
949    }
950
951    return bufferPos;
952}
953
954/*
955 * Important: enumAlgNames() and findAlgName() are almost the same.
956 * Any fix must be applied to both.
957 */
958static UBool
959enumAlgNames(AlgorithmicRange *range,
960             UChar32 start, UChar32 limit,
961             UEnumCharNamesFn *fn, void *context,
962             UCharNameChoice nameChoice) {
963    char buffer[200];
964    uint16_t length;
965
966    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
967        return TRUE;
968    }
969
970    switch(range->type) {
971    case 0: {
972        char *s, *end;
973        char c;
974
975        /* get the full name of the start character */
976        length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
977        if(length<=0) {
978            return TRUE;
979        }
980
981        /* call the enumerator function with this first character */
982        if(!fn(context, start, nameChoice, buffer, length)) {
983            return FALSE;
984        }
985
986        /* go to the end of the name; all these names have the same length */
987        end=buffer;
988        while(*end!=0) {
989            ++end;
990        }
991
992        /* enumerate the rest of the names */
993        while(++start<limit) {
994            /* increment the hexadecimal number on a character-basis */
995            s=end;
996            for (;;) {
997                c=*--s;
998                if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
999                    *s=(char)(c+1);
1000                    break;
1001                } else if(c=='9') {
1002                    *s='A';
1003                    break;
1004                } else if(c=='F') {
1005                    *s='0';
1006                }
1007            }
1008
1009            if(!fn(context, start, nameChoice, buffer, length)) {
1010                return FALSE;
1011            }
1012        }
1013        break;
1014    }
1015    case 1: {
1016        uint16_t indexes[8];
1017        const char *elementBases[8], *elements[8];
1018        const uint16_t *factors=(const uint16_t *)(range+1);
1019        uint16_t count=range->variant;
1020        const char *s=(const char *)(factors+count);
1021        char *suffix, *t;
1022        uint16_t prefixLength, i, idx;
1023
1024        char c;
1025
1026        /* name = prefix factorized-elements */
1027
1028        /* copy prefix */
1029        suffix=buffer;
1030        prefixLength=0;
1031        while((c=*s++)!=0) {
1032            *suffix++=c;
1033            ++prefixLength;
1034        }
1035
1036        /* append the suffix of the start character */
1037        length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
1038                                              s, (uint32_t)start-range->start,
1039                                              indexes, elementBases, elements,
1040                                              suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
1041
1042        /* call the enumerator function with this first character */
1043        if(!fn(context, start, nameChoice, buffer, length)) {
1044            return FALSE;
1045        }
1046
1047        /* enumerate the rest of the names */
1048        while(++start<limit) {
1049            /* increment the indexes in lexical order bound by the factors */
1050            i=count;
1051            for (;;) {
1052                idx=(uint16_t)(indexes[--i]+1);
1053                if(idx<factors[i]) {
1054                    /* skip one index and its element string */
1055                    indexes[i]=idx;
1056                    s=elements[i];
1057                    while(*s++!=0) {
1058                    }
1059                    elements[i]=s;
1060                    break;
1061                } else {
1062                    /* reset this index to 0 and its element string to the first one */
1063                    indexes[i]=0;
1064                    elements[i]=elementBases[i];
1065                }
1066            }
1067
1068            /* to make matters a little easier, just append all elements to the suffix */
1069            t=suffix;
1070            length=prefixLength;
1071            for(i=0; i<count; ++i) {
1072                s=elements[i];
1073                while((c=*s++)!=0) {
1074                    *t++=c;
1075                    ++length;
1076                }
1077            }
1078            /* zero-terminate */
1079            *t=0;
1080
1081            if(!fn(context, start, nameChoice, buffer, length)) {
1082                return FALSE;
1083            }
1084        }
1085        break;
1086    }
1087    default:
1088        /* undefined type */
1089        break;
1090    }
1091
1092    return TRUE;
1093}
1094
1095/*
1096 * findAlgName() is almost the same as enumAlgNames() except that it
1097 * returns the code point for a name if it fits into the range.
1098 * It returns 0xffff otherwise.
1099 */
1100static UChar32
1101findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1102    UChar32 code;
1103
1104    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1105        return 0xffff;
1106    }
1107
1108    switch(range->type) {
1109    case 0: {
1110        /* name = prefix hex-digits */
1111        const char *s=(const char *)(range+1);
1112        char c;
1113
1114        uint16_t i, count;
1115
1116        /* compare prefix */
1117        while((c=*s++)!=0) {
1118            if((char)c!=*otherName++) {
1119                return 0xffff;
1120            }
1121        }
1122
1123        /* read hexadecimal code point value */
1124        count=range->variant;
1125        code=0;
1126        for(i=0; i<count; ++i) {
1127            c=*otherName++;
1128            if('0'<=c && c<='9') {
1129                code=(code<<4)|(c-'0');
1130            } else if('A'<=c && c<='F') {
1131                code=(code<<4)|(c-'A'+10);
1132            } else {
1133                return 0xffff;
1134            }
1135        }
1136
1137        /* does it fit into the range? */
1138        if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1139            return code;
1140        }
1141        break;
1142    }
1143    case 1: {
1144        char buffer[64];
1145        uint16_t indexes[8];
1146        const char *elementBases[8], *elements[8];
1147        const uint16_t *factors=(const uint16_t *)(range+1);
1148        uint16_t count=range->variant;
1149        const char *s=(const char *)(factors+count), *t;
1150        UChar32 start, limit;
1151        uint16_t i, idx;
1152
1153        char c;
1154
1155        /* name = prefix factorized-elements */
1156
1157        /* compare prefix */
1158        while((c=*s++)!=0) {
1159            if((char)c!=*otherName++) {
1160                return 0xffff;
1161            }
1162        }
1163
1164        start=(UChar32)range->start;
1165        limit=(UChar32)(range->end+1);
1166
1167        /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1168        writeFactorSuffix(factors, count, s, 0,
1169                          indexes, elementBases, elements, buffer, sizeof(buffer));
1170
1171        /* compare the first suffix */
1172        if(0==uprv_strcmp(otherName, buffer)) {
1173            return start;
1174        }
1175
1176        /* enumerate and compare the rest of the suffixes */
1177        while(++start<limit) {
1178            /* increment the indexes in lexical order bound by the factors */
1179            i=count;
1180            for (;;) {
1181                idx=(uint16_t)(indexes[--i]+1);
1182                if(idx<factors[i]) {
1183                    /* skip one index and its element string */
1184                    indexes[i]=idx;
1185                    s=elements[i];
1186                    while(*s++!=0) {}
1187                    elements[i]=s;
1188                    break;
1189                } else {
1190                    /* reset this index to 0 and its element string to the first one */
1191                    indexes[i]=0;
1192                    elements[i]=elementBases[i];
1193                }
1194            }
1195
1196            /* to make matters a little easier, just compare all elements of the suffix */
1197            t=otherName;
1198            for(i=0; i<count; ++i) {
1199                s=elements[i];
1200                while((c=*s++)!=0) {
1201                    if(c!=*t++) {
1202                        s=""; /* does not match */
1203                        i=99;
1204                    }
1205                }
1206            }
1207            if(i<99 && *t==0) {
1208                return start;
1209            }
1210        }
1211        break;
1212    }
1213    default:
1214        /* undefined type */
1215        break;
1216    }
1217
1218    return 0xffff;
1219}
1220
1221/* sets of name characters, maximum name lengths ---------------------------- */
1222
/*
 * Bit set over the 256 possible char values, stored as uint32_t[8]:
 * the upper 3 bits of the byte select the array element, the lower 5 bits
 * select the bit inside it.
 * c is converted to uint8_t first, so only its low 8 bits are used.
 * The argument is parenthesized so that compound expressions (a|b, a+b, ...)
 * are converted as a whole and the element index always stays below 8.
 * Both macros evaluate c twice; do not pass expressions with side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1225
1226static int32_t
1227calcStringSetLength(uint32_t set[8], const char *s) {
1228    int32_t length=0;
1229    char c;
1230
1231    while((c=*s++)!=0) {
1232        SET_ADD(set, c);
1233        ++length;
1234    }
1235    return length;
1236}
1237
1238static int32_t
1239calcAlgNameSetsLengths(int32_t maxNameLength) {
1240    AlgorithmicRange *range;
1241    uint32_t *p;
1242    uint32_t rangeCount;
1243    int32_t length;
1244
1245    /* enumerate algorithmic ranges */
1246    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1247    rangeCount=*p;
1248    range=(AlgorithmicRange *)(p+1);
1249    while(rangeCount>0) {
1250        switch(range->type) {
1251        case 0:
1252            /* name = prefix + (range->variant times) hex-digits */
1253            /* prefix */
1254            length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1255            if(length>maxNameLength) {
1256                maxNameLength=length;
1257            }
1258            break;
1259        case 1: {
1260            /* name = prefix factorized-elements */
1261            const uint16_t *factors=(const uint16_t *)(range+1);
1262            const char *s;
1263            int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1264
1265            /* prefix length */
1266            s=(const char *)(factors+count);
1267            length=calcStringSetLength(gNameSet, s);
1268            s+=length+1; /* start of factor suffixes */
1269
1270            /* get the set and maximum factor suffix length for each factor */
1271            for(i=0; i<count; ++i) {
1272                maxFactorLength=0;
1273                for(factor=factors[i]; factor>0; --factor) {
1274                    factorLength=calcStringSetLength(gNameSet, s);
1275                    s+=factorLength+1;
1276                    if(factorLength>maxFactorLength) {
1277                        maxFactorLength=factorLength;
1278                    }
1279                }
1280                length+=maxFactorLength;
1281            }
1282
1283            if(length>maxNameLength) {
1284                maxNameLength=length;
1285            }
1286            break;
1287        }
1288        default:
1289            /* unknown type */
1290            break;
1291        }
1292
1293        range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1294        --rangeCount;
1295    }
1296    return maxNameLength;
1297}
1298
1299static int32_t
1300calcExtNameSetsLengths(int32_t maxNameLength) {
1301    int32_t i, length;
1302
1303    for(i=0; i<LENGTHOF(charCatNames); ++i) {
1304        /*
1305         * for each category, count the length of the category name
1306         * plus 9=
1307         * 2 for <>
1308         * 1 for -
1309         * 6 for most hex digits per code point
1310         */
1311        length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1312        if(length>maxNameLength) {
1313            maxNameLength=length;
1314        }
1315    }
1316    return maxNameLength;
1317}
1318
1319static int32_t
1320calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
1321                  uint32_t set[8],
1322                  const uint8_t **pLine, const uint8_t *lineLimit) {
1323    const uint8_t *line=*pLine;
1324    int32_t length=0, tokenLength;
1325    uint16_t c, token;
1326
1327    while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
1328        if(c>=tokenCount) {
1329            /* implicit letter */
1330            SET_ADD(set, c);
1331            ++length;
1332        } else {
1333            token=tokens[c];
1334            if(token==(uint16_t)(-2)) {
1335                /* this is a lead byte for a double-byte token */
1336                c=c<<8|*line++;
1337                token=tokens[c];
1338            }
1339            if(token==(uint16_t)(-1)) {
1340                /* explicit letter */
1341                SET_ADD(set, c);
1342                ++length;
1343            } else {
1344                /* count token word */
1345                if(tokenLengths!=NULL) {
1346                    /* use cached token length */
1347                    tokenLength=tokenLengths[c];
1348                    if(tokenLength==0) {
1349                        tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1350                        tokenLengths[c]=(int8_t)tokenLength;
1351                    }
1352                } else {
1353                    tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1354                }
1355                length+=tokenLength;
1356            }
1357        }
1358    }
1359
1360    *pLine=line;
1361    return length;
1362}
1363
1364static void
1365calcGroupNameSetsLengths(int32_t maxNameLength) {
1366    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1367
1368    uint16_t *tokens=(uint16_t *)uCharNames+8;
1369    uint16_t tokenCount=*tokens++;
1370    uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1371
1372    int8_t *tokenLengths;
1373
1374    const uint16_t *group;
1375    const uint8_t *s, *line, *lineLimit;
1376
1377    int32_t groupCount, lineNumber, length;
1378
1379    tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1380    if(tokenLengths!=NULL) {
1381        uprv_memset(tokenLengths, 0, tokenCount);
1382    }
1383
1384    group=GET_GROUPS(uCharNames);
1385    groupCount=*group++;
1386
1387    /* enumerate all groups */
1388    while(groupCount>0) {
1389        s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1390        s=expandGroupLengths(s, offsets, lengths);
1391
1392        /* enumerate all lines in each group */
1393        for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1394            line=s+offsets[lineNumber];
1395            length=lengths[lineNumber];
1396            if(length==0) {
1397                continue;
1398            }
1399
1400            lineLimit=line+length;
1401
1402            /* read regular name */
1403            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1404            if(length>maxNameLength) {
1405                maxNameLength=length;
1406            }
1407            if(line==lineLimit) {
1408                continue;
1409            }
1410
1411            /* read Unicode 1.0 name */
1412            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1413            if(length>maxNameLength) {
1414                maxNameLength=length;
1415            }
1416            if(line==lineLimit) {
1417                continue;
1418            }
1419
1420            /* read ISO comment */
1421            /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1422        }
1423
1424        group=NEXT_GROUP(group);
1425        --groupCount;
1426    }
1427
1428    if(tokenLengths!=NULL) {
1429        uprv_free(tokenLengths);
1430    }
1431
1432    /* set gMax... - name length last for threading */
1433    gMaxNameLength=maxNameLength;
1434}
1435
1436static UBool
1437calcNameSetsLengths(UErrorCode *pErrorCode) {
1438    static const char extChars[]="0123456789ABCDEF<>-";
1439    int32_t i, maxNameLength;
1440
1441    if(gMaxNameLength!=0) {
1442        return TRUE;
1443    }
1444
1445    if(!isDataLoaded(pErrorCode)) {
1446        return FALSE;
1447    }
1448
1449    /* set hex digits, used in various names, and <>-, used in extended names */
1450    for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
1451        SET_ADD(gNameSet, extChars[i]);
1452    }
1453
1454    /* set sets and lengths from algorithmic names */
1455    maxNameLength=calcAlgNameSetsLengths(0);
1456
1457    /* set sets and lengths from extended names */
1458    maxNameLength=calcExtNameSetsLengths(maxNameLength);
1459
1460    /* set sets and lengths from group names, set global maximum values */
1461    calcGroupNameSetsLengths(maxNameLength);
1462
1463    return TRUE;
1464}
1465
1466/* public API --------------------------------------------------------------- */
1467
1468U_CAPI int32_t U_EXPORT2
1469u_charName(UChar32 code, UCharNameChoice nameChoice,
1470           char *buffer, int32_t bufferLength,
1471           UErrorCode *pErrorCode) {
1472    AlgorithmicRange *algRange;
1473    uint32_t *p;
1474    uint32_t i;
1475    int32_t length;
1476
1477    /* check the argument values */
1478    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1479        return 0;
1480    } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1481              bufferLength<0 || (bufferLength>0 && buffer==NULL)
1482    ) {
1483        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1484        return 0;
1485    }
1486
1487    if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1488        return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1489    }
1490
1491    length=0;
1492
1493    /* try algorithmic names first */
1494    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1495    i=*p;
1496    algRange=(AlgorithmicRange *)(p+1);
1497    while(i>0) {
1498        if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1499            length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1500            break;
1501        }
1502        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1503        --i;
1504    }
1505
1506    if(i==0) {
1507        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1508            length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1509            if (!length) {
1510                /* extended character name */
1511                length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1512            }
1513        } else {
1514            /* normal character name */
1515            length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1516        }
1517    }
1518
1519    return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1520}
1521
1522U_CAPI int32_t U_EXPORT2
1523u_getISOComment(UChar32 /*c*/,
1524                char *dest, int32_t destCapacity,
1525                UErrorCode *pErrorCode) {
1526    /* check the argument values */
1527    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1528        return 0;
1529    } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1530        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1531        return 0;
1532    }
1533
1534    return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1535}
1536
1537U_CAPI UChar32 U_EXPORT2
1538u_charFromName(UCharNameChoice nameChoice,
1539               const char *name,
1540               UErrorCode *pErrorCode) {
1541    char upper[120], lower[120];
1542    FindName findName;
1543    AlgorithmicRange *algRange;
1544    uint32_t *p;
1545    uint32_t i;
1546    UChar32 cp = 0;
1547    char c0;
1548    UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
1549
1550    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1551        return error;
1552    }
1553
1554    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1555        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1556        return error;
1557    }
1558
1559    if(!isDataLoaded(pErrorCode)) {
1560        return error;
1561    }
1562
1563    /* construct the uppercase and lowercase of the name first */
1564    for(i=0; i<sizeof(upper); ++i) {
1565        if((c0=*name++)!=0) {
1566            upper[i]=uprv_toupper(c0);
1567            lower[i]=uprv_tolower(c0);
1568        } else {
1569            upper[i]=lower[i]=0;
1570            break;
1571        }
1572    }
1573    if(i==sizeof(upper)) {
1574        /* name too long, there is no such character */
1575        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1576        return error;
1577    }
1578
1579    /* try extended names first */
1580    if (lower[0] == '<') {
1581        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1582            if (lower[--i] == '>') {
1583                for (--i; lower[i] && lower[i] != '-'; --i) {
1584                }
1585
1586                if (lower[i] == '-') { /* We've got a category. */
1587                    uint32_t cIdx;
1588
1589                    lower[i] = 0;
1590
1591                    for (++i; lower[i] != '>'; ++i) {
1592                        if (lower[i] >= '0' && lower[i] <= '9') {
1593                            cp = (cp << 4) + lower[i] - '0';
1594                        } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1595                            cp = (cp << 4) + lower[i] - 'a' + 10;
1596                        } else {
1597                            *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1598                            return error;
1599                        }
1600                    }
1601
1602                    /* Now validate the category name.
1603                       We could use a binary search, or a trie, if
1604                       we really wanted to. */
1605
1606                    for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
1607
1608                        if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1609                            if (getCharCat(cp) == cIdx) {
1610                                return cp;
1611                            }
1612                            break;
1613                        }
1614                    }
1615                }
1616            }
1617        }
1618
1619        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1620        return error;
1621    }
1622
1623    /* try algorithmic names now */
1624    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1625    i=*p;
1626    algRange=(AlgorithmicRange *)(p+1);
1627    while(i>0) {
1628        if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1629            return cp;
1630        }
1631        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1632        --i;
1633    }
1634
1635    /* normal character name */
1636    findName.otherName=upper;
1637    findName.code=error;
1638    enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1639    if (findName.code == error) {
1640         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1641    }
1642    return findName.code;
1643}
1644
1645U_CAPI void U_EXPORT2
1646u_enumCharNames(UChar32 start, UChar32 limit,
1647                UEnumCharNamesFn *fn,
1648                void *context,
1649                UCharNameChoice nameChoice,
1650                UErrorCode *pErrorCode) {
1651    AlgorithmicRange *algRange;
1652    uint32_t *p;
1653    uint32_t i;
1654
1655    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1656        return;
1657    }
1658
1659    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1660        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1661        return;
1662    }
1663
1664    if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1665        limit = UCHAR_MAX_VALUE + 1;
1666    }
1667    if((uint32_t)start>=(uint32_t)limit) {
1668        return;
1669    }
1670
1671    if(!isDataLoaded(pErrorCode)) {
1672        return;
1673    }
1674
1675    /* interleave the data-driven ones with the algorithmic ones */
1676    /* iterate over all algorithmic ranges; assume that they are in ascending order */
1677    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1678    i=*p;
1679    algRange=(AlgorithmicRange *)(p+1);
1680    while(i>0) {
1681        /* enumerate the character names before the current algorithmic range */
1682        /* here: start<limit */
1683        if((uint32_t)start<algRange->start) {
1684            if((uint32_t)limit<=algRange->start) {
1685                enumNames(uCharNames, start, limit, fn, context, nameChoice);
1686                return;
1687            }
1688            if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1689                return;
1690            }
1691            start=(UChar32)algRange->start;
1692        }
1693        /* enumerate the character names in the current algorithmic range */
1694        /* here: algRange->start<=start<limit */
1695        if((uint32_t)start<=algRange->end) {
1696            if((uint32_t)limit<=(algRange->end+1)) {
1697                enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1698                return;
1699            }
1700            if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1701                return;
1702            }
1703            start=(UChar32)algRange->end+1;
1704        }
1705        /* continue to the next algorithmic range (here: start<limit) */
1706        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1707        --i;
1708    }
1709    /* enumerate the character names after the last algorithmic range */
1710    enumNames(uCharNames, start, limit, fn, context, nameChoice);
1711}
1712
1713U_CAPI int32_t U_EXPORT2
1714uprv_getMaxCharNameLength() {
1715    UErrorCode errorCode=U_ZERO_ERROR;
1716    if(calcNameSetsLengths(&errorCode)) {
1717        return gMaxNameLength;
1718    } else {
1719        return 0;
1720    }
1721}
1722
1723/**
1724 * Converts the char set cset into a Unicode set uset.
1725 * @param cset Set of 256 bit flags corresponding to a set of chars.
1726 * @param uset USet to receive characters. Existing contents are deleted.
1727 */
1728static void
1729charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1730    UChar us[256];
1731    char cs[256];
1732
1733    int32_t i, length;
1734    UErrorCode errorCode;
1735
1736    errorCode=U_ZERO_ERROR;
1737
1738    if(!calcNameSetsLengths(&errorCode)) {
1739        return;
1740    }
1741
1742    /* build a char string with all chars that are used in character names */
1743    length=0;
1744    for(i=0; i<256; ++i) {
1745        if(SET_CONTAINS(cset, i)) {
1746            cs[length++]=(char)i;
1747        }
1748    }
1749
1750    /* convert the char string to a UChar string */
1751    u_charsToUChars(cs, us, length);
1752
1753    /* add each UChar to the USet */
1754    for(i=0; i<length; ++i) {
1755        if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1756            sa->add(sa->set, us[i]);
1757        }
1758    }
1759}
1760
1761/**
1762 * Fills set with characters that are used in Unicode character names.
1763 * @param set USet to receive characters.
1764 */
1765U_CAPI void U_EXPORT2
1766uprv_getCharNameCharacters(const USetAdder *sa) {
1767    charSetToUSet(gNameSet, sa);
1768}
1769
1770/* data swapping ------------------------------------------------------------ */
1771
1772/*
1773 * The token table contains non-negative entries for token bytes,
1774 * and -1 for bytes that represent themselves in the data file's charset.
1775 * -2 entries are used for lead bytes.
1776 *
1777 * Direct bytes (-1 entries) must be translated from the input charset family
1778 * to the output charset family.
1779 * makeTokenMap() writes a permutation mapping for this.
1780 * Use it once for single-/lead-byte tokens and once more for all trail byte
1781 * tokens. (';' is an unused trail byte marked with -1.)
1782 */
1783static void
1784makeTokenMap(const UDataSwapper *ds,
1785             int16_t tokens[], uint16_t tokenCount,
1786             uint8_t map[256],
1787             UErrorCode *pErrorCode) {
1788    UBool usedOutChar[256];
1789    uint16_t i, j;
1790    uint8_t c1, c2;
1791
1792    if(U_FAILURE(*pErrorCode)) {
1793        return;
1794    }
1795
1796    if(ds->inCharset==ds->outCharset) {
1797        /* Same charset family: identity permutation */
1798        for(i=0; i<256; ++i) {
1799            map[i]=(uint8_t)i;
1800        }
1801    } else {
1802        uprv_memset(map, 0, 256);
1803        uprv_memset(usedOutChar, 0, 256);
1804
1805        if(tokenCount>256) {
1806            tokenCount=256;
1807        }
1808
1809        /* set the direct bytes (byte 0 always maps to itself) */
1810        for(i=1; i<tokenCount; ++i) {
1811            if(tokens[i]==-1) {
1812                /* convert the direct byte character */
1813                c1=(uint8_t)i;
1814                ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1815                if(U_FAILURE(*pErrorCode)) {
1816                    udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1817                                     i, ds->inCharset);
1818                    return;
1819                }
1820
1821                /* enter the converted character into the map and mark it used */
1822                map[c1]=c2;
1823                usedOutChar[c2]=TRUE;
1824            }
1825        }
1826
1827        /* set the mappings for the rest of the permutation */
1828        for(i=j=1; i<tokenCount; ++i) {
1829            /* set mappings that were not set for direct bytes */
1830            if(map[i]==0) {
1831                /* set an output byte value that was not used as an output byte above */
1832                while(usedOutChar[j]) {
1833                    ++j;
1834                }
1835                map[i]=(uint8_t)j++;
1836            }
1837        }
1838
1839        /*
1840         * leave mappings at tokenCount and above unset if tokenCount<256
1841         * because they won't be used
1842         */
1843    }
1844}
1845
/*
 * Swap a unames.icu data image (dataFormat "unam", formatVersion 1) using the
 * read/swap functions and the input/output charset families provided by ds.
 *
 * @param ds         swapper; this function uses its readUInt16/readUInt32,
 *                   swapArray16/swapArray32 and swapInvChars callbacks and
 *                   compares ds->inCharset with ds->outCharset
 * @param inData     input data, starting with the data header
 * @param length     number of bytes in inData; a negative value selects
 *                   preflighting, where this function only walks the
 *                   algorithmic ranges to find the end of the data and does
 *                   not itself write any of the body to outData
 * @param outData    output buffer; may be the same as inData (the code below
 *                   compares the two pointers and uses a temporary array so
 *                   that in-place swapping works)
 * @param pErrorCode in/out error code; set to U_UNSUPPORTED_ERROR for an
 *                   unrecognized data format/version or unknown algorithmic
 *                   range type, U_INDEX_OUTOFBOUNDS_ERROR when length is too
 *                   small, U_MEMORY_ALLOCATION_ERROR when the temporary token
 *                   array cannot be allocated
 * @return headerSize plus the offset (counted from the end of the header)
 *         just past the last algorithmic range; 0 on any failure
 */
1846U_CAPI int32_t U_EXPORT2
1847uchar_swapNames(const UDataSwapper *ds,
1848                const void *inData, int32_t length, void *outData,
1849                UErrorCode *pErrorCode) {
1850    const UDataInfo *pInfo;
1851    int32_t headerSize;
1852
1853    const uint8_t *inBytes;
1854    uint8_t *outBytes;
1855
1856    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1857             offset, i, count, stringsCount;
1858
1859    const AlgorithmicRange *inRange;
1860    AlgorithmicRange *outRange;
1861
1862    /* udata_swapDataHeader checks the arguments */
1863    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1864    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1865        return 0;
1866    }
1867
1868    /* check data format and format version */
    /* the UDataInfo is read at byte offset 4 from the start of inData */
1869    pInfo=(const UDataInfo *)((const char *)inData+4);
1870    if(!(
1871        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
1872        pInfo->dataFormat[1]==0x6e &&
1873        pInfo->dataFormat[2]==0x61 &&
1874        pInfo->dataFormat[3]==0x6d &&
1875        pInfo->formatVersion[0]==1
1876    )) {
1877        udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1878                         pInfo->dataFormat[0], pInfo->dataFormat[1],
1879                         pInfo->dataFormat[2], pInfo->dataFormat[3],
1880                         pInfo->formatVersion[0]);
1881        *pErrorCode=U_UNSUPPORTED_ERROR;
1882        return 0;
1883    }
1884
    /*
     * inBytes/outBytes point just past the header; all offsets below are
     * relative to these pointers.
     * The fourth 32-bit value of the body is read as algNamesOffset in both
     * the preflighting and the swapping case.
     */
1885    inBytes=(const uint8_t *)inData+headerSize;
1886    outBytes=(uint8_t *)outData+headerSize;
1887    if(length<0) {
1888        algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1889    } else {
        /*
         * from here on, length counts only the bytes after the header;
         * require at least 20 of them and at least algNamesOffset of them
         * (algNamesOffset is only read if the length<20 test did not already fail)
         */
1890        length-=headerSize;
1891        if( length<20 ||
1892            (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1893        ) {
1894            udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1895                             length);
1896            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1897            return 0;
1898        }
1899    }
1900
1901    if(length<0) {
1902        /* preflighting: iterate through algorithmic ranges */
        /*
         * Only the range count and each range's size field are read here;
         * range types are not validated and nothing is written.
         */
1903        offset=algNamesOffset;
1904        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1905        offset+=4;
1906
1907        for(i=0; i<count; ++i) {
1908            inRange=(const AlgorithmicRange *)(inBytes+offset);
1909            offset+=ds->readUInt16(inRange->size);
1910        }
1911    } else {
1912        /* swap data */
1913        const uint16_t *p;
1914        uint16_t *q, *temp;
1915
        /* only the first 512 tokens are inspected for building the maps */
1916        int16_t tokens[512];
1917        uint16_t tokenCount;
1918
        /* map[]: permutation for single/lead bytes; trailMap[]: for trail bytes */
1919        uint8_t map[256], trailMap[256];
1920
1921        /* copy the data for inaccessible bytes */
        /*
         * skipped for in-place swapping; afterwards every piece that is not
         * explicitly swapped below is already present in the output
         */
1922        if(inBytes!=outBytes) {
1923            uprv_memcpy(outBytes, inBytes, length);
1924        }
1925
1926        /* the initial 4 offsets first */
        /* read the first three before swapping because in and out may alias */
1927        tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1928        groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1929        groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1930        ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1931
1932        /*
1933         * now the tokens table
1934         * it needs to be permutated along with the compressed name strings
1935         */
        /* the token table starts right after the 16 bytes of offsets */
1936        p=(const uint16_t *)(inBytes+16);
1937        q=(uint16_t *)(outBytes+16);
1938
1939        /* read and swap the tokenCount */
1940        tokenCount=ds->readUInt16(*p);
1941        ds->swapArray16(ds, p, 2, q, pErrorCode);
1942        ++p;
1943        ++q;
1944
1945        /* read the first 512 tokens and make the token maps */
1946        if(tokenCount<=512) {
1947            count=tokenCount;
1948        } else {
1949            count=512;
1950        }
1951        for(i=0; i<count; ++i) {
1952            tokens[i]=udata_readInt16(ds, p[i]);
1953        }
1954        for(; i<512; ++i) {
1955            tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1956        }
        /*
         * The second call builds trailMap[] from tokens 256..511 and is given
         * a count of 0 when there are no more than 256 tokens.
         * The error code is checked once, after both calls.
         */
1957        makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1958        makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1959        if(U_FAILURE(*pErrorCode)) {
1960            return 0;
1961        }
1962
1963        /*
1964         * swap and permutate the tokens
1965         * go through a temporary array to support in-place swapping
1966         */
        /* one uint16_t (2 bytes) per token */
1967        temp=(uint16_t *)uprv_malloc(tokenCount*2);
1968        if(temp==NULL) {
1969            udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1970                             tokenCount);
1971            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1972            return 0;
1973        }
1974
1975        /* swap and permutate single-/lead-byte tokens */
        /* token i (i<256) is written to index map[i] of temp */
1976        for(i=0; i<tokenCount && i<256; ++i) {
1977            ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1978        }
1979
1980        /* swap and permutate trail-byte tokens */
        /*
         * for i>=256: keep the upper bits of i (its block of 256) and
         * replace the low byte with trailMap[low byte]
         */
1981        for(; i<tokenCount; ++i) {
1982            ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1983        }
1984
1985        /* copy the result into the output and free the temporary array */
1986        uprv_memcpy(q, temp, tokenCount*2);
1987        uprv_free(temp);
1988
1989        /*
1990         * swap the token strings but not a possible padding byte after
1991         * the terminating NUL of the last string
1992         */
        /* the token strings occupy the bytes from tokenStringOffset up to groupsOffset */
1993        udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1994                                    outBytes+tokenStringOffset, pErrorCode);
1995        if(U_FAILURE(*pErrorCode)) {
1996            udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1997            return 0;
1998        }
1999
2000        /* swap the group table */
        /*
         * one uint16_t group count followed by count*3 uint16_t values;
         * the count is read before the swap because in and out may alias
         */
2001        count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
2002        ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
2003                           outBytes+groupsOffset, pErrorCode);
2004
2005        /*
2006         * swap the group strings
2007         * swap the string bytes but not the nibble-encoded string lengths
2008         */
        /*
         * nothing to do here when both charset families are the same:
         * the bytes were either copied above or are swapped in place unchanged
         */
2009        if(ds->inCharset!=ds->outCharset) {
2010            uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
2011
2012            const uint8_t *inStrings, *nextInStrings;
2013            uint8_t *outStrings;
2014
2015            uint8_t c;
2016
2017            inStrings=inBytes+groupStringOffset;
2018            outStrings=outBytes+groupStringOffset;
2019
            /* the group strings occupy the bytes from groupStringOffset up to algNamesOffset */
2020            stringsCount=algNamesOffset-groupStringOffset;
2021
2022            /* iterate through string groups until only a few padding bytes are left */
            /*
             * NOTE(review): stringsCount is unsigned and is decremented by
             * values derived from the data; the loop relies on well-formed
             * data to terminate at the padding - confirm that callers only
             * pass trusted data.
             */
2023            while(stringsCount>32) {
2024                nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2025
2026                /* move past the length bytes */
                /* the length bytes themselves are left as they are in the output */
2027                stringsCount-=(uint32_t)(nextInStrings-inStrings);
2028                outStrings+=nextInStrings-inStrings;
2029                inStrings=nextInStrings;
2030
2031                count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2032                stringsCount-=count;
2033
2034                /* swap the string bytes using map[] and trailMap[] */
                /*
                 * every byte c goes through map[]; if tokens[c] is -2 then c
                 * is a lead byte and the following byte goes through trailMap[]
                 */
2035                while(count>0) {
2036                    c=*inStrings++;
2037                    *outStrings++=map[c];
2038                    if(tokens[c]!=-2) {
2039                        --count;
2040                    } else {
2041                        /* token lead byte: swap the trail byte, too */
2042                        *outStrings++=trailMap[*inStrings++];
2043                        count-=2;
2044                    }
2045                }
2046            }
2047        }
2048
2049        /* swap the algorithmic ranges */
        /* a 32-bit range count at algNamesOffset, followed by the ranges */
2050        offset=algNamesOffset;
2051        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2052        ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2053        offset+=4;
2054
2055        for(i=0; i<count; ++i) {
            /*
             * NOTE(review): this only rejects a range whose start offset is
             * beyond length; it passes when offset==length, and the reads of
             * the range fields and strings below are not bounds-checked
             * themselves.
             */
2056            if(offset>(uint32_t)length) {
2057                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2058                                 length, i);
2059                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2060                return 0;
2061            }
2062
2063            inRange=(const AlgorithmicRange *)(inBytes+offset);
2064            outRange=(AlgorithmicRange *)(outBytes+offset);
            /* advance to the next range before swapping the size field (in-place safe) */
2065            offset+=ds->readUInt16(inRange->size);
2066
            /*
             * swap the first 8 bytes of the range as 32-bit units and the
             * size field as a 16-bit unit; type and variant are used below
             * without swapping
             * (presumably they are single bytes - the AlgorithmicRange
             * definition is not visible here, TODO confirm)
             */
2067            ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2068            ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2069            switch(inRange->type) {
2070            case 0:
2071                /* swap prefix string */
                /*
                 * the string starts directly after the range struct; its
                 * length comes from uprv_strlen(), so the terminator is not
                 * passed to swapInvChars
                 * NOTE(review): the strlen is not bounded by length
                 */
2072                ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2073                                    outRange+1, pErrorCode);
2074                if(U_FAILURE(*pErrorCode)) {
2075                    udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2076                                     i);
2077                    return 0;
2078                }
2079                break;
2080            case 1:
2081                {
2082                    /* swap factors and the prefix and factor strings */
2083                    uint32_t factorsCount;
2084
                    /* variant holds the number of 16-bit factors that follow the range struct */
2085                    factorsCount=inRange->variant;
2086                    p=(const uint16_t *)(inRange+1);
2087                    q=(uint16_t *)(outRange+1);
2088                    ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2089
2090                    /* swap the strings, up to the last terminating NUL */
                    /*
                     * start with all bytes from after the factors to the end
                     * of this range (offset already points to the next range),
                     * then trim trailing bytes until the last byte is a NUL
                     */
2091                    p+=factorsCount;
2092                    q+=factorsCount;
2093                    stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2094                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2095                        --stringsCount;
2096                    }
                    /* the error code from this call is not checked inside this case */
2097                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2098                }
2099                break;
2100            default:
2101                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2102                                 inRange->type, i);
2103                *pErrorCode=U_UNSUPPORTED_ERROR;
2104                return 0;
2105            }
2106        }
2107    }
2108
    /* offset now points just past the last algorithmic range (relative to the end of the header) */
2109    return headerSize+(int32_t)offset;
2110}
2111
2112/*
2113 * Hey, Emacs, please set the following:
2114 *
2115 * Local Variables:
2116 * indent-tabs-mode: nil
2117 * End:
2118 *
2119 */
2120