1/*
2*******************************************************************************
3*   Copyright (C) 2011-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ppucd.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2011dec11
12*   created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16#include "unicode/uchar.h"
17#include "charstr.h"
18#include "cstring.h"
19#include "ppucd.h"
20#include "uassert.h"
21#include "uparse.h"
22
23#include <stdio.h>
24#include <string.h>
25
// Number of elements in a statically sized array, as an int32_t.
// The whole expansion is wrapped in parentheses so that it always behaves as
// a single operand, whatever operator the caller writes next to it.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
27
28U_NAMESPACE_BEGIN
29
30PropertyNames::~PropertyNames() {}
31
32int32_t
33PropertyNames::getPropertyEnum(const char *name) const {
34    return u_getPropertyEnum(name);
35}
36
37int32_t
38PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
39    return u_getPropertyValueEnum((UProperty)property, name);
40}
41
42UniProps::UniProps()
43        : start(U_SENTINEL), end(U_SENTINEL),
44          bmg(U_SENTINEL),
45          scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
46          digitValue(-1), numericValue(NULL),
47          name(NULL), nameAlias(NULL) {
48    memset(binProps, 0, sizeof(binProps));
49    memset(intProps, 0, sizeof(intProps));
50    memset(age, 0, 4);
51}
52
53UniProps::~UniProps() {}
54
55const int32_t PreparsedUCD::kNumLineBuffers;
56
57PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
58        : icuPnames(new PropertyNames()), pnames(icuPnames),
59          file(NULL),
60          defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
61          lineNumber(0),
62          lineType(NO_LINE),
63          fieldLimit(NULL), lineLimit(NULL) {
64    if(U_FAILURE(errorCode)) { return; }
65
66    if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
67        filename=NULL;
68        file=stdin;
69    } else {
70        file=fopen(filename, "r");
71    }
72    if(file==NULL) {
73        perror("error opening preparsed UCD");
74        fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
75        errorCode=U_FILE_ACCESS_ERROR;
76        return;
77    }
78
79    memset(ucdVersion, 0, 4);
80    lines[0][0]=0;
81}
82
83PreparsedUCD::~PreparsedUCD() {
84    if(file!=stdin) {
85        fclose(file);
86    }
87    delete icuPnames;
88}
89
// First-field keywords of the preparsed UCD lines, indexed by line type.
// Same order as the LineType values.
// The first two entries are NULL; readLine() never compares against them
// because its search starts after EMPTY_LINE.
// Both the strings and the table itself are read-only.
static const char *const lineTypeStrings[]={
    NULL,
    NULL,
    "ucd",
    "property",
    "binary",
    "value",
    "defaults",
    "block",
    "cp",
    "algnamesrange"
};
103
104PreparsedUCD::LineType
105PreparsedUCD::readLine(UErrorCode &errorCode) {
106    if(U_FAILURE(errorCode)) { return NO_LINE; }
107    // Select the next available line buffer.
108    while(!isLineBufferAvailable(lineIndex)) {
109        ++lineIndex;
110        if (lineIndex == kNumLineBuffers) {
111            lineIndex = 0;
112        }
113    }
114    char *line=lines[lineIndex];
115    *line=0;
116    lineLimit=fieldLimit=line;
117    lineType=NO_LINE;
118    char *result=fgets(line, sizeof(lines[0]), file);
119    if(result==NULL) {
120        if(ferror(file)) {
121            perror("error reading preparsed UCD");
122            fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
123            errorCode=U_FILE_ACCESS_ERROR;
124        }
125        return NO_LINE;
126    }
127    ++lineNumber;
128    if(*line=='#') {
129        fieldLimit=strchr(line, 0);
130        return lineType=EMPTY_LINE;
131    }
132    // Remove trailing /r/n.
133    char c;
134    char *limit=strchr(line, 0);
135    while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
136    // Remove trailing white space.
137    while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
138    *limit=0;
139    lineLimit=limit;
140    if(line==limit) {
141        fieldLimit=limit;
142        return lineType=EMPTY_LINE;
143    }
144    // Split by ';'.
145    char *semi=line;
146    while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
147    fieldLimit=strchr(line, 0);
148    // Determine the line type.
149    int32_t type;
150    for(type=EMPTY_LINE+1;; ++type) {
151        if(type==LINE_TYPE_COUNT) {
152            fprintf(stderr,
153                    "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
154                    line, (long)lineNumber);
155            errorCode=U_PARSE_ERROR;
156            return NO_LINE;
157        }
158        if(0==strcmp(line, lineTypeStrings[type])) {
159            break;
160        }
161    }
162    lineType=(LineType)type;
163    if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
164        u_versionFromString(ucdVersion, fieldLimit+1);
165    }
166    return lineType;
167}
168
169const char *
170PreparsedUCD::firstField() {
171    char *field=lines[lineIndex];
172    fieldLimit=strchr(field, 0);
173    return field;
174}
175
176const char *
177PreparsedUCD::nextField() {
178    if(fieldLimit==lineLimit) { return NULL; }
179    char *field=fieldLimit+1;
180    fieldLimit=strchr(field, 0);
181    return field;
182}
183
184const UniProps *
185PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
186    if(U_FAILURE(errorCode)) { return NULL; }
187    newValues.clear();
188    if(!lineHasPropertyValues()) {
189        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
190        return NULL;
191    }
192    firstField();
193    const char *field=nextField();
194    if(field==NULL) {
195        // No range field after the type.
196        fprintf(stderr,
197                "error in preparsed UCD: missing default/block/cp range field "
198                "(no second field) on line %ld\n",
199                (long)lineNumber);
200        errorCode=U_PARSE_ERROR;
201        return NULL;
202    }
203    UChar32 start, end;
204    if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
205    UniProps *props;
206    switch(lineType) {
207    case DEFAULTS_LINE:
208        if(defaultLineIndex>=0) {
209            fprintf(stderr,
210                    "error in preparsed UCD: second line with default properties on line %ld\n",
211                    (long)lineNumber);
212            errorCode=U_PARSE_ERROR;
213            return NULL;
214        }
215        if(start!=0 || end!=0x10ffff) {
216            fprintf(stderr,
217                    "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
218                    field, (long)lineNumber);
219            errorCode=U_PARSE_ERROR;
220            return NULL;
221        }
222        props=&defaultProps;
223        defaultLineIndex=lineIndex;
224        break;
225    case BLOCK_LINE:
226        blockProps=defaultProps;  // Block inherits default properties.
227        props=&blockProps;
228        blockLineIndex=lineIndex;
229        break;
230    case CP_LINE:
231        if(blockProps.start<=start && end<=blockProps.end) {
232            // Code point range fully inside the last block inherits the block properties.
233            cpProps=blockProps;
234        } else if(start>blockProps.end || end<blockProps.start) {
235            // Code point range fully outside the last block inherits the default properties.
236            cpProps=defaultProps;
237        } else {
238            // Code point range partially overlapping with the last block is illegal.
239            fprintf(stderr,
240                    "error in preparsed UCD: cp range %s on line %ld only "
241                    "partially overlaps with block range %04lX..%04lX\n",
242                    field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
243            errorCode=U_PARSE_ERROR;
244            return NULL;
245        }
246        props=&cpProps;
247        break;
248    default:
249        // Will not occur because of the range check above.
250        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
251        return NULL;
252    }
253    props->start=start;
254    props->end=end;
255    while((field=nextField())!=NULL) {
256        if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
257    }
258    return props;
259}
260
261static const struct {
262    const char *name;
263    int32_t prop;
264} ppucdProperties[]={
265    { "Name_Alias", PPUCD_NAME_ALIAS },
266    { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
267    { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
268};
269
270// Returns TRUE for "ok to continue parsing fields".
271UBool
272PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
273                            UErrorCode &errorCode) {
274    CharString pBuffer;
275    const char *p=field;
276    const char *v=strchr(p, '=');
277    int binaryValue;
278    if(*p=='-') {
279        if(v!=NULL) {
280            fprintf(stderr,
281                    "error in preparsed UCD: mix of binary-property-no and "
282                    "enum-property syntax '%s' on line %ld\n",
283                    field, (long)lineNumber);
284            errorCode=U_PARSE_ERROR;
285            return FALSE;
286        }
287        binaryValue=0;
288        ++p;
289    } else if(v==NULL) {
290        binaryValue=1;
291    } else {
292        binaryValue=-1;
293        // Copy out the property name rather than modifying the field (writing a NUL).
294        pBuffer.append(p, (int32_t)(v-p), errorCode);
295        p=pBuffer.data();
296        ++v;
297    }
298    int32_t prop=pnames->getPropertyEnum(p);
299    if(prop<0) {
300        for(int32_t i=0;; ++i) {
301            if(i==LENGTHOF(ppucdProperties)) {
302                // Ignore unknown property names.
303                return TRUE;
304            }
305            if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
306                prop=ppucdProperties[i].prop;
307                U_ASSERT(prop>=0);
308                break;
309            }
310        }
311    }
312    if(prop<UCHAR_BINARY_LIMIT) {
313        if(binaryValue>=0) {
314            props.binProps[prop]=(UBool)binaryValue;
315        } else {
316            // No binary value for a binary property.
317            fprintf(stderr,
318                    "error in preparsed UCD: enum-property syntax '%s' "
319                    "for binary property on line %ld\n",
320                    field, (long)lineNumber);
321            errorCode=U_PARSE_ERROR;
322        }
323    } else if(binaryValue>=0) {
324        // Binary value for a non-binary property.
325        fprintf(stderr,
326                "error in preparsed UCD: binary-property syntax '%s' "
327                "for non-binary property on line %ld\n",
328                field, (long)lineNumber);
329        errorCode=U_PARSE_ERROR;
330    } else if (prop < UCHAR_INT_START) {
331        fprintf(stderr,
332                "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
333                prop, (long)lineNumber);
334        errorCode=U_PARSE_ERROR;
335    } else if(prop<UCHAR_INT_LIMIT) {
336        int32_t value=pnames->getPropertyValueEnum(prop, v);
337        if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
338            // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
339            char *end;
340            unsigned long ccc=uprv_strtoul(v, &end, 10);
341            if(v<end && *end==0 && ccc<=254) {
342                value=(int32_t)ccc;
343            }
344        }
345        if(value==UCHAR_INVALID_CODE) {
346            fprintf(stderr,
347                    "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
348                    field, (long)lineNumber);
349            errorCode=U_PARSE_ERROR;
350        } else {
351            props.intProps[prop-UCHAR_INT_START]=value;
352        }
353    } else if(*v=='<') {
354        // Do not parse default values like <code point>, just set null values.
355        switch(prop) {
356        case UCHAR_BIDI_MIRRORING_GLYPH:
357            props.bmg=U_SENTINEL;
358            break;
359        case UCHAR_SIMPLE_CASE_FOLDING:
360            props.scf=U_SENTINEL;
361            break;
362        case UCHAR_SIMPLE_LOWERCASE_MAPPING:
363            props.slc=U_SENTINEL;
364            break;
365        case UCHAR_SIMPLE_TITLECASE_MAPPING:
366            props.stc=U_SENTINEL;
367            break;
368        case UCHAR_SIMPLE_UPPERCASE_MAPPING:
369            props.suc=U_SENTINEL;
370            break;
371        case UCHAR_CASE_FOLDING:
372            props.cf.remove();
373            break;
374        case UCHAR_LOWERCASE_MAPPING:
375            props.lc.remove();
376            break;
377        case UCHAR_TITLECASE_MAPPING:
378            props.tc.remove();
379            break;
380        case UCHAR_UPPERCASE_MAPPING:
381            props.uc.remove();
382            break;
383        case UCHAR_SCRIPT_EXTENSIONS:
384            props.scx.clear();
385            break;
386        default:
387            fprintf(stderr,
388                    "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
389                    field, (long)lineNumber);
390            errorCode=U_PARSE_ERROR;
391        }
392    } else {
393        char c;
394        switch(prop) {
395        case UCHAR_NUMERIC_VALUE:
396            props.numericValue=v;
397            c=*v;
398            if('0'<=c && c<='9' && v[1]==0) {
399                props.digitValue=c-'0';
400            } else {
401                props.digitValue=-1;
402            }
403            break;
404        case UCHAR_NAME:
405            props.name=v;
406            break;
407        case UCHAR_AGE:
408            u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
409            break;
410        case UCHAR_BIDI_MIRRORING_GLYPH:
411            props.bmg=parseCodePoint(v, errorCode);
412            break;
413        case UCHAR_SIMPLE_CASE_FOLDING:
414            props.scf=parseCodePoint(v, errorCode);
415            break;
416        case UCHAR_SIMPLE_LOWERCASE_MAPPING:
417            props.slc=parseCodePoint(v, errorCode);
418            break;
419        case UCHAR_SIMPLE_TITLECASE_MAPPING:
420            props.stc=parseCodePoint(v, errorCode);
421            break;
422        case UCHAR_SIMPLE_UPPERCASE_MAPPING:
423            props.suc=parseCodePoint(v, errorCode);
424            break;
425        case UCHAR_CASE_FOLDING:
426            parseString(v, props.cf, errorCode);
427            break;
428        case UCHAR_LOWERCASE_MAPPING:
429            parseString(v, props.lc, errorCode);
430            break;
431        case UCHAR_TITLECASE_MAPPING:
432            parseString(v, props.tc, errorCode);
433            break;
434        case UCHAR_UPPERCASE_MAPPING:
435            parseString(v, props.uc, errorCode);
436            break;
437        case PPUCD_NAME_ALIAS:
438            props.nameAlias=v;
439            break;
440        case PPUCD_CONDITIONAL_CASE_MAPPINGS:
441        case PPUCD_TURKIC_CASE_FOLDING:
442            // No need to parse their values: They are hardcoded in the runtime library.
443            break;
444        case UCHAR_SCRIPT_EXTENSIONS:
445            parseScriptExtensions(v, props.scx, errorCode);
446            break;
447        default:
448            // Ignore unhandled properties.
449            return TRUE;
450        }
451    }
452    if(U_SUCCESS(errorCode)) {
453        newValues.add((UChar32)prop);
454        return TRUE;
455    } else {
456        return FALSE;
457    }
458}
459
460UBool
461PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
462    if(U_FAILURE(errorCode)) { return FALSE; }
463    if(lineType!=ALG_NAMES_RANGE_LINE) {
464        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
465        return FALSE;
466    }
467    firstField();
468    const char *field=nextField();
469    if(field==NULL) {
470        // No range field after the type.
471        fprintf(stderr,
472                "error in preparsed UCD: missing algnamesrange range field "
473                "(no second field) on line %ld\n",
474                (long)lineNumber);
475        errorCode=U_PARSE_ERROR;
476        return FALSE;
477    }
478    return parseCodePointRange(field, start, end, errorCode);
479}
480
481UChar32
482PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
483    char *end;
484    uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
485    if(end<=s || *end!=0 || value>=0x110000) {
486        fprintf(stderr,
487                "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
488                s, (long)lineNumber);
489        errorCode=U_PARSE_ERROR;
490        return U_SENTINEL;
491    }
492    return (UChar32)value;
493}
494
495UBool
496PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
497    uint32_t st, e;
498    u_parseCodePointRange(s, &st, &e, &errorCode);
499    if(U_FAILURE(errorCode)) {
500        fprintf(stderr,
501                "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
502                s, (long)lineNumber);
503        return FALSE;
504    }
505    start=(UChar32)st;
506    end=(UChar32)e;
507    return TRUE;
508}
509
510void
511PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
512    UChar *buffer=uni.getBuffer(-1);
513    int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
514    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
515        errorCode=U_ZERO_ERROR;
516        uni.releaseBuffer(0);
517        buffer=uni.getBuffer(length);
518        length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
519    }
520    uni.releaseBuffer(length);
521    if(U_FAILURE(errorCode)) {
522        fprintf(stderr,
523                "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
524                s, (long)lineNumber);
525    }
526}
527
528void
529PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
530    if(U_FAILURE(errorCode)) { return; }
531    scx.clear();
532    CharString scString;
533    for(;;) {
534        const char *scs;
535        const char *scLimit=strchr(s, ' ');
536        if(scLimit!=NULL) {
537            scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
538            if(U_FAILURE(errorCode)) { return; }
539        } else {
540            scs=s;
541        }
542        int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
543        if(script==UCHAR_INVALID_CODE) {
544            fprintf(stderr,
545                    "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
546                    scs, (long)lineNumber);
547            errorCode=U_PARSE_ERROR;
548            return;
549        } else if(scx.contains(script)) {
550            fprintf(stderr,
551                    "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
552                    scs, (long)lineNumber);
553            errorCode=U_PARSE_ERROR;
554            return;
555        } else {
556            scx.add(script);
557        }
558        if(scLimit!=NULL) {
559            s=scLimit+1;
560        } else {
561            break;
562        }
563    }
564    if(scx.isEmpty()) {
565        fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
566        errorCode=U_PARSE_ERROR;
567    }
568}
569
570U_NAMESPACE_END
571