1/*
2*******************************************************************************
3*
4*   Copyright (C) 2009-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  gennorm2.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2009nov25
14*   created by: Markus W. Scherer
15*
16*   This program reads text files that define Unicode normalization,
17*   parses them, and builds a binary data file.
18*/
19
20#include "unicode/utypes.h"
21#include "n2builder.h"
22
23#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include "unicode/errorcode.h"
27#include "unicode/localpointer.h"
28#include "unicode/putil.h"
29#include "unicode/uchar.h"
30#include "unicode/unistr.h"
31#include "charstr.h"
32#include "normalizer2impl.h"
33#include "toolutil.h"
34#include "uoptions.h"
35#include "uparse.h"
36
37#if UCONFIG_NO_NORMALIZATION
38#include "unewdata.h"
39#endif
40
/* Number of elements in a true array (not a pointer), as an int32_t. */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
42
43U_NAMESPACE_BEGIN
44
// Tool-wide flags; main() overwrites both from the --verbose and --copyright
// command-line options after argument parsing.
UBool beVerbose=FALSE, haveCopyright=TRUE;

// Declares LocalStdioFilePointer, a local-pointer wrapper for FILE * that is
// parameterized with fclose as its close function. main() uses it to hold
// each input file it opens.
U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);

#if !UCONFIG_NO_NORMALIZATION
// Forward declaration; defined after main(). Only available when
// normalization support is compiled in.
void parseFile(FILE *f, Normalizer2DataBuilder &builder);
#endif
52
53/* -------------------------------------------------------------------------- */
54
// Indexes into the options[] array below. main() looks options up by these
// constants, so the enumerator order must match the order of the
// initializers in options[].
enum {
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    SOURCEDIR,
    OUTPUT_FILENAME,
    UNICODE_VERSION,
    OPT_FAST
};
65
// Command-line option table handed to u_parseArgs() in main().
// Entries are positional: each one corresponds to the enum constant with the
// same index above.
static UOption options[]={
    UOPTION_HELP_H,                                 // HELP_H
    UOPTION_HELP_QUESTION_MARK,                     // HELP_QUESTION_MARK
    UOPTION_VERBOSE,                                // VERBOSE
    UOPTION_COPYRIGHT,                              // COPYRIGHT
    UOPTION_SOURCEDIR,                              // SOURCEDIR
    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),  // OUTPUT_FILENAME
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), // UNICODE_VERSION
    // OPT_FAST: the usage text advertises only the long form --fast;
    // NOTE(review): '\1' presumably serves as a placeholder short name - confirm.
    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
};
76
77extern "C" int
78main(int argc, char* argv[]) {
79    U_MAIN_INIT_ARGS(argc, argv);
80
81    /* preset then read command line options */
82    options[SOURCEDIR].value="";
83    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
84
85    /* error handling, printing usage message */
86    if(argc<0) {
87        fprintf(stderr,
88            "error in command line argument \"%s\"\n",
89            argv[-argc]);
90    }
91    if(!options[OUTPUT_FILENAME].doesOccur) {
92        argc=-1;
93    }
94    if( argc<2 ||
95        options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
96    ) {
97        /*
98         * Broken into chunks because the C89 standard says the minimum
99         * required supported string length is 509 bytes.
100         */
101        fprintf(stderr,
102            "Usage: %s [-options] infiles+ -o outputfilename\n"
103            "\n"
104            "Reads the infiles with normalization data and\n"
105            "creates a binary file (outputfilename) with the data.\n"
106            "\n",
107            argv[0]);
108        fprintf(stderr,
109            "Options:\n"
110            "\t-h or -? or --help  this usage text\n"
111            "\t-v or --verbose     verbose output\n"
112            "\t-c or --copyright   include a copyright notice\n"
113            "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
114        fprintf(stderr,
115            "\t-s or --sourcedir   source directory, followed by the path\n"
116            "\t-o or --output      output filename\n");
117        fprintf(stderr,
118            "\t      --fast        optimize the .nrm file for fast normalization,\n"
119            "\t                    which might increase its size  (Writes fully decomposed\n"
120            "\t                    regular mappings instead of delta mappings.\n"
121            "\t                    You should measure the runtime speed to make sure that\n"
122            "\t                    this is a good trade-off.)\n");
123        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
124    }
125
126    beVerbose=options[VERBOSE].doesOccur;
127    haveCopyright=options[COPYRIGHT].doesOccur;
128
129    IcuToolErrorCode errorCode("gennorm2/main()");
130
131#if UCONFIG_NO_NORMALIZATION
132
133    fprintf(stderr,
134        "gennorm2 writes a dummy binary data file "
135        "because UCONFIG_NO_NORMALIZATION is set, \n"
136        "see icu/source/common/unicode/uconfig.h\n");
137    udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
138    // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
139    // return U_UNSUPPORTED_ERROR;
140    return 0;
141
142#else
143
144    LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
145    errorCode.assertSuccess();
146
147    if(options[UNICODE_VERSION].doesOccur) {
148        builder->setUnicodeVersion(options[UNICODE_VERSION].value);
149    }
150
151    if(options[OPT_FAST].doesOccur) {
152        builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
153    }
154
155    // prepare the filename beginning with the source dir
156    CharString filename(options[SOURCEDIR].value, errorCode);
157    int32_t pathLength=filename.length();
158    if( pathLength>0 &&
159        filename[pathLength-1]!=U_FILE_SEP_CHAR &&
160        filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
161    ) {
162        filename.append(U_FILE_SEP_CHAR, errorCode);
163        pathLength=filename.length();
164    }
165
166    for(int i=1; i<argc; ++i) {
167        printf("gennorm2: processing %s\n", argv[i]);
168        filename.append(argv[i], errorCode);
169        LocalStdioFilePointer f(fopen(filename.data(), "r"));
170        if(f==NULL) {
171            fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
172            exit(U_FILE_ACCESS_ERROR);
173        }
174        builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
175        parseFile(f.getAlias(), *builder);
176        filename.truncate(pathLength);
177    }
178
179    builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
180
181    return errorCode.get();
182
183#endif
184}
185
186#if !UCONFIG_NO_NORMALIZATION
187
188void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
189    IcuToolErrorCode errorCode("gennorm2/parseFile()");
190    char line[300];
191    uint32_t startCP, endCP;
192    while(NULL!=fgets(line, (int)sizeof(line), f)) {
193        char *comment=(char *)strchr(line, '#');
194        if(comment!=NULL) {
195            *comment=0;
196        }
197        u_rtrim(line);
198        if(line[0]==0) {
199            continue;  // skip empty and comment-only lines
200        }
201        if(line[0]=='*') {
202            const char *s=u_skipWhitespace(line+1);
203            if(0==strncmp(s, "Unicode", 7)) {
204                s=u_skipWhitespace(s+7);
205                builder.setUnicodeVersion(s);
206            }
207            continue;  // reserved syntax
208        }
209        const char *delimiter;
210        int32_t rangeLength=
211            u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
212        if(errorCode.isFailure()) {
213            fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
214            exit(errorCode.reset());
215        }
216        delimiter=u_skipWhitespace(delimiter);
217        if(*delimiter==':') {
218            const char *s=u_skipWhitespace(delimiter+1);
219            char *end;
220            unsigned long value=strtoul(s, &end, 10);
221            if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
222                fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
223                exit(U_PARSE_ERROR);
224            }
225            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
226                builder.setCC(c, (uint8_t)value);
227            }
228            continue;
229        }
230        if(*delimiter=='-') {
231            if(*u_skipWhitespace(delimiter+1)!=0) {
232                fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
233                exit(U_PARSE_ERROR);
234            }
235            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
236                builder.removeMapping(c);
237            }
238            continue;
239        }
240        if(*delimiter=='=' || *delimiter=='>') {
241            UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
242            int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
243            if(errorCode.isFailure()) {
244                fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
245                exit(errorCode.reset());
246            }
247            UnicodeString mapping(FALSE, uchars, length);
248            if(*delimiter=='=') {
249                if(rangeLength!=1) {
250                    fprintf(stderr,
251                            "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
252                            line);
253                    exit(U_PARSE_ERROR);
254                }
255                builder.setRoundTripMapping((UChar32)startCP, mapping);
256            } else {
257                for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
258                    builder.setOneWayMapping(c, mapping);
259                }
260            }
261            continue;
262        }
263        fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
264        exit(U_PARSE_ERROR);
265    }
266}
267
268#endif // !UCONFIG_NO_NORMALIZATION
269
270U_NAMESPACE_END
271
272/*
273 * Hey, Emacs, please set the following:
274 *
275 * Local Variables:
276 * indent-tabs-mode: nil
277 * End:
278 *
279 */
280