1/*
2**********************************************************************
3*   Copyright (C) 2002-2009, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*
7* File genctd.c
8*/
9
10//--------------------------------------------------------------------
11//
12//   Tool for generating CompactTrieDictionary data files (.ctd files).
13//
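//   Usage:  genctd [options] -o output-file.ctd input-file
//
//       options:   -v         verbose
//                  -? or -h   help
//                  -c         include a copyright notice in the output
//                  -i dir     ICU data directory
//                  -d dir     destination directory for the output file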
18//
//   The input file is a plain text file containing words, one per line.
//    Words end at the first whitespace; lines beginning with whitespace
//    are ignored.
//    The file can be encoded as UTF-8 or UTF-16 (either endianness), or
//    in the platform-dependent default code page.  UTF-encoded
//    files must include a BOM.
25//
26//--------------------------------------------------------------------
27
28#include "unicode/utypes.h"
29#include "unicode/uchar.h"
30#include "unicode/ucnv.h"
31#include "unicode/uniset.h"
32#include "unicode/unistr.h"
33#include "unicode/uclean.h"
34#include "unicode/udata.h"
35#include "unicode/putil.h"
36
37#include "uoptions.h"
38#include "unewdata.h"
39#include "ucmndata.h"
40#include "rbbidata.h"
41#include "triedict.h"
42#include "cmemory.h"
43
44#include <stdio.h>
45#include <stdlib.h>
46#include <string.h>
47
48U_NAMESPACE_USE
49
50static char *progName;
51static UOption options[]={
52    UOPTION_HELP_H,             /* 0 */
53    UOPTION_HELP_QUESTION_MARK, /* 1 */
54    UOPTION_VERBOSE,            /* 2 */
55    { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 3 */
56    UOPTION_ICUDATADIR,         /* 4 */
57    UOPTION_DESTDIR,            /* 5 */
58    UOPTION_COPYRIGHT,          /* 6 */
59};
60
61void usageAndDie(int retCode) {
62        printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
63        printf("\tRead in word list and write out compact trie dictionary\n"
64            "options:\n"
65            "\t-h or -? or --help  this usage text\n"
66            "\t-V or --version     show a version message\n"
67            "\t-c or --copyright   include a copyright notice\n"
68            "\t-v or --verbose     turn on verbose output\n"
69            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
70            "\t                    followed by path, defaults to %s\n"
71            "\t-d or --destdir     destination directory, followed by the path\n",
72            u_getDataDirectory());
73        exit (retCode);
74}
75
76
77#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
78
79/* dummy UDataInfo cf. udata.h */
80static UDataInfo dummyDataInfo = {
81    sizeof(UDataInfo),
82    0,
83
84    U_IS_BIG_ENDIAN,
85    U_CHARSET_FAMILY,
86    U_SIZEOF_UCHAR,
87    0,
88
89    { 0, 0, 0, 0 },                 /* dummy dataFormat */
90    { 0, 0, 0, 0 },                 /* dummy formatVersion */
91    { 0, 0, 0, 0 }                  /* dummy dataVersion */
92};
93
94#else
95
96//
97//  Set up the ICU data header, defined in ucmndata.h
98//
99DataHeader dh ={
100    {sizeof(DataHeader),           // Struct MappedData
101        0xda,
102        0x27},
103
104    {                               // struct UDataInfo
105        sizeof(UDataInfo),          //     size
106        0,                          //     reserved
107        U_IS_BIG_ENDIAN,
108        U_CHARSET_FAMILY,
109        U_SIZEOF_UCHAR,
110        0,                          //     reserved
111
112    { 0x54, 0x72, 0x44, 0x63 },     // "TrDc" Trie Dictionary
113    { 1, 0, 0, 0 },                 // 1.0.0.0
114    { 0, 0, 0, 0 },                 // Irrelevant for this data type
115    }};
116
117#endif
118
119//----------------------------------------------------------------------------
120//
121//  main      for genctd
122//
123//----------------------------------------------------------------------------
124int  main(int argc, char **argv) {
125    UErrorCode  status = U_ZERO_ERROR;
126    const char *wordFileName;
127    const char *outFileName;
128    const char *outDir = NULL;
129    const char *copyright = NULL;
130
131    //
132    // Pick up and check the command line arguments,
133    //    using the standard ICU tool utils option handling.
134    //
135    U_MAIN_INIT_ARGS(argc, argv);
136    progName = argv[0];
137    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
138    if(argc<0) {
139        // Unrecognized option
140        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
141        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
142    }
143
144    if(options[0].doesOccur || options[1].doesOccur) {
145        //  -? or -h for help.
146        usageAndDie(0);
147    }
148
149    if (!options[3].doesOccur || argc < 2) {
150        fprintf(stderr, "input and output file must both be specified.\n");
151        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
152    }
153    outFileName  = options[3].value;
154    wordFileName = argv[1];
155
156    if (options[4].doesOccur) {
157        u_setDataDirectory(options[4].value);
158    }
159
160    status = U_ZERO_ERROR;
161
162    /* Combine the directory with the file name */
163    if(options[5].doesOccur) {
164        outDir = options[5].value;
165    }
166    if (options[6].doesOccur) {
167        copyright = U_COPYRIGHT_STRING;
168    }
169
170#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
171
172    UNewDataMemory *pData;
173    char msg[1024];
174
175    /* write message with just the name */
176    sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
177    fprintf(stderr, "%s\n", msg);
178
179    /* write the dummy data file */
180    pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
181    udata_writeBlock(pData, msg, strlen(msg));
182    udata_finish(pData, &status);
183    return (int)status;
184
185#else
186    /* Initialize ICU */
187    u_init(&status);
188    if (U_FAILURE(status)) {
189        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
190            argv[0], u_errorName(status));
191        exit(1);
192    }
193    status = U_ZERO_ERROR;
194
195    //
196    //  Read in the dictionary source file
197    //
198    long        result;
199    long        wordFileSize;
200    FILE        *file;
201    char        *wordBufferC;
202
203    file = fopen(wordFileName, "rb");
204    if( file == 0 ) {
205        fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
206        exit(-1);
207    }
208    fseek(file, 0, SEEK_END);
209    wordFileSize = ftell(file);
210    fseek(file, 0, SEEK_SET);
211    wordBufferC = new char[wordFileSize+10];
212
213    result = (long)fread(wordBufferC, 1, wordFileSize, file);
214    if (result != wordFileSize)  {
215        fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
216        exit (-1);
217    }
218    wordBufferC[wordFileSize]=0;
219    fclose(file);
220
221    //
222    // Look for a Unicode Signature (BOM) on the word file
223    //
224    int32_t        signatureLength;
225    const char *   wordSourceC = wordBufferC;
226    const char*    encoding = ucnv_detectUnicodeSignature(
227                           wordSourceC, wordFileSize, &signatureLength, &status);
228    if (U_FAILURE(status)) {
229        exit(status);
230    }
231    if(encoding!=NULL ){
232        wordSourceC  += signatureLength;
233        wordFileSize -= signatureLength;
234    }
235
236    //
237    // Open a converter to take the rule file to UTF-16
238    //
239    UConverter* conv;
240    conv = ucnv_open(encoding, &status);
241    if (U_FAILURE(status)) {
242        fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
243        exit(status);
244    }
245
246    //
247    // Convert the words to UChar.
248    //  Preflight first to determine required buffer size.
249    //
250    uint32_t destCap = ucnv_toUChars(conv,
251                       NULL,           //  dest,
252                       0,              //  destCapacity,
253                       wordSourceC,
254                       wordFileSize,
255                       &status);
256    if (status != U_BUFFER_OVERFLOW_ERROR) {
257        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
258        exit(status);
259    };
260
261    status = U_ZERO_ERROR;
262    UChar *wordSourceU = new UChar[destCap+1];
263    ucnv_toUChars(conv,
264                  wordSourceU,     //  dest,
265                  destCap+1,
266                  wordSourceC,
267                  wordFileSize,
268                  &status);
269    if (U_FAILURE(status)) {
270        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
271        exit(status);
272    };
273    ucnv_close(conv);
274
275    // Get rid of the original file buffer
276    delete[] wordBufferC;
277
278    // Create a MutableTrieDictionary, and loop through all the lines, inserting
279    // words.
280
281    // First, pick a median character.
282    UChar *current = wordSourceU + (destCap/2);
283    UChar uc = *current++;
284    UnicodeSet breaks;
285    breaks.add(0x000A);     // Line Feed
286    breaks.add(0x000D);     // Carriage Return
287    breaks.add(0x2028);     // Line Separator
288    breaks.add(0x2029);     // Paragraph Separator
289
290    do {
291        // Look for line break
292        while (uc && !breaks.contains(uc)) {
293            uc = *current++;
294        }
295        // Now skip to first non-line-break
296        while (uc && breaks.contains(uc)) {
297            uc = *current++;
298        }
299    }
300    while (uc && (breaks.contains(uc) || u_isspace(uc)));
301
302    MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
303
304    if (U_FAILURE(status)) {
305        fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
306        exit(status);
307    }
308
309    // Now add the words. Words are non-space characters at the beginning of
310    // lines, and must be at least one UChar.
311    current = wordSourceU;
312    UChar *candidate = current;
313    uc = *current++;
314    int32_t length = 0;
315
316    while (uc) {
317        while (uc && !u_isspace(uc)) {
318            ++length;
319            uc = *current++;
320        }
321        if (length > 0) {
322            mtd->addWord(candidate, length, status);
323            if (U_FAILURE(status)) {
324                fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
325                        u_errorName(status));
326                exit(status);
327            }
328        }
329        // Find beginning of next line
330        while (uc && !breaks.contains(uc)) {
331            uc = *current++;
332        }
333        while (uc && breaks.contains(uc)) {
334            uc = *current++;
335        }
336        candidate = current-1;
337        length = 0;
338    }
339
340    // Get rid of the Unicode text buffer
341    delete[] wordSourceU;
342
343    // Now, create a CompactTrieDictionary from the mutable dictionary
344    CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
345    if (U_FAILURE(status)) {
346        fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
347        exit(status);
348    }
349
350    // Get rid of the MutableTrieDictionary
351    delete mtd;
352
353    //
354    //  Get the binary data from the dictionary.
355    //
356    uint32_t        outDataSize = ctd->dataSize();
357    const uint8_t  *outData = (const uint8_t *)ctd->data();
358
359    //
360    //  Create the output file
361    //
362    size_t bytesWritten;
363    UNewDataMemory *pData;
364    pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
365    if(U_FAILURE(status)) {
366        fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
367                         outFileName, u_errorName(status));
368        exit(status);
369    }
370
371
372    //  Write the data itself.
373    udata_writeBlock(pData, outData, outDataSize);
374    // finish up
375    bytesWritten = udata_finish(pData, &status);
376    if(U_FAILURE(status)) {
377        fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
378        exit(status);
379    }
380
381    if (bytesWritten != outDataSize) {
382        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
383        exit(-1);
384    }
385
386    // Get rid of the CompactTrieDictionary
387    delete ctd;
388
389    u_cleanup();
390
391    printf("genctd: tool completed successfully.\n");
392    return 0;
393
394#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
395}
396
397