1/**************************************************************************
2*
3*   Copyright (C) 2002-2010, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*
6***************************************************************************
7*/
8
9//
10//   ugrep  - an ICU sample program illustrating the use of ICU Regular Expressions.
11//
12//            The use of the ICU Regex API all occurs within the main()
//            function.  The rest of the code deals with opening files,
14//            encoding conversions, printing results, etc.
15//
16//            This is not a full-featured grep program.  The command line options
17//            have been kept to a minimum to avoid complicating the sample code.
18//
19
20
21
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25
26#include "unicode/utypes.h"
27#include "unicode/ustring.h"
28#include "unicode/regex.h"
29#include "unicode/ucnv.h"
30#include "unicode/uclean.h"
31
32
33//
34//  The following variables contain paramters that may be set from the command line.
35//
36const char *pattern = NULL;     // The regular expression
37int        firstFileNum;        //  argv index of the first file name
38UBool      displayFileName = FALSE;
39UBool      displayLineNum  = FALSE;
40
41
42//
43//  Info regarding the file currently being processed
44//
45const char *fileName;
46int         fileLen;              // Length, in UTF-16 Code Units.
47
48UChar      *ucharBuf = 0;         // Buffer, holds converted file.  (Simple minded program, always reads
49                                  //   the whole file at once.
50
51char       *charBuf = 0;          // Buffer, for original, unconverted file data.
52
53
54//
55//  Info regarding the line currently being processed
56//
57int      lineStart;     // Index of first char of the current line in the file buffer
58int      lineEnd;       // Index of char following the new line sequence for the current line
59int      lineNum;
60
61//
62//  Converter, used on output to convert Unicode data back to char *
63//             so that it will display in non-Unicode terminal windows.
64//
65UConverter  *outConverter = 0;
66
67//
68//  Function forward declarations
69//
70void processOptions(int argc, const char **argv);
71void nextLine(int start);
72void printMatch();
73void printUsage();
74void readFile(const char *name);
75
76
77
78//------------------------------------------------------------------------------------------
79//
80//   main          for ugrep
81//
82//           Structurally, all use of the ICU Regular Expression API is in main(),
83//           and all of the supporting stuff necessary to make a running program, but
84//           not directly related to regular expressions, is factored out into these other
85//           functions.
86//
87//------------------------------------------------------------------------------------------
88int main(int argc, const char** argv) {
89    UBool     matchFound = FALSE;
90
91    //
92    //  Process the commmand line options.
93    //
94    processOptions(argc, argv);
95
96    //
97    // Create a RegexPattern object from the user supplied pattern string.
98    //
99    UErrorCode status = U_ZERO_ERROR;   // All ICU operations report success or failure
100                                        //   in a status variable.
101
102    UParseError    parseErr;            // In the event of a syntax error in the regex pattern,
103                                        //   this struct will contain the position of the
104                                        //   error.
105
106    RegexPattern  *rePat = RegexPattern::compile(pattern, parseErr, status);
107                                        // Note that C++ is doing an automatic conversion
108                                        //  of the (char *) pattern to a temporary
109                                        //  UnicodeString object.
110    if (U_FAILURE(status)) {
111        fprintf(stderr, "ugrep:  error in pattern: \"%s\" at position %d\n",
112            u_errorName(status), parseErr.offset);
113        exit(-1);
114    }
115
116    //
117    // Create a RegexMatcher from the newly created pattern.
118    //
119    UnicodeString empty;
120    RegexMatcher *matcher = rePat->matcher(empty, status);
121    if (U_FAILURE(status)) {
122        fprintf(stderr, "ugrep:  error in creating RegexMatcher: \"%s\"\n",
123            u_errorName(status));
124        exit(-1);
125    }
126
127    //
128    // Loop, processing each of the input files.
129    //
130    for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
131        readFile(argv[fileNum]);
132
133        //
134        //  Loop through the lines of a file, trying to match the regex pattern on each.
135        //
136        for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
137            UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
138            matcher->reset(s);
139            if (matcher->find()) {
140                matchFound = TRUE;
141                printMatch();
142            }
143        }
144    }
145
146    //
147    //  Clean up
148    //
149    delete matcher;
150    delete rePat;
151    free(ucharBuf);
152    free(charBuf);
153    ucnv_close(outConverter);
154
155    u_cleanup();       // shut down ICU, release any cached data it owns.
156
157    return matchFound? 0: 1;
158}
159
160
161
162//------------------------------------------------------------------------------------------
163//
164//   doOptions          Run through the command line options, and set
165//                      the global variables accordingly.
166//
167//                      exit without returning if an error occured and
168//                      ugrep should not proceed further.
169//
170//------------------------------------------------------------------------------------------
171void processOptions(int argc, const char **argv) {
172    int            optInd;
173    UBool          doUsage   = FALSE;
174    UBool          doVersion = FALSE;
175    const char    *arg;
176
177
178    for(optInd = 1; optInd < argc; ++optInd) {
179        arg = argv[optInd];
180
181        /* version info */
182        if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
183            doVersion = TRUE;
184        }
185        /* usage info */
186        else if(strcmp(arg, "--help") == 0) {
187            doUsage = TRUE;
188        }
189        else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
190            displayLineNum = TRUE;
191        }
192        /* POSIX.1 says all arguments after -- are not options */
193        else if(strcmp(arg, "--") == 0) {
194            /* skip the -- */
195            ++optInd;
196            break;
197        }
198        /* unrecognized option */
199        else if(strncmp(arg, "-", strlen("-")) == 0) {
200            printf("ugrep: invalid option -- %s\n", arg+1);
201            doUsage = TRUE;
202        }
203        /* done with options */
204        else {
205            break;
206        }
207    }
208
209    if (doUsage) {
210        printUsage();
211        exit(0);
212    }
213
214    if (doVersion) {
215        printf("ugrep version 0.01\n");
216        if (optInd == argc) {
217            exit(0);
218        }
219    }
220
221    int  remainingArgs = argc-optInd;     // pattern file ...
222    if (remainingArgs < 2) {
223        fprintf(stderr, "ugrep:  files or pattern are missing.\n");
224        printUsage();
225        exit(1);
226    }
227
228    if (remainingArgs > 2) {
229        // More than one file to be processed.   Display file names with match output.
230        displayFileName = TRUE;
231    }
232
233    pattern      = argv[optInd];
234    firstFileNum = optInd+1;
235}
236
//------------------------------------------------------------------------------------------
//
//   printUsage         Write the command line summary to stdout and return.
//
//                      This function deliberately does not terminate the program:
//                      the caller knows whether the usage text was requested
//                      (--help, success) or is being shown because of a command
//                      line error (failure), so the caller picks the exit status.
//
//------------------------------------------------------------------------------------------
void printUsage() {
    printf("ugrep [options] pattern file...\n"
        "     -V or --version     display version information\n"
        "     --help              display this help and exit\n"
        "     --                  stop further option processing\n"
        "-n,  --line-number       Prefix each line of output with the line number within its input file.\n"
        );
}
251
252//------------------------------------------------------------------------------------------
253//
254//    readFile          Read a file into memory, and convert it to Unicode.
255//
256//                      Since this is just a demo program, take the simple minded approach
257//                      of always reading the whole file at once.  No intelligent buffering
258//                      is done.
259//
260//------------------------------------------------------------------------------------------
261void readFile(const char *name) {
262
263    //
264    //  Initialize global file variables
265    //
266    fileName = name;
267    fileLen  = 0;      // zero length prevents processing in case of errors.
268
269
270    //
271    //  Open the file and determine its size.
272    //
273    FILE *file = fopen(name, "rb");
274    if (file == 0 ) {
275        fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
276        return;
277    }
278    fseek(file, 0, SEEK_END);
279    int rawFileLen = ftell(file);
280    fseek(file, 0, SEEK_SET);
281
282
283    //
284    //   Read in the file
285    //
286    charBuf    = (char *)realloc(charBuf, rawFileLen+1);   // Need error checking...
287    int t = fread(charBuf, 1, rawFileLen, file);
288    if (t != rawFileLen)  {
289        fprintf(stderr, "Error reading file \"%s\"\n", fileName);
290        fclose(file);
291        return;
292    }
293    charBuf[rawFileLen]=0;
294    fclose(file);
295
296    //
297    // Look for a Unicode Signature (BOM) in the data
298    //
299    int32_t        signatureLength;
300    const char *   charDataStart = charBuf;
301    UErrorCode     status        = U_ZERO_ERROR;
302    const char*    encoding      = ucnv_detectUnicodeSignature(
303                           charDataStart, rawFileLen, &signatureLength, &status);
304    if (U_FAILURE(status)) {
305        fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
306            u_errorName(status));
307        return;
308    }
309    if(encoding!=NULL ){
310        charDataStart  += signatureLength;
311        rawFileLen     -= signatureLength;
312    }
313
314    //
315    // Open a converter to take the file to UTF-16
316    //
317    UConverter* conv;
318    conv = ucnv_open(encoding, &status);
319    if (U_FAILURE(status)) {
320        fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
321        return;
322    }
323
324    //
325    // Convert the file data to UChar.
326    //  Preflight first to determine required buffer size.
327    //
328    uint32_t destCap = ucnv_toUChars(conv,
329                       NULL,           //  dest,
330                       0,              //  destCapacity,
331                       charDataStart,
332                       rawFileLen,
333                       &status);
334    if (status != U_BUFFER_OVERFLOW_ERROR) {
335        fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
336        return;
337    };
338
339    status = U_ZERO_ERROR;
340    ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
341    ucnv_toUChars(conv,
342        ucharBuf,           //  dest,
343        destCap+1,
344        charDataStart,
345        rawFileLen,
346        &status);
347    if (U_FAILURE(status)) {
348        fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
349        return;
350    };
351    ucnv_close(conv);
352
353    //
354    //  Successful conversion.  Set the global size variables so that
355    //     the rest of the processing will proceed for this file.
356    //
357    fileLen = destCap;
358}
359
360
361
362
363
364//------------------------------------------------------------------------------------------
365//
366//   nextLine           Advance the line index variables, starting at the
367//                      specified position in the input file buffer, by
368//                      scanning forwrd until the next end-of-line.
369//
370//                      Need to take into account all of the possible Unicode
371//                      line ending sequences.
372//
373//------------------------------------------------------------------------------------------
374void nextLine(int  startPos) {
375    if (startPos == 0) {
376        lineNum = 0;
377    } else {
378        lineNum++;
379    }
380    lineStart = lineEnd = startPos;
381
382    for (;;) {
383        if (lineEnd >= fileLen) {
384            return;
385        }
386        UChar c = ucharBuf[lineEnd];
387        lineEnd++;
388        if (c == 0x0a   ||       // Line Feed
389            c == 0x0c   ||       // Form Feed
390            c == 0x0d   ||       // Carriage Return
391            c == 0x85   ||       // Next Line
392            c == 0x2028 ||       // Line Separator
393            c == 0x2029)         // Paragraph separator
394        {
395            break;
396        }
397    }
398
399    // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
400    if (lineEnd < fileLen           &&
401        ucharBuf[lineEnd-1] == 0x0d &&
402        ucharBuf[lineEnd]   == 0x0a)
403    {
404        lineEnd++;
405    }
406}
407
408
409//------------------------------------------------------------------------------------------
410//
411//   printMatch         Called when a matching line has been located.
412//                      Print out the line from the file with the match, after
413//                         converting it back to the default code page.
414//
415//------------------------------------------------------------------------------------------
416void printMatch() {
417    char                buf[2000];
418    UErrorCode         status       = U_ZERO_ERROR;
419
420    // If we haven't already created a converter for output, do it now.
421    if (outConverter == 0) {
422        outConverter = ucnv_open(NULL, &status);
423        if (U_FAILURE(status)) {
424            fprintf(stderr, "ugrep:  Error opening default converter: \"%s\"\n",
425                u_errorName(status));
426            exit(-1);
427        }
428    };
429
430    // Convert the line to be printed back to the default 8 bit code page.
431    //   If the line is too long for our buffer, just truncate it.
432    ucnv_fromUChars(outConverter,
433                    buf,                   // destination buffer for conversion
434                    sizeof(buf),           // capacity of destination buffer
435                    &ucharBuf[lineStart],   // Input to conversion
436                    lineEnd-lineStart,     // number of UChars to convert
437                    &status);
438    buf[sizeof(buf)-1] = 0;                // Add null for use in case of too long lines.
439                                           // The converter null-terminates its output unless
440                                           //   the buffer completely fills.
441
442    if (displayFileName) {
443        printf("%s:", fileName);
444    }
445    if (displayLineNum) {
446        printf("%d:", lineNum);
447    }
448    printf("%s", buf);
449}
450
451