1/*
2*******************************************************************************
3*
4*   Copyright (C) 2009-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  bidiconf.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2009oct16
14*   created by: Markus W. Scherer
15*
16*   BiDi conformance test, using the Unicode BidiTest.txt file.
17*/
18
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include "unicode/utypes.h"
23#include "unicode/ubidi.h"
24#include "unicode/errorcode.h"
25#include "unicode/localpointer.h"
26#include "unicode/putil.h"
27#include "unicode/unistr.h"
28#include "intltest.h"
29#include "uparse.h"
30
31class BiDiConformanceTest : public IntlTest {
32public:
33    BiDiConformanceTest() :
34        directionBits(0), lineNumber(0), levelsCount(0), orderingCount(0),
35        errorCount(0) {}
36
37    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
38
39    void TestBidiTest();
40private:
41    char *getUnidataPath(char path[]);
42
43    UBool parseLevels(const char *start);
44    UBool parseOrdering(const char *start);
45    UBool parseInputStringFromBiDiClasses(const char *&start);
46
47    UBool checkLevels(const UBiDiLevel actualLevels[], int32_t actualCount,
48                      const char *paraLevelName);
49    UBool checkOrdering(UBiDi *ubidi, const char *paraLevelName);
50
51    void printErrorLine(const char *paraLevelName);
52
53    char line[10000];
54    UBiDiLevel levels[1000];
55    uint32_t directionBits;
56    int32_t ordering[1000];
57    int32_t lineNumber;
58    int32_t levelsCount;
59    int32_t orderingCount;
60    int32_t errorCount;
61    UnicodeString inputString;
62};
63
64extern IntlTest *createBiDiConformanceTest() {
65    return new BiDiConformanceTest();
66}
67
68void BiDiConformanceTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
69    if(exec) {
70        logln("TestSuite BiDiConformanceTest: ");
71    }
72    switch (index) {
73        TESTCASE(0, TestBidiTest);
74        default:
75            name="";
76            break; // needed to end the loop
77    }
78}
79
80// TODO: Move to a common place (IntlTest?) to avoid duplication with UnicodeTest (ucdtest.cpp).
81char *BiDiConformanceTest::getUnidataPath(char path[]) {
82    IcuTestErrorCode errorCode(*this, "getUnidataPath");
83    const int kUnicodeDataTxtLength=15;  // strlen("UnicodeData.txt")
84
85    // Look inside ICU_DATA first.
86    strcpy(path, pathToDataDirectory());
87    strcat(path, "unidata" U_FILE_SEP_STRING "UnicodeData.txt");
88    FILE *f=fopen(path, "r");
89    if(f!=NULL) {
90        fclose(f);
91        *(strchr(path, 0)-kUnicodeDataTxtLength)=0;  // Remove the basename.
92        return path;
93    }
94
95    // As a fallback, try to guess where the source data was located
96    // at the time ICU was built, and look there.
97#   ifdef U_TOPSRCDIR
98        strcpy(path, U_TOPSRCDIR  U_FILE_SEP_STRING "data");
99#   else
100        strcpy(path, loadTestData(errorCode));
101        strcat(path, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
102                     U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
103                     U_FILE_SEP_STRING "data");
104#   endif
105    strcat(path, U_FILE_SEP_STRING);
106    strcat(path, "unidata" U_FILE_SEP_STRING "UnicodeData.txt");
107    f=fopen(path, "r");
108    if(f!=NULL) {
109        fclose(f);
110        *(strchr(path, 0)-kUnicodeDataTxtLength)=0;  // Remove the basename.
111        return path;
112    }
113    return NULL;
114}
115
116U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
117
118UBool BiDiConformanceTest::parseLevels(const char *start) {
119    directionBits=0;
120    levelsCount=0;
121    while(*start!=0 && *(start=u_skipWhitespace(start))!=0) {
122        if(*start=='x') {
123            levels[levelsCount++]=UBIDI_DEFAULT_LTR;
124            ++start;
125        } else {
126            char *end;
127            uint32_t value=(uint32_t)strtoul(start, &end, 10);
128            if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0) || value>(UBIDI_MAX_EXPLICIT_LEVEL+1)) {
129                errln("@Levels: parse error at %s", start);
130                return FALSE;
131            }
132            levels[levelsCount++]=(UBiDiLevel)value;
133            directionBits|=(1<<(value&1));
134            start=end;
135        }
136    }
137    return TRUE;
138}
139
140UBool BiDiConformanceTest::parseOrdering(const char *start) {
141    orderingCount=0;
142    while(*start!=0 && *(start=u_skipWhitespace(start))!=0) {
143        char *end;
144        uint32_t value=(uint32_t)strtoul(start, &end, 10);
145        if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0) || value>=1000) {
146            errln("@Reorder: parse error at %s", start);
147            return FALSE;
148        }
149        ordering[orderingCount++]=(int32_t)value;
150        start=end;
151    }
152    return TRUE;
153}
154
155static const UChar charFromBiDiClass[U_CHAR_DIRECTION_COUNT]={
156    0x6c,   // 'l' for L
157    0x52,   // 'R' for R
158    0x33,   // '3' for EN
159    0x2d,   // '-' for ES
160    0x25,   // '%' for ET
161    0x39,   // '9' for AN
162    0x2c,   // ',' for CS
163    0x2f,   // '/' for B
164    0x5f,   // '_' for S
165    0x20,   // ' ' for WS
166    0x3d,   // '=' for ON
167    0x65,   // 'e' for LRE
168    0x6f,   // 'o' for LRO
169    0x41,   // 'A' for AL
170    0x45,   // 'E' for RLE
171    0x4f,   // 'O' for RLO
172    0x2a,   // '*' for PDF
173    0x60,   // '`' for NSM
174    0x7c    // '|' for BN
175};
176
177U_CDECL_BEGIN
178
179static UCharDirection U_CALLCONV
180biDiConfUBiDiClassCallback(const void * /*context*/, UChar32 c) {
181    for(int i=0; i<U_CHAR_DIRECTION_COUNT; ++i) {
182        if(c==charFromBiDiClass[i]) {
183            return (UCharDirection)i;
184        }
185    }
186    // Character not in our hardcoded table.
187    // Should not occur during testing.
188    return U_BIDI_CLASS_DEFAULT;
189}
190
191U_CDECL_END
192
193static const int8_t biDiClassNameLengths[U_CHAR_DIRECTION_COUNT+1]={
194    1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 0
195};
196
197UBool BiDiConformanceTest::parseInputStringFromBiDiClasses(const char *&start) {
198    inputString.remove();
199    /*
200     * Lengthy but fast BiDi class parser.
201     * A simple parser could terminate or extract the name string and use
202     *   int32_t biDiClassInt=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, bidiClassString);
203     * but that makes this test take significantly more time.
204     */
205    while(*start!=0 && *(start=u_skipWhitespace(start))!=0 && *start!=';') {
206        UCharDirection biDiClass=U_CHAR_DIRECTION_COUNT;
207        // Compare each character once until we have a match on
208        // a complete, short BiDi class name.
209        if(start[0]=='L') {
210            if(start[1]=='R') {
211                if(start[2]=='E') {
212                    biDiClass=U_LEFT_TO_RIGHT_EMBEDDING;
213                } else if(start[2]=='O') {
214                    biDiClass=U_LEFT_TO_RIGHT_OVERRIDE;
215                }
216            } else {
217                biDiClass=U_LEFT_TO_RIGHT;
218            }
219        } else if(start[0]=='R') {
220            if(start[1]=='L') {
221                if(start[2]=='E') {
222                    biDiClass=U_RIGHT_TO_LEFT_EMBEDDING;
223                } else if(start[2]=='O') {
224                    biDiClass=U_RIGHT_TO_LEFT_OVERRIDE;
225                }
226            } else {
227                biDiClass=U_RIGHT_TO_LEFT;
228            }
229        } else if(start[0]=='E') {
230            if(start[1]=='N') {
231                biDiClass=U_EUROPEAN_NUMBER;
232            } else if(start[1]=='S') {
233                biDiClass=U_EUROPEAN_NUMBER_SEPARATOR;
234            } else if(start[1]=='T') {
235                biDiClass=U_EUROPEAN_NUMBER_TERMINATOR;
236            }
237        } else if(start[0]=='A') {
238            if(start[1]=='L') {
239                biDiClass=U_RIGHT_TO_LEFT_ARABIC;
240            } else if(start[1]=='N') {
241                biDiClass=U_ARABIC_NUMBER;
242            }
243        } else if(start[0]=='C' && start[1]=='S') {
244            biDiClass=U_COMMON_NUMBER_SEPARATOR;
245        } else if(start[0]=='B') {
246            if(start[1]=='N') {
247                biDiClass=U_BOUNDARY_NEUTRAL;
248            } else {
249                biDiClass=U_BLOCK_SEPARATOR;
250            }
251        } else if(start[0]=='S') {
252            biDiClass=U_SEGMENT_SEPARATOR;
253        } else if(start[0]=='W' && start[1]=='S') {
254            biDiClass=U_WHITE_SPACE_NEUTRAL;
255        } else if(start[0]=='O' && start[1]=='N') {
256            biDiClass=U_OTHER_NEUTRAL;
257        } else if(start[0]=='P' && start[1]=='D' && start[2]=='F') {
258            biDiClass=U_POP_DIRECTIONAL_FORMAT;
259        } else if(start[0]=='N' && start[1]=='S' && start[2]=='M') {
260            biDiClass=U_DIR_NON_SPACING_MARK;
261        }
262        // Now we verify that the class name is terminated properly,
263        // and not just the start of a longer word.
264        int8_t biDiClassNameLength=biDiClassNameLengths[biDiClass];
265        char c=start[biDiClassNameLength];
266        if(biDiClass==U_CHAR_DIRECTION_COUNT || (!U_IS_INV_WHITESPACE(c) && c!=';' && c!=0)) {
267            errln("BiDi class string not recognized at %s", start);
268            return FALSE;
269        }
270        inputString.append(charFromBiDiClass[biDiClass]);
271        start+=biDiClassNameLength;
272    }
273    return TRUE;
274}
275
276void BiDiConformanceTest::TestBidiTest() {
277    IcuTestErrorCode errorCode(*this, "TestBidiTest");
278    const char *sourceTestDataPath=getSourceTestData(errorCode);
279    if(errorCode.logIfFailureAndReset("unable to find the source/test/testdata "
280                                      "folder (getSourceTestData())")) {
281        return;
282    }
283    char bidiTestPath[400];
284    strcpy(bidiTestPath, sourceTestDataPath);
285    strcat(bidiTestPath, "BidiTest.txt");
286    LocalStdioFilePointer bidiTestFile(fopen(bidiTestPath, "r"));
287    if(bidiTestFile.isNull()) {
288        errln("unable to open %s", bidiTestPath);
289        return;
290    }
291    LocalUBiDiPointer ubidi(ubidi_open());
292    ubidi_setClassCallback(ubidi.getAlias(), biDiConfUBiDiClassCallback, NULL,
293                           NULL, NULL, errorCode);
294    if(errorCode.logIfFailureAndReset("ubidi_setClassCallback()")) {
295        return;
296    }
297    lineNumber=0;
298    levelsCount=0;
299    orderingCount=0;
300    errorCount=0;
301    while(errorCount<10 && fgets(line, (int)sizeof(line), bidiTestFile.getAlias())!=NULL) {
302        ++lineNumber;
303        // Remove trailing comments and whitespace.
304        char *commentStart=strchr(line, '#');
305        if(commentStart!=NULL) {
306            *commentStart=0;
307        }
308        u_rtrim(line);
309        const char *start=u_skipWhitespace(line);
310        if(*start==0) {
311            continue;  // Skip empty and comment-only lines.
312        }
313        if(*start=='@') {
314            ++start;
315            if(0==strncmp(start, "Levels:", 7)) {
316                if(!parseLevels(start+7)) {
317                    return;
318                }
319            } else if(0==strncmp(start, "Reorder:", 8)) {
320                if(!parseOrdering(start+8)) {
321                    return;
322                }
323            }
324            // Skip unknown @Xyz: ...
325        } else {
326            if(!parseInputStringFromBiDiClasses(start)) {
327                return;
328            }
329            start=u_skipWhitespace(start);
330            if(*start!=';') {
331                errln("missing ; separator on input line %s", line);
332                return;
333            }
334            start=u_skipWhitespace(start+1);
335            char *end;
336            uint32_t bitset=(uint32_t)strtoul(start, &end, 16);
337            if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0)) {
338                errln("input bitset parse error at %s", start);
339                return;
340            }
341            // Loop over the bitset.
342            static const UBiDiLevel paraLevels[]={ UBIDI_DEFAULT_LTR, 0, 1, UBIDI_DEFAULT_RTL };
343            static const char *const paraLevelNames[]={ "auto/LTR", "LTR", "RTL", "auto/RTL" };
344            for(int i=0; i<=3; ++i) {
345                if(bitset&(1<<i)) {
346                    ubidi_setPara(ubidi.getAlias(), inputString.getBuffer(), inputString.length(),
347                                  paraLevels[i], NULL, errorCode);
348                    const UBiDiLevel *actualLevels=ubidi_getLevels(ubidi.getAlias(), errorCode);
349                    if(errorCode.logIfFailureAndReset("ubidi_setPara() or ubidi_getLevels()")) {
350                        errln("Input line %d: %s", (int)lineNumber, line);
351                        return;
352                    }
353                    if(!checkLevels(actualLevels, ubidi_getProcessedLength(ubidi.getAlias()),
354                                    paraLevelNames[i])) {
355                        // continue outerLoop;  does not exist in C++
356                        // so just break out of the inner loop.
357                        break;
358                    }
359                    if(!checkOrdering(ubidi.getAlias(), paraLevelNames[i])) {
360                        // continue outerLoop;  does not exist in C++
361                        // so just break out of the inner loop.
362                        break;
363                    }
364                }
365            }
366        }
367    }
368}
369
370static UChar printLevel(UBiDiLevel level) {
371    if(level<UBIDI_DEFAULT_LTR) {
372        return 0x30+level;
373    } else {
374        return 0x78;  // 'x'
375    }
376}
377
378static uint32_t getDirectionBits(const UBiDiLevel actualLevels[], int32_t actualCount) {
379    uint32_t actualDirectionBits=0;
380    for(int32_t i=0; i<actualCount; ++i) {
381        actualDirectionBits|=(1<<(actualLevels[i]&1));
382    }
383    return actualDirectionBits;
384}
385
386UBool BiDiConformanceTest::checkLevels(const UBiDiLevel actualLevels[], int32_t actualCount,
387                                       const char *paraLevelName) {
388    UBool isOk=TRUE;
389    if(levelsCount!=actualCount) {
390        errln("Wrong number of level values; expected %d actual %d",
391              (int)levelsCount, (int)actualCount);
392        isOk=FALSE;
393    } else {
394        for(int32_t i=0; i<actualCount; ++i) {
395            if(levels[i]!=actualLevels[i] && levels[i]<UBIDI_DEFAULT_LTR) {
396                if(directionBits!=3 && directionBits==getDirectionBits(actualLevels, actualCount)) {
397                    // ICU used a shortcut:
398                    // Since the text is unidirectional, it did not store the resolved
399                    // levels but just returns all levels as the paragraph level 0 or 1.
400                    // The reordering result is the same, so this is fine.
401                    break;
402                } else {
403                    errln("Wrong level value at index %d; expected %d actual %d",
404                          (int)i, levels[i], actualLevels[i]);
405                    isOk=FALSE;
406                    break;
407                }
408            }
409        }
410    }
411    if(!isOk) {
412        printErrorLine(paraLevelName);
413        UnicodeString els("Expected levels:   ");
414        int32_t i;
415        for(i=0; i<levelsCount; ++i) {
416            els.append((UChar)0x20).append(printLevel(levels[i]));
417        }
418        UnicodeString als("Actual   levels:   ");
419        for(i=0; i<actualCount; ++i) {
420            als.append((UChar)0x20).append(printLevel(actualLevels[i]));
421        }
422        errln(els);
423        errln(als);
424    }
425    return isOk;
426}
427
428// Note: ubidi_setReorderingOptions(ubidi, UBIDI_OPTION_REMOVE_CONTROLS);
429// does not work for custom BiDi class assignments
430// and anyway also removes LRM/RLM/ZWJ/ZWNJ which is not desirable here.
431// Therefore we just skip the indexes for BiDi controls while comparing
432// with the expected ordering that has them omitted.
433UBool BiDiConformanceTest::checkOrdering(UBiDi *ubidi, const char *paraLevelName) {
434    UBool isOk=TRUE;
435    IcuTestErrorCode errorCode(*this, "TestBidiTest/checkOrdering()");
436    int32_t resultLength=ubidi_getResultLength(ubidi);  // visual length including BiDi controls
437    int32_t i, visualIndex;
438    // Note: It should be faster to call ubidi_countRuns()/ubidi_getVisualRun()
439    // and loop over each run's indexes, but that seems unnecessary for this test code.
440    for(i=visualIndex=0; i<resultLength; ++i) {
441        int32_t logicalIndex=ubidi_getLogicalIndex(ubidi, i, errorCode);
442        if(errorCode.logIfFailureAndReset("ubidi_getLogicalIndex()")) {
443            errln("Input line %d: %s", (int)lineNumber, line);
444            return FALSE;
445        }
446        if(levels[logicalIndex]>=UBIDI_DEFAULT_LTR) {
447            continue;  // BiDi control, omitted from expected ordering.
448        }
449        if(visualIndex<orderingCount && logicalIndex!=ordering[visualIndex]) {
450            errln("Wrong ordering value at visual index %d; expected %d actual %d",
451                  (int)visualIndex, ordering[visualIndex], logicalIndex);
452            isOk=FALSE;
453            break;
454        }
455        ++visualIndex;
456    }
457    // visualIndex is now the visual length minus the BiDi controls,
458    // which should match the length of the BidiTest.txt ordering.
459    if(isOk && orderingCount!=visualIndex) {
460        errln("Wrong number of ordering values; expected %d actual %d",
461              (int)orderingCount, (int)visualIndex);
462        isOk=FALSE;
463    }
464    if(!isOk) {
465        printErrorLine(paraLevelName);
466        UnicodeString eord("Expected ordering: ");
467        for(i=0; i<orderingCount; ++i) {
468            eord.append((UChar)0x20).append((UChar)(0x30+ordering[i]));
469        }
470        UnicodeString aord("Actual   ordering: ");
471        for(i=0; i<resultLength; ++i) {
472            int32_t logicalIndex=ubidi_getLogicalIndex(ubidi, i, errorCode);
473            if(levels[logicalIndex]<UBIDI_DEFAULT_LTR) {
474                aord.append((UChar)0x20).append((UChar)(0x30+logicalIndex));
475            }
476        }
477        errln(eord);
478        errln(aord);
479    }
480    return isOk;
481}
482
483void BiDiConformanceTest::printErrorLine(const char *paraLevelName) {
484    ++errorCount;
485    errln("Input line %5d:   %s", (int)lineNumber, line);
486    errln(UnicodeString("Input string:       ")+inputString);
487    errln("Para level:         %s", paraLevelName);
488}
489