1/*
2 * Copyright (c) 1996, 2016, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 */
23
24/*
25 * @test
26 * @bug 4035266 4052418 4068133 4068137 4068139 4086052 4095322 4097779
27 *      4097920 4098467 4111338 4113835 4117554 4143071 4146175 4152117
28 *      4152416 4153072 4158381 4214367 4217703 4638433
29 * @library /java/text/testlib
30 * @run main/timeout=2000 BreakIteratorTest
31 * @summary test BreakIterator
32 */
33
34/*
35 *
36 *
37 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
38 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
39 *
40 * Portions copyright (c) 2007 Sun Microsystems, Inc.
41 * All Rights Reserved.
42 *
43 * The original version of this source code and documentation
44 * is copyrighted and owned by Taligent, Inc., a wholly-owned
45 * subsidiary of IBM. These materials are provided under terms
46 * of a License Agreement between Taligent and Sun. This technology
47 * is protected by multiple US and International patents.
48 *
49 * This notice and attribution to Taligent may not be removed.
50 * Taligent is a registered trademark of Taligent, Inc.
51 *
52 * Permission to use, copy, modify, and distribute this software
53 * and its documentation for NON-COMMERCIAL purposes and without
54 * fee is hereby granted provided that this copyright notice
55 * appears in all copies. Please refer to the file "copyright.html"
56 * for further important copyright and licensing information.
57 *
58 * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
59 * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
60 * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
61 * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
62 * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
63 * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
64 *
65 */
66
67import java.text.BreakIterator;
68import java.text.CharacterIterator;
69import java.text.StringCharacterIterator;
70import java.util.Locale;
71import java.util.Vector;
72import java.util.Enumeration;
73import java.io.*;
74
75public class BreakIteratorTest extends IntlTest
76{
77    private BreakIterator characterBreak;
78    private BreakIterator wordBreak;
79    private BreakIterator lineBreak;
80    private BreakIterator sentenceBreak;
81
82    public static void main(String[] args) throws Exception {
83        new BreakIteratorTest().run(args);
84    }
85
86    public BreakIteratorTest()
87    {
88        characterBreak = BreakIterator.getCharacterInstance();
89        wordBreak = BreakIterator.getWordInstance();
90        lineBreak = BreakIterator.getLineInstance();
91        sentenceBreak = BreakIterator.getSentenceInstance();
92    }
93
94    //=========================================================================
95    // general test subroutines
96    //=========================================================================
97
98    private void generalIteratorTest(BreakIterator bi, Vector expectedResult) {
99        StringBuffer buffer = new StringBuffer();
100        String text;
101        for (int i = 0; i < expectedResult.size(); i++) {
102            text = (String)expectedResult.elementAt(i);
103            buffer.append(text);
104        }
105        text = buffer.toString();
106
107        bi.setText(text);
108
109        Vector nextResults = testFirstAndNext(bi, text);
110        Vector previousResults = testLastAndPrevious(bi, text);
111
112        logln("comparing forward and backward...");
113        int errs = getErrorCount();
114        compareFragmentLists("forward iteration", "backward iteration", nextResults,
115                        previousResults);
116        if (getErrorCount() == errs) {
117            logln("comparing expected and actual...");
118            compareFragmentLists("expected result", "actual result", expectedResult,
119                            nextResults);
120        }
121
122        int[] boundaries = new int[expectedResult.size() + 3];
123        boundaries[0] = BreakIterator.DONE;
124        boundaries[1] = 0;
125        for (int i = 0; i < expectedResult.size(); i++)
126            boundaries[i + 2] = boundaries[i + 1] + ((String)expectedResult.elementAt(i)).
127                            length();
128        boundaries[boundaries.length - 1] = BreakIterator.DONE;
129
130        testFollowing(bi, text, boundaries);
131        testPreceding(bi, text, boundaries);
132        testIsBoundary(bi, text, boundaries);
133
134        doMultipleSelectionTest(bi, text);
135    }
136
137    private Vector testFirstAndNext(BreakIterator bi, String text) {
138        int p = bi.first();
139        int lastP = p;
140        Vector<String> result = new Vector<String>();
141
142        if (p != 0)
143            errln("first() returned " + p + " instead of 0");
144        while (p != BreakIterator.DONE) {
145            p = bi.next();
146            if (p != BreakIterator.DONE) {
147                if (p <= lastP)
148                    errln("next() failed to move forward: next() on position "
149                                    + lastP + " yielded " + p);
150
151                result.addElement(text.substring(lastP, p));
152            }
153            else {
154                if (lastP != text.length())
155                    errln("next() returned DONE prematurely: offset was "
156                                    + lastP + " instead of " + text.length());
157            }
158            lastP = p;
159        }
160        return result;
161    }
162
163    private Vector testLastAndPrevious(BreakIterator bi, String text) {
164        int p = bi.last();
165        int lastP = p;
166        Vector<String> result = new Vector<String>();
167
168        if (p != text.length())
169            errln("last() returned " + p + " instead of " + text.length());
170        while (p != BreakIterator.DONE) {
171            p = bi.previous();
172            if (p != BreakIterator.DONE) {
173                if (p >= lastP)
174                    errln("previous() failed to move backward: previous() on position "
175                                    + lastP + " yielded " + p);
176
177                result.insertElementAt(text.substring(p, lastP), 0);
178            }
179            else {
180                if (lastP != 0)
181                    errln("previous() returned DONE prematurely: offset was "
182                                    + lastP + " instead of 0");
183            }
184            lastP = p;
185        }
186        return result;
187    }
188
189    private void compareFragmentLists(String f1Name, String f2Name, Vector f1, Vector f2) {
190        int p1 = 0;
191        int p2 = 0;
192        String s1;
193        String s2;
194        int t1 = 0;
195        int t2 = 0;
196
197        while (p1 < f1.size() && p2 < f2.size()) {
198            s1 = (String)f1.elementAt(p1);
199            s2 = (String)f2.elementAt(p2);
200            t1 += s1.length();
201            t2 += s2.length();
202
203            if (s1.equals(s2)) {
204                debugLogln("   >" + s1 + "<");
205                ++p1;
206                ++p2;
207            }
208            else {
209                int tempT1 = t1;
210                int tempT2 = t2;
211                int tempP1 = p1;
212                int tempP2 = p2;
213
214                while (tempT1 != tempT2 && tempP1 < f1.size() && tempP2 < f2.size()) {
215                    while (tempT1 < tempT2 && tempP1 < f1.size()) {
216                        tempT1 += ((String)f1.elementAt(tempP1)).length();
217                        ++tempP1;
218                    }
219                    while (tempT2 < tempT1 && tempP2 < f2.size()) {
220                        tempT2 += ((String)f2.elementAt(tempP2)).length();
221                        ++tempP2;
222                    }
223                }
224                logln("*** " + f1Name + " has:");
225                while (p1 <= tempP1 && p1 < f1.size()) {
226                    s1 = (String)f1.elementAt(p1);
227                    t1 += s1.length();
228                    debugLogln(" *** >" + s1 + "<");
229                    ++p1;
230                }
231                logln("***** " + f2Name + " has:");
232                while (p2 <= tempP2 && p2 < f2.size()) {
233                    s2 = (String)f2.elementAt(p2);
234                    t2 += s2.length();
235                    debugLogln(" ***** >" + s2 + "<");
236                    ++p2;
237                }
238                errln("Discrepancy between " + f1Name + " and " + f2Name + "\n---\n" + f1 +"\n---\n" + f2);
239            }
240        }
241    }
242
243    private void testFollowing(BreakIterator bi, String text, int[] boundaries) {
244        logln("testFollowing():");
245        int p = 2;
246        int i = 0;
247        try {
248            for (i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
249                if (i == boundaries[p])
250                    ++p;
251
252                int b = bi.following(i);
253                logln("bi.following(" + i + ") -> " + b);
254                if (b != boundaries[p])
255                    errln("Wrong result from following() for " + i + ": expected " + boundaries[p]
256                          + ", got " + b);
257            }
258        } catch (IllegalArgumentException illargExp) {
259            errln("IllegalArgumentException caught from following() for offset: " + i);
260        }
261    }
262
263    private void testPreceding(BreakIterator bi, String text, int[] boundaries) {
264        logln("testPreceding():");
265        int p = 0;
266        int i = 0;
267        try {
268            for (i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
269                int b = bi.preceding(i);
270                logln("bi.preceding(" + i + ") -> " + b);
271                if (b != boundaries[p])
272                    errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p]
273                          + ", got " + b);
274
275                if (i == boundaries[p + 1])
276                    ++p;
277            }
278        } catch (IllegalArgumentException illargExp) {
279            errln("IllegalArgumentException caught from preceding() for offset: " + i);
280        }
281    }
282
283    private void testIsBoundary(BreakIterator bi, String text, int[] boundaries) {
284        logln("testIsBoundary():");
285        int p = 1;
286        boolean isB;
287        for (int i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
288            isB = bi.isBoundary(i);
289            logln("bi.isBoundary(" + i + ") -> " + isB);
290
291            if (i == boundaries[p]) {
292                if (!isB)
293                    errln("Wrong result from isBoundary() for " + i + ": expected true, got false");
294                ++p;
295            }
296            else {
297                if (isB)
298                    errln("Wrong result from isBoundary() for " + i + ": expected false, got true");
299            }
300        }
301    }
302
303    private void doMultipleSelectionTest(BreakIterator iterator, String testText)
304    {
305        logln("Multiple selection test...");
306        BreakIterator testIterator = (BreakIterator)iterator.clone();
307        int offset = iterator.first();
308        int testOffset;
309        int count = 0;
310
311        do {
312            testOffset = testIterator.first();
313            testOffset = testIterator.next(count);
314            logln("next(" + count + ") -> " + testOffset);
315            if (offset != testOffset)
316                errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
317
318            if (offset != BreakIterator.DONE) {
319                count++;
320                offset = iterator.next();
321            }
322        } while (offset != BreakIterator.DONE);
323
324        // now do it backwards...
325        offset = iterator.last();
326        count = 0;
327
328        do {
329            testOffset = testIterator.last();
330            testOffset = testIterator.next(count);
331            logln("next(" + count + ") -> " + testOffset);
332            if (offset != testOffset)
333                errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
334
335            if (offset != BreakIterator.DONE) {
336                count--;
337                offset = iterator.previous();
338            }
339        } while (offset != BreakIterator.DONE);
340    }
341
342    private void doBreakInvariantTest(BreakIterator tb, String testChars)
343    {
344        StringBuffer work = new StringBuffer("aaa");
345        int errorCount = 0;
346
347        // a break should always occur after CR (unless followed by LF), LF, PS, and LS
348        String breaks = /*"\r\n\u2029\u2028"*/"\n\u2029\u2028";
349                            // change this back when new BI code is added
350
351        for (int i = 0; i < breaks.length(); i++) {
352            work.setCharAt(1, breaks.charAt(i));
353            for (int j = 0; j < testChars.length(); j++) {
354                work.setCharAt(0, testChars.charAt(j));
355                for (int k = 0; k < testChars.length(); k++) {
356                    char c = testChars.charAt(k);
357
358                    // if a cr is followed by lf, don't do the check (they stay together)
359                    if (work.charAt(1) == '\r' && (c == '\n'))
360                        continue;
361
362                    // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
363                    // for breaking purposes as per UTR14
364                    int type1 = Character.getType(work.charAt(1));
365                    int type2 = Character.getType(c);
366                    if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
367                        type2 == Character.CONTROL || type2 == Character.FORMAT) {
368                        continue;
369                    }
370
371                    work.setCharAt(2, c);
372                    tb.setText(work.toString());
373                    boolean seen2 = false;
374                    for (int l = tb.first(); l != BreakIterator.DONE; l = tb.next()) {
375                        if (l == 2)
376                            seen2 = true;
377                    }
378                    if (!seen2) {
379                        errln("No break between U+" + Integer.toHexString((int)(work.charAt(1)))
380                                    + " and U+" + Integer.toHexString((int)(work.charAt(2))));
381                        errorCount++;
382                        if (errorCount >= 75)
383                            return;
384                    }
385                }
386            }
387        }
388    }
389
390    private void doOtherInvariantTest(BreakIterator tb, String testChars)
391    {
392        StringBuffer work = new StringBuffer("a\r\na");
393        int errorCount = 0;
394
395        // a break should never occur between CR and LF
396        for (int i = 0; i < testChars.length(); i++) {
397            work.setCharAt(0, testChars.charAt(i));
398            for (int j = 0; j < testChars.length(); j++) {
399                work.setCharAt(3, testChars.charAt(j));
400                tb.setText(work.toString());
401                for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
402                    if (k == 2) {
403                        errln("Break between CR and LF in string U+" + Integer.toHexString(
404                                (int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
405                                (int)(work.charAt(3))));
406                        errorCount++;
407                        if (errorCount >= 75)
408                            return;
409                    }
410            }
411        }
412
413        // a break should never occur before a non-spacing mark, unless it's preceded
414        // by a line terminator
415        work.setLength(0);
416        work.append("aaaa");
417        for (int i = 0; i < testChars.length(); i++) {
418            char c = testChars.charAt(i);
419            if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003')
420                continue;
421            work.setCharAt(1, c);
422            for (int j = 0; j < testChars.length(); j++) {
423                c = testChars.charAt(j);
424                if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c)
425                        != Character.ENCLOSING_MARK)
426                    continue;
427                work.setCharAt(2, c);
428
429                // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
430                // for breaking purposes as per UTR14
431                int type1 = Character.getType(work.charAt(1));
432                int type2 = Character.getType(work.charAt(2));
433                if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
434                    type2 == Character.CONTROL || type2 == Character.FORMAT) {
435                    continue;
436                }
437
438                tb.setText(work.toString());
439                for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
440                    if (k == 2) {
441                        errln("Break between U+" + Integer.toHexString((int)(work.charAt(1)))
442                                + " and U+" + Integer.toHexString((int)(work.charAt(2))));
443                        errorCount++;
444                        if (errorCount >= 75)
445                            return;
446                    }
447            }
448        }
449    }
450
451    public void debugLogln(String s) {
452        final String zeros = "0000";
453        String temp;
454        StringBuffer out = new StringBuffer();
455        for (int i = 0; i < s.length(); i++) {
456            char c = s.charAt(i);
457            if (c >= ' ' && c < '\u007f')
458                out.append(c);
459            else {
460                out.append("\\u");
461                temp = Integer.toHexString((int)c);
462                out.append(zeros.substring(0, 4 - temp.length()));
463                out.append(temp);
464            }
465        }
466        logln(out.toString());
467    }
468
469    //=========================================================================
470    // tests
471    //=========================================================================
472
473    public void TestWordBreak() {
474
475        Vector<String> wordSelectionData = new Vector<String>();
476
477        wordSelectionData.addElement("12,34");
478
479        wordSelectionData.addElement(" ");
480        wordSelectionData.addElement("\u00A2"); //cent sign
481        wordSelectionData.addElement("\u00A3"); //pound sign
482        wordSelectionData.addElement("\u00A4"); //currency sign
483        wordSelectionData.addElement("\u00A5"); //yen sign
484        wordSelectionData.addElement("alpha-beta-gamma");
485        wordSelectionData.addElement(".");
486        wordSelectionData.addElement(" ");
487        wordSelectionData.addElement("Badges");
488        wordSelectionData.addElement("?");
489        wordSelectionData.addElement(" ");
490        wordSelectionData.addElement("BADGES");
491        wordSelectionData.addElement("!");
492        wordSelectionData.addElement("?");
493        wordSelectionData.addElement("!");
494        wordSelectionData.addElement(" ");
495        wordSelectionData.addElement("We");
496        wordSelectionData.addElement(" ");
497        wordSelectionData.addElement("don't");
498        wordSelectionData.addElement(" ");
499        wordSelectionData.addElement("need");
500        wordSelectionData.addElement(" ");
501        wordSelectionData.addElement("no");
502        wordSelectionData.addElement(" ");
503        wordSelectionData.addElement("STINKING");
504        wordSelectionData.addElement(" ");
505        wordSelectionData.addElement("BADGES");
506        wordSelectionData.addElement("!");
507        wordSelectionData.addElement("!");
508        wordSelectionData.addElement("!");
509
510        wordSelectionData.addElement("012.566,5");
511        wordSelectionData.addElement(" ");
512        wordSelectionData.addElement("123.3434,900");
513        wordSelectionData.addElement(" ");
514        wordSelectionData.addElement("1000,233,456.000");
515        wordSelectionData.addElement(" ");
516        wordSelectionData.addElement("1,23.322%");
517        wordSelectionData.addElement(" ");
518        wordSelectionData.addElement("123.1222");
519
520        wordSelectionData.addElement(" ");
521        wordSelectionData.addElement("\u0024123,000.20");
522
523        wordSelectionData.addElement(" ");
524        wordSelectionData.addElement("179.01\u0025");
525
526        wordSelectionData.addElement("Hello");
527        wordSelectionData.addElement(",");
528        wordSelectionData.addElement(" ");
529        wordSelectionData.addElement("how");
530        wordSelectionData.addElement(" ");
531        wordSelectionData.addElement("are");
532        wordSelectionData.addElement(" ");
533        wordSelectionData.addElement("you");
534        wordSelectionData.addElement(" ");
535        wordSelectionData.addElement("X");
536        wordSelectionData.addElement(" ");
537
538        wordSelectionData.addElement("Now");
539        wordSelectionData.addElement("\r");
540        wordSelectionData.addElement("is");
541        wordSelectionData.addElement("\n");
542        wordSelectionData.addElement("the");
543        wordSelectionData.addElement("\r\n");
544        wordSelectionData.addElement("time");
545        wordSelectionData.addElement("\n");
546        wordSelectionData.addElement("\r");
547        wordSelectionData.addElement("for");
548        wordSelectionData.addElement("\r");
549        wordSelectionData.addElement("\r");
550        wordSelectionData.addElement("all");
551        wordSelectionData.addElement(" ");
552
553        generalIteratorTest(wordBreak, wordSelectionData);
554    }
555
556    public void TestBug4097779() {
557        Vector<String> wordSelectionData = new Vector<String>();
558
559        wordSelectionData.addElement("aa\u0300a");
560        wordSelectionData.addElement(" ");
561
562        generalIteratorTest(wordBreak, wordSelectionData);
563    }
564
565    public void TestBug4098467Words() {
566        Vector<String> wordSelectionData = new Vector<String>();
567
568        // What follows is a string of Korean characters (I found it in the Yellow Pages
569        // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
570        // it correctly), first as precomposed syllables, and then as conjoining jamo.
571        // Both sequences should be semantically identical and break the same way.
572        // precomposed syllables...
573        wordSelectionData.addElement("\uc0c1\ud56d");
574        wordSelectionData.addElement(" ");
575        wordSelectionData.addElement("\ud55c\uc778");
576        wordSelectionData.addElement(" ");
577        wordSelectionData.addElement("\uc5f0\ud569");
578        wordSelectionData.addElement(" ");
579        wordSelectionData.addElement("\uc7a5\ub85c\uad50\ud68c");
580        wordSelectionData.addElement(" ");
581        // conjoining jamo...
582        wordSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc");
583        wordSelectionData.addElement(" ");
584        wordSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab");
585        wordSelectionData.addElement(" ");
586        wordSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8");
587        wordSelectionData.addElement(" ");
588        wordSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
589        wordSelectionData.addElement(" ");
590
591        generalIteratorTest(wordBreak, wordSelectionData);
592    }
593
594    public void TestBug4117554Words() {
595        Vector<String> wordSelectionData = new Vector<String>();
596
597        // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
598        // count as a Kanji character for the purposes of word breaking
599        wordSelectionData.addElement("abc");
600        wordSelectionData.addElement("\u4e01\u4e02\u3005\u4e03\u4e03");
601        wordSelectionData.addElement("abc");
602
603        generalIteratorTest(wordBreak, wordSelectionData);
604    }
605
606    public void TestSentenceBreak() {
607        Vector<String> sentenceSelectionData = new Vector<String>();
608
609        sentenceSelectionData.addElement("This is a simple sample sentence. ");
610        sentenceSelectionData.addElement("(This is it.) ");
611        sentenceSelectionData.addElement("This is a simple sample sentence. ");
612        sentenceSelectionData.addElement("\"This isn\'t it.\" ");
613        sentenceSelectionData.addElement("Hi! ");
614        sentenceSelectionData.addElement("This is a simple sample sentence. ");
615        sentenceSelectionData.addElement("It does not have to make any sense as you can see. ");
616        sentenceSelectionData.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
617        sentenceSelectionData.addElement("Che la dritta via aveo smarrita. ");
618        sentenceSelectionData.addElement("He said, that I said, that you said!! ");
619
620        sentenceSelectionData.addElement("Don't rock the boat.\u2029");
621
622        sentenceSelectionData.addElement("Because I am the daddy, that is why. ");
623        sentenceSelectionData.addElement("Not on my time (el timo.)! ");
624
625        sentenceSelectionData.addElement("So what!!\u2029");
626
627        sentenceSelectionData.addElement("\"But now,\" he said, \"I know!\" ");
628        sentenceSelectionData.addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");
629        sentenceSelectionData.addElement("One species, B. anthracis, is highly virulent.\n");
630        sentenceSelectionData.addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");
631        sentenceSelectionData.addElement("Have you ever said, \"This is where \tI shall live\"? ");
632        sentenceSelectionData.addElement("He answered, \"You may not!\" ");
633        sentenceSelectionData.addElement("Another popular saying is: \"How do you do?\". ");
634        sentenceSelectionData.addElement("Yet another popular saying is: \'I\'m fine thanks.\' ");
635        sentenceSelectionData.addElement("What is the proper use of the abbreviation pp.? ");
636        sentenceSelectionData.addElement("Yes, I am definatelly 12\" tall!!");
637
638        generalIteratorTest(sentenceBreak, sentenceSelectionData);
639    }
640
641    public void TestBug4113835() {
642        Vector<String> sentenceSelectionData = new Vector<String>();
643
644        // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
645        sentenceSelectionData.addElement("Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029");
646
647        generalIteratorTest(sentenceBreak, sentenceSelectionData);
648    }
649
650    public void TestBug4111338() {
651        Vector<String> sentenceSelectionData = new Vector<String>();
652
653        // test for bug #4111338: Don't break sentences at the boundary between CJK
654        // and other letters
655        sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:\"JAVA\u821c"
656                + "\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba"
657                + "\u611d\u57b6\u2510\u5d46\".\u2029");
658        sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
659                + "\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
660                + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
661        sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4"
662                + "\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8"
663                + "\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
664        sentenceSelectionData.addElement("He said, \"I can go there.\"\u2029");
665
666        generalIteratorTest(sentenceBreak, sentenceSelectionData);
667    }
668
669    public void TestBug4117554Sentences() {
670        Vector<String> sentenceSelectionData = new Vector<String>();
671
672        // Treat fullwidth variants of .!? the same as their
673        // normal counterparts
674        sentenceSelectionData.addElement("I know I'm right\uff0e ");
675        sentenceSelectionData.addElement("Right\uff1f ");
676        sentenceSelectionData.addElement("Right\uff01 ");
677
678        // Don't break sentences at boundary between CJK and digits
679        sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
680                + "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
681                + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
682
683        // Break sentence between a sentence terminator and
684        // opening punctuation
685        sentenceSelectionData.addElement("no?");
686        sentenceSelectionData.addElement("(yes)");
687
688        generalIteratorTest(sentenceBreak, sentenceSelectionData);
689    }
690
691    public void TestBug4158381() {
692        Vector<String> sentenceSelectionData = new Vector<String>();
693
694        // Don't break sentence after period if it isn't followed by a space
695        sentenceSelectionData.addElement("Test <code>Flags.Flag</code> class.  ");
696        sentenceSelectionData.addElement("Another test.\u2029");
697
698        // No breaks when there are no terminators around
699        sentenceSelectionData.addElement("<P>Provides a set of "
700                + "&quot;lightweight&quot; (all-java<FONT SIZE=\"-2\"><SUP>TM"
701                + "</SUP></FONT> language) components that, "
702                + "to the maximum degree possible, work the same on all platforms.  ");
703        sentenceSelectionData.addElement("Another test.\u2029");
704
705        generalIteratorTest(sentenceBreak, sentenceSelectionData);
706    }
707
708    public void TestBug4143071() {
709        Vector<String> sentenceSelectionData = new Vector<String>();
710
711        // Make sure sentences that end with digits work right
712        sentenceSelectionData.addElement("Today is the 27th of May, 1998.  ");
713        sentenceSelectionData.addElement("Tomorrow with be 28 May 1998.  ");
714        sentenceSelectionData.addElement("The day after will be the 30th.\u2029");
715
716        generalIteratorTest(sentenceBreak, sentenceSelectionData);
717    }
718
719    public void TestBug4152416() {
720        Vector<String> sentenceSelectionData = new Vector<String>();
721
722        // Make sure sentences ending with a capital letter are treated correctly
723        sentenceSelectionData.addElement("The type of all primitive "
724                + "<code>boolean</code> values accessed in the target VM.  ");
725        sentenceSelectionData.addElement("Calls to xxx will return an "
726                + "implementor of this interface.\u2029");
727
728        generalIteratorTest(sentenceBreak, sentenceSelectionData);
729    }
730
731    public void TestBug4152117() {
732        Vector<String> sentenceSelectionData = new Vector<String>();
733
734        // Make sure sentence breaking is handling punctuation correctly
735        // [COULD NOT REPRODUCE THIS BUG, BUT TEST IS HERE TO MAKE SURE
736        // IT DOESN'T CROP UP]
737        sentenceSelectionData.addElement("Constructs a randomly generated "
738                + "BigInteger, uniformly distributed over the range <tt>0</tt> "
739                + "to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive.  ");
740        sentenceSelectionData.addElement("The uniformity of the distribution "
741                + "assumes that a fair source of random bits is provided in "
742                + "<tt>rnd</tt>.  ");
743        sentenceSelectionData.addElement("Note that this constructor always "
744                + "constructs a non-negative BigInteger.\u2029");
745
746        generalIteratorTest(sentenceBreak, sentenceSelectionData);
747    }
748
749    public void TestLineBreak() {
750        Vector<String> lineSelectionData = new Vector<String>();
751
752        lineSelectionData.addElement("Multi-");
753        lineSelectionData.addElement("Level ");
754        lineSelectionData.addElement("example ");
755        lineSelectionData.addElement("of ");
756        lineSelectionData.addElement("a ");
757        lineSelectionData.addElement("semi-");
758        lineSelectionData.addElement("idiotic ");
759        lineSelectionData.addElement("non-");
760        lineSelectionData.addElement("sensical ");
761        lineSelectionData.addElement("(non-");
762        lineSelectionData.addElement("important) ");
763        lineSelectionData.addElement("sentence. ");
764
765        lineSelectionData.addElement("Hi  ");
766        lineSelectionData.addElement("Hello ");
767        lineSelectionData.addElement("How\n");
768        lineSelectionData.addElement("are\r");
769        lineSelectionData.addElement("you\u2028");
770        lineSelectionData.addElement("fine.\t");
771        lineSelectionData.addElement("good.  ");
772
773        lineSelectionData.addElement("Now\r");
774        lineSelectionData.addElement("is\n");
775        lineSelectionData.addElement("the\r\n");
776        lineSelectionData.addElement("time\n");
777        lineSelectionData.addElement("\r");
778        lineSelectionData.addElement("for\r");
779        lineSelectionData.addElement("\r");
780        lineSelectionData.addElement("all");
781
782        generalIteratorTest(lineBreak, lineSelectionData);
783    }
784
785    public void TestBug4068133() {
786        Vector<String> lineSelectionData = new Vector<String>();
787
788        lineSelectionData.addElement("\u96f6");
789        lineSelectionData.addElement("\u4e00\u3002");
790        lineSelectionData.addElement("\u4e8c\u3001");
791        lineSelectionData.addElement("\u4e09\u3002\u3001");
792        lineSelectionData.addElement("\u56db\u3001\u3002\u3001");
793        lineSelectionData.addElement("\u4e94,");
794        lineSelectionData.addElement("\u516d.");
795        lineSelectionData.addElement("\u4e03.\u3001,\u3002");
796        lineSelectionData.addElement("\u516b");
797
798        generalIteratorTest(lineBreak, lineSelectionData);
799    }
800
801    public void TestBug4086052() {
802        Vector<String> lineSelectionData = new Vector<String>();
803
804        lineSelectionData.addElement("foo\u00a0bar ");
805//        lineSelectionData.addElement("foo\ufeffbar");
806
807        generalIteratorTest(lineBreak, lineSelectionData);
808    }
809
810    public void TestBug4097920() {
811        Vector<String> lineSelectionData = new Vector<String>();
812
813        lineSelectionData.addElement("dog,");
814        lineSelectionData.addElement("cat,");
815        lineSelectionData.addElement("mouse ");
816        lineSelectionData.addElement("(one)");
817        lineSelectionData.addElement("(two)\n");
818
819        generalIteratorTest(lineBreak, lineSelectionData);
820    }
821    /*
822    public void TestBug4035266() {
823        Vector<String> lineSelectionData = new Vector<String>();
824
825        lineSelectionData.addElement("The ");
826        lineSelectionData.addElement("balance ");
827        lineSelectionData.addElement("is ");
828        lineSelectionData.addElement("$-23,456.78, ");
829        lineSelectionData.addElement("not ");
830        lineSelectionData.addElement("-$32,456.78!\n");
831
832        generalIteratorTest(lineBreak, lineSelectionData);
833    }
834    */
835    public void TestBug4098467Lines() {
836        Vector<String> lineSelectionData = new Vector<String>();
837
838        // What follows is a string of Korean characters (I found it in the Yellow Pages
839        // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
840        // it correctly), first as precomposed syllables, and then as conjoining jamo.
841        // Both sequences should be semantically identical and break the same way.
842        // precomposed syllables...
843        lineSelectionData.addElement("\uc0c1");
844        lineSelectionData.addElement("\ud56d ");
845        lineSelectionData.addElement("\ud55c");
846        lineSelectionData.addElement("\uc778 ");
847        lineSelectionData.addElement("\uc5f0");
848        lineSelectionData.addElement("\ud569 ");
849        lineSelectionData.addElement("\uc7a5");
850        lineSelectionData.addElement("\ub85c");
851        lineSelectionData.addElement("\uad50");
852        lineSelectionData.addElement("\ud68c ");
853        // conjoining jamo...
854        lineSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc ");
855        lineSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab ");
856        lineSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8 ");
857        lineSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
858
859        if (Locale.getDefault().getLanguage().equals("th")) {
860            logln("This test is skipped in th locale.");
861            return;
862        }
863
864        generalIteratorTest(lineBreak, lineSelectionData);
865    }
866
867    public void TestBug4117554Lines() {
868        Vector<String> lineSelectionData = new Vector<String>();
869
870        // Fullwidth .!? should be treated as postJwrd
871        lineSelectionData.addElement("\u4e01\uff0e");
872        lineSelectionData.addElement("\u4e02\uff01");
873        lineSelectionData.addElement("\u4e03\uff1f");
874
875        generalIteratorTest(lineBreak, lineSelectionData);
876    }
877
878    public void TestBug4217703() {
879        if (Locale.getDefault().getLanguage().equals("th")) {
880            logln("This test is skipped in th locale.");
881            return;
882        }
883
884        Vector<String> lineSelectionData = new Vector<String>();
885
886        // There shouldn't be a line break between sentence-ending punctuation
887        // and a closing quote
888        lineSelectionData.addElement("He ");
889        lineSelectionData.addElement("said ");
890        lineSelectionData.addElement("\"Go!\"  ");
891        lineSelectionData.addElement("I ");
892        lineSelectionData.addElement("went.  ");
893
894        lineSelectionData.addElement("Hashtable$Enumeration ");
895        lineSelectionData.addElement("getText().");
896        lineSelectionData.addElement("getIndex()");
897
898        generalIteratorTest(lineBreak, lineSelectionData);
899    }
900
    // Two-char sequences: a base letter followed by one character from the
    // U+03xx block (presumably a combining mark, as the names suggest).
    // TestCharacterBreak adds each of them as a single element of its
    // character-break data.
    private static final String graveS = "S\u0300";
    private static final String acuteBelowI = "i\u0317";
    private static final String acuteE = "e\u0301";
    private static final String circumflexA = "a\u0302";
    private static final String tildeE = "e\u0303";
906
907    public void TestCharacterBreak() {
908        Vector<String> characterSelectionData = new Vector<String>();
909
910        characterSelectionData.addElement(graveS);
911        characterSelectionData.addElement(acuteBelowI);
912        characterSelectionData.addElement("m");
913        characterSelectionData.addElement("p");
914        characterSelectionData.addElement("l");
915        characterSelectionData.addElement(acuteE);
916        characterSelectionData.addElement(" ");
917        characterSelectionData.addElement("s");
918        characterSelectionData.addElement(circumflexA);
919        characterSelectionData.addElement("m");
920        characterSelectionData.addElement("p");
921        characterSelectionData.addElement("l");
922        characterSelectionData.addElement(tildeE);
923        characterSelectionData.addElement(".");
924        characterSelectionData.addElement("w");
925        characterSelectionData.addElement(circumflexA);
926        characterSelectionData.addElement("w");
927        characterSelectionData.addElement("a");
928        characterSelectionData.addElement("f");
929        characterSelectionData.addElement("q");
930        characterSelectionData.addElement("\n");
931        characterSelectionData.addElement("\r");
932        characterSelectionData.addElement("\r\n");
933        characterSelectionData.addElement("\n");
934
935        generalIteratorTest(characterBreak, characterSelectionData);
936    }
937
938    public void TestBug4098467Characters() {
939        Vector<String> characterSelectionData = new Vector<String>();
940
941        // What follows is a string of Korean characters (I found it in the Yellow Pages
942        // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
943        // it correctly), first as precomposed syllables, and then as conjoining jamo.
944        // Both sequences should be semantically identical and break the same way.
945        // precomposed syllables...
946        characterSelectionData.addElement("\uc0c1");
947        characterSelectionData.addElement("\ud56d");
948        characterSelectionData.addElement(" ");
949        characterSelectionData.addElement("\ud55c");
950        characterSelectionData.addElement("\uc778");
951        characterSelectionData.addElement(" ");
952        characterSelectionData.addElement("\uc5f0");
953        characterSelectionData.addElement("\ud569");
954        characterSelectionData.addElement(" ");
955        characterSelectionData.addElement("\uc7a5");
956        characterSelectionData.addElement("\ub85c");
957        characterSelectionData.addElement("\uad50");
958        characterSelectionData.addElement("\ud68c");
959        characterSelectionData.addElement(" ");
960        // conjoining jamo...
961        characterSelectionData.addElement("\u1109\u1161\u11bc");
962        characterSelectionData.addElement("\u1112\u1161\u11bc");
963        characterSelectionData.addElement(" ");
964        characterSelectionData.addElement("\u1112\u1161\u11ab");
965        characterSelectionData.addElement("\u110b\u1175\u11ab");
966        characterSelectionData.addElement(" ");
967        characterSelectionData.addElement("\u110b\u1167\u11ab");
968        characterSelectionData.addElement("\u1112\u1161\u11b8");
969        characterSelectionData.addElement(" ");
970        characterSelectionData.addElement("\u110c\u1161\u11bc");
971        characterSelectionData.addElement("\u1105\u1169");
972        characterSelectionData.addElement("\u1100\u116d");
973        characterSelectionData.addElement("\u1112\u116c");
974
975        generalIteratorTest(characterBreak, characterSelectionData);
976    }
977
978    public void TestBug4153072() {
979        BreakIterator iter = BreakIterator.getWordInstance();
980        String str = "...Hello, World!...";
981        int begin = 3;
982        int end = str.length() - 3;
983        boolean gotException = false;
984        boolean dummy;
985
986        iter.setText(new StringCharacterIterator(str, begin, end, begin));
987        for (int index = -1; index < begin + 1; ++index) {
988            try {
989                dummy = iter.isBoundary(index);
990                if (index < begin)
991                    errln("Didn't get exception with offset = " + index +
992                                    " and begin index = " + begin);
993            }
994            catch (IllegalArgumentException e) {
995                if (index >= begin)
996                    errln("Got exception with offset = " + index +
997                                    " and begin index = " + begin);
998            }
999        }
1000    }
1001
1002    public void TestBug4146175Sentences() {
1003        Vector<String> sentenceSelectionData = new Vector<String>();
1004
1005        // break between periods and opening punctuation even when there's no
1006        // intervening space
1007        sentenceSelectionData.addElement("end.");
1008        sentenceSelectionData.addElement("(This is\u2029");
1009
1010        // treat the fullwidth period as an unambiguous sentence terminator
1011        sentenceSelectionData.addElement("\u7d42\u308f\u308a\uff0e");
1012        sentenceSelectionData.addElement("\u300c\u3053\u308c\u306f");
1013
1014        generalIteratorTest(sentenceBreak, sentenceSelectionData);
1015    }
1016
1017    public void TestBug4146175Lines() {
1018        if (Locale.getDefault().getLanguage().equals("th")) {
1019            logln("This test is skipped in th locale.");
1020            return;
1021        }
1022
1023        Vector<String> lineSelectionData = new Vector<String>();
1024
1025        // the fullwidth comma should stick to the preceding Japanese character
1026        lineSelectionData.addElement("\u7d42\uff0c");
1027        lineSelectionData.addElement("\u308f");
1028
1029        generalIteratorTest(lineBreak, lineSelectionData);
1030    }
1031
1032    public void TestBug4214367() {
1033        if (Locale.getDefault().getLanguage().equals("th")) {
1034            logln("This test is skipped in th locale.");
1035            return;
1036        }
1037
1038        Vector<String> wordSelectionData = new Vector<String>();
1039
1040        // the hiragana and katakana iteration marks and the long vowel mark
1041        // are not being treated correctly by the word-break iterator
1042        wordSelectionData.addElement("\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042");
1043        wordSelectionData.addElement("\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2");
1044
1045        generalIteratorTest(wordBreak, wordSelectionData);
1046    }
1047
    // Common pool of test characters. The Test*Invariants methods append
    // their own iterator-specific characters to this string before handing
    // it to doBreakInvariantTest / doOtherInvariantTest. The leading run of
    // C0 control characters is commented out of the literal below.
    private static final String cannedTestChars // characters of the class Cc are ignorable for breaking
        = /*"\u0000\u0001\u0002\u0003\u0004*/" !\"#$%&()+-01234<=>ABCDE[]^_`abcde{}|\u00a0\u00a2"
        + "\u00a3\u00a4\u00a5\u00a6\u00a7\u00a8\u00a9\u00ab\u00ad\u00ae\u00af\u00b0\u00b2\u00b3"
        + "\u00b4\u00b9\u00bb\u00bc\u00bd\u02b0\u02b1\u02b2\u02b3\u02b4\u0300\u0301\u0302\u0303"
        + "\u0304\u05d0\u05d1\u05d2\u05d3\u05d4\u0903\u093e\u093f\u0940\u0949\u0f3a\u0f3b\u2000"
        + "\u2001\u2002\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2028\u2029\u202a\u203e\u203f"
        + "\u2040\u20dd\u20de\u20df\u20e0\u2160\u2161\u2162\u2163\u2164";
1055
1056    public void TestSentenceInvariants()
1057    {
1058        BreakIterator e = BreakIterator.getSentenceInstance();
1059        doOtherInvariantTest(e, cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff");
1060    }
1061
1062    public void TestWordInvariants()
1063    {
1064        if (Locale.getDefault().getLanguage().equals("th")) {
1065            logln("This test is skipped in th locale.");
1066            return;
1067        }
1068
1069        BreakIterator e = BreakIterator.getWordInstance();
1070        doBreakInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
1071            + "\u30a3\u4e00\u4e01\u4e02");
1072        doOtherInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
1073            + "\u30a3\u4e00\u4e01\u4e02");
1074    }
1075
1076    public void TestLineInvariants()
1077    {
1078        if (Locale.getDefault().getLanguage().equals("th")) {
1079            logln("This test is skipped in th locale.");
1080            return;
1081        }
1082
1083        BreakIterator e = BreakIterator.getLineInstance();
1084        String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045"
1085            + "\u30a3\u4e00\u4e01\u4e02";
1086        doBreakInvariantTest(e, testChars);
1087        doOtherInvariantTest(e, testChars);
1088
1089        int errorCount = 0;
1090
1091        // in addition to the other invariants, a line-break iterator should make sure that:
1092        // it doesn't break around the non-breaking characters
1093        String noBreak = "\u00a0\u2007\u2011\ufeff";
1094        StringBuffer work = new StringBuffer("aaa");
1095        for (int i = 0; i < testChars.length(); i++) {
1096            char c = testChars.charAt(i);
1097            if (c == '\r' || c == '\n' || c == '\u2029' || c == '\u2028' || c == '\u0003')
1098                continue;
1099            work.setCharAt(0, c);
1100            for (int j = 0; j < noBreak.length(); j++) {
1101                work.setCharAt(1, noBreak.charAt(j));
1102                for (int k = 0; k < testChars.length(); k++) {
1103                    work.setCharAt(2, testChars.charAt(k));
1104                    // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
1105                    // for breaking purposes as per UTR14
1106                    int type1 = Character.getType(work.charAt(1));
1107                    int type2 = Character.getType(work.charAt(2));
1108                    if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
1109                        type2 == Character.CONTROL || type2 == Character.FORMAT) {
1110                        continue;
1111                    }
1112                    e.setText(work.toString());
1113                    for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) {
1114                        if (l == 1 || l == 2) {
1115                            //errln("Got break between U+" + Integer.toHexString((int)
1116                            //        (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
1117                            //        (int)(work.charAt(l))) + "\ntype1 = " + type1 + "\ntype2 = " + type2);
1118                            // as per UTR14 spaces followed by a GLUE character should allow
1119                            // line breaking
1120                            if (work.charAt(l-1) == '\u0020' && (work.charAt(l) == '\u00a0' ||
1121                                                                 work.charAt(l) == '\u0f0c' ||
1122                                                                 work.charAt(l) == '\u2007' ||
1123                                                                 work.charAt(l) == '\u2011' ||
1124                                                                 work.charAt(l) == '\u202f' ||
1125                                                                 work.charAt(l) == '\ufeff')) {
1126                                continue;
1127                            }
1128                            errln("Got break between U+" + Integer.toHexString((int)
1129                                    (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
1130                                    (int)(work.charAt(l))));
1131                            errorCount++;
1132                            if (errorCount >= 75)
1133                                return;
1134                        }
1135                    }
1136                }
1137            }
1138        }
1139
1140        // The following test has so many exceptions that it would be better to write a new set of data
1141        // that tested exactly what should be tested
1142        // Until that point it will be commented out
1143        /*
1144
1145        // it does break after dashes (unless they're followed by a digit, a non-spacing mark,
1146        // a currency symbol, a space, a format-control character, a regular control character,
1147        // a line or paragraph separator, or another dash)
1148        String dashes = "-\u00ad\u2010\u2012\u2013\u2014";
1149        for (int i = 0; i < testChars.length(); i++) {
1150            work.setCharAt(0, testChars.charAt(i));
1151            for (int j = 0; j < dashes.length(); j++) {
1152                work.setCharAt(1, dashes.charAt(j));
1153                for (int k = 0; k < testChars.length(); k++) {
1154                    char c = testChars.charAt(k);
1155                    if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER ||
1156                        Character.getType(c) == Character.OTHER_NUMBER ||
1157                        Character.getType(c) == Character.NON_SPACING_MARK ||
1158                        Character.getType(c) == Character.ENCLOSING_MARK ||
1159                        Character.getType(c) == Character.CURRENCY_SYMBOL ||
1160                        Character.getType(c) == Character.DASH_PUNCTUATION ||
1161                        Character.getType(c) == Character.SPACE_SEPARATOR ||
1162                        Character.getType(c) == Character.FORMAT ||
1163                        Character.getType(c) == Character.CONTROL ||
1164                        Character.getType(c) == Character.END_PUNCTUATION ||
1165                        Character.getType(c) == Character.FINAL_QUOTE_PUNCTUATION ||
1166                        Character.getType(c) == Character.OTHER_PUNCTUATION ||
1167                        c == '\'' || c == '\"' ||
1168                        // category EX as per UTR14
1169                        c == '!' || c == '?' || c == '\ufe56' || c == '\ufe57' || c == '\uff01' || c == '\uff1f' ||
1170                        c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' ||
1171                        c == '\u0003' || c == '\u2007' || c == '\u2011' ||
1172                        c == '\ufeff')
1173                        continue;
1174                    work.setCharAt(2, c);
1175                    e.setText(work.toString());
1176                    boolean saw2 = false;
1177                    for (int l = e.first(); l != BreakIterator.DONE; l = e.next())
1178                        if (l == 2)
1179                            saw2 = true;
1180                    if (!saw2) {
1181                        errln("Didn't get break between U+" + Integer.toHexString((int)
1182                                    (work.charAt(1))) + " and U+" + Integer.toHexString(
1183                                    (int)(work.charAt(2))));
1184                        errorCount++;
1185                        if (errorCount >= 75)
1186                            return;
1187                    }
1188                }
1189            }
1190        }
1191        */
1192    }
1193
1194    public void TestCharacterInvariants()
1195    {
1196        BreakIterator e = BreakIterator.getCharacterInstance();
1197        doBreakInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
1198            + "\u11a9\u11aa");
1199        doOtherInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
1200            + "\u11a9\u11aa");
1201    }
1202
1203    public void TestEmptyString()
1204    {
1205        String text = "";
1206        Vector<String> x = new Vector<String>();
1207        x.addElement(text);
1208
1209        generalIteratorTest(lineBreak, x);
1210    }
1211
1212    public void TestGetAvailableLocales()
1213    {
1214        Locale[] locList = BreakIterator.getAvailableLocales();
1215
1216        if (locList.length == 0)
1217            errln("getAvailableLocales() returned an empty list!");
1218        // I have no idea how to test this function...
1219    }
1220
1221
1222    /**
1223     * Bug 4095322
1224     */
1225    public void TestJapaneseLineBreak()
1226    {
1227        StringBuffer testString = new StringBuffer("\u4e00x\u4e8c");
1228        // Breaking on <Kanji>$<Kanji> is inconsistent
1229
1230        /* Characters in precedingChars and followingChars have been updated
1231         * from Unicode 2.0.14-based to 3.0.0-based when 4638433 was fixed.
1232         * In concrete terms,
1233         *   0x301F : Its category was changed from Ps to Pe since Unicode 2.1.
1234         *   0x169B & 0x169C : added since Unicode 3.0.0.
1235         */
1236        String precedingChars =
1237            /* Puctuation, Open */
1238          "([{\u201a\u201e\u2045\u207d\u208d\u2329\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff62\u169b"
1239            /* Punctuation, Initial quote */
1240          + "\u00ab\u2018\u201b\u201c\u201f\u2039"
1241            /* Symbol, Currency */
1242          + "\u00a5\u00a3\u00a4\u20a0";
1243
1244        String followingChars =
1245            /* Puctuation, Close */
1246          ")]}\u2046\u207e\u208e\u232a\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e\u301f\ufd3e\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42\ufe44\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff63\u169c"
1247            /* Punctuation, Final quote */
1248          + "\u00bb\u2019\u201d\u203a"
1249            /* Punctuation, Other */
1250          + "!%,.:;\u3001\u3002\u2030\u2031\u2032\u2033\u2034"
1251            /* Punctuation, Dash */
1252          + "\u2103\u2109"
1253            /* Symbol, Currency */
1254          + "\u00a2"
1255            /* Letter, Modifier */
1256          + "\u3005\u309d\u309e"
1257            /* Letter, Other */
1258          + "\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc\u30fd\u30fe"
1259           /* Mark, Non-Spacing */
1260          + "\u0300\u0301\u0302"
1261            /* Symbol, Modifier */
1262          + "\u309b\u309c"
1263            /* Symbol, Other */
1264          + "\u00b0";
1265
1266        BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN);
1267
1268        for (int i = 0; i < precedingChars.length(); i++) {
1269            testString.setCharAt(1, precedingChars.charAt(i));
1270            iter.setText(testString.toString());
1271            int j = iter.first();
1272            if (j != 0) {
1273                errln("ja line break failure: failed to start at 0 and bounced at " + j);
1274            }
1275            j = iter.next();
1276            if (j != 1) {
1277                errln("ja line break failure: failed to stop before '"
1278                        + precedingChars.charAt(i) + "' (\\u"
1279                        + Integer.toString(precedingChars.charAt(i), 16)
1280                        + ") at 1 and bounded at " + j);
1281            }
1282            j = iter.next();
1283            if (j != 3) {
1284                errln("ja line break failure: failed to skip position after '"
1285                        + precedingChars.charAt(i) + "' (\\u"
1286                        + Integer.toString(precedingChars.charAt(i), 16)
1287                        + ") at 3 and bounded at " + j);
1288            }
1289        }
1290
1291        for (int i = 0; i < followingChars.length(); i++) {
1292            testString.setCharAt(1, followingChars.charAt(i));
1293            iter.setText(testString.toString());
1294            int j = iter.first();
1295            if (j != 0) {
1296                errln("ja line break failure: failed to start at 0 and bounded at " + j);
1297            }
1298            j = iter.next();
1299            if (j != 2) {
1300                errln("ja line break failure: failed to skip position before '"
1301                        + followingChars.charAt(i) + "' (\\u"
1302                        + Integer.toString(followingChars.charAt(i), 16)
1303                        + ") at 2 and bounded at " + j);
1304            }
1305            j = iter.next();
1306            if (j != 3) {
1307                errln("ja line break failure: failed to stop after '"
1308                        + followingChars.charAt(i) + "' (\\u"
1309                        + Integer.toString(followingChars.charAt(i), 16)
1310                        + ") at 3 and bounded at " + j);
1311            }
1312        }
1313    }
1314
1315    /**
1316     * Bug 4638433
1317     */
1318    public void TestLineBreakBasedOnUnicode3_0_0()
1319    {
1320        BreakIterator iter;
1321        int i;
1322
1323        /* Latin Extend-B characters
1324         * 0x0218-0x0233 which have been added since Unicode 3.0.0.
1325         */
1326        iter = BreakIterator.getWordInstance(Locale.US);
1327        iter.setText("\u0216\u0217\u0218\u0219\u021A");
1328        i = iter.first();
1329        i = iter.next();
1330        if (i != 5) {
1331            errln("Word break failure: failed to stop at 5 and bounded at " + i);
1332        }
1333
1334
1335        iter = BreakIterator.getLineInstance(Locale.US);
1336
1337        /* <Three(Nd)><Two(Nd)><Low Double Prime Quotation Mark(Pe)><One(Nd)>
1338         * \u301f has changed its category from Ps to Pe since Unicode 2.1.
1339         */
1340        iter.setText("32\u301f1");
1341        i = iter.first();
1342        i = iter.next();
1343        if (i != 3) {
1344            errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i);
1345        }
1346
1347        /* Mongolian <Letter A(Lo)><Todo Soft Hyphen(Pd)><Letter E(Lo)>
1348         * which have been added since Unicode 3.0.0.
1349         */
1350        iter.setText("\u1820\u1806\u1821");
1351        i = iter.first();
1352        i = iter.next();
1353        if (i != 2) {
1354            errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i);
1355        }
1356
1357        /* Khmer <ZERO(Nd)><Currency Symbol(Sc)><ONE(Nd)> which have
1358         * been added since Unicode 3.0.0.
1359         */
1360        iter.setText("\u17E0\u17DB\u17E1");
1361        i = iter.first();
1362        i = iter.next();
1363        if (i != 1) {
1364            errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i);
1365        }
1366        i = iter.next();
1367        if (i != 3) {
1368            errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i);
1369        }
1370
1371        /* Ogham <Letter UR(Lo)><Space Mark(Zs)><Letter OR(Lo)> which have
1372         * been added since Unicode 3.0.0.
1373         */
1374        iter.setText("\u1692\u1680\u1696");
1375        i = iter.first();
1376        i = iter.next();
1377        if (i != 2) {
1378            errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i);
1379        }
1380
1381
1382        // Confirm changes in BreakIteratorRules_th.java have been reflected.
1383        iter = BreakIterator.getLineInstance(new Locale("th", ""));
1384
1385        /* Thai <Seven(Nd)>
1386         *      <Left Double Quotation Mark(Pi)>
1387         *      <Five(Nd)>
1388         *      <Right Double Quotation Mark(Pf)>
1389         *      <Three(Nd)>
1390         */
1391        iter.setText("\u0E57\u201C\u0E55\u201D\u0E53");
1392        i = iter.first();
1393        i = iter.next();
1394        if (i != 1) {
1395            errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i);
1396        }
1397        i = iter.next();
1398        if (i != 4) {
1399            errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i);
1400        }
1401    }
1402
1403    /**
1404     * Bug 4068137
1405     */
1406    public void TestEndBehavior()
1407    {
1408        String testString = "boo.";
1409        BreakIterator wb = BreakIterator.getWordInstance();
1410        wb.setText(testString);
1411
1412        if (wb.first() != 0)
1413            errln("Didn't get break at beginning of string.");
1414        if (wb.next() != 3)
1415            errln("Didn't get break before period in \"boo.\"");
1416        if (wb.current() != 4 && wb.next() != 4)
1417            errln("Didn't get break at end of string.");
1418    }
1419
1420    // [serialization test has been removed pursuant to bug #4152965]
1421
1422    /**
1423     * Bug 4450804
1424     */
1425    public void TestLineBreakContractions() {
1426        Vector<String> expected = new Vector<String>();
1427
1428        expected.add("These ");
1429        expected.add("are ");
1430        expected.add("'foobles'. ");
1431        expected.add("Don't ");
1432        expected.add("you ");
1433        expected.add("like ");
1434        expected.add("them?");
1435        generalIteratorTest(lineBreak, expected);
1436    }
1437
1438}
1439