1/*
2 * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26/*
27 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
28 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
29 *
30 * The original version of this source code and documentation
31 * is copyrighted and owned by Taligent, Inc., a wholly-owned
32 * subsidiary of IBM. These materials are provided under terms
33 * of a License Agreement between Taligent and Sun. This technology
34 * is protected by multiple US and International patents.
35 *
36 * This notice and attribution to Taligent may not be removed.
37 * Taligent is a registered trademark of Taligent, Inc.
38 *
39 */
40
41package java.text;
42
43import java.lang.ref.SoftReference;
44import java.text.spi.BreakIteratorProvider;
45import java.util.Locale;
46import sun.util.locale.provider.LocaleProviderAdapter;
47import sun.util.locale.provider.LocaleServiceProviderPool;
48
49
50/**
51 * The <code>BreakIterator</code> class implements methods for finding
52 * the location of boundaries in text. Instances of <code>BreakIterator</code>
53 * maintain a current position and scan over text
54 * returning the index of characters where boundaries occur.
55 * Internally, <code>BreakIterator</code> scans text using a
56 * <code>CharacterIterator</code>, and is thus able to scan text held
57 * by any object implementing that protocol. A <code>StringCharacterIterator</code>
58 * is used to scan <code>String</code> objects passed to <code>setText</code>.
59 *
60 * <p>
61 * You use the factory methods provided by this class to create
62 * instances of various types of break iterators. In particular,
63 * use <code>getWordInstance</code>, <code>getLineInstance</code>,
64 * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
65 * to create <code>BreakIterator</code>s that perform
66 * word, line, sentence, and character boundary analysis respectively.
67 * A single <code>BreakIterator</code> can work only on one unit
68 * (word, line, sentence, and so on). You must use a different iterator
69 * for each unit boundary analysis you wish to perform.
70 *
71 * <p><a id="line"></a>
72 * Line boundary analysis determines where a text string can be
73 * broken when line-wrapping. The mechanism correctly handles
74 * punctuation and hyphenated words. Actual line breaking needs
75 * to also consider the available line width and is handled by
76 * higher-level software.
77 *
78 * <p><a id="sentence"></a>
79 * Sentence boundary analysis allows selection with correct interpretation
80 * of periods within numbers and abbreviations, and trailing punctuation
81 * marks such as quotation marks and parentheses.
82 *
83 * <p><a id="word"></a>
84 * Word boundary analysis is used by search and replace functions, as
85 * well as within text editing applications that allow the user to
86 * select words with a double click. Word selection provides correct
87 * interpretation of punctuation marks within and following
88 * words. Characters that are not part of a word, such as symbols
89 * or punctuation marks, have word-breaks on both sides.
90 *
91 * <p><a id="character"></a>
92 * Character boundary analysis allows users to interact with characters
93 * as they expect to, for example, when moving the cursor through a text
94 * string. Character boundary analysis provides correct navigation
95 * through character strings, regardless of how the character is stored.
96 * The boundaries returned may be those of supplementary characters,
97 * combining character sequences, or ligature clusters.
98 * For example, an accented character might be stored as a base character
99 * and a diacritical mark. What users consider to be a character can
100 * differ between languages.
101 *
102 * <p>
103 * The <code>BreakIterator</code> instances returned by the factory methods
104 * of this class are intended for use with natural languages only, not for
105 * programming language text. It is however possible to define subclasses
106 * that tokenize a programming language.
107 *
108 * <P>
109 * <strong>Examples</strong>:<P>
110 * Creating and using text boundaries:
111 * <blockquote>
112 * <pre>
113 * public static void main(String args[]) {
114 *      if (args.length == 1) {
115 *          String stringToExamine = args[0];
116 *          //print each word in order
117 *          BreakIterator boundary = BreakIterator.getWordInstance();
118 *          boundary.setText(stringToExamine);
119 *          printEachForward(boundary, stringToExamine);
120 *          //print each sentence in reverse order
121 *          boundary = BreakIterator.getSentenceInstance(Locale.US);
122 *          boundary.setText(stringToExamine);
123 *          printEachBackward(boundary, stringToExamine);
124 *          printFirst(boundary, stringToExamine);
125 *          printLast(boundary, stringToExamine);
126 *      }
127 * }
128 * </pre>
129 * </blockquote>
130 *
131 * Print each element in order:
132 * <blockquote>
133 * <pre>
134 * public static void printEachForward(BreakIterator boundary, String source) {
135 *     int start = boundary.first();
136 *     for (int end = boundary.next();
137 *          end != BreakIterator.DONE;
138 *          start = end, end = boundary.next()) {
139 *          System.out.println(source.substring(start,end));
140 *     }
141 * }
142 * </pre>
143 * </blockquote>
144 *
145 * Print each element in reverse order:
146 * <blockquote>
147 * <pre>
148 * public static void printEachBackward(BreakIterator boundary, String source) {
149 *     int end = boundary.last();
150 *     for (int start = boundary.previous();
151 *          start != BreakIterator.DONE;
152 *          end = start, start = boundary.previous()) {
153 *         System.out.println(source.substring(start,end));
154 *     }
155 * }
156 * </pre>
157 * </blockquote>
158 *
159 * Print first element:
160 * <blockquote>
161 * <pre>
162 * public static void printFirst(BreakIterator boundary, String source) {
163 *     int start = boundary.first();
164 *     int end = boundary.next();
165 *     System.out.println(source.substring(start,end));
166 * }
167 * </pre>
168 * </blockquote>
169 *
170 * Print last element:
171 * <blockquote>
172 * <pre>
173 * public static void printLast(BreakIterator boundary, String source) {
174 *     int end = boundary.last();
175 *     int start = boundary.previous();
176 *     System.out.println(source.substring(start,end));
177 * }
178 * </pre>
179 * </blockquote>
180 *
181 * Print the element at a specified position:
182 * <blockquote>
183 * <pre>
184 * public static void printAt(BreakIterator boundary, int pos, String source) {
185 *     int end = boundary.following(pos);
186 *     int start = boundary.previous();
187 *     System.out.println(source.substring(start,end));
188 * }
189 * </pre>
190 * </blockquote>
191 *
192 * Find the next word:
193 * <blockquote>
194 * <pre>{@code
195 * public static int nextWordStartAfter(int pos, String text) {
196 *     BreakIterator wb = BreakIterator.getWordInstance();
197 *     wb.setText(text);
198 *     int last = wb.following(pos);
199 *     int current = wb.next();
200 *     while (current != BreakIterator.DONE) {
201 *         for (int p = last; p < current; p++) {
202 *             if (Character.isLetter(text.codePointAt(p)))
203 *                 return last;
204 *         }
205 *         last = current;
206 *         current = wb.next();
207 *     }
208 *     return BreakIterator.DONE;
209 * }
210 * }</pre>
211 * (The iterator returned by BreakIterator.getWordInstance() is unique in that
212 * the break positions it returns don't represent both the start and end of the
213 * thing being iterated over.  That is, a sentence-break iterator returns breaks
214 * that each represent the end of one sentence and the beginning of the next.
215 * With the word-break iterator, the characters between two boundaries might be a
216 * word, or they might be the punctuation or whitespace between two words.  The
217 * above code uses a simple heuristic to determine which boundary is the beginning
218 * of a word: If the characters between this boundary and the next boundary
219 * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
220 * a Hangul syllable, a Kana character, etc.), then the text between this boundary
221 * and the next is a word; otherwise, it's the material between words.)
222 * </blockquote>
223 *
224 * @since 1.1
225 * @see CharacterIterator
226 *
227 */
228
229public abstract class BreakIterator implements Cloneable
230{
231    /**
232     * Constructor. BreakIterator is stateless and has no default behavior.
233     */
234    protected BreakIterator()
235    {
236    }
237
238    /**
239     * Create a copy of this iterator
240     * @return A copy of this
241     */
242    @Override
243    public Object clone()
244    {
245        try {
246            return super.clone();
247        }
248        catch (CloneNotSupportedException e) {
249            throw new InternalError(e);
250        }
251    }
252
253    /**
254     * DONE is returned by previous(), next(), next(int), preceding(int)
255     * and following(int) when either the first or last text boundary has been
256     * reached.
257     */
258    public static final int DONE = -1;
259
260    /**
261     * Returns the first boundary. The iterator's current position is set
262     * to the first text boundary.
263     * @return The character index of the first text boundary.
264     */
265    public abstract int first();
266
267    /**
268     * Returns the last boundary. The iterator's current position is set
269     * to the last text boundary.
270     * @return The character index of the last text boundary.
271     */
272    public abstract int last();
273
274    /**
275     * Returns the nth boundary from the current boundary. If either
276     * the first or last text boundary has been reached, it returns
277     * <code>BreakIterator.DONE</code> and the current position is set to either
278     * the first or last text boundary depending on which one is reached. Otherwise,
279     * the iterator's current position is set to the new boundary.
280     * For example, if the iterator's current position is the mth text boundary
281     * and three more boundaries exist from the current boundary to the last text
282     * boundary, the next(2) call will return m + 2. The new text position is set
283     * to the (m + 2)th text boundary. A next(4) call would return
284     * <code>BreakIterator.DONE</code> and the last text boundary would become the
285     * new text position.
286     * @param n which boundary to return.  A value of 0
287     * does nothing.  Negative values move to previous boundaries
288     * and positive values move to later boundaries.
289     * @return The character index of the nth boundary from the current position
290     * or <code>BreakIterator.DONE</code> if either first or last text boundary
291     * has been reached.
292     */
293    public abstract int next(int n);
294
295    /**
296     * Returns the boundary following the current boundary. If the current boundary
297     * is the last text boundary, it returns <code>BreakIterator.DONE</code> and
298     * the iterator's current position is unchanged. Otherwise, the iterator's
299     * current position is set to the boundary following the current boundary.
300     * @return The character index of the next text boundary or
301     * <code>BreakIterator.DONE</code> if the current boundary is the last text
302     * boundary.
303     * Equivalent to next(1).
304     * @see #next(int)
305     */
306    public abstract int next();
307
308    /**
309     * Returns the boundary preceding the current boundary. If the current boundary
310     * is the first text boundary, it returns <code>BreakIterator.DONE</code> and
311     * the iterator's current position is unchanged. Otherwise, the iterator's
312     * current position is set to the boundary preceding the current boundary.
313     * @return The character index of the previous text boundary or
314     * <code>BreakIterator.DONE</code> if the current boundary is the first text
315     * boundary.
316     */
317    public abstract int previous();
318
319    /**
320     * Returns the first boundary following the specified character offset. If the
321     * specified offset equals to the last text boundary, it returns
322     * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
323     * Otherwise, the iterator's current position is set to the returned boundary.
324     * The value returned is always greater than the offset or the value
325     * <code>BreakIterator.DONE</code>.
326     * @param offset the character offset to begin scanning.
327     * @return The first boundary after the specified offset or
328     * <code>BreakIterator.DONE</code> if the last text boundary is passed in
329     * as the offset.
330     * @exception  IllegalArgumentException if the specified offset is less than
331     * the first text boundary or greater than the last text boundary.
332     */
333    public abstract int following(int offset);
334
335    /**
336     * Returns the last boundary preceding the specified character offset. If the
337     * specified offset equals to the first text boundary, it returns
338     * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
339     * Otherwise, the iterator's current position is set to the returned boundary.
340     * The value returned is always less than the offset or the value
341     * <code>BreakIterator.DONE</code>.
342     * @param offset the character offset to begin scanning.
343     * @return The last boundary before the specified offset or
344     * <code>BreakIterator.DONE</code> if the first text boundary is passed in
345     * as the offset.
346     * @exception   IllegalArgumentException if the specified offset is less than
347     * the first text boundary or greater than the last text boundary.
348     * @since 1.2
349     */
350    public int preceding(int offset) {
351        // NOTE:  This implementation is here solely because we can't add new
352        // abstract methods to an existing class.  There is almost ALWAYS a
353        // better, faster way to do this.
354        int pos = following(offset);
355        while (pos >= offset && pos != DONE) {
356            pos = previous();
357        }
358        return pos;
359    }
360
361    /**
362     * Returns true if the specified character offset is a text boundary.
363     * @param offset the character offset to check.
364     * @return <code>true</code> if "offset" is a boundary position,
365     * <code>false</code> otherwise.
366     * @exception   IllegalArgumentException if the specified offset is less than
367     * the first text boundary or greater than the last text boundary.
368     * @since 1.2
369     */
370    public boolean isBoundary(int offset) {
371        // NOTE: This implementation probably is wrong for most situations
372        // because it fails to take into account the possibility that a
373        // CharacterIterator passed to setText() may not have a begin offset
374        // of 0.  But since the abstract BreakIterator doesn't have that
375        // knowledge, it assumes the begin offset is 0.  If you subclass
376        // BreakIterator, copy the SimpleTextBoundary implementation of this
377        // function into your subclass.  [This should have been abstract at
378        // this level, but it's too late to fix that now.]
379        if (offset == 0) {
380            return true;
381        }
382        int boundary = following(offset - 1);
383        if (boundary == DONE) {
384            throw new IllegalArgumentException();
385        }
386        return boundary == offset;
387    }
388
389    /**
390     * Returns character index of the text boundary that was most
391     * recently returned by next(), next(int), previous(), first(), last(),
392     * following(int) or preceding(int). If any of these methods returns
393     * <code>BreakIterator.DONE</code> because either first or last text boundary
394     * has been reached, it returns the first or last text boundary depending on
395     * which one is reached.
396     * @return The text boundary returned from the above methods, first or last
397     * text boundary.
398     * @see #next()
399     * @see #next(int)
400     * @see #previous()
401     * @see #first()
402     * @see #last()
403     * @see #following(int)
404     * @see #preceding(int)
405     */
406    public abstract int current();
407
408    /**
409     * Get the text being scanned
410     * @return the text being scanned
411     */
412    public abstract CharacterIterator getText();
413
414    /**
415     * Set a new text string to be scanned.  The current scan
416     * position is reset to first().
417     * @param newText new text to scan.
418     */
419    public void setText(String newText)
420    {
421        setText(new StringCharacterIterator(newText));
422    }
423
424    /**
425     * Set a new text for scanning.  The current scan
426     * position is reset to first().
427     * @param newText new text to scan.
428     */
429    public abstract void setText(CharacterIterator newText);
430
431    private static final int CHARACTER_INDEX = 0;
432    private static final int WORD_INDEX = 1;
433    private static final int LINE_INDEX = 2;
434    private static final int SENTENCE_INDEX = 3;
435
436    @SuppressWarnings("unchecked")
437    private static final SoftReference<BreakIteratorCache>[] iterCache = (SoftReference<BreakIteratorCache>[]) new SoftReference<?>[4];
438
439    /**
440     * Returns a new <code>BreakIterator</code> instance
441     * for <a href="BreakIterator.html#word">word breaks</a>
442     * for the {@linkplain Locale#getDefault() default locale}.
443     * @return A break iterator for word breaks
444     */
445    public static BreakIterator getWordInstance()
446    {
447        return getWordInstance(Locale.getDefault());
448    }
449
450    /**
451     * Returns a new <code>BreakIterator</code> instance
452     * for <a href="BreakIterator.html#word">word breaks</a>
453     * for the given locale.
454     * @param locale the desired locale
455     * @return A break iterator for word breaks
456     * @exception NullPointerException if <code>locale</code> is null
457     */
458    public static BreakIterator getWordInstance(Locale locale)
459    {
460        return getBreakInstance(locale, WORD_INDEX);
461    }
462
463    /**
464     * Returns a new <code>BreakIterator</code> instance
465     * for <a href="BreakIterator.html#line">line breaks</a>
466     * for the {@linkplain Locale#getDefault() default locale}.
467     * @return A break iterator for line breaks
468     */
469    public static BreakIterator getLineInstance()
470    {
471        return getLineInstance(Locale.getDefault());
472    }
473
474    /**
475     * Returns a new <code>BreakIterator</code> instance
476     * for <a href="BreakIterator.html#line">line breaks</a>
477     * for the given locale.
478     * @param locale the desired locale
479     * @return A break iterator for line breaks
480     * @exception NullPointerException if <code>locale</code> is null
481     */
482    public static BreakIterator getLineInstance(Locale locale)
483    {
484        return getBreakInstance(locale, LINE_INDEX);
485    }
486
487    /**
488     * Returns a new <code>BreakIterator</code> instance
489     * for <a href="BreakIterator.html#character">character breaks</a>
490     * for the {@linkplain Locale#getDefault() default locale}.
491     * @return A break iterator for character breaks
492     */
493    public static BreakIterator getCharacterInstance()
494    {
495        return getCharacterInstance(Locale.getDefault());
496    }
497
498    /**
499     * Returns a new <code>BreakIterator</code> instance
500     * for <a href="BreakIterator.html#character">character breaks</a>
501     * for the given locale.
502     * @param locale the desired locale
503     * @return A break iterator for character breaks
504     * @exception NullPointerException if <code>locale</code> is null
505     */
506    public static BreakIterator getCharacterInstance(Locale locale)
507    {
508        return getBreakInstance(locale, CHARACTER_INDEX);
509    }
510
511    /**
512     * Returns a new <code>BreakIterator</code> instance
513     * for <a href="BreakIterator.html#sentence">sentence breaks</a>
514     * for the {@linkplain Locale#getDefault() default locale}.
515     * @return A break iterator for sentence breaks
516     */
517    public static BreakIterator getSentenceInstance()
518    {
519        return getSentenceInstance(Locale.getDefault());
520    }
521
522    /**
523     * Returns a new <code>BreakIterator</code> instance
524     * for <a href="BreakIterator.html#sentence">sentence breaks</a>
525     * for the given locale.
526     * @param locale the desired locale
527     * @return A break iterator for sentence breaks
528     * @exception NullPointerException if <code>locale</code> is null
529     */
530    public static BreakIterator getSentenceInstance(Locale locale)
531    {
532        return getBreakInstance(locale, SENTENCE_INDEX);
533    }
534
535    private static BreakIterator getBreakInstance(Locale locale, int type) {
536        if (iterCache[type] != null) {
537            BreakIteratorCache cache = iterCache[type].get();
538            if (cache != null) {
539                if (cache.getLocale().equals(locale)) {
540                    return cache.createBreakInstance();
541                }
542            }
543        }
544
545        BreakIterator result = createBreakInstance(locale, type);
546        BreakIteratorCache cache = new BreakIteratorCache(locale, result);
547        iterCache[type] = new SoftReference<>(cache);
548        return result;
549    }
550
551    private static BreakIterator createBreakInstance(Locale locale,
552                                                     int type) {
553        LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale);
554        BreakIterator iterator = createBreakInstance(adapter, locale, type);
555        if (iterator == null) {
556            iterator = createBreakInstance(LocaleProviderAdapter.forJRE(), locale, type);
557        }
558        return iterator;
559    }
560
561    private static BreakIterator createBreakInstance(LocaleProviderAdapter adapter, Locale locale, int type) {
562        BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider();
563        BreakIterator iterator = null;
564        switch (type) {
565        case CHARACTER_INDEX:
566            iterator = breakIteratorProvider.getCharacterInstance(locale);
567            break;
568        case WORD_INDEX:
569            iterator = breakIteratorProvider.getWordInstance(locale);
570            break;
571        case LINE_INDEX:
572            iterator = breakIteratorProvider.getLineInstance(locale);
573            break;
574        case SENTENCE_INDEX:
575            iterator = breakIteratorProvider.getSentenceInstance(locale);
576            break;
577        }
578        return iterator;
579    }
580
581    /**
582     * Returns an array of all locales for which the
583     * <code>get*Instance</code> methods of this class can return
584     * localized instances.
585     * The returned array represents the union of locales supported by the Java
586     * runtime and by installed
587     * {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations.
588     * It must contain at least a <code>Locale</code>
589     * instance equal to {@link java.util.Locale#US Locale.US}.
590     *
591     * @return An array of locales for which localized
592     *         <code>BreakIterator</code> instances are available.
593     */
594    public static synchronized Locale[] getAvailableLocales()
595    {
596        LocaleServiceProviderPool pool =
597            LocaleServiceProviderPool.getPool(BreakIteratorProvider.class);
598        return pool.getAvailableLocales();
599    }
600
601    private static final class BreakIteratorCache {
602
603        private BreakIterator iter;
604        private Locale locale;
605
606        BreakIteratorCache(Locale locale, BreakIterator iter) {
607            this.locale = locale;
608            this.iter = (BreakIterator) iter.clone();
609        }
610
611        Locale getLocale() {
612            return locale;
613        }
614
615        BreakIterator createBreakInstance() {
616            return (BreakIterator) iter.clone();
617        }
618    }
619}
620