1/*
2 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26/*
27 *******************************************************************************
28 * Copyright (C) 2000-2014, International Business Machines Corporation and
29 * others. All Rights Reserved.
30 *******************************************************************************
31 */
32package sun.text.normalizer;
33
34import java.text.CharacterIterator;
35import java.text.Normalizer;
36
37/**
38 * Unicode Normalization
39 *
40 * <h2>Unicode normalization API</h2>
41 *
42 * <code>normalize</code> transforms Unicode text into an equivalent composed or
43 * decomposed form, allowing for easier sorting and searching of text.
44 * <code>normalize</code> supports the standard normalization forms described in
45 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
46 * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
47 *
48 * Characters with accents or other adornments can be encoded in
49 * several different ways in Unicode.  For example, take the character A-acute.
50 * In Unicode, this can be encoded as a single character (the
51 * "composed" form):
52 *
53 * <pre>
54 *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
55 * </pre>
56 *
57 * or as two separate characters (the "decomposed" form):
58 *
59 * <pre>
60 *      0041    LATIN CAPITAL LETTER A
61 *      0301    COMBINING ACUTE ACCENT
62 * </pre>
63 *
64 * To a user of your program, however, both of these sequences should be
65 * treated as the same "user-level" character "A with acute accent".  When you
66 * are searching or comparing text, you must ensure that these two sequences are
67 * treated equivalently.  In addition, you must handle characters with more than
68 * one accent.  Sometimes the order of a character's combining accents is
69 * significant, while in other cases accent sequences in different orders are
70 * really equivalent.
71 *
72 * Similarly, the string "ffi" can be encoded as three separate letters:
73 *
74 * <pre>
75 *      0066    LATIN SMALL LETTER F
76 *      0066    LATIN SMALL LETTER F
77 *      0069    LATIN SMALL LETTER I
78 * </pre>
79 *
80 * or as the single character
81 *
82 * <pre>
83 *      FB03    LATIN SMALL LIGATURE FFI
84 * </pre>
85 *
86 * The ffi ligature is not a distinct semantic character, and strictly speaking
87 * it shouldn't be in Unicode at all, but it was included for compatibility
88 * with existing character sets that already provided it.  The Unicode standard
89 * identifies such characters by giving them "compatibility" decompositions
90 * into the corresponding semantic characters.  When sorting and searching, you
91 * will often want to use these mappings.
92 *
93 * <code>normalize</code> helps solve these problems by transforming text into
94 * the canonical composed and decomposed forms as shown in the first example
95 * above. In addition, you can have it perform compatibility decompositions so
96 * that you can treat compatibility characters the same as their equivalents.
97 * Finally, <code>normalize</code> rearranges accents into the proper canonical
98 * order, so that you do not have to worry about accent rearrangement on your
99 * own.
100 *
101 * Form FCD, "Fast C or D", is also designed for collation.
 * It allows working on strings that are not necessarily normalized
103 * with an algorithm (like in collation) that works under "canonical closure",
104 * i.e., it treats precomposed characters and their decomposed equivalents the
105 * same.
106 *
107 * It is not a normalization form because it does not provide for uniqueness of
108 * representation. Multiple strings may be canonically equivalent (their NFDs
109 * are identical) and may all conform to FCD without being identical themselves.
110 *
111 * The form is defined such that the "raw decomposition", the recursive
112 * canonical decomposition of each character, results in a string that is
113 * canonically ordered. This means that precomposed characters are allowed for
114 * as long as their decompositions do not need canonical reordering.
115 *
116 * Its advantage for a process like collation is that all NFD and most NFC texts
117 * - and many unnormalized texts - already conform to FCD and do not need to be
118 * normalized (NFD) for such a process. The FCD quick check will return YES for
119 * most strings in practice.
120 *
121 * normalize(FCD) may be implemented with NFD.
122 *
123 * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
124 * http://www.unicode.org/notes/tn5/#FCD
125 *
126 * ICU collation performs either NFD or FCD normalization automatically if
127 * normalization is turned on for the collator object. Beyond collation and
128 * string search, normalized strings may be useful for string equivalence
129 * comparisons, transliteration/transcription, unique representations, etc.
130 *
 * The W3C generally recommends exchanging text in NFC.
132 * Note also that most legacy character encodings use only precomposed forms and
133 * often do not encode any combining marks by themselves. For conversion to such
134 * character encodings the Unicode text needs to be normalized to NFC.
135 * For more usage examples, see the Unicode Standard Annex.
136 *
137 * Note: The Normalizer class also provides API for iterative normalization.
138 * While the setIndex() and getIndex() refer to indices in the
139 * underlying Unicode input text, the next() and previous() methods
140 * iterate through characters in the normalized output.
141 * This means that there is not necessarily a one-to-one correspondence
142 * between characters returned by next() and previous() and the indices
143 * passed to and returned from setIndex() and getIndex().
144 * It is for this reason that Normalizer does not implement the CharacterIterator interface.
145 *
146 * @stable ICU 2.8
147 */
148// Original filename in ICU4J: Normalizer.java
149public final class NormalizerBase implements Cloneable {
150
    // The input text and our position in it.
    // Wrapper around the source text; replaced as a whole by setText().
    private UCharacterIterator  text;
    // Normalizer implementation in use, obtained from mode.getNormalizer2(options).
    private Normalizer2         norm2;
    // Normalization mode chosen in the constructor or through setMode().
    private Mode                mode;
    // Option bits handed to the constructor (for example UNICODE_3_2).
    private int                 options;

    // The normalization buffer is the result of normalization
    // of the source in [currentIndex..nextIndex].
    // Both values are indices into the input text, not into the buffer.
    private int                 currentIndex;
    private int                 nextIndex;

    // A buffer for holding intermediate results
    private StringBuilder       buffer;
    // Iteration position inside buffer, counted in chars (it is advanced by
    // Character.charCount of each code point read).
    private int                 bufferPos;
165
166    // Helper classes to defer loading of normalization data.
167    private static final class ModeImpl {
168        private ModeImpl(Normalizer2 n2) {
169            normalizer2 = n2;
170        }
171        private final Normalizer2 normalizer2;
172    }
173
    // Each of the following nested classes owns a single static INSTANCE.
    // A nested class is only initialized when it is first used, so the
    // corresponding Normalizer2 is not requested until a Mode actually
    // reads that INSTANCE.

    // Holder for the standard NFD normalizer.
    private static final class NFDModeImpl {
        private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
    }

    // Holder for the standard NFKD normalizer.
    private static final class NFKDModeImpl {
        private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
    }

    // Holder for the standard NFC normalizer.
    private static final class NFCModeImpl {
        private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
    }

    // Holder for the standard NFKC normalizer.
    private static final class NFKCModeImpl {
        private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
    }

    // Holder for the frozen set built from the pattern "[:age=3.2:]"; it is
    // the filter set shared by the four *32ModeImpl holders below.
    private static final class Unicode32 {
        private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
    }

    // NFD wrapped in a FilteredNormalizer2 using the Unicode32 set.
    private static final class NFD32ModeImpl {
        private static final ModeImpl INSTANCE =
            new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
                                                 Unicode32.INSTANCE));
    }

    // NFKD wrapped in a FilteredNormalizer2 using the Unicode32 set.
    private static final class NFKD32ModeImpl {
        private static final ModeImpl INSTANCE =
            new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
                                                 Unicode32.INSTANCE));
    }

    // NFC wrapped in a FilteredNormalizer2 using the Unicode32 set.
    private static final class NFC32ModeImpl {
        private static final ModeImpl INSTANCE =
            new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
                                                 Unicode32.INSTANCE));
    }

    // NFKC wrapped in a FilteredNormalizer2 using the Unicode32 set.
    private static final class NFKC32ModeImpl {
        private static final ModeImpl INSTANCE =
            new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
                                                 Unicode32.INSTANCE));
    }
217
    /**
     * Options bit set value to select Unicode 3.2 normalization
     * (except NormalizationCorrections).
     * At most one Unicode version can be selected at a time.
     * @stable ICU 2.6
     */
    public static final int UNICODE_3_2=0x20;

    /**
     * Alias for {@link #UNICODE_3_2}; both constants carry the same bit value.
     */
    public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2;

    /**
     * Default option for the latest Unicode normalization. This option is
     * provided mainly for testing.
     * The value zero means that normalization is done with the fixes for
     * <ul>
     *   <li>Corrigendum 4 (Five CJK Canonical Mapping Errors)
     *   <li>Corrigendum 5 (Normalization Idempotency)
     * </ul>
     */
    public static final int UNICODE_LATEST = 0x00;

    /**
     * Constant indicating that the end of the iteration has been reached.
     * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
     * @stable ICU 2.8
     */
    public static final int DONE = UCharacterIterator.DONE;
243
    /**
     * Constants for normalization modes.
     * <p>
     * The Mode class is not intended for public subclassing.
     * Only the Mode constants provided by the Normalizer class should be used,
     * and any fields or methods should not be called or overridden by users.
     * @stable ICU 2.8
     */
    public abstract static class Mode {

        /**
         * Sole constructor
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected Mode() {
        }

        /**
         * Returns the {@code Normalizer2} that implements this mode for the
         * given option bits.
         *
         * @param options option bits; the implementations in this file only
         *                test the {@link #UNICODE_3_2} bit
         * @return the normalizer to use for this mode
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected abstract Normalizer2 getNormalizer2(int options);
    }
270
271    private static Mode toMode(Normalizer.Form form) {
272        switch (form) {
273        case NFC :
274            return NFC;
275        case NFD :
276            return NFD;
277        case NFKC :
278            return NFKC;
279        case NFKD :
280            return NFKD;
281        }
282
283        throw new IllegalArgumentException("Unexpected normalization form: " +
284                                           form);
285    }
286
287    private static final class NONEMode extends Mode {
288        protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
289    }
290
291    private static final class NFDMode extends Mode {
292        protected Normalizer2 getNormalizer2(int options) {
293            return (options&UNICODE_3_2) != 0 ?
294                    NFD32ModeImpl.INSTANCE.normalizer2 :
295                    NFDModeImpl.INSTANCE.normalizer2;
296        }
297    }
298
299    private static final class NFKDMode extends Mode {
300        protected Normalizer2 getNormalizer2(int options) {
301            return (options&UNICODE_3_2) != 0 ?
302                    NFKD32ModeImpl.INSTANCE.normalizer2 :
303                    NFKDModeImpl.INSTANCE.normalizer2;
304        }
305    }
306
307    private static final class NFCMode extends Mode {
308        protected Normalizer2 getNormalizer2(int options) {
309            return (options&UNICODE_3_2) != 0 ?
310                    NFC32ModeImpl.INSTANCE.normalizer2 :
311                    NFCModeImpl.INSTANCE.normalizer2;
312        }
313    }
314
315    private static final class NFKCMode extends Mode {
316        protected Normalizer2 getNormalizer2(int options) {
317            return (options&UNICODE_3_2) != 0 ?
318                    NFKC32ModeImpl.INSTANCE.normalizer2 :
319                    NFKCModeImpl.INSTANCE.normalizer2;
320        }
321    }
322
    /**
     * No decomposition/composition.
     * @stable ICU 2.8
     */
    public static final Mode NONE = new NONEMode();

    /**
     * Canonical decomposition.
     * @stable ICU 2.8
     */
    public static final Mode NFD = new NFDMode();

    /**
     * Compatibility decomposition.
     * @stable ICU 2.8
     */
    public static final Mode NFKD = new NFKDMode();

    /**
     * Canonical decomposition followed by canonical composition.
     * @stable ICU 2.8
     */
    public static final Mode NFC = new NFCMode();

    /**
     * Compatibility decomposition followed by canonical composition.
     */
    public static final Mode NFKC =new NFKCMode();
348
349    //-------------------------------------------------------------------------
350    // Iterator constructors
351    //-------------------------------------------------------------------------
352
353    /**
354     * Creates a new {@code NormalizerBase} object for iterating over the
355     * normalized form of a given string.
356     * <p>
357     * The {@code options} parameter specifies which optional
358     * {@code NormalizerBase} features are to be enabled for this object.
359     * <p>
360     * @param str  The string to be normalized.  The normalization
361     *              will start at the beginning of the string.
362     *
363     * @param mode The normalization mode.
364     *
365     * @param opt Any optional features to be enabled.
366     *            Currently the only available option is {@link #UNICODE_3_2}.
367     *            If you want the default behavior corresponding to one of the
368     *            standard Unicode Normalization Forms, use 0 for this argument.
369     * @stable ICU 2.6
370     */
371    public NormalizerBase(String str, Mode mode, int opt) {
372        this.text = UCharacterIterator.getInstance(str);
373        this.mode = mode;
374        this.options=opt;
375        norm2 = mode.getNormalizer2(opt);
376        buffer = new StringBuilder();
377    }
378
    /**
     * Creates a new {@code NormalizerBase} object for iterating over the
     * normalized form of a given string, with no options enabled
     * (option value 0).
     *
     * @param str  The string to be normalized.
     * @param mode The normalization mode.
     */
    public NormalizerBase(String str, Mode mode) {
       this(str, mode, 0);
    }
382
383
384    /**
385     * Creates a new {@code NormalizerBase} object for iterating over the
386     * normalized form of the given text.
387     * <p>
388     * @param iter  The input text to be normalized.  The normalization
389     *              will start at the beginning of the string.
390     *
391     * @param mode  The normalization mode.
392     *
393     * @param opt Any optional features to be enabled.
394     *            Currently the only available option is {@link #UNICODE_3_2}.
395     *            If you want the default behavior corresponding to one of the
396     *            standard Unicode Normalization Forms, use 0 for this argument.
397     * @stable ICU 2.6
398     */
399    public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
400        this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
401        this.mode = mode;
402        this.options = opt;
403        norm2 = mode.getNormalizer2(opt);
404        buffer = new StringBuilder();
405    }
406
    /**
     * Creates a new {@code NormalizerBase} object for iterating over the
     * normalized form of the given text, with no options enabled
     * (option value 0).
     *
     * @param iter  The input text to be normalized.
     * @param mode  The normalization mode.
     */
    public NormalizerBase(CharacterIterator iter, Mode mode) {
       this(iter, mode, 0);
    }
410
411    /**
412     * Clones this {@code NormalizerBase} object.  All properties of this
413     * object are duplicated in the new object, including the cloning of any
414     * {@link CharacterIterator} that was passed in to the constructor
415     * or to {@link #setText(CharacterIterator) setText}.
416     * However, the text storage underlying
417     * the {@code CharacterIterator} is not duplicated unless the
418     * iterator's {@code clone} method does so.
419     * @stable ICU 2.8
420     */
421    public Object clone() {
422        try {
423            NormalizerBase copy = (NormalizerBase) super.clone();
424            copy.text = (UCharacterIterator) text.clone();
425            copy.mode = mode;
426            copy.options = options;
427            copy.norm2 = norm2;
428            copy.buffer = new StringBuilder(buffer);
429            copy.bufferPos = bufferPos;
430            copy.currentIndex = currentIndex;
431            copy.nextIndex = nextIndex;
432            return copy;
433        }
434        catch (CloneNotSupportedException e) {
435            throw new InternalError(e.toString(), e);
436        }
437    }
438
439    /**
440     * Normalizes a {@code String} using the given normalization operation.
441     * <p>
442     * The {@code options} parameter specifies which optional
443     * {@code NormalizerBase} features are to be enabled for this operation.
444     * Currently the only available option is {@link #UNICODE_3_2}.
445     * If you want the default behavior corresponding to one of the standard
446     * Unicode Normalization Forms, use 0 for this argument.
447     * <p>
448     * @param str       the input string to be normalized.
449     * @param mode      the normalization mode
450     * @param options   the optional features to be enabled.
451     * @return String   the normalized string
452     * @stable ICU 2.6
453     */
454    public static String normalize(String str, Mode mode, int options) {
455        return mode.getNormalizer2(options).normalize(str);
456    }
457
458    public static String normalize(String str, Normalizer.Form form) {
459        return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST);
460    }
461
462    public static String normalize(String str, Normalizer.Form form, int options) {
463        return NormalizerBase.normalize(str, toMode(form), options);
464    }
465
466    /**
467     * Test if a string is in a given normalization form.
468     * This is semantically equivalent to source.equals(normalize(source, mode)).
469     *
470     * Unlike quickCheck(), this function returns a definitive result,
471     * never a "maybe".
472     * For NFD, NFKD, and FCD, both functions work exactly the same.
473     * For NFC and NFKC where quickCheck may return "maybe", this function will
474     * perform further tests to arrive at a true/false result.
475     * @param str       the input string to be checked to see if it is
476     *                   normalized
477     * @param mode      the normalization mode
478     * @param options   Options for use with exclusion set and tailored Normalization
479     *                  The only option that is currently recognized is UNICODE_3_2
480     * @see #isNormalized
481     * @stable ICU 2.6
482     */
483    public static boolean isNormalized(String str, Mode mode, int options) {
484        return mode.getNormalizer2(options).isNormalized(str);
485    }
486
487    public static boolean isNormalized(String str, Normalizer.Form form) {
488        return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST);
489    }
490
491    public static boolean isNormalized(String str, Normalizer.Form form, int options) {
492        return NormalizerBase.isNormalized(str, toMode(form), options);
493    }
494
495    //-------------------------------------------------------------------------
496    // Iteration API
497    //-------------------------------------------------------------------------
498
499    /**
500     * Return the current character in the normalized text.
501     * @return The codepoint as an int
502     * @stable ICU 2.8
503     */
504    public int current() {
505        if(bufferPos<buffer.length() || nextNormalize()) {
506            return buffer.codePointAt(bufferPos);
507        } else {
508            return DONE;
509        }
510    }
511
512    /**
513     * Return the next character in the normalized text and advance
514     * the iteration position by one.  If the end
515     * of the text has already been reached, {@link #DONE} is returned.
516     * @return The codepoint as an int
517     * @stable ICU 2.8
518     */
519    public int next() {
520        if(bufferPos<buffer.length() ||  nextNormalize()) {
521            int c=buffer.codePointAt(bufferPos);
522            bufferPos+=Character.charCount(c);
523            return c;
524        } else {
525            return DONE;
526        }
527    }
528
529    /**
530     * Return the previous character in the normalized text and decrement
531     * the iteration position by one.  If the beginning
532     * of the text has already been reached, {@link #DONE} is returned.
533     * @return The codepoint as an int
534     * @stable ICU 2.8
535     */
536    public int previous() {
537        if(bufferPos>0 || previousNormalize()) {
538            int c=buffer.codePointBefore(bufferPos);
539            bufferPos-=Character.charCount(c);
540            return c;
541        } else {
542            return DONE;
543        }
544    }
545
546    /**
547     * Reset the index to the beginning of the text.
548     * This is equivalent to setIndexOnly(startIndex)).
549     * @stable ICU 2.8
550     */
551    public void reset() {
552        text.setIndex(0);
553        currentIndex=nextIndex=0;
554        clearBuffer();
555    }
556
557    /**
558     * Set the iteration position in the input text that is being normalized,
559     * without any immediate normalization.
560     * After setIndexOnly(), getIndex() will return the same index that is
561     * specified here.
562     *
563     * @param index the desired index in the input text.
564     * @stable ICU 2.8
565     */
566    public void setIndexOnly(int index) {
567        text.setIndex(index);  // validates index
568        currentIndex=nextIndex=index;
569        clearBuffer();
570    }
571
    /**
     * Set the iteration position in the input text that is being normalized
     * and return the first normalized character at that position.
     * <p>
     * <b>Note:</b> This method sets the position in the <em>input</em> text,
     * while {@link #next} and {@link #previous} iterate through characters
     * in the normalized <em>output</em>.  This means that there is not
     * necessarily a one-to-one correspondence between characters returned
     * by {@code next} and {@code previous} and the indices passed to and
     * returned from {@code setIndex} and {@link #getIndex}.
     * <p>
     * This method is marked as deprecated/obsolete in ICU as of ICU 3.2.
     *
     * @param index the desired index in the input text.
     *
     * @return   the first normalized character that is the result of iterating
     *            forward starting at the given index.
     *
     * @throws IllegalArgumentException if the given index is less than
     *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
     * @obsolete ICU 3.2
     */
     public int setIndex(int index) {
         // Reposition without normalizing, then let current() normalize the
         // segment that starts at the new position.
         setIndexOnly(index);
         return current();
     }
597
    /**
     * Retrieve the index of the start of the input text. This is the begin
     * index of the {@code CharacterIterator} or the start (i.e. 0) of the
     * {@code String} over which this {@code NormalizerBase} is iterating.
     * This implementation always returns 0.
     * @deprecated ICU 2.2. ICU names {@code startIndex()} as the replacement;
     *             this class does not define that method.
     * @return The start index of the input text
     */
    @Deprecated
    public int getBeginIndex() {
        return 0;
    }
610
    /**
     * Retrieve the index of the end of the input text.  This is the end index
     * of the {@code CharacterIterator} or the length of the {@code String}
     * over which this {@code NormalizerBase} is iterating
     * @deprecated ICU 2.2. Use endIndex() instead.
     * @return The end index of the input text
     * @see #endIndex
     */
    @Deprecated
    public int getEndIndex() {
        return endIndex();
    }
623
624    /**
625     * Retrieve the current iteration position in the input text that is
626     * being normalized.  This method is useful in applications such as
627     * searching, where you need to be able to determine the position in
628     * the input text that corresponds to a given normalized output character.
629     * <p>
630     * <b>Note:</b> This method sets the position in the <em>input</em>, while
631     * {@link #next} and {@link #previous} iterate through characters in the
632     * <em>output</em>.  This means that there is not necessarily a one-to-one
633     * correspondence between characters returned by {@code next} and
634     * {@code previous} and the indices passed to and returned from
635     * {@code setIndex} and {@link #getIndex}.
636     * @return The current iteration position
637     * @stable ICU 2.8
638     */
639    public int getIndex() {
640        if(bufferPos<buffer.length()) {
641            return currentIndex;
642        } else {
643            return nextIndex;
644        }
645    }
646
    /**
     * Retrieve the index of the end of the input text.  This is the end index
     * of the {@code CharacterIterator} or the length of the {@code String}
     * over which this {@code NormalizerBase} is iterating
     * @return The end index of the input text
     * @stable ICU 2.8
     */
    public int endIndex() {
        return text.getLength();
    }
657
658    //-------------------------------------------------------------------------
659    // Iterator attributes
660    //-------------------------------------------------------------------------
661    /**
662     * Set the normalization mode for this object.
663     * <p>
664     * <b>Note:</b>If the normalization mode is changed while iterating
665     * over a string, calls to {@link #next} and {@link #previous} may
666     * return previously buffers characters in the old normalization mode
667     * until the iteration is able to re-sync at the next base character.
668     * It is safest to call {@link #setText setText()}, {@link #first},
669     * {@link #last}, etc. after calling {@code setMode}.
670     * <p>
671     * @param newMode the new mode for this {@code NormalizerBase}.
672     * The supported modes are:
673     * <ul>
674     *  <li>{@link #NFC}    - Unicode canonical decompositiion
675     *                        followed by canonical composition.
676     *  <li>{@link #NFKC}   - Unicode compatibility decompositiion
677     *                        follwed by canonical composition.
678     *  <li>{@link #NFD}    - Unicode canonical decomposition
679     *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
680     *  <li>{@link #NONE}   - Do nothing but return characters
681     *                        from the underlying input text.
682     * </ul>
683     *
684     * @see #getMode
685     * @stable ICU 2.8
686     */
687    public void setMode(Mode newMode) {
688        mode = newMode;
689        norm2 = mode.getNormalizer2(options);
690    }
691
    /**
     * Return the basic operation performed by this {@code NormalizerBase}
     *
     * @return the mode given to the constructor or to the most recent
     *         {@link #setMode} call
     * @see #setMode
     * @stable ICU 2.8
     */
    public Mode getMode() {
        return mode;
    }
701
702    /**
703     * Set the input text over which this {@code NormalizerBase} will iterate.
704     * The iteration position is set to the beginning of the input text.
705     * @param newText   The new string to be normalized.
706     * @stable ICU 2.8
707     */
708    public void setText(String newText) {
709        UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
710        if (newIter == null) {
711            throw new IllegalStateException("Could not create a new UCharacterIterator");
712        }
713        text = newIter;
714        reset();
715    }
716
717    /**
718     * Set the input text over which this {@code NormalizerBase} will iterate.
719     * The iteration position is set to the beginning of the input text.
720     * @param newText   The new string to be normalized.
721     * @stable ICU 2.8
722     */
723    public void setText(CharacterIterator newText) {
724        UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
725        if (newIter == null) {
726            throw new IllegalStateException("Could not create a new UCharacterIterator");
727        }
728        text = newIter;
729        currentIndex=nextIndex=0;
730        clearBuffer();
731    }
732
    // Empties the normalization buffer and moves the buffer position back
    // to its start.
    private void clearBuffer() {
        buffer.setLength(0);
        bufferPos=0;
    }
737
738    private boolean nextNormalize() {
739        clearBuffer();
740        currentIndex=nextIndex;
741        text.setIndex(nextIndex);
742        // Skip at least one character so we make progress.
743        int c=text.nextCodePoint();
744        if(c<0) {
745            return false;
746        }
747        StringBuilder segment=new StringBuilder().appendCodePoint(c);
748        while((c=text.nextCodePoint())>=0) {
749            if(norm2.hasBoundaryBefore(c)) {
750                text.moveCodePointIndex(-1);
751                break;
752            }
753            segment.appendCodePoint(c);
754        }
755        nextIndex=text.getIndex();
756        norm2.normalize(segment, buffer);
757        return buffer.length()!=0;
758    }
759
760    private boolean previousNormalize() {
761        clearBuffer();
762        nextIndex=currentIndex;
763        text.setIndex(currentIndex);
764        StringBuilder segment=new StringBuilder();
765        int c;
766        while((c=text.previousCodePoint())>=0) {
767            if(c<=0xffff) {
768                segment.insert(0, (char)c);
769            } else {
770                segment.insert(0, Character.toChars(c));
771            }
772            if(norm2.hasBoundaryBefore(c)) {
773                break;
774            }
775        }
776        currentIndex=text.getIndex();
777        norm2.normalize(segment, buffer);
778        bufferPos=buffer.length();
779        return buffer.length()!=0;
780    }
781
782}
783