1/*
2 * Copyright (c) 1994, 2004, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package java.util;
27
28import java.lang.*;
29
30/**
31 * The string tokenizer class allows an application to break a
32 * string into tokens. The tokenization method is much simpler than
33 * the one used by the {@code StreamTokenizer} class. The
34 * {@code StringTokenizer} methods do not distinguish among
35 * identifiers, numbers, and quoted strings, nor do they recognize
36 * and skip comments.
37 * <p>
38 * The set of delimiters (the characters that separate tokens) may
39 * be specified either at creation time or on a per-token basis.
40 * <p>
41 * An instance of {@code StringTokenizer} behaves in one of two
42 * ways, depending on whether it was created with the
43 * {@code returnDelims} flag having the value {@code true}
44 * or {@code false}:
45 * <ul>
46 * <li>If the flag is {@code false}, delimiter characters serve to
47 *     separate tokens. A token is a maximal sequence of consecutive
48 *     characters that are not delimiters.
49 * <li>If the flag is {@code true}, delimiter characters are themselves
50 *     considered to be tokens. A token is thus either one delimiter
51 *     character, or a maximal sequence of consecutive characters that are
52 *     not delimiters.
53 * </ul><p>
54 * A {@code StringTokenizer} object internally maintains a current
55 * position within the string to be tokenized. Some operations advance this
56 * current position past the characters processed.<p>
57 * A token is returned by taking a substring of the string that was used to
58 * create the {@code StringTokenizer} object.
59 * <p>
60 * The following is one example of the use of the tokenizer. The code:
61 * <blockquote><pre>
62 *     StringTokenizer st = new StringTokenizer("this is a test");
63 *     while (st.hasMoreTokens()) {
64 *         System.out.println(st.nextToken());
65 *     }
66 * </pre></blockquote>
67 * <p>
68 * prints the following output:
69 * <blockquote><pre>
70 *     this
71 *     is
72 *     a
73 *     test
74 * </pre></blockquote>
75 *
76 * <p>
77 * {@code StringTokenizer} is a legacy class that is retained for
78 * compatibility reasons although its use is discouraged in new code. It is
79 * recommended that anyone seeking this functionality use the {@code split}
80 * method of {@code String} or the java.util.regex package instead.
81 * <p>
82 * The following example illustrates how the {@code String.split}
83 * method can be used to break up a string into its basic tokens:
84 * <blockquote><pre>
85 *     String[] result = "this is a test".split("\\s");
86 *     for (int x=0; x&lt;result.length; x++)
87 *         System.out.println(result[x]);
88 * </pre></blockquote>
89 * <p>
90 * prints the following output:
91 * <blockquote><pre>
92 *     this
93 *     is
94 *     a
95 *     test
96 * </pre></blockquote>
97 *
98 * @author  unascribed
99 * @see     java.io.StreamTokenizer
100 * @since   1.0
101 */
public class StringTokenizer implements Enumeration<Object> {
    /** Index in {@code str} of the next character that has not been consumed yet. */
    private int currentPosition;

    /**
     * Start of the next token as computed by the most recent
     * {@code hasMoreTokens()} call, or -1 when no such cached value exists.
     */
    private int newPosition;

    /** Length of {@code str}; scanning never goes past this index. */
    private final int maxPosition;

    /** The string being tokenized. */
    private final String str;

    /** The current delimiter set; may be replaced by {@code nextToken(String)}. */
    private String delimiters;

    /** Whether delimiters are themselves returned as tokens. */
    private final boolean retDelims;

    /**
     * Set by {@code nextToken(String)} so that the following
     * {@code nextToken()} ignores a position cached under the old delimiters.
     */
    private boolean delimsChanged;

    /**
     * maxDelimCodePoint stores the value of the delimiter character with the
     * highest value. It is used to optimize the detection of delimiter
     * characters.
     *
     * It is unlikely to provide any optimization benefit in the
     * hasSurrogates case because most string characters will be
     * smaller than the limit, but we keep it so that the two code
     * paths remain similar.
     */
    private int maxDelimCodePoint;

    /**
     * If delimiters include any surrogates (including surrogate
     * pairs), hasSurrogates is true and the tokenizer uses the
     * different code path. This is because String.indexOf(int)
     * doesn't handle unpaired surrogates as a single character.
     */
    private boolean hasSurrogates = false;

    /**
     * When hasSurrogates is true, delimiters are converted to code
     * points and isDelimiter(int) is used to determine if the given
     * codepoint is a delimiter. It is {@code null} otherwise.
     */
    private int[] delimiterCodePoints;

    /**
     * Recomputes everything that is derived from {@code delimiters}:
     * {@code maxDelimCodePoint} (the highest code point in the delimiter
     * set), {@code hasSurrogates} and {@code delimiterCodePoints}.
     * <p>
     * The derived state is always rebuilt from scratch, so nothing from a
     * previous delimiter set survives a call to {@code nextToken(String)}.
     */
    private void setMaxDelimCodePoint() {
        // Drop state derived from the previous delimiter set first; otherwise
        // a tokenizer that once saw a surrogate delimiter would stay on the
        // slower code point path (and keep a stale array) forever.
        hasSurrogates = false;
        delimiterCodePoints = null;

        if (delimiters == null) {
            maxDelimCodePoint = 0;
            return;
        }

        final int length = delimiters.length();
        int max = 0;
        int count = 0;
        boolean surrogatesSeen = false;
        for (int i = 0; i < length; ) {
            int c = delimiters.charAt(i);
            if (c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE) {
                // Paired surrogates collapse into one supplementary code
                // point; an unpaired surrogate is returned unchanged.
                c = delimiters.codePointAt(i);
                surrogatesSeen = true;
            }
            if (max < c) {
                max = c;
            }
            count++;
            i += Character.charCount(c);
        }
        maxDelimCodePoint = max;

        if (surrogatesSeen) {
            // Only the surrogate path needs the code point array, so it is
            // built lazily in a second pass to keep the common case cheap.
            int[] codePoints = new int[count];
            for (int i = 0, j = 0; i < count; i++) {
                int c = delimiters.codePointAt(j);
                codePoints[i] = c;
                j += Character.charCount(c);
            }
            delimiterCodePoints = codePoints;
            hasSurrogates = true;
        }
    }

    /**
     * Constructs a string tokenizer for the specified string. All
     * characters in the {@code delim} argument are the delimiters
     * for separating tokens.
     * <p>
     * If the {@code returnDelims} flag is {@code true}, then
     * the delimiter characters are also returned as tokens. Each
     * delimiter is returned as a string of length one. If the flag is
     * {@code false}, the delimiter characters are skipped and only
     * serve as separators between tokens.
     * <p>
     * Note that if {@code delim} is {@code null}, this constructor does
     * not throw an exception. However, trying to invoke other methods on the
     * resulting {@code StringTokenizer} may result in a
     * {@code NullPointerException}.
     *
     * @param   str            a string to be parsed.
     * @param   delim          the delimiters.
     * @param   returnDelims   flag indicating whether to return the delimiters
     *                         as tokens.
     * @exception NullPointerException if str is {@code null}
     */
    public StringTokenizer(String str, String delim, boolean returnDelims) {
        currentPosition = 0;
        newPosition = -1;
        delimsChanged = false;
        this.str = str;
        maxPosition = str.length();   // throws NullPointerException for a null str
        delimiters = delim;
        retDelims = returnDelims;
        setMaxDelimCodePoint();
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * characters in the {@code delim} argument are the delimiters
     * for separating tokens. Delimiter characters themselves will not
     * be treated as tokens.
     * <p>
     * Note that if {@code delim} is {@code null}, this constructor does
     * not throw an exception. However, trying to invoke other methods on the
     * resulting {@code StringTokenizer} may result in a
     * {@code NullPointerException}.
     *
     * @param   str     a string to be parsed.
     * @param   delim   the delimiters.
     * @exception NullPointerException if str is {@code null}
     */
    public StringTokenizer(String str, String delim) {
        this(str, delim, false);
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * tokenizer uses the default delimiter set, which is
     * <code>"&nbsp;&#92;t&#92;n&#92;r&#92;f"</code>: the space character,
     * the tab character, the newline character, the carriage-return character,
     * and the form-feed character. Delimiter characters themselves will
     * not be treated as tokens.
     *
     * @param   str   a string to be parsed.
     * @exception NullPointerException if str is {@code null}
     */
    public StringTokenizer(String str) {
        this(str, " \t\n\r\f", false);
    }

    /**
     * Skips delimiters starting from the specified position. If retDelims
     * is false, returns the index of the first non-delimiter character at or
     * after startPos. If retDelims is true, startPos is returned.
     *
     * @exception NullPointerException if the current delimiter set is
     *            {@code null}
     */
    private int skipDelimiters(int startPos) {
        if (delimiters == null) {
            throw new NullPointerException();
        }

        int position = startPos;
        if (retDelims) {
            // Delimiters are tokens in this mode, so nothing may be skipped.
            return position;
        }
        while (position < maxPosition) {
            if (!hasSurrogates) {
                char c = str.charAt(position);
                // The max check is a cheap filter before the indexOf scan.
                if ((c > maxDelimCodePoint) || (delimiters.indexOf(c) < 0)) {
                    break;
                }
                position++;
            } else {
                int c = str.codePointAt(position);
                if ((c > maxDelimCodePoint) || !isDelimiter(c)) {
                    break;
                }
                position += Character.charCount(c);
            }
        }
        return position;
    }

    /**
     * Skips ahead from startPos and returns the index of the next delimiter
     * character encountered, or maxPosition if no such delimiter is found.
     * <p>
     * If retDelims is true and the character at startPos is itself a
     * delimiter, the index just past that single delimiter is returned so
     * that the delimiter becomes a token of its own. Callers must pass a
     * startPos smaller than maxPosition.
     */
    private int scanToken(int startPos) {
        int position = startPos;
        while (position < maxPosition) {
            if (!hasSurrogates) {
                char c = str.charAt(position);
                if ((c <= maxDelimCodePoint) && (delimiters.indexOf(c) >= 0)) {
                    break;
                }
                position++;
            } else {
                int c = str.codePointAt(position);
                if ((c <= maxDelimCodePoint) && isDelimiter(c)) {
                    break;
                }
                position += Character.charCount(c);
            }
        }
        if (retDelims && (startPos == position)) {
            // No ordinary characters were consumed, so the scan stopped on a
            // delimiter right away: step over exactly that one delimiter.
            if (!hasSurrogates) {
                char c = str.charAt(position);
                if ((c <= maxDelimCodePoint) && (delimiters.indexOf(c) >= 0)) {
                    position++;
                }
            } else {
                int c = str.codePointAt(position);
                if ((c <= maxDelimCodePoint) && isDelimiter(c)) {
                    position += Character.charCount(c);
                }
            }
        }
        return position;
    }

    /**
     * Tells whether the given code point is in the delimiter set. Only used
     * on the surrogate code path, where {@code delimiterCodePoints} is set.
     */
    private boolean isDelimiter(int codePoint) {
        for (int delimiterCodePoint : delimiterCodePoints) {
            if (delimiterCodePoint == codePoint) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests if there are more tokens available from this tokenizer's string.
     * If this method returns {@code true}, then a subsequent call to
     * {@code nextToken} with no argument will successfully return a token.
     *
     * @return  {@code true} if and only if there is at least one token
     *          in the string after the current position; {@code false}
     *          otherwise.
     */
    public boolean hasMoreTokens() {
        /*
         * Temporarily store this position and use it in the following
         * nextToken() method only if the delimiters haven't been changed in
         * that nextToken() invocation.
         */
        newPosition = skipDelimiters(currentPosition);
        return (newPosition < maxPosition);
    }

    /**
     * Returns the next token from this string tokenizer.
     *
     * @return     the next token from this string tokenizer.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     */
    public String nextToken() {
        /*
         * If the next position was already computed in hasMoreTokens() and
         * the delimiters have NOT changed between that computation and this
         * invocation, reuse the computed value; otherwise skip delimiters
         * again using the current delimiter set.
         */
        currentPosition = (newPosition >= 0 && !delimsChanged)
                ? newPosition
                : skipDelimiters(currentPosition);

        /* Reset these anyway */
        delimsChanged = false;
        newPosition = -1;

        if (currentPosition >= maxPosition) {
            throw new NoSuchElementException();
        }
        int start = currentPosition;
        currentPosition = scanToken(currentPosition);
        return str.substring(start, currentPosition);
    }

    /**
     * Returns the next token in this string tokenizer's string. First,
     * the set of characters considered to be delimiters by this
     * {@code StringTokenizer} object is changed to be the characters in
     * the string {@code delim}. Then the next token in the string
     * after the current position is returned. The current position is
     * advanced beyond the recognized token.  The new delimiter set
     * remains the default after this call.
     * <p>
     * If {@code delim} is {@code null} the call fails before anything is
     * modified, so the tokenizer keeps its previous delimiter set and
     * position and remains usable.
     *
     * @param      delim   the new delimiters.
     * @return     the next token, after switching to the new delimiter set.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @exception NullPointerException if delim is {@code null}
     */
    public String nextToken(String delim) {
        // Validate before touching any state: a rejected argument must not
        // leave the tokenizer with a null delimiter set.
        if (delim == null) {
            throw new NullPointerException("delim");
        }

        delimiters = delim;

        /* delimiter string specified, so set the appropriate flag. */
        delimsChanged = true;

        setMaxDelimCodePoint();
        return nextToken();
    }

    /**
     * Returns the same value as the {@code hasMoreTokens}
     * method. It exists so that this class can implement the
     * {@code Enumeration} interface.
     *
     * @return  {@code true} if there are more tokens;
     *          {@code false} otherwise.
     * @see     java.util.Enumeration
     * @see     java.util.StringTokenizer#hasMoreTokens()
     */
    @Override
    public boolean hasMoreElements() {
        return hasMoreTokens();
    }

    /**
     * Returns the same value as the {@code nextToken} method,
     * except that its declared return value is {@code Object} rather than
     * {@code String}. It exists so that this class can implement the
     * {@code Enumeration} interface.
     *
     * @return     the next token in the string.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @see        java.util.Enumeration
     * @see        java.util.StringTokenizer#nextToken()
     */
    @Override
    public Object nextElement() {
        return nextToken();
    }

    /**
     * Calculates the number of times that this tokenizer's
     * {@code nextToken} method can be called before it generates an
     * exception. The current position is not advanced.
     *
     * @return  the number of tokens remaining in the string using the current
     *          delimiter set.
     * @see     java.util.StringTokenizer#nextToken()
     */
    public int countTokens() {
        int count = 0;
        // Walk a private cursor so the tokenizer's own position is untouched.
        int currpos = currentPosition;
        while (currpos < maxPosition) {
            currpos = skipDelimiters(currpos);
            if (currpos >= maxPosition) {
                break;
            }
            currpos = scanToken(currpos);
            count++;
        }
        return count;
    }
}
432