1/*
2********************************************************************
3*
4*   Copyright (C) 1997-2011, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7********************************************************************
8*/
9
10#ifndef CHARITER_H
11#define CHARITER_H
12
13#include "unicode/utypes.h"
14#include "unicode/uobject.h"
15#include "unicode/unistr.h"
16/**
17 * \file
18 * \brief C++ API: Character Iterator
19 */
20
21U_NAMESPACE_BEGIN
22/**
23 * Abstract class that defines an API for forward-only iteration
24 * on text objects.
25 * This is a minimal interface for iteration without random access
26 * or backwards iteration. It is especially useful for wrapping
27 * streams with converters into an object for collation or
28 * normalization.
29 *
30 * <p>Characters can be accessed in two ways: as code units or as
31 * code points.
32 * Unicode code points are 21-bit integers and are the scalar values
33 * of Unicode characters. ICU uses the type UChar32 for them.
34 * Unicode code units are the storage units of a given
35 * Unicode/UCS Transformation Format (a character encoding scheme).
36 * With UTF-16, all code points can be represented with either one
37 * or two code units ("surrogates").
38 * String storage is typically based on code units, while properties
39 * of characters are typically determined using code point values.
40 * Some processes may be designed to work with sequences of code units,
41 * or it may be known that all characters that are important to an
42 * algorithm can be represented with single code units.
43 * Other processes will need to use the code point access functions.</p>
44 *
45 * <p>ForwardCharacterIterator provides nextPostInc() to access
46 * a code unit and advance an internal position into the text object,
47 * similar to a <code>return text[position++]</code>.<br>
48 * It provides next32PostInc() to access a code point and advance an internal
49 * position.</p>
50 *
51 * <p>next32PostInc() assumes that the current position is that of
52 * the beginning of a code point, i.e., of its first code unit.
53 * After next32PostInc(), this will be true again.
54 * In general, access to code units and code points in the same
55 * iteration loop should not be mixed. In UTF-16, if the current position
56 * is on a second code unit (Low Surrogate), then only that code unit
57 * is returned even by next32PostInc().</p>
58 *
59 * <p>For iteration with either function, there are two ways to
60 * check for the end of the iteration. When there are no more
61 * characters in the text object:
62 * <ul>
63 * <li>The hasNext() function returns FALSE.</li>
64 * <li>nextPostInc() and next32PostInc() return DONE
65 *     when one attempts to read beyond the end of the text object.</li>
66 * </ul>
67 *
68 * Example:
69 * \code
70 * void function1(ForwardCharacterIterator &it) {
71 *     UChar32 c;
72 *     while(it.hasNext()) {
73 *         c=it.next32PostInc();
74 *         // use c
75 *     }
76 * }
77 *
 * void function2(ForwardCharacterIterator &it) {
79 *     UChar c;
80 *     while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
81 *         // use c
82 *      }
83 *  }
84 * \endcode
85 * </p>
86 *
87 * @stable ICU 2.0
88 */
89class U_COMMON_API ForwardCharacterIterator : public UObject {
90public:
91    /**
92     * Value returned by most of ForwardCharacterIterator's functions
93     * when the iterator has reached the limits of its iteration.
94     * @stable ICU 2.0
95     */
96    enum { DONE = 0xffff };
97
98    /**
99     * Destructor.
100     * @stable ICU 2.0
101     */
102    virtual ~ForwardCharacterIterator();
103
104    /**
105     * Returns true when both iterators refer to the same
106     * character in the same character-storage object.
107     * @param that The ForwardCharacterIterator to be compared for equality
108     * @return true when both iterators refer to the same
109     * character in the same character-storage object
110     * @stable ICU 2.0
111     */
112    virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;
113
114    /**
115     * Returns true when the iterators refer to different
116     * text-storage objects, or to different characters in the
117     * same text-storage object.
118     * @param that The ForwardCharacterIterator to be compared for inequality
119     * @return true when the iterators refer to different
120     * text-storage objects, or to different characters in the
121     * same text-storage object
122     * @stable ICU 2.0
123     */
124    inline UBool operator!=(const ForwardCharacterIterator& that) const;
125
126    /**
127     * Generates a hash code for this iterator.
128     * @return the hash code.
129     * @stable ICU 2.0
130     */
131    virtual int32_t hashCode(void) const = 0;
132
133    /**
134     * Returns a UClassID for this ForwardCharacterIterator ("poor man's
135     * RTTI").<P> Despite the fact that this function is public,
136     * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
137     * @return a UClassID for this ForwardCharacterIterator
138     * @stable ICU 2.0
139     */
140    virtual UClassID getDynamicClassID(void) const = 0;
141
142    /**
143     * Gets the current code unit for returning and advances to the next code unit
144     * in the iteration range
145     * (toward endIndex()).  If there are
146     * no more code units to return, returns DONE.
147     * @return the current code unit.
148     * @stable ICU 2.0
149     */
150    virtual UChar         nextPostInc(void) = 0;
151
152    /**
153     * Gets the current code point for returning and advances to the next code point
154     * in the iteration range
155     * (toward endIndex()).  If there are
156     * no more code points to return, returns DONE.
157     * @return the current code point.
158     * @stable ICU 2.0
159     */
160    virtual UChar32       next32PostInc(void) = 0;
161
162    /**
163     * Returns FALSE if there are no more code units or code points
164     * at or after the current position in the iteration range.
165     * This is used with nextPostInc() or next32PostInc() in forward
166     * iteration.
167     * @returns FALSE if there are no more code units or code points
168     * at or after the current position in the iteration range.
169     * @stable ICU 2.0
170     */
171    virtual UBool        hasNext() = 0;
172
173protected:
174    /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
175    ForwardCharacterIterator();
176
177    /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
178    ForwardCharacterIterator(const ForwardCharacterIterator &other);
179
180    /**
181     * Assignment operator to be overridden in the implementing class.
182     * @stable ICU 2.0
183     */
184    ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
185};
186
187/**
188 * Abstract class that defines an API for iteration
189 * on text objects.
190 * This is an interface for forward and backward iteration
191 * and random access into a text object.
192 *
193 * <p>The API provides backward compatibility to the Java and older ICU
194 * CharacterIterator classes but extends them significantly:
195 * <ol>
196 * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
197 * <li>While the old API functions provided forward iteration with
198 *     "pre-increment" semantics, the new one also provides functions
199 *     with "post-increment" semantics. They are more efficient and should
200 *     be the preferred iterator functions for new implementations.
201 *     The backward iteration always had "pre-decrement" semantics, which
202 *     are efficient.</li>
203 * <li>Just like ForwardCharacterIterator, it provides access to
204 *     both code units and code points. Code point access versions are available
205 *     for the old and the new iteration semantics.</li>
206 * <li>There are new functions for setting and moving the current position
207 *     without returning a character, for efficiency.</li>
208 * </ol>
209 *
210 * See ForwardCharacterIterator for examples for using the new forward iteration
211 * functions. For backward iteration, there is also a hasPrevious() function
212 * that can be used analogously to hasNext().
213 * The old functions work as before and are shown below.</p>
214 *
215 * <p>Examples for some of the new functions:</p>
216 *
217 * Forward iteration with hasNext():
218 * \code
219 * void forward1(CharacterIterator &it) {
220 *     UChar32 c;
221 *     for(it.setToStart(); it.hasNext();) {
222 *         c=it.next32PostInc();
223 *         // use c
224 *     }
225 *  }
226 * \endcode
227 * Forward iteration more similar to loops with the old forward iteration,
228 * showing a way to convert simple for() loops:
229 * \code
230 * void forward2(CharacterIterator &it) {
231 *     UChar c;
232 *     for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
233 *          // use c
234 *      }
235 * }
236 * \endcode
237 * Backward iteration with setToEnd() and hasPrevious():
238 * \code
239 *  void backward1(CharacterIterator &it) {
240 *      UChar32 c;
241 *      for(it.setToEnd(); it.hasPrevious();) {
242 *         c=it.previous32();
243 *          // use c
244 *      }
245 *  }
246 * \endcode
247 * Backward iteration with a more traditional for() loop:
248 * \code
249 * void backward2(CharacterIterator &it) {
250 *     UChar c;
251 *     for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
252 *         // use c
253 *      }
254 *  }
255 * \endcode
256 *
257 * Example for random access:
258 * \code
259 *  void random(CharacterIterator &it) {
260 *      // set to the third code point from the beginning
261 *      it.move32(3, CharacterIterator::kStart);
262 *      // get a code point from here without moving the position
263 *      UChar32 c=it.current32();
264 *      // get the position
265 *      int32_t pos=it.getIndex();
266 *      // get the previous code unit
267 *      UChar u=it.previous();
268 *      // move back one more code unit
269 *      it.move(-1, CharacterIterator::kCurrent);
270 *      // set the position back to where it was
271 *      // and read the same code point c and move beyond it
272 *      it.setIndex(pos);
273 *      if(c!=it.next32PostInc()) {
274 *          exit(1); // CharacterIterator inconsistent
275 *      }
276 *  }
277 * \endcode
278 *
279 * <p>Examples, especially for the old API:</p>
280 *
281 * Function processing characters, in this example simple output
282 * <pre>
283 * \code
284 *  void processChar( UChar c )
285 *  {
286 *      cout << " " << c;
287 *  }
288 * \endcode
289 * </pre>
290 * Traverse the text from start to finish
291 * <pre>
292 * \code
293 *  void traverseForward(CharacterIterator& iter)
294 *  {
 *      for(UChar c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
296 *          processChar(c);
297 *      }
298 *  }
299 * \endcode
300 * </pre>
301 * Traverse the text backwards, from end to start
302 * <pre>
303 * \code
304 *  void traverseBackward(CharacterIterator& iter)
305 *  {
 *      for(UChar c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
307 *          processChar(c);
308 *      }
309 *  }
310 * \endcode
311 * </pre>
 * Traverse both forward and backward from a given position in the text.
 * The calls to Unicode::isLetter() and Unicode::isDigit() in this example represent some additional stopping criteria.
314 * <pre>
315 * \code
316 * void traverseOut(CharacterIterator& iter, int32_t pos)
317 * {
318 *      UChar c;
319 *      for (c = iter.setIndex(pos);
 *          c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
321 *          c = iter.next()) {}
322 *      int32_t end = iter.getIndex();
323 *      for (c = iter.setIndex(pos);
 *          c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
325 *          c = iter.previous()) {}
326 *      int32_t start = iter.getIndex() + 1;
327 *
328 *      cout << "start: " << start << " end: " << end << endl;
329 *      for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
330 *          processChar(c);
331 *     }
332 *  }
333 * \endcode
334 * </pre>
335 * Creating a StringCharacterIterator and calling the test functions
336 * <pre>
337 * \code
338 *  void CharacterIterator_Example( void )
339 *   {
340 *       cout << endl << "===== CharacterIterator_Example: =====" << endl;
341 *       UnicodeString text("Ein kleiner Satz.");
342 *       StringCharacterIterator iterator(text);
343 *       cout << "----- traverseForward: -----------" << endl;
344 *       traverseForward( iterator );
345 *       cout << endl << endl << "----- traverseBackward: ----------" << endl;
346 *       traverseBackward( iterator );
347 *       cout << endl << endl << "----- traverseOut: ---------------" << endl;
348 *       traverseOut( iterator, 7 );
349 *       cout << endl << endl << "-----" << endl;
350 *   }
351 * \endcode
352 * </pre>
353 *
354 * @stable ICU 2.0
355 */
356class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
357public:
358    /**
359     * Origin enumeration for the move() and move32() functions.
360     * @stable ICU 2.0
361     */
362    enum EOrigin { kStart, kCurrent, kEnd };
363
364    /**
365     * Destructor.
366     * @stable ICU 2.0
367     */
368    virtual ~CharacterIterator();
369
370    /**
371     * Returns a pointer to a new CharacterIterator of the same
372     * concrete class as this one, and referring to the same
373     * character in the same text-storage object as this one.  The
374     * caller is responsible for deleting the new clone.
375     * @return a pointer to a new CharacterIterator
376     * @stable ICU 2.0
377     */
378    virtual CharacterIterator* clone(void) const = 0;
379
380    /**
381     * Sets the iterator to refer to the first code unit in its
382     * iteration range, and returns that code unit.
383     * This can be used to begin an iteration with next().
384     * @return the first code unit in its iteration range.
385     * @stable ICU 2.0
386     */
387    virtual UChar         first(void) = 0;
388
389    /**
390     * Sets the iterator to refer to the first code unit in its
391     * iteration range, returns that code unit, and moves the position
392     * to the second code unit. This is an alternative to setToStart()
393     * for forward iteration with nextPostInc().
394     * @return the first code unit in its iteration range.
395     * @stable ICU 2.0
396     */
397    virtual UChar         firstPostInc(void);
398
399    /**
400     * Sets the iterator to refer to the first code point in its
401     * iteration range, and returns that code unit,
402     * This can be used to begin an iteration with next32().
403     * Note that an iteration with next32PostInc(), beginning with,
404     * e.g., setToStart() or firstPostInc(), is more efficient.
405     * @return the first code point in its iteration range.
406     * @stable ICU 2.0
407     */
408    virtual UChar32       first32(void) = 0;
409
410    /**
411     * Sets the iterator to refer to the first code point in its
412     * iteration range, returns that code point, and moves the position
413     * to the second code point. This is an alternative to setToStart()
414     * for forward iteration with next32PostInc().
415     * @return the first code point in its iteration range.
416     * @stable ICU 2.0
417     */
418    virtual UChar32       first32PostInc(void);
419
420    /**
421     * Sets the iterator to refer to the first code unit or code point in its
422     * iteration range. This can be used to begin a forward
423     * iteration with nextPostInc() or next32PostInc().
424     * @return the start position of the iteration range
425     * @stable ICU 2.0
426     */
427    inline int32_t    setToStart();
428
429    /**
430     * Sets the iterator to refer to the last code unit in its
431     * iteration range, and returns that code unit.
432     * This can be used to begin an iteration with previous().
433     * @return the last code unit.
434     * @stable ICU 2.0
435     */
436    virtual UChar         last(void) = 0;
437
438    /**
439     * Sets the iterator to refer to the last code point in its
440     * iteration range, and returns that code unit.
441     * This can be used to begin an iteration with previous32().
442     * @return the last code point.
443     * @stable ICU 2.0
444     */
445    virtual UChar32       last32(void) = 0;
446
447    /**
448     * Sets the iterator to the end of its iteration range, just behind
449     * the last code unit or code point. This can be used to begin a backward
450     * iteration with previous() or previous32().
451     * @return the end position of the iteration range
452     * @stable ICU 2.0
453     */
454    inline int32_t    setToEnd();
455
456    /**
457     * Sets the iterator to refer to the "position"-th code unit
458     * in the text-storage object the iterator refers to, and
459     * returns that code unit.
460     * @param position the "position"-th code unit in the text-storage object
461     * @return the "position"-th code unit.
462     * @stable ICU 2.0
463     */
464    virtual UChar         setIndex(int32_t position) = 0;
465
466    /**
467     * Sets the iterator to refer to the beginning of the code point
468     * that contains the "position"-th code unit
469     * in the text-storage object the iterator refers to, and
470     * returns that code point.
471     * The current position is adjusted to the beginning of the code point
472     * (its first code unit).
473     * @param position the "position"-th code unit in the text-storage object
474     * @return the "position"-th code point.
475     * @stable ICU 2.0
476     */
477    virtual UChar32       setIndex32(int32_t position) = 0;
478
479    /**
480     * Returns the code unit the iterator currently refers to.
481     * @return the current code unit.
482     * @stable ICU 2.0
483     */
484    virtual UChar         current(void) const = 0;
485
486    /**
487     * Returns the code point the iterator currently refers to.
488     * @return the current code point.
489     * @stable ICU 2.0
490     */
491    virtual UChar32       current32(void) const = 0;
492
493    /**
494     * Advances to the next code unit in the iteration range
495     * (toward endIndex()), and returns that code unit.  If there are
496     * no more code units to return, returns DONE.
497     * @return the next code unit.
498     * @stable ICU 2.0
499     */
500    virtual UChar         next(void) = 0;
501
502    /**
503     * Advances to the next code point in the iteration range
504     * (toward endIndex()), and returns that code point.  If there are
505     * no more code points to return, returns DONE.
506     * Note that iteration with "pre-increment" semantics is less
507     * efficient than iteration with "post-increment" semantics
508     * that is provided by next32PostInc().
509     * @return the next code point.
510     * @stable ICU 2.0
511     */
512    virtual UChar32       next32(void) = 0;
513
514    /**
515     * Advances to the previous code unit in the iteration range
516     * (toward startIndex()), and returns that code unit.  If there are
517     * no more code units to return, returns DONE.
518     * @return the previous code unit.
519     * @stable ICU 2.0
520     */
521    virtual UChar         previous(void) = 0;
522
523    /**
524     * Advances to the previous code point in the iteration range
525     * (toward startIndex()), and returns that code point.  If there are
526     * no more code points to return, returns DONE.
527     * @return the previous code point.
528     * @stable ICU 2.0
529     */
530    virtual UChar32       previous32(void) = 0;
531
532    /**
533     * Returns FALSE if there are no more code units or code points
534     * before the current position in the iteration range.
535     * This is used with previous() or previous32() in backward
536     * iteration.
537     * @return FALSE if there are no more code units or code points
538     * before the current position in the iteration range, return TRUE otherwise.
539     * @stable ICU 2.0
540     */
541    virtual UBool        hasPrevious() = 0;
542
543    /**
544     * Returns the numeric index in the underlying text-storage
545     * object of the character returned by first().  Since it's
546     * possible to create an iterator that iterates across only
547     * part of a text-storage object, this number isn't
548     * necessarily 0.
549     * @returns the numeric index in the underlying text-storage
550     * object of the character returned by first().
551     * @stable ICU 2.0
552     */
553    inline int32_t       startIndex(void) const;
554
555    /**
556     * Returns the numeric index in the underlying text-storage
557     * object of the position immediately BEYOND the character
558     * returned by last().
559     * @return the numeric index in the underlying text-storage
560     * object of the position immediately BEYOND the character
561     * returned by last().
562     * @stable ICU 2.0
563     */
564    inline int32_t       endIndex(void) const;
565
566    /**
567     * Returns the numeric index in the underlying text-storage
568     * object of the character the iterator currently refers to
569     * (i.e., the character returned by current()).
570     * @return the numberic index in the text-storage object of
571     * the character the iterator currently refers to
572     * @stable ICU 2.0
573     */
574    inline int32_t       getIndex(void) const;
575
576    /**
577     * Returns the length of the entire text in the underlying
578     * text-storage object.
579     * @return the length of the entire text in the text-storage object
580     * @stable ICU 2.0
581     */
582    inline int32_t           getLength() const;
583
584    /**
585     * Moves the current position relative to the start or end of the
586     * iteration range, or relative to the current position itself.
587     * The movement is expressed in numbers of code units forward
588     * or backward by specifying a positive or negative delta.
589     * @param delta the position relative to origin. A positive delta means forward;
590     * a negative delta means backward.
591     * @param origin Origin enumeration {kStart, kCurrent, kEnd}
592     * @return the new position
593     * @stable ICU 2.0
594     */
595    virtual int32_t      move(int32_t delta, EOrigin origin) = 0;
596
597    /**
598     * Moves the current position relative to the start or end of the
599     * iteration range, or relative to the current position itself.
600     * The movement is expressed in numbers of code points forward
601     * or backward by specifying a positive or negative delta.
602     * @param delta the position relative to origin. A positive delta means forward;
603     * a negative delta means backward.
604     * @param origin Origin enumeration {kStart, kCurrent, kEnd}
605     * @return the new position
606     * @stable ICU 2.0
607     */
608    virtual int32_t      move32(int32_t delta, EOrigin origin) = 0;
609
610    /**
611     * Copies the text under iteration into the UnicodeString
612     * referred to by "result".
613     * @param result Receives a copy of the text under iteration.
614     * @stable ICU 2.0
615     */
616    virtual void            getText(UnicodeString&  result) = 0;
617
618protected:
619    /**
620     * Empty constructor.
621     * @stable ICU 2.0
622     */
623    CharacterIterator();
624
625    /**
626     * Constructor, just setting the length field in this base class.
627     * @stable ICU 2.0
628     */
629    CharacterIterator(int32_t length);
630
631    /**
632     * Constructor, just setting the length and position fields in this base class.
633     * @stable ICU 2.0
634     */
635    CharacterIterator(int32_t length, int32_t position);
636
637    /**
638     * Constructor, just setting the length, start, end, and position fields in this base class.
639     * @stable ICU 2.0
640     */
641    CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
642
643    /**
644     * Copy constructor.
645     *
646     * @param that The CharacterIterator to be copied
647     * @stable ICU 2.0
648     */
649    CharacterIterator(const CharacterIterator &that);
650
651    /**
652     * Assignment operator.  Sets this CharacterIterator to have the same behavior,
653     * as the one passed in.
654     * @param that The CharacterIterator passed in.
655     * @return the newly set CharacterIterator.
656     * @stable ICU 2.0
657     */
658    CharacterIterator &operator=(const CharacterIterator &that);
659
660    /**
661     * Base class text length field.
662     * Necessary this for correct getText() and hashCode().
663     * @stable ICU 2.0
664     */
665    int32_t textLength;
666
667    /**
668     * Base class field for the current position.
669     * @stable ICU 2.0
670     */
671    int32_t  pos;
672
673    /**
674     * Base class field for the start of the iteration range.
675     * @stable ICU 2.0
676     */
677    int32_t  begin;
678
679    /**
680     * Base class field for the end of the iteration range.
681     * @stable ICU 2.0
682     */
683    int32_t  end;
684};
685
686inline UBool
687ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
688    return !operator==(that);
689}
690
691inline int32_t
692CharacterIterator::setToStart() {
693    return move(0, kStart);
694}
695
696inline int32_t
697CharacterIterator::setToEnd() {
698    return move(0, kEnd);
699}
700
701inline int32_t
702CharacterIterator::startIndex(void) const {
703    return begin;
704}
705
706inline int32_t
707CharacterIterator::endIndex(void) const {
708    return end;
709}
710
711inline int32_t
712CharacterIterator::getIndex(void) const {
713    return pos;
714}
715
716inline int32_t
717CharacterIterator::getLength(void) const {
718    return textLength;
719}
720
721U_NAMESPACE_END
722#endif
723