1/*
2*******************************************************************************
3* Copyright (C) 2009-2013, International Business Machines Corporation and
4* others. All Rights Reserved.
5*******************************************************************************
6*/
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
11
12#include "unicode/alphaindex.h"
13#include "unicode/coleitr.h"
14#include "unicode/coll.h"
15#include "unicode/localpointer.h"
16#include "unicode/normalizer2.h"
17#include "unicode/tblcoll.h"
18#include "unicode/ulocdata.h"
19#include "unicode/uniset.h"
20#include "unicode/uobject.h"
21#include "unicode/usetiter.h"
22#include "unicode/utf16.h"
23
24#include "cmemory.h"
25#include "cstring.h"
26#include "uassert.h"
27#include "uvector.h"
28
29//#include <string>
30//#include <iostream>
31
32#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
33
34U_NAMESPACE_BEGIN
35
36namespace {
37
38/**
39 * Prefix string for Chinese index buckets.
40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
41 */
42const UChar BASE[1] = { 0xFDD0 };
43const int32_t BASE_LENGTH = 1;
44
45UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
46                                const UnicodeString &one, const UnicodeString &other);
47
48}  // namespace
49
50static int32_t U_CALLCONV
51collatorComparator(const void *context, const void *left, const void *right);
52
53static int32_t U_CALLCONV
54recordCompareFn(const void *context, const void *left, const void *right);
55
56//  UVector<Record *> support function, delete a Record.
57static void U_CALLCONV
58alphaIndex_deleteRecord(void *obj) {
59    delete static_cast<AlphabeticIndex::Record *>(obj);
60}
61
62namespace {
63
64UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
65                           UErrorCode &errorCode) {
66    if (U_FAILURE(errorCode)) { return NULL; }
67    if (owned.isValid()) {
68        return owned.orphan();
69    }
70    UnicodeString *p = new UnicodeString(s);
71    if (p == NULL) {
72        errorCode = U_MEMORY_ALLOCATION_ERROR;
73    }
74    return p;
75}
76
77inline UnicodeString *getString(const UVector &list, int32_t i) {
78    return static_cast<UnicodeString *>(list[i]);
79}
80
81inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
82    return static_cast<AlphabeticIndex::Bucket *>(list[i]);
83}
84
85inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
86    return static_cast<AlphabeticIndex::Record *>(list[i]);
87}
88
89/**
90 * Like Java Collections.binarySearch(List, String, Comparator).
91 *
92 * @return the index>=0 where the item was found,
93 *         or the index<0 for inserting the string at ~index in sorted order
94 */
95int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
96    if (list.size() == 0) { return ~0; }
97    int32_t start = 0;
98    int32_t limit = list.size();
99    for (;;) {
100        int32_t i = (start + limit) / 2;
101        const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
102        UErrorCode errorCode = U_ZERO_ERROR;
103        UCollationResult cmp = coll.compare(s, *si, errorCode);
104        if (cmp == UCOL_EQUAL) {
105            return i;
106        } else if (cmp < 0) {
107            if (i == start) {
108                return ~start;  // insert s before *si
109            }
110            limit = i;
111        } else {
112            if (i == start) {
113                return ~(start + 1);  // insert s after *si
114            }
115            start = i;
116        }
117    }
118}
119
120}  // namespace
121
122// The BucketList is not in the anonymous namespace because only Clang
123// seems to support its use in other classes from there.
124// However, we also don't need U_I18N_API because it is not used from outside the i18n library.
125class BucketList : public UObject {
126public:
127    BucketList(UVector *bucketList, UVector *publicBucketList)
128            : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
129        int32_t displayIndex = 0;
130        for (int32_t i = 0; i < publicBucketList->size(); ++i) {
131            getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
132        }
133    }
134
135    // The virtual destructor must not be inline.
136    // See ticket #8454 for details.
137    virtual ~BucketList();
138
139    int32_t getBucketCount() const {
140        return immutableVisibleList_->size();
141    }
142
143    int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
144                           UErrorCode &errorCode) {
145        // binary search
146        int32_t start = 0;
147        int32_t limit = bucketList_->size();
148        while ((start + 1) < limit) {
149            int32_t i = (start + limit) / 2;
150            const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
151            UCollationResult nameVsBucket =
152                collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
153            if (nameVsBucket < 0) {
154                limit = i;
155            } else {
156                start = i;
157            }
158        }
159        const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
160        if (bucket->displayBucket_ != NULL) {
161            bucket = bucket->displayBucket_;
162        }
163        return bucket->displayIndex_;
164    }
165
166    /** All of the buckets, visible and invisible. */
167    UVector *bucketList_;
168    /** Just the visible buckets. */
169    UVector *immutableVisibleList_;
170};
171
172BucketList::~BucketList() {
173    delete bucketList_;
174    if (immutableVisibleList_ != bucketList_) {
175        delete immutableVisibleList_;
176    }
177}
178
179AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
180    delete buckets_;
181    delete collatorPrimaryOnly_;
182}
183
184int32_t
185AlphabeticIndex::ImmutableIndex::getBucketCount() const {
186    return buckets_->getBucketCount();
187}
188
189int32_t
190AlphabeticIndex::ImmutableIndex::getBucketIndex(
191        const UnicodeString &name, UErrorCode &errorCode) const {
192    return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
193}
194
195const AlphabeticIndex::Bucket *
196AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
197    if (0 <= index && index < buckets_->getBucketCount()) {
198        return icu::getBucket(*buckets_->immutableVisibleList_, index);
199    } else {
200        return NULL;
201    }
202}
203
204AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
205        : inputList_(NULL),
206          labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
207          maxLabelCount_(99),
208          initialLabels_(NULL), firstCharsInScripts_(NULL),
209          collator_(NULL), collatorPrimaryOnly_(NULL),
210          buckets_(NULL) {
211    init(&locale, status);
212}
213
214
215AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
216        : inputList_(NULL),
217          labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
218          maxLabelCount_(99),
219          initialLabels_(NULL), firstCharsInScripts_(NULL),
220          collator_(collator), collatorPrimaryOnly_(NULL),
221          buckets_(NULL) {
222    init(NULL, status);
223}
224
225
226
227AlphabeticIndex::~AlphabeticIndex() {
228    delete collator_;
229    delete collatorPrimaryOnly_;
230    delete firstCharsInScripts_;
231    delete buckets_;
232    delete inputList_;
233    delete initialLabels_;
234}
235
236
237AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
238    if (U_FAILURE(status)) {
239        return *this;
240    }
241    initialLabels_->addAll(additions);
242    clearBuckets();
243    return *this;
244}
245
246
247AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
248    addIndexExemplars(locale, status);
249    clearBuckets();
250    return *this;
251}
252
253
254AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
255    if (U_FAILURE(errorCode)) { return NULL; }
256    // In C++, the ImmutableIndex must own its copy of the BucketList,
257    // even if it contains no records, for proper memory management.
258    // We could clone the buckets_ if they are not NULL,
259    // but that would be worth it only if this method is called multiple times,
260    // or called after using the old-style bucket iterator API.
261    LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
262    LocalPointer<RuleBasedCollator> coll(
263        static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
264    if (immutableBucketList.isNull() || coll.isNull()) {
265        errorCode = U_MEMORY_ALLOCATION_ERROR;
266        return NULL;
267    }
268    ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
269    if (immIndex == NULL) {
270        errorCode = U_MEMORY_ALLOCATION_ERROR;
271        return NULL;
272    }
273    // The ImmutableIndex adopted its parameter objects.
274    immutableBucketList.orphan();
275    coll.orphan();
276    return immIndex;
277}
278
279int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
280    initBuckets(status);
281    if (U_FAILURE(status)) {
282        return 0;
283    }
284    return buckets_->getBucketCount();
285}
286
287
288int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
289    if (U_FAILURE(status) || inputList_ == NULL) {
290        return 0;
291    }
292    return inputList_->size();
293}
294
295void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
296    const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
297    if (U_FAILURE(errorCode)) { return; }
298
299    const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
300    const UnicodeString &overflowBoundary =
301        *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
302
303    // We make a sorted array of elements.
304    // Some of the input may be redundant.
305    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
306    // We filter out those cases.
307    UnicodeSetIterator iter(*initialLabels_);
308    while (iter.next()) {
309        const UnicodeString *item = &iter.getString();
310        LocalPointer<UnicodeString> ownedItem;
311        UBool checkDistinct;
312        int32_t itemLength = item->length();
313        if (!item->hasMoreChar32Than(0, itemLength, 1)) {
314            checkDistinct = FALSE;
315        } else if(item->charAt(itemLength - 1) == 0x2a &&  // '*'
316                item->charAt(itemLength - 2) != 0x2a) {
317            // Use a label if it is marked with one trailing star,
318            // even if the label string sorts the same when all contractions are suppressed.
319            ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
320            item = ownedItem.getAlias();
321            if (item == NULL) {
322                errorCode = U_MEMORY_ALLOCATION_ERROR;
323                return;
324            }
325            checkDistinct = FALSE;
326        } else {
327            checkDistinct = TRUE;
328        }
329        if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
330            // Ignore a primary-ignorable or non-alphabetic index character.
331        } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
332            // Ignore an index characters that will land in the overflow bucket.
333        } else if (checkDistinct &&
334                collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
335            // Ignore a multi-code point index character that does not sort distinctly
336            // from the sequence of its separate characters.
337        } else {
338            int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
339            if (insertionPoint < 0) {
340                indexCharacters.insertElementAt(
341                    ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
342            } else {
343                const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
344                if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
345                    indexCharacters.setElementAt(
346                        ownedString(*item, ownedItem, errorCode), insertionPoint);
347                }
348            }
349        }
350    }
351    if (U_FAILURE(errorCode)) { return; }
352
353    // if the result is still too large, cut down to maxCount elements, by removing every nth element
354
355    int32_t size = indexCharacters.size() - 1;
356    if (size > maxLabelCount_) {
357        int32_t count = 0;
358        int32_t old = -1;
359        for (int32_t i = 0; i < indexCharacters.size();) {
360            ++count;
361            int32_t bump = count * maxLabelCount_ / size;
362            if (bump == old) {
363                indexCharacters.removeElementAt(i);
364            } else {
365                old = bump;
366                ++i;
367            }
368        }
369    }
370}
371
372namespace {
373
374const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
375    if (!current.startsWith(BASE, BASE_LENGTH)) {
376        return current;
377    }
378    UChar rest = current.charAt(BASE_LENGTH);
379    if (0x2800 < rest && rest <= 0x28FF) { // stroke count
380        int32_t count = rest-0x2800;
381        temp.setTo((UChar)(0x30 + count % 10));
382        if (count >= 10) {
383            count /= 10;
384            temp.insert(0, (UChar)(0x30 + count % 10));
385            if (count >= 10) {
386                count /= 10;
387                temp.insert(0, (UChar)(0x30 + count));
388            }
389        }
390        return temp.append((UChar)0x5283);
391    }
392    return temp.setTo(current, BASE_LENGTH);
393}
394
395UBool hasMultiplePrimaryWeights(
396        CollationElementIterator &cei, int32_t variableTop,
397        const UnicodeString &s, UErrorCode &errorCode) {
398    cei.setText(s, errorCode);
399    UBool seenPrimary = FALSE;
400    for (;;) {
401        int32_t ce32 = cei.next(errorCode);
402        if (ce32 == CollationElementIterator::NULLORDER) {
403            break;
404        }
405        int32_t p = CollationElementIterator::primaryOrder(ce32);
406        if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
407            // not primary ignorable, and not a continuation CE
408            if (seenPrimary) {
409                return TRUE;
410            }
411            seenPrimary = TRUE;
412        }
413    }
414    return FALSE;
415}
416
417}  // namespace
418
419BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
420    // Initialize indexCharacters.
421    UVector indexCharacters(errorCode);
422    indexCharacters.setDeleter(uprv_deleteUObject);
423    initLabels(indexCharacters, errorCode);
424    if (U_FAILURE(errorCode)) { return NULL; }
425
426    // Variables for hasMultiplePrimaryWeights().
427    LocalPointer<CollationElementIterator> cei(
428        collatorPrimaryOnly_->createCollationElementIterator(emptyString_));
429    if (cei.isNull()) {
430        errorCode = U_MEMORY_ALLOCATION_ERROR;
431        return NULL;
432    }
433    int32_t variableTop;
434    if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
435        variableTop = CollationElementIterator::primaryOrder(
436            (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
437    } else {
438        variableTop = 0;
439    }
440    UBool hasInvisibleBuckets = FALSE;
441
442    // Helper arrays for Chinese Pinyin collation.
443    Bucket *asciiBuckets[26] = {
444        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
445        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
446    };
447    Bucket *pinyinBuckets[26] = {
448        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
449        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
450    };
451    UBool hasPinyin = FALSE;
452
453    LocalPointer<UVector> bucketList(new UVector(errorCode));
454    if (bucketList.isNull()) {
455        errorCode = U_MEMORY_ALLOCATION_ERROR;
456        return NULL;
457    }
458    bucketList->setDeleter(uprv_deleteUObject);
459
460    // underflow bucket
461    Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
462    if (bucket == NULL) {
463        errorCode = U_MEMORY_ALLOCATION_ERROR;
464        return NULL;
465    }
466    bucketList->addElement(bucket, errorCode);
467    if (U_FAILURE(errorCode)) { return NULL; }
468
469    UnicodeString temp;
470
471    // fix up the list, adding underflow, additions, overflow
472    // Insert inflow labels as needed.
473    int32_t scriptIndex = -1;
474    const UnicodeString *scriptUpperBoundary = &emptyString_;
475    for (int32_t i = 0; i < indexCharacters.size(); ++i) {
476        UnicodeString &current = *getString(indexCharacters, i);
477        if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
478            // We crossed the script boundary into a new script.
479            const UnicodeString &inflowBoundary = *scriptUpperBoundary;
480            UBool skippedScript = FALSE;
481            for (;;) {
482                scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
483                if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
484                    break;
485                }
486                skippedScript = TRUE;
487            }
488            if (skippedScript && bucketList->size() > 1) {
489                // We are skipping one or more scripts,
490                // and we are not just getting out of the underflow label.
491                bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
492                if (bucket == NULL) {
493                    errorCode = U_MEMORY_ALLOCATION_ERROR;
494                    return NULL;
495                }
496                bucketList->addElement(bucket, errorCode);
497            }
498        }
499        // Add a bucket with the current label.
500        bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
501        if (bucket == NULL) {
502            errorCode = U_MEMORY_ALLOCATION_ERROR;
503            return NULL;
504        }
505        bucketList->addElement(bucket, errorCode);
506        // Remember ASCII and Pinyin buckets for Pinyin redirects.
507        UChar c;
508        if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) {  // A-Z
509            asciiBuckets[c - 0x41] = bucket;
510        } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
511                0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
512            pinyinBuckets[c - 0x41] = bucket;
513            hasPinyin = TRUE;
514        }
515        // Check for multiple primary weights.
516        if (!current.startsWith(BASE, BASE_LENGTH) &&
517                hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&
518                current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
519            // "AE-ligature" or "Sch" etc.
520            for (int32_t i = bucketList->size() - 2;; --i) {
521                Bucket *singleBucket = getBucket(*bucketList, i);
522                if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
523                    // There is no single-character bucket since the last
524                    // underflow or inflow label.
525                    break;
526                }
527                if (singleBucket->displayBucket_ == NULL &&
528                        !hasMultiplePrimaryWeights(
529                            *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) {
530                    // Add an invisible bucket that redirects strings greater than the expansion
531                    // to the previous single-character bucket.
532                    // For example, after ... Q R S Sch we add Sch\uFFFF->S
533                    // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
534                    bucket = new Bucket(emptyString_,
535                        UnicodeString(current).append((UChar)0xFFFF),
536                        U_ALPHAINDEX_NORMAL);
537                    if (bucket == NULL) {
538                        errorCode = U_MEMORY_ALLOCATION_ERROR;
539                        return NULL;
540                    }
541                    bucket->displayBucket_ = singleBucket;
542                    bucketList->addElement(bucket, errorCode);
543                    hasInvisibleBuckets = TRUE;
544                    break;
545                }
546            }
547        }
548    }
549    if (U_FAILURE(errorCode)) { return NULL; }
550    if (bucketList->size() == 1) {
551        // No real labels, show only the underflow label.
552        BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
553        if (bl == NULL) {
554            errorCode = U_MEMORY_ALLOCATION_ERROR;
555            return NULL;
556        }
557        bucketList.orphan();
558        return bl;
559    }
560    // overflow bucket
561    bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
562    if (bucket == NULL) {
563        errorCode = U_MEMORY_ALLOCATION_ERROR;
564        return NULL;
565    }
566    bucketList->addElement(bucket, errorCode); // final
567
568    if (hasPinyin) {
569        // Redirect Pinyin buckets.
570        Bucket *asciiBucket = NULL;
571        for (int32_t i = 0; i < 26; ++i) {
572            if (asciiBuckets[i] != NULL) {
573                asciiBucket = asciiBuckets[i];
574            }
575            if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
576                pinyinBuckets[i]->displayBucket_ = asciiBucket;
577                hasInvisibleBuckets = TRUE;
578            }
579        }
580    }
581
582    if (U_FAILURE(errorCode)) { return NULL; }
583    if (!hasInvisibleBuckets) {
584        BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
585        if (bl == NULL) {
586            errorCode = U_MEMORY_ALLOCATION_ERROR;
587            return NULL;
588        }
589        bucketList.orphan();
590        return bl;
591    }
592    // Merge inflow buckets that are visually adjacent.
593    // Iterate backwards: Merge inflow into overflow rather than the other way around.
594    int32_t i = bucketList->size() - 1;
595    Bucket *nextBucket = getBucket(*bucketList, i);
596    while (--i > 0) {
597        bucket = getBucket(*bucketList, i);
598        if (bucket->displayBucket_ != NULL) {
599            continue;  // skip invisible buckets
600        }
601        if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
602            if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
603                bucket->displayBucket_ = nextBucket;
604                continue;
605            }
606        }
607        nextBucket = bucket;
608    }
609
610    LocalPointer<UVector> publicBucketList(new UVector(errorCode));
611    if (bucketList.isNull()) {
612        errorCode = U_MEMORY_ALLOCATION_ERROR;
613        return NULL;
614    }
615    // Do not call publicBucketList->setDeleter():
616    // This vector shares its objects with the bucketList.
617    for (int32_t i = 0; i < bucketList->size(); ++i) {
618        bucket = getBucket(*bucketList, i);
619        if (bucket->displayBucket_ == NULL) {
620            publicBucketList->addElement(bucket, errorCode);
621        }
622    }
623    if (U_FAILURE(errorCode)) { return NULL; }
624    BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
625    if (bl == NULL) {
626        errorCode = U_MEMORY_ALLOCATION_ERROR;
627        return NULL;
628    }
629    bucketList.orphan();
630    publicBucketList.orphan();
631    return bl;
632}
633
634/**
635 * Creates an index, and buckets and sorts the list of records into the index.
636 */
637void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
638    if (U_FAILURE(errorCode) || buckets_ != NULL) {
639        return;
640    }
641    buckets_ = createBucketList(errorCode);
642    if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
643        return;
644    }
645
646    // Sort the records by name.
647    // Stable sort preserves input order of collation duplicates.
648    inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
649
650    // Now, we traverse all of the input, which is now sorted.
651    // If the item doesn't go in the current bucket, we find the next bucket that contains it.
652    // This makes the process order n*log(n), since we just sort the list and then do a linear process.
653    // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
654    // we need to improve it for that case.
655
656    Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
657    int32_t bucketIndex = 1;
658    Bucket *nextBucket;
659    const UnicodeString *upperBoundary;
660    if (bucketIndex < buckets_->bucketList_->size()) {
661        nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
662        upperBoundary = &nextBucket->lowerBoundary_;
663    } else {
664        nextBucket = NULL;
665        upperBoundary = NULL;
666    }
667    for (int32_t i = 0; i < inputList_->size(); ++i) {
668        Record *r = getRecord(*inputList_, i);
669        // if the current bucket isn't the right one, find the one that is
670        // We have a special flag for the last bucket so that we don't look any further
671        while (upperBoundary != NULL &&
672                collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
673            currentBucket = nextBucket;
674            // now reset the boundary that we compare against
675            if (bucketIndex < buckets_->bucketList_->size()) {
676                nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
677                upperBoundary = &nextBucket->lowerBoundary_;
678            } else {
679                upperBoundary = NULL;
680            }
681        }
682        // now put the record into the bucket.
683        Bucket *bucket = currentBucket;
684        if (bucket->displayBucket_ != NULL) {
685            bucket = bucket->displayBucket_;
686        }
687        if (bucket->records_ == NULL) {
688            bucket->records_ = new UVector(errorCode);
689            if (bucket->records_ == NULL) {
690                errorCode = U_MEMORY_ALLOCATION_ERROR;
691                return;
692            }
693        }
694        bucket->records_->addElement(r, errorCode);
695    }
696}
697
698void AlphabeticIndex::clearBuckets() {
699    if (buckets_ != NULL) {
700        delete buckets_;
701        buckets_ = NULL;
702        internalResetBucketIterator();
703    }
704}
705
706void AlphabeticIndex::internalResetBucketIterator() {
707    labelsIterIndex_ = -1;
708    currentBucket_ = NULL;
709}
710
711
712void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
713    if (U_FAILURE(status)) { return; }
714    // Chinese index characters, which are specific to each of the several Chinese tailorings,
715    // take precedence over the single locale data exemplar set per language.
716    const char *language = locale.getLanguage();
717    if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
718            uprv_strcmp(language, "ko") == 0) {
719        // TODO: This should be done regardless of the language, but it's expensive.
720        // We should add a Collator function (can be @internal)
721        // to enumerate just the contractions that start with a given code point or string.
722        if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
723            return;
724        }
725    }
726
727    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
728    if (U_FAILURE(status)) {
729        return;
730    }
731
732    UnicodeSet exemplars;
733    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
734    if (U_SUCCESS(status)) {
735        initialLabels_->addAll(exemplars);
736        return;
737    }
738    status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR
739
740    // The locale data did not include explicit Index characters.
741    // Synthesize a set of them from the locale's standard exemplar characters.
742    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
743    if (U_FAILURE(status)) {
744        return;
745    }
746
747    // question: should we add auxiliary exemplars?
748    if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
749        exemplars.add(0x61, 0x7A);
750    }
751    if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
752        // cut down to small list
753        exemplars.remove(0xAC00, 0xD7A3).
754            add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
755            add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
756            add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
757            add(0xD30C).add(0xD558);
758    }
759    if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
760        // cut down to small list
761        // make use of the fact that Ethiopic is allocated in 8's, where
762        // the base is 0 mod 8.
763        UnicodeSet ethiopic(
764            UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
765        UnicodeSetIterator it(ethiopic);
766        while (it.next() && !it.isString()) {
767            if ((it.getCodepoint() & 0x7) != 0) {
768                exemplars.remove(it.getCodepoint());
769            }
770        }
771    }
772
773    // Upper-case any that aren't already so.
774    //   (We only do this for synthesized index characters.)
775    UnicodeSetIterator it(exemplars);
776    UnicodeString upperC;
777    while (it.next()) {
778        const UnicodeString &exemplarC = it.getString();
779        upperC = exemplarC;
780        upperC.toUpper(locale);
781        initialLabels_->add(upperC);
782    }
783}
784
785UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
786    UnicodeSet contractions;
787    ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
788                                      contractions.toUSet(), NULL, FALSE, &errorCode);
789    if (U_FAILURE(errorCode)) { return FALSE; }
790    UnicodeString firstHanBoundary;
791    UBool hasPinyin = FALSE;
792    UnicodeSetIterator iter(contractions);
793    while (iter.next()) {
794        const UnicodeString &s = iter.getString();
795        if (s.startsWith(BASE, BASE_LENGTH)) {
796            initialLabels_->add(s);
797            if (firstHanBoundary.isEmpty() ||
798                    collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
799                firstHanBoundary = s;
800            }
801            UChar c = s.charAt(s.length() - 1);
802            if (0x41 <= c && c <= 0x5A) {  // A-Z
803                hasPinyin = TRUE;
804            }
805        }
806    }
807    if (hasPinyin) {
808        initialLabels_->add(0x41, 0x5A);  // A-Z
809    }
810    if (!firstHanBoundary.isEmpty()) {
811        // The hardcoded list of script boundaries includes U+4E00
812        // which is tailored to not be the first primary
813        // in all Chinese tailorings except "unihan".
814        // Replace U+4E00 with the first boundary string from the tailoring.
815        // TODO: This becomes obsolete when the root collator gets
816        // reliable script-first-primary mappings.
817        int32_t hanIndex = binarySearch(
818                *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
819        if (hanIndex >= 0) {
820            UnicodeString *fh = new UnicodeString(firstHanBoundary);
821            if (fh == NULL) {
822                errorCode = U_MEMORY_ALLOCATION_ERROR;
823                return FALSE;
824            }
825            firstCharsInScripts_->setElementAt(fh, hanIndex);
826        }
827        return TRUE;
828    } else {
829        return FALSE;
830    }
831}
832
833
834/*
835 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
836 */
837static const UChar CGJ = 0x034F;
838UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
839    UnicodeString result;
840    if (item.length() == 0) {
841        return result;
842    }
843    int32_t i = 0;
844    for (;;) {
845        UChar32  cp = item.char32At(i);
846        result.append(cp);
847        i = item.moveIndex32(i, 1);
848        if (i >= item.length()) {
849            break;
850        }
851        result.append(CGJ);
852    }
853    return result;
854}
855
856
857UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
858    return FALSE;
859}
860
861
862UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
863    return FALSE;
864}
865
866
867const RuleBasedCollator &AlphabeticIndex::getCollator() const {
868    // There are no known non-RuleBasedCollator collators, and none ever expected.
869    // But, in case that changes, better a null pointer than a wrong type.
870    return *dynamic_cast<RuleBasedCollator *>(collator_);
871}
872
873
874const UnicodeString &AlphabeticIndex::getInflowLabel() const {
875    return inflowLabel_;
876}
877
878const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
879    return overflowLabel_;
880}
881
882
883const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
884    return underflowLabel_;
885}
886
887
888AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
889    inflowLabel_ = label;
890    clearBuckets();
891    return *this;
892}
893
894
895AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
896    overflowLabel_ = label;
897    clearBuckets();
898    return *this;
899}
900
901
902AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
903    underflowLabel_ = label;
904    clearBuckets();
905    return *this;
906}
907
908
909int32_t AlphabeticIndex::getMaxLabelCount() const {
910    return maxLabelCount_;
911}
912
913
914AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
915    if (U_FAILURE(status)) {
916        return *this;
917    }
918    if (maxLabelCount <= 0) {
919        status = U_ILLEGAL_ARGUMENT_ERROR;
920        return *this;
921    }
922    maxLabelCount_ = maxLabelCount;
923    clearBuckets();
924    return *this;
925}
926
927
928//
929//  init() - Common code for constructors.
930//
931
932void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
933    if (U_FAILURE(status)) { return; }
934    if (locale == NULL && collator_ == NULL) {
935        status = U_ILLEGAL_ARGUMENT_ERROR;
936        return;
937    }
938
939    initialLabels_         = new UnicodeSet();
940    if (initialLabels_ == NULL) {
941        status = U_MEMORY_ALLOCATION_ERROR;
942        return;
943    }
944
945    inflowLabel_.setTo((UChar)0x2026);    // Ellipsis
946    overflowLabel_ = inflowLabel_;
947    underflowLabel_ = inflowLabel_;
948
949    if (collator_ == NULL) {
950        collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status));
951        if (U_FAILURE(status)) { return; }
952        if (collator_ == NULL) {
953            status = U_MEMORY_ALLOCATION_ERROR;
954            return;
955        }
956    }
957    collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
958    if (collatorPrimaryOnly_ == NULL) {
959        status = U_MEMORY_ALLOCATION_ERROR;
960        return;
961    }
962    collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
963    firstCharsInScripts_ = firstStringsInScript(status);
964    if (U_FAILURE(status)) { return; }
965    firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
966    UnicodeString _4E00((UChar)0x4E00);
967    UnicodeString _1100((UChar)0x1100);
968    UnicodeString _1112((UChar)0x1112);
969    if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
970            collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
971        // The standard Korean tailoring sorts Hanja (Han characters)
972        // as secondary differences from Hangul syllables.
973        // This makes U+4E00 not useful as a Han-script boundary.
974        // TODO: This becomes obsolete when the root collator gets
975        // reliable script-first-primary mappings.
976        int32_t hanIndex = binarySearch(
977                *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
978        if (hanIndex >= 0) {
979            firstCharsInScripts_->removeElementAt(hanIndex);
980        }
981    }
982    // Guard against a degenerate collator where
983    // some script boundary strings are primary ignorable.
984    for (;;) {
985        if (U_FAILURE(status)) { return; }
986        if (firstCharsInScripts_->isEmpty()) {
987            // AlphabeticIndex requires some non-ignorable script boundary strings.
988            status = U_ILLEGAL_ARGUMENT_ERROR;
989            return;
990        }
991        if (collatorPrimaryOnly_->compare(
992                *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
993                emptyString_, status) == UCOL_EQUAL) {
994            firstCharsInScripts_->removeElementAt(0);
995        } else {
996            break;
997        }
998    }
999
1000    if (locale != NULL) {
1001        addIndexExemplars(*locale, status);
1002    }
1003}
1004
1005
1006//
1007//  Comparison function for UVector<UnicodeString *> sorting with a collator.
1008//
1009static int32_t U_CALLCONV
1010collatorComparator(const void *context, const void *left, const void *right) {
1011    const UElement *leftElement = static_cast<const UElement *>(left);
1012    const UElement *rightElement = static_cast<const UElement *>(right);
1013    const UnicodeString *leftString  = static_cast<const UnicodeString *>(leftElement->pointer);
1014    const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
1015
1016    if (leftString == rightString) {
1017        // Catches case where both are NULL
1018        return 0;
1019    }
1020    if (leftString == NULL) {
1021        return 1;
1022    };
1023    if (rightString == NULL) {
1024        return -1;
1025    }
1026    const Collator *col = static_cast<const Collator *>(context);
1027    UErrorCode errorCode = U_ZERO_ERROR;
1028    return col->compare(*leftString, *rightString, errorCode);
1029}
1030
1031//
1032//  Comparison function for UVector<Record *> sorting with a collator.
1033//
1034static int32_t U_CALLCONV
1035recordCompareFn(const void *context, const void *left, const void *right) {
1036    const UElement *leftElement = static_cast<const UElement *>(left);
1037    const UElement *rightElement = static_cast<const UElement *>(right);
1038    const AlphabeticIndex::Record *leftRec  = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
1039    const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
1040    const Collator *col = static_cast<const Collator *>(context);
1041    UErrorCode errorCode = U_ZERO_ERROR;
1042    return col->compare(leftRec->name_, rightRec->name_, errorCode);
1043}
1044
1045
1046/**
1047 * This list contains one character per script that has the
1048 * lowest primary weight for that script in the root collator.
1049 * This list will be copied and sorted to account for script reordering.
1050 *
1051 * <p>TODO: This is fragile. If the first character of a script is tailored
1052 * so that it does not map to the script's lowest primary weight any more,
1053 * then the buckets will be off.
1054 * There are hacks in the code to handle the known CJK tailorings of U+4E00.
1055 *
1056 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
1057 *
1058 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
1059 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
1060 */
1061static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] =  {
1062    0x41, 0, 0x03B1, 0,
1063    0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
1064    0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
1065    0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
1066    0xAAF2, 0,  // Meetei Mayek
1067    0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
1068    U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0,  // Sharada
1069    U16_LEAD(0x11680), U16_TRAIL(0x11680), 0,  // Takri
1070    0x1B83, 0,
1071    0xD802, 0xDE00, 0, 0x0E01, 0,
1072    0x0EDE, 0,  // Lao
1073    0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
1074    0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
1075    U16_LEAD(0x11103), U16_TRAIL(0x11103), 0,  // Chakma
1076    0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
1077    0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
1078    0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
1079    U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0,  // Miao
1080    0xD800, 0xDE80, 0,
1081    0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
1082    0xD801, 0xDC80, 0,
1083    U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0,  // Sora Sompeng
1084    0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
1085    0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
1086    U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0,  // Meroitic Cursive
1087    U16_LEAD(0x10980), U16_TRAIL(0x10980), 0,  // Meroitic Hieroglyphs
1088    0x4E00, 0,
1089    // TODO: The overflow bucket's lowerBoundary string should be the
1090    // first item after the last reordering group in the collator's script order.
1091    // This should normally be the first Unicode code point
1092    // that is unassigned (U+0378 in Unicode 6.3) and untailored.
1093    // However, at least up to ICU 51 the Hani reordering group includes
1094    // unassigned code points,
1095    // and there is no stable string for the start of the trailing-weights range.
1096    // The only known string that sorts "high" is U+FFFF.
1097    // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
1098    // and fix relevant test code.
1099    // Ideally, FractionalUCA.txt will have a "script first primary"
1100    // for unassigned code points.
1101    0xFFFF, 0
1102};
1103
1104UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
1105    if (U_FAILURE(status)) {
1106        return NULL;
1107    }
1108    UVector *dest = new UVector(status);
1109    if (dest == NULL) {
1110        status = U_MEMORY_ALLOCATION_ERROR;
1111        return NULL;
1112    }
1113    dest->setDeleter(uprv_deleteUObject);
1114    const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS;
1115    const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
1116    do {
1117        if (U_FAILURE(status)) {
1118            return dest;
1119        }
1120        UnicodeString *str = new UnicodeString(src, -1);
1121        if (str == NULL) {
1122            status = U_MEMORY_ALLOCATION_ERROR;
1123            return dest;
1124        }
1125        dest->addElement(str, status);
1126        src += str->length() + 1;
1127    } while (src < limit);
1128    return dest;
1129}
1130
1131
1132namespace {
1133
1134/**
1135 * Returns true if one index character string is "better" than the other.
1136 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1137 * better, and otherwise binary-less-than is better.
1138 */
1139UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1140                                const UnicodeString &one, const UnicodeString &other) {
1141    // This is called with primary-equal strings, but never with one.equals(other).
1142    UErrorCode status = U_ZERO_ERROR;
1143    UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1144    UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1145    if (U_FAILURE(status)) { return FALSE; }
1146    int32_t result = n1.countChar32() - n2.countChar32();
1147    if (result != 0) {
1148        return result < 0;
1149    }
1150    result = n1.compareCodePointOrder(n2);
1151    if (result != 0) {
1152        return result < 0;
1153    }
1154    return one.compareCodePointOrder(other) < 0;
1155}
1156
1157}  // namespace
1158
1159//
1160//  Constructor & Destructor for AlphabeticIndex::Record
1161//
1162//     Records are internal only, instances are not directly surfaced in the public API.
1163//     This class is mostly struct-like, with all public fields.
1164
1165AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1166        : name_(name), data_(data) {}
1167
1168AlphabeticIndex::Record::~Record() {
1169}
1170
1171
1172AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1173    if (U_FAILURE(status)) {
1174        return *this;
1175    }
1176    if (inputList_ == NULL) {
1177        inputList_ = new UVector(status);
1178        if (inputList_ == NULL) {
1179            status = U_MEMORY_ALLOCATION_ERROR;
1180            return *this;
1181        }
1182        inputList_->setDeleter(alphaIndex_deleteRecord);
1183    }
1184    Record *r = new Record(name, data);
1185    if (r == NULL) {
1186        status = U_MEMORY_ALLOCATION_ERROR;
1187        return *this;
1188    }
1189    inputList_->addElement(r, status);
1190    clearBuckets();
1191    //std::string ss;
1192    //std::string ss2;
1193    //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1194    //             "   sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1195    return *this;
1196}
1197
1198
1199AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1200    if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1201        inputList_->removeAllElements();
1202        clearBuckets();
1203    }
1204    return *this;
1205}
1206
1207int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1208    initBuckets(status);
1209    if (U_FAILURE(status)) {
1210        return 0;
1211    }
1212    return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
1213}
1214
1215
1216int32_t AlphabeticIndex::getBucketIndex() const {
1217    return labelsIterIndex_;
1218}
1219
1220
1221UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1222    if (U_FAILURE(status)) {
1223        return FALSE;
1224    }
1225    if (buckets_ == NULL && currentBucket_ != NULL) {
1226        status = U_ENUM_OUT_OF_SYNC_ERROR;
1227        return FALSE;
1228    }
1229    initBuckets(status);
1230    if (U_FAILURE(status)) {
1231        return FALSE;
1232    }
1233    ++labelsIterIndex_;
1234    if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1235        labelsIterIndex_ = buckets_->getBucketCount();
1236        return FALSE;
1237    }
1238    currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
1239    resetRecordIterator();
1240    return TRUE;
1241}
1242
1243const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1244    if (currentBucket_ != NULL) {
1245        return currentBucket_->label_;
1246    } else {
1247        return emptyString_;
1248    }
1249}
1250
1251
1252UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1253    if (currentBucket_ != NULL) {
1254        return currentBucket_->labelType_;
1255    } else {
1256        return U_ALPHAINDEX_NORMAL;
1257    }
1258}
1259
1260
1261int32_t AlphabeticIndex::getBucketRecordCount() const {
1262    if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
1263        return currentBucket_->records_->size();
1264    } else {
1265        return 0;
1266    }
1267}
1268
1269
1270AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1271    if (U_FAILURE(status)) {
1272        return *this;
1273    }
1274    internalResetBucketIterator();
1275    return *this;
1276}
1277
1278
1279UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1280    if (U_FAILURE(status)) {
1281        return FALSE;
1282    }
1283    if (currentBucket_ == NULL) {
1284        // We are trying to iterate over the items in a bucket, but there is no
1285        // current bucket from the enumeration of buckets.
1286        status = U_INVALID_STATE_ERROR;
1287        return FALSE;
1288    }
1289    if (buckets_ == NULL) {
1290        status = U_ENUM_OUT_OF_SYNC_ERROR;
1291        return FALSE;
1292    }
1293    if (currentBucket_->records_ == NULL) {
1294        return FALSE;
1295    }
1296    ++itemsIterIndex_;
1297    if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1298        itemsIterIndex_  = currentBucket_->records_->size();
1299        return FALSE;
1300    }
1301    return TRUE;
1302}
1303
1304
1305const UnicodeString &AlphabeticIndex::getRecordName() const {
1306    const UnicodeString *retStr = &emptyString_;
1307    if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1308        itemsIterIndex_ >= 0 &&
1309        itemsIterIndex_ < currentBucket_->records_->size()) {
1310            Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1311            retStr = &item->name_;
1312    }
1313    return *retStr;
1314}
1315
1316const void *AlphabeticIndex::getRecordData() const {
1317    const void *retPtr = NULL;
1318    if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1319        itemsIterIndex_ >= 0 &&
1320        itemsIterIndex_ < currentBucket_->records_->size()) {
1321            Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1322            retPtr = item->data_;
1323    }
1324    return retPtr;
1325}
1326
1327
1328AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1329    itemsIterIndex_ = -1;
1330    return *this;
1331}
1332
1333
1334
1335AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1336                                const UnicodeString &lowerBoundary,
1337                                UAlphabeticIndexLabelType type)
1338        : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1339          displayBucket_(NULL), displayIndex_(-1),
1340          records_(NULL) {
1341}
1342
1343
1344AlphabeticIndex::Bucket::~Bucket() {
1345    delete records_;
1346}
1347
1348U_NAMESPACE_END
1349
1350#endif
1351