1/*
2*******************************************************************************
3*   Copyright (C) 1996-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ucol.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11* Modification history
12* Date        Name      Comments
13* 1996-1999   various members of ICU team maintained C API for collation framework
14* 02/16/2001  synwee    Added internal method getPrevSpecialCE
15* 03/01/2001  synwee    Added maxexpansion functionality.
16* 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_COLLATION
22
23#include "unicode/bytestream.h"
24#include "unicode/coleitr.h"
25#include "unicode/unorm.h"
26#include "unicode/udata.h"
27#include "unicode/ustring.h"
28#include "unicode/utf8.h"
29
30#include "ucol_imp.h"
31#include "bocsu.h"
32
33#include "normalizer2impl.h"
34#include "unorm_it.h"
35#include "umutex.h"
36#include "cmemory.h"
37#include "ucln_in.h"
38#include "cstring.h"
39#include "utracimp.h"
40#include "putilimp.h"
41#include "uassert.h"
42#include "unicode/coll.h"
43
44#ifdef UCOL_DEBUG
45#include <stdio.h>
46#endif
47
48U_NAMESPACE_USE
49
// Number of elements of a true array (not a pointer), as an int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

// Mask selecting the lowest-order byte of a value, and the bit shift that
// moves the second-lowest byte into that position.
#define LAST_BYTE_MASK_           0xFF
#define SECOND_LAST_BYTE_SHIFT_   8

// NOTE(review): presumably the limit below which code units have a zero
// combining class; it is not used in this part of the file -- confirm at its use site.
#define ZERO_CC_LIMIT_            0xC0
56
// Static pointers to the NFD normalizer and the NFC implementation instance.
// They are filled in lazily by initializeNFD() and initializeFCD() and are
// reset to NULL in ucol_cleanup().
// Each of them is always the same between calls to u_cleanup,
// and therefore writing to it is not synchronized.
static const Normalizer2 *g_nfd = NULL;
static const Normalizer2Impl *g_nfcImpl = NULL;
63
// These are values from the UCA that are required for
// implicit generation and for suppressing sort key compression.
// They should normally come from the UCA itself, but if one
// is running without the UCA, that could be a problem.
static const int32_t maxRegularPrimary  = 0x7A;
static const int32_t minImplicitPrimary = 0xE0;
static const int32_t maxImplicitPrimary = 0xE4;
71
72U_CDECL_BEGIN
73static UBool U_CALLCONV
74ucol_cleanup(void)
75{
76    g_nfd = NULL;
77    g_nfcImpl = NULL;
78    return TRUE;
79}
80
81static int32_t U_CALLCONV
82_getFoldingOffset(uint32_t data) {
83    return (int32_t)(data&0xFFFFFF);
84}
85
86U_CDECL_END
87
88static inline
89UBool initializeNFD(UErrorCode *status) {
90    if (g_nfd != NULL) {
91        return TRUE;
92    } else {
93        // The result is constant, until the library is reloaded.
94        g_nfd = Normalizer2Factory::getNFDInstance(*status);
95        ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
96        return U_SUCCESS(*status);
97    }
98}
99
100// init FCD data
101static inline
102UBool initializeFCD(UErrorCode *status) {
103    if (g_nfcImpl != NULL) {
104        return TRUE;
105    } else {
106        // The result is constant, until the library is reloaded.
107        g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
108        // Note: Alternatively, we could also store this pointer in each collIterate struct,
109        // same as Normalizer2Factory::getImpl(collIterate->nfd).
110        ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
111        return U_SUCCESS(*status);
112    }
113}
114
115static
116inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
117                              int32_t sourceLen, collIterate *s,
118                              UErrorCode *status)
119{
120    (s)->string = (s)->pos = sourceString;
121    (s)->origFlags = 0;
122    (s)->flags = 0;
123    if (sourceLen >= 0) {
124        s->flags |= UCOL_ITER_HASLEN;
125        (s)->endp = (UChar *)sourceString+sourceLen;
126    }
127    else {
128        /* change to enable easier checking for end of string for fcdpositon */
129        (s)->endp = NULL;
130    }
131    (s)->extendCEs = NULL;
132    (s)->extendCEsSize = 0;
133    (s)->CEpos = (s)->toReturn = (s)->CEs;
134    (s)->offsetBuffer = NULL;
135    (s)->offsetBufferSize = 0;
136    (s)->offsetReturn = (s)->offsetStore = NULL;
137    (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
138    (s)->coll = (collator);
139    if (initializeNFD(status)) {
140        (s)->nfd = g_nfd;
141    } else {
142        return;
143    }
144    (s)->fcdPosition = 0;
145    if(collator->normalizationMode == UCOL_ON) {
146        (s)->flags |= UCOL_ITER_NORM;
147    }
148    if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
149        (s)->flags |= UCOL_HIRAGANA_Q;
150    }
151    (s)->iterator = NULL;
152    //(s)->iteratorIndex = 0;
153}
154
155U_CAPI void  U_EXPORT2
156uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
157                             int32_t sourceLen, collIterate *s,
158                             UErrorCode *status) {
159    /* Out-of-line version for use from other files. */
160    IInit_collIterate(collator, sourceString, sourceLen, s, status);
161}
162
163U_CAPI collIterate * U_EXPORT2
164uprv_new_collIterate(UErrorCode *status) {
165    if(U_FAILURE(*status)) {
166        return NULL;
167    }
168    collIterate *s = new collIterate;
169    if(s == NULL) {
170        *status = U_MEMORY_ALLOCATION_ERROR;
171        return NULL;
172    }
173    return s;
174}
175
176U_CAPI void U_EXPORT2
177uprv_delete_collIterate(collIterate *s) {
178    delete s;
179}
180
181U_CAPI UBool U_EXPORT2
182uprv_collIterateAtEnd(collIterate *s) {
183    return s == NULL || s->pos == s->endp;
184}
185
186/**
187* Backup the state of the collIterate struct data
188* @param data collIterate to backup
189* @param backup storage
190*/
191static
192inline void backupState(const collIterate *data, collIterateState *backup)
193{
194    backup->fcdPosition = data->fcdPosition;
195    backup->flags       = data->flags;
196    backup->origFlags   = data->origFlags;
197    backup->pos         = data->pos;
198    backup->bufferaddress = data->writableBuffer.getBuffer();
199    backup->buffersize    = data->writableBuffer.length();
200    backup->iteratorMove = 0;
201    backup->iteratorIndex = 0;
202    if(data->iterator != NULL) {
203        //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
204        backup->iteratorIndex = data->iterator->getState(data->iterator);
205        // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
206        if(backup->iteratorIndex == UITER_NO_STATE) {
207            while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
208                backup->iteratorMove++;
209                data->iterator->move(data->iterator, -1, UITER_CURRENT);
210            }
211            data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
212        }
213    }
214}
215
216/**
217* Loads the state into the collIterate struct data
218* @param data collIterate to backup
219* @param backup storage
220* @param forwards boolean to indicate if forwards iteration is used,
221*        false indicates backwards iteration
222*/
223static
224inline void loadState(collIterate *data, const collIterateState *backup,
225                      UBool        forwards)
226{
227    UErrorCode status = U_ZERO_ERROR;
228    data->flags       = backup->flags;
229    data->origFlags   = backup->origFlags;
230    if(data->iterator != NULL) {
231        //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
232        data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
233        if(backup->iteratorMove != 0) {
234            data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
235        }
236    }
237    data->pos         = backup->pos;
238
239    if ((data->flags & UCOL_ITER_INNORMBUF) &&
240        data->writableBuffer.getBuffer() != backup->bufferaddress) {
241        /*
242        this is when a new buffer has been reallocated and we'll have to
243        calculate the new position.
244        note the new buffer has to contain the contents of the old buffer.
245        */
246        if (forwards) {
247            data->pos = data->writableBuffer.getTerminatedBuffer() +
248                                         (data->pos - backup->bufferaddress);
249        }
250        else {
251            /* backwards direction */
252            int32_t temp = backup->buffersize -
253                                  (int32_t)(data->pos - backup->bufferaddress);
254            data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
255        }
256    }
257    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
258        /*
259        this is alittle tricky.
260        if we are initially not in the normalization buffer, even if we
261        normalize in the later stage, the data in the buffer will be
262        ignored, since we skip back up to the data string.
263        however if we are already in the normalization buffer, any
264        further normalization will pull data into the normalization
265        buffer and modify the fcdPosition.
266        since we are keeping the data in the buffer for use, the
267        fcdPosition can not be reverted back.
268        arrgghh....
269        */
270        data->fcdPosition = backup->fcdPosition;
271    }
272}
273
274static UBool
275reallocCEs(collIterate *data, int32_t newCapacity) {
276    uint32_t *oldCEs = data->extendCEs;
277    if(oldCEs == NULL) {
278        oldCEs = data->CEs;
279    }
280    int32_t length = data->CEpos - oldCEs;
281    uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
282    if(newCEs == NULL) {
283        return FALSE;
284    }
285    uprv_memcpy(newCEs, oldCEs, length * 4);
286    uprv_free(data->extendCEs);
287    data->extendCEs = newCEs;
288    data->extendCEsSize = newCapacity;
289    data->CEpos = newCEs + length;
290    return TRUE;
291}
292
293static UBool
294increaseCEsCapacity(collIterate *data) {
295    int32_t oldCapacity;
296    if(data->extendCEs != NULL) {
297        oldCapacity = data->extendCEsSize;
298    } else {
299        oldCapacity = LENGTHOF(data->CEs);
300    }
301    return reallocCEs(data, 2 * oldCapacity);
302}
303
304static UBool
305ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
306    int32_t oldCapacity;
307    if(data->extendCEs != NULL) {
308        oldCapacity = data->extendCEsSize;
309    } else {
310        oldCapacity = LENGTHOF(data->CEs);
311    }
312    if(minCapacity <= oldCapacity) {
313        return TRUE;
314    }
315    oldCapacity *= 2;
316    return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
317}
318
319void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
320    if(U_FAILURE(errorCode)) {
321        return;
322    }
323    int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
324    U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);
325    if(length >= offsetBufferSize) {
326        int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
327        int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4));
328        if(newBuffer == NULL) {
329            errorCode = U_MEMORY_ALLOCATION_ERROR;
330            return;
331        }
332        if(length > 0) {
333            uprv_memcpy(newBuffer, offsetBuffer, length * 4);
334        }
335        uprv_free(offsetBuffer);
336        offsetBuffer = newBuffer;
337        offsetStore = offsetBuffer + length;
338        offsetBufferSize = newCapacity;
339    }
340    *offsetStore++ = offset;
341}
342
343/*
344* collIter_eos()
345*     Checks for a collIterate being positioned at the end of
346*     its source string.
347*
348*/
349static
350inline UBool collIter_eos(collIterate *s) {
351    if(s->flags & UCOL_USE_ITERATOR) {
352      return !(s->iterator->hasNext(s->iterator));
353    }
354    if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
355        // Null terminated string, but not at null, so not at end.
356        //   Whether in main or normalization buffer doesn't matter.
357        return FALSE;
358    }
359
360    // String with length.  Can't be in normalization buffer, which is always
361    //  null termintated.
362    if (s->flags & UCOL_ITER_HASLEN) {
363        return (s->pos == s->endp);
364    }
365
366    // We are at a null termination, could be either normalization buffer or main string.
367    if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
368        // At null at end of main string.
369        return TRUE;
370    }
371
372    // At null at end of normalization buffer.  Need to check whether there there are
373    //   any characters left in the main buffer.
374    if(s->origFlags & UCOL_USE_ITERATOR) {
375      return !(s->iterator->hasNext(s->iterator));
376    } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
377        // Null terminated main string.  fcdPosition is the 'return' position into main buf.
378        return (*s->fcdPosition == 0);
379    }
380    else {
381        // Main string with an end pointer.
382        return s->fcdPosition == s->endp;
383    }
384}
385
386/*
387* collIter_bos()
388*     Checks for a collIterate being positioned at the start of
389*     its source string.
390*
391*/
392static
393inline UBool collIter_bos(collIterate *source) {
394  // if we're going backwards, we need to know whether there is more in the
395  // iterator, even if we are in the side buffer
396  if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
397    return !source->iterator->hasPrevious(source->iterator);
398  }
399  if (source->pos <= source->string ||
400      ((source->flags & UCOL_ITER_INNORMBUF) &&
401      *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
402    return TRUE;
403  }
404  return FALSE;
405}
406
407/*static
408inline UBool collIter_SimpleBos(collIterate *source) {
409  // if we're going backwards, we need to know whether there is more in the
410  // iterator, even if we are in the side buffer
411  if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
412    return !source->iterator->hasPrevious(source->iterator);
413  }
414  if (source->pos == source->string) {
415    return TRUE;
416  }
417  return FALSE;
418}*/
419    //return (data->pos == data->string) ||
420
421
422/****************************************************************************/
423/* Following are the open/close functions                                   */
424/*                                                                          */
425/****************************************************************************/
426
427static UCollator*
428ucol_initFromBinary(const uint8_t *bin, int32_t length,
429                const UCollator *base,
430                UCollator *fillIn,
431                UErrorCode *status)
432{
433    UCollator *result = fillIn;
434    if(U_FAILURE(*status)) {
435        return NULL;
436    }
437    /*
438    if(base == NULL) {
439        // we don't support null base yet
440        *status = U_ILLEGAL_ARGUMENT_ERROR;
441        return NULL;
442    }
443    */
444    // We need these and we could be running without UCA
445    uprv_uca_initImplicitConstants(status);
446    UCATableHeader *colData = (UCATableHeader *)bin;
447    // do we want version check here? We're trying to figure out whether collators are compatible
448    if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
449        uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
450        colData->version[0] != UCOL_BUILDER_VERSION)
451    {
452        *status = U_COLLATOR_VERSION_MISMATCH;
453        return NULL;
454    }
455    else {
456        if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
457            result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
458            if(U_FAILURE(*status)){
459                return NULL;
460            }
461            result->hasRealData = TRUE;
462        }
463        else {
464            if(base) {
465                result = ucol_initCollator(base->image, result, base, status);
466                ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
467                if(U_FAILURE(*status)){
468                    return NULL;
469                }
470                result->hasRealData = FALSE;
471            }
472            else {
473                *status = U_USELESS_COLLATOR_ERROR;
474                return NULL;
475            }
476        }
477        result->freeImageOnClose = FALSE;
478    }
479    result->actualLocale = NULL;
480    result->validLocale = NULL;
481    result->requestedLocale = NULL;
482    result->rules = NULL;
483    result->rulesLength = 0;
484    result->freeRulesOnClose = FALSE;
485    result->ucaRules = NULL;
486    return result;
487}
488
489U_CAPI UCollator* U_EXPORT2
490ucol_openBinary(const uint8_t *bin, int32_t length,
491                const UCollator *base,
492                UErrorCode *status)
493{
494    return ucol_initFromBinary(bin, length, base, NULL, status);
495}
496
497U_CAPI int32_t U_EXPORT2
498ucol_cloneBinary(const UCollator *coll,
499                 uint8_t *buffer, int32_t capacity,
500                 UErrorCode *status)
501{
502    int32_t length = 0;
503    if(U_FAILURE(*status)) {
504        return length;
505    }
506    if(capacity < 0) {
507        *status = U_ILLEGAL_ARGUMENT_ERROR;
508        return length;
509    }
510    if(coll->hasRealData == TRUE) {
511        length = coll->image->size;
512        if(length <= capacity) {
513            uprv_memcpy(buffer, coll->image, length);
514        } else {
515            *status = U_BUFFER_OVERFLOW_ERROR;
516        }
517    } else {
518        length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
519        if(length <= capacity) {
520            /* build the UCATableHeader with minimal entries */
521            /* do not copy the header from the UCA file because its values are wrong! */
522            /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
523
524            /* reset everything */
525            uprv_memset(buffer, 0, length);
526
527            /* set the tailoring-specific values */
528            UCATableHeader *myData = (UCATableHeader *)buffer;
529            myData->size = length;
530
531            /* offset for the options, the only part of the data that is present after the header */
532            myData->options = sizeof(UCATableHeader);
533
534            /* need to always set the expansion value for an upper bound of the options */
535            myData->expansion = myData->options + sizeof(UColOptionSet);
536
537            myData->magic = UCOL_HEADER_MAGIC;
538            myData->isBigEndian = U_IS_BIG_ENDIAN;
539            myData->charSetFamily = U_CHARSET_FAMILY;
540
541            /* copy UCA's version; genrb will override all but the builder version with tailoring data */
542            uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
543
544            uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
545            uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
546            uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
547            myData->jamoSpecial = coll->image->jamoSpecial;
548
549            /* copy the collator options */
550            uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
551        } else {
552            *status = U_BUFFER_OVERFLOW_ERROR;
553        }
554    }
555    return length;
556}
557
558U_CAPI UCollator* U_EXPORT2
559ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
560{
561    UCollator * localCollator;
562    int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
563    char *stackBufferChars = (char *)stackBuffer;
564    int32_t imageSize = 0;
565    int32_t rulesSize = 0;
566    int32_t rulesPadding = 0;
567    int32_t defaultReorderCodesSize = 0;
568    int32_t reorderCodesSize = 0;
569    uint8_t *image;
570    UChar *rules;
571    int32_t* defaultReorderCodes;
572    int32_t* reorderCodes;
573    uint8_t* leadBytePermutationTable;
574    UBool colAllocated = FALSE;
575    UBool imageAllocated = FALSE;
576
577    if (status == NULL || U_FAILURE(*status)){
578        return 0;
579    }
580    if ((stackBuffer && !pBufferSize) || !coll){
581       *status = U_ILLEGAL_ARGUMENT_ERROR;
582        return 0;
583    }
584
585    if (coll->rules && coll->freeRulesOnClose) {
586        rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
587        rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
588        bufferSizeNeeded += rulesSize + rulesPadding;
589    }
590    // no padding for alignment needed from here since the next two are 4 byte quantities
591    if (coll->defaultReorderCodes) {
592        defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
593        bufferSizeNeeded += defaultReorderCodesSize;
594    }
595    if (coll->reorderCodes) {
596        reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
597        bufferSizeNeeded += reorderCodesSize;
598    }
599    if (coll->leadBytePermutationTable) {
600        bufferSizeNeeded += 256 * sizeof(uint8_t);
601    }
602
603    if (stackBuffer && *pBufferSize <= 0) { /* 'preflighting' request - set needed size into *pBufferSize */
604        *pBufferSize =  bufferSizeNeeded;
605        return 0;
606    }
607
608    /* Pointers on 64-bit platforms need to be aligned
609     * on a 64-bit boundry in memory.
610     */
611    if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
612        int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
613        if (*pBufferSize > offsetUp) {
614            *pBufferSize -= offsetUp;
615            stackBufferChars += offsetUp;
616        }
617        else {
618            /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
619            *pBufferSize = 1;
620        }
621    }
622    stackBuffer = (void *)stackBufferChars;
623
624    if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
625        /* allocate one here...*/
626        stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
627        // Null pointer check.
628        if (stackBufferChars == NULL) {
629            *status = U_MEMORY_ALLOCATION_ERROR;
630            return NULL;
631        }
632        colAllocated = TRUE;
633        if (U_SUCCESS(*status)) {
634            *status = U_SAFECLONE_ALLOCATED_WARNING;
635        }
636    }
637    localCollator = (UCollator *)stackBufferChars;
638    rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
639    defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
640    reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
641    leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
642
643    {
644        UErrorCode tempStatus = U_ZERO_ERROR;
645        imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
646    }
647    if (coll->freeImageOnClose) {
648        image = (uint8_t *)uprv_malloc(imageSize);
649        // Null pointer check
650        if (image == NULL) {
651            *status = U_MEMORY_ALLOCATION_ERROR;
652            return NULL;
653        }
654        ucol_cloneBinary(coll, image, imageSize, status);
655        imageAllocated = TRUE;
656    }
657    else {
658        image = (uint8_t *)coll->image;
659    }
660    localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
661    if (U_FAILURE(*status)) {
662        return NULL;
663    }
664
665    if (coll->rules) {
666        if (coll->freeRulesOnClose) {
667            localCollator->rules = u_strcpy(rules, coll->rules);
668            //bufferEnd += rulesSize;
669        }
670        else {
671            localCollator->rules = coll->rules;
672        }
673        localCollator->freeRulesOnClose = FALSE;
674        localCollator->rulesLength = coll->rulesLength;
675    }
676
677    // collator reordering
678    if (coll->defaultReorderCodes) {
679        localCollator->defaultReorderCodes =
680            (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
681        localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
682        localCollator->freeDefaultReorderCodesOnClose = FALSE;
683    }
684    if (coll->reorderCodes) {
685        localCollator->reorderCodes =
686            (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
687        localCollator->reorderCodesLength = coll->reorderCodesLength;
688        localCollator->freeReorderCodesOnClose = FALSE;
689    }
690    if (coll->leadBytePermutationTable) {
691        localCollator->leadBytePermutationTable =
692            (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
693        localCollator->freeLeadBytePermutationTableOnClose = FALSE;
694    }
695
696    int32_t i;
697    for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
698        ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
699    }
700    // zero copies of pointers
701    localCollator->actualLocale = NULL;
702    localCollator->validLocale = NULL;
703    localCollator->requestedLocale = NULL;
704    localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
705    localCollator->freeOnClose = colAllocated;
706    localCollator->freeImageOnClose = imageAllocated;
707    return localCollator;
708}
709
710U_CAPI void U_EXPORT2
711ucol_close(UCollator *coll)
712{
713    UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
714    UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
715    if(coll != NULL) {
716        // these are always owned by each UCollator struct,
717        // so we always free them
718        if(coll->validLocale != NULL) {
719            uprv_free(coll->validLocale);
720        }
721        if(coll->actualLocale != NULL) {
722            uprv_free(coll->actualLocale);
723        }
724        if(coll->requestedLocale != NULL) {
725            uprv_free(coll->requestedLocale);
726        }
727        if(coll->latinOneCEs != NULL) {
728            uprv_free(coll->latinOneCEs);
729        }
730        if(coll->options != NULL && coll->freeOptionsOnClose) {
731            uprv_free(coll->options);
732        }
733        if(coll->rules != NULL && coll->freeRulesOnClose) {
734            uprv_free((UChar *)coll->rules);
735        }
736        if(coll->image != NULL && coll->freeImageOnClose) {
737            uprv_free((UCATableHeader *)coll->image);
738        }
739
740        if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
741            uprv_free(coll->leadBytePermutationTable);
742        }
743        if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
744            uprv_free(coll->defaultReorderCodes);
745        }
746        if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
747            uprv_free(coll->reorderCodes);
748        }
749
750        if(coll->delegate != NULL) {
751          delete (Collator*)coll->delegate;
752        }
753
754        /* Here, it would be advisable to close: */
755        /* - UData for UCA (unless we stuff it in the root resb */
756        /* Again, do we need additional housekeeping... HMMM! */
757        UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
758        if(coll->freeOnClose){
759            /* for safeClone, if freeOnClose is FALSE,
760            don't free the other instance data */
761            uprv_free(coll);
762        }
763    }
764    UTRACE_EXIT();
765}
766
767/* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
768/* you should be able to get the binary chunk to write out...  Doesn't look very full now */
769U_CFUNC uint8_t* U_EXPORT2
770ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
771{
772    uint8_t *result = NULL;
773    if(U_FAILURE(*status)) {
774        return NULL;
775    }
776    if(coll->hasRealData == TRUE) {
777        *length = coll->image->size;
778        result = (uint8_t *)uprv_malloc(*length);
779        /* test for NULL */
780        if (result == NULL) {
781            *status = U_MEMORY_ALLOCATION_ERROR;
782            return NULL;
783        }
784        uprv_memcpy(result, coll->image, *length);
785    } else {
786        *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
787        result = (uint8_t *)uprv_malloc(*length);
788        /* test for NULL */
789        if (result == NULL) {
790            *status = U_MEMORY_ALLOCATION_ERROR;
791            return NULL;
792        }
793
794        /* build the UCATableHeader with minimal entries */
795        /* do not copy the header from the UCA file because its values are wrong! */
796        /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
797
798        /* reset everything */
799        uprv_memset(result, 0, *length);
800
801        /* set the tailoring-specific values */
802        UCATableHeader *myData = (UCATableHeader *)result;
803        myData->size = *length;
804
805        /* offset for the options, the only part of the data that is present after the header */
806        myData->options = sizeof(UCATableHeader);
807
808        /* need to always set the expansion value for an upper bound of the options */
809        myData->expansion = myData->options + sizeof(UColOptionSet);
810
811        myData->magic = UCOL_HEADER_MAGIC;
812        myData->isBigEndian = U_IS_BIG_ENDIAN;
813        myData->charSetFamily = U_CHARSET_FAMILY;
814
815        /* copy UCA's version; genrb will override all but the builder version with tailoring data */
816        uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
817
818        uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
819        uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
820        uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
821        myData->jamoSpecial = coll->image->jamoSpecial;
822
823        /* copy the collator options */
824        uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
825    }
826    return result;
827}
828
/**
 * Copies the attribute values of an option set into a collator, marks all of
 * them as defaults, and makes the collator refer to the option set.
 * @param result the collator to update
 * @param opts   the option set to read; result->options points to it afterwards
 * @param status in/out error code; nothing is done if it already indicates a failure
 */
void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return;
    }
    result->caseFirst = (UColAttributeValue)opts->caseFirst;
    result->caseLevel = (UColAttributeValue)opts->caseLevel;
    result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
    result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
    // Normalization needs the FCD data. If it cannot be loaded, stop here:
    // the attributes below, the default flags and result->options stay untouched.
    if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
        return;
    }
    result->strength = (UColAttributeValue)opts->strength;
    result->variableTopValue = opts->variableTopValue;
    result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
    result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
    result->numericCollation = (UColAttributeValue)opts->numericCollation;
    // Every attribute now carries the value from the option set, so flag all as default.
    result->caseFirstisDefault = TRUE;
    result->caseLevelisDefault = TRUE;
    result->frenchCollationisDefault = TRUE;
    result->normalizationModeisDefault = TRUE;
    result->strengthisDefault = TRUE;
    result->variableTopValueisDefault = TRUE;
    result->alternateHandlingisDefault = TRUE;
    result->hiraganaQisDefault = TRUE;
    result->numericCollationisDefault = TRUE;

    // Called after the attributes are set and before result->options is replaced.
    ucol_updateInternalState(result, status);

    result->options = opts;
}
859
860
861/**
862* Approximate determination if a character is at a contraction end.
863* Guaranteed to be TRUE if a character is at the end of a contraction,
864* otherwise it is not deterministic.
865* @param c character to be determined
866* @param coll collator
867*/
868static
869inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
870    if (c < coll->minContrEndCP) {
871        return FALSE;
872    }
873
874    int32_t  hash = c;
875    uint8_t  htbyte;
876    if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
877        if (U16_IS_TRAIL(c)) {
878            return TRUE;
879        }
880        hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
881    }
882    htbyte = coll->contrEndCP[hash>>3];
883    return (((htbyte >> (hash & 7)) & 1) == 1);
884}
885
886
887
888/*
889*   i_getCombiningClass()
890*        A fast, at least partly inline version of u_getCombiningClass()
891*        This is a candidate for further optimization.  Used heavily
892*        in contraction processing.
893*/
894static
895inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
896    uint8_t sCC = 0;
897    if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
898        sCC = u_getCombiningClass(c);
899    }
900    return sCC;
901}
902
903UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
904    UChar c;
905    UCollator *result = fillIn;
906    if(U_FAILURE(*status) || image == NULL) {
907        return NULL;
908    }
909
910    if(result == NULL) {
911        result = (UCollator *)uprv_malloc(sizeof(UCollator));
912        if(result == NULL) {
913            *status = U_MEMORY_ALLOCATION_ERROR;
914            return result;
915        }
916        result->freeOnClose = TRUE;
917    } else {
918        result->freeOnClose = FALSE;
919    }
920
921    result->delegate = NULL;
922
923    result->image = image;
924    result->mapping.getFoldingOffset = _getFoldingOffset;
925    const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
926    utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
927    if(U_FAILURE(*status)) {
928        if(result->freeOnClose == TRUE) {
929            uprv_free(result);
930            result = NULL;
931        }
932        return result;
933    }
934
935    result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
936    result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
937    result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
938    result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
939    result->rules = NULL;
940    result->rulesLength = 0;
941    result->freeRulesOnClose = FALSE;
942    result->defaultReorderCodes = NULL;
943    result->defaultReorderCodesLength = 0;
944    result->freeDefaultReorderCodesOnClose = FALSE;
945    result->reorderCodes = NULL;
946    result->reorderCodesLength = 0;
947    result->freeReorderCodesOnClose = FALSE;
948    result->leadBytePermutationTable = NULL;
949    result->freeLeadBytePermutationTableOnClose = FALSE;
950
951    /* get the version info from UCATableHeader and populate the Collator struct*/
952    result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
953    result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
954    result->dataVersion[2] = 0;
955    result->dataVersion[3] = 0;
956
957    result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
958    result->minUnsafeCP = 0;
959    for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
960        if (ucol_unsafeCP(c, result)) break;
961    }
962    result->minUnsafeCP = c;
963
964    result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
965    result->minContrEndCP = 0;
966    for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
967        if (ucol_contractionEndCP(c, result)) break;
968    }
969    result->minContrEndCP = c;
970
971    /* max expansion tables */
972    result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
973                                         result->image->endExpansionCE);
974    result->lastEndExpansionCE = result->endExpansionCE +
975                                 result->image->endExpansionCECount - 1;
976    result->expansionCESize = (uint8_t*)result->image +
977                                               result->image->expansionCESize;
978
979
980    //result->errorCode = *status;
981
982    result->latinOneCEs = NULL;
983
984    result->latinOneRegenTable = FALSE;
985    result->latinOneFailed = FALSE;
986    result->UCA = UCA;
987
988    /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
989    result->ucaRules = NULL;
990    result->actualLocale = NULL;
991    result->validLocale = NULL;
992    result->requestedLocale = NULL;
993    result->hasRealData = FALSE; // real data lives in .dat file...
994    result->freeImageOnClose = FALSE;
995
996    /* set attributes */
997    ucol_setOptionsFromHeader(
998        result,
999        (UColOptionSet*)((uint8_t*)result->image+result->image->options),
1000        status);
1001    result->freeOptionsOnClose = FALSE;
1002
1003    return result;
1004}
1005
1006/* new Mark's code */
1007
1008/**
1009 * For generation of Implicit CEs
1010 * @author Davis
1011 *
1012 * Cleaned up so that changes can be made more easily.
1013 * Old values:
1014# First Implicit: E26A792D
1015# Last Implicit: E3DC70C0
1016# First CJK: E0030300
1017# Last CJK: E0A9DD00
1018# First CJK_A: E0A9DF00
1019# Last CJK_A: E0DE3100
1020 */
/* Following is a port of Mark's code for new treatment of implicits.
 * It is positioned here, since ucol_initUCA needs to initialize the
 * variables below according to the data in the fractional UCA.
 */
1025
1026/**
1027 * Function used to:
1028 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
1029 * b) bump any non-CJK characters by 10FFFF.
1030 * The relevant blocks are:
1031 * A:    4E00..9FFF; CJK Unified Ideographs
1032 *       F900..FAFF; CJK Compatibility Ideographs
1033 * B:    3400..4DBF; CJK Unified Ideographs Extension A
1034 *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
1035 * As long as
1036 *   no new B characters are allocated between 4E00 and FAFF, and
1037 *   no new A characters are outside of this range,
1038 * (very high probability) this simple code will work.
1039 * The reordered blocks are:
1040 * Block1 is CJK
1041 * Block2 is CJK_COMPAT_USED
1042 * Block3 is CJK_A
1043 * (all contiguous)
1044 * Any other CJK gets its normal code point
1045 * Any non-CJK gets +10FFFF
1046 * When we reorder Block1, we make sure that it is at the very start,
1047 * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
1050 */
1051
1052// CONSTANTS
1053static const UChar32
1054    NON_CJK_OFFSET = 0x110000,
1055    UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
1056
/**
 * Parameters of the implicit-weight encoding.
 * All of them start out as 0 and are computed by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;  // spacing between final bytes of 3-byte forms
static int32_t final4Multiplier = 0;  // spacing between final bytes of 4-byte forms
static int32_t final3Count = 0;       // number of final-byte values in 3-byte forms
static int32_t final4Count = 0;       // number of final-byte values in 4-byte forms
static int32_t medialCount = 0;       // number of values of a middle byte
static int32_t min3Primary = 0;       // first lead byte of 3-byte forms
static int32_t min4Primary = 0;       // first lead byte of 4-byte forms
static int32_t max4Primary = 0;       // last lead byte of 4-byte forms
static int32_t minTrail = 0;          // smallest trailing byte value
static int32_t maxTrail = 0;          // largest trailing byte value
static int32_t max3Trail = 0;         // largest final byte of a 3-byte form
static int32_t max4Trail = 0;         // largest final byte of a 4-byte form
static int32_t min4Boundary = 0;      // first raw value that gets a 4-byte form
1074
1075static const UChar32
1076    // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
1077    // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;  (Unicode 6.1)
1078    CJK_BASE = 0x4E00,
1079    CJK_LIMIT = 0x9FCC+1,
1080    // Unified CJK ideographs in the compatibility ideographs block.
1081    CJK_COMPAT_USED_BASE = 0xFA0E,
1082    CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1083    // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
1084    // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
1085    CJK_A_BASE = 0x3400,
1086    CJK_A_LIMIT = 0x4DB5+1,
1087    // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
1088    // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
1089    CJK_B_BASE = 0x20000,
1090    CJK_B_LIMIT = 0x2A6D6+1,
1091    // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
1092    // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
1093    CJK_C_BASE = 0x2A700,
1094    CJK_C_LIMIT = 0x2B734+1,
1095    // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
1096    // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
1097    CJK_D_BASE = 0x2B740,
1098    CJK_D_LIMIT = 0x2B81D+1;
1099    // when adding to this list, look for all occurrences (in project)
1100    // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
1101
1102static UChar32 swapCJK(UChar32 i) {
1103    if (i < CJK_A_BASE) {
1104        // non-CJK
1105    } else if (i < CJK_A_LIMIT) {
1106        // Extension A has lower code points than the original Unihan+compat
1107        // but sorts higher.
1108        return i - CJK_A_BASE
1109                + (CJK_LIMIT - CJK_BASE)
1110                + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1111    } else if (i < CJK_BASE) {
1112        // non-CJK
1113    } else if (i < CJK_LIMIT) {
1114        return i - CJK_BASE;
1115    } else if (i < CJK_COMPAT_USED_BASE) {
1116        // non-CJK
1117    } else if (i < CJK_COMPAT_USED_LIMIT) {
1118        return i - CJK_COMPAT_USED_BASE
1119                + (CJK_LIMIT - CJK_BASE);
1120    } else if (i < CJK_B_BASE) {
1121        // non-CJK
1122    } else if (i < CJK_B_LIMIT) {
1123        return i; // non-BMP-CJK
1124    } else if (i < CJK_C_BASE) {
1125        // non-CJK
1126    } else if (i < CJK_C_LIMIT) {
1127        return i; // non-BMP-CJK
1128    } else if (i < CJK_D_BASE) {
1129        // non-CJK
1130    } else if (i < CJK_D_LIMIT) {
1131        return i; // non-BMP-CJK
1132    }
1133    return i + NON_CJK_OFFSET; // non-CJK
1134}
1135
1136U_CAPI UChar32 U_EXPORT2
1137uprv_uca_getRawFromCodePoint(UChar32 i) {
1138    return swapCJK(i)+1;
1139}
1140
1141U_CAPI UChar32 U_EXPORT2
1142uprv_uca_getCodePointFromRaw(UChar32 i) {
1143    i--;
1144    UChar32 result = 0;
1145    if(i >= NON_CJK_OFFSET) {
1146        result = i - NON_CJK_OFFSET;
1147    } else if(i >= CJK_B_BASE) {
1148        result = i;
1149    } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1150        if(i < CJK_LIMIT - CJK_BASE) {
1151            result = i + CJK_BASE;
1152        } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1153            result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1154        } else {
1155            result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1156        }
1157    } else {
1158        result = -1;
1159    }
1160    return result;
1161}
1162
1163// GET IMPLICIT PRIMARY WEIGHTS
1164// Return value is left justified primary key
1165U_CAPI uint32_t U_EXPORT2
1166uprv_uca_getImplicitFromRaw(UChar32 cp) {
1167    /*
1168    if (cp < 0 || cp > UCOL_MAX_INPUT) {
1169        throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1170    }
1171    */
1172    int32_t last0 = cp - min4Boundary;
1173    if (last0 < 0) {
1174        int32_t last1 = cp / final3Count;
1175        last0 = cp % final3Count;
1176
1177        int32_t last2 = last1 / medialCount;
1178        last1 %= medialCount;
1179
1180        last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1181        last1 = minTrail + last1; // offset
1182        last2 = min3Primary + last2; // offset
1183        /*
1184        if (last2 >= min4Primary) {
1185            throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1186        }
1187        */
1188        return (last2 << 24) + (last1 << 16) + (last0 << 8);
1189    } else {
1190        int32_t last1 = last0 / final4Count;
1191        last0 %= final4Count;
1192
1193        int32_t last2 = last1 / medialCount;
1194        last1 %= medialCount;
1195
1196        int32_t last3 = last2 / medialCount;
1197        last2 %= medialCount;
1198
1199        last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1200        last1 = minTrail + last1; // offset
1201        last2 = minTrail + last2; // offset
1202        last3 = min4Primary + last3; // offset
1203        /*
1204        if (last3 > max4Primary) {
1205            throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1206        }
1207        */
1208        return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1209    }
1210}
1211
1212static uint32_t U_EXPORT2
1213uprv_uca_getImplicitPrimary(UChar32 cp) {
1214   //fprintf(stdout, "Incoming: %04x\n", cp);
1215    //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1216
1217    cp = swapCJK(cp);
1218    cp++;
1219    // we now have a range of numbers from 0 to 21FFFF.
1220
1221    //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1222    //fprintf(stdout, "CJK swapped: %04x\n", cp);
1223
1224    return uprv_uca_getImplicitFromRaw(cp);
1225}
1226
1227/**
1228 * Converts implicit CE into raw integer ("code point")
1229 * @param implicit
1230 * @return -1 if illegal format
1231 */
1232U_CAPI UChar32 U_EXPORT2
1233uprv_uca_getRawFromImplicit(uint32_t implicit) {
1234    UChar32 result;
1235    UChar32 b3 = implicit & 0xFF;
1236    UChar32 b2 = (implicit >> 8) & 0xFF;
1237    UChar32 b1 = (implicit >> 16) & 0xFF;
1238    UChar32 b0 = (implicit >> 24) & 0xFF;
1239
1240    // simple parameter checks
1241    if (b0 < min3Primary || b0 > max4Primary
1242        || b1 < minTrail || b1 > maxTrail)
1243        return -1;
1244    // normal offsets
1245    b1 -= minTrail;
1246
1247    // take care of the final values, and compose
1248    if (b0 < min4Primary) {
1249        if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1250            return -1;
1251        b2 -= minTrail;
1252        UChar32 remainder = b2 % final3Multiplier;
1253        if (remainder != 0)
1254            return -1;
1255        b0 -= min3Primary;
1256        b2 /= final3Multiplier;
1257        result = ((b0 * medialCount) + b1) * final3Count + b2;
1258    } else {
1259        if (b2 < minTrail || b2 > maxTrail
1260            || b3 < minTrail || b3 > max4Trail)
1261            return -1;
1262        b2 -= minTrail;
1263        b3 -= minTrail;
1264        UChar32 remainder = b3 % final4Multiplier;
1265        if (remainder != 0)
1266            return -1;
1267        b3 /= final4Multiplier;
1268        b0 -= min4Primary;
1269        result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1270    }
1271    // final check
1272    if (result < 0 || result > UCOL_MAX_INPUT)
1273        return -1;
1274    return result;
1275}
1276
1277
/* Computes 1 + (a-1)/b, which is a/b rounded up when a and b are positive. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int quotientBelow = (a - 1) / b;
    return quotientBelow + 1;
}
1281
1282/* this function is either called from initUCA or from genUCA before
1283 * doing canonical closure for the UCA.
1284 */
1285
1286/**
1287 * Set up to generate implicits.
1288 * Maintenance Note:  this function may end up being called more than once, due
1289 *                    to threading races during initialization.  Make sure that
1290 *                    none of the Constants is ever transiently assigned an
1291 *                    incorrect value.
1292 * @param minPrimary
1293 * @param maxPrimary
1294 * @param minTrail final byte
1295 * @param maxTrail final byte
1296 * @param gap3 the gap we leave for tailoring for 3-byte forms
1297 * @param gap4 the gap we leave for tailoring for 4-byte forms
1298 */
1299static void initImplicitConstants(int minPrimary, int maxPrimary,
1300                                    int minTrailIn, int maxTrailIn,
1301                                    int gap3, int primaries3count,
1302                                    UErrorCode *status) {
1303    // some simple parameter checks
1304    if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1305        || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1306        || (primaries3count < 1))
1307    {
1308        *status = U_ILLEGAL_ARGUMENT_ERROR;
1309        return;
1310    };
1311
1312    minTrail = minTrailIn;
1313    maxTrail = maxTrailIn;
1314
1315    min3Primary = minPrimary;
1316    max4Primary = maxPrimary;
1317    // compute constants for use later.
1318    // number of values we can use in trailing bytes
1319    // leave room for empty values between AND above, e.g. if gap = 2
1320    // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1321    // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1322    // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1323    final3Multiplier = gap3 + 1;
1324    final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1325    max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1326
1327    // medials can use full range
1328    medialCount = (maxTrail - minTrail + 1);
1329    // find out how many values fit in each form
1330    int32_t threeByteCount = medialCount * final3Count;
1331    // now determine where the 3/4 boundary is.
1332    // we use 3 bytes below the boundary, and 4 above
1333    int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1334    int32_t primaries4count = primariesAvailable - primaries3count;
1335
1336
1337    int32_t min3ByteCoverage = primaries3count * threeByteCount;
1338    min4Primary = minPrimary + primaries3count;
1339    min4Boundary = min3ByteCoverage;
1340    // Now expand out the multiplier for the 4 bytes, and redo.
1341
1342    int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1343    int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1344    int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1345    int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1346    if (gap4 < 1) {
1347        *status = U_ILLEGAL_ARGUMENT_ERROR;
1348        return;
1349    }
1350    final4Multiplier = gap4 + 1;
1351    final4Count = neededPerFinalByte;
1352    max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1353}
1354
1355    /**
1356     * Supply parameters for generating implicit CEs
1357     */
1358U_CAPI void U_EXPORT2
1359uprv_uca_initImplicitConstants(UErrorCode *status) {
1360    // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1361    //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1362    initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1363}
1364
1365
1366/*    collIterNormalize     Incremental Normalization happens here.                       */
1367/*                          pick up the range of chars identifed by FCD,                  */
1368/*                          normalize it into the collIterate's writable buffer,          */
1369/*                          switch the collIterate's state to use the writable buffer.    */
1370/*                                                                                        */
1371static
1372void collIterNormalize(collIterate *collationSource)
1373{
1374    UErrorCode  status = U_ZERO_ERROR;
1375    const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1376    const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1377
1378    collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1379                                    collationSource->writableBuffer,
1380                                    status);
1381    if (U_FAILURE(status)) {
1382#ifdef UCOL_DEBUG
1383        fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
1384#endif
1385        return;
1386    }
1387
1388    collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
1389    collationSource->origFlags  = collationSource->flags;
1390    collationSource->flags     |= UCOL_ITER_INNORMBUF;
1391    collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1392}
1393
1394
1395// This function takes the iterator and extracts normalized stuff up to the next boundary
1396// It is similar in the end results to the collIterNormalize, but for the cases when we
1397// use an iterator
1398/*static
1399inline void normalizeIterator(collIterate *collationSource) {
1400  UErrorCode status = U_ZERO_ERROR;
1401  UBool wasNormalized = FALSE;
1402  //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1403  uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1404  int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1405    (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1406  if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1407    // reallocate and terminate
1408    if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1409                               &collationSource->writableBuffer,
1410                               (int32_t *)&collationSource->writableBufSize, normLen + 1,
1411                               0)
1412    ) {
1413    #ifdef UCOL_DEBUG
1414        fprintf(stderr, "normalizeIterator(), out of memory\n");
1415    #endif
1416        return;
1417    }
1418    status = U_ZERO_ERROR;
1419    //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1420    collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1421    normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1422    (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1423  }
1424  // Terminate the buffer - we already checked that it is big enough
1425  collationSource->writableBuffer[normLen] = 0;
1426  if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1427      collationSource->flags |= UCOL_ITER_ALLOCATED;
1428  }
1429  collationSource->pos        = collationSource->writableBuffer;
1430  collationSource->origFlags  = collationSource->flags;
1431  collationSource->flags     |= UCOL_ITER_INNORMBUF;
1432  collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1433}*/
1434
1435
1436/* Incremental FCD check and normalize                                                    */
1437/*   Called from getNextCE when normalization state is suspect.                           */
1438/*   When entering, the state is known to be this:                                        */
1439/*      o   We are working in the main buffer of the collIterate, not the side            */
1440/*          writable buffer.  When in the side buffer, normalization mode is always off,  */
1441/*          so we won't get here.                                                         */
1442/*      o   The leading combining class from the current character is 0 or                */
1443/*          the trailing combining class of the previous char was zero.                   */
1444/*          True because the previous call to this function will have always exited       */
1445/*          that way, and we get called for every char where cc might be non-zero.        */
1446static
1447inline UBool collIterFCD(collIterate *collationSource) {
1448    const UChar *srcP, *endP;
1449    uint8_t     leadingCC;
1450    uint8_t     prevTrailingCC = 0;
1451    uint16_t    fcd;
1452    UBool       needNormalize = FALSE;
1453
1454    srcP = collationSource->pos-1;
1455
1456    if (collationSource->flags & UCOL_ITER_HASLEN) {
1457        endP = collationSource->endp;
1458    } else {
1459        endP = NULL;
1460    }
1461
1462    // Get the trailing combining class of the current character. If it's zero, we are OK.
1463    fcd = g_nfcImpl->nextFCD16(srcP, endP);
1464    if (fcd != 0) {
1465        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1466
1467        if (prevTrailingCC != 0) {
1468            // The current char has a non-zero trailing CC.  Scan forward until we find
1469            //   a char with a leading cc of zero.
1470            while (endP == NULL || srcP != endP)
1471            {
1472                const UChar *savedSrcP = srcP;
1473
1474                fcd = g_nfcImpl->nextFCD16(srcP, endP);
1475                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1476                if (leadingCC == 0) {
1477                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
1478                                           //   back up over it.  (Could be surrogate pair!)
1479                    break;
1480                }
1481
1482                if (leadingCC < prevTrailingCC) {
1483                    needNormalize = TRUE;
1484                }
1485
1486                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1487            }
1488        }
1489    }
1490
1491    collationSource->fcdPosition = (UChar *)srcP;
1492
1493    return needNormalize;
1494}
1495
1496/****************************************************************************/
1497/* Following are the CE retrieval functions                                 */
1498/*                                                                          */
1499/****************************************************************************/
1500
1501static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1502static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1503
/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
1508static
1509inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1510    uint32_t order = 0;
1511    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
1512        order = *(collationSource->toReturn++);                         /* if so, return them */
1513        if(collationSource->CEpos == collationSource->toReturn) {
1514            collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1515        }
1516        return order;
1517    }
1518
1519    UChar ch = 0;
1520    collationSource->offsetReturn = NULL;
1521
1522    do {
1523        for (;;)                           /* Loop handles case when incremental normalize switches   */
1524        {                                  /*   to or from the side buffer / original string, and we  */
1525            /*   need to start again to get the next character.        */
1526
1527            if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1528            {
1529                // The source string is null terminated and we're not working from the side buffer,
1530                //   and we're not normalizing.  This is the fast path.
1531                //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1532                ch = *collationSource->pos++;
1533                if (ch != 0) {
1534                    break;
1535                }
1536                else {
1537                    return UCOL_NO_MORE_CES;
1538                }
1539            }
1540
1541            if (collationSource->flags & UCOL_ITER_HASLEN) {
1542                // Normal path for strings when length is specified.
1543                //   (We can't be in side buffer because it is always null terminated.)
1544                if (collationSource->pos >= collationSource->endp) {
1545                    // Ran off of the end of the main source string.  We're done.
1546                    return UCOL_NO_MORE_CES;
1547                }
1548                ch = *collationSource->pos++;
1549            }
1550            else if(collationSource->flags & UCOL_USE_ITERATOR) {
1551                UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1552                if(iterCh == U_SENTINEL) {
1553                    return UCOL_NO_MORE_CES;
1554                }
1555                ch = (UChar)iterCh;
1556            }
1557            else
1558            {
1559                // Null terminated string.
1560                ch = *collationSource->pos++;
1561                if (ch == 0) {
1562                    // Ran off end of buffer.
1563                    if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1564                        // Ran off end of main string. backing up one character.
1565                        collationSource->pos--;
1566                        return UCOL_NO_MORE_CES;
1567                    }
1568                    else
1569                    {
1570                        // Hit null in the normalize side buffer.
1571                        // Usually this means the end of the normalized data,
1572                        // except for one odd case: a null followed by combining chars,
1573                        //   which is the case if we are at the start of the buffer.
1574                        if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
1575                            break;
1576                        }
1577
1578                        //  Null marked end of side buffer.
1579                        //   Revert to the main string and
1580                        //   loop back to top to try again to get a character.
1581                        collationSource->pos   = collationSource->fcdPosition;
1582                        collationSource->flags = collationSource->origFlags;
1583                        continue;
1584                    }
1585                }
1586            }
1587
1588            if(collationSource->flags&UCOL_HIRAGANA_Q) {
1589                /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1590                 * based on whether the previous codepoint was Hiragana or Katakana.
1591                 */
1592                if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1593                        ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1594                    collationSource->flags |= UCOL_WAS_HIRAGANA;
1595                } else {
1596                    collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1597                }
1598            }
1599
1600            // We've got a character.  See if there's any fcd and/or normalization stuff to do.
1601            //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1602            if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1603                break;
1604            }
1605
1606            if (collationSource->fcdPosition >= collationSource->pos) {
1607                // An earlier FCD check has already covered the current character.
1608                // We can go ahead and process this char.
1609                break;
1610            }
1611
1612            if (ch < ZERO_CC_LIMIT_ ) {
1613                // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
1614                break;
1615            }
1616
1617            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1618                // We need to peek at the next character in order to tell if we are FCD
1619                if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1620                    // We are at the last char of source string.
1621                    //  It is always OK for FCD check.
1622                    break;
1623                }
1624
1625                // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
1626                if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1627                    break;
1628                }
1629            }
1630
1631
1632            // Need a more complete FCD check and possible normalization.
1633            if (collIterFCD(collationSource)) {
1634                collIterNormalize(collationSource);
1635            }
1636            if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1637                //  No normalization was needed.  Go ahead and process the char we already had.
1638                break;
1639            }
1640
1641            // Some normalization happened.  Next loop iteration will pick up a char
1642            //   from the normalization buffer.
1643
1644        }   // end for (;;)
1645
1646
1647        if (ch <= 0xFF) {
1648            /*  For latin-1 characters we never need to fall back to the UCA table        */
1649            /*    because all of the UCA data is replicated in the latinOneMapping array  */
1650            order = coll->latinOneMapping[ch];
1651            if (order > UCOL_NOT_FOUND) {
1652                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1653            }
1654        }
1655        else
1656        {
1657            // Always use UCA for Han, Hangul
1658            // (Han extension A is before main Han block)
1659            // **** Han compatibility chars ?? ****
1660            if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
1661                (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
1662                if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
1663                    // between the two target ranges; do normal lookup
1664                    // **** this range is YI, Modifier tone letters, ****
1665                    // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
1666                    // **** Latin-D might be tailored, so we need to ****
1667                    // **** do the normal lookup for these guys.     ****
1668                    order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1669                } else {
1670                    // in one of the target ranges; use UCA
1671                    order = UCOL_NOT_FOUND;
1672                }
1673            } else {
1674                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1675            }
1676
1677            if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
1678                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
1679            }
1680
1681            if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
1682                /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1683                order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1684
1685                if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1686                    order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1687                }
1688            }
1689        }
1690    } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
1691
1692    if(order == UCOL_NOT_FOUND) {
1693        order = getImplicit(ch, collationSource);
1694    }
1695    return order; /* return the CE */
1696}
1697
1698/* ucol_getNextCE, out-of-line version for use from other files.   */
1699U_CAPI uint32_t  U_EXPORT2
1700ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1701    return ucol_IGetNextCE(coll, collationSource, status);
1702}
1703
1704
1705/**
1706* Incremental previous normalization happens here. Pick up the range of chars
1707* identifed by FCD, normalize it into the collIterate's writable buffer,
1708* switch the collIterate's state to use the writable buffer.
1709* @param data collation iterator data
1710*/
1711static
1712void collPrevIterNormalize(collIterate *data)
1713{
1714    UErrorCode status  = U_ZERO_ERROR;
1715    const UChar *pEnd   = data->pos;  /* End normalize + 1 */
1716    const UChar *pStart;
1717
1718    /* Start normalize */
1719    if (data->fcdPosition == NULL) {
1720        pStart = data->string;
1721    }
1722    else {
1723        pStart = data->fcdPosition + 1;
1724    }
1725
1726    int32_t normLen =
1727        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
1728                             data->writableBuffer,
1729                             status).
1730        length();
1731    if(U_FAILURE(status)) {
1732        return;
1733    }
1734    /*
1735    this puts the null termination infront of the normalized string instead
1736    of the end
1737    */
1738    data->writableBuffer.insert(0, (UChar)0);
1739
1740    /*
1741     * The usual case at this point is that we've got a base
1742     * character followed by marks that were normalized. If
1743     * fcdPosition is NULL, that means that we backed up to
1744     * the beginning of the string and there's no base character.
1745     *
1746     * Forward processing will usually normalize when it sees
1747     * the first mark, so that mark will get it's natural offset
1748     * and the rest will get the offset of the character following
1749     * the marks. The base character will also get its natural offset.
1750     *
1751     * We write the offset of the base character, if there is one,
1752     * followed by the offset of the first mark and then the offsets
1753     * of the rest of the marks.
1754     */
1755    int32_t firstMarkOffset = 0;
1756    int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
1757    int32_t trailCount      = normLen - 1;
1758
1759    if (data->fcdPosition != NULL) {
1760        int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1761        UChar   baseChar   = *data->fcdPosition;
1762
1763        firstMarkOffset = baseOffset + 1;
1764
1765        /*
1766         * If the base character is the start of a contraction, forward processing
1767         * will normalize the marks while checking for the contraction, which means
1768         * that the offset of the first mark will the same as the other marks.
1769         *
1770         * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1771         */
1772        if (baseChar >= 0x100) {
1773            uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1774
1775            if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1776                baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1777            }
1778
1779            if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1780                firstMarkOffset = trailOffset;
1781            }
1782        }
1783
1784        data->appendOffset(baseOffset, status);
1785    }
1786
1787    data->appendOffset(firstMarkOffset, status);
1788
1789    for (int32_t i = 0; i < trailCount; i += 1) {
1790        data->appendOffset(trailOffset, status);
1791    }
1792
1793    data->offsetRepeatValue = trailOffset;
1794
1795    data->offsetReturn = data->offsetStore - 1;
1796    if (data->offsetReturn == data->offsetBuffer) {
1797        data->offsetStore = data->offsetBuffer;
1798    }
1799
1800    data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1801    data->origFlags  = data->flags;
1802    data->flags     |= UCOL_ITER_INNORMBUF;
1803    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1804}
1805
1806
1807/**
1808* Incremental FCD check for previous iteration and normalize. Called from
1809* getPrevCE when normalization state is suspect.
1810* When entering, the state is known to be this:
1811* o  We are working in the main buffer of the collIterate, not the side
1812*    writable buffer. When in the side buffer, normalization mode is always
1813*    off, so we won't get here.
1814* o  The leading combining class from the current character is 0 or the
1815*    trailing combining class of the previous char was zero.
1816*    True because the previous call to this function will have always exited
1817*    that way, and we get called for every char where cc might be non-zero.
1818* @param data collation iterate struct
1819* @return normalization status, TRUE for normalization to be done, FALSE
1820*         otherwise
1821*/
1822static
1823inline UBool collPrevIterFCD(collIterate *data)
1824{
1825    const UChar *src, *start;
1826    uint8_t     leadingCC;
1827    uint8_t     trailingCC = 0;
1828    uint16_t    fcd;
1829    UBool       result = FALSE;
1830
1831    start = data->string;
1832    src = data->pos + 1;
1833
1834    /* Get the trailing combining class of the current character. */
1835    fcd = g_nfcImpl->previousFCD16(start, src);
1836
1837    leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1838
1839    if (leadingCC != 0) {
1840        /*
1841        The current char has a non-zero leading combining class.
1842        Scan backward until we find a char with a trailing cc of zero.
1843        */
1844        for (;;)
1845        {
1846            if (start == src) {
1847                data->fcdPosition = NULL;
1848                return result;
1849            }
1850
1851            fcd = g_nfcImpl->previousFCD16(start, src);
1852
1853            trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1854
1855            if (trailingCC == 0) {
1856                break;
1857            }
1858
1859            if (leadingCC < trailingCC) {
1860                result = TRUE;
1861            }
1862
1863            leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1864        }
1865    }
1866
1867    data->fcdPosition = (UChar *)src;
1868
1869    return result;
1870}
1871
1872/** gets a code unit from the string at a given offset
1873 *  Handles both normal and iterative cases.
1874 *  No error checking - caller beware!
1875 */
1876static inline
1877UChar peekCodeUnit(collIterate *source, int32_t offset) {
1878    if(source->pos != NULL) {
1879        return *(source->pos + offset);
1880    } else if(source->iterator != NULL) {
1881        UChar32 c;
1882        if(offset != 0) {
1883            source->iterator->move(source->iterator, offset, UITER_CURRENT);
1884            c = source->iterator->next(source->iterator);
1885            source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1886        } else {
1887            c = source->iterator->current(source->iterator);
1888        }
1889        return c >= 0 ? (UChar)c : 0xfffd;  // If the caller works properly, we should never see c<0.
1890    } else {
1891        return 0xfffd;
1892    }
1893}
1894
1895// Code point version. Treats the offset as a _code point_ delta.
1896// We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
1897// We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
1898static inline
1899UChar32 peekCodePoint(collIterate *source, int32_t offset) {
1900    UChar32 c;
1901    if(source->pos != NULL) {
1902        const UChar *p = source->pos;
1903        if(offset >= 0) {
1904            // Skip forward over (offset-1) code points.
1905            while(--offset >= 0) {
1906                if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
1907                    ++p;
1908                }
1909            }
1910            // Read the code point there.
1911            c = *p++;
1912            UChar trail;
1913            if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
1914                c = U16_GET_SUPPLEMENTARY(c, trail);
1915            }
1916        } else /* offset<0 */ {
1917            // Skip backward over (offset-1) code points.
1918            while(++offset < 0) {
1919                if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
1920                    --p;
1921                }
1922            }
1923            // Read the code point before that.
1924            c = *--p;
1925            UChar lead;
1926            if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
1927                c = U16_GET_SUPPLEMENTARY(lead, c);
1928            }
1929        }
1930    } else if(source->iterator != NULL) {
1931        if(offset >= 0) {
1932            // Skip forward over (offset-1) code points.
1933            int32_t fwd = offset;
1934            while(fwd-- > 0) {
1935                uiter_next32(source->iterator);
1936            }
1937            // Read the code point there.
1938            c = uiter_current32(source->iterator);
1939            // Return to the starting point, skipping backward over (offset-1) code points.
1940            while(offset-- > 0) {
1941                uiter_previous32(source->iterator);
1942            }
1943        } else /* offset<0 */ {
1944            // Read backward, reading offset code points, remember only the last-read one.
1945            int32_t back = offset;
1946            do {
1947                c = uiter_previous32(source->iterator);
1948            } while(++back < 0);
1949            // Return to the starting position, skipping forward over offset code points.
1950            do {
1951                uiter_next32(source->iterator);
1952            } while(++offset < 0);
1953        }
1954    } else {
1955        c = U_SENTINEL;
1956    }
1957    return c;
1958}
1959
1960/**
1961* Determines if we are at the start of the data string in the backwards
1962* collation iterator
1963* @param data collation iterator
1964* @return TRUE if we are at the start
1965*/
1966static
1967inline UBool isAtStartPrevIterate(collIterate *data) {
1968    if(data->pos == NULL && data->iterator != NULL) {
1969        return !data->iterator->hasPrevious(data->iterator);
1970    }
1971    //return (collIter_bos(data)) ||
1972    return (data->pos == data->string) ||
1973              ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) &&
1974              *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1975}
1976
1977static
1978inline void goBackOne(collIterate *data) {
1979# if 0
1980    // somehow, it looks like we need to keep iterator synced up
1981    // at all times, as above.
1982    if(data->pos) {
1983        data->pos--;
1984    }
1985    if(data->iterator) {
1986        data->iterator->previous(data->iterator);
1987    }
1988#endif
1989    if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1990        data->iterator->previous(data->iterator);
1991    }
1992    if(data->pos) {
1993        data->pos --;
1994    }
1995}
1996
1997/**
1998* Inline function that gets a simple CE.
1999* So what it does is that it will first check the expansion buffer. If the
2000* expansion buffer is not empty, ie the end pointer to the expansion buffer
2001* is different from the string pointer, we return the collation element at the
2002* return pointer and decrement it.
2003* For more complicated CEs it resorts to getComplicatedCE.
2004* @param coll collator data
2005* @param data collation iterator struct
2006* @param status error status
2007*/
2008static
2009inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
2010                               UErrorCode *status)
2011{
2012    uint32_t result = (uint32_t)UCOL_NULLORDER;
2013
2014    if (data->offsetReturn != NULL) {
2015        if (data->offsetRepeatCount > 0) {
2016                data->offsetRepeatCount -= 1;
2017        } else {
2018            if (data->offsetReturn == data->offsetBuffer) {
2019                data->offsetReturn = NULL;
2020                data->offsetStore  = data->offsetBuffer;
2021            } else {
2022                data->offsetReturn -= 1;
2023            }
2024        }
2025    }
2026
2027    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
2028            (!data->extendCEs && data->toReturn > data->CEs))
2029    {
2030        data->toReturn -= 1;
2031        result = *(data->toReturn);
2032        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
2033            data->CEpos = data->toReturn;
2034        }
2035    }
2036    else {
2037        UChar ch = 0;
2038
2039        do {
2040            /*
2041            Loop handles case when incremental normalize switches to or from the
2042            side buffer / original string, and we need to start again to get the
2043            next character.
2044            */
2045            for (;;) {
2046                if (data->flags & UCOL_ITER_HASLEN) {
2047                    /*
2048                    Normal path for strings when length is specified.
2049                    Not in side buffer because it is always null terminated.
2050                    */
2051                    if (data->pos <= data->string) {
2052                        /* End of the main source string */
2053                        return UCOL_NO_MORE_CES;
2054                    }
2055                    data->pos --;
2056                    ch = *data->pos;
2057                }
2058                // we are using an iterator to go back. Pray for us!
2059                else if (data->flags & UCOL_USE_ITERATOR) {
2060                  UChar32 iterCh = data->iterator->previous(data->iterator);
2061                  if(iterCh == U_SENTINEL) {
2062                    return UCOL_NO_MORE_CES;
2063                  } else {
2064                    ch = (UChar)iterCh;
2065                  }
2066                }
2067                else {
2068                    data->pos --;
2069                    ch = *data->pos;
2070                    /* we are in the side buffer. */
2071                    if (ch == 0) {
2072                        /*
2073                        At the start of the normalize side buffer.
2074                        Go back to string.
2075                        Because pointer points to the last accessed character,
2076                        hence we have to increment it by one here.
2077                        */
2078                        data->flags = data->origFlags;
2079                        data->offsetRepeatValue = 0;
2080
2081                         if (data->fcdPosition == NULL) {
2082                            data->pos = data->string;
2083                            return UCOL_NO_MORE_CES;
2084                        }
2085                        else {
2086                            data->pos   = data->fcdPosition + 1;
2087                        }
2088
2089                       continue;
2090                    }
2091                }
2092
2093                if(data->flags&UCOL_HIRAGANA_Q) {
2094                  if(ch>=0x3040 && ch<=0x309f) {
2095                    data->flags |= UCOL_WAS_HIRAGANA;
2096                  } else {
2097                    data->flags &= ~UCOL_WAS_HIRAGANA;
2098                  }
2099                }
2100
2101                /*
2102                * got a character to determine if there's fcd and/or normalization
2103                * stuff to do.
2104                * if the current character is not fcd.
2105                * if current character is at the start of the string
2106                * Trailing combining class == 0.
2107                * Note if pos is in the writablebuffer, norm is always 0
2108                */
2109                if (ch < ZERO_CC_LIMIT_ ||
2110                  // this should propel us out of the loop in the iterator case
2111                    (data->flags & UCOL_ITER_NORM) == 0 ||
2112                    (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
2113                    || data->string == data->pos) {
2114                    break;
2115                }
2116
2117                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2118                    /* if next character is FCD */
2119                    if (data->pos == data->string) {
2120                        /* First char of string is always OK for FCD check */
2121                        break;
2122                    }
2123
2124                    /* Not first char of string, do the FCD fast test */
2125                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2126                        break;
2127                    }
2128                }
2129
2130                /* Need a more complete FCD check and possible normalization. */
2131                if (collPrevIterFCD(data)) {
2132                    collPrevIterNormalize(data);
2133                }
2134
2135                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2136                    /*  No normalization. Go ahead and process the char. */
2137                    break;
2138                }
2139
2140                /*
2141                Some normalization happened.
2142                Next loop picks up a char from the normalization buffer.
2143                */
2144            }
2145
2146            /* attempt to handle contractions, after removal of the backwards
2147            contraction
2148            */
2149            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2150                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
2151            } else {
2152                if (ch <= 0xFF) {
2153                    result = coll->latinOneMapping[ch];
2154                }
2155                else {
2156                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
2157                    // **** [FA0E..FA2F] ?? ****
2158                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
2159                        (ch >= 0x3400 && ch <= 0xD7AF)) {
2160                        if (ch > 0x9FFF && ch < 0xAC00) {
2161                            // between the two target ranges; do normal lookup
2162                            // **** this range is YI, Modifier tone letters, ****
2163                            // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
2164                            // **** Latin-D might be tailored, so we need to ****
2165                            // **** do the normal lookup for these guys.     ****
2166                             result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2167                        } else {
2168                            result = UCOL_NOT_FOUND;
2169                        }
2170                    } else {
2171                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2172                    }
2173                }
2174                if (result > UCOL_NOT_FOUND) {
2175                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2176                }
2177                if (result == UCOL_NOT_FOUND) { // Not found in master list
2178                    if (!isAtStartPrevIterate(data) &&
2179                        ucol_contractionEndCP(ch, data->coll))
2180                    {
2181                        result = UCOL_CONTRACTION;
2182                    } else {
2183                        if(coll->UCA) {
2184                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
2185                        }
2186                    }
2187
2188                    if (result > UCOL_NOT_FOUND) {
2189                        if(coll->UCA) {
2190                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
2191                        }
2192                    }
2193                }
2194            }
2195        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
2196
2197        if(result == UCOL_NOT_FOUND) {
2198            result = getPrevImplicit(ch, data);
2199        }
2200    }
2201
2202    return result;
2203}
2204
2205
2206/*   ucol_getPrevCE, out-of-line version for use from other files.  */
2207U_CFUNC uint32_t  U_EXPORT2
2208ucol_getPrevCE(const UCollator *coll, collIterate *data,
2209                        UErrorCode *status) {
2210    return ucol_IGetPrevCE(coll, data, status);
2211}
2212
2213
2214/* this should be connected to special Jamo handling */
2215U_CFUNC uint32_t  U_EXPORT2
2216ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2217    collIterate colIt;
2218    IInit_collIterate(coll, &u, 1, &colIt, status);
2219    if(U_FAILURE(*status)) {
2220        return 0;
2221    }
2222    return ucol_IGetNextCE(coll, &colIt, status);
2223}
2224
2225/**
2226* Inserts the argument character into the end of the buffer pushing back the
2227* null terminator.
2228* @param data collIterate struct data
2229* @param ch character to be appended
2230* @return the position of the new addition
2231*/
2232static
2233inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2234{
2235    int32_t oldLength = data->writableBuffer.length();
2236    return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2237}
2238
2239/**
2240* Inserts the argument string into the end of the buffer pushing back the
2241* null terminator.
2242* @param data collIterate struct data
2243* @param string to be appended
2244* @param length of the string to be appended
2245* @return the position of the new addition
2246*/
2247static
2248inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
2249{
2250    int32_t oldLength = data->writableBuffer.length();
2251    return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
2252}
2253
2254/**
2255* Special normalization function for contraction in the forwards iterator.
2256* This normalization sequence will place the current character at source->pos
2257* and its following normalized sequence into the buffer.
2258* The fcd position, pos will be changed.
2259* pos will now point to positions in the buffer.
2260* Flags will be changed accordingly.
2261* @param data collation iterator data
2262*/
2263static
2264inline void normalizeNextContraction(collIterate *data)
2265{
2266    int32_t     strsize;
2267    UErrorCode  status     = U_ZERO_ERROR;
2268    /* because the pointer points to the next character */
2269    const UChar *pStart    = data->pos - 1;
2270    const UChar *pEnd;
2271
2272    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2273        data->writableBuffer.setTo(*(pStart - 1));
2274        strsize               = 1;
2275    }
2276    else {
2277        strsize = data->writableBuffer.length();
2278    }
2279
2280    pEnd = data->fcdPosition;
2281
2282    data->writableBuffer.append(
2283        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
2284    if(U_FAILURE(status)) {
2285        return;
2286    }
2287
2288    data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
2289    data->origFlags  = data->flags;
2290    data->flags     |= UCOL_ITER_INNORMBUF;
2291    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2292}
2293
2294/**
2295* Contraction character management function that returns the next character
2296* for the forwards iterator.
2297* Does nothing if the next character is in buffer and not the first character
2298* in it.
2299* Else it checks next character in data string to see if it is normalizable.
2300* If it is not, the character is simply copied into the buffer, else
2301* the whole normalized substring is copied into the buffer, including the
2302* current character.
2303* @param data collation element iterator data
2304* @return next character
2305*/
2306static
2307inline UChar getNextNormalizedChar(collIterate *data)
2308{
2309    UChar  nextch;
2310    UChar  ch;
2311    // Here we need to add the iterator code. One problem is the way
2312    // end of string is handled. If we just return next char, it could
2313    // be the sentinel. Most of the cases already check for this, but we
2314    // need to be sure.
2315    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2316         /* if no normalization and not in buffer. */
2317      if(data->flags & UCOL_USE_ITERATOR) {
2318         return (UChar)data->iterator->next(data->iterator);
2319      } else {
2320         return *(data->pos ++);
2321      }
2322    }
2323
2324    //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2325      //normalizeIterator(data);
2326    //}
2327
2328    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2329    if ((innormbuf && *data->pos != 0) ||
2330        (data->fcdPosition != NULL && !innormbuf &&
2331        data->pos < data->fcdPosition)) {
2332        /*
2333        if next character is in normalized buffer, no further normalization
2334        is required
2335        */
2336        return *(data->pos ++);
2337    }
2338
2339    if (data->flags & UCOL_ITER_HASLEN) {
2340        /* in data string */
2341        if (data->pos + 1 == data->endp) {
2342            return *(data->pos ++);
2343        }
2344    }
2345    else {
2346        if (innormbuf) {
2347          // inside the normalization buffer, but at the end
2348          // (since we encountered zero). This means, in the
2349          // case we're using char iterator, that we need to
2350          // do another round of normalization.
2351          //if(data->origFlags & UCOL_USE_ITERATOR) {
2352            // we need to restore original flags,
2353            // otherwise, we'll lose them
2354            //data->flags = data->origFlags;
2355            //normalizeIterator(data);
2356            //return *(data->pos++);
2357          //} else {
2358            /*
2359            in writable buffer, at this point fcdPosition can not be
2360            pointing to the end of the data string. see contracting tag.
2361            */
2362          if(data->fcdPosition) {
2363            if (*(data->fcdPosition + 1) == 0 ||
2364                data->fcdPosition + 1 == data->endp) {
2365                /* at the end of the string, dump it into the normalizer */
2366                data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
2367                // Check if data->pos received a null pointer
2368                if (data->pos == NULL) {
2369                    return (UChar)-1; // Return to indicate error.
2370                }
2371                return *(data->fcdPosition ++);
2372            }
2373            data->pos = data->fcdPosition;
2374          } else if(data->origFlags & UCOL_USE_ITERATOR) {
2375            // if we are here, we're using a normalizing iterator.
2376            // we should just continue further.
2377            data->flags = data->origFlags;
2378            data->pos = NULL;
2379            return (UChar)data->iterator->next(data->iterator);
2380          }
2381          //}
2382        }
2383        else {
2384            if (*(data->pos + 1) == 0) {
2385                return *(data->pos ++);
2386            }
2387        }
2388    }
2389
2390    ch = *data->pos ++;
2391    nextch = *data->pos;
2392
2393    /*
2394    * if the current character is not fcd.
2395    * Trailing combining class == 0.
2396    */
2397    if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2398        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2399         ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2400            /*
2401            Need a more complete FCD check and possible normalization.
2402            normalize substring will be appended to buffer
2403            */
2404        if (collIterFCD(data)) {
2405            normalizeNextContraction(data);
2406            return *(data->pos ++);
2407        }
2408        else if (innormbuf) {
2409            /* fcdposition shifted even when there's no normalization, if we
2410            don't input the rest into this, we'll get the wrong position when
2411            we reach the end of the writableBuffer */
2412            int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2413            data->pos = insertBufferEnd(data, data->pos - 1, length);
2414            // Check if data->pos received a null pointer
2415            if (data->pos == NULL) {
2416                return (UChar)-1; // Return to indicate error.
2417            }
2418            return *(data->pos ++);
2419        }
2420    }
2421
2422    if (innormbuf) {
2423        /*
2424        no normalization is to be done hence only one character will be
2425        appended to the buffer.
2426        */
2427        data->pos = insertBufferEnd(data, ch) + 1;
2428        // Check if data->pos received a null pointer
2429        if (data->pos == NULL) {
2430            return (UChar)-1; // Return to indicate error.
2431        }
2432    }
2433
2434    /* points back to the pos in string */
2435    return ch;
2436}
2437
2438
2439
2440/**
2441* Function to copy the buffer into writableBuffer and sets the fcd position to
2442* the correct position
2443* @param source data string source
2444* @param buffer character buffer
2445*/
2446static
2447inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
2448{
2449    /* okay confusing part here. to ensure that the skipped characters are
2450    considered later, we need to place it in the appropriate position in the
2451    normalization buffer and reassign the pos pointer. simple case if pos
2452    reside in string, simply copy to normalization buffer and
2453    fcdposition = pos, pos = start of normalization buffer. if pos in
2454    normalization buffer, we'll insert the copy infront of pos and point pos
2455    to the start of the normalization buffer. why am i doing these copies?
2456    well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2457    not require any changes, which be really painful. */
2458    if (source->flags & UCOL_ITER_INNORMBUF) {
2459        int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
2460        source->writableBuffer.replace(0, replaceLength, buffer);
2461    }
2462    else {
2463        source->fcdPosition  = source->pos;
2464        source->origFlags    = source->flags;
2465        source->flags       |= UCOL_ITER_INNORMBUF;
2466        source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2467        source->writableBuffer = buffer;
2468    }
2469
2470    source->pos = source->writableBuffer.getTerminatedBuffer();
2471}
2472
/**
* Tries to complete a discontiguous contraction, i.e. one whose characters are
* separated in the source by other combining marks.
*
* Characters are read with getNextNormalizedChar() and looked up in the
* contraction table. Characters that do not take part in the match are
* collected in a local buffer; when a match is found they are re-queued with
* setDiscontiguosAttribute() so that they are processed afterwards.
* If no match is found, the iterator state saved on entry is restored, the
* iterator is moved back by one and the CE stored for the table start is
* returned.
*
* Note this function will set the position to the appropriate places.
* @param coll current collator used; its contractionIndex, contractionCEs and
*             image tables are read
* @param source collation iterator data (position and flags may be changed)
* @param constart pointer to the start of the contraction table entry for the
*                 starting character; the element at constart itself is
*                 skipped when searching (see "skip the backward offset")
* @return the collation element found for the discontiguous contraction, or
*         the CE stored at the start of the contraction table entry
*/
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
          const UChar *temppos      = source->pos;   /* resume position for a partial (multi) match */
          UnicodeString buffer;                      /* characters skipped over during the search */
    const UChar   *tempconstart = constart;          /* contraction table currently searched */
          uint8_t  tempflags    = source->flags;     /* flags on entry, restored on failure */
          UBool    multicontraction = FALSE;         /* TRUE once a nested contraction table was entered */
          collIterateState discState;

          /* snapshot of the iterator, reloaded if nothing matches */
          backupState(source, &discState);

    /* seed the skipped-characters buffer with the code point at offset -1
       (presumably the character that was just read -- confirm against
       peekCodePoint) */
    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        /* Stop scanning when one of these holds:
           - length-delimited string and pos has reached endp;
           - the code unit at the current position is 0 and either we are not
             in the normalization buffer, or fcdPosition is NULL, equals endp,
             points to a 0, or points to a character with combining class 0;
           - the code point at the current position has combining class 0. */
        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
                 source->fcdPosition == NULL ||
                 source->fcdPosition == source->endp ||
                 *(source->fcdPosition) == 0 ||
                 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
                 /* end of string in a null-terminated string, or stopped by a
                 null character; note that fcdPosition does not always point to
                 a base character after the discontiguous change */
                 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
                 //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* a shorter match was recorded earlier: go back to just
                   before the remembered position, re-queue the skipped
                   characters and return the CE at the start of the nested
                   table */
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                                    (tempconstart - coll->contractionIndex));
            }
            /* no match: fall out of the loop to the common revert code */
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* linear search; relies on the table entries being in ascending order
           (NOTE(review): termination presumably depends on a terminating
           entry that is >= any schar -- confirm against the table format) */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct code point. we stuff the current code unit into
            the discontiguous buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* found in the table, but it has the same combining class as the
               code point at offset -2: it is skipped like a non-matching
               character (presumably because the match is blocked -- confirm) */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                                      (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
          break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction: continue in the nested table */
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            /* NOTE(review): this tests the CE at constart (the table passed
               in), not at the new tempconstart -- confirm this is intended */
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            /* complete match: re-queue the skipped characters and return */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}
2580
2581/* now uses Mark's getImplicitPrimary code */
2582static
2583inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2584    uint32_t r = uprv_uca_getImplicitPrimary(cp);
2585    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2586    collationSource->offsetRepeatCount += 1;
2587    return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2588}
2589
2590/**
2591* Inserts the argument character into the front of the buffer replacing the
2592* front null terminator.
2593* @param data collation element iterator data
2594* @param ch character to be appended
2595*/
2596static
2597inline void insertBufferFront(collIterate *data, UChar ch)
2598{
2599    data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
2600}
2601
2602/**
2603* Special normalization function for contraction in the previous iterator.
2604* This normalization sequence will place the current character at source->pos
2605* and its following normalized sequence into the buffer.
2606* The fcd position, pos will be changed.
2607* pos will now point to positions in the buffer.
2608* Flags will be changed accordingly.
2609* @param data collation iterator data
2610*/
2611static
2612inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2613{
2614    const UChar *pEnd = data->pos + 1;         /* End normalize + 1 */
2615    const UChar *pStart;
2616
2617    UnicodeString endOfBuffer;
2618    if (data->flags & UCOL_ITER_HASLEN) {
2619        /*
2620        normalization buffer not used yet, we'll pull down the next
2621        character into the end of the buffer
2622        */
2623        endOfBuffer.setTo(*pEnd);
2624    }
2625    else {
2626        endOfBuffer.setTo(data->writableBuffer, 1);  // after the leading NUL
2627    }
2628
2629    if (data->fcdPosition == NULL) {
2630        pStart = data->string;
2631    }
2632    else {
2633        pStart = data->fcdPosition + 1;
2634    }
2635    int32_t normLen =
2636        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
2637                             data->writableBuffer,
2638                             *status).
2639        length();
2640    if(U_FAILURE(*status)) {
2641        return;
2642    }
2643    /*
2644    this puts the null termination infront of the normalized string instead
2645    of the end
2646    */
2647    data->pos =
2648        data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
2649        1 + normLen;
2650    data->origFlags  = data->flags;
2651    data->flags     |= UCOL_ITER_INNORMBUF;
2652    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2653}
2654
/**
* Contraction character management function that returns the previous character
* for the backwards iterator.
* Does nothing beyond reading the character if normalization is off, or if the
* previous character is in the writable buffer and is not the buffer's leading
* null.
* Otherwise it looks at the previous character in the data string to see if
* it needs normalizing. If it does not, the character is simply copied to the
* front of the buffer (when already in the buffer); if it does, the whole
* normalized substring is placed into the buffer, including the current
* character.
* In the plain-string paths the character is returned without moving pos;
* pos is only reassigned by the buffer helpers (insertBufferFront,
* normalizePrevContraction).
* @param data collation element iterator data
* @param status error code; only passed on to normalizePrevContraction
* @return previous character
*/
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
    UChar  prevch;
    UChar  ch;
    const UChar *start;
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
        (innormbuf && *(data->pos - 1) != 0)) {
        /*
        if no normalization.
        if previous character is in normalized buffer, no further normalization
        is required
        */
      if(data->flags & UCOL_USE_ITERATOR) {
        /* step the character iterator back by one and read that unit with
           next() */
        data->iterator->move(data->iterator, -1, UITER_CURRENT);
        return (UChar)data->iterator->next(data->iterator);
      } else {
        return *(data->pos - 1);
      }
    }

    /* From here on: normalization is on and we are either in the data string
       or sitting right behind the leading null of the writable buffer. */
    start = data->pos;
    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
        /* in data string */
        if ((start - 1) == data->string) {
            /* previous character is the first one of the string: there is
               nothing in front of it to check against */
            return *(start - 1);
        }
        start --;
        ch     = *start;
        prevch = *(start - 1);
    }
    else {
        /*
        in writable buffer, at this point fcdPosition can not be NULL
        (the condition above already handled NULL).
        see contracting tag.
        */
        if (data->fcdPosition == data->string) {
            /* at the start of the string, just dump it into the normalizer */
            insertBufferFront(data, *(data->fcdPosition));
            data->fcdPosition = NULL;
            return *(data->pos - 1);
        }
        start  = data->fcdPosition;
        ch     = *start;
        prevch = *(start - 1);
    }
    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    * Note: in the writable-buffer branch above start == fcdPosition, so the
    * first operand below can only be true after the in-string branch.
    */
    if (data->fcdPosition > start &&
       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
    {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        const UChar *backuppos = data->pos;
        data->pos = start;
        if (collPrevIterFCD(data)) {
            normalizePrevContraction(data, status);
            return *(data->pos - 1);
        }
        /* FCD check passed: undo the temporary repositioning.
           NOTE(review): the fcdPosition increment presumably compensates for
           how collPrevIterFCD leaves fcdPosition -- confirm. */
        data->pos = backuppos;
        data->fcdPosition ++;
    }

    if (innormbuf) {
    /*
    no normalization is to be done hence only one character will be
    put at the front of the buffer.
    */
        insertBufferFront(data, ch);
        data->fcdPosition --;
    }

    return ch;
}
2747
2748/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2749/* It is called by getNextCE */
2750
2751/* The following should be even */
2752#define UCOL_MAX_DIGITS_FOR_NUMBER 254
2753
2754uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2755    collIterateState entryState;
2756    backupState(source, &entryState);
2757    UChar32 cp = ch;
2758
2759    for (;;) {
2760        // This loop will repeat only in the case of contractions, and only when a contraction
2761        //   is found and the first CE resulting from that contraction is itself a special
2762        //   (an expansion, for example.)  All other special CE types are fully handled the
2763        //   first time through, and the loop exits.
2764
2765        const uint32_t *CEOffset = NULL;
2766        switch(getCETag(CE)) {
2767        case NOT_FOUND_TAG:
2768            /* This one is not found, and we'll let somebody else bother about it... no more games */
2769            return CE;
2770        case SPEC_PROC_TAG:
2771            {
2772                // Special processing is getting a CE that is preceded by a certain prefix
2773                // Currently this is only needed for optimizing Japanese length and iteration marks.
2774                // When we encouter a special processing tag, we go backwards and try to see if
2775                // we have a match.
2776                // Contraction tables are used - so the whole process is not unlike contraction.
2777                // prefix data is stored backwards in the table.
2778                const UChar *UCharOffset;
2779                UChar schar, tchar;
2780                collIterateState prefixState;
2781                backupState(source, &prefixState);
2782                loadState(source, &entryState, TRUE);
2783                goBackOne(source); // We want to look at the point where we entered - actually one
2784                // before that...
2785
2786                for(;;) {
2787                    // This loop will run once per source string character, for as long as we
2788                    //  are matching a potential contraction sequence
2789
2790                    // First we position ourselves at the begining of contraction sequence
2791                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2792                    if (collIter_bos(source)) {
2793                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2794                        break;
2795                    }
2796                    schar = getPrevNormalizedChar(source, status);
2797                    goBackOne(source);
2798
2799                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2800                        UCharOffset++;
2801                    }
2802
2803                    if (schar == tchar) {
2804                        // Found the source string char in the table.
2805                        //  Pick up the corresponding CE from the table.
2806                        CE = *(coll->contractionCEs +
2807                            (UCharOffset - coll->contractionIndex));
2808                    }
2809                    else
2810                    {
2811                        // Source string char was not in the table.
2812                        //   We have not found the prefix.
2813                        CE = *(coll->contractionCEs +
2814                            (ContractionStart - coll->contractionIndex));
2815                    }
2816
2817                    if(!isPrefix(CE)) {
2818                        // The source string char was in the contraction table, and the corresponding
2819                        //   CE is not a prefix CE.  We found the prefix, break
2820                        //   out of loop, this CE will end up being returned.  This is the normal
2821                        //   way out of prefix handling when the source actually contained
2822                        //   the prefix.
2823                        break;
2824                    }
2825                }
2826                if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2827                    loadState(source, &prefixState, TRUE);
2828                    if(source->origFlags & UCOL_USE_ITERATOR) {
2829                        source->flags = source->origFlags;
2830                    }
2831                } else { // prefix search was a failure, we have to backup all the way to the start
2832                    loadState(source, &entryState, TRUE);
2833                }
2834                break;
2835            }
2836        case CONTRACTION_TAG:
2837            {
2838                /* This should handle contractions */
2839                collIterateState state;
2840                backupState(source, &state);
2841                uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2842                const UChar *UCharOffset;
2843                UChar schar, tchar;
2844
2845                for (;;) {
2846                    /* This loop will run once per source string character, for as long as we     */
2847                    /*  are matching a potential contraction sequence                  */
2848
2849                    /* First we position ourselves at the begining of contraction sequence */
2850                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2851
2852                    if (collIter_eos(source)) {
2853                        // Ran off the end of the source string.
2854                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2855                        // So we'll pick whatever we have at the point...
2856                        if (CE == UCOL_NOT_FOUND) {
2857                            // back up the source over all the chars we scanned going into this contraction.
2858                            CE = firstCE;
2859                            loadState(source, &state, TRUE);
2860                            if(source->origFlags & UCOL_USE_ITERATOR) {
2861                                source->flags = source->origFlags;
2862                            }
2863                        }
2864                        break;
2865                    }
2866
2867                    uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2868                    uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2869
2870                    schar = getNextNormalizedChar(source);
2871                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2872                        UCharOffset++;
2873                    }
2874
2875                    if (schar == tchar) {
2876                        // Found the source string char in the contraction table.
2877                        //  Pick up the corresponding CE from the table.
2878                        CE = *(coll->contractionCEs +
2879                            (UCharOffset - coll->contractionIndex));
2880                    }
2881                    else
2882                    {
2883                        // Source string char was not in contraction table.
2884                        //   Unless we have a discontiguous contraction, we have finished
2885                        //   with this contraction.
2886                        // in order to do the proper detection, we
2887                        // need to see if we're dealing with a supplementary
2888                        /* We test whether the next two char are surrogate pairs.
2889                        * This test is done if the iterator is not NULL.
2890                        * If there is no surrogate pair, the iterator
2891                        * goes back one if needed. */
2892                        UChar32 miss = schar;
2893                        if (source->iterator) {
2894                            UChar32 surrNextChar; /* the next char in the iteration to test */
2895                            int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2896                            if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2897                                prevPos = source->iterator->index;
2898                                surrNextChar = getNextNormalizedChar(source);
2899                                if (U16_IS_TRAIL(surrNextChar)) {
2900                                    miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2901                                } else if (prevPos < source->iterator->index){
2902                                    goBackOne(source);
2903                                }
2904                            }
2905                        } else if (U16_IS_LEAD(schar)) {
2906                            miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2907                        }
2908
2909                        uint8_t sCC;
2910                        if (miss < 0x300 ||
2911                            maxCC == 0 ||
2912                            (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2913                            sCC>maxCC ||
2914                            (allSame != 0 && sCC == maxCC) ||
2915                            collIter_eos(source))
2916                        {
2917                            //  Contraction can not be discontiguous.
2918                            goBackOne(source);  // back up the source string by one,
2919                            //  because  the character we just looked at was
2920                            //  not part of the contraction.   */
2921                            if(U_IS_SUPPLEMENTARY(miss)) {
2922                                goBackOne(source);
2923                            }
2924                            CE = *(coll->contractionCEs +
2925                                (ContractionStart - coll->contractionIndex));
2926                        } else {
2927                            //
2928                            // Contraction is possibly discontiguous.
2929                            //   Scan more of source string looking for a match
2930                            //
2931                            UChar tempchar;
2932                            /* find the next character if schar is not a base character
2933                            and we are not yet at the end of the string */
2934                            tempchar = getNextNormalizedChar(source);
2935                            // probably need another supplementary thingie here
2936                            goBackOne(source);
2937                            if (i_getCombiningClass(tempchar, coll) == 0) {
2938                                goBackOne(source);
2939                                if(U_IS_SUPPLEMENTARY(miss)) {
2940                                    goBackOne(source);
2941                                }
2942                                /* Spit out the last char of the string, wasn't tasty enough */
2943                                CE = *(coll->contractionCEs +
2944                                    (ContractionStart - coll->contractionIndex));
2945                            } else {
2946                                CE = getDiscontiguous(coll, source, ContractionStart);
2947                            }
2948                        }
2949                    } // else after if(schar == tchar)
2950
2951                    if(CE == UCOL_NOT_FOUND) {
2952                        /* The Source string did not match the contraction that we were checking.  */
2953                        /*  Back up the source position to undo the effects of having partially    */
2954                        /*   scanned through what ultimately proved to not be a contraction.       */
2955                        loadState(source, &state, TRUE);
2956                        CE = firstCE;
2957                        break;
2958                    }
2959
2960                    if(!isContraction(CE)) {
2961                        // The source string char was in the contraction table, and the corresponding
2962                        //   CE is not a contraction CE.  We completed the contraction, break
2963                        //   out of loop, this CE will end up being returned.  This is the normal
2964                        //   way out of contraction handling when the source actually contained
2965                        //   the contraction.
2966                        break;
2967                    }
2968
2969
2970                    // The source string char was in the contraction table, and the corresponding
2971                    //   CE is IS  a contraction CE.  We will continue looping to check the source
2972                    //   string for the remaining chars in the contraction.
2973                    uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2974                    if(tempCE != UCOL_NOT_FOUND) {
2975                        // We have scanned a a section of source string for which there is a
2976                        //  CE from the contraction table.  Remember the CE and scan position, so
2977                        //  that we can return to this point if further scanning fails to
2978                        //  match a longer contraction sequence.
2979                        firstCE = tempCE;
2980
2981                        goBackOne(source);
2982                        backupState(source, &state);
2983                        getNextNormalizedChar(source);
2984
2985                        // Another way to do this is:
2986                        //collIterateState tempState;
2987                        //backupState(source, &tempState);
2988                        //goBackOne(source);
2989                        //backupState(source, &state);
2990                        //loadState(source, &tempState, TRUE);
2991
2992                        // The problem is that for incomplete contractions we have to remember the previous
2993                        // position. Before, the only thing I needed to do was state.pos--;
2994                        // After iterator introduction and especially after introduction of normalizing
2995                        // iterators, it became much more difficult to decrease the saved state.
2996                        // I'm not yet sure which of the two methods above is faster.
2997                    }
2998                } // for(;;)
2999                break;
3000            } // case CONTRACTION_TAG:
3001        case LONG_PRIMARY_TAG:
3002            {
3003                *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3004                CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3005                source->offsetRepeatCount += 1;
3006                return CE;
3007            }
3008        case EXPANSION_TAG:
3009            {
3010                /* This should handle expansion. */
3011                /* NOTE: we can encounter both continuations and expansions in an expansion! */
3012                /* I have to decide where continuations are going to be dealt with */
3013                uint32_t size;
3014                uint32_t i;    /* general counter */
3015
3016                CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3017                size = getExpansionCount(CE);
3018                CE = *CEOffset++;
3019              //source->offsetRepeatCount = -1;
3020
3021                if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3022                    for(i = 1; i<size; i++) {
3023                        *(source->CEpos++) = *CEOffset++;
3024                        source->offsetRepeatCount += 1;
3025                    }
3026                } else { /* else, we do */
3027                    while(*CEOffset != 0) {
3028                        *(source->CEpos++) = *CEOffset++;
3029                        source->offsetRepeatCount += 1;
3030                    }
3031                }
3032
3033                return CE;
3034            }
3035        case DIGIT_TAG:
3036            {
3037                /*
3038                We do a check to see if we want to collate digits as numbers; if so we generate
3039                a custom collation key. Otherwise we pull out the value stored in the expansion table.
3040                */
3041                //uint32_t size;
3042                uint32_t i;    /* general counter */
3043
3044                if (source->coll->numericCollation == UCOL_ON){
3045                    collIterateState digitState = {0,0,0,0,0,0,0,0,0};
3046                    UChar32 char32 = 0;
3047                    int32_t digVal = 0;
3048
3049                    uint32_t digIndx = 0;
3050                    uint32_t endIndex = 0;
3051                    uint32_t trailingZeroIndex = 0;
3052
3053                    uint8_t collateVal = 0;
3054
3055                    UBool nonZeroValReached = FALSE;
3056
3057                    uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
3058                    /*
3059                         We parse the source string until we hit a char that's NOT a digit.
3060                        Use this u_charDigitValue. This might be slow because we have to
3061                        handle surrogates...
3062                    */
3063            /*
3064                    if (U16_IS_LEAD(ch)){
3065                      if (!collIter_eos(source)) {
3066                        backupState(source, &digitState);
3067                        UChar trail = getNextNormalizedChar(source);
3068                        if(U16_IS_TRAIL(trail)) {
3069                          char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3070                        } else {
3071                          loadState(source, &digitState, TRUE);
3072                          char32 = ch;
3073                        }
3074                      } else {
3075                        char32 = ch;
3076                      }
3077                    } else {
3078                      char32 = ch;
3079                    }
3080                    digVal = u_charDigitValue(char32);
3081            */
3082                    digVal = u_charDigitValue(cp); // if we have arrived here, we have
3083                    // already processed possible supplementaries that trigered the digit tag -
3084                    // all supplementaries are marked in the UCA.
3085                    /*
3086                        We  pad a zero in front of the first element anyways. This takes
3087                        care of the (probably) most common case where people are sorting things followed
3088                        by a single digit
3089                    */
3090                    digIndx++;
3091                    for(;;){
3092                        // Make sure we have enough space. No longer needed;
3093                        // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3094                        // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3095                        // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3096
3097                        // Skipping over leading zeroes.
3098                        if (digVal != 0) {
3099                            nonZeroValReached = TRUE;
3100                        }
3101                        if (nonZeroValReached) {
3102                            /*
3103                            We parse the digit string into base 100 numbers (this fits into a byte).
3104                            We only add to the buffer in twos, thus if we are parsing an odd character,
3105                            that serves as the 'tens' digit while the if we are parsing an even one, that
3106                            is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3107                            a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3108                            overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3109                            than all the other bytes.
3110                            */
3111
3112                            if (digIndx % 2 == 1){
3113                                collateVal += (uint8_t)digVal;
3114
3115                                // We don't enter the low-order-digit case unless we've already seen
3116                                // the high order, or for the first digit, which is always non-zero.
3117                                if (collateVal != 0)
3118                                    trailingZeroIndex = 0;
3119
3120                                numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3121                                collateVal = 0;
3122                            }
3123                            else{
3124                                // We drop the collation value into the buffer so if we need to do
3125                                // a "front patch" we don't have to check to see if we're hitting the
3126                                // last element.
3127                                collateVal = (uint8_t)(digVal * 10);
3128
3129                                // Check for trailing zeroes.
3130                                if (collateVal == 0)
3131                                {
3132                                    if (!trailingZeroIndex)
3133                                        trailingZeroIndex = (digIndx/2) + 2;
3134                                }
3135                                else
3136                                    trailingZeroIndex = 0;
3137
3138                                numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3139                            }
3140                            digIndx++;
3141                        }
3142
3143                        // Get next character.
3144                        if (!collIter_eos(source)){
3145                            ch = getNextNormalizedChar(source);
3146                            if (U16_IS_LEAD(ch)){
3147                                if (!collIter_eos(source)) {
3148                                    backupState(source, &digitState);
3149                                    UChar trail = getNextNormalizedChar(source);
3150                                    if(U16_IS_TRAIL(trail)) {
3151                                        char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3152                                    } else {
3153                                        loadState(source, &digitState, TRUE);
3154                                        char32 = ch;
3155                                    }
3156                                }
3157                            } else {
3158                                char32 = ch;
3159                            }
3160
3161                            if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3162                                // Resetting position to point to the next unprocessed char. We
3163                                // overshot it when doing our test/set for numbers.
3164                                if (char32 > 0xFFFF) { // For surrogates.
3165                                    loadState(source, &digitState, TRUE);
3166                                    //goBackOne(source);
3167                                }
3168                                goBackOne(source);
3169                                break;
3170                            }
3171                        } else {
3172                            break;
3173                        }
3174                    }
3175
3176                    if (nonZeroValReached == FALSE){
3177                        digIndx = 2;
3178                        numTempBuf[2] = 6;
3179                    }
3180
3181                    endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3182                    if (digIndx % 2 != 0){
3183                        /*
3184                        We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3185                        we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3186                        Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3187                        single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3188                        */
3189
3190                        for(i = 2; i < endIndex; i++){
3191                            numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3192                                (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3193                        }
3194                        --digIndx;
3195                    }
3196
3197                    // Subtract one off of the last byte.
3198                    numTempBuf[endIndex-1] -= 1;
3199
3200                    /*
3201                    We want to skip over the first two slots in the buffer. The first slot
3202                    is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3203                    sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3204                    */
3205                    numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3206                    numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3207
3208                    // Now transfer the collation key to our collIterate struct.
3209                    // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3210                    //size = ((endIndex+1) & ~1)/2;
3211                    CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3212                        (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3213                        UCOL_BYTE_COMMON; // Tertiary weight.
3214                    i = 2; // Reset the index into the buffer.
3215                    while(i < endIndex)
3216                    {
3217                        uint32_t primWeight = numTempBuf[i++] << 8;
3218                        if ( i < endIndex)
3219                            primWeight |= numTempBuf[i++];
3220                        *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3221                    }
3222
3223                } else {
3224                    // no numeric mode, we'll just switch to whatever we stashed and continue
3225                    CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3226                    CE = *CEOffset++;
3227                    break;
3228                }
3229                return CE;
3230            }
3231            /* various implicits optimization */
3232        case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3233            /* UCA is filled with these. Tailorings are NOT_FOUND */
3234            return getImplicit(cp, source);
3235        case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3236            // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3237            return getImplicit(cp, source);
3238        case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3239            {
3240                static const uint32_t
3241                    SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3242                //const uint32_t LCount = 19;
3243                static const uint32_t VCount = 21;
3244                static const uint32_t TCount = 28;
3245                //const uint32_t NCount = VCount * TCount;   // 588
3246                //const uint32_t SCount = LCount * NCount;   // 11172
3247                uint32_t L = ch - SBase;
3248
3249                // divide into pieces
3250
3251                uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3252                L /= TCount;
3253                uint32_t V = L % VCount;
3254                L /= VCount;
3255
3256                // offset them
3257
3258                L += LBase;
3259                V += VBase;
3260                T += TBase;
3261
3262                // return the first CE, but first put the rest into the expansion buffer
3263                if (!source->coll->image->jamoSpecial) { // FAST PATH
3264
3265                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3266                    if (T != TBase) {
3267                        *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3268                    }
3269
3270                    return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3271
3272                } else { // Jamo is Special
3273                    // Since Hanguls pass the FCD check, it is
3274                    // guaranteed that we won't be in
3275                    // the normalization buffer if something like this happens
3276
3277                    // However, if we are using a uchar iterator and normalization
3278                    // is ON, the Hangul that lead us here is going to be in that
3279                    // normalization buffer. Here we want to restore the uchar
3280                    // iterator state and pull out of the normalization buffer
3281                    if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3282                        source->flags = source->origFlags; // restore the iterator
3283                        source->pos = NULL;
3284                    }
3285
3286                    // Move Jamos into normalization buffer
3287                    UChar *buffer = source->writableBuffer.getBuffer(4);
3288                    int32_t bufferLength;
3289                    buffer[0] = (UChar)L;
3290                    buffer[1] = (UChar)V;
3291                    if (T != TBase) {
3292                        buffer[2] = (UChar)T;
3293                        bufferLength = 3;
3294                    } else {
3295                        bufferLength = 2;
3296                    }
3297                    source->writableBuffer.releaseBuffer(bufferLength);
3298
3299                    // Indicate where to continue in main input string after exhausting the writableBuffer
3300                    source->fcdPosition       = source->pos;
3301
3302                    source->pos   = source->writableBuffer.getTerminatedBuffer();
3303                    source->origFlags   = source->flags;
3304                    source->flags       |= UCOL_ITER_INNORMBUF;
3305                    source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3306
3307                    return(UCOL_IGNORABLE);
3308                }
3309            }
3310        case SURROGATE_TAG:
3311            /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3312            /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3313            /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3314            /* we treat it like an unassigned code point. */
3315            {
3316                UChar trail;
3317                collIterateState state;
3318                backupState(source, &state);
3319                if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3320                    // we chould have stepped one char forward and it might have turned that it
3321                    // was not a trail surrogate. In that case, we have to backup.
3322                    loadState(source, &state, TRUE);
3323                    return UCOL_NOT_FOUND;
3324                } else {
3325                    /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3326                    CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3327                    if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3328                        // We need to backup
3329                        loadState(source, &state, TRUE);
3330                        return CE;
3331                    }
3332                    // calculate the supplementary code point value, if surrogate was not tailored
3333                    cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3334                }
3335            }
3336            break;
3337        case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3338            UChar nextChar;
3339            if( source->flags & UCOL_USE_ITERATOR) {
3340                if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3341                    cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3342                    source->iterator->next(source->iterator);
3343                    return getImplicit(cp, source);
3344                }
3345            } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3346                      U_IS_TRAIL((nextChar=*source->pos))) {
3347                cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3348                source->pos++;
3349                return getImplicit(cp, source);
3350            }
3351            return UCOL_NOT_FOUND;
3352        case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3353            return UCOL_NOT_FOUND; /* broken surrogate sequence */
3354        case CHARSET_TAG:
3355            /* not yet implemented */
3356            /* probably after 1.8 */
3357            return UCOL_NOT_FOUND;
3358        default:
3359            *status = U_INTERNAL_PROGRAM_ERROR;
3360            CE=0;
3361            break;
3362    }
3363    if (CE <= UCOL_NOT_FOUND) break;
3364  }
3365  return CE;
3366}
3367
3368
3369/* now uses Mark's getImplicitPrimary code */
3370static
3371inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3372    uint32_t r = uprv_uca_getImplicitPrimary(cp);
3373
3374    *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3375    collationSource->toReturn = collationSource->CEpos;
3376
3377    // **** doesn't work if using iterator ****
3378    if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3379        collationSource->offsetRepeatCount = 1;
3380    } else {
3381        int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3382
3383        UErrorCode errorCode = U_ZERO_ERROR;
3384        collationSource->appendOffset(firstOffset, errorCode);
3385        collationSource->appendOffset(firstOffset + 1, errorCode);
3386
3387        collationSource->offsetReturn = collationSource->offsetStore - 1;
3388        *(collationSource->offsetBuffer) = firstOffset;
3389        if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3390            collationSource->offsetStore = collationSource->offsetBuffer;
3391        }
3392    }
3393
3394    return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3395}
3396
3397/**
3398 * This function handles the special CEs like contractions, expansions,
3399 * surrogates, Thai.
3400 * It is called by getPrevCE.
3401 */
3402uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3403                          collIterate *source,
3404                          UErrorCode *status)
3405{
3406    const uint32_t *CEOffset    = NULL;
3407          UChar    *UCharOffset = NULL;
3408          UChar    schar;
3409    const UChar    *constart    = NULL;
3410          uint32_t size;
3411          UChar    buffer[UCOL_MAX_BUFFER];
3412          uint32_t *endCEBuffer;
3413          UChar   *strbuffer;
3414          int32_t noChars = 0;
3415          int32_t CECount = 0;
3416
3417    for(;;)
3418    {
3419        /* the only ces that loops are thai and contractions */
3420        switch (getCETag(CE))
3421        {
3422        case NOT_FOUND_TAG:  /* this tag always returns */
3423            return CE;
3424
3425        case SPEC_PROC_TAG:
3426            {
3427                // Special processing is getting a CE that is preceded by a certain prefix
3428                // Currently this is only needed for optimizing Japanese length and iteration marks.
3429                // When we encouter a special processing tag, we go backwards and try to see if
3430                // we have a match.
3431                // Contraction tables are used - so the whole process is not unlike contraction.
3432                // prefix data is stored backwards in the table.
3433                const UChar *UCharOffset;
3434                UChar schar, tchar;
3435                collIterateState prefixState;
3436                backupState(source, &prefixState);
3437                for(;;) {
3438                    // This loop will run once per source string character, for as long as we
3439                    //  are matching a potential contraction sequence
3440
3441                    // First we position ourselves at the begining of contraction sequence
3442                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3443
3444                    if (collIter_bos(source)) {
3445                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3446                        break;
3447                    }
3448                    schar = getPrevNormalizedChar(source, status);
3449                    goBackOne(source);
3450
3451                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3452                        UCharOffset++;
3453                    }
3454
3455                    if (schar == tchar) {
3456                        // Found the source string char in the table.
3457                        //  Pick up the corresponding CE from the table.
3458                        CE = *(coll->contractionCEs +
3459                            (UCharOffset - coll->contractionIndex));
3460                    }
3461                    else
3462                    {
3463                        // if there is a completely ignorable code point in the middle of
3464                        // a prefix, we need to act as if it's not there
3465                        // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3466                        // lone surrogates cannot be set to zero as it would break other processing
3467                        uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3468                        // it's easy for BMP code points
3469                        if(isZeroCE == 0) {
3470                            continue;
3471                        } else if(U16_IS_SURROGATE(schar)) {
3472                            // for supplementary code points, we have to check the next one
3473                            // situations where we are going to ignore
3474                            // 1. beginning of the string: schar is a lone surrogate
3475                            // 2. schar is a lone surrogate
3476                            // 3. schar is a trail surrogate in a valid surrogate sequence
3477                            //    that is explicitly set to zero.
3478                            if (!collIter_bos(source)) {
3479                                UChar lead;
3480                                if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3481                                    isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3482                                    if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
3483                                        uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3484                                        if(finalCE == 0) {
3485                                            // this is a real, assigned completely ignorable code point
3486                                            goBackOne(source);
3487                                            continue;
3488                                        }
3489                                    }
3490                                } else {
3491                                    // lone surrogate, treat like unassigned
3492                                    return UCOL_NOT_FOUND;
3493                                }
3494                            } else {
3495                                // lone surrogate at the beggining, treat like unassigned
3496                                return UCOL_NOT_FOUND;
3497                            }
3498                        }
3499                        // Source string char was not in the table.
3500                        //   We have not found the prefix.
3501                        CE = *(coll->contractionCEs +
3502                            (ContractionStart - coll->contractionIndex));
3503                    }
3504
3505                    if(!isPrefix(CE)) {
3506                        // The source string char was in the contraction table, and the corresponding
3507                        //   CE is not a prefix CE.  We found the prefix, break
3508                        //   out of loop, this CE will end up being returned.  This is the normal
3509                        //   way out of prefix handling when the source actually contained
3510                        //   the prefix.
3511                        break;
3512                    }
3513                }
3514                loadState(source, &prefixState, TRUE);
3515                break;
3516            }
3517
3518        case CONTRACTION_TAG: {
3519            /* to ensure that the backwards and forwards iteration matches, we
3520            take the current region of most possible match and pass it through
3521            the forward iteration. this will ensure that the obstinate problem of
3522            overlapping contractions will not occur.
3523            */
3524            schar = peekCodeUnit(source, 0);
3525            constart = (UChar *)coll->image + getContractOffset(CE);
3526            if (isAtStartPrevIterate(source)
3527                /* commented away contraction end checks after adding the checks
3528                in getPrevCE  */) {
3529                    /* start of string or this is not the end of any contraction */
3530                    CE = *(coll->contractionCEs +
3531                        (constart - coll->contractionIndex));
3532                    break;
3533            }
3534            strbuffer = buffer;
3535            UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3536            *(UCharOffset --) = 0;
3537            noChars = 0;
3538            // have to swap thai characters
3539            while (ucol_unsafeCP(schar, coll)) {
3540                *(UCharOffset) = schar;
3541                noChars++;
3542                UCharOffset --;
3543                schar = getPrevNormalizedChar(source, status);
3544                goBackOne(source);
3545                // TODO: when we exhaust the contraction buffer,
3546                // it needs to get reallocated. The problem is
3547                // that the size depends on the string which is
3548                // not iterated over. However, since we're travelling
3549                // backwards, we already had to set the iterator at
3550                // the end - so we might as well know where we are?
3551                if (UCharOffset + 1 == buffer) {
3552                    /* we have exhausted the buffer */
3553                    int32_t newsize = 0;
3554                    if(source->pos) { // actually dealing with a position
3555                        newsize = (int32_t)(source->pos - source->string + 1);
3556                    } else { // iterator
3557                        newsize = 4 * UCOL_MAX_BUFFER;
3558                    }
3559                    strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3560                        (newsize + UCOL_MAX_BUFFER));
3561                    /* test for NULL */
3562                    if (strbuffer == NULL) {
3563                        *status = U_MEMORY_ALLOCATION_ERROR;
3564                        return UCOL_NO_MORE_CES;
3565                    }
3566                    UCharOffset = strbuffer + newsize;
3567                    uprv_memcpy(UCharOffset, buffer,
3568                        UCOL_MAX_BUFFER * sizeof(UChar));
3569                    UCharOffset --;
3570                }
3571                if ((source->pos && (source->pos == source->string ||
3572                    ((source->flags & UCOL_ITER_INNORMBUF) &&
3573                    *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3574                    || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3575                        break;
3576                }
3577            }
3578            /* adds the initial base character to the string */
3579            *(UCharOffset) = schar;
3580            noChars++;
3581
3582            int32_t offsetBias;
3583
3584            // **** doesn't work if using iterator ****
3585            if (source->flags & UCOL_ITER_INNORMBUF) {
3586                offsetBias = -1;
3587            } else {
3588                offsetBias = (int32_t)(source->pos - source->string);
3589            }
3590
3591            /* a new collIterate is used to simplify things, since using the current
3592            collIterate will mean that the forward and backwards iteration will
3593            share and change the same buffers. we don't want to get into that. */
3594            collIterate temp;
3595            int32_t rawOffset;
3596
3597            IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3598            if(U_FAILURE(*status)) {
3599                return (uint32_t)UCOL_NULLORDER;
3600            }
3601            temp.flags &= ~UCOL_ITER_NORM;
3602            temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3603
3604            rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
3605            CE = ucol_IGetNextCE(coll, &temp, status);
3606
3607            if (source->extendCEs) {
3608                endCEBuffer = source->extendCEs + source->extendCEsSize;
3609                CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
3610            } else {
3611                endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3612                CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
3613            }
3614
3615            while (CE != UCOL_NO_MORE_CES) {
3616                *(source->CEpos ++) = CE;
3617
3618                if (offsetBias >= 0) {
3619                    source->appendOffset(rawOffset + offsetBias, *status);
3620                }
3621
3622                CECount++;
3623                if (source->CEpos == endCEBuffer) {
3624                    /* ran out of CE space, reallocate to new buffer.
3625                    If reallocation fails, reset pointers and bail out,
3626                    there's no guarantee of the right character position after
3627                    this bail*/
3628                    if (!increaseCEsCapacity(source)) {
3629                        *status = U_MEMORY_ALLOCATION_ERROR;
3630                        break;
3631                    }
3632
3633                    endCEBuffer = source->extendCEs + source->extendCEsSize;
3634                }
3635
3636                if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3637                    rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3638                } else {
3639                    rawOffset = (int32_t)(temp.pos - temp.string);
3640                }
3641
3642                CE = ucol_IGetNextCE(coll, &temp, status);
3643            }
3644
3645            if (strbuffer != buffer) {
3646                uprv_free(strbuffer);
3647            }
3648            if (U_FAILURE(*status)) {
3649                return (uint32_t)UCOL_NULLORDER;
3650            }
3651
3652            if (source->offsetRepeatValue != 0) {
3653                if (CECount > noChars) {
3654                    source->offsetRepeatCount += temp.offsetRepeatCount;
3655                } else {
3656                    // **** does this really skip the right offsets? ****
3657                    source->offsetReturn -= (noChars - CECount);
3658                }
3659            }
3660
3661            if (offsetBias >= 0) {
3662                source->offsetReturn = source->offsetStore - 1;
3663                if (source->offsetReturn == source->offsetBuffer) {
3664                    source->offsetStore = source->offsetBuffer;
3665                }
3666            }
3667
3668            source->toReturn = source->CEpos - 1;
3669            if (source->toReturn == source->CEs) {
3670                source->CEpos = source->CEs;
3671            }
3672
3673            return *(source->toReturn);
3674        }
3675        case LONG_PRIMARY_TAG:
3676            {
3677                *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3678                *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3679                source->toReturn = source->CEpos - 1;
3680
3681                if (source->flags & UCOL_ITER_INNORMBUF) {
3682                    source->offsetRepeatCount = 1;
3683                } else {
3684                    int32_t firstOffset = (int32_t)(source->pos - source->string);
3685
3686                    source->appendOffset(firstOffset, *status);
3687                    source->appendOffset(firstOffset + 1, *status);
3688
3689                    source->offsetReturn = source->offsetStore - 1;
3690                    *(source->offsetBuffer) = firstOffset;
3691                    if (source->offsetReturn == source->offsetBuffer) {
3692                        source->offsetStore = source->offsetBuffer;
3693                    }
3694                }
3695
3696
3697                return *(source->toReturn);
3698            }
3699
3700        case EXPANSION_TAG: /* this tag always returns */
3701            {
3702            /*
3703            This should handle expansion.
3704            NOTE: we can encounter both continuations and expansions in an expansion!
3705            I have to decide where continuations are going to be dealt with
3706            */
3707            int32_t firstOffset = (int32_t)(source->pos - source->string);
3708
3709            // **** doesn't work if using iterator ****
3710            if (source->offsetReturn != NULL) {
3711                if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3712                    source->offsetStore = source->offsetBuffer;
3713                }else {
3714                  firstOffset = -1;
3715                }
3716            }
3717
3718            /* find the offset to expansion table */
3719            CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3720            size     = getExpansionCount(CE);
3721            if (size != 0) {
3722                /*
3723                if there are less than 16 elements in expansion, we don't terminate
3724                */
3725                uint32_t count;
3726
3727                for (count = 0; count < size; count++) {
3728                    *(source->CEpos ++) = *CEOffset++;
3729
3730                    if (firstOffset >= 0) {
3731                        source->appendOffset(firstOffset + 1, *status);
3732                    }
3733                }
3734            } else {
3735                /* else, we do */
3736                while (*CEOffset != 0) {
3737                    *(source->CEpos ++) = *CEOffset ++;
3738
3739                    if (firstOffset >= 0) {
3740                        source->appendOffset(firstOffset + 1, *status);
3741                    }
3742                }
3743            }
3744
3745            if (firstOffset >= 0) {
3746                source->offsetReturn = source->offsetStore - 1;
3747                *(source->offsetBuffer) = firstOffset;
3748                if (source->offsetReturn == source->offsetBuffer) {
3749                    source->offsetStore = source->offsetBuffer;
3750                }
3751            } else {
3752                source->offsetRepeatCount += size - 1;
3753            }
3754
3755            source->toReturn = source->CEpos - 1;
3756            // in case of one element expansion, we
3757            // want to immediately return CEpos
3758            if(source->toReturn == source->CEs) {
3759                source->CEpos = source->CEs;
3760            }
3761
3762            return *(source->toReturn);
3763            }
3764
3765        case DIGIT_TAG:
3766            {
3767                /*
3768                We do a check to see if we want to collate digits as numbers; if so we generate
3769                a custom collation key. Otherwise we pull out the value stored in the expansion table.
3770                */
3771                uint32_t i;    /* general counter */
3772
3773                if (source->coll->numericCollation == UCOL_ON){
3774                    uint32_t digIndx = 0;
3775                    uint32_t endIndex = 0;
3776                    uint32_t leadingZeroIndex = 0;
3777                    uint32_t trailingZeroCount = 0;
3778
3779                    uint8_t collateVal = 0;
3780
3781                    UBool nonZeroValReached = FALSE;
3782
3783                    uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3784                    /*
3785                    We parse the source string until we hit a char that's NOT a digit.
3786                    Use this u_charDigitValue. This might be slow because we have to
3787                    handle surrogates...
3788                    */
3789                    /*
3790                    We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3791                    with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3792                    element we process when going backward. To determine how long that chunk might be, we may need to make
3793                    two passes through the loop that collects digits - one to see how long the string is (and how much is
3794                    leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3795                    more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3796                    element chunk after resetting the state to the initialState at the right side of the digit string.
3797                    */
3798                    uint32_t ceLimit = 0;
3799                    UChar initial_ch = ch;
3800                    collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3801                    backupState(source, &initialState);
3802
3803                    for(;;) {
3804                        collIterateState state = {0,0,0,0,0,0,0,0,0};
3805                        UChar32 char32 = 0;
3806                        int32_t digVal = 0;
3807
3808                        if (U16_IS_TRAIL (ch)) {
3809                            if (!collIter_bos(source)){
3810                                UChar lead = getPrevNormalizedChar(source, status);
3811                                if(U16_IS_LEAD(lead)) {
3812                                    char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3813                                    goBackOne(source);
3814                                } else {
3815                                    char32 = ch;
3816                                }
3817                            } else {
3818                                char32 = ch;
3819                            }
3820                        } else {
3821                            char32 = ch;
3822                        }
3823                        digVal = u_charDigitValue(char32);
3824
3825                        for(;;) {
3826                            // Make sure we have enough space. No longer needed;
3827                            // at this point the largest value of digIndx when we need to save data in numTempBuf
3828                            // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3829                            // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3830
3831                            // Skip over trailing zeroes, and keep a count of them.
3832                            if (digVal != 0)
3833                                nonZeroValReached = TRUE;
3834
3835                            if (nonZeroValReached) {
3836                                /*
3837                                We parse the digit string into base 100 numbers (this fits into a byte).
3838                                We only add to the buffer in twos, thus if we are parsing an odd character,
3839                                that serves as the 'tens' digit while the if we are parsing an even one, that
3840                                is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3841                                a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3842                                overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3843                                than all the other bytes.
3844
3845                                Since we're doing in this reverse we want to put the first digit encountered into the
3846                                ones place and the second digit encountered into the tens place.
3847                                */
3848
3849                                if ((digIndx + trailingZeroCount) % 2 == 1) {
3850                                    // High-order digit case (tens place)
3851                                    collateVal += (uint8_t)(digVal * 10);
3852
3853                                    // We cannot set leadingZeroIndex unless it has been set for the
3854                                    // low-order digit. Therefore, all we can do for the high-order
3855                                    // digit is turn it off, never on.
3856                                    // The only time we will have a high digit without a low is for
3857                                    // the very first non-zero digit, so no zero check is necessary.
3858                                    if (collateVal != 0)
3859                                        leadingZeroIndex = 0;
3860
3861                                    // The first pass through, digIndx may exceed the limit, but in that case
3862                                    // we no longer care about numTempBuf contents since they will be discarded
3863                                    if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3864                                        numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3865                                    }
3866                                    collateVal = 0;
3867                                } else {
3868                                    // Low-order digit case (ones place)
3869                                    collateVal = (uint8_t)digVal;
3870
3871                                    // Check for leading zeroes.
3872                                    if (collateVal == 0) {
3873                                        if (!leadingZeroIndex)
3874                                            leadingZeroIndex = (digIndx/2) + 2;
3875                                    } else
3876                                        leadingZeroIndex = 0;
3877
3878                                    // No need to write to buffer; the case of a last odd digit
3879                                    // is handled below.
3880                                }
3881                                ++digIndx;
3882                            } else
3883                                ++trailingZeroCount;
3884
3885                            if (!collIter_bos(source)) {
3886                                ch = getPrevNormalizedChar(source, status);
3887                                //goBackOne(source);
3888                                if (U16_IS_TRAIL(ch)) {
3889                                    backupState(source, &state);
3890                                    if (!collIter_bos(source)) {
3891                                        goBackOne(source);
3892                                        UChar lead = getPrevNormalizedChar(source, status);
3893
3894                                        if(U16_IS_LEAD(lead)) {
3895                                            char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3896                                        } else {
3897                                            loadState(source, &state, FALSE);
3898                                            char32 = ch;
3899                                        }
3900                                    }
3901                                } else
3902                                    char32 = ch;
3903
3904                                if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3905                                    if (char32 > 0xFFFF) {// For surrogates.
3906                                        loadState(source, &state, FALSE);
3907                                    }
3908                                    // Don't need to "reverse" the goBackOne call,
3909                                    // as this points to the next position to process..
3910                                    //if (char32 > 0xFFFF) // For surrogates.
3911                                    //getNextNormalizedChar(source);
3912                                    break;
3913                                }
3914
3915                                goBackOne(source);
3916                            }else
3917                                break;
3918                        }
3919
3920                        if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3921                            // our collation element is not too big, go ahead and finish with it
3922                            break;
3923                        }
3924                        // our digit string is too long for a collation element;
3925                        // set the limit for it, reset the state and begin again
3926                        ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3927                        if ( ceLimit == 0 ) {
3928                            ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3929                        }
3930                        ch = initial_ch;
3931                        loadState(source, &initialState, FALSE);
3932                        digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
3933                        collateVal = 0;
3934                        nonZeroValReached = FALSE;
3935                    }
3936
3937                    if (! nonZeroValReached) {
3938                        digIndx = 2;
3939                        trailingZeroCount = 0;
3940                        numTempBuf[2] = 6;
3941                    }
3942
3943                    if ((digIndx + trailingZeroCount) % 2 != 0) {
3944                        numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3945                        digIndx += 1;       // The implicit leading zero
3946                    }
3947                    if (trailingZeroCount % 2 != 0) {
3948                        // We had to consume one trailing zero for the low digit
3949                        // of the least significant byte
3950                        digIndx += 1;       // The trailing zero not in the exponent
3951                        trailingZeroCount -= 1;
3952                    }
3953
3954                    endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3955
3956                    // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3957                    numTempBuf[2] -= 1;
3958
3959                    /*
3960                    We want to skip over the first two slots in the buffer. The first slot
3961                    is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3962                    sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3963                    The exponent must be adjusted by the number of leading zeroes, and the number of
3964                    trailing zeroes.
3965                    */
3966                    numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3967                    uint32_t exponent = (digIndx+trailingZeroCount)/2;
3968                    if (leadingZeroIndex)
3969                        exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3970                    numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3971
3972                    // Now transfer the collation key to our collIterate struct.
3973                    // The total size for our collation key is half of endIndex, rounded up.
3974                    int32_t size = (endIndex+1)/2;
3975                    if(!ensureCEsCapacity(source, size)) {
3976                        return (uint32_t)UCOL_NULLORDER;
3977                    }
3978                    *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3979                        (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3980                        UCOL_BYTE_COMMON; // Tertiary weight.
3981                    i = endIndex - 1; // Reset the index into the buffer.
3982                    while(i >= 2) {
3983                        uint32_t primWeight = numTempBuf[i--] << 8;
3984                        if ( i >= 2)
3985                            primWeight |= numTempBuf[i--];
3986                        *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3987                    }
3988
3989                    source->toReturn = source->CEpos -1;
3990                    return *(source->toReturn);
3991                } else {
3992                    CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3993                    CE = *(CEOffset++);
3994                    break;
3995                }
3996            }
3997
3998        case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3999            {
4000                static const uint32_t
4001                    SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
4002                //const uint32_t LCount = 19;
4003                static const uint32_t VCount = 21;
4004                static const uint32_t TCount = 28;
4005                //const uint32_t NCount = VCount * TCount;   /* 588 */
4006                //const uint32_t SCount = LCount * NCount;   /* 11172 */
4007
4008                uint32_t L = ch - SBase;
4009                /*
4010                divide into pieces.
4011                we do it in this order since some compilers can do % and / in one
4012                operation
4013                */
4014                uint32_t T = L % TCount;
4015                L /= TCount;
4016                uint32_t V = L % VCount;
4017                L /= VCount;
4018
4019                /* offset them */
4020                L += LBase;
4021                V += VBase;
4022                T += TBase;
4023
4024                int32_t firstOffset = (int32_t)(source->pos - source->string);
4025                source->appendOffset(firstOffset, *status);
4026
4027                /*
4028                 * return the first CE, but first put the rest into the expansion buffer
4029                 */
4030                if (!source->coll->image->jamoSpecial) {
4031                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
4032                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
4033                    source->appendOffset(firstOffset + 1, *status);
4034
4035                    if (T != TBase) {
4036                        *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
4037                        source->appendOffset(firstOffset + 1, *status);
4038                    }
4039
4040                    source->toReturn = source->CEpos - 1;
4041
4042                    source->offsetReturn = source->offsetStore - 1;
4043                    if (source->offsetReturn == source->offsetBuffer) {
4044                        source->offsetStore = source->offsetBuffer;
4045                    }
4046
4047                    return *(source->toReturn);
4048                } else {
4049                    // Since Hanguls pass the FCD check, it is
4050                    // guaranteed that we won't be in
4051                    // the normalization buffer if something like this happens
4052
4053                    // Move Jamos into normalization buffer
4054                    UChar *tempbuffer = source->writableBuffer.getBuffer(5);
4055                    int32_t tempbufferLength, jamoOffset;
4056                    tempbuffer[0] = 0;
4057                    tempbuffer[1] = (UChar)L;
4058                    tempbuffer[2] = (UChar)V;
4059                    if (T != TBase) {
4060                        tempbuffer[3] = (UChar)T;
4061                        tempbufferLength = 4;
4062                    } else {
4063                        tempbufferLength = 3;
4064                    }
4065                    source->writableBuffer.releaseBuffer(tempbufferLength);
4066
4067                    // Indicate where to continue in main input string after exhausting the writableBuffer
4068                    if (source->pos  == source->string) {
4069                        jamoOffset = 0;
4070                        source->fcdPosition = NULL;
4071                    } else {
4072                        jamoOffset = source->pos - source->string;
4073                        source->fcdPosition       = source->pos-1;
4074                    }
4075
4076                    // Append offsets for the additional chars
4077                    // (not the 0, and not the L whose offsets match the original Hangul)
4078                    int32_t jamoRemaining = tempbufferLength - 2;
4079                    jamoOffset++; // appended offsets should match end of original Hangul
4080                    while (jamoRemaining-- > 0) {
4081                        source->appendOffset(jamoOffset, *status);
4082                    }
4083
4084                    source->offsetRepeatValue = jamoOffset;
4085
4086                    source->offsetReturn = source->offsetStore - 1;
4087                    if (source->offsetReturn == source->offsetBuffer) {
4088                        source->offsetStore = source->offsetBuffer;
4089                    }
4090
4091                    source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
4092                    source->origFlags         = source->flags;
4093                    source->flags            |= UCOL_ITER_INNORMBUF;
4094                    source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4095
4096                    return(UCOL_IGNORABLE);
4097                }
4098            }
4099
4100        case IMPLICIT_TAG:        /* everything that is not defined otherwise */
4101            return getPrevImplicit(ch, source);
4102
4103            // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4104        case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4105            return getPrevImplicit(ch, source);
4106
4107        case SURROGATE_TAG:  /* This is a surrogate pair */
4108            /* essentially an engaged lead surrogate. */
4109            /* if you have encountered it here, it means that a */
4110            /* broken sequence was encountered and this is an error */
4111            return UCOL_NOT_FOUND;
4112
4113        case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
4114            return UCOL_NOT_FOUND; /* broken surrogate sequence */
4115
4116        case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4117            {
4118                UChar32 cp = 0;
4119                UChar  prevChar;
4120                const UChar *prev;
4121                if (isAtStartPrevIterate(source)) {
4122                    /* we are at the start of the string, wrong place to be at */
4123                    return UCOL_NOT_FOUND;
4124                }
4125                if (source->pos != source->writableBuffer.getBuffer()) {
4126                    prev     = source->pos - 1;
4127                } else {
4128                    prev     = source->fcdPosition;
4129                }
4130                prevChar = *prev;
4131
4132                /* Handles Han and Supplementary characters here.*/
4133                if (U16_IS_LEAD(prevChar)) {
4134                    cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4135                    source->pos = prev;
4136                } else {
4137                    return UCOL_NOT_FOUND; /* like unassigned */
4138                }
4139
4140                return getPrevImplicit(cp, source);
4141            }
4142
4143            /* UCA is filled with these. Tailorings are NOT_FOUND */
4144            /* not yet implemented */
4145        case CHARSET_TAG:  /* this tag always returns */
4146            /* probably after 1.8 */
4147            return UCOL_NOT_FOUND;
4148
4149        default:           /* this tag always returns */
4150            *status = U_INTERNAL_PROGRAM_ERROR;
4151            CE=0;
4152            break;
4153        }
4154
4155        if (CE <= UCOL_NOT_FOUND) {
4156            break;
4157        }
4158    }
4159
4160    return CE;
4161}
4162
/* Reverses part of a buffer in place. We need this operation when doing continuation                 */
/* secondaries in French. The function form below is disabled; the macro defined after it             */
/* provides the same operation.                                                                       */
4166/*
4167void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4168  uint8_t temp;
4169  while(start<end) {
4170    temp = *start;
4171    *start++ = *end;
4172    *end-- = temp;
4173  }
4174}
4175*/
4176
/*
 * Reverses, in place, the elements in the inclusive range [start, end].
 *
 *   TYPE   element type; used to declare the swap temporary
 *   start  pointer to the first element of the range
 *   end    pointer to the last element of the range
 *
 * Things to keep in mind when using it:
 *  - start and end are post-incremented / post-decremented by the expansion,
 *    so they must be modifiable lvalues and no longer point at the range
 *    boundaries afterwards.
 *  - Each argument is evaluated several times; do not pass expressions with
 *    side effects.
 *  - The expansion is a plain brace-enclosed block (not do { } while (0)) that
 *    declares a local named tempA.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
  TYPE tempA; \
while((start)<(end)) { \
    tempA = *(start); \
    *(start)++ = *(end); \
    *(end)-- = tempA; \
} \
}
4185
4186/****************************************************************************/
4187/* Following are the sortkey generation functions                           */
4188/*                                                                          */
4189/****************************************************************************/
4190
4191U_CAPI int32_t U_EXPORT2
4192ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4193                   const uint8_t *src2, int32_t src2Length,
4194                   uint8_t *dest, int32_t destCapacity) {
4195    /* check arguments */
4196    if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4197        src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4198        destCapacity<0 || (destCapacity>0 && dest==NULL)
4199    ) {
4200        /* error, attempt to write a zero byte and return 0 */
4201        if(dest!=NULL && destCapacity>0) {
4202            *dest=0;
4203        }
4204        return 0;
4205    }
4206
4207    /* check lengths and capacity */
4208    if(src1Length<0) {
4209        src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4210    }
4211    if(src2Length<0) {
4212        src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4213    }
4214
4215    int32_t destLength=src1Length+src2Length;
4216    if(destLength>destCapacity) {
4217        /* the merged sort key does not fit into the destination */
4218        return destLength;
4219    }
4220
4221    /* merge the sort keys with the same number of levels */
4222    uint8_t *p=dest;
4223    for(;;) {
4224        /* copy level from src1 not including 00 or 01 */
4225        uint8_t b;
4226        while((b=*src1)>=2) {
4227            ++src1;
4228            *p++=b;
4229        }
4230
4231        /* add a 02 merge separator */
4232        *p++=2;
4233
4234        /* copy level from src2 not including 00 or 01 */
4235        while((b=*src2)>=2) {
4236            ++src2;
4237            *p++=b;
4238        }
4239
4240        /* if both sort keys have another level, then add a 01 level separator and continue */
4241        if(*src1==1 && *src2==1) {
4242            ++src1;
4243            ++src2;
4244            *p++=1;
4245        } else {
4246            break;
4247        }
4248    }
4249
4250    /*
4251     * here, at least one sort key is finished now, but the other one
4252     * might have some contents left from containing more levels;
4253     * that contents is just appended to the result
4254     */
4255    if(*src1!=0) {
4256        /* src1 is not finished, therefore *src2==0, and src1 is appended */
4257        src2=src1;
4258    }
4259    /* append src2, "the other, unfinished sort key" */
4260    while((*p++=*src2++)!=0) {}
4261
4262    /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */
4263    return (int32_t)(p-dest);
4264}
4265
4266U_NAMESPACE_BEGIN
4267
4268class SortKeyByteSink : public ByteSink {
4269public:
4270    SortKeyByteSink(char *dest, int32_t destCapacity)
4271            : buffer_(dest), capacity_(destCapacity),
4272              appended_(0) {
4273        if (buffer_ == NULL) {
4274            capacity_ = 0;
4275        } else if(capacity_ < 0) {
4276            buffer_ = NULL;
4277            capacity_ = 0;
4278        }
4279    }
4280    virtual ~SortKeyByteSink();
4281
4282    virtual void Append(const char *bytes, int32_t n);
4283    void Append(uint32_t b) {
4284        if (appended_ < capacity_ || Resize(1, appended_)) {
4285            buffer_[appended_] = (char)b;
4286        }
4287        ++appended_;
4288    }
4289    void Append(uint32_t b1, uint32_t b2) {
4290        int32_t a2 = appended_ + 2;
4291        if (a2 <= capacity_ || Resize(2, appended_)) {
4292            buffer_[appended_] = (char)b1;
4293            buffer_[appended_ + 1] = (char)b2;
4294        } else if(appended_ < capacity_) {
4295            buffer_[appended_] = (char)b1;
4296        }
4297        appended_ = a2;
4298    }
4299    virtual char *GetAppendBuffer(int32_t min_capacity,
4300                                  int32_t desired_capacity_hint,
4301                                  char *scratch, int32_t scratch_capacity,
4302                                  int32_t *result_capacity);
4303    int32_t NumberOfBytesAppended() const { return appended_; }
4304    /** @return FALSE if memory allocation failed */
4305    UBool IsOk() const { return buffer_ != NULL; }
4306
4307protected:
4308    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0;
4309    virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;
4310
4311    void SetNotOk() {
4312        buffer_ = NULL;
4313        capacity_ = 0;
4314    }
4315
4316    char *buffer_;
4317    int32_t capacity_;
4318    int32_t appended_;
4319
4320private:
4321    SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
4322    SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
4323};
4324
// Out-of-line definition of the virtual destructor; the body is empty, so buffer_ is not freed here.
SortKeyByteSink::~SortKeyByteSink() {}
4326
4327void
4328SortKeyByteSink::Append(const char *bytes, int32_t n) {
4329    if (n <= 0 || bytes == NULL) {
4330        return;
4331    }
4332    int32_t length = appended_;
4333    appended_ += n;
4334    if ((buffer_ + length) == bytes) {
4335        return;  // the caller used GetAppendBuffer() and wrote the bytes already
4336    }
4337    int32_t available = capacity_ - length;
4338    if (n <= available) {
4339        uprv_memcpy(buffer_ + length, bytes, n);
4340    } else {
4341        AppendBeyondCapacity(bytes, n, length);
4342    }
4343}
4344
4345char *
4346SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
4347                                 int32_t desired_capacity_hint,
4348                                 char *scratch,
4349                                 int32_t scratch_capacity,
4350                                 int32_t *result_capacity) {
4351    if (min_capacity < 1 || scratch_capacity < min_capacity) {
4352        *result_capacity = 0;
4353        return NULL;
4354    }
4355    int32_t available = capacity_ - appended_;
4356    if (available >= min_capacity) {
4357        *result_capacity = available;
4358        return buffer_ + appended_;
4359    } else if (Resize(desired_capacity_hint, appended_)) {
4360        *result_capacity = capacity_ - appended_;
4361        return buffer_ + appended_;
4362    } else {
4363        *result_capacity = scratch_capacity;
4364        return scratch;
4365    }
4366}
4367
4368class FixedSortKeyByteSink : public SortKeyByteSink {
4369public:
4370    FixedSortKeyByteSink(char *dest, int32_t destCapacity)
4371            : SortKeyByteSink(dest, destCapacity) {}
4372    virtual ~FixedSortKeyByteSink();
4373
4374private:
4375    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
4376    virtual UBool Resize(int32_t appendCapacity, int32_t length);
4377};
4378
4379FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
4380
4381void
4382FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
4383    // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4384    // Fill the buffer completely.
4385    int32_t available = capacity_ - length;
4386    if (available > 0) {
4387        uprv_memcpy(buffer_ + length, bytes, available);
4388    }
4389}
4390
4391UBool
4392FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
4393    return FALSE;
4394}
4395
4396class CollationKeyByteSink : public SortKeyByteSink {
4397public:
4398    CollationKeyByteSink(CollationKey &key)
4399            : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
4400              key_(key) {}
4401    virtual ~CollationKeyByteSink();
4402
4403private:
4404    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
4405    virtual UBool Resize(int32_t appendCapacity, int32_t length);
4406
4407    CollationKey &key_;
4408};
4409
4410CollationKeyByteSink::~CollationKeyByteSink() {}
4411
4412void
4413CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
4414    // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4415    if (Resize(n, length)) {
4416        uprv_memcpy(buffer_ + length, bytes, n);
4417    }
4418}
4419
4420UBool
4421CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
4422    if (buffer_ == NULL) {
4423        return FALSE;  // allocation failed before already
4424    }
4425    int32_t newCapacity = 2 * capacity_;
4426    int32_t altCapacity = length + 2 * appendCapacity;
4427    if (newCapacity < altCapacity) {
4428        newCapacity = altCapacity;
4429    }
4430    if (newCapacity < 200) {
4431        newCapacity = 200;
4432    }
4433    uint8_t *newBuffer = key_.reallocate(newCapacity, length);
4434    if (newBuffer == NULL) {
4435        SetNotOk();
4436        return FALSE;
4437    }
4438    buffer_ = reinterpret_cast<char *>(newBuffer);
4439    capacity_ = newCapacity;
4440    return TRUE;
4441}
4442
4443/**
4444 * uint8_t byte buffer, similar to CharString but simpler.
4445 */
4446class SortKeyLevel : public UMemory {
4447public:
4448    SortKeyLevel() : len(0), ok(TRUE) {}
4449    ~SortKeyLevel() {}
4450
4451    /** @return FALSE if memory allocation failed */
4452    UBool isOk() const { return ok; }
4453    UBool isEmpty() const { return len == 0; }
4454    int32_t length() const { return len; }
4455    const uint8_t *data() const { return buffer.getAlias(); }
4456    uint8_t operator[](int32_t index) const { return buffer[index]; }
4457
4458    void appendByte(uint32_t b);
4459
4460    void appendTo(ByteSink &sink) const {
4461        sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len);
4462    }
4463
4464    uint8_t &lastByte() {
4465        U_ASSERT(len > 0);
4466        return buffer[len - 1];
4467    }
4468
4469    uint8_t *getLastFewBytes(int32_t n) {
4470        if (ok && len >= n) {
4471            return buffer.getAlias() + len - n;
4472        } else {
4473            return NULL;
4474        }
4475    }
4476
4477private:
4478    MaybeStackArray<uint8_t, 40> buffer;
4479    int32_t len;
4480    UBool ok;
4481
4482    UBool ensureCapacity(int32_t appendCapacity);
4483
4484    SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class
4485    SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class
4486};
4487
4488void SortKeyLevel::appendByte(uint32_t b) {
4489    if(len < buffer.getCapacity() || ensureCapacity(1)) {
4490        buffer[len++] = (uint8_t)b;
4491    }
4492}
4493
4494UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) {
4495    if(!ok) {
4496        return FALSE;
4497    }
4498    int32_t newCapacity = 2 * buffer.getCapacity();
4499    int32_t altCapacity = len + 2 * appendCapacity;
4500    if (newCapacity < altCapacity) {
4501        newCapacity = altCapacity;
4502    }
4503    if (newCapacity < 200) {
4504        newCapacity = 200;
4505    }
4506    if(buffer.resize(newCapacity, len)==NULL) {
4507        return ok = FALSE;
4508    }
4509    return TRUE;
4510}
4511
4512U_NAMESPACE_END
4513
4514/* sortkey API */
4515U_CAPI int32_t U_EXPORT2
4516ucol_getSortKey(const    UCollator    *coll,
4517        const    UChar        *source,
4518        int32_t        sourceLength,
4519        uint8_t        *result,
4520        int32_t        resultLength)
4521{
4522    UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4523    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4524        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4525            ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4526    }
4527
4528    if(coll->delegate != NULL) {
4529      return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength);
4530    }
4531
4532    UErrorCode status = U_ZERO_ERROR;
4533    int32_t keySize   = 0;
4534
4535    if(source != NULL) {
4536        // source == NULL is actually an error situation, but we would need to
4537        // have an error code to return it. Until we introduce a new
4538        // API, it stays like this
4539
4540        /* this uses the function pointer that is set in updateinternalstate */
4541        /* currently, there are two funcs: */
4542        /*ucol_calcSortKey(...);*/
4543        /*ucol_calcSortKeySimpleTertiary(...);*/
4544
4545        uint8_t noDest[1] = { 0 };
4546        if(result == NULL) {
4547            // Distinguish pure preflighting from an allocation error.
4548            result = noDest;
4549            resultLength = 0;
4550        }
4551        FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength);
4552        coll->sortKeyGen(coll, source, sourceLength, sink, &status);
4553        if(U_SUCCESS(status)) {
4554            keySize = sink.NumberOfBytesAppended();
4555        }
4556    }
4557    UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4558    UTRACE_EXIT_STATUS(status);
4559    return keySize;
4560}
4561
4562U_CFUNC int32_t
4563ucol_getCollationKey(const UCollator *coll,
4564                     const UChar *source, int32_t sourceLength,
4565                     CollationKey &key,
4566                     UErrorCode &errorCode) {
4567    CollationKeyByteSink sink(key);
4568    coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode);
4569    return sink.NumberOfBytesAppended();
4570}
4571
4572// Is this primary weight compressible?
4573// Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4574// TODO: This should use per-lead-byte flags from FractionalUCA.txt.
4575static inline UBool
4576isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
4577    return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
4578}
4579
4580static
4581inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) {
4582    if (caseShift  == 0) {
4583        cases.appendByte(UCOL_CASE_BYTE_START);
4584        caseShift = UCOL_CASE_SHIFT_START;
4585    }
4586}
4587
4588// Packs the secondary buffer when processing French locale.
4589static void
4590packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
4591    secondaries += secsize;  // We read the secondary-level bytes back to front.
4592    uint8_t secondary;
4593    int32_t count2 = 0;
4594    int32_t i = 0;
4595    // we use i here since the key size already accounts for terminators, so we'll discard the increment
4596    for(i = 0; i<secsize; i++) {
4597        secondary = *(secondaries-i-1);
4598        /* This is compression code. */
4599        if (secondary == UCOL_COMMON2) {
4600            ++count2;
4601        } else {
4602            if (count2 > 0) {
4603                if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4604                    while (count2 > UCOL_TOP_COUNT2) {
4605                        result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4606                        count2 -= (uint32_t)UCOL_TOP_COUNT2;
4607                    }
4608                    result.Append(UCOL_COMMON_TOP2 - (count2-1));
4609                } else {
4610                    while (count2 > UCOL_BOT_COUNT2) {
4611                        result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4612                        count2 -= (uint32_t)UCOL_BOT_COUNT2;
4613                    }
4614                    result.Append(UCOL_COMMON_BOT2 + (count2-1));
4615                }
4616                count2 = 0;
4617            }
4618            result.Append(secondary);
4619        }
4620    }
4621    if (count2 > 0) {
4622        while (count2 > UCOL_BOT_COUNT2) {
4623            result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4624            count2 -= (uint32_t)UCOL_BOT_COUNT2;
4625        }
4626        result.Append(UCOL_COMMON_BOT2 + (count2-1));
4627    }
4628}
4629
4630#define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4631
4632/* This is the sortkey work horse function */
/**
 * Builds the sort key for source and appends it to result.
 *
 * Primary weight bytes are appended to result while the collation elements
 * (CEs) are read. Secondary, tertiary, case and quaternary bytes are collected
 * in separate SortKeyLevel buffers and appended afterwards, each enabled level
 * preceded by UCOL_LEVELTERMINATOR. If no error occurred, the key is finished
 * with a 0 byte.
 *
 * @param coll         collator whose settings (strength, caseLevel, frenchCollation,
 *                     alternateHandling, hiraganaQ, normalizationMode, ...) are read here
 * @param source       the input text
 * @param sourceLength number of UChars in source, or -1 to measure it with u_strlen()
 * @param result       sink that receives the key bytes
 * @param status       in/out error code; the function returns at once if it already
 *                     holds a failure, and it is set to U_MEMORY_ALLOCATION_ERROR if a
 *                     level buffer or the sink reports that it is not ok
 */
4633U_CFUNC void U_CALLCONV
4634ucol_calcSortKey(const    UCollator    *coll,
4635        const    UChar        *source,
4636        int32_t        sourceLength,
4637        SortKeyByteSink &result,
4638        UErrorCode *status)
4639{
4640    if(U_FAILURE(*status)) {
4641        return;
4642    }
4643
    // Primaries go directly into the result; the other levels are buffered.
4644    SortKeyByteSink &primaries = result;
4645    SortKeyLevel secondaries;
4646    SortKeyLevel tertiaries;
4647    SortKeyLevel cases;
4648    SortKeyLevel quads;
4649
4650    UnicodeString normSource;
4651
4652    int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4653
4654    UColAttributeValue strength = coll->strength;
4655
    // compareXxx is 0 if that level is part of the key and 0xFF otherwise.
    // A test such as (secondary > compareSec) can therefore never succeed
    // for a level that is switched off.
4656    uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4657    uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4658    uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4659    UBool  compareIdent = (strength == UCOL_IDENTICAL);
4660    UBool  doCase = (coll->caseLevel == UCOL_ON);
4661    UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4662    UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4663    //UBool  qShifted = shifted && (compareQuad == 0);
4664    UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4665
4666    uint32_t variableTopValue = coll->variableTopValue;
4667    // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4668    // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4669    uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4670    uint8_t UCOL_HIRAGANA_QUAD = 0;
4671    if(doHiragana) {
4672        UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4673        /* allocate one more space for hiragana, value for hiragana */
4674    }
4675    uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4676
4677    /* support for special features like caselevel and funky secondaries */
4678    int32_t lastSecondaryLength = 0;
4679    uint32_t caseShift = 0;
4680
4681    /* If we need to normalize, we'll do it all at once at the beginning! */
    // NFD is used for identical strength, FCD when normalization is on,
    // and no normalizer at all otherwise.
4682    const Normalizer2 *norm2;
4683    if(compareIdent) {
4684        norm2 = Normalizer2Factory::getNFDInstance(*status);
4685    } else if(coll->normalizationMode != UCOL_OFF) {
4686        norm2 = Normalizer2Factory::getFCDInstance(*status);
4687    } else {
4688        norm2 = NULL;
4689    }
4690    if(norm2 != NULL) {
4691        normSource.setTo(FALSE, source, len);
4692        int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
        // Only the part after the quick-check span is normalized; from then
        // on source and len refer to the normalized text in normSource.
4693        if(qcYesLength != len) {
4694            UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
4695            normSource.truncate(qcYesLength);
4696            norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
4697            source = normSource.getBuffer();
4698            len = normSource.length();
4699        }
4700    }
4701    collIterate s;
4702    IInit_collIterate(coll, source, len, &s, status);
4703    if(U_FAILURE(*status)) {
4704        return;
4705    }
4706    s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
4707
4708    uint32_t order = 0;
4709
4710    uint8_t primary1 = 0;
4711    uint8_t primary2 = 0;
4712    uint8_t secondary = 0;
4713    uint8_t tertiary = 0;
4714    uint8_t caseSwitch = coll->caseSwitch;
4715    uint8_t tertiaryMask = coll->tertiaryMask;
4716    int8_t tertiaryAddition = coll->tertiaryAddition;
4717    uint8_t tertiaryTop = coll->tertiaryTop;
4718    uint8_t tertiaryBottom = coll->tertiaryBottom;
4719    uint8_t tertiaryCommon = coll->tertiaryCommon;
4720    uint8_t caseBits = 0;
4721
    // wasShifted remembers that the previous CE was handled as shifted, so that
    // its continuation CEs and following primary ignorables are handled the same way.
4722    UBool wasShifted = FALSE;
4723    UBool notIsContinuation = FALSE;
4724
    // count2/count3/count4 are the lengths of pending runs of common
    // secondary/tertiary/quaternary weights that have not been written yet.
    // leadPrimary is the lead byte currently used for primary compression (0 = none).
4725    uint32_t count2 = 0, count3 = 0, count4 = 0;
4726    uint8_t leadPrimary = 0;
4727
    // Main loop: one collation element per iteration.
4728    for(;;) {
4729        order = ucol_IGetNextCE(coll, &s, status);
4730        if(order == UCOL_NO_MORE_CES) {
4731            break;
4732        }
4733
        // A CE that is completely zero contributes nothing to the key.
4734        if(order == 0) {
4735            continue;
4736        }
4737
4738        notIsContinuation = !isContinuation(order);
4739
4740        if(notIsContinuation) {
4741            tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4742        } else {
4743            tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4744        }
4745
        // Split the CE into its bytes. Note that order is shifted right by 16 bits
        // in the process, so from here on it holds only the two primary bytes.
4746        secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4747        primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4748        primary1 = (uint8_t)(order >> 8);
4749
        // Keep the unpermuted lead byte: isCompressible() below is asked about it.
4750        uint8_t originalPrimary1 = primary1;
4751        if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
4752            primary1 = coll->leadBytePermutationTable[primary1];
4753        }
4754
4755        if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4756                        || (!notIsContinuation && wasShifted)))
4757            || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
4758        {
4759            /* and other ignorables should be removed if following a shifted code point */
4760            if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4761                /* we should just completely ignore it */
4762                continue;
4763            }
4764            if(compareQuad == 0) {
                // Write out the pending run of common quaternaries first.
4765                if(count4 > 0) {
4766                    while (count4 > UCOL_BOT_COUNT4) {
4767                        quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4768                        count4 -= UCOL_BOT_COUNT4;
4769                    }
4770                    quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
4771                    count4 = 0;
4772                }
4773                /* We are dealing with a variable and we're treating them as shifted */
4774                /* This is a shifted ignorable */
4775                if(primary1 != 0) { /* we need to check this since we could be in continuation */
4776                    quads.appendByte(primary1);
4777                }
4778                if(primary2 != 0) {
4779                    quads.appendByte(primary2);
4780                }
4781            }
4782            wasShifted = TRUE;
4783        } else {
4784            wasShifted = FALSE;
4785            /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4786            /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
            /* be zero with non zero primary1.                                                                            */
4787            /* regular and simple sortkey calc */
            // Primary level, with compression of repeated lead bytes.
4788            if(primary1 != UCOL_IGNORABLE) {
4789                if(notIsContinuation) {
4790                    if(leadPrimary == primary1) {
4791                        primaries.Append(primary2);
4792                    } else {
4793                        if(leadPrimary != 0) {
4794                            primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4795                        }
4796                        if(primary2 == UCOL_IGNORABLE) {
4797                            /* one byter, not compressed */
4798                            primaries.Append(primary1);
4799                            leadPrimary = 0;
4800                        } else if(isCompressible(coll, originalPrimary1)) {
4801                            /* compress */
4802                            primaries.Append(leadPrimary = primary1, primary2);
4803                        } else {
4804                            leadPrimary = 0;
4805                            primaries.Append(primary1, primary2);
4806                        }
4807                    }
4808                } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4809                    if(primary2 == UCOL_IGNORABLE) {
4810                        primaries.Append(primary1);
4811                    } else {
4812                        primaries.Append(primary1, primary2);
4813                    }
4814                }
4815            }
4816
            // Secondary level.
4817            if(secondary > compareSec) {
4818                if(!isFrenchSec) {
4819                    /* This is compression code. */
4820                    if (secondary == UCOL_COMMON2 && notIsContinuation) {
4821                        ++count2;
4822                    } else {
4823                        if (count2 > 0) {
4824                            if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4825                                while (count2 > UCOL_TOP_COUNT2) {
4826                                    secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4827                                    count2 -= (uint32_t)UCOL_TOP_COUNT2;
4828                                }
4829                                secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
4830                            } else {
4831                                while (count2 > UCOL_BOT_COUNT2) {
4832                                    secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4833                                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
4834                                }
4835                                secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
4836                            }
4837                            count2 = 0;
4838                        }
4839                        secondaries.appendByte(secondary);
4840                    }
4841                } else {
4842                    /* Do the special handling for French secondaries */
4843                    /* We need to get continuation elements and do intermediate restore */
4844                    /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
                    // French secondaries are stored uncompressed here; packFrench()
                    // reverses and compresses them when the key is assembled.
4845                    if(notIsContinuation) {
4846                        if (lastSecondaryLength > 1) {
4847                            uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
4848                            if (frenchStartPtr != NULL) {
4849                                /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4850                                uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
4851                                uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4852                            }
4853                        }
4854                        lastSecondaryLength = 1;
4855                    } else {
4856                        ++lastSecondaryLength;
4857                    }
4858                    secondaries.appendByte(secondary);
4859                }
4860            }
4861
            // Case level: one or two bits per CE, packed into the bytes of cases.
4862            if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4863                // do the case level if we need to do it. We don't want to calculate
4864                // case level for primary ignorables if we have only primary strength and case level
4865                // otherwise we would break well formedness of CEs
4866                doCaseShift(cases, caseShift);
4867                if(notIsContinuation) {
4868                    caseBits = (uint8_t)(tertiary & 0xC0);
4869
4870                    if(tertiary != 0) {
4871                        if(coll->caseFirst == UCOL_UPPER_FIRST) {
4872                            if((caseBits & 0xC0) == 0) {
4873                                cases.lastByte() |= 1 << (--caseShift);
4874                            } else {
4875                                cases.lastByte() |= 0 << (--caseShift);
4876                                /* second bit */
4877                                doCaseShift(cases, caseShift);
4878                                cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift);
4879                            }
4880                        } else {
4881                            if((caseBits & 0xC0) == 0) {
4882                                cases.lastByte() |= 0 << (--caseShift);
4883                            } else {
4884                                cases.lastByte() |= 1 << (--caseShift);
4885                                /* second bit */
4886                                doCaseShift(cases, caseShift);
4887                                cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift);
4888                            }
4889                        }
4890                    }
4891                }
4892            } else {
4893                if(notIsContinuation) {
4894                    tertiary ^= caseSwitch;
4895                }
4896            }
4897
            // Tertiary level.
4898            tertiary &= tertiaryMask;
4899            if(tertiary > compareTer) {
4900                /* This is compression code. */
4901                /* sequence size check is included in the if clause */
4902                if (tertiary == tertiaryCommon && notIsContinuation) {
4903                    ++count3;
4904                } else {
4905                    if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
4906                        tertiary += tertiaryAddition;
4907                    } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
4908                        tertiary -= tertiaryAddition;
4909                    }
4910                    if (count3 > 0) {
4911                        if ((tertiary > tertiaryCommon)) {
4912                            while (count3 > coll->tertiaryTopCount) {
4913                                tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
4914                                count3 -= (uint32_t)coll->tertiaryTopCount;
4915                            }
4916                            tertiaries.appendByte(tertiaryTop - (count3-1));
4917                        } else {
4918                            while (count3 > coll->tertiaryBottomCount) {
4919                                tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
4920                                count3 -= (uint32_t)coll->tertiaryBottomCount;
4921                            }
4922                            tertiaries.appendByte(tertiaryBottom + (count3-1));
4923                        }
4924                        count3 = 0;
4925                    }
4926                    tertiaries.appendByte(tertiary);
4927                }
4928            }
4929
            // Quaternary level for CEs that were not shifted.
4930            if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4931                if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4932                    if(count4>0) { // Close this part
4933                        while (count4 > UCOL_BOT_COUNT4) {
4934                            quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4935                            count4 -= UCOL_BOT_COUNT4;
4936                        }
4937                        quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
4938                        count4 = 0;
4939                    }
4940                    quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana
4941                } else { // This wasn't Hiragana, so we can continue adding stuff
4942                    count4++;
4943                }
4944            }
4945        }
4946    }
4947
4948    /* Here, we are generally done with processing */
4949    /* bailing out would not be too productive */
4950
    // ok collects the allocation status of the level buffers and of the sink.
4951    UBool ok = TRUE;
4952    if(U_SUCCESS(*status)) {
4953        /* we have done all the CE's, now let's put them together to form a key */
4954        if(compareSec == 0) {
            // Write out the last pending run of common secondaries.
4955            if (count2 > 0) {
4956                while (count2 > UCOL_BOT_COUNT2) {
4957                    secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4958                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
4959                }
4960                secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
4961            }
4962            result.Append(UCOL_LEVELTERMINATOR);
4963            if(!secondaries.isOk()) {
4964                ok = FALSE;
4965            } else if(!isFrenchSec) {
4966                secondaries.appendTo(result);
4967            } else {
4968                // If there are any unresolved continuation secondaries,
4969                // reverse them here so that we can reverse the whole secondary thing.
4970                if (lastSecondaryLength > 1) {
4971                    uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
4972                    if (frenchStartPtr != NULL) {
4973                        /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4974                        uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
4975                        uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4976                    }
4977                }
4978                packFrench(secondaries.data(), secondaries.length(), result);
4979            }
4980        }
4981
4982        if(doCase) {
4983            ok &= cases.isOk();
4984            result.Append(UCOL_LEVELTERMINATOR);
4985            cases.appendTo(result);
4986        }
4987
4988        if(compareTer == 0) {
            // Write out the last pending run of common tertiaries.
4989            if (count3 > 0) {
4990                if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
4991                    while (count3 >= coll->tertiaryTopCount) {
4992                        tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
4993                        count3 -= (uint32_t)coll->tertiaryTopCount;
4994                    }
4995                    tertiaries.appendByte(tertiaryTop - count3);
4996                } else {
4997                    while (count3 > coll->tertiaryBottomCount) {
4998                        tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
4999                        count3 -= (uint32_t)coll->tertiaryBottomCount;
5000                    }
5001                    tertiaries.appendByte(tertiaryBottom + (count3-1));
5002                }
5003            }
5004            ok &= tertiaries.isOk();
5005            result.Append(UCOL_LEVELTERMINATOR);
5006            tertiaries.appendTo(result);
5007
5008            if(compareQuad == 0/*qShifted == TRUE*/) {
                // Write out the last pending run of common quaternaries.
5009                if(count4 > 0) {
5010                    while (count4 > UCOL_BOT_COUNT4) {
5011                        quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5012                        count4 -= UCOL_BOT_COUNT4;
5013                    }
5014                    quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
5015                }
5016                ok &= quads.isOk();
5017                result.Append(UCOL_LEVELTERMINATOR);
5018                quads.appendTo(result);
5019            }
5020
            // Identical level: written from the (possibly normalized) text itself.
5021            if(compareIdent) {
5022                result.Append(UCOL_LEVELTERMINATOR);
5023                u_writeIdenticalLevelRun(s.string, len, result);
5024            }
5025        }
        // Terminating 0 byte of the sort key.
5026        result.Append(0);
5027    }
5028
5029    /* To avoid memory leak, free the offset buffer if necessary. */
5030    ucol_freeOffsetBuffer(&s);
5031
    // Report an allocation failure unless another error has been set already.
5032    ok &= result.IsOk();
5033    if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
5034}
5035
5036
5037U_CFUNC void U_CALLCONV
5038ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
5039        const    UChar        *source,
5040        int32_t        sourceLength,
5041        SortKeyByteSink &result,
5042        UErrorCode *status)
5043{
5044    U_ALIGN_CODE(16);
5045
5046    if(U_FAILURE(*status)) {
5047        return;
5048    }
5049
5050    SortKeyByteSink &primaries = result;
5051    SortKeyLevel secondaries;
5052    SortKeyLevel tertiaries;
5053
5054    UnicodeString normSource;
5055
5056    int32_t len =  sourceLength;
5057
5058    /* If we need to normalize, we'll do it all at once at the beginning! */
5059    if(coll->normalizationMode != UCOL_OFF) {
5060        normSource.setTo(len < 0, source, len);
5061        const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
5062        int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
5063        if(qcYesLength != normSource.length()) {
5064            UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
5065            normSource.truncate(qcYesLength);
5066            norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
5067            source = normSource.getBuffer();
5068            len = normSource.length();
5069        }
5070    }
5071    collIterate s;
5072    IInit_collIterate(coll, (UChar *)source, len, &s, status);
5073    if(U_FAILURE(*status)) {
5074        return;
5075    }
5076    s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
5077
5078    uint32_t order = 0;
5079
5080    uint8_t primary1 = 0;
5081    uint8_t primary2 = 0;
5082    uint8_t secondary = 0;
5083    uint8_t tertiary = 0;
5084    uint8_t caseSwitch = coll->caseSwitch;
5085    uint8_t tertiaryMask = coll->tertiaryMask;
5086    int8_t tertiaryAddition = coll->tertiaryAddition;
5087    uint8_t tertiaryTop = coll->tertiaryTop;
5088    uint8_t tertiaryBottom = coll->tertiaryBottom;
5089    uint8_t tertiaryCommon = coll->tertiaryCommon;
5090
5091    UBool notIsContinuation = FALSE;
5092
5093    uint32_t count2 = 0, count3 = 0;
5094    uint8_t leadPrimary = 0;
5095
5096    for(;;) {
5097        order = ucol_IGetNextCE(coll, &s, status);
5098
5099        if(order == 0) {
5100            continue;
5101        }
5102
5103        if(order == UCOL_NO_MORE_CES) {
5104            break;
5105        }
5106
5107        notIsContinuation = !isContinuation(order);
5108
5109        if(notIsContinuation) {
5110            tertiary = (uint8_t)((order & tertiaryMask));
5111        } else {
5112            tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5113        }
5114
5115        secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5116        primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5117        primary1 = (uint8_t)(order >> 8);
5118
5119        uint8_t originalPrimary1 = primary1;
5120        if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
5121            primary1 = coll->leadBytePermutationTable[primary1];
5122        }
5123
5124        /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5125        /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
5126        /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5127        /* regular and simple sortkey calc */
5128        if(primary1 != UCOL_IGNORABLE) {
5129            if(notIsContinuation) {
5130                if(leadPrimary == primary1) {
5131                    primaries.Append(primary2);
5132                } else {
5133                    if(leadPrimary != 0) {
5134                        primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5135                    }
5136                    if(primary2 == UCOL_IGNORABLE) {
5137                        /* one byter, not compressed */
5138                        primaries.Append(primary1);
5139                        leadPrimary = 0;
5140                    } else if(isCompressible(coll, originalPrimary1)) {
5141                        /* compress */
5142                        primaries.Append(leadPrimary = primary1, primary2);
5143                    } else {
5144                        leadPrimary = 0;
5145                        primaries.Append(primary1, primary2);
5146                    }
5147                }
5148            } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5149                if(primary2 == UCOL_IGNORABLE) {
5150                    primaries.Append(primary1);
5151                } else {
5152                    primaries.Append(primary1, primary2);
5153                }
5154            }
5155        }
5156
5157        if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5158            /* This is compression code. */
5159            if (secondary == UCOL_COMMON2 && notIsContinuation) {
5160                ++count2;
5161            } else {
5162                if (count2 > 0) {
5163                    if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5164                        while (count2 > UCOL_TOP_COUNT2) {
5165                            secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5166                            count2 -= (uint32_t)UCOL_TOP_COUNT2;
5167                        }
5168                        secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
5169                    } else {
5170                        while (count2 > UCOL_BOT_COUNT2) {
5171                            secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5172                            count2 -= (uint32_t)UCOL_BOT_COUNT2;
5173                        }
5174                        secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5175                    }
5176                    count2 = 0;
5177                }
5178                secondaries.appendByte(secondary);
5179            }
5180        }
5181
5182        if(notIsContinuation) {
5183            tertiary ^= caseSwitch;
5184        }
5185
5186        if(tertiary > 0) {
5187            /* This is compression code. */
5188            /* sequence size check is included in the if clause */
5189            if (tertiary == tertiaryCommon && notIsContinuation) {
5190                ++count3;
5191            } else {
5192                if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5193                    tertiary += tertiaryAddition;
5194                } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5195                    tertiary -= tertiaryAddition;
5196                }
5197                if (count3 > 0) {
5198                    if ((tertiary > tertiaryCommon)) {
5199                        while (count3 > coll->tertiaryTopCount) {
5200                            tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5201                            count3 -= (uint32_t)coll->tertiaryTopCount;
5202                        }
5203                        tertiaries.appendByte(tertiaryTop - (count3-1));
5204                    } else {
5205                        while (count3 > coll->tertiaryBottomCount) {
5206                            tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
5207                            count3 -= (uint32_t)coll->tertiaryBottomCount;
5208                        }
5209                        tertiaries.appendByte(tertiaryBottom + (count3-1));
5210                    }
5211                    count3 = 0;
5212                }
5213                tertiaries.appendByte(tertiary);
5214            }
5215        }
5216    }
5217
5218    UBool ok = TRUE;
5219    if(U_SUCCESS(*status)) {
5220        /* we have done all the CE's, now let's put them together to form a key */
5221        if (count2 > 0) {
5222            while (count2 > UCOL_BOT_COUNT2) {
5223                secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5224                count2 -= (uint32_t)UCOL_BOT_COUNT2;
5225            }
5226            secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5227        }
5228        ok &= secondaries.isOk();
5229        result.Append(UCOL_LEVELTERMINATOR);
5230        secondaries.appendTo(result);
5231
5232        if (count3 > 0) {
5233            if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5234                while (count3 >= coll->tertiaryTopCount) {
5235                    tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5236                    count3 -= (uint32_t)coll->tertiaryTopCount;
5237                }
5238                tertiaries.appendByte(tertiaryTop - count3);
5239            } else {
5240                while (count3 > coll->tertiaryBottomCount) {
5241                    tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
5242                    count3 -= (uint32_t)coll->tertiaryBottomCount;
5243                }
5244                tertiaries.appendByte(tertiaryBottom + (count3-1));
5245            }
5246        }
5247        ok &= tertiaries.isOk();
5248        result.Append(UCOL_LEVELTERMINATOR);
5249        tertiaries.appendTo(result);
5250
5251        result.Append(0);
5252    }
5253
5254    /* To avoid memory leak, free the offset buffer if necessary. */
5255    ucol_freeOffsetBuffer(&s);
5256
5257    ok &= result.IsOk();
5258    if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
5259}
5260
5261static inline
5262UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5263    UBool notIsContinuation = !isContinuation(CE);
5264    uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5265    if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5266               || (!notIsContinuation && *wasShifted)))
5267        || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5268    {
5269        // The stuff below should probably be in the sortkey code... maybe not...
5270        if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5271            /* we should just completely ignore it */
5272            *wasShifted = TRUE;
5273            //continue;
5274        }
5275        //*wasShifted = TRUE;
5276        return TRUE;
5277    } else {
5278        *wasShifted = FALSE;
5279        return FALSE;
5280    }
5281}
5282static inline
5283void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5284    if(level < maxLevel) {
5285        dest[i++] = UCOL_LEVELTERMINATOR;
5286    } else {
5287        dest[i++] = 0;
5288    }
5289}
5290
/**
 * Level identifiers for partial sort key generation.
 * The values are explicit because the current level is kept in a
 * three-bit field (values 0..7) of the saved state.
 */
enum {
    /** primary level */
    UCOL_PSK_PRIMARY    = 0,
    /** secondary level */
    UCOL_PSK_SECONDARY  = 1,
    /** case level */
    UCOL_PSK_CASE       = 2,
    /** tertiary level */
    UCOL_PSK_TERTIARY   = 3,
    /** quaternary level */
    UCOL_PSK_QUATERNARY = 4,
    /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN       = 5,
    /** identical level */
    UCOL_PSK_IDENTICAL  = 6,
    /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** number of level identifiers */
    UCOL_PSK_LIMIT
};
5303
/**
 * Bit layout of the collation processing state stored in state[1].
 * To read a field, shift state[1] right by its *_SHIFT value and AND the
 * result with its *_MASK value.
 */
enum {
    /** Level identifier: stores one of the UCOL_PSK_* level values (three bits). */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 0x7,

    /**
     * Number of bytes of primary or quaternary already written. Can be only
     * 0 or 1, since we get up to two bytes from primary or quaternary.
     * This field is also used to denote that the French secondary level
     * is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /** Was the last value shifted? Boolean, 0 or 1. */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /**
     * How many French bytes have we already written (two-bit counter).
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    /** Number of bytes already used from a bocsu sequence on the identical level (two bits). */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 0x3,

    /**
     * Number of elements consumed since the last valid iterator state was saved.
     * NOTE: the mask keeps 19 bits, i.e. state bits 9..27.
     */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
5329
// Macro calculating the number of expansion CEs available in a collIterate:
// the difference between its CEpos and toReturn members. A non-zero result
// means there are CEs that have not been handed out yet.
// The expansion is fully parenthesized so that the macro can be combined
// safely with other operators (e.g. !uprv_numAvailableExpCEs(s) or n * ...).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5332
5333
5334/** main sortkey part procedure. On the first call,
5335 *  you should pass in a collator, an iterator, empty state
5336 *  state[0] == state[1] == 0, a buffer to hold results
5337 *  number of bytes you need and an error code pointer.
5338 *  Make sure your buffer is big enough to hold the wanted
5339 *  number of sortkey bytes. I don't check.
5340 *  The only meaningful status you can get back is
5341 *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5342 *  have been dealt a raw deal and that you probably won't
5343 *  be able to use partial sortkey generation for this
5344 *  particular combination of string and collator. This
5345 *  is highly unlikely, but you should still check the error code.
5346 *  Any other status means that you're not in a sane situation
5347 *  anymore. After the first call, preserve state values and
5348 *  use them on subsequent calls to obtain more bytes of a sortkey.
5349 *  Use until the number of bytes written is smaller than the requested
5350 *  number of bytes. Generated sortkey is not compatible with the
5351 *  one generated by ucol_getSortKey, as we don't do any compression.
5352 *  However, levels are still terminated by a 1 (one) and the sortkey
5353 *  is terminated by a 0 (zero). Identical level is the same as in the
5354 *  regular sortkey - internal bocu-1 implementation is used.
5355 *  For curious, although you cannot do much about this, here is
5356 *  the structure of state words.
5357 *  state[0] - iterator state. Depends on the iterator implementation,
5358 *             but allows the iterator to continue where it stopped in
5359 *             the last iteration.
5360 *  state[1] - collation processing state. Here is the distribution
5361 *             of the bits:
5362 *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5363 *             quaternary, quin (we don't use this one), identical and
5364 *             null (producing only zeroes - first one to terminate the
5365 *             sortkey and subsequent to fill the buffer).
5366 *   3       - byte count. Number of bytes written on the primary level.
5367 *   4       - was shifted. Whether the previous iteration finished in the
5368 *             shifted state.
5369 *   5, 6    - French continuation bytes written. See the comment in the enum
5370 *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
5371 *             the identical level.
 *   9..31   - CEs consumed. Number of getCE or next32 operations performed
 *             since the last successful update of the iterator state.
5374 */
5375U_CAPI int32_t U_EXPORT2
5376ucol_nextSortKeyPart(const UCollator *coll,
5377                     UCharIterator *iter,
5378                     uint32_t state[2],
5379                     uint8_t *dest, int32_t count,
5380                     UErrorCode *status)
5381{
5382    /* error checking */
5383    if(status==NULL || U_FAILURE(*status)) {
5384        return 0;
5385    }
5386    UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5387    if( coll==NULL || iter==NULL ||
5388        state==NULL ||
5389        count<0 || (count>0 && dest==NULL)
5390    ) {
5391        *status=U_ILLEGAL_ARGUMENT_ERROR;
5392        UTRACE_EXIT_STATUS(status);
5393        return 0;
5394    }
5395
5396    UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5397                  coll, iter, state[0], state[1], dest, count);
5398
5399    if(count==0) {
5400        /* nothing to do */
5401        UTRACE_EXIT_VALUE(0);
5402        return 0;
5403    }
5404    /** Setting up situation according to the state we got from the previous iteration */
5405    // The state of the iterator from the previous invocation
5406    uint32_t iterState = state[0];
5407    // Has the last iteration ended in the shifted state
5408    UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5409    // What is the current level of the sortkey?
5410    int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5411    // Have we written only one byte from a two byte primary in the previous iteration?
5412    // Also on secondary level - have we finished with the French secondary?
5413    int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5414    // number of bytes in the continuation buffer for French
5415    int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5416    // Number of bytes already written from a bocsu sequence. Since
5417    // the longes bocsu sequence is 4 long, this can be up to 3.
5418    int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5419    // Number of elements that need to be consumed in this iteration because
5420    // the iterator returned UITER_NO_STATE at the end of the last iteration,
5421    // so we had to save the last valid state.
5422    int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5423
5424    /** values that depend on the collator attributes */
5425    // strength of the collator.
5426    int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5427    // maximal level of the partial sortkey. Need to take whether case level is done
5428    int32_t maxLevel = 0;
5429    if(strength < UCOL_TERTIARY) {
5430        if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5431            maxLevel = UCOL_PSK_CASE;
5432        } else {
5433            maxLevel = strength;
5434        }
5435    } else {
5436        if(strength == UCOL_TERTIARY) {
5437            maxLevel = UCOL_PSK_TERTIARY;
5438        } else if(strength == UCOL_QUATERNARY) {
5439            maxLevel = UCOL_PSK_QUATERNARY;
5440        } else { // identical
5441            maxLevel = UCOL_IDENTICAL;
5442        }
5443    }
5444    // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5445    uint8_t UCOL_HIRAGANA_QUAD =
5446      (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5447    // Boundary value that decides whether a CE is shifted or not
5448    uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5449    // Are we doing French collation?
5450    UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5451
5452    /** initializing the collation state */
5453    UBool notIsContinuation = FALSE;
5454    uint32_t CE = UCOL_NO_MORE_CES;
5455
5456    collIterate s;
5457    IInit_collIterate(coll, NULL, -1, &s, status);
5458    if(U_FAILURE(*status)) {
5459        UTRACE_EXIT_STATUS(*status);
5460        return 0;
5461    }
5462    s.iterator = iter;
5463    s.flags |= UCOL_USE_ITERATOR;
5464    // This variable tells us whether we have produced some other levels in this iteration
5465    // before we moved to the identical level. In that case, we need to switch the
5466    // type of the iterator.
5467    UBool doingIdenticalFromStart = FALSE;
5468    // Normalizing iterator
5469    // The division for the array length may truncate the array size to
5470    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5471    // for all platforms anyway.
5472    UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5473    UNormIterator *normIter = NULL;
5474    // If the normalization is turned on for the collator and we are below identical level
5475    // we will use a FCD normalizing iterator
5476    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5477        normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5478        s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5479        s.flags &= ~UCOL_ITER_NORM;
5480        if(U_FAILURE(*status)) {
5481            UTRACE_EXIT_STATUS(*status);
5482            return 0;
5483        }
5484    } else if(level == UCOL_PSK_IDENTICAL) {
5485        // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5486        // will be updating the state - and this cannot be done on an ordinary iterator.
5487        normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5488        s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5489        s.flags &= ~UCOL_ITER_NORM;
5490        if(U_FAILURE(*status)) {
5491            UTRACE_EXIT_STATUS(*status);
5492            return 0;
5493        }
5494        doingIdenticalFromStart = TRUE;
5495    }
5496
5497    // This is the tentative new state of the iterator. The problem
5498    // is that the iterator might return an undefined state, in
5499    // which case we should save the last valid state and increase
5500    // the iterator skip value.
5501    uint32_t newState = 0;
5502
5503    // First, we set the iterator to the last valid position
5504    // from the last iteration. This was saved in state[0].
5505    if(iterState == 0) {
5506        /* initial state */
5507        if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5508            s.iterator->move(s.iterator, 0, UITER_LIMIT);
5509        } else {
5510            s.iterator->move(s.iterator, 0, UITER_START);
5511        }
5512    } else {
5513        /* reset to previous state */
5514        s.iterator->setState(s.iterator, iterState, status);
5515        if(U_FAILURE(*status)) {
5516            UTRACE_EXIT_STATUS(*status);
5517            return 0;
5518        }
5519    }
5520
5521
5522
5523    // This variable tells us whether we can attempt to update the state
5524    // of iterator. Situations where we don't want to update iterator state
5525    // are the existence of expansion CEs that are not yet processed, and
5526    // finishing the case level without enough space in the buffer to insert
5527    // a level terminator.
5528    UBool canUpdateState = TRUE;
5529
5530    // Consume all the CEs that were consumed at the end of the previous
5531    // iteration without updating the iterator state. On identical level,
5532    // consume the code points.
5533    int32_t counter = cces;
5534    if(level < UCOL_PSK_IDENTICAL) {
5535        while(counter-->0) {
5536            // If we're doing French and we are on the secondary level,
5537            // we go backwards.
5538            if(level == UCOL_PSK_SECONDARY && doingFrench) {
5539                CE = ucol_IGetPrevCE(coll, &s, status);
5540            } else {
5541                CE = ucol_IGetNextCE(coll, &s, status);
5542            }
5543            if(CE==UCOL_NO_MORE_CES) {
5544                /* should not happen */
5545                *status=U_INTERNAL_PROGRAM_ERROR;
5546                UTRACE_EXIT_STATUS(*status);
5547                return 0;
5548            }
5549            if(uprv_numAvailableExpCEs(s)) {
5550                canUpdateState = FALSE;
5551            }
5552        }
5553    } else {
5554        while(counter-->0) {
5555            uiter_next32(s.iterator);
5556        }
5557    }
5558
5559    // French secondary needs to know whether the iterator state of zero came from previous level OR
5560    // from a new invocation...
5561    UBool wasDoingPrimary = FALSE;
5562    // destination buffer byte counter. When this guy
5563    // gets to count, we're done with the iteration
5564    int32_t i = 0;
5565    // used to count the zero bytes written after we
5566    // have finished with the sort key
5567    int32_t j = 0;
5568
5569
5570    // Hm.... I think we're ready to plunge in. Basic story is as following:
5571    // we have a fall through case based on level. This is used for initial
5572    // positioning on iteration start. Every level processor contains a
5573    // for(;;) which will be broken when we exhaust all the CEs. Other
5574    // way to exit is a goto saveState, which happens when we have filled
5575    // out our buffer.
5576    switch(level) {
5577    case UCOL_PSK_PRIMARY:
5578        wasDoingPrimary = TRUE;
5579        for(;;) {
5580            if(i==count) {
5581                goto saveState;
5582            }
5583            // We should save the state only if we
5584            // are sure that we are done with the
5585            // previous iterator state
5586            if(canUpdateState && byteCountOrFrenchDone == 0) {
5587                newState = s.iterator->getState(s.iterator);
5588                if(newState != UITER_NO_STATE) {
5589                    iterState = newState;
5590                    cces = 0;
5591                }
5592            }
5593            CE = ucol_IGetNextCE(coll, &s, status);
5594            cces++;
5595            if(CE==UCOL_NO_MORE_CES) {
5596                // Add the level separator
5597                terminatePSKLevel(level, maxLevel, i, dest);
5598                byteCountOrFrenchDone=0;
5599                // Restart the iteration an move to the
5600                // second level
5601                s.iterator->move(s.iterator, 0, UITER_START);
5602                cces = 0;
5603                level = UCOL_PSK_SECONDARY;
5604                break;
5605            }
5606            if(!isContinuation(CE)){
5607                if(coll->leadBytePermutationTable != NULL){
5608                    CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
5609                }
5610            }
5611            if(!isShiftedCE(CE, LVT, &wasShifted)) {
5612                CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5613                if(CE != 0) {
5614                    if(byteCountOrFrenchDone == 0) {
5615                        // get the second byte of primary
5616                        dest[i++]=(uint8_t)(CE >> 8);
5617                    } else {
5618                        byteCountOrFrenchDone = 0;
5619                    }
5620                    if((CE &=0xff)!=0) {
5621                        if(i==count) {
5622                            /* overflow */
5623                            byteCountOrFrenchDone = 1;
5624                            cces--;
5625                            goto saveState;
5626                        }
5627                        dest[i++]=(uint8_t)CE;
5628                    }
5629                }
5630            }
5631            if(uprv_numAvailableExpCEs(s)) {
5632                canUpdateState = FALSE;
5633            } else {
5634                canUpdateState = TRUE;
5635            }
5636        }
5637        /* fall through to next level */
5638    case UCOL_PSK_SECONDARY:
5639        if(strength >= UCOL_SECONDARY) {
5640            if(!doingFrench) {
5641                for(;;) {
5642                    if(i == count) {
5643                        goto saveState;
5644                    }
5645                    // We should save the state only if we
5646                    // are sure that we are done with the
5647                    // previous iterator state
5648                    if(canUpdateState) {
5649                        newState = s.iterator->getState(s.iterator);
5650                        if(newState != UITER_NO_STATE) {
5651                            iterState = newState;
5652                            cces = 0;
5653                        }
5654                    }
5655                    CE = ucol_IGetNextCE(coll, &s, status);
5656                    cces++;
5657                    if(CE==UCOL_NO_MORE_CES) {
5658                        // Add the level separator
5659                        terminatePSKLevel(level, maxLevel, i, dest);
5660                        byteCountOrFrenchDone = 0;
5661                        // Restart the iteration an move to the
5662                        // second level
5663                        s.iterator->move(s.iterator, 0, UITER_START);
5664                        cces = 0;
5665                        level = UCOL_PSK_CASE;
5666                        break;
5667                    }
5668                    if(!isShiftedCE(CE, LVT, &wasShifted)) {
5669                        CE >>= 8; /* get secondary */
5670                        if(CE != 0) {
5671                            dest[i++]=(uint8_t)CE;
5672                        }
5673                    }
5674                    if(uprv_numAvailableExpCEs(s)) {
5675                        canUpdateState = FALSE;
5676                    } else {
5677                        canUpdateState = TRUE;
5678                    }
5679                }
5680            } else { // French secondary processing
5681                uint8_t frenchBuff[UCOL_MAX_BUFFER];
5682                int32_t frenchIndex = 0;
5683                // Here we are going backwards.
5684                // If the iterator is at the beggining, it should be
5685                // moved to end.
5686                if(wasDoingPrimary) {
5687                    s.iterator->move(s.iterator, 0, UITER_LIMIT);
5688                    cces = 0;
5689                }
5690                for(;;) {
5691                    if(i == count) {
5692                        goto saveState;
5693                    }
5694                    if(canUpdateState) {
5695                        newState = s.iterator->getState(s.iterator);
5696                        if(newState != UITER_NO_STATE) {
5697                            iterState = newState;
5698                            cces = 0;
5699                        }
5700                    }
5701                    CE = ucol_IGetPrevCE(coll, &s, status);
5702                    cces++;
5703                    if(CE==UCOL_NO_MORE_CES) {
5704                        // Add the level separator
5705                        terminatePSKLevel(level, maxLevel, i, dest);
5706                        byteCountOrFrenchDone = 0;
5707                        // Restart the iteration an move to the next level
5708                        s.iterator->move(s.iterator, 0, UITER_START);
5709                        level = UCOL_PSK_CASE;
5710                        break;
5711                    }
5712                    if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5713                        // reverse when we get a first non-continuation CE.
5714                        CE >>= 8;
5715                        frenchBuff[frenchIndex++] = (uint8_t)CE;
5716                    } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5717                        CE >>= 8; /* get secondary */
5718                        if(!frenchIndex) {
5719                            if(CE != 0) {
5720                                dest[i++]=(uint8_t)CE;
5721                            }
5722                        } else {
5723                            frenchBuff[frenchIndex++] = (uint8_t)CE;
5724                            frenchIndex -= usedFrench;
5725                            usedFrench = 0;
5726                            while(i < count && frenchIndex) {
5727                                dest[i++] = frenchBuff[--frenchIndex];
5728                                usedFrench++;
5729                            }
5730                        }
5731                    }
5732                    if(uprv_numAvailableExpCEs(s)) {
5733                        canUpdateState = FALSE;
5734                    } else {
5735                        canUpdateState = TRUE;
5736                    }
5737                }
5738            }
5739        } else {
5740            level = UCOL_PSK_CASE;
5741        }
5742        /* fall through to next level */
5743    case UCOL_PSK_CASE:
5744        if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5745            uint32_t caseShift = UCOL_CASE_SHIFT_START;
5746            uint8_t caseByte = UCOL_CASE_BYTE_START;
5747            uint8_t caseBits = 0;
5748
5749            for(;;) {
5750                U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
5751                if(i == count) {
5752                    goto saveState;
5753                }
5754                // We should save the state only if we
5755                // are sure that we are done with the
5756                // previous iterator state
5757                if(canUpdateState) {
5758                    newState = s.iterator->getState(s.iterator);
5759                    if(newState != UITER_NO_STATE) {
5760                        iterState = newState;
5761                        cces = 0;
5762                    }
5763                }
5764                CE = ucol_IGetNextCE(coll, &s, status);
5765                cces++;
5766                if(CE==UCOL_NO_MORE_CES) {
5767                    // On the case level we might have an unfinished
5768                    // case byte. Add one if it's started.
5769                    if(caseShift != UCOL_CASE_SHIFT_START) {
5770                        dest[i++] = caseByte;
5771                    }
5772                    cces = 0;
5773                    // We have finished processing CEs on this level.
5774                    // However, we don't know if we have enough space
5775                    // to add a case level terminator.
5776                    if(i < count) {
5777                        // Add the level separator
5778                        terminatePSKLevel(level, maxLevel, i, dest);
5779                        // Restart the iteration and move to the
5780                        // next level
5781                        s.iterator->move(s.iterator, 0, UITER_START);
5782                        level = UCOL_PSK_TERTIARY;
5783                    } else {
5784                        canUpdateState = FALSE;
5785                    }
5786                    break;
5787                }
5788
5789                if(!isShiftedCE(CE, LVT, &wasShifted)) {
5790                    if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5791                        // do the case level if we need to do it. We don't want to calculate
5792                        // case level for primary ignorables if we have only primary strength and case level
5793                        // otherwise we would break well formedness of CEs
5794                        CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5795                        caseBits = (uint8_t)(CE & 0xC0);
5796                        // this copies the case level logic from the
5797                        // sort key generation code
5798                        if(CE != 0) {
5799                            if (caseShift == 0) {
5800                                dest[i++] = caseByte;
5801                                caseShift = UCOL_CASE_SHIFT_START;
5802                                caseByte = UCOL_CASE_BYTE_START;
5803                            }
5804                            if(coll->caseFirst == UCOL_UPPER_FIRST) {
5805                                if((caseBits & 0xC0) == 0) {
5806                                    caseByte |= 1 << (--caseShift);
5807                                } else {
5808                                    caseByte |= 0 << (--caseShift);
5809                                    /* second bit */
5810                                    if(caseShift == 0) {
5811                                        dest[i++] = caseByte;
5812                                        caseShift = UCOL_CASE_SHIFT_START;
5813                                        caseByte = UCOL_CASE_BYTE_START;
5814                                    }
5815                                    caseByte |= ((caseBits>>6)&1) << (--caseShift);
5816                                }
5817                            } else {
5818                                if((caseBits & 0xC0) == 0) {
5819                                    caseByte |= 0 << (--caseShift);
5820                                } else {
5821                                    caseByte |= 1 << (--caseShift);
5822                                    /* second bit */
5823                                    if(caseShift == 0) {
5824                                        dest[i++] = caseByte;
5825                                        caseShift = UCOL_CASE_SHIFT_START;
5826                                        caseByte = UCOL_CASE_BYTE_START;
5827                                    }
5828                                    caseByte |= ((caseBits>>7)&1) << (--caseShift);
5829                                }
5830                            }
5831                        }
5832
5833                    }
5834                }
5835                // Not sure this is correct for the case level - revisit
5836                if(uprv_numAvailableExpCEs(s)) {
5837                    canUpdateState = FALSE;
5838                } else {
5839                    canUpdateState = TRUE;
5840                }
5841            }
5842        } else {
5843            level = UCOL_PSK_TERTIARY;
5844        }
5845        /* fall through to next level */
5846    case UCOL_PSK_TERTIARY:
5847        if(strength >= UCOL_TERTIARY) {
5848            for(;;) {
5849                if(i == count) {
5850                    goto saveState;
5851                }
5852                // We should save the state only if we
5853                // are sure that we are done with the
5854                // previous iterator state
5855                if(canUpdateState) {
5856                    newState = s.iterator->getState(s.iterator);
5857                    if(newState != UITER_NO_STATE) {
5858                        iterState = newState;
5859                        cces = 0;
5860                    }
5861                }
5862                CE = ucol_IGetNextCE(coll, &s, status);
5863                cces++;
5864                if(CE==UCOL_NO_MORE_CES) {
5865                    // Add the level separator
5866                    terminatePSKLevel(level, maxLevel, i, dest);
5867                    byteCountOrFrenchDone = 0;
5868                    // Restart the iteration an move to the
5869                    // second level
5870                    s.iterator->move(s.iterator, 0, UITER_START);
5871                    cces = 0;
5872                    level = UCOL_PSK_QUATERNARY;
5873                    break;
5874                }
5875                if(!isShiftedCE(CE, LVT, &wasShifted)) {
5876                    notIsContinuation = !isContinuation(CE);
5877
5878                    if(notIsContinuation) {
5879                        CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5880                        CE ^= coll->caseSwitch;
5881                        CE &= coll->tertiaryMask;
5882                    } else {
5883                        CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
5884                    }
5885
5886                    if(CE != 0) {
5887                        dest[i++]=(uint8_t)CE;
5888                    }
5889                }
5890                if(uprv_numAvailableExpCEs(s)) {
5891                    canUpdateState = FALSE;
5892                } else {
5893                    canUpdateState = TRUE;
5894                }
5895            }
5896        } else {
5897            // if we're not doing tertiary
5898            // skip to the end
5899            level = UCOL_PSK_NULL;
5900        }
5901        /* fall through to next level */
5902    case UCOL_PSK_QUATERNARY:
5903        if(strength >= UCOL_QUATERNARY) {
5904            for(;;) {
5905                if(i == count) {
5906                    goto saveState;
5907                }
5908                // We should save the state only if we
5909                // are sure that we are done with the
5910                // previous iterator state
5911                if(canUpdateState) {
5912                    newState = s.iterator->getState(s.iterator);
5913                    if(newState != UITER_NO_STATE) {
5914                        iterState = newState;
5915                        cces = 0;
5916                    }
5917                }
5918                CE = ucol_IGetNextCE(coll, &s, status);
5919                cces++;
5920                if(CE==UCOL_NO_MORE_CES) {
5921                    // Add the level separator
5922                    terminatePSKLevel(level, maxLevel, i, dest);
5923                    //dest[i++] = UCOL_LEVELTERMINATOR;
5924                    byteCountOrFrenchDone = 0;
5925                    // Restart the iteration an move to the
5926                    // second level
5927                    s.iterator->move(s.iterator, 0, UITER_START);
5928                    cces = 0;
5929                    level = UCOL_PSK_QUIN;
5930                    break;
5931                }
5932                if(CE==0)
5933                    continue;
5934                if(isShiftedCE(CE, LVT, &wasShifted)) {
5935                    CE >>= 16; /* get primary */
5936                    if(CE != 0) {
5937                        if(byteCountOrFrenchDone == 0) {
5938                            dest[i++]=(uint8_t)(CE >> 8);
5939                        } else {
5940                            byteCountOrFrenchDone = 0;
5941                        }
5942                        if((CE &=0xff)!=0) {
5943                            if(i==count) {
5944                                /* overflow */
5945                                byteCountOrFrenchDone = 1;
5946                                goto saveState;
5947                            }
5948                            dest[i++]=(uint8_t)CE;
5949                        }
5950                    }
5951                } else {
5952                    notIsContinuation = !isContinuation(CE);
5953                    if(notIsContinuation) {
5954                        if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5955                            dest[i++] = UCOL_HIRAGANA_QUAD;
5956                        } else {
5957                            dest[i++] = 0xFF;
5958                        }
5959                    }
5960                }
5961                if(uprv_numAvailableExpCEs(s)) {
5962                    canUpdateState = FALSE;
5963                } else {
5964                    canUpdateState = TRUE;
5965                }
5966            }
5967        } else {
5968            // if we're not doing quaternary
5969            // skip to the end
5970            level = UCOL_PSK_NULL;
5971        }
5972        /* fall through to next level */
5973    case UCOL_PSK_QUIN:
5974        level = UCOL_PSK_IDENTICAL;
5975        /* fall through to next level */
5976    case UCOL_PSK_IDENTICAL:
5977        if(strength >= UCOL_IDENTICAL) {
5978            UChar32 first, second;
5979            int32_t bocsuBytesWritten = 0;
5980            // We always need to do identical on
5981            // the NFD form of the string.
5982            if(normIter == NULL) {
5983                // we arrived from the level below and
5984                // normalization was not turned on.
5985                // therefore, we need to make a fresh NFD iterator
5986                normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5987                s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5988            } else if(!doingIdenticalFromStart) {
5989                // there is an iterator, but we did some other levels.
5990                // therefore, we have a FCD iterator - need to make
5991                // a NFD one.
5992                // normIter being at the beginning does not guarantee
5993                // that the underlying iterator is at the beginning
5994                iter->move(iter, 0, UITER_START);
5995                s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5996            }
5997            // At this point we have a NFD iterator that is positioned
5998            // in the right place
5999            if(U_FAILURE(*status)) {
6000                UTRACE_EXIT_STATUS(*status);
6001                return 0;
6002            }
6003            first = uiter_previous32(s.iterator);
6004            // maybe we're at the start of the string
6005            if(first == U_SENTINEL) {
6006                first = 0;
6007            } else {
6008                uiter_next32(s.iterator);
6009            }
6010
6011            j = 0;
6012            for(;;) {
6013                if(i == count) {
6014                    if(j+1 < bocsuBytesWritten) {
6015                        bocsuBytesUsed = j+1;
6016                    }
6017                    goto saveState;
6018                }
6019
6020                // On identical level, we will always save
6021                // the state if we reach this point, since
6022                // we don't depend on getNextCE for content
6023                // all the content is in our buffer and we
6024                // already either stored the full buffer OR
6025                // otherwise we won't arrive here.
6026                newState = s.iterator->getState(s.iterator);
6027                if(newState != UITER_NO_STATE) {
6028                    iterState = newState;
6029                    cces = 0;
6030                }
6031
6032                uint8_t buff[4];
6033                second = uiter_next32(s.iterator);
6034                cces++;
6035
6036                // end condition for identical level
6037                if(second == U_SENTINEL) {
6038                    terminatePSKLevel(level, maxLevel, i, dest);
6039                    level = UCOL_PSK_NULL;
6040                    break;
6041                }
6042                bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6043                first = second;
6044
6045                j = 0;
6046                if(bocsuBytesUsed != 0) {
6047                    while(bocsuBytesUsed-->0) {
6048                        j++;
6049                    }
6050                }
6051
6052                while(i < count && j < bocsuBytesWritten) {
6053                    dest[i++] = buff[j++];
6054                }
6055            }
6056
6057        } else {
6058            level = UCOL_PSK_NULL;
6059        }
6060        /* fall through to next level */
6061    case UCOL_PSK_NULL:
6062        j = i;
6063        while(j<count) {
6064            dest[j++]=0;
6065        }
6066        break;
6067    default:
6068        *status = U_INTERNAL_PROGRAM_ERROR;
6069        UTRACE_EXIT_STATUS(*status);
6070        return 0;
6071    }
6072
6073saveState:
6074    // Now we need to return stuff. First we want to see whether we have
6075    // done everything for the current state of iterator.
6076    if(byteCountOrFrenchDone
6077        || canUpdateState == FALSE
6078        || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6079    {
6080        // Any of above mean that the previous transaction
6081        // wasn't finished and that we should store the
6082        // previous iterator state.
6083        state[0] = iterState;
6084    } else {
6085        // The transaction is complete. We will continue in the next iteration.
6086        state[0] = s.iterator->getState(s.iterator);
6087        cces = 0;
6088    }
6089    // Store the number of bocsu bytes written.
6090    if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6091        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6092    }
6093    state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6094
6095    // Next we put in the level of comparison
6096    state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6097
6098    // If we are doing French, we need to store whether we have just finished the French level
6099    if(level == UCOL_PSK_SECONDARY && doingFrench) {
6100        state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6101    } else {
6102        state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6103    }
6104
6105    // Was the latest CE shifted
6106    if(wasShifted) {
6107        state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6108    }
6109    // Check for cces overflow
6110    if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6111        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6112    }
6113    // Store cces
6114    state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6115
6116    // Check for French overflow
6117    if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6118        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6119    }
6120    // Store number of bytes written in the French secondary continuation sequence
6121    state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6122
6123
6124    // If we have used normalizing iterator, get rid of it
6125    if(normIter != NULL) {
6126        unorm_closeIter(normIter);
6127    }
6128
6129    /* To avoid memory leak, free the offset buffer if necessary. */
6130    ucol_freeOffsetBuffer(&s);
6131
6132    // Return number of meaningful sortkey bytes.
6133    UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6134                  dest,i, state[0], state[1]);
6135    UTRACE_EXIT_VALUE(i);
6136    return i;
6137}
6138
6139/**
6140 * Produce a bound for a given sortkey and a number of levels.
6141 */
6142U_CAPI int32_t U_EXPORT2
6143ucol_getBound(const uint8_t       *source,
6144        int32_t             sourceLength,
6145        UColBoundMode       boundType,
6146        uint32_t            noOfLevels,
6147        uint8_t             *result,
6148        int32_t             resultLength,
6149        UErrorCode          *status)
6150{
6151    // consistency checks
6152    if(status == NULL || U_FAILURE(*status)) {
6153        return 0;
6154    }
6155    if(source == NULL) {
6156        *status = U_ILLEGAL_ARGUMENT_ERROR;
6157        return 0;
6158    }
6159
6160    int32_t sourceIndex = 0;
6161    // Scan the string until we skip enough of the key OR reach the end of the key
6162    do {
6163        sourceIndex++;
6164        if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6165            noOfLevels--;
6166        }
6167    } while (noOfLevels > 0
6168        && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6169
6170    if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6171        && noOfLevels > 0) {
6172            *status = U_SORT_KEY_TOO_SHORT_WARNING;
6173    }
6174
6175
6176    // READ ME: this code assumes that the values for boundType
6177    // enum will not changes. They are set so that the enum value
6178    // corresponds to the number of extra bytes each bound type
6179    // needs.
6180    if(result != NULL && resultLength >= sourceIndex+boundType) {
6181        uprv_memcpy(result, source, sourceIndex);
6182        switch(boundType) {
6183            // Lower bound just gets terminated. No extra bytes
6184        case UCOL_BOUND_LOWER: // = 0
6185            break;
6186            // Upper bound needs one extra byte
6187        case UCOL_BOUND_UPPER: // = 1
6188            result[sourceIndex++] = 2;
6189            break;
6190            // Upper long bound needs two extra bytes
6191        case UCOL_BOUND_UPPER_LONG: // = 2
6192            result[sourceIndex++] = 0xFF;
6193            result[sourceIndex++] = 0xFF;
6194            break;
6195        default:
6196            *status = U_ILLEGAL_ARGUMENT_ERROR;
6197            return 0;
6198        }
6199        result[sourceIndex++] = 0;
6200
6201        return sourceIndex;
6202    } else {
6203        return sourceIndex+boundType+1;
6204    }
6205}
6206
6207/****************************************************************************/
6208/* Following are the functions that deal with the properties of a collator  */
6209/* there are new APIs and some compatibility APIs                           */
6210/****************************************************************************/
6211
6212static inline void
6213ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6214                    int32_t *primShift, int32_t *secShift, int32_t *terShift)
6215{
6216    uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6217    UBool reverseSecondary = FALSE;
6218    UBool continuation = isContinuation(CE);
6219    if(!continuation) {
6220        tertiary = (uint8_t)((CE & coll->tertiaryMask));
6221        tertiary ^= coll->caseSwitch;
6222        reverseSecondary = TRUE;
6223    } else {
6224        tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6225        tertiary &= UCOL_REMOVE_CASE;
6226        reverseSecondary = FALSE;
6227    }
6228
6229    secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6230    primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6231    primary1 = (uint8_t)(CE >> 8);
6232
6233    if(primary1 != 0) {
6234        if (coll->leadBytePermutationTable != NULL && !continuation) {
6235            primary1 = coll->leadBytePermutationTable[primary1];
6236        }
6237
6238        coll->latinOneCEs[ch] |= (primary1 << *primShift);
6239        *primShift -= 8;
6240    }
6241    if(primary2 != 0) {
6242        if(*primShift < 0) {
6243            coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6244            coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6245            coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6246            return;
6247        }
6248        coll->latinOneCEs[ch] |= (primary2 << *primShift);
6249        *primShift -= 8;
6250    }
6251    if(secondary != 0) {
6252        if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6253            coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6254            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6255        } else { // normal case
6256            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6257        }
6258        *secShift -= 8;
6259    }
6260    if(tertiary != 0) {
6261        coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6262        *terShift -= 8;
6263    }
6264}
6265
6266static inline UBool
6267ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6268    uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6269    if(newTable == NULL) {
6270      *status = U_MEMORY_ALLOCATION_ERROR;
6271      coll->latinOneFailed = TRUE;
6272      return FALSE;
6273    }
6274    int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6275    uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6276    uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6277    uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6278    uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6279    coll->latinOneTableLen = size;
6280    uprv_free(coll->latinOneCEs);
6281    coll->latinOneCEs = newTable;
6282    return TRUE;
6283}
6284
/**
 * (Re)builds the Latin-1 fast-path table coll->latinOneCEs.
 *
 * The table consists of three rows (primary, secondary, tertiary) of
 * coll->latinOneTableLen entries each. Entries 0..UCOL_ENDOFLATINONERANGE are
 * indexed by code unit; entries from UCOL_ENDOFLATINONERANGE+1 onwards hold
 * the results for contractions that start with one of those code units.
 * Entries whose weights do not fit are set to UCOL_BAIL_OUT_CE.
 *
 * @param coll   collator whose table is built; the table is allocated here
 *               if it does not exist yet
 * @param status set to U_MEMORY_ALLOCATION_ERROR on allocation failure and to
 *               U_UNSUPPORTED_ERROR for a contraction CE that has bits set in
 *               0x00FFF000
 * @return TRUE when the table was built, FALSE otherwise. On the
 *         cleanup_after_failure path coll->latinOneFailed is set as well.
 */
static UBool
ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
    UBool result = TRUE;
    // First use: allocate all three rows in a single block.
    if(coll->latinOneCEs == NULL) {
        coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
        if(coll->latinOneCEs == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return FALSE;
        }
        coll->latinOneTableLen = UCOL_LATINONETABLELEN;
    }
    UChar ch = 0;
    // Iterator over the one-code-unit buffer ch; used below to expand
    // EXPANSION_TAG and DIGIT_TAG entries into their CEs.
    UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
    // Check for null pointer
    if (U_FAILURE(*status)) {
        ucol_closeElements(it);
        return FALSE;
    }
    // Start from an all-zero table; weights are OR-ed in below.
    uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);

    // Bit positions for the next primary/secondary/tertiary byte of the
    // entry that is currently being filled.
    int32_t primShift = 24, secShift = 24, terShift = 24;
    uint32_t CE = 0;
    // Next free entry in the contraction area behind the per-code-unit entries.
    int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;

    // TODO: make safe if you get more than you wanted...
    for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
        primShift = 24; secShift = 24; terShift = 24;
        // Look up the CE: direct table below 0x100, otherwise the trie of
        // this collator with a fallback to the UCA trie.
        if(ch < 0x100) {
            CE = coll->latinOneMapping[ch];
        } else {
            CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            if(CE == UCOL_NOT_FOUND && coll->UCA) {
                CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
            }
        }
        if(CE < UCOL_NOT_FOUND) {
            // Plain CE: store its weights directly.
            ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
        } else {
            // Special CE: what happens depends on its tag.
            switch (getCETag(CE)) {
            case EXPANSION_TAG:
            case DIGIT_TAG:
                // Let the iterator produce the CEs and pack them one by one.
                ucol_setText(it, &ch, 1, status);
                while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
                    // A shift below zero means one of the rows is full:
                    // mark the entry as bail-out and stop.
                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                        coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        break;
                    }
                    ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                }
                break;
            case CONTRACTION_TAG:
                // here is the trick
                // F2 is contraction. We do something very similar to contractions
                // but have two indices, one in the real contraction table and the
                // other to where we stuffed things. This hopes that we don't have
                // many contractions (this should work for latin-1 tables).
                {
                    // Bits 12..23 are needed for our own offset below, so
                    // they must be free in the original CE.
                    if((CE & 0x00FFF000) != 0) {
                        *status = U_UNSUPPORTED_ERROR;
                        goto cleanup_after_failure;
                    }

                    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);

                    CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table

                    // All three rows get the same contraction CE for this code unit.
                    coll->latinOneCEs[ch] = CE;
                    coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
                    coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;

                    // We're going to jump into contraction table, pick the elements
                    // and use them
                    // Each step of this loop consumes exactly one entry of the
                    // contraction area (contractionOffset advances by one).
                    do {
                        CE = *(coll->contractionCEs +
                            (UCharOffset - coll->contractionIndex));
                        if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
                            uint32_t size;
                            uint32_t i;    /* general counter */
                            uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
                            size = getExpansionCount(CE);
                            //CE = *CEOffset++;
                            if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
                                // Counted expansion: exactly size CEs follow.
                                for(i = 0; i<size; i++) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            } else { /* else, we do */
                                // Count of 0: the CE list is terminated by a zero CE.
                                while(*CEOffset != 0) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            }
                            contractionOffset++;
                        } else if(CE < UCOL_NOT_FOUND) {
                            // Plain CE for this contraction entry.
                            ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
                        } else {
                            // Any other special CE is not handled here: bail out for this entry.
                            coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            contractionOffset++;
                        }
                        UCharOffset++;
                        primShift = 24; secShift = 24; terShift = 24;
                        // The next entry would be past the end of a row: double the table.
                        if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
                            if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
                                goto cleanup_after_failure;
                            }
                        }
                    } while(*UCharOffset != 0xFFFF); // 0xFFFF ends the list of this contraction
                }
                break;;
            case SPEC_PROC_TAG:
                {
                    // 0xB7 is a precontext character defined in UCA5.1, a special
                    // handle is implemented in order to save LatinOne table for
                    // most locales.
                    if (ch==0xb7) {
                        ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                    }
                    else {
                        // NOTE(review): *status is not set on this path.
                        goto cleanup_after_failure;
                    }
                }
                break;
            default:
                // Any other tag cannot be represented in the table.
                // NOTE(review): *status is not set on this path.
                goto cleanup_after_failure;
            }
        }
    }
    // compact table
    // Shrink every row to the number of entries that were actually used.
    if(contractionOffset < coll->latinOneTableLen) {
        if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
            goto cleanup_after_failure;
        }
    }
    ucol_closeElements(it);
    return result;

cleanup_after_failure:
    // status should already be set before arriving here.
    // Remember the failure in the collator and release the iterator.
    coll->latinOneFailed = TRUE;
    ucol_closeElements(it);
    return FALSE;
}
6441
6442void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6443    if(U_SUCCESS(*status)) {
6444        if(coll->caseFirst == UCOL_UPPER_FIRST) {
6445            coll->caseSwitch = UCOL_CASE_SWITCH;
6446        } else {
6447            coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6448        }
6449
6450        if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6451            coll->tertiaryMask = UCOL_REMOVE_CASE;
6452            coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6453            coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6454            coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6455            coll->tertiaryBottom = UCOL_COMMON_BOT3;
6456        } else {
6457            coll->tertiaryMask = UCOL_KEEP_CASE;
6458            coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6459            if(coll->caseFirst == UCOL_UPPER_FIRST) {
6460                coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6461                coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6462                coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6463            } else {
6464                coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6465                coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6466                coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6467            }
6468        }
6469
6470        /* Set the compression values */
6471        uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
6472        coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6473        coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6474
6475        if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6476            && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
6477        {
6478            coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6479        } else {
6480            coll->sortKeyGen = ucol_calcSortKey;
6481        }
6482        if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6483            && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
6484        {
6485            if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6486                if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6487                    //fprintf(stderr, "F");
6488                    coll->latinOneUse = TRUE;
6489                } else {
6490                    coll->latinOneUse = FALSE;
6491                }
6492                if(*status == U_UNSUPPORTED_ERROR) {
6493                    *status = U_ZERO_ERROR;
6494                }
6495            } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6496                coll->latinOneUse = TRUE;
6497            }
6498        } else {
6499            coll->latinOneUse = FALSE;
6500        }
6501    }
6502}
6503
6504U_CAPI uint32_t  U_EXPORT2
6505ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6506    if(U_FAILURE(*status) || coll == NULL) {
6507        return 0;
6508    }
6509    if(len == -1) {
6510        len = u_strlen(varTop);
6511    }
6512    if(len == 0) {
6513        *status = U_ILLEGAL_ARGUMENT_ERROR;
6514        return 0;
6515    }
6516
6517    if(coll->delegate!=NULL) {
6518      return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);
6519    }
6520
6521
6522    collIterate s;
6523    IInit_collIterate(coll, varTop, len, &s, status);
6524    if(U_FAILURE(*status)) {
6525        return 0;
6526    }
6527
6528    uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6529
6530    /* here we check if we have consumed all characters */
6531    /* you can put in either one character or a contraction */
6532    /* you shouldn't put more... */
6533    if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6534        *status = U_CE_NOT_FOUND_ERROR;
6535        return 0;
6536    }
6537
6538    uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6539
6540    if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6541        *status = U_PRIMARY_TOO_LONG_ERROR;
6542        return 0;
6543    }
6544    if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6545        coll->variableTopValueisDefault = FALSE;
6546        coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6547    }
6548
6549    /* To avoid memory leak, free the offset buffer if necessary. */
6550    ucol_freeOffsetBuffer(&s);
6551
6552    return CE & UCOL_PRIMARYMASK;
6553}
6554
6555U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6556    if(U_FAILURE(*status) || coll == NULL) {
6557        return 0;
6558    }
6559    if(coll->delegate!=NULL) {
6560      return ((const Collator*)coll->delegate)->getVariableTop(*status);
6561    }
6562    return coll->variableTopValue<<16;
6563}
6564
6565U_CAPI void  U_EXPORT2
6566ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6567    if(U_FAILURE(*status) || coll == NULL) {
6568        return;
6569    }
6570
6571    if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6572        coll->variableTopValueisDefault = FALSE;
6573        coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6574    }
6575}
6576/* Attribute setter API */
6577U_CAPI void  U_EXPORT2
6578ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6579    if(U_FAILURE(*status) || coll == NULL) {
6580      return;
6581    }
6582
6583    if(coll->delegate != NULL) {
6584      ((Collator*)coll->delegate)->setAttribute(attr,value,*status);
6585      return;
6586    }
6587
6588    UColAttributeValue oldFrench = coll->frenchCollation;
6589    UColAttributeValue oldCaseFirst = coll->caseFirst;
6590    switch(attr) {
6591    case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6592        if(value == UCOL_ON) {
6593            coll->numericCollation = UCOL_ON;
6594            coll->numericCollationisDefault = FALSE;
6595        } else if (value == UCOL_OFF) {
6596            coll->numericCollation = UCOL_OFF;
6597            coll->numericCollationisDefault = FALSE;
6598        } else if (value == UCOL_DEFAULT) {
6599            coll->numericCollationisDefault = TRUE;
6600            coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6601        } else {
6602            *status = U_ILLEGAL_ARGUMENT_ERROR;
6603        }
6604        break;
6605    case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6606        if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) {
6607            // This attribute is an implementation detail of the CLDR Japanese tailoring.
6608            // The implementation might change to use a different mechanism
6609            // to achieve the same Japanese sort order.
6610            // Since ICU 50, this attribute is not settable any more via API functions.
6611        } else {
6612            *status = U_ILLEGAL_ARGUMENT_ERROR;
6613        }
6614        break;
6615    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6616        if(value == UCOL_ON) {
6617            coll->frenchCollation = UCOL_ON;
6618            coll->frenchCollationisDefault = FALSE;
6619        } else if (value == UCOL_OFF) {
6620            coll->frenchCollation = UCOL_OFF;
6621            coll->frenchCollationisDefault = FALSE;
6622        } else if (value == UCOL_DEFAULT) {
6623            coll->frenchCollationisDefault = TRUE;
6624            coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6625        } else {
6626            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6627        }
6628        break;
6629    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6630        if(value == UCOL_SHIFTED) {
6631            coll->alternateHandling = UCOL_SHIFTED;
6632            coll->alternateHandlingisDefault = FALSE;
6633        } else if (value == UCOL_NON_IGNORABLE) {
6634            coll->alternateHandling = UCOL_NON_IGNORABLE;
6635            coll->alternateHandlingisDefault = FALSE;
6636        } else if (value == UCOL_DEFAULT) {
6637            coll->alternateHandlingisDefault = TRUE;
6638            coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6639        } else {
6640            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6641        }
6642        break;
6643    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6644        if(value == UCOL_LOWER_FIRST) {
6645            coll->caseFirst = UCOL_LOWER_FIRST;
6646            coll->caseFirstisDefault = FALSE;
6647        } else if (value == UCOL_UPPER_FIRST) {
6648            coll->caseFirst = UCOL_UPPER_FIRST;
6649            coll->caseFirstisDefault = FALSE;
6650        } else if (value == UCOL_OFF) {
6651            coll->caseFirst = UCOL_OFF;
6652            coll->caseFirstisDefault = FALSE;
6653        } else if (value == UCOL_DEFAULT) {
6654            coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6655            coll->caseFirstisDefault = TRUE;
6656        } else {
6657            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6658        }
6659        break;
6660    case UCOL_CASE_LEVEL: /* do we have an extra case level */
6661        if(value == UCOL_ON) {
6662            coll->caseLevel = UCOL_ON;
6663            coll->caseLevelisDefault = FALSE;
6664        } else if (value == UCOL_OFF) {
6665            coll->caseLevel = UCOL_OFF;
6666            coll->caseLevelisDefault = FALSE;
6667        } else if (value == UCOL_DEFAULT) {
6668            coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6669            coll->caseLevelisDefault = TRUE;
6670        } else {
6671            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6672        }
6673        break;
6674    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6675        if(value == UCOL_ON) {
6676            coll->normalizationMode = UCOL_ON;
6677            coll->normalizationModeisDefault = FALSE;
6678            initializeFCD(status);
6679        } else if (value == UCOL_OFF) {
6680            coll->normalizationMode = UCOL_OFF;
6681            coll->normalizationModeisDefault = FALSE;
6682        } else if (value == UCOL_DEFAULT) {
6683            coll->normalizationModeisDefault = TRUE;
6684            coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6685            if(coll->normalizationMode == UCOL_ON) {
6686                initializeFCD(status);
6687            }
6688        } else {
6689            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6690        }
6691        break;
6692    case UCOL_STRENGTH:         /* attribute for strength */
6693        if (value == UCOL_DEFAULT) {
6694            coll->strengthisDefault = TRUE;
6695            coll->strength = (UColAttributeValue)coll->options->strength;
6696        } else if (value <= UCOL_IDENTICAL) {
6697            coll->strengthisDefault = FALSE;
6698            coll->strength = value;
6699        } else {
6700            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6701        }
6702        break;
6703    case UCOL_ATTRIBUTE_COUNT:
6704    default:
6705        *status = U_ILLEGAL_ARGUMENT_ERROR;
6706        break;
6707    }
6708    if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6709        coll->latinOneRegenTable = TRUE;
6710    } else {
6711        coll->latinOneRegenTable = FALSE;
6712    }
6713    ucol_updateInternalState(coll, status);
6714}
6715
6716U_CAPI UColAttributeValue  U_EXPORT2
6717ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6718    if(U_FAILURE(*status) || coll == NULL) {
6719      return UCOL_DEFAULT;
6720    }
6721
6722    if(coll->delegate != NULL) {
6723      return ((Collator*)coll->delegate)->getAttribute(attr,*status);
6724    }
6725
6726    switch(attr) {
6727    case UCOL_NUMERIC_COLLATION:
6728      return coll->numericCollation;
6729    case UCOL_HIRAGANA_QUATERNARY_MODE:
6730      return coll->hiraganaQ;
6731    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6732        return coll->frenchCollation;
6733    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6734        return coll->alternateHandling;
6735    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6736        return coll->caseFirst;
6737    case UCOL_CASE_LEVEL: /* do we have an extra case level */
6738        return coll->caseLevel;
6739    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6740        return coll->normalizationMode;
6741    case UCOL_STRENGTH:         /* attribute for strength */
6742        return coll->strength;
6743    case UCOL_ATTRIBUTE_COUNT:
6744    default:
6745        *status = U_ILLEGAL_ARGUMENT_ERROR;
6746        break;
6747    }
6748    return UCOL_DEFAULT;
6749}
6750
6751U_CAPI void U_EXPORT2
6752ucol_setStrength(    UCollator                *coll,
6753            UCollationStrength        strength)
6754{
6755    UErrorCode status = U_ZERO_ERROR;
6756    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6757}
6758
6759U_CAPI UCollationStrength U_EXPORT2
6760ucol_getStrength(const UCollator *coll)
6761{
6762    UErrorCode status = U_ZERO_ERROR;
6763    return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6764}
6765
6766U_CAPI int32_t U_EXPORT2
6767ucol_getReorderCodes(const UCollator *coll,
6768                    int32_t *dest,
6769                    int32_t destCapacity,
6770                    UErrorCode *status) {
6771    if (U_FAILURE(*status)) {
6772        return 0;
6773    }
6774
6775    if(coll->delegate!=NULL) {
6776      return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status);
6777    }
6778
6779    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6780        *status = U_ILLEGAL_ARGUMENT_ERROR;
6781        return 0;
6782    }
6783
6784#ifdef UCOL_DEBUG
6785    printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
6786    printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
6787#endif
6788
6789    if (coll->reorderCodesLength > destCapacity) {
6790        *status = U_BUFFER_OVERFLOW_ERROR;
6791        return coll->reorderCodesLength;
6792    }
6793    for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
6794        dest[i] = coll->reorderCodes[i];
6795    }
6796    return coll->reorderCodesLength;
6797}
6798
6799U_CAPI void U_EXPORT2
6800ucol_setReorderCodes(UCollator* coll,
6801                    const int32_t* reorderCodes,
6802                    int32_t reorderCodesLength,
6803                    UErrorCode *status) {
6804    if (U_FAILURE(*status)) {
6805        return;
6806    }
6807
6808    if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
6809        *status = U_ILLEGAL_ARGUMENT_ERROR;
6810        return;
6811    }
6812
6813    if(coll->delegate!=NULL) {
6814      ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
6815      return;
6816    }
6817
6818    if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
6819        uprv_free(coll->reorderCodes);
6820    }
6821    coll->reorderCodes = NULL;
6822    coll->reorderCodesLength = 0;
6823    if (reorderCodesLength == 0) {
6824        if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
6825            uprv_free(coll->leadBytePermutationTable);
6826        }
6827        coll->leadBytePermutationTable = NULL;
6828        return;
6829    }
6830    coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
6831    if (coll->reorderCodes == NULL) {
6832        *status = U_MEMORY_ALLOCATION_ERROR;
6833        return;
6834    }
6835    coll->freeReorderCodesOnClose = TRUE;
6836    for (int32_t i = 0; i < reorderCodesLength; i++) {
6837        coll->reorderCodes[i] = reorderCodes[i];
6838    }
6839    coll->reorderCodesLength = reorderCodesLength;
6840    ucol_buildPermutationTable(coll, status);
6841}
6842
6843U_CAPI int32_t U_EXPORT2
6844ucol_getEquivalentReorderCodes(int32_t reorderCode,
6845                    int32_t* dest,
6846                    int32_t destCapacity,
6847                    UErrorCode *pErrorCode) {
6848    bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
6849    uint16_t leadBytes[256];
6850    int leadBytesCount;
6851    int leadByteIndex;
6852    int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
6853    int reorderCodesForLeadByteCount;
6854    int reorderCodeIndex;
6855
6856    int32_t equivalentCodesCount = 0;
6857    int setIndex;
6858
6859    if (U_FAILURE(*pErrorCode)) {
6860        return 0;
6861    }
6862
6863    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6864        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
6865        return 0;
6866    }
6867
6868    uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
6869
6870    const UCollator* uca = ucol_initUCA(pErrorCode);
6871    if (U_FAILURE(*pErrorCode)) {
6872	return 0;
6873    }
6874    leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
6875    for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
6876        reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
6877            uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
6878        for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
6879            equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
6880        }
6881    }
6882
6883    for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6884        if (equivalentCodesSet[setIndex] == true) {
6885            equivalentCodesCount++;
6886        }
6887    }
6888
6889    if (destCapacity == 0) {
6890        return equivalentCodesCount;
6891    }
6892
6893    equivalentCodesCount = 0;
6894    for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6895        if (equivalentCodesSet[setIndex] == true) {
6896            dest[equivalentCodesCount++] = setIndex;
6897            if (equivalentCodesCount >= destCapacity) {
6898                break;
6899            }
6900        }
6901    }
6902    return equivalentCodesCount;
6903}
6904
6905
6906/****************************************************************************/
6907/* Following are misc functions                                             */
6908/* there are new APIs and some compatibility APIs                           */
6909/****************************************************************************/
6910
6911U_CAPI void U_EXPORT2
6912ucol_getVersion(const UCollator* coll,
6913                UVersionInfo versionInfo)
6914{
6915    if(coll->delegate!=NULL) {
6916      ((const Collator*)coll->delegate)->getVersion(versionInfo);
6917      return;
6918    }
6919    /* RunTime version  */
6920    uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6921    /* Builder version*/
6922    uint8_t bdVersion = coll->image->version[0];
6923
6924    /* Charset Version. Need to get the version from cnv files
6925     * makeconv should populate cnv files with version and
6926     * an api has to be provided in ucnv.h to obtain this version
6927     */
6928    uint8_t csVersion = 0;
6929
6930    /* combine the version info */
6931    uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6932
6933    /* Tailoring rules */
6934    versionInfo[0] = (uint8_t)(cmbVersion>>8);
6935    versionInfo[1] = (uint8_t)cmbVersion;
6936    versionInfo[2] = coll->image->version[1];
6937    if(coll->UCA) {
6938        /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
6939        versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
6940    } else {
6941        versionInfo[3] = 0;
6942    }
6943}
6944
6945
6946/* This internal API checks whether a character is tailored or not */
6947U_CAPI UBool  U_EXPORT2
6948ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6949    if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
6950        return FALSE;
6951    }
6952
6953    uint32_t CE = UCOL_NOT_FOUND;
6954    const UChar *ContractionStart = NULL;
6955    if(u < 0x100) { /* latin-1 */
6956        CE = coll->latinOneMapping[u];
6957        if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6958            return FALSE;
6959        }
6960    } else { /* regular */
6961        CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6962    }
6963
6964    if(isContraction(CE)) {
6965        ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6966        CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6967    }
6968
6969    return (UBool)(CE != UCOL_NOT_FOUND);
6970}
6971
6972
6973/****************************************************************************/
6974/* Following are the string compare functions                               */
6975/*                                                                          */
6976/****************************************************************************/
6977
6978
6979/*  ucol_checkIdent    internal function.  Does byte level string compare.   */
6980/*                     Used by strcoll if strength == identical and strings  */
6981/*                     are otherwise equal.                                  */
6982/*                                                                           */
6983/*                     Comparison must be done on NFD normalized strings.    */
6984/*                     FCD is not good enough.                               */
6985
6986static
6987UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
6988{
6989    // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6990    // of same type, but that doesn't really mean that it will stay that way.
6991    int32_t            comparison;
6992
6993    if (sColl->flags & UCOL_USE_ITERATOR) {
6994        // The division for the array length may truncate the array size to
6995        // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6996        // for all platforms anyway.
6997        UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6998        UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6999        UNormIterator *sNIt = NULL, *tNIt = NULL;
7000        sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7001        tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7002        sColl->iterator->move(sColl->iterator, 0, UITER_START);
7003        tColl->iterator->move(tColl->iterator, 0, UITER_START);
7004        UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7005        UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7006        comparison = u_strCompareIter(sIt, tIt, TRUE);
7007        unorm_closeIter(sNIt);
7008        unorm_closeIter(tNIt);
7009    } else {
7010        int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
7011        const UChar *sBuf = sColl->string;
7012        int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
7013        const UChar *tBuf = tColl->string;
7014
7015        if (normalize) {
7016            *status = U_ZERO_ERROR;
7017            // Note: We could use Normalizer::compare() or similar, but for short strings
7018            // which may not be in FCD it might be faster to just NFD them.
7019            // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
7020            // NFD'ing immediately might be faster for long strings,
7021            // but string comparison is usually done on relatively short strings.
7022            sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
7023                                  sColl->writableBuffer,
7024                                  *status);
7025            tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
7026                                  tColl->writableBuffer,
7027                                  *status);
7028            if(U_FAILURE(*status)) {
7029                return UCOL_LESS;
7030            }
7031            comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
7032        } else {
7033            comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
7034        }
7035    }
7036
7037    if (comparison < 0) {
7038        return UCOL_LESS;
7039    } else if (comparison == 0) {
7040        return UCOL_EQUAL;
7041    } else /* comparison > 0 */ {
7042        return UCOL_GREATER;
7043    }
7044}
7045
7046/*  CEBuf - A struct and some inline functions to handle the saving    */
7047/*          of CEs in a buffer within ucol_strcoll                     */
7048
7049#define UCOL_CEBUF_SIZE 512
7050typedef struct ucol_CEBuf {
7051    uint32_t    *buf;
7052    uint32_t    *endp;
7053    uint32_t    *pos;
7054    uint32_t     localArray[UCOL_CEBUF_SIZE];
7055} ucol_CEBuf;
7056
7057
7058static
7059inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7060    (b)->buf = (b)->pos = (b)->localArray;
7061    (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7062}
7063
7064static
7065void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
7066    uint32_t  oldSize;
7067    uint32_t  newSize;
7068    uint32_t  *newBuf;
7069
7070    ci->flags |= UCOL_ITER_ALLOCATED;
7071    oldSize = (uint32_t)(b->pos - b->buf);
7072    newSize = oldSize * 2;
7073    newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7074    if(newBuf == NULL) {
7075        *status = U_MEMORY_ALLOCATION_ERROR;
7076    }
7077    else {
7078        uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7079        if (b->buf != b->localArray) {
7080            uprv_free(b->buf);
7081        }
7082        b->buf = newBuf;
7083        b->endp = b->buf + newSize;
7084        b->pos  = b->buf + oldSize;
7085    }
7086}
7087
7088static
7089inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7090    if (b->pos == b->endp) {
7091        ucol_CEBuf_Expand(b, ci, status);
7092    }
7093    if (U_SUCCESS(*status)) {
7094        *(b)->pos++ = ce;
7095    }
7096}
7097
7098/* This is a trick string compare function that goes in and uses sortkeys to compare */
7099/* It is used when compare gets in trouble and needs to bail out                     */
7100static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7101                                                  collIterate *tColl,
7102                                                  UErrorCode *status)
7103{
7104    uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7105    uint8_t *sourceKeyP = sourceKey;
7106    uint8_t *targetKeyP = targetKey;
7107    int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7108    const UCollator *coll = sColl->coll;
7109    const UChar *source = NULL;
7110    const UChar *target = NULL;
7111    int32_t result = UCOL_EQUAL;
7112    UnicodeString sourceString, targetString;
7113    int32_t sourceLength;
7114    int32_t targetLength;
7115
7116    if(sColl->flags & UCOL_USE_ITERATOR) {
7117        sColl->iterator->move(sColl->iterator, 0, UITER_START);
7118        tColl->iterator->move(tColl->iterator, 0, UITER_START);
7119        UChar32 c;
7120        while((c=sColl->iterator->next(sColl->iterator))>=0) {
7121            sourceString.append((UChar)c);
7122        }
7123        while((c=tColl->iterator->next(tColl->iterator))>=0) {
7124            targetString.append((UChar)c);
7125        }
7126        source = sourceString.getBuffer();
7127        sourceLength = sourceString.length();
7128        target = targetString.getBuffer();
7129        targetLength = targetString.length();
7130    } else { // no iterators
7131        sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
7132        targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
7133        source = sColl->string;
7134        target = tColl->string;
7135    }
7136
7137
7138
7139    sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7140    if(sourceKeyLen > UCOL_MAX_BUFFER) {
7141        sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7142        if(sourceKeyP == NULL) {
7143            *status = U_MEMORY_ALLOCATION_ERROR;
7144            goto cleanup_and_do_compare;
7145        }
7146        sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7147    }
7148
7149    targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7150    if(targetKeyLen > UCOL_MAX_BUFFER) {
7151        targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7152        if(targetKeyP == NULL) {
7153            *status = U_MEMORY_ALLOCATION_ERROR;
7154            goto cleanup_and_do_compare;
7155        }
7156        targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7157    }
7158
7159    result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7160
7161cleanup_and_do_compare:
7162    if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7163        uprv_free(sourceKeyP);
7164    }
7165
7166    if(targetKeyP != NULL && targetKeyP != targetKey) {
7167        uprv_free(targetKeyP);
7168    }
7169
7170    if(result<0) {
7171        return UCOL_LESS;
7172    } else if(result>0) {
7173        return UCOL_GREATER;
7174    } else {
7175        return UCOL_EQUAL;
7176    }
7177}
7178
7179
7180static UCollationResult
7181ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
7182{
7183    U_ALIGN_CODE(16);
7184
7185    const UCollator *coll = sColl->coll;
7186
7187
7188    // setting up the collator parameters
7189    UColAttributeValue strength = coll->strength;
7190    UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7191
7192    UBool checkSecTer = initialCheckSecTer;
7193    UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7194    UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7195    UBool checkIdent = (strength == UCOL_IDENTICAL);
7196    UBool checkCase = (coll->caseLevel == UCOL_ON);
7197    UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7198    UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7199    UBool qShifted = shifted && checkQuad;
7200    UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7201
7202    if(doHiragana && shifted) {
7203        return (ucol_compareUsingSortKeys(sColl, tColl, status));
7204    }
7205    uint8_t caseSwitch = coll->caseSwitch;
7206    uint8_t tertiaryMask = coll->tertiaryMask;
7207
7208    // This is the lowest primary value that will not be ignored if shifted
7209    uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7210
7211    UCollationResult result = UCOL_EQUAL;
7212    UCollationResult hirResult = UCOL_EQUAL;
7213
7214    // Preparing the CE buffers. They will be filled during the primary phase
7215    ucol_CEBuf   sCEs;
7216    ucol_CEBuf   tCEs;
7217    UCOL_INIT_CEBUF(&sCEs);
7218    UCOL_INIT_CEBUF(&tCEs);
7219
7220    uint32_t secS = 0, secT = 0;
7221    uint32_t sOrder=0, tOrder=0;
7222
7223    // Non shifted primary processing is quite simple
7224    if(!shifted) {
7225        for(;;) {
7226
7227            // We fetch CEs until we hit a non ignorable primary or end.
7228            do {
7229                // We get the next CE
7230                sOrder = ucol_IGetNextCE(coll, sColl, status);
7231                // Stuff it in the buffer
7232                UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7233                // And keep just the primary part.
7234                sOrder &= UCOL_PRIMARYMASK;
7235            } while(sOrder == 0);
7236
7237            // see the comments on the above block
7238            do {
7239                tOrder = ucol_IGetNextCE(coll, tColl, status);
7240                UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7241                tOrder &= UCOL_PRIMARYMASK;
7242            } while(tOrder == 0);
7243
7244            // if both primaries are the same
7245            if(sOrder == tOrder) {
7246                // and there are no more CEs, we advance to the next level
7247                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7248                    break;
7249                }
7250                if(doHiragana && hirResult == UCOL_EQUAL) {
7251                    if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7252                        hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7253                            ? UCOL_LESS:UCOL_GREATER;
7254                    }
7255                }
7256            } else {
7257                // only need to check one for continuation
7258                // if one is then the other must be or the preceding CE would be a prefix of the other
7259                if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
7260                    sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7261                    tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7262                }
7263                // if two primaries are different, we are done
7264                result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
7265                goto commonReturn;
7266            }
7267        } // no primary difference... do the rest from the buffers
7268    } else { // shifted - do a slightly more complicated processing :)
7269        for(;;) {
7270            UBool sInShifted = FALSE;
7271            UBool tInShifted = FALSE;
7272            // This version of code can be refactored. However, it seems easier to understand this way.
7273            // Source loop. Sam as the target loop.
7274            for(;;) {
7275                sOrder = ucol_IGetNextCE(coll, sColl, status);
7276                if(sOrder == UCOL_NO_MORE_CES) {
7277                    UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7278                    break;
7279                } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7280                    /* UCA amendment - ignore ignorables that follow shifted code points */
7281                    continue;
7282                } else if(isContinuation(sOrder)) {
7283                    if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7284                        if(sInShifted) {
7285                            sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7286                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7287                            continue;
7288                        } else {
7289                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7290                            break;
7291                        }
7292                    } else { /* Just lower level values */
7293                        if(sInShifted) {
7294                            continue;
7295                        } else {
7296                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7297                            continue;
7298                        }
7299                    }
7300                } else { /* regular */
7301                    if(coll->leadBytePermutationTable != NULL){
7302                        sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7303                    }
7304                    if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7305                        UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7306                        break;
7307                    } else {
7308                        if((sOrder & UCOL_PRIMARYMASK) > 0) {
7309                            sInShifted = TRUE;
7310                            sOrder &= UCOL_PRIMARYMASK;
7311                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7312                            continue;
7313                        } else {
7314                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7315                            sInShifted = FALSE;
7316                            continue;
7317                        }
7318                    }
7319                }
7320            }
7321            sOrder &= UCOL_PRIMARYMASK;
7322            sInShifted = FALSE;
7323
7324            for(;;) {
7325                tOrder = ucol_IGetNextCE(coll, tColl, status);
7326                if(tOrder == UCOL_NO_MORE_CES) {
7327                    UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7328                    break;
7329                } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7330                    /* UCA amendment - ignore ignorables that follow shifted code points */
7331                    continue;
7332                } else if(isContinuation(tOrder)) {
7333                    if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7334                        if(tInShifted) {
7335                            tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7336                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7337                            continue;
7338                        } else {
7339                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7340                            break;
7341                        }
7342                    } else { /* Just lower level values */
7343                        if(tInShifted) {
7344                            continue;
7345                        } else {
7346                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7347                            continue;
7348                        }
7349                    }
7350                } else { /* regular */
7351                    if(coll->leadBytePermutationTable != NULL){
7352                        tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7353                    }
7354                    if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7355                        UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7356                        break;
7357                    } else {
7358                        if((tOrder & UCOL_PRIMARYMASK) > 0) {
7359                            tInShifted = TRUE;
7360                            tOrder &= UCOL_PRIMARYMASK;
7361                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7362                            continue;
7363                        } else {
7364                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7365                            tInShifted = FALSE;
7366                            continue;
7367                        }
7368                    }
7369                }
7370            }
7371            tOrder &= UCOL_PRIMARYMASK;
7372            tInShifted = FALSE;
7373
7374            if(sOrder == tOrder) {
7375                /*
7376                if(doHiragana && hirResult == UCOL_EQUAL) {
7377                if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7378                hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7379                ? UCOL_LESS:UCOL_GREATER;
7380                }
7381                }
7382                */
7383                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7384                    break;
7385                } else {
7386                    sOrder = 0;
7387                    tOrder = 0;
7388                    continue;
7389                }
7390            } else {
7391                result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7392                goto commonReturn;
7393            }
7394        } /* no primary difference... do the rest from the buffers */
7395    }
7396
7397    /* now, we're gonna reexamine collected CEs */
7398    uint32_t    *sCE;
7399    uint32_t    *tCE;
7400
7401    /* This is the secondary level of comparison */
7402    if(checkSecTer) {
7403        if(!isFrenchSec) { /* normal */
7404            sCE = sCEs.buf;
7405            tCE = tCEs.buf;
7406            for(;;) {
7407                while (secS == 0) {
7408                    secS = *(sCE++) & UCOL_SECONDARYMASK;
7409                }
7410
7411                while(secT == 0) {
7412                    secT = *(tCE++) & UCOL_SECONDARYMASK;
7413                }
7414
7415                if(secS == secT) {
7416                    if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7417                        break;
7418                    } else {
7419                        secS = 0; secT = 0;
7420                        continue;
7421                    }
7422                } else {
7423                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7424                    goto commonReturn;
7425                }
7426            }
7427        } else { /* do the French */
7428            uint32_t *sCESave = NULL;
7429            uint32_t *tCESave = NULL;
7430            sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7431            tCE = tCEs.pos-2;
7432            for(;;) {
7433                while (secS == 0 && sCE >= sCEs.buf) {
7434                    if(sCESave == NULL) {
7435                        secS = *(sCE--);
7436                        if(isContinuation(secS)) {
7437                            while(isContinuation(secS = *(sCE--)))
7438                                ;
7439                            /* after this, secS has the start of continuation, and sCEs points before that */
7440                            sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7441                            sCE+=2;  /* need to point to the first continuation CP */
7442                            /* However, now you can just continue doing stuff */
7443                        }
7444                    } else {
7445                        secS = *(sCE++);
7446                        if(!isContinuation(secS)) { /* This means we have finished with this cont */
7447                            sCE = sCESave;            /* reset the pointer to before continuation */
7448                            sCESave = NULL;
7449                            secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
7450                            continue;
7451                        }
7452                    }
7453                    secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7454                }
7455
7456                while(secT == 0 && tCE >= tCEs.buf) {
7457                    if(tCESave == NULL) {
7458                        secT = *(tCE--);
7459                        if(isContinuation(secT)) {
7460                            while(isContinuation(secT = *(tCE--)))
7461                                ;
7462                            /* after this, secS has the start of continuation, and sCEs points before that */
7463                            tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7464                            tCE+=2;  /* need to point to the first continuation CP */
7465                            /* However, now you can just continue doing stuff */
7466                        }
7467                    } else {
7468                        secT = *(tCE++);
7469                        if(!isContinuation(secT)) { /* This means we have finished with this cont */
7470                            tCE = tCESave;          /* reset the pointer to before continuation */
7471                            tCESave = NULL;
7472                            secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
7473                            continue;
7474                        }
7475                    }
7476                    secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7477                }
7478
7479                if(secS == secT) {
7480                    if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7481                        break;
7482                    } else {
7483                        secS = 0; secT = 0;
7484                        continue;
7485                    }
7486                } else {
7487                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7488                    goto commonReturn;
7489                }
7490            }
7491        }
7492    }
7493
7494    /* doing the case bit */
7495    if(checkCase) {
7496        sCE = sCEs.buf;
7497        tCE = tCEs.buf;
7498        for(;;) {
7499            while((secS & UCOL_REMOVE_CASE) == 0) {
7500                if(!isContinuation(*sCE++)) {
7501                    secS =*(sCE-1);
7502                    if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7503                        // primary ignorables should not be considered on the case level when the strength is primary
7504                        // otherwise, the CEs stop being well-formed
7505                        secS &= UCOL_TERT_CASE_MASK;
7506                        secS ^= caseSwitch;
7507                    } else {
7508                        secS = 0;
7509                    }
7510                } else {
7511                    secS = 0;
7512                }
7513            }
7514
7515            while((secT & UCOL_REMOVE_CASE) == 0) {
7516                if(!isContinuation(*tCE++)) {
7517                    secT = *(tCE-1);
7518                    if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7519                        // primary ignorables should not be considered on the case level when the strength is primary
7520                        // otherwise, the CEs stop being well-formed
7521                        secT &= UCOL_TERT_CASE_MASK;
7522                        secT ^= caseSwitch;
7523                    } else {
7524                        secT = 0;
7525                    }
7526                } else {
7527                    secT = 0;
7528                }
7529            }
7530
7531            if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7532                result = UCOL_LESS;
7533                goto commonReturn;
7534            } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7535                result = UCOL_GREATER;
7536                goto commonReturn;
7537            }
7538
7539            if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7540                break;
7541            } else {
7542                secS = 0;
7543                secT = 0;
7544            }
7545        }
7546    }
7547
7548    /* Tertiary level */
7549    if(checkTertiary) {
7550        secS = 0;
7551        secT = 0;
7552        sCE = sCEs.buf;
7553        tCE = tCEs.buf;
7554        for(;;) {
7555            while((secS & UCOL_REMOVE_CASE) == 0) {
7556                secS = *(sCE++) & tertiaryMask;
7557                if(!isContinuation(secS)) {
7558                    secS ^= caseSwitch;
7559                } else {
7560                    secS &= UCOL_REMOVE_CASE;
7561                }
7562            }
7563
7564            while((secT & UCOL_REMOVE_CASE)  == 0) {
7565                secT = *(tCE++) & tertiaryMask;
7566                if(!isContinuation(secT)) {
7567                    secT ^= caseSwitch;
7568                } else {
7569                    secT &= UCOL_REMOVE_CASE;
7570                }
7571            }
7572
7573            if(secS == secT) {
7574                if((secS & UCOL_REMOVE_CASE) == 1) {
7575                    break;
7576                } else {
7577                    secS = 0; secT = 0;
7578                    continue;
7579                }
7580            } else {
7581                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7582                goto commonReturn;
7583            }
7584        }
7585    }
7586
7587
7588    if(qShifted /*checkQuad*/) {
7589        UBool sInShifted = TRUE;
7590        UBool tInShifted = TRUE;
7591        secS = 0;
7592        secT = 0;
7593        sCE = sCEs.buf;
7594        tCE = tCEs.buf;
7595        for(;;) {
7596            while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
7597                secS = *(sCE++);
7598                if(isContinuation(secS)) {
7599                    if(!sInShifted) {
7600                        continue;
7601                    }
7602                } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7603                    secS = UCOL_PRIMARYMASK;
7604                    sInShifted = FALSE;
7605                } else {
7606                    sInShifted = TRUE;
7607                }
7608            }
7609            secS &= UCOL_PRIMARYMASK;
7610
7611
7612            while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
7613                secT = *(tCE++);
7614                if(isContinuation(secT)) {
7615                    if(!tInShifted) {
7616                        continue;
7617                    }
7618                } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7619                    secT = UCOL_PRIMARYMASK;
7620                    tInShifted = FALSE;
7621                } else {
7622                    tInShifted = TRUE;
7623                }
7624            }
7625            secT &= UCOL_PRIMARYMASK;
7626
7627            if(secS == secT) {
7628                if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7629                    break;
7630                } else {
7631                    secS = 0; secT = 0;
7632                    continue;
7633                }
7634            } else {
7635                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7636                goto commonReturn;
7637            }
7638        }
7639    } else if(doHiragana && hirResult != UCOL_EQUAL) {
7640        // If we're fine on quaternaries, we might be different
7641        // on Hiragana. This, however, might fail us in shifted.
7642        result = hirResult;
7643        goto commonReturn;
7644    }
7645
7646    /*  For IDENTICAL comparisons, we use a bitwise character comparison */
7647    /*  as a tiebreaker if all else is equal.                                */
7648    /*  Getting here  should be quite rare - strings are not identical -     */
7649    /*     that is checked first, but compared == through all other checks.  */
7650    if(checkIdent)
7651    {
7652        //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7653        result = ucol_checkIdent(sColl, tColl, TRUE, status);
7654    }
7655
7656commonReturn:
7657    if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7658        if (sCEs.buf != sCEs.localArray ) {
7659            uprv_free(sCEs.buf);
7660        }
7661        if (tCEs.buf != tCEs.localArray ) {
7662            uprv_free(tCEs.buf);
7663        }
7664    }
7665
7666    return result;
7667}
7668
7669static UCollationResult
7670ucol_strcollRegular(const UCollator *coll,
7671                    const UChar *source, int32_t sourceLength,
7672                    const UChar *target, int32_t targetLength,
7673                    UErrorCode *status) {
7674    collIterate sColl, tColl;
7675    // Preparing the context objects for iterating over strings
7676    IInit_collIterate(coll, source, sourceLength, &sColl, status);
7677    IInit_collIterate(coll, target, targetLength, &tColl, status);
7678    if(U_FAILURE(*status)) {
7679        return UCOL_LESS;
7680    }
7681    return ucol_strcollRegular(&sColl, &tColl, status);
7682}
7683
7684static inline uint32_t
7685ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7686                          uint32_t CE, const UChar *s, int32_t *index, int32_t len)
7687{
7688    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7689    int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7690    int32_t offset = 1;
7691    UChar schar = 0, tchar = 0;
7692
7693    for(;;) {
7694        if(len == -1) {
7695            if(s[*index] == 0) { // end of string
7696                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7697            } else {
7698                schar = s[*index];
7699            }
7700        } else {
7701            if(*index == len) {
7702                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7703            } else {
7704                schar = s[*index];
7705            }
7706        }
7707
7708        while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7709            offset++;
7710        }
7711
7712        if (schar == tchar) {
7713            (*index)++;
7714            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7715        }
7716        else
7717        {
7718            if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7719                return UCOL_BAIL_OUT_CE;
7720            }
7721            // skip completely ignorables
7722            uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7723            if(isZeroCE == 0) { // we have to ignore completely ignorables
7724                (*index)++;
7725                continue;
7726            }
7727
7728            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7729        }
7730    }
7731}
7732
7733
7734/**
7735 * This is a fast strcoll, geared towards text in Latin-1.
7736 * It supports contractions of size two, French secondaries
7737 * and case switching. You can use it with strengths primary
7738 * to tertiary. It does not support shifted and case level.
7739 * It relies on the table build by setupLatin1Table. If it
7740 * doesn't understand something, it will go to the regular
7741 * strcoll.
7742 */
7743static UCollationResult
7744ucol_strcollUseLatin1( const UCollator    *coll,
7745              const UChar        *source,
7746              int32_t            sLen,
7747              const UChar        *target,
7748              int32_t            tLen,
7749              UErrorCode *status)
7750{
7751    U_ALIGN_CODE(16);
7752    int32_t strength = coll->strength;
7753
7754    int32_t sIndex = 0, tIndex = 0;
7755    UChar sChar = 0, tChar = 0;
7756    uint32_t sOrder=0, tOrder=0;
7757
7758    UBool endOfSource = FALSE;
7759
7760    uint32_t *elements = coll->latinOneCEs;
7761
7762    UBool haveContractions = FALSE; // if we have contractions in our string
7763                                    // we cannot do French secondary
7764
7765    // Do the primary level
7766    for(;;) {
7767        while(sOrder==0) { // this loop skips primary ignorables
7768            // sOrder=getNextlatinOneCE(source);
7769            if(sLen==-1) {   // handling zero terminated strings
7770                sChar=source[sIndex++];
7771                if(sChar==0) {
7772                    endOfSource = TRUE;
7773                    break;
7774                }
7775            } else {        // handling strings with known length
7776                if(sIndex==sLen) {
7777                    endOfSource = TRUE;
7778                    break;
7779                }
7780                sChar=source[sIndex++];
7781            }
7782            if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7783                //fprintf(stderr, "R");
7784                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7785            }
7786            sOrder = elements[sChar];
7787            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7788                // specials can basically be either contractions or bail-out signs. If we get anything
7789                // else, we'll bail out anywasy
7790                if(getCETag(sOrder) == CONTRACTION_TAG) {
7791                    sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
7792                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7793                    // However, if there are contractions in the table, but we always use just one char,
7794                    // we might be able to do French. This should be checked out.
7795                }
7796                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7797                    //fprintf(stderr, "S");
7798                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7799                }
7800            }
7801        }
7802
7803        while(tOrder==0) {  // this loop skips primary ignorables
7804            // tOrder=getNextlatinOneCE(target);
7805            if(tLen==-1) {    // handling zero terminated strings
7806                tChar=target[tIndex++];
7807                if(tChar==0) {
7808                    if(endOfSource) { // this is different than source loop,
7809                        // as we already know that source loop is done here,
7810                        // so we can either finish the primary loop if both
7811                        // strings are done or anounce the result if only
7812                        // target is done. Same below.
7813                        goto endOfPrimLoop;
7814                    } else {
7815                        return UCOL_GREATER;
7816                    }
7817                }
7818            } else {          // handling strings with known length
7819                if(tIndex==tLen) {
7820                    if(endOfSource) {
7821                        goto endOfPrimLoop;
7822                    } else {
7823                        return UCOL_GREATER;
7824                    }
7825                }
7826                tChar=target[tIndex++];
7827            }
7828            if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7829                //fprintf(stderr, "R");
7830                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7831            }
7832            tOrder = elements[tChar];
7833            if(tOrder >= UCOL_NOT_FOUND) {
7834                // Handling specials, see the comments for source
7835                if(getCETag(tOrder) == CONTRACTION_TAG) {
7836                    tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
7837                    haveContractions = TRUE;
7838                }
7839                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7840                    //fprintf(stderr, "S");
7841                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7842                }
7843            }
7844        }
7845        if(endOfSource) { // source is finished, but target is not, say the result.
7846            return UCOL_LESS;
7847        }
7848
7849        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
7850            sOrder = 0; tOrder = 0;
7851            continue;
7852        } else {
7853            // compare current top bytes
7854            if(((sOrder^tOrder)&0xFF000000)!=0) {
7855                // top bytes differ, return difference
7856                if(sOrder < tOrder) {
7857                    return UCOL_LESS;
7858                } else if(sOrder > tOrder) {
7859                    return UCOL_GREATER;
7860                }
7861                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
7862                // since we must return enum value
7863            }
7864
7865            // top bytes match, continue with following bytes
7866            sOrder<<=8;
7867            tOrder<<=8;
7868        }
7869    }
7870
7871endOfPrimLoop:
7872    // after primary loop, we definitely know the sizes of strings,
7873    // so we set it and use simpler loop for secondaries and tertiaries
7874    sLen = sIndex; tLen = tIndex;
7875    if(strength >= UCOL_SECONDARY) {
7876        // adjust the table beggining
7877        elements += coll->latinOneTableLen;
7878        endOfSource = FALSE;
7879
7880        if(coll->frenchCollation == UCOL_OFF) { // non French
7881            // This loop is a simplified copy of primary loop
7882            // at this point we know that whole strings are latin-1, so we don't
7883            // check for that. We also know that we only have contractions as
7884            // specials.
7885            sIndex = 0; tIndex = 0;
7886            for(;;) {
7887                while(sOrder==0) {
7888                    if(sIndex==sLen) {
7889                        endOfSource = TRUE;
7890                        break;
7891                    }
7892                    sChar=source[sIndex++];
7893                    sOrder = elements[sChar];
7894                    if(sOrder > UCOL_NOT_FOUND) {
7895                        sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
7896                    }
7897                }
7898
7899                while(tOrder==0) {
7900                    if(tIndex==tLen) {
7901                        if(endOfSource) {
7902                            goto endOfSecLoop;
7903                        } else {
7904                            return UCOL_GREATER;
7905                        }
7906                    }
7907                    tChar=target[tIndex++];
7908                    tOrder = elements[tChar];
7909                    if(tOrder > UCOL_NOT_FOUND) {
7910                        tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
7911                    }
7912                }
7913                if(endOfSource) {
7914                    return UCOL_LESS;
7915                }
7916
7917                if(sOrder == tOrder) {
7918                    sOrder = 0; tOrder = 0;
7919                    continue;
7920                } else {
7921                    // see primary loop for comments on this
7922                    if(((sOrder^tOrder)&0xFF000000)!=0) {
7923                        if(sOrder < tOrder) {
7924                            return UCOL_LESS;
7925                        } else if(sOrder > tOrder) {
7926                            return UCOL_GREATER;
7927                        }
7928                    }
7929                    sOrder<<=8;
7930                    tOrder<<=8;
7931                }
7932            }
7933        } else { // French
7934            if(haveContractions) { // if we have contractions, we have to bail out
7935                // since we don't really know how to handle them here
7936                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7937            }
7938            // For French, we go backwards
7939            sIndex = sLen; tIndex = tLen;
7940            for(;;) {
7941                while(sOrder==0) {
7942                    if(sIndex==0) {
7943                        endOfSource = TRUE;
7944                        break;
7945                    }
7946                    sChar=source[--sIndex];
7947                    sOrder = elements[sChar];
7948                    // don't even look for contractions
7949                }
7950
7951                while(tOrder==0) {
7952                    if(tIndex==0) {
7953                        if(endOfSource) {
7954                            goto endOfSecLoop;
7955                        } else {
7956                            return UCOL_GREATER;
7957                        }
7958                    }
7959                    tChar=target[--tIndex];
7960                    tOrder = elements[tChar];
7961                    // don't even look for contractions
7962                }
7963                if(endOfSource) {
7964                    return UCOL_LESS;
7965                }
7966
7967                if(sOrder == tOrder) {
7968                    sOrder = 0; tOrder = 0;
7969                    continue;
7970                } else {
7971                    // see the primary loop for comments
7972                    if(((sOrder^tOrder)&0xFF000000)!=0) {
7973                        if(sOrder < tOrder) {
7974                            return UCOL_LESS;
7975                        } else if(sOrder > tOrder) {
7976                            return UCOL_GREATER;
7977                        }
7978                    }
7979                    sOrder<<=8;
7980                    tOrder<<=8;
7981                }
7982            }
7983        }
7984    }
7985
7986endOfSecLoop:
7987    if(strength >= UCOL_TERTIARY) {
7988        // tertiary loop is the same as secondary (except no French)
7989        elements += coll->latinOneTableLen;
7990        sIndex = 0; tIndex = 0;
7991        endOfSource = FALSE;
7992        for(;;) {
7993            while(sOrder==0) {
7994                if(sIndex==sLen) {
7995                    endOfSource = TRUE;
7996                    break;
7997                }
7998                sChar=source[sIndex++];
7999                sOrder = elements[sChar];
8000                if(sOrder > UCOL_NOT_FOUND) {
8001                    sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8002                }
8003            }
8004            while(tOrder==0) {
8005                if(tIndex==tLen) {
8006                    if(endOfSource) {
8007                        return UCOL_EQUAL; // if both strings are at the end, they are equal
8008                    } else {
8009                        return UCOL_GREATER;
8010                    }
8011                }
8012                tChar=target[tIndex++];
8013                tOrder = elements[tChar];
8014                if(tOrder > UCOL_NOT_FOUND) {
8015                    tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8016                }
8017            }
8018            if(endOfSource) {
8019                return UCOL_LESS;
8020            }
8021            if(sOrder == tOrder) {
8022                sOrder = 0; tOrder = 0;
8023                continue;
8024            } else {
8025                if(((sOrder^tOrder)&0xff000000)!=0) {
8026                    if(sOrder < tOrder) {
8027                        return UCOL_LESS;
8028                    } else if(sOrder > tOrder) {
8029                        return UCOL_GREATER;
8030                    }
8031                }
8032                sOrder<<=8;
8033                tOrder<<=8;
8034            }
8035        }
8036    }
8037    return UCOL_EQUAL;
8038}
8039
8040/*
8041  Note: ucol_strcollUTF8 supports null terminated input. Calculating length of
8042  null terminated input string takes extra amount of CPU cycles.
8043*/
8044static UCollationResult
8045ucol_strcollRegularUTF8(
8046                    const UCollator *coll,
8047                    const char      *source,
8048                    int32_t         sourceLength,
8049                    const char      *target,
8050                    int32_t         targetLength,
8051                    UErrorCode      *status)
8052{
8053    UCharIterator src;
8054    UCharIterator tgt;
8055
8056    uiter_setUTF8(&src, source, sourceLength);
8057    uiter_setUTF8(&tgt, target, targetLength);
8058
8059    // Preparing the context objects for iterating over strings
8060    collIterate sColl, tColl;
8061    IInit_collIterate(coll, NULL, -1, &sColl, status);
8062    IInit_collIterate(coll, NULL, -1, &tColl, status);
8063    if(U_FAILURE(*status)) {
8064        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8065        return UCOL_EQUAL;
8066    }
8067    // The division for the array length may truncate the array size to
8068    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8069    // for all platforms anyway.
8070    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8071    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8072    UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8073
8074    sColl.iterator = &src;
8075    sColl.flags |= UCOL_USE_ITERATOR;
8076    tColl.flags |= UCOL_USE_ITERATOR;
8077    tColl.iterator = &tgt;
8078
8079    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8080        sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8081        sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
8082        sColl.flags &= ~UCOL_ITER_NORM;
8083
8084        tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8085        tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
8086        tColl.flags &= ~UCOL_ITER_NORM;
8087    }
8088
8089    return ucol_strcollRegular(&sColl, &tColl, status);
8090}
8091
8092static inline uint32_t
8093ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
8094                          uint32_t CE, const char *s, int32_t *index, int32_t len)
8095{
8096    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8097    int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
8098    int32_t offset = 1;
8099    UChar32 schar = 0, tchar = 0;
8100
8101    for(;;) {
8102        if (*index == len) {
8103            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8104        }
8105        U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar);
8106        if (len < 0 && schar == 0) {
8107            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8108        }
8109
8110        while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8111            offset++;
8112        }
8113
8114        if (schar == tchar) {
8115            U8_FWD_1(s, *index, len);
8116            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
8117        }
8118        else
8119        {
8120            if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8121                return UCOL_BAIL_OUT_CE;
8122            }
8123            // skip completely ignorables
8124            uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
8125            if(isZeroCE == 0) { // we have to ignore completely ignorables
8126                U8_FWD_1(s, *index, len);
8127                continue;
8128            }
8129
8130            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8131        }
8132    }
8133}
8134
/**
 * Fast-path comparison of two UTF-8 strings using the collator's Latin-1
 * CE table (coll->latinOneCEs).
 *
 * The table is laid out as consecutive slices of coll->latinOneTableLen
 * entries, one slice per level; `elements` is advanced by one slice before
 * the secondary and again before the tertiary pass. Each table entry packs
 * the weight bytes for a character; the comparison consumes them one byte at
 * a time from the top (compare top byte, then shift left by 8).
 *
 * The function hands the whole comparison over to ucol_strcollRegularUTF8()
 * when it meets a code point above 0xFF, a special CE it cannot resolve, or
 * contractions combined with French secondary ordering.
 *
 * @param coll    collator; must have its Latin-1 table available
 * @param source  source string, UTF-8
 * @param sLen    length of source in bytes, or negative if NUL-terminated
 * @param target  target string, UTF-8
 * @param tLen    length of target in bytes, or negative if NUL-terminated
 * @param status  in/out error code, passed on to the regular code path
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static inline UCollationResult
ucol_strcollUseLatin1UTF8(
                const UCollator *coll,
                const char      *source,
                int32_t         sLen,
                const char      *target,
                int32_t         tLen,
                UErrorCode      *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    // byte offsets into source/target, current code points, and the
    // not-yet-compared remainder of the current CE of each string
    int32_t sIndex = 0, tIndex = 0;
    UChar32 sChar = 0, tChar = 0;
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    // start with the primary-level slice of the table
    uint32_t *elements = coll->latinOneCEs;

    UBool haveContractions = FALSE; // if we have contractions in our string
                                    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if (sIndex == sLen) {
                endOfSource = TRUE;
                break;
            }
            U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
            if (sLen < 0 && sChar == 0) {
                // terminator of a NUL-terminated source: from here on the
                // length is known.
                // NOTE(review): sIndex has already been advanced by
                // U8_NEXT_OR_FFFD, so the recorded length appears to include
                // the NUL byte - confirm this is intended.
                endOfSource = TRUE;
                sLen = sIndex;
                break;
            }
            if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if (tIndex == tLen) {
                if(endOfSource) {
                    // both strings exhausted with equal primaries
                    goto endOfPrimLoopU8;
                } else {
                    return UCOL_GREATER;
                }
            }
            U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
            if (tLen < 0 && tChar == 0) {
                // terminator of a NUL-terminated target
                if(endOfSource) {
                    tLen = tIndex;
                    goto endOfPrimLoopU8;
                } else {
                    return UCOL_GREATER;
                }
            }
            if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoopU8:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning: move on to the secondary-level slice
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    U_ASSERT(sLen >= 0);
                    U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            goto endOfSecLoopU8;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    U_ASSERT(tLen >= 0);
                    U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards: start at the end of each string
            // and step to the previous code point on every fetch
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoopU8;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoopU8:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        // move on to the tertiary-level slice of the table
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                U_ASSERT(sLen >= 0);
                U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
                U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                U_ASSERT(tLen >= 0);
                U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
                U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                // see the primary loop for comments
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    // strength is below tertiary and all compared levels were equal
    return UCOL_EQUAL;
}
8432
8433U_CAPI UCollationResult U_EXPORT2
8434ucol_strcollIter( const UCollator    *coll,
8435                 UCharIterator *sIter,
8436                 UCharIterator *tIter,
8437                 UErrorCode         *status)
8438{
8439    if(!status || U_FAILURE(*status)) {
8440        return UCOL_EQUAL;
8441    }
8442
8443    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8444    UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8445
8446    if (sIter == tIter) {
8447        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8448        return UCOL_EQUAL;
8449    }
8450    if(sIter == NULL || tIter == NULL || coll == NULL) {
8451        *status = U_ILLEGAL_ARGUMENT_ERROR;
8452        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8453        return UCOL_EQUAL;
8454    }
8455
8456    UCollationResult result = UCOL_EQUAL;
8457
8458    // Preparing the context objects for iterating over strings
8459    collIterate sColl, tColl;
8460    IInit_collIterate(coll, NULL, -1, &sColl, status);
8461    IInit_collIterate(coll, NULL, -1, &tColl, status);
8462    if(U_FAILURE(*status)) {
8463        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8464        return UCOL_EQUAL;
8465    }
8466    // The division for the array length may truncate the array size to
8467    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8468    // for all platforms anyway.
8469    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8470    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8471    UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8472
8473    sColl.iterator = sIter;
8474    sColl.flags |= UCOL_USE_ITERATOR;
8475    tColl.flags |= UCOL_USE_ITERATOR;
8476    tColl.iterator = tIter;
8477
8478    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8479        sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8480        sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8481        sColl.flags &= ~UCOL_ITER_NORM;
8482
8483        tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8484        tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8485        tColl.flags &= ~UCOL_ITER_NORM;
8486    }
8487
8488    UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8489
8490    while((sChar = sColl.iterator->next(sColl.iterator)) ==
8491        (tChar = tColl.iterator->next(tColl.iterator))) {
8492            if(sChar == U_SENTINEL) {
8493                result = UCOL_EQUAL;
8494                goto end_compare;
8495            }
8496    }
8497
8498    if(sChar == U_SENTINEL) {
8499        tChar = tColl.iterator->previous(tColl.iterator);
8500    }
8501
8502    if(tChar == U_SENTINEL) {
8503        sChar = sColl.iterator->previous(sColl.iterator);
8504    }
8505
8506    sChar = sColl.iterator->previous(sColl.iterator);
8507    tChar = tColl.iterator->previous(tColl.iterator);
8508
8509    if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8510    {
8511        // We are stopped in the middle of a contraction.
8512        // Scan backwards through the == part of the string looking for the start of the contraction.
8513        //   It doesn't matter which string we scan, since they are the same in this region.
8514        do
8515        {
8516            sChar = sColl.iterator->previous(sColl.iterator);
8517            tChar = tColl.iterator->previous(tColl.iterator);
8518        }
8519        while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8520    }
8521
8522
8523    if(U_SUCCESS(*status)) {
8524        result = ucol_strcollRegular(&sColl, &tColl, status);
8525    }
8526
8527end_compare:
8528    if(sNormIter || tNormIter) {
8529        unorm_closeIter(sNormIter);
8530        unorm_closeIter(tNormIter);
8531    }
8532
8533    UTRACE_EXIT_VALUE_STATUS(result, *status)
8534    return result;
8535}
8536
8537
8538/*                                                                      */
8539/* ucol_strcoll     Main public API string comparison function          */
8540/*                                                                      */
8541U_CAPI UCollationResult U_EXPORT2
8542ucol_strcoll( const UCollator    *coll,
8543              const UChar        *source,
8544              int32_t            sourceLength,
8545              const UChar        *target,
8546              int32_t            targetLength)
8547{
8548    U_ALIGN_CODE(16);
8549
8550    UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8551    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8552        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8553        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8554        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8555    }
8556
8557    if(source == NULL || target == NULL) {
8558        // do not crash, but return. Should have
8559        // status argument to return error.
8560        UTRACE_EXIT_VALUE(UCOL_EQUAL);
8561        return UCOL_EQUAL;
8562    }
8563
8564    /* Quick check if source and target are same strings. */
8565    /* They should either both be NULL terminated or the explicit length should be set on both. */
8566    if (source==target && sourceLength==targetLength) {
8567        UTRACE_EXIT_VALUE(UCOL_EQUAL);
8568        return UCOL_EQUAL;
8569    }
8570
8571    if(coll->delegate != NULL) {
8572      UErrorCode status = U_ZERO_ERROR;
8573      return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
8574    }
8575
8576    /* Scan the strings.  Find:                                                             */
8577    /*    The length of any leading portion that is equal                                   */
8578    /*    Whether they are exactly equal.  (in which case we just return)                   */
8579    const UChar    *pSrc    = source;
8580    const UChar    *pTarg   = target;
8581    int32_t        equalLength;
8582
8583    if (sourceLength == -1 && targetLength == -1) {
8584        // Both strings are null terminated.
8585        //    Scan through any leading equal portion.
8586        while (*pSrc == *pTarg && *pSrc != 0) {
8587            pSrc++;
8588            pTarg++;
8589        }
8590        if (*pSrc == 0 && *pTarg == 0) {
8591            UTRACE_EXIT_VALUE(UCOL_EQUAL);
8592            return UCOL_EQUAL;
8593        }
8594        equalLength = (int32_t)(pSrc - source);
8595    }
8596    else
8597    {
8598        // One or both strings has an explicit length.
8599        const UChar    *pSrcEnd = source + sourceLength;
8600        const UChar    *pTargEnd = target + targetLength;
8601
8602        // Scan while the strings are bitwise ==, or until one is exhausted.
8603        for (;;) {
8604            if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8605                break;
8606            }
8607            if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8608                break;
8609            }
8610            if (*pSrc != *pTarg) {
8611                break;
8612            }
8613            pSrc++;
8614            pTarg++;
8615        }
8616        equalLength = (int32_t)(pSrc - source);
8617
8618        // If we made it all the way through both strings, we are done.  They are ==
8619        if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8620            (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
8621        {
8622            UTRACE_EXIT_VALUE(UCOL_EQUAL);
8623            return UCOL_EQUAL;
8624        }
8625    }
8626    if (equalLength > 0) {
8627        /* There is an identical portion at the beginning of the two strings.        */
8628        /*   If the identical portion ends within a contraction or a comibining      */
8629        /*   character sequence, back up to the start of that sequence.              */
8630
8631        // These values should already be set by the code above.
8632        //pSrc  = source + equalLength;        /* point to the first differing chars   */
8633        //pTarg = target + equalLength;
8634        if ((pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
8635            (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
8636        {
8637            // We are stopped in the middle of a contraction.
8638            // Scan backwards through the == part of the string looking for the start of the contraction.
8639            //   It doesn't matter which string we scan, since they are the same in this region.
8640            do
8641            {
8642                equalLength--;
8643                pSrc--;
8644            }
8645            while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8646        }
8647
8648        source += equalLength;
8649        target += equalLength;
8650        if (sourceLength > 0) {
8651            sourceLength -= equalLength;
8652        }
8653        if (targetLength > 0) {
8654            targetLength -= equalLength;
8655        }
8656    }
8657
8658    UErrorCode status = U_ZERO_ERROR;
8659    UCollationResult returnVal;
8660    if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8661        returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
8662    } else {
8663        returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8664    }
8665    UTRACE_EXIT_VALUE(returnVal);
8666    return returnVal;
8667}
8668
8669U_CAPI UCollationResult U_EXPORT2
8670ucol_strcollUTF8(
8671        const UCollator *coll,
8672        const char      *source,
8673        int32_t         sourceLength,
8674        const char      *target,
8675        int32_t         targetLength,
8676        UErrorCode      *status)
8677{
8678    U_ALIGN_CODE(16);
8679
8680    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
8681    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8682        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8683        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
8684        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
8685    }
8686
8687    if (U_FAILURE(*status)) {
8688        /* do nothing */
8689        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8690        return UCOL_EQUAL;
8691    }
8692
8693    if(source == NULL || target == NULL) {
8694        *status = U_ILLEGAL_ARGUMENT_ERROR;
8695        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8696        return UCOL_EQUAL;
8697    }
8698
8699    /* Quick check if source and target are same strings. */
8700    /* They should either both be NULL terminated or the explicit length should be set on both. */
8701    if (source==target && sourceLength==targetLength) {
8702        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8703        return UCOL_EQUAL;
8704    }
8705
8706    if(coll->delegate != NULL) {
8707        return ((const Collator*)coll->delegate)->compareUTF8(
8708            StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength),
8709            StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength),
8710            *status);
8711    }
8712
8713    /* Scan the strings.  Find:                                                             */
8714    /*    The length of any leading portion that is equal                                   */
8715    /*    Whether they are exactly equal.  (in which case we just return)                   */
8716    const char  *pSrc = source;
8717    const char  *pTarg = target;
8718    UBool       bSrcLimit = FALSE;
8719    UBool       bTargLimit = FALSE;
8720
8721    if (sourceLength == -1 && targetLength == -1) {
8722        // Both strings are null terminated.
8723        //    Scan through any leading equal portion.
8724        while (*pSrc == *pTarg && *pSrc != 0) {
8725            pSrc++;
8726            pTarg++;
8727        }
8728        if (*pSrc == 0 && *pTarg == 0) {
8729            UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8730            return UCOL_EQUAL;
8731        }
8732        bSrcLimit = (*pSrc == 0);
8733        bTargLimit = (*pTarg == 0);
8734    }
8735    else
8736    {
8737        // One or both strings has an explicit length.
8738        const char *pSrcEnd = source + sourceLength;
8739        const char *pTargEnd = target + targetLength;
8740
8741        // Scan while the strings are bitwise ==, or until one is exhausted.
8742        for (;;) {
8743            if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8744                break;
8745            }
8746            if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8747                break;
8748            }
8749            if (*pSrc != *pTarg) {
8750                break;
8751            }
8752            pSrc++;
8753            pTarg++;
8754        }
8755        bSrcLimit = (pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0));
8756        bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));
8757
8758        // If we made it all the way through both strings, we are done.  They are ==
8759        if (bSrcLimit &&    /* At end of src string, however it was specified. */
8760            bTargLimit)     /* and also at end of dest string                  */
8761        {
8762            UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8763            return UCOL_EQUAL;
8764        }
8765    }
8766
8767    U_ASSERT(!(bSrcLimit && bTargLimit));
8768
8769    int32_t    equalLength = pSrc - source;
8770    UBool       bSawNonLatin1 = FALSE;
8771
8772    if (equalLength > 0) {
8773        // Align position to the start of UTF-8 code point.
8774        if (bTargLimit) {
8775            U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
8776        } else {
8777            U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
8778        }
8779        pSrc = source + equalLength;
8780        pTarg = target + equalLength;
8781    }
8782
8783    if (equalLength > 0) {
8784        /* There is an identical portion at the beginning of the two strings.        */
8785        /*   If the identical portion ends within a contraction or a combining       */
8786        /*   character sequence, back up to the start of that sequence.              */
8787        UBool bUnsafeCP = FALSE;
8788        UChar32 uc32 = -1;
8789
8790        if (!bSrcLimit) {
8791            U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32);
8792            if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
8793                bUnsafeCP = TRUE;
8794            }
8795            bSawNonLatin1 |= (uc32 > 0xff);
8796        }
8797        if (!bTargLimit) {
8798            U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32);
8799            if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
8800                bUnsafeCP = TRUE;
8801            }
8802            bSawNonLatin1 |= (uc32 > 0xff);
8803        }
8804
8805        if (bUnsafeCP) {
8806            while (equalLength > 0) {
8807                // We are stopped in the middle of a contraction.
8808                // Scan backwards through the == part of the string looking for the start of the contraction.
8809                //   It doesn't matter which string we scan, since they are the same in this region.
8810                U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32);
8811                bSawNonLatin1 |= (uc32 > 0xff);
8812                if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
8813                    break;
8814                }
8815            }
8816        }
8817        source += equalLength;
8818        target += equalLength;
8819        if (sourceLength > 0) {
8820            sourceLength -= equalLength;
8821        }
8822        if (targetLength > 0) {
8823            targetLength -= equalLength;
8824        }
8825    } else {
8826        // Lead byte of Latin 1 character is 0x00 - 0xC3
8827        bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3);
8828        bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3);
8829    }
8830
8831    UCollationResult returnVal;
8832
8833    if(!coll->latinOneUse || bSawNonLatin1) {
8834        returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
8835    } else {
8836        returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status);
8837    }
8838    UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
8839    return returnVal;
8840}
8841
8842
8843/* convenience function for comparing strings */
8844U_CAPI UBool U_EXPORT2
8845ucol_greater(    const    UCollator        *coll,
8846        const    UChar            *source,
8847        int32_t            sourceLength,
8848        const    UChar            *target,
8849        int32_t            targetLength)
8850{
8851    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8852        == UCOL_GREATER);
8853}
8854
8855/* convenience function for comparing strings */
8856U_CAPI UBool U_EXPORT2
8857ucol_greaterOrEqual(    const    UCollator    *coll,
8858            const    UChar        *source,
8859            int32_t        sourceLength,
8860            const    UChar        *target,
8861            int32_t        targetLength)
8862{
8863    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8864        != UCOL_LESS);
8865}
8866
8867/* convenience function for comparing strings */
8868U_CAPI UBool U_EXPORT2
8869ucol_equal(        const    UCollator        *coll,
8870            const    UChar            *source,
8871            int32_t            sourceLength,
8872            const    UChar            *target,
8873            int32_t            targetLength)
8874{
8875    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8876        == UCOL_EQUAL);
8877}
8878
8879U_CAPI void U_EXPORT2
8880ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8881    if(coll && coll->UCA) {
8882        uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8883    }
8884}
8885
8886#endif /* #if !UCONFIG_NO_COLLATION */
8887