1/*
2******************************************************************************
3*   Copyright (C) 2001-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5******************************************************************************
6*
7* File ucoleitr.cpp
8*
9* Modification History:
10*
11* Date        Name        Description
12* 02/15/2001  synwee      Modified all methods to process its own function
13*                         instead of calling the equivalent c++ api (coleitr.h)
14******************************************************************************/
15
16#include "unicode/utypes.h"
17
18#if !UCONFIG_NO_COLLATION
19
20#include "unicode/ucoleitr.h"
21#include "unicode/ustring.h"
22#include "unicode/sortkey.h"
23#include "unicode/uobject.h"
24#include "ucol_imp.h"
25#include "cmemory.h"
26
27U_NAMESPACE_USE
28
29#define BUFFER_LENGTH             100
30
31#define DEFAULT_BUFFER_SIZE 16
32#define BUFFER_GROW 8
33
34#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
35
36#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
37
38#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39
40#define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
41
42#define DELETE_ARRAY(array) uprv_free((void *) (array))
43
44typedef struct icu::collIterate collIterator;
45
46struct RCEI
47{
48    uint32_t ce;
49    int32_t  low;
50    int32_t  high;
51};
52
53U_NAMESPACE_BEGIN
54
55struct RCEBuffer
56{
57    RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
58    RCEI   *buffer;
59    int32_t bufferIndex;
60    int32_t bufferSize;
61
62    RCEBuffer();
63    ~RCEBuffer();
64
65    UBool empty() const;
66    void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
67    const RCEI *get();
68};
69
70RCEBuffer::RCEBuffer()
71{
72    buffer = defaultBuffer;
73    bufferIndex = 0;
74    bufferSize = DEFAULT_BUFFER_SIZE;
75}
76
77RCEBuffer::~RCEBuffer()
78{
79    if (buffer != defaultBuffer) {
80        DELETE_ARRAY(buffer);
81    }
82}
83
84UBool RCEBuffer::empty() const
85{
86    return bufferIndex <= 0;
87}
88
89void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
90{
91    if (bufferIndex >= bufferSize) {
92        RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
93
94        ARRAY_COPY(newBuffer, buffer, bufferSize);
95
96        if (buffer != defaultBuffer) {
97            DELETE_ARRAY(buffer);
98        }
99
100        buffer = newBuffer;
101        bufferSize += BUFFER_GROW;
102    }
103
104    buffer[bufferIndex].ce   = ce;
105    buffer[bufferIndex].low  = ixLow;
106    buffer[bufferIndex].high = ixHigh;
107
108    bufferIndex += 1;
109}
110
111const RCEI *RCEBuffer::get()
112{
113    if (bufferIndex > 0) {
114     return &buffer[--bufferIndex];
115    }
116
117    return NULL;
118}
119
120struct PCEI
121{
122    uint64_t ce;
123    int32_t  low;
124    int32_t  high;
125};
126
127struct PCEBuffer
128{
129    PCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
130    PCEI   *buffer;
131    int32_t bufferIndex;
132    int32_t bufferSize;
133
134    PCEBuffer();
135    ~PCEBuffer();
136
137    void  reset();
138    UBool empty() const;
139    void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh);
140    const PCEI *get();
141};
142
143PCEBuffer::PCEBuffer()
144{
145    buffer = defaultBuffer;
146    bufferIndex = 0;
147    bufferSize = DEFAULT_BUFFER_SIZE;
148}
149
150PCEBuffer::~PCEBuffer()
151{
152    if (buffer != defaultBuffer) {
153        DELETE_ARRAY(buffer);
154    }
155}
156
157void PCEBuffer::reset()
158{
159    bufferIndex = 0;
160}
161
162UBool PCEBuffer::empty() const
163{
164    return bufferIndex <= 0;
165}
166
167void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
168{
169    if (bufferIndex >= bufferSize) {
170        PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
171
172        ARRAY_COPY(newBuffer, buffer, bufferSize);
173
174        if (buffer != defaultBuffer) {
175            DELETE_ARRAY(buffer);
176        }
177
178        buffer = newBuffer;
179        bufferSize += BUFFER_GROW;
180    }
181
182    buffer[bufferIndex].ce   = ce;
183    buffer[bufferIndex].low  = ixLow;
184    buffer[bufferIndex].high = ixHigh;
185
186    bufferIndex += 1;
187}
188
189const PCEI *PCEBuffer::get()
190{
191    if (bufferIndex > 0) {
192     return &buffer[--bufferIndex];
193    }
194
195    return NULL;
196}
197
198/*
199 * This inherits from UObject so that
200 * it can be allocated by new and the
201 * constructor for PCEBuffer is called.
202 */
203struct UCollationPCE : public UObject
204{
205    PCEBuffer          pceBuffer;
206    UCollationStrength strength;
207    UBool              toShift;
208    UBool              isShifted;
209    uint32_t           variableTop;
210
211    UCollationPCE(UCollationElements *elems);
212    ~UCollationPCE();
213
214    void init(const UCollator *coll);
215
216    virtual UClassID getDynamicClassID() const;
217    static UClassID getStaticClassID();
218};
219
220UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE)
221
222UCollationPCE::UCollationPCE(UCollationElements *elems)
223{
224    init(elems->iteratordata_.coll);
225}
226
227void UCollationPCE::init(const UCollator *coll)
228{
229    UErrorCode status = U_ZERO_ERROR;
230
231    strength    = ucol_getStrength(coll);
232    toShift     = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
233    isShifted   = FALSE;
234    variableTop = coll->variableTopValue << 16;
235}
236
237UCollationPCE::~UCollationPCE()
238{
239    // nothing to do
240}
241
242
243U_NAMESPACE_END
244
245
246inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
247{
248    uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
249
250    // This is clean, but somewhat slow...
251    // We could apply the mask to ce and then
252    // just get all three orders...
253    switch(elems->pce->strength) {
254    default:
255        tertiary = ucol_tertiaryOrder(ce);
256        /* note fall-through */
257
258    case UCOL_SECONDARY:
259        secondary = ucol_secondaryOrder(ce);
260        /* note fall-through */
261
262    case UCOL_PRIMARY:
263        primary = ucol_primaryOrder(ce);
264    }
265
266    // **** This should probably handle continuations too.  ****
267    // **** That means that we need 24 bits for the primary ****
268    // **** instead of the 16 that we're currently using.   ****
269    // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
270    // **** Another complication with continuations is that ****
271    // **** the *second* CE is marked as a continuation, so ****
272    // **** we always have to peek ahead to know how long   ****
273    // **** the primary is...                               ****
274    if ((elems->pce->toShift && elems->pce->variableTop > ce && primary != 0)
275                || (elems->pce->isShifted && primary == 0)) {
276
277        if (primary == 0) {
278            return UCOL_IGNORABLE;
279        }
280
281        if (elems->pce->strength >= UCOL_QUATERNARY) {
282            quaternary = primary;
283        }
284
285        primary = secondary = tertiary = 0;
286        elems->pce->isShifted = TRUE;
287    } else {
288        if (elems->pce->strength >= UCOL_QUATERNARY) {
289            quaternary = 0xFFFF;
290        }
291
292        elems->pce->isShifted = FALSE;
293    }
294
295    return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
296}
297
298U_CAPI void U_EXPORT2
299uprv_init_pce(const UCollationElements *elems)
300{
301    if (elems->pce != NULL) {
302        elems->pce->init(elems->iteratordata_.coll);
303    }
304}
305
306
307
308/* public methods ---------------------------------------------------- */
309
310U_CAPI UCollationElements* U_EXPORT2
311ucol_openElements(const UCollator  *coll,
312                  const UChar      *text,
313                        int32_t    textLength,
314                        UErrorCode *status)
315{
316    if (U_FAILURE(*status)) {
317        return NULL;
318    }
319
320    UCollationElements *result = new UCollationElements;
321    if (result == NULL) {
322        *status = U_MEMORY_ALLOCATION_ERROR;
323        return NULL;
324    }
325
326    result->reset_ = TRUE;
327    result->isWritable = FALSE;
328    result->pce = NULL;
329
330    if (text == NULL) {
331        textLength = 0;
332    }
333    uprv_init_collIterate(coll, text, textLength, &result->iteratordata_, status);
334
335    return result;
336}
337
338
339U_CAPI void U_EXPORT2
340ucol_closeElements(UCollationElements *elems)
341{
342	if (elems != NULL) {
343	  collIterate *ci = &elems->iteratordata_;
344
345	  if (ci->extendCEs) {
346		  uprv_free(ci->extendCEs);
347	  }
348
349	  if (ci->offsetBuffer) {
350		  uprv_free(ci->offsetBuffer);
351	  }
352
353	  if (elems->isWritable && elems->iteratordata_.string != NULL)
354	  {
355		uprv_free((UChar *)elems->iteratordata_.string);
356	  }
357
358	  if (elems->pce != NULL) {
359		  delete elems->pce;
360	  }
361
362	  delete elems;
363	}
364}
365
366U_CAPI void U_EXPORT2
367ucol_reset(UCollationElements *elems)
368{
369    collIterate *ci = &(elems->iteratordata_);
370    elems->reset_   = TRUE;
371    ci->pos         = ci->string;
372    if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) {
373        ci->endp      = ci->string + u_strlen(ci->string);
374    }
375    ci->CEpos       = ci->toReturn = ci->CEs;
376    ci->flags       = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN;
377    if (ci->coll->normalizationMode == UCOL_ON) {
378        ci->flags |= UCOL_ITER_NORM;
379    }
380
381    ci->writableBuffer.remove();
382    ci->fcdPosition = NULL;
383
384  //ci->offsetReturn = ci->offsetStore = NULL;
385	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
386}
387
388U_CAPI void U_EXPORT2
389ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status)
390{
391    if (U_FAILURE(*status)) {
392        return;
393    }
394
395    if (elems == NULL) {
396        *status = U_ILLEGAL_ARGUMENT_ERROR;
397        return;
398    }
399
400    elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT;
401}
402
403U_CAPI int32_t U_EXPORT2
404ucol_next(UCollationElements *elems,
405          UErrorCode         *status)
406{
407    int32_t result;
408    if (U_FAILURE(*status)) {
409        return UCOL_NULLORDER;
410    }
411
412    elems->reset_ = FALSE;
413
414    result = (int32_t)ucol_getNextCE(elems->iteratordata_.coll,
415                                     &elems->iteratordata_,
416                                     status);
417
418    if (result == UCOL_NO_MORE_CES) {
419        result = UCOL_NULLORDER;
420    }
421    return result;
422}
423
424U_CAPI int64_t U_EXPORT2
425ucol_nextProcessed(UCollationElements *elems,
426                   int32_t            *ixLow,
427                   int32_t            *ixHigh,
428                   UErrorCode         *status)
429{
430    const UCollator *coll = elems->iteratordata_.coll;
431    int64_t result = UCOL_IGNORABLE;
432    uint32_t low = 0, high = 0;
433
434    if (U_FAILURE(*status)) {
435        return UCOL_PROCESSED_NULLORDER;
436    }
437
438    if (elems->pce == NULL) {
439        elems->pce = new UCollationPCE(elems);
440    } else {
441        elems->pce->pceBuffer.reset();
442    }
443
444    elems->reset_ = FALSE;
445
446    do {
447        low = ucol_getOffset(elems);
448        uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status);
449        high = ucol_getOffset(elems);
450
451        if (ce == UCOL_NO_MORE_CES) {
452             result = UCOL_PROCESSED_NULLORDER;
453             break;
454        }
455
456        result = processCE(elems, ce);
457    } while (result == UCOL_IGNORABLE);
458
459    if (ixLow != NULL) {
460        *ixLow = low;
461    }
462
463    if (ixHigh != NULL) {
464        *ixHigh = high;
465    }
466
467    return result;
468}
469
470U_CAPI int32_t U_EXPORT2
471ucol_previous(UCollationElements *elems,
472              UErrorCode         *status)
473{
474    if(U_FAILURE(*status)) {
475        return UCOL_NULLORDER;
476    }
477    else
478    {
479        int32_t result;
480
481        if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) {
482            if (elems->iteratordata_.endp == NULL) {
483                elems->iteratordata_.endp = elems->iteratordata_.string +
484                                            u_strlen(elems->iteratordata_.string);
485                elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
486            }
487            elems->iteratordata_.pos = elems->iteratordata_.endp;
488            elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
489        }
490
491        elems->reset_ = FALSE;
492
493        result = (int32_t)ucol_getPrevCE(elems->iteratordata_.coll,
494                                         &(elems->iteratordata_),
495                                         status);
496
497        if (result == UCOL_NO_MORE_CES) {
498            result = UCOL_NULLORDER;
499        }
500
501        return result;
502    }
503}
504
505U_CAPI int64_t U_EXPORT2
506ucol_previousProcessed(UCollationElements *elems,
507                   int32_t            *ixLow,
508                   int32_t            *ixHigh,
509                   UErrorCode         *status)
510{
511    const UCollator *coll = elems->iteratordata_.coll;
512    int64_t result = UCOL_IGNORABLE;
513 // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
514 // UCollationStrength strength = ucol_getStrength(coll);
515 //  UBool toShift   = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) ==  UCOL_SHIFTED;
516 // uint32_t variableTop = coll->variableTopValue;
517    int32_t  low = 0, high = 0;
518
519    if (U_FAILURE(*status)) {
520        return UCOL_PROCESSED_NULLORDER;
521    }
522
523    if (elems->reset_ &&
524        (elems->iteratordata_.pos == elems->iteratordata_.string)) {
525        if (elems->iteratordata_.endp == NULL) {
526            elems->iteratordata_.endp = elems->iteratordata_.string +
527                                        u_strlen(elems->iteratordata_.string);
528            elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
529        }
530
531        elems->iteratordata_.pos = elems->iteratordata_.endp;
532        elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
533    }
534
535    if (elems->pce == NULL) {
536        elems->pce = new UCollationPCE(elems);
537    } else {
538      //elems->pce->pceBuffer.reset();
539    }
540
541    elems->reset_ = FALSE;
542
543    while (elems->pce->pceBuffer.empty()) {
544        // buffer raw CEs up to non-ignorable primary
545        RCEBuffer rceb;
546        uint32_t ce;
547
548        // **** do we need to reset rceb, or will it always be empty at this point ****
549        do {
550            high = ucol_getOffset(elems);
551            ce   = ucol_getPrevCE(coll, &elems->iteratordata_, status);
552            low  = ucol_getOffset(elems);
553
554            if (ce == UCOL_NO_MORE_CES) {
555                if (! rceb.empty()) {
556                    break;
557                }
558
559                goto finish;
560            }
561
562            rceb.put(ce, low, high);
563        } while ((ce & UCOL_PRIMARYMASK) == 0);
564
565        // process the raw CEs
566        while (! rceb.empty()) {
567            const RCEI *rcei = rceb.get();
568
569            result = processCE(elems, rcei->ce);
570
571            if (result != UCOL_IGNORABLE) {
572                elems->pce->pceBuffer.put(result, rcei->low, rcei->high);
573            }
574        }
575    }
576
577finish:
578    if (elems->pce->pceBuffer.empty()) {
579        // **** Is -1 the right value for ixLow, ixHigh? ****
580    	if (ixLow != NULL) {
581    		*ixLow = -1;
582    	}
583
584    	if (ixHigh != NULL) {
585    		*ixHigh = -1
586    		;
587    	}
588        return UCOL_PROCESSED_NULLORDER;
589    }
590
591    const PCEI *pcei = elems->pce->pceBuffer.get();
592
593    if (ixLow != NULL) {
594        *ixLow = pcei->low;
595    }
596
597    if (ixHigh != NULL) {
598        *ixHigh = pcei->high;
599    }
600
601    return pcei->ce;
602}
603
604U_CAPI int32_t U_EXPORT2
605ucol_getMaxExpansion(const UCollationElements *elems,
606                           int32_t            order)
607{
608    uint8_t result;
609
610#if 0
611    UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result);
612#else
613    const UCollator *coll = elems->iteratordata_.coll;
614    const uint32_t *start;
615    const uint32_t *limit;
616    const uint32_t *mid;
617          uint32_t strengthMask = 0;
618          uint32_t mOrder = (uint32_t) order;
619
620    switch (coll->strength)
621    {
622    default:
623        strengthMask |= UCOL_TERTIARYORDERMASK;
624        /* fall through */
625
626    case UCOL_SECONDARY:
627        strengthMask |= UCOL_SECONDARYORDERMASK;
628        /* fall through */
629
630    case UCOL_PRIMARY:
631        strengthMask |= UCOL_PRIMARYORDERMASK;
632    }
633
634    mOrder &= strengthMask;
635    start = (coll)->endExpansionCE;
636    limit = (coll)->lastEndExpansionCE;
637
638    while (start < limit - 1) {
639        mid = start + ((limit - start) >> 1);
640        if (mOrder <= (*mid & strengthMask)) {
641          limit = mid;
642        } else {
643          start = mid;
644        }
645    }
646
647    // FIXME: with a masked search, there might be more than one hit,
648    // so we need to look forward and backward from the match to find all
649    // of the hits...
650    if ((*start & strengthMask) == mOrder) {
651        result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE));
652    } else if ((*limit & strengthMask) == mOrder) {
653         result = *(coll->expansionCESize + (limit - coll->endExpansionCE));
654   } else if ((mOrder & 0xFFFF) == 0x00C0) {
655        result = 2;
656   } else {
657       result = 1;
658   }
659#endif
660
661    return result;
662}
663
664U_CAPI void U_EXPORT2
665ucol_setText(      UCollationElements *elems,
666             const UChar              *text,
667                   int32_t            textLength,
668                   UErrorCode         *status)
669{
670    if (U_FAILURE(*status)) {
671        return;
672    }
673
674    if (elems->isWritable && elems->iteratordata_.string != NULL)
675    {
676        uprv_free((UChar *)elems->iteratordata_.string);
677    }
678
679    if (text == NULL) {
680        textLength = 0;
681    }
682
683    elems->isWritable = FALSE;
684
685    /* free offset buffer to avoid memory leak before initializing. */
686    ucol_freeOffsetBuffer(&(elems->iteratordata_));
687    /* Ensure that previously allocated extendCEs is freed before setting to NULL. */
688    if (elems->iteratordata_.extendCEs != NULL) {
689        uprv_free(elems->iteratordata_.extendCEs);
690    }
691    uprv_init_collIterate(elems->iteratordata_.coll, text, textLength,
692                          &elems->iteratordata_, status);
693
694    elems->reset_   = TRUE;
695}
696
697U_CAPI int32_t U_EXPORT2
698ucol_getOffset(const UCollationElements *elems)
699{
700  const collIterate *ci = &(elems->iteratordata_);
701
702  if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) {
703      return ci->offsetRepeatValue;
704  }
705
706  if (ci->offsetReturn != NULL) {
707      return *ci->offsetReturn;
708  }
709
710  // while processing characters in normalization buffer getOffset will
711  // return the next non-normalized character.
712  // should be inline with the old implementation since the old codes uses
713  // nextDecomp in normalizer which also decomposes the string till the
714  // first base character is found.
715  if (ci->flags & UCOL_ITER_INNORMBUF) {
716      if (ci->fcdPosition == NULL) {
717        return 0;
718      }
719      return (int32_t)(ci->fcdPosition - ci->string);
720  }
721  else {
722      return (int32_t)(ci->pos - ci->string);
723  }
724}
725
726U_CAPI void U_EXPORT2
727ucol_setOffset(UCollationElements    *elems,
728               int32_t           offset,
729               UErrorCode            *status)
730{
731    if (U_FAILURE(*status)) {
732        return;
733    }
734
735    // this methods will clean up any use of the writable buffer and points to
736    // the original string
737    collIterate *ci = &(elems->iteratordata_);
738    ci->pos         = ci->string + offset;
739    ci->CEpos       = ci->toReturn = ci->CEs;
740    if (ci->flags & UCOL_ITER_INNORMBUF) {
741        ci->flags = ci->origFlags;
742    }
743    if ((ci->flags & UCOL_ITER_HASLEN) == 0) {
744        ci->endp  = ci->string + u_strlen(ci->string);
745        ci->flags |= UCOL_ITER_HASLEN;
746    }
747    ci->fcdPosition = NULL;
748    elems->reset_ = FALSE;
749
750	ci->offsetReturn = NULL;
751    ci->offsetStore = ci->offsetBuffer;
752	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
753}
754
755U_CAPI int32_t U_EXPORT2
756ucol_primaryOrder (int32_t order)
757{
758    order &= UCOL_PRIMARYMASK;
759    return (order >> UCOL_PRIMARYORDERSHIFT);
760}
761
762U_CAPI int32_t U_EXPORT2
763ucol_secondaryOrder (int32_t order)
764{
765    order &= UCOL_SECONDARYMASK;
766    return (order >> UCOL_SECONDARYORDERSHIFT);
767}
768
769U_CAPI int32_t U_EXPORT2
770ucol_tertiaryOrder (int32_t order)
771{
772    return (order & UCOL_TERTIARYMASK);
773}
774
775
776void ucol_freeOffsetBuffer(collIterate *s) {
777    if (s != NULL && s->offsetBuffer != NULL) {
778        uprv_free(s->offsetBuffer);
779        s->offsetBuffer = NULL;
780        s->offsetBufferSize = 0;
781    }
782}
783
784#endif /* #if !UCONFIG_NO_COLLATION */
785