1/* $OpenLDAP$ */
2/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
3 *
4 * Copyright 1998-2011 The OpenLDAP Foundation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
9 * Public License.
10 *
11 * A copy of this license is available in file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
14 */
15/* Copyright 2001 Computing Research Labs, New Mexico State University
16 *
17 * Permission is hereby granted, free of charge, to any person obtaining a
18 * copy of this software and associated documentation files (the "Software"),
19 * to deal in the Software without restriction, including without limitation
20 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
21 * and/or sell copies of the Software, and to permit persons to whom the
22 * Software is furnished to do so, subject to the following conditions:
23 *
24 * The above copyright notice and this permission notice shall be included in
25 * all copies or substantial portions of the Software.
26 *
27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
28 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
29 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
30 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
31 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
32 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
33 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
34 */
35/* $Id: ucdata.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */
36
37#include "portable.h"
38#include "ldap_config.h"
39
40#include <stdio.h>
41#include <ac/stdlib.h>
42#include <ac/string.h>
43#include <ac/unistd.h>
44
45#include <ac/bytes.h>
46
47#include "lber_pvt.h"
48#include "ucdata.h"
49
50#ifndef HARDCODE_DATA
51#define	HARDCODE_DATA	1
52#endif
53
54#if HARDCODE_DATA
55#include "uctable.h"
56#endif
57
58/**************************************************************************
59 *
60 * Miscellaneous types, data, and support functions.
61 *
62 **************************************************************************/
63
/*
 * Header read from the start of every binary data file by the
 * _uc*_load() routines in this file.
 */
typedef struct {
    /* Byte order mark; the loaders byte-swap everything when it reads 0xfffe. */
    ac_uint2 bom;
    /* Element count; each loader scales it to suit its own table. */
    ac_uint2 cnt;
    union {
        /* Payload size in bytes; used by most loaders to size the allocation. */
        ac_uint4 bytes;
        /* Used by the case mapping loader only: copied into _uccase_len[]. */
        ac_uint2 len[2];
    } size;
} _ucheader_t;
72
73/*
74 * A simple array of 32-bit masks for lookup.
75 */
76static ac_uint4 masks32[32] = {
77	0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL,
78	0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL,
79	0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL,
80	0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL,
81	0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL,
82	0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL,
83	0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL,
84	0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL
85};
86
/*
 * Byte-swap helpers, used when a data file was written with the opposite
 * byte order.
 *
 * Every extracted byte is masked to 8 bits.  Without the mask a signed
 * argument (the loaders pass `short` values from the number table) gets
 * sign-extension bits from the right shift OR-ed into the result.
 *
 * Both macros evaluate their argument more than once; do not pass
 * expressions with side effects.
 */
#define endian_short(cc) ((((cc) >> 8) & 0xff) | (((cc) & 0xff) << 8))
#define endian_long(cc) ((((cc) & 0xff) << 24)|((((cc) >> 8) & 0xff) << 16)|\
                        ((((cc) >> 16) & 0xff) << 8)|(((cc) >> 24) & 0xff))
90
91#if !HARDCODE_DATA
92static FILE *
93_ucopenfile(char *paths, char *filename, char *mode)
94{
95    FILE *f;
96    char *fp, *dp, *pp, path[BUFSIZ];
97
98    if (filename == 0 || *filename == 0)
99      return 0;
100
101    dp = paths;
102    while (dp && *dp) {
103        pp = path;
104        while (*dp && *dp != ':')
105          *pp++ = *dp++;
106        *pp++ = *LDAP_DIRSEP;
107
108        fp = filename;
109        while (*fp)
110          *pp++ = *fp++;
111        *pp = 0;
112
113        if ((f = fopen(path, mode)) != 0)
114          return f;
115
116        if (*dp == ':')
117          dp++;
118    }
119
120    return 0;
121}
122#endif
123
124/**************************************************************************
125 *
126 * Support for the character properties.
127 *
128 **************************************************************************/
129
130#if !HARDCODE_DATA
131
/* Count taken from the ctype.dat header; 0 means no table is loaded. */
static ac_uint4 _ucprop_size;
/*
 * One entry per property giving its first index into _ucprop_ranges
 * (0xffff when the property has no ranges), plus one trailing entry that
 * holds the total number of range words.
 */
static ac_uint2 *_ucprop_offsets;
/* Range words, used in pairs; stored in the same allocation as the offsets. */
static ac_uint4 *_ucprop_ranges;
135
136/*
137 * Return -1 on error, 0 if okay
138 */
139static int
140_ucprop_load(char *paths, int reload)
141{
142    FILE *in;
143    ac_uint4 size, i;
144    _ucheader_t hdr;
145
146    if (_ucprop_size > 0) {
147        if (!reload)
148          /*
149           * The character properties have already been loaded.
150           */
151          return 0;
152
153        /*
154         * Unload the current character property data in preparation for
155         * loading a new copy.  Only the first array has to be deallocated
156         * because all the memory for the arrays is allocated as a single
157         * block.
158         */
159        free((char *) _ucprop_offsets);
160        _ucprop_size = 0;
161    }
162
163    if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0)
164      return -1;
165
166    /*
167     * Load the header.
168     */
169    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
170
171    if (hdr.bom == 0xfffe) {
172        hdr.cnt = endian_short(hdr.cnt);
173        hdr.size.bytes = endian_long(hdr.size.bytes);
174    }
175
176    if ((_ucprop_size = hdr.cnt) == 0) {
177        fclose(in);
178        return -1;
179    }
180
181    /*
182     * Allocate all the storage needed for the lookup table.
183     */
184    _ucprop_offsets = (ac_uint2 *) malloc(hdr.size.bytes);
185
186    /*
187     * Calculate the offset into the storage for the ranges.  The offsets
188     * array is on a 4-byte boundary and one larger than the value provided in
189     * the header count field.  This means the offset to the ranges must be
190     * calculated after aligning the count to a 4-byte boundary.
191     */
192    if ((size = ((hdr.cnt + 1) * sizeof(ac_uint2))) & 3)
193      size += 4 - (size & 3);
194    size >>= 1;
195    _ucprop_ranges = (ac_uint4 *) (_ucprop_offsets + size);
196
197    /*
198     * Load the offset array.
199     */
200    fread((char *) _ucprop_offsets, sizeof(ac_uint2), size, in);
201
202    /*
203     * Do an endian swap if necessary.  Don't forget there is an extra node on
204     * the end with the final index.
205     */
206    if (hdr.bom == 0xfffe) {
207        for (i = 0; i <= _ucprop_size; i++)
208          _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]);
209    }
210
211    /*
212     * Load the ranges.  The number of elements is in the last array position
213     * of the offsets.
214     */
215    fread((char *) _ucprop_ranges, sizeof(ac_uint4),
216          _ucprop_offsets[_ucprop_size], in);
217
218    fclose(in);
219
220    /*
221     * Do an endian swap if necessary.
222     */
223    if (hdr.bom == 0xfffe) {
224        for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++)
225          _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]);
226    }
227    return 0;
228}
229
230static void
231_ucprop_unload(void)
232{
233    if (_ucprop_size == 0)
234      return;
235
236    /*
237     * Only need to free the offsets because the memory is allocated as a
238     * single block.
239     */
240    free((char *) _ucprop_offsets);
241    _ucprop_size = 0;
242}
243#endif
244
245static int
246_ucprop_lookup(ac_uint4 code, ac_uint4 n)
247{
248    long l, r, m;
249
250    if (_ucprop_size == 0)
251      return 0;
252
253    /*
254     * There is an extra node on the end of the offsets to allow this routine
255     * to work right.  If the index is 0xffff, then there are no nodes for the
256     * property.
257     */
258    if ((l = _ucprop_offsets[n]) == 0xffff)
259      return 0;
260
261    /*
262     * Locate the next offset that is not 0xffff.  The sentinel at the end of
263     * the array is the max index value.
264     */
265    for (m = 1;
266         n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ;
267
268    r = _ucprop_offsets[n + m] - 1;
269
270    while (l <= r) {
271        /*
272         * Determine a "mid" point and adjust to make sure the mid point is at
273         * the beginning of a range pair.
274         */
275        m = (l + r) >> 1;
276        m -= (m & 1);
277        if (code > _ucprop_ranges[m + 1])
278          l = m + 2;
279        else if (code < _ucprop_ranges[m])
280          r = m - 2;
281        else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
282          return 1;
283    }
284    return 0;
285}
286
287int
288ucisprop(ac_uint4 code, ac_uint4 mask1, ac_uint4 mask2)
289{
290    ac_uint4 i;
291
292    if (mask1 == 0 && mask2 == 0)
293      return 0;
294
295    for (i = 0; mask1 && i < 32; i++) {
296        if ((mask1 & masks32[i]) && _ucprop_lookup(code, i))
297          return 1;
298    }
299
300    for (i = 32; mask2 && i < _ucprop_size; i++) {
301        if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i))
302          return 1;
303    }
304
305    return 0;
306}
307
308/**************************************************************************
309 *
310 * Support for case mapping.
311 *
312 **************************************************************************/
313
314#if !HARDCODE_DATA
315
/* These record the number of slots in the map.
 * There are 3 words per slot.
 */
/* Total number of slots in _uccase_map; 0 means no table is loaded. */
static ac_uint4 _uccase_size;
/*
 * Slot counts of the first two sections of the map, copied from the file
 * header.  The conversion routines search section 0 for upper case input,
 * section 1 for lower case input and the remaining slots otherwise.
 */
static ac_uint2 _uccase_len[2];
/* The map itself; lookups compare against the first word of each slot. */
static ac_uint4 *_uccase_map;
322
323/*
324 * Return -1 on error, 0 if okay
325 */
326static int
327_uccase_load(char *paths, int reload)
328{
329    FILE *in;
330    ac_uint4 i;
331    _ucheader_t hdr;
332
333    if (_uccase_size > 0) {
334        if (!reload)
335          /*
336           * The case mappings have already been loaded.
337           */
338          return 0;
339
340        free((char *) _uccase_map);
341        _uccase_size = 0;
342    }
343
344    if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0)
345      return -1;
346
347    /*
348     * Load the header.
349     */
350    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
351
352    if (hdr.bom == 0xfffe) {
353        hdr.cnt = endian_short(hdr.cnt);
354        hdr.size.len[0] = endian_short(hdr.size.len[0]);
355        hdr.size.len[1] = endian_short(hdr.size.len[1]);
356    }
357
358    /*
359     * Set the node count and lengths of the upper and lower case mapping
360     * tables.
361     */
362    _uccase_size = hdr.cnt;
363    _uccase_len[0] = hdr.size.len[0];
364    _uccase_len[1] = hdr.size.len[1];
365
366    _uccase_map = (ac_uint4 *)
367        malloc(_uccase_size * 3 * sizeof(ac_uint4));
368
369    /*
370     * Load the case mapping table.
371     */
372    fread((char *) _uccase_map, sizeof(ac_uint4), _uccase_size * 3, in);
373
374    /*
375     * Do an endian swap if necessary.
376     */
377    if (hdr.bom == 0xfffe) {
378        for (i = 0; i < _uccase_size * 3; i++)
379          _uccase_map[i] = endian_long(_uccase_map[i]);
380    }
381    fclose(in);
382    return 0;
383}
384
385static void
386_uccase_unload(void)
387{
388    if (_uccase_size == 0)
389      return;
390
391    free((char *) _uccase_map);
392    _uccase_size = 0;
393}
394#endif
395
396static ac_uint4
397_uccase_lookup(ac_uint4 code, long l, long r, int field)
398{
399    long m;
400	const ac_uint4 *tmp;
401
402    /*
403     * Do the binary search.
404     */
405    while (l <= r) {
406        /*
407         * Determine a "mid" point and adjust to make sure the mid point is at
408         * the beginning of a case mapping triple.
409         */
410        m = (l + r) >> 1;
411		tmp = &_uccase_map[m*3];
412        if (code > *tmp)
413          l = m + 1;
414        else if (code < *tmp)
415          r = m - 1;
416        else if (code == *tmp)
417          return tmp[field];
418    }
419
420    return code;
421}
422
423ac_uint4
424uctoupper(ac_uint4 code)
425{
426    int field;
427    long l, r;
428
429    if (ucisupper(code))
430      return code;
431
432    if (ucislower(code)) {
433        /*
434         * The character is lower case.
435         */
436        field = 2;
437        l = _uccase_len[0];
438        r = (l + _uccase_len[1]) - 1;
439    } else {
440        /*
441         * The character is title case.
442         */
443        field = 1;
444        l = _uccase_len[0] + _uccase_len[1];
445        r = _uccase_size - 1;
446    }
447    return _uccase_lookup(code, l, r, field);
448}
449
450ac_uint4
451uctolower(ac_uint4 code)
452{
453    int field;
454    long l, r;
455
456    if (ucislower(code))
457      return code;
458
459    if (ucisupper(code)) {
460        /*
461         * The character is upper case.
462         */
463        field = 1;
464        l = 0;
465        r = _uccase_len[0] - 1;
466    } else {
467        /*
468         * The character is title case.
469         */
470        field = 2;
471        l = _uccase_len[0] + _uccase_len[1];
472        r = _uccase_size - 1;
473    }
474    return _uccase_lookup(code, l, r, field);
475}
476
477ac_uint4
478uctotitle(ac_uint4 code)
479{
480    int field;
481    long l, r;
482
483    if (ucistitle(code))
484      return code;
485
486    /*
487     * The offset will always be the same for converting to title case.
488     */
489    field = 2;
490
491    if (ucisupper(code)) {
492        /*
493         * The character is upper case.
494         */
495        l = 0;
496        r = _uccase_len[0] - 1;
497    } else {
498        /*
499         * The character is lower case.
500         */
501        l = _uccase_len[0];
502        r = (l + _uccase_len[1]) - 1;
503    }
504    return _uccase_lookup(code, l, r, field);
505}
506
507/**************************************************************************
508 *
509 * Support for compositions.
510 *
511 **************************************************************************/
512
513#if !HARDCODE_DATA
514
/*
 * Upper bound (in words) of the part of _uccomp_data that uccomp()
 * searches.  The loader sets it to the end of the leading run of records
 * whose second word is 2; 0 means nothing is loaded.
 */
static ac_uint4  _uccomp_size;
/*
 * Composition records of four words each.  uccomp() returns word 0 and
 * compares words 2 and 3 against the pair being composed.
 */
static ac_uint4 *_uccomp_data;
517
518/*
519 * Return -1 on error, 0 if okay
520 */
521static int
522_uccomp_load(char *paths, int reload)
523{
524    FILE *in;
525    ac_uint4 size, i;
526    _ucheader_t hdr;
527
528    if (_uccomp_size > 0) {
529        if (!reload)
530            /*
531             * The compositions have already been loaded.
532             */
533            return 0;
534
535        free((char *) _uccomp_data);
536        _uccomp_size = 0;
537    }
538
539    if ((in = _ucopenfile(paths, "comp.dat", "rb")) == 0)
540        return -1;
541
542    /*
543     * Load the header.
544     */
545    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
546
547    if (hdr.bom == 0xfffe) {
548        hdr.cnt = endian_short(hdr.cnt);
549        hdr.size.bytes = endian_long(hdr.size.bytes);
550    }
551
552    _uccomp_size = hdr.cnt;
553    _uccomp_data = (ac_uint4 *) malloc(hdr.size.bytes);
554
555    /*
556     * Read the composition data in.
557     */
558    size = hdr.size.bytes / sizeof(ac_uint4);
559    fread((char *) _uccomp_data, sizeof(ac_uint4), size, in);
560
561    /*
562     * Do an endian swap if necessary.
563     */
564    if (hdr.bom == 0xfffe) {
565        for (i = 0; i < size; i++)
566            _uccomp_data[i] = endian_long(_uccomp_data[i]);
567    }
568
569    /*
570     * Assume that the data is ordered on count, so that all compositions
571     * of length 2 come first. Only handling length 2 for now.
572     */
573    for (i = 1; i < size; i += 4)
574      if (_uccomp_data[i] != 2)
575        break;
576    _uccomp_size = i - 1;
577
578    fclose(in);
579    return 0;
580}
581
582static void
583_uccomp_unload(void)
584{
585    if (_uccomp_size == 0)
586        return;
587
588    free((char *) _uccomp_data);
589    _uccomp_size = 0;
590}
591#endif
592
593int
594uccomp(ac_uint4 node1, ac_uint4 node2, ac_uint4 *comp)
595{
596    int l, r, m;
597
598    l = 0;
599    r = _uccomp_size - 1;
600
601    while (l <= r) {
602        m = ((r + l) >> 1);
603        m -= m & 3;
604        if (node1 > _uccomp_data[m+2])
605          l = m + 4;
606        else if (node1 < _uccomp_data[m+2])
607          r = m - 4;
608        else if (node2 > _uccomp_data[m+3])
609          l = m + 4;
610        else if (node2 < _uccomp_data[m+3])
611          r = m - 4;
612        else {
613            *comp = _uccomp_data[m];
614            return 1;
615        }
616    }
617    return 0;
618}
619
620int
621uccomp_hangul(ac_uint4 *str, int len)
622{
623    const int SBase = 0xAC00, LBase = 0x1100,
624        VBase = 0x1161, TBase = 0x11A7,
625        LCount = 19, VCount = 21, TCount = 28,
626        NCount = VCount * TCount,   /* 588 */
627        SCount = LCount * NCount;   /* 11172 */
628
629    int i, rlen;
630    ac_uint4 ch, last, lindex, sindex;
631
632    last = str[0];
633    rlen = 1;
634    for ( i = 1; i < len; i++ ) {
635        ch = str[i];
636
637        /* check if two current characters are L and V */
638        lindex = last - LBase;
639        if (lindex < (ac_uint4) LCount) {
640            ac_uint4 vindex = ch - VBase;
641            if (vindex < (ac_uint4) VCount) {
642                /* make syllable of form LV */
643                last = SBase + (lindex * VCount + vindex) * TCount;
644                str[rlen-1] = last; /* reset last */
645                continue;
646            }
647        }
648
649        /* check if two current characters are LV and T */
650        sindex = last - SBase;
651        if (sindex < (ac_uint4) SCount
652			&& (sindex % TCount) == 0)
653		{
654            ac_uint4 tindex = ch - TBase;
655            if (tindex <= (ac_uint4) TCount) {
656                /* make syllable of form LVT */
657                last += tindex;
658                str[rlen-1] = last; /* reset last */
659                continue;
660            }
661        }
662
663        /* if neither case was true, just add the character */
664        last = ch;
665        str[rlen] = ch;
666        rlen++;
667    }
668    return rlen;
669}
670
671int
672uccanoncomp(ac_uint4 *str, int len)
673{
674    int i, stpos, copos;
675    ac_uint4 cl, prevcl, st, ch, co;
676
677    st = str[0];
678    stpos = 0;
679    copos = 1;
680    prevcl = uccombining_class(st) == 0 ? 0 : 256;
681
682    for (i = 1; i < len; i++) {
683        ch = str[i];
684        cl = uccombining_class(ch);
685        if (uccomp(st, ch, &co) && (prevcl < cl || prevcl == 0))
686          st = str[stpos] = co;
687        else {
688            if (cl == 0) {
689                stpos = copos;
690                st = ch;
691            }
692            prevcl = cl;
693            str[copos++] = ch;
694        }
695    }
696
697    return uccomp_hangul(str, copos);
698}
699
700/**************************************************************************
701 *
702 * Support for decompositions.
703 *
704 **************************************************************************/
705
706#if !HARDCODE_DATA
707
/*
 * Canonical decomposition table.  The size is twice the header count; 0
 * means nothing is loaded.  The nodes are read as (code, offset) words
 * followed by one extra word, and the decomposition data starts
 * _ucdcmp_size + 1 words into the same allocation.
 */
static ac_uint4  _ucdcmp_size;
static ac_uint4 *_ucdcmp_nodes;
static ac_uint4 *_ucdcmp_decomp;

/*
 * Compatibility decomposition table, laid out the same way.
 */
static ac_uint4  _uckdcmp_size;
static ac_uint4 *_uckdcmp_nodes;
static ac_uint4 *_uckdcmp_decomp;
715
716/*
717 * Return -1 on error, 0 if okay
718 */
719static int
720_ucdcmp_load(char *paths, int reload)
721{
722    FILE *in;
723    ac_uint4 size, i;
724    _ucheader_t hdr;
725
726    if (_ucdcmp_size > 0) {
727        if (!reload)
728            /*
729             * The decompositions have already been loaded.
730             */
731          return 0;
732
733        free((char *) _ucdcmp_nodes);
734        _ucdcmp_size = 0;
735    }
736
737    if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0)
738        return -1;
739
740    /*
741     * Load the header.
742     */
743    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
744
745    if (hdr.bom == 0xfffe) {
746        hdr.cnt = endian_short(hdr.cnt);
747        hdr.size.bytes = endian_long(hdr.size.bytes);
748    }
749
750    _ucdcmp_size = hdr.cnt << 1;
751    _ucdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
752    _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1);
753
754    /*
755     * Read the decomposition data in.
756     */
757    size = hdr.size.bytes / sizeof(ac_uint4);
758    fread((char *) _ucdcmp_nodes, sizeof(ac_uint4), size, in);
759
760    /*
761     * Do an endian swap if necessary.
762     */
763    if (hdr.bom == 0xfffe) {
764        for (i = 0; i < size; i++)
765            _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]);
766    }
767    fclose(in);
768    return 0;
769}
770
771/*
772 * Return -1 on error, 0 if okay
773 */
774static int
775_uckdcmp_load(char *paths, int reload)
776{
777    FILE *in;
778    ac_uint4 size, i;
779    _ucheader_t hdr;
780
781    if (_uckdcmp_size > 0) {
782        if (!reload)
783            /*
784             * The decompositions have already been loaded.
785             */
786          return 0;
787
788        free((char *) _uckdcmp_nodes);
789        _uckdcmp_size = 0;
790    }
791
792    if ((in = _ucopenfile(paths, "kdecomp.dat", "rb")) == 0)
793        return -1;
794
795    /*
796     * Load the header.
797     */
798    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
799
800    if (hdr.bom == 0xfffe) {
801        hdr.cnt = endian_short(hdr.cnt);
802        hdr.size.bytes = endian_long(hdr.size.bytes);
803    }
804
805    _uckdcmp_size = hdr.cnt << 1;
806    _uckdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
807    _uckdcmp_decomp = _uckdcmp_nodes + (_uckdcmp_size + 1);
808
809    /*
810     * Read the decomposition data in.
811     */
812    size = hdr.size.bytes / sizeof(ac_uint4);
813    fread((char *) _uckdcmp_nodes, sizeof(ac_uint4), size, in);
814
815    /*
816     * Do an endian swap if necessary.
817     */
818    if (hdr.bom == 0xfffe) {
819        for (i = 0; i < size; i++)
820            _uckdcmp_nodes[i] = endian_long(_uckdcmp_nodes[i]);
821    }
822    fclose(in);
823    return 0;
824}
825
826static void
827_ucdcmp_unload(void)
828{
829    if (_ucdcmp_size == 0)
830      return;
831
832    /*
833     * Only need to free the offsets because the memory is allocated as a
834     * single block.
835     */
836    free((char *) _ucdcmp_nodes);
837    _ucdcmp_size = 0;
838}
839
840static void
841_uckdcmp_unload(void)
842{
843    if (_uckdcmp_size == 0)
844      return;
845
846    /*
847     * Only need to free the offsets because the memory is allocated as a
848     * single block.
849     */
850    free((char *) _uckdcmp_nodes);
851    _uckdcmp_size = 0;
852}
853#endif
854
855int
856ucdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp)
857{
858    long l, r, m;
859
860    if (code < _ucdcmp_nodes[0]) {
861	return 0;
862    }
863
864    l = 0;
865    r = _ucdcmp_nodes[_ucdcmp_size] - 1;
866
867    while (l <= r) {
868        /*
869         * Determine a "mid" point and adjust to make sure the mid point is at
870         * the beginning of a code+offset pair.
871         */
872        m = (l + r) >> 1;
873        m -= (m & 1);
874        if (code > _ucdcmp_nodes[m])
875          l = m + 2;
876        else if (code < _ucdcmp_nodes[m])
877          r = m - 2;
878        else if (code == _ucdcmp_nodes[m]) {
879            *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1];
880            *decomp = (ac_uint4*)&_ucdcmp_decomp[_ucdcmp_nodes[m + 1]];
881            return 1;
882        }
883    }
884    return 0;
885}
886
887int
888uckdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp)
889{
890    long l, r, m;
891
892    if (code < _uckdcmp_nodes[0]) {
893	return 0;
894    }
895
896    l = 0;
897    r = _uckdcmp_nodes[_uckdcmp_size] - 1;
898
899    while (l <= r) {
900        /*
901         * Determine a "mid" point and adjust to make sure the mid point is at
902         * the beginning of a code+offset pair.
903         */
904        m = (l + r) >> 1;
905        m -= (m & 1);
906        if (code > _uckdcmp_nodes[m])
907          l = m + 2;
908        else if (code < _uckdcmp_nodes[m])
909          r = m - 2;
910        else if (code == _uckdcmp_nodes[m]) {
911            *num = _uckdcmp_nodes[m + 3] - _uckdcmp_nodes[m + 1];
912            *decomp = (ac_uint4*)&_uckdcmp_decomp[_uckdcmp_nodes[m + 1]];
913            return 1;
914        }
915    }
916    return 0;
917}
918
919int
920ucdecomp_hangul(ac_uint4 code, ac_uint4 *num, ac_uint4 decomp[])
921{
922    if (!ucishangul(code))
923      return 0;
924
925    code -= 0xac00;
926    decomp[0] = 0x1100 + (ac_uint4) (code / 588);
927    decomp[1] = 0x1161 + (ac_uint4) ((code % 588) / 28);
928    decomp[2] = 0x11a7 + (ac_uint4) (code % 28);
929    *num = (decomp[2] != 0x11a7) ? 3 : 2;
930
931    return 1;
932}
933
934/* mode == 0 for canonical, mode == 1 for compatibility */
935static int
936uccanoncompatdecomp(const ac_uint4 *in, int inlen,
937		    ac_uint4 **out, int *outlen, short mode, void *ctx)
938{
939    int l, size;
940	unsigned i, j, k;
941    ac_uint4 num, class, *decomp, hangdecomp[3];
942
943    size = inlen * 2;
944    *out = (ac_uint4 *) ber_memalloc_x(size * sizeof(**out), ctx);
945    if (*out == NULL)
946        return *outlen = -1;
947
948    i = 0;
949    for (j = 0; j < (unsigned) inlen; j++) {
950	if (mode ? uckdecomp(in[j], &num, &decomp) : ucdecomp(in[j], &num, &decomp)) {
951            if ( size - i < num) {
952                size = inlen + i - j + num - 1;
953                *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx );
954                if (*out == NULL)
955                    return *outlen = -1;
956            }
957            for (k = 0; k < num; k++) {
958                class = uccombining_class(decomp[k]);
959                if (class == 0) {
960                    (*out)[i] = decomp[k];
961                } else {
962                    for (l = i; l > 0; l--)
963                        if (class >= uccombining_class((*out)[l-1]))
964                            break;
965                    AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out));
966                    (*out)[l] = decomp[k];
967                }
968                i++;
969            }
970        } else if (ucdecomp_hangul(in[j], &num, hangdecomp)) {
971            if (size - i < num) {
972                size = inlen + i - j + num - 1;
973                *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx);
974                if (*out == NULL)
975                    return *outlen = -1;
976            }
977            for (k = 0; k < num; k++) {
978                (*out)[i] = hangdecomp[k];
979                i++;
980            }
981        } else {
982            if (size - i < 1) {
983                size = inlen + i - j;
984                *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx);
985                if (*out == NULL)
986                    return *outlen = -1;
987            }
988            class = uccombining_class(in[j]);
989            if (class == 0) {
990                (*out)[i] = in[j];
991            } else {
992                for (l = i; l > 0; l--)
993                    if (class >= uccombining_class((*out)[l-1]))
994                        break;
995                AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out));
996                (*out)[l] = in[j];
997            }
998            i++;
999        }
1000    }
1001    return *outlen = i;
1002}
1003
/*
 * Canonical decomposition of `in`: forwards to uccanoncompatdecomp()
 * with mode 0 and returns its result.
 */
int
uccanondecomp(const ac_uint4 *in, int inlen,
              ac_uint4 **out, int *outlen, void *ctx)
{
    return uccanoncompatdecomp(in, inlen, out, outlen, 0, ctx);
}
1010
/*
 * Compatibility decomposition of `in`: forwards to uccanoncompatdecomp()
 * with mode 1 and returns its result.
 */
int
uccompatdecomp(const ac_uint4 *in, int inlen,
	       ac_uint4 **out, int *outlen, void *ctx)
{
    return uccanoncompatdecomp(in, inlen, out, outlen, 1, ctx);
}
1017
1018/**************************************************************************
1019 *
1020 * Support for combining classes.
1021 *
1022 **************************************************************************/
1023
1024#if !HARDCODE_DATA
/* Number of words in _uccmcl_nodes (three per entry); 0 means not loaded. */
static ac_uint4  _uccmcl_size;
/* Entries of three words: first code, last code, combining class. */
static ac_uint4 *_uccmcl_nodes;
1027
1028/*
1029 * Return -1 on error, 0 if okay
1030 */
1031static int
1032_uccmcl_load(char *paths, int reload)
1033{
1034    FILE *in;
1035    ac_uint4 i;
1036    _ucheader_t hdr;
1037
1038    if (_uccmcl_size > 0) {
1039        if (!reload)
1040            /*
1041             * The combining classes have already been loaded.
1042             */
1043            return 0;
1044
1045        free((char *) _uccmcl_nodes);
1046        _uccmcl_size = 0;
1047    }
1048
1049    if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0)
1050        return -1;
1051
1052    /*
1053     * Load the header.
1054     */
1055    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
1056
1057    if (hdr.bom == 0xfffe) {
1058        hdr.cnt = endian_short(hdr.cnt);
1059        hdr.size.bytes = endian_long(hdr.size.bytes);
1060    }
1061
1062    _uccmcl_size = hdr.cnt * 3;
1063    _uccmcl_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
1064
1065    /*
1066     * Read the combining classes in.
1067     */
1068    fread((char *) _uccmcl_nodes, sizeof(ac_uint4), _uccmcl_size, in);
1069
1070    /*
1071     * Do an endian swap if necessary.
1072     */
1073    if (hdr.bom == 0xfffe) {
1074        for (i = 0; i < _uccmcl_size; i++)
1075            _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]);
1076    }
1077    fclose(in);
1078    return 0;
1079}
1080
1081static void
1082_uccmcl_unload(void)
1083{
1084    if (_uccmcl_size == 0)
1085      return;
1086
1087    free((char *) _uccmcl_nodes);
1088    _uccmcl_size = 0;
1089}
1090#endif
1091
1092ac_uint4
1093uccombining_class(ac_uint4 code)
1094{
1095    long l, r, m;
1096
1097    l = 0;
1098    r = _uccmcl_size - 1;
1099
1100    while (l <= r) {
1101        m = (l + r) >> 1;
1102        m -= (m % 3);
1103        if (code > _uccmcl_nodes[m + 1])
1104          l = m + 3;
1105        else if (code < _uccmcl_nodes[m])
1106          r = m - 3;
1107        else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1])
1108          return _uccmcl_nodes[m + 2];
1109    }
1110    return 0;
1111}
1112
1113/**************************************************************************
1114 *
1115 * Support for numeric values.
1116 *
1117 **************************************************************************/
1118
1119#if !HARDCODE_DATA
/* Pairs of words: a code, then an index into _ucnum_vals. */
static ac_uint4 *_ucnum_nodes;
/* Number of words in _ucnum_nodes; 0 means nothing is loaded. */
static ac_uint4 _ucnum_size;
/*
 * Numerator/denominator values stored as consecutive shorts; placed
 * directly after the nodes in the same allocation.
 */
static short *_ucnum_vals;
1123
1124/*
1125 * Return -1 on error, 0 if okay
1126 */
1127static int
1128_ucnumb_load(char *paths, int reload)
1129{
1130    FILE *in;
1131    ac_uint4 size, i;
1132    _ucheader_t hdr;
1133
1134    if (_ucnum_size > 0) {
1135        if (!reload)
1136          /*
1137           * The numbers have already been loaded.
1138           */
1139          return 0;
1140
1141        free((char *) _ucnum_nodes);
1142        _ucnum_size = 0;
1143    }
1144
1145    if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0)
1146      return -1;
1147
1148    /*
1149     * Load the header.
1150     */
1151    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
1152
1153    if (hdr.bom == 0xfffe) {
1154        hdr.cnt = endian_short(hdr.cnt);
1155        hdr.size.bytes = endian_long(hdr.size.bytes);
1156    }
1157
1158    _ucnum_size = hdr.cnt;
1159    _ucnum_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
1160    _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size);
1161
1162    /*
1163     * Read the combining classes in.
1164     */
1165    fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in);
1166
1167    /*
1168     * Do an endian swap if necessary.
1169     */
1170    if (hdr.bom == 0xfffe) {
1171        for (i = 0; i < _ucnum_size; i++)
1172          _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]);
1173
1174        /*
1175         * Determine the number of values that have to be adjusted.
1176         */
1177        size = (hdr.size.bytes -
1178                (_ucnum_size * (sizeof(ac_uint4) << 1))) /
1179            sizeof(short);
1180
1181        for (i = 0; i < size; i++)
1182          _ucnum_vals[i] = endian_short(_ucnum_vals[i]);
1183    }
1184    fclose(in);
1185    return 0;
1186}
1187
1188static void
1189_ucnumb_unload(void)
1190{
1191    if (_ucnum_size == 0)
1192      return;
1193
1194    free((char *) _ucnum_nodes);
1195    _ucnum_size = 0;
1196}
1197#endif
1198
1199int
1200ucnumber_lookup(ac_uint4 code, struct ucnumber *num)
1201{
1202    long l, r, m;
1203    short *vp;
1204
1205    l = 0;
1206    r = _ucnum_size - 1;
1207    while (l <= r) {
1208        /*
1209         * Determine a "mid" point and adjust to make sure the mid point is at
1210         * the beginning of a code+offset pair.
1211         */
1212        m = (l + r) >> 1;
1213        m -= (m & 1);
1214        if (code > _ucnum_nodes[m])
1215          l = m + 2;
1216        else if (code < _ucnum_nodes[m])
1217          r = m - 2;
1218        else {
1219            vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1];
1220            num->numerator = (int) *vp++;
1221            num->denominator = (int) *vp;
1222            return 1;
1223        }
1224    }
1225    return 0;
1226}
1227
1228int
1229ucdigit_lookup(ac_uint4 code, int *digit)
1230{
1231    long l, r, m;
1232    short *vp;
1233
1234    l = 0;
1235    r = _ucnum_size - 1;
1236    while (l <= r) {
1237        /*
1238         * Determine a "mid" point and adjust to make sure the mid point is at
1239         * the beginning of a code+offset pair.
1240         */
1241        m = (l + r) >> 1;
1242        m -= (m & 1);
1243        if (code > _ucnum_nodes[m])
1244          l = m + 2;
1245        else if (code < _ucnum_nodes[m])
1246          r = m - 2;
1247        else {
1248            vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1];
1249            if (*vp == *(vp + 1)) {
1250              *digit = *vp;
1251              return 1;
1252            }
1253            return 0;
1254        }
1255    }
1256    return 0;
1257}
1258
1259struct ucnumber
1260ucgetnumber(ac_uint4 code)
1261{
1262    struct ucnumber num;
1263
1264    /*
1265     * Initialize with some arbitrary value, because the caller simply cannot
1266     * tell for sure if the code is a number without calling the ucisnumber()
1267     * macro before calling this function.
1268     */
1269    num.numerator = num.denominator = -111;
1270
1271    (void) ucnumber_lookup(code, &num);
1272
1273    return num;
1274}
1275
1276int
1277ucgetdigit(ac_uint4 code)
1278{
1279    int dig;
1280
1281    /*
1282     * Initialize with some arbitrary value, because the caller simply cannot
1283     * tell for sure if the code is a number without calling the ucisdigit()
1284     * macro before calling this function.
1285     */
1286    dig = -111;
1287
1288    (void) ucdigit_lookup(code, &dig);
1289
1290    return dig;
1291}
1292
1293/**************************************************************************
1294 *
1295 * Setup and cleanup routines.
1296 *
1297 **************************************************************************/
1298
1299#if HARDCODE_DATA
/*
 * With the data tables compiled in there is nothing to read from disk, so
 * loading always succeeds and both arguments are ignored.
 */
int
ucdata_load(char *paths, int masks)
{
    (void) paths;
    (void) masks;
    return 0;
}

/*
 * With the data tables compiled in there is nothing to release.
 */
void
ucdata_unload(int masks)
{
    (void) masks;
}

/*
 * With the data tables compiled in there is nothing to re-read, so
 * reloading always succeeds and both arguments are ignored.
 */
int
ucdata_reload(char *paths, int masks)
{
    (void) paths;
    (void) masks;
    return 0;
}
1303#else
1304/*
1305 * Return 0 if okay, negative on error
1306 */
1307int
1308ucdata_load(char *paths, int masks)
1309{
1310    int error = 0;
1311
1312    if (masks & UCDATA_CTYPE)
1313      error |= _ucprop_load(paths, 0) < 0 ? UCDATA_CTYPE : 0;
1314    if (masks & UCDATA_CASE)
1315      error |= _uccase_load(paths, 0) < 0 ? UCDATA_CASE : 0;
1316    if (masks & UCDATA_DECOMP)
1317      error |= _ucdcmp_load(paths, 0) < 0 ? UCDATA_DECOMP : 0;
1318    if (masks & UCDATA_CMBCL)
1319      error |= _uccmcl_load(paths, 0) < 0 ? UCDATA_CMBCL : 0;
1320    if (masks & UCDATA_NUM)
1321      error |= _ucnumb_load(paths, 0) < 0 ? UCDATA_NUM : 0;
1322    if (masks & UCDATA_COMP)
1323      error |= _uccomp_load(paths, 0) < 0 ? UCDATA_COMP : 0;
1324    if (masks & UCDATA_KDECOMP)
1325      error |= _uckdcmp_load(paths, 0) < 0 ? UCDATA_KDECOMP : 0;
1326
1327    return -error;
1328}
1329
1330void
1331ucdata_unload(int masks)
1332{
1333    if (masks & UCDATA_CTYPE)
1334      _ucprop_unload();
1335    if (masks & UCDATA_CASE)
1336      _uccase_unload();
1337    if (masks & UCDATA_DECOMP)
1338      _ucdcmp_unload();
1339    if (masks & UCDATA_CMBCL)
1340      _uccmcl_unload();
1341    if (masks & UCDATA_NUM)
1342      _ucnumb_unload();
1343    if (masks & UCDATA_COMP)
1344      _uccomp_unload();
1345    if (masks & UCDATA_KDECOMP)
1346      _uckdcmp_unload();
1347}
1348
1349/*
1350 * Return 0 if okay, negative on error
1351 */
1352int
1353ucdata_reload(char *paths, int masks)
1354{
1355    int error = 0;
1356
1357    if (masks & UCDATA_CTYPE)
1358        error |= _ucprop_load(paths, 1) < 0 ? UCDATA_CTYPE : 0;
1359    if (masks & UCDATA_CASE)
1360        error |= _uccase_load(paths, 1) < 0 ? UCDATA_CASE : 0;
1361    if (masks & UCDATA_DECOMP)
1362        error |= _ucdcmp_load(paths, 1) < 0 ? UCDATA_DECOMP : 0;
1363    if (masks & UCDATA_CMBCL)
1364        error |= _uccmcl_load(paths, 1) < 0 ? UCDATA_CMBCL : 0;
1365    if (masks & UCDATA_NUM)
1366        error |= _ucnumb_load(paths, 1) < 0 ? UCDATA_NUM : 0;
1367    if (masks & UCDATA_COMP)
1368        error |= _uccomp_load(paths, 1) < 0 ? UCDATA_COMP : 0;
1369    if (masks & UCDATA_KDECOMP)
1370        error |= _uckdcmp_load(paths, 1) < 0 ? UCDATA_KDECOMP : 0;
1371
1372    return -error;
1373}
1374#endif
1375
1376#ifdef TEST
1377
1378void
1379main(void)
1380{
1381    int dig;
1382    ac_uint4 i, lo, *dec;
1383    struct ucnumber num;
1384
1385/*    ucdata_setup("."); */
1386
1387    if (ucisweak(0x30))
1388      printf("WEAK\n");
1389    else
1390      printf("NOT WEAK\n");
1391
1392    printf("LOWER 0x%04lX\n", uctolower(0xff3a));
1393    printf("UPPER 0x%04lX\n", uctoupper(0xff5a));
1394
1395    if (ucisalpha(0x1d5))
1396      printf("ALPHA\n");
1397    else
1398      printf("NOT ALPHA\n");
1399
1400    if (ucisupper(0x1d5)) {
1401        printf("UPPER\n");
1402        lo = uctolower(0x1d5);
1403        printf("0x%04lx\n", lo);
1404        lo = uctotitle(0x1d5);
1405        printf("0x%04lx\n", lo);
1406    } else
1407      printf("NOT UPPER\n");
1408
1409    if (ucistitle(0x1d5))
1410      printf("TITLE\n");
1411    else
1412      printf("NOT TITLE\n");
1413
1414    if (uciscomposite(0x1d5))
1415      printf("COMPOSITE\n");
1416    else
1417      printf("NOT COMPOSITE\n");
1418
1419    if (ucdecomp(0x1d5, &lo, &dec)) {
1420        for (i = 0; i < lo; i++)
1421          printf("0x%04lx ", dec[i]);
1422        putchar('\n');
1423    }
1424
1425    if ((lo = uccombining_class(0x41)) != 0)
1426      printf("0x41 CCL %ld\n", lo);
1427
1428    if (ucisxdigit(0xfeff))
1429      printf("0xFEFF HEX DIGIT\n");
1430    else
1431      printf("0xFEFF NOT HEX DIGIT\n");
1432
1433    if (ucisdefined(0x10000))
1434      printf("0x10000 DEFINED\n");
1435    else
1436      printf("0x10000 NOT DEFINED\n");
1437
1438    if (ucnumber_lookup(0x30, &num)) {
1439        if (num.denominator != 1)
1440          printf("UCNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator);
1441        else
1442          printf("UCNUMBER: 0x30 = %d\n", num.numerator);
1443    } else
1444      printf("UCNUMBER: 0x30 NOT A NUMBER\n");
1445
1446    if (ucnumber_lookup(0xbc, &num)) {
1447        if (num.denominator != 1)
1448          printf("UCNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator);
1449        else
1450          printf("UCNUMBER: 0xbc = %d\n", num.numerator);
1451    } else
1452      printf("UCNUMBER: 0xbc NOT A NUMBER\n");
1453
1454
1455    if (ucnumber_lookup(0xff19, &num)) {
1456        if (num.denominator != 1)
1457          printf("UCNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator);
1458        else
1459          printf("UCNUMBER: 0xff19 = %d\n", num.numerator);
1460    } else
1461      printf("UCNUMBER: 0xff19 NOT A NUMBER\n");
1462
1463    if (ucnumber_lookup(0x4e00, &num)) {
1464        if (num.denominator != 1)
1465          printf("UCNUMBER: 0x4e00 = %d/%d\n", num.numerator, num.denominator);
1466        else
1467          printf("UCNUMBER: 0x4e00 = %d\n", num.numerator);
1468    } else
1469      printf("UCNUMBER: 0x4e00 NOT A NUMBER\n");
1470
1471    if (ucdigit_lookup(0x06f9, &dig))
1472      printf("UCDIGIT: 0x6f9 = %d\n", dig);
1473    else
1474      printf("UCDIGIT: 0x6f9 NOT A NUMBER\n");
1475
1476    dig = ucgetdigit(0x0969);
1477    printf("UCGETDIGIT: 0x969 = %d\n", dig);
1478
1479    num = ucgetnumber(0x30);
1480    if (num.denominator != 1)
1481      printf("UCGETNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator);
1482    else
1483      printf("UCGETNUMBER: 0x30 = %d\n", num.numerator);
1484
1485    num = ucgetnumber(0xbc);
1486    if (num.denominator != 1)
1487      printf("UCGETNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator);
1488    else
1489      printf("UCGETNUMBER: 0xbc = %d\n", num.numerator);
1490
1491    num = ucgetnumber(0xff19);
1492    if (num.denominator != 1)
1493      printf("UCGETNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator);
1494    else
1495      printf("UCGETNUMBER: 0xff19 = %d\n", num.numerator);
1496
1497/*    ucdata_cleanup(); */
1498    exit(0);
1499}
1500
1501#endif /* TEST */
1502