1/*	$NetBSD: ucdata.c,v 1.1.1.3 2010/12/12 15:21:56 adam Exp $	*/
2
3/* OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucdata.c,v 1.32.2.5 2010/04/13 20:23:04 kurt Exp */
4/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 1998-2010 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17/* Copyright 2001 Computing Research Labs, New Mexico State University
18 *
19 * Permission is hereby granted, free of charge, to any person obtaining a
20 * copy of this software and associated documentation files (the "Software"),
21 * to deal in the Software without restriction, including without limitation
22 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
23 * and/or sell copies of the Software, and to permit persons to whom the
24 * Software is furnished to do so, subject to the following conditions:
25 *
26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software.
28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
32 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
33 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
34 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
35 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36 */
37/* Id: ucdata.c,v 1.4 2001/01/02 18:46:20 mleisher Exp" */
38
39#include "portable.h"
40#include "ldap_config.h"
41
42#include <stdio.h>
43#include <ac/stdlib.h>
44#include <ac/string.h>
45#include <ac/unistd.h>
46
47#include <ac/bytes.h>
48
49#include "lber_pvt.h"
50#include "ucdata.h"
51
52#ifndef HARDCODE_DATA
53#define	HARDCODE_DATA	1
54#endif
55
56#if HARDCODE_DATA
57#include "uctable.h"
58#endif
59
60/**************************************************************************
61 *
62 * Miscellaneous types, data, and support functions.
63 *
64 **************************************************************************/
65
/*
 * Header read from the start of each binary data file by the _uc*_load()
 * routines in this file.
 */
typedef struct {
    /* Byte order mark: the loaders byte-swap the data when this reads 0xfffe. */
    ac_uint2 bom;
    /* Element count; each loader derives its table size from this value. */
    ac_uint2 cnt;
    union {
        /* Payload size in bytes; most loaders use it as the allocation size. */
        ac_uint4 bytes;
        /* Used by the case-map loader instead: copied into _uccase_len[]. */
        ac_uint2 len[2];
    } size;
} _ucheader_t;
74
/*
 * A simple array of 32-bit masks for lookup: masks32[i] has only bit i set.
 * The table is read-only, so it is declared const.
 */
static const ac_uint4 masks32[32] = {
	0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL,
	0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL,
	0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL,
	0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL,
	0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL,
	0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL,
	0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL,
	0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL
};
88
/*
 * Byte-swap helpers for data written in the opposite byte order.
 *
 * Every byte that is shifted down is masked with 0xff.  Without the mask,
 * endian_short() applied to a negative short would sign-extend and force
 * the high byte of the result to 0xff.
 *
 * NOTE: the argument is evaluated more than once; pass only expressions
 * without side effects.
 */
#define endian_short(cc) ((((cc) >> 8) & 0xff) | (((cc) & 0xff) << 8))
#define endian_long(cc) ((((cc) & 0xff) << 24)|((((cc) >> 8) & 0xff) << 16)|\
                        ((((cc) >> 16) & 0xff) << 8)|(((cc) >> 24) & 0xff))
92
93#if !HARDCODE_DATA
94static FILE *
95_ucopenfile(char *paths, char *filename, char *mode)
96{
97    FILE *f;
98    char *fp, *dp, *pp, path[BUFSIZ];
99
100    if (filename == 0 || *filename == 0)
101      return 0;
102
103    dp = paths;
104    while (dp && *dp) {
105        pp = path;
106        while (*dp && *dp != ':')
107          *pp++ = *dp++;
108        *pp++ = *LDAP_DIRSEP;
109
110        fp = filename;
111        while (*fp)
112          *pp++ = *fp++;
113        *pp = 0;
114
115        if ((f = fopen(path, mode)) != 0)
116          return f;
117
118        if (*dp == ':')
119          dp++;
120    }
121
122    return 0;
123}
124#endif
125
126/**************************************************************************
127 *
128 * Support for the character properties.
129 *
130 **************************************************************************/
131
132#if !HARDCODE_DATA
133
/* Number of properties; zero means no property data is loaded. */
static ac_uint4 _ucprop_size;
/*
 * One start index into _ucprop_ranges per property (0xffff when the
 * property has no ranges), followed by one extra entry holding the final
 * index.
 */
static ac_uint2 *_ucprop_offsets;
/*
 * (low, high) range pairs.  Points into the same allocation as
 * _ucprop_offsets, so it is never freed on its own.
 */
static ac_uint4 *_ucprop_ranges;
137
138/*
139 * Return -1 on error, 0 if okay
140 */
141static int
142_ucprop_load(char *paths, int reload)
143{
144    FILE *in;
145    ac_uint4 size, i;
146    _ucheader_t hdr;
147
148    if (_ucprop_size > 0) {
149        if (!reload)
150          /*
151           * The character properties have already been loaded.
152           */
153          return 0;
154
155        /*
156         * Unload the current character property data in preparation for
157         * loading a new copy.  Only the first array has to be deallocated
158         * because all the memory for the arrays is allocated as a single
159         * block.
160         */
161        free((char *) _ucprop_offsets);
162        _ucprop_size = 0;
163    }
164
165    if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0)
166      return -1;
167
168    /*
169     * Load the header.
170     */
171    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
172
173    if (hdr.bom == 0xfffe) {
174        hdr.cnt = endian_short(hdr.cnt);
175        hdr.size.bytes = endian_long(hdr.size.bytes);
176    }
177
178    if ((_ucprop_size = hdr.cnt) == 0) {
179        fclose(in);
180        return -1;
181    }
182
183    /*
184     * Allocate all the storage needed for the lookup table.
185     */
186    _ucprop_offsets = (ac_uint2 *) malloc(hdr.size.bytes);
187
188    /*
189     * Calculate the offset into the storage for the ranges.  The offsets
190     * array is on a 4-byte boundary and one larger than the value provided in
191     * the header count field.  This means the offset to the ranges must be
192     * calculated after aligning the count to a 4-byte boundary.
193     */
194    if ((size = ((hdr.cnt + 1) * sizeof(ac_uint2))) & 3)
195      size += 4 - (size & 3);
196    size >>= 1;
197    _ucprop_ranges = (ac_uint4 *) (_ucprop_offsets + size);
198
199    /*
200     * Load the offset array.
201     */
202    fread((char *) _ucprop_offsets, sizeof(ac_uint2), size, in);
203
204    /*
205     * Do an endian swap if necessary.  Don't forget there is an extra node on
206     * the end with the final index.
207     */
208    if (hdr.bom == 0xfffe) {
209        for (i = 0; i <= _ucprop_size; i++)
210          _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]);
211    }
212
213    /*
214     * Load the ranges.  The number of elements is in the last array position
215     * of the offsets.
216     */
217    fread((char *) _ucprop_ranges, sizeof(ac_uint4),
218          _ucprop_offsets[_ucprop_size], in);
219
220    fclose(in);
221
222    /*
223     * Do an endian swap if necessary.
224     */
225    if (hdr.bom == 0xfffe) {
226        for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++)
227          _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]);
228    }
229    return 0;
230}
231
232static void
233_ucprop_unload(void)
234{
235    if (_ucprop_size == 0)
236      return;
237
238    /*
239     * Only need to free the offsets because the memory is allocated as a
240     * single block.
241     */
242    free((char *) _ucprop_offsets);
243    _ucprop_size = 0;
244}
245#endif
246
247static int
248_ucprop_lookup(ac_uint4 code, ac_uint4 n)
249{
250    long l, r, m;
251
252    if (_ucprop_size == 0)
253      return 0;
254
255    /*
256     * There is an extra node on the end of the offsets to allow this routine
257     * to work right.  If the index is 0xffff, then there are no nodes for the
258     * property.
259     */
260    if ((l = _ucprop_offsets[n]) == 0xffff)
261      return 0;
262
263    /*
264     * Locate the next offset that is not 0xffff.  The sentinel at the end of
265     * the array is the max index value.
266     */
267    for (m = 1;
268         n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ;
269
270    r = _ucprop_offsets[n + m] - 1;
271
272    while (l <= r) {
273        /*
274         * Determine a "mid" point and adjust to make sure the mid point is at
275         * the beginning of a range pair.
276         */
277        m = (l + r) >> 1;
278        m -= (m & 1);
279        if (code > _ucprop_ranges[m + 1])
280          l = m + 2;
281        else if (code < _ucprop_ranges[m])
282          r = m - 2;
283        else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
284          return 1;
285    }
286    return 0;
287}
288
289int
290ucisprop(ac_uint4 code, ac_uint4 mask1, ac_uint4 mask2)
291{
292    ac_uint4 i;
293
294    if (mask1 == 0 && mask2 == 0)
295      return 0;
296
297    for (i = 0; mask1 && i < 32; i++) {
298        if ((mask1 & masks32[i]) && _ucprop_lookup(code, i))
299          return 1;
300    }
301
302    for (i = 32; mask2 && i < _ucprop_size; i++) {
303        if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i))
304          return 1;
305    }
306
307    return 0;
308}
309
310/**************************************************************************
311 *
312 * Support for case mapping.
313 *
314 **************************************************************************/
315
316#if !HARDCODE_DATA
317
/*
 * Case mapping table state.
 *
 * _uccase_size records the number of slots in the map.
 * There are 3 words per slot.
 */
static ac_uint4 _uccase_size;
/*
 * Slot counts of the first two sections of the map; uctoupper(),
 * uctolower() and uctotitle() use them to pick the slot range to search.
 */
static ac_uint2 _uccase_len[2];
/* The map itself; the first word of each slot is the search key. */
static ac_uint4 *_uccase_map;
324
325/*
326 * Return -1 on error, 0 if okay
327 */
328static int
329_uccase_load(char *paths, int reload)
330{
331    FILE *in;
332    ac_uint4 i;
333    _ucheader_t hdr;
334
335    if (_uccase_size > 0) {
336        if (!reload)
337          /*
338           * The case mappings have already been loaded.
339           */
340          return 0;
341
342        free((char *) _uccase_map);
343        _uccase_size = 0;
344    }
345
346    if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0)
347      return -1;
348
349    /*
350     * Load the header.
351     */
352    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
353
354    if (hdr.bom == 0xfffe) {
355        hdr.cnt = endian_short(hdr.cnt);
356        hdr.size.len[0] = endian_short(hdr.size.len[0]);
357        hdr.size.len[1] = endian_short(hdr.size.len[1]);
358    }
359
360    /*
361     * Set the node count and lengths of the upper and lower case mapping
362     * tables.
363     */
364    _uccase_size = hdr.cnt;
365    _uccase_len[0] = hdr.size.len[0];
366    _uccase_len[1] = hdr.size.len[1];
367
368    _uccase_map = (ac_uint4 *)
369        malloc(_uccase_size * 3 * sizeof(ac_uint4));
370
371    /*
372     * Load the case mapping table.
373     */
374    fread((char *) _uccase_map, sizeof(ac_uint4), _uccase_size * 3, in);
375
376    /*
377     * Do an endian swap if necessary.
378     */
379    if (hdr.bom == 0xfffe) {
380        for (i = 0; i < _uccase_size * 3; i++)
381          _uccase_map[i] = endian_long(_uccase_map[i]);
382    }
383    fclose(in);
384    return 0;
385}
386
387static void
388_uccase_unload(void)
389{
390    if (_uccase_size == 0)
391      return;
392
393    free((char *) _uccase_map);
394    _uccase_size = 0;
395}
396#endif
397
398static ac_uint4
399_uccase_lookup(ac_uint4 code, long l, long r, int field)
400{
401    long m;
402	const ac_uint4 *tmp;
403
404    /*
405     * Do the binary search.
406     */
407    while (l <= r) {
408        /*
409         * Determine a "mid" point and adjust to make sure the mid point is at
410         * the beginning of a case mapping triple.
411         */
412        m = (l + r) >> 1;
413		tmp = &_uccase_map[m*3];
414        if (code > *tmp)
415          l = m + 1;
416        else if (code < *tmp)
417          r = m - 1;
418        else if (code == *tmp)
419          return tmp[field];
420    }
421
422    return code;
423}
424
425ac_uint4
426uctoupper(ac_uint4 code)
427{
428    int field;
429    long l, r;
430
431    if (ucisupper(code))
432      return code;
433
434    if (ucislower(code)) {
435        /*
436         * The character is lower case.
437         */
438        field = 2;
439        l = _uccase_len[0];
440        r = (l + _uccase_len[1]) - 1;
441    } else {
442        /*
443         * The character is title case.
444         */
445        field = 1;
446        l = _uccase_len[0] + _uccase_len[1];
447        r = _uccase_size - 1;
448    }
449    return _uccase_lookup(code, l, r, field);
450}
451
452ac_uint4
453uctolower(ac_uint4 code)
454{
455    int field;
456    long l, r;
457
458    if (ucislower(code))
459      return code;
460
461    if (ucisupper(code)) {
462        /*
463         * The character is upper case.
464         */
465        field = 1;
466        l = 0;
467        r = _uccase_len[0] - 1;
468    } else {
469        /*
470         * The character is title case.
471         */
472        field = 2;
473        l = _uccase_len[0] + _uccase_len[1];
474        r = _uccase_size - 1;
475    }
476    return _uccase_lookup(code, l, r, field);
477}
478
479ac_uint4
480uctotitle(ac_uint4 code)
481{
482    int field;
483    long l, r;
484
485    if (ucistitle(code))
486      return code;
487
488    /*
489     * The offset will always be the same for converting to title case.
490     */
491    field = 2;
492
493    if (ucisupper(code)) {
494        /*
495         * The character is upper case.
496         */
497        l = 0;
498        r = _uccase_len[0] - 1;
499    } else {
500        /*
501         * The character is lower case.
502         */
503        l = _uccase_len[0];
504        r = (l + _uccase_len[1]) - 1;
505    }
506    return _uccase_lookup(code, l, r, field);
507}
508
509/**************************************************************************
510 *
511 * Support for compositions.
512 *
513 **************************************************************************/
514
515#if !HARDCODE_DATA
516
/*
 * Number of words of _uccomp_data searched by uccomp(); the loader sets it
 * to cover only the leading records whose count word is 2.
 */
static ac_uint4  _uccomp_size;
/*
 * Composition records of four words each, as read by uccomp(): word 0 is
 * the composed character, words 2 and 3 are the pair being looked up.
 * Word 1 is the count the loader compares against 2.
 */
static ac_uint4 *_uccomp_data;
519
520/*
521 * Return -1 on error, 0 if okay
522 */
523static int
524_uccomp_load(char *paths, int reload)
525{
526    FILE *in;
527    ac_uint4 size, i;
528    _ucheader_t hdr;
529
530    if (_uccomp_size > 0) {
531        if (!reload)
532            /*
533             * The compositions have already been loaded.
534             */
535            return 0;
536
537        free((char *) _uccomp_data);
538        _uccomp_size = 0;
539    }
540
541    if ((in = _ucopenfile(paths, "comp.dat", "rb")) == 0)
542        return -1;
543
544    /*
545     * Load the header.
546     */
547    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
548
549    if (hdr.bom == 0xfffe) {
550        hdr.cnt = endian_short(hdr.cnt);
551        hdr.size.bytes = endian_long(hdr.size.bytes);
552    }
553
554    _uccomp_size = hdr.cnt;
555    _uccomp_data = (ac_uint4 *) malloc(hdr.size.bytes);
556
557    /*
558     * Read the composition data in.
559     */
560    size = hdr.size.bytes / sizeof(ac_uint4);
561    fread((char *) _uccomp_data, sizeof(ac_uint4), size, in);
562
563    /*
564     * Do an endian swap if necessary.
565     */
566    if (hdr.bom == 0xfffe) {
567        for (i = 0; i < size; i++)
568            _uccomp_data[i] = endian_long(_uccomp_data[i]);
569    }
570
571    /*
572     * Assume that the data is ordered on count, so that all compositions
573     * of length 2 come first. Only handling length 2 for now.
574     */
575    for (i = 1; i < size; i += 4)
576      if (_uccomp_data[i] != 2)
577        break;
578    _uccomp_size = i - 1;
579
580    fclose(in);
581    return 0;
582}
583
584static void
585_uccomp_unload(void)
586{
587    if (_uccomp_size == 0)
588        return;
589
590    free((char *) _uccomp_data);
591    _uccomp_size = 0;
592}
593#endif
594
595int
596uccomp(ac_uint4 node1, ac_uint4 node2, ac_uint4 *comp)
597{
598    int l, r, m;
599
600    l = 0;
601    r = _uccomp_size - 1;
602
603    while (l <= r) {
604        m = ((r + l) >> 1);
605        m -= m & 3;
606        if (node1 > _uccomp_data[m+2])
607          l = m + 4;
608        else if (node1 < _uccomp_data[m+2])
609          r = m - 4;
610        else if (node2 > _uccomp_data[m+3])
611          l = m + 4;
612        else if (node2 < _uccomp_data[m+3])
613          r = m - 4;
614        else {
615            *comp = _uccomp_data[m];
616            return 1;
617        }
618    }
619    return 0;
620}
621
/*
 * Compose conjoining Hangul jamo in str[0..len-1] into precomposed
 * syllables, in place.
 *
 * A leading consonant (L) followed by a vowel (V) becomes an LV syllable,
 * and an LV syllable followed by a trailing consonant (T) becomes an LVT
 * syllable.  All other characters are copied through unchanged.
 *
 * str - the characters to compose; rewritten in place
 * len - number of characters in str
 *
 * Returns the number of characters left in str, or 0 when len <= 0.
 */
int
uccomp_hangul(ac_uint4 *str, int len)
{
    const int SBase = 0xAC00, LBase = 0x1100,
        VBase = 0x1161, TBase = 0x11A7,
        LCount = 19, VCount = 21, TCount = 28,
        NCount = VCount * TCount,   /* 588 */
        SCount = LCount * NCount;   /* 11172 */

    int i, rlen;
    ac_uint4 ch, last, lindex, sindex;

    /* Nothing to compose; do not touch str[0] of an empty string. */
    if (len <= 0)
        return 0;

    last = str[0];
    rlen = 1;
    for ( i = 1; i < len; i++ ) {
        ch = str[i];

        /* check if two current characters are L and V */
        lindex = last - LBase;
        if (lindex < (ac_uint4) LCount) {
            ac_uint4 vindex = ch - VBase;
            if (vindex < (ac_uint4) VCount) {
                /* make syllable of form LV */
                last = SBase + (lindex * VCount + vindex) * TCount;
                str[rlen-1] = last; /* reset last */
                continue;
            }
        }

        /* check if two current characters are LV and T */
        sindex = last - SBase;
        if (sindex < (ac_uint4) SCount
            && (sindex % TCount) == 0)
        {
            ac_uint4 tindex = ch - TBase;
            /*
             * TBase is one below the first trailing consonant, so valid
             * indexes are 1 .. TCount-1.  Index 0 (TBase itself) must not
             * be swallowed, and index TCount would yield the next LV
             * syllable instead of an LVT one.
             */
            if (tindex > 0 && tindex < (ac_uint4) TCount) {
                /* make syllable of form LVT */
                last += tindex;
                str[rlen-1] = last; /* reset last */
                continue;
            }
        }

        /* if neither case was true, just add the character */
        last = ch;
        str[rlen] = ch;
        rlen++;
    }
    return rlen;
}
672
673int
674uccanoncomp(ac_uint4 *str, int len)
675{
676    int i, stpos, copos;
677    ac_uint4 cl, prevcl, st, ch, co;
678
679    st = str[0];
680    stpos = 0;
681    copos = 1;
682    prevcl = uccombining_class(st) == 0 ? 0 : 256;
683
684    for (i = 1; i < len; i++) {
685        ch = str[i];
686        cl = uccombining_class(ch);
687        if (uccomp(st, ch, &co) && (prevcl < cl || prevcl == 0))
688          st = str[stpos] = co;
689        else {
690            if (cl == 0) {
691                stpos = copos;
692                st = ch;
693            }
694            prevcl = cl;
695            str[copos++] = ch;
696        }
697    }
698
699    return uccomp_hangul(str, copos);
700}
701
702/**************************************************************************
703 *
704 * Support for decompositions.
705 *
706 **************************************************************************/
707
708#if !HARDCODE_DATA
709
/*
 * Canonical decomposition tables.  _ucdcmp_size is twice the header count:
 * the nodes are searched as (code, offset) pairs, with one more word stored
 * at index _ucdcmp_size.  _ucdcmp_decomp points just past that word, into
 * the same allocation as _ucdcmp_nodes.
 */
static ac_uint4  _ucdcmp_size;
static ac_uint4 *_ucdcmp_nodes;
static ac_uint4 *_ucdcmp_decomp;

/* Compatibility decomposition tables; same layout as the canonical ones. */
static ac_uint4  _uckdcmp_size;
static ac_uint4 *_uckdcmp_nodes;
static ac_uint4 *_uckdcmp_decomp;
717
718/*
719 * Return -1 on error, 0 if okay
720 */
721static int
722_ucdcmp_load(char *paths, int reload)
723{
724    FILE *in;
725    ac_uint4 size, i;
726    _ucheader_t hdr;
727
728    if (_ucdcmp_size > 0) {
729        if (!reload)
730            /*
731             * The decompositions have already been loaded.
732             */
733          return 0;
734
735        free((char *) _ucdcmp_nodes);
736        _ucdcmp_size = 0;
737    }
738
739    if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0)
740        return -1;
741
742    /*
743     * Load the header.
744     */
745    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
746
747    if (hdr.bom == 0xfffe) {
748        hdr.cnt = endian_short(hdr.cnt);
749        hdr.size.bytes = endian_long(hdr.size.bytes);
750    }
751
752    _ucdcmp_size = hdr.cnt << 1;
753    _ucdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
754    _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1);
755
756    /*
757     * Read the decomposition data in.
758     */
759    size = hdr.size.bytes / sizeof(ac_uint4);
760    fread((char *) _ucdcmp_nodes, sizeof(ac_uint4), size, in);
761
762    /*
763     * Do an endian swap if necessary.
764     */
765    if (hdr.bom == 0xfffe) {
766        for (i = 0; i < size; i++)
767            _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]);
768    }
769    fclose(in);
770    return 0;
771}
772
773/*
774 * Return -1 on error, 0 if okay
775 */
776static int
777_uckdcmp_load(char *paths, int reload)
778{
779    FILE *in;
780    ac_uint4 size, i;
781    _ucheader_t hdr;
782
783    if (_uckdcmp_size > 0) {
784        if (!reload)
785            /*
786             * The decompositions have already been loaded.
787             */
788          return 0;
789
790        free((char *) _uckdcmp_nodes);
791        _uckdcmp_size = 0;
792    }
793
794    if ((in = _ucopenfile(paths, "kdecomp.dat", "rb")) == 0)
795        return -1;
796
797    /*
798     * Load the header.
799     */
800    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
801
802    if (hdr.bom == 0xfffe) {
803        hdr.cnt = endian_short(hdr.cnt);
804        hdr.size.bytes = endian_long(hdr.size.bytes);
805    }
806
807    _uckdcmp_size = hdr.cnt << 1;
808    _uckdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
809    _uckdcmp_decomp = _uckdcmp_nodes + (_uckdcmp_size + 1);
810
811    /*
812     * Read the decomposition data in.
813     */
814    size = hdr.size.bytes / sizeof(ac_uint4);
815    fread((char *) _uckdcmp_nodes, sizeof(ac_uint4), size, in);
816
817    /*
818     * Do an endian swap if necessary.
819     */
820    if (hdr.bom == 0xfffe) {
821        for (i = 0; i < size; i++)
822            _uckdcmp_nodes[i] = endian_long(_uckdcmp_nodes[i]);
823    }
824    fclose(in);
825    return 0;
826}
827
828static void
829_ucdcmp_unload(void)
830{
831    if (_ucdcmp_size == 0)
832      return;
833
834    /*
835     * Only need to free the offsets because the memory is allocated as a
836     * single block.
837     */
838    free((char *) _ucdcmp_nodes);
839    _ucdcmp_size = 0;
840}
841
842static void
843_uckdcmp_unload(void)
844{
845    if (_uckdcmp_size == 0)
846      return;
847
848    /*
849     * Only need to free the offsets because the memory is allocated as a
850     * single block.
851     */
852    free((char *) _uckdcmp_nodes);
853    _uckdcmp_size = 0;
854}
855#endif
856
857int
858ucdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp)
859{
860    long l, r, m;
861
862    if (code < _ucdcmp_nodes[0]) {
863	return 0;
864    }
865
866    l = 0;
867    r = _ucdcmp_nodes[_ucdcmp_size] - 1;
868
869    while (l <= r) {
870        /*
871         * Determine a "mid" point and adjust to make sure the mid point is at
872         * the beginning of a code+offset pair.
873         */
874        m = (l + r) >> 1;
875        m -= (m & 1);
876        if (code > _ucdcmp_nodes[m])
877          l = m + 2;
878        else if (code < _ucdcmp_nodes[m])
879          r = m - 2;
880        else if (code == _ucdcmp_nodes[m]) {
881            *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1];
882            *decomp = (ac_uint4*)&_ucdcmp_decomp[_ucdcmp_nodes[m + 1]];
883            return 1;
884        }
885    }
886    return 0;
887}
888
889int
890uckdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp)
891{
892    long l, r, m;
893
894    if (code < _uckdcmp_nodes[0]) {
895	return 0;
896    }
897
898    l = 0;
899    r = _uckdcmp_nodes[_uckdcmp_size] - 1;
900
901    while (l <= r) {
902        /*
903         * Determine a "mid" point and adjust to make sure the mid point is at
904         * the beginning of a code+offset pair.
905         */
906        m = (l + r) >> 1;
907        m -= (m & 1);
908        if (code > _uckdcmp_nodes[m])
909          l = m + 2;
910        else if (code < _uckdcmp_nodes[m])
911          r = m - 2;
912        else if (code == _uckdcmp_nodes[m]) {
913            *num = _uckdcmp_nodes[m + 3] - _uckdcmp_nodes[m + 1];
914            *decomp = (ac_uint4*)&_uckdcmp_decomp[_uckdcmp_nodes[m + 1]];
915            return 1;
916        }
917    }
918    return 0;
919}
920
921int
922ucdecomp_hangul(ac_uint4 code, ac_uint4 *num, ac_uint4 decomp[])
923{
924    if (!ucishangul(code))
925      return 0;
926
927    code -= 0xac00;
928    decomp[0] = 0x1100 + (ac_uint4) (code / 588);
929    decomp[1] = 0x1161 + (ac_uint4) ((code % 588) / 28);
930    decomp[2] = 0x11a7 + (ac_uint4) (code % 28);
931    *num = (decomp[2] != 0x11a7) ? 3 : 2;
932
933    return 1;
934}
935
/*
 * Decompose the inlen characters of in into a newly allocated array.
 *
 * mode == 0 for canonical, mode == 1 for compatibility
 *
 * in, inlen - the input characters
 * out       - receives the array allocated with ber_memalloc_x(..., ctx)
 * outlen    - receives the same value that is returned
 * ctx       - passed through to the ber_mem*_x() allocation calls
 *
 * Each input character is replaced by its table decomposition (uckdecomp()
 * or ucdecomp() depending on mode), or by its Hangul decomposition, or is
 * copied as is.  Characters with a non-zero combining class are inserted
 * in front of any directly preceding output characters that have a higher
 * combining class.
 *
 * Returns the number of characters written, or -1 when an allocation fails.
 */
static int
uccanoncompatdecomp(const ac_uint4 *in, int inlen,
		    ac_uint4 **out, int *outlen, short mode, void *ctx)
{
    int l, size;
	unsigned i, j, k;
    ac_uint4 num, class, *decomp, hangdecomp[3];

    /* Initial capacity guess: twice the input length. */
    size = inlen * 2;
    *out = (ac_uint4 *) ber_memalloc_x(size * sizeof(**out), ctx);
    if (*out == NULL)
        return *outlen = -1;

    /* i counts characters written so far, j indexes the input. */
    i = 0;
    for (j = 0; j < (unsigned) inlen; j++) {
	if (mode ? uckdecomp(in[j], &num, &decomp) : ucdecomp(in[j], &num, &decomp)) {
            if ( size - i < num) {
                /*
                 * Grow to what is written so far (i), plus this
                 * decomposition (num), plus one slot for each remaining
                 * input character (inlen - j - 1).
                 *
                 * NOTE(review): the result is stored straight into *out, so
                 * on failure the old pointer is no longer reachable here;
                 * whether that leaks depends on ber_memrealloc_x() - confirm.
                 */
                size = inlen + i - j + num - 1;
                *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx );
                if (*out == NULL)
                    return *outlen = -1;
            }
            for (k = 0; k < num; k++) {
                class = uccombining_class(decomp[k]);
                if (class == 0) {
                    (*out)[i] = decomp[k];
                } else {
                    /*
                     * Walk back over output characters with a higher
                     * combining class, then open a gap at l.
                     */
                    for (l = i; l > 0; l--)
                        if (class >= uccombining_class((*out)[l-1]))
                            break;
                    /* Source and destination overlap; assumes AC_MEMCPY tolerates that - TODO confirm. */
                    AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out));
                    (*out)[l] = decomp[k];
                }
                i++;
            }
        } else if (ucdecomp_hangul(in[j], &num, hangdecomp)) {
            if (size - i < num) {
                /* Same growth rule as above. */
                size = inlen + i - j + num - 1;
                *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx);
                if (*out == NULL)
                    return *outlen = -1;
            }
            /* The Hangul elements are appended without any reordering. */
            for (k = 0; k < num; k++) {
                (*out)[i] = hangdecomp[k];
                i++;
            }
        } else {
            if (size - i < 1) {
                /* Room for what is written plus every remaining input character. */
                size = inlen + i - j;
                *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx);
                if (*out == NULL)
                    return *outlen = -1;
            }
            /* No decomposition: copy the character, ordered by combining class as above. */
            class = uccombining_class(in[j]);
            if (class == 0) {
                (*out)[i] = in[j];
            } else {
                for (l = i; l > 0; l--)
                    if (class >= uccombining_class((*out)[l-1]))
                        break;
                AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out));
                (*out)[l] = in[j];
            }
            i++;
        }
    }
    return *outlen = i;
}
1005
/*
 * Canonical decomposition: calls uccanoncompatdecomp() with mode 0.
 * All other arguments and the return value are passed through unchanged.
 */
int
uccanondecomp(const ac_uint4 *in, int inlen,
              ac_uint4 **out, int *outlen, void *ctx)
{
    return uccanoncompatdecomp(in, inlen, out, outlen, 0, ctx);
}
1012
/*
 * Compatibility decomposition: calls uccanoncompatdecomp() with mode 1.
 * All other arguments and the return value are passed through unchanged.
 */
int
uccompatdecomp(const ac_uint4 *in, int inlen,
	       ac_uint4 **out, int *outlen, void *ctx)
{
    return uccanoncompatdecomp(in, inlen, out, outlen, 1, ctx);
}
1019
1020/**************************************************************************
1021 *
1022 * Support for combining classes.
1023 *
1024 **************************************************************************/
1025
1026#if !HARDCODE_DATA
/* Number of words in _uccmcl_nodes (three times the header count). */
static ac_uint4  _uccmcl_size;
/*
 * Three-word records as read by uccombining_class(): first code of a
 * range, last code of the range, and the combining class of the range.
 */
static ac_uint4 *_uccmcl_nodes;
1029
1030/*
1031 * Return -1 on error, 0 if okay
1032 */
1033static int
1034_uccmcl_load(char *paths, int reload)
1035{
1036    FILE *in;
1037    ac_uint4 i;
1038    _ucheader_t hdr;
1039
1040    if (_uccmcl_size > 0) {
1041        if (!reload)
1042            /*
1043             * The combining classes have already been loaded.
1044             */
1045            return 0;
1046
1047        free((char *) _uccmcl_nodes);
1048        _uccmcl_size = 0;
1049    }
1050
1051    if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0)
1052        return -1;
1053
1054    /*
1055     * Load the header.
1056     */
1057    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
1058
1059    if (hdr.bom == 0xfffe) {
1060        hdr.cnt = endian_short(hdr.cnt);
1061        hdr.size.bytes = endian_long(hdr.size.bytes);
1062    }
1063
1064    _uccmcl_size = hdr.cnt * 3;
1065    _uccmcl_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
1066
1067    /*
1068     * Read the combining classes in.
1069     */
1070    fread((char *) _uccmcl_nodes, sizeof(ac_uint4), _uccmcl_size, in);
1071
1072    /*
1073     * Do an endian swap if necessary.
1074     */
1075    if (hdr.bom == 0xfffe) {
1076        for (i = 0; i < _uccmcl_size; i++)
1077            _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]);
1078    }
1079    fclose(in);
1080    return 0;
1081}
1082
1083static void
1084_uccmcl_unload(void)
1085{
1086    if (_uccmcl_size == 0)
1087      return;
1088
1089    free((char *) _uccmcl_nodes);
1090    _uccmcl_size = 0;
1091}
1092#endif
1093
1094ac_uint4
1095uccombining_class(ac_uint4 code)
1096{
1097    long l, r, m;
1098
1099    l = 0;
1100    r = _uccmcl_size - 1;
1101
1102    while (l <= r) {
1103        m = (l + r) >> 1;
1104        m -= (m % 3);
1105        if (code > _uccmcl_nodes[m + 1])
1106          l = m + 3;
1107        else if (code < _uccmcl_nodes[m])
1108          r = m - 3;
1109        else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1])
1110          return _uccmcl_nodes[m + 2];
1111    }
1112    return 0;
1113}
1114
1115/**************************************************************************
1116 *
1117 * Support for numeric values.
1118 *
1119 **************************************************************************/
1120
1121#if !HARDCODE_DATA
/* (code, index) pairs; the index selects an entry of _ucnum_vals. */
static ac_uint4 *_ucnum_nodes;
/* Number of words in _ucnum_nodes. */
static ac_uint4 _ucnum_size;
/*
 * Numerator/denominator values; stored right after the nodes in the same
 * allocation.
 */
static short *_ucnum_vals;
1125
1126/*
1127 * Return -1 on error, 0 if okay
1128 */
1129static int
1130_ucnumb_load(char *paths, int reload)
1131{
1132    FILE *in;
1133    ac_uint4 size, i;
1134    _ucheader_t hdr;
1135
1136    if (_ucnum_size > 0) {
1137        if (!reload)
1138          /*
1139           * The numbers have already been loaded.
1140           */
1141          return 0;
1142
1143        free((char *) _ucnum_nodes);
1144        _ucnum_size = 0;
1145    }
1146
1147    if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0)
1148      return -1;
1149
1150    /*
1151     * Load the header.
1152     */
1153    fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
1154
1155    if (hdr.bom == 0xfffe) {
1156        hdr.cnt = endian_short(hdr.cnt);
1157        hdr.size.bytes = endian_long(hdr.size.bytes);
1158    }
1159
1160    _ucnum_size = hdr.cnt;
1161    _ucnum_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
1162    _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size);
1163
1164    /*
1165     * Read the combining classes in.
1166     */
1167    fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in);
1168
1169    /*
1170     * Do an endian swap if necessary.
1171     */
1172    if (hdr.bom == 0xfffe) {
1173        for (i = 0; i < _ucnum_size; i++)
1174          _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]);
1175
1176        /*
1177         * Determine the number of values that have to be adjusted.
1178         */
1179        size = (hdr.size.bytes -
1180                (_ucnum_size * (sizeof(ac_uint4) << 1))) /
1181            sizeof(short);
1182
1183        for (i = 0; i < size; i++)
1184          _ucnum_vals[i] = endian_short(_ucnum_vals[i]);
1185    }
1186    fclose(in);
1187    return 0;
1188}
1189
1190static void
1191_ucnumb_unload(void)
1192{
1193    if (_ucnum_size == 0)
1194      return;
1195
1196    free((char *) _ucnum_nodes);
1197    _ucnum_size = 0;
1198}
1199#endif
1200
1201int
1202ucnumber_lookup(ac_uint4 code, struct ucnumber *num)
1203{
1204    long l, r, m;
1205    short *vp;
1206
1207    l = 0;
1208    r = _ucnum_size - 1;
1209    while (l <= r) {
1210        /*
1211         * Determine a "mid" point and adjust to make sure the mid point is at
1212         * the beginning of a code+offset pair.
1213         */
1214        m = (l + r) >> 1;
1215        m -= (m & 1);
1216        if (code > _ucnum_nodes[m])
1217          l = m + 2;
1218        else if (code < _ucnum_nodes[m])
1219          r = m - 2;
1220        else {
1221            vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1];
1222            num->numerator = (int) *vp++;
1223            num->denominator = (int) *vp;
1224            return 1;
1225        }
1226    }
1227    return 0;
1228}
1229
1230int
1231ucdigit_lookup(ac_uint4 code, int *digit)
1232{
1233    long l, r, m;
1234    short *vp;
1235
1236    l = 0;
1237    r = _ucnum_size - 1;
1238    while (l <= r) {
1239        /*
1240         * Determine a "mid" point and adjust to make sure the mid point is at
1241         * the beginning of a code+offset pair.
1242         */
1243        m = (l + r) >> 1;
1244        m -= (m & 1);
1245        if (code > _ucnum_nodes[m])
1246          l = m + 2;
1247        else if (code < _ucnum_nodes[m])
1248          r = m - 2;
1249        else {
1250            vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1];
1251            if (*vp == *(vp + 1)) {
1252              *digit = *vp;
1253              return 1;
1254            }
1255            return 0;
1256        }
1257    }
1258    return 0;
1259}
1260
1261struct ucnumber
1262ucgetnumber(ac_uint4 code)
1263{
1264    struct ucnumber num;
1265
1266    /*
1267     * Initialize with some arbitrary value, because the caller simply cannot
1268     * tell for sure if the code is a number without calling the ucisnumber()
1269     * macro before calling this function.
1270     */
1271    num.numerator = num.denominator = -111;
1272
1273    (void) ucnumber_lookup(code, &num);
1274
1275    return num;
1276}
1277
1278int
1279ucgetdigit(ac_uint4 code)
1280{
1281    int dig;
1282
1283    /*
1284     * Initialize with some arbitrary value, because the caller simply cannot
1285     * tell for sure if the code is a number without calling the ucisdigit()
1286     * macro before calling this function.
1287     */
1288    dig = -111;
1289
1290    (void) ucdigit_lookup(code, &dig);
1291
1292    return dig;
1293}
1294
1295/**************************************************************************
1296 *
1297 * Setup and cleanup routines.
1298 *
1299 **************************************************************************/
1300
1301#if HARDCODE_DATA
/*
 * Stub used when HARDCODE_DATA is set: does nothing and reports success.
 */
int
ucdata_load(char *paths, int masks)
{
    (void) paths;
    (void) masks;
    return 0;
}

/*
 * Stub used when HARDCODE_DATA is set: does nothing.
 */
void
ucdata_unload(int masks)
{
    (void) masks;
}

/*
 * Stub used when HARDCODE_DATA is set: does nothing and reports success.
 */
int
ucdata_reload(char *paths, int masks)
{
    (void) paths;
    (void) masks;
    return 0;
}
1305#else
1306/*
1307 * Return 0 if okay, negative on error
1308 */
1309int
1310ucdata_load(char *paths, int masks)
1311{
1312    int error = 0;
1313
1314    if (masks & UCDATA_CTYPE)
1315      error |= _ucprop_load(paths, 0) < 0 ? UCDATA_CTYPE : 0;
1316    if (masks & UCDATA_CASE)
1317      error |= _uccase_load(paths, 0) < 0 ? UCDATA_CASE : 0;
1318    if (masks & UCDATA_DECOMP)
1319      error |= _ucdcmp_load(paths, 0) < 0 ? UCDATA_DECOMP : 0;
1320    if (masks & UCDATA_CMBCL)
1321      error |= _uccmcl_load(paths, 0) < 0 ? UCDATA_CMBCL : 0;
1322    if (masks & UCDATA_NUM)
1323      error |= _ucnumb_load(paths, 0) < 0 ? UCDATA_NUM : 0;
1324    if (masks & UCDATA_COMP)
1325      error |= _uccomp_load(paths, 0) < 0 ? UCDATA_COMP : 0;
1326    if (masks & UCDATA_KDECOMP)
1327      error |= _uckdcmp_load(paths, 0) < 0 ? UCDATA_KDECOMP : 0;
1328
1329    return -error;
1330}
1331
1332void
1333ucdata_unload(int masks)
1334{
1335    if (masks & UCDATA_CTYPE)
1336      _ucprop_unload();
1337    if (masks & UCDATA_CASE)
1338      _uccase_unload();
1339    if (masks & UCDATA_DECOMP)
1340      _ucdcmp_unload();
1341    if (masks & UCDATA_CMBCL)
1342      _uccmcl_unload();
1343    if (masks & UCDATA_NUM)
1344      _ucnumb_unload();
1345    if (masks & UCDATA_COMP)
1346      _uccomp_unload();
1347    if (masks & UCDATA_KDECOMP)
1348      _uckdcmp_unload();
1349}
1350
1351/*
1352 * Return 0 if okay, negative on error
1353 */
1354int
1355ucdata_reload(char *paths, int masks)
1356{
1357    int error = 0;
1358
1359    if (masks & UCDATA_CTYPE)
1360        error |= _ucprop_load(paths, 1) < 0 ? UCDATA_CTYPE : 0;
1361    if (masks & UCDATA_CASE)
1362        error |= _uccase_load(paths, 1) < 0 ? UCDATA_CASE : 0;
1363    if (masks & UCDATA_DECOMP)
1364        error |= _ucdcmp_load(paths, 1) < 0 ? UCDATA_DECOMP : 0;
1365    if (masks & UCDATA_CMBCL)
1366        error |= _uccmcl_load(paths, 1) < 0 ? UCDATA_CMBCL : 0;
1367    if (masks & UCDATA_NUM)
1368        error |= _ucnumb_load(paths, 1) < 0 ? UCDATA_NUM : 0;
1369    if (masks & UCDATA_COMP)
1370        error |= _uccomp_load(paths, 1) < 0 ? UCDATA_COMP : 0;
1371    if (masks & UCDATA_KDECOMP)
1372        error |= _uckdcmp_load(paths, 1) < 0 ? UCDATA_KDECOMP : 0;
1373
1374    return -error;
1375}
1376#endif
1377
1378#ifdef TEST
1379
1380void
1381main(void)
1382{
1383    int dig;
1384    ac_uint4 i, lo, *dec;
1385    struct ucnumber num;
1386
1387/*    ucdata_setup("."); */
1388
1389    if (ucisweak(0x30))
1390      printf("WEAK\n");
1391    else
1392      printf("NOT WEAK\n");
1393
1394    printf("LOWER 0x%04lX\n", uctolower(0xff3a));
1395    printf("UPPER 0x%04lX\n", uctoupper(0xff5a));
1396
1397    if (ucisalpha(0x1d5))
1398      printf("ALPHA\n");
1399    else
1400      printf("NOT ALPHA\n");
1401
1402    if (ucisupper(0x1d5)) {
1403        printf("UPPER\n");
1404        lo = uctolower(0x1d5);
1405        printf("0x%04lx\n", lo);
1406        lo = uctotitle(0x1d5);
1407        printf("0x%04lx\n", lo);
1408    } else
1409      printf("NOT UPPER\n");
1410
1411    if (ucistitle(0x1d5))
1412      printf("TITLE\n");
1413    else
1414      printf("NOT TITLE\n");
1415
1416    if (uciscomposite(0x1d5))
1417      printf("COMPOSITE\n");
1418    else
1419      printf("NOT COMPOSITE\n");
1420
1421    if (ucdecomp(0x1d5, &lo, &dec)) {
1422        for (i = 0; i < lo; i++)
1423          printf("0x%04lx ", dec[i]);
1424        putchar('\n');
1425    }
1426
1427    if ((lo = uccombining_class(0x41)) != 0)
1428      printf("0x41 CCL %ld\n", lo);
1429
1430    if (ucisxdigit(0xfeff))
1431      printf("0xFEFF HEX DIGIT\n");
1432    else
1433      printf("0xFEFF NOT HEX DIGIT\n");
1434
1435    if (ucisdefined(0x10000))
1436      printf("0x10000 DEFINED\n");
1437    else
1438      printf("0x10000 NOT DEFINED\n");
1439
1440    if (ucnumber_lookup(0x30, &num)) {
1441        if (num.denominator != 1)
1442          printf("UCNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator);
1443        else
1444          printf("UCNUMBER: 0x30 = %d\n", num.numerator);
1445    } else
1446      printf("UCNUMBER: 0x30 NOT A NUMBER\n");
1447
1448    if (ucnumber_lookup(0xbc, &num)) {
1449        if (num.denominator != 1)
1450          printf("UCNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator);
1451        else
1452          printf("UCNUMBER: 0xbc = %d\n", num.numerator);
1453    } else
1454      printf("UCNUMBER: 0xbc NOT A NUMBER\n");
1455
1456
1457    if (ucnumber_lookup(0xff19, &num)) {
1458        if (num.denominator != 1)
1459          printf("UCNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator);
1460        else
1461          printf("UCNUMBER: 0xff19 = %d\n", num.numerator);
1462    } else
1463      printf("UCNUMBER: 0xff19 NOT A NUMBER\n");
1464
1465    if (ucnumber_lookup(0x4e00, &num)) {
1466        if (num.denominator != 1)
1467          printf("UCNUMBER: 0x4e00 = %d/%d\n", num.numerator, num.denominator);
1468        else
1469          printf("UCNUMBER: 0x4e00 = %d\n", num.numerator);
1470    } else
1471      printf("UCNUMBER: 0x4e00 NOT A NUMBER\n");
1472
1473    if (ucdigit_lookup(0x06f9, &dig))
1474      printf("UCDIGIT: 0x6f9 = %d\n", dig);
1475    else
1476      printf("UCDIGIT: 0x6f9 NOT A NUMBER\n");
1477
1478    dig = ucgetdigit(0x0969);
1479    printf("UCGETDIGIT: 0x969 = %d\n", dig);
1480
1481    num = ucgetnumber(0x30);
1482    if (num.denominator != 1)
1483      printf("UCGETNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator);
1484    else
1485      printf("UCGETNUMBER: 0x30 = %d\n", num.numerator);
1486
1487    num = ucgetnumber(0xbc);
1488    if (num.denominator != 1)
1489      printf("UCGETNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator);
1490    else
1491      printf("UCGETNUMBER: 0xbc = %d\n", num.numerator);
1492
1493    num = ucgetnumber(0xff19);
1494    if (num.denominator != 1)
1495      printf("UCGETNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator);
1496    else
1497      printf("UCGETNUMBER: 0xff19 = %d\n", num.numerator);
1498
1499/*    ucdata_cleanup(); */
1500    exit(0);
1501}
1502
1503#endif /* TEST */
1504