1/*	$NetBSD$	*/
2
3/* OpenLDAP: pkg/ldap/libraries/liblunicode/utbm/utbm.c,v 1.7.2.5 2010/04/13 20:23:04 kurt Exp */
4/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 1998-2010 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17/* Copyright 1997, 1998, 1999 Computing Research Labs,
18 * New Mexico State University
19 *
20 * Permission is hereby granted, free of charge, to any person obtaining a
21 * copy of this software and associated documentation files (the "Software"),
22 * to deal in the Software without restriction, including without limitation
23 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 * and/or sell copies of the Software, and to permit persons to whom the
25 * Software is furnished to do so, subject to the following conditions:
26 *
27 * The above copyright notice and this permission notice shall be included in
28 * all copies or substantial portions of the Software.
29 *
30 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
33 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
34 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
35 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
36 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37 */
38/* Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp */
39
40/*
41 * Assumptions:
42 * 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
43 * 2. Case conversions are all one-to-one.
44 * 3. Text and pattern have already been normalized in some fashion.
45 */
46
47#include <stdlib.h>
48#include <unistd.h>
49#include <string.h>
50#include "utbm.h"
51
52/*
53 * Single pattern character.
54 */
55typedef struct {
56    ucs4_t lc;
57    ucs4_t uc;
58    ucs4_t tc;
59} _utbm_char_t;
60
61typedef struct {
62    _utbm_char_t *ch;
63    unsigned long skip;
64} _utbm_skip_t;
65
66typedef struct _utbm_pattern_t {
67    unsigned long flags;
68
69    _utbm_char_t *pat;
70    unsigned long pat_used;
71    unsigned long pat_size;
72    unsigned long patlen;
73
74    _utbm_skip_t *skip;
75    unsigned long skip_used;
76    unsigned long skip_size;
77
78    unsigned long md4;
79} _utbm_pattern_t;
80
81/*************************************************************************
82 *
83 * Support functions.
84 *
85 *************************************************************************/
86
87/*
88 * Routine to look up the skip value for a character.
89 */
90static unsigned long
91_utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
92{
93    unsigned long i;
94    ucs4_t c1, c2;
95    _utbm_skip_t *sp;
96
97    if (start >= end)
98      return 0;
99
100    c1 = *start;
101    c2 = (start + 1 < end) ? *(start + 1) : ~0;
102    if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
103      c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
104
105    for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
106        if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
107            return ((unsigned long) (end - start) < sp->skip) ?
108                end - start : sp->skip;
109        }
110    }
111    return p->patlen;
112}
113
114static int
115_utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
116            unsigned long *match_start, unsigned long *match_end)
117{
118    int check_space;
119    ucs4_t c1, c2;
120    unsigned long count;
121    _utbm_char_t *cp;
122
123    /*
124     * Set the potential match endpoint first.
125     */
126    *match_end = (start - text) + 1;
127
128    c1 = *start;
129    c2 = (start + 1 < end) ? *(start + 1) : ~0;
130    if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
131        c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
132        /*
133         * Adjust the match end point to occur after the UTF-16 character.
134         */
135        *match_end = *match_end + 1;
136    }
137
138    if (pat->pat_used == 1) {
139        *match_start = start - text;
140        return 1;
141    }
142
143    /*
144     * Compare backward.
145     */
146    cp = pat->pat + (pat->pat_used - 1);
147
148    for (count = pat->patlen; start > text && count > 0;) {
149        /*
150         * Ignore non-spacing characters if indicated.
151         */
152        if (pat->flags & UTBM_IGNORE_NONSPACING) {
153            while (start > text && _utbm_nonspacing(c1)) {
154                c2 = *--start;
155                c1 = (start - 1 > text) ? *(start - 1) : ~0;
156                if (0xdc00 <= c2 && c2 <= 0xdfff &&
157                    0xd800 <= c1 && c1 <= 0xdbff) {
158                    c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
159                    start--;
160                } else
161                  c1 = c2;
162            }
163        }
164
165        /*
166         * Handle space compression if indicated.
167         */
168        if (pat->flags & UTBM_SPACE_COMPRESS) {
169            check_space = 0;
170            while (start > text &&
171                   (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
172                check_space = _utbm_isspace(c1, 1);
173                c2 = *--start;
174                c1 = (start - 1 > text) ? *(start - 1) : ~0;
175                if (0xdc00 <= c2 && c2 <= 0xdfff &&
176                    0xd800 <= c1 && c1 <= 0xdbff) {
177                    c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
178                    start--;
179                } else
180                  c1 = c2;
181            }
182            /*
183             * Handle things if space compression was indicated and one or
184             * more member characters were found.
185             */
186            if (check_space) {
187                if (cp->uc != ' ')
188                  return 0;
189                cp--;
190                count--;
191            }
192        }
193
194        /*
195         * Handle the normal comparison cases.
196         */
197        if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
198          return 0;
199
200        count -= (c1 >= 0x10000) ? 2 : 1;
201        if (count > 0) {
202            cp--;
203
204            /*
205             * Get the next preceding character.
206             */
207            if (start > text) {
208                c2 = *--start;
209                c1 = (start - 1 > text) ? *(start - 1) : ~0;
210                if (0xdc00 <= c2 && c2 <= 0xdfff &&
211                    0xd800 <= c1 && c1 <= 0xdbff) {
212                    c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
213                    start--;
214                } else
215                  c1 = c2;
216            }
217        }
218    }
219
220    /*
221     * Set the match start position.
222     */
223    *match_start = start - text;
224    return 1;
225}
226
227/*************************************************************************
228 *
229 * API.
230 *
231 *************************************************************************/
232
233utbm_pattern_t
234utbm_create_pattern(void)
235{
236    utbm_pattern_t p;
237
238    p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
239    (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t));
240    return p;
241}
242
243void
244utbm_free_pattern(utbm_pattern_t pattern)
245{
246    if (pattern == 0)
247      return;
248
249    if (pattern->pat_size > 0)
250      free((char *) pattern->pat);
251
252    if (pattern->skip_size > 0)
253      free((char *) pattern->skip);
254
255    free((char *) pattern);
256}
257
258void
259utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
260             utbm_pattern_t p)
261{
262    int have_space;
263    unsigned long i, j, k, slen;
264    _utbm_char_t *cp;
265    _utbm_skip_t *sp;
266    ucs4_t c1, c2, sentinel;
267
268    if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
269      return;
270
271    /*
272     * Reset the pattern buffer.
273     */
274    p->patlen = p->pat_used = p->skip_used = 0;
275
276    /*
277     * Set the flags.
278     */
279    p->flags = flags;
280
281    /*
282     * Initialize the extra skip flag.
283     */
284    p->md4 = 1;
285
286    /*
287     * Allocate more storage if necessary.
288     */
289    if (patlen > p->pat_size) {
290        if (p->pat_size == 0) {
291            p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
292            p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
293        } else {
294            p->pat = (_utbm_char_t *)
295                realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
296            p->skip = (_utbm_skip_t *)
297                realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
298        }
299        p->pat_size = p->skip_size = patlen;
300    }
301
302    /*
303     * Preprocess the pattern to remove controls (if specified) and determine
304     * case.
305     */
306    for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
307        c1 = pat[i];
308        c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
309        if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
310          c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
311
312        /*
313         * Make sure the `have_space' flag is turned off if the character
314         * is not an appropriate one.
315         */
316        if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
317          have_space = 0;
318
319        /*
320         * If non-spacing characters should be ignored, do it here.
321         */
322        if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
323          continue;
324
325        /*
326         * Check if spaces and controls need to be compressed.
327         */
328        if (flags & UTBM_SPACE_COMPRESS) {
329            if (_utbm_isspace(c1, 1)) {
330                if (!have_space) {
331                    /*
332                     * Add a space and set the flag.
333                     */
334                    cp->uc = cp->lc = cp->tc = ' ';
335                    cp++;
336
337                    /*
338                     * Increase the real pattern length.
339                     */
340                    p->patlen++;
341                    sentinel = ' ';
342                    have_space = 1;
343                }
344                continue;
345            }
346
347            /*
348             * Ignore all control characters.
349             */
350            if (_utbm_iscntrl(c1))
351              continue;
352        }
353
354        /*
355         * Add the character.
356         */
357        if (flags & UTBM_CASEFOLD) {
358            cp->uc = _utbm_toupper(c1);
359            cp->lc = _utbm_tolower(c1);
360            cp->tc = _utbm_totitle(c1);
361        } else
362          cp->uc = cp->lc = cp->tc = c1;
363
364        /*
365         * Set the sentinel character.
366         */
367        sentinel = cp->uc;
368
369        /*
370         * Move to the next character.
371         */
372        cp++;
373
374        /*
375         * Increase the real pattern length appropriately.
376         */
377        p->patlen += (c1 >= 0x10000) ? 2 : 1;
378
379        /*
380         * Increment the loop index for UTF-16 characters.
381         */
382        i += (c1 >= 0x10000) ? 1 : 0;
383
384    }
385
386    /*
387     * Set the number of characters actually used.
388     */
389    p->pat_used = cp - p->pat;
390
391    /*
392     * Go through and construct the skip array and determine the actual length
393     * of the pattern in UCS2 terms.
394     */
395    slen = p->patlen - 1;
396    cp = p->pat;
397    for (i = k = 0; i < p->pat_used; i++, cp++) {
398        /*
399         * Locate the character in the skip array.
400         */
401        for (sp = p->skip, j = 0;
402             j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
403
404        /*
405         * If the character is not found, set the new skip element and
406         * increase the number of skip elements.
407         */
408        if (j == p->skip_used) {
409            sp->ch = cp;
410            p->skip_used++;
411        }
412
413        /*
414         * Set the updated skip value.  If the character is UTF-16 and is
415         * not the last one in the pattern, add one to its skip value.
416         */
417        sp->skip = slen - k;
418        if (cp->uc >= 0x10000 && k + 2 < slen)
419          sp->skip++;
420
421        /*
422         * Set the new extra skip for the sentinel character.
423         */
424        if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
425            cp->uc == sentinel)
426          p->md4 = slen - k;
427
428        /*
429         * Increase the actual index.
430         */
431        k += (cp->uc >= 0x10000) ? 2 : 1;
432    }
433}
434
435int
436utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
437          unsigned long *match_start, unsigned long *match_end)
438{
439    unsigned long k;
440    ucs2_t *start, *end;
441
442    if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
443        textlen < pat->patlen)
444      return 0;
445
446    start = text + pat->patlen;
447    end = text + textlen;
448
449    /*
450     * Adjust the start point if it points to a low surrogate.
451     */
452    if (0xdc00 <= *start && *start <= 0xdfff &&
453        0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
454      start--;
455
456    while (start < end) {
457        while ((k = _utbm_skip(pat, start, end))) {
458            start += k;
459            if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
460                0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
461              start--;
462        }
463
464        if (start < end &&
465            _utbm_match(pat, text, start, end, match_start, match_end))
466          return 1;
467
468        start += pat->md4;
469        if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
470            0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
471          start--;
472    }
473    return 0;
474}
475