1/* $OpenLDAP$ */
2/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
3 *
4 * Copyright 1998-2011 The OpenLDAP Foundation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
9 * Public License.
10 *
11 * A copy of this license is available in file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
14 */
15/* Copyright 1997, 1998, 1999 Computing Research Labs,
16 * New Mexico State University
17 *
18 * Permission is hereby granted, free of charge, to any person obtaining a
19 * copy of this software and associated documentation files (the "Software"),
20 * to deal in the Software without restriction, including without limitation
21 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
22 * and/or sell copies of the Software, and to permit persons to whom the
23 * Software is furnished to do so, subject to the following conditions:
24 *
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
27 *
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
31 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
32 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
33 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
34 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
35 */
36/* $Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp $ */
37
38/*
39 * Assumptions:
40 * 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
41 * 2. Case conversions are all one-to-one.
42 * 3. Text and pattern have already been normalized in some fashion.
43 */
44
45#include <stdlib.h>
46#include <unistd.h>
47#include <string.h>
48#include "utbm.h"
49
50/*
51 * Single pattern character.
52 */
53typedef struct {
54    ucs4_t lc;
55    ucs4_t uc;
56    ucs4_t tc;
57} _utbm_char_t;
58
59typedef struct {
60    _utbm_char_t *ch;
61    unsigned long skip;
62} _utbm_skip_t;
63
64typedef struct _utbm_pattern_t {
65    unsigned long flags;
66
67    _utbm_char_t *pat;
68    unsigned long pat_used;
69    unsigned long pat_size;
70    unsigned long patlen;
71
72    _utbm_skip_t *skip;
73    unsigned long skip_used;
74    unsigned long skip_size;
75
76    unsigned long md4;
77} _utbm_pattern_t;
78
79/*************************************************************************
80 *
81 * Support functions.
82 *
83 *************************************************************************/
84
85/*
86 * Routine to look up the skip value for a character.
87 */
88static unsigned long
89_utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
90{
91    unsigned long i;
92    ucs4_t c1, c2;
93    _utbm_skip_t *sp;
94
95    if (start >= end)
96      return 0;
97
98    c1 = *start;
99    c2 = (start + 1 < end) ? *(start + 1) : ~0;
100    if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
101      c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
102
103    for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
104        if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
105            return ((unsigned long) (end - start) < sp->skip) ?
106                end - start : sp->skip;
107        }
108    }
109    return p->patlen;
110}
111
112static int
113_utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
114            unsigned long *match_start, unsigned long *match_end)
115{
116    int check_space;
117    ucs4_t c1, c2;
118    unsigned long count;
119    _utbm_char_t *cp;
120
121    /*
122     * Set the potential match endpoint first.
123     */
124    *match_end = (start - text) + 1;
125
126    c1 = *start;
127    c2 = (start + 1 < end) ? *(start + 1) : ~0;
128    if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
129        c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
130        /*
131         * Adjust the match end point to occur after the UTF-16 character.
132         */
133        *match_end = *match_end + 1;
134    }
135
136    if (pat->pat_used == 1) {
137        *match_start = start - text;
138        return 1;
139    }
140
141    /*
142     * Compare backward.
143     */
144    cp = pat->pat + (pat->pat_used - 1);
145
146    for (count = pat->patlen; start > text && count > 0;) {
147        /*
148         * Ignore non-spacing characters if indicated.
149         */
150        if (pat->flags & UTBM_IGNORE_NONSPACING) {
151            while (start > text && _utbm_nonspacing(c1)) {
152                c2 = *--start;
153                c1 = (start - 1 > text) ? *(start - 1) : ~0;
154                if (0xdc00 <= c2 && c2 <= 0xdfff &&
155                    0xd800 <= c1 && c1 <= 0xdbff) {
156                    c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
157                    start--;
158                } else
159                  c1 = c2;
160            }
161        }
162
163        /*
164         * Handle space compression if indicated.
165         */
166        if (pat->flags & UTBM_SPACE_COMPRESS) {
167            check_space = 0;
168            while (start > text &&
169                   (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
170                check_space = _utbm_isspace(c1, 1);
171                c2 = *--start;
172                c1 = (start - 1 > text) ? *(start - 1) : ~0;
173                if (0xdc00 <= c2 && c2 <= 0xdfff &&
174                    0xd800 <= c1 && c1 <= 0xdbff) {
175                    c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
176                    start--;
177                } else
178                  c1 = c2;
179            }
180            /*
181             * Handle things if space compression was indicated and one or
182             * more member characters were found.
183             */
184            if (check_space) {
185                if (cp->uc != ' ')
186                  return 0;
187                cp--;
188                count--;
189            }
190        }
191
192        /*
193         * Handle the normal comparison cases.
194         */
195        if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
196          return 0;
197
198        count -= (c1 >= 0x10000) ? 2 : 1;
199        if (count > 0) {
200            cp--;
201
202            /*
203             * Get the next preceding character.
204             */
205            if (start > text) {
206                c2 = *--start;
207                c1 = (start - 1 > text) ? *(start - 1) : ~0;
208                if (0xdc00 <= c2 && c2 <= 0xdfff &&
209                    0xd800 <= c1 && c1 <= 0xdbff) {
210                    c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
211                    start--;
212                } else
213                  c1 = c2;
214            }
215        }
216    }
217
218    /*
219     * Set the match start position.
220     */
221    *match_start = start - text;
222    return 1;
223}
224
225/*************************************************************************
226 *
227 * API.
228 *
229 *************************************************************************/
230
231utbm_pattern_t
232utbm_create_pattern(void)
233{
234    utbm_pattern_t p;
235
236    p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
237    (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t));
238    return p;
239}
240
241void
242utbm_free_pattern(utbm_pattern_t pattern)
243{
244    if (pattern == 0)
245      return;
246
247    if (pattern->pat_size > 0)
248      free((char *) pattern->pat);
249
250    if (pattern->skip_size > 0)
251      free((char *) pattern->skip);
252
253    free((char *) pattern);
254}
255
256void
257utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
258             utbm_pattern_t p)
259{
260    int have_space;
261    unsigned long i, j, k, slen;
262    _utbm_char_t *cp;
263    _utbm_skip_t *sp;
264    ucs4_t c1, c2, sentinel;
265
266    if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
267      return;
268
269    /*
270     * Reset the pattern buffer.
271     */
272    p->patlen = p->pat_used = p->skip_used = 0;
273
274    /*
275     * Set the flags.
276     */
277    p->flags = flags;
278
279    /*
280     * Initialize the extra skip flag.
281     */
282    p->md4 = 1;
283
284    /*
285     * Allocate more storage if necessary.
286     */
287    if (patlen > p->pat_size) {
288        if (p->pat_size == 0) {
289            p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
290            p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
291        } else {
292            p->pat = (_utbm_char_t *)
293                realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
294            p->skip = (_utbm_skip_t *)
295                realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
296        }
297        p->pat_size = p->skip_size = patlen;
298    }
299
300    /*
301     * Preprocess the pattern to remove controls (if specified) and determine
302     * case.
303     */
304    for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
305        c1 = pat[i];
306        c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
307        if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
308          c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
309
310        /*
311         * Make sure the `have_space' flag is turned off if the character
312         * is not an appropriate one.
313         */
314        if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
315          have_space = 0;
316
317        /*
318         * If non-spacing characters should be ignored, do it here.
319         */
320        if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
321          continue;
322
323        /*
324         * Check if spaces and controls need to be compressed.
325         */
326        if (flags & UTBM_SPACE_COMPRESS) {
327            if (_utbm_isspace(c1, 1)) {
328                if (!have_space) {
329                    /*
330                     * Add a space and set the flag.
331                     */
332                    cp->uc = cp->lc = cp->tc = ' ';
333                    cp++;
334
335                    /*
336                     * Increase the real pattern length.
337                     */
338                    p->patlen++;
339                    sentinel = ' ';
340                    have_space = 1;
341                }
342                continue;
343            }
344
345            /*
346             * Ignore all control characters.
347             */
348            if (_utbm_iscntrl(c1))
349              continue;
350        }
351
352        /*
353         * Add the character.
354         */
355        if (flags & UTBM_CASEFOLD) {
356            cp->uc = _utbm_toupper(c1);
357            cp->lc = _utbm_tolower(c1);
358            cp->tc = _utbm_totitle(c1);
359        } else
360          cp->uc = cp->lc = cp->tc = c1;
361
362        /*
363         * Set the sentinel character.
364         */
365        sentinel = cp->uc;
366
367        /*
368         * Move to the next character.
369         */
370        cp++;
371
372        /*
373         * Increase the real pattern length appropriately.
374         */
375        p->patlen += (c1 >= 0x10000) ? 2 : 1;
376
377        /*
378         * Increment the loop index for UTF-16 characters.
379         */
380        i += (c1 >= 0x10000) ? 1 : 0;
381
382    }
383
384    /*
385     * Set the number of characters actually used.
386     */
387    p->pat_used = cp - p->pat;
388
389    /*
390     * Go through and construct the skip array and determine the actual length
391     * of the pattern in UCS2 terms.
392     */
393    slen = p->patlen - 1;
394    cp = p->pat;
395    for (i = k = 0; i < p->pat_used; i++, cp++) {
396        /*
397         * Locate the character in the skip array.
398         */
399        for (sp = p->skip, j = 0;
400             j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
401
402        /*
403         * If the character is not found, set the new skip element and
404         * increase the number of skip elements.
405         */
406        if (j == p->skip_used) {
407            sp->ch = cp;
408            p->skip_used++;
409        }
410
411        /*
412         * Set the updated skip value.  If the character is UTF-16 and is
413         * not the last one in the pattern, add one to its skip value.
414         */
415        sp->skip = slen - k;
416        if (cp->uc >= 0x10000 && k + 2 < slen)
417          sp->skip++;
418
419        /*
420         * Set the new extra skip for the sentinel character.
421         */
422        if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
423            cp->uc == sentinel)
424          p->md4 = slen - k;
425
426        /*
427         * Increase the actual index.
428         */
429        k += (cp->uc >= 0x10000) ? 2 : 1;
430    }
431}
432
433int
434utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
435          unsigned long *match_start, unsigned long *match_end)
436{
437    unsigned long k;
438    ucs2_t *start, *end;
439
440    if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
441        textlen < pat->patlen)
442      return 0;
443
444    start = text + pat->patlen;
445    end = text + textlen;
446
447    /*
448     * Adjust the start point if it points to a low surrogate.
449     */
450    if (0xdc00 <= *start && *start <= 0xdfff &&
451        0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
452      start--;
453
454    while (start < end) {
455        while ((k = _utbm_skip(pat, start, end))) {
456            start += k;
457            if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
458                0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
459              start--;
460        }
461
462        if (start < end &&
463            _utbm_match(pat, text, start, end, match_start, match_end))
464          return 1;
465
466        start += pat->md4;
467        if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
468            0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
469          start--;
470    }
471    return 0;
472}
473