1/* $OpenLDAP$ */ 2/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 3 * 4 * Copyright 1998-2011 The OpenLDAP Foundation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted only as authorized by the OpenLDAP 9 * Public License. 10 * 11 * A copy of this license is available in file LICENSE in the 12 * top-level directory of the distribution or, alternatively, at 13 * <http://www.OpenLDAP.org/license.html>. 14 */ 15/* Copyright 1997, 1998, 1999 Computing Research Labs, 16 * New Mexico State University 17 * 18 * Permission is hereby granted, free of charge, to any person obtaining a 19 * copy of this software and associated documentation files (the "Software"), 20 * to deal in the Software without restriction, including without limitation 21 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 22 * and/or sell copies of the Software, and to permit persons to whom the 23 * Software is furnished to do so, subject to the following conditions: 24 * 25 * The above copyright notice and this permission notice shall be included in 26 * all copies or substantial portions of the Software. 27 * 28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 31 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 32 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 33 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 34 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 35 */ 36/* $Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp $ */ 37 38/* 39 * Assumptions: 40 * 1. Case conversions of UTF-16 characters must also be UTF-16 characters. 41 * 2. Case conversions are all one-to-one. 42 * 3. Text and pattern have already been normalized in some fashion. 43 */ 44 45#include <stdlib.h> 46#include <unistd.h> 47#include <string.h> 48#include "utbm.h" 49 50/* 51 * Single pattern character. 52 */ 53typedef struct { 54 ucs4_t lc; 55 ucs4_t uc; 56 ucs4_t tc; 57} _utbm_char_t; 58 59typedef struct { 60 _utbm_char_t *ch; 61 unsigned long skip; 62} _utbm_skip_t; 63 64typedef struct _utbm_pattern_t { 65 unsigned long flags; 66 67 _utbm_char_t *pat; 68 unsigned long pat_used; 69 unsigned long pat_size; 70 unsigned long patlen; 71 72 _utbm_skip_t *skip; 73 unsigned long skip_used; 74 unsigned long skip_size; 75 76 unsigned long md4; 77} _utbm_pattern_t; 78 79/************************************************************************* 80 * 81 * Support functions. 82 * 83 *************************************************************************/ 84 85/* 86 * Routine to look up the skip value for a character. 87 */ 88static unsigned long 89_utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end) 90{ 91 unsigned long i; 92 ucs4_t c1, c2; 93 _utbm_skip_t *sp; 94 95 if (start >= end) 96 return 0; 97 98 c1 = *start; 99 c2 = (start + 1 < end) ? *(start + 1) : ~0; 100 if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) 101 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 102 103 for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) { 104 if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) { 105 return ((unsigned long) (end - start) < sp->skip) ? 106 end - start : sp->skip; 107 } 108 } 109 return p->patlen; 110} 111 112static int 113_utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end, 114 unsigned long *match_start, unsigned long *match_end) 115{ 116 int check_space; 117 ucs4_t c1, c2; 118 unsigned long count; 119 _utbm_char_t *cp; 120 121 /* 122 * Set the potential match endpoint first. 123 */ 124 *match_end = (start - text) + 1; 125 126 c1 = *start; 127 c2 = (start + 1 < end) ? *(start + 1) : ~0; 128 if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) { 129 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 130 /* 131 * Adjust the match end point to occur after the UTF-16 character. 132 */ 133 *match_end = *match_end + 1; 134 } 135 136 if (pat->pat_used == 1) { 137 *match_start = start - text; 138 return 1; 139 } 140 141 /* 142 * Compare backward. 143 */ 144 cp = pat->pat + (pat->pat_used - 1); 145 146 for (count = pat->patlen; start > text && count > 0;) { 147 /* 148 * Ignore non-spacing characters if indicated. 149 */ 150 if (pat->flags & UTBM_IGNORE_NONSPACING) { 151 while (start > text && _utbm_nonspacing(c1)) { 152 c2 = *--start; 153 c1 = (start - 1 > text) ? *(start - 1) : ~0; 154 if (0xdc00 <= c2 && c2 <= 0xdfff && 155 0xd800 <= c1 && c1 <= 0xdbff) { 156 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 157 start--; 158 } else 159 c1 = c2; 160 } 161 } 162 163 /* 164 * Handle space compression if indicated. 165 */ 166 if (pat->flags & UTBM_SPACE_COMPRESS) { 167 check_space = 0; 168 while (start > text && 169 (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) { 170 check_space = _utbm_isspace(c1, 1); 171 c2 = *--start; 172 c1 = (start - 1 > text) ? *(start - 1) : ~0; 173 if (0xdc00 <= c2 && c2 <= 0xdfff && 174 0xd800 <= c1 && c1 <= 0xdbff) { 175 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 176 start--; 177 } else 178 c1 = c2; 179 } 180 /* 181 * Handle things if space compression was indicated and one or 182 * more member characters were found. 183 */ 184 if (check_space) { 185 if (cp->uc != ' ') 186 return 0; 187 cp--; 188 count--; 189 } 190 } 191 192 /* 193 * Handle the normal comparison cases. 194 */ 195 if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc))) 196 return 0; 197 198 count -= (c1 >= 0x10000) ? 2 : 1; 199 if (count > 0) { 200 cp--; 201 202 /* 203 * Get the next preceding character. 204 */ 205 if (start > text) { 206 c2 = *--start; 207 c1 = (start - 1 > text) ? *(start - 1) : ~0; 208 if (0xdc00 <= c2 && c2 <= 0xdfff && 209 0xd800 <= c1 && c1 <= 0xdbff) { 210 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 211 start--; 212 } else 213 c1 = c2; 214 } 215 } 216 } 217 218 /* 219 * Set the match start position. 220 */ 221 *match_start = start - text; 222 return 1; 223} 224 225/************************************************************************* 226 * 227 * API. 228 * 229 *************************************************************************/ 230 231utbm_pattern_t 232utbm_create_pattern(void) 233{ 234 utbm_pattern_t p; 235 236 p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t)); 237 (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t)); 238 return p; 239} 240 241void 242utbm_free_pattern(utbm_pattern_t pattern) 243{ 244 if (pattern == 0) 245 return; 246 247 if (pattern->pat_size > 0) 248 free((char *) pattern->pat); 249 250 if (pattern->skip_size > 0) 251 free((char *) pattern->skip); 252 253 free((char *) pattern); 254} 255 256void 257utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags, 258 utbm_pattern_t p) 259{ 260 int have_space; 261 unsigned long i, j, k, slen; 262 _utbm_char_t *cp; 263 _utbm_skip_t *sp; 264 ucs4_t c1, c2, sentinel; 265 266 if (p == 0 || pat == 0 || *pat == 0 || patlen == 0) 267 return; 268 269 /* 270 * Reset the pattern buffer. 271 */ 272 p->patlen = p->pat_used = p->skip_used = 0; 273 274 /* 275 * Set the flags. 276 */ 277 p->flags = flags; 278 279 /* 280 * Initialize the extra skip flag. 281 */ 282 p->md4 = 1; 283 284 /* 285 * Allocate more storage if necessary. 286 */ 287 if (patlen > p->pat_size) { 288 if (p->pat_size == 0) { 289 p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen); 290 p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen); 291 } else { 292 p->pat = (_utbm_char_t *) 293 realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen); 294 p->skip = (_utbm_skip_t *) 295 realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen); 296 } 297 p->pat_size = p->skip_size = patlen; 298 } 299 300 /* 301 * Preprocess the pattern to remove controls (if specified) and determine 302 * case. 303 */ 304 for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) { 305 c1 = pat[i]; 306 c2 = (i + 1 < patlen) ? pat[i + 1] : ~0; 307 if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) 308 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 309 310 /* 311 * Make sure the `have_space' flag is turned off if the character 312 * is not an appropriate one. 313 */ 314 if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS)) 315 have_space = 0; 316 317 /* 318 * If non-spacing characters should be ignored, do it here. 319 */ 320 if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1)) 321 continue; 322 323 /* 324 * Check if spaces and controls need to be compressed. 325 */ 326 if (flags & UTBM_SPACE_COMPRESS) { 327 if (_utbm_isspace(c1, 1)) { 328 if (!have_space) { 329 /* 330 * Add a space and set the flag. 331 */ 332 cp->uc = cp->lc = cp->tc = ' '; 333 cp++; 334 335 /* 336 * Increase the real pattern length. 337 */ 338 p->patlen++; 339 sentinel = ' '; 340 have_space = 1; 341 } 342 continue; 343 } 344 345 /* 346 * Ignore all control characters. 347 */ 348 if (_utbm_iscntrl(c1)) 349 continue; 350 } 351 352 /* 353 * Add the character. 354 */ 355 if (flags & UTBM_CASEFOLD) { 356 cp->uc = _utbm_toupper(c1); 357 cp->lc = _utbm_tolower(c1); 358 cp->tc = _utbm_totitle(c1); 359 } else 360 cp->uc = cp->lc = cp->tc = c1; 361 362 /* 363 * Set the sentinel character. 364 */ 365 sentinel = cp->uc; 366 367 /* 368 * Move to the next character. 369 */ 370 cp++; 371 372 /* 373 * Increase the real pattern length appropriately. 374 */ 375 p->patlen += (c1 >= 0x10000) ? 2 : 1; 376 377 /* 378 * Increment the loop index for UTF-16 characters. 379 */ 380 i += (c1 >= 0x10000) ? 1 : 0; 381 382 } 383 384 /* 385 * Set the number of characters actually used. 386 */ 387 p->pat_used = cp - p->pat; 388 389 /* 390 * Go through and construct the skip array and determine the actual length 391 * of the pattern in UCS2 terms. 392 */ 393 slen = p->patlen - 1; 394 cp = p->pat; 395 for (i = k = 0; i < p->pat_used; i++, cp++) { 396 /* 397 * Locate the character in the skip array. 398 */ 399 for (sp = p->skip, j = 0; 400 j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ; 401 402 /* 403 * If the character is not found, set the new skip element and 404 * increase the number of skip elements. 405 */ 406 if (j == p->skip_used) { 407 sp->ch = cp; 408 p->skip_used++; 409 } 410 411 /* 412 * Set the updated skip value. If the character is UTF-16 and is 413 * not the last one in the pattern, add one to its skip value. 414 */ 415 sp->skip = slen - k; 416 if (cp->uc >= 0x10000 && k + 2 < slen) 417 sp->skip++; 418 419 /* 420 * Set the new extra skip for the sentinel character. 421 */ 422 if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) && 423 cp->uc == sentinel) 424 p->md4 = slen - k; 425 426 /* 427 * Increase the actual index. 428 */ 429 k += (cp->uc >= 0x10000) ? 2 : 1; 430 } 431} 432 433int 434utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen, 435 unsigned long *match_start, unsigned long *match_end) 436{ 437 unsigned long k; 438 ucs2_t *start, *end; 439 440 if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 || 441 textlen < pat->patlen) 442 return 0; 443 444 start = text + pat->patlen; 445 end = text + textlen; 446 447 /* 448 * Adjust the start point if it points to a low surrogate. 449 */ 450 if (0xdc00 <= *start && *start <= 0xdfff && 451 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) 452 start--; 453 454 while (start < end) { 455 while ((k = _utbm_skip(pat, start, end))) { 456 start += k; 457 if (start < end && 0xdc00 <= *start && *start <= 0xdfff && 458 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) 459 start--; 460 } 461 462 if (start < end && 463 _utbm_match(pat, text, start, end, match_start, match_end)) 464 return 1; 465 466 start += pat->md4; 467 if (start < end && 0xdc00 <= *start && *start <= 0xdfff && 468 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) 469 start--; 470 } 471 return 0; 472} 473