1/* 2 tre-match-parallel.c - TRE parallel regex matching engine 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7*/ 8 9/* 10 This algorithm searches for matches basically by reading characters 11 in the searched string one by one, starting at the beginning. All 12 matching paths in the TNFA are traversed in parallel. When two or 13 more paths reach the same state, exactly one is chosen according to 14 tag ordering rules; if returning submatches is not required it does 15 not matter which path is chosen. 16 17 The worst case time required for finding the leftmost and longest 18 match, or determining that there is no match, is always linearly 19 dependent on the length of the text being searched. 20 21 This algorithm cannot handle TNFAs with back referencing nodes. 22 See `tre-match-backtrack.c'. 23*/ 24 25 26#ifdef HAVE_CONFIG_H 27#include <config.h> 28#endif /* HAVE_CONFIG_H */ 29 30/* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state 31 info while running */ 32#undef TRE_USE_ALLOCA 33 34#ifdef TRE_USE_ALLOCA 35/* AIX requires this to be the first thing in the file. */ 36#ifndef __GNUC__ 37# if HAVE_ALLOCA_H 38# include <alloca.h> 39# else 40# ifdef _AIX 41 #pragma alloca 42# else 43# ifndef alloca /* predefined by HP cc +Olibcalls */ 44char *alloca (); 45# endif 46# endif 47# endif 48#endif 49#endif /* TRE_USE_ALLOCA */ 50 51#include <assert.h> 52#include <stdlib.h> 53#include <string.h> 54#ifdef HAVE_WCHAR_H 55#include <wchar.h> 56#endif /* HAVE_WCHAR_H */ 57#ifdef HAVE_WCTYPE_H 58#include <wctype.h> 59#endif /* HAVE_WCTYPE_H */ 60#ifndef TRE_WCHAR 61#include <ctype.h> 62#endif /* !TRE_WCHAR */ 63#ifdef HAVE_MALLOC_H 64#include <malloc.h> 65#endif /* HAVE_MALLOC_H */ 66 67#include "tre-internal.h" 68#include "tre-match-utils.h" 69#include "tre.h" 70#include "xmalloc.h" 71 72 73 74typedef struct { 75 tre_tnfa_transition_t *state; 76 tre_tag_t *tags; 77} tre_tnfa_reach_t; 78 79typedef struct { 80 int pos; 81 tre_tag_t **tags; 82} tre_reach_pos_t; 83 84 85#ifdef TRE_DEBUG 86static void 87tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags) 88{ 89 DPRINT((" %p", (void *)state)); 90 if (num_tags > 0) 91 { 92 DPRINT(("/")); 93 tre_print_tags(tags, num_tags); 94 } 95} 96 97static void 98tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) 99{ 100 while (reach->state != NULL) 101 { 102 tre_print_reach1(reach->state, reach->tags, num_tags); 103 reach++; 104 } 105 DPRINT(("\n")); 106 107} 108#endif /* TRE_DEBUG */ 109 110reg_errcode_t 111tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, 112 tre_str_type_t type, tre_tag_t *match_tags, int eflags, 113 int *match_end_ofs) 114{ 115 /* State variables required by GET_NEXT_WCHAR. */ 116 tre_char_t prev_c = 0, next_c = 0; 117 const char *str_byte = string; 118 int pos = -1; 119 unsigned int pos_add_next = 1; 120#ifdef TRE_WCHAR 121 const wchar_t *str_wide = string; 122#ifdef TRE_MBSTATE 123 mbstate_t mbstate; 124#endif /* TRE_MBSTATE */ 125#endif /* TRE_WCHAR */ 126 int reg_notbol = eflags & REG_NOTBOL; 127 int reg_noteol = eflags & REG_NOTEOL; 128 int reg_newline = tnfa->cflags & REG_NEWLINE; 129#ifdef TRE_STR_USER 130 int str_user_end = 0; 131#endif /* TRE_STR_USER */ 132 133 char *buf; 134 tre_tnfa_transition_t *trans_i; 135 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; 136 tre_reach_pos_t *reach_pos; 137 int *tag_i; 138 int num_tags, i; 139 140 int match_eo = -1; /* end offset of match (-1 if no match found yet) */ 141#ifdef TRE_DEBUG 142 int once; 143#endif /* TRE_DEBUG */ 144 tre_tag_t *tmp_tags = NULL; 145 tre_tag_t *tmp_iptr; 146 int tbytes; 147 int touch = 1; 148 149#ifdef TRE_MBSTATE 150 memset(&mbstate, '\0', sizeof(mbstate)); 151#endif /* TRE_MBSTATE */ 152 153 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type)); 154 155 if (!match_tags) 156 num_tags = 0; 157 else 158 num_tags = tnfa->num_tags; 159 160 /* Allocate memory for temporary data required for matching. This needs to 161 be done for every matching operation to be thread safe. This allocates 162 everything in a single large block from the stack frame using alloca() 163 or with malloc() if alloca is unavailable. */ 164 { 165 int rbytes, pbytes, total_bytes; 166 char *tmp_buf; 167 /* Compute the length of the block we need. */ 168 tbytes = sizeof(*tmp_tags) * num_tags; 169 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); 170 pbytes = sizeof(*reach_pos) * tnfa->num_states; 171 total_bytes = 172 (sizeof(long) - 1) * 4 /* for alignment paddings */ 173 + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes; 174 175 DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes)); 176 /* Allocate the memory. */ 177#ifdef TRE_USE_ALLOCA 178 buf = alloca(total_bytes); 179#else /* !TRE_USE_ALLOCA */ 180 buf = xmalloc((unsigned)total_bytes); 181#endif /* !TRE_USE_ALLOCA */ 182 if (buf == NULL) 183 return REG_ESPACE; 184 memset(buf, 0, (size_t)total_bytes); 185 186 /* Get the various pointers within tmp_buf (properly aligned). */ 187 tmp_tags = (void *)buf; 188 tmp_buf = buf + tbytes; 189 tmp_buf += ALIGN(tmp_buf, long); 190 reach_next = (void *)tmp_buf; 191 tmp_buf += rbytes; 192 tmp_buf += ALIGN(tmp_buf, long); 193 reach = (void *)tmp_buf; 194 tmp_buf += rbytes; 195 tmp_buf += ALIGN(tmp_buf, long); 196 reach_pos = (void *)tmp_buf; 197 tmp_buf += pbytes; 198 tmp_buf += ALIGN(tmp_buf, long); 199 for (i = 0; i < tnfa->num_states; i++) 200 { 201 reach[i].tags = (void *)tmp_buf; 202 tmp_buf += tbytes; 203 reach_next[i].tags = (void *)tmp_buf; 204 tmp_buf += tbytes; 205 } 206 } 207 208 for (i = 0; i < tnfa->num_states; i++) 209 reach_pos[i].pos = -1; 210 211 /* If only one character can start a match, find it first. */ 212 if (tnfa->first_char >= 0 && str_byte) 213 { 214 const char *orig_str = str_byte; 215 int first = tnfa->first_char; 216 int found_high_bit = 0; 217 218 219 if (type == STR_BYTE) 220 { 221 if (len >= 0) 222 str_byte = memchr(orig_str, first, (size_t)len); 223 else 224 str_byte = strchr(orig_str, first); 225 } 226 else if (type == STR_MBS) 227 { 228 /* 229 * If the match character is ASCII, try to match the character 230 * directly, but if a high bit character is found, we stop there. 231 */ 232 if (first < 0x80) 233 { 234 if (len >= 0) 235 { 236 int i; 237 for (i = 0; ; str_byte++, i++) 238 { 239 if (i >= len) 240 { 241 str_byte = NULL; 242 break; 243 } 244 if (*str_byte == first) 245 break; 246 if (*str_byte & 0x80) 247 { 248 found_high_bit = 1; 249 break; 250 } 251 } 252 } 253 else 254 { 255 for (; ; str_byte++) 256 { 257 if (!*str_byte) 258 { 259 str_byte = NULL; 260 break; 261 } 262 if (*str_byte == first) 263 break; 264 if (*str_byte & 0x80) 265 { 266 found_high_bit = 1; 267 break; 268 } 269 } 270 } 271 } 272 else 273 { 274 if (len >= 0) 275 { 276 int i; 277 for (i = 0; ; str_byte++, i++) 278 { 279 if (i >= len) 280 { 281 str_byte = NULL; 282 break; 283 } 284 if (*str_byte & 0x80) 285 { 286 found_high_bit = 1; 287 break; 288 } 289 } 290 } 291 else 292 { 293 for (; ; str_byte++) 294 { 295 if (!*str_byte) 296 { 297 str_byte = NULL; 298 break; 299 } 300 if (*str_byte & 0x80) 301 { 302 found_high_bit = 1; 303 break; 304 } 305 } 306 } 307 } 308 } 309 if (str_byte == NULL) 310 { 311#ifndef TRE_USE_ALLOCA 312 if (buf) 313 xfree(buf); 314#endif /* !TRE_USE_ALLOCA */ 315 return REG_NOMATCH; 316 } 317 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); 318 if (!found_high_bit) 319 { 320 if (str_byte >= orig_str + 1) 321 prev_c = (unsigned char)*(str_byte - 1); 322 next_c = (unsigned char)*str_byte; 323 pos = str_byte - orig_str; 324 if (len < 0 || pos < len) 325 str_byte++; 326 } 327 else 328 { 329 if (str_byte == orig_str) 330 goto no_first_optimization; 331 /* 332 * Back up one character, fix up the position, then call 333 * GET_NEXT_WCHAR() to process the multibyte character. 334 */ 335 /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */ 336 next_c = (unsigned char)*(str_byte - 1); 337 pos = (str_byte - 1) - orig_str; 338 GET_NEXT_WCHAR(); 339 } 340 } 341 else 342 { 343no_first_optimization: 344 GET_NEXT_WCHAR(); 345 pos = 0; 346 } 347 348#ifdef USE_FIRSTPOS_CHARS /* not defined */ 349 /* Skip over characters that cannot possibly be the first character 350 of a match. */ 351 if (tnfa->firstpos_chars != NULL) 352 { 353 char *chars = tnfa->firstpos_chars; 354 355 if (len < 0) 356 { 357 const char *orig_str = str_byte; 358 /* XXX - use strpbrk() and wcspbrk() because they might be 359 optimized for the target architecture. Try also strcspn() 360 and wcscspn() and compare the speeds. */ 361 while (next_c != L'\0' && !chars[next_c]) 362 { 363 next_c = *str_byte++; 364 } 365 prev_c = *(str_byte - 2); 366 pos += str_byte - orig_str; 367 DPRINT(("skipped %d chars\n", str_byte - orig_str)); 368 } 369 else 370 { 371 while (pos <= len && !chars[next_c]) 372 { 373 prev_c = next_c; 374 next_c = (unsigned char)(*str_byte++); 375 pos++; 376 } 377 } 378 } 379#endif /* USE_FIRSTPOS_CHARS */ 380 381 DPRINT(("length: %d\n", len)); 382 DPRINT(("pos:chr/code | states and tags\n")); 383 DPRINT(("-------------+------------------------------------------------\n")); 384 385 reach_next_i = reach_next; 386 while (/*CONSTCOND*/1) 387 { 388 /* If no match found yet, add the initial states to `reach_next'. */ 389 if (match_eo < 0) 390 { 391 DPRINT((" init >")); 392 trans_i = tnfa->initial; 393 while (trans_i->state != NULL) 394 { 395 if (reach_pos[trans_i->state_id].pos < pos) 396 { 397 if (trans_i->assertions 398 && CHECK_ASSERTIONS(trans_i->assertions)) 399 { 400 DPRINT(("assertion failed\n")); 401 trans_i++; 402 continue; 403 } 404 405 DPRINT((" %p", (void *)trans_i->state)); 406 reach_next_i->state = trans_i->state; 407 memset(reach_next_i->tags, 0, tbytes); 408 tag_i = trans_i->tags; 409 if (tag_i) 410 { 411 while (*tag_i >= 0) 412 { 413 if (*tag_i < num_tags) 414 tre_tag_set(reach_next_i->tags, *tag_i, pos, touch); 415 tag_i++; 416 } 417 touch++; 418 } 419 if (reach_next_i->state == tnfa->final) 420 { 421 DPRINT((" found empty match\n")); 422 match_eo = pos; 423 memcpy(match_tags, reach_next_i->tags, tbytes); 424 } 425 reach_pos[trans_i->state_id].pos = pos; 426 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 427 reach_next_i++; 428 } 429 trans_i++; 430 } 431 DPRINT(("\n")); 432 reach_next_i->state = NULL; 433 } 434 else 435 { 436 if (num_tags == 0 || reach_next_i == reach_next) 437 /*�We have found a match. */ 438 break; 439 } 440 441 /* Check for end of string. */ 442 if (len < 0) 443 { 444#ifdef TRE_STR_USER 445 if (type == STR_USER) 446 { 447 if (str_user_end) 448 break; 449 } 450 else 451#endif /* TRE_STR_USER */ 452 if (next_c == L'\0') 453 break; 454 } 455 else 456 { 457 if (pos >= len) 458 break; 459 } 460 461 GET_NEXT_WCHAR(); 462 463#ifdef TRE_DEBUG 464 DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); 465 tre_print_reach(tnfa, reach_next, num_tags); 466 //DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c)); 467 //tre_print_reach(tnfa, reach_next, num_tags); 468#endif /* TRE_DEBUG */ 469 470 /* Swap `reach' and `reach_next'. */ 471 reach_i = reach; 472 reach = reach_next; 473 reach_next = reach_i; 474 475#ifdef TRE_DEBUG 476 once = 0; 477#endif /* TRE_DEBUG */ 478 479 /* For each state in `reach' see if there is a transition leaving with 480 the current input symbol to a state not yet in `reach_next', and 481 add the destination states to `reach_next'. */ 482 reach_next_i = reach_next; 483 for (reach_i = reach; reach_i->state; reach_i++) 484 { 485 for (trans_i = reach_i->state; trans_i->state; trans_i++) 486 { 487 /* Does this transition match the input symbol? */ 488 if (trans_i->code_min <= (tre_cint_t)prev_c && 489 trans_i->code_max >= (tre_cint_t)prev_c) 490 { 491 if (trans_i->assertions 492 && (CHECK_ASSERTIONS(trans_i->assertions) 493 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) 494 { 495 DPRINT(("assertion failed\n")); 496 continue; 497 } 498 499 /* Compute the tags after this transition. */ 500 memcpy(tmp_tags, reach_i->tags, tbytes); 501 tag_i = trans_i->tags; 502 if (tag_i != NULL) 503 { 504 while (*tag_i >= 0) 505 { 506 if (*tag_i < num_tags) 507 tre_tag_set(tmp_tags, *tag_i, pos, touch); 508 tag_i++; 509 } 510 touch++; 511 } 512 513 /* For each new transition, weed out those that don't 514 fulfill the minimal matching conditions. */ 515 if (tnfa->num_minimals && match_eo >= 0) 516 { 517 int skip = 0; 518#ifdef TRE_DEBUG 519 if (!once) 520 { 521 DPRINT(("Checking minimal conditions: match_eo=%d " 522 "match_tags=", match_eo)); 523 tre_print_tags(match_tags, num_tags); 524 DPRINT(("\n")); 525 once++; 526 } 527#endif /* TRE_DEBUG */ 528 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) 529 { 530 int end = tnfa->minimal_tags[i]; 531 int start = tnfa->minimal_tags[i + 1]; 532 DPRINT((" Minimal start %d, end %d\n", start, end)); 533 if (tre_minimal_tag_order(start, end, match_tags, 534 tmp_tags) > 0) 535 { 536 skip = 1; 537 break; 538 } 539 } 540 if (skip) 541 { 542#ifdef TRE_DEBUG 543 DPRINT((" Throwing out")); 544 tre_print_reach1(reach_i->state, tmp_tags, 545 num_tags); 546 DPRINT(("\n")); 547#endif /* TRE_DEBUG */ 548 continue; 549 } 550 } 551 552 if (reach_pos[trans_i->state_id].pos < pos) 553 { 554 /* Found an unvisited node. */ 555 reach_next_i->state = trans_i->state; 556 tmp_iptr = reach_next_i->tags; 557 reach_next_i->tags = tmp_tags; 558 tmp_tags = tmp_iptr; 559 reach_pos[trans_i->state_id].pos = pos; 560 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 561 562 if (reach_next_i->state == tnfa->final 563 && (match_eo == -1 564 || (num_tags > 0 565 && tre_tag_get(reach_next_i->tags, 0) <= 566 tre_tag_get(match_tags, 0)))) 567 { 568#ifdef TRE_DEBUG 569 DPRINT((" found match")); 570 tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags); 571 DPRINT(("\n")); 572#endif /* TRE_DEBUG */ 573 match_eo = pos; 574 memcpy(match_tags, reach_next_i->tags, tbytes); 575 } 576 reach_next_i++; 577 578 } 579 else 580 { 581 assert(reach_pos[trans_i->state_id].pos == pos); 582 /* Another path has also reached this state. We choose 583 the winner by examining the tag values for both 584 paths. */ 585 if (tre_tag_order(num_tags, tnfa->tag_directions, 586 tmp_tags, 587 *reach_pos[trans_i->state_id].tags)) 588 { 589 /* The new path wins. */ 590 tmp_iptr = *reach_pos[trans_i->state_id].tags; 591 *reach_pos[trans_i->state_id].tags = tmp_tags; 592 if (trans_i->state == tnfa->final) 593 { 594#ifdef TRE_DEBUG 595 DPRINT((" found better match")); 596 tre_print_reach1(trans_i->state, tmp_tags, num_tags); 597 DPRINT(("\n")); 598#endif /* TRE_DEBUG */ 599 match_eo = pos; 600 memcpy(match_tags, tmp_tags, tbytes); 601 } 602 tmp_tags = tmp_iptr; 603 } 604 } 605 } 606 } 607 } 608 reach_next_i->state = NULL; 609 } 610 611 DPRINT(("match end offset = %d\n", match_eo)); 612 613 *match_end_ofs = match_eo; 614#ifdef TRE_DEBUG 615 if (match_tags) 616 { 617 DPRINT(("Winning tags=")); 618 tre_print_tags_all(match_tags, num_tags); 619 DPRINT((" touch=%d\n", touch)); 620 } 621#endif /* TRE_DEBUG */ 622 623#ifndef TRE_USE_ALLOCA 624 if (buf) 625 xfree(buf); 626#endif /* !TRE_USE_ALLOCA */ 627 628 return match_eo >= 0 ? REG_OK : REG_NOMATCH; 629} 630 631/* EOF */ 632