1/* 2 tre-match-utils.h - TRE matcher helper definitions 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7*/ 8 9#include "tre-internal.h" 10 11#define str_source ((const tre_str_source*)string) 12 13#ifdef TRE_WCHAR 14 15#ifdef TRE_MULTIBYTE 16 17/* Wide character and multibyte support. */ 18 19#ifdef TRE_STR_USER 20#error TRE_STR_USER defined 21#define GET_NEXT_WCHAR() \ 22 do { \ 23 prev_c = next_c; \ 24 if (type == STR_BYTE) \ 25 { \ 26 pos++; \ 27 if (len >= 0 && pos >= len) \ 28 next_c = '\0'; \ 29 else \ 30 next_c = (unsigned char)(*str_byte++); \ 31 } \ 32 else if (type == STR_WIDE) \ 33 { \ 34 pos++; \ 35 if (len >= 0 && pos >= len) \ 36 next_c = L'\0'; \ 37 else \ 38 next_c = *str_wide++; \ 39 } \ 40 else if (type == STR_MBS) \ 41 { \ 42 pos += pos_add_next; \ 43 if (str_byte == NULL) \ 44 next_c = L'\0'; \ 45 else \ 46 { \ 47 size_t w; \ 48 int max; \ 49 if (len >= 0) \ 50 max = len - pos; \ 51 else \ 52 max = 32; \ 53 if (max <= 0) \ 54 { \ 55 next_c = L'\0'; \ 56 pos_add_next = 1; \ 57 } \ 58 else \ 59 { \ 60 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \ 61 tnfa->loc); \ 62 if (w == (size_t)-1 || w == (size_t)-2) \ 63 return REG_ILLSEQ; \ 64 if (w == 0 && len >= 0) \ 65 { \ 66 pos_add_next = 1; \ 67 next_c = 0; \ 68 str_byte++; \ 69 } \ 70 else \ 71 { \ 72 pos_add_next = w; \ 73 str_byte += w; \ 74 } \ 75 } \ 76 } \ 77 } \ 78 else if (type == STR_USER) \ 79 { \ 80 pos += pos_add_next; \ 81 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ 82 str_source->context); \ 83 } \ 84 } while(/*CONSTCOND*/0) 85#else /* !TRE_STR_USER */ 86/* 87 * Because all multibyte encodings are exclusively single-shift encoding, 88 * with the shift codes having the high bit set, we can make an optimization 89 * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character 90 * is detected, and just do a direct copy for ASCII characters. 91 */ 92#define GET_NEXT_WCHAR() \ 93 do { \ 94 prev_c = next_c; \ 95 switch (type) \ 96 { \ 97 case STR_BYTE: \ 98 pos++; \ 99 if (len >= 0 && pos >= len) \ 100 next_c = '\0'; \ 101 else \ 102 next_c = (unsigned char)(*str_byte++); \ 103 break; \ 104 case STR_WIDE: \ 105 pos++; \ 106 if (len >= 0 && pos >= len) \ 107 next_c = L'\0'; \ 108 else \ 109 next_c = *str_wide++; \ 110 break; \ 111 case STR_MBS: \ 112 pos += pos_add_next; \ 113 if (__builtin_expect(len >= 0 && pos >= len, 0)) \ 114 { \ 115 next_c = L'\0'; \ 116 pos_add_next = 1; \ 117 } \ 118 else if (__builtin_expect(!(*str_byte & 0x80), 1)) \ 119 { \ 120 next_c = (unsigned char)(*str_byte++); \ 121 pos_add_next = 1; \ 122 } \ 123 else \ 124 { \ 125 size_t w; \ 126 int max; \ 127 if (len >= 0) \ 128 max = len - pos; \ 129 else \ 130 max = 32; \ 131 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \ 132 tnfa->loc); \ 133 if (w == (size_t)-1 || w == (size_t)-2) \ 134 return REG_ILLSEQ; \ 135 if (w == 0 && len >= 0) \ 136 { \ 137 pos_add_next = 1; \ 138 next_c = 0; \ 139 str_byte++; \ 140 } \ 141 else \ 142 { \ 143 pos_add_next = w; \ 144 str_byte += w; \ 145 } \ 146 } \ 147 break; \ 148 } \ 149 } while(/*CONSTCOND*/0) 150#endif /* !TRE_STR_USER */ 151 152#else /* !TRE_MULTIBYTE */ 153 154/* Wide character support, no multibyte support. */ 155#error TRE_MULTIBYTE undefined 156 157#ifdef TRE_STR_USER 158#define GET_NEXT_WCHAR() \ 159 do { \ 160 prev_c = next_c; \ 161 if (type == STR_BYTE) \ 162 { \ 163 pos++; \ 164 if (len >= 0 && pos >= len) \ 165 next_c = '\0'; \ 166 else \ 167 next_c = (unsigned char)(*str_byte++); \ 168 } \ 169 else if (type == STR_WIDE) \ 170 { \ 171 pos++; \ 172 if (len >= 0 && pos >= len) \ 173 next_c = L'\0'; \ 174 else \ 175 next_c = *str_wide++; \ 176 } \ 177 else if (type == STR_USER) \ 178 { \ 179 pos += pos_add_next; \ 180 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ 181 str_source->context); \ 182 } \ 183 } while(/*CONSTCOND*/0) 184#else /* !TRE_STR_USER */ 185#define GET_NEXT_WCHAR() \ 186 do { \ 187 prev_c = next_c; \ 188 if (type == STR_BYTE) \ 189 { \ 190 pos++; \ 191 if (len >= 0 && pos >= len) \ 192 next_c = '\0'; \ 193 else \ 194 next_c = (unsigned char)(*str_byte++); \ 195 } \ 196 else if (type == STR_WIDE) \ 197 { \ 198 pos++; \ 199 if (len >= 0 && pos >= len) \ 200 next_c = L'\0'; \ 201 else \ 202 next_c = *str_wide++; \ 203 } \ 204 } while(/*CONSTCOND*/0) 205#endif /* !TRE_STR_USER */ 206 207#endif /* !TRE_MULTIBYTE */ 208 209#else /* !TRE_WCHAR */ 210 211/* No wide character or multibyte support. */ 212#error TRE_WCHAR undefined 213 214#ifdef TRE_STR_USER 215#define GET_NEXT_WCHAR() \ 216 do { \ 217 prev_c = next_c; \ 218 if (type == STR_BYTE) \ 219 { \ 220 pos++; \ 221 if (len >= 0 && pos >= len) \ 222 next_c = '\0'; \ 223 else \ 224 next_c = (unsigned char)(*str_byte++); \ 225 } \ 226 else if (type == STR_USER) \ 227 { \ 228 pos += pos_add_next; \ 229 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ 230 str_source->context); \ 231 } \ 232 } while(/*CONSTCOND*/0) 233#else /* !TRE_STR_USER */ 234#define GET_NEXT_WCHAR() \ 235 do { \ 236 prev_c = next_c; \ 237 if (type == STR_BYTE) \ 238 { \ 239 pos++; \ 240 if (len >= 0 && pos >= len) \ 241 next_c = '\0'; \ 242 else \ 243 next_c = (unsigned char)(*str_byte++); \ 244 } \ 245 } while(/*CONSTCOND*/0) 246#endif /* !TRE_STR_USER */ 247 248#endif /* !TRE_WCHAR */ 249 250 251 252/* Assumes tre_tnfa_t *tnfa in scope */ 253#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum_l(c, tnfa->loc)) 254 255#define CHECK_ASSERTIONS(assertions) \ 256 (((assertions & ASSERT_AT_BOL) \ 257 && (pos > 0 || reg_notbol) \ 258 && (prev_c != L'\n' || !reg_newline)) \ 259 || ((assertions & ASSERT_AT_EOL) \ 260 && (next_c != L'\0' || reg_noteol) \ 261 && (next_c != L'\n' || !reg_newline)) \ 262 || ((assertions & ASSERT_AT_BOW) \ 263 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \ 264 || ((assertions & ASSERT_AT_EOW) \ 265 && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \ 266 || ((assertions & ASSERT_AT_WB) \ 267 && (pos != 0 && next_c != L'\0' \ 268 && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \ 269 || ((assertions & ASSERT_AT_WB_NEG) \ 270 && (pos == 0 || next_c == L'\0' \ 271 || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c)))) 272 273#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \ 274 ((trans_i->assertions & ASSERT_BRACKET_MATCH) \ 275 && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c, \ 276 tnfa)) 277 278 279 280 281inline static int 282tre_tag_get(const tre_tag_t *tags, int i) 283{ 284 tags += i; 285 return tags->count > 0 ? tags->value : -1; 286} 287 288inline static void 289tre_tag_set(tre_tag_t *tags, int i, int val, int touch) 290{ 291 tags += i; 292 if (tags->count++ == 0) 293 tags->first = val; 294 tags->value = val; 295 tags->touch = touch; 296} 297 298inline static void 299tre_tag_reset(tre_tag_t *tags, int i) 300{ 301 tags[i].count = 0; 302} 303 304inline static int 305tre_tag_touch_get(const tre_tag_t *tags, int i) 306{ 307 return tags[i].touch; 308} 309 310#ifdef TRE_DEBUG 311inline static void 312tre_print_tags(const tre_tag_t *tags, int num_tags) 313{ 314 int i; 315 for (i = 0; i < num_tags; i++, tags++) 316 { 317 switch(tags->count) 318 { 319 case 0: 320 DPRINT(("%d:(0,-1)", i)); 321 break; 322 case 1: 323 DPRINT(("%d:(1,%d)", i, tags->first)); 324 break; 325 default: 326 DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first, 327 tags->value)); 328 break; 329 } 330 if (i < (num_tags - 1)) 331 DPRINT((" ")); 332 } 333} 334 335inline static void 336tre_print_tags_all(const tre_tag_t *tags, int num_tags) 337{ 338 int i; 339 for (i = 0; i < num_tags; i++, tags++) 340 { 341 switch(tags->count) 342 { 343 case 0: 344 DPRINT(("%d:(0,-1)/%d", i, tags->touch)); 345 break; 346 case 1: 347 DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch)); 348 break; 349 default: 350 DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first, 351 tags->value, tags->touch)); 352 break; 353 } 354 if (i < (num_tags - 1)) 355 DPRINT((" ")); 356 } 357} 358#endif /* TRE_DEBUG */ 359 360/* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal 361 * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */ 362inline static int 363tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1, 364 const tre_tag_t *tags2) 365{ 366 const tre_tag_t *t1, *t2; 367 368 t1 = tags1 + start; 369 t2 = tags2 + start; 370 /* We need both start tags to be set */ 371 if (t1->count == 0 || t2->count == 0) 372 return 0; 373 374 /* The start tags must be equal */ 375 if (t1->value != t2->value) 376 return 0; 377 378 t1 = tags1 + end; 379 t2 = tags2 + end; 380 /* For the end tags, we prefer set over unset, because unset means that 381 * the end tag is still growing */ 382 if (t1->count == 0) 383 { 384 /* if t2 is set, t1 loses since it is unset */ 385 if (t2->count != 0) 386 return -1; 387 } 388 /* if t2 not set, t1 wins since it is set */ 389 else if (t2->count == 0) 390 return 1; 391 392 /* least current value wins */ 393 return t2->value - t1->value; 394} 395 396/* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare 397 * (t1 loses if < 0, t1 wins if > 0) */ 398inline static int 399tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1, 400 const tre_tag_t *t2) 401{ 402 int diff; 403 404 t1 += i; 405 t2 += i; 406 switch (dir) 407 { 408 case TRE_TAG_MINIMIZE: 409 /* least current value wins (because tags are initialized to all zeros, 410 * unset wins over set; also, tre_minimal_tag_order() will have already 411 * been run, which checks for being unset) */ 412 return t2->value - t1->value; 413 414 case TRE_TAG_MAXIMIZE: 415 /* prefer set */ 416 if (t1->count == 0) 417 { 418 /* if neither t1 and t2 are set, try next tag */ 419 if (t2->count == 0) 420 return 0; 421 /* t2 is set, t1 loses since it is unset */ 422 return -1; 423 } 424 /* if t2 not set, t1 wins since it is set */ 425 else if (t2->count == 0) 426 return 1; 427 /* greatest initial value wins */ 428 if ((diff = t1->first - t2->first) != 0) 429 return diff; 430 /* least number of times the tag was set, wins */ 431 if ((diff = t2->count - t1->count) != 0) 432 return diff; 433 /* if the tags were only set once, they only have initial values */ 434 if (t1->count == 1) 435 return 0; 436 /* greatest current value wins */ 437 return t1->value - t2->value; 438 439 case TRE_TAG_LEFT_MAXIMIZE: 440 /* prefer set */ 441 if (t1->count == 0) 442 { 443 /* if neither t1 and t2 are set, try next tag */ 444 if (t2->count == 0) 445 return 0; 446 /* t2 is set, t1 loses since it is unset */ 447 return -1; 448 } 449 /* if t2 not set, t1 wins since it is set */ 450 else if (t2->count == 0) 451 return 1; 452 /* least initial value wins */ 453 if ((diff = t2->first - t1->first) != 0) 454 return diff; 455 /* least number of times the tag was set, wins */ 456 if ((diff = t2->count - t1->count) != 0) 457 return diff; 458 /* if the tags were only set once, they only have initial values */ 459 if (t1->count == 1) 460 return 0; 461 /* greatest current value wins */ 462 return t1->value - t2->value; 463 464 default: 465 /* Shouldn't happen: only assert if TRE_DEBUG defined */ 466 assert(0); 467 break; 468 } 469 return 0; 470} 471 472#ifdef TRE_DEBUG 473#define _MORE_DEBUGGING 474#endif /* TRE_DEBUG */ 475 476/* Returns 1 if `t1' wins `t2', 0 otherwise. */ 477inline static int 478#ifdef _MORE_DEBUGGING 479_tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, 480 const tre_tag_t *t1, const tre_tag_t *t2) 481#else /* !_MORE_DEBUGGING */ 482tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, 483 const tre_tag_t *t1, const tre_tag_t *t2) 484#endif /* !_MORE_DEBUGGING */ 485{ 486 int i, ret; 487 488 for (i = 0; i < num_tags; i++) 489 { 490 if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0) 491 return (ret > 0); 492 } 493 /* assert(0);*/ 494 return 0; 495} 496 497#ifdef _MORE_DEBUGGING 498inline static int 499tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, 500 const tre_tag_t *t1, const tre_tag_t *t2) 501{ 502 int ret = _tre_tag_order(num_tags, tag_directions, t1, t2); 503 DPRINT(("tre_tag_order: ")); 504 tre_print_tags(t1, num_tags); 505 DPRINT((" %s ", ret ? "wins" : "doesn't win")); 506 tre_print_tags(t2, num_tags); 507 DPRINT(("\n")); 508 return ret; 509} 510#endif /* _MORE_DEBUGGING */ 511 512#ifdef __LIBC__ 513#include <xlocale_private.h> 514#else /* !__LIBC__ */ 515#include <xlocale.h> 516#endif /* !__LIBC__ */ 517 518int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len); 519 520inline static int 521tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc, 522 const tre_tnfa_t * __restrict tnfa) 523{ 524 int match = 0; 525 int i; 526 tre_bracket_match_t *b; 527 tre_cint_t uc, lc; 528 int we, ue, le, got_equiv = 0; 529 int icase = ((tnfa->cflags & REG_ICASE) != 0); 530 531 DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase)); 532 if (icase) 533 { 534 if (tre_islower_l(wc, tnfa->loc)) 535 { 536 lc = wc; 537 uc = tre_toupper_l(wc, tnfa->loc); 538 } 539 else if (tre_isupper_l(wc, tnfa->loc)) 540 { 541 uc = wc; 542 lc = tre_tolower_l(wc, tnfa->loc); 543 } 544 else 545 { 546 icase = 0; 547 } 548 } 549 for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches; 550 i++, b++) 551 { 552 switch (b->type) 553 { 554 case TRE_BRACKET_MATCH_TYPE_CHAR: 555 if (icase) 556 match = (b->value == uc || b->value == lc); 557 else 558 match = (b->value == wc); 559 break; 560 case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN: 561 { 562 tre_cint_t start = b->value, end; 563 if (++i >= list->num_bracket_matches || 564 (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END) 565 { 566 DPRINT(("tre_bracket_match: no following range end\n")); 567 assert(0); 568 goto error; 569 } 570 end = b->value; 571 if (!got_equiv) 572 { 573 if (icase) 574 { 575 ue = __collate_equiv_value(tnfa->loc, &uc, 1); 576 le = __collate_equiv_value(tnfa->loc, &lc, 1); 577 } 578 else 579 we = __collate_equiv_value(tnfa->loc, &wc, 1); 580 got_equiv = 1; 581 } 582 if (icase) 583 match = ((start <= ue && ue <= end) || 584 (start <= le && le <= end)); 585 else 586 match = (start <= we && we <= end); 587 break; 588 } 589 case TRE_BRACKET_MATCH_TYPE_RANGE_END: 590 DPRINT(("tre_bracket_match: range end without preceeding start\n")); 591 assert(0); 592 break; 593 case TRE_BRACKET_MATCH_TYPE_CLASS: 594 if (icase) 595 match = (tre_isctype_l(uc, b->value, tnfa->loc) || 596 tre_isctype_l(lc, b->value, tnfa->loc)); 597 else 598 match = (tre_isctype_l(wc, b->value, tnfa->loc)); 599 break; 600 case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE: 601 if (!got_equiv) 602 { 603 if (icase) 604 { 605 ue = __collate_equiv_value(tnfa->loc, &uc, 1); 606 le = __collate_equiv_value(tnfa->loc, &lc, 1); 607 } 608 else 609 we = __collate_equiv_value(tnfa->loc, &wc, 1); 610 got_equiv = 1; 611 } 612 if (icase) 613 match = (b->value == ue || b->value == le); 614 else 615 match = (b->value == we); 616 break; 617 default: 618 DPRINT(("tre_bracket_match: unknown type %d\n", b->type)); 619 assert(0); 620 break; 621 } 622 if (match) 623 break; 624 } 625error: 626 if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) { 627 if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0; 628 match = !match; 629 } 630 return match; 631} 632