1/* -*- buffer-read-only: t -*- vi: set ro: */ 2/* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3#line 1 4/* Extended regular expression matching and search library. 5 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free 6 Software Foundation, Inc. 7 This file is part of the GNU C Library. 8 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 9 10 This program is free software; you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 3, or (at your option) 13 any later version. 14 15 This program is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License along 21 with this program; if not, write to the Free Software Foundation, 22 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 23 24#include "verify.h" 25#include "intprops.h" 26static void re_string_construct_common (const char *str, Idx len, 27 re_string_t *pstr, 28 RE_TRANSLATE_TYPE trans, bool icase, 29 const re_dfa_t *dfa) internal_function; 30static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 31 const re_node_set *nodes, 32 re_hashval_t hash) internal_function; 33static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 34 const re_node_set *nodes, 35 unsigned int context, 36 re_hashval_t hash) internal_function; 37 38/* Functions for string operation. */ 39 40/* This function allocate the buffers. It is necessary to call 41 re_string_reconstruct before using the object. */ 42 43static reg_errcode_t 44internal_function 45re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 46 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 47{ 48 reg_errcode_t ret; 49 Idx init_buf_len; 50 51 /* Ensure at least one character fits into the buffers. */ 52 if (init_len < dfa->mb_cur_max) 53 init_len = dfa->mb_cur_max; 54 init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 55 re_string_construct_common (str, len, pstr, trans, icase, dfa); 56 57 ret = re_string_realloc_buffers (pstr, init_buf_len); 58 if (BE (ret != REG_NOERROR, 0)) 59 return ret; 60 61 pstr->word_char = dfa->word_char; 62 pstr->word_ops_used = dfa->word_ops_used; 63 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 64 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 65 pstr->valid_raw_len = pstr->valid_len; 66 return REG_NOERROR; 67} 68 69/* This function allocate the buffers, and initialize them. */ 70 71static reg_errcode_t 72internal_function 73re_string_construct (re_string_t *pstr, const char *str, Idx len, 74 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 75{ 76 reg_errcode_t ret; 77 memset (pstr, '\0', sizeof (re_string_t)); 78 re_string_construct_common (str, len, pstr, trans, icase, dfa); 79 80 if (len > 0) 81 { 82 ret = re_string_realloc_buffers (pstr, len + 1); 83 if (BE (ret != REG_NOERROR, 0)) 84 return ret; 85 } 86 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 87 88 if (icase) 89 { 90#ifdef RE_ENABLE_I18N 91 if (dfa->mb_cur_max > 1) 92 { 93 while (1) 94 { 95 ret = build_wcs_upper_buffer (pstr); 96 if (BE (ret != REG_NOERROR, 0)) 97 return ret; 98 if (pstr->valid_raw_len >= len) 99 break; 100 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 101 break; 102 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 103 if (BE (ret != REG_NOERROR, 0)) 104 return ret; 105 } 106 } 107 else 108#endif /* RE_ENABLE_I18N */ 109 build_upper_buffer (pstr); 110 } 111 else 112 { 113#ifdef RE_ENABLE_I18N 114 if (dfa->mb_cur_max > 1) 115 build_wcs_buffer (pstr); 116 else 117#endif /* RE_ENABLE_I18N */ 118 { 119 if (trans != NULL) 120 re_string_translate_buffer (pstr); 121 else 122 { 123 pstr->valid_len = pstr->bufs_len; 124 pstr->valid_raw_len = pstr->bufs_len; 125 } 126 } 127 } 128 129 return REG_NOERROR; 130} 131 132/* Helper functions for re_string_allocate, and re_string_construct. */ 133 134static reg_errcode_t 135internal_function 136re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 137{ 138#ifdef RE_ENABLE_I18N 139 if (pstr->mb_cur_max > 1) 140 { 141 wint_t *new_wcs; 142 143 /* Avoid overflow. */ 144 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); 145 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) 146 return REG_ESPACE; 147 148 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 149 if (BE (new_wcs == NULL, 0)) 150 return REG_ESPACE; 151 pstr->wcs = new_wcs; 152 if (pstr->offsets != NULL) 153 { 154 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len); 155 if (BE (new_offsets == NULL, 0)) 156 return REG_ESPACE; 157 pstr->offsets = new_offsets; 158 } 159 } 160#endif /* RE_ENABLE_I18N */ 161 if (pstr->mbs_allocated) 162 { 163 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 164 new_buf_len); 165 if (BE (new_mbs == NULL, 0)) 166 return REG_ESPACE; 167 pstr->mbs = new_mbs; 168 } 169 pstr->bufs_len = new_buf_len; 170 return REG_NOERROR; 171} 172 173 174static void 175internal_function 176re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 177 RE_TRANSLATE_TYPE trans, bool icase, 178 const re_dfa_t *dfa) 179{ 180 pstr->raw_mbs = (const unsigned char *) str; 181 pstr->len = len; 182 pstr->raw_len = len; 183 pstr->trans = trans; 184 pstr->icase = icase; 185 pstr->mbs_allocated = (trans != NULL || icase); 186 pstr->mb_cur_max = dfa->mb_cur_max; 187 pstr->is_utf8 = dfa->is_utf8; 188 pstr->map_notascii = dfa->map_notascii; 189 pstr->stop = pstr->len; 190 pstr->raw_stop = pstr->stop; 191} 192 193#ifdef RE_ENABLE_I18N 194 195/* Build wide character buffer PSTR->WCS. 196 If the byte sequence of the string are: 197 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 198 Then wide character buffer will be: 199 <wc1> , WEOF , <wc2> , WEOF , <wc3> 200 We use WEOF for padding, they indicate that the position isn't 201 a first byte of a multibyte character. 202 203 Note that this function assumes PSTR->VALID_LEN elements are already 204 built and starts from PSTR->VALID_LEN. */ 205 206static void 207internal_function 208build_wcs_buffer (re_string_t *pstr) 209{ 210#ifdef _LIBC 211 unsigned char buf[MB_LEN_MAX]; 212 assert (MB_LEN_MAX >= pstr->mb_cur_max); 213#else 214 unsigned char buf[64]; 215#endif 216 mbstate_t prev_st; 217 Idx byte_idx, end_idx, remain_len; 218 size_t mbclen; 219 220 /* Build the buffers from pstr->valid_len to either pstr->len or 221 pstr->bufs_len. */ 222 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 223 for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 224 { 225 wchar_t wc; 226 const char *p; 227 228 remain_len = end_idx - byte_idx; 229 prev_st = pstr->cur_state; 230 /* Apply the translation if we need. */ 231 if (BE (pstr->trans != NULL, 0)) 232 { 233 int i, ch; 234 235 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 236 { 237 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 238 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 239 } 240 p = (const char *) buf; 241 } 242 else 243 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 244 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 245 if (BE (mbclen == (size_t) -2, 0)) 246 { 247 /* The buffer doesn't have enough space, finish to build. */ 248 pstr->cur_state = prev_st; 249 break; 250 } 251 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 252 { 253 /* We treat these cases as a singlebyte character. */ 254 mbclen = 1; 255 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 256 if (BE (pstr->trans != NULL, 0)) 257 wc = pstr->trans[wc]; 258 pstr->cur_state = prev_st; 259 } 260 261 /* Write wide character and padding. */ 262 pstr->wcs[byte_idx++] = wc; 263 /* Write paddings. */ 264 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 265 pstr->wcs[byte_idx++] = WEOF; 266 } 267 pstr->valid_len = byte_idx; 268 pstr->valid_raw_len = byte_idx; 269} 270 271/* Build wide character buffer PSTR->WCS like build_wcs_buffer, 272 but for REG_ICASE. */ 273 274static reg_errcode_t 275internal_function 276build_wcs_upper_buffer (re_string_t *pstr) 277{ 278 mbstate_t prev_st; 279 Idx src_idx, byte_idx, end_idx, remain_len; 280 size_t mbclen; 281#ifdef _LIBC 282 char buf[MB_LEN_MAX]; 283 assert (MB_LEN_MAX >= pstr->mb_cur_max); 284#else 285 char buf[64]; 286#endif 287 288 byte_idx = pstr->valid_len; 289 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 290 291 /* The following optimization assumes that ASCII characters can be 292 mapped to wide characters with a simple cast. */ 293 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 294 { 295 while (byte_idx < end_idx) 296 { 297 wchar_t wc; 298 299 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 300 && mbsinit (&pstr->cur_state)) 301 { 302 /* In case of a singlebyte character. */ 303 pstr->mbs[byte_idx] 304 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 305 /* The next step uses the assumption that wchar_t is encoded 306 ASCII-safe: all ASCII values can be converted like this. */ 307 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 308 ++byte_idx; 309 continue; 310 } 311 312 remain_len = end_idx - byte_idx; 313 prev_st = pstr->cur_state; 314 mbclen = __mbrtowc (&wc, 315 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 316 + byte_idx), remain_len, &pstr->cur_state); 317 if (BE (mbclen < (size_t) -2, 1)) 318 { 319 wchar_t wcu = wc; 320 if (iswlower (wc)) 321 { 322 size_t mbcdlen; 323 324 wcu = towupper (wc); 325 mbcdlen = wcrtomb (buf, wcu, &prev_st); 326 if (BE (mbclen == mbcdlen, 1)) 327 memcpy (pstr->mbs + byte_idx, buf, mbclen); 328 else 329 { 330 src_idx = byte_idx; 331 goto offsets_needed; 332 } 333 } 334 else 335 memcpy (pstr->mbs + byte_idx, 336 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 337 pstr->wcs[byte_idx++] = wcu; 338 /* Write paddings. */ 339 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 340 pstr->wcs[byte_idx++] = WEOF; 341 } 342 else if (mbclen == (size_t) -1 || mbclen == 0) 343 { 344 /* It is an invalid character or '\0'. Just use the byte. */ 345 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 346 pstr->mbs[byte_idx] = ch; 347 /* And also cast it to wide char. */ 348 pstr->wcs[byte_idx++] = (wchar_t) ch; 349 if (BE (mbclen == (size_t) -1, 0)) 350 pstr->cur_state = prev_st; 351 } 352 else 353 { 354 /* The buffer doesn't have enough space, finish to build. */ 355 pstr->cur_state = prev_st; 356 break; 357 } 358 } 359 pstr->valid_len = byte_idx; 360 pstr->valid_raw_len = byte_idx; 361 return REG_NOERROR; 362 } 363 else 364 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 365 { 366 wchar_t wc; 367 const char *p; 368 offsets_needed: 369 remain_len = end_idx - byte_idx; 370 prev_st = pstr->cur_state; 371 if (BE (pstr->trans != NULL, 0)) 372 { 373 int i, ch; 374 375 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 376 { 377 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 378 buf[i] = pstr->trans[ch]; 379 } 380 p = (const char *) buf; 381 } 382 else 383 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 384 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 385 if (BE (mbclen < (size_t) -2, 1)) 386 { 387 wchar_t wcu = wc; 388 if (iswlower (wc)) 389 { 390 size_t mbcdlen; 391 392 wcu = towupper (wc); 393 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 394 if (BE (mbclen == mbcdlen, 1)) 395 memcpy (pstr->mbs + byte_idx, buf, mbclen); 396 else if (mbcdlen != (size_t) -1) 397 { 398 size_t i; 399 400 if (byte_idx + mbcdlen > pstr->bufs_len) 401 { 402 pstr->cur_state = prev_st; 403 break; 404 } 405 406 if (pstr->offsets == NULL) 407 { 408 pstr->offsets = re_malloc (Idx, pstr->bufs_len); 409 410 if (pstr->offsets == NULL) 411 return REG_ESPACE; 412 } 413 if (!pstr->offsets_needed) 414 { 415 for (i = 0; i < (size_t) byte_idx; ++i) 416 pstr->offsets[i] = i; 417 pstr->offsets_needed = 1; 418 } 419 420 memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 421 pstr->wcs[byte_idx] = wcu; 422 pstr->offsets[byte_idx] = src_idx; 423 for (i = 1; i < mbcdlen; ++i) 424 { 425 pstr->offsets[byte_idx + i] 426 = src_idx + (i < mbclen ? i : mbclen - 1); 427 pstr->wcs[byte_idx + i] = WEOF; 428 } 429 pstr->len += mbcdlen - mbclen; 430 if (pstr->raw_stop > src_idx) 431 pstr->stop += mbcdlen - mbclen; 432 end_idx = (pstr->bufs_len > pstr->len) 433 ? pstr->len : pstr->bufs_len; 434 byte_idx += mbcdlen; 435 src_idx += mbclen; 436 continue; 437 } 438 else 439 memcpy (pstr->mbs + byte_idx, p, mbclen); 440 } 441 else 442 memcpy (pstr->mbs + byte_idx, p, mbclen); 443 444 if (BE (pstr->offsets_needed != 0, 0)) 445 { 446 size_t i; 447 for (i = 0; i < mbclen; ++i) 448 pstr->offsets[byte_idx + i] = src_idx + i; 449 } 450 src_idx += mbclen; 451 452 pstr->wcs[byte_idx++] = wcu; 453 /* Write paddings. */ 454 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 455 pstr->wcs[byte_idx++] = WEOF; 456 } 457 else if (mbclen == (size_t) -1 || mbclen == 0) 458 { 459 /* It is an invalid character or '\0'. Just use the byte. */ 460 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 461 462 if (BE (pstr->trans != NULL, 0)) 463 ch = pstr->trans [ch]; 464 pstr->mbs[byte_idx] = ch; 465 466 if (BE (pstr->offsets_needed != 0, 0)) 467 pstr->offsets[byte_idx] = src_idx; 468 ++src_idx; 469 470 /* And also cast it to wide char. */ 471 pstr->wcs[byte_idx++] = (wchar_t) ch; 472 if (BE (mbclen == (size_t) -1, 0)) 473 pstr->cur_state = prev_st; 474 } 475 else 476 { 477 /* The buffer doesn't have enough space, finish to build. */ 478 pstr->cur_state = prev_st; 479 break; 480 } 481 } 482 pstr->valid_len = byte_idx; 483 pstr->valid_raw_len = src_idx; 484 return REG_NOERROR; 485} 486 487/* Skip characters until the index becomes greater than NEW_RAW_IDX. 488 Return the index. */ 489 490static Idx 491internal_function 492re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 493{ 494 mbstate_t prev_st; 495 Idx rawbuf_idx; 496 size_t mbclen; 497 wint_t wc = WEOF; 498 499 /* Skip the characters which are not necessary to check. */ 500 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 501 rawbuf_idx < new_raw_idx;) 502 { 503 wchar_t wc2; 504 Idx remain_len; 505 remain_len = pstr->len - rawbuf_idx; 506 prev_st = pstr->cur_state; 507 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 508 remain_len, &pstr->cur_state); 509 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 510 { 511 /* We treat these cases as a single byte character. */ 512 if (mbclen == 0 || remain_len == 0) 513 wc = L'\0'; 514 else 515 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 516 mbclen = 1; 517 pstr->cur_state = prev_st; 518 } 519 else 520 wc = wc2; 521 /* Then proceed the next character. */ 522 rawbuf_idx += mbclen; 523 } 524 *last_wc = wc; 525 return rawbuf_idx; 526} 527#endif /* RE_ENABLE_I18N */ 528 529/* Build the buffer PSTR->MBS, and apply the translation if we need. 530 This function is used in case of REG_ICASE. */ 531 532static void 533internal_function 534build_upper_buffer (re_string_t *pstr) 535{ 536 Idx char_idx, end_idx; 537 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 538 539 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 540 { 541 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 542 if (BE (pstr->trans != NULL, 0)) 543 ch = pstr->trans[ch]; 544 if (islower (ch)) 545 pstr->mbs[char_idx] = toupper (ch); 546 else 547 pstr->mbs[char_idx] = ch; 548 } 549 pstr->valid_len = char_idx; 550 pstr->valid_raw_len = char_idx; 551} 552 553/* Apply TRANS to the buffer in PSTR. */ 554 555static void 556internal_function 557re_string_translate_buffer (re_string_t *pstr) 558{ 559 Idx buf_idx, end_idx; 560 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 561 562 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 563 { 564 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 565 pstr->mbs[buf_idx] = pstr->trans[ch]; 566 } 567 568 pstr->valid_len = buf_idx; 569 pstr->valid_raw_len = buf_idx; 570} 571 572/* This function re-construct the buffers. 573 Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 574 convert to upper case in case of REG_ICASE, apply translation. */ 575 576static reg_errcode_t 577internal_function 578re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 579{ 580 Idx offset; 581 582 if (BE (pstr->raw_mbs_idx <= idx, 0)) 583 offset = idx - pstr->raw_mbs_idx; 584 else 585 { 586 /* Reset buffer. */ 587#ifdef RE_ENABLE_I18N 588 if (pstr->mb_cur_max > 1) 589 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 590#endif /* RE_ENABLE_I18N */ 591 pstr->len = pstr->raw_len; 592 pstr->stop = pstr->raw_stop; 593 pstr->valid_len = 0; 594 pstr->raw_mbs_idx = 0; 595 pstr->valid_raw_len = 0; 596 pstr->offsets_needed = 0; 597 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 598 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 599 if (!pstr->mbs_allocated) 600 pstr->mbs = (unsigned char *) pstr->raw_mbs; 601 offset = idx; 602 } 603 604 if (BE (offset != 0, 1)) 605 { 606 /* Should the already checked characters be kept? */ 607 if (BE (offset < pstr->valid_raw_len, 1)) 608 { 609 /* Yes, move them to the front of the buffer. */ 610#ifdef RE_ENABLE_I18N 611 if (BE (pstr->offsets_needed, 0)) 612 { 613 Idx low = 0, high = pstr->valid_len, mid; 614 do 615 { 616 mid = (high + low) / 2; 617 if (pstr->offsets[mid] > offset) 618 high = mid; 619 else if (pstr->offsets[mid] < offset) 620 low = mid + 1; 621 else 622 break; 623 } 624 while (low < high); 625 if (pstr->offsets[mid] < offset) 626 ++mid; 627 pstr->tip_context = re_string_context_at (pstr, mid - 1, 628 eflags); 629 /* This can be quite complicated, so handle specially 630 only the common and easy case where the character with 631 different length representation of lower and upper 632 case is present at or after offset. */ 633 if (pstr->valid_len > offset 634 && mid == offset && pstr->offsets[mid] == offset) 635 { 636 memmove (pstr->wcs, pstr->wcs + offset, 637 (pstr->valid_len - offset) * sizeof (wint_t)); 638 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 639 pstr->valid_len -= offset; 640 pstr->valid_raw_len -= offset; 641 for (low = 0; low < pstr->valid_len; low++) 642 pstr->offsets[low] = pstr->offsets[low + offset] - offset; 643 } 644 else 645 { 646 /* Otherwise, just find out how long the partial multibyte 647 character at offset is and fill it with WEOF/255. */ 648 pstr->len = pstr->raw_len - idx + offset; 649 pstr->stop = pstr->raw_stop - idx + offset; 650 pstr->offsets_needed = 0; 651 while (mid > 0 && pstr->offsets[mid - 1] == offset) 652 --mid; 653 while (mid < pstr->valid_len) 654 if (pstr->wcs[mid] != WEOF) 655 break; 656 else 657 ++mid; 658 if (mid == pstr->valid_len) 659 pstr->valid_len = 0; 660 else 661 { 662 pstr->valid_len = pstr->offsets[mid] - offset; 663 if (pstr->valid_len) 664 { 665 for (low = 0; low < pstr->valid_len; ++low) 666 pstr->wcs[low] = WEOF; 667 memset (pstr->mbs, 255, pstr->valid_len); 668 } 669 } 670 pstr->valid_raw_len = pstr->valid_len; 671 } 672 } 673 else 674#endif 675 { 676 pstr->tip_context = re_string_context_at (pstr, offset - 1, 677 eflags); 678#ifdef RE_ENABLE_I18N 679 if (pstr->mb_cur_max > 1) 680 memmove (pstr->wcs, pstr->wcs + offset, 681 (pstr->valid_len - offset) * sizeof (wint_t)); 682#endif /* RE_ENABLE_I18N */ 683 if (BE (pstr->mbs_allocated, 0)) 684 memmove (pstr->mbs, pstr->mbs + offset, 685 pstr->valid_len - offset); 686 pstr->valid_len -= offset; 687 pstr->valid_raw_len -= offset; 688#if DEBUG 689 assert (pstr->valid_len > 0); 690#endif 691 } 692 } 693 else 694 { 695#ifdef RE_ENABLE_I18N 696 /* No, skip all characters until IDX. */ 697 Idx prev_valid_len = pstr->valid_len; 698 699 if (BE (pstr->offsets_needed, 0)) 700 { 701 pstr->len = pstr->raw_len - idx + offset; 702 pstr->stop = pstr->raw_stop - idx + offset; 703 pstr->offsets_needed = 0; 704 } 705#endif 706 pstr->valid_len = 0; 707#ifdef RE_ENABLE_I18N 708 if (pstr->mb_cur_max > 1) 709 { 710 Idx wcs_idx; 711 wint_t wc = WEOF; 712 713 if (pstr->is_utf8) 714 { 715 const unsigned char *raw, *p, *end; 716 717 /* Special case UTF-8. Multi-byte chars start with any 718 byte other than 0x80 - 0xbf. */ 719 raw = pstr->raw_mbs + pstr->raw_mbs_idx; 720 end = raw + (offset - pstr->mb_cur_max); 721 if (end < pstr->raw_mbs) 722 end = pstr->raw_mbs; 723 p = raw + offset - 1; 724#ifdef _LIBC 725 /* We know the wchar_t encoding is UCS4, so for the simple 726 case, ASCII characters, skip the conversion step. */ 727 if (isascii (*p) && BE (pstr->trans == NULL, 1)) 728 { 729 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 730 /* pstr->valid_len = 0; */ 731 wc = (wchar_t) *p; 732 } 733 else 734#endif 735 for (; p >= end; --p) 736 if ((*p & 0xc0) != 0x80) 737 { 738 mbstate_t cur_state; 739 wchar_t wc2; 740 Idx mlen = raw + pstr->len - p; 741 unsigned char buf[6]; 742 size_t mbclen; 743 744 if (BE (pstr->trans != NULL, 0)) 745 { 746 int i = mlen < 6 ? mlen : 6; 747 while (--i >= 0) 748 buf[i] = pstr->trans[p[i]]; 749 } 750 /* XXX Don't use mbrtowc, we know which conversion 751 to use (UTF-8 -> UCS4). */ 752 memset (&cur_state, 0, sizeof (cur_state)); 753 mbclen = __mbrtowc (&wc2, (const char *) p, mlen, 754 &cur_state); 755 if (raw + offset - p <= mbclen 756 && mbclen < (size_t) -2) 757 { 758 memset (&pstr->cur_state, '\0', 759 sizeof (mbstate_t)); 760 pstr->valid_len = mbclen - (raw + offset - p); 761 wc = wc2; 762 } 763 break; 764 } 765 } 766 767 if (wc == WEOF) 768 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 769 if (wc == WEOF) 770 pstr->tip_context 771 = re_string_context_at (pstr, prev_valid_len - 1, eflags); 772 else 773 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 774 && IS_WIDE_WORD_CHAR (wc)) 775 ? CONTEXT_WORD 776 : ((IS_WIDE_NEWLINE (wc) 777 && pstr->newline_anchor) 778 ? CONTEXT_NEWLINE : 0)); 779 if (BE (pstr->valid_len, 0)) 780 { 781 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 782 pstr->wcs[wcs_idx] = WEOF; 783 if (pstr->mbs_allocated) 784 memset (pstr->mbs, 255, pstr->valid_len); 785 } 786 pstr->valid_raw_len = pstr->valid_len; 787 } 788 else 789#endif /* RE_ENABLE_I18N */ 790 { 791 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 792 pstr->valid_raw_len = 0; 793 if (pstr->trans) 794 c = pstr->trans[c]; 795 pstr->tip_context = (bitset_contain (pstr->word_char, c) 796 ? CONTEXT_WORD 797 : ((IS_NEWLINE (c) && pstr->newline_anchor) 798 ? CONTEXT_NEWLINE : 0)); 799 } 800 } 801 if (!BE (pstr->mbs_allocated, 0)) 802 pstr->mbs += offset; 803 } 804 pstr->raw_mbs_idx = idx; 805 pstr->len -= offset; 806 pstr->stop -= offset; 807 808 /* Then build the buffers. */ 809#ifdef RE_ENABLE_I18N 810 if (pstr->mb_cur_max > 1) 811 { 812 if (pstr->icase) 813 { 814 reg_errcode_t ret = build_wcs_upper_buffer (pstr); 815 if (BE (ret != REG_NOERROR, 0)) 816 return ret; 817 } 818 else 819 build_wcs_buffer (pstr); 820 } 821 else 822#endif /* RE_ENABLE_I18N */ 823 if (BE (pstr->mbs_allocated, 0)) 824 { 825 if (pstr->icase) 826 build_upper_buffer (pstr); 827 else if (pstr->trans != NULL) 828 re_string_translate_buffer (pstr); 829 } 830 else 831 pstr->valid_len = pstr->len; 832 833 pstr->cur_idx = 0; 834 return REG_NOERROR; 835} 836 837static unsigned char 838internal_function __attribute ((pure)) 839re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 840{ 841 int ch; 842 Idx off; 843 844 /* Handle the common (easiest) cases first. */ 845 if (BE (!pstr->mbs_allocated, 1)) 846 return re_string_peek_byte (pstr, idx); 847 848#ifdef RE_ENABLE_I18N 849 if (pstr->mb_cur_max > 1 850 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 851 return re_string_peek_byte (pstr, idx); 852#endif 853 854 off = pstr->cur_idx + idx; 855#ifdef RE_ENABLE_I18N 856 if (pstr->offsets_needed) 857 off = pstr->offsets[off]; 858#endif 859 860 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 861 862#ifdef RE_ENABLE_I18N 863 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 864 this function returns CAPITAL LETTER I instead of first byte of 865 DOTLESS SMALL LETTER I. The latter would confuse the parser, 866 since peek_byte_case doesn't advance cur_idx in any way. */ 867 if (pstr->offsets_needed && !isascii (ch)) 868 return re_string_peek_byte (pstr, idx); 869#endif 870 871 return ch; 872} 873 874static unsigned char 875internal_function __attribute ((pure)) 876re_string_fetch_byte_case (re_string_t *pstr) 877{ 878 if (BE (!pstr->mbs_allocated, 1)) 879 return re_string_fetch_byte (pstr); 880 881#ifdef RE_ENABLE_I18N 882 if (pstr->offsets_needed) 883 { 884 Idx off; 885 int ch; 886 887 /* For tr_TR.UTF-8 [[:islower:]] there is 888 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 889 in that case the whole multi-byte character and return 890 the original letter. On the other side, with 891 [[: DOTLESS SMALL LETTER I return [[:I, as doing 892 anything else would complicate things too much. */ 893 894 if (!re_string_first_byte (pstr, pstr->cur_idx)) 895 return re_string_fetch_byte (pstr); 896 897 off = pstr->offsets[pstr->cur_idx]; 898 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 899 900 if (! isascii (ch)) 901 return re_string_fetch_byte (pstr); 902 903 re_string_skip_bytes (pstr, 904 re_string_char_size_at (pstr, pstr->cur_idx)); 905 return ch; 906 } 907#endif 908 909 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 910} 911 912static void 913internal_function 914re_string_destruct (re_string_t *pstr) 915{ 916#ifdef RE_ENABLE_I18N 917 re_free (pstr->wcs); 918 re_free (pstr->offsets); 919#endif /* RE_ENABLE_I18N */ 920 if (pstr->mbs_allocated) 921 re_free (pstr->mbs); 922} 923 924/* Return the context at IDX in INPUT. */ 925 926static unsigned int 927internal_function 928re_string_context_at (const re_string_t *input, Idx idx, int eflags) 929{ 930 int c; 931 if (BE (! REG_VALID_INDEX (idx), 0)) 932 /* In this case, we use the value stored in input->tip_context, 933 since we can't know the character in input->mbs[-1] here. */ 934 return input->tip_context; 935 if (BE (idx == input->len, 0)) 936 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 937 : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 938#ifdef RE_ENABLE_I18N 939 if (input->mb_cur_max > 1) 940 { 941 wint_t wc; 942 Idx wc_idx = idx; 943 while(input->wcs[wc_idx] == WEOF) 944 { 945#ifdef DEBUG 946 /* It must not happen. */ 947 assert (REG_VALID_INDEX (wc_idx)); 948#endif 949 --wc_idx; 950 if (! REG_VALID_INDEX (wc_idx)) 951 return input->tip_context; 952 } 953 wc = input->wcs[wc_idx]; 954 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 955 return CONTEXT_WORD; 956 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 957 ? CONTEXT_NEWLINE : 0); 958 } 959 else 960#endif 961 { 962 c = re_string_byte_at (input, idx); 963 if (bitset_contain (input->word_char, c)) 964 return CONTEXT_WORD; 965 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 966 } 967} 968 969/* Functions for set operation. */ 970 971static reg_errcode_t 972internal_function 973re_node_set_alloc (re_node_set *set, Idx size) 974{ 975 set->alloc = size; 976 set->nelem = 0; 977 set->elems = re_malloc (Idx, size); 978 if (BE (set->elems == NULL, 0)) 979 return REG_ESPACE; 980 return REG_NOERROR; 981} 982 983static reg_errcode_t 984internal_function 985re_node_set_init_1 (re_node_set *set, Idx elem) 986{ 987 set->alloc = 1; 988 set->nelem = 1; 989 set->elems = re_malloc (Idx, 1); 990 if (BE (set->elems == NULL, 0)) 991 { 992 set->alloc = set->nelem = 0; 993 return REG_ESPACE; 994 } 995 set->elems[0] = elem; 996 return REG_NOERROR; 997} 998 999static reg_errcode_t 1000internal_function 1001re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 1002{ 1003 set->alloc = 2; 1004 set->elems = re_malloc (Idx, 2); 1005 if (BE (set->elems == NULL, 0)) 1006 return REG_ESPACE; 1007 if (elem1 == elem2) 1008 { 1009 set->nelem = 1; 1010 set->elems[0] = elem1; 1011 } 1012 else 1013 { 1014 set->nelem = 2; 1015 if (elem1 < elem2) 1016 { 1017 set->elems[0] = elem1; 1018 set->elems[1] = elem2; 1019 } 1020 else 1021 { 1022 set->elems[0] = elem2; 1023 set->elems[1] = elem1; 1024 } 1025 } 1026 return REG_NOERROR; 1027} 1028 1029static reg_errcode_t 1030internal_function 1031re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1032{ 1033 dest->nelem = src->nelem; 1034 if (src->nelem > 0) 1035 { 1036 dest->alloc = dest->nelem; 1037 dest->elems = re_malloc (Idx, dest->alloc); 1038 if (BE (dest->elems == NULL, 0)) 1039 { 1040 dest->alloc = dest->nelem = 0; 1041 return REG_ESPACE; 1042 } 1043 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1044 } 1045 else 1046 re_node_set_init_empty (dest); 1047 return REG_NOERROR; 1048} 1049 1050/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1051 DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1052 Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1053 1054static reg_errcode_t 1055internal_function 1056re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1057 const re_node_set *src2) 1058{ 1059 Idx i1, i2, is, id, delta, sbase; 1060 if (src1->nelem == 0 || src2->nelem == 0) 1061 return REG_NOERROR; 1062 1063 /* We need dest->nelem + 2 * elems_in_intersection; this is a 1064 conservative estimate. */ 1065 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1066 { 1067 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 1068 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc); 1069 if (BE (new_elems == NULL, 0)) 1070 return REG_ESPACE; 1071 dest->elems = new_elems; 1072 dest->alloc = new_alloc; 1073 } 1074 1075 /* Find the items in the intersection of SRC1 and SRC2, and copy 1076 into the top of DEST those that are not already in DEST itself. */ 1077 sbase = dest->nelem + src1->nelem + src2->nelem; 1078 i1 = src1->nelem - 1; 1079 i2 = src2->nelem - 1; 1080 id = dest->nelem - 1; 1081 for (;;) 1082 { 1083 if (src1->elems[i1] == src2->elems[i2]) 1084 { 1085 /* Try to find the item in DEST. Maybe we could binary search? */ 1086 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 1087 --id; 1088 1089 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 1090 dest->elems[--sbase] = src1->elems[i1]; 1091 1092 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 1093 break; 1094 } 1095 1096 /* Lower the highest of the two items. */ 1097 else if (src1->elems[i1] < src2->elems[i2]) 1098 { 1099 if (! REG_VALID_INDEX (--i2)) 1100 break; 1101 } 1102 else 1103 { 1104 if (! REG_VALID_INDEX (--i1)) 1105 break; 1106 } 1107 } 1108 1109 id = dest->nelem - 1; 1110 is = dest->nelem + src1->nelem + src2->nelem - 1; 1111 delta = is - sbase + 1; 1112 1113 /* Now copy. When DELTA becomes zero, the remaining 1114 DEST elements are already in place; this is more or 1115 less the same loop that is in re_node_set_merge. */ 1116 dest->nelem += delta; 1117 if (delta > 0 && REG_VALID_INDEX (id)) 1118 for (;;) 1119 { 1120 if (dest->elems[is] > dest->elems[id]) 1121 { 1122 /* Copy from the top. */ 1123 dest->elems[id + delta--] = dest->elems[is--]; 1124 if (delta == 0) 1125 break; 1126 } 1127 else 1128 { 1129 /* Slide from the bottom. */ 1130 dest->elems[id + delta] = dest->elems[id]; 1131 if (! REG_VALID_INDEX (--id)) 1132 break; 1133 } 1134 } 1135 1136 /* Copy remaining SRC elements. */ 1137 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx)); 1138 1139 return REG_NOERROR; 1140} 1141 1142/* Calculate the union set of the sets SRC1 and SRC2. And store it to 1143 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1144 1145static reg_errcode_t 1146internal_function 1147re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1148 const re_node_set *src2) 1149{ 1150 Idx i1, i2, id; 1151 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1152 { 1153 dest->alloc = src1->nelem + src2->nelem; 1154 dest->elems = re_malloc (Idx, dest->alloc); 1155 if (BE (dest->elems == NULL, 0)) 1156 return REG_ESPACE; 1157 } 1158 else 1159 { 1160 if (src1 != NULL && src1->nelem > 0) 1161 return re_node_set_init_copy (dest, src1); 1162 else if (src2 != NULL && src2->nelem > 0) 1163 return re_node_set_init_copy (dest, src2); 1164 else 1165 re_node_set_init_empty (dest); 1166 return REG_NOERROR; 1167 } 1168 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1169 { 1170 if (src1->elems[i1] > src2->elems[i2]) 1171 { 1172 dest->elems[id++] = src2->elems[i2++]; 1173 continue; 1174 } 1175 if (src1->elems[i1] == src2->elems[i2]) 1176 ++i2; 1177 dest->elems[id++] = src1->elems[i1++]; 1178 } 1179 if (i1 < src1->nelem) 1180 { 1181 memcpy (dest->elems + id, src1->elems + i1, 1182 (src1->nelem - i1) * sizeof (Idx)); 1183 id += src1->nelem - i1; 1184 } 1185 else if (i2 < src2->nelem) 1186 { 1187 memcpy (dest->elems + id, src2->elems + i2, 1188 (src2->nelem - i2) * sizeof (Idx)); 1189 id += src2->nelem - i2; 1190 } 1191 dest->nelem = id; 1192 return REG_NOERROR; 1193} 1194 1195/* Calculate the union set of the sets DEST and SRC. And store it to 1196 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1197 1198static reg_errcode_t 1199internal_function 1200re_node_set_merge (re_node_set *dest, const re_node_set *src) 1201{ 1202 Idx is, id, sbase, delta; 1203 if (src == NULL || src->nelem == 0) 1204 return REG_NOERROR; 1205 if (dest->alloc < 2 * src->nelem + dest->nelem) 1206 { 1207 Idx new_alloc = 2 * (src->nelem + dest->alloc); 1208 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc); 1209 if (BE (new_buffer == NULL, 0)) 1210 return REG_ESPACE; 1211 dest->elems = new_buffer; 1212 dest->alloc = new_alloc; 1213 } 1214 1215 if (BE (dest->nelem == 0, 0)) 1216 { 1217 dest->nelem = src->nelem; 1218 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1219 return REG_NOERROR; 1220 } 1221 1222 /* Copy into the top of DEST the items of SRC that are not 1223 found in DEST. Maybe we could binary search in DEST? */ 1224 for (sbase = dest->nelem + 2 * src->nelem, 1225 is = src->nelem - 1, id = dest->nelem - 1; 1226 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 1227 { 1228 if (dest->elems[id] == src->elems[is]) 1229 is--, id--; 1230 else if (dest->elems[id] < src->elems[is]) 1231 dest->elems[--sbase] = src->elems[is--]; 1232 else /* if (dest->elems[id] > src->elems[is]) */ 1233 --id; 1234 } 1235 1236 if (REG_VALID_INDEX (is)) 1237 { 1238 /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1239 sbase -= is + 1; 1240 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx)); 1241 } 1242 1243 id = dest->nelem - 1; 1244 is = dest->nelem + 2 * src->nelem - 1; 1245 delta = is - sbase + 1; 1246 if (delta == 0) 1247 return REG_NOERROR; 1248 1249 /* Now copy. When DELTA becomes zero, the remaining 1250 DEST elements are already in place. */ 1251 dest->nelem += delta; 1252 for (;;) 1253 { 1254 if (dest->elems[is] > dest->elems[id]) 1255 { 1256 /* Copy from the top. */ 1257 dest->elems[id + delta--] = dest->elems[is--]; 1258 if (delta == 0) 1259 break; 1260 } 1261 else 1262 { 1263 /* Slide from the bottom. */ 1264 dest->elems[id + delta] = dest->elems[id]; 1265 if (! REG_VALID_INDEX (--id)) 1266 { 1267 /* Copy remaining SRC elements. */ 1268 memcpy (dest->elems, dest->elems + sbase, 1269 delta * sizeof (Idx)); 1270 break; 1271 } 1272 } 1273 } 1274 1275 return REG_NOERROR; 1276} 1277 1278/* Insert the new element ELEM to the re_node_set* SET. 1279 SET should not already have ELEM. 1280 Return true if successful. */ 1281 1282static bool 1283internal_function 1284re_node_set_insert (re_node_set *set, Idx elem) 1285{ 1286 Idx idx; 1287 /* In case the set is empty. */ 1288 if (set->alloc == 0) 1289 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1); 1290 1291 if (BE (set->nelem, 0) == 0) 1292 { 1293 /* We already guaranteed above that set->alloc != 0. */ 1294 set->elems[0] = elem; 1295 ++set->nelem; 1296 return true; 1297 } 1298 1299 /* Realloc if we need. */ 1300 if (set->alloc == set->nelem) 1301 { 1302 Idx *new_elems; 1303 set->alloc = set->alloc * 2; 1304 new_elems = re_realloc (set->elems, Idx, set->alloc); 1305 if (BE (new_elems == NULL, 0)) 1306 return false; 1307 set->elems = new_elems; 1308 } 1309 1310 /* Move the elements which follows the new element. Test the 1311 first element separately to skip a check in the inner loop. */ 1312 if (elem < set->elems[0]) 1313 { 1314 idx = 0; 1315 for (idx = set->nelem; idx > 0; idx--) 1316 set->elems[idx] = set->elems[idx - 1]; 1317 } 1318 else 1319 { 1320 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1321 set->elems[idx] = set->elems[idx - 1]; 1322 } 1323 1324 /* Insert the new element. */ 1325 set->elems[idx] = elem; 1326 ++set->nelem; 1327 return true; 1328} 1329 1330/* Insert the new element ELEM to the re_node_set* SET. 1331 SET should not already have any element greater than or equal to ELEM. 1332 Return true if successful. */ 1333 1334static bool 1335internal_function 1336re_node_set_insert_last (re_node_set *set, Idx elem) 1337{ 1338 /* Realloc if we need. */ 1339 if (set->alloc == set->nelem) 1340 { 1341 Idx *new_elems; 1342 set->alloc = (set->alloc + 1) * 2; 1343 new_elems = re_realloc (set->elems, Idx, set->alloc); 1344 if (BE (new_elems == NULL, 0)) 1345 return false; 1346 set->elems = new_elems; 1347 } 1348 1349 /* Insert the new element. */ 1350 set->elems[set->nelem++] = elem; 1351 return true; 1352} 1353 1354/* Compare two node sets SET1 and SET2. 1355 Return true if SET1 and SET2 are equivalent. */ 1356 1357static bool 1358internal_function __attribute ((pure)) 1359re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1360{ 1361 Idx i; 1362 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1363 return false; 1364 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 1365 if (set1->elems[i] != set2->elems[i]) 1366 return false; 1367 return true; 1368} 1369 1370/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1371 1372static Idx 1373internal_function __attribute ((pure)) 1374re_node_set_contains (const re_node_set *set, Idx elem) 1375{ 1376 __re_size_t idx, right, mid; 1377 if (! REG_VALID_NONZERO_INDEX (set->nelem)) 1378 return 0; 1379 1380 /* Binary search the element. */ 1381 idx = 0; 1382 right = set->nelem - 1; 1383 while (idx < right) 1384 { 1385 mid = (idx + right) / 2; 1386 if (set->elems[mid] < elem) 1387 idx = mid + 1; 1388 else 1389 right = mid; 1390 } 1391 return set->elems[idx] == elem ? idx + 1 : 0; 1392} 1393 1394static void 1395internal_function 1396re_node_set_remove_at (re_node_set *set, Idx idx) 1397{ 1398 verify (! TYPE_SIGNED (Idx)); 1399 /* if (idx < 0) 1400 return; */ 1401 if (idx >= set->nelem) 1402 return; 1403 --set->nelem; 1404 for (; idx < set->nelem; idx++) 1405 set->elems[idx] = set->elems[idx + 1]; 1406} 1407 1408 1409/* Add the token TOKEN to dfa->nodes, and return the index of the token. 1410 Or return REG_MISSING if an error occurred. */ 1411 1412static Idx 1413internal_function 1414re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1415{ 1416 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1417 { 1418 size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1419 Idx *new_nexts, *new_indices; 1420 re_node_set *new_edests, *new_eclosures; 1421 re_token_t *new_nodes; 1422 size_t max_object_size = 1423 MAX (sizeof (re_token_t), 1424 MAX (sizeof (re_node_set), 1425 sizeof (Idx))); 1426 1427 /* Avoid overflows. */ 1428 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0)) 1429 return REG_MISSING; 1430 1431 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1432 if (BE (new_nodes == NULL, 0)) 1433 return REG_MISSING; 1434 dfa->nodes = new_nodes; 1435 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 1436 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 1437 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1438 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1439 if (BE (new_nexts == NULL || new_indices == NULL 1440 || new_edests == NULL || new_eclosures == NULL, 0)) 1441 return REG_MISSING; 1442 dfa->nexts = new_nexts; 1443 dfa->org_indices = new_indices; 1444 dfa->edests = new_edests; 1445 dfa->eclosures = new_eclosures; 1446 dfa->nodes_alloc = new_nodes_alloc; 1447 } 1448 dfa->nodes[dfa->nodes_len] = token; 1449 dfa->nodes[dfa->nodes_len].constraint = 0; 1450#ifdef RE_ENABLE_I18N 1451 { 1452 int type = token.type; 1453 dfa->nodes[dfa->nodes_len].accept_mb = 1454 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1455 } 1456#endif 1457 dfa->nexts[dfa->nodes_len] = REG_MISSING; 1458 re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1459 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1460 return dfa->nodes_len++; 1461} 1462 1463static inline re_hashval_t 1464internal_function 1465calc_state_hash (const re_node_set *nodes, unsigned int context) 1466{ 1467 re_hashval_t hash = nodes->nelem + context; 1468 Idx i; 1469 for (i = 0 ; i < nodes->nelem ; i++) 1470 hash += nodes->elems[i]; 1471 return hash; 1472} 1473 1474/* Search for the state whose node_set is equivalent to NODES. 1475 Return the pointer to the state, if we found it in the DFA. 1476 Otherwise create the new one and return it. In case of an error 1477 return NULL and set the error code in ERR. 1478 Note: - We assume NULL as the invalid state, then it is possible that 1479 return value is NULL and ERR is REG_NOERROR. 1480 - We never return non-NULL value in case of any errors, it is for 1481 optimization. */ 1482 1483static re_dfastate_t * 1484internal_function 1485re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1486 const re_node_set *nodes) 1487{ 1488 re_hashval_t hash; 1489 re_dfastate_t *new_state; 1490 struct re_state_table_entry *spot; 1491 Idx i; 1492#ifdef lint 1493 /* Suppress bogus uninitialized-variable warnings. */ 1494 *err = REG_NOERROR; 1495#endif 1496 if (BE (nodes->nelem == 0, 0)) 1497 { 1498 *err = REG_NOERROR; 1499 return NULL; 1500 } 1501 hash = calc_state_hash (nodes, 0); 1502 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1503 1504 for (i = 0 ; i < spot->num ; i++) 1505 { 1506 re_dfastate_t *state = spot->array[i]; 1507 if (hash != state->hash) 1508 continue; 1509 if (re_node_set_compare (&state->nodes, nodes)) 1510 return state; 1511 } 1512 1513 /* There are no appropriate state in the dfa, create the new one. */ 1514 new_state = create_ci_newstate (dfa, nodes, hash); 1515 if (BE (new_state == NULL, 0)) 1516 *err = REG_ESPACE; 1517 1518 return new_state; 1519} 1520 1521/* Search for the state whose node_set is equivalent to NODES and 1522 whose context is equivalent to CONTEXT. 1523 Return the pointer to the state, if we found it in the DFA. 1524 Otherwise create the new one and return it. In case of an error 1525 return NULL and set the error code in ERR. 1526 Note: - We assume NULL as the invalid state, then it is possible that 1527 return value is NULL and ERR is REG_NOERROR. 1528 - We never return non-NULL value in case of any errors, it is for 1529 optimization. */ 1530 1531static re_dfastate_t * 1532internal_function 1533re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1534 const re_node_set *nodes, unsigned int context) 1535{ 1536 re_hashval_t hash; 1537 re_dfastate_t *new_state; 1538 struct re_state_table_entry *spot; 1539 Idx i; 1540#ifdef lint 1541 /* Suppress bogus uninitialized-variable warnings. */ 1542 *err = REG_NOERROR; 1543#endif 1544 if (nodes->nelem == 0) 1545 { 1546 *err = REG_NOERROR; 1547 return NULL; 1548 } 1549 hash = calc_state_hash (nodes, context); 1550 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1551 1552 for (i = 0 ; i < spot->num ; i++) 1553 { 1554 re_dfastate_t *state = spot->array[i]; 1555 if (state->hash == hash 1556 && state->context == context 1557 && re_node_set_compare (state->entrance_nodes, nodes)) 1558 return state; 1559 } 1560 /* There are no appropriate state in `dfa', create the new one. */ 1561 new_state = create_cd_newstate (dfa, nodes, context, hash); 1562 if (BE (new_state == NULL, 0)) 1563 *err = REG_ESPACE; 1564 1565 return new_state; 1566} 1567 1568/* Finish initialization of the new state NEWSTATE, and using its hash value 1569 HASH put in the appropriate bucket of DFA's state table. Return value 1570 indicates the error code if failed. */ 1571 1572static reg_errcode_t 1573register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1574 re_hashval_t hash) 1575{ 1576 struct re_state_table_entry *spot; 1577 reg_errcode_t err; 1578 Idx i; 1579 1580 newstate->hash = hash; 1581 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1582 if (BE (err != REG_NOERROR, 0)) 1583 return REG_ESPACE; 1584 for (i = 0; i < newstate->nodes.nelem; i++) 1585 { 1586 Idx elem = newstate->nodes.elems[i]; 1587 if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1588 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0)) 1589 return REG_ESPACE; 1590 } 1591 1592 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1593 if (BE (spot->alloc <= spot->num, 0)) 1594 { 1595 Idx new_alloc = 2 * spot->num + 2; 1596 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1597 new_alloc); 1598 if (BE (new_array == NULL, 0)) 1599 return REG_ESPACE; 1600 spot->array = new_array; 1601 spot->alloc = new_alloc; 1602 } 1603 spot->array[spot->num++] = newstate; 1604 return REG_NOERROR; 1605} 1606 1607static void 1608free_state (re_dfastate_t *state) 1609{ 1610 re_node_set_free (&state->non_eps_nodes); 1611 re_node_set_free (&state->inveclosure); 1612 if (state->entrance_nodes != &state->nodes) 1613 { 1614 re_node_set_free (state->entrance_nodes); 1615 re_free (state->entrance_nodes); 1616 } 1617 re_node_set_free (&state->nodes); 1618 re_free (state->word_trtable); 1619 re_free (state->trtable); 1620 re_free (state); 1621} 1622 1623/* Create the new state which is independ of contexts. 1624 Return the new state if succeeded, otherwise return NULL. */ 1625 1626static re_dfastate_t * 1627internal_function 1628create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1629 re_hashval_t hash) 1630{ 1631 Idx i; 1632 reg_errcode_t err; 1633 re_dfastate_t *newstate; 1634 1635 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1636 if (BE (newstate == NULL, 0)) 1637 return NULL; 1638 err = re_node_set_init_copy (&newstate->nodes, nodes); 1639 if (BE (err != REG_NOERROR, 0)) 1640 { 1641 re_free (newstate); 1642 return NULL; 1643 } 1644 1645 newstate->entrance_nodes = &newstate->nodes; 1646 for (i = 0 ; i < nodes->nelem ; i++) 1647 { 1648 re_token_t *node = dfa->nodes + nodes->elems[i]; 1649 re_token_type_t type = node->type; 1650 if (type == CHARACTER && !node->constraint) 1651 continue; 1652#ifdef RE_ENABLE_I18N 1653 newstate->accept_mb |= node->accept_mb; 1654#endif /* RE_ENABLE_I18N */ 1655 1656 /* If the state has the halt node, the state is a halt state. */ 1657 if (type == END_OF_RE) 1658 newstate->halt = 1; 1659 else if (type == OP_BACK_REF) 1660 newstate->has_backref = 1; 1661 else if (type == ANCHOR || node->constraint) 1662 newstate->has_constraint = 1; 1663 } 1664 err = register_state (dfa, newstate, hash); 1665 if (BE (err != REG_NOERROR, 0)) 1666 { 1667 free_state (newstate); 1668 newstate = NULL; 1669 } 1670 return newstate; 1671} 1672 1673/* Create the new state which is depend on the context CONTEXT. 1674 Return the new state if succeeded, otherwise return NULL. */ 1675 1676static re_dfastate_t * 1677internal_function 1678create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1679 unsigned int context, re_hashval_t hash) 1680{ 1681 Idx i, nctx_nodes = 0; 1682 reg_errcode_t err; 1683 re_dfastate_t *newstate; 1684 1685 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1686 if (BE (newstate == NULL, 0)) 1687 return NULL; 1688 err = re_node_set_init_copy (&newstate->nodes, nodes); 1689 if (BE (err != REG_NOERROR, 0)) 1690 { 1691 re_free (newstate); 1692 return NULL; 1693 } 1694 1695 newstate->context = context; 1696 newstate->entrance_nodes = &newstate->nodes; 1697 1698 for (i = 0 ; i < nodes->nelem ; i++) 1699 { 1700 re_token_t *node = dfa->nodes + nodes->elems[i]; 1701 re_token_type_t type = node->type; 1702 unsigned int constraint = node->constraint; 1703 1704 if (type == CHARACTER && !constraint) 1705 continue; 1706#ifdef RE_ENABLE_I18N 1707 newstate->accept_mb |= node->accept_mb; 1708#endif /* RE_ENABLE_I18N */ 1709 1710 /* If the state has the halt node, the state is a halt state. */ 1711 if (type == END_OF_RE) 1712 newstate->halt = 1; 1713 else if (type == OP_BACK_REF) 1714 newstate->has_backref = 1; 1715 1716 if (constraint) 1717 { 1718 if (newstate->entrance_nodes == &newstate->nodes) 1719 { 1720 newstate->entrance_nodes = re_malloc (re_node_set, 1); 1721 if (BE (newstate->entrance_nodes == NULL, 0)) 1722 { 1723 free_state (newstate); 1724 return NULL; 1725 } 1726 re_node_set_init_copy (newstate->entrance_nodes, nodes); 1727 nctx_nodes = 0; 1728 newstate->has_constraint = 1; 1729 } 1730 1731 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1732 { 1733 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1734 ++nctx_nodes; 1735 } 1736 } 1737 } 1738 err = register_state (dfa, newstate, hash); 1739 if (BE (err != REG_NOERROR, 0)) 1740 { 1741 free_state (newstate); 1742 newstate = NULL; 1743 } 1744 return newstate; 1745} 1746