1/* 2 tre_regexec.c - TRE POSIX compatible matching functions (and more). 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7*/ 8 9#ifdef HAVE_CONFIG_H 10#include <config.h> 11#endif /* HAVE_CONFIG_H */ 12 13#ifdef TRE_USE_ALLOCA 14/* AIX requires this to be the first thing in the file. */ 15#ifndef __GNUC__ 16# if HAVE_ALLOCA_H 17# include <alloca.h> 18# else 19# ifdef _AIX 20 #pragma alloca 21# else 22# ifndef alloca /* predefined by HP cc +Olibcalls */ 23char *alloca (); 24# endif 25# endif 26# endif 27#endif 28#endif /* TRE_USE_ALLOCA */ 29 30#include <assert.h> 31#include <stdlib.h> 32#include <string.h> 33#ifdef HAVE_WCHAR_H 34#include <wchar.h> 35#endif /* HAVE_WCHAR_H */ 36#ifdef HAVE_WCTYPE_H 37#include <wctype.h> 38#endif /* HAVE_WCTYPE_H */ 39#ifndef TRE_WCHAR 40#include <ctype.h> 41#endif /* !TRE_WCHAR */ 42#ifdef HAVE_MALLOC_H 43#include <malloc.h> 44#endif /* HAVE_MALLOC_H */ 45#include <limits.h> 46 47#include "tre-internal.h" 48#include "tre-match-utils.h" 49#include "tre.h" 50#include "xmalloc.h" 51 52 53/* For each tre_last_matched_t in the lm array, find the last matched branch by 54 comparing the touch value of the cmp_tag's. For all other branches, reset 55 the corresponding tags. If reset_all is non-zero, reset all tags in all 56 branches. Recurse into the nested last matched structures, clearing tags as 57 apprpriate. */ 58static void 59tre_reset_last_matched_branches(tre_tag_t *tags, const tre_last_matched_t *lm, 60 int n, int start_tag, int reset_all) 61{ 62 int max, i, reset; 63 tre_last_matched_branch_t *b; 64 65 DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n", 66 n, start_tag, reset_all)); 67 for (; n-- > 0; lm++) 68 { 69 if (lm->n_branches == 1) 70 { 71 b = lm->branches; 72 if (start_tag > 0) 73 { 74 DPRINT((" b->cmp_tag=%d %d <? %d\n", b->cmp_tag, 75 tre_tag_touch_get(tags, b->cmp_tag), 76 tre_tag_touch_get(tags, start_tag))); 77 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < 78 tre_tag_touch_get(tags, start_tag)); 79 } 80 else 81 reset = 0; 82 83 if (reset) 84 { 85 int *t; 86 87 for (i = b->n_tags, t = b->tags; i > 0; i--, t++) 88 { 89 DPRINT((" Resetting t%d\n", *t)); 90 tre_tag_reset(tags, *t); 91 } 92 } 93 if (b->n_last_matched > 0) 94 tre_reset_last_matched_branches(tags, b->last_matched, 95 b->n_last_matched, 96 lm->start_tag, reset); 97 } 98 else 99 { 100 if (!reset_all) 101 { 102#ifdef TRE_DEBUG 103 int last; 104#endif /* TRE_DEBUG */ 105 max = 0; 106 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++) 107 { 108 int t = b->cmp_tag; 109 int touch = tre_tag_touch_get(tags, t); 110 if (touch > max) 111 { 112 max = touch; 113#ifdef TRE_DEBUG 114 last = t; 115#endif /* TRE_DEBUG */ 116 } 117 } 118 DPRINT((" Last touched end tag t%d=%d\n", last, max)); 119 } 120 121 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++) 122 { 123 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < max); 124 if (reset) 125 { 126 int j; 127 int *t; 128 129 for (j = b->n_tags, t = b->tags; j > 0; j--, t++) 130 { 131 DPRINT((" Resetting t%d\n", *t)); 132 tre_tag_reset(tags, *t); 133 } 134 } 135 if (b->n_last_matched > 0) 136 tre_reset_last_matched_branches(tags, b->last_matched, 137 b->n_last_matched, 138 lm->start_tag, reset); 139 } 140 } 141 } 142} 143 144 145/* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match 146 endpoint values. */ 147reg_errcode_t 148tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, 149 const tre_tnfa_t *tnfa, const tre_tag_t *intags, int match_eo) 150{ 151 unsigned int i; 152 153 if (cflags & REG_NOSUB) return REG_OK; 154 155 i = 0; 156 if (match_eo >= 0 && intags) 157 { 158 const tre_tag_t *tags = intags; 159 tre_submatch_data_t *submatch_data; 160 161 if (tnfa->last_matched_branch && 162 tnfa->last_matched_branch->n_last_matched > 0) 163 { 164 tre_tag_t *t; 165#ifdef TRE_USE_ALLOCA 166 t = alloca(sizeof(*t) * tnfa->num_tags); 167#else /* !TRE_USE_ALLOCA */ 168 t = xmalloc(sizeof(*t) * tnfa->num_tags); 169#endif /* !TRE_USE_ALLOCA */ 170 if (!t) return REG_ESPACE; 171 memcpy(t, intags, tnfa->num_tags * sizeof(tre_tag_t)); 172 tre_reset_last_matched_branches(t, 173 tnfa->last_matched_branch->last_matched, 174 tnfa->last_matched_branch->n_last_matched, 175 0, 0); 176 tags = t; 177 } 178 /* Construct submatch offsets from the tags. */ 179 DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); 180 submatch_data = tnfa->submatch_data; 181 while (i < tnfa->num_submatches && i < nmatch) 182 { 183 if (submatch_data[i].so_tag == tnfa->end_tag) 184 pmatch[i].rm_so = match_eo; 185 else 186 pmatch[i].rm_so = tre_tag_get(tags, submatch_data[i].so_tag); 187 188 if (submatch_data[i].eo_tag == tnfa->end_tag) 189 pmatch[i].rm_eo = match_eo; 190 else 191 pmatch[i].rm_eo = tre_tag_get(tags, submatch_data[i].eo_tag); 192 193 /* If either of the endpoints were not used, this submatch 194 was not part of the match. */ 195 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) 196 pmatch[i].rm_so = pmatch[i].rm_eo = -1; 197 198 DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i, 199 submatch_data[i].so_tag, pmatch[i].rm_so, 200 submatch_data[i].eo_tag, pmatch[i].rm_eo)); 201 i++; 202 } 203#ifndef TRE_USE_ALLOCA 204 if (tags != intags) xfree(tags); 205#endif /* !TRE_USE_ALLOCA */ 206 } 207 208 while (i < nmatch) 209 { 210 pmatch[i].rm_so = -1; 211 pmatch[i].rm_eo = -1; 212 i++; 213 } 214 215 return REG_OK; 216} 217 218 219/* 220 Wrapper functions for POSIX compatible regexp matching. 221*/ 222 223#ifndef __LIBC__ 224int 225tre_have_backrefs(const regex_t *preg) 226{ 227 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 228 return tnfa->have_backrefs; 229} 230 231#ifdef TRE_APPROX 232int 233tre_have_approx(const regex_t *preg) 234{ 235 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 236 return tnfa->have_approx; 237} 238#endif /* TRE_APPROX */ 239#endif /* !__LIBC__ */ 240 241static int 242tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, 243 tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], 244 int eflags) 245{ 246 reg_errcode_t status; 247 tre_tag_t *tags = NULL; 248 int eo; 249 size_t offset = 0, count = 0; 250 if (tnfa->num_tags > 0 && nmatch > 0) 251 { 252#ifdef TRE_USE_ALLOCA 253 tags = alloca(sizeof(*tags) * tnfa->num_tags); 254#else /* !TRE_USE_ALLOCA */ 255 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 256#endif /* !TRE_USE_ALLOCA */ 257 if (tags == NULL) 258 return REG_ESPACE; 259 } 260 261 if ( 262#ifdef TRE_STR_USER 263 type != STR_USER && 264#endif /* TRE_STR_USER */ 265 (eflags & REG_STARTEND) && pmatch) 266 { 267 if (pmatch->rm_so < 0) 268 return REG_INVARG; 269 if (len == (size_t)-1) 270 { 271 if (pmatch->rm_eo < 0 || pmatch->rm_so > pmatch->rm_eo) 272 return REG_INVARG; 273 len = pmatch->rm_eo - pmatch->rm_so; 274 } 275 count = offset = pmatch->rm_so; 276 if (type == STR_WIDE) offset *= sizeof(wchar_t); 277 } 278 279 /* Dispatch to the appropriate matcher. */ 280 if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER) 281 { 282 /* The regex has back references, use the backtracking matcher. */ 283#ifdef TRE_STR_USER 284 if (type == STR_USER) 285 { 286 const tre_str_source *source = string; 287 if (source->rewind == NULL || source->compare == NULL) 288 /* The backtracking matcher requires rewind and compare 289 capabilities from the input stream. */ 290 return REG_BADPAT; 291 } 292#endif /* TRE_STR_USER */ 293 status = tre_tnfa_run_backtrack(tnfa, string + offset, (int)len, type, 294 tags, eflags, &eo); 295 } 296#ifdef TRE_APPROX 297 else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER) 298 { 299 /* The regex uses approximate matching, use the approximate matcher. */ 300 regamatch_t match; 301 regaparams_t params; 302 tre_regaparams_default(¶ms); 303 params.max_err = 0; 304 params.max_cost = 0; 305 status = tre_tnfa_run_approx(tnfa, string + offset, (int)len, type, tags, 306 &match, params, eflags, &eo); 307 } 308#endif /* TRE_APPROX */ 309 else 310 { 311 /* Exact matching, no back references, use the parallel matcher. */ 312 status = tre_tnfa_run_parallel(tnfa, string + offset, (int)len, type, 313 tags, eflags, &eo); 314 } 315 316 if (status == REG_OK) 317 { 318 /* A match was found, so fill the submatch registers. */ 319 status = tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); 320 /* If doing REG_STARTEND, adjust the pmatch array (we can't build 321 this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls 322 tre_fill_pmatch itself). */ 323 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) && 324#ifdef TRE_STR_USER 325 type != STR_USER && 326#endif /* TRE_STR_USER */ 327 (eflags & REG_STARTEND) && pmatch && nmatch > 0) 328 { 329 size_t i; 330 regmatch_t *p; 331 for (i = nmatch, p = pmatch; i > 0; p++, i--) 332 { 333 if (p->rm_so >= 0) p->rm_so += count; 334 if (p->rm_eo >= 0) p->rm_eo += count; 335 } 336 } 337 } 338#ifndef TRE_USE_ALLOCA 339 if (tags) 340 xfree(tags); 341#endif /* !TRE_USE_ALLOCA */ 342 return status; 343} 344 345int 346tre_regnexec(const regex_t *preg, const char *str, size_t len, 347 size_t nmatch, regmatch_t pmatch[], int eflags) 348{ 349 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 350 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS; 351 352#ifdef TRE_USE_SYSTEM_REGEX_H 353 if (preg->re_magic != RE_MAGIC) return REG_BADPAT; 354#endif /* TRE_USE_SYSTEM_REGEX_H */ 355 356 return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags); 357} 358 359int 360tre_regexec(const regex_t *preg, const char *str, 361 size_t nmatch, regmatch_t pmatch[], int eflags) 362{ 363 return tre_regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags); 364} 365 366 367#ifdef TRE_WCHAR 368 369int 370tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len, 371 size_t nmatch, regmatch_t pmatch[], int eflags) 372{ 373 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 374 375#ifdef TRE_USE_SYSTEM_REGEX_H 376 if (preg->re_magic != RE_MAGIC) return REG_BADPAT; 377#endif /* TRE_USE_SYSTEM_REGEX_H */ 378 379 return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags); 380} 381 382int 383tre_regwexec(const regex_t *preg, const wchar_t *str, 384 size_t nmatch, regmatch_t pmatch[], int eflags) 385{ 386 return tre_regwnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags); 387} 388 389#endif /* TRE_WCHAR */ 390 391#ifdef TRE_STR_USER 392int 393tre_reguexec(const regex_t *preg, const tre_str_source *str, 394 size_t nmatch, regmatch_t pmatch[], int eflags) 395{ 396 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 397 398#ifdef TRE_USE_SYSTEM_REGEX_H 399 if (preg->re_magic != RE_MAGIC) return REG_BADPAT; 400#endif /* TRE_USE_SYSTEM_REGEX_H */ 401 402 return tre_match(tnfa, str, (size_t)-1, STR_USER, nmatch, pmatch, eflags); 403} 404#endif /* TRE_STR_USER */ 405 406 407#ifdef TRE_APPROX 408 409/* 410 Wrapper functions for approximate regexp matching. 411*/ 412 413static int 414tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len, 415 tre_str_type_t type, regamatch_t *match, regaparams_t params, 416 int eflags) 417{ 418 reg_errcode_t status; 419 tre_tag_t *tags = NULL; 420 int eo; 421 size_t offset = 0, count = 0; 422 423 /* If the regexp does not use approximate matching features, the 424 maximum cost is zero, and the approximate matcher isn't forced, 425 use the exact matcher instead. */ 426 if (params.max_cost == 0 && !tnfa->have_approx 427 && !(eflags & REG_APPROX_MATCHER)) 428 return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch, 429 eflags); 430 431 /* Back references are not supported by the approximate matcher. */ 432 if (tnfa->have_backrefs) 433 return REG_BADPAT; 434 435 if (tnfa->num_tags > 0 && match->nmatch > 0) 436 { 437#if TRE_USE_ALLOCA 438 tags = alloca(sizeof(*tags) * tnfa->num_tags); 439#else /* !TRE_USE_ALLOCA */ 440 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 441#endif /* !TRE_USE_ALLOCA */ 442 if (tags == NULL) 443 return REG_ESPACE; 444 } 445 446 if ( 447#ifdef TRE_STR_USER 448 type != STR_USER && 449#endif /* TRE_STR_USER */ 450 (eflags & REG_STARTEND) && match->pmatch) 451 { 452 if (match->pmatch->rm_so < 0) 453 return REG_INVARG; 454 if (len == (size_t)-1) 455 { 456 if (match->pmatch->rm_eo < 0 || match->pmatch->rm_so > 457 match->pmatch->rm_eo) 458 return REG_INVARG; 459 len = match->pmatch->rm_eo - match->pmatch->rm_so; 460 } 461 count = offset = match->pmatch->rm_so; 462 if (type == STR_WIDE) offset *= sizeof(wchar_t); 463 } 464 465 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, 466 match, params, eflags, &eo); 467 if (status == REG_OK) 468 { 469 status = tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, 470 tnfa, tags, eo); 471 /* If doing REG_STARTEND, adjust the pmatch array (we can't build 472 this into tre_fill_pmatch, because tre_tnfa_run_backtrack call 473 tre_fill_pmatch itself). */ 474 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) && 475#ifdef TRE_STR_USER 476 type != STR_USER && 477#endif /* TRE_STR_USER */ 478 (eflags & REG_STARTEND) && match->pmatch && match->nmatch > 0) 479 { 480 size_t i; 481 regmatch_t *p; 482 for (i = match->nmatch, p = match->pmatch; i > 0; p++, i--) 483 { 484 if (p->rm_so >= 0) p->rm_so += count; 485 if (p->rm_eo >= 0) p->rm_eo += count; 486 } 487 } 488 } 489#ifndef TRE_USE_ALLOCA 490 if (tags) 491 xfree(tags); 492#endif /* !TRE_USE_ALLOCA */ 493 return status; 494} 495 496int 497tre_reganexec(const regex_t *preg, const char *str, size_t len, 498 regamatch_t *match, regaparams_t params, int eflags) 499{ 500 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 501 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS; 502 503#ifdef TRE_USE_SYSTEM_REGEX_H 504 if (preg->re_magic != RE_MAGIC) return REG_BADPAT; 505#endif /* TRE_USE_SYSTEM_REGEX_H */ 506 507 return tre_match_approx(tnfa, str, len, type, match, params, eflags); 508} 509 510int 511tre_regaexec(const regex_t *preg, const char *str, 512 regamatch_t *match, regaparams_t params, int eflags) 513{ 514 return tre_reganexec(preg, str, (size_t)-1, match, params, eflags); 515} 516 517#ifdef TRE_WCHAR 518 519int 520tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len, 521 regamatch_t *match, regaparams_t params, int eflags) 522{ 523 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 524 525#ifdef TRE_USE_SYSTEM_REGEX_H 526 if (preg->re_magic != RE_MAGIC) return REG_BADPAT; 527#endif /* TRE_USE_SYSTEM_REGEX_H */ 528 529 return tre_match_approx(tnfa, str, len, STR_WIDE, 530 match, params, eflags); 531} 532 533int 534tre_regawexec(const regex_t *preg, const wchar_t *str, 535 regamatch_t *match, regaparams_t params, int eflags) 536{ 537 return tre_regawnexec(preg, str, (size_t)-1, match, params, eflags); 538} 539 540#endif /* TRE_WCHAR */ 541 542void 543tre_regaparams_default(regaparams_t *params) 544{ 545 memset(params, 0, sizeof(*params)); 546 params->cost_ins = 1; 547 params->cost_del = 1; 548 params->cost_subst = 1; 549 params->max_cost = INT_MAX; 550 params->max_ins = INT_MAX; 551 params->max_del = INT_MAX; 552 params->max_subst = INT_MAX; 553 params->max_err = INT_MAX; 554} 555 556#endif /* TRE_APPROX */ 557 558/* EOF */ 559