1/* 2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com> 3 * All rights reserved 4 * 5 * "THE BEER-WARE LICENSE" (Revision 42): 6 * Sergey Lyubka wrote this file. As long as you retain this notice you 7 * can do whatever you want with this stuff. If we meet some day, and you think 8 * this stuff is worth it, you can buy me a beer in return. 9 */ 10 11/* 12 * Downloaded Sat Nov 5 17:43:06 CET 2011 at 13 * http://slre.sourceforge.net/1.0/slre.c 14 */ 15 16#ifdef SLRE_TEST 17#include <stdio.h> 18#include <assert.h> 19#include <ctype.h> 20#include <stdlib.h> 21#include <string.h> 22#else 23#include <log.h> 24#include <linux/ctype.h> 25#include <linux/string.h> 26#endif /* SLRE_TEST */ 27 28#include <errno.h> 29 30#include <slre.h> 31 32enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL, 33 STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT}; 34 35#ifdef SLRE_TEST 36static struct { 37 const char *name; 38 int narg; 39 const char *flags; 40} opcodes[] = { 41 {"END", 0, ""}, /* End of code block or program */ 42 {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */ 43 {"ANY", 0, ""}, /* Match any character, "." */ 44 {"EXACT", 2, "d"}, /* Match exact string */ 45 {"ANYOF", 2, "D"}, /* Match any from set, "[]" */ 46 {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/ 47 {"OPEN ", 1, "i"}, /* Capture start, "(" */ 48 {"CLOSE", 1, "i"}, /* Capture end, ")" */ 49 {"BOL", 0, ""}, /* Beginning of string, "^" */ 50 {"EOL", 0, ""}, /* End of string, "$" */ 51 {"STAR", 1, "o"}, /* Match zero or more times "*" */ 52 {"PLUS", 1, "o"}, /* Match one or more times, "+" */ 53 {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */ 54 {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */ 55 {"QUEST", 1, "o"}, /* Match zero or one time, "?" */ 56 {"SPACE", 0, ""}, /* Match whitespace, "\s" */ 57 {"NONSPACE", 0, ""}, /* Match non-space, "\S" */ 58 {"DIGIT", 0, ""} /* Match digit, "\d" */ 59}; 60#endif /* SLRE_TEST */ 61 62/* 63 * Commands and operands are all unsigned char (1 byte long). All code offsets 64 * are relative to current address, and positive (always point forward). Data 65 * offsets are absolute. Commands with operands: 66 * 67 * BRANCH offset1 offset2 68 * Try to match the code block that follows the BRANCH instruction 69 * (code block ends with END). If no match, try to match code block that 70 * starts at offset1. If either of these match, jump to offset2. 71 * 72 * EXACT data_offset data_length 73 * Try to match exact string. String is recorded in data section from 74 * data_offset, and has length data_length. 75 * 76 * OPEN capture_number 77 * CLOSE capture_number 78 * If the user have passed 'struct cap' array for captures, OPEN 79 * records the beginning of the matched substring (cap->ptr), CLOSE 80 * sets the length (cap->len) for respective capture_number. 81 * 82 * STAR code_offset 83 * PLUS code_offset 84 * QUEST code_offset 85 * *, +, ?, respectively. Try to gobble as much as possible from the 86 * matched buffer, until code block that follows these instructions 87 * matches. When the longest possible string is matched, 88 * jump to code_offset 89 * 90 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS. 91 */ 92 93static const char *meta_chars = "|.^$*+?()[\\"; 94 95#ifdef SLRE_TEST 96 97static void 98print_character_set(FILE *fp, const unsigned char *p, int len) 99{ 100 int i; 101 102 for (i = 0; i < len; i++) { 103 if (i > 0) 104 (void) fputc(',', fp); 105 if (p[i] == 0) { 106 i++; 107 if (p[i] == 0) 108 (void) fprintf(fp, "\\x%02x", p[i]); 109 else 110 (void) fprintf(fp, "%s", opcodes[p[i]].name); 111 } else if (isprint(p[i])) { 112 (void) fputc(p[i], fp); 113 } else { 114 (void) fprintf(fp, "\\x%02x", p[i]); 115 } 116 } 117} 118 119void 120slre_dump(const struct slre *r, FILE *fp) 121{ 122 int i, j, ch, op, pc; 123 124 for (pc = 0; pc < r->code_size; pc++) { 125 126 op = r->code[pc]; 127 (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name); 128 129 for (i = 0; opcodes[op].flags[i] != '\0'; i++) 130 switch (opcodes[op].flags[i]) { 131 case 'i': 132 (void) fprintf(fp, "%d ", r->code[pc + 1]); 133 pc++; 134 break; 135 case 'o': 136 (void) fprintf(fp, "%d ", 137 pc + r->code[pc + 1] - i); 138 pc++; 139 break; 140 case 'D': 141 print_character_set(fp, r->data + 142 r->code[pc + 1], r->code[pc + 2]); 143 pc += 2; 144 break; 145 case 'd': 146 (void) fputc('"', fp); 147 for (j = 0; j < r->code[pc + 2]; j++) { 148 ch = r->data[r->code[pc + 1] + j]; 149 if (isprint(ch)) { 150 (void) fputc(ch, fp); 151 } else { 152 (void) fprintf(fp, 153 "\\x%02x", ch); 154 } 155 } 156 (void) fputc('"', fp); 157 pc += 2; 158 break; 159 } 160 161 (void) fputc('\n', fp); 162 } 163} 164#endif /* SLRE_TEST */ 165 166static void 167set_jump_offset(struct slre *r, int pc, int offset) 168{ 169 assert(offset < r->code_size); 170 171 if (r->code_size - offset > 0xff) 172 r->err_str = "Jump offset is too big"; 173 else 174 r->code[pc] = (unsigned char) (r->code_size - offset); 175} 176 177static void 178emit(struct slre *r, int code) 179{ 180 if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0]))) 181 r->err_str = "RE is too long (code overflow)"; 182 else 183 r->code[r->code_size++] = (unsigned char) code; 184} 185 186static void 187store_char_in_data(struct slre *r, int ch) 188{ 189 if (r->data_size >= (int) sizeof(r->data)) 190 r->err_str = "RE is too long (data overflow)"; 191 else 192 r->data[r->data_size++] = ch; 193} 194 195static void 196exact(struct slre *r, const char **re) 197{ 198 int old_data_size = r->data_size; 199 200 while (**re != '\0' && (strchr(meta_chars, **re)) == NULL) 201 store_char_in_data(r, *(*re)++); 202 203 emit(r, EXACT); 204 emit(r, old_data_size); 205 emit(r, r->data_size - old_data_size); 206} 207 208static int 209get_escape_char(const char **re) 210{ 211 int res; 212 213 switch (*(*re)++) { 214 case 'n': 215 res = '\n'; 216 break; 217 case 'r': 218 res = '\r'; 219 break; 220 case 't': 221 res = '\t'; 222 break; 223 case '0': 224 res = 0; 225 break; 226 case 'S': 227 res = NONSPACE << 8; 228 break; 229 case 's': 230 res = SPACE << 8; 231 break; 232 case 'd': 233 res = DIGIT << 8; 234 break; 235 default: 236 res = (*re)[-1]; 237 break; 238 } 239 240 return res; 241} 242 243static void 244anyof(struct slre *r, const char **re) 245{ 246 int esc, old_data_size = r->data_size, op = ANYOF; 247 248 if (**re == '^') { 249 op = ANYBUT; 250 (*re)++; 251 } 252 253 while (**re != '\0') 254 255 switch (*(*re)++) { 256 case ']': 257 emit(r, op); 258 emit(r, old_data_size); 259 emit(r, r->data_size - old_data_size); 260 return; 261 /* NOTREACHED */ 262 break; 263 case '\\': 264 esc = get_escape_char(re); 265 if ((esc & 0xff) == 0) { 266 store_char_in_data(r, 0); 267 store_char_in_data(r, esc >> 8); 268 } else { 269 store_char_in_data(r, esc); 270 } 271 break; 272 default: 273 store_char_in_data(r, (*re)[-1]); 274 break; 275 } 276 277 r->err_str = "No closing ']' bracket"; 278} 279 280static void 281relocate(struct slre *r, int begin, int shift) 282{ 283 emit(r, END); 284 memmove(r->code + begin + shift, r->code + begin, r->code_size - begin); 285 r->code_size += shift; 286} 287 288static void 289quantifier(struct slre *r, int prev, int op) 290{ 291 if (r->code[prev] == EXACT && r->code[prev + 2] > 1) { 292 r->code[prev + 2]--; 293 emit(r, EXACT); 294 emit(r, r->code[prev + 1] + r->code[prev + 2]); 295 emit(r, 1); 296 prev = r->code_size - 3; 297 } 298 relocate(r, prev, 2); 299 r->code[prev] = op; 300 set_jump_offset(r, prev + 1, prev); 301} 302 303static void 304exact_one_char(struct slre *r, int ch) 305{ 306 emit(r, EXACT); 307 emit(r, r->data_size); 308 emit(r, 1); 309 store_char_in_data(r, ch); 310} 311 312static void 313fixup_branch(struct slre *r, int fixup) 314{ 315 if (fixup > 0) { 316 emit(r, END); 317 set_jump_offset(r, fixup, fixup - 2); 318 } 319} 320 321static void 322compile(struct slre *r, const char **re) 323{ 324 int op, esc, branch_start, last_op, fixup, cap_no, level; 325 326 fixup = 0; 327 level = r->num_caps; 328 branch_start = last_op = r->code_size; 329 330 for (;;) 331 switch (*(*re)++) { 332 case '\0': 333 (*re)--; 334 return; 335 /* NOTREACHED */ 336 break; 337 case '^': 338 emit(r, BOL); 339 break; 340 case '$': 341 emit(r, EOL); 342 break; 343 case '.': 344 last_op = r->code_size; 345 emit(r, ANY); 346 break; 347 case '[': 348 last_op = r->code_size; 349 anyof(r, re); 350 break; 351 case '\\': 352 last_op = r->code_size; 353 esc = get_escape_char(re); 354 if (esc & 0xff00) 355 emit(r, esc >> 8); 356 else 357 exact_one_char(r, esc); 358 break; 359 case '(': 360 last_op = r->code_size; 361 cap_no = ++r->num_caps; 362 emit(r, OPEN); 363 emit(r, cap_no); 364 365 compile(r, re); 366 if (*(*re)++ != ')') { 367 r->err_str = "No closing bracket"; 368 return; 369 } 370 371 emit(r, CLOSE); 372 emit(r, cap_no); 373 break; 374 case ')': 375 (*re)--; 376 fixup_branch(r, fixup); 377 if (level == 0) { 378 r->err_str = "Unbalanced brackets"; 379 return; 380 } 381 return; 382 /* NOTREACHED */ 383 break; 384 case '+': 385 case '*': 386 op = (*re)[-1] == '*' ? STAR : PLUS; 387 if (**re == '?') { 388 (*re)++; 389 op = op == STAR ? STARQ : PLUSQ; 390 } 391 quantifier(r, last_op, op); 392 break; 393 case '?': 394 quantifier(r, last_op, QUEST); 395 break; 396 case '|': 397 fixup_branch(r, fixup); 398 relocate(r, branch_start, 3); 399 r->code[branch_start] = BRANCH; 400 set_jump_offset(r, branch_start + 1, branch_start); 401 fixup = branch_start + 2; 402 r->code[fixup] = 0xff; 403 break; 404 default: 405 (*re)--; 406 last_op = r->code_size; 407 exact(r, re); 408 break; 409 } 410} 411 412int 413slre_compile(struct slre *r, const char *re) 414{ 415 r->err_str = NULL; 416 r->code_size = r->data_size = r->num_caps = r->anchored = 0; 417 418 if (*re == '^') 419 r->anchored++; 420 421 emit(r, OPEN); /* This will capture what matches full RE */ 422 emit(r, 0); 423 424 while (*re != '\0') 425 compile(r, &re); 426 427 if (r->code[2] == BRANCH) 428 fixup_branch(r, 4); 429 430 emit(r, CLOSE); 431 emit(r, 0); 432 emit(r, END); 433 434 return (r->err_str == NULL ? 1 : 0); 435} 436 437static int match(const struct slre *, int, 438 const char *, int, int *, struct cap *); 439 440static void 441loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs) 442{ 443 int saved_offset, matched_offset; 444 445 matched_offset = *ofs; 446 447 while (match(r, pc + 2, s, len, ofs, NULL)) { 448 saved_offset = *ofs; 449 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL)) 450 matched_offset = saved_offset; 451 *ofs = saved_offset; 452 } 453 454 *ofs = matched_offset; 455} 456 457static void 458loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs) 459{ 460 int saved_offset = *ofs; 461 462 while (match(r, pc + 2, s, len, ofs, NULL)) { 463 saved_offset = *ofs; 464 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL)) 465 break; 466 } 467 468 *ofs = saved_offset; 469} 470 471static int 472is_any_of(const unsigned char *p, int len, const char *s, int *ofs) 473{ 474 int i, ch; 475 476 ch = s[*ofs]; 477 478 for (i = 0; i < len; i++) 479 if (p[i] == ch) { 480 (*ofs)++; 481 return 1; 482 } 483 484 return 0; 485} 486 487static int 488is_any_but(const unsigned char *p, int len, const char *s, int *ofs) 489{ 490 int i, ch; 491 492 ch = s[*ofs]; 493 494 for (i = 0; i < len; i++) { 495 if (p[i] == ch) 496 return 0; 497 } 498 499 (*ofs)++; 500 return 1; 501} 502 503static int 504match(const struct slre *r, int pc, const char *s, int len, 505 int *ofs, struct cap *caps) 506{ 507 int n, saved_offset, res = 1; 508 509 while (res && r->code[pc] != END) { 510 511 assert(pc < r->code_size); 512 assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0]))); 513 514 switch (r->code[pc]) { 515 case BRANCH: 516 saved_offset = *ofs; 517 res = match(r, pc + 3, s, len, ofs, caps); 518 if (res == 0) { 519 *ofs = saved_offset; 520 res = match(r, pc + r->code[pc + 1], 521 s, len, ofs, caps); 522 } 523 pc += r->code[pc + 2]; 524 break; 525 case EXACT: 526 res = 0; 527 n = r->code[pc + 2]; /* String length */ 528 if (n <= len - *ofs && !memcmp(s + *ofs, r->data + 529 r->code[pc + 1], n)) { 530 (*ofs) += n; 531 res = 1; 532 } 533 pc += 3; 534 break; 535 case QUEST: 536 res = 1; 537 saved_offset = *ofs; 538 if (!match(r, pc + 2, s, len, ofs, caps)) 539 *ofs = saved_offset; 540 pc += r->code[pc + 1]; 541 break; 542 case STAR: 543 res = 1; 544 loop_greedy(r, pc, s, len, ofs); 545 pc += r->code[pc + 1]; 546 break; 547 case STARQ: 548 res = 1; 549 loop_non_greedy(r, pc, s, len, ofs); 550 pc += r->code[pc + 1]; 551 break; 552 case PLUS: 553 res = match(r, pc + 2, s, len, ofs, caps); 554 if (res == 0) 555 break; 556 557 loop_greedy(r, pc, s, len, ofs); 558 pc += r->code[pc + 1]; 559 break; 560 case PLUSQ: 561 res = match(r, pc + 2, s, len, ofs, caps); 562 if (res == 0) 563 break; 564 565 loop_non_greedy(r, pc, s, len, ofs); 566 pc += r->code[pc + 1]; 567 break; 568 case SPACE: 569 res = 0; 570 if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) { 571 (*ofs)++; 572 res = 1; 573 } 574 pc++; 575 break; 576 case NONSPACE: 577 res = 0; 578 if (*ofs < len && 579 !isspace(((unsigned char *)s)[*ofs])) { 580 (*ofs)++; 581 res = 1; 582 } 583 pc++; 584 break; 585 case DIGIT: 586 res = 0; 587 if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) { 588 (*ofs)++; 589 res = 1; 590 } 591 pc++; 592 break; 593 case ANY: 594 res = 0; 595 if (*ofs < len) { 596 (*ofs)++; 597 res = 1; 598 } 599 pc++; 600 break; 601 case ANYOF: 602 res = 0; 603 if (*ofs < len) 604 res = is_any_of(r->data + r->code[pc + 1], 605 r->code[pc + 2], s, ofs); 606 pc += 3; 607 break; 608 case ANYBUT: 609 res = 0; 610 if (*ofs < len) 611 res = is_any_but(r->data + r->code[pc + 1], 612 r->code[pc + 2], s, ofs); 613 pc += 3; 614 break; 615 case BOL: 616 res = *ofs == 0 ? 1 : 0; 617 pc++; 618 break; 619 case EOL: 620 res = *ofs == len ? 1 : 0; 621 pc++; 622 break; 623 case OPEN: 624 if (caps != NULL) 625 caps[r->code[pc + 1]].ptr = s + *ofs; 626 pc += 2; 627 break; 628 case CLOSE: 629 if (caps != NULL) 630 caps[r->code[pc + 1]].len = (s + *ofs) - 631 caps[r->code[pc + 1]].ptr; 632 pc += 2; 633 break; 634 case END: 635 pc++; 636 break; 637 default: 638 printf("unknown cmd (%d) at %d\n", r->code[pc], pc); 639 assert(0); 640 break; 641 } 642 } 643 644 return res; 645} 646 647int 648slre_match(const struct slre *r, const char *buf, int len, 649 struct cap *caps) 650{ 651 int i, ofs = 0, res = 0; 652 653 if (r->anchored) { 654 res = match(r, 0, buf, len, &ofs, caps); 655 } else { 656 for (i = 0; i < len && res == 0; i++) { 657 ofs = i; 658 res = match(r, 0, buf, len, &ofs, caps); 659 } 660 } 661 662 return res; 663} 664 665#ifdef SLRE_TEST 666#define N_CAPS 5 667 668int main(int argc, char *argv[]) 669{ 670 struct slre slre; 671 struct cap caps[N_CAPS]; 672 unsigned char data[1 * 1024 * 1024]; 673 FILE *fp; 674 int i, res, len; 675 676 if (argc < 2) { 677 fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]); 678 return 1; 679 } 680 681 fp = fopen(argv[2], "rb"); 682 if (fp == NULL) { 683 fprintf(stderr, "Error: cannot open %s:%s\n", 684 argv[2], strerror(errno)); 685 return 1; 686 } 687 688 if (!slre_compile(&slre, argv[1])) { 689 (void) fclose(fp); 690 fprintf(stderr, "Error compiling slre: %s\n", slre.err_str); 691 return 1; 692 } 693 694 slre_dump(&slre, stderr); 695 696 while (fgets(data, sizeof(data), fp) != NULL) { 697 len = strlen(data); 698 699 if ((len > 0) && (data[len-1] == '\n')) { 700 data[len-1] = '\0'; 701 --len; 702 } 703 704 printf("Data = \"%s\"\n", data); 705 706 (void) memset(caps, 0, sizeof(caps)); 707 708 res = slre_match(&slre, data, len, caps); 709 printf("Result [%d]: %d\n", i, res); 710 711 for (i = 0; i < N_CAPS; i++) { 712 if (caps[i].len > 0) { 713 printf("Substring %d: len=%d [%.*s]\n", i, 714 caps[i].len, 715 caps[i].len, caps[i].ptr); 716 } 717 } 718 printf("----------------------------------------------------\n"); 719 } 720 (void) fclose(fp); 721 722 return 0; 723} 724#endif /* SLRE_TEST */ 725