1/* 2 $Id: strscan.c 44659 2014-01-19 16:28:53Z nagachika $ 3 4 Copyright (c) 1999-2006 Minero Aoki 5 6 This program is free software. 7 You can distribute/modify this program under the terms of 8 the Ruby License. For details, see the file COPYING. 9*/ 10 11#include "ruby/ruby.h" 12#include "ruby/re.h" 13#include "ruby/encoding.h" 14#include "regint.h" 15 16#define STRSCAN_VERSION "0.7.0" 17 18/* ======================================================================= 19 Data Type Definitions 20 ======================================================================= */ 21 22static VALUE StringScanner; 23static VALUE ScanError; 24static ID id_byteslice; 25 26struct strscanner 27{ 28 /* multi-purpose flags */ 29 unsigned long flags; 30#define FLAG_MATCHED (1 << 0) 31 32 /* the string to scan */ 33 VALUE str; 34 35 /* scan pointers */ 36 long prev; /* legal only when MATCHED_P(s) */ 37 long curr; /* always legal */ 38 39 /* the regexp register; legal only when MATCHED_P(s) */ 40 struct re_registers regs; 41}; 42 43#define MATCHED_P(s) ((s)->flags & FLAG_MATCHED) 44#define MATCHED(s) (s)->flags |= FLAG_MATCHED 45#define CLEAR_MATCH_STATUS(s) (s)->flags &= ~FLAG_MATCHED 46 47#define S_PBEG(s) (RSTRING_PTR((s)->str)) 48#define S_LEN(s) (RSTRING_LEN((s)->str)) 49#define S_PEND(s) (S_PBEG(s) + S_LEN(s)) 50#define CURPTR(s) (S_PBEG(s) + (s)->curr) 51#define S_RESTLEN(s) (S_LEN(s) - (s)->curr) 52 53#define EOS_P(s) ((s)->curr >= RSTRING_LEN(p->str)) 54 55#define GET_SCANNER(obj,var) do {\ 56 (var) = check_strscan(obj);\ 57 if (NIL_P((var)->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");\ 58} while (0) 59 60/* ======================================================================= 61 Function Prototypes 62 ======================================================================= */ 63 64static VALUE infect _((VALUE str, struct strscanner *p)); 65static VALUE extract_range _((struct strscanner *p, long beg_i, long end_i)); 66static VALUE extract_beg_len 
_((struct strscanner *p, long beg_i, long len)); 67 68static struct strscanner *check_strscan _((VALUE obj)); 69static void strscan_mark _((void *p)); 70static void strscan_free _((void *p)); 71static size_t strscan_memsize _((const void *p)); 72static VALUE strscan_s_allocate _((VALUE klass)); 73static VALUE strscan_initialize _((int argc, VALUE *argv, VALUE self)); 74static VALUE strscan_init_copy _((VALUE vself, VALUE vorig)); 75 76static VALUE strscan_s_mustc _((VALUE self)); 77static VALUE strscan_terminate _((VALUE self)); 78static VALUE strscan_clear _((VALUE self)); 79static VALUE strscan_get_string _((VALUE self)); 80static VALUE strscan_set_string _((VALUE self, VALUE str)); 81static VALUE strscan_concat _((VALUE self, VALUE str)); 82static VALUE strscan_get_pos _((VALUE self)); 83static VALUE strscan_set_pos _((VALUE self, VALUE pos)); 84static VALUE strscan_do_scan _((VALUE self, VALUE regex, 85 int succptr, int getstr, int headonly)); 86static VALUE strscan_scan _((VALUE self, VALUE re)); 87static VALUE strscan_match_p _((VALUE self, VALUE re)); 88static VALUE strscan_skip _((VALUE self, VALUE re)); 89static VALUE strscan_check _((VALUE self, VALUE re)); 90static VALUE strscan_scan_full _((VALUE self, VALUE re, 91 VALUE succp, VALUE getp)); 92static VALUE strscan_scan_until _((VALUE self, VALUE re)); 93static VALUE strscan_skip_until _((VALUE self, VALUE re)); 94static VALUE strscan_check_until _((VALUE self, VALUE re)); 95static VALUE strscan_search_full _((VALUE self, VALUE re, 96 VALUE succp, VALUE getp)); 97static void adjust_registers_to_matched _((struct strscanner *p)); 98static VALUE strscan_getch _((VALUE self)); 99static VALUE strscan_get_byte _((VALUE self)); 100static VALUE strscan_getbyte _((VALUE self)); 101static VALUE strscan_peek _((VALUE self, VALUE len)); 102static VALUE strscan_peep _((VALUE self, VALUE len)); 103static VALUE strscan_unscan _((VALUE self)); 104static VALUE strscan_bol_p _((VALUE self)); 105static VALUE strscan_eos_p 
_((VALUE self)); 106static VALUE strscan_empty_p _((VALUE self)); 107static VALUE strscan_rest_p _((VALUE self)); 108static VALUE strscan_matched_p _((VALUE self)); 109static VALUE strscan_matched _((VALUE self)); 110static VALUE strscan_matched_size _((VALUE self)); 111static VALUE strscan_aref _((VALUE self, VALUE idx)); 112static VALUE strscan_pre_match _((VALUE self)); 113static VALUE strscan_post_match _((VALUE self)); 114static VALUE strscan_rest _((VALUE self)); 115static VALUE strscan_rest_size _((VALUE self)); 116 117static VALUE strscan_inspect _((VALUE self)); 118static VALUE inspect1 _((struct strscanner *p)); 119static VALUE inspect2 _((struct strscanner *p)); 120 121/* ======================================================================= 122 Utils 123 ======================================================================= */ 124 125static VALUE 126infect(VALUE str, struct strscanner *p) 127{ 128 OBJ_INFECT(str, p->str); 129 return str; 130} 131 132static VALUE 133str_new(struct strscanner *p, const char *ptr, long len) 134{ 135 VALUE str = rb_str_new(ptr, len); 136 rb_enc_copy(str, p->str); 137 return str; 138} 139 140static VALUE 141extract_range(struct strscanner *p, long beg_i, long end_i) 142{ 143 if (beg_i > S_LEN(p)) return Qnil; 144 if (end_i > S_LEN(p)) 145 end_i = S_LEN(p); 146 return infect(str_new(p, S_PBEG(p) + beg_i, end_i - beg_i), p); 147} 148 149static VALUE 150extract_beg_len(struct strscanner *p, long beg_i, long len) 151{ 152 if (beg_i > S_LEN(p)) return Qnil; 153 if (beg_i + len > S_LEN(p)) 154 len = S_LEN(p) - beg_i; 155 return infect(str_new(p, S_PBEG(p) + beg_i, len), p); 156} 157 158/* ======================================================================= 159 Constructor 160 ======================================================================= */ 161 162static void 163strscan_mark(void *ptr) 164{ 165 struct strscanner *p = ptr; 166 rb_gc_mark(p->str); 167} 168 169static void 170strscan_free(void *ptr) 171{ 172 struct 
strscanner *p = ptr; 173 onig_region_free(&(p->regs), 0); 174 ruby_xfree(p); 175} 176 177static size_t 178strscan_memsize(const void *ptr) 179{ 180 const struct strscanner *p = ptr; 181 size_t size = 0; 182 if (p) { 183 size = sizeof(*p) - sizeof(p->regs) + onig_region_memsize(&p->regs); 184 } 185 return size; 186} 187 188static const rb_data_type_t strscanner_type = { 189 "StringScanner", 190 {strscan_mark, strscan_free, strscan_memsize} 191}; 192 193static VALUE 194strscan_s_allocate(VALUE klass) 195{ 196 struct strscanner *p; 197 198 p = ALLOC(struct strscanner); 199 MEMZERO(p, struct strscanner, 1); 200 CLEAR_MATCH_STATUS(p); 201 onig_region_init(&(p->regs)); 202 p->str = Qnil; 203 return TypedData_Wrap_Struct(klass, &strscanner_type, p); 204} 205 206/* 207 * call-seq: StringScanner.new(string, dup = false) 208 * 209 * Creates a new StringScanner object to scan over the given +string+. 210 * +dup+ argument is obsolete and not used now. 211 */ 212static VALUE 213strscan_initialize(int argc, VALUE *argv, VALUE self) 214{ 215 struct strscanner *p; 216 VALUE str, need_dup; 217 218 p = check_strscan(self); 219 rb_scan_args(argc, argv, "11", &str, &need_dup); 220 StringValue(str); 221 p->str = str; 222 223 return self; 224} 225 226static struct strscanner * 227check_strscan(VALUE obj) 228{ 229 return rb_check_typeddata(obj, &strscanner_type); 230} 231 232/* 233 * call-seq: 234 * dup 235 * clone 236 * 237 * Duplicates a StringScanner object. 
238 */ 239static VALUE 240strscan_init_copy(VALUE vself, VALUE vorig) 241{ 242 struct strscanner *self, *orig; 243 244 self = check_strscan(vself); 245 orig = check_strscan(vorig); 246 if (self != orig) { 247 self->flags = orig->flags; 248 self->str = orig->str; 249 self->prev = orig->prev; 250 self->curr = orig->curr; 251 onig_region_copy(&self->regs, &orig->regs); 252 } 253 254 return vself; 255} 256 257/* ======================================================================= 258 Instance Methods 259 ======================================================================= */ 260 261/* 262 * call-seq: StringScanner.must_C_version 263 * 264 * This method is defined for backward compatibility. 265 */ 266static VALUE 267strscan_s_mustc(VALUE self) 268{ 269 return self; 270} 271 272/* 273 * Reset the scan pointer (index 0) and clear matching data. 274 */ 275static VALUE 276strscan_reset(VALUE self) 277{ 278 struct strscanner *p; 279 280 GET_SCANNER(self, p); 281 p->curr = 0; 282 CLEAR_MATCH_STATUS(p); 283 return self; 284} 285 286/* 287 * call-seq: 288 * terminate 289 * clear 290 * 291 * Set the scan pointer to the end of the string and clear matching data. 292 */ 293static VALUE 294strscan_terminate(VALUE self) 295{ 296 struct strscanner *p; 297 298 GET_SCANNER(self, p); 299 p->curr = S_LEN(p); 300 CLEAR_MATCH_STATUS(p); 301 return self; 302} 303 304/* 305 * Equivalent to #terminate. 306 * This method is obsolete; use #terminate instead. 307 */ 308static VALUE 309strscan_clear(VALUE self) 310{ 311 rb_warning("StringScanner#clear is obsolete; use #terminate instead"); 312 return strscan_terminate(self); 313} 314 315/* 316 * Returns the string being scanned. 317 */ 318static VALUE 319strscan_get_string(VALUE self) 320{ 321 struct strscanner *p; 322 323 GET_SCANNER(self, p); 324 return p->str; 325} 326 327/* 328 * call-seq: string=(str) 329 * 330 * Changes the string being scanned to +str+ and resets the scanner. 331 * Returns +str+. 
332 */ 333static VALUE 334strscan_set_string(VALUE self, VALUE str) 335{ 336 struct strscanner *p = check_strscan(self); 337 338 StringValue(str); 339 p->str = str; 340 p->curr = 0; 341 CLEAR_MATCH_STATUS(p); 342 return str; 343} 344 345/* 346 * call-seq: 347 * concat(str) 348 * <<(str) 349 * 350 * Appends +str+ to the string being scanned. 351 * This method does not affect scan pointer. 352 * 353 * s = StringScanner.new("Fri Dec 12 1975 14:39") 354 * s.scan(/Fri /) 355 * s << " +1000 GMT" 356 * s.string # -> "Fri Dec 12 1975 14:39 +1000 GMT" 357 * s.scan(/Dec/) # -> "Dec" 358 */ 359static VALUE 360strscan_concat(VALUE self, VALUE str) 361{ 362 struct strscanner *p; 363 364 GET_SCANNER(self, p); 365 StringValue(str); 366 rb_str_append(p->str, str); 367 return self; 368} 369 370/* 371 * Returns the byte position of the scan pointer. In the 'reset' position, this 372 * value is zero. In the 'terminated' position (i.e. the string is exhausted), 373 * this value is the bytesize of the string. 374 * 375 * In short, it's a 0-based index into bytes of the string. 376 * 377 * s = StringScanner.new('test string') 378 * s.pos # -> 0 379 * s.scan_until /str/ # -> "test str" 380 * s.pos # -> 8 381 * s.terminate # -> #<StringScanner fin> 382 * s.pos # -> 11 383 */ 384static VALUE 385strscan_get_pos(VALUE self) 386{ 387 struct strscanner *p; 388 389 GET_SCANNER(self, p); 390 return INT2FIX(p->curr); 391} 392 393/* 394 * Returns the character position of the scan pointer. In the 'reset' position, this 395 * value is zero. In the 'terminated' position (i.e. the string is exhausted), 396 * this value is the size of the string. 397 * 398 * In short, it's a 0-based index into the string. 
399 * 400 * s = StringScanner.new("abcädeföghi") 401 * s.charpos # -> 0 402 * s.scan_until(/ä/) # -> "abcä" 403 * s.pos # -> 5 404 * s.charpos # -> 4 405 */ 406static VALUE 407strscan_get_charpos(VALUE self) 408{ 409 struct strscanner *p; 410 VALUE substr; 411 412 GET_SCANNER(self, p); 413 414 substr = rb_funcall(p->str, id_byteslice, 2, INT2FIX(0), INT2NUM(p->curr)); 415 416 return rb_str_length(substr); 417} 418 419/* 420 * call-seq: pos=(n) 421 * 422 * Set the byte position of the scan pointer. 423 * 424 * s = StringScanner.new('test string') 425 * s.pos = 7 # -> 7 426 * s.rest # -> "ring" 427 */ 428static VALUE 429strscan_set_pos(VALUE self, VALUE v) 430{ 431 struct strscanner *p; 432 long i; 433 434 GET_SCANNER(self, p); 435 i = NUM2INT(v); 436 if (i < 0) i += S_LEN(p); 437 if (i < 0) rb_raise(rb_eRangeError, "index out of range"); 438 if (i > S_LEN(p)) rb_raise(rb_eRangeError, "index out of range"); 439 p->curr = i; 440 return INT2NUM(i); 441} 442 443static VALUE 444strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly) 445{ 446 regex_t *rb_reg_prepare_re(VALUE re, VALUE str); 447 struct strscanner *p; 448 regex_t *re; 449 long ret; 450 int tmpreg; 451 452 Check_Type(regex, T_REGEXP); 453 GET_SCANNER(self, p); 454 455 CLEAR_MATCH_STATUS(p); 456 if (S_RESTLEN(p) < 0) { 457 return Qnil; 458 } 459 re = rb_reg_prepare_re(regex, p->str); 460 tmpreg = re != RREGEXP(regex)->ptr; 461 if (!tmpreg) RREGEXP(regex)->usecnt++; 462 463 if (headonly) { 464 ret = onig_match(re, (UChar* )CURPTR(p), 465 (UChar* )(CURPTR(p) + S_RESTLEN(p)), 466 (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE); 467 } 468 else { 469 ret = onig_search(re, 470 (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)), 471 (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)), 472 &(p->regs), ONIG_OPTION_NONE); 473 } 474 if (!tmpreg) RREGEXP(regex)->usecnt--; 475 if (tmpreg) { 476 if (RREGEXP(regex)->usecnt) { 477 onig_free(re); 478 } 479 else { 480 
onig_free(RREGEXP(regex)->ptr); 481 RREGEXP(regex)->ptr = re; 482 } 483 } 484 485 if (ret == -2) rb_raise(ScanError, "regexp buffer overflow"); 486 if (ret < 0) { 487 /* not matched */ 488 return Qnil; 489 } 490 491 MATCHED(p); 492 p->prev = p->curr; 493 if (succptr) { 494 p->curr += p->regs.end[0]; 495 } 496 if (getstr) { 497 return extract_beg_len(p, p->prev, p->regs.end[0]); 498 } 499 else { 500 return INT2FIX(p->regs.end[0]); 501 } 502} 503 504/* 505 * call-seq: scan(pattern) => String 506 * 507 * Tries to match with +pattern+ at the current position. If there's a match, 508 * the scanner advances the "scan pointer" and returns the matched string. 509 * Otherwise, the scanner returns +nil+. 510 * 511 * s = StringScanner.new('test string') 512 * p s.scan(/\w+/) # -> "test" 513 * p s.scan(/\w+/) # -> nil 514 * p s.scan(/\s+/) # -> " " 515 * p s.scan(/\w+/) # -> "string" 516 * p s.scan(/./) # -> nil 517 * 518 */ 519static VALUE 520strscan_scan(VALUE self, VALUE re) 521{ 522 return strscan_do_scan(self, re, 1, 1, 1); 523} 524 525/* 526 * call-seq: match?(pattern) 527 * 528 * Tests whether the given +pattern+ is matched from the current scan pointer. 529 * Returns the length of the match, or +nil+. The scan pointer is not advanced. 530 * 531 * s = StringScanner.new('test string') 532 * p s.match?(/\w+/) # -> 4 533 * p s.match?(/\w+/) # -> 4 534 * p s.match?(/\s+/) # -> nil 535 */ 536static VALUE 537strscan_match_p(VALUE self, VALUE re) 538{ 539 return strscan_do_scan(self, re, 0, 0, 1); 540} 541 542/* 543 * call-seq: skip(pattern) 544 * 545 * Attempts to skip over the given +pattern+ beginning with the scan pointer. 546 * If it matches, the scan pointer is advanced to the end of the match, and the 547 * length of the match is returned. Otherwise, +nil+ is returned. 548 * 549 * It's similar to #scan, but without returning the matched string. 
550 * 551 * s = StringScanner.new('test string') 552 * p s.skip(/\w+/) # -> 4 553 * p s.skip(/\w+/) # -> nil 554 * p s.skip(/\s+/) # -> 1 555 * p s.skip(/\w+/) # -> 6 556 * p s.skip(/./) # -> nil 557 * 558 */ 559static VALUE 560strscan_skip(VALUE self, VALUE re) 561{ 562 return strscan_do_scan(self, re, 1, 0, 1); 563} 564 565/* 566 * call-seq: check(pattern) 567 * 568 * This returns the value that #scan would return, without advancing the scan 569 * pointer. The match register is affected, though. 570 * 571 * s = StringScanner.new("Fri Dec 12 1975 14:39") 572 * s.check /Fri/ # -> "Fri" 573 * s.pos # -> 0 574 * s.matched # -> "Fri" 575 * s.check /12/ # -> nil 576 * s.matched # -> nil 577 * 578 * Mnemonic: it "checks" to see whether a #scan will return a value. 579 */ 580static VALUE 581strscan_check(VALUE self, VALUE re) 582{ 583 return strscan_do_scan(self, re, 0, 1, 1); 584} 585 586/* 587 * call-seq: scan_full(pattern, advance_pointer_p, return_string_p) 588 * 589 * Tests whether the given +pattern+ is matched from the current scan pointer. 590 * Advances the scan pointer if +advance_pointer_p+ is true. 591 * Returns the matched string if +return_string_p+ is true. 592 * The match register is affected. 593 * 594 * "full" means "#scan with full parameters". 595 */ 596static VALUE 597strscan_scan_full(VALUE self, VALUE re, VALUE s, VALUE f) 598{ 599 return strscan_do_scan(self, re, RTEST(s), RTEST(f), 1); 600} 601 602/* 603 * call-seq: scan_until(pattern) 604 * 605 * Scans the string _until_ the +pattern+ is matched. Returns the substring up 606 * to and including the end of the match, advancing the scan pointer to that 607 * location. If there is no match, +nil+ is returned. 
608 * 609 * s = StringScanner.new("Fri Dec 12 1975 14:39") 610 * s.scan_until(/1/) # -> "Fri Dec 1" 611 * s.pre_match # -> "Fri Dec " 612 * s.scan_until(/XYZ/) # -> nil 613 */ 614static VALUE 615strscan_scan_until(VALUE self, VALUE re) 616{ 617 return strscan_do_scan(self, re, 1, 1, 0); 618} 619 620/* 621 * call-seq: exist?(pattern) 622 * 623 * Looks _ahead_ to see if the +pattern+ exists _anywhere_ in the string, 624 * without advancing the scan pointer. This predicates whether a #scan_until 625 * will return a value. 626 * 627 * s = StringScanner.new('test string') 628 * s.exist? /s/ # -> 3 629 * s.scan /test/ # -> "test" 630 * s.exist? /s/ # -> 2 631 * s.exist? /e/ # -> nil 632 */ 633static VALUE 634strscan_exist_p(VALUE self, VALUE re) 635{ 636 return strscan_do_scan(self, re, 0, 0, 0); 637} 638 639/* 640 * call-seq: skip_until(pattern) 641 * 642 * Advances the scan pointer until +pattern+ is matched and consumed. Returns 643 * the number of bytes advanced, or +nil+ if no match was found. 644 * 645 * Look ahead to match +pattern+, and advance the scan pointer to the _end_ 646 * of the match. Return the number of characters advanced, or +nil+ if the 647 * match was unsuccessful. 648 * 649 * It's similar to #scan_until, but without returning the intervening string. 650 * 651 * s = StringScanner.new("Fri Dec 12 1975 14:39") 652 * s.skip_until /12/ # -> 10 653 * s # 654 */ 655static VALUE 656strscan_skip_until(VALUE self, VALUE re) 657{ 658 return strscan_do_scan(self, re, 1, 0, 0); 659} 660 661/* 662 * call-seq: check_until(pattern) 663 * 664 * This returns the value that #scan_until would return, without advancing the 665 * scan pointer. The match register is affected, though. 666 * 667 * s = StringScanner.new("Fri Dec 12 1975 14:39") 668 * s.check_until /12/ # -> "Fri Dec 12" 669 * s.pos # -> 0 670 * s.matched # -> 12 671 * 672 * Mnemonic: it "checks" to see whether a #scan_until will return a value. 
673 */ 674static VALUE 675strscan_check_until(VALUE self, VALUE re) 676{ 677 return strscan_do_scan(self, re, 0, 1, 0); 678} 679 680/* 681 * call-seq: search_full(pattern, advance_pointer_p, return_string_p) 682 * 683 * Scans the string _until_ the +pattern+ is matched. 684 * Advances the scan pointer if +advance_pointer_p+, otherwise not. 685 * Returns the matched string if +return_string_p+ is true, otherwise 686 * returns the number of bytes advanced. 687 * This method does affect the match register. 688 */ 689static VALUE 690strscan_search_full(VALUE self, VALUE re, VALUE s, VALUE f) 691{ 692 return strscan_do_scan(self, re, RTEST(s), RTEST(f), 0); 693} 694 695static void 696adjust_registers_to_matched(struct strscanner *p) 697{ 698 onig_region_clear(&(p->regs)); 699 onig_region_set(&(p->regs), 0, 0, (int)(p->curr - p->prev)); 700} 701 702/* 703 * Scans one character and returns it. 704 * This method is multibyte character sensitive. 705 * 706 * s = StringScanner.new("ab") 707 * s.getch # => "a" 708 * s.getch # => "b" 709 * s.getch # => nil 710 * 711 * $KCODE = 'EUC' 712 * s = StringScanner.new("\244\242") 713 * s.getch # => "\244\242" # Japanese hira-kana "A" in EUC-JP 714 * s.getch # => nil 715 */ 716static VALUE 717strscan_getch(VALUE self) 718{ 719 struct strscanner *p; 720 long len; 721 722 GET_SCANNER(self, p); 723 CLEAR_MATCH_STATUS(p); 724 if (EOS_P(p)) 725 return Qnil; 726 727 len = rb_enc_mbclen(CURPTR(p), S_PEND(p), rb_enc_get(p->str)); 728 if (p->curr + len > S_LEN(p)) { 729 len = S_LEN(p) - p->curr; 730 } 731 p->prev = p->curr; 732 p->curr += len; 733 MATCHED(p); 734 adjust_registers_to_matched(p); 735 return extract_range(p, p->prev + p->regs.beg[0], 736 p->prev + p->regs.end[0]); 737} 738 739/* 740 * Scans one byte and returns it. 741 * This method is not multibyte character sensitive. 742 * See also: #getch. 
743 * 744 * s = StringScanner.new('ab') 745 * s.get_byte # => "a" 746 * s.get_byte # => "b" 747 * s.get_byte # => nil 748 * 749 * $KCODE = 'EUC' 750 * s = StringScanner.new("\244\242") 751 * s.get_byte # => "\244" 752 * s.get_byte # => "\242" 753 * s.get_byte # => nil 754 */ 755static VALUE 756strscan_get_byte(VALUE self) 757{ 758 struct strscanner *p; 759 760 GET_SCANNER(self, p); 761 CLEAR_MATCH_STATUS(p); 762 if (EOS_P(p)) 763 return Qnil; 764 765 p->prev = p->curr; 766 p->curr++; 767 MATCHED(p); 768 adjust_registers_to_matched(p); 769 return extract_range(p, p->prev + p->regs.beg[0], 770 p->prev + p->regs.end[0]); 771} 772 773/* 774 * Equivalent to #get_byte. 775 * This method is obsolete; use #get_byte instead. 776 */ 777static VALUE 778strscan_getbyte(VALUE self) 779{ 780 rb_warning("StringScanner#getbyte is obsolete; use #get_byte instead"); 781 return strscan_get_byte(self); 782} 783 784/* 785 * call-seq: peek(len) 786 * 787 * Extracts a string corresponding to <tt>string[pos,len]</tt>, without 788 * advancing the scan pointer. 789 * 790 * s = StringScanner.new('test string') 791 * s.peek(7) # => "test st" 792 * s.peek(7) # => "test st" 793 * 794 */ 795static VALUE 796strscan_peek(VALUE self, VALUE vlen) 797{ 798 struct strscanner *p; 799 long len; 800 801 GET_SCANNER(self, p); 802 803 len = NUM2LONG(vlen); 804 if (EOS_P(p)) 805 return infect(str_new(p, "", 0), p); 806 807 if (p->curr + len > S_LEN(p)) 808 len = S_LEN(p) - p->curr; 809 return extract_beg_len(p, p->curr, len); 810} 811 812/* 813 * Equivalent to #peek. 814 * This method is obsolete; use #peek instead. 815 */ 816static VALUE 817strscan_peep(VALUE self, VALUE vlen) 818{ 819 rb_warning("StringScanner#peep is obsolete; use #peek instead"); 820 return strscan_peek(self, vlen); 821} 822 823/* 824 * Set the scan pointer to the previous position. Only one previous position is 825 * remembered, and it changes with each scanning operation. 
826 * 827 * s = StringScanner.new('test string') 828 * s.scan(/\w+/) # => "test" 829 * s.unscan 830 * s.scan(/../) # => "te" 831 * s.scan(/\d/) # => nil 832 * s.unscan # ScanError: unscan failed: previous match record not exist 833 */ 834static VALUE 835strscan_unscan(VALUE self) 836{ 837 struct strscanner *p; 838 839 GET_SCANNER(self, p); 840 if (! MATCHED_P(p)) 841 rb_raise(ScanError, "unscan failed: previous match record not exist"); 842 p->curr = p->prev; 843 CLEAR_MATCH_STATUS(p); 844 return self; 845} 846 847/* 848 * Returns +true+ iff the scan pointer is at the beginning of the line. 849 * 850 * s = StringScanner.new("test\ntest\n") 851 * s.bol? # => true 852 * s.scan(/te/) 853 * s.bol? # => false 854 * s.scan(/st\n/) 855 * s.bol? # => true 856 * s.terminate 857 * s.bol? # => true 858 */ 859static VALUE 860strscan_bol_p(VALUE self) 861{ 862 struct strscanner *p; 863 864 GET_SCANNER(self, p); 865 if (CURPTR(p) > S_PEND(p)) return Qnil; 866 if (p->curr == 0) return Qtrue; 867 return (*(CURPTR(p) - 1) == '\n') ? Qtrue : Qfalse; 868} 869 870/* 871 * Returns +true+ if the scan pointer is at the end of the string. 872 * 873 * s = StringScanner.new('test string') 874 * p s.eos? # => false 875 * s.scan(/test/) 876 * p s.eos? # => false 877 * s.terminate 878 * p s.eos? # => true 879 */ 880static VALUE 881strscan_eos_p(VALUE self) 882{ 883 struct strscanner *p; 884 885 GET_SCANNER(self, p); 886 return EOS_P(p) ? Qtrue : Qfalse; 887} 888 889/* 890 * Equivalent to #eos?. 891 * This method is obsolete, use #eos? instead. 892 */ 893static VALUE 894strscan_empty_p(VALUE self) 895{ 896 rb_warning("StringScanner#empty? is obsolete; use #eos? instead"); 897 return strscan_eos_p(self); 898} 899 900/* 901 * Returns true iff there is more data in the string. See #eos?. 902 * This method is obsolete; use #eos? instead. 903 * 904 * s = StringScanner.new('test string') 905 * s.eos? # These two 906 * s.rest? # are opposites. 
907 */ 908static VALUE 909strscan_rest_p(VALUE self) 910{ 911 struct strscanner *p; 912 913 GET_SCANNER(self, p); 914 return EOS_P(p) ? Qfalse : Qtrue; 915} 916 917/* 918 * Returns +true+ iff the last match was successful. 919 * 920 * s = StringScanner.new('test string') 921 * s.match?(/\w+/) # => 4 922 * s.matched? # => true 923 * s.match?(/\d+/) # => nil 924 * s.matched? # => false 925 */ 926static VALUE 927strscan_matched_p(VALUE self) 928{ 929 struct strscanner *p; 930 931 GET_SCANNER(self, p); 932 return MATCHED_P(p) ? Qtrue : Qfalse; 933} 934 935/* 936 * Returns the last matched string. 937 * 938 * s = StringScanner.new('test string') 939 * s.match?(/\w+/) # -> 4 940 * s.matched # -> "test" 941 */ 942static VALUE 943strscan_matched(VALUE self) 944{ 945 struct strscanner *p; 946 947 GET_SCANNER(self, p); 948 if (! MATCHED_P(p)) return Qnil; 949 return extract_range(p, p->prev + p->regs.beg[0], 950 p->prev + p->regs.end[0]); 951} 952 953/* 954 * Returns the size of the most recent match (see #matched), or +nil+ if there 955 * was no recent match. 956 * 957 * s = StringScanner.new('test string') 958 * s.check /\w+/ # -> "test" 959 * s.matched_size # -> 4 960 * s.check /\d+/ # -> nil 961 * s.matched_size # -> nil 962 */ 963static VALUE 964strscan_matched_size(VALUE self) 965{ 966 struct strscanner *p; 967 968 GET_SCANNER(self, p); 969 if (! MATCHED_P(p)) return Qnil; 970 return INT2NUM(p->regs.end[0] - p->regs.beg[0]); 971} 972 973/* 974 * call-seq: [](n) 975 * 976 * Return the n-th subgroup in the most recent match. 977 * 978 * s = StringScanner.new("Fri Dec 12 1975 14:39") 979 * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 " 980 * s[0] # -> "Fri Dec 12 " 981 * s[1] # -> "Fri" 982 * s[2] # -> "Dec" 983 * s[3] # -> "12" 984 * s.post_match # -> "1975 14:39" 985 * s.pre_match # -> "" 986 */ 987static VALUE 988strscan_aref(VALUE self, VALUE idx) 989{ 990 struct strscanner *p; 991 long i; 992 993 GET_SCANNER(self, p); 994 if (! 
MATCHED_P(p)) return Qnil; 995 996 i = NUM2LONG(idx); 997 if (i < 0) 998 i += p->regs.num_regs; 999 if (i < 0) return Qnil; 1000 if (i >= p->regs.num_regs) return Qnil; 1001 if (p->regs.beg[i] == -1) return Qnil; 1002 1003 return extract_range(p, p->prev + p->regs.beg[i], 1004 p->prev + p->regs.end[i]); 1005} 1006 1007/* 1008 * Return the <i><b>pre</b>-match</i> (in the regular expression sense) of the last scan. 1009 * 1010 * s = StringScanner.new('test string') 1011 * s.scan(/\w+/) # -> "test" 1012 * s.scan(/\s+/) # -> " " 1013 * s.pre_match # -> "test" 1014 * s.post_match # -> "string" 1015 */ 1016static VALUE 1017strscan_pre_match(VALUE self) 1018{ 1019 struct strscanner *p; 1020 1021 GET_SCANNER(self, p); 1022 if (! MATCHED_P(p)) return Qnil; 1023 return extract_range(p, 0, p->prev + p->regs.beg[0]); 1024} 1025 1026/* 1027 * Return the <i><b>post</b>-match</i> (in the regular expression sense) of the last scan. 1028 * 1029 * s = StringScanner.new('test string') 1030 * s.scan(/\w+/) # -> "test" 1031 * s.scan(/\s+/) # -> " " 1032 * s.pre_match # -> "test" 1033 * s.post_match # -> "string" 1034 */ 1035static VALUE 1036strscan_post_match(VALUE self) 1037{ 1038 struct strscanner *p; 1039 1040 GET_SCANNER(self, p); 1041 if (! MATCHED_P(p)) return Qnil; 1042 return extract_range(p, p->prev + p->regs.end[0], S_LEN(p)); 1043} 1044 1045/* 1046 * Returns the "rest" of the string (i.e. everything after the scan pointer). 1047 * If there is no more data (eos? = true), it returns <tt>""</tt>. 1048 */ 1049static VALUE 1050strscan_rest(VALUE self) 1051{ 1052 struct strscanner *p; 1053 1054 GET_SCANNER(self, p); 1055 if (EOS_P(p)) { 1056 return infect(str_new(p, "", 0), p); 1057 } 1058 return extract_range(p, p->curr, S_LEN(p)); 1059} 1060 1061/* 1062 * <tt>s.rest_size</tt> is equivalent to <tt>s.rest.size</tt>. 
1063 */ 1064static VALUE 1065strscan_rest_size(VALUE self) 1066{ 1067 struct strscanner *p; 1068 long i; 1069 1070 GET_SCANNER(self, p); 1071 if (EOS_P(p)) { 1072 return INT2FIX(0); 1073 } 1074 i = S_LEN(p) - p->curr; 1075 return INT2FIX(i); 1076} 1077 1078/* 1079 * <tt>s.restsize</tt> is equivalent to <tt>s.rest_size</tt>. 1080 * This method is obsolete; use #rest_size instead. 1081 */ 1082static VALUE 1083strscan_restsize(VALUE self) 1084{ 1085 rb_warning("StringScanner#restsize is obsolete; use #rest_size instead"); 1086 return strscan_rest_size(self); 1087} 1088 1089#define INSPECT_LENGTH 5 1090#define BUFSIZE 256 1091 1092/* 1093 * Returns a string that represents the StringScanner object, showing: 1094 * - the current position 1095 * - the size of the string 1096 * - the characters surrounding the scan pointer 1097 * 1098 * s = StringScanner.new("Fri Dec 12 1975 14:39") 1099 * s.inspect # -> '#<StringScanner 0/21 @ "Fri D...">' 1100 * s.scan_until /12/ # -> "Fri Dec 12" 1101 * s.inspect # -> '#<StringScanner 10/21 "...ec 12" @ " 1975...">' 1102 */ 1103static VALUE 1104strscan_inspect(VALUE self) 1105{ 1106 struct strscanner *p; 1107 VALUE a, b; 1108 1109 p = check_strscan(self); 1110 if (NIL_P(p->str)) { 1111 a = rb_sprintf("#<%"PRIsVALUE" (uninitialized)>", rb_obj_class(self)); 1112 return infect(a, p); 1113 } 1114 if (EOS_P(p)) { 1115 a = rb_sprintf("#<%"PRIsVALUE" fin>", rb_obj_class(self)); 1116 return infect(a, p); 1117 } 1118 if (p->curr == 0) { 1119 b = inspect2(p); 1120 a = rb_sprintf("#<%"PRIsVALUE" %ld/%ld @ %"PRIsVALUE">", 1121 rb_obj_class(self), 1122 p->curr, S_LEN(p), 1123 b); 1124 return infect(a, p); 1125 } 1126 a = inspect1(p); 1127 b = inspect2(p); 1128 a = rb_sprintf("#<%"PRIsVALUE" %ld/%ld %"PRIsVALUE" @ %"PRIsVALUE">", 1129 rb_obj_class(self), 1130 p->curr, S_LEN(p), 1131 a, b); 1132 return infect(a, p); 1133} 1134 1135static VALUE 1136inspect1(struct strscanner *p) 1137{ 1138 VALUE str; 1139 long len; 1140 1141 if (p->curr == 0) return 
rb_str_new2(""); 1142 if (p->curr > INSPECT_LENGTH) { 1143 str = rb_str_new_cstr("..."); 1144 len = INSPECT_LENGTH; 1145 } 1146 else { 1147 str = rb_str_new(0, 0); 1148 len = p->curr; 1149 } 1150 rb_str_cat(str, CURPTR(p) - len, len); 1151 return rb_str_dump(str); 1152} 1153 1154static VALUE 1155inspect2(struct strscanner *p) 1156{ 1157 VALUE str; 1158 long len; 1159 1160 if (EOS_P(p)) return rb_str_new2(""); 1161 len = S_LEN(p) - p->curr; 1162 if (len > INSPECT_LENGTH) { 1163 str = rb_str_new(CURPTR(p), INSPECT_LENGTH); 1164 rb_str_cat2(str, "..."); 1165 } 1166 else { 1167 str = rb_str_new(CURPTR(p), len); 1168 } 1169 return rb_str_dump(str); 1170} 1171 1172/* ======================================================================= 1173 Ruby Interface 1174 ======================================================================= */ 1175 1176/* 1177 * Document-class: StringScanner 1178 * 1179 * StringScanner provides for lexical scanning operations on a String. Here is 1180 * an example of its usage: 1181 * 1182 * s = StringScanner.new('This is an example string') 1183 * s.eos? # -> false 1184 * 1185 * p s.scan(/\w+/) # -> "This" 1186 * p s.scan(/\w+/) # -> nil 1187 * p s.scan(/\s+/) # -> " " 1188 * p s.scan(/\s+/) # -> nil 1189 * p s.scan(/\w+/) # -> "is" 1190 * s.eos? # -> false 1191 * 1192 * p s.scan(/\s+/) # -> " " 1193 * p s.scan(/\w+/) # -> "an" 1194 * p s.scan(/\s+/) # -> " " 1195 * p s.scan(/\w+/) # -> "example" 1196 * p s.scan(/\s+/) # -> " " 1197 * p s.scan(/\w+/) # -> "string" 1198 * s.eos? # -> true 1199 * 1200 * p s.scan(/\s+/) # -> nil 1201 * p s.scan(/\w+/) # -> nil 1202 * 1203 * Scanning a string means remembering the position of a <i>scan pointer</i>, 1204 * which is just an index. The point of scanning is to move forward a bit at 1205 * a time, so matches are sought after the scan pointer; usually immediately 1206 * after it. 
 *
 * Given the string "test string", here are the pertinent scan pointer
 * positions:
 *
 *     t e s t   s t r i n g
 *   0 1 2 ...             1
 *                         0
 *
 * When you #scan for a pattern (a regular expression), the match must occur
 * at the character after the scan pointer.  If you use #scan_until, then the
 * match can occur anywhere after the scan pointer.  In both cases, the scan
 * pointer moves <i>just beyond</i> the last character of the match, ready to
 * scan again from the next character onwards.  This is demonstrated by the
 * example above.
 *
 * == Method Categories
 *
 * There are other methods besides the plain scanners.  You can look ahead in
 * the string without actually scanning.  You can access the most recent match.
 * You can modify the string being scanned, reset or terminate the scanner,
 * find out or change the position of the scan pointer, skip ahead, and so on.
 *
 * === Advancing the Scan Pointer
 *
 * - #getch
 * - #get_byte
 * - #scan
 * - #scan_until
 * - #skip
 * - #skip_until
 *
 * === Looking Ahead
 *
 * - #check
 * - #check_until
 * - #exist?
 * - #match?
 * - #peek
 *
 * === Finding Where we Are
 *
 * - #beginning_of_line? (#bol?)
 * - #eos?
 * - #rest?
 * - #rest_size
 * - #pos
 *
 * === Setting Where we Are
 *
 * - #reset
 * - #terminate
 * - #pos=
 *
 * === Match Data
 *
 * - #matched
 * - #matched?
 * - #matched_size
 * - []
 * - #pre_match
 * - #post_match
 *
 * === Miscellaneous
 *
 * - <<
 * - #concat
 * - #string
 * - #string=
 * - #unscan
 *
 * There are aliases to several of the methods.
1278 */ 1279void 1280Init_strscan() 1281{ 1282 ID id_scanerr = rb_intern("ScanError"); 1283 VALUE tmp; 1284 1285 id_byteslice = rb_intern("byteslice"); 1286 1287 StringScanner = rb_define_class("StringScanner", rb_cObject); 1288 ScanError = rb_define_class_under(StringScanner, "Error", rb_eStandardError); 1289 if (!rb_const_defined(rb_cObject, id_scanerr)) { 1290 rb_const_set(rb_cObject, id_scanerr, ScanError); 1291 } 1292 tmp = rb_str_new2(STRSCAN_VERSION); 1293 rb_obj_freeze(tmp); 1294 rb_const_set(StringScanner, rb_intern("Version"), tmp); 1295 tmp = rb_str_new2("$Id: strscan.c 44659 2014-01-19 16:28:53Z nagachika $"); 1296 rb_obj_freeze(tmp); 1297 rb_const_set(StringScanner, rb_intern("Id"), tmp); 1298 1299 rb_define_alloc_func(StringScanner, strscan_s_allocate); 1300 rb_define_private_method(StringScanner, "initialize", strscan_initialize, -1); 1301 rb_define_private_method(StringScanner, "initialize_copy", strscan_init_copy, 1); 1302 rb_define_singleton_method(StringScanner, "must_C_version", strscan_s_mustc, 0); 1303 rb_define_method(StringScanner, "reset", strscan_reset, 0); 1304 rb_define_method(StringScanner, "terminate", strscan_terminate, 0); 1305 rb_define_method(StringScanner, "clear", strscan_clear, 0); 1306 rb_define_method(StringScanner, "string", strscan_get_string, 0); 1307 rb_define_method(StringScanner, "string=", strscan_set_string, 1); 1308 rb_define_method(StringScanner, "concat", strscan_concat, 1); 1309 rb_define_method(StringScanner, "<<", strscan_concat, 1); 1310 rb_define_method(StringScanner, "pos", strscan_get_pos, 0); 1311 rb_define_method(StringScanner, "pos=", strscan_set_pos, 1); 1312 rb_define_method(StringScanner, "charpos", strscan_get_charpos, 0); 1313 rb_define_method(StringScanner, "pointer", strscan_get_pos, 0); 1314 rb_define_method(StringScanner, "pointer=", strscan_set_pos, 1); 1315 1316 rb_define_method(StringScanner, "scan", strscan_scan, 1); 1317 rb_define_method(StringScanner, "skip", strscan_skip, 1); 1318 
rb_define_method(StringScanner, "match?", strscan_match_p, 1); 1319 rb_define_method(StringScanner, "check", strscan_check, 1); 1320 rb_define_method(StringScanner, "scan_full", strscan_scan_full, 3); 1321 1322 rb_define_method(StringScanner, "scan_until", strscan_scan_until, 1); 1323 rb_define_method(StringScanner, "skip_until", strscan_skip_until, 1); 1324 rb_define_method(StringScanner, "exist?", strscan_exist_p, 1); 1325 rb_define_method(StringScanner, "check_until", strscan_check_until, 1); 1326 rb_define_method(StringScanner, "search_full", strscan_search_full, 3); 1327 1328 rb_define_method(StringScanner, "getch", strscan_getch, 0); 1329 rb_define_method(StringScanner, "get_byte", strscan_get_byte, 0); 1330 rb_define_method(StringScanner, "getbyte", strscan_getbyte, 0); 1331 rb_define_method(StringScanner, "peek", strscan_peek, 1); 1332 rb_define_method(StringScanner, "peep", strscan_peep, 1); 1333 1334 rb_define_method(StringScanner, "unscan", strscan_unscan, 0); 1335 1336 rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0); 1337 rb_alias(StringScanner, rb_intern("bol?"), rb_intern("beginning_of_line?")); 1338 rb_define_method(StringScanner, "eos?", strscan_eos_p, 0); 1339 rb_define_method(StringScanner, "empty?", strscan_empty_p, 0); 1340 rb_define_method(StringScanner, "rest?", strscan_rest_p, 0); 1341 1342 rb_define_method(StringScanner, "matched?", strscan_matched_p, 0); 1343 rb_define_method(StringScanner, "matched", strscan_matched, 0); 1344 rb_define_method(StringScanner, "matched_size", strscan_matched_size, 0); 1345 rb_define_method(StringScanner, "[]", strscan_aref, 1); 1346 rb_define_method(StringScanner, "pre_match", strscan_pre_match, 0); 1347 rb_define_method(StringScanner, "post_match", strscan_post_match, 0); 1348 1349 rb_define_method(StringScanner, "rest", strscan_rest, 0); 1350 rb_define_method(StringScanner, "rest_size", strscan_rest_size, 0); 1351 rb_define_method(StringScanner, "restsize", strscan_restsize, 0); 1352 
1353 rb_define_method(StringScanner, "inspect", strscan_inspect, 0); 1354} 1355