1/*
2    $Id: strscan.c 44659 2014-01-19 16:28:53Z nagachika $
3
4    Copyright (c) 1999-2006 Minero Aoki
5
6    This program is free software.
7    You can distribute/modify this program under the terms of
8    the Ruby License. For details, see the file COPYING.
9*/
10
11#include "ruby/ruby.h"
12#include "ruby/re.h"
13#include "ruby/encoding.h"
14#include "regint.h"
15
16#define STRSCAN_VERSION "0.7.0"
17
18/* =======================================================================
19                         Data Type Definitions
20   ======================================================================= */
21
/* Holds the StringScanner class object (presumably assigned by the
 * extension's init code, which is outside this part of the file). */
static VALUE StringScanner;
/* Exception class passed to rb_raise() when a scan operation fails,
 * e.g. an unscan without a previous match. */
static VALUE ScanError;
/* Method ID used by strscan_get_charpos() to call byteslice on the
 * scanned string. */
static ID id_byteslice;
25
/*
 * Per-object state of a StringScanner: the string being scanned, the
 * current and previous scan offsets, and the registers of the last match.
 */
struct strscanner
{
    /* multi-purpose flags */
    unsigned long flags;
#define FLAG_MATCHED (1 << 0)   /* set while the last scan operation matched */

    /* the string to scan */
    VALUE str;

    /* scan pointers (byte offsets from the start of str) */
    long prev;   /* legal only when MATCHED_P(s) */
    long curr;   /* always legal */

    /* the regexp register; legal only when MATCHED_P(s) */
    struct re_registers regs;
};
42
/* Match-status helpers: test, set and clear FLAG_MATCHED on scanner s. */
#define MATCHED_P(s)          ((s)->flags & FLAG_MATCHED)
#define MATCHED(s)            ((s)->flags |= FLAG_MATCHED)
#define CLEAR_MATCH_STATUS(s) ((s)->flags &= ~FLAG_MATCHED)

/* Accessors for the scanned string of scanner s (all sizes in bytes). */
#define S_PBEG(s)  (RSTRING_PTR((s)->str))      /* first byte */
#define S_LEN(s)  (RSTRING_LEN((s)->str))       /* total length */
#define S_PEND(s)  (S_PBEG(s) + S_LEN(s))       /* one past the last byte */
#define CURPTR(s) (S_PBEG(s) + (s)->curr)       /* byte at the scan pointer */
#define S_RESTLEN(s) (S_LEN(s) - (s)->curr)     /* bytes after the scan pointer */

/*
 * True when the scan pointer is at or beyond the end of the string.
 * Uses only its argument, so it works whatever the caller's variable
 * is called.
 */
#define EOS_P(s) ((s)->curr >= S_LEN(s))

/*
 * Fetch the struct strscanner wrapped by obj into var, raising
 * ArgumentError if no string has been assigned to the scanner yet.
 */
#define GET_SCANNER(obj,var) do {\
    (var) = check_strscan(obj);\
    if (NIL_P((var)->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");\
} while (0)
59
60/* =======================================================================
61                            Function Prototypes
62   ======================================================================= */
63
/*
 * Forward declarations for the static functions defined below.
 * The _((...)) wrapper is not defined in this file; it presumably comes
 * from the Ruby headers as the usual prototype-wrapping macro.
 * Not every static function in this file is listed here (for example
 * str_new and strscan_reset have no prototype).
 */
static VALUE infect _((VALUE str, struct strscanner *p));
static VALUE extract_range _((struct strscanner *p, long beg_i, long end_i));
static VALUE extract_beg_len _((struct strscanner *p, long beg_i, long len));

/* object life cycle */
static struct strscanner *check_strscan _((VALUE obj));
static void strscan_mark _((void *p));
static void strscan_free _((void *p));
static size_t strscan_memsize _((const void *p));
static VALUE strscan_s_allocate _((VALUE klass));
static VALUE strscan_initialize _((int argc, VALUE *argv, VALUE self));
static VALUE strscan_init_copy _((VALUE vself, VALUE vorig));

/* instance methods */
static VALUE strscan_s_mustc _((VALUE self));
static VALUE strscan_terminate _((VALUE self));
static VALUE strscan_clear _((VALUE self));
static VALUE strscan_get_string _((VALUE self));
static VALUE strscan_set_string _((VALUE self, VALUE str));
static VALUE strscan_concat _((VALUE self, VALUE str));
static VALUE strscan_get_pos _((VALUE self));
static VALUE strscan_set_pos _((VALUE self, VALUE pos));
static VALUE strscan_do_scan _((VALUE self, VALUE regex,
                                int succptr, int getstr, int headonly));
static VALUE strscan_scan _((VALUE self, VALUE re));
static VALUE strscan_match_p _((VALUE self, VALUE re));
static VALUE strscan_skip _((VALUE self, VALUE re));
static VALUE strscan_check _((VALUE self, VALUE re));
static VALUE strscan_scan_full _((VALUE self, VALUE re,
                                  VALUE succp, VALUE getp));
static VALUE strscan_scan_until _((VALUE self, VALUE re));
static VALUE strscan_skip_until _((VALUE self, VALUE re));
static VALUE strscan_check_until _((VALUE self, VALUE re));
static VALUE strscan_search_full _((VALUE self, VALUE re,
                                    VALUE succp, VALUE getp));
static void adjust_registers_to_matched _((struct strscanner *p));
static VALUE strscan_getch _((VALUE self));
static VALUE strscan_get_byte _((VALUE self));
static VALUE strscan_getbyte _((VALUE self));
static VALUE strscan_peek _((VALUE self, VALUE len));
static VALUE strscan_peep _((VALUE self, VALUE len));
static VALUE strscan_unscan _((VALUE self));
static VALUE strscan_bol_p _((VALUE self));
static VALUE strscan_eos_p _((VALUE self));
static VALUE strscan_empty_p _((VALUE self));
static VALUE strscan_rest_p _((VALUE self));
static VALUE strscan_matched_p _((VALUE self));
static VALUE strscan_matched _((VALUE self));
static VALUE strscan_matched_size _((VALUE self));
static VALUE strscan_aref _((VALUE self, VALUE idx));
static VALUE strscan_pre_match _((VALUE self));
static VALUE strscan_post_match _((VALUE self));
static VALUE strscan_rest _((VALUE self));
static VALUE strscan_rest_size _((VALUE self));

/* inspection helpers */
static VALUE strscan_inspect _((VALUE self));
static VALUE inspect1 _((struct strscanner *p));
static VALUE inspect2 _((struct strscanner *p));
120
121/* =======================================================================
122                                   Utils
123   ======================================================================= */
124
125static VALUE
126infect(VALUE str, struct strscanner *p)
127{
128    OBJ_INFECT(str, p->str);
129    return str;
130}
131
132static VALUE
133str_new(struct strscanner *p, const char *ptr, long len)
134{
135    VALUE str = rb_str_new(ptr, len);
136    rb_enc_copy(str, p->str);
137    return str;
138}
139
140static VALUE
141extract_range(struct strscanner *p, long beg_i, long end_i)
142{
143    if (beg_i > S_LEN(p)) return Qnil;
144    if (end_i > S_LEN(p))
145        end_i = S_LEN(p);
146    return infect(str_new(p, S_PBEG(p) + beg_i, end_i - beg_i), p);
147}
148
149static VALUE
150extract_beg_len(struct strscanner *p, long beg_i, long len)
151{
152    if (beg_i > S_LEN(p)) return Qnil;
153    if (beg_i + len > S_LEN(p))
154        len = S_LEN(p) - beg_i;
155    return infect(str_new(p, S_PBEG(p) + beg_i, len), p);
156}
157
158/* =======================================================================
159                               Constructor
160   ======================================================================= */
161
162static void
163strscan_mark(void *ptr)
164{
165    struct strscanner *p = ptr;
166    rb_gc_mark(p->str);
167}
168
169static void
170strscan_free(void *ptr)
171{
172    struct strscanner *p = ptr;
173    onig_region_free(&(p->regs), 0);
174    ruby_xfree(p);
175}
176
177static size_t
178strscan_memsize(const void *ptr)
179{
180    const struct strscanner *p = ptr;
181    size_t size = 0;
182    if (p) {
183	size = sizeof(*p) - sizeof(p->regs) + onig_region_memsize(&p->regs);
184    }
185    return size;
186}
187
/*
 * Typed-data descriptor for StringScanner objects: the type name followed
 * by the mark, free and memsize callbacks defined above.  Used by
 * strscan_s_allocate() to wrap and by check_strscan() to unwrap.
 */
static const rb_data_type_t strscanner_type = {
    "StringScanner",
    {strscan_mark, strscan_free, strscan_memsize}
};
192
193static VALUE
194strscan_s_allocate(VALUE klass)
195{
196    struct strscanner *p;
197
198    p = ALLOC(struct strscanner);
199    MEMZERO(p, struct strscanner, 1);
200    CLEAR_MATCH_STATUS(p);
201    onig_region_init(&(p->regs));
202    p->str = Qnil;
203    return TypedData_Wrap_Struct(klass, &strscanner_type, p);
204}
205
206/*
207 * call-seq: StringScanner.new(string, dup = false)
208 *
209 * Creates a new StringScanner object to scan over the given +string+.
210 * +dup+ argument is obsolete and not used now.
211 */
212static VALUE
213strscan_initialize(int argc, VALUE *argv, VALUE self)
214{
215    struct strscanner *p;
216    VALUE str, need_dup;
217
218    p = check_strscan(self);
219    rb_scan_args(argc, argv, "11", &str, &need_dup);
220    StringValue(str);
221    p->str = str;
222
223    return self;
224}
225
226static struct strscanner *
227check_strscan(VALUE obj)
228{
229    return rb_check_typeddata(obj, &strscanner_type);
230}
231
232/*
233 * call-seq:
234 *   dup
235 *   clone
236 *
237 * Duplicates a StringScanner object.
238 */
239static VALUE
240strscan_init_copy(VALUE vself, VALUE vorig)
241{
242    struct strscanner *self, *orig;
243
244    self = check_strscan(vself);
245    orig = check_strscan(vorig);
246    if (self != orig) {
247	self->flags = orig->flags;
248	self->str = orig->str;
249	self->prev = orig->prev;
250	self->curr = orig->curr;
251	onig_region_copy(&self->regs, &orig->regs);
252    }
253
254    return vself;
255}
256
257/* =======================================================================
258                          Instance Methods
259   ======================================================================= */
260
/*
 * call-seq: StringScanner.must_C_version
 *
 * This method is defined for backward compatibility.
 * It does nothing and returns the receiver.
 */
static VALUE
strscan_s_mustc(VALUE self)
{
    return self;
}
271
272/*
273 * Reset the scan pointer (index 0) and clear matching data.
274 */
275static VALUE
276strscan_reset(VALUE self)
277{
278    struct strscanner *p;
279
280    GET_SCANNER(self, p);
281    p->curr = 0;
282    CLEAR_MATCH_STATUS(p);
283    return self;
284}
285
286/*
287 * call-seq:
288 *   terminate
289 *   clear
290 *
291 * Set the scan pointer to the end of the string and clear matching data.
292 */
293static VALUE
294strscan_terminate(VALUE self)
295{
296    struct strscanner *p;
297
298    GET_SCANNER(self, p);
299    p->curr = S_LEN(p);
300    CLEAR_MATCH_STATUS(p);
301    return self;
302}
303
/*
 * Equivalent to #terminate.
 * This method is obsolete; use #terminate instead.
 * An obsolescence notice is issued through rb_warning() on every call.
 */
static VALUE
strscan_clear(VALUE self)
{
    rb_warning("StringScanner#clear is obsolete; use #terminate instead");
    return strscan_terminate(self);
}
314
315/*
316 * Returns the string being scanned.
317 */
318static VALUE
319strscan_get_string(VALUE self)
320{
321    struct strscanner *p;
322
323    GET_SCANNER(self, p);
324    return p->str;
325}
326
327/*
328 * call-seq: string=(str)
329 *
330 * Changes the string being scanned to +str+ and resets the scanner.
331 * Returns +str+.
332 */
333static VALUE
334strscan_set_string(VALUE self, VALUE str)
335{
336    struct strscanner *p = check_strscan(self);
337
338    StringValue(str);
339    p->str = str;
340    p->curr = 0;
341    CLEAR_MATCH_STATUS(p);
342    return str;
343}
344
/*
 * call-seq:
 *   concat(str)
 *   <<(str)
 *
 * Appends +str+ to the string being scanned.
 * This method does not affect scan pointer.
 * Returns the scanner itself.
 *
 *   s = StringScanner.new("Fri Dec 12 1975 14:39")
 *   s.scan(/Fri /)
 *   s << " +1000 GMT"
 *   s.string            # -> "Fri Dec 12 1975 14:39 +1000 GMT"
 *   s.scan(/Dec/)       # -> "Dec"
 */
static VALUE
strscan_concat(VALUE self, VALUE str)
{
    struct strscanner *p;

    GET_SCANNER(self, p);
    /* convert the argument first; p->str is read only afterwards */
    StringValue(str);
    rb_str_append(p->str, str);
    return self;
}
369
370/*
371 * Returns the byte position of the scan pointer.  In the 'reset' position, this
372 * value is zero.  In the 'terminated' position (i.e. the string is exhausted),
373 * this value is the bytesize of the string.
374 *
375 * In short, it's a 0-based index into bytes of the string.
376 *
377 *   s = StringScanner.new('test string')
378 *   s.pos               # -> 0
379 *   s.scan_until /str/  # -> "test str"
380 *   s.pos               # -> 8
381 *   s.terminate         # -> #<StringScanner fin>
382 *   s.pos               # -> 11
383 */
384static VALUE
385strscan_get_pos(VALUE self)
386{
387    struct strscanner *p;
388
389    GET_SCANNER(self, p);
390    return INT2FIX(p->curr);
391}
392
393/*
394 * Returns the character position of the scan pointer.  In the 'reset' position, this
395 * value is zero.  In the 'terminated' position (i.e. the string is exhausted),
396 * this value is the size of the string.
397 *
398 * In short, it's a 0-based index into the string.
399 *
400 *   s = StringScanner.new("abcädeföghi")
401 *   s.charpos           # -> 0
402 *   s.scan_until(/ä/)   # -> "abcä"
403 *   s.pos               # -> 5
404 *   s.charpos           # -> 4
405 */
406static VALUE
407strscan_get_charpos(VALUE self)
408{
409    struct strscanner *p;
410    VALUE substr;
411
412    GET_SCANNER(self, p);
413
414    substr = rb_funcall(p->str, id_byteslice, 2, INT2FIX(0), INT2NUM(p->curr));
415
416    return rb_str_length(substr);
417}
418
419/*
420 * call-seq: pos=(n)
421 *
422 * Set the byte position of the scan pointer.
423 *
424 *   s = StringScanner.new('test string')
425 *   s.pos = 7            # -> 7
426 *   s.rest               # -> "ring"
427 */
428static VALUE
429strscan_set_pos(VALUE self, VALUE v)
430{
431    struct strscanner *p;
432    long i;
433
434    GET_SCANNER(self, p);
435    i = NUM2INT(v);
436    if (i < 0) i += S_LEN(p);
437    if (i < 0) rb_raise(rb_eRangeError, "index out of range");
438    if (i > S_LEN(p)) rb_raise(rb_eRangeError, "index out of range");
439    p->curr = i;
440    return INT2NUM(i);
441}
442
/*
 * Common implementation behind all the scan/skip/check/search methods.
 *
 *   regex    - must be a Regexp (checked with Check_Type)
 *   succptr  - non-zero: advance the scan pointer past the match
 *   getstr   - non-zero: return the scanned string; zero: return its length
 *   headonly - non-zero: use onig_match at the scan pointer;
 *              zero: use onig_search over the rest of the string
 *
 * Returns nil when nothing matched.  On success the match flag is set,
 * prev is set to the old scan position, and the result covers the bytes
 * from the old scan position up to the end of the match (regs.end[0]).
 * Raises ScanError when the regexp engine returns -2.
 */
static VALUE
strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
{
    regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
    struct strscanner *p;
    regex_t *re;
    long ret;
    int tmpreg;

    Check_Type(regex, T_REGEXP);
    GET_SCANNER(self, p);

    CLEAR_MATCH_STATUS(p);
    /* scan pointer beyond the end of the string: nothing to match */
    if (S_RESTLEN(p) < 0) {
        return Qnil;
    }
    re = rb_reg_prepare_re(regex, p->str);
    /* tmpreg: rb_reg_prepare_re handed back something other than the
     * regex_t currently stored in the Regexp object */
    tmpreg = re != RREGEXP(regex)->ptr;
    if (!tmpreg) RREGEXP(regex)->usecnt++;

    if (headonly) {
        ret = onig_match(re, (UChar* )CURPTR(p),
                         (UChar* )(CURPTR(p) + S_RESTLEN(p)),
                         (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE);
    }
    else {
        ret = onig_search(re,
                          (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
                          (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
                          &(p->regs), ONIG_OPTION_NONE);
    }
    if (!tmpreg) RREGEXP(regex)->usecnt--;
    if (tmpreg) {
        /* stored regex_t still in use: discard the temporary one;
         * otherwise replace the stored one with it */
        if (RREGEXP(regex)->usecnt) {
            onig_free(re);
        }
        else {
            onig_free(RREGEXP(regex)->ptr);
            RREGEXP(regex)->ptr = re;
        }
    }

    if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
    if (ret < 0) {
        /* not matched */
        return Qnil;
    }

    MATCHED(p);
    p->prev = p->curr;
    if (succptr) {
        p->curr += p->regs.end[0];
    }
    if (getstr) {
        /* everything from the old scan position to the end of the match */
        return extract_beg_len(p, p->prev, p->regs.end[0]);
    }
    else {
        return INT2FIX(p->regs.end[0]);
    }
}
503
504/*
505 * call-seq: scan(pattern) => String
506 *
507 * Tries to match with +pattern+ at the current position. If there's a match,
508 * the scanner advances the "scan pointer" and returns the matched string.
509 * Otherwise, the scanner returns +nil+.
510 *
511 *   s = StringScanner.new('test string')
512 *   p s.scan(/\w+/)   # -> "test"
513 *   p s.scan(/\w+/)   # -> nil
514 *   p s.scan(/\s+/)   # -> " "
515 *   p s.scan(/\w+/)   # -> "string"
516 *   p s.scan(/./)     # -> nil
517 *
518 */
519static VALUE
520strscan_scan(VALUE self, VALUE re)
521{
522    return strscan_do_scan(self, re, 1, 1, 1);
523}
524
525/*
526 * call-seq: match?(pattern)
527 *
528 * Tests whether the given +pattern+ is matched from the current scan pointer.
529 * Returns the length of the match, or +nil+.  The scan pointer is not advanced.
530 *
531 *   s = StringScanner.new('test string')
532 *   p s.match?(/\w+/)   # -> 4
533 *   p s.match?(/\w+/)   # -> 4
534 *   p s.match?(/\s+/)   # -> nil
535 */
536static VALUE
537strscan_match_p(VALUE self, VALUE re)
538{
539    return strscan_do_scan(self, re, 0, 0, 1);
540}
541
542/*
543 * call-seq: skip(pattern)
544 *
545 * Attempts to skip over the given +pattern+ beginning with the scan pointer.
546 * If it matches, the scan pointer is advanced to the end of the match, and the
547 * length of the match is returned.  Otherwise, +nil+ is returned.
548 *
549 * It's similar to #scan, but without returning the matched string.
550 *
551 *   s = StringScanner.new('test string')
552 *   p s.skip(/\w+/)   # -> 4
553 *   p s.skip(/\w+/)   # -> nil
554 *   p s.skip(/\s+/)   # -> 1
555 *   p s.skip(/\w+/)   # -> 6
556 *   p s.skip(/./)     # -> nil
557 *
558 */
559static VALUE
560strscan_skip(VALUE self, VALUE re)
561{
562    return strscan_do_scan(self, re, 1, 0, 1);
563}
564
565/*
566 * call-seq: check(pattern)
567 *
568 * This returns the value that #scan would return, without advancing the scan
569 * pointer.  The match register is affected, though.
570 *
571 *   s = StringScanner.new("Fri Dec 12 1975 14:39")
572 *   s.check /Fri/               # -> "Fri"
573 *   s.pos                       # -> 0
574 *   s.matched                   # -> "Fri"
575 *   s.check /12/                # -> nil
576 *   s.matched                   # -> nil
577 *
578 * Mnemonic: it "checks" to see whether a #scan will return a value.
579 */
580static VALUE
581strscan_check(VALUE self, VALUE re)
582{
583    return strscan_do_scan(self, re, 0, 1, 1);
584}
585
586/*
587 * call-seq: scan_full(pattern, advance_pointer_p, return_string_p)
588 *
589 * Tests whether the given +pattern+ is matched from the current scan pointer.
590 * Advances the scan pointer if +advance_pointer_p+ is true.
591 * Returns the matched string if +return_string_p+ is true.
592 * The match register is affected.
593 *
594 * "full" means "#scan with full parameters".
595 */
596static VALUE
597strscan_scan_full(VALUE self, VALUE re, VALUE s, VALUE f)
598{
599    return strscan_do_scan(self, re, RTEST(s), RTEST(f), 1);
600}
601
602/*
603 * call-seq: scan_until(pattern)
604 *
605 * Scans the string _until_ the +pattern+ is matched.  Returns the substring up
606 * to and including the end of the match, advancing the scan pointer to that
607 * location. If there is no match, +nil+ is returned.
608 *
609 *   s = StringScanner.new("Fri Dec 12 1975 14:39")
610 *   s.scan_until(/1/)        # -> "Fri Dec 1"
611 *   s.pre_match              # -> "Fri Dec "
612 *   s.scan_until(/XYZ/)      # -> nil
613 */
614static VALUE
615strscan_scan_until(VALUE self, VALUE re)
616{
617    return strscan_do_scan(self, re, 1, 1, 0);
618}
619
620/*
621 * call-seq: exist?(pattern)
622 *
623 * Looks _ahead_ to see if the +pattern+ exists _anywhere_ in the string,
624 * without advancing the scan pointer.  This predicates whether a #scan_until
625 * will return a value.
626 *
627 *   s = StringScanner.new('test string')
628 *   s.exist? /s/            # -> 3
629 *   s.scan /test/           # -> "test"
630 *   s.exist? /s/            # -> 2
631 *   s.exist? /e/            # -> nil
632 */
633static VALUE
634strscan_exist_p(VALUE self, VALUE re)
635{
636    return strscan_do_scan(self, re, 0, 0, 0);
637}
638
639/*
640 * call-seq: skip_until(pattern)
641 *
642 * Advances the scan pointer until +pattern+ is matched and consumed.  Returns
643 * the number of bytes advanced, or +nil+ if no match was found.
644 *
645 * Look ahead to match +pattern+, and advance the scan pointer to the _end_
646 * of the match.  Return the number of characters advanced, or +nil+ if the
647 * match was unsuccessful.
648 *
649 * It's similar to #scan_until, but without returning the intervening string.
650 *
651 *   s = StringScanner.new("Fri Dec 12 1975 14:39")
652 *   s.skip_until /12/           # -> 10
653 *   s                           #
654 */
655static VALUE
656strscan_skip_until(VALUE self, VALUE re)
657{
658    return strscan_do_scan(self, re, 1, 0, 0);
659}
660
661/*
662 * call-seq: check_until(pattern)
663 *
664 * This returns the value that #scan_until would return, without advancing the
665 * scan pointer.  The match register is affected, though.
666 *
667 *   s = StringScanner.new("Fri Dec 12 1975 14:39")
668 *   s.check_until /12/          # -> "Fri Dec 12"
669 *   s.pos                       # -> 0
670 *   s.matched                   # -> 12
671 *
672 * Mnemonic: it "checks" to see whether a #scan_until will return a value.
673 */
674static VALUE
675strscan_check_until(VALUE self, VALUE re)
676{
677    return strscan_do_scan(self, re, 0, 1, 0);
678}
679
680/*
681 * call-seq: search_full(pattern, advance_pointer_p, return_string_p)
682 *
683 * Scans the string _until_ the +pattern+ is matched.
684 * Advances the scan pointer if +advance_pointer_p+, otherwise not.
685 * Returns the matched string if +return_string_p+ is true, otherwise
686 * returns the number of bytes advanced.
687 * This method does affect the match register.
688 */
689static VALUE
690strscan_search_full(VALUE self, VALUE re, VALUE s, VALUE f)
691{
692    return strscan_do_scan(self, re, RTEST(s), RTEST(f), 0);
693}
694
695static void
696adjust_registers_to_matched(struct strscanner *p)
697{
698    onig_region_clear(&(p->regs));
699    onig_region_set(&(p->regs), 0, 0, (int)(p->curr - p->prev));
700}
701
702/*
703 * Scans one character and returns it.
704 * This method is multibyte character sensitive.
705 *
706 *   s = StringScanner.new("ab")
707 *   s.getch           # => "a"
708 *   s.getch           # => "b"
709 *   s.getch           # => nil
710 *
711 *   $KCODE = 'EUC'
712 *   s = StringScanner.new("\244\242")
713 *   s.getch           # => "\244\242"   # Japanese hira-kana "A" in EUC-JP
714 *   s.getch           # => nil
715 */
716static VALUE
717strscan_getch(VALUE self)
718{
719    struct strscanner *p;
720    long len;
721
722    GET_SCANNER(self, p);
723    CLEAR_MATCH_STATUS(p);
724    if (EOS_P(p))
725        return Qnil;
726
727    len = rb_enc_mbclen(CURPTR(p), S_PEND(p), rb_enc_get(p->str));
728    if (p->curr + len > S_LEN(p)) {
729        len = S_LEN(p) - p->curr;
730    }
731    p->prev = p->curr;
732    p->curr += len;
733    MATCHED(p);
734    adjust_registers_to_matched(p);
735    return extract_range(p, p->prev + p->regs.beg[0],
736                            p->prev + p->regs.end[0]);
737}
738
739/*
740 * Scans one byte and returns it.
741 * This method is not multibyte character sensitive.
742 * See also: #getch.
743 *
744 *   s = StringScanner.new('ab')
745 *   s.get_byte         # => "a"
746 *   s.get_byte         # => "b"
747 *   s.get_byte         # => nil
748 *
749 *   $KCODE = 'EUC'
750 *   s = StringScanner.new("\244\242")
751 *   s.get_byte         # => "\244"
752 *   s.get_byte         # => "\242"
753 *   s.get_byte         # => nil
754 */
755static VALUE
756strscan_get_byte(VALUE self)
757{
758    struct strscanner *p;
759
760    GET_SCANNER(self, p);
761    CLEAR_MATCH_STATUS(p);
762    if (EOS_P(p))
763        return Qnil;
764
765    p->prev = p->curr;
766    p->curr++;
767    MATCHED(p);
768    adjust_registers_to_matched(p);
769    return extract_range(p, p->prev + p->regs.beg[0],
770                            p->prev + p->regs.end[0]);
771}
772
/*
 * Equivalent to #get_byte.
 * This method is obsolete; use #get_byte instead.
 * An obsolescence notice is issued through rb_warning() on every call.
 */
static VALUE
strscan_getbyte(VALUE self)
{
    rb_warning("StringScanner#getbyte is obsolete; use #get_byte instead");
    return strscan_get_byte(self);
}
783
784/*
785 * call-seq: peek(len)
786 *
787 * Extracts a string corresponding to <tt>string[pos,len]</tt>, without
788 * advancing the scan pointer.
789 *
790 *   s = StringScanner.new('test string')
791 *   s.peek(7)          # => "test st"
792 *   s.peek(7)          # => "test st"
793 *
794 */
795static VALUE
796strscan_peek(VALUE self, VALUE vlen)
797{
798    struct strscanner *p;
799    long len;
800
801    GET_SCANNER(self, p);
802
803    len = NUM2LONG(vlen);
804    if (EOS_P(p))
805        return infect(str_new(p, "", 0), p);
806
807    if (p->curr + len > S_LEN(p))
808        len = S_LEN(p) - p->curr;
809    return extract_beg_len(p, p->curr, len);
810}
811
/*
 * Equivalent to #peek.
 * This method is obsolete; use #peek instead.
 * An obsolescence notice is issued through rb_warning() on every call.
 */
static VALUE
strscan_peep(VALUE self, VALUE vlen)
{
    rb_warning("StringScanner#peep is obsolete; use #peek instead");
    return strscan_peek(self, vlen);
}
822
823/*
824 * Set the scan pointer to the previous position.  Only one previous position is
825 * remembered, and it changes with each scanning operation.
826 *
827 *   s = StringScanner.new('test string')
828 *   s.scan(/\w+/)        # => "test"
829 *   s.unscan
830 *   s.scan(/../)         # => "te"
831 *   s.scan(/\d/)         # => nil
832 *   s.unscan             # ScanError: unscan failed: previous match record not exist
833 */
834static VALUE
835strscan_unscan(VALUE self)
836{
837    struct strscanner *p;
838
839    GET_SCANNER(self, p);
840    if (! MATCHED_P(p))
841        rb_raise(ScanError, "unscan failed: previous match record not exist");
842    p->curr = p->prev;
843    CLEAR_MATCH_STATUS(p);
844    return self;
845}
846
847/*
848 * Returns +true+ iff the scan pointer is at the beginning of the line.
849 *
850 *   s = StringScanner.new("test\ntest\n")
851 *   s.bol?           # => true
852 *   s.scan(/te/)
853 *   s.bol?           # => false
854 *   s.scan(/st\n/)
855 *   s.bol?           # => true
856 *   s.terminate
857 *   s.bol?           # => true
858 */
859static VALUE
860strscan_bol_p(VALUE self)
861{
862    struct strscanner *p;
863
864    GET_SCANNER(self, p);
865    if (CURPTR(p) > S_PEND(p)) return Qnil;
866    if (p->curr == 0) return Qtrue;
867    return (*(CURPTR(p) - 1) == '\n') ? Qtrue : Qfalse;
868}
869
870/*
871 * Returns +true+ if the scan pointer is at the end of the string.
872 *
873 *   s = StringScanner.new('test string')
874 *   p s.eos?          # => false
875 *   s.scan(/test/)
876 *   p s.eos?          # => false
877 *   s.terminate
878 *   p s.eos?          # => true
879 */
880static VALUE
881strscan_eos_p(VALUE self)
882{
883    struct strscanner *p;
884
885    GET_SCANNER(self, p);
886    return EOS_P(p) ? Qtrue : Qfalse;
887}
888
/*
 * Equivalent to #eos?.
 * This method is obsolete, use #eos? instead.
 * An obsolescence notice is issued through rb_warning() on every call.
 */
static VALUE
strscan_empty_p(VALUE self)
{
    rb_warning("StringScanner#empty? is obsolete; use #eos? instead");
    return strscan_eos_p(self);
}
899
900/*
901 * Returns true iff there is more data in the string.  See #eos?.
902 * This method is obsolete; use #eos? instead.
903 *
904 *   s = StringScanner.new('test string')
905 *   s.eos?              # These two
906 *   s.rest?             # are opposites.
907 */
908static VALUE
909strscan_rest_p(VALUE self)
910{
911    struct strscanner *p;
912
913    GET_SCANNER(self, p);
914    return EOS_P(p) ? Qfalse : Qtrue;
915}
916
917/*
918 * Returns +true+ iff the last match was successful.
919 *
920 *   s = StringScanner.new('test string')
921 *   s.match?(/\w+/)     # => 4
922 *   s.matched?          # => true
923 *   s.match?(/\d+/)     # => nil
924 *   s.matched?          # => false
925 */
926static VALUE
927strscan_matched_p(VALUE self)
928{
929    struct strscanner *p;
930
931    GET_SCANNER(self, p);
932    return MATCHED_P(p) ? Qtrue : Qfalse;
933}
934
935/*
936 * Returns the last matched string.
937 *
938 *   s = StringScanner.new('test string')
939 *   s.match?(/\w+/)     # -> 4
940 *   s.matched           # -> "test"
941 */
942static VALUE
943strscan_matched(VALUE self)
944{
945    struct strscanner *p;
946
947    GET_SCANNER(self, p);
948    if (! MATCHED_P(p)) return Qnil;
949    return extract_range(p, p->prev + p->regs.beg[0],
950                            p->prev + p->regs.end[0]);
951}
952
953/*
954 * Returns the size of the most recent match (see #matched), or +nil+ if there
955 * was no recent match.
956 *
957 *   s = StringScanner.new('test string')
958 *   s.check /\w+/           # -> "test"
959 *   s.matched_size          # -> 4
960 *   s.check /\d+/           # -> nil
961 *   s.matched_size          # -> nil
962 */
963static VALUE
964strscan_matched_size(VALUE self)
965{
966    struct strscanner *p;
967
968    GET_SCANNER(self, p);
969    if (! MATCHED_P(p)) return Qnil;
970    return INT2NUM(p->regs.end[0] - p->regs.beg[0]);
971}
972
973/*
974 * call-seq: [](n)
975 *
976 * Return the n-th subgroup in the most recent match.
977 *
978 *   s = StringScanner.new("Fri Dec 12 1975 14:39")
979 *   s.scan(/(\w+) (\w+) (\d+) /)       # -> "Fri Dec 12 "
980 *   s[0]                               # -> "Fri Dec 12 "
981 *   s[1]                               # -> "Fri"
982 *   s[2]                               # -> "Dec"
983 *   s[3]                               # -> "12"
984 *   s.post_match                       # -> "1975 14:39"
985 *   s.pre_match                        # -> ""
986 */
987static VALUE
988strscan_aref(VALUE self, VALUE idx)
989{
990    struct strscanner *p;
991    long i;
992
993    GET_SCANNER(self, p);
994    if (! MATCHED_P(p))        return Qnil;
995
996    i = NUM2LONG(idx);
997    if (i < 0)
998        i += p->regs.num_regs;
999    if (i < 0)                 return Qnil;
1000    if (i >= p->regs.num_regs) return Qnil;
1001    if (p->regs.beg[i] == -1)  return Qnil;
1002
1003    return extract_range(p, p->prev + p->regs.beg[i],
1004                            p->prev + p->regs.end[i]);
1005}
1006
1007/*
1008 * Return the <i><b>pre</b>-match</i> (in the regular expression sense) of the last scan.
1009 *
1010 *   s = StringScanner.new('test string')
1011 *   s.scan(/\w+/)           # -> "test"
1012 *   s.scan(/\s+/)           # -> " "
1013 *   s.pre_match             # -> "test"
1014 *   s.post_match            # -> "string"
1015 */
1016static VALUE
1017strscan_pre_match(VALUE self)
1018{
1019    struct strscanner *p;
1020
1021    GET_SCANNER(self, p);
1022    if (! MATCHED_P(p)) return Qnil;
1023    return extract_range(p, 0, p->prev + p->regs.beg[0]);
1024}
1025
1026/*
1027 * Return the <i><b>post</b>-match</i> (in the regular expression sense) of the last scan.
1028 *
1029 *   s = StringScanner.new('test string')
1030 *   s.scan(/\w+/)           # -> "test"
1031 *   s.scan(/\s+/)           # -> " "
1032 *   s.pre_match             # -> "test"
1033 *   s.post_match            # -> "string"
1034 */
1035static VALUE
1036strscan_post_match(VALUE self)
1037{
1038    struct strscanner *p;
1039
1040    GET_SCANNER(self, p);
1041    if (! MATCHED_P(p)) return Qnil;
1042    return extract_range(p, p->prev + p->regs.end[0], S_LEN(p));
1043}
1044
1045/*
1046 * Returns the "rest" of the string (i.e. everything after the scan pointer).
1047 * If there is no more data (eos? = true), it returns <tt>""</tt>.
1048 */
1049static VALUE
1050strscan_rest(VALUE self)
1051{
1052    struct strscanner *p;
1053
1054    GET_SCANNER(self, p);
1055    if (EOS_P(p)) {
1056        return infect(str_new(p, "", 0), p);
1057    }
1058    return extract_range(p, p->curr, S_LEN(p));
1059}
1060
1061/*
1062 * <tt>s.rest_size</tt> is equivalent to <tt>s.rest.size</tt>.
1063 */
1064static VALUE
1065strscan_rest_size(VALUE self)
1066{
1067    struct strscanner *p;
1068    long i;
1069
1070    GET_SCANNER(self, p);
1071    if (EOS_P(p)) {
1072        return INT2FIX(0);
1073    }
1074    i = S_LEN(p) - p->curr;
1075    return INT2FIX(i);
1076}
1077
/*
 * <tt>s.restsize</tt> is equivalent to <tt>s.rest_size</tt>.
 * This method is obsolete; use #rest_size instead.
 * An obsolescence notice is issued through rb_warning() on every call.
 */
static VALUE
strscan_restsize(VALUE self)
{
    rb_warning("StringScanner#restsize is obsolete; use #rest_size instead");
    return strscan_rest_size(self);
}
1088
/* Maximum number of bytes shown on each side of the scan pointer by
 * inspect1() and inspect2(). */
#define INSPECT_LENGTH 5
/* NOTE(review): not referenced in the visible part of this file. */
#define BUFSIZE 256
1091
1092/*
1093 * Returns a string that represents the StringScanner object, showing:
1094 * - the current position
1095 * - the size of the string
1096 * - the characters surrounding the scan pointer
1097 *
1098 *   s = StringScanner.new("Fri Dec 12 1975 14:39")
1099 *   s.inspect            # -> '#<StringScanner 0/21 @ "Fri D...">'
1100 *   s.scan_until /12/    # -> "Fri Dec 12"
1101 *   s.inspect            # -> '#<StringScanner 10/21 "...ec 12" @ " 1975...">'
1102 */
1103static VALUE
1104strscan_inspect(VALUE self)
1105{
1106    struct strscanner *p;
1107    VALUE a, b;
1108
1109    p = check_strscan(self);
1110    if (NIL_P(p->str)) {
1111	a = rb_sprintf("#<%"PRIsVALUE" (uninitialized)>", rb_obj_class(self));
1112	return infect(a, p);
1113    }
1114    if (EOS_P(p)) {
1115	a = rb_sprintf("#<%"PRIsVALUE" fin>", rb_obj_class(self));
1116	return infect(a, p);
1117    }
1118    if (p->curr == 0) {
1119	b = inspect2(p);
1120	a = rb_sprintf("#<%"PRIsVALUE" %ld/%ld @ %"PRIsVALUE">",
1121		       rb_obj_class(self),
1122		       p->curr, S_LEN(p),
1123		       b);
1124	return infect(a, p);
1125    }
1126    a = inspect1(p);
1127    b = inspect2(p);
1128    a = rb_sprintf("#<%"PRIsVALUE" %ld/%ld %"PRIsVALUE" @ %"PRIsVALUE">",
1129		   rb_obj_class(self),
1130		   p->curr, S_LEN(p),
1131		   a, b);
1132    return infect(a, p);
1133}
1134
1135static VALUE
1136inspect1(struct strscanner *p)
1137{
1138    VALUE str;
1139    long len;
1140
1141    if (p->curr == 0) return rb_str_new2("");
1142    if (p->curr > INSPECT_LENGTH) {
1143	str = rb_str_new_cstr("...");
1144	len = INSPECT_LENGTH;
1145    }
1146    else {
1147	str = rb_str_new(0, 0);
1148	len = p->curr;
1149    }
1150    rb_str_cat(str, CURPTR(p) - len, len);
1151    return rb_str_dump(str);
1152}
1153
1154static VALUE
1155inspect2(struct strscanner *p)
1156{
1157    VALUE str;
1158    long len;
1159
1160    if (EOS_P(p)) return rb_str_new2("");
1161    len = S_LEN(p) - p->curr;
1162    if (len > INSPECT_LENGTH) {
1163	str = rb_str_new(CURPTR(p), INSPECT_LENGTH);
1164	rb_str_cat2(str, "...");
1165    }
1166    else {
1167	str = rb_str_new(CURPTR(p), len);
1168    }
1169    return rb_str_dump(str);
1170}
1171
1172/* =======================================================================
1173                              Ruby Interface
1174   ======================================================================= */
1175
1176/*
1177 * Document-class: StringScanner
1178 *
1179 * StringScanner provides for lexical scanning operations on a String.  Here is
1180 * an example of its usage:
1181 *
1182 *   s = StringScanner.new('This is an example string')
1183 *   s.eos?               # -> false
1184 *
1185 *   p s.scan(/\w+/)      # -> "This"
1186 *   p s.scan(/\w+/)      # -> nil
1187 *   p s.scan(/\s+/)      # -> " "
1188 *   p s.scan(/\s+/)      # -> nil
1189 *   p s.scan(/\w+/)      # -> "is"
1190 *   s.eos?               # -> false
1191 *
1192 *   p s.scan(/\s+/)      # -> " "
1193 *   p s.scan(/\w+/)      # -> "an"
1194 *   p s.scan(/\s+/)      # -> " "
1195 *   p s.scan(/\w+/)      # -> "example"
1196 *   p s.scan(/\s+/)      # -> " "
1197 *   p s.scan(/\w+/)      # -> "string"
1198 *   s.eos?               # -> true
1199 *
1200 *   p s.scan(/\s+/)      # -> nil
1201 *   p s.scan(/\w+/)      # -> nil
1202 *
1203 * Scanning a string means remembering the position of a <i>scan pointer</i>,
1204 * which is just an index.  The point of scanning is to move forward a bit at
1205 * a time, so matches are sought after the scan pointer; usually immediately
1206 * after it.
1207 *
1208 * Given the string "test string", here are the pertinent scan pointer
1209 * positions:
1210 *
1211 *     t e s t   s t r i n g
 *   0 1 2 ...             1
 *                         1
1214 *
1215 * When you #scan for a pattern (a regular expression), the match must occur
1216 * at the character after the scan pointer.  If you use #scan_until, then the
1217 * match can occur anywhere after the scan pointer.  In both cases, the scan
1218 * pointer moves <i>just beyond</i> the last character of the match, ready to
1219 * scan again from the next character onwards.  This is demonstrated by the
1220 * example above.
1221 *
1222 * == Method Categories
1223 *
1224 * There are other methods besides the plain scanners.  You can look ahead in
1225 * the string without actually scanning.  You can access the most recent match.
1226 * You can modify the string being scanned, reset or terminate the scanner,
1227 * find out or change the position of the scan pointer, skip ahead, and so on.
1228 *
1229 * === Advancing the Scan Pointer
1230 *
1231 * - #getch
1232 * - #get_byte
1233 * - #scan
1234 * - #scan_until
1235 * - #skip
1236 * - #skip_until
1237 *
1238 * === Looking Ahead
1239 *
1240 * - #check
1241 * - #check_until
1242 * - #exist?
1243 * - #match?
1244 * - #peek
1245 *
 * === Finding Where we Are
 *
 * - #beginning_of_line? (#bol?)
 * - #eos?
 * - #rest?
 * - #rest_size
 * - #pos
 * - #charpos
1253 *
1254 * === Setting Where we Are
1255 *
1256 * - #reset
1257 * - #terminate
1258 * - #pos=
1259 *
1260 * === Match Data
1261 *
1262 * - #matched
1263 * - #matched?
1264 * - #matched_size
1265 * - []
1266 * - #pre_match
1267 * - #post_match
1268 *
1269 * === Miscellaneous
1270 *
1271 * - <<
1272 * - #concat
1273 * - #string
1274 * - #string=
1275 * - #unscan
1276 *
1277 * There are aliases to several of the methods.
1278 */
1279void
1280Init_strscan()
1281{
1282    ID id_scanerr = rb_intern("ScanError");
1283    VALUE tmp;
1284
1285    id_byteslice = rb_intern("byteslice");
1286
1287    StringScanner = rb_define_class("StringScanner", rb_cObject);
1288    ScanError = rb_define_class_under(StringScanner, "Error", rb_eStandardError);
1289    if (!rb_const_defined(rb_cObject, id_scanerr)) {
1290	rb_const_set(rb_cObject, id_scanerr, ScanError);
1291    }
1292    tmp = rb_str_new2(STRSCAN_VERSION);
1293    rb_obj_freeze(tmp);
1294    rb_const_set(StringScanner, rb_intern("Version"), tmp);
1295    tmp = rb_str_new2("$Id: strscan.c 44659 2014-01-19 16:28:53Z nagachika $");
1296    rb_obj_freeze(tmp);
1297    rb_const_set(StringScanner, rb_intern("Id"), tmp);
1298
1299    rb_define_alloc_func(StringScanner, strscan_s_allocate);
1300    rb_define_private_method(StringScanner, "initialize", strscan_initialize, -1);
1301    rb_define_private_method(StringScanner, "initialize_copy", strscan_init_copy, 1);
1302    rb_define_singleton_method(StringScanner, "must_C_version", strscan_s_mustc, 0);
1303    rb_define_method(StringScanner, "reset",       strscan_reset,       0);
1304    rb_define_method(StringScanner, "terminate",   strscan_terminate,   0);
1305    rb_define_method(StringScanner, "clear",       strscan_clear,       0);
1306    rb_define_method(StringScanner, "string",      strscan_get_string,  0);
1307    rb_define_method(StringScanner, "string=",     strscan_set_string,  1);
1308    rb_define_method(StringScanner, "concat",      strscan_concat,      1);
1309    rb_define_method(StringScanner, "<<",          strscan_concat,      1);
1310    rb_define_method(StringScanner, "pos",         strscan_get_pos,     0);
1311    rb_define_method(StringScanner, "pos=",        strscan_set_pos,     1);
1312    rb_define_method(StringScanner, "charpos",     strscan_get_charpos, 0);
1313    rb_define_method(StringScanner, "pointer",     strscan_get_pos,     0);
1314    rb_define_method(StringScanner, "pointer=",    strscan_set_pos,     1);
1315
1316    rb_define_method(StringScanner, "scan",        strscan_scan,        1);
1317    rb_define_method(StringScanner, "skip",        strscan_skip,        1);
1318    rb_define_method(StringScanner, "match?",      strscan_match_p,     1);
1319    rb_define_method(StringScanner, "check",       strscan_check,       1);
1320    rb_define_method(StringScanner, "scan_full",   strscan_scan_full,   3);
1321
1322    rb_define_method(StringScanner, "scan_until",  strscan_scan_until,  1);
1323    rb_define_method(StringScanner, "skip_until",  strscan_skip_until,  1);
1324    rb_define_method(StringScanner, "exist?",      strscan_exist_p,     1);
1325    rb_define_method(StringScanner, "check_until", strscan_check_until, 1);
1326    rb_define_method(StringScanner, "search_full", strscan_search_full, 3);
1327
1328    rb_define_method(StringScanner, "getch",       strscan_getch,       0);
1329    rb_define_method(StringScanner, "get_byte",    strscan_get_byte,    0);
1330    rb_define_method(StringScanner, "getbyte",     strscan_getbyte,     0);
1331    rb_define_method(StringScanner, "peek",        strscan_peek,        1);
1332    rb_define_method(StringScanner, "peep",        strscan_peep,        1);
1333
1334    rb_define_method(StringScanner, "unscan",      strscan_unscan,      0);
1335
1336    rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0);
1337    rb_alias(StringScanner, rb_intern("bol?"), rb_intern("beginning_of_line?"));
1338    rb_define_method(StringScanner, "eos?",        strscan_eos_p,       0);
1339    rb_define_method(StringScanner, "empty?",      strscan_empty_p,     0);
1340    rb_define_method(StringScanner, "rest?",       strscan_rest_p,      0);
1341
1342    rb_define_method(StringScanner, "matched?",    strscan_matched_p,   0);
1343    rb_define_method(StringScanner, "matched",     strscan_matched,     0);
1344    rb_define_method(StringScanner, "matched_size", strscan_matched_size, 0);
1345    rb_define_method(StringScanner, "[]",          strscan_aref,        1);
1346    rb_define_method(StringScanner, "pre_match",   strscan_pre_match,   0);
1347    rb_define_method(StringScanner, "post_match",  strscan_post_match,  0);
1348
1349    rb_define_method(StringScanner, "rest",        strscan_rest,        0);
1350    rb_define_method(StringScanner, "rest_size",   strscan_rest_size,   0);
1351    rb_define_method(StringScanner, "restsize",    strscan_restsize,    0);
1352
1353    rb_define_method(StringScanner, "inspect",     strscan_inspect,     0);
1354}
1355