1/********************************************************************** 2 3 re.c - 4 5 $Author: marcandre $ 6 created at: Mon Aug 9 18:24:49 JST 1993 7 8 Copyright (C) 1993-2007 Yukihiro Matsumoto 9 10**********************************************************************/ 11 12#include "ruby/ruby.h" 13#include "ruby/re.h" 14#include "ruby/encoding.h" 15#include "ruby/util.h" 16#include "internal.h" 17#include "regint.h" 18#include <ctype.h> 19 20VALUE rb_eRegexpError; 21 22typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN]; 23#define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN) 24 25#define BEG(no) (regs->beg[(no)]) 26#define END(no) (regs->end[(no)]) 27 28#if 'a' == 97 /* it's ascii */ 29static const char casetable[] = { 30 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', 31 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', 32 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', 33 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', 34 /* ' ' '!' '"' '#' '$' '%' '&' ''' */ 35 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', 36 /* '(' ')' '*' '+' ',' '-' '.' '/' */ 37 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', 38 /* '0' '1' '2' '3' '4' '5' '6' '7' */ 39 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', 40 /* '8' '9' ':' ';' '<' '=' '>' '?' */ 41 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', 42 /* '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' */ 43 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 44 /* 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' */ 45 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 46 /* 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' */ 47 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 48 /* 'X' 'Y' 'Z' '[' '\' ']' '^' '_' */ 49 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', 50 /* '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' */ 51 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 52 /* 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' */ 53 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 54 /* 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' */ 55 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 56 /* 'x' 'y' 'z' '{' '|' '}' '~' */ 57 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', 58 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', 59 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', 60 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', 61 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', 62 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', 63 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', 64 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', 65 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', 66 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', 67 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', 68 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', 69 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', 70 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 71 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 72 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', 73 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', 74}; 75#else 76# error >>> "You lose. You will need a translation table for your character set." <<< 77#endif 78 79int 80rb_memcicmp(const void *x, const void *y, long len) 81{ 82 const unsigned char *p1 = x, *p2 = y; 83 int tmp; 84 85 while (len--) { 86 if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++])) 87 return tmp; 88 } 89 return 0; 90} 91 92#undef rb_memcmp 93 94int 95rb_memcmp(const void *p1, const void *p2, long len) 96{ 97 return memcmp(p1, p2, len); 98} 99 100#ifdef HAVE_MEMMEM 101static inline long 102rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n) 103{ 104 const unsigned char *y; 105 106 if (y = memmem(ys, n, xs, m)) 107 return y - ys; 108 else 109 return -1; 110} 111#else 112static inline long 113rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n) 114{ 115 const unsigned char *x = xs, *xe = xs + m; 116 const unsigned char *y = ys, *ye = ys + n; 117#ifndef VALUE_MAX 118# if SIZEOF_VALUE == 8 119# define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL 120# elif SIZEOF_VALUE == 4 121# define VALUE_MAX 0xFFFFFFFFUL 122# endif 123#endif 124 VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT); 125 126 if (m > SIZEOF_VALUE) 127 rb_bug("!!too long pattern string!!"); 128 129 if (!(y = memchr(y, *x, n - m + 1))) 130 return -1; 131 132 /* Prepare hash value */ 133 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) { 134 hx <<= CHAR_BIT; 135 hy <<= CHAR_BIT; 136 hx |= *x; 137 hy |= *y; 138 } 139 /* Searching */ 140 while (hx != hy) { 141 if (y == ye) 142 return -1; 143 hy <<= CHAR_BIT; 144 hy |= *y; 145 hy &= mask; 146 y++; 147 } 148 return y - ys - m; 149} 150#endif 151 152static inline long 153rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n) 154{ 155 const unsigned char *x = xs, *xe = xs + m; 156 const unsigned char *y = ys; 157 VALUE i, qstable[256]; 158 159 /* Preprocessing */ 160 for (i = 0; i < 256; ++i) 161 qstable[i] = m + 1; 162 for (; x < xe; ++x) 163 qstable[*x] = xe - x; 164 /* Searching */ 165 for (; y + m <= ys + n; y += *(qstable + y[m])) { 166 if (*xs == *y && memcmp(xs, y, m) == 0) 167 return y - ys; 168 } 169 return -1; 170} 171 172static inline unsigned int 173rb_memsearch_qs_utf8_hash(const unsigned char *x) 174{ 175 register const unsigned int mix = 8353; 176 register unsigned int h = *x; 177 if (h < 0xC0) { 178 return h + 256; 179 } 180 else if (h < 0xE0) { 181 h *= mix; 182 h += x[1]; 183 } 184 else if (h < 0xF0) { 185 h *= mix; 186 h += x[1]; 187 h *= mix; 188 h += x[2]; 189 } 190 else if (h < 0xF5) { 191 h *= mix; 192 h += x[1]; 193 h *= mix; 194 h += x[2]; 195 h *= mix; 196 h += x[3]; 197 } 198 else { 199 return h + 256; 200 } 201 return (unsigned char)h; 202} 203 204static inline long 205rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n) 206{ 207 const unsigned char *x = xs, *xe = xs + m; 208 const unsigned char *y = ys; 209 VALUE i, qstable[512]; 210 211 /* Preprocessing */ 212 for (i = 0; i < 512; ++i) { 213 qstable[i] = m + 1; 214 } 215 for (; x < xe; ++x) { 216 qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x; 217 } 218 /* Searching */ 219 for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) { 220 if (*xs == *y && memcmp(xs, y, m) == 0) 221 return y - ys; 222 } 223 return -1; 224} 225 226long 227rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc) 228{ 229 const unsigned char *x = x0, *y = y0; 230 231 if (m > n) return -1; 232 else if (m == n) { 233 return memcmp(x0, y0, m) == 0 ? 0 : -1; 234 } 235 else if (m < 1) { 236 return 0; 237 } 238 else if (m == 1) { 239 const unsigned char *ys; 240 241 if (ys = memchr(y, *x, n)) 242 return ys - y; 243 else 244 return -1; 245 } 246 else if (m <= SIZEOF_VALUE) { 247 return rb_memsearch_ss(x0, m, y0, n); 248 } 249 else if (enc == rb_utf8_encoding()){ 250 return rb_memsearch_qs_utf8(x0, m, y0, n); 251 } 252 else { 253 return rb_memsearch_qs(x0, m, y0, n); 254 } 255} 256 257#define REG_LITERAL FL_USER5 258#define REG_ENCODING_NONE FL_USER6 259 260#define KCODE_FIXED FL_USER4 261 262#define ARG_REG_OPTION_MASK \ 263 (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND) 264#define ARG_ENCODING_FIXED 16 265#define ARG_ENCODING_NONE 32 266 267static int 268char_to_option(int c) 269{ 270 int val; 271 272 switch (c) { 273 case 'i': 274 val = ONIG_OPTION_IGNORECASE; 275 break; 276 case 'x': 277 val = ONIG_OPTION_EXTEND; 278 break; 279 case 'm': 280 val = ONIG_OPTION_MULTILINE; 281 break; 282 default: 283 val = 0; 284 break; 285 } 286 return val; 287} 288 289static char * 290option_to_str(char str[4], int options) 291{ 292 char *p = str; 293 if (options & ONIG_OPTION_MULTILINE) *p++ = 'm'; 294 if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i'; 295 if (options & ONIG_OPTION_EXTEND) *p++ = 'x'; 296 *p = 0; 297 return str; 298} 299 300extern int 301rb_char_to_option_kcode(int c, int *option, int *kcode) 302{ 303 *option = 0; 304 305 switch (c) { 306 case 'n': 307 *kcode = rb_ascii8bit_encindex(); 308 return (*option = ARG_ENCODING_NONE); 309 case 'e': 310 *kcode = rb_enc_find_index("EUC-JP"); 311 break; 312 case 's': 313 *kcode = rb_enc_find_index("Windows-31J"); 314 break; 315 case 'u': 316 *kcode = rb_utf8_encindex(); 317 break; 318 default: 319 *kcode = -1; 320 return (*option = char_to_option(c)); 321 } 322 *option = ARG_ENCODING_FIXED; 323 return 1; 324} 325 326static void 327rb_reg_check(VALUE re) 328{ 329 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) { 330 rb_raise(rb_eTypeError, "uninitialized Regexp"); 331 } 332} 333 334static void 335rb_reg_expr_str(VALUE str, const char *s, long len, 336 rb_encoding *enc, rb_encoding *resenc) 337{ 338 const char *p, *pend; 339 int cr = ENC_CODERANGE_UNKNOWN; 340 int need_escape = 0; 341 int c, clen; 342 343 p = s; pend = p + len; 344 rb_str_coderange_scan_restartable(p, pend, enc, &cr); 345 if (rb_enc_asciicompat(enc) && 346 (cr == ENC_CODERANGE_VALID || cr == ENC_CODERANGE_7BIT)) { 347 while (p < pend) { 348 c = rb_enc_ascget(p, pend, &clen, enc); 349 if (c == -1) { 350 if (enc == resenc) { 351 p += mbclen(p, pend, enc); 352 } 353 else { 354 need_escape = 1; 355 break; 356 } 357 } 358 else if (c != '/' && rb_enc_isprint(c, enc)) { 359 p += clen; 360 } 361 else { 362 need_escape = 1; 363 break; 364 } 365 } 366 } 367 else { 368 need_escape = 1; 369 } 370 371 if (!need_escape) { 372 rb_str_buf_cat(str, s, len); 373 } 374 else { 375 int unicode_p = rb_enc_unicode_p(enc); 376 p = s; 377 while (p<pend) { 378 c = rb_enc_ascget(p, pend, &clen, enc); 379 if (c == '\\' && p+clen < pend) { 380 int n = clen + mbclen(p+clen, pend, enc); 381 rb_str_buf_cat(str, p, n); 382 p += n; 383 continue; 384 } 385 else if (c == '/') { 386 char c = '\\'; 387 rb_str_buf_cat(str, &c, 1); 388 rb_str_buf_cat(str, p, clen); 389 } 390 else if (c == -1) { 391 clen = rb_enc_precise_mbclen(p, pend, enc); 392 if (!MBCLEN_CHARFOUND_P(clen)) { 393 c = (unsigned char)*p; 394 clen = 1; 395 goto hex; 396 } 397 if (resenc) { 398 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc); 399 rb_str_buf_cat_escaped_char(str, c, unicode_p); 400 } 401 else { 402 clen = MBCLEN_CHARFOUND_LEN(clen); 403 rb_str_buf_cat(str, p, clen); 404 } 405 } 406 else if (rb_enc_isprint(c, enc)) { 407 rb_str_buf_cat(str, p, clen); 408 } 409 else if (!rb_enc_isspace(c, enc)) { 410 char b[8]; 411 412 hex: 413 snprintf(b, sizeof(b), "\\x%02X", c); 414 rb_str_buf_cat(str, b, 4); 415 } 416 else { 417 rb_str_buf_cat(str, p, clen); 418 } 419 p += clen; 420 } 421 } 422} 423 424static VALUE 425rb_reg_desc(const char *s, long len, VALUE re) 426{ 427 rb_encoding *enc = rb_enc_get(re); 428 VALUE str = rb_str_buf_new2("/"); 429 rb_encoding *resenc = rb_default_internal_encoding(); 430 if (resenc == NULL) resenc = rb_default_external_encoding(); 431 432 if (re && rb_enc_asciicompat(enc)) { 433 rb_enc_copy(str, re); 434 } 435 else { 436 rb_enc_associate(str, rb_usascii_encoding()); 437 } 438 rb_reg_expr_str(str, s, len, enc, resenc); 439 rb_str_buf_cat2(str, "/"); 440 if (re) { 441 char opts[4]; 442 rb_reg_check(re); 443 if (*option_to_str(opts, RREGEXP(re)->ptr->options)) 444 rb_str_buf_cat2(str, opts); 445 if (RBASIC(re)->flags & REG_ENCODING_NONE) 446 rb_str_buf_cat2(str, "n"); 447 } 448 OBJ_INFECT(str, re); 449 return str; 450} 451 452 453/* 454 * call-seq: 455 * rxp.source -> str 456 * 457 * Returns the original string of the pattern. 458 * 459 * /ab+c/ix.source #=> "ab+c" 460 * 461 * Note that escape sequences are retained as is. 462 * 463 * /\x20\+/.source #=> "\\x20\\+" 464 * 465 */ 466 467static VALUE 468rb_reg_source(VALUE re) 469{ 470 VALUE str; 471 472 rb_reg_check(re); 473 str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re)); 474 if (OBJ_TAINTED(re)) OBJ_TAINT(str); 475 return str; 476} 477 478/* 479 * call-seq: 480 * rxp.inspect -> string 481 * 482 * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly, 483 * <code>#inspect</code> actually produces the more natural version of 484 * the string than <code>#to_s</code>. 485 * 486 * /ab+c/ix.inspect #=> "/ab+c/ix" 487 * 488 */ 489 490static VALUE 491rb_reg_inspect(VALUE re) 492{ 493 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) { 494 return rb_any_to_s(re); 495 } 496 return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re); 497} 498 499 500/* 501 * call-seq: 502 * rxp.to_s -> str 503 * 504 * Returns a string containing the regular expression and its options (using the 505 * <code>(?opts:source)</code> notation. This string can be fed back in to 506 * <code>Regexp::new</code> to a regular expression with the same semantics as 507 * the original. (However, <code>Regexp#==</code> may not return true when 508 * comparing the two, as the source of the regular expression itself may 509 * differ, as the example shows). <code>Regexp#inspect</code> produces a 510 * generally more readable version of <i>rxp</i>. 511 * 512 * r1 = /ab+c/ix #=> /ab+c/ix 513 * s1 = r1.to_s #=> "(?ix-m:ab+c)" 514 * r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/ 515 * r1 == r2 #=> false 516 * r1.source #=> "ab+c" 517 * r2.source #=> "(?ix-m:ab+c)" 518 */ 519 520static VALUE 521rb_reg_to_s(VALUE re) 522{ 523 int options, opt; 524 const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND; 525 long len; 526 const UChar* ptr; 527 VALUE str = rb_str_buf_new2("(?"); 528 char optbuf[5]; 529 rb_encoding *enc = rb_enc_get(re); 530 531 rb_reg_check(re); 532 533 rb_enc_copy(str, re); 534 options = RREGEXP(re)->ptr->options; 535 ptr = (UChar*)RREGEXP_SRC_PTR(re); 536 len = RREGEXP_SRC_LEN(re); 537 again: 538 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') { 539 int err = 1; 540 ptr += 2; 541 if ((len -= 2) > 0) { 542 do { 543 opt = char_to_option((int )*ptr); 544 if (opt != 0) { 545 options |= opt; 546 } 547 else { 548 break; 549 } 550 ++ptr; 551 } while (--len > 0); 552 } 553 if (len > 1 && *ptr == '-') { 554 ++ptr; 555 --len; 556 do { 557 opt = char_to_option((int )*ptr); 558 if (opt != 0) { 559 options &= ~opt; 560 } 561 else { 562 break; 563 } 564 ++ptr; 565 } while (--len > 0); 566 } 567 if (*ptr == ')') { 568 --len; 569 ++ptr; 570 goto again; 571 } 572 if (*ptr == ':' && ptr[len-1] == ')') { 573 Regexp *rp; 574 575 ++ptr; 576 len -= 2; 577 err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT, 578 enc, OnigDefaultSyntax, NULL); 579 onig_free(rp); 580 } 581 if (err) { 582 options = RREGEXP(re)->ptr->options; 583 ptr = (UChar*)RREGEXP_SRC_PTR(re); 584 len = RREGEXP_SRC_LEN(re); 585 } 586 } 587 588 if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf); 589 590 if ((options & embeddable) != embeddable) { 591 optbuf[0] = '-'; 592 option_to_str(optbuf + 1, ~options); 593 rb_str_buf_cat2(str, optbuf); 594 } 595 596 rb_str_buf_cat2(str, ":"); 597 rb_reg_expr_str(str, (char*)ptr, len, enc, NULL); 598 rb_str_buf_cat2(str, ")"); 599 rb_enc_copy(str, re); 600 601 OBJ_INFECT(str, re); 602 return str; 603} 604 605static void 606rb_reg_raise(const char *s, long len, const char *err, VALUE re) 607{ 608 volatile VALUE desc = rb_reg_desc(s, len, re); 609 610 rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc)); 611} 612 613static VALUE 614rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err) 615{ 616 char opts[6]; 617 VALUE desc = rb_str_buf_new2(err); 618 rb_encoding *resenc = rb_default_internal_encoding(); 619 if (resenc == NULL) resenc = rb_default_external_encoding(); 620 621 rb_enc_associate(desc, enc); 622 rb_str_buf_cat2(desc, ": /"); 623 rb_reg_expr_str(desc, s, len, enc, resenc); 624 opts[0] = '/'; 625 option_to_str(opts + 1, options); 626 rb_str_buf_cat2(desc, opts); 627 return rb_exc_new3(rb_eRegexpError, desc); 628} 629 630static void 631rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err) 632{ 633 rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err)); 634} 635 636static VALUE 637rb_reg_error_desc(VALUE str, int options, const char *err) 638{ 639 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str), 640 rb_enc_get(str), options, err); 641} 642 643static void 644rb_reg_raise_str(VALUE str, int options, const char *err) 645{ 646 rb_exc_raise(rb_reg_error_desc(str, options, err)); 647} 648 649 650/* 651 * call-seq: 652 * rxp.casefold? -> true or false 653 * 654 * Returns the value of the case-insensitive flag. 655 * 656 * /a/.casefold? #=> false 657 * /a/i.casefold? #=> true 658 * /(?i:a)/.casefold? #=> false 659 */ 660 661static VALUE 662rb_reg_casefold_p(VALUE re) 663{ 664 rb_reg_check(re); 665 if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue; 666 return Qfalse; 667} 668 669 670/* 671 * call-seq: 672 * rxp.options -> fixnum 673 * 674 * Returns the set of bits corresponding to the options used when creating this 675 * Regexp (see <code>Regexp::new</code> for details. Note that additional bits 676 * may be set in the returned options: these are used internally by the regular 677 * expression code. These extra bits are ignored if the options are passed to 678 * <code>Regexp::new</code>. 679 * 680 * Regexp::IGNORECASE #=> 1 681 * Regexp::EXTENDED #=> 2 682 * Regexp::MULTILINE #=> 4 683 * 684 * /cat/.options #=> 0 685 * /cat/ix.options #=> 3 686 * Regexp.new('cat', true).options #=> 1 687 * /\xa1\xa2/e.options #=> 16 688 * 689 * r = /cat/ix 690 * Regexp.new(r.source, r.options) #=> /cat/ix 691 */ 692 693static VALUE 694rb_reg_options_m(VALUE re) 695{ 696 int options = rb_reg_options(re); 697 return INT2NUM(options); 698} 699 700static int 701reg_names_iter(const OnigUChar *name, const OnigUChar *name_end, 702 int back_num, int *back_refs, OnigRegex regex, void *arg) 703{ 704 VALUE ary = (VALUE)arg; 705 rb_ary_push(ary, rb_str_new((const char *)name, name_end-name)); 706 return 0; 707} 708 709/* 710 * call-seq: 711 * rxp.names -> [name1, name2, ...] 712 * 713 * Returns a list of names of captures as an array of strings. 714 * 715 * /(?<foo>.)(?<bar>.)(?<baz>.)/.names 716 * #=> ["foo", "bar", "baz"] 717 * 718 * /(?<foo>.)(?<foo>.)/.names 719 * #=> ["foo"] 720 * 721 * /(.)(.)/.names 722 * #=> [] 723 */ 724 725static VALUE 726rb_reg_names(VALUE re) 727{ 728 VALUE ary = rb_ary_new(); 729 rb_reg_check(re); 730 onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary); 731 return ary; 732} 733 734static int 735reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end, 736 int back_num, int *back_refs, OnigRegex regex, void *arg) 737{ 738 VALUE hash = (VALUE)arg; 739 VALUE ary = rb_ary_new2(back_num); 740 int i; 741 742 for (i = 0; i < back_num; i++) 743 rb_ary_store(ary, i, INT2NUM(back_refs[i])); 744 745 rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary); 746 747 return 0; 748} 749 750/* 751 * call-seq: 752 * rxp.named_captures -> hash 753 * 754 * Returns a hash representing information about named captures of <i>rxp</i>. 755 * 756 * A key of the hash is a name of the named captures. 757 * A value of the hash is an array which is list of indexes of corresponding 758 * named captures. 759 * 760 * /(?<foo>.)(?<bar>.)/.named_captures 761 * #=> {"foo"=>[1], "bar"=>[2]} 762 * 763 * /(?<foo>.)(?<foo>.)/.named_captures 764 * #=> {"foo"=>[1, 2]} 765 * 766 * If there are no named captures, an empty hash is returned. 767 * 768 * /(.)(.)/.named_captures 769 * #=> {} 770 */ 771 772static VALUE 773rb_reg_named_captures(VALUE re) 774{ 775 VALUE hash = rb_hash_new(); 776 rb_reg_check(re); 777 onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash); 778 return hash; 779} 780 781static int 782onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end, 783 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax, 784 OnigErrorInfo* einfo, const char *sourcefile, int sourceline) 785{ 786 int r; 787 788 *reg = (regex_t* )malloc(sizeof(regex_t)); 789 if (IS_NULL(*reg)) return ONIGERR_MEMORY; 790 791 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); 792 if (r) goto err; 793 794 r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline); 795 if (r) { 796 err: 797 onig_free(*reg); 798 *reg = NULL; 799 } 800 return r; 801} 802 803static Regexp* 804make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err, 805 const char *sourcefile, int sourceline) 806{ 807 Regexp *rp; 808 int r; 809 OnigErrorInfo einfo; 810 811 /* Handle escaped characters first. */ 812 813 /* Build a copy of the string (in dest) with the 814 escaped characters translated, and generate the regex 815 from that. 816 */ 817 818 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags, 819 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline); 820 if (r) { 821 onig_error_code_to_str((UChar*)err, r, &einfo); 822 return 0; 823 } 824 return rp; 825} 826 827 828/* 829 * Document-class: MatchData 830 * 831 * <code>MatchData</code> is the type of the special variable <code>$~</code>, 832 * and is the type of the object returned by <code>Regexp#match</code> and 833 * <code>Regexp.last_match</code>. It encapsulates all the results of a pattern 834 * match, results normally accessed through the special variables 835 * <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>, 836 * <code>$2</code>, and so on. 837 * 838 */ 839 840VALUE rb_cMatch; 841 842static VALUE 843match_alloc(VALUE klass) 844{ 845 NEWOBJ_OF(match, struct RMatch, klass, T_MATCH); 846 847 match->str = 0; 848 match->rmatch = 0; 849 match->regexp = 0; 850 match->rmatch = ALLOC(struct rmatch); 851 MEMZERO(match->rmatch, struct rmatch, 1); 852 853 return (VALUE)match; 854} 855 856typedef struct { 857 long byte_pos; 858 long char_pos; 859} pair_t; 860 861static int 862pair_byte_cmp(const void *pair1, const void *pair2) 863{ 864 long diff = ((pair_t*)pair1)->byte_pos - ((pair_t*)pair2)->byte_pos; 865#if SIZEOF_LONG > SIZEOF_INT 866 return diff ? diff > 0 ? 1 : -1 : 0; 867#else 868 return (int)diff; 869#endif 870} 871 872static void 873update_char_offset(VALUE match) 874{ 875 struct rmatch *rm = RMATCH(match)->rmatch; 876 struct re_registers *regs; 877 int i, num_regs, num_pos; 878 long c; 879 char *s, *p, *q; 880 rb_encoding *enc; 881 pair_t *pairs; 882 883 if (rm->char_offset_updated) 884 return; 885 886 regs = &rm->regs; 887 num_regs = rm->regs.num_regs; 888 889 if (rm->char_offset_num_allocated < num_regs) { 890 REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs); 891 rm->char_offset_num_allocated = num_regs; 892 } 893 894 enc = rb_enc_get(RMATCH(match)->str); 895 if (rb_enc_mbmaxlen(enc) == 1) { 896 for (i = 0; i < num_regs; i++) { 897 rm->char_offset[i].beg = BEG(i); 898 rm->char_offset[i].end = END(i); 899 } 900 rm->char_offset_updated = 1; 901 return; 902 } 903 904 pairs = ALLOCA_N(pair_t, num_regs*2); 905 num_pos = 0; 906 for (i = 0; i < num_regs; i++) { 907 if (BEG(i) < 0) 908 continue; 909 pairs[num_pos++].byte_pos = BEG(i); 910 pairs[num_pos++].byte_pos = END(i); 911 } 912 qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp); 913 914 s = p = RSTRING_PTR(RMATCH(match)->str); 915 c = 0; 916 for (i = 0; i < num_pos; i++) { 917 q = s + pairs[i].byte_pos; 918 c += rb_enc_strlen(p, q, enc); 919 pairs[i].char_pos = c; 920 p = q; 921 } 922 923 for (i = 0; i < num_regs; i++) { 924 pair_t key, *found; 925 if (BEG(i) < 0) { 926 rm->char_offset[i].beg = -1; 927 rm->char_offset[i].end = -1; 928 continue; 929 } 930 931 key.byte_pos = BEG(i); 932 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); 933 rm->char_offset[i].beg = found->char_pos; 934 935 key.byte_pos = END(i); 936 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); 937 rm->char_offset[i].end = found->char_pos; 938 } 939 940 rm->char_offset_updated = 1; 941} 942 943static void 944match_check(VALUE match) 945{ 946 if (!RMATCH(match)->regexp) { 947 rb_raise(rb_eTypeError, "uninitialized Match"); 948 } 949} 950 951/* :nodoc: */ 952static VALUE 953match_init_copy(VALUE obj, VALUE orig) 954{ 955 struct rmatch *rm; 956 957 if (!OBJ_INIT_COPY(obj, orig)) return obj; 958 959 RMATCH(obj)->str = RMATCH(orig)->str; 960 RMATCH(obj)->regexp = RMATCH(orig)->regexp; 961 962 rm = RMATCH(obj)->rmatch; 963 onig_region_copy(&rm->regs, RMATCH_REGS(orig)); 964 965 if (!RMATCH(orig)->rmatch->char_offset_updated) { 966 rm->char_offset_updated = 0; 967 } 968 else { 969 if (rm->char_offset_num_allocated < rm->regs.num_regs) { 970 REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs); 971 rm->char_offset_num_allocated = rm->regs.num_regs; 972 } 973 MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset, 974 struct rmatch_offset, rm->regs.num_regs); 975 rm->char_offset_updated = 1; 976 } 977 978 return obj; 979} 980 981 982/* 983 * call-seq: 984 * mtch.regexp -> regexp 985 * 986 * Returns the regexp. 987 * 988 * m = /a.*b/.match("abc") 989 * m.regexp #=> /a.*b/ 990 */ 991 992static VALUE 993match_regexp(VALUE match) 994{ 995 match_check(match); 996 return RMATCH(match)->regexp; 997} 998 999/* 1000 * call-seq: 1001 * mtch.names -> [name1, name2, ...] 1002 * 1003 * Returns a list of names of captures as an array of strings. 1004 * It is same as mtch.regexp.names. 1005 * 1006 * /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names 1007 * #=> ["foo", "bar", "baz"] 1008 * 1009 * m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil> 1010 * m.names #=> ["x", "y"] 1011 */ 1012 1013static VALUE 1014match_names(VALUE match) 1015{ 1016 match_check(match); 1017 return rb_reg_names(RMATCH(match)->regexp); 1018} 1019 1020/* 1021 * call-seq: 1022 * mtch.length -> integer 1023 * mtch.size -> integer 1024 * 1025 * Returns the number of elements in the match array. 1026 * 1027 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 1028 * m.length #=> 5 1029 * m.size #=> 5 1030 */ 1031 1032static VALUE 1033match_size(VALUE match) 1034{ 1035 match_check(match); 1036 return INT2FIX(RMATCH_REGS(match)->num_regs); 1037} 1038 1039static int 1040match_backref_number(VALUE match, VALUE backref) 1041{ 1042 const char *name; 1043 int num; 1044 1045 struct re_registers *regs = RMATCH_REGS(match); 1046 VALUE regexp = RMATCH(match)->regexp; 1047 1048 match_check(match); 1049 switch (TYPE(backref)) { 1050 default: 1051 return NUM2INT(backref); 1052 1053 case T_SYMBOL: 1054 name = rb_id2name(SYM2ID(backref)); 1055 break; 1056 1057 case T_STRING: 1058 name = StringValueCStr(backref); 1059 break; 1060 } 1061 1062 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr, 1063 (const unsigned char*)name, 1064 (const unsigned char*)name + strlen(name), 1065 regs); 1066 1067 if (num < 1) { 1068 rb_raise(rb_eIndexError, "undefined group name reference: %s", name); 1069 } 1070 1071 return num; 1072} 1073 1074int 1075rb_reg_backref_number(VALUE match, VALUE backref) 1076{ 1077 return match_backref_number(match, backref); 1078} 1079 1080/* 1081 * call-seq: 1082 * mtch.offset(n) -> array 1083 * 1084 * Returns a two-element array containing the beginning and ending offsets of 1085 * the <em>n</em>th match. 1086 * <em>n</em> can be a string or symbol to reference a named capture. 1087 * 1088 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 1089 * m.offset(0) #=> [1, 7] 1090 * m.offset(4) #=> [6, 7] 1091 * 1092 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge") 1093 * p m.offset(:foo) #=> [0, 1] 1094 * p m.offset(:bar) #=> [2, 3] 1095 * 1096 */ 1097 1098static VALUE 1099match_offset(VALUE match, VALUE n) 1100{ 1101 int i = match_backref_number(match, n); 1102 struct re_registers *regs = RMATCH_REGS(match); 1103 1104 match_check(match); 1105 if (i < 0 || regs->num_regs <= i) 1106 rb_raise(rb_eIndexError, "index %d out of matches", i); 1107 1108 if (BEG(i) < 0) 1109 return rb_assoc_new(Qnil, Qnil); 1110 1111 update_char_offset(match); 1112 return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg), 1113 INT2FIX(RMATCH(match)->rmatch->char_offset[i].end)); 1114} 1115 1116 1117/* 1118 * call-seq: 1119 * mtch.begin(n) -> integer 1120 * 1121 * Returns the offset of the start of the <em>n</em>th element of the match 1122 * array in the string. 1123 * <em>n</em> can be a string or symbol to reference a named capture. 1124 * 1125 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 1126 * m.begin(0) #=> 1 1127 * m.begin(2) #=> 2 1128 * 1129 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge") 1130 * p m.begin(:foo) #=> 0 1131 * p m.begin(:bar) #=> 2 1132 */ 1133 1134static VALUE 1135match_begin(VALUE match, VALUE n) 1136{ 1137 int i = match_backref_number(match, n); 1138 struct re_registers *regs = RMATCH_REGS(match); 1139 1140 match_check(match); 1141 if (i < 0 || regs->num_regs <= i) 1142 rb_raise(rb_eIndexError, "index %d out of matches", i); 1143 1144 if (BEG(i) < 0) 1145 return Qnil; 1146 1147 update_char_offset(match); 1148 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg); 1149} 1150 1151 1152/* 1153 * call-seq: 1154 * mtch.end(n) -> integer 1155 * 1156 * Returns the offset of the character immediately following the end of the 1157 * <em>n</em>th element of the match array in the string. 1158 * <em>n</em> can be a string or symbol to reference a named capture. 1159 * 1160 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 1161 * m.end(0) #=> 7 1162 * m.end(2) #=> 3 1163 * 1164 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge") 1165 * p m.end(:foo) #=> 1 1166 * p m.end(:bar) #=> 3 1167 */ 1168 1169static VALUE 1170match_end(VALUE match, VALUE n) 1171{ 1172 int i = match_backref_number(match, n); 1173 struct re_registers *regs = RMATCH_REGS(match); 1174 1175 match_check(match); 1176 if (i < 0 || regs->num_regs <= i) 1177 rb_raise(rb_eIndexError, "index %d out of matches", i); 1178 1179 if (BEG(i) < 0) 1180 return Qnil; 1181 1182 update_char_offset(match); 1183 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end); 1184} 1185 1186#define MATCH_BUSY FL_USER2 1187 1188void 1189rb_match_busy(VALUE match) 1190{ 1191 FL_SET(match, MATCH_BUSY); 1192} 1193 1194/* 1195 * call-seq: 1196 * rxp.fixed_encoding? -> true or false 1197 * 1198 * Returns false if rxp is applicable to 1199 * a string with any ASCII compatible encoding. 1200 * Returns true otherwise. 1201 * 1202 * r = /a/ 1203 * r.fixed_encoding? #=> false 1204 * r =~ "\u{6666} a" #=> 2 1205 * r =~ "\xa1\xa2 a".force_encoding("euc-jp") #=> 2 1206 * r =~ "abc".force_encoding("euc-jp") #=> 0 1207 * 1208 * r = /a/u 1209 * r.fixed_encoding? #=> true 1210 * r.encoding #=> #<Encoding:UTF-8> 1211 * r =~ "\u{6666} a" #=> 2 1212 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError 1213 * r =~ "abc".force_encoding("euc-jp") #=> 0 1214 * 1215 * r = /\u{6666}/ 1216 * r.fixed_encoding? #=> true 1217 * r.encoding #=> #<Encoding:UTF-8> 1218 * r =~ "\u{6666} a" #=> 0 1219 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError 1220 * r =~ "abc".force_encoding("euc-jp") #=> nil 1221 */ 1222 1223static VALUE 1224rb_reg_fixed_encoding_p(VALUE re) 1225{ 1226 if (FL_TEST(re, KCODE_FIXED)) 1227 return Qtrue; 1228 else 1229 return Qfalse; 1230} 1231 1232static VALUE 1233rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc, 1234 rb_encoding **fixed_enc, onig_errmsg_buffer err); 1235 1236 1237static void 1238reg_enc_error(VALUE re, VALUE str) 1239{ 1240 rb_raise(rb_eEncCompatError, 1241 "incompatible encoding regexp match (%s regexp with %s string)", 1242 rb_enc_name(rb_enc_get(re)), 1243 rb_enc_name(rb_enc_get(str))); 1244} 1245 1246static rb_encoding* 1247rb_reg_prepare_enc(VALUE re, VALUE str, int warn) 1248{ 1249 rb_encoding *enc = 0; 1250 1251 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) { 1252 rb_raise(rb_eArgError, 1253 "invalid byte sequence in %s", 1254 rb_enc_name(rb_enc_get(str))); 1255 } 1256 1257 rb_reg_check(re); 1258 enc = rb_enc_get(str); 1259 if (!rb_enc_str_asciicompat_p(str)) { 1260 if (RREGEXP(re)->ptr->enc != enc) { 1261 reg_enc_error(re, str); 1262 } 1263 } 1264 else if (rb_reg_fixed_encoding_p(re)) { 1265 if (RREGEXP(re)->ptr->enc != enc && 1266 (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) || 1267 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) { 1268 reg_enc_error(re, str); 1269 } 1270 enc = RREGEXP(re)->ptr->enc; 1271 } 1272 if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) && 1273 enc != rb_ascii8bit_encoding() && 1274 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 1275 rb_warn("regexp match /.../n against to %s string", 1276 rb_enc_name(enc)); 1277 } 1278 return enc; 1279} 1280 1281regex_t * 1282rb_reg_prepare_re(VALUE re, VALUE str) 1283{ 1284 regex_t *reg = RREGEXP(re)->ptr; 1285 onig_errmsg_buffer err = ""; 1286 int r; 1287 OnigErrorInfo einfo; 1288 const char *pattern; 1289 VALUE unescaped; 1290 rb_encoding *fixed_enc = 0; 1291 rb_encoding *enc = rb_reg_prepare_enc(re, str, 1); 1292 1293 if (reg->enc == enc) return reg; 1294 1295 rb_reg_check(re); 1296 reg = RREGEXP(re)->ptr; 1297 pattern = RREGEXP_SRC_PTR(re); 1298 1299 unescaped = rb_reg_preprocess( 1300 pattern, pattern + RREGEXP_SRC_LEN(re), enc, 1301 &fixed_enc, err); 1302 1303 if (unescaped == Qnil) { 1304 rb_raise(rb_eArgError, "regexp preprocess failed: %s", err); 1305 } 1306 1307 r = onig_new(®, (UChar* )RSTRING_PTR(unescaped), 1308 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)), 1309 reg->options, enc, 1310 OnigDefaultSyntax, &einfo); 1311 if (r) { 1312 onig_error_code_to_str((UChar*)err, r, &einfo); 1313 rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re); 1314 } 1315 1316 RB_GC_GUARD(unescaped); 1317 return reg; 1318} 1319 1320long 1321rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse) 1322{ 1323 long range; 1324 rb_encoding *enc; 1325 UChar *p, *string; 1326 1327 enc = rb_reg_prepare_enc(re, str, 0); 1328 1329 if (reverse) { 1330 range = -pos; 1331 } 1332 else { 1333 range = RSTRING_LEN(str) - pos; 1334 } 1335 1336 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) { 1337 string = (UChar*)RSTRING_PTR(str); 1338 1339 if (range > 0) { 1340 p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str)); 1341 } 1342 else { 1343 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str)); 1344 } 1345 return p - string; 1346 } 1347 1348 return pos; 1349} 1350 1351long 1352rb_reg_search(VALUE re, VALUE str, long pos, int reverse) 1353{ 1354 long result; 1355 VALUE match; 1356 struct re_registers regi, *regs = ®i; 1357 char *range = RSTRING_PTR(str); 1358 regex_t *reg; 1359 int tmpreg; 1360 1361 if (pos > RSTRING_LEN(str) || pos < 0) { 1362 rb_backref_set(Qnil); 1363 return -1; 1364 } 1365 1366 reg = rb_reg_prepare_re(re, str); 1367 tmpreg = reg != RREGEXP(re)->ptr; 1368 if (!tmpreg) RREGEXP(re)->usecnt++; 1369 1370 match = rb_backref_get(); 1371 if (!NIL_P(match)) { 1372 if (FL_TEST(match, MATCH_BUSY)) { 1373 match = Qnil; 1374 } 1375 else { 1376 regs = RMATCH_REGS(match); 1377 } 1378 } 1379 if (NIL_P(match)) { 1380 MEMZERO(regs, struct re_registers, 1); 1381 } 1382 if (!reverse) { 1383 range += RSTRING_LEN(str); 1384 } 1385 result = onig_search(reg, 1386 (UChar*)(RSTRING_PTR(str)), 1387 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)), 1388 ((UChar*)(RSTRING_PTR(str)) + pos), 1389 ((UChar*)range), 1390 regs, ONIG_OPTION_NONE); 1391 if (!tmpreg) RREGEXP(re)->usecnt--; 1392 if (tmpreg) { 1393 if (RREGEXP(re)->usecnt) { 1394 onig_free(reg); 1395 } 1396 else { 1397 onig_free(RREGEXP(re)->ptr); 1398 RREGEXP(re)->ptr = reg; 1399 } 1400 } 1401 if (result < 0) { 1402 if (regs == ®i) 1403 onig_region_free(regs, 0); 1404 if (result == ONIG_MISMATCH) { 1405 rb_backref_set(Qnil); 1406 return result; 1407 } 1408 else { 1409 onig_errmsg_buffer err = ""; 1410 onig_error_code_to_str((UChar*)err, (int)result); 1411 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re); 1412 } 1413 } 1414 1415 if (NIL_P(match)) { 1416 match = match_alloc(rb_cMatch); 1417 onig_region_copy(RMATCH_REGS(match), regs); 1418 onig_region_free(regs, 0); 1419 } 1420 else { 1421 if (rb_safe_level() >= 3) 1422 OBJ_TAINT(match); 1423 else 1424 FL_UNSET(match, FL_TAINT); 1425 } 1426 1427 RMATCH(match)->str = rb_str_new4(str); 1428 RMATCH(match)->regexp = re; 1429 RMATCH(match)->rmatch->char_offset_updated = 0; 1430 rb_backref_set(match); 1431 1432 OBJ_INFECT(match, re); 1433 OBJ_INFECT(match, str); 1434 1435 return result; 1436} 1437 1438VALUE 1439rb_reg_nth_defined(int nth, VALUE match) 1440{ 1441 struct re_registers *regs; 1442 if (NIL_P(match)) return Qnil; 1443 match_check(match); 1444 regs = RMATCH_REGS(match); 1445 if (nth >= regs->num_regs) { 1446 return Qnil; 1447 } 1448 if (nth < 0) { 1449 nth += regs->num_regs; 1450 if (nth <= 0) return Qnil; 1451 } 1452 if (BEG(nth) == -1) return Qfalse; 1453 return Qtrue; 1454} 1455 1456VALUE 1457rb_reg_nth_match(int nth, VALUE match) 1458{ 1459 VALUE str; 1460 long start, end, len; 1461 struct re_registers *regs; 1462 1463 if (NIL_P(match)) return Qnil; 1464 match_check(match); 1465 regs = RMATCH_REGS(match); 1466 if (nth >= regs->num_regs) { 1467 return Qnil; 1468 } 1469 if (nth < 0) { 1470 nth += regs->num_regs; 1471 if (nth <= 0) return Qnil; 1472 } 1473 start = BEG(nth); 1474 if (start == -1) return Qnil; 1475 end = END(nth); 1476 len = end - start; 1477 str = rb_str_subseq(RMATCH(match)->str, start, len); 1478 OBJ_INFECT(str, match); 1479 return str; 1480} 1481 1482VALUE 1483rb_reg_last_match(VALUE match) 1484{ 1485 return rb_reg_nth_match(0, match); 1486} 1487 1488 1489/* 1490 * call-seq: 1491 * mtch.pre_match -> str 1492 * 1493 * Returns the portion of the original string before the current match. 1494 * Equivalent to the special variable <code>$`</code>. 1495 * 1496 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 1497 * m.pre_match #=> "T" 1498 */ 1499 1500VALUE 1501rb_reg_match_pre(VALUE match) 1502{ 1503 VALUE str; 1504 struct re_registers *regs; 1505 1506 if (NIL_P(match)) return Qnil; 1507 match_check(match); 1508 regs = RMATCH_REGS(match); 1509 if (BEG(0) == -1) return Qnil; 1510 str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0)); 1511 if (OBJ_TAINTED(match)) OBJ_TAINT(str); 1512 return str; 1513} 1514 1515 1516/* 1517 * call-seq: 1518 * mtch.post_match -> str 1519 * 1520 * Returns the portion of the original string after the current match. 1521 * Equivalent to the special variable <code>$'</code>. 1522 * 1523 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie") 1524 * m.post_match #=> ": The Movie" 1525 */ 1526 1527VALUE 1528rb_reg_match_post(VALUE match) 1529{ 1530 VALUE str; 1531 long pos; 1532 struct re_registers *regs; 1533 1534 if (NIL_P(match)) return Qnil; 1535 match_check(match); 1536 regs = RMATCH_REGS(match); 1537 if (BEG(0) == -1) return Qnil; 1538 str = RMATCH(match)->str; 1539 pos = END(0); 1540 str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos); 1541 if (OBJ_TAINTED(match)) OBJ_TAINT(str); 1542 return str; 1543} 1544 1545VALUE 1546rb_reg_match_last(VALUE match) 1547{ 1548 int i; 1549 struct re_registers *regs; 1550 1551 if (NIL_P(match)) return Qnil; 1552 match_check(match); 1553 regs = RMATCH_REGS(match); 1554 if (BEG(0) == -1) return Qnil; 1555 1556 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--) 1557 ; 1558 if (i == 0) return Qnil; 1559 return rb_reg_nth_match(i, match); 1560} 1561 1562static VALUE 1563last_match_getter(void) 1564{ 1565 return rb_reg_last_match(rb_backref_get()); 1566} 1567 1568static VALUE 1569prematch_getter(void) 1570{ 1571 return rb_reg_match_pre(rb_backref_get()); 1572} 1573 1574static VALUE 1575postmatch_getter(void) 1576{ 1577 return rb_reg_match_post(rb_backref_get()); 1578} 1579 1580static VALUE 1581last_paren_match_getter(void) 1582{ 1583 return rb_reg_match_last(rb_backref_get()); 1584} 1585 1586static VALUE 1587match_array(VALUE match, int start) 1588{ 1589 struct re_registers *regs; 1590 VALUE ary; 1591 VALUE target; 1592 int i; 1593 int taint = OBJ_TAINTED(match); 1594 1595 match_check(match); 1596 regs = RMATCH_REGS(match); 1597 ary = rb_ary_new2(regs->num_regs); 1598 target = RMATCH(match)->str; 1599 1600 for (i=start; i<regs->num_regs; i++) { 1601 if (regs->beg[i] == -1) { 1602 rb_ary_push(ary, Qnil); 1603 } 1604 else { 1605 VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]); 1606 if (taint) OBJ_TAINT(str); 1607 rb_ary_push(ary, str); 1608 } 1609 } 1610 return ary; 1611} 1612 1613 1614/* [MG]:FIXME: I put parens around the /.../.match() in the first line of the 1615 second example to prevent the '*' followed by a '/' from ending the 1616 comment. */ 1617 1618/* 1619 * call-seq: 1620 * mtch.to_a -> anArray 1621 * 1622 * Returns the array of matches. 1623 * 1624 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 1625 * m.to_a #=> ["HX1138", "H", "X", "113", "8"] 1626 * 1627 * Because <code>to_a</code> is called when expanding 1628 * <code>*</code><em>variable</em>, there's a useful assignment 1629 * shortcut for extracting matched fields. This is slightly slower than 1630 * accessing the fields directly (as an intermediate array is 1631 * generated). 1632 * 1633 * all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138.")) 1634 * all #=> "HX1138" 1635 * f1 #=> "H" 1636 * f2 #=> "X" 1637 * f3 #=> "113" 1638 */ 1639 1640static VALUE 1641match_to_a(VALUE match) 1642{ 1643 return match_array(match, 0); 1644} 1645 1646 1647/* 1648 * call-seq: 1649 * mtch.captures -> array 1650 * 1651 * Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>. 1652 * 1653 * f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures 1654 * f1 #=> "H" 1655 * f2 #=> "X" 1656 * f3 #=> "113" 1657 * f4 #=> "8" 1658 */ 1659static VALUE 1660match_captures(VALUE match) 1661{ 1662 return match_array(match, 1); 1663} 1664 1665static int 1666name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end) 1667{ 1668 int num; 1669 1670 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr, 1671 (const unsigned char* )name, (const unsigned char* )name_end, regs); 1672 if (num >= 1) { 1673 return num; 1674 } 1675 else { 1676 VALUE s = rb_str_new(name, (long )(name_end - name)); 1677 rb_raise(rb_eIndexError, "undefined group name reference: %s", 1678 StringValuePtr(s)); 1679 } 1680 1681 UNREACHABLE; 1682} 1683 1684/* 1685 * call-seq: 1686 * mtch[i] -> str or nil 1687 * mtch[start, length] -> array 1688 * mtch[range] -> array 1689 * mtch[name] -> str or nil 1690 * 1691 * Match Reference -- <code>MatchData</code> acts as an array, and may be 1692 * accessed using the normal array indexing techniques. <code>mtch[0]</code> 1693 * is equivalent to the special variable <code>$&</code>, and returns the 1694 * entire matched string. <code>mtch[1]</code>, <code>mtch[2]</code>, and so 1695 * on return the values of the matched backreferences (portions of the 1696 * pattern between parentheses). 1697 * 1698 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 1699 * m #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8"> 1700 * m[0] #=> "HX1138" 1701 * m[1, 2] #=> ["H", "X"] 1702 * m[1..3] #=> ["H", "X", "113"] 1703 * m[-3, 2] #=> ["X", "113"] 1704 * 1705 * m = /(?<foo>a+)b/.match("ccaaab") 1706 * m #=> #<MatchData "aaab" foo:"aaa"> 1707 * m["foo"] #=> "aaa" 1708 * m[:foo] #=> "aaa" 1709 */ 1710 1711static VALUE 1712match_aref(int argc, VALUE *argv, VALUE match) 1713{ 1714 VALUE idx, rest; 1715 1716 match_check(match); 1717 rb_scan_args(argc, argv, "11", &idx, &rest); 1718 1719 if (NIL_P(rest)) { 1720 if (FIXNUM_P(idx)) { 1721 if (FIX2INT(idx) >= 0) { 1722 return rb_reg_nth_match(FIX2INT(idx), match); 1723 } 1724 } 1725 else { 1726 const char *p; 1727 int num; 1728 1729 switch (TYPE(idx)) { 1730 case T_SYMBOL: 1731 p = rb_id2name(SYM2ID(idx)); 1732 goto name_to_backref; 1733 break; 1734 case T_STRING: 1735 p = StringValuePtr(idx); 1736 1737 name_to_backref: 1738 num = name_to_backref_number(RMATCH_REGS(match), 1739 RMATCH(match)->regexp, p, p + strlen(p)); 1740 return rb_reg_nth_match(num, match); 1741 break; 1742 1743 default: 1744 break; 1745 } 1746 } 1747 } 1748 1749 return rb_ary_aref(argc, argv, match_to_a(match)); 1750} 1751 1752static VALUE 1753match_entry(VALUE match, long n) 1754{ 1755 /* n should not exceed num_regs */ 1756 return rb_reg_nth_match((int)n, match); 1757} 1758 1759 1760/* 1761 * call-seq: 1762 * 1763 * mtch.values_at([index]*) -> array 1764 * 1765 * Uses each <i>index</i> to access the matching values, returning an array of 1766 * the corresponding matches. 1767 * 1768 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie") 1769 * m.to_a #=> ["HX1138", "H", "X", "113", "8"] 1770 * m.values_at(0, 2, -2) #=> ["HX1138", "X", "113"] 1771 */ 1772 1773static VALUE 1774match_values_at(int argc, VALUE *argv, VALUE match) 1775{ 1776 struct re_registers *regs; 1777 1778 match_check(match); 1779 regs = RMATCH_REGS(match); 1780 return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry); 1781} 1782 1783 1784/* 1785 * call-seq: 1786 * mtch.to_s -> str 1787 * 1788 * Returns the entire matched string. 1789 * 1790 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 1791 * m.to_s #=> "HX1138" 1792 */ 1793 1794static VALUE 1795match_to_s(VALUE match) 1796{ 1797 VALUE str = rb_reg_last_match(match); 1798 1799 match_check(match); 1800 if (NIL_P(str)) str = rb_str_new(0,0); 1801 if (OBJ_TAINTED(match)) OBJ_TAINT(str); 1802 if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str); 1803 return str; 1804} 1805 1806 1807/* 1808 * call-seq: 1809 * mtch.string -> str 1810 * 1811 * Returns a frozen copy of the string passed in to <code>match</code>. 1812 * 1813 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 1814 * m.string #=> "THX1138." 1815 */ 1816 1817static VALUE 1818match_string(VALUE match) 1819{ 1820 match_check(match); 1821 return RMATCH(match)->str; /* str is frozen */ 1822} 1823 1824struct backref_name_tag { 1825 const UChar *name; 1826 long len; 1827}; 1828 1829static int 1830match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end, 1831 int back_num, int *back_refs, OnigRegex regex, void *arg0) 1832{ 1833 struct backref_name_tag *arg = (struct backref_name_tag *)arg0; 1834 int i; 1835 1836 for (i = 0; i < back_num; i++) { 1837 arg[back_refs[i]].name = name; 1838 arg[back_refs[i]].len = name_end - name; 1839 } 1840 return 0; 1841} 1842 1843/* 1844 * call-seq: 1845 * mtch.inspect -> str 1846 * 1847 * Returns a printable version of <i>mtch</i>. 1848 * 1849 * puts /.$/.match("foo").inspect 1850 * #=> #<MatchData "o"> 1851 * 1852 * puts /(.)(.)(.)/.match("foo").inspect 1853 * #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o"> 1854 * 1855 * puts /(.)(.)?(.)/.match("fo").inspect 1856 * #=> #<MatchData "fo" 1:"f" 2:nil 3:"o"> 1857 * 1858 * puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect 1859 * #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g"> 1860 * 1861 */ 1862 1863static VALUE 1864match_inspect(VALUE match) 1865{ 1866 const char *cname = rb_obj_classname(match); 1867 VALUE str; 1868 int i; 1869 struct re_registers *regs = RMATCH_REGS(match); 1870 int num_regs = regs->num_regs; 1871 struct backref_name_tag *names; 1872 VALUE regexp = RMATCH(match)->regexp; 1873 1874 if (regexp == 0) { 1875 return rb_sprintf("#<%s:%p>", cname, (void*)match); 1876 } 1877 1878 names = ALLOCA_N(struct backref_name_tag, num_regs); 1879 MEMZERO(names, struct backref_name_tag, num_regs); 1880 1881 onig_foreach_name(RREGEXP(regexp)->ptr, 1882 match_inspect_name_iter, names); 1883 1884 str = rb_str_buf_new2("#<"); 1885 rb_str_buf_cat2(str, cname); 1886 1887 for (i = 0; i < num_regs; i++) { 1888 VALUE v; 1889 rb_str_buf_cat2(str, " "); 1890 if (0 < i) { 1891 if (names[i].name) 1892 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len); 1893 else { 1894 rb_str_catf(str, "%d", i); 1895 } 1896 rb_str_buf_cat2(str, ":"); 1897 } 1898 v = rb_reg_nth_match(i, match); 1899 if (v == Qnil) 1900 rb_str_buf_cat2(str, "nil"); 1901 else 1902 rb_str_buf_append(str, rb_str_inspect(v)); 1903 } 1904 rb_str_buf_cat2(str, ">"); 1905 1906 return str; 1907} 1908 1909VALUE rb_cRegexp; 1910 1911static int 1912read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err) 1913{ 1914 const char *p = *pp; 1915 int code; 1916 int meta_prefix = 0, ctrl_prefix = 0; 1917 size_t len; 1918 1919 if (p == end || *p++ != '\\') { 1920 errcpy(err, "too short escaped multibyte character"); 1921 return -1; 1922 } 1923 1924again: 1925 if (p == end) { 1926 errcpy(err, "too short escape sequence"); 1927 return -1; 1928 } 1929 switch (*p++) { 1930 case '\\': code = '\\'; break; 1931 case 'n': code = '\n'; break; 1932 case 't': code = '\t'; break; 1933 case 'r': code = '\r'; break; 1934 case 'f': code = '\f'; break; 1935 case 'v': code = '\013'; break; 1936 case 'a': code = '\007'; break; 1937 case 'e': code = '\033'; break; 1938 1939 /* \OOO */ 1940 case '0': case '1': case '2': case '3': 1941 case '4': case '5': case '6': case '7': 1942 p--; 1943 code = scan_oct(p, end < p+3 ? end-p : 3, &len); 1944 p += len; 1945 break; 1946 1947 case 'x': /* \xHH */ 1948 code = scan_hex(p, end < p+2 ? end-p : 2, &len); 1949 if (len < 1) { 1950 errcpy(err, "invalid hex escape"); 1951 return -1; 1952 } 1953 p += len; 1954 break; 1955 1956 case 'M': /* \M-X, \M-\C-X, \M-\cX */ 1957 if (meta_prefix) { 1958 errcpy(err, "duplicate meta escape"); 1959 return -1; 1960 } 1961 meta_prefix = 1; 1962 if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) { 1963 if (*p == '\\') { 1964 p++; 1965 goto again; 1966 } 1967 else { 1968 code = *p++; 1969 break; 1970 } 1971 } 1972 errcpy(err, "too short meta escape"); 1973 return -1; 1974 1975 case 'C': /* \C-X, \C-\M-X */ 1976 if (p == end || *p++ != '-') { 1977 errcpy(err, "too short control escape"); 1978 return -1; 1979 } 1980 case 'c': /* \cX, \c\M-X */ 1981 if (ctrl_prefix) { 1982 errcpy(err, "duplicate control escape"); 1983 return -1; 1984 } 1985 ctrl_prefix = 1; 1986 if (p < end && (*p & 0x80) == 0) { 1987 if (*p == '\\') { 1988 p++; 1989 goto again; 1990 } 1991 else { 1992 code = *p++; 1993 break; 1994 } 1995 } 1996 errcpy(err, "too short control escape"); 1997 return -1; 1998 1999 default: 2000 errcpy(err, "unexpected escape sequence"); 2001 return -1; 2002 } 2003 if (code < 0 || 0xff < code) { 2004 errcpy(err, "invalid escape code"); 2005 return -1; 2006 } 2007 2008 if (ctrl_prefix) 2009 code &= 0x1f; 2010 if (meta_prefix) 2011 code |= 0x80; 2012 2013 *pp = p; 2014 return code; 2015} 2016 2017static int 2018unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc, 2019 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 2020{ 2021 const char *p = *pp; 2022 int chmaxlen = rb_enc_mbmaxlen(enc); 2023 char *chbuf = ALLOCA_N(char, chmaxlen); 2024 int chlen = 0; 2025 int byte; 2026 int l; 2027 2028 memset(chbuf, 0, chmaxlen); 2029 2030 byte = read_escaped_byte(&p, end, err); 2031 if (byte == -1) { 2032 return -1; 2033 } 2034 2035 chbuf[chlen++] = byte; 2036 while (chlen < chmaxlen && 2037 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) { 2038 byte = read_escaped_byte(&p, end, err); 2039 if (byte == -1) { 2040 return -1; 2041 } 2042 chbuf[chlen++] = byte; 2043 } 2044 2045 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc); 2046 if (MBCLEN_INVALID_P(l)) { 2047 errcpy(err, "invalid multibyte escape"); 2048 return -1; 2049 } 2050 if (1 < chlen || (chbuf[0] & 0x80)) { 2051 rb_str_buf_cat(buf, chbuf, chlen); 2052 2053 if (*encp == 0) 2054 *encp = enc; 2055 else if (*encp != enc) { 2056 errcpy(err, "escaped non ASCII character in UTF-8 regexp"); 2057 return -1; 2058 } 2059 } 2060 else { 2061 char escbuf[5]; 2062 snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff); 2063 rb_str_buf_cat(buf, escbuf, 4); 2064 } 2065 *pp = p; 2066 return 0; 2067} 2068 2069static int 2070check_unicode_range(unsigned long code, onig_errmsg_buffer err) 2071{ 2072 if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */ 2073 0x10ffff < code) { 2074 errcpy(err, "invalid Unicode range"); 2075 return -1; 2076 } 2077 return 0; 2078} 2079 2080static int 2081append_utf8(unsigned long uv, 2082 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 2083{ 2084 if (check_unicode_range(uv, err) != 0) 2085 return -1; 2086 if (uv < 0x80) { 2087 char escbuf[5]; 2088 snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv); 2089 rb_str_buf_cat(buf, escbuf, 4); 2090 } 2091 else { 2092 int len; 2093 char utf8buf[6]; 2094 len = rb_uv_to_utf8(utf8buf, uv); 2095 rb_str_buf_cat(buf, utf8buf, len); 2096 2097 if (*encp == 0) 2098 *encp = rb_utf8_encoding(); 2099 else if (*encp != rb_utf8_encoding()) { 2100 errcpy(err, "UTF-8 character in non UTF-8 regexp"); 2101 return -1; 2102 } 2103 } 2104 return 0; 2105} 2106 2107static int 2108unescape_unicode_list(const char **pp, const char *end, 2109 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 2110{ 2111 const char *p = *pp; 2112 int has_unicode = 0; 2113 unsigned long code; 2114 size_t len; 2115 2116 while (p < end && ISSPACE(*p)) p++; 2117 2118 while (1) { 2119 code = ruby_scan_hex(p, end-p, &len); 2120 if (len == 0) 2121 break; 2122 if (6 < len) { /* max 10FFFF */ 2123 errcpy(err, "invalid Unicode range"); 2124 return -1; 2125 } 2126 p += len; 2127 if (append_utf8(code, buf, encp, err) != 0) 2128 return -1; 2129 has_unicode = 1; 2130 2131 while (p < end && ISSPACE(*p)) p++; 2132 } 2133 2134 if (has_unicode == 0) { 2135 errcpy(err, "invalid Unicode list"); 2136 return -1; 2137 } 2138 2139 *pp = p; 2140 2141 return 0; 2142} 2143 2144static int 2145unescape_unicode_bmp(const char **pp, const char *end, 2146 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 2147{ 2148 const char *p = *pp; 2149 size_t len; 2150 unsigned long code; 2151 2152 if (end < p+4) { 2153 errcpy(err, "invalid Unicode escape"); 2154 return -1; 2155 } 2156 code = ruby_scan_hex(p, 4, &len); 2157 if (len != 4) { 2158 errcpy(err, "invalid Unicode escape"); 2159 return -1; 2160 } 2161 if (append_utf8(code, buf, encp, err) != 0) 2162 return -1; 2163 *pp = p + 4; 2164 return 0; 2165} 2166 2167static int 2168unescape_nonascii(const char *p, const char *end, rb_encoding *enc, 2169 VALUE buf, rb_encoding **encp, int *has_property, 2170 onig_errmsg_buffer err) 2171{ 2172 char c; 2173 char smallbuf[2]; 2174 2175 while (p < end) { 2176 int chlen = rb_enc_precise_mbclen(p, end, enc); 2177 if (!MBCLEN_CHARFOUND_P(chlen)) { 2178 errcpy(err, "invalid multibyte character"); 2179 return -1; 2180 } 2181 chlen = MBCLEN_CHARFOUND_LEN(chlen); 2182 if (1 < chlen || (*p & 0x80)) { 2183 rb_str_buf_cat(buf, p, chlen); 2184 p += chlen; 2185 if (*encp == 0) 2186 *encp = enc; 2187 else if (*encp != enc) { 2188 errcpy(err, "non ASCII character in UTF-8 regexp"); 2189 return -1; 2190 } 2191 continue; 2192 } 2193 2194 switch (c = *p++) { 2195 case '\\': 2196 if (p == end) { 2197 errcpy(err, "too short escape sequence"); 2198 return -1; 2199 } 2200 switch (c = *p++) { 2201 case '1': case '2': case '3': 2202 case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */ 2203 { 2204 size_t octlen; 2205 if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) { 2206 /* backref or 7bit octal. 2207 no need to unescape anyway. 2208 re-escaping may break backref */ 2209 goto escape_asis; 2210 } 2211 } 2212 /* xxx: How about more than 199 subexpressions? */ 2213 2214 case '0': /* \0, \0O, \0OO */ 2215 2216 case 'x': /* \xHH */ 2217 case 'c': /* \cX, \c\M-X */ 2218 case 'C': /* \C-X, \C-\M-X */ 2219 case 'M': /* \M-X, \M-\C-X, \M-\cX */ 2220 p = p-2; 2221 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0) 2222 return -1; 2223 break; 2224 2225 case 'u': 2226 if (p == end) { 2227 errcpy(err, "too short escape sequence"); 2228 return -1; 2229 } 2230 if (*p == '{') { 2231 /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */ 2232 p++; 2233 if (unescape_unicode_list(&p, end, buf, encp, err) != 0) 2234 return -1; 2235 if (p == end || *p++ != '}') { 2236 errcpy(err, "invalid Unicode list"); 2237 return -1; 2238 } 2239 break; 2240 } 2241 else { 2242 /* \uHHHH */ 2243 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0) 2244 return -1; 2245 break; 2246 } 2247 2248 case 'p': /* \p{Hiragana} */ 2249 case 'P': 2250 if (!*encp) { 2251 *has_property = 1; 2252 } 2253 goto escape_asis; 2254 2255 default: /* \n, \\, \d, \9, etc. */ 2256escape_asis: 2257 smallbuf[0] = '\\'; 2258 smallbuf[1] = c; 2259 rb_str_buf_cat(buf, smallbuf, 2); 2260 break; 2261 } 2262 break; 2263 2264 default: 2265 rb_str_buf_cat(buf, &c, 1); 2266 break; 2267 } 2268 } 2269 2270 return 0; 2271} 2272 2273static VALUE 2274rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc, 2275 rb_encoding **fixed_enc, onig_errmsg_buffer err) 2276{ 2277 VALUE buf; 2278 int has_property = 0; 2279 2280 buf = rb_str_buf_new(0); 2281 2282 if (rb_enc_asciicompat(enc)) 2283 *fixed_enc = 0; 2284 else { 2285 *fixed_enc = enc; 2286 rb_enc_associate(buf, enc); 2287 } 2288 2289 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0) 2290 return Qnil; 2291 2292 if (has_property && !*fixed_enc) { 2293 *fixed_enc = enc; 2294 } 2295 2296 if (*fixed_enc) { 2297 rb_enc_associate(buf, *fixed_enc); 2298 } 2299 2300 return buf; 2301} 2302 2303VALUE 2304rb_reg_check_preprocess(VALUE str) 2305{ 2306 rb_encoding *fixed_enc = 0; 2307 onig_errmsg_buffer err = ""; 2308 VALUE buf; 2309 char *p, *end; 2310 rb_encoding *enc; 2311 2312 StringValue(str); 2313 p = RSTRING_PTR(str); 2314 end = p + RSTRING_LEN(str); 2315 enc = rb_enc_get(str); 2316 2317 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err); 2318 RB_GC_GUARD(str); 2319 2320 if (buf == Qnil) { 2321 return rb_reg_error_desc(str, 0, err); 2322 } 2323 return Qnil; 2324} 2325 2326static VALUE 2327rb_reg_preprocess_dregexp(VALUE ary, int options) 2328{ 2329 rb_encoding *fixed_enc = 0; 2330 rb_encoding *regexp_enc = 0; 2331 onig_errmsg_buffer err = ""; 2332 int i; 2333 VALUE result = 0; 2334 rb_encoding *ascii8bit = rb_ascii8bit_encoding(); 2335 2336 if (RARRAY_LEN(ary) == 0) { 2337 rb_raise(rb_eArgError, "no arguments given"); 2338 } 2339 2340 for (i = 0; i < RARRAY_LEN(ary); i++) { 2341 VALUE str = RARRAY_PTR(ary)[i]; 2342 VALUE buf; 2343 char *p, *end; 2344 rb_encoding *src_enc; 2345 2346 src_enc = rb_enc_get(str); 2347 if (options & ARG_ENCODING_NONE && 2348 src_enc != ascii8bit) { 2349 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) 2350 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script"); 2351 else 2352 src_enc = ascii8bit; 2353 } 2354 2355 StringValue(str); 2356 p = RSTRING_PTR(str); 2357 end = p + RSTRING_LEN(str); 2358 2359 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err); 2360 2361 if (buf == Qnil) 2362 rb_raise(rb_eArgError, "%s", err); 2363 2364 if (fixed_enc != 0) { 2365 if (regexp_enc != 0 && regexp_enc != fixed_enc) { 2366 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s", 2367 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc)); 2368 } 2369 regexp_enc = fixed_enc; 2370 } 2371 2372 if (!result) 2373 result = rb_str_new3(str); 2374 else 2375 rb_str_buf_append(result, str); 2376 } 2377 if (regexp_enc) { 2378 rb_enc_associate(result, regexp_enc); 2379 } 2380 2381 return result; 2382} 2383 2384static int 2385rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc, 2386 int options, onig_errmsg_buffer err, 2387 const char *sourcefile, int sourceline) 2388{ 2389 struct RRegexp *re = RREGEXP(obj); 2390 VALUE unescaped; 2391 rb_encoding *fixed_enc = 0; 2392 rb_encoding *a_enc = rb_ascii8bit_encoding(); 2393 2394 if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4) 2395 rb_raise(rb_eSecurityError, "Insecure: can't modify regexp"); 2396 rb_check_frozen(obj); 2397 if (FL_TEST(obj, REG_LITERAL)) 2398 rb_raise(rb_eSecurityError, "can't modify literal regexp"); 2399 if (re->ptr) 2400 rb_raise(rb_eTypeError, "already initialized regexp"); 2401 re->ptr = 0; 2402 2403 if (rb_enc_dummy_p(enc)) { 2404 errcpy(err, "can't make regexp with dummy encoding"); 2405 return -1; 2406 } 2407 2408 unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err); 2409 if (unescaped == Qnil) 2410 return -1; 2411 2412 if (fixed_enc) { 2413 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) || 2414 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) { 2415 errcpy(err, "incompatible character encoding"); 2416 return -1; 2417 } 2418 if (fixed_enc != a_enc) { 2419 options |= ARG_ENCODING_FIXED; 2420 enc = fixed_enc; 2421 } 2422 } 2423 else if (!(options & ARG_ENCODING_FIXED)) { 2424 enc = rb_usascii_encoding(); 2425 } 2426 2427 rb_enc_associate((VALUE)re, enc); 2428 if ((options & ARG_ENCODING_FIXED) || fixed_enc) { 2429 re->basic.flags |= KCODE_FIXED; 2430 } 2431 if (options & ARG_ENCODING_NONE) { 2432 re->basic.flags |= REG_ENCODING_NONE; 2433 } 2434 2435 re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc, 2436 options & ARG_REG_OPTION_MASK, err, 2437 sourcefile, sourceline); 2438 if (!re->ptr) return -1; 2439 re->src = rb_enc_str_new(s, len, enc); 2440 OBJ_FREEZE(re->src); 2441 RB_GC_GUARD(unescaped); 2442 return 0; 2443} 2444 2445static int 2446rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err, 2447 const char *sourcefile, int sourceline) 2448{ 2449 int ret; 2450 rb_encoding *enc = rb_enc_get(str); 2451 if (options & ARG_ENCODING_NONE) { 2452 rb_encoding *ascii8bit = rb_ascii8bit_encoding(); 2453 if (enc != ascii8bit) { 2454 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 2455 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script"); 2456 return -1; 2457 } 2458 enc = ascii8bit; 2459 } 2460 } 2461 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc, 2462 options, err, sourcefile, sourceline); 2463 OBJ_INFECT(obj, str); 2464 RB_GC_GUARD(str); 2465 return ret; 2466} 2467 2468static VALUE 2469rb_reg_s_alloc(VALUE klass) 2470{ 2471 NEWOBJ_OF(re, struct RRegexp, klass, T_REGEXP); 2472 2473 re->ptr = 0; 2474 re->src = 0; 2475 re->usecnt = 0; 2476 2477 return (VALUE)re; 2478} 2479 2480VALUE 2481rb_reg_alloc(void) 2482{ 2483 return rb_reg_s_alloc(rb_cRegexp); 2484} 2485 2486VALUE 2487rb_reg_new_str(VALUE s, int options) 2488{ 2489 return rb_reg_init_str(rb_reg_alloc(), s, options); 2490} 2491 2492VALUE 2493rb_reg_init_str(VALUE re, VALUE s, int options) 2494{ 2495 onig_errmsg_buffer err = ""; 2496 2497 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) { 2498 rb_reg_raise_str(s, options, err); 2499 } 2500 2501 return re; 2502} 2503 2504VALUE 2505rb_reg_new_ary(VALUE ary, int opt) 2506{ 2507 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt); 2508} 2509 2510VALUE 2511rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options) 2512{ 2513 VALUE re = rb_reg_alloc(); 2514 onig_errmsg_buffer err = ""; 2515 2516 if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) { 2517 rb_enc_reg_raise(s, len, enc, options, err); 2518 } 2519 2520 return re; 2521} 2522 2523VALUE 2524rb_reg_new(const char *s, long len, int options) 2525{ 2526 return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options); 2527} 2528 2529VALUE 2530rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline) 2531{ 2532 VALUE re = rb_reg_alloc(); 2533 onig_errmsg_buffer err = ""; 2534 2535 if (!str) str = rb_str_new(0,0); 2536 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) { 2537 rb_set_errinfo(rb_reg_error_desc(str, options, err)); 2538 return Qnil; 2539 } 2540 FL_SET(re, REG_LITERAL); 2541 return re; 2542} 2543 2544static VALUE reg_cache; 2545 2546VALUE 2547rb_reg_regcomp(VALUE str) 2548{ 2549 volatile VALUE save_str = str; 2550 if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str) 2551 && ENCODING_GET(reg_cache) == ENCODING_GET(str) 2552 && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0) 2553 return reg_cache; 2554 2555 return reg_cache = rb_reg_new_str(save_str, 0); 2556} 2557 2558static st_index_t reg_hash(VALUE re); 2559/* 2560 * call-seq: 2561 * rxp.hash -> fixnum 2562 * 2563 * Produce a hash based on the text and options of this regular expression. 2564 */ 2565 2566static VALUE 2567rb_reg_hash(VALUE re) 2568{ 2569 st_index_t hashval = reg_hash(re); 2570 return LONG2FIX(hashval); 2571} 2572 2573static st_index_t 2574reg_hash(VALUE re) 2575{ 2576 st_index_t hashval; 2577 2578 rb_reg_check(re); 2579 hashval = RREGEXP(re)->ptr->options; 2580 hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re))); 2581 return rb_hash_end(hashval); 2582} 2583 2584 2585/* 2586 * call-seq: 2587 * rxp == other_rxp -> true or false 2588 * rxp.eql?(other_rxp) -> true or false 2589 * 2590 * Equality---Two regexps are equal if their patterns are identical, they have 2591 * the same character set code, and their <code>casefold?</code> values are the 2592 * same. 2593 * 2594 * /abc/ == /abc/x #=> false 2595 * /abc/ == /abc/i #=> false 2596 * /abc/ == /abc/u #=> false 2597 * /abc/u == /abc/n #=> false 2598 */ 2599 2600static VALUE 2601rb_reg_equal(VALUE re1, VALUE re2) 2602{ 2603 if (re1 == re2) return Qtrue; 2604 if (!RB_TYPE_P(re2, T_REGEXP)) return Qfalse; 2605 rb_reg_check(re1); rb_reg_check(re2); 2606 if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse; 2607 if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse; 2608 if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse; 2609 if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse; 2610 if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) { 2611 return Qtrue; 2612 } 2613 return Qfalse; 2614} 2615 2616/* 2617 * call-seq: 2618 * mtch.hash -> integer 2619 * 2620 * Produce a hash based on the target string, regexp and matched 2621 * positions of this matchdata. 2622 */ 2623 2624static VALUE 2625match_hash(VALUE match) 2626{ 2627 const struct re_registers *regs; 2628 st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str)); 2629 2630 rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp)); 2631 regs = RMATCH_REGS(match); 2632 hashval = rb_hash_uint(hashval, regs->num_regs); 2633 hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg))); 2634 hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end))); 2635 hashval = rb_hash_end(hashval); 2636 return LONG2FIX(hashval); 2637} 2638 2639/* 2640 * call-seq: 2641 * mtch == mtch2 -> true or false 2642 * 2643 * Equality---Two matchdata are equal if their target strings, 2644 * patterns, and matched positions are identical. 2645 */ 2646 2647static VALUE 2648match_equal(VALUE match1, VALUE match2) 2649{ 2650 const struct re_registers *regs1, *regs2; 2651 if (match1 == match2) return Qtrue; 2652 if (!RB_TYPE_P(match2, T_MATCH)) return Qfalse; 2653 if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse; 2654 if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse; 2655 regs1 = RMATCH_REGS(match1); 2656 regs2 = RMATCH_REGS(match2); 2657 if (regs1->num_regs != regs2->num_regs) return Qfalse; 2658 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse; 2659 if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse; 2660 return Qtrue; 2661} 2662 2663static VALUE 2664reg_operand(VALUE s, int check) 2665{ 2666 if (SYMBOL_P(s)) { 2667 return rb_sym_to_s(s); 2668 } 2669 else { 2670 return (check ? rb_str_to_str : rb_check_string_type)(s); 2671 } 2672} 2673 2674static long 2675reg_match_pos(VALUE re, VALUE *strp, long pos) 2676{ 2677 VALUE str = *strp; 2678 2679 if (NIL_P(str)) { 2680 rb_backref_set(Qnil); 2681 return -1; 2682 } 2683 *strp = str = reg_operand(str, TRUE); 2684 if (pos != 0) { 2685 if (pos < 0) { 2686 VALUE l = rb_str_length(str); 2687 pos += NUM2INT(l); 2688 if (pos < 0) { 2689 return pos; 2690 } 2691 } 2692 pos = rb_str_offset(str, pos); 2693 } 2694 return rb_reg_search(re, str, pos, 0); 2695} 2696 2697/* 2698 * call-seq: 2699 * rxp =~ str -> integer or nil 2700 * 2701 * Match---Matches <i>rxp</i> against <i>str</i>. 2702 * 2703 * /at/ =~ "input data" #=> 7 2704 * /ax/ =~ "input data" #=> nil 2705 * 2706 * If <code>=~</code> is used with a regexp literal with named captures, 2707 * captured strings (or nil) is assigned to local variables named by 2708 * the capture names. 2709 * 2710 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = y " 2711 * p lhs #=> "x" 2712 * p rhs #=> "y" 2713 * 2714 * If it is not matched, nil is assigned for the variables. 2715 * 2716 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = " 2717 * p lhs #=> nil 2718 * p rhs #=> nil 2719 * 2720 * This assignment is implemented in the Ruby parser. 2721 * The parser detects 'regexp-literal =~ expression' for the assignment. 2722 * The regexp must be a literal without interpolation and placed at left hand side. 2723 * 2724 * The assignment does not occur if the regexp is not a literal. 2725 * 2726 * re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ 2727 * re =~ " x = y " 2728 * p lhs # undefined local variable 2729 * p rhs # undefined local variable 2730 * 2731 * A regexp interpolation, <code>#{}</code>, also disables 2732 * the assignment. 2733 * 2734 * rhs_pat = /(?<rhs>\w+)/ 2735 * /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y" 2736 * p lhs # undefined local variable 2737 * 2738 * The assignment does not occur if the regexp is placed at the right hand side. 2739 * 2740 * " x = y " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ 2741 * p lhs, rhs # undefined local variable 2742 * 2743 */ 2744 2745VALUE 2746rb_reg_match(VALUE re, VALUE str) 2747{ 2748 long pos = reg_match_pos(re, &str, 0); 2749 if (pos < 0) return Qnil; 2750 pos = rb_str_sublen(str, pos); 2751 return LONG2FIX(pos); 2752} 2753 2754/* 2755 * call-seq: 2756 * rxp === str -> true or false 2757 * 2758 * Case Equality---Used in case statements. 2759 * 2760 * a = "HELLO" 2761 * case a 2762 * when /^[a-z]*$/; print "Lower case\n" 2763 * when /^[A-Z]*$/; print "Upper case\n" 2764 * else; print "Mixed case\n" 2765 * end 2766 * #=> "Upper case" 2767 * 2768 * Following a regular expression literal with the #=== operator allows you to 2769 * compare against a String. 2770 * 2771 * /^[a-z]*$/ === "HELLO" #=> false 2772 * /^[A-Z]*$/ === "HELLO" #=> true 2773 */ 2774 2775VALUE 2776rb_reg_eqq(VALUE re, VALUE str) 2777{ 2778 long start; 2779 2780 str = reg_operand(str, FALSE); 2781 if (NIL_P(str)) { 2782 rb_backref_set(Qnil); 2783 return Qfalse; 2784 } 2785 start = rb_reg_search(re, str, 0, 0); 2786 if (start < 0) { 2787 return Qfalse; 2788 } 2789 return Qtrue; 2790} 2791 2792 2793/* 2794 * call-seq: 2795 * ~ rxp -> integer or nil 2796 * 2797 * Match---Matches <i>rxp</i> against the contents of <code>$_</code>. 2798 * Equivalent to <code><i>rxp</i> =~ $_</code>. 2799 * 2800 * $_ = "input data" 2801 * ~ /at/ #=> 7 2802 */ 2803 2804VALUE 2805rb_reg_match2(VALUE re) 2806{ 2807 long start; 2808 VALUE line = rb_lastline_get(); 2809 2810 if (!RB_TYPE_P(line, T_STRING)) { 2811 rb_backref_set(Qnil); 2812 return Qnil; 2813 } 2814 2815 start = rb_reg_search(re, line, 0, 0); 2816 if (start < 0) { 2817 return Qnil; 2818 } 2819 start = rb_str_sublen(line, start); 2820 return LONG2FIX(start); 2821} 2822 2823 2824/* 2825 * call-seq: 2826 * rxp.match(str) -> matchdata or nil 2827 * rxp.match(str,pos) -> matchdata or nil 2828 * 2829 * Returns a <code>MatchData</code> object describing the match, or 2830 * <code>nil</code> if there was no match. This is equivalent to retrieving the 2831 * value of the special variable <code>$~</code> following a normal match. 2832 * If the second parameter is present, it specifies the position in the string 2833 * to begin the search. 2834 * 2835 * /(.)(.)(.)/.match("abc")[2] #=> "b" 2836 * /(.)(.)/.match("abc", 1)[2] #=> "c" 2837 * 2838 * If a block is given, invoke the block with MatchData if match succeed, so 2839 * that you can write 2840 * 2841 * pat.match(str) {|m| ...} 2842 * 2843 * instead of 2844 * 2845 * if m = pat.match(str) 2846 * ... 2847 * end 2848 * 2849 * The return value is a value from block execution in this case. 2850 */ 2851 2852static VALUE 2853rb_reg_match_m(int argc, VALUE *argv, VALUE re) 2854{ 2855 VALUE result, str, initpos; 2856 long pos; 2857 2858 if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) { 2859 pos = NUM2LONG(initpos); 2860 } 2861 else { 2862 pos = 0; 2863 } 2864 2865 pos = reg_match_pos(re, &str, pos); 2866 if (pos < 0) { 2867 rb_backref_set(Qnil); 2868 return Qnil; 2869 } 2870 result = rb_backref_get(); 2871 rb_match_busy(result); 2872 if (!NIL_P(result) && rb_block_given_p()) { 2873 return rb_yield(result); 2874 } 2875 return result; 2876} 2877 2878/* 2879 * Document-method: compile 2880 * 2881 * Synonym for <code>Regexp.new</code> 2882 */ 2883 2884 2885/* 2886 * call-seq: 2887 * Regexp.new(string, [options [, kcode]]) -> regexp 2888 * Regexp.new(regexp) -> regexp 2889 * Regexp.compile(string, [options [, kcode]]) -> regexp 2890 * Regexp.compile(regexp) -> regexp 2891 * 2892 * Constructs a new regular expression from +pattern+, which can be either a 2893 * String or a Regexp (in which case that regexp's options are propagated), 2894 * and new options may not be specified (a change as of Ruby 1.8). 2895 * 2896 * If +options+ is a Fixnum, it should be one or more of the constants 2897 * Regexp::EXTENDED, Regexp::IGNORECASE, and Regexp::MULTILINE, 2898 * <em>or</em>-ed together. Otherwise, if +options+ is not 2899 * +nil+ or +false+, the regexp will be case insensitive. 2900 * 2901 * When the +kcode+ parameter is `n' or `N' sets the regexp no encoding. 2902 * It means that the regexp is for binary strings. 2903 * 2904 * r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/ 2905 * r2 = Regexp.new('cat', true) #=> /cat/i 2906 * r3 = Regexp.new(r2) #=> /cat/i 2907 * r4 = Regexp.new('dog', Regexp::EXTENDED | Regexp::IGNORECASE) #=> /dog/ix 2908 */ 2909 2910static VALUE 2911rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) 2912{ 2913 onig_errmsg_buffer err = ""; 2914 int flags = 0; 2915 VALUE str; 2916 rb_encoding *enc; 2917 const char *ptr; 2918 long len; 2919 2920 rb_check_arity(argc, 1, 3); 2921 if (RB_TYPE_P(argv[0], T_REGEXP)) { 2922 VALUE re = argv[0]; 2923 2924 if (argc > 1) { 2925 rb_warn("flags ignored"); 2926 } 2927 rb_reg_check(re); 2928 flags = rb_reg_options(re); 2929 ptr = RREGEXP_SRC_PTR(re); 2930 len = RREGEXP_SRC_LEN(re); 2931 enc = rb_enc_get(re); 2932 if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) { 2933 str = rb_enc_str_new(ptr, len, enc); 2934 rb_reg_raise_str(str, flags, err); 2935 } 2936 } 2937 else { 2938 if (argc >= 2) { 2939 if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]); 2940 else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE; 2941 } 2942 enc = 0; 2943 if (argc == 3 && !NIL_P(argv[2])) { 2944 char *kcode = StringValuePtr(argv[2]); 2945 if (kcode[0] == 'n' || kcode[0] == 'N') { 2946 enc = rb_ascii8bit_encoding(); 2947 flags |= ARG_ENCODING_NONE; 2948 } 2949 else { 2950 rb_warn("encoding option is ignored - %s", kcode); 2951 } 2952 } 2953 str = argv[0]; 2954 ptr = StringValuePtr(str); 2955 if (enc 2956 ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0) 2957 : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) { 2958 rb_reg_raise_str(str, flags, err); 2959 } 2960 } 2961 return self; 2962} 2963 2964VALUE 2965rb_reg_quote(VALUE str) 2966{ 2967 rb_encoding *enc = rb_enc_get(str); 2968 char *s, *send, *t; 2969 VALUE tmp; 2970 int c, clen; 2971 int ascii_only = rb_enc_str_asciionly_p(str); 2972 2973 s = RSTRING_PTR(str); 2974 send = s + RSTRING_LEN(str); 2975 while (s < send) { 2976 c = rb_enc_ascget(s, send, &clen, enc); 2977 if (c == -1) { 2978 s += mbclen(s, send, enc); 2979 continue; 2980 } 2981 switch (c) { 2982 case '[': case ']': case '{': case '}': 2983 case '(': case ')': case '|': case '-': 2984 case '*': case '.': case '\\': 2985 case '?': case '+': case '^': case '$': 2986 case ' ': case '#': 2987 case '\t': case '\f': case '\v': case '\n': case '\r': 2988 goto meta_found; 2989 } 2990 s += clen; 2991 } 2992 tmp = rb_str_new3(str); 2993 if (ascii_only) { 2994 rb_enc_associate(tmp, rb_usascii_encoding()); 2995 } 2996 return tmp; 2997 2998 meta_found: 2999 tmp = rb_str_new(0, RSTRING_LEN(str)*2); 3000 if (ascii_only) { 3001 rb_enc_associate(tmp, rb_usascii_encoding()); 3002 } 3003 else { 3004 rb_enc_copy(tmp, str); 3005 } 3006 t = RSTRING_PTR(tmp); 3007 /* copy upto metacharacter */ 3008 memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str)); 3009 t += s - RSTRING_PTR(str); 3010 3011 while (s < send) { 3012 c = rb_enc_ascget(s, send, &clen, enc); 3013 if (c == -1) { 3014 int n = mbclen(s, send, enc); 3015 3016 while (n--) 3017 *t++ = *s++; 3018 continue; 3019 } 3020 s += clen; 3021 switch (c) { 3022 case '[': case ']': case '{': case '}': 3023 case '(': case ')': case '|': case '-': 3024 case '*': case '.': case '\\': 3025 case '?': case '+': case '^': case '$': 3026 case '#': 3027 t += rb_enc_mbcput('\\', t, enc); 3028 break; 3029 case ' ': 3030 t += rb_enc_mbcput('\\', t, enc); 3031 t += rb_enc_mbcput(' ', t, enc); 3032 continue; 3033 case '\t': 3034 t += rb_enc_mbcput('\\', t, enc); 3035 t += rb_enc_mbcput('t', t, enc); 3036 continue; 3037 case '\n': 3038 t += rb_enc_mbcput('\\', t, enc); 3039 t += rb_enc_mbcput('n', t, enc); 3040 continue; 3041 case '\r': 3042 t += rb_enc_mbcput('\\', t, enc); 3043 t += rb_enc_mbcput('r', t, enc); 3044 continue; 3045 case '\f': 3046 t += rb_enc_mbcput('\\', t, enc); 3047 t += rb_enc_mbcput('f', t, enc); 3048 continue; 3049 case '\v': 3050 t += rb_enc_mbcput('\\', t, enc); 3051 t += rb_enc_mbcput('v', t, enc); 3052 continue; 3053 } 3054 t += rb_enc_mbcput(c, t, enc); 3055 } 3056 rb_str_resize(tmp, t - RSTRING_PTR(tmp)); 3057 OBJ_INFECT(tmp, str); 3058 return tmp; 3059} 3060 3061 3062/* 3063 * call-seq: 3064 * Regexp.escape(str) -> string 3065 * Regexp.quote(str) -> string 3066 * 3067 * Escapes any characters that would have special meaning in a regular 3068 * expression. Returns a new escaped string, or self if no characters are 3069 * escaped. For any string, 3070 * <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true. 3071 * 3072 * Regexp.escape('\*?{}.') #=> \\\*\?\{\}\. 3073 * 3074 */ 3075 3076static VALUE 3077rb_reg_s_quote(VALUE c, VALUE str) 3078{ 3079 return rb_reg_quote(reg_operand(str, TRUE)); 3080} 3081 3082int 3083rb_reg_options(VALUE re) 3084{ 3085 int options; 3086 3087 rb_reg_check(re); 3088 options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK; 3089 if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED; 3090 if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE; 3091 return options; 3092} 3093 3094VALUE 3095rb_check_regexp_type(VALUE re) 3096{ 3097 return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp"); 3098} 3099 3100/* 3101 * call-seq: 3102 * Regexp.try_convert(obj) -> re or nil 3103 * 3104 * Try to convert <i>obj</i> into a Regexp, using to_regexp method. 3105 * Returns converted regexp or nil if <i>obj</i> cannot be converted 3106 * for any reason. 3107 * 3108 * Regexp.try_convert(/re/) #=> /re/ 3109 * Regexp.try_convert("re") #=> nil 3110 * 3111 * o = Object.new 3112 * Regexp.try_convert(o) #=> nil 3113 * def o.to_regexp() /foo/ end 3114 * Regexp.try_convert(o) #=> /foo/ 3115 * 3116 */ 3117static VALUE 3118rb_reg_s_try_convert(VALUE dummy, VALUE re) 3119{ 3120 return rb_check_regexp_type(re); 3121} 3122 3123static VALUE 3124rb_reg_s_union(VALUE self, VALUE args0) 3125{ 3126 long argc = RARRAY_LEN(args0); 3127 3128 if (argc == 0) { 3129 VALUE args[1]; 3130 args[0] = rb_str_new2("(?!)"); 3131 return rb_class_new_instance(1, args, rb_cRegexp); 3132 } 3133 else if (argc == 1) { 3134 VALUE arg = rb_ary_entry(args0, 0); 3135 VALUE re = rb_check_regexp_type(arg); 3136 if (!NIL_P(re)) 3137 return re; 3138 else { 3139 VALUE quoted; 3140 quoted = rb_reg_s_quote(Qnil, arg); 3141 return rb_reg_new_str(quoted, 0); 3142 } 3143 } 3144 else { 3145 int i; 3146 VALUE source = rb_str_buf_new(0); 3147 rb_encoding *result_enc; 3148 3149 int has_asciionly = 0; 3150 rb_encoding *has_ascii_compat_fixed = 0; 3151 rb_encoding *has_ascii_incompat = 0; 3152 3153 for (i = 0; i < argc; i++) { 3154 volatile VALUE v; 3155 VALUE e = rb_ary_entry(args0, i); 3156 3157 if (0 < i) 3158 rb_str_buf_cat_ascii(source, "|"); 3159 3160 v = rb_check_regexp_type(e); 3161 if (!NIL_P(v)) { 3162 rb_encoding *enc = rb_enc_get(v); 3163 if (!rb_enc_asciicompat(enc)) { 3164 if (!has_ascii_incompat) 3165 has_ascii_incompat = enc; 3166 else if (has_ascii_incompat != enc) 3167 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 3168 rb_enc_name(has_ascii_incompat), rb_enc_name(enc)); 3169 } 3170 else if (rb_reg_fixed_encoding_p(v)) { 3171 if (!has_ascii_compat_fixed) 3172 has_ascii_compat_fixed = enc; 3173 else if (has_ascii_compat_fixed != enc) 3174 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 3175 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc)); 3176 } 3177 else { 3178 has_asciionly = 1; 3179 } 3180 v = rb_reg_to_s(v); 3181 } 3182 else { 3183 rb_encoding *enc; 3184 StringValue(e); 3185 enc = rb_enc_get(e); 3186 if (!rb_enc_str_asciicompat_p(e)) { 3187 if (!has_ascii_incompat) 3188 has_ascii_incompat = enc; 3189 else if (has_ascii_incompat != enc) 3190 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 3191 rb_enc_name(has_ascii_incompat), rb_enc_name(enc)); 3192 } 3193 else if (rb_enc_str_asciionly_p(e)) { 3194 has_asciionly = 1; 3195 } 3196 else { 3197 if (!has_ascii_compat_fixed) 3198 has_ascii_compat_fixed = enc; 3199 else if (has_ascii_compat_fixed != enc) 3200 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 3201 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc)); 3202 } 3203 v = rb_reg_s_quote(Qnil, e); 3204 } 3205 if (has_ascii_incompat) { 3206 if (has_asciionly) { 3207 rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", 3208 rb_enc_name(has_ascii_incompat)); 3209 } 3210 if (has_ascii_compat_fixed) { 3211 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 3212 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed)); 3213 } 3214 } 3215 3216 if (i == 0) { 3217 rb_enc_copy(source, v); 3218 } 3219 rb_str_append(source, v); 3220 } 3221 3222 if (has_ascii_incompat) { 3223 result_enc = has_ascii_incompat; 3224 } 3225 else if (has_ascii_compat_fixed) { 3226 result_enc = has_ascii_compat_fixed; 3227 } 3228 else { 3229 result_enc = rb_ascii8bit_encoding(); 3230 } 3231 3232 rb_enc_associate(source, result_enc); 3233 return rb_class_new_instance(1, &source, rb_cRegexp); 3234 } 3235} 3236 3237/* 3238 * call-seq: 3239 * Regexp.union(pat1, pat2, ...) -> new_regexp 3240 * Regexp.union(pats_ary) -> new_regexp 3241 * 3242 * Return a <code>Regexp</code> object that is the union of the given 3243 * <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s 3244 * can be Regexp objects, in which case their options will be preserved, or 3245 * Strings. If no patterns are given, returns <code>/(?!)/</code>. 3246 * The behavior is unspecified if any given <em>pattern</em> contains capture. 3247 * 3248 * Regexp.union #=> /(?!)/ 3249 * Regexp.union("penzance") #=> /penzance/ 3250 * Regexp.union("a+b*c") #=> /a\+b\*c/ 3251 * Regexp.union("skiing", "sledding") #=> /skiing|sledding/ 3252 * Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/ 3253 * Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/ 3254 */ 3255static VALUE 3256rb_reg_s_union_m(VALUE self, VALUE args) 3257{ 3258 VALUE v; 3259 if (RARRAY_LEN(args) == 1 && 3260 !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) { 3261 return rb_reg_s_union(self, v); 3262 } 3263 return rb_reg_s_union(self, args); 3264} 3265 3266/* :nodoc: */ 3267static VALUE 3268rb_reg_init_copy(VALUE copy, VALUE re) 3269{ 3270 onig_errmsg_buffer err = ""; 3271 const char *s; 3272 long len; 3273 3274 if (!OBJ_INIT_COPY(copy, re)) return copy; 3275 rb_reg_check(re); 3276 s = RREGEXP_SRC_PTR(re); 3277 len = RREGEXP_SRC_LEN(re); 3278 if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re), 3279 err, NULL, 0) != 0) { 3280 rb_reg_raise(s, len, err, re); 3281 } 3282 return copy; 3283} 3284 3285VALUE 3286rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) 3287{ 3288 VALUE val = 0; 3289 char *p, *s, *e; 3290 int no, clen; 3291 rb_encoding *str_enc = rb_enc_get(str); 3292 rb_encoding *src_enc = rb_enc_get(src); 3293 int acompat = rb_enc_asciicompat(str_enc); 3294#define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc)) 3295 3296 p = s = RSTRING_PTR(str); 3297 e = s + RSTRING_LEN(str); 3298 3299 while (s < e) { 3300 int c = ASCGET(s, e, &clen); 3301 char *ss; 3302 3303 if (c == -1) { 3304 s += mbclen(s, e, str_enc); 3305 continue; 3306 } 3307 ss = s; 3308 s += clen; 3309 3310 if (c != '\\' || s == e) continue; 3311 3312 if (!val) { 3313 val = rb_str_buf_new(ss-p); 3314 } 3315 rb_enc_str_buf_cat(val, p, ss-p, str_enc); 3316 3317 c = ASCGET(s, e, &clen); 3318 if (c == -1) { 3319 s += mbclen(s, e, str_enc); 3320 rb_enc_str_buf_cat(val, ss, s-ss, str_enc); 3321 p = s; 3322 continue; 3323 } 3324 s += clen; 3325 3326 p = s; 3327 switch (c) { 3328 case '1': case '2': case '3': case '4': 3329 case '5': case '6': case '7': case '8': case '9': 3330 if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) { 3331 no = c - '0'; 3332 } 3333 else { 3334 continue; 3335 } 3336 break; 3337 3338 case 'k': 3339 if (s < e && ASCGET(s, e, &clen) == '<') { 3340 char *name, *name_end; 3341 3342 name_end = name = s + clen; 3343 while (name_end < e) { 3344 c = ASCGET(name_end, e, &clen); 3345 if (c == '>') break; 3346 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen; 3347 } 3348 if (name_end < e) { 3349 no = name_to_backref_number(regs, regexp, name, name_end); 3350 p = s = name_end + clen; 3351 break; 3352 } 3353 else { 3354 rb_raise(rb_eRuntimeError, "invalid group name reference format"); 3355 } 3356 } 3357 3358 rb_enc_str_buf_cat(val, ss, s-ss, str_enc); 3359 continue; 3360 3361 case '0': 3362 case '&': 3363 no = 0; 3364 break; 3365 3366 case '`': 3367 rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc); 3368 continue; 3369 3370 case '\'': 3371 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc); 3372 continue; 3373 3374 case '+': 3375 no = regs->num_regs-1; 3376 while (BEG(no) == -1 && no > 0) no--; 3377 if (no == 0) continue; 3378 break; 3379 3380 case '\\': 3381 rb_enc_str_buf_cat(val, s-clen, clen, str_enc); 3382 continue; 3383 3384 default: 3385 rb_enc_str_buf_cat(val, ss, s-ss, str_enc); 3386 continue; 3387 } 3388 3389 if (no >= 0) { 3390 if (no >= regs->num_regs) continue; 3391 if (BEG(no) == -1) continue; 3392 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc); 3393 } 3394 } 3395 3396 if (!val) return str; 3397 if (p < e) { 3398 rb_enc_str_buf_cat(val, p, e-p, str_enc); 3399 } 3400 3401 return val; 3402} 3403 3404static VALUE 3405kcode_getter(void) 3406{ 3407 rb_warn("variable $KCODE is no longer effective"); 3408 return Qnil; 3409} 3410 3411static void 3412kcode_setter(VALUE val, ID id) 3413{ 3414 rb_warn("variable $KCODE is no longer effective; ignored"); 3415} 3416 3417static VALUE 3418ignorecase_getter(void) 3419{ 3420 rb_warn("variable $= is no longer effective"); 3421 return Qfalse; 3422} 3423 3424static void 3425ignorecase_setter(VALUE val, ID id) 3426{ 3427 rb_warn("variable $= is no longer effective; ignored"); 3428} 3429 3430static VALUE 3431match_getter(void) 3432{ 3433 VALUE match = rb_backref_get(); 3434 3435 if (NIL_P(match)) return Qnil; 3436 rb_match_busy(match); 3437 return match; 3438} 3439 3440static void 3441match_setter(VALUE val) 3442{ 3443 if (!NIL_P(val)) { 3444 Check_Type(val, T_MATCH); 3445 } 3446 rb_backref_set(val); 3447} 3448 3449/* 3450 * call-seq: 3451 * Regexp.last_match -> matchdata 3452 * Regexp.last_match(n) -> str 3453 * 3454 * The first form returns the MatchData object generated by the 3455 * last successful pattern match. Equivalent to reading the special global 3456 * variable <code>$~</code> (see Special global variables in Regexp for 3457 * details). 3458 * 3459 * The second form returns the <i>n</i>th field in this MatchData object. 3460 * _n_ can be a string or symbol to reference a named capture. 3461 * 3462 * Note that the last_match is local to the thread and method scope of the 3463 * method that did the pattern match. 3464 * 3465 * /c(.)t/ =~ 'cat' #=> 0 3466 * Regexp.last_match #=> #<MatchData "cat" 1:"a"> 3467 * Regexp.last_match(0) #=> "cat" 3468 * Regexp.last_match(1) #=> "a" 3469 * Regexp.last_match(2) #=> nil 3470 * 3471 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val" 3472 * Regexp.last_match #=> #<MatchData "var = val" lhs:"var" rhs:"val"> 3473 * Regexp.last_match(:lhs) #=> "var" 3474 * Regexp.last_match(:rhs) #=> "val" 3475 */ 3476 3477static VALUE 3478rb_reg_s_last_match(int argc, VALUE *argv) 3479{ 3480 VALUE nth; 3481 3482 if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) { 3483 VALUE match = rb_backref_get(); 3484 int n; 3485 if (NIL_P(match)) return Qnil; 3486 n = match_backref_number(match, nth); 3487 return rb_reg_nth_match(n, match); 3488 } 3489 return match_getter(); 3490} 3491 3492static void 3493re_warn(const char *s) 3494{ 3495 rb_warn("%s", s); 3496} 3497 3498/* 3499 * Document-class: RegexpError 3500 * 3501 * Raised when given an invalid regexp expression. 3502 * 3503 * Regexp.new("?") 3504 * 3505 * <em>raises the exception:</em> 3506 * 3507 * RegexpError: target of repeat operator is not specified: /?/ 3508 */ 3509 3510/* 3511 * Document-class: Regexp 3512 * 3513 * A <code>Regexp</code> holds a regular expression, used to match a pattern 3514 * against strings. Regexps are created using the <code>/.../</code> and 3515 * <code>%r{...}</code> literals, and by the <code>Regexp::new</code> 3516 * constructor. 3517 * 3518 * :include: doc/re.rdoc 3519 */ 3520 3521void 3522Init_Regexp(void) 3523{ 3524 rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError); 3525 3526 onigenc_set_default_caseconv_table((UChar*)casetable); 3527 onigenc_set_default_encoding(ONIG_ENCODING_ASCII); 3528 onig_set_warn_func(re_warn); 3529 onig_set_verb_warn_func(re_warn); 3530 3531 rb_define_virtual_variable("$~", match_getter, match_setter); 3532 rb_define_virtual_variable("$&", last_match_getter, 0); 3533 rb_define_virtual_variable("$`", prematch_getter, 0); 3534 rb_define_virtual_variable("$'", postmatch_getter, 0); 3535 rb_define_virtual_variable("$+", last_paren_match_getter, 0); 3536 3537 rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter); 3538 rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter); 3539 rb_define_virtual_variable("$-K", kcode_getter, kcode_setter); 3540 3541 rb_cRegexp = rb_define_class("Regexp", rb_cObject); 3542 rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc); 3543 rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1); 3544 rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1); 3545 rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1); 3546 rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2); 3547 rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1); 3548 rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1); 3549 3550 rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1); 3551 rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1); 3552 rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0); 3553 rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1); 3554 rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1); 3555 rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1); 3556 rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1); 3557 rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0); 3558 rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1); 3559 rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0); 3560 rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0); 3561 rb_define_method(rb_cRegexp, "source", rb_reg_source, 0); 3562 rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0); 3563 rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0); 3564 rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */ 3565 rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0); 3566 rb_define_method(rb_cRegexp, "names", rb_reg_names, 0); 3567 rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0); 3568 3569 /* see Regexp.options and Regexp.new */ 3570 rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE)); 3571 /* see Regexp.options and Regexp.new */ 3572 rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND)); 3573 /* see Regexp.options and Regexp.new */ 3574 rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE)); 3575 /* see Regexp.options and Regexp.new */ 3576 rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED)); 3577 /* see Regexp.options and Regexp.new */ 3578 rb_define_const(rb_cRegexp, "NOENCODING", INT2FIX(ARG_ENCODING_NONE)); 3579 3580 rb_global_variable(®_cache); 3581 3582 rb_cMatch = rb_define_class("MatchData", rb_cObject); 3583 rb_define_alloc_func(rb_cMatch, match_alloc); 3584 rb_undef_method(CLASS_OF(rb_cMatch), "new"); 3585 3586 rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1); 3587 rb_define_method(rb_cMatch, "regexp", match_regexp, 0); 3588 rb_define_method(rb_cMatch, "names", match_names, 0); 3589 rb_define_method(rb_cMatch, "size", match_size, 0); 3590 rb_define_method(rb_cMatch, "length", match_size, 0); 3591 rb_define_method(rb_cMatch, "offset", match_offset, 1); 3592 rb_define_method(rb_cMatch, "begin", match_begin, 1); 3593 rb_define_method(rb_cMatch, "end", match_end, 1); 3594 rb_define_method(rb_cMatch, "to_a", match_to_a, 0); 3595 rb_define_method(rb_cMatch, "[]", match_aref, -1); 3596 rb_define_method(rb_cMatch, "captures", match_captures, 0); 3597 rb_define_method(rb_cMatch, "values_at", match_values_at, -1); 3598 rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0); 3599 rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0); 3600 rb_define_method(rb_cMatch, "to_s", match_to_s, 0); 3601 rb_define_method(rb_cMatch, "inspect", match_inspect, 0); 3602 rb_define_method(rb_cMatch, "string", match_string, 0); 3603 rb_define_method(rb_cMatch, "hash", match_hash, 0); 3604 rb_define_method(rb_cMatch, "eql?", match_equal, 1); 3605 rb_define_method(rb_cMatch, "==", match_equal, 1); 3606} 3607