1/********************************************************************** 2 3 string.c - 4 5 $Author: nagachika $ 6 created at: Mon Aug 9 17:12:58 JST 1993 7 8 Copyright (C) 1993-2007 Yukihiro Matsumoto 9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 10 Copyright (C) 2000 Information-technology Promotion Agency, Japan 11 12**********************************************************************/ 13 14#include "ruby/ruby.h" 15#include "ruby/re.h" 16#include "ruby/encoding.h" 17#include "vm_core.h" 18#include "internal.h" 19#include "probes.h" 20#include <assert.h> 21 22#define BEG(no) (regs->beg[(no)]) 23#define END(no) (regs->end[(no)]) 24 25#include <math.h> 26#include <ctype.h> 27 28#ifdef HAVE_UNISTD_H 29#include <unistd.h> 30#endif 31 32#define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) 33 34#undef rb_str_new_cstr 35#undef rb_tainted_str_new_cstr 36#undef rb_usascii_str_new_cstr 37#undef rb_external_str_new_cstr 38#undef rb_locale_str_new_cstr 39#undef rb_str_new2 40#undef rb_str_new3 41#undef rb_str_new4 42#undef rb_str_new5 43#undef rb_tainted_str_new2 44#undef rb_usascii_str_new2 45#undef rb_str_dup_frozen 46#undef rb_str_buf_new_cstr 47#undef rb_str_buf_new2 48#undef rb_str_buf_cat2 49#undef rb_str_cat2 50 51static VALUE rb_str_clear(VALUE str); 52 53VALUE rb_cString; 54VALUE rb_cSymbol; 55 56#define RUBY_MAX_CHAR_LEN 16 57#define STR_TMPLOCK FL_USER7 58#define STR_NOEMBED FL_USER1 59#define STR_SHARED FL_USER2 /* = ELTS_SHARED */ 60#define STR_ASSOC FL_USER3 61#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED) 62#define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC) 63#define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC) 64#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC)) 65#define STR_UNSET_NOCAPA(s) do {\ 66 if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\ 67} while (0) 68 69 70#define STR_SET_NOEMBED(str) do {\ 71 FL_SET((str), STR_NOEMBED);\ 72 
STR_SET_EMBED_LEN((str), 0);\ 73} while (0) 74#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED) 75#define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED)) 76#define STR_SET_EMBED_LEN(str, n) do { \ 77 long tmp_n = (n);\ 78 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\ 79 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\ 80} while (0) 81 82#define STR_SET_LEN(str, n) do { \ 83 if (STR_EMBED_P(str)) {\ 84 STR_SET_EMBED_LEN((str), (n));\ 85 }\ 86 else {\ 87 RSTRING(str)->as.heap.len = (n);\ 88 }\ 89} while (0) 90 91#define STR_DEC_LEN(str) do {\ 92 if (STR_EMBED_P(str)) {\ 93 long n = RSTRING_LEN(str);\ 94 n--;\ 95 STR_SET_EMBED_LEN((str), n);\ 96 }\ 97 else {\ 98 RSTRING(str)->as.heap.len--;\ 99 }\ 100} while (0) 101 102#define RESIZE_CAPA(str,capacity) do {\ 103 if (STR_EMBED_P(str)) {\ 104 if ((capacity) > RSTRING_EMBED_LEN_MAX) {\ 105 char *tmp = ALLOC_N(char, (capacity)+1);\ 106 memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\ 107 RSTRING(str)->as.heap.ptr = tmp;\ 108 RSTRING(str)->as.heap.len = RSTRING_LEN(str);\ 109 STR_SET_NOEMBED(str);\ 110 RSTRING(str)->as.heap.aux.capa = (capacity);\ 111 }\ 112 }\ 113 else {\ 114 REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\ 115 if (!STR_NOCAPA_P(str))\ 116 RSTRING(str)->as.heap.aux.capa = (capacity);\ 117 }\ 118} while (0) 119 120#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 121#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) 122 123#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str)) 124 125static inline int 126single_byte_optimizable(VALUE str) 127{ 128 rb_encoding *enc; 129 130 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */ 131 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) 132 return 1; 133 134 enc = STR_ENC_GET(str); 135 if (rb_enc_mbmaxlen(enc) == 1) 136 return 1; 137 138 /* Conservative. Possibly single byte. 139 * "\xa1" in Shift_JIS for example. 
*/ 140 return 0; 141} 142 143VALUE rb_fs; 144 145static inline const char * 146search_nonascii(const char *p, const char *e) 147{ 148#if SIZEOF_VALUE == 8 149# define NONASCII_MASK 0x8080808080808080ULL 150#elif SIZEOF_VALUE == 4 151# define NONASCII_MASK 0x80808080UL 152#endif 153#ifdef NONASCII_MASK 154 if ((int)sizeof(VALUE) * 2 < e - p) { 155 const VALUE *s, *t; 156 const VALUE lowbits = sizeof(VALUE) - 1; 157 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 158 while (p < (const char *)s) { 159 if (!ISASCII(*p)) 160 return p; 161 p++; 162 } 163 t = (const VALUE*)(~lowbits & (VALUE)e); 164 while (s < t) { 165 if (*s & NONASCII_MASK) { 166 t = s; 167 break; 168 } 169 s++; 170 } 171 p = (const char *)t; 172 } 173#endif 174 while (p < e) { 175 if (!ISASCII(*p)) 176 return p; 177 p++; 178 } 179 return NULL; 180} 181 182static int 183coderange_scan(const char *p, long len, rb_encoding *enc) 184{ 185 const char *e = p + len; 186 187 if (rb_enc_to_index(enc) == 0) { 188 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 189 p = search_nonascii(p, e); 190 return p ? 
ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT; 191 } 192 193 if (rb_enc_asciicompat(enc)) { 194 p = search_nonascii(p, e); 195 if (!p) { 196 return ENC_CODERANGE_7BIT; 197 } 198 while (p < e) { 199 int ret = rb_enc_precise_mbclen(p, e, enc); 200 if (!MBCLEN_CHARFOUND_P(ret)) { 201 return ENC_CODERANGE_BROKEN; 202 } 203 p += MBCLEN_CHARFOUND_LEN(ret); 204 if (p < e) { 205 p = search_nonascii(p, e); 206 if (!p) { 207 return ENC_CODERANGE_VALID; 208 } 209 } 210 } 211 if (e < p) { 212 return ENC_CODERANGE_BROKEN; 213 } 214 return ENC_CODERANGE_VALID; 215 } 216 217 while (p < e) { 218 int ret = rb_enc_precise_mbclen(p, e, enc); 219 220 if (!MBCLEN_CHARFOUND_P(ret)) { 221 return ENC_CODERANGE_BROKEN; 222 } 223 p += MBCLEN_CHARFOUND_LEN(ret); 224 } 225 if (e < p) { 226 return ENC_CODERANGE_BROKEN; 227 } 228 return ENC_CODERANGE_VALID; 229} 230 231long 232rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr) 233{ 234 const char *p = s; 235 236 if (*cr == ENC_CODERANGE_BROKEN) 237 return e - s; 238 239 if (rb_enc_to_index(enc) == 0) { 240 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 241 p = search_nonascii(p, e); 242 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 243 return e - s; 244 } 245 else if (rb_enc_asciicompat(enc)) { 246 p = search_nonascii(p, e); 247 if (!p) { 248 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT; 249 return e - s; 250 } 251 while (p < e) { 252 int ret = rb_enc_precise_mbclen(p, e, enc); 253 if (!MBCLEN_CHARFOUND_P(ret)) { 254 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 255 return p - s; 256 } 257 p += MBCLEN_CHARFOUND_LEN(ret); 258 if (p < e) { 259 p = search_nonascii(p, e); 260 if (!p) { 261 *cr = ENC_CODERANGE_VALID; 262 return e - s; 263 } 264 } 265 } 266 *cr = e < p ? 
ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 267 return p - s; 268 } 269 else { 270 while (p < e) { 271 int ret = rb_enc_precise_mbclen(p, e, enc); 272 if (!MBCLEN_CHARFOUND_P(ret)) { 273 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 274 return p - s; 275 } 276 p += MBCLEN_CHARFOUND_LEN(ret); 277 } 278 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 279 return p - s; 280 } 281} 282 283static inline void 284str_enc_copy(VALUE str1, VALUE str2) 285{ 286 rb_enc_set_index(str1, ENCODING_GET(str2)); 287} 288 289static void 290rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src) 291{ 292 /* this function is designed for copying encoding and coderange 293 * from src to new string "dest" which is made from the part of src. 294 */ 295 str_enc_copy(dest, src); 296 if (RSTRING_LEN(dest) == 0) { 297 if (!rb_enc_asciicompat(STR_ENC_GET(src))) 298 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 299 else 300 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 301 return; 302 } 303 switch (ENC_CODERANGE(src)) { 304 case ENC_CODERANGE_7BIT: 305 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 306 break; 307 case ENC_CODERANGE_VALID: 308 if (!rb_enc_asciicompat(STR_ENC_GET(src)) || 309 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest))) 310 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 311 else 312 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 313 break; 314 default: 315 break; 316 } 317} 318 319static void 320rb_enc_cr_str_exact_copy(VALUE dest, VALUE src) 321{ 322 str_enc_copy(dest, src); 323 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src)); 324} 325 326int 327rb_enc_str_coderange(VALUE str) 328{ 329 int cr = ENC_CODERANGE(str); 330 331 if (cr == ENC_CODERANGE_UNKNOWN) { 332 rb_encoding *enc = STR_ENC_GET(str); 333 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); 334 ENC_CODERANGE_SET(str, cr); 335 } 336 return cr; 337} 338 339int 340rb_enc_str_asciionly_p(VALUE str) 341{ 342 rb_encoding *enc = STR_ENC_GET(str); 343 344 if 
(!rb_enc_asciicompat(enc)) 345 return FALSE; 346 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 347 return TRUE; 348 return FALSE; 349} 350 351static inline void 352str_mod_check(VALUE s, const char *p, long len) 353{ 354 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){ 355 rb_raise(rb_eRuntimeError, "string modified"); 356 } 357} 358 359size_t 360rb_str_capacity(VALUE str) 361{ 362 if (STR_EMBED_P(str)) { 363 return RSTRING_EMBED_LEN_MAX; 364 } 365 else if (STR_NOCAPA_P(str)) { 366 return RSTRING(str)->as.heap.len; 367 } 368 else { 369 return RSTRING(str)->as.heap.aux.capa; 370 } 371} 372 373static inline VALUE 374str_alloc(VALUE klass) 375{ 376 NEWOBJ_OF(str, struct RString, klass, T_STRING); 377 378 str->as.heap.ptr = 0; 379 str->as.heap.len = 0; 380 str->as.heap.aux.capa = 0; 381 382 return (VALUE)str; 383} 384 385static inline VALUE 386empty_str_alloc(VALUE klass) 387{ 388 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) { 389 RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline()); 390 } 391 return str_alloc(klass); 392} 393 394static VALUE 395str_new(VALUE klass, const char *ptr, long len) 396{ 397 VALUE str; 398 399 if (len < 0) { 400 rb_raise(rb_eArgError, "negative string size (or size too big)"); 401 } 402 403 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) { 404 RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline()); 405 } 406 407 str = str_alloc(klass); 408 if (len > RSTRING_EMBED_LEN_MAX) { 409 RSTRING(str)->as.heap.aux.capa = len; 410 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1); 411 STR_SET_NOEMBED(str); 412 } 413 else if (len == 0) { 414 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 415 } 416 if (ptr) { 417 memcpy(RSTRING_PTR(str), ptr, len); 418 } 419 STR_SET_LEN(str, len); 420 RSTRING_PTR(str)[len] = '\0'; 421 return str; 422} 423 424VALUE 425rb_str_new(const char *ptr, long len) 426{ 427 return str_new(rb_cString, ptr, len); 428} 429 430VALUE 431rb_usascii_str_new(const char *ptr, long len) 432{ 433 VALUE str = rb_str_new(ptr, 
len); 434 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 435 return str; 436} 437 438VALUE 439rb_enc_str_new(const char *ptr, long len, rb_encoding *enc) 440{ 441 VALUE str = rb_str_new(ptr, len); 442 rb_enc_associate(str, enc); 443 return str; 444} 445 446VALUE 447rb_str_new_cstr(const char *ptr) 448{ 449 if (!ptr) { 450 rb_raise(rb_eArgError, "NULL pointer given"); 451 } 452 return rb_str_new(ptr, strlen(ptr)); 453} 454 455RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr)) 456#define rb_str_new2 rb_str_new_cstr 457 458VALUE 459rb_usascii_str_new_cstr(const char *ptr) 460{ 461 VALUE str = rb_str_new2(ptr); 462 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 463 return str; 464} 465 466RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr)) 467#define rb_usascii_str_new2 rb_usascii_str_new_cstr 468 469VALUE 470rb_tainted_str_new(const char *ptr, long len) 471{ 472 VALUE str = rb_str_new(ptr, len); 473 474 OBJ_TAINT(str); 475 return str; 476} 477 478VALUE 479rb_tainted_str_new_cstr(const char *ptr) 480{ 481 VALUE str = rb_str_new2(ptr); 482 483 OBJ_TAINT(str); 484 return str; 485} 486 487RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr)) 488#define rb_tainted_str_new2 rb_tainted_str_new_cstr 489 490VALUE 491rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts) 492{ 493 extern VALUE rb_cEncodingConverter; 494 rb_econv_t *ec; 495 rb_econv_result_t ret; 496 long len, olen; 497 VALUE econv_wrapper; 498 VALUE newstr; 499 const unsigned char *start, *sp; 500 unsigned char *dest, *dp; 501 size_t converted_output = 0; 502 503 if (!to) return str; 504 if (!from) from = rb_enc_get(str); 505 if (from == to) return str; 506 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) || 507 to == rb_ascii8bit_encoding()) { 508 if (STR_ENC_GET(str) != to) { 509 str = rb_str_dup(str); 
510 rb_enc_associate(str, to); 511 } 512 return str; 513 } 514 515 len = RSTRING_LEN(str); 516 newstr = rb_str_new(0, len); 517 olen = len; 518 519 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter); 520 RBASIC(econv_wrapper)->klass = 0; 521 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts); 522 if (!ec) return str; 523 DATA_PTR(econv_wrapper) = ec; 524 525 sp = (unsigned char*)RSTRING_PTR(str); 526 start = sp; 527 while ((dest = (unsigned char*)RSTRING_PTR(newstr)), 528 (dp = dest + converted_output), 529 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)), 530 ret == econv_destination_buffer_full) { 531 /* destination buffer short */ 532 size_t converted_input = sp - start; 533 size_t rest = len - converted_input; 534 converted_output = dp - dest; 535 rb_str_set_len(newstr, converted_output); 536 if (converted_input && converted_output && 537 rest < (LONG_MAX / converted_output)) { 538 rest = (rest * converted_output) / converted_input; 539 } 540 else { 541 rest = olen; 542 } 543 olen += rest < 2 ? 
2 : rest; 544 rb_str_resize(newstr, olen); 545 } 546 DATA_PTR(econv_wrapper) = 0; 547 rb_econv_close(ec); 548 rb_gc_force_recycle(econv_wrapper); 549 switch (ret) { 550 case econv_finished: 551 len = dp - (unsigned char*)RSTRING_PTR(newstr); 552 rb_str_set_len(newstr, len); 553 rb_enc_associate(newstr, to); 554 return newstr; 555 556 default: 557 /* some error, return original */ 558 return str; 559 } 560} 561 562VALUE 563rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to) 564{ 565 return rb_str_conv_enc_opts(str, from, to, 0, Qnil); 566} 567 568VALUE 569rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc) 570{ 571 VALUE str; 572 573 str = rb_tainted_str_new(ptr, len); 574 if (eenc == rb_usascii_encoding() && 575 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 576 rb_enc_associate(str, rb_ascii8bit_encoding()); 577 return str; 578 } 579 rb_enc_associate(str, eenc); 580 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding()); 581} 582 583VALUE 584rb_external_str_new(const char *ptr, long len) 585{ 586 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding()); 587} 588 589VALUE 590rb_external_str_new_cstr(const char *ptr) 591{ 592 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding()); 593} 594 595VALUE 596rb_locale_str_new(const char *ptr, long len) 597{ 598 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding()); 599} 600 601VALUE 602rb_locale_str_new_cstr(const char *ptr) 603{ 604 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding()); 605} 606 607VALUE 608rb_filesystem_str_new(const char *ptr, long len) 609{ 610 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding()); 611} 612 613VALUE 614rb_filesystem_str_new_cstr(const char *ptr) 615{ 616 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding()); 617} 618 619VALUE 620rb_str_export(VALUE str) 621{ 622 return rb_str_conv_enc(str, STR_ENC_GET(str), 
rb_default_external_encoding()); 623} 624 625VALUE 626rb_str_export_locale(VALUE str) 627{ 628 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding()); 629} 630 631VALUE 632rb_str_export_to_enc(VALUE str, rb_encoding *enc) 633{ 634 return rb_str_conv_enc(str, STR_ENC_GET(str), enc); 635} 636 637static VALUE 638str_replace_shared_without_enc(VALUE str2, VALUE str) 639{ 640 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) { 641 STR_SET_EMBED(str2); 642 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1); 643 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str)); 644 } 645 else { 646 str = rb_str_new_frozen(str); 647 FL_SET(str2, STR_NOEMBED); 648 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 649 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 650 RSTRING(str2)->as.heap.aux.shared = str; 651 FL_SET(str2, ELTS_SHARED); 652 } 653 return str2; 654} 655 656static VALUE 657str_replace_shared(VALUE str2, VALUE str) 658{ 659 str_replace_shared_without_enc(str2, str); 660 rb_enc_cr_str_exact_copy(str2, str); 661 return str2; 662} 663 664static VALUE 665str_new_shared(VALUE klass, VALUE str) 666{ 667 return str_replace_shared(str_alloc(klass), str); 668} 669 670static VALUE 671str_new3(VALUE klass, VALUE str) 672{ 673 return str_new_shared(klass, str); 674} 675 676VALUE 677rb_str_new_shared(VALUE str) 678{ 679 VALUE str2 = str_new3(rb_obj_class(str), str); 680 681 OBJ_INFECT(str2, str); 682 return str2; 683} 684 685RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str)) 686#define rb_str_new3 rb_str_new_shared 687 688static VALUE 689str_new4(VALUE klass, VALUE str) 690{ 691 VALUE str2; 692 693 str2 = str_alloc(klass); 694 STR_SET_NOEMBED(str2); 695 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 696 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 697 if (STR_SHARED_P(str)) { 698 VALUE shared = RSTRING(str)->as.heap.aux.shared; 699 assert(OBJ_FROZEN(shared)); 700 FL_SET(str2, ELTS_SHARED); 701 RSTRING(str2)->as.heap.aux.shared = shared; 702 } 703 else { 
704 FL_SET(str, ELTS_SHARED); 705 RSTRING(str)->as.heap.aux.shared = str2; 706 } 707 rb_enc_cr_str_exact_copy(str2, str); 708 OBJ_INFECT(str2, str); 709 return str2; 710} 711 712VALUE 713rb_str_new_frozen(VALUE orig) 714{ 715 VALUE klass, str; 716 717 if (OBJ_FROZEN(orig)) return orig; 718 klass = rb_obj_class(orig); 719 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) { 720 long ofs; 721 assert(OBJ_FROZEN(str)); 722 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig); 723 if ((ofs > 0) || (klass != RBASIC(str)->klass) || 724 ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) || 725 ENCODING_GET(str) != ENCODING_GET(orig)) { 726 str = str_new3(klass, str); 727 RSTRING(str)->as.heap.ptr += ofs; 728 RSTRING(str)->as.heap.len -= ofs; 729 rb_enc_cr_str_exact_copy(str, orig); 730 OBJ_INFECT(str, orig); 731 } 732 } 733 else if (STR_EMBED_P(orig)) { 734 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig)); 735 rb_enc_cr_str_exact_copy(str, orig); 736 OBJ_INFECT(str, orig); 737 } 738 else if (STR_ASSOC_P(orig)) { 739 VALUE assoc = RSTRING(orig)->as.heap.aux.shared; 740 FL_UNSET(orig, STR_ASSOC); 741 str = str_new4(klass, orig); 742 FL_SET(str, STR_ASSOC); 743 RSTRING(str)->as.heap.aux.shared = assoc; 744 } 745 else { 746 str = str_new4(klass, orig); 747 } 748 OBJ_FREEZE(str); 749 return str; 750} 751 752RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig)) 753#define rb_str_new4 rb_str_new_frozen 754 755VALUE 756rb_str_new_with_class(VALUE obj, const char *ptr, long len) 757{ 758 return str_new(rb_obj_class(obj), ptr, len); 759} 760 761RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len), 762 rb_str_new_with_class, (obj, ptr, len)) 763#define rb_str_new5 rb_str_new_with_class 764 765static VALUE 766str_new_empty(VALUE str) 767{ 768 VALUE v = rb_str_new5(str, 0, 0); 769 rb_enc_copy(v, str); 770 OBJ_INFECT(v, str); 771 return v; 772} 773 774#define STR_BUF_MIN_SIZE 128 775 776VALUE 777rb_str_buf_new(long 
capa) 778{ 779 VALUE str = str_alloc(rb_cString); 780 781 if (capa < STR_BUF_MIN_SIZE) { 782 capa = STR_BUF_MIN_SIZE; 783 } 784 FL_SET(str, STR_NOEMBED); 785 RSTRING(str)->as.heap.aux.capa = capa; 786 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1); 787 RSTRING(str)->as.heap.ptr[0] = '\0'; 788 789 return str; 790} 791 792VALUE 793rb_str_buf_new_cstr(const char *ptr) 794{ 795 VALUE str; 796 long len = strlen(ptr); 797 798 str = rb_str_buf_new(len); 799 rb_str_buf_cat(str, ptr, len); 800 801 return str; 802} 803 804RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr)) 805#define rb_str_buf_new2 rb_str_buf_new_cstr 806 807VALUE 808rb_str_tmp_new(long len) 809{ 810 return str_new(0, 0, len); 811} 812 813void * 814rb_alloc_tmp_buffer(volatile VALUE *store, long len) 815{ 816 VALUE s = rb_str_tmp_new(len); 817 *store = s; 818 return RSTRING_PTR(s); 819} 820 821void 822rb_free_tmp_buffer(volatile VALUE *store) 823{ 824 VALUE s = *store; 825 *store = 0; 826 if (s) rb_str_clear(s); 827} 828 829void 830rb_str_free(VALUE str) 831{ 832 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { 833 xfree(RSTRING(str)->as.heap.ptr); 834 } 835} 836 837RUBY_FUNC_EXPORTED size_t 838rb_str_memsize(VALUE str) 839{ 840 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { 841 return RSTRING(str)->as.heap.aux.capa; 842 } 843 else { 844 return 0; 845 } 846} 847 848VALUE 849rb_str_to_str(VALUE str) 850{ 851 return rb_convert_type(str, T_STRING, "String", "to_str"); 852} 853 854static inline void str_discard(VALUE str); 855 856void 857rb_str_shared_replace(VALUE str, VALUE str2) 858{ 859 rb_encoding *enc; 860 int cr; 861 if (str == str2) return; 862 enc = STR_ENC_GET(str2); 863 cr = ENC_CODERANGE(str2); 864 str_discard(str); 865 OBJ_INFECT(str, str2); 866 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) { 867 STR_SET_EMBED(str); 868 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1); 869 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2)); 870 rb_enc_associate(str, enc); 871 
ENC_CODERANGE_SET(str, cr); 872 return; 873 } 874 STR_SET_NOEMBED(str); 875 STR_UNSET_NOCAPA(str); 876 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 877 RSTRING(str)->as.heap.len = RSTRING_LEN(str2); 878 if (STR_NOCAPA_P(str2)) { 879 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA); 880 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared; 881 } 882 else { 883 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa; 884 } 885 STR_SET_EMBED(str2); /* abandon str2 */ 886 RSTRING_PTR(str2)[0] = 0; 887 STR_SET_EMBED_LEN(str2, 0); 888 rb_enc_associate(str, enc); 889 ENC_CODERANGE_SET(str, cr); 890} 891 892static ID id_to_s; 893 894VALUE 895rb_obj_as_string(VALUE obj) 896{ 897 VALUE str; 898 899 if (RB_TYPE_P(obj, T_STRING)) { 900 return obj; 901 } 902 str = rb_funcall(obj, id_to_s, 0); 903 if (!RB_TYPE_P(str, T_STRING)) 904 return rb_any_to_s(obj); 905 if (OBJ_TAINTED(obj)) OBJ_TAINT(str); 906 return str; 907} 908 909static VALUE 910str_replace(VALUE str, VALUE str2) 911{ 912 long len; 913 914 len = RSTRING_LEN(str2); 915 if (STR_ASSOC_P(str2)) { 916 str2 = rb_str_new4(str2); 917 } 918 if (STR_SHARED_P(str2)) { 919 VALUE shared = RSTRING(str2)->as.heap.aux.shared; 920 assert(OBJ_FROZEN(shared)); 921 STR_SET_NOEMBED(str); 922 RSTRING(str)->as.heap.len = len; 923 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 924 FL_SET(str, ELTS_SHARED); 925 FL_UNSET(str, STR_ASSOC); 926 RSTRING(str)->as.heap.aux.shared = shared; 927 } 928 else { 929 str_replace_shared(str, str2); 930 } 931 932 OBJ_INFECT(str, str2); 933 rb_enc_cr_str_exact_copy(str, str2); 934 return str; 935} 936 937static VALUE 938str_duplicate(VALUE klass, VALUE str) 939{ 940 VALUE dup = str_alloc(klass); 941 str_replace(dup, str); 942 return dup; 943} 944 945VALUE 946rb_str_dup(VALUE str) 947{ 948 return str_duplicate(rb_obj_class(str), str); 949} 950 951VALUE 952rb_str_resurrect(VALUE str) 953{ 954 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) { 955 RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str), 956 
rb_sourcefile(), rb_sourceline()); 957 } 958 return str_replace(str_alloc(rb_cString), str); 959} 960 961/* 962 * call-seq: 963 * String.new(str="") -> new_str 964 * 965 * Returns a new string object containing a copy of <i>str</i>. 966 */ 967 968static VALUE 969rb_str_init(int argc, VALUE *argv, VALUE str) 970{ 971 VALUE orig; 972 973 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1) 974 rb_str_replace(str, orig); 975 return str; 976} 977 978static inline long 979enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr) 980{ 981 long c; 982 const char *q; 983 984 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 985 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 986 } 987 else if (rb_enc_asciicompat(enc)) { 988 c = 0; 989 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) { 990 while (p < e) { 991 if (ISASCII(*p)) { 992 q = search_nonascii(p, e); 993 if (!q) 994 return c + (e - p); 995 c += q - p; 996 p = q; 997 } 998 p += rb_enc_fast_mbclen(p, e, enc); 999 c++; 1000 } 1001 } 1002 else { 1003 while (p < e) { 1004 if (ISASCII(*p)) { 1005 q = search_nonascii(p, e); 1006 if (!q) 1007 return c + (e - p); 1008 c += q - p; 1009 p = q; 1010 } 1011 p += rb_enc_mbclen(p, e, enc); 1012 c++; 1013 } 1014 } 1015 return c; 1016 } 1017 1018 for (c=0; p<e; c++) { 1019 p += rb_enc_mbclen(p, e, enc); 1020 } 1021 return c; 1022} 1023 1024long 1025rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) 1026{ 1027 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN); 1028} 1029 1030long 1031rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) 1032{ 1033 long c; 1034 const char *q; 1035 int ret; 1036 1037 *cr = 0; 1038 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 1039 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 1040 } 1041 else if (rb_enc_asciicompat(enc)) { 1042 c = 0; 1043 while (p < e) { 1044 if (ISASCII(*p)) { 1045 q = search_nonascii(p, e); 1046 if (!q) { 1047 if (!*cr) *cr = 
ENC_CODERANGE_7BIT; 1048 return c + (e - p); 1049 } 1050 c += q - p; 1051 p = q; 1052 } 1053 ret = rb_enc_precise_mbclen(p, e, enc); 1054 if (MBCLEN_CHARFOUND_P(ret)) { 1055 *cr |= ENC_CODERANGE_VALID; 1056 p += MBCLEN_CHARFOUND_LEN(ret); 1057 } 1058 else { 1059 *cr = ENC_CODERANGE_BROKEN; 1060 p++; 1061 } 1062 c++; 1063 } 1064 if (!*cr) *cr = ENC_CODERANGE_7BIT; 1065 return c; 1066 } 1067 1068 for (c=0; p<e; c++) { 1069 ret = rb_enc_precise_mbclen(p, e, enc); 1070 if (MBCLEN_CHARFOUND_P(ret)) { 1071 *cr |= ENC_CODERANGE_VALID; 1072 p += MBCLEN_CHARFOUND_LEN(ret); 1073 } 1074 else { 1075 *cr = ENC_CODERANGE_BROKEN; 1076 if (p + rb_enc_mbminlen(enc) <= e) 1077 p += rb_enc_mbminlen(enc); 1078 else 1079 p = e; 1080 } 1081 } 1082 if (!*cr) *cr = ENC_CODERANGE_7BIT; 1083 return c; 1084} 1085 1086#ifdef NONASCII_MASK 1087#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80) 1088 1089/* 1090 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx 1091 * bit represention. (see http://en.wikipedia.org/wiki/UTF-8) 1092 * Therefore, following pseudo code can detect UTF-8 leading byte. 1093 * 1094 * if (!(byte & 0x80)) 1095 * byte |= 0x40; // turn on bit6 1096 * return ((byte>>6) & 1); // bit6 represent it's leading byte or not. 1097 * 1098 * This function calculate every bytes in the argument word `s' 1099 * using the above logic concurrently. and gather every bytes result. 1100 */ 1101static inline VALUE 1102count_utf8_lead_bytes_with_word(const VALUE *s) 1103{ 1104 VALUE d = *s; 1105 1106 /* Transform into bit0 represent UTF-8 leading or not. */ 1107 d |= ~(d>>1); 1108 d >>= 6; 1109 d &= NONASCII_MASK >> 7; 1110 1111 /* Gather every bytes. 
*/ 1112 d += (d>>8); 1113 d += (d>>16); 1114#if SIZEOF_VALUE == 8 1115 d += (d>>32); 1116#endif 1117 return (d&0xF); 1118} 1119#endif 1120 1121static long 1122str_strlen(VALUE str, rb_encoding *enc) 1123{ 1124 const char *p, *e; 1125 long n; 1126 int cr; 1127 1128 if (single_byte_optimizable(str)) return RSTRING_LEN(str); 1129 if (!enc) enc = STR_ENC_GET(str); 1130 p = RSTRING_PTR(str); 1131 e = RSTRING_END(str); 1132 cr = ENC_CODERANGE(str); 1133#ifdef NONASCII_MASK 1134 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 1135 enc == rb_utf8_encoding()) { 1136 1137 VALUE len = 0; 1138 if ((int)sizeof(VALUE) * 2 < e - p) { 1139 const VALUE *s, *t; 1140 const VALUE lowbits = sizeof(VALUE) - 1; 1141 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 1142 t = (const VALUE*)(~lowbits & (VALUE)e); 1143 while (p < (const char *)s) { 1144 if (is_utf8_lead_byte(*p)) len++; 1145 p++; 1146 } 1147 while (s < t) { 1148 len += count_utf8_lead_bytes_with_word(s); 1149 s++; 1150 } 1151 p = (const char *)s; 1152 } 1153 while (p < e) { 1154 if (is_utf8_lead_byte(*p)) len++; 1155 p++; 1156 } 1157 return (long)len; 1158 } 1159#endif 1160 n = rb_enc_strlen_cr(p, e, enc, &cr); 1161 if (cr) { 1162 ENC_CODERANGE_SET(str, cr); 1163 } 1164 return n; 1165} 1166 1167long 1168rb_str_strlen(VALUE str) 1169{ 1170 return str_strlen(str, STR_ENC_GET(str)); 1171} 1172 1173/* 1174 * call-seq: 1175 * str.length -> integer 1176 * str.size -> integer 1177 * 1178 * Returns the character length of <i>str</i>. 1179 */ 1180 1181VALUE 1182rb_str_length(VALUE str) 1183{ 1184 long len; 1185 1186 len = str_strlen(str, STR_ENC_GET(str)); 1187 return LONG2NUM(len); 1188} 1189 1190/* 1191 * call-seq: 1192 * str.bytesize -> integer 1193 * 1194 * Returns the length of +str+ in bytes. 1195 * 1196 * "\x80\u3042".bytesize #=> 4 1197 * "hello".bytesize #=> 5 1198 */ 1199 1200static VALUE 1201rb_str_bytesize(VALUE str) 1202{ 1203 return LONG2NUM(RSTRING_LEN(str)); 1204} 1205 1206/* 1207 * call-seq: 1208 * str.empty? 
-> true or false 1209 * 1210 * Returns <code>true</code> if <i>str</i> has a length of zero. 1211 * 1212 * "hello".empty? #=> false 1213 * " ".empty? #=> false 1214 * "".empty? #=> true 1215 */ 1216 1217static VALUE 1218rb_str_empty(VALUE str) 1219{ 1220 if (RSTRING_LEN(str) == 0) 1221 return Qtrue; 1222 return Qfalse; 1223} 1224 1225/* 1226 * call-seq: 1227 * str + other_str -> new_str 1228 * 1229 * Concatenation---Returns a new <code>String</code> containing 1230 * <i>other_str</i> concatenated to <i>str</i>. 1231 * 1232 * "Hello from " + self.to_s #=> "Hello from main" 1233 */ 1234 1235VALUE 1236rb_str_plus(VALUE str1, VALUE str2) 1237{ 1238 VALUE str3; 1239 rb_encoding *enc; 1240 1241 StringValue(str2); 1242 enc = rb_enc_check(str1, str2); 1243 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2)); 1244 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1)); 1245 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1), 1246 RSTRING_PTR(str2), RSTRING_LEN(str2)); 1247 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0'; 1248 1249 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2)) 1250 OBJ_TAINT(str3); 1251 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc), 1252 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2))); 1253 return str3; 1254} 1255 1256/* 1257 * call-seq: 1258 * str * integer -> new_str 1259 * 1260 * Copy --- Returns a new String containing +integer+ copies of the receiver. 1261 * +integer+ must be greater than or equal to 0. 1262 * 1263 * "Ho! " * 3 #=> "Ho! Ho! Ho! " 1264 * "Ho! 
" * 0 #=> "" 1265 */ 1266 1267VALUE 1268rb_str_times(VALUE str, VALUE times) 1269{ 1270 VALUE str2; 1271 long n, len; 1272 char *ptr2; 1273 1274 len = NUM2LONG(times); 1275 if (len < 0) { 1276 rb_raise(rb_eArgError, "negative argument"); 1277 } 1278 if (len && LONG_MAX/len < RSTRING_LEN(str)) { 1279 rb_raise(rb_eArgError, "argument too big"); 1280 } 1281 1282 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str)); 1283 ptr2 = RSTRING_PTR(str2); 1284 if (len) { 1285 n = RSTRING_LEN(str); 1286 memcpy(ptr2, RSTRING_PTR(str), n); 1287 while (n <= len/2) { 1288 memcpy(ptr2 + n, ptr2, n); 1289 n *= 2; 1290 } 1291 memcpy(ptr2 + n, ptr2, len-n); 1292 } 1293 ptr2[RSTRING_LEN(str2)] = '\0'; 1294 OBJ_INFECT(str2, str); 1295 rb_enc_cr_str_copy_for_substr(str2, str); 1296 1297 return str2; 1298} 1299 1300/* 1301 * call-seq: 1302 * str % arg -> new_str 1303 * 1304 * Format---Uses <i>str</i> as a format specification, and returns the result 1305 * of applying it to <i>arg</i>. If the format specification contains more than 1306 * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code> 1307 * containing the values to be substituted. See <code>Kernel::sprintf</code> for 1308 * details of the format string. 
1309 * 1310 * "%05d" % 123 #=> "00123" 1311 * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6" 1312 * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar" 1313 */ 1314 1315static VALUE 1316rb_str_format_m(VALUE str, VALUE arg) 1317{ 1318 volatile VALUE tmp = rb_check_array_type(arg); 1319 1320 if (!NIL_P(tmp)) { 1321 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str); 1322 } 1323 return rb_str_format(1, &arg, str); 1324} 1325 1326static inline void 1327str_modifiable(VALUE str) 1328{ 1329 if (FL_TEST(str, STR_TMPLOCK)) { 1330 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked"); 1331 } 1332 rb_check_frozen(str); 1333 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4) 1334 rb_raise(rb_eSecurityError, "Insecure: can't modify string"); 1335} 1336 1337static inline int 1338str_independent(VALUE str) 1339{ 1340 str_modifiable(str); 1341 if (!STR_SHARED_P(str)) return 1; 1342 if (STR_EMBED_P(str)) return 1; 1343 return 0; 1344} 1345 1346static void 1347str_make_independent_expand(VALUE str, long expand) 1348{ 1349 char *ptr; 1350 long len = RSTRING_LEN(str); 1351 long capa = len + expand; 1352 1353 if (len > capa) len = capa; 1354 ptr = ALLOC_N(char, capa + 1); 1355 if (RSTRING_PTR(str)) { 1356 memcpy(ptr, RSTRING_PTR(str), len); 1357 } 1358 STR_SET_NOEMBED(str); 1359 STR_UNSET_NOCAPA(str); 1360 ptr[len] = 0; 1361 RSTRING(str)->as.heap.ptr = ptr; 1362 RSTRING(str)->as.heap.len = len; 1363 RSTRING(str)->as.heap.aux.capa = capa; 1364} 1365 1366#define str_make_independent(str) str_make_independent_expand((str), 0L) 1367 1368void 1369rb_str_modify(VALUE str) 1370{ 1371 if (!str_independent(str)) 1372 str_make_independent(str); 1373 ENC_CODERANGE_CLEAR(str); 1374} 1375 1376void 1377rb_str_modify_expand(VALUE str, long expand) 1378{ 1379 if (expand < 0) { 1380 rb_raise(rb_eArgError, "negative expanding string size"); 1381 } 1382 if (!str_independent(str)) { 1383 str_make_independent_expand(str, expand); 1384 } 1385 else if (expand > 0) { 
1386 long len = RSTRING_LEN(str); 1387 long capa = len + expand; 1388 if (!STR_EMBED_P(str)) { 1389 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1); 1390 STR_UNSET_NOCAPA(str); 1391 RSTRING(str)->as.heap.aux.capa = capa; 1392 } 1393 else if (capa > RSTRING_EMBED_LEN_MAX) { 1394 str_make_independent_expand(str, expand); 1395 } 1396 } 1397 ENC_CODERANGE_CLEAR(str); 1398} 1399 1400/* As rb_str_modify(), but don't clear coderange */ 1401static void 1402str_modify_keep_cr(VALUE str) 1403{ 1404 if (!str_independent(str)) 1405 str_make_independent(str); 1406 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN) 1407 /* Force re-scan later */ 1408 ENC_CODERANGE_CLEAR(str); 1409} 1410 1411static inline void 1412str_discard(VALUE str) 1413{ 1414 str_modifiable(str); 1415 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) { 1416 xfree(RSTRING_PTR(str)); 1417 RSTRING(str)->as.heap.ptr = 0; 1418 RSTRING(str)->as.heap.len = 0; 1419 } 1420} 1421 1422void 1423rb_str_associate(VALUE str, VALUE add) 1424{ 1425 /* sanity check */ 1426 rb_check_frozen(str); 1427 if (STR_ASSOC_P(str)) { 1428 /* already associated */ 1429 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add); 1430 } 1431 else { 1432 if (STR_SHARED_P(str)) { 1433 VALUE assoc = RSTRING(str)->as.heap.aux.shared; 1434 str_make_independent(str); 1435 if (STR_ASSOC_P(assoc)) { 1436 assoc = RSTRING(assoc)->as.heap.aux.shared; 1437 rb_ary_concat(assoc, add); 1438 add = assoc; 1439 } 1440 } 1441 else if (STR_EMBED_P(str)) { 1442 str_make_independent(str); 1443 } 1444 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) { 1445 RESIZE_CAPA(str, RSTRING_LEN(str)); 1446 } 1447 FL_SET(str, STR_ASSOC); 1448 RBASIC(add)->klass = 0; 1449 RSTRING(str)->as.heap.aux.shared = add; 1450 } 1451} 1452 1453VALUE 1454rb_str_associated(VALUE str) 1455{ 1456 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared; 1457 if (STR_ASSOC_P(str)) { 1458 return RSTRING(str)->as.heap.aux.shared; 1459 } 1460 return Qfalse; 1461} 1462 1463void 
1464rb_must_asciicompat(VALUE str) 1465{ 1466 rb_encoding *enc = rb_enc_get(str); 1467 if (!rb_enc_asciicompat(enc)) { 1468 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc)); 1469 } 1470} 1471 1472VALUE 1473rb_string_value(volatile VALUE *ptr) 1474{ 1475 VALUE s = *ptr; 1476 if (!RB_TYPE_P(s, T_STRING)) { 1477 s = rb_str_to_str(s); 1478 *ptr = s; 1479 } 1480 return s; 1481} 1482 1483char * 1484rb_string_value_ptr(volatile VALUE *ptr) 1485{ 1486 VALUE str = rb_string_value(ptr); 1487 return RSTRING_PTR(str); 1488} 1489 1490char * 1491rb_string_value_cstr(volatile VALUE *ptr) 1492{ 1493 VALUE str = rb_string_value(ptr); 1494 char *s = RSTRING_PTR(str); 1495 long len = RSTRING_LEN(str); 1496 1497 if (!s || memchr(s, 0, len)) { 1498 rb_raise(rb_eArgError, "string contains null byte"); 1499 } 1500 if (s[len]) { 1501 rb_str_modify(str); 1502 s = RSTRING_PTR(str); 1503 s[RSTRING_LEN(str)] = 0; 1504 } 1505 return s; 1506} 1507 1508VALUE 1509rb_check_string_type(VALUE str) 1510{ 1511 str = rb_check_convert_type(str, T_STRING, "String", "to_str"); 1512 return str; 1513} 1514 1515/* 1516 * call-seq: 1517 * String.try_convert(obj) -> string or nil 1518 * 1519 * Try to convert <i>obj</i> into a String, using to_str method. 1520 * Returns converted string or nil if <i>obj</i> cannot be converted 1521 * for any reason. 
1522 * 1523 * String.try_convert("str") #=> "str" 1524 * String.try_convert(/re/) #=> nil 1525 */ 1526static VALUE 1527rb_str_s_try_convert(VALUE dummy, VALUE str) 1528{ 1529 return rb_check_string_type(str); 1530} 1531 1532static char* 1533str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc) 1534{ 1535 long nth = *nthp; 1536 if (rb_enc_mbmaxlen(enc) == 1) { 1537 p += nth; 1538 } 1539 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 1540 p += nth * rb_enc_mbmaxlen(enc); 1541 } 1542 else if (rb_enc_asciicompat(enc)) { 1543 const char *p2, *e2; 1544 int n; 1545 1546 while (p < e && 0 < nth) { 1547 e2 = p + nth; 1548 if (e < e2) { 1549 *nthp = nth; 1550 return (char *)e; 1551 } 1552 if (ISASCII(*p)) { 1553 p2 = search_nonascii(p, e2); 1554 if (!p2) { 1555 nth -= e2 - p; 1556 *nthp = nth; 1557 return (char *)e2; 1558 } 1559 nth -= p2 - p; 1560 p = p2; 1561 } 1562 n = rb_enc_mbclen(p, e, enc); 1563 p += n; 1564 nth--; 1565 } 1566 *nthp = nth; 1567 if (nth != 0) { 1568 return (char *)e; 1569 } 1570 return (char *)p; 1571 } 1572 else { 1573 while (p < e && nth--) { 1574 p += rb_enc_mbclen(p, e, enc); 1575 } 1576 } 1577 if (p > e) p = e; 1578 *nthp = nth; 1579 return (char*)p; 1580} 1581 1582char* 1583rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc) 1584{ 1585 return str_nth_len(p, e, &nth, enc); 1586} 1587 1588static char* 1589str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 1590{ 1591 if (singlebyte) 1592 p += nth; 1593 else { 1594 p = str_nth_len(p, e, &nth, enc); 1595 } 1596 if (!p) return 0; 1597 if (p > e) p = e; 1598 return (char *)p; 1599} 1600 1601/* char offset to byte offset */ 1602static long 1603str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 1604{ 1605 const char *pp = str_nth(p, e, nth, enc, singlebyte); 1606 if (!pp) return e - p; 1607 return pp - p; 1608} 1609 1610long 1611rb_str_offset(VALUE str, long pos) 1612{ 1613 return 
str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 1614 STR_ENC_GET(str), single_byte_optimizable(str)); 1615} 1616 1617#ifdef NONASCII_MASK 1618static char * 1619str_utf8_nth(const char *p, const char *e, long *nthp) 1620{ 1621 long nth = *nthp; 1622 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) { 1623 const VALUE *s, *t; 1624 const VALUE lowbits = sizeof(VALUE) - 1; 1625 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 1626 t = (const VALUE*)(~lowbits & (VALUE)e); 1627 while (p < (const char *)s) { 1628 if (is_utf8_lead_byte(*p)) nth--; 1629 p++; 1630 } 1631 do { 1632 nth -= count_utf8_lead_bytes_with_word(s); 1633 s++; 1634 } while (s < t && (int)sizeof(VALUE) <= nth); 1635 p = (char *)s; 1636 } 1637 while (p < e) { 1638 if (is_utf8_lead_byte(*p)) { 1639 if (nth == 0) break; 1640 nth--; 1641 } 1642 p++; 1643 } 1644 *nthp = nth; 1645 return (char *)p; 1646} 1647 1648static long 1649str_utf8_offset(const char *p, const char *e, long nth) 1650{ 1651 const char *pp = str_utf8_nth(p, e, &nth); 1652 return pp - p; 1653} 1654#endif 1655 1656/* byte offset to char offset */ 1657long 1658rb_str_sublen(VALUE str, long pos) 1659{ 1660 if (single_byte_optimizable(str) || pos < 0) 1661 return pos; 1662 else { 1663 char *p = RSTRING_PTR(str); 1664 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str)); 1665 } 1666} 1667 1668VALUE 1669rb_str_subseq(VALUE str, long beg, long len) 1670{ 1671 VALUE str2; 1672 1673 if (RSTRING_LEN(str) == beg + len && 1674 RSTRING_EMBED_LEN_MAX < len) { 1675 str2 = rb_str_new_shared(rb_str_new_frozen(str)); 1676 rb_str_drop_bytes(str2, beg); 1677 } 1678 else { 1679 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len); 1680 RB_GC_GUARD(str); 1681 } 1682 1683 rb_enc_cr_str_copy_for_substr(str2, str); 1684 OBJ_INFECT(str2, str); 1685 1686 return str2; 1687} 1688 1689static char * 1690rb_str_subpos(VALUE str, long beg, long *lenp) 1691{ 1692 long len = *lenp; 1693 long slen = -1L; 1694 long blen = RSTRING_LEN(str); 
1695 rb_encoding *enc = STR_ENC_GET(str); 1696 char *p, *s = RSTRING_PTR(str), *e = s + blen; 1697 1698 if (len < 0) return 0; 1699 if (!blen) { 1700 len = 0; 1701 } 1702 if (single_byte_optimizable(str)) { 1703 if (beg > blen) return 0; 1704 if (beg < 0) { 1705 beg += blen; 1706 if (beg < 0) return 0; 1707 } 1708 if (beg + len > blen) 1709 len = blen - beg; 1710 if (len < 0) return 0; 1711 p = s + beg; 1712 goto end; 1713 } 1714 if (beg < 0) { 1715 if (len > -beg) len = -beg; 1716 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) { 1717 beg = -beg; 1718 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0); 1719 p = e; 1720 if (!p) return 0; 1721 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0); 1722 if (!p) return 0; 1723 len = e - p; 1724 goto end; 1725 } 1726 else { 1727 slen = str_strlen(str, enc); 1728 beg += slen; 1729 if (beg < 0) return 0; 1730 p = s + beg; 1731 if (len == 0) goto end; 1732 } 1733 } 1734 else if (beg > 0 && beg > RSTRING_LEN(str)) { 1735 return 0; 1736 } 1737 if (len == 0) { 1738 if (beg > str_strlen(str, enc)) return 0; 1739 p = s + beg; 1740 } 1741#ifdef NONASCII_MASK 1742 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 1743 enc == rb_utf8_encoding()) { 1744 p = str_utf8_nth(s, e, &beg); 1745 if (beg > 0) return 0; 1746 len = str_utf8_offset(p, e, len); 1747 } 1748#endif 1749 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 1750 int char_sz = rb_enc_mbmaxlen(enc); 1751 1752 p = s + beg * char_sz; 1753 if (p > e) { 1754 return 0; 1755 } 1756 else if (len * char_sz > e - p) 1757 len = e - p; 1758 else 1759 len *= char_sz; 1760 } 1761 else if ((p = str_nth_len(s, e, &beg, enc)) == e) { 1762 if (beg > 0) return 0; 1763 len = 0; 1764 } 1765 else { 1766 len = str_offset(p, e, len, enc, 0); 1767 } 1768 end: 1769 *lenp = len; 1770 RB_GC_GUARD(str); 1771 return p; 1772} 1773 1774VALUE 1775rb_str_substr(VALUE str, long beg, long len) 1776{ 1777 VALUE str2; 1778 char *p = rb_str_subpos(str, beg, &len); 
1779 1780 if (!p) return Qnil; 1781 if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) { 1782 str2 = rb_str_new4(str); 1783 str2 = str_new3(rb_obj_class(str2), str2); 1784 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 1785 RSTRING(str2)->as.heap.len = len; 1786 } 1787 else { 1788 str2 = rb_str_new5(str, p, len); 1789 rb_enc_cr_str_copy_for_substr(str2, str); 1790 OBJ_INFECT(str2, str); 1791 RB_GC_GUARD(str); 1792 } 1793 1794 return str2; 1795} 1796 1797VALUE 1798rb_str_freeze(VALUE str) 1799{ 1800 if (STR_ASSOC_P(str)) { 1801 VALUE ary = RSTRING(str)->as.heap.aux.shared; 1802 OBJ_FREEZE(ary); 1803 } 1804 return rb_obj_freeze(str); 1805} 1806 1807RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str)) 1808#define rb_str_dup_frozen rb_str_new_frozen 1809 1810VALUE 1811rb_str_locktmp(VALUE str) 1812{ 1813 if (FL_TEST(str, STR_TMPLOCK)) { 1814 rb_raise(rb_eRuntimeError, "temporal locking already locked string"); 1815 } 1816 FL_SET(str, STR_TMPLOCK); 1817 return str; 1818} 1819 1820VALUE 1821rb_str_unlocktmp(VALUE str) 1822{ 1823 if (!FL_TEST(str, STR_TMPLOCK)) { 1824 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string"); 1825 } 1826 FL_UNSET(str, STR_TMPLOCK); 1827 return str; 1828} 1829 1830VALUE 1831rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg) 1832{ 1833 rb_str_locktmp(str); 1834 return rb_ensure(func, arg, rb_str_unlocktmp, str); 1835} 1836 1837void 1838rb_str_set_len(VALUE str, long len) 1839{ 1840 long capa; 1841 1842 str_modifiable(str); 1843 if (STR_SHARED_P(str)) { 1844 rb_raise(rb_eRuntimeError, "can't set length of shared string"); 1845 } 1846 if (len > (capa = (long)rb_str_capacity(str))) { 1847 rb_bug("probable buffer overflow: %ld for %ld", len, capa); 1848 } 1849 STR_SET_LEN(str, len); 1850 RSTRING_PTR(str)[len] = '\0'; 1851} 1852 1853VALUE 1854rb_str_resize(VALUE str, long len) 1855{ 1856 long slen; 1857 int independent; 1858 1859 if (len < 0) { 1860 
rb_raise(rb_eArgError, "negative string size (or size too big)"); 1861 } 1862 1863 independent = str_independent(str); 1864 ENC_CODERANGE_CLEAR(str); 1865 slen = RSTRING_LEN(str); 1866 if (len != slen) { 1867 if (STR_EMBED_P(str)) { 1868 if (len <= RSTRING_EMBED_LEN_MAX) { 1869 STR_SET_EMBED_LEN(str, len); 1870 RSTRING(str)->as.ary[len] = '\0'; 1871 return str; 1872 } 1873 str_make_independent_expand(str, len - slen); 1874 STR_SET_NOEMBED(str); 1875 } 1876 else if (len <= RSTRING_EMBED_LEN_MAX) { 1877 char *ptr = RSTRING(str)->as.heap.ptr; 1878 STR_SET_EMBED(str); 1879 if (slen > len) slen = len; 1880 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen); 1881 RSTRING(str)->as.ary[len] = '\0'; 1882 STR_SET_EMBED_LEN(str, len); 1883 if (independent) xfree(ptr); 1884 return str; 1885 } 1886 else if (!independent) { 1887 str_make_independent_expand(str, len - slen); 1888 } 1889 else if (slen < len || slen - len > 1024) { 1890 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1); 1891 } 1892 if (!STR_NOCAPA_P(str)) { 1893 RSTRING(str)->as.heap.aux.capa = len; 1894 } 1895 RSTRING(str)->as.heap.len = len; 1896 RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */ 1897 } 1898 return str; 1899} 1900 1901static VALUE 1902str_buf_cat(VALUE str, const char *ptr, long len) 1903{ 1904 long capa, total, off = -1; 1905 1906 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) { 1907 off = ptr - RSTRING_PTR(str); 1908 } 1909 rb_str_modify(str); 1910 if (len == 0) return 0; 1911 if (STR_ASSOC_P(str)) { 1912 FL_UNSET(str, STR_ASSOC); 1913 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str); 1914 } 1915 else if (STR_EMBED_P(str)) { 1916 capa = RSTRING_EMBED_LEN_MAX; 1917 } 1918 else { 1919 capa = RSTRING(str)->as.heap.aux.capa; 1920 } 1921 if (RSTRING_LEN(str) >= LONG_MAX - len) { 1922 rb_raise(rb_eArgError, "string sizes too big"); 1923 } 1924 total = RSTRING_LEN(str)+len; 1925 if (capa <= total) { 1926 while (total > capa) { 1927 if (capa + 1 >= LONG_MAX / 2) { 1928 capa 
= (total + 4095) / 4096; 1929 break; 1930 } 1931 capa = (capa + 1) * 2; 1932 } 1933 RESIZE_CAPA(str, capa); 1934 } 1935 if (off != -1) { 1936 ptr = RSTRING_PTR(str) + off; 1937 } 1938 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len); 1939 STR_SET_LEN(str, total); 1940 RSTRING_PTR(str)[total] = '\0'; /* sentinel */ 1941 1942 return str; 1943} 1944 1945#define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr)) 1946 1947VALUE 1948rb_str_buf_cat(VALUE str, const char *ptr, long len) 1949{ 1950 if (len == 0) return str; 1951 if (len < 0) { 1952 rb_raise(rb_eArgError, "negative string size (or size too big)"); 1953 } 1954 return str_buf_cat(str, ptr, len); 1955} 1956 1957VALUE 1958rb_str_buf_cat2(VALUE str, const char *ptr) 1959{ 1960 return rb_str_buf_cat(str, ptr, strlen(ptr)); 1961} 1962 1963VALUE 1964rb_str_cat(VALUE str, const char *ptr, long len) 1965{ 1966 if (len < 0) { 1967 rb_raise(rb_eArgError, "negative string size (or size too big)"); 1968 } 1969 if (STR_ASSOC_P(str)) { 1970 char *p; 1971 rb_str_modify_expand(str, len); 1972 p = RSTRING(str)->as.heap.ptr; 1973 memcpy(p + RSTRING(str)->as.heap.len, ptr, len); 1974 len = RSTRING(str)->as.heap.len += len; 1975 p[len] = '\0'; /* sentinel */ 1976 return str; 1977 } 1978 1979 return rb_str_buf_cat(str, ptr, len); 1980} 1981 1982VALUE 1983rb_str_cat2(VALUE str, const char *ptr) 1984{ 1985 return rb_str_cat(str, ptr, strlen(ptr)); 1986} 1987 1988static VALUE 1989rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, 1990 int ptr_encindex, int ptr_cr, int *ptr_cr_ret) 1991{ 1992 int str_encindex = ENCODING_GET(str); 1993 int res_encindex; 1994 int str_cr, res_cr; 1995 1996 str_cr = ENC_CODERANGE(str); 1997 1998 if (str_encindex == ptr_encindex) { 1999 if (str_cr == ENC_CODERANGE_UNKNOWN) 2000 ptr_cr = ENC_CODERANGE_UNKNOWN; 2001 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 2002 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex)); 2003 } 2004 } 2005 else { 2006 rb_encoding *str_enc = 
rb_enc_from_index(str_encindex); 2007 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex); 2008 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) { 2009 if (len == 0) 2010 return str; 2011 if (RSTRING_LEN(str) == 0) { 2012 rb_str_buf_cat(str, ptr, len); 2013 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr); 2014 return str; 2015 } 2016 goto incompatible; 2017 } 2018 if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 2019 ptr_cr = coderange_scan(ptr, len, ptr_enc); 2020 } 2021 if (str_cr == ENC_CODERANGE_UNKNOWN) { 2022 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) { 2023 str_cr = rb_enc_str_coderange(str); 2024 } 2025 } 2026 } 2027 if (ptr_cr_ret) 2028 *ptr_cr_ret = ptr_cr; 2029 2030 if (str_encindex != ptr_encindex && 2031 str_cr != ENC_CODERANGE_7BIT && 2032 ptr_cr != ENC_CODERANGE_7BIT) { 2033 incompatible: 2034 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 2035 rb_enc_name(rb_enc_from_index(str_encindex)), 2036 rb_enc_name(rb_enc_from_index(ptr_encindex))); 2037 } 2038 2039 if (str_cr == ENC_CODERANGE_UNKNOWN) { 2040 res_encindex = str_encindex; 2041 res_cr = ENC_CODERANGE_UNKNOWN; 2042 } 2043 else if (str_cr == ENC_CODERANGE_7BIT) { 2044 if (ptr_cr == ENC_CODERANGE_7BIT) { 2045 res_encindex = str_encindex; 2046 res_cr = ENC_CODERANGE_7BIT; 2047 } 2048 else { 2049 res_encindex = ptr_encindex; 2050 res_cr = ptr_cr; 2051 } 2052 } 2053 else if (str_cr == ENC_CODERANGE_VALID) { 2054 res_encindex = str_encindex; 2055 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID) 2056 res_cr = str_cr; 2057 else 2058 res_cr = ptr_cr; 2059 } 2060 else { /* str_cr == ENC_CODERANGE_BROKEN */ 2061 res_encindex = str_encindex; 2062 res_cr = str_cr; 2063 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN; 2064 } 2065 2066 if (len < 0) { 2067 rb_raise(rb_eArgError, "negative string size (or size too big)"); 2068 } 2069 str_buf_cat(str, ptr, len); 2070 ENCODING_CODERANGE_SET(str, res_encindex, res_cr); 2071 return str; 
2072} 2073 2074VALUE 2075rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc) 2076{ 2077 return rb_enc_cr_str_buf_cat(str, ptr, len, 2078 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL); 2079} 2080 2081VALUE 2082rb_str_buf_cat_ascii(VALUE str, const char *ptr) 2083{ 2084 /* ptr must reference NUL terminated ASCII string. */ 2085 int encindex = ENCODING_GET(str); 2086 rb_encoding *enc = rb_enc_from_index(encindex); 2087 if (rb_enc_asciicompat(enc)) { 2088 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr), 2089 encindex, ENC_CODERANGE_7BIT, 0); 2090 } 2091 else { 2092 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc)); 2093 while (*ptr) { 2094 unsigned int c = (unsigned char)*ptr; 2095 int len = rb_enc_codelen(c, enc); 2096 rb_enc_mbcput(c, buf, enc); 2097 rb_enc_cr_str_buf_cat(str, buf, len, 2098 encindex, ENC_CODERANGE_VALID, 0); 2099 ptr++; 2100 } 2101 return str; 2102 } 2103} 2104 2105VALUE 2106rb_str_buf_append(VALUE str, VALUE str2) 2107{ 2108 int str2_cr; 2109 2110 str2_cr = ENC_CODERANGE(str2); 2111 2112 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2), 2113 ENCODING_GET(str2), str2_cr, &str2_cr); 2114 2115 OBJ_INFECT(str, str2); 2116 ENC_CODERANGE_SET(str2, str2_cr); 2117 2118 return str; 2119} 2120 2121VALUE 2122rb_str_append(VALUE str, VALUE str2) 2123{ 2124 rb_encoding *enc; 2125 int cr, cr2; 2126 long len2; 2127 2128 StringValue(str2); 2129 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) { 2130 long len = RSTRING_LEN(str) + len2; 2131 enc = rb_enc_check(str, str2); 2132 cr = ENC_CODERANGE(str); 2133 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2; 2134 rb_str_modify_expand(str, len2); 2135 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, 2136 RSTRING_PTR(str2), len2+1); 2137 RSTRING(str)->as.heap.len = len; 2138 rb_enc_associate(str, enc); 2139 ENC_CODERANGE_SET(str, cr); 2140 OBJ_INFECT(str, str2); 2141 return str; 2142 } 2143 return rb_str_buf_append(str, str2); 2144} 2145 2146/* 2147 * 
call-seq: 2148 * str << integer -> str 2149 * str.concat(integer) -> str 2150 * str << obj -> str 2151 * str.concat(obj) -> str 2152 * 2153 * Append---Concatenates the given object to <i>str</i>. If the object is a 2154 * <code>Integer</code>, it is considered as a codepoint, and is converted 2155 * to a character before concatenation. 2156 * 2157 * a = "hello " 2158 * a << "world" #=> "hello world" 2159 * a.concat(33) #=> "hello world!" 2160 */ 2161 2162VALUE 2163rb_str_concat(VALUE str1, VALUE str2) 2164{ 2165 unsigned int code; 2166 rb_encoding *enc = STR_ENC_GET(str1); 2167 2168 if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) { 2169 if (rb_num_to_uint(str2, &code) == 0) { 2170 } 2171 else if (FIXNUM_P(str2)) { 2172 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2)); 2173 } 2174 else { 2175 rb_raise(rb_eRangeError, "bignum out of char range"); 2176 } 2177 } 2178 else { 2179 return rb_str_append(str1, str2); 2180 } 2181 2182 if (enc == rb_usascii_encoding()) { 2183 /* US-ASCII automatically extended to ASCII-8BIT */ 2184 char buf[1]; 2185 buf[0] = (char)code; 2186 if (code > 0xFF) { 2187 rb_raise(rb_eRangeError, "%u out of char range", code); 2188 } 2189 rb_str_cat(str1, buf, 1); 2190 if (code > 127) { 2191 rb_enc_associate(str1, rb_ascii8bit_encoding()); 2192 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID); 2193 } 2194 } 2195 else { 2196 long pos = RSTRING_LEN(str1); 2197 int cr = ENC_CODERANGE(str1); 2198 int len; 2199 char *buf; 2200 2201 switch (len = rb_enc_codelen(code, enc)) { 2202 case ONIGERR_INVALID_CODE_POINT_VALUE: 2203 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 2204 break; 2205 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE: 2206 case 0: 2207 rb_raise(rb_eRangeError, "%u out of char range", code); 2208 break; 2209 } 2210 buf = ALLOCA_N(char, len + 1); 2211 rb_enc_mbcput(code, buf, enc); 2212 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) { 2213 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in 
%s", code, rb_enc_name(enc)); 2214 } 2215 rb_str_resize(str1, pos+len); 2216 memcpy(RSTRING_PTR(str1) + pos, buf, len); 2217 if (cr == ENC_CODERANGE_7BIT && code > 127) 2218 cr = ENC_CODERANGE_VALID; 2219 ENC_CODERANGE_SET(str1, cr); 2220 } 2221 return str1; 2222} 2223 2224/* 2225 * call-seq: 2226 * str.prepend(other_str) -> str 2227 * 2228 * Prepend---Prepend the given string to <i>str</i>. 2229 * 2230 * a = "world" 2231 * a.prepend("hello ") #=> "hello world" 2232 * a #=> "hello world" 2233 */ 2234 2235static VALUE 2236rb_str_prepend(VALUE str, VALUE str2) 2237{ 2238 StringValue(str2); 2239 StringValue(str); 2240 rb_str_update(str, 0L, 0L, str2); 2241 return str; 2242} 2243 2244st_index_t 2245rb_str_hash(VALUE str) 2246{ 2247 int e = ENCODING_GET(str); 2248 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { 2249 e = 0; 2250 } 2251 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e; 2252} 2253 2254int 2255rb_str_hash_cmp(VALUE str1, VALUE str2) 2256{ 2257 long len; 2258 2259 if (!rb_str_comparable(str1, str2)) return 1; 2260 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) && 2261 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) { 2262 return 0; 2263 } 2264 return 1; 2265} 2266 2267/* 2268 * call-seq: 2269 * str.hash -> fixnum 2270 * 2271 * Return a hash based on the string's length and content. 
2272 */ 2273 2274static VALUE 2275rb_str_hash_m(VALUE str) 2276{ 2277 st_index_t hval = rb_str_hash(str); 2278 return INT2FIX(hval); 2279} 2280 2281#define lesser(a,b) (((a)>(b))?(b):(a)) 2282 2283int 2284rb_str_comparable(VALUE str1, VALUE str2) 2285{ 2286 int idx1, idx2; 2287 int rc1, rc2; 2288 2289 if (RSTRING_LEN(str1) == 0) return TRUE; 2290 if (RSTRING_LEN(str2) == 0) return TRUE; 2291 idx1 = ENCODING_GET(str1); 2292 idx2 = ENCODING_GET(str2); 2293 if (idx1 == idx2) return TRUE; 2294 rc1 = rb_enc_str_coderange(str1); 2295 rc2 = rb_enc_str_coderange(str2); 2296 if (rc1 == ENC_CODERANGE_7BIT) { 2297 if (rc2 == ENC_CODERANGE_7BIT) return TRUE; 2298 if (rb_enc_asciicompat(rb_enc_from_index(idx2))) 2299 return TRUE; 2300 } 2301 if (rc2 == ENC_CODERANGE_7BIT) { 2302 if (rb_enc_asciicompat(rb_enc_from_index(idx1))) 2303 return TRUE; 2304 } 2305 return FALSE; 2306} 2307 2308int 2309rb_str_cmp(VALUE str1, VALUE str2) 2310{ 2311 long len1, len2; 2312 const char *ptr1, *ptr2; 2313 int retval; 2314 2315 if (str1 == str2) return 0; 2316 RSTRING_GETMEM(str1, ptr1, len1); 2317 RSTRING_GETMEM(str2, ptr2, len2); 2318 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) { 2319 if (len1 == len2) { 2320 if (!rb_str_comparable(str1, str2)) { 2321 if (ENCODING_GET(str1) > ENCODING_GET(str2)) 2322 return 1; 2323 return -1; 2324 } 2325 return 0; 2326 } 2327 if (len1 > len2) return 1; 2328 return -1; 2329 } 2330 if (retval > 0) return 1; 2331 return -1; 2332} 2333 2334/* expect tail call optimization */ 2335static VALUE 2336str_eql(const VALUE str1, const VALUE str2) 2337{ 2338 const long len = RSTRING_LEN(str1); 2339 const char *ptr1, *ptr2; 2340 2341 if (len != RSTRING_LEN(str2)) return Qfalse; 2342 if (!rb_str_comparable(str1, str2)) return Qfalse; 2343 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2))) 2344 return Qtrue; 2345 if (memcmp(ptr1, ptr2, len) == 0) 2346 return Qtrue; 2347 return Qfalse; 2348} 2349 2350/* 2351 * call-seq: 2352 * str == 
obj -> true or false 2353 * 2354 * Equality---If <i>obj</i> is not a <code>String</code>, returns 2355 * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i> 2356 * <code><=></code> <i>obj</i> returns zero. 2357 */ 2358 2359VALUE 2360rb_str_equal(VALUE str1, VALUE str2) 2361{ 2362 if (str1 == str2) return Qtrue; 2363 if (!RB_TYPE_P(str2, T_STRING)) { 2364 if (!rb_respond_to(str2, rb_intern("to_str"))) { 2365 return Qfalse; 2366 } 2367 return rb_equal(str2, str1); 2368 } 2369 return str_eql(str1, str2); 2370} 2371 2372/* 2373 * call-seq: 2374 * str.eql?(other) -> true or false 2375 * 2376 * Two strings are equal if they have the same length and content. 2377 */ 2378 2379static VALUE 2380rb_str_eql(VALUE str1, VALUE str2) 2381{ 2382 if (str1 == str2) return Qtrue; 2383 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse; 2384 return str_eql(str1, str2); 2385} 2386 2387/* 2388 * call-seq: 2389 * string <=> other_string -> -1, 0, +1 or nil 2390 * 2391 * 2392 * Comparison---Returns -1, 0, +1 or nil depending on whether +string+ is less 2393 * than, equal to, or greater than +other_string+. 2394 * 2395 * +nil+ is returned if the two values are incomparable. 2396 * 2397 * If the strings are of different lengths, and the strings are equal when 2398 * compared up to the shortest length, then the longer string is considered 2399 * greater than the shorter one. 2400 * 2401 * <code><=></code> is the basis for the methods <code><</code>, 2402 * <code><=</code>, <code>></code>, <code>>=</code>, and 2403 * <code>between?</code>, included from module Comparable. The method 2404 * String#== does not use Comparable#==. 
2405 * 2406 * "abcdef" <=> "abcde" #=> 1 2407 * "abcdef" <=> "abcdef" #=> 0 2408 * "abcdef" <=> "abcdefg" #=> -1 2409 * "abcdef" <=> "ABCDEF" #=> 1 2410 */ 2411 2412static VALUE 2413rb_str_cmp_m(VALUE str1, VALUE str2) 2414{ 2415 int result; 2416 2417 if (!RB_TYPE_P(str2, T_STRING)) { 2418 VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0); 2419 if (RB_TYPE_P(tmp, T_STRING)) { 2420 result = rb_str_cmp(str1, tmp); 2421 } 2422 else { 2423 return rb_invcmp(str1, str2); 2424 } 2425 } 2426 else { 2427 result = rb_str_cmp(str1, str2); 2428 } 2429 return INT2FIX(result); 2430} 2431 2432/* 2433 * call-seq: 2434 * str.casecmp(other_str) -> -1, 0, +1 or nil 2435 * 2436 * Case-insensitive version of <code>String#<=></code>. 2437 * 2438 * "abcdef".casecmp("abcde") #=> 1 2439 * "aBcDeF".casecmp("abcdef") #=> 0 2440 * "abcdef".casecmp("abcdefg") #=> -1 2441 * "abcdef".casecmp("ABCDEF") #=> 0 2442 */ 2443 2444static VALUE 2445rb_str_casecmp(VALUE str1, VALUE str2) 2446{ 2447 long len; 2448 rb_encoding *enc; 2449 char *p1, *p1end, *p2, *p2end; 2450 2451 StringValue(str2); 2452 enc = rb_enc_compatible(str1, str2); 2453 if (!enc) { 2454 return Qnil; 2455 } 2456 2457 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1); 2458 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2); 2459 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) { 2460 while (p1 < p1end && p2 < p2end) { 2461 if (*p1 != *p2) { 2462 unsigned int c1 = TOUPPER(*p1 & 0xff); 2463 unsigned int c2 = TOUPPER(*p2 & 0xff); 2464 if (c1 != c2) 2465 return INT2FIX(c1 < c2 ? -1 : 1); 2466 } 2467 p1++; 2468 p2++; 2469 } 2470 } 2471 else { 2472 while (p1 < p1end && p2 < p2end) { 2473 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc); 2474 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc); 2475 2476 if (0 <= c1 && 0 <= c2) { 2477 c1 = TOUPPER(c1); 2478 c2 = TOUPPER(c2); 2479 if (c1 != c2) 2480 return INT2FIX(c1 < c2 ? 
-1 : 1); 2481 } 2482 else { 2483 int r; 2484 l1 = rb_enc_mbclen(p1, p1end, enc); 2485 l2 = rb_enc_mbclen(p2, p2end, enc); 2486 len = l1 < l2 ? l1 : l2; 2487 r = memcmp(p1, p2, len); 2488 if (r != 0) 2489 return INT2FIX(r < 0 ? -1 : 1); 2490 if (l1 != l2) 2491 return INT2FIX(l1 < l2 ? -1 : 1); 2492 } 2493 p1 += l1; 2494 p2 += l2; 2495 } 2496 } 2497 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0); 2498 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1); 2499 return INT2FIX(-1); 2500} 2501 2502static long 2503rb_str_index(VALUE str, VALUE sub, long offset) 2504{ 2505 long pos; 2506 char *s, *sptr, *e; 2507 long len, slen; 2508 rb_encoding *enc; 2509 2510 enc = rb_enc_check(str, sub); 2511 if (is_broken_string(sub)) { 2512 return -1; 2513 } 2514 len = str_strlen(str, enc); 2515 slen = str_strlen(sub, enc); 2516 if (offset < 0) { 2517 offset += len; 2518 if (offset < 0) return -1; 2519 } 2520 if (len - offset < slen) return -1; 2521 s = RSTRING_PTR(str); 2522 e = s + RSTRING_LEN(str); 2523 if (offset) { 2524 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str)); 2525 s += offset; 2526 } 2527 if (slen == 0) return offset; 2528 /* need proceed one character at a time */ 2529 sptr = RSTRING_PTR(sub); 2530 slen = RSTRING_LEN(sub); 2531 len = RSTRING_LEN(str) - offset; 2532 for (;;) { 2533 char *t; 2534 pos = rb_memsearch(sptr, slen, s, len, enc); 2535 if (pos < 0) return pos; 2536 t = rb_enc_right_char_head(s, s+pos, e, enc); 2537 if (t == s + pos) break; 2538 if ((len -= t - s) <= 0) return -1; 2539 offset += t - s; 2540 s = t; 2541 } 2542 return pos + offset; 2543} 2544 2545 2546/* 2547 * call-seq: 2548 * str.index(substring [, offset]) -> fixnum or nil 2549 * str.index(regexp [, offset]) -> fixnum or nil 2550 * 2551 * Returns the index of the first occurrence of the given <i>substring</i> or 2552 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not 2553 * found. 
If the second parameter is present, it specifies the position in the 2554 * string to begin the search. 2555 * 2556 * "hello".index('e') #=> 1 2557 * "hello".index('lo') #=> 3 2558 * "hello".index('a') #=> nil 2559 * "hello".index(?e) #=> 1 2560 * "hello".index(/[aeiou]/, -3) #=> 4 2561 */ 2562 2563static VALUE 2564rb_str_index_m(int argc, VALUE *argv, VALUE str) 2565{ 2566 VALUE sub; 2567 VALUE initpos; 2568 long pos; 2569 2570 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { 2571 pos = NUM2LONG(initpos); 2572 } 2573 else { 2574 pos = 0; 2575 } 2576 if (pos < 0) { 2577 pos += str_strlen(str, STR_ENC_GET(str)); 2578 if (pos < 0) { 2579 if (RB_TYPE_P(sub, T_REGEXP)) { 2580 rb_backref_set(Qnil); 2581 } 2582 return Qnil; 2583 } 2584 } 2585 2586 if (SPECIAL_CONST_P(sub)) goto generic; 2587 switch (BUILTIN_TYPE(sub)) { 2588 case T_REGEXP: 2589 if (pos > str_strlen(str, STR_ENC_GET(str))) 2590 return Qnil; 2591 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 2592 rb_enc_check(str, sub), single_byte_optimizable(str)); 2593 2594 pos = rb_reg_search(sub, str, pos, 0); 2595 pos = rb_str_sublen(str, pos); 2596 break; 2597 2598 generic: 2599 default: { 2600 VALUE tmp; 2601 2602 tmp = rb_check_string_type(sub); 2603 if (NIL_P(tmp)) { 2604 rb_raise(rb_eTypeError, "type mismatch: %s given", 2605 rb_obj_classname(sub)); 2606 } 2607 sub = tmp; 2608 } 2609 /* fall through */ 2610 case T_STRING: 2611 pos = rb_str_index(str, sub, pos); 2612 pos = rb_str_sublen(str, pos); 2613 break; 2614 } 2615 2616 if (pos == -1) return Qnil; 2617 return LONG2NUM(pos); 2618} 2619 2620static long 2621rb_str_rindex(VALUE str, VALUE sub, long pos) 2622{ 2623 long len, slen; 2624 char *s, *sbeg, *e, *t; 2625 rb_encoding *enc; 2626 int singlebyte = single_byte_optimizable(str); 2627 2628 enc = rb_enc_check(str, sub); 2629 if (is_broken_string(sub)) { 2630 return -1; 2631 } 2632 len = str_strlen(str, enc); 2633 slen = str_strlen(sub, enc); 2634 /* substring longer than string */ 2635 if 
(len < slen) return -1; 2636 if (len - pos < slen) { 2637 pos = len - slen; 2638 } 2639 if (len == 0) { 2640 return pos; 2641 } 2642 sbeg = RSTRING_PTR(str); 2643 e = RSTRING_END(str); 2644 t = RSTRING_PTR(sub); 2645 slen = RSTRING_LEN(sub); 2646 s = str_nth(sbeg, e, pos, enc, singlebyte); 2647 while (s) { 2648 if (memcmp(s, t, slen) == 0) { 2649 return pos; 2650 } 2651 if (pos == 0) break; 2652 pos--; 2653 s = rb_enc_prev_char(sbeg, s, e, enc); 2654 } 2655 return -1; 2656} 2657 2658 2659/* 2660 * call-seq: 2661 * str.rindex(substring [, fixnum]) -> fixnum or nil 2662 * str.rindex(regexp [, fixnum]) -> fixnum or nil 2663 * 2664 * Returns the index of the last occurrence of the given <i>substring</i> or 2665 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not 2666 * found. If the second parameter is present, it specifies the position in the 2667 * string to end the search---characters beyond this point will not be 2668 * considered. 2669 * 2670 * "hello".rindex('e') #=> 1 2671 * "hello".rindex('l') #=> 3 2672 * "hello".rindex('a') #=> nil 2673 * "hello".rindex(?e) #=> 1 2674 * "hello".rindex(/[aeiou]/, -2) #=> 1 2675 */ 2676 2677static VALUE 2678rb_str_rindex_m(int argc, VALUE *argv, VALUE str) 2679{ 2680 VALUE sub; 2681 VALUE vpos; 2682 rb_encoding *enc = STR_ENC_GET(str); 2683 long pos, len = str_strlen(str, enc); 2684 2685 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { 2686 pos = NUM2LONG(vpos); 2687 if (pos < 0) { 2688 pos += len; 2689 if (pos < 0) { 2690 if (RB_TYPE_P(sub, T_REGEXP)) { 2691 rb_backref_set(Qnil); 2692 } 2693 return Qnil; 2694 } 2695 } 2696 if (pos > len) pos = len; 2697 } 2698 else { 2699 pos = len; 2700 } 2701 2702 if (SPECIAL_CONST_P(sub)) goto generic; 2703 switch (BUILTIN_TYPE(sub)) { 2704 case T_REGEXP: 2705 /* enc = rb_get_check(str, sub); */ 2706 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 2707 STR_ENC_GET(str), single_byte_optimizable(str)); 2708 2709 if (!RREGEXP(sub)->ptr || 
RREGEXP_SRC_LEN(sub)) { 2710 pos = rb_reg_search(sub, str, pos, 1); 2711 pos = rb_str_sublen(str, pos); 2712 } 2713 if (pos >= 0) return LONG2NUM(pos); 2714 break; 2715 2716 generic: 2717 default: { 2718 VALUE tmp; 2719 2720 tmp = rb_check_string_type(sub); 2721 if (NIL_P(tmp)) { 2722 rb_raise(rb_eTypeError, "type mismatch: %s given", 2723 rb_obj_classname(sub)); 2724 } 2725 sub = tmp; 2726 } 2727 /* fall through */ 2728 case T_STRING: 2729 pos = rb_str_rindex(str, sub, pos); 2730 if (pos >= 0) return LONG2NUM(pos); 2731 break; 2732 } 2733 return Qnil; 2734} 2735 2736/* 2737 * call-seq: 2738 * str =~ obj -> fixnum or nil 2739 * 2740 * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match 2741 * against <i>str</i>,and returns the position the match starts, or 2742 * <code>nil</code> if there is no match. Otherwise, invokes 2743 * <i>obj.=~</i>, passing <i>str</i> as an argument. The default 2744 * <code>=~</code> in <code>Object</code> returns <code>nil</code>. 2745 * 2746 * Note: <code>str =~ regexp</code> is not the same as 2747 * <code>regexp =~ str</code>. Strings captured from named capture groups 2748 * are assigned to local variables only in the second case. 
2749 * 2750 * "cat o' 9 tails" =~ /\d/ #=> 7 2751 * "cat o' 9 tails" =~ 9 #=> nil 2752 */ 2753 2754static VALUE 2755rb_str_match(VALUE x, VALUE y) 2756{ 2757 if (SPECIAL_CONST_P(y)) goto generic; 2758 switch (BUILTIN_TYPE(y)) { 2759 case T_STRING: 2760 rb_raise(rb_eTypeError, "type mismatch: String given"); 2761 2762 case T_REGEXP: 2763 return rb_reg_match(y, x); 2764 2765 generic: 2766 default: 2767 return rb_funcall(y, rb_intern("=~"), 1, x); 2768 } 2769} 2770 2771 2772static VALUE get_pat(VALUE, int); 2773 2774 2775/* 2776 * call-seq: 2777 * str.match(pattern) -> matchdata or nil 2778 * str.match(pattern, pos) -> matchdata or nil 2779 * 2780 * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one), 2781 * then invokes its <code>match</code> method on <i>str</i>. If the second 2782 * parameter is present, it specifies the position in the string to begin the 2783 * search. 2784 * 2785 * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l"> 2786 * 'hello'.match('(.)\1')[0] #=> "ll" 2787 * 'hello'.match(/(.)\1/)[0] #=> "ll" 2788 * 'hello'.match('xx') #=> nil 2789 * 2790 * If a block is given, invoke the block with MatchData if match succeed, so 2791 * that you can write 2792 * 2793 * str.match(pat) {|m| ...} 2794 * 2795 * instead of 2796 * 2797 * if m = str.match(pat) 2798 * ... 2799 * end 2800 * 2801 * The return value is a value from block execution in this case. 
2802 */ 2803 2804static VALUE 2805rb_str_match_m(int argc, VALUE *argv, VALUE str) 2806{ 2807 VALUE re, result; 2808 if (argc < 1) 2809 rb_check_arity(argc, 1, 2); 2810 re = argv[0]; 2811 argv[0] = str; 2812 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv); 2813 if (!NIL_P(result) && rb_block_given_p()) { 2814 return rb_yield(result); 2815 } 2816 return result; 2817} 2818 2819enum neighbor_char { 2820 NEIGHBOR_NOT_CHAR, 2821 NEIGHBOR_FOUND, 2822 NEIGHBOR_WRAPPED 2823}; 2824 2825static enum neighbor_char 2826enc_succ_char(char *p, long len, rb_encoding *enc) 2827{ 2828 long i; 2829 int l; 2830 while (1) { 2831 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--) 2832 p[i] = '\0'; 2833 if (i < 0) 2834 return NEIGHBOR_WRAPPED; 2835 ++((unsigned char*)p)[i]; 2836 l = rb_enc_precise_mbclen(p, p+len, enc); 2837 if (MBCLEN_CHARFOUND_P(l)) { 2838 l = MBCLEN_CHARFOUND_LEN(l); 2839 if (l == len) { 2840 return NEIGHBOR_FOUND; 2841 } 2842 else { 2843 memset(p+l, 0xff, len-l); 2844 } 2845 } 2846 if (MBCLEN_INVALID_P(l) && i < len-1) { 2847 long len2; 2848 int l2; 2849 for (len2 = len-1; 0 < len2; len2--) { 2850 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 2851 if (!MBCLEN_INVALID_P(l2)) 2852 break; 2853 } 2854 memset(p+len2+1, 0xff, len-(len2+1)); 2855 } 2856 } 2857} 2858 2859static enum neighbor_char 2860enc_pred_char(char *p, long len, rb_encoding *enc) 2861{ 2862 long i; 2863 int l; 2864 while (1) { 2865 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--) 2866 p[i] = '\xff'; 2867 if (i < 0) 2868 return NEIGHBOR_WRAPPED; 2869 --((unsigned char*)p)[i]; 2870 l = rb_enc_precise_mbclen(p, p+len, enc); 2871 if (MBCLEN_CHARFOUND_P(l)) { 2872 l = MBCLEN_CHARFOUND_LEN(l); 2873 if (l == len) { 2874 return NEIGHBOR_FOUND; 2875 } 2876 else { 2877 memset(p+l, 0, len-l); 2878 } 2879 } 2880 if (MBCLEN_INVALID_P(l) && i < len-1) { 2881 long len2; 2882 int l2; 2883 for (len2 = len-1; 0 < len2; len2--) { 2884 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 2885 if 
(!MBCLEN_INVALID_P(l2)) 2886 break; 2887 } 2888 memset(p+len2+1, 0, len-(len2+1)); 2889 } 2890 } 2891} 2892 2893/* 2894 overwrite +p+ by succeeding letter in +enc+ and returns 2895 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED. 2896 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry. 2897 assuming each ranges are successive, and mbclen 2898 never change in each ranges. 2899 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one 2900 character. 2901 */ 2902static enum neighbor_char 2903enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) 2904{ 2905 enum neighbor_char ret; 2906 unsigned int c; 2907 int ctype; 2908 int range; 2909 char save[ONIGENC_CODE_TO_MBC_MAXLEN]; 2910 2911 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 2912 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc)) 2913 ctype = ONIGENC_CTYPE_DIGIT; 2914 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc)) 2915 ctype = ONIGENC_CTYPE_ALPHA; 2916 else 2917 return NEIGHBOR_NOT_CHAR; 2918 2919 MEMCPY(save, p, char, len); 2920 ret = enc_succ_char(p, len, enc); 2921 if (ret == NEIGHBOR_FOUND) { 2922 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 2923 if (rb_enc_isctype(c, ctype, enc)) 2924 return NEIGHBOR_FOUND; 2925 } 2926 MEMCPY(p, save, char, len); 2927 range = 1; 2928 while (1) { 2929 MEMCPY(save, p, char, len); 2930 ret = enc_pred_char(p, len, enc); 2931 if (ret == NEIGHBOR_FOUND) { 2932 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 2933 if (!rb_enc_isctype(c, ctype, enc)) { 2934 MEMCPY(p, save, char, len); 2935 break; 2936 } 2937 } 2938 else { 2939 MEMCPY(p, save, char, len); 2940 break; 2941 } 2942 range++; 2943 } 2944 if (range == 1) { 2945 return NEIGHBOR_NOT_CHAR; 2946 } 2947 2948 if (ctype != ONIGENC_CTYPE_DIGIT) { 2949 MEMCPY(carry, p, char, len); 2950 return NEIGHBOR_WRAPPED; 2951 } 2952 2953 MEMCPY(carry, p, char, len); 2954 enc_succ_char(carry, len, enc); 2955 return NEIGHBOR_WRAPPED; 2956} 2957 2958 2959/* 2960 * call-seq: 2961 * str.succ -> new_str 2962 * 
str.next -> new_str 2963 * 2964 * Returns the successor to <i>str</i>. The successor is calculated by 2965 * incrementing characters starting from the rightmost alphanumeric (or 2966 * the rightmost character if there are no alphanumerics) in the 2967 * string. Incrementing a digit always results in another digit, and 2968 * incrementing a letter results in another letter of the same case. 2969 * Incrementing nonalphanumerics uses the underlying character set's 2970 * collating sequence. 2971 * 2972 * If the increment generates a ``carry,'' the character to the left of 2973 * it is incremented. This process repeats until there is no carry, 2974 * adding an additional character if necessary. 2975 * 2976 * "abcd".succ #=> "abce" 2977 * "THX1138".succ #=> "THX1139" 2978 * "<<koala>>".succ #=> "<<koalb>>" 2979 * "1999zzz".succ #=> "2000aaa" 2980 * "ZZZ9999".succ #=> "AAAA0000" 2981 * "***".succ #=> "**+" 2982 */ 2983 2984VALUE 2985rb_str_succ(VALUE orig) 2986{ 2987 rb_encoding *enc; 2988 VALUE str; 2989 char *sbeg, *s, *e, *last_alnum = 0; 2990 int c = -1; 2991 long l; 2992 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1"; 2993 long carry_pos = 0, carry_len = 1; 2994 enum neighbor_char neighbor = NEIGHBOR_FOUND; 2995 2996 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); 2997 rb_enc_cr_str_copy_for_substr(str, orig); 2998 OBJ_INFECT(str, orig); 2999 if (RSTRING_LEN(str) == 0) return str; 3000 3001 enc = STR_ENC_GET(orig); 3002 sbeg = RSTRING_PTR(str); 3003 s = e = sbeg + RSTRING_LEN(str); 3004 3005 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 3006 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) { 3007 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) : 3008 ISDIGIT(*last_alnum) ? 
ISALPHA(*s) : 0) { 3009 s = last_alnum; 3010 break; 3011 } 3012 } 3013 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 3014 neighbor = enc_succ_alnum_char(s, l, enc, carry); 3015 switch (neighbor) { 3016 case NEIGHBOR_NOT_CHAR: 3017 continue; 3018 case NEIGHBOR_FOUND: 3019 return str; 3020 case NEIGHBOR_WRAPPED: 3021 last_alnum = s; 3022 break; 3023 } 3024 c = 1; 3025 carry_pos = s - sbeg; 3026 carry_len = l; 3027 } 3028 if (c == -1) { /* str contains no alnum */ 3029 s = e; 3030 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 3031 enum neighbor_char neighbor; 3032 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 3033 neighbor = enc_succ_char(s, l, enc); 3034 if (neighbor == NEIGHBOR_FOUND) 3035 return str; 3036 if (rb_enc_precise_mbclen(s, s+l, enc) != l) { 3037 /* wrapped to \0...\0. search next valid char. */ 3038 enc_succ_char(s, l, enc); 3039 } 3040 if (!rb_enc_asciicompat(enc)) { 3041 MEMCPY(carry, s, char, l); 3042 carry_len = l; 3043 } 3044 carry_pos = s - sbeg; 3045 } 3046 } 3047 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len); 3048 s = RSTRING_PTR(str) + carry_pos; 3049 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos); 3050 memmove(s, carry, carry_len); 3051 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len); 3052 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 3053 rb_enc_str_coderange(str); 3054 return str; 3055} 3056 3057 3058/* 3059 * call-seq: 3060 * str.succ! -> str 3061 * str.next! -> str 3062 * 3063 * Equivalent to <code>String#succ</code>, but modifies the receiver in 3064 * place. 
3065 */ 3066 3067static VALUE 3068rb_str_succ_bang(VALUE str) 3069{ 3070 rb_str_shared_replace(str, rb_str_succ(str)); 3071 3072 return str; 3073} 3074 3075 3076/* 3077 * call-seq: 3078 * str.upto(other_str, exclusive=false) {|s| block } -> str 3079 * str.upto(other_str, exclusive=false) -> an_enumerator 3080 * 3081 * Iterates through successive values, starting at <i>str</i> and 3082 * ending at <i>other_str</i> inclusive, passing each value in turn to 3083 * the block. The <code>String#succ</code> method is used to generate 3084 * each value. If optional second argument exclusive is omitted or is false, 3085 * the last value will be included; otherwise it will be excluded. 3086 * 3087 * If no block is given, an enumerator is returned instead. 3088 * 3089 * "a8".upto("b6") {|s| print s, ' ' } 3090 * for s in "a8".."b6" 3091 * print s, ' ' 3092 * end 3093 * 3094 * <em>produces:</em> 3095 * 3096 * a8 a9 b0 b1 b2 b3 b4 b5 b6 3097 * a8 a9 b0 b1 b2 b3 b4 b5 b6 3098 * 3099 * If <i>str</i> and <i>other_str</i> contains only ascii numeric characters, 3100 * both are recognized as decimal numbers. In addition, the width of 3101 * string (e.g. leading zeros) is handled appropriately. 
3102 * 3103 * "9".upto("11").to_a #=> ["9", "10", "11"] 3104 * "25".upto("5").to_a #=> [] 3105 * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"] 3106 */ 3107 3108static VALUE 3109rb_str_upto(int argc, VALUE *argv, VALUE beg) 3110{ 3111 VALUE end, exclusive; 3112 VALUE current, after_end; 3113 ID succ; 3114 int n, excl, ascii; 3115 rb_encoding *enc; 3116 3117 rb_scan_args(argc, argv, "11", &end, &exclusive); 3118 RETURN_ENUMERATOR(beg, argc, argv); 3119 excl = RTEST(exclusive); 3120 CONST_ID(succ, "succ"); 3121 StringValue(end); 3122 enc = rb_enc_check(beg, end); 3123 ascii = (is_ascii_string(beg) && is_ascii_string(end)); 3124 /* single character */ 3125 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) { 3126 char c = RSTRING_PTR(beg)[0]; 3127 char e = RSTRING_PTR(end)[0]; 3128 3129 if (c > e || (excl && c == e)) return beg; 3130 for (;;) { 3131 rb_yield(rb_enc_str_new(&c, 1, enc)); 3132 if (!excl && c == e) break; 3133 c++; 3134 if (excl && c == e) break; 3135 } 3136 return beg; 3137 } 3138 /* both edges are all digits */ 3139 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) { 3140 char *s, *send; 3141 VALUE b, e; 3142 int width; 3143 3144 s = RSTRING_PTR(beg); send = RSTRING_END(beg); 3145 width = rb_long2int(send - s); 3146 while (s < send) { 3147 if (!ISDIGIT(*s)) goto no_digits; 3148 s++; 3149 } 3150 s = RSTRING_PTR(end); send = RSTRING_END(end); 3151 while (s < send) { 3152 if (!ISDIGIT(*s)) goto no_digits; 3153 s++; 3154 } 3155 b = rb_str_to_inum(beg, 10, FALSE); 3156 e = rb_str_to_inum(end, 10, FALSE); 3157 if (FIXNUM_P(b) && FIXNUM_P(e)) { 3158 long bi = FIX2LONG(b); 3159 long ei = FIX2LONG(e); 3160 rb_encoding *usascii = rb_usascii_encoding(); 3161 3162 while (bi <= ei) { 3163 if (excl && bi == ei) break; 3164 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi)); 3165 bi++; 3166 } 3167 } 3168 else { 3169 ID op = excl ? 
'<' : rb_intern("<="); 3170 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d")); 3171 3172 args[0] = INT2FIX(width); 3173 while (rb_funcall(b, op, 1, e)) { 3174 args[1] = b; 3175 rb_yield(rb_str_format(numberof(args), args, fmt)); 3176 b = rb_funcall(b, succ, 0, 0); 3177 } 3178 } 3179 return beg; 3180 } 3181 /* normal case */ 3182 no_digits: 3183 n = rb_str_cmp(beg, end); 3184 if (n > 0 || (excl && n == 0)) return beg; 3185 3186 after_end = rb_funcall(end, succ, 0, 0); 3187 current = rb_str_dup(beg); 3188 while (!rb_str_equal(current, after_end)) { 3189 VALUE next = Qnil; 3190 if (excl || !rb_str_equal(current, end)) 3191 next = rb_funcall(current, succ, 0, 0); 3192 rb_yield(current); 3193 if (NIL_P(next)) break; 3194 current = next; 3195 StringValue(current); 3196 if (excl && rb_str_equal(current, end)) break; 3197 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0) 3198 break; 3199 } 3200 3201 return beg; 3202} 3203 3204static VALUE 3205rb_str_subpat(VALUE str, VALUE re, VALUE backref) 3206{ 3207 if (rb_reg_search(re, str, 0, 0) >= 0) { 3208 VALUE match = rb_backref_get(); 3209 int nth = rb_reg_backref_number(match, backref); 3210 return rb_reg_nth_match(nth, match); 3211 } 3212 return Qnil; 3213} 3214 3215static VALUE 3216rb_str_aref(VALUE str, VALUE indx) 3217{ 3218 long idx; 3219 3220 if (FIXNUM_P(indx)) { 3221 idx = FIX2LONG(indx); 3222 3223 num_index: 3224 str = rb_str_substr(str, idx, 1); 3225 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil; 3226 return str; 3227 } 3228 3229 if (SPECIAL_CONST_P(indx)) goto generic; 3230 switch (BUILTIN_TYPE(indx)) { 3231 case T_REGEXP: 3232 return rb_str_subpat(str, indx, INT2FIX(0)); 3233 3234 case T_STRING: 3235 if (rb_str_index(str, indx, 0) != -1) 3236 return rb_str_dup(indx); 3237 return Qnil; 3238 3239 generic: 3240 default: 3241 /* check if indx is Range */ 3242 { 3243 long beg, len; 3244 VALUE tmp; 3245 3246 len = str_strlen(str, STR_ENC_GET(str)); 3247 switch 
(rb_range_beg_len(indx, &beg, &len, len, 0)) { 3248 case Qfalse: 3249 break; 3250 case Qnil: 3251 return Qnil; 3252 default: 3253 tmp = rb_str_substr(str, beg, len); 3254 return tmp; 3255 } 3256 } 3257 idx = NUM2LONG(indx); 3258 goto num_index; 3259 } 3260 3261 UNREACHABLE; 3262} 3263 3264 3265/* 3266 * call-seq: 3267 * str[index] -> new_str or nil 3268 * str[start, length] -> new_str or nil 3269 * str[range] -> new_str or nil 3270 * str[regexp] -> new_str or nil 3271 * str[regexp, capture] -> new_str or nil 3272 * str[match_str] -> new_str or nil 3273 * str.slice(index) -> new_str or nil 3274 * str.slice(start, length) -> new_str or nil 3275 * str.slice(range) -> new_str or nil 3276 * str.slice(regexp) -> new_str or nil 3277 * str.slice(regexp, capture) -> new_str or nil 3278 * str.slice(match_str) -> new_str or nil 3279 * 3280 * Element Reference --- If passed a single +index+, returns a substring of 3281 * one character at that index. If passed a +start+ index and a +length+, 3282 * returns a substring containing +length+ characters starting at the 3283 * +index+. If passed a +range+, its beginning and end are interpreted as 3284 * offsets delimiting the substring to be returned. 3285 * 3286 * In these three cases, if an index is negative, it is counted from the end 3287 * of the string. For the +start+ and +range+ cases the starting index 3288 * is just before a character and an index matching the string's size. 3289 * Additionally, an empty string is returned when the starting index for a 3290 * character range is at the end of the string. 3291 * 3292 * Returns +nil+ if the initial index falls outside the string or the length 3293 * is negative. 3294 * 3295 * If a +Regexp+ is supplied, the matching portion of the string is 3296 * returned. If a +capture+ follows the regular expression, which may be a 3297 * capture group index or name, follows the regular expression that component 3298 * of the MatchData is returned instead. 
3299 * 3300 * If a +match_str+ is given, that string is returned if it occurs in 3301 * the string. 3302 * 3303 * Returns +nil+ if the regular expression does not match or the match string 3304 * cannot be found. 3305 * 3306 * a = "hello there" 3307 * 3308 * a[1] #=> "e" 3309 * a[2, 3] #=> "llo" 3310 * a[2..3] #=> "ll" 3311 * 3312 * a[-3, 2] #=> "er" 3313 * a[7..-2] #=> "her" 3314 * a[-4..-2] #=> "her" 3315 * a[-2..-4] #=> "" 3316 * 3317 * a[11, 0] #=> "" 3318 * a[11] #=> nil 3319 * a[12, 0] #=> nil 3320 * a[12..-1] #=> nil 3321 * 3322 * a[/[aeiou](.)\1/] #=> "ell" 3323 * a[/[aeiou](.)\1/, 0] #=> "ell" 3324 * a[/[aeiou](.)\1/, 1] #=> "l" 3325 * a[/[aeiou](.)\1/, 2] #=> nil 3326 * 3327 * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l" 3328 * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"] #=> "e" 3329 * 3330 * a["lo"] #=> "lo" 3331 * a["bye"] #=> nil 3332 */ 3333 3334static VALUE 3335rb_str_aref_m(int argc, VALUE *argv, VALUE str) 3336{ 3337 if (argc == 2) { 3338 if (RB_TYPE_P(argv[0], T_REGEXP)) { 3339 return rb_str_subpat(str, argv[0], argv[1]); 3340 } 3341 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 3342 } 3343 rb_check_arity(argc, 1, 2); 3344 return rb_str_aref(str, argv[0]); 3345} 3346 3347VALUE 3348rb_str_drop_bytes(VALUE str, long len) 3349{ 3350 char *ptr = RSTRING_PTR(str); 3351 long olen = RSTRING_LEN(str), nlen; 3352 3353 str_modifiable(str); 3354 if (len > olen) len = olen; 3355 nlen = olen - len; 3356 if (nlen <= RSTRING_EMBED_LEN_MAX) { 3357 char *oldptr = ptr; 3358 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED)); 3359 STR_SET_EMBED(str); 3360 STR_SET_EMBED_LEN(str, nlen); 3361 ptr = RSTRING(str)->as.ary; 3362 memmove(ptr, oldptr + len, nlen); 3363 if (fl == STR_NOEMBED) xfree(oldptr); 3364 } 3365 else { 3366 if (!STR_SHARED_P(str)) rb_str_new4(str); 3367 ptr = RSTRING(str)->as.heap.ptr += len; 3368 RSTRING(str)->as.heap.len = nlen; 3369 } 3370 ptr[nlen] = 0; 3371 ENC_CODERANGE_CLEAR(str); 
3372 return str; 3373} 3374 3375static void 3376rb_str_splice_0(VALUE str, long beg, long len, VALUE val) 3377{ 3378 if (beg == 0 && RSTRING_LEN(val) == 0) { 3379 rb_str_drop_bytes(str, len); 3380 OBJ_INFECT(str, val); 3381 return; 3382 } 3383 3384 rb_str_modify(str); 3385 if (len < RSTRING_LEN(val)) { 3386 /* expand string */ 3387 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1); 3388 } 3389 3390 if (RSTRING_LEN(val) != len) { 3391 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val), 3392 RSTRING_PTR(str) + beg + len, 3393 RSTRING_LEN(str) - (beg + len)); 3394 } 3395 if (RSTRING_LEN(val) < beg && len < 0) { 3396 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len); 3397 } 3398 if (RSTRING_LEN(val) > 0) { 3399 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val)); 3400 } 3401 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len); 3402 if (RSTRING_PTR(str)) { 3403 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 3404 } 3405 OBJ_INFECT(str, val); 3406} 3407 3408static void 3409rb_str_splice(VALUE str, long beg, long len, VALUE val) 3410{ 3411 long slen; 3412 char *p, *e; 3413 rb_encoding *enc; 3414 int singlebyte = single_byte_optimizable(str); 3415 int cr; 3416 3417 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 3418 3419 StringValue(val); 3420 enc = rb_enc_check(str, val); 3421 slen = str_strlen(str, enc); 3422 3423 if (slen < beg) { 3424 out_of_range: 3425 rb_raise(rb_eIndexError, "index %ld out of string", beg); 3426 } 3427 if (beg < 0) { 3428 if (-beg > slen) { 3429 goto out_of_range; 3430 } 3431 beg += slen; 3432 } 3433 if (slen < len || slen < beg + len) { 3434 len = slen - beg; 3435 } 3436 str_modify_keep_cr(str); 3437 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte); 3438 if (!p) p = RSTRING_END(str); 3439 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte); 3440 if (!e) e = RSTRING_END(str); 3441 /* error check */ 3442 beg = p - RSTRING_PTR(str); /* physical position */ 3443 len = e - 
p; /* physical length */ 3444 rb_str_splice_0(str, beg, len, val); 3445 rb_enc_associate(str, enc); 3446 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val)); 3447 if (cr != ENC_CODERANGE_BROKEN) 3448 ENC_CODERANGE_SET(str, cr); 3449} 3450 3451void 3452rb_str_update(VALUE str, long beg, long len, VALUE val) 3453{ 3454 rb_str_splice(str, beg, len, val); 3455} 3456 3457static void 3458rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val) 3459{ 3460 int nth; 3461 VALUE match; 3462 long start, end, len; 3463 rb_encoding *enc; 3464 struct re_registers *regs; 3465 3466 if (rb_reg_search(re, str, 0, 0) < 0) { 3467 rb_raise(rb_eIndexError, "regexp not matched"); 3468 } 3469 match = rb_backref_get(); 3470 nth = rb_reg_backref_number(match, backref); 3471 regs = RMATCH_REGS(match); 3472 if (nth >= regs->num_regs) { 3473 out_of_range: 3474 rb_raise(rb_eIndexError, "index %d out of regexp", nth); 3475 } 3476 if (nth < 0) { 3477 if (-nth >= regs->num_regs) { 3478 goto out_of_range; 3479 } 3480 nth += regs->num_regs; 3481 } 3482 3483 start = BEG(nth); 3484 if (start == -1) { 3485 rb_raise(rb_eIndexError, "regexp group %d not matched", nth); 3486 } 3487 end = END(nth); 3488 len = end - start; 3489 StringValue(val); 3490 enc = rb_enc_check(str, val); 3491 rb_str_splice_0(str, start, len, val); 3492 rb_enc_associate(str, enc); 3493} 3494 3495static VALUE 3496rb_str_aset(VALUE str, VALUE indx, VALUE val) 3497{ 3498 long idx, beg; 3499 3500 if (FIXNUM_P(indx)) { 3501 idx = FIX2LONG(indx); 3502 num_index: 3503 rb_str_splice(str, idx, 1, val); 3504 return val; 3505 } 3506 3507 if (SPECIAL_CONST_P(indx)) goto generic; 3508 switch (TYPE(indx)) { 3509 case T_REGEXP: 3510 rb_str_subpat_set(str, indx, INT2FIX(0), val); 3511 return val; 3512 3513 case T_STRING: 3514 beg = rb_str_index(str, indx, 0); 3515 if (beg < 0) { 3516 rb_raise(rb_eIndexError, "string not matched"); 3517 } 3518 beg = rb_str_sublen(str, beg); 3519 rb_str_splice(str, beg, str_strlen(indx, 0), val); 
3520 return val; 3521 3522 generic: 3523 default: 3524 /* check if indx is Range */ 3525 { 3526 long beg, len; 3527 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) { 3528 rb_str_splice(str, beg, len, val); 3529 return val; 3530 } 3531 } 3532 idx = NUM2LONG(indx); 3533 goto num_index; 3534 } 3535} 3536 3537/* 3538 * call-seq: 3539 * str[fixnum] = new_str 3540 * str[fixnum, fixnum] = new_str 3541 * str[range] = aString 3542 * str[regexp] = new_str 3543 * str[regexp, fixnum] = new_str 3544 * str[regexp, name] = new_str 3545 * str[other_str] = new_str 3546 * 3547 * Element Assignment---Replaces some or all of the content of <i>str</i>. The 3548 * portion of the string affected is determined using the same criteria as 3549 * <code>String#[]</code>. If the replacement string is not the same length as 3550 * the text it is replacing, the string will be adjusted accordingly. If the 3551 * regular expression or string is used as the index doesn't match a position 3552 * in the string, <code>IndexError</code> is raised. If the regular expression 3553 * form is used, the optional second <code>Fixnum</code> allows you to specify 3554 * which portion of the match to replace (effectively using the 3555 * <code>MatchData</code> indexing rules. The forms that take a 3556 * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is 3557 * out of range; the <code>Range</code> form will raise a 3558 * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code> 3559 * will raise an <code>IndexError</code> on negative match. 
3560 */ 3561 3562static VALUE 3563rb_str_aset_m(int argc, VALUE *argv, VALUE str) 3564{ 3565 if (argc == 3) { 3566 if (RB_TYPE_P(argv[0], T_REGEXP)) { 3567 rb_str_subpat_set(str, argv[0], argv[1], argv[2]); 3568 } 3569 else { 3570 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]); 3571 } 3572 return argv[2]; 3573 } 3574 rb_check_arity(argc, 2, 3); 3575 return rb_str_aset(str, argv[0], argv[1]); 3576} 3577 3578/* 3579 * call-seq: 3580 * str.insert(index, other_str) -> str 3581 * 3582 * Inserts <i>other_str</i> before the character at the given 3583 * <i>index</i>, modifying <i>str</i>. Negative indices count from the 3584 * end of the string, and insert <em>after</em> the given character. 3585 * The intent is insert <i>aString</i> so that it starts at the given 3586 * <i>index</i>. 3587 * 3588 * "abcd".insert(0, 'X') #=> "Xabcd" 3589 * "abcd".insert(3, 'X') #=> "abcXd" 3590 * "abcd".insert(4, 'X') #=> "abcdX" 3591 * "abcd".insert(-3, 'X') #=> "abXcd" 3592 * "abcd".insert(-1, 'X') #=> "abcdX" 3593 */ 3594 3595static VALUE 3596rb_str_insert(VALUE str, VALUE idx, VALUE str2) 3597{ 3598 long pos = NUM2LONG(idx); 3599 3600 if (pos == -1) { 3601 return rb_str_append(str, str2); 3602 } 3603 else if (pos < 0) { 3604 pos++; 3605 } 3606 rb_str_splice(str, pos, 0, str2); 3607 return str; 3608} 3609 3610 3611/* 3612 * call-seq: 3613 * str.slice!(fixnum) -> fixnum or nil 3614 * str.slice!(fixnum, fixnum) -> new_str or nil 3615 * str.slice!(range) -> new_str or nil 3616 * str.slice!(regexp) -> new_str or nil 3617 * str.slice!(other_str) -> new_str or nil 3618 * 3619 * Deletes the specified portion from <i>str</i>, and returns the portion 3620 * deleted. 
3621 * 3622 * string = "this is a string" 3623 * string.slice!(2) #=> "i" 3624 * string.slice!(3..6) #=> " is " 3625 * string.slice!(/s.*t/) #=> "sa st" 3626 * string.slice!("r") #=> "r" 3627 * string #=> "thing" 3628 */ 3629 3630static VALUE 3631rb_str_slice_bang(int argc, VALUE *argv, VALUE str) 3632{ 3633 VALUE result; 3634 VALUE buf[3]; 3635 int i; 3636 3637 rb_check_arity(argc, 1, 2); 3638 for (i=0; i<argc; i++) { 3639 buf[i] = argv[i]; 3640 } 3641 str_modify_keep_cr(str); 3642 result = rb_str_aref_m(argc, buf, str); 3643 if (!NIL_P(result)) { 3644 buf[i] = rb_str_new(0,0); 3645 rb_str_aset_m(argc+1, buf, str); 3646 } 3647 return result; 3648} 3649 3650static VALUE 3651get_pat(VALUE pat, int quote) 3652{ 3653 VALUE val; 3654 3655 switch (TYPE(pat)) { 3656 case T_REGEXP: 3657 return pat; 3658 3659 case T_STRING: 3660 break; 3661 3662 default: 3663 val = rb_check_string_type(pat); 3664 if (NIL_P(val)) { 3665 Check_Type(pat, T_REGEXP); 3666 } 3667 pat = val; 3668 } 3669 3670 if (quote) { 3671 pat = rb_reg_quote(pat); 3672 } 3673 3674 return rb_reg_regcomp(pat); 3675} 3676 3677 3678/* 3679 * call-seq: 3680 * str.sub!(pattern, replacement) -> str or nil 3681 * str.sub!(pattern) {|match| block } -> str or nil 3682 * 3683 * Performs the same substitution as String#sub in-place. 3684 * 3685 * Returns +str+ if a substitution was performed or +nil+ if no substitution 3686 * was performed. 3687 */ 3688 3689static VALUE 3690rb_str_sub_bang(int argc, VALUE *argv, VALUE str) 3691{ 3692 VALUE pat, repl, hash = Qnil; 3693 int iter = 0; 3694 int tainted = 0; 3695 int untrusted = 0; 3696 long plen; 3697 int min_arity = rb_block_given_p() ? 
1 : 2; 3698 3699 rb_check_arity(argc, min_arity, 2); 3700 if (argc == 1) { 3701 iter = 1; 3702 } 3703 else { 3704 repl = argv[1]; 3705 hash = rb_check_hash_type(argv[1]); 3706 if (NIL_P(hash)) { 3707 StringValue(repl); 3708 } 3709 if (OBJ_TAINTED(repl)) tainted = 1; 3710 if (OBJ_UNTRUSTED(repl)) untrusted = 1; 3711 } 3712 3713 pat = get_pat(argv[0], 1); 3714 str_modifiable(str); 3715 if (rb_reg_search(pat, str, 0, 0) >= 0) { 3716 rb_encoding *enc; 3717 int cr = ENC_CODERANGE(str); 3718 VALUE match = rb_backref_get(); 3719 struct re_registers *regs = RMATCH_REGS(match); 3720 long beg0 = BEG(0); 3721 long end0 = END(0); 3722 char *p, *rp; 3723 long len, rlen; 3724 3725 if (iter || !NIL_P(hash)) { 3726 p = RSTRING_PTR(str); len = RSTRING_LEN(str); 3727 3728 if (iter) { 3729 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); 3730 } 3731 else { 3732 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0)); 3733 repl = rb_obj_as_string(repl); 3734 } 3735 str_mod_check(str, p, len); 3736 rb_check_frozen(str); 3737 } 3738 else { 3739 repl = rb_reg_regsub(repl, str, regs, pat); 3740 } 3741 enc = rb_enc_compatible(str, repl); 3742 if (!enc) { 3743 rb_encoding *str_enc = STR_ENC_GET(str); 3744 p = RSTRING_PTR(str); len = RSTRING_LEN(str); 3745 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT || 3746 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) { 3747 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 3748 rb_enc_name(str_enc), 3749 rb_enc_name(STR_ENC_GET(repl))); 3750 } 3751 enc = STR_ENC_GET(repl); 3752 } 3753 rb_str_modify(str); 3754 rb_enc_associate(str, enc); 3755 if (OBJ_TAINTED(repl)) tainted = 1; 3756 if (OBJ_UNTRUSTED(repl)) untrusted = 1; 3757 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) { 3758 int cr2 = ENC_CODERANGE(repl); 3759 if (cr2 == ENC_CODERANGE_BROKEN || 3760 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT)) 3761 cr = ENC_CODERANGE_UNKNOWN; 3762 else 3763 cr 
= cr2; 3764 } 3765 plen = end0 - beg0; 3766 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl); 3767 len = RSTRING_LEN(str); 3768 if (rlen > plen) { 3769 RESIZE_CAPA(str, len + rlen - plen); 3770 } 3771 p = RSTRING_PTR(str); 3772 if (rlen != plen) { 3773 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen); 3774 } 3775 memcpy(p + beg0, rp, rlen); 3776 len += rlen - plen; 3777 STR_SET_LEN(str, len); 3778 RSTRING_PTR(str)[len] = '\0'; 3779 ENC_CODERANGE_SET(str, cr); 3780 if (tainted) OBJ_TAINT(str); 3781 if (untrusted) OBJ_UNTRUST(str); 3782 3783 return str; 3784 } 3785 return Qnil; 3786} 3787 3788 3789/* 3790 * call-seq: 3791 * str.sub(pattern, replacement) -> new_str 3792 * str.sub(pattern, hash) -> new_str 3793 * str.sub(pattern) {|match| block } -> new_str 3794 * 3795 * Returns a copy of +str+ with the _first_ occurrence of +pattern+ 3796 * replaced by the second argument. The +pattern+ is typically a Regexp; if 3797 * given as a String, any regular expression metacharacters it contains will 3798 * be interpreted literally, e.g. <code>'\\\d'</code> will match a backlash 3799 * followed by 'd', instead of a digit. 3800 * 3801 * If +replacement+ is a String it will be substituted for the matched text. 3802 * It may contain back-references to the pattern's capture groups of the form 3803 * <code>"\\d"</code>, where <i>d</i> is a group number, or 3804 * <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a 3805 * double-quoted string, both back-references must be preceded by an 3806 * additional backslash. However, within +replacement+ the special match 3807 * variables, such as <code>&$</code>, will not refer to the current match. 3808 * 3809 * If the second argument is a Hash, and the matched text is one of its keys, 3810 * the corresponding value is the replacement string. 
 *
 *  In the block form, the current match string is passed in as a parameter,
 *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
 *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
 *  returned by the block will be substituted for the match on each call.
 *
 *  The result inherits any tainting in the original string or any supplied
 *  replacement string.
 *
 *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
 *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
 *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
 *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
 *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
 *      #=> "Is /bin/bash your preferred shell?"
 */

static VALUE
rb_str_sub(int argc, VALUE *argv, VALUE str)
{
    /* work on a duplicate so the receiver is left untouched; the return
     * value of rb_str_sub_bang is deliberately ignored, so the duplicate
     * is returned whether or not a substitution happened */
    str = rb_str_dup(str);
    rb_str_sub_bang(argc, argv, str);
    return str;
}

/*
 * Shared implementation of String#gsub and String#gsub!.
 *
 * argc/argv - the Ruby-level arguments: argv[0] is the pattern; argv[1],
 *             when present, is either a Hash or something convertible to a
 *             String.  With a single argument the replacement comes from
 *             the block.
 * str       - the receiver.
 * bang      - non-zero for the destructive form.
 *
 * Returns Qnil (bang) or a duplicate of str (non-bang) when the pattern
 * never matches.  Otherwise the result is assembled in a fresh buffer
 * `dest`; the bang form then hands dest's contents over to str with
 * rb_str_shared_replace() and returns str, while the non-bang form gives
 * dest the class of str and returns dest.
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val, repl, match, dest, hash = Qnil;
    struct re_registers *regs;
    long beg, n;
    long beg0, end0;
    long offset, blen, slen, len, last;
    int iter = 0;
    char *sp, *cp;
    int tainted = 0;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        /* pattern only: the replacement is produced by the block (see the
         * call-seq of gsub!, which documents the enumerator returned when
         * neither block nor replacement is given) */
        RETURN_ENUMERATOR(str, argc, argv);
        iter = 1;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            /* not a Hash: the replacement must be string-like */
            StringValue(repl);
        }
        if (OBJ_TAINTED(repl)) tainted = 1;
        break;
      default:
        rb_check_arity(argc, 1, 2);
    }

    pat = get_pat(argv[0], 1);
    beg = rb_reg_search(pat, str, 0, 0);
    if (beg < 0) {
        if (bang) return Qnil; /* no match, no substitution */
        return rb_str_dup(str);
    }

    offset = 0;
    n = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* remember the receiver's buffer and length so that a modification made
     * by the block / hash lookup can be detected by str_mod_check() below */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    /* dest starts out empty, so it gets the code range used for empty
     * strings elsewhere in this file (e.g. rb_str_clear) */
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        /* n counts the matches; it is not read anywhere else in this
         * function */
        n++;
        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        /* BEG()/END() read from `regs`; cache the whole-match offsets
         * before any Ruby code runs */
        beg0 = BEG(0);
        end0 = END(0);
        if (iter || !NIL_P(hash)) {
            if (iter) {
                val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
                val = rb_obj_as_string(val);
            }
            /* NOTE(review): presumably raises when str's pointer or length
             * no longer equal sp/slen -- confirm against str_mod_check */
            str_mod_check(str, sp, slen);
            if (val == dest) { /* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else {
            val = rb_reg_regsub(repl, str, regs, pat);
        }

        if (OBJ_TAINTED(val)) tainted = 1;

        len = beg0 - offset; /* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        /* `last` keeps the offset this match was searched from; it is used
         * for the final rb_reg_search() after the loop */
        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_reg_search(pat, str, offset, 0);
    } while (beg >= 0);
    /* copy whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* repeat the search from where the last successful one started; the
     * result is ignored.  NOTE(review): presumably this re-establishes the
     * backref of the last successful match after the failing search that
     * ended the loop -- confirm against rb_reg_search */
    rb_reg_search(pat, str, last, 0);
    if (bang) {
        rb_str_shared_replace(str, dest);
    }
    else {
        RBASIC(dest)->klass = rb_obj_class(str);
        OBJ_INFECT(dest, str);
        str = dest;
    }

    if (tainted) OBJ_TAINT(str);
    return str;
}


/*
 *  call-seq:
 *     str.gsub!(pattern, replacement)        -> str or nil
 *     str.gsub!(pattern) {|match| block }    -> str or nil
 *     str.gsub!(pattern)                     -> an_enumerator
 *
 *  Performs the substitutions of <code>String#gsub</code> in place, returning
 *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
 *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
 */

static VALUE
rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
{
    /* called before any matching is attempted, i.e. even when the pattern
     * turns out not to match */
    str_modify_keep_cr(str);
    return str_gsub(argc, argv, str, 1);
}


/*
 *  call-seq:
 *     str.gsub(pattern, replacement)       -> new_str
 *     str.gsub(pattern, hash)              -> new_str
 *     str.gsub(pattern) {|match| block }   -> new_str
 *     str.gsub(pattern)                    -> enumerator
 *
 *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
 *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
 *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
 *  regular expression metacharacters it contains will be interpreted
 *  literally, e.g.
 *  <code>'\\\d'</code> will match a backslash followed by 'd',
 *  instead of a digit.
 *
 *  If <i>replacement</i> is a <code>String</code> it will be substituted for
 *  the matched text. It may contain back-references to the pattern's capture
 *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
 *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
 *  double-quoted string, both back-references must be preceded by an
 *  additional backslash. However, within <i>replacement</i> the special match
 *  variables, such as <code>$&</code>, will not refer to the current match.
 *
 *  If the second argument is a <code>Hash</code>, and the matched text is one
 *  of its keys, the corresponding value is the replacement string.
 *
 *  In the block form, the current match string is passed in as a parameter,
 *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
 *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
 *  returned by the block will be substituted for the match on each call.
 *
 *  The result inherits any tainting in the original string or any supplied
 *  replacement string.
 *
 *  When neither a block nor a second argument is supplied, an
 *  <code>Enumerator</code> is returned.
4004 * 4005 * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*" 4006 * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>" 4007 * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 " 4008 * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}" 4009 * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*" 4010 */ 4011 4012static VALUE 4013rb_str_gsub(int argc, VALUE *argv, VALUE str) 4014{ 4015 return str_gsub(argc, argv, str, 0); 4016} 4017 4018 4019/* 4020 * call-seq: 4021 * str.replace(other_str) -> str 4022 * 4023 * Replaces the contents and taintedness of <i>str</i> with the corresponding 4024 * values in <i>other_str</i>. 4025 * 4026 * s = "hello" #=> "hello" 4027 * s.replace "world" #=> "world" 4028 */ 4029 4030VALUE 4031rb_str_replace(VALUE str, VALUE str2) 4032{ 4033 str_modifiable(str); 4034 if (str == str2) return str; 4035 4036 StringValue(str2); 4037 str_discard(str); 4038 return str_replace(str, str2); 4039} 4040 4041/* 4042 * call-seq: 4043 * string.clear -> string 4044 * 4045 * Makes string empty. 4046 * 4047 * a = "abcde" 4048 * a.clear #=> "" 4049 */ 4050 4051static VALUE 4052rb_str_clear(VALUE str) 4053{ 4054 str_discard(str); 4055 STR_SET_EMBED(str); 4056 STR_SET_EMBED_LEN(str, 0); 4057 RSTRING_PTR(str)[0] = 0; 4058 if (rb_enc_asciicompat(STR_ENC_GET(str))) 4059 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 4060 else 4061 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 4062 return str; 4063} 4064 4065/* 4066 * call-seq: 4067 * string.chr -> string 4068 * 4069 * Returns a one-character string at the beginning of the string. 4070 * 4071 * a = "abcde" 4072 * a.chr #=> "a" 4073 */ 4074 4075static VALUE 4076rb_str_chr(VALUE str) 4077{ 4078 return rb_str_substr(str, 0, 1); 4079} 4080 4081/* 4082 * call-seq: 4083 * str.getbyte(index) -> 0 .. 255 4084 * 4085 * returns the <i>index</i>th byte as an integer. 
4086 */ 4087static VALUE 4088rb_str_getbyte(VALUE str, VALUE index) 4089{ 4090 long pos = NUM2LONG(index); 4091 4092 if (pos < 0) 4093 pos += RSTRING_LEN(str); 4094 if (pos < 0 || RSTRING_LEN(str) <= pos) 4095 return Qnil; 4096 4097 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]); 4098} 4099 4100/* 4101 * call-seq: 4102 * str.setbyte(index, integer) -> integer 4103 * 4104 * modifies the <i>index</i>th byte as <i>integer</i>. 4105 */ 4106static VALUE 4107rb_str_setbyte(VALUE str, VALUE index, VALUE value) 4108{ 4109 long pos = NUM2LONG(index); 4110 int byte = NUM2INT(value); 4111 4112 rb_str_modify(str); 4113 4114 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos) 4115 rb_raise(rb_eIndexError, "index %ld out of string", pos); 4116 if (pos < 0) 4117 pos += RSTRING_LEN(str); 4118 4119 RSTRING_PTR(str)[pos] = byte; 4120 4121 return value; 4122} 4123 4124static VALUE 4125str_byte_substr(VALUE str, long beg, long len) 4126{ 4127 char *p, *s = RSTRING_PTR(str); 4128 long n = RSTRING_LEN(str); 4129 VALUE str2; 4130 4131 if (beg > n || len < 0) return Qnil; 4132 if (beg < 0) { 4133 beg += n; 4134 if (beg < 0) return Qnil; 4135 } 4136 if (beg + len > n) 4137 len = n - beg; 4138 if (len <= 0) { 4139 len = 0; 4140 p = 0; 4141 } 4142 else 4143 p = s + beg; 4144 4145 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) { 4146 str2 = rb_str_new4(str); 4147 str2 = str_new3(rb_obj_class(str2), str2); 4148 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 4149 RSTRING(str2)->as.heap.len = len; 4150 } 4151 else { 4152 str2 = rb_str_new5(str, p, len); 4153 } 4154 4155 str_enc_copy(str2, str); 4156 4157 if (RSTRING_LEN(str2) == 0) { 4158 if (!rb_enc_asciicompat(STR_ENC_GET(str))) 4159 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID); 4160 else 4161 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); 4162 } 4163 else { 4164 switch (ENC_CODERANGE(str)) { 4165 case ENC_CODERANGE_7BIT: 4166 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); 4167 break; 4168 default: 4169 
ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN); 4170 break; 4171 } 4172 } 4173 4174 OBJ_INFECT(str2, str); 4175 4176 return str2; 4177} 4178 4179static VALUE 4180str_byte_aref(VALUE str, VALUE indx) 4181{ 4182 long idx; 4183 switch (TYPE(indx)) { 4184 case T_FIXNUM: 4185 idx = FIX2LONG(indx); 4186 4187 num_index: 4188 str = str_byte_substr(str, idx, 1); 4189 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil; 4190 return str; 4191 4192 default: 4193 /* check if indx is Range */ 4194 { 4195 long beg, len = RSTRING_LEN(str); 4196 4197 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { 4198 case Qfalse: 4199 break; 4200 case Qnil: 4201 return Qnil; 4202 default: 4203 return str_byte_substr(str, beg, len); 4204 } 4205 } 4206 idx = NUM2LONG(indx); 4207 goto num_index; 4208 } 4209 4210 UNREACHABLE; 4211} 4212 4213/* 4214 * call-seq: 4215 * str.byteslice(fixnum) -> new_str or nil 4216 * str.byteslice(fixnum, fixnum) -> new_str or nil 4217 * str.byteslice(range) -> new_str or nil 4218 * 4219 * Byte Reference---If passed a single <code>Fixnum</code>, returns a 4220 * substring of one byte at that position. If passed two <code>Fixnum</code> 4221 * objects, returns a substring starting at the offset given by the first, and 4222 * a length given by the second. If given a <code>Range</code>, a substring containing 4223 * bytes at offsets given by the range is returned. In all three cases, if 4224 * an offset is negative, it is counted from the end of <i>str</i>. Returns 4225 * <code>nil</code> if the initial offset falls outside the string, the length 4226 * is negative, or the beginning of the range is greater than the end. 4227 * The encoding of the resulted string keeps original encoding. 
4228 * 4229 * "hello".byteslice(1) #=> "e" 4230 * "hello".byteslice(-1) #=> "o" 4231 * "hello".byteslice(1, 2) #=> "el" 4232 * "\x80\u3042".byteslice(1, 3) #=> "\u3042" 4233 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042" 4234 */ 4235 4236static VALUE 4237rb_str_byteslice(int argc, VALUE *argv, VALUE str) 4238{ 4239 if (argc == 2) { 4240 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 4241 } 4242 rb_check_arity(argc, 1, 2); 4243 return str_byte_aref(str, argv[0]); 4244} 4245 4246/* 4247 * call-seq: 4248 * str.reverse -> new_str 4249 * 4250 * Returns a new string with the characters from <i>str</i> in reverse order. 4251 * 4252 * "stressed".reverse #=> "desserts" 4253 */ 4254 4255static VALUE 4256rb_str_reverse(VALUE str) 4257{ 4258 rb_encoding *enc; 4259 VALUE rev; 4260 char *s, *e, *p; 4261 int single = 1; 4262 4263 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str); 4264 enc = STR_ENC_GET(str); 4265 rev = rb_str_new5(str, 0, RSTRING_LEN(str)); 4266 s = RSTRING_PTR(str); e = RSTRING_END(str); 4267 p = RSTRING_END(rev); 4268 4269 if (RSTRING_LEN(str) > 1) { 4270 if (single_byte_optimizable(str)) { 4271 while (s < e) { 4272 *--p = *s++; 4273 } 4274 } 4275 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) { 4276 while (s < e) { 4277 int clen = rb_enc_fast_mbclen(s, e, enc); 4278 4279 if (clen > 1 || (*s & 0x80)) single = 0; 4280 p -= clen; 4281 memcpy(p, s, clen); 4282 s += clen; 4283 } 4284 } 4285 else { 4286 while (s < e) { 4287 int clen = rb_enc_mbclen(s, e, enc); 4288 4289 if (clen > 1 || (*s & 0x80)) single = 0; 4290 p -= clen; 4291 memcpy(p, s, clen); 4292 s += clen; 4293 } 4294 } 4295 } 4296 STR_SET_LEN(rev, RSTRING_LEN(str)); 4297 OBJ_INFECT(rev, str); 4298 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) { 4299 if (single) { 4300 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 4301 } 4302 else { 4303 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 4304 } 4305 } 4306 rb_enc_cr_str_copy_for_substr(rev, str); 4307 4308 return rev; 4309} 4310 4311 
4312/* 4313 * call-seq: 4314 * str.reverse! -> str 4315 * 4316 * Reverses <i>str</i> in place. 4317 */ 4318 4319static VALUE 4320rb_str_reverse_bang(VALUE str) 4321{ 4322 if (RSTRING_LEN(str) > 1) { 4323 if (single_byte_optimizable(str)) { 4324 char *s, *e, c; 4325 4326 str_modify_keep_cr(str); 4327 s = RSTRING_PTR(str); 4328 e = RSTRING_END(str) - 1; 4329 while (s < e) { 4330 c = *s; 4331 *s++ = *e; 4332 *e-- = c; 4333 } 4334 } 4335 else { 4336 rb_str_shared_replace(str, rb_str_reverse(str)); 4337 } 4338 } 4339 else { 4340 str_modify_keep_cr(str); 4341 } 4342 return str; 4343} 4344 4345 4346/* 4347 * call-seq: 4348 * str.include? other_str -> true or false 4349 * 4350 * Returns <code>true</code> if <i>str</i> contains the given string or 4351 * character. 4352 * 4353 * "hello".include? "lo" #=> true 4354 * "hello".include? "ol" #=> false 4355 * "hello".include? ?h #=> true 4356 */ 4357 4358static VALUE 4359rb_str_include(VALUE str, VALUE arg) 4360{ 4361 long i; 4362 4363 StringValue(arg); 4364 i = rb_str_index(str, arg, 0); 4365 4366 if (i == -1) return Qfalse; 4367 return Qtrue; 4368} 4369 4370 4371/* 4372 * call-seq: 4373 * str.to_i(base=10) -> integer 4374 * 4375 * Returns the result of interpreting leading characters in <i>str</i> as an 4376 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the 4377 * end of a valid number are ignored. If there is not a valid number at the 4378 * start of <i>str</i>, <code>0</code> is returned. This method never raises an 4379 * exception when <i>base</i> is valid. 
4380 * 4381 * "12345".to_i #=> 12345 4382 * "99 red balloons".to_i #=> 99 4383 * "0a".to_i #=> 0 4384 * "0a".to_i(16) #=> 10 4385 * "hello".to_i #=> 0 4386 * "1100101".to_i(2) #=> 101 4387 * "1100101".to_i(8) #=> 294977 4388 * "1100101".to_i(10) #=> 1100101 4389 * "1100101".to_i(16) #=> 17826049 4390 */ 4391 4392static VALUE 4393rb_str_to_i(int argc, VALUE *argv, VALUE str) 4394{ 4395 int base; 4396 4397 if (argc == 0) base = 10; 4398 else { 4399 VALUE b; 4400 4401 rb_scan_args(argc, argv, "01", &b); 4402 base = NUM2INT(b); 4403 } 4404 if (base < 0) { 4405 rb_raise(rb_eArgError, "invalid radix %d", base); 4406 } 4407 return rb_str_to_inum(str, base, FALSE); 4408} 4409 4410 4411/* 4412 * call-seq: 4413 * str.to_f -> float 4414 * 4415 * Returns the result of interpreting leading characters in <i>str</i> as a 4416 * floating point number. Extraneous characters past the end of a valid number 4417 * are ignored. If there is not a valid number at the start of <i>str</i>, 4418 * <code>0.0</code> is returned. This method never raises an exception. 4419 * 4420 * "123.45e1".to_f #=> 1234.5 4421 * "45.67 degrees".to_f #=> 45.67 4422 * "thx1138".to_f #=> 0.0 4423 */ 4424 4425static VALUE 4426rb_str_to_f(VALUE str) 4427{ 4428 return DBL2NUM(rb_str_to_dbl(str, FALSE)); 4429} 4430 4431 4432/* 4433 * call-seq: 4434 * str.to_s -> str 4435 * str.to_str -> str 4436 * 4437 * Returns the receiver. 
4438 */ 4439 4440static VALUE 4441rb_str_to_s(VALUE str) 4442{ 4443 if (rb_obj_class(str) != rb_cString) { 4444 return str_duplicate(rb_cString, str); 4445 } 4446 return str; 4447} 4448 4449#if 0 4450static void 4451str_cat_char(VALUE str, unsigned int c, rb_encoding *enc) 4452{ 4453 char s[RUBY_MAX_CHAR_LEN]; 4454 int n = rb_enc_codelen(c, enc); 4455 4456 rb_enc_mbcput(c, s, enc); 4457 rb_enc_str_buf_cat(str, s, n, enc); 4458} 4459#endif 4460 4461#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ 4462 4463int 4464rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p) 4465{ 4466 char buf[CHAR_ESC_LEN + 1]; 4467 int l; 4468 4469#if SIZEOF_INT > 4 4470 c &= 0xffffffff; 4471#endif 4472 if (unicode_p) { 4473 if (c < 0x7F && ISPRINT(c)) { 4474 snprintf(buf, CHAR_ESC_LEN, "%c", c); 4475 } 4476 else if (c < 0x10000) { 4477 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c); 4478 } 4479 else { 4480 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c); 4481 } 4482 } 4483 else { 4484 if (c < 0x100) { 4485 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c); 4486 } 4487 else { 4488 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c); 4489 } 4490 } 4491 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */ 4492 rb_str_buf_cat(result, buf, l); 4493 return l; 4494} 4495 4496/* 4497 * call-seq: 4498 * str.inspect -> string 4499 * 4500 * Returns a printable version of _str_, surrounded by quote marks, 4501 * with special characters escaped. 
4502 * 4503 * str = "hello" 4504 * str[3] = "\b" 4505 * str.inspect #=> "\"hel\\bo\"" 4506 */ 4507 4508VALUE 4509rb_str_inspect(VALUE str) 4510{ 4511 rb_encoding *enc = STR_ENC_GET(str); 4512 const char *p, *pend, *prev; 4513 char buf[CHAR_ESC_LEN + 1]; 4514 VALUE result = rb_str_buf_new(0); 4515 rb_encoding *resenc = rb_default_internal_encoding(); 4516 int unicode_p = rb_enc_unicode_p(enc); 4517 int asciicompat = rb_enc_asciicompat(enc); 4518 static rb_encoding *utf16, *utf32; 4519 4520 if (!utf16) utf16 = rb_enc_find("UTF-16"); 4521 if (!utf32) utf32 = rb_enc_find("UTF-32"); 4522 if (resenc == NULL) resenc = rb_default_external_encoding(); 4523 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding(); 4524 rb_enc_associate(result, resenc); 4525 str_buf_cat2(result, "\""); 4526 4527 p = RSTRING_PTR(str); pend = RSTRING_END(str); 4528 prev = p; 4529 if (enc == utf16) { 4530 const unsigned char *q = (const unsigned char *)p; 4531 if (q[0] == 0xFE && q[1] == 0xFF) 4532 enc = rb_enc_find("UTF-16BE"); 4533 else if (q[0] == 0xFF && q[1] == 0xFE) 4534 enc = rb_enc_find("UTF-16LE"); 4535 else 4536 unicode_p = 0; 4537 } 4538 else if (enc == utf32) { 4539 const unsigned char *q = (const unsigned char *)p; 4540 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) 4541 enc = rb_enc_find("UTF-32BE"); 4542 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) 4543 enc = rb_enc_find("UTF-32LE"); 4544 else 4545 unicode_p = 0; 4546 } 4547 while (p < pend) { 4548 unsigned int c, cc; 4549 int n; 4550 4551 n = rb_enc_precise_mbclen(p, pend, enc); 4552 if (!MBCLEN_CHARFOUND_P(n)) { 4553 if (p > prev) str_buf_cat(result, prev, p - prev); 4554 n = rb_enc_mbminlen(enc); 4555 if (pend < p + n) 4556 n = (int)(pend - p); 4557 while (n--) { 4558 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377); 4559 str_buf_cat(result, buf, strlen(buf)); 4560 prev = ++p; 4561 } 4562 continue; 4563 } 4564 n = MBCLEN_CHARFOUND_LEN(n); 4565 c = rb_enc_mbc_to_codepoint(p, pend, enc); 
4566 p += n; 4567 if ((asciicompat || unicode_p) && 4568 (c == '"'|| c == '\\' || 4569 (c == '#' && 4570 p < pend && 4571 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) && 4572 (cc = rb_enc_codepoint(p,pend,enc), 4573 (cc == '$' || cc == '@' || cc == '{'))))) { 4574 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 4575 str_buf_cat2(result, "\\"); 4576 if (asciicompat || enc == resenc) { 4577 prev = p - n; 4578 continue; 4579 } 4580 } 4581 switch (c) { 4582 case '\n': cc = 'n'; break; 4583 case '\r': cc = 'r'; break; 4584 case '\t': cc = 't'; break; 4585 case '\f': cc = 'f'; break; 4586 case '\013': cc = 'v'; break; 4587 case '\010': cc = 'b'; break; 4588 case '\007': cc = 'a'; break; 4589 case 033: cc = 'e'; break; 4590 default: cc = 0; break; 4591 } 4592 if (cc) { 4593 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 4594 buf[0] = '\\'; 4595 buf[1] = (char)cc; 4596 str_buf_cat(result, buf, 2); 4597 prev = p; 4598 continue; 4599 } 4600 if ((enc == resenc && rb_enc_isprint(c, enc)) || 4601 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) { 4602 continue; 4603 } 4604 else { 4605 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 4606 rb_str_buf_cat_escaped_char(result, c, unicode_p); 4607 prev = p; 4608 continue; 4609 } 4610 } 4611 if (p > prev) str_buf_cat(result, prev, p - prev); 4612 str_buf_cat2(result, "\""); 4613 4614 OBJ_INFECT(result, str); 4615 return result; 4616} 4617 4618#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) 4619 4620/* 4621 * call-seq: 4622 * str.dump -> new_str 4623 * 4624 * Produces a version of +str+ with all non-printing characters replaced by 4625 * <code>\nnn</code> notation and all special characters escaped. 
 *
 *     "hello \n ''".dump  #=> "\"hello \\n ''\""
 */

VALUE
rb_str_dump(VALUE str)
{
    rb_encoding *enc = rb_enc_get(str);
    long len;
    const char *p, *pend;
    char *q, *qend;
    VALUE result;
    int u8 = (enc == rb_utf8_encoding());

    /*
     * Pass 1: compute the exact length of the dumped text so that the
     * result can be allocated in one go.  The cases below must mirror
     * pass 2 byte for byte.
     */
    len = 2; /* "" */
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    while (p < pend) {
        unsigned char c = *p++;
        switch (c) {
          case '"': case '\\':
          case '\n': case '\r':
          case '\t': case '\f':
          case '\013': case '\010': case '\007': case '\033':
            /* backslash plus one character */
            len += 2;
            break;

          case '#':
            /* '#' needs a backslash only in front of '$', '@' or '{' */
            len += IS_EVSTR(p, pend) ? 2 : 1;
            break;

          default:
            if (ISPRINT(c)) {
                len++;
            }
            else {
                if (u8) { /* \u{NN} */
                    int n = rb_enc_precise_mbclen(p-1, pend, enc);
                    /* n-1 > 0, i.e. a character of at least two bytes */
                    if (MBCLEN_CHARFOUND_P(n-1)) {
                        unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                        /* one byte per hex digit beyond the first ... */
                        while (cc >>= 4) len++;
                        /* ... plus `\u{`, the first digit and `}` */
                        len += 5;
                        /* skip the remaining bytes of the character */
                        p += MBCLEN_CHARFOUND_LEN(n)-1;
                        break;
                    }
                }
                len += 4; /* \xNN */
            }
            break;
        }
    }
    if (!rb_enc_asciicompat(enc)) {
        len += 19; /* ".force_encoding('')" */
        len += strlen(enc->name);
    }

    /*
     * Pass 2: write the dumped text.  qend lies one past the byte that
     * follows the len computed bytes (the terminator written below).
     */
    result = rb_str_new5(str, 0, len);
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    q = RSTRING_PTR(result); qend = q + len + 1;

    *q++ = '"';
    while (p < pend) {
        unsigned char c = *p++;

        if (c == '"' || c == '\\') {
            *q++ = '\\';
            *q++ = c;
        }
        else if (c == '#') {
            if (IS_EVSTR(p, pend)) *q++ = '\\';
            *q++ = '#';
        }
        else if (c == '\n') {
            *q++ = '\\';
            *q++ = 'n';
        }
        else if (c == '\r') {
            *q++ = '\\';
            *q++ = 'r';
        }
        else if (c == '\t') {
            *q++ = '\\';
            *q++ = 't';
        }
        else if (c == '\f') {
            *q++ = '\\';
            *q++ = 'f';
        }
        else if (c == '\013') {
            *q++ = '\\';
            *q++ = 'v';
        }
        else if (c == '\010') {
            *q++ = '\\';
            *q++ = 'b';
        }
        else if (c == '\007') {
            *q++ = '\\';
            *q++ = 'a';
        }
        else if (c == '\033') {
            *q++ = '\\';
            *q++ = 'e';
        }
        else if (ISPRINT(c)) {
            *q++ = c;
        }
        else {
            *q++ = '\\';
            if (u8) {
                /* here n already has 1 subtracted, unlike in pass 1 */
                int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
                if (MBCLEN_CHARFOUND_P(n)) {
                    int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                    p += n;
                    snprintf(q, qend-q, "u{%x}", cc);
                    q += strlen(q);
                    continue;
                }
            }
            snprintf(q, qend-q, "x%02X", c);
            /* 'x' plus two hex digits */
            q += 3;
        }
    }
    *q++ = '"';
    *q = '\0';
    if (!rb_enc_asciicompat(enc)) {
        /* the suffix names the original encoding; the dump itself is then
         * tagged as ASCII-8BIT */
        snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
        enc = rb_ascii8bit_encoding();
    }
    OBJ_INFECT(result, str);
    /* result from dump is ASCII */
    rb_enc_associate(result, enc);
    ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
    return result;
}


/* Raises EncCompatError when enc is a dummy encoding; the callers visible
 * in this file are the case-conversion methods. */
static void
rb_str_check_dummy_enc(rb_encoding *enc)
{
    if (rb_enc_dummy_p(enc)) {
        rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
                 rb_enc_name(enc));
    }
}

/*
 *  call-seq:
 *     str.upcase!   -> str or nil
 *
 *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
 *  were made.
 *  Note: case replacement is effective only in ASCII region.
4778 */ 4779 4780static VALUE 4781rb_str_upcase_bang(VALUE str) 4782{ 4783 rb_encoding *enc; 4784 char *s, *send; 4785 int modify = 0; 4786 int n; 4787 4788 str_modify_keep_cr(str); 4789 enc = STR_ENC_GET(str); 4790 rb_str_check_dummy_enc(enc); 4791 s = RSTRING_PTR(str); send = RSTRING_END(str); 4792 if (single_byte_optimizable(str)) { 4793 while (s < send) { 4794 unsigned int c = *(unsigned char*)s; 4795 4796 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 4797 *s = 'A' + (c - 'a'); 4798 modify = 1; 4799 } 4800 s++; 4801 } 4802 } 4803 else { 4804 int ascompat = rb_enc_asciicompat(enc); 4805 4806 while (s < send) { 4807 unsigned int c; 4808 4809 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 4810 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 4811 *s = 'A' + (c - 'a'); 4812 modify = 1; 4813 } 4814 s++; 4815 } 4816 else { 4817 c = rb_enc_codepoint_len(s, send, &n, enc); 4818 if (rb_enc_islower(c, enc)) { 4819 /* assuming toupper returns codepoint with same size */ 4820 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 4821 modify = 1; 4822 } 4823 s += n; 4824 } 4825 } 4826 } 4827 4828 if (modify) return str; 4829 return Qnil; 4830} 4831 4832 4833/* 4834 * call-seq: 4835 * str.upcase -> new_str 4836 * 4837 * Returns a copy of <i>str</i> with all lowercase letters replaced with their 4838 * uppercase counterparts. The operation is locale insensitive---only 4839 * characters ``a'' to ``z'' are affected. 4840 * Note: case replacement is effective only in ASCII region. 4841 * 4842 * "hEllO".upcase #=> "HELLO" 4843 */ 4844 4845static VALUE 4846rb_str_upcase(VALUE str) 4847{ 4848 str = rb_str_dup(str); 4849 rb_str_upcase_bang(str); 4850 return str; 4851} 4852 4853 4854/* 4855 * call-seq: 4856 * str.downcase! -> str or nil 4857 * 4858 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no 4859 * changes were made. 4860 * Note: case replacement is effective only in ASCII region. 
4861 */ 4862 4863static VALUE 4864rb_str_downcase_bang(VALUE str) 4865{ 4866 rb_encoding *enc; 4867 char *s, *send; 4868 int modify = 0; 4869 4870 str_modify_keep_cr(str); 4871 enc = STR_ENC_GET(str); 4872 rb_str_check_dummy_enc(enc); 4873 s = RSTRING_PTR(str); send = RSTRING_END(str); 4874 if (single_byte_optimizable(str)) { 4875 while (s < send) { 4876 unsigned int c = *(unsigned char*)s; 4877 4878 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 4879 *s = 'a' + (c - 'A'); 4880 modify = 1; 4881 } 4882 s++; 4883 } 4884 } 4885 else { 4886 int ascompat = rb_enc_asciicompat(enc); 4887 4888 while (s < send) { 4889 unsigned int c; 4890 int n; 4891 4892 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 4893 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 4894 *s = 'a' + (c - 'A'); 4895 modify = 1; 4896 } 4897 s++; 4898 } 4899 else { 4900 c = rb_enc_codepoint_len(s, send, &n, enc); 4901 if (rb_enc_isupper(c, enc)) { 4902 /* assuming toupper returns codepoint with same size */ 4903 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 4904 modify = 1; 4905 } 4906 s += n; 4907 } 4908 } 4909 } 4910 4911 if (modify) return str; 4912 return Qnil; 4913} 4914 4915 4916/* 4917 * call-seq: 4918 * str.downcase -> new_str 4919 * 4920 * Returns a copy of <i>str</i> with all uppercase letters replaced with their 4921 * lowercase counterparts. The operation is locale insensitive---only 4922 * characters ``A'' to ``Z'' are affected. 4923 * Note: case replacement is effective only in ASCII region. 4924 * 4925 * "hEllO".downcase #=> "hello" 4926 */ 4927 4928static VALUE 4929rb_str_downcase(VALUE str) 4930{ 4931 str = rb_str_dup(str); 4932 rb_str_downcase_bang(str); 4933 return str; 4934} 4935 4936 4937/* 4938 * call-seq: 4939 * str.capitalize! -> str or nil 4940 * 4941 * Modifies <i>str</i> by converting the first character to uppercase and the 4942 * remainder to lowercase. Returns <code>nil</code> if no changes are made. 
4943 * Note: case conversion is effective only in ASCII region. 4944 * 4945 * a = "hello" 4946 * a.capitalize! #=> "Hello" 4947 * a #=> "Hello" 4948 * a.capitalize! #=> nil 4949 */ 4950 4951static VALUE 4952rb_str_capitalize_bang(VALUE str) 4953{ 4954 rb_encoding *enc; 4955 char *s, *send; 4956 int modify = 0; 4957 unsigned int c; 4958 int n; 4959 4960 str_modify_keep_cr(str); 4961 enc = STR_ENC_GET(str); 4962 rb_str_check_dummy_enc(enc); 4963 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 4964 s = RSTRING_PTR(str); send = RSTRING_END(str); 4965 4966 c = rb_enc_codepoint_len(s, send, &n, enc); 4967 if (rb_enc_islower(c, enc)) { 4968 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 4969 modify = 1; 4970 } 4971 s += n; 4972 while (s < send) { 4973 c = rb_enc_codepoint_len(s, send, &n, enc); 4974 if (rb_enc_isupper(c, enc)) { 4975 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 4976 modify = 1; 4977 } 4978 s += n; 4979 } 4980 4981 if (modify) return str; 4982 return Qnil; 4983} 4984 4985 4986/* 4987 * call-seq: 4988 * str.capitalize -> new_str 4989 * 4990 * Returns a copy of <i>str</i> with the first character converted to uppercase 4991 * and the remainder to lowercase. 4992 * Note: case conversion is effective only in ASCII region. 4993 * 4994 * "hello".capitalize #=> "Hello" 4995 * "HELLO".capitalize #=> "Hello" 4996 * "123ABC".capitalize #=> "123abc" 4997 */ 4998 4999static VALUE 5000rb_str_capitalize(VALUE str) 5001{ 5002 str = rb_str_dup(str); 5003 rb_str_capitalize_bang(str); 5004 return str; 5005} 5006 5007 5008/* 5009 * call-seq: 5010 * str.swapcase! -> str or nil 5011 * 5012 * Equivalent to <code>String#swapcase</code>, but modifies the receiver in 5013 * place, returning <i>str</i>, or <code>nil</code> if no changes were made. 5014 * Note: case conversion is effective only in ASCII region. 
5015 */ 5016 5017static VALUE 5018rb_str_swapcase_bang(VALUE str) 5019{ 5020 rb_encoding *enc; 5021 char *s, *send; 5022 int modify = 0; 5023 int n; 5024 5025 str_modify_keep_cr(str); 5026 enc = STR_ENC_GET(str); 5027 rb_str_check_dummy_enc(enc); 5028 s = RSTRING_PTR(str); send = RSTRING_END(str); 5029 while (s < send) { 5030 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc); 5031 5032 if (rb_enc_isupper(c, enc)) { 5033 /* assuming toupper returns codepoint with same size */ 5034 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 5035 modify = 1; 5036 } 5037 else if (rb_enc_islower(c, enc)) { 5038 /* assuming tolower returns codepoint with same size */ 5039 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 5040 modify = 1; 5041 } 5042 s += n; 5043 } 5044 5045 if (modify) return str; 5046 return Qnil; 5047} 5048 5049 5050/* 5051 * call-seq: 5052 * str.swapcase -> new_str 5053 * 5054 * Returns a copy of <i>str</i> with uppercase alphabetic characters converted 5055 * to lowercase and lowercase characters converted to uppercase. 5056 * Note: case conversion is effective only in ASCII region. 
5057 * 5058 * "Hello".swapcase #=> "hELLO" 5059 * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11" 5060 */ 5061 5062static VALUE 5063rb_str_swapcase(VALUE str) 5064{ 5065 str = rb_str_dup(str); 5066 rb_str_swapcase_bang(str); 5067 return str; 5068} 5069 5070typedef unsigned char *USTR; 5071 5072struct tr { 5073 int gen; 5074 unsigned int now, max; 5075 char *p, *pend; 5076}; 5077 5078static unsigned int 5079trnext(struct tr *t, rb_encoding *enc) 5080{ 5081 int n; 5082 5083 for (;;) { 5084 if (!t->gen) { 5085nextpart: 5086 if (t->p == t->pend) return -1; 5087 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) { 5088 t->p += n; 5089 } 5090 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 5091 t->p += n; 5092 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) { 5093 t->p += n; 5094 if (t->p < t->pend) { 5095 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 5096 t->p += n; 5097 if (t->now > c) { 5098 if (t->now < 0x80 && c < 0x80) { 5099 rb_raise(rb_eArgError, 5100 "invalid range \"%c-%c\" in string transliteration", 5101 t->now, c); 5102 } 5103 else { 5104 rb_raise(rb_eArgError, "invalid range in string transliteration"); 5105 } 5106 continue; /* not reached */ 5107 } 5108 t->gen = 1; 5109 t->max = c; 5110 } 5111 } 5112 return t->now; 5113 } 5114 else { 5115 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) { 5116 if (t->now == t->max) { 5117 t->gen = 0; 5118 goto nextpart; 5119 } 5120 } 5121 if (t->now < t->max) { 5122 return t->now; 5123 } 5124 else { 5125 t->gen = 0; 5126 return t->max; 5127 } 5128 } 5129 } 5130} 5131 5132static VALUE rb_str_delete_bang(int,VALUE*,VALUE); 5133 5134static VALUE 5135tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) 5136{ 5137 const unsigned int errc = -1; 5138 unsigned int trans[256]; 5139 rb_encoding *enc, *e1, *e2; 5140 struct tr trsrc, trrepl; 5141 int cflag = 0; 5142 unsigned int c, c0, last = 0; 5143 int modify = 0, i, l; 5144 char *s, *send; 5145 VALUE hash = 0; 5146 
int singlebyte = single_byte_optimizable(str); 5147 int cr; 5148 5149#define CHECK_IF_ASCII(c) \ 5150 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \ 5151 (cr = ENC_CODERANGE_VALID) : 0) 5152 5153 StringValue(src); 5154 StringValue(repl); 5155 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 5156 if (RSTRING_LEN(repl) == 0) { 5157 return rb_str_delete_bang(1, &src, str); 5158 } 5159 5160 cr = ENC_CODERANGE(str); 5161 e1 = rb_enc_check(str, src); 5162 e2 = rb_enc_check(str, repl); 5163 if (e1 == e2) { 5164 enc = e1; 5165 } 5166 else { 5167 enc = rb_enc_check(src, repl); 5168 } 5169 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src); 5170 if (RSTRING_LEN(src) > 1 && 5171 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' && 5172 trsrc.p + l < trsrc.pend) { 5173 cflag = 1; 5174 trsrc.p += l; 5175 } 5176 trrepl.p = RSTRING_PTR(repl); 5177 trrepl.pend = trrepl.p + RSTRING_LEN(repl); 5178 trsrc.gen = trrepl.gen = 0; 5179 trsrc.now = trrepl.now = 0; 5180 trsrc.max = trrepl.max = 0; 5181 5182 if (cflag) { 5183 for (i=0; i<256; i++) { 5184 trans[i] = 1; 5185 } 5186 while ((c = trnext(&trsrc, enc)) != errc) { 5187 if (c < 256) { 5188 trans[c] = errc; 5189 } 5190 else { 5191 if (!hash) hash = rb_hash_new(); 5192 rb_hash_aset(hash, UINT2NUM(c), Qtrue); 5193 } 5194 } 5195 while ((c = trnext(&trrepl, enc)) != errc) 5196 /* retrieve last replacer */; 5197 last = trrepl.now; 5198 for (i=0; i<256; i++) { 5199 if (trans[i] != errc) { 5200 trans[i] = last; 5201 } 5202 } 5203 } 5204 else { 5205 unsigned int r; 5206 5207 for (i=0; i<256; i++) { 5208 trans[i] = errc; 5209 } 5210 while ((c = trnext(&trsrc, enc)) != errc) { 5211 r = trnext(&trrepl, enc); 5212 if (r == errc) r = trrepl.now; 5213 if (c < 256) { 5214 trans[c] = r; 5215 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0; 5216 } 5217 else { 5218 if (!hash) hash = rb_hash_new(); 5219 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r)); 5220 } 5221 } 5222 } 5223 5224 if (cr == ENC_CODERANGE_VALID) 5225 
cr = ENC_CODERANGE_7BIT; 5226 str_modify_keep_cr(str); 5227 s = RSTRING_PTR(str); send = RSTRING_END(str); 5228 if (sflag) { 5229 int clen, tlen; 5230 long offset, max = RSTRING_LEN(str); 5231 unsigned int save = -1; 5232 char *buf = ALLOC_N(char, max), *t = buf; 5233 5234 while (s < send) { 5235 int may_modify = 0; 5236 5237 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 5238 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 5239 5240 s += clen; 5241 if (c < 256) { 5242 c = trans[c]; 5243 } 5244 else if (hash) { 5245 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 5246 if (NIL_P(tmp)) { 5247 if (cflag) c = last; 5248 else c = errc; 5249 } 5250 else if (cflag) c = errc; 5251 else c = NUM2INT(tmp); 5252 } 5253 else { 5254 c = errc; 5255 } 5256 if (c != (unsigned int)-1) { 5257 if (save == c) { 5258 CHECK_IF_ASCII(c); 5259 continue; 5260 } 5261 save = c; 5262 tlen = rb_enc_codelen(c, enc); 5263 modify = 1; 5264 } 5265 else { 5266 save = -1; 5267 c = c0; 5268 if (enc != e1) may_modify = 1; 5269 } 5270 while (t - buf + tlen >= max) { 5271 offset = t - buf; 5272 max *= 2; 5273 REALLOC_N(buf, char, max); 5274 t = buf + offset; 5275 } 5276 rb_enc_mbcput(c, t, enc); 5277 if (may_modify && memcmp(s, t, tlen) != 0) { 5278 modify = 1; 5279 } 5280 CHECK_IF_ASCII(c); 5281 t += tlen; 5282 } 5283 if (!STR_EMBED_P(str)) { 5284 xfree(RSTRING(str)->as.heap.ptr); 5285 } 5286 *t = '\0'; 5287 RSTRING(str)->as.heap.ptr = buf; 5288 RSTRING(str)->as.heap.len = t - buf; 5289 STR_SET_NOEMBED(str); 5290 RSTRING(str)->as.heap.aux.capa = max; 5291 } 5292 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) { 5293 while (s < send) { 5294 c = (unsigned char)*s; 5295 if (trans[c] != errc) { 5296 if (!cflag) { 5297 c = trans[c]; 5298 *s = c; 5299 modify = 1; 5300 } 5301 else { 5302 *s = last; 5303 modify = 1; 5304 } 5305 } 5306 CHECK_IF_ASCII(c); 5307 s++; 5308 } 5309 } 5310 else { 5311 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2); 5312 long offset; 5313 char *buf = ALLOC_N(char, 
max), *t = buf; 5314 5315 while (s < send) { 5316 int may_modify = 0; 5317 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 5318 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 5319 5320 if (c < 256) { 5321 c = trans[c]; 5322 } 5323 else if (hash) { 5324 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 5325 if (NIL_P(tmp)) { 5326 if (cflag) c = last; 5327 else c = errc; 5328 } 5329 else if (cflag) c = errc; 5330 else c = NUM2INT(tmp); 5331 } 5332 else { 5333 c = cflag ? last : errc; 5334 } 5335 if (c != errc) { 5336 tlen = rb_enc_codelen(c, enc); 5337 modify = 1; 5338 } 5339 else { 5340 c = c0; 5341 if (enc != e1) may_modify = 1; 5342 } 5343 while (t - buf + tlen >= max) { 5344 offset = t - buf; 5345 max *= 2; 5346 REALLOC_N(buf, char, max); 5347 t = buf + offset; 5348 } 5349 if (s != t) { 5350 rb_enc_mbcput(c, t, enc); 5351 if (may_modify && memcmp(s, t, tlen) != 0) { 5352 modify = 1; 5353 } 5354 } 5355 CHECK_IF_ASCII(c); 5356 s += clen; 5357 t += tlen; 5358 } 5359 if (!STR_EMBED_P(str)) { 5360 xfree(RSTRING(str)->as.heap.ptr); 5361 } 5362 *t = '\0'; 5363 RSTRING(str)->as.heap.ptr = buf; 5364 RSTRING(str)->as.heap.len = t - buf; 5365 STR_SET_NOEMBED(str); 5366 RSTRING(str)->as.heap.aux.capa = max; 5367 } 5368 5369 if (modify) { 5370 if (cr != ENC_CODERANGE_BROKEN) 5371 ENC_CODERANGE_SET(str, cr); 5372 rb_enc_associate(str, enc); 5373 return str; 5374 } 5375 return Qnil; 5376} 5377 5378 5379/* 5380 * call-seq: 5381 * str.tr!(from_str, to_str) -> str or nil 5382 * 5383 * Translates <i>str</i> in place, using the same rules as 5384 * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no 5385 * changes were made. 
5386 */ 5387 5388static VALUE 5389rb_str_tr_bang(VALUE str, VALUE src, VALUE repl) 5390{ 5391 return tr_trans(str, src, repl, 0); 5392} 5393 5394 5395/* 5396 * call-seq: 5397 * str.tr(from_str, to_str) => new_str 5398 * 5399 * Returns a copy of +str+ with the characters in +from_str+ replaced by the 5400 * corresponding characters in +to_str+. If +to_str+ is shorter than 5401 * +from_str+, it is padded with its last character in order to maintain the 5402 * correspondence. 5403 * 5404 * "hello".tr('el', 'ip') #=> "hippo" 5405 * "hello".tr('aeiou', '*') #=> "h*ll*" 5406 * "hello".tr('aeiou', 'AA*') #=> "hAll*" 5407 * 5408 * Both strings may use the <code>c1-c2</code> notation to denote ranges of 5409 * characters, and +from_str+ may start with a <code>^</code>, which denotes 5410 * all characters except those listed. 5411 * 5412 * "hello".tr('a-y', 'b-z') #=> "ifmmp" 5413 * "hello".tr('^aeiou', '*') #=> "*e**o" 5414 * 5415 * The backslash character <code>\</code> can be used to escape 5416 * <code>^</code> or <code>-</code> and is otherwise ignored unless it 5417 * appears at the end of a range or the end of the +from_str+ or +to_str+: 5418 * 5419 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld" 5420 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld" 5421 * 5422 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld" 5423 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold" 5424 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld" 5425 * 5426 * "X['\\b']".tr("X\\", "") #=> "['b']" 5427 * "X['\\b']".tr("X-\\]", "") #=> "'b'" 5428 */ 5429 5430static VALUE 5431rb_str_tr(VALUE str, VALUE src, VALUE repl) 5432{ 5433 str = rb_str_dup(str); 5434 tr_trans(str, src, repl, 0); 5435 return str; 5436} 5437 5438#define TR_TABLE_SIZE 257 5439static void 5440tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first, 5441 VALUE *tablep, VALUE *ctablep, rb_encoding *enc) 5442{ 5443 const unsigned int errc = -1; 5444 char buf[256]; 5445 struct tr tr; 5446 unsigned int c; 5447 
VALUE table = 0, ptable = 0; 5448 int i, l, cflag = 0; 5449 5450 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str); 5451 tr.gen = tr.now = tr.max = 0; 5452 5453 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') { 5454 cflag = 1; 5455 tr.p += l; 5456 } 5457 if (first) { 5458 for (i=0; i<256; i++) { 5459 stable[i] = 1; 5460 } 5461 stable[256] = cflag; 5462 } 5463 else if (stable[256] && !cflag) { 5464 stable[256] = 0; 5465 } 5466 for (i=0; i<256; i++) { 5467 buf[i] = cflag; 5468 } 5469 5470 while ((c = trnext(&tr, enc)) != errc) { 5471 if (c < 256) { 5472 buf[c & 0xff] = !cflag; 5473 } 5474 else { 5475 VALUE key = UINT2NUM(c); 5476 5477 if (!table && (first || *tablep || stable[256])) { 5478 if (cflag) { 5479 ptable = *ctablep; 5480 table = ptable ? ptable : rb_hash_new(); 5481 *ctablep = table; 5482 } 5483 else { 5484 table = rb_hash_new(); 5485 ptable = *tablep; 5486 *tablep = table; 5487 } 5488 } 5489 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) { 5490 rb_hash_aset(table, key, Qtrue); 5491 } 5492 } 5493 } 5494 for (i=0; i<256; i++) { 5495 stable[i] = stable[i] && buf[i]; 5496 } 5497 if (!table && !cflag) { 5498 *tablep = 0; 5499 } 5500} 5501 5502 5503static int 5504tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel) 5505{ 5506 if (c < 256) { 5507 return table[c] != 0; 5508 } 5509 else { 5510 VALUE v = UINT2NUM(c); 5511 5512 if (del) { 5513 if (!NIL_P(rb_hash_lookup(del, v)) && 5514 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) { 5515 return TRUE; 5516 } 5517 } 5518 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) { 5519 return FALSE; 5520 } 5521 return table[256] ? TRUE : FALSE; 5522 } 5523} 5524 5525/* 5526 * call-seq: 5527 * str.delete!([other_str]+) -> str or nil 5528 * 5529 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or 5530 * <code>nil</code> if <i>str</i> was not modified. 
5531 */ 5532 5533static VALUE 5534rb_str_delete_bang(int argc, VALUE *argv, VALUE str) 5535{ 5536 char squeez[TR_TABLE_SIZE]; 5537 rb_encoding *enc = 0; 5538 char *s, *send, *t; 5539 VALUE del = 0, nodel = 0; 5540 int modify = 0; 5541 int i, ascompat, cr; 5542 5543 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 5544 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS); 5545 for (i=0; i<argc; i++) { 5546 VALUE s = argv[i]; 5547 5548 StringValue(s); 5549 enc = rb_enc_check(str, s); 5550 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 5551 } 5552 5553 str_modify_keep_cr(str); 5554 ascompat = rb_enc_asciicompat(enc); 5555 s = t = RSTRING_PTR(str); 5556 send = RSTRING_END(str); 5557 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 5558 while (s < send) { 5559 unsigned int c; 5560 int clen; 5561 5562 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 5563 if (squeez[c]) { 5564 modify = 1; 5565 } 5566 else { 5567 if (t != s) *t = c; 5568 t++; 5569 } 5570 s++; 5571 } 5572 else { 5573 c = rb_enc_codepoint_len(s, send, &clen, enc); 5574 5575 if (tr_find(c, squeez, del, nodel)) { 5576 modify = 1; 5577 } 5578 else { 5579 if (t != s) rb_enc_mbcput(c, t, enc); 5580 t += clen; 5581 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID; 5582 } 5583 s += clen; 5584 } 5585 } 5586 *t = '\0'; 5587 STR_SET_LEN(str, t - RSTRING_PTR(str)); 5588 ENC_CODERANGE_SET(str, cr); 5589 5590 if (modify) return str; 5591 return Qnil; 5592} 5593 5594 5595/* 5596 * call-seq: 5597 * str.delete([other_str]+) -> new_str 5598 * 5599 * Returns a copy of <i>str</i> with all characters in the intersection of its 5600 * arguments deleted. Uses the same rules for building the set of characters as 5601 * <code>String#count</code>. 
5602 * 5603 * "hello".delete "l","lo" #=> "heo" 5604 * "hello".delete "lo" #=> "he" 5605 * "hello".delete "aeiou", "^e" #=> "hell" 5606 * "hello".delete "ej-m" #=> "ho" 5607 */ 5608 5609static VALUE 5610rb_str_delete(int argc, VALUE *argv, VALUE str) 5611{ 5612 str = rb_str_dup(str); 5613 rb_str_delete_bang(argc, argv, str); 5614 return str; 5615} 5616 5617 5618/* 5619 * call-seq: 5620 * str.squeeze!([other_str]*) -> str or nil 5621 * 5622 * Squeezes <i>str</i> in place, returning either <i>str</i>, or 5623 * <code>nil</code> if no changes were made. 5624 */ 5625 5626static VALUE 5627rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) 5628{ 5629 char squeez[TR_TABLE_SIZE]; 5630 rb_encoding *enc = 0; 5631 VALUE del = 0, nodel = 0; 5632 char *s, *send, *t; 5633 int i, modify = 0; 5634 int ascompat, singlebyte = single_byte_optimizable(str); 5635 unsigned int save; 5636 5637 if (argc == 0) { 5638 enc = STR_ENC_GET(str); 5639 } 5640 else { 5641 for (i=0; i<argc; i++) { 5642 VALUE s = argv[i]; 5643 5644 StringValue(s); 5645 enc = rb_enc_check(str, s); 5646 if (singlebyte && !single_byte_optimizable(s)) 5647 singlebyte = 0; 5648 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 5649 } 5650 } 5651 5652 str_modify_keep_cr(str); 5653 s = t = RSTRING_PTR(str); 5654 if (!s || RSTRING_LEN(str) == 0) return Qnil; 5655 send = RSTRING_END(str); 5656 save = -1; 5657 ascompat = rb_enc_asciicompat(enc); 5658 5659 if (singlebyte) { 5660 while (s < send) { 5661 unsigned int c = *(unsigned char*)s++; 5662 if (c != save || (argc > 0 && !squeez[c])) { 5663 *t++ = save = c; 5664 } 5665 } 5666 } else { 5667 while (s < send) { 5668 unsigned int c; 5669 int clen; 5670 5671 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 5672 if (c != save || (argc > 0 && !squeez[c])) { 5673 *t++ = save = c; 5674 } 5675 s++; 5676 } 5677 else { 5678 c = rb_enc_codepoint_len(s, send, &clen, enc); 5679 5680 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) { 5681 if (t != s) rb_enc_mbcput(c, 
t, enc); 5682 save = c; 5683 t += clen; 5684 } 5685 s += clen; 5686 } 5687 } 5688 } 5689 5690 *t = '\0'; 5691 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) { 5692 STR_SET_LEN(str, t - RSTRING_PTR(str)); 5693 modify = 1; 5694 } 5695 5696 if (modify) return str; 5697 return Qnil; 5698} 5699 5700 5701/* 5702 * call-seq: 5703 * str.squeeze([other_str]*) -> new_str 5704 * 5705 * Builds a set of characters from the <i>other_str</i> parameter(s) using the 5706 * procedure described for <code>String#count</code>. Returns a new string 5707 * where runs of the same character that occur in this set are replaced by a 5708 * single character. If no arguments are given, all runs of identical 5709 * characters are replaced by a single character. 5710 * 5711 * "yellow moon".squeeze #=> "yelow mon" 5712 * " now is the".squeeze(" ") #=> " now is the" 5713 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls" 5714 */ 5715 5716static VALUE 5717rb_str_squeeze(int argc, VALUE *argv, VALUE str) 5718{ 5719 str = rb_str_dup(str); 5720 rb_str_squeeze_bang(argc, argv, str); 5721 return str; 5722} 5723 5724 5725/* 5726 * call-seq: 5727 * str.tr_s!(from_str, to_str) -> str or nil 5728 * 5729 * Performs <code>String#tr_s</code> processing on <i>str</i> in place, 5730 * returning <i>str</i>, or <code>nil</code> if no changes were made. 5731 */ 5732 5733static VALUE 5734rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl) 5735{ 5736 return tr_trans(str, src, repl, 1); 5737} 5738 5739 5740/* 5741 * call-seq: 5742 * str.tr_s(from_str, to_str) -> new_str 5743 * 5744 * Processes a copy of <i>str</i> as described under <code>String#tr</code>, 5745 * then removes duplicate characters in regions that were affected by the 5746 * translation. 
5747 * 5748 * "hello".tr_s('l', 'r') #=> "hero" 5749 * "hello".tr_s('el', '*') #=> "h*o" 5750 * "hello".tr_s('el', 'hx') #=> "hhxo" 5751 */ 5752 5753static VALUE 5754rb_str_tr_s(VALUE str, VALUE src, VALUE repl) 5755{ 5756 str = rb_str_dup(str); 5757 tr_trans(str, src, repl, 1); 5758 return str; 5759} 5760 5761 5762/* 5763 * call-seq: 5764 * str.count([other_str]+) -> fixnum 5765 * 5766 * Each +other_str+ parameter defines a set of characters to count. The 5767 * intersection of these sets defines the characters to count in +str+. Any 5768 * +other_str+ that starts with a caret <code>^</code> is negated. The 5769 * sequence <code>c1-c2</code> means all characters between c1 and c2. The 5770 * backslash character <code>\</code> can be used to escape <code>^</code> or 5771 * <code>-</code> and is otherwise ignored unless it appears at the end of a 5772 * sequence or the end of a +other_str+. 5773 * 5774 * a = "hello world" 5775 * a.count "lo" #=> 5 5776 * a.count "lo", "o" #=> 2 5777 * a.count "hello", "^l" #=> 4 5778 * a.count "ej-m" #=> 4 5779 * 5780 * "hello^world".count "\\^aeiou" #=> 4 5781 * "hello-world".count "a\\-eo" #=> 4 5782 * 5783 * c = "hello world\\r\\n" 5784 * c.count "\\" #=> 2 5785 * c.count "\\A" #=> 0 5786 * c.count "X-\\w" #=> 3 5787 */ 5788 5789static VALUE 5790rb_str_count(int argc, VALUE *argv, VALUE str) 5791{ 5792 char table[TR_TABLE_SIZE]; 5793 rb_encoding *enc = 0; 5794 VALUE del = 0, nodel = 0; 5795 char *s, *send; 5796 int i; 5797 int ascompat; 5798 5799 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS); 5800 for (i=0; i<argc; i++) { 5801 VALUE tstr = argv[i]; 5802 unsigned char c; 5803 5804 StringValue(tstr); 5805 enc = rb_enc_check(str, tstr); 5806 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) && 5807 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) { 5808 int n = 0; 5809 5810 s = RSTRING_PTR(str); 5811 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); 5812 send = RSTRING_END(str); 5813 while (s < send) { 
5814 if (*(unsigned char*)s++ == c) n++; 5815 } 5816 return INT2NUM(n); 5817 } 5818 tr_setup_table(tstr, table, i==0, &del, &nodel, enc); 5819 } 5820 5821 s = RSTRING_PTR(str); 5822 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); 5823 send = RSTRING_END(str); 5824 ascompat = rb_enc_asciicompat(enc); 5825 i = 0; 5826 while (s < send) { 5827 unsigned int c; 5828 5829 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 5830 if (table[c]) { 5831 i++; 5832 } 5833 s++; 5834 } 5835 else { 5836 int clen; 5837 c = rb_enc_codepoint_len(s, send, &clen, enc); 5838 if (tr_find(c, table, del, nodel)) { 5839 i++; 5840 } 5841 s += clen; 5842 } 5843 } 5844 5845 return INT2NUM(i); 5846} 5847 5848static const char isspacetable[256] = { 5849 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 5850 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5851 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5852 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5853 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5854 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5855 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5856 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5857 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5858 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5859 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5860 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5861 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5862 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5863 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5864 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 5865}; 5866 5867#define ascii_isspace(c) isspacetable[(unsigned char)(c)] 5868 5869/* 5870 * call-seq: 5871 * str.split(pattern=$;, [limit]) -> anArray 5872 * 5873 * Divides <i>str</i> into substrings based on a delimiter, returning an array 5874 * of these substrings. 5875 * 5876 * If <i>pattern</i> is a <code>String</code>, then its contents are used as 5877 * the delimiter when splitting <i>str</i>. 
If <i>pattern</i> is a single 5878 * space, <i>str</i> is split on whitespace, with leading whitespace and runs 5879 * of contiguous whitespace characters ignored. 5880 * 5881 * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the 5882 * pattern matches. Whenever the pattern matches a zero-length string, 5883 * <i>str</i> is split into individual characters. If <i>pattern</i> contains 5884 * groups, the respective matches will be returned in the array as well. 5885 * 5886 * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If 5887 * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is 5888 * split on whitespace as if ` ' were specified. 5889 * 5890 * If the <i>limit</i> parameter is omitted, trailing null fields are 5891 * suppressed. If <i>limit</i> is a positive number, at most that number of 5892 * fields will be returned (if <i>limit</i> is <code>1</code>, the entire 5893 * string is returned as the only entry in an array). If negative, there is no 5894 * limit to the number of fields returned, and trailing null fields are not 5895 * suppressed. 5896 * 5897 * When the input +str+ is empty an empty Array is returned as the string is 5898 * considered to have no fields to split. 
5899 * 5900 * " now's the time".split #=> ["now's", "the", "time"] 5901 * " now's the time".split(' ') #=> ["now's", "the", "time"] 5902 * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"] 5903 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"] 5904 * "hello".split(//) #=> ["h", "e", "l", "l", "o"] 5905 * "hello".split(//, 3) #=> ["h", "e", "llo"] 5906 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"] 5907 * 5908 * "mellow yellow".split("ello") #=> ["m", "w y", "w"] 5909 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"] 5910 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"] 5911 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""] 5912 * 5913 * "".split(',', -1) #=> [] 5914 */ 5915 5916static VALUE 5917rb_str_split_m(int argc, VALUE *argv, VALUE str) 5918{ 5919 rb_encoding *enc; 5920 VALUE spat; 5921 VALUE limit; 5922 enum {awk, string, regexp} split_type; 5923 long beg, end, i = 0; 5924 int lim = 0; 5925 VALUE result, tmp; 5926 5927 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) { 5928 lim = NUM2INT(limit); 5929 if (lim <= 0) limit = Qnil; 5930 else if (lim == 1) { 5931 if (RSTRING_LEN(str) == 0) 5932 return rb_ary_new2(0); 5933 return rb_ary_new3(1, str); 5934 } 5935 i = 1; 5936 } 5937 5938 enc = STR_ENC_GET(str); 5939 if (NIL_P(spat)) { 5940 if (!NIL_P(rb_fs)) { 5941 spat = rb_fs; 5942 goto fs_set; 5943 } 5944 split_type = awk; 5945 } 5946 else { 5947 fs_set: 5948 if (RB_TYPE_P(spat, T_STRING)) { 5949 rb_encoding *enc2 = STR_ENC_GET(spat); 5950 5951 split_type = string; 5952 if (RSTRING_LEN(spat) == 0) { 5953 /* Special case - split into chars */ 5954 spat = rb_reg_regcomp(spat); 5955 split_type = regexp; 5956 } 5957 else if (rb_enc_asciicompat(enc2) == 1) { 5958 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){ 5959 split_type = awk; 5960 } 5961 } 5962 else { 5963 int l; 5964 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' && 5965 RSTRING_LEN(spat) == l) { 
5966 split_type = awk; 5967 } 5968 } 5969 } 5970 else { 5971 spat = get_pat(spat, 1); 5972 split_type = regexp; 5973 } 5974 } 5975 5976 result = rb_ary_new(); 5977 beg = 0; 5978 if (split_type == awk) { 5979 char *ptr = RSTRING_PTR(str); 5980 char *eptr = RSTRING_END(str); 5981 char *bptr = ptr; 5982 int skip = 1; 5983 unsigned int c; 5984 5985 end = beg; 5986 if (is_ascii_string(str)) { 5987 while (ptr < eptr) { 5988 c = (unsigned char)*ptr++; 5989 if (skip) { 5990 if (ascii_isspace(c)) { 5991 beg = ptr - bptr; 5992 } 5993 else { 5994 end = ptr - bptr; 5995 skip = 0; 5996 if (!NIL_P(limit) && lim <= i) break; 5997 } 5998 } 5999 else if (ascii_isspace(c)) { 6000 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 6001 skip = 1; 6002 beg = ptr - bptr; 6003 if (!NIL_P(limit)) ++i; 6004 } 6005 else { 6006 end = ptr - bptr; 6007 } 6008 } 6009 } 6010 else { 6011 while (ptr < eptr) { 6012 int n; 6013 6014 c = rb_enc_codepoint_len(ptr, eptr, &n, enc); 6015 ptr += n; 6016 if (skip) { 6017 if (rb_isspace(c)) { 6018 beg = ptr - bptr; 6019 } 6020 else { 6021 end = ptr - bptr; 6022 skip = 0; 6023 if (!NIL_P(limit) && lim <= i) break; 6024 } 6025 } 6026 else if (rb_isspace(c)) { 6027 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 6028 skip = 1; 6029 beg = ptr - bptr; 6030 if (!NIL_P(limit)) ++i; 6031 } 6032 else { 6033 end = ptr - bptr; 6034 } 6035 } 6036 } 6037 } 6038 else if (split_type == string) { 6039 char *ptr = RSTRING_PTR(str); 6040 char *temp = ptr; 6041 char *eptr = RSTRING_END(str); 6042 char *sptr = RSTRING_PTR(spat); 6043 long slen = RSTRING_LEN(spat); 6044 6045 if (is_broken_string(str)) { 6046 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str))); 6047 } 6048 if (is_broken_string(spat)) { 6049 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat))); 6050 } 6051 enc = rb_enc_check(str, spat); 6052 while (ptr < eptr && 6053 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) { 6054 /* 
Check we are at the start of a char */ 6055 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc); 6056 if (t != ptr + end) { 6057 ptr = t; 6058 continue; 6059 } 6060 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end)); 6061 ptr += end + slen; 6062 if (!NIL_P(limit) && lim <= ++i) break; 6063 } 6064 beg = ptr - temp; 6065 } 6066 else { 6067 char *ptr = RSTRING_PTR(str); 6068 long len = RSTRING_LEN(str); 6069 long start = beg; 6070 long idx; 6071 int last_null = 0; 6072 struct re_registers *regs; 6073 6074 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) { 6075 regs = RMATCH_REGS(rb_backref_get()); 6076 if (start == end && BEG(0) == END(0)) { 6077 if (!ptr) { 6078 rb_ary_push(result, str_new_empty(str)); 6079 break; 6080 } 6081 else if (last_null == 1) { 6082 rb_ary_push(result, rb_str_subseq(str, beg, 6083 rb_enc_fast_mbclen(ptr+beg, 6084 ptr+len, 6085 enc))); 6086 beg = start; 6087 } 6088 else { 6089 if (ptr+start == ptr+len) 6090 start++; 6091 else 6092 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc); 6093 last_null = 1; 6094 continue; 6095 } 6096 } 6097 else { 6098 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 6099 beg = start = END(0); 6100 } 6101 last_null = 0; 6102 6103 for (idx=1; idx < regs->num_regs; idx++) { 6104 if (BEG(idx) == -1) continue; 6105 if (BEG(idx) == END(idx)) 6106 tmp = str_new_empty(str); 6107 else 6108 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx)); 6109 rb_ary_push(result, tmp); 6110 } 6111 if (!NIL_P(limit) && lim <= ++i) break; 6112 } 6113 } 6114 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) { 6115 if (RSTRING_LEN(str) == beg) 6116 tmp = str_new_empty(str); 6117 else 6118 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg); 6119 rb_ary_push(result, tmp); 6120 } 6121 if (NIL_P(limit) && lim == 0) { 6122 long len; 6123 while ((len = RARRAY_LEN(result)) > 0 && 6124 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0)) 6125 rb_ary_pop(result); 6126 } 6127 6128 
return result; 6129} 6130 6131VALUE 6132rb_str_split(VALUE str, const char *sep0) 6133{ 6134 VALUE sep; 6135 6136 StringValue(str); 6137 sep = rb_str_new2(sep0); 6138 return rb_str_split_m(1, &sep, str); 6139} 6140 6141 6142static VALUE 6143rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray) 6144{ 6145 rb_encoding *enc; 6146 VALUE rs; 6147 unsigned int newline; 6148 const char *p, *pend, *s, *ptr; 6149 long len, rslen; 6150 VALUE line; 6151 int n; 6152 VALUE orig = str; 6153 VALUE UNINITIALIZED_VAR(ary); 6154 6155 if (argc == 0) { 6156 rs = rb_rs; 6157 } 6158 else { 6159 rb_scan_args(argc, argv, "01", &rs); 6160 } 6161 6162 if (rb_block_given_p()) { 6163 if (wantarray) { 6164#if 0 /* next major */ 6165 rb_warn("given block not used"); 6166 ary = rb_ary_new(); 6167#else 6168 rb_warning("passing a block to String#lines is deprecated"); 6169 wantarray = 0; 6170#endif 6171 } 6172 } 6173 else { 6174 if (wantarray) 6175 ary = rb_ary_new(); 6176 else 6177 RETURN_ENUMERATOR(str, argc, argv); 6178 } 6179 6180 if (NIL_P(rs)) { 6181 if (wantarray) { 6182 rb_ary_push(ary, str); 6183 return ary; 6184 } 6185 else { 6186 rb_yield(str); 6187 return orig; 6188 } 6189 } 6190 str = rb_str_new4(str); 6191 ptr = p = s = RSTRING_PTR(str); 6192 pend = p + RSTRING_LEN(str); 6193 len = RSTRING_LEN(str); 6194 StringValue(rs); 6195 if (rs == rb_default_rs) { 6196 enc = rb_enc_get(str); 6197 while (p < pend) { 6198 char *p0; 6199 6200 p = memchr(p, '\n', pend - p); 6201 if (!p) break; 6202 p0 = rb_enc_left_char_head(s, p, pend, enc); 6203 if (!rb_enc_is_newline(p0, pend, enc)) { 6204 p++; 6205 continue; 6206 } 6207 p = p0 + rb_enc_mbclen(p0, pend, enc); 6208 line = rb_str_subseq(str, s - ptr, p - s); 6209 if (wantarray) 6210 rb_ary_push(ary, line); 6211 else 6212 rb_yield(line); 6213 str_mod_check(str, ptr, len); 6214 s = p; 6215 } 6216 goto finish; 6217 } 6218 6219 enc = rb_enc_check(str, rs); 6220 rslen = RSTRING_LEN(rs); 6221 if (rslen == 0) { 6222 newline = '\n'; 6223 
} 6224 else { 6225 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc); 6226 } 6227 6228 while (p < pend) { 6229 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc); 6230 6231 again: 6232 if (rslen == 0 && c == newline) { 6233 p += n; 6234 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) { 6235 goto again; 6236 } 6237 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) { 6238 p += n; 6239 } 6240 p -= n; 6241 } 6242 if (c == newline && 6243 (rslen <= 1 || 6244 (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) { 6245 const char *pp = p + (rslen ? rslen : n); 6246 line = rb_str_subseq(str, s - ptr, pp - s); 6247 if (wantarray) 6248 rb_ary_push(ary, line); 6249 else 6250 rb_yield(line); 6251 str_mod_check(str, ptr, len); 6252 s = pp; 6253 } 6254 p += n; 6255 } 6256 6257 finish: 6258 if (s != pend) { 6259 line = rb_str_subseq(str, s - ptr, pend - s); 6260 if (wantarray) 6261 rb_ary_push(ary, line); 6262 else 6263 rb_yield(line); 6264 RB_GC_GUARD(str); 6265 } 6266 6267 if (wantarray) 6268 return ary; 6269 else 6270 return orig; 6271} 6272 6273/* 6274 * call-seq: 6275 * str.each_line(separator=$/) {|substr| block } -> str 6276 * str.each_line(separator=$/) -> an_enumerator 6277 * 6278 * Splits <i>str</i> using the supplied parameter as the record 6279 * separator (<code>$/</code> by default), passing each substring in 6280 * turn to the supplied block. If a zero-length record separator is 6281 * supplied, the string is split into paragraphs delimited by 6282 * multiple successive newlines. 6283 * 6284 * If no block is given, an enumerator is returned instead. 
6285 * 6286 * print "Example one\n" 6287 * "hello\nworld".each_line {|s| p s} 6288 * print "Example two\n" 6289 * "hello\nworld".each_line('l') {|s| p s} 6290 * print "Example three\n" 6291 * "hello\n\n\nworld".each_line('') {|s| p s} 6292 * 6293 * <em>produces:</em> 6294 * 6295 * Example one 6296 * "hello\n" 6297 * "world" 6298 * Example two 6299 * "hel" 6300 * "l" 6301 * "o\nworl" 6302 * "d" 6303 * Example three 6304 * "hello\n\n\n" 6305 * "world" 6306 */ 6307 6308static VALUE 6309rb_str_each_line(int argc, VALUE *argv, VALUE str) 6310{ 6311 return rb_str_enumerate_lines(argc, argv, str, 0); 6312} 6313 6314/* 6315 * call-seq: 6316 * str.lines(separator=$/) -> an_array 6317 * 6318 * Returns an array of lines in <i>str</i> split using the supplied 6319 * record separator (<code>$/</code> by default). This is a 6320 * shorthand for <code>str.each_line(separator).to_a</code>. 6321 * 6322 * If a block is given, which is a deprecated form, works the same as 6323 * <code>each_line</code>. 6324 */ 6325 6326static VALUE 6327rb_str_lines(int argc, VALUE *argv, VALUE str) 6328{ 6329 return rb_str_enumerate_lines(argc, argv, str, 1); 6330} 6331 6332static VALUE 6333rb_str_each_byte_size(VALUE str, VALUE args) 6334{ 6335 return LONG2FIX(RSTRING_LEN(str)); 6336} 6337 6338static VALUE 6339rb_str_enumerate_bytes(VALUE str, int wantarray) 6340{ 6341 long i; 6342 VALUE UNINITIALIZED_VAR(ary); 6343 6344 if (rb_block_given_p()) { 6345 if (wantarray) { 6346#if 0 /* next major */ 6347 rb_warn("given block not used"); 6348 ary = rb_ary_new(); 6349#else 6350 rb_warning("passing a block to String#bytes is deprecated"); 6351 wantarray = 0; 6352#endif 6353 } 6354 } 6355 else { 6356 if (wantarray) 6357 ary = rb_ary_new2(RSTRING_LEN(str)); 6358 else 6359 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size); 6360 } 6361 6362 for (i=0; i<RSTRING_LEN(str); i++) { 6363 if (wantarray) 6364 rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff)); 6365 else 6366 
rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff)); 6367 } 6368 if (wantarray) 6369 return ary; 6370 else 6371 return str; 6372} 6373 6374/* 6375 * call-seq: 6376 * str.each_byte {|fixnum| block } -> str 6377 * str.each_byte -> an_enumerator 6378 * 6379 * Passes each byte in <i>str</i> to the given block, or returns an 6380 * enumerator if no block is given. 6381 * 6382 * "hello".each_byte {|c| print c, ' ' } 6383 * 6384 * <em>produces:</em> 6385 * 6386 * 104 101 108 108 111 6387 */ 6388 6389static VALUE 6390rb_str_each_byte(VALUE str) 6391{ 6392 return rb_str_enumerate_bytes(str, 0); 6393} 6394 6395/* 6396 * call-seq: 6397 * str.bytes -> an_array 6398 * 6399 * Returns an array of bytes in <i>str</i>. This is a shorthand for 6400 * <code>str.each_byte.to_a</code>. 6401 * 6402 * If a block is given, which is a deprecated form, works the same as 6403 * <code>each_byte</code>. 6404 */ 6405 6406static VALUE 6407rb_str_bytes(VALUE str) 6408{ 6409 return rb_str_enumerate_bytes(str, 1); 6410} 6411 6412static VALUE 6413rb_str_each_char_size(VALUE str) 6414{ 6415 long len = RSTRING_LEN(str); 6416 if (!single_byte_optimizable(str)) { 6417 const char *ptr = RSTRING_PTR(str); 6418 rb_encoding *enc = rb_enc_get(str); 6419 const char *end_ptr = ptr + len; 6420 for (len = 0; ptr < end_ptr; ++len) { 6421 ptr += rb_enc_mbclen(ptr, end_ptr, enc); 6422 } 6423 } 6424 return LONG2FIX(len); 6425} 6426 6427static VALUE 6428rb_str_enumerate_chars(VALUE str, int wantarray) 6429{ 6430 VALUE orig = str; 6431 VALUE substr; 6432 long i, len, n; 6433 const char *ptr; 6434 rb_encoding *enc; 6435 VALUE UNINITIALIZED_VAR(ary); 6436 6437 if (rb_block_given_p()) { 6438 if (wantarray) { 6439#if 0 /* next major */ 6440 rb_warn("given block not used"); 6441 ary = rb_ary_new(); 6442#else 6443 rb_warning("passing a block to String#chars is deprecated"); 6444 wantarray = 0; 6445#endif 6446 } 6447 } 6448 else { 6449 if (wantarray) 6450 ary = rb_ary_new(); 6451 else 6452 RETURN_SIZED_ENUMERATOR(str, 0, 0, 
rb_str_each_char_size); 6453 } 6454 6455 str = rb_str_new4(str); 6456 ptr = RSTRING_PTR(str); 6457 len = RSTRING_LEN(str); 6458 enc = rb_enc_get(str); 6459 switch (ENC_CODERANGE(str)) { 6460 case ENC_CODERANGE_VALID: 6461 case ENC_CODERANGE_7BIT: 6462 for (i = 0; i < len; i += n) { 6463 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc); 6464 substr = rb_str_subseq(str, i, n); 6465 if (wantarray) 6466 rb_ary_push(ary, substr); 6467 else 6468 rb_yield(substr); 6469 } 6470 break; 6471 default: 6472 for (i = 0; i < len; i += n) { 6473 n = rb_enc_mbclen(ptr + i, ptr + len, enc); 6474 substr = rb_str_subseq(str, i, n); 6475 if (wantarray) 6476 rb_ary_push(ary, substr); 6477 else 6478 rb_yield(substr); 6479 } 6480 } 6481 RB_GC_GUARD(str); 6482 if (wantarray) 6483 return ary; 6484 else 6485 return orig; 6486} 6487 6488/* 6489 * call-seq: 6490 * str.each_char {|cstr| block } -> str 6491 * str.each_char -> an_enumerator 6492 * 6493 * Passes each character in <i>str</i> to the given block, or returns 6494 * an enumerator if no block is given. 6495 * 6496 * "hello".each_char {|c| print c, ' ' } 6497 * 6498 * <em>produces:</em> 6499 * 6500 * h e l l o 6501 */ 6502 6503static VALUE 6504rb_str_each_char(VALUE str) 6505{ 6506 return rb_str_enumerate_chars(str, 0); 6507} 6508 6509/* 6510 * call-seq: 6511 * str.chars -> an_array 6512 * 6513 * Returns an array of characters in <i>str</i>. This is a shorthand 6514 * for <code>str.each_char.to_a</code>. 6515 * 6516 * If a block is given, which is a deprecated form, works the same as 6517 * <code>each_char</code>. 
6518 */ 6519 6520static VALUE 6521rb_str_chars(VALUE str) 6522{ 6523 return rb_str_enumerate_chars(str, 1); 6524} 6525 6526 6527static VALUE 6528rb_str_enumerate_codepoints(VALUE str, int wantarray) 6529{ 6530 VALUE orig = str; 6531 int n; 6532 unsigned int c; 6533 const char *ptr, *end; 6534 rb_encoding *enc; 6535 VALUE UNINITIALIZED_VAR(ary); 6536 6537 if (single_byte_optimizable(str)) 6538 return rb_str_enumerate_bytes(str, wantarray); 6539 6540 if (rb_block_given_p()) { 6541 if (wantarray) { 6542#if 0 /* next major */ 6543 rb_warn("given block not used"); 6544 ary = rb_ary_new(); 6545#else 6546 rb_warning("passing a block to String#codepoints is deprecated"); 6547 wantarray = 0; 6548#endif 6549 } 6550 } 6551 else { 6552 if (wantarray) 6553 ary = rb_ary_new(); 6554 else 6555 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size); 6556 } 6557 6558 str = rb_str_new4(str); 6559 ptr = RSTRING_PTR(str); 6560 end = RSTRING_END(str); 6561 enc = STR_ENC_GET(str); 6562 while (ptr < end) { 6563 c = rb_enc_codepoint_len(ptr, end, &n, enc); 6564 if (wantarray) 6565 rb_ary_push(ary, UINT2NUM(c)); 6566 else 6567 rb_yield(UINT2NUM(c)); 6568 ptr += n; 6569 } 6570 RB_GC_GUARD(str); 6571 if (wantarray) 6572 return ary; 6573 else 6574 return orig; 6575} 6576 6577/* 6578 * call-seq: 6579 * str.each_codepoint {|integer| block } -> str 6580 * str.each_codepoint -> an_enumerator 6581 * 6582 * Passes the <code>Integer</code> ordinal of each character in <i>str</i>, 6583 * also known as a <i>codepoint</i> when applied to Unicode strings to the 6584 * given block. 6585 * 6586 * If no block is given, an enumerator is returned instead. 
6587 * 6588 * "hello\u0639".each_codepoint {|c| print c, ' ' } 6589 * 6590 * <em>produces:</em> 6591 * 6592 * 104 101 108 108 111 1593 6593 */ 6594 6595static VALUE 6596rb_str_each_codepoint(VALUE str) 6597{ 6598 return rb_str_enumerate_codepoints(str, 0); 6599} 6600 6601/* 6602 * call-seq: 6603 * str.codepoints -> an_array 6604 * 6605 * Returns an array of the <code>Integer</code> ordinals of the 6606 * characters in <i>str</i>. This is a shorthand for 6607 * <code>str.each_codepoint.to_a</code>. 6608 * 6609 * If a block is given, which is a deprecated form, works the same as 6610 * <code>each_codepoint</code>. 6611 */ 6612 6613static VALUE 6614rb_str_codepoints(VALUE str) 6615{ 6616 return rb_str_enumerate_codepoints(str, 1); 6617} 6618 6619 6620static long 6621chopped_length(VALUE str) 6622{ 6623 rb_encoding *enc = STR_ENC_GET(str); 6624 const char *p, *p2, *beg, *end; 6625 6626 beg = RSTRING_PTR(str); 6627 end = beg + RSTRING_LEN(str); 6628 if (beg > end) return 0; 6629 p = rb_enc_prev_char(beg, end, end, enc); 6630 if (!p) return 0; 6631 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') { 6632 p2 = rb_enc_prev_char(beg, p, end, enc); 6633 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2; 6634 } 6635 return p - beg; 6636} 6637 6638/* 6639 * call-seq: 6640 * str.chop! -> str or nil 6641 * 6642 * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>, 6643 * or <code>nil</code> if <i>str</i> is the empty string. See also 6644 * <code>String#chomp!</code>. 
6645 */ 6646 6647static VALUE 6648rb_str_chop_bang(VALUE str) 6649{ 6650 str_modify_keep_cr(str); 6651 if (RSTRING_LEN(str) > 0) { 6652 long len; 6653 len = chopped_length(str); 6654 STR_SET_LEN(str, len); 6655 RSTRING_PTR(str)[len] = '\0'; 6656 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { 6657 ENC_CODERANGE_CLEAR(str); 6658 } 6659 return str; 6660 } 6661 return Qnil; 6662} 6663 6664 6665/* 6666 * call-seq: 6667 * str.chop -> new_str 6668 * 6669 * Returns a new <code>String</code> with the last character removed. If the 6670 * string ends with <code>\r\n</code>, both characters are removed. Applying 6671 * <code>chop</code> to an empty string returns an empty 6672 * string. <code>String#chomp</code> is often a safer alternative, as it leaves 6673 * the string unchanged if it doesn't end in a record separator. 6674 * 6675 * "string\r\n".chop #=> "string" 6676 * "string\n\r".chop #=> "string\n" 6677 * "string\n".chop #=> "string" 6678 * "string".chop #=> "strin" 6679 * "x".chop.chop #=> "" 6680 */ 6681 6682static VALUE 6683rb_str_chop(VALUE str) 6684{ 6685 return rb_str_subseq(str, 0, chopped_length(str)); 6686} 6687 6688 6689/* 6690 * call-seq: 6691 * str.chomp!(separator=$/) -> str or nil 6692 * 6693 * Modifies <i>str</i> in place as described for <code>String#chomp</code>, 6694 * returning <i>str</i>, or <code>nil</code> if no modifications were made. 
*/

static VALUE
rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE rs;
    int newline;
    char *p, *pp, *e;
    long len, rslen;

    str_modify_keep_cr(str);
    len = RSTRING_LEN(str);
    if (len == 0) return Qnil;
    p = RSTRING_PTR(str);
    e = p + len;
    if (argc == 0) {
        rs = rb_rs;
        if (rs == rb_default_rs) {
            /* "smart chomp": strips a trailing "\n", "\r\n" or "\r".
             * Also entered via goto when an explicit "\n" separator
             * is passed (see below). */
          smart_chomp:
            enc = rb_enc_get(str);
            if (rb_enc_mbminlen(enc) > 1) {
                /* encodings whose minimum character length exceeds one
                 * byte: step back by whole characters */
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (rb_enc_is_newline(pp, e, enc)) {
                    e = pp;
                }
                pp = e - rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
                if (e == RSTRING_END(str)) {
                    /* nothing was stripped */
                    return Qnil;
                }
                len = e - RSTRING_PTR(str);
                STR_SET_LEN(str, len);
            }
            else {
                /* encodings with one-byte minimum: inspect trailing bytes */
                if (RSTRING_PTR(str)[len-1] == '\n') {
                    STR_DEC_LEN(str);
                    if (RSTRING_LEN(str) > 0 &&
                        RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
                        STR_DEC_LEN(str);
                    }
                }
                else if (RSTRING_PTR(str)[len-1] == '\r') {
                    STR_DEC_LEN(str);
                }
                else {
                    return Qnil;
                }
            }
            RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
            return str;
        }
    }
    else {
        rb_scan_args(argc, argv, "01", &rs);
    }
    /* nil separator: never strips anything */
    if (NIL_P(rs)) return Qnil;
    StringValue(rs);
    rslen = RSTRING_LEN(rs);
    if (rslen == 0) {
        /* empty separator: strip every trailing "\n" / "\r\n" */
        while (len>0 && p[len-1] == '\n') {
            len--;
            if (len>0 && p[len-1] == '\r')
                len--;
        }
        if (len < RSTRING_LEN(str)) {
            STR_SET_LEN(str, len);
            RSTRING_PTR(str)[len] = '\0';
            return str;
        }
        return Qnil;
    }
    if (rslen > len) return Qnil;
    /* last byte of the separator, used as a cheap first comparison */
    newline = RSTRING_PTR(rs)[rslen-1];
    if (rslen == 1 && newline == '\n')
        goto smart_chomp;

    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return Qnil;
    }
    pp = e - rslen;
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
        /* the match must start on a character boundary of str */
        if (rb_enc_left_char_head(p, pp, e, enc) != pp)
            return Qnil;
        if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
            ENC_CODERANGE_CLEAR(str);
        }
        STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
        RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
        return str;
    }
    return Qnil;
}


/*
 *  call-seq:
 *     str.chomp(separator=$/)   -> new_str
 *
 *  Returns a new <code>String</code> with the given record separator removed
 *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
 *  changed from the default Ruby record separator, then <code>chomp</code> also
 *  removes carriage return characters (that is it will remove <code>\n</code>,
 *  <code>\r</code>, and <code>\r\n</code>).
 *
 *     "hello".chomp            #=> "hello"
 *     "hello\n".chomp          #=> "hello"
 *     "hello\r\n".chomp        #=> "hello"
 *     "hello\n\r".chomp        #=> "hello\n"
 *     "hello\r".chomp          #=> "hello"
 *     "hello \n there".chomp   #=> "hello \n there"
 *     "hello".chomp("llo")     #=> "he"
 */

static VALUE
rb_str_chomp(int argc, VALUE *argv, VALUE str)
{
    str = rb_str_dup(str);
    rb_str_chomp_bang(argc, argv, str);
    return str;
}

/*
 *  call-seq:
 *     str.lstrip!   -> self or nil
 *
 *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
 *  change was made. See also <code>String#rstrip!</code> and
 *  <code>String#strip!</code>.
 *
 *     "  hello  ".lstrip   #=> "hello  "
 *     "hello".lstrip!
#=> nil 6835 */ 6836 6837static VALUE 6838rb_str_lstrip_bang(VALUE str) 6839{ 6840 rb_encoding *enc; 6841 char *s, *t, *e; 6842 6843 str_modify_keep_cr(str); 6844 enc = STR_ENC_GET(str); 6845 s = RSTRING_PTR(str); 6846 if (!s || RSTRING_LEN(str) == 0) return Qnil; 6847 e = t = RSTRING_END(str); 6848 /* remove spaces at head */ 6849 while (s < e) { 6850 int n; 6851 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc); 6852 6853 if (!rb_isspace(cc)) break; 6854 s += n; 6855 } 6856 6857 if (s > RSTRING_PTR(str)) { 6858 STR_SET_LEN(str, t-s); 6859 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str)); 6860 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 6861 return str; 6862 } 6863 return Qnil; 6864} 6865 6866 6867/* 6868 * call-seq: 6869 * str.lstrip -> new_str 6870 * 6871 * Returns a copy of <i>str</i> with leading whitespace removed. See also 6872 * <code>String#rstrip</code> and <code>String#strip</code>. 6873 * 6874 * " hello ".lstrip #=> "hello " 6875 * "hello".lstrip #=> "hello" 6876 */ 6877 6878static VALUE 6879rb_str_lstrip(VALUE str) 6880{ 6881 str = rb_str_dup(str); 6882 rb_str_lstrip_bang(str); 6883 return str; 6884} 6885 6886 6887/* 6888 * call-seq: 6889 * str.rstrip! -> self or nil 6890 * 6891 * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if 6892 * no change was made. See also <code>String#lstrip!</code> and 6893 * <code>String#strip!</code>. 6894 * 6895 * " hello ".rstrip #=> " hello" 6896 * "hello".rstrip! 
#=> nil 6897 */ 6898 6899static VALUE 6900rb_str_rstrip_bang(VALUE str) 6901{ 6902 rb_encoding *enc; 6903 char *s, *t, *e; 6904 6905 str_modify_keep_cr(str); 6906 enc = STR_ENC_GET(str); 6907 rb_str_check_dummy_enc(enc); 6908 s = RSTRING_PTR(str); 6909 if (!s || RSTRING_LEN(str) == 0) return Qnil; 6910 t = e = RSTRING_END(str); 6911 6912 /* remove trailing spaces or '\0's */ 6913 if (single_byte_optimizable(str)) { 6914 unsigned char c; 6915 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--; 6916 } 6917 else { 6918 char *tp; 6919 6920 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) { 6921 unsigned int c = rb_enc_codepoint(tp, e, enc); 6922 if (c && !rb_isspace(c)) break; 6923 t = tp; 6924 } 6925 } 6926 if (t < e) { 6927 long len = t-RSTRING_PTR(str); 6928 6929 STR_SET_LEN(str, len); 6930 RSTRING_PTR(str)[len] = '\0'; 6931 return str; 6932 } 6933 return Qnil; 6934} 6935 6936 6937/* 6938 * call-seq: 6939 * str.rstrip -> new_str 6940 * 6941 * Returns a copy of <i>str</i> with trailing whitespace removed. See also 6942 * <code>String#lstrip</code> and <code>String#strip</code>. 6943 * 6944 * " hello ".rstrip #=> " hello" 6945 * "hello".rstrip #=> "hello" 6946 */ 6947 6948static VALUE 6949rb_str_rstrip(VALUE str) 6950{ 6951 str = rb_str_dup(str); 6952 rb_str_rstrip_bang(str); 6953 return str; 6954} 6955 6956 6957/* 6958 * call-seq: 6959 * str.strip! -> str or nil 6960 * 6961 * Removes leading and trailing whitespace from <i>str</i>. Returns 6962 * <code>nil</code> if <i>str</i> was not altered. 6963 */ 6964 6965static VALUE 6966rb_str_strip_bang(VALUE str) 6967{ 6968 VALUE l = rb_str_lstrip_bang(str); 6969 VALUE r = rb_str_rstrip_bang(str); 6970 6971 if (NIL_P(l) && NIL_P(r)) return Qnil; 6972 return str; 6973} 6974 6975 6976/* 6977 * call-seq: 6978 * str.strip -> new_str 6979 * 6980 * Returns a copy of <i>str</i> with leading and trailing whitespace removed. 
6981 * 6982 * " hello ".strip #=> "hello" 6983 * "\tgoodbye\r\n".strip #=> "goodbye" 6984 */ 6985 6986static VALUE 6987rb_str_strip(VALUE str) 6988{ 6989 str = rb_str_dup(str); 6990 rb_str_strip_bang(str); 6991 return str; 6992} 6993 6994static VALUE 6995scan_once(VALUE str, VALUE pat, long *start) 6996{ 6997 VALUE result, match; 6998 struct re_registers *regs; 6999 int i; 7000 7001 if (rb_reg_search(pat, str, *start, 0) >= 0) { 7002 match = rb_backref_get(); 7003 regs = RMATCH_REGS(match); 7004 if (BEG(0) == END(0)) { 7005 rb_encoding *enc = STR_ENC_GET(str); 7006 /* 7007 * Always consume at least one character of the input string 7008 */ 7009 if (RSTRING_LEN(str) > END(0)) 7010 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0), 7011 RSTRING_END(str), enc); 7012 else 7013 *start = END(0)+1; 7014 } 7015 else { 7016 *start = END(0); 7017 } 7018 if (regs->num_regs == 1) { 7019 return rb_reg_nth_match(0, match); 7020 } 7021 result = rb_ary_new2(regs->num_regs); 7022 for (i=1; i < regs->num_regs; i++) { 7023 rb_ary_push(result, rb_reg_nth_match(i, match)); 7024 } 7025 7026 return result; 7027 } 7028 return Qnil; 7029} 7030 7031 7032/* 7033 * call-seq: 7034 * str.scan(pattern) -> array 7035 * str.scan(pattern) {|match, ...| block } -> str 7036 * 7037 * Both forms iterate through <i>str</i>, matching the pattern (which may be a 7038 * <code>Regexp</code> or a <code>String</code>). For each match, a result is 7039 * generated and either added to the result array or passed to the block. If 7040 * the pattern contains no groups, each individual result consists of the 7041 * matched string, <code>$&</code>. If the pattern contains groups, each 7042 * individual result is itself an array containing one entry per group. 
7043 * 7044 * a = "cruel world" 7045 * a.scan(/\w+/) #=> ["cruel", "world"] 7046 * a.scan(/.../) #=> ["cru", "el ", "wor"] 7047 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]] 7048 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]] 7049 * 7050 * And the block form: 7051 * 7052 * a.scan(/\w+/) {|w| print "<<#{w}>> " } 7053 * print "\n" 7054 * a.scan(/(.)(.)/) {|x,y| print y, x } 7055 * print "\n" 7056 * 7057 * <em>produces:</em> 7058 * 7059 * <<cruel>> <<world>> 7060 * rceu lowlr 7061 */ 7062 7063static VALUE 7064rb_str_scan(VALUE str, VALUE pat) 7065{ 7066 VALUE result; 7067 long start = 0; 7068 long last = -1, prev = 0; 7069 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str); 7070 7071 pat = get_pat(pat, 1); 7072 if (!rb_block_given_p()) { 7073 VALUE ary = rb_ary_new(); 7074 7075 while (!NIL_P(result = scan_once(str, pat, &start))) { 7076 last = prev; 7077 prev = start; 7078 rb_ary_push(ary, result); 7079 } 7080 if (last >= 0) rb_reg_search(pat, str, last, 0); 7081 return ary; 7082 } 7083 7084 while (!NIL_P(result = scan_once(str, pat, &start))) { 7085 last = prev; 7086 prev = start; 7087 rb_yield(result); 7088 str_mod_check(str, p, len); 7089 } 7090 if (last >= 0) rb_reg_search(pat, str, last, 0); 7091 return str; 7092} 7093 7094 7095/* 7096 * call-seq: 7097 * str.hex -> integer 7098 * 7099 * Treats leading characters from <i>str</i> as a string of hexadecimal digits 7100 * (with an optional sign and an optional <code>0x</code>) and returns the 7101 * corresponding number. Zero is returned on error. 7102 * 7103 * "0x0a".hex #=> 10 7104 * "-1234".hex #=> -4660 7105 * "0".hex #=> 0 7106 * "wombat".hex #=> 0 7107 */ 7108 7109static VALUE 7110rb_str_hex(VALUE str) 7111{ 7112 return rb_str_to_inum(str, 16, FALSE); 7113} 7114 7115 7116/* 7117 * call-seq: 7118 * str.oct -> integer 7119 * 7120 * Treats leading characters of <i>str</i> as a string of octal digits (with an 7121 * optional sign) and returns the corresponding number. 
Returns 0 if the 7122 * conversion fails. 7123 * 7124 * "123".oct #=> 83 7125 * "-377".oct #=> -255 7126 * "bad".oct #=> 0 7127 * "0377bad".oct #=> 255 7128 */ 7129 7130static VALUE 7131rb_str_oct(VALUE str) 7132{ 7133 return rb_str_to_inum(str, -8, FALSE); 7134} 7135 7136 7137/* 7138 * call-seq: 7139 * str.crypt(salt_str) -> new_str 7140 * 7141 * Applies a one-way cryptographic hash to <i>str</i> by invoking the 7142 * standard library function <code>crypt(3)</code> with the given 7143 * salt string. While the format and the result are system and 7144 * implementation dependent, using a salt matching the regular 7145 * expression <code>\A[a-zA-Z0-9./]{2}</code> should be valid and 7146 * safe on any platform, in which only the first two characters are 7147 * significant. 7148 * 7149 * This method is for use in system specific scripts, so if you want 7150 * a cross-platform hash function consider using Digest or OpenSSL 7151 * instead. 7152 */ 7153 7154static VALUE 7155rb_str_crypt(VALUE str, VALUE salt) 7156{ 7157 extern char *crypt(const char *, const char *); 7158 VALUE result; 7159 const char *s, *saltp; 7160 char *res; 7161#ifdef BROKEN_CRYPT 7162 char salt_8bit_clean[3]; 7163#endif 7164 7165 StringValue(salt); 7166 if (RSTRING_LEN(salt) < 2) 7167 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)"); 7168 7169 s = RSTRING_PTR(str); 7170 if (!s) s = ""; 7171 saltp = RSTRING_PTR(salt); 7172#ifdef BROKEN_CRYPT 7173 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) { 7174 salt_8bit_clean[0] = saltp[0] & 0x7f; 7175 salt_8bit_clean[1] = saltp[1] & 0x7f; 7176 salt_8bit_clean[2] = '\0'; 7177 saltp = salt_8bit_clean; 7178 } 7179#endif 7180 res = crypt(s, saltp); 7181 if (!res) { 7182 rb_sys_fail("crypt"); 7183 } 7184 result = rb_str_new2(res); 7185 OBJ_INFECT(result, str); 7186 OBJ_INFECT(result, salt); 7187 return result; 7188} 7189 7190 7191/* 7192 * call-seq: 7193 * str.intern -> symbol 7194 * str.to_sym -> symbol 7195 * 7196 * Returns 
the <code>Symbol</code> corresponding to <i>str</i>, creating the 7197 * symbol if it did not previously exist. See <code>Symbol#id2name</code>. 7198 * 7199 * "Koala".intern #=> :Koala 7200 * s = 'cat'.to_sym #=> :cat 7201 * s == :cat #=> true 7202 * s = '@cat'.to_sym #=> :@cat 7203 * s == :@cat #=> true 7204 * 7205 * This can also be used to create symbols that cannot be represented using the 7206 * <code>:xxx</code> notation. 7207 * 7208 * 'cat and dog'.to_sym #=> :"cat and dog" 7209 */ 7210 7211VALUE 7212rb_str_intern(VALUE s) 7213{ 7214 VALUE str = RB_GC_GUARD(s); 7215 ID id; 7216 7217 id = rb_intern_str(str); 7218 return ID2SYM(id); 7219} 7220 7221 7222/* 7223 * call-seq: 7224 * str.ord -> integer 7225 * 7226 * Return the <code>Integer</code> ordinal of a one-character string. 7227 * 7228 * "a".ord #=> 97 7229 */ 7230 7231VALUE 7232rb_str_ord(VALUE s) 7233{ 7234 unsigned int c; 7235 7236 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s)); 7237 return UINT2NUM(c); 7238} 7239/* 7240 * call-seq: 7241 * str.sum(n=16) -> integer 7242 * 7243 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>, 7244 * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting 7245 * to 16. The result is simply the sum of the binary value of each character in 7246 * <i>str</i> modulo <code>2**n - 1</code>. This is not a particularly good 7247 * checksum. 
7248 */ 7249 7250static VALUE 7251rb_str_sum(int argc, VALUE *argv, VALUE str) 7252{ 7253 VALUE vbits; 7254 int bits; 7255 char *ptr, *p, *pend; 7256 long len; 7257 VALUE sum = INT2FIX(0); 7258 unsigned long sum0 = 0; 7259 7260 if (argc == 0) { 7261 bits = 16; 7262 } 7263 else { 7264 rb_scan_args(argc, argv, "01", &vbits); 7265 bits = NUM2INT(vbits); 7266 } 7267 ptr = p = RSTRING_PTR(str); 7268 len = RSTRING_LEN(str); 7269 pend = p + len; 7270 7271 while (p < pend) { 7272 if (FIXNUM_MAX - UCHAR_MAX < sum0) { 7273 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 7274 str_mod_check(str, ptr, len); 7275 sum0 = 0; 7276 } 7277 sum0 += (unsigned char)*p; 7278 p++; 7279 } 7280 7281 if (bits == 0) { 7282 if (sum0) { 7283 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 7284 } 7285 } 7286 else { 7287 if (sum == INT2FIX(0)) { 7288 if (bits < (int)sizeof(long)*CHAR_BIT) { 7289 sum0 &= (((unsigned long)1)<<bits)-1; 7290 } 7291 sum = LONG2FIX(sum0); 7292 } 7293 else { 7294 VALUE mod; 7295 7296 if (sum0) { 7297 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 7298 } 7299 7300 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits)); 7301 mod = rb_funcall(mod, '-', 1, INT2FIX(1)); 7302 sum = rb_funcall(sum, '&', 1, mod); 7303 } 7304 } 7305 return sum; 7306} 7307 7308static VALUE 7309rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) 7310{ 7311 rb_encoding *enc; 7312 VALUE w; 7313 long width, len, flen = 1, fclen = 1; 7314 VALUE res; 7315 char *p; 7316 const char *f = " "; 7317 long n, size, llen, rlen, llen2 = 0, rlen2 = 0; 7318 volatile VALUE pad; 7319 int singlebyte = 1, cr; 7320 7321 rb_scan_args(argc, argv, "11", &w, &pad); 7322 enc = STR_ENC_GET(str); 7323 width = NUM2LONG(w); 7324 if (argc == 2) { 7325 StringValue(pad); 7326 enc = rb_enc_check(str, pad); 7327 f = RSTRING_PTR(pad); 7328 flen = RSTRING_LEN(pad); 7329 fclen = str_strlen(pad, enc); 7330 singlebyte = single_byte_optimizable(pad); 7331 if (flen == 0 || fclen == 0) { 7332 rb_raise(rb_eArgError, "zero 
width padding"); 7333 } 7334 } 7335 len = str_strlen(str, enc); 7336 if (width < 0 || len >= width) return rb_str_dup(str); 7337 n = width - len; 7338 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2); 7339 rlen = n - llen; 7340 cr = ENC_CODERANGE(str); 7341 if (flen > 1) { 7342 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte); 7343 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte); 7344 } 7345 size = RSTRING_LEN(str); 7346 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen || 7347 (len *= flen) >= LONG_MAX - llen2 - rlen2 || 7348 (len += llen2 + rlen2) >= LONG_MAX - size) { 7349 rb_raise(rb_eArgError, "argument too big"); 7350 } 7351 len += size; 7352 res = rb_str_new5(str, 0, len); 7353 p = RSTRING_PTR(res); 7354 if (flen <= 1) { 7355 memset(p, *f, llen); 7356 p += llen; 7357 } 7358 else { 7359 while (llen >= fclen) { 7360 memcpy(p,f,flen); 7361 p += flen; 7362 llen -= fclen; 7363 } 7364 if (llen > 0) { 7365 memcpy(p, f, llen2); 7366 p += llen2; 7367 } 7368 } 7369 memcpy(p, RSTRING_PTR(str), size); 7370 p += size; 7371 if (flen <= 1) { 7372 memset(p, *f, rlen); 7373 p += rlen; 7374 } 7375 else { 7376 while (rlen >= fclen) { 7377 memcpy(p,f,flen); 7378 p += flen; 7379 rlen -= fclen; 7380 } 7381 if (rlen > 0) { 7382 memcpy(p, f, rlen2); 7383 p += rlen2; 7384 } 7385 } 7386 *p = '\0'; 7387 STR_SET_LEN(res, p-RSTRING_PTR(res)); 7388 OBJ_INFECT(res, str); 7389 if (!NIL_P(pad)) OBJ_INFECT(res, pad); 7390 rb_enc_associate(res, enc); 7391 if (argc == 2) 7392 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad)); 7393 if (cr != ENC_CODERANGE_BROKEN) 7394 ENC_CODERANGE_SET(res, cr); 7395 return res; 7396} 7397 7398 7399/* 7400 * call-seq: 7401 * str.ljust(integer, padstr=' ') -> new_str 7402 * 7403 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 7404 * <code>String</code> of length <i>integer</i> with <i>str</i> left justified 7405 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 
*
 *     "hello".ljust(4)            #=> "hello"
 *     "hello".ljust(20)           #=> "hello               "
 *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
 */

static VALUE
rb_str_ljust(int argc, VALUE *argv, VALUE str)
{
    return rb_str_justify(argc, argv, str, 'l');
}


/*
 *  call-seq:
 *     str.rjust(integer, padstr=' ')   -> new_str
 *
 *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
 *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
 *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
 *
 *     "hello".rjust(4)            #=> "hello"
 *     "hello".rjust(20)           #=> "               hello"
 *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
 */

static VALUE
rb_str_rjust(int argc, VALUE *argv, VALUE str)
{
    return rb_str_justify(argc, argv, str, 'r');
}


/*
 *  call-seq:
 *     str.center(width, padstr=' ')   -> new_str
 *
 *  Centers +str+ in +width+.  If +width+ is greater than the length of +str+,
 *  returns a new String of length +width+ with +str+ centered and padded with
 *  +padstr+; otherwise, returns +str+.
 *
 *     "hello".center(4)         #=> "hello"
 *     "hello".center(20)        #=> "       hello        "
 *     "hello".center(20, '123') #=> "1231231hello12312312"
 */

static VALUE
rb_str_center(int argc, VALUE *argv, VALUE str)
{
    return rb_str_justify(argc, argv, str, 'c');
}

/*
 *  call-seq:
 *     str.partition(sep)              -> [head, sep, tail]
 *     str.partition(regexp)           -> [head, match, tail]
 *
 *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
 *  and returns the part before it, the match, and the part
 *  after it.
 *  If it is not found, returns <i>str</i> and two empty strings.
7467 * 7468 * "hello".partition("l") #=> ["he", "l", "lo"] 7469 * "hello".partition("x") #=> ["hello", "", ""] 7470 * "hello".partition(/.l/) #=> ["h", "el", "lo"] 7471 */ 7472 7473static VALUE 7474rb_str_partition(VALUE str, VALUE sep) 7475{ 7476 long pos; 7477 int regex = FALSE; 7478 7479 if (RB_TYPE_P(sep, T_REGEXP)) { 7480 pos = rb_reg_search(sep, str, 0, 0); 7481 regex = TRUE; 7482 } 7483 else { 7484 VALUE tmp; 7485 7486 tmp = rb_check_string_type(sep); 7487 if (NIL_P(tmp)) { 7488 rb_raise(rb_eTypeError, "type mismatch: %s given", 7489 rb_obj_classname(sep)); 7490 } 7491 sep = tmp; 7492 pos = rb_str_index(str, sep, 0); 7493 } 7494 if (pos < 0) { 7495 failed: 7496 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str)); 7497 } 7498 if (regex) { 7499 sep = rb_str_subpat(str, sep, INT2FIX(0)); 7500 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed; 7501 } 7502 return rb_ary_new3(3, rb_str_subseq(str, 0, pos), 7503 sep, 7504 rb_str_subseq(str, pos+RSTRING_LEN(sep), 7505 RSTRING_LEN(str)-pos-RSTRING_LEN(sep))); 7506} 7507 7508/* 7509 * call-seq: 7510 * str.rpartition(sep) -> [head, sep, tail] 7511 * str.rpartition(regexp) -> [head, match, tail] 7512 * 7513 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end 7514 * of the string, and returns the part before it, the match, and the part 7515 * after it. 7516 * If it is not found, returns two empty strings and <i>str</i>. 
7517 * 7518 * "hello".rpartition("l") #=> ["hel", "l", "o"] 7519 * "hello".rpartition("x") #=> ["", "", "hello"] 7520 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"] 7521 */ 7522 7523static VALUE 7524rb_str_rpartition(VALUE str, VALUE sep) 7525{ 7526 long pos = RSTRING_LEN(str); 7527 int regex = FALSE; 7528 7529 if (RB_TYPE_P(sep, T_REGEXP)) { 7530 pos = rb_reg_search(sep, str, pos, 1); 7531 regex = TRUE; 7532 } 7533 else { 7534 VALUE tmp; 7535 7536 tmp = rb_check_string_type(sep); 7537 if (NIL_P(tmp)) { 7538 rb_raise(rb_eTypeError, "type mismatch: %s given", 7539 rb_obj_classname(sep)); 7540 } 7541 sep = tmp; 7542 pos = rb_str_sublen(str, pos); 7543 pos = rb_str_rindex(str, sep, pos); 7544 } 7545 if (pos < 0) { 7546 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str); 7547 } 7548 if (regex) { 7549 sep = rb_reg_nth_match(0, rb_backref_get()); 7550 } 7551 return rb_ary_new3(3, rb_str_substr(str, 0, pos), 7552 sep, 7553 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str))); 7554} 7555 7556/* 7557 * call-seq: 7558 * str.start_with?([prefixes]+) -> true or false 7559 * 7560 * Returns true if +str+ starts with one of the +prefixes+ given. 7561 * 7562 * "hello".start_with?("hell") #=> true 7563 * 7564 * # returns true if one of the prefixes matches. 7565 * "hello".start_with?("heaven", "hell") #=> true 7566 * "hello".start_with?("heaven", "paradise") #=> false 7567 */ 7568 7569static VALUE 7570rb_str_start_with(int argc, VALUE *argv, VALUE str) 7571{ 7572 int i; 7573 7574 for (i=0; i<argc; i++) { 7575 VALUE tmp = argv[i]; 7576 StringValue(tmp); 7577 rb_enc_check(str, tmp); 7578 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 7579 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 7580 return Qtrue; 7581 } 7582 return Qfalse; 7583} 7584 7585/* 7586 * call-seq: 7587 * str.end_with?([suffixes]+) -> true or false 7588 * 7589 * Returns true if +str+ ends with one of the +suffixes+ given. 
7590 */ 7591 7592static VALUE 7593rb_str_end_with(int argc, VALUE *argv, VALUE str) 7594{ 7595 int i; 7596 char *p, *s, *e; 7597 rb_encoding *enc; 7598 7599 for (i=0; i<argc; i++) { 7600 VALUE tmp = argv[i]; 7601 StringValue(tmp); 7602 enc = rb_enc_check(str, tmp); 7603 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 7604 p = RSTRING_PTR(str); 7605 e = p + RSTRING_LEN(str); 7606 s = e - RSTRING_LEN(tmp); 7607 if (rb_enc_left_char_head(p, s, e, enc) != s) 7608 continue; 7609 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 7610 return Qtrue; 7611 } 7612 return Qfalse; 7613} 7614 7615void 7616rb_str_setter(VALUE val, ID id, VALUE *var) 7617{ 7618 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) { 7619 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id)); 7620 } 7621 *var = val; 7622} 7623 7624 7625/* 7626 * call-seq: 7627 * str.force_encoding(encoding) -> str 7628 * 7629 * Changes the encoding to +encoding+ and returns self. 7630 */ 7631 7632static VALUE 7633rb_str_force_encoding(VALUE str, VALUE enc) 7634{ 7635 str_modifiable(str); 7636 rb_enc_associate(str, rb_to_encoding(enc)); 7637 ENC_CODERANGE_CLEAR(str); 7638 return str; 7639} 7640 7641/* 7642 * call-seq: 7643 * str.b -> str 7644 * 7645 * Returns a copied string whose encoding is ASCII-8BIT. 7646 */ 7647 7648static VALUE 7649rb_str_b(VALUE str) 7650{ 7651 VALUE str2 = str_alloc(rb_cString); 7652 str_replace_shared_without_enc(str2, str); 7653 OBJ_INFECT(str2, str); 7654 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID); 7655 return str2; 7656} 7657 7658/* 7659 * call-seq: 7660 * str.valid_encoding? -> true or false 7661 * 7662 * Returns true for a string which encoded correctly. 7663 * 7664 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true 7665 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false 7666 * "\x80".force_encoding("UTF-8").valid_encoding? 
#=> false 7667 */ 7668 7669static VALUE 7670rb_str_valid_encoding_p(VALUE str) 7671{ 7672 int cr = rb_enc_str_coderange(str); 7673 7674 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue; 7675} 7676 7677/* 7678 * call-seq: 7679 * str.ascii_only? -> true or false 7680 * 7681 * Returns true for a string which has only ASCII characters. 7682 * 7683 * "abc".force_encoding("UTF-8").ascii_only? #=> true 7684 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false 7685 */ 7686 7687static VALUE 7688rb_str_is_ascii_only_p(VALUE str) 7689{ 7690 int cr = rb_enc_str_coderange(str); 7691 7692 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse; 7693} 7694 7695/** 7696 * Shortens _str_ and adds three dots, an ellipsis, if it is longer 7697 * than _len_ characters. 7698 * 7699 * \param str the string to ellipsize. 7700 * \param len the maximum string length. 7701 * \return the ellipsized string. 7702 * \pre _len_ must not be negative. 7703 * \post the length of the returned string in characters is less than or equal to _len_. 7704 * \post If the length of _str_ is less than or equal _len_, returns _str_ itself. 7705 * \post the encoded of returned string is equal to the encoded of _str_. 7706 * \post the class of returned string is equal to the class of _str_. 7707 * \note the length is counted in characters. 
7708 */ 7709VALUE 7710rb_str_ellipsize(VALUE str, long len) 7711{ 7712 static const char ellipsis[] = "..."; 7713 const long ellipsislen = sizeof(ellipsis) - 1; 7714 rb_encoding *const enc = rb_enc_get(str); 7715 const long blen = RSTRING_LEN(str); 7716 const char *const p = RSTRING_PTR(str), *e = p + blen; 7717 VALUE estr, ret = 0; 7718 7719 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 7720 if (len * rb_enc_mbminlen(enc) >= blen || 7721 (e = rb_enc_nth(p, e, len, enc)) - p == blen) { 7722 ret = str; 7723 } 7724 else if (len <= ellipsislen || 7725 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) { 7726 if (rb_enc_asciicompat(enc)) { 7727 ret = rb_str_new_with_class(str, ellipsis, len); 7728 rb_enc_associate(ret, enc); 7729 } 7730 else { 7731 estr = rb_usascii_str_new(ellipsis, len); 7732 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil); 7733 } 7734 } 7735 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) { 7736 rb_str_cat(ret, ellipsis, ellipsislen); 7737 } 7738 else { 7739 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen), 7740 rb_enc_from_encoding(enc), 0, Qnil); 7741 rb_str_append(ret, estr); 7742 } 7743 return ret; 7744} 7745 7746/********************************************************************** 7747 * Document-class: Symbol 7748 * 7749 * <code>Symbol</code> objects represent names and some strings 7750 * inside the Ruby 7751 * interpreter. They are generated using the <code>:name</code> and 7752 * <code>:"string"</code> literals 7753 * syntax, and by the various <code>to_sym</code> methods. The same 7754 * <code>Symbol</code> object will be created for a given name or string 7755 * for the duration of a program's execution, regardless of the context 7756 * or meaning of that name. 
Thus if <code>Fred</code> is a constant in 7757 * one context, a method in another, and a class in a third, the 7758 * <code>Symbol</code> <code>:Fred</code> will be the same object in 7759 * all three contexts. 7760 * 7761 * module One 7762 * class Fred 7763 * end 7764 * $f1 = :Fred 7765 * end 7766 * module Two 7767 * Fred = 1 7768 * $f2 = :Fred 7769 * end 7770 * def Fred() 7771 * end 7772 * $f3 = :Fred 7773 * $f1.object_id #=> 2514190 7774 * $f2.object_id #=> 2514190 7775 * $f3.object_id #=> 2514190 7776 * 7777 */ 7778 7779 7780/* 7781 * call-seq: 7782 * sym == obj -> true or false 7783 * 7784 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same 7785 * symbol, returns <code>true</code>. 7786 */ 7787 7788static VALUE 7789sym_equal(VALUE sym1, VALUE sym2) 7790{ 7791 if (sym1 == sym2) return Qtrue; 7792 return Qfalse; 7793} 7794 7795 7796static int 7797sym_printable(const char *s, const char *send, rb_encoding *enc) 7798{ 7799 while (s < send) { 7800 int n; 7801 int c = rb_enc_codepoint_len(s, send, &n, enc); 7802 7803 if (!rb_enc_isprint(c, enc)) return FALSE; 7804 s += n; 7805 } 7806 return TRUE; 7807} 7808 7809int 7810rb_str_symname_p(VALUE sym) 7811{ 7812 rb_encoding *enc; 7813 const char *ptr; 7814 long len; 7815 rb_encoding *resenc = rb_default_internal_encoding(); 7816 7817 if (resenc == NULL) resenc = rb_default_external_encoding(); 7818 enc = STR_ENC_GET(sym); 7819 ptr = RSTRING_PTR(sym); 7820 len = RSTRING_LEN(sym); 7821 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) || 7822 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) { 7823 return FALSE; 7824 } 7825 return TRUE; 7826} 7827 7828VALUE 7829rb_str_quote_unprintable(VALUE str) 7830{ 7831 rb_encoding *enc; 7832 const char *ptr; 7833 long len; 7834 rb_encoding *resenc; 7835 7836 Check_Type(str, T_STRING); 7837 resenc = rb_default_internal_encoding(); 7838 if (resenc == NULL) resenc = rb_default_external_encoding(); 7839 enc = STR_ENC_GET(str); 
7840 ptr = RSTRING_PTR(str); 7841 len = RSTRING_LEN(str); 7842 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) || 7843 !sym_printable(ptr, ptr + len, enc)) { 7844 return rb_str_inspect(str); 7845 } 7846 return str; 7847} 7848 7849VALUE 7850rb_id_quote_unprintable(ID id) 7851{ 7852 return rb_str_quote_unprintable(rb_id2str(id)); 7853} 7854 7855/* 7856 * call-seq: 7857 * sym.inspect -> string 7858 * 7859 * Returns the representation of <i>sym</i> as a symbol literal. 7860 * 7861 * :fred.inspect #=> ":fred" 7862 */ 7863 7864static VALUE 7865sym_inspect(VALUE sym) 7866{ 7867 VALUE str; 7868 const char *ptr; 7869 long len; 7870 ID id = SYM2ID(sym); 7871 char *dest; 7872 7873 sym = rb_id2str(id); 7874 if (!rb_str_symname_p(sym)) { 7875 str = rb_str_inspect(sym); 7876 len = RSTRING_LEN(str); 7877 rb_str_resize(str, len + 1); 7878 dest = RSTRING_PTR(str); 7879 memmove(dest + 1, dest, len); 7880 dest[0] = ':'; 7881 } 7882 else { 7883 rb_encoding *enc = STR_ENC_GET(sym); 7884 ptr = RSTRING_PTR(sym); 7885 len = RSTRING_LEN(sym); 7886 str = rb_enc_str_new(0, len + 1, enc); 7887 dest = RSTRING_PTR(str); 7888 dest[0] = ':'; 7889 memcpy(dest + 1, ptr, len); 7890 } 7891 return str; 7892} 7893 7894 7895/* 7896 * call-seq: 7897 * sym.id2name -> string 7898 * sym.to_s -> string 7899 * 7900 * Returns the name or string corresponding to <i>sym</i>. 7901 * 7902 * :fred.id2name #=> "fred" 7903 */ 7904 7905 7906VALUE 7907rb_sym_to_s(VALUE sym) 7908{ 7909 ID id = SYM2ID(sym); 7910 7911 return str_new3(rb_cString, rb_id2str(id)); 7912} 7913 7914 7915/* 7916 * call-seq: 7917 * sym.to_sym -> sym 7918 * sym.intern -> sym 7919 * 7920 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding 7921 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned 7922 * in this case. 
7923 */ 7924 7925static VALUE 7926sym_to_sym(VALUE sym) 7927{ 7928 return sym; 7929} 7930 7931static VALUE 7932sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc) 7933{ 7934 VALUE obj; 7935 7936 if (argc < 1) { 7937 rb_raise(rb_eArgError, "no receiver given"); 7938 } 7939 obj = argv[0]; 7940 return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc); 7941} 7942 7943/* 7944 * call-seq: 7945 * sym.to_proc 7946 * 7947 * Returns a _Proc_ object which respond to the given method by _sym_. 7948 * 7949 * (1..3).collect(&:to_s) #=> ["1", "2", "3"] 7950 */ 7951 7952static VALUE 7953sym_to_proc(VALUE sym) 7954{ 7955 static VALUE sym_proc_cache = Qfalse; 7956 enum {SYM_PROC_CACHE_SIZE = 67}; 7957 VALUE proc; 7958 long id, index; 7959 VALUE *aryp; 7960 7961 if (!sym_proc_cache) { 7962 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2); 7963 rb_gc_register_mark_object(sym_proc_cache); 7964 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil); 7965 } 7966 7967 id = SYM2ID(sym); 7968 index = (id % SYM_PROC_CACHE_SIZE) << 1; 7969 7970 aryp = RARRAY_PTR(sym_proc_cache); 7971 if (aryp[index] == sym) { 7972 return aryp[index + 1]; 7973 } 7974 else { 7975 proc = rb_proc_new(sym_call, (VALUE)id); 7976 aryp[index] = sym; 7977 aryp[index + 1] = proc; 7978 return proc; 7979 } 7980} 7981 7982/* 7983 * call-seq: 7984 * 7985 * sym.succ 7986 * 7987 * Same as <code>sym.to_s.succ.intern</code>. 7988 */ 7989 7990static VALUE 7991sym_succ(VALUE sym) 7992{ 7993 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym))); 7994} 7995 7996/* 7997 * call-seq: 7998 * 7999 * symbol <=> other_symbol -> -1, 0, +1 or nil 8000 * 8001 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the 8002 * symbols. Returns -1, 0, +1 or nil depending on whether +symbol+ is less 8003 * than, equal to, or greater than +other_symbol+. 8004 * 8005 * +nil+ is returned if the two values are incomparable. 8006 * 8007 * See String#<=> for more information. 
8008 */ 8009 8010static VALUE 8011sym_cmp(VALUE sym, VALUE other) 8012{ 8013 if (!SYMBOL_P(other)) { 8014 return Qnil; 8015 } 8016 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other)); 8017} 8018 8019/* 8020 * call-seq: 8021 * 8022 * sym.casecmp(other) -> -1, 0, +1 or nil 8023 * 8024 * Case-insensitive version of <code>Symbol#<=></code>. 8025 */ 8026 8027static VALUE 8028sym_casecmp(VALUE sym, VALUE other) 8029{ 8030 if (!SYMBOL_P(other)) { 8031 return Qnil; 8032 } 8033 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other)); 8034} 8035 8036/* 8037 * call-seq: 8038 * sym =~ obj -> fixnum or nil 8039 * 8040 * Returns <code>sym.to_s =~ obj</code>. 8041 */ 8042 8043static VALUE 8044sym_match(VALUE sym, VALUE other) 8045{ 8046 return rb_str_match(rb_sym_to_s(sym), other); 8047} 8048 8049/* 8050 * call-seq: 8051 * sym[idx] -> char 8052 * sym[b, n] -> char 8053 * 8054 * Returns <code>sym.to_s[]</code>. 8055 */ 8056 8057static VALUE 8058sym_aref(int argc, VALUE *argv, VALUE sym) 8059{ 8060 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym)); 8061} 8062 8063/* 8064 * call-seq: 8065 * sym.length -> integer 8066 * 8067 * Same as <code>sym.to_s.length</code>. 8068 */ 8069 8070static VALUE 8071sym_length(VALUE sym) 8072{ 8073 return rb_str_length(rb_id2str(SYM2ID(sym))); 8074} 8075 8076/* 8077 * call-seq: 8078 * sym.empty? -> true or false 8079 * 8080 * Returns that _sym_ is :"" or not. 8081 */ 8082 8083static VALUE 8084sym_empty(VALUE sym) 8085{ 8086 return rb_str_empty(rb_id2str(SYM2ID(sym))); 8087} 8088 8089/* 8090 * call-seq: 8091 * sym.upcase -> symbol 8092 * 8093 * Same as <code>sym.to_s.upcase.intern</code>. 8094 */ 8095 8096static VALUE 8097sym_upcase(VALUE sym) 8098{ 8099 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym)))); 8100} 8101 8102/* 8103 * call-seq: 8104 * sym.downcase -> symbol 8105 * 8106 * Same as <code>sym.to_s.downcase.intern</code>. 
8107 */ 8108 8109static VALUE 8110sym_downcase(VALUE sym) 8111{ 8112 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym)))); 8113} 8114 8115/* 8116 * call-seq: 8117 * sym.capitalize -> symbol 8118 * 8119 * Same as <code>sym.to_s.capitalize.intern</code>. 8120 */ 8121 8122static VALUE 8123sym_capitalize(VALUE sym) 8124{ 8125 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym)))); 8126} 8127 8128/* 8129 * call-seq: 8130 * sym.swapcase -> symbol 8131 * 8132 * Same as <code>sym.to_s.swapcase.intern</code>. 8133 */ 8134 8135static VALUE 8136sym_swapcase(VALUE sym) 8137{ 8138 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym)))); 8139} 8140 8141/* 8142 * call-seq: 8143 * sym.encoding -> encoding 8144 * 8145 * Returns the Encoding object that represents the encoding of _sym_. 8146 */ 8147 8148static VALUE 8149sym_encoding(VALUE sym) 8150{ 8151 return rb_obj_encoding(rb_id2str(SYM2ID(sym))); 8152} 8153 8154ID 8155rb_to_id(VALUE name) 8156{ 8157 VALUE tmp; 8158 8159 switch (TYPE(name)) { 8160 default: 8161 tmp = rb_check_string_type(name); 8162 if (NIL_P(tmp)) { 8163 tmp = rb_inspect(name); 8164 rb_raise(rb_eTypeError, "%s is not a symbol", 8165 RSTRING_PTR(tmp)); 8166 } 8167 name = tmp; 8168 /* fall through */ 8169 case T_STRING: 8170 name = rb_str_intern(name); 8171 /* fall through */ 8172 case T_SYMBOL: 8173 return SYM2ID(name); 8174 } 8175 8176 UNREACHABLE; 8177} 8178 8179/* 8180 * A <code>String</code> object holds and manipulates an arbitrary sequence of 8181 * bytes, typically representing characters. String objects may be created 8182 * using <code>String::new</code> or as literals. 8183 * 8184 * Because of aliasing issues, users of strings should be aware of the methods 8185 * that modify the contents of a <code>String</code> object. Typically, 8186 * methods with names ending in ``!'' modify their receiver, while those 8187 * without a ``!'' return a new <code>String</code>. 
*  However, there are
 *  exceptions, such as <code>String#[]=</code>.
 *
 */

/* Defines the String and Symbol classes, registers their methods and the
 * field-separator global variables. */
void
Init_String(void)
{
    /* Within this function rb_intern expands to rb_intern_const. */
#undef rb_intern
#define rb_intern(str) rb_intern_const(str)

    /* String: class setup, comparison, indexing and basic queries */
    rb_cString  = rb_define_class("String", rb_cObject);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, empty_str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "===", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
    rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);

    /* String: conversions */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);

    /* String: case conversion */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);

    /* String: case conversion, "!" variants */
    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);

    /* String: number parsing, splitting, enumeration and concatenation */
    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_lines, -1);
    rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
    rb_define_method(rb_cString, "chars", rb_str_chars, 0);
    rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat, 1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0);
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    /* String: containment tests */
    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    /* String: padding */
    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    /* String: substitution and trimming */
    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);

    /* String: substitution and trimming, "!" variants */
    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);

    /* String: character-set operations */
    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    /* String: character-set operations, "!" variants */
    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    /* String: iterators */
    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* "slice" shares its implementation with "[]" */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    /* String: encoding */
    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "b", rb_str_b, 0);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    id_to_s = rb_intern("to_s");

    /* "$;" and "$-F" are two names for the same variable, rb_fs. */
    rb_fs = Qnil;
    rb_define_variable("$;", &rb_fs);
    rb_define_variable("$-F", &rb_fs);

    /* Symbol: no allocator and no "new" on the class. */
    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_include_module(rb_cSymbol, rb_mComparable);
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */

    /* Symbol: identity and conversions */
    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "===", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    /* Symbol: comparison and matching */
    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);

    /* Symbol: String-like queries; "match" shares sym_match with "=~" */
    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    rb_define_method(rb_cSymbol, "match", sym_match, 1);

    /* Symbol: case conversion */
    rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}