1/**********************************************************************
2
3  string.c -
4
5  $Author: nagachika $
6  created at: Mon Aug  9 17:12:58 JST 1993
7
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000  Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/ruby.h"
15#include "ruby/re.h"
16#include "ruby/encoding.h"
17#include "vm_core.h"
18#include "internal.h"
19#include "probes.h"
20#include <assert.h>
21
/* Shorthands for entry `no` of the `beg` / `end` arrays of a variable named
 * `regs`, which must be in scope wherever these macros are expanded. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
24
25#include <math.h>
26#include <ctype.h>
27
28#ifdef HAVE_UNISTD_H
29#include <unistd.h>
30#endif
31
/* Number of elements of the array `array`, cast to int.  Only meaningful for
 * a real array: applied to a pointer, sizeof measures the pointer itself. */
#define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
33
/* Remove any macro definitions of the following names so that this file can
 * define real functions (or RUBY_ALIAS_FUNCTION aliases) with these names
 * further down.  NOTE(review): presumably the macro versions come from the
 * public headers included above -- confirm. */
#undef rb_str_new_cstr
#undef rb_tainted_str_new_cstr
#undef rb_usascii_str_new_cstr
#undef rb_external_str_new_cstr
#undef rb_locale_str_new_cstr
#undef rb_str_new2
#undef rb_str_new3
#undef rb_str_new4
#undef rb_str_new5
#undef rb_tainted_str_new2
#undef rb_usascii_str_new2
#undef rb_str_dup_frozen
#undef rb_str_buf_new_cstr
#undef rb_str_buf_new2
#undef rb_str_buf_cat2
#undef rb_str_cat2
50
/* Forward declaration: rb_free_tmp_buffer() calls rb_str_clear() before the
 * point where it is defined in this file. */
static VALUE rb_str_clear(VALUE str);

/* Class objects used when allocating strings (rb_cString is passed to
 * str_new()/str_alloc() below).  They are assigned elsewhere. */
VALUE rb_cString;
VALUE rb_cSymbol;
55
/* NOTE(review): presumably an upper bound on the byte length of a single
 * character; it is not referenced in this part of the file -- confirm. */
#define RUBY_MAX_CHAR_LEN 16
/*
 * Per-string flag bits kept in RBASIC(str)->flags.
 *   STR_NOEMBED  set when the bytes live in as.heap rather than embedded
 *   STR_SHARED   same bit as ELTS_SHARED: the buffer belongs to aux.shared
 *   STR_ASSOC    aux.shared holds an associated value instead of a capacity
 *   STR_TMPLOCK  NOTE(review): presumably a temporary lock; unused here
 */
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
#define STR_ASSOC   FL_USER3
/* Both predicates require STR_NOEMBED in addition to their own bit. */
#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
#define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/* "No capacity": a heap string whose aux field is used for something other
 * than aux.capa (shared root or associated value). */
#define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* Clears the shared/assoc bits, but only on a heap (STR_NOEMBED) string. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
} while (0)
68
69
/* Switches `str` to heap representation: sets STR_NOEMBED and zeroes the
 * embedded-length bits in the flags word. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)
/* Switches `str` to embedded representation (flag change only; the caller is
 * responsible for the bytes and the length). */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
/* True when the bytes are stored inside the object itself. */
#define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/* Stores `n` as the embedded length, which lives in the flags word under
 * RSTRING_EMBED_LEN_MASK.  `n` is evaluated once; `str` twice. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
81
/* Sets the byte length of `str` in whichever place the current
 * representation keeps it: the flags word (embedded) or as.heap.len (heap). */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
	STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
	RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)
90
/* Decrements the byte length of `str` by one, for either representation.
 * No check is made that the length is non-zero beforehand. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
	long n = RSTRING_LEN(str);\
	n--;\
	STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
	RSTRING(str)->as.heap.len--;\
    }\
} while (0)
101
/*
 * Changes the buffer capacity of `str` to `capacity` bytes; one extra byte is
 * always allocated on top of `capacity`.
 *  - embedded string: nothing happens unless `capacity` exceeds
 *    RSTRING_EMBED_LEN_MAX; in that case the current bytes are copied into a
 *    newly allocated buffer and the string switches to heap representation
 *    (the length is stored in as.heap.len before STR_SET_NOEMBED clears the
 *    embedded length).
 *  - heap string: the buffer is REALLOC_N'ed in place; aux.capa is updated
 *    only when the string is not shared/associated (STR_NOCAPA_P), because
 *    aux is used for something else in those cases.
 * Both `str` and `capacity` are evaluated more than once.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
	if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
	    char *tmp = ALLOC_N(char, (capacity)+1);\
	    memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
	    RSTRING(str)->as.heap.ptr = tmp;\
	    RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
	    RSTRING(str)->as.heap.aux.capa = (capacity);\
	}\
    }\
    else {\
	REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
	if (!STR_NOCAPA_P(str))\
	    RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
119
/* Coderange predicates.  Both go through rb_enc_str_coderange(), so they scan
 * the string (and cache the result) when the coderange is still unknown. */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* The rb_encoding* recorded on `str`. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
124
125static inline int
126single_byte_optimizable(VALUE str)
127{
128    rb_encoding *enc;
129
130    /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
131    if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
132        return 1;
133
134    enc = STR_ENC_GET(str);
135    if (rb_enc_mbmaxlen(enc) == 1)
136        return 1;
137
138    /* Conservative.  Possibly single byte.
139     * "\xa1" in Shift_JIS for example. */
140    return 0;
141}
142
/* NOTE(review): presumably the default field separator used by String#split;
 * it is neither assigned nor read in this part of the file -- confirm. */
VALUE rb_fs;
144
/*
 * Returns a pointer to the first byte in [p, e) for which ISASCII() is false,
 * or NULL when there is no such byte.
 *
 * When SIZEOF_VALUE is 4 or 8 and the range is longer than two words, the
 * middle part is examined one VALUE-sized word at a time by testing each
 * word against NONASCII_MASK (0x80 repeated in every byte).
 *
 * NONASCII_MASK stays defined after this function; later code in the file
 * tests for it with #ifdef.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE *s, *t;
        const VALUE lowbits = sizeof(VALUE) - 1;
        /* s: p rounded up to the next word boundary */
        s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
        /* unaligned head, byte by byte */
        while (p < (const char *)s) {
            if (!ISASCII(*p))
                return p;
            p++;
        }
        /* t: e rounded down to a word boundary */
        t = (const VALUE*)(~lowbits & (VALUE)e);
        while (s < t) {
            if (*s & NONASCII_MASK) {
                /* this word holds the byte we look for; stop here and let
                 * the byte loop below pinpoint it */
                t = s;
                break;
            }
            s++;
        }
        p = (const char *)t;
    }
#endif
    /* remaining bytes (or the whole range when no word scan was done) */
    while (p < e) {
        if (!ISASCII(*p))
            return p;
        p++;
    }
    return NULL;
}
181
182static int
183coderange_scan(const char *p, long len, rb_encoding *enc)
184{
185    const char *e = p + len;
186
187    if (rb_enc_to_index(enc) == 0) {
188        /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
189        p = search_nonascii(p, e);
190        return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
191    }
192
193    if (rb_enc_asciicompat(enc)) {
194        p = search_nonascii(p, e);
195        if (!p) {
196            return ENC_CODERANGE_7BIT;
197        }
198        while (p < e) {
199            int ret = rb_enc_precise_mbclen(p, e, enc);
200            if (!MBCLEN_CHARFOUND_P(ret)) {
201                return ENC_CODERANGE_BROKEN;
202            }
203            p += MBCLEN_CHARFOUND_LEN(ret);
204            if (p < e) {
205                p = search_nonascii(p, e);
206                if (!p) {
207                    return ENC_CODERANGE_VALID;
208                }
209            }
210        }
211        if (e < p) {
212            return ENC_CODERANGE_BROKEN;
213        }
214        return ENC_CODERANGE_VALID;
215    }
216
217    while (p < e) {
218        int ret = rb_enc_precise_mbclen(p, e, enc);
219
220        if (!MBCLEN_CHARFOUND_P(ret)) {
221            return ENC_CODERANGE_BROKEN;
222        }
223        p += MBCLEN_CHARFOUND_LEN(ret);
224    }
225    if (e < p) {
226        return ENC_CODERANGE_BROKEN;
227    }
228    return ENC_CODERANGE_VALID;
229}
230
231long
232rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
233{
234    const char *p = s;
235
236    if (*cr == ENC_CODERANGE_BROKEN)
237	return e - s;
238
239    if (rb_enc_to_index(enc) == 0) {
240	/* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
241	p = search_nonascii(p, e);
242	*cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
243	return e - s;
244    }
245    else if (rb_enc_asciicompat(enc)) {
246	p = search_nonascii(p, e);
247	if (!p) {
248	    if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
249	    return e - s;
250	}
251	while (p < e) {
252	    int ret = rb_enc_precise_mbclen(p, e, enc);
253	    if (!MBCLEN_CHARFOUND_P(ret)) {
254		*cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
255		return p - s;
256	    }
257	    p += MBCLEN_CHARFOUND_LEN(ret);
258	    if (p < e) {
259		p = search_nonascii(p, e);
260		if (!p) {
261		    *cr = ENC_CODERANGE_VALID;
262		    return e - s;
263		}
264	    }
265	}
266	*cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
267	return p - s;
268    }
269    else {
270	while (p < e) {
271	    int ret = rb_enc_precise_mbclen(p, e, enc);
272	    if (!MBCLEN_CHARFOUND_P(ret)) {
273		*cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
274		return p - s;
275	    }
276	    p += MBCLEN_CHARFOUND_LEN(ret);
277	}
278	*cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
279	return p - s;
280    }
281}
282
283static inline void
284str_enc_copy(VALUE str1, VALUE str2)
285{
286    rb_enc_set_index(str1, ENCODING_GET(str2));
287}
288
289static void
290rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
291{
292    /* this function is designed for copying encoding and coderange
293     * from src to new string "dest" which is made from the part of src.
294     */
295    str_enc_copy(dest, src);
296    if (RSTRING_LEN(dest) == 0) {
297	if (!rb_enc_asciicompat(STR_ENC_GET(src)))
298	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
299	else
300	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
301	return;
302    }
303    switch (ENC_CODERANGE(src)) {
304      case ENC_CODERANGE_7BIT:
305	ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
306	break;
307      case ENC_CODERANGE_VALID:
308	if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
309	    search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
310	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
311	else
312	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
313	break;
314      default:
315	break;
316    }
317}
318
319static void
320rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
321{
322    str_enc_copy(dest, src);
323    ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
324}
325
326int
327rb_enc_str_coderange(VALUE str)
328{
329    int cr = ENC_CODERANGE(str);
330
331    if (cr == ENC_CODERANGE_UNKNOWN) {
332	rb_encoding *enc = STR_ENC_GET(str);
333        cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
334        ENC_CODERANGE_SET(str, cr);
335    }
336    return cr;
337}
338
339int
340rb_enc_str_asciionly_p(VALUE str)
341{
342    rb_encoding *enc = STR_ENC_GET(str);
343
344    if (!rb_enc_asciicompat(enc))
345        return FALSE;
346    else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
347        return TRUE;
348    return FALSE;
349}
350
351static inline void
352str_mod_check(VALUE s, const char *p, long len)
353{
354    if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
355	rb_raise(rb_eRuntimeError, "string modified");
356    }
357}
358
359size_t
360rb_str_capacity(VALUE str)
361{
362    if (STR_EMBED_P(str)) {
363	return RSTRING_EMBED_LEN_MAX;
364    }
365    else if (STR_NOCAPA_P(str)) {
366	return RSTRING(str)->as.heap.len;
367    }
368    else {
369	return RSTRING(str)->as.heap.aux.capa;
370    }
371}
372
373static inline VALUE
374str_alloc(VALUE klass)
375{
376    NEWOBJ_OF(str, struct RString, klass, T_STRING);
377
378    str->as.heap.ptr = 0;
379    str->as.heap.len = 0;
380    str->as.heap.aux.capa = 0;
381
382    return (VALUE)str;
383}
384
385static inline VALUE
386empty_str_alloc(VALUE klass)
387{
388    if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
389	RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
390    }
391    return str_alloc(klass);
392}
393
394static VALUE
395str_new(VALUE klass, const char *ptr, long len)
396{
397    VALUE str;
398
399    if (len < 0) {
400	rb_raise(rb_eArgError, "negative string size (or size too big)");
401    }
402
403    if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
404	RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
405    }
406
407    str = str_alloc(klass);
408    if (len > RSTRING_EMBED_LEN_MAX) {
409	RSTRING(str)->as.heap.aux.capa = len;
410	RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
411	STR_SET_NOEMBED(str);
412    }
413    else if (len == 0) {
414	ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
415    }
416    if (ptr) {
417	memcpy(RSTRING_PTR(str), ptr, len);
418    }
419    STR_SET_LEN(str, len);
420    RSTRING_PTR(str)[len] = '\0';
421    return str;
422}
423
424VALUE
425rb_str_new(const char *ptr, long len)
426{
427    return str_new(rb_cString, ptr, len);
428}
429
430VALUE
431rb_usascii_str_new(const char *ptr, long len)
432{
433    VALUE str = rb_str_new(ptr, len);
434    ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
435    return str;
436}
437
438VALUE
439rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
440{
441    VALUE str = rb_str_new(ptr, len);
442    rb_enc_associate(str, enc);
443    return str;
444}
445
446VALUE
447rb_str_new_cstr(const char *ptr)
448{
449    if (!ptr) {
450	rb_raise(rb_eArgError, "NULL pointer given");
451    }
452    return rb_str_new(ptr, strlen(ptr));
453}
454
/* Legacy name rb_str_new2: exported through RUBY_ALIAS_FUNCTION (a macro
 * defined outside this file; presumably it emits an alias or forwarding
 * function -- confirm).  The #define makes later uses inside this file call
 * rb_str_new_cstr directly. */
RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
#define rb_str_new2 rb_str_new_cstr
457
458VALUE
459rb_usascii_str_new_cstr(const char *ptr)
460{
461    VALUE str = rb_str_new2(ptr);
462    ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
463    return str;
464}
465
/* Legacy name rb_usascii_str_new2, exported via RUBY_ALIAS_FUNCTION (defined
 * outside this file); inside this file the name maps straight to
 * rb_usascii_str_new_cstr. */
RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
#define rb_usascii_str_new2 rb_usascii_str_new_cstr
468
469VALUE
470rb_tainted_str_new(const char *ptr, long len)
471{
472    VALUE str = rb_str_new(ptr, len);
473
474    OBJ_TAINT(str);
475    return str;
476}
477
478VALUE
479rb_tainted_str_new_cstr(const char *ptr)
480{
481    VALUE str = rb_str_new2(ptr);
482
483    OBJ_TAINT(str);
484    return str;
485}
486
/* Legacy name rb_tainted_str_new2, exported via RUBY_ALIAS_FUNCTION (defined
 * outside this file); inside this file the name maps straight to
 * rb_tainted_str_new_cstr. */
RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
#define rb_tainted_str_new2 rb_tainted_str_new_cstr
489
/*
 * Transcodes +str+ from encoding +from+ to encoding +to+ and returns the
 * result as a new string tagged with +to+.
 *
 * +str+ itself is returned, unconverted, when:
 *  - +to+ is NULL;
 *  - +from+ (which defaults to the encoding of +str+) is the same as +to+;
 *  - the converter cannot be opened;
 *  - the conversion ends in any state other than econv_finished.
 * When +to+ is ASCII-8BIT, or +to+ is ASCII-compatible and +str+ is already
 * marked 7-bit, no bytes are converted: +str+ is returned as is, or a
 * duplicate re-associated with +to+ if its current encoding differs.
 *
 * +ecflags+ and +ecopts+ are handed to rb_econv_open_opts() unchanged.
 */
VALUE
rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
{
    extern VALUE rb_cEncodingConverter;
    rb_econv_t *ec;
    rb_econv_result_t ret;
    long len, olen;
    VALUE econv_wrapper;
    VALUE newstr;
    const unsigned char *start, *sp;
    unsigned char *dest, *dp;
    size_t converted_output = 0;

    if (!to) return str;
    if (!from) from = rb_enc_get(str);
    if (from == to) return str;
    if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
	to == rb_ascii8bit_encoding()) {
	/* only the encoding tag needs to change, not the bytes */
	if (STR_ENC_GET(str) != to) {
	    str = rb_str_dup(str);
	    rb_enc_associate(str, to);
	}
	return str;
    }

    /* start with an output buffer as large as the input */
    len = RSTRING_LEN(str);
    newstr = rb_str_new(0, len);
    olen = len;

    /* The converter is stored in a wrapper object whose class pointer is
     * cleared.  NOTE(review): presumably so that the GC releases the
     * converter if an exception unwinds past this function -- confirm. */
    econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
    RBASIC(econv_wrapper)->klass = 0;
    ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
    if (!ec) return str;
    DATA_PTR(econv_wrapper) = ec;

    sp = (unsigned char*)RSTRING_PTR(str);
    start = sp;
    /* Convert, growing the output each time the converter reports a full
     * destination buffer.  The buffer pointer of newstr is fetched again on
     * every pass because newstr is resized inside the loop. */
    while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
	   (dp = dest + converted_output),
	   (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
	   ret == econv_destination_buffer_full) {
	/* destination buffer short */
	size_t converted_input = sp - start;
	size_t rest = len - converted_input;
	converted_output = dp - dest;
	rb_str_set_len(newstr, converted_output);
	/* estimate the output still needed from the ratio seen so far,
	 * unless that multiplication could overflow */
	if (converted_input && converted_output &&
	    rest < (LONG_MAX / converted_output)) {
	    rest = (rest * converted_output) / converted_input;
	}
	else {
	    rest = olen;
	}
	/* always grow by at least two bytes */
	olen += rest < 2 ? 2 : rest;
	rb_str_resize(newstr, olen);
    }
    /* detach the converter from the wrapper before closing it, then hand
     * the wrapper object back to the GC right away */
    DATA_PTR(econv_wrapper) = 0;
    rb_econv_close(ec);
    rb_gc_force_recycle(econv_wrapper);
    switch (ret) {
      case econv_finished:
	len = dp - (unsigned char*)RSTRING_PTR(newstr);
	rb_str_set_len(newstr, len);
	rb_enc_associate(newstr, to);
	return newstr;

      default:
	/* some error, return original */
	return str;
    }
}
561
562VALUE
563rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
564{
565    return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
566}
567
568VALUE
569rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
570{
571    VALUE str;
572
573    str = rb_tainted_str_new(ptr, len);
574    if (eenc == rb_usascii_encoding() &&
575	rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
576	rb_enc_associate(str, rb_ascii8bit_encoding());
577	return str;
578    }
579    rb_enc_associate(str, eenc);
580    return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
581}
582
583VALUE
584rb_external_str_new(const char *ptr, long len)
585{
586    return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
587}
588
589VALUE
590rb_external_str_new_cstr(const char *ptr)
591{
592    return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
593}
594
595VALUE
596rb_locale_str_new(const char *ptr, long len)
597{
598    return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
599}
600
601VALUE
602rb_locale_str_new_cstr(const char *ptr)
603{
604    return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
605}
606
607VALUE
608rb_filesystem_str_new(const char *ptr, long len)
609{
610    return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
611}
612
613VALUE
614rb_filesystem_str_new_cstr(const char *ptr)
615{
616    return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
617}
618
619VALUE
620rb_str_export(VALUE str)
621{
622    return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
623}
624
625VALUE
626rb_str_export_locale(VALUE str)
627{
628    return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
629}
630
631VALUE
632rb_str_export_to_enc(VALUE str, rb_encoding *enc)
633{
634    return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
635}
636
637static VALUE
638str_replace_shared_without_enc(VALUE str2, VALUE str)
639{
640    if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
641	STR_SET_EMBED(str2);
642	memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
643	STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
644    }
645    else {
646	str = rb_str_new_frozen(str);
647	FL_SET(str2, STR_NOEMBED);
648	RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
649	RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
650	RSTRING(str2)->as.heap.aux.shared = str;
651	FL_SET(str2, ELTS_SHARED);
652    }
653    return str2;
654}
655
656static VALUE
657str_replace_shared(VALUE str2, VALUE str)
658{
659    str_replace_shared_without_enc(str2, str);
660    rb_enc_cr_str_exact_copy(str2, str);
661    return str2;
662}
663
664static VALUE
665str_new_shared(VALUE klass, VALUE str)
666{
667    return str_replace_shared(str_alloc(klass), str);
668}
669
670static VALUE
671str_new3(VALUE klass, VALUE str)
672{
673    return str_new_shared(klass, str);
674}
675
676VALUE
677rb_str_new_shared(VALUE str)
678{
679    VALUE str2 = str_new3(rb_obj_class(str), str);
680
681    OBJ_INFECT(str2, str);
682    return str2;
683}
684
/* Legacy name rb_str_new3, exported via RUBY_ALIAS_FUNCTION (defined outside
 * this file); inside this file the name maps straight to rb_str_new_shared. */
RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
#define rb_str_new3 rb_str_new_shared
687
688static VALUE
689str_new4(VALUE klass, VALUE str)
690{
691    VALUE str2;
692
693    str2 = str_alloc(klass);
694    STR_SET_NOEMBED(str2);
695    RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
696    RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
697    if (STR_SHARED_P(str)) {
698	VALUE shared = RSTRING(str)->as.heap.aux.shared;
699	assert(OBJ_FROZEN(shared));
700	FL_SET(str2, ELTS_SHARED);
701	RSTRING(str2)->as.heap.aux.shared = shared;
702    }
703    else {
704	FL_SET(str, ELTS_SHARED);
705	RSTRING(str)->as.heap.aux.shared = str2;
706    }
707    rb_enc_cr_str_exact_copy(str2, str);
708    OBJ_INFECT(str2, str);
709    return str2;
710}
711
/*
 * Returns a frozen string with the contents of +orig+.  +orig+ itself is
 * returned when it is already frozen; otherwise the result depends on how
 * +orig+ stores its bytes:
 *
 *  - shared: the frozen root is returned directly when it has the same
 *    length, class, taint/untrust flags and encoding as +orig+; otherwise a
 *    new string made from the root is adjusted to cover +orig+'s bytes.
 *  - embedded: the bytes are copied into a new string.
 *  - associated (STR_ASSOC): the buffer is handed over through str_new4()
 *    and the associated value is moved onto the new string.
 *  - plain heap: str_new4() makes +orig+ share its buffer with the result.
 */
VALUE
rb_str_new_frozen(VALUE orig)
{
    VALUE klass, str;

    if (OBJ_FROZEN(orig)) return orig;
    klass = rb_obj_class(orig);
    if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
	long ofs;
	assert(OBJ_FROZEN(str));
	/* how much longer the root is than orig; the adjustment below treats
	 * orig as the trailing part of the root's buffer */
	ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
	if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
	    ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) ||
	    ENCODING_GET(str) != ENCODING_GET(orig)) {
	    /* the root cannot stand in for orig as it is */
	    /* NOTE(review): str_new3() yields an embedded copy when the root
	     * is no longer than RSTRING_EMBED_LEN_MAX, while the two heap
	     * field updates below assume a heap string -- confirm such short
	     * roots cannot occur here. */
	    str = str_new3(klass, str);
	    RSTRING(str)->as.heap.ptr += ofs;
	    RSTRING(str)->as.heap.len -= ofs;
	    rb_enc_cr_str_exact_copy(str, orig);
	    OBJ_INFECT(str, orig);
	}
    }
    else if (STR_EMBED_P(orig)) {
	str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
	rb_enc_cr_str_exact_copy(str, orig);
	OBJ_INFECT(str, orig);
    }
    else if (STR_ASSOC_P(orig)) {
	/* save the associated value: str_new4() overwrites orig's aux field
	 * when it makes orig a shared string */
	VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
	FL_UNSET(orig, STR_ASSOC);
	str = str_new4(klass, orig);
	FL_SET(str, STR_ASSOC);
	RSTRING(str)->as.heap.aux.shared = assoc;
    }
    else {
	str = str_new4(klass, orig);
    }
    OBJ_FREEZE(str);
    return str;
}
751
/* Legacy name rb_str_new4, exported via RUBY_ALIAS_FUNCTION (defined outside
 * this file); inside this file the name maps straight to rb_str_new_frozen. */
RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
#define rb_str_new4 rb_str_new_frozen
754
755VALUE
756rb_str_new_with_class(VALUE obj, const char *ptr, long len)
757{
758    return str_new(rb_obj_class(obj), ptr, len);
759}
760
/* Legacy name rb_str_new5, exported via RUBY_ALIAS_FUNCTION (defined outside
 * this file); inside this file the name maps straight to
 * rb_str_new_with_class. */
RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
	   rb_str_new_with_class, (obj, ptr, len))
#define rb_str_new5 rb_str_new_with_class
764
765static VALUE
766str_new_empty(VALUE str)
767{
768    VALUE v = rb_str_new5(str, 0, 0);
769    rb_enc_copy(v, str);
770    OBJ_INFECT(v, str);
771    return v;
772}
773
/* Smallest capacity rb_str_buf_new() ever allocates; smaller requests are
 * rounded up to this. */
#define STR_BUF_MIN_SIZE 128
775
776VALUE
777rb_str_buf_new(long capa)
778{
779    VALUE str = str_alloc(rb_cString);
780
781    if (capa < STR_BUF_MIN_SIZE) {
782	capa = STR_BUF_MIN_SIZE;
783    }
784    FL_SET(str, STR_NOEMBED);
785    RSTRING(str)->as.heap.aux.capa = capa;
786    RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
787    RSTRING(str)->as.heap.ptr[0] = '\0';
788
789    return str;
790}
791
792VALUE
793rb_str_buf_new_cstr(const char *ptr)
794{
795    VALUE str;
796    long len = strlen(ptr);
797
798    str = rb_str_buf_new(len);
799    rb_str_buf_cat(str, ptr, len);
800
801    return str;
802}
803
/* Legacy name rb_str_buf_new2, exported via RUBY_ALIAS_FUNCTION (defined
 * outside this file); inside this file the name maps straight to
 * rb_str_buf_new_cstr. */
RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
#define rb_str_buf_new2 rb_str_buf_new_cstr
806
807VALUE
808rb_str_tmp_new(long len)
809{
810    return str_new(0, 0, len);
811}
812
813void *
814rb_alloc_tmp_buffer(volatile VALUE *store, long len)
815{
816    VALUE s = rb_str_tmp_new(len);
817    *store = s;
818    return RSTRING_PTR(s);
819}
820
821void
822rb_free_tmp_buffer(volatile VALUE *store)
823{
824    VALUE s = *store;
825    *store = 0;
826    if (s) rb_str_clear(s);
827}
828
829void
830rb_str_free(VALUE str)
831{
832    if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
833	xfree(RSTRING(str)->as.heap.ptr);
834    }
835}
836
837RUBY_FUNC_EXPORTED size_t
838rb_str_memsize(VALUE str)
839{
840    if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
841	return RSTRING(str)->as.heap.aux.capa;
842    }
843    else {
844	return 0;
845    }
846}
847
848VALUE
849rb_str_to_str(VALUE str)
850{
851    return rb_convert_type(str, T_STRING, "String", "to_str");
852}
853
/* Forward declaration: rb_str_shared_replace() below calls str_discard()
 * before the point where it is defined in this file. */
static inline void str_discard(VALUE str);
855
/*
 * Replaces the contents of +str+ with those of +str2+, including taint,
 * encoding and coderange.  Nothing happens when both are the same object.
 *
 * Contents short enough to embed are copied and +str2+ is left untouched.
 * Longer contents are not copied: +str+ takes over the buffer pointer of
 * +str2+ (together with its shared/assoc state or its capacity) and +str2+
 * is reset to an empty embedded string.
 */
void
rb_str_shared_replace(VALUE str, VALUE str2)
{
    rb_encoding *enc;
    int cr;
    if (str == str2) return;
    /* sample encoding and coderange before str2 is modified below */
    enc = STR_ENC_GET(str2);
    cr = ENC_CODERANGE(str2);
    /* NOTE(review): str_discard() is defined elsewhere in this file;
     * presumably it releases str's current buffer -- confirm. */
    str_discard(str);
    OBJ_INFECT(str, str2);
    if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
	/* small enough: copy the bytes plus the byte after the end */
	STR_SET_EMBED(str);
	memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
	STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
	return;
    }
    STR_SET_NOEMBED(str);
    STR_UNSET_NOCAPA(str);
    RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
    RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
    if (STR_NOCAPA_P(str2)) {
	/* str2 is shared/associated: carry over those flag bits and the
	 * aux.shared value */
	FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
	RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
    }
    else {
	RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
    }
    STR_SET_EMBED(str2);	/* abandon str2 */
    RSTRING_PTR(str2)[0] = 0;
    STR_SET_EMBED_LEN(str2, 0);
    rb_enc_associate(str, enc);
    ENC_CODERANGE_SET(str, cr);
}
891
/* ID of the method that rb_obj_as_string() calls on non-string objects.
 * It is assigned elsewhere; NOTE(review): presumably the ID of "to_s". */
static ID id_to_s;
893
894VALUE
895rb_obj_as_string(VALUE obj)
896{
897    VALUE str;
898
899    if (RB_TYPE_P(obj, T_STRING)) {
900	return obj;
901    }
902    str = rb_funcall(obj, id_to_s, 0);
903    if (!RB_TYPE_P(str, T_STRING))
904	return rb_any_to_s(obj);
905    if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
906    return str;
907}
908
909static VALUE
910str_replace(VALUE str, VALUE str2)
911{
912    long len;
913
914    len = RSTRING_LEN(str2);
915    if (STR_ASSOC_P(str2)) {
916	str2 = rb_str_new4(str2);
917    }
918    if (STR_SHARED_P(str2)) {
919	VALUE shared = RSTRING(str2)->as.heap.aux.shared;
920	assert(OBJ_FROZEN(shared));
921	STR_SET_NOEMBED(str);
922	RSTRING(str)->as.heap.len = len;
923	RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
924	FL_SET(str, ELTS_SHARED);
925	FL_UNSET(str, STR_ASSOC);
926	RSTRING(str)->as.heap.aux.shared = shared;
927    }
928    else {
929	str_replace_shared(str, str2);
930    }
931
932    OBJ_INFECT(str, str2);
933    rb_enc_cr_str_exact_copy(str, str2);
934    return str;
935}
936
937static VALUE
938str_duplicate(VALUE klass, VALUE str)
939{
940    VALUE dup = str_alloc(klass);
941    str_replace(dup, str);
942    return dup;
943}
944
945VALUE
946rb_str_dup(VALUE str)
947{
948    return str_duplicate(rb_obj_class(str), str);
949}
950
951VALUE
952rb_str_resurrect(VALUE str)
953{
954    if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
955	RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
956				  rb_sourcefile(), rb_sourceline());
957    }
958    return str_replace(str_alloc(rb_cString), str);
959}
960
961/*
962 *  call-seq:
963 *     String.new(str="")   -> new_str
964 *
965 *  Returns a new string object containing a copy of <i>str</i>.
966 */
967
968static VALUE
969rb_str_init(int argc, VALUE *argv, VALUE str)
970{
971    VALUE orig;
972
973    if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
974	rb_str_replace(str, orig);
975    return str;
976}
977
978static inline long
979enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
980{
981    long c;
982    const char *q;
983
984    if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
985        return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
986    }
987    else if (rb_enc_asciicompat(enc)) {
988        c = 0;
989	if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
990	    while (p < e) {
991		if (ISASCII(*p)) {
992		    q = search_nonascii(p, e);
993		    if (!q)
994			return c + (e - p);
995		    c += q - p;
996		    p = q;
997		}
998		p += rb_enc_fast_mbclen(p, e, enc);
999		c++;
1000	    }
1001	}
1002	else {
1003	    while (p < e) {
1004		if (ISASCII(*p)) {
1005		    q = search_nonascii(p, e);
1006		    if (!q)
1007			return c + (e - p);
1008		    c += q - p;
1009		    p = q;
1010		}
1011		p += rb_enc_mbclen(p, e, enc);
1012		c++;
1013	    }
1014	}
1015        return c;
1016    }
1017
1018    for (c=0; p<e; c++) {
1019        p += rb_enc_mbclen(p, e, enc);
1020    }
1021    return c;
1022}
1023
1024long
1025rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1026{
1027    return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1028}
1029
1030long
1031rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1032{
1033    long c;
1034    const char *q;
1035    int ret;
1036
1037    *cr = 0;
1038    if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1039	return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
1040    }
1041    else if (rb_enc_asciicompat(enc)) {
1042	c = 0;
1043	while (p < e) {
1044	    if (ISASCII(*p)) {
1045		q = search_nonascii(p, e);
1046		if (!q) {
1047		    if (!*cr) *cr = ENC_CODERANGE_7BIT;
1048		    return c + (e - p);
1049		}
1050		c += q - p;
1051		p = q;
1052	    }
1053	    ret = rb_enc_precise_mbclen(p, e, enc);
1054	    if (MBCLEN_CHARFOUND_P(ret)) {
1055		*cr |= ENC_CODERANGE_VALID;
1056		p += MBCLEN_CHARFOUND_LEN(ret);
1057	    }
1058	    else {
1059		*cr = ENC_CODERANGE_BROKEN;
1060		p++;
1061	    }
1062	    c++;
1063	}
1064	if (!*cr) *cr = ENC_CODERANGE_7BIT;
1065	return c;
1066    }
1067
1068    for (c=0; p<e; c++) {
1069	ret = rb_enc_precise_mbclen(p, e, enc);
1070	if (MBCLEN_CHARFOUND_P(ret)) {
1071	    *cr |= ENC_CODERANGE_VALID;
1072	    p += MBCLEN_CHARFOUND_LEN(ret);
1073	}
1074	else {
1075	    *cr = ENC_CODERANGE_BROKEN;
1076            if (p + rb_enc_mbminlen(enc) <= e)
1077                p += rb_enc_mbminlen(enc);
1078            else
1079                p = e;
1080	}
1081    }
1082    if (!*cr) *cr = ENC_CODERANGE_7BIT;
1083    return c;
1084}
1085
#ifdef NONASCII_MASK
/* True for every byte that is not of the form 10xxxxxx, i.e. not a UTF-8
 * continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either a 0xxxxxxx or a 11xxxxxx
 * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudo code can detect a UTF-8 leading byte.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 tells whether it is a leading byte or not.
 *
 * This function applies the above logic to every byte of the word `*s'
 * at once and adds up the per-byte results.
 *
 * Returns the number of leading bytes found in the word.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE d = *s;

    /* Transform so that bit0 of each byte tells "UTF-8 leading or not". */
    d |= ~(d>>1);
    d >>= 6;
    d &= NONASCII_MASK >> 7;

    /* Add the per-byte results together into the lowest byte. */
    d += (d>>8);
    d += (d>>16);
#if SIZEOF_VALUE == 8
    d += (d>>32);
#endif
    /* a word has at most 8 bytes, so the total fits in the low 4 bits */
    return (d&0xF);
}
#endif
1120
/*
 * Returns the length of +str+ in characters, interpreting the bytes with
 * +enc+ (the string's own encoding when +enc+ is NULL).
 *
 * Three strategies are tried in order:
 *  1. if single_byte_optimizable(str), the byte length is the answer;
 *  2. when NONASCII_MASK is available, the coderange is VALID and +enc+ is
 *     UTF-8, the non-continuation bytes are counted, a word at a time for
 *     the aligned middle part;
 *  3. otherwise rb_enc_strlen_cr() counts the characters, and the coderange
 *     it reports (if non-zero) is stored on +str+.
 */
static long
str_strlen(VALUE str, rb_encoding *enc)
{
    const char *p, *e;
    long n;
    int cr;

    if (single_byte_optimizable(str)) return RSTRING_LEN(str);
    if (!enc) enc = STR_ENC_GET(str);
    p = RSTRING_PTR(str);
    e = RSTRING_END(str);
    cr = ENC_CODERANGE(str);
#ifdef NONASCII_MASK
    if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
        enc == rb_utf8_encoding()) {

	VALUE len = 0;
	if ((int)sizeof(VALUE) * 2 < e - p) {
	    const VALUE *s, *t;
	    const VALUE lowbits = sizeof(VALUE) - 1;
	    /* s: p rounded up, t: e rounded down to a word boundary */
	    s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
	    t = (const VALUE*)(~lowbits & (VALUE)e);
	    /* unaligned head, byte by byte */
	    while (p < (const char *)s) {
		if (is_utf8_lead_byte(*p)) len++;
		p++;
	    }
	    /* aligned middle, one word at a time */
	    while (s < t) {
		len += count_utf8_lead_bytes_with_word(s);
		s++;
	    }
	    p = (const char *)s;
	}
	/* remaining tail (or the whole string when it is short) */
	while (p < e) {
	    if (is_utf8_lead_byte(*p)) len++;
	    p++;
	}
	return (long)len;
    }
#endif
    n = rb_enc_strlen_cr(p, e, enc, &cr);
    if (cr) {
        /* remember what the scan learned about the string */
        ENC_CODERANGE_SET(str, cr);
    }
    return n;
}
1166
1167long
1168rb_str_strlen(VALUE str)
1169{
1170    return str_strlen(str, STR_ENC_GET(str));
1171}
1172
1173/*
1174 *  call-seq:
1175 *     str.length   -> integer
1176 *     str.size     -> integer
1177 *
1178 *  Returns the character length of <i>str</i>.
1179 */
1180
1181VALUE
1182rb_str_length(VALUE str)
1183{
1184    long len;
1185
1186    len = str_strlen(str, STR_ENC_GET(str));
1187    return LONG2NUM(len);
1188}
1189
1190/*
1191 *  call-seq:
1192 *     str.bytesize  -> integer
1193 *
1194 *  Returns the length of +str+ in bytes.
1195 *
1196 *    "\x80\u3042".bytesize  #=> 4
1197 *    "hello".bytesize       #=> 5
1198 */
1199
1200static VALUE
1201rb_str_bytesize(VALUE str)
1202{
1203    return LONG2NUM(RSTRING_LEN(str));
1204}
1205
1206/*
1207 *  call-seq:
1208 *     str.empty?   -> true or false
1209 *
1210 *  Returns <code>true</code> if <i>str</i> has a length of zero.
1211 *
1212 *     "hello".empty?   #=> false
1213 *     " ".empty?       #=> false
1214 *     "".empty?        #=> true
1215 */
1216
1217static VALUE
1218rb_str_empty(VALUE str)
1219{
1220    if (RSTRING_LEN(str) == 0)
1221	return Qtrue;
1222    return Qfalse;
1223}
1224
1225/*
1226 *  call-seq:
1227 *     str + other_str   -> new_str
1228 *
1229 *  Concatenation---Returns a new <code>String</code> containing
1230 *  <i>other_str</i> concatenated to <i>str</i>.
1231 *
1232 *     "Hello from " + self.to_s   #=> "Hello from main"
1233 */
1234
1235VALUE
1236rb_str_plus(VALUE str1, VALUE str2)
1237{
1238    VALUE str3;
1239    rb_encoding *enc;
1240
1241    StringValue(str2);
1242    enc = rb_enc_check(str1, str2);
1243    str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
1244    memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
1245    memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
1246	   RSTRING_PTR(str2), RSTRING_LEN(str2));
1247    RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
1248
1249    if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
1250	OBJ_TAINT(str3);
1251    ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
1252			   ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
1253    return str3;
1254}
1255
1256/*
1257 *  call-seq:
1258 *     str * integer   -> new_str
1259 *
1260 *  Copy --- Returns a new String containing +integer+ copies of the receiver.
1261 *  +integer+ must be greater than or equal to 0.
1262 *
1263 *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
1264 *     "Ho! " * 0   #=> ""
1265 */
1266
1267VALUE
1268rb_str_times(VALUE str, VALUE times)
1269{
1270    VALUE str2;
1271    long n, len;
1272    char *ptr2;
1273
1274    len = NUM2LONG(times);
1275    if (len < 0) {
1276	rb_raise(rb_eArgError, "negative argument");
1277    }
1278    if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
1279	rb_raise(rb_eArgError, "argument too big");
1280    }
1281
1282    str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
1283    ptr2 = RSTRING_PTR(str2);
1284    if (len) {
1285        n = RSTRING_LEN(str);
1286        memcpy(ptr2, RSTRING_PTR(str), n);
1287        while (n <= len/2) {
1288            memcpy(ptr2 + n, ptr2, n);
1289            n *= 2;
1290        }
1291        memcpy(ptr2 + n, ptr2, len-n);
1292    }
1293    ptr2[RSTRING_LEN(str2)] = '\0';
1294    OBJ_INFECT(str2, str);
1295    rb_enc_cr_str_copy_for_substr(str2, str);
1296
1297    return str2;
1298}
1299
1300/*
1301 *  call-seq:
1302 *     str % arg   -> new_str
1303 *
1304 *  Format---Uses <i>str</i> as a format specification, and returns the result
1305 *  of applying it to <i>arg</i>. If the format specification contains more than
1306 *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
1307 *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
1308 *  details of the format string.
1309 *
1310 *     "%05d" % 123                              #=> "00123"
1311 *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
1312 *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
1313 */
1314
1315static VALUE
1316rb_str_format_m(VALUE str, VALUE arg)
1317{
1318    volatile VALUE tmp = rb_check_array_type(arg);
1319
1320    if (!NIL_P(tmp)) {
1321	return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
1322    }
1323    return rb_str_format(1, &arg, str);
1324}
1325
1326static inline void
1327str_modifiable(VALUE str)
1328{
1329    if (FL_TEST(str, STR_TMPLOCK)) {
1330	rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1331    }
1332    rb_check_frozen(str);
1333    if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
1334	rb_raise(rb_eSecurityError, "Insecure: can't modify string");
1335}
1336
1337static inline int
1338str_independent(VALUE str)
1339{
1340    str_modifiable(str);
1341    if (!STR_SHARED_P(str)) return 1;
1342    if (STR_EMBED_P(str)) return 1;
1343    return 0;
1344}
1345
1346static void
1347str_make_independent_expand(VALUE str, long expand)
1348{
1349    char *ptr;
1350    long len = RSTRING_LEN(str);
1351    long capa = len + expand;
1352
1353    if (len > capa) len = capa;
1354    ptr = ALLOC_N(char, capa + 1);
1355    if (RSTRING_PTR(str)) {
1356	memcpy(ptr, RSTRING_PTR(str), len);
1357    }
1358    STR_SET_NOEMBED(str);
1359    STR_UNSET_NOCAPA(str);
1360    ptr[len] = 0;
1361    RSTRING(str)->as.heap.ptr = ptr;
1362    RSTRING(str)->as.heap.len = len;
1363    RSTRING(str)->as.heap.aux.capa = capa;
1364}
1365
1366#define str_make_independent(str) str_make_independent_expand((str), 0L)
1367
1368void
1369rb_str_modify(VALUE str)
1370{
1371    if (!str_independent(str))
1372	str_make_independent(str);
1373    ENC_CODERANGE_CLEAR(str);
1374}
1375
1376void
1377rb_str_modify_expand(VALUE str, long expand)
1378{
1379    if (expand < 0) {
1380	rb_raise(rb_eArgError, "negative expanding string size");
1381    }
1382    if (!str_independent(str)) {
1383	str_make_independent_expand(str, expand);
1384    }
1385    else if (expand > 0) {
1386	long len = RSTRING_LEN(str);
1387	long capa = len + expand;
1388	if (!STR_EMBED_P(str)) {
1389	    REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
1390	    STR_UNSET_NOCAPA(str);
1391	    RSTRING(str)->as.heap.aux.capa = capa;
1392	}
1393	else if (capa > RSTRING_EMBED_LEN_MAX) {
1394	    str_make_independent_expand(str, expand);
1395	}
1396    }
1397    ENC_CODERANGE_CLEAR(str);
1398}
1399
1400/* As rb_str_modify(), but don't clear coderange */
1401static void
1402str_modify_keep_cr(VALUE str)
1403{
1404    if (!str_independent(str))
1405	str_make_independent(str);
1406    if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
1407	/* Force re-scan later */
1408	ENC_CODERANGE_CLEAR(str);
1409}
1410
1411static inline void
1412str_discard(VALUE str)
1413{
1414    str_modifiable(str);
1415    if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
1416	xfree(RSTRING_PTR(str));
1417	RSTRING(str)->as.heap.ptr = 0;
1418	RSTRING(str)->as.heap.len = 0;
1419    }
1420}
1421
/*
 * Attaches the array add to str as its associated object.  If str already
 * has one, add is concatenated onto the existing array instead.  Raises via
 * rb_check_frozen() when str is frozen.
 */
void
rb_str_associate(VALUE str, VALUE add)
{
    /* sanity check */
    rb_check_frozen(str);
    if (STR_ASSOC_P(str)) {
	/* already associated */
	rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
    }
    else {
	if (STR_SHARED_P(str)) {
	    /* Remember the shared root before str gets its own buffer. */
	    VALUE assoc = RSTRING(str)->as.heap.aux.shared;
	    str_make_independent(str);
	    if (STR_ASSOC_P(assoc)) {
		/* The root carried an association: extend that array and
		 * attach it to str. */
		assoc = RSTRING(assoc)->as.heap.aux.shared;
		rb_ary_concat(assoc, add);
		add = assoc;
	    }
	}
	else if (STR_EMBED_P(str)) {
	    /* Move to the heap representation so that aux is available. */
	    str_make_independent(str);
	}
	else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
	    /* Resize capacity to exactly the length -- presumably because
	     * aux stops holding a capacity once it stores the association
	     * (cf. STR_NOCAPA); confirm. */
	    RESIZE_CAPA(str, RSTRING_LEN(str));
	}
	FL_SET(str, STR_ASSOC);
	/* NOTE(review): clearing klass presumably hides the array from Ruby
	 * code -- confirm. */
	RBASIC(add)->klass = 0;
	RSTRING(str)->as.heap.aux.shared = add;
    }
}
1452
1453VALUE
1454rb_str_associated(VALUE str)
1455{
1456    if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1457    if (STR_ASSOC_P(str)) {
1458	return RSTRING(str)->as.heap.aux.shared;
1459    }
1460    return Qfalse;
1461}
1462
1463void
1464rb_must_asciicompat(VALUE str)
1465{
1466    rb_encoding *enc = rb_enc_get(str);
1467    if (!rb_enc_asciicompat(enc)) {
1468	rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
1469    }
1470}
1471
1472VALUE
1473rb_string_value(volatile VALUE *ptr)
1474{
1475    VALUE s = *ptr;
1476    if (!RB_TYPE_P(s, T_STRING)) {
1477	s = rb_str_to_str(s);
1478	*ptr = s;
1479    }
1480    return s;
1481}
1482
1483char *
1484rb_string_value_ptr(volatile VALUE *ptr)
1485{
1486    VALUE str = rb_string_value(ptr);
1487    return RSTRING_PTR(str);
1488}
1489
1490char *
1491rb_string_value_cstr(volatile VALUE *ptr)
1492{
1493    VALUE str = rb_string_value(ptr);
1494    char *s = RSTRING_PTR(str);
1495    long len = RSTRING_LEN(str);
1496
1497    if (!s || memchr(s, 0, len)) {
1498	rb_raise(rb_eArgError, "string contains null byte");
1499    }
1500    if (s[len]) {
1501	rb_str_modify(str);
1502	s = RSTRING_PTR(str);
1503	s[RSTRING_LEN(str)] = 0;
1504    }
1505    return s;
1506}
1507
1508VALUE
1509rb_check_string_type(VALUE str)
1510{
1511    str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1512    return str;
1513}
1514
1515/*
1516 *  call-seq:
1517 *     String.try_convert(obj) -> string or nil
1518 *
 *  Tries to convert <i>obj</i> into a String using its to_str method.
 *  Returns the converted string, or nil if <i>obj</i> cannot be converted
 *  for any reason.
1522 *
1523 *     String.try_convert("str")     #=> "str"
1524 *     String.try_convert(/re/)      #=> nil
1525 */
1526static VALUE
1527rb_str_s_try_convert(VALUE dummy, VALUE str)
1528{
1529    return rb_check_string_type(str);
1530}
1531
/*
 * Moves forward from p by up to *nthp characters of encoding enc, never
 * returning a position past e, and returns the resulting position.  A count
 * is written back through nthp; how it relates to the characters actually
 * skipped depends on the branch taken (see the notes below).
 */
static char*
str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
{
    long nth = *nthp;
    if (rb_enc_mbmaxlen(enc) == 1) {
        /* Max character length of 1: plain byte arithmetic. */
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: multiply by the character size. */
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;

        while (p < e && 0 < nth) {
            /* nth characters need at least nth bytes; if fewer remain, stop
             * at e with the outstanding count. */
            e2 = p + nth;
            if (e < e2) {
                *nthp = nth;
                return (char *)e;
            }
            if (ISASCII(*p)) {
                /* Look for a non-ASCII byte within the next nth bytes. */
                p2 = search_nonascii(p, e2);
                if (!p2) {
                    /* None found: all nth bytes are consumed. */
		    nth -= e2 - p;
		    *nthp = nth;
                    return (char *)e2;
                }
                nth -= p2 - p;
                p = p2;
            }
            /* Step over one character. */
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        *nthp = nth;
        if (nth != 0) {
            return (char *)e;
        }
        return (char *)p;
    }
    else {
        /* NOTE(review): the post-decrement leaves nth at -1 when the count
         * runs out before e is reached. */
        while (p < e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    /* Reached by every branch except the ASCII-compatible one.  The two
     * fixed-width branches never decrement nth, so *nthp receives the
     * original count there. */
    if (p > e) p = e;
    *nthp = nth;
    return (char*)p;
}
1581
1582char*
1583rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
1584{
1585    return str_nth_len(p, e, &nth, enc);
1586}
1587
1588static char*
1589str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1590{
1591    if (singlebyte)
1592	p += nth;
1593    else {
1594	p = str_nth_len(p, e, &nth, enc);
1595    }
1596    if (!p) return 0;
1597    if (p > e) p = e;
1598    return (char *)p;
1599}
1600
1601/* char offset to byte offset */
1602static long
1603str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1604{
1605    const char *pp = str_nth(p, e, nth, enc, singlebyte);
1606    if (!pp) return e - p;
1607    return pp - p;
1608}
1609
1610long
1611rb_str_offset(VALUE str, long pos)
1612{
1613    return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
1614		      STR_ENC_GET(str), single_byte_optimizable(str));
1615}
1616
1617#ifdef NONASCII_MASK
/*
 * UTF-8 variant of the nth-character lookup: advances p towards e while
 * counting lead bytes and returns the position of the lead byte reached
 * when the count hits zero, or e if the input ends first.  The count still
 * outstanding is written back through nthp.
 * NOTE(review): callers in this file only use it when the code range is
 * VALID and the encoding is UTF-8 -- presumably a precondition; confirm.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    /* Word-at-a-time path, taken only when both the input and the count are
     * larger than two VALUE-sized words. */
    if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
	const VALUE *s, *t;
	const VALUE lowbits = sizeof(VALUE) - 1;
	/* s: p rounded up to a VALUE boundary; t: e rounded down to one. */
	s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
	t = (const VALUE*)(~lowbits & (VALUE)e);
	/* Unaligned head, one byte at a time. */
	while (p < (const char *)s) {
	    if (is_utf8_lead_byte(*p)) nth--;
	    p++;
	}
	/* Whole words, while at least sizeof(VALUE) characters are still
	 * wanted, so that one word cannot consume more than remain. */
	do {
	    nth -= count_utf8_lead_bytes_with_word(s);
	    s++;
	} while (s < t && (int)sizeof(VALUE) <= nth);
	p = (char *)s;
    }
    /* Byte-wise tail: stop on the lead byte seen once nth reaches zero. */
    while (p < e) {
	if (is_utf8_lead_byte(*p)) {
	    if (nth == 0) break;
	    nth--;
	}
	p++;
    }
    *nthp = nth;
    return (char *)p;
}
1647
1648static long
1649str_utf8_offset(const char *p, const char *e, long nth)
1650{
1651    const char *pp = str_utf8_nth(p, e, &nth);
1652    return pp - p;
1653}
1654#endif
1655
1656/* byte offset to char offset */
1657long
1658rb_str_sublen(VALUE str, long pos)
1659{
1660    if (single_byte_optimizable(str) || pos < 0)
1661        return pos;
1662    else {
1663	char *p = RSTRING_PTR(str);
1664        return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
1665    }
1666}
1667
1668VALUE
1669rb_str_subseq(VALUE str, long beg, long len)
1670{
1671    VALUE str2;
1672
1673    if (RSTRING_LEN(str) == beg + len &&
1674        RSTRING_EMBED_LEN_MAX < len) {
1675        str2 = rb_str_new_shared(rb_str_new_frozen(str));
1676        rb_str_drop_bytes(str2, beg);
1677    }
1678    else {
1679        str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1680	RB_GC_GUARD(str);
1681    }
1682
1683    rb_enc_cr_str_copy_for_substr(str2, str);
1684    OBJ_INFECT(str2, str);
1685
1686    return str2;
1687}
1688
/*
 * Resolves a character-based request (start index beg, character count
 * *lenp) on str to a byte position.  Returns a pointer into str's buffer and
 * stores the byte length of the selected range in *lenp, or returns 0 when
 * the request is out of range.  A negative beg counts from the end.
 */
static char *
rb_str_subpos(VALUE str, long beg, long *lenp)
{
    long len = *lenp;
    long slen = -1L;
    long blen = RSTRING_LEN(str);
    rb_encoding *enc = STR_ENC_GET(str);
    char *p, *s = RSTRING_PTR(str), *e = s + blen;

    if (len < 0) return 0;
    if (!blen) {
	len = 0;
    }
    /* Single-byte case: character indexes are byte indexes. */
    if (single_byte_optimizable(str)) {
	if (beg > blen) return 0;
	if (beg < 0) {
	    beg += blen;
	    if (beg < 0) return 0;
	}
	if (beg + len > blen)
	    len = blen - beg;
	if (len < 0) return 0;
	p = s + beg;
	goto end;
    }
    if (beg < 0) {
	/* No more than -beg characters can follow a start -beg from the end. */
	if (len > -beg) len = -beg;
	/* If the start is near the end (worst-case byte distance under 1/8 of
	 * the string), walk backwards instead of counting every character. */
	if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
	    beg = -beg;
	    /* Step e back to the end of the requested range... */
	    while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
	    p = e;
	    if (!p) return 0;
	    /* ...then step p back len characters to its start. */
	    while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
	    if (!p) return 0;
	    len = e - p;
	    goto end;
	}
	else {
	    /* Turn the negative index into a character index from the front. */
	    slen = str_strlen(str, enc);
	    beg += slen;
	    if (beg < 0) return 0;
	    /* NOTE(review): beg is a character index but is added as a byte
	     * offset; with len == 0 the pointer presumably only has to be
	     * non-null -- confirm against callers. */
	    p = s + beg;
	    if (len == 0) goto end;
	}
    }
    else if (beg > 0 && beg > RSTRING_LEN(str)) {
	/* More characters requested than there are bytes. */
	return 0;
    }
    if (len == 0) {
	if (beg > str_strlen(str, enc)) return 0;
	/* NOTE(review): same character-index-as-byte-offset pattern as above. */
	p = s + beg;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
        enc == rb_utf8_encoding()) {
        /* Valid UTF-8: use the lead-byte counting helpers. */
        p = str_utf8_nth(s, e, &beg);
        if (beg > 0) return 0;
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
	/* Fixed-width encoding: scale indexes by the character size. */
	int char_sz = rb_enc_mbmaxlen(enc);

	p = s + beg * char_sz;
	if (p > e) {
	    return 0;
	}
        else if (len * char_sz > e - p)
            len = e - p;
        else
	    len *= char_sz;
    }
    else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
	/* Reached the end: valid only if no characters were left to skip. */
	if (beg > 0) return 0;
	len = 0;
    }
    else {
	len = str_offset(p, e, len, enc, 0);
    }
  end:
    *lenp = len;
    RB_GC_GUARD(str);
    return p;
}
1773
1774VALUE
1775rb_str_substr(VALUE str, long beg, long len)
1776{
1777    VALUE str2;
1778    char *p = rb_str_subpos(str, beg, &len);
1779
1780    if (!p) return Qnil;
1781    if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
1782	str2 = rb_str_new4(str);
1783	str2 = str_new3(rb_obj_class(str2), str2);
1784	RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1785	RSTRING(str2)->as.heap.len = len;
1786    }
1787    else {
1788	str2 = rb_str_new5(str, p, len);
1789	rb_enc_cr_str_copy_for_substr(str2, str);
1790	OBJ_INFECT(str2, str);
1791	RB_GC_GUARD(str);
1792    }
1793
1794    return str2;
1795}
1796
1797VALUE
1798rb_str_freeze(VALUE str)
1799{
1800    if (STR_ASSOC_P(str)) {
1801	VALUE ary = RSTRING(str)->as.heap.aux.shared;
1802	OBJ_FREEZE(ary);
1803    }
1804    return rb_obj_freeze(str);
1805}
1806
/*
 * Provide rb_str_dup_frozen(str) in terms of rb_str_new_frozen(str) through
 * RUBY_ALIAS_FUNCTION (presumably an alias or forwarding wrapper -- confirm
 * against its definition), then make later uses of the name in this file
 * resolve directly to rb_str_new_frozen.
 */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
1809
1810VALUE
1811rb_str_locktmp(VALUE str)
1812{
1813    if (FL_TEST(str, STR_TMPLOCK)) {
1814	rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1815    }
1816    FL_SET(str, STR_TMPLOCK);
1817    return str;
1818}
1819
1820VALUE
1821rb_str_unlocktmp(VALUE str)
1822{
1823    if (!FL_TEST(str, STR_TMPLOCK)) {
1824	rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1825    }
1826    FL_UNSET(str, STR_TMPLOCK);
1827    return str;
1828}
1829
1830VALUE
1831rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
1832{
1833    rb_str_locktmp(str);
1834    return rb_ensure(func, arg, rb_str_unlocktmp, str);
1835}
1836
1837void
1838rb_str_set_len(VALUE str, long len)
1839{
1840    long capa;
1841
1842    str_modifiable(str);
1843    if (STR_SHARED_P(str)) {
1844	rb_raise(rb_eRuntimeError, "can't set length of shared string");
1845    }
1846    if (len > (capa = (long)rb_str_capacity(str))) {
1847	rb_bug("probable buffer overflow: %ld for %ld", len, capa);
1848    }
1849    STR_SET_LEN(str, len);
1850    RSTRING_PTR(str)[len] = '\0';
1851}
1852
/*
 * Changes the byte length of str to len and returns str.  Depending on len
 * the string may switch between the embedded and heap representations.
 * Newly exposed bytes are not initialized here.  The code range is always
 * cleared.  Raises ArgumentError when len is negative, and whatever
 * str_independent() raises for an unmodifiable string.
 */
VALUE
rb_str_resize(VALUE str, long len)
{
    long slen;
    int independent;

    if (len < 0) {
	rb_raise(rb_eArgError, "negative string size (or size too big)");
    }

    independent = str_independent(str);
    ENC_CODERANGE_CLEAR(str);
    slen = RSTRING_LEN(str);
    if (len != slen) {
	if (STR_EMBED_P(str)) {
	    /* Embedded and still fits: adjust the length in place. */
	    if (len <= RSTRING_EMBED_LEN_MAX) {
		STR_SET_EMBED_LEN(str, len);
		RSTRING(str)->as.ary[len] = '\0';
		return str;
	    }
	    /* Outgrew the embedded buffer: move to the heap. */
	    str_make_independent_expand(str, len - slen);
	    STR_SET_NOEMBED(str);
	}
	else if (len <= RSTRING_EMBED_LEN_MAX) {
	    /* Heap string that now fits embedded: copy the kept bytes in.
	     * The heap pointer is saved first, before as.ary is written. */
	    char *ptr = RSTRING(str)->as.heap.ptr;
	    STR_SET_EMBED(str);
	    if (slen > len) slen = len;
	    if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
	    RSTRING(str)->as.ary[len] = '\0';
	    STR_SET_EMBED_LEN(str, len);
	    /* Free the old buffer only if str was independent. */
	    if (independent) xfree(ptr);
	    return str;
	}
	else if (!independent) {
	    /* Not independent: take a private buffer of the new size. */
	    str_make_independent_expand(str, len - slen);
	}
	else if (slen < len || slen - len > 1024) {
	    /* Reallocate when growing, or when shrinking by more than 1024
	     * bytes; smaller shrinks keep the existing allocation. */
	    REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
	}
	if (!STR_NOCAPA_P(str)) {
	    RSTRING(str)->as.heap.aux.capa = len;
	}
	RSTRING(str)->as.heap.len = len;
	RSTRING(str)->as.heap.ptr[len] = '\0';	/* sentinel */
    }
    return str;
}
1900
1901static VALUE
1902str_buf_cat(VALUE str, const char *ptr, long len)
1903{
1904    long capa, total, off = -1;
1905
1906    if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1907        off = ptr - RSTRING_PTR(str);
1908    }
1909    rb_str_modify(str);
1910    if (len == 0) return 0;
1911    if (STR_ASSOC_P(str)) {
1912	FL_UNSET(str, STR_ASSOC);
1913	capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1914    }
1915    else if (STR_EMBED_P(str)) {
1916	capa = RSTRING_EMBED_LEN_MAX;
1917    }
1918    else {
1919	capa = RSTRING(str)->as.heap.aux.capa;
1920    }
1921    if (RSTRING_LEN(str) >= LONG_MAX - len) {
1922	rb_raise(rb_eArgError, "string sizes too big");
1923    }
1924    total = RSTRING_LEN(str)+len;
1925    if (capa <= total) {
1926	while (total > capa) {
1927	    if (capa + 1 >= LONG_MAX / 2) {
1928		capa = (total + 4095) / 4096;
1929		break;
1930	    }
1931	    capa = (capa + 1) * 2;
1932	}
1933	RESIZE_CAPA(str, capa);
1934    }
1935    if (off != -1) {
1936        ptr = RSTRING_PTR(str) + off;
1937    }
1938    memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1939    STR_SET_LEN(str, total);
1940    RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1941
1942    return str;
1943}
1944
1945#define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
1946
1947VALUE
1948rb_str_buf_cat(VALUE str, const char *ptr, long len)
1949{
1950    if (len == 0) return str;
1951    if (len < 0) {
1952	rb_raise(rb_eArgError, "negative string size (or size too big)");
1953    }
1954    return str_buf_cat(str, ptr, len);
1955}
1956
1957VALUE
1958rb_str_buf_cat2(VALUE str, const char *ptr)
1959{
1960    return rb_str_buf_cat(str, ptr, strlen(ptr));
1961}
1962
1963VALUE
1964rb_str_cat(VALUE str, const char *ptr, long len)
1965{
1966    if (len < 0) {
1967	rb_raise(rb_eArgError, "negative string size (or size too big)");
1968    }
1969    if (STR_ASSOC_P(str)) {
1970	char *p;
1971	rb_str_modify_expand(str, len);
1972	p = RSTRING(str)->as.heap.ptr;
1973	memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
1974	len = RSTRING(str)->as.heap.len += len;
1975	p[len] = '\0'; /* sentinel */
1976	return str;
1977    }
1978
1979    return rb_str_buf_cat(str, ptr, len);
1980}
1981
1982VALUE
1983rb_str_cat2(VALUE str, const char *ptr)
1984{
1985    return rb_str_cat(str, ptr, strlen(ptr));
1986}
1987
/*
 * Appends len bytes at ptr, described by encoding index ptr_encindex and
 * code range ptr_cr, to str, then sets str's encoding and code range from
 * both inputs.  Returns str.
 *
 * If ptr_cr_ret is non-null it receives the code range determined for the
 * appended bytes (which may have been scanned here).  Raises EncCompatError
 * when the two encodings cannot be combined and ArgumentError when len is
 * negative.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;

    str_cr = ENC_CODERANGE(str);

    if (str_encindex == ptr_encindex) {
        /* Same encoding.  If str's range is unknown the result will be too,
         * so the appended bytes are not scanned; otherwise scan them if
         * their range was not supplied. */
        if (str_cr == ENC_CODERANGE_UNKNOWN)
            ptr_cr = ENC_CODERANGE_UNKNOWN;
        else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        rb_encoding *str_enc = rb_enc_from_index(str_encindex);
        rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* At least one side is not ASCII compatible: mixing works only
             * when one side is empty. */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                /* An empty receiver takes over the appended encoding. */
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                return str;
            }
            goto incompatible;
        }
	if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
	    ptr_cr = coderange_scan(ptr, len, ptr_enc);
	}
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            /* Scan str only when its range can affect the outcome. */
            if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* Different encodings can be combined only if one side is 7-bit. */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
      incompatible:
        rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
            rb_enc_name(rb_enc_from_index(str_encindex)),
            rb_enc_name(rb_enc_from_index(ptr_encindex)));
    }

    /* Choose the encoding and code range of the result. */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = str_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            /* A 7-bit receiver adopts the appended encoding and range. */
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
	if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
	    res_cr = str_cr;
	else
	    res_cr = ptr_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* Appending to a broken string forces a later re-scan. */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    if (len < 0) {
	rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    str_buf_cat(str, ptr, len);
    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;
}
2073
2074VALUE
2075rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2076{
2077    return rb_enc_cr_str_buf_cat(str, ptr, len,
2078        rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
2079}
2080
2081VALUE
2082rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2083{
2084    /* ptr must reference NUL terminated ASCII string. */
2085    int encindex = ENCODING_GET(str);
2086    rb_encoding *enc = rb_enc_from_index(encindex);
2087    if (rb_enc_asciicompat(enc)) {
2088        return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2089            encindex, ENC_CODERANGE_7BIT, 0);
2090    }
2091    else {
2092        char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2093        while (*ptr) {
2094            unsigned int c = (unsigned char)*ptr;
2095            int len = rb_enc_codelen(c, enc);
2096            rb_enc_mbcput(c, buf, enc);
2097            rb_enc_cr_str_buf_cat(str, buf, len,
2098                encindex, ENC_CODERANGE_VALID, 0);
2099            ptr++;
2100        }
2101        return str;
2102    }
2103}
2104
2105VALUE
2106rb_str_buf_append(VALUE str, VALUE str2)
2107{
2108    int str2_cr;
2109
2110    str2_cr = ENC_CODERANGE(str2);
2111
2112    rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2113        ENCODING_GET(str2), str2_cr, &str2_cr);
2114
2115    OBJ_INFECT(str, str2);
2116    ENC_CODERANGE_SET(str2, str2_cr);
2117
2118    return str;
2119}
2120
2121VALUE
2122rb_str_append(VALUE str, VALUE str2)
2123{
2124    rb_encoding *enc;
2125    int cr, cr2;
2126    long len2;
2127
2128    StringValue(str2);
2129    if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
2130        long len = RSTRING_LEN(str) + len2;
2131        enc = rb_enc_check(str, str2);
2132        cr = ENC_CODERANGE(str);
2133        if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
2134        rb_str_modify_expand(str, len2);
2135        memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
2136               RSTRING_PTR(str2), len2+1);
2137        RSTRING(str)->as.heap.len = len;
2138        rb_enc_associate(str, enc);
2139        ENC_CODERANGE_SET(str, cr);
2140        OBJ_INFECT(str, str2);
2141        return str;
2142    }
2143    return rb_str_buf_append(str, str2);
2144}
2145
2146/*
2147 *  call-seq:
2148 *     str << integer       -> str
2149 *     str.concat(integer)  -> str
2150 *     str << obj           -> str
2151 *     str.concat(obj)      -> str
2152 *
2153 *  Append---Concatenates the given object to <i>str</i>. If the object is a
2154 *  <code>Integer</code>, it is considered as a codepoint, and is converted
2155 *  to a character before concatenation.
2156 *
2157 *     a = "hello "
2158 *     a << "world"   #=> "hello world"
2159 *     a.concat(33)   #=> "hello world!"
2160 */
2161
2162VALUE
2163rb_str_concat(VALUE str1, VALUE str2)
2164{
2165    unsigned int code;
2166    rb_encoding *enc = STR_ENC_GET(str1);
2167
2168    if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) {
2169	if (rb_num_to_uint(str2, &code) == 0) {
2170	}
2171	else if (FIXNUM_P(str2)) {
2172	    rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
2173	}
2174	else {
2175	    rb_raise(rb_eRangeError, "bignum out of char range");
2176	}
2177    }
2178    else {
2179	return rb_str_append(str1, str2);
2180    }
2181
2182    if (enc == rb_usascii_encoding()) {
2183	/* US-ASCII automatically extended to ASCII-8BIT */
2184	char buf[1];
2185	buf[0] = (char)code;
2186	if (code > 0xFF) {
2187	    rb_raise(rb_eRangeError, "%u out of char range", code);
2188	}
2189	rb_str_cat(str1, buf, 1);
2190	if (code > 127) {
2191	    rb_enc_associate(str1, rb_ascii8bit_encoding());
2192	    ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
2193	}
2194    }
2195    else {
2196	long pos = RSTRING_LEN(str1);
2197	int cr = ENC_CODERANGE(str1);
2198	int len;
2199	char *buf;
2200
2201	switch (len = rb_enc_codelen(code, enc)) {
2202	  case ONIGERR_INVALID_CODE_POINT_VALUE:
2203	    rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2204	    break;
2205	  case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
2206	  case 0:
2207	    rb_raise(rb_eRangeError, "%u out of char range", code);
2208	    break;
2209	}
2210	buf = ALLOCA_N(char, len + 1);
2211	rb_enc_mbcput(code, buf, enc);
2212	if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
2213	    rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2214	}
2215	rb_str_resize(str1, pos+len);
2216	memcpy(RSTRING_PTR(str1) + pos, buf, len);
2217	if (cr == ENC_CODERANGE_7BIT && code > 127)
2218	    cr = ENC_CODERANGE_VALID;
2219	ENC_CODERANGE_SET(str1, cr);
2220    }
2221    return str1;
2222}
2223
2224/*
2225 *  call-seq:
2226 *     str.prepend(other_str)  -> str
2227 *
2228 *  Prepend---Prepend the given string to <i>str</i>.
2229 *
2230 *     a = "world"
2231 *     a.prepend("hello ") #=> "hello world"
2232 *     a                   #=> "hello world"
2233 */
2234
2235static VALUE
2236rb_str_prepend(VALUE str, VALUE str2)
2237{
2238    StringValue(str2);
2239    StringValue(str);
2240    rb_str_update(str, 0L, 0L, str2);
2241    return str;
2242}
2243
2244st_index_t
2245rb_str_hash(VALUE str)
2246{
2247    int e = ENCODING_GET(str);
2248    if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
2249	e = 0;
2250    }
2251    return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
2252}
2253
2254int
2255rb_str_hash_cmp(VALUE str1, VALUE str2)
2256{
2257    long len;
2258
2259    if (!rb_str_comparable(str1, str2)) return 1;
2260    if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
2261	memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
2262	return 0;
2263    }
2264    return 1;
2265}
2266
2267/*
2268 * call-seq:
2269 *    str.hash   -> fixnum
2270 *
2271 * Return a hash based on the string's length and content.
2272 */
2273
2274static VALUE
2275rb_str_hash_m(VALUE str)
2276{
2277    st_index_t hval = rb_str_hash(str);
2278    return INT2FIX(hval);
2279}
2280
/* Smaller of a and b.  Each argument may be evaluated twice, so pass only
 * expressions without side effects. */
#define lesser(a,b) (((a)>(b))?(b):(a))
2282
2283int
2284rb_str_comparable(VALUE str1, VALUE str2)
2285{
2286    int idx1, idx2;
2287    int rc1, rc2;
2288
2289    if (RSTRING_LEN(str1) == 0) return TRUE;
2290    if (RSTRING_LEN(str2) == 0) return TRUE;
2291    idx1 = ENCODING_GET(str1);
2292    idx2 = ENCODING_GET(str2);
2293    if (idx1 == idx2) return TRUE;
2294    rc1 = rb_enc_str_coderange(str1);
2295    rc2 = rb_enc_str_coderange(str2);
2296    if (rc1 == ENC_CODERANGE_7BIT) {
2297	if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
2298	if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
2299	    return TRUE;
2300    }
2301    if (rc2 == ENC_CODERANGE_7BIT) {
2302	if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
2303	    return TRUE;
2304    }
2305    return FALSE;
2306}
2307
2308int
2309rb_str_cmp(VALUE str1, VALUE str2)
2310{
2311    long len1, len2;
2312    const char *ptr1, *ptr2;
2313    int retval;
2314
2315    if (str1 == str2) return 0;
2316    RSTRING_GETMEM(str1, ptr1, len1);
2317    RSTRING_GETMEM(str2, ptr2, len2);
2318    if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
2319	if (len1 == len2) {
2320	    if (!rb_str_comparable(str1, str2)) {
2321		if (ENCODING_GET(str1) > ENCODING_GET(str2))
2322		    return 1;
2323		return -1;
2324	    }
2325	    return 0;
2326	}
2327	if (len1 > len2) return 1;
2328	return -1;
2329    }
2330    if (retval > 0) return 1;
2331    return -1;
2332}
2333
2334/* expect tail call optimization */
2335static VALUE
2336str_eql(const VALUE str1, const VALUE str2)
2337{
2338    const long len = RSTRING_LEN(str1);
2339    const char *ptr1, *ptr2;
2340
2341    if (len != RSTRING_LEN(str2)) return Qfalse;
2342    if (!rb_str_comparable(str1, str2)) return Qfalse;
2343    if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
2344	return Qtrue;
2345    if (memcmp(ptr1, ptr2, len) == 0)
2346	return Qtrue;
2347    return Qfalse;
2348}
2349
2350/*
2351 *  call-seq:
2352 *     str == obj   -> true or false
2353 *
2354 *  Equality---If <i>obj</i> is not a <code>String</code>, returns
2355 *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
2356 *  <code><=></code> <i>obj</i> returns zero.
2357 */
2358
2359VALUE
2360rb_str_equal(VALUE str1, VALUE str2)
2361{
2362    if (str1 == str2) return Qtrue;
2363    if (!RB_TYPE_P(str2, T_STRING)) {
2364	if (!rb_respond_to(str2, rb_intern("to_str"))) {
2365	    return Qfalse;
2366	}
2367	return rb_equal(str2, str1);
2368    }
2369    return str_eql(str1, str2);
2370}
2371
2372/*
2373 * call-seq:
2374 *   str.eql?(other)   -> true or false
2375 *
2376 * Two strings are equal if they have the same length and content.
2377 */
2378
2379static VALUE
2380rb_str_eql(VALUE str1, VALUE str2)
2381{
2382    if (str1 == str2) return Qtrue;
2383    if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
2384    return str_eql(str1, str2);
2385}
2386
2387/*
2388 *  call-seq:
2389 *     string <=> other_string   -> -1, 0, +1 or nil
2390 *
2391 *
2392 *  Comparison---Returns -1, 0, +1 or nil depending on whether +string+ is less
2393 *  than, equal to, or greater than +other_string+.
2394 *
2395 *  +nil+ is returned if the two values are incomparable.
2396 *
2397 *  If the strings are of different lengths, and the strings are equal when
2398 *  compared up to the shortest length, then the longer string is considered
2399 *  greater than the shorter one.
2400 *
2401 *  <code><=></code> is the basis for the methods <code><</code>,
2402 *  <code><=</code>, <code>></code>, <code>>=</code>, and
2403 *  <code>between?</code>, included from module Comparable. The method
2404 *  String#== does not use Comparable#==.
2405 *
2406 *     "abcdef" <=> "abcde"     #=> 1
2407 *     "abcdef" <=> "abcdef"    #=> 0
2408 *     "abcdef" <=> "abcdefg"   #=> -1
2409 *     "abcdef" <=> "ABCDEF"    #=> 1
2410 */
2411
2412static VALUE
2413rb_str_cmp_m(VALUE str1, VALUE str2)
2414{
2415    int result;
2416
2417    if (!RB_TYPE_P(str2, T_STRING)) {
2418	VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0);
2419	if (RB_TYPE_P(tmp, T_STRING)) {
2420	    result = rb_str_cmp(str1, tmp);
2421	}
2422	else {
2423	    return rb_invcmp(str1, str2);
2424	}
2425    }
2426    else {
2427	result = rb_str_cmp(str1, str2);
2428    }
2429    return INT2FIX(result);
2430}
2431
2432/*
2433 *  call-seq:
2434 *     str.casecmp(other_str)   -> -1, 0, +1 or nil
2435 *
2436 *  Case-insensitive version of <code>String#<=></code>.
2437 *
2438 *     "abcdef".casecmp("abcde")     #=> 1
2439 *     "aBcDeF".casecmp("abcdef")    #=> 0
2440 *     "abcdef".casecmp("abcdefg")   #=> -1
2441 *     "abcdef".casecmp("ABCDEF")    #=> 0
2442 */
2443
2444static VALUE
2445rb_str_casecmp(VALUE str1, VALUE str2)
2446{
2447    long len;
2448    rb_encoding *enc;
2449    char *p1, *p1end, *p2, *p2end;
2450
2451    StringValue(str2);
2452    enc = rb_enc_compatible(str1, str2);
2453    if (!enc) {
2454	return Qnil;
2455    }
2456
2457    p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2458    p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2459    if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
2460	while (p1 < p1end && p2 < p2end) {
2461	    if (*p1 != *p2) {
2462		unsigned int c1 = TOUPPER(*p1 & 0xff);
2463		unsigned int c2 = TOUPPER(*p2 & 0xff);
2464                if (c1 != c2)
2465                    return INT2FIX(c1 < c2 ? -1 : 1);
2466	    }
2467	    p1++;
2468	    p2++;
2469	}
2470    }
2471    else {
2472	while (p1 < p1end && p2 < p2end) {
2473            int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
2474            int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
2475
2476            if (0 <= c1 && 0 <= c2) {
2477                c1 = TOUPPER(c1);
2478                c2 = TOUPPER(c2);
2479                if (c1 != c2)
2480                    return INT2FIX(c1 < c2 ? -1 : 1);
2481            }
2482            else {
2483                int r;
2484                l1 = rb_enc_mbclen(p1, p1end, enc);
2485                l2 = rb_enc_mbclen(p2, p2end, enc);
2486                len = l1 < l2 ? l1 : l2;
2487                r = memcmp(p1, p2, len);
2488                if (r != 0)
2489                    return INT2FIX(r < 0 ? -1 : 1);
2490                if (l1 != l2)
2491                    return INT2FIX(l1 < l2 ? -1 : 1);
2492            }
2493	    p1 += l1;
2494	    p2 += l2;
2495	}
2496    }
2497    if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2498    if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2499    return INT2FIX(-1);
2500}
2501
/*
 * Searches +str+ for the first occurrence of +sub+, starting at index
 * +offset+ (a negative +offset+ is counted back from the end of +str+).
 *
 * Returns -1 when +sub+ is a broken string, when the start position lies
 * outside +str+, or when nothing is found.  Otherwise the result is a byte
 * offset into +str+ (it is built from pointer differences); rb_str_index_m
 * converts it with rb_str_sublen() before returning it to Ruby code.
 */
static long
rb_str_index(VALUE str, VALUE sub, long offset)
{
    long pos;
    char *s, *sptr, *e;
    long len, slen;
    rb_encoding *enc;

    enc = rb_enc_check(str, sub);
    if (is_broken_string(sub)) {
	return -1;
    }
    /* lengths as reported by str_strlen (presumably character counts) */
    len = str_strlen(str, enc);
    slen = str_strlen(sub, enc);
    if (offset < 0) {
	offset += len;
	if (offset < 0) return -1;
    }
    /* not enough room left for sub */
    if (len - offset < slen) return -1;
    s = RSTRING_PTR(str);
    e = s + RSTRING_LEN(str);
    if (offset) {
	/* from here on offset is used as a byte displacement into str */
	offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
	s += offset;
    }
    if (slen == 0) return offset;
    /*
     * rb_memsearch works on the raw bytes, so a hit is accepted only when
     * it starts where rb_enc_right_char_head says a character starts;
     * otherwise the search resumes from that character head.
     */
    sptr = RSTRING_PTR(sub);
    slen = RSTRING_LEN(sub);
    len = RSTRING_LEN(str) - offset;
    for (;;) {
	char *t;
	pos = rb_memsearch(sptr, slen, s, len, enc);
	if (pos < 0) return pos;
	t = rb_enc_right_char_head(s, s+pos, e, enc);
	if (t == s + pos) break;
	/* misaligned hit: skip ahead, giving up when nothing is left */
	if ((len -= t - s) <= 0) return -1;
	offset += t - s;
	s = t;
    }
    return pos + offset;
}
2544
2545
2546/*
2547 *  call-seq:
2548 *     str.index(substring [, offset])   -> fixnum or nil
2549 *     str.index(regexp [, offset])      -> fixnum or nil
2550 *
2551 *  Returns the index of the first occurrence of the given <i>substring</i> or
2552 *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2553 *  found. If the second parameter is present, it specifies the position in the
2554 *  string to begin the search.
2555 *
2556 *     "hello".index('e')             #=> 1
2557 *     "hello".index('lo')            #=> 3
2558 *     "hello".index('a')             #=> nil
2559 *     "hello".index(?e)              #=> 1
2560 *     "hello".index(/[aeiou]/, -3)   #=> 4
2561 */
2562
static VALUE
rb_str_index_m(int argc, VALUE *argv, VALUE str)
{
    VALUE sub;
    VALUE initpos;
    long pos;

    /* one required argument (the pattern) and one optional start position */
    if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
	pos = NUM2LONG(initpos);
    }
    else {
	pos = 0;
    }
    if (pos < 0) {
	/* negative start: count from the end; still negative means no match */
	pos += str_strlen(str, STR_ENC_GET(str));
	if (pos < 0) {
	    if (RB_TYPE_P(sub, T_REGEXP)) {
		rb_backref_set(Qnil);
	    }
	    return Qnil;
	}
    }

    /* special constants have no builtin type to switch on */
    if (SPECIAL_CONST_P(sub)) goto generic;
    switch (BUILTIN_TYPE(sub)) {
      case T_REGEXP:
	if (pos > str_strlen(str, STR_ENC_GET(str)))
	    return Qnil;
	/* rb_reg_search is given the str_offset() result, not the index */
	pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
			 rb_enc_check(str, sub), single_byte_optimizable(str));

	pos = rb_reg_search(sub, str, pos, 0);
	pos = rb_str_sublen(str, pos);
	break;

      generic:
      default: {
	VALUE tmp;

	/* anything else must convert to a String, or it is a TypeError */
	tmp = rb_check_string_type(sub);
	if (NIL_P(tmp)) {
	    rb_raise(rb_eTypeError, "type mismatch: %s given",
		     rb_obj_classname(sub));
	}
	sub = tmp;
      }
	/* fall through */
      case T_STRING:
	pos = rb_str_index(str, sub, pos);
	pos = rb_str_sublen(str, pos);
	break;
    }

    /* -1 from either search path means "not found" */
    if (pos == -1) return Qnil;
    return LONG2NUM(pos);
}
2619
2620static long
2621rb_str_rindex(VALUE str, VALUE sub, long pos)
2622{
2623    long len, slen;
2624    char *s, *sbeg, *e, *t;
2625    rb_encoding *enc;
2626    int singlebyte = single_byte_optimizable(str);
2627
2628    enc = rb_enc_check(str, sub);
2629    if (is_broken_string(sub)) {
2630	return -1;
2631    }
2632    len = str_strlen(str, enc);
2633    slen = str_strlen(sub, enc);
2634    /* substring longer than string */
2635    if (len < slen) return -1;
2636    if (len - pos < slen) {
2637	pos = len - slen;
2638    }
2639    if (len == 0) {
2640	return pos;
2641    }
2642    sbeg = RSTRING_PTR(str);
2643    e = RSTRING_END(str);
2644    t = RSTRING_PTR(sub);
2645    slen = RSTRING_LEN(sub);
2646    s = str_nth(sbeg, e, pos, enc, singlebyte);
2647    while (s) {
2648	if (memcmp(s, t, slen) == 0) {
2649	    return pos;
2650	}
2651	if (pos == 0) break;
2652	pos--;
2653	s = rb_enc_prev_char(sbeg, s, e, enc);
2654    }
2655    return -1;
2656}
2657
2658
2659/*
2660 *  call-seq:
2661 *     str.rindex(substring [, fixnum])   -> fixnum or nil
2662 *     str.rindex(regexp [, fixnum])   -> fixnum or nil
2663 *
2664 *  Returns the index of the last occurrence of the given <i>substring</i> or
2665 *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2666 *  found. If the second parameter is present, it specifies the position in the
2667 *  string to end the search---characters beyond this point will not be
2668 *  considered.
2669 *
2670 *     "hello".rindex('e')             #=> 1
2671 *     "hello".rindex('l')             #=> 3
2672 *     "hello".rindex('a')             #=> nil
2673 *     "hello".rindex(?e)              #=> 1
2674 *     "hello".rindex(/[aeiou]/, -2)   #=> 1
2675 */
2676
2677static VALUE
2678rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
2679{
2680    VALUE sub;
2681    VALUE vpos;
2682    rb_encoding *enc = STR_ENC_GET(str);
2683    long pos, len = str_strlen(str, enc);
2684
2685    if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2686	pos = NUM2LONG(vpos);
2687	if (pos < 0) {
2688	    pos += len;
2689	    if (pos < 0) {
2690		if (RB_TYPE_P(sub, T_REGEXP)) {
2691		    rb_backref_set(Qnil);
2692		}
2693		return Qnil;
2694	    }
2695	}
2696	if (pos > len) pos = len;
2697    }
2698    else {
2699	pos = len;
2700    }
2701
2702    if (SPECIAL_CONST_P(sub)) goto generic;
2703    switch (BUILTIN_TYPE(sub)) {
2704      case T_REGEXP:
2705	/* enc = rb_get_check(str, sub); */
2706	pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2707			 STR_ENC_GET(str), single_byte_optimizable(str));
2708
2709	if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2710	    pos = rb_reg_search(sub, str, pos, 1);
2711	    pos = rb_str_sublen(str, pos);
2712	}
2713	if (pos >= 0) return LONG2NUM(pos);
2714	break;
2715
2716      generic:
2717      default: {
2718	VALUE tmp;
2719
2720	tmp = rb_check_string_type(sub);
2721	if (NIL_P(tmp)) {
2722	    rb_raise(rb_eTypeError, "type mismatch: %s given",
2723		     rb_obj_classname(sub));
2724	}
2725	sub = tmp;
2726      }
2727	/* fall through */
2728      case T_STRING:
2729	pos = rb_str_rindex(str, sub, pos);
2730	if (pos >= 0) return LONG2NUM(pos);
2731	break;
2732    }
2733    return Qnil;
2734}
2735
2736/*
2737 *  call-seq:
2738 *     str =~ obj   -> fixnum or nil
2739 *
 *  Match---If <i>obj</i> is a <code>Regexp</code>, uses it as a pattern to
 *  match against <i>str</i>, and returns the position where the match starts,
 *  or <code>nil</code> if there is no match. Otherwise, invokes
2743 *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2744 *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
2745 *
2746 *  Note: <code>str =~ regexp</code> is not the same as
2747 *  <code>regexp =~ str</code>. Strings captured from named capture groups
2748 *  are assigned to local variables only in the second case.
2749 *
2750 *     "cat o' 9 tails" =~ /\d/   #=> 7
2751 *     "cat o' 9 tails" =~ 9      #=> nil
2752 */
2753
2754static VALUE
2755rb_str_match(VALUE x, VALUE y)
2756{
2757    if (SPECIAL_CONST_P(y)) goto generic;
2758    switch (BUILTIN_TYPE(y)) {
2759      case T_STRING:
2760	rb_raise(rb_eTypeError, "type mismatch: String given");
2761
2762      case T_REGEXP:
2763	return rb_reg_match(y, x);
2764
2765      generic:
2766      default:
2767	return rb_funcall(y, rb_intern("=~"), 1, x);
2768    }
2769}
2770
2771
2772static VALUE get_pat(VALUE, int);
2773
2774
2775/*
2776 *  call-seq:
2777 *     str.match(pattern)        -> matchdata or nil
2778 *     str.match(pattern, pos)   -> matchdata or nil
2779 *
2780 *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2781 *  then invokes its <code>match</code> method on <i>str</i>.  If the second
2782 *  parameter is present, it specifies the position in the string to begin the
2783 *  search.
2784 *
2785 *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
2786 *     'hello'.match('(.)\1')[0]   #=> "ll"
2787 *     'hello'.match(/(.)\1/)[0]   #=> "ll"
2788 *     'hello'.match('xx')         #=> nil
2789 *
 *  If a block is given, invokes the block with the MatchData if the match
 *  succeeds, so that you can write
2792 *
2793 *     str.match(pat) {|m| ...}
2794 *
2795 *  instead of
2796 *
2797 *     if m = str.match(pat)
2798 *       ...
2799 *     end
2800 *
2801 *  The return value is a value from block execution in this case.
2802 */
2803
2804static VALUE
2805rb_str_match_m(int argc, VALUE *argv, VALUE str)
2806{
2807    VALUE re, result;
2808    if (argc < 1)
2809	rb_check_arity(argc, 1, 2);
2810    re = argv[0];
2811    argv[0] = str;
2812    result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2813    if (!NIL_P(result) && rb_block_given_p()) {
2814	return rb_yield(result);
2815    }
2816    return result;
2817}
2818
/* Result of stepping a single character to its neighbor (succ/pred). */
enum neighbor_char {
    /* the character is not steppable (e.g. neither digit nor alphabetic) */
    NEIGHBOR_NOT_CHAR,
    /* a neighbor was found and written in place */
    NEIGHBOR_FOUND,
    /* stepping ran off the end of the range and wrapped around */
    NEIGHBOR_WRAPPED
};
2824
/*
 * Steps the +len+ bytes at +p+ forward, in place, to the next byte
 * sequence that rb_enc_precise_mbclen() reports as one character of exactly
 * +len+ bytes in +enc+.
 *
 * Returns NEIGHBOR_FOUND on success.  Returns NEIGHBOR_WRAPPED when every
 * byte was 0xff; in that case all +len+ bytes have been reset to 0.
 */
static enum neighbor_char
enc_succ_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    while (1) {
        /* increment with carry: trailing 0xff bytes roll over to 0 */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /*
                 * A shorter character was found: saturate the tail so the
                 * next increment carries into the first l bytes.
                 */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* find the longest prefix that is not reported as invalid */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* saturate the bytes after it so the next pass skips ahead */
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
2858
/*
 * Steps the +len+ bytes at +p+ backward, in place, to the previous byte
 * sequence that rb_enc_precise_mbclen() reports as one character of exactly
 * +len+ bytes in +enc+.  Mirror image of enc_succ_char.
 *
 * Returns NEIGHBOR_FOUND on success.  Returns NEIGHBOR_WRAPPED when every
 * byte was 0; in that case all +len+ bytes have been set to 0xff.
 */
static enum neighbor_char
enc_pred_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    while (1) {
        /* decrement with borrow: trailing 0 bytes roll over to 0xff */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /*
                 * A shorter character was found: clear the tail so the
                 * next decrement borrows from the first l bytes.
                 */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* find the longest prefix that is not reported as invalid */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* clear the bytes after it so the next pass skips back */
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
2892
2893/*
2894  overwrite +p+ by succeeding letter in +enc+ and returns
2895  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2896  When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2897  assuming each ranges are successive, and mbclen
2898  never change in each ranges.
2899  NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2900  character.
2901 */
2902static enum neighbor_char
2903enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
2904{
2905    enum neighbor_char ret;
2906    unsigned int c;
2907    int ctype;
2908    int range;
2909    char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2910
2911    c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2912    if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2913        ctype = ONIGENC_CTYPE_DIGIT;
2914    else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2915        ctype = ONIGENC_CTYPE_ALPHA;
2916    else
2917        return NEIGHBOR_NOT_CHAR;
2918
2919    MEMCPY(save, p, char, len);
2920    ret = enc_succ_char(p, len, enc);
2921    if (ret == NEIGHBOR_FOUND) {
2922        c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2923        if (rb_enc_isctype(c, ctype, enc))
2924            return NEIGHBOR_FOUND;
2925    }
2926    MEMCPY(p, save, char, len);
2927    range = 1;
2928    while (1) {
2929        MEMCPY(save, p, char, len);
2930        ret = enc_pred_char(p, len, enc);
2931        if (ret == NEIGHBOR_FOUND) {
2932            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2933            if (!rb_enc_isctype(c, ctype, enc)) {
2934                MEMCPY(p, save, char, len);
2935                break;
2936            }
2937        }
2938        else {
2939            MEMCPY(p, save, char, len);
2940            break;
2941        }
2942        range++;
2943    }
2944    if (range == 1) {
2945        return NEIGHBOR_NOT_CHAR;
2946    }
2947
2948    if (ctype != ONIGENC_CTYPE_DIGIT) {
2949        MEMCPY(carry, p, char, len);
2950        return NEIGHBOR_WRAPPED;
2951    }
2952
2953    MEMCPY(carry, p, char, len);
2954    enc_succ_char(carry, len, enc);
2955    return NEIGHBOR_WRAPPED;
2956}
2957
2958
2959/*
2960 *  call-seq:
2961 *     str.succ   -> new_str
2962 *     str.next   -> new_str
2963 *
2964 *  Returns the successor to <i>str</i>. The successor is calculated by
2965 *  incrementing characters starting from the rightmost alphanumeric (or
2966 *  the rightmost character if there are no alphanumerics) in the
2967 *  string. Incrementing a digit always results in another digit, and
2968 *  incrementing a letter results in another letter of the same case.
2969 *  Incrementing nonalphanumerics uses the underlying character set's
2970 *  collating sequence.
2971 *
2972 *  If the increment generates a ``carry,'' the character to the left of
2973 *  it is incremented. This process repeats until there is no carry,
2974 *  adding an additional character if necessary.
2975 *
2976 *     "abcd".succ        #=> "abce"
2977 *     "THX1138".succ     #=> "THX1139"
2978 *     "<<koala>>".succ   #=> "<<koalb>>"
2979 *     "1999zzz".succ     #=> "2000aaa"
2980 *     "ZZZ9999".succ     #=> "AAAA0000"
2981 *     "***".succ         #=> "**+"
2982 */
2983
VALUE
rb_str_succ(VALUE orig)
{
    rb_encoding *enc;
    VALUE str;
    char *sbeg, *s, *e, *last_alnum = 0;
    int c = -1;
    long l;
    /* default carry: a single "\1" byte inserted at position 0 */
    char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
    long carry_pos = 0, carry_len = 1;
    enum neighbor_char neighbor = NEIGHBOR_FOUND;

    /* work on a new string built from orig's bytes; orig is not modified */
    str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
    rb_enc_cr_str_copy_for_substr(str, orig);
    OBJ_INFECT(str, orig);
    if (RSTRING_LEN(str) == 0) return str;

    enc = STR_ENC_GET(orig);
    sbeg = RSTRING_PTR(str);
    s = e = sbeg + RSTRING_LEN(str);

    /* first pass: step alphanumerics, walking from the last character back */
    while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
	if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
	    /*
	     * A non-alnum was just skipped after a wrapped alnum: stop
	     * carrying if the class changes (letter <-> digit) across it.
	     */
	    if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
		ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
		s = last_alnum;
		break;
	    }
	}
	if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
        neighbor = enc_succ_alnum_char(s, l, enc, carry);
        switch (neighbor) {
	  case NEIGHBOR_NOT_CHAR:
	    continue;
	  case NEIGHBOR_FOUND:
	    return str;
	  case NEIGHBOR_WRAPPED:
	    last_alnum = s;
	    break;
	}
        /* wrapped: remember where a carry would have to be inserted */
        c = 1;
        carry_pos = s - sbeg;
        carry_len = l;
    }
    if (c == -1) {		/* str contains no alnum */
	/* second pass: step raw characters, again from the end */
	s = e;
	while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
            enum neighbor_char neighbor;
            if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
            neighbor = enc_succ_char(s, l, enc);
            if (neighbor == NEIGHBOR_FOUND)
                return str;
            if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
                /* wrapped to \0...\0.  search next valid char. */
                enc_succ_char(s, l, enc);
            }
            if (!rb_enc_asciicompat(enc)) {
                MEMCPY(carry, s, char, l);
                carry_len = l;
            }
            carry_pos = s - sbeg;
	}
    }
    /* every candidate wrapped: grow the string and insert the carry */
    RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
    s = RSTRING_PTR(str) + carry_pos;
    memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
    memmove(s, carry, carry_len);
    STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
    RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    rb_enc_str_coderange(str);
    return str;
}
3056
3057
3058/*
3059 *  call-seq:
3060 *     str.succ!   -> str
3061 *     str.next!   -> str
3062 *
3063 *  Equivalent to <code>String#succ</code>, but modifies the receiver in
3064 *  place.
3065 */
3066
3067static VALUE
3068rb_str_succ_bang(VALUE str)
3069{
3070    rb_str_shared_replace(str, rb_str_succ(str));
3071
3072    return str;
3073}
3074
3075
3076/*
3077 *  call-seq:
3078 *     str.upto(other_str, exclusive=false) {|s| block }   -> str
3079 *     str.upto(other_str, exclusive=false)                -> an_enumerator
3080 *
3081 *  Iterates through successive values, starting at <i>str</i> and
3082 *  ending at <i>other_str</i> inclusive, passing each value in turn to
3083 *  the block. The <code>String#succ</code> method is used to generate
3084 *  each value.  If optional second argument exclusive is omitted or is false,
3085 *  the last value will be included; otherwise it will be excluded.
3086 *
3087 *  If no block is given, an enumerator is returned instead.
3088 *
3089 *     "a8".upto("b6") {|s| print s, ' ' }
3090 *     for s in "a8".."b6"
3091 *       print s, ' '
3092 *     end
3093 *
3094 *  <em>produces:</em>
3095 *
3096 *     a8 a9 b0 b1 b2 b3 b4 b5 b6
3097 *     a8 a9 b0 b1 b2 b3 b4 b5 b6
3098 *
 *  If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
 *  both are recognized as decimal numbers. In addition, the width of
 *  the string (e.g. leading zeros) is handled appropriately.
3102 *
3103 *     "9".upto("11").to_a   #=> ["9", "10", "11"]
3104 *     "25".upto("5").to_a   #=> []
3105 *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
3106 */
3107
3108static VALUE
3109rb_str_upto(int argc, VALUE *argv, VALUE beg)
3110{
3111    VALUE end, exclusive;
3112    VALUE current, after_end;
3113    ID succ;
3114    int n, excl, ascii;
3115    rb_encoding *enc;
3116
3117    rb_scan_args(argc, argv, "11", &end, &exclusive);
3118    RETURN_ENUMERATOR(beg, argc, argv);
3119    excl = RTEST(exclusive);
3120    CONST_ID(succ, "succ");
3121    StringValue(end);
3122    enc = rb_enc_check(beg, end);
3123    ascii = (is_ascii_string(beg) && is_ascii_string(end));
3124    /* single character */
3125    if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
3126	char c = RSTRING_PTR(beg)[0];
3127	char e = RSTRING_PTR(end)[0];
3128
3129	if (c > e || (excl && c == e)) return beg;
3130	for (;;) {
3131	    rb_yield(rb_enc_str_new(&c, 1, enc));
3132	    if (!excl && c == e) break;
3133	    c++;
3134	    if (excl && c == e) break;
3135	}
3136	return beg;
3137    }
3138    /* both edges are all digits */
3139    if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
3140	char *s, *send;
3141	VALUE b, e;
3142	int width;
3143
3144	s = RSTRING_PTR(beg); send = RSTRING_END(beg);
3145	width = rb_long2int(send - s);
3146	while (s < send) {
3147	    if (!ISDIGIT(*s)) goto no_digits;
3148	    s++;
3149	}
3150	s = RSTRING_PTR(end); send = RSTRING_END(end);
3151	while (s < send) {
3152	    if (!ISDIGIT(*s)) goto no_digits;
3153	    s++;
3154	}
3155	b = rb_str_to_inum(beg, 10, FALSE);
3156	e = rb_str_to_inum(end, 10, FALSE);
3157	if (FIXNUM_P(b) && FIXNUM_P(e)) {
3158	    long bi = FIX2LONG(b);
3159	    long ei = FIX2LONG(e);
3160	    rb_encoding *usascii = rb_usascii_encoding();
3161
3162	    while (bi <= ei) {
3163		if (excl && bi == ei) break;
3164		rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
3165		bi++;
3166	    }
3167	}
3168	else {
3169	    ID op = excl ? '<' : rb_intern("<=");
3170	    VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
3171
3172	    args[0] = INT2FIX(width);
3173	    while (rb_funcall(b, op, 1, e)) {
3174		args[1] = b;
3175		rb_yield(rb_str_format(numberof(args), args, fmt));
3176		b = rb_funcall(b, succ, 0, 0);
3177	    }
3178	}
3179	return beg;
3180    }
3181    /* normal case */
3182  no_digits:
3183    n = rb_str_cmp(beg, end);
3184    if (n > 0 || (excl && n == 0)) return beg;
3185
3186    after_end = rb_funcall(end, succ, 0, 0);
3187    current = rb_str_dup(beg);
3188    while (!rb_str_equal(current, after_end)) {
3189	VALUE next = Qnil;
3190	if (excl || !rb_str_equal(current, end))
3191	    next = rb_funcall(current, succ, 0, 0);
3192	rb_yield(current);
3193	if (NIL_P(next)) break;
3194	current = next;
3195	StringValue(current);
3196	if (excl && rb_str_equal(current, end)) break;
3197	if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
3198	    break;
3199    }
3200
3201    return beg;
3202}
3203
3204static VALUE
3205rb_str_subpat(VALUE str, VALUE re, VALUE backref)
3206{
3207    if (rb_reg_search(re, str, 0, 0) >= 0) {
3208        VALUE match = rb_backref_get();
3209        int nth = rb_reg_backref_number(match, backref);
3210	return rb_reg_nth_match(nth, match);
3211    }
3212    return Qnil;
3213}
3214
3215static VALUE
3216rb_str_aref(VALUE str, VALUE indx)
3217{
3218    long idx;
3219
3220    if (FIXNUM_P(indx)) {
3221	idx = FIX2LONG(indx);
3222
3223      num_index:
3224	str = rb_str_substr(str, idx, 1);
3225	if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
3226	return str;
3227    }
3228
3229    if (SPECIAL_CONST_P(indx)) goto generic;
3230    switch (BUILTIN_TYPE(indx)) {
3231      case T_REGEXP:
3232	return rb_str_subpat(str, indx, INT2FIX(0));
3233
3234      case T_STRING:
3235	if (rb_str_index(str, indx, 0) != -1)
3236	    return rb_str_dup(indx);
3237	return Qnil;
3238
3239      generic:
3240      default:
3241	/* check if indx is Range */
3242	{
3243	    long beg, len;
3244	    VALUE tmp;
3245
3246	    len = str_strlen(str, STR_ENC_GET(str));
3247	    switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
3248	      case Qfalse:
3249		break;
3250	      case Qnil:
3251		return Qnil;
3252	      default:
3253		tmp = rb_str_substr(str, beg, len);
3254		return tmp;
3255	    }
3256	}
3257	idx = NUM2LONG(indx);
3258	goto num_index;
3259    }
3260
3261    UNREACHABLE;
3262}
3263
3264
3265/*
3266 *  call-seq:
3267 *     str[index]                 -> new_str or nil
3268 *     str[start, length]         -> new_str or nil
3269 *     str[range]                 -> new_str or nil
3270 *     str[regexp]                -> new_str or nil
3271 *     str[regexp, capture]       -> new_str or nil
3272 *     str[match_str]             -> new_str or nil
3273 *     str.slice(index)           -> new_str or nil
3274 *     str.slice(start, length)   -> new_str or nil
3275 *     str.slice(range)           -> new_str or nil
3276 *     str.slice(regexp)          -> new_str or nil
3277 *     str.slice(regexp, capture) -> new_str or nil
3278 *     str.slice(match_str)       -> new_str or nil
3279 *
3280 *  Element Reference --- If passed a single +index+, returns a substring of
3281 *  one character at that index. If passed a +start+ index and a +length+,
3282 *  returns a substring containing +length+ characters starting at the
3283 *  +index+. If passed a +range+, its beginning and end are interpreted as
3284 *  offsets delimiting the substring to be returned.
3285 *
3286 *  In these three cases, if an index is negative, it is counted from the end
3287 *  of the string.  For the +start+ and +range+ cases the starting index
3288 *  is just before a character and an index matching the string's size.
3289 *  Additionally, an empty string is returned when the starting index for a
3290 *  character range is at the end of the string.
3291 *
3292 *  Returns +nil+ if the initial index falls outside the string or the length
3293 *  is negative.
3294 *
 *  If a +Regexp+ is supplied, the matching portion of the string is
 *  returned.  If a +capture+, which may be a capture group index or name,
 *  follows the regular expression, that component of the MatchData is
 *  returned instead.
3299 *
3300 *  If a +match_str+ is given, that string is returned if it occurs in
3301 *  the string.
3302 *
3303 *  Returns +nil+ if the regular expression does not match or the match string
3304 *  cannot be found.
3305 *
3306 *     a = "hello there"
3307 *
3308 *     a[1]                   #=> "e"
3309 *     a[2, 3]                #=> "llo"
3310 *     a[2..3]                #=> "ll"
3311 *
3312 *     a[-3, 2]               #=> "er"
3313 *     a[7..-2]               #=> "her"
3314 *     a[-4..-2]              #=> "her"
3315 *     a[-2..-4]              #=> ""
3316 *
3317 *     a[11, 0]               #=> ""
3318 *     a[11]                  #=> nil
3319 *     a[12, 0]               #=> nil
3320 *     a[12..-1]              #=> nil
3321 *
3322 *     a[/[aeiou](.)\1/]      #=> "ell"
3323 *     a[/[aeiou](.)\1/, 0]   #=> "ell"
3324 *     a[/[aeiou](.)\1/, 1]   #=> "l"
3325 *     a[/[aeiou](.)\1/, 2]   #=> nil
3326 *
3327 *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
3328 *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"]     #=> "e"
3329 *
3330 *     a["lo"]                #=> "lo"
3331 *     a["bye"]               #=> nil
3332 */
3333
3334static VALUE
3335rb_str_aref_m(int argc, VALUE *argv, VALUE str)
3336{
3337    if (argc == 2) {
3338	if (RB_TYPE_P(argv[0], T_REGEXP)) {
3339	    return rb_str_subpat(str, argv[0], argv[1]);
3340	}
3341	return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
3342    }
3343    rb_check_arity(argc, 1, 2);
3344    return rb_str_aref(str, argv[0]);
3345}
3346
/*
 * Removes the first +len+ bytes of +str+ in place and returns +str+.
 * +len+ is clamped to the current byte length.  The code range is cleared
 * because the remaining bytes have not been re-scanned.
 */
VALUE
rb_str_drop_bytes(VALUE str, long len)
{
    char *ptr = RSTRING_PTR(str);
    long olen = RSTRING_LEN(str), nlen;

    str_modifiable(str);
    if (len > olen) len = olen;
    nlen = olen - len;
    if (nlen <= RSTRING_EMBED_LEN_MAX) {
	/* the remainder fits the embedded buffer: switch to it */
	char *oldptr = ptr;
	/* flags are sampled before STR_SET_EMBED changes them */
	int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
	STR_SET_EMBED(str);
	STR_SET_EMBED_LEN(str, nlen);
	ptr = RSTRING(str)->as.ary;
	memmove(ptr, oldptr + len, nlen);
	/* free the old buffer only if it was a non-shared heap buffer */
	if (fl == STR_NOEMBED) xfree(oldptr);
    }
    else {
	/*
	 * NOTE(review): the result of rb_str_new4 is discarded; presumably
	 * it is called to turn str into a shared string so that the data
	 * pointer can be advanced below -- confirm.
	 */
	if (!STR_SHARED_P(str)) rb_str_new4(str);
	ptr = RSTRING(str)->as.heap.ptr += len;
	RSTRING(str)->as.heap.len = nlen;
    }
    ptr[nlen] = 0;
    ENC_CODERANGE_CLEAR(str);
    return str;
}
3374
/*
 * Replaces the +len+ bytes of +str+ starting at byte offset +beg+ with the
 * contents of +val+, in place.  +beg+ and +len+ are physical (byte)
 * positions; rb_str_splice does the character-to-byte translation before
 * calling this.  Taint is propagated from +val+ to +str+.
 */
static void
rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
{
    /* deleting a prefix with nothing to insert: just drop the bytes */
    if (beg == 0 && RSTRING_LEN(val) == 0) {
	rb_str_drop_bytes(str, len);
	OBJ_INFECT(str, val);
	return;
    }

    rb_str_modify(str);
    if (len < RSTRING_LEN(val)) {
	/* expand string */
	RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
    }

    /* move the tail to where it belongs after the replacement */
    if (RSTRING_LEN(val) != len) {
	memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
		RSTRING_PTR(str) + beg + len,
		RSTRING_LEN(str) - (beg + len));
    }
    /*
     * Only reachable with a negative len; rb_str_splice raises on a
     * negative length before calling this function.
     */
    if (RSTRING_LEN(val) < beg && len < 0) {
	MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
    }
    if (RSTRING_LEN(val) > 0) {
	memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
    }
    STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
    if (RSTRING_PTR(str)) {
	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    }
    OBJ_INFECT(str, val);
}
3407
/*
 * Character-level splice: replaces +len+ characters of +str+ starting at
 * character index +beg+ with +val+.
 *
 * +val+ is coerced with StringValue and must be encoding-compatible with
 * +str+ (rb_enc_check).  A negative +beg+ counts from the end.
 *
 * Raises IndexError when +len+ is negative or +beg+ lies outside the
 * string.  A +len+ that would run past the end is clamped.
 */
static void
rb_str_splice(VALUE str, long beg, long len, VALUE val)
{
    long slen;
    char *p, *e;
    rb_encoding *enc;
    int singlebyte = single_byte_optimizable(str);
    int cr;

    if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);

    StringValue(val);
    enc = rb_enc_check(str, val);
    slen = str_strlen(str, enc);

    if (slen < beg) {
      out_of_range:
	/* Also reached via goto from the negative-index check below;
	 * beg still holds the caller's original value in that case. */
	rb_raise(rb_eIndexError, "index %ld out of string", beg);
    }
    if (beg < 0) {
	if (-beg > slen) {
	    goto out_of_range;
	}
	beg += slen;
    }
    /* Clamp the length to what remains.  The first test short-circuits
     * for very large len before beg + len is evaluated. */
    if (slen < len || slen < beg + len) {
	len = slen - beg;
    }
    str_modify_keep_cr(str);
    /* Translate the character range into byte pointers; a NULL result
     * from str_nth is treated as "end of string". */
    p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
    if (!p) p = RSTRING_END(str);
    e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
    if (!e) e = RSTRING_END(str);
    /* from here on beg/len are byte quantities */
    beg = p - RSTRING_PTR(str);	/* physical position */
    len = e - p;		/* physical length */
    rb_str_splice_0(str, beg, len, val);
    rb_enc_associate(str, enc);
    /* Combine the code ranges of both strings; a BROKEN result is not
     * stored on str. */
    cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
    if (cr != ENC_CODERANGE_BROKEN)
	ENC_CODERANGE_SET(str, cr);
}
3450
/*
 * Non-static entry point for the character-level splice: replaces +len+
 * characters of +str+ starting at character index +beg+ with +val+.
 * Simply forwards to rb_str_splice.
 */
void
rb_str_update(VALUE str, long beg, long len, VALUE val)
{
    rb_str_splice(str, beg, len, val);
}
3456
3457static void
3458rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
3459{
3460    int nth;
3461    VALUE match;
3462    long start, end, len;
3463    rb_encoding *enc;
3464    struct re_registers *regs;
3465
3466    if (rb_reg_search(re, str, 0, 0) < 0) {
3467	rb_raise(rb_eIndexError, "regexp not matched");
3468    }
3469    match = rb_backref_get();
3470    nth = rb_reg_backref_number(match, backref);
3471    regs = RMATCH_REGS(match);
3472    if (nth >= regs->num_regs) {
3473      out_of_range:
3474	rb_raise(rb_eIndexError, "index %d out of regexp", nth);
3475    }
3476    if (nth < 0) {
3477	if (-nth >= regs->num_regs) {
3478	    goto out_of_range;
3479	}
3480	nth += regs->num_regs;
3481    }
3482
3483    start = BEG(nth);
3484    if (start == -1) {
3485	rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
3486    }
3487    end = END(nth);
3488    len = end - start;
3489    StringValue(val);
3490    enc = rb_enc_check(str, val);
3491    rb_str_splice_0(str, start, len, val);
3492    rb_enc_associate(str, enc);
3493}
3494
3495static VALUE
3496rb_str_aset(VALUE str, VALUE indx, VALUE val)
3497{
3498    long idx, beg;
3499
3500    if (FIXNUM_P(indx)) {
3501	idx = FIX2LONG(indx);
3502      num_index:
3503	rb_str_splice(str, idx, 1, val);
3504	return val;
3505    }
3506
3507    if (SPECIAL_CONST_P(indx)) goto generic;
3508    switch (TYPE(indx)) {
3509      case T_REGEXP:
3510	rb_str_subpat_set(str, indx, INT2FIX(0), val);
3511	return val;
3512
3513      case T_STRING:
3514	beg = rb_str_index(str, indx, 0);
3515	if (beg < 0) {
3516	    rb_raise(rb_eIndexError, "string not matched");
3517	}
3518	beg = rb_str_sublen(str, beg);
3519	rb_str_splice(str, beg, str_strlen(indx, 0), val);
3520	return val;
3521
3522      generic:
3523      default:
3524	/* check if indx is Range */
3525	{
3526	    long beg, len;
3527	    if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
3528		rb_str_splice(str, beg, len, val);
3529		return val;
3530	    }
3531	}
3532	idx = NUM2LONG(indx);
3533	goto num_index;
3534    }
3535}
3536
3537/*
3538 *  call-seq:
3539 *     str[fixnum] = new_str
3540 *     str[fixnum, fixnum] = new_str
3541 *     str[range] = aString
3542 *     str[regexp] = new_str
3543 *     str[regexp, fixnum] = new_str
3544 *     str[regexp, name] = new_str
3545 *     str[other_str] = new_str
3546 *
3547 *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
3548 *  portion of the string affected is determined using the same criteria as
3549 *  <code>String#[]</code>. If the replacement string is not the same length as
3550 *  the text it is replacing, the string will be adjusted accordingly. If the
 *  regular expression or string used as the index doesn't match a position
3552 *  in the string, <code>IndexError</code> is raised. If the regular expression
3553 *  form is used, the optional second <code>Fixnum</code> allows you to specify
3554 *  which portion of the match to replace (effectively using the
 *  <code>MatchData</code> indexing rules). The forms that take a
3556 *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
3557 *  out of range; the <code>Range</code> form will raise a
3558 *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
3559 *  will raise an <code>IndexError</code> on negative match.
3560 */
3561
3562static VALUE
3563rb_str_aset_m(int argc, VALUE *argv, VALUE str)
3564{
3565    if (argc == 3) {
3566	if (RB_TYPE_P(argv[0], T_REGEXP)) {
3567	    rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
3568	}
3569	else {
3570	    rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
3571	}
3572	return argv[2];
3573    }
3574    rb_check_arity(argc, 2, 3);
3575    return rb_str_aset(str, argv[0], argv[1]);
3576}
3577
3578/*
3579 *  call-seq:
3580 *     str.insert(index, other_str)   -> str
3581 *
3582 *  Inserts <i>other_str</i> before the character at the given
3583 *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
3584 *  end of the string, and insert <em>after</em> the given character.
 *  The intent is to insert <i>other_str</i> so that it starts at the given
3586 *  <i>index</i>.
3587 *
3588 *     "abcd".insert(0, 'X')    #=> "Xabcd"
3589 *     "abcd".insert(3, 'X')    #=> "abcXd"
3590 *     "abcd".insert(4, 'X')    #=> "abcdX"
3591 *     "abcd".insert(-3, 'X')   #=> "abXcd"
3592 *     "abcd".insert(-1, 'X')   #=> "abcdX"
3593 */
3594
3595static VALUE
3596rb_str_insert(VALUE str, VALUE idx, VALUE str2)
3597{
3598    long pos = NUM2LONG(idx);
3599
3600    if (pos == -1) {
3601	return rb_str_append(str, str2);
3602    }
3603    else if (pos < 0) {
3604	pos++;
3605    }
3606    rb_str_splice(str, pos, 0, str2);
3607    return str;
3608}
3609
3610
3611/*
3612 *  call-seq:
 *     str.slice!(fixnum)           -> new_str or nil
3614 *     str.slice!(fixnum, fixnum)   -> new_str or nil
3615 *     str.slice!(range)            -> new_str or nil
3616 *     str.slice!(regexp)           -> new_str or nil
3617 *     str.slice!(other_str)        -> new_str or nil
3618 *
3619 *  Deletes the specified portion from <i>str</i>, and returns the portion
3620 *  deleted.
3621 *
3622 *     string = "this is a string"
3623 *     string.slice!(2)        #=> "i"
3624 *     string.slice!(3..6)     #=> " is "
3625 *     string.slice!(/s.*t/)   #=> "sa st"
3626 *     string.slice!("r")      #=> "r"
3627 *     string                  #=> "thing"
3628 */
3629
3630static VALUE
3631rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
3632{
3633    VALUE result;
3634    VALUE buf[3];
3635    int i;
3636
3637    rb_check_arity(argc, 1, 2);
3638    for (i=0; i<argc; i++) {
3639	buf[i] = argv[i];
3640    }
3641    str_modify_keep_cr(str);
3642    result = rb_str_aref_m(argc, buf, str);
3643    if (!NIL_P(result)) {
3644	buf[i] = rb_str_new(0,0);
3645	rb_str_aset_m(argc+1, buf, str);
3646    }
3647    return result;
3648}
3649
3650static VALUE
3651get_pat(VALUE pat, int quote)
3652{
3653    VALUE val;
3654
3655    switch (TYPE(pat)) {
3656      case T_REGEXP:
3657	return pat;
3658
3659      case T_STRING:
3660	break;
3661
3662      default:
3663	val = rb_check_string_type(pat);
3664	if (NIL_P(val)) {
3665	    Check_Type(pat, T_REGEXP);
3666	}
3667	pat = val;
3668    }
3669
3670    if (quote) {
3671	pat = rb_reg_quote(pat);
3672    }
3673
3674    return rb_reg_regcomp(pat);
3675}
3676
3677
3678/*
3679 *  call-seq:
3680 *     str.sub!(pattern, replacement)          -> str or nil
3681 *     str.sub!(pattern) {|match| block }      -> str or nil
3682 *
3683 *  Performs the same substitution as String#sub in-place.
3684 *
3685 *  Returns +str+ if a substitution was performed or +nil+ if no substitution
3686 *  was performed.
3687 */
3688
static VALUE
rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE pat, repl, hash = Qnil;
    int iter = 0;
    int tainted = 0;
    int untrusted = 0;
    long plen;
    /* With a block the replacement argument is optional. */
    int min_arity = rb_block_given_p() ? 1 : 2;

    rb_check_arity(argc, min_arity, 2);
    if (argc == 1) {
	/* block form: the replacement comes from yielding the match */
	iter = 1;
    }
    else {
	repl = argv[1];
	/* The second argument is either a Hash (looked up per match) or
	 * must be a String. */
	hash = rb_check_hash_type(argv[1]);
	if (NIL_P(hash)) {
	    StringValue(repl);
	}
	if (OBJ_TAINTED(repl)) tainted = 1;
	if (OBJ_UNTRUSTED(repl)) untrusted = 1;
    }

    /* quote=1: a String pattern is escaped so it matches literally. */
    pat = get_pat(argv[0], 1);
    str_modifiable(str);
    if (rb_reg_search(pat, str, 0, 0) >= 0) {
	rb_encoding *enc;
	int cr = ENC_CODERANGE(str);
	VALUE match = rb_backref_get();
	struct re_registers *regs = RMATCH_REGS(match);
	/* byte offsets of the whole match (BEG/END read from regs) */
	long beg0 = BEG(0);
	long end0 = END(0);
	char *p, *rp;
	long len, rlen;

	if (iter || !NIL_P(hash)) {
	    /* Snapshot pointer and length so str_mod_check can compare
	     * them after user code (block / hash lookup) has run. */
	    p = RSTRING_PTR(str); len = RSTRING_LEN(str);

            if (iter) {
                repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                repl = rb_obj_as_string(repl);
            }
	    str_mod_check(str, p, len);
	    rb_check_frozen(str);
	}
	else {
	    /* String replacement: rb_reg_regsub builds the replacement
	     * text from repl and the match registers. */
	    repl = rb_reg_regsub(repl, str, regs, pat);
	}
        enc = rb_enc_compatible(str, repl);
        if (!enc) {
	    /* The encodings are incompatible as a whole.  The result is
	     * still accepted when the parts of str that survive (before
	     * and after the match) are 7-bit; the result then takes the
	     * replacement's encoding. */
            rb_encoding *str_enc = STR_ENC_GET(str);
	    p = RSTRING_PTR(str); len = RSTRING_LEN(str);
	    if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
		coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
                rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
			 rb_enc_name(str_enc),
			 rb_enc_name(STR_ENC_GET(repl)));
            }
            enc = STR_ENC_GET(repl);
        }
	rb_str_modify(str);
	rb_enc_associate(str, enc);
	if (OBJ_TAINTED(repl)) tainted = 1;
	if (OBJ_UNTRUSTED(repl)) untrusted = 1;
	/* Derive the result's code range from the original one (captured
	 * in cr before modification) and the replacement's.  This is only
	 * done when cr lies strictly between UNKNOWN and BROKEN. */
	if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
	    int cr2 = ENC_CODERANGE(repl);
            if (cr2 == ENC_CODERANGE_BROKEN ||
                (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
                cr = ENC_CODERANGE_UNKNOWN;
            else
                cr = cr2;
	}
	plen = end0 - beg0;	/* byte length of the matched text */
	rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
	len = RSTRING_LEN(str);
	if (rlen > plen) {
	    /* the string grows: make room first */
	    RESIZE_CAPA(str, len + rlen - plen);
	}
	/* Re-read the pointer after the possible resize above. */
	p = RSTRING_PTR(str);
	if (rlen != plen) {
	    /* move the post-match tail to its final position */
	    memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
	}
	memcpy(p + beg0, rp, rlen);
	len += rlen - plen;
	STR_SET_LEN(str, len);
	RSTRING_PTR(str)[len] = '\0';
	ENC_CODERANGE_SET(str, cr);
	if (tainted) OBJ_TAINT(str);
	if (untrusted) OBJ_UNTRUST(str);

	return str;
    }
    /* no match: nothing was substituted */
    return Qnil;
}
3787
3788
3789/*
3790 *  call-seq:
3791 *     str.sub(pattern, replacement)         -> new_str
3792 *     str.sub(pattern, hash)                -> new_str
3793 *     str.sub(pattern) {|match| block }     -> new_str
3794 *
3795 *  Returns a copy of +str+ with the _first_ occurrence of +pattern+
3796 *  replaced by the second argument. The +pattern+ is typically a Regexp; if
3797 *  given as a String, any regular expression metacharacters it contains will
 *  be interpreted literally, e.g. <code>'\\\d'</code> will match a backslash
3799 *  followed by 'd', instead of a digit.
3800 *
3801 *  If +replacement+ is a String it will be substituted for the matched text.
3802 *  It may contain back-references to the pattern's capture groups of the form
3803 *  <code>"\\d"</code>, where <i>d</i> is a group number, or
3804 *  <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
3805 *  double-quoted string, both back-references must be preceded by an
3806 *  additional backslash. However, within +replacement+ the special match
 *  variables, such as <code>$&</code>, will not refer to the current match.
3808 *
3809 *  If the second argument is a Hash, and the matched text is one of its keys,
3810 *  the corresponding value is the replacement string.
3811 *
3812 *  In the block form, the current match string is passed in as a parameter,
3813 *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3814 *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3815 *  returned by the block will be substituted for the match on each call.
3816 *
3817 *  The result inherits any tainting in the original string or any supplied
3818 *  replacement string.
3819 *
3820 *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
3821 *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
3822 *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
3823 *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
3824 *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
3825 *      #=> "Is /bin/bash your preferred shell?"
3826 */
3827
3828static VALUE
3829rb_str_sub(int argc, VALUE *argv, VALUE str)
3830{
3831    str = rb_str_dup(str);
3832    rb_str_sub_bang(argc, argv, str);
3833    return str;
3834}
3835
/*
 * Shared implementation of String#gsub and String#gsub!.
 *
 * argv[0] is the pattern; argv[1], when present, is a replacement String
 * or a Hash.  With a single argument the replacement comes from the block
 * (RETURN_ENUMERATOR handles the no-block case).
 *
 * bang != 0: the result replaces the contents of +str+; returns +str+, or
 *            nil when nothing matched.
 * bang == 0: returns a new string of the receiver's class (a plain
 *            duplicate when nothing matched).
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val, repl, match, dest, hash = Qnil;
    struct re_registers *regs;
    long beg, n;
    long beg0, end0;
    long offset, blen, slen, len, last;
    int iter = 0;
    char *sp, *cp;
    int tainted = 0;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
	RETURN_ENUMERATOR(str, argc, argv);
	iter = 1;
	break;
      case 2:
	repl = argv[1];
	hash = rb_check_hash_type(argv[1]);
	if (NIL_P(hash)) {
	    StringValue(repl);
	}
	if (OBJ_TAINTED(repl)) tainted = 1;
	break;
      default:
	/* any other argc is outside 1..2 */
	rb_check_arity(argc, 1, 2);
    }

    /* quote=1: a String pattern is escaped so it matches literally. */
    pat = get_pat(argv[0], 1);
    beg = rb_reg_search(pat, str, 0, 0);
    if (beg < 0) {
	if (bang) return Qnil;	/* no match, no substitution */
	return rb_str_dup(str);
    }

    offset = 0;		/* byte offset in str up to which input was consumed */
    n = 0;		/* match counter; incremented below but not read here */
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen snapshot the source buffer for str_mod_check below. */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
	n++;
	match = rb_backref_get();
	regs = RMATCH_REGS(match);
	beg0 = BEG(0);
	end0 = END(0);
	if (iter || !NIL_P(hash)) {
            if (iter) {
                val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
                val = rb_obj_as_string(val);
            }
	    /* User code ran: make sure it did not modify str. */
	    str_mod_check(str, sp, slen);
	    if (val == dest) { 	/* paranoid check [ruby-dev:24827] */
		rb_raise(rb_eRuntimeError, "block should not cheat");
	    }
	}
	else {
	    val = rb_reg_regsub(repl, str, regs, pat);
	}

	if (OBJ_TAINTED(val)) tainted = 1;

	len = beg0 - offset;	/* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

	/* Remember where this search started; it is used after the loop. */
	last = offset;
	offset = end0;
	if (beg0 == end0) {
	    /*
	     * Always consume at least one character of the input string
	     * in order to prevent infinite loops.
	     */
	    if (RSTRING_LEN(str) <= end0) break;
	    len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
	    offset = end0 + len;
	}
	cp = RSTRING_PTR(str) + offset;
	if (offset > RSTRING_LEN(str)) break;
	beg = rb_reg_search(pat, str, offset, 0);
    } while (beg >= 0);
    /* copy whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* Search again from the start offset of the last successful match;
     * the result is unused, so this is presumably done to leave the
     * backref pointing at that match -- confirm. */
    rb_reg_search(pat, str, last, 0);
    if (bang) {
        rb_str_shared_replace(str, dest);
    }
    else {
	/* the new string takes the receiver's class and taint */
	RBASIC(dest)->klass = rb_obj_class(str);
	OBJ_INFECT(dest, str);
	str = dest;
    }

    if (tainted) OBJ_TAINT(str);
    return str;
}
3948
3949
3950/*
3951 *  call-seq:
3952 *     str.gsub!(pattern, replacement)        -> str or nil
3953 *     str.gsub!(pattern) {|match| block }    -> str or nil
3954 *     str.gsub!(pattern)                     -> an_enumerator
3955 *
3956 *  Performs the substitutions of <code>String#gsub</code> in place, returning
3957 *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
3958 *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
3959 */
3960
static VALUE
rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
{
    /* Run the modification check up front, before any matching is done,
     * then let str_gsub work in place (bang = 1). */
    str_modify_keep_cr(str);
    return str_gsub(argc, argv, str, 1);
}
3967
3968
3969/*
3970 *  call-seq:
3971 *     str.gsub(pattern, replacement)       -> new_str
3972 *     str.gsub(pattern, hash)              -> new_str
3973 *     str.gsub(pattern) {|match| block }   -> new_str
3974 *     str.gsub(pattern)                    -> enumerator
3975 *
 *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
3977 *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
3978 *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
3979 *  regular expression metacharacters it contains will be interpreted
 *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
3981 *  instead of a digit.
3982 *
3983 *  If <i>replacement</i> is a <code>String</code> it will be substituted for
3984 *  the matched text. It may contain back-references to the pattern's capture
3985 *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
3986 *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
3987 *  double-quoted string, both back-references must be preceded by an
3988 *  additional backslash. However, within <i>replacement</i> the special match
3989 *  variables, such as <code>$&</code>, will not refer to the current match.
3990 *
3991 *  If the second argument is a <code>Hash</code>, and the matched text is one
3992 *  of its keys, the corresponding value is the replacement string.
3993 *
3994 *  In the block form, the current match string is passed in as a parameter,
3995 *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3996 *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3997 *  returned by the block will be substituted for the match on each call.
3998 *
3999 *  The result inherits any tainting in the original string or any supplied
4000 *  replacement string.
4001 *
4002 *  When neither a block nor a second argument is supplied, an
4003 *  <code>Enumerator</code> is returned.
4004 *
4005 *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
4006 *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
4007 *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
4008 *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
4009 *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
4010 */
4011
static VALUE
rb_str_gsub(int argc, VALUE *argv, VALUE str)
{
    /* bang = 0: the receiver is left untouched and a new string is returned. */
    return str_gsub(argc, argv, str, 0);
}
4017
4018
4019/*
4020 *  call-seq:
4021 *     str.replace(other_str)   -> str
4022 *
4023 *  Replaces the contents and taintedness of <i>str</i> with the corresponding
4024 *  values in <i>other_str</i>.
4025 *
4026 *     s = "hello"         #=> "hello"
4027 *     s.replace "world"   #=> "world"
4028 */
4029
VALUE
rb_str_replace(VALUE str, VALUE str2)
{
    /* The modifiability check comes first, so it is performed even when
     * the two arguments are the same object. */
    str_modifiable(str);
    /* Replacing a string with itself is a no-op. */
    if (str == str2) return str;

    StringValue(str2);
    /* Discard the current contents before taking over those of str2. */
    str_discard(str);
    return str_replace(str, str2);
}
4040
4041/*
4042 *  call-seq:
4043 *     string.clear    ->  string
4044 *
4045 *  Makes string empty.
4046 *
4047 *     a = "abcde"
4048 *     a.clear    #=> ""
4049 */
4050
4051static VALUE
4052rb_str_clear(VALUE str)
4053{
4054    str_discard(str);
4055    STR_SET_EMBED(str);
4056    STR_SET_EMBED_LEN(str, 0);
4057    RSTRING_PTR(str)[0] = 0;
4058    if (rb_enc_asciicompat(STR_ENC_GET(str)))
4059	ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
4060    else
4061	ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
4062    return str;
4063}
4064
4065/*
4066 *  call-seq:
4067 *     string.chr    ->  string
4068 *
4069 *  Returns a one-character string at the beginning of the string.
4070 *
4071 *     a = "abcde"
4072 *     a.chr    #=> "a"
4073 */
4074
static VALUE
rb_str_chr(VALUE str)
{
    /* The substring starting at index 0 with length 1. */
    return rb_str_substr(str, 0, 1);
}
4080
4081/*
4082 *  call-seq:
4083 *     str.getbyte(index)          -> 0 .. 255
4084 *
4085 *  returns the <i>index</i>th byte as an integer.
4086 */
4087static VALUE
4088rb_str_getbyte(VALUE str, VALUE index)
4089{
4090    long pos = NUM2LONG(index);
4091
4092    if (pos < 0)
4093        pos += RSTRING_LEN(str);
4094    if (pos < 0 ||  RSTRING_LEN(str) <= pos)
4095        return Qnil;
4096
4097    return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
4098}
4099
4100/*
4101 *  call-seq:
4102 *     str.setbyte(index, integer) -> integer
4103 *
4104 *  modifies the <i>index</i>th byte as <i>integer</i>.
4105 */
4106static VALUE
4107rb_str_setbyte(VALUE str, VALUE index, VALUE value)
4108{
4109    long pos = NUM2LONG(index);
4110    int byte = NUM2INT(value);
4111
4112    rb_str_modify(str);
4113
4114    if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
4115        rb_raise(rb_eIndexError, "index %ld out of string", pos);
4116    if (pos < 0)
4117        pos += RSTRING_LEN(str);
4118
4119    RSTRING_PTR(str)[pos] = byte;
4120
4121    return value;
4122}
4123
4124static VALUE
4125str_byte_substr(VALUE str, long beg, long len)
4126{
4127    char *p, *s = RSTRING_PTR(str);
4128    long n = RSTRING_LEN(str);
4129    VALUE str2;
4130
4131    if (beg > n || len < 0) return Qnil;
4132    if (beg < 0) {
4133	beg += n;
4134	if (beg < 0) return Qnil;
4135    }
4136    if (beg + len > n)
4137	len = n - beg;
4138    if (len <= 0) {
4139	len = 0;
4140	p = 0;
4141    }
4142    else
4143	p = s + beg;
4144
4145    if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
4146	str2 = rb_str_new4(str);
4147	str2 = str_new3(rb_obj_class(str2), str2);
4148	RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
4149	RSTRING(str2)->as.heap.len = len;
4150    }
4151    else {
4152	str2 = rb_str_new5(str, p, len);
4153    }
4154
4155    str_enc_copy(str2, str);
4156
4157    if (RSTRING_LEN(str2) == 0) {
4158	if (!rb_enc_asciicompat(STR_ENC_GET(str)))
4159	    ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
4160	else
4161	    ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
4162    }
4163    else {
4164	switch (ENC_CODERANGE(str)) {
4165	  case ENC_CODERANGE_7BIT:
4166	    ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
4167	    break;
4168	  default:
4169	    ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
4170	    break;
4171	}
4172    }
4173
4174    OBJ_INFECT(str2, str);
4175
4176    return str2;
4177}
4178
4179static VALUE
4180str_byte_aref(VALUE str, VALUE indx)
4181{
4182    long idx;
4183    switch (TYPE(indx)) {
4184      case T_FIXNUM:
4185	idx = FIX2LONG(indx);
4186
4187      num_index:
4188	str = str_byte_substr(str, idx, 1);
4189	if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
4190	return str;
4191
4192      default:
4193	/* check if indx is Range */
4194	{
4195	    long beg, len = RSTRING_LEN(str);
4196
4197	    switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4198	      case Qfalse:
4199		break;
4200	      case Qnil:
4201		return Qnil;
4202	      default:
4203		return str_byte_substr(str, beg, len);
4204	    }
4205	}
4206	idx = NUM2LONG(indx);
4207	goto num_index;
4208    }
4209
4210    UNREACHABLE;
4211}
4212
4213/*
4214 *  call-seq:
4215 *     str.byteslice(fixnum)           -> new_str or nil
4216 *     str.byteslice(fixnum, fixnum)   -> new_str or nil
4217 *     str.byteslice(range)            -> new_str or nil
4218 *
4219 *  Byte Reference---If passed a single <code>Fixnum</code>, returns a
4220 *  substring of one byte at that position. If passed two <code>Fixnum</code>
4221 *  objects, returns a substring starting at the offset given by the first, and
4222 *  a length given by the second. If given a <code>Range</code>, a substring containing
4223 *  bytes at offsets given by the range is returned. In all three cases, if
4224 *  an offset is negative, it is counted from the end of <i>str</i>. Returns
4225 *  <code>nil</code> if the initial offset falls outside the string, the length
4226 *  is negative, or the beginning of the range is greater than the end.
 *  The resulting string keeps the encoding of the original string.
4228 *
4229 *     "hello".byteslice(1)     #=> "e"
4230 *     "hello".byteslice(-1)    #=> "o"
4231 *     "hello".byteslice(1, 2)  #=> "el"
4232 *     "\x80\u3042".byteslice(1, 3) #=> "\u3042"
4233 *     "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
4234 */
4235
4236static VALUE
4237rb_str_byteslice(int argc, VALUE *argv, VALUE str)
4238{
4239    if (argc == 2) {
4240	return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
4241    }
4242    rb_check_arity(argc, 1, 2);
4243    return str_byte_aref(str, argv[0]);
4244}
4245
4246/*
4247 *  call-seq:
4248 *     str.reverse   -> new_str
4249 *
4250 *  Returns a new string with the characters from <i>str</i> in reverse order.
4251 *
4252 *     "stressed".reverse   #=> "desserts"
4253 */
4254
4255static VALUE
4256rb_str_reverse(VALUE str)
4257{
4258    rb_encoding *enc;
4259    VALUE rev;
4260    char *s, *e, *p;
4261    int single = 1;
4262
4263    if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
4264    enc = STR_ENC_GET(str);
4265    rev = rb_str_new5(str, 0, RSTRING_LEN(str));
4266    s = RSTRING_PTR(str); e = RSTRING_END(str);
4267    p = RSTRING_END(rev);
4268
4269    if (RSTRING_LEN(str) > 1) {
4270	if (single_byte_optimizable(str)) {
4271	    while (s < e) {
4272		*--p = *s++;
4273	    }
4274	}
4275	else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
4276	    while (s < e) {
4277		int clen = rb_enc_fast_mbclen(s, e, enc);
4278
4279		if (clen > 1 || (*s & 0x80)) single = 0;
4280		p -= clen;
4281		memcpy(p, s, clen);
4282		s += clen;
4283	    }
4284	}
4285	else {
4286	    while (s < e) {
4287		int clen = rb_enc_mbclen(s, e, enc);
4288
4289		if (clen > 1 || (*s & 0x80)) single = 0;
4290		p -= clen;
4291		memcpy(p, s, clen);
4292		s += clen;
4293	    }
4294	}
4295    }
4296    STR_SET_LEN(rev, RSTRING_LEN(str));
4297    OBJ_INFECT(rev, str);
4298    if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
4299	if (single) {
4300	    ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
4301	}
4302	else {
4303	    ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
4304	}
4305    }
4306    rb_enc_cr_str_copy_for_substr(rev, str);
4307
4308    return rev;
4309}
4310
4311
4312/*
4313 *  call-seq:
4314 *     str.reverse!   -> str
4315 *
4316 *  Reverses <i>str</i> in place.
4317 */
4318
4319static VALUE
4320rb_str_reverse_bang(VALUE str)
4321{
4322    if (RSTRING_LEN(str) > 1) {
4323	if (single_byte_optimizable(str)) {
4324	    char *s, *e, c;
4325
4326	    str_modify_keep_cr(str);
4327	    s = RSTRING_PTR(str);
4328	    e = RSTRING_END(str) - 1;
4329	    while (s < e) {
4330		c = *s;
4331		*s++ = *e;
4332		*e-- = c;
4333	    }
4334	}
4335	else {
4336	    rb_str_shared_replace(str, rb_str_reverse(str));
4337	}
4338    }
4339    else {
4340	str_modify_keep_cr(str);
4341    }
4342    return str;
4343}
4344
4345
4346/*
4347 *  call-seq:
4348 *     str.include? other_str   -> true or false
4349 *
4350 *  Returns <code>true</code> if <i>str</i> contains the given string or
4351 *  character.
4352 *
4353 *     "hello".include? "lo"   #=> true
4354 *     "hello".include? "ol"   #=> false
4355 *     "hello".include? ?h     #=> true
4356 */
4357
4358static VALUE
4359rb_str_include(VALUE str, VALUE arg)
4360{
4361    long i;
4362
4363    StringValue(arg);
4364    i = rb_str_index(str, arg, 0);
4365
4366    if (i == -1) return Qfalse;
4367    return Qtrue;
4368}
4369
4370
4371/*
4372 *  call-seq:
4373 *     str.to_i(base=10)   -> integer
4374 *
4375 *  Returns the result of interpreting leading characters in <i>str</i> as an
4376 *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
4377 *  end of a valid number are ignored. If there is not a valid number at the
4378 *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
4379 *  exception when <i>base</i> is valid.
4380 *
4381 *     "12345".to_i             #=> 12345
4382 *     "99 red balloons".to_i   #=> 99
4383 *     "0a".to_i                #=> 0
4384 *     "0a".to_i(16)            #=> 10
4385 *     "hello".to_i             #=> 0
4386 *     "1100101".to_i(2)        #=> 101
4387 *     "1100101".to_i(8)        #=> 294977
4388 *     "1100101".to_i(10)       #=> 1100101
4389 *     "1100101".to_i(16)       #=> 17826049
4390 */
4391
4392static VALUE
4393rb_str_to_i(int argc, VALUE *argv, VALUE str)
4394{
4395    int base;
4396
4397    if (argc == 0) base = 10;
4398    else {
4399	VALUE b;
4400
4401	rb_scan_args(argc, argv, "01", &b);
4402	base = NUM2INT(b);
4403    }
4404    if (base < 0) {
4405	rb_raise(rb_eArgError, "invalid radix %d", base);
4406    }
4407    return rb_str_to_inum(str, base, FALSE);
4408}
4409
4410
4411/*
4412 *  call-seq:
4413 *     str.to_f   -> float
4414 *
4415 *  Returns the result of interpreting leading characters in <i>str</i> as a
4416 *  floating point number. Extraneous characters past the end of a valid number
4417 *  are ignored. If there is not a valid number at the start of <i>str</i>,
4418 *  <code>0.0</code> is returned. This method never raises an exception.
4419 *
4420 *     "123.45e1".to_f        #=> 1234.5
4421 *     "45.67 degrees".to_f   #=> 45.67
4422 *     "thx1138".to_f         #=> 0.0
4423 */
4424
static VALUE
rb_str_to_f(VALUE str)
{
    /* The FALSE argument presumably selects the lenient conversion in
     * rb_str_to_dbl (no exception on malformed input) -- confirm against
     * its definition.  The resulting double is wrapped with DBL2NUM. */
    return DBL2NUM(rb_str_to_dbl(str, FALSE));
}
4430
4431
4432/*
4433 *  call-seq:
4434 *     str.to_s     -> str
4435 *     str.to_str   -> str
4436 *
 *  Returns the receiver. If the receiver is an instance of a subclass of
 *  String, a String duplicate of it is returned instead.
4438 */
4439
4440static VALUE
4441rb_str_to_s(VALUE str)
4442{
4443    if (rb_obj_class(str) != rb_cString) {
4444	return str_duplicate(rb_cString, str);
4445    }
4446    return str;
4447}
4448
/*
 * Disabled helper (excluded from compilation by the #if 0 below).
 * Writes character +c+ into a temporary buffer with rb_enc_mbcput() and
 * appends the rb_enc_codelen(c, enc) resulting bytes to +str+.
 */
#if 0
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
4460
4461#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
4462
4463int
4464rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
4465{
4466    char buf[CHAR_ESC_LEN + 1];
4467    int l;
4468
4469#if SIZEOF_INT > 4
4470    c &= 0xffffffff;
4471#endif
4472    if (unicode_p) {
4473	if (c < 0x7F && ISPRINT(c)) {
4474	    snprintf(buf, CHAR_ESC_LEN, "%c", c);
4475	}
4476	else if (c < 0x10000) {
4477	    snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
4478	}
4479	else {
4480	    snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
4481	}
4482    }
4483    else {
4484	if (c < 0x100) {
4485	    snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
4486	}
4487	else {
4488	    snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
4489	}
4490    }
4491    l = (int)strlen(buf);	/* CHAR_ESC_LEN cannot exceed INT_MAX */
4492    rb_str_buf_cat(result, buf, l);
4493    return l;
4494}
4495
4496/*
4497 * call-seq:
4498 *   str.inspect   -> string
4499 *
4500 * Returns a printable version of _str_, surrounded by quote marks,
4501 * with special characters escaped.
4502 *
4503 *    str = "hello"
4504 *    str[3] = "\b"
4505 *    str.inspect       #=> "\"hel\\bo\""
4506 */
4507
4508VALUE
4509rb_str_inspect(VALUE str)
4510{
4511    rb_encoding *enc = STR_ENC_GET(str);
4512    const char *p, *pend, *prev;
4513    char buf[CHAR_ESC_LEN + 1];
4514    VALUE result = rb_str_buf_new(0);
4515    rb_encoding *resenc = rb_default_internal_encoding();
4516    int unicode_p = rb_enc_unicode_p(enc);
4517    int asciicompat = rb_enc_asciicompat(enc);
4518    static rb_encoding *utf16, *utf32;
4519
4520    if (!utf16) utf16 = rb_enc_find("UTF-16");
4521    if (!utf32) utf32 = rb_enc_find("UTF-32");
4522    if (resenc == NULL) resenc = rb_default_external_encoding();
4523    if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
4524    rb_enc_associate(result, resenc);
4525    str_buf_cat2(result, "\"");
4526
4527    p = RSTRING_PTR(str); pend = RSTRING_END(str);
4528    prev = p;
4529    if (enc == utf16) {
4530	const unsigned char *q = (const unsigned char *)p;
4531	if (q[0] == 0xFE && q[1] == 0xFF)
4532	    enc = rb_enc_find("UTF-16BE");
4533	else if (q[0] == 0xFF && q[1] == 0xFE)
4534	    enc = rb_enc_find("UTF-16LE");
4535	else
4536	    unicode_p = 0;
4537    }
4538    else if (enc == utf32) {
4539	const unsigned char *q = (const unsigned char *)p;
4540	if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
4541	    enc = rb_enc_find("UTF-32BE");
4542	else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
4543	    enc = rb_enc_find("UTF-32LE");
4544	else
4545	    unicode_p = 0;
4546    }
4547    while (p < pend) {
4548	unsigned int c, cc;
4549	int n;
4550
4551        n = rb_enc_precise_mbclen(p, pend, enc);
4552        if (!MBCLEN_CHARFOUND_P(n)) {
4553	    if (p > prev) str_buf_cat(result, prev, p - prev);
4554            n = rb_enc_mbminlen(enc);
4555            if (pend < p + n)
4556                n = (int)(pend - p);
4557            while (n--) {
4558                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
4559                str_buf_cat(result, buf, strlen(buf));
4560                prev = ++p;
4561            }
4562	    continue;
4563	}
4564        n = MBCLEN_CHARFOUND_LEN(n);
4565	c = rb_enc_mbc_to_codepoint(p, pend, enc);
4566	p += n;
4567	if ((asciicompat || unicode_p) &&
4568	  (c == '"'|| c == '\\' ||
4569	    (c == '#' &&
4570             p < pend &&
4571             MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
4572             (cc = rb_enc_codepoint(p,pend,enc),
4573              (cc == '$' || cc == '@' || cc == '{'))))) {
4574	    if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4575	    str_buf_cat2(result, "\\");
4576	    if (asciicompat || enc == resenc) {
4577		prev = p - n;
4578		continue;
4579	    }
4580	}
4581	switch (c) {
4582	  case '\n': cc = 'n'; break;
4583	  case '\r': cc = 'r'; break;
4584	  case '\t': cc = 't'; break;
4585	  case '\f': cc = 'f'; break;
4586	  case '\013': cc = 'v'; break;
4587	  case '\010': cc = 'b'; break;
4588	  case '\007': cc = 'a'; break;
4589	  case 033: cc = 'e'; break;
4590	  default: cc = 0; break;
4591	}
4592	if (cc) {
4593	    if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4594	    buf[0] = '\\';
4595	    buf[1] = (char)cc;
4596	    str_buf_cat(result, buf, 2);
4597	    prev = p;
4598	    continue;
4599	}
4600	if ((enc == resenc && rb_enc_isprint(c, enc)) ||
4601	    (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
4602	    continue;
4603	}
4604	else {
4605	    if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4606	    rb_str_buf_cat_escaped_char(result, c, unicode_p);
4607	    prev = p;
4608	    continue;
4609	}
4610    }
4611    if (p > prev) str_buf_cat(result, prev, p - prev);
4612    str_buf_cat2(result, "\"");
4613
4614    OBJ_INFECT(result, str);
4615    return result;
4616}
4617
4618#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
4619
4620/*
4621 *  call-seq:
4622 *     str.dump   -> new_str
4623 *
4624 *  Produces a version of +str+ with all non-printing characters replaced by
4625 *  <code>\nnn</code> notation and all special characters escaped.
4626 *
4627 *    "hello \n ''".dump  #=> "\"hello \\n ''\"
4628 */
4629
4630VALUE
4631rb_str_dump(VALUE str)
4632{
4633    rb_encoding *enc = rb_enc_get(str);
4634    long len;
4635    const char *p, *pend;
4636    char *q, *qend;
4637    VALUE result;
4638    int u8 = (enc == rb_utf8_encoding());
4639
4640    len = 2;			/* "" */
4641    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4642    while (p < pend) {
4643	unsigned char c = *p++;
4644	switch (c) {
4645	  case '"':  case '\\':
4646	  case '\n': case '\r':
4647	  case '\t': case '\f':
4648	  case '\013': case '\010': case '\007': case '\033':
4649	    len += 2;
4650	    break;
4651
4652	  case '#':
4653	    len += IS_EVSTR(p, pend) ? 2 : 1;
4654	    break;
4655
4656	  default:
4657	    if (ISPRINT(c)) {
4658		len++;
4659	    }
4660	    else {
4661		if (u8) {	/* \u{NN} */
4662		    int n = rb_enc_precise_mbclen(p-1, pend, enc);
4663		    if (MBCLEN_CHARFOUND_P(n-1)) {
4664			unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4665			while (cc >>= 4) len++;
4666			len += 5;
4667			p += MBCLEN_CHARFOUND_LEN(n)-1;
4668			break;
4669		    }
4670		}
4671		len += 4;	/* \xNN */
4672	    }
4673	    break;
4674	}
4675    }
4676    if (!rb_enc_asciicompat(enc)) {
4677	len += 19;		/* ".force_encoding('')" */
4678	len += strlen(enc->name);
4679    }
4680
4681    result = rb_str_new5(str, 0, len);
4682    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4683    q = RSTRING_PTR(result); qend = q + len + 1;
4684
4685    *q++ = '"';
4686    while (p < pend) {
4687	unsigned char c = *p++;
4688
4689	if (c == '"' || c == '\\') {
4690	    *q++ = '\\';
4691	    *q++ = c;
4692	}
4693	else if (c == '#') {
4694	    if (IS_EVSTR(p, pend)) *q++ = '\\';
4695	    *q++ = '#';
4696	}
4697	else if (c == '\n') {
4698	    *q++ = '\\';
4699	    *q++ = 'n';
4700	}
4701	else if (c == '\r') {
4702	    *q++ = '\\';
4703	    *q++ = 'r';
4704	}
4705	else if (c == '\t') {
4706	    *q++ = '\\';
4707	    *q++ = 't';
4708	}
4709	else if (c == '\f') {
4710	    *q++ = '\\';
4711	    *q++ = 'f';
4712	}
4713	else if (c == '\013') {
4714	    *q++ = '\\';
4715	    *q++ = 'v';
4716	}
4717	else if (c == '\010') {
4718	    *q++ = '\\';
4719	    *q++ = 'b';
4720	}
4721	else if (c == '\007') {
4722	    *q++ = '\\';
4723	    *q++ = 'a';
4724	}
4725	else if (c == '\033') {
4726	    *q++ = '\\';
4727	    *q++ = 'e';
4728	}
4729	else if (ISPRINT(c)) {
4730	    *q++ = c;
4731	}
4732	else {
4733	    *q++ = '\\';
4734	    if (u8) {
4735		int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
4736		if (MBCLEN_CHARFOUND_P(n)) {
4737		    int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4738		    p += n;
4739		    snprintf(q, qend-q, "u{%x}", cc);
4740		    q += strlen(q);
4741		    continue;
4742		}
4743	    }
4744	    snprintf(q, qend-q, "x%02X", c);
4745	    q += 3;
4746	}
4747    }
4748    *q++ = '"';
4749    *q = '\0';
4750    if (!rb_enc_asciicompat(enc)) {
4751	snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
4752	enc = rb_ascii8bit_encoding();
4753    }
4754    OBJ_INFECT(result, str);
4755    /* result from dump is ASCII */
4756    rb_enc_associate(result, enc);
4757    ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
4758    return result;
4759}
4760
4761
4762static void
4763rb_str_check_dummy_enc(rb_encoding *enc)
4764{
4765    if (rb_enc_dummy_p(enc)) {
4766	rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
4767		 rb_enc_name(enc));
4768    }
4769}
4770
4771/*
4772 *  call-seq:
4773 *     str.upcase!   -> str or nil
4774 *
4775 *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
4776 *  were made.
4777 *  Note: case replacement is effective only in ASCII region.
4778 */
4779
4780static VALUE
4781rb_str_upcase_bang(VALUE str)
4782{
4783    rb_encoding *enc;
4784    char *s, *send;
4785    int modify = 0;
4786    int n;
4787
4788    str_modify_keep_cr(str);
4789    enc = STR_ENC_GET(str);
4790    rb_str_check_dummy_enc(enc);
4791    s = RSTRING_PTR(str); send = RSTRING_END(str);
4792    if (single_byte_optimizable(str)) {
4793	while (s < send) {
4794	    unsigned int c = *(unsigned char*)s;
4795
4796	    if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4797		*s = 'A' + (c - 'a');
4798		modify = 1;
4799	    }
4800	    s++;
4801	}
4802    }
4803    else {
4804	int ascompat = rb_enc_asciicompat(enc);
4805
4806	while (s < send) {
4807	    unsigned int c;
4808
4809	    if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4810		if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4811		    *s = 'A' + (c - 'a');
4812		    modify = 1;
4813		}
4814		s++;
4815	    }
4816	    else {
4817		c = rb_enc_codepoint_len(s, send, &n, enc);
4818		if (rb_enc_islower(c, enc)) {
4819		    /* assuming toupper returns codepoint with same size */
4820		    rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4821		    modify = 1;
4822		}
4823		s += n;
4824	    }
4825	}
4826    }
4827
4828    if (modify) return str;
4829    return Qnil;
4830}
4831
4832
4833/*
4834 *  call-seq:
4835 *     str.upcase   -> new_str
4836 *
4837 *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
4838 *  uppercase counterparts. The operation is locale insensitive---only
4839 *  characters ``a'' to ``z'' are affected.
4840 *  Note: case replacement is effective only in ASCII region.
4841 *
4842 *     "hEllO".upcase   #=> "HELLO"
4843 */
4844
4845static VALUE
4846rb_str_upcase(VALUE str)
4847{
4848    str = rb_str_dup(str);
4849    rb_str_upcase_bang(str);
4850    return str;
4851}
4852
4853
4854/*
4855 *  call-seq:
4856 *     str.downcase!   -> str or nil
4857 *
4858 *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4859 *  changes were made.
4860 *  Note: case replacement is effective only in ASCII region.
4861 */
4862
4863static VALUE
4864rb_str_downcase_bang(VALUE str)
4865{
4866    rb_encoding *enc;
4867    char *s, *send;
4868    int modify = 0;
4869
4870    str_modify_keep_cr(str);
4871    enc = STR_ENC_GET(str);
4872    rb_str_check_dummy_enc(enc);
4873    s = RSTRING_PTR(str); send = RSTRING_END(str);
4874    if (single_byte_optimizable(str)) {
4875	while (s < send) {
4876	    unsigned int c = *(unsigned char*)s;
4877
4878	    if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4879		*s = 'a' + (c - 'A');
4880		modify = 1;
4881	    }
4882	    s++;
4883	}
4884    }
4885    else {
4886	int ascompat = rb_enc_asciicompat(enc);
4887
4888	while (s < send) {
4889	    unsigned int c;
4890	    int n;
4891
4892	    if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4893		if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4894		    *s = 'a' + (c - 'A');
4895		    modify = 1;
4896		}
4897		s++;
4898	    }
4899	    else {
4900		c = rb_enc_codepoint_len(s, send, &n, enc);
4901		if (rb_enc_isupper(c, enc)) {
4902		    /* assuming toupper returns codepoint with same size */
4903		    rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4904		    modify = 1;
4905		}
4906		s += n;
4907	    }
4908	}
4909    }
4910
4911    if (modify) return str;
4912    return Qnil;
4913}
4914
4915
4916/*
4917 *  call-seq:
4918 *     str.downcase   -> new_str
4919 *
4920 *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
4921 *  lowercase counterparts. The operation is locale insensitive---only
4922 *  characters ``A'' to ``Z'' are affected.
4923 *  Note: case replacement is effective only in ASCII region.
4924 *
4925 *     "hEllO".downcase   #=> "hello"
4926 */
4927
4928static VALUE
4929rb_str_downcase(VALUE str)
4930{
4931    str = rb_str_dup(str);
4932    rb_str_downcase_bang(str);
4933    return str;
4934}
4935
4936
4937/*
4938 *  call-seq:
4939 *     str.capitalize!   -> str or nil
4940 *
4941 *  Modifies <i>str</i> by converting the first character to uppercase and the
4942 *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
4943 *  Note: case conversion is effective only in ASCII region.
4944 *
4945 *     a = "hello"
4946 *     a.capitalize!   #=> "Hello"
4947 *     a               #=> "Hello"
4948 *     a.capitalize!   #=> nil
4949 */
4950
4951static VALUE
4952rb_str_capitalize_bang(VALUE str)
4953{
4954    rb_encoding *enc;
4955    char *s, *send;
4956    int modify = 0;
4957    unsigned int c;
4958    int n;
4959
4960    str_modify_keep_cr(str);
4961    enc = STR_ENC_GET(str);
4962    rb_str_check_dummy_enc(enc);
4963    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4964    s = RSTRING_PTR(str); send = RSTRING_END(str);
4965
4966    c = rb_enc_codepoint_len(s, send, &n, enc);
4967    if (rb_enc_islower(c, enc)) {
4968	rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4969	modify = 1;
4970    }
4971    s += n;
4972    while (s < send) {
4973	c = rb_enc_codepoint_len(s, send, &n, enc);
4974	if (rb_enc_isupper(c, enc)) {
4975	    rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4976	    modify = 1;
4977	}
4978	s += n;
4979    }
4980
4981    if (modify) return str;
4982    return Qnil;
4983}
4984
4985
4986/*
4987 *  call-seq:
4988 *     str.capitalize   -> new_str
4989 *
4990 *  Returns a copy of <i>str</i> with the first character converted to uppercase
4991 *  and the remainder to lowercase.
4992 *  Note: case conversion is effective only in ASCII region.
4993 *
4994 *     "hello".capitalize    #=> "Hello"
4995 *     "HELLO".capitalize    #=> "Hello"
4996 *     "123ABC".capitalize   #=> "123abc"
4997 */
4998
4999static VALUE
5000rb_str_capitalize(VALUE str)
5001{
5002    str = rb_str_dup(str);
5003    rb_str_capitalize_bang(str);
5004    return str;
5005}
5006
5007
5008/*
5009 *  call-seq:
5010 *     str.swapcase!   -> str or nil
5011 *
5012 *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
5013 *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
5014 *  Note: case conversion is effective only in ASCII region.
5015 */
5016
5017static VALUE
5018rb_str_swapcase_bang(VALUE str)
5019{
5020    rb_encoding *enc;
5021    char *s, *send;
5022    int modify = 0;
5023    int n;
5024
5025    str_modify_keep_cr(str);
5026    enc = STR_ENC_GET(str);
5027    rb_str_check_dummy_enc(enc);
5028    s = RSTRING_PTR(str); send = RSTRING_END(str);
5029    while (s < send) {
5030	unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
5031
5032	if (rb_enc_isupper(c, enc)) {
5033	    /* assuming toupper returns codepoint with same size */
5034	    rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
5035	    modify = 1;
5036	}
5037	else if (rb_enc_islower(c, enc)) {
5038	    /* assuming tolower returns codepoint with same size */
5039	    rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
5040	    modify = 1;
5041	}
5042	s += n;
5043    }
5044
5045    if (modify) return str;
5046    return Qnil;
5047}
5048
5049
5050/*
5051 *  call-seq:
5052 *     str.swapcase   -> new_str
5053 *
5054 *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
5055 *  to lowercase and lowercase characters converted to uppercase.
5056 *  Note: case conversion is effective only in ASCII region.
5057 *
5058 *     "Hello".swapcase          #=> "hELLO"
5059 *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
5060 */
5061
5062static VALUE
5063rb_str_swapcase(VALUE str)
5064{
5065    str = rb_str_dup(str);
5066    rb_str_swapcase_bang(str);
5067    return str;
5068}
5069
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;			/* non-zero while a "c1-c2" range is being expanded */
    unsigned int now;		/* code point most recently produced */
    unsigned int max;		/* last code point of the range being expanded */
    char *p;			/* next unread byte of the specification */
    char *pend;			/* end of the specification */
};
5077
5078static unsigned int
5079trnext(struct tr *t, rb_encoding *enc)
5080{
5081    int n;
5082
5083    for (;;) {
5084	if (!t->gen) {
5085nextpart:
5086	    if (t->p == t->pend) return -1;
5087	    if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
5088		t->p += n;
5089	    }
5090	    t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
5091	    t->p += n;
5092	    if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
5093		t->p += n;
5094		if (t->p < t->pend) {
5095		    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
5096		    t->p += n;
5097		    if (t->now > c) {
5098			if (t->now < 0x80 && c < 0x80) {
5099			    rb_raise(rb_eArgError,
5100				     "invalid range \"%c-%c\" in string transliteration",
5101				     t->now, c);
5102			}
5103			else {
5104			    rb_raise(rb_eArgError, "invalid range in string transliteration");
5105			}
5106			continue; /* not reached */
5107		    }
5108		    t->gen = 1;
5109		    t->max = c;
5110		}
5111	    }
5112	    return t->now;
5113	}
5114	else {
5115	    while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
5116		if (t->now == t->max) {
5117		    t->gen = 0;
5118		    goto nextpart;
5119		}
5120	    }
5121	    if (t->now < t->max) {
5122		return t->now;
5123	    }
5124	    else {
5125		t->gen = 0;
5126		return t->max;
5127	    }
5128	}
5129    }
5130}
5131
5132static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
5133
5134static VALUE
5135tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
5136{
5137    const unsigned int errc = -1;
5138    unsigned int trans[256];
5139    rb_encoding *enc, *e1, *e2;
5140    struct tr trsrc, trrepl;
5141    int cflag = 0;
5142    unsigned int c, c0, last = 0;
5143    int modify = 0, i, l;
5144    char *s, *send;
5145    VALUE hash = 0;
5146    int singlebyte = single_byte_optimizable(str);
5147    int cr;
5148
5149#define CHECK_IF_ASCII(c) \
5150    (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
5151	   (cr = ENC_CODERANGE_VALID) : 0)
5152
5153    StringValue(src);
5154    StringValue(repl);
5155    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5156    if (RSTRING_LEN(repl) == 0) {
5157	return rb_str_delete_bang(1, &src, str);
5158    }
5159
5160    cr = ENC_CODERANGE(str);
5161    e1 = rb_enc_check(str, src);
5162    e2 = rb_enc_check(str, repl);
5163    if (e1 == e2) {
5164	enc = e1;
5165    }
5166    else {
5167	enc = rb_enc_check(src, repl);
5168    }
5169    trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
5170    if (RSTRING_LEN(src) > 1 &&
5171	rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
5172	trsrc.p + l < trsrc.pend) {
5173	cflag = 1;
5174	trsrc.p += l;
5175    }
5176    trrepl.p = RSTRING_PTR(repl);
5177    trrepl.pend = trrepl.p + RSTRING_LEN(repl);
5178    trsrc.gen = trrepl.gen = 0;
5179    trsrc.now = trrepl.now = 0;
5180    trsrc.max = trrepl.max = 0;
5181
5182    if (cflag) {
5183	for (i=0; i<256; i++) {
5184	    trans[i] = 1;
5185	}
5186	while ((c = trnext(&trsrc, enc)) != errc) {
5187	    if (c < 256) {
5188		trans[c] = errc;
5189	    }
5190	    else {
5191		if (!hash) hash = rb_hash_new();
5192		rb_hash_aset(hash, UINT2NUM(c), Qtrue);
5193	    }
5194	}
5195	while ((c = trnext(&trrepl, enc)) != errc)
5196	    /* retrieve last replacer */;
5197	last = trrepl.now;
5198	for (i=0; i<256; i++) {
5199	    if (trans[i] != errc) {
5200		trans[i] = last;
5201	    }
5202	}
5203    }
5204    else {
5205	unsigned int r;
5206
5207	for (i=0; i<256; i++) {
5208	    trans[i] = errc;
5209	}
5210	while ((c = trnext(&trsrc, enc)) != errc) {
5211	    r = trnext(&trrepl, enc);
5212	    if (r == errc) r = trrepl.now;
5213	    if (c < 256) {
5214		trans[c] = r;
5215		if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
5216	    }
5217	    else {
5218		if (!hash) hash = rb_hash_new();
5219		rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
5220	    }
5221	}
5222    }
5223
5224    if (cr == ENC_CODERANGE_VALID)
5225	cr = ENC_CODERANGE_7BIT;
5226    str_modify_keep_cr(str);
5227    s = RSTRING_PTR(str); send = RSTRING_END(str);
5228    if (sflag) {
5229	int clen, tlen;
5230	long offset, max = RSTRING_LEN(str);
5231	unsigned int save = -1;
5232	char *buf = ALLOC_N(char, max), *t = buf;
5233
5234	while (s < send) {
5235	    int may_modify = 0;
5236
5237	    c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5238	    tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5239
5240	    s += clen;
5241	    if (c < 256) {
5242		c = trans[c];
5243	    }
5244	    else if (hash) {
5245		VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5246		if (NIL_P(tmp)) {
5247		    if (cflag) c = last;
5248		    else c = errc;
5249		}
5250		else if (cflag) c = errc;
5251		else c = NUM2INT(tmp);
5252	    }
5253	    else {
5254		c = errc;
5255	    }
5256	    if (c != (unsigned int)-1) {
5257		if (save == c) {
5258		    CHECK_IF_ASCII(c);
5259		    continue;
5260		}
5261		save = c;
5262		tlen = rb_enc_codelen(c, enc);
5263		modify = 1;
5264	    }
5265	    else {
5266		save = -1;
5267		c = c0;
5268		if (enc != e1) may_modify = 1;
5269	    }
5270	    while (t - buf + tlen >= max) {
5271		offset = t - buf;
5272		max *= 2;
5273		REALLOC_N(buf, char, max);
5274		t = buf + offset;
5275	    }
5276	    rb_enc_mbcput(c, t, enc);
5277	    if (may_modify && memcmp(s, t, tlen) != 0) {
5278		modify = 1;
5279	    }
5280	    CHECK_IF_ASCII(c);
5281	    t += tlen;
5282	}
5283	if (!STR_EMBED_P(str)) {
5284	    xfree(RSTRING(str)->as.heap.ptr);
5285	}
5286	*t = '\0';
5287	RSTRING(str)->as.heap.ptr = buf;
5288	RSTRING(str)->as.heap.len = t - buf;
5289	STR_SET_NOEMBED(str);
5290	RSTRING(str)->as.heap.aux.capa = max;
5291    }
5292    else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
5293	while (s < send) {
5294	    c = (unsigned char)*s;
5295	    if (trans[c] != errc) {
5296		if (!cflag) {
5297		    c = trans[c];
5298		    *s = c;
5299		    modify = 1;
5300		}
5301		else {
5302		    *s = last;
5303		    modify = 1;
5304		}
5305	    }
5306	    CHECK_IF_ASCII(c);
5307	    s++;
5308	}
5309    }
5310    else {
5311	int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
5312	long offset;
5313	char *buf = ALLOC_N(char, max), *t = buf;
5314
5315	while (s < send) {
5316	    int may_modify = 0;
5317	    c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5318	    tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5319
5320	    if (c < 256) {
5321		c = trans[c];
5322	    }
5323	    else if (hash) {
5324		VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5325		if (NIL_P(tmp)) {
5326		    if (cflag) c = last;
5327		    else c = errc;
5328		}
5329		else if (cflag) c = errc;
5330		else c = NUM2INT(tmp);
5331	    }
5332	    else {
5333		c = cflag ? last : errc;
5334	    }
5335	    if (c != errc) {
5336		tlen = rb_enc_codelen(c, enc);
5337		modify = 1;
5338	    }
5339	    else {
5340		c = c0;
5341		if (enc != e1) may_modify = 1;
5342	    }
5343	    while (t - buf + tlen >= max) {
5344		offset = t - buf;
5345		max *= 2;
5346		REALLOC_N(buf, char, max);
5347		t = buf + offset;
5348	    }
5349	    if (s != t) {
5350		rb_enc_mbcput(c, t, enc);
5351		if (may_modify && memcmp(s, t, tlen) != 0) {
5352		    modify = 1;
5353		}
5354	    }
5355	    CHECK_IF_ASCII(c);
5356	    s += clen;
5357	    t += tlen;
5358	}
5359	if (!STR_EMBED_P(str)) {
5360	    xfree(RSTRING(str)->as.heap.ptr);
5361	}
5362	*t = '\0';
5363	RSTRING(str)->as.heap.ptr = buf;
5364	RSTRING(str)->as.heap.len = t - buf;
5365	STR_SET_NOEMBED(str);
5366	RSTRING(str)->as.heap.aux.capa = max;
5367    }
5368
5369    if (modify) {
5370	if (cr != ENC_CODERANGE_BROKEN)
5371	    ENC_CODERANGE_SET(str, cr);
5372	rb_enc_associate(str, enc);
5373	return str;
5374    }
5375    return Qnil;
5376}
5377
5378
5379/*
5380 *  call-seq:
5381 *     str.tr!(from_str, to_str)   -> str or nil
5382 *
5383 *  Translates <i>str</i> in place, using the same rules as
5384 *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
5385 *  changes were made.
5386 */
5387
5388static VALUE
5389rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
5390{
5391    return tr_trans(str, src, repl, 0);
5392}
5393
5394
5395/*
5396 *  call-seq:
 *     str.tr(from_str, to_str)   -> new_str
5398 *
5399 *  Returns a copy of +str+ with the characters in +from_str+ replaced by the
5400 *  corresponding characters in +to_str+.  If +to_str+ is shorter than
5401 *  +from_str+, it is padded with its last character in order to maintain the
5402 *  correspondence.
5403 *
5404 *     "hello".tr('el', 'ip')      #=> "hippo"
5405 *     "hello".tr('aeiou', '*')    #=> "h*ll*"
5406 *     "hello".tr('aeiou', 'AA*')  #=> "hAll*"
5407 *
5408 *  Both strings may use the <code>c1-c2</code> notation to denote ranges of
5409 *  characters, and +from_str+ may start with a <code>^</code>, which denotes
5410 *  all characters except those listed.
5411 *
5412 *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
5413 *     "hello".tr('^aeiou', '*')   #=> "*e**o"
5414 *
5415 *  The backslash character <code>\</code> can be used to escape
5416 *  <code>^</code> or <code>-</code> and is otherwise ignored unless it
5417 *  appears at the end of a range or the end of the +from_str+ or +to_str+:
5418 *
5419 *     "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
5420 *     "hello-world".tr("a\\-eo", "*")   #=> "h*ll**w*rld"
5421 *
5422 *     "hello\r\nworld".tr("\r", "")   #=> "hello\nworld"
5423 *     "hello\r\nworld".tr("\\r", "")  #=> "hello\r\nwold"
5424 *     "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
5425 *
5426 *     "X['\\b']".tr("X\\", "")   #=> "['b']"
5427 *     "X['\\b']".tr("X-\\]", "") #=> "'b'"
5428 */
5429
5430static VALUE
5431rb_str_tr(VALUE str, VALUE src, VALUE repl)
5432{
5433    str = rb_str_dup(str);
5434    tr_trans(str, src, repl, 0);
5435    return str;
5436}
5437
5438#define TR_TABLE_SIZE 257
5439static void
5440tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
5441	       VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
5442{
5443    const unsigned int errc = -1;
5444    char buf[256];
5445    struct tr tr;
5446    unsigned int c;
5447    VALUE table = 0, ptable = 0;
5448    int i, l, cflag = 0;
5449
5450    tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
5451    tr.gen = tr.now = tr.max = 0;
5452
5453    if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
5454	cflag = 1;
5455	tr.p += l;
5456    }
5457    if (first) {
5458	for (i=0; i<256; i++) {
5459	    stable[i] = 1;
5460	}
5461	stable[256] = cflag;
5462    }
5463    else if (stable[256] && !cflag) {
5464	stable[256] = 0;
5465    }
5466    for (i=0; i<256; i++) {
5467	buf[i] = cflag;
5468    }
5469
5470    while ((c = trnext(&tr, enc)) != errc) {
5471	if (c < 256) {
5472	    buf[c & 0xff] = !cflag;
5473	}
5474	else {
5475	    VALUE key = UINT2NUM(c);
5476
5477	    if (!table && (first || *tablep || stable[256])) {
5478		if (cflag) {
5479		    ptable = *ctablep;
5480		    table = ptable ? ptable : rb_hash_new();
5481		    *ctablep = table;
5482		}
5483		else {
5484		    table = rb_hash_new();
5485		    ptable = *tablep;
5486		    *tablep = table;
5487		}
5488	    }
5489	    if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
5490		rb_hash_aset(table, key, Qtrue);
5491	    }
5492	}
5493    }
5494    for (i=0; i<256; i++) {
5495	stable[i] = stable[i] && buf[i];
5496    }
5497    if (!table && !cflag) {
5498	*tablep = 0;
5499    }
5500}
5501
5502
5503static int
5504tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
5505{
5506    if (c < 256) {
5507	return table[c] != 0;
5508    }
5509    else {
5510	VALUE v = UINT2NUM(c);
5511
5512	if (del) {
5513	    if (!NIL_P(rb_hash_lookup(del, v)) &&
5514		    (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
5515		return TRUE;
5516	    }
5517	}
5518	else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
5519	    return FALSE;
5520	}
5521	return table[256] ? TRUE : FALSE;
5522    }
5523}
5524
5525/*
5526 *  call-seq:
5527 *     str.delete!([other_str]+)   -> str or nil
5528 *
5529 *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
5530 *  <code>nil</code> if <i>str</i> was not modified.
5531 */
5532
5533static VALUE
5534rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
5535{
5536    char squeez[TR_TABLE_SIZE];
5537    rb_encoding *enc = 0;
5538    char *s, *send, *t;
5539    VALUE del = 0, nodel = 0;
5540    int modify = 0;
5541    int i, ascompat, cr;
5542
5543    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5544    rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
5545    for (i=0; i<argc; i++) {
5546	VALUE s = argv[i];
5547
5548	StringValue(s);
5549	enc = rb_enc_check(str, s);
5550	tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5551    }
5552
5553    str_modify_keep_cr(str);
5554    ascompat = rb_enc_asciicompat(enc);
5555    s = t = RSTRING_PTR(str);
5556    send = RSTRING_END(str);
5557    cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
5558    while (s < send) {
5559	unsigned int c;
5560	int clen;
5561
5562	if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5563	    if (squeez[c]) {
5564		modify = 1;
5565	    }
5566	    else {
5567		if (t != s) *t = c;
5568		t++;
5569	    }
5570	    s++;
5571	}
5572	else {
5573	    c = rb_enc_codepoint_len(s, send, &clen, enc);
5574
5575	    if (tr_find(c, squeez, del, nodel)) {
5576		modify = 1;
5577	    }
5578	    else {
5579		if (t != s) rb_enc_mbcput(c, t, enc);
5580		t += clen;
5581		if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
5582	    }
5583	    s += clen;
5584	}
5585    }
5586    *t = '\0';
5587    STR_SET_LEN(str, t - RSTRING_PTR(str));
5588    ENC_CODERANGE_SET(str, cr);
5589
5590    if (modify) return str;
5591    return Qnil;
5592}
5593
5594
5595/*
5596 *  call-seq:
5597 *     str.delete([other_str]+)   -> new_str
5598 *
5599 *  Returns a copy of <i>str</i> with all characters in the intersection of its
5600 *  arguments deleted. Uses the same rules for building the set of characters as
5601 *  <code>String#count</code>.
5602 *
5603 *     "hello".delete "l","lo"        #=> "heo"
5604 *     "hello".delete "lo"            #=> "he"
5605 *     "hello".delete "aeiou", "^e"   #=> "hell"
5606 *     "hello".delete "ej-m"          #=> "ho"
5607 */
5608
5609static VALUE
5610rb_str_delete(int argc, VALUE *argv, VALUE str)
5611{
5612    str = rb_str_dup(str);
5613    rb_str_delete_bang(argc, argv, str);
5614    return str;
5615}
5616
5617
5618/*
5619 *  call-seq:
5620 *     str.squeeze!([other_str]*)   -> str or nil
5621 *
5622 *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
5623 *  <code>nil</code> if no changes were made.
5624 */
5625
5626static VALUE
5627rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
5628{
5629    char squeez[TR_TABLE_SIZE];
5630    rb_encoding *enc = 0;
5631    VALUE del = 0, nodel = 0;
5632    char *s, *send, *t;
5633    int i, modify = 0;
5634    int ascompat, singlebyte = single_byte_optimizable(str);
5635    unsigned int save;
5636
5637    if (argc == 0) {
5638	enc = STR_ENC_GET(str);
5639    }
5640    else {
5641	for (i=0; i<argc; i++) {
5642	    VALUE s = argv[i];
5643
5644	    StringValue(s);
5645	    enc = rb_enc_check(str, s);
5646	    if (singlebyte && !single_byte_optimizable(s))
5647		singlebyte = 0;
5648	    tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5649	}
5650    }
5651
5652    str_modify_keep_cr(str);
5653    s = t = RSTRING_PTR(str);
5654    if (!s || RSTRING_LEN(str) == 0) return Qnil;
5655    send = RSTRING_END(str);
5656    save = -1;
5657    ascompat = rb_enc_asciicompat(enc);
5658
5659    if (singlebyte) {
5660        while (s < send) {
5661	    unsigned int c = *(unsigned char*)s++;
5662	    if (c != save || (argc > 0 && !squeez[c])) {
5663	        *t++ = save = c;
5664	    }
5665	}
5666    } else {
5667	while (s < send) {
5668	    unsigned int c;
5669	    int clen;
5670
5671	    if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5672		if (c != save || (argc > 0 && !squeez[c])) {
5673		    *t++ = save = c;
5674		}
5675		s++;
5676	    }
5677	    else {
5678		c = rb_enc_codepoint_len(s, send, &clen, enc);
5679
5680		if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
5681		    if (t != s) rb_enc_mbcput(c, t, enc);
5682		    save = c;
5683		    t += clen;
5684		}
5685		s += clen;
5686	    }
5687	}
5688    }
5689
5690    *t = '\0';
5691    if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
5692	STR_SET_LEN(str, t - RSTRING_PTR(str));
5693	modify = 1;
5694    }
5695
5696    if (modify) return str;
5697    return Qnil;
5698}
5699
5700
5701/*
5702 *  call-seq:
5703 *     str.squeeze([other_str]*)    -> new_str
5704 *
5705 *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
5706 *  procedure described for <code>String#count</code>. Returns a new string
5707 *  where runs of the same character that occur in this set are replaced by a
5708 *  single character. If no arguments are given, all runs of identical
5709 *  characters are replaced by a single character.
5710 *
5711 *     "yellow moon".squeeze                  #=> "yelow mon"
5712 *     "  now   is  the".squeeze(" ")         #=> " now is the"
5713 *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
5714 */
5715
5716static VALUE
5717rb_str_squeeze(int argc, VALUE *argv, VALUE str)
5718{
5719    str = rb_str_dup(str);
5720    rb_str_squeeze_bang(argc, argv, str);
5721    return str;
5722}
5723
5724
5725/*
5726 *  call-seq:
5727 *     str.tr_s!(from_str, to_str)   -> str or nil
5728 *
5729 *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
5730 *  returning <i>str</i>, or <code>nil</code> if no changes were made.
5731 */
5732
5733static VALUE
5734rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
5735{
5736    return tr_trans(str, src, repl, 1);
5737}
5738
5739
5740/*
5741 *  call-seq:
5742 *     str.tr_s(from_str, to_str)   -> new_str
5743 *
5744 *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
5745 *  then removes duplicate characters in regions that were affected by the
5746 *  translation.
5747 *
5748 *     "hello".tr_s('l', 'r')     #=> "hero"
5749 *     "hello".tr_s('el', '*')    #=> "h*o"
5750 *     "hello".tr_s('el', 'hx')   #=> "hhxo"
5751 */
5752
5753static VALUE
5754rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
5755{
5756    str = rb_str_dup(str);
5757    tr_trans(str, src, repl, 1);
5758    return str;
5759}
5760
5761
5762/*
5763 *  call-seq:
5764 *     str.count([other_str]+)   -> fixnum
5765 *
5766 *  Each +other_str+ parameter defines a set of characters to count.  The
5767 *  intersection of these sets defines the characters to count in +str+.  Any
5768 *  +other_str+ that starts with a caret <code>^</code> is negated.  The
5769 *  sequence <code>c1-c2</code> means all characters between c1 and c2.  The
5770 *  backslash character <code>\</code> can be used to escape <code>^</code> or
5771 *  <code>-</code> and is otherwise ignored unless it appears at the end of a
 *  sequence or the end of an +other_str+.
5773 *
5774 *     a = "hello world"
5775 *     a.count "lo"                   #=> 5
5776 *     a.count "lo", "o"              #=> 2
5777 *     a.count "hello", "^l"          #=> 4
5778 *     a.count "ej-m"                 #=> 4
5779 *
5780 *     "hello^world".count "\\^aeiou" #=> 4
5781 *     "hello-world".count "a\\-eo"   #=> 4
5782 *
5783 *     c = "hello world\\r\\n"
5784 *     c.count "\\"                   #=> 2
5785 *     c.count "\\A"                  #=> 0
5786 *     c.count "X-\\w"                #=> 3
5787 */
5788
5789static VALUE
5790rb_str_count(int argc, VALUE *argv, VALUE str)
5791{
5792    char table[TR_TABLE_SIZE];
5793    rb_encoding *enc = 0;
5794    VALUE del = 0, nodel = 0;
5795    char *s, *send;
5796    int i;
5797    int ascompat;
5798
5799    rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
5800    for (i=0; i<argc; i++) {
5801	VALUE tstr = argv[i];
5802	unsigned char c;
5803
5804	StringValue(tstr);
5805	enc = rb_enc_check(str, tstr);
5806	if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
5807	    (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
5808	    int n = 0;
5809
5810	    s = RSTRING_PTR(str);
5811	    if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5812	    send = RSTRING_END(str);
5813	    while (s < send) {
5814		if (*(unsigned char*)s++ == c) n++;
5815	    }
5816	    return INT2NUM(n);
5817	}
5818	tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
5819    }
5820
5821    s = RSTRING_PTR(str);
5822    if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5823    send = RSTRING_END(str);
5824    ascompat = rb_enc_asciicompat(enc);
5825    i = 0;
5826    while (s < send) {
5827	unsigned int c;
5828
5829	if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5830	    if (table[c]) {
5831		i++;
5832	    }
5833	    s++;
5834	}
5835	else {
5836	    int clen;
5837	    c = rb_enc_codepoint_len(s, send, &clen, enc);
5838	    if (tr_find(c, table, del, nodel)) {
5839		i++;
5840	    }
5841	    s += clen;
5842	}
5843    }
5844
5845    return INT2NUM(i);
5846}
5847
/*
 * Byte-indexed lookup table: non-zero for the six ASCII whitespace bytes
 * (HT, LF, VT, FF, CR and SP), zero for everything else.  Only the rows
 * that contain a non-zero entry are spelled out; the remaining elements
 * of the 256-entry array are zero-initialised by the language.
 */
static const char isspacetable[256] = {
    /* 0x00-0x0f: 0x09 HT, 0x0a LF, 0x0b VT, 0x0c FF, 0x0d CR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20: SP */
    1
};

/* The cast keeps a negative char from producing a negative index. */
#define ascii_isspace(c) (isspacetable[(unsigned char)(c)])
5868
5869/*
5870 *  call-seq:
5871 *     str.split(pattern=$;, [limit])   -> anArray
5872 *
5873 *  Divides <i>str</i> into substrings based on a delimiter, returning an array
5874 *  of these substrings.
5875 *
5876 *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
5877 *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
5878 *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
5879 *  of contiguous whitespace characters ignored.
5880 *
5881 *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
5882 *  pattern matches. Whenever the pattern matches a zero-length string,
5883 *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
5884 *  groups, the respective matches will be returned in the array as well.
5885 *
5886 *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
5887 *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
5888 *  split on whitespace as if ` ' were specified.
5889 *
5890 *  If the <i>limit</i> parameter is omitted, trailing null fields are
5891 *  suppressed. If <i>limit</i> is a positive number, at most that number of
5892 *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
5893 *  string is returned as the only entry in an array). If negative, there is no
5894 *  limit to the number of fields returned, and trailing null fields are not
5895 *  suppressed.
5896 *
5897 *  When the input +str+ is empty an empty Array is returned as the string is
5898 *  considered to have no fields to split.
5899 *
5900 *     " now's  the time".split        #=> ["now's", "the", "time"]
5901 *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
5902 *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
5903 *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
5904 *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
5905 *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
5906 *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
5907 *
5908 *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
5909 *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
5910 *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
5911 *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
5912 *
5913 *     "".split(',', -1)               #=> []
5914 */
5915
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    /* awk: split on whitespace runs; string: literal delimiter;
     * regexp: delimiter found with rb_reg_search() */
    enum {awk, string, regexp} split_type;
    /* beg/end are byte offsets into str.  i counts fields against a
     * positive limit and is only advanced while limit is non-nil. */
    long beg, end, i = 0;
    int lim = 0;
    VALUE result, tmp;

    /* "02": both arguments are optional; the limit is decoded only when
     * two arguments were actually passed */
    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
	lim = NUM2INT(limit);
	/* zero or negative: limit is reset to nil so the field-count checks
	 * below are skipped; lim keeps its value for the tail handling */
	if (lim <= 0) limit = Qnil;
	else if (lim == 1) {
	    /* a limit of 1 never splits: [] for an empty receiver,
	     * otherwise a one-element array holding the receiver */
	    if (RSTRING_LEN(str) == 0)
		return rb_ary_new2(0);
	    return rb_ary_new3(1, str);
	}
	i = 1;
    }

    enc = STR_ENC_GET(str);
    if (NIL_P(spat)) {
	/* no pattern given: fall back to rb_fs when it is set, and
	 * classify it exactly like an explicit pattern */
	if (!NIL_P(rb_fs)) {
	    spat = rb_fs;
	    goto fs_set;
	}
	split_type = awk;
    }
    else {
      fs_set:
	if (RB_TYPE_P(spat, T_STRING)) {
	    rb_encoding *enc2 = STR_ENC_GET(spat);

	    split_type = string;
	    if (RSTRING_LEN(spat) == 0) {
		/* Special case - split into chars */
		spat = rb_reg_regcomp(spat);
		split_type = regexp;
	    }
	    else if (rb_enc_asciicompat(enc2) == 1) {
		/* a pattern that is exactly one space selects awk mode */
		if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
		    split_type = awk;
		}
	    }
	    else {
		/* not ASCII-compatible: decode the first character and
		 * require that it is a space and the whole pattern */
		int l;
		if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
		    RSTRING_LEN(spat) == l) {
		    split_type = awk;
		}
	    }
	}
	else {
	    /* anything that is not a String goes through get_pat() and is
	     * handled by the regexp branch */
	    spat = get_pat(spat, 1);
	    split_type = regexp;
	}
    }

    result = rb_ary_new();
    beg = 0;
    if (split_type == awk) {
	char *ptr = RSTRING_PTR(str);
	char *eptr = RSTRING_END(str);
	char *bptr = ptr;
	/* skip is 1 while scanning whitespace between fields */
	int skip = 1;
	unsigned int c;

	end = beg;
	if (is_ascii_string(str)) {
	    /* ASCII-only receiver: classify bytes with the lookup table */
	    while (ptr < eptr) {
		c = (unsigned char)*ptr++;
		if (skip) {
		    if (ascii_isspace(c)) {
			/* still in whitespace: move the field start */
			beg = ptr - bptr;
		    }
		    else {
			/* first byte of a new field */
			end = ptr - bptr;
			skip = 0;
			/* limit reached: stop here, the tail code below
			 * pushes everything from beg as the last field */
			if (!NIL_P(limit) && lim <= i) break;
		    }
		}
		else if (ascii_isspace(c)) {
		    /* whitespace ends the current field */
		    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
		    skip = 1;
		    beg = ptr - bptr;
		    if (!NIL_P(limit)) ++i;
		}
		else {
		    end = ptr - bptr;
		}
	    }
	}
	else {
	    /* same state machine, but decoding one character at a time */
	    while (ptr < eptr) {
		int n;

		c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
		ptr += n;
		if (skip) {
		    if (rb_isspace(c)) {
			beg = ptr - bptr;
		    }
		    else {
			end = ptr - bptr;
			skip = 0;
			if (!NIL_P(limit) && lim <= i) break;
		    }
		}
		else if (rb_isspace(c)) {
		    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
		    skip = 1;
		    beg = ptr - bptr;
		    if (!NIL_P(limit)) ++i;
		}
		else {
		    end = ptr - bptr;
		}
	    }
	}
    }
    else if (split_type == string) {
	char *ptr = RSTRING_PTR(str);
	char *temp = ptr;
	char *eptr = RSTRING_END(str);
	char *sptr = RSTRING_PTR(spat);
	long slen = RSTRING_LEN(spat);

	/* byte searching needs validly encoded operands on both sides */
	if (is_broken_string(str)) {
	    rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
	}
	if (is_broken_string(spat)) {
	    rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
	}
	enc = rb_enc_check(str, spat);
	/* end is used as an offset relative to ptr; a negative result from
	 * rb_memsearch() ends the loop */
	while (ptr < eptr &&
	       (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
	    /* Check we are at the start of a char */
	    char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
	    if (t != ptr + end) {
		/* hit inside a multibyte character: resume at the next
		 * character boundary */
		ptr = t;
		continue;
	    }
	    rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
	    ptr += end + slen;
	    if (!NIL_P(limit) && lim <= ++i) break;
	}
	beg = ptr - temp;
    }
    else {
	char *ptr = RSTRING_PTR(str);
	long len = RSTRING_LEN(str);
	long start = beg;
	long idx;
	/* set after an empty match has been seen at the current position */
	int last_null = 0;
	struct re_registers *regs;

	while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
	    regs = RMATCH_REGS(rb_backref_get());
	    if (start == end && BEG(0) == END(0)) {
		/* empty match exactly at the search position */
		if (!ptr) {
		    rb_ary_push(result, str_new_empty(str));
		    break;
		}
		else if (last_null == 1) {
		    /* second empty match in a row: emit the single
		     * character starting at beg */
		    rb_ary_push(result, rb_str_subseq(str, beg,
						      rb_enc_fast_mbclen(ptr+beg,
									 ptr+len,
									 enc)));
		    beg = start;
		}
		else {
		    /* first empty match: step the search position over
		     * one character (one byte at the very end) and retry */
                    if (ptr+start == ptr+len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
		    last_null = 1;
		    continue;
		}
	    }
	    else {
		/* ordinary match: the field is the text before it */
		rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
		beg = start = END(0);
	    }
	    last_null = 0;

	    /* append every capture group that took part in the match */
	    for (idx=1; idx < regs->num_regs; idx++) {
		if (BEG(idx) == -1) continue;
		if (BEG(idx) == END(idx))
		    tmp = str_new_empty(str);
		else
		    tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
		rb_ary_push(result, tmp);
	    }
	    if (!NIL_P(limit) && lim <= ++i) break;
	}
    }
    /* Push what is left after the last delimiter.  For a non-empty
     * receiver this happens when a positive limit is active, when bytes
     * remain past beg, or when the limit was negative. */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
	if (RSTRING_LEN(str) == beg)
	    tmp = str_new_empty(str);
	else
	    tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
	rb_ary_push(result, tmp);
    }
    /* limit omitted or zero: drop trailing empty strings */
    if (NIL_P(limit) && lim == 0) {
	long len;
	while ((len = RARRAY_LEN(result)) > 0 &&
	       (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
	    rb_ary_pop(result);
    }

    return result;
}
6130
6131VALUE
6132rb_str_split(VALUE str, const char *sep0)
6133{
6134    VALUE sep;
6135
6136    StringValue(str);
6137    sep = rb_str_new2(sep0);
6138    return rb_str_split_m(1, &sep, str);
6139}
6140
6141
/*
 * Shared implementation of String#each_line (wantarray == 0) and
 * String#lines (wantarray != 0).
 *
 * argc/argv carry the optional record separator; without it rb_rs is
 * used.  In array mode the lines are collected and the array is
 * returned; otherwise each line is yielded and the original receiver is
 * returned, or an enumerator when no block was given.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
{
    rb_encoding *enc;
    VALUE rs;
    unsigned int newline;
    /* p: scan position, pend: end of data, s: start of the current line,
     * ptr: start of the buffer (base for the offsets given to rb_str_subseq) */
    const char *p, *pend, *s, *ptr;
    long len, rslen;
    VALUE line;
    int n;
    VALUE orig = str;
    VALUE UNINITIALIZED_VAR(ary);

    if (argc == 0) {
	rs = rb_rs;
    }
    else {
	rb_scan_args(argc, argv, "01", &rs);
    }

    if (rb_block_given_p()) {
	if (wantarray) {
#if 0 /* next major */
	    rb_warn("given block not used");
	    ary = rb_ary_new();
#else
	    /* String#lines with a block currently behaves like each_line */
	    rb_warning("passing a block to String#lines is deprecated");
	    wantarray = 0;
#endif
	}
    }
    else {
	if (wantarray)
	    ary = rb_ary_new();
	else
	    RETURN_ENUMERATOR(str, argc, argv);
    }

    /* a nil separator means the whole string is the single "line" */
    if (NIL_P(rs)) {
	if (wantarray) {
	    rb_ary_push(ary, str);
	    return ary;
	}
	else {
	    rb_yield(str);
	    return orig;
	}
    }
    /* iterate over the result of rb_str_new4() rather than the receiver */
    str = rb_str_new4(str);
    ptr = p = s = RSTRING_PTR(str);
    pend = p + RSTRING_LEN(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    if (rs == rb_default_rs) {
	/* default separator: find '\n' bytes with memchr, then confirm
	 * through the encoding that the hit really is a newline character */
	enc = rb_enc_get(str);
	while (p < pend) {
	    char *p0;

	    p = memchr(p, '\n', pend - p);
	    if (!p) break;
	    p0 = rb_enc_left_char_head(s, p, pend, enc);
	    if (!rb_enc_is_newline(p0, pend, enc)) {
		p++;
		continue;
	    }
	    /* the line includes the newline character itself */
	    p = p0 + rb_enc_mbclen(p0, pend, enc);
	    line = rb_str_subseq(str, s - ptr, p - s);
	    if (wantarray)
		rb_ary_push(ary, line);
	    else
		rb_yield(line);
	    str_mod_check(str, ptr, len);
	    s = p;
	}
	goto finish;
    }

    enc = rb_enc_check(str, rs);
    rslen = RSTRING_LEN(rs);
    if (rslen == 0) {
	/* empty separator: paragraph mode, lines end at runs of '\n' */
	newline = '\n';
    }
    else {
	/* otherwise compare against the first character of the separator */
	newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
    }

    while (p < pend) {
	unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);

      again:
	if (rslen == 0 && c == newline) {
	    /* paragraph mode: a single newline followed by some other
	     * character is re-examined from that character via 'again';
	     * otherwise the whole run of newlines is consumed and p is left
	     * on its last newline */
	    p += n;
	    if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
		goto again;
	    }
	    while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
		p += n;
	    }
	    p -= n;
	}
	/* first character matches; for separators longer than one byte the
	 * complete separator must also match byte for byte */
	if (c == newline &&
	    (rslen <= 1 ||
	     (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
	    /* pp: first byte after the separator (or after the newline at
	     * p when the separator is empty) */
	    const char *pp = p + (rslen ? rslen : n);
	    line = rb_str_subseq(str, s - ptr, pp - s);
	    if (wantarray)
		rb_ary_push(ary, line);
	    else
		rb_yield(line);
	    str_mod_check(str, ptr, len);
	    s = pp;
	}
	p += n;
    }

  finish:
    /* whatever follows the last separator forms the final line */
    if (s != pend) {
	line = rb_str_subseq(str, s - ptr, pend - s);
	if (wantarray)
	    rb_ary_push(ary, line);
	else
	    rb_yield(line);
	RB_GC_GUARD(str);
    }

    if (wantarray)
	return ary;
    else
	return orig;
}
6272
6273/*
6274 *  call-seq:
6275 *     str.each_line(separator=$/) {|substr| block }   -> str
6276 *     str.each_line(separator=$/)                     -> an_enumerator
6277 *
6278 *  Splits <i>str</i> using the supplied parameter as the record
6279 *  separator (<code>$/</code> by default), passing each substring in
6280 *  turn to the supplied block.  If a zero-length record separator is
6281 *  supplied, the string is split into paragraphs delimited by
6282 *  multiple successive newlines.
6283 *
6284 *  If no block is given, an enumerator is returned instead.
6285 *
6286 *     print "Example one\n"
6287 *     "hello\nworld".each_line {|s| p s}
6288 *     print "Example two\n"
6289 *     "hello\nworld".each_line('l') {|s| p s}
6290 *     print "Example three\n"
6291 *     "hello\n\n\nworld".each_line('') {|s| p s}
6292 *
6293 *  <em>produces:</em>
6294 *
6295 *     Example one
6296 *     "hello\n"
6297 *     "world"
6298 *     Example two
6299 *     "hel"
6300 *     "l"
6301 *     "o\nworl"
6302 *     "d"
6303 *     Example three
6304 *     "hello\n\n\n"
6305 *     "world"
6306 */
6307
6308static VALUE
6309rb_str_each_line(int argc, VALUE *argv, VALUE str)
6310{
6311    return rb_str_enumerate_lines(argc, argv, str, 0);
6312}
6313
6314/*
6315 *  call-seq:
6316 *     str.lines(separator=$/)  -> an_array
6317 *
6318 *  Returns an array of lines in <i>str</i> split using the supplied
6319 *  record separator (<code>$/</code> by default).  This is a
6320 *  shorthand for <code>str.each_line(separator).to_a</code>.
6321 *
6322 *  If a block is given, which is a deprecated form, works the same as
6323 *  <code>each_line</code>.
6324 */
6325
6326static VALUE
6327rb_str_lines(int argc, VALUE *argv, VALUE str)
6328{
6329    return rb_str_enumerate_lines(argc, argv, str, 1);
6330}
6331
6332static VALUE
6333rb_str_each_byte_size(VALUE str, VALUE args)
6334{
6335    return LONG2FIX(RSTRING_LEN(str));
6336}
6337
6338static VALUE
6339rb_str_enumerate_bytes(VALUE str, int wantarray)
6340{
6341    long i;
6342    VALUE UNINITIALIZED_VAR(ary);
6343
6344    if (rb_block_given_p()) {
6345	if (wantarray) {
6346#if 0 /* next major */
6347	    rb_warn("given block not used");
6348	    ary = rb_ary_new();
6349#else
6350	    rb_warning("passing a block to String#bytes is deprecated");
6351	    wantarray = 0;
6352#endif
6353	}
6354    }
6355    else {
6356	if (wantarray)
6357	    ary = rb_ary_new2(RSTRING_LEN(str));
6358	else
6359	    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
6360    }
6361
6362    for (i=0; i<RSTRING_LEN(str); i++) {
6363	if (wantarray)
6364	    rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6365	else
6366	    rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6367    }
6368    if (wantarray)
6369	return ary;
6370    else
6371	return str;
6372}
6373
6374/*
6375 *  call-seq:
6376 *     str.each_byte {|fixnum| block }    -> str
6377 *     str.each_byte                      -> an_enumerator
6378 *
6379 *  Passes each byte in <i>str</i> to the given block, or returns an
6380 *  enumerator if no block is given.
6381 *
6382 *     "hello".each_byte {|c| print c, ' ' }
6383 *
6384 *  <em>produces:</em>
6385 *
6386 *     104 101 108 108 111
6387 */
6388
6389static VALUE
6390rb_str_each_byte(VALUE str)
6391{
6392    return rb_str_enumerate_bytes(str, 0);
6393}
6394
6395/*
6396 *  call-seq:
6397 *     str.bytes    -> an_array
6398 *
6399 *  Returns an array of bytes in <i>str</i>.  This is a shorthand for
6400 *  <code>str.each_byte.to_a</code>.
6401 *
6402 *  If a block is given, which is a deprecated form, works the same as
6403 *  <code>each_byte</code>.
6404 */
6405
6406static VALUE
6407rb_str_bytes(VALUE str)
6408{
6409    return rb_str_enumerate_bytes(str, 1);
6410}
6411
6412static VALUE
6413rb_str_each_char_size(VALUE str)
6414{
6415    long len = RSTRING_LEN(str);
6416    if (!single_byte_optimizable(str)) {
6417	const char *ptr = RSTRING_PTR(str);
6418	rb_encoding *enc = rb_enc_get(str);
6419	const char *end_ptr = ptr + len;
6420	for (len = 0; ptr < end_ptr; ++len) {
6421	    ptr += rb_enc_mbclen(ptr, end_ptr, enc);
6422	}
6423    }
6424    return LONG2FIX(len);
6425}
6426
6427static VALUE
6428rb_str_enumerate_chars(VALUE str, int wantarray)
6429{
6430    VALUE orig = str;
6431    VALUE substr;
6432    long i, len, n;
6433    const char *ptr;
6434    rb_encoding *enc;
6435    VALUE UNINITIALIZED_VAR(ary);
6436
6437    if (rb_block_given_p()) {
6438	if (wantarray) {
6439#if 0 /* next major */
6440	    rb_warn("given block not used");
6441	    ary = rb_ary_new();
6442#else
6443	    rb_warning("passing a block to String#chars is deprecated");
6444	    wantarray = 0;
6445#endif
6446	}
6447    }
6448    else {
6449	if (wantarray)
6450	    ary = rb_ary_new();
6451	else
6452	    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
6453    }
6454
6455    str = rb_str_new4(str);
6456    ptr = RSTRING_PTR(str);
6457    len = RSTRING_LEN(str);
6458    enc = rb_enc_get(str);
6459    switch (ENC_CODERANGE(str)) {
6460      case ENC_CODERANGE_VALID:
6461      case ENC_CODERANGE_7BIT:
6462	for (i = 0; i < len; i += n) {
6463	    n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
6464	    substr = rb_str_subseq(str, i, n);
6465	    if (wantarray)
6466		rb_ary_push(ary, substr);
6467	    else
6468		rb_yield(substr);
6469	}
6470	break;
6471      default:
6472	for (i = 0; i < len; i += n) {
6473	    n = rb_enc_mbclen(ptr + i, ptr + len, enc);
6474	    substr = rb_str_subseq(str, i, n);
6475	    if (wantarray)
6476		rb_ary_push(ary, substr);
6477	    else
6478		rb_yield(substr);
6479	}
6480    }
6481    RB_GC_GUARD(str);
6482    if (wantarray)
6483	return ary;
6484    else
6485	return orig;
6486}
6487
6488/*
6489 *  call-seq:
6490 *     str.each_char {|cstr| block }    -> str
6491 *     str.each_char                    -> an_enumerator
6492 *
6493 *  Passes each character in <i>str</i> to the given block, or returns
6494 *  an enumerator if no block is given.
6495 *
6496 *     "hello".each_char {|c| print c, ' ' }
6497 *
6498 *  <em>produces:</em>
6499 *
6500 *     h e l l o
6501 */
6502
6503static VALUE
6504rb_str_each_char(VALUE str)
6505{
6506    return rb_str_enumerate_chars(str, 0);
6507}
6508
6509/*
6510 *  call-seq:
6511 *     str.chars    -> an_array
6512 *
6513 *  Returns an array of characters in <i>str</i>.  This is a shorthand
6514 *  for <code>str.each_char.to_a</code>.
6515 *
6516 *  If a block is given, which is a deprecated form, works the same as
6517 *  <code>each_char</code>.
6518 */
6519
6520static VALUE
6521rb_str_chars(VALUE str)
6522{
6523    return rb_str_enumerate_chars(str, 1);
6524}
6525
6526
6527static VALUE
6528rb_str_enumerate_codepoints(VALUE str, int wantarray)
6529{
6530    VALUE orig = str;
6531    int n;
6532    unsigned int c;
6533    const char *ptr, *end;
6534    rb_encoding *enc;
6535    VALUE UNINITIALIZED_VAR(ary);
6536
6537    if (single_byte_optimizable(str))
6538	return rb_str_enumerate_bytes(str, wantarray);
6539
6540    if (rb_block_given_p()) {
6541	if (wantarray) {
6542#if 0 /* next major */
6543	    rb_warn("given block not used");
6544	    ary = rb_ary_new();
6545#else
6546	    rb_warning("passing a block to String#codepoints is deprecated");
6547	    wantarray = 0;
6548#endif
6549	}
6550    }
6551    else {
6552	if (wantarray)
6553	    ary = rb_ary_new();
6554	else
6555	    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
6556    }
6557
6558    str = rb_str_new4(str);
6559    ptr = RSTRING_PTR(str);
6560    end = RSTRING_END(str);
6561    enc = STR_ENC_GET(str);
6562    while (ptr < end) {
6563	c = rb_enc_codepoint_len(ptr, end, &n, enc);
6564	if (wantarray)
6565	    rb_ary_push(ary, UINT2NUM(c));
6566	else
6567	    rb_yield(UINT2NUM(c));
6568	ptr += n;
6569    }
6570    RB_GC_GUARD(str);
6571    if (wantarray)
6572	return ary;
6573    else
6574	return orig;
6575}
6576
6577/*
6578 *  call-seq:
6579 *     str.each_codepoint {|integer| block }    -> str
6580 *     str.each_codepoint                       -> an_enumerator
6581 *
 *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
 *  also known as a <i>codepoint</i> when applied to Unicode strings, to the
 *  given block.
6585 *
6586 *  If no block is given, an enumerator is returned instead.
6587 *
6588 *     "hello\u0639".each_codepoint {|c| print c, ' ' }
6589 *
6590 *  <em>produces:</em>
6591 *
6592 *     104 101 108 108 111 1593
6593 */
6594
6595static VALUE
6596rb_str_each_codepoint(VALUE str)
6597{
6598    return rb_str_enumerate_codepoints(str, 0);
6599}
6600
6601/*
6602 *  call-seq:
6603 *     str.codepoints   -> an_array
6604 *
6605 *  Returns an array of the <code>Integer</code> ordinals of the
6606 *  characters in <i>str</i>.  This is a shorthand for
6607 *  <code>str.each_codepoint.to_a</code>.
6608 *
6609 *  If a block is given, which is a deprecated form, works the same as
6610 *  <code>each_codepoint</code>.
6611 */
6612
6613static VALUE
6614rb_str_codepoints(VALUE str)
6615{
6616    return rb_str_enumerate_codepoints(str, 1);
6617}
6618
6619
6620static long
6621chopped_length(VALUE str)
6622{
6623    rb_encoding *enc = STR_ENC_GET(str);
6624    const char *p, *p2, *beg, *end;
6625
6626    beg = RSTRING_PTR(str);
6627    end = beg + RSTRING_LEN(str);
6628    if (beg > end) return 0;
6629    p = rb_enc_prev_char(beg, end, end, enc);
6630    if (!p) return 0;
6631    if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
6632	p2 = rb_enc_prev_char(beg, p, end, enc);
6633	if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
6634    }
6635    return p - beg;
6636}
6637
6638/*
6639 *  call-seq:
6640 *     str.chop!   -> str or nil
6641 *
6642 *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
6643 *  or <code>nil</code> if <i>str</i> is the empty string.  See also
6644 *  <code>String#chomp!</code>.
6645 */
6646
6647static VALUE
6648rb_str_chop_bang(VALUE str)
6649{
6650    str_modify_keep_cr(str);
6651    if (RSTRING_LEN(str) > 0) {
6652	long len;
6653	len = chopped_length(str);
6654	STR_SET_LEN(str, len);
6655	RSTRING_PTR(str)[len] = '\0';
6656	if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6657	    ENC_CODERANGE_CLEAR(str);
6658	}
6659	return str;
6660    }
6661    return Qnil;
6662}
6663
6664
6665/*
6666 *  call-seq:
6667 *     str.chop   -> new_str
6668 *
6669 *  Returns a new <code>String</code> with the last character removed.  If the
6670 *  string ends with <code>\r\n</code>, both characters are removed. Applying
6671 *  <code>chop</code> to an empty string returns an empty
6672 *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
6673 *  the string unchanged if it doesn't end in a record separator.
6674 *
6675 *     "string\r\n".chop   #=> "string"
6676 *     "string\n\r".chop   #=> "string\n"
6677 *     "string\n".chop     #=> "string"
6678 *     "string".chop       #=> "strin"
6679 *     "x".chop.chop       #=> ""
6680 */
6681
6682static VALUE
6683rb_str_chop(VALUE str)
6684{
6685    return rb_str_subseq(str, 0, chopped_length(str));
6686}
6687
6688
6689/*
6690 *  call-seq:
6691 *     str.chomp!(separator=$/)   -> str or nil
6692 *
6693 *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
6694 *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
6695 */
6696
static VALUE
rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE rs;
    int newline;
    /* p: start of the buffer, e: end of the buffer (moved left as
     * characters are dropped), pp: scratch pointer */
    char *p, *pp, *e;
    long len, rslen;

    str_modify_keep_cr(str);
    len = RSTRING_LEN(str);
    if (len == 0) return Qnil;
    p = RSTRING_PTR(str);
    e = p + len;
    if (argc == 0) {
	rs = rb_rs;
	if (rs == rb_default_rs) {
	    /* default separator: remove a trailing "\n", "\r" or "\r\n".
	     * Also entered from below for an explicit "\n" separator. */
	  smart_chomp:
	    enc = rb_enc_get(str);
	    if (rb_enc_mbminlen(enc) > 1) {
		/* encodings whose minimum character is wider than one
		 * byte: look at whole characters at the end.
		 * NOTE(review): e-rb_enc_mbminlen(enc) lies before p when
		 * len is smaller than the minimum character length --
		 * confirm rb_enc_left_char_head() tolerates that. */
		pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
		if (rb_enc_is_newline(pp, e, enc)) {
		    e = pp;
		}
		/* then check for a "\r" at the (possibly moved) end */
		pp = e - rb_enc_mbminlen(enc);
		if (pp >= p) {
		    pp = rb_enc_left_char_head(p, pp, e, enc);
		    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
			e = pp;
		    }
		}
		/* e never moved: nothing was removed */
		if (e == RSTRING_END(str)) {
		    return Qnil;
		}
		len = e - RSTRING_PTR(str);
		STR_SET_LEN(str, len);
	    }
	    else {
		/* single-byte minimum: inspect the last bytes directly */
		if (RSTRING_PTR(str)[len-1] == '\n') {
		    STR_DEC_LEN(str);
		    if (RSTRING_LEN(str) > 0 &&
			RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
			STR_DEC_LEN(str);
		    }
		}
		else if (RSTRING_PTR(str)[len-1] == '\r') {
		    STR_DEC_LEN(str);
		}
		else {
		    return Qnil;
		}
	    }
	    RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
	    return str;
	}
    }
    else {
	rb_scan_args(argc, argv, "01", &rs);
    }
    /* a nil separator removes nothing */
    if (NIL_P(rs)) return Qnil;
    StringValue(rs);
    rslen = RSTRING_LEN(rs);
    if (rslen == 0) {
	/* empty separator: strip every trailing "\n", each optionally
	 * preceded by "\r" */
	while (len>0 && p[len-1] == '\n') {
	    len--;
	    if (len>0 && p[len-1] == '\r')
		len--;
	}
	if (len < RSTRING_LEN(str)) {
	    STR_SET_LEN(str, len);
	    RSTRING_PTR(str)[len] = '\0';
	    return str;
	}
	return Qnil;
    }
    /* the separator cannot match if it is longer than the string */
    if (rslen > len) return Qnil;
    /* last byte of the separator, used as a cheap first comparison */
    newline = RSTRING_PTR(rs)[rslen-1];
    if (rslen == 1 && newline == '\n')
	goto smart_chomp;

    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
	return Qnil;
    }
    /* pp: where the separator would have to start */
    pp = e - rslen;
    if (p[len-1] == newline &&
	(rslen <= 1 ||
	 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
	/* refuse a match that begins inside a multibyte character */
	if (rb_enc_left_char_head(p, pp, e, enc) != pp)
	    return Qnil;
	if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
	    ENC_CODERANGE_CLEAR(str);
	}
	STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
	return str;
    }
    return Qnil;
}
6796
6797
6798/*
6799 *  call-seq:
6800 *     str.chomp(separator=$/)   -> new_str
6801 *
6802 *  Returns a new <code>String</code> with the given record separator removed
6803 *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
6804 *  changed from the default Ruby record separator, then <code>chomp</code> also
6805 *  removes carriage return characters (that is it will remove <code>\n</code>,
6806 *  <code>\r</code>, and <code>\r\n</code>).
6807 *
6808 *     "hello".chomp            #=> "hello"
6809 *     "hello\n".chomp          #=> "hello"
6810 *     "hello\r\n".chomp        #=> "hello"
6811 *     "hello\n\r".chomp        #=> "hello\n"
6812 *     "hello\r".chomp          #=> "hello"
6813 *     "hello \n there".chomp   #=> "hello \n there"
6814 *     "hello".chomp("llo")     #=> "he"
6815 */
6816
6817static VALUE
6818rb_str_chomp(int argc, VALUE *argv, VALUE str)
6819{
6820    str = rb_str_dup(str);
6821    rb_str_chomp_bang(argc, argv, str);
6822    return str;
6823}
6824
6825/*
6826 *  call-seq:
6827 *     str.lstrip!   -> self or nil
6828 *
6829 *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
6830 *  change was made. See also <code>String#rstrip!</code> and
6831 *  <code>String#strip!</code>.
6832 *
6833 *     "  hello  ".lstrip   #=> "hello  "
6834 *     "hello".lstrip!      #=> nil
6835 */
6836
6837static VALUE
6838rb_str_lstrip_bang(VALUE str)
6839{
6840    rb_encoding *enc;
6841    char *s, *t, *e;
6842
6843    str_modify_keep_cr(str);
6844    enc = STR_ENC_GET(str);
6845    s = RSTRING_PTR(str);
6846    if (!s || RSTRING_LEN(str) == 0) return Qnil;
6847    e = t = RSTRING_END(str);
6848    /* remove spaces at head */
6849    while (s < e) {
6850	int n;
6851	unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
6852
6853	if (!rb_isspace(cc)) break;
6854	s += n;
6855    }
6856
6857    if (s > RSTRING_PTR(str)) {
6858	STR_SET_LEN(str, t-s);
6859	memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
6860	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6861	return str;
6862    }
6863    return Qnil;
6864}
6865
6866
6867/*
6868 *  call-seq:
6869 *     str.lstrip   -> new_str
6870 *
6871 *  Returns a copy of <i>str</i> with leading whitespace removed. See also
6872 *  <code>String#rstrip</code> and <code>String#strip</code>.
6873 *
6874 *     "  hello  ".lstrip   #=> "hello  "
6875 *     "hello".lstrip       #=> "hello"
6876 */
6877
6878static VALUE
6879rb_str_lstrip(VALUE str)
6880{
6881    str = rb_str_dup(str);
6882    rb_str_lstrip_bang(str);
6883    return str;
6884}
6885
6886
6887/*
6888 *  call-seq:
6889 *     str.rstrip!   -> self or nil
6890 *
6891 *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
6892 *  no change was made. See also <code>String#lstrip!</code> and
6893 *  <code>String#strip!</code>.
6894 *
6895 *     "  hello  ".rstrip   #=> "  hello"
6896 *     "hello".rstrip!      #=> nil
6897 */
6898
6899static VALUE
6900rb_str_rstrip_bang(VALUE str)
6901{
6902    rb_encoding *enc;
6903    char *s, *t, *e;
6904
6905    str_modify_keep_cr(str);
6906    enc = STR_ENC_GET(str);
6907    rb_str_check_dummy_enc(enc);
6908    s = RSTRING_PTR(str);
6909    if (!s || RSTRING_LEN(str) == 0) return Qnil;
6910    t = e = RSTRING_END(str);
6911
6912    /* remove trailing spaces or '\0's */
6913    if (single_byte_optimizable(str)) {
6914	unsigned char c;
6915	while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
6916    }
6917    else {
6918	char *tp;
6919
6920        while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
6921	    unsigned int c = rb_enc_codepoint(tp, e, enc);
6922	    if (c && !rb_isspace(c)) break;
6923	    t = tp;
6924	}
6925    }
6926    if (t < e) {
6927	long len = t-RSTRING_PTR(str);
6928
6929	STR_SET_LEN(str, len);
6930	RSTRING_PTR(str)[len] = '\0';
6931	return str;
6932    }
6933    return Qnil;
6934}
6935
6936
6937/*
6938 *  call-seq:
6939 *     str.rstrip   -> new_str
6940 *
6941 *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
6942 *  <code>String#lstrip</code> and <code>String#strip</code>.
6943 *
6944 *     "  hello  ".rstrip   #=> "  hello"
6945 *     "hello".rstrip       #=> "hello"
6946 */
6947
6948static VALUE
6949rb_str_rstrip(VALUE str)
6950{
6951    str = rb_str_dup(str);
6952    rb_str_rstrip_bang(str);
6953    return str;
6954}
6955
6956
6957/*
6958 *  call-seq:
6959 *     str.strip!   -> str or nil
6960 *
6961 *  Removes leading and trailing whitespace from <i>str</i>. Returns
6962 *  <code>nil</code> if <i>str</i> was not altered.
6963 */
6964
6965static VALUE
6966rb_str_strip_bang(VALUE str)
6967{
6968    VALUE l = rb_str_lstrip_bang(str);
6969    VALUE r = rb_str_rstrip_bang(str);
6970
6971    if (NIL_P(l) && NIL_P(r)) return Qnil;
6972    return str;
6973}
6974
6975
6976/*
6977 *  call-seq:
6978 *     str.strip   -> new_str
6979 *
6980 *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
6981 *
6982 *     "    hello    ".strip   #=> "hello"
6983 *     "\tgoodbye\r\n".strip   #=> "goodbye"
6984 */
6985
6986static VALUE
6987rb_str_strip(VALUE str)
6988{
6989    str = rb_str_dup(str);
6990    rb_str_strip_bang(str);
6991    return str;
6992}
6993
6994static VALUE
6995scan_once(VALUE str, VALUE pat, long *start)
6996{
6997    VALUE result, match;
6998    struct re_registers *regs;
6999    int i;
7000
7001    if (rb_reg_search(pat, str, *start, 0) >= 0) {
7002	match = rb_backref_get();
7003	regs = RMATCH_REGS(match);
7004	if (BEG(0) == END(0)) {
7005	    rb_encoding *enc = STR_ENC_GET(str);
7006	    /*
7007	     * Always consume at least one character of the input string
7008	     */
7009	    if (RSTRING_LEN(str) > END(0))
7010		*start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
7011						   RSTRING_END(str), enc);
7012	    else
7013		*start = END(0)+1;
7014	}
7015	else {
7016	    *start = END(0);
7017	}
7018	if (regs->num_regs == 1) {
7019	    return rb_reg_nth_match(0, match);
7020	}
7021	result = rb_ary_new2(regs->num_regs);
7022	for (i=1; i < regs->num_regs; i++) {
7023	    rb_ary_push(result, rb_reg_nth_match(i, match));
7024	}
7025
7026	return result;
7027    }
7028    return Qnil;
7029}
7030
7031
7032/*
7033 *  call-seq:
7034 *     str.scan(pattern)                         -> array
7035 *     str.scan(pattern) {|match, ...| block }   -> str
7036 *
7037 *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
7038 *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
7039 *  generated and either added to the result array or passed to the block. If
7040 *  the pattern contains no groups, each individual result consists of the
7041 *  matched string, <code>$&</code>.  If the pattern contains groups, each
7042 *  individual result is itself an array containing one entry per group.
7043 *
7044 *     a = "cruel world"
7045 *     a.scan(/\w+/)        #=> ["cruel", "world"]
7046 *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
7047 *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
7048 *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
7049 *
7050 *  And the block form:
7051 *
7052 *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
7053 *     print "\n"
7054 *     a.scan(/(.)(.)/) {|x,y| print y, x }
7055 *     print "\n"
7056 *
7057 *  <em>produces:</em>
7058 *
7059 *     <<cruel>> <<world>>
7060 *     rceu lowlr
7061 */
7062
7063static VALUE
7064rb_str_scan(VALUE str, VALUE pat)
7065{
7066    VALUE result;
7067    long start = 0;
7068    long last = -1, prev = 0;
7069    char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
7070
7071    pat = get_pat(pat, 1);
7072    if (!rb_block_given_p()) {
7073	VALUE ary = rb_ary_new();
7074
7075	while (!NIL_P(result = scan_once(str, pat, &start))) {
7076	    last = prev;
7077	    prev = start;
7078	    rb_ary_push(ary, result);
7079	}
7080	if (last >= 0) rb_reg_search(pat, str, last, 0);
7081	return ary;
7082    }
7083
7084    while (!NIL_P(result = scan_once(str, pat, &start))) {
7085	last = prev;
7086	prev = start;
7087	rb_yield(result);
7088	str_mod_check(str, p, len);
7089    }
7090    if (last >= 0) rb_reg_search(pat, str, last, 0);
7091    return str;
7092}
7093
7094
7095/*
7096 *  call-seq:
7097 *     str.hex   -> integer
7098 *
7099 *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
7100 *  (with an optional sign and an optional <code>0x</code>) and returns the
7101 *  corresponding number. Zero is returned on error.
7102 *
7103 *     "0x0a".hex     #=> 10
7104 *     "-1234".hex    #=> -4660
7105 *     "0".hex        #=> 0
7106 *     "wombat".hex   #=> 0
7107 */
7108
7109static VALUE
7110rb_str_hex(VALUE str)
7111{
7112    return rb_str_to_inum(str, 16, FALSE);
7113}
7114
7115
7116/*
7117 *  call-seq:
7118 *     str.oct   -> integer
7119 *
7120 *  Treats leading characters of <i>str</i> as a string of octal digits (with an
7121 *  optional sign) and returns the corresponding number.  Returns 0 if the
7122 *  conversion fails.
7123 *
7124 *     "123".oct       #=> 83
7125 *     "-377".oct      #=> -255
7126 *     "bad".oct       #=> 0
7127 *     "0377bad".oct   #=> 255
7128 */
7129
7130static VALUE
7131rb_str_oct(VALUE str)
7132{
7133    return rb_str_to_inum(str, -8, FALSE);
7134}
7135
7136
7137/*
7138 *  call-seq:
7139 *     str.crypt(salt_str)   -> new_str
7140 *
7141 *  Applies a one-way cryptographic hash to <i>str</i> by invoking the
7142 *  standard library function <code>crypt(3)</code> with the given
7143 *  salt string.  While the format and the result are system and
7144 *  implementation dependent, using a salt matching the regular
7145 *  expression <code>\A[a-zA-Z0-9./]{2}</code> should be valid and
7146 *  safe on any platform, in which only the first two characters are
7147 *  significant.
7148 *
7149 *  This method is for use in system specific scripts, so if you want
7150 *  a cross-platform hash function consider using Digest or OpenSSL
7151 *  instead.
7152 */
7153
7154static VALUE
7155rb_str_crypt(VALUE str, VALUE salt)
7156{
7157    extern char *crypt(const char *, const char *);
7158    VALUE result;
7159    const char *s, *saltp;
7160    char *res;
7161#ifdef BROKEN_CRYPT
7162    char salt_8bit_clean[3];
7163#endif
7164
7165    StringValue(salt);
7166    if (RSTRING_LEN(salt) < 2)
7167	rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
7168
7169    s = RSTRING_PTR(str);
7170    if (!s) s = "";
7171    saltp = RSTRING_PTR(salt);
7172#ifdef BROKEN_CRYPT
7173    if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
7174	salt_8bit_clean[0] = saltp[0] & 0x7f;
7175	salt_8bit_clean[1] = saltp[1] & 0x7f;
7176	salt_8bit_clean[2] = '\0';
7177	saltp = salt_8bit_clean;
7178    }
7179#endif
7180    res = crypt(s, saltp);
7181    if (!res) {
7182	rb_sys_fail("crypt");
7183    }
7184    result = rb_str_new2(res);
7185    OBJ_INFECT(result, str);
7186    OBJ_INFECT(result, salt);
7187    return result;
7188}
7189
7190
7191/*
7192 *  call-seq:
7193 *     str.intern   -> symbol
7194 *     str.to_sym   -> symbol
7195 *
7196 *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
7197 *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
7198 *
7199 *     "Koala".intern         #=> :Koala
7200 *     s = 'cat'.to_sym       #=> :cat
7201 *     s == :cat              #=> true
7202 *     s = '@cat'.to_sym      #=> :@cat
7203 *     s == :@cat             #=> true
7204 *
7205 *  This can also be used to create symbols that cannot be represented using the
7206 *  <code>:xxx</code> notation.
7207 *
7208 *     'cat and dog'.to_sym   #=> :"cat and dog"
7209 */
7210
7211VALUE
7212rb_str_intern(VALUE s)
7213{
7214    VALUE str = RB_GC_GUARD(s);
7215    ID id;
7216
7217    id = rb_intern_str(str);
7218    return ID2SYM(id);
7219}
7220
7221
7222/*
7223 *  call-seq:
7224 *     str.ord   -> integer
7225 *
7226 *  Return the <code>Integer</code> ordinal of a one-character string.
7227 *
7228 *     "a".ord         #=> 97
7229 */
7230
7231VALUE
7232rb_str_ord(VALUE s)
7233{
7234    unsigned int c;
7235
7236    c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
7237    return UINT2NUM(c);
7238}
7239/*
7240 *  call-seq:
7241 *     str.sum(n=16)   -> integer
7242 *
7243 *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
7244 *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
 *  to 16. The result is simply the sum of the binary value of each byte in
 *  <i>str</i> modulo <code>2**n</code>. This is not a particularly good
7247 *  checksum.
7248 */
7249
7250static VALUE
7251rb_str_sum(int argc, VALUE *argv, VALUE str)
7252{
7253    VALUE vbits;
7254    int bits;
7255    char *ptr, *p, *pend;
7256    long len;
7257    VALUE sum = INT2FIX(0);
7258    unsigned long sum0 = 0;
7259
7260    if (argc == 0) {
7261	bits = 16;
7262    }
7263    else {
7264	rb_scan_args(argc, argv, "01", &vbits);
7265	bits = NUM2INT(vbits);
7266    }
7267    ptr = p = RSTRING_PTR(str);
7268    len = RSTRING_LEN(str);
7269    pend = p + len;
7270
7271    while (p < pend) {
7272        if (FIXNUM_MAX - UCHAR_MAX < sum0) {
7273            sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
7274            str_mod_check(str, ptr, len);
7275            sum0 = 0;
7276        }
7277        sum0 += (unsigned char)*p;
7278        p++;
7279    }
7280
7281    if (bits == 0) {
7282        if (sum0) {
7283            sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
7284        }
7285    }
7286    else {
7287        if (sum == INT2FIX(0)) {
7288            if (bits < (int)sizeof(long)*CHAR_BIT) {
7289                sum0 &= (((unsigned long)1)<<bits)-1;
7290            }
7291            sum = LONG2FIX(sum0);
7292        }
7293        else {
7294            VALUE mod;
7295
7296            if (sum0) {
7297                sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
7298            }
7299
7300            mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
7301            mod = rb_funcall(mod, '-', 1, INT2FIX(1));
7302            sum = rb_funcall(sum, '&', 1, mod);
7303        }
7304    }
7305    return sum;
7306}
7307
/*
 * Shared implementation of ljust/rjust/center.
 *
 * argv holds the target width (in characters) and an optional padding
 * string (a single space when omitted).  jflag selects where the padding
 * goes: 'l' puts it all on the right, 'r' all on the left, anything else
 * splits it with the left side getting n/2.
 *
 * Returns a duplicate of str when the width is negative or not greater
 * than the current character length; raises ArgumentError for an empty
 * padding string or when the resulting byte size would overflow.
 */
static VALUE
rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
{
    rb_encoding *enc;
    VALUE w;
    /* flen: pad length in bytes; fclen: pad length in characters */
    long width, len, flen = 1, fclen = 1;
    VALUE res;
    char *p;
    const char *f = " ";
    /* llen/rlen: characters of padding on each side;
     * llen2/rlen2: byte length of the trailing partial pad on each side */
    long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
    volatile VALUE pad;
    int singlebyte = 1, cr;

    rb_scan_args(argc, argv, "11", &w, &pad);
    enc = STR_ENC_GET(str);
    width = NUM2LONG(w);
    if (argc == 2) {
	StringValue(pad);
	enc = rb_enc_check(str, pad);
	f = RSTRING_PTR(pad);
	flen = RSTRING_LEN(pad);
	fclen = str_strlen(pad, enc);
	singlebyte = single_byte_optimizable(pad);
	if (flen == 0 || fclen == 0) {
	    rb_raise(rb_eArgError, "zero width padding");
	}
    }
    len = str_strlen(str, enc);
    if (width < 0 || len >= width) return rb_str_dup(str);
    n = width - len;
    llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
    rlen = n - llen;
    cr = ENC_CODERANGE(str);
    if (flen > 1) {
       /* NOTE(review): str_offset presumably returns the byte offset of the
        * given character index within the pad -- confirm */
       llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
       rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
    }
    size = RSTRING_LEN(str);
    /* compute the padding byte count step by step, rejecting each step
     * that would reach LONG_MAX */
    if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
       (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
       (len += llen2 + rlen2) >= LONG_MAX - size) {
       rb_raise(rb_eArgError, "argument too big");
    }
    len += size;
    res = rb_str_new5(str, 0, len);
    p = RSTRING_PTR(res);
    /* left padding */
    if (flen <= 1) {
       memset(p, *f, llen);
       p += llen;
    }
    else {
       /* whole copies of the pad, then the partial remainder */
       while (llen >= fclen) {
	    memcpy(p,f,flen);
	    p += flen;
	    llen -= fclen;
	}
       if (llen > 0) {
           memcpy(p, f, llen2);
           p += llen2;
	}
    }
    /* the original contents */
    memcpy(p, RSTRING_PTR(str), size);
    p += size;
    /* right padding, built the same way as the left */
    if (flen <= 1) {
       memset(p, *f, rlen);
       p += rlen;
    }
    else {
       while (rlen >= fclen) {
	    memcpy(p,f,flen);
	    p += flen;
	    rlen -= fclen;
	}
       if (rlen > 0) {
           memcpy(p, f, rlen2);
           p += rlen2;
	}
    }
    *p = '\0';
    STR_SET_LEN(res, p-RSTRING_PTR(res));
    /* propagate taint from the receiver and, when given, the pad */
    OBJ_INFECT(res, str);
    if (!NIL_P(pad)) OBJ_INFECT(res, pad);
    rb_enc_associate(res, enc);
    /* combine the code ranges of both inputs; a broken result is left unset */
    if (argc == 2)
	cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
    if (cr != ENC_CODERANGE_BROKEN)
	ENC_CODERANGE_SET(res, cr);
    return res;
}
7397
7398
7399/*
7400 *  call-seq:
7401 *     str.ljust(integer, padstr=' ')   -> new_str
7402 *
7403 *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7404 *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
7405 *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7406 *
7407 *     "hello".ljust(4)            #=> "hello"
7408 *     "hello".ljust(20)           #=> "hello               "
7409 *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
7410 */
7411
7412static VALUE
7413rb_str_ljust(int argc, VALUE *argv, VALUE str)
7414{
7415    return rb_str_justify(argc, argv, str, 'l');
7416}
7417
7418
7419/*
7420 *  call-seq:
7421 *     str.rjust(integer, padstr=' ')   -> new_str
7422 *
7423 *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7424 *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
7425 *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7426 *
7427 *     "hello".rjust(4)            #=> "hello"
7428 *     "hello".rjust(20)           #=> "               hello"
7429 *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
7430 */
7431
7432static VALUE
7433rb_str_rjust(int argc, VALUE *argv, VALUE str)
7434{
7435    return rb_str_justify(argc, argv, str, 'r');
7436}
7437
7438
7439/*
7440 *  call-seq:
7441 *     str.center(width, padstr=' ')   -> new_str
7442 *
7443 *  Centers +str+ in +width+.  If +width+ is greater than the length of +str+,
7444 *  returns a new String of length +width+ with +str+ centered and padded with
7445 *  +padstr+; otherwise, returns +str+.
7446 *
7447 *     "hello".center(4)         #=> "hello"
7448 *     "hello".center(20)        #=> "       hello        "
7449 *     "hello".center(20, '123') #=> "1231231hello12312312"
7450 */
7451
7452static VALUE
7453rb_str_center(int argc, VALUE *argv, VALUE str)
7454{
7455    return rb_str_justify(argc, argv, str, 'c');
7456}
7457
7458/*
7459 *  call-seq:
7460 *     str.partition(sep)              -> [head, sep, tail]
7461 *     str.partition(regexp)           -> [head, match, tail]
7462 *
7463 *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
7464 *  and returns the part before it, the match, and the part
7465 *  after it.
 *  If it is not found, returns <i>str</i> and two empty strings.
7467 *
7468 *     "hello".partition("l")         #=> ["he", "l", "lo"]
7469 *     "hello".partition("x")         #=> ["hello", "", ""]
7470 *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
7471 */
7472
7473static VALUE
7474rb_str_partition(VALUE str, VALUE sep)
7475{
7476    long pos;
7477    int regex = FALSE;
7478
7479    if (RB_TYPE_P(sep, T_REGEXP)) {
7480	pos = rb_reg_search(sep, str, 0, 0);
7481	regex = TRUE;
7482    }
7483    else {
7484	VALUE tmp;
7485
7486	tmp = rb_check_string_type(sep);
7487	if (NIL_P(tmp)) {
7488	    rb_raise(rb_eTypeError, "type mismatch: %s given",
7489		     rb_obj_classname(sep));
7490	}
7491	sep = tmp;
7492	pos = rb_str_index(str, sep, 0);
7493    }
7494    if (pos < 0) {
7495      failed:
7496	return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
7497    }
7498    if (regex) {
7499	sep = rb_str_subpat(str, sep, INT2FIX(0));
7500	if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
7501    }
7502    return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
7503		          sep,
7504		          rb_str_subseq(str, pos+RSTRING_LEN(sep),
7505					     RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
7506}
7507
7508/*
7509 *  call-seq:
7510 *     str.rpartition(sep)             -> [head, sep, tail]
7511 *     str.rpartition(regexp)          -> [head, match, tail]
7512 *
7513 *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
7514 *  of the string, and returns the part before it, the match, and the part
7515 *  after it.
7516 *  If it is not found, returns two empty strings and <i>str</i>.
7517 *
7518 *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
7519 *     "hello".rpartition("x")         #=> ["", "", "hello"]
7520 *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
7521 */
7522
7523static VALUE
7524rb_str_rpartition(VALUE str, VALUE sep)
7525{
7526    long pos = RSTRING_LEN(str);
7527    int regex = FALSE;
7528
7529    if (RB_TYPE_P(sep, T_REGEXP)) {
7530	pos = rb_reg_search(sep, str, pos, 1);
7531	regex = TRUE;
7532    }
7533    else {
7534	VALUE tmp;
7535
7536	tmp = rb_check_string_type(sep);
7537	if (NIL_P(tmp)) {
7538	    rb_raise(rb_eTypeError, "type mismatch: %s given",
7539		     rb_obj_classname(sep));
7540	}
7541	sep = tmp;
7542	pos = rb_str_sublen(str, pos);
7543	pos = rb_str_rindex(str, sep, pos);
7544    }
7545    if (pos < 0) {
7546	return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
7547    }
7548    if (regex) {
7549	sep = rb_reg_nth_match(0, rb_backref_get());
7550    }
7551    return rb_ary_new3(3, rb_str_substr(str, 0, pos),
7552		          sep,
7553		          rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
7554}
7555
7556/*
7557 *  call-seq:
7558 *     str.start_with?([prefixes]+)   -> true or false
7559 *
7560 *  Returns true if +str+ starts with one of the +prefixes+ given.
7561 *
7562 *    "hello".start_with?("hell")               #=> true
7563 *
7564 *    # returns true if one of the prefixes matches.
7565 *    "hello".start_with?("heaven", "hell")     #=> true
7566 *    "hello".start_with?("heaven", "paradise") #=> false
7567 */
7568
7569static VALUE
7570rb_str_start_with(int argc, VALUE *argv, VALUE str)
7571{
7572    int i;
7573
7574    for (i=0; i<argc; i++) {
7575	VALUE tmp = argv[i];
7576	StringValue(tmp);
7577	rb_enc_check(str, tmp);
7578	if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
7579	if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
7580	    return Qtrue;
7581    }
7582    return Qfalse;
7583}
7584
7585/*
7586 *  call-seq:
7587 *     str.end_with?([suffixes]+)   -> true or false
7588 *
7589 *  Returns true if +str+ ends with one of the +suffixes+ given.
7590 */
7591
7592static VALUE
7593rb_str_end_with(int argc, VALUE *argv, VALUE str)
7594{
7595    int i;
7596    char *p, *s, *e;
7597    rb_encoding *enc;
7598
7599    for (i=0; i<argc; i++) {
7600	VALUE tmp = argv[i];
7601	StringValue(tmp);
7602	enc = rb_enc_check(str, tmp);
7603	if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
7604	p = RSTRING_PTR(str);
7605        e = p + RSTRING_LEN(str);
7606	s = e - RSTRING_LEN(tmp);
7607	if (rb_enc_left_char_head(p, s, e, enc) != s)
7608	    continue;
7609	if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
7610	    return Qtrue;
7611    }
7612    return Qfalse;
7613}
7614
7615void
7616rb_str_setter(VALUE val, ID id, VALUE *var)
7617{
7618    if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
7619	rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
7620    }
7621    *var = val;
7622}
7623
7624
7625/*
7626 *  call-seq:
7627 *     str.force_encoding(encoding)   -> str
7628 *
7629 *  Changes the encoding to +encoding+ and returns self.
7630 */
7631
7632static VALUE
7633rb_str_force_encoding(VALUE str, VALUE enc)
7634{
7635    str_modifiable(str);
7636    rb_enc_associate(str, rb_to_encoding(enc));
7637    ENC_CODERANGE_CLEAR(str);
7638    return str;
7639}
7640
7641/*
7642 *  call-seq:
7643 *     str.b   -> str
7644 *
7645 *  Returns a copied string whose encoding is ASCII-8BIT.
7646 */
7647
7648static VALUE
7649rb_str_b(VALUE str)
7650{
7651    VALUE str2 = str_alloc(rb_cString);
7652    str_replace_shared_without_enc(str2, str);
7653    OBJ_INFECT(str2, str);
7654    ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
7655    return str2;
7656}
7657
7658/*
7659 *  call-seq:
7660 *     str.valid_encoding?  -> true or false
7661 *
 *  Returns true for a string which is encoded correctly.
7663 *
7664 *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
7665 *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
7666 *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
7667 */
7668
7669static VALUE
7670rb_str_valid_encoding_p(VALUE str)
7671{
7672    int cr = rb_enc_str_coderange(str);
7673
7674    return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
7675}
7676
7677/*
7678 *  call-seq:
7679 *     str.ascii_only?  -> true or false
7680 *
7681 *  Returns true for a string which has only ASCII characters.
7682 *
7683 *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
7684 *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
7685 */
7686
7687static VALUE
7688rb_str_is_ascii_only_p(VALUE str)
7689{
7690    int cr = rb_enc_str_coderange(str);
7691
7692    return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
7693}
7694
7695/**
7696 * Shortens _str_ and adds three dots, an ellipsis, if it is longer
7697 * than _len_ characters.
7698 *
7699 * \param str	the string to ellipsize.
7700 * \param len	the maximum string length.
7701 * \return	the ellipsized string.
7702 * \pre 	_len_ must not be negative.
7703 * \post	the length of the returned string in characters is less than or equal to _len_.
7704 * \post	If the length of _str_ is less than or equal _len_, returns _str_ itself.
7705 * \post	the encoded of returned string is equal to the encoded of _str_.
7706 * \post	the class of returned string is equal to the class of _str_.
7707 * \note	the length is counted in characters.
7708 */
7709VALUE
7710rb_str_ellipsize(VALUE str, long len)
7711{
7712    static const char ellipsis[] = "...";
7713    const long ellipsislen = sizeof(ellipsis) - 1;
7714    rb_encoding *const enc = rb_enc_get(str);
7715    const long blen = RSTRING_LEN(str);
7716    const char *const p = RSTRING_PTR(str), *e = p + blen;
7717    VALUE estr, ret = 0;
7718
7719    if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
7720    if (len * rb_enc_mbminlen(enc) >= blen ||
7721	(e = rb_enc_nth(p, e, len, enc)) - p == blen) {
7722	ret = str;
7723    }
7724    else if (len <= ellipsislen ||
7725	     !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
7726	if (rb_enc_asciicompat(enc)) {
7727	    ret = rb_str_new_with_class(str, ellipsis, len);
7728	    rb_enc_associate(ret, enc);
7729	}
7730	else {
7731	    estr = rb_usascii_str_new(ellipsis, len);
7732	    ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
7733	}
7734    }
7735    else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
7736	rb_str_cat(ret, ellipsis, ellipsislen);
7737    }
7738    else {
7739	estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
7740			     rb_enc_from_encoding(enc), 0, Qnil);
7741	rb_str_append(ret, estr);
7742    }
7743    return ret;
7744}
7745
7746/**********************************************************************
7747 * Document-class: Symbol
7748 *
7749 *  <code>Symbol</code> objects represent names and some strings
7750 *  inside the Ruby
7751 *  interpreter. They are generated using the <code>:name</code> and
7752 *  <code>:"string"</code> literals
7753 *  syntax, and by the various <code>to_sym</code> methods. The same
7754 *  <code>Symbol</code> object will be created for a given name or string
7755 *  for the duration of a program's execution, regardless of the context
7756 *  or meaning of that name. Thus if <code>Fred</code> is a constant in
7757 *  one context, a method in another, and a class in a third, the
7758 *  <code>Symbol</code> <code>:Fred</code> will be the same object in
7759 *  all three contexts.
7760 *
7761 *     module One
7762 *       class Fred
7763 *       end
7764 *       $f1 = :Fred
7765 *     end
7766 *     module Two
7767 *       Fred = 1
7768 *       $f2 = :Fred
7769 *     end
7770 *     def Fred()
7771 *     end
7772 *     $f3 = :Fred
7773 *     $f1.object_id   #=> 2514190
7774 *     $f2.object_id   #=> 2514190
7775 *     $f3.object_id   #=> 2514190
7776 *
7777 */
7778
7779
7780/*
7781 *  call-seq:
7782 *     sym == obj   -> true or false
7783 *
7784 *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
7785 *  symbol, returns <code>true</code>.
7786 */
7787
7788static VALUE
7789sym_equal(VALUE sym1, VALUE sym2)
7790{
7791    if (sym1 == sym2) return Qtrue;
7792    return Qfalse;
7793}
7794
7795
7796static int
7797sym_printable(const char *s, const char *send, rb_encoding *enc)
7798{
7799    while (s < send) {
7800	int n;
7801	int c = rb_enc_codepoint_len(s, send, &n, enc);
7802
7803	if (!rb_enc_isprint(c, enc)) return FALSE;
7804	s += n;
7805    }
7806    return TRUE;
7807}
7808
7809int
7810rb_str_symname_p(VALUE sym)
7811{
7812    rb_encoding *enc;
7813    const char *ptr;
7814    long len;
7815    rb_encoding *resenc = rb_default_internal_encoding();
7816
7817    if (resenc == NULL) resenc = rb_default_external_encoding();
7818    enc = STR_ENC_GET(sym);
7819    ptr = RSTRING_PTR(sym);
7820    len = RSTRING_LEN(sym);
7821    if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
7822	!rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
7823	return FALSE;
7824    }
7825    return TRUE;
7826}
7827
7828VALUE
7829rb_str_quote_unprintable(VALUE str)
7830{
7831    rb_encoding *enc;
7832    const char *ptr;
7833    long len;
7834    rb_encoding *resenc;
7835
7836    Check_Type(str, T_STRING);
7837    resenc = rb_default_internal_encoding();
7838    if (resenc == NULL) resenc = rb_default_external_encoding();
7839    enc = STR_ENC_GET(str);
7840    ptr = RSTRING_PTR(str);
7841    len = RSTRING_LEN(str);
7842    if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
7843	!sym_printable(ptr, ptr + len, enc)) {
7844	return rb_str_inspect(str);
7845    }
7846    return str;
7847}
7848
7849VALUE
7850rb_id_quote_unprintable(ID id)
7851{
7852    return rb_str_quote_unprintable(rb_id2str(id));
7853}
7854
7855/*
7856 *  call-seq:
7857 *     sym.inspect    -> string
7858 *
7859 *  Returns the representation of <i>sym</i> as a symbol literal.
7860 *
7861 *     :fred.inspect   #=> ":fred"
7862 */
7863
7864static VALUE
7865sym_inspect(VALUE sym)
7866{
7867    VALUE str;
7868    const char *ptr;
7869    long len;
7870    ID id = SYM2ID(sym);
7871    char *dest;
7872
7873    sym = rb_id2str(id);
7874    if (!rb_str_symname_p(sym)) {
7875	str = rb_str_inspect(sym);
7876	len = RSTRING_LEN(str);
7877	rb_str_resize(str, len + 1);
7878	dest = RSTRING_PTR(str);
7879	memmove(dest + 1, dest, len);
7880	dest[0] = ':';
7881    }
7882    else {
7883	rb_encoding *enc = STR_ENC_GET(sym);
7884	ptr = RSTRING_PTR(sym);
7885	len = RSTRING_LEN(sym);
7886	str = rb_enc_str_new(0, len + 1, enc);
7887	dest = RSTRING_PTR(str);
7888	dest[0] = ':';
7889	memcpy(dest + 1, ptr, len);
7890    }
7891    return str;
7892}
7893
7894
7895/*
7896 *  call-seq:
7897 *     sym.id2name   -> string
7898 *     sym.to_s      -> string
7899 *
7900 *  Returns the name or string corresponding to <i>sym</i>.
7901 *
7902 *     :fred.id2name   #=> "fred"
7903 */
7904
7905
7906VALUE
7907rb_sym_to_s(VALUE sym)
7908{
7909    ID id = SYM2ID(sym);
7910
7911    return str_new3(rb_cString, rb_id2str(id));
7912}
7913
7914
7915/*
7916 * call-seq:
7917 *   sym.to_sym   -> sym
7918 *   sym.intern   -> sym
7919 *
7920 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
7921 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
7922 * in this case.
7923 */
7924
static VALUE
sym_to_sym(VALUE sym)
{
    /* The receiver is already a symbol, so it is returned unchanged. */
    return sym;
}
7930
7931static VALUE
7932sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc)
7933{
7934    VALUE obj;
7935
7936    if (argc < 1) {
7937	rb_raise(rb_eArgError, "no receiver given");
7938    }
7939    obj = argv[0];
7940    return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc);
7941}
7942
7943/*
7944 * call-seq:
7945 *   sym.to_proc
7946 *
 * Returns a _Proc_ object which responds to the given method by _sym_.
7948 *
7949 *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
7950 */
7951
7952static VALUE
7953sym_to_proc(VALUE sym)
7954{
7955    static VALUE sym_proc_cache = Qfalse;
7956    enum {SYM_PROC_CACHE_SIZE = 67};
7957    VALUE proc;
7958    long id, index;
7959    VALUE *aryp;
7960
7961    if (!sym_proc_cache) {
7962	sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
7963	rb_gc_register_mark_object(sym_proc_cache);
7964	rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
7965    }
7966
7967    id = SYM2ID(sym);
7968    index = (id % SYM_PROC_CACHE_SIZE) << 1;
7969
7970    aryp = RARRAY_PTR(sym_proc_cache);
7971    if (aryp[index] == sym) {
7972	return aryp[index + 1];
7973    }
7974    else {
7975	proc = rb_proc_new(sym_call, (VALUE)id);
7976	aryp[index] = sym;
7977	aryp[index + 1] = proc;
7978	return proc;
7979    }
7980}
7981
7982/*
7983 * call-seq:
7984 *
7985 *   sym.succ
7986 *
7987 * Same as <code>sym.to_s.succ.intern</code>.
7988 */
7989
7990static VALUE
7991sym_succ(VALUE sym)
7992{
7993    return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
7994}
7995
7996/*
7997 * call-seq:
7998 *
7999 *   symbol <=> other_symbol       -> -1, 0, +1 or nil
8000 *
8001 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
8002 * symbols. Returns -1, 0, +1 or nil depending on whether +symbol+ is less
8003 * than, equal to, or greater than +other_symbol+.
8004 *
8005 *  +nil+ is returned if the two values are incomparable.
8006 *
8007 * See String#<=> for more information.
8008 */
8009
8010static VALUE
8011sym_cmp(VALUE sym, VALUE other)
8012{
8013    if (!SYMBOL_P(other)) {
8014	return Qnil;
8015    }
8016    return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
8017}
8018
8019/*
8020 * call-seq:
8021 *
8022 *   sym.casecmp(other)  -> -1, 0, +1 or nil
8023 *
8024 * Case-insensitive version of <code>Symbol#<=></code>.
8025 */
8026
8027static VALUE
8028sym_casecmp(VALUE sym, VALUE other)
8029{
8030    if (!SYMBOL_P(other)) {
8031	return Qnil;
8032    }
8033    return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
8034}
8035
8036/*
8037 * call-seq:
8038 *   sym =~ obj   -> fixnum or nil
8039 *
8040 * Returns <code>sym.to_s =~ obj</code>.
8041 */
8042
8043static VALUE
8044sym_match(VALUE sym, VALUE other)
8045{
8046    return rb_str_match(rb_sym_to_s(sym), other);
8047}
8048
8049/*
8050 * call-seq:
8051 *   sym[idx]      -> char
 *   sym[b, n]     -> string
8053 *
8054 * Returns <code>sym.to_s[]</code>.
8055 */
8056
8057static VALUE
8058sym_aref(int argc, VALUE *argv, VALUE sym)
8059{
8060    return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
8061}
8062
8063/*
8064 * call-seq:
8065 *   sym.length    -> integer
8066 *
8067 * Same as <code>sym.to_s.length</code>.
8068 */
8069
8070static VALUE
8071sym_length(VALUE sym)
8072{
8073    return rb_str_length(rb_id2str(SYM2ID(sym)));
8074}
8075
8076/*
8077 * call-seq:
8078 *   sym.empty?   -> true or false
8079 *
 * Returns whether _sym_ is :"" or not.
8081 */
8082
8083static VALUE
8084sym_empty(VALUE sym)
8085{
8086    return rb_str_empty(rb_id2str(SYM2ID(sym)));
8087}
8088
8089/*
8090 * call-seq:
8091 *   sym.upcase    -> symbol
8092 *
8093 * Same as <code>sym.to_s.upcase.intern</code>.
8094 */
8095
8096static VALUE
8097sym_upcase(VALUE sym)
8098{
8099    return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
8100}
8101
8102/*
8103 * call-seq:
8104 *   sym.downcase  -> symbol
8105 *
8106 * Same as <code>sym.to_s.downcase.intern</code>.
8107 */
8108
8109static VALUE
8110sym_downcase(VALUE sym)
8111{
8112    return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
8113}
8114
8115/*
8116 * call-seq:
8117 *   sym.capitalize  -> symbol
8118 *
8119 * Same as <code>sym.to_s.capitalize.intern</code>.
8120 */
8121
8122static VALUE
8123sym_capitalize(VALUE sym)
8124{
8125    return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
8126}
8127
8128/*
8129 * call-seq:
8130 *   sym.swapcase  -> symbol
8131 *
8132 * Same as <code>sym.to_s.swapcase.intern</code>.
8133 */
8134
8135static VALUE
8136sym_swapcase(VALUE sym)
8137{
8138    return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
8139}
8140
8141/*
8142 * call-seq:
8143 *   sym.encoding   -> encoding
8144 *
8145 * Returns the Encoding object that represents the encoding of _sym_.
8146 */
8147
8148static VALUE
8149sym_encoding(VALUE sym)
8150{
8151    return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
8152}
8153
8154ID
8155rb_to_id(VALUE name)
8156{
8157    VALUE tmp;
8158
8159    switch (TYPE(name)) {
8160      default:
8161	tmp = rb_check_string_type(name);
8162	if (NIL_P(tmp)) {
8163	    tmp = rb_inspect(name);
8164	    rb_raise(rb_eTypeError, "%s is not a symbol",
8165		     RSTRING_PTR(tmp));
8166	}
8167	name = tmp;
8168	/* fall through */
8169      case T_STRING:
8170	name = rb_str_intern(name);
8171	/* fall through */
8172      case T_SYMBOL:
8173	return SYM2ID(name);
8174    }
8175
8176    UNREACHABLE;
8177}
8178
8179/*
8180 *  A <code>String</code> object holds and manipulates an arbitrary sequence of
8181 *  bytes, typically representing characters. String objects may be created
8182 *  using <code>String::new</code> or as literals.
8183 *
8184 *  Because of aliasing issues, users of strings should be aware of the methods
8185 *  that modify the contents of a <code>String</code> object.  Typically,
8186 *  methods with names ending in ``!'' modify their receiver, while those
8187 *  without a ``!'' return a new <code>String</code>.  However, there are
8188 *  exceptions, such as <code>String#[]=</code>.
8189 *
8190 */
8191
void
Init_String(void)
{
    /*
     * Defines the String and Symbol classes and binds their methods to the
     * C functions of this file.  In each rb_define_method() call the last
     * argument is the argument count; -1 is used for functions that take
     * (argc, argv, self), as sym_aref does.
     *
     * From here on rb_intern expands to rb_intern_const.
     */
#undef rb_intern
#define rb_intern(str) rb_intern_const(str)

    /* String: subclass of Object, includes Comparable, allocated by empty_str_alloc. */
    rb_cString  = rb_define_class("String", rb_cObject);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, empty_str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    /* Construction, comparison, operators, element access and searching. */
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "===", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
    rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);

    /* Conversions and printable representations. */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);

    /* Case conversion, non-bang forms. */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);

    /* Case conversion, bang forms. */
    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);

    /* Number parsing, splitting, enumeration helpers, concatenation, interning. */
    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_lines, -1);
    rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
    rb_define_method(rb_cString, "chars", rb_str_chars, 0);
    rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat, 1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0);
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    /* Substring predicates. */
    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    /* Padding / justification. */
    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    /* Substitution and trimming, non-bang forms. */
    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);

    /* Substitution and trimming, bang forms. */
    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);

    /* Character-set operations, non-bang forms (count has no bang form). */
    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    /* Character-set operations, bang forms. */
    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    /* Iterators. */
    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* "slice" shares its implementation with "[]". */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    /* Encoding-related methods. */
    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "b", rb_str_b, 0);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    /* Cache the ID of "to_s". */
    id_to_s = rb_intern("to_s");

    /* rb_fs starts as nil and is exposed under both $; and $-F. */
    rb_fs = Qnil;
    rb_define_variable("$;", &rb_fs);
    rb_define_variable("$-F", &rb_fs);

    /*
     * Symbol: subclass of Object, includes Comparable.  Its allocator is
     * undefined and "new" is removed from the class's singleton side.
     */
    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_include_module(rb_cSymbol, rb_mComparable);
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */

    /* Equality, representation and conversions. */
    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "===", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    /* Comparison and matching. */
    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);

    /* String-like accessors; note "match" is bound to the same function as "=~". */
    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    rb_define_method(rb_cSymbol, "match", sym_match, 1);

    /* Case conversion. */
    rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}
8367