1/**********************************************************************
2
3  encoding.c -
4
5  $Author: nagachika $
6  created at: Thu May 24 17:23:27 JST 2007
7
8  Copyright (C) 2007 Yukihiro Matsumoto
9
10**********************************************************************/
11
12#include "ruby/ruby.h"
13#include "ruby/encoding.h"
14#include "internal.h"
15#include "regenc.h"
16#include <ctype.h>
17#ifndef NO_LOCALE_CHARMAP
18#ifdef __CYGWIN__
19#include <windows.h>
20#endif
21#ifdef HAVE_LANGINFO_H
22#include <langinfo.h>
23#endif
24#endif
25#include "ruby/util.h"
26
27#if defined __GNUC__ && __GNUC__ >= 4
28#pragma GCC visibility push(default)
29int rb_enc_register(const char *name, rb_encoding *encoding);
30void rb_enc_set_base(const char *name, const char *orig);
31void rb_encdb_declare(const char *name);
32int rb_encdb_replicate(const char *name, const char *orig);
33int rb_encdb_dummy(const char *name);
34int rb_encdb_alias(const char *alias, const char *orig);
35void rb_encdb_set_unicode(int index);
36#pragma GCC visibility pop
37#endif
38
39static ID id_encoding;
40VALUE rb_cEncoding;
41static VALUE rb_encoding_list;
42
43struct rb_encoding_entry {
44    const char *name;
45    rb_encoding *enc;
46    rb_encoding *base;
47};
48
49static struct {
50    struct rb_encoding_entry *list;
51    int count;
52    int size;
53    st_table *names;
54} enc_table;
55
56void rb_enc_init(void);
57
/* Number of encodings registered by rb_enc_init(). */
#define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
/* Index stored in the name table for a name that maps to "no encoding". */
#define UNSPECIFIED_ENCODING INT_MAX

/* Longest accepted encoding name, not counting the terminating NUL. */
#define ENCODING_NAMELEN_MAX 63
/* True for a non-null name no longer than ENCODING_NAMELEN_MAX. */
#define valid_encoding_name_p(name) ((name) != 0 && strlen(name) <= ENCODING_NAMELEN_MAX)

/* True while an encoding's max byte length is still zero. */
#define enc_autoload_p(enc) (rb_enc_mbmaxlen(enc) == 0)
65
66static int load_encoding(const char *name);
67
/*
 * Memory-size callback installed in encoding_data_type.
 * Always reports zero bytes, whatever pointer it is given.
 */
static size_t
enc_memsize(const void *p)
{
    (void)p;    /* the argument is never consulted */
    return 0;
}
73
74static const rb_data_type_t encoding_data_type = {
75    "encoding",
76    {0, 0, enc_memsize,},
77};
78
79#define is_data_encoding(obj) (RTYPEDDATA_P(obj) && RTYPEDDATA_TYPE(obj) == &encoding_data_type)
80
81static VALUE
82enc_new(rb_encoding *encoding)
83{
84    return TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, encoding);
85}
86
87static VALUE
88rb_enc_from_encoding_index(int idx)
89{
90    VALUE list, enc;
91
92    if (!(list = rb_encoding_list)) {
93	rb_bug("rb_enc_from_encoding_index(%d): no rb_encoding_list", idx);
94    }
95    enc = rb_ary_entry(list, idx);
96    if (NIL_P(enc)) {
97	rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx);
98    }
99    return enc;
100}
101
102VALUE
103rb_enc_from_encoding(rb_encoding *encoding)
104{
105    int idx;
106    if (!encoding) return Qnil;
107    idx = ENC_TO_ENCINDEX(encoding);
108    return rb_enc_from_encoding_index(idx);
109}
110
111static int enc_autoload(rb_encoding *);
112
113static int
114check_encoding(rb_encoding *enc)
115{
116    int index = rb_enc_to_index(enc);
117    if (rb_enc_from_index(index) != enc)
118	return -1;
119    if (enc_autoload_p(enc)) {
120	index = enc_autoload(enc);
121    }
122    return index;
123}
124
125static int
126enc_check_encoding(VALUE obj)
127{
128    if (SPECIAL_CONST_P(obj) || !rb_typeddata_is_kind_of(obj, &encoding_data_type)) {
129	return -1;
130    }
131    return check_encoding(RDATA(obj)->data);
132}
133
134static int
135must_encoding(VALUE enc)
136{
137    int index = enc_check_encoding(enc);
138    if (index < 0) {
139	rb_raise(rb_eTypeError, "wrong argument type %s (expected Encoding)",
140		 rb_obj_classname(enc));
141    }
142    return index;
143}
144
145int
146rb_to_encoding_index(VALUE enc)
147{
148    int idx;
149
150    idx = enc_check_encoding(enc);
151    if (idx >= 0) {
152	return idx;
153    }
154    else if (NIL_P(enc = rb_check_string_type(enc))) {
155	return -1;
156    }
157    if (!rb_enc_asciicompat(rb_enc_get(enc))) {
158	return -1;
159    }
160    return rb_enc_find_index(StringValueCStr(enc));
161}
162
163/* Returns encoding index or UNSPECIFIED_ENCODING */
164static int
165str_find_encindex(VALUE enc)
166{
167    int idx;
168
169    StringValue(enc);
170    if (!rb_enc_asciicompat(rb_enc_get(enc))) {
171	rb_raise(rb_eArgError, "invalid name encoding (non ASCII)");
172    }
173    idx = rb_enc_find_index(StringValueCStr(enc));
174    return idx;
175}
176
177static int
178str_to_encindex(VALUE enc)
179{
180    int idx = str_find_encindex(enc);
181    if (idx < 0) {
182	rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc));
183    }
184    return idx;
185}
186
187static rb_encoding *
188str_to_encoding(VALUE enc)
189{
190    return rb_enc_from_index(str_to_encindex(enc));
191}
192
193rb_encoding *
194rb_to_encoding(VALUE enc)
195{
196    if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
197    return str_to_encoding(enc);
198}
199
200rb_encoding *
201rb_find_encoding(VALUE enc)
202{
203    int idx;
204    if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
205    idx = str_find_encindex(enc);
206    if (idx < 0) return NULL;
207    return rb_enc_from_index(idx);
208}
209
/* GC hook for encodings; intentionally does nothing. */
void
rb_gc_mark_encodings(void)
{
    /* nothing to mark */
}
214
215static int
216enc_table_expand(int newsize)
217{
218    struct rb_encoding_entry *ent;
219    int count = newsize;
220
221    if (enc_table.size >= newsize) return newsize;
222    newsize = (newsize + 7) / 8 * 8;
223    ent = realloc(enc_table.list, sizeof(*enc_table.list) * newsize);
224    if (!ent) return -1;
225    memset(ent + enc_table.size, 0, sizeof(*ent)*(newsize - enc_table.size));
226    enc_table.list = ent;
227    enc_table.size = newsize;
228    return count;
229}
230
231static int
232enc_register_at(int index, const char *name, rb_encoding *encoding)
233{
234    struct rb_encoding_entry *ent = &enc_table.list[index];
235    VALUE list;
236
237    if (!valid_encoding_name_p(name)) return -1;
238    if (!ent->name) {
239	ent->name = name = strdup(name);
240    }
241    else if (STRCASECMP(name, ent->name)) {
242	return -1;
243    }
244    if (!ent->enc) {
245	ent->enc = xmalloc(sizeof(rb_encoding));
246    }
247    if (encoding) {
248	*ent->enc = *encoding;
249    }
250    else {
251	memset(ent->enc, 0, sizeof(*ent->enc));
252    }
253    encoding = ent->enc;
254    encoding->name = name;
255    encoding->ruby_encoding_index = index;
256    st_insert(enc_table.names, (st_data_t)name, (st_data_t)index);
257    list = rb_encoding_list;
258    if (list && NIL_P(rb_ary_entry(list, index))) {
259	/* initialize encoding data */
260	rb_ary_store(list, index, enc_new(encoding));
261    }
262    return index;
263}
264
265static int
266enc_register(const char *name, rb_encoding *encoding)
267{
268    int index = enc_table.count;
269
270    if ((index = enc_table_expand(index + 1)) < 0) return -1;
271    enc_table.count = index;
272    return enc_register_at(index - 1, name, encoding);
273}
274
275static void set_encoding_const(const char *, rb_encoding *);
276int rb_enc_registered(const char *name);
277
278int
279rb_enc_register(const char *name, rb_encoding *encoding)
280{
281    int index = rb_enc_registered(name);
282
283    if (index >= 0) {
284	rb_encoding *oldenc = rb_enc_from_index(index);
285	if (STRCASECMP(name, rb_enc_name(oldenc))) {
286	    index = enc_register(name, encoding);
287	}
288	else if (enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) {
289	    enc_register_at(index, name, encoding);
290	}
291	else {
292	    rb_raise(rb_eArgError, "encoding %s is already registered", name);
293	}
294    }
295    else {
296	index = enc_register(name, encoding);
297	set_encoding_const(name, rb_enc_from_index(index));
298    }
299    return index;
300}
301
/*
 * Declare +name+ as an encoding: register an empty placeholder entry when
 * the name is not known yet, then define its constant.
 */
void
rb_encdb_declare(const char *name)
{
    int declared = rb_enc_registered(name);

    if (declared < 0) {
        declared = enc_register(name, 0);
    }
    set_encoding_const(name, rb_enc_from_index(declared));
}
311
312static void
313enc_check_duplication(const char *name)
314{
315    if (rb_enc_registered(name) >= 0) {
316	rb_raise(rb_eArgError, "encoding %s is already registered", name);
317    }
318}
319
320static rb_encoding*
321set_base_encoding(int index, rb_encoding *base)
322{
323    rb_encoding *enc = enc_table.list[index].enc;
324
325    enc_table.list[index].base = base;
326    if (rb_enc_dummy_p(base)) ENC_SET_DUMMY(enc);
327    return enc;
328}
329
330/* for encdb.h
331 * Set base encoding for encodings which are not replicas
332 * but not in their own files.
333 */
334void
335rb_enc_set_base(const char *name, const char *orig)
336{
337    int idx = rb_enc_registered(name);
338    int origidx = rb_enc_registered(orig);
339    set_base_encoding(idx, rb_enc_from_index(origidx));
340}
341
342int
343rb_enc_replicate(const char *name, rb_encoding *encoding)
344{
345    int idx;
346
347    enc_check_duplication(name);
348    idx = enc_register(name, encoding);
349    set_base_encoding(idx, encoding);
350    set_encoding_const(name, rb_enc_from_index(idx));
351    return idx;
352}
353
354/*
355 * call-seq:
356 *   enc.replicate(name) -> encoding
357 *
 * Returns a replicated encoding of _enc_ whose name is _name_.
 * The new encoding should have the same byte structure as _enc_.
 * If _name_ is used by another encoding, raises ArgumentError.
361 *
362 */
363static VALUE
364enc_replicate(VALUE encoding, VALUE name)
365{
366    return rb_enc_from_encoding_index(
367	rb_enc_replicate(StringValueCStr(name),
368			 rb_to_encoding(encoding)));
369}
370
371static int
372enc_replicate_with_index(const char *name, rb_encoding *origenc, int idx)
373{
374    if (idx < 0) {
375	idx = enc_register(name, origenc);
376    }
377    else {
378	idx = enc_register_at(idx, name, origenc);
379    }
380    if (idx >= 0) {
381	set_base_encoding(idx, origenc);
382	set_encoding_const(name, rb_enc_from_index(idx));
383    }
384    return idx;
385}
386
/*
 * Declare +name+ as a replica of +orig+.  When +orig+ is not registered yet
 * an empty placeholder entry is created for it first.
 */
int
rb_encdb_replicate(const char *name, const char *orig)
{
    int base_index = rb_enc_registered(orig);
    const int own_index = rb_enc_registered(name);

    if (base_index < 0) {
        base_index = enc_register(orig, 0);
    }
    return enc_replicate_with_index(name, rb_enc_from_index(base_index), own_index);
}
398
399int
400rb_define_dummy_encoding(const char *name)
401{
402    int index = rb_enc_replicate(name, rb_ascii8bit_encoding());
403    rb_encoding *enc = enc_table.list[index].enc;
404
405    ENC_SET_DUMMY(enc);
406    return index;
407}
408
409int
410rb_encdb_dummy(const char *name)
411{
412    int index = enc_replicate_with_index(name, rb_ascii8bit_encoding(),
413					 rb_enc_registered(name));
414    rb_encoding *enc = enc_table.list[index].enc;
415
416    ENC_SET_DUMMY(enc);
417    return index;
418}
419
420/*
421 * call-seq:
422 *   enc.dummy? -> true or false
423 *
424 * Returns true for dummy encodings.
425 * A dummy encoding is an encoding for which character handling is not properly
426 * implemented.
427 * It is used for stateful encodings.
428 *
429 *   Encoding::ISO_2022_JP.dummy?       #=> true
430 *   Encoding::UTF_8.dummy?             #=> false
431 *
432 */
433static VALUE
434enc_dummy_p(VALUE enc)
435{
436    return ENC_DUMMY_P(enc_table.list[must_encoding(enc)].enc) ? Qtrue : Qfalse;
437}
438
439/*
440 * call-seq:
441 *   enc.ascii_compatible? -> true or false
442 *
443 * Returns whether ASCII-compatible or not.
444 *
445 *   Encoding::UTF_8.ascii_compatible?     #=> true
446 *   Encoding::UTF_16BE.ascii_compatible?  #=> false
447 *
448 */
449static VALUE
450enc_ascii_compatible_p(VALUE enc)
451{
452    return rb_enc_asciicompat(enc_table.list[must_encoding(enc)].enc) ? Qtrue : Qfalse;
453}
454
455/*
456 * Returns 1 when the encoding is Unicode series other than UTF-7 else 0.
457 */
458int
459rb_enc_unicode_p(rb_encoding *enc)
460{
461    return ONIGENC_IS_UNICODE(enc);
462}
463
464static st_data_t
465enc_dup_name(st_data_t name)
466{
467    return (st_data_t)strdup((const char *)name);
468}
469
470/*
471 * Returns copied alias name when the key is added for st_table,
472 * else returns NULL.
473 */
474static int
475enc_alias_internal(const char *alias, int idx)
476{
477    return st_insert2(enc_table.names, (st_data_t)alias, (st_data_t)idx,
478		      enc_dup_name);
479}
480
/*
 * Add +alias+ for encoding index +idx+ and, when enc_alias_internal()
 * returns zero, define the matching constant.
 * Returns +idx+, or -1 for an invalid alias name.
 */
static int
enc_alias(const char *alias, int idx)
{
    if (!valid_encoding_name_p(alias)) return -1;

    if (enc_alias_internal(alias, idx) == 0) {
        set_encoding_const(alias, rb_enc_from_index(idx));
    }
    return idx;
}
489
490int
491rb_enc_alias(const char *alias, const char *orig)
492{
493    int idx;
494
495    enc_check_duplication(alias);
496    if (!enc_table.list) {
497	rb_enc_init();
498    }
499    if ((idx = rb_enc_find_index(orig)) < 0) {
500	return -1;
501    }
502    return enc_alias(alias, idx);
503}
504
/*
 * Declare +alias+ for +orig+.  When +orig+ is not registered yet an empty
 * placeholder entry is created for it first.
 */
int
rb_encdb_alias(const char *alias, const char *orig)
{
    int target = rb_enc_registered(orig);

    if (target < 0) {
        target = enc_register(orig, 0);
    }
    return enc_alias(alias, target);
}
515
516void
517rb_encdb_set_unicode(int index)
518{
519    rb_enc_from_index(index)->flags |= ONIGENC_FLAG_UNICODE;
520}
521
/* Table indices of the encodings that rb_enc_init() registers. */
enum {
    ENCINDEX_ASCII = 0,
    ENCINDEX_UTF_8 = 1,
    ENCINDEX_US_ASCII = 2,
    ENCINDEX_BUILTIN_MAX = 3    /* number of built-in indices */
};
528
529extern rb_encoding OnigEncodingUTF_8;
530extern rb_encoding OnigEncodingUS_ASCII;
531
532void
533rb_enc_init(void)
534{
535    enc_table_expand(ENCODING_COUNT + 1);
536    if (!enc_table.names) {
537	enc_table.names = st_init_strcasetable();
538    }
539#define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
540    ENC_REGISTER(ASCII);
541    ENC_REGISTER(UTF_8);
542    ENC_REGISTER(US_ASCII);
543#undef ENC_REGISTER
544    enc_table.count = ENCINDEX_BUILTIN_MAX;
545}
546
547rb_encoding *
548rb_enc_from_index(int index)
549{
550    if (!enc_table.list) {
551	rb_enc_init();
552    }
553    if (index < 0 || enc_table.count <= index) {
554	return 0;
555    }
556    return enc_table.list[index].enc;
557}
558
559int
560rb_enc_registered(const char *name)
561{
562    st_data_t idx = 0;
563
564    if (!name) return -1;
565    if (!enc_table.list) return -1;
566    if (st_lookup(enc_table.names, (st_data_t)name, &idx)) {
567	return (int)idx;
568    }
569    return -1;
570}
571
572static VALUE
573require_enc(VALUE enclib)
574{
575    int safe = rb_safe_level();
576    return rb_require_safe(enclib, safe > 3 ? 3 : safe);
577}
578
579static int
580load_encoding(const char *name)
581{
582    VALUE enclib = rb_sprintf("enc/%s.so", name);
583    VALUE verbose = ruby_verbose;
584    VALUE debug = ruby_debug;
585    VALUE errinfo;
586    VALUE loaded;
587    char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3;
588    int idx;
589
590    while (s < e) {
591	if (!ISALNUM(*s)) *s = '_';
592	else if (ISUPPER(*s)) *s = (char)TOLOWER(*s);
593	++s;
594    }
595    FL_UNSET(enclib, FL_TAINT|FL_UNTRUSTED);
596    OBJ_FREEZE(enclib);
597    ruby_verbose = Qfalse;
598    ruby_debug = Qfalse;
599    errinfo = rb_errinfo();
600    loaded = rb_protect(require_enc, enclib, 0);
601    ruby_verbose = verbose;
602    ruby_debug = debug;
603    rb_set_errinfo(errinfo);
604    if (NIL_P(loaded)) return -1;
605    if ((idx = rb_enc_registered(name)) < 0) return -1;
606    if (enc_autoload_p(enc_table.list[idx].enc)) return -1;
607    return idx;
608}
609
610static int
611enc_autoload(rb_encoding *enc)
612{
613    int i;
614    rb_encoding *base = enc_table.list[ENC_TO_ENCINDEX(enc)].base;
615
616    if (base) {
617	i = 0;
618	do {
619	    if (i >= enc_table.count) return -1;
620	} while (enc_table.list[i].enc != base && (++i, 1));
621	if (enc_autoload_p(base)) {
622	    if (enc_autoload(base) < 0) return -1;
623	}
624	i = ENC_TO_ENCINDEX(enc);
625	enc_register_at(i, rb_enc_name(enc), base);
626    }
627    else {
628	i = load_encoding(rb_enc_name(enc));
629    }
630    return i;
631}
632
633/* Return encoding index or UNSPECIFIED_ENCODING from encoding name */
634int
635rb_enc_find_index(const char *name)
636{
637    int i = rb_enc_registered(name);
638    rb_encoding *enc;
639
640    if (i < 0) {
641	i = load_encoding(name);
642    }
643    else if (!(enc = rb_enc_from_index(i))) {
644	if (i != UNSPECIFIED_ENCODING) {
645	    rb_raise(rb_eArgError, "encoding %s is not registered", name);
646	}
647    }
648    else if (enc_autoload_p(enc)) {
649	if (enc_autoload(enc) < 0) {
650	    rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
651		    name);
652	    return 0;
653	}
654    }
655    return i;
656}
657
658rb_encoding *
659rb_enc_find(const char *name)
660{
661    int idx = rb_enc_find_index(name);
662    if (idx < 0) idx = 0;
663    return rb_enc_from_index(idx);
664}
665
666static inline int
667enc_capable(VALUE obj)
668{
669    if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj);
670    switch (BUILTIN_TYPE(obj)) {
671      case T_STRING:
672      case T_REGEXP:
673      case T_FILE:
674	return TRUE;
675      case T_DATA:
676	if (is_data_encoding(obj)) return TRUE;
677      default:
678	return FALSE;
679    }
680}
681
682ID
683rb_id_encoding(void)
684{
685    CONST_ID(id_encoding, "encoding");
686    return id_encoding;
687}
688
689int
690rb_enc_get_index(VALUE obj)
691{
692    int i = -1;
693    VALUE tmp;
694
695    if (SPECIAL_CONST_P(obj)) {
696	if (!SYMBOL_P(obj)) return -1;
697	obj = rb_id2str(SYM2ID(obj));
698    }
699    switch (BUILTIN_TYPE(obj)) {
700      as_default:
701      default:
702      case T_STRING:
703      case T_REGEXP:
704	i = ENCODING_GET_INLINED(obj);
705	if (i == ENCODING_INLINE_MAX) {
706	    VALUE iv;
707
708	    iv = rb_ivar_get(obj, rb_id_encoding());
709	    i = NUM2INT(iv);
710	}
711	break;
712      case T_FILE:
713	tmp = rb_funcall(obj, rb_intern("internal_encoding"), 0, 0);
714	if (NIL_P(tmp)) obj = rb_funcall(obj, rb_intern("external_encoding"), 0, 0);
715	else obj = tmp;
716	if (NIL_P(obj)) break;
717      case T_DATA:
718	if (is_data_encoding(obj)) {
719	    i = enc_check_encoding(obj);
720	}
721	else {
722	    goto as_default;
723	}
724	break;
725    }
726    return i;
727}
728
729static void
730enc_set_index(VALUE obj, int idx)
731{
732    if (idx < ENCODING_INLINE_MAX) {
733	ENCODING_SET_INLINED(obj, idx);
734	return;
735    }
736    ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
737    rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
738}
739
740void
741rb_enc_set_index(VALUE obj, int idx)
742{
743    rb_check_frozen(obj);
744    enc_set_index(obj, idx);
745}
746
747VALUE
748rb_enc_associate_index(VALUE obj, int idx)
749{
750/*    enc_check_capable(obj);*/
751    rb_check_frozen(obj);
752    if (rb_enc_get_index(obj) == idx)
753	return obj;
754    if (SPECIAL_CONST_P(obj)) {
755	rb_raise(rb_eArgError, "cannot set encoding");
756    }
757    if (!ENC_CODERANGE_ASCIIONLY(obj) ||
758	!rb_enc_asciicompat(rb_enc_from_index(idx))) {
759	ENC_CODERANGE_CLEAR(obj);
760    }
761    enc_set_index(obj, idx);
762    return obj;
763}
764
765VALUE
766rb_enc_associate(VALUE obj, rb_encoding *enc)
767{
768    return rb_enc_associate_index(obj, rb_enc_to_index(enc));
769}
770
771rb_encoding*
772rb_enc_get(VALUE obj)
773{
774    return rb_enc_from_index(rb_enc_get_index(obj));
775}
776
777rb_encoding*
778rb_enc_check(VALUE str1, VALUE str2)
779{
780    rb_encoding *enc = rb_enc_compatible(str1, str2);
781    if (!enc)
782	rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
783		 rb_enc_name(rb_enc_get(str1)),
784		 rb_enc_name(rb_enc_get(str2)));
785    return enc;
786}
787
788rb_encoding*
789rb_enc_compatible(VALUE str1, VALUE str2)
790{
791    int idx1, idx2;
792    rb_encoding *enc1, *enc2;
793    int isstr1, isstr2;
794
795    idx1 = rb_enc_get_index(str1);
796    idx2 = rb_enc_get_index(str2);
797
798    if (idx1 < 0 || idx2 < 0)
799        return 0;
800
801    if (idx1 == idx2) {
802	return rb_enc_from_index(idx1);
803    }
804    enc1 = rb_enc_from_index(idx1);
805    enc2 = rb_enc_from_index(idx2);
806
807    isstr2 = RB_TYPE_P(str2, T_STRING);
808    if (isstr2 && RSTRING_LEN(str2) == 0)
809	return enc1;
810    isstr1 = RB_TYPE_P(str1, T_STRING);
811    if (isstr1 && RSTRING_LEN(str1) == 0)
812	return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
813    if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
814	return 0;
815    }
816
817    /* objects whose encoding is the same of contents */
818    if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
819	return enc1;
820    if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
821	return enc2;
822
823    if (!isstr1) {
824	VALUE tmp = str1;
825	int idx0 = idx1;
826	str1 = str2;
827	str2 = tmp;
828	idx1 = idx2;
829	idx2 = idx0;
830	idx0 = isstr1;
831	isstr1 = isstr2;
832	isstr2 = idx0;
833    }
834    if (isstr1) {
835	int cr1, cr2;
836
837	cr1 = rb_enc_str_coderange(str1);
838	if (isstr2) {
839	    cr2 = rb_enc_str_coderange(str2);
840	    if (cr1 != cr2) {
841		/* may need to handle ENC_CODERANGE_BROKEN */
842		if (cr1 == ENC_CODERANGE_7BIT) return enc2;
843		if (cr2 == ENC_CODERANGE_7BIT) return enc1;
844	    }
845	    if (cr2 == ENC_CODERANGE_7BIT) {
846		return enc1;
847	    }
848	}
849	if (cr1 == ENC_CODERANGE_7BIT)
850	    return enc2;
851    }
852    return 0;
853}
854
855void
856rb_enc_copy(VALUE obj1, VALUE obj2)
857{
858    rb_enc_associate_index(obj1, rb_enc_get_index(obj2));
859}
860
861
862/*
863 *  call-seq:
864 *     obj.encoding   -> encoding
865 *
866 *  Returns the Encoding object that represents the encoding of obj.
867 */
868
869VALUE
870rb_obj_encoding(VALUE obj)
871{
872    int idx = rb_enc_get_index(obj);
873    if (idx < 0) {
874	rb_raise(rb_eTypeError, "unknown encoding");
875    }
876    return rb_enc_from_encoding_index(idx);
877}
878
879int
880rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
881{
882    return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
883}
884
885int
886rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
887{
888    int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
889    if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p)
890        return MBCLEN_CHARFOUND_LEN(n);
891    else {
892        int min = rb_enc_mbminlen(enc);
893        return min <= e-p ? min : (int)(e-p);
894    }
895}
896
897int
898rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
899{
900    int n;
901    if (e <= p)
902        return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
903    n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
904    if (e-p < n)
905        return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p));
906    return n;
907}
908
909int
910rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
911{
912    unsigned int c, l;
913    if (e <= p)
914        return -1;
915    if (rb_enc_asciicompat(enc)) {
916        c = (unsigned char)*p;
917        if (!ISASCII(c))
918            return -1;
919        if (len) *len = 1;
920        return c;
921    }
922    l = rb_enc_precise_mbclen(p, e, enc);
923    if (!MBCLEN_CHARFOUND_P(l))
924        return -1;
925    c = rb_enc_mbc_to_codepoint(p, e, enc);
926    if (!rb_enc_isascii(c, enc))
927        return -1;
928    if (len) *len = l;
929    return c;
930}
931
932unsigned int
933rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
934{
935    int r;
936    if (e <= p)
937        rb_raise(rb_eArgError, "empty string");
938    r = rb_enc_precise_mbclen(p, e, enc);
939    if (!MBCLEN_CHARFOUND_P(r)) {
940	rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
941    }
942    if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
943    return rb_enc_mbc_to_codepoint(p, e, enc);
944}
945
946#undef rb_enc_codepoint
947unsigned int
948rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
949{
950    return rb_enc_codepoint_len(p, e, 0, enc);
951}
952
953int
954rb_enc_codelen(int c, rb_encoding *enc)
955{
956    int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
957    if (n == 0) {
958	rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc));
959    }
960    return n;
961}
962
963int
964rb_enc_toupper(int c, rb_encoding *enc)
965{
966    return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c));
967}
968
969int
970rb_enc_tolower(int c, rb_encoding *enc)
971{
972    return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c));
973}
974
975/*
976 * call-seq:
977 *   enc.inspect -> string
978 *
979 * Returns a string which represents the encoding for programmers.
980 *
981 *   Encoding::UTF_8.inspect       #=> "#<Encoding:UTF-8>"
982 *   Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
983 */
984static VALUE
985enc_inspect(VALUE self)
986{
987    VALUE str = rb_sprintf("#<%s:%s%s>", rb_obj_classname(self),
988		      rb_enc_name((rb_encoding*)DATA_PTR(self)),
989		      (enc_dummy_p(self) ? " (dummy)" : ""));
990    ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
991    return str;
992}
993
994/*
995 * call-seq:
996 *   enc.name -> string
997 *
998 * Returns the name of the encoding.
999 *
1000 *   Encoding::UTF_8.name      #=> "UTF-8"
1001 */
1002static VALUE
1003enc_name(VALUE self)
1004{
1005    return rb_usascii_str_new2(rb_enc_name((rb_encoding*)DATA_PTR(self)));
1006}
1007
1008static int
1009enc_names_i(st_data_t name, st_data_t idx, st_data_t args)
1010{
1011    VALUE *arg = (VALUE *)args;
1012
1013    if ((int)idx == (int)arg[0]) {
1014	VALUE str = rb_usascii_str_new2((char *)name);
1015	OBJ_FREEZE(str);
1016	rb_ary_push(arg[1], str);
1017    }
1018    return ST_CONTINUE;
1019}
1020
1021/*
1022 * call-seq:
1023 *   enc.names -> array
1024 *
1025 * Returns the list of name and aliases of the encoding.
1026 *
1027 *   Encoding::WINDOWS_31J.names  #=> ["Windows-31J", "CP932", "csWindows31J"]
1028 */
1029static VALUE
1030enc_names(VALUE self)
1031{
1032    VALUE args[2];
1033
1034    args[0] = (VALUE)rb_to_encoding_index(self);
1035    args[1] = rb_ary_new2(0);
1036    st_foreach(enc_table.names, enc_names_i, (st_data_t)args);
1037    return args[1];
1038}
1039
1040/*
1041 * call-seq:
1042 *   Encoding.list -> [enc1, enc2, ...]
1043 *
1044 * Returns the list of loaded encodings.
1045 *
1046 *   Encoding.list
1047 *   #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1048 *         #<Encoding:ISO-2022-JP (dummy)>]
1049 *
1050 *   Encoding.find("US-ASCII")
1051 *   #=> #<Encoding:US-ASCII>
1052 *
1053 *   Encoding.list
1054 *   #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1055 *         #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
1056 *
1057 */
1058static VALUE
1059enc_list(VALUE klass)
1060{
1061    VALUE ary = rb_ary_new2(0);
1062    rb_ary_replace(ary, rb_encoding_list);
1063    return ary;
1064}
1065
1066/*
1067 * call-seq:
1068 *   Encoding.find(string) -> enc
1069 *   Encoding.find(symbol) -> enc
1070 *
 * Searches for the encoding with the specified <i>name</i>.
 * <i>name</i> should be a string or symbol.
1073 *
1074 *   Encoding.find("US-ASCII")  #=> #<Encoding:US-ASCII>
1075 *   Encoding.find(:Shift_JIS)  #=> #<Encoding:Shift_JIS>
1076 *
1077 * Names which this method accept are encoding names and aliases
1078 * including following special aliases
1079 *
1080 * "external"::   default external encoding
1081 * "internal"::   default internal encoding
1082 * "locale"::     locale encoding
1083 * "filesystem":: filesystem encoding
1084 *
 * An ArgumentError is raised when there is no encoding named <i>name</i>.
 * As the only exception, <code>Encoding.find("internal")</code> returns nil
 * when there is no encoding named "internal", in other words, when Ruby
 * has no default internal encoding.
1089 */
1090static VALUE
1091enc_find(VALUE klass, VALUE enc)
1092{
1093    int idx;
1094    if (RB_TYPE_P(enc, T_DATA) && is_data_encoding(enc))
1095	return enc;
1096    idx = str_to_encindex(enc);
1097    if (idx == UNSPECIFIED_ENCODING) return Qnil;
1098    return rb_enc_from_encoding_index(idx);
1099}
1100
1101/*
1102 * call-seq:
1103 *   Encoding.compatible?(obj1, obj2) -> enc or nil
1104 *
1105 * Checks the compatibility of two objects.
1106 *
1107 * If the objects are both strings they are compatible when they are
1108 * concatenatable.  The encoding of the concatenated string will be returned
1109 * if they are compatible, nil if they are not.
1110 *
1111 *   Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
1112 *   #=> #<Encoding:ISO-8859-1>
1113 *
1114 *   Encoding.compatible?(
1115 *     "\xa1".force_encoding("iso-8859-1"),
1116 *     "\xa1\xa1".force_encoding("euc-jp"))
1117 *   #=> nil
1118 *
1119 * If the objects are non-strings their encodings are compatible when they
1120 * have an encoding and:
1121 * * Either encoding is US-ASCII compatible
1122 * * One of the encodings is a 7-bit encoding
1123 *
1124 */
1125static VALUE
1126enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
1127{
1128    rb_encoding *enc;
1129
1130    if (!enc_capable(str1)) return Qnil;
1131    if (!enc_capable(str2)) return Qnil;
1132    enc = rb_enc_compatible(str1, str2);
1133    if (!enc) return Qnil;
1134    return rb_enc_from_encoding(enc);
1135}
1136
1137/* :nodoc: */
1138static VALUE
1139enc_dump(int argc, VALUE *argv, VALUE self)
1140{
1141    rb_scan_args(argc, argv, "01", 0);
1142    return enc_name(self);
1143}
1144
1145/* :nodoc: */
1146static VALUE
1147enc_load(VALUE klass, VALUE str)
1148{
1149    return enc_find(klass, str);
1150}
1151
1152rb_encoding *
1153rb_ascii8bit_encoding(void)
1154{
1155    if (!enc_table.list) {
1156	rb_enc_init();
1157    }
1158    return enc_table.list[ENCINDEX_ASCII].enc;
1159}
1160
1161int
1162rb_ascii8bit_encindex(void)
1163{
1164    return ENCINDEX_ASCII;
1165}
1166
1167rb_encoding *
1168rb_utf8_encoding(void)
1169{
1170    if (!enc_table.list) {
1171	rb_enc_init();
1172    }
1173    return enc_table.list[ENCINDEX_UTF_8].enc;
1174}
1175
1176int
1177rb_utf8_encindex(void)
1178{
1179    return ENCINDEX_UTF_8;
1180}
1181
1182rb_encoding *
1183rb_usascii_encoding(void)
1184{
1185    if (!enc_table.list) {
1186	rb_enc_init();
1187    }
1188    return enc_table.list[ENCINDEX_US_ASCII].enc;
1189}
1190
1191int
1192rb_usascii_encindex(void)
1193{
1194    return ENCINDEX_US_ASCII;
1195}
1196
1197int
1198rb_locale_encindex(void)
1199{
1200    VALUE charmap = rb_locale_charmap(rb_cEncoding);
1201    int idx;
1202
1203    if (NIL_P(charmap))
1204        idx = rb_usascii_encindex();
1205    else if ((idx = rb_enc_find_index(StringValueCStr(charmap))) < 0)
1206        idx = rb_ascii8bit_encindex();
1207
1208    if (rb_enc_registered("locale") < 0) enc_alias_internal("locale", idx);
1209
1210    return idx;
1211}
1212
1213rb_encoding *
1214rb_locale_encoding(void)
1215{
1216    return rb_enc_from_index(rb_locale_encindex());
1217}
1218
/*
 * Determine the filesystem encoding, register it under the "filesystem"
 * alias and return its index.
 *
 * On Windows/Cygwin builds with locale charmap support the ANSI or OEM
 * code page is used (ASCII-8BIT when that code page is unknown); every
 * other build uses the default external encoding.
 */
static int
enc_set_filesystem_encoding(void)
{
    int idx;
#if !defined NO_LOCALE_CHARMAP && (defined _WIN32 || defined __CYGWIN__)
    char cp[sizeof(int) * 8 / 3 + 4];

    snprintf(cp, sizeof cp, "CP%d", AreFileApisANSI() ? GetACP() : GetOEMCP());
    idx = rb_enc_find_index(cp);
    if (idx < 0) {
        idx = rb_ascii8bit_encindex();
    }
#else
    idx = rb_enc_to_index(rb_default_external_encoding());
#endif

    enc_alias_internal("filesystem", idx);
    return idx;
}
1237
/*
 * Index registered under the "filesystem" alias, or the ASCII-8BIT index
 * when that alias does not exist.
 */
int
rb_filesystem_encindex(void)
{
    const int idx = rb_enc_registered("filesystem");

    return (idx < 0) ? rb_ascii8bit_encindex() : idx;
}
1246
1247rb_encoding *
1248rb_filesystem_encoding(void)
1249{
1250    return rb_enc_from_index(rb_filesystem_encindex());
1251}
1252
1253struct default_encoding {
1254    int index;			/* -2 => not yet set, -1 => nil */
1255    rb_encoding *enc;
1256};
1257
1258static struct default_encoding default_external = {0};
1259
1260static int
1261enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name)
1262{
1263    int overridden = FALSE;
1264
1265    if (def->index != -2)
1266	/* Already set */
1267	overridden = TRUE;
1268
1269    if (NIL_P(encoding)) {
1270	def->index = -1;
1271	def->enc = 0;
1272	st_insert(enc_table.names, (st_data_t)strdup(name),
1273		  (st_data_t)UNSPECIFIED_ENCODING);
1274    }
1275    else {
1276	def->index = rb_enc_to_index(rb_to_encoding(encoding));
1277	def->enc = 0;
1278	enc_alias_internal(name, def->index);
1279    }
1280
1281    if (def == &default_external)
1282	enc_set_filesystem_encoding();
1283
1284    return overridden;
1285}
1286
1287rb_encoding *
1288rb_default_external_encoding(void)
1289{
1290    if (default_external.enc) return default_external.enc;
1291
1292    if (default_external.index >= 0) {
1293        default_external.enc = rb_enc_from_index(default_external.index);
1294        return default_external.enc;
1295    }
1296    else {
1297        return rb_locale_encoding();
1298    }
1299}
1300
1301VALUE
1302rb_enc_default_external(void)
1303{
1304    return rb_enc_from_encoding(rb_default_external_encoding());
1305}
1306
1307/*
1308 * call-seq:
1309 *   Encoding.default_external -> enc
1310 *
1311 * Returns default external encoding.
1312 *
1313 * The default external encoding is used by default for strings created from
1314 * the following locations:
1315 *
1316 * * CSV
1317 * * File data read from disk
1318 * * SDBM
1319 * * StringIO
1320 * * Zlib::GzipReader
1321 * * Zlib::GzipWriter
1322 * * String#inspect
1323 * * Regexp#inspect
1324 *
1325 * While strings created from these locations will have this encoding, the
1326 * encoding may not be valid.  Be sure to check String#valid_encoding?.
1327 *
1328 * File data written to disk will be transcoded to the default external
1329 * encoding when written.
1330 *
1331 * The default external encoding is initialized by the locale or -E option.
1332 */
1333static VALUE
1334get_default_external(VALUE klass)
1335{
1336    return rb_enc_default_external();
1337}
1338
1339void
1340rb_enc_set_default_external(VALUE encoding)
1341{
1342    if (NIL_P(encoding)) {
1343        rb_raise(rb_eArgError, "default external can not be nil");
1344    }
1345    enc_set_default_encoding(&default_external, encoding,
1346                            "external");
1347}
1348
1349/*
1350 * call-seq:
1351 *   Encoding.default_external = enc
1352 *
1353 * Sets default external encoding.  You should not set
1354 * Encoding::default_external in ruby code as strings created before changing
1355 * the value may have a different encoding from strings created after the value
 * was changed.  Instead you should use <tt>ruby -E</tt> to invoke ruby with
1357 * the correct default_external.
1358 *
1359 * See Encoding::default_external for information on how the default external
1360 * encoding is used.
1361 */
static VALUE
set_default_external(VALUE klass, VALUE encoding)
{
    /* the warning is issued before the value is changed */
    rb_warning("setting Encoding.default_external");
    /* raises ArgumentError when encoding is nil */
    rb_enc_set_default_external(encoding);
    /* hand the assigned value back unchanged */
    return encoding;
}
1369
/* Default internal encoding state; index starts at -2, the "not yet set" marker. */
static struct default_encoding default_internal = {-2};
1371
1372rb_encoding *
1373rb_default_internal_encoding(void)
1374{
1375    if (!default_internal.enc && default_internal.index >= 0) {
1376        default_internal.enc = rb_enc_from_index(default_internal.index);
1377    }
1378    return default_internal.enc; /* can be NULL */
1379}
1380
1381VALUE
1382rb_enc_default_internal(void)
1383{
1384    /* Note: These functions cope with default_internal not being set */
1385    return rb_enc_from_encoding(rb_default_internal_encoding());
1386}
1387
1388/*
1389 * call-seq:
1390 *   Encoding.default_internal -> enc
1391 *
1392 * Returns default internal encoding.  Strings will be transcoded to the
1393 * default internal encoding in the following places if the default internal
1394 * encoding is not nil:
1395 *
1396 * * CSV
1397 * * Etc.sysconfdir and Etc.systmpdir
1398 * * File data read from disk
1399 * * File names from Dir
1400 * * Integer#chr
1401 * * String#inspect and Regexp#inspect
1402 * * Strings returned from Curses
1403 * * Strings returned from Readline
1404 * * Strings returned from SDBM
1405 * * Time#zone
1406 * * Values from ENV
1407 * * Values in ARGV including $PROGRAM_NAME
1408 * * __FILE__
1409 *
1410 * Additionally String#encode and String#encode! use the default internal
1411 * encoding if no encoding is given.
1412 *
 * The script encoding (__ENCODING__), not default_internal, is used as the
1414 * encoding of created strings.
1415 *
1416 * Encoding::default_internal is initialized by the source file's
1417 * internal_encoding or -E option.
1418 */
1419static VALUE
1420get_default_internal(VALUE klass)
1421{
1422    return rb_enc_default_internal();
1423}
1424
1425void
1426rb_enc_set_default_internal(VALUE encoding)
1427{
1428    enc_set_default_encoding(&default_internal, encoding,
1429                            "internal");
1430}
1431
1432/*
1433 * call-seq:
1434 *   Encoding.default_internal = enc or nil
1435 *
1436 * Sets default internal encoding or removes default internal encoding when
1437 * passed nil.  You should not set Encoding::default_internal in ruby code as
1438 * strings created before changing the value may have a different encoding
1439 * from strings created after the change.  Instead you should use
1440 * <tt>ruby -E</tt> to invoke ruby with the correct default_internal.
1441 *
1442 * See Encoding::default_internal for information on how the default internal
1443 * encoding is used.
1444 */
static VALUE
set_default_internal(VALUE klass, VALUE encoding)
{
    /* the warning is issued before the value is changed */
    rb_warning("setting Encoding.default_internal");
    /* encoding is passed on as is; nil is not rejected here */
    rb_enc_set_default_internal(encoding);
    /* hand the assigned value back unchanged */
    return encoding;
}
1452
1453/*
1454 * call-seq:
1455 *   Encoding.locale_charmap -> string
1456 *
1457 * Returns the locale charmap name.
 * It returns nil if no appropriate information is available.
1459 *
1460 *   Debian GNU/Linux
1461 *     LANG=C
1462 *       Encoding.locale_charmap  #=> "ANSI_X3.4-1968"
1463 *     LANG=ja_JP.EUC-JP
1464 *       Encoding.locale_charmap  #=> "EUC-JP"
1465 *
1466 *   SunOS 5
1467 *     LANG=C
1468 *       Encoding.locale_charmap  #=> "646"
1469 *     LANG=ja
1470 *       Encoding.locale_charmap  #=> "eucJP"
1471 *
1472 * The result is highly platform dependent.
1473 * So Encoding.find(Encoding.locale_charmap) may cause an error.
1474 * If you need some encoding object even for unknown locale,
1475 * Encoding.find("locale") can be used.
1476 *
1477 */
1478VALUE
1479rb_locale_charmap(VALUE klass)
1480{
1481#if defined NO_LOCALE_CHARMAP
1482    return rb_usascii_str_new2("ASCII-8BIT");
1483#elif defined _WIN32 || defined __CYGWIN__
1484    const char *codeset = 0;
1485    char cp[sizeof(int) * 3 + 4];
1486# ifdef __CYGWIN__
1487    const char *nl_langinfo_codeset(void);
1488    codeset = nl_langinfo_codeset();
1489# endif
1490    if (!codeset) {
1491	UINT codepage = GetConsoleCP();
1492	if (!codepage) codepage = GetACP();
1493	snprintf(cp, sizeof(cp), "CP%d", codepage);
1494	codeset = cp;
1495    }
1496    return rb_usascii_str_new2(codeset);
1497#elif defined HAVE_LANGINFO_H
1498    char *codeset;
1499    codeset = nl_langinfo(CODESET);
1500    return rb_usascii_str_new2(codeset);
1501#else
1502    return Qnil;
1503#endif
1504}
1505
/*
 * Defines constants under rb_cEncoding that refer to the encoding object
 * of +enc+, with constant names derived from the encoding name +name+.
 *
 * - A name starting with a digit, or longer than ENCODING_NAMELEN_MAX,
 *   defines nothing.
 * - A name made only of an uppercase letter followed by alphanumerics
 *   and '_' is defined verbatim.
 * - Any other name is copied; in the copy a leading lowercase letter is
 *   upcased and every non-alphanumeric character becomes '_'.  That
 *   constant is defined only if the name contains an uppercase letter.
 * - If the name contains a lowercase letter, an all-uppercase variant is
 *   defined as well.
 *
 * For example "UTF-8" yields UTF_8, and "Shift_JIS" yields Shift_JIS and
 * SHIFT_JIS.
 */
static void
set_encoding_const(const char *name, rb_encoding *enc)
{
    VALUE encoding = rb_enc_from_encoding(enc);
    char *s = (char *)name;
    int haslower = 0, hasupper = 0, valid = 0;

    /* a name starting with a digit never gets a constant */
    if (ISDIGIT(*s)) return;
    if (ISUPPER(*s)) {
	hasupper = 1;
	/* scan the leading run of alphanumerics and '_'; s stops at the first other char */
	while (*++s && (ISALNUM(*s) || *s == '_')) {
	    if (ISLOWER(*s)) haslower = 1;
	}
    }
    if (!*s) {
	/* s reached the terminating NUL: the name is used verbatim */
	if (s - name > ENCODING_NAMELEN_MAX) return;
	valid = 1;
	rb_define_const(rb_cEncoding, name, encoding);
    }
    if (!valid || haslower) {
	/* a modified spelling is needed; work out the full length first */
	size_t len = s - name;
	if (len > ENCODING_NAMELEN_MAX) return;
	if (!haslower || !hasupper) {
	    /* keep scanning from s until both letter cases were seen or the name ends */
	    do {
		if (ISLOWER(*s)) haslower = 1;
		if (ISUPPER(*s)) hasupper = 1;
	    } while (*++s && (!haslower || !hasupper));
	    len = s - name;
	}
	/* add the part not scanned yet, so len is the length of the whole name */
	len += strlen(s);
	/* after the check, len++ makes room for the terminating NUL */
	if (len++ > ENCODING_NAMELEN_MAX) return;
	/* writable copy obtained with ALLOCA_N; name and s now refer to the copy */
	MEMCPY(s = ALLOCA_N(char, len), name, char, len);
	name = s;
	if (!valid) {
	    /* upcase a leading lowercase letter and replace non-alphanumerics by '_' */
	    if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
	    for (; *s; ++s) {
		if (!ISALNUM(*s)) *s = '_';
	    }
	    /* this spelling is defined only when the name has an uppercase letter */
	    if (hasupper) {
		rb_define_const(rb_cEncoding, name, encoding);
	    }
	}
	if (haslower) {
	    /* additionally define the all-uppercase spelling */
	    for (s = (char *)name; *s; ++s) {
		if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
	    }
	    rb_define_const(rb_cEncoding, name, encoding);
	}
    }
}
1556
1557static int
1558rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
1559{
1560    VALUE ary = (VALUE)arg;
1561    VALUE str = rb_usascii_str_new2((char *)name);
1562    OBJ_FREEZE(str);
1563    rb_ary_push(ary, str);
1564    return ST_CONTINUE;
1565}
1566
1567/*
1568 * call-seq:
1569 *   Encoding.name_list -> ["enc1", "enc2", ...]
1570 *
1571 * Returns the list of available encoding names.
1572 *
1573 *   Encoding.name_list
1574 *   #=> ["US-ASCII", "ASCII-8BIT", "UTF-8",
1575 *         "ISO-8859-1", "Shift_JIS", "EUC-JP",
1576 *         "Windows-31J",
1577 *         "BINARY", "CP932", "eucJP"]
1578 *
1579 */
1580
1581static VALUE
1582rb_enc_name_list(VALUE klass)
1583{
1584    VALUE ary = rb_ary_new2(enc_table.names->num_entries);
1585    st_foreach(enc_table.names, rb_enc_name_list_i, (st_data_t)ary);
1586    return ary;
1587}
1588
1589static int
1590rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg)
1591{
1592    VALUE *p = (VALUE *)arg;
1593    VALUE aliases = p[0], ary = p[1];
1594    int idx = (int)orig;
1595    VALUE key, str = rb_ary_entry(ary, idx);
1596
1597    if (NIL_P(str)) {
1598	rb_encoding *enc = rb_enc_from_index(idx);
1599
1600	if (!enc) return ST_CONTINUE;
1601	if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) {
1602	    return ST_CONTINUE;
1603	}
1604	str = rb_usascii_str_new2(rb_enc_name(enc));
1605	OBJ_FREEZE(str);
1606	rb_ary_store(ary, idx, str);
1607    }
1608    key = rb_usascii_str_new2((char *)name);
1609    OBJ_FREEZE(key);
1610    rb_hash_aset(aliases, key, str);
1611    return ST_CONTINUE;
1612}
1613
1614/*
1615 * call-seq:
1616 *   Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...}
1617 *
1618 * Returns the hash of available encoding alias and original encoding name.
1619 *
1620 *   Encoding.aliases
1621 *   #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
1622 *         "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1623 *
1624 */
1625
1626static VALUE
1627rb_enc_aliases(VALUE klass)
1628{
1629    VALUE aliases[2];
1630    aliases[0] = rb_hash_new();
1631    aliases[1] = rb_ary_new();
1632    st_foreach(enc_table.names, rb_enc_aliases_enc_i, (st_data_t)aliases);
1633    return aliases[0];
1634}
1635
1636/*
1637 * An Encoding instance represents a character encoding usable in Ruby. It is
1638 * defined as a constant under the Encoding namespace. It has a name and
1639 * optionally, aliases:
1640 *
1641 *   Encoding::ISO_8859_1.name
 *   #=> "ISO-8859-1"
1643 *
1644 *   Encoding::ISO_8859_1.names
1645 *   #=> ["ISO-8859-1", "ISO8859-1"]
1646 *
1647 * Ruby methods dealing with encodings return or accept Encoding instances as
1648 * arguments (when a method accepts an Encoding instance as an argument, it
1649 * can be passed an Encoding name or alias instead).
1650 *
1651 *   "some string".encoding
1652 *   #=> #<Encoding:UTF-8>
1653 *
1654 *   string = "some string".encode(Encoding::ISO_8859_1)
1655 *   #=> "some string"
1656 *   string.encoding
1657 *   #=> #<Encoding:ISO-8859-1>
1658 *
1659 *   "some string".encode "ISO-8859-1"
1660 *   #=> "some string"
1661 *
1662 * <code>Encoding::ASCII_8BIT</code> is a special encoding that is usually
1663 * used for a byte string, not a character string. But as the name insists,
1664 * its characters in the range of ASCII are considered as ASCII characters.
1665 * This is useful when you use ASCII-8BIT characters with other ASCII
1666 * compatible characters.
1667 *
1668 * == Changing an encoding
1669 *
1670 * The associated Encoding of a String can be changed in two different ways.
1671 *
1672 * First, it is possible to set the Encoding of a string to a new Encoding
1673 * without changing the internal byte representation of the string, with
1674 * String#force_encoding. This is how you can tell Ruby the correct encoding
1675 * of a string.
1676 *
1677 *   string
1678 *   #=> "R\xC3\xA9sum\xC3\xA9"
1679 *   string.encoding
1680 *   #=> #<Encoding:ISO-8859-1>
 *   string.force_encoding(Encoding::UTF_8)
1682 *   #=> "R\u00E9sum\u00E9"
1683 *
1684 * Second, it is possible to transcode a string, i.e. translate its internal
1685 * byte representation to another encoding. Its associated encoding is also
1686 * set to the other encoding. See String#encode for the various forms of
1687 * transcoding, and the Encoding::Converter class for additional control over
1688 * the transcoding process.
1689 *
1690 *   string
1691 *   #=> "R\u00E9sum\u00E9"
1692 *   string.encoding
1693 *   #=> #<Encoding:UTF-8>
1694 *   string = string.encode!(Encoding::ISO_8859_1)
1695 *   #=> "R\xE9sum\xE9"
1696 *   string.encoding
 *   #=> #<Encoding:ISO-8859-1>
1698 *
1699 * == Script encoding
1700 *
1701 * All Ruby script code has an associated Encoding which any String literal
1702 * created in the source code will be associated to.
1703 *
 * The default script encoding is <code>Encoding::US_ASCII</code>, but it can
1705 * be changed by a magic comment on the first line of the source code file (or
1706 * second line, if there is a shebang line on the first). The comment must
1707 * contain the word <code>coding</code> or <code>encoding</code>, followed
1708 * by a colon, space and the Encoding name or alias:
1709 *
1710 *   # encoding: UTF-8
1711 *
1712 *   "some string".encoding
1713 *   #=> #<Encoding:UTF-8>
1714 *
 * The <code>__ENCODING__</code> keyword returns the script encoding of the file
 * in which the keyword is written:
1717 *
1718 *   # encoding: ISO-8859-1
1719 *
1720 *   __ENCODING__
1721 *   #=> #<Encoding:ISO-8859-1>
1722 *
1723 * <code>ruby -K</code> will change the default locale encoding, but this is
 * not recommended. Ruby source files should declare their script encoding by a
1725 * magic comment even when they only depend on US-ASCII strings or regular
1726 * expressions.
1727 *
1728 * == Locale encoding
1729 *
1730 * The default encoding of the environment. Usually derived from locale.
1731 *
1732 * see Encoding.locale_charmap, Encoding.find('locale')
1733 *
1734 * == Filesystem encoding
1735 *
1736 * The default encoding of strings from the filesystem of the environment.
1737 * This is used for strings of file names or paths.
1738 *
1739 * see Encoding.find('filesystem')
1740 *
1741 * == External encoding
1742 *
1743 * Each IO object has an external encoding which indicates the encoding that
1744 * Ruby will use to read its data. By default Ruby sets the external encoding
1745 * of an IO object to the default external encoding. The default external
1746 * encoding is set by locale encoding or the interpreter <code>-E</code> option.
1747 * Encoding.default_external returns the current value of the external
1748 * encoding.
1749 *
1750 *   ENV["LANG"]
1751 *   #=> "UTF-8"
1752 *   Encoding.default_external
1753 *   #=> #<Encoding:UTF-8>
1754 *
1755 *   $ ruby -E ISO-8859-1 -e "p Encoding.default_external"
1756 *   #<Encoding:ISO-8859-1>
1757 *
1758 *   $ LANG=C ruby -e 'p Encoding.default_external'
1759 *   #<Encoding:US-ASCII>
1760 *
1761 * The default external encoding may also be set through
1762 * Encoding.default_external=, but you should not do this as strings created
1763 * before and after the change will have inconsistent encodings.  Instead use
1764 * <code>ruby -E</code> to invoke ruby with the correct external encoding.
1765 *
1766 * When you know that the actual encoding of the data of an IO object is not
1767 * the default external encoding, you can reset its external encoding with
1768 * IO#set_encoding or set it at IO object creation (see IO.new options).
1769 *
1770 * == Internal encoding
1771 *
1772 * To process the data of an IO object which has an encoding different
1773 * from its external encoding, you can set its internal encoding. Ruby will use
1774 * this internal encoding to transcode the data when it is read from the IO
1775 * object.
1776 *
1777 * Conversely, when data is written to the IO object it is transcoded from the
1778 * internal encoding to the external encoding of the IO object.
1779 *
1780 * The internal encoding of an IO object can be set with
1781 * IO#set_encoding or at IO object creation (see IO.new options).
1782 *
1783 * The internal encoding is optional and when not set, the Ruby default
1784 * internal encoding is used. If not explicitly set this default internal
1785 * encoding is +nil+ meaning that by default, no transcoding occurs.
1786 *
1787 * The default internal encoding can be set with the interpreter option
1788 * <code>-E</code>. Encoding.default_internal returns the current internal
1789 * encoding.
1790 *
1791 *    $ ruby -e 'p Encoding.default_internal'
1792 *    nil
1793 *
1794 *    $ ruby -E ISO-8859-1:UTF-8 -e "p [Encoding.default_external, \
1795 *      Encoding.default_internal]"
1796 *    [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>]
1797 *
1798 * The default internal encoding may also be set through
1799 * Encoding.default_internal=, but you should not do this as strings created
1800 * before and after the change will have inconsistent encodings.  Instead use
1801 * <code>ruby -E</code> to invoke ruby with the correct internal encoding.
1802 *
1803 * == IO encoding example
1804 *
1805 * In the following example a UTF-8 encoded string "R\u00E9sum\u00E9" is transcoded for
1806 * output to ISO-8859-1 encoding, then read back in and transcoded to UTF-8:
1807 *
1808 *   string = "R\u00E9sum\u00E9"
1809 *
1810 *   open("transcoded.txt", "w:ISO-8859-1") do |io|
1811 *     io.write(string)
1812 *   end
1813 *
1814 *   puts "raw text:"
1815 *   p File.binread("transcoded.txt")
1816 *   puts
1817 *
1818 *   open("transcoded.txt", "r:ISO-8859-1:UTF-8") do |io|
1819 *     puts "transcoded text:"
1820 *     p io.read
1821 *   end
1822 *
1823 * While writing the file, the internal encoding is not specified as it is
1824 * only necessary for reading.  While reading the file both the internal and
1825 * external encoding must be specified to obtain the correct result.
1826 *
1827 *   $ ruby t.rb
1828 *   raw text:
1829 *   "R\xE9sum\xE9"
1830 *
1831 *   transcoded text:
1832 *   "R\u00E9sum\u00E9"
1833 *
1834 */
1835
1836void
1837Init_Encoding(void)
1838{
1839#undef rb_intern
1840#define rb_intern(str) rb_intern_const(str)
1841    VALUE list;
1842    int i;
1843
1844    rb_cEncoding = rb_define_class("Encoding", rb_cObject);
1845    rb_undef_alloc_func(rb_cEncoding);
1846    rb_undef_method(CLASS_OF(rb_cEncoding), "new");
1847    rb_define_method(rb_cEncoding, "to_s", enc_name, 0);
1848    rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
1849    rb_define_method(rb_cEncoding, "name", enc_name, 0);
1850    rb_define_method(rb_cEncoding, "names", enc_names, 0);
1851    rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
1852    rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0);
1853    rb_define_method(rb_cEncoding, "replicate", enc_replicate, 1);
1854    rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
1855    rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
1856    rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
1857    rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
1858    rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
1859
1860    rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
1861    rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
1862
1863    rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0);
1864    rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1);
1865    rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0);
1866    rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1);
1867    rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0);
1868
1869    list = rb_ary_new2(enc_table.count);
1870    RBASIC(list)->klass = 0;
1871    rb_encoding_list = list;
1872    rb_gc_register_mark_object(list);
1873
1874    for (i = 0; i < enc_table.count; ++i) {
1875	rb_ary_push(list, enc_new(enc_table.list[i].enc));
1876    }
1877}
1878
1879/* locale insensitive ctype functions */
1880
1881#define ctype_test(c, ctype) \
1882    (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), (ctype)))
1883
1884int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); }
1885int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); }
1886int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); }
1887int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); }
1888int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); }
1889int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); }
1890int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); }
1891int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); }
1892int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); }
1893int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); }
1894int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); }
1895int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); }
1896
/*
 * Locale-insensitive tolower: values passing rb_isascii() are mapped with
 * ONIGENC_ASCII_CODE_TO_LOWER_CASE, anything else is returned unchanged.
 */
int
rb_tolower(int c)
{
    if (!rb_isascii(c)) return c;
    return ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
}
1902
/*
 * Locale-insensitive toupper: values passing rb_isascii() are mapped with
 * ONIGENC_ASCII_CODE_TO_UPPER_CASE, anything else is returned unchanged.
 */
int
rb_toupper(int c)
{
    if (!rb_isascii(c)) return c;
    return ONIGENC_ASCII_CODE_TO_UPPER_CASE(c);
}
1908
1909