1/*
2 * $Id: encoding.xs,v 0.3 2002/04/21 22:14:41 dankogai Exp $
3 */
4
5#define PERL_NO_GET_CONTEXT
6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
9#define U8 U8
10
11#define OUR_DEFAULT_FB	"Encode::PERLQQ"
12
13#if defined(USE_PERLIO) && !defined(USE_SFIO)
14
15/* Define an encoding "layer" in the perliol.h sense.
16
17   The layer defined here "inherits" in an object-oriented sense from
18   the "perlio" layer with its PerlIOBuf_* "methods".  The
19   implementation is particularly efficient as until Encode settles
20   down there is no point in tryint to tune it.
21
22   The layer works by overloading the "fill" and "flush" methods.
23
24   "fill" calls "SUPER::fill" in perl terms, then calls the encode OO
25   perl API to convert the encoded data to UTF-8 form, then copies it
26   back to the buffer. The "base class's" read methods then see the
27   UTF-8 data.
28
29   "flush" transforms the UTF-8 data deposited by the "base class's
30   write method in the buffer back into the encoded form using the
31   encode OO perl API, then copies data back into the buffer and calls
32   "SUPER::flush.
33
34   Note that "flush" is _also_ called for read mode - we still do the
35   (back)-translate so that the base class's "flush" sees the
36   correct number of encoded chars for positioning the seek
37   pointer. (This double translation is the worst performance issue -
38   particularly with all-perl encode engine.)
39
40*/
41
42#include "perliol.h"
43
44typedef struct {
45    PerlIOBuf base;		/* PerlIOBuf stuff */
46    SV *bufsv;			/* buffer seen by layers above */
47    SV *dataSV;			/* data we have read from layer below */
48    SV *enc;			/* the encoding object */
49    SV *chk;                    /* CHECK in Encode methods */
50    int flags;			/* Flags currently just needs lines */
51} PerlIOEncode;
52
53#define NEEDS_LINES	1
54
55SV *
56PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags)
57{
58    PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
59    SV *sv = &PL_sv_undef;
60    if (e->enc) {
61	dSP;
62	/* Not 100% sure stack swap is right thing to do during dup ... */
63	PUSHSTACKi(PERLSI_MAGIC);
64	SPAGAIN;
65	ENTER;
66	SAVETMPS;
67	PUSHMARK(sp);
68	XPUSHs(e->enc);
69	PUTBACK;
70	if (call_method("name", G_SCALAR) == 1) {
71	    SPAGAIN;
72	    sv = newSVsv(POPs);
73	    PUTBACK;
74	}
75	FREETMPS;
76	LEAVE;
77	POPSTACK;
78    }
79    return sv;
80}
81
82IV
83PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg, PerlIO_funcs *tab)
84{
85    PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
86    dSP;
87    IV  code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv,tab);
88    SV *result = Nullsv;
89
90    PUSHSTACKi(PERLSI_MAGIC);
91    SPAGAIN;
92
93    ENTER;
94    SAVETMPS;
95
96    PUSHMARK(sp);
97    XPUSHs(arg);
98    PUTBACK;
99    if (call_pv("Encode::find_encoding", G_SCALAR) != 1) {
100	/* should never happen */
101	Perl_die(aTHX_ "Encode::find_encoding did not return a value");
102	return -1;
103    }
104    SPAGAIN;
105    result = POPs;
106    PUTBACK;
107
108    if (!SvROK(result) || !SvOBJECT(SvRV(result))) {
109	e->enc = Nullsv;
110	Perl_warner(aTHX_ packWARN(WARN_IO), "Cannot find encoding \"%" SVf "\"",
111		    arg);
112	errno = EINVAL;
113	code = -1;
114    }
115    else {
116
117       /* $enc->renew */
118	PUSHMARK(sp);
119	XPUSHs(result);
120	PUTBACK;
121	if (call_method("renew",G_SCALAR|G_EVAL) != 1 || SvTRUE(ERRSV)) {
122	    Perl_warner(aTHX_ packWARN(WARN_IO), "\"%" SVf "\" does not support renew method",
123			arg);
124	}
125	else {
126	    SPAGAIN;
127	    result = POPs;
128	    PUTBACK;
129	}
130	e->enc = newSVsv(result);
131	PUSHMARK(sp);
132	XPUSHs(e->enc);
133	PUTBACK;
134	if (call_method("needs_lines",G_SCALAR|G_EVAL) != 1 || SvTRUE(ERRSV)) {
135	    Perl_warner(aTHX_ packWARN(WARN_IO), "\"%" SVf "\" does not support needs_lines",
136			arg);
137	}
138	else {
139	    SPAGAIN;
140	    result = POPs;
141	    PUTBACK;
142	    if (SvTRUE(result)) {
143		e->flags |= NEEDS_LINES;
144	    }
145	}
146	PerlIOBase(f)->flags |= PERLIO_F_UTF8;
147    }
148
149    e->chk = newSVsv(get_sv("PerlIO::encoding::fallback", 0));
150
151    FREETMPS;
152    LEAVE;
153    POPSTACK;
154    return code;
155}
156
157IV
158PerlIOEncode_popped(pTHX_ PerlIO * f)
159{
160    PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
161    if (e->enc) {
162	SvREFCNT_dec(e->enc);
163	e->enc = Nullsv;
164    }
165    if (e->bufsv) {
166	SvREFCNT_dec(e->bufsv);
167	e->bufsv = Nullsv;
168    }
169    if (e->dataSV) {
170	SvREFCNT_dec(e->dataSV);
171	e->dataSV = Nullsv;
172    }
173    if (e->chk) {
174	SvREFCNT_dec(e->chk);
175	e->chk = Nullsv;
176    }
177    return 0;
178}
179
180STDCHAR *
181PerlIOEncode_get_base(pTHX_ PerlIO * f)
182{
183    PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
184    if (!e->base.bufsiz)
185	e->base.bufsiz = 1024;
186    if (!e->bufsv) {
187	e->bufsv = newSV(e->base.bufsiz);
188	sv_setpvn(e->bufsv, "", 0);
189    }
190    e->base.buf = (STDCHAR *) SvPVX(e->bufsv);
191    if (!e->base.ptr)
192	e->base.ptr = e->base.buf;
193    if (!e->base.end)
194	e->base.end = e->base.buf;
195    if (e->base.ptr < e->base.buf
196	|| e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
197	Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
198		  e->base.buf + SvLEN(e->bufsv));
199	abort();
200    }
201    if (SvLEN(e->bufsv) < e->base.bufsiz) {
202	SSize_t poff = e->base.ptr - e->base.buf;
203	SSize_t eoff = e->base.end - e->base.buf;
204	e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz);
205	e->base.ptr = e->base.buf + poff;
206	e->base.end = e->base.buf + eoff;
207    }
208    if (e->base.ptr < e->base.buf
209	|| e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
210	Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
211		  e->base.buf + SvLEN(e->bufsv));
212	abort();
213    }
214    return e->base.buf;
215}
216
217IV
218PerlIOEncode_fill(pTHX_ PerlIO * f)
219{
220    PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
221    dSP;
222    IV code = 0;
223    PerlIO *n;
224    SSize_t avail;
225
226    if (PerlIO_flush(f) != 0)
227	return -1;
228    n  = PerlIONext(f);
229    if (!PerlIO_fast_gets(n)) {
230	/* Things get too messy if we don't have a buffer layer
231	   push a :perlio to do the job */
232	char mode[8];
233	n  = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv);
234	if (!n) {
235	    Perl_die(aTHX_ "panic: cannot push :perlio for %p",f);
236	}
237    }
238    PUSHSTACKi(PERLSI_MAGIC);
239    SPAGAIN;
240    ENTER;
241    SAVETMPS;
242  retry:
243    avail = PerlIO_get_cnt(n);
244    if (avail <= 0) {
245	avail = PerlIO_fill(n);
246	if (avail == 0) {
247	    avail = PerlIO_get_cnt(n);
248	}
249	else {
250	    if (!PerlIO_error(n) && PerlIO_eof(n))
251		avail = 0;
252	}
253    }
254    if (avail > 0 || (e->flags & NEEDS_LINES)) {
255	STDCHAR *ptr = PerlIO_get_ptr(n);
256	SSize_t use  = (avail >= 0) ? avail : 0;
257	SV *uni;
258	char *s;
259	STRLEN len = 0;
260	e->base.ptr = e->base.end = (STDCHAR *) Nullch;
261	(void) PerlIOEncode_get_base(aTHX_ f);
262	if (!e->dataSV)
263	    e->dataSV = newSV(0);
264	if (SvTYPE(e->dataSV) < SVt_PV) {
265	    sv_upgrade(e->dataSV,SVt_PV);
266	}
267	if (e->flags & NEEDS_LINES) {
268	    /* Encoding needs whole lines (e.g. iso-2022-*)
269	       search back from end of available data for
270	       and line marker
271	     */
272	    STDCHAR *nl = ptr+use-1;
273	    while (nl >= ptr) {
274		if (*nl == '\n') {
275		    break;
276		}
277		nl--;
278	    }
279	    if (nl >= ptr && *nl == '\n') {
280		/* found a line - take up to and including that */
281		use = (nl+1)-ptr;
282	    }
283	    else if (avail > 0) {
284		/* No line, but not EOF - append avail to the pending data */
285		sv_catpvn(e->dataSV, (char*)ptr, use);
286		PerlIO_set_ptrcnt(n, ptr+use, 0);
287		goto retry;
288	    }
289	    else if (!SvCUR(e->dataSV)) {
290		goto end_of_file;
291	    }
292	}
293	if (SvCUR(e->dataSV)) {
294	    /* something left over from last time - create a normal
295	       SV with new data appended
296	     */
297	    if (use + SvCUR(e->dataSV) > e->base.bufsiz) {
298		if (e->flags & NEEDS_LINES) {
299		    /* Have to grow buffer */
300		    e->base.bufsiz = use + SvCUR(e->dataSV);
301		    PerlIOEncode_get_base(aTHX_ f);
302		}
303		else {
304	       use = e->base.bufsiz - SvCUR(e->dataSV);
305	    }
306	    }
307	    sv_catpvn(e->dataSV,(char*)ptr,use);
308	}
309	else {
310	    /* Create a "dummy" SV to represent the available data from layer below */
311	    if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) {
312		Safefree(SvPVX(e->dataSV));
313	    }
314	    if (use > (SSize_t)e->base.bufsiz) {
315		if (e->flags & NEEDS_LINES) {
316		    /* Have to grow buffer */
317		    e->base.bufsiz = use;
318		    PerlIOEncode_get_base(aTHX_ f);
319		}
320		else {
321	       use = e->base.bufsiz;
322	    }
323	    }
324	    SvPVX(e->dataSV) = (char *) ptr;
325	    SvLEN(e->dataSV) = 0;  /* Hands off sv.c - it isn't yours */
326	    SvCUR_set(e->dataSV,use);
327	    SvPOK_only(e->dataSV);
328	}
329	SvUTF8_off(e->dataSV);
330	PUSHMARK(sp);
331	XPUSHs(e->enc);
332	XPUSHs(e->dataSV);
333	XPUSHs(e->chk);
334	PUTBACK;
335	if (call_method("decode", G_SCALAR) != 1) {
336	    Perl_die(aTHX_ "panic: decode did not return a value");
337	}
338	SPAGAIN;
339	uni = POPs;
340	PUTBACK;
341	/* Now get translated string (forced to UTF-8) and use as buffer */
342	if (SvPOK(uni)) {
343	    s = SvPVutf8(uni, len);
344#ifdef PARANOID_ENCODE_CHECKS
345	    if (len && !is_utf8_string((U8*)s,len)) {
346		Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s);
347	    }
348#endif
349	}
350	if (len > 0) {
351	    /* Got _something */
352	    /* if decode gave us back dataSV then data may vanish when
353	       we do ptrcnt adjust - so take our copy now.
354	       (The copy is a pain - need a put-it-here option for decode.)
355	     */
356	    sv_setpvn(e->bufsv,s,len);
357	    e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv);
358	    e->base.end = e->base.ptr + SvCUR(e->bufsv);
359	    PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
360	    SvUTF8_on(e->bufsv);
361
362	    /* Adjust ptr/cnt not taking anything which
363	       did not translate - not clear this is a win */
364	    /* compute amount we took */
365	    use -= SvCUR(e->dataSV);
366	    PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
367	    /* and as we did not take it it isn't pending */
368	    SvCUR_set(e->dataSV,0);
369	} else {
370	    /* Got nothing - assume partial character so we need some more */
371	    /* Make sure e->dataSV is a normal SV before re-filling as
372	       buffer alias will change under us
373	     */
374	    s = SvPV(e->dataSV,len);
375	    sv_setpvn(e->dataSV,s,len);
376	    PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
377	    goto retry;
378	}
379    }
380    else {
381    end_of_file:
382	code = -1;
383	if (avail == 0)
384	    PerlIOBase(f)->flags |= PERLIO_F_EOF;
385	else
386	    PerlIOBase(f)->flags |= PERLIO_F_ERROR;
387    }
388    FREETMPS;
389    LEAVE;
390    POPSTACK;
391    return code;
392}
393
394IV
395PerlIOEncode_flush(pTHX_ PerlIO * f)
396{
397    PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
398    IV code = 0;
399
400    if (e->bufsv) {
401	dSP;
402	SV *str;
403	char *s;
404	STRLEN len;
405	SSize_t count = 0;
406	if ((PerlIOBase(f)->flags & PERLIO_F_WRBUF) && (e->base.ptr > e->base.buf)) {
407	    /* Write case - encode the buffer and write() to layer below */
408	    PUSHSTACKi(PERLSI_MAGIC);
409	    SPAGAIN;
410	    ENTER;
411	    SAVETMPS;
412	    PUSHMARK(sp);
413	    XPUSHs(e->enc);
414	    SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
415	    SvUTF8_on(e->bufsv);
416	    XPUSHs(e->bufsv);
417	    XPUSHs(e->chk);
418	    PUTBACK;
419	    if (call_method("encode", G_SCALAR) != 1) {
420		Perl_die(aTHX_ "panic: encode did not return a value");
421	    }
422	    SPAGAIN;
423	    str = POPs;
424	    PUTBACK;
425	    s = SvPV(str, len);
426	    count = PerlIO_write(PerlIONext(f),s,len);
427	    if ((STRLEN)count != len) {
428		code = -1;
429	    }
430	    FREETMPS;
431	    LEAVE;
432	    POPSTACK;
433	    if (PerlIO_flush(PerlIONext(f)) != 0) {
434		code = -1;
435	    }
436	    if (SvCUR(e->bufsv)) {
437		/* Did not all translate */
438		e->base.ptr = e->base.buf+SvCUR(e->bufsv);
439		return code;
440	    }
441	}
442	else if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF)) {
443	    /* read case */
444	    /* if we have any untranslated stuff then unread that first */
445	    /* FIXME - unread is fragile is there a better way ? */
446	    if (e->dataSV && SvCUR(e->dataSV)) {
447		s = SvPV(e->dataSV, len);
448		count = PerlIO_unread(PerlIONext(f),s,len);
449		if ((STRLEN)count != len) {
450		    code = -1;
451		}
452		SvCUR_set(e->dataSV,0);
453	    }
454	    /* See if there is anything left in the buffer */
455	    if (e->base.ptr < e->base.end) {
456		/* Bother - have unread data.
457		   re-encode and unread() to layer below
458		 */
459		PUSHSTACKi(PERLSI_MAGIC);
460		SPAGAIN;
461		ENTER;
462		SAVETMPS;
463		str = sv_newmortal();
464		sv_upgrade(str, SVt_PV);
465		SvPVX(str) = (char*)e->base.ptr;
466		SvLEN(str) = 0;
467		SvCUR_set(str, e->base.end - e->base.ptr);
468		SvPOK_only(str);
469		SvUTF8_on(str);
470		PUSHMARK(sp);
471		XPUSHs(e->enc);
472		XPUSHs(str);
473		XPUSHs(e->chk);
474		PUTBACK;
475		if (call_method("encode", G_SCALAR) != 1) {
476		     Perl_die(aTHX_ "panic: encode did not return a value");
477		}
478		SPAGAIN;
479		str = POPs;
480		PUTBACK;
481		s = SvPV(str, len);
482		count = PerlIO_unread(PerlIONext(f),s,len);
483		if ((STRLEN)count != len) {
484		    code = -1;
485		}
486		FREETMPS;
487		LEAVE;
488		POPSTACK;
489	    }
490	}
491	e->base.ptr = e->base.end = e->base.buf;
492	PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
493    }
494    return code;
495}
496
497IV
498PerlIOEncode_close(pTHX_ PerlIO * f)
499{
500    PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
501    IV code;
502    if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) {
503	/* Discard partial character */
504	if (e->dataSV) {
505	    SvCUR_set(e->dataSV,0);
506	}
507	/* Don't back decode and unread any pending data */
508	e->base.ptr = e->base.end = e->base.buf;
509    }
510    code = PerlIOBase_close(aTHX_ f);
511    if (e->bufsv) {
512	/* This should only fire for write case */
513	if (e->base.buf && e->base.ptr > e->base.buf) {
514	    Perl_croak(aTHX_ "Close with partial character");
515	}
516	SvREFCNT_dec(e->bufsv);
517	e->bufsv = Nullsv;
518    }
519    e->base.buf = NULL;
520    e->base.ptr = NULL;
521    e->base.end = NULL;
522    PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
523    return code;
524}
525
526Off_t
527PerlIOEncode_tell(pTHX_ PerlIO * f)
528{
529    PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf);
530    /* Unfortunately the only way to get a postion is to (re-)translate,
531       the UTF8 we have in bufefr and then ask layer below
532     */
533    PerlIO_flush(f);
534    if (b->buf && b->ptr > b->buf) {
535	Perl_croak(aTHX_ "Cannot tell at partial character");
536    }
537    return PerlIO_tell(PerlIONext(f));
538}
539
540PerlIO *
541PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o,
542		 CLONE_PARAMS * params, int flags)
543{
544    if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) {
545	PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode);
546	PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode);
547	if (oe->enc) {
548	    fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params);
549	}
550    }
551    return f;
552}
553
554SSize_t
555PerlIOEncode_write(pTHX_ PerlIO *f, const void *vbuf, Size_t count)
556{
557    PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
558    if (e->flags & NEEDS_LINES) {
559	SSize_t done = 0;
560	const char *ptr = (const char *) vbuf;
561	const char *end = ptr+count;
562	while (ptr < end) {
563	    const char *nl = ptr;
564	    while (nl < end && *nl++ != '\n') /* empty body */;
565	    done = PerlIOBuf_write(aTHX_ f, ptr, nl-ptr);
566	    if (done != nl-ptr) {
567		if (done > 0) {
568		    ptr += done;
569		}
570		break;
571	    }
572	    ptr += done;
573	    if (ptr[-1] == '\n') {
574		if (PerlIOEncode_flush(aTHX_ f) != 0) {
575		    break;
576		}
577	    }
578	}
579	return (SSize_t) (ptr - (const char *) vbuf);
580    }
581    else {
582	return PerlIOBuf_write(aTHX_ f, vbuf, count);
583    }
584}
585
586PerlIO_funcs PerlIO_encode = {
587    sizeof(PerlIO_funcs),
588    "encoding",
589    sizeof(PerlIOEncode),
590    PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT,
591    PerlIOEncode_pushed,
592    PerlIOEncode_popped,
593    PerlIOBuf_open,
594    NULL, /* binmode - always pop */
595    PerlIOEncode_getarg,
596    PerlIOBase_fileno,
597    PerlIOEncode_dup,
598    PerlIOBuf_read,
599    PerlIOBuf_unread,
600    PerlIOEncode_write,
601    PerlIOBuf_seek,
602    PerlIOEncode_tell,
603    PerlIOEncode_close,
604    PerlIOEncode_flush,
605    PerlIOEncode_fill,
606    PerlIOBase_eof,
607    PerlIOBase_error,
608    PerlIOBase_clearerr,
609    PerlIOBase_setlinebuf,
610    PerlIOEncode_get_base,
611    PerlIOBuf_bufsiz,
612    PerlIOBuf_get_ptr,
613    PerlIOBuf_get_cnt,
614    PerlIOBuf_set_ptrcnt,
615};
616#endif				/* encode layer */
617
618MODULE = PerlIO::encoding PACKAGE = PerlIO::encoding
619
620PROTOTYPES: ENABLE
621
622BOOT:
623{
624    SV *chk = get_sv("PerlIO::encoding::fallback", GV_ADD|GV_ADDMULTI);
625    /*
626     * we now "use Encode ()" here instead of
627     * PerlIO/encoding.pm.  This avoids SEGV when ":encoding()"
628     * is invoked without prior "use Encode". -- dankogai
629     */
630    PUSHSTACKi(PERLSI_MAGIC);
631    SPAGAIN;
632    if (!get_cv(OUR_DEFAULT_FB, 0)) {
633#if 0
634	/* This would just be an irritant now loading works */
635	Perl_warner(aTHX_ packWARN(WARN_IO), ":encoding without 'use Encode'");
636#endif
637	ENTER;
638	/* Encode needs a lot of stack - it is likely to move ... */
639	PUTBACK;
640	/* The SV is magically freed by load_module */
641	load_module(PERL_LOADMOD_NOIMPORT, newSVpvn("Encode", 6), Nullsv, Nullsv);
642	SPAGAIN;
643	LEAVE;
644    }
645    PUSHMARK(sp);
646    PUTBACK;
647    if (call_pv(OUR_DEFAULT_FB, G_SCALAR) != 1) {
648	    /* should never happen */
649	    Perl_die(aTHX_ "%s did not return a value",OUR_DEFAULT_FB);
650    }
651    SPAGAIN;
652    sv_setsv(chk, POPs);
653    PUTBACK;
654#ifdef PERLIO_LAYERS
655    PerlIO_define_layer(aTHX_ &PerlIO_encode);
656#endif
657    POPSTACK;
658}
659