1/*
2 * Copyright 2002-2009, Haiku, Inc. All rights reserved.
3 * Distributed under the terms of the MIT License.
4 *
5 * Authors:
6 *		Michael Wilber
7 *		Axel Dörfler, axeld@pinc-software.de
8 */
9
10
11#include "STXTTranslator.h"
12#include "STXTView.h"
13
14#include <Catalog.h>
15#include <CharacterSet.h>
16#include <CharacterSetRoster.h>
17#include <MimeType.h>
18#include <String.h>
19#include <UTF8.h>
20
21#include <algorithm>
22#include <new>
23#include <string.h>
24#include <stdio.h>
25#include <stdint.h>
26
27
28using namespace BPrivate;
29using namespace std;
30
31#undef B_TRANSLATION_CONTEXT
32#define B_TRANSLATION_CONTEXT "STXTTranslator"
33
34#define READ_BUFFER_SIZE 32768
35#define DATA_BUFFER_SIZE 256
36
// The input formats that this translator supports: plain text and Be
// styled text (STXT), both in the B_TRANSLATOR_TEXT group.
// NOTE(review): each initializer is assumed to follow the field order of
// translation_format (type, group, quality, capability, MIME type, name);
// the struct is declared outside this file -- confirm against its header.
static const translation_format sInputFormats[] = {
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_IN_QUALITY,
		TEXT_IN_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_IN_QUALITY,
		STXT_IN_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};
56
// The output formats that this translator supports: the same two formats
// as on the input side, but with the *_OUT_* quality/capability values.
// NOTE(review): each initializer is assumed to follow the field order of
// translation_format (type, group, quality, capability, MIME type, name);
// the struct is declared outside this file -- confirm against its header.
static const translation_format sOutputFormats[] = {
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_OUT_QUALITY,
		TEXT_OUT_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_OUT_QUALITY,
		STXT_OUT_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};
76
// Default settings for the Translator: the "header only" and "data only"
// extension flags are both boolean settings that default to false.
static const TranSetting sDefaultSettings[] = {
	{B_TRANSLATOR_EXT_HEADER_ONLY, TRAN_SETTING_BOOL, false},
	{B_TRANSLATOR_EXT_DATA_ONLY, TRAN_SETTING_BOOL, false}
};
82
83const uint32 kNumInputFormats = sizeof(sInputFormats) / sizeof(translation_format);
84const uint32 kNumOutputFormats = sizeof(sOutputFormats) / sizeof(translation_format);
85const uint32 kNumDefaultSettings = sizeof(sDefaultSettings) / sizeof(TranSetting);
86
87// ---------------------------------------------------------------
88// make_nth_translator
89//
90// Creates a STXTTranslator object to be used by BTranslatorRoster
91//
92// Preconditions:
93//
94// Parameters: n,		The translator to return. Since
95//						STXTTranslator only publishes one
96//						translator, it only returns a
97//						STXTTranslator if n == 0
98//
99//             you, 	The image_id of the add-on that
100//						contains code (not used).
101//
102//             flags,	Has no meaning yet, should be 0.
103//
104// Postconditions:
105//
106// Returns: NULL if n is not zero,
107//          a new STXTTranslator if n is zero
108// ---------------------------------------------------------------
109BTranslator *
110make_nth_translator(int32 n, image_id you, uint32 flags, ...)
111{
112	if (!n)
113		return new (std::nothrow) STXTTranslator();
114
115	return NULL;
116}
117
118
119// #pragma mark - ascmagic.c from the BSD file tool
120/*
121 * The following code has been taken from version 4.17 of the BSD file tool,
122 * file ascmagic.c, modified for our purpose.
123 */
124
125/*
126 * Copyright (c) Ian F. Darwin 1986-1995.
127 * Software written by Ian F. Darwin and others;
128 * maintained 1995-present by Christos Zoulas and others.
129 *
130 * Redistribution and use in source and binary forms, with or without
131 * modification, are permitted provided that the following conditions
132 * are met:
133 * 1. Redistributions of source code must retain the above copyright
134 *    notice immediately at the beginning of the file, without modification,
135 *    this list of conditions, and the following disclaimer.
136 * 2. Redistributions in binary form must reproduce the above copyright
137 *    notice, this list of conditions and the following disclaimer in the
138 *    documentation and/or other materials provided with the distribution.
139 *
140 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
141 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
142 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
143 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
144 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
145 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
146 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
147 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
148 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
149 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
150 * SUCH DAMAGE.
151 */
152/*
153 * ASCII magic -- file types that we know based on keywords
154 * that can appear anywhere in the file.
 *
169 * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
170 * to handle character codes other than ASCII on a unified basis.
171 *
172 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
173 * international characters, now subsumed into this file.
174 */
175
176#include <stdio.h>
177#include <string.h>
178#include <memory.h>
179#include <ctype.h>
180#include <stdlib.h>
181#include <unistd.h>
182#include "names.h"
183
/* Holds one decoded character; the looks_*() detectors fill arrays of it. */
typedef unsigned long my_unichar;

#define MAXLINELEN 300	/* longest sane line length */
/*
 * Whitespace test used on the decoded text: space, tab, CR, LF, the NEL
 * control (0x85) and form feed. The argument is evaluated several times,
 * so it must not have side effects.
 */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
		  || (x) == 0x85 || (x) == '\f')

/*
 * Helpers defined further down. Each looks_*() detector takes the raw bytes
 * and their count, an output array for the decoded characters and a pointer
 * receiving the number of characters stored; it returns non-zero when the
 * data matches its character code.
 */
static int looks_ascii(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_utf8(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_unicode(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_latin1(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_extended(const unsigned char *, size_t, my_unichar *, size_t *);
/* Translates an EBCDIC buffer to ASCII (input, length, output). */
static void from_ebcdic(const unsigned char *, size_t, unsigned char *);
/* Compares a NUL-terminated name against a run of decoded characters. */
static int ascmatch(const unsigned char *, const my_unichar *, size_t);
197
198
199static int
200file_ascmagic(const unsigned char *buf, size_t nbytes, BMimeType* mimeType,
201	const char*& encoding)
202{
203	size_t i;
204	unsigned char *nbuf = NULL;
205	my_unichar *ubuf = NULL;
206	size_t ulen;
207	struct names *p;
208	int rv = -1;
209
210	const char *code = NULL;
211	encoding = NULL;
212	const char *type = NULL;
213	const char *subtype = NULL;
214	const char *subtypeMimeGeneric = NULL;
215	const char *subtypeMimeSpecific = NULL;
216
217	int has_escapes = 0;
218	int has_backspace = 0;
219	int seen_cr = 0;
220
221	int n_crlf = 0;
222	int n_lf = 0;
223	int n_cr = 0;
224	int n_nel = 0;
225
226	int last_line_end = -1;
227	int has_long_lines = 0;
228
229	if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
230		goto done;
231	if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
232		goto done;
233
234	/*
235	 * Then try to determine whether it's any character code we can
236	 * identify.  Each of these tests, if it succeeds, will leave
237	 * the text converted into one-my_unichar-per-character Unicode in
238	 * ubuf, and the number of characters converted in ulen.
239	 */
240	if (nbytes == 0) {
241		code = "UTF-8 Unicode";
242		encoding = NULL; // "UTF-8";
243		type = "text";
244		rv = 1;
245	} else if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
246		code = "ASCII";
247		encoding = NULL; //"us-ascii";
248		type = "text";
249		if (nbytes == 1) {
250			// no further tests
251			rv = 1;
252		}
253	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
254		code = "UTF-8 Unicode";
255		encoding = NULL; // "UTF-8";
256		type = "text";
257	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
258		if (i == 1) {
259			code = "Little-endian UTF-16 Unicode";
260			encoding = "UTF-16";
261		} else {
262			code = "Big-endian UTF-16 Unicode";
263			encoding = "UTF-16";
264		}
265
266		type = "character data";
267	} else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
268		code = "ISO-8859";
269		type = "text";
270		encoding = "iso-8859-1";
271	} else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
272		code = "Non-ISO extended-ASCII";
273		type = "text";
274		encoding = "unknown";
275	} else {
276		from_ebcdic(buf, nbytes, nbuf);
277
278		if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
279			code = "EBCDIC";
280			type = "character data";
281			encoding = "ebcdic";
282		} else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
283			code = "International EBCDIC";
284			type = "character data";
285			encoding = "ebcdic";
286		} else {
287			rv = 0;
288			goto done;  /* doesn't look like text at all */
289		}
290	}
291
292	if (nbytes <= 1) {
293		if (rv == -1)
294			rv = 0;
295		goto done;
296	}
297
298	/*
299	 * for troff, look for . + letter + letter or .\";
300	 * this must be done to disambiguate tar archives' ./file
301	 * and other trash from real troff input.
302	 *
303	 * I believe Plan 9 troff allows non-ASCII characters in the names
304	 * of macros, so this test might possibly fail on such a file.
305	 */
306	if (*ubuf == '.') {
307		my_unichar *tp = ubuf + 1;
308
309		while (ISSPC(*tp))
310			++tp;	/* skip leading whitespace */
311		if ((tp[0] == '\\' && tp[1] == '\"') ||
312		    (isascii((unsigned char)tp[0]) &&
313		     isalnum((unsigned char)tp[0]) &&
314		     isascii((unsigned char)tp[1]) &&
315		     isalnum((unsigned char)tp[1]) &&
316		     ISSPC(tp[2]))) {
317		    subtypeMimeGeneric = "text/x-source-code";
318			subtypeMimeSpecific = "text/troff";
319			subtype = "troff or preprocessor input";
320			goto subtype_identified;
321		}
322	}
323
324	if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
325		subtypeMimeGeneric = "text/x-source-code";
326		subtypeMimeSpecific = "text/fortran";
327		subtype = "fortran program";
328		goto subtype_identified;
329	}
330
331	/* look for tokens from names.h - this is expensive! */
332
333	i = 0;
334	while (i < ulen) {
335		size_t end;
336
337		/*
338		 * skip past any leading space
339		 */
340		while (i < ulen && ISSPC(ubuf[i]))
341			i++;
342		if (i >= ulen)
343			break;
344
345		/*
346		 * find the next whitespace
347		 */
348		for (end = i + 1; end < nbytes; end++)
349			if (ISSPC(ubuf[end]))
350				break;
351
352		/*
353		 * compare the word thus isolated against the token list
354		 */
355		for (p = names; p < names + NNAMES; p++) {
356			if (ascmatch((const unsigned char *)p->name, ubuf + i,
357			    end - i)) {
358				subtype = types[p->type].human;
359				subtypeMimeGeneric = types[p->type].generic_mime;
360				subtypeMimeSpecific = types[p->type].specific_mime;
361				goto subtype_identified;
362			}
363		}
364
365		i = end;
366	}
367
368subtype_identified:
369
370	/*
371	 * Now try to discover other details about the file.
372	 */
373	for (i = 0; i < ulen; i++) {
374		if (ubuf[i] == '\n') {
375			if (seen_cr)
376				n_crlf++;
377			else
378				n_lf++;
379			last_line_end = i;
380		} else if (seen_cr)
381			n_cr++;
382
383		seen_cr = (ubuf[i] == '\r');
384		if (seen_cr)
385			last_line_end = i;
386
387		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
388			n_nel++;
389			last_line_end = i;
390		}
391
392		/* If this line is _longer_ than MAXLINELEN, remember it. */
393		if ((int)i > last_line_end + MAXLINELEN)
394			has_long_lines = 1;
395
396		if (ubuf[i] == '\033')
397			has_escapes = 1;
398		if (ubuf[i] == '\b')
399			has_backspace = 1;
400	}
401
402	rv = 1;
403done:
404	if (nbuf)
405		free(nbuf);
406	if (ubuf)
407		free(ubuf);
408
409	if (rv) {
410		// If we have identified the subtype, return it, otherwise just
411		// text/plain.
412
413		bool found = false;
414		if (subtypeMimeSpecific != NULL) {
415			mimeType->SetTo(subtypeMimeSpecific);
416			if (mimeType->IsInstalled())
417				found = true;
418		}
419		if (!found && subtypeMimeGeneric != NULL) {
420			mimeType->SetTo(subtypeMimeGeneric);
421			if (mimeType->IsInstalled())
422				found = true;
423		}
424		if (!found)
425			mimeType->SetTo("text/plain");
426	}
427
428	return rv;
429}
430
431static int
432ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen)
433{
434	size_t i;
435
436	for (i = 0; i < ulen; i++) {
437		if (s[i] != us[i])
438			return 0;
439	}
440
441	if (s[i])
442		return 0;
443	else
444		return 1;
445}
446
447/*
448 * This table reflects a particular philosophy about what constitutes
449 * "text," and there is room for disagreement about it.
450 *
451 * Version 3.31 of the file command considered a file to be ASCII if
452 * each of its characters was approved by either the isascii() or
453 * isalpha() function.  On most systems, this would mean that any
454 * file consisting only of characters in the range 0x00 ... 0x7F
455 * would be called ASCII text, but many systems might reasonably
456 * consider some characters outside this range to be alphabetic,
457 * so the file command would call such characters ASCII.  It might
458 * have been more accurate to call this "considered textual on the
459 * local system" than "ASCII."
460 *
461 * It considered a file to be "International language text" if each
462 * of its characters was either an ASCII printing character (according
463 * to the real ASCII standard, not the above test), a character in
464 * the range 0x80 ... 0xFF, or one of the following control characters:
465 * backspace, tab, line feed, vertical tab, form feed, carriage return,
466 * escape.  No attempt was made to determine the language in which files
467 * of this type were written.
468 *
469 *
470 * The table below considers a file to be ASCII if all of its characters
471 * are either ASCII printing characters (again, according to the X3.4
472 * standard, not isascii()) or any of the following controls: bell,
473 * backspace, tab, line feed, form feed, carriage return, esc, nextline.
474 *
475 * I include bell because some programs (particularly shell scripts)
476 * use it literally, even though it is rare in normal text.  I exclude
477 * vertical tab because it never seems to be used in real text.  I also
478 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
479 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
480 * character to.  It might be more appropriate to include it in the 8859
481 * set instead of the ASCII set, but it's got to be included in *something*
482 * we recognize or EBCDIC files aren't going to be considered textual.
483 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
484 * and Latin characters, so these should possibly be allowed.  But they
485 * make a real mess on VT100-style displays if they're not paired properly,
486 * so we are probably better off not calling them text.
487 *
488 * A file is considered to be ISO-8859 text if its characters are all
489 * either ASCII, according to the above definition, or printing characters
490 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
491 *
492 * Finally, a file is considered to be international text from some other
493 * character code if its characters are all either ISO-8859 (according to
494 * the above definition) or characters in the range 0x80 ... 0x9F, which
495 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
496 * consider to be printing characters.
497 */
498
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Class (F, T, I or X) of every possible byte value, indexed by the byte
 * itself: one row per high nibble, one column per low nibble.
 */
static char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
        /*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
525
526static int
527looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
528    size_t *ulen)
529{
530	int i;
531
532	*ulen = 0;
533
534	for (i = 0; i < (int)nbytes; i++) {
535		int t = text_chars[buf[i]];
536
537		if (t != T)
538			return 0;
539
540		ubuf[(*ulen)++] = buf[i];
541	}
542
543	return 1;
544}
545
546static int
547looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
548{
549	int i;
550
551	*ulen = 0;
552
553	for (i = 0; i < (int)nbytes; i++) {
554		int t = text_chars[buf[i]];
555
556		if (t != T && t != I)
557			return 0;
558
559		ubuf[(*ulen)++] = buf[i];
560	}
561
562	return 1;
563}
564
565static int
566looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
567    size_t *ulen)
568{
569	int i;
570
571	*ulen = 0;
572
573	for (i = 0; i < (int)nbytes; i++) {
574		int t = text_chars[buf[i]];
575
576		if (t != T && t != I && t != X)
577			return 0;
578
579		ubuf[(*ulen)++] = buf[i];
580	}
581
582	return 1;
583}
584
585static int
586looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
587{
588	int i, n;
589	my_unichar c;
590	int gotone = 0;
591
592	*ulen = 0;
593
594	for (i = 0; i < (int)nbytes; i++) {
595		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
596			/*
597			 * Even if the whole file is valid UTF-8 sequences,
598			 * still reject it if it uses weird control characters.
599			 */
600
601			if (text_chars[buf[i]] != T)
602				return 0;
603
604			ubuf[(*ulen)++] = buf[i];
605		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
606			return 0;
607		} else {			   /* 11xxxxxx begins UTF-8 */
608			int following;
609
610			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
611				c = buf[i] & 0x1f;
612				following = 1;
613			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
614				c = buf[i] & 0x0f;
615				following = 2;
616			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
617				c = buf[i] & 0x07;
618				following = 3;
619			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
620				c = buf[i] & 0x03;
621				following = 4;
622			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
623				c = buf[i] & 0x01;
624				following = 5;
625			} else
626				return 0;
627
628			for (n = 0; n < following; n++) {
629				i++;
630				if (i >= (int)nbytes)
631					goto done;
632
633				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
634					return 0;
635
636				c = (c << 6) + (buf[i] & 0x3f);
637			}
638
639			ubuf[(*ulen)++] = c;
640			gotone = 1;
641		}
642	}
643done:
644	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
645}
646
647static int
648looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
649    size_t *ulen)
650{
651	int bigend;
652	int i;
653
654	if (nbytes < 2)
655		return 0;
656
657	if (buf[0] == 0xff && buf[1] == 0xfe)
658		bigend = 0;
659	else if (buf[0] == 0xfe && buf[1] == 0xff)
660		bigend = 1;
661	else
662		return 0;
663
664	*ulen = 0;
665
666	for (i = 2; i + 1 < (int)nbytes; i += 2) {
667		/* XXX fix to properly handle chars > 65536 */
668
669		if (bigend)
670			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
671		else
672			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
673
674		if (ubuf[*ulen - 1] == 0xfffe)
675			return 0;
676		if (ubuf[*ulen - 1] < 128 &&
677		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
678			return 0;
679	}
680
681	return 1 + bigend;
682}
683
684#undef F
685#undef T
686#undef I
687#undef X
688
689/*
690 * This table maps each EBCDIC character to an (8-bit extended) ASCII
691 * character, as specified in the rationale for the dd(1) command in
692 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
693 *
694 * Unfortunately it does not seem to correspond exactly to any of the
695 * five variants of EBCDIC documented in IBM's _Enterprise Systems
696 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
697 * Edition, July, 1999, pp. I-1 - I-4.
698 *
699 * Fortunately, though, all versions of EBCDIC, including this one, agree
700 * on most of the printing characters that also appear in (7-bit) ASCII.
701 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
702 *
703 * Fortunately too, there is general agreement that codes 0x00 through
704 * 0x3F represent control characters, 0x41 a nonbreaking space, and the
705 * remainder printing characters.
706 *
707 * This is sufficient to allow us to identify EBCDIC text and to distinguish
708 * between old-style and internationalized examples of text.
709 */
710
/*
 * Indexed by the EBCDIC byte value (16 rows of 16 entries); each entry is
 * the (8-bit extended) ASCII byte that from_ebcdic() substitutes for it.
 */
static unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
729
/* Compiled out: the table below is only built when "notdef" is defined. */
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 */

static unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
764
765/*
766 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
767 */
768static void
769from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
770{
771	int i;
772
773	for (i = 0; i < (int)nbytes; i++) {
774		out[i] = ebcdic_to_ascii[buf[i]];
775	}
776}
777
778
779//	#pragma mark -
780
781
782/*!
783	Determines if the data in inSource is of the STXT format.
784
785	\param header the STXT stream header read in by Identify() or Translate()
786	\param inSource the stream with the STXT data
787	\param outInfo information about the type of data from inSource is stored here
788	\param outType the desired output type for the data in inSource
789	\param ptxtheader if this is not NULL, the TEXT header from
790		inSource is copied to it
791*/
792status_t
793identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
794	BPositionIO *inSource, translator_info *outInfo, uint32 outType,
795	TranslatorStyledTextTextHeader *ptxtheader = NULL)
796{
797	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
798	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
799
800	uint8 buffer[max(ktxtsize, kstylsize)];
801
802	// Check the TEXT header
803	TranslatorStyledTextTextHeader txtheader;
804	if (inSource->Read(buffer, ktxtsize) != ktxtsize)
805		return B_NO_TRANSLATOR;
806
807	memcpy(&txtheader, buffer, ktxtsize);
808	if (swap_data(B_UINT32_TYPE, &txtheader, ktxtsize,
809		B_SWAP_BENDIAN_TO_HOST) != B_OK)
810		return B_ERROR;
811
812	if (txtheader.header.magic != 'TEXT'
813		|| txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader)
814		|| txtheader.charset != B_UNICODE_UTF8)
815		return B_NO_TRANSLATOR;
816
817	// skip the text data
818	off_t seekresult, pos;
819	pos = header.header.header_size + txtheader.header.header_size
820		+ txtheader.header.data_size;
821	seekresult = inSource->Seek(txtheader.header.data_size,
822		SEEK_CUR);
823	if (seekresult < pos)
824		return B_NO_TRANSLATOR;
825	if (seekresult > pos)
826		return B_ERROR;
827
828	// check the STYL header (not all STXT files have this)
829	ssize_t read = 0;
830	TranslatorStyledTextStyleHeader stylheader;
831	read = inSource->Read(buffer, kstylsize);
832	if (read < 0)
833		return read;
834	if (read != kstylsize && read != 0)
835		return B_NO_TRANSLATOR;
836
837	// If there is a STYL header
838	if (read == kstylsize) {
839		memcpy(&stylheader, buffer, kstylsize);
840		if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize,
841			B_SWAP_BENDIAN_TO_HOST) != B_OK)
842			return B_ERROR;
843
844		if (stylheader.header.magic != 'STYL'
845			|| stylheader.header.header_size !=
846				sizeof(TranslatorStyledTextStyleHeader))
847			return B_NO_TRANSLATOR;
848	}
849
850	// if output TEXT header is supplied, fill it with data
851	if (ptxtheader) {
852		ptxtheader->header.magic = txtheader.header.magic;
853		ptxtheader->header.header_size = txtheader.header.header_size;
854		ptxtheader->header.data_size = txtheader.header.data_size;
855		ptxtheader->charset = txtheader.charset;
856	}
857
858	// return information about the data in the stream
859	outInfo->type = B_STYLED_TEXT_FORMAT;
860	outInfo->group = B_TRANSLATOR_TEXT;
861	outInfo->quality = STXT_IN_QUALITY;
862	outInfo->capability = STXT_IN_CAPABILITY;
863	strlcpy(outInfo->name, B_TRANSLATE("Be styled text file"),
864		sizeof(outInfo->name));
865	strcpy(outInfo->MIME, "text/x-vnd.Be-stxt");
866
867	return B_OK;
868}
869
870
871/*!
872	Determines if the data in \a inSource is of the UTF8 plain
873
874	\param data buffer containing data already read (must be at
875		least DATA_BUFFER_SIZE bytes large)
876	\param nread number of bytes that have already been read from the stream
877	\param header the STXT stream header read in by Identify() or Translate()
878	\param inSource the stream with the STXT data
879	\param outInfo information about the type of data from inSource is stored here
880	\param outType the desired output type for the data in inSource
881*/
882status_t
883identify_text(uint8* data, int32 bytesRead, BPositionIO* source,
884	translator_info* outInfo, uint32 outType, const char*& encoding)
885{
886	ssize_t readLater = source->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead);
887	if (readLater < B_OK)
888		return B_NO_TRANSLATOR;
889
890	bytesRead += readLater;
891
892	// TODO: identify encoding as possible!
893	BMimeType type;
894	if (!file_ascmagic((const unsigned char*)data, bytesRead, &type, encoding))
895		return B_NO_TRANSLATOR;
896
897	float capability = TEXT_IN_CAPABILITY;
898	if (bytesRead < 20)
899		capability = .1f;
900
901	// return information about the data in the stream
902	outInfo->type = B_TRANSLATOR_TEXT;
903	outInfo->group = B_TRANSLATOR_TEXT;
904	outInfo->quality = TEXT_IN_QUALITY;
905	outInfo->capability = capability;
906
907	char description[B_MIME_TYPE_LENGTH];
908	if (type.GetLongDescription(description) == B_OK)
909		strlcpy(outInfo->name, description, sizeof(outInfo->name));
910	else
911		strlcpy(outInfo->name, B_TRANSLATE("Plain text file"),
912			sizeof(outInfo->name));
913
914	//strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME));
915	strcpy(outInfo->MIME, "text/plain");
916	return B_OK;
917}
918
919
920// ---------------------------------------------------------------
921// translate_from_stxt
922//
923// Translates the data in inSource to the type outType and stores
924// the translated data in outDestination.
925//
926// Preconditions:
927//
928// Parameters:	inSource,	the data to be translated
929//
930//				outDestination,	where the translated data is
931//								put
932//
933//				outType,	the type to convert inSource to
934//
935//				txtheader, 	the TEXT header from inSource
936//
937//
938// Postconditions:
939//
940// Returns: B_BAD_VALUE, if outType is invalid
941//
942// B_NO_TRANSLATOR, if this translator doesn't understand the data
943//
944// B_ERROR, if there was an error allocating memory or converting
945//          data
946//
947// B_OK, if all went well
948// ---------------------------------------------------------------
949status_t
950translate_from_stxt(BPositionIO *inSource, BPositionIO *outDestination,
951		uint32 outType, const TranslatorStyledTextTextHeader &txtheader)
952{
953	if (inSource->Seek(0, SEEK_SET) != 0)
954		return B_ERROR;
955
956	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
957	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
958
959	bool btoplain;
960	if (outType == B_TRANSLATOR_TEXT)
961		btoplain = true;
962	else if (outType == B_STYLED_TEXT_FORMAT)
963		btoplain = false;
964	else
965		return B_BAD_VALUE;
966
967	uint8 buffer[READ_BUFFER_SIZE];
968	ssize_t nread = 0, nwritten = 0, nreed = 0, ntotalread = 0;
969
970	// skip to the actual text data when outputting a
971	// plain text file
972	if (btoplain) {
973		if (inSource->Seek(kstxtsize + ktxtsize, SEEK_CUR) !=
974			kstxtsize + ktxtsize)
975			return B_ERROR;
976	}
977
978	// Read data from inSource
979	// When outputing B_TRANSLATOR_TEXT, the loop stops when all of
980	// the text data has been read and written.
981	// When outputting B_STYLED_TEXT_FORMAT, the loop stops when all
982	// of the data from inSource has been read and written.
983	if (btoplain)
984		nreed = min((size_t)READ_BUFFER_SIZE,
985			(size_t)txtheader.header.data_size - ntotalread);
986	else
987		nreed = READ_BUFFER_SIZE;
988	nread = inSource->Read(buffer, nreed);
989	while (nread > 0) {
990		nwritten = outDestination->Write(buffer, nread);
991		if (nwritten != nread)
992			return B_ERROR;
993
994		if (btoplain) {
995			ntotalread += nread;
996			nreed = min((size_t)READ_BUFFER_SIZE,
997				(size_t)txtheader.header.data_size - ntotalread);
998		} else
999			nreed = READ_BUFFER_SIZE;
1000		nread = inSource->Read(buffer, nreed);
1001	}
1002
1003	if (btoplain && static_cast<ssize_t>(txtheader.header.data_size) !=
1004		ntotalread)
1005		// If not all of the text data was able to be read...
1006		return B_NO_TRANSLATOR;
1007	else
1008		return B_OK;
1009}
1010
1011// ---------------------------------------------------------------
1012// output_headers
1013//
1014// Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT
1015// to outDestination, setting the data_size member of the text header
1016// to text_data_size
1017//
1018// Preconditions:
1019//
1020// Parameters:	outDestination,	where the translated data is
1021//								put
1022//
1023//				text_data_size, number of bytes in data section
1024//							    of the TEXT header
1025//
1026//
1027// Postconditions:
1028//
1029// Returns:
1030//
1031// B_ERROR, if there was an error writing to outDestination or
1032// 	an error with converting the byte order
1033//
1034// B_OK, if all went well
1035// ---------------------------------------------------------------
1036status_t
1037output_headers(BPositionIO *outDestination, uint32 text_data_size)
1038{
1039	const int32 kHeadersSize = sizeof(TranslatorStyledTextStreamHeader) +
1040		sizeof(TranslatorStyledTextTextHeader);
1041	status_t result;
1042	TranslatorStyledTextStreamHeader stxtheader;
1043	TranslatorStyledTextTextHeader txtheader;
1044
1045	uint8 buffer[kHeadersSize];
1046
1047	stxtheader.header.magic = 'STXT';
1048	stxtheader.header.header_size = sizeof(TranslatorStyledTextStreamHeader);
1049	stxtheader.header.data_size = 0;
1050	stxtheader.version = 100;
1051	memcpy(buffer, &stxtheader, stxtheader.header.header_size);
1052
1053	txtheader.header.magic = 'TEXT';
1054	txtheader.header.header_size = sizeof(TranslatorStyledTextTextHeader);
1055	txtheader.header.data_size = text_data_size;
1056	txtheader.charset = B_UNICODE_UTF8;
1057	memcpy(buffer + stxtheader.header.header_size, &txtheader,
1058		txtheader.header.header_size);
1059
1060	// write out headers in Big Endian byte order
1061	result = swap_data(B_UINT32_TYPE, buffer, kHeadersSize,
1062		B_SWAP_HOST_TO_BENDIAN);
1063	if (result == B_OK) {
1064		ssize_t nwritten = 0;
1065		nwritten = outDestination->Write(buffer, kHeadersSize);
1066		if (nwritten != kHeadersSize)
1067			return B_ERROR;
1068		else
1069			return B_OK;
1070	}
1071
1072	return result;
1073}
1074
1075// ---------------------------------------------------------------
1076// output_styles
1077//
1078// Writes out the actual style information into outDestination
1079// using the data from pflatRunArray
1080//
1081// Preconditions:
1082//
1083// Parameters:	outDestination,	where the translated data is
1084//								put
1085//
1086//				text_size,		size in bytes of the text in
1087//								outDestination
1088//
1089//				data_size,		size of pflatRunArray
1090//
1091// Postconditions:
1092//
1093// Returns:
1094//
1095// B_ERROR, if there was an error writing to outDestination or
1096// 	an error with converting the byte order
1097//
1098// B_OK, if all went well
1099// ---------------------------------------------------------------
1100status_t
1101output_styles(BPositionIO *outDestination, uint32 text_size,
1102	uint8 *pflatRunArray, ssize_t data_size)
1103{
1104	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
1105
1106	uint8 buffer[kstylsize];
1107
1108	// output STYL header
1109	TranslatorStyledTextStyleHeader stylheader;
1110	stylheader.header.magic = 'STYL';
1111	stylheader.header.header_size =
1112		sizeof(TranslatorStyledTextStyleHeader);
1113	stylheader.header.data_size = data_size;
1114	stylheader.apply_offset = 0;
1115	stylheader.apply_length = text_size;
1116
1117	memcpy(buffer, &stylheader, kstylsize);
1118	if (swap_data(B_UINT32_TYPE, buffer, kstylsize,
1119		B_SWAP_HOST_TO_BENDIAN) != B_OK)
1120		return B_ERROR;
1121	if (outDestination->Write(buffer, kstylsize) != kstylsize)
1122		return B_ERROR;
1123
1124	// output actual style information
1125	if (outDestination->Write(pflatRunArray,
1126		data_size) != data_size)
1127		return B_ERROR;
1128
1129	return B_OK;
1130}
1131
1132
1133/*!
1134	Convert the plain text (UTF8) from inSource to plain or
1135	styled text in outDestination
1136*/
1137status_t
1138translate_from_text(BPositionIO* source, const char* encoding, bool forceEncoding,
1139	BPositionIO* destination, uint32 outType)
1140{
1141	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1142		return B_BAD_VALUE;
1143
1144	// find the length of the text
1145	off_t size = source->Seek(0, SEEK_END);
1146	if (size < 0)
1147		return (status_t)size;
1148	if (size > UINT32_MAX && outType == B_STYLED_TEXT_FORMAT)
1149		return B_NOT_SUPPORTED;
1150
1151	status_t status = source->Seek(0, SEEK_SET);
1152	if (status < B_OK)
1153		return status;
1154
1155	if (outType == B_STYLED_TEXT_FORMAT) {
1156		// output styled text headers
1157		status = output_headers(destination, (uint32)size);
1158		if (status != B_OK)
1159			return status;
1160	}
1161
1162	class MallocBuffer {
1163		public:
1164			MallocBuffer() : fBuffer(NULL), fSize(0) {}
1165			~MallocBuffer() { free(fBuffer); }
1166
1167			void* Buffer() { return fBuffer; }
1168			size_t Size() const { return fSize; }
1169
1170			status_t
1171			Allocate(size_t size)
1172			{
1173				fBuffer = malloc(size);
1174				if (fBuffer != NULL) {
1175					fSize = size;
1176					return B_OK;
1177				}
1178				return B_NO_MEMORY;
1179			}
1180
1181		private:
1182			void*	fBuffer;
1183			size_t	fSize;
1184	} encodingBuffer;
1185	BMallocIO encodingIO;
1186	uint32 encodingID = 0;
1187		// defaults to UTF-8 or no encoding
1188
1189	BNode* node = dynamic_cast<BNode*>(source);
1190	if (node != NULL) {
1191		// determine encoding, if available
1192		const BCharacterSet* characterSet = NULL;
1193		bool hasAttribute = false;
1194		if (encoding != NULL && !forceEncoding) {
1195			BString name;
1196			if (node->ReadAttrString("be:encoding", &name) == B_OK) {
1197				encoding = name.String();
1198				hasAttribute = true;
1199			} else {
1200				int32 value;
1201				ssize_t bytesRead = node->ReadAttr("be:encoding", B_INT32_TYPE, 0,
1202					&value, sizeof(value));
1203				if (bytesRead == (ssize_t)sizeof(value)) {
1204					hasAttribute = true;
1205					if (value != 65535)
1206						characterSet = BCharacterSetRoster::GetCharacterSetByConversionID(value);
1207				}
1208			}
1209		} else {
1210			hasAttribute = true;
1211				// we don't write the encoding in this case
1212		}
1213		if (characterSet == NULL && encoding != NULL)
1214			characterSet = BCharacterSetRoster::FindCharacterSetByName(encoding);
1215
1216		if (characterSet != NULL) {
1217			encodingID = characterSet->GetConversionID();
1218			encodingBuffer.Allocate(READ_BUFFER_SIZE * 4);
1219		}
1220
1221		if (!hasAttribute && encoding != NULL) {
1222			// add encoding attribute, so that someone opening the file can
1223			// retrieve it for persistance
1224			node->WriteAttr("be:encoding", B_STRING_TYPE, 0, encoding,
1225				strlen(encoding));
1226		}
1227	}
1228
1229	off_t outputSize = 0;
1230	ssize_t bytesRead;
1231	int32 state = 0;
1232
1233	// output the actual text part of the data
1234	do {
1235		uint8 buffer[READ_BUFFER_SIZE];
1236		bytesRead = source->Read(buffer, READ_BUFFER_SIZE);
1237		if (bytesRead < B_OK)
1238			return bytesRead;
1239		if (bytesRead == 0)
1240			break;
1241
1242		if (encodingBuffer.Size() == 0) {
1243			// default, no encoding
1244			ssize_t bytesWritten = destination->Write(buffer, bytesRead);
1245			if (bytesWritten != bytesRead) {
1246				if (bytesWritten < B_OK)
1247					return bytesWritten;
1248
1249				return B_ERROR;
1250			}
1251
1252			outputSize += bytesRead;
1253		} else {
1254			// decode text file to UTF-8
1255			char* pos = (char*)buffer;
1256			int32 encodingLength = encodingIO.BufferLength();
1257			int32 bytesLeft = bytesRead;
1258			int32 bytes;
1259			do {
1260				encodingLength = READ_BUFFER_SIZE * 4;
1261				bytes = bytesLeft;
1262
1263				status = convert_to_utf8(encodingID, pos, &bytes,
1264					(char*)encodingBuffer.Buffer(), &encodingLength, &state);
1265				if (status < B_OK)
1266					return status;
1267
1268				ssize_t bytesWritten = destination->Write(encodingBuffer.Buffer(),
1269					encodingLength);
1270				if (bytesWritten < encodingLength) {
1271					if (bytesWritten < B_OK)
1272						return bytesWritten;
1273
1274					return B_ERROR;
1275				}
1276
1277				pos += bytes;
1278				bytesLeft -= bytes;
1279				outputSize += encodingLength;
1280			} while (encodingLength > 0 && bytesLeft > 0);
1281		}
1282	} while (bytesRead > 0);
1283
1284	if (outType != B_STYLED_TEXT_FORMAT)
1285		return B_OK;
1286
1287	if (encodingBuffer.Size() != 0 && size != outputSize) {
1288		if (outputSize > UINT32_MAX)
1289			return B_NOT_SUPPORTED;
1290
1291		// we need to update the header as the decoded text size has changed
1292		status = destination->Seek(0, SEEK_SET);
1293		if (status == B_OK)
1294			status = output_headers(destination, (uint32)outputSize);
1295		if (status == B_OK)
1296			status = destination->Seek(0, SEEK_END);
1297
1298		if (status < B_OK)
1299			return status;
1300	}
1301
1302	// Read file attributes if outputting styled data
1303	// and source is a BNode object
1304
1305	if (node == NULL)
1306		return B_OK;
1307
1308	// Try to read styles - we only propagate an error if the actual on-disk
1309	// data is likely to be okay
1310
1311	const char *kAttrName = "styles";
1312	attr_info info;
1313	if (node->GetAttrInfo(kAttrName, &info) != B_OK)
1314		return B_OK;
1315
1316	if (info.type != B_RAW_TYPE || info.size < 160) {
1317		// styles seem to be broken, but since we got the text,
1318		// we don't propagate the error
1319		return B_OK;
1320	}
1321
1322	uint8* flatRunArray = new (std::nothrow) uint8[info.size];
1323	if (flatRunArray == NULL)
1324		return B_NO_MEMORY;
1325
1326	bytesRead = node->ReadAttr(kAttrName, B_RAW_TYPE, 0, flatRunArray, info.size);
1327	if (bytesRead != info.size)
1328		return B_OK;
1329
1330	output_styles(destination, size, flatRunArray, info.size);
1331
1332	delete[] flatRunArray;
1333	return B_OK;
1334}
1335
1336
1337//	#pragma mark -
1338
1339
1340STXTTranslator::STXTTranslator()
1341	: BaseTranslator(B_TRANSLATE("StyledEdit files"),
1342		B_TRANSLATE("StyledEdit file translator"),
1343		STXT_TRANSLATOR_VERSION,
1344		sInputFormats, kNumInputFormats,
1345		sOutputFormats, kNumOutputFormats,
1346		"STXTTranslator_Settings",
1347		sDefaultSettings, kNumDefaultSettings,
1348		B_TRANSLATOR_TEXT, B_STYLED_TEXT_FORMAT)
1349{
1350}
1351
1352
1353STXTTranslator::~STXTTranslator()
1354{
1355}
1356
1357
1358status_t
1359STXTTranslator::Identify(BPositionIO *inSource,
1360	const translation_format *inFormat, BMessage *ioExtension,
1361	translator_info *outInfo, uint32 outType)
1362{
1363	if (!outType)
1364		outType = B_TRANSLATOR_TEXT;
1365	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1366		return B_NO_TRANSLATOR;
1367
1368	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
1369
1370	uint8 buffer[DATA_BUFFER_SIZE];
1371	status_t nread = 0;
1372	// Read in the header to determine
1373	// if the data is supported
1374	nread = inSource->Read(buffer, kstxtsize);
1375	if (nread < 0)
1376		return nread;
1377
1378	// read in enough data to fill the stream header
1379	if (nread == kstxtsize) {
1380		TranslatorStyledTextStreamHeader header;
1381		memcpy(&header, buffer, kstxtsize);
1382		if (swap_data(B_UINT32_TYPE, &header, kstxtsize,
1383				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1384			return B_ERROR;
1385
1386		if (header.header.magic == B_STYLED_TEXT_FORMAT
1387			&& header.header.header_size == (int32)kstxtsize
1388			&& header.header.data_size == 0
1389			&& header.version == 100)
1390			return identify_stxt_header(header, inSource, outInfo, outType);
1391	}
1392
1393	// if the data is not styled text, check if it is plain text
1394	const char* encoding;
1395	return identify_text(buffer, nread, inSource, outInfo, outType, encoding);
1396}
1397
1398
1399status_t
1400STXTTranslator::Translate(BPositionIO* source, const translator_info* info,
1401	BMessage* ioExtension, uint32 outType, BPositionIO* outDestination)
1402{
1403	if (!outType)
1404		outType = B_TRANSLATOR_TEXT;
1405	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1406		return B_NO_TRANSLATOR;
1407
1408	const ssize_t headerSize = sizeof(TranslatorStyledTextStreamHeader);
1409	uint8 buffer[DATA_BUFFER_SIZE];
1410	status_t result;
1411	translator_info outInfo;
1412	// Read in the header to determine
1413	// if the data is supported
1414	ssize_t bytesRead = source->Read(buffer, headerSize);
1415	if (bytesRead < 0)
1416		return bytesRead;
1417
1418	// read in enough data to fill the stream header
1419	if (bytesRead == headerSize) {
1420		TranslatorStyledTextStreamHeader header;
1421		memcpy(&header, buffer, headerSize);
1422		if (swap_data(B_UINT32_TYPE, &header, headerSize,
1423				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1424			return B_ERROR;
1425
1426		if (header.header.magic == B_STYLED_TEXT_FORMAT
1427			&& header.header.header_size == sizeof(TranslatorStyledTextStreamHeader)
1428			&& header.header.data_size == 0
1429			&& header.version == 100) {
1430			TranslatorStyledTextTextHeader textHeader;
1431			result = identify_stxt_header(header, source, &outInfo, outType,
1432				&textHeader);
1433			if (result != B_OK)
1434				return result;
1435
1436			return translate_from_stxt(source, outDestination, outType, textHeader);
1437		}
1438	}
1439
1440	// if the data is not styled text, check if it is ASCII text
1441	bool forceEncoding = false;
1442	const char* encoding = NULL;
1443	result = identify_text(buffer, bytesRead, source, &outInfo, outType, encoding);
1444	if (result != B_OK)
1445		return result;
1446
1447	if (ioExtension != NULL) {
1448		const char* value;
1449		if (ioExtension->FindString("be:encoding", &value) == B_OK
1450			&& value[0]) {
1451			// override encoding
1452			encoding = value;
1453			forceEncoding = true;
1454		}
1455	}
1456
1457	return translate_from_text(source, encoding, forceEncoding, outDestination, outType);
1458}
1459
1460
1461BView *
1462STXTTranslator::NewConfigView(TranslatorSettings *settings)
1463{
1464	return new STXTView(BRect(0, 0, 225, 175),
1465		B_TRANSLATE("STXTTranslator Settings"),
1466		B_FOLLOW_ALL, B_WILL_DRAW, settings);
1467}
1468
1469