1/**
2 * Test the UTF-8 decoding routines
3 *
4 * author: Daniel Veillard
5 * copy: see Copyright for the status of this software.
6 */
7
8#include <stdio.h>
9#include <string.h>
10#include <libxml/parser.h>
11#include <libxml/parserInternals.h>
12
/*
 * Code of the first libxml2 error reported since the tests last cleared
 * this variable; 0 means no error was recorded.  It is only used inside
 * this file, hence the internal linkage.
 */
static int lastError;
14
15static void errorHandler(void *unused, xmlErrorPtr err) {
16    if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
17        lastError = err->code;
18    }
19}
20
/*
 * Document templates the tests patch in place.  The XXXX placeholder is
 * the 4 byte injection area: it starts at offset 5 in document1 (element
 * content) and at offset 10 in document2 (attribute value).  The buffers
 * are modified by the tests, so they cannot be const, but they are only
 * used in this file and therefore get internal linkage.
 */
static char document1[100] = "<doc>XXXX</doc>";
static char document2[100] = "<doc foo='XXXX'/>";
23
24static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
25                  int len,  char *data, int forbid1, int forbid2) {
26    int i;
27    xmlDocPtr res;
28
29    for (i = 0;i <= 0xFF;i++) {
30	lastError = 0;
31	xmlCtxtReset(ctxt);
32
33        data[0] = i;
34
35	res = xmlReadMemory(document, len, "test", NULL, 0);
36
37	if ((i == forbid1) || (i == forbid2)) {
38	    if ((lastError == 0) || (res != NULL))
39	        fprintf(stderr,
40		    "Failed to detect invalid char for Byte 0x%02X: %c\n",
41		        i, i);
42	}
43
44	else if ((i == '<') || (i == '&')) {
45	    if ((lastError == 0) || (res != NULL))
46	        fprintf(stderr,
47		    "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
48	}
49	else if (((i < 0x20) || (i >= 0x80)) &&
50	    (i != 0x9) && (i != 0xA) && (i != 0xD)) {
51	    if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL))
52	        fprintf(stderr,
53		    "Failed to detect invalid char for Byte 0x%02X\n", i);
54	}
55	else if (res == NULL) {
56	    fprintf(stderr,
57		"Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
58	}
59	if (res != NULL)
60	    xmlFreeDoc(res);
61    }
62}
63
64static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
65                  int len,  char *data) {
66    int i, j;
67    xmlDocPtr res;
68
69    for (i = 0x80;i <= 0xFF;i++) {
70    for (j = 0;j <= 0xFF;j++) {
71	lastError = 0;
72	xmlCtxtReset(ctxt);
73
74        data[0] = i;
75        data[1] = j;
76
77	res = xmlReadMemory(document, len, "test", NULL, 0);
78
79	/* if first bit of first char is set, then second bit must too */
80	if ((i & 0x80) && ((i & 0x40) == 0)) {
81	    if ((lastError == 0) || (res != NULL))
82		fprintf(stderr,
83		"Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
84			i, j);
85	}
86
87	/*
88	 * if first bit of first char is set, then second char first
89	 * bits must be 10
90	 */
91	else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
92	    if ((lastError == 0) || (res != NULL))
93		fprintf(stderr,
94	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
95			i, j);
96	}
97
98	/*
99	 * if using a 2 byte encoding then the value must be greater
100	 * than 0x80, i.e. one of bits 5 to 1 of i must be set
101	 */
102	else if ((i & 0x80) && ((i & 0x1E) == 0)) {
103	    if ((lastError == 0) || (res != NULL))
104		fprintf(stderr,
105	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
106			i, j);
107	}
108
109	/*
110	 * if third bit of first char is set, then the sequence would need
111	 * at least 3 bytes, but we give only 2 !
112	 */
113	else if ((i & 0xE0) == 0xE0) {
114	    if ((lastError == 0) || (res != NULL))
115		fprintf(stderr,
116	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
117			i, j);
118	}
119
120	/*
121	 * We should see no error in remaning cases
122	 */
123	else if ((lastError != 0) || (res == NULL)) {
124	    fprintf(stderr,
125		"Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
126	}
127	if (res != NULL)
128	    xmlFreeDoc(res);
129    }
130    }
131}
132
133/**
134 * testDocumentRanges:
135 *
136 * Test the correct UTF8 character parsing in context of XML documents
137 * Those are in-context injection tests checking the parser behaviour on
138 * edge case values at different point in content, beginning and end of
139 * CDATA in text or in attribute values.
140 */
141
142static void testDocumentRanges(void) {
143    xmlParserCtxtPtr ctxt;
144    char *data;
145
146    /*
147     * Set up a parsing context using the first document as
148     * the current input source.
149     */
150    ctxt = xmlNewParserCtxt();
151    if (ctxt == NULL) {
152        fprintf(stderr, "Failed to allocate parser context\n");
153	return;
154    }
155
156    printf("testing 1 byte char in document: 1");
157    fflush(stdout);
158    data = &document1[5];
159    data[0] = ' ';
160    data[1] = ' ';
161    data[2] = ' ';
162    data[3] = ' ';
163    /* test 1 byte injection at beginning of area */
164    testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
165                           data, -1, -1);
166    printf(" 2");
167    fflush(stdout);
168    data[0] = ' ';
169    data[1] = ' ';
170    data[2] = ' ';
171    data[3] = ' ';
172    /* test 1 byte injection at end of area */
173    testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
174                           data + 3, -1, -1);
175
176    printf(" 3");
177    fflush(stdout);
178    data = &document2[10];
179    data[0] = ' ';
180    data[1] = ' ';
181    data[2] = ' ';
182    data[3] = ' ';
183    /* test 1 byte injection at beginning of area */
184    testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
185                           data, '\'', -1);
186    printf(" 4");
187    fflush(stdout);
188    data[0] = ' ';
189    data[1] = ' ';
190    data[2] = ' ';
191    data[3] = ' ';
192    /* test 1 byte injection at end of area */
193    testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
194                           data + 3, '\'', -1);
195    printf(" done\n");
196
197    printf("testing 2 byte char in document: 1");
198    fflush(stdout);
199    data = &document1[5];
200    data[0] = ' ';
201    data[1] = ' ';
202    data[2] = ' ';
203    data[3] = ' ';
204    /* test 2 byte injection at beginning of area */
205    testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
206                           data);
207    printf(" 2");
208    fflush(stdout);
209    data[0] = ' ';
210    data[1] = ' ';
211    data[2] = ' ';
212    data[3] = ' ';
213    /* test 2 byte injection at end of area */
214    testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
215                           data + 2);
216
217    printf(" 3");
218    fflush(stdout);
219    data = &document2[10];
220    data[0] = ' ';
221    data[1] = ' ';
222    data[2] = ' ';
223    data[3] = ' ';
224    /* test 2 byte injection at beginning of area */
225    testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
226                           data);
227    printf(" 4");
228    fflush(stdout);
229    data[0] = ' ';
230    data[1] = ' ';
231    data[2] = ' ';
232    data[3] = ' ';
233    /* test 2 byte injection at end of area */
234    testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
235                           data + 2);
236    printf(" done\n");
237
238    xmlFreeParserCtxt(ctxt);
239}
240
241static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
242    int i = 0;
243    int len, c;
244
245    data[1] = 0;
246    data[2] = 0;
247    data[3] = 0;
248    for (i = 0;i <= 0xFF;i++) {
249        data[0] = i;
250	ctxt->charset = XML_CHAR_ENCODING_UTF8;
251
252	lastError = 0;
253        c = xmlCurrentChar(ctxt, &len);
254	if ((i == 0) || (i >= 0x80)) {
255	    /* we must see an error there */
256	    if (lastError != XML_ERR_INVALID_CHAR)
257	        fprintf(stderr,
258		    "Failed to detect invalid char for Byte 0x%02X\n", i);
259	} else if (i == 0xD) {
260	    if ((c != 0xA) || (len != 1))
261		fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i);
262	} else if ((c != i) || (len != 1)) {
263	    fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i);
264	}
265    }
266}
267
268static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
269    int i, j;
270    int len, c;
271
272    data[2] = 0;
273    data[3] = 0;
274    for (i = 0x80;i <= 0xFF;i++) {
275	for (j = 0;j <= 0xFF;j++) {
276	    data[0] = i;
277	    data[1] = j;
278	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
279
280	    lastError = 0;
281	    c = xmlCurrentChar(ctxt, &len);
282
283	    /* if first bit of first char is set, then second bit must too */
284	    if ((i & 0x80) && ((i & 0x40) == 0)) {
285		if (lastError != XML_ERR_INVALID_CHAR)
286		    fprintf(stderr,
287		    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
288		            i, j);
289	    }
290
291	    /*
292	     * if first bit of first char is set, then second char first
293	     * bits must be 10
294	     */
295	    else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
296		if (lastError != XML_ERR_INVALID_CHAR)
297		    fprintf(stderr,
298		"Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
299		            i, j, c);
300	    }
301
302	    /*
303	     * if using a 2 byte encoding then the value must be greater
304	     * than 0x80, i.e. one of bits 5 to 1 of i must be set
305	     */
306	    else if ((i & 0x80) && ((i & 0x1E) == 0)) {
307		if (lastError != XML_ERR_INVALID_CHAR)
308		    fprintf(stderr,
309		"Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
310		            i, j, c);
311	    }
312
313	    /*
314	     * if third bit of first char is set, then the sequence would need
315	     * at least 3 bytes, but we give only 2 !
316	     */
317	    else if ((i & 0xE0) == 0xE0) {
318		if (lastError != XML_ERR_INVALID_CHAR)
319		    fprintf(stderr,
320		"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
321		            i, j);
322	    }
323
324            /*
325	     * We should see no error in remaning cases
326	     */
327	    else if ((lastError != 0) || (len != 2)) {
328		fprintf(stderr,
329		    "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j);
330	    }
331
332            /*
333	     * Finally check the value is right
334	     */
335	    else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) {
336		fprintf(stderr,
337	"Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
338	                i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c);
339	    }
340        }
341    }
342}
343
344static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
345    int i, j, k, K;
346    int len, c;
347    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
348    int value;
349
350    data[3] = 0;
351    for (i = 0xE0;i <= 0xFF;i++) {
352    for (j = 0;j <= 0xFF;j++) {
353    for (k = 0;k < 6;k++) {
354	data[0] = i;
355	data[1] = j;
356	K = lows[k];
357	data[2] = (char) K;
358	value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
359	ctxt->charset = XML_CHAR_ENCODING_UTF8;
360
361	lastError = 0;
362	c = xmlCurrentChar(ctxt, &len);
363
364	/*
365	 * if fourth bit of first char is set, then the sequence would need
366	 * at least 4 bytes, but we give only 3 !
367	 */
368	if ((i & 0xF0) == 0xF0) {
369	    if (lastError != XML_ERR_INVALID_CHAR)
370		fprintf(stderr,
371	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
372			i, j, K, data[3]);
373	}
374
375        /*
376	 * The second and the third bytes must start with 10
377	 */
378	else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
379	    if (lastError != XML_ERR_INVALID_CHAR)
380		fprintf(stderr,
381	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
382			i, j, K);
383	}
384
385	/*
386	 * if using a 3 byte encoding then the value must be greater
387	 * than 0x800, i.e. one of bits 4 to 0 of i must be set or
388	 * the 6th byte of data[1] must be set
389	 */
390	else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
391	    if (lastError != XML_ERR_INVALID_CHAR)
392		fprintf(stderr,
393	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
394			i, j, K);
395	}
396
397        /*
398	 * There are values in that range that are not allowed in XML-1.0
399	 */
400	else if (((value > 0xD7FF) && (value <0xE000)) ||
401	         ((value > 0xFFFD) && (value <0x10000))) {
402	    if (lastError != XML_ERR_INVALID_CHAR)
403		fprintf(stderr,
404	"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
405			value, i, j, K);
406	}
407
408	/*
409	 * We should see no error in remaining cases
410	 */
411	else if ((lastError != 0) || (len != 3)) {
412	    fprintf(stderr,
413		"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
414		    i, j, K);
415	}
416
417	/*
418	 * Finally check the value is right
419	 */
420	else if (c != value) {
421	    fprintf(stderr,
422    "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
423		i, j, data[2], value, c);
424	}
425    }
426    }
427    }
428}
429
430static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
431    int i, j, k, K, l, L;
432    int len, c;
433    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
434    int value;
435
436    data[4] = 0;
437    for (i = 0xF0;i <= 0xFF;i++) {
438    for (j = 0;j <= 0xFF;j++) {
439    for (k = 0;k < 6;k++) {
440    for (l = 0;l < 6;l++) {
441	data[0] = i;
442	data[1] = j;
443	K = lows[k];
444	data[2] = (char) K;
445	L = lows[l];
446	data[3] = (char) L;
447	value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
448	        ((i & 0x7) << 18);
449	ctxt->charset = XML_CHAR_ENCODING_UTF8;
450
451	lastError = 0;
452	c = xmlCurrentChar(ctxt, &len);
453
454	/*
455	 * if fifth bit of first char is set, then the sequence would need
456	 * at least 5 bytes, but we give only 4 !
457	 */
458	if ((i & 0xF8) == 0xF8) {
459	    if (lastError != XML_ERR_INVALID_CHAR)
460		fprintf(stderr,
461  "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
462			i, j, K, data[3]);
463	}
464
465        /*
466	 * The second, third and fourth bytes must start with 10
467	 */
468	else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
469	         ((L & 0xC0) != 0x80)) {
470	    if (lastError != XML_ERR_INVALID_CHAR)
471		fprintf(stderr,
472	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
473			i, j, K, L);
474	}
475
476	/*
477	 * if using a 3 byte encoding then the value must be greater
478	 * than 0x10000, i.e. one of bits 3 to 0 of i must be set or
479	 * the 6 or 5th byte of j must be set
480	 */
481	else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
482	    if (lastError != XML_ERR_INVALID_CHAR)
483		fprintf(stderr,
484	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
485			i, j, K, L);
486	}
487
488        /*
489	 * There are values in that range that are not allowed in XML-1.0
490	 */
491	else if (((value > 0xD7FF) && (value <0xE000)) ||
492	         ((value > 0xFFFD) && (value <0x10000)) ||
493		 (value > 0x10FFFF)) {
494	    if (lastError != XML_ERR_INVALID_CHAR)
495		fprintf(stderr,
496"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
497			value, i, j, K, L);
498	}
499
500	/*
501	 * We should see no error in remaining cases
502	 */
503	else if ((lastError != 0) || (len != 4)) {
504	    fprintf(stderr,
505		"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
506		    i, j, K);
507	}
508
509	/*
510	 * Finally check the value is right
511	 */
512	else if (c != value) {
513	    fprintf(stderr,
514    "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
515		i, j, data[2], value, c);
516	}
517    }
518    }
519    }
520    }
521}
522
523/**
524 * testCharRanges:
525 *
526 * Test the correct UTF8 character parsing in isolation i.e.
527 * not when parsing a full document, this is less expensive and we can
528 * cover the full range of UTF-8 chars accepted by XML-1.0
529 */
530
531static void testCharRanges(void) {
532    char data[5];
533    xmlParserCtxtPtr ctxt;
534    xmlParserInputBufferPtr buf;
535    xmlParserInputPtr input;
536
537    memset(data, 0, 5);
538
539    /*
540     * Set up a parsing context using the above data buffer as
541     * the current input source.
542     */
543    ctxt = xmlNewParserCtxt();
544    if (ctxt == NULL) {
545        fprintf(stderr, "Failed to allocate parser context\n");
546	return;
547    }
548    buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
549                                           XML_CHAR_ENCODING_NONE);
550    if (buf == NULL) {
551        fprintf(stderr, "Failed to allocate input buffer\n");
552	goto error;
553    }
554    input = xmlNewInputStream(ctxt);
555    if (input == NULL) {
556        xmlFreeParserInputBuffer(buf);
557	goto error;
558    }
559    input->filename = NULL;
560    input->buf = buf;
561    input->base = input->buf->buffer->content;
562    input->cur = input->buf->buffer->content;
563    input->end = &input->buf->buffer->content[4];
564    inputPush(ctxt, input);
565
566    printf("testing char range: 1");
567    fflush(stdout);
568    testCharRangeByte1(ctxt, data);
569    printf(" 2");
570    fflush(stdout);
571    testCharRangeByte2(ctxt, data);
572    printf(" 3");
573    fflush(stdout);
574    testCharRangeByte3(ctxt, data);
575    printf(" 4");
576    fflush(stdout);
577    testCharRangeByte4(ctxt, data);
578    printf(" done\n");
579    fflush(stdout);
580
581error:
582    xmlFreeParserCtxt(ctxt);
583}
584
585int main(void) {
586
587    /*
588     * this initialize the library and check potential ABI mismatches
589     * between the version it was compiled for and the actual shared
590     * library used.
591     */
592    LIBXML_TEST_VERSION
593
594    /*
595     * Catch errors separately
596     */
597
598    xmlSetStructuredErrorFunc(NULL, errorHandler);
599
600    /*
601     * Run the tests
602     */
603    testCharRanges();
604    testDocumentRanges();
605
606    /*
607     * Cleanup function for the XML library.
608     */
609    xmlCleanupParser();
610    /*
611     * this is to debug memory for regression tests
612     */
613    xmlMemoryDump();
614    return(0);
615}
616