1/** 2 * Test the UTF-8 decoding routines 3 * 4 * author: Daniel Veillard 5 * copy: see Copyright for the status of this software. 6 */ 7 8#include <stdio.h> 9#include <string.h> 10#include <libxml/parser.h> 11#include <libxml/parserInternals.h> 12 13int lastError; 14 15static void errorHandler(void *unused, xmlErrorPtr err) { 16 if ((unused == NULL) && (err != NULL) && (lastError == 0)) { 17 lastError = err->code; 18 } 19} 20 21char document1[100] = "<doc>XXXX</doc>"; 22char document2[100] = "<doc foo='XXXX'/>"; 23 24static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document, 25 int len, char *data, int forbid1, int forbid2) { 26 int i; 27 xmlDocPtr res; 28 29 for (i = 0;i <= 0xFF;i++) { 30 lastError = 0; 31 xmlCtxtReset(ctxt); 32 33 data[0] = i; 34 35 res = xmlReadMemory(document, len, "test", NULL, 0); 36 37 if ((i == forbid1) || (i == forbid2)) { 38 if ((lastError == 0) || (res != NULL)) 39 fprintf(stderr, 40 "Failed to detect invalid char for Byte 0x%02X: %c\n", 41 i, i); 42 } 43 44 else if ((i == '<') || (i == '&')) { 45 if ((lastError == 0) || (res != NULL)) 46 fprintf(stderr, 47 "Failed to detect illegal char %c for Byte 0x%02X\n", i, i); 48 } 49 else if (((i < 0x20) || (i >= 0x80)) && 50 (i != 0x9) && (i != 0xA) && (i != 0xD)) { 51 if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL)) 52 fprintf(stderr, 53 "Failed to detect invalid char for Byte 0x%02X\n", i); 54 } 55 else if (res == NULL) { 56 fprintf(stderr, 57 "Failed to parse valid char for Byte 0x%02X : %c\n", i, i); 58 } 59 if (res != NULL) 60 xmlFreeDoc(res); 61 } 62} 63 64static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document, 65 int len, char *data) { 66 int i, j; 67 xmlDocPtr res; 68 69 for (i = 0x80;i <= 0xFF;i++) { 70 for (j = 0;j <= 0xFF;j++) { 71 lastError = 0; 72 xmlCtxtReset(ctxt); 73 74 data[0] = i; 75 data[1] = j; 76 77 res = xmlReadMemory(document, len, "test", NULL, 0); 78 79 /* if first bit of first char is set, then second bit must too */ 80 if ((i & 0x80) && ((i & 0x40) == 0)) { 81 if ((lastError == 0) || (res != NULL)) 82 fprintf(stderr, 83 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", 84 i, j); 85 } 86 87 /* 88 * if first bit of first char is set, then second char first 89 * bits must be 10 90 */ 91 else if ((i & 0x80) && ((j & 0xC0) != 0x80)) { 92 if ((lastError == 0) || (res != NULL)) 93 fprintf(stderr, 94 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", 95 i, j); 96 } 97 98 /* 99 * if using a 2 byte encoding then the value must be greater 100 * than 0x80, i.e. one of bits 5 to 1 of i must be set 101 */ 102 else if ((i & 0x80) && ((i & 0x1E) == 0)) { 103 if ((lastError == 0) || (res != NULL)) 104 fprintf(stderr, 105 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", 106 i, j); 107 } 108 109 /* 110 * if third bit of first char is set, then the sequence would need 111 * at least 3 bytes, but we give only 2 ! 112 */ 113 else if ((i & 0xE0) == 0xE0) { 114 if ((lastError == 0) || (res != NULL)) 115 fprintf(stderr, 116 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n", 117 i, j); 118 } 119 120 /* 121 * We should see no error in remaning cases 122 */ 123 else if ((lastError != 0) || (res == NULL)) { 124 fprintf(stderr, 125 "Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j); 126 } 127 if (res != NULL) 128 xmlFreeDoc(res); 129 } 130 } 131} 132 133/** 134 * testDocumentRanges: 135 * 136 * Test the correct UTF8 character parsing in context of XML documents 137 * Those are in-context injection tests checking the parser behaviour on 138 * edge case values at different point in content, beginning and end of 139 * CDATA in text or in attribute values. 140 */ 141 142static void testDocumentRanges(void) { 143 xmlParserCtxtPtr ctxt; 144 char *data; 145 146 /* 147 * Set up a parsing context using the first document as 148 * the current input source. 149 */ 150 ctxt = xmlNewParserCtxt(); 151 if (ctxt == NULL) { 152 fprintf(stderr, "Failed to allocate parser context\n"); 153 return; 154 } 155 156 printf("testing 1 byte char in document: 1"); 157 fflush(stdout); 158 data = &document1[5]; 159 data[0] = ' '; 160 data[1] = ' '; 161 data[2] = ' '; 162 data[3] = ' '; 163 /* test 1 byte injection at beginning of area */ 164 testDocumentRangeByte1(ctxt, &document1[0], strlen(document1), 165 data, -1, -1); 166 printf(" 2"); 167 fflush(stdout); 168 data[0] = ' '; 169 data[1] = ' '; 170 data[2] = ' '; 171 data[3] = ' '; 172 /* test 1 byte injection at end of area */ 173 testDocumentRangeByte1(ctxt, &document1[0], strlen(document1), 174 data + 3, -1, -1); 175 176 printf(" 3"); 177 fflush(stdout); 178 data = &document2[10]; 179 data[0] = ' '; 180 data[1] = ' '; 181 data[2] = ' '; 182 data[3] = ' '; 183 /* test 1 byte injection at beginning of area */ 184 testDocumentRangeByte1(ctxt, &document2[0], strlen(document2), 185 data, '\'', -1); 186 printf(" 4"); 187 fflush(stdout); 188 data[0] = ' '; 189 data[1] = ' '; 190 data[2] = ' '; 191 data[3] = ' '; 192 /* test 1 byte injection at end of area */ 193 testDocumentRangeByte1(ctxt, &document2[0], strlen(document2), 194 data + 3, '\'', -1); 195 printf(" done\n"); 196 197 printf("testing 2 byte char in document: 1"); 198 fflush(stdout); 199 data = &document1[5]; 200 data[0] = ' '; 201 data[1] = ' '; 202 data[2] = ' '; 203 data[3] = ' '; 204 /* test 2 byte injection at beginning of area */ 205 testDocumentRangeByte2(ctxt, &document1[0], strlen(document1), 206 data); 207 printf(" 2"); 208 fflush(stdout); 209 data[0] = ' '; 210 data[1] = ' '; 211 data[2] = ' '; 212 data[3] = ' '; 213 /* test 2 byte injection at end of area */ 214 testDocumentRangeByte2(ctxt, &document1[0], strlen(document1), 215 data + 2); 216 217 printf(" 3"); 218 fflush(stdout); 219 data = &document2[10]; 220 data[0] = ' '; 221 data[1] = ' '; 222 data[2] = ' '; 223 data[3] = ' '; 224 /* test 2 byte injection at beginning of area */ 225 testDocumentRangeByte2(ctxt, &document2[0], strlen(document2), 226 data); 227 printf(" 4"); 228 fflush(stdout); 229 data[0] = ' '; 230 data[1] = ' '; 231 data[2] = ' '; 232 data[3] = ' '; 233 /* test 2 byte injection at end of area */ 234 testDocumentRangeByte2(ctxt, &document2[0], strlen(document2), 235 data + 2); 236 printf(" done\n"); 237 238 xmlFreeParserCtxt(ctxt); 239} 240 241static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) { 242 int i = 0; 243 int len, c; 244 245 data[1] = 0; 246 data[2] = 0; 247 data[3] = 0; 248 for (i = 0;i <= 0xFF;i++) { 249 data[0] = i; 250 ctxt->charset = XML_CHAR_ENCODING_UTF8; 251 252 lastError = 0; 253 c = xmlCurrentChar(ctxt, &len); 254 if ((i == 0) || (i >= 0x80)) { 255 /* we must see an error there */ 256 if (lastError != XML_ERR_INVALID_CHAR) 257 fprintf(stderr, 258 "Failed to detect invalid char for Byte 0x%02X\n", i); 259 } else if (i == 0xD) { 260 if ((c != 0xA) || (len != 1)) 261 fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i); 262 } else if ((c != i) || (len != 1)) { 263 fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i); 264 } 265 } 266} 267 268static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) { 269 int i, j; 270 int len, c; 271 272 data[2] = 0; 273 data[3] = 0; 274 for (i = 0x80;i <= 0xFF;i++) { 275 for (j = 0;j <= 0xFF;j++) { 276 data[0] = i; 277 data[1] = j; 278 ctxt->charset = XML_CHAR_ENCODING_UTF8; 279 280 lastError = 0; 281 c = xmlCurrentChar(ctxt, &len); 282 283 /* if first bit of first char is set, then second bit must too */ 284 if ((i & 0x80) && ((i & 0x40) == 0)) { 285 if (lastError != XML_ERR_INVALID_CHAR) 286 fprintf(stderr, 287 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", 288 i, j); 289 } 290 291 /* 292 * if first bit of first char is set, then second char first 293 * bits must be 10 294 */ 295 else if ((i & 0x80) && ((j & 0xC0) != 0x80)) { 296 if (lastError != XML_ERR_INVALID_CHAR) 297 fprintf(stderr, 298 "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n", 299 i, j, c); 300 } 301 302 /* 303 * if using a 2 byte encoding then the value must be greater 304 * than 0x80, i.e. one of bits 5 to 1 of i must be set 305 */ 306 else if ((i & 0x80) && ((i & 0x1E) == 0)) { 307 if (lastError != XML_ERR_INVALID_CHAR) 308 fprintf(stderr, 309 "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n", 310 i, j, c); 311 } 312 313 /* 314 * if third bit of first char is set, then the sequence would need 315 * at least 3 bytes, but we give only 2 ! 316 */ 317 else if ((i & 0xE0) == 0xE0) { 318 if (lastError != XML_ERR_INVALID_CHAR) 319 fprintf(stderr, 320 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n", 321 i, j); 322 } 323 324 /* 325 * We should see no error in remaning cases 326 */ 327 else if ((lastError != 0) || (len != 2)) { 328 fprintf(stderr, 329 "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j); 330 } 331 332 /* 333 * Finally check the value is right 334 */ 335 else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) { 336 fprintf(stderr, 337 "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n", 338 i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c); 339 } 340 } 341 } 342} 343 344static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) { 345 int i, j, k, K; 346 int len, c; 347 unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF}; 348 int value; 349 350 data[3] = 0; 351 for (i = 0xE0;i <= 0xFF;i++) { 352 for (j = 0;j <= 0xFF;j++) { 353 for (k = 0;k < 6;k++) { 354 data[0] = i; 355 data[1] = j; 356 K = lows[k]; 357 data[2] = (char) K; 358 value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12); 359 ctxt->charset = XML_CHAR_ENCODING_UTF8; 360 361 lastError = 0; 362 c = xmlCurrentChar(ctxt, &len); 363 364 /* 365 * if fourth bit of first char is set, then the sequence would need 366 * at least 4 bytes, but we give only 3 ! 367 */ 368 if ((i & 0xF0) == 0xF0) { 369 if (lastError != XML_ERR_INVALID_CHAR) 370 fprintf(stderr, 371 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", 372 i, j, K, data[3]); 373 } 374 375 /* 376 * The second and the third bytes must start with 10 377 */ 378 else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) { 379 if (lastError != XML_ERR_INVALID_CHAR) 380 fprintf(stderr, 381 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n", 382 i, j, K); 383 } 384 385 /* 386 * if using a 3 byte encoding then the value must be greater 387 * than 0x800, i.e. one of bits 4 to 0 of i must be set or 388 * the 6th byte of data[1] must be set 389 */ 390 else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) { 391 if (lastError != XML_ERR_INVALID_CHAR) 392 fprintf(stderr, 393 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n", 394 i, j, K); 395 } 396 397 /* 398 * There are values in that range that are not allowed in XML-1.0 399 */ 400 else if (((value > 0xD7FF) && (value <0xE000)) || 401 ((value > 0xFFFD) && (value <0x10000))) { 402 if (lastError != XML_ERR_INVALID_CHAR) 403 fprintf(stderr, 404 "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n", 405 value, i, j, K); 406 } 407 408 /* 409 * We should see no error in remaining cases 410 */ 411 else if ((lastError != 0) || (len != 3)) { 412 fprintf(stderr, 413 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n", 414 i, j, K); 415 } 416 417 /* 418 * Finally check the value is right 419 */ 420 else if (c != value) { 421 fprintf(stderr, 422 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n", 423 i, j, data[2], value, c); 424 } 425 } 426 } 427 } 428} 429 430static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) { 431 int i, j, k, K, l, L; 432 int len, c; 433 unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF}; 434 int value; 435 436 data[4] = 0; 437 for (i = 0xF0;i <= 0xFF;i++) { 438 for (j = 0;j <= 0xFF;j++) { 439 for (k = 0;k < 6;k++) { 440 for (l = 0;l < 6;l++) { 441 data[0] = i; 442 data[1] = j; 443 K = lows[k]; 444 data[2] = (char) K; 445 L = lows[l]; 446 data[3] = (char) L; 447 value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) + 448 ((i & 0x7) << 18); 449 ctxt->charset = XML_CHAR_ENCODING_UTF8; 450 451 lastError = 0; 452 c = xmlCurrentChar(ctxt, &len); 453 454 /* 455 * if fifth bit of first char is set, then the sequence would need 456 * at least 5 bytes, but we give only 4 ! 457 */ 458 if ((i & 0xF8) == 0xF8) { 459 if (lastError != XML_ERR_INVALID_CHAR) 460 fprintf(stderr, 461 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", 462 i, j, K, data[3]); 463 } 464 465 /* 466 * The second, third and fourth bytes must start with 10 467 */ 468 else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) || 469 ((L & 0xC0) != 0x80)) { 470 if (lastError != XML_ERR_INVALID_CHAR) 471 fprintf(stderr, 472 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", 473 i, j, K, L); 474 } 475 476 /* 477 * if using a 3 byte encoding then the value must be greater 478 * than 0x10000, i.e. one of bits 3 to 0 of i must be set or 479 * the 6 or 5th byte of j must be set 480 */ 481 else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) { 482 if (lastError != XML_ERR_INVALID_CHAR) 483 fprintf(stderr, 484 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", 485 i, j, K, L); 486 } 487 488 /* 489 * There are values in that range that are not allowed in XML-1.0 490 */ 491 else if (((value > 0xD7FF) && (value <0xE000)) || 492 ((value > 0xFFFD) && (value <0x10000)) || 493 (value > 0x10FFFF)) { 494 if (lastError != XML_ERR_INVALID_CHAR) 495 fprintf(stderr, 496"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", 497 value, i, j, K, L); 498 } 499 500 /* 501 * We should see no error in remaining cases 502 */ 503 else if ((lastError != 0) || (len != 4)) { 504 fprintf(stderr, 505 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n", 506 i, j, K); 507 } 508 509 /* 510 * Finally check the value is right 511 */ 512 else if (c != value) { 513 fprintf(stderr, 514 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n", 515 i, j, data[2], value, c); 516 } 517 } 518 } 519 } 520 } 521} 522 523/** 524 * testCharRanges: 525 * 526 * Test the correct UTF8 character parsing in isolation i.e. 527 * not when parsing a full document, this is less expensive and we can 528 * cover the full range of UTF-8 chars accepted by XML-1.0 529 */ 530 531static void testCharRanges(void) { 532 char data[5]; 533 xmlParserCtxtPtr ctxt; 534 xmlParserInputBufferPtr buf; 535 xmlParserInputPtr input; 536 537 memset(data, 0, 5); 538 539 /* 540 * Set up a parsing context using the above data buffer as 541 * the current input source. 542 */ 543 ctxt = xmlNewParserCtxt(); 544 if (ctxt == NULL) { 545 fprintf(stderr, "Failed to allocate parser context\n"); 546 return; 547 } 548 buf = xmlParserInputBufferCreateStatic(data, sizeof(data), 549 XML_CHAR_ENCODING_NONE); 550 if (buf == NULL) { 551 fprintf(stderr, "Failed to allocate input buffer\n"); 552 goto error; 553 } 554 input = xmlNewInputStream(ctxt); 555 if (input == NULL) { 556 xmlFreeParserInputBuffer(buf); 557 goto error; 558 } 559 input->filename = NULL; 560 input->buf = buf; 561 input->base = input->buf->buffer->content; 562 input->cur = input->buf->buffer->content; 563 input->end = &input->buf->buffer->content[4]; 564 inputPush(ctxt, input); 565 566 printf("testing char range: 1"); 567 fflush(stdout); 568 testCharRangeByte1(ctxt, data); 569 printf(" 2"); 570 fflush(stdout); 571 testCharRangeByte2(ctxt, data); 572 printf(" 3"); 573 fflush(stdout); 574 testCharRangeByte3(ctxt, data); 575 printf(" 4"); 576 fflush(stdout); 577 testCharRangeByte4(ctxt, data); 578 printf(" done\n"); 579 fflush(stdout); 580 581error: 582 xmlFreeParserCtxt(ctxt); 583} 584 585int main(void) { 586 587 /* 588 * this initialize the library and check potential ABI mismatches 589 * between the version it was compiled for and the actual shared 590 * library used. 591 */ 592 LIBXML_TEST_VERSION 593 594 /* 595 * Catch errors separately 596 */ 597 598 xmlSetStructuredErrorFunc(NULL, errorHandler); 599 600 /* 601 * Run the tests 602 */ 603 testCharRanges(); 604 testDocumentRanges(); 605 606 /* 607 * Cleanup function for the XML library. 608 */ 609 xmlCleanupParser(); 610 /* 611 * this is to debug memory for regression tests 612 */ 613 xmlMemoryDump(); 614 return(0); 615} 616