1/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 2 See the file COPYING for copying permission. 3*/ 4 5#include <stddef.h> 6 7#ifdef COMPILED_FROM_DSP 8#include "winconfig.h" 9#elif defined(MACOS_CLASSIC) 10#include "macconfig.h" 11#else 12#ifdef HAVE_EXPAT_CONFIG_H 13#include <expat_config.h> 14#endif 15#endif /* ndef COMPILED_FROM_DSP */ 16 17#include "expat_external.h" 18#include "internal.h" 19#include "xmltok.h" 20#include "nametab.h" 21 22#ifdef XML_DTD 23#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 24#else 25#define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 26#endif 27 28#define VTABLE1 \ 29 { PREFIX(prologTok), PREFIX(contentTok), \ 30 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \ 31 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ 32 PREFIX(sameName), \ 33 PREFIX(nameMatchesAscii), \ 34 PREFIX(nameLength), \ 35 PREFIX(skipS), \ 36 PREFIX(getAtts), \ 37 PREFIX(charRefNumber), \ 38 PREFIX(predefinedEntityName), \ 39 PREFIX(updatePosition), \ 40 PREFIX(isPublicId) 41 42#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 43 44#define UCS2_GET_NAMING(pages, hi, lo) \ 45 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) 46 47/* A 2 byte UTF-8 representation splits the characters 11 bits between 48 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into 49 pages, 3 bits to add to that index and 5 bits to generate the mask. 50*/ 51#define UTF8_GET_NAMING2(pages, byte) \ 52 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 53 + ((((byte)[0]) & 3) << 1) \ 54 + ((((byte)[1]) >> 5) & 1)] \ 55 & (1 << (((byte)[1]) & 0x1F))) 56 57/* A 3 byte UTF-8 representation splits the characters 16 bits between 58 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index 59 into pages, 3 bits to add to that index and 5 bits to generate the 60 mask. 61*/ 62#define UTF8_GET_NAMING3(pages, byte) \ 63 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ 64 + ((((byte)[1]) >> 2) & 0xF)] \ 65 << 3) \ 66 + ((((byte)[1]) & 3) << 1) \ 67 + ((((byte)[2]) >> 5) & 1)] \ 68 & (1 << (((byte)[2]) & 0x1F))) 69 70#define UTF8_GET_NAMING(pages, p, n) \ 71 ((n) == 2 \ 72 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ 73 : ((n) == 3 \ 74 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ 75 : 0)) 76 77/* Detection of invalid UTF-8 sequences is based on Table 3.1B 78 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ 79 with the additional restriction of not allowing the Unicode 80 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). 81 Implementation details: 82 (A & 0x80) == 0 means A < 0x80 83 and 84 (A & 0xC0) == 0xC0 means A > 0xBF 85*/ 86 87#define UTF8_INVALID2(p) \ 88 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) 89 90#define UTF8_INVALID3(p) \ 91 (((p)[2] & 0x80) == 0 \ 92 || \ 93 ((*p) == 0xEF && (p)[1] == 0xBF \ 94 ? \ 95 (p)[2] > 0xBD \ 96 : \ 97 ((p)[2] & 0xC0) == 0xC0) \ 98 || \ 99 ((*p) == 0xE0 \ 100 ? \ 101 (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ 102 : \ 103 ((p)[1] & 0x80) == 0 \ 104 || \ 105 ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) 106 107#define UTF8_INVALID4(p) \ 108 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ 109 || \ 110 ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ 111 || \ 112 ((*p) == 0xF0 \ 113 ? \ 114 (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ 115 : \ 116 ((p)[1] & 0x80) == 0 \ 117 || \ 118 ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 119 120static int PTRFASTCALL 121isNever(const ENCODING *enc, const char *p) 122{ 123 return 0; 124} 125 126static int PTRFASTCALL 127utf8_isName2(const ENCODING *enc, const char *p) 128{ 129 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); 130} 131 132static int PTRFASTCALL 133utf8_isName3(const ENCODING *enc, const char *p) 134{ 135 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); 136} 137 138#define utf8_isName4 isNever 139 140static int PTRFASTCALL 141utf8_isNmstrt2(const ENCODING *enc, const char *p) 142{ 143 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); 144} 145 146static int PTRFASTCALL 147utf8_isNmstrt3(const ENCODING *enc, const char *p) 148{ 149 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); 150} 151 152#define utf8_isNmstrt4 isNever 153 154static int PTRFASTCALL 155utf8_isInvalid2(const ENCODING *enc, const char *p) 156{ 157 return UTF8_INVALID2((const unsigned char *)p); 158} 159 160static int PTRFASTCALL 161utf8_isInvalid3(const ENCODING *enc, const char *p) 162{ 163 return UTF8_INVALID3((const unsigned char *)p); 164} 165 166static int PTRFASTCALL 167utf8_isInvalid4(const ENCODING *enc, const char *p) 168{ 169 return UTF8_INVALID4((const unsigned char *)p); 170} 171 172struct normal_encoding { 173 ENCODING enc; 174 unsigned char type[256]; 175#ifdef XML_MIN_SIZE 176 int (PTRFASTCALL *byteType)(const ENCODING *, const char *); 177 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *); 178 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); 179 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); 180 int (PTRCALL *charMatches)(const ENCODING *, const char *, int); 181#endif /* XML_MIN_SIZE */ 182 int (PTRFASTCALL *isName2)(const ENCODING *, const char *); 183 int (PTRFASTCALL *isName3)(const ENCODING *, const char *); 184 int (PTRFASTCALL *isName4)(const ENCODING *, const char *); 185 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); 186 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); 187 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); 188 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); 189 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); 190 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); 191}; 192 193#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc)) 194 195#ifdef XML_MIN_SIZE 196 197#define STANDARD_VTABLE(E) \ 198 E ## byteType, \ 199 E ## isNameMin, \ 200 E ## isNmstrtMin, \ 201 E ## byteToAscii, \ 202 E ## charMatches, 203 204#else 205 206#define STANDARD_VTABLE(E) /* as nothing */ 207 208#endif 209 210#define NORMAL_VTABLE(E) \ 211 E ## isName2, \ 212 E ## isName3, \ 213 E ## isName4, \ 214 E ## isNmstrt2, \ 215 E ## isNmstrt3, \ 216 E ## isNmstrt4, \ 217 E ## isInvalid2, \ 218 E ## isInvalid3, \ 219 E ## isInvalid4 220 221static int FASTCALL checkCharRefNumber(int); 222 223#include "xmltok_impl.h" 224#include "ascii.h" 225 226#ifdef XML_MIN_SIZE 227#define sb_isNameMin isNever 228#define sb_isNmstrtMin isNever 229#endif 230 231#ifdef XML_MIN_SIZE 232#define MINBPC(enc) ((enc)->minBytesPerChar) 233#else 234/* minimum bytes per character */ 235#define MINBPC(enc) 1 236#endif 237 238#define SB_BYTE_TYPE(enc, p) \ 239 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 240 241#ifdef XML_MIN_SIZE 242static int PTRFASTCALL 243sb_byteType(const ENCODING *enc, const char *p) 244{ 245 return SB_BYTE_TYPE(enc, p); 246} 247#define BYTE_TYPE(enc, p) \ 248 (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 249#else 250#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) 251#endif 252 253#ifdef XML_MIN_SIZE 254#define BYTE_TO_ASCII(enc, p) \ 255 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 256static int PTRFASTCALL 257sb_byteToAscii(const ENCODING *enc, const char *p) 258{ 259 return *p; 260} 261#else 262#define BYTE_TO_ASCII(enc, p) (*(p)) 263#endif 264 265#define IS_NAME_CHAR(enc, p, n) \ 266 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p)) 267#define IS_NMSTRT_CHAR(enc, p, n) \ 268 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p)) 269#define IS_INVALID_CHAR(enc, p, n) \ 270 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p)) 271 272#ifdef XML_MIN_SIZE 273#define IS_NAME_CHAR_MINBPC(enc, p) \ 274 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 275#define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 276 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 277#else 278#define IS_NAME_CHAR_MINBPC(enc, p) (0) 279#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 280#endif 281 282#ifdef XML_MIN_SIZE 283#define CHAR_MATCHES(enc, p, c) \ 284 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 285static int PTRCALL 286sb_charMatches(const ENCODING *enc, const char *p, int c) 287{ 288 return *p == c; 289} 290#else 291/* c is an ASCII character */ 292#define CHAR_MATCHES(enc, p, c) (*(p) == c) 293#endif 294 295#define PREFIX(ident) normal_ ## ident 296#include "xmltok_impl.c" 297 298#undef MINBPC 299#undef BYTE_TYPE 300#undef BYTE_TO_ASCII 301#undef CHAR_MATCHES 302#undef IS_NAME_CHAR 303#undef IS_NAME_CHAR_MINBPC 304#undef IS_NMSTRT_CHAR 305#undef IS_NMSTRT_CHAR_MINBPC 306#undef IS_INVALID_CHAR 307 308enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 309 UTF8_cval1 = 0x00, 310 UTF8_cval2 = 0xc0, 311 UTF8_cval3 = 0xe0, 312 UTF8_cval4 = 0xf0 313}; 314 315static void PTRCALL 316utf8_toUtf8(const ENCODING *enc, 317 const char **fromP, const char *fromLim, 318 char **toP, const char *toLim) 319{ 320 char *to; 321 const char *from; 322 if (fromLim - *fromP > toLim - *toP) { 323 /* Avoid copying partial characters. */ 324 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) 325 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) 326 break; 327 } 328 for (to = *toP, from = *fromP; from != fromLim; from++, to++) 329 *to = *from; 330 *fromP = from; 331 *toP = to; 332} 333 334static void PTRCALL 335utf8_toUtf16(const ENCODING *enc, 336 const char **fromP, const char *fromLim, 337 unsigned short **toP, const unsigned short *toLim) 338{ 339 unsigned short *to = *toP; 340 const char *from = *fromP; 341 while (from != fromLim && to != toLim) { 342 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { 343 case BT_LEAD2: 344 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); 345 from += 2; 346 break; 347 case BT_LEAD3: 348 *to++ = (unsigned short)(((from[0] & 0xf) << 12) 349 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); 350 from += 3; 351 break; 352 case BT_LEAD4: 353 { 354 unsigned long n; 355 if (to + 1 == toLim) 356 goto after; 357 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) 358 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 359 n -= 0x10000; 360 to[0] = (unsigned short)((n >> 10) | 0xD800); 361 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 362 to += 2; 363 from += 4; 364 } 365 break; 366 default: 367 *to++ = *from++; 368 break; 369 } 370 } 371after: 372 *fromP = from; 373 *toP = to; 374} 375 376#ifdef XML_NS 377static const struct normal_encoding utf8_encoding_ns = { 378 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 379 { 380#include "asciitab.h" 381#include "utf8tab.h" 382 }, 383 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 384}; 385#endif 386 387static const struct normal_encoding utf8_encoding = { 388 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 389 { 390#define BT_COLON BT_NMSTRT 391#include "asciitab.h" 392#undef BT_COLON 393#include "utf8tab.h" 394 }, 395 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 396}; 397 398#ifdef XML_NS 399 400static const struct normal_encoding internal_utf8_encoding_ns = { 401 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 402 { 403#include "iasciitab.h" 404#include "utf8tab.h" 405 }, 406 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 407}; 408 409#endif 410 411static const struct normal_encoding internal_utf8_encoding = { 412 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 413 { 414#define BT_COLON BT_NMSTRT 415#include "iasciitab.h" 416#undef BT_COLON 417#include "utf8tab.h" 418 }, 419 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 420}; 421 422static void PTRCALL 423latin1_toUtf8(const ENCODING *enc, 424 const char **fromP, const char *fromLim, 425 char **toP, const char *toLim) 426{ 427 for (;;) { 428 unsigned char c; 429 if (*fromP == fromLim) 430 break; 431 c = (unsigned char)**fromP; 432 if (c & 0x80) { 433 if (toLim - *toP < 2) 434 break; 435 *(*toP)++ = (char)((c >> 6) | UTF8_cval2); 436 *(*toP)++ = (char)((c & 0x3f) | 0x80); 437 (*fromP)++; 438 } 439 else { 440 if (*toP == toLim) 441 break; 442 *(*toP)++ = *(*fromP)++; 443 } 444 } 445} 446 447static void PTRCALL 448latin1_toUtf16(const ENCODING *enc, 449 const char **fromP, const char *fromLim, 450 unsigned short **toP, const unsigned short *toLim) 451{ 452 while (*fromP != fromLim && *toP != toLim) 453 *(*toP)++ = (unsigned char)*(*fromP)++; 454} 455 456#ifdef XML_NS 457 458static const struct normal_encoding latin1_encoding_ns = { 459 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 460 { 461#include "asciitab.h" 462#include "latin1tab.h" 463 }, 464 STANDARD_VTABLE(sb_) 465}; 466 467#endif 468 469static const struct normal_encoding latin1_encoding = { 470 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 471 { 472#define BT_COLON BT_NMSTRT 473#include "asciitab.h" 474#undef BT_COLON 475#include "latin1tab.h" 476 }, 477 STANDARD_VTABLE(sb_) 478}; 479 480static void PTRCALL 481ascii_toUtf8(const ENCODING *enc, 482 const char **fromP, const char *fromLim, 483 char **toP, const char *toLim) 484{ 485 while (*fromP != fromLim && *toP != toLim) 486 *(*toP)++ = *(*fromP)++; 487} 488 489#ifdef XML_NS 490 491static const struct normal_encoding ascii_encoding_ns = { 492 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 493 { 494#include "asciitab.h" 495/* BT_NONXML == 0 */ 496 }, 497 STANDARD_VTABLE(sb_) 498}; 499 500#endif 501 502static const struct normal_encoding ascii_encoding = { 503 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 504 { 505#define BT_COLON BT_NMSTRT 506#include "asciitab.h" 507#undef BT_COLON 508/* BT_NONXML == 0 */ 509 }, 510 STANDARD_VTABLE(sb_) 511}; 512 513static int PTRFASTCALL 514unicode_byte_type(char hi, char lo) 515{ 516 switch ((unsigned char)hi) { 517 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 518 return BT_LEAD4; 519 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 520 return BT_TRAIL; 521 case 0xFF: 522 switch ((unsigned char)lo) { 523 case 0xFF: 524 case 0xFE: 525 return BT_NONXML; 526 } 527 break; 528 } 529 return BT_NONASCII; 530} 531 532#define DEFINE_UTF16_TO_UTF8(E) \ 533static void PTRCALL \ 534E ## toUtf8(const ENCODING *enc, \ 535 const char **fromP, const char *fromLim, \ 536 char **toP, const char *toLim) \ 537{ \ 538 const char *from; \ 539 for (from = *fromP; from != fromLim; from += 2) { \ 540 int plane; \ 541 unsigned char lo2; \ 542 unsigned char lo = GET_LO(from); \ 543 unsigned char hi = GET_HI(from); \ 544 switch (hi) { \ 545 case 0: \ 546 if (lo < 0x80) { \ 547 if (*toP == toLim) { \ 548 *fromP = from; \ 549 return; \ 550 } \ 551 *(*toP)++ = lo; \ 552 break; \ 553 } \ 554 /* fall through */ \ 555 case 0x1: case 0x2: case 0x3: \ 556 case 0x4: case 0x5: case 0x6: case 0x7: \ 557 if (toLim - *toP < 2) { \ 558 *fromP = from; \ 559 return; \ 560 } \ 561 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 562 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 563 break; \ 564 default: \ 565 if (toLim - *toP < 3) { \ 566 *fromP = from; \ 567 return; \ 568 } \ 569 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 570 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 571 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 572 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 573 break; \ 574 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ 575 if (toLim - *toP < 4) { \ 576 *fromP = from; \ 577 return; \ 578 } \ 579 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 580 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ 581 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 582 from += 2; \ 583 lo2 = GET_LO(from); \ 584 *(*toP)++ = (((lo & 0x3) << 4) \ 585 | ((GET_HI(from) & 0x3) << 2) \ 586 | (lo2 >> 6) \ 587 | 0x80); \ 588 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 589 break; \ 590 } \ 591 } \ 592 *fromP = from; \ 593} 594 595#define DEFINE_UTF16_TO_UTF16(E) \ 596static void PTRCALL \ 597E ## toUtf16(const ENCODING *enc, \ 598 const char **fromP, const char *fromLim, \ 599 unsigned short **toP, const unsigned short *toLim) \ 600{ \ 601 /* Avoid copying first half only of surrogate */ \ 602 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 603 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ 604 fromLim -= 2; \ 605 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ 606 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 607} 608 609#define SET2(ptr, ch) \ 610 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) 611#define GET_LO(ptr) ((unsigned char)(ptr)[0]) 612#define GET_HI(ptr) ((unsigned char)(ptr)[1]) 613 614DEFINE_UTF16_TO_UTF8(little2_) 615DEFINE_UTF16_TO_UTF16(little2_) 616 617#undef SET2 618#undef GET_LO 619#undef GET_HI 620 621#define SET2(ptr, ch) \ 622 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) 623#define GET_LO(ptr) ((unsigned char)(ptr)[1]) 624#define GET_HI(ptr) ((unsigned char)(ptr)[0]) 625 626DEFINE_UTF16_TO_UTF8(big2_) 627DEFINE_UTF16_TO_UTF16(big2_) 628 629#undef SET2 630#undef GET_LO 631#undef GET_HI 632 633#define LITTLE2_BYTE_TYPE(enc, p) \ 634 ((p)[1] == 0 \ 635 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 636 : unicode_byte_type((p)[1], (p)[0])) 637#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) 638#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) 639#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ 640 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 641#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 642 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 643 644#ifdef XML_MIN_SIZE 645 646static int PTRFASTCALL 647little2_byteType(const ENCODING *enc, const char *p) 648{ 649 return LITTLE2_BYTE_TYPE(enc, p); 650} 651 652static int PTRFASTCALL 653little2_byteToAscii(const ENCODING *enc, const char *p) 654{ 655 return LITTLE2_BYTE_TO_ASCII(enc, p); 656} 657 658static int PTRCALL 659little2_charMatches(const ENCODING *enc, const char *p, int c) 660{ 661 return LITTLE2_CHAR_MATCHES(enc, p, c); 662} 663 664static int PTRFASTCALL 665little2_isNameMin(const ENCODING *enc, const char *p) 666{ 667 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p); 668} 669 670static int PTRFASTCALL 671little2_isNmstrtMin(const ENCODING *enc, const char *p) 672{ 673 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p); 674} 675 676#undef VTABLE 677#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 678 679#else /* not XML_MIN_SIZE */ 680 681#undef PREFIX 682#define PREFIX(ident) little2_ ## ident 683#define MINBPC(enc) 2 684/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 685#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) 686#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) 687#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) 688#define IS_NAME_CHAR(enc, p, n) 0 689#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) 690#define IS_NMSTRT_CHAR(enc, p, n) (0) 691#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) 692 693#include "xmltok_impl.c" 694 695#undef MINBPC 696#undef BYTE_TYPE 697#undef BYTE_TO_ASCII 698#undef CHAR_MATCHES 699#undef IS_NAME_CHAR 700#undef IS_NAME_CHAR_MINBPC 701#undef IS_NMSTRT_CHAR 702#undef IS_NMSTRT_CHAR_MINBPC 703#undef IS_INVALID_CHAR 704 705#endif /* not XML_MIN_SIZE */ 706 707#ifdef XML_NS 708 709static const struct normal_encoding little2_encoding_ns = { 710 { VTABLE, 2, 0, 711#if BYTEORDER == 1234 712 1 713#else 714 0 715#endif 716 }, 717 { 718#include "asciitab.h" 719#include "latin1tab.h" 720 }, 721 STANDARD_VTABLE(little2_) 722}; 723 724#endif 725 726static const struct normal_encoding little2_encoding = { 727 { VTABLE, 2, 0, 728#if BYTEORDER == 1234 729 1 730#else 731 0 732#endif 733 }, 734 { 735#define BT_COLON BT_NMSTRT 736#include "asciitab.h" 737#undef BT_COLON 738#include "latin1tab.h" 739 }, 740 STANDARD_VTABLE(little2_) 741}; 742 743#if BYTEORDER != 4321 744 745#ifdef XML_NS 746 747static const struct normal_encoding internal_little2_encoding_ns = { 748 { VTABLE, 2, 0, 1 }, 749 { 750#include "iasciitab.h" 751#include "latin1tab.h" 752 }, 753 STANDARD_VTABLE(little2_) 754}; 755 756#endif 757 758static const struct normal_encoding internal_little2_encoding = { 759 { VTABLE, 2, 0, 1 }, 760 { 761#define BT_COLON BT_NMSTRT 762#include "iasciitab.h" 763#undef BT_COLON 764#include "latin1tab.h" 765 }, 766 STANDARD_VTABLE(little2_) 767}; 768 769#endif 770 771 772#define BIG2_BYTE_TYPE(enc, p) \ 773 ((p)[0] == 0 \ 774 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ 775 : unicode_byte_type((p)[0], (p)[1])) 776#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) 777#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) 778#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ 779 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 780#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 781 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 782 783#ifdef XML_MIN_SIZE 784 785static int PTRFASTCALL 786big2_byteType(const ENCODING *enc, const char *p) 787{ 788 return BIG2_BYTE_TYPE(enc, p); 789} 790 791static int PTRFASTCALL 792big2_byteToAscii(const ENCODING *enc, const char *p) 793{ 794 return BIG2_BYTE_TO_ASCII(enc, p); 795} 796 797static int PTRCALL 798big2_charMatches(const ENCODING *enc, const char *p, int c) 799{ 800 return BIG2_CHAR_MATCHES(enc, p, c); 801} 802 803static int PTRFASTCALL 804big2_isNameMin(const ENCODING *enc, const char *p) 805{ 806 return BIG2_IS_NAME_CHAR_MINBPC(enc, p); 807} 808 809static int PTRFASTCALL 810big2_isNmstrtMin(const ENCODING *enc, const char *p) 811{ 812 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p); 813} 814 815#undef VTABLE 816#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 817 818#else /* not XML_MIN_SIZE */ 819 820#undef PREFIX 821#define PREFIX(ident) big2_ ## ident 822#define MINBPC(enc) 2 823/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 824#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 825#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 826#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) 827#define IS_NAME_CHAR(enc, p, n) 0 828#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) 829#define IS_NMSTRT_CHAR(enc, p, n) (0) 830#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) 831 832#include "xmltok_impl.c" 833 834#undef MINBPC 835#undef BYTE_TYPE 836#undef BYTE_TO_ASCII 837#undef CHAR_MATCHES 838#undef IS_NAME_CHAR 839#undef IS_NAME_CHAR_MINBPC 840#undef IS_NMSTRT_CHAR 841#undef IS_NMSTRT_CHAR_MINBPC 842#undef IS_INVALID_CHAR 843 844#endif /* not XML_MIN_SIZE */ 845 846#ifdef XML_NS 847 848static const struct normal_encoding big2_encoding_ns = { 849 { VTABLE, 2, 0, 850#if BYTEORDER == 4321 851 1 852#else 853 0 854#endif 855 }, 856 { 857#include "asciitab.h" 858#include "latin1tab.h" 859 }, 860 STANDARD_VTABLE(big2_) 861}; 862 863#endif 864 865static const struct normal_encoding big2_encoding = { 866 { VTABLE, 2, 0, 867#if BYTEORDER == 4321 868 1 869#else 870 0 871#endif 872 }, 873 { 874#define BT_COLON BT_NMSTRT 875#include "asciitab.h" 876#undef BT_COLON 877#include "latin1tab.h" 878 }, 879 STANDARD_VTABLE(big2_) 880}; 881 882#if BYTEORDER != 1234 883 884#ifdef XML_NS 885 886static const struct normal_encoding internal_big2_encoding_ns = { 887 { VTABLE, 2, 0, 1 }, 888 { 889#include "iasciitab.h" 890#include "latin1tab.h" 891 }, 892 STANDARD_VTABLE(big2_) 893}; 894 895#endif 896 897static const struct normal_encoding internal_big2_encoding = { 898 { VTABLE, 2, 0, 1 }, 899 { 900#define BT_COLON BT_NMSTRT 901#include "iasciitab.h" 902#undef BT_COLON 903#include "latin1tab.h" 904 }, 905 STANDARD_VTABLE(big2_) 906}; 907 908#endif 909 910#undef PREFIX 911 912static int FASTCALL 913streqci(const char *s1, const char *s2) 914{ 915 for (;;) { 916 char c1 = *s1++; 917 char c2 = *s2++; 918 if (ASCII_a <= c1 && c1 <= ASCII_z) 919 c1 += ASCII_A - ASCII_a; 920 if (ASCII_a <= c2 && c2 <= ASCII_z) 921 c2 += ASCII_A - ASCII_a; 922 if (c1 != c2) 923 return 0; 924 if (!c1) 925 break; 926 } 927 return 1; 928} 929 930static void PTRCALL 931initUpdatePosition(const ENCODING *enc, const char *ptr, 932 const char *end, POSITION *pos) 933{ 934 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 935} 936 937static int 938toAscii(const ENCODING *enc, const char *ptr, const char *end) 939{ 940 char buf[1]; 941 char *p = buf; 942 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 943 if (p == buf) 944 return -1; 945 else 946 return buf[0]; 947} 948 949static int FASTCALL 950isSpace(int c) 951{ 952 switch (c) { 953 case 0x20: 954 case 0xD: 955 case 0xA: 956 case 0x9: 957 return 1; 958 } 959 return 0; 960} 961 962/* Return 1 if there's just optional white space or there's an S 963 followed by name=val. 964*/ 965static int 966parsePseudoAttribute(const ENCODING *enc, 967 const char *ptr, 968 const char *end, 969 const char **namePtr, 970 const char **nameEndPtr, 971 const char **valPtr, 972 const char **nextTokPtr) 973{ 974 int c; 975 char open; 976 if (ptr == end) { 977 *namePtr = NULL; 978 return 1; 979 } 980 if (!isSpace(toAscii(enc, ptr, end))) { 981 *nextTokPtr = ptr; 982 return 0; 983 } 984 do { 985 ptr += enc->minBytesPerChar; 986 } while (isSpace(toAscii(enc, ptr, end))); 987 if (ptr == end) { 988 *namePtr = NULL; 989 return 1; 990 } 991 *namePtr = ptr; 992 for (;;) { 993 c = toAscii(enc, ptr, end); 994 if (c == -1) { 995 *nextTokPtr = ptr; 996 return 0; 997 } 998 if (c == ASCII_EQUALS) { 999 *nameEndPtr = ptr; 1000 break; 1001 } 1002 if (isSpace(c)) { 1003 *nameEndPtr = ptr; 1004 do { 1005 ptr += enc->minBytesPerChar; 1006 } while (isSpace(c = toAscii(enc, ptr, end))); 1007 if (c != ASCII_EQUALS) { 1008 *nextTokPtr = ptr; 1009 return 0; 1010 } 1011 break; 1012 } 1013 ptr += enc->minBytesPerChar; 1014 } 1015 if (ptr == *namePtr) { 1016 *nextTokPtr = ptr; 1017 return 0; 1018 } 1019 ptr += enc->minBytesPerChar; 1020 c = toAscii(enc, ptr, end); 1021 while (isSpace(c)) { 1022 ptr += enc->minBytesPerChar; 1023 c = toAscii(enc, ptr, end); 1024 } 1025 if (c != ASCII_QUOT && c != ASCII_APOS) { 1026 *nextTokPtr = ptr; 1027 return 0; 1028 } 1029 open = (char)c; 1030 ptr += enc->minBytesPerChar; 1031 *valPtr = ptr; 1032 for (;; ptr += enc->minBytesPerChar) { 1033 c = toAscii(enc, ptr, end); 1034 if (c == open) 1035 break; 1036 if (!(ASCII_a <= c && c <= ASCII_z) 1037 && !(ASCII_A <= c && c <= ASCII_Z) 1038 && !(ASCII_0 <= c && c <= ASCII_9) 1039 && c != ASCII_PERIOD 1040 && c != ASCII_MINUS 1041 && c != ASCII_UNDERSCORE) { 1042 *nextTokPtr = ptr; 1043 return 0; 1044 } 1045 } 1046 *nextTokPtr = ptr + enc->minBytesPerChar; 1047 return 1; 1048} 1049 1050static const char KW_version[] = { 1051 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0' 1052}; 1053 1054static const char KW_encoding[] = { 1055 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0' 1056}; 1057 1058static const char KW_standalone[] = { 1059 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, 1060 ASCII_n, ASCII_e, '\0' 1061}; 1062 1063static const char KW_yes[] = { 1064 ASCII_y, ASCII_e, ASCII_s, '\0' 1065}; 1066 1067static const char KW_no[] = { 1068 ASCII_n, ASCII_o, '\0' 1069}; 1070 1071static int 1072doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, 1073 const char *, 1074 const char *), 1075 int isGeneralTextEntity, 1076 const ENCODING *enc, 1077 const char *ptr, 1078 const char *end, 1079 const char **badPtr, 1080 const char **versionPtr, 1081 const char **versionEndPtr, 1082 const char **encodingName, 1083 const ENCODING **encoding, 1084 int *standalone) 1085{ 1086 const char *val = NULL; 1087 const char *name = NULL; 1088 const char *nameEnd = NULL; 1089 ptr += 5 * enc->minBytesPerChar; 1090 end -= 2 * enc->minBytesPerChar; 1091 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1092 || !name) { 1093 *badPtr = ptr; 1094 return 0; 1095 } 1096 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1097 if (!isGeneralTextEntity) { 1098 *badPtr = name; 1099 return 0; 1100 } 1101 } 1102 else { 1103 if (versionPtr) 1104 *versionPtr = val; 1105 if (versionEndPtr) 1106 *versionEndPtr = ptr; 1107 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1108 *badPtr = ptr; 1109 return 0; 1110 } 1111 if (!name) { 1112 if (isGeneralTextEntity) { 1113 /* a TextDecl must have an EncodingDecl */ 1114 *badPtr = ptr; 1115 return 0; 1116 } 1117 return 1; 1118 } 1119 } 1120 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1121 int c = toAscii(enc, val, end); 1122 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) { 1123 *badPtr = val; 1124 return 0; 1125 } 1126 if (encodingName) 1127 *encodingName = val; 1128 if (encoding) 1129 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1130 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1131 *badPtr = ptr; 1132 return 0; 1133 } 1134 if (!name) 1135 return 1; 1136 } 1137 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1138 || isGeneralTextEntity) { 1139 *badPtr = name; 1140 return 0; 1141 } 1142 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1143 if (standalone) 1144 *standalone = 1; 1145 } 1146 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1147 if (standalone) 1148 *standalone = 0; 1149 } 1150 else { 1151 *badPtr = val; 1152 return 0; 1153 } 1154 while (isSpace(toAscii(enc, ptr, end))) 1155 ptr += enc->minBytesPerChar; 1156 if (ptr != end) { 1157 *badPtr = ptr; 1158 return 0; 1159 } 1160 return 1; 1161} 1162 1163static int FASTCALL 1164checkCharRefNumber(int result) 1165{ 1166 switch (result >> 8) { 1167 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 1168 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 1169 return -1; 1170 case 0: 1171 if (latin1_encoding.type[result] == BT_NONXML) 1172 return -1; 1173 break; 1174 case 0xFF: 1175 if (result == 0xFFFE || result == 0xFFFF) 1176 return -1; 1177 break; 1178 } 1179 return result; 1180} 1181 1182int FASTCALL 1183XmlUtf8Encode(int c, char *buf) 1184{ 1185 enum { 1186 /* minN is minimum legal resulting value for N byte sequence */ 1187 min2 = 0x80, 1188 min3 = 0x800, 1189 min4 = 0x10000 1190 }; 1191 1192 if (c < 0) 1193 return 0; 1194 if (c < min2) { 1195 buf[0] = (char)(c | UTF8_cval1); 1196 return 1; 1197 } 1198 if (c < min3) { 1199 buf[0] = (char)((c >> 6) | UTF8_cval2); 1200 buf[1] = (char)((c & 0x3f) | 0x80); 1201 return 2; 1202 } 1203 if (c < min4) { 1204 buf[0] = (char)((c >> 12) | UTF8_cval3); 1205 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1206 buf[2] = (char)((c & 0x3f) | 0x80); 1207 return 3; 1208 } 1209 if (c < 0x110000) { 1210 buf[0] = (char)((c >> 18) | UTF8_cval4); 1211 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1212 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1213 buf[3] = (char)((c & 0x3f) | 0x80); 1214 return 4; 1215 } 1216 return 0; 1217} 1218 1219int FASTCALL 1220XmlUtf16Encode(int charNum, unsigned short *buf) 1221{ 1222 if (charNum < 0) 1223 return 0; 1224 if (charNum < 0x10000) { 1225 buf[0] = (unsigned short)charNum; 1226 return 1; 1227 } 1228 if (charNum < 0x110000) { 1229 charNum -= 0x10000; 1230 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1231 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1232 return 2; 1233 } 1234 return 0; 1235} 1236 1237struct unknown_encoding { 1238 struct normal_encoding normal; 1239 CONVERTER convert; 1240 void *userData; 1241 unsigned short utf16[256]; 1242 char utf8[256][4]; 1243}; 1244 1245#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc)) 1246 1247int 1248XmlSizeOfUnknownEncoding(void) 1249{ 1250 return sizeof(struct unknown_encoding); 1251} 1252 1253static int PTRFASTCALL 1254unknown_isName(const ENCODING *enc, const char *p) 1255{ 1256 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1257 int c = uenc->convert(uenc->userData, p); 1258 if (c & ~0xFFFF) 1259 return 0; 1260 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1261} 1262 1263static int PTRFASTCALL 1264unknown_isNmstrt(const ENCODING *enc, const char *p) 1265{ 1266 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1267 int c = uenc->convert(uenc->userData, p); 1268 if (c & ~0xFFFF) 1269 return 0; 1270 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1271} 1272 1273static int PTRFASTCALL 1274unknown_isInvalid(const ENCODING *enc, const char *p) 1275{ 1276 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1277 int c = uenc->convert(uenc->userData, p); 1278 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1279} 1280 1281static void PTRCALL 1282unknown_toUtf8(const ENCODING *enc, 1283 const char **fromP, const char *fromLim, 1284 char **toP, const char *toLim) 1285{ 1286 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1287 char buf[XML_UTF8_ENCODE_MAX]; 1288 for (;;) { 1289 const char *utf8; 1290 int n; 1291 if (*fromP == fromLim) 1292 break; 1293 utf8 = uenc->utf8[(unsigned char)**fromP]; 1294 n = *utf8++; 1295 if (n == 0) { 1296 int c = uenc->convert(uenc->userData, *fromP); 1297 n = XmlUtf8Encode(c, buf); 1298 if (n > toLim - *toP) 1299 break; 1300 utf8 = buf; 1301 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1302 - (BT_LEAD2 - 2)); 1303 } 1304 else { 1305 if (n > toLim - *toP) 1306 break; 1307 (*fromP)++; 1308 } 1309 do { 1310 *(*toP)++ = *utf8++; 1311 } while (--n != 0); 1312 } 1313} 1314 1315static void PTRCALL 1316unknown_toUtf16(const ENCODING *enc, 1317 const char **fromP, const char *fromLim, 1318 unsigned short **toP, const unsigned short *toLim) 1319{ 1320 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1321 while (*fromP != fromLim && *toP != toLim) { 1322 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1323 if (c == 0) { 1324 c = (unsigned short) 1325 uenc->convert(uenc->userData, *fromP); 1326 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1327 - (BT_LEAD2 - 2)); 1328 } 1329 else 1330 (*fromP)++; 1331 *(*toP)++ = c; 1332 } 1333} 1334 1335ENCODING * 1336XmlInitUnknownEncoding(void *mem, 1337 int *table, 1338 CONVERTER convert, 1339 void *userData) 1340{ 1341 int i; 1342 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1343 for (i = 0; i < (int)sizeof(struct normal_encoding); i++) 1344 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; 1345 for (i = 0; i < 128; i++) 1346 if (latin1_encoding.type[i] != BT_OTHER 1347 && latin1_encoding.type[i] != BT_NONXML 1348 && table[i] != i) 1349 return 0; 1350 for (i = 0; i < 256; i++) { 1351 int c = table[i]; 1352 if (c == -1) { 1353 e->normal.type[i] = BT_MALFORM; 1354 /* This shouldn't really get used. */ 1355 e->utf16[i] = 0xFFFF; 1356 e->utf8[i][0] = 1; 1357 e->utf8[i][1] = 0; 1358 } 1359 else if (c < 0) { 1360 if (c < -4) 1361 return 0; 1362 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1363 e->utf8[i][0] = 0; 1364 e->utf16[i] = 0; 1365 } 1366 else if (c < 0x80) { 1367 if (latin1_encoding.type[c] != BT_OTHER 1368 && latin1_encoding.type[c] != BT_NONXML 1369 && c != i) 1370 return 0; 1371 e->normal.type[i] = latin1_encoding.type[c]; 1372 e->utf8[i][0] = 1; 1373 e->utf8[i][1] = (char)c; 1374 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1375 } 1376 else if (checkCharRefNumber(c) < 0) { 1377 e->normal.type[i] = BT_NONXML; 1378 /* This shouldn't really get used. */ 1379 e->utf16[i] = 0xFFFF; 1380 e->utf8[i][0] = 1; 1381 e->utf8[i][1] = 0; 1382 } 1383 else { 1384 if (c > 0xFFFF) 1385 return 0; 1386 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1387 e->normal.type[i] = BT_NMSTRT; 1388 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1389 e->normal.type[i] = BT_NAME; 1390 else 1391 e->normal.type[i] = BT_OTHER; 1392 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1393 e->utf16[i] = (unsigned short)c; 1394 } 1395 } 1396 e->userData = userData; 1397 e->convert = convert; 1398 if (convert) { 1399 e->normal.isName2 = unknown_isName; 1400 e->normal.isName3 = unknown_isName; 1401 e->normal.isName4 = unknown_isName; 1402 e->normal.isNmstrt2 = unknown_isNmstrt; 1403 e->normal.isNmstrt3 = unknown_isNmstrt; 1404 e->normal.isNmstrt4 = unknown_isNmstrt; 1405 e->normal.isInvalid2 = unknown_isInvalid; 1406 e->normal.isInvalid3 = unknown_isInvalid; 1407 e->normal.isInvalid4 = unknown_isInvalid; 1408 } 1409 e->normal.enc.utf8Convert = unknown_toUtf8; 1410 e->normal.enc.utf16Convert = unknown_toUtf16; 1411 return &(e->normal.enc); 1412} 1413 1414/* If this enumeration is changed, getEncodingIndex and encodings 1415must also be changed. */ 1416enum { 1417 UNKNOWN_ENC = -1, 1418 ISO_8859_1_ENC = 0, 1419 US_ASCII_ENC, 1420 UTF_8_ENC, 1421 UTF_16_ENC, 1422 UTF_16BE_ENC, 1423 UTF_16LE_ENC, 1424 /* must match encodingNames up to here */ 1425 NO_ENC 1426}; 1427 1428static const char KW_ISO_8859_1[] = { 1429 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, 1430 ASCII_MINUS, ASCII_1, '\0' 1431}; 1432static const char KW_US_ASCII[] = { 1433 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, 1434 '\0' 1435}; 1436static const char KW_UTF_8[] = { 1437 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0' 1438}; 1439static const char KW_UTF_16[] = { 1440 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0' 1441}; 1442static const char KW_UTF_16BE[] = { 1443 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, 1444 '\0' 1445}; 1446static const char KW_UTF_16LE[] = { 1447 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, 1448 '\0' 1449}; 1450 1451static int FASTCALL 1452getEncodingIndex(const char *name) 1453{ 1454 static const char *encodingNames[] = { 1455 KW_ISO_8859_1, 1456 KW_US_ASCII, 1457 KW_UTF_8, 1458 KW_UTF_16, 1459 KW_UTF_16BE, 1460 KW_UTF_16LE, 1461 }; 1462 int i; 1463 if (name == NULL) 1464 return NO_ENC; 1465 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) 1466 if (streqci(name, encodingNames[i])) 1467 return i; 1468 return UNKNOWN_ENC; 1469} 1470 1471/* For binary compatibility, we store the index of the encoding 1472 specified at initialization in the isUtf16 member. 1473*/ 1474 1475#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1476#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1477 1478/* This is what detects the encoding. encodingTable maps from 1479 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1480 the external (protocol) specified encoding; state is 1481 XML_CONTENT_STATE if we're parsing an external text entity, and 1482 XML_PROLOG_STATE otherwise. 1483*/ 1484 1485 1486static int 1487initScan(const ENCODING **encodingTable, 1488 const INIT_ENCODING *enc, 1489 int state, 1490 const char *ptr, 1491 const char *end, 1492 const char **nextTokPtr) 1493{ 1494 const ENCODING **encPtr; 1495 1496 if (ptr == end) 1497 return XML_TOK_NONE; 1498 encPtr = enc->encPtr; 1499 if (ptr + 1 == end) { 1500 /* only a single byte available for auto-detection */ 1501#ifndef XML_DTD /* FIXME */ 1502 /* a well-formed document entity must have more than one byte */ 1503 if (state != XML_CONTENT_STATE) 1504 return XML_TOK_PARTIAL; 1505#endif 1506 /* so we're parsing an external text entity... */ 1507 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1508 switch (INIT_ENC_INDEX(enc)) { 1509 case UTF_16_ENC: 1510 case UTF_16LE_ENC: 1511 case UTF_16BE_ENC: 1512 return XML_TOK_PARTIAL; 1513 } 1514 switch ((unsigned char)*ptr) { 1515 case 0xFE: 1516 case 0xFF: 1517 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1518 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1519 && state == XML_CONTENT_STATE) 1520 break; 1521 /* fall through */ 1522 case 0x00: 1523 case 0x3C: 1524 return XML_TOK_PARTIAL; 1525 } 1526 } 1527 else { 1528 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1529 case 0xFEFF: 1530 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1531 && state == XML_CONTENT_STATE) 1532 break; 1533 *nextTokPtr = ptr + 2; 1534 *encPtr = encodingTable[UTF_16BE_ENC]; 1535 return XML_TOK_BOM; 1536 /* 00 3C is handled in the default case */ 1537 case 0x3C00: 1538 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1539 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1540 && state == XML_CONTENT_STATE) 1541 break; 1542 *encPtr = encodingTable[UTF_16LE_ENC]; 1543 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1544 case 0xFFFE: 1545 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1546 && state == XML_CONTENT_STATE) 1547 break; 1548 *nextTokPtr = ptr + 2; 1549 *encPtr = encodingTable[UTF_16LE_ENC]; 1550 return XML_TOK_BOM; 1551 case 0xEFBB: 1552 /* Maybe a UTF-8 BOM (EF BB BF) */ 1553 /* If there's an explicitly specified (external) encoding 1554 of ISO-8859-1 or some flavour of UTF-16 1555 and this is an external text entity, 1556 don't look for the BOM, 1557 because it might be a legal data. 1558 */ 1559 if (state == XML_CONTENT_STATE) { 1560 int e = INIT_ENC_INDEX(enc); 1561 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC 1562 || e == UTF_16LE_ENC || e == UTF_16_ENC) 1563 break; 1564 } 1565 if (ptr + 2 == end) 1566 return XML_TOK_PARTIAL; 1567 if ((unsigned char)ptr[2] == 0xBF) { 1568 *nextTokPtr = ptr + 3; 1569 *encPtr = encodingTable[UTF_8_ENC]; 1570 return XML_TOK_BOM; 1571 } 1572 break; 1573 default: 1574 if (ptr[0] == '\0') { 1575 /* 0 isn't a legal data character. Furthermore a document 1576 entity can only start with ASCII characters. So the only 1577 way this can fail to be big-endian UTF-16 if it it's an 1578 external parsed general entity that's labelled as 1579 UTF-16LE. 1580 */ 1581 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1582 break; 1583 *encPtr = encodingTable[UTF_16BE_ENC]; 1584 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1585 } 1586 else if (ptr[1] == '\0') { 1587 /* We could recover here in the case: 1588 - parsing an external entity 1589 - second byte is 0 1590 - no externally specified encoding 1591 - no encoding declaration 1592 by assuming UTF-16LE. But we don't, because this would mean when 1593 presented just with a single byte, we couldn't reliably determine 1594 whether we needed further bytes. 1595 */ 1596 if (state == XML_CONTENT_STATE) 1597 break; 1598 *encPtr = encodingTable[UTF_16LE_ENC]; 1599 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1600 } 1601 break; 1602 } 1603 } 1604 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1605 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1606} 1607 1608 1609#define NS(x) x 1610#define ns(x) x 1611#include "xmltok_ns.c" 1612#undef NS 1613#undef ns 1614 1615#ifdef XML_NS 1616 1617#define NS(x) x ## NS 1618#define ns(x) x ## _ns 1619 1620#include "xmltok_ns.c" 1621 1622#undef NS 1623#undef ns 1624 1625ENCODING * 1626XmlInitUnknownEncodingNS(void *mem, 1627 int *table, 1628 CONVERTER convert, 1629 void *userData) 1630{ 1631 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1632 if (enc) 1633 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1634 return enc; 1635} 1636 1637#endif /* XML_NS */ 1638