1/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 2 See the file COPYING for copying permission. 3*/ 4 5#ifdef COMPILED_FROM_DSP 6#include "winconfig.h" 7#elif defined(MACOS_CLASSIC) 8#include "macconfig.h" 9#else 10#ifdef HAVE_EXPAT_CONFIG_H 11#include <expat_config.h> 12#endif 13#endif /* ndef COMPILED_FROM_DSP */ 14 15#include "internal.h" 16#include "xmltok.h" 17#include "nametab.h" 18 19#ifdef XML_DTD 20#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 21#else 22#define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 23#endif 24 25#define VTABLE1 \ 26 { PREFIX(prologTok), PREFIX(contentTok), \ 27 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \ 28 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ 29 PREFIX(sameName), \ 30 PREFIX(nameMatchesAscii), \ 31 PREFIX(nameLength), \ 32 PREFIX(skipS), \ 33 PREFIX(getAtts), \ 34 PREFIX(charRefNumber), \ 35 PREFIX(predefinedEntityName), \ 36 PREFIX(updatePosition), \ 37 PREFIX(isPublicId) 38 39#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 40 41#define UCS2_GET_NAMING(pages, hi, lo) \ 42 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) 43 44/* A 2 byte UTF-8 representation splits the characters 11 bits between 45 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into 46 pages, 3 bits to add to that index and 5 bits to generate the mask. 47*/ 48#define UTF8_GET_NAMING2(pages, byte) \ 49 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 50 + ((((byte)[0]) & 3) << 1) \ 51 + ((((byte)[1]) >> 5) & 1)] \ 52 & (1 << (((byte)[1]) & 0x1F))) 53 54/* A 3 byte UTF-8 representation splits the characters 16 bits between 55 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index 56 into pages, 3 bits to add to that index and 5 bits to generate the 57 mask. 58*/ 59#define UTF8_GET_NAMING3(pages, byte) \ 60 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ 61 + ((((byte)[1]) >> 2) & 0xF)] \ 62 << 3) \ 63 + ((((byte)[1]) & 3) << 1) \ 64 + ((((byte)[2]) >> 5) & 1)] \ 65 & (1 << (((byte)[2]) & 0x1F))) 66 67#define UTF8_GET_NAMING(pages, p, n) \ 68 ((n) == 2 \ 69 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ 70 : ((n) == 3 \ 71 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ 72 : 0)) 73 74/* Detection of invalid UTF-8 sequences is based on Table 3.1B 75 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ 76 with the additional restriction of not allowing the Unicode 77 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). 78 Implementation details: 79 (A & 0x80) == 0 means A < 0x80 80 and 81 (A & 0xC0) == 0xC0 means A > 0xBF 82*/ 83 84#define UTF8_INVALID2(p) \ 85 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) 86 87#define UTF8_INVALID3(p) \ 88 (((p)[2] & 0x80) == 0 \ 89 || \ 90 ((*p) == 0xEF && (p)[1] == 0xBF \ 91 ? \ 92 (p)[2] > 0xBD \ 93 : \ 94 ((p)[2] & 0xC0) == 0xC0) \ 95 || \ 96 ((*p) == 0xE0 \ 97 ? \ 98 (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ 99 : \ 100 ((p)[1] & 0x80) == 0 \ 101 || \ 102 ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) 103 104#define UTF8_INVALID4(p) \ 105 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ 106 || \ 107 ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ 108 || \ 109 ((*p) == 0xF0 \ 110 ? \ 111 (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ 112 : \ 113 ((p)[1] & 0x80) == 0 \ 114 || \ 115 ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 116 117static int PTRFASTCALL 118isNever(const ENCODING *enc, const char *p) 119{ 120 return 0; 121} 122 123static int PTRFASTCALL 124utf8_isName2(const ENCODING *enc, const char *p) 125{ 126 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); 127} 128 129static int PTRFASTCALL 130utf8_isName3(const ENCODING *enc, const char *p) 131{ 132 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); 133} 134 135#define utf8_isName4 isNever 136 137static int PTRFASTCALL 138utf8_isNmstrt2(const ENCODING *enc, const char *p) 139{ 140 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); 141} 142 143static int PTRFASTCALL 144utf8_isNmstrt3(const ENCODING *enc, const char *p) 145{ 146 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); 147} 148 149#define utf8_isNmstrt4 isNever 150 151static int PTRFASTCALL 152utf8_isInvalid2(const ENCODING *enc, const char *p) 153{ 154 return UTF8_INVALID2((const unsigned char *)p); 155} 156 157static int PTRFASTCALL 158utf8_isInvalid3(const ENCODING *enc, const char *p) 159{ 160 return UTF8_INVALID3((const unsigned char *)p); 161} 162 163static int PTRFASTCALL 164utf8_isInvalid4(const ENCODING *enc, const char *p) 165{ 166 return UTF8_INVALID4((const unsigned char *)p); 167} 168 169struct normal_encoding { 170 ENCODING enc; 171 unsigned char type[256]; 172#ifdef XML_MIN_SIZE 173 int (PTRFASTCALL *byteType)(const ENCODING *, const char *); 174 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *); 175 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); 176 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); 177 int (PTRCALL *charMatches)(const ENCODING *, const char *, int); 178#endif /* XML_MIN_SIZE */ 179 int (PTRFASTCALL *isName2)(const ENCODING *, const char *); 180 int (PTRFASTCALL *isName3)(const ENCODING *, const char *); 181 int (PTRFASTCALL *isName4)(const ENCODING *, const char *); 182 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); 183 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); 184 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); 185 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); 186 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); 187 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); 188}; 189 190#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc)) 191 192#ifdef XML_MIN_SIZE 193 194#define STANDARD_VTABLE(E) \ 195 E ## byteType, \ 196 E ## isNameMin, \ 197 E ## isNmstrtMin, \ 198 E ## byteToAscii, \ 199 E ## charMatches, 200 201#else 202 203#define STANDARD_VTABLE(E) /* as nothing */ 204 205#endif 206 207#define NORMAL_VTABLE(E) \ 208 E ## isName2, \ 209 E ## isName3, \ 210 E ## isName4, \ 211 E ## isNmstrt2, \ 212 E ## isNmstrt3, \ 213 E ## isNmstrt4, \ 214 E ## isInvalid2, \ 215 E ## isInvalid3, \ 216 E ## isInvalid4 217 218static int FASTCALL checkCharRefNumber(int); 219 220#include "xmltok_impl.h" 221#include "ascii.h" 222 223#ifdef XML_MIN_SIZE 224#define sb_isNameMin isNever 225#define sb_isNmstrtMin isNever 226#endif 227 228#ifdef XML_MIN_SIZE 229#define MINBPC(enc) ((enc)->minBytesPerChar) 230#else 231/* minimum bytes per character */ 232#define MINBPC(enc) 1 233#endif 234 235#define SB_BYTE_TYPE(enc, p) \ 236 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 237 238#ifdef XML_MIN_SIZE 239static int PTRFASTCALL 240sb_byteType(const ENCODING *enc, const char *p) 241{ 242 return SB_BYTE_TYPE(enc, p); 243} 244#define BYTE_TYPE(enc, p) \ 245 (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 246#else 247#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) 248#endif 249 250#ifdef XML_MIN_SIZE 251#define BYTE_TO_ASCII(enc, p) \ 252 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 253static int PTRFASTCALL 254sb_byteToAscii(const ENCODING *enc, const char *p) 255{ 256 return *p; 257} 258#else 259#define BYTE_TO_ASCII(enc, p) (*(p)) 260#endif 261 262#define IS_NAME_CHAR(enc, p, n) \ 263 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p)) 264#define IS_NMSTRT_CHAR(enc, p, n) \ 265 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p)) 266#define IS_INVALID_CHAR(enc, p, n) \ 267 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p)) 268 269#ifdef XML_MIN_SIZE 270#define IS_NAME_CHAR_MINBPC(enc, p) \ 271 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 272#define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 273 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 274#else 275#define IS_NAME_CHAR_MINBPC(enc, p) (0) 276#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 277#endif 278 279#ifdef XML_MIN_SIZE 280#define CHAR_MATCHES(enc, p, c) \ 281 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 282static int PTRCALL 283sb_charMatches(const ENCODING *enc, const char *p, int c) 284{ 285 return *p == c; 286} 287#else 288/* c is an ASCII character */ 289#define CHAR_MATCHES(enc, p, c) (*(p) == c) 290#endif 291 292#define PREFIX(ident) normal_ ## ident 293#include "xmltok_impl.c" 294 295#undef MINBPC 296#undef BYTE_TYPE 297#undef BYTE_TO_ASCII 298#undef CHAR_MATCHES 299#undef IS_NAME_CHAR 300#undef IS_NAME_CHAR_MINBPC 301#undef IS_NMSTRT_CHAR 302#undef IS_NMSTRT_CHAR_MINBPC 303#undef IS_INVALID_CHAR 304 305enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 306 UTF8_cval1 = 0x00, 307 UTF8_cval2 = 0xc0, 308 UTF8_cval3 = 0xe0, 309 UTF8_cval4 = 0xf0 310}; 311 312static void PTRCALL 313utf8_toUtf8(const ENCODING *enc, 314 const char **fromP, const char *fromLim, 315 char **toP, const char *toLim) 316{ 317 char *to; 318 const char *from; 319 if (fromLim - *fromP > toLim - *toP) { 320 /* Avoid copying partial characters. */ 321 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) 322 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) 323 break; 324 } 325 for (to = *toP, from = *fromP; from != fromLim; from++, to++) 326 *to = *from; 327 *fromP = from; 328 *toP = to; 329} 330 331static void PTRCALL 332utf8_toUtf16(const ENCODING *enc, 333 const char **fromP, const char *fromLim, 334 unsigned short **toP, const unsigned short *toLim) 335{ 336 unsigned short *to = *toP; 337 const char *from = *fromP; 338 while (from != fromLim && to != toLim) { 339 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { 340 case BT_LEAD2: 341 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); 342 from += 2; 343 break; 344 case BT_LEAD3: 345 *to++ = (unsigned short)(((from[0] & 0xf) << 12) 346 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); 347 from += 3; 348 break; 349 case BT_LEAD4: 350 { 351 unsigned long n; 352 if (to + 1 == toLim) 353 goto after; 354 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) 355 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 356 n -= 0x10000; 357 to[0] = (unsigned short)((n >> 10) | 0xD800); 358 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 359 to += 2; 360 from += 4; 361 } 362 break; 363 default: 364 *to++ = *from++; 365 break; 366 } 367 } 368after: 369 *fromP = from; 370 *toP = to; 371} 372 373#ifdef XML_NS 374static const struct normal_encoding utf8_encoding_ns = { 375 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 376 { 377#include "asciitab.h" 378#include "utf8tab.h" 379 }, 380 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 381}; 382#endif 383 384static const struct normal_encoding utf8_encoding = { 385 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 386 { 387#define BT_COLON BT_NMSTRT 388#include "asciitab.h" 389#undef BT_COLON 390#include "utf8tab.h" 391 }, 392 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 393}; 394 395#ifdef XML_NS 396 397static const struct normal_encoding internal_utf8_encoding_ns = { 398 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 399 { 400#include "iasciitab.h" 401#include "utf8tab.h" 402 }, 403 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 404}; 405 406#endif 407 408static const struct normal_encoding internal_utf8_encoding = { 409 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 410 { 411#define BT_COLON BT_NMSTRT 412#include "iasciitab.h" 413#undef BT_COLON 414#include "utf8tab.h" 415 }, 416 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 417}; 418 419static void PTRCALL 420latin1_toUtf8(const ENCODING *enc, 421 const char **fromP, const char *fromLim, 422 char **toP, const char *toLim) 423{ 424 for (;;) { 425 unsigned char c; 426 if (*fromP == fromLim) 427 break; 428 c = (unsigned char)**fromP; 429 if (c & 0x80) { 430 if (toLim - *toP < 2) 431 break; 432 *(*toP)++ = (char)((c >> 6) | UTF8_cval2); 433 *(*toP)++ = (char)((c & 0x3f) | 0x80); 434 (*fromP)++; 435 } 436 else { 437 if (*toP == toLim) 438 break; 439 *(*toP)++ = *(*fromP)++; 440 } 441 } 442} 443 444static void PTRCALL 445latin1_toUtf16(const ENCODING *enc, 446 const char **fromP, const char *fromLim, 447 unsigned short **toP, const unsigned short *toLim) 448{ 449 while (*fromP != fromLim && *toP != toLim) 450 *(*toP)++ = (unsigned char)*(*fromP)++; 451} 452 453#ifdef XML_NS 454 455static const struct normal_encoding latin1_encoding_ns = { 456 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 457 { 458#include "asciitab.h" 459#include "latin1tab.h" 460 }, 461 STANDARD_VTABLE(sb_) 462}; 463 464#endif 465 466static const struct normal_encoding latin1_encoding = { 467 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 468 { 469#define BT_COLON BT_NMSTRT 470#include "asciitab.h" 471#undef BT_COLON 472#include "latin1tab.h" 473 }, 474 STANDARD_VTABLE(sb_) 475}; 476 477static void PTRCALL 478ascii_toUtf8(const ENCODING *enc, 479 const char **fromP, const char *fromLim, 480 char **toP, const char *toLim) 481{ 482 while (*fromP != fromLim && *toP != toLim) 483 *(*toP)++ = *(*fromP)++; 484} 485 486#ifdef XML_NS 487 488static const struct normal_encoding ascii_encoding_ns = { 489 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 490 { 491#include "asciitab.h" 492/* BT_NONXML == 0 */ 493 }, 494 STANDARD_VTABLE(sb_) 495}; 496 497#endif 498 499static const struct normal_encoding ascii_encoding = { 500 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 501 { 502#define BT_COLON BT_NMSTRT 503#include "asciitab.h" 504#undef BT_COLON 505/* BT_NONXML == 0 */ 506 }, 507 STANDARD_VTABLE(sb_) 508}; 509 510static int PTRFASTCALL 511unicode_byte_type(char hi, char lo) 512{ 513 switch ((unsigned char)hi) { 514 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 515 return BT_LEAD4; 516 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 517 return BT_TRAIL; 518 case 0xFF: 519 switch ((unsigned char)lo) { 520 case 0xFF: 521 case 0xFE: 522 return BT_NONXML; 523 } 524 break; 525 } 526 return BT_NONASCII; 527} 528 529#define DEFINE_UTF16_TO_UTF8(E) \ 530static void PTRCALL \ 531E ## toUtf8(const ENCODING *enc, \ 532 const char **fromP, const char *fromLim, \ 533 char **toP, const char *toLim) \ 534{ \ 535 const char *from; \ 536 for (from = *fromP; from != fromLim; from += 2) { \ 537 int plane; \ 538 unsigned char lo2; \ 539 unsigned char lo = GET_LO(from); \ 540 unsigned char hi = GET_HI(from); \ 541 switch (hi) { \ 542 case 0: \ 543 if (lo < 0x80) { \ 544 if (*toP == toLim) { \ 545 *fromP = from; \ 546 return; \ 547 } \ 548 *(*toP)++ = lo; \ 549 break; \ 550 } \ 551 /* fall through */ \ 552 case 0x1: case 0x2: case 0x3: \ 553 case 0x4: case 0x5: case 0x6: case 0x7: \ 554 if (toLim - *toP < 2) { \ 555 *fromP = from; \ 556 return; \ 557 } \ 558 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 559 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 560 break; \ 561 default: \ 562 if (toLim - *toP < 3) { \ 563 *fromP = from; \ 564 return; \ 565 } \ 566 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 567 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 568 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 569 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 570 break; \ 571 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ 572 if (toLim - *toP < 4) { \ 573 *fromP = from; \ 574 return; \ 575 } \ 576 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 577 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ 578 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 579 from += 2; \ 580 lo2 = GET_LO(from); \ 581 *(*toP)++ = (((lo & 0x3) << 4) \ 582 | ((GET_HI(from) & 0x3) << 2) \ 583 | (lo2 >> 6) \ 584 | 0x80); \ 585 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 586 break; \ 587 } \ 588 } \ 589 *fromP = from; \ 590} 591 592#define DEFINE_UTF16_TO_UTF16(E) \ 593static void PTRCALL \ 594E ## toUtf16(const ENCODING *enc, \ 595 const char **fromP, const char *fromLim, \ 596 unsigned short **toP, const unsigned short *toLim) \ 597{ \ 598 /* Avoid copying first half only of surrogate */ \ 599 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 600 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ 601 fromLim -= 2; \ 602 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ 603 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 604} 605 606#define SET2(ptr, ch) \ 607 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) 608#define GET_LO(ptr) ((unsigned char)(ptr)[0]) 609#define GET_HI(ptr) ((unsigned char)(ptr)[1]) 610 611DEFINE_UTF16_TO_UTF8(little2_) 612DEFINE_UTF16_TO_UTF16(little2_) 613 614#undef SET2 615#undef GET_LO 616#undef GET_HI 617 618#define SET2(ptr, ch) \ 619 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) 620#define GET_LO(ptr) ((unsigned char)(ptr)[1]) 621#define GET_HI(ptr) ((unsigned char)(ptr)[0]) 622 623DEFINE_UTF16_TO_UTF8(big2_) 624DEFINE_UTF16_TO_UTF16(big2_) 625 626#undef SET2 627#undef GET_LO 628#undef GET_HI 629 630#define LITTLE2_BYTE_TYPE(enc, p) \ 631 ((p)[1] == 0 \ 632 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 633 : unicode_byte_type((p)[1], (p)[0])) 634#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) 635#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) 636#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ 637 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 638#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 639 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 640 641#ifdef XML_MIN_SIZE 642 643static int PTRFASTCALL 644little2_byteType(const ENCODING *enc, const char *p) 645{ 646 return LITTLE2_BYTE_TYPE(enc, p); 647} 648 649static int PTRFASTCALL 650little2_byteToAscii(const ENCODING *enc, const char *p) 651{ 652 return LITTLE2_BYTE_TO_ASCII(enc, p); 653} 654 655static int PTRCALL 656little2_charMatches(const ENCODING *enc, const char *p, int c) 657{ 658 return LITTLE2_CHAR_MATCHES(enc, p, c); 659} 660 661static int PTRFASTCALL 662little2_isNameMin(const ENCODING *enc, const char *p) 663{ 664 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p); 665} 666 667static int PTRFASTCALL 668little2_isNmstrtMin(const ENCODING *enc, const char *p) 669{ 670 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p); 671} 672 673#undef VTABLE 674#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 675 676#else /* not XML_MIN_SIZE */ 677 678#undef PREFIX 679#define PREFIX(ident) little2_ ## ident 680#define MINBPC(enc) 2 681/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 682#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) 683#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) 684#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) 685#define IS_NAME_CHAR(enc, p, n) 0 686#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) 687#define IS_NMSTRT_CHAR(enc, p, n) (0) 688#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) 689 690#include "xmltok_impl.c" 691 692#undef MINBPC 693#undef BYTE_TYPE 694#undef BYTE_TO_ASCII 695#undef CHAR_MATCHES 696#undef IS_NAME_CHAR 697#undef IS_NAME_CHAR_MINBPC 698#undef IS_NMSTRT_CHAR 699#undef IS_NMSTRT_CHAR_MINBPC 700#undef IS_INVALID_CHAR 701 702#endif /* not XML_MIN_SIZE */ 703 704#ifdef XML_NS 705 706static const struct normal_encoding little2_encoding_ns = { 707 { VTABLE, 2, 0, 708#if BYTEORDER == 1234 709 1 710#else 711 0 712#endif 713 }, 714 { 715#include "asciitab.h" 716#include "latin1tab.h" 717 }, 718 STANDARD_VTABLE(little2_) 719}; 720 721#endif 722 723static const struct normal_encoding little2_encoding = { 724 { VTABLE, 2, 0, 725#if BYTEORDER == 1234 726 1 727#else 728 0 729#endif 730 }, 731 { 732#define BT_COLON BT_NMSTRT 733#include "asciitab.h" 734#undef BT_COLON 735#include "latin1tab.h" 736 }, 737 STANDARD_VTABLE(little2_) 738}; 739 740#if BYTEORDER != 4321 741 742#ifdef XML_NS 743 744static const struct normal_encoding internal_little2_encoding_ns = { 745 { VTABLE, 2, 0, 1 }, 746 { 747#include "iasciitab.h" 748#include "latin1tab.h" 749 }, 750 STANDARD_VTABLE(little2_) 751}; 752 753#endif 754 755static const struct normal_encoding internal_little2_encoding = { 756 { VTABLE, 2, 0, 1 }, 757 { 758#define BT_COLON BT_NMSTRT 759#include "iasciitab.h" 760#undef BT_COLON 761#include "latin1tab.h" 762 }, 763 STANDARD_VTABLE(little2_) 764}; 765 766#endif 767 768 769#define BIG2_BYTE_TYPE(enc, p) \ 770 ((p)[0] == 0 \ 771 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ 772 : unicode_byte_type((p)[0], (p)[1])) 773#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) 774#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) 775#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ 776 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 777#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 778 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 779 780#ifdef XML_MIN_SIZE 781 782static int PTRFASTCALL 783big2_byteType(const ENCODING *enc, const char *p) 784{ 785 return BIG2_BYTE_TYPE(enc, p); 786} 787 788static int PTRFASTCALL 789big2_byteToAscii(const ENCODING *enc, const char *p) 790{ 791 return BIG2_BYTE_TO_ASCII(enc, p); 792} 793 794static int PTRCALL 795big2_charMatches(const ENCODING *enc, const char *p, int c) 796{ 797 return BIG2_CHAR_MATCHES(enc, p, c); 798} 799 800static int PTRFASTCALL 801big2_isNameMin(const ENCODING *enc, const char *p) 802{ 803 return BIG2_IS_NAME_CHAR_MINBPC(enc, p); 804} 805 806static int PTRFASTCALL 807big2_isNmstrtMin(const ENCODING *enc, const char *p) 808{ 809 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p); 810} 811 812#undef VTABLE 813#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 814 815#else /* not XML_MIN_SIZE */ 816 817#undef PREFIX 818#define PREFIX(ident) big2_ ## ident 819#define MINBPC(enc) 2 820/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 821#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 822#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 823#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) 824#define IS_NAME_CHAR(enc, p, n) 0 825#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) 826#define IS_NMSTRT_CHAR(enc, p, n) (0) 827#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) 828 829#include "xmltok_impl.c" 830 831#undef MINBPC 832#undef BYTE_TYPE 833#undef BYTE_TO_ASCII 834#undef CHAR_MATCHES 835#undef IS_NAME_CHAR 836#undef IS_NAME_CHAR_MINBPC 837#undef IS_NMSTRT_CHAR 838#undef IS_NMSTRT_CHAR_MINBPC 839#undef IS_INVALID_CHAR 840 841#endif /* not XML_MIN_SIZE */ 842 843#ifdef XML_NS 844 845static const struct normal_encoding big2_encoding_ns = { 846 { VTABLE, 2, 0, 847#if BYTEORDER == 4321 848 1 849#else 850 0 851#endif 852 }, 853 { 854#include "asciitab.h" 855#include "latin1tab.h" 856 }, 857 STANDARD_VTABLE(big2_) 858}; 859 860#endif 861 862static const struct normal_encoding big2_encoding = { 863 { VTABLE, 2, 0, 864#if BYTEORDER == 4321 865 1 866#else 867 0 868#endif 869 }, 870 { 871#define BT_COLON BT_NMSTRT 872#include "asciitab.h" 873#undef BT_COLON 874#include "latin1tab.h" 875 }, 876 STANDARD_VTABLE(big2_) 877}; 878 879#if BYTEORDER != 1234 880 881#ifdef XML_NS 882 883static const struct normal_encoding internal_big2_encoding_ns = { 884 { VTABLE, 2, 0, 1 }, 885 { 886#include "iasciitab.h" 887#include "latin1tab.h" 888 }, 889 STANDARD_VTABLE(big2_) 890}; 891 892#endif 893 894static const struct normal_encoding internal_big2_encoding = { 895 { VTABLE, 2, 0, 1 }, 896 { 897#define BT_COLON BT_NMSTRT 898#include "iasciitab.h" 899#undef BT_COLON 900#include "latin1tab.h" 901 }, 902 STANDARD_VTABLE(big2_) 903}; 904 905#endif 906 907#undef PREFIX 908 909static int FASTCALL 910streqci(const char *s1, const char *s2) 911{ 912 for (;;) { 913 char c1 = *s1++; 914 char c2 = *s2++; 915 if (ASCII_a <= c1 && c1 <= ASCII_z) 916 c1 += ASCII_A - ASCII_a; 917 if (ASCII_a <= c2 && c2 <= ASCII_z) 918 c2 += ASCII_A - ASCII_a; 919 if (c1 != c2) 920 return 0; 921 if (!c1) 922 break; 923 } 924 return 1; 925} 926 927static void PTRCALL 928initUpdatePosition(const ENCODING *enc, const char *ptr, 929 const char *end, POSITION *pos) 930{ 931 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 932} 933 934static int 935toAscii(const ENCODING *enc, const char *ptr, const char *end) 936{ 937 char buf[1]; 938 char *p = buf; 939 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 940 if (p == buf) 941 return -1; 942 else 943 return buf[0]; 944} 945 946static int FASTCALL 947isSpace(int c) 948{ 949 switch (c) { 950 case 0x20: 951 case 0xD: 952 case 0xA: 953 case 0x9: 954 return 1; 955 } 956 return 0; 957} 958 959/* Return 1 if there's just optional white space or there's an S 960 followed by name=val. 961*/ 962static int 963parsePseudoAttribute(const ENCODING *enc, 964 const char *ptr, 965 const char *end, 966 const char **namePtr, 967 const char **nameEndPtr, 968 const char **valPtr, 969 const char **nextTokPtr) 970{ 971 int c; 972 char open; 973 if (ptr == end) { 974 *namePtr = NULL; 975 return 1; 976 } 977 if (!isSpace(toAscii(enc, ptr, end))) { 978 *nextTokPtr = ptr; 979 return 0; 980 } 981 do { 982 ptr += enc->minBytesPerChar; 983 } while (isSpace(toAscii(enc, ptr, end))); 984 if (ptr == end) { 985 *namePtr = NULL; 986 return 1; 987 } 988 *namePtr = ptr; 989 for (;;) { 990 c = toAscii(enc, ptr, end); 991 if (c == -1) { 992 *nextTokPtr = ptr; 993 return 0; 994 } 995 if (c == ASCII_EQUALS) { 996 *nameEndPtr = ptr; 997 break; 998 } 999 if (isSpace(c)) { 1000 *nameEndPtr = ptr; 1001 do { 1002 ptr += enc->minBytesPerChar; 1003 } while (isSpace(c = toAscii(enc, ptr, end))); 1004 if (c != ASCII_EQUALS) { 1005 *nextTokPtr = ptr; 1006 return 0; 1007 } 1008 break; 1009 } 1010 ptr += enc->minBytesPerChar; 1011 } 1012 if (ptr == *namePtr) { 1013 *nextTokPtr = ptr; 1014 return 0; 1015 } 1016 ptr += enc->minBytesPerChar; 1017 c = toAscii(enc, ptr, end); 1018 while (isSpace(c)) { 1019 ptr += enc->minBytesPerChar; 1020 c = toAscii(enc, ptr, end); 1021 } 1022 if (c != ASCII_QUOT && c != ASCII_APOS) { 1023 *nextTokPtr = ptr; 1024 return 0; 1025 } 1026 open = (char)c; 1027 ptr += enc->minBytesPerChar; 1028 *valPtr = ptr; 1029 for (;; ptr += enc->minBytesPerChar) { 1030 c = toAscii(enc, ptr, end); 1031 if (c == open) 1032 break; 1033 if (!(ASCII_a <= c && c <= ASCII_z) 1034 && !(ASCII_A <= c && c <= ASCII_Z) 1035 && !(ASCII_0 <= c && c <= ASCII_9) 1036 && c != ASCII_PERIOD 1037 && c != ASCII_MINUS 1038 && c != ASCII_UNDERSCORE) { 1039 *nextTokPtr = ptr; 1040 return 0; 1041 } 1042 } 1043 *nextTokPtr = ptr + enc->minBytesPerChar; 1044 return 1; 1045} 1046 1047static const char KW_version[] = { 1048 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0' 1049}; 1050 1051static const char KW_encoding[] = { 1052 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0' 1053}; 1054 1055static const char KW_standalone[] = { 1056 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, 1057 ASCII_n, ASCII_e, '\0' 1058}; 1059 1060static const char KW_yes[] = { 1061 ASCII_y, ASCII_e, ASCII_s, '\0' 1062}; 1063 1064static const char KW_no[] = { 1065 ASCII_n, ASCII_o, '\0' 1066}; 1067 1068static int 1069doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, 1070 const char *, 1071 const char *), 1072 int isGeneralTextEntity, 1073 const ENCODING *enc, 1074 const char *ptr, 1075 const char *end, 1076 const char **badPtr, 1077 const char **versionPtr, 1078 const char **versionEndPtr, 1079 const char **encodingName, 1080 const ENCODING **encoding, 1081 int *standalone) 1082{ 1083 const char *val = NULL; 1084 const char *name = NULL; 1085 const char *nameEnd = NULL; 1086 ptr += 5 * enc->minBytesPerChar; 1087 end -= 2 * enc->minBytesPerChar; 1088 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1089 || !name) { 1090 *badPtr = ptr; 1091 return 0; 1092 } 1093 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1094 if (!isGeneralTextEntity) { 1095 *badPtr = name; 1096 return 0; 1097 } 1098 } 1099 else { 1100 if (versionPtr) 1101 *versionPtr = val; 1102 if (versionEndPtr) 1103 *versionEndPtr = ptr; 1104 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1105 *badPtr = ptr; 1106 return 0; 1107 } 1108 if (!name) { 1109 if (isGeneralTextEntity) { 1110 /* a TextDecl must have an EncodingDecl */ 1111 *badPtr = ptr; 1112 return 0; 1113 } 1114 return 1; 1115 } 1116 } 1117 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1118 int c = toAscii(enc, val, end); 1119 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) { 1120 *badPtr = val; 1121 return 0; 1122 } 1123 if (encodingName) 1124 *encodingName = val; 1125 if (encoding) 1126 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1127 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1128 *badPtr = ptr; 1129 return 0; 1130 } 1131 if (!name) 1132 return 1; 1133 } 1134 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1135 || isGeneralTextEntity) { 1136 *badPtr = name; 1137 return 0; 1138 } 1139 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1140 if (standalone) 1141 *standalone = 1; 1142 } 1143 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1144 if (standalone) 1145 *standalone = 0; 1146 } 1147 else { 1148 *badPtr = val; 1149 return 0; 1150 } 1151 while (isSpace(toAscii(enc, ptr, end))) 1152 ptr += enc->minBytesPerChar; 1153 if (ptr != end) { 1154 *badPtr = ptr; 1155 return 0; 1156 } 1157 return 1; 1158} 1159 1160static int FASTCALL 1161checkCharRefNumber(int result) 1162{ 1163 switch (result >> 8) { 1164 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 1165 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 1166 return -1; 1167 case 0: 1168 if (latin1_encoding.type[result] == BT_NONXML) 1169 return -1; 1170 break; 1171 case 0xFF: 1172 if (result == 0xFFFE || result == 0xFFFF) 1173 return -1; 1174 break; 1175 } 1176 return result; 1177} 1178 1179int FASTCALL 1180XmlUtf8Encode(int c, char *buf) 1181{ 1182 enum { 1183 /* minN is minimum legal resulting value for N byte sequence */ 1184 min2 = 0x80, 1185 min3 = 0x800, 1186 min4 = 0x10000 1187 }; 1188 1189 if (c < 0) 1190 return 0; 1191 if (c < min2) { 1192 buf[0] = (char)(c | UTF8_cval1); 1193 return 1; 1194 } 1195 if (c < min3) { 1196 buf[0] = (char)((c >> 6) | UTF8_cval2); 1197 buf[1] = (char)((c & 0x3f) | 0x80); 1198 return 2; 1199 } 1200 if (c < min4) { 1201 buf[0] = (char)((c >> 12) | UTF8_cval3); 1202 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1203 buf[2] = (char)((c & 0x3f) | 0x80); 1204 return 3; 1205 } 1206 if (c < 0x110000) { 1207 buf[0] = (char)((c >> 18) | UTF8_cval4); 1208 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1209 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1210 buf[3] = (char)((c & 0x3f) | 0x80); 1211 return 4; 1212 } 1213 return 0; 1214} 1215 1216int FASTCALL 1217XmlUtf16Encode(int charNum, unsigned short *buf) 1218{ 1219 if (charNum < 0) 1220 return 0; 1221 if (charNum < 0x10000) { 1222 buf[0] = (unsigned short)charNum; 1223 return 1; 1224 } 1225 if (charNum < 0x110000) { 1226 charNum -= 0x10000; 1227 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1228 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1229 return 2; 1230 } 1231 return 0; 1232} 1233 1234struct unknown_encoding { 1235 struct normal_encoding normal; 1236 int (*convert)(void *userData, const char *p); 1237 void *userData; 1238 unsigned short utf16[256]; 1239 char utf8[256][4]; 1240}; 1241 1242#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc)) 1243 1244int 1245XmlSizeOfUnknownEncoding(void) 1246{ 1247 return sizeof(struct unknown_encoding); 1248} 1249 1250static int PTRFASTCALL 1251unknown_isName(const ENCODING *enc, const char *p) 1252{ 1253 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1254 int c = uenc->convert(uenc->userData, p); 1255 if (c & ~0xFFFF) 1256 return 0; 1257 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1258} 1259 1260static int PTRFASTCALL 1261unknown_isNmstrt(const ENCODING *enc, const char *p) 1262{ 1263 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1264 int c = uenc->convert(uenc->userData, p); 1265 if (c & ~0xFFFF) 1266 return 0; 1267 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1268} 1269 1270static int PTRFASTCALL 1271unknown_isInvalid(const ENCODING *enc, const char *p) 1272{ 1273 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1274 int c = uenc->convert(uenc->userData, p); 1275 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1276} 1277 1278static void PTRCALL 1279unknown_toUtf8(const ENCODING *enc, 1280 const char **fromP, const char *fromLim, 1281 char **toP, const char *toLim) 1282{ 1283 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1284 char buf[XML_UTF8_ENCODE_MAX]; 1285 for (;;) { 1286 const char *utf8; 1287 int n; 1288 if (*fromP == fromLim) 1289 break; 1290 utf8 = uenc->utf8[(unsigned char)**fromP]; 1291 n = *utf8++; 1292 if (n == 0) { 1293 int c = uenc->convert(uenc->userData, *fromP); 1294 n = XmlUtf8Encode(c, buf); 1295 if (n > toLim - *toP) 1296 break; 1297 utf8 = buf; 1298 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1299 - (BT_LEAD2 - 2)); 1300 } 1301 else { 1302 if (n > toLim - *toP) 1303 break; 1304 (*fromP)++; 1305 } 1306 do { 1307 *(*toP)++ = *utf8++; 1308 } while (--n != 0); 1309 } 1310} 1311 1312static void PTRCALL 1313unknown_toUtf16(const ENCODING *enc, 1314 const char **fromP, const char *fromLim, 1315 unsigned short **toP, const unsigned short *toLim) 1316{ 1317 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1318 while (*fromP != fromLim && *toP != toLim) { 1319 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1320 if (c == 0) { 1321 c = (unsigned short) 1322 uenc->convert(uenc->userData, *fromP); 1323 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1324 - (BT_LEAD2 - 2)); 1325 } 1326 else 1327 (*fromP)++; 1328 *(*toP)++ = c; 1329 } 1330} 1331 1332ENCODING * 1333XmlInitUnknownEncoding(void *mem, 1334 int *table, 1335 CONVERTER convert, 1336 void *userData) 1337{ 1338 int i; 1339 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1340 for (i = 0; i < (int)sizeof(struct normal_encoding); i++) 1341 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; 1342 for (i = 0; i < 128; i++) 1343 if (latin1_encoding.type[i] != BT_OTHER 1344 && latin1_encoding.type[i] != BT_NONXML 1345 && table[i] != i) 1346 return 0; 1347 for (i = 0; i < 256; i++) { 1348 int c = table[i]; 1349 if (c == -1) { 1350 e->normal.type[i] = BT_MALFORM; 1351 /* This shouldn't really get used. */ 1352 e->utf16[i] = 0xFFFF; 1353 e->utf8[i][0] = 1; 1354 e->utf8[i][1] = 0; 1355 } 1356 else if (c < 0) { 1357 if (c < -4) 1358 return 0; 1359 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1360 e->utf8[i][0] = 0; 1361 e->utf16[i] = 0; 1362 } 1363 else if (c < 0x80) { 1364 if (latin1_encoding.type[c] != BT_OTHER 1365 && latin1_encoding.type[c] != BT_NONXML 1366 && c != i) 1367 return 0; 1368 e->normal.type[i] = latin1_encoding.type[c]; 1369 e->utf8[i][0] = 1; 1370 e->utf8[i][1] = (char)c; 1371 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1372 } 1373 else if (checkCharRefNumber(c) < 0) { 1374 e->normal.type[i] = BT_NONXML; 1375 /* This shouldn't really get used. */ 1376 e->utf16[i] = 0xFFFF; 1377 e->utf8[i][0] = 1; 1378 e->utf8[i][1] = 0; 1379 } 1380 else { 1381 if (c > 0xFFFF) 1382 return 0; 1383 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1384 e->normal.type[i] = BT_NMSTRT; 1385 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1386 e->normal.type[i] = BT_NAME; 1387 else 1388 e->normal.type[i] = BT_OTHER; 1389 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1390 e->utf16[i] = (unsigned short)c; 1391 } 1392 } 1393 e->userData = userData; 1394 e->convert = convert; 1395 if (convert) { 1396 e->normal.isName2 = unknown_isName; 1397 e->normal.isName3 = unknown_isName; 1398 e->normal.isName4 = unknown_isName; 1399 e->normal.isNmstrt2 = unknown_isNmstrt; 1400 e->normal.isNmstrt3 = unknown_isNmstrt; 1401 e->normal.isNmstrt4 = unknown_isNmstrt; 1402 e->normal.isInvalid2 = unknown_isInvalid; 1403 e->normal.isInvalid3 = unknown_isInvalid; 1404 e->normal.isInvalid4 = unknown_isInvalid; 1405 } 1406 e->normal.enc.utf8Convert = unknown_toUtf8; 1407 e->normal.enc.utf16Convert = unknown_toUtf16; 1408 return &(e->normal.enc); 1409} 1410 1411/* If this enumeration is changed, getEncodingIndex and encodings 1412must also be changed. */ 1413enum { 1414 UNKNOWN_ENC = -1, 1415 ISO_8859_1_ENC = 0, 1416 US_ASCII_ENC, 1417 UTF_8_ENC, 1418 UTF_16_ENC, 1419 UTF_16BE_ENC, 1420 UTF_16LE_ENC, 1421 /* must match encodingNames up to here */ 1422 NO_ENC 1423}; 1424 1425static const char KW_ISO_8859_1[] = { 1426 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, 1427 ASCII_MINUS, ASCII_1, '\0' 1428}; 1429static const char KW_US_ASCII[] = { 1430 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, 1431 '\0' 1432}; 1433static const char KW_UTF_8[] = { 1434 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0' 1435}; 1436static const char KW_UTF_16[] = { 1437 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0' 1438}; 1439static const char KW_UTF_16BE[] = { 1440 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, 1441 '\0' 1442}; 1443static const char KW_UTF_16LE[] = { 1444 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, 1445 '\0' 1446}; 1447 1448static int FASTCALL 1449getEncodingIndex(const char *name) 1450{ 1451 static const char *encodingNames[] = { 1452 KW_ISO_8859_1, 1453 KW_US_ASCII, 1454 KW_UTF_8, 1455 KW_UTF_16, 1456 KW_UTF_16BE, 1457 KW_UTF_16LE, 1458 }; 1459 int i; 1460 if (name == NULL) 1461 return NO_ENC; 1462 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) 1463 if (streqci(name, encodingNames[i])) 1464 return i; 1465 return UNKNOWN_ENC; 1466} 1467 1468/* For binary compatibility, we store the index of the encoding 1469 specified at initialization in the isUtf16 member. 1470*/ 1471 1472#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1473#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1474 1475/* This is what detects the encoding. encodingTable maps from 1476 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1477 the external (protocol) specified encoding; state is 1478 XML_CONTENT_STATE if we're parsing an external text entity, and 1479 XML_PROLOG_STATE otherwise. 1480*/ 1481 1482 1483static int 1484initScan(const ENCODING **encodingTable, 1485 const INIT_ENCODING *enc, 1486 int state, 1487 const char *ptr, 1488 const char *end, 1489 const char **nextTokPtr) 1490{ 1491 const ENCODING **encPtr; 1492 1493 if (ptr == end) 1494 return XML_TOK_NONE; 1495 encPtr = enc->encPtr; 1496 if (ptr + 1 == end) { 1497 /* only a single byte available for auto-detection */ 1498#ifndef XML_DTD /* FIXME */ 1499 /* a well-formed document entity must have more than one byte */ 1500 if (state != XML_CONTENT_STATE) 1501 return XML_TOK_PARTIAL; 1502#endif 1503 /* so we're parsing an external text entity... */ 1504 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1505 switch (INIT_ENC_INDEX(enc)) { 1506 case UTF_16_ENC: 1507 case UTF_16LE_ENC: 1508 case UTF_16BE_ENC: 1509 return XML_TOK_PARTIAL; 1510 } 1511 switch ((unsigned char)*ptr) { 1512 case 0xFE: 1513 case 0xFF: 1514 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1515 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1516 && state == XML_CONTENT_STATE) 1517 break; 1518 /* fall through */ 1519 case 0x00: 1520 case 0x3C: 1521 return XML_TOK_PARTIAL; 1522 } 1523 } 1524 else { 1525 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1526 case 0xFEFF: 1527 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1528 && state == XML_CONTENT_STATE) 1529 break; 1530 *nextTokPtr = ptr + 2; 1531 *encPtr = encodingTable[UTF_16BE_ENC]; 1532 return XML_TOK_BOM; 1533 /* 00 3C is handled in the default case */ 1534 case 0x3C00: 1535 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1536 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1537 && state == XML_CONTENT_STATE) 1538 break; 1539 *encPtr = encodingTable[UTF_16LE_ENC]; 1540 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1541 case 0xFFFE: 1542 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1543 && state == XML_CONTENT_STATE) 1544 break; 1545 *nextTokPtr = ptr + 2; 1546 *encPtr = encodingTable[UTF_16LE_ENC]; 1547 return XML_TOK_BOM; 1548 case 0xEFBB: 1549 /* Maybe a UTF-8 BOM (EF BB BF) */ 1550 /* If there's an explicitly specified (external) encoding 1551 of ISO-8859-1 or some flavour of UTF-16 1552 and this is an external text entity, 1553 don't look for the BOM, 1554 because it might be a legal data. 1555 */ 1556 if (state == XML_CONTENT_STATE) { 1557 int e = INIT_ENC_INDEX(enc); 1558 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC 1559 || e == UTF_16LE_ENC || e == UTF_16_ENC) 1560 break; 1561 } 1562 if (ptr + 2 == end) 1563 return XML_TOK_PARTIAL; 1564 if ((unsigned char)ptr[2] == 0xBF) { 1565 *nextTokPtr = ptr + 3; 1566 *encPtr = encodingTable[UTF_8_ENC]; 1567 return XML_TOK_BOM; 1568 } 1569 break; 1570 default: 1571 if (ptr[0] == '\0') { 1572 /* 0 isn't a legal data character. Furthermore a document 1573 entity can only start with ASCII characters. So the only 1574 way this can fail to be big-endian UTF-16 if it it's an 1575 external parsed general entity that's labelled as 1576 UTF-16LE. 1577 */ 1578 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1579 break; 1580 *encPtr = encodingTable[UTF_16BE_ENC]; 1581 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1582 } 1583 else if (ptr[1] == '\0') { 1584 /* We could recover here in the case: 1585 - parsing an external entity 1586 - second byte is 0 1587 - no externally specified encoding 1588 - no encoding declaration 1589 by assuming UTF-16LE. But we don't, because this would mean when 1590 presented just with a single byte, we couldn't reliably determine 1591 whether we needed further bytes. 1592 */ 1593 if (state == XML_CONTENT_STATE) 1594 break; 1595 *encPtr = encodingTable[UTF_16LE_ENC]; 1596 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1597 } 1598 break; 1599 } 1600 } 1601 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1602 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1603} 1604 1605 1606#define NS(x) x 1607#define ns(x) x 1608#include "xmltok_ns.c" 1609#undef NS 1610#undef ns 1611 1612#ifdef XML_NS 1613 1614#define NS(x) x ## NS 1615#define ns(x) x ## _ns 1616 1617#include "xmltok_ns.c" 1618 1619#undef NS 1620#undef ns 1621 1622ENCODING * 1623XmlInitUnknownEncodingNS(void *mem, 1624 int *table, 1625 CONVERTER convert, 1626 void *userData) 1627{ 1628 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1629 if (enc) 1630 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1631 return enc; 1632} 1633 1634#endif /* XML_NS */ 1635