1/* 2The contents of this file are subject to the Mozilla Public License 3Version 1.0 (the "License"); you may not use this file except in 4compliance with the License. You may obtain a copy of the License at 5http://www.mozilla.org/MPL/ 6 7Software distributed under the License is distributed on an "AS IS" 8basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 9License for the specific language governing rights and limitations 10under the License. 11 12The Original Code is expat. 13 14The Initial Developer of the Original Code is James Clark. 15Portions created by James Clark are Copyright (C) 1998 16James Clark. All Rights Reserved. 17 18Contributor(s): 19*/ 20 21#include <tcl.h> /*for size_t */ 22#include "xmldef.h" 23#include "xmltok.h" 24#include "nametab.h" 25 26#define VTABLE1 \ 27 { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \ 28 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ 29 PREFIX(sameName), \ 30 PREFIX(nameMatchesAscii), \ 31 PREFIX(nameLength), \ 32 PREFIX(skipS), \ 33 PREFIX(getAtts), \ 34 PREFIX(charRefNumber), \ 35 PREFIX(predefinedEntityName), \ 36 PREFIX(updatePosition), \ 37 PREFIX(isPublicId) 38 39#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 40 41#define UCS2_GET_NAMING(pages, hi, lo) \ 42 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) 43 44/* A 2 byte UTF-8 representation splits the characters 11 bits 45between the bottom 5 and 6 bits of the bytes. 46We need 8 bits to index into pages, 3 bits to add to that index and 475 bits to generate the mask. */ 48#define UTF8_GET_NAMING2(pages, byte) \ 49 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 50 + ((((byte)[0]) & 3) << 1) \ 51 + ((((byte)[1]) >> 5) & 1)] \ 52 & (1 << (((byte)[1]) & 0x1F))) 53 54/* A 3 byte UTF-8 representation splits the characters 16 bits 55between the bottom 4, 6 and 6 bits of the bytes. 56We need 8 bits to index into pages, 3 bits to add to that index and 575 bits to generate the mask. */ 58#define UTF8_GET_NAMING3(pages, byte) \ 59 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ 60 + ((((byte)[1]) >> 2) & 0xF)] \ 61 << 3) \ 62 + ((((byte)[1]) & 3) << 1) \ 63 + ((((byte)[2]) >> 5) & 1)] \ 64 & (1 << (((byte)[2]) & 0x1F))) 65 66#define UTF8_GET_NAMING(pages, p, n) \ 67 ((n) == 2 \ 68 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ 69 : ((n) == 3 \ 70 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ 71 : 0)) 72 73#define UTF8_INVALID3(p) \ 74 ((*p) == 0xED \ 75 ? (((p)[1] & 0x20) != 0) \ 76 : ((*p) == 0xEF \ 77 ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \ 78 : 0)) 79 80#define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0) 81 82static 83int isNever(const ENCODING *enc, const char *p) 84{ 85 return 0; 86} 87 88static 89int utf8_isName2(const ENCODING *enc, const char *p) 90{ 91 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); 92} 93 94static 95int utf8_isName3(const ENCODING *enc, const char *p) 96{ 97 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); 98} 99 100#define utf8_isName4 isNever 101 102static 103int utf8_isNmstrt2(const ENCODING *enc, const char *p) 104{ 105 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); 106} 107 108static 109int utf8_isNmstrt3(const ENCODING *enc, const char *p) 110{ 111 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); 112} 113 114#define utf8_isNmstrt4 isNever 115 116#define utf8_isInvalid2 isNever 117 118static 119int utf8_isInvalid3(const ENCODING *enc, const char *p) 120{ 121 return UTF8_INVALID3((const unsigned char *)p); 122} 123 124static 125int utf8_isInvalid4(const ENCODING *enc, const char *p) 126{ 127 return UTF8_INVALID4((const unsigned char *)p); 128} 129 130struct normal_encoding { 131 ENCODING enc; 132 unsigned char type[256]; 133 int (*isName2)(const ENCODING *, const char *); 134 int (*isName3)(const ENCODING *, const char *); 135 int (*isName4)(const ENCODING *, const char *); 136 int (*isNmstrt2)(const ENCODING *, const char *); 137 int (*isNmstrt3)(const ENCODING *, const char *); 138 int (*isNmstrt4)(const ENCODING *, const char *); 139 int (*isInvalid2)(const ENCODING *, const char *); 140 int (*isInvalid3)(const ENCODING *, const char *); 141 int (*isInvalid4)(const ENCODING *, const char *); 142}; 143 144#define NORMAL_VTABLE(E) \ 145 E ## isName2, \ 146 E ## isName3, \ 147 E ## isName4, \ 148 E ## isNmstrt2, \ 149 E ## isNmstrt3, \ 150 E ## isNmstrt4, \ 151 E ## isInvalid2, \ 152 E ## isInvalid3, \ 153 E ## isInvalid4 154 155static int checkCharRefNumber(int); 156 157#include "xmltok_impl.h" 158 159/* minimum bytes per character */ 160#define MINBPC 1 161#define BYTE_TYPE(enc, p) \ 162 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 163#define BYTE_TO_ASCII(enc, p) (*p) 164 165#define IS_NAME_CHAR(enc, p, n) \ 166 (((const struct normal_encoding *)(enc))->isName ## n(enc, p)) 167#define IS_NMSTRT_CHAR(enc, p, n) \ 168 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p)) 169#define IS_INVALID_CHAR(enc, p, n) \ 170 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p)) 171 172#define IS_NAME_CHAR_MINBPC(enc, p) (0) 173#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 174 175/* c is an ASCII character */ 176#define CHAR_MATCHES(enc, p, c) (*(p) == c) 177 178#define PREFIX(ident) normal_ ## ident 179#include "xmltok_impl.c" 180 181#undef MINBPC 182#undef BYTE_TYPE 183#undef BYTE_TO_ASCII 184#undef CHAR_MATCHES 185#undef IS_NAME_CHAR 186#undef IS_NAME_CHAR_MINBPC 187#undef IS_NMSTRT_CHAR 188#undef IS_NMSTRT_CHAR_MINBPC 189#undef IS_INVALID_CHAR 190 191enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 192 UTF8_cval1 = 0x00, 193 UTF8_cval2 = 0xc0, 194 UTF8_cval3 = 0xe0, 195 UTF8_cval4 = 0xf0 196}; 197 198static 199void utf8_toUtf8(const ENCODING *enc, 200 const char **fromP, const char *fromLim, 201 char **toP, const char *toLim) 202{ 203 char *to; 204 const char *from; 205 if (fromLim - *fromP > toLim - *toP) { 206 /* Avoid copying partial characters. */ 207 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) 208 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) 209 break; 210 } 211 for (to = *toP, from = *fromP; from != fromLim; from++, to++) 212 *to = *from; 213 *fromP = from; 214 *toP = to; 215} 216 217static 218void utf8_toUtf16(const ENCODING *enc, 219 const char **fromP, const char *fromLim, 220 unsigned short **toP, const unsigned short *toLim) 221{ 222 unsigned short *to = *toP; 223 const char *from = *fromP; 224 while (from != fromLim && to != toLim) { 225 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { 226 case BT_LEAD2: 227 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f); 228 from += 2; 229 break; 230 case BT_LEAD3: 231 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f); 232 from += 3; 233 break; 234 case BT_LEAD4: 235 { 236 unsigned long n; 237 if (to + 1 == toLim) 238 break; 239 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 240 n -= 0x10000; 241 to[0] = (unsigned short)((n >> 10) | 0xD800); 242 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 243 to += 2; 244 from += 4; 245 } 246 break; 247 default: 248 *to++ = *from++; 249 break; 250 } 251 } 252 *fromP = from; 253 *toP = to; 254} 255 256static const struct normal_encoding utf8_encoding = { 257 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 258 { 259#include "asciitab.h" 260#include "utf8tab.h" 261 }, 262 NORMAL_VTABLE(utf8_) 263}; 264 265static const struct normal_encoding internal_utf8_encoding = { 266 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 267 { 268#include "iasciitab.h" 269#include "utf8tab.h" 270 }, 271 NORMAL_VTABLE(utf8_) 272}; 273 274static 275void latin1_toUtf8(const ENCODING *enc, 276 const char **fromP, const char *fromLim, 277 char **toP, const char *toLim) 278{ 279 for (;;) { 280 unsigned char c; 281 if (*fromP == fromLim) 282 break; 283 c = (unsigned char)**fromP; 284 if (c & 0x80) { 285 if (toLim - *toP < 2) 286 break; 287 *(*toP)++ = ((c >> 6) | UTF8_cval2); 288 *(*toP)++ = ((c & 0x3f) | 0x80); 289 (*fromP)++; 290 } 291 else { 292 if (*toP == toLim) 293 break; 294 *(*toP)++ = *(*fromP)++; 295 } 296 } 297} 298 299static 300void latin1_toUtf16(const ENCODING *enc, 301 const char **fromP, const char *fromLim, 302 unsigned short **toP, const unsigned short *toLim) 303{ 304 while (*fromP != fromLim && *toP != toLim) 305 *(*toP)++ = (unsigned char)*(*fromP)++; 306} 307 308static const struct normal_encoding latin1_encoding = { 309 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 310 { 311#include "asciitab.h" 312#include "latin1tab.h" 313 } 314}; 315 316static 317void ascii_toUtf8(const ENCODING *enc, 318 const char **fromP, const char *fromLim, 319 char **toP, const char *toLim) 320{ 321 while (*fromP != fromLim && *toP != toLim) 322 *(*toP)++ = *(*fromP)++; 323} 324 325static const struct normal_encoding ascii_encoding = { 326 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 327 { 328#include "asciitab.h" 329/* BT_NONXML == 0 */ 330 } 331}; 332 333#undef PREFIX 334 335static int unicode_byte_type(char hi, char lo) 336{ 337 switch ((unsigned char)hi) { 338 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 339 return BT_LEAD4; 340 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 341 return BT_TRAIL; 342 case 0xFF: 343 switch ((unsigned char)lo) { 344 case 0xFF: 345 case 0xFE: 346 return BT_NONXML; 347 } 348 break; 349 } 350 return BT_NONASCII; 351} 352 353#define DEFINE_UTF16_TO_UTF8 \ 354static \ 355void PREFIX(toUtf8)(const ENCODING *enc, \ 356 const char **fromP, const char *fromLim, \ 357 char **toP, const char *toLim) \ 358{ \ 359 const char *from; \ 360 for (from = *fromP; from != fromLim; from += 2) { \ 361 int plane; \ 362 unsigned char lo2; \ 363 unsigned char lo = GET_LO(from); \ 364 unsigned char hi = GET_HI(from); \ 365 switch (hi) { \ 366 case 0: \ 367 if (lo < 0x80) { \ 368 if (*toP == toLim) { \ 369 *fromP = from; \ 370 return; \ 371 } \ 372 *(*toP)++ = lo; \ 373 break; \ 374 } \ 375 /* fall through */ \ 376 case 0x1: case 0x2: case 0x3: \ 377 case 0x4: case 0x5: case 0x6: case 0x7: \ 378 if (toLim - *toP < 2) { \ 379 *fromP = from; \ 380 return; \ 381 } \ 382 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 383 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 384 break; \ 385 default: \ 386 if (toLim - *toP < 3) { \ 387 *fromP = from; \ 388 return; \ 389 } \ 390 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 391 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 392 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 393 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 394 break; \ 395 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ 396 if (toLim - *toP < 4) { \ 397 *fromP = from; \ 398 return; \ 399 } \ 400 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 401 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ 402 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 403 from += 2; \ 404 lo2 = GET_LO(from); \ 405 *(*toP)++ = (((lo & 0x3) << 4) \ 406 | ((GET_HI(from) & 0x3) << 2) \ 407 | (lo2 >> 6) \ 408 | 0x80); \ 409 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 410 break; \ 411 } \ 412 } \ 413 *fromP = from; \ 414} 415 416#define DEFINE_UTF16_TO_UTF16 \ 417static \ 418void PREFIX(toUtf16)(const ENCODING *enc, \ 419 const char **fromP, const char *fromLim, \ 420 unsigned short **toP, const unsigned short *toLim) \ 421{ \ 422 /* Avoid copying first half only of surrogate */ \ 423 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 424 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ 425 fromLim -= 2; \ 426 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ 427 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 428} 429 430#define PREFIX(ident) little2_ ## ident 431#define MINBPC 2 432#define BYTE_TYPE(enc, p) \ 433 ((p)[1] == 0 \ 434 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 435 : unicode_byte_type((p)[1], (p)[0])) 436#define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) 437#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) 438#define IS_NAME_CHAR(enc, p, n) (0) 439#define IS_NAME_CHAR_MINBPC(enc, p) \ 440 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 441#define IS_NMSTRT_CHAR(enc, p, n) (0) 442#define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 443 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 444 445#include "xmltok_impl.c" 446 447#define SET2(ptr, ch) \ 448 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) 449#define GET_LO(ptr) ((unsigned char)(ptr)[0]) 450#define GET_HI(ptr) ((unsigned char)(ptr)[1]) 451 452DEFINE_UTF16_TO_UTF8 453DEFINE_UTF16_TO_UTF16 454 455#undef SET2 456#undef GET_LO 457#undef GET_HI 458#undef MINBPC 459#undef BYTE_TYPE 460#undef BYTE_TO_ASCII 461#undef CHAR_MATCHES 462#undef IS_NAME_CHAR 463#undef IS_NAME_CHAR_MINBPC 464#undef IS_NMSTRT_CHAR 465#undef IS_NMSTRT_CHAR_MINBPC 466#undef IS_INVALID_CHAR 467 468static const struct normal_encoding little2_encoding = { 469 { VTABLE, 2, 0, 470#if BYTE_ORDER == 12 471 1 472#else 473 0 474#endif 475 }, 476#include "asciitab.h" 477#include "latin1tab.h" 478}; 479 480#if BYTE_ORDER != 21 481 482static const struct normal_encoding internal_little2_encoding = { 483 { VTABLE, 2, 0, 1 }, 484#include "iasciitab.h" 485#include "latin1tab.h" 486}; 487 488#endif 489 490#undef PREFIX 491 492#define PREFIX(ident) big2_ ## ident 493#define MINBPC 2 494/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 495#define BYTE_TYPE(enc, p) \ 496 ((p)[0] == 0 \ 497 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ 498 : unicode_byte_type((p)[0], (p)[1])) 499#define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) 500#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) 501#define IS_NAME_CHAR(enc, p, n) 0 502#define IS_NAME_CHAR_MINBPC(enc, p) \ 503 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 504#define IS_NMSTRT_CHAR(enc, p, n) (0) 505#define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 506 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 507 508#include "xmltok_impl.c" 509 510#define SET2(ptr, ch) \ 511 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) 512#define GET_LO(ptr) ((unsigned char)(ptr)[1]) 513#define GET_HI(ptr) ((unsigned char)(ptr)[0]) 514 515DEFINE_UTF16_TO_UTF8 516DEFINE_UTF16_TO_UTF16 517 518#undef SET2 519#undef GET_LO 520#undef GET_HI 521#undef MINBPC 522#undef BYTE_TYPE 523#undef BYTE_TO_ASCII 524#undef CHAR_MATCHES 525#undef IS_NAME_CHAR 526#undef IS_NAME_CHAR_MINBPC 527#undef IS_NMSTRT_CHAR 528#undef IS_NMSTRT_CHAR_MINBPC 529#undef IS_INVALID_CHAR 530 531static const struct normal_encoding big2_encoding = { 532 { VTABLE, 2, 0, 533#if BYTE_ORDER == 21 534 1 535#else 536 0 537#endif 538 }, 539#include "asciitab.h" 540#include "latin1tab.h" 541}; 542 543#if BYTE_ORDER != 12 544 545static const struct normal_encoding internal_big2_encoding = { 546 { VTABLE, 2, 0, 1 }, 547#include "iasciitab.h" 548#include "latin1tab.h" 549}; 550 551#endif 552 553#undef PREFIX 554 555static 556int streqci(const char *s1, const char *s2) 557{ 558 for (;;) { 559 char c1 = *s1++; 560 char c2 = *s2++; 561 if ('a' <= c1 && c1 <= 'z') 562 c1 += 'A' - 'a'; 563 if ('a' <= c2 && c2 <= 'z') 564 c2 += 'A' - 'a'; 565 if (c1 != c2) 566 return 0; 567 if (!c1) 568 break; 569 } 570 return 1; 571} 572 573static 574int initScan(const ENCODING *enc, int state, const char *ptr, const char *end, 575 const char **nextTokPtr) 576{ 577 const ENCODING **encPtr; 578 579 if (ptr == end) 580 return XML_TOK_NONE; 581 encPtr = ((const INIT_ENCODING *)enc)->encPtr; 582 if (ptr + 1 == end) { 583 switch ((unsigned char)*ptr) { 584 case 0xFE: 585 case 0xFF: 586 case 0x00: 587 case 0x3C: 588 return XML_TOK_PARTIAL; 589 } 590 } 591 else { 592 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 593 case 0x003C: 594 *encPtr = &big2_encoding.enc; 595 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 596 case 0xFEFF: 597 *nextTokPtr = ptr + 2; 598 *encPtr = &big2_encoding.enc; 599 return XML_TOK_BOM; 600 case 0x3C00: 601 *encPtr = &little2_encoding.enc; 602 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 603 case 0xFFFE: 604 *nextTokPtr = ptr + 2; 605 *encPtr = &little2_encoding.enc; 606 return XML_TOK_BOM; 607 } 608 } 609 *encPtr = &utf8_encoding.enc; 610 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 611} 612 613static 614int initScanProlog(const ENCODING *enc, const char *ptr, const char *end, 615 const char **nextTokPtr) 616{ 617 return initScan(enc, XML_PROLOG_STATE, ptr, end, nextTokPtr); 618} 619 620static 621int initScanContent(const ENCODING *enc, const char *ptr, const char *end, 622 const char **nextTokPtr) 623{ 624 return initScan(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr); 625} 626 627static 628void initUpdatePosition(const ENCODING *enc, const char *ptr, 629 const char *end, POSITION *pos) 630{ 631 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 632} 633 634const ENCODING *XmlGetUtf8InternalEncoding() 635{ 636 return &internal_utf8_encoding.enc; 637} 638 639const ENCODING *XmlGetUtf16InternalEncoding() 640{ 641#if BYTE_ORDER == 12 642 return &internal_little2_encoding.enc; 643#elif BYTE_ORDER == 21 644 return &internal_big2_encoding.enc; 645#else 646 const short n = 1; 647 return *(const char *)&n ? &internal_little2_encoding.enc : &internal_big2_encoding.enc; 648#endif 649} 650 651int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name) 652{ 653 if (name) { 654 if (streqci(name, "ISO-8859-1")) { 655 *encPtr = &latin1_encoding.enc; 656 return 1; 657 } 658 if (streqci(name, "UTF-8")) { 659 *encPtr = &utf8_encoding.enc; 660 return 1; 661 } 662 if (streqci(name, "US-ASCII")) { 663 *encPtr = &ascii_encoding.enc; 664 return 1; 665 } 666 if (!streqci(name, "UTF-16")) 667 return 0; 668 } 669 p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog; 670 p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent; 671 p->initEnc.updatePosition = initUpdatePosition; 672 p->initEnc.minBytesPerChar = 1; 673 p->encPtr = encPtr; 674 *encPtr = &(p->initEnc); 675 return 1; 676} 677 678static 679int toAscii(const ENCODING *enc, const char *ptr, const char *end) 680{ 681 char buf[1]; 682 char *p = buf; 683 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 684 if (p == buf) 685 return -1; 686 else 687 return buf[0]; 688} 689 690static 691int isSpace(int c) 692{ 693 switch (c) { 694 case ' ': 695 case '\r': 696 case '\n': 697 case '\t': 698 return 1; 699 } 700 return 0; 701} 702 703/* Return 1 if there's just optional white space 704or there's an S followed by name=val. */ 705static 706int parsePseudoAttribute(const ENCODING *enc, 707 const char *ptr, 708 const char *end, 709 const char **namePtr, 710 const char **valPtr, 711 const char **nextTokPtr) 712{ 713 int c; 714 char open; 715 if (ptr == end) { 716 *namePtr = 0; 717 return 1; 718 } 719 if (!isSpace(toAscii(enc, ptr, end))) { 720 *nextTokPtr = ptr; 721 return 0; 722 } 723 do { 724 ptr += enc->minBytesPerChar; 725 } while (isSpace(toAscii(enc, ptr, end))); 726 if (ptr == end) { 727 *namePtr = 0; 728 return 1; 729 } 730 *namePtr = ptr; 731 for (;;) { 732 c = toAscii(enc, ptr, end); 733 if (c == -1) { 734 *nextTokPtr = ptr; 735 return 0; 736 } 737 if (c == '=') 738 break; 739 if (isSpace(c)) { 740 do { 741 ptr += enc->minBytesPerChar; 742 } while (isSpace(c = toAscii(enc, ptr, end))); 743 if (c != '=') { 744 *nextTokPtr = ptr; 745 return 0; 746 } 747 break; 748 } 749 ptr += enc->minBytesPerChar; 750 } 751 if (ptr == *namePtr) { 752 *nextTokPtr = ptr; 753 return 0; 754 } 755 ptr += enc->minBytesPerChar; 756 c = toAscii(enc, ptr, end); 757 while (isSpace(c)) { 758 ptr += enc->minBytesPerChar; 759 c = toAscii(enc, ptr, end); 760 } 761 if (c != '"' && c != '\'') { 762 *nextTokPtr = ptr; 763 return 0; 764 } 765 open = c; 766 ptr += enc->minBytesPerChar; 767 *valPtr = ptr; 768 for (;; ptr += enc->minBytesPerChar) { 769 c = toAscii(enc, ptr, end); 770 if (c == open) 771 break; 772 if (!('a' <= c && c <= 'z') 773 && !('A' <= c && c <= 'Z') 774 && !('0' <= c && c <= '9') 775 && c != '.' 776 && c != '-' 777 && c != '_') { 778 *nextTokPtr = ptr; 779 return 0; 780 } 781 } 782 *nextTokPtr = ptr + enc->minBytesPerChar; 783 return 1; 784} 785 786static 787const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *end) 788{ 789#define ENCODING_MAX 128 790 char buf[ENCODING_MAX]; 791 char *p = buf; 792 int i; 793 XmlUtf8Convert(enc, &ptr, end, &p, p + ENCODING_MAX - 1); 794 if (ptr != end) 795 return 0; 796 *p = 0; 797 for (i = 0; buf[i]; i++) { 798 if ('a' <= buf[i] && buf[i] <= 'z') 799 buf[i] += 'A' - 'a'; 800 } 801 if (streqci(buf, "UTF-8")) 802 return &utf8_encoding.enc; 803 if (streqci(buf, "ISO-8859-1")) 804 return &latin1_encoding.enc; 805 if (streqci(buf, "US-ASCII")) 806 return &ascii_encoding.enc; 807 if (streqci(buf, "UTF-16")) { 808 if (enc->minBytesPerChar == 2) 809 return enc; 810 return &big2_encoding.enc; 811 } 812 return 0; 813} 814 815int XmlParseXmlDecl(int isGeneralTextEntity, 816 const ENCODING *enc, 817 const char *ptr, 818 const char *end, 819 const char **badPtr, 820 const char **versionPtr, 821 const char **encodingName, 822 const ENCODING **encoding, 823 int *standalone) 824{ 825 const char *val = 0; 826 const char *name = 0; 827 ptr += 5 * enc->minBytesPerChar; 828 end -= 2 * enc->minBytesPerChar; 829 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) { 830 *badPtr = ptr; 831 return 0; 832 } 833 if (!XmlNameMatchesAscii(enc, name, "version")) { 834 if (!isGeneralTextEntity) { 835 *badPtr = name; 836 return 0; 837 } 838 } 839 else { 840 if (versionPtr) 841 *versionPtr = val; 842 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) { 843 *badPtr = ptr; 844 return 0; 845 } 846 if (!name) 847 return 1; 848 } 849 if (XmlNameMatchesAscii(enc, name, "encoding")) { 850 int c = toAscii(enc, val, end); 851 if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) { 852 *badPtr = val; 853 return 0; 854 } 855 if (encodingName) 856 *encodingName = val; 857 if (encoding) 858 *encoding = findEncoding(enc, val, ptr - enc->minBytesPerChar); 859 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) { 860 *badPtr = ptr; 861 return 0; 862 } 863 if (!name) 864 return 1; 865 } 866 if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) { 867 *badPtr = name; 868 return 0; 869 } 870 if (XmlNameMatchesAscii(enc, val, "yes")) { 871 if (standalone) 872 *standalone = 1; 873 } 874 else if (XmlNameMatchesAscii(enc, val, "no")) { 875 if (standalone) 876 *standalone = 0; 877 } 878 else { 879 *badPtr = val; 880 return 0; 881 } 882 while (isSpace(toAscii(enc, ptr, end))) 883 ptr += enc->minBytesPerChar; 884 if (ptr != end) { 885 *badPtr = ptr; 886 return 0; 887 } 888 return 1; 889} 890 891static 892int checkCharRefNumber(int result) 893{ 894 switch (result >> 8) { 895 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 896 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 897 return -1; 898 case 0: 899 if (latin1_encoding.type[result] == BT_NONXML) 900 return -1; 901 break; 902 case 0xFF: 903 if (result == 0xFFFE || result == 0xFFFF) 904 return -1; 905 break; 906 } 907 return result; 908} 909 910size_t XmlUtf8Encode(int c, char *buf) 911{ 912 enum { 913 /* minN is minimum legal resulting value for N byte sequence */ 914 min2 = 0x80, 915 min3 = 0x800, 916 min4 = 0x10000 917 }; 918 919 if (c < 0) 920 return 0; 921 if (c < min2) { 922 buf[0] = (c | UTF8_cval1); 923 return 1; 924 } 925 if (c < min3) { 926 buf[0] = ((c >> 6) | UTF8_cval2); 927 buf[1] = ((c & 0x3f) | 0x80); 928 return 2; 929 } 930 if (c < min4) { 931 buf[0] = ((c >> 12) | UTF8_cval3); 932 buf[1] = (((c >> 6) & 0x3f) | 0x80); 933 buf[2] = ((c & 0x3f) | 0x80); 934 return 3; 935 } 936 if (c < 0x110000) { 937 buf[0] = ((c >> 18) | UTF8_cval4); 938 buf[1] = (((c >> 12) & 0x3f) | 0x80); 939 buf[2] = (((c >> 6) & 0x3f) | 0x80); 940 buf[3] = ((c & 0x3f) | 0x80); 941 return 4; 942 } 943 return 0; 944} 945 946size_t XmlUtf16Encode(int charNum, unsigned short *buf) 947{ 948 if (charNum < 0) 949 return 0; 950 if (charNum < 0x10000) { 951 buf[0] = charNum; 952 return 1; 953 } 954 if (charNum < 0x110000) { 955 charNum -= 0x10000; 956 buf[0] = (charNum >> 10) + 0xD800; 957 buf[1] = (charNum & 0x3FF) + 0xDC00; 958 return 2; 959 } 960 return 0; 961} 962 963struct unknown_encoding { 964 struct normal_encoding normal; 965 int (*convert)(void *userData, const char *p); 966 void *userData; 967 unsigned short utf16[256]; 968 char utf8[256][4]; 969}; 970 971int XmlSizeOfUnknownEncoding() 972{ 973 return sizeof(struct unknown_encoding); 974} 975 976static 977int unknown_isName(const ENCODING *enc, const char *p) 978{ 979 int c = ((const struct unknown_encoding *)enc) 980 ->convert(((const struct unknown_encoding *)enc)->userData, p); 981 if (c & ~0xFFFF) 982 return 0; 983 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 984} 985 986static 987int unknown_isNmstrt(const ENCODING *enc, const char *p) 988{ 989 int c = ((const struct unknown_encoding *)enc) 990 ->convert(((const struct unknown_encoding *)enc)->userData, p); 991 if (c & ~0xFFFF) 992 return 0; 993 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 994} 995 996static 997int unknown_isInvalid(const ENCODING *enc, const char *p) 998{ 999 int c = ((const struct unknown_encoding *)enc) 1000 ->convert(((const struct unknown_encoding *)enc)->userData, p); 1001 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1002} 1003 1004static 1005void unknown_toUtf8(const ENCODING *enc, 1006 const char **fromP, const char *fromLim, 1007 char **toP, const char *toLim) 1008{ 1009 char buf[XML_UTF8_ENCODE_MAX]; 1010 for (;;) { 1011 const char *utf8; 1012 int n; 1013 if (*fromP == fromLim) 1014 break; 1015 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP]; 1016 n = *utf8++; 1017 if (n == 0) { 1018 int c = ((const struct unknown_encoding *)enc) 1019 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); 1020 n = XmlUtf8Encode(c, buf); 1021 if (n > toLim - *toP) 1022 break; 1023 utf8 = buf; 1024 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] 1025 - (BT_LEAD2 - 2); 1026 } 1027 else { 1028 if (n > toLim - *toP) 1029 break; 1030 (*fromP)++; 1031 } 1032 do { 1033 *(*toP)++ = *utf8++; 1034 } while (--n != 0); 1035 } 1036} 1037 1038static 1039void unknown_toUtf16(const ENCODING *enc, 1040 const char **fromP, const char *fromLim, 1041 unsigned short **toP, const unsigned short *toLim) 1042{ 1043 while (*fromP != fromLim && *toP != toLim) { 1044 unsigned short c 1045 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP]; 1046 if (c == 0) { 1047 c = (unsigned short)((const struct unknown_encoding *)enc) 1048 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); 1049 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] 1050 - (BT_LEAD2 - 2); 1051 } 1052 else 1053 (*fromP)++; 1054 *(*toP)++ = c; 1055 } 1056} 1057 1058ENCODING * 1059XmlInitUnknownEncoding(void *mem, 1060 int *table, 1061 int (*convert)(void *userData, const char *p), 1062 void *userData) 1063{ 1064 int i; 1065 struct unknown_encoding *e = mem; 1066 for (i = 0; i < sizeof(struct normal_encoding); i++) 1067 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; 1068 for (i = 0; i < 128; i++) 1069 if (latin1_encoding.type[i] != BT_OTHER 1070 && latin1_encoding.type[i] != BT_NONXML 1071 && table[i] != i) 1072 return 0; 1073 for (i = 0; i < 256; i++) { 1074 int c = table[i]; 1075 if (c == -1) { 1076 e->normal.type[i] = BT_MALFORM; 1077 /* This shouldn't really get used. */ 1078 e->utf16[i] = 0xFFFF; 1079 e->utf8[i][0] = 1; 1080 e->utf8[i][1] = 0; 1081 } 1082 else if (c < 0) { 1083 if (c < -4) 1084 return 0; 1085 e->normal.type[i] = BT_LEAD2 - (c + 2); 1086 e->utf8[i][0] = 0; 1087 e->utf16[i] = 0; 1088 } 1089 else if (c < 0x80) { 1090 if (latin1_encoding.type[c] != BT_OTHER 1091 && latin1_encoding.type[c] != BT_NONXML 1092 && c != i) 1093 return 0; 1094 e->normal.type[i] = latin1_encoding.type[c]; 1095 e->utf8[i][0] = 1; 1096 e->utf8[i][1] = (char)c; 1097 e->utf16[i] = c == 0 ? 0xFFFF : c; 1098 } 1099 else if (checkCharRefNumber(c) < 0) { 1100 e->normal.type[i] = BT_NONXML; 1101 /* This shouldn't really get used. */ 1102 e->utf16[i] = 0xFFFF; 1103 e->utf8[i][0] = 1; 1104 e->utf8[i][1] = 0; 1105 } 1106 else { 1107 if (c > 0xFFFF) 1108 return 0; 1109 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1110 e->normal.type[i] = BT_NMSTRT; 1111 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1112 e->normal.type[i] = BT_NAME; 1113 else 1114 e->normal.type[i] = BT_OTHER; 1115 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1116 e->utf16[i] = c; 1117 } 1118 } 1119 e->userData = userData; 1120 e->convert = convert; 1121 if (convert) { 1122 e->normal.isName2 = unknown_isName; 1123 e->normal.isName3 = unknown_isName; 1124 e->normal.isName4 = unknown_isName; 1125 e->normal.isNmstrt2 = unknown_isNmstrt; 1126 e->normal.isNmstrt3 = unknown_isNmstrt; 1127 e->normal.isNmstrt4 = unknown_isNmstrt; 1128 e->normal.isInvalid2 = unknown_isInvalid; 1129 e->normal.isInvalid3 = unknown_isInvalid; 1130 e->normal.isInvalid4 = unknown_isInvalid; 1131 } 1132 e->normal.enc.utf8Convert = unknown_toUtf8; 1133 e->normal.enc.utf16Convert = unknown_toUtf16; 1134 return &(e->normal.enc); 1135} 1136