1/*---------------------------------------------------------------------------* 2 | PDFlib - A library for generating PDF on the fly | 3 +---------------------------------------------------------------------------+ 4 | Copyright (c) 1997-2004 Thomas Merz and PDFlib GmbH. All rights reserved. | 5 +---------------------------------------------------------------------------+ 6 | | 7 | This software is subject to the PDFlib license. It is NOT in the | 8 | public domain. Extended versions and commercial licenses are | 9 | available, please check http://www.pdflib.com. | 10 | | 11 *---------------------------------------------------------------------------*/ 12 13/* $Id: pc_unicode.c 14574 2005-10-29 16:27:43Z bonefish $ 14 * 15 * PDFlib routines for converting between Unicode values and Adobe glyph names 16 * 17 */ 18 19#include "pc_util.h" 20#include "pc_chartabs.h" 21 22 23/* 24 * Returns the Unicode value of a glyph name. If the name is not 25 * contained in the Adobe Glyph List (AGL) 0 will be returned. 26 */ 27 28pdc_ushort 29pdc_adobe2unicode(const char *name) 30{ 31 int lo = 0; 32 int hi = ((sizeof tab_agl2uni) / (sizeof (pdc_glyph_tab))); 33 34 if (name) 35 { 36 while (lo < hi) 37 { 38 int i = (lo + hi) / 2; 39 int cmp = strcmp(name, tab_agl2uni[i].glyphname); 40 41 if (cmp == 0) 42 return tab_agl2uni[i].code; 43 44 if (cmp < 0) 45 hi = i; 46 else 47 lo = i + 1; 48 } 49 } 50 51 return 0; 52} 53 54/* 55 * Returns the name in the Adobe Glyph List which corresponds to 56 * the supplied Unicode value. If the value doesn't have a 57 * corresponding Unicode name NULL will be returned. 
58 */ 59 60const char * 61pdc_unicode2adobe(pdc_ushort uv) 62{ 63 int lo = 0; 64 int hi = ((sizeof tab_uni2agl) / (sizeof (pdc_glyph_tab))); 65 66 if (uv) 67 { 68 while (lo < hi) 69 { 70 int i = (lo + hi) / 2; 71 72 if (uv == tab_uni2agl[i].code) 73 return tab_uni2agl[i].glyphname; 74 75 if (uv < tab_uni2agl[i].code) 76 hi = i; 77 else 78 lo = i + 1; 79 } 80 } 81 82 return (char *) 0; 83} 84 85 86 87/* 88 * Returns true if a character name is contained in pc_standard_latin_charset. 89 * Otherwise false will be returned. 90 */ 91 92pdc_bool 93pdc_is_std_charname(const char *name) 94{ 95 int lo = 0; 96 int hi = ((sizeof pc_standard_latin_charset) / (sizeof (char *))); 97 98 if (name) 99 { 100 while (lo < hi) 101 { 102 int i = (lo + hi) / 2; 103 int cmp = strcmp(name, pc_standard_latin_charset[i]); 104 105 if (cmp == 0) 106 return pdc_true; 107 108 if (cmp < 0) 109 hi = i; 110 else 111 lo = i + 1; 112 } 113 } 114 115 return pdc_false; 116} 117 118/* 119 * The following source is based on Unicode's original source 120 * code ConvertUTF.c. It has been adapted to PDFlib programming 121 * conventions. 122 * 123 * The original file had the following notice: 124 * 125 * Copyright 2001 Unicode, Inc. 126 * 127 * Limitations on Rights to Redistribute This Code 128 * 129 * Author: Mark E. Davis, 1994. 130 * Rev History: Rick McGowan, fixes & updates May 2001. 131 * 132 * 133 * Functions for conversions between UTF32, UTF-16, and UTF-8. 134 * These funtions forming a complete set of conversions between 135 * the three formats. UTF-7 is not included here. 136 * 137 * Each of these routines takes pointers to input buffers and output 138 * buffers. The input buffers are const. 139 * 140 * Each routine converts the text between *sourceStart and sourceEnd, 141 * putting the result into the buffer between *targetStart and 142 * targetEnd. Note: the end pointers are *after* the last item: e.g. 143 * *(sourceEnd - 1) is the last item. 
*
 * The return result indicates whether the conversion was successful,
 * and if not, whether the problem was in the source or target buffers.
 * (Only the first encountered problem is indicated.)
 *
 * After the conversion, *sourceStart and *targetStart are both
 * updated to point to the end of the last text successfully converted
 * in the respective buffers.
 *
 * Input parameters:
 *      sourceStart - pointer to a pointer to the source buffer.
 *              The contents of this are modified on return so that
 *              it points at the next thing to be converted.
 *      targetStart - similarly, pointer to pointer to the target buffer.
 *      sourceEnd, targetEnd - respectively pointers to the ends of the
 *              two buffers, for overflow checking only.
 *
 * These conversion functions take a pdc_convers_flags argument. When this
 * flag is set to strict, both irregular sequences and isolated surrogates
 * will cause an error. When the flag is set to lenient, both irregular
 * sequences and isolated surrogates are converted.
 *
 * Whether the flag is strict or lenient, all illegal sequences will cause
 * an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
 * or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
 * must check for illegal sequences.
 *
 * When the flag is set to lenient, characters over 0x10FFFF are converted
 * to the replacement character; otherwise (when the flag is set to strict)
 * they constitute an error.
 *
 * Output parameters:
 *      The value "sourceIllegal" is returned from some routines if the input
 *      sequence is malformed. When "sourceIllegal" is returned, the source
 *      value will point to the illegal value that caused the problem. E.g.,
 *      in UTF-8 when a sequence is malformed, it points to the start of the
 *      malformed sequence.
 *
 * Author: Mark E. Davis, 1994.
183 * Rev History: Rick McGowan, fixes & updates May 2001. 184 * 185 */ 186 187/* 188 * The following 4 definitions are compiler-specific. 189 * The C standard does not guarantee that wchar_t has at least 190 * 16 bits, so wchar_t is no less portable than unsigned short! 191 * All should be unsigned values to avoid sign extension during 192 * bit mask & shift operations. 193 */ 194 195typedef unsigned long UTF32; /* at least 32 bits */ 196typedef unsigned short UTF16; /* at least 16 bits */ 197typedef unsigned char UTF8; /* typically 8 bits */ 198 199/* Some fundamental constants */ 200#define UNI_SUR_HIGH_START (UTF32)0xD800 201#define UNI_SUR_HIGH_END (UTF32)0xDBFF 202#define UNI_SUR_LOW_START (UTF32)0xDC00 203#define UNI_SUR_LOW_END (UTF32)0xDFFF 204#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD 205#define UNI_MAX_BMP (UTF32)0x0000FFFF 206#define UNI_MAX_UTF16 (UTF32)0x0010FFFF 207#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF 208 209static const int halfShift = 10; /* used for shifting by 10 bits */ 210 211static const UTF32 halfBase = 0x0010000UL; 212static const UTF32 halfMask = 0x3FFUL; 213 214 215/* --------------------------------------------------------------------- */ 216 217#if 0 218static pdc_convers_result 219pdc_convertUTF32toUTF16 ( 220 UTF32** sourceStart, const UTF32* sourceEnd, 221 UTF16** targetStart, const UTF16* targetEnd, 222 const pdc_convers_flags flags) { 223 pdc_convers_result result = conversionOK; 224 UTF32* source = *sourceStart; 225 UTF16* target = *targetStart; 226 while (source < sourceEnd) { 227 UTF32 ch; 228 if (target >= targetEnd) { 229 result = targetExhausted; break; 230 } 231 ch = *source++; 232 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 233 if ((flags == strictConversion) && 234 (ch >= UNI_SUR_HIGH_START && 235 ch <= UNI_SUR_LOW_END)) { 236 --source; /* return to the illegal value itself */ 237 result = sourceIllegal; 238 break; 239 } else { 240 *target++ = (UTF16) ch; /* normal case */ 241 } 242 } else if (ch > 
UNI_MAX_UTF16) { 243 if (flags == strictConversion) { 244 result = sourceIllegal; 245 } else { 246 *target++ = UNI_REPLACEMENT_CHAR; 247 } 248 } else { 249 /* target is a character in range 0xFFFF - 0x10FFFF. */ 250 if (target + 1 >= targetEnd) { 251 result = targetExhausted; 252 break; 253 } 254 ch -= halfBase; 255 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START); 256 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START); 257 } 258 } 259 *sourceStart = source; 260 *targetStart = target; 261 return result; 262} 263 264/* --------------------------------------------------------------------- */ 265 266static pdc_convers_result 267pdc_convertUTF16toUTF32 ( 268 UTF16** sourceStart, UTF16* sourceEnd, 269 UTF32** targetStart, const UTF32* targetEnd, 270 const pdc_convers_flags flags) { 271 pdc_convers_result result = conversionOK; 272 UTF16* source = *sourceStart; 273 UTF32* target = *targetStart; 274 UTF32 ch, ch2; 275 while (source < sourceEnd) { 276 ch = *source++; 277 if (ch >= UNI_SUR_HIGH_START && 278 ch <= UNI_SUR_HIGH_END && 279 source < sourceEnd) { 280 ch2 = *source; 281 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 282 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 283 + (ch2 - UNI_SUR_LOW_START) + halfBase; 284 ++source; 285 } else if (flags == strictConversion) { 286 /* it's an unpaired high surrogate */ 287 --source; /* return to the illegal value itself */ 288 result = sourceIllegal; 289 break; 290 } 291 } else if ((flags == strictConversion) && 292 (ch >= UNI_SUR_LOW_START && 293 ch <= UNI_SUR_LOW_END)) { 294 /* an unpaired low surrogate */ 295 --source; /* return to the illegal value itself */ 296 result = sourceIllegal; 297 break; 298 } 299 if (target >= targetEnd) { 300 result = targetExhausted; 301 break; 302 } 303 *target++ = ch; 304 } 305 *sourceStart = source; 306 *targetStart = target; 307#ifdef CVTUTF_DEBUG 308if (result == sourceIllegal) { 309 fprintf(stderr, "pdc_convertUTF16toUTF32 illegal seq 0x%04x,%04x\n", 310 ch, 
ch2); 311 fflush(stderr); 312} 313#endif 314 return result; 315} 316#endif 317 318/* --------------------------------------------------------------------- */ 319 320/* 321 * Index into the table below with the first byte of a UTF-8 sequence to 322 * get the number of trailing bytes that are supposed to follow it. 323 */ 324static const char trailingBytesForUTF8[256] = { 325 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 326 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 327 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 328 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 329 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 330 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 331 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 332 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 333}; 334 335#if 0 336static const char 337pdc_get_trailingBytesForUTF8(int i) { 338 return (trailingBytesForUTF8[i]); 339} 340#endif 341 342/* 343 * Magic values subtracted from a buffer value during UTF8 conversion. 344 * This table contains as many values as there might be trailing bytes 345 * in a UTF-8 sequence. 346 */ 347static const UTF32 offsetsFromUTF8[6] = { 348 0x00000000UL, 0x00003080UL, 0x000E2080UL, 349 0x03C82080UL, 0xFA082080UL, 0x82082080UL 350}; 351 352/* 353 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 354 * into the first byte, depending on how many bytes follow. There are 355 * as many entries in this table as there are UTF-8 sequence types. 356 * (I.e., one byte sequence, two byte... six byte sequence.) 357 */ 358static const UTF8 firstByteMark[7] = { 359 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC 360}; 361 362/* --------------------------------------------------------------------- */ 363 364/* The interface converts a whole buffer to avoid function-call overhead. 365 * Constants have been gathered. 
Loops & conditionals have been removed as 366 * much as possible for efficiency, in favor of drop-through switches. 367 * (See "Note A" at the bottom of the file for equivalent code.) 368 * If your compiler supports it, the "pdc_islegalUTF8" call can be turned 369 * into an inline function. 370 */ 371 372/* --------------------------------------------------------------------- */ 373 374static pdc_convers_result 375pdc_convertUTF16toUTF8 ( 376 UTF16** sourceStart, const UTF16* sourceEnd, 377 UTF8** targetStart, const UTF8* targetEnd, 378 const pdc_convers_flags flags) { 379 pdc_convers_result result = conversionOK; 380 UTF16* source = *sourceStart; 381 UTF8* target = *targetStart; 382 while (source < sourceEnd) { 383 UTF32 ch; 384 unsigned short bytesToWrite = 0; 385 const UTF32 byteMask = 0xBF; 386 const UTF32 byteMark = 0x80; 387 ch = *source++; 388 /* If we have a surrogate pair, convert to UTF32 first. */ 389 if (ch >= UNI_SUR_HIGH_START && 390 ch <= UNI_SUR_HIGH_END && 391 source < sourceEnd) { 392 UTF32 ch2 = *source; 393 if (ch2 >= UNI_SUR_LOW_START && 394 ch2 <= UNI_SUR_LOW_END) { 395 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 396 + (ch2 - UNI_SUR_LOW_START) + halfBase; 397 ++source; 398 } else if (flags == strictConversion) { 399 /* it's an unpaired high surrogate */ 400 --source; /* return to the illegal value itself */ 401 result = sourceIllegal; 402 break; 403 } 404 } else if ((flags == strictConversion) && 405 (ch >= UNI_SUR_LOW_START && 406 ch <= UNI_SUR_LOW_END)) { 407 --source; /* return to the illegal value itself */ 408 result = sourceIllegal; 409 break; 410 } 411 /* Figure out how many bytes the result will require */ 412 if (ch < (UTF32)0x80) { bytesToWrite = 1; 413 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 414 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 415 } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; 416 } else { bytesToWrite = 2; 417 ch = UNI_REPLACEMENT_CHAR; 418 } 419 420 target += bytesToWrite; 421 if (target > 
targetEnd) { 422 target -= bytesToWrite; result = targetExhausted; break; 423 } 424 switch (bytesToWrite) { /* note: everything falls through. */ 425 case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; 426 case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; 427 case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; 428 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 429 } 430 target += bytesToWrite; 431 } 432 *sourceStart = source; 433 *targetStart = target; 434 return result; 435} 436 437/* --------------------------------------------------------------------- */ 438 439/* 440 * Utility routine to tell whether a sequence of bytes is legal UTF-8. 441 * This must be called with the length pre-determined by the first byte. 442 * If not calling this from pdc_convertUTF8to*, then the length can be set by: 443 * length = trailingBytesForUTF8[*source]+1; 444 * and the sequence is illegal right away if there aren't that many bytes 445 * available. 446 * If presented with a length > 4, this returns pdc_false. The Unicode 447 * definition of UTF-8 goes up to 4-byte sequences. 448 */ 449 450static pdc_bool 451pdc_islegalUTF8(UTF8 *source, int length) { 452 UTF8 a; 453 UTF8 *srcptr = source+length; 454 switch (length) { 455 default: return pdc_false; 456 /* Everything else falls through when "pdc_true"... 
*/ 457 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false; 458 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false; 459 case 2: if ((a = (*--srcptr)) > 0xBF) return pdc_false; 460 switch (*source) { 461 /* no fall-through in this inner switch */ 462 case 0xE0: if (a < 0xA0) return pdc_false; break; 463 case 0xF0: if (a < 0x90) return pdc_false; break; 464 case 0xF4: if (a > 0x8F) return pdc_false; break; 465 default: if (a < 0x80) return pdc_false; 466 } 467 case 1: if (*source >= 0x80 && *source < 0xC2) return pdc_false; 468 if (*source > 0xF4) return pdc_false; 469 } 470 return pdc_true; 471} 472 473/* --------------------------------------------------------------------- */ 474 475#if 0 476/* 477 * Exported function to return whether a UTF-8 sequence is legal or not. 478 * This is not used here; it's just exported. 479 */ 480static pdc_bool pdc_islegalUTF8sequence(UTF8 *source, UTF8 *sourceEnd) { 481 int length = trailingBytesForUTF8[*source]+1; 482 if (source+length > sourceEnd) { 483 return pdc_false; 484 } 485 return pdc_islegalUTF8(source, length); 486} 487#endif 488 489/* --------------------------------------------------------------------- */ 490 491static pdc_convers_result 492pdc_convertUTF8toUTF16 ( 493 UTF8** sourceStart, UTF8* sourceEnd, 494 UTF16** targetStart, const UTF16* targetEnd, 495 const pdc_convers_flags flags) { 496 pdc_convers_result result = conversionOK; 497 UTF8* source = *sourceStart; 498 UTF16* target = *targetStart; 499 while (source < sourceEnd) { 500 UTF32 ch = 0L; 501 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 502 if (source + extraBytesToRead >= sourceEnd) { 503 result = sourceExhausted; 504 break; 505 } 506 /* Do this check whether lenient or strict */ 507 if (! pdc_islegalUTF8(source, extraBytesToRead+1)) { 508 result = sourceIllegal; 509 break; 510 } 511 /* 512 * The cases all fall through. See "Note A" below. 
513 */ 514 switch (extraBytesToRead) { 515 case 3: ch += *source++; ch <<= 6; 516 case 2: ch += *source++; ch <<= 6; 517 case 1: ch += *source++; ch <<= 6; 518 case 0: ch += *source++; 519 } 520 ch -= offsetsFromUTF8[extraBytesToRead]; 521 522 if (target >= targetEnd) { 523 result = targetExhausted; 524 break; 525 } 526 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 527 if ((flags == strictConversion) && 528 (ch >= UNI_SUR_HIGH_START && 529 ch <= UNI_SUR_LOW_END)) { 530 --source; /* return to the illegal value itself */ 531 result = sourceIllegal; 532 break; 533 } else { 534 *target++ = (UTF16) ch; /* normal case */ 535 } 536 } else if (ch > UNI_MAX_UTF16) { 537 if (flags == strictConversion) { 538 result = sourceIllegal; 539 source -= extraBytesToRead; /* return to the start */ 540 } else { 541 *target++ = UNI_REPLACEMENT_CHAR; 542 } 543 } else { 544 /* target is a character in range 0xFFFF - 0x10FFFF. */ 545 if (target + 1 >= targetEnd) { 546 result = targetExhausted; 547 break; 548 } 549 ch -= halfBase; 550 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START); 551 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START); 552 } 553 } 554 *sourceStart = source; 555 *targetStart = target; 556 return result; 557} 558 559/* --------------------------------------------------------------------- */ 560 561#if 0 562static pdc_convers_result 563pdc_convertUTF32toUTF8 ( 564 UTF32** sourceStart, const UTF32* sourceEnd, 565 UTF8** targetStart, const UTF8* targetEnd, 566 const pdc_convers_flags flags) { 567 pdc_convers_result result = conversionOK; 568 UTF32* source = *sourceStart; 569 UTF8* target = *targetStart; 570 while (source < sourceEnd) { 571 UTF32 ch; 572 unsigned short bytesToWrite = 0; 573 const UTF32 byteMask = 0x000000BF; 574 const UTF32 byteMark = 0x00000080; 575 ch = *source++; 576 /* surrogates of any stripe are not legal UTF32 characters */ 577 if (flags == strictConversion ) { 578 if ((ch >= UNI_SUR_HIGH_START) && (ch <= 
UNI_SUR_LOW_END)) { 579 --source; /* return to the illegal value itself */ 580 result = sourceIllegal; 581 break; 582 } 583 } 584 /* Figure out how many bytes the result will require */ 585 if (ch < (UTF32)0x80) { bytesToWrite = 1; 586 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 587 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 588 } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; 589 } else { bytesToWrite = 2; 590 ch = UNI_REPLACEMENT_CHAR; 591 } 592 593 target += bytesToWrite; 594 if (target > targetEnd) { 595 target -= bytesToWrite; result = targetExhausted; break; 596 } 597 switch (bytesToWrite) { /* note: everything falls through. */ 598 case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; 599 case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; 600 case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; 601 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 602 } 603 target += bytesToWrite; 604 } 605 *sourceStart = source; 606 *targetStart = target; 607 return result; 608} 609 610/* --------------------------------------------------------------------- */ 611 612static pdc_convers_result 613pdc_convertUTF8toUTF32 ( 614 UTF8** sourceStart, UTF8* sourceEnd, 615 UTF32** targetStart, const UTF32* targetEnd, 616 const pdc_convers_flags flags) { 617 pdc_convers_result result = conversionOK; 618 UTF8* source = *sourceStart; 619 UTF32* target = *targetStart; 620 621 (void) flags; 622 623 while (source < sourceEnd) { 624 UTF32 ch = 0; 625 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 626 if (source + extraBytesToRead >= sourceEnd) { 627 result = sourceExhausted; break; 628 } 629 /* Do this check whether lenient or strict */ 630 if (! pdc_islegalUTF8(source, extraBytesToRead+1)) { 631 result = sourceIllegal; 632 break; 633 } 634 /* 635 * The cases all fall through. See "Note A" below. 
636 */ 637 switch (extraBytesToRead) { 638 case 3: ch += *source++; ch <<= 6; 639 case 2: ch += *source++; ch <<= 6; 640 case 1: ch += *source++; ch <<= 6; 641 case 0: ch += *source++; 642 } 643 ch -= offsetsFromUTF8[extraBytesToRead]; 644 645 if (target >= targetEnd) { 646 result = targetExhausted; 647 break; 648 } 649 if (ch <= UNI_MAX_UTF32) { 650 *target++ = ch; 651 } else if (ch > UNI_MAX_UTF32) { 652 *target++ = UNI_REPLACEMENT_CHAR; 653 } else { 654 if (target + 1 >= targetEnd) { 655 result = targetExhausted; 656 break; 657 } 658 ch -= halfBase; 659 *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START; 660 *target++ = (ch & halfMask) + UNI_SUR_LOW_START; 661 } 662 } 663 *sourceStart = source; 664 *targetStart = target; 665 return result; 666} 667#endif 668 669/* --------------------------------------------------------------------- 670 671 Note A. 672 The fall-through switches in UTF-8 reading code save a 673 temp variable, some decrements & conditionals. The switches 674 are equivalent to the following loop: 675 { 676 int tmpBytesToRead = extraBytesToRead+1; 677 do { 678 ch += *source++; 679 --tmpBytesToRead; 680 if (tmpBytesToRead) ch <<= 6; 681 } while (tmpBytesToRead > 0); 682 } 683 In UTF-8 writing code, the switches on "bytesToWrite" are 684 similarly unrolled loops. 685 686 --------------------------------------------------------------------- */ 687 688/* 689 * pdc_convert_string converts a arbitrary encoded string (maybe UTF) to 690 * another string. 691 * 692 * The new converted string is allocated and terminated by required zeros. 693 * The caller is responsible for freeing the string buffer. 694 * 695 * 696 * LBP: low byte picking 697 * 698 * Input-Parameter: 699 * 700 * inutf: input string format (see pc_unicode.h): 701 * 702 * pdc_auto: If a BOM is recognized: 703 * pdc_utf8 or pdc_utf16xx resp. 
*                           Otherwise if input encoding <inev> is specified:
 *                               pdc_bytes
 *                           Otherwise:
 *                               pdc_utf16
 *
 *             pdc_auto2:    If input encoding is not specified:
 *                               pdc_utf16
 *                           Otherwise after successful LBP:
 *                               pdc_auto
 *                           Otherwise
 *                               pdc_utf16
 *
 *             pdc_bytes:    8-bit string. Encoding is <inev> if specified.
 *
 *             pdc_bytes2:   After successful LBP:
 *                               pdc_bytes
 *                           Otherwise
 *                               pdc_utf16
 *
 *             pdc_utf8:     UTF-8 formatted string.
 *
 *             pdc_utf16:    If a UTF16 BOM is recognized:
 *                               pdc_utf16be or pdc_utf16le
 *                           Otherwise UTF-16 machine byte ordered string.
 *
 *             pdc_utf16be   UTF-16 big endian formatted string.
 *
 *             pdc_utf16le   UTF-16 little endian formatted string.
 *
 * inev:       Encoding vector for input pdc_bytes string.
 *
 * instring:   Input string.
 *
 * inlen:      Length of input string in bytes.
 *
 * oututf:     Target format for output string.
 *             pdc_auto, pdc_auto2 and pdc_bytes2 are not supported.
 *
 * outev:      Encoding vector for output pdc_bytes string.
 *
 * flags:      PDC_CONV_KEEPBYTES:
 *             Input pdc_bytes strings will be kept differing from oututf.
 *             *oututf: pdc_bytes.
 *
 *             PDC_CONV_TRY7BYTES:
 *             UTF-8 output strings will have no BOM if every byte
 *             is smaller than 0x80.
 *             *oututf: pdc_bytes.
 *
 *             PDC_CONV_TRYBYTES:
 *             UTF-16xx output strings will be converted by LBP
 *             if every character is smaller than 0x0100.
 *             *oututf: pdc_bytes.
 *
 *             PDC_CONV_WITHBOM:
 *             UTF-8 or UTF-16xx output strings will be armed
 *             with an appropriate BOM.
 *
 *             PDC_CONV_NOBOM:
 *             In UTF-8 or UTF-16xx output strings any BOM sequence
 *             will be removed.
 *
 * verbose:    Error messages are put out. Otherwise they are saved only.
 *
 * Output-Parameter:
 *
 * oututf:     Reached format for output string.
 *
 * outstring:  Pointer to the allocated output string
 *
 * outlen:     Length of output string.
775 * 776 */ 777 778int 779pdc_convert_string(pdc_core *pdc, 780 pdc_text_format inutf, pdc_encodingvector *inev, 781 pdc_byte *instring, int inlen, 782 pdc_text_format *oututf_p, pdc_encodingvector *outev, 783 pdc_byte **outstring, int *outlen, int flags, 784 pdc_bool verbose) 785{ 786 static const char *fn = "pdc_convert_string"; 787 pdc_text_format oututf = *oututf_p; 788 pdc_text_format oututf_s; 789 pdc_ushort *usinstr = (pdc_ushort *) instring; 790 pdc_ushort uv = 0; 791 pdc_byte *instr = (pdc_byte *) instring; 792 pdc_bool inalloc = pdc_false; 793 pdc_bool hasbom = pdc_false; 794 pdc_bool toswap = pdc_false; 795 int errcode = 0; 796 int i, j, len; 797 798 /* analyzing 2 byte textformat */ 799 if (inutf == pdc_auto2 || inutf == pdc_bytes2) 800 { 801 if (inutf == pdc_auto2 && !inev) 802 { 803 inutf = pdc_utf16; 804 } 805 else 806 { 807 len = inlen / 2; 808 if (2 * len != inlen) 809 { 810 errcode = PDC_E_CONV_ILLUTF16; 811 goto PDC_CONV_ERROR; 812 } 813 for (i = 0; i < len; i++) 814 if (usinstr[i] > 0x00FF) 815 break; 816 817 /* low byte picking */ 818 if (i == len) 819 { 820 instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (len + 2), fn); 821 for (i = 0; i < len; i++) 822 instr[i] = (pdc_byte) usinstr[i]; 823 instr[len] = 0; 824 instr[len + 1] = 0; 825 826 inalloc = pdc_true; 827 instring = instr; 828 inlen = len; 829 830 if (inutf == pdc_bytes2) 831 inutf = pdc_bytes; 832 else 833 inutf = pdc_auto; 834 } 835 else 836 { 837 inutf = pdc_utf16; 838 } 839 } 840 } 841 842 /* analyzing UTF-16 textformat */ 843 if (inutf == pdc_utf16) 844 { 845 if (pdc_is_utf16be_unicode(instring)) 846 inutf = pdc_utf16be; 847 else if (pdc_is_utf16le_unicode(instring)) 848 inutf = pdc_utf16le; 849 } 850 851 /* analyzing auto textformat */ 852 else if (inutf == pdc_auto) 853 { 854 if (pdc_is_utf8_unicode(instring)) 855 inutf = pdc_utf8; 856 else if (pdc_is_utf16be_unicode(instring)) 857 inutf = pdc_utf16be; 858 else if (pdc_is_utf16le_unicode(instring)) 859 inutf = pdc_utf16le; 860 else 
if (inev) 861 inutf = pdc_bytes; 862 else 863 inutf = pdc_utf16; 864 } 865 866 /* conversion to UTF-16 by swapping */ 867 if ((inutf == pdc_utf16be || inutf == pdc_utf16le) && 868 (inutf != oututf || flags & PDC_CONV_TRYBYTES)) 869 { 870 if (inlen && 871 ((inutf == pdc_utf16be && !PDC_ISBIGENDIAN) || 872 (inutf == pdc_utf16le && PDC_ISBIGENDIAN))) 873 { 874 if (inalloc) 875 pdc_swap_bytes((char *) instring, inlen, NULL); 876 else 877 { 878 instr = (pdc_byte *) pdc_malloc(pdc, (size_t) inlen, fn); 879 pdc_swap_bytes((char *) instring, inlen, (char *) instr); 880 881 inalloc = pdc_true; 882 instring = instr; 883 } 884 } 885 inutf = pdc_utf16; 886 } 887 888 /* conversion to UTF-16 by inflation or encoding vector */ 889 if (inutf == pdc_bytes) 890 { 891 if ((oututf != pdc_bytes && !(flags & PDC_CONV_KEEPBYTES)) || 892 inev != NULL || outev != NULL) 893 { 894 len = 2 * inlen; 895 instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (len + 2), fn); 896 usinstr = (pdc_ushort *) instr; 897 898 for (i = 0; i < inlen; i++) 899 { 900 uv = (pdc_ushort) instring[i]; 901 if (inev && uv) 902 { 903 uv = inev->codes[uv]; 904 if (!uv) uv = 0x0020; 905 } 906 usinstr[i] = uv; 907 } 908 909 if (inalloc) 910 pdc_free(pdc, instring); 911 912 inalloc = pdc_true; 913 instring = instr; 914 inlen = len; 915 inutf = pdc_utf16; 916 } 917 else if (flags & PDC_CONV_KEEPBYTES) 918 { 919 oututf = pdc_bytes; 920 } 921 } 922 923 /* illegal UTF-16 */ 924 if (inutf != pdc_bytes && inutf != pdc_utf8 && inlen % 2) 925 { 926 if (inalloc) 927 pdc_free(pdc, instring); 928 errcode = PDC_E_CONV_ILLUTF16; 929 goto PDC_CONV_ERROR; 930 } 931 932 /* UTF conversion */ 933 oututf_s = oututf; 934 if ((oututf_s == pdc_bytes && inutf == pdc_utf8) || 935 oututf_s == pdc_utf16be || oututf_s == pdc_utf16le) 936 oututf_s = pdc_utf16; 937 if (inutf != oututf_s && oututf_s != pdc_bytes) 938 { 939 len = 4 * inlen + 2; 940 instr = (pdc_byte *) pdc_malloc(pdc, (size_t) len, fn); 941 942 if (inlen) 943 { 944 pdc_convers_result 
result; 945 pdc_byte *instringa, *instra, *instringe, *instre; 946 947 instringa = instring; 948 instringe = instring + inlen; 949 instra = instr; 950 instre = instr + len; 951 952 if (inutf == pdc_utf8) 953 result = pdc_convertUTF8toUTF16( 954 (UTF8 **) &instringa, (UTF8 *) instringe, 955 (UTF16 **) &instra, (UTF16 *) instre, 956 strictConversion); 957 else 958 result = pdc_convertUTF16toUTF8( 959 (UTF16 **) &instringa, (UTF16 *) instringe, 960 (UTF8 **) &instra, (UTF8 *) instre, 961 strictConversion); 962 963 if (inalloc) 964 pdc_free(pdc, instring); 965 966 switch (result) 967 { 968 case targetExhausted: 969 errcode = PDC_E_CONV_MEMOVERFLOW; 970 break; 971 972 case sourceExhausted: 973 case sourceIllegal: 974 errcode = PDC_E_CONV_ILLUTF; 975 break; 976 977 default: 978 break; 979 } 980 981 if (errcode) 982 { 983 pdc_free(pdc, instr); 984 goto PDC_CONV_ERROR; 985 } 986 987 inlen = instra - instr; 988 } 989 990 if (inlen + 2 != len) 991 instr = pdc_realloc(pdc, instr, (size_t) (inlen + 2), fn); 992 instr[inlen] = 0; 993 instr[inlen + 1] = 0; 994 995 inalloc = pdc_true; 996 instring = instr; 997 inutf = oututf_s; 998 } 999 1000 if (inutf == pdc_bytes) 1001 { 1002 if (!inalloc) 1003 { 1004 instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (inlen + 2), fn); 1005 memcpy(instr, instring, (size_t) inlen); 1006 instr[inlen] = 0; 1007 instr[inlen + 1] = 0; 1008 1009 instring = instr; 1010 } 1011 } 1012 1013 /* trying to reduce UTF-16 string to bytes string */ 1014 if (inutf == pdc_utf16 && 1015 (flags & PDC_CONV_TRYBYTES || oututf == pdc_bytes)) 1016 { 1017 len = inlen / 2; 1018 instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (len + 2), fn); 1019 usinstr = (pdc_ushort *) instring; 1020 1021 for (i = 0; i < len; i++) 1022 { 1023 uv = usinstr[i]; 1024 if (outev && uv) 1025 uv = (pdc_ushort) pdc_get_encoding_bytecode(pdc, outev, uv); 1026 if (uv > 0x00FF) 1027 break; 1028 1029 instr[i] = (pdc_byte) uv; 1030 } 1031 1032 if (i == len) 1033 { 1034 instr[len] = 0; 1035 instr[len + 1] 
= 0; 1036 1037 if (inalloc) 1038 pdc_free(pdc, instring); 1039 1040 inalloc = pdc_true; 1041 instring = instr; 1042 inlen = len; 1043 inutf = pdc_bytes; 1044 } 1045 else 1046 pdc_free(pdc, instr); 1047 } 1048 1049 /* UTF-8 format */ 1050 if (inutf == pdc_utf8) 1051 { 1052 hasbom = pdc_is_utf8_unicode(instring); 1053 1054 if (flags & PDC_CONV_TRY7BYTES) 1055 { 1056 for (i = hasbom ? 3 : 0; i < inlen; i++) 1057 if (instring[i] > 0x7F) 1058 break; 1059 if (i == inlen) 1060 { 1061 flags &= ~PDC_CONV_WITHBOM; 1062 flags |= PDC_CONV_NOBOM; 1063 inutf = pdc_bytes; 1064 } 1065 } 1066 1067 if (!inalloc || flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM) 1068 { 1069 i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 3 : 0; 1070 j = (flags & PDC_CONV_NOBOM && hasbom) ? 3 : 0; 1071 1072 len = inlen + i - j; 1073 instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (len + 1), fn); 1074 memcpy(&instr[i], &instring[j], (size_t) (inlen - j)); 1075 instr[len] = 0; 1076 1077 if (inalloc) 1078 pdc_free(pdc, instring); 1079 1080 instring = instr; 1081 inlen = len; 1082 1083 hasbom = (flags & PDC_CONV_WITHBOM); 1084 } 1085 1086 if (hasbom) 1087 { 1088 instring[0] = PDF_BOM2; 1089 instring[1] = PDF_BOM3; 1090 instring[2] = PDF_BOM4; 1091 } 1092 } 1093 1094 /* UTF-16 formats */ 1095 if (inutf == pdc_utf16 || inutf == pdc_utf16be || inutf == pdc_utf16le) 1096 { 1097 hasbom = pdc_is_utf16be_unicode(instring) || 1098 pdc_is_utf16le_unicode(instring); 1099 1100 if (!inalloc || oututf == pdc_utf16be || oututf == pdc_utf16le || 1101 flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM) 1102 { 1103 i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 2 : 0; 1104 j = (flags & PDC_CONV_NOBOM && hasbom) ? 
2 : 0; 1105 1106 len = inlen + i - j; 1107 instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (len + 2), fn); 1108 memcpy(&instr[i], &instring[j], (size_t) (inlen - j)); 1109 instr[len] = 0; 1110 instr[len + 1] = 0; 1111 1112 if (inalloc) 1113 pdc_free(pdc, instring); 1114 1115 instring = instr; 1116 inlen = len; 1117 1118 hasbom = (flags & PDC_CONV_WITHBOM); 1119 } 1120 1121 i = hasbom ? 2 : 0; 1122 if (inutf == pdc_utf16) 1123 { 1124 if (oututf == pdc_utf16be) 1125 { 1126 inutf = pdc_utf16be; 1127 toswap = !PDC_ISBIGENDIAN; 1128 } 1129 if (oututf == pdc_utf16le) 1130 { 1131 inutf = pdc_utf16le; 1132 toswap = PDC_ISBIGENDIAN; 1133 } 1134 if (toswap) 1135 pdc_swap_bytes((char *) &instring[i], inlen - i, NULL); 1136 } 1137 1138 if (hasbom) 1139 { 1140 if (inutf == pdc_utf16be || 1141 (inutf == pdc_utf16 && PDC_ISBIGENDIAN)) 1142 { 1143 instring[0] = PDF_BOM0; 1144 instring[1] = PDF_BOM1; 1145 } 1146 if (inutf == pdc_utf16le || 1147 (inutf == pdc_utf16 && !PDC_ISBIGENDIAN)) 1148 { 1149 instring[0] = PDF_BOM1; 1150 instring[1] = PDF_BOM0; 1151 } 1152 } 1153 } 1154 1155 *oututf_p = inutf; 1156 *outlen = inlen; 1157 *outstring = instring; 1158 return 0; 1159 1160 PDC_CONV_ERROR: 1161 *outlen = 0; 1162 *outstring = NULL; 1163 1164 if (errcode == PDC_E_CONV_ILLUTF) 1165 { 1166 const char *stemp = 1167 pdc_errprintf(pdc, "%d", inutf == pdc_utf8 ? 8 : 16); 1168 pdc_set_errmsg(pdc, errcode, stemp, 0, 0, 0); 1169 } 1170 else 1171 pdc_set_errmsg(pdc, errcode, 0, 0, 0, 0); 1172 1173 if (verbose) 1174 pdc_error(pdc, -1, 0, 0, 0, 0); 1175 1176 return errcode; 1177} 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215