1/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */ 2 3/* 4 * This file is part of The Croco Library 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of version 2.1 of the GNU General Public 8 * License as published by the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program; if not, write to the Free Software 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 * USA 19 * 20 * Author: Dodji Seketeli 21 * See COPYRIGHTS file for copyright information. 22 */ 23 24#include <config.h> 25#include "cr-utils.h" 26#include "cr-string.h" 27 28/** 29 *@file: 30 *Some misc utility functions used 31 *in the libcroco. 32 *Note that troughout this file I will 33 *refer to the CSS SPECIFICATIONS DOCUMENTATION 34 *written by the w3c guys. You can find that document 35 *at http://www.w3.org/TR/REC-CSS2/ . 36 */ 37 38/**************************** 39 *Encoding transformations and 40 *encoding helpers 41 ****************************/ 42 43/* 44 *Here is the correspondance between the ucs-4 charactere codes 45 *and there matching utf-8 encoding pattern as dscribed by RFC 2279: 46 * 47 *UCS-4 range (hex.) UTF-8 octet sequence (binary) 48 *------------------ ----------------------------- 49 *0000 0000-0000 007F 0xxxxxxx 50 *0000 0080-0000 07FF 110xxxxx 10xxxxxx 51 *0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 52 *0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 53 *0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 54 *0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx 55 */ 56 57/** 58 *Given an utf8 string buffer, calculates 59 *the length of this string if it was encoded 60 *in ucs4. 61 *@param a_in_start a pointer to the begining of 62 *the input utf8 string. 63 *@param a_in_end a pointre to the end of the input 64 *utf8 string (points to the last byte of the buffer) 65 *@param a_len out parameter the calculated length. 66 *@return CR_OK upon succesfull completion, an error code 67 *otherwise. 68 */ 69enum CRStatus 70cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start, 71 const guchar * a_in_end, gulong * a_len) 72{ 73 guchar *byte_ptr = NULL; 74 gint len = 0; 75 76 /* 77 *to store the final decoded 78 *unicode char 79 */ 80 guint c = 0; 81 82 g_return_val_if_fail (a_in_start && a_in_end && a_len, 83 CR_BAD_PARAM_ERROR); 84 *a_len = 0; 85 86 for (byte_ptr = (guchar *) a_in_start; 87 byte_ptr <= a_in_end; byte_ptr++) { 88 gint nb_bytes_2_decode = 0; 89 90 if (*byte_ptr <= 0x7F) { 91 /* 92 *7 bits long char 93 *encoded over 1 byte: 94 * 0xxx xxxx 95 */ 96 c = *byte_ptr; 97 nb_bytes_2_decode = 1; 98 99 } else if ((*byte_ptr & 0xE0) == 0xC0) { 100 /* 101 *up to 11 bits long char. 102 *encoded over 2 bytes: 103 *110x xxxx 10xx xxxx 104 */ 105 c = *byte_ptr & 0x1F; 106 nb_bytes_2_decode = 2; 107 108 } else if ((*byte_ptr & 0xF0) == 0xE0) { 109 /* 110 *up to 16 bit long char 111 *encoded over 3 bytes: 112 *1110 xxxx 10xx xxxx 10xx xxxx 113 */ 114 c = *byte_ptr & 0x0F; 115 nb_bytes_2_decode = 3; 116 117 } else if ((*byte_ptr & 0xF8) == 0xF0) { 118 /* 119 *up to 21 bits long char 120 *encoded over 4 bytes: 121 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 122 */ 123 c = *byte_ptr & 0x7; 124 nb_bytes_2_decode = 4; 125 126 } else if ((*byte_ptr & 0xFC) == 0xF8) { 127 /* 128 *up to 26 bits long char 129 *encoded over 5 bytes. 130 *1111 10xx 10xx xxxx 10xx xxxx 131 *10xx xxxx 10xx xxxx 132 */ 133 c = *byte_ptr & 3; 134 nb_bytes_2_decode = 5; 135 136 } else if ((*byte_ptr & 0xFE) == 0xFC) { 137 /* 138 *up to 31 bits long char 139 *encoded over 6 bytes: 140 *1111 110x 10xx xxxx 10xx xxxx 141 *10xx xxxx 10xx xxxx 10xx xxxx 142 */ 143 c = *byte_ptr & 1; 144 nb_bytes_2_decode = 6; 145 146 } else { 147 /* 148 *BAD ENCODING 149 */ 150 return CR_ENCODING_ERROR; 151 } 152 153 /* 154 *Go and decode the remaining byte(s) 155 *(if any) to get the current character. 156 */ 157 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 158 /*decode the next byte */ 159 byte_ptr++; 160 161 /*byte pattern must be: 10xx xxxx */ 162 if ((*byte_ptr & 0xC0) != 0x80) { 163 return CR_ENCODING_ERROR; 164 } 165 166 c = (c << 6) | (*byte_ptr & 0x3F); 167 } 168 169 len++; 170 } 171 172 *a_len = len; 173 174 return CR_OK; 175} 176 177/** 178 *Given an ucs4 string, this function 179 *returns the size (in bytes) this string 180 *would have occupied if it was encoded in utf-8. 181 *@param a_in_start a pointer to the beginning of the input 182 *buffer. 183 *@param a_in_end a pointer to the end of the input buffer. 184 *@param a_len out parameter. The computed length. 185 *@return CR_OK upon successfull completion, an error code otherwise. 186 */ 187enum CRStatus 188cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start, 189 const guint32 * a_in_end, gulong * a_len) 190{ 191 gint len = 0; 192 guint32 *char_ptr = NULL; 193 194 g_return_val_if_fail (a_in_start && a_in_end && a_len, 195 CR_BAD_PARAM_ERROR); 196 197 for (char_ptr = (guint32 *) a_in_start; 198 char_ptr <= a_in_end; char_ptr++) { 199 if (*char_ptr <= 0x7F) { 200 /*the utf-8 char would take 1 byte */ 201 len += 1; 202 } else if (*char_ptr <= 0x7FF) { 203 /*the utf-8 char would take 2 bytes */ 204 len += 2; 205 } else if (*char_ptr <= 0xFFFF) { 206 len += 3; 207 } else if (*char_ptr <= 0x1FFFFF) { 208 len += 4; 209 } else if (*char_ptr <= 0x3FFFFFF) { 210 len += 5; 211 } else if (*char_ptr <= 0x7FFFFFFF) { 212 len += 6; 213 } 214 } 215 216 *a_len = len; 217 return CR_OK; 218} 219 220/** 221 *Given an ucsA string, this function 222 *returns the size (in bytes) this string 223 *would have occupied if it was encoded in utf-8. 224 *@param a_in_start a pointer to the beginning of the input 225 *buffer. 226 *@param a_in_end a pointer to the end of the input buffer. 227 *@param a_len out parameter. The computed length. 228 *@return CR_OK upon successfull completion, an error code otherwise. 229 */ 230enum CRStatus 231cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start, 232 const guchar * a_in_end, gulong * a_len) 233{ 234 gint len = 0; 235 guchar *char_ptr = NULL; 236 237 g_return_val_if_fail (a_in_start && a_in_end && a_len, 238 CR_BAD_PARAM_ERROR); 239 240 for (char_ptr = (guchar *) a_in_start; 241 char_ptr <= a_in_end; char_ptr++) { 242 if (*char_ptr <= 0x7F) { 243 /*the utf-8 char would take 1 byte */ 244 len += 1; 245 } else { 246 /*the utf-8 char would take 2 bytes */ 247 len += 2; 248 } 249 } 250 251 *a_len = len; 252 return CR_OK; 253} 254 255/** 256 *Converts an utf8 buffer into an ucs4 buffer. 257 * 258 *@param a_in the input utf8 buffer to convert. 259 *@param a_in_len in/out parameter. The size of the 260 *input buffer to convert. After return, this parameter contains 261 *the actual number of bytes consumed. 262 *@param a_out the output converted ucs4 buffer. Must be allocated by 263 *the caller. 264 *@param a_out_len in/out parameter. The size of the output buffer. 265 *If this size is actually smaller than the real needed size, the function 266 *just converts what it can and returns a success status. After return, 267 *this param points to the actual number of characters decoded. 268 *@return CR_OK upon successfull completion, an error code otherwise. 269 */ 270enum CRStatus 271cr_utils_utf8_to_ucs4 (const guchar * a_in, 272 gulong * a_in_len, guint32 * a_out, gulong * a_out_len) 273{ 274 gulong in_len = 0, 275 out_len = 0, 276 in_index = 0, 277 out_index = 0; 278 enum CRStatus status = CR_OK; 279 280 /* 281 *to store the final decoded 282 *unicode char 283 */ 284 guint c = 0; 285 286 g_return_val_if_fail (a_in && a_in_len 287 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 288 289 if (*a_in_len < 1) { 290 status = CR_OK; 291 goto end; 292 } 293 294 in_len = *a_in_len; 295 out_len = *a_out_len; 296 297 for (in_index = 0, out_index = 0; 298 (in_index < in_len) && (out_index < out_len); 299 in_index++, out_index++) { 300 gint nb_bytes_2_decode = 0; 301 302 if (a_in[in_index] <= 0x7F) { 303 /* 304 *7 bits long char 305 *encoded over 1 byte: 306 * 0xxx xxxx 307 */ 308 c = a_in[in_index]; 309 nb_bytes_2_decode = 1; 310 311 } else if ((a_in[in_index] & 0xE0) == 0xC0) { 312 /* 313 *up to 11 bits long char. 314 *encoded over 2 bytes: 315 *110x xxxx 10xx xxxx 316 */ 317 c = a_in[in_index] & 0x1F; 318 nb_bytes_2_decode = 2; 319 320 } else if ((a_in[in_index] & 0xF0) == 0xE0) { 321 /* 322 *up to 16 bit long char 323 *encoded over 3 bytes: 324 *1110 xxxx 10xx xxxx 10xx xxxx 325 */ 326 c = a_in[in_index] & 0x0F; 327 nb_bytes_2_decode = 3; 328 329 } else if ((a_in[in_index] & 0xF8) == 0xF0) { 330 /* 331 *up to 21 bits long char 332 *encoded over 4 bytes: 333 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 334 */ 335 c = a_in[in_index] & 0x7; 336 nb_bytes_2_decode = 4; 337 338 } else if ((a_in[in_index] & 0xFC) == 0xF8) { 339 /* 340 *up to 26 bits long char 341 *encoded over 5 bytes. 342 *1111 10xx 10xx xxxx 10xx xxxx 343 *10xx xxxx 10xx xxxx 344 */ 345 c = a_in[in_index] & 3; 346 nb_bytes_2_decode = 5; 347 348 } else if ((a_in[in_index] & 0xFE) == 0xFC) { 349 /* 350 *up to 31 bits long char 351 *encoded over 6 bytes: 352 *1111 110x 10xx xxxx 10xx xxxx 353 *10xx xxxx 10xx xxxx 10xx xxxx 354 */ 355 c = a_in[in_index] & 1; 356 nb_bytes_2_decode = 6; 357 358 } else { 359 /*BAD ENCODING */ 360 goto end; 361 } 362 363 /* 364 *Go and decode the remaining byte(s) 365 *(if any) to get the current character. 366 */ 367 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 368 /*decode the next byte */ 369 in_index++; 370 371 /*byte pattern must be: 10xx xxxx */ 372 if ((a_in[in_index] & 0xC0) != 0x80) { 373 goto end; 374 } 375 376 c = (c << 6) | (a_in[in_index] & 0x3F); 377 } 378 379 /* 380 *The decoded ucs4 char is now 381 *in c. 382 */ 383 384 /************************ 385 *Some security tests 386 ***********************/ 387 388 /*be sure c is a char */ 389 if (c == 0xFFFF || c == 0xFFFE) 390 goto end; 391 392 /*be sure c is inferior to the max ucs4 char value */ 393 if (c > 0x10FFFF) 394 goto end; 395 396 /* 397 *c must be less than UTF16 "lower surrogate begin" 398 *or higher than UTF16 "High surrogate end" 399 */ 400 if (c >= 0xD800 && c <= 0xDFFF) 401 goto end; 402 403 /*Avoid characters that equals zero */ 404 if (c == 0) 405 goto end; 406 407 a_out[out_index] = c; 408 } 409 410 end: 411 *a_out_len = out_index + 1; 412 *a_in_len = in_index + 1; 413 414 return status; 415} 416 417/** 418 *Reads a character from an utf8 buffer. 419 *Actually decode the next character code (unicode character code) 420 *and returns it. 421 *@param a_in the starting address of the utf8 buffer. 422 *@param a_in_len the length of the utf8 buffer. 423 *@param a_out output parameter. The resulting read char. 424 *@param a_consumed the number of the bytes consumed to 425 *decode the returned character code. 426 *@return CR_OK upon successfull completion, an error code otherwise. 427 */ 428enum CRStatus 429cr_utils_read_char_from_utf8_buf (const guchar * a_in, 430 gulong a_in_len, 431 guint32 * a_out, gulong * a_consumed) 432{ 433 gulong in_len = 0, 434 in_index = 0, 435 nb_bytes_2_decode = 0; 436 enum CRStatus status = CR_OK; 437 438 /* 439 *to store the final decoded 440 *unicode char 441 */ 442 guint32 c = 0; 443 444 g_return_val_if_fail (a_in && a_out && a_out 445 && a_consumed, CR_BAD_PARAM_ERROR); 446 447 if (a_in_len < 1) { 448 status = CR_OK; 449 goto end; 450 } 451 452 in_len = a_in_len; 453 454 if (*a_in <= 0x7F) { 455 /* 456 *7 bits long char 457 *encoded over 1 byte: 458 * 0xxx xxxx 459 */ 460 c = *a_in; 461 nb_bytes_2_decode = 1; 462 463 } else if ((*a_in & 0xE0) == 0xC0) { 464 /* 465 *up to 11 bits long char. 466 *encoded over 2 bytes: 467 *110x xxxx 10xx xxxx 468 */ 469 c = *a_in & 0x1F; 470 nb_bytes_2_decode = 2; 471 472 } else if ((*a_in & 0xF0) == 0xE0) { 473 /* 474 *up to 16 bit long char 475 *encoded over 3 bytes: 476 *1110 xxxx 10xx xxxx 10xx xxxx 477 */ 478 c = *a_in & 0x0F; 479 nb_bytes_2_decode = 3; 480 481 } else if ((*a_in & 0xF8) == 0xF0) { 482 /* 483 *up to 21 bits long char 484 *encoded over 4 bytes: 485 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 486 */ 487 c = *a_in & 0x7; 488 nb_bytes_2_decode = 4; 489 490 } else if ((*a_in & 0xFC) == 0xF8) { 491 /* 492 *up to 26 bits long char 493 *encoded over 5 bytes. 494 *1111 10xx 10xx xxxx 10xx xxxx 495 *10xx xxxx 10xx xxxx 496 */ 497 c = *a_in & 3; 498 nb_bytes_2_decode = 5; 499 500 } else if ((*a_in & 0xFE) == 0xFC) { 501 /* 502 *up to 31 bits long char 503 *encoded over 6 bytes: 504 *1111 110x 10xx xxxx 10xx xxxx 505 *10xx xxxx 10xx xxxx 10xx xxxx 506 */ 507 c = *a_in & 1; 508 nb_bytes_2_decode = 6; 509 510 } else { 511 /*BAD ENCODING */ 512 goto end; 513 } 514 515 if (nb_bytes_2_decode > a_in_len) { 516 status = CR_END_OF_INPUT_ERROR; 517 goto end; 518 } 519 520 /* 521 *Go and decode the remaining byte(s) 522 *(if any) to get the current character. 523 */ 524 for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) { 525 /*byte pattern must be: 10xx xxxx */ 526 if ((a_in[in_index] & 0xC0) != 0x80) { 527 goto end; 528 } 529 530 c = (c << 6) | (a_in[in_index] & 0x3F); 531 } 532 533 /* 534 *The decoded ucs4 char is now 535 *in c. 536 */ 537 538 /************************ 539 *Some security tests 540 ***********************/ 541 542 /*be sure c is a char */ 543 if (c == 0xFFFF || c == 0xFFFE) 544 goto end; 545 546 /*be sure c is inferior to the max ucs4 char value */ 547 if (c > 0x10FFFF) 548 goto end; 549 550 /* 551 *c must be less than UTF16 "lower surrogate begin" 552 *or higher than UTF16 "High surrogate end" 553 */ 554 if (c >= 0xD800 && c <= 0xDFFF) 555 goto end; 556 557 /*Avoid characters that equals zero */ 558 if (c == 0) 559 goto end; 560 561 *a_out = c; 562 563 end: 564 *a_consumed = nb_bytes_2_decode; 565 566 return status; 567} 568 569/** 570 * 571 */ 572enum CRStatus 573cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start, 574 const guchar * a_in_end, gulong * a_len) 575{ 576 /* 577 *Note: this function can be made shorter 578 *but it considers all the cases of the utf8 encoding 579 *to ease further extensions ... 580 */ 581 582 guchar *byte_ptr = NULL; 583 gint len = 0; 584 585 /* 586 *to store the final decoded 587 *unicode char 588 */ 589 guint c = 0; 590 591 g_return_val_if_fail (a_in_start && a_in_end && a_len, 592 CR_BAD_PARAM_ERROR); 593 *a_len = 0; 594 595 for (byte_ptr = (guchar *) a_in_start; 596 byte_ptr <= a_in_end; byte_ptr++) { 597 gint nb_bytes_2_decode = 0; 598 599 if (*byte_ptr <= 0x7F) { 600 /* 601 *7 bits long char 602 *encoded over 1 byte: 603 * 0xxx xxxx 604 */ 605 c = *byte_ptr; 606 nb_bytes_2_decode = 1; 607 608 } else if ((*byte_ptr & 0xE0) == 0xC0) { 609 /* 610 *up to 11 bits long char. 611 *encoded over 2 bytes: 612 *110x xxxx 10xx xxxx 613 */ 614 c = *byte_ptr & 0x1F; 615 nb_bytes_2_decode = 2; 616 617 } else if ((*byte_ptr & 0xF0) == 0xE0) { 618 /* 619 *up to 16 bit long char 620 *encoded over 3 bytes: 621 *1110 xxxx 10xx xxxx 10xx xxxx 622 */ 623 c = *byte_ptr & 0x0F; 624 nb_bytes_2_decode = 3; 625 626 } else if ((*byte_ptr & 0xF8) == 0xF0) { 627 /* 628 *up to 21 bits long char 629 *encoded over 4 bytes: 630 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 631 */ 632 c = *byte_ptr & 0x7; 633 nb_bytes_2_decode = 4; 634 635 } else if ((*byte_ptr & 0xFC) == 0xF8) { 636 /* 637 *up to 26 bits long char 638 *encoded over 5 bytes. 639 *1111 10xx 10xx xxxx 10xx xxxx 640 *10xx xxxx 10xx xxxx 641 */ 642 c = *byte_ptr & 3; 643 nb_bytes_2_decode = 5; 644 645 } else if ((*byte_ptr & 0xFE) == 0xFC) { 646 /* 647 *up to 31 bits long char 648 *encoded over 6 bytes: 649 *1111 110x 10xx xxxx 10xx xxxx 650 *10xx xxxx 10xx xxxx 10xx xxxx 651 */ 652 c = *byte_ptr & 1; 653 nb_bytes_2_decode = 6; 654 655 } else { 656 /* 657 *BAD ENCODING 658 */ 659 return CR_ENCODING_ERROR; 660 } 661 662 /* 663 *Go and decode the remaining byte(s) 664 *(if any) to get the current character. 665 */ 666 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 667 /*decode the next byte */ 668 byte_ptr++; 669 670 /*byte pattern must be: 10xx xxxx */ 671 if ((*byte_ptr & 0xC0) != 0x80) { 672 return CR_ENCODING_ERROR; 673 } 674 675 c = (c << 6) | (*byte_ptr & 0x3F); 676 } 677 678 /* 679 *The decoded ucs4 char is now 680 *in c. 681 */ 682 683 if (c <= 0xFF) { /*Add other conditions to support 684 *other char sets (ucs2, ucs3, ucs4). 685 */ 686 len++; 687 } else { 688 /*the char is too long to fit 689 *into the supposed charset len. 690 */ 691 return CR_ENCODING_ERROR; 692 } 693 } 694 695 *a_len = len; 696 697 return CR_OK; 698} 699 700/** 701 *Converts an utf8 string into an ucs4 string. 702 *@param a_in the input string to convert. 703 *@param a_in_len in/out parameter. The length of the input 704 *string. After return, points to the actual number of bytes 705 *consumed. This can be usefull to debug the input stream in case 706 *of encoding error. 707 *@param a_out out parameter. Points to the output string. It is allocated 708 *by this function and must be freed by the caller. 709 *@param a_out_len out parameter. The length of the output string. 710 *@return CR_OK upon successfull completion, an error code otherwise. 711 * 712 */ 713enum CRStatus 714cr_utils_utf8_str_to_ucs4 (const guchar * a_in, 715 gulong * a_in_len, 716 guint32 ** a_out, gulong * a_out_len) 717{ 718 enum CRStatus status = CR_OK; 719 720 g_return_val_if_fail (a_in && a_in_len 721 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 722 723 status = cr_utils_utf8_str_len_as_ucs4 (a_in, 724 &a_in[*a_in_len - 1], 725 a_out_len); 726 727 g_return_val_if_fail (status == CR_OK, status); 728 729 *a_out = g_malloc0 (*a_out_len * sizeof (guint32)); 730 731 status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len); 732 733 return status; 734} 735 736/** 737 *Converts an ucs4 buffer into an utf8 buffer. 738 * 739 *@param a_in the input ucs4 buffer to convert. 740 *@param a_in_len in/out parameter. The size of the 741 *input buffer to convert. After return, this parameter contains 742 *the actual number of characters consumed. 743 *@param a_out the output converted utf8 buffer. Must be allocated by 744 *the caller. 745 *@param a_out_len in/out parameter. The size of the output buffer. 746 *If this size is actually smaller than the real needed size, the function 747 *just converts what it can and returns a success status. After return, 748 *this param points to the actual number of bytes in the buffer. 749 *@return CR_OK upon successfull completion, an error code otherwise. 750 */ 751enum CRStatus 752cr_utils_ucs4_to_utf8 (const guint32 * a_in, 753 gulong * a_in_len, guchar * a_out, gulong * a_out_len) 754{ 755 gulong in_len = 0, 756 in_index = 0, 757 out_index = 0; 758 enum CRStatus status = CR_OK; 759 760 g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, 761 CR_BAD_PARAM_ERROR); 762 763 if (*a_in_len < 1) { 764 status = CR_OK; 765 goto end; 766 } 767 768 in_len = *a_in_len; 769 770 for (in_index = 0; in_index < in_len; in_index++) { 771 /* 772 *FIXME: return whenever we encounter forbidden char values. 773 */ 774 775 if (a_in[in_index] <= 0x7F) { 776 a_out[out_index] = a_in[in_index]; 777 out_index++; 778 } else if (a_in[in_index] <= 0x7FF) { 779 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)); 780 a_out[out_index + 1] = 781 (0x80 | (a_in[in_index] & 0x3F)); 782 out_index += 2; 783 } else if (a_in[in_index] <= 0xFFFF) { 784 a_out[out_index] = (0xE0 | (a_in[in_index] >> 12)); 785 a_out[out_index + 1] = 786 (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 787 a_out[out_index + 2] = 788 (0x80 | (a_in[in_index] & 0x3F)); 789 out_index += 3; 790 } else if (a_in[in_index] <= 0x1FFFFF) { 791 a_out[out_index] = (0xF0 | (a_in[in_index] >> 18)); 792 a_out[out_index + 1] 793 = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); 794 a_out[out_index + 2] 795 = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 796 a_out[out_index + 3] 797 = (0x80 | (a_in[in_index] & 0x3F)); 798 out_index += 4; 799 } else if (a_in[in_index] <= 0x3FFFFFF) { 800 a_out[out_index] = (0xF8 | (a_in[in_index] >> 24)); 801 a_out[out_index + 1] = 802 (0x80 | (a_in[in_index] >> 18)); 803 a_out[out_index + 2] 804 = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); 805 a_out[out_index + 3] 806 = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 807 a_out[out_index + 4] 808 = (0x80 | (a_in[in_index] & 0x3F)); 809 out_index += 5; 810 } else if (a_in[in_index] <= 0x7FFFFFFF) { 811 a_out[out_index] = (0xFC | (a_in[in_index] >> 30)); 812 a_out[out_index + 1] = 813 (0x80 | (a_in[in_index] >> 24)); 814 a_out[out_index + 2] 815 = (0x80 | ((a_in[in_index] >> 18) & 0x3F)); 816 a_out[out_index + 3] 817 = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); 818 a_out[out_index + 4] 819 = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 820 a_out[out_index + 4] 821 = (0x80 | (a_in[in_index] & 0x3F)); 822 out_index += 6; 823 } else { 824 status = CR_ENCODING_ERROR; 825 goto end; 826 } 827 } /*end for */ 828 829 end: 830 *a_in_len = in_index + 1; 831 *a_out_len = out_index + 1; 832 833 return status; 834} 835 836/** 837 *Converts an ucs4 string into an utf8 string. 838 *@param a_in the input string to convert. 839 *@param a_in_len in/out parameter. The length of the input 840 *string. After return, points to the actual number of characters 841 *consumed. This can be usefull to debug the input string in case 842 *of encoding error. 843 *@param a_out out parameter. Points to the output string. It is allocated 844 *by this function and must be freed by the caller. 845 *@param a_out_len out parameter. The length (in bytes) of the output string. 846 *@return CR_OK upon successfull completion, an error code otherwise. 847 */ 848enum CRStatus 849cr_utils_ucs4_str_to_utf8 (const guint32 * a_in, 850 gulong * a_in_len, 851 guchar ** a_out, gulong * a_out_len) 852{ 853 enum CRStatus status = CR_OK; 854 855 g_return_val_if_fail (a_in && a_in_len && a_out 856 && a_out_len, CR_BAD_PARAM_ERROR); 857 858 status = cr_utils_ucs4_str_len_as_utf8 (a_in, 859 &a_in[*a_out_len - 1], 860 a_out_len); 861 862 g_return_val_if_fail (status == CR_OK, status); 863 864 status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len); 865 866 return status; 867} 868 869/** 870 *Converts an ucs1 buffer into an utf8 buffer. 871 *The caller must know the size of the resulting buffer and 872 *allocate it prior to calling this function. 873 * 874 *@param a_in the input ucs1 buffer. 875 * 876 *@param a_in_len in/out parameter. The length of the input buffer. 877 *After return, points to the number of bytes actually consumed even 878 *in case of encoding error. 879 * 880 *@param a_out out parameter. The output utf8 converted buffer. 881 * 882 *@param a_out_len in/out parameter. The size of the output buffer. 883 *If the output buffer size is shorter than the actual needed size, 884 *this function just convert what it can. 885 * 886 *@return CR_OK upon successfull completion, an error code otherwise. 887 * 888 */ 889enum CRStatus 890cr_utils_ucs1_to_utf8 (const guchar * a_in, 891 gulong * a_in_len, guchar * a_out, gulong * a_out_len) 892{ 893 gulong out_index = 0, 894 in_index = 0, 895 in_len = 0, 896 out_len = 0; 897 enum CRStatus status = CR_OK; 898 899 g_return_val_if_fail (a_in && a_in_len 900 && a_out_len, 901 CR_BAD_PARAM_ERROR); 902 903 if (*a_in_len == 0) { 904 *a_out_len = 0 ; 905 return CR_OK ; 906 } 907 g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ; 908 909 if (*a_in_len < 1) { 910 status = CR_OK; 911 goto end; 912 } 913 914 in_len = *a_in_len; 915 out_len = *a_out_len; 916 917 for (in_index = 0, out_index = 0; 918 (in_index < in_len) && (out_index < out_len); in_index++) { 919 /* 920 *FIXME: return whenever we encounter forbidden char values. 921 */ 922 923 if (a_in[in_index] <= 0x7F) { 924 a_out[out_index] = a_in[in_index]; 925 out_index++; 926 } else { 927 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)); 928 a_out[out_index + 1] = 929 (0x80 | (a_in[in_index] & 0x3F)); 930 out_index += 2; 931 } 932 } /*end for */ 933 934 end: 935 *a_in_len = in_index; 936 *a_out_len = out_index; 937 938 return CR_OK; 939} 940 941/** 942 *Converts an ucs1 string into an utf8 string. 943 *@param a_in_start the beginning of the input string to convert. 944 *@param a_in_end the end of the input string to convert. 945 *@param a_out out parameter. The converted string. 946 *@param a_out out parameter. The length of the converted string. 947 *@return CR_OK upon successfull completion, an error code otherwise. 948 * 949 */ 950enum CRStatus 951cr_utils_ucs1_str_to_utf8 (const guchar * a_in, 952 gulong * a_in_len, 953 guchar ** a_out, gulong * a_out_len) 954{ 955 gulong in_len = 0, 956 out_len = 0; 957 enum CRStatus status = CR_OK; 958 959 g_return_val_if_fail (a_in && a_in_len && a_out 960 && a_out_len, CR_BAD_PARAM_ERROR); 961 962 if (*a_in_len < 1) { 963 *a_out_len = 0; 964 *a_out = NULL; 965 return CR_OK; 966 } 967 968 status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1], 969 &out_len); 970 971 g_return_val_if_fail (status == CR_OK, status); 972 973 in_len = *a_in_len; 974 975 *a_out = g_malloc0 (out_len); 976 977 status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len); 978 979 *a_out_len = out_len; 980 981 return status; 982} 983 984/** 985 *Converts an utf8 buffer into an ucs1 buffer. 986 *The caller must know the size of the resulting 987 *converted buffer, and allocated it prior to calling this 988 *function. 989 * 990 *@param a_in the input utf8 buffer to convert. 991 * 992 *@param a_in_len in/out parameter. The size of the input utf8 buffer. 993 *After return, points to the number of bytes consumed 994 *by the function even in case of encoding error. 995 * 996 *@param a_out out parameter. Points to the resulting buffer. 997 *Must be allocated by the caller. If the size of a_out is shorter 998 *than its required size, this function converts what it can and return 999 *a successfull status. 1000 * 1001 *@param a_out_len in/out parameter. The size of the output buffer. 1002 *After return, points to the number of bytes consumed even in case of 1003 *encoding error. 1004 * 1005 *@return CR_OK upon successfull completion, an error code otherwise. 1006 */ 1007enum CRStatus 1008cr_utils_utf8_to_ucs1 (const guchar * a_in, 1009 gulong * a_in_len, guchar * a_out, gulong * a_out_len) 1010{ 1011 gulong in_index = 0, 1012 out_index = 0, 1013 in_len = 0, 1014 out_len = 0; 1015 enum CRStatus status = CR_OK; 1016 1017 /* 1018 *to store the final decoded 1019 *unicode char 1020 */ 1021 guint32 c = 0; 1022 1023 g_return_val_if_fail (a_in && a_in_len 1024 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 1025 1026 if (*a_in_len < 1) { 1027 status = CR_OK; 1028 goto end; 1029 } 1030 1031 in_len = *a_in_len; 1032 out_len = *a_out_len; 1033 1034 for (in_index = 0, out_index = 0; 1035 (in_index < in_len) && (out_index < out_len); 1036 in_index++, out_index++) { 1037 gint nb_bytes_2_decode = 0; 1038 1039 if (a_in[in_index] <= 0x7F) { 1040 /* 1041 *7 bits long char 1042 *encoded over 1 byte: 1043 * 0xxx xxxx 1044 */ 1045 c = a_in[in_index]; 1046 nb_bytes_2_decode = 1; 1047 1048 } else if ((a_in[in_index] & 0xE0) == 0xC0) { 1049 /* 1050 *up to 11 bits long char. 1051 *encoded over 2 bytes: 1052 *110x xxxx 10xx xxxx 1053 */ 1054 c = a_in[in_index] & 0x1F; 1055 nb_bytes_2_decode = 2; 1056 1057 } else if ((a_in[in_index] & 0xF0) == 0xE0) { 1058 /* 1059 *up to 16 bit long char 1060 *encoded over 3 bytes: 1061 *1110 xxxx 10xx xxxx 10xx xxxx 1062 */ 1063 c = a_in[in_index] & 0x0F; 1064 nb_bytes_2_decode = 3; 1065 1066 } else if ((a_in[in_index] & 0xF8) == 0xF0) { 1067 /* 1068 *up to 21 bits long char 1069 *encoded over 4 bytes: 1070 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 1071 */ 1072 c = a_in[in_index] & 0x7; 1073 nb_bytes_2_decode = 4; 1074 1075 } else if ((a_in[in_index] & 0xFC) == 0xF8) { 1076 /* 1077 *up to 26 bits long char 1078 *encoded over 5 bytes. 1079 *1111 10xx 10xx xxxx 10xx xxxx 1080 *10xx xxxx 10xx xxxx 1081 */ 1082 c = a_in[in_index] & 3; 1083 nb_bytes_2_decode = 5; 1084 1085 } else if ((a_in[in_index] & 0xFE) == 0xFC) { 1086 /* 1087 *up to 31 bits long char 1088 *encoded over 6 bytes: 1089 *1111 110x 10xx xxxx 10xx xxxx 1090 *10xx xxxx 10xx xxxx 10xx xxxx 1091 */ 1092 c = a_in[in_index] & 1; 1093 nb_bytes_2_decode = 6; 1094 1095 } else { 1096 /*BAD ENCODING */ 1097 status = CR_ENCODING_ERROR; 1098 goto end; 1099 } 1100 1101 /* 1102 *Go and decode the remaining byte(s) 1103 *(if any) to get the current character. 1104 */ 1105 if (in_index + nb_bytes_2_decode - 1 >= in_len) { 1106 status = CR_OK; 1107 goto end; 1108 } 1109 1110 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 1111 /*decode the next byte */ 1112 in_index++; 1113 1114 /*byte pattern must be: 10xx xxxx */ 1115 if ((a_in[in_index] & 0xC0) != 0x80) { 1116 status = CR_ENCODING_ERROR; 1117 goto end; 1118 } 1119 1120 c = (c << 6) | (a_in[in_index] & 0x3F); 1121 } 1122 1123 /* 1124 *The decoded ucs4 char is now 1125 *in c. 1126 */ 1127 1128 if (c > 0xFF) { 1129 status = CR_ENCODING_ERROR; 1130 goto end; 1131 } 1132 1133 a_out[out_index] = c; 1134 } 1135 1136 end: 1137 *a_out_len = out_index; 1138 *a_in_len = in_index; 1139 1140 return CR_OK; 1141} 1142 1143/** 1144 *Converts an utf8 buffer into an 1145 *ucs1 buffer. 1146 *@param a_in_start the start of the input buffer. 1147 *@param a_in_end the end of the input buffer. 1148 *@param a_out out parameter. The resulting converted ucs4 buffer. 1149 *Must be freed by the caller. 1150 *@param a_out_len out parameter. The length of the converted buffer. 1151 *@return CR_OK upon successfull completion, an error code otherwise. 1152 *Note that out parameters are valid if and only if this function 1153 *returns CR_OK. 1154 */ 1155enum CRStatus 1156cr_utils_utf8_str_to_ucs1 (const guchar * a_in, 1157 gulong * a_in_len, 1158 guchar ** a_out, gulong * a_out_len) 1159{ 1160 enum CRStatus status = CR_OK; 1161 1162 g_return_val_if_fail (a_in && a_in_len 1163 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 1164 1165 if (*a_in_len < 1) { 1166 *a_out_len = 0; 1167 *a_out = NULL; 1168 return CR_OK; 1169 } 1170 1171 status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1], 1172 a_out_len); 1173 1174 g_return_val_if_fail (status == CR_OK, status); 1175 1176 *a_out = g_malloc0 (*a_out_len * sizeof (guint32)); 1177 1178 status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len); 1179 return status; 1180} 1181 1182/***************************************** 1183 *CSS basic types identification utilities 1184 *****************************************/ 1185 1186/** 1187 *Returns TRUE if a_char is a white space as 1188 *defined in the css spec in chap 4.1.1. 1189 * 1190 *white-space ::= ' '| \t|\r|\n|\f 1191 * 1192 *@param a_char the character to test. 1193 *return TRUE if is a white space, false otherwise. 1194 */ 1195gboolean 1196cr_utils_is_white_space (guint32 a_char) 1197{ 1198 switch (a_char) { 1199 case ' ': 1200 case '\t': 1201 case '\r': 1202 case '\n': 1203 case '\f': 1204 return TRUE; 1205 break; 1206 default: 1207 return FALSE; 1208 } 1209} 1210 1211/** 1212 *Returns true if the character is a newline 1213 *as defined in the css spec in the chap 4.1.1. 1214 * 1215 *nl ::= \n|\r\n|\r|\f 1216 * 1217 *@param a_char the character to test. 1218 *@return TRUE if the character is a newline, FALSE otherwise. 1219 */ 1220gboolean 1221cr_utils_is_newline (guint32 a_char) 1222{ 1223 switch (a_char) { 1224 case '\n': 1225 case '\r': 1226 case '\f': 1227 return TRUE; 1228 break; 1229 default: 1230 return FALSE; 1231 } 1232} 1233 1234/** 1235 *returns TRUE if the char is part of an hexa num char: 1236 *i.e hexa_char ::= [0-9A-F] 1237 */ 1238gboolean 1239cr_utils_is_hexa_char (guint32 a_char) 1240{ 1241 if ((a_char >= '0' && a_char <= '9') 1242 || (a_char >= 'A' && a_char <= 'F')) { 1243 return TRUE; 1244 } 1245 return FALSE; 1246} 1247 1248/** 1249 *Returns true if the character is a nonascii 1250 *character (as defined in the css spec chap 4.1.1): 1251 * 1252 *nonascii ::= [^\0-\177] 1253 * 1254 *@param a_char the character to test. 1255 *@return TRUE if the character is a nonascii char, 1256 *FALSE otherwise. 1257 */ 1258gboolean 1259cr_utils_is_nonascii (guint32 a_char) 1260{ 1261 if (a_char <= 177) { 1262 return FALSE; 1263 } 1264 1265 return TRUE; 1266} 1267 1268/** 1269 *Dumps a character a_nb times on a file. 1270 *@param a_char the char to dump 1271 *@param a_fp the destination file pointer 1272 *@param a_nb the number of times a_char is to be dumped. 1273 */ 1274void 1275cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb) 1276{ 1277 glong i = 0; 1278 1279 for (i = 0; i < a_nb; i++) { 1280 fprintf (a_fp, "%c", a_char); 1281 } 1282} 1283 1284void 1285cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb) 1286{ 1287 glong i = 0; 1288 1289 g_return_if_fail (a_string); 1290 1291 for (i = 0; i < a_nb; i++) { 1292 g_string_append_printf (a_string, "%c", a_char); 1293 } 1294} 1295 1296gdouble 1297cr_utils_n_to_0_dot_n (glong a_n, glong decimal_places) 1298{ 1299 gdouble result = a_n; 1300 1301 while (decimal_places > 0) { 1302 result = result / 10; 1303 decimal_places--; 1304 } 1305 1306 return result; 1307} 1308 1309/** 1310 *Duplicates a list of GString instances. 1311 *@return the duplicated list of GString instances or NULL if 1312 *something bad happened. 1313 *@param a_list_of_strings the list of strings to be duplicated. 1314 */ 1315GList * 1316cr_utils_dup_glist_of_string (GList * a_list_of_strings) 1317{ 1318 GList *cur = NULL, 1319 *result = NULL; 1320 1321 g_return_val_if_fail (a_list_of_strings, NULL); 1322 1323 for (cur = a_list_of_strings; cur; cur = cur->next) { 1324 GString *str = NULL; 1325 1326 str = g_string_new_len (((GString *) cur->data)->str, 1327 ((GString *) cur->data)->len); 1328 if (str) 1329 result = g_list_append (result, str); 1330 } 1331 1332 return result; 1333} 1334 1335/** 1336 *Duplicate a GList where the GList::data is a CRString. 1337 *@param a_list_of_strings the list to duplicate 1338 *@return the duplicated list, or NULL if something bad 1339 *happened. 1340 */ 1341GList * 1342cr_utils_dup_glist_of_cr_string (GList * a_list_of_strings) 1343{ 1344 GList *cur = NULL, *result = NULL; 1345 1346 g_return_val_if_fail (a_list_of_strings, NULL); 1347 1348 for (cur = a_list_of_strings; cur; cur = cur->next) { 1349 CRString *str = NULL; 1350 1351 str = cr_string_dup ((CRString *) cur->data) ; 1352 if (str) 1353 result = g_list_append (result, str); 1354 } 1355 1356 return result; 1357} 1358