• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/gettext-0.17/gettext-tools/gnulib-lib/libcroco/
1/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
2
3/*
4 * This file is part of The Croco Library
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2.1 of the GNU General Public
8 * License as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 *
20 * Author: Dodji Seketeli
21 * See COPYRIGHTS file for copyright information.
22 */
23
24#include <config.h>
25#include "cr-utils.h"
26#include "cr-string.h"
27
/**
 *@file:
 *Some misc utility functions used
 *in libcroco.
 *Note that throughout this file I will
 *refer to the CSS SPECIFICATIONS DOCUMENTATION
 *written by the w3c guys. You can find that document
 *at http://www.w3.org/TR/REC-CSS2/ .
 */
37
38/****************************
39 *Encoding transformations and
40 *encoding helpers
41 ****************************/
42
/*
 *Here is the correspondence between the ucs-4 character codes
 *and their matching utf-8 encoding pattern as described by RFC 2279:
 *
 *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 *------------------    -----------------------------
 *0000 0000-0000 007F   0xxxxxxx
 *0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
 */
56
57/**
58 *Given an utf8 string buffer, calculates
59 *the length of this string if it was encoded
60 *in ucs4.
61 *@param a_in_start a pointer to the begining of
62 *the input utf8 string.
63 *@param a_in_end a pointre to the end of the input
64 *utf8 string (points to the last byte of the buffer)
65 *@param a_len out parameter the calculated length.
66 *@return CR_OK upon succesfull completion, an error code
67 *otherwise.
68 */
69enum CRStatus
70cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
71                               const guchar * a_in_end, gulong * a_len)
72{
73        guchar *byte_ptr = NULL;
74        gint len = 0;
75
76        /*
77         *to store the final decoded
78         *unicode char
79         */
80        guint c = 0;
81
82        g_return_val_if_fail (a_in_start && a_in_end && a_len,
83                              CR_BAD_PARAM_ERROR);
84        *a_len = 0;
85
86        for (byte_ptr = (guchar *) a_in_start;
87             byte_ptr <= a_in_end; byte_ptr++) {
88                gint nb_bytes_2_decode = 0;
89
90                if (*byte_ptr <= 0x7F) {
91                        /*
92                         *7 bits long char
93                         *encoded over 1 byte:
94                         * 0xxx xxxx
95                         */
96                        c = *byte_ptr;
97                        nb_bytes_2_decode = 1;
98
99                } else if ((*byte_ptr & 0xE0) == 0xC0) {
100                        /*
101                         *up to 11 bits long char.
102                         *encoded over 2 bytes:
103                         *110x xxxx  10xx xxxx
104                         */
105                        c = *byte_ptr & 0x1F;
106                        nb_bytes_2_decode = 2;
107
108                } else if ((*byte_ptr & 0xF0) == 0xE0) {
109                        /*
110                         *up to 16 bit long char
111                         *encoded over 3 bytes:
112                         *1110 xxxx  10xx xxxx  10xx xxxx
113                         */
114                        c = *byte_ptr & 0x0F;
115                        nb_bytes_2_decode = 3;
116
117                } else if ((*byte_ptr & 0xF8) == 0xF0) {
118                        /*
119                         *up to 21 bits long char
120                         *encoded over 4 bytes:
121                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
122                         */
123                        c = *byte_ptr & 0x7;
124                        nb_bytes_2_decode = 4;
125
126                } else if ((*byte_ptr & 0xFC) == 0xF8) {
127                        /*
128                         *up to 26 bits long char
129                         *encoded over 5 bytes.
130                         *1111 10xx  10xx xxxx  10xx xxxx
131                         *10xx xxxx  10xx xxxx
132                         */
133                        c = *byte_ptr & 3;
134                        nb_bytes_2_decode = 5;
135
136                } else if ((*byte_ptr & 0xFE) == 0xFC) {
137                        /*
138                         *up to 31 bits long char
139                         *encoded over 6 bytes:
140                         *1111 110x  10xx xxxx  10xx xxxx
141                         *10xx xxxx  10xx xxxx  10xx xxxx
142                         */
143                        c = *byte_ptr & 1;
144                        nb_bytes_2_decode = 6;
145
146                } else {
147                        /*
148                         *BAD ENCODING
149                         */
150                        return CR_ENCODING_ERROR;
151                }
152
153                /*
154                 *Go and decode the remaining byte(s)
155                 *(if any) to get the current character.
156                 */
157                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
158                        /*decode the next byte */
159                        byte_ptr++;
160
161                        /*byte pattern must be: 10xx xxxx */
162                        if ((*byte_ptr & 0xC0) != 0x80) {
163                                return CR_ENCODING_ERROR;
164                        }
165
166                        c = (c << 6) | (*byte_ptr & 0x3F);
167                }
168
169                len++;
170        }
171
172        *a_len = len;
173
174        return CR_OK;
175}
176
177/**
178 *Given an ucs4 string, this function
179 *returns the size (in bytes) this string
180 *would have occupied if it was encoded in utf-8.
181 *@param a_in_start a pointer to the beginning of the input
182 *buffer.
183 *@param a_in_end a pointer to the end of the input buffer.
184 *@param a_len out parameter. The computed length.
185 *@return CR_OK upon successfull completion, an error code otherwise.
186 */
187enum CRStatus
188cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
189                               const guint32 * a_in_end, gulong * a_len)
190{
191        gint len = 0;
192        guint32 *char_ptr = NULL;
193
194        g_return_val_if_fail (a_in_start && a_in_end && a_len,
195                              CR_BAD_PARAM_ERROR);
196
197        for (char_ptr = (guint32 *) a_in_start;
198             char_ptr <= a_in_end; char_ptr++) {
199                if (*char_ptr <= 0x7F) {
200                        /*the utf-8 char would take 1 byte */
201                        len += 1;
202                } else if (*char_ptr <= 0x7FF) {
203                        /*the utf-8 char would take 2 bytes */
204                        len += 2;
205                } else if (*char_ptr <= 0xFFFF) {
206                        len += 3;
207                } else if (*char_ptr <= 0x1FFFFF) {
208                        len += 4;
209                } else if (*char_ptr <= 0x3FFFFFF) {
210                        len += 5;
211                } else if (*char_ptr <= 0x7FFFFFFF) {
212                        len += 6;
213                }
214        }
215
216        *a_len = len;
217        return CR_OK;
218}
219
220/**
221 *Given an ucsA string, this function
222 *returns the size (in bytes) this string
223 *would have occupied if it was encoded in utf-8.
224 *@param a_in_start a pointer to the beginning of the input
225 *buffer.
226 *@param a_in_end a pointer to the end of the input buffer.
227 *@param a_len out parameter. The computed length.
228 *@return CR_OK upon successfull completion, an error code otherwise.
229 */
230enum CRStatus
231cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
232                               const guchar * a_in_end, gulong * a_len)
233{
234        gint len = 0;
235        guchar *char_ptr = NULL;
236
237        g_return_val_if_fail (a_in_start && a_in_end && a_len,
238                              CR_BAD_PARAM_ERROR);
239
240        for (char_ptr = (guchar *) a_in_start;
241             char_ptr <= a_in_end; char_ptr++) {
242                if (*char_ptr <= 0x7F) {
243                        /*the utf-8 char would take 1 byte */
244                        len += 1;
245                } else {
246                        /*the utf-8 char would take 2 bytes */
247                        len += 2;
248                }
249        }
250
251        *a_len = len;
252        return CR_OK;
253}
254
255/**
256 *Converts an utf8 buffer into an ucs4 buffer.
257 *
258 *@param a_in the input utf8 buffer to convert.
259 *@param a_in_len in/out parameter. The size of the
260 *input buffer to convert. After return, this parameter contains
261 *the actual number of bytes consumed.
262 *@param a_out the output converted ucs4 buffer. Must be allocated by
263 *the caller.
264 *@param a_out_len in/out parameter. The size of the output buffer.
265 *If this size is actually smaller than the real needed size, the function
266 *just converts what it can and returns a success status. After return,
267 *this param points to the actual number of characters decoded.
268 *@return CR_OK upon successfull completion, an error code otherwise.
269 */
270enum CRStatus
271cr_utils_utf8_to_ucs4 (const guchar * a_in,
272                       gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
273{
274        gulong in_len = 0,
275                out_len = 0,
276                in_index = 0,
277                out_index = 0;
278        enum CRStatus status = CR_OK;
279
280        /*
281         *to store the final decoded
282         *unicode char
283         */
284        guint c = 0;
285
286        g_return_val_if_fail (a_in && a_in_len
287                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);
288
289        if (*a_in_len < 1) {
290                status = CR_OK;
291                goto end;
292        }
293
294        in_len = *a_in_len;
295        out_len = *a_out_len;
296
297        for (in_index = 0, out_index = 0;
298             (in_index < in_len) && (out_index < out_len);
299             in_index++, out_index++) {
300                gint nb_bytes_2_decode = 0;
301
302                if (a_in[in_index] <= 0x7F) {
303                        /*
304                         *7 bits long char
305                         *encoded over 1 byte:
306                         * 0xxx xxxx
307                         */
308                        c = a_in[in_index];
309                        nb_bytes_2_decode = 1;
310
311                } else if ((a_in[in_index] & 0xE0) == 0xC0) {
312                        /*
313                         *up to 11 bits long char.
314                         *encoded over 2 bytes:
315                         *110x xxxx  10xx xxxx
316                         */
317                        c = a_in[in_index] & 0x1F;
318                        nb_bytes_2_decode = 2;
319
320                } else if ((a_in[in_index] & 0xF0) == 0xE0) {
321                        /*
322                         *up to 16 bit long char
323                         *encoded over 3 bytes:
324                         *1110 xxxx  10xx xxxx  10xx xxxx
325                         */
326                        c = a_in[in_index] & 0x0F;
327                        nb_bytes_2_decode = 3;
328
329                } else if ((a_in[in_index] & 0xF8) == 0xF0) {
330                        /*
331                         *up to 21 bits long char
332                         *encoded over 4 bytes:
333                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
334                         */
335                        c = a_in[in_index] & 0x7;
336                        nb_bytes_2_decode = 4;
337
338                } else if ((a_in[in_index] & 0xFC) == 0xF8) {
339                        /*
340                         *up to 26 bits long char
341                         *encoded over 5 bytes.
342                         *1111 10xx  10xx xxxx  10xx xxxx
343                         *10xx xxxx  10xx xxxx
344                         */
345                        c = a_in[in_index] & 3;
346                        nb_bytes_2_decode = 5;
347
348                } else if ((a_in[in_index] & 0xFE) == 0xFC) {
349                        /*
350                         *up to 31 bits long char
351                         *encoded over 6 bytes:
352                         *1111 110x  10xx xxxx  10xx xxxx
353                         *10xx xxxx  10xx xxxx  10xx xxxx
354                         */
355                        c = a_in[in_index] & 1;
356                        nb_bytes_2_decode = 6;
357
358                } else {
359                        /*BAD ENCODING */
360                        goto end;
361                }
362
363                /*
364                 *Go and decode the remaining byte(s)
365                 *(if any) to get the current character.
366                 */
367                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
368                        /*decode the next byte */
369                        in_index++;
370
371                        /*byte pattern must be: 10xx xxxx */
372                        if ((a_in[in_index] & 0xC0) != 0x80) {
373                                goto end;
374                        }
375
376                        c = (c << 6) | (a_in[in_index] & 0x3F);
377                }
378
379                /*
380                 *The decoded ucs4 char is now
381                 *in c.
382                 */
383
384                /************************
385                 *Some security tests
386                 ***********************/
387
388                /*be sure c is a char */
389                if (c == 0xFFFF || c == 0xFFFE)
390                        goto end;
391
392                /*be sure c is inferior to the max ucs4 char value */
393                if (c > 0x10FFFF)
394                        goto end;
395
396                /*
397                 *c must be less than UTF16 "lower surrogate begin"
398                 *or higher than UTF16 "High surrogate end"
399                 */
400                if (c >= 0xD800 && c <= 0xDFFF)
401                        goto end;
402
403                /*Avoid characters that equals zero */
404                if (c == 0)
405                        goto end;
406
407                a_out[out_index] = c;
408        }
409
410      end:
411        *a_out_len = out_index + 1;
412        *a_in_len = in_index + 1;
413
414        return status;
415}
416
417/**
418 *Reads a character from an utf8 buffer.
419 *Actually decode the next character code (unicode character code)
420 *and returns it.
421 *@param a_in the starting address of the utf8 buffer.
422 *@param a_in_len the length of the utf8 buffer.
423 *@param a_out output parameter. The resulting read char.
424 *@param a_consumed the number of the bytes consumed to
425 *decode the returned character code.
426 *@return CR_OK upon successfull completion, an error code otherwise.
427 */
428enum CRStatus
429cr_utils_read_char_from_utf8_buf (const guchar * a_in,
430                                  gulong a_in_len,
431                                  guint32 * a_out, gulong * a_consumed)
432{
433        gulong in_len = 0,
434                in_index = 0,
435                nb_bytes_2_decode = 0;
436        enum CRStatus status = CR_OK;
437
438        /*
439         *to store the final decoded
440         *unicode char
441         */
442        guint32 c = 0;
443
444        g_return_val_if_fail (a_in && a_out && a_out
445                              && a_consumed, CR_BAD_PARAM_ERROR);
446
447        if (a_in_len < 1) {
448                status = CR_OK;
449                goto end;
450        }
451
452        in_len = a_in_len;
453
454        if (*a_in <= 0x7F) {
455                /*
456                 *7 bits long char
457                 *encoded over 1 byte:
458                 * 0xxx xxxx
459                 */
460                c = *a_in;
461                nb_bytes_2_decode = 1;
462
463        } else if ((*a_in & 0xE0) == 0xC0) {
464                /*
465                 *up to 11 bits long char.
466                 *encoded over 2 bytes:
467                 *110x xxxx  10xx xxxx
468                 */
469                c = *a_in & 0x1F;
470                nb_bytes_2_decode = 2;
471
472        } else if ((*a_in & 0xF0) == 0xE0) {
473                /*
474                 *up to 16 bit long char
475                 *encoded over 3 bytes:
476                 *1110 xxxx  10xx xxxx  10xx xxxx
477                 */
478                c = *a_in & 0x0F;
479                nb_bytes_2_decode = 3;
480
481        } else if ((*a_in & 0xF8) == 0xF0) {
482                /*
483                 *up to 21 bits long char
484                 *encoded over 4 bytes:
485                 *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
486                 */
487                c = *a_in & 0x7;
488                nb_bytes_2_decode = 4;
489
490        } else if ((*a_in & 0xFC) == 0xF8) {
491                /*
492                 *up to 26 bits long char
493                 *encoded over 5 bytes.
494                 *1111 10xx  10xx xxxx  10xx xxxx
495                 *10xx xxxx  10xx xxxx
496                 */
497                c = *a_in & 3;
498                nb_bytes_2_decode = 5;
499
500        } else if ((*a_in & 0xFE) == 0xFC) {
501                /*
502                 *up to 31 bits long char
503                 *encoded over 6 bytes:
504                 *1111 110x  10xx xxxx  10xx xxxx
505                 *10xx xxxx  10xx xxxx  10xx xxxx
506                 */
507                c = *a_in & 1;
508                nb_bytes_2_decode = 6;
509
510        } else {
511                /*BAD ENCODING */
512                goto end;
513        }
514
515        if (nb_bytes_2_decode > a_in_len) {
516                status = CR_END_OF_INPUT_ERROR;
517                goto end;
518        }
519
520        /*
521         *Go and decode the remaining byte(s)
522         *(if any) to get the current character.
523         */
524        for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
525                /*byte pattern must be: 10xx xxxx */
526                if ((a_in[in_index] & 0xC0) != 0x80) {
527                        goto end;
528                }
529
530                c = (c << 6) | (a_in[in_index] & 0x3F);
531        }
532
533        /*
534         *The decoded ucs4 char is now
535         *in c.
536         */
537
538    /************************
539     *Some security tests
540     ***********************/
541
542        /*be sure c is a char */
543        if (c == 0xFFFF || c == 0xFFFE)
544                goto end;
545
546        /*be sure c is inferior to the max ucs4 char value */
547        if (c > 0x10FFFF)
548                goto end;
549
550        /*
551         *c must be less than UTF16 "lower surrogate begin"
552         *or higher than UTF16 "High surrogate end"
553         */
554        if (c >= 0xD800 && c <= 0xDFFF)
555                goto end;
556
557        /*Avoid characters that equals zero */
558        if (c == 0)
559                goto end;
560
561        *a_out = c;
562
563      end:
564        *a_consumed = nb_bytes_2_decode;
565
566        return status;
567}
568
569/**
570 *
571 */
572enum CRStatus
573cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
574                               const guchar * a_in_end, gulong * a_len)
575{
576        /*
577         *Note: this function can be made shorter
578         *but it considers all the cases of the utf8 encoding
579         *to ease further extensions ...
580         */
581
582        guchar *byte_ptr = NULL;
583        gint len = 0;
584
585        /*
586         *to store the final decoded
587         *unicode char
588         */
589        guint c = 0;
590
591        g_return_val_if_fail (a_in_start && a_in_end && a_len,
592                              CR_BAD_PARAM_ERROR);
593        *a_len = 0;
594
595        for (byte_ptr = (guchar *) a_in_start;
596             byte_ptr <= a_in_end; byte_ptr++) {
597                gint nb_bytes_2_decode = 0;
598
599                if (*byte_ptr <= 0x7F) {
600                        /*
601                         *7 bits long char
602                         *encoded over 1 byte:
603                         * 0xxx xxxx
604                         */
605                        c = *byte_ptr;
606                        nb_bytes_2_decode = 1;
607
608                } else if ((*byte_ptr & 0xE0) == 0xC0) {
609                        /*
610                         *up to 11 bits long char.
611                         *encoded over 2 bytes:
612                         *110x xxxx  10xx xxxx
613                         */
614                        c = *byte_ptr & 0x1F;
615                        nb_bytes_2_decode = 2;
616
617                } else if ((*byte_ptr & 0xF0) == 0xE0) {
618                        /*
619                         *up to 16 bit long char
620                         *encoded over 3 bytes:
621                         *1110 xxxx  10xx xxxx  10xx xxxx
622                         */
623                        c = *byte_ptr & 0x0F;
624                        nb_bytes_2_decode = 3;
625
626                } else if ((*byte_ptr & 0xF8) == 0xF0) {
627                        /*
628                         *up to 21 bits long char
629                         *encoded over 4 bytes:
630                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
631                         */
632                        c = *byte_ptr & 0x7;
633                        nb_bytes_2_decode = 4;
634
635                } else if ((*byte_ptr & 0xFC) == 0xF8) {
636                        /*
637                         *up to 26 bits long char
638                         *encoded over 5 bytes.
639                         *1111 10xx  10xx xxxx  10xx xxxx
640                         *10xx xxxx  10xx xxxx
641                         */
642                        c = *byte_ptr & 3;
643                        nb_bytes_2_decode = 5;
644
645                } else if ((*byte_ptr & 0xFE) == 0xFC) {
646                        /*
647                         *up to 31 bits long char
648                         *encoded over 6 bytes:
649                         *1111 110x  10xx xxxx  10xx xxxx
650                         *10xx xxxx  10xx xxxx  10xx xxxx
651                         */
652                        c = *byte_ptr & 1;
653                        nb_bytes_2_decode = 6;
654
655                } else {
656                        /*
657                         *BAD ENCODING
658                         */
659                        return CR_ENCODING_ERROR;
660                }
661
662                /*
663                 *Go and decode the remaining byte(s)
664                 *(if any) to get the current character.
665                 */
666                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
667                        /*decode the next byte */
668                        byte_ptr++;
669
670                        /*byte pattern must be: 10xx xxxx */
671                        if ((*byte_ptr & 0xC0) != 0x80) {
672                                return CR_ENCODING_ERROR;
673                        }
674
675                        c = (c << 6) | (*byte_ptr & 0x3F);
676                }
677
678                /*
679                 *The decoded ucs4 char is now
680                 *in c.
681                 */
682
683                if (c <= 0xFF) { /*Add other conditions to support
684                                  *other char sets (ucs2, ucs3, ucs4).
685                                  */
686                        len++;
687                } else {
688                        /*the char is too long to fit
689                         *into the supposed charset len.
690                         */
691                        return CR_ENCODING_ERROR;
692                }
693        }
694
695        *a_len = len;
696
697        return CR_OK;
698}
699
700/**
701 *Converts an utf8 string into an ucs4 string.
702 *@param a_in the input string to convert.
703 *@param a_in_len in/out parameter. The length of the input
704 *string. After return, points to the actual number of bytes
705 *consumed. This can be usefull to debug the input stream in case
706 *of encoding error.
707 *@param a_out out parameter. Points to the output string. It is allocated
708 *by this function and must be freed by the caller.
709 *@param a_out_len out parameter. The length of the output string.
710 *@return CR_OK upon successfull completion, an error code otherwise.
711 *
712 */
713enum CRStatus
714cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
715                           gulong * a_in_len,
716                           guint32 ** a_out, gulong * a_out_len)
717{
718        enum CRStatus status = CR_OK;
719
720        g_return_val_if_fail (a_in && a_in_len
721                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);
722
723        status = cr_utils_utf8_str_len_as_ucs4 (a_in,
724                                                &a_in[*a_in_len - 1],
725                                                a_out_len);
726
727        g_return_val_if_fail (status == CR_OK, status);
728
729        *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
730
731        status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
732
733        return status;
734}
735
736/**
737 *Converts an ucs4 buffer into an utf8 buffer.
738 *
739 *@param a_in the input ucs4 buffer to convert.
740 *@param a_in_len in/out parameter. The size of the
741 *input buffer to convert. After return, this parameter contains
742 *the actual number of characters consumed.
743 *@param a_out the output converted utf8 buffer. Must be allocated by
744 *the caller.
745 *@param a_out_len in/out parameter. The size of the output buffer.
746 *If this size is actually smaller than the real needed size, the function
747 *just converts what it can and returns a success status. After return,
748 *this param points to the actual number of bytes in the buffer.
749 *@return CR_OK upon successfull completion, an error code otherwise.
750 */
751enum CRStatus
752cr_utils_ucs4_to_utf8 (const guint32 * a_in,
753                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
754{
755        gulong in_len = 0,
756                in_index = 0,
757                out_index = 0;
758        enum CRStatus status = CR_OK;
759
760        g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
761                              CR_BAD_PARAM_ERROR);
762
763        if (*a_in_len < 1) {
764                status = CR_OK;
765                goto end;
766        }
767
768        in_len = *a_in_len;
769
770        for (in_index = 0; in_index < in_len; in_index++) {
771                /*
772                 *FIXME: return whenever we encounter forbidden char values.
773                 */
774
775                if (a_in[in_index] <= 0x7F) {
776                        a_out[out_index] = a_in[in_index];
777                        out_index++;
778                } else if (a_in[in_index] <= 0x7FF) {
779                        a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
780                        a_out[out_index + 1] =
781                                (0x80 | (a_in[in_index] & 0x3F));
782                        out_index += 2;
783                } else if (a_in[in_index] <= 0xFFFF) {
784                        a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
785                        a_out[out_index + 1] =
786                                (0x80 | ((a_in[in_index] >> 6) & 0x3F));
787                        a_out[out_index + 2] =
788                                (0x80 | (a_in[in_index] & 0x3F));
789                        out_index += 3;
790                } else if (a_in[in_index] <= 0x1FFFFF) {
791                        a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
792                        a_out[out_index + 1]
793                                = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
794                        a_out[out_index + 2]
795                                = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
796                        a_out[out_index + 3]
797                                = (0x80 | (a_in[in_index] & 0x3F));
798                        out_index += 4;
799                } else if (a_in[in_index] <= 0x3FFFFFF) {
800                        a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
801                        a_out[out_index + 1] =
802                                (0x80 | (a_in[in_index] >> 18));
803                        a_out[out_index + 2]
804                                = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
805                        a_out[out_index + 3]
806                                = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
807                        a_out[out_index + 4]
808                                = (0x80 | (a_in[in_index] & 0x3F));
809                        out_index += 5;
810                } else if (a_in[in_index] <= 0x7FFFFFFF) {
811                        a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
812                        a_out[out_index + 1] =
813                                (0x80 | (a_in[in_index] >> 24));
814                        a_out[out_index + 2]
815                                = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
816                        a_out[out_index + 3]
817                                = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
818                        a_out[out_index + 4]
819                                = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
820                        a_out[out_index + 4]
821                                = (0x80 | (a_in[in_index] & 0x3F));
822                        out_index += 6;
823                } else {
824                        status = CR_ENCODING_ERROR;
825                        goto end;
826                }
827        }                       /*end for */
828
829      end:
830        *a_in_len = in_index + 1;
831        *a_out_len = out_index + 1;
832
833        return status;
834}
835
836/**
837 *Converts an ucs4 string into an utf8 string.
838 *@param a_in the input string to convert.
839 *@param a_in_len in/out parameter. The length of the input
840 *string. After return, points to the actual number of characters
841 *consumed. This can be usefull to debug the input string in case
842 *of encoding error.
843 *@param a_out out parameter. Points to the output string. It is allocated
844 *by this function and must be freed by the caller.
845 *@param a_out_len out parameter. The length (in bytes) of the output string.
846 *@return CR_OK upon successfull completion, an error code otherwise.
847 */
848enum CRStatus
849cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
850                           gulong * a_in_len,
851                           guchar ** a_out, gulong * a_out_len)
852{
853        enum CRStatus status = CR_OK;
854
855        g_return_val_if_fail (a_in && a_in_len && a_out
856                              && a_out_len, CR_BAD_PARAM_ERROR);
857
858        status = cr_utils_ucs4_str_len_as_utf8 (a_in,
859                                                &a_in[*a_out_len - 1],
860                                                a_out_len);
861
862        g_return_val_if_fail (status == CR_OK, status);
863
864        status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
865
866        return status;
867}
868
869/**
870 *Converts an ucs1 buffer into an utf8 buffer.
871 *The caller must know the size of the resulting buffer and
872 *allocate it prior to calling this function.
873 *
874 *@param a_in the input ucs1 buffer.
875 *
876 *@param a_in_len in/out parameter. The length of the input buffer.
877 *After return, points to the number of bytes actually consumed even
878 *in case of encoding error.
879 *
880 *@param a_out out parameter. The output utf8 converted buffer.
881 *
882 *@param a_out_len in/out parameter. The size of the output buffer.
883 *If the output buffer size is shorter than the actual needed size,
884 *this function just convert what it can.
885 *
886 *@return CR_OK upon successfull completion, an error code otherwise.
887 *
888 */
889enum CRStatus
890cr_utils_ucs1_to_utf8 (const guchar * a_in,
891                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
892{
893        gulong out_index = 0,
894                in_index = 0,
895                in_len = 0,
896                out_len = 0;
897        enum CRStatus status = CR_OK;
898
899        g_return_val_if_fail (a_in && a_in_len
900                              && a_out_len,
901                              CR_BAD_PARAM_ERROR);
902
903        if (*a_in_len == 0) {
904                *a_out_len = 0 ;
905                return CR_OK ;
906        }
907        g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
908
909        if (*a_in_len < 1) {
910                status = CR_OK;
911                goto end;
912        }
913
914        in_len = *a_in_len;
915        out_len = *a_out_len;
916
917        for (in_index = 0, out_index = 0;
918             (in_index < in_len) && (out_index < out_len); in_index++) {
919                /*
920                 *FIXME: return whenever we encounter forbidden char values.
921                 */
922
923                if (a_in[in_index] <= 0x7F) {
924                        a_out[out_index] = a_in[in_index];
925                        out_index++;
926                } else {
927                        a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
928                        a_out[out_index + 1] =
929                                (0x80 | (a_in[in_index] & 0x3F));
930                        out_index += 2;
931                }
932        }                       /*end for */
933
934      end:
935        *a_in_len = in_index;
936        *a_out_len = out_index;
937
938        return CR_OK;
939}
940
941/**
942 *Converts an ucs1 string into an utf8 string.
943 *@param a_in_start the beginning of the input string to convert.
944 *@param a_in_end the end of the input string to convert.
945 *@param a_out out parameter. The converted string.
946 *@param a_out out parameter. The length of the converted string.
947 *@return CR_OK upon successfull completion, an error code otherwise.
948 *
949 */
950enum CRStatus
951cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
952                           gulong * a_in_len,
953                           guchar ** a_out, gulong * a_out_len)
954{
955        gulong in_len = 0,
956                out_len = 0;
957        enum CRStatus status = CR_OK;
958
959        g_return_val_if_fail (a_in && a_in_len && a_out
960                              && a_out_len, CR_BAD_PARAM_ERROR);
961
962        if (*a_in_len < 1) {
963                *a_out_len = 0;
964                *a_out = NULL;
965                return CR_OK;
966        }
967
968        status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
969                                                &out_len);
970
971        g_return_val_if_fail (status == CR_OK, status);
972
973        in_len = *a_in_len;
974
975        *a_out = g_malloc0 (out_len);
976
977        status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
978
979        *a_out_len = out_len;
980
981        return status;
982}
983
984/**
985 *Converts an utf8 buffer into an ucs1 buffer.
986 *The caller must know the size of the resulting
987 *converted buffer, and allocated it prior to calling this
988 *function.
989 *
990 *@param a_in the input utf8 buffer to convert.
991 *
992 *@param a_in_len in/out parameter. The size of the input utf8 buffer.
993 *After return, points to the number of bytes consumed
994 *by the function even in case of encoding error.
995 *
996 *@param a_out out parameter. Points to the resulting buffer.
997 *Must be allocated by the caller. If the size of a_out is shorter
998 *than its required size, this function converts what it can and return
999 *a successfull status.
1000 *
1001 *@param a_out_len in/out parameter. The size of the output buffer.
1002 *After return, points to the number of bytes consumed even in case of
1003 *encoding error.
1004 *
1005 *@return CR_OK upon successfull completion, an error code otherwise.
1006 */
1007enum CRStatus
1008cr_utils_utf8_to_ucs1 (const guchar * a_in,
1009                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
1010{
1011        gulong in_index = 0,
1012                out_index = 0,
1013                in_len = 0,
1014                out_len = 0;
1015        enum CRStatus status = CR_OK;
1016
1017        /*
1018         *to store the final decoded
1019         *unicode char
1020         */
1021        guint32 c = 0;
1022
1023        g_return_val_if_fail (a_in && a_in_len
1024                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1025
1026        if (*a_in_len < 1) {
1027                status = CR_OK;
1028                goto end;
1029        }
1030
1031        in_len = *a_in_len;
1032        out_len = *a_out_len;
1033
1034        for (in_index = 0, out_index = 0;
1035             (in_index < in_len) && (out_index < out_len);
1036             in_index++, out_index++) {
1037                gint nb_bytes_2_decode = 0;
1038
1039                if (a_in[in_index] <= 0x7F) {
1040                        /*
1041                         *7 bits long char
1042                         *encoded over 1 byte:
1043                         * 0xxx xxxx
1044                         */
1045                        c = a_in[in_index];
1046                        nb_bytes_2_decode = 1;
1047
1048                } else if ((a_in[in_index] & 0xE0) == 0xC0) {
1049                        /*
1050                         *up to 11 bits long char.
1051                         *encoded over 2 bytes:
1052                         *110x xxxx  10xx xxxx
1053                         */
1054                        c = a_in[in_index] & 0x1F;
1055                        nb_bytes_2_decode = 2;
1056
1057                } else if ((a_in[in_index] & 0xF0) == 0xE0) {
1058                        /*
1059                         *up to 16 bit long char
1060                         *encoded over 3 bytes:
1061                         *1110 xxxx  10xx xxxx  10xx xxxx
1062                         */
1063                        c = a_in[in_index] & 0x0F;
1064                        nb_bytes_2_decode = 3;
1065
1066                } else if ((a_in[in_index] & 0xF8) == 0xF0) {
1067                        /*
1068                         *up to 21 bits long char
1069                         *encoded over 4 bytes:
1070                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
1071                         */
1072                        c = a_in[in_index] & 0x7;
1073                        nb_bytes_2_decode = 4;
1074
1075                } else if ((a_in[in_index] & 0xFC) == 0xF8) {
1076                        /*
1077                         *up to 26 bits long char
1078                         *encoded over 5 bytes.
1079                         *1111 10xx  10xx xxxx  10xx xxxx
1080                         *10xx xxxx  10xx xxxx
1081                         */
1082                        c = a_in[in_index] & 3;
1083                        nb_bytes_2_decode = 5;
1084
1085                } else if ((a_in[in_index] & 0xFE) == 0xFC) {
1086                        /*
1087                         *up to 31 bits long char
1088                         *encoded over 6 bytes:
1089                         *1111 110x  10xx xxxx  10xx xxxx
1090                         *10xx xxxx  10xx xxxx  10xx xxxx
1091                         */
1092                        c = a_in[in_index] & 1;
1093                        nb_bytes_2_decode = 6;
1094
1095                } else {
1096                        /*BAD ENCODING */
1097                        status = CR_ENCODING_ERROR;
1098                        goto end;
1099                }
1100
1101                /*
1102                 *Go and decode the remaining byte(s)
1103                 *(if any) to get the current character.
1104                 */
1105                if (in_index + nb_bytes_2_decode - 1 >= in_len) {
1106                        status = CR_OK;
1107                        goto end;
1108                }
1109
1110                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
1111                        /*decode the next byte */
1112                        in_index++;
1113
1114                        /*byte pattern must be: 10xx xxxx */
1115                        if ((a_in[in_index] & 0xC0) != 0x80) {
1116                                status = CR_ENCODING_ERROR;
1117                                goto end;
1118                        }
1119
1120                        c = (c << 6) | (a_in[in_index] & 0x3F);
1121                }
1122
1123                /*
1124                 *The decoded ucs4 char is now
1125                 *in c.
1126                 */
1127
1128                if (c > 0xFF) {
1129                        status = CR_ENCODING_ERROR;
1130                        goto end;
1131                }
1132
1133                a_out[out_index] = c;
1134        }
1135
1136      end:
1137        *a_out_len = out_index;
1138        *a_in_len = in_index;
1139
1140        return CR_OK;
1141}
1142
1143/**
1144 *Converts an utf8 buffer into an
1145 *ucs1 buffer.
1146 *@param a_in_start the start of the input buffer.
1147 *@param a_in_end the end of the input buffer.
1148 *@param a_out out parameter. The resulting converted ucs4 buffer.
1149 *Must be freed by the caller.
1150 *@param a_out_len out parameter. The length of the converted buffer.
1151 *@return CR_OK upon successfull completion, an error code otherwise.
1152 *Note that out parameters are valid if and only if this function
1153 *returns CR_OK.
1154 */
1155enum CRStatus
1156cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
1157                           gulong * a_in_len,
1158                           guchar ** a_out, gulong * a_out_len)
1159{
1160        enum CRStatus status = CR_OK;
1161
1162        g_return_val_if_fail (a_in && a_in_len
1163                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1164
1165        if (*a_in_len < 1) {
1166                *a_out_len = 0;
1167                *a_out = NULL;
1168                return CR_OK;
1169        }
1170
1171        status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
1172                                                a_out_len);
1173
1174        g_return_val_if_fail (status == CR_OK, status);
1175
1176        *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
1177
1178        status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
1179        return status;
1180}
1181
1182/*****************************************
1183 *CSS basic types identification utilities
1184 *****************************************/
1185
1186/**
1187 *Returns TRUE if a_char is a white space as
1188 *defined in the css spec in chap 4.1.1.
1189 *
1190 *white-space ::= ' '| \t|\r|\n|\f
1191 *
1192 *@param a_char the character to test.
1193 *return TRUE if is a white space, false otherwise.
1194 */
1195gboolean
1196cr_utils_is_white_space (guint32 a_char)
1197{
1198        switch (a_char) {
1199        case ' ':
1200        case '\t':
1201        case '\r':
1202        case '\n':
1203        case '\f':
1204                return TRUE;
1205                break;
1206        default:
1207                return FALSE;
1208        }
1209}
1210
1211/**
1212 *Returns true if the character is a newline
1213 *as defined in the css spec in the chap 4.1.1.
1214 *
1215 *nl ::= \n|\r\n|\r|\f
1216 *
1217 *@param a_char the character to test.
1218 *@return TRUE if the character is a newline, FALSE otherwise.
1219 */
1220gboolean
1221cr_utils_is_newline (guint32 a_char)
1222{
1223        switch (a_char) {
1224        case '\n':
1225        case '\r':
1226        case '\f':
1227                return TRUE;
1228                break;
1229        default:
1230                return FALSE;
1231        }
1232}
1233
1234/**
1235 *returns TRUE if the char is part of an hexa num char:
1236 *i.e hexa_char ::= [0-9A-F]
1237 */
1238gboolean
1239cr_utils_is_hexa_char (guint32 a_char)
1240{
1241        if ((a_char >= '0' && a_char <= '9')
1242            || (a_char >= 'A' && a_char <= 'F')) {
1243                return TRUE;
1244        }
1245        return FALSE;
1246}
1247
1248/**
1249 *Returns true if the character is a nonascii
1250 *character (as defined in the css spec chap 4.1.1):
1251 *
1252 *nonascii ::= [^\0-\177]
1253 *
1254 *@param a_char the character to test.
1255 *@return TRUE if the character is a nonascii char,
1256 *FALSE otherwise.
1257 */
1258gboolean
1259cr_utils_is_nonascii (guint32 a_char)
1260{
1261        if (a_char <= 177) {
1262                return FALSE;
1263        }
1264
1265        return TRUE;
1266}
1267
1268/**
1269 *Dumps a character a_nb times on a file.
1270 *@param a_char the char to dump
1271 *@param a_fp the destination file pointer
1272 *@param a_nb the number of times a_char is to be dumped.
1273 */
1274void
1275cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
1276{
1277        glong i = 0;
1278
1279        for (i = 0; i < a_nb; i++) {
1280                fprintf (a_fp, "%c", a_char);
1281        }
1282}
1283
1284void
1285cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
1286{
1287        glong i = 0;
1288
1289        g_return_if_fail (a_string);
1290
1291        for (i = 0; i < a_nb; i++) {
1292                g_string_append_printf (a_string, "%c", a_char);
1293        }
1294}
1295
1296gdouble
1297cr_utils_n_to_0_dot_n (glong a_n, glong decimal_places)
1298{
1299        gdouble result = a_n;
1300
1301        while (decimal_places > 0) {
1302                result = result / 10;
1303                decimal_places--;
1304        }
1305
1306        return result;
1307}
1308
1309/**
1310 *Duplicates a list of GString instances.
1311 *@return the duplicated list of GString instances or NULL if
1312 *something bad happened.
1313 *@param a_list_of_strings the list of strings to be duplicated.
1314 */
1315GList *
1316cr_utils_dup_glist_of_string (GList * a_list_of_strings)
1317{
1318        GList *cur = NULL,
1319                *result = NULL;
1320
1321        g_return_val_if_fail (a_list_of_strings, NULL);
1322
1323        for (cur = a_list_of_strings; cur; cur = cur->next) {
1324                GString *str = NULL;
1325
1326                str = g_string_new_len (((GString *) cur->data)->str,
1327                                        ((GString *) cur->data)->len);
1328                if (str)
1329                        result = g_list_append (result, str);
1330        }
1331
1332        return result;
1333}
1334
1335/**
1336 *Duplicate a GList where the GList::data is a CRString.
1337 *@param a_list_of_strings the list to duplicate
1338 *@return the duplicated list, or NULL if something bad
1339 *happened.
1340 */
1341GList *
1342cr_utils_dup_glist_of_cr_string (GList * a_list_of_strings)
1343{
1344        GList *cur = NULL, *result = NULL;
1345
1346        g_return_val_if_fail (a_list_of_strings, NULL);
1347
1348        for (cur = a_list_of_strings; cur; cur = cur->next) {
1349                CRString *str = NULL;
1350
1351                str = cr_string_dup ((CRString *) cur->data) ;
1352                if (str)
1353                        result = g_list_append (result, str);
1354        }
1355
1356        return result;
1357}
1358