1/*
2 * Copyright (c) 2009-2014 Petri Lehtinen <petri@digip.org>
3 *
4 * Jansson is free software; you can redistribute it and/or modify
5 * it under the terms of the MIT license. See LICENSE for details.
6 */
7
8#include <string.h>
9#include "utf.h"
10
/*
 * Encode a single Unicode code point as UTF-8.
 *
 * codepoint: value to encode. It must lie in 0..0x10FFFF and must not be
 *            a UTF-16 surrogate half (0xD800..0xDFFF).
 * buffer:    receives the encoded bytes. Between 1 and 4 bytes are
 *            written; no terminating NUL is appended.
 * size:      set to the number of bytes written on success.
 *
 * Returns 0 on success. Returns -1 without touching buffer or *size when
 * codepoint cannot be represented in well-formed UTF-8.
 */
int utf8_encode(int32_t codepoint, char *buffer, size_t *size)
{
    if(codepoint < 0)
        return -1;
    else if(codepoint < 0x80)
    {
        /* 0xxxxxxx */
        buffer[0] = (char)codepoint;
        *size = 1;
    }
    else if(codepoint < 0x800)
    {
        /* 110xxxxx 10xxxxxx */
        buffer[0] = (char)(0xC0 + ((codepoint & 0x7C0) >> 6));
        buffer[1] = (char)(0x80 + ((codepoint & 0x03F)));
        *size = 2;
    }
    else if(codepoint < 0x10000)
    {
        if(0xD800 <= codepoint && codepoint <= 0xDFFF) {
            /* UTF-16 surrogate halves are not characters and must not
               be encoded; utf8_check_full() rejects the resulting byte
               sequence, so refuse to produce it in the first place */
            return -1;
        }

        /* 1110xxxx 10xxxxxx 10xxxxxx */
        buffer[0] = (char)(0xE0 + ((codepoint & 0xF000) >> 12));
        buffer[1] = (char)(0x80 + ((codepoint & 0x0FC0) >> 6));
        buffer[2] = (char)(0x80 + ((codepoint & 0x003F)));
        *size = 3;
    }
    else if(codepoint <= 0x10FFFF)
    {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        buffer[0] = (char)(0xF0 + ((codepoint & 0x1C0000) >> 18));
        buffer[1] = (char)(0x80 + ((codepoint & 0x03F000) >> 12));
        buffer[2] = (char)(0x80 + ((codepoint & 0x000FC0) >> 6));
        buffer[3] = (char)(0x80 + ((codepoint & 0x00003F)));
        *size = 4;
    }
    else {
        /* beyond the last Unicode code point */
        return -1;
    }

    return 0;
}
46
/*
 * Classify a byte as the first byte of a UTF-8 sequence.
 *
 * Returns the total length in bytes (1 to 4) of the sequence that the
 * byte starts, or 0 if the byte cannot start a sequence.
 */
size_t utf8_check_first(char byte)
{
    unsigned char lead = (unsigned char)byte;

    /* plain ASCII stands alone */
    if(lead < 0x80)
        return 1;

    /* 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 could only start
       an overlong encoding of an ASCII byte */
    if(lead < 0xC2)
        return 0;

    /* 0xC2..0xDF: lead byte of a 2-byte sequence */
    if(lead < 0xE0)
        return 2;

    /* 0xE0..0xEF: lead byte of a 3-byte sequence */
    if(lead < 0xF0)
        return 3;

    /* 0xF0..0xF4: lead byte of a 4-byte sequence */
    if(lead < 0xF5)
        return 4;

    /* 0xF5 and above: restricted (start of a 4-, 5- or 6-byte sequence)
       or otherwise invalid UTF-8 */
    return 0;
}
82
/*
 * Validate and decode one multi-byte UTF-8 sequence.
 *
 * buffer:    points at the first byte of the sequence.
 * size:      length of the sequence in bytes; only 2, 3 and 4 are
 *            accepted.
 * codepoint: if non-NULL, receives the decoded value on success.
 *
 * Returns 1 if the sequence is valid, 0 otherwise. On failure *codepoint
 * is not modified.
 */
size_t utf8_check_full(const char *buffer, size_t size, int32_t *codepoint)
{
    const unsigned char *bytes = (const unsigned char *)buffer;
    unsigned char lead = bytes[0];
    int32_t decoded;
    int32_t minimum; /* smallest value that needs `size` bytes */
    size_t pos;

    /* Take the payload bits of the first byte */
    switch(size)
    {
        case 2:
            decoded = lead & 0x1F;
            minimum = 0x80;
            break;

        case 3:
            decoded = lead & 0xF;
            minimum = 0x800;
            break;

        case 4:
            decoded = lead & 0x7;
            minimum = 0x10000;
            break;

        default:
            return 0;
    }

    /* Every following byte must look like 10xxxxxx and contributes its
       low six bits */
    for(pos = 1; pos < size; pos++)
    {
        unsigned char cont = bytes[pos];

        if((cont & 0xC0) != 0x80)
            return 0;

        decoded = (decoded << 6) | (cont & 0x3F);
    }

    /* past the end of the Unicode range */
    if(decoded > 0x10FFFF)
        return 0;

    /* UTF-16 surrogate halves are not valid code points */
    if(decoded >= 0xD800 && decoded <= 0xDFFF)
        return 0;

    /* overlong: the value would have fit in a shorter sequence */
    if(decoded < minimum)
        return 0;

    if(codepoint)
        *codepoint = decoded;

    return 1;
}
138
/*
 * Decode the UTF-8 sequence at the start of buffer and step past it.
 *
 * buffer:    start of the data to read.
 * bufsize:   number of bytes available in buffer.
 * codepoint: if non-NULL, receives the decoded value when a sequence is
 *            consumed.
 *
 * Returns a pointer just past the consumed sequence, buffer itself when
 * bufsize is 0 (nothing is decoded and *codepoint is not written), or
 * NULL if the data is not valid UTF-8 or the sequence is cut short by
 * bufsize.
 */
const char *utf8_iterate(const char *buffer, size_t bufsize, int32_t *codepoint)
{
    size_t seqlen;
    int32_t decoded;

    if(bufsize == 0)
        return buffer;

    seqlen = utf8_check_first(buffer[0]);
    if(seqlen == 0)
        return NULL;

    if(seqlen != 1)
    {
        /* multi-byte: the whole sequence must be present and valid */
        if(seqlen > bufsize)
            return NULL;

        if(!utf8_check_full(buffer, seqlen, &decoded))
            return NULL;
    }
    else
    {
        /* single byte: the byte is the code point */
        decoded = (unsigned char)buffer[0];
    }

    if(codepoint)
        *codepoint = decoded;

    return buffer + seqlen;
}
164
/*
 * Check that the first `length` bytes of `string` are valid UTF-8.
 *
 * Returns 1 if every sequence is valid and complete, 0 otherwise. A
 * zero-length input is reported as valid.
 */
int utf8_check_string(const char *string, size_t length)
{
    size_t pos = 0;

    while(pos < length)
    {
        size_t seqlen = utf8_check_first(string[pos]);

        if(seqlen == 0)
            return 0;

        if(seqlen > 1)
        {
            /* the sequence must not run past the end of the input */
            if(seqlen > length - pos)
                return 0;

            if(!utf8_check_full(string + pos, seqlen, NULL))
                return 0;
        }

        /* step over the whole sequence */
        pos += seqlen;
    }

    return 1;
}
188