lutf8lib.c revision 344220
1/*
2** $Id: lutf8lib.c,v 1.16.1.1 2017/04/19 17:29:57 roberto Exp $
3** Standard library for UTF-8 manipulation
4** See Copyright Notice in lua.h
5*/
6
7#define lutf8lib_c
8#define LUA_LIB
9
10#include "lprefix.h"
11
12
13#include <assert.h>
14#include <limits.h>
15#include <stdlib.h>
16#include <string.h>
17
18#include "lua.h"
19
20#include "lauxlib.h"
21#include "lualib.h"
22
/* upper bound for code points accepted by the decoder and by utf8.char */
#define MAXUNICODE	0x10FFFF

/* true when the byte pointed to by 'p' has the form 10xxxxxx, that is,
   it is a UTF-8 continuation byte */
#define iscont(p)	((*(p) & 0xC0) == 0x80)
26
27
28/* from strlib */
29/* translate a relative string position: negative means back from end */
30static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
31  if (pos >= 0) return pos;
32  else if (0u - (size_t)pos > len) return 0;
33  else return (lua_Integer)len + pos + 1;
34}
35
36
37/*
38** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
39*/
40static const char *utf8_decode (const char *o, int *val) {
41  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
42  const unsigned char *s = (const unsigned char *)o;
43  unsigned int c = s[0];
44  unsigned int res = 0;  /* final result */
45  if (c < 0x80)  /* ascii? */
46    res = c;
47  else {
48    int count = 0;  /* to count number of continuation bytes */
49    while (c & 0x40) {  /* still have continuation bytes? */
50      int cc = s[++count];  /* read next byte */
51      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
52        return NULL;  /* invalid byte sequence */
53      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
54      c <<= 1;  /* to test next bit */
55    }
56    res |= ((c & 0x7F) << (count * 5));  /* add first byte */
57    if (count > 3 || res > MAXUNICODE || res <= limits[count])
58      return NULL;  /* invalid byte sequence */
59    s += count;  /* skip continuation bytes read */
60  }
61  if (val) *val = res;
62  return (const char *)s + 1;  /* +1 to include first byte */
63}
64
65
66/*
67** utf8len(s [, i [, j]]) --> number of characters that start in the
68** range [i,j], or nil + current position if 's' is not well formed in
69** that interval
70*/
71static int utflen (lua_State *L) {
72  int n = 0;
73  size_t len;
74  const char *s = luaL_checklstring(L, 1, &len);
75  lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
76  lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
77  luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
78                   "initial position out of string");
79  luaL_argcheck(L, --posj < (lua_Integer)len, 3,
80                   "final position out of string");
81  while (posi <= posj) {
82    const char *s1 = utf8_decode(s + posi, NULL);
83    if (s1 == NULL) {  /* conversion error? */
84      lua_pushnil(L);  /* return nil ... */
85      lua_pushinteger(L, posi + 1);  /* ... and current position */
86      return 2;
87    }
88    posi = s1 - s;
89    n++;
90  }
91  lua_pushinteger(L, n);
92  return 1;
93}
94
95
96/*
97** codepoint(s, [i, [j]])  -> returns codepoints for all characters
98** that start in the range [i,j]
99*/
100static int codepoint (lua_State *L) {
101  size_t len;
102  const char *s = luaL_checklstring(L, 1, &len);
103  lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
104  lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
105  int n;
106  const char *se;
107  luaL_argcheck(L, posi >= 1, 2, "out of range");
108  luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
109  if (posi > pose) return 0;  /* empty interval; return no values */
110  if (pose - posi >= INT_MAX)  /* (lua_Integer -> int) overflow? */
111    return luaL_error(L, "string slice too long");
112  n = (int)(pose -  posi) + 1;
113  luaL_checkstack(L, n, "string slice too long");
114  n = 0;
115  se = s + pose;
116  for (s += posi - 1; s < se;) {
117    int code;
118    s = utf8_decode(s, &code);
119    if (s == NULL)
120      return luaL_error(L, "invalid UTF-8 code");
121    lua_pushinteger(L, code);
122    n++;
123  }
124  return n;
125}
126
127
128static void pushutfchar (lua_State *L, int arg) {
129  lua_Integer code = luaL_checkinteger(L, arg);
130  luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
131  lua_pushfstring(L, "%U", (long)code);
132}
133
134
135/*
136** utfchar(n1, n2, ...)  -> char(n1)..char(n2)...
137*/
138static int utfchar (lua_State *L) {
139  int n = lua_gettop(L);  /* number of arguments */
140  if (n == 1)  /* optimize common case of single char */
141    pushutfchar(L, 1);
142  else {
143    int i;
144    luaL_Buffer b;
145    luaL_buffinit(L, &b);
146    for (i = 1; i <= n; i++) {
147      pushutfchar(L, i);
148      luaL_addvalue(&b);
149    }
150    luaL_pushresult(&b);
151  }
152  return 1;
153}
154
155
156/*
157** offset(s, n, [i])  -> index where n-th character counting from
158**   position 'i' starts; 0 means character at 'i'.
159*/
160static int byteoffset (lua_State *L) {
161  size_t len;
162  const char *s = luaL_checklstring(L, 1, &len);
163  lua_Integer n  = luaL_checkinteger(L, 2);
164  lua_Integer posi = (n >= 0) ? 1 : len + 1;
165  posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
166  luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
167                   "position out of range");
168  if (n == 0) {
169    /* find beginning of current byte sequence */
170    while (posi > 0 && iscont(s + posi)) posi--;
171  }
172  else {
173    if (iscont(s + posi))
174      return luaL_error(L, "initial position is a continuation byte");
175    if (n < 0) {
176       while (n < 0 && posi > 0) {  /* move back */
177         do {  /* find beginning of previous character */
178           posi--;
179         } while (posi > 0 && iscont(s + posi));
180         n++;
181       }
182     }
183     else {
184       n--;  /* do not move for 1st character */
185       while (n > 0 && posi < (lua_Integer)len) {
186         do {  /* find beginning of next character */
187           posi++;
188         } while (iscont(s + posi));  /* (cannot pass final '\0') */
189         n--;
190       }
191     }
192  }
193  if (n == 0)  /* did it find given character? */
194    lua_pushinteger(L, posi + 1);
195  else  /* no such character */
196    lua_pushnil(L);
197  return 1;
198}
199
200
201static int iter_aux (lua_State *L) {
202  size_t len;
203  const char *s = luaL_checklstring(L, 1, &len);
204  lua_Integer n = lua_tointeger(L, 2) - 1;
205  if (n < 0)  /* first iteration? */
206    n = 0;  /* start from here */
207  else if (n < (lua_Integer)len) {
208    n++;  /* skip current byte */
209    while (iscont(s + n)) n++;  /* and its continuations */
210  }
211  if (n >= (lua_Integer)len)
212    return 0;  /* no more codepoints */
213  else {
214    int code;
215    const char *next = utf8_decode(s + n, &code);
216    if (next == NULL || iscont(next))
217      return luaL_error(L, "invalid UTF-8 code");
218    lua_pushinteger(L, n + 1);
219    lua_pushinteger(L, code);
220    return 2;
221  }
222}
223
224
225static int iter_codes (lua_State *L) {
226  luaL_checkstring(L, 1);
227  lua_pushcfunction(L, iter_aux);
228  lua_pushvalue(L, 1);
229  lua_pushinteger(L, 0);
230  return 3;
231}
232
233
/*
** Pattern to match a single UTF-8 character: one first byte (in
** 0x00-0x7F or 0xC2-0xF4) followed by any number of continuation bytes.
** The literal contains an embedded '\0', so its length must be taken
** with 'sizeof' rather than 'strlen'.
*/
#define UTF8PATT	"[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
236
237
/*
** Functions exported by the library, keyed by the field name each one
** gets in the library table built by 'luaopen_utf8'.
*/
static const luaL_Reg funcs[] = {
  {"offset", byteoffset},
  {"codepoint", codepoint},
  {"char", utfchar},
  {"len", utflen},
  {"codes", iter_codes},
  /* placeholders */
  /* 'charpattern' has no function here; 'luaopen_utf8' stores a string
     in that field after the table is created */
  {"charpattern", NULL},
  {NULL, NULL}  /* sentinel marking the end of the list */
};
248
249
250LUAMOD_API int luaopen_utf8 (lua_State *L) {
251  luaL_newlib(L, funcs);
252  lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1);
253  lua_setfield(L, -2, "charpattern");
254  return 1;
255}
256
257