1280405Srpaulo/*
2344220Skevans** $Id: lutf8lib.c,v 1.16.1.1 2017/04/19 17:29:57 roberto Exp $
3280405Srpaulo** Standard library for UTF-8 manipulation
4280405Srpaulo** See Copyright Notice in lua.h
5280405Srpaulo*/
6280405Srpaulo
7280405Srpaulo#define lutf8lib_c
8280405Srpaulo#define LUA_LIB
9280405Srpaulo
10280405Srpaulo#include "lprefix.h"
11280405Srpaulo
12280405Srpaulo
13280405Srpaulo#include <assert.h>
14326344Simp#include <limits.h>
15280405Srpaulo#include <stdlib.h>
16280405Srpaulo#include <string.h>
17280405Srpaulo
18280405Srpaulo#include "lua.h"
19280405Srpaulo
20280405Srpaulo#include "lauxlib.h"
21280405Srpaulo#include "lualib.h"
22280405Srpaulo
/* largest code point accepted/produced by this library */
#define MAXUNICODE	0x10FFFF

/* true when the byte at 'p' has the bit pattern 10xxxxxx */
#define iscont(p)	(0x80 == (*(p) & 0xC0))
26280405Srpaulo
27280405Srpaulo
28280405Srpaulo/* from strlib */
29280405Srpaulo/* translate a relative string position: negative means back from end */
30280405Srpaulostatic lua_Integer u_posrelat (lua_Integer pos, size_t len) {
31280405Srpaulo  if (pos >= 0) return pos;
32280405Srpaulo  else if (0u - (size_t)pos > len) return 0;
33280405Srpaulo  else return (lua_Integer)len + pos + 1;
34280405Srpaulo}
35280405Srpaulo
36280405Srpaulo
37280405Srpaulo/*
38280405Srpaulo** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
39280405Srpaulo*/
40280405Srpaulostatic const char *utf8_decode (const char *o, int *val) {
41326344Simp  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
42280405Srpaulo  const unsigned char *s = (const unsigned char *)o;
43280405Srpaulo  unsigned int c = s[0];
44280405Srpaulo  unsigned int res = 0;  /* final result */
45280405Srpaulo  if (c < 0x80)  /* ascii? */
46280405Srpaulo    res = c;
47280405Srpaulo  else {
48280405Srpaulo    int count = 0;  /* to count number of continuation bytes */
49280405Srpaulo    while (c & 0x40) {  /* still have continuation bytes? */
50280405Srpaulo      int cc = s[++count];  /* read next byte */
51280405Srpaulo      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
52280405Srpaulo        return NULL;  /* invalid byte sequence */
53280405Srpaulo      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
54280405Srpaulo      c <<= 1;  /* to test next bit */
55280405Srpaulo    }
56280405Srpaulo    res |= ((c & 0x7F) << (count * 5));  /* add first byte */
57280405Srpaulo    if (count > 3 || res > MAXUNICODE || res <= limits[count])
58280405Srpaulo      return NULL;  /* invalid byte sequence */
59280405Srpaulo    s += count;  /* skip continuation bytes read */
60280405Srpaulo  }
61280405Srpaulo  if (val) *val = res;
62280405Srpaulo  return (const char *)s + 1;  /* +1 to include first byte */
63280405Srpaulo}
64280405Srpaulo
65280405Srpaulo
66280405Srpaulo/*
67280405Srpaulo** utf8len(s [, i [, j]]) --> number of characters that start in the
68280405Srpaulo** range [i,j], or nil + current position if 's' is not well formed in
69280405Srpaulo** that interval
70280405Srpaulo*/
71280405Srpaulostatic int utflen (lua_State *L) {
72280405Srpaulo  int n = 0;
73280405Srpaulo  size_t len;
74280405Srpaulo  const char *s = luaL_checklstring(L, 1, &len);
75280405Srpaulo  lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
76280405Srpaulo  lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
77280405Srpaulo  luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
78280405Srpaulo                   "initial position out of string");
79280405Srpaulo  luaL_argcheck(L, --posj < (lua_Integer)len, 3,
80280405Srpaulo                   "final position out of string");
81280405Srpaulo  while (posi <= posj) {
82280405Srpaulo    const char *s1 = utf8_decode(s + posi, NULL);
83280405Srpaulo    if (s1 == NULL) {  /* conversion error? */
84280405Srpaulo      lua_pushnil(L);  /* return nil ... */
85280405Srpaulo      lua_pushinteger(L, posi + 1);  /* ... and current position */
86280405Srpaulo      return 2;
87280405Srpaulo    }
88280405Srpaulo    posi = s1 - s;
89280405Srpaulo    n++;
90280405Srpaulo  }
91280405Srpaulo  lua_pushinteger(L, n);
92280405Srpaulo  return 1;
93280405Srpaulo}
94280405Srpaulo
95280405Srpaulo
96280405Srpaulo/*
97280405Srpaulo** codepoint(s, [i, [j]])  -> returns codepoints for all characters
98280405Srpaulo** that start in the range [i,j]
99280405Srpaulo*/
100280405Srpaulostatic int codepoint (lua_State *L) {
101280405Srpaulo  size_t len;
102280405Srpaulo  const char *s = luaL_checklstring(L, 1, &len);
103280405Srpaulo  lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
104280405Srpaulo  lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
105280405Srpaulo  int n;
106280405Srpaulo  const char *se;
107280405Srpaulo  luaL_argcheck(L, posi >= 1, 2, "out of range");
108280405Srpaulo  luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
109280405Srpaulo  if (posi > pose) return 0;  /* empty interval; return no values */
110326344Simp  if (pose - posi >= INT_MAX)  /* (lua_Integer -> int) overflow? */
111280405Srpaulo    return luaL_error(L, "string slice too long");
112326344Simp  n = (int)(pose -  posi) + 1;
113280405Srpaulo  luaL_checkstack(L, n, "string slice too long");
114280405Srpaulo  n = 0;
115280405Srpaulo  se = s + pose;
116280405Srpaulo  for (s += posi - 1; s < se;) {
117280405Srpaulo    int code;
118280405Srpaulo    s = utf8_decode(s, &code);
119280405Srpaulo    if (s == NULL)
120280405Srpaulo      return luaL_error(L, "invalid UTF-8 code");
121280405Srpaulo    lua_pushinteger(L, code);
122280405Srpaulo    n++;
123280405Srpaulo  }
124280405Srpaulo  return n;
125280405Srpaulo}
126280405Srpaulo
127280405Srpaulo
128280405Srpaulostatic void pushutfchar (lua_State *L, int arg) {
129280405Srpaulo  lua_Integer code = luaL_checkinteger(L, arg);
130280405Srpaulo  luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
131280405Srpaulo  lua_pushfstring(L, "%U", (long)code);
132280405Srpaulo}
133280405Srpaulo
134280405Srpaulo
135280405Srpaulo/*
136280405Srpaulo** utfchar(n1, n2, ...)  -> char(n1)..char(n2)...
137280405Srpaulo*/
138280405Srpaulostatic int utfchar (lua_State *L) {
139280405Srpaulo  int n = lua_gettop(L);  /* number of arguments */
140280405Srpaulo  if (n == 1)  /* optimize common case of single char */
141280405Srpaulo    pushutfchar(L, 1);
142280405Srpaulo  else {
143280405Srpaulo    int i;
144280405Srpaulo    luaL_Buffer b;
145280405Srpaulo    luaL_buffinit(L, &b);
146280405Srpaulo    for (i = 1; i <= n; i++) {
147280405Srpaulo      pushutfchar(L, i);
148280405Srpaulo      luaL_addvalue(&b);
149280405Srpaulo    }
150280405Srpaulo    luaL_pushresult(&b);
151280405Srpaulo  }
152280405Srpaulo  return 1;
153280405Srpaulo}
154280405Srpaulo
155280405Srpaulo
156280405Srpaulo/*
157280405Srpaulo** offset(s, n, [i])  -> index where n-th character counting from
158280405Srpaulo**   position 'i' starts; 0 means character at 'i'.
159280405Srpaulo*/
160280405Srpaulostatic int byteoffset (lua_State *L) {
161280405Srpaulo  size_t len;
162280405Srpaulo  const char *s = luaL_checklstring(L, 1, &len);
163280405Srpaulo  lua_Integer n  = luaL_checkinteger(L, 2);
164280405Srpaulo  lua_Integer posi = (n >= 0) ? 1 : len + 1;
165280405Srpaulo  posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
166280405Srpaulo  luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
167280405Srpaulo                   "position out of range");
168280405Srpaulo  if (n == 0) {
169280405Srpaulo    /* find beginning of current byte sequence */
170280405Srpaulo    while (posi > 0 && iscont(s + posi)) posi--;
171280405Srpaulo  }
172280405Srpaulo  else {
173280405Srpaulo    if (iscont(s + posi))
174344220Skevans      return luaL_error(L, "initial position is a continuation byte");
175280405Srpaulo    if (n < 0) {
176280405Srpaulo       while (n < 0 && posi > 0) {  /* move back */
177280405Srpaulo         do {  /* find beginning of previous character */
178280405Srpaulo           posi--;
179280405Srpaulo         } while (posi > 0 && iscont(s + posi));
180280405Srpaulo         n++;
181280405Srpaulo       }
182280405Srpaulo     }
183280405Srpaulo     else {
184280405Srpaulo       n--;  /* do not move for 1st character */
185280405Srpaulo       while (n > 0 && posi < (lua_Integer)len) {
186280405Srpaulo         do {  /* find beginning of next character */
187280405Srpaulo           posi++;
188280405Srpaulo         } while (iscont(s + posi));  /* (cannot pass final '\0') */
189280405Srpaulo         n--;
190280405Srpaulo       }
191280405Srpaulo     }
192280405Srpaulo  }
193280405Srpaulo  if (n == 0)  /* did it find given character? */
194280405Srpaulo    lua_pushinteger(L, posi + 1);
195280405Srpaulo  else  /* no such character */
196280405Srpaulo    lua_pushnil(L);
197326344Simp  return 1;
198280405Srpaulo}
199280405Srpaulo
200280405Srpaulo
201280405Srpaulostatic int iter_aux (lua_State *L) {
202280405Srpaulo  size_t len;
203280405Srpaulo  const char *s = luaL_checklstring(L, 1, &len);
204280405Srpaulo  lua_Integer n = lua_tointeger(L, 2) - 1;
205280405Srpaulo  if (n < 0)  /* first iteration? */
206280405Srpaulo    n = 0;  /* start from here */
207280405Srpaulo  else if (n < (lua_Integer)len) {
208280405Srpaulo    n++;  /* skip current byte */
209280405Srpaulo    while (iscont(s + n)) n++;  /* and its continuations */
210280405Srpaulo  }
211280405Srpaulo  if (n >= (lua_Integer)len)
212280405Srpaulo    return 0;  /* no more codepoints */
213280405Srpaulo  else {
214280405Srpaulo    int code;
215280405Srpaulo    const char *next = utf8_decode(s + n, &code);
216280405Srpaulo    if (next == NULL || iscont(next))
217280405Srpaulo      return luaL_error(L, "invalid UTF-8 code");
218280405Srpaulo    lua_pushinteger(L, n + 1);
219280405Srpaulo    lua_pushinteger(L, code);
220280405Srpaulo    return 2;
221280405Srpaulo  }
222280405Srpaulo}
223280405Srpaulo
224280405Srpaulo
225280405Srpaulostatic int iter_codes (lua_State *L) {
226280405Srpaulo  luaL_checkstring(L, 1);
227280405Srpaulo  lua_pushcfunction(L, iter_aux);
228280405Srpaulo  lua_pushvalue(L, 1);
229280405Srpaulo  lua_pushinteger(L, 0);
230280405Srpaulo  return 3;
231280405Srpaulo}
232280405Srpaulo
233280405Srpaulo
234280405Srpaulo/* pattern to match a single UTF-8 character */
235280405Srpaulo#define UTF8PATT	"[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
236280405Srpaulo
237280405Srpaulo
238326344Simpstatic const luaL_Reg funcs[] = {
239280405Srpaulo  {"offset", byteoffset},
240280405Srpaulo  {"codepoint", codepoint},
241280405Srpaulo  {"char", utfchar},
242280405Srpaulo  {"len", utflen},
243280405Srpaulo  {"codes", iter_codes},
244280405Srpaulo  /* placeholders */
245280405Srpaulo  {"charpattern", NULL},
246280405Srpaulo  {NULL, NULL}
247280405Srpaulo};
248280405Srpaulo
249280405Srpaulo
250280405SrpauloLUAMOD_API int luaopen_utf8 (lua_State *L) {
251280405Srpaulo  luaL_newlib(L, funcs);
252326344Simp  lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1);
253280405Srpaulo  lua_setfield(L, -2, "charpattern");
254280405Srpaulo  return 1;
255280405Srpaulo}
256280405Srpaulo
257