1/* $NetBSD: lutf8lib.c,v 1.11 2023/06/08 21:12:08 nikita Exp $ */ 2 3/* 4** Id: lutf8lib.c 5** Standard library for UTF-8 manipulation 6** See Copyright Notice in lua.h 7*/ 8 9#define lutf8lib_c 10#define LUA_LIB 11 12#include "lprefix.h" 13 14 15#ifndef _KERNEL 16#include <assert.h> 17#include <limits.h> 18#include <stdlib.h> 19#include <string.h> 20#endif /* _KERNEL */ 21 22#include "lua.h" 23 24#include "lauxlib.h" 25#include "lualib.h" 26 27 28#define MAXUNICODE 0x10FFFFu 29 30#define MAXUTF 0x7FFFFFFFu 31 32 33#define MSGInvalid "invalid UTF-8 code" 34 35/* 36** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits. 37*/ 38#if (UINT_MAX >> 30) >= 1 39typedef unsigned int utfint; 40#else 41typedef unsigned long utfint; 42#endif 43 44 45#define iscont(c) (((c) & 0xC0) == 0x80) 46#define iscontp(p) iscont(*(p)) 47 48 49/* from strlib */ 50/* translate a relative string position: negative means back from end */ 51static lua_Integer u_posrelat (lua_Integer pos, size_t len) { 52 if (pos >= 0) return pos; 53 else if (0u - (size_t)pos > len) return 0; 54 else return (lua_Integer)len + pos + 1; 55} 56 57 58/* 59** Decode one UTF-8 sequence, returning NULL if byte sequence is 60** invalid. The array 'limits' stores the minimum value for each 61** sequence length, to check for overlong representations. Its first 62** entry forces an error for non-ascii bytes with no continuation 63** bytes (count == 0). 64*/ 65static const char *utf8_decode (const char *s, utfint *val, int strict) { 66 static const utfint limits[] = 67 {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u}; 68 unsigned int c = (unsigned char)s[0]; 69 utfint res = 0; /* final result */ 70 if (c < 0x80) /* ascii? */ 71 res = c; 72 else { 73 int count = 0; /* to count number of continuation bytes */ 74 for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ 75 unsigned int cc = (unsigned char)s[++count]; /* read next byte */ 76 if (!iscont(cc)) /* not a continuation byte? */ 77 return NULL; /* invalid byte sequence */ 78 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ 79 } 80 res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ 81 if (count > 5 || res > MAXUTF || res < limits[count]) 82 return NULL; /* invalid byte sequence */ 83 s += count; /* skip continuation bytes read */ 84 } 85 if (strict) { 86 /* check for invalid code points; too large or surrogates */ 87 if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu)) 88 return NULL; 89 } 90 if (val) *val = res; 91 return s + 1; /* +1 to include first byte */ 92} 93 94 95/* 96** utf8len(s [, i [, j [, lax]]]) --> number of characters that 97** start in the range [i,j], or nil + current position if 's' is not 98** well formed in that interval 99*/ 100static int utflen (lua_State *L) { 101 lua_Integer n = 0; /* counter for the number of characters */ 102 size_t len; /* string length in bytes */ 103 const char *s = luaL_checklstring(L, 1, &len); 104 lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); 105 lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len); 106 int lax = lua_toboolean(L, 4); 107 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, 108 "initial position out of bounds"); 109 luaL_argcheck(L, --posj < (lua_Integer)len, 3, 110 "final position out of bounds"); 111 while (posi <= posj) { 112 const char *s1 = utf8_decode(s + posi, NULL, !lax); 113 if (s1 == NULL) { /* conversion error? */ 114 luaL_pushfail(L); /* return fail ... */ 115 lua_pushinteger(L, posi + 1); /* ... and current position */ 116 return 2; 117 } 118 posi = s1 - s; 119 n++; 120 } 121 lua_pushinteger(L, n); 122 return 1; 123} 124 125 126/* 127** codepoint(s, [i, [j [, lax]]]) -> returns codepoints for all 128** characters that start in the range [i,j] 129*/ 130static int codepoint (lua_State *L) { 131 size_t len; 132 const char *s = luaL_checklstring(L, 1, &len); 133 lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); 134 lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len); 135 int lax = lua_toboolean(L, 4); 136 int n; 137 const char *se; 138 luaL_argcheck(L, posi >= 1, 2, "out of bounds"); 139 luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds"); 140 if (posi > pose) return 0; /* empty interval; return no values */ 141 if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ 142 return luaL_error(L, "string slice too long"); 143 n = (int)(pose - posi) + 1; /* upper bound for number of returns */ 144 luaL_checkstack(L, n, "string slice too long"); 145 n = 0; /* count the number of returns */ 146 se = s + pose; /* string end */ 147 for (s += posi - 1; s < se;) { 148 utfint code; 149 s = utf8_decode(s, &code, !lax); 150 if (s == NULL) 151 return luaL_error(L, MSGInvalid); 152 lua_pushinteger(L, code); 153 n++; 154 } 155 return n; 156} 157 158 159static void pushutfchar (lua_State *L, int arg) { 160 lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg); 161 luaL_argcheck(L, code <= MAXUTF, arg, "value out of range"); 162 lua_pushfstring(L, "%U", (long)code); 163} 164 165 166/* 167** utfchar(n1, n2, ...) -> char(n1)..char(n2)... 168*/ 169static int utfchar (lua_State *L) { 170 int n = lua_gettop(L); /* number of arguments */ 171 if (n == 1) /* optimize common case of single char */ 172 pushutfchar(L, 1); 173 else { 174 int i; 175 luaL_Buffer b; 176 luaL_buffinit(L, &b); 177 for (i = 1; i <= n; i++) { 178 pushutfchar(L, i); 179 luaL_addvalue(&b); 180 } 181 luaL_pushresult(&b); 182 } 183 return 1; 184} 185 186 187/* 188** offset(s, n, [i]) -> index where n-th character counting from 189** position 'i' starts; 0 means character at 'i'. 190*/ 191static int byteoffset (lua_State *L) { 192 size_t len; 193 const char *s = luaL_checklstring(L, 1, &len); 194 lua_Integer n = luaL_checkinteger(L, 2); 195 lua_Integer posi = (n >= 0) ? 1 : len + 1; 196 posi = u_posrelat(luaL_optinteger(L, 3, posi), len); 197 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, 198 "position out of bounds"); 199 if (n == 0) { 200 /* find beginning of current byte sequence */ 201 while (posi > 0 && iscontp(s + posi)) posi--; 202 } 203 else { 204 if (iscontp(s + posi)) 205 return luaL_error(L, "initial position is a continuation byte"); 206 if (n < 0) { 207 while (n < 0 && posi > 0) { /* move back */ 208 do { /* find beginning of previous character */ 209 posi--; 210 } while (posi > 0 && iscontp(s + posi)); 211 n++; 212 } 213 } 214 else { 215 n--; /* do not move for 1st character */ 216 while (n > 0 && posi < (lua_Integer)len) { 217 do { /* find beginning of next character */ 218 posi++; 219 } while (iscontp(s + posi)); /* (cannot pass final '\0') */ 220 n--; 221 } 222 } 223 } 224 if (n == 0) /* did it find given character? */ 225 lua_pushinteger(L, posi + 1); 226 else /* no such character */ 227 luaL_pushfail(L); 228 return 1; 229} 230 231 232static int iter_aux (lua_State *L, int strict) { 233 size_t len; 234 const char *s = luaL_checklstring(L, 1, &len); 235 lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2); 236 if (n < len) { 237 while (iscontp(s + n)) n++; /* go to next character */ 238 } 239 if (n >= len) /* (also handles original 'n' being negative) */ 240 return 0; /* no more codepoints */ 241 else { 242 utfint code; 243 const char *next = utf8_decode(s + n, &code, strict); 244 if (next == NULL || iscontp(next)) 245 return luaL_error(L, MSGInvalid); 246 lua_pushinteger(L, n + 1); 247 lua_pushinteger(L, code); 248 return 2; 249 } 250} 251 252 253static int iter_auxstrict (lua_State *L) { 254 return iter_aux(L, 1); 255} 256 257static int iter_auxlax (lua_State *L) { 258 return iter_aux(L, 0); 259} 260 261 262static int iter_codes (lua_State *L) { 263 int lax = lua_toboolean(L, 2); 264 const char *s = luaL_checkstring(L, 1); 265 luaL_argcheck(L, !iscontp(s), 1, MSGInvalid); 266 lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict); 267 lua_pushvalue(L, 1); 268 lua_pushinteger(L, 0); 269 return 3; 270} 271 272 273/* pattern to match a single UTF-8 character */ 274#define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*" 275 276 277static const luaL_Reg funcs[] = { 278 {"offset", byteoffset}, 279 {"codepoint", codepoint}, 280 {"char", utfchar}, 281 {"len", utflen}, 282 {"codes", iter_codes}, 283 /* placeholders */ 284 {"charpattern", NULL}, 285 {NULL, NULL} 286}; 287 288 289LUAMOD_API int luaopen_utf8 (lua_State *L) { 290 luaL_newlib(L, funcs); 291 lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1); 292 lua_setfield(L, -2, "charpattern"); 293 return 1; 294} 295 296