utf8.c revision 1.1
1/*	$NetBSD: utf8.c,v 1.1 2020/08/03 17:07:14 christos Exp $	*/
2
3/*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * This Source Code Form is subject to the terms of the Mozilla Public
7 * License, v. 2.0. If a copy of the MPL was not distributed with this
8 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 *
10 * See the COPYRIGHT file distributed with this work for additional
11 * information regarding copyright ownership.
12 */
13
14#include <string.h>
15
16#include <isc/utf8.h>
17#include <isc/util.h>
18
19/*
20 * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
21 * Also see RFC 3629.
22 *
23 * Char. number range  |        UTF-8 octet sequence
24 *    (hexadecimal)    |              (binary)
25 *  --------------------+---------------------------------------------
26 * 0000 0000-0000 007F | 0xxxxxxx
27 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
28 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
29 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
30 */
/*
 * Return true if the 'len' octets starting at 'buf' form a well-formed
 * UTF-8 sequence as laid out in the table above, false otherwise.
 *
 * 'buf' must not be NULL (enforced with REQUIRE).  An empty buffer
 * (len == 0) is considered valid.
 *
 * A sequence is rejected when:
 *  - a lead octet does not match any of the four patterns,
 *  - a multi-octet sequence is truncated or has a bad continuation octet,
 *  - the decoded value could have been encoded in fewer octets (overlong),
 *  - the decoded value is a UTF-16 surrogate (U+D800..U+DFFF), or
 *  - the decoded value is above U+10FFFF.
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	for (size_t i = 0; i < len; i++) {
		/* Single octet: 0xxxxxxx. */
		if (buf[i] <= 0x7f) {
			continue;
		}

		/* Two octets: 110xxxxx 10xxxxxx. */
		if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 &&
		    (buf[i + 1] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x1f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong: fits in a single octet. */
			if (w < 0x80) {
				return (false);
			}
			continue;
		}

		/* Three octets: 1110xxxx 10xxxxxx 10xxxxxx. */
		if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 &&
		    (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x0f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong: fits in two octets or fewer. */
			if (w < 0x0800) {
				return (false);
			}
			/*
			 * U+D800..U+DFFF are reserved for UTF-16 surrogate
			 * pairs and must not appear in UTF-8 (RFC 3629).
			 */
			if (w >= 0xd800 && w <= 0xdfff) {
				return (false);
			}
			continue;
		}

		/* Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
		if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 &&
		    (buf[i + 1] & 0xc0) == 0x80 &&
		    (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x07) << 18;
			w |= (buf[++i] & 0x3f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong, or beyond the last Unicode code point. */
			if (w < 0x10000 || w > 0x10FFFF) {
				return (false);
			}
			continue;
		}

		/*
		 * Stray continuation octet, invalid lead octet, truncated
		 * sequence or malformed continuation octet.
		 */
		return (false);
	}
	return (true);
}
79
/*
 * Return true if the buffer 'buf' of 'len' octets begins with the
 * three-octet UTF-8 byte order mark (EF BB BF), false otherwise.
 *
 * 'buf' must not be NULL (enforced with REQUIRE).  Buffers shorter than
 * three octets never match.
 */
bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	static const unsigned char bom[] = { 0xef, 0xbb, 0xbf };

	REQUIRE(buf != NULL);

	/* Too short to hold the mark at all. */
	if (len < sizeof(bom)) {
		return (false);
	}
	return (memcmp(buf, bom, sizeof(bom)) == 0);
}
89