utf8.c revision 1.3
1/*	$NetBSD: utf8.c,v 1.3 2022/09/23 12:15:33 christos Exp $	*/
2
3/*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * SPDX-License-Identifier: MPL-2.0
7 *
8 * This Source Code Form is subject to the terms of the Mozilla Public
9 * License, v. 2.0. If a copy of the MPL was not distributed with this
10 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11 *
12 * See the COPYRIGHT file distributed with this work for additional
13 * information regarding copyright ownership.
14 */
15
16#include <string.h>
17
18#include <isc/utf8.h>
19#include <isc/util.h>
20
/*
 * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
 * Also see RFC 3629.
 *
 * Char. number range  |        UTF-8 octet sequence
 *    (hexadecimal)    |              (binary)
 *  --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * isc_utf8_valid() returns true when the 'len' octets at 'buf' consist
 * solely of complete, well-formed sequences from the table above, and
 * false otherwise.  An empty buffer (len == 0) is valid.
 *
 * A sequence is rejected when it:
 *  - is truncated, or has a bad lead or continuation octet;
 *  - is overlong (encodes a value that fits in a shorter form);
 *  - encodes a UTF-16 surrogate, U+D800..U+DFFF, which RFC 3629
 *    section 3 prohibits in UTF-8;
 *  - encodes a value above U+10FFFF.
 *
 * Requires: 'buf' is not NULL.
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	for (size_t i = 0; i < len; i++) {
		/*
		 * Octets available starting at buf[i].  Because i < len
		 * this is at least 1 and, unlike 'i + k < len', cannot
		 * wrap around.
		 */
		size_t left = len - i;
		unsigned int w;

		/* One octet: plain ASCII. */
		if (buf[i] <= 0x7f) {
			continue;
		}

		/* Two octets: U+0080..U+07FF. */
		if (left >= 2 && (buf[i] & 0xe0) == 0xc0 &&
		    (buf[i + 1] & 0xc0) == 0x80)
		{
			w = (unsigned int)(buf[i] & 0x1f) << 6;
			w |= (unsigned int)(buf[i + 1] & 0x3f);
			if (w < 0x80) {
				/* Overlong form of an ASCII character. */
				return (false);
			}
			/* Skip the continuation octet; the loop skips the lead. */
			i += 1;
			continue;
		}

		/* Three octets: U+0800..U+FFFF, excluding surrogates. */
		if (left >= 3 && (buf[i] & 0xf0) == 0xe0 &&
		    (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
		{
			w = (unsigned int)(buf[i] & 0x0f) << 12;
			w |= (unsigned int)(buf[i + 1] & 0x3f) << 6;
			w |= (unsigned int)(buf[i + 2] & 0x3f);
			if (w < 0x0800) {
				/* Overlong form. */
				return (false);
			}
			if (w >= 0xd800 && w <= 0xdfff) {
				/* UTF-16 surrogates are not valid in UTF-8. */
				return (false);
			}
			i += 2;
			continue;
		}

		/* Four octets: U+10000..U+10FFFF. */
		if (left >= 4 && (buf[i] & 0xf8) == 0xf0 &&
		    (buf[i + 1] & 0xc0) == 0x80 &&
		    (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
		{
			w = (unsigned int)(buf[i] & 0x07) << 18;
			w |= (unsigned int)(buf[i + 1] & 0x3f) << 12;
			w |= (unsigned int)(buf[i + 2] & 0x3f) << 6;
			w |= (unsigned int)(buf[i + 3] & 0x3f);
			if (w < 0x10000 || w > 0x10FFFF) {
				/* Overlong, or beyond the Unicode range. */
				return (false);
			}
			i += 3;
			continue;
		}

		/*
		 * Stray continuation octet, invalid lead octet, truncated
		 * sequence, or a lead followed by a non-continuation octet.
		 */
		return (false);
	}
	return (true);
}
81
/*
 * Report whether the buffer begins with the three-octet UTF-8 byte
 * order mark, EF BB BF.
 *
 * Returns true only when 'len' is at least 3 and the first three octets
 * of 'buf' match the mark; returns false otherwise.
 *
 * Requires: 'buf' is not NULL.
 */
bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	static const unsigned char bom[] = { 0xef, 0xbb, 0xbf };

	REQUIRE(buf != NULL);

	/* Too short to hold the mark: do not read past the buffer. */
	if (len < sizeof(bom)) {
		return (false);
	}
	return (memcmp(buf, bom, sizeof(bom)) == 0);
}
91