133965Sjdp/*	$NetBSD: utf8.c,v 1.4 2023/01/25 21:43:31 christos Exp $	*/
233965Sjdp
333965Sjdp/*
433965Sjdp * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
533965Sjdp *
633965Sjdp * SPDX-License-Identifier: MPL-2.0
733965Sjdp *
833965Sjdp * This Source Code Form is subject to the terms of the Mozilla Public
933965Sjdp * License, v. 2.0. If a copy of the MPL was not distributed with this
1033965Sjdp * file, you can obtain one at https://mozilla.org/MPL/2.0/.
1133965Sjdp *
1233965Sjdp * See the COPYRIGHT file distributed with this work for additional
1333965Sjdp * information regarding copyright ownership.
1433965Sjdp */
1533965Sjdp
1633965Sjdp#include <string.h>
1733965Sjdp
1833965Sjdp#include <isc/utf8.h>
1933965Sjdp#include <isc/util.h>
2033965Sjdp
/*
 * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
 * Also see RFC 3629.
 *
 * Char. number range  |        UTF-8 octet sequence
 *    (hexadecimal)    |              (binary)
 * --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * isc_utf8_valid() returns true when the 'len' octets at 'buf' consist
 * solely of well-formed sequences from the table above, and false
 * otherwise.  A sequence is rejected when:
 *
 *  - the lead octet matches none of the four patterns (this includes a
 *    stray continuation octet and lead octets 0xF8..0xFF);
 *  - the buffer ends before the sequence is complete;
 *  - a trailing octet is not of the form 10xxxxxx;
 *  - the decoded value is below the minimum for the sequence length
 *    (an "overlong" encoding);
 *  - the decoded value is above U+10FFFF;
 *  - the decoded value lies in U+D800..U+DFFF, the UTF-16 surrogate
 *    range, which RFC 3629 prohibits encoding in UTF-8.
 *
 * An empty buffer (len == 0) is valid.  'buf' must not be NULL; this is
 * checked with REQUIRE().
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	size_t i = 0;
	while (i < len) {
		const unsigned char lead = buf[i];
		size_t ntrail;	  /* number of continuation octets expected */
		unsigned int cp;  /* code point being assembled */
		unsigned int min; /* smallest value legal for this length */

		if (lead <= 0x7f) {
			/* Single-octet (ASCII) character. */
			i++;
			continue;
		}

		if ((lead & 0xe0) == 0xc0) {
			ntrail = 1;
			cp = lead & 0x1f;
			min = 0x80;
		} else if ((lead & 0xf0) == 0xe0) {
			ntrail = 2;
			cp = lead & 0x0f;
			min = 0x0800;
		} else if ((lead & 0xf8) == 0xf0) {
			ntrail = 3;
			cp = lead & 0x07;
			min = 0x10000;
		} else {
			/* Stray continuation octet or invalid lead octet. */
			return (false);
		}

		/*
		 * All continuation octets must lie inside the buffer.
		 * 'i < len' holds here, so the subtraction cannot wrap.
		 */
		if (len - i <= ntrail) {
			return (false);
		}

		for (size_t k = 1; k <= ntrail; k++) {
			const unsigned char trail = buf[i + k];

			if ((trail & 0xc0) != 0x80) {
				return (false);
			}
			cp = (cp << 6) | (trail & 0x3f);
		}

		/* Overlong encoding, or beyond the Unicode code space. */
		if (cp < min || cp > 0x10ffff) {
			return (false);
		}

		/* UTF-16 surrogates are not valid scalar values. */
		if (cp >= 0xd800 && cp <= 0xdfff) {
			return (false);
		}

		i += ntrail + 1;
	}

	return (true);
}
8233965Sjdp
/*
 * Report whether the buffer starts with the three octets EF BB BF,
 * i.e. the UTF-8 encoding of the byte order mark (U+FEFF).
 *
 * Returns true only when at least three octets are available and they
 * match; shorter buffers yield false.  'buf' must not be NULL; this is
 * checked with REQUIRE().
 */
bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	static const unsigned char bom[] = { 0xef, 0xbb, 0xbf };

	REQUIRE(buf != NULL);

	/* Too short to hold the marker at all. */
	if (len < sizeof(bom)) {
		return (false);
	}

	return (memcmp(buf, bom, sizeof(bom)) == 0);
}
9233965Sjdp