133965Sjdp/* $NetBSD: utf8.c,v 1.4 2023/01/25 21:43:31 christos Exp $ */ 233965Sjdp 333965Sjdp/* 433965Sjdp * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 533965Sjdp * 633965Sjdp * SPDX-License-Identifier: MPL-2.0 733965Sjdp * 833965Sjdp * This Source Code Form is subject to the terms of the Mozilla Public 933965Sjdp * License, v. 2.0. If a copy of the MPL was not distributed with this 1033965Sjdp * file, you can obtain one at https://mozilla.org/MPL/2.0/. 1133965Sjdp * 1233965Sjdp * See the COPYRIGHT file distributed with this work for additional 1333965Sjdp * information regarding copyright ownership. 1433965Sjdp */ 1533965Sjdp 1633965Sjdp#include <string.h> 1733965Sjdp 1833965Sjdp#include <isc/utf8.h> 1933965Sjdp#include <isc/util.h> 2033965Sjdp 2133965Sjdp/* 2233965Sjdp * UTF-8 is defined in "The Unicode Standard -- Version 4.0" 2333965Sjdp * Also see RFC 3629. 2433965Sjdp * 2533965Sjdp * Char. number range | UTF-8 octet sequence 2633965Sjdp * (hexadecimal) | (binary) 2733965Sjdp * --------------------+--------------------------------------------- 2833965Sjdp * 0000 0000-0000 007F | 0xxxxxxx 2933965Sjdp * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 3033965Sjdp * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 3133965Sjdp * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 3233965Sjdp */ 3333965Sjdpbool 3433965Sjdpisc_utf8_valid(const unsigned char *buf, size_t len) { 3533965Sjdp REQUIRE(buf != NULL); 3633965Sjdp 3733965Sjdp for (size_t i = 0; i < len; i++) { 3833965Sjdp if (buf[i] <= 0x7f) { 3933965Sjdp continue; 4033965Sjdp } 4133965Sjdp if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 && 4233965Sjdp (buf[i + 1] & 0xc0) == 0x80) 4333965Sjdp { 4433965Sjdp unsigned int w; 4533965Sjdp w = (buf[i] & 0x1f) << 6; 4633965Sjdp w |= (buf[++i] & 0x3f); 4733965Sjdp if (w < 0x80) { 4833965Sjdp return (false); 4933965Sjdp } 5033965Sjdp continue; 5133965Sjdp } 5233965Sjdp if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 && 5333965Sjdp (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80) 5433965Sjdp { 5533965Sjdp unsigned int w; 5633965Sjdp w = (buf[i] & 0x0f) << 12; 5733965Sjdp w |= (buf[++i] & 0x3f) << 6; 5833965Sjdp w |= (buf[++i] & 0x3f); 5933965Sjdp if (w < 0x0800) { 6033965Sjdp return (false); 6133965Sjdp } 6233965Sjdp continue; 6333965Sjdp } 6433965Sjdp if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 && 6533965Sjdp (buf[i + 1] & 0xc0) == 0x80 && 6633965Sjdp (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80) 6733965Sjdp { 6833965Sjdp unsigned int w; 6933965Sjdp w = (buf[i] & 0x07) << 18; 7033965Sjdp w |= (buf[++i] & 0x3f) << 12; 7133965Sjdp w |= (buf[++i] & 0x3f) << 6; 7233965Sjdp w |= (buf[++i] & 0x3f); 7333965Sjdp if (w < 0x10000 || w > 0x10FFFF) { 7433965Sjdp return (false); 7533965Sjdp } 7633965Sjdp continue; 7733965Sjdp } 7833965Sjdp return (false); 7933965Sjdp } 8033965Sjdp return (true); 8133965Sjdp} 8233965Sjdp 8333965Sjdpbool 8433965Sjdpisc_utf8_bom(const unsigned char *buf, size_t len) { 8533965Sjdp REQUIRE(buf != NULL); 8633965Sjdp 8733965Sjdp if (len >= 3U && !memcmp(buf, "\xef\xbb\xbf", 3)) { 8833965Sjdp return (true); 8933965Sjdp } 9033965Sjdp return (false); 9133965Sjdp} 9233965Sjdp