utf8.c revision 1.3
1/* $NetBSD: utf8.c,v 1.3 2022/09/23 12:15:33 christos Exp $ */ 2 3/* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16#include <string.h> 17 18#include <isc/utf8.h> 19#include <isc/util.h> 20 21/* 22 * UTF-8 is defined in "The Unicode Standard -- Version 4.0" 23 * Also see RFC 3629. 24 * 25 * Char. number range | UTF-8 octet sequence 26 * (hexadecimal) | (binary) 27 * --------------------+--------------------------------------------- 28 * 0000 0000-0000 007F | 0xxxxxxx 29 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 30 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 31 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 32 */ 33bool 34isc_utf8_valid(const unsigned char *buf, size_t len) { 35 REQUIRE(buf != NULL); 36 37 for (size_t i = 0; i < len; i++) { 38 if (buf[i] <= 0x7f) { 39 continue; 40 } 41 if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 && 42 (buf[i + 1] & 0xc0) == 0x80) { 43 unsigned int w; 44 w = (buf[i] & 0x1f) << 6; 45 w |= (buf[++i] & 0x3f); 46 if (w < 0x80) { 47 return (false); 48 } 49 continue; 50 } 51 if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 && 52 (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80) 53 { 54 unsigned int w; 55 w = (buf[i] & 0x0f) << 12; 56 w |= (buf[++i] & 0x3f) << 6; 57 w |= (buf[++i] & 0x3f); 58 if (w < 0x0800) { 59 return (false); 60 } 61 continue; 62 } 63 if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 && 64 (buf[i + 1] & 0xc0) == 0x80 && 65 (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80) 66 { 67 unsigned int w; 68 w = (buf[i] & 0x07) << 18; 69 w |= (buf[++i] & 0x3f) << 12; 70 w |= (buf[++i] & 0x3f) << 6; 71 w |= (buf[++i] & 0x3f); 72 if (w < 0x10000 || w > 0x10FFFF) { 73 return (false); 74 } 75 continue; 76 } 77 return (false); 78 } 79 return (true); 80} 81 82bool 83isc_utf8_bom(const unsigned char *buf, size_t len) { 84 REQUIRE(buf != NULL); 85 86 if (len >= 3U && !memcmp(buf, "\xef\xbb\xbf", 3)) { 87 return (true); 88 } 89 return (false); 90} 91