utf8.c revision 1.4
1/* $NetBSD: utf8.c,v 1.4 2023/01/25 21:43:31 christos Exp $ */ 2 3/* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16#include <string.h> 17 18#include <isc/utf8.h> 19#include <isc/util.h> 20 21/* 22 * UTF-8 is defined in "The Unicode Standard -- Version 4.0" 23 * Also see RFC 3629. 24 * 25 * Char. number range | UTF-8 octet sequence 26 * (hexadecimal) | (binary) 27 * --------------------+--------------------------------------------- 28 * 0000 0000-0000 007F | 0xxxxxxx 29 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 30 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 31 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 32 */ 33bool 34isc_utf8_valid(const unsigned char *buf, size_t len) { 35 REQUIRE(buf != NULL); 36 37 for (size_t i = 0; i < len; i++) { 38 if (buf[i] <= 0x7f) { 39 continue; 40 } 41 if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 && 42 (buf[i + 1] & 0xc0) == 0x80) 43 { 44 unsigned int w; 45 w = (buf[i] & 0x1f) << 6; 46 w |= (buf[++i] & 0x3f); 47 if (w < 0x80) { 48 return (false); 49 } 50 continue; 51 } 52 if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 && 53 (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80) 54 { 55 unsigned int w; 56 w = (buf[i] & 0x0f) << 12; 57 w |= (buf[++i] & 0x3f) << 6; 58 w |= (buf[++i] & 0x3f); 59 if (w < 0x0800) { 60 return (false); 61 } 62 continue; 63 } 64 if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 && 65 (buf[i + 1] & 0xc0) == 0x80 && 66 (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80) 67 { 68 unsigned int w; 69 w = (buf[i] & 0x07) << 18; 70 w |= (buf[++i] & 0x3f) << 12; 71 w |= (buf[++i] & 0x3f) << 6; 72 w |= (buf[++i] & 0x3f); 73 if (w < 0x10000 || w > 0x10FFFF) { 74 return (false); 75 } 76 continue; 77 } 78 return (false); 79 } 80 return (true); 81} 82 83bool 84isc_utf8_bom(const unsigned char *buf, size_t len) { 85 REQUIRE(buf != NULL); 86 87 if (len >= 3U && !memcmp(buf, "\xef\xbb\xbf", 3)) { 88 return (true); 89 } 90 return (false); 91} 92