utf8.c revision 1.2
1/* $NetBSD: utf8.c,v 1.2 2021/02/19 16:42:19 christos Exp $ */ 2 3/* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * This Source Code Form is subject to the terms of the Mozilla Public 7 * License, v. 2.0. If a copy of the MPL was not distributed with this 8 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 9 * 10 * See the COPYRIGHT file distributed with this work for additional 11 * information regarding copyright ownership. 12 */ 13 14#include <string.h> 15 16#include <isc/utf8.h> 17#include <isc/util.h> 18 19/* 20 * UTF-8 is defined in "The Unicode Standard -- Version 4.0" 21 * Also see RFC 3629. 22 * 23 * Char. number range | UTF-8 octet sequence 24 * (hexadecimal) | (binary) 25 * --------------------+--------------------------------------------- 26 * 0000 0000-0000 007F | 0xxxxxxx 27 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 28 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 29 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 30 */ 31bool 32isc_utf8_valid(const unsigned char *buf, size_t len) { 33 REQUIRE(buf != NULL); 34 35 for (size_t i = 0; i < len; i++) { 36 if (buf[i] <= 0x7f) { 37 continue; 38 } 39 if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 && 40 (buf[i + 1] & 0xc0) == 0x80) { 41 unsigned int w; 42 w = (buf[i] & 0x1f) << 6; 43 w |= (buf[++i] & 0x3f); 44 if (w < 0x80) { 45 return (false); 46 } 47 continue; 48 } 49 if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 && 50 (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80) 51 { 52 unsigned int w; 53 w = (buf[i] & 0x0f) << 12; 54 w |= (buf[++i] & 0x3f) << 6; 55 w |= (buf[++i] & 0x3f); 56 if (w < 0x0800) { 57 return (false); 58 } 59 continue; 60 } 61 if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 && 62 (buf[i + 1] & 0xc0) == 0x80 && 63 (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80) 64 { 65 unsigned int w; 66 w = (buf[i] & 0x07) << 18; 67 w |= (buf[++i] & 0x3f) << 12; 68 w |= (buf[++i] & 0x3f) << 6; 69 w |= (buf[++i] & 0x3f); 70 if (w < 0x10000 || w > 0x10FFFF) { 71 return (false); 72 } 73 continue; 74 } 75 return (false); 76 } 77 return (true); 78} 79 80bool 81isc_utf8_bom(const unsigned char *buf, size_t len) { 82 REQUIRE(buf != NULL); 83 84 if (len >= 3U && !memcmp(buf, "\xef\xbb\xbf", 3)) { 85 return (true); 86 } 87 return (false); 88} 89