1/*++ 2/* NAME 3/* valid_utf_8 3 4/* SUMMARY 5/* predicate if string is valid UTF-8 6/* SYNOPSIS 7/* #include <stringops.h> 8/* 9/* int valid_utf_8(str, len) 10/* const char *str; 11/* ssize_t len; 12/* DESCRIPTION 13/* valid_utf_8() determines if a string satisfies the UTF-8 14/* definition in RFC 3629. That is, it contains proper encodings 15/* of code points U+0000..U+10FFFF, excluding over-long encodings 16/* and excluding U+D800..U+DFFF surrogates. 17/* 18/* A zero-length string is considered valid. 19/* DIAGNOSTICS 20/* The result value is zero when the caller specifies a negative 21/* length, or a string that violates RFC 3629, for example a 22/* string that is truncated in the middle of a multi-byte 23/* sequence. 24/* BUGS 25/* But wait, there is more. Code points in the range U+FDD0..U+FDEF 26/* and ending in FFFE or FFFF are non-characters in UNICODE. This 27/* function does not block these. 28/* SEE ALSO 29/* RFC 3629 30/* LICENSE 31/* .ad 32/* .fi 33/* The Secure Mailer license must be distributed with this software. 34/* AUTHOR(S) 35/* Wietse Venema 36/* IBM T.J. Watson Research 37/* P.O. Box 704 38/* Yorktown Heights, NY 10598, USA 39/*--*/ 40 41/* System library. */ 42 43#include <sys_defs.h> 44 45/* Utility library. */ 46 47#include <stringops.h> 48 49/* valid_utf_8 - validate string according to RFC 3629 */ 50 51int valid_utf_8(const char *str, ssize_t len) 52{ 53 const unsigned char *end = (const unsigned char *) str + len; 54 const unsigned char *cp; 55 unsigned char c0, ch; 56 57 if (len < 0) 58 return (0); 59 if (len <= 0) 60 return (1); 61 62 /* 63 * Optimized for correct input, time, space, and for CPUs that have a 64 * decent number of registers. 65 */ 66 for (cp = (const unsigned char *) str; cp < end; cp++) { 67 /* Single-byte encodings. */ 68 if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) { 69 /* void */ ; 70 } 71 /* Two-byte encodings. */ 72 else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) { 73 /* Exclude over-long encodings. */ 74 if (UNEXPECTED(c0 < 0xc2) 75 || UNEXPECTED(cp + 1 >= end) 76 /* Require UTF-8 tail byte. */ 77 || UNEXPECTED((ch = *++cp) < 0x80) || UNEXPECTED(ch > 0xbf)) 78 return (0); 79 } 80 /* Three-byte encodings. */ 81 else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) { 82 if (UNEXPECTED(cp + 2 >= end) 83 /* Exclude over-long encodings. */ 84 || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80)) 85 /* Exclude U+D800..U+DFFF. */ 86 || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf)) 87 /* Require UTF-8 tail byte. */ 88 || UNEXPECTED((ch = *++cp) < 0x80) || UNEXPECTED(ch > 0xbf)) 89 return (0); 90 } 91 /* Four-byte encodings. */ 92 else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) { 93 if (UNEXPECTED(cp + 3 >= end) 94 /* Exclude over-long encodings. */ 95 || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80)) 96 /* Exclude code points above U+10FFFF. */ 97 || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf)) 98 /* Require UTF-8 tail byte. */ 99 || UNEXPECTED((ch = *++cp) < 0x80) || UNEXPECTED(ch > 0xbf) 100 /* Require UTF-8 tail byte. */ 101 || UNEXPECTED((ch = *++cp) < 0x80) || UNEXPECTED(ch > 0xbf)) 102 return (0); 103 } 104 /* Invalid: c0 >= 0xf5 */ 105 else { 106 return (0); 107 } 108 } 109 return (1); 110} 111 112 /* 113 * Stand-alone test program. Each string is a line without line terminator. 114 */ 115#ifdef TEST 116#include <stdlib.h> 117#include <vstream.h> 118#include <vstring.h> 119#include <vstring_vstream.h> 120 121#define STR(x) vstring_str(x) 122#define LEN(x) VSTRING_LEN(x) 123 124int main(void) 125{ 126 VSTRING *buf = vstring_alloc(1); 127 128 while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) { 129 vstream_printf("%c", (LEN(buf) && !valid_utf_8(STR(buf), LEN(buf))) ? 130 '!' : ' '); 131 vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf)); 132 vstream_printf("\n"); 133 } 134 vstream_fflush(VSTREAM_OUT); 135 vstring_free(buf); 136 exit(0); 137} 138 139#endif 140