1/*++
2/* NAME
3/*	valid_utf_8 3
4/* SUMMARY
5/*	predicate if string is valid UTF-8
6/* SYNOPSIS
7/*	#include <stringops.h>
8/*
9/*	int	valid_utf_8(str, len)
10/*	const char *str;
11/*	ssize_t	len;
12/* DESCRIPTION
13/*	valid_utf_8() determines if a string satisfies the UTF-8
14/*	definition in RFC 3629. That is, it contains proper encodings
15/*	of code points U+0000..U+10FFFF, excluding over-long encodings
16/*	and excluding U+D800..U+DFFF surrogates.
17/*
18/*	A zero-length string is considered valid.
19/* DIAGNOSTICS
20/*	The result value is zero when the caller specifies a negative
21/*	length, or a string that violates RFC 3629, for example a
22/*	string that is truncated in the middle of a multi-byte
23/*	sequence.
24/* BUGS
/*	RFC 3629 is not the whole story. Code points in the range
/*	U+FDD0..U+FDEF, and code points whose low 16 bits are FFFE
/*	or FFFF, are non-characters in Unicode. This function does
/*	not block these.
28/* SEE ALSO
29/*	RFC 3629
30/* LICENSE
31/* .ad
32/* .fi
33/*	The Secure Mailer license must be distributed with this software.
34/* AUTHOR(S)
35/*	Wietse Venema
36/*	IBM T.J. Watson Research
37/*	P.O. Box 704
38/*	Yorktown Heights, NY 10598, USA
39/*--*/
40
41/* System library. */
42
43#include <sys_defs.h>
44
45/* Utility library. */
46
47#include <stringops.h>
48
/*
 * valid_utf_8 - validate string according to RFC 3629
 *
 * str: start of the byte sequence to examine; it does not need to be
 * null-terminated, and null bytes inside the sequence are treated as the
 * single-byte encoding of U+0000.
 *
 * len: number of bytes to examine.
 *
 * Returns 1 when len is zero, or when all len bytes form well-formed UTF-8
 * sequences. Returns 0 when len is negative, or when the input contains an
 * invalid lead byte, an invalid tail byte, an over-long encoding, an encoded
 * surrogate (U+D800..U+DFFF), a value above U+10FFFF, or a multi-byte
 * sequence that is cut short by the end of the input.
 */

int     valid_utf_8(const char *str, ssize_t len)
{
    const unsigned char *end;
    const unsigned char *cp;
    unsigned char c0, ch;

    /*
     * Validate the length before using it in pointer arithmetic. Computing
     * str + len with a negative len forms a pointer in front of the caller's
     * buffer, which is undefined behavior even when the pointer is never
     * dereferenced.
     */
    if (len < 0)
	return (0);
    if (len == 0)
	return (1);
    end = (const unsigned char *) str + len;

    /*
     * Optimized for correct input, time, space, and for CPUs that have a
     * decent number of registers. EXPECTED() and UNEXPECTED() are not
     * defined in this file; presumably they are branch-prediction hints
     * that do not change the value of their argument.
     * 
     * Truncation is detected by comparing the number of remaining bytes (end -
     * cp, always >= 1 inside the loop) with the sequence length, instead of
     * computing cp + N, which could point more than one byte past the end of
     * the buffer.
     */
    for (cp = (const unsigned char *) str; cp < end; cp++) {
	/* Single-byte encodings. */
	if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
	     /* void */ ;
	}
	/* Two-byte encodings. */
	else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
	    /* Exclude stray tail bytes (0x80..0xbf) and over-long encodings. */
	    if (UNEXPECTED(c0 < 0xc2)
	    /* Exclude truncated sequences. */
		|| UNEXPECTED(end - cp < 2)
	    /* Require UTF-8 tail byte. */
		|| UNEXPECTED((ch = *++cp) < 0x80) || UNEXPECTED(ch > 0xbf))
		return (0);
	}
	/* Three-byte encodings. */
	else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
	    /* Exclude truncated sequences. */
	    if (UNEXPECTED(end - cp < 3)
	    /* Exclude over-long encodings. */
		|| UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
	    /* Exclude U+D800..U+DFFF. */
		|| UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
	    /* Require UTF-8 tail byte. */
		|| UNEXPECTED((ch = *++cp) < 0x80) || UNEXPECTED(ch > 0xbf))
		return (0);
	}
	/* Four-byte encodings. */
	else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
	    /* Exclude truncated sequences. */
	    if (UNEXPECTED(end - cp < 4)
	    /* Exclude over-long encodings. */
		|| UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
	    /* Exclude code points above U+10FFFF. */
		|| UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
	    /* Require UTF-8 tail byte. */
		|| UNEXPECTED((ch = *++cp) < 0x80) || UNEXPECTED(ch > 0xbf)
	    /* Require UTF-8 tail byte. */
		|| UNEXPECTED((ch = *++cp) < 0x80) || UNEXPECTED(ch > 0xbf))
		return (0);
	}
	/* Invalid: c0 >= 0xf5 */
	else {
	    return (0);
	}
	/* cp now points at the last byte of the sequence just accepted. */
    }
    return (1);
}
111
112 /*
113  * Stand-alone test program. Each string is a line without line terminator.
114  */
115#ifdef TEST
116#include <stdlib.h>
117#include <vstream.h>
118#include <vstring.h>
119#include <vstring_vstream.h>
120
121#define STR(x) vstring_str(x)
122#define LEN(x) VSTRING_LEN(x)
123
124int     main(void)
125{
126    VSTRING *buf = vstring_alloc(1);
127
128    while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) {
129	vstream_printf("%c", (LEN(buf) && !valid_utf_8(STR(buf), LEN(buf))) ?
130		       '!' : ' ');
131	vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf));
132	vstream_printf("\n");
133    }
134    vstream_fflush(VSTREAM_OUT);
135    vstring_free(buf);
136    exit(0);
137}
138
139#endif
140