1/* $NetBSD: strcasecmp_utf8.c,v 1.2 2017/02/14 01:16:49 christos Exp $ */ 2 3/*++ 4/* NAME 5/* strcasecmp_utf8 3 6/* SUMMARY 7/* caseless string comparison 8/* SYNOPSIS 9/* #include <stringops.h> 10/* 11/* int strcasecmp_utf8( 12/* const char *s1, 13/* const char *s2) 14/* 15/* int strncasecmp_utf8( 16/* const char *s1, 17/* const char *s2, 18/* ssize_t len) 19/* AUXILIARY FUNCTIONS 20/* int strcasecmp_utf8x( 21/* int flags, 22/* const char *s1, 23/* const char *s2) 24/* 25/* int strncasecmp_utf8x( 26/* int flags, 27/* const char *s1, 28/* const char *s2, 29/* ssize_t len) 30/* DESCRIPTION 31/* strcasecmp_utf8() implements caseless string comparison for 32/* UTF-8 text, with an API similar to strcasecmp(). Only ASCII 33/* characters are casefolded when the code is compiled without 34/* EAI support or when util_utf8_enable is zero. 35/* 36/* strncasecmp_utf8() implements caseless string comparison 37/* for UTF-8 text, with an API similar to strncasecmp(). Only 38/* ASCII characters are casefolded when the code is compiled 39/* without EAI support or when util_utf8_enable is zero. 40/* 41/* strcasecmp_utf8x() and strncasecmp_utf8x() implement a more 42/* complex API that provides the above functionality and more. 43/* 44/* Arguments: 45/* .IP "s1, s2" 46/* Null-terminated strings to be compared. 47/* .IP len 48/* String length before casefolding. 49/* .IP flags 50/* Zero or CASEF_FLAG_UTF8. The latter flag enables UTF-8 case 51/* folding instead of folding only ASCII characters. This flag 52/* is ignored when compiled without EAI support. 53/* SEE ALSO 54/* casefold(), casefold text for caseless comparison. 55/* LICENSE 56/* .ad 57/* .fi 58/* The Secure Mailer license must be distributed with this software. 59/* AUTHOR(S) 60/* Wietse Venema 61/* IBM T.J. Watson Research 62/* P.O. Box 704 63/* Yorktown Heights, NY 10598, USA 64/* 65/* Wietse Venema 66/* Google, Inc. 67/* 111 8th Avenue 68/* New York, NY 10011, USA 69/*--*/ 70 71 /* 72 * System library. 73 */ 74#include <sys_defs.h> 75#include <string.h> 76 77#ifdef STRCASECMP_IN_STRINGS_H 78#include <strings.h> 79#endif 80 81 /* 82 * Utility library. 83 */ 84#include <stringops.h> 85 86#define STR(x) vstring_str(x) 87 88static VSTRING *f1; /* casefold result for s1 */ 89static VSTRING *f2; /* casefold result for s2 */ 90 91/* strcasecmp_utf8_init - initialize */ 92 93static void strcasecmp_utf8_init(void) 94{ 95 f1 = vstring_alloc(100); 96 f2 = vstring_alloc(100); 97} 98 99/* strcasecmp_utf8x - caseless string comparison */ 100 101int strcasecmp_utf8x(int flags, const char *s1, const char *s2) 102{ 103 104 /* 105 * Short-circuit optimization for ASCII-only text. This may be slower 106 * than using a cache for all results. We must not expose strcasecmp(3) 107 * to non-ASCII text. 108 */ 109 if (allascii(s1) && allascii(s2)) 110 return (strcasecmp(s1, s2)); 111 112 if (f1 == 0) 113 strcasecmp_utf8_init(); 114 115 /* 116 * Cross our fingers and hope that strcmp() remains agnostic of 117 * charactersets and locales. 118 */ 119 flags &= CASEF_FLAG_UTF8; 120 casefoldx(flags, f1, s1, -1); 121 casefoldx(flags, f2, s2, -1); 122 return (strcmp(STR(f1), STR(f2))); 123} 124 125/* strncasecmp_utf8x - caseless string comparison */ 126 127int strncasecmp_utf8x(int flags, const char *s1, const char *s2, 128 ssize_t len) 129{ 130 131 /* 132 * Consider using a cache for all results. 133 */ 134 if (f1 == 0) 135 strcasecmp_utf8_init(); 136 137 /* 138 * Short-circuit optimization for ASCII-only text. This may be slower 139 * than using a cache for all results. See comments above for limitations 140 * of strcasecmp(). 141 */ 142 if (allascii_len(s1, len) && allascii_len(s2, len)) 143 return (strncasecmp(s1, s2, len)); 144 145 /* 146 * Caution: casefolding may change the number of bytes. See comments 147 * above for concerns about strcmp(). 148 */ 149 flags &= CASEF_FLAG_UTF8; 150 casefoldx(flags, f1, s1, len); 151 casefoldx(flags, f2, s2, len); 152 return (strcmp(STR(f1), STR(f2))); 153} 154 155#ifdef TEST 156#include <stdio.h> 157#include <stdlib.h> 158#include <vstream.h> 159#include <vstring_vstream.h> 160#include <msg_vstream.h> 161#include <argv.h> 162 163int main(int argc, char **argv) 164{ 165 VSTRING *buffer = vstring_alloc(1); 166 ARGV *cmd; 167 char **args; 168 int len; 169 int flags; 170 int res; 171 172 msg_vstream_init(argv[0], VSTREAM_ERR); 173 flags = CASEF_FLAG_UTF8; 174 util_utf8_enable = 1; 175 while (vstring_fgets_nonl(buffer, VSTREAM_IN)) { 176 vstream_printf("> %s\n", STR(buffer)); 177 cmd = argv_split(STR(buffer), CHARS_SPACE); 178 if (cmd->argc == 0 || cmd->argv[0][0] == '#') 179 continue; 180 args = cmd->argv; 181 182 /* 183 * Compare two strings. 184 */ 185 if (strcmp(args[0], "compare") == 0 && cmd->argc == 3) { 186 res = strcasecmp_utf8x(flags, args[1], args[2]); 187 vstream_printf("\"%s\" %s \"%s\"\n", 188 args[1], 189 res < 0 ? "<" : res == 0 ? "==" : ">", 190 args[2]); 191 } 192 193 /* 194 * Compare two substrings. 195 */ 196 else if (strcmp(args[0], "compare-len") == 0 && cmd->argc == 4 197 && sscanf(args[3], "%d", &len) == 1 && len >= 0) { 198 res = strncasecmp_utf8x(flags, args[1], args[2], len); 199 vstream_printf("\"%.*s\" %s \"%.*s\"\n", 200 len, args[1], 201 res < 0 ? "<" : res == 0 ? "==" : ">", 202 len, args[2]); 203 } 204 205 /* 206 * Usage. 207 */ 208 else { 209 vstream_printf("Usage: %s compare <s1> <s2> | compare-len <s1> <s2> <len>\n", 210 argv[0]); 211 } 212 vstream_fflush(VSTREAM_OUT); 213 argv_free(cmd); 214 } 215 exit(0); 216} 217 218#endif /* TEST */ 219