1/* $NetBSD: phonetic.c,v 1.1.1.3 2010/12/12 15:22:35 adam Exp $ */ 2 3/* phonetic.c - routines to do phonetic matching */ 4/* OpenLDAP: pkg/ldap/servers/slapd/phonetic.c,v 1.22.2.5 2010/04/13 20:23:17 kurt Exp */ 5/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 6 * 7 * Copyright 1998-2010 The OpenLDAP Foundation. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted only as authorized by the OpenLDAP 12 * Public License. 13 * 14 * A copy of this license is available in the file LICENSE in the 15 * top-level directory of the distribution or, alternatively, at 16 * <http://www.OpenLDAP.org/license.html>. 17 */ 18/* Portions Copyright (c) 1995 Regents of the University of Michigan. 19 * All rights reserved. 20 * 21 * Redistribution and use in source and binary forms are permitted 22 * provided that this notice is preserved and that due credit is given 23 * to the University of Michigan at Ann Arbor. The name of the University 24 * may not be used to endorse or promote products derived from this 25 * software without specific prior written permission. This software 26 * is provided ``as is'' without express or implied warranty. 27 */ 28 29#include "portable.h" 30 31#include <stdio.h> 32 33#include <ac/ctype.h> 34#include <ac/string.h> 35#include <ac/socket.h> 36#include <ac/time.h> 37 38#include "slap.h" 39 40#if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC) 41#define SLAPD_METAPHONE 42#endif 43 44#define iswordbreak(x) (!isascii(x) || isspace((unsigned char) (x)) || \ 45 ispunct((unsigned char) (x)) || \ 46 isdigit((unsigned char) (x)) || (x) == '\0') 47 48#if 0 49static char * 50first_word( char *s ) 51{ 52 if ( s == NULL ) { 53 return( NULL ); 54 } 55 56 while ( iswordbreak( *s ) ) { 57 if ( *s == '\0' ) { 58 return( NULL ); 59 } else { 60 s++; 61 } 62 } 63 64 return( s ); 65} 66 67static char * 68next_word( char *s ) 69{ 70 if ( s == NULL ) { 71 return( NULL ); 72 } 73 74 while ( ! iswordbreak( *s ) ) { 75 s++; 76 } 77 78 while ( iswordbreak( *s ) ) { 79 if ( *s == '\0' ) { 80 return( NULL ); 81 } else { 82 s++; 83 } 84 } 85 86 return( s ); 87} 88 89static char * 90word_dup( char *w ) 91{ 92 char *s, *ret; 93 char save; 94 95 for ( s = w; !iswordbreak( *s ); s++ ) 96 ; /* NULL */ 97 save = *s; 98 *s = '\0'; 99 ret = ch_strdup( w ); 100 *s = save; 101 102 return( ret ); 103} 104#endif /* 0 */ 105 106#ifndef MAXPHONEMELEN 107#define MAXPHONEMELEN 4 108#endif 109 110#if defined(SLAPD_PHONETIC) 111 112/* lifted from isode-8.0 */ 113char * 114phonetic( char *s ) 115{ 116 char code, adjacent, ch; 117 char *p; 118 int i; 119 char phoneme[MAXPHONEMELEN + 1]; 120 121 p = s; 122 if ( p == NULL || *p == '\0' ) { 123 return( NULL ); 124 } 125 126 adjacent = '0'; 127 phoneme[0] = TOUPPER((unsigned char)*p); 128 129 phoneme[1] = '\0'; 130 for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) { 131 ch = TOUPPER ((unsigned char)*p); 132 133 code = '0'; 134 135 switch (ch) { 136 case 'B': 137 case 'F': 138 case 'P': 139 case 'V': 140 code = (adjacent != '1') ? '1' : '0'; 141 break; 142 case 'S': 143 case 'C': 144 case 'G': 145 case 'J': 146 case 'K': 147 case 'Q': 148 case 'X': 149 case 'Z': 150 code = (adjacent != '2') ? '2' : '0'; 151 break; 152 case 'D': 153 case 'T': 154 code = (adjacent != '3') ? '3' : '0'; 155 break; 156 case 'L': 157 code = (adjacent != '4') ? '4' : '0'; 158 break; 159 case 'M': 160 case 'N': 161 code = (adjacent != '5') ? '5' : '0'; 162 break; 163 case 'R': 164 code = (adjacent != '6') ? '6' : '0'; 165 break; 166 default: 167 adjacent = '0'; 168 } 169 170 if ( i == 0 ) { 171 adjacent = code; 172 i++; 173 } else if ( code != '0' ) { 174 if ( i == MAXPHONEMELEN ) 175 break; 176 adjacent = phoneme[i] = code; 177 i++; 178 } 179 } 180 181 if ( i > 0 ) 182 phoneme[i] = '\0'; 183 184 return( ch_strdup( phoneme ) ); 185} 186 187#elif defined(SLAPD_METAPHONE) 188 189/* 190 * Metaphone was originally developed by Lawrence Philips and 191 * published in the "Computer Language" magazine in 1990. 192 */ 193/* 194 * Metaphone copied from C Gazette, June/July 1991, pp 56-57, 195 * author Gary A. Parker, with changes by Bernard Tiffany of the 196 * University of Michigan, and more changes by Tim Howes of the 197 * University of Michigan. 198 */ 199 200/* Character coding array */ 201static const char vsvfn[26] = { 202 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 203 /* A B C D E F G H I J K L M */ 204 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0}; 205 /* N O P Q R S T U V W X Y Z */ 206 207/* Macros to access character coding array */ 208#define vowel(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 1) /* AEIOU */ 209#define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2) /* FJLMNR */ 210#define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4) /* CGPST */ 211#define frontv(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 8) /* EIY */ 212#define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */ 213 214char * 215phonetic( char *Word ) 216{ 217 char *n, *n_start, *n_end; /* pointers to string */ 218 char *metaph_end; /* pointers to metaph */ 219 char ntrans[40]; /* word with uppercase letters */ 220 int KSflag; /* state flag for X -> KS */ 221 char buf[MAXPHONEMELEN + 2]; 222 char *Metaph; 223 224 /* 225 * Copy Word to internal buffer, dropping non-alphabetic characters 226 * and converting to upper case 227 */ 228 229 for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) && 230 n < n_end; Word++) { 231 if (isalpha((unsigned char)*Word)) 232 *n++ = TOUPPER((unsigned char)*Word); 233 } 234 Metaph = buf; 235 *Metaph = '\0'; 236 if (n == ntrans + 4) { 237 return( ch_strdup( buf ) ); /* Return if null */ 238 } 239 n_end = n; /* Set n_end to end of string */ 240 241 /* ntrans[0] will always be == 0 */ 242 ntrans[0] = '\0'; 243 ntrans[1] = '\0'; 244 ntrans[2] = '\0'; 245 ntrans[3] = '\0'; 246 *n++ = 0; 247 *n++ = 0; 248 *n++ = 0; 249 *n = 0; /* Pad with nulls */ 250 n = ntrans + 4; /* Assign pointer to start */ 251 252 /* Check for PN, KN, GN, AE, WR, WH, and X at start */ 253 switch (*n) { 254 case 'P': 255 case 'K': 256 case 'G': 257 /* 'PN', 'KN', 'GN' becomes 'N' */ 258 if (*(n + 1) == 'N') 259 *n++ = 0; 260 break; 261 case 'A': 262 /* 'AE' becomes 'E' */ 263 if (*(n + 1) == 'E') 264 *n++ = 0; 265 break; 266 case 'W': 267 /* 'WR' becomes 'R', and 'WH' to 'H' */ 268 if (*(n + 1) == 'R') 269 *n++ = 0; 270 else if (*(n + 1) == 'H') { 271 *(n + 1) = *n; 272 *n++ = 0; 273 } 274 break; 275 case 'X': 276 /* 'X' becomes 'S' */ 277 *n = 'S'; 278 break; 279 } 280 281 /* 282 * Now, loop step through string, stopping at end of string or when 283 * the computed 'metaph' is MAXPHONEMELEN characters long 284 */ 285 286 KSflag = 0; /* state flag for KS translation */ 287 for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n; 288 n <= n_end && Metaph < metaph_end; n++) { 289 if (KSflag) { 290 KSflag = 0; 291 *Metaph++ = 'S'; 292 } else { 293 /* Drop duplicates except for CC */ 294 if (*(n - 1) == *n && *n != 'C') 295 continue; 296 /* Check for F J L M N R or first letter vowel */ 297 if (same(*n) || (n == n_start && vowel(*n))) 298 *Metaph++ = *n; 299 else 300 switch (*n) { 301 case 'B': 302 303 /* 304 * B unless in -MB 305 */ 306 if (n == (n_end - 1) && *(n - 1) != 'M') 307 *Metaph++ = *n; 308 break; 309 case 'C': 310 311 /* 312 * X if in -CIA-, -CH- else S if in 313 * -CI-, -CE-, -CY- else dropped if 314 * in -SCI-, -SCE-, -SCY- else K 315 */ 316 if (*(n - 1) != 'S' || !frontv(*(n + 1))) { 317 if (*(n + 1) == 'I' && *(n + 2) == 'A') 318 *Metaph++ = 'X'; 319 else if (frontv(*(n + 1))) 320 *Metaph++ = 'S'; 321 else if (*(n + 1) == 'H') 322 *Metaph++ = ((n == n_start && !vowel(*(n + 2))) 323 || *(n - 1) == 'S') 324 ? (char) 'K' : (char) 'X'; 325 else 326 *Metaph++ = 'K'; 327 } 328 break; 329 case 'D': 330 331 /* 332 * J if in DGE or DGI or DGY else T 333 */ 334 *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2))) 335 ? (char) 'J' : (char) 'T'; 336 break; 337 case 'G': 338 339 /* 340 * F if in -GH and not B--GH, D--GH, 341 * -H--GH, -H---GH else dropped if 342 * -GNED, -GN, -DGE-, -DGI-, -DGY- 343 * else J if in -GE-, -GI-, -GY- and 344 * not GG else K 345 */ 346 if ((*(n + 1) != 'J' || vowel(*(n + 2))) && 347 (*(n + 1) != 'N' || ((n + 1) < n_end && 348 (*(n + 2) != 'E' || *(n + 3) != 'D'))) && 349 (*(n - 1) != 'D' || !frontv(*(n + 1)))) 350 *Metaph++ = (frontv(*(n + 1)) && 351 *(n + 2) != 'G') ? (char) 'G' : (char) 'K'; 352 else if (*(n + 1) == 'H' && !noghf(*(n - 3)) && 353 *(n - 4) != 'H') 354 *Metaph++ = 'F'; 355 break; 356 case 'H': 357 358 /* 359 * H if before a vowel and not after 360 * C, G, P, S, T else dropped 361 */ 362 if (!varson(*(n - 1)) && (!vowel(*(n - 1)) || 363 vowel(*(n + 1)))) 364 *Metaph++ = 'H'; 365 break; 366 case 'K': 367 368 /* 369 * dropped if after C else K 370 */ 371 if (*(n - 1) != 'C') 372 *Metaph++ = 'K'; 373 break; 374 case 'P': 375 376 /* 377 * F if before H, else P 378 */ 379 *Metaph++ = *(n + 1) == 'H' ? 380 (char) 'F' : (char) 'P'; 381 break; 382 case 'Q': 383 384 /* 385 * K 386 */ 387 *Metaph++ = 'K'; 388 break; 389 case 'S': 390 391 /* 392 * X in -SH-, -SIO- or -SIA- else S 393 */ 394 *Metaph++ = (*(n + 1) == 'H' || 395 (*(n + 1) == 'I' && (*(n + 2) == 'O' || 396 *(n + 2) == 'A'))) 397 ? (char) 'X' : (char) 'S'; 398 break; 399 case 'T': 400 401 /* 402 * X in -TIA- or -TIO- else 0 (zero) 403 * before H else dropped if in -TCH- 404 * else T 405 */ 406 if (*(n + 1) == 'I' && (*(n + 2) == 'O' || 407 *(n + 2) == 'A')) 408 *Metaph++ = 'X'; 409 else if (*(n + 1) == 'H') 410 *Metaph++ = '0'; 411 else if (*(n + 1) != 'C' || *(n + 2) != 'H') 412 *Metaph++ = 'T'; 413 break; 414 case 'V': 415 416 /* 417 * F 418 */ 419 *Metaph++ = 'F'; 420 break; 421 case 'W': 422 423 /* 424 * W after a vowel, else dropped 425 */ 426 case 'Y': 427 428 /* 429 * Y unless followed by a vowel 430 */ 431 if (vowel(*(n + 1))) 432 *Metaph++ = *n; 433 break; 434 case 'X': 435 436 /* 437 * KS 438 */ 439 if (n == n_start) 440 *Metaph++ = 'S'; 441 else { 442 *Metaph++ = 'K'; /* Insert K, then S */ 443 KSflag = 1; 444 } 445 break; 446 case 'Z': 447 448 /* 449 * S 450 */ 451 *Metaph++ = 'S'; 452 break; 453 } 454 } 455 } 456 457 *Metaph = 0; /* Null terminate */ 458 return( ch_strdup( buf ) ); 459} 460 461#endif /* SLAPD_METAPHONE */ 462