1/*	$NetBSD: phonetic.c,v 1.1.1.3 2010/12/12 15:22:35 adam Exp $	*/
2
3/* phonetic.c - routines to do phonetic matching */
4/* OpenLDAP: pkg/ldap/servers/slapd/phonetic.c,v 1.22.2.5 2010/04/13 20:23:17 kurt Exp */
5/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6 *
7 * Copyright 1998-2010 The OpenLDAP Foundation.
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted only as authorized by the OpenLDAP
12 * Public License.
13 *
14 * A copy of this license is available in the file LICENSE in the
15 * top-level directory of the distribution or, alternatively, at
16 * <http://www.OpenLDAP.org/license.html>.
17 */
18/* Portions Copyright (c) 1995 Regents of the University of Michigan.
19 * All rights reserved.
20 *
21 * Redistribution and use in source and binary forms are permitted
22 * provided that this notice is preserved and that due credit is given
23 * to the University of Michigan at Ann Arbor. The name of the University
24 * may not be used to endorse or promote products derived from this
25 * software without specific prior written permission. This software
26 * is provided ``as is'' without express or implied warranty.
27 */
28
29#include "portable.h"
30
31#include <stdio.h>
32
33#include <ac/ctype.h>
34#include <ac/string.h>
35#include <ac/socket.h>
36#include <ac/time.h>
37
38#include "slap.h"
39
40#if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC)
41#define SLAPD_METAPHONE
42#endif
43
44#define iswordbreak(x)  (!isascii(x) || isspace((unsigned char) (x)) || \
45			 ispunct((unsigned char) (x)) || \
46			 isdigit((unsigned char) (x)) || (x) == '\0')
47
48#if 0
49static char *
50first_word( char *s )
51{
52	if ( s == NULL ) {
53		return( NULL );
54	}
55
56	while ( iswordbreak( *s ) ) {
57		if ( *s == '\0' ) {
58			return( NULL );
59		} else {
60			s++;
61		}
62	}
63
64	return( s );
65}
66
67static char *
68next_word( char *s )
69{
70	if ( s == NULL ) {
71		return( NULL );
72	}
73
74	while ( ! iswordbreak( *s ) ) {
75		s++;
76	}
77
78	while ( iswordbreak( *s ) ) {
79		if ( *s == '\0' ) {
80			return( NULL );
81		} else {
82			s++;
83		}
84	}
85
86	return( s );
87}
88
89static char *
90word_dup( char *w )
91{
92	char	*s, *ret;
93	char	save;
94
95	for ( s = w; !iswordbreak( *s ); s++ )
96		;	/* NULL */
97	save = *s;
98	*s = '\0';
99	ret = ch_strdup( w );
100	*s = save;
101
102	return( ret );
103}
104#endif /* 0 */
105
106#ifndef MAXPHONEMELEN
107#define MAXPHONEMELEN	4
108#endif
109
110#if defined(SLAPD_PHONETIC)
111
112/* lifted from isode-8.0 */
113char *
114phonetic( char *s )
115{
116        char	code, adjacent, ch;
117	char	*p;
118        int	i;
119	char	phoneme[MAXPHONEMELEN + 1];
120
121        p = s;
122        if (  p == NULL || *p == '\0' ) {
123                return( NULL );
124        }
125
126        adjacent = '0';
127	phoneme[0] = TOUPPER((unsigned char)*p);
128
129	phoneme[1]  = '\0';
130        for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) {
131		ch = TOUPPER ((unsigned char)*p);
132
133                code = '0';
134
135                switch (ch) {
136                case 'B':
137                case 'F':
138		case 'P':
139                case 'V':
140                        code = (adjacent != '1') ? '1' : '0';
141                        break;
142                case 'S':
143                case 'C':
144                case 'G':
145                case 'J':
146                case 'K':
147                case 'Q':
148                case 'X':
149                case 'Z':
150                        code = (adjacent != '2') ? '2' : '0';
151                        break;
152                case 'D':
153                case 'T':
154                        code = (adjacent != '3') ? '3' : '0';
155                        break;
156                case 'L':
157                        code = (adjacent != '4') ? '4' : '0';
158                        break;
159                case 'M':
160                case 'N':
161                        code = (adjacent != '5') ? '5' : '0';
162                        break;
163                case 'R':
164                        code = (adjacent != '6') ? '6' : '0';
165                        break;
166                default:
167                        adjacent = '0';
168                }
169
170                if ( i == 0 ) {
171			adjacent = code;
172			i++;
173		} else if ( code != '0' ) {
174			if ( i == MAXPHONEMELEN )
175				break;
176                        adjacent = phoneme[i] = code;
177                        i++;
178                }
179        }
180
181	if ( i > 0 )
182		phoneme[i] = '\0';
183
184        return( ch_strdup( phoneme ) );
185}
186
187#elif defined(SLAPD_METAPHONE)
188
189/*
190 * Metaphone was originally developed by Lawrence Philips and
191 * published in the "Computer Language" magazine in 1990.
192 */
193/*
194 * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
195 * author Gary A. Parker, with changes by Bernard Tiffany of the
196 * University of Michigan, and more changes by Tim Howes of the
197 * University of Michigan.
198 */
199
200/* Character coding array */
201static const char  vsvfn[26] = {
202	   1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
203	/* A   B  C   D  E  F  G   H  I  J  K  L  M  */
204	   2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
205	/* N  O  P  Q  R  S  T  U  V  W  X  Y  Z  */
206
207/* Macros to access character coding array */
208#define vowel(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 1)	/* AEIOU */
209#define same(x)         ((x) != '\0' && vsvfn[(x) - 'A'] & 2)	/* FJLMNR */
210#define varson(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 4)	/* CGPST */
211#define frontv(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 8)	/* EIY */
212#define noghf(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 16)	/* BDH */
213
214char *
215phonetic( char *Word )
216{
217	char           *n, *n_start, *n_end;	/* pointers to string */
218	char           *metaph_end;	/* pointers to metaph */
219	char            ntrans[40];	/* word with uppercase letters */
220	int             KSflag;	/* state flag for X -> KS */
221	char		buf[MAXPHONEMELEN + 2];
222	char		*Metaph;
223
224	/*
225	 * Copy Word to internal buffer, dropping non-alphabetic characters
226	 * and converting to upper case
227	 */
228
229	for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) &&
230	    n < n_end; Word++) {
231		if (isalpha((unsigned char)*Word))
232			*n++ = TOUPPER((unsigned char)*Word);
233	}
234	Metaph = buf;
235	*Metaph = '\0';
236	if (n == ntrans + 4) {
237		return( ch_strdup( buf ) );		/* Return if null */
238	}
239	n_end = n;		/* Set n_end to end of string */
240
241	/* ntrans[0] will always be == 0 */
242	ntrans[0] = '\0';
243	ntrans[1] = '\0';
244	ntrans[2] = '\0';
245	ntrans[3] = '\0';
246	*n++ = 0;
247	*n++ = 0;
248	*n++ = 0;
249	*n = 0;			/* Pad with nulls */
250	n = ntrans + 4;		/* Assign pointer to start */
251
252	/* Check for PN, KN, GN, AE, WR, WH, and X at start */
253	switch (*n) {
254	case 'P':
255	case 'K':
256	case 'G':
257		/* 'PN', 'KN', 'GN' becomes 'N' */
258		if (*(n + 1) == 'N')
259			*n++ = 0;
260		break;
261	case 'A':
262		/* 'AE' becomes 'E' */
263		if (*(n + 1) == 'E')
264			*n++ = 0;
265		break;
266	case 'W':
267		/* 'WR' becomes 'R', and 'WH' to 'H' */
268		if (*(n + 1) == 'R')
269			*n++ = 0;
270		else if (*(n + 1) == 'H') {
271			*(n + 1) = *n;
272			*n++ = 0;
273		}
274		break;
275	case 'X':
276		/* 'X' becomes 'S' */
277		*n = 'S';
278		break;
279	}
280
281	/*
282	 * Now, loop step through string, stopping at end of string or when
283	 * the computed 'metaph' is MAXPHONEMELEN characters long
284	 */
285
286	KSflag = 0;		/* state flag for KS translation */
287	for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
288	     n <= n_end && Metaph < metaph_end; n++) {
289		if (KSflag) {
290			KSflag = 0;
291			*Metaph++ = 'S';
292		} else {
293			/* Drop duplicates except for CC */
294			if (*(n - 1) == *n && *n != 'C')
295				continue;
296			/* Check for F J L M N R or first letter vowel */
297			if (same(*n) || (n == n_start && vowel(*n)))
298				*Metaph++ = *n;
299			else
300				switch (*n) {
301				case 'B':
302
303					/*
304					 * B unless in -MB
305					 */
306					if (n == (n_end - 1) && *(n - 1) != 'M')
307						*Metaph++ = *n;
308					break;
309				case 'C':
310
311					/*
312					 * X if in -CIA-, -CH- else S if in
313					 * -CI-, -CE-, -CY- else dropped if
314					 * in -SCI-, -SCE-, -SCY- else K
315					 */
316					if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
317						if (*(n + 1) == 'I' && *(n + 2) == 'A')
318							*Metaph++ = 'X';
319						else if (frontv(*(n + 1)))
320							*Metaph++ = 'S';
321						else if (*(n + 1) == 'H')
322							*Metaph++ = ((n == n_start && !vowel(*(n + 2)))
323							 || *(n - 1) == 'S')
324							    ? (char) 'K' : (char) 'X';
325						else
326							*Metaph++ = 'K';
327					}
328					break;
329				case 'D':
330
331					/*
332					 * J if in DGE or DGI or DGY else T
333					 */
334					*Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
335					    ? (char) 'J' : (char) 'T';
336					break;
337				case 'G':
338
339					/*
340					 * F if in -GH and not B--GH, D--GH,
341					 * -H--GH, -H---GH else dropped if
342					 * -GNED, -GN, -DGE-, -DGI-, -DGY-
343					 * else J if in -GE-, -GI-, -GY- and
344					 * not GG else K
345					 */
346					if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
347					    (*(n + 1) != 'N' || ((n + 1) < n_end &&
348								 (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
349					    (*(n - 1) != 'D' || !frontv(*(n + 1))))
350						*Metaph++ = (frontv(*(n + 1)) &&
351							     *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
352					else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
353						 *(n - 4) != 'H')
354						*Metaph++ = 'F';
355					break;
356				case 'H':
357
358					/*
359					 * H if before a vowel and not after
360					 * C, G, P, S, T else dropped
361					 */
362					if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
363							   vowel(*(n + 1))))
364						*Metaph++ = 'H';
365					break;
366				case 'K':
367
368					/*
369					 * dropped if after C else K
370					 */
371					if (*(n - 1) != 'C')
372						*Metaph++ = 'K';
373					break;
374				case 'P':
375
376					/*
377					 * F if before H, else P
378					 */
379					*Metaph++ = *(n + 1) == 'H' ?
380					    (char) 'F' : (char) 'P';
381					break;
382				case 'Q':
383
384					/*
385					 * K
386					 */
387					*Metaph++ = 'K';
388					break;
389				case 'S':
390
391					/*
392					 * X in -SH-, -SIO- or -SIA- else S
393					 */
394					*Metaph++ = (*(n + 1) == 'H' ||
395						     (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
396							  *(n + 2) == 'A')))
397					    ? (char) 'X' : (char) 'S';
398					break;
399				case 'T':
400
401					/*
402					 * X in -TIA- or -TIO- else 0 (zero)
403					 * before H else dropped if in -TCH-
404					 * else T
405					 */
406					if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
407							   *(n + 2) == 'A'))
408						*Metaph++ = 'X';
409					else if (*(n + 1) == 'H')
410						*Metaph++ = '0';
411					else if (*(n + 1) != 'C' || *(n + 2) != 'H')
412						*Metaph++ = 'T';
413					break;
414				case 'V':
415
416					/*
417					 * F
418					 */
419					*Metaph++ = 'F';
420					break;
421				case 'W':
422
423					/*
424					 * W after a vowel, else dropped
425					 */
426				case 'Y':
427
428					/*
429					 * Y unless followed by a vowel
430					 */
431					if (vowel(*(n + 1)))
432						*Metaph++ = *n;
433					break;
434				case 'X':
435
436					/*
437					 * KS
438					 */
439					if (n == n_start)
440						*Metaph++ = 'S';
441					else {
442						*Metaph++ = 'K';	/* Insert K, then S */
443						KSflag = 1;
444					}
445					break;
446				case 'Z':
447
448					/*
449					 * S
450					 */
451					*Metaph++ = 'S';
452					break;
453				}
454		}
455	}
456
457	*Metaph = 0;		/* Null terminate */
458	return( ch_strdup( buf ) );
459}
460
461#endif /* SLAPD_METAPHONE */
462