1/*
2Open Tracker License
3
4Terms and Conditions
5
6Copyright (c) 1991-2001, Be Incorporated. All rights reserved.
7
8Permission is hereby granted, free of charge, to any person obtaining a copy of
9this software and associated documentation files (the "Software"), to deal in
10the Software without restriction, including without limitation the rights to
11use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12of the Software, and to permit persons to whom the Software is furnished to do
13so, subject to the following conditions:
14
15The above copyright notice and this permission notice applies to all licensees
16and shall be included in all copies or substantial portions of the Software.
17
18THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
25Except as contained in this notice, the name of Be Incorporated shall not be
26used in advertising or otherwise to promote the sale, use or other dealings in
27this Software without prior written authorization from Be Incorporated.
28
29BeMail(TM), Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30of Be Incorporated in the United States and other countries. Other brand product
31names are registered trademarks or trademarks of their respective holders.
32All rights reserved.
33*/
34
35
36#include "Words.h"
37
38#include <ctype.h>
39#include <stdio.h>
40#include <string.h>
41
42
43enum {
44	FIND_WORD,
45	GET_WORD,
46	GET_FLAGS
47};
48
49
50/*
51	MAXMETAPH is the length of the Metaphone code.
52
53	Four is a good compromise value for English names. For comparing words
54	which are not names or for some non-English names, use a longer code
55	length for more precise matches.
56
57	The default here is 5.
58*/
59
60#define MAXMETAPH 6
61
62
63// Character coding array, A-Z
64static char vsvfn[26] = { 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0,
65	2, 4, 4, 1, 0, 0, 0, 8, 0};
66
67
68static const char* gCmpKey;
69
70
71static int
72word_cmp(BString** firstArg, BString** secondArg)
73{
74	return word_match(gCmpKey, (*firstArg)->String()) - word_match(gCmpKey,
75		(*secondArg)->String());
76}
77
78
79Words::Words(bool useMetaphone)
80	:
81	fUseMetaphone(useMetaphone)
82{
83
84}
85
86
87Words::Words(BPositionIO* thes, bool useMetaphone)
88	:
89	WIndex(thes),
90	fUseMetaphone(useMetaphone)
91{
92
93}
94
95
96Words::~Words(void)
97{
98
99}
100
101
102Words::Words(const char* dataPath, const char* indexPath, bool useMetaphone)
103	:
104	fUseMetaphone(useMetaphone)
105{
106	if (!useMetaphone)
107		fEntrySize = sizeof(uint32);
108	SetTo(dataPath, indexPath);
109}
110
111
112status_t
113Words::BuildIndex(void)
114{
115	// Parse the Words file...
116
117	// Buffer Stuff
118	char			buffer[16384];
119	char			*nptr, *eptr;
120	int64			blockOffset;
121	int32			blockSize;
122
123	// The Word Entry
124	WIndexEntry		entry;
125	char			entryName[256], *namePtr = entryName;
126	char			suffixName[256];
127	char			flags[32], *flagsPtr = flags;
128
129	// State Info
130	int32			state = FIND_WORD;
131
132	// Make sure we are at start of file
133	fDataFile->Seek(0, SEEK_SET);
134	entry.offset = -1;
135
136	// Read blocks from thes until eof
137	while (true) {
138		// Get next block
139		blockOffset = fDataFile->Position();
140		if ((blockSize = fDataFile->Read(buffer, 16384)) == 0)
141			break;
142
143		// parse block
144		for (nptr = buffer, eptr = buffer + blockSize; nptr < eptr; nptr++) {
145			// Looking for start of word?
146			if (state == FIND_WORD) {
147				// Is start of word?
148				if (isalpha(*nptr)) {
149					state = GET_WORD;
150					*namePtr++ = *nptr; // copy word
151					entry.offset = blockOffset + (nptr - buffer);
152				} else {
153					entry.offset++;
154				}
155			} else if ((*nptr == '\n') || (*nptr == '\r')) {
156				// End of word?
157
158				if (namePtr != entryName) {
159					// Add previous entry to word index
160					*namePtr = 0; // terminate word
161					*flagsPtr = 0; // terminate flags
162					NormalizeWord(entryName, entryName);
163					// Add base word
164					entry.key = GetKey(entryName);
165					AddItem(&entry);
166
167					// Add suffixed words if any
168					if (flagsPtr != flags) {
169						// printf("Base: %s, flags: %s\n", entryName, flags);
170						for (flagsPtr = flags; *flagsPtr != 0; flagsPtr++) {
171							if (suffix_word(suffixName, entryName, *flagsPtr)) {
172								// printf("Suffix: %s\n", suffixName);
173								entry.key = GetKey(suffixName);
174								AddItem(&entry);
175							}
176						}
177					}
178				}
179				// Init new entry
180				state = FIND_WORD;
181				namePtr = entryName;
182				flagsPtr = flags;
183			} else if (state == GET_WORD) {
184				// Start of flags?
185				if (*nptr == '/') {
186					*namePtr = 0; // terminate word
187					// printf("Found word: %s\n", entryName);
188
189					state = GET_FLAGS;
190				} else {
191					*namePtr++ = *nptr; // copy word
192				}
193			} else if (state == GET_FLAGS) // Are we getting the flags?
194				*flagsPtr++ = *nptr; // copy flag
195		}	// End for (nptr = buffer, eptr = buffer + blockSize;
196				// nptr < eptr; nptr++, entry.size++)
197	} // End while (true)
198
199	SortItems();
200	return B_OK;
201}
202
203
204int32
205Words::GetKey(const char* s)
206{
207	if (fUseMetaphone) {
208		char		Metaph[12];
209		const char	*sPtr;
210		int32		key = 0;
211		int32		offset;
212		char		c;
213
214		metaphone(s, Metaph, GENERATE);
215		// Compact Metaphone from 6 bytes to 4
216
217		// printf("%s -> %s: \n", s, Metaph);
218
219		for (sPtr = Metaph, offset = 25; *sPtr; sPtr++, offset -= 5) {
220			c = *sPtr - 'A';
221			// printf("%d,", int16(c));
222			key |= int32(c) << offset;
223		}
224		for (; offset >= 0; offset -= 5)
225			key |= int32(31) << offset;
226		// printf(": %ld\n", key);
227		return key;
228	} else {
229		return WIndex::GetKey(s);
230	}
231}
232
233
234// Macros to access the character coding array
235#define vowel(x)	(vsvfn[(x) - 'A'] & 1)	// AEIOU
236#define same(x)		(vsvfn[(x) - 'A'] & 2)	// FJLMNR
237#define varson(x)	(vsvfn[(x) - 'A'] & 4)	// CGPST
238#define frontv(x)	(vsvfn[(x) - 'A'] & 8)	// EIY
239#define noghf(x)	(vsvfn[(x) - 'A'] & 16)	// BDH
240#define NUL '\0'
241
242/*
243	metaphone()
244
245	Arguments:
246	1 - The word to be converted to a metaphone code.
247	2 - A MAXMETAPH + 1 char field for the result.
248	3 - Function flag:
249	If 0: Compute the Metaphone code for the first argument,
250		then compare it to the Metaphone code passed in the second argument.
251	If 1: Compute the Metaphone code for the first argument,
252		then store the result in the area pointed to by the second argument.
253
254	Returns:
255	If function code is 0, returns Success_ for a match, else Error_.
256	If function code is 1, returns Success_.
257*/
258
259
260bool
261metaphone(const char* Word, char* Metaph, metaphlag Flag)
262{
263	char *n, *n_start, *n_end;			// Pointers to string
264	char *metaph = NULL, *metaph_end;	// Pointers to metaph
265	char ntrans[512];					// Word with uppercase letters
266	char newm[MAXMETAPH + 4];			// New metaph for comparison
267	int KSflag;							// State flag for X translation
268
269	// Copy word to internal buffer, dropping non-alphabetic characters
270	// and converting to upper case.
271
272	for (n = ntrans + 1, n_end = ntrans + sizeof(ntrans) - 2;
273		*Word && n < n_end; ++Word) {
274		if (isalpha(*Word))
275			*n++ = toupper(*Word);
276	}
277
278	if (n == ntrans + 1)
279		return false;		// Return if zero characters
280	else
281			n_end = n;		//	Set end of string pointer
282
283	// Pad with NULs, front and rear
284
285	*n++ = NUL;
286	*n = NUL;
287	n = ntrans;
288	*n++ = NUL;
289
290	// If doing comparison, redirect pointers
291
292	if (COMPARE == Flag) {
293		metaph = Metaph;
294		Metaph = newm;
295	}
296
297	// Check for PN, KN, GN, WR, WH, and X at start
298
299	switch (*n) {
300		case 'P':
301		case 'K':
302		case 'G':
303			if ('N' == *(n + 1))
304				*n++ = NUL;
305			break;
306
307		case 'A':
308			if ('E' == *(n + 1))
309				*n++ = NUL;
310			break;
311
312		case 'W':
313			if ('R' == *(n + 1)) {
314				*n++ = NUL;
315			} else if ('H' == *(n + 1)) {
316				*(n + 1) = *n;
317				*n++ = NUL;
318			}
319			break;
320
321		case 'X':
322			*n = 'S';
323			break;
324	}
325
326	// Now loop through the string, stopping at the end of the string
327	// or when the computed Metaphone code is MAXMETAPH characters long.
328
329	KSflag = false;	// State flag for KStranslation
330
331	for (metaph_end = Metaph + MAXMETAPH, n_start = n;
332		n <= n_end && Metaph < metaph_end; ++n) {
333		if (KSflag) {
334			KSflag = false;
335			*Metaph++ = *n;
336		} else {
337			// Drop duplicates except for CC
338
339			if (*(n - 1) == *n && *n != 'C')
340				continue;
341
342			// Check for F J L M N R or first letter vowel
343
344			if (same(*n) || (n == n_start && vowel(*n))) {
345				*Metaph++ = *n;
346			} else {
347				switch (*n) {
348					case 'B':
349						if (n < n_end || *(n - 1) != 'M')
350							*Metaph++ = *n;
351						break;
352
353					case 'C':
354						if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
355							if ('I' == *(n + 1) && 'A' == *(n + 2))
356								*Metaph++ = 'X';
357							else if (frontv(*(n + 1)))
358								*Metaph++ = 'S';
359							else if ('H' == *(n + 1)) {
360								*Metaph++ = ((n == n_start && !vowel(*(n + 2)))
361									|| 'S' == *(n - 1)) ? 'K' : 'X';
362							} else {
363								*Metaph++ = 'K';
364							}
365						}
366						break;
367
368					case 'D':
369						*Metaph++ = ('G' == *(n + 1) && frontv(*(n + 2)))
370							? 'J' : 'T';
371						break;
372
373					case 'G':
374						if ((*(n + 1) != 'H' || vowel(*(n + 2)))
375							&& (*(n + 1) != 'N' || ((n + 1) < n_end
376								&& (*(n + 2) != 'E' || *(n + 3) != 'D')))
377							&& (*(n - 1) != 'D' || !frontv(*(n + 1)))) {
378							*Metaph++ = (frontv(*(n + 1))
379								&& *(n + 2) != 'G') ? 'J' : 'K';
380						} else if ('H' == *(n + 1) && !noghf(*(n - 3))
381							&& *(n - 4) != 'H') {
382							*Metaph++ = 'F';
383						}
384						break;
385
386					case 'H':
387						if (!varson(*(n - 1))
388							&& (!vowel(*(n - 1)) || vowel(*(n + 1)))) {
389							*Metaph++ = 'H';
390						}
391						break;
392
393					case 'K':
394						if (*(n - 1) != 'C')
395							*Metaph++ = 'K';
396						break;
397
398					case 'P':
399						*Metaph++ = ('H' == *(n + 1)) ? 'F' : 'P';
400						break;
401
402					case 'Q':
403						*Metaph++ = 'K';
404						break;
405
406					case 'S':
407						*Metaph++ = ('H' == *(n + 1) || ('I' == *(n + 1)
408								&& ('O' == *(n + 2) || 'A' == *(n + 2))))
409							? 'X' : 'S';
410						break;
411
412					case 'T':
413						if ('I' == *(n + 1)
414							&& ('O' == *(n + 2) || 'A' == *(n + 2))) {
415							*Metaph++ = 'X';
416						} else if ('H' == *(n + 1)) {
417							*Metaph++ = 'O';
418						} else if (*(n + 1) != 'C' || *(n + 2) != 'H') {
419							*Metaph++ = 'T';
420						}
421						break;
422
423					case 'V':
424						*Metaph++ = 'F';
425						break;
426
427					case 'W':
428					case 'Y':
429						if (vowel(*(n + 1)))
430							*Metaph++ = *n;
431						break;
432
433					case 'X':
434						if (n == n_start) {
435							*Metaph++ = 'S';
436						} else {
437							*Metaph++ = 'K';
438							KSflag = true;
439						}
440						break;
441
442					case 'Z':
443						*Metaph++ = 'S';
444						break;
445				}
446			}
447		}
448
449		// Compare new Metaphone code with old
450		if (COMPARE == Flag
451			&& *(Metaph - 1) != metaph[(Metaph - newm) - 1]) {
452			return false;
453		}
454	}
455
456	// If comparing, check if Metaphone codes were equal in length
457	if (COMPARE == Flag && metaph[Metaph - newm])
458		return false;
459
460	*Metaph = NUL;
461	return true;
462}
463
464
465int
466word_match(const char* reference, const char* test)
467{
468	const char	*s1, *s2;
469	int32 		x = 0;
470	char		c1, c2;
471	s1 = test;
472	s2 = reference;
473
474	bool a, b;
475
476	while (*s2 || *s1) {
477		c1 = tolower(*s1);
478		c2 = tolower(*s2);
479
480		if (*s2 && *s1) {
481			if (c1 != c2) {
482				a = (tolower(s1[1]) == c2);
483				b = (tolower(s2[1]) == c1);
484				// Reversed pair
485				if (a && b) {
486					x += 1;
487					s1++;
488					s2++;
489				}
490				// Extra character
491				if (a) {
492					x += 1;
493					s1++;
494				}
495				// Missing Character
496				else if (b) {
497					x += 1;
498					s2++;
499				}
500				// Equivalent Character
501				else if (vsvfn[(unsigned)c1] == vsvfn[(unsigned)c2])
502					x++;
503				// Unrelated Character
504				else
505					x += 3;
506			}
507		} else {
508			x += 1;
509		}
510
511		if (*s2)
512			s2++;
513		if (*s1)
514			s1++;
515	}
516
517	return x;
518}
519
520
521int32
522suffix_word(char* dst, const char* src, char flag)
523{
524	char* end;
525
526	end = stpcpy(dst, src);
527	flag = toupper(flag);
528	switch (flag) {
529		case 'V':
530			switch (end[-1]) {
531				case 'e':
532					end = stpcpy(end - 1, "ive");
533					break;
534				default:
535					end = stpcpy(end, "ive");
536					break;
537			}
538			break;
539		case 'N':
540			switch (end[-1]) {
541				case 'e':
542					end = stpcpy(end - 1, "ion");
543					break;
544				case 'y':
545					end = stpcpy(end - 1, "ication");
546					break;
547				default:
548					end = stpcpy(end, "en");
549					break;
550			}
551			break;
552		case 'X':
553			switch (end[-1]) {
554				case 'e':
555					end = stpcpy(end - 1, "ions");
556					break;
557				case 'y':
558					end = stpcpy(end - 1, "ications");
559					break;
560				default:
561					end = stpcpy(end, "ens");
562					break;
563			}
564			break;
565		case 'H':
566			switch (end[-1]) {
567				case 'y':
568					end = stpcpy(end - 1, "ieth");
569					break;
570				default:
571					end = stpcpy(end, "th");
572					break;
573			}
574			break;
575		case 'Y':
576			end = stpcpy(end, "ly");
577			break;
578		case 'G':
579			switch (end[-1]) {
580				case 'e':
581					end = stpcpy(end - 1, "ing");
582					break;
583				default:
584					end = stpcpy(end, "ing");
585					break;
586			}
587			break;
588		case 'J':
589			switch (end[-1]) {
590				case 'e':
591					end = stpcpy(end - 1, "ings");
592					break;
593				default:
594					end = stpcpy(end, "ings");
595					break;
596			}
597			break;
598		case 'D':
599			switch (end[-1]) {
600				case 'e':
601					end = stpcpy(end - 1, "ed");
602					break;
603				case 'y':
604					if (!strchr("aeiou", end[-2])) {
605						end = stpcpy(end - 1, "ied");
606						break;
607					}
608					// Fall through
609				default:
610					end = stpcpy(end, "ed");
611					break;
612			}
613			break;
614		case 'T':
615			switch (end[-1]) {
616				case 'e':
617					end = stpcpy(end - 1, "est");
618					break;
619				case 'y':
620					if (!strchr("aeiou", end[-2])) {
621						end = stpcpy(end - 1, "iest");
622						break;
623					}
624					// Fall through
625				default:
626					end = stpcpy(end, "est");
627					break;
628			}
629			break;
630		case 'R':
631			switch (end[-1]) {
632				case 'e':
633					end = stpcpy(end - 1, "er");
634					break;
635				case 'y':
636					if (!strchr("aeiou", end[-2])) {
637						end = stpcpy(end - 1, "ier");
638						break;
639					}
640					// Fall through
641				default:
642					end = stpcpy(end, "er");
643					break;
644			}
645			break;
646		case 'Z':
647			switch (end[-1]) {
648				case 'e':
649					end = stpcpy(end - 1, "ers");
650					break;
651				case 'y':
652					if (!strchr("aeiou", end[-2])) {
653						end = stpcpy(end - 1, "iers");
654						break;
655					}
656					// Fall through
657				default:
658					end = stpcpy(end, "ers");
659					break;
660			}
661			break;
662		case 'S':
663			switch (end[-1]) {
664				case 's':
665				case 'x':
666				case 'z':
667				case 'h':
668					end = stpcpy(end, "es");
669					break;
670				case 'y':
671					if (!strchr("aeiou", end[-2])) {
672						end = stpcpy(end - 1, "ies");
673						break;
674					}
675					// Fall through
676				default:
677					end = stpcpy(end, "s");
678					break;
679			}
680			break;
681		case 'P':
682			switch (end[-1]) {
683				case 'y':
684					if (!strchr("aeiou", end[-2])) {
685						end = stpcpy(end - 1, "iness");
686						break;
687					}
688					// Fall through
689				default:
690					end = stpcpy(end, "ness");
691					break;
692			}
693			break;
694		case 'M':
695			end = stpcpy(end, "'s");
696			break;
697		default:
698			return 0;
699	}
700	return end - dst;
701}
702
703
704int32
705Words::FindBestMatches(BList* matches, const char* s)
706{
707	int32		index;
708	// printf("*** Looking for %s: ***\n", s);
709
710	if ((index = FindFirst(s)) >= 0) {
711		BString srcWord(s);
712		FileEntry* entry;
713		WIndexEntry* indexEntry;
714
715		int32		key = (ItemAt(index))->key;
716		int32		suffixLength;
717		char		word[128], suffixWord[128];
718		const char	*src, *testWord;
719		const char 	*suffixFlags;
720		char		*dst;
721
722		gCmpKey = srcWord.String();
723
724		uint8		hashTable[32];
725		uint8		hashValue, highHash, lowHash;
726		for (int32 i = 0; i < 32; i++)
727			hashTable[i] = 0;
728
729		do {
730			indexEntry = ItemAt(index);
731			// Hash the entry offset; we use this to make sure we don't add
732			// the same word file entry twice;
733			// It is possible for the same entry in the words file to have
734			// multiple entries in the index.
735
736			hashValue = indexEntry->offset % 256;
737			highHash = hashValue >> 3;
738			lowHash = 0x01 << (hashValue & 0x07);
739
740			// printf("Testing Entry: %ld: hash=%d, highHash=%d, lowHash=%d\n",
741			// indexEntry->offset, hashValue, (uint16)highHash,
742			// (uint16)lowHash);
743
744			// Has this entry offset been seen before?
745			if (!(hashTable[highHash] & lowHash)) {
746				// printf("New Entry\n");
747				hashTable[highHash] |= lowHash; // Mark this offset so we don't add it twice
748
749				entry = GetEntry(index);
750				src = entry->String();
751				while (*src && !isalpha(*src))
752					src++;
753				dst = word;
754				while (*src && *src != '/')
755					*dst++ = *src++;
756				*dst = 0;
757				if (*src == '/')
758					suffixFlags = src + 1;
759				else
760					suffixFlags = src;
761
762				// printf("Base Word: %s\n", word);
763				// printf("Flags: %s\n", suffixFlags);
764				testWord = word; // Test the base word first
765				do {
766					// printf("Testing: %s\n", testWord);
767					// Does this word match the key
768					if ((GetKey(testWord) == key)
769					// And does it look close enough to the compare key?
770					// word_match(gCmpKey, testWord)
771					//	<= int32((strlen(gCmpKey)-1)/2))
772						&& word_match(gCmpKey, testWord)
773							<= int32(float(strlen(gCmpKey)-1)*.75)) {
774						// printf("Added: %s\n", testWord);
775						matches->AddItem(new BString(testWord));
776					}
777
778					// If suffix, transform and test
779					if (*suffixFlags) {
780						// Repeat until valid suffix found or end is reached
781						suffixLength = 0;
782						while (*suffixFlags
783							&& !(suffixLength = suffix_word(suffixWord,
784								word, *suffixFlags++))) {}
785						if (suffixLength)
786							testWord = suffixWord;
787						else
788							testWord = NULL;
789					} else {
790						testWord = NULL;
791					}
792				} while (testWord);
793				delete entry;
794			}
795			// else printf("Redundant entry\n");
796
797			index++;
798		} while (key == (ItemAt(index))->key);
799
800		return matches->CountItems();
801	} else {
802		return 0;
803	}
804}
805
806
807void
808sort_word_list(BList* matches, const char* reference)
809{
810	if (matches->CountItems() > 0) {
811		BString srcWord(reference);
812		gCmpKey = srcWord.String();
813		matches->SortItems((int(*)(const void*, const void*))word_cmp);
814	}
815}
816
817