1# soundex.tcl -- 2# 3# Implementation of soundex in Tcl 4# 5# Copyright (c) 2003 by Andreas Kupries <andreas_kupries@users.sourceforge.net> 6# 7# See the file "license.terms" for information on usage and redistribution 8# of this file, and for a DISCLAIMER OF ALL WARRANTIES. 9# 10# RCS: @(#) $Id: soundex.tcl,v 1.3 2004/01/15 06:36:14 andreas_kupries Exp $ 11 12package require Tcl 8.2 13 14namespace eval ::soundex {} 15 16## ------------------------------------------------------------ 17## 18## I. Soundex by Knuth. 19 20# This implementation of the Soundex algorithm is released to the public 21# domain: anyone may use it for any purpose. See if I care. 22 23# N. Dean Pentcheff 1/13/89 Dept. of Zoology University of California Berkeley, 24# CA 94720 dean@violet.berkeley.edu 25# TCL port by Evan Rempel 2/10/98 Dept Comp Services University of Victoria. 26# erempel@uvic.ca 27 28# proc ::soundex::knuth ( string ) 29# 30# Given as argument: a character string. Returns: a static string, 4 characters long 31# This string is the Soundex key for the argument string. 32# Side effects and limitations: 33# Does not clobber the string passed in as the argument. No limit on 34# argument string length. Assumes a character set with continuously 35# ascending and contiguous letters within each case and within the digits 36# (e.g. this works for ASCII and bombs in EBCDIC. But then, most things 37# do.). Reference: Adapted from Knuth, D.E. (1973) The art of computer 38# programming; Volume 3: Sorting and searching. Addison-Wesley Publishing 39# Company: Reading, Mass. Page 392. 40# Special cases: Leading or embedded spaces, numerals, or punctuation are squeezed 41# out before encoding begins. 42# 43# Null strings or those with no encodable letters return the code 'Z000'. 44# 45# Test data from Knuth (1973): 46# Euler Gauss Hilbert Knuth Lloyd Lukasiewicz 47# E460 G200 H416 K530 L300 L222 48 49namespace eval ::soundex { 50 variable soundexKnuthCode 51 array set soundexKnuthCode { 52 a 0 b 1 c 2 d 3 e 0 f 1 g 2 h 0 i 0 j 2 k 2 l 4 m 5 53 n 5 o 0 p 1 q 2 r 6 s 2 t 3 u 0 v 1 w 0 x 2 y 0 z 2 54 } 55} 56proc ::soundex::knuth {in} { 57 variable soundexKnuthCode 58 set key "" 59 60 # Remove the leading/trailing white space punctuation etc. 61 62 set TempIn [string trim $in "\t\n\r .,'-"] 63 64 # Only use alphabetic characters, so strip out all others 65 # also, soundex index uses only lower case chars, so force to lower 66 67 regsub -all {[^a-z]} [string tolower $TempIn] {} TempIn 68 if {[string length $TempIn] == 0} { 69 return Z000 70 } 71 set last [string index $TempIn 0] 72 set key [string toupper $last] 73 set last $soundexKnuthCode($last) 74 75 # Scan rest of string, stop at end of string or when the key is 76 # full 77 78 set count 1 79 set MaxIndex [string length $TempIn] 80 81 for {set index 1} {(($count < 4) && ($index < $MaxIndex))} {incr index } { 82 set chcode $soundexKnuthCode([string index $TempIn $index]) 83 # Fold together adjacent letters sharing the same code 84 if {![string equal $last $chcode]} { 85 set last $chcode 86 # Ignore code==0 letters except as separators 87 if {$last != 0} then { 88 set key $key$last 89 incr count 90 } 91 } 92 } 93 return [string range ${key}0000 0 3] 94} 95 96package provide soundex 1.0 97