Commit | Line | Data |
---|---|---|
a0d0e21e LW |
1 | package Text::Soundex; |
2 | require 5.000; | |
3 | require Exporter; | |
4 | ||
5 | @ISA = qw(Exporter); | |
6 | @EXPORT = qw(&soundex $soundex_nocode); | |
7 | ||
d6a466d7 | 8 | $VERSION = '1.01'; |
8cd2b3b0 | 9 | |
a0d0e21e LW |
10 | # $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $ |
11 | # | |
12 | # Implementation of soundex algorithm as described by Knuth in volume | |
13 | # 3 of The Art of Computer Programming, with ideas stolen from Ian | |
7e6e257f | 14 | # Phillipps <ian@pipex.net>. |
a0d0e21e LW |
15 | # |
16 | # Mike Stok <Mike.Stok@meiko.concord.ma.us>, 2 March 1994. | |
17 | # | |
18 | # Knuth's test cases are: | |
19 | # | |
20 | # Euler, Ellery -> E460 | |
21 | # Gauss, Ghosh -> G200 | |
22 | # Hilbert, Heilbronn -> H416 | |
23 | # Knuth, Kant -> K530 | |
24 | # Lloyd, Ladd -> L300 | |
25 | # Lukasiewicz, Lissajous -> L222 | |
26 | # | |
27 | # $Log: soundex.pl,v $ | |
28 | # Revision 1.2 1994/03/24 00:30:27 mike | |
29 | # Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu> | |
30 | # in the way I handles leasing characters which were different but had | |
31 | # the same soundex code. This showed up comparing it with Oracle's | |
32 | # soundex output. | |
33 | # | |
34 | # Revision 1.1 1994/03/02 13:01:30 mike | |
35 | # Initial revision | |
36 | # | |
37 | # | |
38 | ############################################################################## | |
39 | ||
40 | # $soundex_nocode is used to indicate a string doesn't have a soundex | |
41 | # code, I like undef other people may want to set it to 'Z000'. | |
42 | ||
43 | $soundex_nocode = undef; | |
44 | ||
a0d0e21e LW |
45 | sub soundex |
46 | { | |
47 | local (@s, $f, $fc, $_) = @_; | |
48 | ||
cb1a09d0 AD |
49 | push @s, '' unless @s; # handle no args as a single empty string |
50 | ||
a0d0e21e LW |
51 | foreach (@s) |
52 | { | |
55497cff | 53 | $_ = uc $_; |
a0d0e21e LW |
54 | tr/A-Z//cd; |
55 | ||
56 | if ($_ eq '') | |
57 | { | |
58 | $_ = $soundex_nocode; | |
59 | } | |
60 | else | |
61 | { | |
62 | ($f) = /^(.)/; | |
63 | tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/; | |
64 | ($fc) = /^(.)/; | |
65 | s/^$fc+//; | |
66 | tr///cs; | |
67 | tr/0//d; | |
68 | $_ = $f . $_ . '000'; | |
69 | s/^(.{4}).*/$1/; | |
70 | } | |
71 | } | |
72 | ||
73 | wantarray ? @s : shift @s; | |
74 | } | |
75 | ||
76 | 1; | |
77 | ||
cb1a09d0 AD |
78 | __END__ |
79 | ||
80 | =head1 NAME | |
81 | ||
82 | Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth | |
83 | ||
84 | =head1 SYNOPSIS | |
85 | ||
86 | use Text::Soundex; | |
87 | ||
88 | $code = soundex $string; # get soundex code for a string | |
89 | @codes = soundex @list; # get list of codes for list of strings | |
90 | ||
91 | # set value to be returned for strings without soundex code | |
92 | ||
93 | $soundex_nocode = 'Z000'; | |
94 | ||
95 | =head1 DESCRIPTION | |
96 | ||
97 | This module implements the soundex algorithm as described by Donald Knuth | |
98 | in Volume 3 of B<The Art of Computer Programming>. The algorithm is | |
99 | intended to hash words (in particular surnames) into a small space using a | |
100 | simple model which approximates the sound of the word when spoken by an English | |
101 | speaker. Each word is reduced to a four character string, the first | |
102 | character being an upper case letter and the remaining three being digits. | |
103 | ||
104 | If there is no soundex code representation for a string then the value of | |
105 | C<$soundex_nocode> is returned. This is initially set to C<undef>, but | |
106 | many people seem to prefer an I<unlikely> value like C<Z000> | |
107 | (how unlikely this is depends on the data set being dealt with.) Any value | |
108 | can be assigned to C<$soundex_nocode>. | |
109 | ||
110 | In scalar context C<soundex> returns the soundex code of its first | |
91e74348 | 111 | argument, and in list context a list is returned in which each element is the |
cb1a09d0 AD |
112 | soundex code for the corresponding argument passed to C<soundex> e.g. |
113 | ||
114 | @codes = soundex qw(Mike Stok); | |
115 | ||
116 | leaves C<@codes> containing C<('M200', 'S320')>. | |
117 | ||
118 | =head1 EXAMPLES | |
119 | ||
120 | Knuth's examples of various names and the soundex codes they map to | |
121 | are listed below: | |
122 | ||
123 | Euler, Ellery -> E460 | |
124 | Gauss, Ghosh -> G200 | |
125 | Hilbert, Heilbronn -> H416 | |
126 | Knuth, Kant -> K530 | |
127 | Lloyd, Ladd -> L300 | |
128 | Lukasiewicz, Lissajous -> L222 | |
129 | ||
130 | so: | |
131 | ||
132 | $code = soundex 'Knuth'; # $code contains 'K530' | |
133 | @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' | |
134 | ||
135 | =head1 LIMITATIONS | |
136 | ||
137 | As the soundex algorithm was originally used a B<long> time ago in the US | |
138 | it considers only the English alphabet and pronunciation. | |
139 | ||
140 | As it is mapping a large space (arbitrary length strings) onto a small | |
141 | space (single letter plus 3 digits) no inference can be made about the | |
142 | similarity of two strings which end up with the same soundex code. For | |
143 | example, both C<Hilbert> and C<Heilbronn> end up with a soundex code | |
144 | of C<H416>. | |
145 | ||
146 | =head1 AUTHOR | |
147 | ||
148 | This code was implemented by Mike Stok (C<stok@cybercom.net>) from the | |
7e6e257f | 149 | description given by Knuth. Ian Phillipps (C<ian@pipex.net>) and Rich Pinder |
cb1a09d0 | 150 | (C<rpinder@hsc.usc.edu>) supplied ideas and spotted mistakes. |