Commit | Line | Data |
---|---|---|
11f885b5 SP |
1 | /* -*- c -*- */ |
2 | ||
3 | /* (c) Copyright 1998-2003 by Mark Mielke | |
4 | * | |
5 | * Freedom to use these sources for whatever you want, as long as credit | |
6 | * is given where credit is due, is hereby granted. You may make modifications | |
7 | * where you see fit but leave this copyright somewhere visible. As well try | |
8 | * to initial any changes you make so that if i like the changes i can | |
9 | * incorporate them into any later versions of mine. | |
10 | * | |
11 | * - Mark Mielke <mark@mielke.cc> | |
12 | */ | |
13 | ||
14 | #include "EXTERN.h" | |
15 | #include "perl.h" | |
16 | #include "XSUB.h" | |
17 | ||
18 | #define SOUNDEX_ACCURACY (4) /* The maximum code length... (should be>=2) */ | |
19 | ||
20 | #if !(PERL_REVISION >= 5 && PERL_VERSION >= 8) | |
21 | # define utf8n_to_uvchr utf8_to_uv | |
22 | #endif | |
23 | ||
ef0f5379 SP |
24 | static char sv_soundex_table[0x100]; |
25 | static void sv_soundex_initialize (void) | |
26 | { | |
27 | memset(&sv_soundex_table[0], '\0', sizeof(sv_soundex_table)); | |
28 | sv_soundex_table['A'] = '0'; | |
29 | sv_soundex_table['a'] = '0'; | |
30 | sv_soundex_table['E'] = '0'; | |
31 | sv_soundex_table['e'] = '0'; | |
32 | sv_soundex_table['H'] = '0'; | |
33 | sv_soundex_table['h'] = '0'; | |
34 | sv_soundex_table['I'] = '0'; | |
35 | sv_soundex_table['i'] = '0'; | |
36 | sv_soundex_table['O'] = '0'; | |
37 | sv_soundex_table['o'] = '0'; | |
38 | sv_soundex_table['U'] = '0'; | |
39 | sv_soundex_table['u'] = '0'; | |
40 | sv_soundex_table['W'] = '0'; | |
41 | sv_soundex_table['w'] = '0'; | |
42 | sv_soundex_table['Y'] = '0'; | |
43 | sv_soundex_table['y'] = '0'; | |
44 | sv_soundex_table['B'] = '1'; | |
45 | sv_soundex_table['b'] = '1'; | |
46 | sv_soundex_table['F'] = '1'; | |
47 | sv_soundex_table['f'] = '1'; | |
48 | sv_soundex_table['P'] = '1'; | |
49 | sv_soundex_table['p'] = '1'; | |
50 | sv_soundex_table['V'] = '1'; | |
51 | sv_soundex_table['v'] = '1'; | |
52 | sv_soundex_table['C'] = '2'; | |
53 | sv_soundex_table['c'] = '2'; | |
54 | sv_soundex_table['G'] = '2'; | |
55 | sv_soundex_table['g'] = '2'; | |
56 | sv_soundex_table['J'] = '2'; | |
57 | sv_soundex_table['j'] = '2'; | |
58 | sv_soundex_table['K'] = '2'; | |
59 | sv_soundex_table['k'] = '2'; | |
60 | sv_soundex_table['Q'] = '2'; | |
61 | sv_soundex_table['q'] = '2'; | |
62 | sv_soundex_table['S'] = '2'; | |
63 | sv_soundex_table['s'] = '2'; | |
64 | sv_soundex_table['X'] = '2'; | |
65 | sv_soundex_table['x'] = '2'; | |
66 | sv_soundex_table['Z'] = '2'; | |
67 | sv_soundex_table['z'] = '2'; | |
68 | sv_soundex_table['D'] = '3'; | |
69 | sv_soundex_table['d'] = '3'; | |
70 | sv_soundex_table['T'] = '3'; | |
71 | sv_soundex_table['t'] = '3'; | |
72 | sv_soundex_table['L'] = '4'; | |
73 | sv_soundex_table['l'] = '4'; | |
74 | sv_soundex_table['M'] = '5'; | |
75 | sv_soundex_table['m'] = '5'; | |
76 | sv_soundex_table['N'] = '5'; | |
77 | sv_soundex_table['n'] = '5'; | |
78 | sv_soundex_table['R'] = '6'; | |
79 | sv_soundex_table['r'] = '6'; | |
80 | } | |
11f885b5 | 81 | |
81a4c762 | 82 | static SV *sv_soundex (SV *source) |
11f885b5 SP |
83 | { |
84 | char *source_p; | |
85 | char *source_end; | |
86 | ||
87 | { | |
88 | STRLEN source_len; | |
89 | source_p = SvPV(source, source_len); | |
90 | source_end = &source_p[source_len]; | |
91 | } | |
92 | ||
93 | while (source_p != source_end) | |
94 | { | |
ef0f5379 SP |
95 | char codepart_last = sv_soundex_table[(unsigned char) *source_p]; |
96 | ||
97 | if (codepart_last != '\0') | |
11f885b5 SP |
98 | { |
99 | SV *code = newSV(SOUNDEX_ACCURACY); | |
100 | char *code_p = SvPVX(code); | |
101 | char *code_end = &code_p[SOUNDEX_ACCURACY]; | |
11f885b5 SP |
102 | |
103 | SvCUR_set(code, SOUNDEX_ACCURACY); | |
104 | SvPOK_only(code); | |
105 | ||
ef0f5379 | 106 | *code_p++ = toupper(*source_p++); |
11f885b5 SP |
107 | |
108 | while (source_p != source_end && code_p != code_end) | |
109 | { | |
110 | char c = *source_p++; | |
ef0f5379 | 111 | char codepart = sv_soundex_table[(unsigned char) c]; |
11f885b5 | 112 | |
ef0f5379 SP |
113 | if (codepart != '\0') |
114 | if (codepart != codepart_last && (codepart_last = codepart) != '0') | |
115 | *code_p++ = codepart; | |
11f885b5 SP |
116 | } |
117 | ||
118 | while (code_p != code_end) | |
119 | *code_p++ = '0'; | |
120 | ||
121 | *code_end = '\0'; | |
122 | ||
123 | return code; | |
124 | } | |
125 | ||
126 | source_p++; | |
127 | } | |
128 | ||
129 | return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE)); | |
130 | } | |
131 | ||
81a4c762 | 132 | static SV *sv_soundex_utf8 (SV* source) |
11f885b5 SP |
133 | { |
134 | U8 *source_p; | |
135 | U8 *source_end; | |
136 | ||
137 | { | |
138 | STRLEN source_len; | |
139 | source_p = (U8 *) SvPV(source, source_len); | |
140 | source_end = &source_p[source_len]; | |
141 | } | |
142 | ||
143 | while (source_p < source_end) | |
144 | { | |
145 | STRLEN offset; | |
146 | UV c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); | |
ef0f5379 | 147 | char codepart_last = (c <= 0xFF) ? sv_soundex_table[c] : '\0'; |
11f885b5 SP |
148 | source_p = (offset >= 1) ? &source_p[offset] : source_end; |
149 | ||
ef0f5379 | 150 | if (codepart_last != '\0') |
11f885b5 SP |
151 | { |
152 | SV *code = newSV(SOUNDEX_ACCURACY); | |
153 | char *code_p = SvPVX(code); | |
154 | char *code_end = &code_p[SOUNDEX_ACCURACY]; | |
11f885b5 SP |
155 | |
156 | SvCUR_set(code, SOUNDEX_ACCURACY); | |
157 | SvPOK_only(code); | |
158 | ||
ef0f5379 | 159 | *code_p++ = toupper(c); |
11f885b5 SP |
160 | |
161 | while (source_p != source_end && code_p != code_end) | |
162 | { | |
ef0f5379 | 163 | char codepart; |
11f885b5 | 164 | c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); |
ef0f5379 | 165 | codepart = (c <= 0xFF) ? sv_soundex_table[c] : '\0'; |
11f885b5 SP |
166 | source_p = (offset >= 1) ? &source_p[offset] : source_end; |
167 | ||
ef0f5379 SP |
168 | if (codepart != '\0') |
169 | if (codepart != codepart_last && (codepart_last = codepart) != '0') | |
170 | *code_p++ = codepart; | |
11f885b5 SP |
171 | } |
172 | ||
173 | while (code_p != code_end) | |
174 | *code_p++ = '0'; | |
175 | ||
176 | *code_end = '\0'; | |
177 | ||
178 | return code; | |
179 | } | |
180 | ||
181 | source_p++; | |
182 | } | |
183 | ||
184 | return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE)); | |
185 | } | |
186 | ||
187 | MODULE = Text::Soundex PACKAGE = Text::Soundex | |
188 | ||
189 | PROTOTYPES: DISABLE | |
190 | ||
191 | void | |
192 | soundex_xs (...) | |
ef0f5379 SP |
193 | INIT: |
194 | { | |
195 | sv_soundex_initialize(); | |
196 | } | |
11f885b5 SP |
197 | PPCODE: |
198 | { | |
199 | int i; | |
200 | for (i = 0; i < items; i++) | |
201 | { | |
202 | SV *sv = ST(i); | |
203 | ||
204 | if (DO_UTF8(sv)) | |
205 | sv = sv_soundex_utf8(sv); | |
206 | else | |
207 | sv = sv_soundex(sv); | |
208 | ||
209 | PUSHs(sv_2mortal(sv)); | |
210 | } | |
211 | } |