opensextant.advas_phonetics

View Source

  1# -*- coding: utf-8 -*-
  2# ----------------------------------------------------------
  3# AdvaS Advanced Search 
  4# module for phonetic algorithms
  5#
  6# (C) 2002 - 2005 Frank Hofmann, Chemnitz, Germany
  7# email fh@efho.de
  8# ----------------------------------------------------------
  9#
 10# changed 2005-01-24
 11# 2012-01-01 MU adapted to support various Unicode transliterations in Metaphone
 12# 2021-03-25 MU migrated to Xponents here
 13
 14import re
 15
 16
 17def soundex(term):
 18    """Return the soundex value to a string argument."""
 19
 20    # Create and compare soundex codes of English words.
 21    #
 22    # Soundex is an algorithm that hashes English strings into
 23    # alpha-numerical value that represents what the word sounds
 24    # like. For more information on soundex and some notes on the
 25    # differences in implemenations visit:
 26    # http://www.bluepoof.com/Soundex/info.html
 27    #
 28    # This version modified by Nathan Heagy at Front Logic Inc., to be
 29    # compatible with php's soundexing and much faster.
 30    #
 31    # eAndroid / Nathan Heagy / Jul 29 2000
 32    # changes by Frank Hofmann / Jan 02 2005
 33
 34    # generate translation table only once. used to translate into soundex numbers
 35    # table = string.maketrans('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', '0123012002245501262301020201230120022455012623010202')
 36    table = "".maketrans('ABCDEFGHIJKLMNOPQRSTUVWXYZ', '01230120022455012623010202')
 37
 38    # check parameter
 39    if not term:
 40        return "0000"  # could be Z000 for compatibility with other implementations
 41    # end if
 42
 43    # convert into uppercase letters
 44    term = term.upper()
 45    first_char = term[0]
 46
 47    # translate the string into soundex code according to the table above
 48    term = term[1:].translate(table)
 49
 50    # remove all 0s
 51    term = term.replace("0", "")
 52
 53    # remove duplicate numbers in-a-row
 54    str2 = first_char
 55    for x in term:
 56        if x != str2[-1]:
 57            str2 = str2 + x
 58    # end if
 59    # end for
 60
 61    # pad with zeros
 62    str2 = str2 + "0" * len(str2)
 63
 64    # take the first four letters
 65    return_value = str2[:4]
 66
 67    # return value
 68    return return_value
 69
 70
 71# MCU: optimization -- put constant tables in global space, not functional space.
 72#
 73# build translation table
 74metaphone_table = {
 75    "ae": "e",
 76    "gn": "n",
 77    # "kn":"n",  -- Generalization for 'known' or 'knowles' => 'nwn' or 'nwls'; But not "Ken" or "Kane"=> "kn"
 78    "pn": "n",
 79    "wr": "r",
 80    "wh": "w"}
 81
 82# define standard translation table
 83metaphone_std_trans = {
 84    "b": "b",
 85    "c": "k",
 86    "d": "t",
 87    "g": "k",
 88    "h": "h",
 89    "k": "k",
 90    "p": "p",
 91    "n": "n",
 92    "q": "k",
 93    "s": "s",
 94    "t": "t",
 95    "v": "f",
 96    "w": "w",
 97    "x": "ks",
 98    "y": "y",
 99    "z": "s"}
100
101EMPTY_STRING = ''
102
103# cStringIO is about 5% slower than normal + operation for short strings.
104# from cStringIO import StringIO
105
106re_sub_nonalpha = re.compile(u'[^a-zḑḩñşţz̧]')
107re_sub_vowels = re.compile(u'[aeiou]')
108vowels = {'a', 'e', 'i', 'o', 'u'}
109re_CI = re.compile('c[iey]')
110re_SCI = re.compile('sc[iey]')
111re_DG = re.compile('dg[eyi]')
112re_GHvowel = re.compile('gh[aeiouy]')
113# vowels of various Latin forms, preceeding H, where H is followed by consonant.
114re_vowelHvowel = re.compile(u'[aeiouyāīū][hḩ][^aeiouy]')
115re_softH = re.compile('[csptg]h')
116re_SIvowel = re.compile('si[ao]')
117re_TIvowel = re.compile('ti[ao]')
118re_shadowedW = re.compile('w[^aeiouy]')
119
120
121def metaphone(text):
122    """returns metaphone code for a given string"""
123
124    # implementation of the original algorithm from Lawrence Philips
125    # extended/rewritten by M. Kuhn
126    # improvements with thanks to John Machin <sjmachin@lexicon.net>
127    #
128    # 2011-FEB
129    # a) substantial perf improve by Marc Ubaldino <ubaldino@mitre.org> -- put regex in global space for lib. 2.5x faster
130    # b) qualitative fixes:  vowel replacements occur as last step.
131    # c) looking at oddball extended latin, e.g., ḑ,
132    # d) improved repeated chars,  pizza, fayyad, etc.
133
134    # i = 0
135    if not text:
136        # empty string ?
137        return EMPTY_STRING
138    # end if
139
140    # extension #1 (added 2005-01-28)
141    # convert to lowercase
142    term = text.lower()
143
144    # extension #2 (added 2005-01-28)
145    # remove all non-english characters, first
146    term = re_sub_nonalpha.sub('', term)
147    if len(term) == 0:
148        # nothing left
149        return EMPTY_STRING
150    # end if
151
152    # extension #3 (added 2005-01-24)
153    # conflate repeated letters
154    firstChar = term[0]
155    str2 = firstChar
156    for x in term:
157        if x != str2[-1]:
158            str2 = str2 + x
159    # end if
160    # end for
161
162    textnorm = str2
163    # text = str2
164
165    # term = str3
166    term_length = len(textnorm)
167    if term_length == 0:
168        # nothing left
169        return EMPTY_STRING
170    # end if
171
172    # define return value
173    code = ''
174    term = textnorm
175
176    # check for exceptions
177    if (term_length > 1):
178        # get first two characters
179        first_chars = term[0:2]
180
181        kn_start = textnorm.startswith('kn')
182        if first_chars in metaphone_table.keys() or kn_start:
183            term = term[2:]
184            if kn_start:
185                code = 'n'
186            else:
187                code = metaphone_table[first_chars]
188            term_length = len(term)
189    # end if
190
191    elif (term[0] == "x"):
192        term = ""
193        code = "s"
194        term_length = 0
195    # end if
196
197    i = 0
198    while (i < term_length):
199        # init character to add, init basic patterns
200        add_char = ""
201        part_n_2 = ""
202        part_n_3 = ""
203        part_n_4 = ""
204        part_c_2 = ""
205        part_c_3 = ""
206
207        # extract a number of patterns, if possible
208        if (i < (term_length - 1)):
209            part_n_2 = term[i:i + 2]
210
211            if (i > 0):
212                part_c_2 = term[i - 1:i + 1]
213                part_c_3 = term[i - 1:i + 2]
214        # end if
215        # end if
216
217        if (i < (term_length - 2)):
218            part_n_3 = term[i:i + 3]
219        # end if
220
221        if (i < (term_length - 3)):
222            part_n_4 = term[i:i + 4]
223        # end if
224
225        ch = term[i]
226
227        # use table with conditions for translations
228        if (ch == "b"):
229            add_char = metaphone_std_trans["b"]
230            if (i == (term_length - 1)):
231                if (i > 0):
232                    if (term[i - 1] == "m"):
233                        add_char = ""
234                # end if
235            # end if
236        # end if
237        elif (ch == "c"):
238            add_char = metaphone_std_trans["c"]
239            if (part_n_2 == "ch"):
240                add_char = "x"
241            elif re_CI.search(part_n_2):
242                add_char = "s"
243            # end if
244
245            if (part_n_3 == "cia"):
246                add_char = "x"
247            # end if
248
249            if re_SCI.search(part_c_3):
250                add_char = ""
251        # end if
252
253        elif (ch == "d" or ch == u'ḑ'):
254            add_char = metaphone_std_trans["d"]
255            if (re_DG.search(part_n_3)):
256                add_char = "j"
257        # end if
258
259        elif (ch == "g"):
260            add_char = metaphone_std_trans["g"]
261
262            if (part_n_2 == "gh"):
263                if (i == (term_length - 2)):
264                    add_char = ""
265            # end if
266            elif (re_GHvowel.search(part_n_3)):
267                add_char = ""
268            elif (part_n_2 == "gn"):
269                add_char = ""
270            elif (part_n_4 == "gned"):
271                add_char = ""
272            elif re_DG.search(part_c_3):
273                add_char = ""
274            elif (part_n_2 == "gi"):
275                if (part_c_3 != "ggi"):
276                    add_char = "j"
277            # end if
278            elif (part_n_2 == "ge"):
279                if (part_c_3 != "gge"):
280                    add_char = "j"
281            # end if
282            elif (part_n_2 == "gy"):
283                if (part_c_3 != "ggy"):
284                    add_char = "j"
285            # end if
286            elif (part_n_2 == "gg"):
287                add_char = ""
288        # end if
289        elif (ch == "h" or ch == u'ḩ'):
290            add_char = metaphone_std_trans["h"]
291            if (re_vowelHvowel.search(part_c_3)):
292                add_char = ""
293            elif (re_softH.search(part_c_2)):
294                add_char = ""
295        # end if
296        elif (ch == "k"):
297            add_char = metaphone_std_trans["k"]
298            if (part_c_2 == "ck"):
299                add_char = ""
300        # end if
301        elif (ch == "p"):
302            add_char = metaphone_std_trans["p"]
303            if (part_n_2 == "ph"):
304                add_char = "f"
305        # end if
306        elif (ch == "q"):
307            add_char = metaphone_std_trans["q"]
308        elif (ch == "s" or ch == u'ş'):
309            add_char = metaphone_std_trans["s"]
310            if (part_n_2 == "sh"):
311                add_char = "x"
312            # end if
313
314            if re_SIvowel.search(part_n_3):
315                add_char = "x"
316        # end if
317        elif (ch == "t" or ch == u'ţ'):
318            add_char = metaphone_std_trans["t"]
319            if (part_n_2 == "th"):
320                add_char = "0"
321            # end if
322
323            if (re_TIvowel.search(part_n_3)):
324                add_char = "x"
325        # end if
326        elif (ch == "v"):
327            add_char = metaphone_std_trans["v"]
328        elif (ch == "w"):
329            add_char = metaphone_std_trans["w"]
330            if (re_shadowedW.search(part_n_2)):
331                add_char = ""
332        # end if
333        elif (ch == "x"):
334            add_char = metaphone_std_trans["x"]
335        elif (ch == "y"):
336            add_char = metaphone_std_trans["y"]
337        elif (ch == "z" or ch == u'z̧'):
338            add_char = metaphone_std_trans["z"]
339        elif (ch == u'ñ'):
340            add_char = metaphone_std_trans['n']
341        else:
342            # alternative
343            add_char = ch
344        # end if
345
346        if add_char:
347            code = code + add_char
348        i += 1
349    # end while
350
351    # extension #4 (added 2005-01-24)
352    # This was moved from before loop
353    #  "mirance" was coming out as "mrnk"  not "mrns"
354    #  So I refactored and retested all of this.  Vowels are to be stripped out after
355    #  above patterns are run.
356    # remove any vowels unless a vowel is the first letter
357    # firstChar = str2[0]
358    # str3 = firstChar
359    # for x in str2[1:]:
360    #	if x not in vowels:
361    #		str3 = str3 + x
362    # end if
363    # end for
364    # return metaphone code
365
366    c0 = code[0]
367    reduced_code = c0 + re_sub_vowels.sub('', code[1:])
368
369    return reduced_code
370
371
def nysiis(term):
    """Return the New York State Identification and Intelligence System (NYSIIS) code for ``term``.

    The rules are applied in this order: one leading-letter replacement,
    one trailing-letter replacement, a fixed series of substitutions, the
    ``h`` and ``w`` rules, and finally clean-up of the last character(s).
    No case folding is done and every pattern is lower-case.  The result is
    not truncated.

    :param term: string to encode
    :return: the code as a string; ``""`` when ``term`` is empty
    """
    if len(term) == 0:
        return ""

    # Replacement for the first characters; only the first matching entry is applied.
    leading = {
        "mac": "mcc",
        "ph": "ff",
        "kn": "nn",
        "pf": "ff",
        "k": "c",
        "sch": "sss"
    }
    for prefix, replacement in leading.items():
        if term.startswith(prefix):
            term = replacement + term[len(prefix):]
            break

    # Replacement for the last characters; only the first matching entry is applied.
    trailing = {
        "ee": "y",
        "ie": "y",
        "dt": "d",
        "rt": "d",
        "rd": "d",
        "nt": "d",
        "nd": "d",
    }
    for suffix, replacement in trailing.items():
        if term.endswith(suffix):
            # Cut exactly the matched suffix.  The cut length used to come from
            # a variable left over from the prefix loop, so e.g. "kent" lost
            # everything but the final "d".
            term = term[:-len(suffix)] + replacement
            break

    code = term

    # Plain substitutions, applied in this order.
    for pattern, replacement in (
            (r'ev', r'af'),
            (r'[aeiouy]', r'a'),
            (r'q', r'g'),
            (r'z', r's'),
            (r'm', r'n'),
            (r'kn', r'n'),
            (r'k', r'c'),
            (r'sch', r'sss'),
            (r'ph', r'ff'),
    ):
        code = re.sub(pattern, replacement, code)

    # h rule: an 'h' next to a non-vowel is replaced by the letter before it.
    # The lazy group after 'h' always captures '', so in practice only the
    # preceding letter decides.
    for previous, following in re.findall(r'([a-z]{0,1}?)h([a-z]{0,1}?)', code):
        if re.match(r'[^aeiouy]', previous) or re.match(r'[^aeiouy]', following):
            if previous != "":
                code = re.sub(previous + "h", previous * 2, code, count=1)

    # w rule: a 'w' after a vowel is replaced by that vowel.
    for found in re.findall(r'([aeiouy]{1}?)w', code):
        previous = found[0]
        code = re.sub(previous + "w", previous * 2, code, count=1)

    # Clean up the end of the code: drop a final 's', turn 'ay' into 'y',
    # then drop a final 'a'.
    code = re.sub(r's$', r'', code)
    code = re.sub(r'ay$', r'y', code)
    code = re.sub(r'a$', r'', code)

    return code
492
493
def caverphone(term):
    """Return the ten-character Caverphone 2.0 key for ``term``.

    Developed at the University of Otago, New Zealand, for the Caversham
    Project (http://caversham.otago.ac.nz); developer David Hood,
    contact caversham@otago.ac.nz.  Technical paper:
    http://caversham.otago.ac.nz/files/working/ctp150804.pdf
    Version 2.0 (2004-08-15).

    In the working string ``2`` and ``3`` are temporary placeholders that
    are removed near the end; the key is padded with ``1`` to ten characters.

    :param term: string to encode
    :return: the key as a string; ``""`` when ``term`` is empty
    """
    if len(term) == 0:
        return ""

    # Lower-case and keep only the standard alphabet a-z.
    key = re.sub(r'[^a-z]', '', term.lower())

    # Drop a final e.
    if key.endswith("e"):
        key = key[:-1]

    # Ordered rewrite rules; each is applied to the result of the previous one.
    rules = (
        # cough, rough, tough, enough, trough at the start -> ...ou2f
        (r'^([crt]|(en)|(tr))ough', r'\1ou2f'),
        (r'^gn', r'2n'),            # initial gn
        (r'mb$', r'm2'),            # final mb
        (r'cq', r'2q'),
        (r'c([iey])', r's\1'),      # soft c
        (r'tch', r'2ch'),
        (r'[cqx]', r'k'),
        (r'v', r'f'),
        (r'dg', r'2g'),
        (r'ti([oa])', r'si\1'),
        (r'd', r't'),
        (r'ph', r'fh'),
        (r'b', r'p'),
        (r'sh', r's2'),
        (r'z', r's'),
        (r'^[aeiou]', r'A'),        # initial vowel
        (r'[aeiou]', r'3'),         # every other vowel
        (r'j', r'y'),
        (r'^y3', r'Y3'),
        (r'^y', r'A'),
        (r'y', r'3'),
        (r'3gh3', r'3kh3'),
        (r'gh', r'22'),
        (r'g', r'k'),
        # runs of s,t,p,k,f,m,n -> one upper-case letter
        ("s+", "S"),
        ("t+", "T"),
        ("p+", "P"),
        ("k+", "K"),
        ("f+", "F"),
        ("m+", "M"),
        ("n+", "N"),
        (r'w(h?3)', r'W\1'),        # w3 / wh3
        (r'w$', r'3'),              # final w
        (r'w', r'2'),
        (r'^h', r'A'),              # initial h
        (r'h', r'2'),
        (r'r3', r'R3'),
        (r'r$', r'3'),              # final r
        (r'r', r'2'),
        (r'l3', r'L3'),
        (r'l$', r'3'),              # final l
        (r'l', r'2'),
        (r'2', r''),                # remove all 2's
        (r'3$', r'A'),              # final 3
        (r'3', r''),                # remove all 3's
    )
    for pattern, replacement in rules:
        key = re.sub(pattern, replacement, key)

    # Pad with ten 1's and keep the first ten characters.
    return (key + '1' * 10)[:10]

def soundex(term): View Source

def soundex(term):
    """Return the four-character Soundex code for ``term``.

    The term is upper-cased; its first character is kept verbatim and the
    remaining letters A-Z are mapped to Soundex digits.  Letters that map
    to ``0`` are dropped, runs of the same digit are collapsed, and the
    result is padded with zeros / truncated to exactly four characters.
    Characters outside A-Z are passed through the translation unchanged.

    :param term: string to encode
    :return: the code as a string; ``"0000"`` when ``term`` is empty
    """
    # Digit for each letter A-Z; '0' marks letters that carry no code.
    table = str.maketrans('ABCDEFGHIJKLMNOPQRSTUVWXYZ', '01230120022455012623010202')

    if not term:
        return "0000"  # could be Z000 for compatibility with other implementations

    term = term.upper()
    first_char = term[0]

    # Translate everything after the first character and drop the uncoded letters.
    digits = term[1:].translate(table).replace("0", "")

    # Collapse digits repeated in a row.
    code = first_char
    for digit in digits:
        if digit != code[-1]:
            code += digit

    # Always return exactly four characters.  The previous padding added only
    # len(code) zeros, so a one-character code came back as e.g. "A0".
    return code[:4].ljust(4, "0")

Return the soundex value to a string argument.

def metaphone(text): View Source

def metaphone(text):
    """Return the Metaphone code for ``text``.

    Processing steps:

    1. lower-case the input and drop every character rejected by
       ``re_sub_nonalpha``;
    2. collapse runs of the same letter ("pizza" -> "piza");
    3. replace a special leading digraph (``metaphone_table`` or "kn");
       a term that is just "x" is coded "s";
    4. code each remaining letter from its default in ``metaphone_std_trans``,
       adjusted by the surrounding letters ("th" becomes "0", "ch"/"sh"
       become "x", silent letters become nothing);
    5. remove the vowels a/e/i/o/u from everything after the first code character.

    Implementation of the original algorithm from Lawrence Philips,
    extended/rewritten by M. Kuhn, with improvements by John Machin and
    Marc Ubaldino (module-level regexes, vowel stripping as the last step).

    :param text: string to encode
    :return: lower-case code string; ``''`` when nothing encodable is left
    """
    if not text:
        return EMPTY_STRING

    # Lower-case, then remove every character the algorithm does not handle.
    term = re_sub_nonalpha.sub('', text.lower())
    if len(term) == 0:
        return EMPTY_STRING

    # Conflate repeated letters.
    str2 = term[0]
    for x in term:
        if x != str2[-1]:
            str2 = str2 + x
    term = str2
    term_length = len(term)

    code = ''

    # Special word beginnings.
    if term_length > 1:
        first_chars = term[0:2]
        kn_start = term.startswith('kn')
        if first_chars in metaphone_table or kn_start:
            code = 'n' if kn_start else metaphone_table[first_chars]
            term = term[2:]
            term_length = len(term)
    elif term[0] == "x":
        # Only reached for a one-letter term.
        term = ""
        code = "s"
        term_length = 0

    i = 0
    while i < term_length:
        # part_n_K: K letters starting at i (look-ahead).
        # part_c_2 / part_c_3: previous letter + current (+ next) letter.
        part_n_2 = part_n_3 = part_n_4 = ""
        part_c_2 = part_c_3 = ""

        if i < term_length - 1:
            part_n_2 = term[i:i + 2]
        if i > 0:
            # part_c_2 needs no look-ahead, so it is built for the last letter
            # as well.  It used to be skipped there, which kept the final "h"
            # of "smith" and doubled the "k" of "back".
            part_c_2 = term[i - 1:i + 1]
            if i < term_length - 1:
                part_c_3 = term[i - 1:i + 2]
        if i < term_length - 2:
            part_n_3 = term[i:i + 3]
        if i < term_length - 3:
            part_n_4 = term[i:i + 4]

        ch = term[i]

        if ch == "b":
            add_char = metaphone_std_trans["b"]
            # silent b in a final "mb"
            if i == term_length - 1 and i > 0 and term[i - 1] == "m":
                add_char = ""
        elif ch == "c":
            add_char = metaphone_std_trans["c"]
            if part_n_2 == "ch":
                add_char = "x"
            elif re_CI.search(part_n_2):
                add_char = "s"

            if part_n_3 == "cia":
                add_char = "x"

            # the c of "sci", "sce", "scy" is silent
            if re_SCI.search(part_c_3):
                add_char = ""
        elif ch == "d" or ch == u'ḑ':
            add_char = metaphone_std_trans["d"]
            if re_DG.search(part_n_3):
                add_char = "j"
        elif ch == "g":
            add_char = metaphone_std_trans["g"]

            if part_n_2 == "gh":
                # silent only when "gh" ends the term
                if i == term_length - 2:
                    add_char = ""
            elif re_GHvowel.search(part_n_3):
                # kept from the original; "gh" is already caught by the branch above
                add_char = ""
            elif part_n_2 == "gn":
                add_char = ""
            elif part_n_4 == "gned":
                add_char = ""
            elif re_DG.search(part_c_3):
                # the d of "dge"/"dgy"/"dgi" already produced "j"
                add_char = ""
            elif part_n_2 == "gi":
                if part_c_3 != "ggi":
                    add_char = "j"
            elif part_n_2 == "ge":
                if part_c_3 != "gge":
                    add_char = "j"
            elif part_n_2 == "gy":
                if part_c_3 != "ggy":
                    add_char = "j"
            elif part_n_2 == "gg":
                add_char = ""
        elif ch == "h" or ch == u'ḩ':
            add_char = metaphone_std_trans["h"]
            if re_vowelHvowel.search(part_c_3):
                add_char = ""
            elif re_softH.search(part_c_2):
                # h already accounted for by the preceding c/s/p/t/g
                add_char = ""
        elif ch == "k":
            add_char = metaphone_std_trans["k"]
            if part_c_2 == "ck":
                add_char = ""
        elif ch == "p":
            add_char = metaphone_std_trans["p"]
            if part_n_2 == "ph":
                add_char = "f"
        elif ch == "q":
            add_char = metaphone_std_trans["q"]
        elif ch == "s" or ch == u'ş':
            add_char = metaphone_std_trans["s"]
            if part_n_2 == "sh":
                add_char = "x"

            if re_SIvowel.search(part_n_3):
                add_char = "x"
        elif ch == "t" or ch == u'ţ':
            add_char = metaphone_std_trans["t"]
            if part_n_2 == "th":
                add_char = "0"

            if re_TIvowel.search(part_n_3):
                add_char = "x"
        elif ch == "v":
            add_char = metaphone_std_trans["v"]
        elif ch == "w":
            add_char = metaphone_std_trans["w"]
            # w followed by a consonant is silent
            if re_shadowedW.search(part_n_2):
                add_char = ""
        elif ch == "x":
            add_char = metaphone_std_trans["x"]
        elif ch == "y":
            add_char = metaphone_std_trans["y"]
        elif ch == "z" or ch == u'z̧':
            # NOTE(review): ch is a single character, so the second comparison
            # looks like it can never match if that literal is z + combining
            # cedilla -- confirm.
            add_char = metaphone_std_trans["z"]
        elif ch == u'ñ':
            add_char = metaphone_std_trans['n']
        else:
            # vowels and all remaining letters pass through unchanged
            add_char = ch

        if add_char:
            code = code + add_char
        i += 1

    # Every letter may have been silent (e.g. "wgh").
    if not code:
        return EMPTY_STRING

    # Vowels are stripped only now, after the context rules have seen them
    # ("mirance" must come out as "mrns", not "mrnk").  A leading vowel is kept.
    return code[0] + re_sub_vowels.sub('', code[1:])

returns metaphone code for a given string

def nysiis(term): View Source

def nysiis(term):
    """Return the New York State Identification and Intelligence System (NYSIIS) code for ``term``.

    The rules are applied in this order: one leading-letter replacement,
    one trailing-letter replacement, a fixed series of substitutions, the
    ``h`` and ``w`` rules, and finally clean-up of the last character(s).
    No case folding is done and every pattern is lower-case.  The result is
    not truncated.

    :param term: string to encode
    :return: the code as a string; ``""`` when ``term`` is empty
    """
    if len(term) == 0:
        return ""

    # Replacement for the first characters; only the first matching entry is applied.
    leading = {
        "mac": "mcc",
        "ph": "ff",
        "kn": "nn",
        "pf": "ff",
        "k": "c",
        "sch": "sss"
    }
    for prefix, replacement in leading.items():
        if term.startswith(prefix):
            term = replacement + term[len(prefix):]
            break

    # Replacement for the last characters; only the first matching entry is applied.
    trailing = {
        "ee": "y",
        "ie": "y",
        "dt": "d",
        "rt": "d",
        "rd": "d",
        "nt": "d",
        "nd": "d",
    }
    for suffix, replacement in trailing.items():
        if term.endswith(suffix):
            # Cut exactly the matched suffix.  The cut length used to come from
            # a variable left over from the prefix loop, so e.g. "kent" lost
            # everything but the final "d".
            term = term[:-len(suffix)] + replacement
            break

    code = term

    # Plain substitutions, applied in this order.
    for pattern, replacement in (
            (r'ev', r'af'),
            (r'[aeiouy]', r'a'),
            (r'q', r'g'),
            (r'z', r's'),
            (r'm', r'n'),
            (r'kn', r'n'),
            (r'k', r'c'),
            (r'sch', r'sss'),
            (r'ph', r'ff'),
    ):
        code = re.sub(pattern, replacement, code)

    # h rule: an 'h' next to a non-vowel is replaced by the letter before it.
    # The lazy group after 'h' always captures '', so in practice only the
    # preceding letter decides.
    for previous, following in re.findall(r'([a-z]{0,1}?)h([a-z]{0,1}?)', code):
        if re.match(r'[^aeiouy]', previous) or re.match(r'[^aeiouy]', following):
            if previous != "":
                code = re.sub(previous + "h", previous * 2, code, count=1)

    # w rule: a 'w' after a vowel is replaced by that vowel.
    for found in re.findall(r'([aeiouy]{1}?)w', code):
        previous = found[0]
        code = re.sub(previous + "w", previous * 2, code, count=1)

    # Clean up the end of the code: drop a final 's', turn 'ay' into 'y',
    # then drop a final 'a'.
    code = re.sub(r's$', r'', code)
    code = re.sub(r'ay$', r'y', code)
    code = re.sub(r'a$', r'', code)

    return code

returns New York State Identification and Intelligence Algorithm (NYSIIS) code for the given term

def caverphone(term): View Source

# Ordered Caverphone 2.0 rewrite rules as (compiled pattern, replacement).
# Order matters: later rules consume the placeholder symbols produced by
# earlier ones ('2' = silent consonant slot, '3' = vowel slot; both are
# stripped near the end of the list).
_CAVERPHONE_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # --- special cases anchored at the start / end of the name ---
        # cough, rough, tough, enough, trough -> cou2f, rou2f, ...
        (r'^([crt]|(en)|(tr))ough', r'\1ou2f'),
        (r'^gn', r'2n'),
        (r'mb$', r'm2'),
        # --- consonant normalisation ---
        (r'cq', r'2q'),
        (r'c([iey])', r's\1'),
        (r'tch', r'2ch'),
        (r'[cqx]', r'k'),
        (r'v', r'f'),
        (r'dg', r'2g'),
        (r'ti([oa])', r'si\1'),
        (r'd', r't'),
        (r'ph', r'fh'),
        (r'b', r'p'),
        (r'sh', r's2'),
        (r'z', r's'),
        # --- vowels and y/j ---
        (r'^[aeiou]', r'A'),
        (r'[aeiou]', r'3'),
        (r'j', r'y'),
        (r'^y3', r'Y3'),
        (r'^y', r'A'),
        (r'y', r'3'),
        # --- g / gh ---
        (r'3gh3', r'3kh3'),
        (r'gh', r'22'),
        (r'g', r'k'),
        # --- collapse runs of s,t,p,k,f,m,n into one upper-case letter ---
        (r's+', r'S'),
        (r't+', r'T'),
        (r'p+', r'P'),
        (r'k+', r'K'),
        (r'f+', r'F'),
        (r'm+', r'M'),
        (r'n+', r'N'),
        # --- w ---
        (r'w(h?3)', r'W\1'),
        (r'w$', r'3'),
        (r'w', r'2'),
        # --- h ---
        (r'^h', r'A'),
        (r'h', r'2'),
        # --- r ---
        (r'r3', r'R3'),
        (r'r$', r'3'),
        (r'r', r'2'),
        # --- l ---
        (r'l3', r'L3'),
        (r'l$', r'3'),
        (r'l', r'2'),
        # --- drop placeholders; a trailing vowel slot survives as 'A' ---
        (r'2', r''),
        (r'3$', r'A'),
        (r'3', r''),
    )
)


def caverphone(term):
    """Return the Caverphone 2.0 phonetic key for ``term``.

    The term is lower-cased, every character outside ``a-z`` is dropped,
    a final ``e`` is removed, and the Caverphone 2.0 substitution rules
    are applied in order. The result is padded with ``'1'`` and cut to
    exactly ten characters.

    :param term: the name to encode (a string); may be empty or ``None``.
    :return: a 10-character key such as ``"PTA1111111"``. An empty or
        ``None`` term yields ``""``. A non-empty term that contains no
        ASCII letters yields ``"1111111111"``.
    """

    # Developed at the University of Otago, New Zealand.
    # Project: Caversham Project (http://caversham.otago.ac.nz)
    # Developer: David Hood, University of Otago, New Zealand
    # Contact: caversham@otago.ac.nz
    # Project Technical Paper: http://caversham.otago.ac.nz/files/working/ctp150804.pdf
    # Version 2.0 (2004-08-15)

    # Same emptiness guard as soundex(); unlike a len() check it also
    # accepts None instead of raising TypeError.
    if not term:
        return ""

    # lower-case, then keep only the standard alphabet (a-z)
    code = re.sub(r'[^a-z]', '', term.lower())

    # remove a final e
    if code.endswith("e"):
        code = code[:-1]

    # apply the substitution rules strictly in their defined order
    for pattern, replacement in _CAVERPHONE_RULES:
        code = pattern.sub(replacement, code)

    # pad with ten '1' characters and keep the first ten characters
    return (code + '1' * 10)[:10]

Returns the phonetic key for the given term using the Caverphone 2.0 algorithm.