opensextant.advas_phonetics
# -*- coding: utf-8 -*-
# ----------------------------------------------------------
# AdvaS Advanced Search
# module for phonetic algorithms
#
# (C) 2002 - 2005 Frank Hofmann, Chemnitz, Germany
# email fh@efho.de
# ----------------------------------------------------------
#
# changed 2005-01-24
# 2012-01-01 MU adapted to support various Unicode transliterations in Metaphone
# 2021-03-25 MU migrated to Xponents here

import re

# Soundex digit for each upper-case ASCII letter; vowels and H, W, Y map to "0".
# Built once at import time instead of on every call.
_SOUNDEX_TABLE = str.maketrans('ABCDEFGHIJKLMNOPQRSTUVWXYZ', '01230120022455012623010202')


def soundex(term):
    """Return the four-character soundex code of a string argument.

    Soundex hashes English strings into an alpha-numerical value that
    represents what the word sounds like.  This variant (Nathan Heagy,
    Front Logic Inc.; changes by Frank Hofmann) keeps the upper-cased first
    character, encodes the remaining letters as digits, drops every "0"
    and then collapses equal neighbours.

    :param term: the string to encode.
    :return: a 4-character code; ``"0000"`` for an empty/None argument.
    """
    if not term:
        return "0000"  # could be Z000 for compatibility with other implementations

    term = term.upper()
    # The first character is kept verbatim; only the remainder is translated.
    digits = term[1:].translate(_SOUNDEX_TABLE).replace("0", "")

    # remove duplicate numbers in-a-row
    code = term[0]
    for digit in digits:
        if digit != code[-1]:
            code += digit

    # Truncate/pad to exactly four characters.  The former padding
    # ("0" * len(code)) produced only two characters for a one-character code.
    return code[:4].ljust(4, "0")


# MCU: optimization -- put constant tables in global space, not functional space.
#
# Word-initial two-letter exceptions.
metaphone_table = {
    "ae": "e",
    "gn": "n",
    # "kn":"n",  -- Generalization for 'known' or 'knowles' => 'nwn' or 'nwls'; But not "Ken" or "Kane"=> "kn"
    "pn": "n",
    "wr": "r",
    "wh": "w"}

# define standard translation table
metaphone_std_trans = {
    "b": "b",
    "c": "k",
    "d": "t",
    "g": "k",
    "h": "h",
    "k": "k",
    "p": "p",
    "n": "n",
    "q": "k",
    "s": "s",
    "t": "t",
    "v": "f",
    "w": "w",
    "x": "ks",
    "y": "y",
    "z": "s"}

EMPTY_STRING = ''

# Allowed alphabet: a-z plus d/h with cedilla (U+1E11, U+1E29), n with tilde
# (U+00F1), s/t with cedilla (U+015F, U+0163) and the combining cedilla
# (U+0327), which is how a z with cedilla is spelled.
re_sub_nonalpha = re.compile('[^a-z\u1e11\u1e29\u00f1\u015f\u0163\u0327]')
re_sub_vowels = re.compile('[aeiou]')
vowels = {'a', 'e', 'i', 'o', 'u'}
re_CI = re.compile('c[iey]')
re_SCI = re.compile('sc[iey]')
re_DG = re.compile('dg[eyi]')
re_GHvowel = re.compile('gh[aeiouy]')
# vowels of various Latin forms (incl. a/i/u with macron), preceding H, where H is followed by a consonant.
re_vowelHvowel = re.compile('[aeiouy\u0101\u012b\u016b][h\u1e29][^aeiouy]')
re_softH = re.compile('[csptg]h')
re_SIvowel = re.compile('si[ao]')
re_TIvowel = re.compile('ti[ao]')
re_shadowedW = re.compile('w[^aeiouy]')


def metaphone(text):
    """Return the metaphone code for a given string.

    Implementation of the original algorithm from Lawrence Philips,
    extended/rewritten by M. Kuhn, with improvements by John Machin and
    Marc Ubaldino.  The input is lower-cased, characters rejected by
    ``re_sub_nonalpha`` are removed and runs of a repeated letter are
    conflated before the per-letter rules run.  Vowels other than a leading
    one are stripped as the very last step ("mirance" -> "mrns").

    :param text: the string to encode.
    :return: the code (lower-case letters and "0" for "th"); ``''`` when the
        input is empty or nothing in it can be encoded.
    """
    if not text:
        return EMPTY_STRING

    # convert to lowercase and remove all unsupported characters
    term = re_sub_nonalpha.sub('', text.lower())
    if not term:
        return EMPTY_STRING

    # conflate repeated letters (pizza -> piza, fayyad -> fayad)
    conflated = [term[0]]
    for letter in term[1:]:
        if letter != conflated[-1]:
            conflated.append(letter)
    term = ''.join(conflated)

    # word-initial exceptions
    pieces = []
    if len(term) > 1:
        first_chars = term[:2]
        if first_chars == 'kn':
            pieces.append('n')
            term = term[2:]
        elif first_chars in metaphone_table:
            pieces.append(metaphone_table[first_chars])
            term = term[2:]
    elif term == "x":
        # the one-letter word "x"
        return "s"

    last = len(term) - 1
    for i, ch in enumerate(term):
        # look-ahead windows starting at the current letter (only when complete)
        next_2 = term[i:i + 2] if i < last else ""
        next_3 = term[i:i + 3] if i < last - 1 else ""
        # Windows starting one letter back.  prev_2 must also exist at the
        # final letter; it used to be empty there, so a trailing "th", "sh",
        # "ch", "ph" or "ck" kept its silent letter ("smith" -> "sm0h").
        prev_2 = term[i - 1:i + 1] if i > 0 else ""
        prev_3 = term[i - 1:i + 2] if 0 < i < last else ""

        if ch == "b":
            add_char = metaphone_std_trans["b"]
            if i == last and prev_2 == "mb":
                add_char = ""  # silent b in a final "mb"
        elif ch == "c":
            add_char = metaphone_std_trans["c"]
            if next_2 == "ch":
                add_char = "x"
            elif re_CI.search(next_2):
                add_char = "s"
            if next_3 == "cia":
                add_char = "x"
            if re_SCI.search(prev_3):
                add_char = ""
        elif ch == "d" or ch == "\u1e11":
            add_char = metaphone_std_trans["d"]
            if re_DG.search(next_3):
                add_char = "j"
        elif ch == "g":
            add_char = metaphone_std_trans["g"]
            # The former "gh + vowel", "gned" and "gg" branches could never
            # run (shadowed by the "gh"/"gn" tests, or ruled out by the
            # conflation of repeated letters) and are omitted here.
            if next_2 == "gh":
                if i == last - 1:
                    add_char = ""
            elif next_2 == "gn":
                add_char = ""
            elif re_DG.search(prev_3):
                add_char = ""
            elif next_2 in ("gi", "ge", "gy"):
                add_char = "j"
        elif ch == "h" or ch == "\u1e29":
            add_char = metaphone_std_trans["h"]
            if re_vowelHvowel.search(prev_3):
                add_char = ""
            elif re_softH.search(prev_2):
                add_char = ""
        elif ch == "k":
            add_char = metaphone_std_trans["k"]
            if prev_2 == "ck":
                add_char = ""
        elif ch == "p":
            add_char = metaphone_std_trans["p"]
            if next_2 == "ph":
                add_char = "f"
        elif ch == "q":
            add_char = metaphone_std_trans["q"]
        elif ch == "s" or ch == "\u015f":
            add_char = metaphone_std_trans["s"]
            if next_2 == "sh":
                add_char = "x"
            if re_SIvowel.search(next_3):
                add_char = "x"
        elif ch == "t" or ch == "\u0163":
            add_char = metaphone_std_trans["t"]
            if next_2 == "th":
                add_char = "0"
            if re_TIvowel.search(next_3):
                add_char = "x"
        elif ch == "v":
            add_char = metaphone_std_trans["v"]
        elif ch == "w":
            add_char = metaphone_std_trans["w"]
            if re_shadowedW.search(next_2):
                add_char = ""
        elif ch == "x":
            add_char = metaphone_std_trans["x"]
        elif ch == "y":
            add_char = metaphone_std_trans["y"]
        elif ch == "z":
            add_char = metaphone_std_trans["z"]
        elif ch == "\u00f1":
            add_char = metaphone_std_trans['n']
        elif ch == "\u0327":
            # Combining cedilla: the base letter has already been encoded.
            # It was previously copied into the code verbatim.
            add_char = ""
        else:
            # vowels and any other letter are copied; vowels are stripped below
            add_char = ch

        if add_char:
            pieces.append(add_char)

    code = ''.join(pieces)
    if not code:
        # every letter was silent (e.g. "gh")
        return EMPTY_STRING

    # remove any vowels unless a vowel is the first letter
    return code[0] + re_sub_vowels.sub('', code[1:])


_NYSIIS_VOWELS = frozenset("aeiouy")


def _nysiis_replace_h(match):
    """Return the replacement for one ``<letter>h`` match of the NYSIIS H rule."""
    previous = match.group(1)
    end = match.end()
    following = match.string[end:end + 1]
    next_is_consonant = "a" <= following <= "z" and following not in _NYSIIS_VOWELS
    if previous not in _NYSIIS_VOWELS or next_is_consonant:
        return previous * 2
    return match.group(0)


def nysiis(term):
    """Return the New York State Identification and Intelligence Algorithm (NYSIIS) code for the given term.

    The term is lower-cased, one prefix and one suffix replacement are
    applied, then a fixed sequence of letter substitutions.  The code is not
    truncated.

    :param term: the string to encode.
    :return: the lower-case code; ``""`` for an empty argument.
    """
    if not term:
        return ""

    # All rules below are written for lower-case letters.
    term = term.lower()

    # translation table for the first characters (first match wins)
    prefixes = (("mac", "mcc"), ("ph", "ff"), ("kn", "nn"),
                ("pf", "ff"), ("k", "c"), ("sch", "sss"))
    for prefix, replacement in prefixes:
        if term.startswith(prefix):
            term = replacement + term[len(prefix):]
            break

    # translation table for the last characters (first match wins)
    suffixes = (("ee", "y"), ("ie", "y"), ("dt", "d"), ("rt", "d"),
                ("rd", "d"), ("nt", "d"), ("nd", "d"))
    for suffix, replacement in suffixes:
        if term.endswith(suffix):
            # Cut exactly the matched suffix.  The cut length used to come
            # from a variable left over from the prefix loop, so "kent"
            # collapsed to "d".
            term = term[:-len(suffix)] + replacement
            break

    code = term.replace("ev", "af")
    # NOTE(review): "y" is folded into "a" here although the rule is described
    # as a,e,i,o,u -> a; as a consequence the "ay$" rule below cannot match
    # lower-case ASCII input.  Left unchanged -- confirm the intent.
    code = re.sub(r'[aeiouy]', 'a', code)
    for old, new in (("q", "g"), ("z", "s"), ("m", "n"), ("kn", "n"),
                     ("k", "c"), ("sch", "sss"), ("ph", "ff")):
        code = code.replace(old, new)

    # h -> previous letter, if the previous or the next letter is a nonvowel.
    # Each "h" is judged in place; the old lazy pattern never captured the
    # next letter and always rewrote the first textual occurrence.
    code = re.sub(r'([a-z])h', _nysiis_replace_h, code)

    # w -> previous letter, if the previous letter is a vowel
    code = re.sub(r'([aeiouy])w', r'\1\1', code)

    # check last character: -s removed, -ay -> -y, -a removed
    code = re.sub(r's$', '', code)
    code = re.sub(r'ay$', 'y', code)
    code = re.sub(r'a$', '', code)

    return code


# Ordered Caverphone 2.0 substitutions.  "2" and "3" are temporary markers
# that are removed by the final rules.
_CAVERPHONE_RULES = tuple((re.compile(pattern), replacement) for pattern, replacement in (
    (r'^([crt]|(en)|(tr))ough', r'\1ou2f'),
    (r'^gn', r'2n'),
    (r'mb$', r'm2'),
    (r'cq', r'2q'),
    (r'c([iey])', r's\1'),
    (r'tch', r'2ch'),
    (r'[cqx]', r'k'),
    (r'v', r'f'),
    (r'dg', r'2g'),
    (r'ti([oa])', r'si\1'),
    (r'd', r't'),
    (r'ph', r'fh'),
    (r'b', r'p'),
    (r'sh', r's2'),
    (r'z', r's'),
    (r'^[aeiou]', r'A'),
    (r'[aeiou]', r'3'),
    (r'j', r'y'),
    (r'^y3', r'Y3'),
    (r'^y', r'A'),
    (r'y', r'3'),
    (r'3gh3', r'3kh3'),
    (r'gh', r'22'),
    (r'g', r'k'),
    # groups of s,t,p,k,f,m,n -> single upper-case letter
    (r's+', r'S'),
    (r't+', r'T'),
    (r'p+', r'P'),
    (r'k+', r'K'),
    (r'f+', r'F'),
    (r'm+', r'M'),
    (r'n+', r'N'),
    (r'w(h?3)', r'W\1'),
    (r'w$', r'3'),
    (r'w', r'2'),
    (r'^h', r'A'),
    (r'h', r'2'),
    (r'r3', r'R3'),
    (r'r$', r'3'),
    (r'r', r'2'),
    (r'l3', r'L3'),
    (r'l$', r'3'),
    (r'l', r'2'),
    (r'2', r''),
    (r'3$', r'A'),
    (r'3', r''),
))


def caverphone(term):
    """Return the language key using the caverphone algorithm 2.0.

    Developed at the University of Otago, New Zealand (Caversham Project,
    David Hood), version 2.0 (2004-08-15).

    :param term: the string to encode.
    :return: a 10-character key padded with "1"; ``""`` for an empty argument.
    """
    if len(term) == 0:
        return ""

    # lower-case and keep only the standard alphabet (a-z)
    code = re.sub(r'[^a-z]', '', term.lower())

    # remove final e
    if code.endswith("e"):
        code = code[:-1]

    for pattern, replacement in _CAVERPHONE_RULES:
        code = pattern.sub(replacement, code)

    # extend the code by 10 '1' (one) and take the first 10 characters
    return (code + '1' * 10)[:10]
def
soundex(term):
# Soundex digit for each upper-case ASCII letter; vowels and H, W, Y map to "0".
# Built once at import time instead of on every call.
_SOUNDEX_TABLE = str.maketrans('ABCDEFGHIJKLMNOPQRSTUVWXYZ', '01230120022455012623010202')


def soundex(term):
    """Return the four-character soundex code of a string argument.

    Soundex hashes English strings into an alpha-numerical value that
    represents what the word sounds like.  This variant (Nathan Heagy,
    Front Logic Inc.; changes by Frank Hofmann) keeps the upper-cased first
    character, encodes the remaining letters as digits, drops every "0"
    and then collapses equal neighbours.

    :param term: the string to encode.
    :return: a 4-character code; ``"0000"`` for an empty/None argument.
    """
    if not term:
        return "0000"  # could be Z000 for compatibility with other implementations

    term = term.upper()
    # The first character is kept verbatim; only the remainder is translated.
    digits = term[1:].translate(_SOUNDEX_TABLE).replace("0", "")

    # remove duplicate numbers in-a-row
    code = term[0]
    for digit in digits:
        if digit != code[-1]:
            code += digit

    # Truncate/pad to exactly four characters.  The former padding
    # ("0" * len(code)) produced only two characters for a one-character code.
    return code[:4].ljust(4, "0")
Return the soundex value of a string argument.
def
metaphone(text):
def metaphone(text):
    """Return the metaphone code for a given string.

    Implementation of the original algorithm from Lawrence Philips,
    extended/rewritten by M. Kuhn, with improvements by John Machin and
    Marc Ubaldino.  The input is lower-cased, characters rejected by
    ``re_sub_nonalpha`` are removed and runs of a repeated letter are
    conflated before the per-letter rules run.  Vowels other than a leading
    one are stripped as the very last step ("mirance" -> "mrns").

    :param text: the string to encode.
    :return: the code (lower-case letters and "0" for "th"); ``''`` when the
        input is empty or nothing in it can be encoded.
    """
    if not text:
        return EMPTY_STRING

    # convert to lowercase and remove all unsupported characters
    term = re_sub_nonalpha.sub('', text.lower())
    if not term:
        return EMPTY_STRING

    # conflate repeated letters (pizza -> piza, fayyad -> fayad)
    conflated = [term[0]]
    for letter in term[1:]:
        if letter != conflated[-1]:
            conflated.append(letter)
    term = ''.join(conflated)

    # word-initial exceptions
    pieces = []
    if len(term) > 1:
        first_chars = term[:2]
        if first_chars == 'kn':
            pieces.append('n')
            term = term[2:]
        elif first_chars in metaphone_table:
            pieces.append(metaphone_table[first_chars])
            term = term[2:]
    elif term == "x":
        # the one-letter word "x"
        return "s"

    last = len(term) - 1
    for i, ch in enumerate(term):
        # look-ahead windows starting at the current letter (only when complete)
        next_2 = term[i:i + 2] if i < last else ""
        next_3 = term[i:i + 3] if i < last - 1 else ""
        # Windows starting one letter back.  prev_2 must also exist at the
        # final letter; it used to be empty there, so a trailing "th", "sh",
        # "ch", "ph" or "ck" kept its silent letter ("smith" -> "sm0h").
        prev_2 = term[i - 1:i + 1] if i > 0 else ""
        prev_3 = term[i - 1:i + 2] if 0 < i < last else ""

        if ch == "b":
            add_char = metaphone_std_trans["b"]
            if i == last and prev_2 == "mb":
                add_char = ""  # silent b in a final "mb"
        elif ch == "c":
            add_char = metaphone_std_trans["c"]
            if next_2 == "ch":
                add_char = "x"
            elif re_CI.search(next_2):
                add_char = "s"
            if next_3 == "cia":
                add_char = "x"
            if re_SCI.search(prev_3):
                add_char = ""
        elif ch == "d" or ch == "\u1e11":
            add_char = metaphone_std_trans["d"]
            if re_DG.search(next_3):
                add_char = "j"
        elif ch == "g":
            add_char = metaphone_std_trans["g"]
            # The former "gh + vowel", "gned" and "gg" branches could never
            # run (shadowed by the "gh"/"gn" tests, or ruled out by the
            # conflation of repeated letters) and are omitted here.
            if next_2 == "gh":
                if i == last - 1:
                    add_char = ""
            elif next_2 == "gn":
                add_char = ""
            elif re_DG.search(prev_3):
                add_char = ""
            elif next_2 in ("gi", "ge", "gy"):
                add_char = "j"
        elif ch == "h" or ch == "\u1e29":
            add_char = metaphone_std_trans["h"]
            if re_vowelHvowel.search(prev_3):
                add_char = ""
            elif re_softH.search(prev_2):
                add_char = ""
        elif ch == "k":
            add_char = metaphone_std_trans["k"]
            if prev_2 == "ck":
                add_char = ""
        elif ch == "p":
            add_char = metaphone_std_trans["p"]
            if next_2 == "ph":
                add_char = "f"
        elif ch == "q":
            add_char = metaphone_std_trans["q"]
        elif ch == "s" or ch == "\u015f":
            add_char = metaphone_std_trans["s"]
            if next_2 == "sh":
                add_char = "x"
            if re_SIvowel.search(next_3):
                add_char = "x"
        elif ch == "t" or ch == "\u0163":
            add_char = metaphone_std_trans["t"]
            if next_2 == "th":
                add_char = "0"
            if re_TIvowel.search(next_3):
                add_char = "x"
        elif ch == "v":
            add_char = metaphone_std_trans["v"]
        elif ch == "w":
            add_char = metaphone_std_trans["w"]
            if re_shadowedW.search(next_2):
                add_char = ""
        elif ch == "x":
            add_char = metaphone_std_trans["x"]
        elif ch == "y":
            add_char = metaphone_std_trans["y"]
        elif ch == "z":
            add_char = metaphone_std_trans["z"]
        elif ch == "\u00f1":
            add_char = metaphone_std_trans['n']
        elif ch == "\u0327":
            # Combining cedilla: the base letter has already been encoded.
            # It was previously copied into the code verbatim.
            add_char = ""
        else:
            # vowels and any other letter are copied; vowels are stripped below
            add_char = ch

        if add_char:
            pieces.append(add_char)

    code = ''.join(pieces)
    if not code:
        # every letter was silent (e.g. "gh")
        return EMPTY_STRING

    # remove any vowels unless a vowel is the first letter
    return code[0] + re_sub_vowels.sub('', code[1:])
returns metaphone code for a given string
def
nysiis(term):
_NYSIIS_VOWELS = frozenset("aeiouy")


def _nysiis_replace_h(match):
    """Return the replacement for one ``<letter>h`` match of the NYSIIS H rule."""
    previous = match.group(1)
    end = match.end()
    following = match.string[end:end + 1]
    next_is_consonant = "a" <= following <= "z" and following not in _NYSIIS_VOWELS
    if previous not in _NYSIIS_VOWELS or next_is_consonant:
        return previous * 2
    return match.group(0)


def nysiis(term):
    """Return the New York State Identification and Intelligence Algorithm (NYSIIS) code for the given term.

    The term is lower-cased, one prefix and one suffix replacement are
    applied, then a fixed sequence of letter substitutions.  The code is not
    truncated.

    :param term: the string to encode.
    :return: the lower-case code; ``""`` for an empty argument.
    """
    if not term:
        return ""

    # All rules below are written for lower-case letters.
    term = term.lower()

    # translation table for the first characters (first match wins)
    prefixes = (("mac", "mcc"), ("ph", "ff"), ("kn", "nn"),
                ("pf", "ff"), ("k", "c"), ("sch", "sss"))
    for prefix, replacement in prefixes:
        if term.startswith(prefix):
            term = replacement + term[len(prefix):]
            break

    # translation table for the last characters (first match wins)
    suffixes = (("ee", "y"), ("ie", "y"), ("dt", "d"), ("rt", "d"),
                ("rd", "d"), ("nt", "d"), ("nd", "d"))
    for suffix, replacement in suffixes:
        if term.endswith(suffix):
            # Cut exactly the matched suffix.  The cut length used to come
            # from a variable left over from the prefix loop, so "kent"
            # collapsed to "d".
            term = term[:-len(suffix)] + replacement
            break

    code = term.replace("ev", "af")
    # NOTE(review): "y" is folded into "a" here although the rule is described
    # as a,e,i,o,u -> a; as a consequence the "ay$" rule below cannot match
    # lower-case ASCII input.  Left unchanged -- confirm the intent.
    code = re.sub(r'[aeiouy]', 'a', code)
    for old, new in (("q", "g"), ("z", "s"), ("m", "n"), ("kn", "n"),
                     ("k", "c"), ("sch", "sss"), ("ph", "ff")):
        code = code.replace(old, new)

    # h -> previous letter, if the previous or the next letter is a nonvowel.
    # Each "h" is judged in place; the old lazy pattern never captured the
    # next letter and always rewrote the first textual occurrence.
    code = re.sub(r'([a-z])h', _nysiis_replace_h, code)

    # w -> previous letter, if the previous letter is a vowel
    code = re.sub(r'([aeiouy])w', r'\1\1', code)

    # check last character: -s removed, -ay -> -y, -a removed
    code = re.sub(r's$', '', code)
    code = re.sub(r'ay$', 'y', code)
    code = re.sub(r'a$', '', code)

    return code
returns New York State Identification and Intelligence Algorithm (NYSIIS) code for the given term
def
caverphone(term):
# Ordered Caverphone 2.0 substitutions.  "2" and "3" are temporary markers
# that are removed by the final rules.
_CAVERPHONE_RULES = tuple((re.compile(pattern), replacement) for pattern, replacement in (
    (r'^([crt]|(en)|(tr))ough', r'\1ou2f'),
    (r'^gn', r'2n'),
    (r'mb$', r'm2'),
    (r'cq', r'2q'),
    (r'c([iey])', r's\1'),
    (r'tch', r'2ch'),
    (r'[cqx]', r'k'),
    (r'v', r'f'),
    (r'dg', r'2g'),
    (r'ti([oa])', r'si\1'),
    (r'd', r't'),
    (r'ph', r'fh'),
    (r'b', r'p'),
    (r'sh', r's2'),
    (r'z', r's'),
    (r'^[aeiou]', r'A'),
    (r'[aeiou]', r'3'),
    (r'j', r'y'),
    (r'^y3', r'Y3'),
    (r'^y', r'A'),
    (r'y', r'3'),
    (r'3gh3', r'3kh3'),
    (r'gh', r'22'),
    (r'g', r'k'),
    # groups of s,t,p,k,f,m,n -> single upper-case letter
    (r's+', r'S'),
    (r't+', r'T'),
    (r'p+', r'P'),
    (r'k+', r'K'),
    (r'f+', r'F'),
    (r'm+', r'M'),
    (r'n+', r'N'),
    (r'w(h?3)', r'W\1'),
    (r'w$', r'3'),
    (r'w', r'2'),
    (r'^h', r'A'),
    (r'h', r'2'),
    (r'r3', r'R3'),
    (r'r$', r'3'),
    (r'r', r'2'),
    (r'l3', r'L3'),
    (r'l$', r'3'),
    (r'l', r'2'),
    (r'2', r''),
    (r'3$', r'A'),
    (r'3', r''),
))


def caverphone(term):
    """Return the language key using the caverphone algorithm 2.0.

    Developed at the University of Otago, New Zealand (Caversham Project,
    David Hood), version 2.0 (2004-08-15).

    :param term: the string to encode.
    :return: a 10-character key padded with "1"; ``""`` for an empty argument.
    """
    if len(term) == 0:
        return ""

    # lower-case and keep only the standard alphabet (a-z)
    code = re.sub(r'[^a-z]', '', term.lower())

    # remove final e
    if code.endswith("e"):
        code = code[:-1]

    for pattern, replacement in _CAVERPHONE_RULES:
        code = pattern.sub(replacement, code)

    # extend the code by 10 '1' (one) and take the first 10 characters
    return (code + '1' * 10)[:10]
returns the language key using the caverphone algorithm 2.0