opensextant.phonetics
Geocoding Phonetics Library
Created: Mar 15, 2012. Author: ubaldino. Copyright: MITRE Corporation, (c) 2010-2012.
Requirements: advas.phonetics library is used here; but a modified version of it is included in this package.
# -*- coding: utf-8 -*-
"""
Geocoding Phonetics Library

:created Created on Mar 15, 2012
:author: ubaldino
:copyright: MITRE Corporation, (c) 2010-2012

Requirements: advas.phonetics library is used here; but a modified version of it is included in this package.
"""

from string import ascii_lowercase, digits

from opensextant.utility import levenshtein_distance
# Metaphone via Advanced Search (advas); Modified by Marc Ubaldino
#
from .advas_phonetics import metaphone

# Very basic consonance matching on word initials.
# Others not yet implemented: 'd' = 't'
KA_CONSONNANCE = {"q", "k", "c"}
PH_CONSONNANCE = {"f", "p"}  # Yeah this is actually PH, not "P".
JA_CONSONNANCE = {"g", "j", "y"}
SA_CONSONNANCE = {"s", "c", "z"}
XA_CONSONNANCE = {"s", "z"}
WA_CONSONNANCE = {"w", "v"}

ARRAY_OF_PHONETICS = [KA_CONSONNANCE, PH_CONSONNANCE,
                      JA_CONSONNANCE, SA_CONSONNANCE,
                      XA_CONSONNANCE, WA_CONSONNANCE]

# Reduce the consonance
#
# Sacco River, ME
# Saco River?
#
# Abbattobad or
# Abbatobod or
# Abatobod ?
#
REDUCE_CONSONNANCE = {"bb", "cc", "dd", "ff", "gg", "kk", "ll", "mm",
                      "nn", "pp", "qq", "rr", "ss", "tt", "vv", "xx", "zz"}


class PhoneticMap:
    """ Convenience class to organize a single Phoneme to a list of names (which have the same phoneme)
    """

    def __init__(self, p):
        # p: the phoneme shared by every name collected in this map.
        self.phoneme = p
        # A set, so adding the same name twice keeps a single entry.
        self.names = set()

    def add(self, name):
        """ Record a name under this phoneme. """
        self.names.add(name)


def phonetic_redux(tok):
    """
    Collapse each doubled consonant listed in REDUCE_CONSONNANCE to a single letter,
    e.g., "sacco" becomes "saco".

    :param tok: token to reduce
    :return: the token with each listed pair replaced by its first letter
    """
    t = tok
    for DUP in REDUCE_CONSONNANCE:
        t = t.replace(DUP, DUP[0])
    return t


def phonetic_code(tok):
    """
    An application of Advas phonetics library
    Metaphone appears to generate a fourth of the matches Caverphone does.
    ... that is Caverphone is looser, and noisier similarity matching.
    CAVEAT: If you change phonetics, you must RE-Pickle

    WINNER: metaphone.

    :param tok: token to encode; None is passed through
    :return: result of metaphone(tok), or None when tok is None
    """
    if tok is None:
        return None
    # Fix input tokens:
    return metaphone(tok)


def match_phonetically(a, b):
    """
    match_phonetically( a, b ) attempts to match
    words by the phonetic similarity of their initials.
    Limitation: F and PH are one intended match, but for now F =? P suffices.

    :return: True if the first characters are equal or share a set in ARRAY_OF_PHONETICS;
        False otherwise, including when either argument is empty or None.
    """

    if not a or not b:
        return False

    a0 = a[0]
    b0 = b[0]
    if a0 == b0:
        return True

    # Phonetic equivalence by their CONSONNANCE: the first set holding both initials wins.
    return any(a0 in phset and b0 in phset for phset in ARRAY_OF_PHONETICS)


# =======================================
# //////////////////////////////////////
# A. Kazura, M. Ubaldino 2012
# phonetic alphabet conversions.
#
# Reference: http://www.osric.com/chris/phonetic.html
#
# Western Union is difficult because it uses city names primarily.
#
# NATO implemented here:
# =======================================
phonetic_alphabet = ["alpha", "bravo", "charlie", "delta", "echo",
                     "foxtrot", "golf", "hotel", "india", "juliet",
                     "kilo", "lima", "mike", "november", "oscar",
                     "papa", "quebec", "romeo", "sierra", "tango",
                     "uniform", "victor", "whiskey", "xray", "yankee", "zulu",
                     ]
phonetic_numbers = ["zero", "one", "two", "three", "four",
                    "five", "six", "seven", "eight", "nine"
                    ]

# Alpha to Word
phonetic_a2w = dict(zip(ascii_lowercase, phonetic_alphabet))
phonetic_a2w.update(dict(zip(digits, phonetic_numbers)))

# Word to Alpha
phonetic_w2a = dict(zip(phonetic_alphabet, ascii_lowercase))
phonetic_w2a.update(dict(zip(phonetic_numbers, digits)))

# additions
phonetic_w2a["x-ray"] = "x"
phonetic_w2a["nought"] = "0"
phonetic_w2a["not"] = "0"


def get_phonetic_phrase(word):
    """ Convert a code word into its expanded phonetic spelling.
    e.g., given "tb" generate "tango bravo".
    Input is assumed lowercase.

    :param word: lower case code word
    :return: space-separated phonetic words, one per input character
    """
    # Get the value for x or return x
    # '#' is not in a2w map, so return it as-is.
    #
    return " ".join(phonetic_a2w.get(x, x) for x in word)


def get_phonetic_initials(phrase):
    """
    Convert a phrase into its acronym.  You would only do this if you knew
    you had a phonetic spelling, e.g., "Tango Bravo" yields "tb".
    The phrase is lowercased first; words that are not in the phonetic
    vocabulary are kept whole, padded with a space on each side.
    """
    words = phrase.lower().split()
    phon = []
    for w in words:
        if w in phonetic_w2a:
            # add first initial keyed to this word.
            # 'a' for Alpha
            # '0' for Zero, Not, or Nought
            phon.append(phonetic_w2a.get(w))
        else:
            phon.append(' ')
            phon.append(w)
            phon.append(' ')

    return "".join(phon)


# length diff, max edit dist, keyed by term length in blocks of 3 characters
# increasing term size in terms less than 10 chars guidelines:
#    length-diff should not be more than 1 char
#    but edit dist should be linear to allow increasing for vowel variations, mainly
# With longer words both consonants and vowels will have an increasing entropy
# Approximately edit dist threshold is 2x length difference threshold.
_phonetic_params_block = 3
_phonetic_params = {
    0: (0, 0),  # 0-2 chars
    1: (0, 1),  # 3-5 chars
    2: (1, 2),  # 6-8 chars
    3: (1, 3),  # etc.
    4: (2, 4),
    5: (3, 5),
    6: (4, 6),
    7: (4, 7),
    8: (5, 8),
    9: (5, 9),
    10: (6, 10),
    11: (6, 11),
    12: (7, 12),
    13: (7, 13),
    14: (8, 14),
    15: (8, 15)  # 45-47 chars.
}


def phonetic_params(termlen):
    """
    get params for a given term length
    :param termlen: term len
    :return: tuple of (max length difference, max edit distance)
    """
    # Floor division is required: "/" is true division on Python 3 and yields
    # float keys (e.g. 4 / 3) that miss the table for most lengths.
    lx = termlen // _phonetic_params_block
    p = _phonetic_params.get(lx)
    if lx > 15 or not p:
        # Lengths beyond the table (48+ chars) share the last bucket.
        return _phonetic_params.get(15)

    return p


def match_filter_phonetically(target, targetlen, test, testlen, max_len_diff, max_edit_dst):
    """ For performance reasons we assume you have lower case versions of target and test
    and lengths for both.

    Does test match target phonetically?  Usage:  Given target, find test in [a, b, c, d...] that match target

    :param target: thing you want to match to.
    :param targetlen: length of target
    :param test: a test.
    :param testlen: length of test
    :param max_len_diff: basic length filter
    :param max_edit_dst: Finally assess edit distance of text
    :return: True only if every filter passes
    """

    # FAIL only if one test fails
    # Otherwise attempt all tests -- that is do not PASS based on one conditional here.

    # Filter just by length alone.
    if abs(testlen - targetlen) > max_len_diff:
        return False

    # Filter because phonetic nature of initial syllable or consonants is not valid
    # 'Ka' or 'Qa' are equivalent, but maybe not 'Cua'
    if not match_phonetically(test, target):
        return False

    # Finally if all other filters pass, then check if edit distance of full string makes sense.
    # NOTE(review): this threshold is exclusive (a distance equal to max_edit_dst fails),
    # unlike the inclusive length filter above -- confirm that is intended.
    editdst = levenshtein_distance(test, target)
    if editdst >= max_edit_dst:
        return False

    return True
class PhoneticMap:
    """ Convenience class to organize a single Phoneme to a list of names (which have the same phoneme)
    """

    def __init__(self, p):
        # p: the phoneme shared by every name collected in this map.
        self.phoneme = p
        # A set, so adding the same name twice keeps a single entry.
        self.names = set()

    def add(self, name):
        """ Record a name under this phoneme. """
        self.names.add(name)
Convenience class to organize a single Phoneme to a list of names (which have the same phoneme)
def phonetic_code(tok):
    """
    An application of Advas phonetics library
    Metaphone appears to generate a fourth of the matches Caverphone does.
    ... that is Caverphone is looser, and noisier similarity matching.
    CAVEAT: If you change phonetics, you must RE-Pickle

    WINNER: metaphone.

    :param tok: token to encode; None is passed through
    :return: result of metaphone(tok), or None when tok is None
    """
    if tok is None:
        return None
    # Fix input tokens:
    return metaphone(tok)
An application of the Advas phonetics library. Metaphone appears to generate a fourth of the matches Caverphone does; that is, Caverphone is a looser and noisier similarity match. CAVEAT: If you change phonetics, you must re-pickle.
WINNER: metaphone.
def match_phonetically(a, b):
    """
    match_phonetically( a, b ) attempts to match
    words by the phonetic similarity of their initials.
    Limitation: F and PH are one intended match, but for now F =? P suffices.

    :param a: first word
    :param b: second word
    :return: True if the first characters are equal or share a set in ARRAY_OF_PHONETICS;
        False otherwise, including when either argument is empty or None.
    """
    if not a or not b:
        return False

    first_a, first_b = a[0], b[0]
    if first_a == first_b:
        return True

    # Phonetic equivalence by their CONSONNANCE: one set holding both initials is enough.
    return any(first_a in group and first_b in group for group in ARRAY_OF_PHONETICS)
match_phonetically( a, b ) attempts to match words by the phonetic similarity of their initials. Limitation: F and PH are one intended match, but for now F =? P suffices.
def get_phonetic_phrase(word):
    """ Convert a code word into its expanded phonetic spelling.
    e.g., given "tb" generate "tango bravo".
    Input is assumed lowercase.

    :param word: lower case code word
    :return: space-separated phonetic words, one per input character
    """
    # Characters missing from the a2w map (e.g. '#') are passed through as-is.
    return " ".join(phonetic_a2w.get(ch, ch) for ch in word)
Convert a code word into its expanded phonetic spelling, e.g., given "tb", generate "tango bravo". Input is assumed to be lowercase.
:param word: lower case code word
def get_phonetic_initials(phrase):
    """
    Convert a phrase into its acronym.  You would only do this if you knew
    you had a phonetic spelling, e.g., "Tango Bravo" yields "tb".

    The phrase is lowercased and split on whitespace first.

    :param phrase: phonetic spelling, words separated by whitespace
    :return: the initials run together; a word that is not in the phonetic
        vocabulary is kept whole, padded with a space on each side
    """
    parts = []
    for token in phrase.lower().split():
        # 'a' for Alpha
        # '0' for Zero, Not, or Nought
        initial = phonetic_w2a.get(token)
        if initial is None:
            parts.append(' ' + token + ' ')
        else:
            parts.append(initial)

    return "".join(parts)
Convert a phonetic phrase into its acronym. You would only do this if you knew you had a phonetic spelling, e.g., "Tango Bravo" becomes "tb" (the phrase is lowercased first).
def phonetic_params(termlen):
    """
    get params for a given term length
    :param termlen: term len
    :return: the entry of _phonetic_params for this length's bucket
    """
    # Floor division is required: "/" is true division on Python 3 and yields
    # float keys (e.g. 4 / 3) that miss the table for most lengths, silently
    # selecting the loosest thresholds.
    lx = termlen // _phonetic_params_block
    p = _phonetic_params.get(lx)
    if lx > 15 or not p:
        # Buckets outside the table share the last entry.
        return _phonetic_params.get(15)

    return p
Get the matching thresholds for a given term length. :param termlen: length of the term in characters :return: a (max length difference, max edit distance) tuple
def match_filter_phonetically(target, targetlen, test, testlen, max_len_diff, max_edit_dst):
    """ For performance reasons we assume you have lower case versions of target and test
    and lengths for both.

    Does test match target phonetically?  Usage:  Given target, find test in [a, b, c, d...] that match target

    :param target: thing you want to match to.
    :param targetlen: length of target
    :param test: a test.
    :param testlen: length of test
    :param max_len_diff: basic length filter
    :param max_edit_dst: Finally assess edit distance of text
    :return: True only if every filter passes
    """

    # FAIL only if one test fails
    # Otherwise attempt all tests -- that is do not PASS based on one conditional here.

    # Filter just by length alone.
    if abs(testlen - targetlen) > max_len_diff:
        return False

    # Filter because phonetic nature of initial syllable or consonants is not valid
    # 'Ka' or 'Qa' are equivalent, but maybe not 'Cua'
    if not match_phonetically(test, target):
        return False

    # Finally if all other filters pass, then check if edit distance of full string makes sense.
    # NOTE(review): this threshold is exclusive (a distance equal to max_edit_dst fails),
    # unlike the inclusive length filter above -- confirm that is intended.
    editdst = levenshtein_distance(test, target)
    if editdst >= max_edit_dst:
        return False

    return True
For performance reasons we assume you have lower case versions of target and test and lengths for both.
Does test match target phonetically? Usage: Given target, find test in [a, b, c, d...] that match target
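:param target: thing you want to match to. :param targetlen: length of target. :param test: a candidate to test against target. :param testlen: length of test. :param max_len_diff: basic length filter; the largest allowed difference between the two lengths. :param max_edit_dst: edit distance threshold, assessed last on the full text.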