opensextant.phonetics

Geocoding Phonetics Library

:created: Mar 15, 2012 :author: ubaldino :copyright: MITRE Corporation, (c) 2010-2012

Requirements: the advas.phonetics library is used here, but a modified version of it is included in this package.

  1# -*- coding: utf-8 -*-
  2"""
  3Geocoding Phonetics Library
  4
  5:created Created on Mar 15, 2012
  6:author: ubaldino
  7:copyright:  MITRE Corporation, (c) 2010-2012
  8
  9Requirements: advas.phonetics library is used here; but a modified version of it is included in this package.
 10"""
 11
 12from string import ascii_lowercase, digits
 13
 14from opensextant.utility import levenshtein_distance
 15# Metaphone via Advanced Search (advas);  Modified by Marc Ubaldino
 16#
 17from .advas_phonetics import metaphone
 18
 19# Very basic consonnance matching on word intials.
 20# Others not yet implemented:  'd' = 't'
 21KA_CONSONNANCE = {"q", "k", "c"}
 22PH_CONSONNANCE = {"f", "p"}  # Yeah this is actually PH, not "P".
 23JA_CONSONNANCE = {"g", "j", "y"}
 24SA_CONSONNANCE = {"s", "c", "z"}
 25XA_CONSONNANCE = {"s", "z"}
 26WA_CONSONNANCE = {"w", "v"}
 27
 28ARRAY_OF_PHONETICS = [KA_CONSONNANCE, PH_CONSONNANCE,
 29                      JA_CONSONNANCE, SA_CONSONNANCE,
 30                      XA_CONSONNANCE, WA_CONSONNANCE]
 31
 32# Reduce the consanance 
 33# 
 34#   Sacco River, ME 
 35#   Saco River?
 36# 
 37#   Abbattobad or
 38#   Abbatobod  or
 39#   Abatobod  ?
 40#
 41REDUCE_CONSONNANCE = {"bb", "cc", "dd", "ff", "gg", "kk", "ll", "mm",
 42                      "nn", "pp", "qq", "rr", "ss", "tt", "vv", "xx", "zz"}
 43
 44
 45class PhoneticMap:
 46    """ Convenience class to organize a single Phoneme to a list of names (which have the same phoneme)
 47    """
 48
 49    def __init__(self, p):
 50        self.phoneme = p
 51        self.names = set([])
 52
 53    def add(self, name):
 54        self.names.add(name)
 55
 56
 57def phonetic_redux(tok):
 58    t = tok
 59    for DUP in REDUCE_CONSONNANCE:
 60        t = t.replace(DUP, DUP[0])
 61    return t
 62
 63
 64def phonetic_code(tok):
 65    """
 66    An application of Advas phonetics library
 67    Metaphone appears to generate a fourth of the matches Caverphone does.
 68    ... that is Caverphone is looser, and noisier similarity matching.
 69    CAVEAT:  If you change phonetics, you must RE-Pickle
 70    
 71    WINNER: metaphone.
 72    """
 73    if tok is None:
 74        return None
 75    # Fix input tokens:
 76    return metaphone(tok)
 77
 78
 79def match_phonetically(a, b):
 80    """
 81    match_phonetically( a, b ) attempts to match 
 82    words by the phonetic similarity of their initials.
 83    Limitation:  F and PH are one intended match, but for now F =? P suffices. 
 84    """
 85
 86    if not a or not b:
 87        return False
 88
 89    a0 = a[0]
 90    b0 = b[0]
 91    if a0 == b0:
 92        return True
 93
 94    for phset in ARRAY_OF_PHONETICS:
 95        if (a0 in phset) and (b0 in phset):
 96            #  AH,.. Phonetic equivalence by their CONSONNANCE
 97            # Find the first match and return True 
 98            return True
 99
100    # No phonetic match here.
101    return False
102
103
# =======================================
# //////////////////////////////////////
#   A. Kazura, M. Ubaldino 2012
#   Phonetic alphabet conversions.
#
#   Reference:   http://www.osric.com/chris/phonetic.html
#
#   The Western Union alphabet is difficult because it uses city names primarily.
#
#   The NATO alphabet is implemented here:
# =======================================
phonetic_alphabet = [
    "alpha", "bravo", "charlie", "delta", "echo",
    "foxtrot", "golf", "hotel", "india", "juliet",
    "kilo", "lima", "mike", "november", "oscar",
    "papa", "quebec", "romeo", "sierra", "tango",
    "uniform", "victor", "whiskey", "xray", "yankee", "zulu",
]
phonetic_numbers = [
    "zero", "one", "two", "three", "four",
    "five", "six", "seven", "eight", "nine",
]

# Character (letter or digit) to code word
phonetic_a2w = {
    char: code_word
    for char, code_word in zip(ascii_lowercase + digits, phonetic_alphabet + phonetic_numbers)
}

# Code word to character (letter or digit)
phonetic_w2a = {code_word: char for char, code_word in phonetic_a2w.items()}

# Alternate spellings accepted when converting words back to characters
phonetic_w2a.update({"x-ray": "x", "nought": "0", "not": "0"})
137
138
def get_phonetic_phrase(word):
    """
    Expand a code word into its phonetic spelling, one code word per character,
    e.g., "tb" becomes "tango bravo".  Input is assumed to be lower case.

    :param word: lower case code word
    :return: the expanded words joined by single spaces
    """
    # Characters with no entry in phonetic_a2w (e.g. '#') are passed through as-is.
    return " ".join(phonetic_a2w.get(ch, ch) for ch in word)
151
152
def get_phonetic_initials(phrase):
    """
    Collapse a phonetic spelling into its acronym, e.g., Tango Bravo = tb.
    You would only do this if you knew you had a phonetic spelling.

    The phrase is lower-cased and split on whitespace.  Words found in
    phonetic_w2a become their mapped character ('a' for Alpha; '0' for Zero,
    Not, or Nought); any other word is kept verbatim with one space on each side.

    :param phrase: phonetic spelling
    :return: the concatenated result
    """
    return "".join(
        phonetic_w2a.get(w) if w in phonetic_w2a else " " + w + " "
        for w in phrase.lower().split()
    )
172
173
174# length diff, max edit dist, by 5's
175# increasing term size in terms less than 10 chars guidelines:
176#    length-diff should not be more than 1 char
177#    but edit dist should be linear to allow increasing for vowel variations, mainly
178# With longer words both consonants and vowels will have an increasing entropy
179# Approximately edit dist threshold is 2x length difference threshold. 
180_phonetic_params_block = 3
181_phonetic_params = {
182    0: (0, 0),  # 1-3 chars
183    1: (0, 1),  # 4-6 chars
184    2: (1, 2),  # 10-15 chars
185    3: (1, 3),  # etc.
186    4: (2, 4),
187    5: (3, 5),
188    6: (4, 6),
189    7: (4, 7),
190    8: (5, 8),
191    9: (5, 9),
192    10: (6, 10),
193    11: (6, 11),
194    12: (7, 12),
195    13: (7, 13),
196    14: (8, 14),
197    15: (8, 15)  # 45-47 chars.
198}
199
200
def phonetic_params(termlen):
    """
    Get the matching thresholds for a given term length.

    The length is reduced to a bucket index by floor division with
    ``_phonetic_params_block``.  Floor division matters: true division yields
    a float such as 4 / 3 that is not a key of ``_phonetic_params``, which
    would send every length that is not an exact multiple of the block size
    to the last (loosest) entry.

    :param termlen: term length in characters
    :return: the ``_phonetic_params`` entry for the term's bucket; a length
             whose bucket is not in the table gets the entry for bucket 15.
    """
    bucket = termlen // _phonetic_params_block
    # Past the end of the table, reuse the thresholds of the last bucket.
    return _phonetic_params.get(bucket, _phonetic_params[15])
214
215
def match_filter_phonetically(target, targetlen, test, testlen, max_len_diff, max_edit_dst):
    """
    Does ``test`` match ``target`` phonetically?
    Usage: given target, find each test in [a, b, c, d...] that matches target.

    For performance reasons we assume you already have lower case versions of
    target and test, and the lengths of both.

    The filters run in order and the first failure rejects the candidate;
    no single filter is enough to accept it.

    :param target:        thing you want to match to.
    :param targetlen:     length of target
    :param test:          a candidate to test.
    :param testlen:       length of test
    :param max_len_diff:  basic length filter; a larger difference in length is rejected
    :param max_edit_dst:  edit distance limit; a distance equal to or above it is rejected
    :return: True only if every filter passes
    """
    # 1. Length alone.
    if abs(testlen - targetlen) > max_len_diff:
        return False

    # 2. The initial letters must be phonetically compatible:
    #    'Ka' or 'Qa' are equivalent, but maybe not 'Cua'.
    if not match_phonetically(test, target):
        return False

    # 3. Edit distance over the full strings.
    # NOTE(review): the limit is exclusive here (>= rejects) while the length
    # limit above is inclusive -- confirm this asymmetry is intended.
    return levenshtein_distance(test, target) < max_edit_dst
class PhoneticMap:
class PhoneticMap:
    """Group the names that share a single phoneme.

    ``phoneme`` stores the value passed to the constructor and ``names`` is
    the set of distinct names added so far.
    """

    def __init__(self, p):
        self.names = set()
        self.phoneme = p

    def add(self, name):
        """Add ``name`` to this phoneme's set; repeated names are stored once."""
        self.names.add(name)

Convenience class to organize a single Phoneme to a list of names (which have the same phoneme)

def phonetic_code(tok):
def phonetic_code(tok):
    """
    Compute the phonetic code of a token using the Advas metaphone routine.

    Metaphone appears to generate about a fourth of the matches Caverphone
    does; that is, Caverphone is the looser and noisier similarity measure,
    so metaphone was chosen.

    CAVEAT: If you change phonetics, you must RE-Pickle.

    :param tok: token to encode, or None
    :return: None when ``tok`` is None, otherwise the result of ``metaphone(tok)``
    """
    return None if tok is None else metaphone(tok)

An application of Advas phonetics library Metaphone appears to generate a fourth of the matches Caverphone does. ... that is Caverphone is looser, and noisier similarity matching. CAVEAT: If you change phonetics, you must RE-Pickle

WINNER: metaphone.

def match_phonetically(a, b):
 80def match_phonetically(a, b):
 81    """
 82    match_phonetically( a, b ) attempts to match 
 83    words by the phonetic similarity of their initials.
 84    Limitation:  F and PH are one intended match, but for now F =? P suffices. 
 85    """
 86
 87    if not a or not b:
 88        return False
 89
 90    a0 = a[0]
 91    b0 = b[0]
 92    if a0 == b0:
 93        return True
 94
 95    for phset in ARRAY_OF_PHONETICS:
 96        if (a0 in phset) and (b0 in phset):
 97            #  AH,.. Phonetic equivalence by their CONSONNANCE
 98            # Find the first match and return True 
 99            return True
100
101    # No phonetic match here.
102    return False

match_phonetically(a, b) attempts to match words by the phonetic similarity of their initial letters. Limitation: F and PH are the intended match, but for now treating F and P as equivalent suffices.

def get_phonetic_phrase(word):
def get_phonetic_phrase(word):
    """
    Expand a code word into its phonetic spelling, one code word per character,
    e.g., "tb" becomes "tango bravo".  Input is assumed to be lower case.

    :param word: lower case code word
    :return: the expanded words joined by single spaces
    """
    # Characters with no entry in phonetic_a2w (e.g. '#') are passed through as-is.
    return " ".join(phonetic_a2w.get(ch, ch) for ch in word)

Convert a code word into its expanded phonetic spelling. e.g., given TB generate tango bravo input is assumed lowercase.

:param word: lower case code word

def get_phonetic_initials(phrase):
def get_phonetic_initials(phrase):
    """
    Collapse a phonetic spelling into its acronym, e.g., Tango Bravo = tb.
    You would only do this if you knew you had a phonetic spelling.

    The phrase is lower-cased and split on whitespace.  Words found in
    phonetic_w2a become their mapped character ('a' for Alpha; '0' for Zero,
    Not, or Nought); any other word is kept verbatim with one space on each side.

    :param phrase: phonetic spelling
    :return: the concatenated result
    """
    return "".join(
        phonetic_w2a.get(w) if w in phonetic_w2a else " " + w + " "
        for w in phrase.lower().split()
    )

Convert a word into its acronym. You would only do this if you knew you had a phonetic spelling, e.g., Tango Bravo = TB

def phonetic_params(termlen):
def phonetic_params(termlen):
    """
    Get the matching thresholds for a given term length.

    The length is reduced to a bucket index by floor division with
    ``_phonetic_params_block``.  Floor division matters: true division yields
    a float such as 4 / 3 that is not a key of ``_phonetic_params``, which
    would send every length that is not an exact multiple of the block size
    to the last (loosest) entry.

    :param termlen: term length in characters
    :return: the ``_phonetic_params`` entry for the term's bucket; a length
             whose bucket is not in the table gets the entry for bucket 15.
    """
    bucket = termlen // _phonetic_params_block
    # Past the end of the table, reuse the thresholds of the last bucket.
    return _phonetic_params.get(bucket, _phonetic_params[15])

Get the parameters for a given term length. :param termlen: length of the term :return: the parameter pair looked up for that length

def match_filter_phonetically(target, targetlen, test, testlen, max_len_diff, max_edit_dst):
def match_filter_phonetically(target, targetlen, test, testlen, max_len_diff, max_edit_dst):
    """
    Does ``test`` match ``target`` phonetically?
    Usage: given target, find each test in [a, b, c, d...] that matches target.

    For performance reasons we assume you already have lower case versions of
    target and test, and the lengths of both.

    The filters run in order and the first failure rejects the candidate;
    no single filter is enough to accept it.

    :param target:        thing you want to match to.
    :param targetlen:     length of target
    :param test:          a candidate to test.
    :param testlen:       length of test
    :param max_len_diff:  basic length filter; a larger difference in length is rejected
    :param max_edit_dst:  edit distance limit; a distance equal to or above it is rejected
    :return: True only if every filter passes
    """
    # 1. Length alone.
    if abs(testlen - targetlen) > max_len_diff:
        return False

    # 2. The initial letters must be phonetically compatible:
    #    'Ka' or 'Qa' are equivalent, but maybe not 'Cua'.
    if not match_phonetically(test, target):
        return False

    # 3. Edit distance over the full strings.
    # NOTE(review): the limit is exclusive here (>= rejects) while the length
    # limit above is inclusive -- confirm this asymmetry is intended.
    return levenshtein_distance(test, target) < max_edit_dst

For performance reasons we assume you have lower case versions of target and test and lengths for both.

Does test match target phonetically? Usage: Given target, find test in [a, b, c, d...] that match target

:param target: thing you want to match to. :param targetlen: :param test: a test. :param testlen: :param max_len_diff: basic length filter :param max_edit_dst: Finally assess edit distance of text