opensextant.utility

Copyright 2015-2021 The MITRE Corporation.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

=============================================================================

@author: ubaldino

OpenSextant utilities

  1# -*- coding: utf-8 -*-
  2"""
  3 
  4                Copyright 2015-2021 The MITRE Corporation.
  5 
  6  Licensed under the Apache License, Version 2.0 (the "License"); you may not
  7  use this file except in compliance with the License. You may obtain a copy of
  8  the License at
  9 
 10  http://www.apache.org/licenses/LICENSE-2.0
 11 
 12  Unless required by applicable law or agreed to in writing, software
 13  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 15  License for the specific language governing permissions and limitations under
 16  the License.
 17 
 18  =============================================================================
 19
 20@author: ubaldino
 21
 22OpenSextant utilities
 23"""
 24import csv
 25import os
 26import re
 27from io import StringIO
 28from math import isnan
 29
 30from chardet import detect as detect_charset
 31from .unicode import LATIN1_FOLDING
 32
 33
 34# ---------------------------------------
 35#  TEXT UTILITIES
 36# ---------------------------------------
 37#
 38def is_text(t):
 39    return isinstance(t, str)
 40
 41
 42code_pattern = re.compile("^[A-Z0-9]{1,}$", re.ASCII)
 43
 44
 45def is_code(t: str, nlen=6):
 46    """
 47    Test if a string is an ASCII code typically 1-3 chars in len.
 48    :param t: text
 49    :param nlen: threshold for string len
 50    :return:
 51    """
 52    if not t:
 53        return False
 54    if len(t) > nlen or not t.isupper():
 55        return False
 56    return code_pattern.match(t) is not None
 57
 58
 59def is_abbreviation(nm: str):
 60    """
 61    Determine if something is an abbreviation.
 62    Otherwise if text ends with "." we'll conclude so.
 63
 64    Examples:
 65        Ala.     YES
 66        Ala      NO
 67        S. Bob   NO   -- abbreviated, yes, but this is more like a contraction.
 68        S. B.    YES
 69
 70    :param nm: textual name
 71    :return: True if obj is inferred to be an abbreviation
 72    """
 73    return nm.endswith(".")
 74
 75
 76def is_ascii(s):
 77    try:
 78        return all(ord(c) < 128 for c in s)
 79    except:
 80        pass
 81    return False
 82
 83def get_text(t):
 84    """ Default is to return Unicode string from raw data"""
 85    if isinstance(t, str):
 86        return t
 87    return str(t, encoding='utf-8')
 88
 89
 90def fast_replace(t, sep, sub=None):
 91    """
 92    Replace separators (sep) with substitute char, sub. Many-to-one substitute.
 93
 94    "a.b, c" SEP='.,'
 95    :param t:  input text
 96    :param sep: string of chars to replace
 97    :param sub: replacement char
 98    :return:  text with separators replaced
 99    """
100    result = []
101    for ch in t:
102        if ch in sep:
103            if sub:
104                result.append(sub)
105        else:
106            result.append(ch)
107    return ''.join(result)
108
109
# ISO-8859-2 is a common answer, when they really mean ISO-1
CHARDET_LATIN2_ENCODING = 'ISO-8859-1'


def guess_encoding(text):
    """ Given bytes, determine the character set encoding
    @return: dict with encoding and confidence
    """
    if not text:
        return {'confidence': 0, 'encoding': None}

    enc = detect_charset(text)

    cset = enc['encoding']
    # Fix: chardet may return encoding=None (undetectable input);
    # calling .lower() on it raised AttributeError.
    if cset and cset.lower() == 'iso-8859-2':
        # Anomaly -- chardet thinks Hungarian (iso-8859-2) is
        # a close match for a latin-1 document.  At least the quotes match
        # Other Latin-xxx variants will likely match, but actually be Latin1
        # or win-1252.   see Chardet explanation for poor reliability of Latin-1 detection
        #
        enc['encoding'] = CHARDET_LATIN2_ENCODING

    return enc
132
133
def bytes2unicode(buf, encoding=None):
    """
    Decode bytes to unicode, guessing the character set when none is given.
    :param buf: raw bytes
    :param encoding: optional explicit character set name
    :return: decoded str, or None when no encoding could be guessed
    """
    if encoding:
        return str(buf, encoding=encoding)
    guessed = guess_encoding(buf)
    cset = guessed['encoding']
    if cset is None:
        return None
    return str(buf, encoding=cset)
147
148
# Any run of whitespace (tabs, newlines, spaces) collapses to one space.
reSqueezeWhiteSpace = re.compile(r'\s+', re.MULTILINE)


def squeeze_whitespace(s):
    """Collapse whitespace runs in *s* to single spaces and trim both ends."""
    collapsed = reSqueezeWhiteSpace.sub(' ', s)
    return collapsed.strip()
154
155
def scrub_eol(t):
    """Flatten line endings: newlines become spaces; carriage returns vanish."""
    flattened = t.replace('\n', ' ')
    return flattened.replace('\r', '')
158
159
def levenshtein_distance(s, t):
    """
    Wikipedia page on Levenshtein Edit Distance
    https://en.wikipedia.org/wiki/Levenshtein_distance

    Classic dynamic program kept to two rows of the edit matrix.
    """
    if s == t:
        return 0
    # previous[j] = distance between s[:row-1] and t[:j]
    previous = list(range(len(t) + 1))
    for row, sc in enumerate(s, start=1):
        current = [row]
        for col, tc in enumerate(t, start=1):
            if sc == tc:
                current.append(previous[col - 1])
            else:
                current.append(1 + min(previous[col],
                                       current[col - 1],
                                       previous[col - 1]))
        previous = current
    return previous[-1]
185
186
# Recognized false/true tokens; ints 0/1 included so mixed data also matches.
BOOL_F_STR = {"false", 0, "0", "n", "f", "no", "", "null"}
BOOL_T_STR = {"true", 1, "1", "y", "t", "yes"}


def get_bool(token):
    """
    Interpret a token (bool, int, or string) as a boolean.

    :param token: raw value
    :return: True for truthy markers ("true", "yes", "1", positive ints, ...),
        False otherwise (including unrecognized strings).
    """
    if not token:
        return False

    if isinstance(token, bool):
        return token

    if isinstance(token, int):
        # Fix: negative ints previously fell through to token.lower()
        # and raised AttributeError.  Any positive int is True.
        return token > 0

    t = token.lower()
    if t in BOOL_F_STR:
        return False

    if t in BOOL_T_STR:
        return True

    return False
212
213
def get_number(token):
    """ Turn leading part of a string into a number, if possible.
    Returns the numeric prefix (digits, '.', '-') as a string.
    """
    numeric_chars = []
    for ch in token:
        if ch.isdigit() or ch in '.-':
            numeric_chars.append(ch)
        else:
            break
    return ''.join(numeric_chars)
226
227
def has_digit(text):
    """
    Report whether any character of *text* is a digit.

    Used primarily to report places and appears to be critical for
    name filtering when doing phonetics.
    """
    if text is None:
        return False
    return any(ch.isdigit() for ch in text)
241
242
def measure_case(t):
    """
    Count character classes in *t*.

    :param t: text
    :return:  tuple:  counts of UPPER, lower, Alpha, Non-Alpha, WS
    """
    if not t:
        return 0, 0, 0, 0, 0

    upper = lower = alpha = other = spaces = 0
    for c in t:
        if c.isalpha():
            alpha += 1
            if c.isupper():
                upper += 1
            elif c.islower():
                lower += 1
        elif c.isspace():
            spaces += 1
        else:
            other += 1

    # Invariants worth verifying:
    #   total chars = alpha + other + spaces
    #   alpha = upper + lower (for bicameral scripts)
    return upper, lower, alpha, other, spaces
269
270
def is_upper_text(t, threadshold=0.90):
    """
    True when the ratio of upper-case letters to all letters exceeds the
    threshold.  (Parameter misspelling retained for API compatibility.)
    """
    upper, _, alpha, _, _ = measure_case(t)
    if not alpha:
        return False
    return upper / alpha > threadshold
276
277
def is_value(v):
    """
    Working more with pandas or sci libraries -- you run into various types
    of default "Null" values.  Check that a value is non-trivial, non-empty.
    :param v: any value
    :return: False for None or NaN; True otherwise
    """
    if v is None:
        return False
    # NaN is the pandas/numpy "missing" marker for numeric data.
    return not (isinstance(v, (float, int)) and isnan(v))
290
291
def parse_float(v):
    """
    Parse a float, tolerating bad input.

    :param v: numeric or string value
    :return: float value, or None for None/empty/unparseable input.
    """
    # Fix: the old `if not v` guard returned None for 0 and 0.0,
    # which are perfectly valid floats.
    if v is None or v == "":
        return None
    try:
        return float(v)
    except (TypeError, ValueError) as float_err:
        print("Unable to parse float", v, str(float_err))
        return None
300
301
def get_list(text, delim=',', lower=False):
    """
    Take a string and return trimmed segments given the delimiter:

         "A,  B,\tC" => ["A", "B", "C"]
    :param text:
    :param delim: delimiter str
    :param lower: True if you want items lowercased
    :return: array
    """
    if not text:
        return []

    segments = (part.strip() for part in text.split(delim))
    if lower:
        return [seg.lower() for seg in segments if seg]
    return [seg for seg in segments if seg]
324
325
def get_text_window(offset, matchlen, textsize, width):
    """ prepreprepre MATCH postpostpost
       ^            ^   ^            ^
       l-width      l   l+len        l+len+width
       left_y  left_x   right_x      right_y
    """
    # Left side: window start clamped at 0; prematch end never precedes it.
    left_x = max(offset - width, 0)
    left_y = max(offset - 1, left_x)
    # Right side: window end clamped at the last index; if the clamp crosses
    # the match end, the postmatch collapses to empty.
    right_x = offset + matchlen
    right_y = min(right_x + width, textsize - 1)
    right_x = min(right_x, right_y)
    return [left_x, left_y, right_x, right_y]
350
351
def has_cjk(text):
    """
    Infer if Chinese (unihan), Korean (hangul) or Japanese (hiragana)
    characters are present.
    :param text: input text
    :return: True if any CJK-range character is found
    """
    # Ranges cover CJK punctuation/kana, Ideograph extensions,
    # Unified Ideographs and Hangul syllables.
    found = re.search("[\u3000-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af]",
                      text, flags=re.IGNORECASE | re.UNICODE)
    return found is not None
361
362
def has_arabic(text):
    """
    Infer if text has Arabic / Middle-eastern scripts ~ Urdu, Farsi, Arabic.
    :param text: input text
    :return: True if any character falls in the Arabic script blocks
    """
    found = re.search("[\u0600-\u08ff]", text, flags=re.IGNORECASE | re.UNICODE)
    return found is not None
371
372
def trivial_bias(name):
    """ Experimental: estimate how unique a name is from its length,
    character set and word count.

    Abcd           4/2 + 1 + 0   x 0.02  = 0.06
    Abcde fghi    10/2 + 2 + 0   x 0.02  = 0.14
    Abcdé fghi    10/2 + 2 + 1   x 0.02  = 0.16
    """
    points = len(name) / 2 + len(name.split())
    if not is_ascii(name):
        points += 1
    return float("{:0.3}".format(points * 0.02))
385
386
# Quote-like marks (backtick, acute accent, curly quotes) normalized to "'".
COMMON_DIACRITC_HASHMARKS = re.compile("[\"'`\u00B4\u2018\u2019]")


def replace_diacritics(txt: str):
    """
    Leverage the OpenSextant traditional ASCII Folding map for now.
    Yes encoded("ascii", "ignore") may do this....
    :param txt:
    :return: a non-diacritic version of the text
    """
    normalized = COMMON_DIACRITC_HASHMARKS.sub("'", txt)
    return "".join(LATIN1_FOLDING.get(ch, ch) for ch in normalized)
403
404
def strip_quotes(t):
    """
    Run replace_diacritics first -- this routine only attempts to remove normal quotes ~ ', "
    """
    unquoted = t.strip('"')
    return unquoted.strip("'")
410
411
412# /---------------------------------------
413#  FILE UTILITIES
414# /---------------------------------------
415#
416def _utf_8_encoder(unicode_csv_data):
417    for line in unicode_csv_data:
418        yield line.encode('utf-8')
419
420
def get_csv_writer(fh, columns, delim=','):
    """Create a DictWriter over *fh* with OpenSextant's standard CSV dialect:
    excel, all fields quoted, backslash escapes, LF line endings."""
    return csv.DictWriter(fh, columns,
                          restval="",
                          extrasaction='raise',
                          dialect='excel',
                          lineterminator='\n',
                          delimiter=delim,
                          quotechar='"',
                          quoting=csv.QUOTE_ALL,
                          escapechar='\\')
426
427
def get_csv_reader(fh, columns, delim=','):
    """Create a DictReader matching the dialect used by get_csv_writer."""
    return csv.DictReader(fh, columns,
                          restval="",
                          dialect='excel',
                          lineterminator='\n',
                          escapechar='\\',
                          delimiter=delim,
                          quotechar='"',
                          quoting=csv.QUOTE_ALL)
431
432
433# |||||||||||||||||||||||||||||||||||||||||||||
434# |||||||||||||||||||||||||||||||||||||||||||||
class ConfigUtility:
    """ A utility to load parameter lists, CSV files, word lists, etc. from a folder *dir*

    Functions here take an Oxygen cfg parameter keyword or a file path.
    If the keyword is valid and points to a valid file path, then the file path is used.
    In other words, keywords are aliases for a file on disk.

      Ex.  'mywords' = './cfg/mywords_v03_filtered.txt'

      oxygen.cfg file would have this mapping.  Your code just references 'mywords' to load it.
    """

    def __init__(self, config=None, rootdir='.'):
        # With config=None, callers can still use loadDataFromFile(abspath, delim), etc.
        self.config = config
        self.rootdir = rootdir

    def loadCSVFile(self, keyword, delim):
        """
        Load a named CSV file.  If the name is not a cfg parameter, the keyword name *is* the file.
        """
        mapped = self.config.get(keyword)
        target = keyword if mapped is None else mapped
        return self.loadDataFromFile(os.path.join(self.rootdir, target), delim)

    def loadDataFromFile(self, path, delim):
        """
        Load a delimited data file, skipping '#' comment rows.

        :param path: file path
        :param delim: delimiter
        :return: Array of tuples.
        """
        if not os.path.exists(path):
            raise Exception('File does not exist, FILE=%s' % path)

        data = []
        with open(path, 'r', encoding="UTF-8") as f:
            for row in csv.reader(f, delimiter=delim, lineterminator='\n', dialect="excel"):
                if not row:
                    print("Blank line")
                    continue
                if row[0].strip().startswith('#'):
                    continue
                data.append(row)
        return data

    def loadFile(self, keyword):
        """
        Load a named word list file.
        If the name is not a cfg parameter, the keyword name *is* the file.
        """
        if os.path.exists(keyword):
            path = keyword
        else:
            filename = self.config.get(keyword)
            if filename is None:
                filename = keyword
            path = os.path.join(self.rootdir, filename)
            if not os.path.exists(path):
                raise Exception('File does not exist, FILE=%s' % path)

        return self.loadListFromFile(path)

    def loadListFromFile(self, path):
        """
        Load text data from a file.
        Returns array of non-comment rows, lower-cased.
        One non-whitespace row per line.
        """
        if not os.path.exists(path):
            raise Exception('File does not exist, FILE=%s' % path)

        terms = []
        with open(path, 'r', encoding="UTF-8") as fh:
            for raw in fh:
                term = raw.strip()
                if term and not term.startswith('#'):
                    terms.append(term.lower())
        return terms
528
529
def load_list(path, lower=False):
    """
    Load text data from a file.
    Returns array of non-comment rows. One non-whitespace row per line.
    :param path: file to load.
    :param lower: Lowercased is optional.
    :return: array of terms
    """
    if not os.path.exists(path):
        raise Exception('File does not exist, FILE=%s' % path)

    with open(path, 'r', encoding="UTF-8") as fh:
        stripped = (ln.strip() for ln in fh)
        kept = [ln for ln in stripped if ln and not ln.startswith('#')]
    return [t.lower() for t in kept] if lower else kept
551
552
def load_datafile(path, delim):
    """
    Load a delimited text file, skipping blank lines and '#' comment rows.

    :param path: file path
    :param delim: delimiter
    :return: Array of tuples.
    :raises Exception: if the file does not exist.
    """
    if not os.path.exists(path):
        raise Exception(f'File does not exist, FILE={path}')

    with open(path, 'r', encoding="utf-8") as f:
        data = []
        # Remove any UTF-8 BOM markers before splitting.
        text = f.read().replace('\uFEFF', '')

        for line in text.split("\n"):
            stripped = line.strip()
            # Fix: ''.split(delim) yields [''], which is truthy, so blank
            # lines (including the residue after the final newline) were
            # previously appended to the result as [''].
            if not stripped:
                continue
            row = stripped.split(delim)
            if row[0].startswith('#'):
                continue
            data.append(row)
        return data
576
577
def ensure_dirs(fpath):
    """
    Given a file path, ensure parent folders exist.
    If path is intended to be a directory -- use os.makedirs(path) instead.
    May throw exception -- caller should handle.

    :param fpath: path to a file
    :return: True if a parent folder was created; False if nothing was needed
    """
    parent = os.path.dirname(fpath)
    if not parent or os.path.isdir(parent):
        return False
    os.makedirs(parent)
    return True
def is_code(t: str, nlen=6):
46def is_code(t: str, nlen=6):
47    """
48    Test if a string is an ASCII code typically 1-3 chars in len.
49    :param t: text
50    :param nlen: threshold for string len
51    :return:
52    """
53    if not t:
54        return False
55    if len(t) > nlen or not t.isupper():
56        return False
57    return code_pattern.match(t) is not None

Test if a string is an ASCII code typically 1-3 chars in len. :param t: text :param nlen: threshold for string len :return:

def is_abbreviation(nm: str):
60def is_abbreviation(nm: str):
61    """
62    Determine if something is an abbreviation.
63    Otherwise if text ends with "." we'll conclude so.
64
65    Examples:
66        Ala.     YES
67        Ala      NO
68        S. Bob   NO   -- abbreviated, yes, but this is more like a contraction.
69        S. B.    YES
70
71    :param nm: textual name
72    :return: True if obj is inferred to be an abbreviation
73    """
74    return nm.endswith(".")

Determine if something is an abbreviation. Otherwise if text ends with "." we'll conclude so.

Examples: Ala. YES Ala NO S. Bob NO -- abbreviated, yes, but this is more like a contraction. S. B. YES

:param nm: textual name :return: True if obj is inferred to be an abbreviation

def get_text(t):
84def get_text(t):
85    """ Default is to return Unicode string from raw data"""
86    if isinstance(t, str):
87        return t
88    return str(t, encoding='utf-8')

Default is to return Unicode string from raw data

def fast_replace(t, sep, sub=None):
 91def fast_replace(t, sep, sub=None):
 92    """
 93    Replace separators (sep) with substitute char, sub. Many-to-one substitute.
 94
 95    "a.b, c" SEP='.,'
 96    :param t:  input text
 97    :param sep: string of chars to replace
 98    :param sub: replacement char
 99    :return:  text with separators replaced
100    """
101    result = []
102    for ch in t:
103        if ch in sep:
104            if sub:
105                result.append(sub)
106        else:
107            result.append(ch)
108    return ''.join(result)

Replace separators (sep) with substitute char, sub. Many-to-one substitute.

"a.b, c" SEP='.,' :param t: input text :param sep: string of chars to replace :param sub: replacement char :return: text with separators replaced

def guess_encoding(text):
115def guess_encoding(text):
116    """ Given bytes, determine the character set encoding
117    @return: dict with encoding and confidence
118    """
119    if not text: return {'confidence': 0, 'encoding': None}
120
121    enc = detect_charset(text)
122
123    cset = enc['encoding']
124    if cset.lower() == 'iso-8859-2':
125        # Anomoaly -- chardet things Hungarian (iso-8850-2) is
126        # a close match for a latin-1 document.  At least the quotes match
127        # Other Latin-xxx variants will likely match, but actually be Latin1
128        # or win-1252.   see Chardet explanation for poor reliability of Latin-1 detection
129        #
130        enc['encoding'] = CHARDET_LATIN2_ENCODING
131
132    return enc

Given bytes, determine the character set encoding @return: dict with encoding and confidence

def bytes2unicode(buf, encoding=None):
135def bytes2unicode(buf, encoding=None):
136    """
137    Convert bytes 2 unicode by guessing character set.
138    :param buf:
139    :param encoding:
140    :return:
141    """
142    if not encoding:
143        enc = guess_encoding(buf)
144        encoding = enc['encoding']
145        if not encoding:
146            return None
147    return str(buf, encoding=encoding)

Convert bytes 2 unicode by guessing character set. :param buf: :param encoding: :return:

def levenshtein_distance(s, t):
161def levenshtein_distance(s, t):
162    """
163    Wikipedia page on Levenshtein Edit Distance
164    https://en.wikipedia.org/wiki/Levenshtein_distance
165
166    This is the fastest, simplest of 3 methods documented for Python.
167    """
168    s = ' ' + s
169    t = ' ' + t
170    d = {}
171    S = len(s)
172    T = len(t)
173    if S == T and s == t:
174        return 0
175    for i in range(S):
176        d[i, 0] = i
177    for j in range(T):
178        d[0, j] = j
179    for j in range(1, T):
180        for i in range(1, S):
181            if s[i] == t[j]:
182                d[i, j] = d[i - 1, j - 1]
183            else:
184                d[i, j] = min(d[i - 1, j] + 1, d[i, j - 1] + 1, d[i - 1, j - 1] + 1)
185    return d[(S - 1, T - 1)]

Wikipedia page on Levenshtein Edit Distance https://en.wikipedia.org/wiki/Levenshtein_distance

This is the fastest, simplest of 3 methods documented for Python.

def get_number(token):
215def get_number(token):
216    """ Turn leading part of a string into a number, if possible.
217    """
218    num = StringIO()
219    for ch in token:
220        if ch.isdigit() or ch == '.' or ch == '-':
221            num.write(ch)
222        else:
223            break
224    val = num.getvalue()
225    num.close()
226    return val

Turn leading part of a string into a number, if possible.

def has_digit(text):
229def has_digit(text):
230    """
231    Used primarily to report places and appears to be critical for
232    name filtering when doing phonetics.
233    """
234    if text is None:
235        return False
236
237    for ch in text:
238        # ascii
239        if ch.isdigit():
240            return True
241    return False

Used primarily to report places and appears to be critical for name filtering when doing phonetics.

def measure_case(t):
244def measure_case(t):
245    """
246
247    :param t: text
248    :return:  tuple:  counts of UPPER, lower, Alpha, Non-Alpha, WS
249    """
250    if not t:
251        return 0, 0, 0, 0, 0
252
253    u, l, ch, nonch, ws = 0, 0, 0, 0, 0
254    for c in t:
255        if c.isalpha():
256            ch += 1
257            if c.isupper():
258                u += 1
259            elif c.islower():
260                l += 1
261        elif c.isspace():
262            ws += 1
263        else:
264            nonch += 1
265
266    # you should verify
267    #   TOTAL chars = ch + nonch + ws
268    #   Alpha chars, ch = u + l
269    return u, l, ch, nonch, ws

:param t: text :return: tuple: counts of UPPER, lower, Alpha, Non-Alpha, WS

def is_value(v):
279def is_value(v):
280    """
281    Working more with pandas or sci libraries -- you run into various types of default "Null" values.
282    This checks to see if value is non-trivial, non-empty.
283    :param v:
284    :return:
285    """
286    if v is None:
287        return False
288    if isinstance(v, (float, int)):
289        return not isnan(v)
290    return True

Working more with pandas or sci libraries -- you run into various types of default "Null" values. This checks to see if value is non-trivial, non-empty. :param v: :return:

def get_list(text, delim=',', lower=False):
303def get_list(text, delim=',', lower=False):
304    """
305    Take a string and return trim segments given the delimiter:
306
307         "A,  B,\tC" => ["A", "B", "C"]
308    :param text:
309    :param delim: delimiter str
310    :param lower: True if you want items lowercased
311    :return: array
312    """
313    if not text:
314        return []
315
316    data = text.split(delim)
317    arr = []
318    for v in data:
319        _v = v.strip()
320        if _v:
321            if lower:
322                _v = _v.lower()
323            arr.append(_v)
324    return arr

Take a string and return trim segments given the delimiter:

 "A,  B,        C" => ["A", "B", "C"]

:param text: :param delim: delimiter str :param lower: True if you want items lowercased :return: array

def get_text_window(offset, matchlen, textsize, width):
327def get_text_window(offset, matchlen, textsize, width):
328    """ prepreprepre MATCH postpostpost
329       ^            ^   ^            ^
330       l-width      l   l+len        l+len+width
331       left_y  left_x   right_x      right_y
332    """
333    left_x = offset - width
334    left_y = offset - 1
335    right_x = offset + matchlen
336    right_y = right_x + width
337    if left_x < 0:
338        left_x = 0
339
340    if left_y < left_x:
341        left_y = left_x
342
343    # bounds checking  END....y?  then y=END, results in shorter postmatch
344    if right_y >= textsize:
345        right_y = textsize - 1
346    # bounds checking   y.... x?  then x=y,  results in empty postmatch
347    if right_x > right_y:
348        right_x = right_y
349
350    return [left_x, left_y, right_x, right_y]

prepreprepre MATCH postpostpost ^ ^ ^ ^ l-width l l+len l+len+width left_y left_x right_x right_y

def has_cjk(text):
353def has_cjk(text):
354    """
355    infer if chinese (unihan), korean (hangul) or japanese (hirgana) characters are present
356    :param text: 
357    :return: 
358    """
359    #             CJK, Hirgana, Katana.  Unified Ideagoraphs. Hangjul.
360    search = re.search("[\u3000-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af]", text, flags=re.IGNORECASE | re.UNICODE)
361    return search is not None

infer if Chinese (unihan), Korean (hangul) or Japanese (hiragana) characters are present :param text: :return:

def has_arabic(text):
364def has_arabic(text):
365    """
366    infer if text has Arabic / Middle-eastern scripts ~ Urdu, Farsi, Arabic.
367    :param text:
368    :return:
369    """
370    search = re.search("[\u0600-\u08ff]", text, flags=re.IGNORECASE | re.UNICODE)
371    return search is not None

infer if text has Arabic / Middle-eastern scripts ~ Urdu, Farsi, Arabic. :param text: :return:

def trivial_bias(name):
374def trivial_bias(name):
375    """ Experimental: Deteremine unique a name is using length and character set and # of words
376
377    Abcd           4/2 + 1 + 0   x 0.02  = 0.06
378    Abcde fghi    10/2 + 2 + 0   x 0.02  = 0.14
379    Abcdé fghi    10/2 + 2 + 1   x 0.02  = 0.16
380    """
381    l_points = len(name) / 2
382    word_points = len(name.split())
383    charset_points = 1 if not is_ascii(name) else 0
384    score = (l_points + word_points + charset_points) * 0.02
385    return float("{:0.3}".format(score))

Experimental: Determine how unique a name is, using its length, character set and number of words

Abcd 4/2 + 1 + 0 x 0.02 = 0.06 Abcde fghi 10/2 + 2 + 0 x 0.02 = 0.14 Abcdé fghi 10/2 + 2 + 1 x 0.02 = 0.16

def replace_diacritics(txt: str):
391def replace_diacritics(txt: str):
392    """
393    Leverage the OpenSextant traditional ASCII Folding map for now.
394    Yes encoded("ascii", "ignore") may do this....
395    :param txt:
396    :return: a non-diacritic version of the text
397    """
398    str_prepped = COMMON_DIACRITC_HASHMARKS.sub("'", txt)
399
400    buf = []
401    for ch in str_prepped:
402        buf.append(LATIN1_FOLDING.get(ch, ch))
403    return "".join(buf)

Leverage the OpenSextant traditional ASCII Folding map for now. Yes encoded("ascii", "ignore") may do this.... :param txt: :return: a non-diacritic version of the text

def strip_quotes(t):
406def strip_quotes(t):
407    """
408    Run replace_diacritics first -- this routine only attempts to remove normal quotes ~ ', "
409    """
410    return t.strip('"').strip("'")

Run replace_diacritics first -- this routine only attempts to remove normal quotes ~ ', "

class ConfigUtility:
436class ConfigUtility:
437    """ A utility to load parameter lists, CSV files, word lists, etc. from a folder *dir*
438
439    functions here take an Oxygen cfg parameter keyword or a file path.
440    If the keyword is valid and points to a valid file path, then the file path is used.
441    In otherwords, keywords are aliases for a file on disk.
442
443      Ex.  'mywords' = './cfg/mywords_v03_filtered.txt'
444
445      oxygen.cfg file would have this mapping.  Your code just references 'mywords' to load it.
446    """
447
448    def __init__(self, config=None, rootdir='.'):
449
450        # If config is None, then caller can still use loadDataFromFile(abspath, delim) for example.
451        #
452        self.config = config
453        self.rootdir = rootdir
454
455    def loadCSVFile(self, keyword, delim):
456        """
457          Load a named CSV file.  If the name is not a cfg parameter, the keyword name *is* the file.
458        """
459        f = self.config.get(keyword)
460        if f is None:
461            f = keyword
462
463        path = os.path.join(self.rootdir, f)
464        return self.loadDataFromFile(path, delim)
465
466    def loadDataFromFile(self, path, delim):
467        """
468
469        :param path: file path
470        :param delim: delimiter
471        :return: Array of tuples.
472        """
473        if not os.path.exists(path):
474            raise Exception('File does not exist, FILE=%s' % path)
475
476        with open(path, 'r', encoding="UTF-8") as f:
477            filereader = csv.reader(f, delimiter=delim, lineterminator='\n', dialect="excel")
478            data = []
479            for row in filereader:
480                if not row:
481                    print("Blank line")
482                    continue
483                first_cell = row[0].strip()
484                if first_cell.startswith('#'):
485                    continue
486                data.append(row)
487        return data
488
489    def loadFile(self, keyword):
490        """
491        Load a named word list file.
492        If the name is not a cfg parameter, the keyword name *is* the file.
493        """
494        filename = ''
495
496        if os.path.exists(keyword):
497            path = keyword
498        else:
499            filename = self.config.get(keyword)
500            if filename is None:
501                filename = keyword
502
503            path = os.path.join(self.rootdir, filename)
504            if not os.path.exists(path):
505                raise Exception('File does not exist, FILE=%s' % path)
506
507        return self.loadListFromFile(path)
508
509    def loadListFromFile(self, path):
510        """
511          Load text data from a file.
512          Returns array of non-comment rows. One non-whitespace row per line.
513        """
514        if not os.path.exists(path):
515            raise Exception('File does not exist, FILE=%s' % path)
516
517        with open(path, 'r', encoding="UTF-8") as fh:
518            termlist = []
519            for line in fh:
520                line = line.strip()
521                if line.startswith('#'):
522                    continue
523                if len(line) == 0:
524                    continue
525
526                termlist.append(line.lower())
527
528            return termlist

A utility to load parameter lists, CSV files, word lists, etc. from a folder dir

functions here take an Oxygen cfg parameter keyword or a file path. If the keyword is valid and points to a valid file path, then the file path is used. In other words, keywords are aliases for a file on disk.

Ex. 'mywords' = './cfg/mywords_v03_filtered.txt'

oxygen.cfg file would have this mapping. Your code just references 'mywords' to load it.

def loadCSVFile(self, keyword, delim):
455    def loadCSVFile(self, keyword, delim):
456        """
457          Load a named CSV file.  If the name is not a cfg parameter, the keyword name *is* the file.
458        """
459        f = self.config.get(keyword)
460        if f is None:
461            f = keyword
462
463        path = os.path.join(self.rootdir, f)
464        return self.loadDataFromFile(path, delim)

Load a named CSV file. If the name is not a cfg parameter, the keyword name is the file.

def loadDataFromFile(self, path, delim):
466    def loadDataFromFile(self, path, delim):
467        """
468
469        :param path: file path
470        :param delim: delimiter
471        :return: Array of tuples.
472        """
473        if not os.path.exists(path):
474            raise Exception('File does not exist, FILE=%s' % path)
475
476        with open(path, 'r', encoding="UTF-8") as f:
477            filereader = csv.reader(f, delimiter=delim, lineterminator='\n', dialect="excel")
478            data = []
479            for row in filereader:
480                if not row:
481                    print("Blank line")
482                    continue
483                first_cell = row[0].strip()
484                if first_cell.startswith('#'):
485                    continue
486                data.append(row)
487        return data

Parameters: *path* — file path; *delim* — column delimiter. Returns: a list of rows, each a list of column strings.

def loadFile(self, keyword):
489    def loadFile(self, keyword):
490        """
491        Load a named word list file.
492        If the name is not a cfg parameter, the keyword name *is* the file.
493        """
494        filename = ''
495
496        if os.path.exists(keyword):
497            path = keyword
498        else:
499            filename = self.config.get(keyword)
500            if filename is None:
501                filename = keyword
502
503            path = os.path.join(self.rootdir, filename)
504            if not os.path.exists(path):
505                raise Exception('File does not exist, FILE=%s' % path)
506
507        return self.loadListFromFile(path)

Load a named word list file. If the name is not a cfg parameter, the keyword name is the file.

def loadListFromFile(self, path):
509    def loadListFromFile(self, path):
510        """
511          Load text data from a file.
512          Returns array of non-comment rows. One non-whitespace row per line.
513        """
514        if not os.path.exists(path):
515            raise Exception('File does not exist, FILE=%s' % path)
516
517        with open(path, 'r', encoding="UTF-8") as fh:
518            termlist = []
519            for line in fh:
520                line = line.strip()
521                if line.startswith('#'):
522                    continue
523                if len(line) == 0:
524                    continue
525
526                termlist.append(line.lower())
527
528            return termlist

Load text data from a file. Returns array of non-comment rows. One non-whitespace row per line.

def load_list(path, lower=False):
def load_list(path, lower=False):
    """
      Load text data from a file.
      Returns array of non-comment rows. One non-whitespace row per line.
      :param path: file to load.
      :param lower: Lowercased is optional.
      :return: array of terms
    """
    if not os.path.exists(path):
        raise Exception('File does not exist, FILE=%s' % path)

    with open(path, 'r', encoding="UTF-8") as fh:
        # Keep stripped, non-blank, non-comment lines in file order.
        stripped = (ln.strip() for ln in fh)
        terms = [t for t in stripped if t and not t.startswith('#')]

    return [t.lower() for t in terms] if lower else terms

Load text data from a file. Returns array of non-comment rows. One non-whitespace row per line. :param path: file to load. :param lower: Lowercased is optional. :return: array of terms

def load_datafile(path, delim):
def load_datafile(path, delim):
    """
    Load a delimited text file, skipping blank lines and '#' comment lines.

    :param path: file path
    :param delim: column delimiter
    :return: list of rows, each a list of column strings
    :raise Exception: if the file does not exist
    """
    if not os.path.exists(path):
        raise Exception(f'File does not exist, FILE={path}')

    with open(path, 'r', encoding="utf-8") as f:
        # Strip a UTF-8 BOM if present.
        text = f.read().replace('\uFEFF', '')

    data = []
    for line in text.split("\n"):
        line = line.strip()
        # BUG FIX: ''.split(delim) returns [''], so the old "if not row"
        # guard never fired and blank/trailing lines were appended as ['']
        # rows. Test emptiness before splitting.
        if not line:
            continue
        row = line.split(delim)
        if row[0].startswith('#'):
            continue
        data.append(row)
    return data

Parameters: *path* — file path; *delim* — column delimiter. Returns: a list of rows, each a list of column strings.

def ensure_dirs(fpath):
def ensure_dirs(fpath):
    """
    Given a file path, ensure parent folders exist.
    If path is intended to be a directory -- use os.makedirs(path) instead.
    May throw exception -- caller should handle.

    :param fpath: path to a file
    :return: True if a parent directory was created, False otherwise
    """
    parent = os.path.dirname(fpath)
    # Nothing to do when there is no parent component or it already exists.
    if not parent or os.path.isdir(parent):
        return False
    os.makedirs(parent)
    return True

Given a file path, ensure parent folders exist. If path is intended to be a directory -- use os.makedirs(path) instead. May throw exception -- caller should handle.

:param fpath: path to a file