opensextant.utility
Copyright 2015-2021 The MITRE Corporation.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
=============================================================================
@author: ubaldino
OpenSextant utilities
# -*- coding: utf-8 -*-
"""

 Copyright 2015-2021 The MITRE Corporation.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not
 use this file except in compliance with the License. You may obtain a copy of
 the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 License for the specific language governing permissions and limitations under
 the License.

 =============================================================================

@author: ubaldino

OpenSextant utilities
"""
import csv
import os
import re
from io import StringIO
from math import isnan

from chardet import detect as detect_charset

from .unicode import LATIN1_FOLDING


# ---------------------------------------
#  TEXT UTILITIES
# ---------------------------------------
#
def is_text(t):
    """True if the argument is a (unicode) string."""
    return isinstance(t, str)


# Uppercase alphanumeric ASCII token, e.g. "US", "A1", "NY"
code_pattern = re.compile("^[A-Z0-9]+$", re.ASCII)


def is_code(t: str, nlen=6):
    """
    Test if a string is an ASCII code typically 1-3 chars in len.
    :param t: text
    :param nlen: threshold for string len
    :return: True if `t` is a short, all-uppercase ASCII alphanumeric token
    """
    if not t:
        return False
    if len(t) > nlen or not t.isupper():
        return False
    return code_pattern.match(t) is not None


def is_abbreviation(nm: str):
    """
    Determine if something is an abbreviation.
    Otherwise if text ends with "." we'll conclude so.

    Examples:
        Ala.     YES
        Ala      NO
        S. Bob   NO  -- abbreviated, yes, but this is more like a contraction.
        S. B.    YES

    :param nm: textual name
    :return: True if obj is inferred to be an abbreviation
    """
    return nm.endswith(".")


def is_ascii(s):
    """True if every character of `s` is 7-bit ASCII; False for non-text input."""
    try:
        return all(ord(c) < 128 for c in s)
    except TypeError:
        # Non-iterable or non-character content, e.g. bytes yields ints.
        pass
    return False


def get_text(t):
    """ Default is to return Unicode string from raw data"""
    if isinstance(t, str):
        return t
    return str(t, encoding='utf-8')


def fast_replace(t, sep, sub=None):
    """
    Replace separators (sep) with substitute char, sub. Many-to-one substitute.

    "a.b, c" SEP='.,'
    :param t: input text
    :param sep: string of chars to replace
    :param sub: replacement char; if None/empty, separators are simply removed
    :return: text with separators replaced
    """
    result = []
    for ch in t:
        if ch in sep:
            if sub:
                result.append(sub)
        else:
            result.append(ch)
    return ''.join(result)


# ISO-8859-2 is a common answer, when they really mean ISO-1
CHARDET_LATIN2_ENCODING = 'ISO-8859-1'


def guess_encoding(text):
    """ Given bytes, determine the character set encoding
    @return: dict with encoding and confidence
    """
    if not text:
        return {'confidence': 0, 'encoding': None}

    enc = detect_charset(text)

    cset = enc['encoding']
    # chardet may fail to detect anything at all (encoding=None);
    # guard before lowercasing.
    if cset and cset.lower() == 'iso-8859-2':
        # Anomaly -- chardet thinks Hungarian (iso-8859-2) is
        # a close match for a latin-1 document.  At least the quotes match
        # Other Latin-xxx variants will likely match, but actually be Latin1
        # or win-1252.   see Chardet explanation for poor reliability of Latin-1 detection
        #
        enc['encoding'] = CHARDET_LATIN2_ENCODING

    return enc


def bytes2unicode(buf, encoding=None):
    """
    Convert bytes 2 unicode by guessing character set.
    :param buf: raw bytes
    :param encoding: optional known encoding; guessed via chardet if omitted
    :return: decoded str, or None if no encoding could be determined
    """
    if not encoding:
        enc = guess_encoding(buf)
        encoding = enc['encoding']
        if not encoding:
            return None
    return str(buf, encoding=encoding)


reSqueezeWhiteSpace = re.compile(r'\s+', re.MULTILINE)


def squeeze_whitespace(s):
    """Collapse runs of whitespace to a single space and trim the ends."""
    return reSqueezeWhiteSpace.sub(' ', s).strip()


def scrub_eol(t):
    """Replace newlines with spaces and drop carriage returns."""
    return t.replace('\n', ' ').replace('\r', '')


def levenshtein_distance(s, t):
    """
    Wikipedia page on Levenshtein Edit Distance
    https://en.wikipedia.org/wiki/Levenshtein_distance

    This is the fastest, simplest of 3 methods documented for Python.
    """
    # Pad both strings so index 0 represents the empty prefix.
    s = ' ' + s
    t = ' ' + t
    d = {}
    S = len(s)
    T = len(t)
    if S == T and s == t:
        return 0
    for i in range(S):
        d[i, 0] = i
    for j in range(T):
        d[0, j] = j
    for j in range(1, T):
        for i in range(1, S):
            if s[i] == t[j]:
                d[i, j] = d[i - 1, j - 1]
            else:
                # min of deletion, insertion, substitution
                d[i, j] = min(d[i - 1, j] + 1, d[i, j - 1] + 1, d[i - 1, j - 1] + 1)
    return d[(S - 1, T - 1)]


BOOL_F_STR = {"false", 0, "0", "n", "f", "no", "", "null"}
BOOL_T_STR = {"true", 1, "1", "y", "t", "yes"}


def get_bool(token):
    """Loosely interpret a bool, int, or string token as a boolean.

    Unrecognized values default to False.
    """
    if not token:
        return False

    if isinstance(token, bool):
        return token

    if isinstance(token, int):
        if token > 0:
            return True
        if token == 0:
            return False

    t = token.lower()
    if t in BOOL_F_STR:
        return False

    if t in BOOL_T_STR:
        return True

    return False


def get_number(token):
    """ Turn leading part of a string into a number, if possible.

    Note: returns the leading numeric-looking *substring* (str), not a
    numeric type; caller converts as needed.
    """
    num = StringIO()
    for ch in token:
        if ch.isdigit() or ch == '.' or ch == '-':
            num.write(ch)
        else:
            break
    val = num.getvalue()
    num.close()
    return val


def has_digit(text):
    """
    Used primarily to report places and appears to be critical for
    name filtering when doing phonetics.
    """
    if text is None:
        return False

    for ch in text:
        # ascii
        if ch.isdigit():
            return True
    return False


def measure_case(t):
    """
    Count character classes in the given text.

    :param t: text
    :return: tuple: counts of UPPER, lower, Alpha, Non-Alpha, WS
    """
    if not t:
        return 0, 0, 0, 0, 0

    u, l, ch, nonch, ws = 0, 0, 0, 0, 0
    for c in t:
        if c.isalpha():
            ch += 1
            if c.isupper():
                u += 1
            elif c.islower():
                l += 1
        elif c.isspace():
            ws += 1
        else:
            nonch += 1

    # you should verify
    #  TOTAL chars = ch + nonch + ws
    #  Alpha chars, ch = u + l
    return u, l, ch, nonch, ws


def is_upper_text(t, threadshold=0.90):
    """True if more than `threadshold` of the alphabetic chars are uppercase."""
    u, l, ch, nonch, ws = measure_case(t)
    if ch == 0:
        return False
    return u / ch > threadshold


def is_value(v):
    """
    Working more with pandas or sci libraries -- you run into various types of default "Null" values.
    This checks to see if value is non-trivial, non-empty.
    :param v: any value
    :return: False for None or NaN; True otherwise
    """
    if v is None:
        return False
    if isinstance(v, (float, int)):
        return not isnan(v)
    return True


def parse_float(v):
    """Parse a float, returning None (and printing a note) on failure."""
    if not v:
        return None
    try:
        return float(v)
    except Exception as float_err:
        print("Unable to parse float", v, str(float_err))
        return None


def get_list(text, delim=',', lower=False):
    """
    Take a string and return trim segments given the delimiter:

       "A, B,\tC" => ["A", "B", "C"]
    :param text: input text
    :param delim: delimiter str
    :param lower: True if you want items lowercased
    :return: array
    """
    if not text:
        return []

    data = text.split(delim)
    arr = []
    for v in data:
        _v = v.strip()
        if _v:
            if lower:
                _v = _v.lower()
            arr.append(_v)
    return arr


def get_text_window(offset, matchlen, textsize, width):
    """ prepreprepre MATCH postpostpost
       ^            ^   ^            ^
       l-width      l   l+len        l+len+width
       left_y       left_x   right_x     right_y
    """
    left_x = offset - width
    left_y = offset - 1
    right_x = offset + matchlen
    right_y = right_x + width
    if left_x < 0:
        left_x = 0

    if left_y < left_x:
        left_y = left_x

    # bounds checking  END....y?  then y=END, results in shorter postmatch
    if right_y >= textsize:
        right_y = textsize - 1
    # bounds checking   y.... x?  then x=y,  results in empty postmatch
    if right_x > right_y:
        right_x = right_y

    return [left_x, left_y, right_x, right_y]


def has_cjk(text):
    """
    infer if chinese (unihan), korean (hangul) or japanese (hiragana) characters are present
    :param text:
    :return:
    """
    # CJK, Hiragana, Katakana.  Unified Ideographs. Hangul.
    search = re.search("[\u3000-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af]", text,
                       flags=re.IGNORECASE | re.UNICODE)
    return search is not None


def has_arabic(text):
    """
    infer if text has Arabic / Middle-eastern scripts ~ Urdu, Farsi, Arabic.
    :param text:
    :return:
    """
    search = re.search("[\u0600-\u08ff]", text, flags=re.IGNORECASE | re.UNICODE)
    return search is not None


def trivial_bias(name):
    """ Experimental: Determine how unique a name is using length, character set, and # of words

        Abcd       4/2 + 1 + 0  x 0.02  = 0.06
        Abcde fghi 10/2 + 2 + 0 x 0.02  = 0.14
        Abcdé fghi 10/2 + 2 + 1 x 0.02  = 0.16
    """
    l_points = len(name) / 2
    word_points = len(name.split())
    charset_points = 1 if not is_ascii(name) else 0
    score = (l_points + word_points + charset_points) * 0.02
    return float("{:0.3}".format(score))


# Straight quotes and common acute-accent/curly lookalikes, normalized to "'"
COMMON_DIACRITC_HASHMARKS = re.compile("[\"'`\u00B4\u2018\u2019]")


def replace_diacritics(txt: str):
    """
    Leverage the OpenSextant traditional ASCII Folding map for now.
    Yes encoded("ascii", "ignore") may do this....
    :param txt:
    :return: a non-diacritic version of the text
    """
    str_prepped = COMMON_DIACRITC_HASHMARKS.sub("'", txt)

    buf = []
    for ch in str_prepped:
        buf.append(LATIN1_FOLDING.get(ch, ch))
    return "".join(buf)


def strip_quotes(t):
    """
    Run replace_diacritics first -- this routine only attempts to remove normal quotes ~ ', "
    """
    return t.strip('"').strip("'")


# /---------------------------------------
#  FILE UTILITIES
# /---------------------------------------
#
def _utf_8_encoder(unicode_csv_data):
    # Generator: encode each line of a unicode CSV stream as UTF-8 bytes.
    for line in unicode_csv_data:
        yield line.encode('utf-8')


def get_csv_writer(fh, columns, delim=','):
    """DictWriter with the project's standard CSV conventions (quote-all, backslash escape)."""
    return csv.DictWriter(fh, columns, restval="", extrasaction='raise',
                          dialect='excel', lineterminator='\n',
                          delimiter=delim, quotechar='"',
                          quoting=csv.QUOTE_ALL, escapechar='\\')


def get_csv_reader(fh, columns, delim=','):
    """DictReader matching get_csv_writer's conventions."""
    return csv.DictReader(fh, columns, restval="", dialect='excel', lineterminator='\n', escapechar='\\',
                          delimiter=delim, quotechar='"', quoting=csv.QUOTE_ALL)


# |||||||||||||||||||||||||||||||||||||||||||||
# |||||||||||||||||||||||||||||||||||||||||||||
class ConfigUtility:
    """ A utility to load parameter lists, CSV files, word lists, etc. from a folder *dir*

    functions here take an Oxygen cfg parameter keyword or a file path.
    If the keyword is valid and points to a valid file path, then the file path is used.
    In other words, keywords are aliases for a file on disk.

      Ex.  'mywords' = '.\cfg\mywords_v03_filtered.txt'

      oxygen.cfg file would have this mapping.  Your code just references 'mywords' to load it.
    """

    def __init__(self, config=None, rootdir='.'):

        # If config is None, then caller can still use loadDataFromFile(abspath, delim) for example.
        #
        self.config = config
        self.rootdir = rootdir

    def loadCSVFile(self, keyword, delim):
        """
        Load a named CSV file.  If the name is not a cfg parameter, the keyword name *is* the file.
        """
        f = self.config.get(keyword)
        if f is None:
            f = keyword

        path = os.path.join(self.rootdir, f)
        return self.loadDataFromFile(path, delim)

    def loadDataFromFile(self, path, delim):
        """
        Load a delimited file, skipping blank rows and '#' comment rows.

        :param path: file path
        :param delim: delimiter
        :return: Array of tuples.
        """
        if not os.path.exists(path):
            raise Exception('File does not exist, FILE=%s' % path)

        with open(path, 'r', encoding="UTF-8") as f:
            filereader = csv.reader(f, delimiter=delim, lineterminator='\n', dialect="excel")
            data = []
            for row in filereader:
                if not row:
                    print("Blank line")
                    continue
                first_cell = row[0].strip()
                if first_cell.startswith('#'):
                    continue
                data.append(row)
        return data

    def loadFile(self, keyword):
        """
        Load a named word list file.
        If the name is not a cfg parameter, the keyword name *is* the file.
        """
        filename = ''

        if os.path.exists(keyword):
            path = keyword
        else:
            filename = self.config.get(keyword)
            if filename is None:
                filename = keyword

            path = os.path.join(self.rootdir, filename)
            if not os.path.exists(path):
                raise Exception('File does not exist, FILE=%s' % path)

        return self.loadListFromFile(path)

    def loadListFromFile(self, path):
        """
        Load text data from a file.
        Returns array of non-comment rows. One non-whitespace row per line.
        Terms are lowercased.
        """
        if not os.path.exists(path):
            raise Exception('File does not exist, FILE=%s' % path)

        with open(path, 'r', encoding="UTF-8") as fh:
            termlist = []
            for line in fh:
                line = line.strip()
                if line.startswith('#'):
                    continue
                if len(line) == 0:
                    continue

                termlist.append(line.lower())

            return termlist


def load_list(path, lower=False):
    """
    Load text data from a file.
    Returns array of non-comment rows. One non-whitespace row per line.
    :param path: file to load.
    :param lower: Lowercased is optional.
    :return: array of terms
    """
    if not os.path.exists(path):
        raise Exception('File does not exist, FILE=%s' % path)

    with open(path, 'r', encoding="UTF-8") as fh:
        termlist = []
        for line in fh:
            line = line.strip()
            if line.startswith('#') or not line:
                continue

            termlist.append(line.lower() if lower else line)

        return termlist


def load_datafile(path, delim):
    """
    Load a delimited file by hand (stripping any BOM), skipping '#' comment rows.

    :param path: file path
    :param delim: delimiter
    :return: Array of tuples.
    """
    if not os.path.exists(path):
        raise Exception(f'File does not exist, FILE={path}')

    with open(path, 'r', encoding="utf-8") as f:
        data = []
        text = f.read().replace('\uFEFF', '')

        for line in text.split("\n"):
            row = line.strip().split(delim)
            if not row:
                # print("Blank line")
                continue
            first_cell = row[0]
            if first_cell.startswith('#'):
                continue
            data.append(row)
        return data


def ensure_dirs(fpath):
    """
    Given a file path, ensure parent folders exist.
    If path is intended to be a directory -- use os.makedirs(path) instead.
    May throw exception -- caller should handle.

    :path: path a file
    """
    d = os.path.dirname(fpath)
    if d and not os.path.isdir(d):
        os.makedirs(d)
        return True
    return False
46def is_code(t: str, nlen=6): 47 """ 48 Test if a string is an ASCII code typically 1-3 chars in len. 49 :param t: text 50 :param nlen: threshold for string len 51 :return: 52 """ 53 if not t: 54 return False 55 if len(t) > nlen or not t.isupper(): 56 return False 57 return code_pattern.match(t) is not None
Test if a string is an ASCII code typically 1-3 chars in len. :param t: text :param nlen: threshold for string len :return:
60def is_abbreviation(nm: str): 61 """ 62 Determine if something is an abbreviation. 63 Otherwise if text ends with "." we'll conclude so. 64 65 Examples: 66 Ala. YES 67 Ala NO 68 S. Bob NO -- abbreviated, yes, but this is more like a contraction. 69 S. B. YES 70 71 :param nm: textual name 72 :return: True if obj is inferred to be an abbreviation 73 """ 74 return nm.endswith(".")
Determine if something is an abbreviation. Otherwise if text ends with "." we'll conclude so.
Examples: Ala. YES Ala NO S. Bob NO -- abbreviated, yes, but this is more like a contraction. S. B. YES
:param nm: textual name :return: True if obj is inferred to be an abbreviation
84def get_text(t): 85 """ Default is to return Unicode string from raw data""" 86 if isinstance(t, str): 87 return t 88 return str(t, encoding='utf-8')
Default is to return Unicode string from raw data
91def fast_replace(t, sep, sub=None): 92 """ 93 Replace separators (sep) with substitute char, sub. Many-to-one substitute. 94 95 "a.b, c" SEP='.,' 96 :param t: input text 97 :param sep: string of chars to replace 98 :param sub: replacement char 99 :return: text with separators replaced 100 """ 101 result = [] 102 for ch in t: 103 if ch in sep: 104 if sub: 105 result.append(sub) 106 else: 107 result.append(ch) 108 return ''.join(result)
Replace separators (sep) with substitute char, sub. Many-to-one substitute.
"a.b, c" SEP='.,' :param t: input text :param sep: string of chars to replace :param sub: replacement char :return: text with separators replaced
115def guess_encoding(text): 116 """ Given bytes, determine the character set encoding 117 @return: dict with encoding and confidence 118 """ 119 if not text: return {'confidence': 0, 'encoding': None} 120 121 enc = detect_charset(text) 122 123 cset = enc['encoding'] 124 if cset.lower() == 'iso-8859-2': 125 # Anomoaly -- chardet things Hungarian (iso-8850-2) is 126 # a close match for a latin-1 document. At least the quotes match 127 # Other Latin-xxx variants will likely match, but actually be Latin1 128 # or win-1252. see Chardet explanation for poor reliability of Latin-1 detection 129 # 130 enc['encoding'] = CHARDET_LATIN2_ENCODING 131 132 return enc
Given bytes, determine the character set encoding @return: dict with encoding and confidence
135def bytes2unicode(buf, encoding=None): 136 """ 137 Convert bytes 2 unicode by guessing character set. 138 :param buf: 139 :param encoding: 140 :return: 141 """ 142 if not encoding: 143 enc = guess_encoding(buf) 144 encoding = enc['encoding'] 145 if not encoding: 146 return None 147 return str(buf, encoding=encoding)
Convert bytes 2 unicode by guessing character set. :param buf: :param encoding: :return:
161def levenshtein_distance(s, t): 162 """ 163 Wikipedia page on Levenshtein Edit Distance 164 https://en.wikipedia.org/wiki/Levenshtein_distance 165 166 This is the fastest, simplest of 3 methods documented for Python. 167 """ 168 s = ' ' + s 169 t = ' ' + t 170 d = {} 171 S = len(s) 172 T = len(t) 173 if S == T and s == t: 174 return 0 175 for i in range(S): 176 d[i, 0] = i 177 for j in range(T): 178 d[0, j] = j 179 for j in range(1, T): 180 for i in range(1, S): 181 if s[i] == t[j]: 182 d[i, j] = d[i - 1, j - 1] 183 else: 184 d[i, j] = min(d[i - 1, j] + 1, d[i, j - 1] + 1, d[i - 1, j - 1] + 1) 185 return d[(S - 1, T - 1)]
Wikipedia page on Levenshtein Edit Distance https://en.wikipedia.org/wiki/Levenshtein_distance
This is the fastest, simplest of 3 methods documented for Python.
215def get_number(token): 216 """ Turn leading part of a string into a number, if possible. 217 """ 218 num = StringIO() 219 for ch in token: 220 if ch.isdigit() or ch == '.' or ch == '-': 221 num.write(ch) 222 else: 223 break 224 val = num.getvalue() 225 num.close() 226 return val
Turn leading part of a string into a number, if possible.
229def has_digit(text): 230 """ 231 Used primarily to report places and appears to be critical for 232 name filtering when doing phonetics. 233 """ 234 if text is None: 235 return False 236 237 for ch in text: 238 # ascii 239 if ch.isdigit(): 240 return True 241 return False
Used primarily to report places and appears to be critical for name filtering when doing phonetics.
244def measure_case(t): 245 """ 246 247 :param t: text 248 :return: tuple: counts of UPPER, lower, Alpha, Non-Alpha, WS 249 """ 250 if not t: 251 return 0, 0, 0, 0, 0 252 253 u, l, ch, nonch, ws = 0, 0, 0, 0, 0 254 for c in t: 255 if c.isalpha(): 256 ch += 1 257 if c.isupper(): 258 u += 1 259 elif c.islower(): 260 l += 1 261 elif c.isspace(): 262 ws += 1 263 else: 264 nonch += 1 265 266 # you should verify 267 # TOTAL chars = ch + nonch + ws 268 # Alpha chars, ch = u + l 269 return u, l, ch, nonch, ws
:param t: text :return: tuple: counts of UPPER, lower, Alpha, Non-Alpha, WS
279def is_value(v): 280 """ 281 Working more with pandas or sci libraries -- you run into various types of default "Null" values. 282 This checks to see if value is non-trivial, non-empty. 283 :param v: 284 :return: 285 """ 286 if v is None: 287 return False 288 if isinstance(v, (float, int)): 289 return not isnan(v) 290 return True
Working more with pandas or sci libraries -- you run into various types of default "Null" values. This checks to see if value is non-trivial, non-empty. :param v: :return:
303def get_list(text, delim=',', lower=False): 304 """ 305 Take a string and return trim segments given the delimiter: 306 307 "A, B,\tC" => ["A", "B", "C"] 308 :param text: 309 :param delim: delimiter str 310 :param lower: True if you want items lowercased 311 :return: array 312 """ 313 if not text: 314 return [] 315 316 data = text.split(delim) 317 arr = [] 318 for v in data: 319 _v = v.strip() 320 if _v: 321 if lower: 322 _v = _v.lower() 323 arr.append(_v) 324 return arr
Take a string and return trim segments given the delimiter:
"A, B, C" => ["A", "B", "C"]
:param text: :param delim: delimiter str :param lower: True if you want items lowercased :return: array
327def get_text_window(offset, matchlen, textsize, width): 328 """ prepreprepre MATCH postpostpost 329 ^ ^ ^ ^ 330 l-width l l+len l+len+width 331 left_y left_x right_x right_y 332 """ 333 left_x = offset - width 334 left_y = offset - 1 335 right_x = offset + matchlen 336 right_y = right_x + width 337 if left_x < 0: 338 left_x = 0 339 340 if left_y < left_x: 341 left_y = left_x 342 343 # bounds checking END....y? then y=END, results in shorter postmatch 344 if right_y >= textsize: 345 right_y = textsize - 1 346 # bounds checking y.... x? then x=y, results in empty postmatch 347 if right_x > right_y: 348 right_x = right_y 349 350 return [left_x, left_y, right_x, right_y]
prepreprepre MATCH postpostpost ^ ^ ^ ^ l-width l l+len l+len+width left_y left_x right_x right_y
353def has_cjk(text): 354 """ 355 infer if chinese (unihan), korean (hangul) or japanese (hirgana) characters are present 356 :param text: 357 :return: 358 """ 359 # CJK, Hirgana, Katana. Unified Ideagoraphs. Hangjul. 360 search = re.search("[\u3000-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af]", text, flags=re.IGNORECASE | re.UNICODE) 361 return search is not None
infer if chinese (unihan), korean (hangul) or japanese (hiragana) characters are present :param text: :return:
364def has_arabic(text): 365 """ 366 infer if text has Arabic / Middle-eastern scripts ~ Urdu, Farsi, Arabic. 367 :param text: 368 :return: 369 """ 370 search = re.search("[\u0600-\u08ff]", text, flags=re.IGNORECASE | re.UNICODE) 371 return search is not None
infer if text has Arabic / Middle-eastern scripts ~ Urdu, Farsi, Arabic. :param text: :return:
374def trivial_bias(name): 375 """ Experimental: Deteremine unique a name is using length and character set and # of words 376 377 Abcd 4/2 + 1 + 0 x 0.02 = 0.06 378 Abcde fghi 10/2 + 2 + 0 x 0.02 = 0.14 379 Abcdé fghi 10/2 + 2 + 1 x 0.02 = 0.16 380 """ 381 l_points = len(name) / 2 382 word_points = len(name.split()) 383 charset_points = 1 if not is_ascii(name) else 0 384 score = (l_points + word_points + charset_points) * 0.02 385 return float("{:0.3}".format(score))
Experimental: Determine how unique a name is using length, character set, and number of words
Abcd 4/2 + 1 + 0 x 0.02 = 0.06 Abcde fghi 10/2 + 2 + 0 x 0.02 = 0.14 Abcdé fghi 10/2 + 2 + 1 x 0.02 = 0.16
391def replace_diacritics(txt: str): 392 """ 393 Leverage the OpenSextant traditional ASCII Folding map for now. 394 Yes encoded("ascii", "ignore") may do this.... 395 :param txt: 396 :return: a non-diacritic version of the text 397 """ 398 str_prepped = COMMON_DIACRITC_HASHMARKS.sub("'", txt) 399 400 buf = [] 401 for ch in str_prepped: 402 buf.append(LATIN1_FOLDING.get(ch, ch)) 403 return "".join(buf)
Leverage the OpenSextant traditional ASCII Folding map for now. Yes encoded("ascii", "ignore") may do this.... :param txt: :return: a non-diacritic version of the text
406def strip_quotes(t): 407 """ 408 Run replace_diacritics first -- this routine only attempts to remove normal quotes ~ ', " 409 """ 410 return t.strip('"').strip("'")
Run replace_diacritics first -- this routine only attempts to remove normal quotes ~ ', "
436class ConfigUtility: 437 """ A utility to load parameter lists, CSV files, word lists, etc. from a folder *dir* 438 439 functions here take an Oxygen cfg parameter keyword or a file path. 440 If the keyword is valid and points to a valid file path, then the file path is used. 441 In otherwords, keywords are aliases for a file on disk. 442 443 Ex. 'mywords' = './cfg/mywords_v03_filtered.txt' 444 445 oxygen.cfg file would have this mapping. Your code just references 'mywords' to load it. 446 """ 447 448 def __init__(self, config=None, rootdir='.'): 449 450 # If config is None, then caller can still use loadDataFromFile(abspath, delim) for example. 451 # 452 self.config = config 453 self.rootdir = rootdir 454 455 def loadCSVFile(self, keyword, delim): 456 """ 457 Load a named CSV file. If the name is not a cfg parameter, the keyword name *is* the file. 458 """ 459 f = self.config.get(keyword) 460 if f is None: 461 f = keyword 462 463 path = os.path.join(self.rootdir, f) 464 return self.loadDataFromFile(path, delim) 465 466 def loadDataFromFile(self, path, delim): 467 """ 468 469 :param path: file path 470 :param delim: delimiter 471 :return: Array of tuples. 472 """ 473 if not os.path.exists(path): 474 raise Exception('File does not exist, FILE=%s' % path) 475 476 with open(path, 'r', encoding="UTF-8") as f: 477 filereader = csv.reader(f, delimiter=delim, lineterminator='\n', dialect="excel") 478 data = [] 479 for row in filereader: 480 if not row: 481 print("Blank line") 482 continue 483 first_cell = row[0].strip() 484 if first_cell.startswith('#'): 485 continue 486 data.append(row) 487 return data 488 489 def loadFile(self, keyword): 490 """ 491 Load a named word list file. 492 If the name is not a cfg parameter, the keyword name *is* the file. 
493 """ 494 filename = '' 495 496 if os.path.exists(keyword): 497 path = keyword 498 else: 499 filename = self.config.get(keyword) 500 if filename is None: 501 filename = keyword 502 503 path = os.path.join(self.rootdir, filename) 504 if not os.path.exists(path): 505 raise Exception('File does not exist, FILE=%s' % path) 506 507 return self.loadListFromFile(path) 508 509 def loadListFromFile(self, path): 510 """ 511 Load text data from a file. 512 Returns array of non-comment rows. One non-whitespace row per line. 513 """ 514 if not os.path.exists(path): 515 raise Exception('File does not exist, FILE=%s' % path) 516 517 with open(path, 'r', encoding="UTF-8") as fh: 518 termlist = [] 519 for line in fh: 520 line = line.strip() 521 if line.startswith('#'): 522 continue 523 if len(line) == 0: 524 continue 525 526 termlist.append(line.lower()) 527 528 return termlist
A utility to load parameter lists, CSV files, word lists, etc. from a folder dir
functions here take an Oxygen cfg parameter keyword or a file path. If the keyword is valid and points to a valid file path, then the file path is used. In other words, keywords are aliases for a file on disk.
Ex. 'mywords' = './cfg/mywords_v03_filtered.txt'
oxygen.cfg file would have this mapping. Your code just references 'mywords' to load it.
455 def loadCSVFile(self, keyword, delim): 456 """ 457 Load a named CSV file. If the name is not a cfg parameter, the keyword name *is* the file. 458 """ 459 f = self.config.get(keyword) 460 if f is None: 461 f = keyword 462 463 path = os.path.join(self.rootdir, f) 464 return self.loadDataFromFile(path, delim)
Load a named CSV file. If the name is not a cfg parameter, the keyword name is the file.
466 def loadDataFromFile(self, path, delim): 467 """ 468 469 :param path: file path 470 :param delim: delimiter 471 :return: Array of tuples. 472 """ 473 if not os.path.exists(path): 474 raise Exception('File does not exist, FILE=%s' % path) 475 476 with open(path, 'r', encoding="UTF-8") as f: 477 filereader = csv.reader(f, delimiter=delim, lineterminator='\n', dialect="excel") 478 data = [] 479 for row in filereader: 480 if not row: 481 print("Blank line") 482 continue 483 first_cell = row[0].strip() 484 if first_cell.startswith('#'): 485 continue 486 data.append(row) 487 return data
:param path: file path :param delim: delimiter :return: Array of tuples.
489 def loadFile(self, keyword): 490 """ 491 Load a named word list file. 492 If the name is not a cfg parameter, the keyword name *is* the file. 493 """ 494 filename = '' 495 496 if os.path.exists(keyword): 497 path = keyword 498 else: 499 filename = self.config.get(keyword) 500 if filename is None: 501 filename = keyword 502 503 path = os.path.join(self.rootdir, filename) 504 if not os.path.exists(path): 505 raise Exception('File does not exist, FILE=%s' % path) 506 507 return self.loadListFromFile(path)
Load a named word list file. If the name is not a cfg parameter, the keyword name is the file.
509 def loadListFromFile(self, path): 510 """ 511 Load text data from a file. 512 Returns array of non-comment rows. One non-whitespace row per line. 513 """ 514 if not os.path.exists(path): 515 raise Exception('File does not exist, FILE=%s' % path) 516 517 with open(path, 'r', encoding="UTF-8") as fh: 518 termlist = [] 519 for line in fh: 520 line = line.strip() 521 if line.startswith('#'): 522 continue 523 if len(line) == 0: 524 continue 525 526 termlist.append(line.lower()) 527 528 return termlist
Load text data from a file. Returns array of non-comment rows. One non-whitespace row per line.
531def load_list(path, lower=False): 532 """ 533 Load text data from a file. 534 Returns array of non-comment rows. One non-whitespace row per line. 535 :param path: file to load. 536 :param lower: Lowercased is optional. 537 :return: array of terms 538 """ 539 if not os.path.exists(path): 540 raise Exception('File does not exist, FILE=%s' % path) 541 542 with open(path, 'r', encoding="UTF-8") as fh: 543 termlist = [] 544 for line in fh: 545 line = line.strip() 546 if line.startswith('#') or not line: 547 continue 548 549 termlist.append(line.lower() if lower else line) 550 551 return termlist
Load text data from a file. Returns array of non-comment rows. One non-whitespace row per line. :param path: file to load. :param lower: Lowercased is optional. :return: array of terms
554def load_datafile(path, delim): 555 """ 556 :param path: file path 557 :param delim: delimiter 558 :return: Array of tuples. 559 """ 560 if not os.path.exists(path): 561 raise Exception(f'File does not exist, FILE={path}') 562 563 with open(path, 'r', encoding="utf-8") as f: 564 data = [] 565 text = f.read().replace('\uFEFF', '') 566 567 for line in text.split("\n"): 568 row = line.strip().split(delim) 569 if not row: 570 # print("Blank line") 571 continue 572 first_cell = row[0] 573 if first_cell.startswith('#'): 574 continue 575 data.append(row) 576 return data
:param path: file path :param delim: delimiter :return: Array of tuples.
579def ensure_dirs(fpath): 580 """ 581 Given a file path, ensure parent folders exist. 582 If path is intended to be a directory -- use os.makedirs(path) instead. 583 May throw exception -- caller should handle. 584 585 :path: path a file 586 """ 587 d = os.path.dirname(fpath) 588 if d and not os.path.isdir(d): 589 os.makedirs(d) 590 return True 591 return False
Given a file path, ensure parent folders exist. If path is intended to be a directory -- use os.makedirs(path) instead. May throw exception -- caller should handle.
:path: path a file