opensextant
def logger_config(logger_level: str, pkg: str):
    """
    Configure logging to stdout and return the logger for a package.

    The root logger is configured with a single stream handler (named after ``pkg``)
    at ``logger_level``; the logger named ``pkg`` is then set to the same level.

    :param logger_level: logging level name, e.g. "INFO" or "DEBUG"
    :param pkg: name of package; used as handler name and logger name
    :return: the logger named ``pkg``
    """
    dictConfig({
        'version': 1,
        'formatters': {
            'default': {
                'format': '%(levelname)s in %(module)s: %(message)s',
            }
        },
        'handlers': {
            pkg: {
                'class': 'logging.StreamHandler',
                'stream': sys.stdout,
                'formatter': 'default'
            }
        },
        'root': {
            'level': logger_level,
            'handlers': [pkg]
        }
    })

    _log = getLogger(pkg)
    _log.setLevel(logger_level)
    return _log


def pkg_resource_path(rsrc):
    """
    Resolve a resource name to a path under this package's ``resources`` folder.

    :param rsrc: file name or relative path of the resource
    :return: absolute path to the resource
    :raise Exception: if no file or folder exists at the resolved path
    """
    pkg_dir = os.path.dirname(os.path.abspath(__file__))
    fpath = os.path.join(pkg_dir, 'resources', rsrc)
    if not os.path.exists(fpath):
        raise Exception(f"Resource not found {rsrc} (tried {fpath})")

    return fpath


def make_HASC(cc, adm1, adm2=None):
    """
    Create a simple hierarchical path for a boundary: ``CC.ADM1`` or ``CC.ADM1.ADM2``.

    :param cc: country code
    :param adm1: ADM1 code; any empty value is written as "0"
    :param adm2: optional ADM2 code
    :return: dotted path string
    """
    if not adm1:
        adm1 = '0'
    if adm2:
        return f'{cc}.{adm1}.{adm2}'
    return f'{cc}.{adm1}'
87 :param lat: latitude 88 :param lon: longitude 89 :return: string 90 """ 91 return '{:2.5f},{:3.5f}'.format(float(lat), float(lon)) 92 93 94def validate_lat(f): 95 return (f >= -90.0) and (f <= 90.0) 96 97 98def validate_lon(f): 99 return (f >= -180.0) and (f <= 180.0) 100 101 102def parse_admin_code(adm1, delim="."): 103 """ 104 :param delim: 105 :param adm1: admin level 1 code 106 :return: ADM1 code if possible. 107 """ 108 if not adm1: 109 return "0" 110 111 code = adm1 112 if "?" in adm1: 113 code = "0" 114 elif delim in adm1: 115 cc2, code = adm1.split(delim) 116 # Normalize Country-level. Absent ADM1 levels are assigned "0" anyway 117 if code.strip() in {"", None, "0", "00"}: 118 code = "0" 119 return code 120 121 122def distance_cartesian(x1, y1, x2, y2): 123 """ 124 Given X1, Y1 and X2, Y2 provide the 2-D Cartesian distance between two points. 125 """ 126 xdist = x2 - x1 127 ydist = y2 - y1 128 return sqrt(xdist * xdist + ydist * ydist) 129 130 131EARTH_RADIUS_WGS84 = 6378.137 * 1000 # M, True: 6378.137 132 133 134def distance_haversine(ddlon1, ddlat1, ddlon2, ddlat2): 135 """ 136 Returns distance in meters for given decimal degree Lon/Lat (X,Y) pair 137 138 http://www.movable-type.co.uk/scripts/latlong.html 139 """ 140 lat1 = radians(ddlat1) 141 lon1 = radians(ddlon1) 142 lat2 = radians(ddlat2) 143 lon2 = radians(ddlon2) 144 dLat = lat2 - lat1 145 dLon = lon2 - lon1 146 a = (sin(dLat / 2) * sin(dLat / 2)) + (cos(lat1) * cos(lat2) * sin(dLon / 2) * sin(dLon / 2)) 147 c = 2 * atan2(sqrt(a), sqrt(1 - a)) 148 return int(EARTH_RADIUS_WGS84 * c) 149 150 151def location_accuracy(conf, prec_err): 152 """ 153 Both confidence and precision error are required to be non-zero and positive. 154 155 Scale ACCURACY by confidence, and inversely log10( R^2 ) 156 Decreasing accuracy with increasing radius, but keep scale on the order of visible things, 157 e.g., 0.01 to 1.00. This is only one definition of accuracy. 
158 159 Consider confidence = 100 (aka 100% chance we have the right location) 160 161 * Country precision ~ +/- 100KM is accuracy = 0.091 162 * GPS precision is 10 M precision is accuracy 0.33 163 * 1M precision , accuracy = 1.0, (1 / (1+log(1*1)) = 1/1. In other words a 1m error is basically "perfect" 164 165 :param conf: confidence on 100 point scale (0-100) 166 :param prec_err: error in location precision, meters 167 :return: 168 """ 169 if not conf or not prec_err: 170 return 0 171 if conf < 0 or prec_err < 0: 172 return 0 173 scale = 0.01 * conf 174 inv_prec = 1 + log10(prec_err * prec_err) 175 acc = scale / inv_prec 176 return float(f"{acc:0.4f}") 177 178 179def _estimate_geohash_precision(r: int): 180 """ 181 Returns hueristic geohash length for the given radius in meters. 182 183 :param r: radius in meters 184 """ 185 if r > 1000000: 186 return 1 187 elif r > 250000: 188 return 2 189 elif r > 50000: 190 return 3 191 elif r > 10000: 192 return 4 193 elif r > 1000: 194 return 5 195 elif r > 250: 196 return 6 197 elif r > 50: 198 return 7 199 elif r > 1: 200 return 8 201 else: 202 raise Exception(f"Not thinking about sub-meter resolution. radius={r}") 203 204 205def _ll2dict(p: LL): 206 return {"lat": p.lat, "lon": p.lon} 207 208 209def _ll2geohash(p: LL): 210 return geohash_encode(lat=p.lat, lon=p.lon) 211 212 213def point2geohash(lat: float, lon: float, precision=6): 214 return geohash_encode(lat=lat, lon=lon, precision=precision) 215 216 217def geohash2point(gh): 218 return (float(x) for x in geohash_decode(gh)) 219 220 221def radial_geohash(lat, lon, radius): 222 """ 223 Propose geohash cells for a given radius from a given point 224 """ 225 corners = {} 226 # Find clockwise points at a radius, E, N, S, W. Bearing for North is 0deg. 
def geohash_cells_radially(lat: float, lon: float, radius: int):
    """
    Create a set of geohash prefixes that contain the given area defined by lat,lon + radius.

    The geohashes found at ``radius`` meters N, E, S and W of the point are truncated to one
    character less than the precision estimated for the radius.
    """
    ensw = radial_geohash(lat, lon, radius)
    # Never truncate below one character: an empty prefix is not a geohash cell.
    prefix_len = max(1, _estimate_geohash_precision(radius) - 1)
    return {gh[0:prefix_len] for gh in ensw.values()}


def geohash_cells(gh: str, radius: int):
    """
    For a radius in meters generate the cells contained within or touched by that radius.
    This is approximate precision based on:
    https://en.wikipedia.org/wiki/Geohash which suggests this approximation could be done mathematically
    :param gh: geohash of the center
    :param radius: radius in meters
    :return: Dict of 8 directionals ~ E, N, S, W; NE, SE, SW, NW. If the given geohash is shorter than the
      precision estimated for the radius, the only cell returned is "CENTROID", i.e. radius=2000 (meters)
      for a geohash such as `9q5t`
    """
    radius_error = _estimate_geohash_precision(radius)
    if len(gh) < radius_error:
        return {"CENTROID": gh}
    return geohash_neighbors(gh[0:radius_error])


class Coordinate:
    """
    Convenient class for Lat/Lon pair.
    Expects a row dict with 'lat' and 'lon',
    or kwd args 'lat', 'lon'
    @param row default dictionary
    """

    def __init__(self, row, lat=None, lon=None):
        # TODO: set coordinate to X, Y = None, None by default.
        self.X = 0.0
        self.Y = 0.0
        self.mgrs = None
        self.lat = self.Y
        self.lon = self.X
        # Set geohash on demand, otherwise it can be computed from lat,lon
        self.geohash = None

        # Values in the row take precedence over the keyword arguments.
        if row and 'lat' in row and 'lon' in row:
            lat = row['lat']
            lon = row['lon']

        # The location is only applied when both values are truthy (not None, "" or 0).
        if lat and lon:
            self.set(lat, lon)

    def validate(self):
        """
        True if the location is in range and neither X nor Y is exactly 0.0.
        NOTE(review): this also rejects real points on the equator or prime meridian --
        confirm that treating any zero as "unset" is intended.
        """
        return validate_lat(self.Y) and validate_lon(self.X) and (self.X != 0.0 and self.Y != 0.0)

    def set(self, lat, lon):
        """ Set the location lat, lon; values are converted with float()"""
        self.X = float(lon)
        self.Y = float(lat)
        self.lat = self.Y
        self.lon = self.X

    def format_coord(self):
        """Format as "lat,lon" using the module-level format_coord()."""
        return format_coord(self.Y, self.X)

    def string_coord(self):
        """Format as "lat,lon" using plain str() of each value."""
        return ",".join((str(self.lat), str(self.lon)))

    def __str__(self):
        if self.Y:
            return format_coord(self.Y, self.X)
        return 'unset'


def bbox(lat: float, lon: float, radius: int):
    """
    Calculate coordinates for SW and NE corners of a SQUARE bounding box of edge length 2 x radius
    :param lat: decimal degree latitude
    :param lon: decimal degree longitude
    :param radius: meters from center point
    :return: tuple of Coordinate (SW corner, NE corner)
    """
    # LatLon takes latitude first; the center was previously built as LL(lon, lat).
    sw, ne = LL(lat, lon).boundsOf(2 * radius, 2 * radius)
    return Coordinate(None, lat=sw.lat, lon=sw.lon), Coordinate(None, lat=ne.lat, lon=ne.lon)


def centroid(arr: list):
    """
    Average a list of points.

    :param arr: a list of numeric coordinates (y,x)
    :return: Coordinate -- the average of sum(y), sum(x); None for an empty list
    """
    n = len(arr)
    if not n:
        return None
    if n == 1:
        y, x = arr[0]
        return Coordinate(None, lat=y, lon=x)

    lat_sum = math.fsum(y for y, x in arr)
    lon_sum = math.fsum(x for y, x in arr)
    return Coordinate(None, lat=lat_sum / n, lon=lon_sum / n)


class Place(Coordinate):
    """
    A named location: a Coordinate plus gazetteer metadata (IDs, country and
    administrative codes, feature coding, population).

    This Python API hopes to simplify the concepts in the Java API.
    """

    def __init__(self, pid, name, lat=None, lon=None):
        Coordinate.__init__(self, None, lat=lat, lon=lon)
        # Internal DB or Gazetteer ID
        self.id = None
        # Public or standards Place ID, e.g., GNS, ISO, etc.
        self.place_id = pid
        self.name = name

        self.is_ascii = False
        self.is_upper = False
        self.adm1_postalcode = None  # Province Postal CODE?
        self.place_postalcode = None  # ZIP CODE?
        self.name_type = None
        self.name_script = None  # Code or label, e.g. L or LATIN
        self.country = None
        self.country_code = None
        self.country_code_fips = None
        self.feature_class = None
        self.feature_code = None
        self.adm1 = None
        self.adm1_name = None
        self.adm1_iso = None  # Alternate ISO-based ADM1 code used by NGA and others.
        self.adm2 = None
        self.adm2_name = None
        self.source = None
        self.name_bias = 0.0
        self.id_bias = 0.0
        # Precision is actually "Precision Error" in meters
        self.precision = -1
        self.method = None
        # Population stats, if available. Scale is a power-of-2 scale
        # starting at about pop of 2^14 as 0, 32K=1, 64K=2, etc.
        self.population = -1
        self.population_scale = 0
        self.hierarchical_path = None

        # Internal fields for gazetteer curation and text analytics:
        self.name_group = ""
        self.search_only = False

    def has_coordinate(self):
        """True if this place has a valid location; see Coordinate.validate()."""
        return self.validate()

    def get_location(self):
        """ Returns (LAT, LON) tuple
        @return: tuple, (lat,lon)
        """
        return self.Y, self.X

    def set_location(self, lat, lon):
        """Set the location; same as set()."""
        self.set(lat, lon)

    def __str__(self):
        return '{}, {} @({})'.format(self.name, self.country_code, self.string_coord())

    def format_feature(self):
        """
        Yield a consolidated feature coding.
        :return: X/xxxx format, or just the feature class if there is no feature code
        """
        if self.feature_code:
            return f"{self.feature_class}/{self.feature_code}"
        return self.feature_class


class Country(Coordinate):
    """
    Country metadata
    """

    def __init__(self):
        Coordinate.__init__(self, None)
        self.cc_iso2 = None
        self.cc_iso3 = None
        self.cc_fips = None
        self.place_id = None
        self.name = None
        self.namenorm = None
        self.name_type = None
        self.aliases = []
        self.is_territory = False
        self.is_unique_name = False
        self.timezones = []
        self.languages = set()
        self.primary_language = None

    @property
    def is_name_unique(self):
        # Alias of is_unique_name: load_countries() assigns this spelling, which otherwise
        # created a second attribute and left is_unique_name permanently False.
        return self.is_unique_name

    @is_name_unique.setter
    def is_name_unique(self, value):
        self.is_unique_name = value

    def __str__(self):
        return f'{self.name} ({self.cc_iso2})'


def country_as_place(ctry: Country, name: str, name_type="N", oid=None):
    """
    Convert to Place.
    :param ctry: Country object
    :param name: the name to use
    :param name_type: name type code for the new Place
    :param oid: row ID
    :return: Place with feature A/PCLI, or A/PCL for a territory, located at the country's lat/lon
    """
    pl = Place(ctry.place_id, name)
    pl.id = oid
    pl.name_type = name_type
    pl.feature_class = "A"
    pl.feature_code = "PCL" if ctry.is_territory else "PCLI"
    pl.name_bias = 0.0
    pl.id_bias = 0.0
    pl.country_code = ctry.cc_iso2
    pl.country_code_fips = ctry.cc_fips
    pl.adm1 = "0"
    pl.source = "ISO"
    pl.set_location(ctry.lat, ctry.lon)
    return pl
def get_us_province(adm1: str):
    """
    Look up a US state or territory loaded by load_us_provinces().

    :param adm1: ADM1 code, postal code, or the HASC path built from either (e.g. via make_HASC)
    :return: Place, or None if the key is not known
    :raise Exception: if load_us_provinces() has not been run yet
    """
    if not usstates:
        raise Exception("Run load_us_provinces() first")
    return usstates.get(adm1)


def load_us_provinces():
    """
    Load, store internally and return the LIST of US states.
    NOTE: Place objects for US States have a location (unlike list of world provinces).
    To get location and feature information in full, you must use the SQLITE DB or Xponents Solr.
    :return: array of Place objects
    """
    csvpath = pkg_resource_path('us-state-metadata.csv')
    usstate_places = []
    with open(csvpath, 'r', encoding="UTF-8") as fh:
        columns = ["POSTAL_CODE", "ADM1_CODE", "STATE", "LAT", "LON", "FIPS_CC", "ISO2_CC"]
        for row in get_csv_reader(fh, columns):
            # Skip the header row.
            if row['POSTAL_CODE'] == 'POSTAL_CODE':
                continue

            cc = row["ISO2_CC"]
            # The first two characters of the ADM1_CODE column are dropped
            # (presumably a country prefix -- confirm against the data file).
            adm1_code = row["ADM1_CODE"][2:]
            postal_code = row["POSTAL_CODE"]
            # HASC path
            place_id = make_HASC(cc, adm1_code)
            postal_id = make_HASC(cc, postal_code)
            adm1 = Place(place_id, row["STATE"], lat=row["LAT"], lon=row["LON"])
            adm1.feature_class = "A"
            adm1.feature_code = "ADM1"
            adm1.name_type = "N"
            adm1.geohash = geohash_encode(adm1.lat, adm1.lon, precision=6)

            adm1.country_code = cc
            adm1.adm1 = adm1_code
            adm1.adm1_postalcode = postal_code
            adm1.source = "OpenSextant"

            # Register under the bare codes and under both HASC paths.
            for key in (adm1_code, postal_code, place_id, postal_id):
                usstates[key] = adm1

            usstate_places.append(adm1)
    return usstate_places


def load_provinces():
    """
    Load, store and return a dictionary of ADM1 boundary names - provinces, states, republics, etc.
    NOTE: Location information is not included in this province listing. Just Country, ADM1, Name tuples.
    NOTE: This reflects only GEONAMES ADMIN1 CODES ASCII -- which portrays most of the world (except US) as FIPS,
    not ISO.
    :return: dict
    """
    return load_world_adm1()


def load_world_adm1():
    """
    Load, store and return a dictionary of ADM1 boundary names - provinces, states, republics, etc.
    Coding for ADM1 is FIPS based mostly
    :return: dict of HASC path to Place
    """
    # Load local country data first, if you have it. US is only one so far.
    load_us_provinces()

    SOURCE_ID = "G"
    csvpath = pkg_resource_path(os.path.join('geonames.org', 'admin1CodesASCII.txt'))

    with open(csvpath, 'r', encoding="UTF-8") as fh:
        for line in fh:
            row = line.strip().split('\t')
            # Blank or truncated lines carry no code + name pair; they used to raise IndexError.
            if len(row) < 2:
                continue
            # The ID column (4th) may be missing once trailing whitespace is stripped.
            geo_id = row[3] if len(row) > 3 else ""
            src_id = f"{SOURCE_ID}{geo_id}" if geo_id else None
            adm1 = Place(src_id, row[1])
            adm1.feature_class = "A"
            adm1.feature_code = "ADM1"
            adm1.name_type = "N"

            cc_adm1 = row[0].split('.', 2)
            adm1.country_code = cc_adm1[0]
            adm1.adm1 = parse_admin_code(cc_adm1[1] if len(cc_adm1) > 1 else None)
            adm1.source = "G"  # Geonames.org coded.
            hasc = make_HASC(adm1.country_code, adm1.adm1)
            if adm1.country_code == "US":
                adm1.source = "USGS"
                if hasc in usstates:
                    # Align the locally loaded US state with the Geonames name and key.
                    us_place = usstates[hasc]
                    us_place.name = adm1.name
                    hasc = make_HASC(us_place.country_code, us_place.adm1)

            adm1_by_hasc[hasc] = adm1
    return adm1_by_hasc


def get_province(cc, adm1):
    """ REQUIRES you load_provinces() first.
    :param cc: country code
    :param adm1: ADM1 code
    :return: Place registered for the HASC path of cc + adm1, or None
    """
    return adm1_by_hasc.get(make_HASC(cc, adm1))
def load_major_cities():
    """
    Loads City geo/demographic information -- this does not try to parse all name variants.

    This produces Geonames use of FIPS codes.
    :return: list of Place, one per well-formed row of cities15000.txt
    """
    csvpath = pkg_resource_path(os.path.join('geonames.org', 'cities15000.txt'))

    from csv import reader
    cities = []
    with open(csvpath, 'r', encoding="UTF-8") as fh:
        rdr = reader(fh, dialect="excel", delimiter="\t")
        for line in rdr:
            # Only rows with exactly 19 columns are used.
            if len(line) != 19:
                continue
            if not line[4]:
                print("Not location info for City ~ ", line[0])
                continue
            #          ID       NAME     LAT                  LON
            pl = Place(line[0], line[1], lat=float(line[4]), lon=float(line[5]))
            pl.feature_class = line[6]
            pl.feature_code = line[7]
            pl.country_code = line[8]
            alt_cc = line[9]
            if alt_cc and alt_cc != pl.country_code:
                print("Alternate Country Code", alt_cc)
            pl.adm1 = parse_admin_code(line[10])
            pl.adm2 = line[11]
            # pl.geohash = geohash_encode(pl.lat, pl.lon, precision=6)
            try:
                pl.population = int(line[14])
                pl.population_scale = popscale(pl.population, feature="city")
            except ValueError:
                # Population is blank or not an integer: keep the Place defaults.
                pass
            cities.append(pl)
    return cities


# Power of two at which the population scale starts, per feature type.
_pop_scale = {
    "city": 13,  # 2^13 ~ 8,000
    "district": 15,  # 2^15 ~ 32,000
    "province": 17,  # 2^17 ~ 130,000
}


def popscale(population, feature="city"):
    """
    Given a population in context of the feature -- provide a
    approximation of the size of the feature on a roughly 10 point scale.

    Approximations for 10 points:
      Largest city is ~15 million
      // Few cities top 30 million, e.g., 2^25.  popscale = 25 - 13 = 12.
      Largest province is ~135 million

    :param population: number of inhabitants
    :param feature: city, district, or province allowed. Any other value uses a shift of 20.
    :return: non-negative integer index: log2(population) minus the feature's shift, floored at 0.
       The value is not capped, so very large populations can exceed 10.
    """
    if population < 1:
        return 0
    shifter = _pop_scale.get(feature, 20)
    # log2() is exact for powers of two, whereas log(x, 2) can land just below an
    # integer and be truncated to the wrong index.
    index = math.log2(population) - shifter
    return int(index) if index > 0 else 0


def is_political(feat_code: str):
    """Test a feature code: True if it starts with "PCL"."""
    if not feat_code:
        return False
    return feat_code.startswith("PCL")


def is_country(feat_code: str):
    """Test a feature code: True only for "PCLI"."""
    return "PCLI" == feat_code


def is_administrative(feat: str):
    """True if the feature class is "A", ignoring case."""
    if not feat:
        return False
    return "A" == feat.upper()


def is_populated(feat: str):
    """True if the feature class is "P", ignoring case."""
    if not feat:
        return False
    return "P" == feat.upper()


def is_academic(feat_class: str, feat_code: str) -> bool:
    """
    True for feature class "S" with a designation code starting with "SCH".

    :param feat_class: geonames class
    :param feat_code: geonames designation code
    :return: always a bool, also when either argument is empty or None
    """
    return bool(feat_class and feat_code and feat_class == "S" and feat_code.startswith("SCH"))
class TextEntity:
    """
    A Text span.

    classes and routines that align with Java org.opensextant.data and org.opensextant.extraction

    * TextEntity: represents a span of text
    * TextMatch: a TextEntity matched by a particular routine.  This is the basis for most all
    extractors and annotators in OpenSextant.
    """

    def __init__(self, text, start, end):
        self.text = text
        self.start = start
        self.end = end
        # Length stays -1 unless both offsets are present and non-negative.
        self.len = -1
        self.is_duplicate = False
        self.is_overlap = False
        self.is_submatch = False
        if self._is_valid():
            self.len = self.end - self.start

    def __str__(self):
        return f"{self.text}({self.start},{self.end})"

    def _is_valid(self):
        """True if both offsets are set and non-negative."""
        if self.start is None or self.end is None:
            return False
        return self.start >= 0 and self.end >= 0

    def contains(self, x1):
        """ if this span contains x1
        :param x1: either a span (any object with start and end) or a single offset.
          Both ends of this span are inclusive.
        """
        if not self._is_valid():
            return False
        if hasattr(x1, "start") and hasattr(x1, "end"):
            return self.start <= x1.start < x1.end <= self.end
        # Plain offset, as passed by is_within(); previously this raised AttributeError.
        return self.start <= x1 <= self.end

    def exact_match(self, t):
        """True if t has the same offsets as this valid span."""
        return t.start == self.start and t.end == self.end and self._is_valid()

    def is_within(self, t):
        """
        if the given annotation, t, contains this
        :param t: the possibly enclosing span
        :return: True if both offsets of this span lie inside t
        """
        return t.contains(self.start) and t.contains(self.end)

    def is_after(self, t):
        """True if this span starts after t ends."""
        return self.start > t.end

    def is_before(self, t):
        """True if this span ends before t starts."""
        return self.end < t.start

    def overlaps(self, t):
        """
        Determine if t partially overlaps self, i.e. each span extends past the other on one side.
        If one span is contained entirely within the other, that is not considered overlap.

        NOTE(review): the original comments said a longer t sharing the same start or end should
        also count as overlap, but that branch could never be true. The effective behavior is
        kept here -- confirm which is intended.
        :param t: other span
        :return: True for a right or left skew
        """
        #    a1      a2
        #  t1     t2           RIGHT skew: t begins first and ends inside self
        #    a1      a2
        #        t1      t2    LEFT skew: t begins inside self and ends after it
        skew_right = t.start < self.start <= t.end < self.end
        skew_left = self.start < t.start <= self.end < t.end
        return skew_right or skew_left


class TextMatch(TextEntity):
    """
    An entity matched by some tagger; it is a text span with lots of metadata.
    """

    def __init__(self, *args, label=None):
        TextEntity.__init__(self, *args)
        self.id = None
        self.label = label
        self.filtered_out = False
        self.attrs = dict()

    def __str__(self):
        return f"{self.label}/{self.text}({self.start},{self.end})"

    def populate(self, attrs: dict):
        """
        Populate a TextMatch to normalize the set of attributes -- separate class fields on TextMatch from additional
        optional attributes.
        :param attrs: dict of standard Xponents API outputs.
        :return:
        """
        self.id = attrs.get("match-id")
        self.label = attrs.get("type")
        self.attrs.update(attrs)
        self.filtered_out = get_bool(self.attrs.get("filtered-out"))
        for k in ['len', 'length']:
            if k in self.attrs:
                self.len = self.attrs.get(k)
        # Derive a missing end offset from start + length. The -1 "unknown length" sentinel
        # and a missing start are excluded so that end is never set before start.
        has_start = self.start is not None and self.start >= 0
        has_len = self.len is not None and self.len >= 0
        if has_start and has_len and not self.end:
            self.end = self.start + self.len

        # Remove attribute keys that may be confusing.
        for fld in ['offset', 'start', 'end', 'len', 'length', 'type', 'filtered-out', 'text', 'matchtext']:
            if fld in self.attrs:
                del self.attrs[fld]

    def normalize(self):
        """
        Optional, but recommended routine to normalize the matched data.
        That is, parse fields, uppercase, streamline punctuation, etc.
        As well, given such normalization result, this is the opportunity to additionally
        validate the match.
        :return:
        """
        pass
class Extractor(ABC):
    """Base class for extractors: subclasses implement extract()."""

    def __init__(self):
        self.id = None

    @abstractmethod
    def extract(self, text, **kwargs):
        """

        :param text: Unicode text input
        :keyword features: an array of features to extract, e.g., "coordinate", "place", "MONEY"
        :return: array of TextMatch
        """
        pass


def render_match(m):
    """
    Render the basic fields of a match as a dict.

    :param m: TextMatch
    :return: dict with type, text, offset, length and filtered-out; None if m is not a TextMatch
    """
    if not isinstance(m, TextMatch):
        return None
    return {
        "type": m.label,
        "text": m.text,
        "offset": m.start,
        "length": m.len,
        "filtered-out": m.filtered_out
    }


NOT_SUBMATCH = 0
IS_SUBMATCH = 1
IS_DUPLICATE = 2


def reduce_matches(matches):
    """
    Mark each match if it is a submatch or overlap or exact duplicate of other.
    Flags are set in place on the items (is_duplicate, is_submatch, is_overlap).

    :param matches: array of TextMatch (or TextEntity). This is the more object oriented version
    of reduce_matches_dict. Items flagged ``filtered_out`` are ignored; items without that
    attribute (plain TextEntity) are treated as not filtered.
    :return: None
    """
    if len(matches) < 2:
        return
    for position, M in enumerate(matches, start=1):
        if getattr(M, "filtered_out", False):
            continue
        m1 = M.start
        m2 = M.end

        # Compare M against every later N.
        # Cannot exit loop on first match or overlap.
        for N in matches[position:]:
            if getattr(N, "filtered_out", False):
                continue

            n1 = N.start
            n2 = N.end

            if m2 < n1 or m1 > n2:
                # M entirely before N
                # M entirely after N
                continue

            if n1 == m1 and n2 == m2:
                # Exact duplicate - Mark N as dup, as M is first in array.
                N.is_duplicate = True
            elif n1 <= m1 < m2 <= n2:
                # M is within N span
                M.is_submatch = True
            elif m1 <= n1 < n2 <= m2:
                # N is within M span
                N.is_submatch = True
            else:
                # The spans touch or intersect without one containing the other:
                #   n1    n2
                #      m1     m2
                M.is_overlap = True
                N.is_overlap = True
# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# Language Code Support
# ISO 639 code book support -- Language codes
# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

class Language:
    """
    Language Represents a single code/name pair
    Coding is 3-char or 2-char, either is optional.
    In some situations there are competing 2-char codes in code books, such as Lib of Congress (LOC)
    """

    def __init__(self, iso3, iso2, nmlist: list, locale=None):
        if nmlist and not isinstance(nmlist, list):
            raise Exception("Name list is a list of names for the language. The first one is the default.")
        self.code_iso3 = iso3
        self.code = iso2
        self.names = nmlist
        self.locale = locale

    def get_name(self):
        """Return the default (first) name, or None if there are no names."""
        if self.names:
            return self.names[0]
        return None

    def name_code(self):
        """Return the default name in lower case, or None if there are no names."""
        if self.names:
            return self.names[0].lower()
        return None

    def __str__(self):
        return f"{self.name_code()}({self.code})"


# ISO 639 lookup
language_map = {}


def list_languages():
    """
    List out a flattened list of languages, de-duplicated by language code:
    a language is listed the first time its 2-char or 3-char code is seen.

    TODO: alternatively list out every language
    :return: list of Language
    """
    load_languages()
    langs = []
    visited = set()
    for L in language_map.values():
        listed = False
        for code in (L.code, L.code_iso3):
            if not code or code in visited:
                continue
            visited.add(code)
            # A language with both a 2-char and 3-char code used to be appended twice.
            if not listed:
                langs.append(L)
                listed = True
    return langs


def add_language(lg: Language, override=False):
    """
    The language map for ISO 2-alpha and 3-alpha codes should be protected from language IDs that are dialect or locale

    "en" ==> en-au, en-gb, en-us, etc.?  This is ambiguous
    The reverse is true -- "en-gb" is at least "en" or "eng" english

    :param lg: Language to register under its codes, locale and lower-cased names
    :param override: True to allow remapping of keys longer than 3 characters
    :return:
    :raise Exception: if a key longer than 3 characters is already mapped and override is not set
    """
    if not lg:
        return

    codes = []
    if lg.code:
        codes.append(lg.code.lower())
    if lg.code_iso3:
        codes.append(lg.code_iso3.lower())
    if lg.locale:
        codes.append(lg.locale.lower())

    if lg.names:
        for nm in lg.names:
            lang_name_norm = nm.lower()
            codes.append(lang_name_norm)
            if "modern" in lang_name_norm:
                # Also register the part before the first comma; that short form is
                # allowed to replace an existing entry.
                nm2 = get_list(lang_name_norm, delim=",")[0]
                codes.append(nm2)
                override = True

    # dict.fromkeys() de-duplicates while keeping a stable order, so a failure
    # happens at the same key on every run.
    for k in dict.fromkeys(codes):
        exists = k in language_map

        # coding rule: 2 or 3 char alpha codes for ISO or Biblio code books are not overridden.
        if len(k) <= 3 and exists:
            continue

        if exists and not override:
            raise Exception(f"Forcibly remap language code? {k}")

        language_map[k] = lg
1255 if k.isalpha(): 1256 return language_map.get(k) 1257 1258 if k in language_map: 1259 return language_map.get(k) 1260 1261 for delim in ["-", " ", "_"]: 1262 k = k.split(delim)[0] 1263 if k in language_map: 1264 return language_map.get(k) 1265 return None 1266 1267 1268def get_lang_name(code: str): 1269 if not code: 1270 return None 1271 1272 L = get_language(code) 1273 if L: 1274 return L.get_name() 1275 1276 raise Exception(f"No such language ID {code}") 1277 1278 1279def get_lang_code(txt: str): 1280 L = get_language(txt) 1281 if L: 1282 return L.code 1283 1284 raise Exception(f"No such language ID {txt}") 1285 1286 1287def is_lang_romance(lg: str): 1288 """If spanish, portuguese, italian, french, romanian""" 1289 L = get_language(lg) 1290 if not L: 1291 return False 1292 c = L.code 1293 return c in {"es", "pt", "it", "fr", "ro"} 1294 1295 1296def is_lang_euro(lg: str): 1297 """ 1298 true if lang is European -- romance, german, english, etc 1299 :param lg: 1300 :return: 1301 """ 1302 L = get_language(lg) 1303 if not L: 1304 return False 1305 c = L.code 1306 return c in {"es", "pt", "it", "fr", "ro", 1307 "de", "en", 1308 "bu", "cz", "po", "nl", "el", "sq"} 1309 1310 1311def is_lang_english(lg: str): 1312 L = get_language(lg) 1313 if not L: 1314 return False 1315 return L.code == "en" 1316 1317 1318def is_lang_cjk(lg: str): 1319 L = get_language(lg) 1320 if not L: 1321 return False 1322 return L.code in {"zh", "zt", "ko", "ja"} 1323 1324 1325def is_lang_chinese(lg: str): 1326 L = get_language(lg) 1327 if not L: 1328 return False 1329 return L.code in {"zh", "zt"} 1330 1331 1332IGNORE_LANGUAGES = {"gaa"} 1333 1334 1335def load_languages(): 1336 global __language_map_init 1337 if __language_map_init: 1338 return 1339 1340 fpath = pkg_resource_path("ISO-639-2_utf-8.txt") 1341 langset = load_datafile(fpath, delim="|") 1342 for lang in langset: 1343 lang_names = get_list(lang[3], delim=";") 1344 1345 iso3 = lang[0] 1346 bib3 = lang[1] 1347 if iso3 and 
iso3.startswith("#"): 1348 continue 1349 1350 if iso3 in IGNORE_LANGUAGES: 1351 continue 1352 1353 iso2=lang[2] 1354 L = Language(iso3, iso2, lang_names) 1355 add_language(L) 1356 if bib3: 1357 L = Language(bib3, iso2, lang_names) 1358 add_language(L, override=True) 1359 1360 # Some odd additions -- Bibliographic vs. Terminologic codes may vary. 1361 # FRE vs. FRA is valid for French, for example. 1362 # 1363 for lg in [Language("fra", "fr", ["French"]), 1364 1365 Language("zho", "zh", ["Chinese"], locale="zh-cn"), 1366 1367 Language(None, "zt", ["Traditional Chinese"]), 1368 Language(None, "zt", ["Traditional Chinese/Taiwain"], locale="zh-tw"), 1369 1370 Language("prs", "dr", ["Dari", "Afghan Persian"], locale="fa-AF"), 1371 Language("prs", "dr", ["Dari", "Afghan Persian"]), 1372 Language("fas", "fa", ["Farsi", "Persian"], locale="fa-IR"), 1373 Language("eng", "en", ["English"]), 1374 1375 Language("eng", "en", ["English/British"], locale="en-gb"), 1376 Language("eng", "en", ["English/USA"], locale="en-us"), 1377 Language("eng", "en", ["English/United Kingdom"], locale="en-uk"), 1378 Language("eng", "en", ["English/Canadian"], locale="en-ca"), 1379 Language("eng", "en", ["English/Australian"], locale="en-au")]: 1380 1381 add_language(lg, override=True) 1382 1383 1384 __language_map_init = True
def logger_config(logger_level: str, pkg: str):
    """
    Configure console logging and hand back a logger for a package.

    Installs, via ``dictConfig``, a single stdout stream handler registered under
    the name ``pkg`` and attached to the root logger at ``logger_level``.

    :param logger_level: logging level, e.g. "INFO" or "DEBUG"
    :param pkg: Name of package; used as both the handler name and the logger name
    :return: the logger named ``pkg`` with its level set to ``logger_level``
    """
    stdout_handler = {
        'class': 'logging.StreamHandler',
        'stream': sys.stdout,
        'formatter': 'default'
    }
    default_formatter = {'format': '%(levelname)s in %(module)s: %(message)s'}
    dictConfig({
        'version': 1,
        'formatters': {'default': default_formatter},
        'handlers': {pkg: stdout_handler},
        'root': {'level': logger_level, 'handlers': [pkg]}
    })

    pkg_logger = getLogger(pkg)
    pkg_logger.setLevel(logger_level)
    return pkg_logger
LOGGING :param logger_level: :param pkg: Name of package :return:
def make_HASC(cc, adm1, adm2=None):
    """
    Create a simple hierarchical (dotted) path for a boundary.

    :param cc: country code
    :param adm1: ADM1 code; any falsy value is replaced by '0'
    :param adm2: optional ADM2 code, appended only when truthy
    :return: 'CC.ADM1' or 'CC.ADM1.ADM2'
    """
    levels = [cc, adm1 if adm1 else '0']
    if adm2:
        levels.append(adm2)
    return '.'.join(str(level) for level in levels)
Create a simple hierarchical path for a boundary :param cc: :param adm1: :param adm2: :return:
def format_coord(lat, lon):
    """
    Render a coordinate as "lat,lon" text, five decimal places each.

    :param lat: latitude (number or numeric string)
    :param lon: longitude (number or numeric string)
    :return: string
    """
    y = float(lat)
    x = float(lon)
    return f'{y:2.5f},{x:3.5f}'
Formats each value with five decimal places, e.g. "12.34567,-98.76543". :param lat: latitude :param lon: longitude :return: string
def parse_admin_code(adm1, delim="."):
    """
    Extract a normalized ADM1 code from a raw code such as "US.CA" or "06".

    :param adm1: admin level 1 code, optionally prefixed as "<CC><delim><ADM1>"
    :param delim: separator between country code and ADM1 code
    :return: ADM1 code if possible.  "0" is returned for empty/unknown codes
             (falsy input, any code containing "?", or "", "0", "00").
    """
    if not adm1:
        return "0"
    if "?" in adm1:
        return "0"

    code = adm1
    if delim in adm1:
        # Take the segment right after the country code.  Indexing rather than
        # two-name unpacking keeps deeper paths such as "US.CA.037" from
        # raising ValueError.
        code = adm1.split(delim)[1]

    # Normalize Country-level.  Absent ADM1 levels are assigned "0" anyway
    if code.strip() in {"", "0", "00"}:
        return "0"
    return code
:param delim: :param adm1: admin level 1 code :return: ADM1 code if possible.
def distance_cartesian(x1, y1, x2, y2):
    """
    Compute the 2-D Cartesian (straight-line) distance between the points
    (x1, y1) and (x2, y2).
    """
    dx, dy = x2 - x1, y2 - y1
    return sqrt(dx * dx + dy * dy)
Given X1, Y1 and X2, Y2 provide the 2-D Cartesian distance between two points.
def distance_haversine(ddlon1, ddlat1, ddlon2, ddlat2):
    """
    Haversine distance, in whole meters, between two decimal-degree points.
    Note the argument order is Lon/Lat (X, Y) for each point.

    http://www.movable-type.co.uk/scripts/latlong.html
    """
    phi1, lam1 = radians(ddlat1), radians(ddlon1)
    phi2, lam2 = radians(ddlat2), radians(ddlon2)
    sin_half_dlat = sin((phi2 - phi1) / 2)
    sin_half_dlon = sin((lam2 - lam1) / 2)
    a = (sin_half_dlat * sin_half_dlat) + (cos(phi1) * cos(phi2) * sin_half_dlon * sin_half_dlon)
    central_angle = 2 * atan2(sqrt(a), sqrt(1 - a))
    # Truncated (not rounded) to an integer number of meters.
    return int(EARTH_RADIUS_WGS84 * central_angle)
Returns distance in meters for given decimal degree Lon/Lat (X,Y) pair
def location_accuracy(conf, prec_err):
    """
    Both confidence and precision error are required to be non-zero and positive.

    Scale ACCURACY by confidence, and inversely log10( R^2 )
    Decreasing accuracy with increasing radius, but keep scale on the order of visible things,
    e.g., 0.01 to 1.00.  This is only one definition of accuracy.

    Consider confidence = 100 (aka 100% chance we have the right location)

    * Country precision ~ +/- 100 km gives accuracy = 0.091
    * GPS precision of 10 m gives accuracy = 0.33
    * 1 m precision gives accuracy = 1.0, since 1 / (1 + log10(1*1)) = 1/1.
      In other words a 1 m error is basically "perfect".

    Precision errors below 1 m are treated as 1 m.

    :param conf: confidence on 100 point scale (0-100)
    :param prec_err: error in location precision, meters
    :return: accuracy rounded to 4 decimal places; 0 when either input is missing, zero or negative
    """
    if not conf or not prec_err:
        return 0
    if conf < 0 or prec_err < 0:
        return 0
    scale = 0.01 * conf
    # Clamp to 1 m: below that log10(r^2) is negative, so the denominator could hit
    # zero (ZeroDivisionError) or go negative / below 1 (accuracy < 0 or > scale).
    radius = max(prec_err, 1)
    inv_prec = 1 + log10(radius * radius)
    acc = scale / inv_prec
    return float(f"{acc:0.4f}")
Both confidence and precision error are required to be non-zero and positive.
Scale ACCURACY by confidence, and inversely log10( R^2 ) Decreasing accuracy with increasing radius, but keep scale on the order of visible things, e.g., 0.01 to 1.00. This is only one definition of accuracy.
Consider confidence = 100 (aka 100% chance we have the right location)
- Country precision ~ +/- 100KM is accuracy = 0.091
- GPS precision of 10 m gives accuracy = 0.33
- 1 m precision gives accuracy = 1.0, since 1 / (1 + log10(1*1)) = 1/1. In other words a 1 m error is basically "perfect"
:param conf: confidence on 100 point scale (0-100) :param prec_err: error in location precision, meters :return:
def radial_geohash(lat, lon, radius):
    """
    Propose geohash cells for a given radius from a given point.

    :param lat: latitude of the center point
    :param lon: longitude of the center point
    :param radius: distance from the center, passed to ``destination()``
    :return: dict keyed "N", "E", "S", "W" holding the geohash of the point found
             at ``radius`` along each bearing
    """
    center = LL(lat, lon)
    # Bearings in degrees; North is 0, proceeding clockwise.
    bearings = (("N", 0), ("E", 90), ("S", 180), ("W", 270))
    return {name: _ll2geohash(center.destination(radius, deg)) for name, deg in bearings}
Propose geohash cells for a given radius from a given point
def geohash_cells_radially(lat: float, lon: float, radius: int):
    """
    Create a set of geohashes that contain the given area defined by lat,lon + radius.

    The N/E/S/W geohashes from ``radial_geohash`` are each truncated to one character
    less than the precision estimated for ``radius``; duplicates collapse in the set.
    """
    compass_cells = radial_geohash(lat, lon, radius)
    prefix_len = _estimate_geohash_precision(radius) - 1
    return {gh[0:prefix_len] for gh in compass_cells.values()}
Create a set of geohashes that contain the given area defined by lat,lon + radius
def geohash_cells(gh: str, radius: int):
    """
    For a radius in meters generate the cells contained within or touched by that radius.
    This is approximate precision based on:
    https://en.wikipedia.org/wiki/Geohash which suggests this approximation could be done mathematically

    :param gh: geohash of the point of interest
    :param radius: radius in meters
    :return: Dict of 8 directionals ~ E, N, S, W; NE, SE, SW, NW.  If the given geohash is shorter
        than the precision estimated for the radius, the only cell returned is "CENTROID" (the geohash
        itself), i.e. radius=2000 (meters) for a geohash such as `9q5t`
    """
    precision = _estimate_geohash_precision(radius)
    if precision > len(gh):
        return {"CENTROID": gh}
    return geohash_neighbors(gh[:precision])
For a radius in meters generate the cells contained within or touched by that radius.
This is approximate precision based on:
https://en.wikipedia.org/wiki/Geohash which suggests this approximation could be done mathematically
:return: Dict of 8 directionals ~ E, N, S, W; NE, SE, SW, NW. If radius desired fits entirely within a
lesser precision geohash grid, the only cell returned is "CENTROID", i.e. radius=2000 (meters) for a geohash such as
9q5t
class Coordinate:
    """
    Convenient class for Lat/Lon pair.
    Expects a row dict with 'lat' and 'lon',
    or kwd args 'lat', 'lon'.

    NOTE: a falsy lat or lon (None, 0, "") is not applied, so the point stays at 0.0, 0.0.
    @param row default dictionary
    """

    def __init__(self, row, lat=None, lon=None):
        # TODO: set coordinate to X, Y = None, None by default.
        self.X = 0.0
        self.Y = 0.0
        self.mgrs = None
        self.lat = self.Y
        self.lon = self.X
        # Set geohash on demand, otherwise it can be computed from lat,lon
        self.geohash = None

        # Values in the row take priority over the keyword arguments.
        if row and 'lat' in row and 'lon' in row:
            lat, lon = row['lat'], row['lon']

        if lat and lon:
            self.set(lat, lon)

    def validate(self):
        """True if lat/lon are in range and neither value is exactly 0.0"""
        in_range = validate_lat(self.Y) and validate_lon(self.X)
        return in_range and (self.X != 0.0 and self.Y != 0.0)

    def set(self, lat, lon):
        """ Set the location lat, lon"""
        self.X = float(lon)
        self.Y = float(lat)
        self.lat = self.Y
        self.lon = self.X

    def format_coord(self):
        """Fixed-precision "lat,lon" text; see module-level format_coord()"""
        return format_coord(self.Y, self.X)

    def string_coord(self):
        """Unrounded "lat,lon" text"""
        return f"{self.lat},{self.lon}"

    def __str__(self):
        return format_coord(self.Y, self.X) if self.Y else 'unset'
Convenient class for Lat/Lon pair. Expects a row dict with 'lat' and 'lon', or kwd args 'lat', 'lon' @param row default dictionary
def bbox(lat: float, lon: float, radius: int):
    """
    Calculate coordinates for SW and NE corners of a SQUARE bounding box of edge length 2 x radius

    :param lat: decimal degree latitude
    :param lon: decimal degree longitude
    :param radius: meters from center point
    :return: tuple of Coordinate: (SW corner, NE corner)
    """
    # LatLon takes (lat, lon), as radial_geohash() does; the arguments were swapped here.
    sw, ne = LL(lat, lon).boundsOf(2 * radius, 2 * radius)
    return Coordinate(None, lat=sw.lat, lon=sw.lon), Coordinate(None, lat=ne.lat, lon=ne.lon)
Calculate coordinates for SW and NE corners of a SQUARE bounding box of edge length 2 x radius :param lat: decimal degree latitude :param lon: decimal degree longitude :param radius: meters from center point
def centroid(arr: list):
    """
    Average a list of points.

    :param arr: a list of numeric coordinates (y,x)
    :return: Coordinate -- the average of sum(y), sum(x); None for an empty list
    """
    count = len(arr)
    if count == 0:
        return None
    if count == 1:
        only_y, only_x = arr[0]
        return Coordinate(None, lat=only_y, lon=only_x)

    total_y = math.fsum(y for y, _ in arr)
    total_x = math.fsum(x for _, x in arr)
    return Coordinate(None, lat=total_y / count, lon=total_x / count)
:param arr: a list of numeric coordinates (y,x) :return: Coordinate -- the average of sum(y), sum(x)
class Place(Coordinate):
    """
    Location or GeoBase
    + Coordinate
       + Place
       + Country

    or
    Location
    + Coordinate
       + Place

       etc.  Not sure of the best data model for inheritance.
    This Python API hopes to simplify the concepts in the Java API.

    """

    def __init__(self, pid, name, lat=None, lon=None):
        super().__init__(None, lat=lat, lon=lon)
        # Identity: internal DB or Gazetteer ID, then the public/standards
        # Place ID, e.g., GNS, ISO, etc.
        self.id = None
        self.place_id = pid
        self.name = name

        # Name characteristics and postal codes
        self.is_ascii = False
        self.is_upper = False
        self.adm1_postalcode = None  # Province Postal CODE?
        self.place_postalcode = None  # ZIP CODE?
        self.name_type = None
        self.name_script = None  # Code or label, e.g. L or LATIN

        # Country, feature coding and administrative hierarchy
        self.country = None
        self.country_code = None
        self.country_code_fips = None
        self.feature_class = None
        self.feature_code = None
        self.adm1 = None
        self.adm1_name = None
        self.adm1_iso = None  # Alternate ISO-based ADM1 code used by NGA and others.
        self.adm2 = None
        self.adm2_name = None

        # Provenance and scoring
        self.source = None
        self.name_bias = 0.0
        self.id_bias = 0.0
        # Precision is actually "Precision Error" in meters
        self.precision = -1
        self.method = None
        # Population stats, if available. Scale is a power-of-2 scale
        # starting at about pop of 2^14 as 0, 32K=1, 64K=2, etc.
        self.population = -1
        self.population_scale = 0
        self.hierarchical_path = None

        # Internal fields for gazetteer curation and text analytics:
        self.name_group = ""
        self.search_only = False

    def has_coordinate(self):
        """True when the inherited validate() accepts the current lat/lon"""
        return self.validate()

    def get_location(self):
        """ Returns (LAT, LON) tuple
        @return: tuple, (lat,lon)
        """
        return self.Y, self.X

    def set_location(self, lat, lon):
        """Alias for set(lat, lon)"""
        self.set(lat, lon)

    def __str__(self):
        return f'{self.name}, {self.country_code} @({self.string_coord()})'

    def format_feature(self):
        """
        Yield a consolidated feature coding.
        :return: X/xxxx format, or only the feature class when no feature code is set
        """
        if not self.feature_code:
            return self.feature_class
        return f"{self.feature_class}/{self.feature_code}"
Location or GeoBase
- Coordinate
- Place
- Country
or Location
- Coordinate
- Place
etc. Not sure of the best data model for inheritance. This Python API hopes to simplify the concepts in the Java API.
def get_location(self):
    """ Returns the location as a (LAT, LON) tuple, read from Y and X
    @return: tuple, (lat,lon)
    """
    lat, lon = self.Y, self.X
    return lat, lon
Returns (LAT, LON) tuple @return: tuple, (lat,lon)
def format_feature(self):
    """
    Yield a consolidated feature coding.
    :return: X/xxxx format, or only the feature class when no feature code is set
    """
    if not self.feature_code:
        return self.feature_class
    return f"{self.feature_class}/{self.feature_code}"
Yield a consolidated feature coding. :return: X/xxxx format
Inherited Members
class Country(Coordinate):
    """
    Country metadata: codes, names, languages and time zones, plus the
    location inherited from Coordinate.
    """

    def __init__(self):
        super().__init__(None)
        # Country codes: ISO 2-alpha, ISO 3-alpha, FIPS
        self.cc_iso2 = None
        self.cc_iso3 = None
        self.cc_fips = None
        self.place_id = None
        # Naming
        self.name = None
        self.namenorm = None
        self.name_type = None
        self.aliases = []
        self.is_territory = False
        self.is_unique_name = False
        self.timezones = []
        self.languages = set()
        self.primary_language = None

    def __str__(self):
        return f'{self.name} ({self.cc_iso2})'
Country metadata
Inherited Members
def country_as_place(ctry: Country, name: str, name_type="N", oid=None):
    """
    Convert to Place.

    The result is coded as feature class "A" with feature code "PCLI", or "PCL"
    for a territory, with ADM1 "0" and source "ISO".

    :param ctry: Country object
    :param name: the name to use
    :param name_type: name type code stored on the Place
    :param oid: row ID
    :return: Place located at the country's lat/lon
    """
    place = Place(ctry.cc_iso2, name)
    place.id = oid
    # Replaces the ISO code given to the constructor above.
    place.place_id = ctry.place_id
    place.name_type = name_type
    place.feature_class = "A"
    place.feature_code = "PCL" if ctry.is_territory else "PCLI"
    place.name_bias = 0.0
    place.id_bias = 0.0
    place.country_code = ctry.cc_iso2
    place.country_code_fips = ctry.cc_fips
    place.adm1 = "0"
    place.source = "ISO"
    place.set_location(ctry.lat, ctry.lon)
    return place
Convert to Place. :param ctry: Country object :param name: the name to use :param name_type: :param oid: row ID :return:
def load_countries(csvpath=None):
    """ Parses the Xponents Core/src/main/resource CSV file of country names
    (packaged resource country-names-2021.csv by default), putting out an array of
    Country objects and filling the module-level lookups by ISO code, FIPS code and name.

    NOTE: rows are appended to the module-level ``countries`` list on every call.

    :param csvpath: optional path to a CSV with the same columns as the packaged file
    :return: array of Country
    """
    if not csvpath:
        csvpath = pkg_resource_path('country-names-2021.csv')

    count = 0
    with open(csvpath, 'r', encoding="UTF-8") as fh:
        columns = "country_name,FIPS_cc,ISO2_cc,ISO3_cc,unique_name,territory,latitude,longitude".split(',')
        fio = get_csv_reader(fh, columns)
        for row in fio:

            # ignore empty row and header.
            if 'country_name' not in row:
                continue
            if row['country_name'] == 'country_name':
                continue
            count += 1
            C = Country()
            C.name = row.get('country_name')
            C.cc_iso2 = row.get('ISO2_cc').upper()
            C.cc_iso3 = row.get('ISO3_cc').upper()
            C.cc_fips = row.get('FIPS_cc').upper()

            # Internal data set "place ID"
            C.place_id = f"C{C.cc_iso2}#{C.cc_fips}#{count}"

            # Country declares "is_unique_name"; this loader used to write only
            # "is_name_unique", leaving the declared flag always False.
            # Set both so existing readers of either name keep working.
            C.is_unique_name = get_bool(row.get('unique_name'))
            C.is_name_unique = C.is_unique_name
            C.is_territory = get_bool(row.get('territory'))
            C.namenorm = C.name.lower()
            C.set(row.get("latitude"), row.get("longitude"))

            countries.append(C)

    for C in countries:
        # First non-territory entry wins for a given ISO code.
        if not C.is_territory and C.cc_iso2 not in countries_by_iso:
            countries_by_iso[C.cc_iso2] = C
            countries_by_iso[C.cc_iso3] = C

        if C.cc_fips and C.cc_fips != "*":
            countries_by_fips[C.cc_fips] = C

        countries_by_name[C.namenorm] = C

    global __loaded
    __loaded = len(countries_by_iso) > 1

    if __loaded:
        # Additional 3-letter codes mapped onto entries that are already loaded.
        if "XKX" in countries_by_iso:
            countries_by_iso["XKS"] = countries_by_iso.get("XKX")
        if "SJM" in countries_by_iso:
            countries_by_iso["XSV"] = countries_by_iso.get("SJM")
            countries_by_iso["XJM"] = countries_by_iso.get("SJM")
        if "PSE" in countries_by_iso:
            countries_by_iso["GAZ"] = countries_by_iso.get("PSE")
        if "TLS" in countries_by_iso:
            countries_by_iso["TMP"] = countries_by_iso.get("TLS")

    return countries
Parses the Xponents Core/src/main/resource CSV file of country names (country-names-2021.csv by default), putting out an array of Country objects. :return: array of Country
def get_us_province(adm1: str):
    """
    Look up a US state in the table filled by load_us_provinces().

    :param adm1: lookup key, e.g. an ADM1 code
    :return: the matching Place, or None if the key is unknown
    :raise Exception: if the US state table is empty
    """
    if usstates:
        return usstates.get(adm1)
    raise Exception("Run load_us_provinces() first")
:param adm1: ADM1 code or for territories, :return:
def load_us_provinces():
    """
    Load, store internally and return the LIST of US states.
    NOTE: Place objects for US States have a location (unlike list of world provinces).
    To get location and feature information in full, you must use the SQLITE DB or Xponents Solr.

    Each state is registered in the module-level ``usstates`` dict under four keys:
    ADM1 code, postal code, and the HASC paths built from each.
    :return: array of Place objects
    """
    datafile = pkg_resource_path('us-state-metadata.csv')
    states = []
    with open(datafile, 'r', encoding="UTF-8") as fh:
        header = ["POSTAL_CODE", "ADM1_CODE", "STATE", "LAT", "LON", "FIPS_CC", "ISO2_CC"]
        for row in get_csv_reader(fh, header):
            postal_code = row["POSTAL_CODE"]
            if postal_code == 'POSTAL_CODE':
                # header row
                continue

            cc = row["ISO2_CC"]
            # Drop the first two characters -- presumably a country-code prefix; confirm against the CSV.
            adm1_code = row["ADM1_CODE"][2:]
            # HASC path
            place_id = make_HASC(cc, adm1_code)
            postal_id = make_HASC(cc, postal_code)

            state = Place(place_id, row["STATE"], lat=row["LAT"], lon=row["LON"])
            state.feature_class = "A"
            state.feature_code = "ADM1"
            state.name_type = "N"
            state.geohash = geohash_encode(state.lat, state.lon, precision=6)

            state.country_code = cc
            state.adm1 = adm1_code
            state.adm1_postalcode = postal_code
            state.source = "OpenSextant"

            # Code alone, then the HASC forms:
            for key in (adm1_code, postal_code, place_id, postal_id):
                usstates[key] = state

            states.append(state)
    return states
Load, store internally and return the LIST of US states. NOTE: Place objects for US States have a location (unlike list of world provinces). To get location and feature information in full, you must use the SQLITE DB or Xponents Solr. :return: array of Place objects
def load_provinces():
    """
    Load, store and return a dictionary of ADM1 boundary names - provinces, states, republics, etc.
    This simply delegates to load_world_adm1().

    NOTE: Location information is not included in this province listing.  Just Country, ADM1, Name tuples.
    NOTE: This reflects only GEONAMES ADMIN1 CODES ASCII -- which portrays most of the world (except US) as FIPS,
    not ISO.
    :return: dict
    """
    provinces = load_world_adm1()
    return provinces
Load, store and return a dictionary of ADM1 boundary names - provinces, states, republics, etc. NOTE: Location information is not included in this province listing. Just Country, ADM1, Name tuples. NOTE: This reflects only GEONAMES ADMIN1 CODES ASCII -- which portrays most of the world (except US) as FIPS, not ISO. :return: dict
def load_world_adm1():
    """
    Load, store and return a dictionary of ADM1 boundary names - provinces, states, republics, etc.
    Coding for ADM1 is FIPS based mostly.

    Reads the tab-delimited resource geonames.org/admin1CodesASCII.txt and fills the
    module-level ``adm1_by_hasc`` dict, keyed by HASC path (see make_HASC).
    :return: dict
    """
    # Load local country data first, if you have it.  US is only one so far.
    load_us_provinces()

    source_prefix = "G"
    datafile = pkg_resource_path(os.path.join('geonames.org', 'admin1CodesASCII.txt'))

    with open(datafile, 'r', encoding="UTF-8") as fh:
        for line in fh:
            row = line.strip().split('\t')
            row_id = row[3]
            src_id = f"{source_prefix}{row_id}" if row_id else None

            province = Place(src_id, row[1])
            province.feature_class = "A"
            province.feature_code = "ADM1"
            province.name_type = "N"

            # First column is "<CC>.<ADM1>"
            cc_adm1 = row[0].split('.', 2)
            province.country_code = cc_adm1[0]
            province.adm1 = parse_admin_code(cc_adm1[1])
            province.source = "G"  # Geonames.org coded.

            hasc = make_HASC(province.country_code, province.adm1)
            if province.country_code == "US":
                province.source = "USGS"
                if hasc in usstates:
                    # Known US state: its Place takes the Geonames name, and the
                    # entry is filed under the HASC built from that Place's codes.
                    us_place = usstates[hasc]
                    us_place.name = province.name
                    hasc = make_HASC(us_place.country_code, us_place.adm1)

            adm1_by_hasc[hasc] = province
    return adm1_by_hasc
Load, store and return a dictionary of ADM1 boundary names - provinces, states, republics, etc. Coding for ADM1 is FIPS based mostly :return: dict
def get_province(cc, adm1):
    """ REQUIRES you load_provinces() first.

    :param cc: country code
    :param adm1: ADM1 code
    :return: the Place stored under the HASC path for (cc, adm1), or None
    """
    hasc = make_HASC(cc, adm1)
    return adm1_by_hasc.get(hasc)
REQUIRES you load_provinces() first.
def get_country(namecode, standard="ISO"):
    """
    Get Country object given a name, ISO or FIPS code.  For codes, you must be
    clear about which standard the code is based in. Some code collisions exist.
    "ZZ" will NOT be returned for the empty code -- if you pass in a NULL or empty
    country code you may have a data quality issue.

    Country data is loaded on first use.

    :param namecode: 2- or 3-alpha code, or a country name when standard='name'
    :param standard: 'ISO' or 'FIPS', 'name'
    :return: Country object, or None for an empty/non-string input or unknown code
    :raise Exception: if ``standard`` is not one of the three known values
    """
    if not (namecode and isinstance(namecode, str)):
        return None

    if not __loaded:
        load_countries()

    # Codes are matched upper-case, names lower-case.
    if standard == "ISO":
        return countries_by_iso.get(namecode.upper())
    if standard == "FIPS":
        return countries_by_fips.get(namecode.upper())
    if standard == "name":
        return countries_by_name.get(namecode.lower())
    raise Exception("That standards body '{}' is not known for code {}".format(standard, namecode))
Get Country object given a name, ISO or FIPS code. For codes, you must be clear about which standard the code is based in. Some code collisions exist. "ZZ" will NOT be returned for the empty code -- if you pass in a NULL or empty country code you may have a data quality issue. :param namecode: 2- or 3-alpha code. :param standard: 'ISO' or 'FIPS', 'name' :return: Country object
def load_major_cities():
    """
    Loads City geo/demographic information -- this does not try to parse all name variants.
    Reads the tab-delimited resource geonames.org/cities15000.txt; rows that do not have
    exactly 19 columns, or that have no latitude, are skipped.

    This produces Geonames use of FIPS codes.
    :return: list of Place objects
    """
    csvpath = pkg_resource_path(os.path.join('geonames.org', 'cities15000.txt'))

    from csv import reader
    with open(csvpath, 'r', encoding="UTF-8") as fh:
        rdr = reader(fh, dialect="excel", delimiter="\t")
        cities = []
        for line in rdr:
            if len(line) != 19:
                continue
            if not line[4]:
                print("Not location info for City ~ ", line[0])
                continue
            #          ID       NAME     LAT                  LON
            pl = Place(line[0], line[1], lat=float(line[4]), lon=float(line[5]))
            pl.feature_class = line[6]
            pl.feature_code = line[7]
            pl.country_code = line[8]
            alt_cc = line[9]
            if alt_cc and alt_cc != pl.country_code:
                print("Alternate Country Code", alt_cc)
            pl.adm1 = parse_admin_code(line[10])
            pl.adm2 = line[11]
            # pl.geohash = geohash_encode(pl.lat, pl.lon, precision=6)
            try:
                pl.population = int(line[14])
                pl.population_scale = popscale(pl.population, feature="city")
            except ValueError:
                # Blank or non-numeric population: keep the Place defaults.
                # Only the int() parse failure is tolerated; other errors surface.
                pass
            cities.append(pl)
    return cities
Loads City geo/demographic information -- this does not try to parse all name variants.
This produces Geonames use of FIPS codes. :return:
def popscale(population, feature="city"):
    """
    Given a population in context of the feature -- provide an
    approximation of the size of the feature on a 10 point scale.

    Approximations for 10 points:
    Largest city is ~15 million
    // Few cities top 30 million, e.g., 2^25.  popscale = 25 - 13 = 12.
    Largest province is ~135 million

    The index is log2(population) minus a per-feature shift (20 for an unlisted
    feature), truncated to an integer and floored at 0.

    :param population: population count
    :param feature: city, district, or province allowed.
    :return: index on 0..10 scale.
    """
    if population < 1:
        return 0
    index = mathlog(population, 2) - _pop_scale.get(feature, 20)
    if index > 0:
        return int(index)
    return 0
Given a population in context of the feature -- provide an approximation of the size of the feature on a 10 point scale.
Approximations for 10 points: Largest city is ~15 million // Few cities top 30 million, e.g., 2^25. popscale = 25 - 13 = 12. Largest province is ~135 million
:param population: :param feature: city, district, or province allowed. :return: index on 0..10 scale.
def is_political(feat_code: str):
    """Test a feature code: True if it starts with "PCL"; False for empty or None"""
    return bool(feat_code) and feat_code.startswith("PCL")
Test a feature code
Test a feature code
def is_academic(feat_class: str, feat_code: str) -> bool:
    """
    Test for a feature with class "S" and a designation code starting with "SCH".

    :param feat_class: geonames class
    :param feat_code: geonames designation code
    :return: True or False, never None or "" (which the bare `and` chain used to leak)
    """
    if not feat_class or not feat_code:
        return False
    return feat_class == "S" and feat_code.startswith("SCH")
:param feat_class: geonames class :param feat_code: geonames designation code :return:
def characterize_location(place: Place, label: str):
    """
    Experimental: Not comprehensive characterization. This is intended to summarize PlaceCandidates extracted
    from text.

    Describe a Place in terms of a plain language feature type and the geographic scope or resolution.
    E.g., Place object "P/PPL", "city"
    E.g., Place object "A/ADM4" "admin"
    E.g., Place object "S/COORD", "site"

    :param place: Place object
    :param label: text match label, e.g., 'country', 'place', 'coord', etc.
    :return: feature string, resolution string
    """
    class_to_resolution = {
        "A": "admin",
        "P": "city",
        "S": "site",
        "H": "water",
        "R": "path",
        "V": "area",
        "T": "area",
        "L": "area"
    }
    feat_class = place.feature_class

    # Note label should be a limited set -- country, postal, coord, place.
    if label == "place":
        # An unmapped feature class falls back to the label itself.
        resolution = class_to_resolution.get(feat_class, label)
    elif label == "coord":
        resolution = "site"
    else:
        resolution = label

    return place.format_feature(), resolution
Experimental: Not comprehensive characterization. This is intended to summarize PlaceCandidates extracted from text.
Describe a Place in terms of a plain language feature type and the geographic scope or resolution. E.g, Place object "P/PPL", "city" E.g, Place object "A/ADM4" "admin" E.g, Place object "S/COORD", "site"
:param place: Place object :param label: text match label, e.g., 'country', 'place', 'coord', etc. :return: feature string, resolution string
class TextEntity:
    """
    A Text span.

    classes and routines that align with Java org.opensextant.data and org.opensextant.extraction

    * TextEntity: represents a span of text
    * TextMatch: a TextEntity matched by a particular routine.  This is the basis for almost all
      extractors and annotators in OpenSextant.
    """

    def __init__(self, text, start, end):
        self.text = text
        self.start = start
        self.end = end
        # Length stays -1 unless both offsets are present and non-negative.
        self.len = -1
        self.is_duplicate = False
        self.is_overlap = False
        self.is_submatch = False
        if self._is_valid():
            self.len = self.end - self.start

    def __str__(self):
        return f"{self.text}({self.start},{self.end})"

    def _is_valid(self):
        if self.start is None or self.end is None:
            return False
        return self.start >= 0 and self.end >= 0

    def contains(self, x1):
        """ if this span contains x1
        :param x1: an integer offset, or another span (anything with start and end)
        :return: True if the offset lies in [start, end], or if the given non-empty
                 span lies entirely inside this one
        """
        if self.start < 0 or self.end < 0:
            return False
        if isinstance(x1, int):
            # Plain offset, as is_within() historically passed.
            return self.start <= x1 <= self.end
        return self.start <= x1.start < x1.end <= self.end

    def exact_match(self, t):
        return t.start == self.start and t.end == self.end and self._is_valid()

    def is_within(self, t):
        """
        if the given annotation, t, contains this
        :param t: the candidate enclosing span
        :return: True if this span lies entirely inside t
        """
        # Hand over the span itself; passing the bare int offsets raised AttributeError.
        return t.contains(self)

    def is_after(self, t):
        return self.start > t.end

    def is_before(self, t):
        return self.end < t.start

    def overlaps(self, t):
        """
        Determine if t overlaps self, i.e. the two spans are skewed so that each extends past the
        other on one side.  Spans sharing a start or an end, and spans where one is contained
        entirely within the other, are not considered overlap.
        :param t: the other span
        :return: True for a right or left skew
        """
        #    a1     a2
        #  t1     t2        RIGHT skew
        #    a1     a2
        #       t1     t2   LEFT skew
        #    a1  a2
        #        t1 t2      minimal OVERLAP (a2 == t1)
        skew_right = t.start < self.start <= t.end < self.end
        skew_left = self.start < t.start <= self.end < t.end
        # The old trailing "(right_match and skew_left) or (left_match and skew_right)"
        # could never be True -- a skew rules out equal starts or ends -- so it is dropped.
        return skew_right or skew_left
A Text span.
classes and routines that align with Java org.opensextant.data and org.opensextant.extraction
- TextEntity: represents a span of text
- TextMatch: a TextEntity matched by a particular routine. This is the basis for almost all extractors and annotators in OpenSextant.
def contains(self, x1):
    """
    Test if this span contains x1.

    :param x1: either another span (any object with ``start`` and ``end``) or a plain offset.
       A span is contained only if it is non-empty and lies entirely within this span.
       An offset is contained if ``start <= x1 <= end``.
    :return: bool; always False if this span has unset or negative offsets.
    """
    if self.start is None or self.end is None or self.start < 0 or self.end < 0:
        return False
    if hasattr(x1, "start") and hasattr(x1, "end"):
        return self.start <= x1.start < x1.end <= self.end
    # Plain offset: the end offset is inclusive so that a span's own end is "contained".
    return self.start <= x1 <= self.end
if this span contains an offset x1 :param x1:
def is_within(self, t):
    """
    if the given annotation, t, contains this
    :param t: the candidate enclosing span; must provide ``contains(span)``
    :return: result of ``t.contains(self)``
    """
    # contains() reads .start and .end from its argument, so hand over the span itself
    # rather than the bare integer offsets.
    return t.contains(self)
if the given annotation, t, contains this :param t: :return:
def overlaps(self, t):
    """
    Determine if t partially overlaps self, i.e., one span starts strictly before the other
    and ends strictly inside (or at the first offset of) the other.

    Spans that share the same start or the same end are NOT reported as overlapping here, nor
    is a span that is contained entirely within the other -- that is containment.

    :param t: other span
    :return: bool
    """
    #    a1     a2
    #  t1     t2        RIGHT skew: t begins before self and ends inside self
    #    a1     a2
    #       t1     t2   LEFT skew:  t begins inside self and ends after self
    skew_right = t.start < self.start <= t.end < self.end
    skew_left = self.start < t.start <= self.end < t.end
    return skew_right or skew_left
Determine if t overlaps self. If Right or Left match, t overlaps if it is longer. If t is contained entirely within self, then it is not considered overlap -- it is Contained within. :param t: :return:
class TextMatch(TextEntity):
    """
    An entity matched by some tagger; it is a text span with lots of metadata.
    """

    def __init__(self, *args, label=None):
        """
        :param args: positional arguments passed through to TextEntity (text, start, end)
        :param label: optional label for the match
        """
        TextEntity.__init__(self, *args)
        self.id = None
        self.label = label
        self.filtered_out = False
        self.attrs = dict()

    def __str__(self):
        return f"{self.label}/{self.text}({self.start},{self.end})"

    def populate(self, attrs: dict):
        """
        Populate a TextMatch to normalize the set of attributes -- separate class fields on TextMatch from additional
        optional attributes.
        :param attrs: dict of standard Xponents API outputs.
        :return:
        """
        self.id = attrs.get("match-id")
        self.label = attrs.get("type")
        self.attrs.update(attrs)
        self.filtered_out = get_bool(self.attrs.get("filtered-out"))
        for k in ['len', 'length']:
            if k in self.attrs:
                self.len = self.attrs.get(k)

        # Derive a missing end offset only from a usable start and length.  A start of None or
        # the default length of -1 must not be used in arithmetic.
        if (not self.end
                and self.start is not None and self.start >= 0
                and self.len is not None and self.len >= 0):
            self.end = self.start + self.len

        # Remove attribute keys that may be confusing.
        for fld in ['offset', 'start', 'end', 'len', 'length', 'type', 'filtered-out', 'text', 'matchtext']:
            self.attrs.pop(fld, None)

    def normalize(self):
        """
        Optional, but recommended routine to normalize the matched data.
        That is, parse fields, uppercase, streamline punctuation, etc.
        As well, given such normalization result, this is the opportunity to additionally
        validate the match.
        :return:
        """
        pass
An entity matched by some tagger; it is a text span with lots of metadata.
def populate(self, attrs: dict):
    """
    Populate a TextMatch to normalize the set of attributes -- separate class fields on TextMatch from additional
    optional attributes.
    :param attrs: dict of standard Xponents API outputs.
    :return:
    """
    self.id = attrs.get("match-id")
    self.label = attrs.get("type")
    self.attrs.update(attrs)
    self.filtered_out = get_bool(self.attrs.get("filtered-out"))
    for k in ['len', 'length']:
        if k in self.attrs:
            self.len = self.attrs.get(k)

    # Derive a missing end offset only from a usable start and length.  A start of None or
    # the default length of -1 must not be used in arithmetic.
    if (not self.end
            and self.start is not None and self.start >= 0
            and self.len is not None and self.len >= 0):
        self.end = self.start + self.len

    # Remove attribute keys that may be confusing.
    for fld in ['offset', 'start', 'end', 'len', 'length', 'type', 'filtered-out', 'text', 'matchtext']:
        self.attrs.pop(fld, None)
Populate a TextMatch to normalize the set of attributes -- separate class fields on TextMatch from additional optional attributes. :param attrs: dict of standard Xponents API outputs. :return:
def normalize(self):
    """
    Hook for subclasses: normalize the matched data (parse fields, uppercase,
    streamline punctuation, etc.) and, based on that result, optionally validate
    the match.  Recommended but not required; this base version does nothing.
    :return: None
    """
    return None
Optional, but recommended routine to normalize the matched data. That is, parse fields, uppercase, streamline punctuation, etc. As well, given such normalization result, this is the opportunity to additionally validate the match. :return:
Inherited Members
class PlaceCandidate(TextMatch):
    """
    A geographic mention found in text, expressed as a TextMatch.  The ``place`` attribute holds a
    Place object carrying the additional attributes of the chosen location.

    The Java class org.opensextant.extractors.geo.PlaceCandidate is the more in-depth counterpart;
    this Python class represents, for example, the response from the REST API.
    """

    def __init__(self, *args, **kwargs):
        TextMatch.__init__(self, *args, **kwargs)
        self.confidence = 0
        self.rules = []
        self.is_country = False
        self.place = None
        # Location certainty is a simple measure 0.0 to 1.0 to convey confidence + precision in one metric
        self.location_certainty = -1

    def populate(self, attrs: dict):
        """
        Fill in this candidate from an attributes dict that follows either the TextMatch schema
        or the Place schema.
        :param attrs: dict of match and geographic attributes
        :return:
        """
        TextMatch.populate(self, attrs)
        location = Place(None, attrs.get("name"), lat=attrs.get("lat"), lon=attrs.get("lon"))
        if not location.name:
            location.name = self.text

        # The attribute names in the dict do not align 100% with the Place field names.
        field_keys = (
            ("country_code", "cc"),
            ("adm1", "adm1"),
            ("precision", "prec"),
            ("feature_class", "feat_class"),
            ("feature_code", "feat_code"),
            ("adm1_name", "province-name"),
            ("geohash", "geohash"),
            ("method", "method"),
        )
        for field, key in field_keys:
            setattr(location, field, attrs.get(key))

        # Combined match + geo-location confidence:
        self.confidence = attrs.get("confidence")
        if "rules" in attrs:
            # One or more geo-inferencing rules, semicolon-separated
            self.rules = attrs["rules"].split(";")

        if self.label == "country":
            self.is_country = True
        else:
            self.is_country = is_country(location.feature_code)

        if self.is_country:
            # Zero out country location; Let user derive country from metadata.
            location.lat = None
            location.lon = None
        self.place = location
        # Certainty is computed from confidence and precision for every candidate.
        self.location_certainty = location_accuracy(self.confidence, location.precision)
A TextMatch representing any geographic mention -- a Place object will represent the additional attributes for the chosen place. see also in Java org.opensextant.extractors.geo.PlaceCandidate class, which is a more in-depth version of this. This Python class represents the response from the REST API, for example.
def populate(self, attrs: dict):
    """
    Fill in this candidate from an attributes dict that follows either the TextMatch schema
    or the Place schema.
    :param attrs: dict of match and geographic attributes
    :return:
    """
    TextMatch.populate(self, attrs)
    location = Place(None, attrs.get("name"), lat=attrs.get("lat"), lon=attrs.get("lon"))
    if not location.name:
        location.name = self.text

    # The attribute names in the dict do not align 100% with the Place field names.
    field_keys = (
        ("country_code", "cc"),
        ("adm1", "adm1"),
        ("precision", "prec"),
        ("feature_class", "feat_class"),
        ("feature_code", "feat_code"),
        ("adm1_name", "province-name"),
        ("geohash", "geohash"),
        ("method", "method"),
    )
    for field, key in field_keys:
        setattr(location, field, attrs.get(key))

    # Combined match + geo-location confidence:
    self.confidence = attrs.get("confidence")
    if "rules" in attrs:
        # One or more geo-inferencing rules, semicolon-separated
        self.rules = attrs["rules"].split(";")

    if self.label == "country":
        self.is_country = True
    else:
        self.is_country = is_country(location.feature_code)

    if self.is_country:
        # Zero out country location; Let user derive country from metadata.
        location.lat = None
        location.lon = None
    self.place = location
    # Certainty is computed from confidence and precision for every candidate.
    self.location_certainty = location_accuracy(self.confidence, location.precision)
Deserialize the attributes dict from either TextMatch schema or Place schema :param attrs: :return:
class Extractor(ABC):
    """
    Abstract base for extractors: subclasses must implement ``extract``.
    """

    def __init__(self):
        # Identifier for the extractor; unset by default.
        self.id = None

    @abstractmethod
    def extract(self, text, **kwargs):
        """
        Run extraction over the given text.

        :param text: Unicode text input
        :keyword features: an array of features to extract, e.g., "coordinate", "place", "MONEY"
        :return: array of TextMatch
        """
        return None
Helper class that provides a standard way to create an ABC using inheritance.
@abstractmethod
def extract(self, text, **kwargs):
    """
    Run extraction over the given text.

    :param text: Unicode text input
    :keyword features: an array of features to extract, e.g., "coordinate", "place", "MONEY"
    :return: array of TextMatch
    """
    return None
:param text: Unicode text input :keyword features: an array of features to extract, e.g., "coordinate", "place", "MONEY" :return: array of TextMatch
def render_match(m):
    """
    Convert a TextMatch into a plain dict of its basic fields.

    :param m: TextMatch
    :return: dict with keys "type", "text", "offset", "length" and "filtered-out";
       None if m is not a TextMatch
    """
    if isinstance(m, TextMatch):
        return {
            "type": m.label,
            "text": m.text,
            "offset": m.start,
            "length": m.len,
            "filtered-out": m.filtered_out
        }
    return None
:param m: TextMatch :return: dict
def reduce_matches(matches):
    """
    Mark each match if it is a submatch or overlap or exact duplicate of other.
    Flags are set in place on the given objects (is_duplicate, is_submatch, is_overlap).

    :param matches: array of TextMatch (or TextEntity).  This is the more object oriented version
      of reduce_matches_dict.  Items that lack a ``filtered_out`` attribute (e.g., plain TextEntity)
      are treated as not filtered out.
    :return: None
    """
    if len(matches) < 2:
        return

    for idx, outer in enumerate(matches):
        if getattr(outer, "filtered_out", False):
            continue
        m1 = outer.start
        m2 = outer.end

        # Compare outer against every later item.
        # Cannot exit loop on first match or overlap.
        for inner in matches[idx + 1:]:
            if getattr(inner, "filtered_out", False):
                continue

            n1 = inner.start
            n2 = inner.end

            if m2 < n1 or m1 > n2:
                # outer entirely before inner, or entirely after inner
                continue

            if n1 == m1 and n2 == m2:
                # Exact duplicate - mark the later item, as outer is first in the array.
                inner.is_duplicate = True
            elif n1 <= m1 < m2 <= n2:
                # outer is within inner span
                outer.is_submatch = True
            elif m1 <= n1 < n2 <= m2:
                # inner is within outer span
                inner.is_submatch = True
            elif m1 <= n2 <= m2 or n1 <= m2 <= n2:
                # Spans intersect without one containing the other.
                outer.is_overlap = True
                inner.is_overlap = True
Mark each match if it is a submatch or overlap or exact duplicate of other. :param matches: array of TextMatch (or TextEntity). This is the more object oriented version of reduce_matches_dict :return:
def reduce_matches_dict(matches):
    """
    Accepts an array of annotations (dicts).  Inserts the "submatch" flag in a dict if there is a
    submatch (that is, if another annotation A wholly contains another, B -- B is a submatch) or
    if it is an exact duplicate of an earlier annotation.
    Each item is compared to each later item exactly once.

    :param matches: array of dicts, each with "start" and "end" keys.  Modified in place.
    """
    if len(matches) < 2:
        return

    for i, M in enumerate(matches):
        m1 = M['start']
        m2 = M['end']

        for N in matches[i + 1:]:
            n1 = N['start']
            n2 = N['end']

            if m2 < n1 or m1 > n2:
                # M entirely before N, or M entirely after N
                continue

            # Check for filtered-out matches not done in this version.
            if n1 == m1 and n2 == m2:
                N['submatch'] = IS_DUPLICATE
            elif n1 <= m1 < m2 <= n2:
                # M is within N.
                M['submatch'] = IS_SUBMATCH
            elif m1 <= n1 < n2 <= m2:
                # N is within M.  Possibly more N contained within M, so keep going.
                N['submatch'] = IS_SUBMATCH
Accepts an array of annotations (dicts). Inserts the "submatch" flag in a dict if there is a submatch (that is, if another TextEntity A wholly contains another, B -- B is a submatch). We just have to loop through half of the array ~ comparing each item to each other item once.
:param matches: array of dicts.
class Language:
    """
    A single language: its codes plus a list of names.
    Coding is 3-char or 2-char, either is optional.
    In some situations there are competing 2-char codes in code books, such as Lib of Congress (LOC)
    """

    def __init__(self, iso3, iso2, nmlist: list, locale=None):
        """
        :param iso3: 3-char code, stored as ``code_iso3``
        :param iso2: 2-char code, stored as ``code``
        :param nmlist: list of names; the first one is the default name
        :param locale: optional locale ID
        :raise Exception: if nmlist is non-empty but not a list
        """
        if nmlist and not isinstance(nmlist, list):
            raise Exception("Name list is a list of names for the language. The first one is the default.")
        self.code_iso3 = iso3
        self.code = iso2
        self.names = nmlist
        self.locale = locale

    def get_name(self):
        """Default (first) name, or None if there are no names."""
        return self.names[0] if self.names else None

    def name_code(self):
        """Lower-cased default name, or None if there are no names."""
        return self.names[0].lower() if self.names else None

    def __str__(self):
        return f"{self.name_code()}({self.code})"
Language represents a single code/name pair. Coding is 3-char or 2-char, either is optional. In some situations there are competing 2-char codes in code books, such as Lib of Congress (LOC)
def list_languages():
    """
    List out a flattened list of languages, de-duplicated by language code (2-char and 3-char).
    A language is listed once, the first time any of its codes is seen; languages with no code
    at all are not listed.

    TODO: alternatively list out every language
    :return: list of Language
    """
    load_languages()
    langs = []
    visited = set()
    for lang in language_map.values():
        is_new = False
        for code in (lang.code, lang.code_iso3):
            if code and code not in visited:
                visited.add(code)
                is_new = True
        # Append at most once per language, even when both of its codes are new.
        if is_new:
            langs.append(lang)
    return langs
List out a flattened list of languages, de-duplicated by ISO2 language ID.
TODO: alternatively list out every language :return:
def add_language(lg: Language, override=False):
    """
    Register a language in the language map under each of its lower-cased codes, locale and names.

    The language map for ISO 2-alpha and 3-alpha codes should be protected from language IDs that
    are dialect or locale:

      "en" ==> en-au, en-gb, en-us, etc.?  This is ambiguous
      The reverse is true -- "en-gb" is at least "en" or "eng" english

    :param lg: Language to add; nothing happens if it is empty/None
    :param override: if True, an existing key longer than 3 chars is remapped to lg
    :raise Exception: if a key longer than 3 chars already exists and override is not in effect
    :return:
    """
    if not lg:
        return

    codes = [c.lower() for c in (lg.code, lg.code_iso3, lg.locale) if c]

    for nm in lg.names or []:
        lang_name_norm = nm.lower()
        codes.append(lang_name_norm)
        if "modern" in lang_name_norm:
            # Also register the first comma-delimited part of a "modern" name; this turns
            # override on for all keys of this language.
            codes.append(get_list(lang_name_norm, delim=",")[0])
            override = True

    for key in set(codes):
        if key in language_map:
            # coding rule: 2 or 3 char alpha codes for ISO or Biblio code books are not overridden.
            if len(key) <= 3:
                continue
            if not override:
                raise Exception(f"Forcibly remap language code? {key}")

        language_map[key] = lg
The language map for ISO 2-alpha and 3-alpha codes should be protected from language IDs that are dialect or locale
"en" ==> en-au, en-gb, en-us, etc.? This is ambiguous The reverse is true -- "en-gb" is at least "en" or "eng" english
:param lg: :param override: :return:
def get_language(code: str) -> Language:
    """
    Look up a language in the language map, case-insensitively.

    :param code: language ID or name
    :return: Language or None
    """
    if not code:
        return None

    load_languages()
    key = code.lower()

    # Most cases: a short code, or a purely alphabetic ID/name -- a direct lookup decides it.
    if len(key) <= 3 or key.isalpha():
        return language_map.get(key)

    if key in language_map:
        return language_map.get(key)

    # Code is odd, like a locale? "EN_GB" or en-gb, etc.  Try the leading part before each delimiter.
    for delim in ["-", " ", "_"]:
        key = key.split(delim)[0]
        if key in language_map:
            return language_map.get(key)
    return None
:param code: language ID or name :return: Language or None
def is_lang_romance(lg: str):
    """True if the language resolves to spanish, portuguese, italian, french or romanian."""
    lang = get_language(lg)
    if lang:
        return lang.code in {"es", "pt", "it", "fr", "ro"}
    return False
If spanish, portuguese, italian, french, romanian
def is_lang_euro(lg: str):
    """
    true if lang is European -- romance, german, english, etc
    :param lg: language ID or name, resolved with get_language()
    :return: bool; False if the language cannot be resolved
    """
    L = get_language(lg)
    if not L:
        return False
    c = L.code
    return c in {"es", "pt", "it", "fr", "ro",
                 "de", "en",
                 # ISO 639-1 codes for Bulgarian, Czech and Polish.
                 "bg", "cs", "pl",
                 # Legacy values kept for backward compatibility; these are not ISO 639-1 codes.
                 "bu", "cz", "po",
                 "nl", "el", "sq"}
true if lang is European -- romance, german, english, etc :param lg: :return: