opensextant

   1# -*- coding: utf-8 -*-
   2import math
   3import os
   4import re
   5import sys
   6from abc import ABC, abstractmethod
   7from logging import getLogger
   8from logging.config import dictConfig
   9from math import sqrt, sin, cos, radians, atan2, log as mathlog, log10
  10
  11from opensextant.utility import get_csv_reader, get_bool, get_list, load_datafile
  12from pygeodesy.ellipsoidalVincenty import LatLon as LL
  13from pygeodesy.geohash import encode as geohash_encode, decode as geohash_decode, neighbors as geohash_neighbors
  14
# True when the running interpreter is Python 3.
PY3 = sys.version_info.major == 3
# Module-level caches. load_countries() appends to `countries` and fills the three lookups below:
# by ISO code (2- and 3-letter), by FIPS code, and by lower-cased country name.
countries = []
countries_by_iso = {}
countries_by_fips = {}
countries_by_name = {}
# Filled by load_us_provinces(); keyed by ADM1 code, postal code and their "CC.code" paths.
usstates = {}
# Filled by load_world_adm1(); keyed by the make_HASC(country, adm1) path.
adm1_by_hasc = {}
# Set by load_countries() once the ISO lookup holds more than one entry.
__loaded = False
# NOTE(review): not referenced in the visible part of this module.
__language_map_init = False
  24
  25
  26def logger_config(logger_level: str, pkg: str):
  27    """
  28    LOGGING
  29    :param logger_level:
  30    :param pkg: Name of package
  31    :return:
  32    """
  33    handlers = {
  34        pkg: {
  35            'class': 'logging.StreamHandler',
  36            'stream': sys.stdout,
  37            'formatter': 'default'
  38        }
  39    }
  40    dictConfig({
  41        'version': 1,
  42        'formatters': {
  43            'default': {
  44                'format': '%(levelname)s in %(module)s: %(message)s',
  45            }
  46        },
  47        'handlers': handlers,
  48        'root': {
  49            'level': logger_level,
  50            'handlers': [pkg]
  51        }
  52    })
  53
  54    _log = getLogger(pkg)
  55    _log.setLevel(logger_level)
  56    return _log
  57
  58
  59def pkg_resource_path(rsrc):
  60    pkg_dir = os.path.dirname(os.path.abspath(__file__))
  61    fpath = os.path.join(pkg_dir, 'resources', rsrc)
  62    if not os.path.exists(fpath):
  63        raise Exception(f"Resource not found {rsrc} (tried {fpath}")
  64
  65    return fpath
  66
  67
  68def make_HASC(cc, adm1, adm2=None):
  69    """
  70    Create a simplie hiearchical path for a boundary
  71    :param cc:
  72    :param adm1:
  73    :param adm2:
  74    :return:
  75    """
  76    if not adm1:
  77        adm1 = '0'
  78    if adm2:
  79        return '{}.{}.{}'.format(cc, adm1, adm2)
  80    else:
  81        return '{}.{}'.format(cc, adm1)
  82
  83
  84def format_coord(lat, lon):
  85    """
  86    2.6, 3.6 format.
  87    :param lat: latitude
  88    :param lon: longitude
  89    :return: string
  90    """
  91    return '{:2.5f},{:3.5f}'.format(float(lat), float(lon))
  92
  93
  94def validate_lat(f):
  95    return (f >= -90.0) and (f <= 90.0)
  96
  97
  98def validate_lon(f):
  99    return (f >= -180.0) and (f <= 180.0)
 100
 101
 102def parse_admin_code(adm1, delim="."):
 103    """
 104    :param delim:
 105    :param adm1: admin level 1 code
 106    :return: ADM1 code if possible.
 107    """
 108    if not adm1:
 109        return "0"
 110
 111    code = adm1
 112    if "?" in adm1:
 113        code = "0"
 114    elif delim in adm1:
 115        cc2, code = adm1.split(delim)
 116    # Normalize Country-level.  Absent ADM1 levels are assigned "0" anyway
 117    if code.strip() in {"", None, "0", "00"}:
 118        code = "0"
 119    return code
 120
 121
 122def distance_cartesian(x1, y1, x2, y2):
 123    """
 124        Given X1, Y1 and X2, Y2 provide the 2-D Cartesian distance between two points.
 125    """
 126    xdist = x2 - x1
 127    ydist = y2 - y1
 128    return sqrt(xdist * xdist + ydist * ydist)
 129
 130
 131EARTH_RADIUS_WGS84 = 6378.137 * 1000  # M,  True: 6378.137
 132
 133
 134def distance_haversine(ddlon1, ddlat1, ddlon2, ddlat2):
 135    """
 136    Returns distance in meters for given decimal degree Lon/Lat (X,Y) pair
 137
 138    http://www.movable-type.co.uk/scripts/latlong.html
 139    """
 140    lat1 = radians(ddlat1)
 141    lon1 = radians(ddlon1)
 142    lat2 = radians(ddlat2)
 143    lon2 = radians(ddlon2)
 144    dLat = lat2 - lat1
 145    dLon = lon2 - lon1
 146    a = (sin(dLat / 2) * sin(dLat / 2)) + (cos(lat1) * cos(lat2) * sin(dLon / 2) * sin(dLon / 2))
 147    c = 2 * atan2(sqrt(a), sqrt(1 - a))
 148    return int(EARTH_RADIUS_WGS84 * c)
 149
 150
 151def location_accuracy(conf, prec_err):
 152    """
 153    Both confidence and precision error are required to be non-zero and positive.
 154
 155    Scale ACCURACY by confidence, and inversely log10( R^2 )
 156    Decreasing accuracy with increasing radius, but keep scale on the order of visible things,
 157    e.g., 0.01 to 1.00.  This is only one definition of accuracy.
 158
 159    Consider confidence = 100 (aka 100% chance we have the right location)
 160
 161    * Country precision ~ +/- 100KM is accuracy = 0.091
 162    * GPS precision is   10 M precision is accuracy 0.33
 163    * 1M precision , accuracy =  1.0, (1 / (1+log(1*1)) = 1/1.  In other words a 1m error is basically "perfect"
 164
 165    :param conf: confidence on 100 point scale (0-100)
 166    :param prec_err: error in location precision, meters
 167    :return:
 168    """
 169    if not conf or not prec_err:
 170        return 0
 171    if conf < 0 or prec_err < 0:
 172        return 0
 173    scale = 0.01 * conf
 174    inv_prec = 1 + log10(prec_err * prec_err)
 175    acc = scale / inv_prec
 176    return float(f"{acc:0.4f}")
 177
 178
 179def _estimate_geohash_precision(r: int):
 180    """
 181    Returns hueristic geohash length for the given radius in meters.
 182
 183    :param r: radius in meters
 184    """
 185    if r > 1000000:
 186        return 1
 187    elif r > 250000:
 188        return 2
 189    elif r > 50000:
 190        return 3
 191    elif r > 10000:
 192        return 4
 193    elif r > 1000:
 194        return 5
 195    elif r > 250:
 196        return 6
 197    elif r > 50:
 198        return 7
 199    elif r > 1:
 200        return 8
 201    else:
 202        raise Exception(f"Not thinking about sub-meter resolution. radius={r}")
 203
 204
def _ll2dict(p: LL):
    """Return the ``lat`` and ``lon`` attributes of ``p`` as a ``{"lat": ..., "lon": ...}`` dict."""
    return {"lat": p.lat, "lon": p.lon}
 207
 208
def _ll2geohash(p: LL):
    """Encode the ``lat``/``lon`` of ``p`` as a geohash, using the encoder's default precision."""
    return geohash_encode(lat=p.lat, lon=p.lon)
 211
 212
def point2geohash(lat: float, lon: float, precision=6):
    """
    Encode a latitude/longitude as a geohash.

    :param lat: latitude
    :param lon: longitude
    :param precision: passed to the encoder as ``precision``; defaults to 6
    :return: result of ``geohash_encode``
    """
    return geohash_encode(lat=lat, lon=lon, precision=precision)
 215
 216
def geohash2point(gh):
    """
    Decode a geohash and yield each decoded value converted to float.

    NOTE: the result is a one-shot generator, not a tuple; unpack or iterate it once.
    NOTE(review): presumably the values are latitude then longitude -- confirm against
    the pygeodesy geohash ``decode`` documentation.

    :param gh: geohash string
    :return: generator of floats
    """
    return (float(x) for x in geohash_decode(gh))
 219
 220
 221def radial_geohash(lat, lon, radius):
 222    """
 223    Propose geohash cells for a given radius from a given point
 224    """
 225    corners = {}
 226    # Find clockwise points at a radius, E, N, S, W. Bearing for North is 0deg.
 227    p1 = LL(lat, lon)
 228    corners["N"] = _ll2geohash(p1.destination(radius, 0))
 229    corners["E"] = _ll2geohash(p1.destination(radius, 90))
 230    corners["S"] = _ll2geohash(p1.destination(radius, 180))
 231    corners["W"] = _ll2geohash(p1.destination(radius, 270))
 232    return corners
 233
 234
 235def geohash_cells_radially(lat: float, lon: float, radius: int):
 236    """
 237    Create a set of geohashes that contain the given area defined by lat,lon + radius
 238    """
 239    ensw = radial_geohash(lat, lon, radius)
 240    radius_error = _estimate_geohash_precision(radius)
 241    cells = set([])
 242    for directional in ensw:
 243        gh = ensw[directional]
 244        cells.add(gh[0:radius_error - 1])
 245    return cells
 246
 247
 248def geohash_cells(gh: str, radius: int):
 249    """
 250    For a radius in meters generate the cells contained within or touched by that radius.
 251    This is approximate precision based on:
 252    https://en.wikipedia.org/wiki/Geohash   which suggests this approximation could be done mathematically
 253    :return: Dict of 8 directionals ~ E, N, S, W; NE, SE, SW, NW.  If radius desired fits entirely within a
 254    lesser precision geohash grid, the only cell returned is "CENTROID", i.e.  radius=2000 (meters) for a geohash such as
 255    `9q5t`
 256    """
 257    radius_error = _estimate_geohash_precision(radius)
 258    if len(gh) < radius_error:
 259        return {"CENTROID": gh}
 260    ghcell = gh[0:radius_error]
 261    return geohash_neighbors(ghcell)
 262
 263
 264class Coordinate:
 265    """
 266    Convenient class for Lat/Lon pair.
 267    Expects a row dict with 'lat' and 'lon',
 268    or kwd args 'lat', 'lon'
 269    @param row default dictionary
 270    """
 271
 272    def __init__(self, row, lat=None, lon=None):
 273        # TODO: set coordinate to X, Y = None, None by default.
 274        self.X = 0.0
 275        self.Y = 0.0
 276        self.mgrs = None
 277        self.lat = self.Y
 278        self.lon = self.X
 279        # Set geohash on demand, otherwise it can be computed from lat,lon
 280        self.geohash = None
 281
 282        if row:
 283            if 'lat' in row and 'lon' in row:
 284                lat = row['lat']
 285                lon = row['lon']
 286
 287        if lat and lon:
 288            self.set(lat, lon)
 289
 290    def validate(self):
 291        return validate_lat(self.Y) and validate_lon(self.X) and (self.X != 0.0 and self.Y != 0.0)
 292
 293    def set(self, lat, lon):
 294        """ Set the location lat, lon"""
 295        self.X = float(lon)
 296        self.Y = float(lat)
 297        self.lat = self.Y
 298        self.lon = self.X
 299
 300    def format_coord(self):
 301        return format_coord(self.Y, self.X)
 302
 303    def string_coord(self):
 304        return ",".join((str(self.lat), str(self.lon)))
 305
 306    def __str__(self):
 307        if self.Y:
 308            return format_coord(self.Y, self.X)
 309        else:
 310            return 'unset'
 311
 312
 313def bbox(lat: float, lon: float, radius: int):
 314    """
 315       Calculate coordinates for SW and NE corners of a SQUARE bounding box of edge length 2 x radius
 316    :param lat: decimal degree latitude
 317    :param lon: decimal degree longitude
 318    :param radius: meters from center point
 319    """
 320    sw, ne = LL(lon, lat).boundsOf(2 * radius, 2 * radius)
 321    return Coordinate(None, lat=sw.lat, lon=sw.lon), Coordinate(None, lat=ne.lat, lon=ne.lon)
 322
 323
 324def centroid(arr: list):
 325    """
 326
 327    :param arr:  a list of numeric coordinates (y,x)
 328    :return: Coordinate -- the average of sum(y), sum(x)
 329    """
 330    n = len(arr)
 331    if not n:
 332        return None
 333    if n == 1:
 334        y, x = arr[0]
 335        return Coordinate(None, lat=y, lon=x)
 336
 337    lat_sum = math.fsum([y for y, x in arr])
 338    lon_sum = math.fsum([x for y, x in arr])
 339    return Coordinate(None, lat=lat_sum / n, lon=lon_sum / n)
 340
 341
 342class Place(Coordinate):
 343    """
 344    Location or GeoBase
 345    + Coordinate
 346    + Place
 347    + Country
 348
 349    or
 350    Location
 351    + Coordinate
 352       + Place
 353
 354       etc.  Not sure of the best data model for inheritance.
 355    This Python API hopes to simplify the concepts in the Java API.
 356
 357    """
 358
 359    def __init__(self, pid, name, lat=None, lon=None):
 360        Coordinate.__init__(self, None, lat=lat, lon=lon)
 361        # Internal DB or Gazetteer ID
 362        self.id = None
 363        # Public or standards Place ID, e.g., GNS, ISO, etc.
 364        self.place_id = pid
 365        self.name = name
 366
 367        self.is_ascii = False
 368        self.is_upper = False
 369        self.adm1_postalcode = None  # Province Postal CODE?
 370        self.place_postalcode = None  # ZIP CODE?
 371        self.name_type = None
 372        self.name_script = None  # Code or label, e.g. L or LATIN
 373        self.country = None
 374        self.country_code = None
 375        self.country_code_fips = None
 376        self.feature_class = None
 377        self.feature_code = None
 378        self.adm1 = None
 379        self.adm1_name = None
 380        self.adm1_iso = None  # Alternate ISO-based ADM1 code used by NGA and others.
 381        self.adm2 = None
 382        self.adm2_name = None
 383        self.source = None
 384        self.name_bias = 0.0
 385        self.id_bias = 0.0
 386        # Precision is actually "Precision Error" in meters
 387        self.precision = -1
 388        self.method = None
 389        # Population stats, if available. Scale is a power-of-2 scale
 390        # starting at about pop of 2^14 as 0, 32K=1, 64K=2, etc.
 391        self.population = -1
 392        self.population_scale = 0
 393        self.hierarchical_path = None
 394
 395        # Internal fields for gazetteer curation and text analytics:
 396        self.name_group = ""
 397        self.search_only = False
 398
 399    def has_coordinate(self):
 400        return self.validate()
 401
 402    def get_location(self):
 403        """ Returns (LAT, LON) tuple
 404        @return: tuple, (lat,lon)
 405        """
 406        return self.Y, self.X
 407
 408    def set_location(self, lat, lon):
 409        self.set(lat, lon)
 410
 411    def __str__(self):
 412        return '{}, {} @({})'.format(self.name, self.country_code, self.string_coord())
 413
 414    def format_feature(self):
 415        """
 416        Yield a consolidated feature coding.
 417        :return:  X/xxxx  format
 418        """
 419        if self.feature_code:
 420            return f"{self.feature_class}/{self.feature_code}"
 421        return self.feature_class
 422
 423
 424class Country(Coordinate):
 425    """
 426    Country metadata
 427    """
 428
 429    def __init__(self):
 430        Coordinate.__init__(self, None)
 431        self.cc_iso2 = None
 432        self.cc_iso3 = None
 433        self.cc_fips = None
 434        self.place_id = None
 435        self.name = None
 436        self.namenorm = None
 437        self.name_type = None
 438        self.aliases = []
 439        self.is_territory = False
 440        self.is_unique_name = False
 441        self.timezones = []
 442        self.languages = set([])
 443        self.primary_language = None
 444
 445    def __str__(self):
 446        return u'{} ({})'.format(self.name, self.cc_iso2)
 447
 448
 449def country_as_place(ctry: Country, name: str, name_type="N", oid=None):
 450    """
 451    Convert to Place.
 452    :param ctry: Country object
 453    :param name: the name to use
 454    :param name_type:
 455    :param oid: row ID
 456    :return:
 457    """
 458    pl = Place(ctry.cc_iso2, name)
 459    pl.id = oid
 460    pl.place_id = ctry.place_id
 461    pl.name_type = name_type
 462    pl.feature_class = "A"
 463    pl.feature_code = "PCLI"
 464    pl.name_bias = 0.0
 465    pl.id_bias = 0.0
 466    pl.country_code = ctry.cc_iso2
 467    pl.country_code_fips = ctry.cc_fips
 468    pl.adm1 = "0"
 469    pl.source = "ISO"
 470    if ctry.is_territory:
 471        pl.feature_code = "PCL"
 472    pl.set_location(ctry.lat, ctry.lon)
 473    return pl
 474
 475
 476def load_countries(csvpath=None):
 477    """ parses Xponents Core/src/main/resource CSV file country-names-2015.csv
 478        putting out an array of Country objects.
 479        :return: array of Country
 480    """
 481    if not csvpath:
 482        csvpath = pkg_resource_path('country-names-2021.csv')
 483
 484    count = 0
 485    with open(csvpath, 'r', encoding="UTF-8") as fh:
 486        columns = "country_name,FIPS_cc,ISO2_cc,ISO3_cc,unique_name,territory,latitude,longitude".split(',')
 487        fio = get_csv_reader(fh, columns)
 488        for row in fio:
 489
 490            # ignore empty row and header.
 491            if 'country_name' not in row:
 492                continue
 493            if row['country_name'] == 'country_name':
 494                continue
 495            count += 1
 496            C = Country()
 497            C.name = row.get('country_name')
 498            C.cc_iso2 = row.get('ISO2_cc').upper()
 499            C.cc_iso3 = row.get('ISO3_cc').upper()
 500            C.cc_fips = row.get('FIPS_cc').upper()
 501
 502            # Internal data set "place ID"
 503            C.place_id = f"C{C.cc_iso2}#{C.cc_fips}#{count}"
 504
 505            C.is_name_unique = get_bool(row.get('unique_name'))
 506            C.is_territory = get_bool(row.get('territory'))
 507            C.namenorm = C.name.lower()
 508            C.set(row.get("latitude"), row.get("longitude"))
 509
 510            countries.append(C)
 511
 512    for C in countries:
 513        if not C.is_territory and C.cc_iso2 not in countries_by_iso:
 514            countries_by_iso[C.cc_iso2] = C
 515            countries_by_iso[C.cc_iso3] = C
 516
 517        if C.cc_fips and C.cc_fips != "*":
 518            countries_by_fips[C.cc_fips] = C
 519
 520        countries_by_name[C.namenorm] = C
 521
 522    global __loaded
 523    __loaded = len(countries_by_iso) > 1
 524
 525    if __loaded:
 526        if "XKX" in countries_by_iso:
 527            countries_by_iso["XKS"] = countries_by_iso.get("XKX")
 528        if "SJM" in countries_by_iso:
 529            countries_by_iso["XSV"] = countries_by_iso.get("SJM")
 530            countries_by_iso["XJM"] = countries_by_iso.get("SJM")
 531        if "PSE" in countries_by_iso:
 532            countries_by_iso["GAZ"] = countries_by_iso.get("PSE")
 533        if "TLS" in countries_by_iso:
 534            countries_by_iso["TMP"] = countries_by_iso.get("TLS")
 535
 536    return countries
 537
 538
 539def get_us_province(adm1: str):
 540    """
 541
 542    :param adm1:  ADM1 code or for territories,
 543    :return:
 544    """
 545    if not usstates:
 546        raise Exception("Run load_us_provinces() first")
 547    return usstates.get(adm1)
 548
 549
 550def load_us_provinces():
 551    """
 552    Load, store internally and return the LIST of US states.
 553    NOTE: Place objects for US States have a location (unlike list of world provinces).
 554    To get location and feature information in full, you must use the SQLITE DB or Xponents Solr.
 555    :return: array of Place objects
 556    """
 557    csvpath = pkg_resource_path('us-state-metadata.csv')
 558    usstate_places = []
 559    with open(csvpath, 'r', encoding="UTF-8") as fh:
 560        columns = ["POSTAL_CODE", "ADM1_CODE", "STATE", "LAT", "LON", "FIPS_CC", "ISO2_CC"]
 561        io = get_csv_reader(fh, columns)
 562        for row in io:
 563            if row['POSTAL_CODE'] == 'POSTAL_CODE': continue
 564
 565            cc = row["ISO2_CC"]
 566            adm1_code = row["ADM1_CODE"][2:]
 567            postal_code = row["POSTAL_CODE"]
 568            # HASC path
 569            place_id = make_HASC(cc, adm1_code)
 570            postal_id = make_HASC(cc, row["POSTAL_CODE"])
 571            adm1 = Place(place_id, row["STATE"], lat=row["LAT"], lon=row["LON"])
 572            adm1.feature_class = "A"
 573            adm1.feature_code = "ADM1"
 574            adm1.name_type = "N"
 575            adm1.geohash = geohash_encode(adm1.lat, adm1.lon, precision=6)
 576
 577            adm1.country_code = cc
 578            adm1.adm1 = adm1_code
 579            adm1.adm1_postalcode = row["POSTAL_CODE"]
 580            adm1.source = "OpenSextant"
 581
 582            # Code alone:
 583            usstates[adm1_code] = adm1
 584            usstates[postal_code] = adm1
 585            usstates[place_id] = adm1
 586            usstates[postal_id] = adm1
 587
 588            usstate_places.append(adm1)
 589    return usstate_places
 590
 591
def load_provinces():
    """
    Load, store and return a dictionary of ADM1 boundary names - provinces, states, republics, etc.
    This simply delegates to load_world_adm1().
    NOTE: Location information is not included in this province listing.  Just Country, ADM1, Name tuples.
    NOTE: This reflects only GEONAMES ADMIN1 CODES ASCII -- which portrays most of the world (except US) as FIPS,
    not ISO.
    :return:  dict
    """
    return load_world_adm1()
 601
 602
 603def load_world_adm1():
 604    """
 605    Load, store and return a dictionary of ADM1 boundary names - provinces, states, republics, etc.
 606    Coding for ADM1 is FIPS based mostly
 607    :return:  dict
 608    """
 609    # Load local country data first, if you have it. US is only one so far.
 610    load_us_provinces()
 611
 612    SOURCE_ID = "G"
 613    csvpath = pkg_resource_path(os.path.join('geonames.org', 'admin1CodesASCII.txt'))
 614
 615    with open(csvpath, 'r', encoding="UTF-8") as fh:
 616        adm1Splitter = re.compile(r'\.')
 617        lineSplitter = re.compile('\t')
 618        for line in fh:
 619            row = lineSplitter.split(line.strip())
 620            src_id = f"{SOURCE_ID}{row[3]}"
 621            if not row[3]:
 622                src_id = None
 623            adm1 = Place(src_id, row[1])
 624            adm1.feature_class = "A"
 625            adm1.feature_code = "ADM1"
 626            adm1.name_type = "N"
 627
 628            cc_adm1 = adm1Splitter.split(row[0], 2)
 629            adm1.country_code = cc_adm1[0]
 630            adm1.adm1 = parse_admin_code(cc_adm1[1])
 631            adm1.source = "G"  # Geonames.org coded.
 632            hasc = make_HASC(adm1.country_code, adm1.adm1)
 633            if adm1.country_code == "US":
 634                adm1.source = "USGS"
 635                if hasc in usstates:
 636                    us_place = usstates[hasc]
 637                    us_place.name = adm1.name
 638                    hasc = make_HASC(us_place.country_code, us_place.adm1)
 639                    adm1_by_hasc[hasc] = adm1
 640
 641            adm1_by_hasc[hasc] = adm1
 642    return adm1_by_hasc
 643
 644
def get_province(cc, adm1):
    """ REQUIRES you load_provinces() first.

    Looks up ``adm1_by_hasc`` using the path built by make_HASC(cc, adm1).

    :param cc: country code
    :param adm1: ADM1 code
    :return: the stored Place, or None when no entry exists for that path
    """
    return adm1_by_hasc.get(make_HASC(cc, adm1))
 649
 650
 651def get_country(namecode, standard="ISO"):
 652    """
 653    Get Country object given a name, ISO or FIPS code.  For codes, you must be
 654    clear about which standard the code is based in. Some code collisions exist.
 655    "ZZ" will NOT be returned for the empty code -- if you pass in a NULL or empty
 656    country code you may have a data quality issue.
 657    :param namecode: 2- or 3-alpha code.
 658    :param standard: 'ISO' or 'FIPS', 'name'
 659    :return:  Country object
 660    """
 661    if not namecode or not isinstance(namecode, str):
 662        return None
 663
 664    if not __loaded:
 665        load_countries()
 666
 667    lookup = namecode.upper()
 668    if standard == "ISO":
 669        return countries_by_iso.get(lookup)
 670    elif standard == "FIPS":
 671        return countries_by_fips.get(lookup)
 672    elif standard == "name":
 673        return countries_by_name.get(namecode.lower())
 674    else:
 675        raise Exception("That standards body '{}' is not known for code {}".format(standard, namecode))
 676
 677
 678def load_major_cities():
 679    """
 680    Loads City geo/demographic information -- this does not try to parse all name variants.
 681
 682    This produces Geonames use of FIPS codes.
 683    :return:
 684    """
 685    csvpath = pkg_resource_path(os.path.join('geonames.org', 'cities15000.txt'))
 686
 687    from csv import reader
 688    with open(csvpath, 'r', encoding="UTF-8") as fh:
 689        rdr = reader(fh, dialect="excel", delimiter="\t")
 690        cities = []
 691        for line in rdr:
 692            if len(line) != 19:
 693                continue
 694            if not line[4]:
 695                print("Not location info for City ~ ", line[0])
 696                continue
 697            #          ID       NAME     LAT                 LON
 698            pl = Place(line[0], line[1], lat=float(line[4]), lon=float(line[5]))
 699            pl.feature_class = line[6]
 700            pl.feature_code = line[7]
 701            pl.country_code = line[8]
 702            alt_cc = line[9]
 703            if alt_cc and alt_cc != pl.country_code:
 704                print("Alternate Country Code", alt_cc)
 705            pl.adm1 = parse_admin_code(line[10])
 706            pl.adm2 = line[11]
 707            # pl.geohash = geohash_encode(pl.lat, pl.lon, precision=6)
 708            try:
 709                pl.population = int(line[14])
 710                pl.population_scale = popscale(pl.population, feature="city")
 711            except:
 712                pass
 713            cities.append(pl)
 714    return cities
 715
 716
 717_pop_scale = {
 718    "city": 13,  # 2^13 ~    8,000
 719    "district": 15,  # 2^15 ~   32,000
 720    "province": 17,  # 2^17 ~  130,000
 721}
 722
 723
 724def popscale(population, feature="city"):
 725    """
 726    Given a population in context of the feature -- provide a
 727    approximation of the size of the feature on a 10 point scale.
 728
 729    Approximations for 10 points:
 730    Largest city is ~15 million
 731    // Few cities top 30 million, e.g., 2^25.  popscale = 25 - 13 = 12.
 732    Largest province is ~135 million
 733
 734    :param population:
 735    :param feature:  city, district, or province allowed.
 736    :return: index on 0..10 scale.
 737    """
 738    if population < 1:
 739        return 0
 740    shifter = _pop_scale.get(feature, 20)
 741    index = mathlog(population, 2) - shifter
 742    return int(index) if index > 0 else 0
 743
 744
 745def is_political(feat_code: str):
 746    """Test a feature code"""
 747    if not feat_code: return False
 748    return feat_code.startswith("PCL")
 749
 750
 751def is_country(feat_code: str):
 752    """Test a feature code"""
 753    return "PCLI" == feat_code
 754
 755
 756def is_administrative(feat: str):
 757    if not feat: return False
 758    return "A" == feat.upper()
 759
 760
 761def is_populated(feat: str):
 762    if not feat: return False
 763    return "P" == feat.upper()
 764
 765
 766def is_academic(feat_class: str, feat_code: str) -> bool:
 767    """
 768
 769    :param feat_class: geonames class
 770    :param feat_code:  geonames designation code
 771    :return:
 772    """
 773    return feat_class and feat_code and feat_class == "S" and feat_code.startswith("SCH")
 774
 775
 776def characterize_location(place: Place, label: str):
 777    """
 778    Experimental: Not comprehensive characterization. This is intended to summarize PlaceCandidates extracted
 779    from text.
 780
 781    Describe a Place in terms of a plain language feature type and the geographic scope or resolution.
 782    E.g, Place object "P/PPL", "city"
 783    E.g,  Place object "A/ADM4"  "admin"
 784    E.g,  Place object "S/COORD", "site"
 785
 786    :param place: Place object
 787    :param label:  text match label, e.g., 'country', 'place', 'coord', etc.
 788    :return: feature string, resolution string
 789    """
 790    res = label
 791    fc = place.feature_class
 792    resolutions = {
 793        "A": "admin",
 794        "P": "city",
 795        "S": "site",
 796        "H": "water",
 797        "R": "path",
 798        "V": "area",
 799        "T": "area",
 800        "L": "area"
 801    }
 802
 803    # Note label should be a limited set -- country, postal, coord, place.
 804    if label == "place":
 805        res = resolutions.get(fc, label)
 806    if label == "coord":
 807        res = "site"
 808
 809    return place.format_feature(), res
 810
 811
 812class TextEntity:
 813    """
 814    A Text span.
 815
 816    classes and routines that align with Java org.opensextant.data and org.opensextant.extraction
 817
 818    * TextEntity: represents a span of text
 819    * TextMatch: a TextEntity matched by a particular routine.  This is the basis for most all
 820    extractors and annotators in OpenSetant.
 821    """
 822
 823    def __init__(self, text, start, end):
 824        self.text = text
 825        self.start = start
 826        self.end = end
 827        self.len = -1
 828        self.is_duplicate = False
 829        self.is_overlap = False
 830        self.is_submatch = False
 831        if self._is_valid():
 832            self.len = self.end - self.start
 833
 834    def __str__(self):
 835        return f"{self.text}({self.start},{self.end})"
 836
 837    def _is_valid(self):
 838        if self.start is None or self.end is None:
 839            return False
 840        return self.start >= 0 and self.end >= 0
 841
 842    def contains(self, x1):
 843        """ if this span contains an offset x1
 844        :param x1:
 845        """
 846        if self.start < 0 or self.end < 0:
 847            return False
 848        return self.start <= x1.start < x1.end <= self.end
 849
 850    def exact_match(self, t):
 851        return t.start == self.start and t.end == self.end and self._is_valid()
 852
 853    def is_within(self, t):
 854        """
 855        if the given annotation, t, contains this
 856        :param t:
 857        :return:
 858        """
 859        return t.contains(self.start) and t.contains(self.end)
 860
 861    def is_after(self, t):
 862        return self.start > t.end
 863
 864    def is_before(self, t):
 865        return self.end < t.start
 866
 867    def overlaps(self, t):
 868        """
 869        Determine if t overlaps self.  If Right or Left match, t overlaps if it is longer.
 870        If t is contained entirely within self, then it is not considered overlap -- it is Contained within.
 871        :param t:
 872        :return:
 873        """
 874        #    a1     a2
 875        #  t1     t2        RIGHT skew
 876        #    a1     a2
 877        #       t1     t2   LEFT skew
 878        #
 879        #   a1  a2
 880        #   t1      t2  RIGHT match
 881        # t1    t2      LEFT match
 882        #   a1  a2
 883        #       t1   t2  minimal OVERLAP
 884        skew_right = t.start < self.start <= t.end < self.end
 885        skew_left = self.start < t.start <= self.end < t.end
 886        left_match = self.end == t.end
 887        right_match = self.start == t.start
 888        if skew_right or skew_left:
 889            return True
 890        return (right_match and skew_left) or (left_match and skew_right)
 891
 892
 893class TextMatch(TextEntity):
 894    """
 895    An entity matched by some tagger; it is a text span with lots of metadata.
 896    """
 897
 898    def __init__(self, *args, label=None):
 899        TextEntity.__init__(self, *args)
 900        self.id = None
 901        self.label = label
 902        self.filtered_out = False
 903        self.attrs = dict()
 904
 905    def __str__(self):
 906        return f"{self.label}/{self.text}({self.start},{self.end})"
 907
 908    def populate(self, attrs: dict):
 909        """
 910        Populate a TextMatch to normalize the set of attributes -- separate class fields on TextMatch from additional
 911        optional attributes.
 912        :param attrs: dict of standard Xponents API outputs.
 913        :return:
 914        """
 915        self.id = attrs.get("match-id")
 916        self.label = attrs.get("type")
 917        self.attrs.update(attrs)
 918        self.filtered_out = get_bool(self.attrs.get("filtered-out"))
 919        for k in ['len', 'length']:
 920            if k in self.attrs:
 921                self.len = self.attrs.get(k)
 922        if self.len is not None and self.start >= 0 and not self.end:
 923            self.end = self.start + self.len
 924
 925        # Remove attribute keys that may be confusing.
 926        for fld in ['offset', 'start', 'end', 'len', 'length', 'type', 'filtered-out', 'text', 'matchtext']:
 927            if fld in self.attrs:
 928                del self.attrs[fld]
 929
 930    def normalize(self):
 931        """
 932        Optional, but recommended routine to normalize the matched data.
 933        That is, parse fields, uppercase, streamline punctuation, etc.
 934        As well, given such normalization result, this is the opportunity to additionally
 935        validate the match.
 936        :return:
 937        """
 938        pass
 939
 940
 941class PlaceCandidate(TextMatch):
 942    """
 943    A TextMatch representing any geographic mention -- a Place object will
 944    represent the additional attributes for the chosen place.
 945    see also in Java org.opensextant.extractors.geo.PlaceCandidate class, which is
 946    a more in-depth version of this.  This Python class represents the
 947    response from the REST API, for example.
 948
 949    """
 950
 951    def __init__(self, *args, **kwargs):
 952        TextMatch.__init__(self, *args, **kwargs)
 953        self.confidence = 0
 954        self.rules = []
 955        self.is_country = False
 956        self.place = None
 957        # Location certainty is a simple meausre 0.0 to 1.0 to convey confidence + precision in one metric
 958        self.location_certainty = -1
 959
 960    def populate(self, attrs: dict):
 961        """
 962        Deserialize the attributes dict from either TextMatch schema or Place schema
 963        :param attrs:
 964        :return:
 965        """
 966        TextMatch.populate(self, attrs)
 967        geo = Place(None, attrs.get("name"), lat=attrs.get("lat"), lon=attrs.get("lon"))
 968        if not geo.name:
 969            geo.name = self.text
 970
 971        # attribute / schema does not align 100% here.
 972        geo.country_code = attrs.get("cc")
 973        geo.adm1 = attrs.get("adm1")
 974        geo.precision = attrs.get("prec")
 975        geo.feature_class = attrs.get("feat_class")
 976        geo.feature_code = attrs.get("feat_code")
 977        geo.adm1_name = attrs.get("province-name")
 978        geo.geohash = attrs.get("geohash")
 979        geo.method = attrs.get("method")
 980
 981        # Combined match + geo-location confidence:
 982        self.confidence = attrs.get("confidence")
 983        if "rules" in attrs:
 984            # One or more geo-inferencing rules
 985            self.rules = attrs["rules"].split(";")
 986
 987        self.is_country = self.label == "country" or is_country(geo.feature_code)
 988        if self.is_country:
 989            # Zero out country location; Let user derive country from metadata.
 990            geo.lat = None
 991            geo.lon = None
 992        self.place = geo
 993        # Items like coordinates and cities, etc receive a location certainty.  Countries do not.
 994        self.location_certainty = location_accuracy(self.confidence, geo.precision)
 995
 996
 997class Extractor(ABC):
 998    def __init__(self):
 999        self.id = None
1000
1001    @abstractmethod
1002    def extract(self, text, **kwargs):
1003        """
1004
1005        :param text: Unicode text input
1006        :keyword features: an array of features to extract, e.g., "coordinate", "place", "MONEY"
1007        :return: array of TextMatch
1008        """
1009        pass
1010
1011
def render_match(m):
    """
    Render the basic span fields of a match as a plain dict.

    :param m: TextMatch
    :return: dict, or None when ``m`` is not a TextMatch
    """
    if isinstance(m, TextMatch):
        return {
            "type": m.label,
            "text": m.text,
            "offset": m.start,
            "length": m.len,
            "filtered-out": m.filtered_out,
        }
    return None
1028
1029
# Values for the 'submatch' key that reduce_matches_dict() writes into annotation dicts.
# NOT_SUBMATCH is never written by reduce_matches_dict() itself.
NOT_SUBMATCH = 0
IS_SUBMATCH = 1
IS_DUPLICATE = 2
1033
1034
def reduce_matches(matches):
    """
    Flag each match that is a submatch, an overlap, or an exact duplicate of another.
    This is the object-oriented counterpart of reduce_matches_dict.

    Flags are set as attributes on the match objects (``is_duplicate``, ``is_submatch``,
    ``is_overlap``); matches with ``filtered_out`` set are ignored entirely.

    :param matches: array of TextMatch (or TextEntity).
    :return:
    """
    if len(matches) < 2:
        return

    for position, first in enumerate(matches):
        if first.filtered_out:
            continue
        a1, a2 = first.start, first.end

        # Compare against every later match; we cannot stop at the first hit
        # because several later spans may relate to this one.
        for other in matches[position + 1:]:
            if other.filtered_out:
                continue
            b1, b2 = other.start, other.end

            # Disjoint spans: one lies entirely before or after the other.
            if a2 < b1 or a1 > b2:
                continue

            if b1 == a1 and b2 == a2:
                # Exact duplicate: the earlier item in the array is kept, the later one is marked.
                other.is_duplicate = True
            elif b1 <= a1 < a2 <= b2:
                # The earlier span lies within the later one.
                first.is_submatch = True
            elif a1 <= b1 < b2 <= a2:
                # The later span lies within the earlier one.
                other.is_submatch = True
            elif a1 <= b2 <= a2 or b1 <= a2 <= b2:
                # Partial overlap:
                #  b1    b2
                #     a1    a2
                first.is_overlap = True
                other.is_overlap = True
1082
1083
def reduce_matches_dict(matches):
    """
    Accepts an array of annotations (dicts with 'start' and 'end'). Inserts the "submatch"
    key into a dict when that annotation is wholly contained by another (IS_SUBMATCH) or is
    an exact duplicate of an earlier one (IS_DUPLICATE).
    Each pair of items is compared exactly once.

    Filtered-out matches are not checked in this version.

    :param matches: array of dicts.
    """
    total = len(matches)
    if total < 2:
        return

    for i, outer in enumerate(matches):
        a1 = outer['start']
        a2 = outer['end']

        for j in range(i + 1, total):
            inner = matches[j]
            b1 = inner['start']
            b2 = inner['end']

            # No contact at all: outer is entirely before or entirely after inner.
            if a2 < b1 or a1 > b2:
                continue

            if b1 == a1 and b2 == a2:
                inner['submatch'] = IS_DUPLICATE
            elif b1 <= a1 < a2 <= b2:
                # Outer is contained in inner.
                outer['submatch'] = IS_SUBMATCH
            elif a1 <= b1 < b2 <= a2:
                # Inner is contained in outer; more items may also be contained, so keep going.
                inner['submatch'] = IS_SUBMATCH
    return
1131
1132
1133# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
1134# Language Code Support
1135# ISO 639 code book support -- Language codes
1136# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
1137
class Language:
    """
    A single language entry: codes plus one or more names.
    Coding is 3-char or 2-char; either is optional.
    In some situations there are competing 2-char codes in code books, such as Lib of Congress (LOC)
    """

    def __init__(self, iso3, iso2, nmlist: list, locale=None):
        self.code_iso3 = iso3
        self.code = iso2
        self.names = nmlist
        self.locale = locale
        # An empty or None name list is accepted; anything else must be a real list.
        if nmlist and not isinstance(nmlist, list):
            raise Exception("Name list is a list of names for the language. The first one is the default.")

    def get_name(self):
        """Return the default (first) name, or None when there are no names."""
        return self.names[0] if self.names else None

    def name_code(self):
        """Return the default name in lower case, or None when there are no names."""
        return self.names[0].lower() if self.names else None

    def __str__(self):
        return "{}({})".format(self.name_code(), self.code)
1166
1167
# ISO 639 lookup: lower-cased language code, locale, or language name => Language.
# Filled by add_language(); load_languages() populates it on first use.
language_map = {}
1170
1171
def list_languages():
    """
    List out a flattened list of languages, de-duplicated by language code.

    A Language is listed the first time either its 2-char code or its 3-char code is
    encountered. A Language that carries both codes is listed once, not once per code.

    TODO: alternatively list out every language
    :return: list of Language
    """
    load_languages()
    langs = []
    visited = set()
    for lang in language_map.values():
        is_new = False
        for code in (lang.code, lang.code_iso3):
            if code and code not in visited:
                visited.add(code)
                is_new = True
        if is_new:
            langs.append(lang)
    return langs
1193
1194
def add_language(lg: Language, override=False):
    """
    Register a Language in ``language_map`` under each of its lower-cased keys:
    2-char code, 3-char code, locale, and every name.

    The language map for ISO 2-alpha and 3-alpha codes should be protected from language IDs that are dialect or locale

    "en" ==> en-au, en-gb, en-us, etc.?  This is ambiguous
    The reverse is true -- "en-gb" is at least "en" or "eng" english

    :param lg: Language to add; a falsy value is ignored.
    :param override: when True, an existing key longer than 3 chars is remapped to ``lg``
    :return:
    :raises Exception: if a key longer than 3 chars already exists and override is not in effect
    """
    if not lg:
        return

    codes = []
    if lg.code:
        codes.append(lg.code.lower())
    if lg.code_iso3:
        codes.append(lg.code_iso3.lower())
    if lg.locale:
        codes.append(lg.locale.lower())

    if lg.names:
        for nm in lg.names:
            lang_name_norm = nm.lower()
            codes.append(lang_name_norm)
            if "modern" in lang_name_norm:
                # Also index the portion of the name before the first comma.
                # Note: this turns override on for ALL keys of this language.
                nm2 = get_list(lang_name_norm, delim=",")[0]
                codes.append(nm2)
                override = True

    for k in set(codes):
        exists = k in language_map

        # coding rule: 2 or 3 char alpha codes for ISO or Biblio code books are not overriden.
        # The first Language registered under a short key keeps it, regardless of override.
        if len(k) <= 3 and exists:
            continue

        if exists and not override:
            raise Exception(f"Forcibly remap language code? {k}")

        language_map[k] = lg
1237
1238
def get_language(code: str) -> Language:
    """
    Look up a Language by code, locale, or name (case-insensitive).

    :param code: language ID or name
    :return: Language or None
    """
    if not code:
        return None

    load_languages()
    key = code.lower()

    # Most cases: a short code, a purely alphabetic name, or a key that is mapped as-is.
    if len(key) <= 3 or key.isalpha() or key in language_map:
        return language_map.get(key)

    # Code is odd, like a locale?  "EN_GB" or en-gb, etc.
    # Progressively trim the key at each delimiter and retry.
    for separator in ("-", " ", "_"):
        key = key.split(separator)[0]
        if key in language_map:
            return language_map.get(key)
    return None
1266
1267
def get_lang_name(code: str):
    """
    Return the default name for a language ID.

    :param code: language ID or name
    :return: default language name, or None when ``code`` is empty
    :raises Exception: when the ID is not a known language
    """
    if not code:
        return None

    found = get_language(code)
    if not found:
        raise Exception(f"No such language ID {code}")
    return found.get_name()
1277
1278
def get_lang_code(txt: str):
    """
    Return the 2-char code (``Language.code``) for a language ID or name.

    :param txt: language ID or name
    :return: the ``code`` field of the matching Language
    :raises Exception: when no language matches
    """
    found = get_language(txt)
    if not found:
        raise Exception(f"No such language ID {txt}")
    return found.code
1285
1286
def is_lang_romance(lg: str):
    """True if the language is Spanish, Portuguese, Italian, French, or Romanian."""
    found = get_language(lg)
    return found.code in {"es", "pt", "it", "fr", "ro"} if found else False
1294
1295
def is_lang_euro(lg: str):
    """
    true if lang is European -- romance, german, english, etc

    :param lg: language ID or name
    :return: bool; False when the language is unknown
    """
    L = get_language(lg)
    if not L:
        return False
    c = L.code
    return c in {"es", "pt", "it", "fr", "ro",
                 "de", "en",
                 # ISO 639-1 codes for Bulgarian, Czech and Polish.
                 "bg", "cs", "pl",
                 # Legacy non-ISO spellings, kept so existing behavior is a strict subset.
                 "bu", "cz", "po",
                 "nl", "el", "sq"}
1309
1310
def is_lang_english(lg: str):
    """True if the language ID or name resolves to English ("en")."""
    found = get_language(lg)
    return found.code == "en" if found else False
1316
1317
def is_lang_cjk(lg: str):
    """True if the language code is one of "zh", "zt", "ko", "ja"."""
    found = get_language(lg)
    return found.code in {"zh", "zt", "ko", "ja"} if found else False
1323
1324
def is_lang_chinese(lg: str):
    """True if the language code is "zh" or "zt"."""
    found = get_language(lg)
    return found.code in {"zh", "zt"} if found else False
1330
1331
# 3-char codes that load_languages() skips when reading the ISO 639-2 data file.
IGNORE_LANGUAGES = {"gaa"}
1333
1334
def load_languages():
    """
    Populate ``language_map`` once from the packaged "ISO-639-2_utf-8.txt" resource,
    then add a set of hand-picked extra codes and locales.

    Subsequent calls return immediately. Insertion order matters: add_language() never
    replaces an existing key of 3 chars or fewer, so the first Language registered
    under a short code keeps it.
    """
    global __language_map_init
    if __language_map_init:
        return

    fpath = pkg_resource_path("ISO-639-2_utf-8.txt")
    langset = load_datafile(fpath, delim="|")
    for lang in langset:
        # Row layout as used here: [0] 3-char code, [1] alternate 3-char code,
        # [2] 2-char code, [3] ';'-delimited names.
        lang_names = get_list(lang[3], delim=";")

        iso3 = lang[0]
        bib3 = lang[1]
        # Rows whose first field starts with '#' are skipped.
        if iso3 and iso3.startswith("#"):
            continue

        if iso3 in IGNORE_LANGUAGES:
            continue

        iso2=lang[2]
        L = Language(iso3, iso2, lang_names)
        add_language(L)
        if bib3:
            # Second entry for the alternate 3-char code; override avoids the
            # "remap" error on the names already registered just above.
            L = Language(bib3, iso2, lang_names)
            add_language(L, override=True)

    # Some odd additions -- Bibliographic vs. Terminologic codes may vary.
    # FRE vs. FRA is valid for French, for example.
    #
    for lg in [Language("fra", "fr", ["French"]),

               Language("zho", "zh", ["Chinese"], locale="zh-cn"),

               Language(None, "zt", ["Traditional Chinese"]),
               Language(None, "zt", ["Traditional Chinese/Taiwain"], locale="zh-tw"),

               Language("prs", "dr", ["Dari", "Afghan Persian"], locale="fa-AF"),
               Language("prs", "dr", ["Dari", "Afghan Persian"]),
               Language("fas", "fa", ["Farsi", "Persian"], locale="fa-IR"),
               Language("eng", "en", ["English"]),

               Language("eng", "en", ["English/British"], locale="en-gb"),
               Language("eng", "en", ["English/USA"], locale="en-us"),
               Language("eng", "en", ["English/United Kingdom"], locale="en-uk"),
               Language("eng", "en", ["English/Canadian"], locale="en-ca"),
               Language("eng", "en", ["English/Australian"], locale="en-au")]:

        add_language(lg, override=True)


    __language_map_init = True
def logger_config(logger_level: str, pkg: str):
def logger_config(logger_level: str, pkg: str):
    """
    Configure logging to write to stdout and return a logger for the package.

    The root logger is reconfigured via dictConfig with a single stream handler
    named after ``pkg``.

    :param logger_level: logging level for both the root logger and the package logger
    :param pkg: Name of package; used as the handler name and the logger name
    :return: the logger for ``pkg``
    """
    stdout_handler = {
        'class': 'logging.StreamHandler',
        'stream': sys.stdout,
        'formatter': 'default',
    }
    settings = {
        'version': 1,
        'formatters': {
            'default': {
                'format': '%(levelname)s in %(module)s: %(message)s',
            },
        },
        'handlers': {pkg: stdout_handler},
        'root': {
            'level': logger_level,
            'handlers': [pkg],
        },
    }
    dictConfig(settings)

    pkg_logger = getLogger(pkg)
    pkg_logger.setLevel(logger_level)
    return pkg_logger

LOGGING :param logger_level: :param pkg: Name of package :return:

def make_HASC(cc, adm1, adm2=None):
def make_HASC(cc, adm1, adm2=None):
    """
    Create a simple hierarchical path for a boundary, e.g. "CC.ADM1" or "CC.ADM1.ADM2".

    :param cc: country code
    :param adm1: ADM1 code; '0' is substituted when it is empty
    :param adm2: optional ADM2 code
    :return: dot-delimited path string
    """
    province = adm1 if adm1 else '0'
    if adm2:
        return f'{cc}.{province}.{adm2}'
    return f'{cc}.{province}'

Create a simple hierarchical path for a boundary :param cc: :param adm1: :param adm2: :return:

def format_coord(lat, lon):
def format_coord(lat, lon):
    """
    Format a lat/lon pair as "LAT,LON" with five decimal places each.

    :param lat: latitude (anything float() accepts)
    :param lon: longitude (anything float() accepts)
    :return: string
    """
    y = float(lat)
    x = float(lon)
    return f'{y:2.5f},{x:3.5f}'

Formats the pair as "LAT,LON" with five decimal places each (2.5, 3.5 format). :param lat: latitude :param lon: longitude :return: string

def parse_admin_code(adm1, delim='.'):
def parse_admin_code(adm1, delim="."):
    """
    Extract a normalized ADM1 code from a bare code or a delimited path such as "CC.ADM1".

    :param adm1: admin level 1 code, optionally prefixed by a country code ("US.CA");
       longer paths ("US.CA.037") are accepted and only the second element is used.
    :param delim: path delimiter
    :return: ADM1 code if possible; "0" for empty, unknown ("?") or country-level ("0", "00") codes.
    """
    if not adm1:
        return "0"

    code = adm1
    if "?" in adm1:
        code = "0"
    elif delim in adm1:
        # Take the element after the country code. Indexing rather than 2-tuple
        # unpacking keeps paths with more than two elements from raising ValueError.
        code = adm1.split(delim)[1]
    # Normalize Country-level.  Absent ADM1 levels are assigned "0" anyway
    if code.strip() in {"", "0", "00"}:
        code = "0"
    return code

:param delim: :param adm1: admin level 1 code :return: ADM1 code if possible.

def distance_cartesian(x1, y1, x2, y2):
def distance_cartesian(x1, y1, x2, y2):
    """
    Return the 2-D Cartesian (straight-line) distance between points (X1, Y1) and (X2, Y2).
    """
    dx, dy = x2 - x1, y2 - y1
    return sqrt(dx * dx + dy * dy)

Given X1, Y1 and X2, Y2 provide the 2-D Cartesian distance between two points.

def distance_haversine(ddlon1, ddlat1, ddlon2, ddlat2):
def distance_haversine(ddlon1, ddlat1, ddlon2, ddlat2):
    """
    Haversine distance between two decimal-degree Lon/Lat (X,Y) pairs.
    Note the argument order: longitude first, then latitude.

    http://www.movable-type.co.uk/scripts/latlong.html

    :return: distance in meters, truncated to an int
    """
    lat1, lon1, lat2, lon2 = (radians(deg) for deg in (ddlat1, ddlon1, ddlat2, ddlon2))
    half_dlat_sin = sin((lat2 - lat1) / 2)
    half_dlon_sin = sin((lon2 - lon1) / 2)
    a = (half_dlat_sin * half_dlat_sin) + (cos(lat1) * cos(lat2) * half_dlon_sin * half_dlon_sin)
    central_angle = 2 * atan2(sqrt(a), sqrt(1 - a))
    return int(EARTH_RADIUS_WGS84 * central_angle)

Returns distance in meters for given decimal degree Lon/Lat (X,Y) pair

http://www.movable-type.co.uk/scripts/latlong.html

def location_accuracy(conf, prec_err):
def location_accuracy(conf, prec_err):
    """
    Both confidence and precision error are required to be non-zero and positive.

    Scale ACCURACY by confidence, and inversely log10( R^2 )
    Decreasing accuracy with increasing radius, but keep scale on the order of visible things,
    e.g., 0.01 to 1.00.  This is only one definition of accuracy.

    Consider confidence = 100 (aka 100% chance we have the right location)

    * Country precision ~ +/- 100KM is accuracy = 0.091
    * GPS precision is   10 M precision is accuracy 0.33
    * 1M precision , accuracy =  1.0, (1 / (1+log(1*1))) = 1/1.  In other words a 1m error is basically "perfect"

    A precision error below 1 meter is treated as 1 meter.

    :param conf: confidence on 100 point scale (0-100)
    :param prec_err: error in location precision, meters
    :return: accuracy rounded to 4 decimal places; 0 when either input is missing, zero or negative
    """
    if not conf or not prec_err:
        return 0
    if conf < 0 or prec_err < 0:
        return 0
    # Clamp sub-meter errors: log10(r*r) is negative for r < 1, which would yield a
    # negative accuracy, or a division by zero when 1 + log10(r*r) == 0.
    radius = max(prec_err, 1)
    scale = 0.01 * conf
    inv_prec = 1 + log10(radius * radius)
    acc = scale / inv_prec
    return float(f"{acc:0.4f}")

Both confidence and precision error are required to be non-zero and positive.

Scale ACCURACY by confidence, and inversely log10( R^2 ) Decreasing accuracy with increasing radius, but keep scale on the order of visible things, e.g., 0.01 to 1.00. This is only one definition of accuracy.

Consider confidence = 100 (aka 100% chance we have the right location)

  • Country precision ~ +/- 100KM is accuracy = 0.091
  • GPS precision is 10 M precision is accuracy 0.33
  • 1M precision , accuracy = 1.0, (1 / (1+log(1*1)) = 1/1. In other words a 1m error is basically "perfect"

:param conf: confidence on 100 point scale (0-100) :param prec_err: error in location precision, meters :return:

def radial_geohash(lat, lon, radius):
def radial_geohash(lat, lon, radius):
    """
    Propose geohash cells for a given radius from a given point.

    :return: dict keyed "N", "E", "S", "W" with the geohash of the point found
       at ``radius`` from (lat, lon) along each bearing.
    """
    origin = LL(lat, lon)
    # Compass bearings in degrees; North is 0deg, proceeding clockwise.
    bearings = (("N", 0), ("E", 90), ("S", 180), ("W", 270))
    return {
        compass: _ll2geohash(origin.destination(radius, degrees))
        for compass, degrees in bearings
    }

Propose geohash cells for a given radius from a given point

def geohash_cells_radially(lat: float, lon: float, radius: int):
def geohash_cells_radially(lat: float, lon: float, radius: int):
    """
    Create a set of geohashes that contain the given area defined by lat,lon + radius.

    :return: set of geohash prefixes, one per compass point (N, E, S, W), each cut to one
       character less than the precision estimated for the radius.
    """
    compass_cells = radial_geohash(lat, lon, radius)
    prefix_length = _estimate_geohash_precision(radius) - 1
    return {cell[0:prefix_length] for cell in compass_cells.values()}

Create a set of geohashes that contain the given area defined by lat,lon + radius

def geohash_cells(gh: str, radius: int):
def geohash_cells(gh: str, radius: int):
    """
    For a radius in meters generate the cells contained within or touched by that radius.
    This is approximate precision based on:
    https://en.wikipedia.org/wiki/Geohash   which suggests this approximation could be done mathematically

    :param gh: geohash of the center point
    :param radius: radius in meters
    :return: Dict of 8 directionals ~ E, N, S, W; NE, SE, SW, NW.  If radius desired fits entirely within a
    lesser precision geohash grid, the only cell returned is "CENTROID", i.e.  radius=2000 (meters) for a geohash such as
    `9q5t`
    """
    precision = _estimate_geohash_precision(radius)
    if len(gh) >= precision:
        # Trim the geohash to the estimated precision, then take that cell's neighbors.
        return geohash_neighbors(gh[0:precision])
    return {"CENTROID": gh}

For a radius in meters generate the cells contained within or touched by that radius. This is approximate precision based on: https://en.wikipedia.org/wiki/Geohash which suggests this approximation could be done mathematically :return: Dict of 8 directionals ~ E, N, S, W; NE, SE, SW, NW. If radius desired fits entirely within a lesser precision geohash grid, the only cell returned is "CENTROID", i.e. radius=2000 (meters) for a geohash such as 9q5t

class Coordinate:
class Coordinate:
    """
    Convenient class for a Lat/Lon pair.
    Accepts either a row dict holding 'lat' and 'lon', or the keyword args 'lat', 'lon'.
    When the row carries both keys, the row values take precedence over the keyword args.

    X/lon and Y/lat mirror each other; an unset coordinate is (0.0, 0.0).
    @param row default dictionary
    """

    def __init__(self, row, lat=None, lon=None):
        # TODO: set coordinate to X, Y = None, None by default.
        self.X = 0.0
        self.Y = 0.0
        self.mgrs = None
        self.lat = self.Y
        self.lon = self.X
        # Set geohash on demand, otherwise it can be computed from lat,lon
        self.geohash = None

        if row and 'lat' in row and 'lon' in row:
            lat, lon = row['lat'], row['lon']

        # Falsy values (None, 0, "") leave the coordinate unset.
        if lat and lon:
            self.set(lat, lon)

    def validate(self):
        """True if lat and lon are in range and neither is exactly 0.0."""
        return validate_lat(self.Y) and validate_lon(self.X) and (self.X != 0.0 and self.Y != 0.0)

    def set(self, lat, lon):
        """ Set the location lat, lon; both values are converted with float()."""
        self.X = float(lon)
        self.Y = float(lat)
        self.lat, self.lon = self.Y, self.X

    def format_coord(self):
        """Return the module-level format_coord() rendering of this point."""
        return format_coord(self.Y, self.X)

    def string_coord(self):
        """Return "lat,lon" using the plain str() of each value."""
        return ",".join(map(str, (self.lat, self.lon)))

    def __str__(self):
        return format_coord(self.Y, self.X) if self.Y else 'unset'

Convenient class for Lat/Lon pair. Expects a row dict with 'lat' and 'lon', or kwd args 'lat', 'lon' @param row default dictionary

def set(self, lat, lon):
294    def set(self, lat, lon):
295        """ Set the location lat, lon"""
296        self.X = float(lon)
297        self.Y = float(lat)
298        self.lat = self.Y
299        self.lon = self.X

Set the location lat, lon

def bbox(lat: float, lon: float, radius: int):
def bbox(lat: float, lon: float, radius: int):
    """
    Calculate coordinates for SW and NE corners of a SQUARE bounding box of edge length 2 x radius

    :param lat: decimal degree latitude
    :param lon: decimal degree longitude
    :param radius: meters from center point
    :return: tuple of two Coordinate objects, (SW corner, NE corner)
    """
    # LatLon takes latitude first; passing (lon, lat) would center the box on the wrong point.
    sw, ne = LL(lat, lon).boundsOf(2 * radius, 2 * radius)
    return Coordinate(None, lat=sw.lat, lon=sw.lon), Coordinate(None, lat=ne.lat, lon=ne.lon)

Calculate coordinates for SW and NE corners of a SQUARE bounding box of edge length 2 x radius :param lat: decimal degree latitude :param lon: decimal degree longitude :param radius: meters from center point

def centroid(arr: list):
def centroid(arr: list):
    """
    Average a list of points.

    :param arr:  a list of numeric coordinates (y,x)
    :return: Coordinate -- the average of sum(y), sum(x); None for an empty list
    """
    count = len(arr)
    if count == 0:
        return None
    if count == 1:
        only_lat, only_lon = arr[0]
        return Coordinate(None, lat=only_lat, lon=only_lon)

    lat_total = math.fsum(y for y, _ in arr)
    lon_total = math.fsum(x for _, x in arr)
    return Coordinate(None, lat=lat_total / count, lon=lon_total / count)

:param arr: a list of numeric coordinates (y,x) :return: Coordinate -- the average of sum(y), sum(x)

class Place(Coordinate):
class Place(Coordinate):
    """
    A named location: a Coordinate plus gazetteer metadata (name, codes, feature type, etc.)

    Data model, one of:
      Location or GeoBase
      + Coordinate
      + Place
      + Country

    or
      Location
      + Coordinate
         + Place

    Not sure of the best data model for inheritance.
    This Python API hopes to simplify the concepts in the Java API.
    """

    def __init__(self, pid, name, lat=None, lon=None):
        Coordinate.__init__(self, None, lat=lat, lon=lon)
        # Internal DB or Gazetteer ID
        self.id = None
        # Public or standards Place ID, e.g., GNS, ISO, etc.
        self.place_id = pid
        self.name = name

        # Name properties
        self.is_ascii = False
        self.is_upper = False
        self.adm1_postalcode = None  # Province Postal CODE?
        self.place_postalcode = None  # ZIP CODE?
        self.name_type = None
        self.name_script = None  # Code or label, e.g. L or LATIN

        # Country, feature and administrative boundary coding
        self.country = None
        self.country_code = None
        self.country_code_fips = None
        self.feature_class = None
        self.feature_code = None
        self.adm1 = None
        self.adm1_name = None
        self.adm1_iso = None  # Alternate ISO-based ADM1 code used by NGA and others.
        self.adm2 = None
        self.adm2_name = None
        self.source = None
        self.name_bias = 0.0
        self.id_bias = 0.0
        # Precision is actually "Precision Error" in meters
        self.precision = -1
        self.method = None
        # Population stats, if available. Scale is a power-of-2 scale
        # starting at about pop of 2^14 as 0, 32K=1, 64K=2, etc.
        self.population = -1
        self.population_scale = 0
        self.hierarchical_path = None

        # Internal fields for gazetteer curation and text analytics:
        self.name_group = ""
        self.search_only = False

    def has_coordinate(self):
        """True if this place carries a coordinate that passes validate()."""
        return self.validate()

    def get_location(self):
        """ Returns (LAT, LON) tuple
        @return: tuple, (lat,lon)
        """
        return (self.Y, self.X)

    def set_location(self, lat, lon):
        """Alias for set()."""
        self.set(lat, lon)

    def __str__(self):
        return f'{self.name}, {self.country_code} @({self.string_coord()})'

    def format_feature(self):
        """
        Yield a consolidated feature coding.
        :return:  X/xxxx  format, or just the feature class when there is no feature code
        """
        if not self.feature_code:
            return self.feature_class
        return "{}/{}".format(self.feature_class, self.feature_code)

Location or GeoBase

  • Coordinate
  • Place
  • Country

or Location

  • Coordinate
    • Place

etc. Not sure of the best data model for inheritance. This Python API hopes to simplify the concepts in the Java API.

def get_location(self):
403    def get_location(self):
404        """ Returns (LAT, LON) tuple
405        @return: tuple, (lat,lon)
406        """
407        return self.Y, self.X

Returns (LAT, LON) tuple @return: tuple, (lat,lon)

def format_feature(self):
415    def format_feature(self):
416        """
417        Yield a consolidated feature coding.
418        :return:  X/xxxx  format
419        """
420        if self.feature_code:
421            return f"{self.feature_class}/{self.feature_code}"
422        return self.feature_class

Yield a consolidated feature coding. :return: X/xxxx format

Inherited Members
Coordinate
set
class Country(Coordinate):
class Country(Coordinate):
    """
    Country metadata: codes, names, and a few flags, on top of a Coordinate.
    """

    def __init__(self):
        Coordinate.__init__(self, None)
        # Country codes
        self.cc_iso2 = None
        self.cc_iso3 = None
        self.cc_fips = None
        self.place_id = None
        # Naming
        self.name = None
        self.namenorm = None
        self.name_type = None
        self.aliases = []
        # Flags
        self.is_territory = False
        self.is_unique_name = False
        # Other metadata
        self.timezones = []
        self.languages = set()
        self.primary_language = None

    def __str__(self):
        return f'{self.name} ({self.cc_iso2})'

Country metadata

Inherited Members
Coordinate
set
def country_as_place(ctry: Country, name: str, name_type='N', oid=None):
def country_as_place(ctry: Country, name: str, name_type="N", oid=None):
    """
    Convert a Country to a Place with administrative feature coding.

    :param ctry: Country object
    :param name: the name to use
    :param name_type: name type code assigned to the Place
    :param oid: row ID
    :return: Place with feature A/PCLI, or A/PCL for a territory
    """
    place = Place(ctry.cc_iso2, name)
    place.id = oid
    place.place_id = ctry.place_id
    place.name_type = name_type
    place.feature_class = "A"
    place.feature_code = "PCL" if ctry.is_territory else "PCLI"
    place.name_bias = 0.0
    place.id_bias = 0.0
    place.country_code = ctry.cc_iso2
    place.country_code_fips = ctry.cc_fips
    # Country-level entries use "0" as their ADM1 code.
    place.adm1 = "0"
    place.source = "ISO"
    place.set_location(ctry.lat, ctry.lon)
    return place

Convert to Place. :param ctry: Country object :param name: the name to use :param name_type: :param oid: row ID :return:

def load_countries(csvpath=None):
def load_countries(csvpath=None):
    """ Parses a country-names CSV file (by default the packaged country-names-2021.csv),
        appending Country objects to the module-level ``countries`` list and indexing them in
        ``countries_by_iso``, ``countries_by_fips`` and ``countries_by_name``.

        :param csvpath: optional path to a CSV with the columns
           country_name,FIPS_cc,ISO2_cc,ISO3_cc,unique_name,territory,latitude,longitude
        :return: array of Country
    """
    if not csvpath:
        csvpath = pkg_resource_path('country-names-2021.csv')

    count = 0
    with open(csvpath, 'r', encoding="UTF-8") as fh:
        columns = "country_name,FIPS_cc,ISO2_cc,ISO3_cc,unique_name,territory,latitude,longitude".split(',')
        fio = get_csv_reader(fh, columns)
        for row in fio:

            # ignore empty row and header.
            if 'country_name' not in row:
                continue
            if row['country_name'] == 'country_name':
                continue
            count += 1
            C = Country()
            C.name = row.get('country_name')
            C.cc_iso2 = row.get('ISO2_cc').upper()
            C.cc_iso3 = row.get('ISO3_cc').upper()
            C.cc_fips = row.get('FIPS_cc').upper()

            # Internal data set "place ID"
            C.place_id = f"C{C.cc_iso2}#{C.cc_fips}#{count}"

            # Country declares `is_unique_name`; `is_name_unique` is the attribute this
            # loader has historically set. Populate both so either spelling works.
            unique_name = get_bool(row.get('unique_name'))
            C.is_unique_name = unique_name
            C.is_name_unique = unique_name
            C.is_territory = get_bool(row.get('territory'))
            C.namenorm = C.name.lower()
            C.set(row.get("latitude"), row.get("longitude"))

            countries.append(C)

    for C in countries:
        # Territories do not claim ISO codes; the first non-territory row for a code wins.
        if not C.is_territory and C.cc_iso2 not in countries_by_iso:
            countries_by_iso[C.cc_iso2] = C
            countries_by_iso[C.cc_iso3] = C

        if C.cc_fips and C.cc_fips != "*":
            countries_by_fips[C.cc_fips] = C

        countries_by_name[C.namenorm] = C

    global __loaded
    __loaded = len(countries_by_iso) > 1

    if __loaded:
        # Additional 3-char codes mapped onto existing entries.
        if "XKX" in countries_by_iso:
            countries_by_iso["XKS"] = countries_by_iso.get("XKX")
        if "SJM" in countries_by_iso:
            countries_by_iso["XSV"] = countries_by_iso.get("SJM")
            countries_by_iso["XJM"] = countries_by_iso.get("SJM")
        if "PSE" in countries_by_iso:
            countries_by_iso["GAZ"] = countries_by_iso.get("PSE")
        if "TLS" in countries_by_iso:
            countries_by_iso["TMP"] = countries_by_iso.get("TLS")

    return countries

Parses the Xponents Core/src/main/resource CSV file country-names-2021.csv (the default when no path is given), producing an array of Country objects. :return: array of Country

def get_us_province(adm1: str):
def get_us_province(adm1: str):
    """
    Look up a US state/territory Place that was registered by load_us_provinces().

    :param adm1: lookup key -- the ADM1 code, the postal code, or either of their HASC paths.
    :return: the matching Place, or None if the key is not registered.
    :raise Exception: if the US state table is still empty (load_us_provinces() was not run).
    """
    if usstates:
        return usstates.get(adm1)
    raise Exception("Run load_us_provinces() first")

:param adm1: ADM1 code or for territories, :return:

def load_us_provinces():
def load_us_provinces():
    """
    Read the packaged 'us-state-metadata.csv', register every state in the module-level
    `usstates` lookup and return the states as a list.

    Each state is registered under four keys: its ADM1 code, its postal code, and the HASC path
    built from each of those two codes.

    NOTE: Place objects for US States have a location (unlike list of world provinces).
    To get location and feature information in full, you must use the SQLITE DB or Xponents Solr.
    :return: array of Place objects
    """
    header = ["POSTAL_CODE", "ADM1_CODE", "STATE", "LAT", "LON", "FIPS_CC", "ISO2_CC"]
    states = []
    with open(pkg_resource_path('us-state-metadata.csv'), 'r', encoding="UTF-8") as fh:
        for rec in get_csv_reader(fh, header):
            postal = rec["POSTAL_CODE"]
            if postal == 'POSTAL_CODE':
                # Header row repeats the column names; not data.
                continue

            country = rec["ISO2_CC"]
            # The first two characters of ADM1_CODE are dropped; the remainder is the state code.
            state_code = rec["ADM1_CODE"][2:]

            # HASC paths for both codings of the state.
            hasc_id = make_HASC(country, state_code)
            postal_hasc = make_HASC(country, postal)

            state = Place(hasc_id, rec["STATE"], lat=rec["LAT"], lon=rec["LON"])
            state.feature_class = "A"
            state.feature_code = "ADM1"
            state.name_type = "N"
            state.geohash = geohash_encode(state.lat, state.lon, precision=6)
            state.country_code = country
            state.adm1 = state_code
            state.adm1_postalcode = postal
            state.source = "OpenSextant"

            # Register by bare code and by HASC path.
            for key in (state_code, postal, hasc_id, postal_hasc):
                usstates[key] = state

            states.append(state)
    return states

Load, store internally and return the LIST of US states. NOTE: Place objects for US States have a location (unlike list of world provinces). To get location and feature information in full, you must use the SQLITE DB or Xponents Solr. :return: array of Place objects

def load_provinces():
def load_provinces():
    """
    Convenience alias for load_world_adm1(): load, store and return the dictionary of ADM1
    boundary names -- provinces, states, republics, etc.

    NOTE: Location information is not included in this province listing.  Just Country, ADM1, Name tuples.
    NOTE: This reflects only GEONAMES ADMIN1 CODES ASCII -- which portrays most of the world (except US) as FIPS,
    not ISO.
    :return:  dict
    """
    provinces = load_world_adm1()
    return provinces

Load, store and return a dictionary of ADM1 boundary names - provinces, states, republics, etc. NOTE: Location information is not included in this province listing. Just Country, ADM1, Name tuples. NOTE: This reflects only GEONAMES ADMIN1 CODES ASCII -- which portrays most of the world (except US) as FIPS, not ISO. :return: dict

def load_world_adm1():
def load_world_adm1():
    """
    Parse the packaged Geonames 'admin1CodesASCII.txt' listing and fill the module-level
    `adm1_by_hasc` dictionary with one Place per ADM1 boundary (province, state, republic, etc.),
    keyed by HASC path.  Coding for ADM1 is FIPS based mostly.

    US states are loaded first from the package's own metadata; when a Geonames US row matches a
    loaded state, that state takes the Geonames name and the row is stored under the HASC path
    built from the loaded state's own country and ADM1 code.
    :return:  dict
    """
    # Load local country data first, if you have it. US is only one so far.
    load_us_provinces()

    source_prefix = "G"
    listing = pkg_resource_path(os.path.join('geonames.org', 'admin1CodesASCII.txt'))
    dot = re.compile(r'\.')
    tab = re.compile('\t')

    with open(listing, 'r', encoding="UTF-8") as fh:
        for line in fh:
            fields = tab.split(line.strip())
            geoname_id = fields[3]
            place_id = f"{source_prefix}{geoname_id}" if geoname_id else None

            province = Place(place_id, fields[1])
            province.feature_class = "A"
            province.feature_code = "ADM1"
            province.name_type = "N"

            # First column looks like "<country>.<adm1>".
            code_parts = dot.split(fields[0], 2)
            province.country_code = code_parts[0]
            province.adm1 = parse_admin_code(code_parts[1])
            province.source = "G"  # Geonames.org coded.

            key = make_HASC(province.country_code, province.adm1)
            if province.country_code == "US":
                province.source = "USGS"
                if key in usstates:
                    state = usstates[key]
                    state.name = province.name
                    # Re-key on the loaded state's own ADM1 coding.
                    key = make_HASC(state.country_code, state.adm1)

            adm1_by_hasc[key] = province
    return adm1_by_hasc

Load, store and return a dictionary of ADM1 boundary names - provinces, states, republics, etc. Coding for ADM1 is FIPS based mostly :return: dict

def get_province(cc, adm1):
def get_province(cc, adm1):
    """
    Fetch a province from the ADM1 table.  REQUIRES you load_provinces() first.

    :param cc: country code
    :param adm1: ADM1 code
    :return: the Place stored under the HASC path for (cc, adm1), or None if absent.
    """
    key = make_HASC(cc, adm1)
    return adm1_by_hasc.get(key)

REQUIRES you load_provinces() first.

def get_country(namecode, standard='ISO'):
def get_country(namecode, standard="ISO"):
    """
    Resolve a Country from a name, an ISO code or a FIPS code.  The caller must say which
    standard the code belongs to, because some codes collide between standards.
    "ZZ" will NOT be returned for the empty code -- if you pass in a NULL or empty
    country code you may have a data quality issue.

    Country data is loaded on first use if it has not been loaded already.

    :param namecode: 2- or 3-alpha code, or a country name when standard is 'name'.
    :param standard: 'ISO' or 'FIPS', 'name'
    :return:  Country object, or None for an empty/non-string input or an unknown code.
    :raise Exception: if `standard` is not one of the three supported values.
    """
    if not namecode or not isinstance(namecode, str):
        return None

    if not __loaded:
        load_countries()

    # Codes are looked up upper case; names are looked up lower case.
    code = namecode.upper()
    if standard == "ISO":
        return countries_by_iso.get(code)
    if standard == "FIPS":
        return countries_by_fips.get(code)
    if standard == "name":
        return countries_by_name.get(namecode.lower())
    raise Exception("That standards body '{}' is not known for code {}".format(standard, namecode))

Get Country object given a name, ISO or FIPS code. For codes, you must be clear about which standard the code is based in. Some code collisions exist. "ZZ" will NOT be returned for the empty code -- if you pass in a NULL or empty country code you may have a data quality issue. :param namecode: 2- or 3-alpha code. :param standard: 'ISO' or 'FIPS', 'name' :return: Country object

def load_major_cities():
def load_major_cities():
    """
    Loads City geo/demographic information from the packaged Geonames 'cities15000.txt' --
    this does not try to parse all name variants.

    This produces Geonames use of FIPS codes.

    Rows that do not have exactly 19 tab-separated columns are ignored, and rows lacking a
    latitude or longitude are reported on stdout and skipped.  A blank or non-numeric population
    column leaves the Place's population fields untouched.
    :return: list of Place objects, one per city row.
    """
    csvpath = pkg_resource_path(os.path.join('geonames.org', 'cities15000.txt'))

    from csv import reader
    cities = []
    with open(csvpath, 'r', encoding="UTF-8") as fh:
        rdr = reader(fh, dialect="excel", delimiter="\t")
        for line in rdr:
            if len(line) != 19:
                continue
            # Both coordinates are parsed with float() below, so both must be present.
            if not line[4] or not line[5]:
                print("Not location info for City ~ ", line[0])
                continue
            #          ID       NAME     LAT                 LON
            pl = Place(line[0], line[1], lat=float(line[4]), lon=float(line[5]))
            pl.feature_class = line[6]
            pl.feature_code = line[7]
            pl.country_code = line[8]
            alt_cc = line[9]
            if alt_cc and alt_cc != pl.country_code:
                print("Alternate Country Code", alt_cc)
            pl.adm1 = parse_admin_code(line[10])
            pl.adm2 = line[11]
            try:
                pl.population = int(line[14])
            except ValueError:
                # Population is optional: tolerate a blank/non-numeric column, but nothing else.
                pass
            else:
                pl.population_scale = popscale(pl.population, feature="city")
            cities.append(pl)
    return cities

Loads City geo/demographic information -- this does not try to parse all name variants.

This produces Geonames use of FIPS codes. :return:

def popscale(population, feature='city'):
def popscale(population, feature="city"):
    """
    Approximate the size of a feature from its population, nominally on a 10 point scale.

    The index is log2(population) minus a per-feature offset taken from the module-level
    `_pop_scale` table (20 is used when the feature is not listed); it is truncated to an int and
    floored at zero.

    Approximations for 10 points:
    Largest city is ~15 million
    // Few cities top 30 million, e.g., 2^25.  popscale = 25 - 13 = 12.
    Largest province is ~135 million

    :param population: population count
    :param feature:  city, district, or province allowed.
    :return: index on 0..10 scale (very large populations can exceed 10, per the example above).
    """
    if population < 1:
        return 0
    offset = _pop_scale.get(feature, 20)
    magnitude = mathlog(population, 2) - offset
    if magnitude > 0:
        return int(magnitude)
    return 0

Given a population in context of the feature -- provide a approximation of the size of the feature on a 10 point scale.

Approximations for 10 points: Largest city is ~15 million // Few cities top 30 million, e.g., 2^25. popscale = 25 - 13 = 12. Largest province is ~135 million

:param population: :param feature: city, district, or province allowed. :return: index on 0..10 scale.

def is_political(feat_code: str):
def is_political(feat_code: str):
    """
    Test a feature code: True when it begins with "PCL".

    :param feat_code: feature designation code; may be None or empty.
    :return: bool
    """
    return bool(feat_code) and feat_code.startswith("PCL")

Test a feature code

def is_country(feat_code: str):
def is_country(feat_code: str):
    """
    Test a feature code: True only when it is exactly "PCLI".

    :param feat_code: feature designation code; may be None.
    :return: bool
    """
    return feat_code == "PCLI"

Test a feature code

def is_academic(feat_class: str, feat_code: str) -> bool:
def is_academic(feat_class: str, feat_code: str) -> bool:
    """
    Test whether a feature is a school-type site: class "S" with a designation starting "SCH".

    Always returns a real bool; missing (None or empty) arguments yield False rather than
    leaking the falsy argument itself back to the caller.

    :param feat_class: geonames class
    :param feat_code:  geonames designation code
    :return: True for class "S" with a code beginning "SCH", otherwise False.
    """
    if not feat_class or not feat_code:
        return False
    return feat_class == "S" and feat_code.startswith("SCH")

:param feat_class: geonames class :param feat_code: geonames designation code :return:

def characterize_location(place: Place, label: str):
def characterize_location(place: Place, label: str):
    """
    Experimental: Not comprehensive characterization. This is intended to summarize PlaceCandidates
    extracted from text.

    Describe a Place in terms of a plain language feature type and the geographic scope or resolution.
    E.g, Place object "P/PPL", "city"
    E.g,  Place object "A/ADM4"  "admin"
    E.g,  Place object "S/COORD", "site"

    The resolution is the label itself, except that "coord" always maps to "site" and "place" is
    refined by the place's feature class when that class is known.

    :param place: Place object
    :param label:  text match label, e.g., 'country', 'place', 'coord', etc.
    :return: feature string, resolution string
    """
    scope_by_class = {
        "A": "admin",
        "P": "city",
        "S": "site",
        "H": "water",
        "R": "path",
        "V": "area",
        "T": "area",
        "L": "area"
    }
    feature_class = place.feature_class

    # Note label should be a limited set -- country, postal, coord, place.
    if label == "coord":
        scope = "site"
    elif label == "place":
        scope = scope_by_class.get(feature_class, label)
    else:
        scope = label

    return place.format_feature(), scope

Experimental: Not comprehensive characterization. This is intended to summarize PlaceCandidates extracted from text.

Describe a Place in terms of a plain language feature type and the geographic scope or resolution. E.g, Place object "P/PPL", "city" E.g, Place object "A/ADM4" "admin" E.g, Place object "S/COORD", "site"

:param place: Place object :param label: text match label, e.g., 'country', 'place', 'coord', etc. :return: feature string, resolution string

class TextEntity:
class TextEntity:
    """
    A Text span.

    classes and routines that align with Java org.opensextant.data and org.opensextant.extraction

    * TextEntity: represents a span of text
    * TextMatch: a TextEntity matched by a particular routine.  This is the basis for most all
    extractors and annotators in OpenSextant.
    """

    def __init__(self, text, start, end):
        """
        :param text: the text of the span
        :param start: start offset; may be None if unknown
        :param end: end offset; may be None if unknown
        """
        self.text = text
        self.start = start
        self.end = end
        # Length stays at -1 until both offsets are known and non-negative.
        self.len = -1
        self.is_duplicate = False
        self.is_overlap = False
        self.is_submatch = False
        if self._is_valid():
            self.len = self.end - self.start

    def __str__(self):
        return f"{self.text}({self.start},{self.end})"

    def _is_valid(self):
        """ True when both offsets are set and non-negative. """
        if self.start is None or self.end is None:
            return False
        return self.start >= 0 and self.end >= 0

    def contains(self, x1):
        """ if this span contains x1, where x1 is either another span or a plain integer offset.

        A span is contained when it is non-empty and lies within [start, end] of this span.
        An integer offset is contained when start <= offset < end.
        :param x1: TextEntity-like object with start/end, or an int offset
        :return: bool; always False when this span has no valid offsets.
        """
        if not self._is_valid():
            return False
        if isinstance(x1, int):
            return self.start <= x1 < self.end
        return self.start <= x1.start < x1.end <= self.end

    def exact_match(self, t):
        """ True when t has the same, valid, start and end offsets as this span. """
        return t.start == self.start and t.end == self.end and self._is_valid()

    def is_within(self, t):
        """
        if the given annotation, t, contains this
        :param t: the span that may enclose this one
        :return: bool
        """
        # Delegate to span containment; probing the two offsets separately would wrongly
        # reject a span that shares its end offset with t.
        return t.contains(self)

    def is_after(self, t):
        return self.start > t.end

    def is_before(self, t):
        return self.end < t.start

    def overlaps(self, t):
        """
        Determine if t overlaps self.  If Right or Left match, t overlaps if it is longer.
        If t is contained entirely within self, then it is not considered overlap -- it is Contained within.
        :param t: the other span
        :return: bool
        """
        #    a1     a2
        #  t1     t2        RIGHT skew
        #    a1     a2
        #       t1     t2   LEFT skew
        #
        #   a1  a2
        #   t1      t2  RIGHT match
        # t1    t2      LEFT match
        #   a1  a2
        #       t1   t2  minimal OVERLAP
        skew_right = t.start < self.start <= t.end < self.end
        skew_left = self.start < t.start <= self.end < t.end
        if skew_right or skew_left:
            return True
        # Shared edge: t counts as overlapping only when it extends beyond self on the other side.
        right_match = self.start == t.start and t.end > self.end
        left_match = self.end == t.end and t.start < self.start
        return right_match or left_match

A Text span.

classes and routines that align with Java org.opensextant.data and org.opensextant.extraction

  • TextEntity: represents a span of text
  • TextMatch: a TextEntity matched by a particular routine. This is the basis for almost all extractors and annotators in OpenSextant.
def contains(self, x1):
843    def contains(self, x1):
844        """ if this span contains an offset x1
845        :param x1:
846        """
847        if self.start < 0 or self.end < 0:
848            return False
849        return self.start <= x1.start < x1.end <= self.end

if this span contains an offset x1 :param x1:

def is_within(self, t):
854    def is_within(self, t):
855        """
856        if the given annotation, t, contains this
857        :param t:
858        :return:
859        """
860        return t.contains(self.start) and t.contains(self.end)

if the given annotation, t, contains this :param t: :return:

def overlaps(self, t):
868    def overlaps(self, t):
869        """
870        Determine if t overlaps self.  If Right or Left match, t overlaps if it is longer.
871        If t is contained entirely within self, then it is not considered overlap -- it is Contained within.
872        :param t:
873        :return:
874        """
875        #    a1     a2
876        #  t1     t2        RIGHT skew
877        #    a1     a2
878        #       t1     t2   LEFT skew
879        #
880        #   a1  a2
881        #   t1      t2  RIGHT match
882        # t1    t2      LEFT match
883        #   a1  a2
884        #       t1   t2  minimal OVERLAP
885        skew_right = t.start < self.start <= t.end < self.end
886        skew_left = self.start < t.start <= self.end < t.end
887        left_match = self.end == t.end
888        right_match = self.start == t.start
889        if skew_right or skew_left:
890            return True
891        return (right_match and skew_left) or (left_match and skew_right)

Determine if t overlaps self. If Right or Left match, t overlaps if it is longer. If t is contained entirely within self, then it is not considered overlap -- it is Contained within. :param t: :return:

class TextMatch(TextEntity):
class TextMatch(TextEntity):
    """
    An entity matched by some tagger; it is a text span with lots of metadata.
    """

    def __init__(self, *args, label=None):
        """
        :param args: positional arguments passed through to TextEntity (text, start, end)
        :param label: the match type label
        """
        TextEntity.__init__(self, *args)
        self.id = None
        self.label = label
        self.filtered_out = False
        self.attrs = dict()

    def __str__(self):
        return f"{self.label}/{self.text}({self.start},{self.end})"

    def populate(self, attrs: dict):
        """
        Populate a TextMatch to normalize the set of attributes -- separate class fields on TextMatch from additional
        optional attributes.

        The end offset is derived from start + length only when it is missing and both the start
        offset and the length are known, non-negative values.
        :param attrs: dict of standard Xponents API outputs.
        :return:
        """
        self.id = attrs.get("match-id")
        self.label = attrs.get("type")
        self.attrs.update(attrs)
        self.filtered_out = get_bool(self.attrs.get("filtered-out"))
        for k in ['len', 'length']:
            if k in self.attrs:
                self.len = self.attrs.get(k)

        # Guard against an unset start (None) and against the -1 "unknown length" sentinel,
        # either of which would otherwise raise or yield an end offset before the start.
        if (not self.end
                and self.start is not None and self.start >= 0
                and self.len is not None and self.len >= 0):
            self.end = self.start + self.len

        # Remove attribute keys that may be confusing.
        for fld in ['offset', 'start', 'end', 'len', 'length', 'type', 'filtered-out', 'text', 'matchtext']:
            self.attrs.pop(fld, None)

    def normalize(self):
        """
        Optional, but recommended routine to normalize the matched data.
        That is, parse fields, uppercase, streamline punctuation, etc.
        As well, given such normalization result, this is the opportunity to additionally
        validate the match.
        :return:
        """
        pass

An entity matched by some tagger; it is a text span with lots of metadata.

def populate(self, attrs: dict):
909    def populate(self, attrs: dict):
910        """
911        Populate a TextMatch to normalize the set of attributes -- separate class fields on TextMatch from additional
912        optional attributes.
913        :param attrs: dict of standard Xponents API outputs.
914        :return:
915        """
916        self.id = attrs.get("match-id")
917        self.label = attrs.get("type")
918        self.attrs.update(attrs)
919        self.filtered_out = get_bool(self.attrs.get("filtered-out"))
920        for k in ['len', 'length']:
921            if k in self.attrs:
922                self.len = self.attrs.get(k)
923        if self.len is not None and self.start >= 0 and not self.end:
924            self.end = self.start + self.len
925
926        # Remove attribute keys that may be confusing.
927        for fld in ['offset', 'start', 'end', 'len', 'length', 'type', 'filtered-out', 'text', 'matchtext']:
928            if fld in self.attrs:
929                del self.attrs[fld]

Populate a TextMatch to normalize the set of attributes -- separate class fields on TextMatch from additional optional attributes. :param attrs: dict of standard Xponents API outputs. :return:

def normalize(self):
931    def normalize(self):
932        """
933        Optional, but recommended routine to normalize the matched data.
934        That is, parse fields, uppercase, streamline punctuation, etc.
935        As well, given such normalization result, this is the opportunity to additionally
936        validate the match.
937        :return:
938        """
939        pass

Optional, but recommended routine to normalize the matched data. That is, parse fields, uppercase, streamline punctuation, etc. As well, given such normalization result, this is the opportunity to additionally validate the match. :return:

class PlaceCandidate(TextMatch):
class PlaceCandidate(TextMatch):
    """
    A TextMatch for any geographic mention.  The chosen location and its additional attributes
    are carried by a Place object in `self.place`.
    See also in Java org.opensextant.extractors.geo.PlaceCandidate class, which is
    a more in-depth version of this.  This Python class represents the
    response from the REST API, for example.
    """

    def __init__(self, *args, **kwargs):
        TextMatch.__init__(self, *args, **kwargs)
        self.confidence = 0
        self.rules = []
        self.is_country = False
        self.place = None
        # Location certainty is a simple measure 0.0 to 1.0 to convey confidence + precision in one metric
        self.location_certainty = -1

    def populate(self, attrs: dict):
        """
        Deserialize the attributes dict from either TextMatch schema or Place schema
        :param attrs: dict of match and geo attributes
        :return:
        """
        TextMatch.populate(self, attrs)
        geo = Place(None, attrs.get("name"), lat=attrs.get("lat"), lon=attrs.get("lon"))
        if not geo.name:
            # No gazetteer name supplied; fall back to the matched text.
            geo.name = self.text

        # attribute / schema does not align 100% here:  (Place field, attrs key)
        for field, key in (
                ("country_code", "cc"),
                ("adm1", "adm1"),
                ("precision", "prec"),
                ("feature_class", "feat_class"),
                ("feature_code", "feat_code"),
                ("adm1_name", "province-name"),
                ("geohash", "geohash"),
                ("method", "method"),
        ):
            setattr(geo, field, attrs.get(key))

        # Combined match + geo-location confidence:
        self.confidence = attrs.get("confidence")
        if "rules" in attrs:
            # One or more geo-inferencing rules, semicolon separated.
            self.rules = attrs["rules"].split(";")

        self.is_country = self.label == "country" or is_country(geo.feature_code)
        if self.is_country:
            # Zero out country location; Let user derive country from metadata.
            geo.lat = None
            geo.lon = None
        self.place = geo
        self.location_certainty = location_accuracy(self.confidence, geo.precision)

A TextMatch representing any geographic mention -- a Place object will represent the additional attributes for the chosen place. see also in Java org.opensextant.extractors.geo.PlaceCandidate class, which is a more in-depth version of this. This Python class represents the response from the REST API, for example.

def populate(self, attrs: dict):
961    def populate(self, attrs: dict):
962        """
963        Deserialize the attributes dict from either TextMatch schema or Place schema
964        :param attrs:
965        :return:
966        """
967        TextMatch.populate(self, attrs)
968        geo = Place(None, attrs.get("name"), lat=attrs.get("lat"), lon=attrs.get("lon"))
969        if not geo.name:
970            geo.name = self.text
971
972        # attribute / schema does not align 100% here.
973        geo.country_code = attrs.get("cc")
974        geo.adm1 = attrs.get("adm1")
975        geo.precision = attrs.get("prec")
976        geo.feature_class = attrs.get("feat_class")
977        geo.feature_code = attrs.get("feat_code")
978        geo.adm1_name = attrs.get("province-name")
979        geo.geohash = attrs.get("geohash")
980        geo.method = attrs.get("method")
981
982        # Combined match + geo-location confidence:
983        self.confidence = attrs.get("confidence")
984        if "rules" in attrs:
985            # One or more geo-inferencing rules
986            self.rules = attrs["rules"].split(";")
987
988        self.is_country = self.label == "country" or is_country(geo.feature_code)
989        if self.is_country:
990            # Zero out country location; Let user derive country from metadata.
991            geo.lat = None
992            geo.lon = None
993        self.place = geo
994        # Items like coordinates and cities, etc receive a location certainty.  Countries do not.
995        self.location_certainty = location_accuracy(self.confidence, geo.precision)

Deserialize the attributes dict from either TextMatch schema or Place schema :param attrs: :return:

class Extractor(abc.ABC):
 998class Extractor(ABC):
 999    def __init__(self):
1000        self.id = None
1001
1002    @abstractmethod
1003    def extract(self, text, **kwargs):
1004        """
1005
1006        :param text: Unicode text input
1007        :keyword features: an array of features to extract, e.g., "coordinate", "place", "MONEY"
1008        :return: array of TextMatch
1009        """
1010        pass

Helper class that provides a standard way to create an ABC using inheritance.

@abstractmethod
def extract(self, text, **kwargs):
1002    @abstractmethod
1003    def extract(self, text, **kwargs):
1004        """
1005
1006        :param text: Unicode text input
1007        :keyword features: an array of features to extract, e.g., "coordinate", "place", "MONEY"
1008        :return: array of TextMatch
1009        """
1010        pass

:param text: Unicode text input :keyword features: an array of features to extract, e.g., "coordinate", "place", "MONEY" :return: array of TextMatch

def render_match(m):
def render_match(m):
    """
    Serialize the basic fields of a match as a plain dictionary.

    :param m: TextMatch
    :return: dict with type, text, offset, length and filtered-out; None if m is not a TextMatch.
    """
    if isinstance(m, TextMatch):
        return {
            "type": m.label,
            "text": m.text,
            "offset": m.start,
            "length": m.len,
            "filtered-out": m.filtered_out
        }
    return None

:param m: TextMatch :return: dict

def reduce_matches(matches):
def reduce_matches(matches):
    """
    Flag, in place, every match that is a submatch, an overlap or an exact duplicate of another
    match in the list.  Matches marked filtered_out are ignored on both sides of a comparison.
    :param matches: array of TextMatch (or TextEntity). This is the more object oriented version
    of reduce_matches_dict
    :return:
    """
    total = len(matches)
    if total < 2:
        return

    for i, current in enumerate(matches):
        if current.filtered_out:
            continue
        lo, hi = current.start, current.end

        # Each pair is compared once; `current` must be checked against every later match,
        # so there is no early exit on the first hit.
        for j in range(i + 1, total):
            other = matches[j]
            if other.filtered_out:
                continue

            o_lo, o_hi = other.start, other.end

            # Disjoint spans: current lies entirely before or entirely after other.
            if hi < o_lo or lo > o_hi:
                continue

            if o_lo == lo and o_hi == hi:
                # Exact duplicate -- the later one is marked, as current is first in the array.
                other.is_duplicate = True
            elif o_lo <= lo < hi <= o_hi:
                # current sits inside other
                current.is_submatch = True
            elif lo <= o_lo < o_hi <= hi:
                # other sits inside current
                other.is_submatch = True
            elif lo <= o_hi <= hi or o_lo <= hi <= o_hi:
                # Partial overlap, e.g.
                #  n1    n2
                #     m1    m2
                current.is_overlap = True
                other.is_overlap = True

Mark each match if it is a submatch or overlap or exact duplicate of other. :param matches: array of TextMatch (or TextEntity). This is the more object oriented version of reduce_matches_dict :return:

def reduce_matches_dict(matches):
def reduce_matches_dict(matches):
    """
    Dictionary flavour of reduce_matches: takes an array of annotations (dicts with 'start' and
    'end') and writes a "submatch" entry into any dict that is an exact duplicate of an earlier
    one or is wholly contained by another (if A wholly contains B, B is the submatch).
    Each pair of items is compared exactly once.

    Filtered-out matches are not checked in this version.

    :param matches: array of dicts.
    """
    if len(matches) < 2:
        return

    for i, current in enumerate(matches):
        lo = current['start']
        hi = current['end']

        for other in matches[i + 1:]:
            o_lo = other['start']
            o_hi = other['end']

            # current entirely before, or entirely after, other: nothing to mark.
            if hi < o_lo or lo > o_hi:
                continue

            if o_lo == lo and o_hi == hi:
                other['submatch'] = IS_DUPLICATE
            elif o_lo <= lo < hi <= o_hi:
                # current is inside other; keep scanning, as later items may lie inside current.
                current['submatch'] = IS_SUBMATCH
            elif lo <= o_lo < o_hi <= hi:
                # other is inside current; more items may be too, so do not stop here.
                other['submatch'] = IS_SUBMATCH
    return

Accepts an array of annotations (dicts). Inserts the "submatch" flag in a dict if it is a submatch (that is, if a TextEntity A wholly contains another, B -- B is a submatch). We just have to loop through half of the array ~ comparing each item to each other item once.

:param matches: array of dicts.

class Language:
class Language:
    """
    Language Represents a single code/name pair
    Coding is 3-char or 2-char, either is optional.
    In some situations there are competing 2-char codes in code books, such as Lib of Congress (LOC)
    """

    def __init__(self, iso3, iso2, nmlist: list, locale=None):
        """
        :param iso3: 3-char language code
        :param iso2: 2-char language code
        :param nmlist: list of names for the language; the first one is the default name.
        :param locale: optional locale ID
        :raise Exception: if a non-empty nmlist is given that is not a list.
        """
        self.code_iso3 = iso3
        self.code = iso2
        self.names = nmlist
        self.locale = locale
        if nmlist and not isinstance(nmlist, list):
            raise Exception("Name list is a list of names for the language. The first one is the default.")

    def get_name(self):
        """ Default (first) name of the language, or None if it has no names. """
        return self.names[0] if self.names else None

    def name_code(self):
        """ Lower-cased default name, or None if the language has no names. """
        return self.names[0].lower() if self.names else None

    def __str__(self):
        return f"{self.name_code()}({self.code})"

Language represents a single code/name pair. Coding is 3-char or 2-char; either is optional. In some situations there are competing 2-char codes in code books, such as Lib of Congress (LOC)

def list_languages():
def list_languages():
    """
    List out a flattened list of languages, de-duplicated by language code.

    The language map holds the same Language under several keys; a Language is listed once, the
    first time one of its codes (2-char or 3-char) has not been seen yet.  It is not listed a
    second time merely because it carries both kinds of code.

    TODO: alternatively list out every language
    :return: list of Language objects
    """
    load_languages()
    langs = []
    visited = set()
    for lang in language_map.values():
        codes = [c for c in (lang.code, lang.code_iso3) if c]
        # A single append per entry, even when both of its codes are new.
        if any(c not in visited for c in codes):
            langs.append(lang)
        visited.update(codes)
    return langs

List out a flattened list of languages, de-duplicated by ISO2 language ID.

TODO: alternatively list out every language :return:

def add_language(lg: Language, override=False):
def add_language(lg: Language, override=False):
    """
    Register a language in the language map under each of its codes, its locale and its names
    (all lower-cased).

    The map entries for ISO 2-alpha and 3-alpha codes should be protected from language IDs
    that are a dialect or locale:

    "en" ==> en-au, en-gb, en-us, etc.?  This is ambiguous.
    The reverse is true -- "en-gb" is at least "en" or "eng" english

    :param lg: Language to add; nothing happens if it is empty
    :param override: if True, an existing key longer than 3 chars is remapped to this language.
        Forced to True when any name contains "modern".
    :return:
    :raise Exception: if a key longer than 3 chars is already mapped and override is not set
    """
    if not lg:
        return

    codes = [ident.lower() for ident in (lg.code, lg.code_iso3, lg.locale) if ident]

    for nm in lg.names or []:
        lang_name_norm = nm.lower()
        codes.append(lang_name_norm)
        if "modern" in lang_name_norm:
            # Also index the name by its first comma-separated part, and allow remapping.
            codes.append(get_list(lang_name_norm, delim=",")[0])
            override = True

    for key in set(codes):
        if key in language_map:
            # coding rule: 2 or 3 char alpha codes for ISO or Biblio code books are not overriden.
            if len(key) <= 3:
                continue
            if not override:
                raise Exception(f"Forcibly remap language code? {key}")

        language_map[key] = lg

The language map for ISO 2-alpha and 3-alpha codes should be protected from language IDs that are dialect or locale

"en" ==> en-au, en-gb, en-us, etc.? This is ambiguous The reverse is true -- "en-gb" is at least "en" or "eng" english

:param lg: :param override: :return:

def get_language(code: str) -> Language:
def get_language(code: str) -> Language:
    """
    Look up a language in the language map, case-insensitively.

    :param code: language ID or name
    :return: Language or None
    """
    if not code:
        return None

    load_languages()
    key = code.lower()

    # Most cases: a short code or a purely alphabetic ID/name is looked up as-is.
    if len(key) <= 3 or key.isalpha():
        return language_map.get(key)

    # Code is odd, like a locale?  "EN_GB" or en-gb, etc.  Try it whole first.
    if key in language_map:
        return language_map.get(key)

    # Then trim at each delimiter in turn; each trim applies to the result of the previous one.
    for separator in ("-", " ", "_"):
        key = key.split(separator)[0]
        if key in language_map:
            return language_map.get(key)
    return None

:param code: language ID or name :return: Language or None

def is_lang_romance(lg: str):
def is_lang_romance(lg: str):
    """
    True if the language is Spanish, Portuguese, Italian, French or Romanian.

    :param lg: language ID or name, resolved with get_language()
    :return: False when the language cannot be resolved
    """
    lang = get_language(lg)
    return lang.code in {"es", "pt", "it", "fr", "ro"} if lang else False

True if the language is Spanish, Portuguese, Italian, French, or Romanian.

def is_lang_euro(lg: str):
def is_lang_euro(lg: str):
    """
    True if the language is European -- Romance, German, English, etc.

    The test is made on the 2-char code of the language resolved by get_language().

    :param lg: language ID or name
    :return: True if the resolved language's 2-char code is in the European set;
        False otherwise, or when the language cannot be resolved
    """
    L = get_language(lg)
    if not L:
        return False
    return L.code in {
        "es", "pt", "it", "fr", "ro",
        "de", "en",
        # ISO 639-1 codes for Bulgarian, Czech and Polish.
        "bg", "cs", "pl",
        # Earlier, non-ISO spellings; retained so existing behavior is not narrowed.
        "bu", "cz", "po",
        "nl", "el", "sq",
    }

True if the language is European -- Romance, German, English, etc. :param lg: :return: