opensextant.gazetteer

import json
import os
import sqlite3
from traceback import format_exc
from typing import Optional

import arrow
import pysolr
from opensextant import Place, Country, distance_haversine, load_major_cities, make_HASC, popscale, \
    geohash_cells_radially, bbox, point2geohash, geohash2point, pkg_resource_path
from opensextant.utility import ensure_dirs, is_ascii, has_cjk, has_arabic, \
    ConfigUtility, get_bool, trivial_bias, replace_diacritics, strip_quotes, parse_float, load_list
from opensextant.wordstats import WordStats
  13
  14DEFAULT_SOLR_URL="127.0.0.1:7000"
  15DEFAULT_MASTER = "master_gazetteer.sqlite"
  16DEFAULT_COUNTRY_ID_BIAS = 49
  17DEFAULT_WORDSTATS = "wordstats.sqlite"
  18
  19GAZETTEER_SOURCE_ID = {
  20    "ISO",  # ISO-3166 metadata
  21    "N",  # NGA
  22    "NF",  # NGA fixed
  23    "U",  # USGS
  24    "UF",  # USGS fixed
  25    "OA",  # OpenSextant Adhoc
  26    "OG",  # OpenSextant geonames.org derived
  27    "G",  # Geonames.org
  28    "GP"  # Geonames.org Postal
  29    "X",  # Xponents
  30    "NE"  # Natural Earth
  31}
  32
# Map of long/short source names to the compact source IDs stored on each record.
# Applied in DB._prep_place() before insert; values align with GAZETTEER_SOURCE_ID.
GAZETTEER_SOURCES = {
    "NGA": "N",
    "USGS": "U",
    "USGS-AUTOFIXED": "UF",
    "NGA-AUTOFIXED": "NF",
    "ADHOC": "OA",  # OpenSextant Adhoc
    "NE": "NE",  # Natural Earth.
    "GEONAMES": "OG",  # OpenSextant geonames
    "Geonames.org": "OG",  # OpenSextant geonames
    "XPONENTS": "X",  # Xponents Adhoc or generated
    "XpGen": "X",
    "XP": "X",
    "GP": "GP",  # Geonames Postal
    "G": "G"
}
  48
# Scripts, not languages per se.
# Maps an upper-case Unicode script name to a short internal code; None maps to ""
# so an absent script detection result is safe to look up.
SCRIPT_CODES = {
    None: "",
    "LATIN": "L",
    "HAN": "H",
    "COMMON": "C",
    "ARABIC": "A",
    "ARMENIAN": "AM",
    "BENGALI": "BN",
    "CYRILLIC": "CY",
    "DEVANAGARI": "DV",
    "ETHIOPIC": "ET",
    "GEORGIAN": "GE",
    "GREEK": "GK",
    "GURMUKHI": "GM",
    "GUJARATI": "GU",
    "HEBREW": "HE",
    "HANGUL": "HG",
    "HIRAGANA": "HI",
    "KANNADA": "KN",
    "KATAKANA": "KA",
    "KHMER": "KM",
    "MALAYALAM": "MY",
    "SINHALA": "SI",
    "TAMIL": "TA",
    "THAI": "TH"
}
  76
  77# FIPS code to ISO CC
  78# Extend to other territory codes
  79US_TERRITORY_MAP = {
  80    "FIPS": {
  81        "AQ": "AS",
  82        "GQ": "GU",
  83        "CQ": "MP",
  84        "RQ": "PR",
  85        "VI": "VI",
  86        "FQ": "UM",
  87        "DQ": "UM",
  88        "HQ": "UM",
  89        "JQ": "UM",
  90        "WQ": "UM",
  91        "MQ": "UM"
  92    },
  93    "ISO": {
  94        # Reverse is not true for all cases:  ISO to FIPS
  95        # "UM": "UM",
  96        "PR": "RQ",
  97        "MP": "CQ",
  98        "GU": "CQ",
  99        "AS": "AQ"
 100    }
 101}
 102
 103# IGNROE Historical names and Zones, and Unknowns *H
 104MAJOR_ADMIN_CODES = {'ADM1', 'ADMD', 'ADM2', 'ADM3', 'ADM4', 'PRSH', 'TERR'}
 105
 106
 107def coord_grid(geo: dict) -> str:
 108    """
 109    A less dissatisfying grid than geohash. Its just returning Y,X in low resolution. LLL.l,LLL.l
 110    """
 111    if "lat" not in geo:
 112        return None
 113    x, y = geo["lon"], geo["lat"]
 114    return f"{y:0.1f},{x:0.1f}"
 115
 116
 117def get_default_db():
 118    return os.path.join(".", "tmp", DEFAULT_MASTER)
 119
 120
 121def get_default_wordstats():
 122    return os.path.join(".", "tmp", DEFAULT_WORDSTATS)
 123
 124
 125def load_stopterms(project_dir=".", lower=True):
 126    """
 127    Load default stop terms from source tree for project build.
 128    :param project_dir: The location of Xponents/solr source tree.
 129    :param lower: default case to load data as. If not lower, then terms are loaded as-is
 130    :return:
 131    """
 132    loader = ConfigUtility()
 133    stopterms = set([])
 134    for f in ["etc/gazetteer/filters/non-placenames.csv",
 135              "etc/gazetteer/filters/non-placenames,spa.csv",  # SPANISH
 136              "etc/gazetteer/filters/non-placenames,rus,ukr.csv",  # Cyrillic languages
 137              "etc/gazetteer/filters/non-placenames,deu.csv",  # GERMAN
 138              "etc/gazetteer/filters/non-placenames,acronym.csv"]:
 139        terms = loader.loadDataFromFile(os.path.join(project_dir, f), ",")
 140        for t in terms:
 141            if lower:
 142                stopterms.add(t[0].lower())
 143            else:
 144                stopterms.add(t[0])
 145    return stopterms
 146
 147
 148def run_lookup(url, lookup, parse):
 149    """ Gazetteer demo mimics some of the logic in XponentsGazetteerQuery
 150        try "San Francisco, CA, US"
 151    """
 152
 153    solr_gaz = pysolr.Solr(url)
 154    # specific unit tests
 155
 156    records = None
 157    places = []
 158    if parse:
 159        # See other Java demo, XponentsGazetteerQuery
 160        # assuming NAME, PROV, COUNTRY
 161        slots = [a.strip() for a in lookup.split(',')]
 162
 163        if len(slots) < 3:
 164            print("NAME, PROV, CC  is required format for --lookup")
 165            return None
 166
 167        cityVal = slots[0]
 168        provVal = slots[1]
 169        countryVal = slots[2]
 170
 171        # Find best match for Province. Pass ADM1 code to next query
 172        query = 'name:"{}" AND feat_class:A AND cc:{}'.format(provVal, countryVal)
 173        records = solr_gaz.search(query, **{"rows": 100})
 174
 175        if not records:
 176            return None
 177
 178        # Use a Place object to abstract things.
 179        adm1 = as_place(records.docs[0])
 180        # Find best match for the tuple NAME/PROV/COUNTRY
 181        #
 182        query = 'name:"{}" AND feat_class:A AND cc:{} AND adm1:{}'.format(cityVal, countryVal, adm1.adm1)
 183        records = solr_gaz.search(query, **{"rows": 1000})
 184    else:
 185        query = 'name:"{}" AND feat_class:P'.format(lookup)
 186        records = solr_gaz.search(query, **{"rows": 1000})
 187
 188    if not records:
 189        return None
 190
 191    for r in records:
 192        places.append(as_place(r))
 193
 194    return places
 195
 196
# Template of the fields stored per gazetteer entry; mirrors the Solr index schema
# and the SQLite `placenames` columns (see as_place_record / DB.create).
GAZETTEER_TEMPLATE = {
    'id': -1,
    'place_id': -1,
    'name': None,
    # name_ar or name_cjk are filled in only if name is Arabic or CJK name group
    'lat': 0, 'lon': 0,
    # geo is the field to use for index.  lat/lon  are used for database.
    'feat_class': None, 'feat_code': None,
    'FIPS_cc': None, 'cc': None,
    'adm1': None, 'adm2': None,
    'source': None,
    # 'script': None,
    'name_bias': 0,
    'id_bias': 0,
    'name_type': "N",
    'search_only': False
}
 214
 215
 216def normalize_name(nm: str):
 217    """
 218    convenience method that ensures we have some consistency on normalization of name
 219    :param nm:
 220    :return:
 221    """
 222    return nm.replace("\u2019", "'").replace("\xa0", " ").strip().strip("'")
 223
 224
 225def name_group_for(nm: str):
 226    """
 227    Determine the major language "name group" for the input
 228    :param nm: name or any text
 229    :return:
 230    """
 231    if has_cjk(nm):
 232        return "cjk"
 233    elif has_arabic(nm):
 234        return "ar"
 235    return ""
 236
 237
 238def as_admin_place(r):
 239    """
 240    Convert dict to a Place object
 241    :param r: gazetteer row from Solr or SQlite.
 242    :return: Place
 243    """
 244    keys = {}
 245    if hasattr(r, "keys"):
 246        keys = r.keys()
 247
 248    p = Place(r['place_id'], r['name'])
 249    p.country_code = r["cc"]
 250    p.adm1 = r["adm1"]
 251    p.source = r["source"]
 252    p.geohash = r["geohash"]
 253    if "adm1_iso" in keys:
 254        p.adm1_iso = r["adm1_iso"]
 255
 256    p.lat = p.lon = p.X = p.Y = None
 257    return p
 258
 259
def as_place(r, source="index"):
    """
    Convert dict to a Place object
    :param source: db or index (solr)
    :param r: gazetteer row from Solr or SQlite.
    :return: Place
    """
    # dict and sqlite3.Row both expose keys(); other objects yield no optional fields.
    keys = {}
    if hasattr(r, "keys"):
        keys = r.keys()

    lat, lon = 0, 0
    # NOTE(review): on a sqlite3.Row, `"geo" in r` iterates column VALUES (Row has
    # no name-based __contains__).  This works only because DB rows carry lat/lon
    # columns and Solr docs (plain dicts) carry a "geo" key -- confirm.
    if "geo" in r:
        (lat, lon) = r['geo'].split(',')
    else:
        lat, lon = r["lat"], r["lon"]

    p = Place(r['place_id'], r['name'], lat=lat, lon=lon)
    p.country_code = r["cc"]
    p.feature_class = r["feat_class"]
    p.feature_code = r["feat_code"]
    if "id" in r:
        # Required if coming or going into a database:
        # NOTE(review): duplicates the `"id" in keys` assignment below; same
        # sqlite3.Row membership caveat as above applies here.
        p.id = r["id"]
    p.id_bias = r["id_bias"]
    if source == "db":
        # name_bias is stored only in the database, not the index.
        p.name_bias = r["name_bias"]

    # optional fields:
    if "FIPS_cc" in keys:
        p.country_code_fips = r["FIPS_cc"]
    if "adm1" in keys:
        p.adm1 = r["adm1"]
    if "adm2" in keys:
        p.adm2 = r["adm2"]
    if "geohash" in keys:
        p.geohash = r["geohash"]
    if "id" in keys:
        p.id = r["id"]
    if "source" in keys:
        p.source = r["source"]
    if "name_group" in keys:
        p.name_group = r["name_group"]
    if "search_only" in keys:
        p.search_only = get_bool(r["search_only"])
    if "name_type" in keys:
        p.name_type = r["name_type"]

    p.is_ascii = is_ascii(p.name)
    return p
 310
 311
 312def as_place_record(place, target="index"):
 313    """
 314    Given a Place object, serialize it as a dict consistent with the Solr index schema.
 315    :param place:
 316    :param target: index or db
 317    :return:
 318    """
 319    if not isinstance(place, Place):
 320        return None
 321    # Copy defaults offers nothing.
 322    # rec = copy(GAZETTEER_TEMPLATE)
 323    rec = {
 324        "id": place.id,
 325        "place_id": place.place_id,
 326        "name": place.name,
 327        "name_type": place.name_type,
 328        "feat_class": place.feature_class,
 329        "feat_code": place.feature_code,
 330        "cc": place.country_code,
 331        "FIPS_cc": place.country_code_fips,
 332        "source": place.source,
 333        # "script": place.name_script,
 334        "search_only": place.search_only
 335    }
 336
 337    # ADMIN level 1/2 boundary names:
 338    if place.adm1:
 339        rec["adm1"] = place.adm1
 340    if place.adm2:
 341        rec["adm2"] = place.adm2
 342    # ID BIAS:
 343    rec["id_bias"] = 0 if place.id_bias is None else place.id_bias
 344
 345    if target == "index":
 346        # Preserve innate precision on Lat/Lon: e.g., "4.5,-118.4" is result if only that amount of precision is present
 347        rec["geo"] = ",".join([str(place.lat), str(place.lon)]),
 348        # Name Group / Script tests:
 349        if place.name_group == "ar":
 350            rec["name_ar"] = place.name
 351        elif place.name_group == "cjk":
 352            rec["name_cjk"] = place.name
 353    elif target == "db":
 354        # Required fields:
 355        rec["name_bias"] = 0 if place.name_bias is None else place.name_bias
 356        rec["name_group"] = place.name_group
 357        rec["lat"] = place.lat
 358        rec["lon"] = place.lon
 359        rec["adm1"] = place.adm1
 360        rec["adm2"] = place.adm2
 361
 362    return rec
 363
 364
 365def run_query(url, q):
 366    """ Expert mode:  Run a solr query to see what you get back. 
 367        requires you know the schema
 368    """
 369    solrGaz = pysolr.Solr(url)
 370    records = solrGaz.search(q, **{"rows": 100})
 371    places = []
 372    for r in records:
 373        places.append(as_place(r))
 374
 375    return places
 376
 377
 378def print_places(arr, limit=25):
 379    print("FOUND {}. Showing top {}".format(len(arr), limit))
 380    for p in arr[0:limit]:
 381        print(str(p), f"\tfeature: {p.feature_class}/{p.feature_code}")
 382
 383
 384def capitalize(name: dict):
 385    """ Capitalize all city and major admin boundaries """
 386    nm = name["name"]
 387    if nm and not nm[0].isupper():
 388        return
 389
 390    grp = name.get("name_group")
 391    nt = name.get("name_type")
 392    ft = name["feat_class"]
 393    if nm and grp == '' and nt == 'N' and ft in {'A', 'P'}:
 394        # Because we don't like altering data much:
 395        name["name"] = nm[0].upper() + nm[1:]
 396
 397
 398def gaz_resource(fname):
 399    """
 400    Formats the relative path for an item in the ./solr/etc/gazetteer/ metadata
 401    :param fname:
 402    :return:
 403    """
 404    return os.path.join("etc", "gazetteer", fname)
 405
 406
 407def export_admin_mapping(admin_ids, filepath):
 408    """
 409    Experimental:  Map all source place IDs => ADM ids
 410                   Map all standard ADM ids => place IDs
 411    :param admin_ids:  dict for JSON or array for CSV
 412    :param filepath:
 413    :return:
 414    """
 415    with open(filepath, "w", encoding="UTF-8") as fio:
 416        fio.write("\t".join(["ADM1", "PLACE_ID", "LAT", "LON", "NAME"]))
 417
 418        for a1 in admin_ids:
 419            cc = a1["cc"]
 420            adm1 = a1["adm1"]
 421            hasc = f"{cc}.{adm1}"
 422            y, x = a1["lat"], a1["lon"]
 423            entry = [hasc, a1["place_id"], f"{y:0.1f}", f"{x:0.1f}", a1["name"]]
 424            fio.write("\t".join(entry))
 425            fio.write("\n")
 426
 427class ISO3166Registry:
 428    def __init__(self):
 429        self.hierarchy = {}
 430
 431    def is_iso_country(self, cc):
 432        return cc in self.hierarchy
 433
 434    def has_admin1(self, cc, adm1):
 435        registry = self.hierarchy.get(cc)
 436        if not registry:
 437            # No such country here.
 438            return None
 439        return adm1 in registry["admin1"]
 440
 441    def get_admin1_for(self, cc, adm2):
 442        registry = self.hierarchy.get(cc)
 443        if not registry:
 444            # No such country here.
 445            return None
 446        return registry["admin2"].get(adm2)
 447
 448    @staticmethod
 449    def export_admin_mapping():
 450        """
 451        Generate initial file using pycountry.  pycountry is not a library dependency
 452        so it is externalized here.
 453        :return:
 454        """
 455        import pycountry
 456        fpath = gaz_resource("iso3166_admin1_admin2_hierarchy.csv")
 457        with open(fpath, "w", encoding="UTF-8") as fio:
 458            for subd in pycountry.subdivisions:
 459                fio.write("\t".join([subd.country_code, subd.parent_code or "-", subd.code]))
 460                fio.write("\n")
 461
 462    def load_admin_mapping(self):
 463        from opensextant import parse_admin_code
 464        fpath = gaz_resource("iso3166_admin1_admin2_hierarchy.csv")
 465        with open(fpath, "r", encoding="UTF-8") as fio:
 466            for line in fio:
 467                hasc = line.strip().split("\t")
 468                cc_iso = hasc[0]
 469                adm1, adm2 = None, None
 470                parent = hasc[1]
 471                if parent == "-":
 472                    adm1 = parse_admin_code(hasc[2])
 473                else:
 474                    adm1, adm2 = parse_admin_code(hasc[1]), parse_admin_code(hasc[2])
 475
 476                registry = self.hierarchy.get(cc_iso)
 477                if not registry:
 478                    registry = {"admin1": set([]), "admin2": {}}
 479                    self.hierarchy[cc_iso] = registry
 480                registry["admin1"].add(adm1)
 481                if adm2:
 482                    registry["admin2"][adm2] = adm1
 483
 484
class AdminLevelCodes:
    """
    Registry reconciling ADM1 boundary codings between the FIPS and ISO standards,
    indexed by country, place ID, and coarse coordinate grid.  Loaded from and
    saved to a JSON mapping (default: packaged global_admin1_mapping.json).
    """

    # CC = { places = { PLACE_ID: { 'f':FIPS_ADM1, 'i':ISO_ADM1 }},
    #        coords = { COORD_ID: { 'f':FIPS_ADM1, 'i':ISO_ADM1 }},
    #        admin1 = { 'f':{FIPS_ADM1:ISO_ADM1, ...},
    #                   'i':{ISO_ADM1:FIPS_ADM1, ....}}
    #
    #  Step 1a. Inventory all places by ID across all data sources and standards.
    #  Step 1b.       at that time also calculate the relevant codings at the coordinate grid by standard
    #  Step 2.  Reindex mapping all paired ADM1-ADM1 equivalents.
    #
    # NOT every PLACE is represented by both codings, Not every coordinate has both codings
    # NOT every ADM1 has a mapping
    # Questions:
    #     Given an ADM1 in a known standard, what is the alternative code in other standard?
    #     Given a place ID (but no ADM1) locate the relevant ADM1 code

    def __init__(self, filepath=None):
        """
        :param filepath: JSON mapping file; defaults to the packaged global_admin1_mapping.json.
        :raises Exception: from load() when the mapping file does not exist.
        """
        self.places = {}
        self.coords = {}
        self.admin1 = {}
        self.admin2 = {}  # ADM2 contained in ADM1. {ADM2 : ADM1,...}
        self.countries = set([])
        self.admin_hierarchy = {}  # Set before use.
        self.admin_hierarchy_cc = {}
        if filepath:
            self.load(filepath)
        else:
            self.load(pkg_resource_path("global_admin1_mapping.json"))

    def adjust_admin1(self, cc, adm1):
        """
        If adm1 is contained by another ADM1 per the configured hierarchy, return
        (containing ADM1, original code demoted to ADM2); otherwise (None, None).
        """
        # NOTE(review): membership is tested on admin_hierarchy_cc but the lookup
        # reads admin_hierarchy[cc]; assumes both were populated together via
        # set_admin_hierarchy() -- confirm callers always do so.
        if cc in self.admin_hierarchy_cc:
            container_adm1 = self.admin_hierarchy[cc].get(adm1)
            if container_adm1:
                # New mapping:  ADM1, ADM2
                return container_adm1, adm1
        return None, None

    def set_admin_hierarchy(self, countries, adm1_containment):
        """Configure the ADM1 containment data consulted by adjust_admin1()."""
        self.admin_hierarchy_cc = set(countries)
        self.admin_hierarchy = adm1_containment

    def get_alternate_admin1(self, cc, adm1, std):
        """
        EXPERIMENTAL still.

        :param cc: ISO country code
        :param adm1: ADM1 in the given standard
        :param std: standard "FIPS" or "ISO"
        :return: the equivalent ADM1 in the other standard, or None
        """
        # Requires align_admin1() to have built the {"FIPS": ..., "ISO": ...} maps.
        if cc in self.admin1:
            country_registry = self.admin1[cc]
            if country_registry:
                return country_registry[std].get(adm1)
        # No country
        return None

    def add_country(self, cc):
        """Ensure per-country sub-registries exist for cc."""
        self.countries.add(cc)
        if cc not in self.places:
            self.places[cc] = {}
        if cc not in self.coords:
            self.coords[cc] = {}
        if cc not in self.admin1:
            self.admin1[cc] = {}
        if cc not in self.admin2:
            self.admin2[cc] = {}

    def add_place(self, place_id, cc, std, adm1, grid):
        """
        Accumulate discrete ADM1 codings by place instance and location.
        :param place_id: source place ID
        :param cc: ISO country code (must be registered via add_country first)
        :param std: coding standard key, e.g. 'f'/'i'
        :param adm1: ADM1 code in that standard
        :param grid: coarse coordinate grid key (see coord_grid)
        :return:
        """
        alt_adm1, adm2 = self.adjust_admin1(cc, adm1)
        if adm2:
            print("CORRECTION:", cc, alt_adm1, adm2)

        obj = self.places[cc].get(place_id, {})
        if not obj:
            self.places[cc][place_id] = obj
        # TODO: detect errors where a place has a standard set already, but the adm1 value conflicts
        obj[std] = adm1

        crd = self.coords[cc].get(grid, {})
        if not crd:
            self.coords[cc][grid] = crd
        crd[std] = adm1

        if adm2:
            # NOTE(review): when adm2 is new, `mapping` is a fresh dict that is
            # never stored back into self.admin2[cc], so this branch appears to
            # have no lasting effect -- confirm intent.
            mapping = self.admin2[cc].get(adm2, {})
            mapping[adm2] = adm1

    @staticmethod
    def _update_adminset(given, iso=None, fips=None):
        """
        update the iso[i]=f
        update the fips[f]=i
        from the given (f=i, i=f,)

        :param given:  a record of codings {f:val, i:val}
        :param iso:  accumulating iso map
        :param fips:  accumulating fips map
        :return:
        """
        f = given.get("f", "-")
        i = given.get("i", "-")
        curr_f = iso.get(i)
        curr_i = fips.get(f)
        missing_f = not curr_f or curr_f == "-"
        missing_i = not curr_i or curr_i == "-"

        # Both codings present: record the pairing in each direction (first wins).
        if f != "-" and i != "-":
            if missing_f:
                iso[i] = f
            if missing_i:
                fips[f] = i
        else:
            # Only one coding present: register it with an unknown ("-") partner.
            if f != "-" and missing_i:
                fips[f] = "-"
            if i != "-" and missing_f:
                iso[i] = "-"

    def align_admin1(self):
        """Build per-country FIPS<->ISO ADM1 maps from accumulated places and coords."""
        for cc in self.countries:
            # TODO: we'll keep this experimental layer of ADM2s.
            # Usage: user will have to consult the admin2 lookup first.
            fips = {}
            iso = {}
            registry = self.places[cc]  # ADMIN places only please.
            for plid in registry:
                # NOTE(review): update_admin_containment is not defined in this
                # class as visible here -- confirm it is provided elsewhere
                # (subclass or monkey-patch), else this raises AttributeError.
                self.update_admin_containment(registry[plid])

            for plid in registry:
                AdminLevelCodes._update_adminset(registry[plid], iso=iso, fips=fips)

            registry = self.coords[cc]
            for crd in registry:
                AdminLevelCodes._update_adminset(registry[crd], iso=iso, fips=fips)

            # Mappings:
            self.admin1[cc] = {"FIPS": fips, "ISO": iso}

    def as_json(self):
        """Serialize the per-country registries as a plain dict for JSON output."""
        result = {}
        for cc in self.countries:
            result[cc] = {"places": self.places.get(cc),
                          "coords": self.coords.get(cc),
                          "admin1": self.admin1.get(cc)}
        return result

    def save(self, fpath):
        """Write the mapping JSON to fpath."""
        with open(fpath, "w", encoding="UTF-8") as fout:
            json.dump(self.as_json(), fout)

    def load(self, fpath):
        """Load the mapping JSON from fpath; raises if the file is missing."""
        if not os.path.exists(fpath):
            raise Exception("File does not exist", fpath)

        with open(fpath, "r", encoding="UTF-8") as fin:
            content = json.load(fin)
            self.countries = set(content.keys())
            for cc in self.countries:
                self.places[cc] = content[cc].get("places")
                self.coords[cc] = content[cc].get("coords")
                self.admin1[cc] = content[cc].get("admin1")
 657
 658
 659def load_major_cities_iso():
 660    print("Popstats - Load Major Cities / as ISO coded")
 661    admin_lookup = None
 662    try:
 663        admin_lookup = AdminLevelCodes()
 664    except Exception as config_err:
 665        print("Try generating Admin level codes first with \"build prep admin1\" script")
 666        print(str(config_err))
 667        return
 668
 669    # ADM1 hierarchy
 670    #  NGA used FIPS, now uses ISO
 671    #  Geonames uses FIPS, except a few:
 672    #
 673    already_iso = {"US", "BE", "CH", "ME"}
 674
 675    cities = load_major_cities()
 676    problem_countries = {}
 677    all_countries = {}
 678    for city in cities:
 679        cc = city.country_code
 680        all_countries[cc] = 1 + all_countries.get(cc, 0)
 681        if cc in already_iso:
 682            continue
 683
 684        adm1_iso = admin_lookup.get_alternate_admin1(cc, city.adm1, "FIPS")
 685        if adm1_iso == "-":
 686            print("No match for FIPS", cc, city.adm1)
 687            problem_countries[cc] = 1 + problem_countries.get(cc, 0)
 688        elif adm1_iso:
 689            city.adm1 = adm1_iso  # this attr represents the default internal ADM1
 690            city.adm1_iso = adm1_iso  # Yes, this attr represents the ISO ADM1
 691    print("Countries with missing ISO ADM1")
 692    for cc in problem_countries:
 693        if problem_countries[cc] > 1:
 694            print(f"{cc}\t{problem_countries[cc]:4}  /  {all_countries[cc]:4}")
 695    return cities
 696
 697
 698class DB:
 699    def __init__(self, dbpath, commit_rate=1000, debug=False, add_geohash=False):
 700        """
 701        Save items to SQlite db at the commit_rate given.  Call close to finalize any partial batches
 702        and save database.
 703
 704        :param dbpath:
 705        :param commit_rate:
 706        """
 707        self.dbpath = dbpath
 708        self.conn = None
 709        self.queue = []
 710        self.queue_count = 0
 711        self.commit_rate = commit_rate
 712        self.debug = debug
 713        self.geohash_default = add_geohash
 714        if not os.path.exists(dbpath):
 715            ensure_dirs(dbpath)
 716            self.reopen()
 717            self.create()
 718        else:
 719            self.reopen()
 720
 721    def purge(self, q):
 722        if "source" in q:
 723            self.conn.execute("delete from placenames where source = ?", (q["source"],))
 724            self.conn.commit()
 725            print("Purged")
 726        else:
 727            print("Query not implemented ", q)
 728
 729    def delete_places(self, q):
 730        """
 731        :param q: query starting with "WHERE...."
 732        :return:
 733        """
 734        if not q:
 735            raise Exception("Query required silly")
 736        self.conn.execute(f"delete from placenames {q}")
 737
    def create(self):
        """
        Create the placenames table and default indices used for ETL - place_id, source, country, and ADM1
        Also creates the popstats table for population statistics keyed by geohash grid.
        :return:
        """
        # Main gazetteer table; "without rowid" uses `id` as the clustered key.
        sql_script = """
            create TABLE placenames (
                `id` INTEGER PRIMARY KEY,
                `place_id` TEXT NOT NULL,
                `name` TEXT NOT NULL,
                `name_type` TEXT NOT NULL,
                `name_group` TEXT NULL,
                `source` TEXT NOT NULL,
                `feat_class` TEXT NOT NULL,
                `feat_code` TEXT NOT NULL,
                `cc` TEXT NULL,
                `FIPS_cc` TEXT  NULL,
                `adm1` TEXT  NULL,
                `adm2` TEXT  NULL,
                `lat` REAL NOT NULL,
                `lon` REAL NOT NULL,
                `geohash` TEXT NOT NULL,
                `duplicate` BIT DEFAULT 0,
                `name_bias` INTEGER DEFAULT 0,
                `id_bias` INTEGER DEFAULT 0,
                `search_only` BIT DEFAULT 0                
            ) without rowid;
            
            create INDEX plid_idx on placenames ("place_id");               
            create INDEX s_idx on placenames ("source");
            create INDEX c_idx on placenames ("cc");
            create INDEX a1_idx on placenames ("adm1");
        """
        self.conn.executescript(sql_script)
        self.conn.commit()

        # Population statistics that use location (geohash) as primary key
        sql_script = """
        create TABLE popstats (
                `grid` TEXT NOT NULL, 
                `population` INTEGER NOT NULL,
                `source` TEXT NOT NULL,
                `feat_class` TEXT NOT NULL,
                `cc` TEXT NOT NULL,
                `FIPS_cc` TEXT  NULL,
                `adm1` TEXT  NULL, 
                `adm1_path` TEXT NOT NULL,        
                `adm2` TEXT  NULL, 
                `adm2_path` TEXT NOT NULL        
        );
        
        create INDEX IF NOT EXISTS idx1 on popstats (`grid`);
        create INDEX IF NOT EXISTS idx2 on popstats (`source`);
        create INDEX IF NOT EXISTS idx3 on popstats (`cc`);
        create INDEX IF NOT EXISTS idx4 on popstats (`adm1`);
        create INDEX IF NOT EXISTS idx5 on popstats (`adm2`);
        
        """
        self.conn.executescript(sql_script)
        self.conn.commit()
 798
    def create_indices(self):
        """
        Create additional indices that are used for advanced ETL functions and optimization.
        Idempotent: uses IF NOT EXISTS throughout (s_idx/c_idx/a1_idx may already
        exist from create()).
        :return:
        """
        self.reopen()
        indices = """
            create INDEX IF NOT EXISTS n_idx on placenames ("name");
            create INDEX IF NOT EXISTS nt_idx on placenames ("name_type");
            create INDEX IF NOT EXISTS ng_idx on placenames ("name_group");
            create INDEX IF NOT EXISTS s_idx on placenames ("source");
            create INDEX IF NOT EXISTS c_idx on placenames ("cc");
            create INDEX IF NOT EXISTS a1_idx on placenames ("adm1");
            create INDEX IF NOT EXISTS fc_idx on placenames ("feat_class");
            create INDEX IF NOT EXISTS ft_idx on placenames ("feat_code");
            create INDEX IF NOT EXISTS dup_idx on placenames ("duplicate");
            create INDEX IF NOT EXISTS so_idx on placenames ("search_only");
            create INDEX IF NOT EXISTS lat_idx on placenames ("lat");
            create INDEX IF NOT EXISTS lon_idx on placenames ("lon");
                        
        """
        self.conn.executescript(indices)
        self.conn.commit()
 822
 823    def optimize(self):
 824        self.reopen()
 825        self.conn.execute("VACUUM")
 826        self.conn.commit()
 827
 828    def reopen(self):
 829        if self.conn is not None:
 830            return
 831
 832        # really close cleanly
 833        self.close()
 834
 835        self.conn = sqlite3.connect(self.dbpath)
 836        self.conn.execute('PRAGMA cache_size = 8092')
 837        self.conn.execute('PRAGMA page_size =  8092')  # twice default. Cache = 8092 x 8KB pages ~ 64MB
 838        self.conn.execute('PRAGMA mmap_size =  1048576000')  # 1000 MB
 839        self.conn.execute("PRAGMA encoding = 'UTF-8'")
 840        self.conn.execute('PRAGMA synchronous = OFF')
 841        self.conn.execute('PRAGMA locking_mode = EXCLUSIVE')
 842        self.conn.execute('PRAGMA journal_mode = MEMORY')
 843        self.conn.execute('PRAGMA temp_store = MEMORY')
 844        self.conn.row_factory = sqlite3.Row
 845
 846    def commit(self):
 847        if self.conn:
 848            self.conn.commit()
 849
    def close(self):
        """
        Flush any queued rows (final partial batch) and close the connection.
        Integrity errors from the final flush are reported; any other error is
        swallowed and the connection reference is dropped regardless.
        """
        try:
            if self.conn is not None:
                self.__assess_queue(force=True)
                self.conn.close()
                self.conn = None
        except sqlite3.IntegrityError as sql_err:
            # Final batch violated a constraint -- report with a short traceback.
            print("Data integrity issue")
            print(format_exc(limit=5))
        except Exception as err:
            # NOTE(review): intentionally best-effort -- all other close errors are
            # silently discarded; consider at least logging them.
            self.conn = None
 861
 862    def _prep_place(self, dct):
 863        """
 864        REQUIRED fields:  'source', 'lat', 'lon'.
 865        OPTIONAL fieldsd: 'search_only'
 866        :param dct:
 867        :return:
 868        """
 869        src = dct["source"]
 870        dct["source"] = GAZETTEER_SOURCES.get(src, src)
 871        if self.geohash_default:
 872            # 6 geohash prefix is about 100m to 200m error. precision=8 is 1m precision.
 873            if "lat" in dct and not dct.get("geohash"):
 874                dct["geohash"] = point2geohash(dct["lat"], dct["lon"], precision=6)
 875        #
 876        # print("Geoname has no location", dct)
 877        if "search_only" not in dct:
 878            nb = dct.get("name_bias", 0)
 879            dct["search_only"] = 1 if nb < 0 else 0
 880
 881        capitalize(dct)
 882
 883    def add_place(self, obj):
 884        """
 885        Add one place
 886        :param obj: a place dictionary.  If arg is a Place object it is converted to dictionary first.
 887        """
 888        dct = None
 889        if isinstance(obj, Place):
 890            dct = as_place_record(obj, target="db")
 891        else:
 892            dct = obj
 893        self._prep_place(dct)
 894        self.queue.append(dct)
 895        self.queue_count += 1
 896        self.__assess_queue()
 897
    def __assess_queue(self, force=False):
        """
        Flush queued rows to the placenames table when the batch is full
        (queue_count >= commit_rate) or when force=True (used by close()).
        """
        if force or (self.queue_count >= self.commit_rate):
            # Named-parameter insert: each queued dict must supply every field below.
            sql = """
            insert into placenames (
                id, place_id, name, name_type, name_group, 
                lat, lon, geohash, feat_class, feat_code,
                cc, FIPS_cc, adm1, adm2, source, name_bias, id_bias, search_only
             ) values (
                :id, :place_id, :name, :name_type, :name_group, 
                :lat, :lon, :geohash, :feat_class, :feat_code,
                :cc, :FIPS_cc, :adm1, :adm2, :source, :name_bias, :id_bias, :search_only)"""
            self.conn.executemany(sql, self.queue)
            self.conn.commit()
            self.queue_count = 0
            self.queue.clear()
 913
 914    def add_places(self, arr):
 915        """ Add a list of places. """
 916        for dct in arr:
 917            self._prep_place(dct)
 918        self.queue.extend(arr)
 919        self.queue_count += len(arr)
 920        self.__assess_queue()
 921
 922    def list_places_by_id(self, plid, limit=2):
 923        """
 924        Collect places and name_bias for gazetter ETL.
 925        Lookup place by ID as in "G1234567" for Geonames entry or "N123456789" for an NGA one, etc.
 926
 927        :param plid: Place ID according to the convention of source initial + identifier
 928        :param limit: limit queries because if we know we only one 2 or 3 we need not search database beyond that.
 929        :return:
 930        """
 931        name_bias = dict()
 932        place = None
 933        for row in self.conn.execute(f"select * from placenames where place_id = ? limit {limit}", (plid,)):
 934            pl = as_place(row, source="db")
 935            if not place:
 936                place = pl
 937            name_bias[pl.name.lower()] = pl.name_bias
 938        if place:
 939            # This is first place encountered.
 940            # This is not idempotent unless SQL query is more explicit
 941            return place, name_bias
 942
 943        return None, None
 944
 945    def add_population_stats(self, source="G"):
 946        """
 947        Population stats are record by populated area (P-class features) and rolled up
 948        to provide an ADM1 population approximation.
 949        """
 950        print("Purge Popstats")
 951        self.conn.execute("delete from popstats where source = ?", (source,))
 952        self.conn.commit()
 953
 954        sql = """insert into popstats (grid, population, source, feat_class, 
 955                cc, FIPS_cc, adm1, adm1_path, adm2, adm2_path) 
 956            values (:grid, :population, :source, :feat_class, 
 957                :cc, :FIPS_cc, :adm1, :adm1_path, :adm2, :adm2_path)"""
 958        #
 959        for city in load_major_cities_iso():
 960            adm2_path = ""
 961            if city.adm2:
 962                adm2_path = make_HASC(city.country_code, city.adm1, adm2=city.adm2)
 963            city_entry = {
 964                "grid": coord_grid({"lat": city.lat, "lon": city.lon}),
 965                "population": city.population,
 966                "source": source,
 967                "feat_class": city.feature_class,
 968                "FIPS_cc": city.country_code_fips,
 969                "cc": city.country_code,
 970                "adm1": city.adm1,
 971                "adm1_path": make_HASC(city.country_code, city.adm1),
 972                "adm2": city.adm2,
 973                "adm2_path": adm2_path
 974            }
 975            self.conn.execute(sql, city_entry)
 976        self.conn.commit()
 977        print("Popstats - Complete")
 978
 979    def list_all_popstats(self):
 980        """
 981        :return: map of population by geohash only
 982        """
 983        sql = """select sum(population) AS POP, grid from popstats group by grid order by POP"""
 984        population_map = {}
 985        for popstat in self.conn.execute(sql):
 986            loc = popstat["grid"]
 987            population_map[loc] = popstat["POP"]
 988        return population_map
 989
 990    def list_adm1_popstats(self):
 991        """
 992        Provides a neat lookup of population stats by HASC path,
 993           e.g., "US.CA" is califronia; Reported at 35 million in major cities (where state total is reported
 994           at 39 million in 2021.)  Population stats only cover major cities of 15K or more people.
 995        :return: map of population stats by ADM1 path
 996        """
 997        sql = "select sum(population) AS POP, adm1_path from popstats where adm1 != '0' group by adm1_path order by POP"
 998        population_map = {}
 999        for popstat in self.conn.execute(sql):
1000            adm1 = popstat["adm1_path"]
1001            population_map[adm1] = popstat["POP"]
1002        return population_map
1003
1004    def list_adm2_popstats(self):
1005        """
1006        Get approximate county-level stats
1007        """
1008        sql = "select sum(population) AS POP, adm2_path from popstats where adm2 != '' group by adm2_path order by POP"
1009        population_map = {}
1010        for popstat in self.conn.execute(sql):
1011            adm2 = popstat["adm2_path"]
1012            population_map[adm2] = popstat["POP"]
1013        return population_map
1014
1015    def list_countries(self):
1016        """
1017        List distinct country codes in DB.
1018        :return: list of country codes.
1019        """
1020        arr = []
1021        for cc in self.conn.execute("select distinct(cc) as CC from placenames"):
1022            arr.append(cc["CC"])
1023        return arr
1024
1025    def list_places(self, cc=None, fc=None, criteria=None, limit=-1):
1026        """
1027        Potentially massive array -- so this is just a Place generator.
1028        :param cc: country code or ''
1029        :param fc: feat class constraint with "*" wildcard, or ''
1030        :param criteria: additional clause to constrain search, e.g. " AND duplicate=0 " to find non-dups.
1031        :param limit:  non-zero limit
1032        :return: generator
1033        """
1034        sql = ["select * from placenames"]
1035        _and = ""
1036        if cc is not None or fc is not None:
1037            sql.append("where")
1038        if cc is not None:
1039            sql.append(f"cc ='{cc}'")
1040        if fc is not None:
1041            if cc is not None:
1042                _and = " and "
1043            if "*" in fc:
1044                sql.append(f"{_and}feat_class like '{fc.replace('*', '%')}'")
1045            else:
1046                sql.append(f"{_and}feat_class = '{fc}'")
1047        if criteria:
1048            # Include the " AND " yourself in critera
1049            sql.append(criteria)
1050        if limit > 0:
1051            sql.append(f"limit {limit}")
1052
1053        # Query
1054        sql_script = " ".join(sql)
1055        if self.debug:
1056            print(sql_script)
1057        for p in self.conn.execute(sql_script):
1058            yield as_place(p, source="db")
1059
1060    def _list_places_at_geohash(self, lat: float = None, lon: float = None, geohash: str = None,
1061                                cc: str = None, radius: int = 5000, limit=10):
1062        """
1063        A best effort guess at spatial query. Returns an array of matches, thinking most location queries are focused.
1064        This is a geohash-backed hack at search.  Even with SQLite indexing this is still very slow.
1065
1066        Use geohash_precision accordingly and approximately:
1067        - geohash_precision=6 implies +/-  500m
1068        - geohash_precision=5 implies +/-  2500m
1069        - geohash_precision=4 implies +/- 20000m
1070
1071        This approach uses an approximation of finding the relevant neighbor cells using a geodetic (not geohash)
1072        assessment on the radial range.  This method hopefully gets past the limitations below.
1073
1074        General limitations of using Geohash for spatial query:
1075        Given the nature of geohash you might have locations in different cells "xxxx" and "xxxy" that are
1076        close to each other, i.e. within your specified radius.  E.g.,
1077
1078        "9q5f"  and "9qh4"  are neighbor cells
1079        "9q5fr" and "9qh42" are neighbor cells.
1080
1081        "9q5fp" is a LR (south-east) corner of "9q5".  Searching that 2x2 KM box by geohash will only search from that
1082        corner north and westward.
1083        :param lat:
1084        :param lon:
1085        :param geohash:
1086        :param cc:
1087        :param radius:
1088        :param limit:
1089        :return: dict of matches,  { DIST = PLACE, ... }
1090        """
1091        if geohash:
1092            # Postpend "sss" to create a default centroid in a shorter geohash.
1093            gh = f"{geohash}sss"[0:6]
1094            (lat, lon) = geohash2point(geohash)
1095        elif lat is None and lon is None:
1096            raise Exception("Provide lat/lon or geohash")
1097
1098        cells = geohash_cells_radially(lat, lon, radius)
1099        sql_script = []
1100        for gh in cells:
1101            if len(gh) >= 6:
1102                sql_script.append(f"select * from placenames where duplicate=0 and geohash = '{gh[0:6]}'")
1103            else:
1104                sql_script.append(f"select * from placenames where duplicate=0 and geohash like '{gh}%'")
1105
1106        found = {}
1107        # Search the entire grid space
1108        for script in sql_script:
1109            for p in self.conn.execute(script):
1110                if cc:
1111                    if p["cc"] != cc:
1112                        continue
1113                place = as_place(p)
1114                dist = distance_haversine(lon, lat, place.lon, place.lat)
1115                if dist < radius:
1116                    found[dist] = place
1117            if len(found) >= limit:
1118                # Return after first round of querying.
1119                break
1120
1121        return found
1122
1123    def _list_places_at_2d(self, lat: float, lon: float,
1124                           cc: str = None, radius: int = 5000, limit=10):
1125        found = {}
1126        sw, ne = bbox(lon, lat, radius)
1127        sql_script = [f"""select * from placenames where 
1128            (lat < {ne.lat:0.6} and lon < {ne.lon:0.6}) and 
1129            (lat > {sw.lat:0.6} and lon > {sw.lon:0.6})"""]
1130        if cc:
1131            sql_script.append(f" and cc = '{cc}'")
1132
1133        script = " ".join(sql_script)
1134        for p in self.conn.execute(script):
1135            place = as_place(p)
1136            dist = distance_haversine(lon, lat, place.lon, place.lat)
1137            if dist < radius:
1138                found[dist] = place
1139
1140        return found
1141
1142    def list_places_at(self, lat: float = None, lon: float = None, geohash: str = None,
1143                       cc: str = None, radius: int = 5000, limit=10, method="2d"):
1144        """
1145
1146        :param lat: latitude
1147        :param lon: longitude
1148        :param cc:  ISO country code to filter.
1149        :param geohash:  optionally, use precomputed geohash of precision 6-chars instead of lat/lon.
1150        :param radius:  in METERS, radial distance from given point to search, DEFAULT is 5 KM
1151        :param limit: count of places to return
1152        :param method: bbox or geohash
1153        :return: array of tuples, sorted by distance.
1154        """
1155        found = {}
1156        if method == "geohash" or geohash and lat is None:
1157            found = self._list_places_at_geohash(lat=lat, lon=lon, geohash=geohash, cc=cc, radius=radius, limit=limit)
1158        elif method == "2d":
1159            found = self._list_places_at_2d(lat=lat, lon=lon, cc=cc, radius=radius, limit=limit)
1160        if not found:
1161            return []
1162
1163        # Sort by distance key
1164        result = [(dist, found[dist]) for dist in sorted(found.keys())]
1165        return result[0:limit]
1166
1167    def list_admin_names(self, sources=['U', 'N', 'G'], cc=None) -> set:
1168        """
1169        Lists all admin level1 names.
1170        :param cc: country code filter.
1171        :param sources: list of source IDs defaulting to those for USGS, NGA, Geonames.org
1172        :return: set of names, lowerased
1173        """
1174        source_criteria = ','.join([f"'{s}'" for s in sources])
1175        sql = f"""select distinct(name) AS NAME from placenames where  feat_class = 'A' and feat_code = 'ADM1' 
1176              and source in ({source_criteria}) and name_group='' and name_type='N'"""
1177
1178        if cc:
1179            sql += f" and cc='{cc}'"
1180        names = set([])
1181        for nm in self.conn.execute(sql):
1182            # To list names, we normalize lowercase and remove dashes.
1183            names.add(nm['NAME'].lower().replace("-", " "))
1184        return names
1185
1186    def update_place_id(self, rowid, plid):
1187        sql = "update placenames set place_id=? where rowid=?"
1188        self.conn.execute(sql, (plid, rowid,))
1189
1190    def mark_duplicates(self, dups):
1191        if not dups:
1192            return False
1193        step = 1000
1194        for x1 in _array_blocks(dups, step=step):
1195            x2 = x1 + step
1196            arg = ",".join([str(dup) for dup in dups[x1:x2]])
1197            sql = f"update placenames set duplicate=1 where id in ({arg})"
1198            self.conn.execute(sql)
1199        self.conn.commit()
1200        return True
1201
1202    def update_name_type(self, arr: list, t: str):
1203        """
1204        Change the name type in bulk.
1205        :param arr: bulk array of placenames to change
1206        :param t: type code 'A', 'N', 'C'
1207        :return:
1208        """
1209        if not arr:
1210            return False
1211        step = 1000
1212        for x1 in _array_blocks(arr, step=step):
1213            x2 = x1 + step
1214            arg = ",".join([str(pl) for pl in arr[x1:x2]])
1215            sql = f"update placenames set name_type='{t}' where id in ({arg})"
1216            self.conn.execute(sql)
1217        self.conn.commit()
1218        return True
1219
1220    def update_admin1_code(self, cc, from_code, to_code):
1221        if not cc:
1222            print("NULL country code operations must be done manually, carefully.")
1223            return False
1224        sql = f"update placenames set adm1='{to_code}' where cc='{cc}' and adm1='{from_code}'"
1225        if from_code == 'NULL':
1226            sql = f"update placenames set adm1='{to_code}' where cc='{cc}' and adm1 is NULL"
1227
1228        if self.debug:
1229            print(sql)
1230        self.conn.execute(sql)
1231        return True
1232
1233    def mark_search_only(self, pid):
1234        """
1235        Toggle bit for search only.
1236        :param pid: Place ID int or list
1237        """
1238        if isinstance(pid, int):
1239            sql = "update placenames set search_only=1 where id=?"
1240            self.conn.execute(sql, (pid,))
1241        elif isinstance(pid, list):
1242            idset = ", ".join([str(x) for x in pid])
1243            sql = f"update placenames set search_only=1 where id in ({idset})"
1244            self.conn.execute(sql)
1245        else:
1246            raise Exception("Place ID integer or list of integers is required")
1247
1248    def update_bias(self, name_bias, rowids):
1249        arg = ",".join([str(pid) for pid in rowids])
1250        flag = 1 if name_bias < 0 else 0
1251        sql = f"update placenames set name_bias=?, search_only=? where id in ({arg})"
1252        self.conn.execute(sql, (name_bias, flag,))
1253
1254    def update_bias_by_name(self, name_bias, name):
1255        flag = 1 if name_bias < 0 else 0
1256        sql = "update placenames set name_bias=?, search_only=? where name = ?"
1257        self.conn.execute(sql, (name_bias, flag, name,))
1258
1259
def _array_blocks(arr, step=1000):
    """
    Break up large arrays so we have predictable updates or queries.

    :param arr: sequence to partition
    :param step: block size
    :return: list of block start offsets; always contains 0, even for empty input.
    """
    # max(..., 1) preserves the original behavior of returning [0] for an
    # empty sequence; otherwise this is simply every multiple of `step`
    # strictly below len(arr), starting at 0.
    return list(range(0, max(len(arr), 1), step))
1273
1274
def add_location(geo, lat, lon, add_geohash=False):
    """
    Insert validated location coordinate and geohash.

    :param add_geohash: due to performance, add this if needed
    :param geo: dict, updated in place with "lat", "lon" and optionally "geohash"
    :param lat: latitude value, str or float
    :param lon: longitude value, str or float
    :return: True if a location was set on geo, False otherwise
    """
    # FIX: test for None/empty instead of truthiness -- 0.0 is a valid latitude
    # (equator) and longitude (prime meridian), but is falsy and was rejected.
    if lat is None or lon is None or lat == "" or lon == "":
        print("No location on ROW", geo.get("place_id"))
        return False

    geo["lat"] = parse_float(lat)
    geo["lon"] = parse_float(lon)
    # Guard against unparseable values; parse_float presumably yields None on
    # bad input -- TODO confirm against opensextant.utility.parse_float.
    if add_geohash and geo["lat"] is not None and geo["lon"] is not None:
        geo["geohash"] = point2geohash(geo["lat"], geo["lon"], precision=6)
    return True
1293
1294
class DataSource:
    """
    Gazetteer Data Source abstraction -- provides guidelines on how to inject
    data into a common, normalized gazetteer.

    Concrete sources supply records by overriding process_source(); normalize()
    drains that generator into the master SQLite gazetteer.
    """

    def __init__(self, dbf, debug=False, ver=None):
        # Batched writer over the master gazetteer SQLite file.
        self.db = DB(dbf, commit_rate=100)
        # Optional version label for this source's data.
        self.ver = ver
        # Progress-print interval, in rows.
        self.rate = 1000000
        # Rows processed. NOTE(review): normalize() reads this but never
        # increments it -- presumably process_source() implementations maintain
        # it; confirm in subclasses.
        self.rowcount = 0
        # Source codes (e.g., "U", "N") contributed by this source; used by purge().
        self.source_keys = []
        # Terms excluded during processing; reported at the end of normalize().
        self.excluded_terms = set([])
        # Suppress progress printing when True.
        self.quiet = False
        # Human-readable source name, used in log messages.
        self.source_name = None
        self.debug = debug

    def purge(self):
        # Delete all rows previously loaded under this source's keys.
        print(f"Purging entries for {self.source_name}")
        for k in self.source_keys:
            print(f"\tsource ID = {k}")
            self.db.purge({"source": k})

    def process_source(self, sourcefile, limit=-1):
        """
        generator yielding DB geo dictionary to be stored.  Placeholder that
        yields None; concrete sources are expected to override this.
        :param sourcefile: Raw data file
        :param limit: limit of number of records to process
        :return: generator of Place object or dict of Place schema
        """
        yield None

    def normalize(self, sourcefile, limit=-1, optimize=False):
        """
        Given the spreadsheet or source file rip through it, ingesting contents into the master gazetteer.
        :param sourcefile: input file
        :param limit: non-zero limit for testing
        :param optimize: if database should be optimized when done.
        :return:
        """
        print("\n============================")
        print(f"Start {self.source_name}. {arrow.now()}  FILE={sourcefile}")
        for geo in self.process_source(sourcefile, limit=limit):
            if self.rowcount % self.rate == 0 and not self.quiet:
                print(f"Row {self.rowcount}")
            if 0 < limit < self.rowcount:
                print("Reached non-zero limit for testing.")
                break
            try:
                self.db.add_place(geo)
            except sqlite3.IntegrityError:
                # Integrity failure aborts the run; dump the pending queue for diagnosis.
                print("Data integrity issue")
                print(format_exc(limit=5))
                print(self.db.queue)
                break
            except Exception:
                # Best-effort: log and continue with the next record.
                print("Error with insertion to DB")
                print(format_exc(limit=5))
        self.db.close()
        if optimize:
            # NOTE(review): optimize() is invoked after close(); presumably
            # DB.optimize manages its own connection -- confirm.
            self.db.optimize()

        print("ROWS: ", self.rowcount)
        print("EXCLUSIONS: ", len(self.excluded_terms))
        if self.debug:
            print("EXCLUSIONS:", self.excluded_terms)
        print(f"End {self.source_name}. {arrow.now()}")
1360            print("EXCLUSIONS:", self.excluded_terms)
1361        print(f"End {self.source_name}. {arrow.now()}")
1362
1363
class GazetteerIndex:
    """
    GazetteerIndex provides a simple API to inject entries into the Gazetteer.
    - Every 1000 records a batch is sent to Solr
    - Every 1,000,0000 records a commit() call is sent to Solr

    This may provide gazetteer specific functions, but as of v1.3 this is a generic Solr wrapper.
    """

    def __init__(self, server_url, debug=False):

        self.server_url = server_url
        # Bare host:port is expanded to the full gazetteer core URL.
        if not self.server_url.startswith("http"):
            self.server_url = f"http://{self.server_url}/solr/gazetteer"

        self.server = pysolr.Solr(self.server_url)
        self.debug = debug

        # Flush thresholds: batch adds every add_rate, commit every commit_rate.
        self.commit_rate = 1000000
        self.add_rate = 1000

        self._records = []
        self.count = 0

    def optimize(self):
        """Ask Solr to optimize the index; no-op in debug mode."""
        if self.server and not self.debug:
            self.server.optimize()

    def save(self, done=False):
        """Push the pending batch and/or commit when thresholds are reached (or when done)."""
        if self.debug:
            return

        # Send batch
        batch_due = done or self.count % self.add_rate == 0
        if self._records and batch_due:
            self.server.add(self._records)
            self._records = []
        # Commit
        commit_due = self.commit_rate > 0 and self.count % self.commit_rate == 0
        if done or commit_due:
            self.server.commit()

    def add(self, place):
        """
        Queue one place for indexing; flushes automatically per the batch thresholds.
        :param place: Place object.
        """
        self._records.append(as_place_record(place))
        self.count += 1
        self.save()

    def delete(self, entry_id=None):
        """
        Awaiting other kwdargs for deletion use cases.
        :param entry_id: master gazetteer row ID in sqlite or solr.  Deletes solr entry
        :return: True if a delete was issued.
        """
        if not entry_id:
            return False
        self.server.delete(id=entry_id)
        return True
1426
1427
class GazetteerSearch:
    def __init__(self, server_url):
        """
        TODO: BETA - looking to abstract Solr().search() function for common types of queries.
            For now getting a list of country name variants is easy enough.
        :param server_url:  URL with path to `/solr/gazetteer' index
        """
        self.index_url = server_url
        self.server = pysolr.Solr(self.index_url)

    def get_countries(self, max_namelen=30):
        """
        Searches gazetteer for Country metadata.
        TODO: dovetail Country metadata (lang, timezone, codes, etc) with
            Country place data.
        TODO: Document different uses for GazetteerSearch.get_countries() from API get_country()
        TODO: Review differences in Place() schema and Country() schema for name variants,
            e.g., Country variants presented as abbreviations, codes or names need to be distinguished as such.
        :param max_namelen: skip country names longer than this.
        :return: list of Country objects.
        """
        countries = []
        hits = self.server.search("feat_class:A AND feat_code:PCL*", **{"rows": 30000})
        for doc in hits:
            country_name = doc['name']
            if len(country_name) > max_namelen:
                continue
            entry = Country()
            entry.name = country_name
            entry.cc_iso2 = doc['cc']
            entry.cc_fips = doc.get('FIPS_cc')
            entry.name_type = doc.get('name_type')
            code = doc.get('feat_code')
            # Territory == NOT an independent political entity (PCLI), or named as such.
            entry.is_territory = code != "PCLI" or "territo" in country_name.lower()
            countries.append(entry)
        return countries
1464
1465
def estimate_name_bias(nm):
    """Scale the trivial name bias of *nm* onto a 100-point scale."""
    return trivial_bias(nm) * 100
1468
1469
1470class PlaceHeuristics:
1471    # Population scale 0 = 16K, 1=32K, 2=64K, 3=128K
1472    LARGE_CITY = 3
1473
    def __init__(self, dbref: DB):
        """
        Heuristics for location and name bias, backed by the master gazetteer DB.

        :param dbref: DB instance
        """
        self.debug = False
        # Lowercased names of major cities; cities_large keeps only those at or
        # above the LARGE_CITY population scale.
        self.cities = set([])
        self.cities_large = set([])
        self.cities_spatial = {}  # keyed by geohash
        self.provinces = {}
        # These should only be used as relative rankings of size of admin boundaries.
        self.adm1_population = {}
        self.adm2_population = {}
        self.stopwords = load_stopterms()
        self.POPULATION_THRESHOLD = 200000
        self.MAX_NAMELEN = 50
        # Running counters over names assessed; see _name_bias().
        self.stat_charcount = 0
        self.stat_namecount = 0
        # Terms appearing in GoogleBooks 8,000,000 or more are consider not tag-worthy for geography, in general
        # NOTE(review): the comment above says 8,000,000 but the threshold used
        # below is 6,000,000 -- confirm which is intended.
        self.wordlookup = WordStats(get_default_wordstats())
        self.wordlookup.load_common(threshold=6000000)

        # Path relative to ./solr/
        fpath = os.path.join('etc', 'gazetteer', 'filters', 'non-placenames,admin-codes.csv')
        self.stopwords_admin_codes = set(load_list(fpath))

        # Feature codes treated as significant by is_significant().
        self.exempt_features = {"PPLC", "ADM1", "PCLI", "PCL"}
        self.exempted_names = {}
        # Relative feature weights keyed by "CLASS" or "CLASS/CODE"; consumed by
        # get_feature_scale() with a default of 5 for unknown classes.
        self.feature_wt = {
            "A": 11,
            "A/ADM1": 16,
            "A/ADM2": 14,
            "A/PCL": 16,
            "P": 10,
            "P/PPL": 10,  # Most common
            "P/PPLC": 15,
            "P/PPLA": 10,
            "P/PPLG": 9,
            "P/PPLH": 8,
            "P/PPLQ": 7,
            "P/PPLX": 7,
            "P/PPLL": 8,
            "L": 6,
            "R": 6,
            "H": 7,
            "H/SPNG": 2,
            "H/RSV": 2,
            "H/STM": 2,
            "H/WLL": 2,
            "V": 7,
            "S": 8,
            "U": 2,
            "T": 5,
            "T/ISL": 6,
            "T/ISLS": 6
        }

        # This is a set (list) of distinct names for ADM1 level names.
        # This obviously changes as you build out the master gazetteer as in the beginning it has NOTHING.
        self.provinces = dbref.list_admin_names()

        # Pop stats are primarily for P/PPL.
        for city in load_major_cities():
            self.cities.add(city.name.lower())
            if city.population_scale >= PlaceHeuristics.LARGE_CITY:
                self.cities_large.add(city.name.lower())

        self.cities_spatial = dbref.list_all_popstats()
        # These should only be used to score specific feature types ADM1 or ADM2
        self.adm1_population = dbref.list_adm1_popstats()
        self.adm2_population = dbref.list_adm2_popstats()
1545
1546    def is_large_city(self, name):
1547        return name in self.cities_large
1548
1549    def is_significant(self, feat) -> bool:
1550        return feat in self.exempt_features
1551
1552    def is_province_name(self, name) -> bool:
1553        """
1554        Report if a name is that of a province, regardless of whether the location repreents something else.
1555        E.g.
1556            "Florida" is a city (lesser known) or a state (well known).   Therefore it is a popular name.
1557        :param name:
1558        :return:
1559        """
1560        return name in self.provinces
1561
1562    def is_stopword(self, name: str) -> bool:
1563        if name in self.stopwords:
1564            return True
1565
1566        if name.replace("-", " ") in self.stopwords:
1567            return True
1568
1569        # Name is "Bar"...
1570        # test if "The Bar" is a stopword
1571        if f"the {name}" in self.stopwords:
1572            return True
1573
1574        # Name is "The Bar"
1575        # test if "bar" is a stopword
1576        if name.startswith("the "):
1577            if name[4:].strip() in self.stopwords:
1578                return True
1579        return False
1580
1581    def estimate_bias(self, geo, name_group=""):
1582        """
1583        Primary Estimator of id_bias and name_bias.
1584
1585        id_bias   -- a location bias to pre-rank city by feature/population
1586        name_bias -- a metric ranging from -1 to 1, that represents the validity of a tagging the name/phrase
1587                     in a general context.  The result is eventually binary  search_only = name_bias < 0. This means
1588                     that geo names that are search_only are not taggable.
1589        :param geo:
1590        :param name_group:
1591        :return:
1592        """
1593        geo["id_bias"] = self.location_bias(geo)
1594        geo["name_bias"] = self.name_bias(geo["name"], geo["feat_class"], geo["feat_code"],
1595                                          name_group=name_group, name_type=geo["name_type"])
1596
1597    def get_feature_scale(self, fc, dsg):
1598        """
1599
1600        :param fc: feature class
1601        :param dsg: feature code
1602        :return:
1603        """
1604        #  Location bias is 70% population, 30% feature type
1605        #
1606        if not dsg:
1607            return self.feature_wt.get(fc, 5)
1608
1609        fckey = f"{fc}/{dsg}"
1610        for glen in [6, 5]:
1611            fc_scale = self.feature_wt.get(fckey[0:glen])
1612            if fc_scale:
1613                return fc_scale
1614
1615        return self.feature_wt.get(fc, 5)
1616
1617    def location_bias(self, geo):
1618        """
1619        See estimate_bias()
1620
1621        A location is pre-disposed by its feature type and population/popularity.
1622        E.g., large cities are mentioned more often in news or documents than less populated cities.
1623        Factors:
1624
1625        Feature gradient     A, P, ..... U.  More populated features have higer bias
1626        Population gradient  log(pop)  scales bias higher
1627
1628        :param geo:  standard ETL geo dict
1629        :return:  score on 100 point scale.
1630        """
1631        return int(10 * self._location_bias(geo))
1632
1633    def _location_bias(self, geo):
1634        """
1635        dict with parts:
1636
1637        :param geo:  standard ETL geo dict
1638        :return:  A number on the range of 0 to 10 approximately.
1639        """
1640        fc = geo["feat_class"]
1641        dsg = geo["feat_code"]
1642        pop_wt = 0
1643        fc_scale = self.get_feature_scale(fc, dsg)
1644
1645        if fc == 'P':
1646            lockey = coord_grid(geo)
1647            population = self.cities_spatial.get(lockey, 0)
1648            pop_wt = popscale(population, feature="city")
1649        if fc == 'A':
1650            cc = geo["cc"]
1651            a1 = geo["adm1"]
1652            pop_wt = 1
1653            if dsg == 'ADM1':
1654                adm_path = make_HASC(cc, a1)
1655                population = self.adm1_population.get(adm_path, 0)
1656                pop_wt = popscale(population, feature="province")
1657            if dsg == 'ADM2' and "adm2" in geo:
1658                adm_path = make_HASC(cc, a1, geo["adm2"])
1659                population = self.adm2_population.get(adm_path, 0)
1660                pop_wt = popscale(population, feature="district")
1661
1662        # For PLACES  this helps differentiate P/PPL by population
1663        # Between PLACES and BOUNDARIES the population component may rank places
1664        # higher than a boundary by the same name.
1665        #
1666        # Weighted sums -- Population has more information than the feature, so we weight that higher.
1667        return (0.75 * pop_wt) + (0.25 * fc_scale)
1668
    def name_bias(self, geoname: str, feat_class: str, feat_code: str, name_group="", name_type="N"):
        """
        See estimate_bias()

        Given a geoname we look at the instance of the name variant and if it is something trivially
        colliding with stopwords in other languages then we consider omitting it.

        very positive bias   - long unique name, diacritic or in non-ASCII script
        positive bias        - normal location name, multiple words or grams
        neutral              - possibly a place name, but is case-dependent, e.g., person name or generic monument name.
        negative bias        - a stopword or trivial version of a stopword, e.g. `Åre`
        very negative bias   - a very rare or extremely long version of a place name, nonsense
        -1                   - WordStats reports as a "common" word.

        Conclusion: Any Negative name_bias term will NOT be tagged, although it is present in gazetteer.

        CODE and ABBREV are not biased -- they are simply not full names.

        TODO: ONLY unigrams are tracked, so
            "Alabama" -> not common,
            "Need" -> common,
            "New York" -> not tracked. This is a bi-gram

        :param geoname: name variant to assess
        :param feat_class: feature class, e.g. "A" or "P"
        :param feat_code: feature designation code, e.g. "PPL"
        :param name_group: script/language group -- "", "ar" or "cjk"
        :param name_type: name type flag; "N" (full name) is the default -- codes/abbreviations are not biased
        :return: integer between -100 and 100 (the internal fractional bias scaled by 100)
        """
        # Scale the fractional [-1.0, 1.0] bias to an integer percentage for storage.
        return int(100 * self._name_bias(geoname, feat_class, feat_code, name_group=name_group, name_type=name_type))
1700
    def _name_bias(self, geoname, feat_class, feat_code, name_group="", name_type="N"):
        """
        Assess a name against common-word stats, feature metadata, and language script,
        producing a fractional bias roughly in [-1.0, 1.0].  Negative values flag names
        that should not be tagged; positive values favor tagging.

        :param geoname: name str
        :param feat_class:  UNUSED
        :param feat_code: feature designation; used for the feature-significance test
        :param name_group: "", "ar" or "cjk"
        :param name_type: name type flag; only the default "N" path is differentiated here
        :return: float bias, roughly -1.0 to 1.0
        """

        if name_group in {'cjk', 'ar'}:
            # Non-Latin script names get a small boost; they rarely collide with Latin stopwords.
            # TODO: Should look up Stopwords here, but that likely happens in tagger.
            return trivial_bias(geoname) + 0.10

        # Running tallies for reporting/metrics.
        self.stat_namecount += 1
        namelen = len(geoname)
        self.stat_charcount += namelen

        # if name_type == "C" and is_administrative(feat_class):
        # Quick checks:
        if namelen < 5:
            # Check for administrative codes that are most commonly stopterms or other meanings
            if geoname.upper() in self.stopwords_admin_codes:
                return -1
            # Omit pure digit names
            if geoname.isdigit():
                return -1

        if namelen < 2:
            # Single characters are never taggable full names.
            return -0.1
        elif 30 < namelen < self.MAX_NAMELEN:
            # Long names are distinctive enough; skip the stopword machinery below.
            return trivial_bias(geoname)
        elif namelen >= self.MAX_NAMELEN:
            # Name is too long to consider tagging; Unlikely to appear in this form.
            return -0.1

        # Test shorter names:  Combine feature, stopwords, and other tests.
        # ==============================================================
        # FIRST -- see if a judgement was made on a name already.
        norm = geoname.lower()
        if norm in self.exempted_names:
            return self.exempted_names[norm]

        # SECOND -- figure out if name is significant and popular because it is a popular place
        #           rather than just a common word.

        # TODO: add non-diacritic name to this test?
        norm2 = strip_quotes(replace_diacritics(norm))
        norm2 = norm2.replace("-", " ")
        is_popular_place = self.is_significant(feat_code) or self.is_large_city(norm) or \
                           self.is_province_name(norm) or self.is_province_name(norm2)

        # Example: "Moscow (P/PPLC)" significant. Name is exempted (significant feature)
        #          "Moscow (A/ADM2)" not significant; But it is flagged as "common"...and omitted without this:
        #          "Florida (P/PPL)" is not a common place
        #          "Florida (A/ADM1)" is a significant place.
        #                Note "Large Cities" vs. ADMIN-LEVEL1 boundaries are different lookups
        if is_popular_place:
            # Cache the judgement so later occurrences of this name short-circuit above.
            self.exempted_names[norm] = trivial_bias(geoname)
            return self.exempted_names[norm]
        elif self.is_stopword(norm):
            return -1
        elif self.wordlookup.is_common(norm):
            # is a common word, but not associated often with a location
            return -1
        else:
            # Much deeper checks on about 90% of the names
            # Omit short diacritic names that are typically stopwords.  These are partial biases
            # since we are now checking if the non-diacritic version is filtered.
            if norm != norm2:
                if self.wordlookup.is_common(norm2):
                    return -0.9

                if norm2 in self.stopwords:
                    return -0.5

                if norm2.upper() in self.stopwords_admin_codes:
                    return -0.6

        # Return a positive value.
        return trivial_bias(norm)
1782
1783
if __name__ == "__main__":

    import argparse

    # Simple demo CLI: look up a place by NAME,PROV,CC tuple, or run a raw Solr query.
    ap = argparse.ArgumentParser()
    ap.add_argument('--solr')
    ap.add_argument('--output')
    ap.add_argument('--query')
    ap.add_argument('--lookup')
    ap.add_argument('--parse', action="store_true", default=False)
    ap.add_argument('--demo')

    args = ap.parse_args()

    # NOTE(review): --output and --demo are accepted but unused in the dispatch
    # visible here -- presumably handled further below or vestigial; confirm.
    if args.lookup:
        findings = run_lookup(args.solr, args.lookup, args.parse)
        print_places(findings)
    elif args.query:
        findings = run_query(args.solr, args.query)
        print_places(findings)
def coord_grid(geo: dict) -> "str | None":
    """
    A less dissatisfying grid than geohash: just Y,X at low resolution, "LLL.l,LLL.l",
    e.g. "36.5,-119.5".

    Fix: the return annotation previously claimed `str`, but the missing-lat
    path returns None; annotate the optional result honestly.

    :param geo: dict with numeric "lat" and "lon" entries.
    :return: "LAT,LON" key at 0.1-degree resolution, or None when "lat" is absent.
    """
    if "lat" not in geo:
        # No location present on this row; callers must tolerate None.
        return None
    x, y = geo["lon"], geo["lat"]
    return f"{y:0.1f},{x:0.1f}"

def load_stopterms(project_dir='.', lower=True):
    """
    Collect the default stop terms from the project source tree for a build.

    :param project_dir: The location of Xponents/solr source tree.
    :param lower: default case to load data as; if not lower, terms are loaded as-is.
    :return: set of stop terms
    """
    filter_files = (
        "etc/gazetteer/filters/non-placenames.csv",
        "etc/gazetteer/filters/non-placenames,spa.csv",  # Spanish
        "etc/gazetteer/filters/non-placenames,rus,ukr.csv",  # Cyrillic languages
        "etc/gazetteer/filters/non-placenames,deu.csv",  # German
        "etc/gazetteer/filters/non-placenames,acronym.csv",
    )
    loader = ConfigUtility()
    stopterms = set()
    for fpath in filter_files:
        rows = loader.loadDataFromFile(os.path.join(project_dir, fpath), ",")
        for row in rows:
            stopterms.add(row[0].lower() if lower else row[0])
    return stopterms

def run_lookup(url, lookup, parse):
    """
    Gazetteer demo mimicking some of the logic in XponentsGazetteerQuery.
    Try: "San Francisco, CA, US"
    """
    solr_gaz = pysolr.Solr(url)
    places = []

    if parse:
        # See the other Java demo, XponentsGazetteerQuery; assume NAME, PROV, COUNTRY.
        slots = [tok.strip() for tok in lookup.split(',')]
        if len(slots) < 3:
            print("NAME, PROV, CC  is required format for --lookup")
            return None

        city, prov, country = slots[0], slots[1], slots[2]

        # Resolve the province first; its ADM1 code feeds the follow-up query.
        query = 'name:"{}" AND feat_class:A AND cc:{}'.format(prov, country)
        records = solr_gaz.search(query, **{"rows": 100})
        if not records:
            return None

        # Use a Place object to abstract the raw doc.
        adm1 = as_place(records.docs[0])
        # Best match for the tuple NAME/PROV/COUNTRY.
        query = 'name:"{}" AND feat_class:A AND cc:{} AND adm1:{}'.format(city, country, adm1.adm1)
        records = solr_gaz.search(query, **{"rows": 1000})
    else:
        query = 'name:"{}" AND feat_class:P'.format(lookup)
        records = solr_gaz.search(query, **{"rows": 1000})

    if not records:
        return None

    for doc in records:
        places.append(as_place(doc))
    return places

def normalize_name(nm: str):
    """
    Convenience normalization for names: straighten curly apostrophes, convert
    non-breaking spaces to plain spaces, then trim whitespace and wrapping apostrophes.
    :param nm: raw name
    :return: normalized name
    """
    cleaned = nm.replace("\u2019", "'").replace("\xa0", " ")
    return cleaned.strip().strip("'")

def name_group_for(nm: str):
    """
    Determine the major language "name group" for the input text.
    :param nm: name or any text
    :return: "cjk", "ar", or "" for everything else
    """
    if has_cjk(nm):
        return "cjk"
    if has_arabic(nm):
        return "ar"
    return ""

def as_admin_place(r):
    """
    Convert a gazetteer row (Solr or SQLite) to a Place carrying only
    administrative fields; coordinate fields are intentionally nulled.
    :param r: gazetteer row from Solr or SQlite.
    :return: Place
    """
    row_keys = r.keys() if hasattr(r, "keys") else {}

    place = Place(r['place_id'], r['name'])
    place.country_code = r["cc"]
    place.adm1 = r["adm1"]
    place.source = r["source"]
    place.geohash = r["geohash"]
    if "adm1_iso" in row_keys:
        place.adm1_iso = r["adm1_iso"]

    # No location on admin metadata rows.
    place.lat = place.lon = place.X = place.Y = None
    return place

def as_place(r, source='index'):
    """
    Convert a gazetteer row (Solr or SQLite) to a Place object.
    :param source: "db" or "index" (solr)
    :param r: gazetteer row from Solr or SQlite.
    :return: Place
    """
    row_keys = r.keys() if hasattr(r, "keys") else {}

    # Solr stores the coordinate as a single "LAT,LON" field; the DB keeps
    # discrete lat/lon columns.
    if "geo" in r:
        lat, lon = r['geo'].split(',')
    else:
        lat, lon = r["lat"], r["lon"]

    place = Place(r['place_id'], r['name'], lat=lat, lon=lon)
    place.country_code = r["cc"]
    place.feature_class = r["feat_class"]
    place.feature_code = r["feat_code"]
    if "id" in r:
        # Required if coming from or going into a database.
        place.id = r["id"]
    place.id_bias = r["id_bias"]
    if source == "db":
        place.name_bias = r["name_bias"]

    # Optional fields:
    if "FIPS_cc" in row_keys:
        place.country_code_fips = r["FIPS_cc"]
    if "adm1" in row_keys:
        place.adm1 = r["adm1"]
    if "adm2" in row_keys:
        place.adm2 = r["adm2"]
    if "geohash" in row_keys:
        place.geohash = r["geohash"]
    if "id" in row_keys:
        place.id = r["id"]
    if "source" in row_keys:
        place.source = r["source"]
    if "name_group" in row_keys:
        place.name_group = r["name_group"]
    if "search_only" in row_keys:
        place.search_only = get_bool(r["search_only"])
    if "name_type" in row_keys:
        place.name_type = r["name_type"]

    place.is_ascii = is_ascii(place.name)
    return place

def as_place_record(place, target='index'):
    """
    Given a Place object, serialize it as a dict consistent with the Solr index schema.
    :param place: Place object
    :param target: "index" or "db"
    :return: dict shaped for the chosen target, or None if input is not a Place.
    """
    if not isinstance(place, Place):
        return None
    rec = {
        "id": place.id,
        "place_id": place.place_id,
        "name": place.name,
        "name_type": place.name_type,
        "feat_class": place.feature_class,
        "feat_code": place.feature_code,
        "cc": place.country_code,
        "FIPS_cc": place.country_code_fips,
        "source": place.source,
        # "script": place.name_script,
        "search_only": place.search_only
    }

    # ADMIN level 1/2 boundary names:
    if place.adm1:
        rec["adm1"] = place.adm1
    if place.adm2:
        rec["adm2"] = place.adm2
    # ID BIAS:
    rec["id_bias"] = 0 if place.id_bias is None else place.id_bias

    if target == "index":
        # Preserve innate precision on Lat/Lon: e.g., "4.5,-118.4" is result if only that amount of precision is present
        # BUG FIX: a stray trailing comma previously made rec["geo"] a 1-tuple
        # ("lat,lon",) instead of the "lat,lon" string the Solr schema expects.
        rec["geo"] = ",".join([str(place.lat), str(place.lon)])
        # Name Group / Script tests:
        if place.name_group == "ar":
            rec["name_ar"] = place.name
        elif place.name_group == "cjk":
            rec["name_cjk"] = place.name
    elif target == "db":
        # Required fields:
        rec["name_bias"] = 0 if place.name_bias is None else place.name_bias
        rec["name_group"] = place.name_group
        rec["lat"] = place.lat
        rec["lon"] = place.lon
        rec["adm1"] = place.adm1
        rec["adm2"] = place.adm2

    return rec

def run_query(url, q):
    """
    Expert mode: run a raw Solr query and convert the hits to Place objects.
    Requires familiarity with the gazetteer schema.
    """
    gaz = pysolr.Solr(url)
    hits = gaz.search(q, **{"rows": 100})
    return [as_place(doc) for doc in hits]

def capitalize(name: dict):
    """ Capitalize all city and major admin boundaries """
    nm = name["name"]
    # Skip names whose first character is not already uppercase.
    # NOTE(review): combined with the assignment below, this guard makes the
    # function a practical no-op (upper() on an already-uppercase char changes
    # nothing) -- confirm whether the guard was meant to be inverted.
    if nm and not nm[0].isupper():
        return

    group = name.get("name_group")
    ntype = name.get("name_type")
    fclass = name["feat_class"]
    # Because we don't like altering data much: only plain Latin proper names
    # for admin/populated-place features are touched.
    if nm and group == '' and ntype == 'N' and fclass in {'A', 'P'}:
        name["name"] = nm[0].upper() + nm[1:]

def gaz_resource(fname):
    """
    Build the relative path for an item under the ./solr/etc/gazetteer/ metadata tree.
    :param fname: resource file name
    :return: relative path string
    """
    return os.path.join("etc", "gazetteer", fname)

def export_admin_mapping(admin_ids, filepath):
    """
    Experimental:  Map all source place IDs => ADM ids
                   Map all standard ADM ids => place IDs
    Output is tab-separated with a header row.
    :param admin_ids:  iterable of admin dicts with cc, adm1, lat, lon, place_id, name
    :param filepath: output TSV path
    :return:
    """
    with open(filepath, "w", encoding="UTF-8") as fio:
        fio.write("\t".join(["ADM1", "PLACE_ID", "LAT", "LON", "NAME"]))
        # BUG FIX: the header previously had no trailing newline, so the first
        # data row was fused onto the header line.
        fio.write("\n")

        for a1 in admin_ids:
            cc = a1["cc"]
            adm1 = a1["adm1"]
            hasc = f"{cc}.{adm1}"
            y, x = a1["lat"], a1["lon"]
            entry = [hasc, a1["place_id"], f"{y:0.1f}", f"{x:0.1f}", a1["name"]]
            fio.write("\t".join(entry))
            fio.write("\n")

def add_location(geo, lat, lon, add_geohash=False):
    """
    Insert validated location coordinate and geohash into a geo dict.
    :param add_geohash: due to performance, add geohash only if needed
    :param geo: dict
    :param lat: latitude value, str or float
    :param lon: longitude value, str or float
    :return: True if a location was set on geo, else False
    """
    # BUG FIX: the previous truthiness test (`if lat and lon:`) rejected
    # legitimate 0.0 coordinates on the equator / prime meridian.  Test for
    # missing values instead.
    if lat not in (None, "") and lon not in (None, ""):
        geo["lat"] = parse_float(lat)
        geo["lon"] = parse_float(lon)
        if add_geohash and "lat" in geo:
            geo["geohash"] = point2geohash(geo["lat"], geo["lon"], precision=6)
        return True

    print("No location on ROW", geo.get("place_id"))
    return False

class DataSource:
    """
    Gazetteer Data Source abstraction -- provides guidelines on how to inject
    data into a common, normalized gazetteer.
    """

    def __init__(self, dbf, debug=False, ver=None):
        self.db = DB(dbf, commit_rate=100)
        self.ver = ver
        self.rate = 1000000
        # NOTE(review): rowcount is read in normalize() but not incremented
        # there; presumably subclasses update it in process_source() -- confirm.
        self.rowcount = 0
        self.source_keys = []
        self.excluded_terms = set()
        self.quiet = False
        self.source_name = None
        self.debug = debug

    def purge(self):
        """Remove all rows belonging to this source's keys from the DB."""
        print(f"Purging entries for {self.source_name}")
        for k in self.source_keys:
            print(f"\tsource ID = {k}")
            self.db.purge({"source": k})

    def process_source(self, sourcefile, limit=-1):
        """
        Generator yielding DB geo dictionaries to be stored; subclasses override.
        :param sourcefile: raw data file
        :param limit: max number of records to process
        :return: generator of Place objects or dicts of the Place schema
        """
        yield None

    def normalize(self, sourcefile, limit=-1, optimize=False):
        """
        Rip through the spreadsheet/source file, ingesting contents into the
        master gazetteer.
        :param sourcefile: input file
        :param limit: non-zero limit for testing
        :param optimize: whether the database should be optimized when done.
        :return:
        """
        print("\n============================")
        print(f"Start {self.source_name}. {arrow.now()}  FILE={sourcefile}")
        for geo in self.process_source(sourcefile, limit=limit):
            if not self.quiet and self.rowcount % self.rate == 0:
                print(f"Row {self.rowcount}")
            if 0 < limit < self.rowcount:
                print("Reached non-zero limit for testing.")
                break
            try:
                self.db.add_place(geo)
            except sqlite3.IntegrityError:
                # Integrity failure is fatal for the run; dump the queue for diagnosis.
                print("Data integrity issue")
                print(format_exc(limit=5))
                print(self.db.queue)
                break
            except Exception:
                # Other insertion errors are logged but do not stop the run.
                print("Error with insertion to DB")
                print(format_exc(limit=5))
        self.db.close()
        if optimize:
            self.db.optimize()

        print("ROWS: ", self.rowcount)
        print("EXCLUSIONS: ", len(self.excluded_terms))
        if self.debug:
            print("EXCLUSIONS:", self.excluded_terms)
        print(f"End {self.source_name}. {arrow.now()}")

def process_source(self, sourcefile, limit=-1):
    """
    Generator yielding DB geo dictionaries to be stored; subclasses override.
    :param sourcefile: raw data file
    :param limit: max number of records to process
    :return: generator of Place objects or dicts of the Place schema
    """
    yield None

def normalize(self, sourcefile, limit=-1, optimize=False):
    """
    Rip through the spreadsheet/source file, ingesting contents into the
    master gazetteer.
    :param sourcefile: input file
    :param limit: non-zero limit for testing
    :param optimize: whether the database should be optimized when done.
    :return:
    """
    print("\n============================")
    print(f"Start {self.source_name}. {arrow.now()}  FILE={sourcefile}")
    for geo in self.process_source(sourcefile, limit=limit):
        if not self.quiet and self.rowcount % self.rate == 0:
            print(f"Row {self.rowcount}")
        if 0 < limit < self.rowcount:
            print("Reached non-zero limit for testing.")
            break
        try:
            self.db.add_place(geo)
        except sqlite3.IntegrityError:
            # Integrity failure is fatal for the run; dump the queue for diagnosis.
            print("Data integrity issue")
            print(format_exc(limit=5))
            print(self.db.queue)
            break
        except Exception:
            # Other insertion errors are logged but do not stop the run.
            print("Error with insertion to DB")
            print(format_exc(limit=5))
    self.db.close()
    if optimize:
        self.db.optimize()

    print("ROWS: ", self.rowcount)
    print("EXCLUSIONS: ", len(self.excluded_terms))
    if self.debug:
        print("EXCLUSIONS:", self.excluded_terms)
    print(f"End {self.source_name}. {arrow.now()}")

class GazetteerIndex:
    """
    GazetteerIndex provides a simple API to inject entries into the Gazetteer.
    - Every 1000 records a batch is sent to Solr
    - Every 1,000,0000 records a commit() call is sent to Solr

    This may provide gazetteer specific functions, but as of v1.3 this is a generic Solr wrapper.
    """

    def __init__(self, server_url, debug=False):
        self.server_url = server_url
        # Allow a bare host:port; expand to the full gazetteer core URL.
        if not self.server_url.startswith("http"):
            self.server_url = f"http://{self.server_url}/solr/gazetteer"

        self.server = pysolr.Solr(self.server_url)
        self.debug = debug

        # Batch/commit cadence.
        self.commit_rate = 1000000
        self.add_rate = 1000

        self._records = []
        self.count = 0

    def optimize(self):
        # Debug mode never touches the server.
        if self.server and not self.debug:
            self.server.optimize()

    def save(self, done=False):
        """Flush the pending batch and/or commit, per cadence or when `done`."""
        if self.debug:
            return

        # Send batch
        if self._records and (done or self.count % self.add_rate == 0):
            self.server.add(self._records)
            self._records = []
        # Commit
        if done or (self.count % self.commit_rate == 0 and self.commit_rate > 0):
            self.server.commit()

    def add(self, place):
        """
        Queue one place record; batches flush automatically via save().
        :param place: Place object.
        :return:
        """
        self._records.append(as_place_record(place))
        self.count += 1
        self.save()

    def delete(self, entry_id=None):
        """
        Awaiting other kwdargs for deletion use cases.
        :param entry_id: master gazetteer row ID in sqlite or solr.  Deletes solr entry
        :return: True when a delete was issued, else False.
        """
        if not entry_id:
            return False
        self.server.delete(id=entry_id)
        return True

def add(self, place):
    """
    Queue one place record for indexing; batches flush automatically via save().
    :param place: Place object.
    :return:
    """
    self._records.append(as_place_record(place))
    self.count += 1
    self.save()

def delete(self, entry_id=None):
    """
    Awaiting other kwdargs for deletion use cases.
    :param entry_id: master gazetteer row ID in sqlite or solr.  Deletes solr entry
    :return: True when a delete was issued, else False.
    """
    if not entry_id:
        return False
    self.server.delete(id=entry_id)
    return True