opensextant.gazetteer
import json
import os
import sqlite3
from traceback import format_exc

import arrow
import pysolr
from opensextant import Place, Country, distance_haversine, load_major_cities, make_HASC, popscale, \
    geohash_cells_radially, bbox, point2geohash, geohash2point, pkg_resource_path
from opensextant.utility import ensure_dirs, is_ascii, has_cjk, has_arabic, \
    ConfigUtility, get_bool, trivial_bias, replace_diacritics, strip_quotes, parse_float, load_list
from opensextant.wordstats import WordStats

DEFAULT_SOLR_URL = "127.0.0.1:7000"
DEFAULT_MASTER = "master_gazetteer.sqlite"
DEFAULT_COUNTRY_ID_BIAS = 49
DEFAULT_WORDSTATS = "wordstats.sqlite"

# Short source identifiers used throughout the gazetteer rows ("source" column/field).
GAZETTEER_SOURCE_ID = {
    "ISO",  # ISO-3166 metadata
    "N",  # NGA
    "NF",  # NGA fixed
    "U",  # USGS
    "UF",  # USGS fixed
    "OA",  # OpenSextant Adhoc
    "OG",  # OpenSextant geonames.org derived
    "G",  # Geonames.org
    "GP",  # Geonames.org Postal -- BUG FIX: missing comma concatenated "GP" and "X" into "GPX"
    "X",  # Xponents
    "NE"  # Natural Earth
}

# Mapping of long/source names to the short source identifiers above.
GAZETTEER_SOURCES = {
    "NGA": "N",
    "USGS": "U",
    "USGS-AUTOFIXED": "UF",
    "NGA-AUTOFIXED": "NF",
    "ADHOC": "OA",  # OpenSextant Adhoc
    "NE": "NE",  # Natural Earth.
    "GEONAMES": "OG",  # OpenSextant geonames
    "Geonames.org": "OG",  # OpenSextant geonames
    "XPONENTS": "X",  # Xponents Adhoc or generated
    "XpGen": "X",
    "XP": "X",
    "GP": "GP",  # Geonames Postal
    "G": "G"
}

# Scripts, not languages per se.
SCRIPT_CODES = {
    None: "",
    "LATIN": "L",
    "HAN": "H",
    "COMMON": "C",
    "ARABIC": "A",
    "ARMENIAN": "AM",
    "BENGALI": "BN",
    "CYRILLIC": "CY",
    "DEVANAGARI": "DV",
    "ETHIOPIC": "ET",
    "GEORGIAN": "GE",
    "GREEK": "GK",
    "GURMUKHI": "GM",
    "GUJARATI": "GU",
    "HEBREW": "HE",
    "HANGUL": "HG",
    "HIRAGANA": "HI",
    "KANNADA": "KN",
    "KATAKANA": "KA",
    "KHMER": "KM",
    "MALAYALAM": "MY",
    "SINHALA": "SI",
    "TAMIL": "TA",
    "THAI": "TH"
}

# FIPS code to ISO CC
# Extend to other territory codes
US_TERRITORY_MAP = {
    "FIPS": {
        "AQ": "AS",
        "GQ": "GU",
        "CQ": "MP",
        "RQ": "PR",
        "VI": "VI",
        "FQ": "UM",
        "DQ": "UM",
        "HQ": "UM",
        "JQ": "UM",
        "WQ": "UM",
        "MQ": "UM"
    },
    "ISO": {
        # Reverse is not true for all cases: ISO to FIPS
        # "UM": "UM",
        "PR": "RQ",
        "MP": "CQ",
        "GU": "GQ",  # BUG FIX: was "CQ" (Northern Mariana Is.); FIPS code for Guam is "GQ"
        "AS": "AQ"
    }
}

# IGNORE Historical names and Zones, and Unknowns *H
MAJOR_ADMIN_CODES = {'ADM1', 'ADMD', 'ADM2', 'ADM3', 'ADM4', 'PRSH', 'TERR'}


def coord_grid(geo: dict) -> str:
    """
    A less dissatisfying grid than geohash. It's just returning Y,X in low resolution: LLL.l,LLL.l
    :param geo: dict bearing "lat" and "lon" keys
    :return: "lat,lon" string at 0.1 degree resolution, or None when no "lat" is present.
    """
    if "lat" not in geo:
        return None
    x, y = geo["lon"], geo["lat"]
    return f"{y:0.1f},{x:0.1f}"


def get_default_db():
    """Default path for the master gazetteer SQLite database."""
    return os.path.join(".", "tmp", DEFAULT_MASTER)


def get_default_wordstats():
    """Default path for the wordstats SQLite database."""
    return os.path.join(".", "tmp", DEFAULT_WORDSTATS)


def load_stopterms(project_dir=".", lower=True):
    """
    Load default stop terms from source tree for project build.
    :param project_dir: The location of Xponents/solr source tree.
    :param lower: default case to load data as. If not lower, then terms are loaded as-is
    :return: set of stop terms
    """
    loader = ConfigUtility()
    stopterms = set([])
    for f in ["etc/gazetteer/filters/non-placenames.csv",
              "etc/gazetteer/filters/non-placenames,spa.csv",  # SPANISH
              "etc/gazetteer/filters/non-placenames,rus,ukr.csv",  # Cyrillic languages
              "etc/gazetteer/filters/non-placenames,deu.csv",  # GERMAN
              "etc/gazetteer/filters/non-placenames,acronym.csv"]:
        terms = loader.loadDataFromFile(os.path.join(project_dir, f), ",")
        for t in terms:
            if lower:
                stopterms.add(t[0].lower())
            else:
                stopterms.add(t[0])
    return stopterms


def run_lookup(url, lookup, parse):
    """ Gazetteer demo mimics some of the logic in XponentsGazetteerQuery
    try "San Francisco, CA, US"
    :param url: Solr gazetteer URL
    :param lookup: name or "NAME, PROV, CC" tuple
    :param parse: if True, parse lookup as the NAME, PROV, CC tuple
    :return: list of Place, or None when nothing matched / input malformed
    """
    solr_gaz = pysolr.Solr(url)
    # specific unit tests

    records = None
    places = []
    if parse:
        # See other Java demo, XponentsGazetteerQuery
        # assuming NAME, PROV, COUNTRY
        slots = [a.strip() for a in lookup.split(',')]

        if len(slots) < 3:
            print("NAME, PROV, CC is required format for --lookup")
            return None

        city_val = slots[0]
        prov_val = slots[1]
        country_val = slots[2]

        # Find best match for Province. Pass ADM1 code to next query
        query = 'name:"{}" AND feat_class:A AND cc:{}'.format(prov_val, country_val)
        records = solr_gaz.search(query, **{"rows": 100})

        if not records:
            return None

        # Use a Place object to abstract things.
        adm1 = as_place(records.docs[0])
        # Find best match for the tuple NAME/PROV/COUNTRY
        # NOTE(review): this city query filters feat_class:A (boundaries); a populated-place
        # lookup would normally use feat_class:P. Preserved as-is -- confirm intent.
        query = 'name:"{}" AND feat_class:A AND cc:{} AND adm1:{}'.format(city_val, country_val, adm1.adm1)
        records = solr_gaz.search(query, **{"rows": 1000})
    else:
        query = 'name:"{}" AND feat_class:P'.format(lookup)
        records = solr_gaz.search(query, **{"rows": 1000})

    if not records:
        return None

    for r in records:
        places.append(as_place(r))

    return places


# Default field layout shared by the Solr index and the SQLite database.
GAZETTEER_TEMPLATE = {
    'id': -1,
    'place_id': -1,
    'name': None,
    # name_ar or name_cjk are filled in only if name is Arabic or CJK name group
    'lat': 0, 'lon': 0,
    # geo is the field to use for index. lat/lon are used for database.
    'feat_class': None, 'feat_code': None,
    'FIPS_cc': None, 'cc': None,
    'adm1': None, 'adm2': None,
    'source': None,
    # 'script': None,
    'name_bias': 0,
    'id_bias': 0,
    'name_type': "N",
    'search_only': False
}


def normalize_name(nm: str):
    """
    convenience method that ensures we have some consistency on normalization of name
    :param nm: raw name
    :return: normalized name -- curly apostrophe and NBSP replaced, outer whitespace/quotes stripped
    """
    return nm.replace("\u2019", "'").replace("\xa0", " ").strip().strip("'")


def name_group_for(nm: str):
    """
    Determine the major language "name group" for the input
    :param nm: name or any text
    :return: "cjk", "ar", or "" for everything else
    """
    if has_cjk(nm):
        return "cjk"
    elif has_arabic(nm):
        return "ar"
    return ""


def as_admin_place(r):
    """
    Convert dict to a Place object carrying only administrative metadata (no location).
    :param r: gazetteer row from Solr or SQLite.
    :return: Place
    """
    keys = {}
    if hasattr(r, "keys"):
        keys = r.keys()

    p = Place(r['place_id'], r['name'])
    p.country_code = r["cc"]
    p.adm1 = r["adm1"]
    p.source = r["source"]
    p.geohash = r["geohash"]
    if "adm1_iso" in keys:
        p.adm1_iso = r["adm1_iso"]

    # Deliberately blank out the location -- admin rows are used for coding only.
    p.lat = p.lon = p.X = p.Y = None
    return p


def as_place(r, source="index"):
    """
    Convert dict to a Place object
    :param source: db or index (solr)
    :param r: gazetteer row from Solr or SQLite.
    :return: Place
    """
    keys = {}
    if hasattr(r, "keys"):
        keys = r.keys()

    lat, lon = 0, 0
    # BUG FIX: membership must be tested against the column names. For sqlite3.Row,
    # `"geo" in r` scans the row VALUES, not the columns -- which is why `keys` exists.
    if "geo" in keys:
        (lat, lon) = r['geo'].split(',')
    else:
        lat, lon = r["lat"], r["lon"]

    p = Place(r['place_id'], r['name'], lat=lat, lon=lon)
    p.country_code = r["cc"]
    p.feature_class = r["feat_class"]
    p.feature_code = r["feat_code"]
    # BUG FIX: was `"id" in r` -- value scan on sqlite3.Row, so db rows never received
    # id/id_bias/name_bias. Test against column names instead.
    if "id" in keys:
        # Required if coming or going into a database:
        p.id = r["id"]
        p.id_bias = r["id_bias"]
        if source == "db":
            p.name_bias = r["name_bias"]

    # optional fields:
    if "FIPS_cc" in keys:
        p.country_code_fips = r["FIPS_cc"]
    if "adm1" in keys:
        p.adm1 = r["adm1"]
    if "adm2" in keys:
        p.adm2 = r["adm2"]
    if "geohash" in keys:
        p.geohash = r["geohash"]
    if "id" in keys:
        p.id = r["id"]
    if "source" in keys:
        p.source = r["source"]
    if "name_group" in keys:
        p.name_group = r["name_group"]
    if "search_only" in keys:
        p.search_only = get_bool(r["search_only"])
    if "name_type" in keys:
        p.name_type = r["name_type"]

    p.is_ascii = is_ascii(p.name)
    return p


def as_place_record(place, target="index"):
    """
    Given a Place object, serialize it as a dict consistent with the Solr index schema.
    :param place: Place
    :param target: index or db
    :return: dict, or None when input is not a Place
    """
    if not isinstance(place, Place):
        return None
    rec = {
        "id": place.id,
        "place_id": place.place_id,
        "name": place.name,
        "name_type": place.name_type,
        "feat_class": place.feature_class,
        "feat_code": place.feature_code,
        "cc": place.country_code,
        "FIPS_cc": place.country_code_fips,
        "source": place.source,
        # "script": place.name_script,
        "search_only": place.search_only
    }

    # ADMIN level 1/2 boundary names:
    if place.adm1:
        rec["adm1"] = place.adm1
    if place.adm2:
        rec["adm2"] = place.adm2
    # ID BIAS:
    rec["id_bias"] = 0 if place.id_bias is None else place.id_bias

    if target == "index":
        # Preserve innate precision on Lat/Lon: e.g., "4.5,-118.4" is result if only that
        # amount of precision is present.
        # BUG FIX: a stray trailing comma previously made rec["geo"] a 1-tuple, not a string.
        rec["geo"] = ",".join([str(place.lat), str(place.lon)])
        # Name Group / Script tests:
        if place.name_group == "ar":
            rec["name_ar"] = place.name
        elif place.name_group == "cjk":
            rec["name_cjk"] = place.name
    elif target == "db":
        # Required fields: named-parameter SQL inserts need these keys present even if None.
        rec["name_bias"] = 0 if place.name_bias is None else place.name_bias
        rec["name_group"] = place.name_group
        rec["lat"] = place.lat
        rec["lon"] = place.lon
        rec["adm1"] = place.adm1
        rec["adm2"] = place.adm2

    return rec


def run_query(url, q):
    """ Expert mode: Run a solr query to see what you get back.
    requires you know the schema
    :param url: Solr gazetteer URL
    :param q: raw Solr query
    :return: list of Place
    """
    solr_gaz = pysolr.Solr(url)
    records = solr_gaz.search(q, **{"rows": 100})
    places = []
    for r in records:
        places.append(as_place(r))

    return places


def print_places(arr, limit=25):
    """Print up to `limit` places along with their feature class/code."""
    print("FOUND {}. Showing top {}".format(len(arr), limit))
    for p in arr[0:limit]:
        print(str(p), f"\tfeature: {p.feature_class}/{p.feature_code}")


def capitalize(name: dict):
    """ Capitalize all city and major admin boundaries """
    nm = name["name"]
    if nm and not nm[0].isupper():
        # Only plain (group-less), conventional names for A/P features are adjusted.
        grp = name.get("name_group")
        nt = name.get("name_type")
        ft = name["feat_class"]
        if nm and grp == '' and nt == 'N' and ft in {'A', 'P'}:
            # Because we don't like altering data much:
            name["name"] = nm[0].upper() + nm[1:]


def gaz_resource(fname):
    """
    Formats the relative path for an item in the ./solr/etc/gazetteer/ metadata
    :param fname: file name within the gazetteer metadata folder
    :return: relative path
    """
    return os.path.join("etc", "gazetteer", fname)


def export_admin_mapping(admin_ids, filepath):
    """
    Experimental: Map all source place IDs => ADM ids
    Map all standard ADM ids => place IDs
    Writes a TSV with header ADM1, PLACE_ID, LAT, LON, NAME.
    :param admin_ids: dict for JSON or array for CSV
    :param filepath: output TSV path
    """
    with open(filepath, "w", encoding="UTF-8") as fio:
        fio.write("\t".join(["ADM1", "PLACE_ID", "LAT", "LON", "NAME"]))
        # BUG FIX: terminate the header row; previously the first data row was
        # appended onto the header line.
        fio.write("\n")

        for a1 in admin_ids:
            cc = a1["cc"]
            adm1 = a1["adm1"]
            hasc = f"{cc}.{adm1}"
            y, x = a1["lat"], a1["lon"]
            entry = [hasc, a1["place_id"], f"{y:0.1f}", f"{x:0.1f}", a1["name"]]
            fio.write("\t".join(entry))
            fio.write("\n")


class ISO3166Registry:
    """Registry of ISO-3166-2 subdivisions: ADM1 membership and ADM2 -> ADM1 containment."""

    def __init__(self):
        # {CC: {"admin1": set(ADM1...), "admin2": {ADM2: ADM1}}}
        self.hierarchy = {}

    def is_iso_country(self, cc):
        """True when the country code is registered."""
        return cc in self.hierarchy

    def has_admin1(self, cc, adm1):
        """True/False when the country is known; None when the country is unknown."""
        registry = self.hierarchy.get(cc)
        if not registry:
            # No such country here.
            return None
        return adm1 in registry["admin1"]

    def get_admin1_for(self, cc, adm2):
        """ADM1 containing the given ADM2, or None when country/ADM2 is unknown."""
        registry = self.hierarchy.get(cc)
        if not registry:
            # No such country here.
            return None
        return registry["admin2"].get(adm2)

    @staticmethod
    def export_admin_mapping():
        """
        Generate initial file using pycountry. pycountry is not a library dependency
        so it is externalized here.
        :return:
        """
        import pycountry
        fpath = gaz_resource("iso3166_admin1_admin2_hierarchy.csv")
        with open(fpath, "w", encoding="UTF-8") as fio:
            for subd in pycountry.subdivisions:
                fio.write("\t".join([subd.country_code, subd.parent_code or "-", subd.code]))
                fio.write("\n")

    def load_admin_mapping(self):
        """Load the CSV hierarchy produced by export_admin_mapping into memory."""
        from opensextant import parse_admin_code
        fpath = gaz_resource("iso3166_admin1_admin2_hierarchy.csv")
        with open(fpath, "r", encoding="UTF-8") as fio:
            for line in fio:
                hasc = line.strip().split("\t")
                cc_iso = hasc[0]
                adm1, adm2 = None, None
                parent = hasc[1]
                if parent == "-":
                    # Top-level subdivision: the code itself is the ADM1.
                    adm1 = parse_admin_code(hasc[2])
                else:
                    adm1, adm2 = parse_admin_code(hasc[1]), parse_admin_code(hasc[2])

                registry = self.hierarchy.get(cc_iso)
                if not registry:
                    registry = {"admin1": set([]), "admin2": {}}
                    self.hierarchy[cc_iso] = registry
                registry["admin1"].add(adm1)
                if adm2:
                    registry["admin2"][adm2] = adm1
class AdminLevelCodes:
    """
    Cross-walk of ADM1 codings between the FIPS and ISO standards.

    Internal layout, keyed by ISO country code (CC):
        places = { PLACE_ID: {'f': FIPS_ADM1, 'i': ISO_ADM1} }
        coords = { COORD_ID: {'f': FIPS_ADM1, 'i': ISO_ADM1} }
        admin1 = { 'FIPS': {FIPS_ADM1: ISO_ADM1, ...},
                   'ISO':  {ISO_ADM1: FIPS_ADM1, ...} }
        admin2 = { ADM2: ADM1, ... }   # ADM2 contained in ADM1

    Step 1a. Inventory all places by ID across all data sources and standards.
    Step 1b. At that time also calculate the relevant codings at the coordinate grid by standard.
    Step 2.  Reindex mapping all paired ADM1-ADM1 equivalents.

    NOT every PLACE is represented by both codings; not every coordinate has both codings.
    NOT every ADM1 has a mapping.
    Questions:
    - Given an ADM1 in a known standard, what is the alternative code in the other standard?
    - Given a place ID (but no ADM1), locate the relevant ADM1 code.
    """

    def __init__(self, filepath=None):
        """
        :param filepath: optional JSON mapping file; defaults to the packaged
            "global_admin1_mapping.json" resource.
        """
        self.places = {}
        self.coords = {}
        self.admin1 = {}
        self.admin2 = {}  # ADM2 contained in ADM1. {ADM2 : ADM1,...}
        self.countries = set([])
        self.admin_hierarchy = {}  # Set before use.
        self.admin_hierarchy_cc = {}
        if filepath:
            self.load(filepath)
        else:
            self.load(pkg_resource_path("global_admin1_mapping.json"))

    def adjust_admin1(self, cc, adm1):
        """
        If this country has a known ADM1 containment hierarchy, remap the code:
        :return: (containing ADM1, original code demoted to ADM2), else (None, None).
        """
        if cc in self.admin_hierarchy_cc:
            container_adm1 = self.admin_hierarchy[cc].get(adm1)
            if container_adm1:
                # New mapping: ADM1, ADM2
                return container_adm1, adm1
        return None, None

    def set_admin_hierarchy(self, countries, adm1_containment):
        """Install the ADM1 containment hierarchy for the given countries."""
        self.admin_hierarchy_cc = set(countries)
        self.admin_hierarchy = adm1_containment

    def get_alternate_admin1(self, cc, adm1, std):
        """
        EXPERIMENTAL still.

        :param cc: ISO country code
        :param adm1: ADM1 in the given standard
        :param std: standard "FIPS" or "ISO"
        :return: equivalent ADM1 code in the other standard, or None
        """
        if cc in self.admin1:
            country_registry = self.admin1[cc]
            if country_registry:
                return country_registry[std].get(adm1)
        # No country
        return None

    def add_country(self, cc):
        """Ensure the per-country registries exist for the given country code."""
        self.countries.add(cc)
        if cc not in self.places:
            self.places[cc] = {}
        if cc not in self.coords:
            self.coords[cc] = {}
        if cc not in self.admin1:
            self.admin1[cc] = {}
        if cc not in self.admin2:
            self.admin2[cc] = {}

    def add_place(self, place_id, cc, std, adm1, grid):
        """
        Accumulate discrete ADM1 codings by place instance and location
        :param place_id: source place ID
        :param cc: ISO country code
        :param std: coding standard key, 'f' (FIPS) or 'i' (ISO)
        :param adm1: ADM1 code in that standard
        :param grid: coordinate grid ID (see coord_grid)
        :return:
        """
        alt_adm1, adm2 = self.adjust_admin1(cc, adm1)
        if adm2:
            print("CORRECTION:", cc, alt_adm1, adm2)

        obj = self.places[cc].get(place_id, {})
        if not obj:
            self.places[cc][place_id] = obj
        # TODO: detect errors where a place has a standard set already, but the adm1 value conflicts
        obj[std] = adm1

        crd = self.coords[cc].get(grid, {})
        if not crd:
            self.coords[cc][grid] = crd
        crd[std] = adm1

        if adm2:
            # BUG FIX: the original fetched a temporary dict and never stored it back, so
            # self.admin2 was never populated. Record ADM2 -> containing ADM1 directly,
            # per the {ADM2: ADM1} layout documented on this class.
            self.admin2[cc][adm2] = alt_adm1

    @staticmethod
    def _update_adminset(given, iso=None, fips=None):
        """
        update the iso[i]=f
        update the fips[f]=i
        from the given (f=i, i=f,)

        :param given: a record of codings {f:val, i:val}
        :param iso: accumulating iso map
        :param fips: accumulating fips map
        :return:
        """
        f = given.get("f", "-")
        i = given.get("i", "-")
        curr_f = iso.get(i)
        curr_i = fips.get(f)
        missing_f = not curr_f or curr_f == "-"
        missing_i = not curr_i or curr_i == "-"

        if f != "-" and i != "-":
            # Both codings present: fill any missing side of the pairing.
            if missing_f:
                iso[i] = f
            if missing_i:
                fips[f] = i
        else:
            # Only one coding present: register it with an unknown ("-") counterpart.
            if f != "-" and missing_i:
                fips[f] = "-"
            if i != "-" and missing_f:
                iso[i] = "-"

    def align_admin1(self):
        """Reindex accumulated place/coordinate codings into FIPS<->ISO ADM1 mappings."""
        for cc in self.countries:
            # TODO: we'll keep this experimental layer of ADM2s.
            # Usage: user will have to consult the admin2 lookup first.
            fips = {}
            iso = {}
            registry = self.places[cc]  # ADMIN places only please.
            for plid in registry:
                # NOTE(review): update_admin_containment is not defined in this class or
                # visible module -- this raises AttributeError unless provided elsewhere.
                # TODO confirm.
                self.update_admin_containment(registry[plid])

            for plid in registry:
                AdminLevelCodes._update_adminset(registry[plid], iso=iso, fips=fips)

            registry = self.coords[cc]
            for crd in registry:
                AdminLevelCodes._update_adminset(registry[crd], iso=iso, fips=fips)

            # Mappings:
            self.admin1[cc] = {"FIPS": fips, "ISO": iso}

    def as_json(self):
        """Serialize the per-country registries as a plain dict for JSON output."""
        result = {}
        for cc in self.countries:
            result[cc] = {"places": self.places.get(cc),
                          "coords": self.coords.get(cc),
                          "admin1": self.admin1.get(cc)}
        return result

    def save(self, fpath):
        """Write the registry to a JSON file."""
        with open(fpath, "w", encoding="UTF-8") as fout:
            json.dump(self.as_json(), fout)

    def load(self, fpath):
        """Load the registry from a JSON file produced by save()."""
        if not os.path.exists(fpath):
            raise Exception("File does not exist", fpath)

        with open(fpath, "r", encoding="UTF-8") as fin:
            content = json.load(fin)
            self.countries = set(content.keys())
            for cc in self.countries:
                self.places[cc] = content[cc].get("places")
                self.coords[cc] = content[cc].get("coords")
                self.admin1[cc] = content[cc].get("admin1")


def load_major_cities_iso():
    """
    Load major cities (opensextant.load_major_cities) and recode their ADM1 as ISO where
    the source coding was FIPS. Reports countries missing a FIPS->ISO mapping.
    :return: list of city Places, or None when the AdminLevelCodes resource is unavailable.
    """
    print("Popstats - Load Major Cities / as ISO coded")
    admin_lookup = None
    try:
        admin_lookup = AdminLevelCodes()
    except Exception as config_err:
        print("Try generating Admin level codes first with \"build prep admin1\" script")
        print(str(config_err))
        return

    # ADM1 hierarchy
    # NGA used FIPS, now uses ISO
    # Geonames uses FIPS, except a few:
    #
    already_iso = {"US", "BE", "CH", "ME"}

    cities = load_major_cities()
    problem_countries = {}
    all_countries = {}
    for city in cities:
        cc = city.country_code
        all_countries[cc] = 1 + all_countries.get(cc, 0)
        if cc in already_iso:
            continue

        adm1_iso = admin_lookup.get_alternate_admin1(cc, city.adm1, "FIPS")
        if adm1_iso == "-":
            print("No match for FIPS", cc, city.adm1)
            problem_countries[cc] = 1 + problem_countries.get(cc, 0)
        elif adm1_iso:
            city.adm1 = adm1_iso  # this attr represents the default internal ADM1
            city.adm1_iso = adm1_iso  # Yes, this attr represents the ISO ADM1
    print("Countries with missing ISO ADM1")
    for cc in problem_countries:
        if problem_countries[cc] > 1:
            print(f"{cc}\t{problem_countries[cc]:4} / {all_countries[cc]:4}")
    return cities
problem_countries[cc] = 1 + problem_countries.get(cc, 0) 688 elif adm1_iso: 689 city.adm1 = adm1_iso # this attr represents the default internal ADM1 690 city.adm1_iso = adm1_iso # Yes, this attr represents the ISO ADM1 691 print("Countries with missing ISO ADM1") 692 for cc in problem_countries: 693 if problem_countries[cc] > 1: 694 print(f"{cc}\t{problem_countries[cc]:4} / {all_countries[cc]:4}") 695 return cities 696 697 698class DB: 699 def __init__(self, dbpath, commit_rate=1000, debug=False, add_geohash=False): 700 """ 701 Save items to SQlite db at the commit_rate given. Call close to finalize any partial batches 702 and save database. 703 704 :param dbpath: 705 :param commit_rate: 706 """ 707 self.dbpath = dbpath 708 self.conn = None 709 self.queue = [] 710 self.queue_count = 0 711 self.commit_rate = commit_rate 712 self.debug = debug 713 self.geohash_default = add_geohash 714 if not os.path.exists(dbpath): 715 ensure_dirs(dbpath) 716 self.reopen() 717 self.create() 718 else: 719 self.reopen() 720 721 def purge(self, q): 722 if "source" in q: 723 self.conn.execute("delete from placenames where source = ?", (q["source"],)) 724 self.conn.commit() 725 print("Purged") 726 else: 727 print("Query not implemented ", q) 728 729 def delete_places(self, q): 730 """ 731 :param q: query starting with "WHERE...." 
732 :return: 733 """ 734 if not q: 735 raise Exception("Query required silly") 736 self.conn.execute(f"delete from placenames {q}") 737 738 def create(self): 739 """ 740 Create the placenames table and default indices used for ETL - place_id, source, country, and ADM1 741 :return: 742 """ 743 sql_script = """ 744 create TABLE placenames ( 745 `id` INTEGER PRIMARY KEY, 746 `place_id` TEXT NOT NULL, 747 `name` TEXT NOT NULL, 748 `name_type` TEXT NOT NULL, 749 `name_group` TEXT NULL, 750 `source` TEXT NOT NULL, 751 `feat_class` TEXT NOT NULL, 752 `feat_code` TEXT NOT NULL, 753 `cc` TEXT NULL, 754 `FIPS_cc` TEXT NULL, 755 `adm1` TEXT NULL, 756 `adm2` TEXT NULL, 757 `lat` REAL NOT NULL, 758 `lon` REAL NOT NULL, 759 `geohash` TEXT NOT NULL, 760 `duplicate` BIT DEFAULT 0, 761 `name_bias` INTEGER DEFAULT 0, 762 `id_bias` INTEGER DEFAULT 0, 763 `search_only` BIT DEFAULT 0 764 ) without rowid; 765 766 create INDEX plid_idx on placenames ("place_id"); 767 create INDEX s_idx on placenames ("source"); 768 create INDEX c_idx on placenames ("cc"); 769 create INDEX a1_idx on placenames ("adm1"); 770 """ 771 self.conn.executescript(sql_script) 772 self.conn.commit() 773 774 # Population statistics that use location (geohash) as primary key 775 sql_script = """ 776 create TABLE popstats ( 777 `grid` TEXT NOT NULL, 778 `population` INTEGER NOT NULL, 779 `source` TEXT NOT NULL, 780 `feat_class` TEXT NOT NULL, 781 `cc` TEXT NOT NULL, 782 `FIPS_cc` TEXT NULL, 783 `adm1` TEXT NULL, 784 `adm1_path` TEXT NOT NULL, 785 `adm2` TEXT NULL, 786 `adm2_path` TEXT NOT NULL 787 ); 788 789 create INDEX IF NOT EXISTS idx1 on popstats (`grid`); 790 create INDEX IF NOT EXISTS idx2 on popstats (`source`); 791 create INDEX IF NOT EXISTS idx3 on popstats (`cc`); 792 create INDEX IF NOT EXISTS idx4 on popstats (`adm1`); 793 create INDEX IF NOT EXISTS idx5 on popstats (`adm2`); 794 795 """ 796 self.conn.executescript(sql_script) 797 self.conn.commit() 798 799 def create_indices(self): 800 """ 801 Create 
additional indices that are used for advanced ETL functions and optimization. 802 :return: 803 """ 804 self.reopen() 805 indices = """ 806 create INDEX IF NOT EXISTS n_idx on placenames ("name"); 807 create INDEX IF NOT EXISTS nt_idx on placenames ("name_type"); 808 create INDEX IF NOT EXISTS ng_idx on placenames ("name_group"); 809 create INDEX IF NOT EXISTS s_idx on placenames ("source"); 810 create INDEX IF NOT EXISTS c_idx on placenames ("cc"); 811 create INDEX IF NOT EXISTS a1_idx on placenames ("adm1"); 812 create INDEX IF NOT EXISTS fc_idx on placenames ("feat_class"); 813 create INDEX IF NOT EXISTS ft_idx on placenames ("feat_code"); 814 create INDEX IF NOT EXISTS dup_idx on placenames ("duplicate"); 815 create INDEX IF NOT EXISTS so_idx on placenames ("search_only"); 816 create INDEX IF NOT EXISTS lat_idx on placenames ("lat"); 817 create INDEX IF NOT EXISTS lon_idx on placenames ("lon"); 818 819 """ 820 self.conn.executescript(indices) 821 self.conn.commit() 822 823 def optimize(self): 824 self.reopen() 825 self.conn.execute("VACUUM") 826 self.conn.commit() 827 828 def reopen(self): 829 if self.conn is not None: 830 return 831 832 # really close cleanly 833 self.close() 834 835 self.conn = sqlite3.connect(self.dbpath) 836 self.conn.execute('PRAGMA cache_size = 8092') 837 self.conn.execute('PRAGMA page_size = 8092') # twice default. 
Cache = 8092 x 8KB pages ~ 64MB 838 self.conn.execute('PRAGMA mmap_size = 1048576000') # 1000 MB 839 self.conn.execute("PRAGMA encoding = 'UTF-8'") 840 self.conn.execute('PRAGMA synchronous = OFF') 841 self.conn.execute('PRAGMA locking_mode = EXCLUSIVE') 842 self.conn.execute('PRAGMA journal_mode = MEMORY') 843 self.conn.execute('PRAGMA temp_store = MEMORY') 844 self.conn.row_factory = sqlite3.Row 845 846 def commit(self): 847 if self.conn: 848 self.conn.commit() 849 850 def close(self): 851 try: 852 if self.conn is not None: 853 self.__assess_queue(force=True) 854 self.conn.close() 855 self.conn = None 856 except sqlite3.IntegrityError as sql_err: 857 print("Data integrity issue") 858 print(format_exc(limit=5)) 859 except Exception as err: 860 self.conn = None 861 862 def _prep_place(self, dct): 863 """ 864 REQUIRED fields: 'source', 'lat', 'lon'. 865 OPTIONAL fieldsd: 'search_only' 866 :param dct: 867 :return: 868 """ 869 src = dct["source"] 870 dct["source"] = GAZETTEER_SOURCES.get(src, src) 871 if self.geohash_default: 872 # 6 geohash prefix is about 100m to 200m error. precision=8 is 1m precision. 873 if "lat" in dct and not dct.get("geohash"): 874 dct["geohash"] = point2geohash(dct["lat"], dct["lon"], precision=6) 875 # 876 # print("Geoname has no location", dct) 877 if "search_only" not in dct: 878 nb = dct.get("name_bias", 0) 879 dct["search_only"] = 1 if nb < 0 else 0 880 881 capitalize(dct) 882 883 def add_place(self, obj): 884 """ 885 Add one place 886 :param obj: a place dictionary. If arg is a Place object it is converted to dictionary first. 
887 """ 888 dct = None 889 if isinstance(obj, Place): 890 dct = as_place_record(obj, target="db") 891 else: 892 dct = obj 893 self._prep_place(dct) 894 self.queue.append(dct) 895 self.queue_count += 1 896 self.__assess_queue() 897 898 def __assess_queue(self, force=False): 899 if force or (self.queue_count >= self.commit_rate): 900 sql = """ 901 insert into placenames ( 902 id, place_id, name, name_type, name_group, 903 lat, lon, geohash, feat_class, feat_code, 904 cc, FIPS_cc, adm1, adm2, source, name_bias, id_bias, search_only 905 ) values ( 906 :id, :place_id, :name, :name_type, :name_group, 907 :lat, :lon, :geohash, :feat_class, :feat_code, 908 :cc, :FIPS_cc, :adm1, :adm2, :source, :name_bias, :id_bias, :search_only)""" 909 self.conn.executemany(sql, self.queue) 910 self.conn.commit() 911 self.queue_count = 0 912 self.queue.clear() 913 914 def add_places(self, arr): 915 """ Add a list of places. """ 916 for dct in arr: 917 self._prep_place(dct) 918 self.queue.extend(arr) 919 self.queue_count += len(arr) 920 self.__assess_queue() 921 922 def list_places_by_id(self, plid, limit=2): 923 """ 924 Collect places and name_bias for gazetter ETL. 925 Lookup place by ID as in "G1234567" for Geonames entry or "N123456789" for an NGA one, etc. 926 927 :param plid: Place ID according to the convention of source initial + identifier 928 :param limit: limit queries because if we know we only one 2 or 3 we need not search database beyond that. 929 :return: 930 """ 931 name_bias = dict() 932 place = None 933 for row in self.conn.execute(f"select * from placenames where place_id = ? limit {limit}", (plid,)): 934 pl = as_place(row, source="db") 935 if not place: 936 place = pl 937 name_bias[pl.name.lower()] = pl.name_bias 938 if place: 939 # This is first place encountered. 
940 # This is not idempotent unless SQL query is more explicit 941 return place, name_bias 942 943 return None, None 944 945 def add_population_stats(self, source="G"): 946 """ 947 Population stats are record by populated area (P-class features) and rolled up 948 to provide an ADM1 population approximation. 949 """ 950 print("Purge Popstats") 951 self.conn.execute("delete from popstats where source = ?", (source,)) 952 self.conn.commit() 953 954 sql = """insert into popstats (grid, population, source, feat_class, 955 cc, FIPS_cc, adm1, adm1_path, adm2, adm2_path) 956 values (:grid, :population, :source, :feat_class, 957 :cc, :FIPS_cc, :adm1, :adm1_path, :adm2, :adm2_path)""" 958 # 959 for city in load_major_cities_iso(): 960 adm2_path = "" 961 if city.adm2: 962 adm2_path = make_HASC(city.country_code, city.adm1, adm2=city.adm2) 963 city_entry = { 964 "grid": coord_grid({"lat": city.lat, "lon": city.lon}), 965 "population": city.population, 966 "source": source, 967 "feat_class": city.feature_class, 968 "FIPS_cc": city.country_code_fips, 969 "cc": city.country_code, 970 "adm1": city.adm1, 971 "adm1_path": make_HASC(city.country_code, city.adm1), 972 "adm2": city.adm2, 973 "adm2_path": adm2_path 974 } 975 self.conn.execute(sql, city_entry) 976 self.conn.commit() 977 print("Popstats - Complete") 978 979 def list_all_popstats(self): 980 """ 981 :return: map of population by geohash only 982 """ 983 sql = """select sum(population) AS POP, grid from popstats group by grid order by POP""" 984 population_map = {} 985 for popstat in self.conn.execute(sql): 986 loc = popstat["grid"] 987 population_map[loc] = popstat["POP"] 988 return population_map 989 990 def list_adm1_popstats(self): 991 """ 992 Provides a neat lookup of population stats by HASC path, 993 e.g., "US.CA" is califronia; Reported at 35 million in major cities (where state total is reported 994 at 39 million in 2021.) Population stats only cover major cities of 15K or more people. 
995 :return: map of population stats by ADM1 path 996 """ 997 sql = "select sum(population) AS POP, adm1_path from popstats where adm1 != '0' group by adm1_path order by POP" 998 population_map = {} 999 for popstat in self.conn.execute(sql): 1000 adm1 = popstat["adm1_path"] 1001 population_map[adm1] = popstat["POP"] 1002 return population_map 1003 1004 def list_adm2_popstats(self): 1005 """ 1006 Get approximate county-level stats 1007 """ 1008 sql = "select sum(population) AS POP, adm2_path from popstats where adm2 != '' group by adm2_path order by POP" 1009 population_map = {} 1010 for popstat in self.conn.execute(sql): 1011 adm2 = popstat["adm2_path"] 1012 population_map[adm2] = popstat["POP"] 1013 return population_map 1014 1015 def list_countries(self): 1016 """ 1017 List distinct country codes in DB. 1018 :return: list of country codes. 1019 """ 1020 arr = [] 1021 for cc in self.conn.execute("select distinct(cc) as CC from placenames"): 1022 arr.append(cc["CC"]) 1023 return arr 1024 1025 def list_places(self, cc=None, fc=None, criteria=None, limit=-1): 1026 """ 1027 Potentially massive array -- so this is just a Place generator. 1028 :param cc: country code or '' 1029 :param fc: feat class constraint with "*" wildcard, or '' 1030 :param criteria: additional clause to constrain search, e.g. " AND duplicate=0 " to find non-dups. 
1031 :param limit: non-zero limit 1032 :return: generator 1033 """ 1034 sql = ["select * from placenames"] 1035 _and = "" 1036 if cc is not None or fc is not None: 1037 sql.append("where") 1038 if cc is not None: 1039 sql.append(f"cc ='{cc}'") 1040 if fc is not None: 1041 if cc is not None: 1042 _and = " and " 1043 if "*" in fc: 1044 sql.append(f"{_and}feat_class like '{fc.replace('*', '%')}'") 1045 else: 1046 sql.append(f"{_and}feat_class = '{fc}'") 1047 if criteria: 1048 # Include the " AND " yourself in critera 1049 sql.append(criteria) 1050 if limit > 0: 1051 sql.append(f"limit {limit}") 1052 1053 # Query 1054 sql_script = " ".join(sql) 1055 if self.debug: 1056 print(sql_script) 1057 for p in self.conn.execute(sql_script): 1058 yield as_place(p, source="db") 1059 1060 def _list_places_at_geohash(self, lat: float = None, lon: float = None, geohash: str = None, 1061 cc: str = None, radius: int = 5000, limit=10): 1062 """ 1063 A best effort guess at spatial query. Returns an array of matches, thinking most location queries are focused. 1064 This is a geohash-backed hack at search. Even with SQLite indexing this is still very slow. 1065 1066 Use geohash_precision accordingly and approximately: 1067 - geohash_precision=6 implies +/- 500m 1068 - geohash_precision=5 implies +/- 2500m 1069 - geohash_precision=4 implies +/- 20000m 1070 1071 This approach uses an approximation of finding the relevant neighbor cells using a geodetic (not geohash) 1072 assessment on the radial range. This method hopefully gets past the limitations below. 1073 1074 General limitations of using Geohash for spatial query: 1075 Given the nature of geohash you might have locations in different cells "xxxx" and "xxxy" that are 1076 close to each other, i.e. within your specified radius. E.g., 1077 1078 "9q5f" and "9qh4" are neighbor cells 1079 "9q5fr" and "9qh42" are neighbor cells. 1080 1081 "9q5fp" is a LR (south-east) corner of "9q5". 
Searching that 2x2 KM box by geohash will only search from that 1082 corner north and westward. 1083 :param lat: 1084 :param lon: 1085 :param geohash: 1086 :param cc: 1087 :param radius: 1088 :param limit: 1089 :return: dict of matches, { DIST = PLACE, ... } 1090 """ 1091 if geohash: 1092 # Postpend "sss" to create a default centroid in a shorter geohash. 1093 gh = f"{geohash}sss"[0:6] 1094 (lat, lon) = geohash2point(geohash) 1095 elif lat is None and lon is None: 1096 raise Exception("Provide lat/lon or geohash") 1097 1098 cells = geohash_cells_radially(lat, lon, radius) 1099 sql_script = [] 1100 for gh in cells: 1101 if len(gh) >= 6: 1102 sql_script.append(f"select * from placenames where duplicate=0 and geohash = '{gh[0:6]}'") 1103 else: 1104 sql_script.append(f"select * from placenames where duplicate=0 and geohash like '{gh}%'") 1105 1106 found = {} 1107 # Search the entire grid space 1108 for script in sql_script: 1109 for p in self.conn.execute(script): 1110 if cc: 1111 if p["cc"] != cc: 1112 continue 1113 place = as_place(p) 1114 dist = distance_haversine(lon, lat, place.lon, place.lat) 1115 if dist < radius: 1116 found[dist] = place 1117 if len(found) >= limit: 1118 # Return after first round of querying. 
                    break

        return found

    def _list_places_at_2d(self, lat: float, lon: float,
                           cc: str = None, radius: int = 5000, limit=10):
        """
        Bounding-box spatial lookup: select rows inside the lat/lon box covering
        `radius` meters around the point, then keep only rows within true
        haversine distance.
        NOTE(review): `limit` is accepted but not applied here; the caller
        (list_places_at) slices the distance-sorted result instead -- confirm intended.
        :param lat: latitude of center point
        :param lon: longitude of center point
        :param cc: optional ISO country code filter
        :param radius: radial distance in METERS
        :param limit: unused here (see note above)
        :return: dict of {distance_meters: Place}
        """
        found = {}
        sw, ne = bbox(lon, lat, radius)
        # NOTE(review): "{:0.6}" is SIX SIGNIFICANT DIGITS (general format), not six
        # decimal places -- e.g. -119.123456 renders as "-119.123". Confirm intended.
        sql_script = [f"""select * from placenames where 
                      (lat < {ne.lat:0.6} and lon < {ne.lon:0.6}) and 
                      (lat > {sw.lat:0.6} and lon > {sw.lon:0.6})"""]
        if cc:
            sql_script.append(f" and cc = '{cc}'")

        script = " ".join(sql_script)
        for p in self.conn.execute(script):
            place = as_place(p)
            # The box includes corners farther than radius; keep the true circle only.
            dist = distance_haversine(lon, lat, place.lon, place.lat)
            if dist < radius:
                found[dist] = place

        return found

    def list_places_at(self, lat: float = None, lon: float = None, geohash: str = None,
                       cc: str = None, radius: int = 5000, limit=10, method="2d"):
        """
        Find places near a point, sorted by distance.
        :param lat: latitude
        :param lon: longitude
        :param cc: ISO country code to filter.
        :param geohash: optionally, use precomputed geohash of precision 6-chars instead of lat/lon.
        :param radius: in METERS, radial distance from given point to search, DEFAULT is 5 KM
        :param limit: count of places to return
        :param method: bbox or geohash
        :return: array of tuples, sorted by distance.
        """
        found = {}
        # NOTE(review): due to operator precedence this parses as
        # method == "geohash" or (geohash and lat is None) -- a provided geohash
        # takes the geohash path even when method="2d", as long as lat is absent.
        # Confirm intended.
        if method == "geohash" or geohash and lat is None:
            found = self._list_places_at_geohash(lat=lat, lon=lon, geohash=geohash, cc=cc, radius=radius, limit=limit)
        elif method == "2d":
            found = self._list_places_at_2d(lat=lat, lon=lon, cc=cc, radius=radius, limit=limit)
        if not found:
            return []

        # Sort by distance key
        result = [(dist, found[dist]) for dist in sorted(found.keys())]
        return result[0:limit]

    def list_admin_names(self, sources=['U', 'N', 'G'], cc=None) -> set:
        """
        Lists all admin level1 names.
        :param cc: country code filter.
1171 :param sources: list of source IDs defaulting to those for USGS, NGA, Geonames.org 1172 :return: set of names, lowerased 1173 """ 1174 source_criteria = ','.join([f"'{s}'" for s in sources]) 1175 sql = f"""select distinct(name) AS NAME from placenames where feat_class = 'A' and feat_code = 'ADM1' 1176 and source in ({source_criteria}) and name_group='' and name_type='N'""" 1177 1178 if cc: 1179 sql += f" and cc='{cc}'" 1180 names = set([]) 1181 for nm in self.conn.execute(sql): 1182 # To list names, we normalize lowercase and remove dashes. 1183 names.add(nm['NAME'].lower().replace("-", " ")) 1184 return names 1185 1186 def update_place_id(self, rowid, plid): 1187 sql = "update placenames set place_id=? where rowid=?" 1188 self.conn.execute(sql, (plid, rowid,)) 1189 1190 def mark_duplicates(self, dups): 1191 if not dups: 1192 return False 1193 step = 1000 1194 for x1 in _array_blocks(dups, step=step): 1195 x2 = x1 + step 1196 arg = ",".join([str(dup) for dup in dups[x1:x2]]) 1197 sql = f"update placenames set duplicate=1 where id in ({arg})" 1198 self.conn.execute(sql) 1199 self.conn.commit() 1200 return True 1201 1202 def update_name_type(self, arr: list, t: str): 1203 """ 1204 Change the name type in bulk. 
1205 :param arr: bulk array of placenames to change 1206 :param t: type code 'A', 'N', 'C' 1207 :return: 1208 """ 1209 if not arr: 1210 return False 1211 step = 1000 1212 for x1 in _array_blocks(arr, step=step): 1213 x2 = x1 + step 1214 arg = ",".join([str(pl) for pl in arr[x1:x2]]) 1215 sql = f"update placenames set name_type='{t}' where id in ({arg})" 1216 self.conn.execute(sql) 1217 self.conn.commit() 1218 return True 1219 1220 def update_admin1_code(self, cc, from_code, to_code): 1221 if not cc: 1222 print("NULL country code operations must be done manually, carefully.") 1223 return False 1224 sql = f"update placenames set adm1='{to_code}' where cc='{cc}' and adm1='{from_code}'" 1225 if from_code == 'NULL': 1226 sql = f"update placenames set adm1='{to_code}' where cc='{cc}' and adm1 is NULL" 1227 1228 if self.debug: 1229 print(sql) 1230 self.conn.execute(sql) 1231 return True 1232 1233 def mark_search_only(self, pid): 1234 """ 1235 Toggle bit for search only. 1236 :param pid: Place ID int or list 1237 """ 1238 if isinstance(pid, int): 1239 sql = "update placenames set search_only=1 where id=?" 1240 self.conn.execute(sql, (pid,)) 1241 elif isinstance(pid, list): 1242 idset = ", ".join([str(x) for x in pid]) 1243 sql = f"update placenames set search_only=1 where id in ({idset})" 1244 self.conn.execute(sql) 1245 else: 1246 raise Exception("Place ID integer or list of integers is required") 1247 1248 def update_bias(self, name_bias, rowids): 1249 arg = ",".join([str(pid) for pid in rowids]) 1250 flag = 1 if name_bias < 0 else 0 1251 sql = f"update placenames set name_bias=?, search_only=? where id in ({arg})" 1252 self.conn.execute(sql, (name_bias, flag,)) 1253 1254 def update_bias_by_name(self, name_bias, name): 1255 flag = 1 if name_bias < 0 else 0 1256 sql = "update placenames set name_bias=?, search_only=? where name = ?" 
        self.conn.execute(sql, (name_bias, flag, name,))


def _array_blocks(arr, step=1000):
    """
    Break up large arrays so we have predictable updates or queries.
    Returns only the START index of each block; callers slice arr[x1:x1 + step].
    :param arr: list to partition
    :param step: block size
    :return: list of block start offsets, always beginning with 0
    """
    end = len(arr)
    blocks = [0]
    if end > step:
        for start in range(step, end, step):
            blocks.append(start)
    return blocks


def add_location(geo, lat, lon, add_geohash=False):
    """
    Insert validated location coordinate and geohash
    :param add_geohash: due to performance, add this if needed
    :param geo: dict
    :param lat: latitude value, str or float
    :param lon: longitude value, str or float
    :return: geo dict with location
    """
    # NOTE(review): plain truthiness rejects legitimate 0 / 0.0 coordinates
    # (equator, prime meridian) -- confirm whether that is intended.
    if lat and lon:
        geo["lat"] = parse_float(lat)
        geo["lon"] = parse_float(lon)
        if add_geohash and "lat" in geo:
            geo["geohash"] = point2geohash(geo["lat"], geo["lon"], precision=6)
        return True

    print("No location on ROW", geo.get("place_id"))
    return False


class DataSource:
    """
    Gazetteer Data Source abstraction -- provides guidelines on how to inject
    data into a common, normalized gazetteer.
    """

    def __init__(self, dbf, debug=False, ver=None):
        # DB wrapper around the master SQLite gazetteer; batches commits every 100 rows.
        self.db = DB(dbf, commit_rate=100)
        # Optional source data version label.
        self.ver = ver
        # Progress reporting rate, in rows.
        self.rate = 1000000
        self.rowcount = 0
        # Source IDs owned by this data source; consumed by purge().
        self.source_keys = []
        self.excluded_terms = set([])
        self.quiet = False
        self.source_name = None
        self.debug = debug

    def purge(self):
        # Remove all rows previously ingested under this source's keys.
        print(f"Purging entries for {self.source_name}")
        for k in self.source_keys:
            print(f"\tsource ID = {k}")
            self.db.purge({"source": k})

    def process_source(self, sourcefile, limit=-1):
        """
        generator yielding DB geo dictionary to be stored.
1321 :param sourcefile: Raw data file 1322 :param limit: limit of number of records to process 1323 :return: generator of Place object or dict of Place schema 1324 """ 1325 yield None 1326 1327 def normalize(self, sourcefile, limit=-1, optimize=False): 1328 """ 1329 Given the spreadsheet or source file rip through it, ingesting contents into the master gazetteer. 1330 :param sourcefile: input file 1331 :param limit: non-zero limit for testing 1332 :param optimize: if database should be optimized when done. 1333 :return: 1334 """ 1335 print("\n============================") 1336 print(f"Start {self.source_name}. {arrow.now()} FILE={sourcefile}") 1337 for geo in self.process_source(sourcefile, limit=limit): 1338 if self.rowcount % self.rate == 0 and not self.quiet: 1339 print(f"Row {self.rowcount}") 1340 if 0 < limit < self.rowcount: 1341 print("Reached non-zero limit for testing.") 1342 break 1343 try: 1344 self.db.add_place(geo) 1345 except sqlite3.IntegrityError: 1346 print("Data integrity issue") 1347 print(format_exc(limit=5)) 1348 print(self.db.queue) 1349 break 1350 except Exception: 1351 print("Error with insertion to DB") 1352 print(format_exc(limit=5)) 1353 self.db.close() 1354 if optimize: 1355 self.db.optimize() 1356 1357 print("ROWS: ", self.rowcount) 1358 print("EXCLUSIONS: ", len(self.excluded_terms)) 1359 if self.debug: 1360 print("EXCLUSIONS:", self.excluded_terms) 1361 print(f"End {self.source_name}. {arrow.now()}") 1362 1363 1364class GazetteerIndex: 1365 """ 1366 GazetteerIndex provides a simple API to inject entries into the Gazetteer. 1367 - Every 1000 records a batch is sent to Solr 1368 - Every 1,000,0000 records a commit() call is sent to Solr 1369 1370 This may provide gazetteer specific functions, but as of v1.3 this is a generic Solr wrapper. 
1371 """ 1372 1373 def __init__(self, server_url, debug=False): 1374 1375 self.server_url = server_url 1376 if not self.server_url.startswith("http"): 1377 self.server_url = f"http://{self.server_url}/solr/gazetteer" 1378 1379 self.server = pysolr.Solr(self.server_url) 1380 self.debug = debug 1381 1382 self.commit_rate = 1000000 1383 self.add_rate = 1000 1384 1385 self._records = [] 1386 self.count = 0 1387 1388 def optimize(self): 1389 if self.server and not self.debug: 1390 self.server.optimize() 1391 1392 def save(self, done=False): 1393 if self.debug: 1394 return 1395 1396 # Send batch 1397 if self._records and (done or self.count % self.add_rate == 0): 1398 self.server.add(self._records) 1399 self._records = [] 1400 # Commit 1401 if done or (self.count % self.commit_rate == 0 and self.commit_rate > 0): 1402 self.server.commit() 1403 return 1404 1405 def add(self, place): 1406 """ 1407 1408 :param place: Place object. 1409 :return: 1410 """ 1411 rec = as_place_record(place) 1412 self._records.append(rec) 1413 self.count += 1 1414 self.save() 1415 1416 def delete(self, entry_id=None): 1417 """ 1418 Awaiting other kwdargs for deletion use cases. 1419 :param entry_id: master gazetteer row ID in sqlite or solr. Deletes solr entry 1420 :return: 1421 """ 1422 if entry_id: 1423 self.server.delete(id=entry_id) 1424 return True 1425 return False 1426 1427 1428class GazetteerSearch: 1429 def __init__(self, server_url): 1430 """ 1431 TODO: BETA - looking to abstract Solr().search() function for common types of queries. 1432 For now getting a list of country name variants is easy enough. 1433 :param server_url: URL with path to `/solr/gazetteer' index 1434 """ 1435 self.index_url = server_url 1436 self.server = pysolr.Solr(self.index_url) 1437 1438 def get_countries(self, max_namelen=30): 1439 """ 1440 Searches gazetteer for Country metadata 1441 TODO: dovetail Country metadata (lang, timezone, codes, etc) with 1442 Country place data. 
1443 TODO: Document different uses for GazetteerSearch.get_countries() from API get_country() 1444 TODO: Review differences in Place() schema and Country() schema for name variants, 1445 e.g., Country variants presented as abbreviations, codes or names need to be distinguished as such. 1446 :param max_namelen: 1447 :return: 1448 """ 1449 countries = [] 1450 hits = self.server.search("feat_class:A AND feat_code:PCL*", **{"rows": 30000}) 1451 for country in hits: 1452 nm = country['name'] 1453 if len(nm) > max_namelen: 1454 continue 1455 C = Country() 1456 C.name = nm 1457 C.cc_iso2 = country['cc'] 1458 C.cc_fips = country.get('FIPS_cc') 1459 C.name_type = country.get('name_type') 1460 feat_code = country.get('feat_code') 1461 C.is_territory = feat_code != "PCLI" or "territo" in nm.lower() # is NOT independent 1462 countries.append(C) 1463 return countries 1464 1465 1466def estimate_name_bias(nm): 1467 return 100 * trivial_bias(nm) 1468 1469 1470class PlaceHeuristics: 1471 # Population scale 0 = 16K, 1=32K, 2=64K, 3=128K 1472 LARGE_CITY = 3 1473 1474 def __init__(self, dbref: DB): 1475 """ 1476 1477 :param dbref: DB instance 1478 """ 1479 self.debug = False 1480 self.cities = set([]) 1481 self.cities_large = set([]) 1482 self.cities_spatial = {} # keyed by geohash 1483 self.provinces = {} 1484 # These should only be used as relative rankings of size of admin boundaries. 
1485 self.adm1_population = {} 1486 self.adm2_population = {} 1487 self.stopwords = load_stopterms() 1488 self.POPULATION_THRESHOLD = 200000 1489 self.MAX_NAMELEN = 50 1490 self.stat_charcount = 0 1491 self.stat_namecount = 0 1492 # Terms appearing in GoogleBooks 8,000,000 or more are consider not tag-worthy for geography, in general 1493 self.wordlookup = WordStats(get_default_wordstats()) 1494 self.wordlookup.load_common(threshold=6000000) 1495 1496 # Path relative to ./solr/ 1497 fpath = os.path.join('etc', 'gazetteer', 'filters', 'non-placenames,admin-codes.csv') 1498 self.stopwords_admin_codes = set(load_list(fpath)) 1499 1500 self.exempt_features = {"PPLC", "ADM1", "PCLI", "PCL"} 1501 self.exempted_names = {} 1502 self.feature_wt = { 1503 "A": 11, 1504 "A/ADM1": 16, 1505 "A/ADM2": 14, 1506 "A/PCL": 16, 1507 "P": 10, 1508 "P/PPL": 10, # Most common 1509 "P/PPLC": 15, 1510 "P/PPLA": 10, 1511 "P/PPLG": 9, 1512 "P/PPLH": 8, 1513 "P/PPLQ": 7, 1514 "P/PPLX": 7, 1515 "P/PPLL": 8, 1516 "L": 6, 1517 "R": 6, 1518 "H": 7, 1519 "H/SPNG": 2, 1520 "H/RSV": 2, 1521 "H/STM": 2, 1522 "H/WLL": 2, 1523 "V": 7, 1524 "S": 8, 1525 "U": 2, 1526 "T": 5, 1527 "T/ISL": 6, 1528 "T/ISLS": 6 1529 } 1530 1531 # This is a set (list) of distinct names for ADM1 level names. 1532 # This obviously changes as you build out the master gazetteer as in the beginning it has NOTHING. 1533 self.provinces = dbref.list_admin_names() 1534 1535 # Pop stats are primarily for P/PPL. 
1536 for city in load_major_cities(): 1537 self.cities.add(city.name.lower()) 1538 if city.population_scale >= PlaceHeuristics.LARGE_CITY: 1539 self.cities_large.add(city.name.lower()) 1540 1541 self.cities_spatial = dbref.list_all_popstats() 1542 # These should only be used to score specific feature types ADM1 or ADM2 1543 self.adm1_population = dbref.list_adm1_popstats() 1544 self.adm2_population = dbref.list_adm2_popstats() 1545 1546 def is_large_city(self, name): 1547 return name in self.cities_large 1548 1549 def is_significant(self, feat) -> bool: 1550 return feat in self.exempt_features 1551 1552 def is_province_name(self, name) -> bool: 1553 """ 1554 Report if a name is that of a province, regardless of whether the location repreents something else. 1555 E.g. 1556 "Florida" is a city (lesser known) or a state (well known). Therefore it is a popular name. 1557 :param name: 1558 :return: 1559 """ 1560 return name in self.provinces 1561 1562 def is_stopword(self, name: str) -> bool: 1563 if name in self.stopwords: 1564 return True 1565 1566 if name.replace("-", " ") in self.stopwords: 1567 return True 1568 1569 # Name is "Bar"... 1570 # test if "The Bar" is a stopword 1571 if f"the {name}" in self.stopwords: 1572 return True 1573 1574 # Name is "The Bar" 1575 # test if "bar" is a stopword 1576 if name.startswith("the "): 1577 if name[4:].strip() in self.stopwords: 1578 return True 1579 return False 1580 1581 def estimate_bias(self, geo, name_group=""): 1582 """ 1583 Primary Estimator of id_bias and name_bias. 1584 1585 id_bias -- a location bias to pre-rank city by feature/population 1586 name_bias -- a metric ranging from -1 to 1, that represents the validity of a tagging the name/phrase 1587 in a general context. The result is eventually binary search_only = name_bias < 0. This means 1588 that geo names that are search_only are not taggable. 
1589 :param geo: 1590 :param name_group: 1591 :return: 1592 """ 1593 geo["id_bias"] = self.location_bias(geo) 1594 geo["name_bias"] = self.name_bias(geo["name"], geo["feat_class"], geo["feat_code"], 1595 name_group=name_group, name_type=geo["name_type"]) 1596 1597 def get_feature_scale(self, fc, dsg): 1598 """ 1599 1600 :param fc: feature class 1601 :param dsg: feature code 1602 :return: 1603 """ 1604 # Location bias is 70% population, 30% feature type 1605 # 1606 if not dsg: 1607 return self.feature_wt.get(fc, 5) 1608 1609 fckey = f"{fc}/{dsg}" 1610 for glen in [6, 5]: 1611 fc_scale = self.feature_wt.get(fckey[0:glen]) 1612 if fc_scale: 1613 return fc_scale 1614 1615 return self.feature_wt.get(fc, 5) 1616 1617 def location_bias(self, geo): 1618 """ 1619 See estimate_bias() 1620 1621 A location is pre-disposed by its feature type and population/popularity. 1622 E.g., large cities are mentioned more often in news or documents than less populated cities. 1623 Factors: 1624 1625 Feature gradient A, P, ..... U. More populated features have higer bias 1626 Population gradient log(pop) scales bias higher 1627 1628 :param geo: standard ETL geo dict 1629 :return: score on 100 point scale. 1630 """ 1631 return int(10 * self._location_bias(geo)) 1632 1633 def _location_bias(self, geo): 1634 """ 1635 dict with parts: 1636 1637 :param geo: standard ETL geo dict 1638 :return: A number on the range of 0 to 10 approximately. 
1639 """ 1640 fc = geo["feat_class"] 1641 dsg = geo["feat_code"] 1642 pop_wt = 0 1643 fc_scale = self.get_feature_scale(fc, dsg) 1644 1645 if fc == 'P': 1646 lockey = coord_grid(geo) 1647 population = self.cities_spatial.get(lockey, 0) 1648 pop_wt = popscale(population, feature="city") 1649 if fc == 'A': 1650 cc = geo["cc"] 1651 a1 = geo["adm1"] 1652 pop_wt = 1 1653 if dsg == 'ADM1': 1654 adm_path = make_HASC(cc, a1) 1655 population = self.adm1_population.get(adm_path, 0) 1656 pop_wt = popscale(population, feature="province") 1657 if dsg == 'ADM2' and "adm2" in geo: 1658 adm_path = make_HASC(cc, a1, geo["adm2"]) 1659 population = self.adm2_population.get(adm_path, 0) 1660 pop_wt = popscale(population, feature="district") 1661 1662 # For PLACES this helps differentiate P/PPL by population 1663 # Between PLACES and BOUNDARIES the population component may rank places 1664 # higher than a boundary by the same name. 1665 # 1666 # Weighted sums -- Population has more information than the feature, so we weight that higher. 1667 return (0.75 * pop_wt) + (0.25 * fc_scale) 1668 1669 def name_bias(self, geoname: str, feat_class: str, feat_code: str, name_group="", name_type="N"): 1670 """ 1671 See estimate_bias() 1672 1673 Given a geoname we look at the instance of the name variant and if it is something trivially 1674 colliding with stopwords in other languages then we consider omitting it. 1675 1676 very positive bias - long unique name, diacritic or in non-ASCII script 1677 positive bias - normal location name, multiple words or grams 1678 neutral - possibly a place name, but is case-dependent, e.g., person name or generic monument name. 1679 negative bias - a stopword or trivial version of a stopword, `Ã…re` 1680 very negative bias - a very rare or extremely long version of a place name, nonsense 1681 -1 - WordStats reports as a "common" word. 1682 1683 Conclusion: Any Negative name_bias term will NOT be tagged, although it is present in gazetteer. 
1684 1685 CODE and ABBREV are not biased -- they are simply not full names. 1686 1687 TODO: ONLY unigrams are tracked, so 1688 "Alabama" -> not common, 1689 "Need" -> common, 1690 "New York" -> not tracked. This is a bi-gram 1691 1692 :param geoname: 1693 :param feat_class: 1694 :param feat_code: 1695 :param name_group: 1696 :param name_type: 1697 :return: floating point number between -100 and 100 1698 """ 1699 return int(100 * self._name_bias(geoname, feat_class, feat_code, name_group=name_group, name_type=name_type)) 1700 1701 def _name_bias(self, geoname, feat_class, feat_code, name_group="", name_type="N"): 1702 """ 1703 Details on assessing a name against common word stats, feature metadata, lang script 1704 :param geoname: name str 1705 :param feat_class: UNUSED 1706 :param feat_code: 1707 :param name_group: 1708 :return: 1709 """ 1710 1711 if name_group in {'cjk', 'ar'}: 1712 # TODO: Should look up Stopwords here, but that likely happens in tagger. 1713 return trivial_bias(geoname) + 0.10 1714 1715 self.stat_namecount += 1 1716 namelen = len(geoname) 1717 self.stat_charcount += namelen 1718 1719 # if name_type == "C" and is_administrative(feat_class): 1720 # Quick checks: 1721 if namelen < 5: 1722 # Check for administrative codes that are most commonly stopterms or other meanings 1723 if geoname.upper() in self.stopwords_admin_codes: 1724 return -1 1725 # Omit pure digit names 1726 if geoname.isdigit(): 1727 return -1 1728 1729 if namelen < 2: 1730 return -0.1 1731 elif 30 < namelen < self.MAX_NAMELEN: 1732 return trivial_bias(geoname) 1733 elif namelen >= self.MAX_NAMELEN: 1734 # Name is too long to consider tagging; Unlikely to appear in this form. 1735 return -0.1 1736 1737 # Test shorter names: Combine feature, stopwords, and other tests. 1738 # ============================================================== 1739 # FIRST -- see if a judgement was made on a name already. 
1740 norm = geoname.lower() 1741 if norm in self.exempted_names: 1742 return self.exempted_names[norm] 1743 1744 # SECOND -- figure out if name is significant and popular because it is a popular place 1745 # rather than just a common word. 1746 1747 # TODO: add non-diacritic name to this test? 1748 norm2 = strip_quotes(replace_diacritics(norm)) 1749 norm2 = norm2.replace("-", " ") 1750 is_popular_place = self.is_significant(feat_code) or self.is_large_city(norm) or \ 1751 self.is_province_name(norm) or self.is_province_name(norm2) 1752 1753 # Example: "Moscow (P/PPLC)" significant. Name is exempted (significant feature) 1754 # "Moscow (A/ADM2)" not significant; But it is flagged as "common"...and omitted without this: 1755 # "Florida (P/PPL)" is not a common place 1756 # "Florida (A/ADM1)" is a significant place. 1757 # Note "Large Cities" vs. ADMIN-LEVEL1 boundaries are different lookups 1758 if is_popular_place: 1759 self.exempted_names[norm] = trivial_bias(geoname) 1760 return self.exempted_names[norm] 1761 elif self.is_stopword(norm): 1762 return -1 1763 elif self.wordlookup.is_common(norm): 1764 # is a common word, but not associated often with a location 1765 return -1 1766 else: 1767 # Much deeper checks on about 90% of the names 1768 # Omit short diacritic names that are typically stopwords. These are partial biases 1769 # since we are now checking if the non-diacritic version is filtered. 1770 if norm != norm2: 1771 if self.wordlookup.is_common(norm2): 1772 return -0.9 1773 1774 if norm2 in self.stopwords: 1775 return -0.5 1776 1777 if norm2.upper() in self.stopwords_admin_codes: 1778 return -0.6 1779 1780 # Return a positive value. 
1781 return trivial_bias(norm) 1782 1783 1784if __name__ == "__main__": 1785 1786 import argparse 1787 1788 ap = argparse.ArgumentParser() 1789 ap.add_argument('--solr') 1790 ap.add_argument('--output') 1791 ap.add_argument('--query') 1792 ap.add_argument('--lookup') 1793 ap.add_argument('--parse', action="store_true", default=False) 1794 ap.add_argument('--demo') 1795 1796 args = ap.parse_args() 1797 1798 if args.lookup: 1799 findings = run_lookup(args.solr, args.lookup, args.parse) 1800 print_places(findings) 1801 elif args.query: 1802 findings = run_query(args.solr, args.query) 1803 print_places(findings)
def coord_grid(geo: dict) -> str:
    """
    Low-resolution "Y,X" grid key, one decimal place each, e.g. "4.6,-118.4".
    A less dissatisfying grid than geohash.
    Returns None when the geo dict carries no latitude.
    """
    if "lat" not in geo:
        return None
    return "{:0.1f},{:0.1f}".format(geo["lat"], geo["lon"])
A less dissatisfying grid than geohash. It's just returning Y,X in low resolution. LLL.l,LLL.l
def load_stopterms(project_dir=".", lower=True):
    """
    Load default stop terms from source tree for project build.
    :param project_dir: The location of Xponents/solr source tree.
    :param lower: default case to load data as. If not lower, then terms are loaded as-is
    :return: set of stop terms
    """
    filter_files = (
        "etc/gazetteer/filters/non-placenames.csv",
        "etc/gazetteer/filters/non-placenames,spa.csv",  # SPANISH
        "etc/gazetteer/filters/non-placenames,rus,ukr.csv",  # Cyrillic languages
        "etc/gazetteer/filters/non-placenames,deu.csv",  # GERMAN
        "etc/gazetteer/filters/non-placenames,acronym.csv",
    )
    cfg = ConfigUtility()
    stopterms = set()
    for fname in filter_files:
        rows = cfg.loadDataFromFile(os.path.join(project_dir, fname), ",")
        # First column of each row holds the term; normalize case when requested.
        stopterms.update(row[0].lower() if lower else row[0] for row in rows)
    return stopterms
Load default stop terms from source tree for project build. :param project_dir: The location of Xponents/solr source tree. :param lower: default case to load data as. If not lower, then terms are loaded as-is :return:
def run_lookup(url, lookup, parse):
    """ Gazetteer demo mimics some of the logic in XponentsGazetteerQuery
        try "San Francisco, CA, US"
    """
    solr_gaz = pysolr.Solr(url)
    # specific unit tests

    if parse:
        # See other Java demo, XponentsGazetteerQuery
        # assuming NAME, PROV, COUNTRY
        slots = [a.strip() for a in lookup.split(',')]
        if len(slots) < 3:
            print("NAME, PROV, CC is required format for --lookup")
            return None

        cityVal, provVal, countryVal = slots[0], slots[1], slots[2]

        # Find best match for Province. Pass ADM1 code to next query
        prov_query = 'name:"{}" AND feat_class:A AND cc:{}'.format(provVal, countryVal)
        prov_hits = solr_gaz.search(prov_query, **{"rows": 100})
        if not prov_hits:
            return None

        # Use a Place object to abstract things.
        adm1 = as_place(prov_hits.docs[0])
        # Find best match for the tuple NAME/PROV/COUNTRY
        #
        query = 'name:"{}" AND feat_class:A AND cc:{} AND adm1:{}'.format(cityVal, countryVal, adm1.adm1)
        records = solr_gaz.search(query, **{"rows": 1000})
    else:
        query = 'name:"{}" AND feat_class:P'.format(lookup)
        records = solr_gaz.search(query, **{"rows": 1000})

    if not records:
        return None

    return [as_place(rec) for rec in records]
Gazetteer demo mimics some of the logic in XponentsGazetteerQuery try "San Francisco, CA, US"
def normalize_name(nm: str):
    """
    convenience method that ensures we have some consistency on normalization of name
    :param nm: raw name text
    :return: normalized name
    """
    # Unify right-single-quote to apostrophe, non-breaking space to plain space,
    # then trim whitespace and any surrounding apostrophes.
    cleaned = nm.replace("\u2019", "'").replace("\xa0", " ")
    return cleaned.strip().strip("'")
convenience method that ensures we have some consistency on normalization of name :param nm: :return:
def name_group_for(nm: str):
    """
    Determine the major language "name group" for the input
    :param nm: name or any text
    :return: "cjk", "ar", or "" for everything else
    """
    # CJK takes precedence over Arabic in mixed text, matching original ordering.
    if has_cjk(nm):
        return "cjk"
    if has_arabic(nm):
        return "ar"
    return ""
Determine the major language "name group" for the input :param nm: name or any text :return:
def as_admin_place(r):
    """
    Convert dict to a Place object
    :param r: gazetteer row from Solr or SQLite.
    :return: Place
    """
    # sqlite3.Row, Solr docs, and plain dicts all expose keys().
    keys = r.keys() if hasattr(r, "keys") else {}

    place = Place(r['place_id'], r['name'])
    place.country_code = r["cc"]
    place.adm1 = r["adm1"]
    place.source = r["source"]
    place.geohash = r["geohash"]
    if "adm1_iso" in keys:
        place.adm1_iso = r["adm1_iso"]

    # Admin rows intentionally carry no coordinates.
    place.lat = place.lon = place.X = place.Y = None
    return place
Convert dict to a Place object :param r: gazetteer row from Solr or SQLite. :return: Place
def as_place(r, source="index"):
    """
    Convert dict to a Place object
    :param source: db or index (solr)
    :param r: gazetteer row from Solr or SQlite.
    :return: Place
    """
    # sqlite3.Row, Solr docs, and plain dicts all expose keys().
    keys = {}
    if hasattr(r, "keys"):
        keys = r.keys()

    lat, lon = 0, 0
    if "geo" in r:
        # Solr index stores location as a single "lat,lon" string.
        # NOTE(review): this path yields str values while the db path yields
        # numbers -- presumably Place() normalizes; confirm.
        (lat, lon) = r['geo'].split(',')
    else:
        lat, lon = r["lat"], r["lon"]

    p = Place(r['place_id'], r['name'], lat=lat, lon=lon)
    p.country_code = r["cc"]
    p.feature_class = r["feat_class"]
    p.feature_code = r["feat_code"]
    if "id" in r:
        # Required if coming or going into a database:
        p.id = r["id"]
        p.id_bias = r["id_bias"]
        if source == "db":
            # name_bias is only persisted in the db schema, not the index.
            p.name_bias = r["name_bias"]

    # optional fields:
    if "FIPS_cc" in keys:
        p.country_code_fips = r["FIPS_cc"]
    if "adm1" in keys:
        p.adm1 = r["adm1"]
    if "adm2" in keys:
        p.adm2 = r["adm2"]
    if "geohash" in keys:
        p.geohash = r["geohash"]
    if "id" in keys:
        p.id = r["id"]
    if "source" in keys:
        p.source = r["source"]
    if "name_group" in keys:
        p.name_group = r["name_group"]
    if "search_only" in keys:
        p.search_only = get_bool(r["search_only"])
    if "name_type" in keys:
        p.name_type = r["name_type"]

    p.is_ascii = is_ascii(p.name)
    return p
Convert dict to a Place object :param source: db or index (solr) :param r: gazetteer row from Solr or SQLite. :return: Place
def as_place_record(place, target="index"):
    """
    Given a Place object, serialize it as a dict consistent with the Solr index schema.
    :param place: Place object
    :param target: "index" (solr) or "db" (sqlite)
    :return: dict matching the target schema, or None if input is not a Place
    """
    if not isinstance(place, Place):
        return None
    # Copy defaults offers nothing.
    # rec = copy(GAZETTEER_TEMPLATE)
    rec = {
        "id": place.id,
        "place_id": place.place_id,
        "name": place.name,
        "name_type": place.name_type,
        "feat_class": place.feature_class,
        "feat_code": place.feature_code,
        "cc": place.country_code,
        "FIPS_cc": place.country_code_fips,
        "source": place.source,
        # "script": place.name_script,
        "search_only": place.search_only
    }

    # ADMIN level 1/2 boundary names:
    if place.adm1:
        rec["adm1"] = place.adm1
    if place.adm2:
        rec["adm2"] = place.adm2
    # ID BIAS:
    rec["id_bias"] = 0 if place.id_bias is None else place.id_bias

    if target == "index":
        # Preserve innate precision on Lat/Lon: e.g., "4.5,-118.4" is result if only that amount of precision is present
        # BUG FIX: a stray trailing comma previously made this a 1-tuple
        # ("lat,lon",) instead of the "lat,lon" string the index schema expects.
        rec["geo"] = ",".join([str(place.lat), str(place.lon)])
        # Name Group / Script tests:
        if place.name_group == "ar":
            rec["name_ar"] = place.name
        elif place.name_group == "cjk":
            rec["name_cjk"] = place.name
    elif target == "db":
        # Required fields:
        rec["name_bias"] = 0 if place.name_bias is None else place.name_bias
        rec["name_group"] = place.name_group
        rec["lat"] = place.lat
        rec["lon"] = place.lon
        rec["adm1"] = place.adm1
        rec["adm2"] = place.adm2

    return rec
Given a Place object, serialize it as a dict consistent with the Solr index schema. :param place: :param target: index or db :return:
def run_query(url, q):
    """ Expert mode: Run a solr query to see what you get back.
        requires you know the schema
    """
    gaz = pysolr.Solr(url)
    hits = gaz.search(q, **{"rows": 100})
    return [as_place(doc) for doc in hits]
Expert mode: Run a solr query to see what you get back. requires you know the schema
def capitalize(name: dict):
    """
    Capitalize all city and major admin boundaries, in place.
    Only plain names (name_type 'N', empty name_group) for feature classes
    A and P are altered, and only their first character.
    :param name: gazetteer row dict with name, name_group, name_type, feat_class keys
    """
    nm = name["name"]
    # BUG FIX: the guard was inverted ("not isupper"), returning early for
    # exactly the lowercase names this function was meant to fix -- the
    # function was a no-op. Skip names that are already capitalized.
    if nm and nm[0].isupper():
        return

    grp = name.get("name_group")
    nt = name.get("name_type")
    ft = name["feat_class"]
    if nm and grp == '' and nt == 'N' and ft in {'A', 'P'}:
        # Because we don't like altering data much:
        name["name"] = nm[0].upper() + nm[1:]
Capitalize all city and major admin boundaries
def gaz_resource(fname):
    """
    Formats the relative path for an item in the ./solr/etc/gazetteer/ metadata
    :param fname: metadata file name
    :return: relative path string
    """
    gaz_dir = os.path.join("etc", "gazetteer")
    return os.path.join(gaz_dir, fname)
Formats the relative path for an item in the ./solr/etc/gazetteer/ metadata :param fname: :return:
def export_admin_mapping(admin_ids, filepath):
    """
    Experimental: Map all source place IDs => ADM ids
    Map all standard ADM ids => place IDs
    :param admin_ids: iterable of dict rows with cc, adm1, lat, lon, place_id, name
    :param filepath: output TSV file path
    :return:
    """
    with open(filepath, "w", encoding="UTF-8") as fio:
        fio.write("\t".join(["ADM1", "PLACE_ID", "LAT", "LON", "NAME"]))
        # BUG FIX: the header previously lacked a trailing newline, so the
        # first data row was appended onto the header line.
        fio.write("\n")

        for a1 in admin_ids:
            cc = a1["cc"]
            adm1 = a1["adm1"]
            hasc = f"{cc}.{adm1}"
            y, x = a1["lat"], a1["lon"]
            entry = [hasc, a1["place_id"], f"{y:0.1f}", f"{x:0.1f}", a1["name"]]
            fio.write("\t".join(entry))
            fio.write("\n")
Experimental: Map all source place IDs => ADM ids Map all standard ADM ids => place IDs :param admin_ids: dict for JSON or array for CSV :param filepath: :return:
def add_location(geo, lat, lon, add_geohash=False):
    """
    Insert validated location coordinate and geohash
    :param add_geohash: due to performance, add this if needed
    :param geo: dict record, updated in place
    :param lat: latitude value, str or float
    :param lon: longitude value, str or float
    :return: True if a location was set on the geo dict, False otherwise
    """
    # BUG FIX: plain truthiness ("if lat and lon") rejected valid numeric 0.0
    # coordinates (equator / prime meridian). Only missing or empty values
    # mean "no location".
    if lat is not None and lon is not None and lat != "" and lon != "":
        geo["lat"] = parse_float(lat)
        geo["lon"] = parse_float(lon)
        if add_geohash and "lat" in geo:
            geo["geohash"] = point2geohash(geo["lat"], geo["lon"], precision=6)
        return True

    print("No location on ROW", geo.get("place_id"))
    return False
Insert validated location coordinate and geohash :param add_geohash: due to performance, add this if needed :param geo: dict :param lat: latitude value, str or float :param lon: longitude value, str or float :return: geo dict with location
class DataSource:
    """
    Gazetteer Data Source abstraction -- provides guidelines on how to inject
    data into a common, normalized gazetteer.
    """

    def __init__(self, dbf, debug=False, ver=None):
        # dbf: path to the master gazetteer SQLite file.
        self.db = DB(dbf, commit_rate=100)
        # Optional source version tag.
        self.ver = ver
        # Progress-print interval, in rows.
        self.rate = 1000000
        self.rowcount = 0
        # "source" column values owned by this data source (used by purge()).
        self.source_keys = []
        self.excluded_terms = set([])
        self.quiet = False
        self.source_name = None
        self.debug = debug

    def purge(self):
        # Remove all previously ingested rows for each of this source's keys.
        print(f"Purging entries for {self.source_name}")
        for k in self.source_keys:
            print(f"\tsource ID = {k}")
            self.db.purge({"source": k})

    def process_source(self, sourcefile, limit=-1):
        """
        generator yielding DB geo dictionary to be stored.
        :param sourcefile: Raw data file
        :param limit: limit of number of records to process
        :return: generator of Place object or dict of Place schema
        """
        yield None

    def normalize(self, sourcefile, limit=-1, optimize=False):
        """
        Given the spreadsheet or source file rip through it, ingesting contents into the master gazetteer.
        :param sourcefile: input file
        :param limit: non-zero limit for testing
        :param optimize: if database should be optimized when done.
        :return:
        """
        print("\n============================")
        print(f"Start {self.source_name}. {arrow.now()} FILE={sourcefile}")
        for geo in self.process_source(sourcefile, limit=limit):
            # NOTE(review): rowcount is never incremented in this loop --
            # presumably subclass process_source() implementations update it;
            # confirm, otherwise the limit/progress checks below never fire.
            if self.rowcount % self.rate == 0 and not self.quiet:
                print(f"Row {self.rowcount}")
            if 0 < limit < self.rowcount:
                print("Reached non-zero limit for testing.")
                break
            try:
                self.db.add_place(geo)
            except sqlite3.IntegrityError:
                # Integrity failure aborts the run; dump the pending queue
                # for diagnosis.
                print("Data integrity issue")
                print(format_exc(limit=5))
                print(self.db.queue)
                break
            except Exception:
                # Other insertion errors are logged and skipped (best-effort).
                print("Error with insertion to DB")
                print(format_exc(limit=5))
        self.db.close()
        if optimize:
            self.db.optimize()

        print("ROWS: ", self.rowcount)
        print("EXCLUSIONS: ", len(self.excluded_terms))
        if self.debug:
            print("EXCLUSIONS:", self.excluded_terms)
        print(f"End {self.source_name}. {arrow.now()}")
Gazetteer Data Source abstraction -- provides guidelines on how to inject data into a common, normalized gazetteer.
    def process_source(self, sourcefile, limit=-1):
        """
        generator yielding DB geo dictionary to be stored.

        Base implementation is a stub; subclasses override this to parse the
        raw source file and yield one record per place entry.
        :param sourcefile: Raw data file
        :param limit: limit of number of records to process
        :return: generator of Place object or dict of Place schema
        """
        yield None
generator yielding DB geo dictionary to be stored. :param sourcefile: Raw data file :param limit: limit of number of records to process :return: generator of Place object or dict of Place schema
    def normalize(self, sourcefile, limit=-1, optimize=False):
        """
        Given the spreadsheet or source file rip through it, ingesting contents into the master gazetteer.
        :param sourcefile: input file
        :param limit: non-zero limit for testing
        :param optimize: if database should be optimized when done.
        :return:
        """
        print("\n============================")
        print(f"Start {self.source_name}. {arrow.now()} FILE={sourcefile}")
        for geo in self.process_source(sourcefile, limit=limit):
            # NOTE(review): rowcount is never incremented in this loop --
            # presumably subclass process_source() implementations update it;
            # confirm, otherwise the limit/progress checks below never fire.
            if self.rowcount % self.rate == 0 and not self.quiet:
                print(f"Row {self.rowcount}")
            if 0 < limit < self.rowcount:
                print("Reached non-zero limit for testing.")
                break
            try:
                self.db.add_place(geo)
            except sqlite3.IntegrityError:
                # Integrity failure aborts the run; dump the pending queue
                # for diagnosis.
                print("Data integrity issue")
                print(format_exc(limit=5))
                print(self.db.queue)
                break
            except Exception:
                # Other insertion errors are logged and skipped (best-effort).
                print("Error with insertion to DB")
                print(format_exc(limit=5))
        self.db.close()
        if optimize:
            self.db.optimize()

        print("ROWS: ", self.rowcount)
        print("EXCLUSIONS: ", len(self.excluded_terms))
        if self.debug:
            print("EXCLUSIONS:", self.excluded_terms)
        print(f"End {self.source_name}. {arrow.now()}")
Given the spreadsheet or source file rip through it, ingesting contents into the master gazetteer. :param sourcefile: input file :param limit: non-zero limit for testing :param optimize: if database should be optimized when done. :return:
class GazetteerIndex:
    """
    GazetteerIndex provides a simple API to inject entries into the Gazetteer.
    - Every 1000 records a batch is sent to Solr
    - Every 1,000,000 records a commit() call is sent to Solr

    This may provide gazetteer specific functions, but as of v1.3 this is a generic Solr wrapper.
    """

    def __init__(self, server_url, debug=False):

        self.server_url = server_url
        # Allow a bare host:port; expand it to the full gazetteer core URL.
        if not self.server_url.startswith("http"):
            self.server_url = f"http://{self.server_url}/solr/gazetteer"

        self.server = pysolr.Solr(self.server_url)
        self.debug = debug

        # Batching thresholds, in record counts.
        self.commit_rate = 1000000
        self.add_rate = 1000

        self._records = []
        self.count = 0

    def optimize(self):
        """Optimize the Solr index, unless running in debug mode."""
        if self.server and not self.debug:
            self.server.optimize()

    def save(self, done=False):
        """Flush the pending batch and/or commit, at rate boundaries or when done."""
        if self.debug:
            return

        # Send the pending batch to Solr when forced or at the add-rate boundary.
        flush_now = done or self.count % self.add_rate == 0
        if self._records and flush_now:
            self.server.add(self._records)
            self._records = []
        # Commit when forced or at the commit-rate boundary.
        if done or (self.commit_rate > 0 and self.count % self.commit_rate == 0):
            self.server.commit()
        return

    def add(self, place):
        """
        Queue one place for indexing; batches flush automatically via save().
        :param place: Place object.
        :return:
        """
        self._records.append(as_place_record(place))
        self.count += 1
        self.save()

    def delete(self, entry_id=None):
        """
        Awaiting other kwdargs for deletion use cases.
        :param entry_id: master gazetteer row ID in sqlite or solr. Deletes solr entry
        :return: True if a deletion was issued, else False.
        """
        if not entry_id:
            return False
        self.server.delete(id=entry_id)
        return True
GazetteerIndex provides a simple API to inject entries into the Gazetteer.
- Every 1000 records a batch is sent to Solr
- Every 1,000,000 records a commit() call is sent to Solr
This may provide gazetteer specific functions, but as of v1.3 this is a generic Solr wrapper.
1406 def add(self, place): 1407 """ 1408 1409 :param place: Place object. 1410 :return: 1411 """ 1412 rec = as_place_record(place) 1413 self._records.append(rec) 1414 self.count += 1 1415 self.save()
:param place: Place object. :return:
1417 def delete(self, entry_id=None): 1418 """ 1419 Awaiting other kwdargs for deletion use cases. 1420 :param entry_id: master gazetteer row ID in sqlite or solr. Deletes solr entry 1421 :return: 1422 """ 1423 if entry_id: 1424 self.server.delete(id=entry_id) 1425 return True 1426 return False
Awaiting other kwargs for deletion use cases. :param entry_id: master gazetteer row ID in sqlite or solr. Deletes solr entry :return: