opensextant.xlayer
Created on Mar 14, 2016
@author: ubaldino
# -*- coding: utf-8 -*-
"""
Created on Mar 14, 2016

@author: ubaldino
"""
import json
import sys

import requests
import requests.exceptions
from opensextant import TextMatch, PlaceCandidate, get_country, make_HASC, \
    is_populated, is_administrative, is_academic, characterize_location, logger_config

# Move away from "geo" and towards a more descriptive place label.
GEOCODINGS = {"geo", "place", "postal", "country", "coord", "coordinate"}


class XlayerClient:
    """
    Xponents REST client -- low level utility. See also Geotagger class for a better abstraction.
    ```
    client = XlayerClient(url)
    client.process( "paragraph of text...." ) ---> returns list of raw matches, geocoded.
    ```
    """

    def __init__(self, server, options=""):
        """
        @param server: URL for the service. E.g., host:port or 'http://SERVER/xlayer/rest/process'.
        @keyword options: STRING. a comma-separated list of options to send with each request.
                There are no default options supported.
        """
        self.server = server
        if not server.startswith("http"):
            # Bare host:port given -- compose the standard /process and /control endpoints.
            self.server = f"http://{server}/xlayer/rest/process"
            self.server_control = f"http://{server}/xlayer/rest/control"
        else:
            # User provided a full URL.
            self.server_control = server.replace('/process', '/control')
        self.debug = False
        self.default_options = options

    def stop(self, timeout=30):
        """
        Timeout of 30 seconds is used here so calls do not hang indefinitely.
        The service URL is inferred: /process and /control endpoints should be next to each other.
        :return: True if successful or if "Connection aborted" ConnectionError occurs
        """
        try:
            response = requests.get("{}/stop".format(self.server_control), timeout=timeout)
            if response.status_code != 200:
                return response.raise_for_status()
        except requests.exceptions.ConnectionError as err:
            # A server that is shutting down may drop the connection mid-request;
            # that specific failure still counts as a successful stop.
            return "Connection aborted" in str(err)
        # FIX: an HTTP 200 means the stop request succeeded. The original returned
        # False here, contradicting the documented contract above.
        return True

    def ping(self, timeout=30):
        """
        Timeout of 30 seconds is used here so calls do not hang indefinitely.
        :return: True if successful.
        """
        response = requests.get("{}/ping".format(self.server_control), timeout=timeout)
        if response.status_code != 200:
            return response.raise_for_status()
        return True

    def process(self, docid, text, lang=None, features=None, timeout=10, minlen=-1,
                preferred_countries=None, preferred_locations=None):
        """
        Process text, extracting some entities

        lang = "xx" or None, where "xx" is a ISO language 2-char code.
            For general Chinese/Japanese/Korean (CJK) support, use lang = 'cjk'
            Language IDs that have some additional tuning include:
                "ja", "th", "tr", "id", "ar", "fa", "ur", "ru", "it",
                "pt", "de", "nl", "es", "en", "tl", "ko", "vi"
        Behavior: Arabic (ar) or CJK (cjk) lang ID directs tagger to use language-specific tokenizers
            Otherwise other lang ID provided just invokes language-specific stopword filters

        features are places, coordinates, countries, orgs, persons, patterns, postal.

        feature aliases "geo" can be used to get All Geographic entities (places,coordinates,countries)
        feature "taxons" can get at any Taxon "taxons", "persons", "orgs". As of Xponents 3.6 this reports ALL
            Other taxons available in TaxCat tagger. "all_taxons" is offered as a means to distinguish old and new behavior.
        feature "postal" will tag obvious, qualified postal codes that are paired with a CITY, PROVINCE, or COUNTRY tag.
        feature "patterns" is an alias for dates and any other pattern-based extractors. For now "dates" is only one
        feature "codes" will tag, use and report coded information for any place; primarily administrative boundaries

        options are not observed by Xlayer "Xgeo", but you can adapt your own service
        to accommodate such options. Possible options are clean_input, lowercase, for example:

        * clean_input scrubs the input text if it has HTML or other content in it.
        * lowercase allows the tagging to pass on through lower case matches.

        but interpretation of "clean text" and "lower case" support is subjective.
        so they are not supported out of the box here.
        :param docid: identifier of transaction
        :param text: Unicode text to process
        :param lang: One of ["ar", "cjk", .... other ISO language IDs]
        :param features: list of geo OR [places, coordinates, countries], orgs, persons, patterns, taxons
        :param timeout: default to 10 seconds; If you think your processing takes longer,
                 adjust if you see exceptions.
        :param minlen: minimum length of matches that are unqualified. To reduce noise in geotags. Server has a default
                 of 4 chars for general purpose noise filtering.
        :param preferred_countries: Array of country codes representing those which are preferred fall backs when
             there are ambiguous location names.
        :param preferred_locations:  Array of geohash representing general area of desired preferred matches
        :return: array of TextMatch objects or empty array.
        """
        if features is None:
            # FIX: avoid a mutable default argument; same effective default as before.
            features = ["geo"]
        json_request = {'docid': docid, 'text': text}
        if self.default_options:
            json_request['options'] = self.default_options
        if features:
            json_request['features'] = ','.join(features)
        if preferred_countries:
            json_request['preferred_countries'] = preferred_countries
        if preferred_locations:
            json_request['preferred_locations'] = preferred_locations
        if lang:
            json_request['lang'] = lang
        if minlen and int(minlen) > 0:
            json_request['minlen'] = minlen

        response = requests.post(self.server, json=json_request, timeout=timeout)
        if response.status_code != 200:
            return response.raise_for_status()

        json_content = response.json()

        if self.debug:
            print(json.dumps(json_content, indent=2))
        if 'response' in json_content:
            # Get the response metadata block
            # metadata = json_content['response']
            pass

        annots = []
        if 'annotations' in json_content:
            aj = json_content['annotations']
            for annot in aj:
                # Desire to move to "label" away from "type"
                label = annot.get("type")
                if label in GEOCODINGS:
                    tm = PlaceCandidate(annot.get('matchtext'), annot.get('offset'), None)
                else:
                    tm = TextMatch(annot.get('matchtext'), annot.get('offset'), None)
                tm.populate(annot)
                annots.append(tm)

        return annots


# ==================
# Geotagger -- Simplified wrapper around Xlayer.  Reduces volume of information
# EXPERIMENTAL
# ==================


def _increment_count(dct: dict, code: str):
    """Count an occurrence of `code`; a null code indicates bad upstream data."""
    if not code:
        raise Exception("Data quality issue -- counting on a null value")
    dct[code] = 1 + dct.get(code, 0)


def _infer_slot(all_inf: dict, slot: str, span: PlaceCandidate, match_id=None):
    """
    Insert information into slots.

    :param all_inf: all inferences, keyed by match-id
    :param slot: one of Geotagger.ALLOWED_SLOTS
    :param span: the location match backing this slot
    :param match_id: inference key to attach to; defaults to the span's own id
    :return:
    """
    mid = match_id or span.id
    inf = all_inf.get(mid, {})
    if not inf:
        all_inf[mid] = inf

    if slot in inf:
        # First match wins for a given slot.
        return
    if not span:
        raise Exception("Data integrity issue -- inferring location should have a non-null match")
    if slot not in Geotagger.ALLOWED_SLOTS:
        return

    ids = inf.get("match-ids", [])
    if not ids:
        inf["match-ids"] = ids
    ids.append(span.id)

    inf[slot] = {
        # "name": span.place.name,  # Normalized gazetteer name
        "matchtext": span.text  # Mention in text
    }
    if slot != "country":
        inf[slot]["feature"] = span.place.format_feature()


def score_inferences(inf, matches):
    # PASS 2. chose location and fill out chosen metadata.
    # RELATED location information -- use Country Code, ADM1 or the CC.ADM1 province_id to
    # indicate location coding.  This applies to all inferences started.

    for inf_id in inf:
        inference = inf[inf_id]
        if "scores" not in inference:
            continue

        scores = inference["scores"]
        top_score = 0
        top_match = None
        for scored_id in scores:
            score = scores[scored_id]
            if score > top_score:
                top_score = score
                top_match = matches[scored_id]

        if top_match is None:
            # FIX: no positive score (points can be 0 for very low-confidence matches);
            # the original dereferenced top_match here and raised AttributeError.
            continue

        feat, res = characterize_location(top_match.place, top_match.label)
        adm1 = top_match.place.adm1
        cc2 = top_match.place.country_code

        # Flesh out the metadata for best location for this mention.
        inference.update({
            "matchtext": top_match.text,
            "confidence": top_match.confidence,
            "resolution": res,
            "feature": feat,
            "lat": top_match.place.lat,
            "lon": top_match.place.lon,
            "province_id": make_HASC(cc2, adm1),
            "cc": cc2,
            "adm1": adm1})

    return inf


class Geotagger:
    """
    GEOTAGGER REST client. This wrapper around XlayerClient class abstracts a lot of the details
    of calling and parsing the geo-inferencing results from the API server.
    """
    ALLOWED_SLOTS = {"site", "city", "admin", "postal", "country"}

    def __init__(self, cfg: dict, debug=False, features=None):
        """
        :param cfg: config dict; reads "xponents.url" and "xponents.confidence.min"
        :param debug: enables DEBUG logging
        :param features: Xlayer feature list; defaults to ["geo", "postal", "taxons"]
        :raises Exception: if the service does not respond to a ping
        """
        self.debug = debug

        log_lvl = "INFO"
        if debug:
            log_lvl = "DEBUG"
        self.log = logger_config(log_lvl, __name__)

        # FIX: avoid a mutable default argument; same effective default as before.
        self.features = features if features is not None else ["geo", "postal", "taxons"]
        url = cfg.get("xponents.url")
        self.xponents = XlayerClient(url)
        self.confidence_min = int(cfg.get("xponents.confidence.min", 10))
        # On Xponents 100 point scale.

        # Test if client is alive
        if not self.xponents.ping():
            raise Exception("Service not available")

    def dbg(self, msg, *args, **kwargs):
        self.log.debug(msg, *args, **kwargs)

    def info(self, msg, *args, **kwargs):
        self.log.info(msg, *args, **kwargs)

    def error(self, msg, *args, **kwargs):
        self.log.error(msg, *args, **kwargs)

    def _location_info(self, spans: list) -> list:
        """Keep only PlaceCandidate spans at or above the configured confidence floor."""
        locs = []
        for t in spans:
            loc_conf = int(t.attrs.get("confidence", -1))
            if isinstance(t, PlaceCandidate):
                if 0 < self.confidence_min <= loc_conf:
                    locs.append(t)
        return locs

    def infer_locations(self, locs: list) -> dict:
        """
        Choose the best location from the list -- Most specific is preferred.
        :param locs: list of locations
        :return: dict of inferences keyed by match-id
        """
        # Order of preference:
        # 0. site location
        # 1. postal location w/related info
        # 2. qualified city ~ "City, Province" ... or just "City"
        # 3. Province
        # 4. Country
        #

        # LOGIC:
        # step 1 - key all locations by match-id, for easy lookup
        # step 2 - distill compound locations like a postal address to reduce matches to a single "geo inference"
        #          with one best location.
        # step 3 - organize all location mentions in final `inferences` listing.
        # step 4 - score inferences, as needed.

        inferences = dict()

        # PASS 1. inventory locations and award points
        matches = {}
        countries = dict()
        rendered_match_ids = dict()

        # Ensure matches are Place Candidates only -- location bearing information.
        for match in locs:
            if isinstance(match, PlaceCandidate):
                matches[match.id] = match

        # Loop through high resolution locations first.
        for mid in matches:
            match = matches[mid]
            loc = match.place  # Place obj
            attrs = match.attrs  # dict
            label = match.label  # entity label
            points = int(0.10 * (match.confidence or 10))

            # POSTAL. Max points ~ 40 or so, 10 points for each qualifying slot (city, prov, code, etc)
            if label == "postal" and "related" in attrs:
                inferences[mid] = {"match-ids": [mid], "start": match.start, "end": match.end}
                rendered_match_ids[mid] = mid
                points += 10
                related_geo = attrs["related"]
                _increment_count(countries, loc.country_code)
                if related_geo:
                    _infer_slot(inferences, "postal", match)
                    for k in related_geo:
                        # these match IDs indicate the full tuple's geographic connections.
                        points += 10
                        # dereference the postal match.
                        slot = related_geo[k]
                        slot_match = matches.get(slot.get("match-id"))
                        slot_text = related_geo[k]["matchtext"]
                        self.dbg("POSTAL slot %s = %s", k, related_geo[k])
                        if slot_match:
                            _infer_slot(inferences, k, slot_match, match_id=mid)
                            rendered_match_ids[slot_match.id] = mid
                        else:
                            self.info("Xponents BUG: missing match id for postal evidence. %s = %s", k, slot_text)
                inferences[match.id]["scores"] = {mid: points}

        # Iterate over remaining matches.
        for mid in matches:
            match = matches[mid]
            loc = match.place  # Place obj
            attrs = match.attrs  # dict
            label = match.label  # entity label

            if mid not in rendered_match_ids:
                # Ignore all upper case short names... for now. Especially if there is no related geography attached.
                if label == "place" and match.len < 8 and match.text.isupper():
                    self.info(" IGNORE %s", match.text)
                    continue

                if label == "postal":
                    # Such matches should have been associated through some hook above when all postal addresses
                    # gather related mentions.
                    self.dbg(" (BUG) IGNORE Postal %s", match.text)
                    continue

                # Backfill any entries that appear legit, but were not associated with other compound mentions like addresses.
                # Given this is a standalone location, there is no scoring.
                cc2, adm1 = match.place.country_code, match.place.adm1
                feat, res = characterize_location(match.place, match.label)
                inferences[mid] = {
                    "start": match.start, "end": match.end,
                    "matchtext": match.text,
                    "confidence": match.confidence,
                    "resolution": res,
                    "feature": feat,
                    "lat": match.place.lat,
                    "lon": match.place.lon,
                    "province_id": make_HASC(cc2, adm1),
                    "cc": cc2,
                    "adm1": adm1}
            else:
                # Score slots found in compound POSTAL or other matches
                points = int(0.10 * (match.confidence or 10))
                related_mid = rendered_match_ids[mid]

                if label in {"place", "postal"}:
                    _increment_count(countries, loc.country_code)
                    if is_academic(loc.feature_class, loc.feature_code):
                        _infer_slot(inferences, "site", match, match_id=related_mid)
                        points += 20
                    elif is_populated(loc.feature_class):
                        rules = attrs.get("rules", "").lower()
                        qualified = "adminname" in rules or "admincode" in rules
                        _infer_slot(inferences, "city", match, match_id=related_mid)
                        if qualified:
                            points += 20
                        else:
                            # Else location was not qualified fully with district, province, etc.. Just a city name.
                            self.dbg("CITY %s", match.text)
                            points += 10
                    elif is_administrative(loc.feature_class):
                        _infer_slot(inferences, "admin", match, match_id=related_mid)
                        points += 5
                        self.dbg("ADMIN %s", match.text)
                elif label == "country":
                    # No bonus points for country mention.
                    if match.len == 2:
                        self.dbg("IGNORE 2-char mention %s", match.text)
                    else:
                        _infer_slot(inferences, "country", match, match_id=related_mid)
                        _increment_count(countries, loc.country_code)
                        self.dbg("COUNTRY %s", loc)

                if related_mid in inferences:
                    # FIX: accumulate the score for this match. The original assigned a fresh
                    # dict here, clobbering scores recorded by earlier related matches so
                    # score_inferences() only ever saw the last one.
                    inferences[related_mid].setdefault("scores", {})[mid] = points
                else:
                    self.dbg("We missed some feature %s %s %s", label, match.id, match.text)

        score_inferences(inferences, matches)
        # Strip internal bookkeeping keys before returning.
        for inf_id in inferences:
            inference = inferences[inf_id]
            for k in ["match-ids", "scores"]:
                if k in inference:
                    del inference[k]
        return inferences

    def _mention_info(self, spans: list) -> list:
        """Collect non-geographic, non-filtered mentions (taxons, orgs, persons, ...)."""
        men = []
        for t in spans:
            if not isinstance(t, PlaceCandidate) and not t.filtered_out:
                men.append(t)
        return men

    def populate_mentions(self, spans: list) -> dict:
        """Group non-geo mention text by label (nationality, org, person, taxon)."""
        if not spans:
            return dict()

        def _add_slot(arr, slot_, txt):
            grp = arr.get(slot_, set())
            if not grp:
                arr[slot_] = grp
            grp.add(txt)

        men = dict()
        for t in spans:
            # All spans are either taxon, org, or person...;  taxon can break out into any flavor of taxonomic term
            catalog = None
            if t.attrs:
                catalog = t.attrs.get("cat") or t.attrs.get("catalog")

            # Handle special cases first, then more general ones.
            if catalog and catalog == "nationality":
                _add_slot(men, "nationality", t.text)
            elif t.label in {"org", "person", "taxon"}:
                _add_slot(men, t.label, t.text)
            else:
                self.info("Mention oddity ...%s", t.label)

        # To allow as valid JSON, we cannot use set(). Convert back to list.
        for slot in men:
            men[slot] = list(men[slot])
        return men

    def _nationality_countries(self, spans):
        """Derive ISO2 country codes from nationality taxons, e.g., 'nationality.US'."""
        countries = set()
        for t in spans:
            # Its really "catalog".  "cat" may happen in other systems.
            if not (t.attrs and "catalog" in t.attrs):
                continue
            if t.attrs["catalog"] == "nationality":
                taxon = t.attrs.get("name") or t.attrs.get("taxon")  # TODO: more convergence of attribute schemes.
                if taxon and "." in taxon:
                    nat = taxon.split(".")[1]
                    C = get_country(nat)
                    if C:
                        countries.add(C.cc_iso2)
        return countries

    def summarize(self, doc_id, text, lang=None) -> dict:
        """
        Call the XlayerClient process() endpoint,
        distills output tags into `geoinferences` and `mentions` (all other non-geo tags).
        A valid 2-char ISO 639 language code helps to tune

        :param doc_id: ID of text
        :param text: the text input
        :param lang: language of the text
        :return: A single geoinference
        """
        tags = self.xponents.process(doc_id, text, lang=lang, features=self.features, timeout=15)
        if self.debug:
            self.dbg("TAGS:%d", len(tags))

        output = dict()

        all_locations = self._location_info(tags)
        other_mentions = self._mention_info(tags)
        nationality_cc = self._nationality_countries(tags)
        # TODO -- use nationality in inference to add country info

        # Choose best locations
        output["geoinference"] = self.infer_locations(all_locations)

        # Extra info:  This info may be completely unrelated to geography
        output["mentions"] = self.populate_mentions(other_mentions)

        if nationality_cc:
            self.dbg("UNUSED - Nationalities? %s", nationality_cc)

        return output


def print_results(arr):
    """
    :param arr: array of Annotations or TextMatch
    :return:
    """
    for a in arr:
        if isinstance(a, TextMatch):
            if a.filtered_out:
                print("{} Excluded".format(str(a)))
            else:
                print(a)
        else:
            print(a)


def print_match(match: TextMatch):
    """
    :param match:
    :return:
    """
    filtered = ""
    if match.filtered_out:
        filtered = "FILTERED-OUT"
    if match.label == "place":
        cc = match.attrs.get("cc")
        fc = match.attrs.get("feat_class")
        fcode = match.attrs.get("feat_code")
        print(match, f"\t\t\tcountry:{cc}, feature:{fc}/{fcode} {filtered}")
    else:
        print(match, f"\n\tATTRS{match.attrs} {filtered}")


def process_text(extractor, txt, docid="$DOC-ID$", features=None, preferred_countries=None, preferred_locations=None):
    """Run one extraction call and print each match. Defaults avoid mutable arguments;
    None behaves exactly as the former [] defaults did."""
    result = extractor.process(docid, txt, features=features,
                               timeout=90,
                               preferred_countries=preferred_countries,
                               preferred_locations=preferred_locations)
    print(f"=========DOCID {docid}")
    print("TEXT", txt[0:200])
    print("Matches\n============")
    for match in result:
        print_match(match)


def main_demo():
    import os
    from traceback import format_exc
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("input", help="your input")
    ap.add_argument("--service-url", help="XLayer server host:port", default="localhost:8787")
    ap.add_argument("--docid", help="your doc id")
    ap.add_argument("--lines", action="store_true", help="process your inputfile as one line per call")
    ap.add_argument("--text", action="store_true", help="<input> arg is a UTF-8 string to process")
    ap.add_argument("--options",
                    help="your service options to send with each request, e.g., 'lowercase,clean_input,revgeo'",
                    default=None)
    ap.add_argument("--features", help="Feature set e.g., 'geo,patterns,taxons'", default="geo,patterns,taxons")
    ap.add_argument("--countries", help="Countries set e.g., 'AF,US,ID,BR,....", default=None)
    ap.add_argument("--locations", help="Location geohashs set e.g., 'u23,u34,....", default=None)
    ap.add_argument("--debug", default=False, action="store_true")
    args = ap.parse_args()

    service_url = args.service_url
    xtractor = XlayerClient(service_url, options=args.options)
    xtractor.debug = args.debug
    feat = ["geo"]
    countries = None
    locations = None
    if args.features:
        feat = args.features.split(',')
    if args.countries:
        countries = args.countries.split(',')
    if args.locations:
        locations = args.locations.split(',')

    print("Ping server (timeout=5s)....")
    try:
        xtractor.ping(timeout=5)
    except Exception as runErr:
        print(str(runErr))
        sys.exit(1)

    fpath = os.path.abspath(args.input)

    # ======================================
    # Support for arbitrary amounts of text
    #
    if args.text:
        process_text(xtractor, fpath, docid="test-doc-#123", features=feat,
                     preferred_countries=countries, preferred_locations=locations)
    # ======================================
    # Support data as one text record per line in a file
    #
    elif args.lines or args.input.endswith(".json"):
        print("INPUT: from individual lines from input file\n\n")
        is_json = args.input.endswith(".json")
        try:
            with open(fpath, 'r', encoding="UTF-8") as fh:
                lineNum = 0
                for line in fh:
                    textbuf = line.strip()
                    lineNum += 1
                    if is_json:
                        if not textbuf or textbuf.startswith("#"):
                            continue
                        textbuf = json.loads(textbuf).get("text")
                        if not textbuf:
                            print("'text' value required in JSON")
                            continue

                    test_id = "line{}".format(lineNum)
                    process_text(xtractor, textbuf, docid=test_id, features=feat,
                                 preferred_countries=countries, preferred_locations=locations)

        except Exception as runErr:
            print(format_exc(limit=5))

    # ======================================
    # Use a single file as the source text to process
    #
    elif fpath:
        file_id = os.path.basename(fpath)
        try:
            with open(fpath, 'r', encoding="UTF-8") as fh:
                process_text(xtractor, fh.read(), docid=file_id, features=feat,
                             preferred_countries=countries, preferred_locations=locations)
        except Exception as runErr:
            print(format_exc(limit=5))


if __name__ == '__main__':
    main_demo()
20class XlayerClient: 21 """ 22 Xponents REST client -- low level utility. See also Geotagger class for a better abstraction. 23 ``` 24 client = XlayerClient(url) 25 client.process( "paragraph of text...." ) ---> returns list of raw matches, geocoded. 26 ``` 27 """ 28 def __init__(self, server, options=""): 29 """ 30 @param server: URL for the service. E.g., host:port or 'http://SERVER/xlayer/rest/process'. 31 @keyword options: STRING. a comma-separated list of options to send with each request. 32 There are no default options supported. 33 """ 34 self.server = server 35 if not server.startswith("http"): 36 self.server = f"http://{server}/xlayer/rest/process" 37 self.server_control = f"http://{server}/xlayer/rest/control" 38 else: 39 # User provided a full URL. 40 self.server_control = server.replace('/process', '/control') 41 self.debug = False 42 self.default_options = options 43 44 def stop(self, timeout=30): 45 """ 46 Timeout of 30 seconds is used here so calls do not hang indefinitely. 47 The service URL is inferred: /process and /control endpoints should be next to each other. 48 :return: True if successful or if "Connection aborted" ConnectionError occurs 49 """ 50 try: 51 response = requests.get("{}/stop".format(self.server_control), timeout=timeout) 52 if response.status_code != 200: 53 return response.raise_for_status() 54 except requests.exceptions.ConnectionError as err: 55 return "Connection aborted" in str(err) 56 return False 57 58 def ping(self, timeout=30): 59 """ 60 Timeout of 30 seconds is used here so calls do not hang indefinitely. 61 :return: True if successful. 
62 """ 63 response = requests.get("{}/ping".format(self.server_control), timeout=timeout) 64 if response.status_code != 200: 65 return response.raise_for_status() 66 return True 67 68 def process(self, docid, text, lang=None, features=["geo"], timeout=10, minlen=-1, 69 preferred_countries=None, preferred_locations=None): 70 """ 71 Process text, extracting some entities 72 73 lang = "xx" or None, where "xx" is a ISO language 2-char code. 74 For general Chinese/Japanese/Korean (CJK) support, use lang = 'cjk' 75 Language IDs that have some additional tuning include: 76 "ja", "th", "tr", "id", "ar", "fa", "ur", "ru", "it", 77 "pt", "de", "nl", "es", "en", "tl", "ko", "vi" 78 Behavior: Arabic (ar) or CJK (cjk) lang ID directs tagger to use language-specific tokenizers 79 Otherwise other lang ID provided just invokes language-specific stopword filters 80 81 features are places, coordinates, countries, orgs, persons, patterns, postal. 82 83 feature aliases "geo" can be used to get All Geographic entities (places,coordinates,countries) 84 feature "taxons" can get at any Taxon "taxons", "persons", "orgs". As of Xponents 3.6 this reports ALL 85 Other taxons available in TaxCat tagger. "all_taxons" is offered as a means to distinguish old and new behavior. 86 feature "postal" will tag obvious, qualified postal codes that are paired with a CITY, PROVINCE, or COUNTRY tag. 87 feature "patterns" is an alias for dates and any other pattern-based extractors. For now "dates" is only one 88 feature "codes" will tag, use and report coded information for any place; primarily administrative boundaries 89 90 options are not observed by Xlayer "Xgeo", but you can adapt your own service 91 to accomodate such options. Possible options are clean_input, lowercase, for example: 92 93 * clean_input scrubs the input text if it has HTML or other content in it. 94 * lowercase allows the tagging to pass on through lower case matches. 
95 96 but interpretation of "clean text" and "lower case" support is subjective. 97 so they are not supported out of the box here. 98 :param docid: identifier of transaction 99 :param text: Unicode text to process 100 :param lang: One of ["ar", "cjk", .... other ISO language IDs] 101 :param features: list of geo OR [places, coordinates, countries], orgs, persons, patterns, taxons 102 :param timeout: default to 10 seconds; If you think your processing takes longer, 103 adjust if you see exceptions. 104 :param minlen: minimum length of matches that are unqualified. To reduce noise in geotags. Server has a default 105 of 4 chars for general purpose noise filtering. 106 :param preferred_countries: Array of country codes representing those which are preferred fall backs when 107 there are ambiguous location names. 108 :param preferred_locations: Array of geohash representing general area of desired preferred matches 109 :return: array of TextMatch objects or empty array. 110 """ 111 json_request = {'docid': docid, 'text': text} 112 if self.default_options: 113 json_request['options'] = self.default_options 114 if features: 115 json_request['features'] = ','.join(features) 116 if preferred_countries: 117 json_request['preferred_countries'] = preferred_countries 118 if preferred_locations: 119 json_request['preferred_locations'] = preferred_locations 120 if lang: 121 json_request['lang'] = lang 122 if minlen and int(minlen) > 0: 123 json_request['minlen'] = minlen 124 125 response = requests.post(self.server, json=json_request, timeout=timeout) 126 if response.status_code != 200: 127 return response.raise_for_status() 128 129 json_content = response.json() 130 131 if self.debug: 132 print(json.dumps(json_content, indent=2)) 133 if 'response' in json_content: 134 # Get the response metadata block 135 # metadata = json_content['response'] 136 pass 137 138 annots = [] 139 if 'annotations' in json_content: 140 aj = json_content['annotations'] 141 for annot in aj: 142 # Desire 
to move to "label" away from "type" 143 label = annot.get("type") 144 if label in GEOCODINGS: 145 tm = PlaceCandidate(annot.get('matchtext'), annot.get('offset'), None) 146 else: 147 tm = TextMatch(annot.get('matchtext'), annot.get('offset'), None) 148 tm.populate(annot) 149 annots.append(tm) 150 151 return annots
Xponents REST client -- low level utility. See also Geotagger class for a better abstraction.
client = XlayerClient(url)
client.process( "paragraph of text...." ) ---> returns list of raw matches, geocoded.
28 def __init__(self, server, options=""): 29 """ 30 @param server: URL for the service. E.g., host:port or 'http://SERVER/xlayer/rest/process'. 31 @keyword options: STRING. a comma-separated list of options to send with each request. 32 There are no default options supported. 33 """ 34 self.server = server 35 if not server.startswith("http"): 36 self.server = f"http://{server}/xlayer/rest/process" 37 self.server_control = f"http://{server}/xlayer/rest/control" 38 else: 39 # User provided a full URL. 40 self.server_control = server.replace('/process', '/control') 41 self.debug = False 42 self.default_options = options
@param server: URL for the service. E.g., host:port or 'http://SERVER/xlayer/rest/process'. @keyword options: STRING. a comma-separated list of options to send with each request. There are no default options supported.
44 def stop(self, timeout=30): 45 """ 46 Timeout of 30 seconds is used here so calls do not hang indefinitely. 47 The service URL is inferred: /process and /control endpoints should be next to each other. 48 :return: True if successful or if "Connection aborted" ConnectionError occurs 49 """ 50 try: 51 response = requests.get("{}/stop".format(self.server_control), timeout=timeout) 52 if response.status_code != 200: 53 return response.raise_for_status() 54 except requests.exceptions.ConnectionError as err: 55 return "Connection aborted" in str(err) 56 return False
Timeout of 30 seconds is used here so calls do not hang indefinitely. The service URL is inferred: /process and /control endpoints should be next to each other. :return: True if successful or if "Connection aborted" ConnectionError occurs
58 def ping(self, timeout=30): 59 """ 60 Timeout of 30 seconds is used here so calls do not hang indefinitely. 61 :return: True if successful. 62 """ 63 response = requests.get("{}/ping".format(self.server_control), timeout=timeout) 64 if response.status_code != 200: 65 return response.raise_for_status() 66 return True
Timeout of 30 seconds is used here so calls do not hang indefinitely. :return: True if successful.
68 def process(self, docid, text, lang=None, features=["geo"], timeout=10, minlen=-1, 69 preferred_countries=None, preferred_locations=None): 70 """ 71 Process text, extracting some entities 72 73 lang = "xx" or None, where "xx" is a ISO language 2-char code. 74 For general Chinese/Japanese/Korean (CJK) support, use lang = 'cjk' 75 Language IDs that have some additional tuning include: 76 "ja", "th", "tr", "id", "ar", "fa", "ur", "ru", "it", 77 "pt", "de", "nl", "es", "en", "tl", "ko", "vi" 78 Behavior: Arabic (ar) or CJK (cjk) lang ID directs tagger to use language-specific tokenizers 79 Otherwise other lang ID provided just invokes language-specific stopword filters 80 81 features are places, coordinates, countries, orgs, persons, patterns, postal. 82 83 feature aliases "geo" can be used to get All Geographic entities (places,coordinates,countries) 84 feature "taxons" can get at any Taxon "taxons", "persons", "orgs". As of Xponents 3.6 this reports ALL 85 Other taxons available in TaxCat tagger. "all_taxons" is offered as a means to distinguish old and new behavior. 86 feature "postal" will tag obvious, qualified postal codes that are paired with a CITY, PROVINCE, or COUNTRY tag. 87 feature "patterns" is an alias for dates and any other pattern-based extractors. For now "dates" is only one 88 feature "codes" will tag, use and report coded information for any place; primarily administrative boundaries 89 90 options are not observed by Xlayer "Xgeo", but you can adapt your own service 91 to accomodate such options. Possible options are clean_input, lowercase, for example: 92 93 * clean_input scrubs the input text if it has HTML or other content in it. 94 * lowercase allows the tagging to pass on through lower case matches. 95 96 but interpretation of "clean text" and "lower case" support is subjective. 97 so they are not supported out of the box here. 
98 :param docid: identifier of transaction 99 :param text: Unicode text to process 100 :param lang: One of ["ar", "cjk", .... other ISO language IDs] 101 :param features: list of geo OR [places, coordinates, countries], orgs, persons, patterns, taxons 102 :param timeout: default to 10 seconds; If you think your processing takes longer, 103 adjust if you see exceptions. 104 :param minlen: minimum length of matches that are unqualified. To reduce noise in geotags. Server has a default 105 of 4 chars for general purpose noise filtering. 106 :param preferred_countries: Array of country codes representing those which are preferred fall backs when 107 there are ambiguous location names. 108 :param preferred_locations: Array of geohash representing general area of desired preferred matches 109 :return: array of TextMatch objects or empty array. 110 """ 111 json_request = {'docid': docid, 'text': text} 112 if self.default_options: 113 json_request['options'] = self.default_options 114 if features: 115 json_request['features'] = ','.join(features) 116 if preferred_countries: 117 json_request['preferred_countries'] = preferred_countries 118 if preferred_locations: 119 json_request['preferred_locations'] = preferred_locations 120 if lang: 121 json_request['lang'] = lang 122 if minlen and int(minlen) > 0: 123 json_request['minlen'] = minlen 124 125 response = requests.post(self.server, json=json_request, timeout=timeout) 126 if response.status_code != 200: 127 return response.raise_for_status() 128 129 json_content = response.json() 130 131 if self.debug: 132 print(json.dumps(json_content, indent=2)) 133 if 'response' in json_content: 134 # Get the response metadata block 135 # metadata = json_content['response'] 136 pass 137 138 annots = [] 139 if 'annotations' in json_content: 140 aj = json_content['annotations'] 141 for annot in aj: 142 # Desire to move to "label" away from "type" 143 label = annot.get("type") 144 if label in GEOCODINGS: 145 tm = 
PlaceCandidate(annot.get('matchtext'), annot.get('offset'), None) 146 else: 147 tm = TextMatch(annot.get('matchtext'), annot.get('offset'), None) 148 tm.populate(annot) 149 annots.append(tm) 150 151 return annots
Process text, extracting some entities
lang = "xx" or None, where "xx" is an ISO 2-char language code. For general Chinese/Japanese/Korean (CJK) support, use lang = 'cjk'. Language IDs that have some additional tuning include: "ja", "th", "tr", "id", "ar", "fa", "ur", "ru", "it", "pt", "de", "nl", "es", "en", "tl", "ko", "vi". Behavior: an Arabic (ar) or CJK (cjk) lang ID directs the tagger to use language-specific tokenizers; any other lang ID just invokes language-specific stopword filters.
features are places, coordinates, countries, orgs, persons, patterns, postal.
feature aliases "geo" can be used to get All Geographic entities (places,coordinates,countries) feature "taxons" can get at any Taxon "taxons", "persons", "orgs". As of Xponents 3.6 this reports ALL Other taxons available in TaxCat tagger. "all_taxons" is offered as a means to distinguish old and new behavior. feature "postal" will tag obvious, qualified postal codes that are paired with a CITY, PROVINCE, or COUNTRY tag. feature "patterns" is an alias for dates and any other pattern-based extractors. For now "dates" is only one feature "codes" will tag, use and report coded information for any place; primarily administrative boundaries
options are not observed by Xlayer "Xgeo", but you can adapt your own service to accommodate such options. Possible options are clean_input and lowercase, for example:
- clean_input scrubs the input text if it has HTML or other content in it.
lowercase allows the tagging to pass on through lower case matches.
but interpretation of "clean text" and "lower case" support is subjective. so they are not supported out of the box here. :param docid: identifier of transaction :param text: Unicode text to process :param lang: One of ["ar", "cjk", .... other ISO language IDs] :param features: list of geo OR [places, coordinates, countries], orgs, persons, patterns, taxons :param timeout: default to 10 seconds; If you think your processing takes longer, adjust if you see exceptions. :param minlen: minimum length of matches that are unqualified. To reduce noise in geotags. Server has a default of 4 chars for general purpose noise filtering. :param preferred_countries: Array of country codes representing those which are preferred fall backs when there are ambiguous location names. :param preferred_locations: Array of geohash representing general area of desired preferred matches :return: array of TextMatch objects or empty array.
class Geotagger:
    """
    GEOTAGGER REST client. This wrapper around the XlayerClient class abstracts a lot of the details
    of calling and parsing the geo-inferencing results from the API server.
    """
    # Slot names used when distilling compound matches (e.g. postal addresses) into inferences.
    ALLOWED_SLOTS = {"site", "city", "admin", "postal", "country"}

    def __init__(self, cfg: dict, debug=False, features=None):
        """
        :param cfg: configuration dict; reads "xponents.url" and optionally "xponents.confidence.min".
        :param debug: if True, log at DEBUG level.
        :param features: list of Xponents features to request; defaults to ["geo", "postal", "taxons"].
        :raises Exception: if the remote service does not answer the initial ping.
        """
        if features is None:
            # Default feature set; a None default avoids the shared-mutable-default-argument pitfall.
            features = ["geo", "postal", "taxons"]

        self.debug = debug

        log_lvl = "INFO"
        if debug:
            log_lvl = "DEBUG"
        self.log = logger_config(log_lvl, __name__)

        self.features = features
        url = cfg.get("xponents.url")
        self.xponents = XlayerClient(url)
        self.confidence_min = int(cfg.get("xponents.confidence.min", 10))
        # On Xponents 100 point scale.

        # Test if client is alive
        if not self.xponents.ping():
            raise Exception("Service not available")

    def dbg(self, msg, *args, **kwargs):
        """Log at DEBUG level."""
        self.log.debug(msg, *args, **kwargs)

    def info(self, msg, *args, **kwargs):
        """Log at INFO level."""
        self.log.info(msg, *args, **kwargs)

    def error(self, msg, *args, **kwargs):
        """Log at ERROR level."""
        self.log.error(msg, *args, **kwargs)

    def _location_info(self, spans: list) -> list:
        """
        Filter spans down to PlaceCandidate entries meeting the configured confidence floor.

        NOTE(review): t.attrs is read before the isinstance check -- assumes every span
        carries a dict-like attrs; confirm against TextMatch contract.
        """
        locs = []
        for t in spans:
            loc_conf = int(t.attrs.get("confidence", -1))
            if isinstance(t, PlaceCandidate):
                if 0 < self.confidence_min <= loc_conf:
                    locs.append(t)
        return locs

    def infer_locations(self, locs: list) -> dict:
        """
        Choose the best location from the list -- Most specific is preferred.
        :param locs: list of locations
        :return: dict of inferences, keyed by match-id
        """
        # Order of preference:
        #  0. site location
        #  1. postal location w/related info
        #  2. qualified city ~ "City, Province" ... or just "City"
        #  3. Province
        #  4. Country
        #
        # LOGIC:
        # step 1 - key all locations by match-id, for easy lookup
        # step 2 - distill compound locations like a postal address to reduce matches to a single "geo inference"
        #          with one best location.
        # step 3 - organize all location mentions in final `inferences` listing.
        # step 4 - score inferences, as needed.

        inferences = dict()

        # PASS 1. inventory locations and award points
        matches = {}
        # NOTE(review): `countries` tallies country mentions but is never read in this method;
        # presumably reserved for future country-level scoring -- confirm.
        countries = dict()
        rendered_match_ids = dict()

        # Ensure matches are Place Candidates only -- location bearing information.
        for match in locs:
            if isinstance(match, PlaceCandidate):
                matches[match.id] = match

        # Loop through high resolution locations first.
        for mid in matches:
            match = matches[mid]
            loc = match.place  # Place obj
            attrs = match.attrs  # dict
            label = match.label  # entity label
            # Base points scale with tagger confidence.
            points = int(0.10 * (match.confidence or 10))

            # POSTAL. Max points ~ 40 or so, 10 points for each qualifying slot (city, prov, code, etc)
            if label == "postal" and "related" in attrs:
                inferences[mid] = {"match-ids": [mid], "start": match.start, "end": match.end}
                rendered_match_ids[mid] = mid
                points += 10
                related_geo = attrs["related"]
                _increment_count(countries, loc.country_code)
                if related_geo:
                    _infer_slot(inferences, "postal", match)
                    for k in related_geo:
                        # these match IDs indicate the full tuple's geographic connections.
                        points += 10
                        # dereference the postal match.
                        slot = related_geo[k]
                        slot_match = matches.get(slot.get("match-id"))
                        slot_text = related_geo[k]["matchtext"]
                        self.dbg("POSTAL slot %s = %s", k, related_geo[k])
                        if slot_match:
                            _infer_slot(inferences, k, slot_match, match_id=mid)
                            rendered_match_ids[slot_match.id] = mid
                        else:
                            self.info("Xponents BUG: missing match id for postal evidence. %s = %s", k, slot_text)
                inferences[match.id]["scores"] = {mid: points}

        # Iterate over remaining matches.
        for mid in matches:
            match = matches[mid]
            loc = match.place  # Place obj
            attrs = match.attrs  # dict
            label = match.label  # entity label

            if mid not in rendered_match_ids:
                # Ignore all upper case short names... for now. Especially if there is no related geography attached.
                if label == "place" and match.len < 8 and match.text.isupper():
                    self.info(" IGNORE %s", match.text)
                    continue

                if label == "postal":
                    # Such matches should have been associated through some hook above when all postal addresses
                    # gather related mentions.
                    self.dbg(" (BUG) IGNORE Postal %s", match.text)
                    continue

                # Backfill any entries that appear legit, but were not associated with other compound mentions like addresses.
                # Given this is a standalone location, there is no scoring.
                cc2, adm1 = match.place.country_code, match.place.adm1
                feat, res = characterize_location(match.place, match.label)
                inferences[mid] = {
                    "start": match.start, "end": match.end,
                    "matchtext": match.text,
                    "confidence": match.confidence,
                    "resolution": res,
                    "feature": feat,
                    "lat": match.place.lat,
                    "lon": match.place.lon,
                    "province_id": make_HASC(cc2, adm1),
                    "cc": cc2,
                    "adm1": adm1}
            else:
                # Score slots found in compound POSTAL or other matches
                points = int(0.10 * (match.confidence or 10))
                related_mid = rendered_match_ids[mid]

                if label in {"place", "postal"}:
                    _increment_count(countries, loc.country_code)
                    if is_academic(loc.feature_class, loc.feature_code):
                        _infer_slot(inferences, "site", match, match_id=related_mid)
                        points += 20
                    elif is_populated(loc.feature_class):
                        rules = attrs.get("rules", "").lower()
                        qualified = "adminname" in rules or "admincode" in rules
                        _infer_slot(inferences, "city", match, match_id=related_mid)
                        if qualified:
                            points += 20
                        else:
                            # Else location was not qualified fully with district, province, etc.. Just a city name.
                            self.dbg("CITY %s", match.text)
                            points += 10
                    elif is_administrative(loc.feature_class):
                        _infer_slot(inferences, "admin", match, match_id=related_mid)
                        points += 5
                        self.dbg("ADMIN %s", match.text)
                elif label == "country":
                    # No bonus points for country mention.
                    if match.len == 2:
                        self.dbg("IGNORE 2-char mention %s", match.text)
                    else:
                        _infer_slot(inferences, "country", match, match_id=related_mid)
                        _increment_count(countries, loc.country_code)
                        self.dbg("COUNTRY %s", loc)

                if related_mid in inferences:
                    inferences[related_mid]["scores"] = {mid: points}
                else:
                    self.dbg("We missed some feature %s %s %s", label, match.id, match.text)

        score_inferences(inferences, matches)
        # Strip internal bookkeeping keys before returning the result to callers.
        for inf_id in inferences:
            inference = inferences[inf_id]
            for k in ["match-ids", "scores"]:
                if k in inference:
                    del inference[k]
        return inferences

    def _mention_info(self, spans: list) -> list:
        """Return the non-geographic, unfiltered spans -- i.e., everything that is not a PlaceCandidate."""
        men = []
        for t in spans:
            if not isinstance(t, PlaceCandidate) and not t.filtered_out:
                men.append(t)
        return men

    def populate_mentions(self, spans: list) -> dict:
        """
        Bucket non-geo mentions by slot (nationality, org, person, taxon), de-duplicated.

        :param spans: non-geographic TextMatch spans.
        :return: dict of slot -> list of unique mention texts (lists, so output is JSON-safe).
        """
        if not spans:
            return dict()

        def _add_slot(arr, slot_, txt):
            # De-duplicate mention text per slot using a set.
            grp = arr.get(slot_, set([]))
            if not grp:
                arr[slot_] = grp
            grp.add(txt)

        men = dict()
        for t in spans:
            # All spans are either taxon, org, or person...; taxon can break out into any flavor of taxonomic term
            catalog = None
            if t.attrs:
                catalog = t.attrs.get("cat") or t.attrs.get("catalog")

            # Handle special cases first, then more general ones.
            if catalog and catalog == "nationality":
                _add_slot(men, "nationality", t.text)
            elif t.label in {"org", "person", "taxon"}:
                _add_slot(men, t.label, t.text)
            else:
                self.info("Mention oddity ...%s", t.label)

        # To allow as valid JSON, we cannot use set(). Convert back to list.
        for slot in men:
            men[slot] = list(men[slot])
        return men

    def _nationality_countries(self, spans):
        """
        Collect ISO2 country codes implied by nationality taxons, e.g. "nationality.XX".
        """
        countries = set([])
        for t in spans:
            # Its really "catalog". "cat" may happen in other systems.
            if not (t.attrs and "catalog" in t.attrs):
                continue
            if t.attrs["catalog"] == "nationality":
                taxon = t.attrs.get("name") or t.attrs.get("taxon")  # TODO: more convergence of attribute schemes.
                if taxon and "." in taxon:
                    nat = taxon.split(".")[1]
                    C = get_country(nat)
                    if C:
                        countries.add(C.cc_iso2)
        return countries

    def summarize(self, doc_id, text, lang=None) -> dict:
        """
        Call the XlayerClient process() endpoint,
        distills output tags into `geoinference` and `mentions` (all other non-geo tags).
        A valid 2-char ISO 639 language code helps to tune the tagging.

        :param doc_id: ID of text
        :param text: the text input
        :param lang: language of the text
        :return: dict with "geoinference" and "mentions" keys
        """
        tags = self.xponents.process(doc_id, text, lang=lang, features=self.features, timeout=15)
        if self.debug:
            self.dbg("TAGS:%d", len(tags))

        output = dict()

        all_locations = self._location_info(tags)
        other_mentions = self._mention_info(tags)
        nationality_cc = self._nationality_countries(tags)
        # TODO -- use nationality in inference to add country info

        # Choose best locations
        output["geoinference"] = self.infer_locations(all_locations)

        # Extra info: This info may be completely unrelated to geography
        output["mentions"] = self.populate_mentions(other_mentions)

        if nationality_cc:
            self.dbg("UNUSED - Nationalities? %s", nationality_cc)

        return output
GEOTAGGER REST client. This wrapper around the XlayerClient class abstracts a lot of the details of calling and parsing the geo-inferencing results from the API server.
    def infer_locations(self, locs: list) -> dict:
        """
        Choose the best location from the list -- Most specific is preferred.
        :param locs: list of locations; only PlaceCandidate entries are considered.
        :return: dict of inferences, keyed by match-id.
        """
        # Order of preference:
        #  0. site location
        #  1. postal location w/related info
        #  2. qualified city ~ "City, Province" ... or just "City"
        #  3. Province
        #  4. Country
        #
        # LOGIC:
        # step 1 - key all locations by match-id, for easy lookup
        # step 2 - distill compound locations like a postal address to reduce matches to a single "geo inference"
        #          with one best location.
        # step 3 - organize all location mentions in final `inferences` listing.
        # step 4 - score inferences, as needed.

        inferences = dict()

        # PASS 1. inventory locations and award points
        matches = {}
        # NOTE(review): `countries` tallies country mentions but is never read in this method;
        # presumably reserved for future country-level scoring -- confirm.
        countries = dict()
        rendered_match_ids = dict()

        # Ensure matches are Place Candidates only -- location bearing information.
        for match in locs:
            if isinstance(match, PlaceCandidate):
                matches[match.id] = match

        # Loop through high resolution locations first.
        for mid in matches:
            match = matches[mid]
            loc = match.place  # Place obj
            attrs = match.attrs  # dict
            label = match.label  # entity label
            # Base points scale with tagger confidence.
            points = int(0.10 * (match.confidence or 10))

            # POSTAL. Max points ~ 40 or so, 10 points for each qualifying slot (city, prov, code, etc)
            if label == "postal" and "related" in attrs:
                inferences[mid] = {"match-ids": [mid], "start": match.start, "end": match.end}
                rendered_match_ids[mid] = mid
                points += 10
                related_geo = attrs["related"]
                _increment_count(countries, loc.country_code)
                if related_geo:
                    _infer_slot(inferences, "postal", match)
                    for k in related_geo:
                        # these match IDs indicate the full tuple's geographic connections.
                        points += 10
                        # dereference the postal match.
                        slot = related_geo[k]
                        slot_match = matches.get(slot.get("match-id"))
                        slot_text = related_geo[k]["matchtext"]
                        self.dbg("POSTAL slot %s = %s", k, related_geo[k])
                        if slot_match:
                            _infer_slot(inferences, k, slot_match, match_id=mid)
                            rendered_match_ids[slot_match.id] = mid
                        else:
                            self.info("Xponents BUG: missing match id for postal evidence. %s = %s", k, slot_text)
                inferences[match.id]["scores"] = {mid: points}

        # Iterate over remaining matches.
        for mid in matches:
            match = matches[mid]
            loc = match.place  # Place obj
            attrs = match.attrs  # dict
            label = match.label  # entity label

            if mid not in rendered_match_ids:
                # Ignore all upper case short names... for now. Especially if there is no related geography attached.
                if label == "place" and match.len < 8 and match.text.isupper():
                    self.info(" IGNORE %s", match.text)
                    continue

                if label == "postal":
                    # Such matches should have been associated through some hook above when all postal addresses
                    # gather related mentions.
                    self.dbg(" (BUG) IGNORE Postal %s", match.text)
                    continue

                # Backfill any entries that appear legit, but were not associated with other compound mentions like addresses.
                # Given this is a standalone location, there is no scoring.
                cc2, adm1 = match.place.country_code, match.place.adm1
                feat, res = characterize_location(match.place, match.label)
                inferences[mid] = {
                    "start": match.start, "end": match.end,
                    "matchtext": match.text,
                    "confidence": match.confidence,
                    "resolution": res,
                    "feature": feat,
                    "lat": match.place.lat,
                    "lon": match.place.lon,
                    "province_id": make_HASC(cc2, adm1),
                    "cc": cc2,
                    "adm1": adm1}
            else:
                # Score slots found in compound POSTAL or other matches
                points = int(0.10 * (match.confidence or 10))
                related_mid = rendered_match_ids[mid]

                if label in {"place", "postal"}:
                    _increment_count(countries, loc.country_code)
                    if is_academic(loc.feature_class, loc.feature_code):
                        _infer_slot(inferences, "site", match, match_id=related_mid)
                        points += 20
                    elif is_populated(loc.feature_class):
                        rules = attrs.get("rules", "").lower()
                        qualified = "adminname" in rules or "admincode" in rules
                        _infer_slot(inferences, "city", match, match_id=related_mid)
                        if qualified:
                            points += 20
                        else:
                            # Else location was not qualified fully with district, province, etc.. Just a city name.
                            self.dbg("CITY %s", match.text)
                            points += 10
                    elif is_administrative(loc.feature_class):
                        _infer_slot(inferences, "admin", match, match_id=related_mid)
                        points += 5
                        self.dbg("ADMIN %s", match.text)
                elif label == "country":
                    # No bonus points for country mention.
                    if match.len == 2:
                        self.dbg("IGNORE 2-char mention %s", match.text)
                    else:
                        _infer_slot(inferences, "country", match, match_id=related_mid)
                        _increment_count(countries, loc.country_code)
                        self.dbg("COUNTRY %s", loc)

                if related_mid in inferences:
                    inferences[related_mid]["scores"] = {mid: points}
                else:
                    self.dbg("We missed some feature %s %s %s", label, match.id, match.text)

        score_inferences(inferences, matches)
        # Strip internal bookkeeping keys before returning the result to callers.
        for inf_id in inferences:
            inference = inferences[inf_id]
            for k in ["match-ids", "scores"]:
                if k in inference:
                    del inference[k]
        return inferences
Choose the best location from the list -- Most specific is preferred. :param locs: list of locations :return:
478 def summarize(self, doc_id, text, lang=None) -> dict: 479 """ 480 Call the XlayerClient process() endpoint, 481 distills output tags into `geoinferences` and `mentions` (all other non-geo tags). 482 A valid 2-char ISO 639 language code helps to tune 483 484 :param doc_id: ID of text 485 :param text: the text input 486 :param lang: language of the text 487 :return: A single geoinference 488 """ 489 tags = self.xponents.process(doc_id, text, lang=lang, features=self.features, timeout=15) 490 if self.debug: 491 self.dbg("TAGS:%d", len(tags)) 492 493 output = dict() 494 495 all_locations = self._location_info(tags) 496 other_mentions = self._mention_info(tags) 497 nationality_cc = self._nationality_countries(tags) 498 # TODO -- use nationality in inference to add country info 499 500 # Choose best locations 501 output["geoinference"] = self.infer_locations(all_locations) 502 503 # Extra info: This info may be completely unrelated to geography 504 output["mentions"] = self.populate_mentions(other_mentions) 505 506 if nationality_cc: 507 self.dbg("UNUSED - Nationalities? %s", nationality_cc) 508 509 return output
Call the XlayerClient process() endpoint,
distills output tags into geoinferences and mentions (all other non-geo tags).
A valid 2-char ISO 639 language code helps to tune the tagging (language-specific tokenizers and stopword filters).
:param doc_id: ID of text :param text: the text input :param lang: language of the text :return: A single geoinference
def print_results(arr):
    """
    Print each annotation; filtered-out TextMatch entries are flagged as excluded.

    :param arr: array of Annotations or TextMatch
    :return:
    """
    for item in arr:
        if isinstance(item, TextMatch) and item.filtered_out:
            print("{} Excluded".format(str(item)))
        else:
            print(item)
:param arr: array of Annotations or TextMatch :return:
def print_match(match: TextMatch):
    """
    Print a one-line rendering of a single match, with geocoding detail for places.

    :param match: a TextMatch to render
    :return:
    """
    filtered = "FILTERED-OUT" if match.filtered_out else ""
    if match.label == "place":
        cc = match.attrs.get("cc")
        fc = match.attrs.get("feat_class")
        fcode = match.attrs.get("feat_code")
        print(match, f"\t\t\tcountry:{cc}, feature:{fc}/{fcode} {filtered}")
    else:
        print(match, f"\n\tATTRS{match.attrs} {filtered}")
:param match: :return: