opensextant.xlayer

Created on Mar 14, 2016

@author: ubaldino

View Source

  1# -*- coding: utf-8 -*-
  2"""
  3Created on Mar 14, 2016
  4
  5@author: ubaldino
  6"""
  7import json
  8import sys
  9
 10import requests
 11import requests.exceptions
 12from opensextant import TextMatch, PlaceCandidate, get_country, make_HASC, \
 13    is_populated, is_administrative, is_academic, characterize_location, logger_config
 14
 15# Move away from "geo" and towards a more descriptive place label.
 16GEOCODINGS = {"geo", "place", "postal", "country", "coord", "coordinate"}
 17
 18
 19class XlayerClient:
 20    """
 21      Xponents REST client --  low level utility. See also Geotagger class for a better abstraction.
 22      ```
 23      client = XlayerClient(url)
 24      client.process( "paragraph of text...." ) ---> returns list of raw matches, geocoded.
 25      ```
 26    """
 27    def __init__(self, server, options=""):
 28        """
 29        @param server: URL for the service.   E.g., host:port or 'http://SERVER/xlayer/rest/process'.
 30        @keyword  options:  STRING. a comma-separated list of options to send with each request.
 31        There are no default options supported.
 32        """
 33        self.server = server
 34        if not server.startswith("http"):
 35            self.server = f"http://{server}/xlayer/rest/process"
 36            self.server_control = f"http://{server}/xlayer/rest/control"
 37        else:
 38            # User provided a full URL.
 39            self.server_control = server.replace('/process', '/control')
 40        self.debug = False
 41        self.default_options = options
 42
 43    def stop(self, timeout=30):
 44        """
 45        Timeout of 30 seconds is used here so calls do not hang indefinitely.
 46        The service URL is inferred:  /process and /control endpoints should be next to each other.
 47        :return: True if successful or if "Connection aborted" ConnectionError occurs
 48        """
 49        try:
 50            response = requests.get("{}/stop".format(self.server_control), timeout=timeout)
 51            if response.status_code != 200:
 52                return response.raise_for_status()
 53        except requests.exceptions.ConnectionError as err:
 54            return "Connection aborted" in str(err)
 55        return False
 56
 57    def ping(self, timeout=30):
 58        """
 59        Timeout of 30 seconds is used here so calls do not hang indefinitely.
 60        :return: True if successful.
 61        """
 62        response = requests.get("{}/ping".format(self.server_control), timeout=timeout)
 63        if response.status_code != 200:
 64            return response.raise_for_status()
 65        return True
 66
 67    def process(self, docid, text, lang=None, features=["geo"], timeout=10, minlen=-1,
 68                preferred_countries=None, preferred_locations=None):
 69        """
 70        Process text, extracting some entities
 71
 72          lang = "xx" or None, where "xx" is a ISO language 2-char code.
 73              For general Chinese/Japanese/Korean (CJK) support, use lang = 'cjk'
 74              Language IDs that have some additional tuning include:
 75                   "ja", "th", "tr", "id", "ar", "fa", "ur", "ru", "it",
 76                   "pt", "de", "nl", "es", "en", "tl", "ko", "vi"
 77          Behavior:  Arabic (ar) or CJK (cjk) lang ID directs tagger to use language-specific tokenizers
 78              Otherwise other lang ID provided just invokes language-specific stopword filters
 79
 80          features are places, coordinates, countries, orgs, persons, patterns, postal. 
 81          
 82          feature aliases "geo" can be used to get All Geographic entities (places,coordinates,countries)
 83          feature "taxons" can get at any Taxon "taxons", "persons", "orgs".  As of Xponents 3.6 this reports ALL
 84              Other taxons available in TaxCat tagger.  "all_taxons" is offered as a means to distinguish old and new behavior.
 85          feature "postal" will tag obvious, qualified postal codes that are paired with a CITY, PROVINCE, or COUNTRY tag.
 86          feature "patterns" is an alias for dates and any other pattern-based extractors. For now "dates" is only one
 87          feature "codes" will tag, use and report coded information for any place; primarily administrative boundaries
 88
 89          options are not observed by Xlayer "Xgeo", but you can adapt your own service
 90          to accomodate such options.   Possible options are clean_input, lowercase, for example:
 91
 92          * clean_input scrubs the input text if it has HTML or other content in it.
 93          * lowercase allows the tagging to pass on through lower case matches.
 94          
 95          but interpretation of "clean text" and "lower case" support is subjective.
 96          so they are not supported out of the box here.
 97        :param docid: identifier of transaction
 98        :param text: Unicode text to process
 99        :param lang: One of ["ar", "cjk", .... other ISO language IDs]
100        :param features: list of geo OR [places, coordinates, countries], orgs, persons, patterns, taxons
101        :param timeout: default to 10 seconds; If you think your processing takes longer,
102                 adjust if you see exceptions.
103        :param minlen: minimum length of matches that are unqualified. To reduce noise in geotags. Server has a default
104            of 4 chars for general purpose noise filtering.
105        :param preferred_countries: Array of country codes representing those which are preferred fall backs when
106            there are ambiguous location names.
107        :param preferred_locations:  Array of geohash representing general area of desired preferred matches
108        :return: array of TextMatch objects or empty array.
109        """
110        json_request = {'docid': docid, 'text': text}
111        if self.default_options:
112            json_request['options'] = self.default_options
113        if features:
114            json_request['features'] = ','.join(features)
115        if preferred_countries:
116            json_request['preferred_countries'] = preferred_countries
117        if preferred_locations:
118            json_request['preferred_locations'] = preferred_locations
119        if lang:
120            json_request['lang'] = lang
121        if minlen and int(minlen) > 0:
122            json_request['minlen'] = minlen
123
124        response = requests.post(self.server, json=json_request, timeout=timeout)
125        if response.status_code != 200:
126            return response.raise_for_status()
127
128        json_content = response.json()
129
130        if self.debug:
131            print(json.dumps(json_content, indent=2))
132        if 'response' in json_content:
133            # Get the response metadata block
134            # metadata = json_content['response']
135            pass
136
137        annots = []
138        if 'annotations' in json_content:
139            aj = json_content['annotations']
140            for annot in aj:
141                # Desire to move to "label" away from "type"
142                label = annot.get("type")
143                if label in GEOCODINGS:
144                    tm = PlaceCandidate(annot.get('matchtext'), annot.get('offset'), None)
145                else:
146                    tm = TextMatch(annot.get('matchtext'), annot.get('offset'), None)
147                tm.populate(annot)
148                annots.append(tm)
149
150        return annots
151
152
153# ==================
154# Geotagger -- Simplified wrapper around Xlayer.  Reduces volume of information
155#    EXPERIMENTAL
156# ==================
157
158
159def _increment_count(dct: dict, code: str):
160    if not code:
161        raise Exception("Data quality issue -- counting on a null value")
162    dct[code] = 1 + dct.get(code, 0)
163
164
165def _infer_slot(all_inf: dict, slot: str, span: PlaceCandidate, match_id=None):
166    """
167    Insert information into slots.
168
169    :param span:
170    :return:
171    """
172    mid = match_id or span.id
173    inf = all_inf.get(mid, {})
174    if not inf:
175        all_inf[mid] = inf
176
177    if slot in inf:
178        return
179    if not span:
180        raise Exception("Data integrity issue -- inferring location should have a non-null match")
181    if slot not in Geotagger.ALLOWED_SLOTS:
182        return
183
184    ids = inf.get("match-ids", [])
185    if not ids:
186        inf["match-ids"] = ids
187    ids.append(span.id)
188
189    inf[slot] = {
190        # "name": span.place.name,  # Normalized gazetteer name
191        "matchtext": span.text  # Mention in text
192    }
193    if slot != "country":
194        inf[slot]["feature"] = span.place.format_feature()
195
196
197def score_inferences(inf, matches):
198    # PASS 2. chose location and fill out chosen metadata.
199    # RELATED location information -- use Country Code, ADM1 or the CC.ADM1 province_id to
200    # indicate location coding.  This applies to all inferences started.
201
202    for inf_id in inf:
203        inference = inf[inf_id]
204        if "scores" not in inference:
205            continue
206
207        scores = inference["scores"]
208        top_score = 0
209        top_match = None
210        for scored_id in scores:
211            score = scores[scored_id]
212            if score > top_score:
213                top_score = score
214                top_match = matches[scored_id]
215
216        feat, res = characterize_location(top_match.place, top_match.label)
217        adm1 = top_match.place.adm1
218        cc2 = top_match.place.country_code
219
220        # Flesh out the metadata for best location for this mention.
221        inference.update({
222            "matchtext": top_match.text,
223            "confidence": top_match.confidence,
224            "resolution": res,
225            "feature": feat,
226            "lat": top_match.place.lat,
227            "lon": top_match.place.lon,
228            "province_id": make_HASC(cc2, adm1),
229            "cc": cc2,
230            "adm1": adm1})
231
232        return inf
233
234
235class Geotagger:
236    """
237    GEOTAGGER REST client.  This wrapper around XlayerClient class abstracts alot of the details
238    of calling and parsing the geo-inferencing results from the API server.
239
240    """
241    ALLOWED_SLOTS = {"site", "city", "admin", "postal", "country"}
242
243    def __init__(self, cfg: dict, debug=False, features=["geo", "postal", "taxons"]):
244
245        self.debug = debug
246
247        log_lvl = "INFO"
248        if debug:
249            log_lvl = "DEBUG"
250        self.log = logger_config(log_lvl, __name__)
251
252        self.features = features
253        url = cfg.get("xponents.url")
254        self.xponents = XlayerClient(url)
255        self.confidence_min = int(cfg.get("xponents.confidence.min", 10))
256        # On Xponents 100 point scale.
257
258        # Test if client is alive
259        if not self.xponents.ping():
260            raise Exception("Service not available")
261
262    def dbg(self, msg, *args, **kwargs):
263        self.log.debug(msg, *args, **kwargs)
264
265    def info(self, msg, *args, **kwargs):
266        self.log.info(msg, *args, **kwargs)
267
268    def error(self, msg, *args, **kwargs):
269        self.log.error(msg, *args, **kwargs)
270
271    def _location_info(self, spans: list) -> list:
272        locs = []
273        for t in spans:
274            loc_conf = int(t.attrs.get("confidence", -1))
275            if isinstance(t, PlaceCandidate):
276                if 0 < self.confidence_min <= loc_conf:
277                    locs.append(t)
278        return locs
279
280    def infer_locations(self, locs: list) -> dict:
281        """
282        Choose the best location from the list -- Most specific is preferred.
283        :param locs: list of locations
284        :return:
285        """
286        # Order of preference:
287        # 0. site location
288        # 1. postal location w/related info
289        # 2. qualified city ~ "City, Province" ... or just "City"
290        # 3. Province
291        # 4. Country
292        #
293
294        # LOGIC:
295        #  step 1 - key all locations by match-id, for easy lookup
296        #  step 2 - distill compound locations like a postal address to reduce matches to a single "geo inference"
297        #       with one best location.
298        #  step 3 - organize all location mentions in final `inferences` listing.
299        #  step 4 - score inferences, as needed.
300
301        inferences = dict()
302
303        # PASS 1. inventory locations and award points
304        matches = {}
305        countries = dict()
306        rendered_match_ids = dict()
307
308        # Ensure matches are Place Candidates only -- location bearing information.
309        for match in locs:
310            if isinstance(match, PlaceCandidate):
311                matches[match.id] = match
312
313        # Loop through high resolution locations first.
314        for mid in matches:
315            match = matches[mid]
316            loc = match.place  # Place obj
317            attrs = match.attrs  # dict
318            label = match.label  # entity label
319            points = int(0.10 * (match.confidence or 10))
320
321            # POSTAL.  Max points ~ 40 or so, 10 points for each qualifying slot (city, prov, code, etc)
322            if label == "postal" and "related" in attrs:
323                inferences[mid] = {"match-ids": [mid], "start": match.start, "end": match.end}
324                rendered_match_ids[mid] = mid
325                points += 10
326                related_geo = attrs["related"]
327                _increment_count(countries, loc.country_code)
328                if related_geo:
329                    _infer_slot(inferences, "postal", match)
330                    for k in related_geo:
331                        # these match IDs indicate the full tuple's geographic connections.
332                        points += 10
333                        # dereference the postal match.
334                        slot = related_geo[k]
335                        slot_match = matches.get(slot.get("match-id"))
336                        slot_text = related_geo[k]["matchtext"]
337                        self.dbg("POSTAL slot %s = %s", k, related_geo[k])
338                        if slot_match:
339                            _infer_slot(inferences, k, slot_match, match_id=mid)
340                            rendered_match_ids[slot_match.id] = mid
341                        else:
342                            self.info("Xponents BUG: missing match id for postal evidence. %s = %s", k, slot_text)
343                inferences[match.id]["scores"] = {mid: points}
344
345        # Iterate over remaining matches.
346        for mid in matches:
347            match = matches[mid]
348            loc = match.place  # Place obj
349            attrs = match.attrs  # dict
350            label = match.label  # entity label
351
352            if mid not in rendered_match_ids:
353                # Ignore all upper case short names... for now. Especially if there is no related geography attached.
354                if label == "place" and match.len < 8 and match.text.isupper():
355                    self.info("    IGNORE %s", match.text)
356                    continue
357
358                if label == "postal":
359                    # Such matches should have been associated through some hook above when all postal addresses
360                    # gather related mentions.
361                    self.dbg("     (BUG) IGNORE Postal %s", match.text)
362                    continue
363
364                # Backfill any entries that appear legit, but were not associated with other compound mentions like addresses.
365                # Given this is a standalone location, there is no scoring.
366                cc2, adm1 = match.place.country_code, match.place.adm1
367                feat, res = characterize_location(match.place, match.label)
368                inferences[mid] = {
369                    "start": match.start, "end": match.end,
370                    "matchtext": match.text,
371                    "confidence": match.confidence,
372                    "resolution": res,
373                    "feature": feat,
374                    "lat": match.place.lat,
375                    "lon": match.place.lon,
376                    "province_id": make_HASC(cc2, adm1),
377                    "cc": cc2,
378                    "adm1": adm1}
379            else:
380                # Score slots found in compound POSTAL or other matches
381                points = int(0.10 * (match.confidence or 10))
382                related_mid = rendered_match_ids[mid]
383
384                if label in {"place", "postal"}:
385                    _increment_count(countries, loc.country_code)
386                    if is_academic(loc.feature_class, loc.feature_code):
387                        _infer_slot(inferences, "site", match, match_id=related_mid)
388                        points += 20
389                    elif is_populated(loc.feature_class):
390                        rules = attrs.get("rules", "").lower()
391                        qualified = "adminname" in rules or "admincode" in rules
392                        _infer_slot(inferences, "city", match, match_id=related_mid)
393                        if qualified:
394                            points += 20
395                        else:
396                            # Else location was not qualified fully with district, province, etc..  Just a city name.
397                            self.dbg("CITY %s", match.text)
398                            points += 10
399                    elif is_administrative(loc.feature_class):
400                        _infer_slot(inferences, "admin", match, match_id=related_mid)
401                        points += 5
402                        self.dbg("ADMIN %s", match.text)
403                elif label == "country":
404                    # No bonus points for country mention.
405                    if match.len == 2:
406                        self.dbg("IGNORE 2-char mention %s", match.text)
407                    else:
408                        _infer_slot(inferences, "country", match, match_id=related_mid)
409                        _increment_count(countries, loc.country_code)
410                        self.dbg("COUNTRY %s", loc)
411
412                if related_mid in inferences:
413                    inferences[related_mid]["scores"] = {mid: points}
414                else:
415                    self.dbg("We missed some feature %s %s %s", label, match.id, match.text)
416
417        score_inferences(inferences, matches)
418        for inf_id in inferences:
419            inference = inferences[inf_id]
420            for k in ["match-ids", "scores"]:
421                if k in inference:
422                    del inference[k]
423        return inferences
424
425    def _mention_info(self, spans: list) -> list:
426        men = []
427        for t in spans:
428            if not isinstance(t, PlaceCandidate) and not t.filtered_out:
429                men.append(t)
430        return men
431
432    def populate_mentions(self, spans: list) -> dict:
433        if not spans:
434            return dict()
435
436        def _add_slot(arr, slot_, txt):
437            grp = arr.get(slot_, set([]))
438            if not grp:
439                arr[slot_] = grp
440            grp.add(txt)
441
442        men = dict()
443        for t in spans:
444            # All spans are either taxon, org, or person...; taxon can break out into any flavor of taxonomic term
445            catalog = None
446            if t.attrs:
447                catalog = t.attrs.get("cat") or t.attrs.get("catalog")
448
449            # Handle special cases first, then more general ones.
450            if catalog and catalog == "nationality":
451                _add_slot(men, "nationality", t.text)
452            elif t.label in {"org", "person", "taxon"}:
453                _add_slot(men, t.label, t.text)
454            else:
455                self.info("Mention oddity ...%s", t.label)
456
457        # To allow as valid JSON, we cannot use set().  Convert back to list.
458        for slot in men:
459            men[slot] = list(men[slot])
460        return men
461
462    def _nationality_countries(self, spans):
463        countries = set([])
464        for t in spans:
465            # Its really "catalog".   "cat" may happen in other systems.
466            if not (t.attrs and "catalog" in t.attrs):
467                continue
468            if t.attrs["catalog"] == "nationality":
469                taxon = t.attrs.get("name") or t.attrs.get("taxon")  # TODO: more convergence of attribute schemes.
470                if taxon and "." in taxon:
471                    nat = taxon.split(".")[1]
472                    C = get_country(nat)
473                    if C:
474                        countries.add(C.cc_iso2)
475        return countries
476
477    def summarize(self, doc_id, text, lang=None) -> dict:
478        """
479        Call the XlayerClient process() endpoint,
480        distills output tags into `geoinferences` and `mentions` (all other non-geo tags).
481        A valid 2-char ISO 639 language code helps to tune
482
483        :param doc_id: ID of text
484        :param text: the text input
485        :param lang:  language of the text
486        :return: A single geoinference
487        """
488        tags = self.xponents.process(doc_id, text, lang=lang, features=self.features, timeout=15)
489        if self.debug:
490            self.dbg("TAGS:%d", len(tags))
491
492        output = dict()
493
494        all_locations = self._location_info(tags)
495        other_mentions = self._mention_info(tags)
496        nationality_cc = self._nationality_countries(tags)
497        # TODO -- use nationality in inference to add country info
498
499        # Choose best locations
500        output["geoinference"] = self.infer_locations(all_locations)
501
502        # Extra info: This info may be completely unrelated to geography
503        output["mentions"] = self.populate_mentions(other_mentions)
504
505        if nationality_cc:
506            self.dbg("UNUSED - Nationalities? %s", nationality_cc)
507
508        return output
509
510
511def print_results(arr):
512    """
513    :param arr: array of Annotations or TextMatch
514    :return:
515    """
516    for a in arr:
517        if isinstance(a, TextMatch):
518            if a.filtered_out:
519                print("{} Excluded".format(str(a)))
520            else:
521                print(a)
522        else:
523            print(a)
524
525
526def print_match(match: TextMatch):
527    """
528    :param match:
529    :return:
530    """
531    filtered = ""
532    if match.filtered_out:
533        filtered = "FILTERED-OUT"
534    if match.label == "place":
535        cc = match.attrs.get("cc")
536        fc = match.attrs.get("feat_class")
537        fcode = match.attrs.get("feat_code")
538        print(match, f"\t\t\tcountry:{cc}, feature:{fc}/{fcode} {filtered}")
539    else:
540        print(match, f"\n\tATTRS{match.attrs} {filtered}")
541
542
543def process_text(extractor, txt, docid="$DOC-ID$", features=[], preferred_countries=[], preferred_locations=[]):
544    result = extractor.process(docid, txt, features=features,
545                               timeout=90,
546                               preferred_countries=preferred_countries,
547                               preferred_locations=preferred_locations)
548    print(f"=========DOCID {docid}")
549    print("TEXT", txt[0:200])
550    print("Matches\n============")
551    for match in result:
552        print_match(match)
553
554
555def main_demo():
556    import os
557    from traceback import format_exc
558    import argparse
559
560    ap = argparse.ArgumentParser()
561    ap.add_argument("input", help="your input")
562    ap.add_argument("--service-url", help="XLayer server host:port", default="localhost:8787")
563    ap.add_argument("--docid", help="your doc id")
564    ap.add_argument("--lines", action="store_true", help="process your inputfile as one line per call")
565    ap.add_argument("--text", action="store_true", help="<input> arg is a UTF-8 string to process")
566    ap.add_argument("--options",
567                    help="your service options to send with each request, e.g., 'lowercase,clean_input,revgeo'",
568                    default=None)
569    ap.add_argument("--features", help="Feature set e.g., 'geo,patterns,taxons'", default="geo,patterns,taxons")
570    ap.add_argument("--countries", help="Countries set e.g., 'AF,US,ID,BR,....", default=None)
571    ap.add_argument("--locations", help="Location geohashs set e.g., 'u23,u34,....", default=None)
572    ap.add_argument("--debug", default=False, action="store_true")
573    args = ap.parse_args()
574
575    service_url = args.service_url
576    xtractor = XlayerClient(service_url, options=args.options)
577    xtractor.debug = args.debug
578    feat = ["geo"]
579    countries = None
580    locations = None
581    if args.features:
582        feat = args.features.split(',')
583    if args.countries:
584        countries = args.countries.split(',')
585    if args.locations:
586        locations = args.locations.split(',')
587
588    print("Ping server (timeout=5s)....")
589    try:
590        xtractor.ping(timeout=5)
591    except Exception as runErr:
592        print(str(runErr))
593        sys.exit(1)
594
595    fpath = os.path.abspath(args.input)
596
597    # ======================================
598    # Support for arbitrary amounts of text
599    #
600    if args.text:
601        process_text(xtractor, fpath, docid="test-doc-#123", features=feat,
602                     preferred_countries=countries, preferred_locations=locations)
603    # ======================================
604    # Support data as one text record per line in a file
605    #
606    elif args.lines or args.input.endswith(".json"):
607        print("INPUT: from individual lines from input file\n\n")
608        is_json = args.input.endswith(".json")
609        try:
610            with open(fpath, 'r', encoding="UTF-8") as fh:
611                lineNum = 0
612                for line in fh:
613                    textbuf = line.strip()
614                    lineNum += 1
615                    if is_json:
616                        if not textbuf or textbuf.startswith("#"):
617                            continue
618                        textbuf = json.loads(textbuf).get("text")
619                        if not textbuf:
620                            print("'text' value required in JSON")
621                            continue
622
623                    test_id = "line{}".format(lineNum)
624                    process_text(xtractor, textbuf, docid=test_id, features=feat,
625                                 preferred_countries=countries, preferred_locations=locations)
626
627        except Exception as runErr:
628            print(format_exc(limit=5))
629
630    # ======================================
631    # Use a single file as the source text to process
632    #
633    elif fpath:
634        file_id = os.path.basename(fpath)
635        try:
636            with open(fpath, 'r', encoding="UTF-8") as fh:
637                process_text(xtractor, fh.read(), docid=file_id, features=feat,
638                             preferred_countries=countries, preferred_locations=locations)
639        except Exception as runErr:
640            print(format_exc(limit=5))
641
642
643if __name__ == '__main__':
644    main_demo()

class XlayerClient: View Source

 20class XlayerClient:
 21    """
 22      Xponents REST client --  low level utility. See also Geotagger class for a better abstraction.
 23      ```
 24      client = XlayerClient(url)
 25      client.process( "paragraph of text...." ) ---> returns list of raw matches, geocoded.
 26      ```
 27    """
 28    def __init__(self, server, options=""):
 29        """
 30        @param server: URL for the service.   E.g., host:port or 'http://SERVER/xlayer/rest/process'.
 31        @keyword  options:  STRING. a comma-separated list of options to send with each request.
 32        There are no default options supported.
 33        """
 34        self.server = server
 35        if not server.startswith("http"):
 36            self.server = f"http://{server}/xlayer/rest/process"
 37            self.server_control = f"http://{server}/xlayer/rest/control"
 38        else:
 39            # User provided a full URL.
 40            self.server_control = server.replace('/process', '/control')
 41        self.debug = False
 42        self.default_options = options
 43
 44    def stop(self, timeout=30):
 45        """
 46        Timeout of 30 seconds is used here so calls do not hang indefinitely.
 47        The service URL is inferred:  /process and /control endpoints should be next to each other.
 48        :return: True if successful or if "Connection aborted" ConnectionError occurs
 49        """
 50        try:
 51            response = requests.get("{}/stop".format(self.server_control), timeout=timeout)
 52            if response.status_code != 200:
 53                return response.raise_for_status()
 54        except requests.exceptions.ConnectionError as err:
 55            return "Connection aborted" in str(err)
 56        return False
 57
 58    def ping(self, timeout=30):
 59        """
 60        Timeout of 30 seconds is used here so calls do not hang indefinitely.
 61        :return: True if successful.
 62        """
 63        response = requests.get("{}/ping".format(self.server_control), timeout=timeout)
 64        if response.status_code != 200:
 65            return response.raise_for_status()
 66        return True
 67
 68    def process(self, docid, text, lang=None, features=["geo"], timeout=10, minlen=-1,
 69                preferred_countries=None, preferred_locations=None):
 70        """
 71        Process text, extracting some entities
 72
 73          lang = "xx" or None, where "xx" is a ISO language 2-char code.
 74              For general Chinese/Japanese/Korean (CJK) support, use lang = 'cjk'
 75              Language IDs that have some additional tuning include:
 76                   "ja", "th", "tr", "id", "ar", "fa", "ur", "ru", "it",
 77                   "pt", "de", "nl", "es", "en", "tl", "ko", "vi"
 78          Behavior:  Arabic (ar) or CJK (cjk) lang ID directs tagger to use language-specific tokenizers
 79              Otherwise other lang ID provided just invokes language-specific stopword filters
 80
 81          features are places, coordinates, countries, orgs, persons, patterns, postal. 
 82          
 83          feature aliases "geo" can be used to get All Geographic entities (places,coordinates,countries)
 84          feature "taxons" can get at any Taxon "taxons", "persons", "orgs".  As of Xponents 3.6 this reports ALL
 85              Other taxons available in TaxCat tagger.  "all_taxons" is offered as a means to distinguish old and new behavior.
 86          feature "postal" will tag obvious, qualified postal codes that are paired with a CITY, PROVINCE, or COUNTRY tag.
 87          feature "patterns" is an alias for dates and any other pattern-based extractors. For now "dates" is only one
 88          feature "codes" will tag, use and report coded information for any place; primarily administrative boundaries
 89
 90          options are not observed by Xlayer "Xgeo", but you can adapt your own service
 91          to accomodate such options.   Possible options are clean_input, lowercase, for example:
 92
 93          * clean_input scrubs the input text if it has HTML or other content in it.
 94          * lowercase allows the tagging to pass on through lower case matches.
 95          
 96          but interpretation of "clean text" and "lower case" support is subjective.
 97          so they are not supported out of the box here.
 98        :param docid: identifier of transaction
 99        :param text: Unicode text to process
100        :param lang: One of ["ar", "cjk", .... other ISO language IDs]
101        :param features: list of geo OR [places, coordinates, countries], orgs, persons, patterns, taxons
102        :param timeout: default to 10 seconds; If you think your processing takes longer,
103                 adjust if you see exceptions.
104        :param minlen: minimum length of matches that are unqualified. To reduce noise in geotags. Server has a default
105            of 4 chars for general purpose noise filtering.
106        :param preferred_countries: Array of country codes representing those which are preferred fall backs when
107            there are ambiguous location names.
108        :param preferred_locations:  Array of geohash representing general area of desired preferred matches
109        :return: array of TextMatch objects or empty array.
110        """
111        json_request = {'docid': docid, 'text': text}
112        if self.default_options:
113            json_request['options'] = self.default_options
114        if features:
115            json_request['features'] = ','.join(features)
116        if preferred_countries:
117            json_request['preferred_countries'] = preferred_countries
118        if preferred_locations:
119            json_request['preferred_locations'] = preferred_locations
120        if lang:
121            json_request['lang'] = lang
122        if minlen and int(minlen) > 0:
123            json_request['minlen'] = minlen
124
125        response = requests.post(self.server, json=json_request, timeout=timeout)
126        if response.status_code != 200:
127            return response.raise_for_status()
128
129        json_content = response.json()
130
131        if self.debug:
132            print(json.dumps(json_content, indent=2))
133        if 'response' in json_content:
134            # Get the response metadata block
135            # metadata = json_content['response']
136            pass
137
138        annots = []
139        if 'annotations' in json_content:
140            aj = json_content['annotations']
141            for annot in aj:
142                # Desire to move to "label" away from "type"
143                label = annot.get("type")
144                if label in GEOCODINGS:
145                    tm = PlaceCandidate(annot.get('matchtext'), annot.get('offset'), None)
146                else:
147                    tm = TextMatch(annot.get('matchtext'), annot.get('offset'), None)
148                tm.populate(annot)
149                annots.append(tm)
150
151        return annots

Xponents REST client -- low level utility. See also Geotagger class for a better abstraction.

client = XlayerClient(url)
client.process( "paragraph of text...." ) ---> returns list of raw matches, geocoded.

XlayerClient(server, options='') View Source

28    def __init__(self, server, options=""):
29        """
30        @param server: URL for the service.   E.g., host:port or 'http://SERVER/xlayer/rest/process'.
31        @keyword  options:  STRING. a comma-separated list of options to send with each request.
32        There are no default options supported.
33        """
34        self.server = server
35        if not server.startswith("http"):
36            self.server = f"http://{server}/xlayer/rest/process"
37            self.server_control = f"http://{server}/xlayer/rest/control"
38        else:
39            # User provided a full URL.
40            self.server_control = server.replace('/process', '/control')
41        self.debug = False
42        self.default_options = options

@param server: URL for the service. E.g., host:port or 'http://SERVER/xlayer/rest/process'. @keyword options: STRING. a comma-separated list of options to send with each request. There are no default options supported.

def stop(self, timeout=30): View Source

44    def stop(self, timeout=30):
45        """
46        Timeout of 30 seconds is used here so calls do not hang indefinitely.
47        The service URL is inferred:  /process and /control endpoints should be next to each other.
48        :return: True if successful or if "Connection aborted" ConnectionError occurs
49        """
50        try:
51            response = requests.get("{}/stop".format(self.server_control), timeout=timeout)
52            if response.status_code != 200:
53                return response.raise_for_status()
54        except requests.exceptions.ConnectionError as err:
55            return "Connection aborted" in str(err)
56        return False

Timeout of 30 seconds is used here so calls do not hang indefinitely. The service URL is inferred: /process and /control endpoints should be next to each other. :return: True if successful or if "Connection aborted" ConnectionError occurs

def ping(self, timeout=30): View Source

58    def ping(self, timeout=30):
59        """
60        Timeout of 30 seconds is used here so calls do not hang indefinitely.
61        :return: True if successful.
62        """
63        response = requests.get("{}/ping".format(self.server_control), timeout=timeout)
64        if response.status_code != 200:
65            return response.raise_for_status()
66        return True

Timeout of 30 seconds is used here so calls do not hang indefinitely. :return: True if successful.

def process( self, docid, text, lang=None, features=['geo'], timeout=10, minlen=-1, preferred_countries=None, preferred_locations=None): View Source

 68    def process(self, docid, text, lang=None, features=["geo"], timeout=10, minlen=-1,
 69                preferred_countries=None, preferred_locations=None):
 70        """
 71        Process text, extracting some entities
 72
 73          lang = "xx" or None, where "xx" is a ISO language 2-char code.
 74              For general Chinese/Japanese/Korean (CJK) support, use lang = 'cjk'
 75              Language IDs that have some additional tuning include:
 76                   "ja", "th", "tr", "id", "ar", "fa", "ur", "ru", "it",
 77                   "pt", "de", "nl", "es", "en", "tl", "ko", "vi"
 78          Behavior:  Arabic (ar) or CJK (cjk) lang ID directs tagger to use language-specific tokenizers
 79              Otherwise other lang ID provided just invokes language-specific stopword filters
 80
 81          features are places, coordinates, countries, orgs, persons, patterns, postal. 
 82          
 83          feature aliases "geo" can be used to get All Geographic entities (places,coordinates,countries)
 84          feature "taxons" can get at any Taxon "taxons", "persons", "orgs".  As of Xponents 3.6 this reports ALL
 85              Other taxons available in TaxCat tagger.  "all_taxons" is offered as a means to distinguish old and new behavior.
 86          feature "postal" will tag obvious, qualified postal codes that are paired with a CITY, PROVINCE, or COUNTRY tag.
 87          feature "patterns" is an alias for dates and any other pattern-based extractors. For now "dates" is only one
 88          feature "codes" will tag, use and report coded information for any place; primarily administrative boundaries
 89
 90          options are not observed by Xlayer "Xgeo", but you can adapt your own service
 91          to accomodate such options.   Possible options are clean_input, lowercase, for example:
 92
 93          * clean_input scrubs the input text if it has HTML or other content in it.
 94          * lowercase allows the tagging to pass on through lower case matches.
 95          
 96          but interpretation of "clean text" and "lower case" support is subjective.
 97          so they are not supported out of the box here.
 98        :param docid: identifier of transaction
 99        :param text: Unicode text to process
100        :param lang: One of ["ar", "cjk", .... other ISO language IDs]
101        :param features: list of geo OR [places, coordinates, countries], orgs, persons, patterns, taxons
102        :param timeout: default to 10 seconds; If you think your processing takes longer,
103                 adjust if you see exceptions.
104        :param minlen: minimum length of matches that are unqualified. To reduce noise in geotags. Server has a default
105            of 4 chars for general purpose noise filtering.
106        :param preferred_countries: Array of country codes representing those which are preferred fall backs when
107            there are ambiguous location names.
108        :param preferred_locations:  Array of geohash representing general area of desired preferred matches
109        :return: array of TextMatch objects or empty array.
110        """
111        json_request = {'docid': docid, 'text': text}
112        if self.default_options:
113            json_request['options'] = self.default_options
114        if features:
115            json_request['features'] = ','.join(features)
116        if preferred_countries:
117            json_request['preferred_countries'] = preferred_countries
118        if preferred_locations:
119            json_request['preferred_locations'] = preferred_locations
120        if lang:
121            json_request['lang'] = lang
122        if minlen and int(minlen) > 0:
123            json_request['minlen'] = minlen
124
125        response = requests.post(self.server, json=json_request, timeout=timeout)
126        if response.status_code != 200:
127            return response.raise_for_status()
128
129        json_content = response.json()
130
131        if self.debug:
132            print(json.dumps(json_content, indent=2))
133        if 'response' in json_content:
134            # Get the response metadata block
135            # metadata = json_content['response']
136            pass
137
138        annots = []
139        if 'annotations' in json_content:
140            aj = json_content['annotations']
141            for annot in aj:
142                # Desire to move to "label" away from "type"
143                label = annot.get("type")
144                if label in GEOCODINGS:
145                    tm = PlaceCandidate(annot.get('matchtext'), annot.get('offset'), None)
146                else:
147                    tm = TextMatch(annot.get('matchtext'), annot.get('offset'), None)
148                tm.populate(annot)
149                annots.append(tm)
150
151        return annots

Process text, extracting some entities

lang = "xx" or None, where "xx" is a ISO language 2-char code. For general Chinese/Japanese/Korean (CJK) support, use lang = 'cjk' Language IDs that have some additional tuning include: "ja", "th", "tr", "id", "ar", "fa", "ur", "ru", "it", "pt", "de", "nl", "es", "en", "tl", "ko", "vi" Behavior: Arabic (ar) or CJK (cjk) lang ID directs tagger to use language-specific tokenizers Otherwise other lang ID provided just invokes language-specific stopword filters

features are places, coordinates, countries, orgs, persons, patterns, postal.

feature aliases "geo" can be used to get All Geographic entities (places,coordinates,countries) feature "taxons" can get at any Taxon "taxons", "persons", "orgs". As of Xponents 3.6 this reports ALL Other taxons available in TaxCat tagger. "all_taxons" is offered as a means to distinguish old and new behavior. feature "postal" will tag obvious, qualified postal codes that are paired with a CITY, PROVINCE, or COUNTRY tag. feature "patterns" is an alias for dates and any other pattern-based extractors. For now "dates" is only one feature "codes" will tag, use and report coded information for any place; primarily administrative boundaries

options are not observed by Xlayer "Xgeo", but you can adapt your own service to accomodate such options. Possible options are clean_input, lowercase, for example:

clean_input scrubs the input text if it has HTML or other content in it.
lowercase allows the tagging to pass on through lower case matches.

but interpretation of "clean text" and "lower case" support is subjective. so they are not supported out of the box here. :param docid: identifier of transaction :param text: Unicode text to process :param lang: One of ["ar", "cjk", .... other ISO language IDs] :param features: list of geo OR [places, coordinates, countries], orgs, persons, patterns, taxons :param timeout: default to 10 seconds; If you think your processing takes longer, adjust if you see exceptions. :param minlen: minimum length of matches that are unqualified. To reduce noise in geotags. Server has a default of 4 chars for general purpose noise filtering. :param preferred_countries: Array of country codes representing those which are preferred fall backs when there are ambiguous location names. :param preferred_locations: Array of geohash representing general area of desired preferred matches :return: array of TextMatch objects or empty array.

class Geotagger: View Source

236class Geotagger:
237    """
238    GEOTAGGER REST client.  This wrapper around XlayerClient class abstracts alot of the details
239    of calling and parsing the geo-inferencing results from the API server.
240
241    """
242    ALLOWED_SLOTS = {"site", "city", "admin", "postal", "country"}
243
244    def __init__(self, cfg: dict, debug=False, features=["geo", "postal", "taxons"]):
245
246        self.debug = debug
247
248        log_lvl = "INFO"
249        if debug:
250            log_lvl = "DEBUG"
251        self.log = logger_config(log_lvl, __name__)
252
253        self.features = features
254        url = cfg.get("xponents.url")
255        self.xponents = XlayerClient(url)
256        self.confidence_min = int(cfg.get("xponents.confidence.min", 10))
257        # On Xponents 100 point scale.
258
259        # Test if client is alive
260        if not self.xponents.ping():
261            raise Exception("Service not available")
262
263    def dbg(self, msg, *args, **kwargs):
264        self.log.debug(msg, *args, **kwargs)
265
266    def info(self, msg, *args, **kwargs):
267        self.log.info(msg, *args, **kwargs)
268
269    def error(self, msg, *args, **kwargs):
270        self.log.error(msg, *args, **kwargs)
271
272    def _location_info(self, spans: list) -> list:
273        locs = []
274        for t in spans:
275            loc_conf = int(t.attrs.get("confidence", -1))
276            if isinstance(t, PlaceCandidate):
277                if 0 < self.confidence_min <= loc_conf:
278                    locs.append(t)
279        return locs
280
281    def infer_locations(self, locs: list) -> dict:
282        """
283        Choose the best location from the list -- Most specific is preferred.
284        :param locs: list of locations
285        :return:
286        """
287        # Order of preference:
288        # 0. site location
289        # 1. postal location w/related info
290        # 2. qualified city ~ "City, Province" ... or just "City"
291        # 3. Province
292        # 4. Country
293        #
294
295        # LOGIC:
296        #  step 1 - key all locations by match-id, for easy lookup
297        #  step 2 - distill compound locations like a postal address to reduce matches to a single "geo inference"
298        #       with one best location.
299        #  step 3 - organize all location mentions in final `inferences` listing.
300        #  step 4 - score inferences, as needed.
301
302        inferences = dict()
303
304        # PASS 1. inventory locations and award points
305        matches = {}
306        countries = dict()
307        rendered_match_ids = dict()
308
309        # Ensure matches are Place Candidates only -- location bearing information.
310        for match in locs:
311            if isinstance(match, PlaceCandidate):
312                matches[match.id] = match
313
314        # Loop through high resolution locations first.
315        for mid in matches:
316            match = matches[mid]
317            loc = match.place  # Place obj
318            attrs = match.attrs  # dict
319            label = match.label  # entity label
320            points = int(0.10 * (match.confidence or 10))
321
322            # POSTAL.  Max points ~ 40 or so, 10 points for each qualifying slot (city, prov, code, etc)
323            if label == "postal" and "related" in attrs:
324                inferences[mid] = {"match-ids": [mid], "start": match.start, "end": match.end}
325                rendered_match_ids[mid] = mid
326                points += 10
327                related_geo = attrs["related"]
328                _increment_count(countries, loc.country_code)
329                if related_geo:
330                    _infer_slot(inferences, "postal", match)
331                    for k in related_geo:
332                        # these match IDs indicate the full tuple's geographic connections.
333                        points += 10
334                        # dereference the postal match.
335                        slot = related_geo[k]
336                        slot_match = matches.get(slot.get("match-id"))
337                        slot_text = related_geo[k]["matchtext"]
338                        self.dbg("POSTAL slot %s = %s", k, related_geo[k])
339                        if slot_match:
340                            _infer_slot(inferences, k, slot_match, match_id=mid)
341                            rendered_match_ids[slot_match.id] = mid
342                        else:
343                            self.info("Xponents BUG: missing match id for postal evidence. %s = %s", k, slot_text)
344                inferences[match.id]["scores"] = {mid: points}
345
346        # Iterate over remaining matches.
347        for mid in matches:
348            match = matches[mid]
349            loc = match.place  # Place obj
350            attrs = match.attrs  # dict
351            label = match.label  # entity label
352
353            if mid not in rendered_match_ids:
354                # Ignore all upper case short names... for now. Especially if there is no related geography attached.
355                if label == "place" and match.len < 8 and match.text.isupper():
356                    self.info("    IGNORE %s", match.text)
357                    continue
358
359                if label == "postal":
360                    # Such matches should have been associated through some hook above when all postal addresses
361                    # gather related mentions.
362                    self.dbg("     (BUG) IGNORE Postal %s", match.text)
363                    continue
364
365                # Backfill any entries that appear legit, but were not associated with other compound mentions like addresses.
366                # Given this is a standalone location, there is no scoring.
367                cc2, adm1 = match.place.country_code, match.place.adm1
368                feat, res = characterize_location(match.place, match.label)
369                inferences[mid] = {
370                    "start": match.start, "end": match.end,
371                    "matchtext": match.text,
372                    "confidence": match.confidence,
373                    "resolution": res,
374                    "feature": feat,
375                    "lat": match.place.lat,
376                    "lon": match.place.lon,
377                    "province_id": make_HASC(cc2, adm1),
378                    "cc": cc2,
379                    "adm1": adm1}
380            else:
381                # Score slots found in compound POSTAL or other matches
382                points = int(0.10 * (match.confidence or 10))
383                related_mid = rendered_match_ids[mid]
384
385                if label in {"place", "postal"}:
386                    _increment_count(countries, loc.country_code)
387                    if is_academic(loc.feature_class, loc.feature_code):
388                        _infer_slot(inferences, "site", match, match_id=related_mid)
389                        points += 20
390                    elif is_populated(loc.feature_class):
391                        rules = attrs.get("rules", "").lower()
392                        qualified = "adminname" in rules or "admincode" in rules
393                        _infer_slot(inferences, "city", match, match_id=related_mid)
394                        if qualified:
395                            points += 20
396                        else:
397                            # Else location was not qualified fully with district, province, etc..  Just a city name.
398                            self.dbg("CITY %s", match.text)
399                            points += 10
400                    elif is_administrative(loc.feature_class):
401                        _infer_slot(inferences, "admin", match, match_id=related_mid)
402                        points += 5
403                        self.dbg("ADMIN %s", match.text)
404                elif label == "country":
405                    # No bonus points for country mention.
406                    if match.len == 2:
407                        self.dbg("IGNORE 2-char mention %s", match.text)
408                    else:
409                        _infer_slot(inferences, "country", match, match_id=related_mid)
410                        _increment_count(countries, loc.country_code)
411                        self.dbg("COUNTRY %s", loc)
412
413                if related_mid in inferences:
414                    inferences[related_mid]["scores"] = {mid: points}
415                else:
416                    self.dbg("We missed some feature %s %s %s", label, match.id, match.text)
417
418        score_inferences(inferences, matches)
419        for inf_id in inferences:
420            inference = inferences[inf_id]
421            for k in ["match-ids", "scores"]:
422                if k in inference:
423                    del inference[k]
424        return inferences
425
426    def _mention_info(self, spans: list) -> list:
427        men = []
428        for t in spans:
429            if not isinstance(t, PlaceCandidate) and not t.filtered_out:
430                men.append(t)
431        return men
432
433    def populate_mentions(self, spans: list) -> dict:
434        if not spans:
435            return dict()
436
437        def _add_slot(arr, slot_, txt):
438            grp = arr.get(slot_, set([]))
439            if not grp:
440                arr[slot_] = grp
441            grp.add(txt)
442
443        men = dict()
444        for t in spans:
445            # All spans are either taxon, org, or person...; taxon can break out into any flavor of taxonomic term
446            catalog = None
447            if t.attrs:
448                catalog = t.attrs.get("cat") or t.attrs.get("catalog")
449
450            # Handle special cases first, then more general ones.
451            if catalog and catalog == "nationality":
452                _add_slot(men, "nationality", t.text)
453            elif t.label in {"org", "person", "taxon"}:
454                _add_slot(men, t.label, t.text)
455            else:
456                self.info("Mention oddity ...%s", t.label)
457
458        # To allow as valid JSON, we cannot use set().  Convert back to list.
459        for slot in men:
460            men[slot] = list(men[slot])
461        return men
462
463    def _nationality_countries(self, spans):
464        countries = set([])
465        for t in spans:
466            # Its really "catalog".   "cat" may happen in other systems.
467            if not (t.attrs and "catalog" in t.attrs):
468                continue
469            if t.attrs["catalog"] == "nationality":
470                taxon = t.attrs.get("name") or t.attrs.get("taxon")  # TODO: more convergence of attribute schemes.
471                if taxon and "." in taxon:
472                    nat = taxon.split(".")[1]
473                    C = get_country(nat)
474                    if C:
475                        countries.add(C.cc_iso2)
476        return countries
477
478    def summarize(self, doc_id, text, lang=None) -> dict:
479        """
480        Call the XlayerClient process() endpoint,
481        distills output tags into `geoinferences` and `mentions` (all other non-geo tags).
482        A valid 2-char ISO 639 language code helps to tune
483
484        :param doc_id: ID of text
485        :param text: the text input
486        :param lang:  language of the text
487        :return: A single geoinference
488        """
489        tags = self.xponents.process(doc_id, text, lang=lang, features=self.features, timeout=15)
490        if self.debug:
491            self.dbg("TAGS:%d", len(tags))
492
493        output = dict()
494
495        all_locations = self._location_info(tags)
496        other_mentions = self._mention_info(tags)
497        nationality_cc = self._nationality_countries(tags)
498        # TODO -- use nationality in inference to add country info
499
500        # Choose best locations
501        output["geoinference"] = self.infer_locations(all_locations)
502
503        # Extra info: This info may be completely unrelated to geography
504        output["mentions"] = self.populate_mentions(other_mentions)
505
506        if nationality_cc:
507            self.dbg("UNUSED - Nationalities? %s", nationality_cc)
508
509        return output

GEOTAGGER REST client. This wrapper around XlayerClient class abstracts alot of the details of calling and parsing the geo-inferencing results from the API server.

def infer_locations(self, locs: list) -> dict: View Source

281    def infer_locations(self, locs: list) -> dict:
282        """
283        Choose the best location from the list -- Most specific is preferred.
284        :param locs: list of locations
285        :return:
286        """
287        # Order of preference:
288        # 0. site location
289        # 1. postal location w/related info
290        # 2. qualified city ~ "City, Province" ... or just "City"
291        # 3. Province
292        # 4. Country
293        #
294
295        # LOGIC:
296        #  step 1 - key all locations by match-id, for easy lookup
297        #  step 2 - distill compound locations like a postal address to reduce matches to a single "geo inference"
298        #       with one best location.
299        #  step 3 - organize all location mentions in final `inferences` listing.
300        #  step 4 - score inferences, as needed.
301
302        inferences = dict()
303
304        # PASS 1. inventory locations and award points
305        matches = {}
306        countries = dict()
307        rendered_match_ids = dict()
308
309        # Ensure matches are Place Candidates only -- location bearing information.
310        for match in locs:
311            if isinstance(match, PlaceCandidate):
312                matches[match.id] = match
313
314        # Loop through high resolution locations first.
315        for mid in matches:
316            match = matches[mid]
317            loc = match.place  # Place obj
318            attrs = match.attrs  # dict
319            label = match.label  # entity label
320            points = int(0.10 * (match.confidence or 10))
321
322            # POSTAL.  Max points ~ 40 or so, 10 points for each qualifying slot (city, prov, code, etc)
323            if label == "postal" and "related" in attrs:
324                inferences[mid] = {"match-ids": [mid], "start": match.start, "end": match.end}
325                rendered_match_ids[mid] = mid
326                points += 10
327                related_geo = attrs["related"]
328                _increment_count(countries, loc.country_code)
329                if related_geo:
330                    _infer_slot(inferences, "postal", match)
331                    for k in related_geo:
332                        # these match IDs indicate the full tuple's geographic connections.
333                        points += 10
334                        # dereference the postal match.
335                        slot = related_geo[k]
336                        slot_match = matches.get(slot.get("match-id"))
337                        slot_text = related_geo[k]["matchtext"]
338                        self.dbg("POSTAL slot %s = %s", k, related_geo[k])
339                        if slot_match:
340                            _infer_slot(inferences, k, slot_match, match_id=mid)
341                            rendered_match_ids[slot_match.id] = mid
342                        else:
343                            self.info("Xponents BUG: missing match id for postal evidence. %s = %s", k, slot_text)
344                inferences[match.id]["scores"] = {mid: points}
345
346        # Iterate over remaining matches.
347        for mid in matches:
348            match = matches[mid]
349            loc = match.place  # Place obj
350            attrs = match.attrs  # dict
351            label = match.label  # entity label
352
353            if mid not in rendered_match_ids:
354                # Ignore all upper case short names... for now. Especially if there is no related geography attached.
355                if label == "place" and match.len < 8 and match.text.isupper():
356                    self.info("    IGNORE %s", match.text)
357                    continue
358
359                if label == "postal":
360                    # Such matches should have been associated through some hook above when all postal addresses
361                    # gather related mentions.
362                    self.dbg("     (BUG) IGNORE Postal %s", match.text)
363                    continue
364
365                # Backfill any entries that appear legit, but were not associated with other compound mentions like addresses.
366                # Given this is a standalone location, there is no scoring.
367                cc2, adm1 = match.place.country_code, match.place.adm1
368                feat, res = characterize_location(match.place, match.label)
369                inferences[mid] = {
370                    "start": match.start, "end": match.end,
371                    "matchtext": match.text,
372                    "confidence": match.confidence,
373                    "resolution": res,
374                    "feature": feat,
375                    "lat": match.place.lat,
376                    "lon": match.place.lon,
377                    "province_id": make_HASC(cc2, adm1),
378                    "cc": cc2,
379                    "adm1": adm1}
380            else:
381                # Score slots found in compound POSTAL or other matches
382                points = int(0.10 * (match.confidence or 10))
383                related_mid = rendered_match_ids[mid]
384
385                if label in {"place", "postal"}:
386                    _increment_count(countries, loc.country_code)
387                    if is_academic(loc.feature_class, loc.feature_code):
388                        _infer_slot(inferences, "site", match, match_id=related_mid)
389                        points += 20
390                    elif is_populated(loc.feature_class):
391                        rules = attrs.get("rules", "").lower()
392                        qualified = "adminname" in rules or "admincode" in rules
393                        _infer_slot(inferences, "city", match, match_id=related_mid)
394                        if qualified:
395                            points += 20
396                        else:
397                            # Else location was not qualified fully with district, province, etc..  Just a city name.
398                            self.dbg("CITY %s", match.text)
399                            points += 10
400                    elif is_administrative(loc.feature_class):
401                        _infer_slot(inferences, "admin", match, match_id=related_mid)
402                        points += 5
403                        self.dbg("ADMIN %s", match.text)
404                elif label == "country":
405                    # No bonus points for country mention.
406                    if match.len == 2:
407                        self.dbg("IGNORE 2-char mention %s", match.text)
408                    else:
409                        _infer_slot(inferences, "country", match, match_id=related_mid)
410                        _increment_count(countries, loc.country_code)
411                        self.dbg("COUNTRY %s", loc)
412
413                if related_mid in inferences:
414                    inferences[related_mid]["scores"] = {mid: points}
415                else:
416                    self.dbg("We missed some feature %s %s %s", label, match.id, match.text)
417
418        score_inferences(inferences, matches)
419        for inf_id in inferences:
420            inference = inferences[inf_id]
421            for k in ["match-ids", "scores"]:
422                if k in inference:
423                    del inference[k]
424        return inferences

Choose the best location from the list -- Most specific is preferred. :param locs: list of locations :return:

def summarize(self, doc_id, text, lang=None) -> dict: View Source

478    def summarize(self, doc_id, text, lang=None) -> dict:
479        """
480        Call the XlayerClient process() endpoint,
481        distills output tags into `geoinferences` and `mentions` (all other non-geo tags).
482        A valid 2-char ISO 639 language code helps to tune
483
484        :param doc_id: ID of text
485        :param text: the text input
486        :param lang:  language of the text
487        :return: A single geoinference
488        """
489        tags = self.xponents.process(doc_id, text, lang=lang, features=self.features, timeout=15)
490        if self.debug:
491            self.dbg("TAGS:%d", len(tags))
492
493        output = dict()
494
495        all_locations = self._location_info(tags)
496        other_mentions = self._mention_info(tags)
497        nationality_cc = self._nationality_countries(tags)
498        # TODO -- use nationality in inference to add country info
499
500        # Choose best locations
501        output["geoinference"] = self.infer_locations(all_locations)
502
503        # Extra info: This info may be completely unrelated to geography
504        output["mentions"] = self.populate_mentions(other_mentions)
505
506        if nationality_cc:
507            self.dbg("UNUSED - Nationalities? %s", nationality_cc)
508
509        return output

Call the XlayerClient process() endpoint, distills output tags into geoinferences and mentions (all other non-geo tags). A valid 2-char ISO 639 language code helps to tune

:param doc_id: ID of text :param text: the text input :param lang: language of the text :return: A single geoinference

def print_results(arr): View Source

512def print_results(arr):
513    """
514    :param arr: array of Annotations or TextMatch
515    :return:
516    """
517    for a in arr:
518        if isinstance(a, TextMatch):
519            if a.filtered_out:
520                print("{} Excluded".format(str(a)))
521            else:
522                print(a)
523        else:
524            print(a)

:param arr: array of Annotations or TextMatch :return:

def print_match(match: opensextant.TextMatch): View Source

527def print_match(match: TextMatch):
528    """
529    :param match:
530    :return:
531    """
532    filtered = ""
533    if match.filtered_out:
534        filtered = "FILTERED-OUT"
535    if match.label == "place":
536        cc = match.attrs.get("cc")
537        fc = match.attrs.get("feat_class")
538        fcode = match.attrs.get("feat_code")
539        print(match, f"\t\t\tcountry:{cc}, feature:{fc}/{fcode} {filtered}")
540    else:
541        print(match, f"\n\tATTRS{match.attrs} {filtered}")

:param match: :return: