Python: module opensextant.gazetteer

opensextant.gazetteer

index
/Users/ubaldino/workspace/opensource/Xponents-Core/src/main/python/opensextant/gazetteer.py

Modules

arrow
json
os
pysolr
sqlite3

Classes



builtins.object

AdminLevelCodes
DB
DataSource
GazetteerIndex
GazetteerSearch
ISO3166Registry
PlaceHeuristics

class AdminLevelCodes(builtins.object)

    AdminLevelCodes(filepath=None)

Methods defined here:

__init__(self, filepath=None)
Initialize self.  See help(type(self)) for accurate signature.

add_country(self, cc)

add_place(self, place_id, cc, std, adm1, grid)
Accumulate discrete ADM1 codings by place instance and location :param place_id: :param cc: :param std: :param adm1: :param adm2: optional ADM2 mapping :param grid: :return:

adjust_admin1(self, cc, adm1)

align_admin1(self)

as_json(self)

get_alternate_admin1(self, cc, adm1, std)
EXPERIMENTAL still. :param cc: ISO country code :param adm1: ADM1 in the given standard :param std: standard "FIPS" or "ISO" :return:

load(self, fpath)

save(self, fpath)

set_admin_hierarchy(self, countries, adm1_containment)

Data descriptors defined here:

__dict__

dictionary for instance variables (if defined)

__weakref__

list of weak references to the object (if defined)

class DB(builtins.object)

    DB(dbpath, commit_rate=1000, debug=False, add_geohash=False)

Methods defined here:

__init__(self, dbpath, commit_rate=1000, debug=False, add_geohash=False)
Save items to SQLite db at the commit_rate given.  Call close to finalize any partial batches and save database. :param dbpath: :param commit_rate:

add_place(self, obj)
Add one place :param obj: a place dictionary.  If arg is a Place object it is converted to dictionary first.

add_places(self, arr)
Add a list of places.

add_population_stats(self, source='G')
Population stats are recorded by populated area (P-class features) and rolled up to provide an ADM1 population approximation.

close(self)

commit(self)

create(self)
Create the placenames table and default indices used for ETL - place_id, source, country, and ADM1 :return:

create_indices(self)
Create additional indices that are used for advanced ETL functions and optimization. :return:

delete_places(self, q)
:param q: query starting with "WHERE...." :return:

list_adm1_popstats(self)
Provides a neat lookup of population stats by HASC path,    e.g., "US.CA" is California; reported at 35 million in major cities (where state total is reported    at 39 million in 2021.)  Population stats only cover major cities of 15K or more people. :return: map of population stats by ADM1 path

list_adm2_popstats(self)
Get approximate county-level stats

list_admin_names(self, sources=['U', 'N', 'G'], cc=None) -> set
Lists all admin level1 names. :param cc: country code filter. :param sources: list of source IDs defaulting to those for USGS, NGA, Geonames.org :return: set of names, lowercased

list_all_popstats(self)
:return: map of population by geohash only

list_countries(self)
List distinct country codes in DB. :return: list of country codes.

list_places(self, cc=None, fc=None, criteria=None, limit=-1)
Potentially massive array -- so this is just a Place generator. :param cc: country code or '' :param fc: feat class constraint with "*" wildcard, or '' :param criteria: additional clause to constrain search, e.g. " AND duplicate=0 " to find non-dups. :param limit:  non-zero limit :return: generator

list_places_at(self, lat: float = None, lon: float = None, geohash: str = None, cc: str = None, radius: int = 5000, limit=10, method='2d')
:param lat: latitude :param lon: longitude :param cc:  ISO country code to filter. :param geohash:  optionally, use precomputed geohash of precision 6-chars instead of lat/lon. :param radius:  in METERS, radial distance from given point to search, DEFAULT is 5 KM :param limit: count of places to return :param method: bbox or geohash :return: array of tuples, sorted by distance.

list_places_by_id(self, plid, limit=2)
Collect places and name_bias for gazetteer ETL. Lookup place by ID as in "G1234567" for Geonames entry or "N123456789" for an NGA one, etc. :param plid: Place ID according to the convention of source initial + identifier :param limit: limit queries, because if we know we only want 2 or 3 results we need not search the database beyond that. :return:

mark_duplicates(self, dups)

mark_search_only(self, pid)
Toggle bit for search only. :param pid: Place ID int or list

optimize(self)

purge(self, q)

reopen(self)

update_admin1_code(self, cc, from_code, to_code)

update_bias(self, name_bias, rowids)

update_bias_by_name(self, name_bias, name)

update_name_type(self, arr: list, t: str)
Change the name type in bulk. :param arr: bulk array of placenames to change :param t: type code 'A', 'N', 'C' :return:

update_place_id(self, rowid, plid)

Data descriptors defined here:

__dict__

dictionary for instance variables (if defined)

__weakref__

list of weak references to the object (if defined)

class DataSource(builtins.object)

    DataSource(dbf, debug=False, ver=None) Gazetteer Data Source abstraction -- provides guidelines on how to inject data into a common, normalized gazetteer.

Methods defined here:

__init__(self, dbf, debug=False, ver=None)
Initialize self.  See help(type(self)) for accurate signature.

normalize(self, sourcefile, limit=-1, optimize=False)
Given the spreadsheet or source file rip through it, ingesting contents into the master gazetteer. :param sourcefile: input file :param limit: non-zero limit for testing :param optimize: if database should be optimized when done. :return:

process_source(self, sourcefile, limit=-1)
generator yielding DB geo dictionary to be stored. :param sourcefile: Raw data file :param limit: limit of number of records to process :return: generator of Place object or dict of Place schema

purge(self)

Data descriptors defined here:

__dict__

dictionary for instance variables (if defined)

__weakref__

list of weak references to the object (if defined)

class GazetteerIndex(builtins.object)

    GazetteerIndex(server_url, debug=False) GazetteerIndex provides a simple API to inject entries into the Gazetteer. - Every 1000 records a batch is sent to Solr - Every 1,000,000 records a commit() call is sent to Solr This may provide gazetteer specific functions, but as of v1.3 this is a generic Solr wrapper.

Methods defined here:

__init__(self, server_url, debug=False)
Initialize self.  See help(type(self)) for accurate signature.

add(self, place)
:param place: Place object. :return:

delete(self, entry_id=None)
Awaiting other kwdargs for deletion use cases. :param entry_id: master gazetteer row ID in sqlite or solr.  Deletes solr entry :return:

optimize(self)

save(self, done=False)

Data descriptors defined here:

__dict__

dictionary for instance variables (if defined)

__weakref__

list of weak references to the object (if defined)

class GazetteerSearch(builtins.object)

    GazetteerSearch(server_url)

Methods defined here:

__init__(self, server_url)
TODO: BETA - looking to abstract Solr().search() function for common types of queries.     For now getting a list of country name variants is easy enough. :param server_url:  URL with path to `/solr/gazetteer` index

get_countries(self, max_namelen=30)
Searches gazetteer for Country metadata TODO: dovetail Country metadata (lang, timezone, codes, etc) with     Country place data. TODO: Document different uses for GazetteerSearch.get_countries() from API get_country() TODO: Review differences in Place() schema and Country() schema for name variants,     e.g., Country variants presented as abbreviations, codes or names need to be distinguished as such. :param max_namelen: :return:

Data descriptors defined here:

__dict__

dictionary for instance variables (if defined)

__weakref__

list of weak references to the object (if defined)

class ISO3166Registry(builtins.object)

    Methods defined here:

__init__(self)
Initialize self.  See help(type(self)) for accurate signature.

get_admin1_for(self, cc, adm2)

has_admin1(self, cc, adm1)

is_iso_country(self, cc)

load_admin_mapping(self)

Static methods defined here:

export_admin_mapping()
Generate initial file using pycountry.  pycountry is not a library dependency so it is externalized here. :return:

Data descriptors defined here:

__dict__

dictionary for instance variables (if defined)

__weakref__

list of weak references to the object (if defined)

class PlaceHeuristics(builtins.object)

    PlaceHeuristics(dbref: opensextant.gazetteer.DB)

Methods defined here:

__init__(self, dbref: opensextant.gazetteer.DB)
:param dbref: DB instance

estimate_bias(self, geo, name_group='')
Primary Estimator of id_bias and name_bias. id_bias   -- a location bias to pre-rank city by feature/population name_bias -- a metric ranging from -1 to 1, that represents the validity of tagging the name/phrase              in a general context.  The result is eventually binary:  search_only = name_bias < 0. This means              that geo names that are search_only are not taggable. :param geo: :param name_group: :return:

get_feature_scale(self, fc, dsg)
:param fc: feature class :param dsg: feature code :return:

is_large_city(self, name)

is_province_name(self, name) -> bool
Report if a name is that of a province, regardless of whether the location represents something else. E.g.     "Florida" is a city (lesser known) or a state (well known).   Therefore it is a popular name. :param name: :return:

is_significant(self, feat) -> bool

is_stopword(self, name: str) -> bool

location_bias(self, geo)
See estimate_bias() A location is pre-disposed by its feature type and population/popularity. E.g., large cities are mentioned more often in news or documents than less populated cities. Factors: Feature gradient     A, P, ..... U.  More populated features have higher bias Population gradient  log(pop)  scales bias higher :param geo:  standard ETL geo dict :return:  score on 100 point scale.

name_bias(self, geoname: str, feat_class: str, feat_code: str, name_group='', name_type='N')
See estimate_bias() Given a geoname we look at the instance of the name variant and if it is something trivially colliding with stopwords in other languages then we consider omitting it. very positive bias   - long unique name, diacritic or in non-ASCII script positive bias        - normal location name, multiple words or grams neutral              - possibly a place name, but is case-dependent, e.g., person name or generic monument name. negative bias        - a stopword or trivial version of a stopword, `Åre` very negative bias   - a very rare or extremely long version of a place name, nonsense -1                   - WordStats reports as a "common" word. Conclusion: Any Negative name_bias term will NOT be tagged, although it is present in gazetteer. CODE and ABBREV are not biased -- they are simply not full names. TODO: ONLY unigrams are tracked, so     "Alabama" -> not common,     "Need" -> common,     "New York" -> not tracked. This is a bi-gram :param geoname: :param feat_class: :param feat_code: :param name_group: :param name_type: :return:  floating point number between -100 and 100

Data descriptors defined here:

__dict__

dictionary for instance variables (if defined)

__weakref__

list of weak references to the object (if defined)

Data and other attributes defined here:

LARGE_CITY = 3

Functions


add_location(geo, lat, lon, add_geohash=False)
Insert validated location coordinate and geohash :param add_geohash: due to performance, add this if needed :param geo: dict :param lat: latitude value, str or float :param lon: longitude value, str or float :return: geo dict with location

as_admin_place(r)
Convert dict to a Place object :param r: gazetteer row from Solr or SQLite. :return: Place

as_place(r, source='index')
Convert dict to a Place object :param source: db or index (solr) :param r: gazetteer row from Solr or SQLite. :return: Place

as_place_record(place, target='index')
Given a Place object, serialize it as a dict consistent with the Solr index schema. :param place: :param target: index or db :return:

capitalize(name: dict)
Capitalize all city and major admin boundaries

coord_grid(geo: dict) -> str
A less dissatisfying grid than geohash. It's just returning Y,X in low resolution. LLL.l,LLL.l

estimate_name_bias(nm)

export_admin_mapping(admin_ids, filepath)
Experimental:  Map all source place IDs => ADM ids                Map all standard ADM ids => place IDs :param admin_ids:  dict for JSON or array for CSV :param filepath: :return:

gaz_resource(fname)
Formats the relative path for an item in the ./solr/etc/gazetteer/ metadata :param fname: :return:

get_default_db()

get_default_wordstats()

load_major_cities_iso()

load_stopterms(project_dir='.', lower=True)
Load default stop terms from source tree for project build. :param project_dir: The location of Xponents/solr source tree. :param lower: default case to load data as. If not lower, then terms are loaded as-is :return:

name_group_for(nm: str)
Determine the major language "name group" for the input :param nm: name or any text :return:

normalize_name(nm: str)
convenience method that ensures we have some consistency on normalization of name :param nm: :return:

print_places(arr, limit=25)

run_lookup(url, lookup, parse)
Gazetteer demo mimics some of the logic in XponentsGazetteerQuery try "San Francisco, CA, US"

run_query(url, q)
Expert mode:  Run a solr query to see what you get back. requires you know the schema

Data

DEFAULT_COUNTRY_ID_BIAS = 49
DEFAULT_MASTER = 'master_gazetteer.sqlite'
DEFAULT_SOLR_URL = '127.0.0.1:7000'
DEFAULT_WORDSTATS = 'wordstats.sqlite'
GAZETTEER_SOURCES = {'ADHOC': 'OA', 'G': 'G', 'GEONAMES': 'OG', 'GP': 'GP', 'Geonames.org': 'OG', 'NE': 'NE', 'NGA': 'N', 'NGA-AUTOFIXED': 'NF', 'USGS': 'U', 'USGS-AUTOFIXED': 'UF', ...}
GAZETTEER_SOURCE_ID = {'G', 'GPX', 'ISO', 'N', 'NE', 'NF', ...}
GAZETTEER_TEMPLATE = {'FIPS_cc': None, 'adm1': None, 'adm2': None, 'cc': None, 'feat_class': None, 'feat_code': None, 'id': -1, 'id_bias': 0, 'lat': 0, 'lon': 0, ...}
MAJOR_ADMIN_CODES = {'ADM1', 'ADM2', 'ADM3', 'ADM4', 'ADMD', 'PRSH', ...}
SCRIPT_CODES = {None: '', 'LATIN': 'L', 'HAN': 'H', 'COMMON': 'C', 'ARABIC': 'A', 'ARMENIAN': 'AM', 'BENGALI': 'BN', 'CYRILLIC': 'CY', 'DEVANAGARI': 'DV', 'ETHIOPIC': 'ET', ...}
US_TERRITORY_MAP = {'FIPS': {'AQ': 'AS', 'CQ': 'MP', 'DQ': 'UM', 'FQ': 'UM', 'GQ': 'GU', 'HQ': 'UM', 'JQ': 'UM', 'MQ': 'UM', 'RQ': 'PR', 'VI': 'VI', ...}, 'ISO': {'AS': 'AQ', 'GU': 'CQ', 'MP': 'CQ', 'PR': 'RQ'}}

Data
		DEFAULT_COUNTRY_ID_BIAS = 49 DEFAULT_MASTER = 'master_gazetteer.sqlite' DEFAULT_SOLR_URL = '127.0.0.1:7000' DEFAULT_WORDSTATS = 'wordstats.sqlite' GAZETTEER_SOURCES = {'ADHOC': 'OA', 'G': 'G', 'GEONAMES': 'OG', 'GP': 'GP', 'Geonames.org': 'OG', 'NE': 'NE', 'NGA': 'N', 'NGA-AUTOFIXED': 'NF', 'USGS': 'U', 'USGS-AUTOFIXED': 'UF', ...} GAZETTEER_SOURCE_ID = {'G', 'GPX', 'ISO', 'N', 'NE', 'NF', ...} GAZETTEER_TEMPLATE = {'FIPS_cc': None, 'adm1': None, 'adm2': None, 'cc': None, 'feat_class': None, 'feat_code': None, 'id': -1, 'id_bias': 0, 'lat': 0, 'lon': 0, ...} MAJOR_ADMIN_CODES = {'ADM1', 'ADM2', 'ADM3', 'ADM4', 'ADMD', 'PRSH', ...} SCRIPT_CODES = {None: '', 'LATIN': 'L', 'HAN': 'H', 'COMMON': 'C', 'ARABIC': 'A', 'ARMENIAN': 'AM', 'BENGALI': 'BN', 'CYRILLIC': 'CY', 'DEVANAGARI': 'DV', 'ETHIOPIC': 'ET', ...} US_TERRITORY_MAP = {'FIPS': {'AQ': 'AS', 'CQ': 'MP', 'DQ': 'UM', 'FQ': 'UM', 'GQ': 'GU', 'HQ': 'UM', 'JQ': 'UM', 'MQ': 'UM', 'RQ': 'PR', 'VI': 'VI', ...}, 'ISO': {'AS': 'AQ', 'GU': 'CQ', 'MP': 'CQ', 'PR': 'RQ'}}