opensextant.TaxCat
A simple interface for creating a taxonomic catalog ("taxcat") for OpenSextant TaxMatcher to use. Prerequisites: see the XTax README.
# -*- coding: utf-8 -*-
"""
A simple interface for creating a taxonomic catalog ("taxcat") for OpenSextant TaxMatcher to use.
Prerequisites: see the XTax README.
"""
import os

from opensextant.utility import is_text, ConfigUtility

__API_PATH = os.path.realpath(__file__)
SOLR_SERVER = "http://127.0.0.1:7000/solr/taxcat"
DEFAULT_SOLR_SERVER = "127.0.0.1:7000"

# Case-normalization modes accepted by add_value(); 0 means "leave unchanged".
CASE_LOWER = 1
CASE_UPPER = 2


def _scrub_cdata_content(text):
    """ User should scrub data themselves; but this gives ideas of what goes wrong when adding text to Solr
       <,>,& all must be escaped.

    :param text: raw string
    :return: copy of text with '<' and '>' spelled out and '&' escaped as '&amp;'
    """
    # '<' and '>' are replaced first; their replacements contain no '&', so the
    # final '&' pass cannot double-escape anything introduced here.
    return text.replace('<', '(less than)').replace('>', '(greater than)').replace('&', '&amp;')


def get_taxnode(t, val):
    """
    Build a taxon node name of the form "<type>.<Name>".

    :param t: taxonomy type / prefix; lower-cased in the result
    :param val: raw name; stripped of surrounding whitespace and title-cased
    :return: "<t lower-cased>.<val title-cased>"
    """
    # Title case capitalizes "'s" (e.g. "Bob'S"), so that is lower-cased again.
    name_value = val.strip().title().replace("'S", "'s")
    return "{}.{}".format(t.lower(), name_value)


_FALSE_VAL = {'f', 'false', '0', 'n', 'no'}
_TRUE_VAL = {'t', 'true', '1', 'y', 'yes'}


def add_bool(dct, f, val, default=None):
    """
    Set dct[f] to the string 'true' or 'false' according to a textual flag.

    :param dct: dictionary (record) to update in place
    :param f: field name
    :param val: textual flag, compared case-insensitively against the
        _TRUE_VAL / _FALSE_VAL sets; values in neither set leave dct untouched
    :param default: value stored when val is empty/None; nothing is stored if
        default is None
    """
    if not val:
        if default is not None:
            dct[f] = default
        return

    flag = val.lower()
    if flag in _FALSE_VAL:
        dct[f] = 'false'
    elif flag in _TRUE_VAL:
        dct[f] = 'true'


def add_text(dct, f, val):
    """ add_text offers a basic idea of how to add values to dict
        before sending to solr.   TEXT strings may need scrubbing
        but you just add non-TEXT values.

    :param dct: dictionary (record) to update in place
    :param f: field name
    :param val: value stored as-is under dct[f]
    """
    # Text and non-text values are stored identically; scrub text before
    # calling if your data needs it.
    dct[f] = val


def add_value(f, val, case=0):
    """ Add a value to a given field, f, and normalize case if non-zero.
        case = CASE_LOWER | CASE_UPPER | 0(default) no change

    :param f: list to append to
    :param val: value to add; None is appended as an empty string and
        non-text values are appended as str(val)
    :param case: CASE_LOWER, CASE_UPPER or 0; any other value leaves text unchanged
    """
    if val is None:
        f.append(u'')
        return

    if not is_text(val):
        f.append(str(val))
        return

    if case == CASE_LOWER:
        f.append(val.lower())
    elif case == CASE_UPPER:
        f.append(val.upper())
    else:
        # 0 or an unrecognized mode: keep the text as given rather than
        # silently dropping the value.
        f.append(val)


# Catalogs must be registered -- Solr has no concept of how to manage string-based record IDs
# that is something you must manage as you create your combined catalog,
#
# Catalog Registry maps your catalog ID to a starting offset for solr records
# If you think your reference data for catalog X will have 1 million entries, then
# start catalog X at 1,000,000 and let other smaller catalogs start at 0 or at less than 1 million
# start the next catalog at 3,000,000 to give X some breathing room.
#
CATALOG_REGISTRY = {
    "DEFAULT": 0,
    "WFB": 100000,
    "JRC": 3000000
}


def get_starting_id(cat):
    """
    For well-known catalogs, determine the starting record ID offset.

    :param cat: catalog ID, a key of CATALOG_REGISTRY
    :return: the registered starting offset (may be 0)
    :raise Exception: if the catalog is not registered
    """
    offset = CATALOG_REGISTRY.get(cat)
    # Compare against None: an offset of 0 ("DEFAULT") is a valid registration.
    if offset is None:
        raise Exception(f"Catalog is not registered: {cat}")

    return offset


class Taxon:
    """A single catalog entry: one phrase filed under one taxon node."""

    def __init__(self):
        # Taxon node name; stored in the 'taxnode' field by TaxCatalogBuilder.add().
        self.name = None
        # The phrase text; stored in the 'phrase' field.
        self.phrase = None
        # Record ID; stored in the 'id' field.
        self.id = None
        # Stored in the 'valid' field.
        self.is_valid = True
        # An array of additional tags.
        self.tags = None
        # When True the record's 'name_type' is 'A' instead of 'N'.
        self.is_acronym = False


class TaxCatalogBuilder:
    """Accumulates Taxon records and posts them to a Solr index in batches."""

    def __init__(self, server=None, test=False):
        """
        API to assist in building taxon nodes and storing them in Solr.
        :param server: solr server http URL, or host:port
        :param test: when True, nothing is sent to the server by save(), optimize() or purge()
        """

        self.server = None
        self.server_url = None
        self.set_server(server)
        self.test = test

        self._record_count = 0
        self._byte_count = 0
        self._add_byte_count = 0
        # Pending records are posted whenever the running count is a multiple of add_rate.
        self.add_rate = 1000
        self.commit_rate = -1

        # Records added but not yet posted to the server.
        self._records = []
        # Running total of records passed to add().
        self.count = 0

        # Load file
        self.utility = ConfigUtility(None)
        self.stopwords = set()

    def add_stopwords(self, stopfile):
        """
        Load a stopword list file and merge its entries into self.stopwords.
        :param stopfile: path to the stopword file
        :raise Exception: if the file does not exist
        """
        if not os.path.exists(stopfile):
            raise Exception("No stopwords found at " + stopfile)

        print("Loading stopwords ", stopfile)
        _stopwords_list = self.utility.loadListFromFile(stopfile)
        self.stopwords.update(_stopwords_list)

    def purge(self, catalog):
        """
        Delete every record of the given catalog from the index and commit.
        Skipped in test mode or when no server is configured, as in save().
        :param catalog: catalog ID
        :raise Exception: if catalog is empty
        """
        if not catalog:
            raise Exception("Catalog name is required")
        if self.test:
            return
        if not self.server:
            print("No server")
            return

        print("Purging catalog", catalog)
        # Pass the query by keyword: the first positional parameter of
        # pysolr's Solr.delete() is a document id, not a query.
        self.server.delete(q=f"catalog:{catalog}", commit=True)

    def set_server(self, svr):
        """
        Point this builder at a Solr server.
        :param svr: full http URL, or host:port which is expanded to
            http://{svr}/solr/taxcat; a falsy value only records the URL
        """
        self.server_url = svr
        if not self.server_url:
            return

        if not self.server_url.startswith("http"):
            self.server_url = f"http://{self.server_url}/solr/taxcat"

        try:
            # Imported here so the module can be loaded without pysolr installed.
            from pysolr import Solr
            self.server = Solr(self.server_url, timeout=600)
            print("SERVER ", self.server_url, self.server)

        except Exception as err:
            # Best effort: report the problem and leave self.server unchanged.
            print(f"Problem with that server {self.server_url}, ERR={err}")

    def optimize(self):
        """Ask the server to optimize the index; no-op in test mode or without a server."""
        if self.server and not self.test:
            self.server.optimize()

    def save(self, flush=False):
        """
        Post pending records when a batch is due, and commit when flushing.
        :param flush: when True, post whatever is pending and commit
        """
        if self.test:
            return
        if not self.server:
            print("No server")
            return

        batch_due = 0 < self.count and (self.count % self.add_rate == 0)
        # Skip the request entirely when nothing is pending.
        if (flush or batch_due) and self._records:
            self.server.add(self._records)
            self._records.clear()

        if flush:
            self.server.commit(expungeDeletes=True)

    def add(self, catalog, taxon: Taxon):
        """
        Add the given taxon to the index, increment the internal counter.
        :param catalog: catalog ID
        :param taxon: Taxon to convert into a record
        :return:
        """
        self.count += 1
        rec = {'catalog': catalog, 'taxnode': taxon.name, 'phrase': taxon.phrase, 'id': taxon.id,
               'valid': taxon.is_valid,
               'name_type': 'N'}
        if taxon.tags:
            rec['tag'] = taxon.tags
        if taxon.is_acronym:
            rec['name_type'] = 'A'

        self._records.append(rec)
        self.save()

    def add_wordlist(self, catalog, datafile, start_id, taxnode=None, minlen=1):
        """ Given a simple one column word list file, each row of data is added
           to catalog as a Taxon; taxnode may be used as a prefix for the words

           Add a series of organized word lists to a single Catalog, but manage
           each wordlist with some prefix taxon path.

            add_wordlist('CAT', f1, 400, taxnode='first')
            add_wordlist('CAT', f2, 500, taxnode='second')
            add_wordlist('CAT', f3, 600, taxnode='third')
            add_wordlist('CAT', f4, 700, taxnode='fourth')

        :param catalog: catalog ID
        :param datafile: path to a UTF-8 file with one phrase per line; blank
            lines and lines starting with '#' are skipped
        :param start_id: base record ID; each record ID is start_id plus the
            builder's running count at the time the row is read
        :param taxnode: taxon node name for every row; defaults to the file's base name
        :param minlen: phrases shorter than this are stored with valid=False
        """
        _name = taxnode if taxnode else os.path.basename(datafile)

        # Lower-cased phrases already seen in this file.
        words = set()
        with open(datafile, 'r', encoding="UTF-8") as sheet:
            for row in sheet:
                _phrase = row.strip()
                if not _phrase:
                    continue

                if _phrase.startswith("#"):
                    # is a comment or commented out word.
                    continue

                _id = start_id + self.count

                key = _phrase.lower()
                if key in words:
                    print("Not adding ", key)
                    continue

                words.add(key)

                t = Taxon()
                t.id = _id
                t.is_valid = len(key) >= minlen
                t.name = _name
                t.phrase = _phrase
                # Allow case-sensitive entries.  IFF input text contains UPPER
                # case data, we'll mark it as acronym.
                if t.phrase.isupper():
                    t.is_acronym = True

                self.add(catalog, t)
        self.save(flush=True)
        print(f"COUNT: {self.count}")


def create_taxcat(solr_server):
    """
    Create a TaxCatalogBuilder for the given Solr server.

    :param solr_server: URL or host:port; "localhost:7000" is used when empty
    :return: a TaxCatalogBuilder constructed with the resolved server URL
    """
    server = solr_server

    if not solr_server:
        server = "localhost:7000"
    if server and not server.lower().startswith("http"):
        server = f"http://{server}/solr/taxcat"

    return TaxCatalogBuilder(server=server)
def
get_taxnode(t, val):
def get_taxnode(t, val):
    """
    Build a taxon node name of the form "<type>.<Name>".

    :param t: taxonomy type / prefix; lower-cased in the result
    :param val: raw name; stripped of surrounding whitespace and title-cased
    :return: "<t lower-cased>.<val title-cased>"
    """
    label = val.strip().title()
    # str.title() upper-cases the letter after an apostrophe ("Bob'S"); undo that.
    label = label.replace("'S", "'s")
    prefix = t.lower()
    return f"{prefix}.{label}"
:param t: :param val: :return:
def
add_text(dct, f, val):
def add_text(dct, f, val):
    """ add_text offers a basic idea of how to add values to dict
        before sending to solr.   TEXT strings may need scrubbing
        but you just add non-TEXT values.

    :param dct: dictionary (record) to update in place
    :param f: field name
    :param val: value stored as-is under dct[f]
    """
    # Text and non-text values are stored identically, so no type check is
    # needed; scrub text before calling if your data requires it.
    dct[f] = val
add_text offers a basic idea of how to add values to dict before sending to solr. TEXT strings may need scrubbing but you just add non-TEXT values.
def
add_value(f, val, case=0):
def add_value(f, val, case=0):
    """ Add a value to a given field, f, and normalize case if non-zero.
        case = CASE_LOWER | CASE_UPPER | 0(default) no change

    :param f: list to append to
    :param val: value to add; None is appended as an empty string and
        non-text values are appended as str(val)
    :param case: CASE_LOWER, CASE_UPPER or 0; any other value leaves text unchanged
    """
    if val is None:
        f.append(u'')
        return

    if not is_text(val):
        f.append(str(val))
        return

    if case == CASE_LOWER:
        f.append(val.lower())
    elif case == CASE_UPPER:
        f.append(val.upper())
    else:
        # 0 or an unrecognized mode: keep the text as given rather than
        # silently dropping the value.
        f.append(val)
Add a value to a given field, f, and normalize its case if `case` is non-zero. case = CASE_LOWER | CASE_UPPER | 0 (default: no change)
def
get_starting_id(cat):
def get_starting_id(cat):
    """
    For well-known catalogs, determine the starting record ID offset.

    :param cat: catalog ID, a key of CATALOG_REGISTRY
    :return: the registered starting offset (may be 0)
    :raise Exception: if the catalog is not registered
    """
    offset = CATALOG_REGISTRY.get(cat)
    # Compare against None: an offset of 0 ("DEFAULT") is a valid registration.
    if offset is None:
        raise Exception(f"Catalog is not registered: {cat}")

    return offset
For well-known catalogs, determine the default catalog ID range. :param cat: :return:
def
create_taxcat(solr_server):
def create_taxcat(solr_server):
    """
    Create a TaxCatalogBuilder for the given Solr server.

    :param solr_server: URL or host:port; "localhost:7000" is used when empty
    :return: a TaxCatalogBuilder constructed with the resolved server URL
    """
    target = solr_server or "localhost:7000"
    # A bare host:port is expanded to the taxcat core URL.
    is_url = target.lower().startswith("http")
    if not is_url:
        target = f"http://{target}/solr/taxcat"
    return TaxCatalogBuilder(server=target)
:param solr_server: URL or host:port :return: