1import glob
2import gzip
3import os
4import sqlite3
5import traceback
6
7from opensextant.utility import ensure_dirs
8
9CATALOGS = {"googlebooks": "G"}
10
11
12def _ignoreable(text, mn, mx):
13 if len(text) < mn or len(text) > mx:
14 # print("Ignore short or long")
15 return True
16
17 if text[-1].isdigit():
18 # Ignore numeric nonsense
19 return True
20 return False
21
22
class WordStats:
    """SQLite-backed store of word frequency statistics.

    Each row is (word, pos, count, catalog). Counts are bulk-loaded from
    gzipped TSV ngram files (e.g., GoogleBooks unigrams) via ingest() and
    queried with find() / is_common().
    """

    def __init__(self, db, minlen=2, maxlen=30):
        """
        :param db: DB path; schema is created if the file does not exist.
        :param minlen: min length of tracked words
        :param maxlen: max length of tracked words
        """
        self.dbpath = db
        self.counter = 0  # distinct word/POS keys seen across ingests
        self.ignored = 0  # input lines skipped by the ignore filter
        self.minlen = minlen
        self.maxlen = maxlen
        self.conn = None
        self.commit_rate = 100000  # flush a batch to the DB every N new keys
        self.cache = set()  # words known to be "common"
        self.cache_loaded = False  # True once load_common() has populated cache
        if self.createdb():
            self.reopen()

    def reopen(self):
        """Open the DB connection if not already open; no-op otherwise."""
        if self.conn is not None:
            return

        self.conn = sqlite3.connect(self.dbpath)
        # Speed-oriented pragmas: this DB is a rebuildable cache, so
        # durability is traded away for bulk-insert throughput.
        self.conn.execute('PRAGMA cache_size = 8092')
        self.conn.execute("PRAGMA encoding = 'UTF-8'")
        self.conn.execute('PRAGMA synchronous = OFF')
        self.conn.execute('PRAGMA journal_mode = MEMORY')
        self.conn.execute('PRAGMA temp_store = MEMORY')
        self.conn.row_factory = sqlite3.Row

    def save(self, rows):
        """Insert a batch of word-stat rows and commit.

        Best effort: DB errors are reported and the batch is dropped so a
        long ingest can continue with subsequent batches.

        :param rows: iterable of dicts with keys "w", "pos", "cnt", "cat"
        """
        sql = """insert into wordstats (word, pos, count, catalog) values (:w, :pos, :cnt, :cat)"""
        try:
            self.conn.executemany(sql, rows)
            self.conn.commit()
        except sqlite3.Error:
            # Narrowed from a bare except: only DB failures are tolerated.
            print("Failed to save words")
            print(traceback.format_exc(limit=5))

    def createdb(self):
        """Create the database schema if the DB file does not yet exist.

        :return: True when the DB exists or was successfully created.
        """
        if os.path.exists(self.dbpath):
            return True

        ensure_dirs(self.dbpath)
        self.reopen()
        sql_script = """
            create TABLE wordstats (
                `word` TEXT NOT NULL,
                `pos` TEXT NOT NULL,
                `count` INTEGER DEFAULT 0,
                `catalog` TEXT NOT NULL
            );
            create INDEX wd_idx on wordstats ("word");
            create INDEX pos_idx on wordstats ("pos");
            create INDEX cat_idx on wordstats ("catalog");
        """
        self.conn.executescript(sql_script)
        self.conn.commit()
        return True

    def purge(self, cat):
        """Delete all rows for the given catalog ID."""
        sql = "delete from wordstats where catalog = ?"
        self.conn.execute(sql, (cat,))
        self.conn.commit()

    def close(self):
        """Close the DB connection. Safe to call repeatedly.

        Resets conn to None (instead of `del self.conn`) so that later
        calls to reopen()/close() do not raise AttributeError.
        """
        if self.conn is not None:
            self.conn.close()
        self.conn = None

    def ingest(self, statsfile, cat):
        """Bulk load gzipped TSV ngram counts for one catalog.

        Input lines look like: WORD[_POS] <tab> ... <tab> COUNT ... where
        column 0 is the term and column 2 is the occurrence count
        (presumably column 1 is a year or similar -- TODO confirm format).
        Counts for the same word+POS key are summed in memory and flushed
        in batches of commit_rate new keys.

        :param statsfile: a single .gz file, or a directory of .gz files
        :param cat: catalog ID stored with each row
        """
        files = []
        if os.path.exists(statsfile) and os.path.isdir(statsfile):
            files = glob.glob(f"{statsfile}/*.gz")
        else:
            files.append(statsfile)

        # Purge ONCE per catalog, not once per file: purging inside the
        # file loop would discard rows already ingested from earlier files
        # in the same directory.
        self.purge(cat)

        for f in files:
            print(f"INGEST WORDS from {cat}: FILE {f}")
            with gzip.open(f, "rt", encoding="UTF-8") as fh:
                linecount = 0
                terms = {}
                for line in fh:
                    linecount += 1
                    term = line.strip().split("\t")
                    termtext = term[0].lower()
                    pos = ""
                    curr = termtext
                    if "_" in termtext:
                        # Split off the part-of-speech suffix, e.g. "run_VERB".
                        curr, pos = termtext.rsplit("_", 1)
                        if not pos:
                            # Trailing underscore with no POS: keep the raw term.
                            curr = termtext
                    if _ignoreable(curr, self.minlen, self.maxlen):
                        self.ignored += 1
                        continue

                    subcount = int(term[2])
                    key = f"{curr}#{pos}"
                    if key not in terms:
                        self.counter += 1
                        if self.counter % self.commit_rate == 0:
                            # Flush BEFORE inserting the new key so we never
                            # write it out with a zero count.
                            self.save(terms.values())
                            terms.clear()
                        terms[key] = {"cnt": 0, "w": curr, "pos": pos, "cat": cat}

                    terms[key]["cnt"] += subcount
                # Flush last batch.
                self.save(terms.values())
                print(f"LINES {linecount} WORDS {self.counter} IGNORED {self.ignored}")

    def find(self, word, threshold, catalog="googlebooks"):
        """
        EXPERIMENTAL
        Word look up. for Catalog lookup this is catalog prefix + word initial, e.g., Gp
        is catalog ID in database when looking for "philadelphia" in googlebooks.

        Threshold is a cut off -- all word counts above this will be returned.
        If "word" contains "%", we assume this is a wildcard search.

        Word stats include:
            WORD "_" PARTOFSPEECH
            WORD -- This query only uses bare word counts.

        The bare WORD counts appear to be a sum of all sub-counts for WORD+POS occurrences.

        :param word: word or SQL LIKE pattern (contains "%")
        :param threshold: minimum count; only rows above this are returned
        :param catalog: catalog name, must be a key of CATALOGS
        :return: dict of word -> summed count
        """
        cat = CATALOGS.get(catalog)
        if cat:
            cat = f"{cat}{word[0]}"
        else:
            cat = ""

        word_clause = " word = ?"
        if "%" in word:
            word_clause = "word like ?"
        sql = f"""select word, count as CNT from wordstats where pos = '' and catalog = ? and CNT > ?  
                      and {word_clause} order by CNT desc"""
        # Avoid making the SQL summation too difficult. For some reason there are multiple entries for certain
        # word patterns -- POS may be NULL or "" or something else. But here we sum all bare word patterns
        wordstats = {}
        for row in self.conn.execute(sql, (cat, threshold, word)):
            wd = row["word"]
            if wd not in wordstats:
                wordstats[wd] = 0
            wordstats[wd] += row["CNT"]
        return wordstats

    def load_common(self, threshold=10000000):
        """
        Find all common words and cache them in memory. The discrete counts of
        words may have to be added up as part-of-speech accounting confuses
        things a bit. There are no ground truth numbers in GoogleBooks Ngrams
        about total counts.

        :param threshold: minimum summed count for a word to be cached as common
        :return:
        """
        # Parameterized row-level floor (previously hard-coded at 1000000,
        # which silently missed words when threshold was below a million).
        floor = min(threshold, 1000000)
        sql = """select word, count as CNT from wordstats where pos = '' and CNT > ? order by CNT desc"""
        wordstats = {}
        # Sum by word (which is already been lowercased, normalized)
        for row in self.conn.execute(sql, (floor,)):
            wd = row["word"]
            if wd not in wordstats:
                wordstats[wd] = 0
            wordstats[wd] += row["CNT"]
        # Filter by count
        for wd in wordstats:
            if wordstats[wd] > threshold:
                self.cache.add(wd)
        self.cache_loaded = True

    def is_common(self, word, threshold=10000000):
        """
        Check if a word is common. Threshold is ignored if cache was pre-loaded using load_common()
        If not pre-loaded, then a query is made for each term not in the cache.

        :param word: word lookup. Ideally caller has lowercased/normalized this
        :param threshold: default 10mil or more occurrence is a common NGram in GoogleBooks
        :return: True if the word is common
        """
        if word in self.cache:
            return True
        if self.cache_loaded:
            # Cache is authoritative once fully loaded; skip per-word queries.
            return False

        found = False
        # find() returns a dict of found terms. Counts are not used here.
        for wordnorm in self.find(word, threshold=threshold):
            self.cache.add(wordnorm)
            found = True
        return found