"""opensextant.wordstats -- word frequency statistics backed by a local SQLite database."""

  1import glob
  2import gzip
  3import os
  4import sqlite3
  5import traceback
  6
  7from opensextant.utility import ensure_dirs
  8
# Known word-stats catalogs mapped to the single-letter prefix used when
# composing catalog IDs in the database (see WordStats.find()).
CATALOGS = {"googlebooks": "G"}
 10
 11
 12def _ignoreable(text, mn, mx):
 13    if len(text) < mn or len(text) > mx:
 14        # print("Ignore short or long")
 15        return True
 16
 17    if text[-1].isdigit():
 18        # Ignore numeric nonsense
 19        return True
 20    return False
 21
 22
 23class WordStats:
 24
 25    def __init__(self, db, minlen=2, maxlen=30):
 26        """
 27
 28        :param db:  DB path
 29        :param minlen:  min length of tracked words
 30        :param maxlen:  max length of tracked words
 31        """
 32        self.dbpath = db
 33        self.counter = 0
 34        self.ignored = 0
 35        self.minlen = minlen
 36        self.maxlen = maxlen
 37        self.conn = None
 38        self.commit_rate = 100000
 39        self.cache = set([])
 40        self.cache_loaded = False
 41        if self.createdb():
 42            self.reopen()
 43
 44    def reopen(self):
 45        if self.conn is not None:
 46            return
 47
 48        # really close cleanly
 49        self.close()
 50
 51        self.conn = sqlite3.connect(self.dbpath)
 52        self.conn.execute('PRAGMA cache_size =  8092')
 53        self.conn.execute("PRAGMA encoding = 'UTF-8'")
 54        self.conn.execute('PRAGMA synchronous = OFF')
 55        self.conn.execute('PRAGMA journal_mode = MEMORY')
 56        self.conn.execute('PRAGMA temp_store = MEMORY')
 57        self.conn.row_factory = sqlite3.Row
 58
 59    def save(self, rows):
 60        try:
 61            sql = """insert into wordstats (word, pos, count, catalog) values (:w, :pos, :cnt, :cat)"""
 62            self.conn.executemany(sql, rows)
 63            self.conn.commit()
 64        except:
 65            print("Failed to save words")
 66            print(traceback.format_exc(limit=5))
 67
 68    def createdb(self):
 69        if os.path.exists(self.dbpath):
 70            return True
 71
 72        ensure_dirs(self.dbpath)
 73        self.reopen()
 74        sql_script = """
 75             create TABLE wordstats (
 76                     `word` TEXT NOT NULL,
 77                     `pos` TEXT NOT NULL,
 78                     `count` INTEGER DEFAULT 0,
 79                     `catalog` TEXT NOT NULL                     
 80                 );
 81                 create INDEX wd_idx on wordstats ("word");               
 82                 create INDEX pos_idx on wordstats ("pos");               
 83                 create INDEX cat_idx on wordstats ("catalog");               
 84             """
 85        self.conn.executescript(sql_script)
 86        self.conn.commit()
 87        return True
 88
 89    def purge(self, cat):
 90        sql = "delete from wordstats where catalog = ?"
 91        self.conn.execute(sql, (cat,))
 92        self.conn.commit()
 93
 94    def close(self):
 95        if self.conn:
 96            self.conn.close()
 97            del self.conn
 98
 99    def ingest(self, statsfile, cat):
100        files = []
101        if os.path.exists(statsfile) and os.path.isdir(statsfile):
102            files = glob.glob(f"{statsfile}/*.gz")
103        else:
104            files.append(statsfile)
105
106        for f in files:
107            print(f"INGEST WORDS from {cat}: FILE {f}")
108            with gzip.open(f, "rt", encoding="UTF-8") as fh:
109                linecount = 0
110                terms = {}
111                self.purge(cat)
112                for line in fh:
113                    linecount += 1
114                    term = line.strip().split("\t")
115                    termtext = term[0].lower()
116                    pos = ""
117                    curr = termtext
118                    if "_" in termtext:
119                        curr, pos = termtext.rsplit("_", 1)
120                        if not pos:
121                            curr = termtext
122                    if _ignoreable(curr, self.minlen, self.maxlen):
123                        self.ignored += 1
124                        continue
125
126                    subcount = int(term[2])
127                    key = f"{curr}#{pos}"
128                    if key not in terms:
129                        terms[key] = {"cnt": 0, "w": curr, "pos": pos, "cat": cat}
130                        self.counter += 1
131                        if self.counter % self.commit_rate == 0:
132                            self.save(terms.values())
133                            terms.clear()
134                            terms[key] = {"cnt": 0, "w": curr, "pos": pos, "cat": cat}
135
136                    terms[key]["cnt"] += subcount
137                # Flush last batch.
138                self.save(terms.values())
139                print(f"LINES {linecount}  WORDS {self.counter}  IGNORED {self.ignored}")
140
141    def find(self, word, threshold, catalog="googlebooks"):
142        """
143        EXPERIMENTAL
144        Word look up.  for Catalog lookup this is catalog prefix + word initial, e.g., Gp
145        is catalog ID in database when looking for "philadelphia" in googlebooks.
146
147        Threshold is a cut off -- all word counts above this will be returned.
148        If "word" contains "%", we assume this is a wildcard search.
149
150        Word stats include:
151            WORD "_" PARTOFSPEECH
152            WORD                     -- This query only uses bare word counts.
153
154        The bare WORD counts appear to be a sum of all sub-counts for WORD+POS occurrences.
155
156        :param word:
157        :param threshold:
158        :param catalog:
159        :return:
160        """
161        cat = CATALOGS.get(catalog)
162        if cat:
163            cat = f"{cat}{word[0]}"
164        else:
165            cat = ""
166
167        word_clause = " word = ?"
168        if "%" in word:
169            word_clause = "word like ?"
170        sql = f"""select word, count as CNT from wordstats where  pos = '' and catalog = ? and CNT > ?
171                        and {word_clause} order by CNT desc"""
172        # Avoid making the SQL summation too difficult.  For some reason there are multiple entries for certain
173        # word patterns -- POS may be NULL or "" or something else.  But here we sum all bare word patterns
174        wordstats = {}
175        for row in self.conn.execute(sql, (cat, threshold, word)):
176            wd = row["word"]
177            if wd not in wordstats:
178                wordstats[wd] = 0
179            wordstats[wd] += row["CNT"]
180        return wordstats
181
182    def load_common(self, threshold=10000000):
183        """
184        Find all commmon words.  The discrete counts of words may have to be added up
185        as part-of-speech accounting confuses things a bit.  There are no ground truth numbers in GoogleBooks Ngrams
186        about total counts.
187
188        :param threshold:
189        :return:
190        """
191        sql = f"""select word, count as CNT from wordstats where  pos = '' and CNT > 1000000 order by CNT desc"""
192        wordstats = {}
193        # Sum by word (which is already been lowercased, normalized)
194        for row in self.conn.execute(sql):
195            wd = row["word"]
196            if wd not in wordstats:
197                wordstats[wd] = 0
198            wordstats[wd] += row["CNT"]
199        # Filter by count
200        for wd in wordstats:
201            if wordstats[wd] > threshold:
202                self.cache.add(wd)
203        self.cache_loaded = True
204
205    def is_common(self, word, threshold=10000000):
206        """
207        Check if a word is common.  Threshold is ignored if cache was pre-loaded using load_common()
208        If not pre-loaded, then a query is made for each term not in the cache.
209
210        :param word: word lookup.  Ideally caller has lowercased/normalized this
211        :param threshold: default 10mil or more occurrence is a common NGram in GoogleBooks
212        :return:
213        """
214        if word in self.cache:
215            return True
216        if self.cache_loaded:
217            return False
218
219        found = False
220        # find() cursor returns a dict of found terms. Counts are not used here.
221        for wordnorm in self.find(word, threshold=threshold):
222            self.cache.add(wordnorm)
223            found = True
224        return found