opensextant.extractors.xtemporal
# -*- coding: utf-8 -*-
from calendar import timegm
from datetime import datetime

import arrow
from opensextant import logger_config
from opensextant.FlexPat import PatternMatch, RegexPatternManager, PatternExtractor

TZINFO = arrow.utcnow().tzinfo

NOW = arrow.now()
YEAR = NOW.year
MILLENNIUM = 2000
CURR_YY = YEAR - MILLENNIUM
# Two-digit years up to this value resolve to 20xx; larger ones resolve to 19xx.
FUTURE_YY_THRESHOLD = CURR_YY + 2
# Four-digit years at or beyond this value are rejected by normalize_year().
MAXIMUM_YEAR = 2040

INVALID_DATE = -1
INVALID_DAY = -2
NO_YEAR = -3
NO_MONTH = -4
NO_DAY = -5

log = logger_config("INFO", pkg=__name__)
# Module-wide locale override; set by XTemporal(locale=...), read by DateTimeMatch.normalize().
_default_locale = None


def format_date(d):
    """
    Format a date value as an ISO ``YYYY-MM-DD`` string.

    :param d: an ``arrow.Arrow`` instance, or any value accepted by ``arrow.get()``
    :return: date string formatted as YYYY-MM-DD
    """
    moment = d if isinstance(d, arrow.Arrow) else arrow.get(d)
    return moment.format("YYYY-MM-DD")


def normalize_day(slots):
    """
    Pull the day of month out of the matched slots.

    The first non-empty slot among DM2, DOM and DD is used.

    :param slots: dict of matched pattern groups
    :return: day of month, 1-31; INVALID_DAY if slots is empty or the value is
        numeric but out of range; INVALID_DATE if no day slot is present or the
        value is not numeric.
    """
    if not slots:
        return INVALID_DAY

    day = slots.get("DM2") or slots.get("DOM") or slots.get("DD")
    if not day:
        return INVALID_DATE
    try:
        day = int(day)
    except (TypeError, ValueError):
        # Non-numeric text is reported the same way as a missing day.
        return INVALID_DATE
    return day if 0 < day <= 31 else INVALID_DAY


def normalize_month_name(slots):
    """
    Convert a matched month name or abbreviation to its month number.

    Only the first three characters of the MON_ABBREV / MON_NAME slot are
    parsed, using arrow's "MMM" token.

    :param slots: dict of matched pattern groups
    :return: month number, 1-12, or INVALID_DATE if the slot is missing, has an
        implausible length (under 3 or over 11 characters) or cannot be parsed.
    """
    if not slots:
        return INVALID_DATE

    text = slots.get("MON_ABBREV") or slots.get("MON_NAME")
    if not text:
        return INVALID_DATE
    if not 3 <= len(text) <= 11:
        return INVALID_DATE
    try:
        return arrow.get(text[0:3], "MMM").month
    except Exception:
        # The parser's exception class differs between arrow releases.
        return INVALID_DATE


def normalize_month_num(slots: dict):
    """
    Pull a numeric month out of the matched slots.

    The first non-empty slot among DM1, MM and MONTH is used.

    :param slots: dict of matched pattern groups
    :return: month number, 1-12, or INVALID_DATE if absent, non-numeric or
        out of range.
    """
    if not slots:
        return INVALID_DATE

    month_num = slots.get("DM1") or slots.get("MM") or slots.get("MONTH")
    if not month_num:
        return INVALID_DATE
    try:
        num = int(month_num)
    except (TypeError, ValueError):
        return INVALID_DATE
    return num if 0 < num <= 12 else INVALID_DATE


def test_european_locale(slots: dict, locale=None):
    """
    Check whether a match made by a month/day/year pattern should be read as
    day/month/year instead. Only the DM1 and DM2 slots are examined.

    :param slots: dict of matched pattern groups
    :param locale: "euro" forces the day/month reading; any other value means
        the day/month reading is used only when the values are unambiguous
        (03/05 could be Mar-5th or May-3rd, 30/05 can only be 30-May).
    :return: (day, month) when the day/month reading applies; (-1, -1) when the
        values cannot form a date under the applicable reading; (None, None)
        when nothing can be concluded.
    """
    if not slots or "DM1" not in slots or "DM2" not in slots:
        return None, None

    try:
        day = int(slots["DM1"])
        mon = int(slots["DM2"])
    except (TypeError, ValueError):
        return None, None

    if locale == "euro":
        # Forced DAY/MON reading. Zero is rejected too: a 0 day would otherwise
        # look like "no day" to the caller and be re-read from another slot.
        if 0 < day <= 31 and 0 < mon <= 12:
            return day, mon
        return -1, -1

    # Guessing: only commit when the first number cannot be a month.
    if day > 12:
        if mon > 12 or day > 31:
            # e.g., 13/13/..., 30/13/... or 45/12/...
            return -1, -1
        if mon > 0:
            # e.g., 31/12/... New Year's Eve.
            return day, mon
    return None, None


# The name matches pytest's default "test_*" collection rule; mark this helper
# as not-a-test so importing it into a test module does not make pytest run it.
test_european_locale.__test__ = False


def _get_year(slots: dict) -> "str | None":
    """
    Return the raw year text from the first non-empty slot among YEAR, YY,
    YEARYY and YY2, or None when none of them is set.
    """
    return slots.get("YEAR") or slots.get("YY") or slots.get("YEARYY") or slots.get("YY2")


def normalize_year(slots) -> int:
    """
    Resolve the matched year to a four-digit integer.

    A YEAR slot in the range 1..MAXIMUM_YEAR-1 wins. Otherwise YY / YY2 and
    then YEARYY (optionally written with an apostrophe, as in '17) are used:
    values of four or more digits must fall in that same range, and one- or
    two-digit values are expanded to 20xx when they do not exceed
    FUTURE_YY_THRESHOLD and to 19xx otherwise.

    :param slots: dict of matched pattern groups
    :return: year, or INVALID_DATE if no usable year was found.
    """
    if not slots:
        return INVALID_DATE

    try:
        year_str = slots.get("YEAR")
        if year_str:
            year = int(year_str)
            if 0 < year < MAXIMUM_YEAR:
                return year

        yy = slots.get("YY") or slots.get("YY2")
        yearyy = slots.get("YEARYY")
        if yy:
            digits = yy
        elif yearyy:
            # "class of '17", "22 Jun '17": drop the apostrophe before parsing.
            digits = yearyy.strip("'")
        else:
            return INVALID_DATE
        year = int(digits)
    except (AttributeError, TypeError, ValueError):
        return INVALID_DATE

    if len(digits) >= 4:
        # A fully written year is range-checked and never gets a century added.
        return year if 0 < year < MAXIMUM_YEAR else INVALID_DATE

    if not 0 <= year <= 99:
        return INVALID_DATE

    # Short year. Note -- "24" could be 1924 or 2024; anything beyond the near
    # future is more likely in the 1900s ('27 is read as 1927).
    if year <= FUTURE_YY_THRESHOLD:
        return MILLENNIUM + year
    return 1900 + year


def normalize_tz(slots):
    """
    Parse the timezone slot, if any.

    SHORT_TZ is parsed with arrow's "Z" token; if it is not set, LONG_TZ is
    parsed with "ZZZ".

    :param slots: dict of matched pattern groups
    :return: the arrow.Arrow object produced by the parse (callers read its
        ``tzinfo``), or None when no timezone slot is set or parsing fails.
    """
    try:
        for slot_name, token in (("SHORT_TZ", "Z"), ("LONG_TZ", "ZZZ")):
            tz = slots.get(slot_name)
            if tz:
                return arrow.get(tz, token)
    except Exception:
        # Covers parser errors as well as slots not being a dict.
        return None
    return None


def normalize_time(slots):
    """
    Derive a valid time tuple from the hh, mm and ss slots.

    :param slots: dict of matched pattern groups
    :return: tuple of (H, M, S, resolution), or None when the hour or minute is
        missing or out of range. Seconds are optional: when missing or out of
        range, S is -1 and the resolution is Resolution.MINUTE; otherwise the
        resolution is Resolution.SECOND.
    """
    if not slots:
        return None

    hh_mm_ss = []
    for field in ("hh", "mm", "ss"):
        try:
            hh_mm_ss.append(int(slots.get(field)))
        except (TypeError, ValueError):
            # Slot absent, present but unset (None), or not numeric.
            hh_mm_ss.append(-1)

    hh, mm, ss = hh_mm_ss
    if not (0 <= hh < 24):
        return None
    if not (0 <= mm < 60):
        return None
    if 0 <= ss < 60:
        return hh, mm, ss, Resolution.SECOND
    return hh, mm, -1, Resolution.MINUTE


class XTemporal(PatternExtractor):
    """Pattern extractor configured with the date/time pattern file."""

    def __init__(self, cfg="datetime_patterns_py.cfg", debug=False, locale=None):
        """
        :param cfg: patterns config file.
        :param debug: passed to RegexPatternManager as both ``debug`` and
            ``testing``; also switches this module's logger to DEBUG.
        :param locale: when given, stored lower-cased as the module-wide default
            locale consulted by DateTimeMatch.normalize().
        """
        super().__init__(RegexPatternManager(cfg, debug=debug, testing=debug))
        if locale:
            global _default_locale
            _default_locale = locale.lower()

        if debug:
            log.setLevel("DEBUG")


class Resolution:
    """Single-letter codes for the finest unit a matched date/time resolves to."""
    YEAR = "Y"
    MONTH = "M"
    WEEK = "W"
    DAY = "D"
    HOUR = "H"
    MINUTE = "m"
    SECOND = "s"


class DateTimeMatch(PatternMatch):
    """
    DateTimeMatch puts out a matched date with attributes:

        datenorm -- ISO yyyy-mm-dd date
        epoch    -- seconds from 1970-01-01
        resolution -- M, D, m or s (see Resolution)
        locale  -- "north-am" or "euro".

    If locale is set using XTemporal(locale='euro')
    matching Euro-style dates will be forced as such throughout the document.
    When locale is not set, the default is to only use euro locale for dates
    that are not ambiguous, e.g., 30/05/1977.
    Ambiguous dates (with no default locale used) are parsed as "north-am".
    """

    def __init__(self, *args, **kwargs):
        PatternMatch.__init__(self, *args, **kwargs)
        self.case = PatternMatch.LOWER_CASE
        self.locale = "north-am"  # vs. "euro" vs...

    def __str__(self):
        return f"{self.text}"

    def normalize(self):
        """
        Validate the matched slots and encode them as a date/time.

        On success ``self.attrs`` holds datenorm, epoch, resolution and locale
        (plus timestamp when a time was matched and tzinfo when a timezone was
        matched), ``is_valid`` is True and ``filtered_out`` is False. Otherwise
        the match stays marked invalid and filtered out.

        :return: False when a validation step rejects the match, otherwise None.
        """
        PatternMatch.normalize(self)
        self.is_valid = False
        self.filtered_out = True

        # Slots to capture:
        #   MON_ABBREV, MON_NAME, MONTH, MM
        #   YEAR, YY, YEARYY
        #   DAY_ENUM, DOM, DD
        #   SHORT_TZ, LONG_TZ
        #   hh, mm, ss
        # TODO: TIMEX encodings
        slots = self.attributes()

        year = normalize_year(slots)
        if year is None or year == INVALID_DATE:
            return False

        day, month = None, None
        is_short_mdy = self.pattern_id in {"MDY-01", "MDY-02"}
        if is_short_mdy:
            day, month = test_european_locale(slots, _default_locale)  # Uses DM slots only
            if day and day < 0:
                return False
            if day and month:
                # Non-zero day/month returned from test
                self.locale = "euro"

        if not month:
            month = normalize_month_num(slots)
            if month <= 0:
                month = normalize_month_name(slots)

        if month < 0:
            return False

        resolution = Resolution.MONTH
        sep1 = slots.get("DSEP1")
        sep2 = slots.get("DSEP2")
        if sep1 and sep2 and sep1 != sep2:
            # Mixed separators, e.g., 12/05-1999.
            return False

        if sep1 == "." and is_short_mdy:
            # Dotted short dates with a two-digit year are rejected.
            # NOTE(review): presumably because they resemble version or section
            # numbers -- confirm against the pattern file.
            year_str = _get_year(slots)
            if year_str and len(year_str) == 2:
                return False

        if day:
            # Day already settled by the euro test above.
            resolution = Resolution.DAY
        else:
            day = normalize_day(slots)
            if day == INVALID_DAY:
                return False
            elif day == INVALID_DATE:
                # Missing day
                day = 1
            else:
                resolution = Resolution.DAY

        # Simple february catch:
        if month == 2 and day > 29:
            return False

        try:
            tz_found = None
            date_found = arrow.get(datetime(year, month, day))
            tm = normalize_time(slots)
            if tm:
                hr, minute, seconds, resolution = tm
                date_found = date_found.shift(hours=hr, minutes=minute)
                if 0 <= seconds < 60:
                    date_found = date_found.shift(seconds=seconds)
                tz_found = normalize_tz(slots)
                if tz_found:
                    # The matched clock time is local to the matched zone:
                    # attach the zone instead of converting the instant.
                    date_found = date_found.replace(tzinfo=tz_found.tzinfo)

            # Matchgroups are raw data from REGEX
            # Attributes are final encodings to share.
            self.attrs = {
                "datenorm": date_found.format("YYYY-MM-DD"),
                # timegm() expects UTC fields, so convert before encoding.
                "epoch": timegm(date_found.utctimetuple()),
                "resolution": resolution,
                "locale": self.locale
            }
            if tm:
                self.attrs["timestamp"] = date_found.format("YYYY-MM-DDTHH:mm:ssZ")
            if tz_found:
                self.attrs["tzinfo"] = tz_found.format("ZZZ")

            self.is_valid = True
            self.filtered_out = False
        except Exception as parse_err:
            # For debugging purposes -- but ideally, you IGNORE
            # date/time values that are marked filtered_out = True
            self.attrs["error"] = str(parse_err)
            log.info("Parsing error: DATE: %s (YMD = %d / % d / %d )", self.text, year, month, day)
            log.debug("Exception - ", exc_info=parse_err)
def
normalize_day(slots):
def normalize_day(slots):
    """
    Pull the day of month out of the matched slots.

    The first non-empty slot among DM2, DOM and DD is used.

    :param slots: dict of matched pattern groups
    :return: day of month, 1-31; INVALID_DAY if slots is empty or the value is
        numeric but out of range; INVALID_DATE if no day slot is present or the
        value is not numeric.
    """
    if not slots:
        return INVALID_DAY

    day = slots.get("DM2") or slots.get("DOM") or slots.get("DD")
    if not day:
        return INVALID_DATE
    try:
        day = int(day)
    except (TypeError, ValueError):
        # Non-numeric text is reported the same way as a missing day.
        return INVALID_DATE
    return day if 0 < day <= 31 else INVALID_DAY
:param slots: :return:
def
normalize_month_name(slots):
def normalize_month_name(slots):
    """
    Convert a matched month name or abbreviation to its month number.

    Only the first three characters of the MON_ABBREV / MON_NAME slot are
    parsed, using arrow's "MMM" token.

    :param slots: dict of matched pattern groups
    :return: month number, 1-12, or INVALID_DATE if the slot is missing, has an
        implausible length (under 3 or over 11 characters) or cannot be parsed.
    """
    if not slots:
        return INVALID_DATE

    text = slots.get("MON_ABBREV") or slots.get("MON_NAME")
    if not text:
        return INVALID_DATE
    if not 3 <= len(text) <= 11:
        return INVALID_DATE
    try:
        return arrow.get(text[0:3], "MMM").month
    except Exception:
        # The parser's exception class differs between arrow releases.
        return INVALID_DATE
:param slots: :return: month number (1-12) for the matched month name or abbreviation, or INVALID_DATE if it cannot be parsed
def
normalize_month_num(slots: dict):
def normalize_month_num(slots: dict):
    """
    Pull a numeric month out of the matched slots.

    The first non-empty slot among DM1, MM and MONTH is used.

    :param slots: dict of matched pattern groups
    :return: month number, 1-12, or INVALID_DATE if absent, non-numeric or
        out of range.
    """
    if not slots:
        return INVALID_DATE

    month_num = slots.get("DM1") or slots.get("MM") or slots.get("MONTH")
    if not month_num:
        return INVALID_DATE
    try:
        num = int(month_num)
    except (TypeError, ValueError):
        return INVALID_DATE
    return num if 0 < num <= 12 else INVALID_DATE
returns month number :param slots: :return: month num, 1-12
def
test_european_locale(slots: dict, locale=None):
def test_european_locale(slots: dict, locale=None):
    """
    Check whether a match made by a month/day/year pattern should be read as
    day/month/year instead. Only the DM1 and DM2 slots are examined.

    :param slots: dict of matched pattern groups
    :param locale: "euro" forces the day/month reading; any other value means
        the day/month reading is used only when the values are unambiguous
        (03/05 could be Mar-5th or May-3rd, 30/05 can only be 30-May).
    :return: (day, month) when the day/month reading applies; (-1, -1) when the
        values cannot form a date under the applicable reading; (None, None)
        when nothing can be concluded.
    """
    if not slots or "DM1" not in slots or "DM2" not in slots:
        return None, None

    try:
        day = int(slots["DM1"])
        mon = int(slots["DM2"])
    except (TypeError, ValueError):
        return None, None

    if locale == "euro":
        # Forced DAY/MON reading. Zero is rejected too: a 0 day would otherwise
        # look like "no day" to the caller and be re-read from another slot.
        if 0 < day <= 31 and 0 < mon <= 12:
            return day, mon
        return -1, -1

    # Guessing: only commit when the first number cannot be a month.
    if day > 12:
        if mon > 12 or day > 31:
            # e.g., 13/13/..., 30/13/... or 45/12/...
            return -1, -1
        if mon > 0:
            # e.g., 31/12/... New Year's Eve.
            return day, mon
    return None, None


# The name matches pytest's default "test_*" collection rule; mark this helper
# as not-a-test so importing it into a test module does not make pytest run it.
test_european_locale.__test__ = False
:param slots: :return: day, month
def
normalize_tz(slots):
def normalize_tz(slots):
    """
    Parse the timezone slot, if any.

    SHORT_TZ is parsed with arrow's "Z" token; if it is not set, LONG_TZ is
    parsed with "ZZZ".

    :param slots: dict of matched pattern groups
    :return: the arrow.Arrow object produced by the parse (callers read its
        ``tzinfo``), or None when no timezone slot is set or parsing fails.
    """
    try:
        for slot_name, token in (("SHORT_TZ", "Z"), ("LONG_TZ", "ZZZ")):
            tz = slots.get(slot_name)
            if tz:
                return arrow.get(tz, token)
    except Exception:
        # Covers parser errors as well as slots not being a dict.
        return None
    return None
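:param slots: :return: arrow.Arrow object parsed from the SHORT_TZ or LONG_TZ slot (read its tzinfo), or None if no timezone was matched or parsing failed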
def
normalize_time(slots):
def normalize_time(slots):
    """
    Derive a valid time tuple from the hh, mm and ss slots.

    :param slots: dict of matched pattern groups
    :return: tuple of (H, M, S, resolution), or None when the hour or minute is
        missing or out of range. Seconds are optional: when missing or out of
        range, S is -1 and the resolution is Resolution.MINUTE; otherwise the
        resolution is Resolution.SECOND.
    """
    if not slots:
        return None

    hh_mm_ss = []
    for field in ("hh", "mm", "ss"):
        try:
            hh_mm_ss.append(int(slots.get(field)))
        except (TypeError, ValueError):
            # Slot absent, present but unset (None), or not numeric.
            hh_mm_ss.append(-1)

    hh, mm, ss = hh_mm_ss
    if not (0 <= hh < 24):
        return None
    if not (0 <= mm < 60):
        return None
    if 0 <= ss < 60:
        return hh, mm, ss, Resolution.SECOND
    return hh, mm, -1, Resolution.MINUTE
Derive a valid time tuple. :param slots: :return: tuple of H, M, S, resolution
class XTemporal(PatternExtractor):
    """Pattern extractor configured with the date/time pattern file."""

    def __init__(self, cfg="datetime_patterns_py.cfg", debug=False, locale=None):
        """
        :param cfg: patterns config file.
        :param debug: passed to RegexPatternManager as both ``debug`` and
            ``testing``; also switches this module's logger to DEBUG.
        :param locale: when given, stored lower-cased as the module-wide default
            locale.
        """
        global _default_locale

        manager = RegexPatternManager(cfg, debug=debug, testing=debug)
        super().__init__(manager)

        if locale:
            _default_locale = locale.lower()
        if debug:
            log.setLevel("DEBUG")
Discussion: Read first https://opensextant.github.io/Xponents/doc/Patterns.md
Example:
from opensextant.extractors.poli import PatternsOfLifeManager
from opensextant.FlexPat import PatternExtractor
# INIT
#=====================
# Invoke a particular REGEX rule set, here poli_patterns.cfg
# @see https://github.com/OpenSextant/Xponents/blob/master/Core/src/main/resources/poli_patterns.cfg
mgr = PatternsOfLifeManager("poli_patterns.cfg")
pex = PatternExtractor(mgr)
# DEV/TEST
#=====================
# "default_test()" is useful to run during development and
# encourages you to capture critical pattern variants in your "TEST" data.
# Look at your pass/fail situations -- what test cases are failing your rule?
test_results = pex.default_tests()
print("TEST RESULTS")
for result in test_results:
print(repr(result))
# RUN
#=====================
real_results = pex.extract(".... text blob 1-800-123-4567...")
print("REAL RESULTS")
for result in real_results:
print(repr(result))
print(" RAW DICT:", render_match(result))
XTemporal(cfg='datetime_patterns_py.cfg', debug=False, locale=None)
def __init__(self, cfg="datetime_patterns_py.cfg", debug=False, locale=None):
    """
    :param cfg: patterns config file.
    :param debug: passed to RegexPatternManager as both ``debug`` and
        ``testing``; also switches this module's logger to DEBUG.
    :param locale: when given, stored lower-cased as the module-wide default
        locale.
    """
    global _default_locale

    manager = RegexPatternManager(cfg, debug=debug, testing=debug)
    PatternExtractor.__init__(self, manager)

    if locale:
        _default_locale = locale.lower()
    if debug:
        log.setLevel("DEBUG")
:param cfg: patterns config file.
Inherited Members
class DateTimeMatch(PatternMatch):
    """
    DateTimeMatch puts out a matched date with attributes:

        datenorm -- ISO yyyy-mm-dd date
        epoch    -- seconds from 1970-01-01
        resolution -- M, D, m or s (see Resolution)
        locale  -- "north-am" or "euro".

    If locale is set using XTemporal(locale='euro')
    matching Euro-style dates will be forced as such throughout the document.
    When locale is not set, the default is to only use euro locale for dates
    that are not ambiguous, e.g., 30/05/1977.
    Ambiguous dates (with no default locale used) are parsed as "north-am".
    """

    def __init__(self, *args, **kwargs):
        PatternMatch.__init__(self, *args, **kwargs)
        self.case = PatternMatch.LOWER_CASE
        self.locale = "north-am"  # vs. "euro" vs...

    def __str__(self):
        return f"{self.text}"

    def normalize(self):
        """
        Validate the matched slots and encode them as a date/time.

        On success ``self.attrs`` holds datenorm, epoch, resolution and locale
        (plus timestamp when a time was matched and tzinfo when a timezone was
        matched), ``is_valid`` is True and ``filtered_out`` is False. Otherwise
        the match stays marked invalid and filtered out.

        :return: False when a validation step rejects the match, otherwise None.
        """
        PatternMatch.normalize(self)
        self.is_valid = False
        self.filtered_out = True

        # Slots to capture:
        #   MON_ABBREV, MON_NAME, MONTH, MM
        #   YEAR, YY, YEARYY
        #   DAY_ENUM, DOM, DD
        #   SHORT_TZ, LONG_TZ
        #   hh, mm, ss
        # TODO: TIMEX encodings
        slots = self.attributes()

        year = normalize_year(slots)
        if year is None or year == INVALID_DATE:
            return False

        day, month = None, None
        is_short_mdy = self.pattern_id in {"MDY-01", "MDY-02"}
        if is_short_mdy:
            day, month = test_european_locale(slots, _default_locale)  # Uses DM slots only
            if day and day < 0:
                return False
            if day and month:
                # Non-zero day/month returned from test
                self.locale = "euro"

        if not month:
            month = normalize_month_num(slots)
            if month <= 0:
                month = normalize_month_name(slots)

        if month < 0:
            return False

        resolution = Resolution.MONTH
        sep1 = slots.get("DSEP1")
        sep2 = slots.get("DSEP2")
        if sep1 and sep2 and sep1 != sep2:
            # Mixed separators, e.g., 12/05-1999.
            return False

        if sep1 == "." and is_short_mdy:
            # Dotted short dates with a two-digit year are rejected.
            # NOTE(review): presumably because they resemble version or section
            # numbers -- confirm against the pattern file.
            year_str = _get_year(slots)
            if year_str and len(year_str) == 2:
                return False

        if day:
            # Day already settled by the euro test above.
            resolution = Resolution.DAY
        else:
            day = normalize_day(slots)
            if day == INVALID_DAY:
                return False
            elif day == INVALID_DATE:
                # Missing day
                day = 1
            else:
                resolution = Resolution.DAY

        # Simple february catch:
        if month == 2 and day > 29:
            return False

        try:
            tz_found = None
            date_found = arrow.get(datetime(year, month, day))
            tm = normalize_time(slots)
            if tm:
                hr, minute, seconds, resolution = tm
                date_found = date_found.shift(hours=hr, minutes=minute)
                if 0 <= seconds < 60:
                    date_found = date_found.shift(seconds=seconds)
                tz_found = normalize_tz(slots)
                if tz_found:
                    # The matched clock time is local to the matched zone:
                    # attach the zone instead of converting the instant.
                    date_found = date_found.replace(tzinfo=tz_found.tzinfo)

            # Matchgroups are raw data from REGEX
            # Attributes are final encodings to share.
            self.attrs = {
                "datenorm": date_found.format("YYYY-MM-DD"),
                # timegm() expects UTC fields, so convert before encoding.
                "epoch": timegm(date_found.utctimetuple()),
                "resolution": resolution,
                "locale": self.locale
            }
            if tm:
                self.attrs["timestamp"] = date_found.format("YYYY-MM-DDTHH:mm:ssZ")
            if tz_found:
                self.attrs["tzinfo"] = tz_found.format("ZZZ")

            self.is_valid = True
            self.filtered_out = False
        except Exception as parse_err:
            # For debugging purposes -- but ideally, you IGNORE
            # date/time values that are marked filtered_out = True
            self.attrs["error"] = str(parse_err)
            log.info("Parsing error: DATE: %s (YMD = %d / % d / %d )", self.text, year, month, day)
            log.debug("Exception - ", exc_info=parse_err)
DateTimeMatch puts out a matched date with attributes:
datenorm -- ISO yyyy-mm-dd date
epoch -- seconds from 1970-01-01
resolution -- M, D, m or s (month, day, minute or second)
locale -- "north-am" or "euro".
If locale is set using XTemporal(locale='euro')
matching Euro-style dates will be forced as such throughout the document.
When locale is not set, the default is to only use euro locale for dates
that are not ambiguous, e.g., 30/05/1977.
Ambiguous dates (with no default locale used) are parsed as "north-am".
def
normalize(self):
def normalize(self):
    """
    Validate the matched slots and encode them as a date/time.

    On success ``self.attrs`` holds datenorm, epoch, resolution and locale
    (plus timestamp when a time was matched and tzinfo when a timezone was
    matched), ``is_valid`` is True and ``filtered_out`` is False. Otherwise
    the match stays marked invalid and filtered out.

    :return: False when a validation step rejects the match, otherwise None.
    """
    PatternMatch.normalize(self)
    self.is_valid = False
    self.filtered_out = True

    # Slots to capture:
    #   MON_ABBREV, MON_NAME, MONTH, MM
    #   YEAR, YY, YEARYY
    #   DAY_ENUM, DOM, DD
    #   SHORT_TZ, LONG_TZ
    #   hh, mm, ss
    # TODO: TIMEX encodings
    slots = self.attributes()

    year = normalize_year(slots)
    if year is None or year == INVALID_DATE:
        return False

    day, month = None, None
    is_short_mdy = self.pattern_id in {"MDY-01", "MDY-02"}
    if is_short_mdy:
        day, month = test_european_locale(slots, _default_locale)  # Uses DM slots only
        if day and day < 0:
            return False
        if day and month:
            # Non-zero day/month returned from test
            self.locale = "euro"

    if not month:
        month = normalize_month_num(slots)
        if month <= 0:
            month = normalize_month_name(slots)

    if month < 0:
        return False

    resolution = Resolution.MONTH
    sep1 = slots.get("DSEP1")
    sep2 = slots.get("DSEP2")
    if sep1 and sep2 and sep1 != sep2:
        # Mixed separators, e.g., 12/05-1999.
        return False

    if sep1 == "." and is_short_mdy:
        # Dotted short dates with a two-digit year are rejected.
        # NOTE(review): presumably because they resemble version or section
        # numbers -- confirm against the pattern file.
        year_str = _get_year(slots)
        if year_str and len(year_str) == 2:
            return False

    if day:
        # Day already settled by the euro test above.
        resolution = Resolution.DAY
    else:
        day = normalize_day(slots)
        if day == INVALID_DAY:
            return False
        elif day == INVALID_DATE:
            # Missing day
            day = 1
        else:
            resolution = Resolution.DAY

    # Simple february catch:
    if month == 2 and day > 29:
        return False

    try:
        tz_found = None
        date_found = arrow.get(datetime(year, month, day))
        tm = normalize_time(slots)
        if tm:
            hr, minute, seconds, resolution = tm
            date_found = date_found.shift(hours=hr, minutes=minute)
            if 0 <= seconds < 60:
                date_found = date_found.shift(seconds=seconds)
            tz_found = normalize_tz(slots)
            if tz_found:
                # The matched clock time is local to the matched zone:
                # attach the zone instead of converting the instant.
                date_found = date_found.replace(tzinfo=tz_found.tzinfo)

        # Matchgroups are raw data from REGEX
        # Attributes are final encodings to share.
        self.attrs = {
            "datenorm": date_found.format("YYYY-MM-DD"),
            # timegm() expects UTC fields, so convert before encoding.
            "epoch": timegm(date_found.utctimetuple()),
            "resolution": resolution,
            "locale": self.locale
        }
        if tm:
            self.attrs["timestamp"] = date_found.format("YYYY-MM-DDTHH:mm:ssZ")
        if tz_found:
            self.attrs["tzinfo"] = tz_found.format("ZZZ")

        self.is_valid = True
        self.filtered_out = False
    except Exception as parse_err:
        # For debugging purposes -- but ideally, you IGNORE
        # date/time values that are marked filtered_out = True
        self.attrs["error"] = str(parse_err)
        log.info("Parsing error: DATE: %s (YMD = %d / % d / %d )", self.text, year, month, day)
        log.debug("Exception - ", exc_info=parse_err)
Optional, but recommended routine to normalize the matched data. That is, parse fields, uppercase, streamline punctuation, etc. As well, given such normalization result, this is the opportunity to additionally validate the match. :return: