A PyQt GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be stored and compiles monthly and yearly summaries.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/datefinder/__init__.py

346 lines
12 KiB

import copy
import logging
import regex as re
from dateutil import tz, parser
from datefinder.date_fragment import DateFragment
from .constants import (
REPLACEMENTS,
TIMEZONE_REPLACEMENTS,
STRIP_CHARS,
DATE_REGEX,
ALL_GROUPS,
RANGE_SPLIT_REGEX,
)
logger = logging.getLogger("datefinder")
class DateFinder(object):
    """
    Locates dates in a text

    Candidate date strings are located with the package regular expressions
    (``DATE_REGEX``, ``RANGE_SPLIT_REGEX``) and then parsed with
    ``dateutil.parser.parse``.

    :param base_date: forwarded to ``dateutil.parser.parse`` as ``default``
    :param first: ``"day"`` enables ``dayfirst``, ``"year"`` enables
        ``yearfirst``; any other value leaves both flags ``False``
    """

    def __init__(self, base_date=None, first="month"):
        self.base_date = base_date
        self.dayfirst = False
        self.yearfirst = False
        if first == "day":
            self.dayfirst = True
        if first == "year":
            self.yearfirst = True

    def find_dates(self, text, source=False, index=False, strict=False):
        """
        Yield one result per date string found in ``text``

        :param text: text to scan
        :param source: also yield the sanitized matched string
        :param index: also yield the ``(start, end)`` indices of the match
        :param strict: forwarded to :meth:`extract_date_strings`
        :return: generator of parsed values; each item is a tuple
            ``(parsed[, date_string][, indices])`` when ``source`` and/or
            ``index`` is set, otherwise the bare parsed value
        """
        for date_string, indices, captures in self.extract_date_strings(
            text, strict=strict
        ):
            as_dt = self.parse_date_string(date_string, captures)
            if as_dt is None:
                ## Dateutil couldn't make heads or tails of it
                ## move on to next
                continue

            returnables = (as_dt,)
            if source:
                returnables = returnables + (date_string,)
            if index:
                returnables = returnables + (indices,)

            # Unwrap the 1-tuple so the plain call yields bare values
            if len(returnables) == 1:
                returnables = returnables[0]
            yield returnables

    def _find_and_replace(self, date_string, captures):
        """
        Lower-case ``date_string`` and substitute every REPLACEMENTS key and
        every captured timezone string that stands alone between whitespace
        or string boundaries.

        :warning: when multiple tz matches exist the last sorted capture will trump
        :param date_string: candidate date string
        :param captures: capture dict; only the ``"timezones"`` entry is read
        :return: date_string, tz_string
        """
        timezones = captures.get("timezones", [])

        # add timezones to replace
        cloned_replacements = copy.copy(REPLACEMENTS)  # don't mutate
        for tz_string in timezones:
            # Captured timezone text is literal input, not a pattern: escape it
            # so characters special to the regex engine cannot alter (or
            # break) the substitution pattern built below.
            cloned_replacements[re.escape(tz_string)] = " "

        date_string = date_string.lower()
        for key, replacement in cloned_replacements.items():
            # we really want to match all permutations of the key surrounded by whitespace chars except one
            # for example: consider the key = 'to'
            # 1. match 'to '
            # 2. match ' to'
            # 3. match ' to '
            # but never match r'(\s|)to(\s|)' which would make 'october' > 'ocber'
            date_string = re.sub(
                r"(^|\s)" + key + r"(\s|$)",
                replacement,
                date_string,
                flags=re.IGNORECASE,
            )

        return date_string, self._pop_tz_string(sorted(timezones))

    def _pop_tz_string(self, list_of_timezones):
        """
        Pop the last timezone from ``list_of_timezones`` and map it through
        TIMEZONE_REPLACEMENTS.

        :param list_of_timezones: list of timezone strings (mutated by ``pop``)
        :return: the replacement abbreviation if one is registered, the popped
            string otherwise, or ``""`` when there is nothing to pop
        """
        if not list_of_timezones:
            return ""
        tz_string = list_of_timezones.pop()
        # make sure it's not a timezone we
        # want replaced with better abbreviation
        return TIMEZONE_REPLACEMENTS.get(tz_string, tz_string)

    def _add_tzinfo(self, datetime_obj, tz_string):
        """
        take a naive datetime and add dateutil.tz.tzinfo object

        :param datetime_obj: naive datetime object
        :param tz_string: timezone name handed to ``dateutil.tz.gettz``
        :return: datetime object with tzinfo, or ``None`` when
            ``datetime_obj`` is ``None``
        """
        if datetime_obj is None:
            return None

        tzinfo_match = tz.gettz(tz_string)
        return datetime_obj.replace(tzinfo=tzinfo_match)

    def _parse_with_dateutil(self, date_string):
        """Run ``dateutil.parser.parse`` with this finder's settings."""
        return parser.parse(
            date_string,
            default=self.base_date,
            dayfirst=self.dayfirst,
            yearfirst=self.yearfirst,
        )

    def parse_date_string(self, date_string, captures):
        """
        Parse one candidate date string.

        The raw string is tried first. Only if that raises ``ValueError`` or
        ``OverflowError`` is the string cleaned with
        :meth:`_find_and_replace` and parsed a second time; in that fallback
        path a captured timezone, if any, is attached to the result.

        :param date_string: candidate date string
        :param captures: capture dict belonging to the candidate
        :return: the parsed value, or ``None`` when the cleaned string is
            shorter than 3 characters or the second parse fails
        """
        # For well formatted string, we can already let dateutils parse them
        # otherwise self._find_and_replace method might corrupt them
        try:
            return self._parse_with_dateutil(date_string)
        except (ValueError, OverflowError):
            pass

        # replace tokens that are problematic for dateutil
        date_string, tz_string = self._find_and_replace(date_string, captures)

        ## One last sweep after removing
        date_string = date_string.strip(STRIP_CHARS)
        ## Match strings must be at least 3 characters long
        ## < 3 tends to be garbage
        if len(date_string) < 3:
            return None

        try:
            logger.debug("Parsing %s with dateutil", date_string)
            as_dt = self._parse_with_dateutil(date_string)
        except Exception as e:
            # Deliberate best effort: an unparsable candidate is skipped by
            # the caller instead of aborting the whole scan.
            logger.debug(e)
            as_dt = None

        if tz_string:
            as_dt = self._add_tzinfo(as_dt, tz_string)
        return as_dt

    def extract_date_strings(self, text, strict=False):
        """
        Scans text for possible datetime strings and extracts them

        :param text: text to scan
        :param strict: Strict mode will only return dates sourced with day, month, and year
        :return: generator of ``(match_str, indices, captures)`` tuples
        """
        return self.extract_date_strings_inner(text, text_start=0, strict=strict)

    def extract_date_strings_inner(self, text, text_start=0, strict=False):
        """
        Extends extract_date_strings by text_start parameter: used in recursive calls to
        store true text coordinates in output

        :param text: text (or a part of a date range) to scan
        :param text_start: offset of ``text`` inside the original text; added
            to every reported index
        :param strict: only yield matches whose captures form a complete date
        :return: generator of ``(match_str, indices, captures)`` tuples
        """
        # Try to find ranges first
        rng = self.split_date_range(text)
        if rng and len(rng) > 1:
            for part_text, part_span in rng:
                # part_span is relative to ``text``; add our own offset so the
                # indices stay correct when this call is itself offset.
                for found in self.extract_date_strings_inner(
                    part_text,
                    text_start=text_start + part_span[0],
                    strict=strict,
                ):
                    yield found
            return

        tokens = self.tokenize_string(text)
        items = self.merge_tokens(tokens)
        for match in items:
            match_str = match.match_str
            indices = (match.indices[0] + text_start, match.indices[1] + text_start)

            ## Get individual group matches
            captures = match.captures
            digits = captures.get("digits")
            months = captures.get("months")
            years = captures.get("years")

            if strict:
                complete = (
                    len(digits) == 3  # 12-05-2015
                    # 19 February 2013 year 09:10
                    or (len(months) == 1 and len(digits) == 2)
                    # 09/06/2018
                    or (len(years) == 1 and len(digits) == 2)
                    # '19th day of May, 2015'
                    or (len(years) == 1 and len(months) == 1 and len(digits) == 1)
                )
                if not complete:
                    continue

            ## sanitize date string
            ## replace unhelpful whitespace characters with single whitespace
            match_str = re.sub(r"[\n\t\s\xa0]+", " ", match_str)
            match_str = match_str.strip(STRIP_CHARS)

            ## Save sanitized source string
            yield match_str, indices, captures

    def tokenize_string(self, text):
        """
        Get matches from source text. Method merge_tokens will later compose
        potential date strings out of these matches.

        Text between two DATE_REGEX matches (and before the first / after the
        last one) is emitted as a token with an empty group and empty
        captures, so the concatenated token texts reproduce ``text``.

        :param text: source text like 'the big fight at 2p.m. mountain standard time on ufc.com'
        :return: [(match_text, match_group, {match.capturesdict()}), ...]
        """
        items = []

        last_index = 0

        for match in DATE_REGEX.finditer(text):
            match_str = match.group(0)
            indices = match.span(0)
            captures = match.capturesdict()
            group = self.get_token_group(captures)

            # Unmatched gap before this match
            if indices[0] > last_index:
                items.append((text[last_index : indices[0]], "", {}))
            items.append((match_str, group, captures))
            last_index = indices[1]

        # Unmatched tail after the last match
        if last_index < len(text):
            items.append((text[last_index : len(text)], "", {}))

        return items

    def merge_tokens(self, tokens):
        """
        Makes potential date strings out of matches, got from tokenize_string method.

        Consecutive tokens that carry a group are glued into one fragment; a
        token without a group closes the current fragment. Fragments with
        fewer than ``MIN_MATCHES`` captures are dropped.

        :param tokens: [(match_text, match_group, {match.capturesdict()}), ...]
        :return: potential date strings
        """
        MIN_MATCHES = 3
        fragments = []
        frag = DateFragment()

        start_char, total_chars = 0, 0

        for token in tokens:
            tok_text, group, tok_capts = token[0], token[1], token[2]
            total_chars += len(tok_text)

            if not group:
                # Separator token: close the running fragment, start a new one
                if frag.indices[1] > 0:
                    if frag.get_captures_count() >= MIN_MATCHES:
                        fragments.append(frag)
                frag = DateFragment()
                start_char = total_chars
                continue

            if frag.indices[1] == 0:
                frag.indices = (start_char, total_chars)
            else:
                frag.indices = (frag.indices[0], total_chars)

            frag.match_str += tok_text

            for capt in tok_capts:
                if capt in frag.captures:
                    frag.captures[capt] += tok_capts[capt]
                else:
                    # Copy so the in-place ``+=`` above never mutates the
                    # capture lists owned by the token.
                    frag.captures[capt] = list(tok_capts[capt])

            start_char = total_chars

        # The text may end inside a fragment
        if frag.get_captures_count() >= MIN_MATCHES:
            fragments.append(frag)

        # Guarantee every known group is present so callers can len() it
        for frag in fragments:
            for gr in ALL_GROUPS:
                if gr not in frag.captures:
                    frag.captures[gr] = []

        return fragments

    @staticmethod
    def get_token_group(captures):
        """
        Return the first group of ALL_GROUPS that has a non-empty capture.

        :param captures: capture dict of a single regex match
        :return: the group name, or ``""`` when no group captured anything
        """
        for gr in ALL_GROUPS:
            if captures.get(gr):
                return gr
        return ""

    @staticmethod
    def split_date_range(text):
        """
        Split ``text`` at every RANGE_SPLIT_REGEX match.

        :param text: text to split
        :return: list of ``(part_text, (start, end))`` tuples for the
            non-empty parts between the matches; the matched separators
            themselves are not included
        """
        st_matches = RANGE_SPLIT_REGEX.finditer(text)
        start = 0
        parts = []  # List[Tuple[str, Tuple[int, int]]]

        for match in st_matches:
            match_start = match.start()
            if match_start > start:
                parts.append((text[start:match_start], (start, match_start)))
            start = match.end()

        if start < len(text):
            parts.append((text[start:], (start, len(text))))

        return parts
def find_dates(
    text, source=False, index=False, strict=False, base_date=None, first="month"
):
    """
    Extract datetime strings from text

    Convenience wrapper: builds a :class:`DateFinder` from ``base_date`` and
    ``first`` and delegates to its ``find_dates`` method.

    :param text:
        A string that contains one or more natural language or literal
        datetime strings
    :type text: str|unicode
    :param source:
        Return the original string segment
    :type source: boolean
    :param index:
        Return the indices where the datetime string was located in text
    :type index: boolean
    :param strict:
        Only return datetimes with complete date information. For example:
        `July 2016` or `Monday` will not return datetimes.
        `May 16, 2015` will return datetimes.
    :type strict: boolean
    :param base_date:
        Set a default base datetime when parsing incomplete dates
    :type base_date: datetime
    :param first:
        Whether to interpret the first value in an ambiguous 3-integer date
        (01/02/03) as the month, day, or year. Values can be `month`, `day`, `year`.
        Default is `month`.
    :type first: str|unicode

    :return: Returns a generator that produces :mod:`datetime.datetime` objects,
        or a tuple with the source text and index, if requested
    """
    finder_options = {"base_date": base_date, "first": first}
    search_options = {"source": source, "index": index, "strict": strict}
    return DateFinder(**finder_options).find_dates(text, **search_options)