InfoLeaseExtract/venv/Lib/site-packages/datefinder/__init__.py

import copy
import logging
import regex as re
from dateutil import tz, parser
from datefinder.date_fragment import DateFragment
from .constants import (
    REPLACEMENTS,
    TIMEZONE_REPLACEMENTS,
    STRIP_CHARS,
    DATE_REGEX,
    ALL_GROUPS,
    RANGE_SPLIT_REGEX,
)

logger = logging.getLogger("datefinder")


class DateFinder(object):
    """
    Locates dates in a text
    """

    def __init__(self, base_date=None, first="month"):
        """
        Store the parsing preferences used by later calls.

        :param base_date: value kept on ``self.base_date`` and handed to the
            parser as its ``default`` argument
        :param first: ``"day"`` turns on ``dayfirst``, ``"year"`` turns on
            ``yearfirst``; any other value leaves both flags off
        """
        self.base_date = base_date
        self.dayfirst = first == "day"
        self.yearfirst = first == "year"

    def find_dates(self, text, source=False, index=False, strict=False):
        """
        Yield the dates found in ``text``.

        Candidates come from ``extract_date_strings`` and are parsed with
        ``parse_date_string``; candidates that parse to ``None`` are skipped.

        :param text: text to scan
        :param source: when true, include the matched string in each result
        :param index: when true, include the match indices in each result
        :param strict: forwarded to ``extract_date_strings``
        :return: generator yielding the parsed value alone, or a tuple
            ``(parsed[, date_string][, indices])`` when ``source`` and/or
            ``index`` is requested
        """
        candidates = self.extract_date_strings(text, strict=strict)
        for date_string, indices, captures in candidates:
            as_dt = self.parse_date_string(date_string, captures)
            if as_dt is None:
                # The parser could not make sense of this candidate
                continue

            if not source and not index:
                # Nothing extra requested: hand back the bare value
                yield as_dt
                continue

            result = [as_dt]
            if source:
                result.append(date_string)
            if index:
                result.append(indices)
            yield tuple(result)

    def _find_and_replace(self, date_string, captures):
        """
        Replace tokens that get in the parser's way and pick a timezone string.

        The lower-cased ``date_string`` has every ``REPLACEMENTS`` key and every
        captured timezone replaced, but only where the token is delimited by
        whitespace or by the start/end of the string.

        :warning: when multiple tz matches exist the last sorted capture will trump
        :param date_string: candidate date string
        :param captures: mapping of capture-group names to lists of captured
            strings; only the ``"timezones"`` entry is read here
        :return: date_string, tz_string (``""`` when no timezone was captured)
        """
        timezones = captures.get("timezones", [])

        # add timezones to replace
        cloned_replacements = copy.copy(REPLACEMENTS)  # don't mutate
        for tz_string in timezones:
            # Captured text is literal, so escape it before it is spliced into
            # a pattern below; otherwise characters such as "+" or "." would be
            # read as regex syntax.
            cloned_replacements[re.escape(tz_string)] = " "

        date_string = date_string.lower()
        for key, replacement in cloned_replacements.items():
            # we really want to match all permutations of the key surrounded by whitespace chars except one
            # for example: consider the key = 'to'
            # 1. match 'to '
            # 2. match ' to'
            # 3. match ' to '
            # but never match r'(\s|)to(\s|)' which would make 'october' > 'ocber'
            date_string = re.sub(
                r"(^|\s)" + key + r"(\s|$)",
                replacement,
                date_string,
                flags=re.IGNORECASE,
            )

        return date_string, self._pop_tz_string(sorted(timezones))

    def _pop_tz_string(self, list_of_timezones):
        """
        Pop the last entry of ``list_of_timezones`` and translate it.

        :param list_of_timezones: list of timezone strings; its last element
            is removed by this call
        :return: the ``TIMEZONE_REPLACEMENTS`` value for the popped string when
            one exists, otherwise the popped string itself; ``""`` when the
            list is empty
        """
        try:
            last_tz = list_of_timezones.pop()
        except IndexError:
            # nothing was captured
            return ""
        # swap in the preferred abbreviation when one is configured
        return TIMEZONE_REPLACEMENTS.get(last_tz, last_tz)

    def _add_tzinfo(self, datetime_obj, tz_string):
        """
        Attach the tzinfo looked up for ``tz_string`` to ``datetime_obj``.

        :param datetime_obj: datetime object, or ``None``
        :param tz_string: timezone name handed to ``tz.gettz``
        :return: ``None`` when ``datetime_obj`` is ``None``; otherwise the
            result of ``datetime_obj.replace(tzinfo=...)`` with the looked-up
            tzinfo
        """
        if datetime_obj is not None:
            return datetime_obj.replace(tzinfo=tz.gettz(tz_string))
        return None

    def parse_date_string(self, date_string, captures):
        """
        Parse ``date_string``, cleaning it up only if the direct parse fails.

        :param date_string: candidate date string
        :param captures: capture dict passed on to ``_find_and_replace``
        :return: the parsed value, or ``None`` when the cleaned string is
            shorter than 3 characters or the second parse attempt fails
        """
        parse_options = {
            "default": self.base_date,
            "dayfirst": self.dayfirst,
            "yearfirst": self.yearfirst,
        }

        try:
            # Well formatted strings go straight to the parser; running them
            # through _find_and_replace first might corrupt them
            return parser.parse(date_string, **parse_options)
        except (ValueError, OverflowError):
            # replace tokens that are problematic for dateutil
            cleaned, tz_string = self._find_and_replace(date_string, captures)

            # One last sweep after removing
            cleaned = cleaned.strip(STRIP_CHARS)
            # Anything shorter than 3 characters tends to be garbage
            if len(cleaned) < 3:
                return None

            try:
                logger.debug("Parsing {0} with dateutil".format(cleaned))
                parsed = parser.parse(cleaned, **parse_options)
            except Exception as err:
                logger.debug(err)
                parsed = None

            # The timezone is only applied on this fallback path
            if tz_string:
                parsed = self._add_tzinfo(parsed, tz_string)
            return parsed

    def extract_date_strings(self, text, strict=False):
        """
        Scan ``text`` for possible datetime strings.

        Delegates to ``extract_date_strings_inner`` with a starting offset of 0.

        :param text: text to scan
        :param strict: Strict mode will only return dates sourced with day, month, and year
        :return: whatever ``extract_date_strings_inner`` returns
        """
        return self.extract_date_strings_inner(text, 0, strict)

    def extract_date_strings_inner(self, text, text_start=0, strict=False):
        """
        Extends extract_date_strings by text_start parameter: used in recursive calls to
        store true text coordinates in output

        :param text: text (or one part of a split date range) to scan
        :param text_start: offset added to every reported index pair
        :param strict: when true, skip matches whose captures do not amount to
            a complete day/month/year combination
        :return: generator of ``(match_str, indices, captures)`` tuples
        """

        # Try to find ranges first
        rng = self.split_date_range(text)
        if rng and len(rng) > 1:
            for part_text, part_span in rng:
                # part_span is relative to ``text``, so our own offset has to
                # be added or it is lost whenever text_start is non-zero.
                part_matches = self.extract_date_strings_inner(
                    part_text, text_start=text_start + part_span[0], strict=strict
                )
                for part_match in part_matches:
                    yield part_match
            return

        tokens = self.tokenize_string(text)
        for match in self.merge_tokens(tokens):
            ## Get individual group matches
            captures = match.captures

            if strict:
                digits = captures.get("digits")
                months = captures.get("months")
                years = captures.get("years")
                complete = (
                    len(digits) == 3  # 12-05-2015
                    # 19 February 2013 year 09:10
                    or (len(months) == 1 and len(digits) == 2)
                    # 09/06/2018
                    or (len(years) == 1 and len(digits) == 2)
                    # '19th day of May, 2015'
                    or (len(years) == 1 and len(months) == 1 and len(digits) == 1)
                )
                if not complete:
                    continue

            indices = (match.indices[0] + text_start, match.indices[1] + text_start)

            ## sanitize date string
            ## replace unhelpful whitespace characters with single whitespace
            match_str = re.sub(r"[\n\t\s\xa0]+", " ", match.match_str)
            match_str = match_str.strip(STRIP_CHARS)

            ## Save sanitized source string
            yield match_str, indices, captures

    def tokenize_string(self, text):
        """
        Split ``text`` into consecutive tokens using ``DATE_REGEX``.

        Each regex match becomes a token tagged with its group name; the text
        between matches (and after the last one) becomes a token with an empty
        group name and an empty captures dict. ``merge_tokens`` later composes
        potential date strings out of these tokens.

        :param text: source text like 'the big fight at 2p.m. mountain standard time on ufc.com'
        :return: [(match_text, match_group, {match.capturesdict()}), ...]
        """
        tokens = []
        cursor = 0  # end of the previous match

        for found in DATE_REGEX.finditer(text):
            begin, end = found.span(0)
            found_captures = found.capturesdict()

            if begin > cursor:
                # unmatched text between the previous match and this one
                tokens.append((text[cursor:begin], "", {}))
            tokens.append(
                (found.group(0), self.get_token_group(found_captures), found_captures)
            )
            cursor = end

        if cursor < len(text):
            # unmatched tail
            tokens.append((text[cursor:], "", {}))
        return tokens

    def merge_tokens(self, tokens):
        """
        Join runs of adjacent matched tokens into candidate date fragments.

        A token with an empty group name ends the current run. A run is kept
        only when its fragment reports at least ``MIN_MATCHES`` captures. Every
        kept fragment ends up with an entry for each name in ``ALL_GROUPS``.

        :param tokens: [(match_text, match_group, {match.capturesdict()}), ...]
        :return: list of fragments (potential date strings)
        """
        MIN_MATCHES = 3
        collected = []
        current = DateFragment()

        # ``consumed`` is the end offset of the tokens seen so far;
        # ``run_start`` is where the next token begins.
        run_start, consumed = 0, 0

        for token in tokens:
            piece, group, piece_captures = token[0], token[1], token[2]
            consumed += len(piece)

            if not group:
                # Separator: close the current run and start a fresh one
                if current.indices[1] > 0 and current.get_captures_count() >= MIN_MATCHES:
                    collected.append(current)
                current = DateFragment()
                run_start = consumed
                continue

            # First token of a run fixes the start; later ones only move the end
            span_start = run_start if current.indices[1] == 0 else current.indices[0]
            current.indices = (span_start, consumed)

            current.match_str += piece

            for group_name, values in piece_captures.items():
                if group_name in current.captures:
                    current.captures[group_name] += values
                else:
                    current.captures[group_name] = values

            run_start = consumed

        # The text may end in the middle of a run
        if current.get_captures_count() >= MIN_MATCHES:
            collected.append(current)

        for fragment in collected:
            for group_name in ALL_GROUPS:
                fragment.captures.setdefault(group_name, [])

        return collected

    @staticmethod
    def get_token_group(captures):
        """
        Return the first name in ``ALL_GROUPS`` with a non-empty capture.

        :param captures: mapping of group names to lists of captured strings
        :return: the group name, or ``""`` when no group captured anything
        """
        return next((name for name in ALL_GROUPS if captures.get(name)), "")

    @staticmethod
    def split_date_range(text):
        """
        Cut ``text`` at every ``RANGE_SPLIT_REGEX`` match.

        The matched separators are dropped, and so are empty pieces.

        :param text: text to split
        :return: list of ``(piece, (start, end))`` tuples, with the offsets
            relative to ``text``
        """
        pieces = []  # List[Tuple[str, Tuple[int, int]]]
        cursor = 0  # end of the previous separator

        for separator in RANGE_SPLIT_REGEX.finditer(text):
            sep_start, sep_end = separator.span()
            if sep_start > cursor:
                pieces.append((text[cursor:sep_start], (cursor, sep_start)))
            cursor = sep_end

        text_end = len(text)
        if cursor < text_end:
            pieces.append((text[cursor:], (cursor, text_end)))

        return pieces


def find_dates(
    text, source=False, index=False, strict=False, base_date=None, first="month"
):
    """
    Extract datetime strings from text

    Convenience wrapper: builds a :class:`DateFinder` from ``base_date`` and
    ``first`` and returns the result of its ``find_dates`` method.

    :param text:
        A string that contains one or more natural language or literal
        datetime strings
    :type text: str|unicode
    :param source:
        Return the original string segment
    :type source: boolean
    :param index:
        Return the indices where the datetime string was located in text
    :type index: boolean
    :param strict:
        Only return datetimes with complete date information. For example:
        `July 2016` or `Monday` will not return datetimes.
        `May 16, 2015` will return datetimes.
    :type strict: boolean
    :param base_date:
        Set a default base datetime when parsing incomplete dates
    :type base_date: datetime
    :param first:
        Whether to interpret the first value in an ambiguous 3-integer date
        (01/02/03) as the month, day, or year. Values can be `month`, `day`, `year`.
        Default is `month`.
    :type first: str|unicode

    :return: Returns a generator that produces :mod:`datetime.datetime` objects,
        or a tuple with the source text and index, if requested
    """
    return DateFinder(base_date=base_date, first=first).find_dates(
        text, source=source, index=index, strict=strict
    )