# EINService/EINService.py

import requests as rq
from bs4 import BeautifulSoup as bsp
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from logging import warning, debug, error
from pandas import DataFrame, Series
import pandas as pd


# Example EIN in the dashed XX-XXXXXXX form; not referenced by any code visible in this file.
SAMPLE_EIN = "59-1571026"


@dataclass
class EINData:
    """
    Represents a basic set of data related to an EIN.

    Every value is normalised on construction so that records describing the
    same business compare equal regardless of case, padding or punctuation:
        - ein: stored as 9 digits with the dash removed
        - buinessName, address1, city, state: lower-cased and stripped
        - zip: dashes removed and stored as an int (leading zeros are lost)
        - phone: (, ), - and + are removed

    The fields are declared explicitly so that the ``__eq__`` and ``__repr__``
    generated by ``@dataclass`` cover the actual data. With no declared fields
    the generated ``__eq__`` treats every pair of instances as equal.
    """
    ein: str
    buinessName: str
    address1: str
    city: str
    state: str
    zip: int
    phone: str

    # Two digits, an optional dash, seven digits. Unannotated on purpose so
    # that the dataclass machinery does not treat it as a field.
    _EIN_PATTERN = re.compile(r"\d{2}-?\d{7}")

    def __init__(self, ein: str, buinessName: str, address1: str, city:str, state:str, zip: str, phone: str) -> None:
        """
        Validate and normalise the raw values.

        Raises:
            ValueError: if ``ein`` is not of the form XX-XXXXXXX or XXXXXXXXX
                (surrounding whitespace is ignored), or if ``zip`` is not
                numeric once dashes are removed.
        """
        ein = str(ein).strip()
        # fullmatch, not search: text that merely *contains* an EIN must be
        # rejected, otherwise the surrounding junk ends up stored in self.ein.
        if self._EIN_PATTERN.fullmatch(ein) is None:
            raise ValueError(f"Invalid EIN: {ein}")
        self.ein = ein.replace('-', '')
        self.buinessName = buinessName.lower().strip()
        self.address1 = address1.lower().strip()
        self.city = city.lower().strip()
        self.state = state.lower().strip()
        # One pass that deletes every '-', '(', ')' and '+'.
        self.phone = phone.strip().translate(str.maketrans('', '', '-()+'))
        self.zip = int(zip.replace('-', '').strip())

    def __str__(self) -> str:
        """
        Used to print the object
        """
        return f"""EIN: {self.ein}\t | Name: {self.buinessName}\t\t| Address: {self.address1}\t\t| City: {self.city}\t| State: {self.state}\t| Phone: {self.phone}"""

    def get_ein(self) -> str:
        """
        Returns the associated EIN in the format: XX-XXXXXXX
        """
        return f"{self.ein[0:2]}-{self.ein[2:]}"

    def compare(self, otherEIN: 'EINData') -> 'dict | None':
        """
        Compares the EIN object with another.

        Returns a match dictionary containing True/False for whether each of
        buinessName, address, city, state and zip matched, plus a "score" key
        holding the number of matching entries.

        Returns None (after logging an error) if ``otherEIN`` does not carry
        the attributes needed for the comparison.
        """
        try:
            # Both sides were normalised by __init__, so plain equality is enough.
            compareDict = {
                "buinessName": self.buinessName == otherEIN.buinessName,
                "address": self.address1 == otherEIN.address1,
                "city": self.city == otherEIN.city,
                "state": self.state == otherEIN.state,
                "zip": self.zip == otherEIN.zip,
            }
        except AttributeError as e:
            error(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""")
            # If we cannot successfully compare the data return None
            return None
        debug(compareDict)
        # The score is computed before the "score" key itself is inserted.
        compareDict["score"] = sum(1 for matched in compareDict.values() if matched)
        return compareDict

    def as_dict(self) -> dict:
        """
        Returns the business details as a plain dict. The EIN itself is not included.
        """
        return {
            "BusinessName": self.buinessName,
            "Address1": self.address1,
            "City": self.city,
            "State": self.state,
            "Zip": self.zip,
            "Phone": self.phone,
        }

    def as_series(self) -> Series:
        """
        Converts object into a pandas Series (same keys as as_dict)
        """
        return Series(self.as_dict())


def dataframe_to_eins(df: DataFrame,
    einLabel: str = "Lessee Tax-ID",
    nameLabel: str = "NAME",
    addressLabel: str = "ADDRESS",
    cityLabel: str = "CITY",
    stateLabel: str = "STATE",
    zipLabel: str = "ZIP",
    phoneLabel: str = "PHONE"
    ) -> list[EINData]:
    """
    Converts a dataframe into a list of EINData objects.

    Each ``*Label`` argument names the column that holds the corresponding
    value. All seven columns must be present; if any is missing an error is
    logged and an empty list is returned.

    Rows that cannot be turned into an EINData (invalid EIN, non-string
    name/address values, non-numeric zip, ...) are logged as warnings and
    skipped, so the result may be shorter than the dataframe.
    """
    debug(df)
    # Explicit membership test instead of `assert`, which is stripped under -O.
    required = (einLabel, nameLabel, addressLabel, cityLabel, stateLabel, zipLabel, phoneLabel)
    missing = [label for label in required if label not in df.columns]
    if missing:
        error(f"NOT ALL REQUIRED COLUMNS PRESENT!\nMissing: {missing} | {df.columns}\n{df}")
        # Without the columns no row could be converted anyway.
        return []

    eins = []
    for _, data in df.iterrows():
        # _ is the row index which is not used; data is the row as a pandas Series.
        debug(data)
        try:
            eins.append(EINData(
                str(data[einLabel]),
                data[nameLabel],
                data[addressLabel],
                data[cityLabel],
                data[stateLabel],
                str(data[zipLabel]),
                str(data[phoneLabel])
            ))
        except Exception as e:
            # Broad on purpose: EINData signals a bad EIN with a plain Exception,
            # and one bad row must not abort the whole conversion.
            warning(f"Could not add {data[einLabel]}! {e}")
            continue
    return eins

class __EINService(ABC):
    """
    This is an abstract base class used to define the interface for services that
    can be used to search EINS.

    DO NOT INSTANTIATE THIS CLASS

    The two search methods are declared as abstract classmethods; the concrete
    EINTaxIDService in this file overrides them with ordinary instance methods.
    """
    @classmethod
    @abstractmethod
    def search_ein(cls, ein: str) -> 'EINData':
        """
        Takes a an ein and returns information from a search using the service
        """

    @classmethod
    @abstractmethod
    def search_eins(cls, eins: 'list[str]') -> 'list[EINData]':
        """
        Takes a list of EINS and returns a list of EINData objects.

        !! Some items in the list may return None
        """

    @classmethod
    def _isEIN(cls, ein: str) -> bool:
        """
        Returns True when ``ein`` is exactly XX-XXXXXXX or XXXXXXXXX (digits only,
        surrounding whitespace ignored), False otherwise, including for
        non-string input.
        """
        if not isinstance(ein, str):
            return False
        # fullmatch so that longer digit runs or embedded EINs are rejected.
        return re.fullmatch(r"\d{2}-?\d{7}", ein.strip()) is not None


class EINTaxIDService(__EINService):
    """
    Concrete class of __EINService using eintaxid.com.

    There seems to be not rate limiting on this service at the moment.
    It's not acutally meant to act as an API. This is a JQuery/php service used to
    hydrate their webapp, but we can call it directly
    """
    _url = "https://eintaxid.com"
    # Seconds to wait for the site. requests has no default timeout, so
    # without this a stalled connection would block forever.
    _timeout = 30

    def search_ein(self, ein: str) -> 'EINData | None':
        """
        Looks up a single EIN on the site.

        Returns an EINData built from the response, or None (after logging a
        warning) when the EIN is malformed, the response does not have the
        expected layout, or a required field could not be parsed from it.
        Network errors raised by requests, including timeouts, propagate.
        """
        debug(ein)
        # We don't even want to attempt this unless we're using a valid EIN
        if not isinstance(ein, str) or not self._isEIN(ein):
            warning(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX")
            return None
        # Send a POST HTTP request to the site using the search-ajax script
        # query just needs to include EIN, XML header required for parsing
        req = rq.request(
            "POST",
            self._url + "/search-ajax.php",
            data={"query": ein},
            headers={'X-Requested-With': 'XMLHttpRequest'},
            timeout=self._timeout,
        )
        debug(req)
        # Use BeautifulSoup to parse the HTML content
        soup = bsp(req.content, "html.parser")
        debug(soup)
        # The actual data is expected at index 4 of the text lines; the lines
        # before it are DIV set up.
        lines = soup.text.splitlines()
        if len(lines) <= 4:
            warning(f"Failed: {ein} | {soup}")
            return None
        data = self._parse_return(lines[4])

        # EINData needs real strings for all of these, so a partial parse
        # cannot be turned into a record.
        required = ("company", "address1", "city", "state", "zip", "phone")
        missing = [key for key in required if data[key] is None]
        if missing:
            warning(f"Failed: {ein} | could not parse {missing}")
            return None

        return EINData(
            ein,
            data["company"],
            data["address1"],
            data["city"],
            data["state"],
            data["zip"],
            data["phone"]
        )

    def search_eins(self, eins: 'list[str]', asDataFrame: bool = False) -> 'list[EINData] | DataFrame':
        """
        Looks up several EINs, one request per EIN.

        With ``asDataFrame`` False (the default) returns a list aligned with
        ``eins`` whose entries are EINData or None for failed lookups.

        With ``asDataFrame`` True returns a DataFrame with one row per
        successful lookup, columns taken from EINData.as_dict() and the dashed
        EIN as index; failed lookups are left out.

        A single EIN string is accepted and treated as a one-element list.
        """
        if isinstance(eins, str):
            # Iterating a bare string would look up each character separately.
            eins = [eins]
        results = [self.search_ein(ein) for ein in eins]
        if not asDataFrame:
            return results
        found = [result for result in results if result is not None]
        return pd.DataFrame(
            [result.as_dict() for result in found],
            index=[result.get_ein() for result in found],
        )

    def _parse_return(self, content: str) -> dict:
        """
        Use regex to parse the return of a call to the sites
        search script

        Expects a string created by BS4 using the HTML parse on request content
        (search_ein passes the text line at index 4 of the response).

        The address is expected to look like "street, city, STATE ZIP".

        Returns a dict with the keys company, dba, address, address1, city,
        state, zip and phone. Data that cannot be found will be returned as None
        """
        debug(f"EIN Service returned content:\n{content}")
        ein_label = "EIN Number:"
        dba_label = "Doing Business As:"
        address_label = "Address:"
        phone_label = "Phone:"

        # The company name is everything in front of the EIN label.
        m = re.search(ein_label, content)
        company = content[:m.start()].strip() if m is not None else None

        m = re.search(dba_label + ".*" + address_label, content)
        dba = None
        if m is not None:
            dba = content[m.start() + len(dba_label):m.end() - len(address_label)].strip()

        address = address1 = city = state = zip_code = phone = None
        m = re.search(address_label + ".*" + phone_label, content)
        if m is not None:
            address = content[m.start() + len(address_label):m.end() - len(phone_label)].strip()
            # The phone number is whatever follows the "Phone:" label.
            phone = content[m.end():].strip()

            # We need to split address into pieces, which are separated by ','
            pieces = [piece.strip() for piece in address.split(',')]
            address1 = pieces[0] or None
            if len(pieces) > 1:
                city = pieces[1] or None
            if len(pieces) > 2:
                # State and Zip are only separated by whitespace, not a ','
                state_zip = pieces[2].split()
                if state_zip:
                    state = state_zip[0]
                if len(state_zip) > 1:
                    zip_code = state_zip[1]

        return {
            "company": company,
            "dba": dba,
            "address": address,
            "address1": address1,
            "city": city,
            "state": state,
            "zip": zip_code,
            "phone": phone
        }