You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
EINService/EINService.py

265 lines
9.4 KiB

import requests as rq
from bs4 import BeautifulSoup as bsp
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from logging import warning, debug, error
from pandas import DataFrame, Series
import pandas as pd
# Example EIN written in the dashed XX-XXXXXXX form.
# NOTE(review): not referenced in the visible part of this file; presumably
# kept for manual testing of the services below - confirm before removing.
SAMPLE_EIN = "59-1571026"
@dataclass
class EINData:
    """
    Represents a basic, normalized set of data related to an EIN:
    - EIN: stored as nine digits without the dash (see get_ein)
    - Buiness Name
    - Address1: the 2 lines of an address (street and apt)
    - City
    - State
    - Zip: can handle '-'s; stored as an int, so leading zeros are not kept
    - phone number: (), -, and + will be removed

    Text fields are lower-cased and stripped so that two records can be
    compared with plain equality (see compare). Every field except the EIN
    may be None when the value is unknown.

    NOTE(review): @dataclass is applied to a class that declares no fields,
    so the generated __eq__/__repr__ do not look at the data. Use compare()
    and str() instead.
    """

    # Same EIN shape the original check accepted: XX-XXXXXXX or XXXXXXXXX.
    _EIN_PATTERN = re.compile(r"\d{2}-?\d{7}")
    # Characters removed from phone numbers.
    _PHONE_NOISE = str.maketrans("", "", "-()+")

    def __init__(self, ein: str, buinessName: str, address1: str, city: str,
                 state: str, zip: str, phone: str) -> None:
        """
        Validate the EIN and normalize every field.

        ein may be a str or anything whose str() contains an EIN (e.g. an
        int); only the matched EIN digits are stored. The remaining
        arguments are strings, or None when unknown.

        Raises ValueError (an Exception, as before) if no EIN can be found.
        """
        match = self._EIN_PATTERN.search(str(ein))
        if match is None:
            raise ValueError(f"Invalid EIN: {ein}")
        # Keep only the matched digits so stray text around the EIN is dropped
        self.ein = match.group().replace('-', '')
        self.buinessName = self._clean(buinessName)
        self.address1 = self._clean(address1)
        self.city = self._clean(city)
        self.state = self._clean(state)
        self.phone = None if phone is None else phone.strip().translate(self._PHONE_NOISE)
        self.zip = None if zip is None else int(zip.replace('-', '').strip())

    @staticmethod
    def _clean(value):
        """Lower-case and strip a text field; None is passed through."""
        return None if value is None else value.lower().strip()

    def __str__(self) -> str:
        """
        Used to print the object
        """
        return f"""EIN: {self.ein}\t | Name: {self.buinessName}\t\t| Address: {self.address1}\t\t| City: {self.city}\t| State: {self.state}\t| Phone: {self.phone}"""

    def get_ein(self) -> str:
        """
        Returns the associated EIN in the format: XX-XXXXXXX
        """
        return f"{self.ein[0:2]}-{self.ein[2:]}"

    def compare(self, otherEIN: 'EINData') -> 'dict | None':
        """
        Compares the EIN object with another.
        Returns a match dictionary containing True/False for whether each
        of name, address, city, state and zip matched, plus an overall
        "score" (the number of matching members).
        Returns None if comparison fails (e.g. otherEIN is None).
        """
        try:
            # Thanks to the standardization done in __init__, plain equality
            # is enough to decide whether two data points match
            compareDict = {
                "buinessName": self.buinessName == otherEIN.buinessName,
                "address": self.address1 == otherEIN.address1,
                "city": self.city == otherEIN.city,
                "state": self.state == otherEIN.state,
                "zip": self.zip == otherEIN.zip,
            }
        except AttributeError as e:
            error(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""")
            # If we cannot successfully compare the data return None
            return None
        debug(compareDict)
        # True counts as 1, so the sum is the number of matching members
        compareDict["score"] = sum(compareDict.values())
        return compareDict

    def as_dict(self) -> dict:
        """
        Converts object into a plain dict (the EIN itself is not included)
        """
        return {"BusinessName": self.buinessName, "Address1": self.address1,
                "City": self.city, "State": self.state, "Zip": self.zip, "Phone": self.phone}

    def as_series(self) -> Series:
        """
        Converts object into a pandas Series
        """
        return Series(self.as_dict())
def dataframe_to_eins(df: DataFrame,
                      einLabel: str = "Lessee Tax-ID",
                      nameLabel: str = "NAME",
                      addressLabel: str = "ADDRESS",
                      cityLabel: str = "CITY",
                      stateLabel: str = "STATE",
                      zipLabel: str = "ZIP",
                      phoneLabel: str = "PHONE"
                      ) -> list[EINData]:
    """
    Converts a dataframe into a list of EINData objects.

    Each *Label argument names the dataframe column holding that piece of
    data. All seven columns must be present; if any is missing the problem
    is logged and an empty list is returned.

    Rows that cannot be turned into an EINData (for example an invalid EIN
    or a missing value) are logged as warnings and skipped, so the result
    may be shorter than the dataframe.
    """
    debug(df)
    # Confirm all required columns exist. Explicit check instead of assert,
    # because asserts are stripped when Python runs with -O.
    columns = df.columns
    required = (einLabel, nameLabel, addressLabel, cityLabel,
                stateLabel, zipLabel, phoneLabel)
    missing = [label for label in required if label not in columns]
    if missing:
        error(f"NOT ALL REQUIRED COLUMNS PRESENT!\nLabels not present: {missing} | {columns}\n{df}")
        # Every row lookup would fail, so there is nothing to convert
        return []

    eins = []
    for _, data in df.iterrows():
        # _ is the row index which is not used
        # data is a pandas Series; the column labels pull the values from it
        debug(data)
        try:
            eins.append(EINData(
                str(data[einLabel]),
                data[nameLabel],
                data[addressLabel],
                data[cityLabel],
                data[stateLabel],
                str(data[zipLabel]),
                str(data[phoneLabel])
            ))
        except Exception as e:
            # EINData signals bad input with plain Exception subclasses;
            # report the failing row and move on to the next one
            warning(f"Could not add {data[einLabel]}! {e}")
            continue
    return eins
class __EINService(ABC):
    """
    This is an abstract base class used to define the interface for services that
    can be used to search EINS.
    DO NOT INSTANTIATE THIS CLASS

    The two search methods are declared as abstract classmethods; the
    concrete service in this file overrides them as regular instance methods.
    """

    @classmethod
    @abstractmethod
    def search_ein(cls, ein: str) -> 'EINData':
        """
        Takes an ein and returns information from a search using the service
        """

    @classmethod
    @abstractmethod
    def search_eins(cls, eins: 'list[str]') -> 'list[EINData]':
        """
        Takes a list of EINS and returns a list of EINData objects.
        !! Some items in the list may return None
        """

    @classmethod
    def _isEIN(cls, ein: str) -> bool:
        """
        Returns True if ein contains an EIN in the form XX-XXXXXXX or
        XXXXXXXXX. Non-string input is converted with str() first, so values
        such as None return False instead of raising.
        """
        return re.search(r"\d{2}-?\d{7}", str(ein)) is not None
class EINTaxIDService(__EINService):
    """
    Concrete class of __EINService using eintaxid.com.
    There seems to be no rate limiting on this service at the moment.
    It's not actually meant to act as an API. This is a JQuery/php service used to
    hydrate their webapp, but we can call it directly
    """
    _url = "https://eintaxid.com"
    # Seconds to wait for the site; without a timeout the HTTP call can
    # block forever.
    _timeout = 30

    def search_ein(self, ein: str) -> 'EINData | None':
        """
        Looks up a single EIN on the site.

        Returns an EINData built from the site's answer, or None when the
        EIN is not valid, the answer does not have the expected layout, or
        the returned data cannot be turned into an EINData.
        Network errors (including the timeout) raised by requests propagate
        to the caller.
        """
        debug(ein)
        # We don't even want to attempt this unless we're using a valid EIN
        try:
            valid = self._isEIN(ein)
        except TypeError:
            # Non-string input cannot be an EIN
            valid = False
        if not valid:
            warning(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX")
            return None

        # Send a POST HTTP request to the site using the search-ajax script
        # query just needs to include EIN, XML header required for parsing
        response = rq.post(
            self._url + "/search-ajax.php",
            data={"query": ein},
            headers={'X-Requested-With': 'XMLHttpRequest'},
            timeout=self._timeout,
        )
        debug(response)
        # Use BeautifulSoup to parse the HTML content
        soup = bsp(response.content, "html.parser")
        debug(soup)
        try:
            # The actual data return is always on line 4. The rest are DIV set up
            text = soup.text.splitlines()[4]
        except IndexError:
            warning(f"Failed: {ein} | {soup}")
            return None

        data = self._parse_return(text)
        if data["company"] is None:
            # No "EIN Number:" marker, so this line is not a search result
            warning(f"Failed: {ein} | {soup}")
            return None
        try:
            return EINData(
                ein,
                data["company"],
                data["address1"],
                data["city"],
                data["state"],
                data["zip"],
                data["phone"]
            )
        except Exception as e:
            # EINData signals values it cannot normalize with plain
            # Exception subclasses; treat that as a failed lookup
            warning(f"Failed: {ein} | {e}")
            return None

    def search_eins(self, eins: 'list[str]', asDataFrame: bool = False) -> 'list[EINData] | DataFrame':
        """
        Looks up every EIN in eins, in order.

        Returns a list with one entry per EIN (None for failed lookups).
        With asDataFrame=True returns a DataFrame with one row per EIN
        instead, using the columns of EINData.as_dict(); failed lookups
        become rows of missing values so rows stay aligned with eins.
        """
        if isinstance(eins, str):
            # A lone EIN would otherwise be iterated character by character
            eins = [eins]
        results = [self.search_ein(ein) for ein in eins]
        if not asDataFrame:
            return results
        rows = [{} if result is None else result.as_dict() for result in results]
        return pd.DataFrame(rows)

    def _parse_return(self, content: str) -> dict:
        """
        Use regex to parse the return of a call to the sites
        search script
        Expects a string created by BS4 using the HTML parse on request content
        Specifically line 4 of this sites return
        Data that cannot be found will be returned as None

        Returned keys: company, dba, address, address1, city, state, zip, phone
        """
        debug(f"EIN Service returned content:\n{content}")
        # The company name is everything before the "EIN Number:" marker
        marker = re.search(r"EIN Number:", content)
        company = content[:marker.start()].strip() if marker is not None else None

        dba_match = re.search(r"Doing Business As:(.*)Address:", content)
        dba = dba_match.group(1).strip() if dba_match is not None else None

        address_match = re.search(r"Address:(.*)Phone:", content)
        if address_match is None:
            address = address1 = city = state = zip_code = phone = None
        else:
            address = address_match.group(1).strip()
            # The phone number is whatever follows the "Phone:" marker
            phone = content[address_match.end():].strip()
            address1, city, state, zip_code = self._split_address(address)

        return {
            "company": company,
            "dba": dba,
            "address": address,
            "address1": address1,
            "city": city,
            "state": state,
            "zip": zip_code,
            "phone": phone
        }

    @staticmethod
    def _split_address(address: str) -> tuple:
        """
        Splits "street, city, ST ZIP" into (address1, city, state, zip).

        The split is anchored on the right so a street that itself contains
        commas stays in address1. If there are fewer than three
        comma-separated pieces the whole text is returned as address1 and
        the other parts are None.
        """
        pieces = [piece.strip() for piece in address.rsplit(",", 2)]
        if len(pieces) < 3:
            return address, None, None, None
        address1, city, state_zip = pieces
        # State and Zip are only separated by whitespace, not a ','
        tokens = state_zip.split()
        state = tokens[0] if tokens else None
        zip_code = tokens[1] if len(tokens) > 1 else None
        return address1, city, state, zip_code