You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
265 lines
9.4 KiB
265 lines
9.4 KiB
import requests as rq
|
|
from bs4 import BeautifulSoup as bsp
|
|
import re
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass
|
|
from logging import warning, debug, error
|
|
from pandas import DataFrame, Series
|
|
import pandas as pd
|
|
|
|
|
|
# Example EIN in the hyphenated XX-XXXXXXX form.
# NOTE(review): not referenced by any code visible in this file - presumably
# kept for manual testing; confirm before removing.
SAMPLE_EIN = "59-1571026"
|
|
|
|
|
|
@dataclass
class EINData:
    """
    Represents a basic set of data related to an EIN:
    - EIN: stored as 9 digits with the dash removed
    - Business Name
    - Address1: the first line of an address (street and apt)
    - City
    - State
    - Zip: can handle '-'s; stored as an int
    - Phone number: (), -, and + will be removed

    Text fields are lower-cased and stripped so that two records can be
    compared directly. Every field except the EIN may be None when the value
    is unknown.

    Raises ValueError (from the constructor) when the EIN is not in the form
    XX-XXXXXXX or XXXXXXXXX.
    """

    # Declaring the fields lets @dataclass build __eq__/__repr__ from the real
    # data; with no declared fields every instance compared equal to every
    # other one. The annotations are quoted so they are never evaluated, which
    # keeps the "X | None" spelling safe on older interpreters.
    ein: str
    buinessName: "str | None"
    address1: "str | None"
    city: "str | None"
    state: "str | None"
    zip: "int | None"
    phone: "str | None"

    # Un-annotated class attributes are not treated as dataclass fields.
    _EIN_PATTERN = re.compile(r"\d{2}-?\d{7}")
    _PHONE_STRIP = str.maketrans("", "", "-()+")

    def __init__(self, ein: str, buinessName: "str | None", address1: "str | None",
                 city: "str | None", state: "str | None", zip: "str | None",
                 phone: "str | None") -> None:
        # str() so that an EIN read as a number is validated and stored the
        # same way as one read as text.
        ein_text = str(ein).strip()
        # fullmatch: the whole value must be an EIN, not merely contain one.
        if self._EIN_PATTERN.fullmatch(ein_text) is None:
            raise ValueError(f"Invalid EIN: {ein}")
        self.ein = ein_text.replace('-', '')
        self.buinessName = self._clean_text(buinessName)
        self.address1 = self._clean_text(address1)
        self.city = self._clean_text(city)
        self.state = self._clean_text(state)
        self.phone = None if phone is None else phone.strip().translate(self._PHONE_STRIP)
        self.zip = None if zip is None else int(zip.replace('-', '').strip())

    @staticmethod
    def _clean_text(value: "str | None") -> "str | None":
        """Lower-case and strip a text field, passing None through untouched."""
        return None if value is None else value.lower().strip()

    def __str__(self) -> str:
        """
        Used to print the object
        """
        return f"""EIN: {self.ein}\t | Name: {self.buinessName}\t\t| Address: {self.address1}\t\t| City: {self.city}\t| State: {self.state}\t| Phone: {self.phone}"""

    def get_ein(self) -> str:
        """
        Returns the associated EIN in the format: XX-XXXXXXX
        """
        return f"{self.ein[0:2]}-{self.ein[2:]}"

    def compare(self, otherEIN: 'EINData') -> "dict | None":
        """
        Compares the EIN object with another.

        Returns a match dictionary containing True/False for whether each data
        member matched ("buinessName", "address", "city", "state", "zip") and
        an overall "score" counting the matches. A field that is None on this
        object never counts as a match.

        Returns None if the comparison fails (e.g. otherEIN is not an EINData).
        """
        try:
            # Thanks to the standardization done in __init__ the values can be
            # compared directly.
            pairs = {
                "buinessName": (self.buinessName, otherEIN.buinessName),
                "address": (self.address1, otherEIN.address1),
                "city": (self.city, otherEIN.city),
                "state": (self.state, otherEIN.state),
                "zip": (self.zip, otherEIN.zip),
            }
        except AttributeError as e:
            error(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""")
            # If we cannot successfully compare the data return None
            return None

        compareDict = {
            label: (mine is not None and mine == theirs)
            for label, (mine, theirs) in pairs.items()
        }
        debug(compareDict)
        # True counts as 1, so the sum is the number of matching fields.
        compareDict["score"] = sum(compareDict.values())
        return compareDict

    def as_dict(self) -> dict:
        """
        Returns the non-EIN data members as a plain dictionary
        """
        return {"BusinessName": self.buinessName, "Address1": self.address1,
                "City": self.city, "State": self.state, "Zip": self.zip, "Phone": self.phone}

    def as_series(self) -> Series:
        """
        Converts object into a pandas Series
        """
        return Series(self.as_dict())
|
|
|
|
|
|
|
|
def dataframe_to_eins(df: DataFrame,
                      einLabel: str = "Lessee Tax-ID",
                      nameLabel: str = "NAME",
                      addressLabel: str = "ADDRESS",
                      cityLabel: str = "CITY",
                      stateLabel: str = "STATE",
                      zipLabel: str = "ZIP",
                      phoneLabel: str = "PHONE"
                      ) -> list[EINData]:
    """
    Converts a dataframe into a list of EINData objects.

    Requires the dataframe to contain a column for each of the given labels.
    If any column is missing an error is logged and an empty list is returned.
    Rows that cannot be converted are logged as warnings and skipped, so the
    result may be shorter than the dataframe.
    """
    debug(df)

    # Confirm all required columns exist before touching any row. This is an
    # explicit check rather than `assert`, which is stripped under `python -O`.
    required = (einLabel, nameLabel, addressLabel, cityLabel,
                stateLabel, zipLabel, phoneLabel)
    missing = [label for label in required if label not in df.columns]
    if missing:
        error(f"NOT ALL REQUIRED COLUMNS PRESENT!\nMissing: {missing} | {df.columns}\n{df}")
        return []

    eins = []
    # The row index is not used; `data` is a pandas Series and the column
    # labels are used to pull the values out of it.
    for _, data in df.iterrows():
        debug(data)
        try:
            eins.append(EINData(
                str(data[einLabel]),
                data[nameLabel],
                data[addressLabel],
                data[cityLabel],
                data[stateLabel],
                str(data[zipLabel]),
                str(data[phoneLabel])
            ))
        except Exception as e:
            # Best effort: report the row that failed (and why) and move on
            # to the next item in the dataframe.
            warning(f"Could not add {data[einLabel]}! {e}")
            continue
    return eins
|
|
|
|
class __EINService(ABC):
    """
    This is an abstract base class used to define the interface for services
    that can be used to search EINs.

    Subclasses implement search_ein and search_eins as ordinary instance
    methods.

    DO NOT INSTANTIATE THIS CLASS
    """

    @abstractmethod
    def search_ein(self, ein: str) -> "EINData | None":
        """
        Takes an EIN and returns information from a search using the service
        """

    @abstractmethod
    def search_eins(self, eins: "list[str]") -> "list[EINData | None]":
        """
        Takes a list of EINs and returns a list of EINData objects.

        !! Some items in the list may return None
        """

    @classmethod
    def _isEIN(cls, ein: str) -> bool:
        """
        Returns True when `ein` is a string of the form XX-XXXXXXX or
        XXXXXXXXX (surrounding whitespace is ignored), False otherwise.
        """
        # fullmatch so that a longer string merely containing an EIN is
        # rejected; non-strings are reported as invalid instead of raising.
        return isinstance(ein, str) and re.fullmatch(r"\d{2}-?\d{7}", ein.strip()) is not None
|
|
|
|
|
|
class EINTaxIDService(__EINService):
    """
    Concrete class of __EINService using eintaxid.com.

    There seems to be no rate limiting on this service at the moment.
    It's not actually meant to act as an API. This is a JQuery/php service
    used to hydrate their webapp, but we can call it directly.
    """
    _url = "https://eintaxid.com"
    # Seconds to wait for the site; without a timeout `requests` can block
    # indefinitely on a stalled connection.
    _timeout = 30

    def search_ein(self, ein: str) -> "EINData | None":
        """
        Looks up a single EIN on the site.

        Returns an EINData built from the response, or None (after logging a
        warning) when the EIN is invalid, the request fails, or the response
        cannot be turned into an EINData.
        """
        debug(ein)

        # We don't even want to attempt this unless we're using a valid EIN
        try:
            valid = self._isEIN(ein)
        except TypeError:
            # A non-string EIN cannot be pattern-matched; treat it as invalid.
            valid = False
        if not valid:
            warning(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX")
            return None

        # Send a POST HTTP request to the site using the search-ajax script.
        # The query just needs to include the EIN; the XMLHttpRequest header
        # is required to get the parseable response.
        try:
            req = rq.post(self._url + "/search-ajax.php", data={"query": ein},
                          headers={'X-Requested-With': 'XMLHttpRequest'},
                          timeout=self._timeout)
            req.raise_for_status()
        except rq.RequestException as e:
            warning(f"Failed: {ein} | {e}")
            return None
        debug(req)

        # Use BeautifulSoup to parse the HTML content
        soup = bsp(req.content, "html.parser")
        debug(soup)

        # The actual data returned is always on line 4 (0-based). The rest is
        # DIV set up.
        lines = soup.text.splitlines()
        if len(lines) <= 4:
            warning(f"Failed: {ein} | {soup}")
            return None
        data = self._parse_return(lines[4])

        try:
            return EINData(
                ein,
                data["company"],
                data["address1"],
                data["city"],
                data["state"],
                data["zip"],
                data["phone"]
            )
        except (AttributeError, TypeError, ValueError) as e:
            # Raised when a parsed value is missing or malformed and EINData
            # cannot normalize it.
            warning(f"Failed: {ein} | {e}")
            return None

    def search_eins(self, eins: "list[str]", asDataFrame: bool = False) -> "list[EINData | None] | DataFrame":
        """
        Looks up every EIN in `eins`.

        Returns a list with one entry per EIN (None where the lookup failed),
        or, when asDataFrame is True, a DataFrame with one row per EIN built
        from EINData.as_dict() (failed lookups become empty rows).
        """
        if isinstance(eins, str):
            # A bare string would otherwise be iterated character by character.
            eins = [eins]
        results = [self.search_ein(ein) for ein in eins]
        if not asDataFrame:
            return results
        # An empty dict keeps the row position of a failed lookup so the frame
        # stays aligned with the input list.
        return DataFrame([{} if found is None else found.as_dict() for found in results])

    def _parse_return(self, content: str) -> dict:
        """
        Use regex to parse the return of a call to the site's search script.

        Expects a string created by BS4 using the HTML parser on the request
        content, specifically line 4 of this site's return.

        Returns a dict with the keys "company", "dba", "address", "address1",
        "city", "state", "zip" and "phone". Data that cannot be found will be
        returned as None.
        """
        debug(f"EIN Service returned content:\n{content}")

        # The company name is everything before the "EIN Number:" marker.
        m = re.search(r"EIN Number:", content)
        company = content[:m.start()].strip() if m is not None else None

        m = re.search(r"Doing Business As:(.*)Address:", content)
        dba = m.group(1).strip() if m is not None else None

        m = re.search(r"Address:(.*)Phone:", content)
        address = m.group(1).strip() if m is not None else None

        address1 = city = state = zip_code = None
        if address is not None:
            # We need to split the address into pieces, which are conveniently
            # separated by ','.
            pieces = [piece.strip() for piece in address.split(',')]
            if len(pieces) >= 3:
                # Assumes the last piece is "STATE ZIP" and the one before it
                # is the city; anything earlier (e.g. a suite number) belongs
                # to the street line - TODO confirm against real responses.
                address1 = ", ".join(pieces[:-2])
                city = pieces[-2]
                # State and Zip are only separated by a space, not a ','
                state_zip = pieces[-1].split()
                state = state_zip[0] if state_zip else None
                zip_code = state_zip[1] if len(state_zip) > 1 else None
            else:
                address1 = pieces[0]

        # The phone number is whatever follows the last "Phone:" marker.
        _, marker, tail = content.rpartition("Phone:")
        phone = tail.strip() if marker else None

        return {
            "company": company,
            "dba": dba,
            "address": address,
            "address1": address1,
            "city": city,
            "state": state,
            "zip": zip_code,
            "phone": phone
        }