Added more documentation

master
Griffiths Lott 3 years ago
parent d698197d52
commit 2b6c71598b
  1. 111
      EINService.py

@ -12,7 +12,15 @@ SAMPLE_EIN = "59-1571026"
@dataclass @dataclass
class EINData: class EINData:
"""
Represents a basic set of data related to an EIN:
- Business Name
- Address1: the 2 lines of an address (street and apt)
- City
- State
- Zip: can handle '-'s
- phone number: (), -, and + will be removed
"""
def __init__(self, ein: str, buinessName: str, address1: str, city:str, state:str, zip: str, phone: str) -> None: def __init__(self, ein: str, buinessName: str, address1: str, city:str, state:str, zip: str, phone: str) -> None:
if re.search("\d{2}(-|)\d{7}", str(ein)) == None: if re.search("\d{2}(-|)\d{7}", str(ein)) == None:
raise Exception(f"Invalid EIN: {ein}") raise Exception(f"Invalid EIN: {ein}")
@ -26,14 +34,29 @@ class EINData:
def __str__(self) -> str: def __str__(self) -> str:
"""
Used to print the object
"""
return f"""EIN: {self.ein}\t | Name: {self.buinessName}\t\t| Address: {self.address1}\t\t| City: {self.city}\t| State: {self.state}\t| Phone: {self.phone}""" return f"""EIN: {self.ein}\t | Name: {self.buinessName}\t\t| Address: {self.address1}\t\t| City: {self.city}\t| State: {self.state}\t| Phone: {self.phone}"""
def get_ein(self) -> str: def get_ein(self) -> str:
"""
Returns the associated EIN in the format: XX-XXXXXXX
"""
return f"{self.ein[0:2]}-{self.ein[2:]}" return f"{self.ein[0:2]}-{self.ein[2:]}"
def compare(self, otherEIN: 'EINData') -> dict: def compare(self, otherEIN: 'EINData') -> dict:
"""
Compares the EIN object with another.
Returns a match dictionary containing True/False for whether each data member matched
and an overall score
Returns None if comparison fails
"""
try: try:
compareDict = { compareDict = {
# Ternary operator used to concisely assign values
# If they match then true, else false
"buinessName" : True if self.buinessName == otherEIN.buinessName else False, "buinessName" : True if self.buinessName == otherEIN.buinessName else False,
"address" : True if self.address1 == otherEIN.address1 else False, "address" : True if self.address1 == otherEIN.address1 else False,
"city": True if self.city == otherEIN.city else False, "city": True if self.city == otherEIN.city else False,
@ -42,15 +65,70 @@ class EINData:
} }
except Exception as e: except Exception as e:
print(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""") print(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""")
# If we cannot successfully compare the data, return None
return None return None
score = 0 score = 0
for v in compareDict.values(): for v in compareDict.values():
# increase score by one for every value that is true
score += 1 if v else 0 score += 1 if v else 0
compareDict["score"] = score compareDict["score"] = score
return compareDict return compareDict
def dataframe_to_eins(df: pd.DataFrame,
                      einLabel: str = "Lessee Tax-ID",
                      nameLabel: str = "NAME",
                      addressLabel: str = "ADDRESS",
                      cityLabel: str = "CITY",
                      stateLabel: str = "STATE",
                      zipLabel: str = "ZIP",
                      phoneLabel: str = "PHONE"
                      ) -> list[EINData]:
    """
    Converts a dataframe into a list of EINData objects.

    Each ``*Label`` argument names the dataframe column that holds the
    corresponding EINData constructor argument. The EIN, zip and phone
    values are converted to ``str`` before being passed on; the remaining
    values are passed through as found in the row.

    Returns the list of EINData objects that could be built. If any of the
    required columns is missing, a message is printed and an empty list is
    returned. Rows for which EINData construction raises (for example an
    invalid EIN) are reported with ``print`` and skipped.
    """
    # Confirm all required columns exist. An explicit check is used instead
    # of `assert` so the validation still runs under `python -O`, and so that
    # every missing label is reported at once rather than only the first.
    columns = df.columns
    required = (einLabel, nameLabel, addressLabel, cityLabel,
                stateLabel, zipLabel, phoneLabel)
    missing = [label for label in required if label not in columns]
    if missing:
        print(f"NOT ALL REQUIRED COLUMNS PRESENT!\n"
              f"EIN label not present: {missing} | {columns}\n{df}")
        # No row can be converted without every column, so stop here.
        return []

    eins = []
    for _, data in df.iterrows():
        # _ is the row index, which is not used.
        # data is a pandas Series; the column labels pull values out of it.
        try:
            eins.append(EINData(
                str(data[einLabel]),
                data[nameLabel],
                data[addressLabel],
                data[cityLabel],
                data[stateLabel],
                str(data[zipLabel]),
                str(data[phoneLabel])
            ))
        except Exception as e:
            # Best effort: report the failing row's EIN and the reason,
            # then move on to the next row in the dataframe.
            print(f"Could not add {data[einLabel]}: {e}")
    return eins
class __EINService(ABC):
"""
This is an abstract base class used to define the interface for services that
can be used to search EINS.
DO NOT INSTANTIATE THIS CLASS
"""
@classmethod @classmethod
@abstractmethod @abstractmethod
def search_ein(self, ein: str) -> EINData: def search_ein(self, ein: str) -> EINData:
@ -60,9 +138,11 @@ class EINService(ABC):
@classmethod @classmethod
@abstractmethod @abstractmethod
def search_eins(self, eins: list[EINData]): def search_eins(self, eins: list[str]) -> list[EINData]:
""" """
Takes a list of EINS and returns a list of EINData objects.
!! Some items in the list may return None
""" """
@classmethod @classmethod
@ -70,22 +150,33 @@ class EINService(ABC):
return re.search("\d{2}(-|)\d{7}", ein) != None return re.search("\d{2}(-|)\d{7}", ein) != None
class EINTaxIDService(EINService): class EINTaxIDService(__EINService):
"""
Concrete class of __EINService using eintaxid.com.
There seems to be no rate limiting on this service at the moment.
It's not actually meant to act as an API. This is a JQuery/php service used to
hydrate their webapp, but we can call it directly
"""
_url = "https://eintaxid.com" _url = "https://eintaxid.com"
def search_ein(self, ein: str) -> EINData: def search_ein(self, ein: str) -> EINData:
try: try:
# We don't even want to attempt this unless we're using a valid EIN
if not self._isEIN(ein): if not self._isEIN(ein):
print(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX") print(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX")
raise Exception("Invalid EIN") raise Exception("Invalid EIN")
except Exception as e: except Exception as e:
print(e) print(e)
return None return None
# Send a POST HTTP request to the site using the search-ajax script
# query just needs to include EIN, XML header required for parsing
req = rq.request("POST",self._url + "/search-ajax.php", data={"query": ein}, \ req = rq.request("POST",self._url + "/search-ajax.php", data={"query": ein}, \
headers={'X-Requested-With': 'XMLHttpRequest'}) headers={'X-Requested-With': 'XMLHttpRequest'})
# Use BeautifulSoup to parse the HTML content
soup = bsp(req.content, "html.parser") soup = bsp(req.content, "html.parser")
try: try:
# The actual data return is always on line 4. The rest are DIV set up
text = soup.text.splitlines()[4] text = soup.text.splitlines()[4]
except: except:
print(f"Failed: {ein} | {soup}") print(f"Failed: {ein} | {soup}")
@ -107,6 +198,15 @@ class EINTaxIDService(EINService):
def _parse_return(self, content: str) -> dict: def _parse_return(self, content: str) -> dict:
"""
Use regex to parse the return of a call to the sites
search script
Expects a string created by BS4 using the HTML parse on request content
Specifically line 4 of this site's return
Data that cannot be found will be returned as None
"""
m = re.search("EIN Number:", content) m = re.search("EIN Number:", content)
company = content[0:m.start()].strip() company = content[0:m.start()].strip()
@ -116,6 +216,7 @@ class EINTaxIDService(EINService):
m = re.search("Address:.*Phone:", content) m = re.search("Address:.*Phone:", content)
address = content[m.start()+8:m.end()-6].strip() if m != None else None address = content[m.start()+8:m.end()-6].strip() if m != None else None
if address != None: if address != None:
# We need to split address into pieces, which are conveniently separated by commas
addressPieces = address.split(',') addressPieces = address.split(',')
address1 = addressPieces[0].strip() address1 = addressPieces[0].strip()
city = addressPieces[1].strip() city = addressPieces[1].strip()

Loading…
Cancel
Save