From 2b6c71598b62c3afa8020c67b67d7c5fc5a930cb Mon Sep 17 00:00:00 2001 From: Griffiths Lott Date: Fri, 16 Dec 2022 18:46:14 -0500 Subject: [PATCH] Added more documentation --- EINService.py | 113 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 107 insertions(+), 6 deletions(-) diff --git a/EINService.py b/EINService.py index 6c8a8c7..951ca43 100644 --- a/EINService.py +++ b/EINService.py @@ -12,7 +12,15 @@ SAMPLE_EIN = "59-1571026" @dataclass class EINData: - + """ + Represents a basic set of data related to an EIN: + - Business Name + - Address1: the 2 lines of an address (street and apt) + - City + - State + - Zip: can handle '-'s + - phone number: (), -, and + will be removed + """ def __init__(self, ein: str, buinessName: str, address1: str, city:str, state:str, zip: str, phone: str) -> None: if re.search("\d{2}(-|)\d{7}", str(ein)) == None: raise Exception(f"Invalid EIN: {ein}") @@ -26,14 +34,29 @@ class EINData: def __str__(self) -> str: + """ + Used to print the object + """ return f"""EIN: {self.ein}\t | Name: {self.buinessName}\t\t| Address: {self.address1}\t\t| City: {self.city}\t| State: {self.state}\t| Phone: {self.phone}""" def get_ein(self) -> str: + """ + Returns the associated EIN in the format: XX-XXXXXXX + """ return f"{self.ein[0:2]}-{self.ein[2:]}" def compare(self, otherEIN: 'EINData') -> dict: + """ + Compares the EIN object with another. 
+ Returns a match dictionary containing True/False for whether each data member matched + and an overall score + + Returns None if comparison fails + """ try: compareDict = { + # Ternary operator used to concisely assign values + # If they match then true, else false "buinessName" : True if self.buinessName == otherEIN.buinessName else False, "address" : True if self.address1 == otherEIN.address1 else False, "city": True if self.city == otherEIN.city else False, @@ -42,15 +65,70 @@ class EINData: } except Exception as e: print(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""") + # If we cannot successfully compare the data return None return None score = 0 for v in compareDict.values(): + # increase score by one for every value that is true score += 1 if v else 0 compareDict["score"] = score return compareDict -class EINService(ABC): +def dataframe_to_eins(df: pd.DataFrame, + einLabel: str = "Lessee Tax-ID", + nameLabel: str = "NAME", + addressLabel: str = "ADDRESS", + cityLabel: str = "CITY", + stateLabel: str = "STATE", + zipLabel: str = "ZIP", + phoneLabel: str = "PHONE" + ) -> list[EINData]: + """ + Converts a dataframe into a list of EINData objects. + + Requires that the dataframe contains certain data labels. 
+ """ + # Confirm all correct columns exist + try: + columns = df.columns + assert (einLabel in columns), f"EIN label not present: {einLabel} | {columns}" + assert (nameLabel in columns), f"EIN label not present: {nameLabel} | {columns}" + assert (addressLabel in columns), f"EIN label not present: {addressLabel} | {columns}" + assert (cityLabel in columns), f"EIN label not present: {cityLabel} | {columns}" + assert (stateLabel in columns), f"EIN label not present: {stateLabel} | {columns}" + assert (zipLabel in columns), f"EIN label not present: {zipLabel} | {columns}" + assert (phoneLabel in columns), f"EIN label not present: {phoneLabel} | {columns}" + except Exception as e: + print(f"NOT ALL REQUIRED COLUMNS PRESENT!\n{e}\n{df}") + + eins = [] + for _, data in df.iterrows(): + # _ is the row index which is not used + # The rest is a pandas Series + # The column labels are used to pull the data from the series + try: + eins.append(EINData( + str(data[einLabel]), + data[nameLabel], + data[addressLabel], + data[cityLabel], + data[stateLabel], + str(data[zipLabel]), + str(data[phoneLabel]) + )) + except Exception as e: + # If we fail, we report the error and move on to the next item in dataframe + print(f"Could not add {einLabel}!") + continue + +class __EINService(ABC): + """ + This is an abstract base class used to define the interface for services that + can be used to search EINS. + + DO NOT INSTANTIATE THIS CLASS + """ @classmethod @abstractmethod def search_ein(self, ein: str) -> EINData: @@ -60,9 +138,11 @@ class EINService(ABC): @classmethod @abstractmethod - def search_eins(self, eins: list[EINData]): + def search_eins(self, eins: list[str]) -> list[EINData]: """ - + Takes a list of EINS and returns a list of EINData objects. + + !! 
Some items in the list may return None """ @classmethod @@ -70,22 +150,33 @@ class EINService(ABC): return re.search("\d{2}(-|)\d{7}", ein) != None -class EINTaxIDService(EINService): +class EINTaxIDService(__EINService): + """ + Concrete class of __EINService using eintaxid.com. + + There seems to be no rate limiting on this service at the moment. + It's not actually meant to act as an API. This is a JQuery/php service used to + hydrate their webapp, but we can call it directly + """ _url = "https://eintaxid.com" def search_ein(self, ein: str) -> EINData: try: + # We don't even want to attempt this unless we're using a valid EIN if not self._isEIN(ein): print(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX") raise Exception("Invalid EIN") except Exception as e: print(e) return None - + # Send a POST HTTP request to the site using the search-ajax script + # query just needs to include EIN, XML header required for parsing req = rq.request("POST",self._url + "/search-ajax.php", data={"query": ein}, \ headers={'X-Requested-With': 'XMLHttpRequest'}) + # Use BeautifulSoup to parse the HTML content soup = bsp(req.content, "html.parser") try: + # The actual data return is always on line 4. 
The rest are DIV set up text = soup.text.splitlines()[4] except: print(f"Failed: {ein} | {soup}") @@ -107,6 +198,15 @@ class EINTaxIDService(EINService): def _parse_return(self, content: str) -> dict: + """ + Use regex to parse the return of a call to the site's + search script + + Expects a string created by BS4 using the HTML parser on request content + Specifically line 4 of this site's return + + Data that cannot be found will be returned as None + """ m = re.search("EIN Number:", content) company = content[0:m.start()].strip() @@ -116,6 +216,7 @@ class EINTaxIDService(EINService): m = re.search("Address:.*Phone:", content) address = content[m.start()+8:m.end()-6].strip() if m != None else None if address != None: + # We need to split address into pieces, which are conveniently separated by , addressPieces = address.split(',') address1 = addressPieces[0].strip() city = addressPieces[1].strip()