"""Normalise EIN business records and look them up through eintaxid.com."""

import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime as dt
from pprint import pprint as prt
from typing import Optional

import pandas as pd
import requests as rq
from bs4 import BeautifulSoup as bsp

SAMPLE_EIN = "59-1571026"

# The only accepted EIN spellings are "XX-XXXXXXX" and "XXXXXXXXX".
# Always used with fullmatch() so surrounding junk is rejected.
_EIN_PATTERN = re.compile(r"\d{2}-?\d{7}")

# Deletes the punctuation that EINData removes from phone numbers.
_PHONE_PUNCTUATION = str.maketrans("", "", "-()+")


@dataclass
class EINData:
    """
    Represents a basic set of data related to an EIN:
        - Business name (attribute is spelled ``buinessName``)
        - Address1: the 2 lines of an address (street and apt)
        - City
        - State
        - Zip: can handle '-'s, stored as an int
        - Phone number: (), -, and + will be removed

    Text fields are lower-cased and stripped so that two records can be
    compared directly.  The fields are declared so that the dataclass
    generated ``__eq__`` and ``__repr__`` use the real data; the hand-written
    ``__init__`` below is kept because it normalises its arguments.
    """

    ein: str
    buinessName: str
    address1: str
    city: str
    state: str
    zip: int
    phone: str

    def __init__(self, ein: str, buinessName: str, address1: str, city: str,
                 state: str, zip: str, phone: str) -> None:
        """
        Validate and normalise the raw values.

        Raises:
            ValueError: if ``ein`` is not XX-XXXXXXX / XXXXXXXXX, or if
                ``zip`` is not numeric once '-' is removed.
            AttributeError: if a text field is not a string.
        """
        ein_text = str(ein).strip()
        if _EIN_PATTERN.fullmatch(ein_text) is None:
            raise ValueError(f"Invalid EIN: {ein}")

        # The EIN is stored without the dash; get_ein() re-inserts it.
        self.ein = ein_text.replace('-', '')
        self.buinessName = buinessName.lower().strip()
        self.address1 = address1.lower().strip()
        self.city = city.lower().strip()
        self.state = state.lower().strip()
        self.phone = phone.strip().translate(_PHONE_PUNCTUATION)
        # str() lets an already-normalised int zip be passed back in.
        self.zip = int(str(zip).replace('-', '').strip())

    def __str__(self) -> str:
        """ Used to print the object """
        return (
            f"EIN: {self.ein}\t | Name: {self.buinessName}\t\t"
            f"| Address: {self.address1}\t\t| City: {self.city}\t"
            f"| State: {self.state}\t| Phone: {self.phone}"
        )

    def get_ein(self) -> str:
        """ Returns the associated EIN in the format: XX-XXXXXXX """
        return f"{self.ein[0:2]}-{self.ein[2:]}"

    def compare(self, otherEIN: 'EINData') -> Optional[dict]:
        """
        Compares the EIN object with another.

        Returns a match dictionary containing True/False for whether each
        of buinessName, address, city, state and zip matched, plus an
        overall "score" (the number of fields that matched).

        Returns None if the comparison fails, i.e. ``otherEIN`` is missing
        one of the compared attributes.
        """
        try:
            compareDict = {
                "buinessName": self.buinessName == otherEIN.buinessName,
                "address": self.address1 == otherEIN.address1,
                "city": self.city == otherEIN.city,
                "state": self.state == otherEIN.state,
                "zip": self.zip == otherEIN.zip,
            }
        except AttributeError as e:
            print(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""")
            # If we cannot successfully compare the data return None
            return None

        # Every True counts as one point.
        compareDict["score"] = sum(compareDict.values())
        return compareDict


def dataframe_to_eins(df: pd.DataFrame,
                      einLabel: str = "Lessee Tax-ID",
                      nameLabel: str = "NAME",
                      addressLabel: str = "ADDRESS",
                      cityLabel: str = "CITY",
                      stateLabel: str = "STATE",
                      zipLabel: str = "ZIP",
                      phoneLabel: str = "PHONE"
                      ) -> list[EINData]:
    """
    Converts a dataframe into a list of EINData objects.

    Each ``*Label`` argument names the dataframe column holding that piece
    of data.  If any of those columns is absent a message is printed and an
    empty list is returned.  Rows that cannot be turned into an EINData
    (invalid EIN, non-numeric zip, non-string text field) are reported and
    skipped.
    """
    required = (einLabel, nameLabel, addressLabel, cityLabel,
                stateLabel, zipLabel, phoneLabel)
    missing = [label for label in required if label not in df.columns]
    if missing:
        # Without the columns no row could be converted, so stop here.
        print(f"NOT ALL REQUIRED COLUMNS PRESENT!\n"
              f"Missing: {missing} | {list(df.columns)}")
        return []

    eins = []
    for _, row in df.iterrows():
        # _ is the row index which is not used; row is a pandas Series
        # and the column labels are used to pull the data out of it.
        try:
            eins.append(EINData(
                str(row[einLabel]),
                row[nameLabel],
                row[addressLabel],
                row[cityLabel],
                row[stateLabel],
                str(row[zipLabel]),
                str(row[phoneLabel]),
            ))
        except (AttributeError, TypeError, ValueError) as e:
            # Report the offending row and move on to the next one.
            print(f"Could not add {row[einLabel]}! ({e})")
    return eins


class __EINService(ABC):
    """
    This is an abstract base class used to define the interface
    for services that can be used to search EINs.

    DO NOT INSTANTIATE THIS CLASS
    """

    @abstractmethod
    def search_ein(self, ein: str) -> Optional[EINData]:
        """
        Takes an EIN and returns information from a search using the service
        """

    @abstractmethod
    def search_eins(self, eins: list[str]) -> list[Optional[EINData]]:
        """
        Takes a list of EINs and returns a list of EINData objects.
        !! Some items in the list may be None
        """

    @classmethod
    def _isEIN(cls, ein: str) -> bool:
        """ True when ``ein`` is a string of the form XX-XXXXXXX or XXXXXXXXX """
        return (isinstance(ein, str)
                and _EIN_PATTERN.fullmatch(ein.strip()) is not None)


class EINTaxIDService(__EINService):
    """
    Concrete class of __EINService using eintaxid.com.

    The endpoint used is the site's own search-ajax script rather than a
    published API; the original author observed no rate limiting on it.
    """
    _url = "https://eintaxid.com"
    # Seconds to wait for the site before giving up on a lookup.
    _timeout = 30

    def search_ein(self, ein: str) -> Optional[EINData]:
        """
        Look up a single EIN.

        Returns an EINData built from the site's answer, or None (after
        printing the reason) when the EIN is malformed, the request fails,
        or the answer cannot be parsed into a complete record.
        """
        # We don't even want to attempt this unless we're using a valid EIN
        if not self._isEIN(ein):
            print(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX")
            return None
        ein = ein.strip()

        # Send a POST HTTP request to the site using the search-ajax script.
        # The query just needs to include the EIN; the X-Requested-With
        # header is sent as well, as the original code notes it is required.
        try:
            response = rq.post(
                self._url + "/search-ajax.php",
                data={"query": ein},
                headers={'X-Requested-With': 'XMLHttpRequest'},
                timeout=self._timeout,
            )
        except rq.RequestException as e:
            print(f"Failed: {ein} | {e}")
            return None

        # Use BeautifulSoup to parse the HTML content
        soup = bsp(response.content, "html.parser")
        lines = soup.text.splitlines()
        # The code expects the data on the fifth text line (index 4).
        if len(lines) <= 4:
            print(f"Failed: {ein} | {soup}")
            return None

        data = self._parse_return(lines[4])
        try:
            return EINData(
                ein,
                data["company"],
                data["address1"],
                data["city"],
                data["state"],
                data["zip"],
                data["phone"],
            )
        except (AttributeError, TypeError, ValueError) as e:
            # A field the parser could not find is None and cannot be
            # normalised, so no record can be built from this answer.
            print(f"Failed: {ein} | {e}")
            return None

    def search_eins(self, eins: list[str]) -> list[Optional[EINData]]:
        """ Look up every EIN in ``eins``; failed lookups appear as None """
        return [self.search_ein(ein) for ein in eins]

    def _parse_return(self, content: str) -> dict:
        """
        Use regex to parse the return of a call to the site's search script.

        Expects the single text line that search_ein extracts from the
        response.  The line is read as
        ``<company> EIN Number: ... Doing Business As: <dba> Address:
        <street>, <city>, <state> <zip> Phone: <phone>``.

        Returns a dict with the keys company, dba, address, address1, city,
        state, zip and phone.  Data that cannot be found is returned as None.
        """
        # The company name is everything before the "EIN Number:" label.
        marker = content.find("EIN Number:")
        company = content[:marker].strip() if marker != -1 else None

        dba_match = re.search(r"Doing Business As:(.*)Address:", content)
        dba = dba_match.group(1).strip() if dba_match is not None else None

        address = address1 = city = state = zip_code = phone = None
        address_match = re.search(r"Address:(.*)Phone:", content)
        if address_match is not None:
            address = address_match.group(1).strip()
            # The phone number is whatever follows the "Phone:" label.
            phone = content[address_match.end():].strip()

            # The address pieces are conveniently separated by ','
            pieces = [piece.strip() for piece in address.split(',')]
            if len(pieces) >= 3:
                address1, city = pieces[0], pieces[1]
                # Third piece is "<state> <zip>"
                state_and_zip = pieces[2].split()
                if state_and_zip:
                    state = state_and_zip[0]
                if len(state_and_zip) > 1:
                    zip_code = state_and_zip[1]

        return {
            "company": company,
            "dba": dba,
            "address": address,
            "address1": address1,
            "city": city,
            "state": state,
            "zip": zip_code,
            "phone": phone,
        }