|
|
|
|
@ -3,24 +3,16 @@ from bs4 import BeautifulSoup as bsp |
|
|
|
|
import re |
|
|
|
|
from abc import ABC, abstractmethod |
|
|
|
|
from dataclasses import dataclass |
|
|
|
|
from logging import warning, debug, error |
|
|
|
|
from pandas import DataFrame |
|
|
|
|
from datetime import datetime as dt |
|
|
|
|
from pprint import pprint as prt |
|
|
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
SAMPLE_EIN = "59-1571026" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass |
|
|
|
|
class EINData: |
|
|
|
|
""" |
|
|
|
|
Represents a basic set of data related to an EIN: |
|
|
|
|
- Business Name |
|
|
|
|
- Address1: the 2 lines of an address (street and apt) |
|
|
|
|
- City |
|
|
|
|
- State |
|
|
|
|
- Zip: can handle '-'s |
|
|
|
|
- phone number: (), -, and + will be removed |
|
|
|
|
""" |
|
|
|
|
|
|
|
|
|
def __init__(self, ein: str, buinessName: str, address1: str, city:str, state:str, zip: str, phone: str) -> None: |
|
|
|
|
if re.search("\d{2}(-|)\d{7}", str(ein)) == None: |
|
|
|
|
raise Exception(f"Invalid EIN: {ein}") |
|
|
|
|
@ -34,29 +26,14 @@ class EINData: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def __str__(self) -> str: |
|
|
|
|
""" |
|
|
|
|
Used to print the object |
|
|
|
|
""" |
|
|
|
|
return f"""EIN: {self.ein}\t | Name: {self.buinessName}\t\t| Address: {self.address1}\t\t| City: {self.city}\t| State: {self.state}\t| Phone: {self.phone}""" |
|
|
|
|
|
|
|
|
|
def get_ein(self) -> str:
    """
    Return the associated EIN in the format: XX-XXXXXXX

    The stored value is accepted with or without a hyphen. Any hyphen is
    dropped before one is re-inserted after the second digit, so an EIN
    stored as "XX-XXXXXXX" does not come back as "XX--XXXXXXX".
    """
    # NOTE(review): the constructor's validation accepts both hyphenated and
    # bare EINs; whether it strips the hyphen before storing is not visible
    # here. Stripping again is a no-op if it already does.
    digits = self.ein.replace("-", "")
    return f"{digits[0:2]}-{digits[2:]}"
|
|
|
|
|
|
|
|
|
def compare(self, otherEIN: 'EINData') -> dict: |
|
|
|
|
""" |
|
|
|
|
Compares the EIN object with another. |
|
|
|
|
Returns a match dictionary containing True/False for whether each data member matched |
|
|
|
|
and an over all score |
|
|
|
|
|
|
|
|
|
Returns None if comparison fails |
|
|
|
|
""" |
|
|
|
|
try: |
|
|
|
|
compareDict = { |
|
|
|
|
# Ternary operator used to concisely assign values |
|
|
|
|
# If they match then true, else false |
|
|
|
|
"buinessName" : True if self.buinessName == otherEIN.buinessName else False, |
|
|
|
|
"address" : True if self.address1 == otherEIN.address1 else False, |
|
|
|
|
"city": True if self.city == otherEIN.city else False, |
|
|
|
|
@ -64,74 +41,16 @@ class EINData: |
|
|
|
|
"zip" : True if self.zip == otherEIN.zip else False |
|
|
|
|
} |
|
|
|
|
except Exception as e: |
|
|
|
|
error(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""") |
|
|
|
|
# If we cannot successfully compare the data, return None |
|
|
|
|
print(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""") |
|
|
|
|
return None |
|
|
|
|
debug(compareDict) |
|
|
|
|
score = 0 |
|
|
|
|
for v in compareDict.values(): |
|
|
|
|
# increase score by one for every value that is true |
|
|
|
|
score += 1 if v else 0 |
|
|
|
|
compareDict["score"] = score |
|
|
|
|
return compareDict |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def dataframe_to_eins(df: DataFrame,
                      einLabel: str = "Lessee Tax-ID",
                      nameLabel: str = "NAME",
                      addressLabel: str = "ADDRESS",
                      cityLabel: str = "CITY",
                      stateLabel: str = "STATE",
                      zipLabel: str = "ZIP",
                      phoneLabel: str = "PHONE"
                      ) -> list[EINData]:
    """
    Convert a dataframe into a list of EINData objects.

    Each row of ``df`` is turned into one EINData. The ``*Label`` arguments
    name the columns that hold the EIN, business name, address, city, state,
    zip and phone values; all seven columns must be present.

    Returns the list of EINData objects that could be built. If any required
    column is missing, the problem is logged with ``error`` and an empty list
    is returned. Rows that fail to convert are logged with ``warning`` and
    skipped.
    """
    debug(df)

    # Confirm all required columns exist. This is an explicit check rather
    # than `assert`, because asserts are stripped when Python runs with -O.
    columns = df.columns
    required = (einLabel, nameLabel, addressLabel, cityLabel,
                stateLabel, zipLabel, phoneLabel)
    missing = [label for label in required if label not in columns]
    if missing:
        error(f"NOT ALL REQUIRED COLUMNS PRESENT!\nLabels not present: {missing} | {columns}\n{df}")
        # Every row reads all seven columns, so no row could be converted.
        return []

    eins = []
    # The row index from iterrows() is not used; `data` is the row as a
    # pandas Series, read through the column labels.
    for _, data in df.iterrows():
        debug(data)
        try:
            eins.append(EINData(
                str(data[einLabel]),
                data[nameLabel],
                data[addressLabel],
                data[cityLabel],
                data[stateLabel],
                str(data[zipLabel]),
                str(data[phoneLabel])
            ))
        except Exception as e:
            # EINData raises a plain Exception for an invalid EIN, so the
            # broad catch is needed. Report the offending value and the
            # reason, then move on to the next row.
            warning(f"Could not add {data[einLabel]}! {e}")

    return eins
|
|
|
|
|
|
|
|
|
class __EINService(ABC): |
|
|
|
|
""" |
|
|
|
|
This is an abstract base class used to define the interface for services that |
|
|
|
|
can be used to search EINS. |
|
|
|
|
|
|
|
|
|
DO NOT INSTANTIATE THIS CLASS |
|
|
|
|
""" |
|
|
|
|
class EINService(ABC): |
|
|
|
|
@classmethod |
|
|
|
|
@abstractmethod |
|
|
|
|
def search_ein(self, ein: str) -> EINData: |
|
|
|
|
@ -141,11 +60,9 @@ class __EINService(ABC): |
|
|
|
|
|
|
|
|
|
@classmethod |
|
|
|
|
@abstractmethod |
|
|
|
|
def search_eins(self, eins: list[str]) -> list[EINData]: |
|
|
|
|
def search_eins(self, eins: list[EINData]): |
|
|
|
|
""" |
|
|
|
|
Takes a list of EINS and returns a list of EINData objects. |
|
|
|
|
|
|
|
|
|
!! Some items in the list may return None |
|
|
|
|
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
|
@classmethod |
|
|
|
|
@ -153,40 +70,26 @@ class __EINService(ABC): |
|
|
|
|
return re.search("\d{2}(-|)\d{7}", ein) != None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EINTaxIDService(__EINService): |
|
|
|
|
""" |
|
|
|
|
Concrete class of __EINService using eintaxid.com. |
|
|
|
|
|
|
|
|
|
There seems to be no rate limiting on this service at the moment. |
|
|
|
|
It's not actually meant to act as an API. This is a jQuery/PHP service used to |
|
|
|
|
hydrate their webapp, but we can call it directly |
|
|
|
|
""" |
|
|
|
|
class EINTaxIDService(EINService): |
|
|
|
|
_url = "https://eintaxid.com" |
|
|
|
|
|
|
|
|
|
def search_ein(self, ein: str) -> EINData: |
|
|
|
|
debug(ein) |
|
|
|
|
try: |
|
|
|
|
# We don't even want to attempt this unless we're using a valid EIN |
|
|
|
|
if not self._isEIN(ein): |
|
|
|
|
warning(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX") |
|
|
|
|
print(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX") |
|
|
|
|
raise Exception("Invalid EIN") |
|
|
|
|
except Exception as e: |
|
|
|
|
warning(f"{e} | {ein}") |
|
|
|
|
print(e) |
|
|
|
|
return None |
|
|
|
|
# Send a POST HTTP request to the site using the search-ajax script |
|
|
|
|
# query just needs to include EIN, XML header required for parsing |
|
|
|
|
|
|
|
|
|
req = rq.request("POST",self._url + "/search-ajax.php", data={"query": ein}, \ |
|
|
|
|
headers={'X-Requested-With': 'XMLHttpRequest'}) |
|
|
|
|
debug(req) |
|
|
|
|
# Use BeautifulSoup to parse the HTML content |
|
|
|
|
soup = bsp(req.content, "html.parser") |
|
|
|
|
debug(soup) |
|
|
|
|
try: |
|
|
|
|
# The actual data return is always on line 4. The rest are DIV set up |
|
|
|
|
text = soup.text.splitlines()[4] |
|
|
|
|
except: |
|
|
|
|
warning(f"Failed: {ein} | {soup}") |
|
|
|
|
return EINData(ein, None,None,None,None,None,None) |
|
|
|
|
print(f"Failed: {ein} | {soup}") |
|
|
|
|
return None |
|
|
|
|
data = self._parse_return(text) |
|
|
|
|
|
|
|
|
|
return EINData( |
|
|
|
|
@ -204,16 +107,6 @@ class EINTaxIDService(__EINService): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_return(self, content: str) -> dict: |
|
|
|
|
""" |
|
|
|
|
Use regex to parse the return of a call to the sites |
|
|
|
|
search script |
|
|
|
|
|
|
|
|
|
Expects a string created by BS4 using the HTML parse on request content |
|
|
|
|
Specifically line 4 of this sites return |
|
|
|
|
|
|
|
|
|
Data that cannot be found will be returned as None |
|
|
|
|
""" |
|
|
|
|
debug(f"EIN Service returned content:\n{content}") |
|
|
|
|
m = re.search("EIN Number:", content) |
|
|
|
|
company = content[0:m.start()].strip() |
|
|
|
|
|
|
|
|
|
@ -223,7 +116,6 @@ class EINTaxIDService(__EINService): |
|
|
|
|
m = re.search("Address:.*Phone:", content) |
|
|
|
|
address = content[m.start()+8:m.end()-6].strip() if m != None else None |
|
|
|
|
if address != None: |
|
|
|
|
# We need to split address into pieces, which are conveniently separated by ',' |
|
|
|
|
addressPieces = address.split(',') |
|
|
|
|
address1 = addressPieces[0].strip() |
|
|
|
|
city = addressPieces[1].strip() |
|
|
|
|
|