Compare commits

...

2 Commits

  1. 134
      EINService.py
  2. 14
      README.md
  3. 15
      basic_example.py
  4. 36
      example_from_excel.py
  5. 31
      test.py

@ -3,16 +3,24 @@ from bs4 import BeautifulSoup as bsp
import re import re
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime as dt from logging import warning, debug, error
from pprint import pprint as prt from pandas import DataFrame
import pandas as pd
SAMPLE_EIN = "59-1571026" SAMPLE_EIN = "59-1571026"
@dataclass @dataclass
class EINData: class EINData:
"""
Represents a basic set of data related to an EIN:
- Business Name
- Address1: the 2 lines of an address (street and apt)
- City
- State
- Zip: can handle '-'s
- phone number: (), -, and + will be removed
"""
def __init__(self, ein: str, buinessName: str, address1: str, city:str, state:str, zip: str, phone: str) -> None: def __init__(self, ein: str, buinessName: str, address1: str, city:str, state:str, zip: str, phone: str) -> None:
if re.search("\d{2}(-|)\d{7}", str(ein)) == None: if re.search("\d{2}(-|)\d{7}", str(ein)) == None:
raise Exception(f"Invalid EIN: {ein}") raise Exception(f"Invalid EIN: {ein}")
@ -26,14 +34,29 @@ class EINData:
def __str__(self) -> str: def __str__(self) -> str:
"""
Used to print the object
"""
return f"""EIN: {self.ein}\t | Name: {self.buinessName}\t\t| Address: {self.address1}\t\t| City: {self.city}\t| State: {self.state}\t| Phone: {self.phone}""" return f"""EIN: {self.ein}\t | Name: {self.buinessName}\t\t| Address: {self.address1}\t\t| City: {self.city}\t| State: {self.state}\t| Phone: {self.phone}"""
def get_ein(self) -> str: def get_ein(self) -> str:
"""
Returns the associated EIN in the format: XX-XXXXXXX
"""
return f"{self.ein[0:2]}-{self.ein[2:]}" return f"{self.ein[0:2]}-{self.ein[2:]}"
def compare(self, otherEIN: 'EINData') -> dict: def compare(self, otherEIN: 'EINData') -> dict:
"""
Compares the EIN object with another.
Returns a match dictionary containing True/False for whether each data member matched
and an over all score
Returns None if the comparison fails
"""
try: try:
compareDict = { compareDict = {
# Ternary operator used to concisely assign values
# If they match then true, else false
"buinessName" : True if self.buinessName == otherEIN.buinessName else False, "buinessName" : True if self.buinessName == otherEIN.buinessName else False,
"address" : True if self.address1 == otherEIN.address1 else False, "address" : True if self.address1 == otherEIN.address1 else False,
"city": True if self.city == otherEIN.city else False, "city": True if self.city == otherEIN.city else False,
@ -41,16 +64,74 @@ class EINData:
"zip" : True if self.zip == otherEIN.zip else False "zip" : True if self.zip == otherEIN.zip else False
} }
except Exception as e: except Exception as e:
print(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""") error(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""")
# If we cannot successfully compare the data, return None
return None return None
debug(compareDict)
score = 0 score = 0
for v in compareDict.values(): for v in compareDict.values():
# increase score by one for every value that is true
score += 1 if v else 0 score += 1 if v else 0
compareDict["score"] = score compareDict["score"] = score
return compareDict return compareDict
def dataframe_to_eins(df: DataFrame,
        einLabel: str = "Lessee Tax-ID",
        nameLabel: str = "NAME",
        addressLabel: str = "ADDRESS",
        cityLabel: str = "CITY",
        stateLabel: str = "STATE",
        zipLabel: str = "ZIP",
        phoneLabel: str = "PHONE"
        ) -> 'list[EINData]':
    """
    Converts a dataframe into a list of EINData objects.

    Each ``*Label`` parameter names the dataframe column that holds the
    corresponding EINData field.

    Returns the list of EINData objects built from the rows.
    - If any required column is missing, an error is logged and an empty
      list is returned.
    - Rows that cannot be converted (for example an invalid EIN) are logged
      as warnings and skipped, so the result may be shorter than the dataframe.
    """
    debug(df)
    # Confirm all required columns exist before touching any rows.
    # An explicit check is used because `assert` is stripped under `python -O`.
    requiredLabels = (einLabel, nameLabel, addressLabel, cityLabel,
                      stateLabel, zipLabel, phoneLabel)
    missingLabels = [label for label in requiredLabels if label not in df.columns]
    if missingLabels:
        error(f"NOT ALL REQUIRED COLUMNS PRESENT!\n"
              f"Missing labels: {missingLabels} | {list(df.columns)}\n{df}")
        return []

    eins = []
    for _, data in df.iterrows():
        # _ is the row index, which is not used.
        # data is a pandas Series; the column labels pull the values from it.
        debug(data)
        try:
            eins.append(EINData(
                str(data[einLabel]),
                data[nameLabel],
                data[addressLabel],
                data[cityLabel],
                data[stateLabel],
                str(data[zipLabel]),
                str(data[phoneLabel])
            ))
        except Exception as e:
            # EINData raises a plain Exception on invalid input, so the broad
            # catch is deliberate: report the offending EIN and move on.
            warning(f"Could not add {data[einLabel]}! {e}")
    return eins
class __EINService(ABC):
"""
This is an abstract base class used to define the interface for services that
can be used to search EINS.
DO NOT INSTANTIATE THIS CLASS
"""
@classmethod @classmethod
@abstractmethod @abstractmethod
def search_ein(self, ein: str) -> EINData: def search_ein(self, ein: str) -> EINData:
@ -60,9 +141,11 @@ class EINService(ABC):
@classmethod @classmethod
@abstractmethod @abstractmethod
def search_eins(self, eins: list[EINData]): def search_eins(self, eins: list[str]) -> list[EINData]:
""" """
Takes a list of EINS and returns a list of EINData objects.
!! Some items in the list may return None
""" """
@classmethod @classmethod
@ -70,26 +153,40 @@ class EINService(ABC):
return re.search("\d{2}(-|)\d{7}", ein) != None return re.search("\d{2}(-|)\d{7}", ein) != None
class EINTaxIDService(EINService): class EINTaxIDService(__EINService):
"""
Concrete class of __EINService using eintaxid.com.
There seems to be no rate limiting on this service at the moment.
It's not actually meant to act as an API. This is a JQuery/php service used to
hydrate their webapp, but we can call it directly
"""
_url = "https://eintaxid.com" _url = "https://eintaxid.com"
def search_ein(self, ein: str) -> EINData: def search_ein(self, ein: str) -> EINData:
debug(ein)
try: try:
# We don't even want to attempt this unless we're using a valid EIN
if not self._isEIN(ein): if not self._isEIN(ein):
print(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX") warning(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX")
raise Exception("Invalid EIN") raise Exception("Invalid EIN")
except Exception as e: except Exception as e:
print(e) warning(f"{e} | {ein}")
return None return None
# Send a POST HTTP request to the site using the search-ajax script
# query just needs to include EIN, XML header required for parsing
req = rq.request("POST",self._url + "/search-ajax.php", data={"query": ein}, \ req = rq.request("POST",self._url + "/search-ajax.php", data={"query": ein}, \
headers={'X-Requested-With': 'XMLHttpRequest'}) headers={'X-Requested-With': 'XMLHttpRequest'})
debug(req)
# Use BeautifulSoup to parse the HTML content
soup = bsp(req.content, "html.parser") soup = bsp(req.content, "html.parser")
debug(soup)
try: try:
# The actual data return is always on line 4. The rest are DIV set up
text = soup.text.splitlines()[4] text = soup.text.splitlines()[4]
except: except:
print(f"Failed: {ein} | {soup}") warning(f"Failed: {ein} | {soup}")
return None return EINData(ein, None,None,None,None,None,None)
data = self._parse_return(text) data = self._parse_return(text)
return EINData( return EINData(
@ -107,6 +204,16 @@ class EINTaxIDService(EINService):
def _parse_return(self, content: str) -> dict: def _parse_return(self, content: str) -> dict:
"""
Use regex to parse the return of a call to the sites
search script
Expects a string created by BS4 using the HTML parse on request content
Specifically line 4 of this sites return
Data that cannot be found will be returned as None
"""
debug(f"EIN Service returned content:\n{content}")
m = re.search("EIN Number:", content) m = re.search("EIN Number:", content)
company = content[0:m.start()].strip() company = content[0:m.start()].strip()
@ -116,6 +223,7 @@ class EINTaxIDService(EINService):
m = re.search("Address:.*Phone:", content) m = re.search("Address:.*Phone:", content)
address = content[m.start()+8:m.end()-6].strip() if m != None else None address = content[m.start()+8:m.end()-6].strip() if m != None else None
if address != None: if address != None:
# We need to split address into pieces, which are conveniently separated by ,
addressPieces = address.split(',') addressPieces = address.split(',')
address1 = addressPieces[0].strip() address1 = addressPieces[0].strip()
city = addressPieces[1].strip() city = addressPieces[1].strip()

@ -0,0 +1,14 @@
# Usage Guide Available
For information on how to set up and utilize this library please view: *https://git.glott.me/LEAF/EINService/wiki/Usage-Guide*
# Information:
- Currently only EINTaxIDService is available. This uses https://eintaxid.com to fetch information about a company.
- The EINData object utilized by this library consists of the following data: EIN, Company Name, Address, City, State, Zip Code, Phone
# Upcoming functionality
Future versions will hopefully provide the following capabilities:
- Option to use other EIN search services
- Automatically format EINData lists into dataframes
- Compare 'EINData dataframes' adding the compare dict as an extra column
A GUI application is also planned for release, but will be located in a separate repo

@ -0,0 +1,15 @@
from EINService import EINTaxIDService
# Basic example: look up a single EIN with the eintaxid.com-backed service.

# Instantiate an EINService object.
# This is what will be used to do all of our searches.
einService = EINTaxIDService()
# Advanced Micro Devices Inc's EIN identifier:
# This is the EIN we will be searching for.
AMD_EIN = "94-1692300"
# The return will be an EINData object.
# If the search was unsuccessful, data members other than EIN will be None.
# NOTE: an EIN that fails format validation yields None instead of an EINData.
searchResult = einService.search_ein(AMD_EIN)
print(searchResult)
# Example output:
# EIN: 941692300 | Name: advanced micro devices inc | Address: 2485 augustine drive | City: santa clara | State: ca | Phone: 408 7494000

@ -0,0 +1,36 @@
from EINService import EINTaxIDService, dataframe_to_eins
import pandas as pd

# Example: search every EIN found in an Excel sheet and compare the results
# against the data stored in that same sheet.

# Instantiate an EINService object.
# This is what will be used to do all of our searches.
einService = EINTaxIDService()

# Here we pull in the data from Excel.
einData = pd.read_excel("SampleData.xlsx")

# Extract the EIN column as a list of strings.
# Excel may load the column as numbers, so convert explicitly: the service
# validates EINs with a regex and therefore needs str values.
einList = einData["Lessee Tax-ID"].astype(str).to_list()

# The service will return a list of EINData objects.
# If no match was found, all data members besides ein will be None.
searchResults = einService.search_eins(einList)
print(searchResults)

# We can also convert a dataframe into a list of EINData.
# This requires that our dataframe has all of the necessary columns.
# The defaults for these columns are:
# "Lessee Tax-ID",
# "NAME",
# "ADDRESS"
# "CITY"
# "STATE"
# "ZIP"
# "PHONE"
#
# You can also specify your own column names as parameters.
einDataList = dataframe_to_eins(einData)
print(einDataList)

# This allows us to compare our search results to our 'local data'.
# NOTE: dataframe_to_eins skips rows it cannot convert, so the pairing below
# is positional and assumes every row of the sheet converted successfully.
for localData, searchResult in zip(einDataList, searchResults):
    comparisonDict = localData.compare(searchResult)
    print(comparisonDict)

@ -1,31 +0,0 @@
import EINService as es
import pandas as pd

# Ad-hoc check: for every deal in the spreadsheet, build the local EINData,
# look the same EIN up through the external service and score how well the
# two records match.

data = pd.read_excel("ExampleCSP.xlsx")
einService = es.EINTaxIDService()
scores = []
for _, deal in data.iterrows():
    # Build the local (LEAF) record; rows with unusable data are reported and skipped.
    try:
        leafEIN = es.EINData(
            str(deal["Lessee Tax-ID"]),
            deal["NAME"],
            deal["ADDRESS"],
            deal["CITY"],
            deal["STATE"],
            str(deal["ZIP"]),
            str(deal["PHONE"]),
        )
    except Exception:
        # `except Exception` rather than a bare `except:` so that Ctrl-C and
        # SystemExit still stop the script.
        print(f"Failed to create EINData (LEAF): {deal['Lessee Tax-ID']}")
        continue

    # Fetch the external record for the same EIN.
    try:
        external = einService.search_ein(leafEIN.get_ein())
    except Exception:
        print(f"Failed to create EINData (EXTERNAL): {deal['Lessee Tax-ID']}")
        continue

    # The service returns None when it rejects the EIN; nothing to compare.
    if external is None:
        continue

    scores.append(leafEIN.compare(external))

print(f"Scores:\n{scores}")
Loading…
Cancel
Save