From 14884426121e3e54b5bf7d378429455d3ecdd00e Mon Sep 17 00:00:00 2001 From: Griffiths Lott Date: Sun, 18 Dec 2022 12:58:31 -0500 Subject: [PATCH] Added example files and logging (replaces prints) --- EINService.py | 29 ++++++++++++++++++----------- README.md | 14 ++++++++++++++ basic_example.py | 15 +++++++++++++++ example_from_excel.py | 36 ++++++++++++++++++++++++++++++++++++ test.py | 31 ------------------------------- 5 files changed, 83 insertions(+), 42 deletions(-) create mode 100644 README.md create mode 100644 basic_example.py create mode 100644 example_from_excel.py delete mode 100644 test.py diff --git a/EINService.py b/EINService.py index 951ca43..25f2a4b 100644 --- a/EINService.py +++ b/EINService.py @@ -3,13 +3,13 @@ from bs4 import BeautifulSoup as bsp import re from abc import ABC, abstractmethod from dataclasses import dataclass -from datetime import datetime as dt -from pprint import pprint as prt -import pandas as pd +from logging import warning, debug, error +from pandas import DataFrame SAMPLE_EIN = "59-1571026" + @dataclass class EINData: """ @@ -64,9 +64,10 @@ class EINData: "zip" : True if self.zip == otherEIN.zip else False } except Exception as e: - print(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""") + error(f"""Exception:\n{e}\nSelf:{self}\nOther: {otherEIN}\n""") # If we cannot succesfully compare the data return None return None + debug(compareDict) score = 0 for v in compareDict.values(): # increase score by one for every value that is true @@ -75,7 +76,7 @@ class EINData: return compareDict -def dataframe_to_eins(df: pd.DataFrame, +def dataframe_to_eins(df: DataFrame, einLabel: str = "Lessee Tax-ID", nameLabel: str = "NAME", addressLabel: str = "ADDRESS", @@ -90,6 +91,7 @@ def dataframe_to_eins(df: pd.DataFrame, Requires the dataframe contains ceratain data labels. 
""" # Confirm all correct columns exist + debug(df) try: columns = df.columns assert (einLabel in columns), f"EIN label not present: {einLabel} | {columns}" @@ -100,13 +102,14 @@ def dataframe_to_eins(df: pd.DataFrame, assert (zipLabel in columns), f"EIN label not present: {zipLabel} | {columns}" assert (phoneLabel in columns), f"EIN label not present: {phoneLabel} | {columns}" except Exception as e: - print(f"NOT ALL REQUIRED COLUMNS PRESENT!\n{e}\n{df}") + error(f"NOT ALL REQUIRED COLUMNS PRESENT!\n{e}\n{df}") eins = [] for _, data in df.iterrows(): # _ is the row index which is not used # The rest is a pandas serices # The column labels are used to pull the data from the series + debug(data) try: eins.append(EINData( str(data[einLabel]), @@ -119,7 +122,7 @@ def dataframe_to_eins(df: pd.DataFrame, )) except Exception as e: # If we fail, we port the error and move on to the next item in dataframe - print(f"Could not add {einLabel}!") + warning(f"Could not add {einLabel}!") continue class __EINService(ABC): @@ -161,26 +164,29 @@ class EINTaxIDService(__EINService): _url = "https://eintaxid.com" def search_ein(self, ein: str) -> EINData: + debug(ein) try: # We don't even want to attempt this unless we're using a valid EIN if not self._isEIN(ein): - print(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX") + warning(f"{ein} is not a valid EIN!\nValid formats are: XX-XXXXXXX and XXXXXXXXX") raise Exception("Invalid EIN") except Exception as e: - print(e) + warning(f"{e} | {ein}") return None # Send a POST HTTP request to the site using the search-ajax script # query just needs to include EIN, XML header required for parsing req = rq.request("POST",self._url + "/search-ajax.php", data={"query": ein}, \ headers={'X-Requested-With': 'XMLHttpRequest'}) + debug(req) # Use BeautifulSoup to parse the HTML content soup = bsp(req.content, "html.parser") + debug(soup) try: # The actual data return is always on line 4. 
The rest are DIV set up text = soup.text.splitlines()[4] except: - print(f"Failed: {ein} | {soup}") - return None + warning(f"Failed: {ein} | {soup}") + return EINData(ein, None,None,None,None,None,None) data = self._parse_return(text) return EINData( @@ -207,6 +213,7 @@ class EINTaxIDService(__EINService): Data that cannot be found will be returned as None """ + debug(f"EIN Service returned content:\n{content}") m = re.search("EIN Number:", content) company = content[0:m.start()].strip() diff --git a/README.md b/README.md new file mode 100644 index 0000000..f475f3d --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# Usage Guide Available +For information on how to set up and utilize this library please view: *https://git.glott.me/LEAF/EINService/wiki/Usage-Guide* + +# Information: +- Currently only EINTaxIDService is available. This uses https://eintaxid.com to fetch information about a company. +- The EINData object utilized by this library consists of the following data: EIN, Company Name, Address, City, State, Zip Code, Phone + +# Upcoming functionality +Future versions will hopefully provide the following capabilities: +- Option to use other EIN search services +- Automatically format EINData lists into dataframes +- Compare 'EINData dataframes' adding the compare dict as an extra column + +A GUI application is also planned for release, but will be located in a separate repo \ No newline at end of file diff --git a/basic_example.py b/basic_example.py new file mode 100644 index 0000000..2210afc --- /dev/null +++ b/basic_example.py @@ -0,0 +1,15 @@ +from EINService import EINTaxIDService + +# Instantiate an EINService object +# This is what will be used to do all of our searches +einService = EINTaxIDService() + +# Advanced Micro Devices Inc's EIN identifier: +# This is the ein we will be searching. 
+AMD_EIN = "94-1692300" +# The return will be an EINData object +# If the search was unsuccessful data members other than EIN will be None +searchResult = einService.search_ein(AMD_EIN) + +print(searchResult) +# EIN: 941692300 | Name: advanced micro devices inc | Address: 2485 augustine drive | City: santa clara | State: ca | Phone: 408 7494000 \ No newline at end of file diff --git a/example_from_excel.py b/example_from_excel.py new file mode 100644 index 0000000..71e0602 --- /dev/null +++ b/example_from_excel.py @@ -0,0 +1,36 @@ +from EINService import EINTaxIDService, dataframe_to_eins +import pandas as pd + +# Instantiate an EINService object +# This is what will be used to do all of our searches +einService = EINTaxIDService() + +# Here we pull in the data from excel +einData = pd.read_excel("SampleData.xlsx") +# Extract the eins column as a list of strings +einList = einData["Lessee Tax-ID"].to_list() +# The service will return a list of EINData objects +# if no match was found, all data members besides ein will be None +searchResults = einService.search_eins(einList) + +print(searchResults) + +# Can also convert a dataframe into a list of EINData +# This requires that our dataframe has all of the necessary columns. +# The defaults for these columns are: +# "Lessee Tax-ID", +# "NAME", +# "ADDRESS" +# "CITY" +# "STATE" +# "ZIP" +# "PHONE" +# +# You can also specify your own column names as parameters. 
+einDataList = dataframe_to_eins(einData) +print(einData) + +# This allows us to compare our search results to our 'local data' +for i, localData in enumerate(einDataList): + comparisonDict = localData.compare(searchResults[i]) + print(comparisonDict) \ No newline at end of file diff --git a/test.py b/test.py deleted file mode 100644 index 41139b0..0000000 --- a/test.py +++ /dev/null @@ -1,31 +0,0 @@ -import EINService as es -import pandas as pd - -data = pd.read_excel("ExampleCSP.xlsx") -einService = es.EINTaxIDService() - -scores = [] -for _, deal in data.iterrows(): - try: - leafEIN = es.EINData( - str(deal["Lessee Tax-ID"]), - deal["NAME"], - deal["ADDRESS"], - deal["CITY"], - deal["STATE"], - str(deal["ZIP"]), - str(deal["PHONE"]), - ) - except: - print(f"Failed to create EINData (LEAF): {deal['Lessee Tax-ID']}") - continue - - try: - external = einService.search_ein(leafEIN.get_ein()) - if external == None: continue - except: - print(f"Failed to create EINData (EXTERNAL): {deal['Lessee Tax-ID']}") - continue - scores.append(leafEIN.compare(external)) - -print(f"Scores:\n{scores}") \ No newline at end of file