# PortfolioParser/ILParser.py

from pandas import DataFrame
import re


# Matches one header token: a run of one or more word characters, dots, '#' or '/'.
# The matched text is exposed through the named group "column_name".
COLUMN_NAME_REGEX = re.compile(r"(?P<column_name>(\w|\.|#|\/)+)", re.IGNORECASE)

def replace_bad_cols(line: str, cols: list[str]) -> str:
    """
    Replaces bad column names in a string with modified names that have spaces replaced with dots.

    Each replacement swaps spaces for dots one-to-one, so the returned string has the same length
    as the input and every character keeps its original position.

    Names are processed longest first, so the outcome does not depend on the order of ``cols``
    when one name contains another (e.g. "Acct No" and "Acct No Ext").

    Args:
        line (str): The string containing the column names to modify.
        cols (list[str]): A list of column names to modify. Names without spaces are ignored
            because replacing them would not change anything.

    Returns:
        str: The modified string with bad column names replaced.
    """
    # Longest first: otherwise rewriting "Acct No" would turn "Acct No Ext" into
    # "Acct.No Ext", which the longer name can no longer match.
    for name in sorted(cols, key=len, reverse=True):
        if ' ' not in name:
            continue
        line = line.replace(name, name.replace(' ', '.'))
    return line


def extract_data(input_doc: str, column_list: list[str]) -> DataFrame:
    """
    Extracts data from a string in a table-like format, where columns are identified by a list of column names, and
    returns the data as a Pandas DataFrame.

    The first line that starts with a word character is treated as the header line. Column names
    from ``column_list`` found in it have their spaces replaced with dots, and every token matched
    by ``COLUMN_NAME_REGEX`` becomes a column whose character span is the span of that token.
    Each following line of at least two characters is sliced at those spans, until a line
    containing "<number> records listed" is reached or the input ends.

    Args:
        input_doc (str): The string containing the table-like data to extract.
        column_list (list[str]): A list of column names to identify the columns in the table-like data.

    Returns:
        pandas.DataFrame: A DataFrame containing the extracted data from the input string, with
        one column per header token and whitespace-stripped string values. The DataFrame is empty
        when no header line is found.
    """
    # Raw-string patterns, compiled once instead of being looked up for every line.
    header_line = re.compile(r"^\w")
    end_of_table = re.compile(r"\d+ records listed")

    columns = {}
    data = {}
    for line in input_doc.splitlines():
        if not columns:
            # Still looking for the header: skip everything until a line starts with a word character.
            if header_line.search(line):
                header = replace_bad_cols(line, column_list)
                # Record the character span of each header token; data lines are sliced with it.
                for match in COLUMN_NAME_REGEX.finditer(header):
                    name = match.group("column_name")
                    columns[name] = slice(match.start(), match.end())
                    data[name] = []
            continue
        if len(line) < 2:
            # Blank or one-character lines carry no row data.
            continue
        if end_of_table.search(line):
            # Footer line marks the end of the table.
            break
        for name, span in columns.items():
            data[name].append(line[span].strip())
    # Also reached when the input has no footer line, so the caller always gets a DataFrame.
    return DataFrame(data)