# XMLRuleParser/parse_xml.py

import re
from pandas import DataFrame
from typing import Union
from logging import debug as dbg, getLogger, exception as exc, FileHandler, StreamHandler
import win32clipboard
import argparse as ap
from pathlib import Path


# Patterns used by create_table, compiled once instead of on every loop pass.
# Raw strings keep regex escapes such as \d from being treated as (invalid)
# Python string escapes.
_SCENARIO_RE = re.compile(r'<Scenario Seq="\d{1,3}">((?!<Sc)(.|\n))*</S')
_SEQ_RE = re.compile(r'"\d{1,3}"')
_CONDITION_RE = re.compile(
    r'<Condition Id="\w+" Group="\w+" CompareTo="(Value|Range)">'
    r'((?!</C)(.|\n))*</Condition>'
)
_UPDATE_RE = re.compile(
    r'<UpdateField Id="\w+" Group="\w+" UIRequired="\d+" UIDisabled="\d+" '
    r'ForceUpdate="\d+">\n?((?!</U)(.|\n))*</Up'
)
_FIRST_QUOTED_RE = re.compile(r'"[^"]*"')
_FIRST_QUOTED_WORD_RE = re.compile(r'"\w+"')
_INLINE_VALUE_RE = re.compile(r'e">(.)*<')
_BLOCK_VALUE_RE = re.compile(r'e"(((?!</C)(.|\n))*)</C')
# [\w.]+ accepts exactly the same text as the former (\w+|\.)+ but without the
# nested quantifiers that can backtrack exponentially on long non-matching text.
_UPDATE_VALUE_RE = re.compile(r'>[\w.]+</Value>')


def create_table(xmlStr: str) -> Union[DataFrame, Exception]:
    """Build a table with one row per ``<Scenario Seq="N">`` block in *xmlStr*.

    Every ``<Condition>`` and ``<UpdateField>`` found inside a scenario
    contributes a column named after the element's ``Id`` attribute. Cells for
    fields a scenario does not define hold an empty string. The scenario
    number becomes the ``SEQ`` index of the returned frame.

    Args:
        xmlStr: Raw XML text to scan. It is matched with regular expressions,
            not parsed as XML.

    Returns:
        The populated DataFrame indexed by ``SEQ``, or - instead of raising -
        the exception object describing why no table could be built
        (including ``Exception("No data found...")`` for an empty result).

    Raises:
        AttributeError: If an ``<UpdateField>`` contains no ``<Value>`` whose
            text consists solely of word characters and dots.
    """
    dataDict = {
        "SEQ": [],
    }

    for scenario in _SCENARIO_RE.finditer(xmlStr):
        scenario_text = scenario.group()
        # The first quoted 1-3 digit number is the Seq attribute; drop quotes.
        seq = _SEQ_RE.search(scenario_text).group()[1:-1]
        dbg(f"\nSeq: {seq}")
        dbg(f"\n\nSenario Group: {scenario_text}")

        row = {"SEQ": int(seq)}

        for condition in _CONDITION_RE.finditer(scenario_text):
            condition_text = condition.group()
            # The first quoted attribute of the element is its Id.
            field_id = _FIRST_QUOTED_RE.search(condition_text).group()[1:-1]
            inline = _INLINE_VALUE_RE.search(condition_text)
            if inline is None:
                # Content starts on a new line: capture everything up to the
                # closing tag, then trim the surrounding whitespace.
                block = _BLOCK_VALUE_RE.search(condition_text)
                value = block.group()[4:-3].strip()
            else:
                # Strip the leading 'e">' and the trailing '<'.
                value = inline.group()[3:-1]
            dbg(f"SEQ: {seq} | {field_id} | {value}")
            row[field_id] = value

        for update in _UPDATE_RE.finditer(scenario_text):
            update_text = update.group()
            dbg(f"{seq} | Update: {update_text}")
            field_id = _FIRST_QUOTED_WORD_RE.search(update_text).group()[1:-1]
            start, end = _UPDATE_VALUE_RE.search(update_text).span()
            # Skip the leading '>' and the 8 characters of '</Value>'.
            value = update_text[start + 1:end - 8]
            dbg(f"{seq} UPDATE | {field_id} : {value}")
            row[field_id] = value

        # Merge this scenario's row into the column-oriented table.
        rows_before = len(dataDict["SEQ"])
        for key, column in dataDict.items():
            column.append(row.get(key, ""))
        for key, value in row.items():
            if key not in dataDict:
                # Back-fill by the number of rows already collected, not by
                # the Seq number, so gaps or a non-1 start in the Seq
                # numbering still yield equally long columns.
                dataDict[key] = [""] * rows_before + [value]
                dbg(f"New key: {key} | {dataDict[key]}")

        dbg(f"{seq} | {dataDict}\n")

    dbg(dataDict)
    if getLogger().level == 10:
        for key, column in dataDict.items():
            dbg(f"{key} : {len(column)}")
    try:
        table = DataFrame(dataDict)
        table.set_index("SEQ", inplace=True)
        dbg(table)
        if table.empty:
            # Exception() accepts no keyword arguments; passing any raised a
            # TypeError that hid this message from the user.
            raise Exception("No data found...")
        return table
    except Exception as e:
        return e


def process_clipboard() -> str:
    """Interactively fetch the XML to parse from the Windows clipboard.

    Shows the current clipboard content and asks the user to confirm it.
    Any answer containing "y" (case-insensitive) or "1" accepts the content;
    otherwise the user is asked to copy the XML and the clipboard is read
    again. Answering "debug" switches the root logger to DEBUG level and then
    re-prompts.

    Returns:
        The clipboard content the user confirmed, or the placeholder string
        "None" if the confirmed read failed.
    """
    while True:
        win32clipboard.OpenClipboard()
        try:
            xml = win32clipboard.GetClipboardData()
        except Exception:
            # Reading failed; show a placeholder so the user can re-copy.
            # Exception (not a bare except) lets Ctrl+C still interrupt.
            xml = "None"
        finally:
            # Always release the clipboard, whatever happened above.
            win32clipboard.CloseClipboard()
        print("\n\nYour current clipboard is as follows:")
        print(xml)
        yn = input("\nIs this the XML you'd like to parse? (y/n)\n >")
        if yn.lower() == "debug":
            getLogger().setLevel(10)
            print("\nYou have now entered debug mode...")
        if re.search("(?i)y|1", yn) is not None:
            return xml
        input("Please copy the xml then press enter...")


def alter_suffix(p: Path, desired: str) -> Path:
    """Return *p* with its final suffix set to *desired*.

    The directory part of *p* is preserved, only the last suffix is replaced,
    and a path without any suffix simply gets *desired* appended.

    Args:
        p: Path whose extension should be normalised.
        desired: Target suffix including the leading dot, e.g. ".xlsx".

    Returns:
        *p* itself when it already ends in *desired*, otherwise a new Path
        carrying the requested suffix.

    Raises:
        ValueError: If *p* has an empty name or *desired* is not a valid
            suffix (raised by ``Path.with_suffix``).
    """
    if p.suffix != desired:
        # with_suffix keeps the parent directories and touches only the last
        # suffix; str.replace on the bare name dropped the directory and
        # rewrote every occurrence of the suffix text.
        p = p.with_suffix(desired)
    return p

def main(xml: str) -> DataFrame:
    """Parse *xml* into a table, show it, and copy it to the clipboard.

    When parsing fails the error is printed and the user is asked to supply
    new XML through the clipboard before another attempt is made.

    Args:
        xml: XML text to parse on the first attempt.

    Returns:
        The successfully parsed DataFrame.
    """
    while True:
        table: Union[DataFrame, Exception] = create_table(xml)
        if isinstance(table, DataFrame):
            break
        print(f"\n\nENCOUNTERED ERROR!:\n{table}\n")
        input("Please try again...")
        # Parsing the same text again would fail the same way forever, so
        # fetch fresh XML from the clipboard before retrying.
        xml = process_clipboard()

    print("Table sample:")
    print(table)
    table.to_clipboard()
    print("This table is now in your clipboard to paste into excel.")
    return table

if __name__ == "__main__":
    # Command-line entry point: read XML from a file or the clipboard, parse
    # it into a table and write that table to an Excel workbook.

    # Level 40 (ERROR) keeps the console quiet unless debug mode is enabled.
    getLogger().setLevel(40)

    try:
        parser = ap.ArgumentParser(
            prog="XML Parser",
            description='''This program parses XML data into a pandas DataFrame.
            The XML data can come from an input file or the clipboard.
            If an output file is specified, the DataFrame will be written to this file in Excel format.
            If debug mode is enabled, detailed logging information will be written to "xml_parse.log".'''
        )

        parser.add_argument(
            "-i", "--input",
            help="Path to the XML file to parse. If not specified, the program will ask for XML data from the clipboard."
        )

        parser.add_argument(
            "-o", "--output",
            help="Path to the output Excel file. If not specified, the DataFrame will be written to 'Parsed XML.xlsx' in the current directory."
        )

        parser.add_argument(
            "--debug", action="store_true",
            help="Enable debug mode. Detailed logging information will be written to 'xml_parse.log'."
        )

        args = parser.parse_args()

        if args.debug:
            # Everything (level 10, DEBUG) goes to the log file; the console
            # handler still shows errors only (level 40).
            logger = getLogger()
            logger.setLevel(10)
            f_handler = FileHandler(
                Path("xml_parse.log")
            )
            f_handler.setLevel(10)
            s_handler = StreamHandler()
            s_handler.setLevel(40)
            logger.addHandler(f_handler)
            logger.addHandler(s_handler)

        if args.input is not None:
            i_file = Path(args.input)

            if not i_file.exists():
                raise ValueError(f"{i_file} could not be found. Make sure the path is correct.")
            elif i_file.suffix.lower() != ".xml":
                # Compare case-insensitively so "data.XML" is accepted too.
                raise NotImplementedError(f"This program can only parse .xml not {i_file}!")
            with open(i_file) as xml_file:
                xml_str = xml_file.read()
            # Default output: same folder and stem as the input file.
            output_path = i_file.with_suffix(".xlsx")

        else:
            xml_str = process_clipboard()
            output_path = Path("Parsed XML.xlsx")

        xml_df: DataFrame = main(xml_str)
        if args.output is not None:
            try:
                # Path.suffix is read-only, so build a corrected copy with
                # with_suffix instead of assigning to the attribute.
                output_path = Path(args.output).with_suffix(".xlsx")
            except ValueError as e:
                # with_suffix rejects paths without a file name; keep the
                # default output path chosen above in that case.
                exc(
                    f"Failed to use passed output file: {args.output}. "
                    f"Using {output_path}.\n{e}"
                )
        # NOTE(review): create_table makes SEQ the index, so index=False
        # leaves the SEQ numbers out of the workbook - confirm this is intended.
        xml_df.to_excel(output_path, freeze_panes=(0,1), index=False)
        print(f"Processing Complete!\nOutput data available here: {output_path}")
        input("\n\nPress any key to exit.")

    except Exception as e:
        print(f"The program failed to start due to the following exception:\n{e}")
        input("Please make note of the error before closing so that you can report it.")