#!/usr/bin/env python3
#
"""Performs a well-formedness check on XML.

This script is needed as xmllint cannot ignore duplicated IDs.
This tool:

 * Processes XIncludes (if requested using the option --xinclude)
 * Ignores non-unique IDs (attributes xml:id or id)
 * Outputs errors about undeclared entities
"""

__author__ = "Thomas Schraitle"
__version__ = "0.3.1"

import argparse
import enum
import logging
from io import StringIO
import re
import sys

from lxml import etree


ROOTLOGGER = "lxmlerrors"
# FORMAT refers to the non-standard fields "file", "line" and "col".
# Every call on this logger therefore has to supply them through the
# ``extra`` keyword, for example:
#   logger.error("...", extra=dict(file="foo.xml", line=3, col=16))
# The named groups of ERROR_PATTERN include these keys, so a match's
# groupdict() (minus "message") can be passed as ``extra`` directly.
FORMAT = "%(file)s:\n    %(levelname)s: %(message)s (line %(line)s column %(col)s)"

# Module-wide logger; its stream handler only lets ERROR and above through.
logger = logging.getLogger(ROOTLOGGER)
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter(FORMAT))
ch.setLevel(logging.ERROR)
logger.addHandler(ch)

# Refuse to run with an lxml release older than 4.4.2.
if etree.LXML_VERSION < (4, 4, 2):
    logger.critical(
        "Need a minimum version of 4.4.2 of lxml, got %s",
        ".".join(map(str, etree.LXML_VERSION)),
        extra={"file": __file__, "line": "", "col": ""},
    )
    sys.exit(10)


# Format of libxml error messages as they arrive in the log:
#   foo.xml:3:16:FATAL:PARSER:ERR_UNDECLARED_ENTITY: <message text>
# The leading file group is greedy, so file names that contain ":" are
# still captured as a whole (the regex was tested against such files).
#
# The two identifier groups after the level also accept digits, because
# some identifiers contain them (e.g. the domains I18N and C14N, or type
# names with numeric suffixes); with letters only, such lines would not
# match at all.
ERROR_PATTERN = re.compile(
    r"(?P<file>.*)"
    r":(?P<line>\d+)"
    r":(?P<col>\d+)"
    r":(?P<level>[A-Z]+)"
    r":(?P<origin>[A-Z0-9]+)"
    r":(?P<domain>[A-Z0-9_]+)"
    r":\s*"
    r"(?P<message>.*)$"
)


class ExitCode(enum.IntEnum):
    """Exit statuses of this script.

    The regular codes are spaced in steps of ten; ``unknown`` sits apart
    at 200.
    """
    ok = 0
    syntax = 10
    entities = 20
    xinclude = 30
    multiple = 40
    file_not_found = 50
    unknown = 200


def extract_log(stream):
    """Yield one record per non-blank line of a captured log stream.

    Each line is matched against ``ERROR_PATTERN`` and the named groups of
    the match are yielded as a dict. A line that does not follow that
    layout (for instance the continuation of a multi-line message) is not
    dropped and does not abort the iteration: it is yielded as a fallback
    record with the same keys, level ``"ERROR"``, the stripped line as
    message, ``"<unknown>"`` as file and empty strings elsewhere.

    :param stream: object whose ``getvalue()`` returns the collected log
        text (e.g. :class:`io.StringIO`)
    :return: generator of dicts keyed by the group names of
        ``ERROR_PATTERN``
    """
    for line in stream.getvalue().split("\n"):
        text = line.strip()
        if not text:
            continue
        match = ERROR_PATTERN.search(line)
        if match is not None:
            yield match.groupdict()
            continue
        # Unparseable line: keep it visible instead of failing on
        # ``None.groupdict()``.
        record = dict.fromkeys(ERROR_PATTERN.groupindex, "")
        record.update(file="<unknown>", level="ERROR", message=text)
        yield record


def print_error_msg(stream) -> int:
    """Re-emit every record found in *stream* through the module logger.

    The textual level of each record is translated to a ``logging`` level
    (anything unknown is treated as ERROR); the remaining fields of the
    record are handed to the logger as ``extra`` values.

    :param stream: the stream we want to extract the messages from
    :return: the number of found entries (=error message lines)
    """
    severity_by_name = {
        "FATAL": logging.FATAL,  # etree.ErrorLevels.FATAL
        "ERROR": logging.ERROR,  # etree.ErrorLevels.ERROR
        "WARNING": logging.WARNING,  # etree.ErrorLevels.WARNING
    }
    count = 0
    for count, record in enumerate(extract_log(stream), start=1):
        severity = severity_by_name.get(record["level"], logging.ERROR)
        # "message" must not be part of ``extra``, so take it out first.
        text = record.pop("message")
        logger.log(severity, text, extra=record)

    return count


def check_wellformedness(xmlfile: str, xinclude: bool = True) -> int:
    """Checks a file for well-formedness

    The file is parsed with ``collect_ids=False`` so that non-unique IDs
    do not cause problems. While parsing, lxml's global error log is routed
    into a private in-memory stream; afterwards every collected line is
    re-emitted through :func:`print_error_msg`.

    The temporary log handler is detached again before returning, so
    repeated calls do not pile up handlers (and their buffers) on the
    shared child logger.

    :param xmlfile: filename to XML file
    :param xinclude: do xinclude processing (default: True) or not
    :return: 0 (everything ok) or != 0 (some problem); the value is an
        :class:`ExitCode` member or its integer value
    :rtype: int
    """
    result = ExitCode.ok

    log_stream = StringIO()
    log = logging.getLogger(ROOTLOGGER).getChild("wellformedness")
    # Keep the raw libxml lines away from the parent's handler; they are
    # parsed and re-logged in a readable form further down.
    log.propagate = False
    handler = logging.StreamHandler(log_stream)
    handler.setLevel(logging.WARNING)
    log.addHandler(handler)

    try:
        etree.use_global_python_log(etree.PyErrorLog(xmlfile, log))
        # We don't want to collect all IDs to avoid problems when
        # IDs are non-unique:
        xmlparser = etree.XMLParser(collect_ids=False)

        try:
            tree = etree.parse(xmlfile, parser=xmlparser)

            if xinclude:
                tree.xinclude()

        except OSError:
            # Write artificial error message to our log stream, using the
            # same layout as the libxml messages:
            log_stream.write(f"{xmlfile}:0:0:FATAL:PARSER:FILE_NOT_FOUND:File does not exist\n")
            result = ExitCode.file_not_found.value

        except etree.XMLSyntaxError:
            result = ExitCode.syntax.value

        except etree.XIncludeError:
            result = ExitCode.xinclude.value
    finally:
        # The child logger is shared between calls; without this the
        # handler (and its StringIO) would stay attached forever.
        # Closing the handler does not close ``log_stream``.
        log.removeHandler(handler)
        handler.close()

    idx = print_error_msg(log_stream)

    # HACK:
    # For some unknown reason, when parsing MAIN.SLEDS.xml with an unknown
    # entity, it doesn't raise XIncludeError or XMLSyntaxError exceptions. :-(
    #
    # If the return code hasn't changed but there are error messages,
    # make sure to return some error code:
    if idx and result == ExitCode.ok:
        result = ExitCode.multiple
    return result


def parse_cli(args=None) -> argparse.Namespace:
    """Define the command-line interface and evaluate the arguments

    The first line of the module docstring becomes the description of the
    parser, the rest of it the epilog.

    :param list args: the arguments to parse; passed on to
        ``ArgumentParser.parse_args`` unchanged
    :return: parsed arguments
    :rtype: :class:`argparse.Namespace`
    """
    doc_parts = __doc__.split("\n", 1)
    cli = argparse.ArgumentParser(description=doc_parts[0], epilog=doc_parts[-1])

    cli.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    cli.add_argument(
        "--xinclude",
        action="store_true",
        help="Do XInclude processing",
    )
    cli.add_argument(
        "-W",
        "--warnings-as-errors",
        action="store_true",
        help=(
            "Flag to set the behavior when referenced files with "
            "XIncludes cannot be found; (default: %(default)s)"
        ),
    )
    cli.add_argument(
        "-S",
        "--stats",
        action="store_true",
        help="Print statistic about successful/failed files",
    )
    cli.add_argument(
        "xmlfiles",
        nargs="+",
        help="XML file(s) to check for well-formedness",
    )
    return cli.parse_args(args)


def main(cliargs=None) -> int:
    """Entry point for the application script

    Checks every file given on the command line and adds up the individual
    results. With ``--warnings-as-errors`` the loop stops after the first
    failing file; that failure is still counted and reflected in the
    returned code. With ``--stats`` a summary line is printed to stderr.

    :param list cliargs: Arguments to parse or None (=use :class:`sys.argv`)
    :return: error code; 0 => everything was successful, !=0 => error.
        The value never exceeds 255.
    :rtype: int
    """
    args = parse_cli(cliargs)

    endresult = int(ExitCode.ok)
    failures = 0
    successes = 0

    for xmlfile in args.xmlfiles:
        result = check_wellformedness(xmlfile, xinclude=args.xinclude)
        # Account for the file first, so that an early stop below cannot
        # make a failing run look successful.
        endresult += result
        if result:
            failures += 1
            if args.warnings_as_errors:
                break
        else:
            successes += 1

    if args.stats:
        msg = (f"--- Successful Files={successes}, "
               f"Failed files={failures} ---"
               )
        print(msg, file=sys.stderr)

    # A process exit status commonly holds only 8 bits; a larger sum would
    # be truncated and could even wrap around to 0 (= success).
    return min(int(endresult), 255)


# Run the command-line entry point and use its result as exit status.
if __name__ == "__main__":
    raise SystemExit(main())
