Description: Add debian maintainer assistant
 This patch adds a helper script for maintainers of Debian FreeDict packages. See
 the documentation of that script to find out more.
Author: Sebastian Humenda <shumenda@gmx.de>
Last-Update: 2018-10-28

Index: freedict-tools/fetchdictdata.py
===================================================================
--- /dev/null
+++ freedict-tools/fetchdictdata.py
@@ -0,0 +1,519 @@
+#!/usr/bin/env python3
+"""This script is designed to help with the Debian FreeDict packaging. It can
+fetch all available databases from
+https://www.freedict.org/freedict-database.xml
+and generate an orig source tar ball or generate debian/control and
+debian/copyright.
+
+Please use `--help` to find out more details.
+
+IMPORTANT: you must run this script from the root of the freedict source,
+otherwise the operations will fail.
+
+Generating debian/copyright and debian/control
+==============================================
+
+Since FreeDict packages contain a lot of very similar dictionaries, the process
+is made more convenient by generating the mentioned files. d/control is made up
+of a d/control.HEAD file (make sure this one exists), usually with the normal
+source stanza and followed by auto-generated stanzas, derived from information
+of the debian/freedict-database.xml. The XML API file is fetched automatically,
+if required.
+
+The d/copyright file is generated using licensecheck and a bit of guessing
+logic. The file d/copyright.snippets/TAIL is appended and usually contains the
+licence definition stanzas. If the copyright file contains "FIXME", the
+maintainer may also like to create a plain-text file d/copyright.snippets/xxx-yyy.
+As soon as such a file is found, no licence information will be queried for
+the specified dictionary.
+"""
+
# pylint: disable=multiple-imports,too-few-public-methods,global-statement
import argparse
import collections, re
import os, shutil, subprocess, sys
import datetime
import textwrap
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
+
# Location of the machine-readable FreeDict release metadata.
# NOTE(review): the module docstring advertises https, this URL uses plain
# http — confirm which scheme the server canonicalises to.
XML_URL = "http://www.freedict.org/freedict-database.xml"
__DICT2LONGNAME = {}  # filled by dictionarycode2longdescription
+
+
def dictionarycode2longdescription(string):
    """Turn a dictionary code such as "deu-eng" into a readable English
    language pair such as "German-English".

    A dictionary code is a hyphen-delimited pair of ISO 639-3 codes; the
    translation table is loaded lazily from the debian/iso-*.tab files on
    first use."""
    global __DICT2LONGNAME
    if not __DICT2LONGNAME:  # lazy initialisation
        __DICT2LONGNAME = parse_iso_table()
    parts = string.split("-")
    return "%s-%s" % (__DICT2LONGNAME[parts[0]], __DICT2LONGNAME[parts[1]])
+
+
def parse_iso_table():
    """Parse the ISO 639 codes from the tab-delimited debian/iso-*.tab files.

    Returns a dict mapping each ISO code (column 1 and, when present, the
    alternative code in column 2) to the English language name (column 7),
    with any parenthesised qualifier stripped."""
    tbl = {}
    for item in os.listdir("debian"):
        if not item.startswith("iso-"):
            continue
        # use a context manager so the file handle is closed reliably
        with open(os.path.join("debian", item), "r", encoding="utf-8") as f:
            lines = f.read().split("\n")
        for line in lines[1:]:  # first line is the column header
            if not line.strip():
                continue  # skip blank lines (e.g. from the trailing newline)
            fields = line.split("\t")
            # remove parenthesised qualifiers from the language name
            name = re.sub(r"\s*\(.*?\)\s*", "", fields[6])
            tbl[fields[0]] = name
            if fields[1] != "":  # some languages have a second code, add it
                tbl[fields[1]] = name
    return tbl
+
+
def get_xml_content(fetch_new=False):
    """Return the FreeDict database XML as a string.

    The data is cached in debian/freedict-database.xml; it is downloaded
    from XML_URL when fetch_new is True or when no cached copy exists."""
    cache = "debian/freedict-database.xml"
    if fetch_new or not os.path.exists(cache):
        data = urllib.request.urlopen(XML_URL).read().decode("utf-8")
        # context managers so handles are closed deterministically
        with open(cache, "w", encoding="utf-8") as f:
            f.write(data)
    else:  # read the cached copy
        with open(cache, "r", encoding="utf-8") as f:
            data = f.read()
    return data
+
def run_licensecheck_on(file_path):
    """Run licensecheck(1) on the given file and return the licence string
    it reports, or None if nothing useful was detected or the tool is not
    installed."""
    try:
        proc = subprocess.run(
            ["licensecheck", "--copyright", "--deb-fmt", file_path],
            stdout=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError:
        return None  # licensecheck is not installed
    # licensecheck prints "<path>: <licence info>"; only the first line matters
    first_line = proc.stdout.decode("utf-8").lstrip().split("\n")[0]
    sre = re.search(r"^.+: (.+)$", first_line)
    if not sre:
        return None
    match = sre.groups()[0]
    return None if "UNKNOWN" in match else match
+
def find_licence(dict):
    """Determine the licence of the given dictionary.

    licensecheck is run on the TEI file and on COPYING.tei first; if that
    yields nothing, the TEI header is scanned with a few heuristics.
    Detected values look like GPL, GPL-2, GPL-3, GPL-2+, GPL-3+ or
    CC-BY-SA-x.y; "FIXME" is returned when nothing could be detected."""
    tei_fn = "{0}{1}{0}.{2}".format(dict, os.sep, "tei")
    licence = run_licensecheck_on(tei_fn)
    if not licence:
        licence = run_licensecheck_on(dict + os.sep + "COPYING.tei")
        if not licence:
            licence = "FIXME"  # fall-back, possibly refined below

    if licence != "FIXME":  # found a licence, return
        return licence

    # try guessing the licence from the TEI file header
    with open(tei_fn, "r", encoding="utf-8") as f:
        in_header = True
        line = "start"
        lastline = ""
        while in_header and line != "":
            line = f.readline().lower()
            if "<body" in line:
                line = line[: line.find("<body")]  # only parse up to the body
                in_header = False
            if "gpl" in line or "gnu general public lic" in line:
                licence = "GPL"
                # try to extract the version number
                # BUG FIX: the pattern used to repeat "licence" twice, miss the
                # "license" spelling and leave the dot in "ver." unescaped
                res = re.search(
                    r"(?:version|ver\.|license|licence)\s+(\d+)", lastline + line
                )
                if res:
                    licence += "-%s" % res.groups()[0]
            elif "attribution-sharealike" in line:
                licence = "CC-BY-SA"
                version = re.search(r"sharealike \(?v?(\d+\.?\d*)", line)
                if version:
                    licence += "-%s" % version.groups()[0]
            lastlines = lastline + line
            if licence.startswith("GPL") and not licence.endswith("+"):
                # "or (at your option) any later version" => GPL-x+
                if (
                    re.search(".*later.*version", lastlines)
                    or "or later" in lastlines
                    or "and any later" in lastlines
                ):
                    licence += "+"
            lastline = line[:]
    return licence
+
+
def recursive_text(node):
    """Return the text of *node* and of all its descendants, separated by
    newlines.

    Nodes without text (node.text is None, which previously crashed with an
    AttributeError) contribute an empty string.
    NOTE(review): element tail text is ignored — confirm that is intended."""
    text = ""
    if node.text and node.text.strip():
        text = node.text
    for child in node:
        text += "\n" + recursive_text(child)
    return text
+
+
class GenerateControlCopyright:
    """Generate debian/control and debian/copyright from the parsed FreeDict
    XML API data."""

    def __init__(self, root, no_desc_version):
        """root: XML root node of the FreeDict database;
        no_desc_version: omit the edition from package descriptions."""
        self.__dictionaries = {}  # dictionary name -> attribute dict
        self.root = root
        # keep/strip version number from pkg description
        self.__desc_version = not no_desc_version
        self.parse_data()

    def parse_data(self):
        """Iterate over the XML tree to collect dictionary data."""
        for child in self.root:
            if not list(child) or not child.attrib.get("name"):
                continue  # skip dictionaries without releases or non-dictionary nodes
            name = child.attrib["name"]
            entry = {}
            for key in ("headwords", "edition", "status", "maintainerName"):
                try:
                    entry[key] = child.attrib[key]
                except KeyError as e:
                    if e.args[0] != "status":  # status is the only optional attribute
                        # chain the original exception for easier debugging
                        raise KeyError(
                            "missing attribute for %s: %s" % (name, e.args[0])
                        ) from e
            self.__dictionaries[name] = entry

    def write_all(self):
        """Write both the control as well as the copyright file."""
        self.sort_dictionaries()
        self.write_control()
        self.write_copyright()

    def write_control(self):
        """Generate debian/control from debian/control.HEAD and the gathered
        dictionary data."""
        # context managers close the handles reliably (they were leaked before)
        with open("debian/control.HEAD", "r", encoding="utf-8") as f:
            head = f.read()
        tokens = [head.rstrip(), "\n\n"]

        for dictname, content in self.__dictionaries.items():
            tokens.append("\nPackage: dict-freedict-%s\n" % dictname)
            tokens.append(
                """Architecture: all
Depends: ${misc:Depends}
Suggests: dictd | dicod, dict | kdict | gnome-dictionary | goldendict
Provides: dictd-dictionary\n"""
            )
            status = ""
            if "status" in content:
                status = " (FreeDict status: %s)" % content["status"]
            longname = dictionarycode2longdescription(dictname)
            tokens.append(
                f"Description: {longname} dictionary for the dict server/client\n"
            )
            version = ", version %s" % content["edition"] if self.__desc_version else ""
            longdesc = (
                "This is the %s dictionary from the FreeDict project%s. "
                "It contains %s headwords%s. It can be either used with the dictd "
                "server and a dict client or with GoldenDict."
            ) % (longname, version, content["headwords"], status)
            # format description to 80 characters per line, one leading space
            # per Debian control-file convention
            tokens.append(
                "\n".join(
                    textwrap.wrap(
                        longdesc, width=79, initial_indent=" ", subsequent_indent=" "
                    )
                )
            )
            tokens.append("\n")
        with open("debian/control", "w", encoding="utf-8") as f:
            f.write("".join(tokens))

    def write_copyright(self):
        """Generate debian/copyright from the copyright snippets and, where
        possible, automatically detected licence information."""
        # the upstream version number starts with the year of the last import
        upstream_last_touched = int(
            subprocess.check_output(["dpkg-parsechangelog", "-S", "version"])
            .decode(sys.getdefaultencoding())
            .split(".", 1)[0]
            .strip()
        )
        cprght_snippets = "{1}{0}{2}{0}".format(os.sep, "debian", "copyright.snippets")
        with open(cprght_snippets + "HEAD", encoding="utf-8") as f:
            parts = [f.read(), "\n"]
        for dictname in self.__dictionaries:
            # a manual copyright snippet overrides automatic detection
            if os.path.exists(cprght_snippets + dictname):
                with open(cprght_snippets + dictname, encoding="utf-8") as f:
                    parts.append("\n" + f.read())
            else:
                parts.append("\nFiles: %s/*\n" % dictname)
                parts.append(
                    "Copyright: 2000-%s FreeDict contributors\n" % upstream_last_touched
                )
                parts.append("License: " + find_licence(dictname) + "\n")
        with open(cprght_snippets + "TAIL", encoding="UTF-8") as f:
            parts += ["\n\n", f.read()]

        document = "".join(parts)
        with open("debian/copyright", "w", encoding="utf-8") as f:
            f.write(document)
        if "FIXME" in document:
            print(
                'NOTE: some licences could not be extracted, search for "FIXME" in debian/copyright.'
            )

    def sort_dictionaries(self):
        """Re-create self.__dictionaries sorted by dictionary name.

        We cannot expect to find ordered data in the XML, so we sort on our
        own before generating the files."""
        d = collections.OrderedDict()
        for key in sorted(self.__dictionaries):
            d[key] = self.__dictionaries[key]
        self.__dictionaries = d
+
+
class FetchSource:
    """Fetch the sources of all dictionaries and pack them into a dated
    upstream (orig) tarball one directory above the current one."""

    def __init__(self, root):
        """root: XML root node listing all dictionaries and their releases."""
        self.date = self.gen_date()
        self.dirname = "freedict-%s.orig" % self.date
        self.root = root
        self.exclude_dictionaries = []
        # NOTE(review): this inspects sys.argv directly instead of going
        # through argparse (see parse_args); kept for backward compatibility
        if len(sys.argv) == 4:  # there's the -x option given
            if sys.argv[2] == "-x":
                self.exclude_dictionaries = sys.argv[3].split(" ")

    def gen_date(self):
        """Return the current date in the format "yyyy.mm.dd"."""
        # strftime zero-pads month and day, matching the orig naming scheme
        return datetime.datetime.now().strftime("%Y.%m.%d")

    def prepare_environment(self):
        """Create an empty download directory and change into it, removing
        any remains of an earlier, interrupted run."""
        if os.path.exists(self.dirname):
            print(
                "Removing %s; possibly left over from an interrupted run."
                % self.dirname
            )
            shutil.rmtree(self.dirname)
        os.mkdir(self.dirname)
        os.chdir(self.dirname)

    def clean_up(self):
        """Compress the original source, move it to the right destination and
        remove the download directory."""
        tarname = self.dirname.replace("-", "_") + ".tar.xz"
        os.chdir("..")
        # list form avoids shell interpretation of the file names
        if subprocess.run(["tar", "cJf", tarname, self.dirname]).returncode != 0:
            sys.exit(9)
        print("Moving tar archive upward to.", os.path.join("..", tarname))
        os.rename(tarname, ".." + os.sep + tarname)
        shutil.rmtree(self.dirname)

    def write_all(self):
        """Download and unpack the source release of every dictionary."""
        self.prepare_environment()
        imported = 0
        for dictionary in self.root:
            if not dictionary.tag.endswith("dictionary"):
                continue
            name = dictionary.attrib["name"]
            if name in self.exclude_dictionaries:
                print("Skip %s (specified via command line)" % name)
                continue
            # iterate over source releases
            for release in dictionary:
                if release.attrib.get("platform") != "src":
                    continue
                src_url = release.attrib["URL"]
                fn = src_url.split("/")[-1]
                print("Fetching %s from %s" % (name, src_url))
                try:
                    with urllib.request.urlopen(src_url) as u:
                        data = u.read()
                except urllib.error.HTTPError as h:
                    if int(h.code) == 404:
                        # include the failing URL in the error message
                        reason = "%s; url: %s" % (str(h), src_url)
                        raise urllib.error.URLError(reason) from None
                    raise h from None

                with open(fn, "wb") as f:
                    f.write(data)
                print("Extracting", fn)
                if fn.endswith(".zip"):
                    subprocess.run(["unzip", "-qq", fn], check=True)
                elif fn.endswith((".tar.bz2", ".tar.gz", ".tar.xz")):
                    subprocess.run(["tar", "xf", fn], check=True)
                else:
                    print('E: unknown format of "%s".' % fn)
                    sys.exit(1)  # was sys.exit(0): failure must not look like success
                # BUG FIX: every archive used to be extracted a second time
                # here — and zip archives were passed to tar
                os.remove(fn)
                imported += 1
                break  # do not search for further source releases, might be multiple archive formats
        print("Imported %d dictionaries." % imported)
        self.clean_up()
+
+
class Criteria:
    """Filter dictionaries by an XML attribute and a regular expression.

    A criteria string has the form ``attribute:regex`` (keep only nodes whose
    attribute matches the expression, e.g. ``sourceURL:wikdict``) or
    ``attribute!regex`` (drop nodes whose attribute matches, e.g.
    ``sourceURL!wikdict``). An empty criteria matches everything."""

    def __init__(self, criteria):
        self.__rgx = None
        self.__delim = None
        self.__criteria_attr = None
        if not criteria:
            return  # no filtering requested; matches() accepts everything
        if ":" not in criteria and "!" not in criteria:
            raise ValueError(
                "criteria needs to consist of a dictionary "
                "attribute followed by `:`  or `!` followed by a regular "
                "expression"
            )
        # ":" wins when both delimiters occur in the string
        self.__delim = ":" if ":" in criteria else "!"
        self.__criteria_attr, pattern = criteria.split(self.__delim, 1)
        self.__rgx = re.compile(pattern)

    def matches(self, dictnode):
        """Return True if the given dictionary node passes the filter."""
        if not self.__criteria_attr:
            return True  # no criteria to match on, include unconditionally
        value = dictnode.get(self.__criteria_attr)
        if not value:
            # attribute absent: only negative filters let the node through
            return self.__delim != ":"
        hit = self.__rgx.search(value) is not None
        return hit if self.__delim == ":" else not hit
+
+
def clean_up_tree(root, criteria):
    """Remove those <dictionary/> nodes from the XML tree which have no
    release or which do not match the given criteria (see Criteria); print a
    warning if the criteria never matched."""
    criteria = Criteria(criteria)
    criteria_matched = False  # track whether the criteria ever matched
    # iterate over a snapshot so removing nodes is safe
    # BUG FIX: the previous index-based loop used `< len(dictionaries) - 1`
    # and therefore never examined the last dictionary at all
    for dictionary in list(root):
        if criteria.matches(dictionary) and list(dictionary):
            criteria_matched = True
        else:
            print("%s skipped, " % dictionary.attrib["name"], end="")
            print("no releases" if not list(dictionary) else "didn't match criteria")
            root.remove(dictionary)
    if not criteria_matched:
        print("Warning: given criteria never matched")
+
+
def parse_args():
    """Validate the working directory, then parse and return the command-line
    arguments."""
    # the script may be started from within debian/; move up transparently
    if os.getcwd().endswith("debian"):
        os.chdir("..")
    # a FreeDict checkout is recognised by its xxx-yyy dictionary directories
    if not any(re.match(r"[a-z]{3}-[a-z]{3}", entry) for entry in os.listdir(os.getcwd())):
        print("You must run this script from the FreeDict packaging root.")
        sys.exit(127)
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--criteria",
        dest="criteria",
        default=None,
        help=(
            "A criteria is a XML attribute name (from the Dictionary tag),"
            " separated by a delimiter, followed by a regular expression. "
            "This can be used to filter dictionaries. If attribute name and"
            " regular expression are separated by a colon `:`, the "
            "dictionary node MUST match the given expression, if it "
            "separated by a exclamation mark `!`, these dictionaries will "
            "be dropped."
        ),
    )
    # all remaining options are simple boolean flags
    flags = (
        ("--dc", "gen_control_copyright",
         "generate debian/copyright and debian/control"),
        ("--no-desc-version", "no_desc_version",
         "omit the version number in package descriptions (only "
         "useful when using --dc)"),
        ("--orig", "fetch_orig",
         "fetch a new orig source tar ball to ../"),
        ("-u", "update_xml_api",
         "Update FreeDict XML API file and exit."),
    )
    for flag, dest, helptext in flags:
        parser.add_argument(
            flag, dest=dest, action="store_true", default=False, help=helptext
        )
    if len(sys.argv) == 1:
        parser.print_usage()
    return parser.parse_args()
+
+
def main():
    """Entry point: parse the arguments and execute the requested actions."""
    args = parse_args()
    if args.update_xml_api:
        # -u promises to "update the XML API file and exit"; previously the
        # flag was accepted but silently ignored
        get_xml_content(fetch_new=True)
        return
    # fetch a fresh API file when building a new orig tarball; previously the
    # file was read first and then downloaded a second time in that case
    xmlsrc = get_xml_content(fetch_new=args.fetch_orig)
    actions = []
    # actions can be combined
    if args.fetch_orig:
        actions.append(FetchSource)
    if args.gen_control_copyright:
        actions.append(GenerateControlCopyright)

    # usual operation
    root = ET.fromstring(xmlsrc)
    clean_up_tree(root, criteria=args.criteria)
    for action in actions:
        if action is GenerateControlCopyright:
            instance = action(root, args.no_desc_version)
        else:
            instance = action(root)
        instance.write_all()


if __name__ == "__main__":
    main()
