Description: Add debian maintainer assistant
 This patch adds a helper script for maintainers of Debian FreeDict packages. See
 the documentation of that script to find out more.
Author: Sebastian Humenda <shumenda@gmx.de>
Last-Update: 2018-10-28

Index: freedict-tools/fetchdictdata.py
===================================================================
--- /dev/null
+++ freedict-tools/fetchdictdata.py
@@ -0,0 +1,442 @@
+#!/usr/bin/env python3
+"""This script is designed to help with the Debian FreeDict packaging. It can
+fetch all available databases from
+https://www.freedict.org/freedict-database.xml
+and generate an orig source tar ball or generate debian/control and
+debian/copyright.
+
+Please use `--help` to find out more details.
+
+IMPORTANT: you must run this script from the root of the freedict source,
+otherwise the operations will fail.
+
+Generating debian/copyright and debian/control
+==============================================
+
+Since FreeDict packages contain a load of very similar dictionaries, the process
+is made more convenient by generating the mentioned files. d/control is made up
+of a d/control.HEAD file (make sure this one exists), usually with the normal
+source stanza and followed by auto-generated stanzas, derived from information
+of the debian/freedict-database.xml. The XML API file is fetched automatically,
+if required.
+
+The d/copyright file is generated using licensecheck and a bit of guessing
+logic. The file d/copyright.snippets/TAIL is appended and usually contains the
+license definition stanzas. If the copyright file contains "FIXME", the
+maintainer may also like to create a plain-text file
+d/copyright.snippets/xxx-yyy. As soon as such a file is found, no license
+information will be queried for the specified dictionary.
+"""
+
+#pylint: disable=multiple-imports,too-few-public-methods
+import argparse
+import collections, re
+import os, shutil, subprocess, sys
+import datetime
+import xml.etree.ElementTree as ET
+import urllib.request
+
+XML_URL = 'http://www.freedict.org/freedict-database.xml' # FreeDict API file, downloaded by get_xml_content()
+LANGCODE_TABLE = "debian/iso-639-3_20130123.tab" # NOTE(review): not referenced in this script; the lookup scans debian/ for iso-* files
+
+def dictionarycode2longdescription(string):
+    """Translate an ISO 639 pair such as ``lat-deu`` into ``Latin-German``.
+
+    Names are read from the tab-separated ``debian/iso-*`` table (header line
+    skipped; columns 1 and 2 hold codes, column 7 the name). Raises
+    FileNotFoundError without a usable table, KeyError for an unknown code."""
+    langtbl = {}
+    for item in (i for i in os.listdir('debian') if i.startswith('iso-')):
+        with open(os.path.join('debian', item), encoding='utf-8') as table:
+            rows = [row.split('\t') for row in table.read().split('\n')[1:]]
+        # skip short rows, e.g. the empty one after a trailing newline
+        langtbl = {c: r[6] for r in rows if len(r) > 6 for c in r[:2] if c}
+    if not langtbl:
+        raise FileNotFoundError('no usable debian/iso-* language table found')
+    codes = string.split('-')
+    return langtbl[codes[0]] + '-' + langtbl[codes[1]]
+
+def get_xml_content(fetch_new=False):
+    """Return the FreeDict API XML from debian/freedict-database.xml; download
+    it from XML_URL and save it there first if `fetch_new` or no copy exists."""
+    if fetch_new or not os.path.exists('debian/freedict-database.xml'):
+        data = urllib.request.urlopen(XML_URL).read().decode('utf-8')
+        with open('debian/freedict-database.xml', 'w', encoding='utf-8') as f:
+            f.write(data)
+    with open('debian/freedict-database.xml', encoding='utf-8') as f:
+        return f.read()
+
+def find_license(dict):
+    """Return a license short name for the dictionary directory `dict`.
+    licensecheck is run on <dict>/<dict>.tei, then on <dict>/COPYING.tei; if
+    both fail, the TEI text before <body is searched for GPL or CC BY-SA
+    wording. 'FIXME' is returned when no license could be recognised."""
+    def run_licensecheck_on(file_path):
+        """Map the `licensecheck -m` output for file_path to a license name or None."""
+        proc = subprocess.Popen(['licensecheck', '-m', file_path], stdout=subprocess.PIPE)
+        try:
+            text = proc.communicate()[0].decode('utf-8').split('\t')[1]
+        except IndexError:
+            return None
+        licences = {r'gpl \(v(\d).*': 'GPL-%s',
+                r'.*cc by \(v([^)]+)\).*': 'CC-by-%s',
+                r'.*cc by-sa\s?\(v([^)]+)\)?.*': 'CC-by-sa-%s',
+            }
+        for licence, debianised in licences.items():
+            sre = re.search(licence, text.lower())
+            if sre:
+                if sre[0] is None:
+                    continue # NOTE(review): sre[0] is the whole match, never None here; sre[1] may have been meant
+                return debianised % sre.groups() + \
+                        ('+' if 'later' in text.lower() else '')
+        return None
+
+
+    tei_fn = '{0}{1}{0}.{2}'.format(dict, os.sep, 'tei') # <dict>/<dict>.tei
+    license = run_licensecheck_on(tei_fn)
+    if not license:
+        license = run_licensecheck_on(dict + os.sep + 'COPYING.tei')
+
+        if not license:
+            license = 'FIXME' # backup solution
+
+    if license != 'FIXME': # found a license, return
+        return license
+
+    # licensecheck found nothing: guess the license from the TEI file itself
+    with open(tei_fn, 'r', encoding='utf-8') as f:
+        in_header = True
+        line = 'start'
+        lastline = ''
+        while in_header and line != '': # stop at <body or at end of file
+            line = f.readline().lower()
+            if '<body' in line:
+                line = line[:line.find('<body')] # only look at the text before <body
+                in_header = False
+            if 'gpl' in line.lower() or 'gnu general public lic' in line.lower():
+                license = 'GPL'
+                # try to extract version number
+                res = re.search(r"(?:version|ver.|license|licence)\s+(\d+)", lastline+line.lower())
+                if res:
+                    license += '-%s' % res.groups()[0]
+            elif 'attribution-sharealike' in line:
+                license = 'CC-BY-SA'
+                version = re.search(r'sharealike \(?v?(\d+\.?\d*)', line)
+                if version:
+                    license += '-%s' % version.groups()[0]
+            lastlines = lastline + line
+            if license.startswith('GPL') and not license.endswith('+'):
+                if re.search('.*later.*version', lastlines) or \
+                        'or later' in lastlines or 'and any later' in lastlines:
+                    license += '+'
+            lastline = line[:] # keep the previous line: wording may span two lines
+    return license
+
+def recursive_text(node):
+    """Return the text of `node` and all its descendants, one element per line.
+
+    Elements without text (`text` is None or only whitespace) contribute an
+    empty string instead of raising AttributeError."""
+    own = node.text if node.text and node.text.strip() else ''
+    return own + ''.join('\n' + recursive_text(child) for child in node)
+
+class GenerateControlCopyright():
+    """Generate debian/control and debian/copyright from the FreeDict XML API.
+
+    `root` is the root element of the parsed XML; each child element that has
+    sub-elements (releases) yields one package and one copyright stanza.
+    `no_desc_version` omits the edition from the package descriptions."""
+    def __init__(self, root, no_desc_version):
+        self.__dictionaries = {}
+        self.root = root
+        # keep/strip version number from pkg description
+        self.__desc_version = not no_desc_version
+        self.parse_data()
+
+    def parse_data(self):
+        """Collect name, headwords, edition, status and maintainerName of each
+        dictionary; raise KeyError if a mandatory attribute is missing."""
+        for child in self.root:
+            # len() counts sub-elements; Element.getchildren() is gone in 3.9
+            if not len(child):
+                continue # skip dictionaries without releases
+            name = child.attrib['name']
+            self.__dictionaries[name] = {} # initialize new dictionary
+            for key in ('headwords', 'edition', 'status', 'maintainerName'):
+                if key in child.attrib:
+                    self.__dictionaries[name][key] = child.attrib[key]
+                elif key != 'status': # status is optional
+                    raise KeyError('missing attribute for %s: %s' % (name, key))
+
+    def write_all(self):
+        """Write both control as well as the copyright file."""
+        self.sort_dictionaries()
+        self.write_control()
+        self.write_copyright()
+
+    def write_control(self):
+        """Generate debian/control from debian/control.HEAD and the gathered
+        dictionary data (one binary package stanza per dictionary)."""
+        with open('debian/control.HEAD', 'r', encoding='utf-8') as f:
+            string = [f.read().rstrip(), '\n\n']
+
+        for name, content in self.__dictionaries.items():
+            # resolve the language names once; each lookup re-reads the table
+            longname = dictionarycode2longdescription(name)
+            string.append('Package: dict-freedict-%s\n' % name)
+            string.append("""Architecture: all
+Depends: ${misc:Depends}
+Suggests: dictd | dicod, dict | kdict | gnome-dictionary | goldendict
+Provides: dictd-dictionary\n""")
+            status = (' (FreeDict status: %s)' % content['status']
+                    if 'status' in content else '')
+            string.append('Description: %s dictionary for the dict server/client'\
+                    % longname)
+            version = (', version %s' % content['edition'] if self.__desc_version
+                    else '')
+            longstr = '''
+ This is the %s dictionary from the FreeDict project%s. It contains %s headwords%s. It can be either used with the dictd server and a dict client or with GoldenDict.'''\
+                    % (longname, version, content['headwords'], status)
+            # format description to 80 characters per line
+            tmp = ' '
+            for piece in longstr.split(' '):
+                if(len(tmp+piece) <= 80):
+                    tmp += piece + ' '
+                else:
+                    string.append(tmp[:-1]+'\n')
+                    tmp = ' '+piece+ ' '
+            string.append(tmp+'\n\n')
+        with open('debian/control', 'w', encoding='utf-8') as f:
+            f.write(''.join(string))
+
+    def write_copyright(self):
+        """Generate debian/copyright from debian/copyright.snippets/HEAD, one
+        stanza per dictionary (a hand-written snippet if present, otherwise
+        the result of find_license()) and debian/copyright.snippets/TAIL."""
+        upstream_last_touched = int(subprocess.check_output([
+                'dpkg-parsechangelog', '-S', 'version']
+            ).decode(sys.getdefaultencoding()).split('.', 1)[0].strip())
+        cprght_snippets = '{1}{0}{2}{0}'.format(os.sep, 'debian',
+                'copyright.snippets')
+        with open(cprght_snippets + 'HEAD', encoding='utf-8') as f:
+            string = [f.read(), '\n']
+        for name in self.__dictionaries:
+            # is there a manual copyright snippet?
+            if os.path.exists(cprght_snippets + name):
+                with open(cprght_snippets + name, encoding='utf-8') as f:
+                    string.append('\n' + f.read())
+            else:
+                string.append('\nFiles: %s/*\n' % name)
+                string.append('Copyright: 2000-%s FreeDict contributors\n' \
+                        % upstream_last_touched)
+                string.append('License: ' + find_license(name) + '\n')
+        with open(cprght_snippets + 'TAIL', encoding='UTF-8') as f:
+            string += ['\n\n', f.read()]
+
+        document = ''.join(string)
+        with open('debian/copyright', 'w', encoding='utf-8') as f:
+            f.write(document)
+        if 'FIXME' in document:
+            print('NOTE: some licenses could not be extracted, search for "FIXME" in debian/copyright.')
+
+    def sort_dictionaries(self):
+        """Replace self.__dictionaries with a collections.OrderedDict sorted by
+        dictionary name; the order of the XML input cannot be relied upon."""
+        self.__dictionaries = collections.OrderedDict(
+                sorted(self.__dictionaries.items()))
+
+
+class fetch_source():
+    """Fetch the sources of all dictionaries and the tools directory.
+
+    write_all() downloads and unpacks the first `src` release of every child
+    of `root` into freedict-<date>.orig, packs that directory as
+    freedict_<date>.orig.tar.xz and moves the archive one directory up."""
+    def __init__(self, root):
+        self.date = self.gen_date()
+        self.dirname = 'freedict-%s.orig' % self.date
+        self.root = root
+        self.exclude_dictionaries = []
+        if len(sys.argv) == 4: # there's the -x option given
+            if sys.argv[2] == '-x':
+                self.exclude_dictionaries = sys.argv[3].split(' ')
+
+    def gen_date(self):
+        """Return date in format "yyyy.mm.dd"."""
+        return datetime.datetime.now().strftime('%Y.%m.%d')
+
+    def prepare_environment(self):
+        """
+        Create an empty download directory, replacing a stale one from an
+        earlier run, and change into it.
+        """
+        if os.path.exists(self.dirname):
+            print("Removing %s; possibly left over from an interrupted run." %
+                    self.dirname)
+            shutil.rmtree(self.dirname)
+        os.mkdir(self.dirname)
+        os.chdir( self.dirname )
+
+    def clean_up(self):
+        """
+        Compress the original source, move it to the right destination and
+        remove download directory. Exits with status 9 if tar fails."""
+        tarname = self.dirname.replace('-','_') + '.tar.xz'
+        os.chdir('..')
+        # argument list instead of a shell string: no quoting pitfalls
+        if subprocess.call(['tar', 'cJf', tarname, self.dirname]):
+            sys.exit(9)
+        print('Moving tar archive upward to.', os.path.join('..', tarname))
+        os.rename(tarname, '..'+os.sep+tarname)
+        shutil.rmtree(self.dirname)
+
+    def write_all(self):
+        """Download and unpack one source release per dictionary, then build
+        the orig tarball. Exits with status 1 on an unknown archive format."""
+        self.prepare_environment()
+        imported = 0
+        for dictionary in self.root:
+            if dictionary.attrib['name'] in self.exclude_dictionaries:
+                print("Skip %s (specified via commmand line)" \
+                            % dictionary.attrib['name'])
+                continue
+            # iterate over source releases
+            for release in dictionary:
+                if release.attrib.get('platform') != 'src':
+                    continue
+                src_url = release.attrib['URL']
+                fn = src_url.split('/')[-1]
+                print("Fetching %s from %s" % (dictionary.attrib['name'], src_url))
+                try:
+                    with urllib.request.urlopen(src_url) as u:
+                        data = u.read()
+                except urllib.error.HTTPError as h:
+                    if int(h.code) == 404:
+                        reason = '%s; url: %s' % (str(h), src_url)
+                        raise urllib.error.URLError(reason) from None
+                    else:
+                        raise h from None
+
+                with open(fn, 'wb') as f:
+                    f.write(data)
+                print("Extracting",fn)
+                if fn.endswith('.zip'):
+                    subprocess.call(['unzip', '-qq', fn])
+                elif fn.endswith(('.tar.bz2', '.tar.gz', '.tar.xz')):
+                    subprocess.call(['tar', 'xf', fn])
+                else:
+                    print('E: unknown format of "%s".' % fn)
+                    sys.exit(1) # a failure must not report success
+                # unpacked exactly once above, the download can go now
+                os.remove(fn)
+                imported += 1
+                break # do not search for further source releases, might be multiple archive formats
+        print("Imported %d dictionaries." % imported)
+        self.clean_up()
+
+class Criteria:
+    """A criteria matcher: Use the string
+    sourceURL:wikdict
+    to filter for all dictionaries with wikdict in their URL and use
+    sourceURL!wikdict
+    to filter for all dictionaries not derived from wikdict. The first part is
+    an XML attribute name, the second bit is a regular expression; the first
+    `:` or `!` in the string is the delimiter, so the expression may contain
+    either. This class copes with empty criteria: all matches will be true."""
+    def __init__(self, criteria):
+        self.__rgx = None
+        self.__delim = None
+        self.__attr = None
+        if not criteria:
+            return
+        delim = re.search('[:!]', criteria)
+        if not delim:
+            raise ValueError("criteria needs to consist of a dictionary "
+                "attribute followed by `:`  or `!` followed by a regular "
+                "expression")
+        self.__delim = delim.group()
+        self.__attr, rgx = criteria.split(self.__delim, 1)
+        self.__rgx = re.compile(rgx)
+
+    def matches(self, dictnode):
+        """Tell whether `dictnode` (anything with a dict-like get()) passes; a
+        missing or empty attribute only passes a negative (`!`) criteria."""
+        if not self.__attr:
+            return True # include this one, no attribute specified
+        attr = dictnode.get(self.__attr)
+        return (self.__delim == ':') == bool(attr and self.__rgx.search(attr))
+
+def clean_up_tree(root, criteria):
+    """Iterate over XML tree and delete those <dictionary/>-nodes which have no
+    release or which don't match the given criteria (see Criteria)."""
+    criteria_matched = False # track whether a criteria was matched
+    matcher = Criteria(criteria)
+    for dictionary in list(root): # copy: root is modified while looping
+        if matcher.matches(dictionary):
+            criteria_matched = True
+            if len(dictionary): # has releases (getchildren() is gone in 3.9)
+                continue
+        print('%s removed, ' % dictionary.attrib['name'], end="")
+        print(('no releases' if not len(dictionary)
+                else "didn't match criteria"))
+        root.remove(dictionary)
+    if not criteria_matched:
+        print("Warning: given criteria never matched")
+
+def parse_args():
+    """Ensure we run from the packaging root (chdir out of debian/, exit 127
+    if no xxx-yyy entry exists) and return the parsed command line."""
+    if os.getcwd().endswith('debian'):
+        os.chdir('..')
+    entries = os.listdir(os.getcwd())
+    if not any(re.match(r'[a-z]{3}-[a-z]{3}', entry) for entry in entries):
+        print("You must run this script from the FreeDict packaging root.")
+        sys.exit(127)
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--criteria',
+            help=('A criteria is a XML attribute name (from the Dictionary tag),'
+                ' separated by a delimiter, followed by a regular expression. '
+                'This can be used to filter dictionaries. If attribute name and'
+                ' regular expression are separated by a colon `:`, the '
+                'dictionary node MUST match the given expression, if it '
+                'separated by a exclamation mark `!`, these dictionaries will '
+                'be dropped.'))
+    flags = (
+        ('--dc', 'gen_control_copyright',
+            'generate debian/copyright and debian/control'),
+        ('--no-desc-version', 'no_desc_version', 'omit the version number in '
+            'package descriptions (only useful when using --dc)'),
+        ('--orig', 'fetch_orig', 'fetch a new orig source tar ball to ../'),
+        ('-u', 'update_xml_api', 'Update FreeDict XML API file and exit.'))
+    for option, dest, text in flags:
+        parser.add_argument(option, dest=dest, action='store_true', help=text)
+    if len(sys.argv) == 1:
+        parser.print_usage()
+    return parser.parse_args()
+
+def main():
+    """Run the actions selected on the command line; `-u` only refreshes
+    debian/freedict-database.xml, `--orig` and `--dc` may be combined."""
+    args = parse_args() # also verifies the working directory
+    if args.update_xml_api: # documented as "update ... and exit"
+        get_xml_content(fetch_new=True)
+        return
+    # --orig always works on the latest FreeDict API file (fetched once)
+    xmlsrc = get_xml_content(fetch_new=args.fetch_orig)
+    actions = []
+    if args.fetch_orig:
+        actions.append(fetch_source)
+    if args.gen_control_copyright:
+        actions.append(GenerateControlCopyright)
+
+    root = ET.fromstring(xmlsrc)
+    clean_up_tree(root, criteria=args.criteria)
+    for obj in actions:
+        if obj == GenerateControlCopyright:
+            inst = obj(root, args.no_desc_version)
+        else:
+            inst = obj(root)
+        inst.write_all()
+
+if __name__ == '__main__': # run only when executed as a script, not on import
+    main()
