# Source code for germanetpy.germanet

import os
from collections import defaultdict
from tqdm import trange
from germanetpy.utils import parse_xml
from germanetpy.synsetLoader import load_lexunits
from germanetpy.iliLoader import load_ili
from germanetpy.wiktionaryLoader import load_wiktionary
from germanetpy.relationLoader import load_relations
from germanetpy.frames import Frames



# [docs]
class Germanet:
    """
    Entry point to the GermaNet data: loads the data files from a directory and offers lookup methods.

    Lexical units and synsets can be retrieved by id, orthographic form, word category, word class or frame.
    """

    def __init__(self, datadir: str, add_ilirecords: bool = True, add_wiktionary: bool = True):
        """
        The GermaNet object is initialized with the directory where the GermaNet data is stored. The data is loaded
        when GermaNet is initialized.

        :param datadir: The path to the directory where the GermaNet data is stored
        :param add_ilirecords: a boolean, denotes whether the iliRecords should also be loaded into the GermaNet
        object, default: True
        :param add_wiktionary: a boolean, denotes whether the wiktionary files should also be loaded into the GermaNet
        object, default: True
        """
        self._datadir = datadir
        self._add_ilirecords = add_ilirecords
        self._add_wiktionary = add_wiktionary

        # Dictionary: lexunit id - lexunit object
        self._lexunits = {}

        # Dictionary: synset id - synset object
        self._synsets = {}

        # Dictionary: any orthform (all variants) - lexunit id
        self._orthform2lexid = defaultdict(set)

        # Dictionary: main orthform - lexunit id
        self._mainOrtform2lexid = defaultdict(set)

        # Dictionary: lower cased orthographic form (all variants) - lexunit id
        self._lowercasedform2lexid = defaultdict(set)

        # Dictionary: Wordcategory - set of lexunit ids
        self._wordcat2lexid = defaultdict(set)

        # Dictionary: Wordclass - set of lexunit ids
        self._wordclass2lexid = defaultdict(set)

        # Set of synsets (that are compounds)
        self._compounds = set()

        # Dictionary: Frame - Lexunit objects
        self._frames2lexunits = defaultdict(set)

        # List: wiktionary entries
        self._wiktionary_entries = []

        # List: ili Records
        self._ili_records = []

        # the Frames object, storing all frame information from GermaNet
        self._frames = Frames(self._frames2lexunits)

        # load data when GermaNet is initialized
        self._load_data()

    @staticmethod
    def _lookup_ids(index, key):
        """
        Return the ids stored under ``key`` in ``index`` without modifying ``index``.

        The indices are ``defaultdict`` objects: subscripting them with an unknown key would insert a new empty
        entry on every failed lookup, so ``get`` is used instead.

        :param index: one of the lookup dictionaries of this object
        :param key: the key to look up
        :return: the stored collection of ids, or an empty tuple if the key is unknown
        """
        return index.get(key, ())

    def get_synsets_by_orthform(self, form: str, ignorecase: bool = False) -> list:
        """
        This method returns a list of synsets that match the given input search string

        :param form: a word that can be looked up in the GermaNet
        :param ignorecase: whether the case of the word should be ignored (default = False)
        :return: a list of synsets (empty if the word is unknown)
        """
        return [lexunit.synset for lexunit in self.get_lexunits_by_orthform(form, ignorecase)]

    def get_synsets_by_wordcategory(self, category) -> list:
        """
        Returns a list of synsets that belong to the specified word category

        :type category: WordCategory
        :param category: The word category of interest
        :return: A list of Synsets that belong to the specified word category
        """
        return [lexunit.synset for lexunit in self.get_lexunits_by_wordcategory(category)]

    def get_synsets_by_wordclass(self, wordclass) -> list:
        """
        Returns a list of synsets that belong to the specified word class

        :type wordclass: WordClass
        :param wordclass: The word class of interest
        :return: A list of Synsets that belong to the specified word class
        """
        return [lexunit.synset for lexunit in self.get_lexunits_by_wordclass(wordclass)]

    def get_synset_by_id(self, id: str):
        """
        Returns a Synset by a specified identifier (if that exists, otherwise raises an Error)

        :rtype: Synset
        :param id: a Synset identifier
        :return: The matching Synset object
        :raises AssertionError: if the identifier is not known
        """
        if id not in self.synsets:
            # raised explicitly instead of via ``assert`` so that the check survives ``python -O``
            raise AssertionError("the given Synset id is not in GermaNet")
        return self.synsets[id]

    def get_lexunit_by_id(self, id: str):
        """
        Returns a lexical unit by a specified identifier (if that exists, otherwise raises an Error)

        :rtype: Lexunit
        :param id: a Lexunit identifier
        :return: The matching Lexunit object
        :raises AssertionError: if the identifier is not known
        """
        if id not in self.lexunits:
            # raised explicitly instead of via ``assert`` so that the check survives ``python -O``
            raise AssertionError("the given lexical unit id is not in GermaNet")
        return self.lexunits[id]

    def get_lexunits_by_orthform(self, form: str, ignorecase: bool = False) -> list:
        """
        This method returns a list of lexical units that match the given input search string

        :param form: a word that can be looked up in the GermaNet
        :param ignorecase: whether the case of the word should be ignored (default = False)
        :return: a list of lexical units that match the given input query (empty if the word is unknown)
        """
        if ignorecase:
            lexunit_ids = self._lookup_ids(self.lowercasedform2lexid, form.lower())
        else:
            lexunit_ids = self._lookup_ids(self.orthform2lexid, form)
        return [self.lexunits[lex_id] for lex_id in lexunit_ids]

    def get_lexunits_by_wordclass(self, wordclass) -> list:
        """
        Returns a list of lexical units that belong to the specified word class

        :type wordclass: WordClass
        :param wordclass: The word class of interest
        :return: A list of lexical units that belong to the specified word class
        """
        lexunit_ids = self._lookup_ids(self.wordclass2lexid, wordclass.name)
        return [self.lexunits[lex_id] for lex_id in lexunit_ids]

    def get_lexunits_by_wordcategory(self, category) -> list:
        """
        Returns a list of lexical units that belong to the specified word category

        :type category: WordCategory
        :param category: The word category of interest
        :return: A list of lexical units that belong to the specified word category
        """
        lexunit_ids = self._lookup_ids(self.wordcat2lexid, category.name)
        return [self.lexunits[lex_id] for lex_id in lexunit_ids]

    def get_synsets_by_frame(self, frame: str) -> list:
        """
        Returns a list of Synsets that match a specified frame

        :param frame: a frame that describes the argument structure of a verb (e.g. 'NN.AN' specifies that a verb
        can take a subject and accusative object as arguments.)
        :return: a list of Synsets that match the given frame
        :raises AssertionError: if the frame is not in GermaNet
        """
        if frame not in self.frames2lexunits:
            # raised explicitly instead of via ``assert`` so that the check survives ``python -O``
            raise AssertionError("the specified frame is not in GermaNet")
        # NOTE(review): the entries stored per frame are used as keys of ``self.synsets`` here, while the
        # attribute comment in __init__ describes them as Lexunit objects - confirm against the loader.
        synset_ids = self.frames2lexunits[frame]
        return [self.synsets[synset_id] for synset_id in synset_ids]

    @staticmethod
    def _is_lexentry_file(filename: str) -> bool:
        """
        Tell whether ``filename`` is one of the lexical unit files (nomen / verben / adj).

        The "xml" check applies to all three prefixes; chaining the prefix tests with ``or`` before a trailing
        ``and`` would bind the check to the last prefix only.

        :param filename: a file name from the data directory
        :return: True if the file should be loaded as a lexical unit file
        """
        return filename.startswith(("nomen", "verben", "adj")) and "xml" in filename

    def _load_files(self, filenames, loader, pbar):
        """
        Parse every file in ``filenames`` from the data directory and hand the parsed tree to ``loader``.

        :param filenames: names of the files (relative to self.datadir) to load
        :param loader: a callable accepting the keyword arguments ``germanet`` and ``tree``
        :param pbar: a progress bar with a total of 100 that is advanced by an equal share per file
        """
        if not filenames:
            return
        step = 100 / len(filenames)
        for filename in filenames:
            tree = parse_xml(self.datadir, filename)
            loader(germanet=self, tree=tree)
            pbar.update(step)

    def _load_data(self):
        """
        Protected method to load the GermaNet data. The Data has to be stored in self.datadir.

        The lexical unit files are loaded first, followed by ``gn_relations.xml``. The wiktionary files and the
        ili records are only loaded if the corresponding flag was set when the object was created.
        """
        files = os.listdir(self.datadir)
        lexentries = [f for f in files if self._is_lexentry_file(f)]
        wikifiles = [f for f in files if "wiktionary" in f and "xml" in f]
        ilifiles = [f for f in files if "interLingua" in f and "xml" in f]

        # every progress bar is closed in a ``finally`` so that it is released even if parsing fails
        pbar = trange(100, desc='Load GermaNet data...', leave=True)
        try:
            self._load_files(lexentries, load_lexunits, pbar)
            tree = parse_xml(self.datadir, "gn_relations.xml")
            load_relations(germanet=self, tree=tree)
        finally:
            pbar.close()

        # the two bars below are created even when the optional data is skipped
        pbar = trange(100, desc='Load Wiktionary data...', leave=True)
        try:
            if self.add_wiktionary:
                self._load_files(wikifiles, load_wiktionary, pbar)
        finally:
            pbar.close()

        pbar = trange(100, desc='Load Ili records...', leave=True)
        try:
            if self.add_ilirecords:
                self._load_files(ilifiles, load_ili, pbar)
        finally:
            pbar.close()

    @property
    def lexunits(self):
        """Dictionary: lexunit id - lexunit object."""
        return self._lexunits

    @property
    def synsets(self):
        """Dictionary: synset id - synset object."""
        return self._synsets

    @property
    def orthform2lexid(self):
        """Dictionary: any orthform (all variants) - lexunit id."""
        return self._orthform2lexid

    @property
    def mainOrtform2lexid(self):
        """Dictionary: main orthform - lexunit id."""
        return self._mainOrtform2lexid

    @property
    def lowercasedform2lexid(self):
        """Dictionary: lower cased orthographic form (all variants) - lexunit id."""
        return self._lowercasedform2lexid

    @property
    def wordcat2lexid(self):
        """Dictionary: Wordcategory - set of lexunit ids."""
        return self._wordcat2lexid

    @property
    def wordclass2lexid(self):
        """Dictionary: Wordclass - set of lexunit ids."""
        return self._wordclass2lexid

    @property
    def compounds(self):
        """Set of synsets (that are compounds)."""
        return self._compounds

    @property
    def frames2lexunits(self):
        """Dictionary: Frame - Lexunit objects."""
        return self._frames2lexunits

    @property
    def wiktionary_entries(self):
        """List of wiktionary entries."""
        return self._wiktionary_entries

    @property
    def ili_records(self):
        """List of ili records."""
        return self._ili_records

    @property
    def frames(self):
        """The Frames object, storing all frame information from GermaNet."""
        return self._frames

    @property
    def root(self):
        """The Synset with the id 's51001' (raises an AssertionError if it has not been loaded)."""
        return self.get_synset_by_id('s51001')

    @property
    def datadir(self):
        """The path to the directory where the GermaNet data is stored."""
        return self._datadir

    @property
    def add_ilirecords(self):
        """Whether the ili records are loaded into this object."""
        return self._add_ilirecords

    @property
    def add_wiktionary(self):
        """Whether the wiktionary files are loaded into this object."""
        return self._add_wiktionary