Source code for germanetpy.germanet
import os
from collections import defaultdict
from tqdm import trange
from germanetpy.utils import parse_xml
from germanetpy.synsetLoader import load_lexunits
from germanetpy.iliLoader import load_ili
from germanetpy.wiktionaryLoader import load_wiktionary
from germanetpy.relationLoader import load_relations
from germanetpy.frames import Frames
[docs]
class Germanet:
def __init__(self, datadir: str, add_ilirecords: bool = True, add_wiktionary: bool = True):
"""
The GermaNet object is initialized with the directory where the GermaNet data is stored. The data is loaded
when GermaNet is initialized.
:param datadir: The path to the directory where the GermaNet data is stored
:param add_ilirecords: a boolean, denotes whether the iliRecords should also be loaded into the GermaNet
object, default: True
:param add_wiktionary: a boolean, denotes whether the wiktionary files should also be loaded into the GermaNet
object, default: True
"""
self._datadir = datadir
self._add_ilirecords = add_ilirecords
self._add_wiktionary = add_wiktionary
# Dictionary: lexunit id - lexunit object
self._lexunits = {}
# Dictionary: synset id - synset object
self._synsets = {}
# Dictionary: any orthform (all variants) - lexunit id
self._orthform2lexid = defaultdict(set)
# Dictionary: main orthform - lexunit id
self._mainOrtform2lexid = defaultdict(set)
# Dictionary: lower cased orthographic form (all variants) - lexunit id
self._lowercasedform2lexid = defaultdict(set)
# Dictionary: Wordcategory - set of lexunit ids
self._wordcat2lexid = defaultdict(set)
# Dictionary: Wordclass - set of lexunit ids
self._wordclass2lexid = defaultdict(set)
# Set of synsets (that are compounds)
self._compounds = set()
# Dictionary: Frame - Lexunit objects
self._frames2lexunits = defaultdict(set)
# List: wiktionary entries
self._wiktionary_entries = []
# List: ili Records
self._ili_records = []
# the Frames object, storing all frame information from GermaNet
self._frames = Frames(self._frames2lexunits)
# load data when GermaNet is initialized
self._load_data()
[docs]
def get_synsets_by_wordcategory(self, category) -> list:
"""
Returns a list of synsets that belong to the specified word category
:type category: WordCategory
:param category: The word category of interest
:return: A list of Synsets that belong to the specified word category
"""
lexunit_ids = self.wordcat2lexid[category.name]
return [self.lexunits[id].synset for id in lexunit_ids]
[docs]
def get_synsets_by_wordclass(self, wordclass) -> list:
"""
Returns a list of synsets that belong to the specified word class
:type wordclass: WordClass
:param wordclass: The word category of interest
:return: A list of Synsets that belong to the specified word class
"""
lexunit_ids = self.wordclass2lexid[wordclass.name]
return [self.lexunits[id].synset for id in lexunit_ids]
[docs]
def get_synset_by_id(self, id: str):
"""
Returns a Synset by a specified identifier (if that exists, otherwise raises an Error)
:rtype: Synset
:param id: a Synset identifier
:return: The matching Synset object
"""
assert id in self.synsets, "the given Synset id is not in GermaNet"
return self.synsets[id]
[docs]
def get_lexunit_by_id(self, id: str):
"""
Returns a lexical unit by a specified identifier (if that exists, otherwise raises an Error)
:rtype: Lexunit
:param id: a Lexunit identifier
:return: The matching Lexunit object
"""
assert id in self.lexunits, "the given lexical unit id is not in GermaNet"
return self.lexunits[id]
[docs]
def get_lexunits_by_wordclass(self, wordclass) -> list:
"""
Returns a list of lexical units that belong to the specified word class
:type wordclass: WordClass
:param wordclass: The word category of interest
:return: A list of lexical units that belong to the specified word class
"""
lexunit_ids = self.wordclass2lexid[wordclass.name]
return [self.lexunits[id] for id in lexunit_ids]
[docs]
def get_lexunits_by_wordcategory(self, category) -> list:
"""
Returns a list of lexical units that belong to the specified word category
:type category: WordCategory
:param category: The word category of interest
:return: A list of lexical units that belong to the specified word category
"""
lexunit_ids = self.wordcat2lexid[category.name]
return [self.lexunits[id] for id in lexunit_ids]
[docs]
def get_synsets_by_frame(self, frame: str) -> list:
"""
Returns a list of Synsets that match a specified frame
:param frame: a frame that describes the argument structure of a verb (e.g. 'NN.AN' specifies that a verb can take a subject and accusative object as arguments.)
:return: a list of Synsets that match the given frame. If the frame is not valid an Assertion Error will be raised
"""
assert frame in self.frames2lexunits, "the specified frame is not in GermaNet"
synset_ids = self.frames2lexunits[frame]
return [self.synsets[id] for id in synset_ids]
def _load_data(self):
"""
Protected method to load the GermaNet data. The Data has to be stored in self.datadir.
"""
files = os.listdir(self.datadir)
wikifiles = [f for f in files if "wiktionary" in f and "xml" in f]
lexentries = [f for f in files if
f.startswith("nomen") or f.startswith("verben") or f.startswith("adj") and "xml" in f]
ilifiles = [f for f in files if "interLingua" in f and "xml" in f]
pbar = trange(100, desc='Load GermaNet data...', leave=True)
for i in range(len(lexentries)):
f = lexentries[i]
tree = parse_xml(self.datadir, f)
load_lexunits(germanet=self, tree=tree)
pbar.update(100 / len(lexentries))
tree = parse_xml(self.datadir, "gn_relations.xml")
load_relations(germanet=self, tree=tree)
pbar.close()
pbar = trange(100, desc='Load Wiktionary data...', leave=True)
if self.add_wiktionary:
for i in range(len(wikifiles)):
tree = parse_xml(self.datadir, wikifiles[i])
load_wiktionary(germanet=self, tree=tree)
pbar.update(100 / len(wikifiles))
pbar.close()
pbar = trange(100, desc='Load Ili records...', leave=True)
if self.add_ilirecords:
for i in range(len(ilifiles)):
tree = parse_xml(self.datadir, ilifiles[i])
load_ili(germanet=self, tree=tree)
pbar.update(100 / len(ilifiles))
pbar.close()
@property
def lexunits(self):
return self._lexunits
@property
def synsets(self):
return self._synsets
@property
def orthform2lexid(self):
return self._orthform2lexid
@property
def mainOrtform2lexid(self):
return self._mainOrtform2lexid
@property
def lowercasedform2lexid(self):
return self._lowercasedform2lexid
@property
def wordcat2lexid(self):
return self._wordcat2lexid
@property
def wordclass2lexid(self):
return self._wordclass2lexid
@property
def compounds(self):
return self._compounds
@property
def frames2lexunits(self):
return self._frames2lexunits
@property
def wiktionary_entries(self):
return self._wiktionary_entries
@property
def ili_records(self):
return self._ili_records
@property
def frames(self):
return self._frames
@property
def root(self):
root = self.get_synset_by_id('s51001')
return root
@property
def datadir(self):
return self._datadir
@property
def add_ilirecords(self):
return self._add_ilirecords
@property
def add_wiktionary(self):
return self._add_wiktionary