# Source code for germanetpy.filterconfig
import re
import itertools
from Levenshtein import distance
from germanetpy.synset import WordCategory, WordClass
from germanetpy.lexunit import OrthFormVariant
[docs]
class Filterconfig:
    """
    Configuration object that filters GermaNet's lexical units and synsets down to the ones matching a search
    string and a selection of word categories, word classes and orthographic form variants.

    Three retrieval strategies exist, selected by the constructor arguments: regular expression matching
    (``regex=True``), Levenshtein distance matching (``levenshtein_distance > 0``) and exact orthographic form
    lookup (the default). A regular expression takes precedence over a Levenshtein distance.
    """

    def __init__(self, search_string: str, ignore_case: bool = False, regex: bool = False,
                 levenshtein_distance: int = 0):
        """
        Create a filter configuration. By default all existing word categories, all existing word classes and
        all existing orthform variants are selected; narrow them down through the corresponding properties.

        :param search_string: a String, either a query word the user is looking for or a regular expression
        :param ignore_case: a boolean, specifying whether the case of the query should be ignored or not
        :param regex: a boolean, specifying whether a regular expression is used. If a regular expression is
            specified, a given levenshtein distance will not be taken into consideration.
        :param levenshtein_distance: specify a levenshtein distance to retrieve all words that have at most this
            levenshtein distance to the given query word. Cannot be used together with a regular expression.
        """
        self._search_string = search_string
        self._word_categories = list(WordCategory)
        self._word_classes = list(WordClass)
        self._orth_variants = list(OrthFormVariant)
        self._ignore_case = ignore_case
        self._regex = regex
        self._levenshtein_distance = levenshtein_distance

    def filter_lexunits(self, germanet) -> set:
        """
        Applies the filter to the GermaNet data and returns the matching lexical units.

        :type germanet: Germanet
        :param germanet: the GermaNet object, loaded from the data
        :return: a set of lexical units that are left after retrieval is filtered with the given constraints
        """
        # NOTE(review): the truthiness test on the search string is kept from the original implementation for
        # backward compatibility; it makes this method return an empty set for an empty search string, which
        # filter_synsets does not do - confirm whether that asymmetry is intended.
        return {
            unit
            for unit in self._retrieve_lexunits(germanet)
            if self._synset_selected(unit.synset) and self.search_string
        }

    def filter_synsets(self, germanet) -> set:
        """
        Applies the filter to the GermaNet data and returns the synsets of the matching lexical units.

        :type germanet: Germanet
        :param germanet: the GermaNet object, loaded from the data
        :return: a set of synsets that are left after retrieval is filtered with the given constraints
        """
        candidate_synsets = (unit.synset for unit in self._retrieve_lexunits(germanet))
        return {synset for synset in candidate_synsets if self._synset_selected(synset)}

    def _retrieve_lexunits(self, germanet) -> set:
        """
        Retrieves the candidate lexical units with the strategy selected by this configuration.

        :type germanet: Germanet
        :param germanet: the GermaNet object, loaded from the data
        :return: the set of lexical units found by regex matching, levenshtein matching or orthform lookup
        """
        if self.regex:
            return self._get_lexunits_by_regex(germanet)
        if self.levenshtein_distance > 0:
            return self._filter_lexunits_levenshtein(germanet)
        lexunits = germanet.get_lexunits_by_orthform(self.search_string, self.ignore_case)
        return self._filter_lexunits_orthform(lexunits, self.orth_variants, self.search_string,
                                              self.ignore_case)

    def _synset_selected(self, synset) -> bool:
        """
        Checks a synset against the selected word classes and word categories.

        :param synset: the synset to check
        :return: True if both the word class and the word category of the synset are selected
        """
        return synset.word_class in self.word_classes and synset.word_category in self.word_categories

    def _filter_lexunits_orthform(self, lexunits, orthvariants, searchstring: str, ignore_case: bool) -> set:
        """
        Filters the retrieved lexical units to match the user-specified orth variants.

        :type orthvariants: list(OrthVariant)
        :type lexunits: set(Lexunit)
        :param lexunits: the set of lexical units to be filtered by orth variant
        :param orthvariants: a list of orth variants that should be considered during filtering
        :param searchstring: the search query
        :param ignore_case: boolean, if case should be ignored or not
        :return: a set of lexical units; each one has at least one of the given orth variants equal to the query
        """
        # The lowercased query is loop-invariant, so it is computed once up front.
        lowered_query = searchstring.lower() if ignore_case else None
        filtered_units = set()
        for unit in lexunits:
            for orthvar in orthvariants:
                form = unit.get_orthform_variant(orthvar)
                if form is None:
                    continue
                if form == searchstring or (ignore_case and form.lower() == lowered_query):
                    filtered_units.add(unit)
                    break  # one matching variant is enough to keep the unit
        return filtered_units

    def _get_lexunits_by_regex(self, germanet) -> set:
        """
        Filters lexical units with a regular expression. All lexical units with an orthform that matches the
        regular expression completely (``fullmatch``) are returned.

        :type germanet: Germanet
        :param germanet: the GermaNet object, loaded from the data
        :return: The set of lexical units that match the given regular expression
        """
        if self.ignore_case:
            # Compile with IGNORECASE instead of lowercasing the pattern text: lowercasing would silently
            # turn escapes such as \S, \W, \D or \B into \s, \w, \d and \b and change the meaning of the query.
            pattern = re.compile(self.search_string, re.IGNORECASE)
            form_index = germanet.lowercasedform2lexid
        else:
            pattern = re.compile(self.search_string)
            form_index = germanet.orthform2lexid
        # Each index entry maps an orthform to a collection of lexunit ids; flatten the ids of all matches.
        matching_ids = itertools.chain.from_iterable(
            lex_ids for orthform, lex_ids in form_index.items() if pattern.fullmatch(orthform)
        )
        return {germanet.lexunits[lex_id] for lex_id in matching_ids}

    def _filter_lexunits_levenshtein(self, germanet) -> set:
        """
        Filters lexical units with levenshtein distance. All lexical units of the selected word categories and
        word classes that have an orthform variant within the configured levenshtein distance of the search
        string are returned.

        :type germanet: Germanet
        :param germanet: the GermaNet object, loaded from the data
        :return: The set of lexical units that match the given levenshtein distance
        """
        # Normalise the query into a local variable; the configured search string itself must not be modified
        # by running a query.
        query = self.search_string.lower() if self.ignore_case else self.search_string
        filtered_lexunits = set()
        for cat in self.word_categories:
            for unit in germanet.get_lexunits_by_wordcategory(category=cat):
                if unit.synset.word_class not in self.word_classes:
                    continue
                for orthvar in self.orth_variants:
                    form = unit.get_orthform_variant(orthvar)
                    if not form:
                        continue
                    if self.ignore_case:
                        form = form.lower()
                    if distance(form, query) <= self.levenshtein_distance:
                        filtered_lexunits.add(unit)
                        break  # one variant within range is enough to keep the unit
        return filtered_lexunits

    @property
    def search_string(self):
        """The query word or regular expression this configuration searches for."""
        return self._search_string

    @property
    def word_categories(self):
        """The word categories that results are restricted to."""
        return self._word_categories

    @word_categories.setter
    def word_categories(self, word_categories):
        self._word_categories = word_categories

    @property
    def word_classes(self):
        """The word classes that results are restricted to."""
        return self._word_classes

    @word_classes.setter
    def word_classes(self, word_classes):
        self._word_classes = word_classes

    @property
    def orth_variants(self):
        """The orthographic form variants considered by orthform and levenshtein matching."""
        return self._orth_variants

    @orth_variants.setter
    def orth_variants(self, orth_variants):
        self._orth_variants = orth_variants

    @property
    def ignore_case(self):
        """Whether the case of the query is ignored."""
        return self._ignore_case

    @property
    def regex(self):
        """Whether the search string is interpreted as a regular expression."""
        return self._regex

    @property
    def levenshtein_distance(self):
        """The maximum levenshtein distance; 0 disables levenshtein matching."""
        return self._levenshtein_distance