Source code for germanetpy.path_based_relatedness_measures
import math
import numpy as np
from germanetpy import longest_shortest_path
from germanetpy.semrel_measures import SemRelMeasure
[docs]
class PathBasedRelatedness:
    """
    Path-based semantic relatedness measures on the GermaNet graph.

    These measures use the GermaNet graph to compute the shortest paths between two concepts. The two concepts
    have to belong to the same word category. The path lengths are normalized in different ways (depending on the
    measure). The path lengths are computed taking only the hypernymy / hyponymy relations into account.
    """

    def __init__(self, germanet, category, max_len: int = None, max_depth: int = None, synset_pair=None):
        """
        Create the relatedness object for one word category.

        The class needs the GermaNet graph and a word category. The parameters max_len, max_depth and synset_pair
        are optional. Unless all three are given (and truthy), all three are computed online, which can be
        time-consuming, especially for nouns.

        :type germanet: Germanet
        :type category: WordCategory
        :type synset_pair: tuple(Synset, Synset)
        :param germanet: The GermaNet graph
        :param category: The word category the measures are computed for
        :param max_len: The longest shortest path distance between any of the corresponding synsets
        :param max_depth: The maximum depth of any corresponding synset
        :param synset_pair: A pair of synsets that has the longest shortest path distance
        """
        self._germanet = germanet
        self._category = category
        if max_len and max_depth and synset_pair:
            self._max_len = max_len
            self._max_depth = max_depth
        else:
            # At least one of the three values is missing, so all of them are recomputed from the graph.
            self._max_len, self._max_depth, synset_pair = longest_shortest_path.get_longest_possible_shortest_distance(
                germanet, category)
        # The (minimum, maximum) raw value of every measure, used by normalize().
        self._normalization_dic = self.init_min_max_normalization_values(synset_pair)

    def simple_path(self, synset1, synset2, normalize: bool = False, normalized_max: float = 1.0) -> float:
        """
        Compute the path length between two synsets, normalized by the longest possible shortest path between any
        two nodes of the corresponding word category: ``(max_len - pathlen) / max_len``.

        :type synset1: Synset
        :type synset2: Synset
        :param synset1: The source synset
        :param synset2: The target synset the source synset is compared to
        :param normalize: If True, the raw value is additionally rescaled with :meth:`normalize`.
        :param normalized_max: The upper bound of the range the measure is normalized to.
        :return: The normalized path length between the two synsets, rounded to 5 decimals
        :raises AssertionError: if the two synsets have different word categories
        """
        assert synset1.word_category == synset2.word_category, "only synsets of the same Wordcategory can be " \
                                                               "compared"
        pathlen = synset1.shortest_path_distance(synset2)
        path = (self.max_len - pathlen) / self.max_len
        if normalize:
            path = self.normalize(raw_value=path, normalized_max=normalized_max,
                                  semrel_measure=SemRelMeasure.SimplePath)
        return np.round(path, decimals=5)

    def init_min_max_normalization_values(self, synset_pair):
        """
        Compute, for every measure, the raw value range used for normalization.

        The minimum of each measure is its value for the given pair of maximally distant synsets; the maximum is
        its value for a synset compared with itself.

        :param synset_pair: (Synset, Synset) The tuple of synsets that have the maximum distance in the graph
        :return: a dictionary [SemRelMeasure : (float, float)] containing the (minimum value, maximum value) for
            each semantic relatedness measure.
        """
        source, target = synset_pair[0], synset_pair[1]
        min_wup = self.wu_and_palmer(source, target)
        max_wup = self.wu_and_palmer(source, source)
        min_path = self.simple_path(source, target)
        max_path = self.simple_path(source, source)
        min_lch = self.leacock_chodorow(source, target)
        max_lch = self.leacock_chodorow(source, source)
        norm_values = {
            SemRelMeasure.SimplePath: (min_path, max_path),
            SemRelMeasure.WuAndPalmer: (min_wup, max_wup),
            SemRelMeasure.LeacockAndChodorow: (min_lch, max_lch)
        }
        return norm_values

    def wu_and_palmer(self, synset1, synset2, normalize: bool = False, normalized_max: float = 1.0) -> float:
        """
        Compute the Wu and Palmer relatedness: ``2 * depth(LCS) / (pathlen + 2 * depth(LCS))``.

        The path length between the synsets is put in relation to the depth of their lowest common subsumer (LCS),
        measured as its distance to the root. If there are several possible LCS, the one with the largest depth is
        taken into account.

        :type synset1: Synset
        :type synset2: Synset
        :param synset1: The source synset
        :param synset2: The target synset the source synset is compared to
        :param normalize: If True, the raw value is additionally rescaled with :meth:`normalize`.
        :param normalized_max: The upper bound of the range the measure is normalized to.
        :return: The Wu and Palmer relatedness measure, rounded to 5 decimals
        :raises AssertionError: if the two synsets have different word categories
        """
        assert synset1.word_category == synset2.word_category, "only synsets of the same Wordcategory can be " \
                                                               "compared"
        root_node = self.germanet.root
        lcs_nodes = synset1.lowest_common_subsumer(synset2)
        # Depth of the deepest LCS; 0 if no LCS is reported.
        depth = max((node.shortest_path_distance(root_node) for node in lcs_nodes), default=0)
        pathlen = synset2.shortest_path_distance(synset1)
        denominator = pathlen + 2 * depth
        if denominator == 0:
            # Path length 0 and LCS depth 0: the root is compared with itself. The formula degenerates to 0 / 0
            # here, so return the value every other synset yields when compared with itself.
            wup = 1.0
        else:
            wup = (2 * depth) / denominator
        if normalize:
            wup = self.normalize(raw_value=wup, normalized_max=normalized_max, semrel_measure=SemRelMeasure.WuAndPalmer)
        return np.round(wup, decimals=5)

    def leacock_chodorow(self, synset1, synset2, normalize: bool = False, normalized_max: float = 1.0) -> float:
        """
        Compute the Leacock and Chodorow relatedness: ``-log10(pathlen / (2 * max_depth))``.

        For the path distance and the depth, node count is used, i.e. both the shortest path distance and the
        maximum depth are incremented by one.

        :type synset1: Synset
        :type synset2: Synset
        :param synset1: The source synset
        :param synset2: The target synset the source synset is compared to
        :param normalize: If True, the raw value is additionally rescaled with :meth:`normalize`.
        :param normalized_max: The upper bound of the range the measure is normalized to.
        :return: The Leacock and Chodorow relatedness measure, rounded to 5 decimals
        :raises AssertionError: if the two synsets have different word categories
        """
        assert synset1.word_category == synset2.word_category, "only synsets of the same Wordcategory can be " \
                                                               "compared"
        # +1 converts the edge count into a node count, so identical synsets never hit log10(0).
        pathlen = synset1.shortest_path_distance(synset2) + 1
        lch_sim = -math.log10(pathlen / (2 * (self.max_depth + 1)))
        if normalize:
            lch_sim = self.normalize(raw_value=lch_sim, normalized_max=normalized_max,
                                     semrel_measure=SemRelMeasure.LeacockAndChodorow)
        return np.round(lch_sim, decimals=5)

    def normalize(self, raw_value: float, normalized_max: float, semrel_measure: "SemRelMeasure") -> float:
        """
        Rescale a raw relatedness value with the (minimum, maximum) range stored for its measure.

        The value is mapped linearly so that the stored minimum becomes 0 and the stored maximum becomes
        ``normalized_max``.

        :param raw_value: The raw value
        :param normalized_max: The upper bound
        :param semrel_measure: The semantic relatedness measure the value corresponds to.
        :return: The normalized semantic relatedness value, rounded to 5 decimals
        """
        lower_bound, upper_bound = self.normalization_dic[semrel_measure]
        return np.round(((raw_value - lower_bound) / (upper_bound - lower_bound)) * normalized_max, decimals=5)

    @property
    def germanet(self):
        """The GermaNet graph passed to the constructor."""
        return self._germanet

    @property
    def max_len(self):
        """The longest shortest path distance used to normalize :meth:`simple_path`."""
        return self._max_len

    @property
    def max_depth(self):
        """The maximum synset depth used by :meth:`leacock_chodorow`."""
        return self._max_depth

    @property
    def category(self):
        """The word category passed to the constructor."""
        return self._category

    @property
    def normalization_dic(self):
        """The dictionary mapping each measure to its (minimum, maximum) raw value."""
        return self._normalization_dic