"""Source code for germanetpy.path_based_relatedness_measures: path-based semantic relatedness measures."""

import math
import numpy as np
from germanetpy import longest_shortest_path
from germanetpy.semrel_measures import SemRelMeasure



# [docs] -- link marker left over from the rendered documentation page; not part of the module code.
class PathBasedRelatedness:
    """
    Path-based semantic relatedness measures on the GermaNet graph.

    These measures use the GermaNet Graph to compute the shortest paths between two concepts. These concepts have to
    have the same word category. The path lengths are normalized in different ways (depending on the measure). The
    path lengths are computed taking only the hypernymy / hyponymy relations into account.
    """

    def __init__(self, germanet, category, max_len: int = None, max_depth: int = None, synset_pair=None):
        """
        Create the relatedness calculator for one word category.

        The parameters max_len, max_depth and synset_pair are optional. If any one of them is missing (or falsy),
        all three are recomputed via ``longest_shortest_path.get_longest_possible_shortest_distance``, which can be
        time-consuming, especially for nouns.

        :type synset_pair: tuple(Synset, Synset)
        :type category: WordCategory
        :type germanet: Germanet
        :param germanet: The Germanet Graph
        :param category: WordCategory
        :param max_len: the longest shortest path distance between any of the corresponding synsets
        :param max_depth: the maximum depth of any corresponding synset
        :param synset_pair: a pair of synsets that has the longest shortest path distance
        """
        self._germanet = germanet
        self._category = category
        if max_len and max_depth and synset_pair:
            self._max_len = max_len
            self._max_depth = max_depth
        else:
            # Partial input is not mixed with computed values: all three values are taken from one computation.
            self._max_len, self._max_depth, synset_pair = longest_shortest_path.get_longest_possible_shortest_distance(
                germanet, category)
        # Must run last: the measures called in there read max_len, max_depth and the germanet root.
        self._normalization_dic = self.init_min_max_normalization_values(synset_pair)

    @staticmethod
    def _assert_same_category(synset1, synset2):
        """Assert that both synsets belong to the same word category (AssertionError otherwise)."""
        assert synset1.word_category == synset2.word_category, "only synsets of the same Wordcategory can be compared"

    def simple_path(self, synset1, synset2, normalize: bool = False, normalized_max: float = 1.0) -> float:
        """
        Compute the path length between two synsets, normalized by the longest possible shortest path between any
        two nodes of the corresponding word category: ``(max_len - pathlen) / max_len``.

        :type synset2: Synset
        :type synset1: Synset
        :param synset1: The source synset
        :param synset2: The target synset the source synset is compared to
        :param normalize: If True, the raw value is rescaled with :meth:`normalize` to the range
            ``[0, normalized_max]``.
        :param normalized_max: The upper bound of the range the measure is normalized to.
        :return: The normalized path length between two synsets, rounded to 5 decimals
        """
        self._assert_same_category(synset1, synset2)
        pathlen = synset1.shortest_path_distance(synset2)
        path = (self.max_len - pathlen) / self.max_len
        if normalize:
            path = self.normalize(raw_value=path, normalized_max=normalized_max,
                                  semrel_measure=SemRelMeasure.SimplePath)
        return np.round(path, decimals=5)

    def init_min_max_normalization_values(self, synset_pair):
        """
        Compute, for every measure, the value for two maximally distant synsets (minimum) and the value for two
        equal synsets (maximum). These bounds are used for normalization.

        :param synset_pair: (Synset, Synset) The tuple of synsets that have the maximum distance in the graph
        :return: a dictionary [SemRelMeasure : (float, float)] containing the (minimum value, maximum value) for
            each semantic similarity measure.
        """
        far_a, far_b = synset_pair[0], synset_pair[1]
        min_wup = self.wu_and_palmer(far_a, far_b)
        max_wup = self.wu_and_palmer(far_a, far_a)
        min_path = self.simple_path(far_a, far_b)
        max_path = self.simple_path(far_a, far_a)
        min_lch = self.leacock_chodorow(far_a, far_b)
        max_lch = self.leacock_chodorow(far_a, far_a)
        return {
            SemRelMeasure.SimplePath: (min_path, max_path),
            SemRelMeasure.WuAndPalmer: (min_wup, max_wup),
            SemRelMeasure.LeacockAndChodorow: (min_lch, max_lch),
        }

    def wu_and_palmer(self, synset1, synset2, normalize: bool = False, normalized_max: float = 1.0) -> float:
        """
        Compute the Wu and Palmer relatedness: ``2 * depth(LCS) / (pathlen + 2 * depth(LCS))``.

        The depth of the lowest common subsumer (LCS) is its shortest path distance to the GermaNet root. If there
        are several possible LCS, the one with the largest depth is taken into account.

        :type synset2: Synset
        :type synset1: Synset
        :param synset1: The source synset
        :param synset2: The target synset the source synset is compared to
        :param normalize: If True, the raw value is rescaled with :meth:`normalize` to the range
            ``[0, normalized_max]``.
        :param normalized_max: The upper bound of the range the measure is normalized to.
        :return: The wu and palmer relatedness measure, rounded to 5 decimals
        """
        self._assert_same_category(synset1, synset2)
        root_node = self.germanet.root
        lcs_nodes = synset1.lowest_common_subsumer(synset2)
        # Deepest LCS wins; 0 when there is no LCS at all (same as the depth of the root itself).
        depth = max((node.shortest_path_distance(root_node) for node in lcs_nodes), default=0)
        pathlen = synset2.shortest_path_distance(synset1)
        denominator = pathlen + 2 * depth
        if denominator == 0:
            # Both synsets are at distance 0 from each other and their LCS has depth 0 (root compared with
            # itself). The formula would be 0 / 0; every other self-comparison yields 1.0, so use that here too.
            wup = 1.0
        else:
            wup = (2 * depth) / denominator
        if normalize:
            wup = self.normalize(raw_value=wup, normalized_max=normalized_max, semrel_measure=SemRelMeasure.WuAndPalmer)
        return np.round(wup, decimals=5)

    def leacock_chodorow(self, synset1, synset2, normalize: bool = False, normalized_max: float = 1.0) -> float:
        """
        Compute the Leacock and Chodorow relatedness: ``-log10(pathlen / (2 * max_depth))``.

        For the path distance and depth, node count is used, i.e. one is added to both the edge-based shortest path
        distance and the maximum depth.

        :type synset2: Synset
        :type synset1: Synset
        :param synset1: The source synset
        :param synset2: The target synset the source synset is compared to
        :param normalize: If True, the raw value is rescaled with :meth:`normalize` to the range
            ``[0, normalized_max]``.
        :param normalized_max: The upper bound of the range the measure is normalized to.
        :return: The leacock and chodorow relatedness measure, rounded to 5 decimals
        """
        self._assert_same_category(synset1, synset2)
        # +1 converts the edge count into a node count, so equal synsets give log10 of a positive number.
        pathlen = synset1.shortest_path_distance(synset2) + 1
        lch_sim = -math.log10(pathlen / (2 * (self.max_depth + 1)))
        if normalize:
            lch_sim = self.normalize(raw_value=lch_sim, normalized_max=normalized_max,
                                     semrel_measure=SemRelMeasure.LeacockAndChodorow)
        return np.round(lch_sim, decimals=5)

    def normalize(self, raw_value: float, normalized_max: float, semrel_measure: "SemRelMeasure") -> float:
        """
        Rescale a raw relatedness value linearly so that the stored minimum of the measure maps to 0 and the stored
        maximum maps to ``normalized_max``.

        :param raw_value: The raw value
        :param normalized_max: The upper bound
        :param semrel_measure: The semantic relatedness measure the value corresponds to; used as the key into the
            normalization dictionary.
        :return: The normalized semantic relatedness value, rounded to 5 decimals
        """
        lower_bound, upper_bound = self.normalization_dic[semrel_measure]
        scaled = (raw_value - lower_bound) / (upper_bound - lower_bound)
        return np.round(scaled * normalized_max, decimals=5)

    @property
    def germanet(self):
        """The GermaNet graph passed to the constructor."""
        return self._germanet

    @property
    def max_len(self):
        """The longest shortest path distance used to normalize :meth:`simple_path`."""
        return self._max_len

    @property
    def max_depth(self):
        """The maximum depth used by :meth:`leacock_chodorow`."""
        return self._max_depth

    @property
    def category(self):
        """The word category passed to the constructor."""
        return self._category

    @property
    def normalization_dic(self):
        """Dictionary mapping each measure to its (minimum value, maximum value) pair."""
        return self._normalization_dic