Source code for deeppavlov.models.kbqa.ru_adj_to_noun

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from collections import defaultdict
from logging import getLogger
from typing import List

import numpy as np
import spacy
from scipy.sparse import csr_matrix

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register

log = getLogger(__name__)


[docs]@register('ru_adj_to_noun')
class RuAdjToNoun:
    """
        Class for converting an adjective in Russian to the corresponding noun, for example:
        "московский" -> "Москва", "африканский" -> "Африка"
    """

[docs]    def __init__(self, freq_dict_filename: str, candidate_nouns: int = 10, freq_thres: float = 4.5,
                 score_thres: float = 2.8, **kwargs):
        """

        Args:
            freq_dict_filename: file with the dictionary of Russian words with the corresponding frequencies
            candidate_nouns: how many candidate nouns to leave after search
            **kwargs:
        """
        self.candidate_nouns = candidate_nouns
        self.freq_thres = freq_thres
        self.score_thres = score_thres
        alphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя-"
        self.alphabet_length = len(alphabet)
        self.max_word_length = 24
        self.letter_nums = {letter: num for num, letter in enumerate(alphabet)}
        with open(str(expand_path(freq_dict_filename)), 'r') as fl:
            lines = fl.readlines()
        pos_freq_dict = defaultdict(list)
        for line in lines:
            line_split = line.strip('\n').split('\t')
            if re.match("[\d]+\.[\d]+", line_split[2]):
                pos_freq_dict[line_split[1]].append((line_split[0], float(line_split[2])))
        self.nouns_with_freq = pos_freq_dict["s.PROP"]
        self.adj_set = set([word for word, freq in pos_freq_dict["a"]])
        self.nouns = [noun[0] for noun in self.nouns_with_freq]
        self.matrix = self.make_sparse_matrix(self.nouns).transpose()
        self.nlp = spacy.load("ru_core_news_sm")

    def search(self, word: str):
        word = self.nlp(word)[0].lemma_
        if word in self.adj_set:
            q_matrix = self.make_sparse_matrix([word])
            scores = q_matrix * self.matrix
            scores = np.squeeze(scores.toarray())
            indices = np.argsort(-scores)[:self.candidate_nouns]
            scores = list(scores[indices])
            candidates = [self.nouns_with_freq[indices[i]] + (scores[i],) for i in range(len(indices))]
            candidates = [cand for cand in candidates if cand[0][:3].lower() == word[:3].lower()]
            candidates = sorted(candidates, key=lambda x: (x[2], x[1]), reverse=True)
            log.debug(f"AdjToNoun, found nouns: {candidates}")
            if candidates and candidates[0][1] > self.freq_thres and candidates[0][2] > self.score_thres:
                return candidates[0][0]
        return ""

    def make_sparse_matrix(self, words: List[str]):
        indptr = []
        indices = []
        data = []

        total_length = 0

        for n, word in enumerate(words):
            indptr.append(total_length)
            for cnt, letter in enumerate(word.lower()):
                col = self.alphabet_length * cnt + self.letter_nums[letter]
                indices.append(col)
                init_value = 1.0 - cnt * 0.05
                if init_value < 0:
                    init_value = 0
                data.append(init_value)
            total_length += len(word)

        indptr.append(total_length)

        data = np.array(data)
        indptr = np.array(indptr)
        indices = np.array(indices)

        matrix = csr_matrix((data, indices, indptr), shape=(len(words), self.max_word_length * self.alphabet_length))

        return matrix