# Source code for deeppavlov.models.tokenizers.ru_sent_tokenizer

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Set, Tuple

from rusenttokenize import ru_sent_tokenize, SHORTENINGS, JOINING_SHORTENINGS, PAIRED_SHORTENINGS

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register("ru_sent_tokenizer")
class RuSentTokenizer(Component):
    """
    Rule-based sentence tokenizer for Russian language.
    https://github.com/deepmipt/ru_sentence_tokenizer

    Args:
        shortenings: set of known shortenings. Use default value if working on news or fiction texts
        joining_shortenings: set of shortenings after which a sentence split is not possible (e.g. "ул").
            Use default value if working on news or fiction texts
        paired_shortenings: set of known paired shortenings (e.g. "т. е.").
            Use default value if working on news or fiction texts
        **kwargs: any additional keyword arguments are accepted and ignored

    """

    def __init__(self, shortenings: Set[str] = SHORTENINGS,
                 joining_shortenings: Set[str] = JOINING_SHORTENINGS,
                 paired_shortenings: Set[Tuple[str, str]] = PAIRED_SHORTENINGS,
                 **kwargs) -> None:
        """Store the shortening collections that are passed to ``ru_sent_tokenize`` on every call."""
        # NOTE: the defaults are the module-level collections imported from
        # ``rusenttokenize``; they are stored by reference, not copied.
        self.shortenings = shortenings
        self.joining_shortenings = joining_shortenings
        self.paired_shortenings = paired_shortenings

    def __call__(self, batch: List[str]) -> List[List[str]]:
        """
        Apply ``ru_sent_tokenize`` to every text in the batch.

        Args:
            batch: texts to split into sentences

        Returns:
            one ``ru_sent_tokenize`` result per input text, in the same order as ``batch``
        """
        return [
            ru_sent_tokenize(text, self.shortenings, self.joining_shortenings, self.paired_shortenings)
            for text in batch
        ]