# Copyright 2018 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Dict, List, Tuple, Union

from deeppavlov.core.common.log import get_logger
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download_decompress, mark_done

WORD_COLUMN, POS_COLUMN, TAG_COLUMN = 1, 3, 5

log = get_logger(__name__)


def get_language(filepath: str) -> str:
    """Extracts the language from a typical UD filename,
    e.g. ``en-ud-train.conllu`` -> ``en``.
    """
    return filepath.split("-")[0]


def read_infile(infile: Union[Path, str], word_column: int = WORD_COLUMN, pos_column: int = POS_COLUMN,
                tag_column: int = TAG_COLUMN, max_sents: int = -1,
                read_only_words: bool = False) -> List[Tuple[List, Union[List, None]]]:
    """Reads an input file in CoNLL-U format.

    Args:
        infile: a path to the file
        word_column: the column containing word forms (default=1)
        pos_column: the column containing part-of-speech labels (default=3)
        tag_column: the column containing fine-grained tags (default=5)
        max_sents: the maximal number of sentences to read, -1 means no limit
        read_only_words: whether to read only words and skip the tags

    Returns:
        a list of sentences. Each item is a pair of a word sequence and a tag sequence;
        the tag sequence is ``None`` if ``read_only_words`` is ``True``.
    """
    answer, curr_word_sent, curr_tag_sent = [], [], []
    with open(infile, "r", encoding="utf8") as fin:
        for line in fin:
            line = line.strip()
            if line.startswith("#"):
                continue
            if line == "":
                if len(curr_word_sent) > 0:
                    if read_only_words:
                        curr_tag_sent = None
                    answer.append((curr_word_sent, curr_tag_sent))
                curr_tag_sent, curr_word_sent = [], []
                if len(answer) == max_sents:
                    break
                continue
            splitted = line.split("\t")
            index = splitted[0]
            if not index.isdigit():
                continue
            curr_word_sent.append(splitted[word_column])
            if not read_only_words:
                pos, tag = splitted[pos_column], splitted[tag_column]
                tag = pos if tag == "_" else "{},{}".format(pos, tag)
                curr_tag_sent.append(tag)
    if len(curr_word_sent) > 0:
        if read_only_words:
            curr_tag_sent = None
        answer.append((curr_word_sent, curr_tag_sent))
    return answer
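
# A minimal usage sketch for ``read_infile``, kept as a comment so that importing this
# module stays side-effect free. The file name below is hypothetical, not part of the
# original module:
#
#     sentences = read_infile("en-ud-train.conllu", max_sents=2)
#     for words, tags in sentences:
#         print(list(zip(words, tags)))
#
# With ``read_only_words=True`` each item would instead be ``(words, None)``.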


@register('morphotagger_dataset_reader')
class MorphotaggerDatasetReader(DatasetReader):
    """Class to read training datasets in UD format"""

    URL = 'http://files.deeppavlov.ai/datasets/UD2.0_source/'

    def read(self, data_path: Union[List, str], language: Union[str, None] = None,
             data_types: Union[List[str], None] = None, **kwargs) -> Dict[str, List]:
        """Reads the UD dataset from data_path.

        Args:
            data_path: either
                1. a directory containing the data files; the file for data type ``mode``
                   is then ``data_path / {language}-ud-{mode}.conllu``, or
                2. a list of files containing the same number of items as ``data_types``
            language: the language used to build filenames when ``data_path`` is a directory
            data_types: which dataset parts among 'train', 'dev' and 'test' are returned

        Returns:
            a dictionary containing dataset fragments (see ``read_infile``) for the given data types
        """
        if data_types is None:
            data_types = ["train", "dev"]
        elif isinstance(data_types, str):
            data_types = [data_types]
        for data_type in data_types:
            if data_type not in ["train", "dev", "test"]:
                raise ValueError("Unknown data_type: {}, only train, dev and test "
                                 "data types are allowed".format(data_type))
        if isinstance(data_path, str):
            data_path = Path(data_path)
        if isinstance(data_path, Path):
            if data_path.exists():
                is_file = data_path.is_file()
            else:
                is_file = (len(data_types) == 1)
            if is_file:
                # path to a single file
                data_path, reserve_data_path = [data_path], None
            else:
                # path to a data directory
                if language is None:
                    raise ValueError("You must explicitly provide language "
                                     "when passing a data directory as source")
                reserve_data_path = data_path
                data_path = [data_path / "{}-ud-{}.conllu".format(language, mode)
                             for mode in data_types]
                reserve_data_path = [
                    reserve_data_path / language / "{}-ud-{}.conllu".format(language, mode)
                    for mode in data_types]
        else:
            data_path = [Path(curr_path) for curr_path in data_path]
            reserve_data_path = None
        if len(data_path) != len(data_types):
            raise ValueError("The number of input files in data_path and the number of "
                             "data types in data_types must be equal")
        has_missing_files = any(not filepath.exists() for filepath in data_path)
        if has_missing_files and reserve_data_path is not None:
            has_missing_files = any(not filepath.exists() for filepath in reserve_data_path)
            if not has_missing_files:
                data_path = reserve_data_path
        if has_missing_files:
            # Files are downloaded from the Web repository
            dir_path = data_path[0].parent
            language = language or get_language(data_path[0].parts[-1])
            url = self.URL + "{}.tar.gz".format(language)
            log.info('[downloading data from {} to {}]'.format(url, dir_path))
            dir_path.mkdir(exist_ok=True, parents=True)
            download_decompress(url, dir_path)
            mark_done(dir_path)
        data = {}
        for mode, filepath in zip(data_types, data_path):
            if mode == "dev":
                mode = "valid"
            data[mode] = read_infile(filepath, **kwargs)
        return data
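

# A minimal, hypothetical usage sketch (not part of the original module): it assumes a
# local directory ``./UD2.0`` and the language code ``en``; files missing from that
# directory would be fetched from ``MorphotaggerDatasetReader.URL`` by ``read``.
if __name__ == "__main__":
    reader = MorphotaggerDatasetReader()
    dataset = reader.read("./UD2.0", language="en", data_types=["train", "dev"])
    # ``read`` renames the "dev" part to "valid"
    for mode, sentences in dataset.items():
        print(mode, len(sentences), "sentences")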