# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
from typing import Dict, List, Tuple
import requests
from pathlib import Path
from lxml import html
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import is_done, download, mark_done
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.common.log import get_logger
# Module-level logger, obtained for this module's name; the readers below use
# it to report when a dataset has been downloaded and prepared.
log = get_logger(__name__)
@register('typos_custom_reader')
class TyposCustom(DatasetReader):
    """Base class for reading spelling corrections dataset files

    The dataset is a tab-separated file: the first row is treated as a header
    and skipped, every following row holds a misspelled word and its correction.
    """

    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str) -> Path:
        """Base method that interprets ``data_path`` argument.

        Args:
            data_path: path to the tsv-file containing erroneous and corrected words

        Returns:
            the same path as a :class:`~pathlib.Path` object
        """
        return Path(data_path)

    @classmethod
    def read(cls, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:
        """Read train data for spelling corrections algorithms

        Args:
            data_path: path that needs to be interpreted with :meth:`~deeppavlov.dataset_readers.typos_reader.TyposCustom.build`

        Returns:
            train data to pass to a :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`

        Raises:
            ValueError: if a non-empty row does not consist of exactly two columns
        """
        # ``cls.build`` (not ``TyposCustom.build``) so that subclasses can
        # override how ``data_path`` is turned into a file.
        fname = cls.build(data_path)
        with fname.open(newline='', encoding='utf8') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            # Skip the header row; the default keeps an empty file from
            # raising StopIteration.
            next(reader, None)
            # csv.reader yields an empty list for a blank line; such rows are
            # skipped, while malformed non-empty rows still fail loudly.
            rows = (row for row in reader if row)
            res = [(mistake, correct) for mistake, correct in rows]
        return {'train': res}
@register('typos_wikipedia_reader')
class TyposWikipedia(TyposCustom):
    """Implementation of :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom` that works with
    English Wikipedia's list of common misspellings
    """

    @staticmethod
    def build(data_path: str) -> Path:
        """Download and parse common misspellings list from `Wikipedia <https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines>`_

        The page lists one ``typo->correction1, correction2`` entry per line;
        every (typo, correction) combination becomes one row of the tsv-file.
        The download is skipped when the target directory is already marked
        as done.

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting tsv-file

        Raises:
            requests.HTTPError: if the page could not be fetched successfully
            RuntimeError: if the page contains no ``<pre>`` block to parse
        """
        data_path = Path(data_path) / 'typos_wiki'
        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            # A timeout keeps a stalled connection from hanging the build, and
            # the status check stops an error page from being parsed as data.
            page = requests.get(url, timeout=60)
            page.raise_for_status()

            tree = html.fromstring(page.content)
            blocks = tree.xpath('//pre/text()')
            if not blocks:
                raise RuntimeError('No <pre> block with misspellings found at {}'.format(url))

            data = []
            for pair in blocks[0].splitlines():
                # partition() tolerates blank lines and lines with several
                # arrows, which would break a two-value split() unpacking.
                typo, arrow, corrects = pair.partition('->')
                typo = typo.strip()
                if not arrow or not typo:
                    continue
                for correct in corrects.split(','):
                    correct = correct.strip()
                    if correct:
                        data.append([typo, correct])

            fname.parent.mkdir(parents=True, exist_ok=True)
            with fname.open('w', newline='', encoding='utf8') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                # The inherited read() always discards the first row as a
                # header, so write one; otherwise the first pair is lost.
                writer.writerow(['mistake', 'correct'])
                writer.writerows(data)

            mark_done(data_path)

            log.info('Built')
        return fname
@register('typos_kartaslov_reader')
class TyposKartaslov(DatasetReader):
    """Reader for a Russian misspellings dataset from `kartaslov <https://github.com/dkulagin/kartaslov>`_

    Produces data in the same format as
    :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom`
    """

    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str) -> Path:
        """Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

        The download is skipped when the target directory is already marked
        as done.

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting csv-file
        """
        data_path = Path(data_path) / 'kartaslov'
        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname

    @classmethod
    def read(cls, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:
        """Read train data for spelling corrections algorithms

        Args:
            data_path: path that needs to be interpreted with :meth:`~deeppavlov.dataset_readers.typos_reader.TyposKartaslov.build`

        Returns:
            train data to pass to a :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`

        Raises:
            ValueError: if a non-empty row does not consist of exactly three columns
        """
        # A classmethod calling ``cls.build`` mirrors TyposCustom.read and lets
        # subclasses override ``build``; existing call sites are unaffected.
        fname = cls.build(data_path)
        with fname.open(newline='', encoding='utf8') as csvfile:
            reader = csv.reader(csvfile, delimiter=';')
            # Skip the header row; the default keeps an empty file from
            # raising StopIteration.
            next(reader, None)
            # Each row is unpacked as correct;mistake;weight. The output pairs
            # are (mistake, correct) and the weight is not used. Blank lines
            # (empty rows) are skipped.
            rows = (row for row in reader if row)
            res = [(mistake, correct) for correct, mistake, _weight in rows]
        return {'train': res}