Source code for wildnlp.aspects.misspelling

import json
import os
import random

from .base import Aspect


class Misspelling(Aspect):
    """Misspells words appearing in the Wikipedia list of
    **commonly misspelled English words** (default):
    https://en.wikipedia.org/wiki/Commonly_misspelled_English_words

    .. tip:: You can use **homophones** instead:
        https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/Homophones

    If a word has more than one common misspelling, the replacement
    is selected randomly.

    *All words that have any misspellings listed will be replaced.*

    .. caution:: Uses random numbers, default seed is 42.
    """

    def __init__(self, use_homophones=False, seed=42):
        """
        :param use_homophones: If True list of homophones will be used
            to replace words.
        :param seed: Random seed.
        """
        filename = 'misspellings.json'
        if use_homophones:
            filename = 'homophones.json'

        self._misspellings = self._load_misspellings(filename)

        # NOTE: this seeds the module-level generator of ``random``, i.e.
        # state shared with every other user of the ``random`` module.
        random.seed(seed)

    def __call__(self, sentence):
        """Return ``sentence`` with every listed word replaced by one of
        its misspellings.

        Tokenization and detokenization are delegated to ``_tokenize`` /
        ``_detokenize``, which are not defined in this class and are
        expected to be provided by the base class.

        :param sentence: Input text to corrupt.
        :return: Whatever ``_detokenize`` builds from the modified tokens.
        """
        tokens = self._tokenize(sentence)
        modified_tokens = self._transform_tokens(tokens)

        return self._detokenize(modified_tokens)

    def _transform_tokens(self, tokens):
        """Replace each token that has a known misspelling.

        Lookup is case-insensitive (the token is lowercased before the
        lookup). If the original token starts with an uppercase letter,
        the first letter of the replacement is uppercased as well.
        Tokens without an entry are passed through unchanged.

        :param tokens: Iterable of string tokens.
        :return: List of tokens, same length and order as the input.
        """
        modified = []
        for token in tokens:
            key = token.lower()
            if key not in self._misspellings:
                modified.append(token)
                continue

            replacement = random.choice(self._misspellings[key])
            # Preserve the leading capital of the original token
            # (e.g. at the start of a sentence).
            if token[0].isupper():
                replacement = self._capitalize(replacement)
            modified.append(replacement)

        return modified

    @staticmethod
    def _load_misspellings(filename):
        """Load a replacement table from the ``auxiliary`` directory
        located next to this module.

        :param filename: Name of the JSON file inside ``auxiliary``.
        :return: The decoded JSON content. The rest of the class uses it
            as a mapping from a lowercase word to a sequence of
            replacement candidates.
        """
        current_dir = os.path.dirname(__file__)
        path = os.path.join(current_dir, 'auxiliary', filename)

        # JSON is UTF-8 encoded; do not depend on the platform's
        # default locale encoding when reading it.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _capitalize(token):
        """Uppercase only the first character of ``token``.

        Unlike ``str.capitalize`` the remaining characters are left
        untouched. An empty string is returned unchanged.

        :param token: String to capitalize.
        :return: ``token`` with its first character uppercased.
        """
        return token[:1].upper() + token[1:]