import random
from .base import Aspect
class RemoveChar(Aspect):
    """Randomly removes characters from words.

    .. Note:: Note that you may specify white space as a character to be
        removed but it'll be processed differently: the leading white
        spaces of the sentence are removed instead of randomly chosen ones.

    .. caution:: Uses random numbers, default seed is 42. The seed is
        applied to the module-level generator of :mod:`random`.
    """

    def __init__(self, char=None, words_percentage=50,
                 characters_percentage=10, seed=42):
        """
        :param char: If specified only that character will be randomly
            removed. The specified character can also be a white space.
        :param words_percentage: Percentage of words in a
            sentence that should be transformed. If greater than 0,
            always at least single word will be transformed. Values of 1
            or more are divided by 100, smaller values are used as
            fractions as given.
        :param characters_percentage: Percentage of characters in a
            word that should be transformed. If greater than 0
            always at least single character will be transformed. Scaled
            the same way as ``words_percentage``.
        :param seed: Random seed.
        """
        # Normalise both settings to fractions. Note that exactly 1 is
        # treated as 1 percent, not as 100 percent.
        if words_percentage >= 1:
            words_percentage /= 100.
        if characters_percentage >= 1:
            characters_percentage /= 100.
        self._words_percentage = words_percentage
        self._characters_percentage = characters_percentage
        self._char = char
        # Reseeds the generator shared by every user of the ``random``
        # module, not a generator private to this instance.
        random.seed(seed)

    def __call__(self, sentence):
        """Return ``sentence`` with some characters removed.

        :param sentence: Text to transform.
        :return: The transformed text.
        """
        # White space is handled on the raw sentence, without tokenizing.
        if self._char == ' ':
            return self._process_white_space(sentence)
        tokens = self._tokenize(sentence)
        modified_tokens = self._remove_characters(tokens)
        return self._detokenize(modified_tokens)

    def _remove_characters(self, tokens):
        """Remove characters from a random selection of the tokens.

        Only tokens made up entirely of letters or entirely of digits are
        candidates for selection.

        :param tokens: Sequence of string tokens.
        :return: New list of tokens, in the original order.
        """
        # TODO: I think that punctuation marks etc. should
        # be excluded.
        candidates = [index for index, token in enumerate(tokens)
                      if token.isalpha() or token.isdigit()]
        random.shuffle(candidates)
        token_count = self._percentage_to_num(
            candidates, self._words_percentage)
        # A set keeps the membership test in the loop below constant time.
        selected_tokens = set(candidates[:token_count])

        modified = []
        for index, token in enumerate(tokens):
            if index in selected_tokens:
                char_count = self._percentage_to_num(
                    token, self._characters_percentage)
                if self._char:
                    # Drops the first ``char_count`` occurrences of the
                    # configured character, if the token contains any.
                    token = token.replace(self._char, '', char_count)
                else:
                    # No character configured: the positions to drop are
                    # selected randomly.
                    positions = list(range(len(token)))
                    random.shuffle(positions)
                    dropped = set(positions[:char_count])
                    token = ''.join(
                        char for position, char in enumerate(token)
                        if position not in dropped)
            modified.append(token)
        return modified

    def _process_white_space(self, sentence):
        """Remove the leading share of white spaces from ``sentence``.

        The number of removed spaces is the space count multiplied by the
        words percentage, truncated to an integer, so it can be 0 for
        short sentences.

        :param sentence: Text to transform.
        :return: The text with the first spaces removed.
        """
        occurrences = int(sentence.count(' ') * self._words_percentage)
        return sentence.replace(' ', '', occurrences)

    @staticmethod
    def _percentage_to_num(array, percentage):
        """Convert a fraction of ``array`` into a number of items.

        :param array: Any sized object (list of tokens or a string).
        :param percentage: Fraction of the items to select.
        :return: 0 if ``percentage`` is 0, otherwise at least 1.
        """
        if percentage == 0:
            return 0
        # Ensure that at least one item will be transformed.
        return max(1, int(len(array) * percentage))