import numpy as np
from .base import Dataset, file_exists_check
[docs]class CoNLL(Dataset):
"""The CoNLL-2003 shared task data for language-independent named
entity recognition. For details see:
https://www.clips.uantwerpen.be/conll2003/ner/
"""
[docs] @file_exists_check
def load(self, path):
"""Reads a CoNLL dataset file
and loads into internal data structure in the following form:
::
[{tokens: array(<tokens>)
pos_tags: array(<pos_tags>),
chunk_tags: array(<chunk_tags>),
ner_tags: array(<ner_tags>},
...,
]
:param path: A path to a file with CoNLL data
:return: None
"""
with open(path, "r") as f:
for line in f.readlines():
if line == "-DOCSTART- -X- O O\n":
continue
elif line == "\n":
try:
processed = self._process_sample(sample)
self._data.append(processed)
except NameError:
pass
sample = ""
else:
sample += line
if sample != "":
processed = self._process_sample(sample)
self._data.append(processed)
[docs] def apply(self, aspect, apply_to_ne=False):
"""
:param aspect: transformation function
:param apply_to_ne: if `False`, transformation won't be applied
to Named Entities. If `True`, transformation
will be applied only to Named Entities.
:return: modified dataset in the following form:
::
[{tokens: array(<tokens>)
pos_tags: array(<pos_tags>),
chunk_tags: array(<chunk_tags>),
ner_tags: array(<ner_tags>},
...,
]
"""
modified = []
for entry in self._data:
tags = entry['ner_tags']
if apply_to_ne is False:
non_ner = np.where(tags == 'O')[0]
else:
non_ner = np.where(tags != 'O')[0]
if len(non_ner) == 0:
modified.append(entry)
else:
sentence = " ".join(entry['tokens'][non_ner])
modified_sentence = aspect(sentence)
for idx, token in zip(non_ner, modified_sentence.split()):
entry['tokens'][idx] = token
modified.append(entry)
return modified
[docs] def save(self, data, path):
"""Saves data in the CoNLL format
:param data: list of dictionaries in the following form:
::
[{tokens: array(<tokens>)
pos_tags: array(<pos_tags>),
chunk_tags: array(<chunk_tags>),
ner_tags: array(<ner_tags>},
...,
]
:param path: Path to save the file. If the file exists,
it will be overwritten.
:return: None
"""
with open(path, 'w+') as f:
lines = list()
lines.append('-DOCSTART- -X- O O\n')
try:
for entry in data:
line = ""
for token, pos_tag, chunk_tag, ner_tag \
in zip(entry['tokens'], entry['pos_tags'],
entry['chunk_tags'], entry['ner_tags']):
line += " ".join([token, pos_tag, chunk_tag, ner_tag]) + '\n'
lines.append(line)
# Remove a trailing newline
lines[-1] = lines[-1][:-1]
f.write('\n'.join(lines))
except KeyError:
print("The data you're trying to save is corrupted or "
"isn't formatted correctly.")
@staticmethod
def _process_sample(sample):
data = sample.split('\n')
tokens = []
pos_tags = []
chunk_tags = []
ner_tags = []
for entry in data:
if entry == '':
continue
info = entry.split()
tokens.append(info[0])
pos_tags.append(info[1])
chunk_tags.append(info[2])
ner_tags.append(info[3])
processed = dict()
processed['tokens'] = np.asarray(tokens)
processed['pos_tags'] = np.asarray(pos_tags)
processed['chunk_tags'] = np.asarray(chunk_tags)
processed['ner_tags'] = np.asarray(ner_tags)
return processed