Source code for wildnlp.datasets.conll

import numpy as np

from .base import Dataset, file_exists_check


class CoNLL(Dataset):
    """The CoNLL-2003 shared task data for language-independent named
    entity recognition. For details see:
    https://www.clips.uantwerpen.be/conll2003/ner/

    Every sample is kept as a dictionary of four parallel numpy string
    arrays stored under the keys ``tokens``, ``pos_tags``, ``chunk_tags``
    and ``ner_tags``.
    """

    @file_exists_check
    def load(self, path):
        """Reads a CoNLL dataset file
        and loads into internal data structure in the following form:

        ::

            [{tokens: array(<tokens>),
              pos_tags: array(<pos_tags>),
              chunk_tags: array(<chunk_tags>),
              ner_tags: array(<ner_tags>)},

              ...,

              ]

        Samples are separated by blank (or whitespace-only) lines. Lines
        starting with ``-DOCSTART-`` are document markers and are skipped.
        Runs of consecutive blank lines do not produce empty samples.
        Loaded samples are appended to ``self._data`` (presumably
        initialised by the base class -- it is not set up here).

        :param path: A path to a file with CoNLL data

        :return: None

        """
        # Lines of the sample currently being read. Starting from an empty
        # list (instead of an unbound name) makes files without a leading
        # blank line, and files without any sample, load correctly.
        pending = []

        with open(path, "r") as f:
            for line in f:
                if line.startswith("-DOCSTART-"):
                    continue

                if line.strip():
                    pending.append(line)
                elif pending:
                    # A blank line closes the sample collected so far.
                    self._data.append(
                        self._process_sample("".join(pending)))
                    pending = []

        # The last sample is not necessarily followed by a blank line.
        if pending:
            self._data.append(self._process_sample("".join(pending)))

    def apply(self, aspect, apply_to_ne=False):
        """Applies a transformation to the tokens of every loaded sample.

        The selected tokens of a sample are joined with single spaces,
        passed to ``aspect`` and the result is split on whitespace again.
        The resulting tokens are written back position by position; if
        ``aspect`` changes the number of tokens, surplus tokens are dropped
        and positions without a counterpart keep their original token.

        The internal dataset is left untouched: every returned entry is a
        new dictionary with a new ``tokens`` array. The tag arrays are
        shared with the internal data (they are never modified here).

        :param aspect: transformation function taking a sentence (string)
                       and returning the transformed sentence (string)

        :param apply_to_ne: if `False`, transformation won't be applied
                            to Named Entities. If `True`, transformation
                            will be applied only to Named Entities.

        :return: modified dataset in the following form:

        ::

            [{tokens: array(<tokens>),
              pos_tags: array(<pos_tags>),
              chunk_tags: array(<chunk_tags>),
              ner_tags: array(<ner_tags>)},

              ...,

              ]

        """

        modified = []
        for entry in self._data:
            tags = np.asarray(entry['ner_tags'])
            tokens = np.asarray(entry['tokens'])

            # Shallow copy, so that replacing 'tokens' below never touches
            # the dictionaries stored in self._data.
            result = dict(entry)
            result['tokens'] = tokens.copy()

            if tags.size == 0:
                modified.append(result)
                continue

            if apply_to_ne:
                selected = np.where(tags != 'O')[0]
            else:
                selected = np.where(tags == 'O')[0]

            if len(selected) > 0:
                sentence = " ".join(tokens[selected])
                new_tokens = aspect(sentence).split()

                # The token arrays have a fixed-width unicode dtype, so
                # assigning a longer string in place would silently
                # truncate it. Edit an object-dtype copy and convert back
                # so the width is recomputed from the new tokens.
                updated = tokens.astype(object)
                for idx, token in zip(selected, new_tokens):
                    updated[idx] = token
                result['tokens'] = np.asarray(updated.tolist(), dtype=str)

            modified.append(result)

        return modified

    def save(self, data, path):
        """Saves data in the CoNLL format

        The whole output is assembled before the file is opened, so an
        existing file is left untouched when ``data`` turns out to be
        malformed.

        :param data: list of dictionaries in the following form:

        ::

            [{tokens: array(<tokens>),
              pos_tags: array(<pos_tags>),
              chunk_tags: array(<chunk_tags>),
              ner_tags: array(<ner_tags>)},

              ...,

              ]

        :param path: Path to save the file. If the file exists,
                     it will be overwritten.

        :return: None
        """

        blocks = ['-DOCSTART- -X- O O\n']

        try:
            for entry in data:
                rows = zip(entry['tokens'], entry['pos_tags'],
                           entry['chunk_tags'], entry['ner_tags'])
                blocks.append(
                    "".join(" ".join(row) + '\n' for row in rows))
        except KeyError:
            print("The data you're trying to save is corrupted or "
                  "isn't formatted correctly.")
            return

        # Remove a trailing newline
        blocks[-1] = blocks[-1][:-1]

        with open(path, 'w') as f:
            # Joining with a newline puts a blank separator line between
            # the document marker and each of the samples.
            f.write('\n'.join(blocks))

    @staticmethod
    def _process_sample(sample):
        """Converts the raw text of a single sample into arrays.

        :param sample: newline-separated rows, each holding at least four
                       whitespace-separated columns: token, POS tag,
                       chunk tag and NER tag. Blank rows are ignored and
                       columns beyond the fourth are discarded.

        :return: dictionary with the keys ``tokens``, ``pos_tags``,
                 ``chunk_tags`` and ``ner_tags``, each mapped to a numpy
                 string array with one element per row

        :raises IndexError: if a non-blank row has fewer than four columns
        """

        tokens = []
        pos_tags = []
        chunk_tags = []
        ner_tags = []

        for entry in sample.split('\n'):
            fields = entry.split()
            if not fields:
                continue

            if len(fields) < 4:
                # IndexError is what the unchecked column access raised
                # before; the type is kept and only a message is added.
                raise IndexError(
                    "Expected at least 4 columns in a CoNLL row, "
                    "got {}: {!r}".format(len(fields), entry))

            tokens.append(fields[0])
            pos_tags.append(fields[1])
            chunk_tags.append(fields[2])
            ner_tags.append(fields[3])

        # An explicit string dtype keeps empty samples from turning into
        # float arrays, which cannot be compared against tag strings.
        processed = dict()
        processed['tokens'] = np.asarray(tokens, dtype=str)
        processed['pos_tags'] = np.asarray(pos_tags, dtype=str)
        processed['chunk_tags'] = np.asarray(chunk_tags, dtype=str)
        processed['ner_tags'] = np.asarray(ner_tags, dtype=str)

        return processed