# Source code for wildnlp.datasets.imdb

import csv
import os

from .base import Dataset


class IMDB(Dataset):
    """The IMDB dataset containing movie reviews for sentiment analysis.

    The dataset consists of 50 000 reviews of two classes, negative and
    positive. Each review is stored in a separate text file.
    For details see: http://ai.stanford.edu/~amaas/data/sentiment/

    Every loaded review is appended to ``self._data`` as a dictionary with
    two keys: ``'path'`` (the file name, prefixed with the name of its
    parent directory when it was loaded from a directory) and ``'content'``
    (the text of the file).

    NOTE(review): ``self._data`` is assumed to be a list created by the
    ``Dataset`` base class -- confirm against ``.base``.
    """

    def load(self, path):
        """Loads an IMDB dataset.

        :param path: A path to a single file, a directory containing review
                     files, or a list (or tuple) of paths to such
                     directories.

        :return: None. A path that is neither an existing file nor an
                 existing directory is ignored and nothing is loaded.
        """

        if isinstance(path, (list, tuple)):
            for single_path in path:
                self._load_multiple_files(single_path)

        elif os.path.isdir(path):
            self._load_multiple_files(path)

        elif os.path.isfile(path):
            # A single file has no class directory, so only its name is kept.
            entry = {'path': os.path.basename(path),
                     'content': self._read_file(path)}
            self._data.append(entry)

    def apply(self, aspect):
        """Modifies contents of all the files in the IMDB dataset.

        :param aspect: A callable which takes the text of a review
                       and returns its modified version.

        :return: A list of the dataset entries with modified contents.
                 The entries are updated in place, so the returned list
                 holds the very same dictionaries as the dataset itself.
        """

        for entry in self._data:
            entry['content'] = aspect(entry['content'])

        # A new list, but the entries are shared with ``self._data``.
        return list(self._data)

    def save(self, data, path):
        """Saves IMDB reviews to separate files
        with the original names.

        :param data: An iterable of entries, i.e. dictionaries with the
                     ``'path'`` and ``'content'`` keys.

        :param path: Path to a top directory where files will be saved.
                     Missing directories are created.

        :return: None
        """

        for entry in data:
            target = os.path.join(path, entry['path'])
            target_dir = os.path.dirname(target)
            # ``exist_ok`` avoids the race between an existence check and
            # the creation; an empty name means the current directory.
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            with open(target, 'w', encoding='utf-8') as f:
                f.write(entry['content'])

    def save_tsv(self, data, path):
        """Convenience function for saving IMDB reviews into a single TSV file.

        The file has a header row and two columns: ``Sentiment`` and
        ``Content``. The sentiment is ``neg`` or ``pos`` when the directory
        part of the entry's path is exactly that name, ``unsup`` otherwise.

        :param data: An iterable of entries, i.e. dictionaries with the
                     ``'path'`` and ``'content'`` keys.

        :param path: Path to a tab separated file.

        :return: None
        """

        # newline='' lets the csv module control line endings itself.
        with open(path, 'w', newline='', encoding='utf-8') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            writer.writerow(['Sentiment', 'Content'])

            for entry in data:
                directory = os.path.dirname(entry['path'])
                if directory in ('neg', 'pos'):
                    sentiment = directory
                else:
                    sentiment = 'unsup'

                writer.writerow([sentiment, entry['content']])

    @staticmethod
    def _read_file(path):
        """Returns the whole content of a text file.

        The file is decoded as UTF-8 instead of the platform default
        encoding, so the result does not depend on the locale.
        """

        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    def _load_multiple_files(self, path):
        """Loads every regular file found directly in a directory.

        :param path: Path to a directory with review files.

        :return: None
        """

        # normpath strips a trailing separator; without it 'train/neg/'
        # would yield an empty directory name and the class label kept in
        # the entry's path would be lost.
        parent_dir = os.path.basename(os.path.normpath(path))

        # listdir order is arbitrary; sorting makes loading reproducible.
        for filename in sorted(os.listdir(path)):
            full_path = os.path.join(path, filename)

            # Sub-directories cannot be read as reviews, skip them.
            if not os.path.isfile(full_path):
                continue

            entry = {'path': os.path.join(parent_dir, filename),
                     'content': self._read_file(full_path)}
            self._data.append(entry)