Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
f8287f0
custom data loader
ArshdeepSekhon Oct 29, 2020
bb1e021
custom textattack dataset from local files or in memory using hugging…
ArshdeepSekhon Oct 30, 2020
9195e1e
load user dataset from local files and convert to TextAttack dataset …
ArshdeepSekhon Nov 4, 2020
c1bd607
load user dataset from local files and convert to TextAttack dataset …
ArshdeepSekhon Nov 4, 2020
157bd21
load user dataset from local files and convert to textattack dataset …
ArshdeepSekhon Nov 4, 2020
3edd74b
load user dataset from local files and convert to textattack dataset …
ArshdeepSekhon Nov 4, 2020
29b0d9a
custom dataset: add attribute error
ArshdeepSekhon Nov 4, 2020
ea15f9a
custom dataset: remove stray prints
ArshdeepSekhon Nov 4, 2020
34b02ec
fix output column for custom dataset
ArshdeepSekhon Nov 4, 2020
af379af
custom dataset: add support for dict
ArshdeepSekhon Nov 4, 2020
6e07bd5
custom dataset: checks
ArshdeepSekhon Nov 4, 2020
2105de2
option to test on entire dataset
ArshdeepSekhon Oct 22, 2020
5f9a4c2
eval on entire dataset, checks
ArshdeepSekhon Oct 22, 2020
f238449
fix failed checks
ArshdeepSekhon Oct 22, 2020
2f00e33
custom data loader
ArshdeepSekhon Oct 29, 2020
793dbe0
custom textattack dataset from local files or in memory using hugging…
ArshdeepSekhon Oct 30, 2020
ae1c1f0
load user dataset from local files and convert to TextAttack dataset …
ArshdeepSekhon Nov 4, 2020
799f29e
load user dataset from local files and convert to TextAttack dataset …
ArshdeepSekhon Nov 4, 2020
97ea615
load user dataset from local files and convert to textattack dataset …
ArshdeepSekhon Nov 4, 2020
6172e24
load user dataset from local files and convert to textattack dataset …
ArshdeepSekhon Nov 4, 2020
d3e4269
custom dataset: add attribute error
ArshdeepSekhon Nov 4, 2020
92a54a5
custom dataset: remove stray prints
ArshdeepSekhon Nov 4, 2020
7b167ca
fix output column for custom dataset
ArshdeepSekhon Nov 4, 2020
601371d
custom dataset: add support for dict
ArshdeepSekhon Nov 4, 2020
9d0ed54
custom dataset: checks
ArshdeepSekhon Nov 4, 2020
12aab83
skeleton code for custom dataset
ArshdeepSekhon Nov 24, 2020
474bfa7
Merge branch 'custom_dataset' of https://github.com/ArshdeepSekhon/Te…
ArshdeepSekhon Nov 24, 2020
7f746d1
add utils for reading from files
ArshdeepSekhon Nov 25, 2020
7d91be2
add support for reading from csv, df, txt
ArshdeepSekhon Nov 25, 2020
7d2f976
fix format errors
ArshdeepSekhon Dec 4, 2020
9222066
update the confusing word "Successes" to "True Positive/Positive"
qiyanjun Dec 4, 2020
5c172b2
update the confusing uses of "Successes" to "True Positive/Positive"
qiyanjun Dec 4, 2020
11d2930
Merge branch 'master' into custom_dataset
ArshdeepSekhon Dec 4, 2020
36c83b3
black,isort formatting
ArshdeepSekhon Dec 4, 2020
f6fb8c5
Update dataset.py
qiyanjun Dec 5, 2020
41c5ef5
fix a wrong typo
qiyanjun Dec 5, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
custom data loader
  • Loading branch information
ArshdeepSekhon committed Nov 24, 2020
commit 2f00e33e8aa29f113855c9efdf0fbb11cedf10e7
1 change: 1 addition & 0 deletions textattack/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@

from .dataset import TextAttackDataset
from .huggingface_dataset import HuggingFaceDataset

# CustomDataset is defined in the new custom_dataset.py module added in this
# commit, not in huggingface_dataset.py — import it from the correct module.
from .custom_dataset import CustomDataset

from . import translation
116 changes: 116 additions & 0 deletions textattack/datasets/custom_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import collections
import random

import datasets

import textattack
from textattack.datasets import HuggingFaceDataset


def _cb(s):
    """Return ``s`` rendered as blue ANSI-colored text for terminal output."""
    text = str(s)
    return textattack.shared.utils.color_text(text, color="blue", method="ansi")


class CustomDataset(HuggingFaceDataset):
    """Loads a custom dataset from local files using HuggingFace ``datasets``
    and prepares it as a TextAttack dataset.

    Args:
        name (str): the dataset file name(s); passed to
            ``datasets.load_dataset`` as ``data_files``.
        filetype (str): type of the local files; the dataset is loaded as
            ``datasets.load_dataset(filetype, data_files=name)``.
        split (str): which split of the loaded dataset to use.
        label_map (dict): mapping applied to re-map ground-truth outputs,
            e.g. ``{"Positive": 1, "Negative": 0}``. Useful if the model was
            trained with a different label arrangement than provided in the
            ``datasets`` version of the dataset.
        output_scale_factor (float): factor to divide ground-truth outputs
            by. TextAttack goal functions generally require model outputs
            between 0 and 1; some datasets test the model's correlation with
            ground-truth output instead of its accuracy, so those outputs
            may be scaled arbitrarily.
        dataset_columns (list): ``dataset_columns[0]`` is a tuple of input
            column names — e.g. ``("premise", "hypothesis")`` for an NLI
            dataset — and ``dataset_columns[1]`` is the output column name,
            or ``None`` if there is no output column. Defaults to
            ``[("text",), None]``.
        shuffle (bool): whether to shuffle the dataset on load.

    Raises:
        ValueError: if a requested input or output column is not present in
            the loaded dataset.
    """

    def __init__(
        self,
        name,
        filetype="csv",
        split="train",
        label_map=None,
        output_scale_factor=None,
        dataset_columns=None,
        shuffle=False,
    ):
        # Default is applied here rather than in the signature to avoid the
        # mutable-default-argument pitfall (a shared list across instances).
        if dataset_columns is None:
            dataset_columns = [("text",), None]

        self._name = name
        self._dataset = datasets.load_dataset(filetype, data_files=name)[split]

        textattack.shared.logger.info(
            f"Loading {_cb('datasets')} dataset {_cb(name)}, split {_cb(split)}."
        )

        # Input/output column order, like (('premise', 'hypothesis'), 'label').
        if not set(dataset_columns[0]) <= set(self._dataset.column_names):
            raise ValueError(
                f"Could not find input column {dataset_columns[0]} in CSV. Found keys: {self._dataset.column_names}"
            )
        self.input_columns = dataset_columns[0]
        self.output_column = dataset_columns[1]
        if (
            self.output_column is not None
            and self.output_column not in self._dataset.column_names
        ):
            raise ValueError(
                f"Could not find output column {dataset_columns[1]} in CSV. Found keys: {self._dataset.column_names}"
            )

        self._i = 0
        self.examples = list(self._dataset)
        self.label_map = label_map
        self.output_scale_factor = output_scale_factor

        try:
            self.label_names = self._dataset.features["label"].names
            # If labels are remapped, the label names have to be remapped as
            # well.
            if label_map:
                self.label_names = [
                    self.label_names[self.label_map[i]]
                    for i in range(len(self.label_map))
                ]
        except KeyError:
            # The dataset doesn't have 'features' or a 'label' column.
            self.label_names = None
        except AttributeError:
            # self._dataset.features["label"] exists but is a single value.
            self.label_names = ("label",)

        if shuffle:
            random.shuffle(self.examples)

    def _format_raw_example(self, raw_example):
        """Convert a raw row into TextAttack's ``(input_dict, output)`` pair,
        applying ``label_map`` and ``output_scale_factor`` if configured."""
        input_dict = collections.OrderedDict(
            [(c, raw_example[c]) for c in self.input_columns]
        )
        if self.output_column is not None:
            output = raw_example[self.output_column]
            if self.label_map:
                output = self.label_map[output]
            if self.output_scale_factor:
                output = output / self.output_scale_factor
        else:
            output = None

        return (input_dict, output)