From 043c1b91d6975b95a11385f07c80adcb0dc65c0e Mon Sep 17 00:00:00 2001 From: Rohan Bhargava Date: Wed, 28 Nov 2018 23:02:16 -0800 Subject: [PATCH 01/31] Fixed breaking changes in predict function introduced by pytorch 0.4 changes to tensor_type --- train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index b2d46a0..7f90aaa 100644 --- a/train.py +++ b/train.py @@ -87,8 +87,8 @@ def predict(text, model, text_field, label_feild, cuda_flag): # text = text_field.tokenize(text) text = text_field.preprocess(text) text = [[text_field.vocab.stoi[x] for x in text]] - x = text_field.tensor_type(text) - x = autograd.Variable(x, volatile=True) + x = torch.tensor(text) + x = autograd.Variable(x) if cuda_flag: x = x.cuda() print(x) @@ -103,4 +103,4 @@ def save(model, save_dir, save_prefix, steps): os.makedirs(save_dir) save_prefix = os.path.join(save_dir, save_prefix) save_path = '{}_steps_{}.pt'.format(save_prefix, steps) - torch.save(model.state_dict(), save_path) \ No newline at end of file + torch.save(model.state_dict(), save_path) From 0ea6819b214154a2b3bee650ba471a4e71d737b9 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Fri, 5 Apr 2019 16:50:04 -0700 Subject: [PATCH 02/31] Refactored for use with scikit-learn --- README.md | 190 ++++++++++++++----------------- cnn_text_classification.py | 228 +++++++++++++++++++++++++++++++++++++ main.py | 116 ------------------- model.py | 58 ---------- mydatasets.py | 110 ------------------ train.py | 106 ----------------- 6 files changed, 312 insertions(+), 496 deletions(-) create mode 100644 cnn_text_classification.py delete mode 100755 main.py delete mode 100644 model.py delete mode 100644 mydatasets.py delete mode 100644 train.py diff --git a/README.md b/README.md index 5ee32a7..f25b726 100644 --- a/README.md +++ b/README.md @@ -1,127 +1,105 @@ ## Introduction -This is the implementation of Kim's [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch. +Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github.com/Shawn1993/cnn-text-classification-pytorch), refactored as a scikit-learn classifier. -1. Kim's implementation of the model in Theano: -[https://github.com/yoonkim/CNN_sentence](https://github.com/yoonkim/CNN_sentence) -2. Denny Britz has an implementation in Tensorflow: -[https://github.com/dennybritz/cnn-text-classification-tf](https://github.com/dennybritz/cnn-text-classification-tf) -3. Alexander Rakhlin's implementation in Keras; -[https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras](https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras) - -## Requirement +## Requirements * python 3 * pytorch > 0.1 * torchtext > 0.1 * numpy +* scikit-learn -## Result -I just tried two dataset, MR and SST. +## Known Issues +* The predict method is probably not as efficient as it could be. +* Doesn't play well with GridSearchCV if num_jobs isn't 1. +* Weights are represented by upsampling. +* Only supports pre-trained word vectors from TorchText. +* The random_state parameter probably only works with integers or None. +* Features my idiosyncratic coding style. 
-|Dataset|Class Size|Best Result|Kim's Paper Result|
-|---|---|---|---|
-|MR|2|77.5%(CNN-rand-static)|76.1%(CNN-rand-nostatic)|
-|SST|5|37.2%(CNN-rand-static)|45.0%(CNN-rand-nostatic)|
+## To Do
+* Add support for different scoring methods (balanced accuracy, recall, etc.).
+* Add support for cross-validation during training.

-I haven't adjusted the hyper-parameters for SST seriously.
+## Parameters
+**lr : float, optional (default=0.001)**
+ Initial learning rate.

-## Usage
-```
-./main.py -h
-```
-or
+**epochs : integer, optional (default=256)**
+ Number of training epochs.

-```
-python3 main.py -h
-```
+**batch_size : integer, optional (default=64)**
+ Training batch size.

-You will get:
+**test_interval : integer, optional (default=100)**
+ The number of training steps to wait between evaluations on the validation set.

-```
-CNN text classificer
-
-optional arguments:
-  -h, --help            show this help message and exit
-  -batch-size N         batch size for training [default: 50]
-  -lr LR                initial learning rate [default: 0.01]
-  -epochs N             number of epochs for train [default: 10]
-  -dropout              the probability for dropout [default: 0.5]
-  -max_norm MAX_NORM    l2 constraint of parameters
-  -cpu                  disable the gpu
-  -device DEVICE        device to use for iterate data
-  -embed-dim EMBED_DIM
-  -static               fix the embedding
-  -kernel-sizes KERNEL_SIZES
-                        Comma-separated kernel size to use for convolution
-  -kernel-num KERNEL_NUM
-                        number of each kind of kernel
-  -class-num CLASS_NUM  number of class
-  -shuffle              shuffle the data every epoch
-  -num-workers NUM_WORKERS
-                        how many subprocesses to use for data loading
-                        [default: 0]
-  -log-interval LOG_INTERVAL
-                        how many batches to wait before logging training
-                        status
-  -test-interval TEST_INTERVAL
-                        how many epochs to wait before testing
-  -save-interval SAVE_INTERVAL
-                        how many epochs to wait before saving
-  -predict PREDICT      predict the sentence given
-  -snapshot SNAPSHOT    filename of model snapshot [default: None]
-  -save-dir SAVE_DIR    where to save the checkpoint
-```
+**early_stop : integer, optional (default=1000)**
+ The number of training steps without improved validation performance to allow before stopping early.

-## Train
-```
-./main.py
-```
-You will get:
+**save_best : boolean, optional (default=True)**
+ Keep the model with the best performance found during training.

+**dropout : float, optional (default=0.5)**
+ Dropout probability.
+
+**max_norm : float, optional (default=0.0)**
+ L2 constraint.
+
+**embed_dim : integer, optional (default=128)**
+ The number of embedding dimensions.
+
+**kernel_num : integer, optional (default=100)**
+ The number of each size of kernel.
+
+**kernel_sizes : string, optional (default='3,4,5')**
+ Comma-separated kernel sizes to use for convolution.
+
+**static : boolean, optional (default=False)**
+ If true, fix the embedding.
+
+**device : int, optional (default=-1)**
+ Device to use for iterating data; -1 for CPU (see torch.cuda.set_device()).
+
+**cuda : boolean, optional (default=True)**
+ If true, use the GPU if available.
+
+**class_weight : dict, "balanced" or None, optional (default=None)**
+ Weights associated with each class (see class_weight parameter in existing scikit-learn classifiers).
+
+**split_ratio : float, optional (default=0.9)**
+ Ratio of training data used for training. The remainder will be used for validation.
+
+**random_state : integer, optional (default=None)**
+ Seed for the random number generator.
+ +**vectors : string, optional (default=None)** + Which pretrained TorchText vectors to use (see [torchtext.vocab.pretrained_aliases](https://torchtext.readthedocs.io/en/latest/vocab.html#pretrained-aliases) for options). + +**preprocessor : callable or None (default=None)** + Override default string preprocessing. + +## Methods +**fit(X, y, sample_weight=None)** +Train the CNN classifier from the training set (X, y). ``` -Batch[100] - loss: 0.655424 acc: 59.3750% -Evaluation - loss: 0.672396 acc: 57.6923%(615/1066) -``` +Parameters: X: list of strings + The training input samples. -## Test -If you has construct you test set, you make testing like: + y: list of strings + The class labels. + sample_weight: list of integers or floats, or None + Sample weights. If None, samples are equally weighted. + +Returns: self : object ``` -/main.py -test -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt + +**predict(X)** +Predict class for X. ``` -The snapshot option means where your model load from. If you don't assign it, the model will start from scratch. - -## Predict -* **Example1** - - ``` - ./main.py -predict="Hello my dear , I love you so much ." \ - -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt" - ``` - You will get: - - ``` - Loading model from [./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt]... - - [Text] Hello my dear , I love you so much . - [Label] positive - ``` -* **Example2** - - ``` - ./main.py -predict="You just make me so sad and I have to leave you ."\ - -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt" - ``` - You will get: - - ``` - Loading model from [./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt]... - - [Text] You just make me so sad and I have to leave you . - [Label] negative - ``` - -Your text must be separated by space, even punctuation.And, your text should longer then the max kernel size. - -## Reference -* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) +Parameters: X: list of strings + The input samples. +Returns: y: list of strings + The predicted classes. 
+``` diff --git a/cnn_text_classification.py b/cnn_text_classification.py new file mode 100644 index 0000000..847bcd3 --- /dev/null +++ b/cnn_text_classification.py @@ -0,0 +1,228 @@ +import re +import torch +import torch.nn as nn +import torch.nn.functional as F +from collections import Counter +from copy import deepcopy +from sklearn.base import BaseEstimator, ClassifierMixin +from torch.autograd import Variable +from torchtext.data import Dataset, Example, Field, Iterator, Pipeline + + +class CNNClassifier(BaseEstimator, ClassifierMixin): + def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, + early_stop=1000, save_best=True, dropout=0.5, max_norm=0.0, + embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", + static=False, device=-1, cuda=True, class_weight=None, + split_ratio=0.9, random_state=None, vectors=None, + preprocessor=None): + self.lr = lr + self.epochs = epochs + self.batch_size = batch_size + self.test_interval = test_interval + self.early_stop = early_stop + self.save_best = save_best + self.dropout = dropout + self.max_norm = max_norm + self.embed_dim = embed_dim + self.kernel_num = kernel_num + self.kernel_sizes = kernel_sizes + self.static = static + self.device = device + self.cuda = cuda + self.class_weight = class_weight + self.split_ratio = split_ratio + self.random_state = random_state + self.vectors = vectors + self.preprocessor = preprocessor + + def __clean_str(self, string): + string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) + string = re.sub(r"\'s", " \'s", string) + string = re.sub(r"\'ve", " \'ve", string) + string = re.sub(r"n\'t", " n\'t", string) + string = re.sub(r"\'re", " \'re", string) + string = re.sub(r"\'d", " \'d", string) + string = re.sub(r"\'ll", " \'ll", string) + string = re.sub(r",", " , ", string) + string = re.sub(r"!", " ! ", string) + string = re.sub(r"\(", " ( ", string) + string = re.sub(r"\)", " ) ", string) + string = re.sub(r"\?", " ? 
", string) + string = re.sub(r"\s{2,}", " ", string) + return string.strip() + + def __eval(self, data_iter): + self.__model.eval() + + corrects = 0 + + for batch in data_iter: + feature, target = batch.text, batch.label + + feature.data.t_() + target.data.sub_(1) + + if self.cuda and torch.cuda.is_available(): + feature, target = feature.cuda(), target.cuda() + + logit = self.__model(feature) + + F.cross_entropy(logit, target, reduction="sum") + + predictions = torch.max(logit, 1)[1].view(target.size()) + corrects += (predictions.data == target.data).sum() + + return 100.0 * corrects / len(data_iter.dataset) + + def fit(self, X, y, sample_weight=None): + train_iter, dev_iter = self.__preprocess(X, y, sample_weight) + embed_num = len(self.__text_field.vocab) + class_num = len(self.__label_field.vocab) - 1 + kernel_sizes = [int(k) for k in self.kernel_sizes.split(",")] + self.__model = CNNText(embed_num, self.embed_dim, class_num, + self.kernel_num, kernel_sizes, self.dropout, + self.static) + + if self.cuda and torch.cuda.is_available(): + torch.cuda.set_device(self.device) + self.__model.cuda() + + optimizer = torch.optim.Adam(self.__model.parameters(), lr=self.lr, + weight_decay=self.max_norm) + steps, best_acc, last_step = 0, 0, 0 + + self.__model.train() + + for epoch in range(self.epochs): + for batch in train_iter: + feature, target = batch.text, batch.label + + feature.data.t_() + target.data.sub_(1) + + if self.cuda and torch.cuda.is_available(): + feature, target = feature.cuda(), target.cuda() + + optimizer.zero_grad() + F.cross_entropy(self.__model(feature), target).backward() + optimizer.step() + + steps += 1 + + if steps % self.test_interval == 0: + dev_acc = self.__eval(dev_iter) + + if dev_acc > best_acc: + best_acc = dev_acc + last_step = steps + + if self.save_best: + best_model = deepcopy(self.__model) + elif steps - last_step >= self.early_stop: + if self.save_best: + self.__model = best_model + + return self + + self.__model = best_model if self.save_best else self.__model + return self + + def predict(self, X): + y_pred = [] + max_krnl_sz = int(self.kernel_sizes[self.kernel_sizes.rfind(",") + 1:]) + + for text in X: + assert isinstance(text, str) + + text = self.__text_field.preprocess(text) + + if len(text) < max_krnl_sz: + most_common = self.__label_field.vocab.freqs.most_common(1)[0] + + y_pred.append(most_common[0]) + continue + + self.__model.eval() + + text = [[self.__text_field.vocab.stoi[x] for x in text]] + x = Variable(torch.tensor(text)) + x = x.cuda() if self.cuda and torch.cuda.is_available() else x + _, predicted = torch.max(self.__model(x), 1) + + y_pred.append(self.__label_field.vocab.itos[predicted.data[0] + 1]) + + return y_pred + + def __preprocess(self, X, y, sample_weight): + self.__text_field = Field(lower=True) + self.__label_field = Field(sequential=False) + self.__text_field.preprocessing = Pipeline(self.__preprocess_text) + fields = [("text", self.__text_field), ("label", self.__label_field)] + weights = [1 for yi in y] if sample_weight is None else sample_weight + exmpl = [Example.fromlist([X[i], y[i]], fields) for i in range(len(X))] + + if self.class_weight is not None: + cw = self.class_weight + + if isinstance(cw, str) and cw == "balanced": + counter = Counter(y) + cw = [len(y) / (len(counter) * counter[yi]) for yi in y] + weights = [weights[i] * cw[i] for i in range(len(y))] + elif isinstance(cw, dict): + cw = [cw[yi] for yi in y] + weights = [weights[i] * cw[i] for i in range(len(y))] + + min_weight = min(weights) + weights = [round(w / 
min_weight) for w in weights] + + for i in range(len(X)): + if weights[i] > 1: + Xi = [X[i] for j in range(weights[i] - 1)] + exmpl += [Example.fromlist([x, y[i]], fields) for x in Xi] + + train_data, dev_data = Dataset(exmpl, fields).split(self.split_ratio, + self.random_state,) + + self.__text_field.build_vocab(train_data, dev_data, + vectors=self.vectors) + self.__label_field.build_vocab(train_data, dev_data) + + batch_sizes = (self.batch_size, len(dev_data)) + return Iterator.splits((train_data, dev_data), batch_sizes=batch_sizes, + sort_key=lambda ex: len(ex.text), repeat=False) + + def __preprocess_text(self, text): + if self.preprocessor is None: + return self.__clean_str(text) + + return self.preprocessor(text) + + +class CNNText(nn.Module): + def __init__(self, embed_num, embed_dim, class_num, kernel_num, + kernel_sizes, dropout, static): + super(CNNText, self).__init__() + + self.__embed = nn.Embedding(embed_num, embed_dim) + Ks = kernel_sizes + module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] + self.__convs1 = nn.ModuleList(module_list) + self.__dropout = nn.Dropout(dropout) + self.__fc1 = nn.Linear(len(Ks) * kernel_num, class_num) + self.__static = static + + def conv_and_pool(self, x, conv): + x = F.relu(conv(x)).squeeze(3) + return F.max_pool1d(x, x.size(2)).squeeze(2) + + def forward(self, x): + x = self.__embed(x) + + if self.__static: + x = Variable(x) + + x = x.unsqueeze(1) + x = [F.relu(conv(x)).squeeze(3) for conv in self.__convs1] + x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] + return self.__fc1(self.__dropout(torch.cat(x, 1))) diff --git a/main.py b/main.py deleted file mode 100755 index dd222a6..0000000 --- a/main.py +++ /dev/null @@ -1,116 +0,0 @@ -#! /usr/bin/env python -import os -import argparse -import datetime -import torch -import torchtext.data as data -import torchtext.datasets as datasets -import model -import train -import mydatasets - - -parser = argparse.ArgumentParser(description='CNN text classificer') -# learning -parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]') -parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]') -parser.add_argument('-batch-size', type=int, default=64, help='batch size for training [default: 64]') -parser.add_argument('-log-interval', type=int, default=1, help='how many steps to wait before logging training status [default: 1]') -parser.add_argument('-test-interval', type=int, default=100, help='how many steps to wait before testing [default: 100]') -parser.add_argument('-save-interval', type=int, default=500, help='how many steps to wait before saving [default:500]') -parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the snapshot') -parser.add_argument('-early-stop', type=int, default=1000, help='iteration numbers to stop without performance increasing') -parser.add_argument('-save-best', type=bool, default=True, help='whether to save when get best performance') -# data -parser.add_argument('-shuffle', action='store_true', default=False, help='shuffle the data every epoch') -# model -parser.add_argument('-dropout', type=float, default=0.5, help='the probability for dropout [default: 0.5]') -parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]') -parser.add_argument('-embed-dim', type=int, default=128, help='number of embedding dimension [default: 128]') -parser.add_argument('-kernel-num', type=int, default=100, 
help='number of each kind of kernel') -parser.add_argument('-kernel-sizes', type=str, default='3,4,5', help='comma-separated kernel size to use for convolution') -parser.add_argument('-static', action='store_true', default=False, help='fix the embedding') -# device -parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data, -1 mean cpu [default: -1]') -parser.add_argument('-no-cuda', action='store_true', default=False, help='disable the gpu') -# option -parser.add_argument('-snapshot', type=str, default=None, help='filename of model snapshot [default: None]') -parser.add_argument('-predict', type=str, default=None, help='predict the sentence given') -parser.add_argument('-test', action='store_true', default=False, help='train or test') -args = parser.parse_args() - - -# load SST dataset -def sst(text_field, label_field, **kargs): - train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True) - text_field.build_vocab(train_data, dev_data, test_data) - label_field.build_vocab(train_data, dev_data, test_data) - train_iter, dev_iter, test_iter = data.BucketIterator.splits( - (train_data, dev_data, test_data), - batch_sizes=(args.batch_size, - len(dev_data), - len(test_data)), - **kargs) - return train_iter, dev_iter, test_iter - - -# load MR dataset -def mr(text_field, label_field, **kargs): - train_data, dev_data = mydatasets.MR.splits(text_field, label_field) - text_field.build_vocab(train_data, dev_data) - label_field.build_vocab(train_data, dev_data) - train_iter, dev_iter = data.Iterator.splits( - (train_data, dev_data), - batch_sizes=(args.batch_size, len(dev_data)), - **kargs) - return train_iter, dev_iter - - -# load data -print("\nLoading data...") -text_field = data.Field(lower=True) -label_field = data.Field(sequential=False) -train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False) -# train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False) - - -# update args and print -args.embed_num = len(text_field.vocab) -args.class_num = len(label_field.vocab) - 1 -args.cuda = (not args.no_cuda) and torch.cuda.is_available(); del args.no_cuda -args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')] -args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) - -print("\nParameters:") -for attr, value in sorted(args.__dict__.items()): - print("\t{}={}".format(attr.upper(), value)) - - -# model -cnn = model.CNN_Text(args) -if args.snapshot is not None: - print('\nLoading model from {}...'.format(args.snapshot)) - cnn.load_state_dict(torch.load(args.snapshot)) - -if args.cuda: - torch.cuda.set_device(args.device) - cnn = cnn.cuda() - - -# train or predict -if args.predict is not None: - label = train.predict(args.predict, cnn, text_field, label_field, args.cuda) - print('\n[Text] {}\n[Label] {}\n'.format(args.predict, label)) -elif args.test: - try: - train.eval(test_iter, cnn, args) - except Exception as e: - print("\nSorry. 
The test dataset doesn't exist.\n") -else: - print() - try: - train.train(train_iter, dev_iter, cnn, args) - except KeyboardInterrupt: - print('\n' + '-' * 89) - print('Exiting from training early') - diff --git a/model.py b/model.py deleted file mode 100644 index ce0158b..0000000 --- a/model.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable - - -class CNN_Text(nn.Module): - - def __init__(self, args): - super(CNN_Text, self).__init__() - self.args = args - - V = args.embed_num - D = args.embed_dim - C = args.class_num - Ci = 1 - Co = args.kernel_num - Ks = args.kernel_sizes - - self.embed = nn.Embedding(V, D) - # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks] - self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks]) - ''' - self.conv13 = nn.Conv2d(Ci, Co, (3, D)) - self.conv14 = nn.Conv2d(Ci, Co, (4, D)) - self.conv15 = nn.Conv2d(Ci, Co, (5, D)) - ''' - self.dropout = nn.Dropout(args.dropout) - self.fc1 = nn.Linear(len(Ks)*Co, C) - - def conv_and_pool(self, x, conv): - x = F.relu(conv(x)).squeeze(3) # (N, Co, W) - x = F.max_pool1d(x, x.size(2)).squeeze(2) - return x - - def forward(self, x): - x = self.embed(x) # (N, W, D) - - if self.args.static: - x = Variable(x) - - x = x.unsqueeze(1) # (N, Ci, W, D) - - x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] # [(N, Co, W), ...]*len(Ks) - - x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] # [(N, Co), ...]*len(Ks) - - x = torch.cat(x, 1) - - ''' - x1 = self.conv_and_pool(x,self.conv13) #(N,Co) - x2 = self.conv_and_pool(x,self.conv14) #(N,Co) - x3 = self.conv_and_pool(x,self.conv15) #(N,Co) - x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co) - ''' - x = self.dropout(x) # (N, len(Ks)*Co) - logit = self.fc1(x) # (N, C) - return logit diff --git a/mydatasets.py b/mydatasets.py deleted file mode 100644 index 8fddfce..0000000 --- a/mydatasets.py +++ /dev/null @@ -1,110 +0,0 @@ -import re -import os -import random -import tarfile -import urllib -from torchtext import data - - -class TarDataset(data.Dataset): - """Defines a Dataset loaded from a downloadable tar archive. - - Attributes: - url: URL where the tar archive can be downloaded. - filename: Filename of the downloaded tar archive. - dirname: Name of the top-level directory within the zip archive that - contains the data files. - """ - - @classmethod - def download_or_unzip(cls, root): - path = os.path.join(root, cls.dirname) - if not os.path.isdir(path): - tpath = os.path.join(root, cls.filename) - if not os.path.isfile(tpath): - print('downloading') - urllib.request.urlretrieve(cls.url, tpath) - with tarfile.open(tpath, 'r') as tfile: - print('extracting') - tfile.extractall(root) - return os.path.join(path, '') - - -class MR(TarDataset): - - url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz' - filename = 'rt-polaritydata.tar' - dirname = 'rt-polaritydata' - - @staticmethod - def sort_key(ex): - return len(ex.text) - - def __init__(self, text_field, label_field, path=None, examples=None, **kwargs): - """Create an MR dataset instance given a path and fields. - - Arguments: - text_field: The field that will be used for text data. - label_field: The field that will be used for label data. - path: Path to the data file. - examples: The examples contain all the data. - Remaining keyword arguments: Passed to the constructor of - data.Dataset. - """ - def clean_str(string): - """ - Tokenization/string cleaning for all datasets except for SST. 
- Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py - """ - string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) - string = re.sub(r"\'s", " \'s", string) - string = re.sub(r"\'ve", " \'ve", string) - string = re.sub(r"n\'t", " n\'t", string) - string = re.sub(r"\'re", " \'re", string) - string = re.sub(r"\'d", " \'d", string) - string = re.sub(r"\'ll", " \'ll", string) - string = re.sub(r",", " , ", string) - string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " \( ", string) - string = re.sub(r"\)", " \) ", string) - string = re.sub(r"\?", " \? ", string) - string = re.sub(r"\s{2,}", " ", string) - return string.strip() - - text_field.preprocessing = data.Pipeline(clean_str) - fields = [('text', text_field), ('label', label_field)] - - if examples is None: - path = self.dirname if path is None else path - examples = [] - with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f: - examples += [ - data.Example.fromlist([line, 'negative'], fields) for line in f] - with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f: - examples += [ - data.Example.fromlist([line, 'positive'], fields) for line in f] - super(MR, self).__init__(examples, fields, **kwargs) - - @classmethod - def splits(cls, text_field, label_field, dev_ratio=.1, shuffle=True, root='.', **kwargs): - """Create dataset objects for splits of the MR dataset. - - Arguments: - text_field: The field that will be used for the sentence. - label_field: The field that will be used for label data. - dev_ratio: The ratio that will be used to get split validation dataset. - shuffle: Whether to shuffle the data before split. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose trees - subdirectory the data files will be stored. - train: The filename of the train data. Default: 'train.txt'. - Remaining keyword arguments: Passed to the splits method of - Dataset. 
- """ - path = cls.download_or_unzip(root) - examples = cls(text_field, label_field, path=path, **kwargs).examples - if shuffle: random.shuffle(examples) - dev_index = -1 * int(dev_ratio*len(examples)) - - return (cls(text_field, label_field, examples=examples[:dev_index]), - cls(text_field, label_field, examples=examples[dev_index:])) diff --git a/train.py b/train.py deleted file mode 100644 index 7f90aaa..0000000 --- a/train.py +++ /dev/null @@ -1,106 +0,0 @@ -import os -import sys -import torch -import torch.autograd as autograd -import torch.nn.functional as F - - -def train(train_iter, dev_iter, model, args): - if args.cuda: - model.cuda() - - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - - steps = 0 - best_acc = 0 - last_step = 0 - model.train() - for epoch in range(1, args.epochs+1): - for batch in train_iter: - feature, target = batch.text, batch.label - feature.data.t_(), target.data.sub_(1) # batch first, index align - if args.cuda: - feature, target = feature.cuda(), target.cuda() - - optimizer.zero_grad() - logit = model(feature) - - #print('logit vector', logit.size()) - #print('target vector', target.size()) - loss = F.cross_entropy(logit, target) - loss.backward() - optimizer.step() - - steps += 1 - if steps % args.log_interval == 0: - corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum() - accuracy = 100.0 * corrects/batch.batch_size - sys.stdout.write( - '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(steps, - loss.data[0], - accuracy, - corrects, - batch.batch_size)) - if steps % args.test_interval == 0: - dev_acc = eval(dev_iter, model, args) - if dev_acc > best_acc: - best_acc = dev_acc - last_step = steps - if args.save_best: - save(model, args.save_dir, 'best', steps) - else: - if steps - last_step >= args.early_stop: - print('early stop by {} steps.'.format(args.early_stop)) - elif steps % args.save_interval == 0: - save(model, args.save_dir, 'snapshot', steps) - - -def eval(data_iter, model, args): - model.eval() - corrects, avg_loss = 0, 0 - for batch in data_iter: - feature, target = batch.text, batch.label - feature.data.t_(), target.data.sub_(1) # batch first, index align - if args.cuda: - feature, target = feature.cuda(), target.cuda() - - logit = model(feature) - loss = F.cross_entropy(logit, target, size_average=False) - - avg_loss += loss.data[0] - corrects += (torch.max(logit, 1) - [1].view(target.size()).data == target.data).sum() - - size = len(data_iter.dataset) - avg_loss /= size - accuracy = 100.0 * corrects/size - print('\nEvaluation - loss: {:.6f} acc: {:.4f}%({}/{}) \n'.format(avg_loss, - accuracy, - corrects, - size)) - return accuracy - - -def predict(text, model, text_field, label_feild, cuda_flag): - assert isinstance(text, str) - model.eval() - # text = text_field.tokenize(text) - text = text_field.preprocess(text) - text = [[text_field.vocab.stoi[x] for x in text]] - x = torch.tensor(text) - x = autograd.Variable(x) - if cuda_flag: - x = x.cuda() - print(x) - output = model(x) - _, predicted = torch.max(output, 1) - #return label_feild.vocab.itos[predicted.data[0][0]+1] - return label_feild.vocab.itos[predicted.data[0]+1] - - -def save(model, save_dir, save_prefix, steps): - if not os.path.isdir(save_dir): - os.makedirs(save_dir) - save_prefix = os.path.join(save_dir, save_prefix) - save_path = '{}_steps_{}.pt'.format(save_prefix, steps) - torch.save(model.state_dict(), save_path) From 8b00bd7cc4f0711241160a40cc0fe60981a050e8 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 17 Apr 2019 
22:12:17 -0700 Subject: [PATCH 03/31] Ignore training samples shorter than the maximum kernel size --- README.md | 2 ++ cnn_text_classification.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f25b726..e720512 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,8 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. * Weights are represented by upsampling. * Only supports pre-trained word vectors from TorchText. * The random_state parameter probably only works with integers or None. +* Training samples shorter than the maximum kernel size are ignored. +* Test samples shorter than the maximum kernel size are classified as the most common class found during training. * Features my idiosyncratic coding style. ## To Do diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 847bcd3..9171076 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -158,9 +158,21 @@ def __preprocess(self, X, y, sample_weight): self.__text_field = Field(lower=True) self.__label_field = Field(sequential=False) self.__text_field.preprocessing = Pipeline(self.__preprocess_text) + max_krnl_sz = int(self.kernel_sizes[self.kernel_sizes.rfind(",") + 1:]) + X, y = list(X), list(y) + sample_weight = None if sample_weight is None else list(sample_weight) + + for i in range(len(X) - 1, -1, -1): + if len(self.__text_field.preprocess(X[i])) < max_krnl_sz: + del X[i] + del y[i] + + if sample_weight is not None: + del sample_weight[i] + fields = [("text", self.__text_field), ("label", self.__label_field)] - weights = [1 for yi in y] if sample_weight is None else sample_weight exmpl = [Example.fromlist([X[i], y[i]], fields) for i in range(len(X))] + weights = [1 for yi in y] if sample_weight is None else sample_weight if self.class_weight is not None: cw = self.class_weight From aa8b7f19e3984b54a49a1921bff8cc9552fad760 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 24 Apr 2019 17:39:29 -0700 Subject: [PATCH 04/31] Fixed pretrained vector handling and added console output option --- cnn_text_classification.py | 42 ++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 9171076..94575c7 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -5,6 +5,7 @@ from collections import Counter from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin +from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -15,7 +16,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", static=False, device=-1, cuda=True, class_weight=None, split_ratio=0.9, random_state=None, vectors=None, - preprocessor=None): + preprocessor=None, verbose=0): self.lr = lr self.epochs = epochs self.batch_size = batch_size @@ -35,6 +36,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.random_state = random_state self.vectors = vectors self.preprocessor = preprocessor + self.verbose = verbose def __clean_str(self, string): string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) @@ -76,13 +78,15 @@ def __eval(self, data_iter): return 100.0 * corrects / len(data_iter.dataset) def fit(self, X, y, sample_weight=None): + start = time() if self.verbose > 0 else None train_iter, dev_iter = self.__preprocess(X, y, 
sample_weight) embed_num = len(self.__text_field.vocab) class_num = len(self.__label_field.vocab) - 1 kernel_sizes = [int(k) for k in self.kernel_sizes.split(",")] self.__model = CNNText(embed_num, self.embed_dim, class_num, self.kernel_num, kernel_sizes, self.dropout, - self.static) + self.static, + vectors=self.__text_field.vocab.vectors) if self.cuda and torch.cuda.is_available(): torch.cuda.set_device(self.device) @@ -123,9 +127,15 @@ def fit(self, X, y, sample_weight=None): if self.save_best: self.__model = best_model + if self.verbose > 0: + self.__print_elapsed_time(time() - start) return self self.__model = best_model if self.save_best else self.__model + + if self.verbose > 0: + self.__print_elapsed_time(time() - start) + return self def predict(self, X): @@ -208,15 +218,39 @@ def __preprocess_text(self, text): if self.preprocessor is None: return self.__clean_str(text) - return self.preprocessor(text) + return self.__clean_str(self.preprocessor(text)) + + def __print_elapsed_time(self, seconds): + sc = round(seconds) + mn = int(sc / 60) + sc = sc % 60 + hr = int(mn / 60) + mn = mn % 60 + hr = "{} hour{}".format(hr, "s" if hr > 1 else "") if hr > 0 else "" + mn = "{} minute{}".format(mn, "s" if mn > 1 else "") if mn > 0 else "" + sc = "{} second{}".format(sc, "s" if sc > 1 else "") if sc > 0 else "" + times = [t for t in [hr, mn, sc] if len(t) > 0] + + if len(times) == 3: + times = " and ".format(", ".format(hr, mn), sc) + elif len(times) == 2: + times = " and ".join(times) + else: + times = times[0] + + print("Completed training in {}.".format(times)) class CNNText(nn.Module): def __init__(self, embed_num, embed_dim, class_num, kernel_num, - kernel_sizes, dropout, static): + kernel_sizes, dropout, static, vectors=None): super(CNNText, self).__init__() self.__embed = nn.Embedding(embed_num, embed_dim) + + if vectors is not None: + self.__embed = self.__embed.from_pretrained(vectors) + Ks = kernel_sizes module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] self.__convs1 = nn.ModuleList(module_list) From 609a73ddcee8d3a12a9ae2c7f6e06aac6013c422 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 25 Apr 2019 10:30:05 -0700 Subject: [PATCH 05/31] Added support for alternate scoring methods Also made a possible optimization and did some code cleanup. --- README.md | 9 +++++++-- cnn_text_classification.py | 25 +++++++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index e720512..f8f4906 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,6 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. * Features my idiosyncratic coding style. ## To Do -* Add support for different scoring methods (balanced accuracy, recall, etc.). * Add support for cross-validation during training. ## Parameters @@ -77,9 +76,15 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. **vectors : string, optional (default=None)** Which pretrained TorchText vectors to use (see [torchtext.vocab.pretrained_aliases](https://torchtext.readthedocs.io/en/latest/vocab.html#pretrained-aliases) for options). -**preprocessor : callable or None (default=None)** +**preprocessor : callable or None, optional (default=None)** Override default string preprocessing. +**scoring : callable or None, optional (default=sklearn.metrics.accuracy_score)** + Scoring method for testing model performance during fitting. + +**verbose : integer, optional (default=0)** + Controls the verbosity when fitting. 
+ ## Methods **fit(X, y, sample_weight=None)** Train the CNN classifier from the training set (X, y). diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 94575c7..0b81ecb 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -5,6 +5,7 @@ from collections import Counter from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.metrics import accuracy_score from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -16,7 +17,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", static=False, device=-1, cuda=True, class_weight=None, split_ratio=0.9, random_state=None, vectors=None, - preprocessor=None, verbose=0): + preprocessor=None, scoring=accuracy_score, verbose=0): self.lr = lr self.epochs = epochs self.batch_size = batch_size @@ -36,6 +37,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.random_state = random_state self.vectors = vectors self.preprocessor = preprocessor + self.scoring = scoring self.verbose = verbose def __clean_str(self, string): @@ -57,7 +59,8 @@ def __clean_str(self, string): def __eval(self, data_iter): self.__model.eval() - corrects = 0 + preds = [] + targets = [] for batch in data_iter: feature, target = batch.text, batch.label @@ -72,10 +75,10 @@ def __eval(self, data_iter): F.cross_entropy(logit, target, reduction="sum") - predictions = torch.max(logit, 1)[1].view(target.size()) - corrects += (predictions.data == target.data).sum() + preds += torch.max(logit, 1)[1].view(target.size()).data.tolist() + targets += target.data.tolist() - return 100.0 * corrects / len(data_iter.dataset) + return self.scoring(targets, preds) def fit(self, X, y, sample_weight=None): start = time() if self.verbose > 0 else None @@ -95,6 +98,7 @@ def fit(self, X, y, sample_weight=None): optimizer = torch.optim.Adam(self.__model.parameters(), lr=self.lr, weight_decay=self.max_norm) steps, best_acc, last_step = 0, 0, 0 + active = True self.__model.train() @@ -124,18 +128,18 @@ def fit(self, X, y, sample_weight=None): if self.save_best: best_model = deepcopy(self.__model) elif steps - last_step >= self.early_stop: - if self.save_best: - self.__model = best_model + active = False + break - if self.verbose > 0: - self.__print_elapsed_time(time() - start) - return self + if not active: + break self.__model = best_model if self.save_best else self.__model if self.verbose > 0: self.__print_elapsed_time(time() - start) + torch.cuda.empty_cache() return self def predict(self, X): @@ -162,6 +166,7 @@ def predict(self, X): y_pred.append(self.__label_field.vocab.itos[predicted.data[0] + 1]) + torch.cuda.empty_cache() return y_pred def __preprocess(self, X, y, sample_weight): From 88f693a634405cea20bde6da90d6225f983ba097 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Fri, 26 Apr 2019 22:06:28 -0700 Subject: [PATCH 06/31] Added/fixed console output Also did some more code cleanup --- cnn_text_classification.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 0b81ecb..f078b98 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -81,6 +81,12 @@ def __eval(self, data_iter): return self.scoring(targets, preds) def fit(self, X, y, sample_weight=None): + if self.verbose > 1: + params = self.get_params().items() + + 
print("Fitting with the following parameters:") + print("\n".join([": ".join([k, str(v)]) for k, v in params])) + start = time() if self.verbose > 0 else None train_iter, dev_iter = self.__preprocess(X, y, sample_weight) embed_num = len(self.__text_field.vocab) @@ -237,7 +243,7 @@ def __print_elapsed_time(self, seconds): times = [t for t in [hr, mn, sc] if len(t) > 0] if len(times) == 3: - times = " and ".format(", ".format(hr, mn), sc) + times = " and ".join(", ".join(hr, mn), sc) elif len(times) == 2: times = " and ".join(times) else: @@ -268,12 +274,7 @@ def conv_and_pool(self, x, conv): return F.max_pool1d(x, x.size(2)).squeeze(2) def forward(self, x): - x = self.__embed(x) - - if self.__static: - x = Variable(x) - - x = x.unsqueeze(1) - x = [F.relu(conv(x)).squeeze(3) for conv in self.__convs1] + x = Variable(self.__embed(x)) if self.__static else self.__embed(x) + x = [F.relu(conv(x.unsqueeze(1))).squeeze(3) for conv in self.__convs1] x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] return self.__fc1(self.__dropout(torch.cat(x, 1))) From 0ae9906a6ed2920c33d7fc7c67d2843d3f28eded Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 8 May 2019 20:49:30 -0700 Subject: [PATCH 07/31] Improved random seed handling --- cnn_text_classification.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index f078b98..dbf7032 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -81,6 +81,12 @@ def __eval(self, data_iter): return self.scoring(targets, preds) def fit(self, X, y, sample_weight=None): + if self.random_state is not None: + torch.manual_seed(self.random_state) + + torch.backends.cudnn.deterministic = self.random_state is not None + torch.backends.cudnn.benchmark = self.random_state is None + if self.verbose > 1: params = self.get_params().items() From 4a6d285d6a5c72243412aaf99b23e443fafd362d Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 20 Jun 2019 07:44:16 -0400 Subject: [PATCH 08/31] Fixed "set_storage_offset is not allowed" error Also fixed a bug in training time output function. 
--- cnn_text_classification.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index dbf7032..4b587a6 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -59,14 +59,11 @@ def __clean_str(self, string): def __eval(self, data_iter): self.__model.eval() - preds = [] - targets = [] + preds, targets = [], [] for batch in data_iter: feature, target = batch.text, batch.label - - feature.data.t_() - target.data.sub_(1) + feature, target = feature.data.t(), target.data.sub(1) if self.cuda and torch.cuda.is_available(): feature, target = feature.cuda(), target.cuda() @@ -117,9 +114,7 @@ def fit(self, X, y, sample_weight=None): for epoch in range(self.epochs): for batch in train_iter: feature, target = batch.text, batch.label - - feature.data.t_() - target.data.sub_(1) + feature, target = feature.data.t(), target.data.sub(1) if self.cuda and torch.cuda.is_available(): feature, target = feature.cuda(), target.cuda() @@ -249,7 +244,7 @@ def __print_elapsed_time(self, seconds): times = [t for t in [hr, mn, sc] if len(t) > 0] if len(times) == 3: - times = " and ".join(", ".join(hr, mn), sc) + times = " and ".join([", ".join([hr, mn]), sc]) elif len(times) == 2: times = " and ".join(times) else: From b5141b2bb6b3359fad959bd7ea8879bf69e5d6b5 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 20 Jun 2019 21:21:31 -0400 Subject: [PATCH 09/31] Removed note about weights from known issues See "Buda M, Maki A, Mazurowski MA. A systematic study of the class imbalance problem in convolutional neural networks. Neural Networks. 2018 Oct 1;106:249-59" for justification. Also fixed another bug in the training time output function. --- README.md | 1 - cnn_text_classification.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index f8f4906..05a87a2 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. ## Known Issues * The predict method is probably not as efficient as it could be. * Doesn't play well with GridSearchCV if num_jobs isn't 1. -* Weights are represented by upsampling. * Only supports pre-trained word vectors from TorchText. * The random_state parameter probably only works with integers or None. * Training samples shorter than the maximum kernel size are ignored. diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 4b587a6..d84fc58 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -248,7 +248,7 @@ def __print_elapsed_time(self, seconds): elif len(times) == 2: times = " and ".join(times) else: - times = times[0] + times = times[0] if len(times) > 0 else "less than 1 second" print("Completed training in {}.".format(times)) From a7c9629d19c50c3651f087b7825db93d26014b7f Mon Sep 17 00:00:00 2001 From: rriva002 Date: Sun, 30 Jun 2019 19:44:48 -0400 Subject: [PATCH 10/31] Fixed bug with handling of non-default scoring functions --- README.md | 1 + cnn_text_classification.py | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 05a87a2..877eed7 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. ## To Do * Add support for cross-validation during training. +* Implement sample weights in eval scoring? 
## Parameters **lr : float, optional (default=0.01)** diff --git a/cnn_text_classification.py b/cnn_text_classification.py index d84fc58..9cf4b01 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -5,7 +5,7 @@ from collections import Counter from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.metrics import accuracy_score +from sklearn.metrics import accuracy_score, make_scorer from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -17,7 +17,8 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", static=False, device=-1, cuda=True, class_weight=None, split_ratio=0.9, random_state=None, vectors=None, - preprocessor=None, scoring=accuracy_score, verbose=0): + preprocessor=None, scoring=make_scorer(accuracy_score), + verbose=0): self.lr = lr self.epochs = epochs self.batch_size = batch_size @@ -75,7 +76,9 @@ def __eval(self, data_iter): preds += torch.max(logit, 1)[1].view(target.size()).data.tolist() targets += target.data.tolist() - return self.scoring(targets, preds) + preds = [self.__label_field.vocab.itos[pred + 1] for pred in preds] + targets = [self.__label_field.vocab.itos[targ + 1] for targ in targets] + return self.scoring(_Eval(preds), None, targets) def fit(self, X, y, sample_weight=None): if self.random_state is not None: @@ -95,10 +98,9 @@ def fit(self, X, y, sample_weight=None): embed_num = len(self.__text_field.vocab) class_num = len(self.__label_field.vocab) - 1 kernel_sizes = [int(k) for k in self.kernel_sizes.split(",")] - self.__model = CNNText(embed_num, self.embed_dim, class_num, - self.kernel_num, kernel_sizes, self.dropout, - self.static, - vectors=self.__text_field.vocab.vectors) + self.__model = _CNNText(embed_num, self.embed_dim, class_num, + self.kernel_num, kernel_sizes, self.dropout, + self.static, self.__text_field.vocab.vectors) if self.cuda and torch.cuda.is_available(): torch.cuda.set_device(self.device) @@ -253,10 +255,10 @@ def __print_elapsed_time(self, seconds): print("Completed training in {}.".format(times)) -class CNNText(nn.Module): +class _CNNText(nn.Module): def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout, static, vectors=None): - super(CNNText, self).__init__() + super(_CNNText, self).__init__() self.__embed = nn.Embedding(embed_num, embed_dim) @@ -279,3 +281,10 @@ def forward(self, x): x = [F.relu(conv(x.unsqueeze(1))).squeeze(3) for conv in self.__convs1] x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] return self.__fc1(self.__dropout(torch.cat(x, 1))) + +class _Eval(): + def __init__(self, preds): + self.__preds = preds + + def predict(self, X): + return self.__preds From f2d82e134af61a613aa33b36e95fa6d996cbe854 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 3 Jul 2019 18:49:43 -0400 Subject: [PATCH 11/31] Added support for alternate activation functions --- README.md | 23 +++++++++++++---------- cnn_text_classification.py | 33 +++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 877eed7..c6a8638 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. ## Known Issues * The predict method is probably not as efficient as it could be. -* Doesn't play well with GridSearchCV if num_jobs isn't 1. 
+* Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). * Only supports pre-trained word vectors from TorchText. * The random_state parameter probably only works with integers or None. * Training samples shorter than the maximum kernel size are ignored. @@ -64,23 +64,26 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. **cuda : boolean, optional (default=True)** If true, use the GPU if available. -**class_weight : dict, "balanced" or None, optional (default=None)** - Weights associated with each class (see class_weight parameter in existing scikit-learn classifiers). - -**split_ratio : float, optional (default=0.9)** - Ratio of training data used for training. The remainder will be used for validation. +** activation_func : string, optional (default='relu')** + Activation function. If 'relu' or 'tanh', uses rectified linear unit or hyperbolic tangent, respectively. Otherwise, uses no activation function (f(x) = x). -**random_state : integer, optional (default=None)** - Seed for the random number generator. +**scoring : callable or None, optional (default=sklearn.metrics.accuracy_score)** + Scoring method for testing model performance during fitting. **vectors : string, optional (default=None)** Which pretrained TorchText vectors to use (see [torchtext.vocab.pretrained_aliases](https://torchtext.readthedocs.io/en/latest/vocab.html#pretrained-aliases) for options). +**split_ratio : float, optional (default=0.9)** + Ratio of training data used for training. The remainder will be used for validation. + **preprocessor : callable or None, optional (default=None)** Override default string preprocessing. -**scoring : callable or None, optional (default=sklearn.metrics.accuracy_score)** - Scoring method for testing model performance during fitting. +**class_weight : dict, "balanced" or None, optional (default=None)** + Weights associated with each class (see class_weight parameter in existing scikit-learn classifiers). + +**random_state : integer, optional (default=None)** + Seed for the random number generator. **verbose : integer, optional (default=0)** Controls the verbosity when fitting. 
diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 9cf4b01..1cc372e 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -15,10 +15,10 @@ class CNNClassifier(BaseEstimator, ClassifierMixin): def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, early_stop=1000, save_best=True, dropout=0.5, max_norm=0.0, embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", - static=False, device=-1, cuda=True, class_weight=None, - split_ratio=0.9, random_state=None, vectors=None, - preprocessor=None, scoring=make_scorer(accuracy_score), - verbose=0): + static=False, device=-1, cuda=True, activation_func="relu", + scoring=make_scorer(accuracy_score), vectors=None, + split_ratio=0.9, preprocessor=None, class_weight=None, + random_state=None, verbose=0): self.lr = lr self.epochs = epochs self.batch_size = batch_size @@ -33,12 +33,13 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.static = static self.device = device self.cuda = cuda - self.class_weight = class_weight - self.split_ratio = split_ratio - self.random_state = random_state + self.activation_func = activation_func + self.scoring = scoring self.vectors = vectors + self.split_ratio = split_ratio self.preprocessor = preprocessor - self.scoring = scoring + self.class_weight = class_weight + self.random_state = random_state self.verbose = verbose def __clean_str(self, string): @@ -100,7 +101,8 @@ def fit(self, X, y, sample_weight=None): kernel_sizes = [int(k) for k in self.kernel_sizes.split(",")] self.__model = _CNNText(embed_num, self.embed_dim, class_num, self.kernel_num, kernel_sizes, self.dropout, - self.static, self.__text_field.vocab.vectors) + self.static, self.activation_func, + vectors=self.__text_field.vocab.vectors) if self.cuda and torch.cuda.is_available(): torch.cuda.set_device(self.device) @@ -257,7 +259,7 @@ def __print_elapsed_time(self, seconds): class _CNNText(nn.Module): def __init__(self, embed_num, embed_dim, class_num, kernel_num, - kernel_sizes, dropout, static, vectors=None): + kernel_sizes, dropout, static, activation_func, vectors=None): super(_CNNText, self).__init__() self.__embed = nn.Embedding(embed_num, embed_dim) @@ -272,13 +274,16 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, self.__fc1 = nn.Linear(len(Ks) * kernel_num, class_num) self.__static = static - def conv_and_pool(self, x, conv): - x = F.relu(conv(x)).squeeze(3) - return F.max_pool1d(x, x.size(2)).squeeze(2) + if activation_func == "relu": + self.__f = F.relu + elif activation_func == "tanh": + self.__f = torch.tanh + else: + self.__f = lambda x: x def forward(self, x): x = Variable(self.__embed(x)) if self.__static else self.__embed(x) - x = [F.relu(conv(x.unsqueeze(1))).squeeze(3) for conv in self.__convs1] + x = [self.__f(cnv(x.unsqueeze(1))).squeeze(3) for cnv in self.__convs1] x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] return self.__fc1(self.__dropout(torch.cat(x, 1))) From aa67e712600ec366347eecfca8fe9f3b17251693 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Mon, 8 Jul 2019 18:41:42 -0400 Subject: [PATCH 12/31] Fixed bug related to saving best model Also did some code cleanup. 
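The save-best fix binds best_model to the freshly built model before the training loop so it always exists, and the cleanup replaces the construct-then-replace embedding pattern with the nn.Embedding.from_pretrained classmethod. A minimal sketch of that classmethod's behavior (the weight matrix here is a toy tensor; shapes are illustrative only):

```python
import torch
import torch.nn as nn

# nn.Embedding.from_pretrained is a classmethod: it builds the layer directly
# from a weight matrix, so instantiating nn.Embedding first was redundant.
vectors = torch.randn(100, 50)      # pretend vocab of 100 words, 50-dim vectors
embed = nn.Embedding.from_pretrained(vectors)

print(embed.embedding_dim)          # 50, inferred from the matrix
print(embed.weight.requires_grad)   # False: from_pretrained freezes by default
```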
--- cnn_text_classification.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 1cc372e..2206e17 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -64,8 +64,7 @@ def __eval(self, data_iter): preds, targets = [], [] for batch in data_iter: - feature, target = batch.text, batch.label - feature, target = feature.data.t(), target.data.sub(1) + feature, target = batch.text.data.t(), batch.label.data.sub(1) if self.cuda and torch.cuda.is_available(): feature, target = feature.cuda(), target.cuda() @@ -110,6 +109,7 @@ def fit(self, X, y, sample_weight=None): optimizer = torch.optim.Adam(self.__model.parameters(), lr=self.lr, weight_decay=self.max_norm) + best_model = self.__model steps, best_acc, last_step = 0, 0, 0 active = True @@ -117,8 +117,7 @@ def fit(self, X, y, sample_weight=None): for epoch in range(self.epochs): for batch in train_iter: - feature, target = batch.text, batch.label - feature, target = feature.data.t(), target.data.sub(1) + feature, target = batch.text.data.t(), batch.label.data.sub(1) if self.cuda and torch.cuda.is_available(): feature, target = feature.cuda(), target.cuda() @@ -262,10 +261,10 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout, static, activation_func, vectors=None): super(_CNNText, self).__init__() - self.__embed = nn.Embedding(embed_num, embed_dim) - - if vectors is not None: - self.__embed = self.__embed.from_pretrained(vectors) + if vectors is None: + self.__embed = nn.Embedding(embed_num, embed_dim) + else: + self.__embed = nn.Embedding.from_pretrained(vectors) Ks = kernel_sizes module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] From 685fadaefd1c1b3934e2ab12ca54230ad8f45b52 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 10 Jul 2019 17:05:27 -0400 Subject: [PATCH 13/31] Updated embed_dim to be overridden by pretrained vectors --- README.md | 2 +- cnn_text_classification.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c6a8638..e1e724d 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. L2 constraint. **embed_dim : integer, optional (default=128)** - The number of embedding dimensions. + The number of embedding dimensions. Ignored if vectors is not None. **kernel_num : integer, optional (default=100)** The number of each size of kernel. diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 2206e17..ce62195 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -265,6 +265,7 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, self.__embed = nn.Embedding(embed_num, embed_dim) else: self.__embed = nn.Embedding.from_pretrained(vectors) + embed_dim = self.__embed.embedding_dim Ks = kernel_sizes module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] From 380653e086e1eee3342e3c9d54854e10e19a711d Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 17 Jul 2019 19:32:01 -0400 Subject: [PATCH 14/31] Text shorter than maximum kernel size is now padded Also changed the kernel_sizes parameter from a string to an iterable. 
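The new padding rule can be pictured with the standalone sketch below (pad_tokens is a hypothetical helper; the class's __pad method additionally accepts raw strings and takes its pad token, "&lt;pad&gt;" by default, from the torchtext Field, and kernel_sizes is sorted in __init__ so its last element is the maximum):

```python
def pad_tokens(tokens, kernel_sizes=(3, 4, 5), pad_token="<pad>"):
    # Pad the token list so the widest convolution kernel always fits.
    difference = max(kernel_sizes) - len(tokens)
    return tokens + [pad_token] * difference if difference > 0 else tokens

print(pad_tokens(["so", "bad"]))
# ['so', 'bad', '<pad>', '<pad>', '<pad>']
```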
--- README.md | 8 +++----- cnn_text_classification.py | 39 ++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index e1e724d..4b9bd69 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,8 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. ## Known Issues * The predict method is probably not as efficient as it could be. * Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). -* Only supports pre-trained word vectors from TorchText. +* Only supports pre-trained word vectors from TorchText (or no pre-trained vectors). * The random_state parameter probably only works with integers or None. -* Training samples shorter than the maximum kernel size are ignored. -* Test samples shorter than the maximum kernel size are classified as the most common class found during training. * Features my idiosyncratic coding style. ## To Do @@ -52,8 +50,8 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. **kernel_num : integer, optional (default=100)** The number of each size of kernel. -**kernel_sizes : string, optional (default='3,4,5')** - Comma-separated kernel sizes to use for convolution. +**kernel_sizes : iterable of integers, optional (default=(3, 4, 5))** + Kernel sizes to use for convolution. **static : boolean, optional (default=False)** If true, fix the embedding. diff --git a/cnn_text_classification.py b/cnn_text_classification.py index ce62195..4e9818c 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -14,7 +14,7 @@ class CNNClassifier(BaseEstimator, ClassifierMixin): def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, early_stop=1000, save_best=True, dropout=0.5, max_norm=0.0, - embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", + embed_dim=128, kernel_num=100, kernel_sizes=(3, 4, 5), static=False, device=-1, cuda=True, activation_func="relu", scoring=make_scorer(accuracy_score), vectors=None, split_ratio=0.9, preprocessor=None, class_weight=None, @@ -29,7 +29,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.max_norm = max_norm self.embed_dim = embed_dim self.kernel_num = kernel_num - self.kernel_sizes = kernel_sizes + self.kernel_sizes = sorted(kernel_sizes) self.static = static self.device = device self.cuda = cuda @@ -97,10 +97,10 @@ def fit(self, X, y, sample_weight=None): train_iter, dev_iter = self.__preprocess(X, y, sample_weight) embed_num = len(self.__text_field.vocab) class_num = len(self.__label_field.vocab) - 1 - kernel_sizes = [int(k) for k in self.kernel_sizes.split(",")] self.__model = _CNNText(embed_num, self.embed_dim, class_num, - self.kernel_num, kernel_sizes, self.dropout, - self.static, self.activation_func, + self.kernel_num, self.kernel_sizes, + self.dropout, self.static, + self.activation_func, vectors=self.__text_field.vocab.vectors) if self.cuda and torch.cuda.is_available(): @@ -154,18 +154,11 @@ def fit(self, X, y, sample_weight=None): def predict(self, X): y_pred = [] - max_krnl_sz = int(self.kernel_sizes[self.kernel_sizes.rfind(",") + 1:]) for text in X: assert isinstance(text, str) - text = self.__text_field.preprocess(text) - - if len(text) < max_krnl_sz: - most_common = self.__label_field.vocab.freqs.most_common(1)[0] - - y_pred.append(most_common[0]) - continue + text = self.__pad(self.__text_field.preprocess(text), True) self.__model.eval() @@ -179,21 +172,25 @@ def predict(self, X): torch.cuda.empty_cache() 
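            # Note: torch.cuda.empty_cache() above only hands unused blocks in
            # PyTorch's caching allocator back to the GPU driver; tensors that
            # are still referenced, such as the prediction just appended, are
            # unaffected.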
        return y_pred

+    def __pad(self, x, preprocessed=False):
+        tokens = x if preprocessed else self.__text_field.preprocess(x)
+        difference = self.kernel_sizes[-1] - len(tokens)
+
+        if difference > 0:
+            padding = [self.__text_field.pad_token] * difference
+            return x + padding if preprocessed else " ".join([x] + padding)
+
+        return x
+
     def __preprocess(self, X, y, sample_weight):
         self.__text_field = Field(lower=True)
         self.__label_field = Field(sequential=False)
         self.__text_field.preprocessing = Pipeline(self.__preprocess_text)
-        max_krnl_sz = int(self.kernel_sizes[self.kernel_sizes.rfind(",") + 1:])
         X, y = list(X), list(y)
         sample_weight = None if sample_weight is None else list(sample_weight)

-        for i in range(len(X) - 1, -1, -1):
-            if len(self.__text_field.preprocess(X[i])) < max_krnl_sz:
-                del X[i]
-                del y[i]
-
-                if sample_weight is not None:
-                    del sample_weight[i]
+        for i in range(len(X)):
+            X[i] = self.__pad(X[i])

         fields = [("text", self.__text_field), ("label", self.__label_field)]
         exmpl = [Example.fromlist([X[i], y[i]], fields) for i in range(len(X))]

From 739de662336b7311d0841ca8a0e5c9573f36bdb7 Mon Sep 17 00:00:00 2001
From: rriva002
Date: Wed, 17 Jul 2019 20:28:30 -0400
Subject: [PATCH 15/31] Added classes_ attribute

Also removed an unnecessary line of code.
---
 cnn_text_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cnn_text_classification.py b/cnn_text_classification.py
index 4e9818c..dd6b793 100644
--- a/cnn_text_classification.py
+++ b/cnn_text_classification.py
@@ -145,6 +145,7 @@ def fit(self, X, y, sample_weight=None):
                 break

         self.__model = best_model if self.save_best else self.__model
+        self.classes_ = self.__label_field.vocab.itos[1:]

         if self.verbose > 0:
             self.__print_elapsed_time(time() - start)
@@ -186,7 +187,6 @@ def __preprocess(self, X, y, sample_weight):
         self.__text_field = Field(lower=True)
         self.__label_field = Field(sequential=False)
         self.__text_field.preprocessing = Pipeline(self.__preprocess_text)
-        X, y = list(X), list(y)
         sample_weight = None if sample_weight is None else list(sample_weight)

         for i in range(len(X)):

From 5c177f3d9a29fc7737bd4734315820d1c11c7e87 Mon Sep 17 00:00:00 2001
From: rriva002
Date: Wed, 17 Jul 2019 21:36:43 -0400
Subject: [PATCH 16/31] Added predict_proba method

---
 README.md                  | 13 ++++++++++++-
 cnn_text_classification.py | 23 +++++++++++++--------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 4b9bd69..af88328 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,8 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github.
 * scikit-learn

 ## Known Issues
-* The predict method is probably not as efficient as it could be.
+* The predict and predict_proba methods are probably not as efficient as they could be.
+* The class probabilities returned by the predict_proba method are probably questionable.
 * Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA).
 * Only supports pre-trained word vectors from TorchText (or no pre-trained vectors).
 * The random_state parameter probably only works with integers or None.
@@ -111,3 +112,13 @@ Parameters: X: list of strings
 Returns: y: list of strings
         The predicted classes.
 ```
+
+**predict_proba(X)**
+Predict class probabilities for X.
+```
+Parameters: X: list of strings
+        The input samples.
+
+Returns: y: list of lists of floats
+        The predicted class probabilities.
+``` diff --git a/cnn_text_classification.py b/cnn_text_classification.py index dd6b793..8a79fd8 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -153,25 +153,32 @@ def fit(self, X, y, sample_weight=None): torch.cuda.empty_cache() return self - def predict(self, X): - y_pred = [] + def __predict(self, X): + y_output = [] + + self.__model.eval() for text in X: assert isinstance(text, str) text = self.__pad(self.__text_field.preprocess(text), True) - - self.__model.eval() - text = [[self.__text_field.vocab.stoi[x] for x in text]] x = Variable(torch.tensor(text)) x = x.cuda() if self.cuda and torch.cuda.is_available() else x - _, predicted = torch.max(self.__model(x), 1) - y_pred.append(self.__label_field.vocab.itos[predicted.data[0] + 1]) + y_output.append(self.__model(x)) torch.cuda.empty_cache() - return y_pred + return y_output + + def predict(self, X): + y_pred = [torch.argmax(yi, 1) for yi in self.__predict(X)] + return [self.__label_field.vocab.itos[yi.data[0] + 1] for yi in y_pred] + + def predict_proba(self, X): + softmax = nn.Softmax(dim=1) + y_prob = [softmax(yi) for yi in self.__predict(X)] + return [[float(yij) for yij in yi[0]] for yi in y_prob] def __pad(self, x, preprocessed=False): tokens = x if preprocessed else self.__text_field.preprocess(x) From a136549b5f6801aafe6afbc365bed50f71afa9cd Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 25 Jul 2019 06:53:23 -0400 Subject: [PATCH 17/31] Fixed cloning bug caused by modification of a parameter in the constructor --- cnn_text_classification.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 8a79fd8..9b3d372 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -29,7 +29,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.max_norm = max_norm self.embed_dim = embed_dim self.kernel_num = kernel_num - self.kernel_sizes = sorted(kernel_sizes) + self.kernel_sizes = kernel_sizes self.static = static self.device = device self.cuda = cuda @@ -155,13 +155,15 @@ def fit(self, X, y, sample_weight=None): def __predict(self, X): y_output = [] + max_kernel_size = max(self.kernel_sizes) self.__model.eval() for text in X: assert isinstance(text, str) - text = self.__pad(self.__text_field.preprocess(text), True) + text = self.__text_field.preprocess(text) + text = self.__pad(text, max_kernel_size, True) text = [[self.__text_field.vocab.stoi[x] for x in text]] x = Variable(torch.tensor(text)) x = x.cuda() if self.cuda and torch.cuda.is_available() else x @@ -180,9 +182,9 @@ def predict_proba(self, X): y_prob = [softmax(yi) for yi in self.__predict(X)] return [[float(yij) for yij in yi[0]] for yi in y_prob] - def __pad(self, x, preprocessed=False): + def __pad(self, x, max_kernel_size, preprocessed=False): tokens = x if preprocessed else self.__text_field.preprocess(x) - difference = self.kernel_sizes[-1] - len(tokens) + difference = max_kernel_size - len(tokens) if difference > 0: padding = [self.__text_field.pad_token] * difference @@ -194,10 +196,11 @@ def __preprocess(self, X, y, sample_weight): self.__text_field = Field(lower=True) self.__label_field = Field(sequential=False) self.__text_field.preprocessing = Pipeline(self.__preprocess_text) + max_kernel_size = max(self.kernel_sizes) sample_weight = None if sample_weight is None else list(sample_weight) for i in range(len(X)): - X[i] = self.__pad(X[i]) + X[i] = self.__pad(X[i], max_kernel_size) fields = [("text", 
self.__text_field), ("label", self.__label_field)] exmpl = [Example.fromlist([X[i], y[i]], fields) for i in range(len(X))] From 81450f1fa1734c93ae70d647c5ec49a2982a3647 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Sun, 4 Aug 2019 20:14:28 -0400 Subject: [PATCH 18/31] Added support for ROC AUC scoring --- README.md | 8 ++++++-- cnn_text_classification.py | 24 ++++++++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index af88328..99237d4 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. * scikit-learn ## Known Issues +* Oversampling is applied to the whole training dataset, so many training samples likely end up in both training and dev sets. * The predict and predict_proba methods are probably not as efficient as they could be. * The class probabilities returned by the predict_proba method are probably questionable. * Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). @@ -63,12 +64,15 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. **cuda : boolean, optional (default=True)** If true, use the GPU if available. -** activation_func : string, optional (default='relu')** +**activation_func : string, optional (default='relu')** Activation function. If 'relu' or 'tanh', uses rectified linear unit or hyperbolic tangent, respectively. Otherwise, uses no activation function (f(x) = x). -**scoring : callable or None, optional (default=sklearn.metrics.accuracy_score)** +**scoring : callable or "roc_auc", optional (default=sklearn.metrics.make_scorer(sklearn.metrics.accuracy_score))** Scoring method for testing model performance during fitting. +**pos_label : string, optional (default=None)** + Positive class label for roc_auc scoring. Ignored if using a different scoring method. + **vectors : string, optional (default=None)** Which pretrained TorchText vectors to use (see [torchtext.vocab.pretrained_aliases](https://torchtext.readthedocs.io/en/latest/vocab.html#pretrained-aliases) for options). 
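With scoring="roc_auc", the dev-set score wired up below reduces to scikit-learn's `roc_auc_score` applied to the softmax probability of the class named by `pos_label`. A toy illustration, with all numbers invented:

```
from sklearn.metrics import roc_auc_score

# 1 marks the positive class; pos_probs is the softmax column that
# pos_label would select.
y_true = [1, 0, 1, 0]
pos_probs = [0.9, 0.4, 0.65, 0.2]

# Every positive sample outranks every negative one, so the AUC is perfect.
print(roc_auc_score(y_true, pos_probs))  # 1.0
```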
diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 9b3d372..5d11c5b 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -5,7 +5,7 @@ from collections import Counter from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.metrics import accuracy_score, make_scorer +from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -16,9 +16,9 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, early_stop=1000, save_best=True, dropout=0.5, max_norm=0.0, embed_dim=128, kernel_num=100, kernel_sizes=(3, 4, 5), static=False, device=-1, cuda=True, activation_func="relu", - scoring=make_scorer(accuracy_score), vectors=None, - split_ratio=0.9, preprocessor=None, class_weight=None, - random_state=None, verbose=0): + scoring=make_scorer(accuracy_score), pos_label=None, + vectors=None, split_ratio=0.9, preprocessor=None, + class_weight=None, random_state=None, verbose=0): self.lr = lr self.epochs = epochs self.batch_size = batch_size @@ -35,6 +35,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.cuda = cuda self.activation_func = activation_func self.scoring = scoring + self.pos_label = pos_label self.vectors = vectors self.split_ratio = split_ratio self.preprocessor = preprocessor @@ -62,6 +63,7 @@ def __eval(self, data_iter): self.__model.eval() preds, targets = [], [] + softmax = nn.Softmax(dim=1) if self.scoring == "roc_auc" else None for batch in data_iter: feature, target = batch.text.data.t(), batch.label.data.sub(1) @@ -73,11 +75,21 @@ def __eval(self, data_iter): F.cross_entropy(logit, target, reduction="sum") - preds += torch.max(logit, 1)[1].view(target.size()).data.tolist() + if self.scoring == "roc_auc": + pred = [[float(p) for p in dist] for dist in softmax(logit)] + else: + pred = torch.max(logit, 1)[1].view(target.size()).data.tolist() + + preds += pred targets += target.data.tolist() - preds = [self.__label_field.vocab.itos[pred + 1] for pred in preds] targets = [self.__label_field.vocab.itos[targ + 1] for targ in targets] + + if self.scoring == "roc_auc": + pos_index = self.__label_field.vocab.stoi[self.pos_label] - 1 + return roc_auc_score(targets, [pred[pos_index] for pred in preds]) + + preds = [self.__label_field.vocab.itos[pred + 1] for pred in preds] return self.scoring(_Eval(preds), None, targets) def fit(self, X, y, sample_weight=None): From e156753f3c6ea2fb753b600d3fd072d38e5d3ceb Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 7 Aug 2019 17:01:04 -0400 Subject: [PATCH 19/31] Updated oversampling to not apply to dev data Also changed the default value of split_ratio to 0.8. --- README.md | 1 - cnn_text_classification.py | 36 ++++++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 99237d4..51755ef 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,6 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. * scikit-learn ## Known Issues -* Oversampling is applied to the whole training dataset, so many training samples likely end up in both training and dev sets. * The predict and predict_proba methods are probably not as efficient as they could be. * The class probabilities returned by the predict_proba method are probably questionable. 
* Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 5d11c5b..e8fe052 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -6,6 +6,7 @@ from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score +from sklearn.model_selection import train_test_split as split from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -17,7 +18,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, embed_dim=128, kernel_num=100, kernel_sizes=(3, 4, 5), static=False, device=-1, cuda=True, activation_func="relu", scoring=make_scorer(accuracy_score), pos_label=None, - vectors=None, split_ratio=0.9, preprocessor=None, + vectors=None, split_ratio=0.8, preprocessor=None, class_weight=None, random_state=None, verbose=0): self.lr = lr self.epochs = epochs @@ -79,7 +80,7 @@ def __eval(self, data_iter): pred = [[float(p) for p in dist] for dist in softmax(logit)] else: pred = torch.max(logit, 1)[1].view(target.size()).data.tolist() - + preds += pred targets += target.data.tolist() @@ -214,31 +215,37 @@ def __preprocess(self, X, y, sample_weight): for i in range(len(X)): X[i] = self.__pad(X[i], max_kernel_size) + X_t, X_d, y_t, y_d = split(X, y, random_state=self.random_state, + shuffle=True, stratify=y, + train_size=self.split_ratio) fields = [("text", self.__text_field), ("label", self.__label_field)] - exmpl = [Example.fromlist([X[i], y[i]], fields) for i in range(len(X))] - weights = [1 for yi in y] if sample_weight is None else sample_weight + examples = [[X_t[i], y_t[i]] for i in range(len(X_t))] + examples = [Example.fromlist(example, fields) for example in examples] + weights = [1 for yi in y_t] if sample_weight is None else sample_weight if self.class_weight is not None: cw = self.class_weight if isinstance(cw, str) and cw == "balanced": - counter = Counter(y) - cw = [len(y) / (len(counter) * counter[yi]) for yi in y] - weights = [weights[i] * cw[i] for i in range(len(y))] + counter = Counter(y_t) + cw = [len(y_t) / (len(counter) * counter[yi]) for yi in y_t] + weights = [weights[i] * cw[i] for i in range(len(y_t))] elif isinstance(cw, dict): - cw = [cw[yi] for yi in y] - weights = [weights[i] * cw[i] for i in range(len(y))] + cw = [cw[yi] for yi in y_t] + weights = [weights[i] * cw[i] for i in range(len(y_t))] min_weight = min(weights) weights = [round(w / min_weight) for w in weights] - for i in range(len(X)): + for i in range(len(X_t)): if weights[i] > 1: - Xi = [X[i] for j in range(weights[i] - 1)] - exmpl += [Example.fromlist([x, y[i]], fields) for x in Xi] + Xi = [X_t[i] for j in range(weights[i] - 1)] + examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] - train_data, dev_data = Dataset(exmpl, fields).split(self.split_ratio, - self.random_state,) + train_data = Dataset(examples, fields) + dev_data = [[X_d[i], y_d[i]] for i in range(len(X_d))] + dev_data = [Example.fromlist(example, fields) for example in dev_data] + dev_data = Dataset(dev_data, fields) self.__text_field.build_vocab(train_data, dev_data, vectors=self.vectors) @@ -306,6 +313,7 @@ def forward(self, x): x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] return self.__fc1(self.__dropout(torch.cat(x, 1))) + class _Eval(): def __init__(self, preds): self.__preds = preds From 86481582d4a5753082b179740383ef684c1f1e76 
Mon Sep 17 00:00:00 2001 From: rriva002 Date: Fri, 9 Aug 2019 07:11:36 -0400 Subject: [PATCH 20/31] Simplified weight calculation code --- cnn_text_classification.py | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index e8fe052..48fed1c 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -2,11 +2,11 @@ import torch import torch.nn as nn import torch.nn.functional as F -from collections import Counter from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score from sklearn.model_selection import train_test_split as split +from sklearn.utils.class_weight import compute_sample_weight from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -215,32 +215,20 @@ def __preprocess(self, X, y, sample_weight): for i in range(len(X)): X[i] = self.__pad(X[i], max_kernel_size) - X_t, X_d, y_t, y_d = split(X, y, random_state=self.random_state, - shuffle=True, stratify=y, - train_size=self.split_ratio) + sw = [1 for yi in y] if sample_weight is None else sample_weight + X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=y, + random_state=self.random_state, + train_size=self.split_ratio) fields = [("text", self.__text_field), ("label", self.__label_field)] examples = [[X_t[i], y_t[i]] for i in range(len(X_t))] examples = [Example.fromlist(example, fields) for example in examples] - weights = [1 for yi in y_t] if sample_weight is None else sample_weight - - if self.class_weight is not None: - cw = self.class_weight - - if isinstance(cw, str) and cw == "balanced": - counter = Counter(y_t) - cw = [len(y_t) / (len(counter) * counter[yi]) for yi in y_t] - weights = [weights[i] * cw[i] for i in range(len(y_t))] - elif isinstance(cw, dict): - cw = [cw[yi] for yi in y_t] - weights = [weights[i] * cw[i] for i in range(len(y_t))] - + weights = compute_sample_weight(self.class_weight, y_t) + weights = [weights[i] * w_t[i] for i in range(len(y_t))] min_weight = min(weights) - weights = [round(w / min_weight) for w in weights] for i in range(len(X_t)): - if weights[i] > 1: - Xi = [X_t[i] for j in range(weights[i] - 1)] - examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] + Xi = [X_t[i] for j in range(round(weights[i] / min_weight) - 1)] + examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] train_data = Dataset(examples, fields) dev_data = [[X_d[i], y_d[i]] for i in range(len(X_d))] From a699e22ba8e8ab3198239bf559d00fe4bea3ee5e Mon Sep 17 00:00:00 2001 From: rriva002 Date: Fri, 9 Aug 2019 17:33:20 -0400 Subject: [PATCH 21/31] Fixed data type bug in weight calculation --- cnn_text_classification.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 48fed1c..256baf7 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -225,9 +225,10 @@ def __preprocess(self, X, y, sample_weight): weights = compute_sample_weight(self.class_weight, y_t) weights = [weights[i] * w_t[i] for i in range(len(y_t))] min_weight = min(weights) + weights = [int(round(weight / min_weight)) for weight in weights] for i in range(len(X_t)): - Xi = [X_t[i] for j in range(round(weights[i] / min_weight) - 1)] + Xi = [X_t[i] for j in range(weights[i] - 1)] examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] 
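        # Worked example of the duplication above: with class_weight="balanced"
        # and training labels [a, a, a, b], compute_sample_weight returns
        # [2/3, 2/3, 2/3, 2]; dividing by the minimum and rounding gives
        # [1, 1, 1, 3], so the lone b sample is appended twice more while each
        # a sample is kept once.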
train_data = Dataset(examples, fields) From 75cee562b41f9d7c57b9b6fe60142fa198364aff Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 19 Mar 2020 13:00:38 -0700 Subject: [PATCH 22/31] Fixed crash in splitting training/validation sets Also slightly optimized the CNN model's forward function, and best model is now saved to disk. --- cnn_text_classification.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 256baf7..513733d 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -2,7 +2,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -from copy import deepcopy +from collections import Counter +from os import remove from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score from sklearn.model_selection import train_test_split as split @@ -122,9 +123,9 @@ def fit(self, X, y, sample_weight=None): optimizer = torch.optim.Adam(self.__model.parameters(), lr=self.lr, weight_decay=self.max_norm) - best_model = self.__model steps, best_acc, last_step = 0, 0, 0 active = True + filename = "./{}.model".format(time()) self.__model.train() @@ -149,7 +150,7 @@ def fit(self, X, y, sample_weight=None): last_step = steps if self.save_best: - best_model = deepcopy(self.__model) + torch.save(self.__model.state_dict(), filename) elif steps - last_step >= self.early_stop: active = False break @@ -157,7 +158,10 @@ def fit(self, X, y, sample_weight=None): if not active: break - self.__model = best_model if self.save_best else self.__model + if self.save_best: + self.__model.load_state_dict(torch.load(filename)) + remove(filename) + self.classes_ = self.__label_field.vocab.itos[1:] if self.verbose > 0: @@ -216,7 +220,8 @@ def __preprocess(self, X, y, sample_weight): X[i] = self.__pad(X[i], max_kernel_size) sw = [1 for yi in y] if sample_weight is None else sample_weight - X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=y, + s = y if Counter(y).most_common()[-1][1] > 1 else None + X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=s, random_state=self.random_state, train_size=self.split_ratio) fields = [("text", self.__text_field), ("label", self.__label_field)] @@ -298,11 +303,11 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, def forward(self, x): x = Variable(self.__embed(x)) if self.__static else self.__embed(x) - x = [self.__f(cnv(x.unsqueeze(1))).squeeze(3) for cnv in self.__convs1] + x = x.unsqueeze(1) + x = [self.__f(conv(x), inplace=True).squeeze(3) for conv in self.__convs1] x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] return self.__fc1(self.__dropout(torch.cat(x, 1))) - class _Eval(): def __init__(self, preds): self.__preds = preds From d1f176c17bd7319c09c5780772ec06113cb36611 Mon Sep 17 00:00:00 2001 From: wuxiaohui Date: Sun, 26 Jul 2020 18:15:24 +0800 Subject: [PATCH 23/31] compatible with new version of pytorch --- model.py | 26 +++++++------------------- mydatasets.py | 2 +- train.py | 18 +++++++----------- 3 files changed, 15 insertions(+), 31 deletions(-) diff --git a/model.py b/model.py index ce0158b..1541344 100644 --- a/model.py +++ b/model.py @@ -18,15 +18,12 @@ def __init__(self, args): Ks = args.kernel_sizes self.embed = nn.Embedding(V, D) - # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks] - self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks]) - ''' - self.conv13 = nn.Conv2d(Ci, Co, (3, D)) - 
self.conv14 = nn.Conv2d(Ci, Co, (4, D)) - self.conv15 = nn.Conv2d(Ci, Co, (5, D)) - ''' + self.convs = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks]) self.dropout = nn.Dropout(args.dropout) - self.fc1 = nn.Linear(len(Ks)*Co, C) + self.fc1 = nn.Linear(len(Ks) * Co, C) + + if self.args.static: + self.embed.weight.requires_grad = False def conv_and_pool(self, x, conv): x = F.relu(conv(x)).squeeze(3) # (N, Co, W) @@ -35,24 +32,15 @@ def conv_and_pool(self, x, conv): def forward(self, x): x = self.embed(x) # (N, W, D) - - if self.args.static: - x = Variable(x) - + x = x.unsqueeze(1) # (N, Ci, W, D) - x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] # [(N, Co, W), ...]*len(Ks) + x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] # [(N, Co, W), ...]*len(Ks) x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] # [(N, Co), ...]*len(Ks) x = torch.cat(x, 1) - ''' - x1 = self.conv_and_pool(x,self.conv13) #(N,Co) - x2 = self.conv_and_pool(x,self.conv14) #(N,Co) - x3 = self.conv_and_pool(x,self.conv15) #(N,Co) - x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co) - ''' x = self.dropout(x) # (N, len(Ks)*Co) logit = self.fc1(x) # (N, C) return logit diff --git a/mydatasets.py b/mydatasets.py index 8fddfce..8cb9475 100644 --- a/mydatasets.py +++ b/mydatasets.py @@ -33,7 +33,7 @@ def download_or_unzip(cls, root): class MR(TarDataset): url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz' - filename = 'rt-polaritydata.tar' + filename = 'rt-polaritydata.tar.gz' dirname = 'rt-polaritydata' @staticmethod diff --git a/train.py b/train.py index 7f90aaa..d9000af 100644 --- a/train.py +++ b/train.py @@ -18,15 +18,12 @@ def train(train_iter, dev_iter, model, args): for epoch in range(1, args.epochs+1): for batch in train_iter: feature, target = batch.text, batch.label - feature.data.t_(), target.data.sub_(1) # batch first, index align + feature.t_(), target.sub_(1) # batch first, index align if args.cuda: feature, target = feature.cuda(), target.cuda() optimizer.zero_grad() logit = model(feature) - - #print('logit vector', logit.size()) - #print('target vector', target.size()) loss = F.cross_entropy(logit, target) loss.backward() optimizer.step() @@ -37,9 +34,9 @@ def train(train_iter, dev_iter, model, args): accuracy = 100.0 * corrects/batch.batch_size sys.stdout.write( '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(steps, - loss.data[0], - accuracy, - corrects, + loss.item(), + accuracy.item(), + corrects.item(), batch.batch_size)) if steps % args.test_interval == 0: dev_acc = eval(dev_iter, model, args) @@ -60,14 +57,14 @@ def eval(data_iter, model, args): corrects, avg_loss = 0, 0 for batch in data_iter: feature, target = batch.text, batch.label - feature.data.t_(), target.data.sub_(1) # batch first, index align + feature.t_(), target.sub_(1) # batch first, index align if args.cuda: feature, target = feature.cuda(), target.cuda() logit = model(feature) loss = F.cross_entropy(logit, target, size_average=False) - avg_loss += loss.data[0] + avg_loss += loss.item() corrects += (torch.max(logit, 1) [1].view(target.size()).data == target.data).sum() @@ -94,8 +91,7 @@ def predict(text, model, text_field, label_feild, cuda_flag): print(x) output = model(x) _, predicted = torch.max(output, 1) - #return label_feild.vocab.itos[predicted.data[0][0]+1] - return label_feild.vocab.itos[predicted.data[0]+1] + return label_feild.vocab.itos[predicted.item()+1] def save(model, save_dir, save_prefix, steps): From 28ad33e02b1321a4f5f975a4e9e49034ddbd5912 Mon Sep 17 
00:00:00 2001
From: wuxiaohui
Date: Tue, 28 Jul 2020 11:16:45 +0800
Subject: [PATCH 24/31] delete unused conv_and_pool method

---
 model.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/model.py b/model.py
index 1541344..db6586f 100644
--- a/model.py
+++ b/model.py
@@ -25,11 +25,6 @@ def __init__(self, args):
         if self.args.static:
             self.embed.weight.requires_grad = False

-    def conv_and_pool(self, x, conv):
-        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
-        x = F.max_pool1d(x, x.size(2)).squeeze(2)
-        return x
-
     def forward(self, x):
         x = self.embed(x)  # (N, W, D)

From 5f1aba1c048523adb774741156611b56bd292f6e Mon Sep 17 00:00:00 2001
From: rriva002
Date: Thu, 27 Aug 2020 13:14:11 -0700
Subject: [PATCH 25/31] Compatibility updates

See https://github.com/Shawn1993/cnn-text-classification-pytorch/commit/d1f176c17bd7319c09c5780772ec06113cb36611
---
 cnn_text_classification.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/cnn_text_classification.py b/cnn_text_classification.py
index 513733d..9b228fb 100644
--- a/cnn_text_classification.py
+++ b/cnn_text_classification.py
@@ -9,7 +9,6 @@
 from sklearn.model_selection import train_test_split as split
 from sklearn.utils.class_weight import compute_sample_weight
 from time import time
-from torch.autograd import Variable
 from torchtext.data import Dataset, Example, Field, Iterator, Pipeline

@@ -68,7 +67,7 @@ def __eval(self, data_iter):
         softmax = nn.Softmax(dim=1) if self.scoring == "roc_auc" else None

         for batch in data_iter:
-            feature, target = batch.text.data.t(), batch.label.data.sub(1)
+            feature, target = batch.text.t_(), batch.label.sub_(1)

             if self.cuda and torch.cuda.is_available():
                 feature, target = feature.cuda(), target.cuda()
@@ -80,10 +79,10 @@ def __eval(self, data_iter):
             if self.scoring == "roc_auc":
                 pred = [[float(p) for p in dist] for dist in softmax(logit)]
             else:
-                pred = torch.max(logit, 1)[1].view(target.size()).data.tolist()
+                pred = torch.max(logit, 1)[1].view(target.size()).tolist()

             preds += pred
-            targets += target.data.tolist()
+            targets += target.tolist()

@@ -131,7 +130,7 @@ def fit(self, X, y, sample_weight=None):
         for epoch in range(self.epochs):
             for batch in train_iter:
-                feature, target = batch.text.data.t(), batch.label.data.sub(1)
+                feature, target = batch.text.t_(), batch.label.sub_(1)

                 if self.cuda and torch.cuda.is_available():
                     feature, target = feature.cuda(), target.cuda()
@@ -167,7 +166,6 @@ def fit(self, X, y, sample_weight=None):
         if self.verbose > 0:
             self.__print_elapsed_time(time() - start)

-        torch.cuda.empty_cache()
         return self

     def __predict(self, X):
@@ -182,17 +180,16 @@ def __predict(self, X):
             text = self.__text_field.preprocess(text)
             text = self.__pad(text, max_kernel_size, True)
             text = [[self.__text_field.vocab.stoi[x] for x in text]]
-            x = Variable(torch.tensor(text))
+            x = torch.tensor(text)
             x = x.cuda() if self.cuda and torch.cuda.is_available() else x

             y_output.append(self.__model(x))

-        torch.cuda.empty_cache()
         return y_output

     def predict(self, X):
         y_pred = [torch.argmax(yi, 1) for yi in self.__predict(X)]
-        return [self.__label_field.vocab.itos[yi.data[0] + 1] for yi in y_pred]
+        return [self.__label_field.vocab.itos[yi.item() + 1] for yi in y_pred]

     def predict_proba(self, X):
         softmax = nn.Softmax(dim=1)
@@ -253,7 +250,7 @@ def __preprocess_text(self, text):
         if self.preprocessor is None:
             return self.__clean_str(text)

-        return self.__clean_str(self.preprocessor(text))
+        return
self.preprocessor(text) def __print_elapsed_time(self, seconds): sc = round(seconds) @@ -289,10 +286,10 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, Ks = kernel_sizes module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] - self.__convs1 = nn.ModuleList(module_list) + self.__convs = nn.ModuleList(module_list) self.__dropout = nn.Dropout(dropout) - self.__fc1 = nn.Linear(len(Ks) * kernel_num, class_num) - self.__static = static + self.__fc = nn.Linear(len(Ks) * kernel_num, class_num) + self.__embed.weight.requires_grad = not static if activation_func == "relu": self.__f = F.relu @@ -302,11 +299,11 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, self.__f = lambda x: x def forward(self, x): - x = Variable(self.__embed(x)) if self.__static else self.__embed(x) - x = x.unsqueeze(1) - x = [self.__f(conv(x), inplace=True).squeeze(3) for conv in self.__convs1] + x = self.__embed(x).unsqueeze(1) + x = [self.__f(cnv(x), inplace=True).squeeze(3) for cnv in self.__convs] x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] - return self.__fc1(self.__dropout(torch.cat(x, 1))) + return self.__fc(self.__dropout(torch.cat(x, 1))) + class _Eval(): def __init__(self, preds): From 6dfa40afc41e001b693255580e6e6c4783e5b921 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 27 Aug 2020 16:56:33 -0700 Subject: [PATCH 26/31] Refactored prediction code and fixed preprocessing bug --- README.md | 2 -- cnn_text_classification.py | 70 +++++++++++++++----------------------- 2 files changed, 28 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 51755ef..9a048b6 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,6 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. * scikit-learn ## Known Issues -* The predict and predict_proba methods are probably not as efficient as they could be. -* The class probabilities returned by the predict_proba method are probably questionable. * Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). * Only supports pre-trained word vectors from TorchText (or no pre-trained vectors). * The random_state parameter probably only works with integers or None. 
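The refactored `__predict` below batches every input into a single forward pass with `torch.stack`, which only accepts equal-sized tensors; the tokenizer's padding guarantees a minimum length, but inputs must still tokenize to a common length to stack. A rough sketch of a more general batcher that pads to the longest sequence instead (`stoi` and `pad_index` stand in for the fitted vocabulary; torchtext puts `<unk>` at index 0 and `<pad>` at index 1 by default):

```
import torch

def batch_indices(tokenized_texts, stoi, pad_index=1):
    """Build one (batch, max_len) LongTensor from tokenized texts.

    Each sequence is right-padded to the longest one, so the rows satisfy
    the equal-size requirement that stacking imposes.
    """
    max_len = max(len(tokens) for tokens in tokenized_texts)
    rows = [[stoi.get(token, 0) for token in tokens]      # 0: the <unk> slot
            + [pad_index] * (max_len - len(tokens))
            for tokens in tokenized_texts]
    return torch.tensor(rows, dtype=torch.long)
```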
diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 9b228fb..4d83355 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -4,12 +4,13 @@ import torch.nn.functional as F from collections import Counter from os import remove +from os.path import exists from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score from sklearn.model_selection import train_test_split as split from sklearn.utils.class_weight import compute_sample_weight from time import time -from torchtext.data import Dataset, Example, Field, Iterator, Pipeline +from torchtext.data import Dataset, Example, Field, Iterator class CNNClassifier(BaseEstimator, ClassifierMixin): @@ -43,8 +44,9 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.class_weight = class_weight self.random_state = random_state self.verbose = verbose + self.__max_kernel_size = max(self.kernel_sizes) - def __clean_str(self, string): + def __default_preprocessor(self, string): string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) string = re.sub(r"\'s", " \'s", string) string = re.sub(r"\'ve", " \'ve", string) @@ -107,7 +109,7 @@ def fit(self, X, y, sample_weight=None): print("\n".join([": ".join([k, str(v)]) for k, v in params])) start = time() if self.verbose > 0 else None - train_iter, dev_iter = self.__preprocess(X, y, sample_weight) + train_iter, dev_iter = self.__prepare_train_data(X, y, sample_weight) embed_num = len(self.__text_field.vocab) class_num = len(self.__label_field.vocab) - 1 self.__model = _CNNText(embed_num, self.embed_dim, class_num, @@ -157,7 +159,7 @@ def fit(self, X, y, sample_weight=None): if not active: break - if self.save_best: + if self.save_best and exists(filename): self.__model.load_state_dict(torch.load(filename)) remove(filename) @@ -169,8 +171,7 @@ def fit(self, X, y, sample_weight=None): return self def __predict(self, X): - y_output = [] - max_kernel_size = max(self.kernel_sizes) + texts = [] self.__model.eval() @@ -178,44 +179,25 @@ def __predict(self, X): assert isinstance(text, str) text = self.__text_field.preprocess(text) - text = self.__pad(text, max_kernel_size, True) - text = [[self.__text_field.vocab.stoi[x] for x in text]] - x = torch.tensor(text) - x = x.cuda() if self.cuda and torch.cuda.is_available() else x + text = [self.__text_field.vocab.stoi[x] for x in text] + texts.append(torch.tensor(text)) - y_output.append(self.__model(x)) - - return y_output + x = torch.stack(texts, 0) + x = x.cuda() if self.cuda and torch.cuda.is_available() else x + return self.__model(x) def predict(self, X): - y_pred = [torch.argmax(yi, 1) for yi in self.__predict(X)] + y_pred = torch.argmax(self.__predict(X), 1) return [self.__label_field.vocab.itos[yi.item() + 1] for yi in y_pred] def predict_proba(self, X): - softmax = nn.Softmax(dim=1) - y_prob = [softmax(yi) for yi in self.__predict(X)] - return [[float(yij) for yij in yi[0]] for yi in y_prob] - - def __pad(self, x, max_kernel_size, preprocessed=False): - tokens = x if preprocessed else self.__text_field.preprocess(x) - difference = max_kernel_size - len(tokens) - - if difference > 0: - padding = [self.__text_field.pad_token] * difference - return x + padding if preprocessed else " ".join([x] + padding) - - return x + return nn.Softmax(dim=1)(self.__predict(X)).tolist() - def __preprocess(self, X, y, sample_weight): + def __prepare_train_data(self, X, y, sample_weight): self.__text_field = Field(lower=True) self.__label_field = 
Field(sequential=False) - self.__text_field.preprocessing = Pipeline(self.__preprocess_text) - max_kernel_size = max(self.kernel_sizes) + self.__text_field.tokenize = self.__tokenize sample_weight = None if sample_weight is None else list(sample_weight) - - for i in range(len(X)): - X[i] = self.__pad(X[i], max_kernel_size) - sw = [1 for yi in y] if sample_weight is None else sample_weight s = y if Counter(y).most_common()[-1][1] > 1 else None X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=s, @@ -246,12 +228,6 @@ def __preprocess(self, X, y, sample_weight): return Iterator.splits((train_data, dev_data), batch_sizes=batch_sizes, sort_key=lambda ex: len(ex.text), repeat=False) - def __preprocess_text(self, text): - if self.preprocessor is None: - return self.__clean_str(text) - - return self.preprocessor(text) - def __print_elapsed_time(self, seconds): sc = round(seconds) mn = int(sc / 60) @@ -272,6 +248,16 @@ def __print_elapsed_time(self, seconds): print("Completed training in {}.".format(times)) + def __tokenize(self, text): + if self.preprocessor is None: + text = self.__default_preprocessor(text) + else: + text = self.preprocessor(text) + + tokens = text.split() + difference = self.__max_kernel_size - len(tokens) + return tokens + [self.__text_field.pad_token] * max(difference, 0) + class _CNNText(nn.Module): def __init__(self, embed_num, embed_dim, class_num, kernel_num, @@ -280,8 +266,9 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, if vectors is None: self.__embed = nn.Embedding(embed_num, embed_dim) + self.__embed.weight.requires_grad = not static else: - self.__embed = nn.Embedding.from_pretrained(vectors) + self.__embed = nn.Embedding.from_pretrained(vectors, freeze=static) embed_dim = self.__embed.embedding_dim Ks = kernel_sizes @@ -289,7 +276,6 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, self.__convs = nn.ModuleList(module_list) self.__dropout = nn.Dropout(dropout) self.__fc = nn.Linear(len(Ks) * kernel_num, class_num) - self.__embed.weight.requires_grad = not static if activation_func == "relu": self.__f = F.relu From f30623afb69022ecf4a230d307cf4a795b9651a4 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Mon, 31 Aug 2020 15:34:39 -0700 Subject: [PATCH 27/31] Add files via upload --- README.md | 208 +++++++++++++++++++++++++------------------------- main.py | 116 ++++++++++++++++++++++++++++ model.py | 41 ++++++++++ mydatasets.py | 110 ++++++++++++++++++++++++++ train.py | 102 +++++++++++++++++++++++++ 5 files changed, 474 insertions(+), 103 deletions(-) create mode 100644 main.py create mode 100644 model.py create mode 100644 mydatasets.py create mode 100644 train.py diff --git a/README.md b/README.md index 9a048b6..5ee32a7 100644 --- a/README.md +++ b/README.md @@ -1,125 +1,127 @@ ## Introduction -Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github.com/Shawn1993/cnn-text-classification-pytorch), refactored as a scikit-learn classifier. +This is the implementation of Kim's [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch. -## Requirements +1. Kim's implementation of the model in Theano: +[https://github.com/yoonkim/CNN_sentence](https://github.com/yoonkim/CNN_sentence) +2. Denny Britz has an implementation in Tensorflow: +[https://github.com/dennybritz/cnn-text-classification-tf](https://github.com/dennybritz/cnn-text-classification-tf) +3. 
Alexander Rakhlin's implementation in Keras; +[https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras](https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras) + +## Requirement * python 3 * pytorch > 0.1 * torchtext > 0.1 * numpy -* scikit-learn - -## Known Issues -* Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). -* Only supports pre-trained word vectors from TorchText (or no pre-trained vectors). -* The random_state parameter probably only works with integers or None. -* Features my idiosyncratic coding style. - -## To Do -* Add support for cross-validation during training. -* Implement sample weights in eval scoring? - -## Parameters -**lr : float, optional (default=0.01)** - Initial learning rate. - -**epochs : integer, optional (default=256)** - Number of training epochs. - -**batch_size : integer, optional (default=64)** - Training batch size. - -**test_interval : integer, optional (default=100)** - The number of epochs to wait before testing. - -**early_stop : integer, optional (default=1000)** - The number of iterations without increased performance to reach before stopping. - -**save_best : boolean, optional (default=True)** - Keep the model with the best performance found during training. - -**dropout : float, optional (default=0.5)** - Dropout probability. - -**max_norm : float, optional (default=0.0)** - L2 constraint. - -**embed_dim : integer, optional (default=128)** - The number of embedding dimensions. Ignored if vectors is not None. - -**kernel_num : integer, optional (default=100)** - The number of each size of kernel. - -**kernel_sizes : iterable of integers, optional (default=(3, 4, 5))** - Kernel sizes to use for convolution. - -**static : boolean, optional (default=False)** - If true, fix the embedding. - -**device : int, optional (default=-1)** - Device to use for iterating data; -1 for CPU (see torch.cuda.set_device()). -**cuda : boolean, optional (default=True)** - If true, use the GPU if available. +## Result +I just tried two dataset, MR and SST. -**activation_func : string, optional (default='relu')** - Activation function. If 'relu' or 'tanh', uses rectified linear unit or hyperbolic tangent, respectively. Otherwise, uses no activation function (f(x) = x). +|Dataset|Class Size|Best Result|Kim's Paper Result| +|---|---|---|---| +|MR|2|77.5%(CNN-rand-static)|76.1%(CNN-rand-nostatic)| +|SST|5|37.2%(CNN-rand-static)|45.0%(CNN-rand-nostatic)| -**scoring : callable or "roc_auc", optional (default=sklearn.metrics.make_scorer(sklearn.metrics.accuracy_score))** - Scoring method for testing model performance during fitting. +I haven't adjusted the hyper-parameters for SST seriously. -**pos_label : string, optional (default=None)** - Positive class label for roc_auc scoring. Ignored if using a different scoring method. - -**vectors : string, optional (default=None)** - Which pretrained TorchText vectors to use (see [torchtext.vocab.pretrained_aliases](https://torchtext.readthedocs.io/en/latest/vocab.html#pretrained-aliases) for options). - -**split_ratio : float, optional (default=0.9)** - Ratio of training data used for training. The remainder will be used for validation. - -**preprocessor : callable or None, optional (default=None)** - Override default string preprocessing. - -**class_weight : dict, "balanced" or None, optional (default=None)** - Weights associated with each class (see class_weight parameter in existing scikit-learn classifiers). 
-
-**random_state : integer, optional (default=None)**
-  Seed for the random number generator.
-
-**verbose : integer, optional (default=0)**
-  Controls the verbosity when fitting.
-
-## Methods
-**fit(X, y, sample_weight=None)**
-Train the CNN classifier from the training set (X, y).
+## Usage
 ```
-Parameters: X: list of strings
-        The training input samples.
+./main.py -h
 ```
+or

-        y: list of strings
-        The class labels.
+```
+python3 main.py -h
+```

-        sample_weight: list of integers or floats, or None
-        Sample weights. If None, samples are equally weighted.
+You will get:

-Returns: self : object
 ```
-
-**predict(X)**
-Predict class for X.
+CNN text classificer
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -batch-size N         batch size for training [default: 50]
+  -lr LR                initial learning rate [default: 0.01]
+  -epochs N             number of epochs for train [default: 10]
+  -dropout              the probability for dropout [default: 0.5]
+  -max_norm MAX_NORM    l2 constraint of parameters
+  -cpu                  disable the gpu
+  -device DEVICE        device to use for iterate data
+  -embed-dim EMBED_DIM
+  -static               fix the embedding
+  -kernel-sizes KERNEL_SIZES
+                        Comma-separated kernel size to use for convolution
+  -kernel-num KERNEL_NUM
+                        number of each kind of kernel
+  -class-num CLASS_NUM  number of class
+  -shuffle              shuffle the data every epoch
+  -num-workers NUM_WORKERS
+                        how many subprocesses to use for data loading
+                        [default: 0]
+  -log-interval LOG_INTERVAL
+                        how many batches to wait before logging training
+                        status
+  -test-interval TEST_INTERVAL
+                        how many epochs to wait before testing
+  -save-interval SAVE_INTERVAL
+                        how many epochs to wait before saving
+  -predict PREDICT      predict the sentence given
+  -snapshot SNAPSHOT    filename of model snapshot [default: None]
+  -save-dir SAVE_DIR    where to save the checkpoint
 ```
-Parameters: X: list of strings
-        The input samples.
-Returns: y: list of strings
-        The predicted classes.
+## Train
+```
+./main.py
 ```
+You will get:

-**predict_proba(X)**
-Predict class probabilities for X.
 ```
-Parameters: X: list of strings
-        The input samples.
+Batch[100] - loss: 0.655424 acc: 59.3750%
+Evaluation - loss: 0.672396 acc: 57.6923%(615/1066)
+```
+
+## Test
+If you have constructed your test set, you can run testing like:

-Returns: y: list of lists of floats
-        The predicted class probabilities.
 ```
+/main.py -test -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt
+```
+The snapshot option specifies where your model is loaded from. If you don't assign it, the model will start from scratch.
+
+## Predict
+* **Example1**
+
+  ```
+  ./main.py -predict="Hello my dear , I love you so much ." \
+            -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt"
+  ```
+  You will get:
+
+  ```
+  Loading model from [./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt]...
+
+  [Text] Hello my dear , I love you so much .
+  [Label] positive
+  ```
+* **Example2**
+
+  ```
+  ./main.py -predict="You just make me so sad and I have to leave you ."\
+            -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt"
+  ```
+  You will get:
+
+  ```
+  Loading model from [./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt]...
+
+  [Text] You just make me so sad and I have to leave you .
+  [Label] negative
+  ```
+
+Your text must be separated by spaces, even punctuation. Also, your text should be longer than the max kernel size.
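The length requirement comes straight from the convolution shapes: a kernel of height K cannot slide over fewer than K token rows, so PyTorch rejects shorter inputs at run time. A toy demonstration with made-up sizes:

```
import torch
import torch.nn as nn

conv = nn.Conv2d(1, 2, (5, 10))         # kernel height 5 over a 10-dim embedding
long_enough = torch.randn(1, 1, 7, 10)  # 7 "tokens": fine
print(conv(long_enough).shape)          # torch.Size([1, 2, 3, 1])

too_short = torch.randn(1, 1, 4, 10)    # 4 "tokens" < kernel height 5
try:
    conv(too_short)
except RuntimeError as error:
    print("rejected:", type(error).__name__)
```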
+ +## Reference +* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) + diff --git a/main.py b/main.py new file mode 100644 index 0000000..dd222a6 --- /dev/null +++ b/main.py @@ -0,0 +1,116 @@ +#! /usr/bin/env python +import os +import argparse +import datetime +import torch +import torchtext.data as data +import torchtext.datasets as datasets +import model +import train +import mydatasets + + +parser = argparse.ArgumentParser(description='CNN text classificer') +# learning +parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]') +parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]') +parser.add_argument('-batch-size', type=int, default=64, help='batch size for training [default: 64]') +parser.add_argument('-log-interval', type=int, default=1, help='how many steps to wait before logging training status [default: 1]') +parser.add_argument('-test-interval', type=int, default=100, help='how many steps to wait before testing [default: 100]') +parser.add_argument('-save-interval', type=int, default=500, help='how many steps to wait before saving [default:500]') +parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the snapshot') +parser.add_argument('-early-stop', type=int, default=1000, help='iteration numbers to stop without performance increasing') +parser.add_argument('-save-best', type=bool, default=True, help='whether to save when get best performance') +# data +parser.add_argument('-shuffle', action='store_true', default=False, help='shuffle the data every epoch') +# model +parser.add_argument('-dropout', type=float, default=0.5, help='the probability for dropout [default: 0.5]') +parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]') +parser.add_argument('-embed-dim', type=int, default=128, help='number of embedding dimension [default: 128]') +parser.add_argument('-kernel-num', type=int, default=100, help='number of each kind of kernel') +parser.add_argument('-kernel-sizes', type=str, default='3,4,5', help='comma-separated kernel size to use for convolution') +parser.add_argument('-static', action='store_true', default=False, help='fix the embedding') +# device +parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data, -1 mean cpu [default: -1]') +parser.add_argument('-no-cuda', action='store_true', default=False, help='disable the gpu') +# option +parser.add_argument('-snapshot', type=str, default=None, help='filename of model snapshot [default: None]') +parser.add_argument('-predict', type=str, default=None, help='predict the sentence given') +parser.add_argument('-test', action='store_true', default=False, help='train or test') +args = parser.parse_args() + + +# load SST dataset +def sst(text_field, label_field, **kargs): + train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True) + text_field.build_vocab(train_data, dev_data, test_data) + label_field.build_vocab(train_data, dev_data, test_data) + train_iter, dev_iter, test_iter = data.BucketIterator.splits( + (train_data, dev_data, test_data), + batch_sizes=(args.batch_size, + len(dev_data), + len(test_data)), + **kargs) + return train_iter, dev_iter, test_iter + + +# load MR dataset +def mr(text_field, label_field, **kargs): + train_data, dev_data = mydatasets.MR.splits(text_field, label_field) + text_field.build_vocab(train_data, dev_data) + 
label_field.build_vocab(train_data, dev_data) + train_iter, dev_iter = data.Iterator.splits( + (train_data, dev_data), + batch_sizes=(args.batch_size, len(dev_data)), + **kargs) + return train_iter, dev_iter + + +# load data +print("\nLoading data...") +text_field = data.Field(lower=True) +label_field = data.Field(sequential=False) +train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False) +# train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False) + + +# update args and print +args.embed_num = len(text_field.vocab) +args.class_num = len(label_field.vocab) - 1 +args.cuda = (not args.no_cuda) and torch.cuda.is_available(); del args.no_cuda +args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')] +args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + +print("\nParameters:") +for attr, value in sorted(args.__dict__.items()): + print("\t{}={}".format(attr.upper(), value)) + + +# model +cnn = model.CNN_Text(args) +if args.snapshot is not None: + print('\nLoading model from {}...'.format(args.snapshot)) + cnn.load_state_dict(torch.load(args.snapshot)) + +if args.cuda: + torch.cuda.set_device(args.device) + cnn = cnn.cuda() + + +# train or predict +if args.predict is not None: + label = train.predict(args.predict, cnn, text_field, label_field, args.cuda) + print('\n[Text] {}\n[Label] {}\n'.format(args.predict, label)) +elif args.test: + try: + train.eval(test_iter, cnn, args) + except Exception as e: + print("\nSorry. The test dataset doesn't exist.\n") +else: + print() + try: + train.train(train_iter, dev_iter, cnn, args) + except KeyboardInterrupt: + print('\n' + '-' * 89) + print('Exiting from training early') + diff --git a/model.py b/model.py new file mode 100644 index 0000000..db6586f --- /dev/null +++ b/model.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable + + +class CNN_Text(nn.Module): + + def __init__(self, args): + super(CNN_Text, self).__init__() + self.args = args + + V = args.embed_num + D = args.embed_dim + C = args.class_num + Ci = 1 + Co = args.kernel_num + Ks = args.kernel_sizes + + self.embed = nn.Embedding(V, D) + self.convs = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks]) + self.dropout = nn.Dropout(args.dropout) + self.fc1 = nn.Linear(len(Ks) * Co, C) + + if self.args.static: + self.embed.weight.requires_grad = False + + def forward(self, x): + x = self.embed(x) # (N, W, D) + + x = x.unsqueeze(1) # (N, Ci, W, D) + + x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] # [(N, Co, W), ...]*len(Ks) + + x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] # [(N, Co), ...]*len(Ks) + + x = torch.cat(x, 1) + + x = self.dropout(x) # (N, len(Ks)*Co) + logit = self.fc1(x) # (N, C) + return logit diff --git a/mydatasets.py b/mydatasets.py new file mode 100644 index 0000000..961188f --- /dev/null +++ b/mydatasets.py @@ -0,0 +1,110 @@ +import re +import os +import random +import tarfile +import urllib +from torchtext import data + + +class TarDataset(data.Dataset): + """Defines a Dataset loaded from a downloadable tar archive. + + Attributes: + url: URL where the tar archive can be downloaded. + filename: Filename of the downloaded tar archive. + dirname: Name of the top-level directory within the zip archive that + contains the data files. 
+    """
+
+    @classmethod
+    def download_or_unzip(cls, root):
+        path = os.path.join(root, cls.dirname)
+        if not os.path.isdir(path):
+            tpath = os.path.join(root, cls.filename)
+            if not os.path.isfile(tpath):
+                print('downloading')
+                urllib.request.urlretrieve(cls.url, tpath)
+            with tarfile.open(tpath, 'r') as tfile:
+                print('extracting')
+                tfile.extractall(root)
+        return os.path.join(path, '')
+
+
+class MR(TarDataset):
+
+    url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
+    filename = 'rt-polaritydata.tar.gz'
+    dirname = 'rt-polaritydata'
+
+    @staticmethod
+    def sort_key(ex):
+        return len(ex.text)
+
+    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
+        """Create an MR dataset instance given a path and fields.
+
+        Arguments:
+            text_field: The field that will be used for text data.
+            label_field: The field that will be used for label data.
+            path: Path to the data file.
+            examples: The examples containing all the data.
+            Remaining keyword arguments: Passed to the constructor of
+                data.Dataset.
+        """
+        def clean_str(string):
+            """
+            Tokenization/string cleaning for all datasets except for SST.
+            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
+            """
+            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
+            string = re.sub(r"\'s", " \'s", string)
+            string = re.sub(r"\'ve", " \'ve", string)
+            string = re.sub(r"n\'t", " n\'t", string)
+            string = re.sub(r"\'re", " \'re", string)
+            string = re.sub(r"\'d", " \'d", string)
+            string = re.sub(r"\'ll", " \'ll", string)
+            string = re.sub(r",", " , ", string)
+            string = re.sub(r"!", " ! ", string)
+            string = re.sub(r"\(", " \( ", string)
+            string = re.sub(r"\)", " \) ", string)
+            string = re.sub(r"\?", " \? ", string)
+            string = re.sub(r"\s{2,}", " ", string)
+            return string.strip()
+
+        text_field.tokenize = lambda x: clean_str(x).split()
+        fields = [('text', text_field), ('label', label_field)]
+
+        if examples is None:
+            path = self.dirname if path is None else path
+            examples = []
+            with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f:
+                examples += [
+                    data.Example.fromlist([line, 'negative'], fields) for line in f]
+            with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f:
+                examples += [
+                    data.Example.fromlist([line, 'positive'], fields) for line in f]
+        super(MR, self).__init__(examples, fields, **kwargs)
+
+    @classmethod
+    def splits(cls, text_field, label_field, dev_ratio=.1, shuffle=True, root='.', **kwargs):
+        """Create dataset objects for splits of the MR dataset.
+
+        Arguments:
+            text_field: The field that will be used for the sentence.
+            label_field: The field that will be used for label data.
+            dev_ratio: The fraction of the examples to hold out as the
+                validation set.
+            shuffle: Whether to shuffle the data before the split.
+            root: The root directory that the dataset's tar archive will be
+                expanded into, and therefore the directory in which the data
+                files will be stored.
+            Remaining keyword arguments: Passed to the splits method of
+                Dataset.
+        """
+        path = cls.download_or_unzip(root)
+        examples = cls(text_field, label_field, path=path, **kwargs).examples
+        if shuffle: random.shuffle(examples)
+        dev_index = -1 * int(dev_ratio*len(examples))
+
+        return (cls(text_field, label_field, examples=examples[:dev_index]),
+                cls(text_field, label_field, examples=examples[dev_index:]))
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..d9000af
--- /dev/null
+++ b/train.py
@@ -0,0 +1,102 @@
+import os
+import sys
+import torch
+import torch.autograd as autograd
+import torch.nn.functional as F
+
+
+def train(train_iter, dev_iter, model, args):
+    if args.cuda:
+        model.cuda()
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
+
+    steps = 0
+    best_acc = 0
+    last_step = 0
+    model.train()
+    for epoch in range(1, args.epochs+1):
+        for batch in train_iter:
+            feature, target = batch.text, batch.label
+            feature.t_(), target.sub_(1)  # batch first, index align
+            if args.cuda:
+                feature, target = feature.cuda(), target.cuda()
+
+            optimizer.zero_grad()
+            logit = model(feature)
+            loss = F.cross_entropy(logit, target)
+            loss.backward()
+            optimizer.step()
+
+            steps += 1
+            if steps % args.log_interval == 0:
+                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
+                accuracy = 100.0 * corrects/batch.batch_size
+                sys.stdout.write(
+                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
+                                                                             loss.item(),
+                                                                             accuracy.item(),
+                                                                             corrects.item(),
+                                                                             batch.batch_size))
+            if steps % args.test_interval == 0:
+                dev_acc = eval(dev_iter, model, args)
+                if dev_acc > best_acc:
+                    best_acc = dev_acc
+                    last_step = steps
+                    if args.save_best:
+                        save(model, args.save_dir, 'best', steps)
+                else:
+                    if steps - last_step >= args.early_stop:
+                        print('early stop by {} steps.'.format(args.early_stop))
+                        return
+            elif steps % args.save_interval == 0:
+                save(model, args.save_dir, 'snapshot', steps)
+
+
+def eval(data_iter, model, args):
+    model.eval()
+    corrects, avg_loss = 0, 0
+    for batch in data_iter:
+        feature, target = batch.text, batch.label
+        feature.t_(), target.sub_(1)  # batch first, index align
+        if args.cuda:
+            feature, target = feature.cuda(), target.cuda()
+
+        logit = model(feature)
+        loss = F.cross_entropy(logit, target, reduction='sum')
+
+        avg_loss += loss.item()
+        corrects += (torch.max(logit, 1)
+                     [1].view(target.size()).data == target.data).sum()
+
+    size = len(data_iter.dataset)
+    avg_loss /= size
+    accuracy = 100.0 * corrects/size
+    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss,
+                                                                       accuracy,
+                                                                       corrects,
+                                                                       size))
+    return accuracy
+
+
+def predict(text, model, text_field, label_field, cuda_flag):
+    assert isinstance(text, str)
+    model.eval()
+    # text = text_field.tokenize(text)
+    text = text_field.preprocess(text)
+    text = [[text_field.vocab.stoi[x] for x in text]]
+    x = torch.tensor(text)
+    x = autograd.Variable(x)
+    if cuda_flag:
+        x = x.cuda()
+    output = model(x)
+    _, predicted = torch.max(output, 1)
+    return label_field.vocab.itos[predicted.item()+1]
+
+
+def save(model, save_dir, save_prefix, steps):
+    if not os.path.isdir(save_dir):
+        os.makedirs(save_dir)
+    save_prefix = os.path.join(save_dir, save_prefix)
+    save_path = '{}_steps_{}.pt'.format(save_prefix, steps)
+    torch.save(model.state_dict(), save_path)

From 0751811ee2d2577b620b6c3bebef3dbafdb37c3e Mon Sep 17 00:00:00 2001
From: rriva002
Date: Mon, 31 Aug 2020 15:35:06 -0700
Subject: [PATCH 28/31] Delete cnn_text_classification.py

---
 cnn_text_classification.py | 299 -------------------------------------
 1 file
changed, 299 deletions(-) delete mode 100644 cnn_text_classification.py diff --git a/cnn_text_classification.py b/cnn_text_classification.py deleted file mode 100644 index 4d83355..0000000 --- a/cnn_text_classification.py +++ /dev/null @@ -1,299 +0,0 @@ -import re -import torch -import torch.nn as nn -import torch.nn.functional as F -from collections import Counter -from os import remove -from os.path import exists -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score -from sklearn.model_selection import train_test_split as split -from sklearn.utils.class_weight import compute_sample_weight -from time import time -from torchtext.data import Dataset, Example, Field, Iterator - - -class CNNClassifier(BaseEstimator, ClassifierMixin): - def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, - early_stop=1000, save_best=True, dropout=0.5, max_norm=0.0, - embed_dim=128, kernel_num=100, kernel_sizes=(3, 4, 5), - static=False, device=-1, cuda=True, activation_func="relu", - scoring=make_scorer(accuracy_score), pos_label=None, - vectors=None, split_ratio=0.8, preprocessor=None, - class_weight=None, random_state=None, verbose=0): - self.lr = lr - self.epochs = epochs - self.batch_size = batch_size - self.test_interval = test_interval - self.early_stop = early_stop - self.save_best = save_best - self.dropout = dropout - self.max_norm = max_norm - self.embed_dim = embed_dim - self.kernel_num = kernel_num - self.kernel_sizes = kernel_sizes - self.static = static - self.device = device - self.cuda = cuda - self.activation_func = activation_func - self.scoring = scoring - self.pos_label = pos_label - self.vectors = vectors - self.split_ratio = split_ratio - self.preprocessor = preprocessor - self.class_weight = class_weight - self.random_state = random_state - self.verbose = verbose - self.__max_kernel_size = max(self.kernel_sizes) - - def __default_preprocessor(self, string): - string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) - string = re.sub(r"\'s", " \'s", string) - string = re.sub(r"\'ve", " \'ve", string) - string = re.sub(r"n\'t", " n\'t", string) - string = re.sub(r"\'re", " \'re", string) - string = re.sub(r"\'d", " \'d", string) - string = re.sub(r"\'ll", " \'ll", string) - string = re.sub(r",", " , ", string) - string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " ( ", string) - string = re.sub(r"\)", " ) ", string) - string = re.sub(r"\?", " ? 
", string) - string = re.sub(r"\s{2,}", " ", string) - return string.strip() - - def __eval(self, data_iter): - self.__model.eval() - - preds, targets = [], [] - softmax = nn.Softmax(dim=1) if self.scoring == "roc_auc" else None - - for batch in data_iter: - feature, target = batch.text.t_(), batch.label.sub_(1) - - if self.cuda and torch.cuda.is_available(): - feature, target = feature.cuda(), target.cuda() - - logit = self.__model(feature) - - F.cross_entropy(logit, target, reduction="sum") - - if self.scoring == "roc_auc": - pred = [[float(p) for p in dist] for dist in softmax(logit)] - else: - pred = torch.max(logit, 1)[1].view(target.size()).tolist() - - preds += pred - targets += target.tolist() - - targets = [self.__label_field.vocab.itos[targ + 1] for targ in targets] - - if self.scoring == "roc_auc": - pos_index = self.__label_field.vocab.stoi[self.pos_label] - 1 - return roc_auc_score(targets, [pred[pos_index] for pred in preds]) - - preds = [self.__label_field.vocab.itos[pred + 1] for pred in preds] - return self.scoring(_Eval(preds), None, targets) - - def fit(self, X, y, sample_weight=None): - if self.random_state is not None: - torch.manual_seed(self.random_state) - - torch.backends.cudnn.deterministic = self.random_state is not None - torch.backends.cudnn.benchmark = self.random_state is None - - if self.verbose > 1: - params = self.get_params().items() - - print("Fitting with the following parameters:") - print("\n".join([": ".join([k, str(v)]) for k, v in params])) - - start = time() if self.verbose > 0 else None - train_iter, dev_iter = self.__prepare_train_data(X, y, sample_weight) - embed_num = len(self.__text_field.vocab) - class_num = len(self.__label_field.vocab) - 1 - self.__model = _CNNText(embed_num, self.embed_dim, class_num, - self.kernel_num, self.kernel_sizes, - self.dropout, self.static, - self.activation_func, - vectors=self.__text_field.vocab.vectors) - - if self.cuda and torch.cuda.is_available(): - torch.cuda.set_device(self.device) - self.__model.cuda() - - optimizer = torch.optim.Adam(self.__model.parameters(), lr=self.lr, - weight_decay=self.max_norm) - steps, best_acc, last_step = 0, 0, 0 - active = True - filename = "./{}.model".format(time()) - - self.__model.train() - - for epoch in range(self.epochs): - for batch in train_iter: - feature, target = batch.text.t_(), batch.label.sub_(1) - - if self.cuda and torch.cuda.is_available(): - feature, target = feature.cuda(), target.cuda() - - optimizer.zero_grad() - F.cross_entropy(self.__model(feature), target).backward() - optimizer.step() - - steps += 1 - - if steps % self.test_interval == 0: - dev_acc = self.__eval(dev_iter) - - if dev_acc > best_acc: - best_acc = dev_acc - last_step = steps - - if self.save_best: - torch.save(self.__model.state_dict(), filename) - elif steps - last_step >= self.early_stop: - active = False - break - - if not active: - break - - if self.save_best and exists(filename): - self.__model.load_state_dict(torch.load(filename)) - remove(filename) - - self.classes_ = self.__label_field.vocab.itos[1:] - - if self.verbose > 0: - self.__print_elapsed_time(time() - start) - - return self - - def __predict(self, X): - texts = [] - - self.__model.eval() - - for text in X: - assert isinstance(text, str) - - text = self.__text_field.preprocess(text) - text = [self.__text_field.vocab.stoi[x] for x in text] - texts.append(torch.tensor(text)) - - x = torch.stack(texts, 0) - x = x.cuda() if self.cuda and torch.cuda.is_available() else x - return self.__model(x) - - def predict(self, X): - 
y_pred = torch.argmax(self.__predict(X), 1) - return [self.__label_field.vocab.itos[yi.item() + 1] for yi in y_pred] - - def predict_proba(self, X): - return nn.Softmax(dim=1)(self.__predict(X)).tolist() - - def __prepare_train_data(self, X, y, sample_weight): - self.__text_field = Field(lower=True) - self.__label_field = Field(sequential=False) - self.__text_field.tokenize = self.__tokenize - sample_weight = None if sample_weight is None else list(sample_weight) - sw = [1 for yi in y] if sample_weight is None else sample_weight - s = y if Counter(y).most_common()[-1][1] > 1 else None - X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=s, - random_state=self.random_state, - train_size=self.split_ratio) - fields = [("text", self.__text_field), ("label", self.__label_field)] - examples = [[X_t[i], y_t[i]] for i in range(len(X_t))] - examples = [Example.fromlist(example, fields) for example in examples] - weights = compute_sample_weight(self.class_weight, y_t) - weights = [weights[i] * w_t[i] for i in range(len(y_t))] - min_weight = min(weights) - weights = [int(round(weight / min_weight)) for weight in weights] - - for i in range(len(X_t)): - Xi = [X_t[i] for j in range(weights[i] - 1)] - examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] - - train_data = Dataset(examples, fields) - dev_data = [[X_d[i], y_d[i]] for i in range(len(X_d))] - dev_data = [Example.fromlist(example, fields) for example in dev_data] - dev_data = Dataset(dev_data, fields) - - self.__text_field.build_vocab(train_data, dev_data, - vectors=self.vectors) - self.__label_field.build_vocab(train_data, dev_data) - - batch_sizes = (self.batch_size, len(dev_data)) - return Iterator.splits((train_data, dev_data), batch_sizes=batch_sizes, - sort_key=lambda ex: len(ex.text), repeat=False) - - def __print_elapsed_time(self, seconds): - sc = round(seconds) - mn = int(sc / 60) - sc = sc % 60 - hr = int(mn / 60) - mn = mn % 60 - hr = "{} hour{}".format(hr, "s" if hr > 1 else "") if hr > 0 else "" - mn = "{} minute{}".format(mn, "s" if mn > 1 else "") if mn > 0 else "" - sc = "{} second{}".format(sc, "s" if sc > 1 else "") if sc > 0 else "" - times = [t for t in [hr, mn, sc] if len(t) > 0] - - if len(times) == 3: - times = " and ".join([", ".join([hr, mn]), sc]) - elif len(times) == 2: - times = " and ".join(times) - else: - times = times[0] if len(times) > 0 else "less than 1 second" - - print("Completed training in {}.".format(times)) - - def __tokenize(self, text): - if self.preprocessor is None: - text = self.__default_preprocessor(text) - else: - text = self.preprocessor(text) - - tokens = text.split() - difference = self.__max_kernel_size - len(tokens) - return tokens + [self.__text_field.pad_token] * max(difference, 0) - - -class _CNNText(nn.Module): - def __init__(self, embed_num, embed_dim, class_num, kernel_num, - kernel_sizes, dropout, static, activation_func, vectors=None): - super(_CNNText, self).__init__() - - if vectors is None: - self.__embed = nn.Embedding(embed_num, embed_dim) - self.__embed.weight.requires_grad = not static - else: - self.__embed = nn.Embedding.from_pretrained(vectors, freeze=static) - embed_dim = self.__embed.embedding_dim - - Ks = kernel_sizes - module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] - self.__convs = nn.ModuleList(module_list) - self.__dropout = nn.Dropout(dropout) - self.__fc = nn.Linear(len(Ks) * kernel_num, class_num) - - if activation_func == "relu": - self.__f = F.relu - elif activation_func == "tanh": - self.__f = torch.tanh - 
else: - self.__f = lambda x: x - - def forward(self, x): - x = self.__embed(x).unsqueeze(1) - x = [self.__f(cnv(x), inplace=True).squeeze(3) for cnv in self.__convs] - x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] - return self.__fc(self.__dropout(torch.cat(x, 1))) - - -class _Eval(): - def __init__(self, preds): - self.__preds = preds - - def predict(self, X): - return self.__preds From 8b368e65cffc19b290ab2565d7f146191e2884c6 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Mon, 31 Aug 2020 15:36:49 -0700 Subject: [PATCH 29/31] Add files via upload From 5e11c712d34bfc105931678168443d393e31a37c Mon Sep 17 00:00:00 2001 From: "Bowen(Brad) Xu" Date: Sat, 26 Sep 2020 01:33:55 +0800 Subject: [PATCH 30/31] Fix RuntimeError Re-call model.train() from eval() to fix "RuntimeError:Cudnn RNN backward can only be called in training mode." error. --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index d9000af..9e29b33 100644 --- a/train.py +++ b/train.py @@ -14,9 +14,9 @@ def train(train_iter, dev_iter, model, args): steps = 0 best_acc = 0 last_step = 0 - model.train() for epoch in range(1, args.epochs+1): for batch in train_iter: + model.train() feature, target = batch.text, batch.label feature.t_(), target.sub_(1) # batch first, index align if args.cuda: From ddf03147822be1aa7490d27a192847acbae86015 Mon Sep 17 00:00:00 2001 From: TrellixVulnTeam Date: Fri, 25 Nov 2022 16:26:59 +0000 Subject: [PATCH 31/31] Adding tarfile member sanitization to extractall() --- mydatasets.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/mydatasets.py b/mydatasets.py index 961188f..26fae46 100644 --- a/mydatasets.py +++ b/mydatasets.py @@ -26,7 +26,26 @@ def download_or_unzip(cls, root): urllib.request.urlretrieve(cls.url, tpath) with tarfile.open(tpath, 'r') as tfile: print('extracting') - tfile.extractall(root) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(tfile, root) return os.path.join(path, '')