From 043c1b91d6975b95a11385f07c80adcb0dc65c0e Mon Sep 17 00:00:00 2001 From: Rohan Bhargava Date: Wed, 28 Nov 2018 23:02:16 -0800 Subject: [PATCH 01/31] Fixed breaking changes in predict function introduced by pytorch 0.4 changes to tensor_type --- train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index b2d46a0..7f90aaa 100644 --- a/train.py +++ b/train.py @@ -87,8 +87,8 @@ def predict(text, model, text_field, label_feild, cuda_flag): # text = text_field.tokenize(text) text = text_field.preprocess(text) text = [[text_field.vocab.stoi[x] for x in text]] - x = text_field.tensor_type(text) - x = autograd.Variable(x, volatile=True) + x = torch.tensor(text) + x = autograd.Variable(x) if cuda_flag: x = x.cuda() print(x) @@ -103,4 +103,4 @@ def save(model, save_dir, save_prefix, steps): os.makedirs(save_dir) save_prefix = os.path.join(save_dir, save_prefix) save_path = '{}_steps_{}.pt'.format(save_prefix, steps) - torch.save(model.state_dict(), save_path) \ No newline at end of file + torch.save(model.state_dict(), save_path) From 0ea6819b214154a2b3bee650ba471a4e71d737b9 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Fri, 5 Apr 2019 16:50:04 -0700 Subject: [PATCH 02/31] Refactored for use with scikit-learn --- README.md | 190 ++++++++++++++----------------- cnn_text_classification.py | 228 +++++++++++++++++++++++++++++++++++++ main.py | 116 ------------------- model.py | 58 ---------- mydatasets.py | 110 ------------------ train.py | 106 ----------------- 6 files changed, 312 insertions(+), 496 deletions(-) create mode 100644 cnn_text_classification.py delete mode 100755 main.py delete mode 100644 model.py delete mode 100644 mydatasets.py delete mode 100644 train.py diff --git a/README.md b/README.md index 5ee32a7..f25b726 100644 --- a/README.md +++ b/README.md @@ -1,127 +1,105 @@ ## Introduction -This is the implementation of Kim's [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch. +Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github.com/Shawn1993/cnn-text-classification-pytorch), refactored as a scikit-learn classifier. -1. Kim's implementation of the model in Theano: -[https://github.com/yoonkim/CNN_sentence](https://github.com/yoonkim/CNN_sentence) -2. Denny Britz has an implementation in Tensorflow: -[https://github.com/dennybritz/cnn-text-classification-tf](https://github.com/dennybritz/cnn-text-classification-tf) -3. Alexander Rakhlin's implementation in Keras; -[https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras](https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras) - -## Requirement +## Requirements * python 3 * pytorch > 0.1 * torchtext > 0.1 * numpy +* scikit-learn -## Result -I just tried two dataset, MR and SST. +## Known Issues +* The predict method is probably not as efficient as it could be. +* Doesn't play well with GridSearchCV if num_jobs isn't 1. +* Weights are represented by upsampling. +* Only supports pre-trained word vectors from TorchText. +* The random_state parameter probably only works with integers or None. +* Features my idiosyncratic coding style. 
-|Dataset|Class Size|Best Result|Kim's Paper Result|
-|---|---|---|---|
-|MR|2|77.5%(CNN-rand-static)|76.1%(CNN-rand-nostatic)|
-|SST|5|37.2%(CNN-rand-static)|45.0%(CNN-rand-nostatic)|
+## To Do
+* Add support for different scoring methods (balanced accuracy, recall, etc.).
+* Add support for cross-validation during training.

-I haven't adjusted the hyper-parameters for SST seriously.
+## Parameters
+**lr : float, optional (default=0.001)**
+ Initial learning rate.

-## Usage
-```
-./main.py -h
-```
-or
+**epochs : integer, optional (default=256)**
+ Number of training epochs.

-```
-python3 main.py -h
-```
+**batch_size : integer, optional (default=64)**
+ Training batch size.

-You will get:
+**test_interval : integer, optional (default=100)**
+ The number of training steps to wait between evaluations on the validation set.

-```
-CNN text classificer
-
-optional arguments:
-  -h, --help            show this help message and exit
-  -batch-size N         batch size for training [default: 50]
-  -lr LR                initial learning rate [default: 0.01]
-  -epochs N             number of epochs for train [default: 10]
-  -dropout              the probability for dropout [default: 0.5]
-  -max_norm MAX_NORM    l2 constraint of parameters
-  -cpu                  disable the gpu
-  -device DEVICE        device to use for iterate data
-  -embed-dim EMBED_DIM
-  -static               fix the embedding
-  -kernel-sizes KERNEL_SIZES
-                        Comma-separated kernel size to use for convolution
-  -kernel-num KERNEL_NUM
-                        number of each kind of kernel
-  -class-num CLASS_NUM  number of class
-  -shuffle              shuffle the data every epoch
-  -num-workers NUM_WORKERS
-                        how many subprocesses to use for data loading
-                        [default: 0]
-  -log-interval LOG_INTERVAL
-                        how many batches to wait before logging training
-                        status
-  -test-interval TEST_INTERVAL
-                        how many epochs to wait before testing
-  -save-interval SAVE_INTERVAL
-                        how many epochs to wait before saving
-  -predict PREDICT      predict the sentence given
-  -snapshot SNAPSHOT    filename of model snapshot [default: None]
-  -save-dir SAVE_DIR    where to save the checkpoint
-```
+**early_stop : integer, optional (default=1000)**
+ The number of training steps without improved validation performance to allow before stopping early.

-## Train
-```
-./main.py
-```
-You will get:
+**save_best : boolean, optional (default=True)**
+ Keep the model with the best performance found during training.

+**dropout : float, optional (default=0.5)**
+ Dropout probability.
+
+**max_norm : float, optional (default=0.0)**
+ L2 constraint.
+
+**embed_dim : integer, optional (default=128)**
+ The number of embedding dimensions.
+
+**kernel_num : integer, optional (default=100)**
+ The number of each size of kernel.
+
+**kernel_sizes : string, optional (default='3,4,5')**
+ Comma-separated kernel sizes to use for convolution.
+
+**static : boolean, optional (default=False)**
+ If true, fix the embedding.
+
+**device : int, optional (default=-1)**
+ Device to use for iterating data; -1 for CPU (see torch.cuda.set_device()).
+
+**cuda : boolean, optional (default=True)**
+ If true, use the GPU if available.
+
+**class_weight : dict, "balanced" or None, optional (default=None)**
+ Weights associated with each class (see class_weight parameter in existing scikit-learn classifiers).
+
+**split_ratio : float, optional (default=0.9)**
+ Ratio of training data used for training. The remainder will be used for validation.
+
+**random_state : integer, optional (default=None)**
+ Seed for the random number generator.
+ +**vectors : string, optional (default=None)** + Which pretrained TorchText vectors to use (see [torchtext.vocab.pretrained_aliases](https://torchtext.readthedocs.io/en/latest/vocab.html#pretrained-aliases) for options). + +**preprocessor : callable or None (default=None)** + Override default string preprocessing. + +## Methods +**fit(X, y, sample_weight=None)** +Train the CNN classifier from the training set (X, y). ``` -Batch[100] - loss: 0.655424 acc: 59.3750% -Evaluation - loss: 0.672396 acc: 57.6923%(615/1066) -``` +Parameters: X: list of strings + The training input samples. -## Test -If you has construct you test set, you make testing like: + y: list of strings + The class labels. + sample_weight: list of integers or floats, or None + Sample weights. If None, samples are equally weighted. + +Returns: self : object ``` -/main.py -test -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt + +**predict(X)** +Predict class for X. ``` -The snapshot option means where your model load from. If you don't assign it, the model will start from scratch. - -## Predict -* **Example1** - - ``` - ./main.py -predict="Hello my dear , I love you so much ." \ - -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt" - ``` - You will get: - - ``` - Loading model from [./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt]... - - [Text] Hello my dear , I love you so much . - [Label] positive - ``` -* **Example2** - - ``` - ./main.py -predict="You just make me so sad and I have to leave you ."\ - -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt" - ``` - You will get: - - ``` - Loading model from [./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt]... - - [Text] You just make me so sad and I have to leave you . - [Label] negative - ``` - -Your text must be separated by space, even punctuation.And, your text should longer then the max kernel size. - -## Reference -* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) +Parameters: X: list of strings + The input samples. +Returns: y: list of strings + The predicted classes. 
+``` diff --git a/cnn_text_classification.py b/cnn_text_classification.py new file mode 100644 index 0000000..847bcd3 --- /dev/null +++ b/cnn_text_classification.py @@ -0,0 +1,228 @@ +import re +import torch +import torch.nn as nn +import torch.nn.functional as F +from collections import Counter +from copy import deepcopy +from sklearn.base import BaseEstimator, ClassifierMixin +from torch.autograd import Variable +from torchtext.data import Dataset, Example, Field, Iterator, Pipeline + + +class CNNClassifier(BaseEstimator, ClassifierMixin): + def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, + early_stop=1000, save_best=True, dropout=0.5, max_norm=0.0, + embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", + static=False, device=-1, cuda=True, class_weight=None, + split_ratio=0.9, random_state=None, vectors=None, + preprocessor=None): + self.lr = lr + self.epochs = epochs + self.batch_size = batch_size + self.test_interval = test_interval + self.early_stop = early_stop + self.save_best = save_best + self.dropout = dropout + self.max_norm = max_norm + self.embed_dim = embed_dim + self.kernel_num = kernel_num + self.kernel_sizes = kernel_sizes + self.static = static + self.device = device + self.cuda = cuda + self.class_weight = class_weight + self.split_ratio = split_ratio + self.random_state = random_state + self.vectors = vectors + self.preprocessor = preprocessor + + def __clean_str(self, string): + string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) + string = re.sub(r"\'s", " \'s", string) + string = re.sub(r"\'ve", " \'ve", string) + string = re.sub(r"n\'t", " n\'t", string) + string = re.sub(r"\'re", " \'re", string) + string = re.sub(r"\'d", " \'d", string) + string = re.sub(r"\'ll", " \'ll", string) + string = re.sub(r",", " , ", string) + string = re.sub(r"!", " ! ", string) + string = re.sub(r"\(", " ( ", string) + string = re.sub(r"\)", " ) ", string) + string = re.sub(r"\?", " ? 
", string) + string = re.sub(r"\s{2,}", " ", string) + return string.strip() + + def __eval(self, data_iter): + self.__model.eval() + + corrects = 0 + + for batch in data_iter: + feature, target = batch.text, batch.label + + feature.data.t_() + target.data.sub_(1) + + if self.cuda and torch.cuda.is_available(): + feature, target = feature.cuda(), target.cuda() + + logit = self.__model(feature) + + F.cross_entropy(logit, target, reduction="sum") + + predictions = torch.max(logit, 1)[1].view(target.size()) + corrects += (predictions.data == target.data).sum() + + return 100.0 * corrects / len(data_iter.dataset) + + def fit(self, X, y, sample_weight=None): + train_iter, dev_iter = self.__preprocess(X, y, sample_weight) + embed_num = len(self.__text_field.vocab) + class_num = len(self.__label_field.vocab) - 1 + kernel_sizes = [int(k) for k in self.kernel_sizes.split(",")] + self.__model = CNNText(embed_num, self.embed_dim, class_num, + self.kernel_num, kernel_sizes, self.dropout, + self.static) + + if self.cuda and torch.cuda.is_available(): + torch.cuda.set_device(self.device) + self.__model.cuda() + + optimizer = torch.optim.Adam(self.__model.parameters(), lr=self.lr, + weight_decay=self.max_norm) + steps, best_acc, last_step = 0, 0, 0 + + self.__model.train() + + for epoch in range(self.epochs): + for batch in train_iter: + feature, target = batch.text, batch.label + + feature.data.t_() + target.data.sub_(1) + + if self.cuda and torch.cuda.is_available(): + feature, target = feature.cuda(), target.cuda() + + optimizer.zero_grad() + F.cross_entropy(self.__model(feature), target).backward() + optimizer.step() + + steps += 1 + + if steps % self.test_interval == 0: + dev_acc = self.__eval(dev_iter) + + if dev_acc > best_acc: + best_acc = dev_acc + last_step = steps + + if self.save_best: + best_model = deepcopy(self.__model) + elif steps - last_step >= self.early_stop: + if self.save_best: + self.__model = best_model + + return self + + self.__model = best_model if self.save_best else self.__model + return self + + def predict(self, X): + y_pred = [] + max_krnl_sz = int(self.kernel_sizes[self.kernel_sizes.rfind(",") + 1:]) + + for text in X: + assert isinstance(text, str) + + text = self.__text_field.preprocess(text) + + if len(text) < max_krnl_sz: + most_common = self.__label_field.vocab.freqs.most_common(1)[0] + + y_pred.append(most_common[0]) + continue + + self.__model.eval() + + text = [[self.__text_field.vocab.stoi[x] for x in text]] + x = Variable(torch.tensor(text)) + x = x.cuda() if self.cuda and torch.cuda.is_available() else x + _, predicted = torch.max(self.__model(x), 1) + + y_pred.append(self.__label_field.vocab.itos[predicted.data[0] + 1]) + + return y_pred + + def __preprocess(self, X, y, sample_weight): + self.__text_field = Field(lower=True) + self.__label_field = Field(sequential=False) + self.__text_field.preprocessing = Pipeline(self.__preprocess_text) + fields = [("text", self.__text_field), ("label", self.__label_field)] + weights = [1 for yi in y] if sample_weight is None else sample_weight + exmpl = [Example.fromlist([X[i], y[i]], fields) for i in range(len(X))] + + if self.class_weight is not None: + cw = self.class_weight + + if isinstance(cw, str) and cw == "balanced": + counter = Counter(y) + cw = [len(y) / (len(counter) * counter[yi]) for yi in y] + weights = [weights[i] * cw[i] for i in range(len(y))] + elif isinstance(cw, dict): + cw = [cw[yi] for yi in y] + weights = [weights[i] * cw[i] for i in range(len(y))] + + min_weight = min(weights) + weights = [round(w / 
min_weight) for w in weights] + + for i in range(len(X)): + if weights[i] > 1: + Xi = [X[i] for j in range(weights[i] - 1)] + exmpl += [Example.fromlist([x, y[i]], fields) for x in Xi] + + train_data, dev_data = Dataset(exmpl, fields).split(self.split_ratio, + self.random_state,) + + self.__text_field.build_vocab(train_data, dev_data, + vectors=self.vectors) + self.__label_field.build_vocab(train_data, dev_data) + + batch_sizes = (self.batch_size, len(dev_data)) + return Iterator.splits((train_data, dev_data), batch_sizes=batch_sizes, + sort_key=lambda ex: len(ex.text), repeat=False) + + def __preprocess_text(self, text): + if self.preprocessor is None: + return self.__clean_str(text) + + return self.preprocessor(text) + + +class CNNText(nn.Module): + def __init__(self, embed_num, embed_dim, class_num, kernel_num, + kernel_sizes, dropout, static): + super(CNNText, self).__init__() + + self.__embed = nn.Embedding(embed_num, embed_dim) + Ks = kernel_sizes + module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] + self.__convs1 = nn.ModuleList(module_list) + self.__dropout = nn.Dropout(dropout) + self.__fc1 = nn.Linear(len(Ks) * kernel_num, class_num) + self.__static = static + + def conv_and_pool(self, x, conv): + x = F.relu(conv(x)).squeeze(3) + return F.max_pool1d(x, x.size(2)).squeeze(2) + + def forward(self, x): + x = self.__embed(x) + + if self.__static: + x = Variable(x) + + x = x.unsqueeze(1) + x = [F.relu(conv(x)).squeeze(3) for conv in self.__convs1] + x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] + return self.__fc1(self.__dropout(torch.cat(x, 1))) diff --git a/main.py b/main.py deleted file mode 100755 index dd222a6..0000000 --- a/main.py +++ /dev/null @@ -1,116 +0,0 @@ -#! /usr/bin/env python -import os -import argparse -import datetime -import torch -import torchtext.data as data -import torchtext.datasets as datasets -import model -import train -import mydatasets - - -parser = argparse.ArgumentParser(description='CNN text classificer') -# learning -parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]') -parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]') -parser.add_argument('-batch-size', type=int, default=64, help='batch size for training [default: 64]') -parser.add_argument('-log-interval', type=int, default=1, help='how many steps to wait before logging training status [default: 1]') -parser.add_argument('-test-interval', type=int, default=100, help='how many steps to wait before testing [default: 100]') -parser.add_argument('-save-interval', type=int, default=500, help='how many steps to wait before saving [default:500]') -parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the snapshot') -parser.add_argument('-early-stop', type=int, default=1000, help='iteration numbers to stop without performance increasing') -parser.add_argument('-save-best', type=bool, default=True, help='whether to save when get best performance') -# data -parser.add_argument('-shuffle', action='store_true', default=False, help='shuffle the data every epoch') -# model -parser.add_argument('-dropout', type=float, default=0.5, help='the probability for dropout [default: 0.5]') -parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]') -parser.add_argument('-embed-dim', type=int, default=128, help='number of embedding dimension [default: 128]') -parser.add_argument('-kernel-num', type=int, default=100, 
help='number of each kind of kernel') -parser.add_argument('-kernel-sizes', type=str, default='3,4,5', help='comma-separated kernel size to use for convolution') -parser.add_argument('-static', action='store_true', default=False, help='fix the embedding') -# device -parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data, -1 mean cpu [default: -1]') -parser.add_argument('-no-cuda', action='store_true', default=False, help='disable the gpu') -# option -parser.add_argument('-snapshot', type=str, default=None, help='filename of model snapshot [default: None]') -parser.add_argument('-predict', type=str, default=None, help='predict the sentence given') -parser.add_argument('-test', action='store_true', default=False, help='train or test') -args = parser.parse_args() - - -# load SST dataset -def sst(text_field, label_field, **kargs): - train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True) - text_field.build_vocab(train_data, dev_data, test_data) - label_field.build_vocab(train_data, dev_data, test_data) - train_iter, dev_iter, test_iter = data.BucketIterator.splits( - (train_data, dev_data, test_data), - batch_sizes=(args.batch_size, - len(dev_data), - len(test_data)), - **kargs) - return train_iter, dev_iter, test_iter - - -# load MR dataset -def mr(text_field, label_field, **kargs): - train_data, dev_data = mydatasets.MR.splits(text_field, label_field) - text_field.build_vocab(train_data, dev_data) - label_field.build_vocab(train_data, dev_data) - train_iter, dev_iter = data.Iterator.splits( - (train_data, dev_data), - batch_sizes=(args.batch_size, len(dev_data)), - **kargs) - return train_iter, dev_iter - - -# load data -print("\nLoading data...") -text_field = data.Field(lower=True) -label_field = data.Field(sequential=False) -train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False) -# train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False) - - -# update args and print -args.embed_num = len(text_field.vocab) -args.class_num = len(label_field.vocab) - 1 -args.cuda = (not args.no_cuda) and torch.cuda.is_available(); del args.no_cuda -args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')] -args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) - -print("\nParameters:") -for attr, value in sorted(args.__dict__.items()): - print("\t{}={}".format(attr.upper(), value)) - - -# model -cnn = model.CNN_Text(args) -if args.snapshot is not None: - print('\nLoading model from {}...'.format(args.snapshot)) - cnn.load_state_dict(torch.load(args.snapshot)) - -if args.cuda: - torch.cuda.set_device(args.device) - cnn = cnn.cuda() - - -# train or predict -if args.predict is not None: - label = train.predict(args.predict, cnn, text_field, label_field, args.cuda) - print('\n[Text] {}\n[Label] {}\n'.format(args.predict, label)) -elif args.test: - try: - train.eval(test_iter, cnn, args) - except Exception as e: - print("\nSorry. 
The test dataset doesn't exist.\n") -else: - print() - try: - train.train(train_iter, dev_iter, cnn, args) - except KeyboardInterrupt: - print('\n' + '-' * 89) - print('Exiting from training early') - diff --git a/model.py b/model.py deleted file mode 100644 index ce0158b..0000000 --- a/model.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable - - -class CNN_Text(nn.Module): - - def __init__(self, args): - super(CNN_Text, self).__init__() - self.args = args - - V = args.embed_num - D = args.embed_dim - C = args.class_num - Ci = 1 - Co = args.kernel_num - Ks = args.kernel_sizes - - self.embed = nn.Embedding(V, D) - # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks] - self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks]) - ''' - self.conv13 = nn.Conv2d(Ci, Co, (3, D)) - self.conv14 = nn.Conv2d(Ci, Co, (4, D)) - self.conv15 = nn.Conv2d(Ci, Co, (5, D)) - ''' - self.dropout = nn.Dropout(args.dropout) - self.fc1 = nn.Linear(len(Ks)*Co, C) - - def conv_and_pool(self, x, conv): - x = F.relu(conv(x)).squeeze(3) # (N, Co, W) - x = F.max_pool1d(x, x.size(2)).squeeze(2) - return x - - def forward(self, x): - x = self.embed(x) # (N, W, D) - - if self.args.static: - x = Variable(x) - - x = x.unsqueeze(1) # (N, Ci, W, D) - - x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] # [(N, Co, W), ...]*len(Ks) - - x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] # [(N, Co), ...]*len(Ks) - - x = torch.cat(x, 1) - - ''' - x1 = self.conv_and_pool(x,self.conv13) #(N,Co) - x2 = self.conv_and_pool(x,self.conv14) #(N,Co) - x3 = self.conv_and_pool(x,self.conv15) #(N,Co) - x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co) - ''' - x = self.dropout(x) # (N, len(Ks)*Co) - logit = self.fc1(x) # (N, C) - return logit diff --git a/mydatasets.py b/mydatasets.py deleted file mode 100644 index 8fddfce..0000000 --- a/mydatasets.py +++ /dev/null @@ -1,110 +0,0 @@ -import re -import os -import random -import tarfile -import urllib -from torchtext import data - - -class TarDataset(data.Dataset): - """Defines a Dataset loaded from a downloadable tar archive. - - Attributes: - url: URL where the tar archive can be downloaded. - filename: Filename of the downloaded tar archive. - dirname: Name of the top-level directory within the zip archive that - contains the data files. - """ - - @classmethod - def download_or_unzip(cls, root): - path = os.path.join(root, cls.dirname) - if not os.path.isdir(path): - tpath = os.path.join(root, cls.filename) - if not os.path.isfile(tpath): - print('downloading') - urllib.request.urlretrieve(cls.url, tpath) - with tarfile.open(tpath, 'r') as tfile: - print('extracting') - tfile.extractall(root) - return os.path.join(path, '') - - -class MR(TarDataset): - - url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz' - filename = 'rt-polaritydata.tar' - dirname = 'rt-polaritydata' - - @staticmethod - def sort_key(ex): - return len(ex.text) - - def __init__(self, text_field, label_field, path=None, examples=None, **kwargs): - """Create an MR dataset instance given a path and fields. - - Arguments: - text_field: The field that will be used for text data. - label_field: The field that will be used for label data. - path: Path to the data file. - examples: The examples contain all the data. - Remaining keyword arguments: Passed to the constructor of - data.Dataset. - """ - def clean_str(string): - """ - Tokenization/string cleaning for all datasets except for SST. 
- Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py - """ - string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) - string = re.sub(r"\'s", " \'s", string) - string = re.sub(r"\'ve", " \'ve", string) - string = re.sub(r"n\'t", " n\'t", string) - string = re.sub(r"\'re", " \'re", string) - string = re.sub(r"\'d", " \'d", string) - string = re.sub(r"\'ll", " \'ll", string) - string = re.sub(r",", " , ", string) - string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " \( ", string) - string = re.sub(r"\)", " \) ", string) - string = re.sub(r"\?", " \? ", string) - string = re.sub(r"\s{2,}", " ", string) - return string.strip() - - text_field.preprocessing = data.Pipeline(clean_str) - fields = [('text', text_field), ('label', label_field)] - - if examples is None: - path = self.dirname if path is None else path - examples = [] - with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f: - examples += [ - data.Example.fromlist([line, 'negative'], fields) for line in f] - with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f: - examples += [ - data.Example.fromlist([line, 'positive'], fields) for line in f] - super(MR, self).__init__(examples, fields, **kwargs) - - @classmethod - def splits(cls, text_field, label_field, dev_ratio=.1, shuffle=True, root='.', **kwargs): - """Create dataset objects for splits of the MR dataset. - - Arguments: - text_field: The field that will be used for the sentence. - label_field: The field that will be used for label data. - dev_ratio: The ratio that will be used to get split validation dataset. - shuffle: Whether to shuffle the data before split. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose trees - subdirectory the data files will be stored. - train: The filename of the train data. Default: 'train.txt'. - Remaining keyword arguments: Passed to the splits method of - Dataset. 
- """ - path = cls.download_or_unzip(root) - examples = cls(text_field, label_field, path=path, **kwargs).examples - if shuffle: random.shuffle(examples) - dev_index = -1 * int(dev_ratio*len(examples)) - - return (cls(text_field, label_field, examples=examples[:dev_index]), - cls(text_field, label_field, examples=examples[dev_index:])) diff --git a/train.py b/train.py deleted file mode 100644 index 7f90aaa..0000000 --- a/train.py +++ /dev/null @@ -1,106 +0,0 @@ -import os -import sys -import torch -import torch.autograd as autograd -import torch.nn.functional as F - - -def train(train_iter, dev_iter, model, args): - if args.cuda: - model.cuda() - - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - - steps = 0 - best_acc = 0 - last_step = 0 - model.train() - for epoch in range(1, args.epochs+1): - for batch in train_iter: - feature, target = batch.text, batch.label - feature.data.t_(), target.data.sub_(1) # batch first, index align - if args.cuda: - feature, target = feature.cuda(), target.cuda() - - optimizer.zero_grad() - logit = model(feature) - - #print('logit vector', logit.size()) - #print('target vector', target.size()) - loss = F.cross_entropy(logit, target) - loss.backward() - optimizer.step() - - steps += 1 - if steps % args.log_interval == 0: - corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum() - accuracy = 100.0 * corrects/batch.batch_size - sys.stdout.write( - '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(steps, - loss.data[0], - accuracy, - corrects, - batch.batch_size)) - if steps % args.test_interval == 0: - dev_acc = eval(dev_iter, model, args) - if dev_acc > best_acc: - best_acc = dev_acc - last_step = steps - if args.save_best: - save(model, args.save_dir, 'best', steps) - else: - if steps - last_step >= args.early_stop: - print('early stop by {} steps.'.format(args.early_stop)) - elif steps % args.save_interval == 0: - save(model, args.save_dir, 'snapshot', steps) - - -def eval(data_iter, model, args): - model.eval() - corrects, avg_loss = 0, 0 - for batch in data_iter: - feature, target = batch.text, batch.label - feature.data.t_(), target.data.sub_(1) # batch first, index align - if args.cuda: - feature, target = feature.cuda(), target.cuda() - - logit = model(feature) - loss = F.cross_entropy(logit, target, size_average=False) - - avg_loss += loss.data[0] - corrects += (torch.max(logit, 1) - [1].view(target.size()).data == target.data).sum() - - size = len(data_iter.dataset) - avg_loss /= size - accuracy = 100.0 * corrects/size - print('\nEvaluation - loss: {:.6f} acc: {:.4f}%({}/{}) \n'.format(avg_loss, - accuracy, - corrects, - size)) - return accuracy - - -def predict(text, model, text_field, label_feild, cuda_flag): - assert isinstance(text, str) - model.eval() - # text = text_field.tokenize(text) - text = text_field.preprocess(text) - text = [[text_field.vocab.stoi[x] for x in text]] - x = torch.tensor(text) - x = autograd.Variable(x) - if cuda_flag: - x = x.cuda() - print(x) - output = model(x) - _, predicted = torch.max(output, 1) - #return label_feild.vocab.itos[predicted.data[0][0]+1] - return label_feild.vocab.itos[predicted.data[0]+1] - - -def save(model, save_dir, save_prefix, steps): - if not os.path.isdir(save_dir): - os.makedirs(save_dir) - save_prefix = os.path.join(save_dir, save_prefix) - save_path = '{}_steps_{}.pt'.format(save_prefix, steps) - torch.save(model.state_dict(), save_path) From 8b00bd7cc4f0711241160a40cc0fe60981a050e8 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 17 Apr 2019 
22:12:17 -0700 Subject: [PATCH 03/31] Ignore training samples shorter than the maximum kernel size --- README.md | 2 ++ cnn_text_classification.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f25b726..e720512 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,8 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. * Weights are represented by upsampling. * Only supports pre-trained word vectors from TorchText. * The random_state parameter probably only works with integers or None. +* Training samples shorter than the maximum kernel size are ignored. +* Test samples shorter than the maximum kernel size are classified as the most common class found during training. * Features my idiosyncratic coding style. ## To Do diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 847bcd3..9171076 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -158,9 +158,21 @@ def __preprocess(self, X, y, sample_weight): self.__text_field = Field(lower=True) self.__label_field = Field(sequential=False) self.__text_field.preprocessing = Pipeline(self.__preprocess_text) + max_krnl_sz = int(self.kernel_sizes[self.kernel_sizes.rfind(",") + 1:]) + X, y = list(X), list(y) + sample_weight = None if sample_weight is None else list(sample_weight) + + for i in range(len(X) - 1, -1, -1): + if len(self.__text_field.preprocess(X[i])) < max_krnl_sz: + del X[i] + del y[i] + + if sample_weight is not None: + del sample_weight[i] + fields = [("text", self.__text_field), ("label", self.__label_field)] - weights = [1 for yi in y] if sample_weight is None else sample_weight exmpl = [Example.fromlist([X[i], y[i]], fields) for i in range(len(X))] + weights = [1 for yi in y] if sample_weight is None else sample_weight if self.class_weight is not None: cw = self.class_weight From aa8b7f19e3984b54a49a1921bff8cc9552fad760 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 24 Apr 2019 17:39:29 -0700 Subject: [PATCH 04/31] Fixed pretrained vector handling and added console output option --- cnn_text_classification.py | 42 ++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 9171076..94575c7 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -5,6 +5,7 @@ from collections import Counter from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin +from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -15,7 +16,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", static=False, device=-1, cuda=True, class_weight=None, split_ratio=0.9, random_state=None, vectors=None, - preprocessor=None): + preprocessor=None, verbose=0): self.lr = lr self.epochs = epochs self.batch_size = batch_size @@ -35,6 +36,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.random_state = random_state self.vectors = vectors self.preprocessor = preprocessor + self.verbose = verbose def __clean_str(self, string): string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) @@ -76,13 +78,15 @@ def __eval(self, data_iter): return 100.0 * corrects / len(data_iter.dataset) def fit(self, X, y, sample_weight=None): + start = time() if self.verbose > 0 else None train_iter, dev_iter = self.__preprocess(X, y, 
sample_weight) embed_num = len(self.__text_field.vocab) class_num = len(self.__label_field.vocab) - 1 kernel_sizes = [int(k) for k in self.kernel_sizes.split(",")] self.__model = CNNText(embed_num, self.embed_dim, class_num, self.kernel_num, kernel_sizes, self.dropout, - self.static) + self.static, + vectors=self.__text_field.vocab.vectors) if self.cuda and torch.cuda.is_available(): torch.cuda.set_device(self.device) @@ -123,9 +127,15 @@ def fit(self, X, y, sample_weight=None): if self.save_best: self.__model = best_model + if self.verbose > 0: + self.__print_elapsed_time(time() - start) return self self.__model = best_model if self.save_best else self.__model + + if self.verbose > 0: + self.__print_elapsed_time(time() - start) + return self def predict(self, X): @@ -208,15 +218,39 @@ def __preprocess_text(self, text): if self.preprocessor is None: return self.__clean_str(text) - return self.preprocessor(text) + return self.__clean_str(self.preprocessor(text)) + + def __print_elapsed_time(self, seconds): + sc = round(seconds) + mn = int(sc / 60) + sc = sc % 60 + hr = int(mn / 60) + mn = mn % 60 + hr = "{} hour{}".format(hr, "s" if hr > 1 else "") if hr > 0 else "" + mn = "{} minute{}".format(mn, "s" if mn > 1 else "") if mn > 0 else "" + sc = "{} second{}".format(sc, "s" if sc > 1 else "") if sc > 0 else "" + times = [t for t in [hr, mn, sc] if len(t) > 0] + + if len(times) == 3: + times = " and ".format(", ".format(hr, mn), sc) + elif len(times) == 2: + times = " and ".join(times) + else: + times = times[0] + + print("Completed training in {}.".format(times)) class CNNText(nn.Module): def __init__(self, embed_num, embed_dim, class_num, kernel_num, - kernel_sizes, dropout, static): + kernel_sizes, dropout, static, vectors=None): super(CNNText, self).__init__() self.__embed = nn.Embedding(embed_num, embed_dim) + + if vectors is not None: + self.__embed = self.__embed.from_pretrained(vectors) + Ks = kernel_sizes module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] self.__convs1 = nn.ModuleList(module_list) From 609a73ddcee8d3a12a9ae2c7f6e06aac6013c422 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 25 Apr 2019 10:30:05 -0700 Subject: [PATCH 05/31] Added support for alternate scoring methods Also made a possible optimization and did some code cleanup. --- README.md | 9 +++++++-- cnn_text_classification.py | 25 +++++++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index e720512..f8f4906 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,6 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. * Features my idiosyncratic coding style. ## To Do -* Add support for different scoring methods (balanced accuracy, recall, etc.). * Add support for cross-validation during training. ## Parameters @@ -77,9 +76,15 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. **vectors : string, optional (default=None)** Which pretrained TorchText vectors to use (see [torchtext.vocab.pretrained_aliases](https://torchtext.readthedocs.io/en/latest/vocab.html#pretrained-aliases) for options). -**preprocessor : callable or None (default=None)** +**preprocessor : callable or None, optional (default=None)** Override default string preprocessing. +**scoring : callable or None, optional (default=sklearn.metrics.accuracy_score)** + Scoring method for testing model performance during fitting. + +**verbose : integer, optional (default=0)** + Controls the verbosity when fitting. 
+ ## Methods **fit(X, y, sample_weight=None)** Train the CNN classifier from the training set (X, y). diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 94575c7..0b81ecb 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -5,6 +5,7 @@ from collections import Counter from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.metrics import accuracy_score from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -16,7 +17,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", static=False, device=-1, cuda=True, class_weight=None, split_ratio=0.9, random_state=None, vectors=None, - preprocessor=None, verbose=0): + preprocessor=None, scoring=accuracy_score, verbose=0): self.lr = lr self.epochs = epochs self.batch_size = batch_size @@ -36,6 +37,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.random_state = random_state self.vectors = vectors self.preprocessor = preprocessor + self.scoring = scoring self.verbose = verbose def __clean_str(self, string): @@ -57,7 +59,8 @@ def __clean_str(self, string): def __eval(self, data_iter): self.__model.eval() - corrects = 0 + preds = [] + targets = [] for batch in data_iter: feature, target = batch.text, batch.label @@ -72,10 +75,10 @@ def __eval(self, data_iter): F.cross_entropy(logit, target, reduction="sum") - predictions = torch.max(logit, 1)[1].view(target.size()) - corrects += (predictions.data == target.data).sum() + preds += torch.max(logit, 1)[1].view(target.size()).data.tolist() + targets += target.data.tolist() - return 100.0 * corrects / len(data_iter.dataset) + return self.scoring(targets, preds) def fit(self, X, y, sample_weight=None): start = time() if self.verbose > 0 else None @@ -95,6 +98,7 @@ def fit(self, X, y, sample_weight=None): optimizer = torch.optim.Adam(self.__model.parameters(), lr=self.lr, weight_decay=self.max_norm) steps, best_acc, last_step = 0, 0, 0 + active = True self.__model.train() @@ -124,18 +128,18 @@ def fit(self, X, y, sample_weight=None): if self.save_best: best_model = deepcopy(self.__model) elif steps - last_step >= self.early_stop: - if self.save_best: - self.__model = best_model + active = False + break - if self.verbose > 0: - self.__print_elapsed_time(time() - start) - return self + if not active: + break self.__model = best_model if self.save_best else self.__model if self.verbose > 0: self.__print_elapsed_time(time() - start) + torch.cuda.empty_cache() return self def predict(self, X): @@ -162,6 +166,7 @@ def predict(self, X): y_pred.append(self.__label_field.vocab.itos[predicted.data[0] + 1]) + torch.cuda.empty_cache() return y_pred def __preprocess(self, X, y, sample_weight): From 88f693a634405cea20bde6da90d6225f983ba097 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Fri, 26 Apr 2019 22:06:28 -0700 Subject: [PATCH 06/31] Added/fixed console output Also did some more code cleanup --- cnn_text_classification.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 0b81ecb..f078b98 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -81,6 +81,12 @@ def __eval(self, data_iter): return self.scoring(targets, preds) def fit(self, X, y, sample_weight=None): + if self.verbose > 1: + params = self.get_params().items() + + 
print("Fitting with the following parameters:") + print("\n".join([": ".join([k, str(v)]) for k, v in params])) + start = time() if self.verbose > 0 else None train_iter, dev_iter = self.__preprocess(X, y, sample_weight) embed_num = len(self.__text_field.vocab) @@ -237,7 +243,7 @@ def __print_elapsed_time(self, seconds): times = [t for t in [hr, mn, sc] if len(t) > 0] if len(times) == 3: - times = " and ".format(", ".format(hr, mn), sc) + times = " and ".join(", ".join(hr, mn), sc) elif len(times) == 2: times = " and ".join(times) else: @@ -268,12 +274,7 @@ def conv_and_pool(self, x, conv): return F.max_pool1d(x, x.size(2)).squeeze(2) def forward(self, x): - x = self.__embed(x) - - if self.__static: - x = Variable(x) - - x = x.unsqueeze(1) - x = [F.relu(conv(x)).squeeze(3) for conv in self.__convs1] + x = Variable(self.__embed(x)) if self.__static else self.__embed(x) + x = [F.relu(conv(x.unsqueeze(1))).squeeze(3) for conv in self.__convs1] x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] return self.__fc1(self.__dropout(torch.cat(x, 1))) From 0ae9906a6ed2920c33d7fc7c67d2843d3f28eded Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 8 May 2019 20:49:30 -0700 Subject: [PATCH 07/31] Improved random seed handling --- cnn_text_classification.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index f078b98..dbf7032 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -81,6 +81,12 @@ def __eval(self, data_iter): return self.scoring(targets, preds) def fit(self, X, y, sample_weight=None): + if self.random_state is not None: + torch.manual_seed(self.random_state) + + torch.backends.cudnn.deterministic = self.random_state is not None + torch.backends.cudnn.benchmark = self.random_state is None + if self.verbose > 1: params = self.get_params().items() From 4a6d285d6a5c72243412aaf99b23e443fafd362d Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 20 Jun 2019 07:44:16 -0400 Subject: [PATCH 08/31] Fixed "set_storage_offset is not allowed" error Also fixed a bug in training time output function. 
--- cnn_text_classification.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index dbf7032..4b587a6 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -59,14 +59,11 @@ def __clean_str(self, string): def __eval(self, data_iter): self.__model.eval() - preds = [] - targets = [] + preds, targets = [], [] for batch in data_iter: feature, target = batch.text, batch.label - - feature.data.t_() - target.data.sub_(1) + feature, target = feature.data.t(), target.data.sub(1) if self.cuda and torch.cuda.is_available(): feature, target = feature.cuda(), target.cuda() @@ -117,9 +114,7 @@ def fit(self, X, y, sample_weight=None): for epoch in range(self.epochs): for batch in train_iter: feature, target = batch.text, batch.label - - feature.data.t_() - target.data.sub_(1) + feature, target = feature.data.t(), target.data.sub(1) if self.cuda and torch.cuda.is_available(): feature, target = feature.cuda(), target.cuda() @@ -249,7 +244,7 @@ def __print_elapsed_time(self, seconds): times = [t for t in [hr, mn, sc] if len(t) > 0] if len(times) == 3: - times = " and ".join(", ".join(hr, mn), sc) + times = " and ".join([", ".join([hr, mn]), sc]) elif len(times) == 2: times = " and ".join(times) else: From b5141b2bb6b3359fad959bd7ea8879bf69e5d6b5 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 20 Jun 2019 21:21:31 -0400 Subject: [PATCH 09/31] Removed note about weights from known issues See "Buda M, Maki A, Mazurowski MA. A systematic study of the class imbalance problem in convolutional neural networks. Neural Networks. 2018 Oct 1;106:249-59" for justification. Also fixed another bug in the training time output function. --- README.md | 1 - cnn_text_classification.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index f8f4906..05a87a2 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. ## Known Issues * The predict method is probably not as efficient as it could be. * Doesn't play well with GridSearchCV if num_jobs isn't 1. -* Weights are represented by upsampling. * Only supports pre-trained word vectors from TorchText. * The random_state parameter probably only works with integers or None. * Training samples shorter than the maximum kernel size are ignored. diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 4b587a6..d84fc58 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -248,7 +248,7 @@ def __print_elapsed_time(self, seconds): elif len(times) == 2: times = " and ".join(times) else: - times = times[0] + times = times[0] if len(times) > 0 else "less than 1 second" print("Completed training in {}.".format(times)) From a7c9629d19c50c3651f087b7825db93d26014b7f Mon Sep 17 00:00:00 2001 From: rriva002 Date: Sun, 30 Jun 2019 19:44:48 -0400 Subject: [PATCH 10/31] Fixed bug with handling of non-default scoring functions --- README.md | 1 + cnn_text_classification.py | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 05a87a2..877eed7 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. ## To Do * Add support for cross-validation during training. +* Implement sample weights in eval scoring? 
## Parameters **lr : float, optional (default=0.01)** diff --git a/cnn_text_classification.py b/cnn_text_classification.py index d84fc58..9cf4b01 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -5,7 +5,7 @@ from collections import Counter from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.metrics import accuracy_score +from sklearn.metrics import accuracy_score, make_scorer from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -17,7 +17,8 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", static=False, device=-1, cuda=True, class_weight=None, split_ratio=0.9, random_state=None, vectors=None, - preprocessor=None, scoring=accuracy_score, verbose=0): + preprocessor=None, scoring=make_scorer(accuracy_score), + verbose=0): self.lr = lr self.epochs = epochs self.batch_size = batch_size @@ -75,7 +76,9 @@ def __eval(self, data_iter): preds += torch.max(logit, 1)[1].view(target.size()).data.tolist() targets += target.data.tolist() - return self.scoring(targets, preds) + preds = [self.__label_field.vocab.itos[pred + 1] for pred in preds] + targets = [self.__label_field.vocab.itos[targ + 1] for targ in targets] + return self.scoring(_Eval(preds), None, targets) def fit(self, X, y, sample_weight=None): if self.random_state is not None: @@ -95,10 +98,9 @@ def fit(self, X, y, sample_weight=None): embed_num = len(self.__text_field.vocab) class_num = len(self.__label_field.vocab) - 1 kernel_sizes = [int(k) for k in self.kernel_sizes.split(",")] - self.__model = CNNText(embed_num, self.embed_dim, class_num, - self.kernel_num, kernel_sizes, self.dropout, - self.static, - vectors=self.__text_field.vocab.vectors) + self.__model = _CNNText(embed_num, self.embed_dim, class_num, + self.kernel_num, kernel_sizes, self.dropout, + self.static, self.__text_field.vocab.vectors) if self.cuda and torch.cuda.is_available(): torch.cuda.set_device(self.device) @@ -253,10 +255,10 @@ def __print_elapsed_time(self, seconds): print("Completed training in {}.".format(times)) -class CNNText(nn.Module): +class _CNNText(nn.Module): def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout, static, vectors=None): - super(CNNText, self).__init__() + super(_CNNText, self).__init__() self.__embed = nn.Embedding(embed_num, embed_dim) @@ -279,3 +281,10 @@ def forward(self, x): x = [F.relu(conv(x.unsqueeze(1))).squeeze(3) for conv in self.__convs1] x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] return self.__fc1(self.__dropout(torch.cat(x, 1))) + +class _Eval(): + def __init__(self, preds): + self.__preds = preds + + def predict(self, X): + return self.__preds From f2d82e134af61a613aa33b36e95fa6d996cbe854 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 3 Jul 2019 18:49:43 -0400 Subject: [PATCH 11/31] Added support for alternate activation functions --- README.md | 23 +++++++++++++---------- cnn_text_classification.py | 33 +++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 877eed7..c6a8638 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. ## Known Issues * The predict method is probably not as efficient as it could be. -* Doesn't play well with GridSearchCV if num_jobs isn't 1. 
+* Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). * Only supports pre-trained word vectors from TorchText. * The random_state parameter probably only works with integers or None. * Training samples shorter than the maximum kernel size are ignored. @@ -64,23 +64,26 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. **cuda : boolean, optional (default=True)** If true, use the GPU if available. -**class_weight : dict, "balanced" or None, optional (default=None)** - Weights associated with each class (see class_weight parameter in existing scikit-learn classifiers). - -**split_ratio : float, optional (default=0.9)** - Ratio of training data used for training. The remainder will be used for validation. +** activation_func : string, optional (default='relu')** + Activation function. If 'relu' or 'tanh', uses rectified linear unit or hyperbolic tangent, respectively. Otherwise, uses no activation function (f(x) = x). -**random_state : integer, optional (default=None)** - Seed for the random number generator. +**scoring : callable or None, optional (default=sklearn.metrics.accuracy_score)** + Scoring method for testing model performance during fitting. **vectors : string, optional (default=None)** Which pretrained TorchText vectors to use (see [torchtext.vocab.pretrained_aliases](https://torchtext.readthedocs.io/en/latest/vocab.html#pretrained-aliases) for options). +**split_ratio : float, optional (default=0.9)** + Ratio of training data used for training. The remainder will be used for validation. + **preprocessor : callable or None, optional (default=None)** Override default string preprocessing. -**scoring : callable or None, optional (default=sklearn.metrics.accuracy_score)** - Scoring method for testing model performance during fitting. +**class_weight : dict, "balanced" or None, optional (default=None)** + Weights associated with each class (see class_weight parameter in existing scikit-learn classifiers). + +**random_state : integer, optional (default=None)** + Seed for the random number generator. **verbose : integer, optional (default=0)** Controls the verbosity when fitting. 
diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 9cf4b01..1cc372e 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -15,10 +15,10 @@ class CNNClassifier(BaseEstimator, ClassifierMixin): def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, early_stop=1000, save_best=True, dropout=0.5, max_norm=0.0, embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", - static=False, device=-1, cuda=True, class_weight=None, - split_ratio=0.9, random_state=None, vectors=None, - preprocessor=None, scoring=make_scorer(accuracy_score), - verbose=0): + static=False, device=-1, cuda=True, activation_func="relu", + scoring=make_scorer(accuracy_score), vectors=None, + split_ratio=0.9, preprocessor=None, class_weight=None, + random_state=None, verbose=0): self.lr = lr self.epochs = epochs self.batch_size = batch_size @@ -33,12 +33,13 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.static = static self.device = device self.cuda = cuda - self.class_weight = class_weight - self.split_ratio = split_ratio - self.random_state = random_state + self.activation_func = activation_func + self.scoring = scoring self.vectors = vectors + self.split_ratio = split_ratio self.preprocessor = preprocessor - self.scoring = scoring + self.class_weight = class_weight + self.random_state = random_state self.verbose = verbose def __clean_str(self, string): @@ -100,7 +101,8 @@ def fit(self, X, y, sample_weight=None): kernel_sizes = [int(k) for k in self.kernel_sizes.split(",")] self.__model = _CNNText(embed_num, self.embed_dim, class_num, self.kernel_num, kernel_sizes, self.dropout, - self.static, self.__text_field.vocab.vectors) + self.static, self.activation_func, + vectors=self.__text_field.vocab.vectors) if self.cuda and torch.cuda.is_available(): torch.cuda.set_device(self.device) @@ -257,7 +259,7 @@ def __print_elapsed_time(self, seconds): class _CNNText(nn.Module): def __init__(self, embed_num, embed_dim, class_num, kernel_num, - kernel_sizes, dropout, static, vectors=None): + kernel_sizes, dropout, static, activation_func, vectors=None): super(_CNNText, self).__init__() self.__embed = nn.Embedding(embed_num, embed_dim) @@ -272,13 +274,16 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, self.__fc1 = nn.Linear(len(Ks) * kernel_num, class_num) self.__static = static - def conv_and_pool(self, x, conv): - x = F.relu(conv(x)).squeeze(3) - return F.max_pool1d(x, x.size(2)).squeeze(2) + if activation_func == "relu": + self.__f = F.relu + elif activation_func == "tanh": + self.__f = torch.tanh + else: + self.__f = lambda x: x def forward(self, x): x = Variable(self.__embed(x)) if self.__static else self.__embed(x) - x = [F.relu(conv(x.unsqueeze(1))).squeeze(3) for conv in self.__convs1] + x = [self.__f(cnv(x.unsqueeze(1))).squeeze(3) for cnv in self.__convs1] x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] return self.__fc1(self.__dropout(torch.cat(x, 1))) From aa67e712600ec366347eecfca8fe9f3b17251693 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Mon, 8 Jul 2019 18:41:42 -0400 Subject: [PATCH 12/31] Fixed bug related to saving best model Also did some code cleanup. 
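The save-best fix binds best_model to the freshly built model before the training loop so it always exists, and the cleanup replaces the construct-then-replace embedding pattern with the nn.Embedding.from_pretrained classmethod. A minimal sketch of that classmethod's behavior (the weight matrix here is a toy tensor; shapes are illustrative only):

```python
import torch
import torch.nn as nn

# nn.Embedding.from_pretrained is a classmethod: it builds the layer directly
# from a weight matrix, so instantiating nn.Embedding first was redundant.
vectors = torch.randn(100, 50)      # pretend vocab of 100 words, 50-dim vectors
embed = nn.Embedding.from_pretrained(vectors)

print(embed.embedding_dim)          # 50, inferred from the matrix
print(embed.weight.requires_grad)   # False: from_pretrained freezes by default
```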
--- cnn_text_classification.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 1cc372e..2206e17 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -64,8 +64,7 @@ def __eval(self, data_iter): preds, targets = [], [] for batch in data_iter: - feature, target = batch.text, batch.label - feature, target = feature.data.t(), target.data.sub(1) + feature, target = batch.text.data.t(), batch.label.data.sub(1) if self.cuda and torch.cuda.is_available(): feature, target = feature.cuda(), target.cuda() @@ -110,6 +109,7 @@ def fit(self, X, y, sample_weight=None): optimizer = torch.optim.Adam(self.__model.parameters(), lr=self.lr, weight_decay=self.max_norm) + best_model = self.__model steps, best_acc, last_step = 0, 0, 0 active = True @@ -117,8 +117,7 @@ def fit(self, X, y, sample_weight=None): for epoch in range(self.epochs): for batch in train_iter: - feature, target = batch.text, batch.label - feature, target = feature.data.t(), target.data.sub(1) + feature, target = batch.text.data.t(), batch.label.data.sub(1) if self.cuda and torch.cuda.is_available(): feature, target = feature.cuda(), target.cuda() @@ -262,10 +261,10 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout, static, activation_func, vectors=None): super(_CNNText, self).__init__() - self.__embed = nn.Embedding(embed_num, embed_dim) - - if vectors is not None: - self.__embed = self.__embed.from_pretrained(vectors) + if vectors is None: + self.__embed = nn.Embedding(embed_num, embed_dim) + else: + self.__embed = nn.Embedding.from_pretrained(vectors) Ks = kernel_sizes module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] From 685fadaefd1c1b3934e2ab12ca54230ad8f45b52 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 10 Jul 2019 17:05:27 -0400 Subject: [PATCH 13/31] Updated embed_dim to be overridden by pretrained vectors --- README.md | 2 +- cnn_text_classification.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c6a8638..e1e724d 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. L2 constraint. **embed_dim : integer, optional (default=128)** - The number of embedding dimensions. + The number of embedding dimensions. Ignored if vectors is not None. **kernel_num : integer, optional (default=100)** The number of each size of kernel. diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 2206e17..ce62195 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -265,6 +265,7 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, self.__embed = nn.Embedding(embed_num, embed_dim) else: self.__embed = nn.Embedding.from_pretrained(vectors) + embed_dim = self.__embed.embedding_dim Ks = kernel_sizes module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] From 380653e086e1eee3342e3c9d54854e10e19a711d Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 17 Jul 2019 19:32:01 -0400 Subject: [PATCH 14/31] Text shorter than maximum kernel size is now padded Also changed the kernel_sizes parameter from a string to an iterable. 
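The new padding rule can be pictured with the standalone sketch below (pad_tokens is a hypothetical helper; the class's __pad method additionally accepts raw strings and takes its pad token, "&lt;pad&gt;" by default, from the torchtext Field, and kernel_sizes is sorted in __init__ so its last element is the maximum):

```python
def pad_tokens(tokens, kernel_sizes=(3, 4, 5), pad_token="<pad>"):
    # Pad the token list so the widest convolution kernel always fits.
    difference = max(kernel_sizes) - len(tokens)
    return tokens + [pad_token] * difference if difference > 0 else tokens

print(pad_tokens(["so", "bad"]))
# ['so', 'bad', '<pad>', '<pad>', '<pad>']
```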
--- README.md | 8 +++----- cnn_text_classification.py | 39 ++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index e1e724d..4b9bd69 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,8 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. ## Known Issues * The predict method is probably not as efficient as it could be. * Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). -* Only supports pre-trained word vectors from TorchText. +* Only supports pre-trained word vectors from TorchText (or no pre-trained vectors). * The random_state parameter probably only works with integers or None. -* Training samples shorter than the maximum kernel size are ignored. -* Test samples shorter than the maximum kernel size are classified as the most common class found during training. * Features my idiosyncratic coding style. ## To Do @@ -52,8 +50,8 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. **kernel_num : integer, optional (default=100)** The number of each size of kernel. -**kernel_sizes : string, optional (default='3,4,5')** - Comma-separated kernel sizes to use for convolution. +**kernel_sizes : iterable of integers, optional (default=(3, 4, 5))** + Kernel sizes to use for convolution. **static : boolean, optional (default=False)** If true, fix the embedding. diff --git a/cnn_text_classification.py b/cnn_text_classification.py index ce62195..4e9818c 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -14,7 +14,7 @@ class CNNClassifier(BaseEstimator, ClassifierMixin): def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, early_stop=1000, save_best=True, dropout=0.5, max_norm=0.0, - embed_dim=128, kernel_num=100, kernel_sizes="3,4,5", + embed_dim=128, kernel_num=100, kernel_sizes=(3, 4, 5), static=False, device=-1, cuda=True, activation_func="relu", scoring=make_scorer(accuracy_score), vectors=None, split_ratio=0.9, preprocessor=None, class_weight=None, @@ -29,7 +29,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.max_norm = max_norm self.embed_dim = embed_dim self.kernel_num = kernel_num - self.kernel_sizes = kernel_sizes + self.kernel_sizes = sorted(kernel_sizes) self.static = static self.device = device self.cuda = cuda @@ -97,10 +97,10 @@ def fit(self, X, y, sample_weight=None): train_iter, dev_iter = self.__preprocess(X, y, sample_weight) embed_num = len(self.__text_field.vocab) class_num = len(self.__label_field.vocab) - 1 - kernel_sizes = [int(k) for k in self.kernel_sizes.split(",")] self.__model = _CNNText(embed_num, self.embed_dim, class_num, - self.kernel_num, kernel_sizes, self.dropout, - self.static, self.activation_func, + self.kernel_num, self.kernel_sizes, + self.dropout, self.static, + self.activation_func, vectors=self.__text_field.vocab.vectors) if self.cuda and torch.cuda.is_available(): @@ -154,18 +154,11 @@ def fit(self, X, y, sample_weight=None): def predict(self, X): y_pred = [] - max_krnl_sz = int(self.kernel_sizes[self.kernel_sizes.rfind(",") + 1:]) for text in X: assert isinstance(text, str) - text = self.__text_field.preprocess(text) - - if len(text) < max_krnl_sz: - most_common = self.__label_field.vocab.freqs.most_common(1)[0] - - y_pred.append(most_common[0]) - continue + text = self.__pad(self.__text_field.preprocess(text), True) self.__model.eval() @@ -179,21 +172,25 @@ def predict(self, X): torch.cuda.empty_cache() 
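            # Note: torch.cuda.empty_cache() above only hands unused blocks in
            # PyTorch's caching allocator back to the GPU driver; tensors that
            # are still referenced, such as the prediction just appended, are
            # unaffected.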
        return y_pred

+    def __pad(self, x, preprocessed=False):
+        tokens = x if preprocessed else self.__text_field.preprocess(x)
+        difference = self.kernel_sizes[-1] - len(tokens)
+
+        if difference > 0:
+            padding = [self.__text_field.pad_token] * difference
+            return x + padding if preprocessed else " ".join([x] + padding)
+
+        return x
+
     def __preprocess(self, X, y, sample_weight):
         self.__text_field = Field(lower=True)
         self.__label_field = Field(sequential=False)
         self.__text_field.preprocessing = Pipeline(self.__preprocess_text)
-        max_krnl_sz = int(self.kernel_sizes[self.kernel_sizes.rfind(",") + 1:])
         X, y = list(X), list(y)
         sample_weight = None if sample_weight is None else list(sample_weight)

-        for i in range(len(X) - 1, -1, -1):
-            if len(self.__text_field.preprocess(X[i])) < max_krnl_sz:
-                del X[i]
-                del y[i]
-
-                if sample_weight is not None:
-                    del sample_weight[i]
+        for i in range(len(X)):
+            X[i] = self.__pad(X[i])

         fields = [("text", self.__text_field), ("label", self.__label_field)]
         exmpl = [Example.fromlist([X[i], y[i]], fields) for i in range(len(X))]

From 739de662336b7311d0841ca8a0e5c9573f36bdb7 Mon Sep 17 00:00:00 2001
From: rriva002
Date: Wed, 17 Jul 2019 20:28:30 -0400
Subject: [PATCH 15/31] Added classes_ attribute

Also removed an unnecessary line of code.
---
 cnn_text_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cnn_text_classification.py b/cnn_text_classification.py
index 4e9818c..dd6b793 100644
--- a/cnn_text_classification.py
+++ b/cnn_text_classification.py
@@ -145,6 +145,7 @@ def fit(self, X, y, sample_weight=None):
                 break

         self.__model = best_model if self.save_best else self.__model
+        self.classes_ = self.__label_field.vocab.itos[1:]

         if self.verbose > 0:
             self.__print_elapsed_time(time() - start)
@@ -186,7 +187,6 @@ def __preprocess(self, X, y, sample_weight):
         self.__text_field = Field(lower=True)
         self.__label_field = Field(sequential=False)
         self.__text_field.preprocessing = Pipeline(self.__preprocess_text)
-        X, y = list(X), list(y)
         sample_weight = None if sample_weight is None else list(sample_weight)

         for i in range(len(X)):

From 5c177f3d9a29fc7737bd4734315820d1c11c7e87 Mon Sep 17 00:00:00 2001
From: rriva002
Date: Wed, 17 Jul 2019 21:36:43 -0400
Subject: [PATCH 16/31] Added predict_proba method

---
 README.md                  | 13 ++++++++++++-
 cnn_text_classification.py | 23 +++++++++++++--------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 4b9bd69..af88328 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,8 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github.
 * scikit-learn

 ## Known Issues
-* The predict method is probably not as efficient as it could be.
+* The predict and predict_proba methods are probably not as efficient as they could be.
+* The class probabilities returned by the predict_proba method are probably questionable.
 * Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA).
 * Only supports pre-trained word vectors from TorchText (or no pre-trained vectors).
 * The random_state parameter probably only works with integers or None.
@@ -111,3 +112,13 @@ Parameters: X: list of strings
 Returns: y: list of strings
         The predicted classes.
 ```
+
+**predict_proba(X)**
+Predict class probabilities for X.
+```
+Parameters: X: list of strings
+        The input samples.
+
+Returns: y: list of lists of floats
+        The predicted class probabilities.
+``` diff --git a/cnn_text_classification.py b/cnn_text_classification.py index dd6b793..8a79fd8 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -153,25 +153,32 @@ def fit(self, X, y, sample_weight=None): torch.cuda.empty_cache() return self - def predict(self, X): - y_pred = [] + def __predict(self, X): + y_output = [] + + self.__model.eval() for text in X: assert isinstance(text, str) text = self.__pad(self.__text_field.preprocess(text), True) - - self.__model.eval() - text = [[self.__text_field.vocab.stoi[x] for x in text]] x = Variable(torch.tensor(text)) x = x.cuda() if self.cuda and torch.cuda.is_available() else x - _, predicted = torch.max(self.__model(x), 1) - y_pred.append(self.__label_field.vocab.itos[predicted.data[0] + 1]) + y_output.append(self.__model(x)) torch.cuda.empty_cache() - return y_pred + return y_output + + def predict(self, X): + y_pred = [torch.argmax(yi, 1) for yi in self.__predict(X)] + return [self.__label_field.vocab.itos[yi.data[0] + 1] for yi in y_pred] + + def predict_proba(self, X): + softmax = nn.Softmax(dim=1) + y_prob = [softmax(yi) for yi in self.__predict(X)] + return [[float(yij) for yij in yi[0]] for yi in y_prob] def __pad(self, x, preprocessed=False): tokens = x if preprocessed else self.__text_field.preprocess(x) From a136549b5f6801aafe6afbc365bed50f71afa9cd Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 25 Jul 2019 06:53:23 -0400 Subject: [PATCH 17/31] Fixed cloning bug caused by modification of a parameter in the constructor --- cnn_text_classification.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 8a79fd8..9b3d372 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -29,7 +29,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.max_norm = max_norm self.embed_dim = embed_dim self.kernel_num = kernel_num - self.kernel_sizes = sorted(kernel_sizes) + self.kernel_sizes = kernel_sizes self.static = static self.device = device self.cuda = cuda @@ -155,13 +155,15 @@ def fit(self, X, y, sample_weight=None): def __predict(self, X): y_output = [] + max_kernel_size = max(self.kernel_sizes) self.__model.eval() for text in X: assert isinstance(text, str) - text = self.__pad(self.__text_field.preprocess(text), True) + text = self.__text_field.preprocess(text) + text = self.__pad(text, max_kernel_size, True) text = [[self.__text_field.vocab.stoi[x] for x in text]] x = Variable(torch.tensor(text)) x = x.cuda() if self.cuda and torch.cuda.is_available() else x @@ -180,9 +182,9 @@ def predict_proba(self, X): y_prob = [softmax(yi) for yi in self.__predict(X)] return [[float(yij) for yij in yi[0]] for yi in y_prob] - def __pad(self, x, preprocessed=False): + def __pad(self, x, max_kernel_size, preprocessed=False): tokens = x if preprocessed else self.__text_field.preprocess(x) - difference = self.kernel_sizes[-1] - len(tokens) + difference = max_kernel_size - len(tokens) if difference > 0: padding = [self.__text_field.pad_token] * difference @@ -194,10 +196,11 @@ def __preprocess(self, X, y, sample_weight): self.__text_field = Field(lower=True) self.__label_field = Field(sequential=False) self.__text_field.preprocessing = Pipeline(self.__preprocess_text) + max_kernel_size = max(self.kernel_sizes) sample_weight = None if sample_weight is None else list(sample_weight) for i in range(len(X)): - X[i] = self.__pad(X[i]) + X[i] = self.__pad(X[i], max_kernel_size) fields = [("text", 
self.__text_field), ("label", self.__label_field)] exmpl = [Example.fromlist([X[i], y[i]], fields) for i in range(len(X))] From 81450f1fa1734c93ae70d647c5ec49a2982a3647 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Sun, 4 Aug 2019 20:14:28 -0400 Subject: [PATCH 18/31] Added support for ROC AUC scoring --- README.md | 8 ++++++-- cnn_text_classification.py | 24 ++++++++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index af88328..99237d4 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. * scikit-learn ## Known Issues +* Oversampling is applied to the whole training dataset, so many training samples likely end up in both training and dev sets. * The predict and predict_proba methods are probably not as efficient as they could be. * The class probabilities returned by the predict_proba method are probably questionable. * Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). @@ -63,12 +64,15 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. **cuda : boolean, optional (default=True)** If true, use the GPU if available. -** activation_func : string, optional (default='relu')** +**activation_func : string, optional (default='relu')** Activation function. If 'relu' or 'tanh', uses rectified linear unit or hyperbolic tangent, respectively. Otherwise, uses no activation function (f(x) = x). -**scoring : callable or None, optional (default=sklearn.metrics.accuracy_score)** +**scoring : callable or "roc_auc", optional (default=sklearn.metrics.make_scorer(sklearn.metrics.accuracy_score))** Scoring method for testing model performance during fitting. +**pos_label : string, optional (default=None)** + Positive class label for roc_auc scoring. Ignored if using a different scoring method. + **vectors : string, optional (default=None)** Which pretrained TorchText vectors to use (see [torchtext.vocab.pretrained_aliases](https://torchtext.readthedocs.io/en/latest/vocab.html#pretrained-aliases) for options). 
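With scoring="roc_auc", the dev-set score wired up below reduces to scikit-learn's `roc_auc_score` applied to the softmax probability of the class named by `pos_label`. A toy illustration, with all numbers invented:

```
from sklearn.metrics import roc_auc_score

# 1 marks the positive class; pos_probs is the softmax column that
# pos_label would select.
y_true = [1, 0, 1, 0]
pos_probs = [0.9, 0.4, 0.65, 0.2]

# Every positive sample outranks every negative one, so the AUC is perfect.
print(roc_auc_score(y_true, pos_probs))  # 1.0
```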
diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 9b3d372..5d11c5b 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -5,7 +5,7 @@ from collections import Counter from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.metrics import accuracy_score, make_scorer +from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -16,9 +16,9 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, early_stop=1000, save_best=True, dropout=0.5, max_norm=0.0, embed_dim=128, kernel_num=100, kernel_sizes=(3, 4, 5), static=False, device=-1, cuda=True, activation_func="relu", - scoring=make_scorer(accuracy_score), vectors=None, - split_ratio=0.9, preprocessor=None, class_weight=None, - random_state=None, verbose=0): + scoring=make_scorer(accuracy_score), pos_label=None, + vectors=None, split_ratio=0.9, preprocessor=None, + class_weight=None, random_state=None, verbose=0): self.lr = lr self.epochs = epochs self.batch_size = batch_size @@ -35,6 +35,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.cuda = cuda self.activation_func = activation_func self.scoring = scoring + self.pos_label = pos_label self.vectors = vectors self.split_ratio = split_ratio self.preprocessor = preprocessor @@ -62,6 +63,7 @@ def __eval(self, data_iter): self.__model.eval() preds, targets = [], [] + softmax = nn.Softmax(dim=1) if self.scoring == "roc_auc" else None for batch in data_iter: feature, target = batch.text.data.t(), batch.label.data.sub(1) @@ -73,11 +75,21 @@ def __eval(self, data_iter): F.cross_entropy(logit, target, reduction="sum") - preds += torch.max(logit, 1)[1].view(target.size()).data.tolist() + if self.scoring == "roc_auc": + pred = [[float(p) for p in dist] for dist in softmax(logit)] + else: + pred = torch.max(logit, 1)[1].view(target.size()).data.tolist() + + preds += pred targets += target.data.tolist() - preds = [self.__label_field.vocab.itos[pred + 1] for pred in preds] targets = [self.__label_field.vocab.itos[targ + 1] for targ in targets] + + if self.scoring == "roc_auc": + pos_index = self.__label_field.vocab.stoi[self.pos_label] - 1 + return roc_auc_score(targets, [pred[pos_index] for pred in preds]) + + preds = [self.__label_field.vocab.itos[pred + 1] for pred in preds] return self.scoring(_Eval(preds), None, targets) def fit(self, X, y, sample_weight=None): From e156753f3c6ea2fb753b600d3fd072d38e5d3ceb Mon Sep 17 00:00:00 2001 From: rriva002 Date: Wed, 7 Aug 2019 17:01:04 -0400 Subject: [PATCH 19/31] Updated oversampling to not apply to dev data Also changed the default value of split_ratio to 0.8. --- README.md | 1 - cnn_text_classification.py | 36 ++++++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 99237d4..51755ef 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,6 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. * scikit-learn ## Known Issues -* Oversampling is applied to the whole training dataset, so many training samples likely end up in both training and dev sets. * The predict and predict_proba methods are probably not as efficient as they could be. * The class probabilities returned by the predict_proba method are probably questionable. 
* Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 5d11c5b..e8fe052 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -6,6 +6,7 @@ from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score +from sklearn.model_selection import train_test_split as split from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -17,7 +18,7 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, embed_dim=128, kernel_num=100, kernel_sizes=(3, 4, 5), static=False, device=-1, cuda=True, activation_func="relu", scoring=make_scorer(accuracy_score), pos_label=None, - vectors=None, split_ratio=0.9, preprocessor=None, + vectors=None, split_ratio=0.8, preprocessor=None, class_weight=None, random_state=None, verbose=0): self.lr = lr self.epochs = epochs @@ -79,7 +80,7 @@ def __eval(self, data_iter): pred = [[float(p) for p in dist] for dist in softmax(logit)] else: pred = torch.max(logit, 1)[1].view(target.size()).data.tolist() - + preds += pred targets += target.data.tolist() @@ -214,31 +215,37 @@ def __preprocess(self, X, y, sample_weight): for i in range(len(X)): X[i] = self.__pad(X[i], max_kernel_size) + X_t, X_d, y_t, y_d = split(X, y, random_state=self.random_state, + shuffle=True, stratify=y, + train_size=self.split_ratio) fields = [("text", self.__text_field), ("label", self.__label_field)] - exmpl = [Example.fromlist([X[i], y[i]], fields) for i in range(len(X))] - weights = [1 for yi in y] if sample_weight is None else sample_weight + examples = [[X_t[i], y_t[i]] for i in range(len(X_t))] + examples = [Example.fromlist(example, fields) for example in examples] + weights = [1 for yi in y_t] if sample_weight is None else sample_weight if self.class_weight is not None: cw = self.class_weight if isinstance(cw, str) and cw == "balanced": - counter = Counter(y) - cw = [len(y) / (len(counter) * counter[yi]) for yi in y] - weights = [weights[i] * cw[i] for i in range(len(y))] + counter = Counter(y_t) + cw = [len(y_t) / (len(counter) * counter[yi]) for yi in y_t] + weights = [weights[i] * cw[i] for i in range(len(y_t))] elif isinstance(cw, dict): - cw = [cw[yi] for yi in y] - weights = [weights[i] * cw[i] for i in range(len(y))] + cw = [cw[yi] for yi in y_t] + weights = [weights[i] * cw[i] for i in range(len(y_t))] min_weight = min(weights) weights = [round(w / min_weight) for w in weights] - for i in range(len(X)): + for i in range(len(X_t)): if weights[i] > 1: - Xi = [X[i] for j in range(weights[i] - 1)] - exmpl += [Example.fromlist([x, y[i]], fields) for x in Xi] + Xi = [X_t[i] for j in range(weights[i] - 1)] + examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] - train_data, dev_data = Dataset(exmpl, fields).split(self.split_ratio, - self.random_state,) + train_data = Dataset(examples, fields) + dev_data = [[X_d[i], y_d[i]] for i in range(len(X_d))] + dev_data = [Example.fromlist(example, fields) for example in dev_data] + dev_data = Dataset(dev_data, fields) self.__text_field.build_vocab(train_data, dev_data, vectors=self.vectors) @@ -306,6 +313,7 @@ def forward(self, x): x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] return self.__fc1(self.__dropout(torch.cat(x, 1))) + class _Eval(): def __init__(self, preds): self.__preds = preds From 86481582d4a5753082b179740383ef684c1f1e76 
Mon Sep 17 00:00:00 2001 From: rriva002 Date: Fri, 9 Aug 2019 07:11:36 -0400 Subject: [PATCH 20/31] Simplified weight calculation code --- cnn_text_classification.py | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index e8fe052..48fed1c 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -2,11 +2,11 @@ import torch import torch.nn as nn import torch.nn.functional as F -from collections import Counter from copy import deepcopy from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score from sklearn.model_selection import train_test_split as split +from sklearn.utils.class_weight import compute_sample_weight from time import time from torch.autograd import Variable from torchtext.data import Dataset, Example, Field, Iterator, Pipeline @@ -215,32 +215,20 @@ def __preprocess(self, X, y, sample_weight): for i in range(len(X)): X[i] = self.__pad(X[i], max_kernel_size) - X_t, X_d, y_t, y_d = split(X, y, random_state=self.random_state, - shuffle=True, stratify=y, - train_size=self.split_ratio) + sw = [1 for yi in y] if sample_weight is None else sample_weight + X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=y, + random_state=self.random_state, + train_size=self.split_ratio) fields = [("text", self.__text_field), ("label", self.__label_field)] examples = [[X_t[i], y_t[i]] for i in range(len(X_t))] examples = [Example.fromlist(example, fields) for example in examples] - weights = [1 for yi in y_t] if sample_weight is None else sample_weight - - if self.class_weight is not None: - cw = self.class_weight - - if isinstance(cw, str) and cw == "balanced": - counter = Counter(y_t) - cw = [len(y_t) / (len(counter) * counter[yi]) for yi in y_t] - weights = [weights[i] * cw[i] for i in range(len(y_t))] - elif isinstance(cw, dict): - cw = [cw[yi] for yi in y_t] - weights = [weights[i] * cw[i] for i in range(len(y_t))] - + weights = compute_sample_weight(self.class_weight, y_t) + weights = [weights[i] * w_t[i] for i in range(len(y_t))] min_weight = min(weights) - weights = [round(w / min_weight) for w in weights] for i in range(len(X_t)): - if weights[i] > 1: - Xi = [X_t[i] for j in range(weights[i] - 1)] - examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] + Xi = [X_t[i] for j in range(round(weights[i] / min_weight) - 1)] + examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] train_data = Dataset(examples, fields) dev_data = [[X_d[i], y_d[i]] for i in range(len(X_d))] From a699e22ba8e8ab3198239bf559d00fe4bea3ee5e Mon Sep 17 00:00:00 2001 From: rriva002 Date: Fri, 9 Aug 2019 17:33:20 -0400 Subject: [PATCH 21/31] Fixed data type bug in weight calculation --- cnn_text_classification.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 48fed1c..256baf7 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -225,9 +225,10 @@ def __preprocess(self, X, y, sample_weight): weights = compute_sample_weight(self.class_weight, y_t) weights = [weights[i] * w_t[i] for i in range(len(y_t))] min_weight = min(weights) + weights = [int(round(weight / min_weight)) for weight in weights] for i in range(len(X_t)): - Xi = [X_t[i] for j in range(round(weights[i] / min_weight) - 1)] + Xi = [X_t[i] for j in range(weights[i] - 1)] examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] 
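        # Worked example of the duplication above: with class_weight="balanced"
        # and training labels [a, a, a, b], compute_sample_weight returns
        # [2/3, 2/3, 2/3, 2]; dividing by the minimum and rounding gives
        # [1, 1, 1, 3], so the lone b sample is appended twice more while each
        # a sample is kept once.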
train_data = Dataset(examples, fields) From 75cee562b41f9d7c57b9b6fe60142fa198364aff Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 19 Mar 2020 13:00:38 -0700 Subject: [PATCH 22/31] Fixed crash in splitting training/validation sets Also slightly optimized the CNN model's forward function, and best model is now saved to disk. --- cnn_text_classification.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 256baf7..513733d 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -2,7 +2,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -from copy import deepcopy +from collections import Counter +from os import remove from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score from sklearn.model_selection import train_test_split as split @@ -122,9 +123,9 @@ def fit(self, X, y, sample_weight=None): optimizer = torch.optim.Adam(self.__model.parameters(), lr=self.lr, weight_decay=self.max_norm) - best_model = self.__model steps, best_acc, last_step = 0, 0, 0 active = True + filename = "./{}.model".format(time()) self.__model.train() @@ -149,7 +150,7 @@ def fit(self, X, y, sample_weight=None): last_step = steps if self.save_best: - best_model = deepcopy(self.__model) + torch.save(self.__model.state_dict(), filename) elif steps - last_step >= self.early_stop: active = False break @@ -157,7 +158,10 @@ def fit(self, X, y, sample_weight=None): if not active: break - self.__model = best_model if self.save_best else self.__model + if self.save_best: + self.__model.load_state_dict(torch.load(filename)) + remove(filename) + self.classes_ = self.__label_field.vocab.itos[1:] if self.verbose > 0: @@ -216,7 +220,8 @@ def __preprocess(self, X, y, sample_weight): X[i] = self.__pad(X[i], max_kernel_size) sw = [1 for yi in y] if sample_weight is None else sample_weight - X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=y, + s = y if Counter(y).most_common()[-1][1] > 1 else None + X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=s, random_state=self.random_state, train_size=self.split_ratio) fields = [("text", self.__text_field), ("label", self.__label_field)] @@ -298,11 +303,11 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, def forward(self, x): x = Variable(self.__embed(x)) if self.__static else self.__embed(x) - x = [self.__f(cnv(x.unsqueeze(1))).squeeze(3) for cnv in self.__convs1] + x = x.unsqueeze(1) + x = [self.__f(conv(x), inplace=True).squeeze(3) for conv in self.__convs1] x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] return self.__fc1(self.__dropout(torch.cat(x, 1))) - class _Eval(): def __init__(self, preds): self.__preds = preds From d1f176c17bd7319c09c5780772ec06113cb36611 Mon Sep 17 00:00:00 2001 From: wuxiaohui Date: Sun, 26 Jul 2020 18:15:24 +0800 Subject: [PATCH 23/31] compatible with new version of pytorch --- model.py | 26 +++++++------------------- mydatasets.py | 2 +- train.py | 18 +++++++----------- 3 files changed, 15 insertions(+), 31 deletions(-) diff --git a/model.py b/model.py index ce0158b..1541344 100644 --- a/model.py +++ b/model.py @@ -18,15 +18,12 @@ def __init__(self, args): Ks = args.kernel_sizes self.embed = nn.Embedding(V, D) - # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks] - self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks]) - ''' - self.conv13 = nn.Conv2d(Ci, Co, (3, D)) - 
self.conv14 = nn.Conv2d(Ci, Co, (4, D)) - self.conv15 = nn.Conv2d(Ci, Co, (5, D)) - ''' + self.convs = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks]) self.dropout = nn.Dropout(args.dropout) - self.fc1 = nn.Linear(len(Ks)*Co, C) + self.fc1 = nn.Linear(len(Ks) * Co, C) + + if self.args.static: + self.embed.weight.requires_grad = False def conv_and_pool(self, x, conv): x = F.relu(conv(x)).squeeze(3) # (N, Co, W) @@ -35,24 +32,15 @@ def conv_and_pool(self, x, conv): def forward(self, x): x = self.embed(x) # (N, W, D) - - if self.args.static: - x = Variable(x) - + x = x.unsqueeze(1) # (N, Ci, W, D) - x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] # [(N, Co, W), ...]*len(Ks) + x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] # [(N, Co, W), ...]*len(Ks) x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] # [(N, Co), ...]*len(Ks) x = torch.cat(x, 1) - ''' - x1 = self.conv_and_pool(x,self.conv13) #(N,Co) - x2 = self.conv_and_pool(x,self.conv14) #(N,Co) - x3 = self.conv_and_pool(x,self.conv15) #(N,Co) - x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co) - ''' x = self.dropout(x) # (N, len(Ks)*Co) logit = self.fc1(x) # (N, C) return logit diff --git a/mydatasets.py b/mydatasets.py index 8fddfce..8cb9475 100644 --- a/mydatasets.py +++ b/mydatasets.py @@ -33,7 +33,7 @@ def download_or_unzip(cls, root): class MR(TarDataset): url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz' - filename = 'rt-polaritydata.tar' + filename = 'rt-polaritydata.tar.gz' dirname = 'rt-polaritydata' @staticmethod diff --git a/train.py b/train.py index 7f90aaa..d9000af 100644 --- a/train.py +++ b/train.py @@ -18,15 +18,12 @@ def train(train_iter, dev_iter, model, args): for epoch in range(1, args.epochs+1): for batch in train_iter: feature, target = batch.text, batch.label - feature.data.t_(), target.data.sub_(1) # batch first, index align + feature.t_(), target.sub_(1) # batch first, index align if args.cuda: feature, target = feature.cuda(), target.cuda() optimizer.zero_grad() logit = model(feature) - - #print('logit vector', logit.size()) - #print('target vector', target.size()) loss = F.cross_entropy(logit, target) loss.backward() optimizer.step() @@ -37,9 +34,9 @@ def train(train_iter, dev_iter, model, args): accuracy = 100.0 * corrects/batch.batch_size sys.stdout.write( '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(steps, - loss.data[0], - accuracy, - corrects, + loss.item(), + accuracy.item(), + corrects.item(), batch.batch_size)) if steps % args.test_interval == 0: dev_acc = eval(dev_iter, model, args) @@ -60,14 +57,14 @@ def eval(data_iter, model, args): corrects, avg_loss = 0, 0 for batch in data_iter: feature, target = batch.text, batch.label - feature.data.t_(), target.data.sub_(1) # batch first, index align + feature.t_(), target.sub_(1) # batch first, index align if args.cuda: feature, target = feature.cuda(), target.cuda() logit = model(feature) loss = F.cross_entropy(logit, target, size_average=False) - avg_loss += loss.data[0] + avg_loss += loss.item() corrects += (torch.max(logit, 1) [1].view(target.size()).data == target.data).sum() @@ -94,8 +91,7 @@ def predict(text, model, text_field, label_feild, cuda_flag): print(x) output = model(x) _, predicted = torch.max(output, 1) - #return label_feild.vocab.itos[predicted.data[0][0]+1] - return label_feild.vocab.itos[predicted.data[0]+1] + return label_feild.vocab.itos[predicted.item()+1] def save(model, save_dir, save_prefix, steps): From 28ad33e02b1321a4f5f975a4e9e49034ddbd5912 Mon Sep 17 
00:00:00 2001
From: wuxiaohui
Date: Tue, 28 Jul 2020 11:16:45 +0800
Subject: [PATCH 24/31] delete unused conv_and_pool method

---
 model.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/model.py b/model.py
index 1541344..db6586f 100644
--- a/model.py
+++ b/model.py
@@ -25,11 +25,6 @@ def __init__(self, args):
         if self.args.static:
             self.embed.weight.requires_grad = False

-    def conv_and_pool(self, x, conv):
-        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
-        x = F.max_pool1d(x, x.size(2)).squeeze(2)
-        return x
-
     def forward(self, x):
         x = self.embed(x)  # (N, W, D)

From 5f1aba1c048523adb774741156611b56bd292f6e Mon Sep 17 00:00:00 2001
From: rriva002
Date: Thu, 27 Aug 2020 13:14:11 -0700
Subject: [PATCH 25/31] Compatibility updates

See https://github.com/Shawn1993/cnn-text-classification-pytorch/commit/d1f176c17bd7319c09c5780772ec06113cb36611
---
 cnn_text_classification.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/cnn_text_classification.py b/cnn_text_classification.py
index 513733d..9b228fb 100644
--- a/cnn_text_classification.py
+++ b/cnn_text_classification.py
@@ -9,7 +9,6 @@
 from sklearn.model_selection import train_test_split as split
 from sklearn.utils.class_weight import compute_sample_weight
 from time import time
-from torch.autograd import Variable
 from torchtext.data import Dataset, Example, Field, Iterator, Pipeline

@@ -68,7 +67,7 @@ def __eval(self, data_iter):
         softmax = nn.Softmax(dim=1) if self.scoring == "roc_auc" else None

         for batch in data_iter:
-            feature, target = batch.text.data.t(), batch.label.data.sub(1)
+            feature, target = batch.text.t_(), batch.label.sub_(1)

             if self.cuda and torch.cuda.is_available():
                 feature, target = feature.cuda(), target.cuda()
@@ -80,10 +79,10 @@ def __eval(self, data_iter):
             if self.scoring == "roc_auc":
                 pred = [[float(p) for p in dist] for dist in softmax(logit)]
             else:
-                pred = torch.max(logit, 1)[1].view(target.size()).data.tolist()
+                pred = torch.max(logit, 1)[1].view(target.size()).tolist()

             preds += pred
-            targets += target.data.tolist()
+            targets += target.tolist()

@@ -131,7 +130,7 @@ def fit(self, X, y, sample_weight=None):
         for epoch in range(self.epochs):
             for batch in train_iter:
-                feature, target = batch.text.data.t(), batch.label.data.sub(1)
+                feature, target = batch.text.t_(), batch.label.sub_(1)

                 if self.cuda and torch.cuda.is_available():
                     feature, target = feature.cuda(), target.cuda()
@@ -167,7 +166,6 @@ def fit(self, X, y, sample_weight=None):
         if self.verbose > 0:
             self.__print_elapsed_time(time() - start)

-        torch.cuda.empty_cache()
         return self

     def __predict(self, X):
@@ -182,17 +180,16 @@ def __predict(self, X):
             text = self.__text_field.preprocess(text)
             text = self.__pad(text, max_kernel_size, True)
             text = [[self.__text_field.vocab.stoi[x] for x in text]]
-            x = Variable(torch.tensor(text))
+            x = torch.tensor(text)
             x = x.cuda() if self.cuda and torch.cuda.is_available() else x

             y_output.append(self.__model(x))

-        torch.cuda.empty_cache()
         return y_output

     def predict(self, X):
         y_pred = [torch.argmax(yi, 1) for yi in self.__predict(X)]
-        return [self.__label_field.vocab.itos[yi.data[0] + 1] for yi in y_pred]
+        return [self.__label_field.vocab.itos[yi.item() + 1] for yi in y_pred]

     def predict_proba(self, X):
         softmax = nn.Softmax(dim=1)
@@ -253,7 +250,7 @@ def __preprocess_text(self, text):
         if self.preprocessor is None:
             return self.__clean_str(text)

-        return self.__clean_str(self.preprocessor(text))
+        return
self.preprocessor(text) def __print_elapsed_time(self, seconds): sc = round(seconds) @@ -289,10 +286,10 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, Ks = kernel_sizes module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] - self.__convs1 = nn.ModuleList(module_list) + self.__convs = nn.ModuleList(module_list) self.__dropout = nn.Dropout(dropout) - self.__fc1 = nn.Linear(len(Ks) * kernel_num, class_num) - self.__static = static + self.__fc = nn.Linear(len(Ks) * kernel_num, class_num) + self.__embed.weight.requires_grad = not static if activation_func == "relu": self.__f = F.relu @@ -302,11 +299,11 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, self.__f = lambda x: x def forward(self, x): - x = Variable(self.__embed(x)) if self.__static else self.__embed(x) - x = x.unsqueeze(1) - x = [self.__f(conv(x), inplace=True).squeeze(3) for conv in self.__convs1] + x = self.__embed(x).unsqueeze(1) + x = [self.__f(cnv(x), inplace=True).squeeze(3) for cnv in self.__convs] x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] - return self.__fc1(self.__dropout(torch.cat(x, 1))) + return self.__fc(self.__dropout(torch.cat(x, 1))) + class _Eval(): def __init__(self, preds): From 6dfa40afc41e001b693255580e6e6c4783e5b921 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Thu, 27 Aug 2020 16:56:33 -0700 Subject: [PATCH 26/31] Refactored prediction code and fixed preprocessing bug --- README.md | 2 -- cnn_text_classification.py | 70 +++++++++++++++----------------------- 2 files changed, 28 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 51755ef..9a048b6 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,6 @@ Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github. * scikit-learn ## Known Issues -* The predict and predict_proba methods are probably not as efficient as they could be. -* The class probabilities returned by the predict_proba method are probably questionable. * Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). * Only supports pre-trained word vectors from TorchText (or no pre-trained vectors). * The random_state parameter probably only works with integers or None. 
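The refactored `__predict` below batches every input into a single forward pass with `torch.stack`, which only accepts equal-sized tensors; the tokenizer's padding guarantees a minimum length, but inputs must still tokenize to a common length to stack. A rough sketch of a more general batcher that pads to the longest sequence instead (`stoi` and `pad_index` stand in for the fitted vocabulary; torchtext puts `<unk>` at index 0 and `<pad>` at index 1 by default):

```
import torch

def batch_indices(tokenized_texts, stoi, pad_index=1):
    """Build one (batch, max_len) LongTensor from tokenized texts.

    Each sequence is right-padded to the longest one, so the rows satisfy
    the equal-size requirement that stacking imposes.
    """
    max_len = max(len(tokens) for tokens in tokenized_texts)
    rows = [[stoi.get(token, 0) for token in tokens]      # 0: the <unk> slot
            + [pad_index] * (max_len - len(tokens))
            for tokens in tokenized_texts]
    return torch.tensor(rows, dtype=torch.long)
```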
diff --git a/cnn_text_classification.py b/cnn_text_classification.py index 9b228fb..4d83355 100644 --- a/cnn_text_classification.py +++ b/cnn_text_classification.py @@ -4,12 +4,13 @@ import torch.nn.functional as F from collections import Counter from os import remove +from os.path import exists from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score from sklearn.model_selection import train_test_split as split from sklearn.utils.class_weight import compute_sample_weight from time import time -from torchtext.data import Dataset, Example, Field, Iterator, Pipeline +from torchtext.data import Dataset, Example, Field, Iterator class CNNClassifier(BaseEstimator, ClassifierMixin): @@ -43,8 +44,9 @@ def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, self.class_weight = class_weight self.random_state = random_state self.verbose = verbose + self.__max_kernel_size = max(self.kernel_sizes) - def __clean_str(self, string): + def __default_preprocessor(self, string): string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) string = re.sub(r"\'s", " \'s", string) string = re.sub(r"\'ve", " \'ve", string) @@ -107,7 +109,7 @@ def fit(self, X, y, sample_weight=None): print("\n".join([": ".join([k, str(v)]) for k, v in params])) start = time() if self.verbose > 0 else None - train_iter, dev_iter = self.__preprocess(X, y, sample_weight) + train_iter, dev_iter = self.__prepare_train_data(X, y, sample_weight) embed_num = len(self.__text_field.vocab) class_num = len(self.__label_field.vocab) - 1 self.__model = _CNNText(embed_num, self.embed_dim, class_num, @@ -157,7 +159,7 @@ def fit(self, X, y, sample_weight=None): if not active: break - if self.save_best: + if self.save_best and exists(filename): self.__model.load_state_dict(torch.load(filename)) remove(filename) @@ -169,8 +171,7 @@ def fit(self, X, y, sample_weight=None): return self def __predict(self, X): - y_output = [] - max_kernel_size = max(self.kernel_sizes) + texts = [] self.__model.eval() @@ -178,44 +179,25 @@ def __predict(self, X): assert isinstance(text, str) text = self.__text_field.preprocess(text) - text = self.__pad(text, max_kernel_size, True) - text = [[self.__text_field.vocab.stoi[x] for x in text]] - x = torch.tensor(text) - x = x.cuda() if self.cuda and torch.cuda.is_available() else x + text = [self.__text_field.vocab.stoi[x] for x in text] + texts.append(torch.tensor(text)) - y_output.append(self.__model(x)) - - return y_output + x = torch.stack(texts, 0) + x = x.cuda() if self.cuda and torch.cuda.is_available() else x + return self.__model(x) def predict(self, X): - y_pred = [torch.argmax(yi, 1) for yi in self.__predict(X)] + y_pred = torch.argmax(self.__predict(X), 1) return [self.__label_field.vocab.itos[yi.item() + 1] for yi in y_pred] def predict_proba(self, X): - softmax = nn.Softmax(dim=1) - y_prob = [softmax(yi) for yi in self.__predict(X)] - return [[float(yij) for yij in yi[0]] for yi in y_prob] - - def __pad(self, x, max_kernel_size, preprocessed=False): - tokens = x if preprocessed else self.__text_field.preprocess(x) - difference = max_kernel_size - len(tokens) - - if difference > 0: - padding = [self.__text_field.pad_token] * difference - return x + padding if preprocessed else " ".join([x] + padding) - - return x + return nn.Softmax(dim=1)(self.__predict(X)).tolist() - def __preprocess(self, X, y, sample_weight): + def __prepare_train_data(self, X, y, sample_weight): self.__text_field = Field(lower=True) self.__label_field = 
Field(sequential=False) - self.__text_field.preprocessing = Pipeline(self.__preprocess_text) - max_kernel_size = max(self.kernel_sizes) + self.__text_field.tokenize = self.__tokenize sample_weight = None if sample_weight is None else list(sample_weight) - - for i in range(len(X)): - X[i] = self.__pad(X[i], max_kernel_size) - sw = [1 for yi in y] if sample_weight is None else sample_weight s = y if Counter(y).most_common()[-1][1] > 1 else None X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=s, @@ -246,12 +228,6 @@ def __preprocess(self, X, y, sample_weight): return Iterator.splits((train_data, dev_data), batch_sizes=batch_sizes, sort_key=lambda ex: len(ex.text), repeat=False) - def __preprocess_text(self, text): - if self.preprocessor is None: - return self.__clean_str(text) - - return self.preprocessor(text) - def __print_elapsed_time(self, seconds): sc = round(seconds) mn = int(sc / 60) @@ -272,6 +248,16 @@ def __print_elapsed_time(self, seconds): print("Completed training in {}.".format(times)) + def __tokenize(self, text): + if self.preprocessor is None: + text = self.__default_preprocessor(text) + else: + text = self.preprocessor(text) + + tokens = text.split() + difference = self.__max_kernel_size - len(tokens) + return tokens + [self.__text_field.pad_token] * max(difference, 0) + class _CNNText(nn.Module): def __init__(self, embed_num, embed_dim, class_num, kernel_num, @@ -280,8 +266,9 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, if vectors is None: self.__embed = nn.Embedding(embed_num, embed_dim) + self.__embed.weight.requires_grad = not static else: - self.__embed = nn.Embedding.from_pretrained(vectors) + self.__embed = nn.Embedding.from_pretrained(vectors, freeze=static) embed_dim = self.__embed.embedding_dim Ks = kernel_sizes @@ -289,7 +276,6 @@ def __init__(self, embed_num, embed_dim, class_num, kernel_num, self.__convs = nn.ModuleList(module_list) self.__dropout = nn.Dropout(dropout) self.__fc = nn.Linear(len(Ks) * kernel_num, class_num) - self.__embed.weight.requires_grad = not static if activation_func == "relu": self.__f = F.relu From f30623afb69022ecf4a230d307cf4a795b9651a4 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Mon, 31 Aug 2020 15:34:39 -0700 Subject: [PATCH 27/31] Add files via upload --- README.md | 208 +++++++++++++++++++++++++------------------------- main.py | 116 ++++++++++++++++++++++++++++ model.py | 41 ++++++++++ mydatasets.py | 110 ++++++++++++++++++++++++++ train.py | 102 +++++++++++++++++++++++++ 5 files changed, 474 insertions(+), 103 deletions(-) create mode 100644 main.py create mode 100644 model.py create mode 100644 mydatasets.py create mode 100644 train.py diff --git a/README.md b/README.md index 9a048b6..5ee32a7 100644 --- a/README.md +++ b/README.md @@ -1,125 +1,127 @@ ## Introduction -Fork of Shawn Ng's [CNNs for Sentence Classification in PyTorch](https://github.com/Shawn1993/cnn-text-classification-pytorch), refactored as a scikit-learn classifier. +This is the implementation of Kim's [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch. -## Requirements +1. Kim's implementation of the model in Theano: +[https://github.com/yoonkim/CNN_sentence](https://github.com/yoonkim/CNN_sentence) +2. Denny Britz has an implementation in Tensorflow: +[https://github.com/dennybritz/cnn-text-classification-tf](https://github.com/dennybritz/cnn-text-classification-tf) +3. 
Alexander Rakhlin's implementation in Keras; +[https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras](https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras) + +## Requirement * python 3 * pytorch > 0.1 * torchtext > 0.1 * numpy -* scikit-learn - -## Known Issues -* Doesn't play well with GridSearchCV if num_jobs isn't 1 (unless not using CUDA). -* Only supports pre-trained word vectors from TorchText (or no pre-trained vectors). -* The random_state parameter probably only works with integers or None. -* Features my idiosyncratic coding style. - -## To Do -* Add support for cross-validation during training. -* Implement sample weights in eval scoring? - -## Parameters -**lr : float, optional (default=0.01)** - Initial learning rate. - -**epochs : integer, optional (default=256)** - Number of training epochs. - -**batch_size : integer, optional (default=64)** - Training batch size. - -**test_interval : integer, optional (default=100)** - The number of epochs to wait before testing. - -**early_stop : integer, optional (default=1000)** - The number of iterations without increased performance to reach before stopping. - -**save_best : boolean, optional (default=True)** - Keep the model with the best performance found during training. - -**dropout : float, optional (default=0.5)** - Dropout probability. - -**max_norm : float, optional (default=0.0)** - L2 constraint. - -**embed_dim : integer, optional (default=128)** - The number of embedding dimensions. Ignored if vectors is not None. - -**kernel_num : integer, optional (default=100)** - The number of each size of kernel. - -**kernel_sizes : iterable of integers, optional (default=(3, 4, 5))** - Kernel sizes to use for convolution. - -**static : boolean, optional (default=False)** - If true, fix the embedding. - -**device : int, optional (default=-1)** - Device to use for iterating data; -1 for CPU (see torch.cuda.set_device()). -**cuda : boolean, optional (default=True)** - If true, use the GPU if available. +## Result +I just tried two dataset, MR and SST. -**activation_func : string, optional (default='relu')** - Activation function. If 'relu' or 'tanh', uses rectified linear unit or hyperbolic tangent, respectively. Otherwise, uses no activation function (f(x) = x). +|Dataset|Class Size|Best Result|Kim's Paper Result| +|---|---|---|---| +|MR|2|77.5%(CNN-rand-static)|76.1%(CNN-rand-nostatic)| +|SST|5|37.2%(CNN-rand-static)|45.0%(CNN-rand-nostatic)| -**scoring : callable or "roc_auc", optional (default=sklearn.metrics.make_scorer(sklearn.metrics.accuracy_score))** - Scoring method for testing model performance during fitting. +I haven't adjusted the hyper-parameters for SST seriously. -**pos_label : string, optional (default=None)** - Positive class label for roc_auc scoring. Ignored if using a different scoring method. - -**vectors : string, optional (default=None)** - Which pretrained TorchText vectors to use (see [torchtext.vocab.pretrained_aliases](https://torchtext.readthedocs.io/en/latest/vocab.html#pretrained-aliases) for options). - -**split_ratio : float, optional (default=0.9)** - Ratio of training data used for training. The remainder will be used for validation. - -**preprocessor : callable or None, optional (default=None)** - Override default string preprocessing. - -**class_weight : dict, "balanced" or None, optional (default=None)** - Weights associated with each class (see class_weight parameter in existing scikit-learn classifiers). 
-
-**random_state : integer, optional (default=None)**
-  Seed for the random number generator.
-
-**verbose : integer, optional (default=0)**
-  Controls the verbosity when fitting.
-
-## Methods
-**fit(X, y, sample_weight=None)**
-Train the CNN classifier from the training set (X, y).
+## Usage
 ```
-Parameters: X: list of strings
-        The training input samples.
+./main.py -h
 ```
+or

-        y: list of strings
-        The class labels.
+```
+python3 main.py -h
+```

-        sample_weight: list of integers or floats, or None
-        Sample weights. If None, samples are equally weighted.
+You will get:

-Returns: self : object
 ```
-
-**predict(X)**
-Predict class for X.
+CNN text classificer
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -batch-size N         batch size for training [default: 50]
+  -lr LR                initial learning rate [default: 0.01]
+  -epochs N             number of epochs for train [default: 10]
+  -dropout              the probability for dropout [default: 0.5]
+  -max_norm MAX_NORM    l2 constraint of parameters
+  -cpu                  disable the gpu
+  -device DEVICE        device to use for iterate data
+  -embed-dim EMBED_DIM
+  -static               fix the embedding
+  -kernel-sizes KERNEL_SIZES
+                        Comma-separated kernel size to use for convolution
+  -kernel-num KERNEL_NUM
+                        number of each kind of kernel
+  -class-num CLASS_NUM  number of class
+  -shuffle              shuffle the data every epoch
+  -num-workers NUM_WORKERS
+                        how many subprocesses to use for data loading
+                        [default: 0]
+  -log-interval LOG_INTERVAL
+                        how many batches to wait before logging training
+                        status
+  -test-interval TEST_INTERVAL
+                        how many epochs to wait before testing
+  -save-interval SAVE_INTERVAL
+                        how many epochs to wait before saving
+  -predict PREDICT      predict the sentence given
+  -snapshot SNAPSHOT    filename of model snapshot [default: None]
+  -save-dir SAVE_DIR    where to save the checkpoint
 ```
-Parameters: X: list of strings
-        The input samples.
-Returns: y: list of strings
-        The predicted classes.
+## Train
+```
+./main.py
 ```
+You will get:

-**predict_proba(X)**
-Predict class probabilities for X.
 ```
-Parameters: X: list of strings
-        The input samples.
+Batch[100] - loss: 0.655424 acc: 59.3750%
+Evaluation - loss: 0.672396 acc: 57.6923%(615/1066)
+```
+
+## Test
+If you have constructed your test set, you can run testing like:

-Returns: y: list of lists of floats
-        The predicted class probabilities.
 ```
+/main.py -test -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt
+```
+The snapshot option specifies where your model is loaded from. If you don't assign it, the model will start from scratch.
+
+## Predict
+* **Example1**
+
+  ```
+  ./main.py -predict="Hello my dear , I love you so much ." \
+            -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt"
+  ```
+  You will get:
+
+  ```
+  Loading model from [./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt]...
+
+  [Text] Hello my dear , I love you so much .
+  [Label] positive
+  ```
+* **Example2**
+
+  ```
+  ./main.py -predict="You just make me so sad and I have to leave you ."\
+            -snapshot="./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt"
+  ```
+  You will get:
+
+  ```
+  Loading model from [./snapshot/2017-02-11_15-50-53/snapshot_steps1500.pt]...
+
+  [Text] You just make me so sad and I have to leave you .
+  [Label] negative
+  ```
+
+Your text must be separated by spaces, even punctuation. Also, your text should be longer than the max kernel size.
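The length requirement comes straight from the convolution shapes: a kernel of height K cannot slide over fewer than K token rows, so PyTorch rejects shorter inputs at run time. A toy demonstration with made-up sizes:

```
import torch
import torch.nn as nn

conv = nn.Conv2d(1, 2, (5, 10))         # kernel height 5 over a 10-dim embedding
long_enough = torch.randn(1, 1, 7, 10)  # 7 "tokens": fine
print(conv(long_enough).shape)          # torch.Size([1, 2, 3, 1])

too_short = torch.randn(1, 1, 4, 10)    # 4 "tokens" < kernel height 5
try:
    conv(too_short)
except RuntimeError as error:
    print("rejected:", type(error).__name__)
```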
+ +## Reference +* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) + diff --git a/main.py b/main.py new file mode 100644 index 0000000..dd222a6 --- /dev/null +++ b/main.py @@ -0,0 +1,116 @@ +#! /usr/bin/env python +import os +import argparse +import datetime +import torch +import torchtext.data as data +import torchtext.datasets as datasets +import model +import train +import mydatasets + + +parser = argparse.ArgumentParser(description='CNN text classificer') +# learning +parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]') +parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]') +parser.add_argument('-batch-size', type=int, default=64, help='batch size for training [default: 64]') +parser.add_argument('-log-interval', type=int, default=1, help='how many steps to wait before logging training status [default: 1]') +parser.add_argument('-test-interval', type=int, default=100, help='how many steps to wait before testing [default: 100]') +parser.add_argument('-save-interval', type=int, default=500, help='how many steps to wait before saving [default:500]') +parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the snapshot') +parser.add_argument('-early-stop', type=int, default=1000, help='iteration numbers to stop without performance increasing') +parser.add_argument('-save-best', type=bool, default=True, help='whether to save when get best performance') +# data +parser.add_argument('-shuffle', action='store_true', default=False, help='shuffle the data every epoch') +# model +parser.add_argument('-dropout', type=float, default=0.5, help='the probability for dropout [default: 0.5]') +parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]') +parser.add_argument('-embed-dim', type=int, default=128, help='number of embedding dimension [default: 128]') +parser.add_argument('-kernel-num', type=int, default=100, help='number of each kind of kernel') +parser.add_argument('-kernel-sizes', type=str, default='3,4,5', help='comma-separated kernel size to use for convolution') +parser.add_argument('-static', action='store_true', default=False, help='fix the embedding') +# device +parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data, -1 mean cpu [default: -1]') +parser.add_argument('-no-cuda', action='store_true', default=False, help='disable the gpu') +# option +parser.add_argument('-snapshot', type=str, default=None, help='filename of model snapshot [default: None]') +parser.add_argument('-predict', type=str, default=None, help='predict the sentence given') +parser.add_argument('-test', action='store_true', default=False, help='train or test') +args = parser.parse_args() + + +# load SST dataset +def sst(text_field, label_field, **kargs): + train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True) + text_field.build_vocab(train_data, dev_data, test_data) + label_field.build_vocab(train_data, dev_data, test_data) + train_iter, dev_iter, test_iter = data.BucketIterator.splits( + (train_data, dev_data, test_data), + batch_sizes=(args.batch_size, + len(dev_data), + len(test_data)), + **kargs) + return train_iter, dev_iter, test_iter + + +# load MR dataset +def mr(text_field, label_field, **kargs): + train_data, dev_data = mydatasets.MR.splits(text_field, label_field) + text_field.build_vocab(train_data, dev_data) + 
label_field.build_vocab(train_data, dev_data) + train_iter, dev_iter = data.Iterator.splits( + (train_data, dev_data), + batch_sizes=(args.batch_size, len(dev_data)), + **kargs) + return train_iter, dev_iter + + +# load data +print("\nLoading data...") +text_field = data.Field(lower=True) +label_field = data.Field(sequential=False) +train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False) +# train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False) + + +# update args and print +args.embed_num = len(text_field.vocab) +args.class_num = len(label_field.vocab) - 1 +args.cuda = (not args.no_cuda) and torch.cuda.is_available(); del args.no_cuda +args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')] +args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + +print("\nParameters:") +for attr, value in sorted(args.__dict__.items()): + print("\t{}={}".format(attr.upper(), value)) + + +# model +cnn = model.CNN_Text(args) +if args.snapshot is not None: + print('\nLoading model from {}...'.format(args.snapshot)) + cnn.load_state_dict(torch.load(args.snapshot)) + +if args.cuda: + torch.cuda.set_device(args.device) + cnn = cnn.cuda() + + +# train or predict +if args.predict is not None: + label = train.predict(args.predict, cnn, text_field, label_field, args.cuda) + print('\n[Text] {}\n[Label] {}\n'.format(args.predict, label)) +elif args.test: + try: + train.eval(test_iter, cnn, args) + except Exception as e: + print("\nSorry. The test dataset doesn't exist.\n") +else: + print() + try: + train.train(train_iter, dev_iter, cnn, args) + except KeyboardInterrupt: + print('\n' + '-' * 89) + print('Exiting from training early') + diff --git a/model.py b/model.py new file mode 100644 index 0000000..db6586f --- /dev/null +++ b/model.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable + + +class CNN_Text(nn.Module): + + def __init__(self, args): + super(CNN_Text, self).__init__() + self.args = args + + V = args.embed_num + D = args.embed_dim + C = args.class_num + Ci = 1 + Co = args.kernel_num + Ks = args.kernel_sizes + + self.embed = nn.Embedding(V, D) + self.convs = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks]) + self.dropout = nn.Dropout(args.dropout) + self.fc1 = nn.Linear(len(Ks) * Co, C) + + if self.args.static: + self.embed.weight.requires_grad = False + + def forward(self, x): + x = self.embed(x) # (N, W, D) + + x = x.unsqueeze(1) # (N, Ci, W, D) + + x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] # [(N, Co, W), ...]*len(Ks) + + x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] # [(N, Co), ...]*len(Ks) + + x = torch.cat(x, 1) + + x = self.dropout(x) # (N, len(Ks)*Co) + logit = self.fc1(x) # (N, C) + return logit diff --git a/mydatasets.py b/mydatasets.py new file mode 100644 index 0000000..961188f --- /dev/null +++ b/mydatasets.py @@ -0,0 +1,110 @@ +import re +import os +import random +import tarfile +import urllib +from torchtext import data + + +class TarDataset(data.Dataset): + """Defines a Dataset loaded from a downloadable tar archive. + + Attributes: + url: URL where the tar archive can be downloaded. + filename: Filename of the downloaded tar archive. + dirname: Name of the top-level directory within the zip archive that + contains the data files. 
+    """
+
+    @classmethod
+    def download_or_unzip(cls, root):
+        path = os.path.join(root, cls.dirname)
+        if not os.path.isdir(path):
+            tpath = os.path.join(root, cls.filename)
+            if not os.path.isfile(tpath):
+                print('downloading')
+                urllib.request.urlretrieve(cls.url, tpath)
+            with tarfile.open(tpath, 'r') as tfile:
+                print('extracting')
+                tfile.extractall(root)
+        return os.path.join(path, '')
+
+
+class MR(TarDataset):
+
+    url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
+    filename = 'rt-polaritydata.tar.gz'
+    dirname = 'rt-polaritydata'
+
+    @staticmethod
+    def sort_key(ex):
+        return len(ex.text)
+
+    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
+        """Create an MR dataset instance given a path and fields.
+
+        Arguments:
+            text_field: The field that will be used for text data.
+            label_field: The field that will be used for label data.
+            path: Path to the data file.
+            examples: The examples containing all the data.
+            Remaining keyword arguments: Passed to the constructor of
+                data.Dataset.
+        """
+        def clean_str(string):
+            """
+            Tokenization/string cleaning for all datasets except for SST.
+            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
+            """
+            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
+            string = re.sub(r"\'s", " \'s", string)
+            string = re.sub(r"\'ve", " \'ve", string)
+            string = re.sub(r"n\'t", " n\'t", string)
+            string = re.sub(r"\'re", " \'re", string)
+            string = re.sub(r"\'d", " \'d", string)
+            string = re.sub(r"\'ll", " \'ll", string)
+            string = re.sub(r",", " , ", string)
+            string = re.sub(r"!", " ! ", string)
+            string = re.sub(r"\(", " \( ", string)
+            string = re.sub(r"\)", " \) ", string)
+            string = re.sub(r"\?", " \? ", string)
+            string = re.sub(r"\s{2,}", " ", string)
+            return string.strip()
+
+        text_field.tokenize = lambda x: clean_str(x).split()
+        fields = [('text', text_field), ('label', label_field)]
+
+        if examples is None:
+            path = self.dirname if path is None else path
+            examples = []
+            with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f:
+                examples += [
+                    data.Example.fromlist([line, 'negative'], fields) for line in f]
+            with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f:
+                examples += [
+                    data.Example.fromlist([line, 'positive'], fields) for line in f]
+        super(MR, self).__init__(examples, fields, **kwargs)
+
+    @classmethod
+    def splits(cls, text_field, label_field, dev_ratio=.1, shuffle=True, root='.', **kwargs):
+        """Create dataset objects for splits of the MR dataset.
+
+        Arguments:
+            text_field: The field that will be used for the sentence.
+            label_field: The field that will be used for label data.
+            dev_ratio: The fraction of the examples to hold out as the
+                validation set.
+            shuffle: Whether to shuffle the data before the split.
+            root: The root directory that the dataset's tar archive will be
+                expanded into, and therefore the directory in which the data
+                files will be stored.
+            Remaining keyword arguments: Passed to the splits method of
+                Dataset.
+        """
+        path = cls.download_or_unzip(root)
+        examples = cls(text_field, label_field, path=path, **kwargs).examples
+        if shuffle: random.shuffle(examples)
+        dev_index = -1 * int(dev_ratio*len(examples))
+
+        return (cls(text_field, label_field, examples=examples[:dev_index]),
+                cls(text_field, label_field, examples=examples[dev_index:]))
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..d9000af
--- /dev/null
+++ b/train.py
@@ -0,0 +1,102 @@
+import os
+import sys
+import torch
+import torch.autograd as autograd
+import torch.nn.functional as F
+
+
+def train(train_iter, dev_iter, model, args):
+    if args.cuda:
+        model.cuda()
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
+
+    steps = 0
+    best_acc = 0
+    last_step = 0
+    model.train()
+    for epoch in range(1, args.epochs+1):
+        for batch in train_iter:
+            feature, target = batch.text, batch.label
+            feature.t_(), target.sub_(1)  # batch first, index align
+            if args.cuda:
+                feature, target = feature.cuda(), target.cuda()
+
+            optimizer.zero_grad()
+            logit = model(feature)
+            loss = F.cross_entropy(logit, target)
+            loss.backward()
+            optimizer.step()
+
+            steps += 1
+            if steps % args.log_interval == 0:
+                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
+                accuracy = 100.0 * corrects/batch.batch_size
+                sys.stdout.write(
+                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
+                                                                             loss.item(),
+                                                                             accuracy.item(),
+                                                                             corrects.item(),
+                                                                             batch.batch_size))
+            if steps % args.test_interval == 0:
+                dev_acc = eval(dev_iter, model, args)
+                if dev_acc > best_acc:
+                    best_acc = dev_acc
+                    last_step = steps
+                    if args.save_best:
+                        save(model, args.save_dir, 'best', steps)
+                else:
+                    if steps - last_step >= args.early_stop:
+                        print('early stop by {} steps.'.format(args.early_stop))
+                        return
+            elif steps % args.save_interval == 0:
+                save(model, args.save_dir, 'snapshot', steps)
+
+
+def eval(data_iter, model, args):
+    model.eval()
+    corrects, avg_loss = 0, 0
+    for batch in data_iter:
+        feature, target = batch.text, batch.label
+        feature.t_(), target.sub_(1)  # batch first, index align
+        if args.cuda:
+            feature, target = feature.cuda(), target.cuda()
+
+        logit = model(feature)
+        loss = F.cross_entropy(logit, target, reduction='sum')
+
+        avg_loss += loss.item()
+        corrects += (torch.max(logit, 1)
+                     [1].view(target.size()).data == target.data).sum()
+
+    size = len(data_iter.dataset)
+    avg_loss /= size
+    accuracy = 100.0 * corrects/size
+    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss,
+                                                                       accuracy,
+                                                                       corrects,
+                                                                       size))
+    return accuracy
+
+
+def predict(text, model, text_field, label_field, cuda_flag):
+    assert isinstance(text, str)
+    model.eval()
+    # text = text_field.tokenize(text)
+    text = text_field.preprocess(text)
+    text = [[text_field.vocab.stoi[x] for x in text]]
+    x = torch.tensor(text)
+    x = autograd.Variable(x)
+    if cuda_flag:
+        x = x.cuda()
+    output = model(x)
+    _, predicted = torch.max(output, 1)
+    return label_field.vocab.itos[predicted.item()+1]
+
+
+def save(model, save_dir, save_prefix, steps):
+    if not os.path.isdir(save_dir):
+        os.makedirs(save_dir)
+    save_prefix = os.path.join(save_dir, save_prefix)
+    save_path = '{}_steps_{}.pt'.format(save_prefix, steps)
+    torch.save(model.state_dict(), save_path)

From 0751811ee2d2577b620b6c3bebef3dbafdb37c3e Mon Sep 17 00:00:00 2001
From: rriva002
Date: Mon, 31 Aug 2020 15:35:06 -0700
Subject: [PATCH 28/31] Delete cnn_text_classification.py

---
 cnn_text_classification.py | 299 -------------------------------------
 1 file
changed, 299 deletions(-) delete mode 100644 cnn_text_classification.py diff --git a/cnn_text_classification.py b/cnn_text_classification.py deleted file mode 100644 index 4d83355..0000000 --- a/cnn_text_classification.py +++ /dev/null @@ -1,299 +0,0 @@ -import re -import torch -import torch.nn as nn -import torch.nn.functional as F -from collections import Counter -from os import remove -from os.path import exists -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score -from sklearn.model_selection import train_test_split as split -from sklearn.utils.class_weight import compute_sample_weight -from time import time -from torchtext.data import Dataset, Example, Field, Iterator - - -class CNNClassifier(BaseEstimator, ClassifierMixin): - def __init__(self, lr=0.001, epochs=256, batch_size=64, test_interval=100, - early_stop=1000, save_best=True, dropout=0.5, max_norm=0.0, - embed_dim=128, kernel_num=100, kernel_sizes=(3, 4, 5), - static=False, device=-1, cuda=True, activation_func="relu", - scoring=make_scorer(accuracy_score), pos_label=None, - vectors=None, split_ratio=0.8, preprocessor=None, - class_weight=None, random_state=None, verbose=0): - self.lr = lr - self.epochs = epochs - self.batch_size = batch_size - self.test_interval = test_interval - self.early_stop = early_stop - self.save_best = save_best - self.dropout = dropout - self.max_norm = max_norm - self.embed_dim = embed_dim - self.kernel_num = kernel_num - self.kernel_sizes = kernel_sizes - self.static = static - self.device = device - self.cuda = cuda - self.activation_func = activation_func - self.scoring = scoring - self.pos_label = pos_label - self.vectors = vectors - self.split_ratio = split_ratio - self.preprocessor = preprocessor - self.class_weight = class_weight - self.random_state = random_state - self.verbose = verbose - self.__max_kernel_size = max(self.kernel_sizes) - - def __default_preprocessor(self, string): - string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) - string = re.sub(r"\'s", " \'s", string) - string = re.sub(r"\'ve", " \'ve", string) - string = re.sub(r"n\'t", " n\'t", string) - string = re.sub(r"\'re", " \'re", string) - string = re.sub(r"\'d", " \'d", string) - string = re.sub(r"\'ll", " \'ll", string) - string = re.sub(r",", " , ", string) - string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " ( ", string) - string = re.sub(r"\)", " ) ", string) - string = re.sub(r"\?", " ? 
", string) - string = re.sub(r"\s{2,}", " ", string) - return string.strip() - - def __eval(self, data_iter): - self.__model.eval() - - preds, targets = [], [] - softmax = nn.Softmax(dim=1) if self.scoring == "roc_auc" else None - - for batch in data_iter: - feature, target = batch.text.t_(), batch.label.sub_(1) - - if self.cuda and torch.cuda.is_available(): - feature, target = feature.cuda(), target.cuda() - - logit = self.__model(feature) - - F.cross_entropy(logit, target, reduction="sum") - - if self.scoring == "roc_auc": - pred = [[float(p) for p in dist] for dist in softmax(logit)] - else: - pred = torch.max(logit, 1)[1].view(target.size()).tolist() - - preds += pred - targets += target.tolist() - - targets = [self.__label_field.vocab.itos[targ + 1] for targ in targets] - - if self.scoring == "roc_auc": - pos_index = self.__label_field.vocab.stoi[self.pos_label] - 1 - return roc_auc_score(targets, [pred[pos_index] for pred in preds]) - - preds = [self.__label_field.vocab.itos[pred + 1] for pred in preds] - return self.scoring(_Eval(preds), None, targets) - - def fit(self, X, y, sample_weight=None): - if self.random_state is not None: - torch.manual_seed(self.random_state) - - torch.backends.cudnn.deterministic = self.random_state is not None - torch.backends.cudnn.benchmark = self.random_state is None - - if self.verbose > 1: - params = self.get_params().items() - - print("Fitting with the following parameters:") - print("\n".join([": ".join([k, str(v)]) for k, v in params])) - - start = time() if self.verbose > 0 else None - train_iter, dev_iter = self.__prepare_train_data(X, y, sample_weight) - embed_num = len(self.__text_field.vocab) - class_num = len(self.__label_field.vocab) - 1 - self.__model = _CNNText(embed_num, self.embed_dim, class_num, - self.kernel_num, self.kernel_sizes, - self.dropout, self.static, - self.activation_func, - vectors=self.__text_field.vocab.vectors) - - if self.cuda and torch.cuda.is_available(): - torch.cuda.set_device(self.device) - self.__model.cuda() - - optimizer = torch.optim.Adam(self.__model.parameters(), lr=self.lr, - weight_decay=self.max_norm) - steps, best_acc, last_step = 0, 0, 0 - active = True - filename = "./{}.model".format(time()) - - self.__model.train() - - for epoch in range(self.epochs): - for batch in train_iter: - feature, target = batch.text.t_(), batch.label.sub_(1) - - if self.cuda and torch.cuda.is_available(): - feature, target = feature.cuda(), target.cuda() - - optimizer.zero_grad() - F.cross_entropy(self.__model(feature), target).backward() - optimizer.step() - - steps += 1 - - if steps % self.test_interval == 0: - dev_acc = self.__eval(dev_iter) - - if dev_acc > best_acc: - best_acc = dev_acc - last_step = steps - - if self.save_best: - torch.save(self.__model.state_dict(), filename) - elif steps - last_step >= self.early_stop: - active = False - break - - if not active: - break - - if self.save_best and exists(filename): - self.__model.load_state_dict(torch.load(filename)) - remove(filename) - - self.classes_ = self.__label_field.vocab.itos[1:] - - if self.verbose > 0: - self.__print_elapsed_time(time() - start) - - return self - - def __predict(self, X): - texts = [] - - self.__model.eval() - - for text in X: - assert isinstance(text, str) - - text = self.__text_field.preprocess(text) - text = [self.__text_field.vocab.stoi[x] for x in text] - texts.append(torch.tensor(text)) - - x = torch.stack(texts, 0) - x = x.cuda() if self.cuda and torch.cuda.is_available() else x - return self.__model(x) - - def predict(self, X): - 
y_pred = torch.argmax(self.__predict(X), 1) - return [self.__label_field.vocab.itos[yi.item() + 1] for yi in y_pred] - - def predict_proba(self, X): - return nn.Softmax(dim=1)(self.__predict(X)).tolist() - - def __prepare_train_data(self, X, y, sample_weight): - self.__text_field = Field(lower=True) - self.__label_field = Field(sequential=False) - self.__text_field.tokenize = self.__tokenize - sample_weight = None if sample_weight is None else list(sample_weight) - sw = [1 for yi in y] if sample_weight is None else sample_weight - s = y if Counter(y).most_common()[-1][1] > 1 else None - X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=s, - random_state=self.random_state, - train_size=self.split_ratio) - fields = [("text", self.__text_field), ("label", self.__label_field)] - examples = [[X_t[i], y_t[i]] for i in range(len(X_t))] - examples = [Example.fromlist(example, fields) for example in examples] - weights = compute_sample_weight(self.class_weight, y_t) - weights = [weights[i] * w_t[i] for i in range(len(y_t))] - min_weight = min(weights) - weights = [int(round(weight / min_weight)) for weight in weights] - - for i in range(len(X_t)): - Xi = [X_t[i] for j in range(weights[i] - 1)] - examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] - - train_data = Dataset(examples, fields) - dev_data = [[X_d[i], y_d[i]] for i in range(len(X_d))] - dev_data = [Example.fromlist(example, fields) for example in dev_data] - dev_data = Dataset(dev_data, fields) - - self.__text_field.build_vocab(train_data, dev_data, - vectors=self.vectors) - self.__label_field.build_vocab(train_data, dev_data) - - batch_sizes = (self.batch_size, len(dev_data)) - return Iterator.splits((train_data, dev_data), batch_sizes=batch_sizes, - sort_key=lambda ex: len(ex.text), repeat=False) - - def __print_elapsed_time(self, seconds): - sc = round(seconds) - mn = int(sc / 60) - sc = sc % 60 - hr = int(mn / 60) - mn = mn % 60 - hr = "{} hour{}".format(hr, "s" if hr > 1 else "") if hr > 0 else "" - mn = "{} minute{}".format(mn, "s" if mn > 1 else "") if mn > 0 else "" - sc = "{} second{}".format(sc, "s" if sc > 1 else "") if sc > 0 else "" - times = [t for t in [hr, mn, sc] if len(t) > 0] - - if len(times) == 3: - times = " and ".join([", ".join([hr, mn]), sc]) - elif len(times) == 2: - times = " and ".join(times) - else: - times = times[0] if len(times) > 0 else "less than 1 second" - - print("Completed training in {}.".format(times)) - - def __tokenize(self, text): - if self.preprocessor is None: - text = self.__default_preprocessor(text) - else: - text = self.preprocessor(text) - - tokens = text.split() - difference = self.__max_kernel_size - len(tokens) - return tokens + [self.__text_field.pad_token] * max(difference, 0) - - -class _CNNText(nn.Module): - def __init__(self, embed_num, embed_dim, class_num, kernel_num, - kernel_sizes, dropout, static, activation_func, vectors=None): - super(_CNNText, self).__init__() - - if vectors is None: - self.__embed = nn.Embedding(embed_num, embed_dim) - self.__embed.weight.requires_grad = not static - else: - self.__embed = nn.Embedding.from_pretrained(vectors, freeze=static) - embed_dim = self.__embed.embedding_dim - - Ks = kernel_sizes - module_list = [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in Ks] - self.__convs = nn.ModuleList(module_list) - self.__dropout = nn.Dropout(dropout) - self.__fc = nn.Linear(len(Ks) * kernel_num, class_num) - - if activation_func == "relu": - self.__f = F.relu - elif activation_func == "tanh": - self.__f = torch.tanh - 
else: - self.__f = lambda x: x - - def forward(self, x): - x = self.__embed(x).unsqueeze(1) - x = [self.__f(cnv(x), inplace=True).squeeze(3) for cnv in self.__convs] - x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] - return self.__fc(self.__dropout(torch.cat(x, 1))) - - -class _Eval(): - def __init__(self, preds): - self.__preds = preds - - def predict(self, X): - return self.__preds From 8b368e65cffc19b290ab2565d7f146191e2884c6 Mon Sep 17 00:00:00 2001 From: rriva002 Date: Mon, 31 Aug 2020 15:36:49 -0700 Subject: [PATCH 29/31] Add files via upload From 5e11c712d34bfc105931678168443d393e31a37c Mon Sep 17 00:00:00 2001 From: "Bowen(Brad) Xu" Date: Sat, 26 Sep 2020 01:33:55 +0800 Subject: [PATCH 30/31] Fix RuntimeError Re-call model.train() from eval() to fix "RuntimeError:Cudnn RNN backward can only be called in training mode." error. --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index d9000af..9e29b33 100644 --- a/train.py +++ b/train.py @@ -14,9 +14,9 @@ def train(train_iter, dev_iter, model, args): steps = 0 best_acc = 0 last_step = 0 - model.train() for epoch in range(1, args.epochs+1): for batch in train_iter: + model.train() feature, target = batch.text, batch.label feature.t_(), target.sub_(1) # batch first, index align if args.cuda: From ddf03147822be1aa7490d27a192847acbae86015 Mon Sep 17 00:00:00 2001 From: TrellixVulnTeam Date: Fri, 25 Nov 2022 16:26:59 +0000 Subject: [PATCH 31/31] Adding tarfile member sanitization to extractall() --- mydatasets.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/mydatasets.py b/mydatasets.py index 961188f..26fae46 100644 --- a/mydatasets.py +++ b/mydatasets.py @@ -26,7 +26,26 @@ def download_or_unzip(cls, root): urllib.request.urlretrieve(cls.url, tpath) with tarfile.open(tpath, 'r') as tfile: print('extracting') - tfile.extractall(root) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(tfile, root) return os.path.join(path, '')