
Commit 555a0d9

Update
1 parent 440b1cd commit 555a0d9

File tree

13 files changed (+12018 −69 lines)


.gitignore

Lines changed: 9 additions & 0 deletions
@@ -93,3 +93,12 @@ ENV/
 
 # model cache
 *.pt
+
+# dataset
+rt-polaritydata
+trees
+*.tar
+*.zip
+
+# pycharm
+.idea

.idea/vcs.xml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default.

README.md

Lines changed: 26 additions & 7 deletions
@@ -1,10 +1,9 @@
 ## Introduction
-This is the implementation of kim's [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch.
+This is the implementation of Kim's [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch.
 
-Kim's implementation of the model in Theano:
+1. Kim's implementation of the model in Theano:
 [https://github.com/yoonkim/CNN_sentence](https://github.com/yoonkim/CNN_sentence)
-
-Denny Britz has an implementation in Tensorflow:
+2. Denny Britz has an implementation in Tensorflow:
 [https://github.com/dennybritz/cnn-text-classification-tf](https://github.com/dennybritz/cnn-text-classification-tf)
 
 ## Requirement
@@ -13,6 +12,16 @@ Denny Britz has an implementation in Tensorflow:
 * torchtext > 0.1
 * numpy
 
+## Result
+I have only tried two datasets so far, MR and SST.
+
+|Dataset|Class Size|Best Result|Kim's Paper Result|
+|---|---|---|---|
+|MR|2|77.5% (CNN-rand-static)|76.1% (CNN-rand-nostatic)|
+|SST|5|37.2% (CNN-rand-static)|45.0% (CNN-rand-nostatic)|
+
+I haven't seriously tuned the hyper-parameters for SST yet.
+
 ## Usage
 ```
 ./main.py -h
@@ -71,11 +80,20 @@ Batch[100] - loss: 0.655424 acc: 59.3750%
 Evaluation - loss: 0.672396 acc: 57.6923%(615/1066)
 ```
 
+## Test
+If you have built a test set, you can run testing like this:
+
+```
+./main.py -test -snapshot="./snapshot/2017-02-11-15-50/snapshot_steps1500.pt"
+```
+The snapshot option tells the program which saved model to load. If you don't set it, the model starts from scratch.
+
 ## Predict
 * **Example1**
 
 ```
-./main.py -predict="Hello my dear , I love you so much ." -snapshot="./snapshot/2017-02-11-15-50/snapshot_steps1500.pt"
+./main.py -predict="Hello my dear , I love you so much ." \
+          -snapshot="./snapshot/2017-02-11-15-50/snapshot_steps1500.pt"
 ```
 You will get:
 
@@ -88,7 +106,8 @@ Evaluation - loss: 0.672396 acc: 57.6923%(615/1066)
 * **Example2**
 
 ```
-./main.py -predict="You just make me so sad and I have to leave you ." -snapshot="./snapshot/2017-02-11-15-50/snapshot_steps1500.pt"
+./main.py -predict="You just make me so sad and I have to leave you ." \
+          -snapshot="./snapshot/2017-02-11-15-50/snapshot_steps1500.pt"
 ```
 You will get:
 
@@ -99,7 +118,7 @@ Evaluation - loss: 0.672396 acc: 57.6923%(615/1066)
 [Label] negative
 ```
 
-
+Your text must be space-separated, including punctuation.
 
 ## Reference
 * [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)

main.py

Lines changed: 75 additions & 52 deletions
@@ -1,6 +1,4 @@
 #! /usr/bin/env python
-
-
 import os
 import argparse
 import datetime
@@ -13,76 +11,101 @@
 
 
 parser = argparse.ArgumentParser(description='CNN text classificer')
-parser.add_argument('-batch-size', type=int, default=64, metavar='N', help='batch size for training [default: 50]')
-parser.add_argument('-lr', type=float, default=0.001, metavar='LR', help='initial learning rate [default: 0.01]')
-parser.add_argument('-epochs', type=int, default=200, metavar='N', help='number of epochs for train [default: 10]')
-parser.add_argument('-dropout', type=float, default=0.5, metavar='', help='the probability for dropout [default: 0.5]')
-parser.add_argument('-max_norm', type=float, default=3.0, help='l2 constraint of parameters')
-parser.add_argument('-cpu', action='store_true', default=False, help='disable the gpu' )
-parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data')
-# model
-parser.add_argument('-embed-dim', type=int, default=128)
-parser.add_argument('-static', action='store_true', default=False, help='fix the embedding')
-parser.add_argument('-kernel-sizes', type=str, default='3,4,5', help='Comma-separated kernel size to use for convolution')
-parser.add_argument('-kernel-num', type=int, default=100, help='number of each kind of kernel')
-parser.add_argument('-class-num', type=int, default=2, help='number of class')
+# learning
+parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]')
+parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]')
+parser.add_argument('-batch-size', type=int, default=64, help='batch size for training [default: 64]')
+parser.add_argument('-log-interval', type=int, default=1, help='how many steps to wait before logging training status [default: 1]')
+parser.add_argument('-test-interval', type=int, default=100, help='how many steps to wait before testing [default: 100]')
+parser.add_argument('-save-interval', type=int, default=500, help='how many steps to wait before saving [default: 500]')
+parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the checkpoint')
 # data
 parser.add_argument('-shuffle', action='store_true', default=False, help='shuffle the data every epoch' )
-parser.add_argument('-num-workers', type=int, default=0, help='how many subprocesses to use for data loading [default: 0]')
-# log
-parser.add_argument('-log-interval', type=int, default=1, help='how many batches to wait before logging training status')
-parser.add_argument('-test-interval', type=int, default=100, help='how many epochs to wait before testing')
-parser.add_argument('-save-interval', type=int, default=100, help='how many epochs to wait before saving')
-parser.add_argument('-predict', type=str, default=None, help='predict the sentence given')
+# model
+parser.add_argument('-dropout', type=float, default=0.5, help='the probability for dropout [default: 0.5]')
+parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]')
+parser.add_argument('-embed-dim', type=int, default=128, help='number of embedding dimension [default: 128]')
+parser.add_argument('-kernel-num', type=int, default=100, help='number of each kind of kernel')
+parser.add_argument('-kernel-sizes', type=str, default='3,4,5', help='comma-separated kernel sizes to use for convolution')
+parser.add_argument('-static', action='store_true', default=False, help='fix the embedding')
+# device
+parser.add_argument('-device', type=int, default=-1, help='device to use for iterating data, -1 means cpu [default: -1]')
+parser.add_argument('-no-cuda', action='store_true', default=False, help='disable the gpu')
+# option
 parser.add_argument('-snapshot', type=str, default=None, help='filename of model snapshot [default: None]')
-parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the checkpoint')
+parser.add_argument('-predict', type=str, default=None, help='predict the sentence given')
+parser.add_argument('-test', action='store_true', default=False, help='train or test')
 args = parser.parse_args()
-args.cuda = not args.cpu and torch.cuda.is_available()
-args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
-args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d-%H-%M'))
+
 
 # load SST dataset
-'''
-print("Loading data...")
-text_field = data.Field(lower=True)
-label_field = data.Field(sequential=False)
-train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True)
-text_field.build_vocab(train_data, dev_data, test_data)
-label_field.build_vocab(train_data)
-train_iter, dev_iter, test_iter = data.BucketIterator.splits(
-    (train_data, dev_data, test_data),
-    batch_sizes=(args.batch_size,
-                 len(dev_data),
-                 len(test_data)),
-    device=-1, repeat=False)
-'''
+def sst(text_field, label_field, **kargs):
+    train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True)
+    text_field.build_vocab(train_data, dev_data, test_data)
+    label_field.build_vocab(train_data, dev_data, test_data)
+    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
+        (train_data, dev_data, test_data),
+        batch_sizes=(args.batch_size,
+                     len(dev_data),
+                     len(test_data)),
+        **kargs)
+    return train_iter, dev_iter, test_iter
+
 
 # load MR dataset
-print("Loading data...")
+def mr(text_field, label_field, **kargs):
+    train_data, dev_data = mydatasets.MR.splits(text_field, label_field)
+    text_field.build_vocab(train_data, dev_data)
+    label_field.build_vocab(train_data, dev_data)
+    train_iter, dev_iter = data.Iterator.splits(
+        (train_data, dev_data),
+        batch_sizes=(args.batch_size, len(dev_data)),
+        **kargs)
+    return train_iter, dev_iter
+
+
+# load data
+print("\nLoading data...")
 text_field = data.Field(lower=True)
 label_field = data.Field(sequential=False)
-train_data, dev_data = mydatasets.MR.splits(text_field, label_field)
-text_field.build_vocab(train_data, dev_data)
-label_field.build_vocab(train_data)
-train_iter, dev_iter = data.Iterator.splits((train_data, dev_data),
-                                            batch_sizes=(args.batch_size, len(dev_data)),
-                                            device=-1, repeat=False)
+#train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
+train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False)
 
-# args from vacab
+
+# update args and print
 args.embed_num = len(text_field.vocab)
+args.class_num = len(label_field.vocab) - 1
+args.cuda = (not args.no_cuda) and torch.cuda.is_available(); del args.no_cuda
+args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
+args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d-%H-%M'))
+
+print("\nParameters:")
+for attr, value in sorted(args.__dict__.items()):
+    print("\t{}={}".format(attr.upper(), value))
+
 
 # model
 if args.snapshot is None:
     cnn = model.CNN_Text(args)
 else :
-    print('Loading model from [%s]...' % args.snapshot)
-    cnn = torch.load(args.snapshot)
+    print('\nLoading model from [%s]...' % args.snapshot)
+    try:
+        cnn = torch.load(args.snapshot)
+    except:
+        print("Sorry, this snapshot doesn't exist."); exit()
+
 
 # train or predict
 if args.predict is not None:
     label = train.predict(args.predict, cnn, text_field, label_field)
-    print('\n[Text] %s'% args.predict)
-    print('[Label] %s\n'% label)
-else:
+    print('\n[Text]  {}\n[Label] {}\n'.format(args.predict, label))
+elif args.test:
+    try:
+        train.eval(test_iter, cnn, args)
+    except Exception as e:
+        print("\nSorry. The test dataset doesn't exist.\n")
+else:
+    print()
     train.train(train_iter, dev_iter, cnn, args)
+
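The new line `args.class_num = len(label_field.vocab) - 1` relies on torchtext reserving index 0 of a non-sequential field's vocabulary for `<unk>`, so the real labels start at index 1; this is also why `train.predict` adds 1 before looking the prediction up in `itos`. A minimal sketch of that assumption, using older torchtext and a toy label list that is illustrative only:

```
# Minimal sketch, assuming older torchtext's default '<unk>' handling for a
# non-sequential Field; the toy label list below is illustrative only.
from torchtext import data

label_field = data.Field(sequential=False)
label_field.build_vocab(['positive', 'negative', 'positive'])

print(label_field.vocab.itos)            # e.g. ['<unk>', 'positive', 'negative']
class_num = len(label_field.vocab) - 1   # drop the '<unk>' slot -> 2 real classes
# The model then predicts indices 0..class_num-1, and predict() shifts them
# back with +1 before mapping through vocab.itos.
```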

model.py

Lines changed: 8 additions & 3 deletions
@@ -16,10 +16,12 @@ def __init__(self, args):
         Ks = args.kernel_sizes
 
         self.embed = nn.Embedding(V, D)
-        #self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
+        self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
+        '''
         self.conv13 = nn.Conv2d(Ci, Co, (3, D))
         self.conv14 = nn.Conv2d(Ci, Co, (4, D))
         self.conv15 = nn.Conv2d(Ci, Co, (5, D))
+        '''
         self.dropout = nn.Dropout(args.dropout)
         self.fc1 = nn.Linear(len(Ks)*Co, C)
 
@@ -36,12 +38,15 @@ def forward(self, x):
         x = Variable(x)
 
         x = x.unsqueeze(1) # (N,Ci,W,D)
-        #x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks)
-        #x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)
+        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks)
+        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)
+        x = torch.cat(x, 1)
+        '''
         x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
         x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
         x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
         x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co)
+        '''
         x = self.dropout(x) # (N,len(Ks)*Co)
         logit = self.fc1(x) # (N,C)
         return logit
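Note that `self.convs1` is a plain Python list here, so the Conv2d layers are not registered as submodules and `model.parameters()` and `.cuda()` will not pick them up automatically. A hedged alternative sketch (not the repo's code) that wraps them in `nn.ModuleList` while keeping the same conv-relu-maxpool-concat flow:

```
# Sketch only: the class name and default sizes are illustrative, not the repo's.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNNSketch(nn.Module):
    def __init__(self, vocab_size=100, embed_dim=128, class_num=2,
                 kernel_num=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(TextCNNSketch, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        # nn.ModuleList registers each Conv2d, so parameters(), cuda() and
        # state_dict() all see the parallel convolution branches.
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, kernel_num, (k, embed_dim)) for k in kernel_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_num, class_num)

    def forward(self, x):                                        # x: (N, W) word indices
        x = self.embed(x).unsqueeze(1)                           # (N, 1, W, D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]  # [(N, Co, W'), ...]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]   # [(N, Co), ...]
        x = self.dropout(torch.cat(x, 1))                        # (N, len(Ks)*Co)
        return self.fc(x)                                        # (N, C)
```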

mydatasets.py

Lines changed: 33 additions & 6 deletions
@@ -1,20 +1,46 @@
 import re
 import os
 import random
+import tarfile
+from six.moves import urllib
 from torchtext import data
 
 
-class MR(data.ZipDataset):
+class TarDataset(data.Dataset):
+    """Defines a Dataset loaded from a downloadable tar archive.
+
+    Attributes:
+        url: URL where the tar archive can be downloaded.
+        filename: Filename of the downloaded tar archive.
+        dirname: Name of the top-level directory within the tar archive that
+            contains the data files.
+    """
+
+    @classmethod
+    def download_or_unzip(cls, root):
+        path = os.path.join(root, cls.dirname)
+        if not os.path.isdir(path):
+            tpath = os.path.join(root, cls.filename)
+            if not os.path.isfile(tpath):
+                print('downloading')
+                urllib.request.urlretrieve(cls.url, tpath)
+            with tarfile.open(tpath, 'r') as tfile:
+                print('extracting')
+                tfile.extractall(root)
+        return os.path.join(path, '')
+
+
+class MR(TarDataset):
 
     url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
     filename = 'rt-polaritydata.tar'
-    dirname = os.path.join('rt-polaritydata', 'rt-polaritydata')
+    dirname = 'rt-polaritydata'
 
     @staticmethod
     def sort_key(ex):
         return len(ex.text)
 
-    def __init__(self, text_field, label_field, path='.', examples=None, **kwargs):
+    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
         """Create an MR dataset instance given a path and fields.
 
         Arguments:
@@ -49,7 +75,7 @@ def clean_str(string):
         fields = [('text', text_field), ('label', label_field)]
 
         if examples is None:
-            path = os.path.expanduser(path)
+            path = self.dirname if path is None else path
            examples = []
            with open(os.path.join(path, 'rt-polarity.neg')) as f:
                examples += [
@@ -60,13 +86,14 @@ def clean_str(string):
         super(MR, self).__init__(examples, fields, **kwargs)
 
     @classmethod
-    def splits(cls, text_field, label_field, dev_ratio=.1, root='.', **kwargs):
+    def splits(cls, text_field, label_field, dev_ratio=.1, shuffle=True, root='.', **kwargs):
         """Create dataset objects for splits of the MR dataset.
 
         Arguments:
             text_field: The field that will be used for the sentence.
             label_field: The field that will be used for label data.
             dev_ratio: The ratio that will be used to get split validation dataset.
+            shuffle: Whether to shuffle the data before splitting.
             root: The root directory that the dataset's zip archive will be
                 expanded into; therefore the directory in whose trees
                 subdirectory the data files will be stored.
@@ -76,7 +103,7 @@ def splits(cls, text_field, label_field, dev_ratio=.1, root='.', **kwargs):
         """
         path = cls.download_or_unzip(root)
         examples = cls(text_field, label_field, path=path, **kwargs).examples
-        random.shuffle(examples)
+        if shuffle: random.shuffle(examples)
         dev_index = -1 * int(dev_ratio*len(examples))
 
         return (cls(text_field, label_field, examples=examples[:dev_index]),
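For reference, a minimal usage sketch of the new download path, mirroring the `mr()` helper in main.py and the older torchtext API it uses (the batch size and device values are illustrative):

```
# Minimal usage sketch: the first call downloads and extracts
# rt-polaritydata.tar.gz into ./rt-polaritydata via TarDataset.download_or_unzip,
# then shuffles the examples and splits off 10% as a dev set.
from torchtext import data
import mydatasets

text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)

train_data, dev_data = mydatasets.MR.splits(text_field, label_field)
text_field.build_vocab(train_data, dev_data)
label_field.build_vocab(train_data, dev_data)

train_iter, dev_iter = data.Iterator.splits(
    (train_data, dev_data),
    batch_sizes=(64, len(dev_data)),
    device=-1, repeat=False)
```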
File renamed without changes.
File renamed without changes.

train.py

Lines changed: 6 additions & 1 deletion
@@ -31,7 +31,11 @@ def train(train_iter, dev_iter, model, args):
                 corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
                 accuracy = corrects/batch.batch_size * 100.0
                 sys.stdout.write(
-                    '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%'.format(steps, loss.data[0], accuracy))
+                    '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(steps,
+                                                                            loss.data[0],
+                                                                            accuracy,
+                                                                            corrects,
+                                                                            batch.batch_size))
             if steps % args.test_interval == 0:
                 eval(dev_iter, model, args)
             if steps % args.save_interval == 0:
@@ -75,6 +79,7 @@ def predict(text, model, text_field, label_feild):
     text = [[text_field.vocab.stoi[x] for x in text]]
     x = text_field.tensor_type(text)
     x = autograd.Variable(x, volatile=True)
+    print(x)
     output = model(x)
     _, predicted = torch.max(output, 1)
     return label_feild.vocab.itos[predicted.data[0][0]+1]
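One caveat with the logged accuracy: `corrects / batch.batch_size` is integer division on Python 2, so the reported percentage collapses to 0 or 100. A small hedged sketch (not the repo's code) with explicit float division:

```
# Hedged sketch: batch accuracy with explicit float division.
import torch

def batch_accuracy(logit, target):
    """logit: (N, C) class scores; target: (N,) gold label indices."""
    preds = torch.max(logit, 1)[1].view(target.size())
    corrects = (preds.data == target.data).sum()
    return 100.0 * float(corrects) / target.size(0)
```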

trainDevTestTrees_PTB.zip

771 KB
Binary file not shown.
