diff --git a/cal_acc.py b/cal_acc.py
new file mode 100644
index 0000000..89cf9aa
--- /dev/null
+++ b/cal_acc.py
@@ -0,0 +1,146 @@
+#! /usr/bin/env python
+# Measure the sentiment-classification accuracy of generated samples with a trained CNN_Text model.
+import os
+import argparse
+import datetime
+import torch
+import torchtext.data as data
+import torchtext.datasets as datasets
+from main import imdb
+import model
+# import train
+import mydatasets
+import torch.autograd as autograd
+
+parser = argparse.ArgumentParser(description='CNN text classificer')
+# learning
+parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]')
+parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]')
+parser.add_argument('-batch-size', type=int, default=64, help='batch size for training [default: 64]')
+parser.add_argument('-log-interval', type=int, default=1,
+                    help='how many steps to wait before logging training status [default: 1]')
+parser.add_argument('-test-interval', type=int, default=1, help='how many steps to wait before testing [default: 1]')
+parser.add_argument('-save-interval', type=int, default=500, help='how many steps to wait before saving [default:500]')
+parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the snapshot')
+parser.add_argument('-early-stop', type=int, default=1000,
+                    help='iteration numbers to stop without performance increasing')
+parser.add_argument('-save-best', type=bool, default=True, help='whether to save when get best performance')
+# data
+parser.add_argument('-shuffle', action='store_true', default=False, help='shuffle the data every epoch')
+# model
+parser.add_argument('-dropout', type=float, default=0.5, help='the probability for dropout [default: 0.5]')
+parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]')
+parser.add_argument('-embed-dim', type=int, default=128, help='number of embedding dimension [default: 128]')
+parser.add_argument('-kernel-num', type=int, default=100, help='number of each kind of kernel')
+parser.add_argument('-kernel-sizes', type=str, default='3,4,5',
+                    help='comma-separated kernel size to use for convolution')
+parser.add_argument('-static', action='store_true', default=False, help='fix the embedding')
+# device
+parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data, -1 mean cpu [default: -1]')
+parser.add_argument('-no-cuda', action='store_true', default=False, help='disable the gpu')
+# option
+parser.add_argument('-snapshot', type=str, default=None, help='filename of model snapshot [default: None]')
+parser.add_argument('-predict', type=str, default=None, help='predict the sentence given')
+parser.add_argument('-test', action='store_true', default=False, help='train or test')
+args = parser.parse_args()
+
+# load data
+print("\nLoading data...")
+text_field = data.Field(lower=True)
+label_field = data.Field(sequential=False)
+
+train_iter, dev_iter, test_iter = imdb(text_field, label_field, device=-1, repeat=False)
+# train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
+# train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False)
+
+
+# update args and print
+args.embed_num = len(text_field.vocab)
+args.class_num = len(label_field.vocab) - 1
+args.cuda = (not args.no_cuda) and torch.cuda.is_available()
+del args.no_cuda
+args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
+args.save_dir = os.path.join(args.save_dir,
+                             datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
+print("\nParameters:")
+for attr, value in sorted(args.__dict__.items()):
+    print("\t{}={}".format(attr.upper(), value))
+
+# model
+cnn = model.CNN_Text(args)
+snapshot = '/Users/xuchen/core/pycharm/project/cnn-text-classification-pytorch/snapshot/best_steps_11513.pt'
+if snapshot is not None:
+    print('\nLoading model from {}...'.format(snapshot))
+    cnn.load_state_dict(torch.load(snapshot, map_location=torch.device('cpu')))
+
+if args.cuda:
+    torch.cuda.set_device(args.device)
+    cnn = cnn.cuda()
+
+
+def sent_acc(samples, model, text_field, cuda_flag, positive=True):
+    size = len(samples)
+    model.eval()
+    # text = text_field.tokenize(text)
+    outputs = torch.tensor([], dtype=torch.int64)
+    for sample in samples:
+        sample = text_field.preprocess(sample)
+        sample = [[text_field.vocab.stoi[x] for x in sample]]
+        # inputs.append(sample)
+        x = torch.tensor(sample)
+        x = autograd.Variable(x)
+        if cuda_flag:
+            x = x.cuda()
+        # print(x)
+        output = model(x)
+        _, predicted = torch.max(output, 1)  # index of the top-scoring class
+        # keep predictions on the CPU so the concatenation also works when the model runs on the GPU
+        outputs = torch.cat([outputs, predicted.cpu()])
+
+    target = [1] * size if positive else [0] * size
+    target = torch.tensor(target)
+    corrects = outputs == target
+    corrects = corrects.sum()
+    accuracy = 100.0 * corrects / size
+    # return label_feild.vocab.itos[predicted.data[0][0]+1]
+    return accuracy
+
+
+def cal_acc(sent_label, method_label, suffix, src):
+    src = '{}/{}/{}{}'.format(src, sent_label, method_label, suffix)
+    with open(src, 'r') as f:
+        samples = f.read().split('<|endoftext|>')
+        samples = samples[1:]
+    acc = sent_acc(samples, cnn, text_field, args.cuda,
+                   True if sent_label == 'positive' else False)
+    return acc
+
+
+def cal_accs(method_label, suffix, src):
+    print('{}:\n'.format(method_label))
+    pos_acc = cal_acc('positive', method_label, suffix, src)
+    print('pos_acc: {}'.format(pos_acc.item()))
+
+    neg_acc = cal_acc('negative', method_label, suffix, src)
+    print('neg_acc: {}'.format(neg_acc.item()))
+
+    print('mean_acc: {}'.format(((pos_acc + neg_acc) / 2).item()))
+
+
+SRC_SAMPLES = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/generated_samples'
+
+# # single or not
+# sent_label = [
+#     # 'positive',
+#     'negative'
+# ]
+
+# multiple
+method_label = [
+    'BC',
+    # 'BC_VAD',
+    # 'BC_VAD_ABS',
+]
+
+suffix = '(2_45_10)'
+
+cal_accs(method_label[0], suffix, SRC_SAMPLES)
+# neg_acc = cal_acc('negative', method_label[0], SRC_SAMPLES)
+# print('neg_acc: {}'.format(neg_acc.item()))
diff --git a/main.py b/main.py
index dd222a6..1cfe966 100755
--- a/main.py
+++ b/main.py
@@ -8,18 +8,22 @@
 import model
 import train
 import mydatasets
+import random
+import re
 
-
+SEED = 1234
 parser = argparse.ArgumentParser(description='CNN text classificer')
 # learning
 parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]')
 parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]')
 parser.add_argument('-batch-size', type=int, default=64, help='batch size for training [default: 64]')
-parser.add_argument('-log-interval', type=int, default=1, help='how many steps to wait before logging training status [default: 1]')
-parser.add_argument('-test-interval', type=int, default=100, help='how many steps to wait before testing [default: 100]')
+parser.add_argument('-log-interval', type=int, default=1,
+                    help='how many steps to wait before logging training status [default: 1]')
+parser.add_argument('-test-interval', type=int, default=1, help='how many steps to wait before testing [default: 1]')
 parser.add_argument('-save-interval', type=int, default=500, help='how many steps to wait before saving [default:500]')
 parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the snapshot')
-parser.add_argument('-early-stop', type=int, default=1000, help='iteration numbers to stop without performance increasing')
+parser.add_argument('-early-stop', type=int, default=1000,
+                    help='iteration numbers to stop without performance increasing')
 parser.add_argument('-save-best', type=bool, default=True, help='whether to save when get best performance')
 # data
 parser.add_argument('-shuffle', action='store_true', default=False, help='shuffle the data every epoch')
@@ -28,7 +32,8 @@
 parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]')
 parser.add_argument('-embed-dim', type=int, default=128, help='number of embedding dimension [default: 128]')
 parser.add_argument('-kernel-num', type=int, default=100, help='number of each kind of kernel')
-parser.add_argument('-kernel-sizes', type=str, default='3,4,5', help='comma-separated kernel size to use for convolution')
+parser.add_argument('-kernel-sizes', type=str, default='3,4,5',
+                    help='comma-separated kernel size to use for convolution')
 parser.add_argument('-static', action='store_true', default=False, help='fix the embedding')
 # device
 parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data, -1 mean cpu [default: -1]')
@@ -41,17 +46,55 @@
 # load SST dataset
-def sst(text_field, label_field, **kargs):
+def sst(text_field, label_field, **kargs):
     train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True)
     text_field.build_vocab(train_data, dev_data, test_data)
     label_field.build_vocab(train_data, dev_data, test_data)
     train_iter, dev_iter, test_iter = data.BucketIterator.splits(
-                                (train_data, dev_data, test_data),
-                                batch_sizes=(args.batch_size,
-                                             len(dev_data),
-                                             len(test_data)),
-                                **kargs)
-    return train_iter, dev_iter, test_iter
+        (train_data, dev_data, test_data),
+        batch_sizes=(args.batch_size,
+                     len(dev_data),
+                     len(test_data)),
+        **kargs)
+    return train_iter, dev_iter, test_iter
+
+
+def clean_str(string):
+    """
+    Tokenization/string cleaning for all datasets except for SST.
+    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
+    """
+    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
+    string = re.sub(r"\'s", " \'s", string)
+    string = re.sub(r"\'ve", " \'ve", string)
+    string = re.sub(r"n\'t", " n\'t", string)
+    string = re.sub(r"\'re", " \'re", string)
+    string = re.sub(r"\'d", " \'d", string)
+    string = re.sub(r"\'ll", " \'ll", string)
+    string = re.sub(r",", " , ", string)
+    string = re.sub(r"!", " ! ", string)
+    string = re.sub(r"\(", " \( ", string)
+    string = re.sub(r"\)", " \) ", string)
+    string = re.sub(r"\?", " \? ", string)
+    string = re.sub(r"\s{2,}", " ", string)
+    return string.strip()
+
+
+# load IMDB dataset
+def imdb(text_field, label_field, **kargs):
+    text_field.preprocessing = data.Pipeline(clean_str)
+    train_data, test_data = datasets.IMDB.splits(text_field, label_field)
+    train_data, dev_data = train_data.split(random_state=random.seed(SEED))
+    text_field.build_vocab(train_data, dev_data, test_data)
+    label_field.build_vocab(train_data, dev_data, test_data)
+    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
+        (train_data, dev_data, test_data),
+        batch_sizes=(args.batch_size,
+                     len(dev_data),
+                     len(test_data)),
+        **kargs)
+    return train_iter, dev_iter, test_iter
+    # return train_iter, dev_iter, test_iter
 
 
 # load MR dataset
@@ -60,57 +103,56 @@ def mr(text_field, label_field, **kargs):
     text_field.build_vocab(train_data, dev_data)
     label_field.build_vocab(train_data, dev_data)
     train_iter, dev_iter = data.Iterator.splits(
-                                (train_data, dev_data),
-                                batch_sizes=(args.batch_size, len(dev_data)),
-                                **kargs)
+        (train_data, dev_data),
+        batch_sizes=(args.batch_size, len(dev_data)),
+        **kargs)
     return train_iter, dev_iter
 
 
-# load data
-print("\nLoading data...")
-text_field = data.Field(lower=True)
-label_field = data.Field(sequential=False)
-train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
-# train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False)
-
-
-# update args and print
-args.embed_num = len(text_field.vocab)
-args.class_num = len(label_field.vocab) - 1
-args.cuda = (not args.no_cuda) and torch.cuda.is_available(); del args.no_cuda
-args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
-args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
-
-print("\nParameters:")
-for attr, value in sorted(args.__dict__.items()):
-    print("\t{}={}".format(attr.upper(), value))
-
-
-# model
-cnn = model.CNN_Text(args)
-if args.snapshot is not None:
-    print('\nLoading model from {}...'.format(args.snapshot))
-    cnn.load_state_dict(torch.load(args.snapshot))
-
-if args.cuda:
-    torch.cuda.set_device(args.device)
-    cnn = cnn.cuda()
-
-
-# train or predict
-if args.predict is not None:
-    label = train.predict(args.predict, cnn, text_field, label_field, args.cuda)
-    print('\n[Text] {}\n[Label] {}\n'.format(args.predict, label))
-elif args.test:
-    try:
-        train.eval(test_iter, cnn, args)
-    except Exception as e:
-        print("\nSorry. The test dataset doesn't exist.\n")
-else:
-    print()
-    try:
-        train.train(train_iter, dev_iter, cnn, args)
-    except KeyboardInterrupt:
-        print('\n' + '-' * 89)
-        print('Exiting from training early')
-
+if __name__ == '__main__':
+
+    # load data
+    print("\nLoading data...")
+    text_field = data.Field(lower=True)
+    label_field = data.Field(sequential=False)
+    # train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
+    train_iter, dev_iter, test_iter = imdb(text_field, label_field, device=-1, repeat=False)
+
+    # update args and print
+    args.embed_num = len(text_field.vocab)
+    args.class_num = len(label_field.vocab) - 1
+    args.cuda = (not args.no_cuda) and torch.cuda.is_available()
+    del args.no_cuda
+    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
+    args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
+
+    print("\nParameters:")
+    for attr, value in sorted(args.__dict__.items()):
+        print("\t{}={}".format(attr.upper(), value))
+
+    # model
+    cnn = model.CNN_Text(args)
+    if args.snapshot is not None:
+        print('\nLoading model from {}...'.format(args.snapshot))
+        cnn.load_state_dict(torch.load(args.snapshot))
+
+    if args.cuda:
+        torch.cuda.set_device(args.device)
+        cnn = cnn.cuda()
+
+    # train or predict
+    if args.predict is not None:
+        label = train.predict(args.predict, cnn, text_field, label_field, args.cuda)
+        print('\n[Text] {}\n[Label] {}\n'.format(args.predict, label))
+    elif args.test:
+        try:
+            train.eval(test_iter, cnn, args)
+        except Exception as e:
+            print("\nSorry. The test dataset doesn't exist.\n")
+    else:
+        print()
+        try:
+            train.train(train_iter, dev_iter, cnn, args)
+        except KeyboardInterrupt:
+            print('\n' + '-' * 89)
+            print('Exiting from training early')
diff --git a/train.py b/train.py
index 7f90aaa..b38f97f 100644
--- a/train.py
+++ b/train.py
@@ -15,29 +15,36 @@ def train(train_iter, dev_iter, model, args):
     best_acc = 0
     last_step = 0
     model.train()
-    for epoch in range(1, args.epochs+1):
+    for epoch in range(1, args.epochs + 1):
         for batch in train_iter:
             feature, target = batch.text, batch.label
-            feature.data.t_(), target.data.sub_(1)  # batch first, index align
+            # feature.data.t_(), target.data.sub_(1)  # batch first, index align
+            feature = feature.data.t()
+            target = target.data.sub(1)
             if args.cuda:
                 feature, target = feature.cuda(), target.cuda()
 
             optimizer.zero_grad()
             logit = model(feature)
 
-            #print('logit vector', logit.size())
-            #print('target vector', target.size())
+            # print('logit vector', logit.size())
+            # print('target vector', target.size())
             loss = F.cross_entropy(logit, target)
             loss.backward()
             optimizer.step()
 
             steps += 1
             if steps % args.log_interval == 0:
-                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
-                accuracy = 100.0 * corrects/batch.batch_size
+                predicted = torch.max(logit, 1)[1]  # index of the top-scoring class per example
+                corrects = (predicted.view(target.size()).data == target.data).sum()
+                accuracy = 100.0 * corrects / batch.batch_size
                 sys.stdout.write(
-                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
-                                                                             loss.data[0],
+                    '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(steps,
+                                                                            loss.item(),
                                                                             accuracy,
                                                                             corrects,
                                                                             batch.batch_size))
@@ -60,27 +67,30 @@ def eval(data_iter, model, args):
     corrects, avg_loss = 0, 0
     for batch in data_iter:
         feature, target = batch.text, batch.label
-        feature.data.t_(), target.data.sub_(1)  # batch first, index align
+        # feature.data.t_(), target.data.sub_(1)  # batch first, index align
+        feature = feature.data.t()
+        target = target.data.sub(1)
         if args.cuda:
             feature, target = feature.cuda(), target.cuda()
 
         logit = model(feature)
         loss = F.cross_entropy(logit, target, size_average=False)
 
-        avg_loss += loss.data[0]
+        avg_loss += loss.item()
         corrects += (torch.max(logit, 1)
                      [1].view(target.size()).data == target.data).sum()
 
     size = len(data_iter.dataset)
     avg_loss /= size
-    accuracy = 100.0 * corrects/size
-    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss,
-                                                                       accuracy,
-                                                                       corrects,
+    accuracy = 100.0 * corrects / size
+    print('\nEvaluation - loss: {:.6f} acc: {:.4f}%({}/{}) \n'.format(avg_loss,
+                                                                      accuracy,
+                                                                      corrects,
                                                                       size))
     return accuracy
 
+
 def predict(text, model, text_field, label_feild, cuda_flag):
     assert isinstance(text, str)
     model.eval()
@@ -94,8 +104,8 @@ def predict(text, model, text_field, label_feild, cuda_flag):
     print(x)
     output = model(x)
     _, predicted = torch.max(output, 1)
-    #return label_feild.vocab.itos[predicted.data[0][0]+1]
-    return label_feild.vocab.itos[predicted.data[0]+1]
+    # return label_feild.vocab.itos[predicted.data[0][0]+1]
+    return label_feild.vocab.itos[predicted.data[0] + 1]
 
 
 def save(model, save_dir, save_prefix, steps):
@@ -104,3 +114,6 @@ def save(model, save_dir, save_prefix, steps):
     save_prefix = os.path.join(save_dir, save_prefix)
     save_path = '{}_steps_{}.pt'.format(save_prefix, steps)
     torch.save(model.state_dict(), save_path)
+
+if __name__ == '__main__':
+    pass  # train.py only defines helpers; it is driven by main.py