From 42c1ff1fd25a81f5f62665dc7ca0858f87a0cd8a Mon Sep 17 00:00:00 2001
From: xuchen
Date: Wed, 24 Jun 2020 07:12:33 +0800
Subject: [PATCH 1/4] sent_acc

---
 main.py               |  85 +++++++++++++++-----
 sentiment_accuracy.py | 183 ++++++++++++++++++++++++++++++++++++++++++
 train.py              |  42 +++++++----
 3 files changed, 272 insertions(+), 38 deletions(-)
 create mode 100644 sentiment_accuracy.py

diff --git a/main.py b/main.py
index dd222a6..223bd68 100755
--- a/main.py
+++ b/main.py
@@ -8,18 +8,23 @@
 import model
 import train
 import mydatasets
+import random
+import re
 
+SEED = 1234
 
 parser = argparse.ArgumentParser(description='CNN text classificer')
 # learning
 parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]')
 parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]')
 parser.add_argument('-batch-size', type=int, default=64, help='batch size for training [default: 64]')
-parser.add_argument('-log-interval', type=int, default=1, help='how many steps to wait before logging training status [default: 1]')
-parser.add_argument('-test-interval', type=int, default=100, help='how many steps to wait before testing [default: 100]')
+parser.add_argument('-log-interval', type=int, default=1,
+                    help='how many steps to wait before logging training status [default: 1]')
+parser.add_argument('-test-interval', type=int, default=1, help='how many steps to wait before testing [default: 1]')
 parser.add_argument('-save-interval', type=int, default=500, help='how many steps to wait before saving [default:500]')
 parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the snapshot')
-parser.add_argument('-early-stop', type=int, default=1000, help='iteration numbers to stop without performance increasing')
+parser.add_argument('-early-stop', type=int, default=1000,
+                    help='iteration numbers to stop without performance increasing')
 parser.add_argument('-save-best', type=bool, default=True, help='whether to save when get best performance')
 # data
 parser.add_argument('-shuffle', action='store_true', default=False, help='shuffle the data every epoch')
@@ -28,7 +33,8 @@
 parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]')
 parser.add_argument('-embed-dim', type=int, default=128, help='number of embedding dimension [default: 128]')
 parser.add_argument('-kernel-num', type=int, default=100, help='number of each kind of kernel')
-parser.add_argument('-kernel-sizes', type=str, default='3,4,5', help='comma-separated kernel size to use for convolution')
+parser.add_argument('-kernel-sizes', type=str, default='3,4,5',
+                    help='comma-separated kernel size to use for convolution')
 parser.add_argument('-static', action='store_true', default=False, help='fix the embedding')
 # device
 parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data, -1 mean cpu [default: -1]')
@@ -41,17 +47,55 @@
 
 
 # load SST dataset
-def sst(text_field, label_field, **kargs):
+def sst(text_field, label_field, **kargs):
     train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True)
     text_field.build_vocab(train_data, dev_data, test_data)
     label_field.build_vocab(train_data, dev_data, test_data)
     train_iter, dev_iter, test_iter = data.BucketIterator.splits(
-                                        (train_data, dev_data, test_data),
-                                        batch_sizes=(args.batch_size,
-                                                     len(dev_data),
-                                                     len(test_data)),
-                                        **kargs)
-    return train_iter, dev_iter, test_iter
+        (train_data, dev_data, test_data),
+        batch_sizes=(args.batch_size,
+                     len(dev_data),
+                     len(test_data)),
+        **kargs)
+    return train_iter, dev_iter, test_iter
+
+
+def clean_str(string):
+    """
+    Tokenization/string cleaning for all datasets except for SST.
+    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
+    """
+    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
+    string = re.sub(r"\'s", " \'s", string)
+    string = re.sub(r"\'ve", " \'ve", string)
+    string = re.sub(r"n\'t", " n\'t", string)
+    string = re.sub(r"\'re", " \'re", string)
+    string = re.sub(r"\'d", " \'d", string)
+    string = re.sub(r"\'ll", " \'ll", string)
+    string = re.sub(r",", " , ", string)
+    string = re.sub(r"!", " ! ", string)
+    string = re.sub(r"\(", " \( ", string)
+    string = re.sub(r"\)", " \) ", string)
+    string = re.sub(r"\?", " \? ", string)
+    string = re.sub(r"\s{2,}", " ", string)
+    return string.strip()
+
+
+# load IMDB dataset
+def imdb(text_field, label_field, **kargs):
+    text_field.preprocessing = data.Pipeline(clean_str)
+    train_data, test_data = datasets.IMDB.splits(text_field, label_field)
+    train_data, dev_data = train_data.split(random_state=random.seed(SEED))
+    text_field.build_vocab(train_data, dev_data, test_data)
+    label_field.build_vocab(train_data, dev_data, test_data)
+    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
+        (train_data, dev_data, test_data),
+        batch_sizes=(args.batch_size,
+                     len(dev_data),
+                     len(test_data)),
+        **kargs)
+    return train_iter, dev_iter, test_iter
+
+    # return train_iter, dev_iter, test_iter
 
 
 # load MR dataset
@@ -60,9 +104,9 @@ def mr(text_field, label_field, **kargs):
     text_field.build_vocab(train_data, dev_data)
     label_field.build_vocab(train_data, dev_data)
     train_iter, dev_iter = data.Iterator.splits(
-                                (train_data, dev_data),
-                                batch_sizes=(args.batch_size, len(dev_data)),
-                                **kargs)
+        (train_data, dev_data),
+        batch_sizes=(args.batch_size, len(dev_data)),
+        **kargs)
     return train_iter, dev_iter
 
 
@@ -70,14 +114,14 @@ def mr(text_field, label_field, **kargs):
 print("\nLoading data...")
 text_field = data.Field(lower=True)
 label_field = data.Field(sequential=False)
-train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
-# train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False)
-
+# train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
+train_iter, dev_iter, test_iter = imdb(text_field, label_field, device=-1, repeat=False)
 
 # update args and print
 args.embed_num = len(text_field.vocab)
 args.class_num = len(label_field.vocab) - 1
-args.cuda = (not args.no_cuda) and torch.cuda.is_available(); del args.no_cuda
+args.cuda = (not args.no_cuda) and torch.cuda.is_available()
+del args.no_cuda
 args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
 args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
 
@@ -85,7 +129,6 @@ def mr(text_field, label_field, **kargs):
 for attr, value in sorted(args.__dict__.items()):
     print("\t{}={}".format(attr.upper(), value))
 
-
 # model
 cnn = model.CNN_Text(args)
 if args.snapshot is not None:
@@ -95,7 +138,6 @@ def mr(text_field, label_field, **kargs):
 if args.cuda:
     torch.cuda.set_device(args.device)
     cnn = cnn.cuda()
-
 
 # train or predict
 if args.predict is not None:
@@ -103,7 +145,7 @@ def mr(text_field, label_field, **kargs):
     print('\n[Text] {}\n[Label] {}\n'.format(args.predict, label))
 elif args.test:
     try:
-        train.eval(test_iter, cnn, args)
+        train.eval(test_iter, cnn, args)
     except Exception as e:
         print("\nSorry. The test dataset doesn't exist.\n")
 else:
@@ -113,4 +155,3 @@ def mr(text_field, label_field, **kargs):
     except KeyboardInterrupt:
         print('\n' + '-' * 89)
         print('Exiting from training early')
-
diff --git a/sentiment_accuracy.py b/sentiment_accuracy.py
new file mode 100644
index 0000000..1bddae6
--- /dev/null
+++ b/sentiment_accuracy.py
@@ -0,0 +1,183 @@
+#! /usr/bin/env python
+import os
+import argparse
+import datetime
+import torch
+import torchtext.data as data
+import torchtext.datasets as datasets
+import model
+# import train
+import mydatasets
+import torch.autograd as autograd
+
+parser = argparse.ArgumentParser(description='CNN text classifier')
+# learning
+parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]')
+parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]')
+parser.add_argument('-batch-size', type=int, default=64, help='batch size for training [default: 64]')
+parser.add_argument('-log-interval', type=int, default=1,
+                    help='how many steps to wait before logging training status [default: 1]')
+parser.add_argument('-test-interval', type=int, default=1, help='how many steps to wait before testing [default: 1]')
+parser.add_argument('-save-interval', type=int, default=500, help='how many steps to wait before saving [default:500]')
+parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the snapshot')
+parser.add_argument('-early-stop', type=int, default=1000,
+                    help='iteration numbers to stop without performance increasing')
+parser.add_argument('-save-best', type=bool, default=True, help='whether to save when getting best performance')
+# data
+parser.add_argument('-shuffle', action='store_true', default=False, help='shuffle the data every epoch')
+# model
+parser.add_argument('-dropout', type=float, default=0.5, help='the probability for dropout [default: 0.5]')
+parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]')
+parser.add_argument('-embed-dim', type=int, default=128, help='number of embedding dimension [default: 128]')
+parser.add_argument('-kernel-num', type=int, default=100, help='number of each kind of kernel')
+parser.add_argument('-kernel-sizes', type=str, default='3,4,5',
+                    help='comma-separated kernel size to use for convolution')
+parser.add_argument('-static', action='store_true', default=False, help='fix the embedding')
+# device
+parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data, -1 means cpu [default: -1]')
+parser.add_argument('-no-cuda', action='store_true', default=False, help='disable the gpu')
+# option
+parser.add_argument('-snapshot', type=str, default=None, help='filename of model snapshot [default: None]')
+parser.add_argument('-predict', type=str, default=True, help='predict the sentence given')
+parser.add_argument('-test', action='store_true', default=False, help='train or test')
+args = parser.parse_args()
+
+
+# load SST dataset
+def sst(text_field, label_field, **kargs):
+    train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True)
+    text_field.build_vocab(train_data, dev_data, test_data)
+    label_field.build_vocab(train_data, dev_data, test_data)
+    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
+        (train_data, dev_data, test_data),
+        batch_sizes=(args.batch_size,
+                     len(dev_data),
+                     len(test_data)),
+        **kargs)
+    return train_iter, dev_iter, test_iter
+
+
+# load MR dataset
+def mr(text_field, label_field, **kargs):
+    train_data, dev_data = mydatasets.MR.splits(text_field, label_field)
+    text_field.build_vocab(train_data, dev_data)
+    label_field.build_vocab(train_data, dev_data)
+    train_iter, dev_iter = data.Iterator.splits(
+        (train_data, dev_data),
+        batch_sizes=(args.batch_size, len(dev_data)),
+        **kargs)
+    return train_iter, dev_iter
+
+
+# load data
+print("\nLoading data...")
+text_field = data.Field(lower=True)
+label_field = data.Field(sequential=False)
+train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
+# train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False)
+
+
+# update args and print
+args.embed_num = len(text_field.vocab)
+args.class_num = len(label_field.vocab) - 1
+args.cuda = (not args.no_cuda) and torch.cuda.is_available()
+del args.no_cuda
+args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
+args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
+print("\nParameters:")
+for attr, value in sorted(args.__dict__.items()):
+    print("\t{}={}".format(attr.upper(), value))
+
+# model
+cnn = model.CNN_Text(args)
+snapshot = '/Users/xuchen/core/pycharm/project/cnn-text-classification-pytorch/snapshot/best_steps_11513.pt'
+if snapshot is not None:
+    print('\nLoading model from {}...'.format(snapshot))
+    cnn.load_state_dict(torch.load(snapshot))
+
+if args.cuda:
+    torch.cuda.set_device(args.device)
+    cnn = cnn.cuda()
+
+
+def sent_acc(samples, model, text_field, cuda_flag, positive=True, ):
+    size = len(samples)
+    model.eval()
+    # text = text_field.tokenize(text)
+    outputs = torch.tensor([], dtype=torch.int64)
+    for sample in samples:
+        sample = text_field.preprocess(sample)
+        sample = [[text_field.vocab.stoi[x] for x in sample]]
+        # inputs.append(sample)
+        x = torch.tensor(sample)
+        x = autograd.Variable(x)
+        if cuda_flag:
+            x = x.cuda()
+        # print(x)
+        output = model(x)
+        _, predicted = torch.max(output, 1)  # logits
+        outputs = torch.cat([outputs, predicted])
+
+    target = [1] * size if positive else [0] * size
+    target = torch.tensor(target)
+    corrects = outputs == target
+    corrects = corrects.sum()
+    accuracy = 100.0 * corrects / size
+    # return label_feild.vocab.itos[predicted.data[0][0]+1]
+    return accuracy
+
+
+def calculate_acc(file_pos, file_neg, label):
+    # mean_acc, pos_acc, neg_acc = None, None, None
+    file_pos = '{}/{}'.format(file_pos, label)
+    with open(file_pos, 'r') as f:
+        samples = f.read().split('<|endoftext|>')
+        samples = [s for s in samples if len(s.split()) > 20]
+
+    # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
+    pos_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=True)
+    # print('{} = {}'.format(l, pos_acc))
+
+    file_neg = '{}/{}'.format(file_neg, label)
+    with open(file_neg, 'r') as f:
+        samples = f.read().split('<|endoftext|>')
+        samples = [s for s in samples if len(s.split()) > 20]
+
+    # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
+    neg_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=False)
+    # print('{} = {}'.format(l, neg_acc))
+    mean_acc = (pos_acc + neg_acc) / 2
+    return mean_acc, pos_acc, neg_acc
+
+
+# train or predict
+label = ['B', 'BR', 'BC', 'BCR']
+file_pos = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/vad_abs/positive'
+file_neg = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/vad_abs/negative'
+# file_pos = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/pplm/reversed/positive'
+# file_neg = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/pplm/reversed/negative'
+mean_acc, pos_acc, neg_acc = calculate_acc(file_pos, file_neg, 'BC')
+print(pos_acc)
+print(neg_acc)
+print(mean_acc.item())
+
+# for l in label:
+#     pos_acc, neg_acc = None, None
+#     with open('/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/positive/{}'.format(l), 'r') as f:
+#         samples = f.read().split('<|endoftext|>')
+#         samples = [s for s in samples if len(s.split()) > 20]
+#
+#         # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
+#         pos_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=True)
+#         # print('{} = {}'.format(l, pos_acc))
+#
+#     with open('/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/negative/{}'.format(l), 'r') as f:
+#         samples = f.read().split('<|endoftext|>')
+#         samples = [s for s in samples if len(s.split()) > 20]
+#
+#         # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
+#         neg_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=False)
+#         # print('{} = {}'.format(l, neg_acc))
+#
+#     print('{} = {}'.format(l, (pos_acc + neg_acc) / 2))
+
+
diff --git a/train.py b/train.py
index 7f90aaa..b38f97f 100644
--- a/train.py
+++ b/train.py
@@ -15,29 +15,32 @@ def train(train_iter, dev_iter, model, args):
     best_acc = 0
     last_step = 0
     model.train()
-    for epoch in range(1, args.epochs+1):
+    for epoch in range(1, args.epochs + 1):
         for batch in train_iter:
             feature, target = batch.text, batch.label
-            feature.data.t_(), target.data.sub_(1)  # batch first, index align
+            # feature.data.t_(), target.data.sub_(1)  # batch first, index align
+            feature = feature.data.t()
+            target = target.data.sub(1)
             if args.cuda:
                 feature, target = feature.cuda(), target.cuda()
 
             optimizer.zero_grad()
             logit = model(feature)
 
-            #print('logit vector', logit.size())
-            #print('target vector', target.size())
+            # print('logit vector', logit.size())
+            # print('target vector', target.size())
             loss = F.cross_entropy(logit, target)
             loss.backward()
             optimizer.step()
 
             steps += 1
             if steps % args.log_interval == 0:
-                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
-                accuracy = 100.0 * corrects/batch.batch_size
+                predictions = torch.max(logit, 1)[1]
+                corrects = (predictions.view(target.size()).data == target.data).sum()
+                accuracy = 100.0 * corrects / batch.batch_size
                 sys.stdout.write(
-                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
-                                                                             loss.data[0],
+                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
+                                                                             loss.item(),
                                                                              accuracy,
                                                                              corrects,
                                                                              batch.batch_size))
@@ -60,27 +63,30 @@ def eval(data_iter, model, args):
     corrects, avg_loss = 0, 0
     for batch in data_iter:
         feature, target = batch.text, batch.label
-        feature.data.t_(), target.data.sub_(1)  # batch first, index align
+        # feature.data.t_(), target.data.sub_(1)  # batch first, index align
+        feature = feature.data.t()
+        target = target.data.sub(1)
         if args.cuda:
            feature, target = feature.cuda(), target.cuda()
 
        logit = model(feature)
        loss = F.cross_entropy(logit, target, size_average=False)
 
-        avg_loss += loss.data[0]
+        avg_loss += loss.item()
         corrects += (torch.max(logit, 1)
                      [1].view(target.size()).data == target.data).sum()
 
     size = len(data_iter.dataset)
     avg_loss /= size
-    accuracy = 100.0 * corrects/size
-    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss,
-                                                                       accuracy,
-                                                                       corrects,
+    accuracy = 100.0 * corrects / size
+    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss,
+                                                                       accuracy,
+                                                                       corrects,
                                                                        size))
     return accuracy
 
+
 def predict(text, model, text_field, label_feild, cuda_flag):
     assert isinstance(text, str)
     model.eval()
@@ -94,8 +100,8 @@ def predict(text, model, text_field, label_feild, cuda_flag):
     print(x)
     output = model(x)
     _, predicted = torch.max(output, 1)
-    #return label_feild.vocab.itos[predicted.data[0][0]+1]
-    return label_feild.vocab.itos[predicted.data[0]+1]
+    # return label_feild.vocab.itos[predicted.data[0][0]+1]
+    return label_feild.vocab.itos[predicted.data[0] + 1]
 
 
 def save(model, save_dir, save_prefix, steps):
@@ -104,3 +110,7 @@ def save(model, save_dir, save_prefix, steps):
     save_prefix = os.path.join(save_dir, save_prefix)
     save_path = '{}_steps_{}.pt'.format(save_prefix, steps)
     torch.save(model.state_dict(), save_path)
+
+if __name__ == '__main__':
+    # predict() requires (text, model, text_field, label_feild, cuda_flag); see main.py for a complete call.
+    pass

From ca59e02ae57daee6d1d56879729dd4eb90e14862 Mon Sep 17 00:00:00 2001
From: xuchen
Date: Wed, 24 Jun 2020 13:45:59 +0800
Subject: [PATCH 2/4] sent_acc

---
 main.py               |  93 ++++++++++++++++-------------
 sentiment_accuracy.py | 100 ++++++++++++++++--------------------------
 2 files changed, 90 insertions(+), 103 deletions(-)

diff --git a/main.py b/main.py
index 223bd68..1cfe966 100755
--- a/main.py
+++ b/main.py
@@ -12,7 +12,6 @@
 import re
 
 SEED = 1234
-
 parser = argparse.ArgumentParser(description='CNN text classificer')
 # learning
 parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]')
@@ -110,48 +109,50 @@ def mr(text_field, label_field, **kargs):
     return train_iter, dev_iter
 
 
-# load data
-print("\nLoading data...")
-text_field = data.Field(lower=True)
-label_field = data.Field(sequential=False)
-# train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
-train_iter, dev_iter, test_iter = imdb(text_field, label_field, device=-1, repeat=False)
-
-# update args and print
-args.embed_num = len(text_field.vocab)
-args.class_num = len(label_field.vocab) - 1
-args.cuda = (not args.no_cuda) and torch.cuda.is_available()
-del args.no_cuda
-args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
-args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
-
-print("\nParameters:")
-for attr, value in sorted(args.__dict__.items()):
-    print("\t{}={}".format(attr.upper(), value))
-
-# model
-cnn = model.CNN_Text(args)
-if args.snapshot is not None:
-    print('\nLoading model from {}...'.format(args.snapshot))
-    cnn.load_state_dict(torch.load(args.snapshot))
-
-if args.cuda:
-    torch.cuda.set_device(args.device)
-    cnn = cnn.cuda()
-
-# train or predict
-if args.predict is not None:
-    label = train.predict(args.predict, cnn, text_field, label_field, args.cuda)
-    print('\n[Text] {}\n[Label] {}\n'.format(args.predict, label))
-elif args.test:
-    try:
-        train.eval(test_iter, cnn, args)
-    except Exception as e:
-        print("\nSorry. The test dataset doesn't exist.\n")
-else:
-    print()
-    try:
-        train.train(train_iter, dev_iter, cnn, args)
-    except KeyboardInterrupt:
-        print('\n' + '-' * 89)
-        print('Exiting from training early')
+if __name__ == '__main__':
+
+    # load data
+    print("\nLoading data...")
+    text_field = data.Field(lower=True)
+    label_field = data.Field(sequential=False)
+    # train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
+    train_iter, dev_iter, test_iter = imdb(text_field, label_field, device=-1, repeat=False)
+
+    # update args and print
+    args.embed_num = len(text_field.vocab)
+    args.class_num = len(label_field.vocab) - 1
+    args.cuda = (not args.no_cuda) and torch.cuda.is_available()
+    del args.no_cuda
+    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
+    args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
+
+    print("\nParameters:")
+    for attr, value in sorted(args.__dict__.items()):
+        print("\t{}={}".format(attr.upper(), value))
+
+    # model
+    cnn = model.CNN_Text(args)
+    if args.snapshot is not None:
+        print('\nLoading model from {}...'.format(args.snapshot))
+        cnn.load_state_dict(torch.load(args.snapshot))
+
+    if args.cuda:
+        torch.cuda.set_device(args.device)
+        cnn = cnn.cuda()
+
+    # train or predict
+    if args.predict is not None:
+        label = train.predict(args.predict, cnn, text_field, label_field, args.cuda)
+        print('\n[Text] {}\n[Label] {}\n'.format(args.predict, label))
+    elif args.test:
+        try:
+            train.eval(test_iter, cnn, args)
+        except Exception as e:
+            print("\nSorry. The test dataset doesn't exist.\n")
+    else:
+        print()
+        try:
+            train.train(train_iter, dev_iter, cnn, args)
+        except KeyboardInterrupt:
+            print('\n' + '-' * 89)
+            print('Exiting from training early')
diff --git a/sentiment_accuracy.py b/sentiment_accuracy.py
index 1bddae6..e86f381 100644
--- a/sentiment_accuracy.py
+++ b/sentiment_accuracy.py
@@ -5,6 +5,7 @@
 import torch
 import torchtext.data as data
 import torchtext.datasets as datasets
+from main import imdb
 import model
 # import train
 import mydatasets
 import torch.autograd as autograd
@@ -42,38 +43,13 @@
 parser.add_argument('-test', action='store_true', default=False, help='train or test')
 args = parser.parse_args()
 
-
-# load SST dataset
-def sst(text_field, label_field, **kargs):
-    train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True)
-    text_field.build_vocab(train_data, dev_data, test_data)
-    label_field.build_vocab(train_data, dev_data, test_data)
-    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
-        (train_data, dev_data, test_data),
-        batch_sizes=(args.batch_size,
-                     len(dev_data),
-                     len(test_data)),
-        **kargs)
-    return train_iter, dev_iter, test_iter
-
-
-# load MR dataset
-def mr(text_field, label_field, **kargs):
-    train_data, dev_data = mydatasets.MR.splits(text_field, label_field)
-    text_field.build_vocab(train_data, dev_data)
-    label_field.build_vocab(train_data, dev_data)
-    train_iter, dev_iter = data.Iterator.splits(
-        (train_data, dev_data),
-        batch_sizes=(args.batch_size, len(dev_data)),
-        **kargs)
-    return train_iter, dev_iter
-
-
 # load data
 print("\nLoading data...")
 text_field = data.Field(lower=True)
 label_field = data.Field(sequential=False)
-train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
+
+train_iter, dev_iter, test_iter = imdb(text_field, label_field, device=-1, repeat=False)
+# train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
 # train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False)
@@ -93,7 +69,7 @@ def mr(text_field, label_field, **kargs):
 snapshot = '/Users/xuchen/core/pycharm/project/cnn-text-classification-pytorch/snapshot/best_steps_11513.pt'
 if snapshot is not None:
     print('\nLoading model from {}...'.format(snapshot))
-    cnn.load_state_dict(torch.load(snapshot))
+    cnn.load_state_dict(torch.load(snapshot, map_location=torch.device('cpu')))
 
 if args.cuda:
     torch.cuda.set_device(args.device)
     cnn = cnn.cuda()
@@ -126,39 +102,51 @@ def sent_acc(samples, model, text_field, cuda_flag, positive=True, ):
     # return label_feild.vocab.itos[predicted.data[0][0]+1]
     return accuracy
 
-def calculate_acc(file_pos, file_neg, label):
-    # mean_acc, pos_acc, neg_acc = None, None, None
-    file_pos = '{}/{}'.format(file_pos, label)
-    with open(file_pos, 'r') as f:
-        samples = f.read().split('<|endoftext|>')
-        samples = [s for s in samples if len(s.split()) > 20]
-
-    # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
-    pos_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=True)
-    # print('{} = {}'.format(l, pos_acc))
+
+def calculate_acc(method_dirs, label):
+    for method_dir in method_dirs:
+        # mean_acc, pos_acc, neg_acc = None, None, None
+        file_pos = '{}/positive/{}'.format(method_dir, label)
+        with open(file_pos, 'r') as f:
+            samples = f.read().split('<|endoftext|>')
+            samples = [s for s in samples if len(s.split()) > 20]
+
+        # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
+        pos_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=True)
+        # print('{} = {}'.format(l, pos_acc))
 
-    file_neg = '{}/{}'.format(file_neg, label)
-    with open(file_neg, 'r') as f:
-        samples = f.read().split('<|endoftext|>')
-        samples = [s for s in samples if len(s.split()) > 20]
+        file_neg = '{}/negative/{}'.format(method_dir, label)
+        with open(file_neg, 'r') as f:
+            samples = f.read().split('<|endoftext|>')
+            samples = [s for s in samples if len(s.split()) > 20]
 
-    # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
-    neg_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=False)
-    # print('{} = {}'.format(l, neg_acc))
-    mean_acc = (pos_acc + neg_acc) / 2
-    return mean_acc, pos_acc, neg_acc
+        # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
+        neg_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=False)
+        # print('{} = {}'.format(l, neg_acc))
+        mean_acc = (pos_acc + neg_acc) / 2
+        print(method_dir)
+        print(mean_acc.item(), pos_acc, neg_acc)
 
 
 # train or predict
+
+pplm_generated_dir = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/pplm/generated'
+pplm_reversed_dir = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/pplm/reversed'
+vad_dir = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/vad'
+vad_abs_dir = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/vad_abs'
+
 label = ['B', 'BR', 'BC', 'BCR']
-file_pos = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/vad_abs/positive'
-file_neg = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/vad_abs/negative'
-# file_pos = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/pplm/reversed/positive'
-# file_neg = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/pplm/reversed/negative'
-mean_acc, pos_acc, neg_acc = calculate_acc(file_pos, file_neg, 'BC')
-print(pos_acc)
-print(neg_acc)
-print(mean_acc.item())
+method_dirs = [
+    pplm_generated_dir,
+    pplm_reversed_dir,
+    vad_dir,
+    vad_abs_dir
+]
+
+calculate_acc(method_dirs, label[2])
+# print('Positive: {}'.format(pos_acc))
+# print('Negative: {}'.format(neg_acc))
+# print('Mean: {}'.format(mean_acc.item()))
 
 # for l in label:
 #     pos_acc, neg_acc = None, None
 #     with open('/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/positive/{}'.format(l), 'r') as f:
 #         samples = f.read().split('<|endoftext|>')
 #         samples = [s for s in samples if len(s.split()) > 20]
 #
 #         # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
 #         pos_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=True)
 #         # print('{} = {}'.format(l, pos_acc))
 #
 #     with open('/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/negative/{}'.format(l), 'r') as f:
 #         samples = f.read().split('<|endoftext|>')
 #         samples = [s for s in samples if len(s.split()) > 20]
 #
 #         # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
 #         neg_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=False)
 #         # print('{} = {}'.format(l, neg_acc))
 #
 #     print('{} = {}'.format(l, (pos_acc + neg_acc) / 2))
-
-

From f41ad6be7cc4b87097f03fd5d591a9c2f6c7dd83 Mon Sep 17 00:00:00 2001
From: xuchen
Date: Mon, 29 Jun 2020 14:12:38 +0800
Subject: [PATCH 3/4] sent_acc

---
 sentiment_accuracy.py | 100 ++++++++++++++++--------------------------
 1 file changed, 38 insertions(+), 62 deletions(-)

diff --git a/sentiment_accuracy.py b/sentiment_accuracy.py
index e86f381..89cf9aa 100644
--- a/sentiment_accuracy.py
+++ b/sentiment_accuracy.py
@@ -103,67 +103,43 @@ def sent_acc(samples, model, text_field, cuda_flag, positive=True, ):
     return accuracy
 
 
-def calculate_acc(method_dirs, label):
-    for method_dir in method_dirs:
-        # mean_acc, pos_acc, neg_acc = None, None, None
-        file_pos = '{}/positive/{}'.format(method_dir, label)
-        with open(file_pos, 'r') as f:
-            samples = f.read().split('<|endoftext|>')
-            samples = [s for s in samples if len(s.split()) > 20]
-
-        # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
-        pos_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=True)
-        # print('{} = {}'.format(l, pos_acc))
-
-        file_neg = '{}/negative/{}'.format(method_dir, label)
-        with open(file_neg, 'r') as f:
-            samples = f.read().split('<|endoftext|>')
-            samples = [s for s in samples if len(s.split()) > 20]
-
-        # samples = ['I love you so much !', 'So cool good . happy birthday ! ']
-        neg_acc = sent_acc(samples, cnn, text_field, args.cuda, positive=False)
-        # print('{} = {}'.format(l, neg_acc))
-        mean_acc = (pos_acc + neg_acc) / 2
-        print(method_dir)
-        print(mean_acc.item(), pos_acc, neg_acc)
-
-
-# train or predict
-
-pplm_generated_dir = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/pplm/generated'
-pplm_reversed_dir = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/pplm/reversed'
-vad_dir = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/vad'
-vad_abs_dir = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/vad_abs'
-
-label = ['B', 'BR', 'BC', 'BCR']
-method_dirs = [
-    pplm_generated_dir,
-    pplm_reversed_dir,
-    vad_dir,
-    vad_abs_dir
-]
+def cal_acc(sent_label, method_label, suffix, src):
+    src = '{}/{}/{}{}'.format(src, sent_label, method_label, suffix)
+    with open(src, 'r') as f:
+        samples = f.read().split('<|endoftext|>')
+        samples = samples[1:]
+    acc = sent_acc(samples, cnn, text_field, args.cuda, sent_label == 'positive')
+    return acc
+
+
+def cal_accs(method_label, suffix, src):
+    print('{}:\n'.format(method_label))
+    pos_acc = cal_acc('positive', method_label, suffix, src)
+    print('pos_acc: {}'.format(pos_acc.item()))
+
+    neg_acc = cal_acc('negative', method_label, suffix, src)
+    print('neg_acc: {}'.format(neg_acc.item()))
+
+    print('mean_acc: {}'.format(((pos_acc + neg_acc) / 2).item()))
+
+
+SRC_SAMPLES = '/Users/xuchen/core/pycharm/project/PPL/automated_evaluation/generated_samples'
+
+# # single or not
+# sent_label = [
+#     # 'positive',
+#     'negative'
+# ]
+
+# multiple
+method_label = [
+    'BC',
+    # 'BC_VAD',
+    # 'BC_VAD_ABS',
+]
 
-calculate_acc(method_dirs, label[2])
-# print('Positive: {}'.format(pos_acc))
-# print('Negative: {}'.format(neg_acc))
-# print('Mean: {}'.format(mean_acc.item()))
+suffix = '(2_45_10)'
+
+cal_accs(method_label[0], suffix, SRC_SAMPLES)
+# neg_acc = cal_acc('negative', method_label[0], SRC_SAMPLES)
+# print('neg_acc: {}'.format(neg_acc.item()))

From 70b01c8a7b5c880736005ec8cc4dedda8585706a Mon Sep 17 00:00:00 2001
From: xuchen
Date: Mon, 29 Jun 2020 14:14:14 +0800
Subject: [PATCH 4/4] sent_acc

---
 sentiment_accuracy.py => cal_acc.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename sentiment_accuracy.py => cal_acc.py (100%)

diff --git a/sentiment_accuracy.py b/cal_acc.py
similarity index 100%
rename from sentiment_accuracy.py
rename to cal_acc.py
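Note on the metric: the series above converges on a single measurement, sentiment accuracy, i.e. the percentage of generated samples to which the trained CNN_Text classifier assigns the intended sentiment label, computed separately over the positive and negative sample files and then averaged (cal_acc.py). The sketch below distills that computation so it runs without the trained snapshot or the torchtext vocabulary; sentiment_accuracy and toy_classifier are illustrative names, the cue-word scorer is a stand-in for the real model, and the label convention (1 = positive, 0 = negative) mirrors the target = [1] * size / [0] * size lines in sent_acc.

# Minimal, self-contained sketch of the sentiment-accuracy metric from cal_acc.py.
# toy_classifier stands in for the trained CNN_Text snapshot; the real script maps
# tokens through text_field.vocab and takes torch.max over the model's logits.

def sentiment_accuracy(samples, classify, intended_label):
    """Percentage of samples whose predicted label equals the intended one."""
    correct = sum(1 for s in samples if classify(s) == intended_label)
    return 100.0 * correct / len(samples)


def toy_classifier(text):
    # Stand-in scorer: counts sentiment cue words instead of running the CNN.
    positive_cues = {'love', 'good', 'great', 'happy'}
    negative_cues = {'hate', 'bad', 'awful', 'sad'}
    tokens = text.lower().split()
    score = sum(t in positive_cues for t in tokens) - sum(t in negative_cues for t in tokens)
    return 1 if score >= 0 else 0  # 1 = positive, 0 = negative, as in sent_acc


if __name__ == '__main__':
    pos_samples = ['I love you so much !', 'So cool good . happy birthday !']
    neg_samples = ['This movie is awful and sad .', 'I hate this bad ending .']
    pos_acc = sentiment_accuracy(pos_samples, toy_classifier, 1)
    neg_acc = sentiment_accuracy(neg_samples, toy_classifier, 0)
    print('pos_acc: {}'.format(pos_acc))
    print('neg_acc: {}'.format(neg_acc))
    print('mean_acc: {}'.format((pos_acc + neg_acc) / 2))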