-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwrapper.py
More file actions
210 lines (173 loc) · 8.35 KB
/
wrapper.py
File metadata and controls
210 lines (173 loc) · 8.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import torch as tt
from torch.utils.data import DataLoader
import os
import json
import numpy as np
import random
from tqdm.auto import tqdm
import sys
from python.utils import CustomTripletLoss
from python.data_loader import GraphDataset
from python.model import GGNN
# Seed every RNG in use (python's random, numpy, torch) so runs are reproducible.
seed = 0
random.seed(seed)
np.random.seed(seed)
tt.manual_seed(seed)
# Global switch: when True, run_epoch moves the model and each batch to the GPU.
CUDA = True
def load_dataset(directory, filename, batch_size, shuffle, targets, hidden_size, annotation_size,
                 max_nodes=300, edge_types=3, target_edge_type=1, num_workers=4, max_targets=7):
    """
    Read a .json graph file, wrap it in a GraphDataset and hand back a DataLoader over it.

    :param directory: directory containing the file
    :param filename: name of the json file to read
    :param batch_size: batch size for the resulting DataLoader
    :param shuffle: if True, reshuffle the graphs on every pass
    :param targets: "generate" (targets generated once and kept — validation),
        "generateOnPass" (fresh targets every epoch — training), or a key into the
        json dictionary from which pre-computed targets are loaded
    :param hidden_size: node-embedding size of the GGNN this dataset will feed
    :param annotation_size: size of the initial node annotations (embeddings)
    :param max_nodes: cap on the number of nodes per graph
    :param edge_types: number of distinct edge types (edges added for the
        undirected copy are not counted)
    :param target_edge_type: the edge type the model is asked to predict
    :param num_workers: worker processes used by the DataLoader
    :param max_targets: maximum number of candidate targets per graph
    :return: a DataLoader object
    """
    path = os.path.join(directory, filename)  # full path to the file
    print("Loading data from %s" % path)
    with open(path, 'r') as handle:
        graphs = json.load(handle)
    dataset = GraphDataset(graphs,
                           hidden_size=hidden_size,
                           max_nodes=max_nodes,
                           edge_types=edge_types,
                           annotation_size=annotation_size,
                           targets=targets,
                           target_edge_type=target_edge_type,
                           max_targets=max_targets)
    return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size, num_workers=num_workers)
def run_epoch(model, optimizer, criterion, data, epoch, is_training, useTqdm=False):
    """
    Run the model over one full pass of `data` and return mean loss and mean accuracy.

    :param model: the GGNN model to run
    :param optimizer: optimizer stepped after every training batch
    :param criterion: callable returning (loss, accuracy) for (distances, mask)
    :param data: a DataLoader object that works on a GraphDataset
    :param epoch: epoch id (for logging purposes)
    :param is_training: whether to train or just evaluate
    :param useTqdm: use tqdm for output
    :return: mean loss, mean accuracy
    :raises ValueError: if `data` yields no batches
    """
    if CUDA:
        model.cuda()
    if is_training:
        model.train()
        if useTqdm:
            print("Epoch %d. Training" % epoch)
    else:
        model.eval()
        if useTqdm:
            print("Epoch %d. Evaluating" % epoch)
    total_loss = 0
    total_acc = 0
    step = 0
    batches = tqdm(data) if useTqdm else data
    # Disable autograd during evaluation: the returned values are identical, but
    # no computation graph is built, which saves memory and time.
    with tt.set_grad_enabled(is_training):
        for adj_matrix, features, src, mask in batches:
            step += 1
            optimizer.zero_grad()
            model.zero_grad()
            batch_size = adj_matrix.shape[0]
            option_size = adj_matrix.shape[1]
            # TODO: move these view functions in the GraphDataset
            adj_matrix = adj_matrix.view(-1, adj_matrix.shape[2], adj_matrix.shape[3]).float()
            src = src.view(-1).long()
            features = features.view(-1, features.shape[2], features.shape[3]).float()
            if CUDA:  # move to CUDA, if possible
                mask = mask.cuda()
                features = features.cuda()
                adj_matrix = adj_matrix.cuda()
                src = src.cuda()
            distances = model(features, adj_matrix, src, batch_size, option_size)
            loss, acc = criterion(distances, mask)
            total_acc += acc.cpu().data.numpy().item()
            total_loss += loss.cpu().data.numpy().item()
            if useTqdm:
                batches.set_description("Acc: step=%.2f, m.=%.3f. M. loss=%.3f" % (acc, total_acc / step, total_loss / step))
            if is_training:
                # NOTE(review): retain_graph=True kept from the original — presumably
                # required by the criterion/model graph; confirm before removing.
                loss.backward(retain_graph=True)
                tt.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
    if step == 0:
        # Previously this fell through to a bare ZeroDivisionError.
        raise ValueError("run_epoch received an empty DataLoader")
    if not useTqdm:
        print("Epoch: %d, Acc: %.4f, Loss: %.4f" % (epoch, total_acc / step, total_loss / step))
    return total_loss / step, total_acc / step
def train(model, epochs, optimizer, criterion, patience, train_data, val_data, best_model_file,
          useTqdm=False):
    """
    Train `model` with early stopping, checkpointing the best validation loss.

    :param model: a GGNN model
    :param epochs: maximum number of epochs to run the model for
    :param optimizer: optimizer used by run_epoch for the training passes
    :param criterion: loss callable passed through to run_epoch
    :param patience: stop once validation loss has not improved for this many epochs
    :param train_data: training dataset
    :param val_data: validation dataset
    :param best_model_file: file to save the best model to
    :param useTqdm: use tqdm for output
    """
    best_loss = float("inf")
    best_epoch = 0
    for ep in range(epochs):
        run_epoch(model, optimizer, criterion, train_data, ep, True, useTqdm)
        current_loss, _ = run_epoch(model, optimizer, criterion, val_data, ep, False, useTqdm)
        if current_loss < best_loss:
            tt.save(model.state_dict(), best_model_file)
            if useTqdm:
                print("Best epoch so far. Saving to '%s')" % (best_model_file))
            best_loss, best_epoch = current_loss, ep
        elif ep - best_epoch >= patience:  # early stopping
            print("Early Stopping after %i epochs." % ep)
            break
def train_test(dim, lr, steps):
    """
    Full pipeline for one hyper-parameter setting: load the three data splits,
    train a GGNN with early stopping, then evaluate the best checkpoint on test.

    :param dim: base dimensionality; hidden and annotation sizes are dim * 2
    :param lr: Adam learning rate
    :param steps: number of GGNN propagation steps
    """
    data_dir = 'data/graphs/newMethod' + str(dim) + '/'
    # Settings shared by all three splits.
    shared = dict(batch_size=20, hidden_size=dim * 2, annotation_size=dim * 2,
                  max_targets=15, num_workers=1)
    test_data = load_dataset(data_dir, 'test.json', shuffle=False,
                             targets="targets_1", **shared)
    train_data = load_dataset(data_dir, "train.json", shuffle=True,
                              targets="generateOnPass", **shared)
    val_data = load_dataset(data_dir, 'valid.json', shuffle=False,
                            targets="generate", **shared)
    model = GGNN(state_dim=dim * 2,
                 annotation_dim=dim * 2,
                 n_edge_types=3,
                 n_nodes=300,
                 n_steps=steps)
    criterion = CustomTripletLoss(margin=0.5)
    optimizer = tt.optim.Adam(model.parameters(), lr=lr)
    checkpoint = "best" + str(dim) + "-" + str(lr) + "_" + str(steps) + ".model"
    train(model, epochs=30, optimizer=optimizer, criterion=criterion, patience=3,
          train_data=train_data, val_data=val_data, best_model_file=checkpoint,
          useTqdm=False)
    # Reload the best checkpoint and report test-set metrics.
    model.load_state_dict(tt.load(checkpoint))
    loss, acc = run_epoch(model, optimizer, criterion, test_data, 1, False, useTqdm=False)
    print(loss, acc)
def test_model(dim, steps, candidates, model_file):
    """
    Evaluate a saved GGNN checkpoint on the test split and print loss/accuracy.

    :param dim: base dimensionality; hidden and annotation sizes are dim * 2
    :param steps: number of GGNN propagation steps (must match the checkpoint)
    :param candidates: number of candidate targets per graph; selects the
        "targets_<candidates>" key in the json
    :param model_file: path of the saved state_dict to load
    """
    data_dir = '../data/graphs/newMethod' + str(dim) + '/'
    test_data = load_dataset(data_dir, 'test.json', batch_size=1, shuffle=False,
                             targets="targets_" + str(candidates), hidden_size=dim * 2,
                             annotation_size=dim * 2, num_workers=1,
                             max_targets=candidates)
    model = GGNN(state_dim=dim * 2,
                 annotation_dim=dim * 2,
                 n_edge_types=3,
                 n_nodes=300,
                 n_steps=steps)
    criterion = CustomTripletLoss(margin=0.5, binary_acc=False)
    optimizer = tt.optim.Adam(model.parameters(), lr=0.001)
    model.load_state_dict(tt.load(model_file))
    loss, acc = run_epoch(model, optimizer, criterion, test_data, 1, False, useTqdm=True)
    print(loss, acc)
if __name__ == "__main__":
    # CLI dispatch:
    #   python wrapper.py train <dim> <lr> <steps>
    #   python wrapper.py test <dim> <steps> <candidates> <model_file>
    usage = ("usage: %s train <dim> <lr> <steps> | "
             "%s test <dim> <steps> <candidates> <model_file>" % (sys.argv[0], sys.argv[0]))
    try:
        if sys.argv[1] == "train":
            train_test(int(sys.argv[2]), float(sys.argv[3]), int(sys.argv[4]))
        elif sys.argv[1] == "test":
            test_model(int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5])
        else:
            # Previously an unknown sub-command was a silent no-op.
            sys.exit(usage)
    except (IndexError, ValueError):
        # Missing or non-numeric arguments previously died with a bare traceback.
        sys.exit(usage)