JavaImportPrediction/python/data_loader.py at master · Dargones/JavaImportPrediction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# A module for loading the graph data from .json files and preparing them for input to GGNNs

import numpy as np
from torch.utils.data import Dataset
import random


class GraphDataset(Dataset):
    """
    This class allows converting graphs to their neural-network representations on demand
    """

    def __init__(self, data, hidden_size, max_nodes, edge_types, annotation_size, targets, target_edge_type, max_targets=7):
        """
        Initialize GraphDataset so that it can be passed to DataLoader
        :param data:            graph data in .json format as loaded from disk
        :param hidden_size:     the size of node embedding in GGNN that will be used on this dataset
        :param max_nodes:       maximum number of nodes per graph
        :param edge_types:      number of different edge-types. Does not include the edges added to
                                the undirected graph
        :param annotation_size: the size of annotations (initial embedddings) for each node
        :param targets:         Can be either "generate", "generateOnPass", or "preset"
                                "generate": generate targets once and keep them this way (valid)
                                "generateOnPass": generate new targets each epoch (train)
                                some other string that is the key to a target field in the data
        :param max_targets:     Maximum number of possible target options
        :param target_edge_type:the type of edge that is to be predicted
        """
        self.hidden_size = hidden_size
        self.max_nodes = max_nodes
        self.edge_types = edge_types
        self.annotation_size = annotation_size
        self.targets = targets
        self.target_edge_type = target_edge_type
        self.max_targets = max_targets
        # if targets are ot be automatically generated, clear whatever is stored as targets now:
        if self.targets == "generate" or self.targets == "generateOnPass":
            self.data = data
            for graph in self.data:
                graph['targets'] = None
        else:
            self.data = []
            for graph in data:
                if not graph[self.targets]:
                    continue
                self.data.append(graph)
                graph['targets'] = self.read_targets(graph)

    def __len__(self):
        """
        Return the number of samples in the dataset
        :return:
        """
        return len(self.data)

    def get_numeric_representation(self, index):
        """
        Given an index of a graph in the dataset, get its numeric representation. This consists
        of the following:
        1) adjacency matrix
        2) matrix of initial node annotations
        3) src, pos, and mask as returned by create_target() or read_target()
        """
        graph = self.data[index]
        a_matrix = self.create_adjacency_matrix(graph["edges"])  # adjacency matrix
        # node features should have the the shape (Max#OfNodes, hidden_size)
        features = np.pad(graph['annotations'],
                          ((0, self.max_nodes - len(graph['annotations'])),
                           (0, self.hidden_size - self.annotation_size)),
                          'constant')

        if self.targets == "generateOnPass" or (
                self.targets == "generate" and not graph["targets"]):
            graph["targets"] = self.create_target(graph['edges'], a_matrix,
                                                  len(graph['annotations']))
        src, pos, mask = graph["targets"]

        self.set_matrix(a_matrix, src, pos, self.target_edge_type, 0)
        return a_matrix, features, mask, src, pos

    def __getitem__(self, index):
        """
        For a given repository, return its numeric representation with the target edge removed
        as well as max_targets more of its numeric representations each of which introduces a
        single edge to the graph. The network will have to select the graph that introduces the
        target edge
        """
        matrix, features, mask, src, pos = self.get_numeric_representation(index)

        options = np.where(mask == 1)[0]
        if len(options) > self.max_targets:
            options = np.random.choice(options, self.max_targets, replace=False)

        matrixes = np.zeros((self.max_targets + 2, matrix.shape[0], matrix.shape[1]))
        matrixes[0] = matrix

        for i, option in enumerate([pos] + list(options)):
            new_matrix = matrix.copy()
            self.set_matrix(new_matrix, src, option, self.target_edge_type, 1)
            matrixes[i + 1] = new_matrix

        src = np.full(shape=self.max_targets + 2, fill_value=src)
        features = np.stack(list([features for _ in range(self.max_targets + 2)]), axis=0)
        mask = np.zeros(self.max_targets + 2)
        mask[2:len(options) + 2] = 1
        return matrixes, features, src, mask

    def create_adjacency_matrix(self, edges):
        """
        Create adjacency matrix for the graph
        :param edges: List of all edges in the graph
        :return:
        """
        a = np.zeros([self.max_nodes, self.max_nodes * self.edge_types * 2])
        for edge in edges:
            src = edge[0]
            e_type = edge[1]
            dest = edge[2]
            self.set_matrix(a, src, dest, e_type, 1)
        return a

    def set_matrix(self, a, src, dest, e_type, value):
        """
        Remove or add an edge in the adjacency matrix. Also remove or add the corresponding edge
        going in the opposite direction
        :param a:     the adjacency matrix
        :param src:   the source node
        :param dest:  the destination node
        :param e_type:the type of the edge to be removed\added
        :param value: 1 if the edge is to be added, 0 otehrwise
        """
        a[dest][(e_type - 1) * self.max_nodes + src] = value
        a[src][(e_type - 1 + self.edge_types) * self.max_nodes + dest] = value

    def create_target(self, edges, a, n_nodes):
        """
        Select a random import edge on a graph. Return the src and destination nodes that this
        edge connects. Also return a mask that for each node in the graph specifies whether it
        could have been connected to the src node with an import edge. These are future negatives
        that the network will have to distinguish from the true destination node.
        :return:
        """
        valid_edges = [x for x in edges if x[1] == self.target_edge_type]
        src, _, dest = valid_edges[random.randint(0, len(valid_edges) - 1)]
        # a column that for each node specifies whether there is an edge to it from the source node:
        mask = np.ones(self.max_nodes, dtype=np.float)
        for e_type in range(self.edge_types):
            mask *= (1 - np.resize(a[:, e_type * self.max_nodes + src], self.max_nodes))
        mask[dest] = 0  # remove positive example from the mask - this line in redundant
        mask[n_nodes:] = 0
        return src, dest, mask

    def read_targets(self, graph):
        """
        Read the targets directly from the json file (should be used for testing). Return the
        targets in the same format that create target does
        """
        src = graph[self.targets][0]
        dest = graph[self.targets][1]
        mask = np.zeros(self.max_nodes, dtype=np.float)
        for node_id in graph[self.targets][2:]:
            mask[node_id] = 1
        return src, dest, mask