-
Notifications
You must be signed in to change notification settings - Fork 315
/
Copy pathgtn_data.py
149 lines (117 loc) · 5.42 KB
/
gtn_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os.path as osp
import pickle
import numpy as np
import torch
from cogdl.data import Graph, Dataset
from cogdl.utils import download_url, untar
class GTNDataset(Dataset):
    r"""The network datasets "ACM", "DBLP" and "IMDB" from the
    `"Graph Transformer Networks"
    <https://arxiv.org/abs/1911.06455>`_ paper.

    The raw archive holds three pickles (``edges.pkl``, ``labels.pkl``,
    ``node_features.pkl``) which are converted into a single graph object
    and cached as ``data.pt``.

    Args:
        root (string): Root directory where the dataset should be saved.
        name (string): The name of the dataset (:obj:`"gtn-acm"`,
            :obj:`"gtn-dblp"`, :obj:`"gtn-imdb"`).
    """

    # (row_type, col_type) written into ``node_type`` for the endpoints of
    # each of the four edge matrices, in the order they appear in edges.pkl.
    _EDGE_ENDPOINT_TYPES = ((0, 1), (1, 0), (0, 2), (2, 0))

    def __init__(self, root, name):
        # ``name`` and ``url`` are assigned before the base initialiser runs
        # because download() reads them.
        # NOTE(review): presumably Dataset.__init__ triggers download() and
        # process() when files are missing -- confirm against cogdl.data.
        self.name = name
        self.url = f"https://github.com/cenyk1230/gtn-data/blob/master/{name}.zip?raw=true"
        super().__init__(root)
        self.data = torch.load(self.processed_paths[0])
        # ``adj`` holds one entry per edge matrix plus one identity entry.
        self.num_edge = len(self.data.adj)
        self.num_nodes = self.data.x.shape[0]

    @property
    def raw_file_names(self):
        """Names of the pickle files expected in the raw directory."""
        return ["edges.pkl", "labels.pkl", "node_features.pkl"]

    @property
    def processed_file_names(self):
        """Name of the cached, processed graph file."""
        return ["data.pt"]

    @property
    def num_classes(self):
        """Largest training target plus one.

        NOTE(review): only ``train_target`` is inspected, so this assumes
        labels are 0-based and the highest class id occurs in the training
        split -- confirm for each dataset.
        """
        return torch.max(self.data.train_target).item() + 1

    @staticmethod
    def _load_pickle(path):
        """Unpickle ``path``, closing the file handle afterwards."""
        # NOTE(review): pickle executes arbitrary code on load; these files
        # come from the archive fetched in download(), so they are only as
        # trustworthy as that URL.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    def read_gtn_data(self, folder):
        """Build the graph from the raw pickles in ``folder``.

        The result is stored on ``self.data`` (nothing is returned) with:

        * ``x``: node features as a float tensor,
        * ``pos``: per-node type id derived from the edge matrices,
        * ``edge_index``: all edges of all matrices concatenated column-wise,
        * ``adj``: list of ``(edge_index, edge_weight)`` pairs, one per edge
          matrix followed by one for the identity (self-loops),
        * ``{train,valid,test}_node`` / ``{train,valid,test}_target``,
        * ``y``: dense label vector of length ``num_nodes``.

        Args:
            folder (string): Directory containing ``edges.pkl``,
                ``labels.pkl`` and ``node_features.pkl``.
        """
        edges = self._load_pickle(osp.join(folder, "edges.pkl"))
        labels = self._load_pickle(osp.join(folder, "labels.pkl"))
        node_features = self._load_pickle(osp.join(folder, "node_features.pkl"))

        data = Graph()
        data.x = torch.from_numpy(node_features).type(torch.FloatTensor)

        # presumably each entry of ``edges`` is a square scipy sparse matrix
        # over all nodes -- TODO confirm; only .shape and .nonzero() are used.
        num_nodes = edges[0].shape[0]
        assert len(edges) == 4
        assert len(edges[0].nonzero()) == 2

        # Extract the (rows, cols) index arrays once per matrix; they are
        # needed for the node types, ``edge_index`` and ``adj``.
        endpoints = [edge.nonzero() for edge in edges]

        # Later assignments overwrite earlier ones, so the iteration order
        # here matters and follows the order of the edge matrices.
        node_type = np.zeros((num_nodes), dtype=int)
        for (rows, cols), (row_type, col_type) in zip(endpoints, self._EDGE_ENDPOINT_TYPES):
            node_type[rows] = row_type
            node_type[cols] = col_type
        print(node_type)
        data.pos = torch.from_numpy(node_type)

        per_type_index = [
            torch.from_numpy(np.vstack((rows, cols))).type(torch.LongTensor)
            for rows, cols in endpoints
        ]
        # torch.cat copies, so ``edge_index`` does not alias the tensors
        # that are reused in ``adj`` below.
        data.edge_index = torch.cat(per_type_index, 1)

        # Every stored edge gets weight 1.
        adj = [
            (index, torch.ones(index.shape[1]).type(torch.FloatTensor))
            for index in per_type_index
        ]
        # Final entry: identity matrix (one self-loop per node).
        diagonal = torch.arange(0, num_nodes)
        adj.append(
            (
                torch.stack((diagonal, diagonal)).type(torch.LongTensor),
                torch.ones(num_nodes).type(torch.FloatTensor),
            )
        )
        data.adj = adj

        # labels[0], labels[1], labels[2] are the train / valid / test
        # splits; column 0 is the node id, column 1 the target.
        for position, split in enumerate(("train", "valid", "test")):
            pairs = np.array(labels[position])
            setattr(data, f"{split}_node", torch.from_numpy(pairs[:, 0]).type(torch.LongTensor))
            setattr(data, f"{split}_target", torch.from_numpy(pairs[:, 1]).type(torch.LongTensor))

        # Dense label vector; nodes that appear in no split keep the
        # initial value 0.
        y = np.zeros((num_nodes), dtype=int)
        x_index = torch.cat((data.train_node, data.valid_node, data.test_node))
        y_index = torch.cat((data.train_target, data.valid_target, data.test_target))
        y[x_index.numpy()] = y_index.numpy()
        data.y = torch.from_numpy(y)
        self.data = data

    def get(self, idx):
        """Return the single graph of this dataset; ``idx`` must be 0."""
        assert idx == 0
        return self.data

    def apply_to_device(self, device):
        """Move features, labels, split tensors and ``adj`` to ``device``.

        NOTE(review): ``pos`` and ``edge_index`` are not moved here --
        confirm that this is intentional.
        """
        for attr in (
            "x",
            "y",
            "train_node",
            "valid_node",
            "test_node",
            "train_target",
            "valid_target",
            "test_target",
        ):
            setattr(self.data, attr, getattr(self.data, attr).to(device))
        self.data.adj = [
            (index.to(device), weight.to(device)) for index, weight in self.data.adj
        ]

    def download(self):
        """Fetch ``<name>.zip`` into the raw directory and unpack it."""
        archive = self.name + ".zip"
        download_url(self.url, self.raw_dir, name=archive)
        # NOTE(review): ``untar`` is called on a .zip archive; presumably the
        # cogdl helper handles zip files -- confirm.
        untar(self.raw_dir, archive)

    def process(self):
        """Convert the raw pickles and cache the resulting graph on disk."""
        self.read_gtn_data(self.raw_dir)
        torch.save(self.data, self.processed_paths[0])

    def __repr__(self):
        return f"{self.name}"
class ACM_GTNDataset(GTNDataset):
    """``GTNDataset`` preset for the ``"gtn-acm"`` dataset.

    Args:
        data_path (string): Parent directory; the dataset root is the
            ``gtn-acm`` sub-directory inside it.
    """

    def __init__(self, data_path="data"):
        name = "gtn-acm"
        super().__init__(osp.join(data_path, name), name)
class DBLP_GTNDataset(GTNDataset):
    """``GTNDataset`` preset for the ``"gtn-dblp"`` dataset.

    Args:
        data_path (string): Parent directory; the dataset root is the
            ``gtn-dblp`` sub-directory inside it.
    """

    def __init__(self, data_path="data"):
        name = "gtn-dblp"
        super().__init__(osp.join(data_path, name), name)
class IMDB_GTNDataset(GTNDataset):
    """``GTNDataset`` preset for the ``"gtn-imdb"`` dataset.

    Args:
        data_path (string): Parent directory; the dataset root is the
            ``gtn-imdb`` sub-directory inside it.
    """

    def __init__(self, data_path="data"):
        name = "gtn-imdb"
        super().__init__(osp.join(data_path, name), name)