From c38aba14b4c3bb15d47de0e0bd78a2de9c6fe65d Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Wed, 2 Jun 2021 16:46:29 -0700 Subject: [PATCH 01/66] update gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 927fb084..a37445a5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ run/datasets/data/ **/__pycache__/ **/.ipynb_checkpoints -.idea/ \ No newline at end of file +.idea/ +.vscode/settings.json From 961140f79fc259fbef0355b34c0bf0cde8ecf947 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Thu, 3 Jun 2021 14:47:00 -0700 Subject: [PATCH 02/66] make copy. --- graphgym/models/head_mem.py | 130 ++++++++++++++++ graphgym/models/layer_recurrent.py | 238 +++++++++++++++++++++++++++++ 2 files changed, 368 insertions(+) create mode 100644 graphgym/models/head_mem.py create mode 100644 graphgym/models/layer_recurrent.py diff --git a/graphgym/models/head_mem.py b/graphgym/models/head_mem.py new file mode 100644 index 00000000..3114cc72 --- /dev/null +++ b/graphgym/models/head_mem.py @@ -0,0 +1,130 @@ +""" GNN heads are the last layer of a GNN right before loss computation. + +They are constructed in the init function of the gnn.GNN. +""" + +import torch +import torch.nn as nn + +from graphgym.config import cfg +from graphgym.models.layer import MLP +from graphgym.models.pooling import pooling_dict + +from graphgym.contrib.head import * +import graphgym.register as register + + +########### Head ############ + +class GNNNodeHead(nn.Module): + '''Head of GNN, node prediction''' + + def __init__(self, dim_in, dim_out): + super(GNNNodeHead, self).__init__() + self.layer_post_mp = MLP(dim_in, dim_out, + num_layers=cfg.gnn.layers_post_mp, bias=True) + + def _apply_index(self, batch): + if batch.node_label_index.shape[0] == batch.node_label.shape[0]: + return batch.node_feature[batch.node_label_index], batch.node_label + else: + return batch.node_feature[batch.node_label_index], \ + batch.node_label[batch.node_label_index] + + def forward(self, batch): + batch = self.layer_post_mp(batch) + pred, label = self._apply_index(batch) + return pred, label + + +class GNNEdgeHead(nn.Module): + '''Head of GNN, edge prediction''' + + def __init__(self, dim_in, dim_out): + ''' Head of Edge and link prediction models. + + Args: + dim_out: output dimension. For binary prediction, dim_out=1. + ''' + # Use dim_in for graph conv, since link prediction dim_out could be + # binary + # E.g. 
if decoder='dot', link probability is dot product between + # node embeddings, of dimension dim_in + super(GNNEdgeHead, self).__init__() + # module to decode edges from node embeddings + + if cfg.model.edge_decoding == 'concat': + self.layer_post_mp = MLP(dim_in * 2, dim_out, + num_layers=cfg.gnn.layers_post_mp, + bias=True) + # requires parameter + self.decode_module = lambda v1, v2: \ + self.layer_post_mp(torch.cat((v1, v2), dim=-1)) + else: + if dim_out > 1: + raise ValueError( + 'Binary edge decoding ({})is used for multi-class ' + 'edge/link prediction.'.format(cfg.model.edge_decoding)) + self.layer_post_mp = MLP(dim_in, dim_in, + num_layers=cfg.gnn.layers_post_mp, + bias=True) + if cfg.model.edge_decoding == 'dot': + self.decode_module = lambda v1, v2: torch.sum(v1 * v2, dim=-1) + elif cfg.model.edge_decoding == 'cosine_similarity': + self.decode_module = nn.CosineSimilarity(dim=-1) + else: + raise ValueError('Unknown edge decoding {}.'.format( + cfg.model.edge_decoding)) + + def _apply_index(self, batch): + return batch.node_feature[batch.edge_label_index], \ + batch.edge_label + + def forward(self, batch): + if cfg.model.edge_decoding != 'concat': + batch = self.layer_post_mp(batch) + pred, label = self._apply_index(batch) + nodes_first = pred[0] + nodes_second = pred[1] + pred = self.decode_module(nodes_first, nodes_second) + return pred, label + + +class GNNGraphHead(nn.Module): + '''Head of GNN, graph prediction + + The optional post_mp layer (specified by cfg.gnn.post_mp) is used + to transform the pooled embedding using an MLP. + ''' + + def __init__(self, dim_in, dim_out): + super(GNNGraphHead, self).__init__() + # todo: PostMP before or after global pooling + self.layer_post_mp = MLP(dim_in, dim_out, + num_layers=cfg.gnn.layers_post_mp, bias=True) + self.pooling_fun = pooling_dict[cfg.model.graph_pooling] + + def _apply_index(self, batch): + return batch.graph_feature, batch.graph_label + + def forward(self, batch): + if cfg.dataset.transform == 'ego': + graph_emb = self.pooling_fun(batch.node_feature, batch.batch, + batch.node_id_index) + else: + graph_emb = self.pooling_fun(batch.node_feature, batch.batch) + graph_emb = self.layer_post_mp(graph_emb) + batch.graph_feature = graph_emb + pred, label = self._apply_index(batch) + return pred, label + + +# Head models for external interface +head_dict = { + 'node': GNNNodeHead, + 'edge': GNNEdgeHead, + 'link_pred': GNNEdgeHead, + 'graph': GNNGraphHead +} + +head_dict = {**register.head_dict, **head_dict} diff --git a/graphgym/models/layer_recurrent.py b/graphgym/models/layer_recurrent.py new file mode 100644 index 00000000..df60700e --- /dev/null +++ b/graphgym/models/layer_recurrent.py @@ -0,0 +1,238 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric as pyg + +from graphgym.config import cfg +from graphgym.models.act import act_dict +from graphgym.contrib.layer.generalconv import (GeneralConvLayer, + GeneralEdgeConvLayer) + +from graphgym.contrib.layer import * +import graphgym.register as register + + +## General classes +class GeneralLayer(nn.Module): + '''General wrapper for layers''' + + def __init__(self, name, dim_in, dim_out, has_act=True, has_bn=True, + has_l2norm=False, **kwargs): + super(GeneralLayer, self).__init__() + self.has_l2norm = has_l2norm + has_bn = has_bn and cfg.gnn.batchnorm + self.layer = layer_dict[name](dim_in, dim_out, + bias=not has_bn, **kwargs) + layer_wrapper = [] + if has_bn: + layer_wrapper.append(nn.BatchNorm1d( + dim_out, eps=cfg.bn.eps, 
momentum=cfg.bn.mom)) + if cfg.gnn.dropout > 0: + layer_wrapper.append(nn.Dropout( + p=cfg.gnn.dropout, inplace=cfg.mem.inplace)) + if has_act: + layer_wrapper.append(act_dict[cfg.gnn.act]) + self.post_layer = nn.Sequential(*layer_wrapper) + + def forward(self, batch): + batch = self.layer(batch) + if isinstance(batch, torch.Tensor): + batch = self.post_layer(batch) + if self.has_l2norm: + batch = F.normalize(batch, p=2, dim=1) + else: + batch.node_feature = self.post_layer(batch.node_feature) + if self.has_l2norm: + batch.node_feature = F.normalize(batch.node_feature, p=2, dim=1) + return batch + + +class GeneralMultiLayer(nn.Module): + '''General wrapper for stack of layers''' + + def __init__(self, name, num_layers, dim_in, dim_out, dim_inner=None, + final_act=True, **kwargs): + super(GeneralMultiLayer, self).__init__() + dim_inner = dim_in if dim_inner is None else dim_inner + for i in range(num_layers): + d_in = dim_in if i == 0 else dim_inner + d_out = dim_out if i == num_layers - 1 else dim_inner + has_act = final_act if i == num_layers - 1 else True + layer = GeneralLayer(name, d_in, d_out, has_act, **kwargs) + self.add_module('Layer_{}'.format(i), layer) + + def forward(self, batch): + for layer in self.children(): + batch = layer(batch) + return batch + + +## Core basic layers +# Input: batch; Output: batch +class Linear(nn.Module): + def __init__(self, dim_in, dim_out, bias=False, **kwargs): + super(Linear, self).__init__() + self.model = nn.Linear(dim_in, dim_out, bias=bias) + + def forward(self, batch): + if isinstance(batch, torch.Tensor): + batch = self.model(batch) + else: + batch.node_feature = self.model(batch.node_feature) + return batch + + +class BatchNorm1dNode(nn.Module): + '''General wrapper for layers''' + + def __init__(self, dim_in): + super(BatchNorm1dNode, self).__init__() + self.bn = nn.BatchNorm1d(dim_in, eps=cfg.bn.eps, momentum=cfg.bn.mom) + + def forward(self, batch): + batch.node_feature = self.bn(batch.node_feature) + return batch + + +class BatchNorm1dEdge(nn.Module): + '''General wrapper for layers''' + + def __init__(self, dim_in): + super(BatchNorm1dEdge, self).__init__() + self.bn = nn.BatchNorm1d(dim_in, eps=cfg.bn.eps, momentum=cfg.bn.mom) + + def forward(self, batch): + batch.edge_feature = self.bn(batch.edge_feature) + return batch + + +class MLP(nn.Module): + def __init__(self, dim_in, dim_out, bias=True, dim_inner=None, + num_layers=2, **kwargs): + ''' + Note: MLP works for 0 layers + ''' + super(MLP, self).__init__() + dim_inner = dim_in if dim_inner is None else dim_inner + layers = [] + if num_layers > 1: + layers.append( + GeneralMultiLayer('linear', num_layers - 1, dim_in, dim_inner, + dim_inner, final_act=True)) + layers.append(Linear(dim_inner, dim_out, bias)) + else: + layers.append(Linear(dim_in, dim_out, bias)) + self.model = nn.Sequential(*layers) + + def forward(self, batch): + if isinstance(batch, torch.Tensor): + batch = self.model(batch) + else: + batch.node_feature = self.model(batch.node_feature) + return batch + + +class GCNConv(nn.Module): + def __init__(self, dim_in, dim_out, bias=False, **kwargs): + super(GCNConv, self).__init__() + self.model = pyg.nn.GCNConv(dim_in, dim_out, bias=bias) + + def forward(self, batch): + batch.node_feature = self.model(batch.node_feature, batch.edge_index) + return batch + + +class SAGEConv(nn.Module): + def __init__(self, dim_in, dim_out, bias=False, **kwargs): + super(SAGEConv, self).__init__() + self.model = pyg.nn.SAGEConv(dim_in, dim_out, bias=bias, concat=True) + + def forward(self, 
batch): + batch.node_feature = self.model(batch.node_feature, batch.edge_index) + return batch + + +class GATConv(nn.Module): + def __init__(self, dim_in, dim_out, bias=False, **kwargs): + super(GATConv, self).__init__() + self.model = pyg.nn.GATConv(dim_in, dim_out, bias=bias) + + def forward(self, batch): + batch.node_feature = self.model(batch.node_feature, batch.edge_index) + return batch + + +class GINConv(nn.Module): + def __init__(self, dim_in, dim_out, bias=False, **kwargs): + super(GINConv, self).__init__() + gin_nn = nn.Sequential(nn.Linear(dim_in, dim_out), nn.ReLU(), + nn.Linear(dim_out, dim_out)) + self.model = pyg.nn.GINConv(gin_nn) + + def forward(self, batch): + batch.node_feature = self.model(batch.node_feature, batch.edge_index) + return batch + + +class SplineConv(nn.Module): + def __init__(self, dim_in, dim_out, bias=False, **kwargs): + super(SplineConv, self).__init__() + self.model = pyg.nn.SplineConv(dim_in, dim_out, + dim=1, kernel_size=2, bias=bias) + + def forward(self, batch): + batch.node_feature = self.model(batch.node_feature, batch.edge_index, + batch.edge_feature) + return batch + + +class GeneralConv(nn.Module): + def __init__(self, dim_in, dim_out, bias=False, **kwargs): + super(GeneralConv, self).__init__() + self.model = GeneralConvLayer(dim_in, dim_out, bias=bias) + + def forward(self, batch): + batch.node_feature = self.model(batch.node_feature, batch.edge_index) + return batch + + +class GeneralEdgeConv(nn.Module): + def __init__(self, dim_in, dim_out, bias=False, **kwargs): + super(GeneralEdgeConv, self).__init__() + self.model = GeneralEdgeConvLayer(dim_in, dim_out, bias=bias) + + def forward(self, batch): + batch.node_feature = self.model(batch.node_feature, batch.edge_index, + edge_feature=batch.edge_feature) + return batch + + +class GeneralSampleEdgeConv(nn.Module): + def __init__(self, dim_in, dim_out, bias=False, **kwargs): + super(GeneralSampleEdgeConv, self).__init__() + self.model = GeneralEdgeConvLayer(dim_in, dim_out, bias=bias) + + def forward(self, batch): + edge_mask = torch.rand(batch.edge_index.shape[1]) < cfg.gnn.keep_edge + edge_index = batch.edge_index[:, edge_mask] + edge_feature = batch.edge_feature[edge_mask, :] + batch.node_feature = self.model(batch.node_feature, edge_index, + edge_feature=edge_feature) + return batch + + +layer_dict = { + 'linear': Linear, + 'mlp': MLP, + 'gcnconv': GCNConv, + 'sageconv': SAGEConv, + 'gatconv': GATConv, + 'splineconv': SplineConv, + 'ginconv': GINConv, + 'generalconv': GeneralConv, + 'generaledgeconv': GeneralEdgeConv, + 'generalsampleedgeconv': GeneralSampleEdgeConv, +} + +# register additional convs +layer_dict = {**register.layer_dict, **layer_dict} From 2d57c1278ed7c07116a1cdd66dcf3274463da0d3 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Thu, 3 Jun 2021 15:04:28 -0700 Subject: [PATCH 03/66] add config file for roland. --- graphgym/contrib/config/roland.py | 204 ++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 graphgym/contrib/config/roland.py diff --git a/graphgym/contrib/config/roland.py b/graphgym/contrib/config/roland.py new file mode 100644 index 00000000..a30c38fe --- /dev/null +++ b/graphgym/contrib/config/roland.py @@ -0,0 +1,204 @@ +from yacs.config import CfgNode as CN + +from graphgym.register import register_config + + +def set_cfg_roland(cfg): + r''' + This function sets the default config value for customized options + :return: customized configuration use by the experiment. 
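    Illustrative usage sketch (not part of the original patch; assumes the
    registered hook has not already been applied by GraphGym itself):

        from graphgym.config import cfg   # GraphGym's global CfgNode
        set_cfg_roland(cfg)               # attach the ROLAND defaults below
        cfg.gnn.embed_update_method = 'gru'   # per-experiment override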
+ ''' + + # ----------------------------------------------------------------------- # + # Customized options + # ----------------------------------------------------------------------- # + # Method to update node embedding from old node embedding and new node features. + # Options: 'moving_average', 'masked_gru', 'gru' + # moving average: new embedding = r * old + (1-r) * node_feature. + # gru: new embedding = GRU(node_feature, old_embedding). + # masked_gru: only apply GRU to active nodes. + cfg.gnn.embed_update_method = 'moving_average' + # what kind of GRU kernel to use if GRU is required for embedding updating. + cfg.gnn.gru_kernel = 'linear' + # how many layers to use in the MLP updater. + # default: 1, use a simple linear layer. + cfg.gnn.mlp_update_layers = 2 + + # For meta-learning. + cfg.meta = CN() + # Whether to do meta-learning via initialization moving average. + # Default to False. + cfg.meta.is_meta = False + + # choose between 'moving_average' and 'online_mean' + cfg.meta.method = 'moving_average' + # For online mean: + # new_mean = (n-1)/n * old_mean + 1/n * new_value. + # where *_mean corresponds to W_init. + + # Weight used in moving average for model parameters. + # After fine-tuning the model in period t and get model M[t], + # Set W_init = (1-alpha) * W_init + alpha * M[t]. + # For the next period, use W_init as the initialization for fine-tune + # Set cfg.meta.alpha = 1.0 to recover the original algorithm. + cfg.meta.alpha = 0.9 + + # Use to identify experiments. + cfg.remark = '' + # Experimental Features, use this name space to save all controls for + # experimental features. + cfg.experimental = CN() + + # How many negative edges for each node to compute rank-based evaluation + # metrics such as MRR and recall at K. + # E.g., if multiplier = 1000 and a node has 3 positive edges, then we + # compute the MRR using 1000 randomly generated negative edges + # + 3 existing positive edges. + cfg.experimental.rank_eval_multiplier = 1000 + + # Only use the first n snapshots (time periods) to train the model. + # Empirically, the model learns rich dynamics from only a few periods. + # Set to -1 if using all snapshots. + cfg.experimental.restrict_training_set = -1 + + # Whether to visualize edge attention of GNN layer after training. + cfg.experimental.visualize_gnn_layer = False + + cfg.train.tbptt_freq = 5 + + cfg.train.internal_validation_tolerance = 5 + + # Computing MRR is slow in the baseline setting. + # Only start to compute MRR in the test set range after certain time. + cfg.train.start_compute_mrr = 0 + + # How to handle node features in AS dataset. + # available: ['one', 'one_hot_id', 'one_hot_degree_global', 'one_hot_degree_local'] + cfg.dataset.AS_node_feature = 'one' + + # ----------------------------------------------------------------------- # + # Additional dataset option for the BSI dataset. + # ----------------------------------------------------------------------- # + # Method used to sample negative edges for edge_label_index. + # 'uniform': all non-existing edges have same probability of being sampled + # as negative edges. + # 'src': non-existing edges from high-degree nodes are more likely to be + # sampled as negative edges. + # 'dest': non-existing edges pointed to high-degree nodes are more likely + # to be sampled as negative edges. + cfg.dataset.negative_sample_weight = 'uniform' + + # whether to load heterogeneous graphs. + cfg.dataset.is_hetero = False + + # where to put type information. 'append' or 'graph_attribute'. 
+ cfg.dataset.type_info_loc = 'append' + + # whether to look for and load cached graph. By default (load_cache=False) + # the loader loads the raw tsv file from disk and + cfg.dataset.load_cache = False + + cfg.dataset.premade_datasets = 'fresh' + + cfg.dataset.include_node_features = False + + # 'chronological_temporal' or 'default'. + # 'chronological_temporal': only for temporal graphs, for example, + # the first 80% snapshots are for training, then subsequent 10% snapshots + # are for validation and the last 10% snapshots are for testing. + cfg.dataset.split_method = 'default' + + cfg.gnn.skip_connection = 'none' # {'none', 'identity', 'affine'} + # ----------------------------------------------------------------------- # + # Customized options + # ----------------------------------------------------------------------- # + + # example argument group + cfg.transaction = CN() + + # whether use snapshot + cfg.transaction.snapshot = False + + # snapshot split method 1: number of snapshots + # split dataset into fixed number of snapshots. + cfg.transaction.snapshot_num = 100 + + # snapshot split method 2: snapshot frequency + # e.g., one snapshot contains transactions within 1 day. + cfg.transaction.snapshot_freq = 'D' + + cfg.transaction.check_snapshot = False + + # how to use transaction history + # full or rolling + cfg.transaction.history = 'full' + + + # type of loss: supervised / meta + cfg.transaction.loss = 'meta' + + # feature dim for int edge features + cfg.transaction.feature_int_dim = 32 + cfg.transaction.feature_edge_int_num = [50, 8, 252, 252, 3, 3] + cfg.transaction.feature_node_int_num = [0] + + # feature dim for amount (float) edge feature + cfg.transaction.feature_amount_dim = 64 + + # feature dim for time (float) edge feature + cfg.transaction.feature_time_dim = 64 + + # + cfg.transaction.node_feature = 'raw' + + # how many days look into the future + cfg.transaction.horizon = 1 + + # prediction mode for the task; 'before' or 'after' + cfg.transaction.pred_mode = 'before' + + # number of periods to be captured. + # set to a list of integers if wish to use pre-defined periodicity. + # e.g., [1,7,28,31,...] etc. + cfg.transaction.time_enc_periods = [1] + + # if 'enc_before_diff': attention weight = diff(enc(t1), enc(t2)) + # if 'diff_before_enc': attention weight = enc(t1 - t2) + cfg.transaction.time_enc_mode = 'enc_before_diff' + + # how to compute the keep ratio while updating the recurrent GNN. + # the update ratio (for each node) is a function of its degree in [0, t) + # and its degree in snapshot t. + cfg.transaction.keep_ratio = 'linear' + + cfg.metric = CN() + # how to compute MRR. + # available: f = 'min', 'max', 'mean'. + # Step 1: get the p* = f(scores of positive edges) + # Step 2: compute the rank r of p* among all negative edges. + # Step 3: RR = 1 / rank. + # Step 4: average over all users. + # expected MRR(min) <= MRR(mean) <= MRR(max). + cfg.metric.mrr_method = 'max' + + # Specs for the link prediction task using BSI dataset. + # All units are days. + cfg.link_pred_spec = CN() + + # The period of `today`'s increase: how often the system is making forecast. + # E.g., when = 1, + # the system forecasts transactions in upcoming 7 days for everyday. + # One training epoch loops over + # {Jan-1-2020, Jan-2-2020, Jan-3-2020..., Dec-31-2020} + # When = 7, the system makes prediction every week. + # E.g., the system forecasts transactions in upcoming 7 days + # on every Monday. 
+ cfg.link_pred_spec.forecast_frequency = 1 + + # How many days into the future the model is trained to predict. + # The model forecasts transactions in (today, today + forecast_horizon]. + # NOTE: forecast_horizon should >= forecast_frequency to cover all days. + cfg.link_pred_spec.forecast_horizon = 7 + + +register_config('roland', set_cfg_roland) From 2774bbd1b63902b7085afeb96c22ee4a34461144 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Thu, 3 Jun 2021 19:33:59 -0700 Subject: [PATCH 04/66] add register for embedding update module. --- graphgym/register.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/graphgym/register.py b/graphgym/register.py index 32d64b6d..cfa95ade 100644 --- a/graphgym/register.py +++ b/graphgym/register.py @@ -33,6 +33,10 @@ def register_head(key, module): def register_layer(key, module): register(key, module, layer_dict) +update_dict = {} +def register_update(key, module): + register(key, module, update_dict) + pooling_dict = {} def register_pooling(key, module): register(key, module, pooling_dict) From 015b86cbfb2a635746b468837688577d774add78 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Thu, 3 Jun 2021 19:52:12 -0700 Subject: [PATCH 05/66] add embedding update modules. --- graphgym/models/update.py | 188 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 graphgym/models/update.py diff --git a/graphgym/models/update.py b/graphgym/models/update.py new file mode 100644 index 00000000..df99d3aa --- /dev/null +++ b/graphgym/models/update.py @@ -0,0 +1,188 @@ +"""Embedding update modules for dynamic graphs.""" +import graphgym.register as register +import torch +import torch.nn as nn +from graphgym.models.layer import MLP + + +class MovingAverageUpdater(nn.Module): + """ + Moving average updater for node embeddings, + let h[l, t] denote all nodes' embedding at the l-th layer at snapshot t. + + h[l,t] = KeepRatio * h[l,t-1] + (1-KeepRatio) * h[l-1,t] + + where the precomputed KeepRatio at current snapshot t is node-specific, + which depends on the node's degree in all snapshots before t and nodes's + degree in snapshot at time t. + """ + + def __init__(self, dim_in: int, dim_out: int, layer_id: int) -> None: + self.layer_id = layer_id + super(MovingAverageUpdater, self).__init__() + + def forward(self, batch): + # TODO: check if boardcasting is correct. + H_prev = batch.node_states[self.layer_id] + X = batch.node_feature + H_new = H_prev * batch.keep_ratio + X * (1.0 - batch.keep_ratio) + batch.node_states[self.layer_id] = H_new + return batch + + +class MLPUpdater(nn.Module): + """ + Node embedding update block using simple MLP. + + h[l,t] = MLP(concat(h[l,t-1],h[l-1,t])) + """ + + def __init__(self, dim_in: int, dim_out: int, layer_id: int, + num_layers: int): + """ + Args: + dim_in (int): dimension of h[l-1, t]. + dim_out (int): dimension of h[l, t-1], node embedding dimension of + the current layer level. + layer_id (int): the index of current layer in multi-layer setting. + num_layers (int): number of layers in MLP. + """ + super(MLPUpdater, self).__init__() + self.layer_id = layer_id + # FIXME: + # assert num_layers > 1, 'There is a problem with layer=1 now, pending fix.' 
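        # Sizing note: forward() concatenates the previous hidden state
        # h[l, t-1] (width dim_out) with the current layer input h[l-1, t]
        # (width dim_in), so the MLP below maps dim_in + dim_out -> dim_out.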
+ self.mlp = MLP(dim_in=dim_in + dim_out, dim_out=dim_out, + num_layers=num_layers) + + def forward(self, batch): + H_prev = batch.node_states[self.layer_id] + X = batch.node_feature + concat = torch.cat((H_prev, X), axis=1) + H_new = self.mlp(concat) + batch.node_states[self.layer_id] = H_new + return batch + + +class GRUUpdater(nn.Module): + """ + Node embedding update block using standard GRU. + + h[l,t] = GRU(h[l,t-1], h[l-1,t]) + """ + def __init__(self, dim_in: int, dim_out: int, layer_id: int): + # dim_in (dim of X): dimension of input node_feature. + # dim_out (dim of H): dimension of previous and current hidden states. + # forward(X, H) --> H. + super(GRUUpdater, self).__init__() + self.layer_id = layer_id + self.GRU_Z = nn.Sequential( + nn.Linear(dim_in + dim_out, dim_out, bias=True), + nn.Sigmoid()) + # reset gate. + self.GRU_R = nn.Sequential( + nn.Linear(dim_in + dim_out, dim_out, bias=True), + nn.Sigmoid()) + # new embedding gate. + self.GRU_H_Tilde = nn.Sequential( + nn.Linear(dim_in + dim_out, dim_out, bias=True), + nn.Tanh()) + + def forward(self, batch): + H_prev = batch.node_states[self.layer_id] + X = batch.node_feature + Z = self.GRU_Z(torch.cat([X, H_prev], dim=1)) + R = self.GRU_R(torch.cat([X, H_prev], dim=1)) + H_tilde = self.GRU_H_Tilde(torch.cat([X, R * H_prev], dim=1)) + H_gru = Z * H_prev + (1 - Z) * H_tilde + batch.node_states[self.layer_id] = H_gru + return batch + + +# class MaskedGRUUpdater(nn.Module): +# """ +# Node embedding update block using standard GRU. + +# h[l,t] = GRU(h[l,t-1], h[l-1,t]) +# """ +# def __init__(self, dim_in: int, dim_out: int, layer_id: int): +# # dim_in (dim of X): dimension of input node_feature. +# # dim_out (dim of H): dimension of previous and current hidden states. +# # forward(X, H) --> H. +# super(MaskedGRUUpdater, self).__init__() +# self.layer_id = layer_id +# self.GRU_Z = nn.Sequential( +# nn.Linear(dim_in + dim_out, dim_out, bias=True), +# nn.Sigmoid()) +# # reset gate. +# self.GRU_R = nn.Sequential( +# nn.Linear(dim_in + dim_out, dim_out, bias=True), +# nn.Sigmoid()) +# # new embedding gate. +# self.GRU_H_Tilde = nn.Sequential( +# nn.Linear(dim_in + dim_out, dim_out, bias=True), +# nn.Tanh()) + +# def forward(self, batch): +# H_prev = batch.node_states[self.layer_id] +# X = batch.node_feature +# Z = self.GRU_Z(torch.cat([X, H_prev], dim=1)) +# R = self.GRU_R(torch.cat([X, H_prev], dim=1)) +# H_tilde = self.GRU_H_Tilde(torch.cat([X, R * H_prev], dim=1)) +# H_gru = Z * H_prev + (1 - Z) * H_tilde + +# # Update for active nodes only, use output from GRU. +# keep_mask = (batch.node_degree_new == 0) +# H_out = H_gru +# # Reset inactive nodes' embedding. +# H_out[keep_mask, :] = H_prev[keep_mask, :] + +# batch.node_states[self.layer_id] = H_out +# return batch + + +# class MovingAverageGRUUpdater(nn.Module): +# """ +# Node embedding update block using standard GRU. + +# h[l,t] = GRU(h[l,t-1], h[l-1,t]) +# """ +# def __init__(self, dim_in: int, dim_out: int, layer_id: int): +# # dim_in (dim of X): dimension of input node_feature. +# # dim_out (dim of H): dimension of previous and current hidden states. +# # forward(X, H) --> H. +# super(GRUUpdater, self).__init__() +# self.layer_id = layer_id +# self.GRU_Z = nn.Sequential( +# nn.Linear(dim_in + dim_out, dim_out, bias=True), +# nn.Sigmoid()) +# # reset gate. +# self.GRU_R = nn.Sequential( +# nn.Linear(dim_in + dim_out, dim_out, bias=True), +# nn.Sigmoid()) +# # new embedding gate. 
+# self.GRU_H_Tilde = nn.Sequential( +# nn.Linear(dim_in + dim_out, dim_out, bias=True), +# nn.Tanh()) + +# def forward(self, batch): +# H_prev = batch.node_states[self.layer_id] +# X = batch.node_feature +# Z = self.GRU_Z(torch.cat([X, H_prev], dim=1)) +# R = self.GRU_R(torch.cat([X, H_prev], dim=1)) +# H_tilde = self.GRU_H_Tilde(torch.cat([X, R * H_prev], dim=1)) +# H_gru = Z * H_prev + (1 - Z) * H_tilde + +# H_out = H_prev * batch.keep_ratio + H_gru * (1 - batch.keep_ratio) + +# batch.node_states[self.layer_id] = H_out +# return batch + + +update_dict = { + 'moving_average': MovingAverageUpdater, + 'mlp': MLPUpdater, + 'gru': GRUUpdater +} + +# merge additional update modules in register.update_dict. +update_dict = {**register.update_dict, **update_dict} From 158d859cd4201a25253c90bafbc3cd34ce056454 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Thu, 3 Jun 2021 19:53:02 -0700 Subject: [PATCH 06/66] add config for roland. --- graphgym/contrib/config/roland.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/graphgym/contrib/config/roland.py b/graphgym/contrib/config/roland.py index a30c38fe..cf00a50b 100644 --- a/graphgym/contrib/config/roland.py +++ b/graphgym/contrib/config/roland.py @@ -18,8 +18,7 @@ def set_cfg_roland(cfg): # gru: new embedding = GRU(node_feature, old_embedding). # masked_gru: only apply GRU to active nodes. cfg.gnn.embed_update_method = 'moving_average' - # what kind of GRU kernel to use if GRU is required for embedding updating. - cfg.gnn.gru_kernel = 'linear' + # how many layers to use in the MLP updater. # default: 1, use a simple linear layer. cfg.gnn.mlp_update_layers = 2 @@ -31,7 +30,7 @@ def set_cfg_roland(cfg): cfg.meta.is_meta = False # choose between 'moving_average' and 'online_mean' - cfg.meta.method = 'moving_average' + cfg.meta.method = 'moving_average' # TODO: remove, only use moving_average. # For online mean: # new_mean = (n-1)/n * old_mean + 1/n * new_value. # where *_mean corresponds to W_init. From e7467484037faacc137963b8044e414f77eea0be Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 15:41:20 -0700 Subject: [PATCH 07/66] add residual edge convolution --- graphgym/contrib/layer/residual_edge_conv.py | 137 +++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 graphgym/contrib/layer/residual_edge_conv.py diff --git a/graphgym/contrib/layer/residual_edge_conv.py b/graphgym/contrib/layer/residual_edge_conv.py new file mode 100644 index 00000000..e3fa28d7 --- /dev/null +++ b/graphgym/contrib/layer/residual_edge_conv.py @@ -0,0 +1,137 @@ +import torch +import torch.nn as nn +from torch.nn import Parameter +from torch_geometric.nn.conv import MessagePassing +from torch_geometric.nn.inits import zeros +from torch_geometric.utils import add_remaining_self_loops +from torch_scatter import scatter_add + +from graphgym.config import cfg +from graphgym.register import register_layer + + +class ResidualEdgeConvLayer(MessagePassing): + r''' + A general GNN layer with arbitrary edge features and self residual + connections. 
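    The residual term is selected by cfg.gnn.skip_connection:
        'affine'   -> output = aggregated messages + Linear(x)
        'identity' -> output = aggregated messages + x
                      (requires in_channels == out_channels)
        'none'     -> output = aggregated messages only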
+ ''' + + def __init__(self, in_channels: int, out_channels: int, + improved: bool = False, cached: bool = False, bias: bool = True, + **kwargs): + super(ResidualEdgeConvLayer, self).__init__(aggr=cfg.gnn.agg, **kwargs) + + self.in_channels = in_channels + self.out_channels = out_channels + self.improved = improved + self.cached = cached + self.normalize = cfg.gnn.normalize_adj + self.msg_direction = cfg.gnn.msg_direction + + if self.msg_direction == 'single': + self.linear_msg = nn.Linear(in_channels + cfg.dataset.edge_dim, + out_channels, bias=False) + elif self.msg_direction == 'both': + self.linear_msg = nn.Linear(in_channels * 2 + cfg.dataset.edge_dim, + out_channels, bias=False) + else: + raise ValueError + + if cfg.gnn.skip_connection == 'affine': + self.linear_skip = nn.Linear(in_channels, out_channels, bias=True) + elif cfg.gnn.skip_connection == 'identity': + assert self.in_channels == self.out_channels + + if bias: + self.bias = Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + + self.reset_parameters() + + def reset_parameters(self): + zeros(self.bias) + self.cached_result = None + self.cached_num_edges = None + + @staticmethod + def norm(edge_index, num_nodes, edge_weight=None, improved=False, + dtype=None): + if edge_weight is None: + edge_weight = torch.ones((edge_index.size(1),), dtype=dtype, + device=edge_index.device) + + fill_value = 1 if not improved else 2 + edge_index, edge_weight = add_remaining_self_loops( + edge_index, edge_weight, fill_value, num_nodes) + + row, col = edge_index + deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes) + deg_inv_sqrt = deg.pow(-0.5) + deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0 + + return edge_index, deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col] + + def forward(self, x, edge_index, edge_weight=None, edge_feature=None): + if self.cached and self.cached_result is not None: + if edge_index.size(1) != self.cached_num_edges: + raise RuntimeError( + 'Cached {} number of edges, but found {}. 
Please ' + 'disable the caching behavior of this layer by removing ' + 'the `cached=True` argument in its constructor.'.format( + self.cached_num_edges, edge_index.size(1))) + + if not self.cached or self.cached_result is None: + self.cached_num_edges = edge_index.size(1) + if self.normalize: + edge_index, norm = self.norm(edge_index, x.size(self.node_dim), + edge_weight, self.improved, + x.dtype) + else: + norm = edge_weight + self.cached_result = edge_index, norm + + edge_index, norm = self.cached_result + if cfg.gnn.skip_connection == 'affine': + skip_x = self.linear_skip(x) + elif cfg.gnn.skip_connection == 'identity': + skip_x = x + else: + skip_x = 0.0 + return self.propagate(edge_index, x=x, norm=norm, + edge_feature=edge_feature) + skip_x + + def message(self, x_i, x_j, norm, edge_feature): + if self.msg_direction == 'both': + x_j = torch.cat((x_i, x_j, edge_feature), dim=-1) + elif self.msg_direction == 'single': + x_j = torch.cat((x_j, edge_feature), dim=-1) + else: + raise ValueError + x_j = self.linear_msg(x_j) + return norm.view(-1, 1) * x_j if norm is not None else x_j + + def update(self, aggr_out): + if self.bias is not None: + aggr_out = aggr_out + self.bias + return aggr_out + + def __repr__(self): + return '{}({}, {})'.format(self.__class__.__name__, self.in_channels, + self.out_channels) + + +class ResidualEdgeConv(nn.Module): + '''Wrapper for residual edge conv layer''' + + def __init__(self, dim_in, dim_out, bias=False, **kwargs): + super(ResidualEdgeConv, self).__init__() + self.model = ResidualEdgeConvLayer(dim_in, dim_out, bias=bias) + + def forward(self, batch): + batch.node_feature = self.model(batch.node_feature, batch.edge_index, + edge_feature=batch.edge_feature) + return batch + + +register_layer('residual_edge_conv', ResidualEdgeConv) From 6d0cf0327d6fcbc9b85c57a840408c0f93b7a439 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 15:42:36 -0700 Subject: [PATCH 08/66] update --- graphgym/contrib/layer/residual_edge_conv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graphgym/contrib/layer/residual_edge_conv.py b/graphgym/contrib/layer/residual_edge_conv.py index e3fa28d7..dc7a7ec0 100644 --- a/graphgym/contrib/layer/residual_edge_conv.py +++ b/graphgym/contrib/layer/residual_edge_conv.py @@ -11,10 +11,10 @@ class ResidualEdgeConvLayer(MessagePassing): - r''' + """ A general GNN layer with arbitrary edge features and self residual connections. - ''' + """ def __init__(self, in_channels: int, out_channels: int, improved: bool = False, cached: bool = False, bias: bool = True, From 55c384e63f9555a7476784158cd10648a1753cb4 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 15:48:15 -0700 Subject: [PATCH 09/66] add flie --- .../contrib/loader/dynamic_graph_utils.py | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 graphgym/contrib/loader/dynamic_graph_utils.py diff --git a/graphgym/contrib/loader/dynamic_graph_utils.py b/graphgym/contrib/loader/dynamic_graph_utils.py new file mode 100644 index 00000000..098b3f5e --- /dev/null +++ b/graphgym/contrib/loader/dynamic_graph_utils.py @@ -0,0 +1,80 @@ +""" +Helper functions and utilities for dynamic graphs. + +Mar. 31, 2021. 
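Illustrative usage sketch (assumes `g_all` is a DeepSNAP Graph whose
`edge_time` attribute holds unix timestamps):

    weekly_snapshots = make_graph_snapshot(g_all, snapshot_freq='W')
    edges_per_week = [g.edge_time.shape[0] for g in weekly_snapshots]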
+""" +import numpy as np +import pandas as pd +from deepsnap.graph import Graph +from typing import List + + +def make_graph_snapshot(g_all: Graph, + snapshot_freq: str, + is_hetero: bool=False) -> List[Graph]: + """ + Constructs a list of graph snapshots based from g_all using g_all.edge_time + and provided snapshot_freq (frequency on calendar). + + Args: + g_all: the entire graph object, g_all must have a edge_time attribute, + g_all.edge_time consists of unix timestamp of edge time. + snapshot_freq: snapshot frequency, must be one of + 'D': daily, 'W': weekly, and 'M': monthly. + is_hetero: whether the graph is heterogeneous. + + Return: + A list of graph object, each graph snapshot has edge level information + (edge_feature, edge_time, etc) of only edges in that time period. + However, every graph snapshot has the same and full node level + information (node_feature, node_type, etc). + """ + # Arg check. + if not hasattr(g_all, 'edge_time'): + raise KeyError('Temporal graph needs to have edge_time attribute.') + + if snapshot_freq.upper() not in ['D', 'W', 'M']: + raise ValueError(f'Unsupported snapshot freq: {snapshot_freq}.') + + snapshot_freq = snapshot_freq.upper() + t = g_all.edge_time.numpy().astype(np.int64) # all timestamps. + + period_split = pd.DataFrame( + {'Timestamp': t, 'TransactionTime': pd.to_datetime(t, unit='s')}, + index=range(len(g_all.edge_time)) + ) + + freq_map = {'D': '%j', # day of year. + 'W': '%W', # week of year. + 'M': '%m'} # month of year. + + period_split['Year'] = period_split['TransactionTime'].dt.strftime( + '%Y').astype(int) + + period_split['SubYearFlag'] = period_split['TransactionTime'].dt.strftime( + freq_map[snapshot_freq]).astype(int) + + period2id = period_split.groupby(['Year', 'SubYearFlag']).indices + # e.g., dictionary w/ key = (2021, 3) and val = array(edge IDs). + + periods = sorted(list(period2id.keys())) # ascending order. + # alternatively, sorted(..., key=lambda x: x[0] + x[1]/1000). + snapshot_list = list() + for p in periods: + # unique IDs of edges in this period. + period_members = period2id[p] + + g_incr = Graph( + node_feature=g_all.node_feature, + edge_feature=g_all.edge_feature[period_members, :], + edge_index=g_all.edge_index[:, period_members], + edge_time=g_all.edge_time[period_members], + directed=g_all.directed, + list_n_type=g_all.list_n_type if is_hetero else None, + list_e_type=g_all.list_e_type if is_hetero else None, + ) + if is_hetero and hasattr(g_all, 'node_type'): + g_incr.node_type = g_all.node_type + g_incr.edge_type = g_all.edge_type[period_members] + snapshot_list.append(g_incr) + return snapshot_list From 505e017a6f20ad1893fd509d8a63f3a69f76c35a Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 15:53:48 -0700 Subject: [PATCH 10/66] add method make_graph_snapshot_by_seconds --- .../contrib/loader/dynamic_graph_utils.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/graphgym/contrib/loader/dynamic_graph_utils.py b/graphgym/contrib/loader/dynamic_graph_utils.py index 098b3f5e..448401ea 100644 --- a/graphgym/contrib/loader/dynamic_graph_utils.py +++ b/graphgym/contrib/loader/dynamic_graph_utils.py @@ -3,10 +3,12 @@ Mar. 31, 2021. 
""" +from typing import List + import numpy as np import pandas as pd +import torch from deepsnap.graph import Graph -from typing import List def make_graph_snapshot(g_all: Graph, @@ -78,3 +80,24 @@ def make_graph_snapshot(g_all: Graph, g_incr.edge_type = g_all.edge_type[period_members] snapshot_list.append(g_incr) return snapshot_list + + +def make_graph_snapshot_by_seconds(g_all: Graph, + freq_sec: int) -> List[Graph]: + """ + Split the entire graph into snapshots by frequency in terms of seconds. + """ + split_criterion = g_all.edge_time // freq_sec + groups = torch.sort(torch.unique(split_criterion))[0] + snapshot_list = list() + for t in groups: + period_members = (split_criterion == t) + g_incr = Graph( + node_feature=g_all.node_feature, + edge_feature=g_all.edge_feature[period_members, :], + edge_index=g_all.edge_index[:, period_members], + edge_time=g_all.edge_time[period_members], + directed=g_all.directed + ) + snapshot_list.append(g_incr) + return snapshot_list From c22f1a22a5d2ced5eacabe838b5fbb60838d7d77 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 15:54:49 -0700 Subject: [PATCH 11/66] add loader for UCI message. --- graphgym/contrib/loader/roland_ucimsg.py | 112 +++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 graphgym/contrib/loader/roland_ucimsg.py diff --git a/graphgym/contrib/loader/roland_ucimsg.py b/graphgym/contrib/loader/roland_ucimsg.py new file mode 100644 index 00000000..6912be75 --- /dev/null +++ b/graphgym/contrib/loader/roland_ucimsg.py @@ -0,0 +1,112 @@ +""" +Loader for the CollegeMsg temporal network. + +For more information: https://snap.stanford.edu/data/CollegeMsg.html + +Mar. 31, 2021 +""" +import os +from typing import List, Union + +import deepsnap +import numpy as np +import pandas as pd +import torch +from deepsnap.graph import Graph +from sklearn.preprocessing import MinMaxScaler + +from graphgym.config import cfg +import graphgym.contrib.loader.dynamic_graph_utils as utils +from graphgym.register import register_loader + + +def load_single_dataset(dataset_dir: str) -> Graph: + df_trans = pd.read_csv(dataset_dir, sep=' ', header=None) + df_trans.columns = ['SRC', 'DST', 'TIMESTAMP'] + assert not np.any(pd.isna(df_trans).values) + df_trans.reset_index(drop=True, inplace=True) + + # Node IDs of this dataset start from 1, re-index to 0-based. 
+ df_trans['SRC'] -= 1 + df_trans['DST'] -= 1 + + print('num of edges:', len(df_trans)) + print('num of nodes:', np.max(df_trans[['SRC', 'DST']].values) + 1) + + time_scaler = MinMaxScaler((0, 2)) + df_trans['TimestampScaled'] = time_scaler.fit_transform( + df_trans['TIMESTAMP'].values.reshape(-1, 1)) + + edge_feature = torch.Tensor( + df_trans[['TimestampScaled']].values).view(-1, 1) + edge_index = torch.Tensor( + df_trans[['SRC', 'DST']].values.transpose()).long() # (2, E) + num_nodes = torch.max(edge_index) + 1 + + node_feature = torch.ones(num_nodes, 1) + + print('feature_node_int_num: ', node_feature.max() + 1) + + edge_time = torch.FloatTensor(df_trans['TIMESTAMP'].values) + + graph = Graph( + node_feature=node_feature, + edge_feature=edge_feature, + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + + return graph + + +def load_snapshots(dataset_dir: str, + snapshot: bool = True, + snapshot_freq: str = None + ) -> Union[deepsnap.graph.Graph, + List[deepsnap.graph.Graph]]: + g_all = load_single_dataset(dataset_dir) + if not snapshot: + return g_all + if snapshot_freq.upper() not in ['D', 'W', 'M']: + # format: '1200000s' + assert snapshot_freq.endswith('s') + freq = int(snapshot_freq.strip('s')) + snapshot_list = utils.make_graph_snapshot_by_seconds(g_all, freq) + else: + snapshot_list = utils.make_graph_snapshot(g_all, snapshot_freq, + is_hetero=False) + + num_nodes = g_all.edge_index.max() + 1 + + for g_snapshot in snapshot_list: + g_snapshot.node_states = [0 for _ in range(cfg.gnn.layers_mp)] + g_snapshot.node_cells = [0 for _ in range(cfg.gnn.layers_mp)] + g_snapshot.node_degree_existing = torch.zeros(num_nodes) + + return snapshot_list + + +def load_uci_dataset(format, name, dataset_dir): + if format == 'uci_message': + graphs = load_snapshots(os.path.join(dataset_dir, name), + snapshot=cfg.transaction.snapshot, + snapshot_freq=cfg.transaction.snapshot_freq) + if cfg.dataset.split_method == 'chronological_temporal': + # return graphs with enough number of edges. + filtered_graphs = list() + for g in graphs: + if g.num_edges >= 2: + filtered_graphs.append(g) + return filtered_graphs + else: + # The default split (80-10-10) requires at least 10 edges each + # snapshot. 
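            # Keeping only snapshots with at least 10 edges ensures the
            # 80/10/10 per-snapshot edge split leaves at least one edge for
            # validation and one for testing.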
+ filtered_graphs = list() + for g in graphs: + if g.num_edges >= 10: + filtered_graphs.append(g) + return filtered_graphs + + +register_loader('roland_uci_message', load_uci_dataset) From 238c4772722383944934fbb5d11671cee0d35e2a Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 15:57:39 -0700 Subject: [PATCH 12/66] remove print --- graphgym/contrib/loader/roland_ucimsg.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/graphgym/contrib/loader/roland_ucimsg.py b/graphgym/contrib/loader/roland_ucimsg.py index 6912be75..6ac2b9fc 100644 --- a/graphgym/contrib/loader/roland_ucimsg.py +++ b/graphgym/contrib/loader/roland_ucimsg.py @@ -45,8 +45,6 @@ def load_single_dataset(dataset_dir: str) -> Graph: node_feature = torch.ones(num_nodes, 1) - print('feature_node_int_num: ', node_feature.max() + 1) - edge_time = torch.FloatTensor(df_trans['TIMESTAMP'].values) graph = Graph( From 5c05fa481f61af8fe74215afe96d0b39b4039a50 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 16:57:57 -0700 Subject: [PATCH 13/66] add loader --- graphgym/contrib/loader/roland_as.py | 167 +++++++++++++++++++++++ graphgym/contrib/loader/roland_btc.py | 182 ++++++++++++++++++++++++++ 2 files changed, 349 insertions(+) create mode 100644 graphgym/contrib/loader/roland_as.py create mode 100644 graphgym/contrib/loader/roland_btc.py diff --git a/graphgym/contrib/loader/roland_as.py b/graphgym/contrib/loader/roland_as.py new file mode 100644 index 00000000..4cab81ad --- /dev/null +++ b/graphgym/contrib/loader/roland_as.py @@ -0,0 +1,167 @@ +""" +Loader for the Autonomous systems AS-733 dataset. +""" +import os +from datetime import datetime +from typing import List + +import numpy as np +import pandas as pd +import torch +from deepsnap.graph import Graph +from graphgym.config import cfg +from graphgym.register import register_loader +from sklearn.preprocessing import OrdinalEncoder +from tqdm import tqdm + + +def make_graph_snapshot(g_all: Graph, snapshot_freq: str) -> List[Graph]: + t = g_all.edge_time.numpy().astype(np.int64) + snapshot_freq = snapshot_freq.upper() + + period_split = pd.DataFrame( + {'Timestamp': t, + 'TransactionTime': pd.to_datetime(t, unit='s')}, + index=range(len(g_all.edge_time))) + + freq_map = {'D': '%j', # day of year. + 'W': '%W', # week of year. + 'M': '%m' # month of year. + } + + period_split['Year'] = period_split['TransactionTime'].dt.strftime( + '%Y').astype(int) + + period_split['SubYearFlag'] = period_split['TransactionTime'].dt.strftime( + freq_map[snapshot_freq]).astype(int) + + period2id = period_split.groupby(['Year', 'SubYearFlag']).indices + + periods = sorted(list(period2id.keys())) + snapshot_list = list() + + for p in periods: + # unique IDs of edges in this period. 
+ period_members = period2id[p] + assert np.all(period_members == np.unique(period_members)) + + g_incr = Graph( + node_feature=g_all.node_feature, + edge_feature=g_all.edge_feature[period_members, :], + edge_index=g_all.edge_index[:, period_members], + edge_time=g_all.edge_time[period_members], + directed=g_all.directed + ) + snapshot_list.append(g_incr) + + snapshot_list.sort(key=lambda x: torch.min(x.edge_time)) + + return snapshot_list + + +def file2timestamp(file_name): + t = file_name.strip('.txt').strip('as') + ts = int(datetime.strptime(t, '%Y%m%d').timestamp()) + return ts + + +def load_generic_dataset(format, name, dataset_dir): + if format == 'as': + all_files = [x for x in sorted(os.listdir(dataset_dir)) + if (x.startswith('as') and x.endswith('.txt'))] + assert len(all_files) == 733 + assert all(x.endswith('.txt') for x in all_files) + + edge_index_lst, edge_time_lst = list(), list() + all_files = sorted(all_files) + # if cfg.train.mode in ['baseline', 'baseline_v2', 'live_update_fixed_split']: + # # The baseline setting in EvolveGCN paper only uses 100 snapshots. + # all_files = all_files[:100] + for graph_file in tqdm(all_files): + today = file2timestamp(graph_file) + graph_file = os.path.join(dataset_dir, graph_file) + + src, dst = list(), list() + with open(graph_file, 'r') as f: + for line in f.readlines(): + if line.startswith('#'): + continue + line = line.strip('\n') + v1, v2 = line.split('\t') + src.append(int(v1)) + dst.append(int(v2)) + + edge_index = np.stack((src, dst)) + edge_index_lst.append(edge_index) + + edge_time = np.ones(edge_index.shape[1]) * today + edge_time_lst.append(edge_time) + + edge_index_raw = np.concatenate(edge_index_lst, axis=1).astype(int) + + num_nodes = len(np.unique(edge_index_raw)) + + # encode node indices to consecutive integers. + node_indices = np.sort(np.unique(edge_index_raw)) + enc = OrdinalEncoder(categories=[node_indices, node_indices]) + edge_index = enc.fit_transform(edge_index_raw.transpose()).transpose() + edge_index = torch.Tensor(edge_index).long() + edge_time = torch.Tensor(np.concatenate(edge_time_lst)) + + # Use scaled datetime as edge_feature. + scale = edge_time.max() - edge_time.min() + base = edge_time.min() + scaled_edge_time = 2 * (edge_time.clone() - base) / scale + + assert cfg.dataset.AS_node_feature in ['one', 'one_hot_id', + 'one_hot_degree_global', + 'one_hot_degree_local'] + + if cfg.dataset.AS_node_feature == 'one': + node_feature = torch.ones(num_nodes, 1) + elif cfg.dataset.AS_node_feature == 'one_hot_id': + # One hot encoding the node ID. + node_feature = torch.Tensor(np.eye(num_nodes)) + elif cfg.dataset.AS_node_feature == 'one_hot_degree_global': + # undirected graph, use only out degree. + _, node_degree = torch.unique(edge_index[0], sorted=True, + return_counts=True) + node_feature = np.zeros((num_nodes, node_degree.max() + 1)) + node_feature[np.arange(num_nodes), node_degree] = 1 + # 1 ~ 63748 degrees, but only 710 possible levels, exclude all zero + # columns. 
+ non_zero_cols = (node_feature.sum(axis=0) > 0) + node_feature = node_feature[:, non_zero_cols] + node_feature = torch.Tensor(node_feature) + else: + raise NotImplementedError + + g_all = Graph( + node_feature=node_feature, + edge_feature=scaled_edge_time.reshape(-1, 1), + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + + snapshot_list = make_graph_snapshot(g_all, + cfg.transaction.snapshot_freq) + + for g_snapshot in snapshot_list: + g_snapshot.node_states = [0 for _ in range(cfg.gnn.layers_mp)] + g_snapshot.node_cells = [0 for _ in range(cfg.gnn.layers_mp)] + g_snapshot.node_degree_existing = torch.zeros(num_nodes) + + if cfg.dataset.split_method == 'chronological_temporal': + return snapshot_list + else: + # The default split (80-10-10) requires at least 10 edges each + # snapshot. + filtered_graphs = list() + for g in tqdm(snapshot_list): + if g.num_edges >= 10: + filtered_graphs.append(g) + return filtered_graphs + + +register_loader('roland_as', load_generic_dataset) diff --git a/graphgym/contrib/loader/roland_btc.py b/graphgym/contrib/loader/roland_btc.py new file mode 100644 index 00000000..58a9884d --- /dev/null +++ b/graphgym/contrib/loader/roland_btc.py @@ -0,0 +1,182 @@ +""" +Data loader for bitcoin datasets. +Mar. 27, 2021 +""" +import os +from typing import List, Union + +import deepsnap +import graphgym.contrib.loader.dynamic_graph_utils as utils +import numpy as np +import pandas as pd +import torch +from deepsnap.graph import Graph +from graphgym.config import cfg +from graphgym.register import register_loader +from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder + + +def load_single_dataset(dataset_dir: str) -> Graph: + df_trans = pd.read_csv(dataset_dir, sep=',', header=None, index_col=None) + df_trans.columns = ['SOURCE', 'TARGET', 'RATING', 'TIME'] + # NOTE: 'SOURCE' and 'TARGET' are not consecutive. + num_nodes = len( + pd.unique(df_trans[['SOURCE', 'TARGET']].to_numpy().ravel())) + + # bitcoin OTC contains decimal numbers, round them. + df_trans['TIME'] = df_trans['TIME'].astype(np.int).astype(np.float) + assert not np.any(pd.isna(df_trans).values) + + time_scaler = MinMaxScaler((0, 2)) + df_trans['TimestampScaled'] = time_scaler.fit_transform( + df_trans['TIME'].values.reshape(-1, 1)) + + edge_feature = torch.Tensor( + df_trans[['RATING', 'TimestampScaled']].values) # (E, edge_dim) + # SOURCE and TARGET IDs are already encoded in the csv file. + # edge_index = torch.Tensor( + # df_trans[['SOURCE', 'TARGET']].values.transpose()).long() # (2, E) + + node_indices = np.sort( + pd.unique(df_trans[['SOURCE', 'TARGET']].to_numpy().ravel())) + enc = OrdinalEncoder(categories=[node_indices, node_indices]) + raw_edges = df_trans[['SOURCE', 'TARGET']].values + edge_index = enc.fit_transform(raw_edges).transpose() + edge_index = torch.LongTensor(edge_index) + + # num_nodes = torch.max(edge_index) + 1 + # Use dummy node features. + node_feature = torch.ones(num_nodes, 1).float() + + edge_time = torch.FloatTensor(df_trans['TIME'].values) + + # TODO: add option here. 
+ # if cfg.train.mode in ['baseline', 'baseline_v2', 'live_update_fixed_split']: + # edge_feature = torch.cat((edge_feature, edge_feature.clone()), dim=0) + # reversed_idx = torch.stack([edge_index[1], edge_index[0]]).clone() + # edge_index = torch.cat((edge_index, reversed_idx), dim=1) + # edge_time = torch.cat((edge_time, edge_time.clone())) + + graph = Graph( + node_feature=node_feature, + edge_feature=edge_feature, + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + return graph + + +# def make_graph_snapshot(g_all: Graph, snapshot_freq: str) -> List[Graph]: +# t = g_all.edge_time.numpy().astype(np.int64) +# snapshot_freq = snapshot_freq.upper() + +# period_split = pd.DataFrame( +# {'Timestamp': t, +# 'TransactionTime': pd.to_datetime(t, unit='s')}, +# index=range(len(g_all.edge_time))) + +# freq_map = {'D': '%j', # day of year. +# 'W': '%W', # week of year. +# 'M': '%m' # month of year. +# } + +# period_split['Year'] = period_split['TransactionTime'].dt.strftime( +# '%Y').astype(int) + +# period_split['SubYearFlag'] = period_split['TransactionTime'].dt.strftime( +# freq_map[snapshot_freq]).astype(int) + +# period2id = period_split.groupby(['Year', 'SubYearFlag']).indices + +# periods = sorted(list(period2id.keys())) +# snapshot_list = list() + +# for p in periods: +# # unique IDs of edges in this period. +# period_members = period2id[p] +# assert np.all(period_members == np.unique(period_members)) + +# g_incr = Graph( +# node_feature=g_all.node_feature, +# edge_feature=g_all.edge_feature[period_members, :], +# edge_index=g_all.edge_index[:, period_members], +# edge_time=g_all.edge_time[period_members], +# directed=g_all.directed +# ) +# snapshot_list.append(g_incr) + +# snapshot_list.sort(key=lambda x: torch.min(x.edge_time)) + +# return snapshot_list + + +# def split_by_seconds(g_all, freq_sec: int): +# # Split the entire graph into snapshots. +# split_criterion = g_all.edge_time // freq_sec +# groups = torch.sort(torch.unique(split_criterion))[0] +# snapshot_list = list() +# for t in groups: +# period_members = (split_criterion == t) +# g_incr = Graph( +# node_feature=g_all.node_feature, +# edge_feature=g_all.edge_feature[period_members, :], +# edge_index=g_all.edge_index[:, period_members], +# edge_time=g_all.edge_time[period_members], +# directed=g_all.directed +# ) +# snapshot_list.append(g_incr) +# return snapshot_list + +# TODO: merge these two method. +def load_snapshots(dataset_dir: str, + snapshot: bool = True, + snapshot_freq: str = None + ) -> Union[deepsnap.graph.Graph, + List[deepsnap.graph.Graph]]: + g_all = load_single_dataset(dataset_dir) + if not snapshot: + return g_all + + if snapshot_freq.upper() not in ['D', 'W', 'M']: + # format: '1200000s' + # assume split by seconds (timestamp) as in EvolveGCN paper. + freq = int(snapshot_freq.strip('s')) + snapshot_list = utils.make_graph_snapshot_by_seconds(g_all, freq) + else: + snapshot_list = utils.make_graph_snapshot(g_all, snapshot_freq) + num_nodes = g_all.edge_index.max() + 1 + + for g_snapshot in snapshot_list: + g_snapshot.node_states = [0 for _ in range(cfg.gnn.layers_mp)] + g_snapshot.node_cells = [0 for _ in range(cfg.gnn.layers_mp)] + g_snapshot.node_degree_existing = torch.zeros(num_nodes) + + # check snapshots ordering. 
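    # Snapshots must be strictly ordered in time: the latest edge of
    # snapshot i has to precede the earliest edge of snapshot i + 1, so the
    # assertion below also rules out overlapping time ranges.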
+ prev_end = -1 + for g in snapshot_list: + start, end = torch.min(g.edge_time), torch.max(g.edge_time) + assert prev_end < start <= end + prev_end = end + + return snapshot_list + + +def load_btc_dataset(format: str, name: str, dataset_dir: str): + if format == 'bitcoin': + graphs = load_snapshots(os.path.join(dataset_dir, name), + snapshot=cfg.transaction.snapshot, + snapshot_freq=cfg.transaction.snapshot_freq) + if cfg.dataset.split_method == 'chronological_temporal': + return graphs + else: + # The default split (80-10-10) requires at least 10 edges each + # snapshot. + filtered_graphs = list() + for g in graphs: + if g.num_edges >= 10: + filtered_graphs.append(g) + return filtered_graphs + + +register_loader('roland_btc', load_btc_dataset) From c60761fdd0d14bc4373fdc8dd37ea5c10941c478 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 17:01:14 -0700 Subject: [PATCH 14/66] add --- graphgym/contrib/loader/roland_reddit.py | 174 +++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 graphgym/contrib/loader/roland_reddit.py diff --git a/graphgym/contrib/loader/roland_reddit.py b/graphgym/contrib/loader/roland_reddit.py new file mode 100644 index 00000000..37d7e66d --- /dev/null +++ b/graphgym/contrib/loader/roland_reddit.py @@ -0,0 +1,174 @@ +import os +from typing import List, Union + +import dask.dataframe as dd +import deepsnap +import graphgym.contrib.loader.dynamic_graph_utils as utils +import numpy as np +import pandas as pd +import torch +from dask_ml.preprocessing import OrdinalEncoder +from deepsnap.graph import Graph +from graphgym.config import cfg +from graphgym.register import register_loader +from sklearn.preprocessing import MinMaxScaler + + +def load_single_dataset(dataset_dir: str) -> Graph: + df_trans = dd.read_csv(dataset_dir, sep='\t', low_memory=False) + df_trans = df_trans.compute() + assert not np.any(pd.isna(df_trans).values) + df_trans.reset_index(drop=True, inplace=True) # required for dask. + + # Encode src and dst node IDs. + # get unique values of src and dst. + unique_subreddits = pd.unique( + df_trans[['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']].to_numpy().ravel()) + unique_subreddits = np.sort(unique_subreddits) + cate_type = pd.api.types.CategoricalDtype(categories=unique_subreddits, + ordered=True) + df_trans['SOURCE_SUBREDDIT'] = df_trans['SOURCE_SUBREDDIT'].astype( + cate_type) + df_trans['TARGET_SUBREDDIT'] = df_trans['TARGET_SUBREDDIT'].astype( + cate_type) + enc = OrdinalEncoder(columns=['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']) + df_encoded = enc.fit_transform(df_trans) + df_encoded.reset_index(drop=True, inplace=True) + + # Add node feature from the embedding dataset. + node_embedding_dir = os.path.join(cfg.dataset.dir, + 'web-redditEmbeddings-subreddits.csv') + + # index: subreddit name, values: embedding. + df_node = pd.read_csv(node_embedding_dir, header=None, index_col=0) + + # ordinal encoding follows order in unique_subreddits. + # df_encoded['SOURCE_SUBREDDIT'] contains encoded integral values. + # unique_subreddits[df_encoded['SOURCE_SUBREDDIT']] + # tries to reverse encoded_integer --> original subreddit name. + # check if recovered sub-reddit name matched the raw data. + for col in ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']: + assert all(unique_subreddits[df_encoded[col]] == df_trans[col]) + + num_nodes = len(cate_type.categories) + node_feature = torch.ones(size=(num_nodes, 300)) + # for nodes without precomputed embedding, use the average value. 
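    # np.mean over the full (num_embedded_subreddits, 300) matrix is a single
    # scalar, so every node starts from the same constant vector; rows for
    # subreddits with a precomputed embedding are overwritten in the loop below.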
+ node_feature = node_feature * np.mean(df_node.values) + + # cate_type.categories[i] is encoded to i, by construction. + for i, subreddit in enumerate(cate_type.categories): + if subreddit in df_node.index: + embedding = df_node.loc[subreddit] + node_feature[i, :] = torch.Tensor(embedding.values) + + # Original format: df['TIMESTAMP'][0] = '2013-12-31 16:39:18' + # Convert to unix timestamp (integers). + df_encoded['TIMESTAMP'] = pd.to_datetime(df_encoded['TIMESTAMP'], + format='%Y-%m-%d %H:%M:%S') + df_encoded['TIMESTAMP'] = (df_encoded['TIMESTAMP'] - pd.Timestamp( + '1970-01-01')) // pd.Timedelta('1s') # now integers. + + # Scale edge time. + time_scaler = MinMaxScaler((0, 2)) + df_encoded['TimestampScaled'] = time_scaler.fit_transform( + df_encoded['TIMESTAMP'].values.reshape(-1, 1)) + + # Link sentimental representation (86-dimension). + # comma-separated string: '3.1,5.1,0.0,...' + senti_str_lst = df_encoded['PROPERTIES'].values + edge_senti_embedding = [x.split(',') for x in senti_str_lst] + edge_senti_embedding = np.array(edge_senti_embedding).astype(np.float32) + # (E, 86) + + ef = df_encoded[['TimestampScaled', 'LINK_SENTIMENT']].values + edge_feature = np.concatenate([ef, edge_senti_embedding], axis=1) + edge_feature = torch.Tensor(edge_feature).float() # (E, 88) + + edge_index = torch.Tensor( + df_encoded[['SOURCE_SUBREDDIT', + 'TARGET_SUBREDDIT']].values.transpose()).long() # (2, E) + num_nodes = torch.max(edge_index) + 1 + + edge_time = torch.FloatTensor(df_encoded['TIMESTAMP'].values) + + graph = Graph( + node_feature=node_feature, + edge_feature=edge_feature, + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + + return graph + + +# def make_graph_snapshot(g_all: Graph, snapshot_freq: str) -> list: +# t = g_all.edge_time.numpy().astype(np.int64) +# snapshot_freq = snapshot_freq.upper() + +# period_split = pd.DataFrame( +# {'Timestamp': t, +# 'TransactionTime': pd.to_datetime(t, unit='s')}, +# index=range(len(g_all.edge_time))) + +# freq_map = {'D': '%j', # day of year. +# 'W': '%W', # week of year. +# 'M': '%m' # month of year. +# } + +# period_split['Year'] = period_split['TransactionTime'].dt.strftime( +# '%Y').astype(int) + +# period_split['SubYearFlag'] = period_split['TransactionTime'].dt.strftime( +# freq_map[snapshot_freq]).astype(int) + +# period2id = period_split.groupby(['Year', 'SubYearFlag']).indices +# # e.g., dictionary w/ key = (2021, 3) and val = array(edges). + +# periods = sorted(list(period2id.keys())) +# snapshot_list = list() +# for p in periods: +# # unique IDs of edges in this period. 
+# period_members = period2id[p] +# assert np.all(period_members == np.unique(period_members)) + +# g_incr = Graph( +# node_feature=g_all.node_feature, +# edge_feature=g_all.edge_feature[period_members, :], +# edge_index=g_all.edge_index[:, period_members], +# edge_time=g_all.edge_time[period_members], +# directed=g_all.directed +# ) +# snapshot_list.append(g_incr) +# return snapshot_list + + +def load_generic(dataset_dir: str, + snapshot: bool = True, + snapshot_freq: str = None + ) -> Union[deepsnap.graph.Graph, + List[deepsnap.graph.Graph]]: + g_all = load_single_dataset(dataset_dir) + if not snapshot: + return g_all + else: + snapshot_list = utils.make_graph_snapshot(g_all, snapshot_freq) + num_nodes = g_all.edge_index.max() + 1 + + for g_snapshot in snapshot_list: + g_snapshot.node_states = [0 for _ in range(cfg.gnn.layers_mp)] + g_snapshot.node_cells = [0 for _ in range(cfg.gnn.layers_mp)] + g_snapshot.node_degree_existing = torch.zeros(num_nodes) + + return snapshot_list + + +def load_generic_dataset(format, name, dataset_dir): + if format == 'reddit_hyperlink': + graphs = load_generic(os.path.join(dataset_dir, name), + snapshot=cfg.transaction.snapshot, + snapshot_freq=cfg.transaction.snapshot_freq) + return graphs + + +register_loader('roland_reddit_hyperlink', load_generic_dataset) From e4c8173a102965caffff479fd86706dbd7eacc11 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 17:03:18 -0700 Subject: [PATCH 15/66] add --- graphgym/contrib/loader/roland_bsi_v3.py | 339 +++++++++++++++++++++++ 1 file changed, 339 insertions(+) create mode 100644 graphgym/contrib/loader/roland_bsi_v3.py diff --git a/graphgym/contrib/loader/roland_bsi_v3.py b/graphgym/contrib/loader/roland_bsi_v3.py new file mode 100644 index 00000000..93683931 --- /dev/null +++ b/graphgym/contrib/loader/roland_bsi_v3.py @@ -0,0 +1,339 @@ +""" +A refined version for loading the roland dataset. This version has the +following key points: + +(1) Node's features are determined by their first transaction, so that + payer and payee information are no longer included as a edge features. + + Node features include: + company identity, bank, country, region, Skd, SkdL1, SkdL2, Skis, + SkisL1, SkisL2. + +(2) edge features include: # system, currency, scaled amount (EUR), and + scaled timestamp. + +Mar. 31, 2021 +""" +import os +from typing import List, Union + +import dask.dataframe as dd +import deepsnap +import graphgym.contrib.loader.dynamic_graph_utils as utils +import numpy as np +import pandas as pd +import torch +from dask_ml.preprocessing import OrdinalEncoder +from deepsnap.graph import Graph +from graphgym.config import cfg +from graphgym.register import register_loader +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import OrdinalEncoder as SkOrdinalEncoder + +# ============================================================================= +# Configure and instantiate the loader here. +# ============================================================================= +# Required for all graphs. +SRC_NODE: str = 'Payer' +DST_NODE: str = 'Payee' +TIMESTAMP: str = 'Timestamp' +AMOUNT: str = 'AmountEUR' + +# Categorical columns are SRC_NODE+var and DST_NODE+var. +# columns: SRC_NODE + NODE_CATE_VARS, DST_NODE + NODE_CATE_VARS, EDGE_CATE_VARS +# will be encoded using ordinal encoder. +# Note that '' corresponds to columns SRC_NODE and DST_NODE. 
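+# For example, the entry 'Bank' refers to the columns 'PayerBank' and
+# 'PayeeBank', while the empty string '' refers to the 'Payer' and 'Payee'
+# columns themselves.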
+NODE_CATE_VARS: List[str] = ['', 'Bank', 'Country', 'Region', 'Skd', 'SkdL1', + 'SkdL2', 'Skis', 'SkisL1', 'SkisL2'] +EDGE_CATE_VARS: List[str] = ['# System', 'Currency'] + +# contents of graph.edge_feature +EDGE_FEATURE_COLS: List[str] = [AMOUNT, 'TimestampScaled'] +# contents of graph.node_feature +NODE_FEATURE_LIST: List[str] = ['Bank', 'Country', 'Region', 'SkdL1', 'SkisL1'] + +# Required for heterogeneous graphs only. +# Node and edge features used to define node and edge type in hete GNN. +NODE_TYPE_DEFN: List[str] = ['Country'] +EDGE_TYPE_DEFN: List[str] = ['# System'] + + +# Required for graphs with node features only. + +def get_node_feature(df: pd.DataFrame) -> pd.DataFrame: + """Extract node features from a transaction dataset. + """ + temp = list() + for p in [SRC_NODE, DST_NODE]: + # require ['Payer', 'PayerBank', 'PayerCountry', ...] + cols = [p] + [p + var for var in NODE_FEATURE_LIST] + relevant = df[cols].copy() + # rename to ['Company', 'Bank', 'Country', ...] + relevant.columns = ['Company'] + NODE_FEATURE_LIST + temp.append(relevant) + df_char = pd.concat(temp, axis=0) + + # get company's information based on its first occurrence. + df_char = df_char.groupby('Company').first() + return df_char[NODE_FEATURE_LIST] + + +def construct_additional_features(df: pd.DataFrame) -> pd.DataFrame: + """ + Constructs additional features of the transaction dataset. + """ + # for p in ('Payer', 'Payee'): + # # %% Location of companies. + # mask = (df[p + 'Country'] != 'SI') + # out_of_country = np.empty(len(df), dtype=object) + # out_of_country[mask] = 'OutOfCountry' + # out_of_country[~mask] = 'InCountry' + # df[p + 'OutOfCountry'] = out_of_country + # + # mask = (df['PayerCountry'] != df['PayeeCountry']) + # missing_mask = np.logical_or(df['PayerCountry'] == 'missing', + # df['PayeeCountry'] == 'missing') + # cross_country = np.empty(len(df), dtype=object) + # cross_country[mask] = 'CrossCountry' + # cross_country[~mask] = 'WithinCountry' + # cross_country[missing_mask] = 'Missing' + # df['CrossCountry'] = cross_country + # + # amount_level = np.empty(len(df), dtype=object) + # mask_small = df['AmountEUR'] < 500 + # mask_medium = np.logical_and(df['AmountEUR'] >= 500, + # df['AmountEUR'] < 1000) + # mask_large = df['AmountEUR'] >= 1000 + # amount_level[mask_small] = '$<500' + # amount_level[mask_medium] = '500<=$<1k' + # amount_level[mask_large] = '$>=1k' + # + # df['AmountLevel'] = amount_level + return df + + +def load_single_dataset(dataset_dir: str, is_hetero: bool = True, + type_info_loc: str = 'append' + ) -> Graph: + """ + Loads a single graph object from tsv file. + + Args: + dataset_dir: the path of tsv file to be loaded. + is_hetero: whether to load heterogeneous graph. + type_info_loc: 'append' or 'graph_attribute'. + + Returns: + graph: a (homogenous) deepsnap graph object. + """ + # Load dataset using dask for fast parallel loading. + df_trans = dd.read_csv(dataset_dir, sep='\t', low_memory=False) + df_trans = df_trans.fillna('missing') + df_trans = df_trans.compute() + df_trans = construct_additional_features(df_trans) + df_trans.reset_index(drop=True, inplace=True) # necessary for dask. + + # a unique values of node-level categorical variables. + node_cat_uniques = dict() # Dict[str, np.ndarray of str] + for var in NODE_CATE_VARS: # for each node level categorical variable. + # get unique values of this categorical variable. 
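+        # (values are pooled over the payer and payee columns so that the same
+        # company receives the same integer code in either role.)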
+ relevant = df_trans[[SRC_NODE + var, DST_NODE + var]] + unique_var = pd.unique(relevant.to_numpy().ravel()) + node_cat_uniques[var] = np.sort(unique_var) + # convert corresponding columns into pandas categorical variables. + cate_type = pd.api.types.CategoricalDtype( + categories=node_cat_uniques[var], ordered=True) + for p in ['Payer', 'Payee']: + df_trans[p + var] = df_trans[p + var].astype(cate_type) + + # Convert edge level categorical variables. + for var in EDGE_CATE_VARS: + unique_var = np.sort(pd.unique(df_trans[[var]].to_numpy().ravel())) + cate_type = pd.api.types.CategoricalDtype(categories=unique_var, + ordered=True) + df_trans[var] = df_trans[var].astype(cate_type) + + # Encoding categorical variables, the dask_ml.OrdinalEncoder only modify + # and encode columns of categorical dtype. + enc = OrdinalEncoder() + df_encoded = enc.fit_transform(df_trans) + df_encoded.reset_index(drop=True, inplace=True) + print('Columns encoded to ordinal:') + print(list(enc.categorical_columns_)) + + # Scaling transaction amounts. + scaler = MinMaxScaler((0, 2)) + df_encoded[AMOUNT] = scaler.fit_transform( + df_encoded[AMOUNT].values.reshape(-1, 1)) + + # Scaling timestamps. + time_scaler = MinMaxScaler((0, 2)) + df_encoded['TimestampScaled'] = time_scaler.fit_transform( + df_encoded[TIMESTAMP].values.reshape(-1, 1)) + + # Prepare for output. + edge_feature = torch.Tensor(df_encoded[EDGE_FEATURE_COLS].values) + + print('feature_edge_int_num', + [int(torch.max(edge_feature[:, i])) + 1 + for i in range(len(EDGE_FEATURE_COLS) - 2)]) + + edge_index = torch.Tensor( + df_encoded[[SRC_NODE, DST_NODE]].values.transpose()).long() # (2, E) + num_nodes = torch.max(edge_index) + 1 + assert num_nodes == len(node_cat_uniques['']) + + df_node_info = get_node_feature(df_encoded) + print(df_node_info.shape) + node_feature = torch.Tensor(df_node_info.astype(float).values) + + cfg.transaction.feature_node_int_num = [ + int(torch.max(node_feature[:, i])) + 1 + for i in range(len(NODE_FEATURE_LIST)) + ] + + print('feature_node_int_num: ', + [int(torch.max(node_feature[:, i])) + 1 + for i in range(len(NODE_FEATURE_LIST))]) + + edge_time = torch.FloatTensor(df_encoded[TIMESTAMP].values) + + graph = Graph( + node_feature=node_feature, + edge_feature=edge_feature, + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + + if is_hetero: + # Construct node type signatures. E.g., 'USA--CA' for country + region. + df_node_info['NodeType'] = df_node_info[NODE_TYPE_DEFN[0]].astype(str) + for var in NODE_TYPE_DEFN[1:]: + df_node_info['NodeType'] += ('--' + df_node_info[var].astype(str)) + + node_type_enc = SkOrdinalEncoder() + # The sklearn ordinal encoder transforms numpy array instead. + node_type_int = node_type_enc.fit_transform( + df_node_info['NodeType'].values.reshape(-1, 1)) + node_type_int = torch.FloatTensor(node_type_int) + + # Construct edge type signatures. 
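+        # (same scheme as NodeType: the values of the EDGE_TYPE_DEFN columns
+        # are joined with '--' and then ordinally encoded.)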
+ df_trans['EdgeType'] = df_trans[EDGE_TYPE_DEFN[0]].astype(str) + for var in EDGE_TYPE_DEFN[1:]: + df_trans['EdgeType'] += ('--' + df_trans[var].astype(str)) + + edge_type_enc = SkOrdinalEncoder() + edge_type_int = edge_type_enc.fit_transform( + df_trans['EdgeType'].values.reshape(-1, 1)) + edge_type_int = torch.FloatTensor(edge_type_int) + + if type_info_loc == 'append': + graph.edge_feature = torch.cat((graph.edge_feature, edge_type_int), + dim=1) + graph.node_feature = torch.cat((graph.node_feature, node_type_int), + dim=1) + elif type_info_loc == 'graph_attribute': + graph.node_type = node_type_int.reshape(-1, ) + graph.edge_type = edge_type_int.reshape(-1, ) + else: + raise ValueError(f'Unsupported type info loc: {type_info_loc}') + + # add a list of unique types for reference. + graph.list_n_type = node_type_int.unique().long() + graph.list_e_type = edge_type_int.unique().long() + + return graph + + +# def make_graph_snapshot(g_all: Graph, +# snapshot_freq: str, +# is_hetero: bool = True) -> list: +# """ +# Constructs a list of graph snapshots (Graph or HeteroGraph) based +# on g_all and snapshot_freq. +# +# Args: +# g_all: the entire homogenous graph. +# snapshot_freq: snapshot frequency. +# is_hetero: if make heterogeneous graphs. +# """ +# t = g_all.edge_time.numpy().astype(np.int64) +# snapshot_freq = snapshot_freq.upper() +# +# period_split = pd.DataFrame( +# {'Timestamp': t, +# 'TransactionTime': pd.to_datetime(t, unit='s')}, +# index=range(len(g_all.edge_time))) +# +# freq_map = {'D': '%j', # day of year. +# 'W': '%W', # week of year. +# 'M': '%m' # month of year. +# } +# +# period_split['Year'] = period_split['TransactionTime'].dt.strftime( +# '%Y').astype(int) +# +# period_split['SubYearFlag'] = period_split['TransactionTime'].dt.strftime( +# freq_map[snapshot_freq]).astype(int) +# +# period2id = period_split.groupby(['Year', 'SubYearFlag']).indices +# # e.g., dictionary w/ key = (2021, 3) and val = array(edges). +# +# periods = sorted(list(period2id.keys())) # ascending order. +# # alternatively, sorted(..., key=lambda x: x[0] + x[1]/1000). +# snapshot_list = list() +# for p in periods: +# # unique IDs of edges in this period. 
+# period_members = period2id[p] +# +# g_incr = Graph( +# node_feature=g_all.node_feature, +# edge_feature=g_all.edge_feature[period_members, :], +# edge_index=g_all.edge_index[:, period_members], +# edge_time=g_all.edge_time[period_members], +# directed=g_all.directed, +# list_n_type=g_all.list_n_type if is_hetero else None, +# list_e_type=g_all.list_e_type if is_hetero else None, +# ) +# if is_hetero and hasattr(g_all, 'node_type'): +# g_incr.node_type = g_all.node_type +# g_incr.edge_type = g_all.edge_type[period_members] +# snapshot_list.append(g_incr) +# return snapshot_list + + +def load_generic(dataset_dir: str, + snapshot: bool = True, + snapshot_freq: str = None, + is_hetero: bool = False, + type_info_loc: str = 'graph_attribute' + ) -> Union[deepsnap.graph.Graph, List[deepsnap.graph.Graph]]: + g_all = load_single_dataset(dataset_dir, is_hetero=is_hetero, + type_info_loc=type_info_loc) + if not snapshot: + return g_all + else: + snapshot_list = utils.make_graph_snapshot(g_all, snapshot_freq, is_hetero) + num_nodes = g_all.edge_index.max() + 1 + + for g_snapshot in snapshot_list: + g_snapshot.node_states = [0 for _ in range(cfg.gnn.layers_mp)] + g_snapshot.node_cells = [0 for _ in range(cfg.gnn.layers_mp)] + g_snapshot.node_degree_existing = torch.zeros(num_nodes) + + return snapshot_list + + +def load_generic_dataset(format, name, dataset_dir): + if format == 'roland_bsi_general': + dataset_dir = os.path.join(dataset_dir, name) + graphs = load_generic(dataset_dir, + snapshot=cfg.transaction.snapshot, + snapshot_freq=cfg.transaction.snapshot_freq, + is_hetero=cfg.dataset.is_hetero, + type_info_loc=cfg.dataset.type_info_loc) + return graphs + + +register_loader('roland_bsi_v3', load_generic_dataset) From 13ff46c168aaea49262c36db135912cc728c6143 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 17:04:06 -0700 Subject: [PATCH 16/66] rename --- graphgym/contrib/loader/{roland_bsi_v3.py => roland.py} | 1 + 1 file changed, 1 insertion(+) rename graphgym/contrib/loader/{roland_bsi_v3.py => roland.py} (99%) diff --git a/graphgym/contrib/loader/roland_bsi_v3.py b/graphgym/contrib/loader/roland.py similarity index 99% rename from graphgym/contrib/loader/roland_bsi_v3.py rename to graphgym/contrib/loader/roland.py index 93683931..0e640e77 100644 --- a/graphgym/contrib/loader/roland_bsi_v3.py +++ b/graphgym/contrib/loader/roland.py @@ -336,4 +336,5 @@ def load_generic_dataset(format, name, dataset_dir): return graphs +# TODO: change name. register_loader('roland_bsi_v3', load_generic_dataset) From e9c71f178312fe6ef05ab5dd3aadfd038abf9b63 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sat, 5 Jun 2021 17:28:50 -0700 Subject: [PATCH 17/66] add --- graphgym/contrib/train/train_utils.py | 444 ++++++++++++++++++++++++++ 1 file changed, 444 insertions(+) create mode 100644 graphgym/contrib/train/train_utils.py diff --git a/graphgym/contrib/train/train_utils.py b/graphgym/contrib/train/train_utils.py new file mode 100644 index 00000000..bfc5100d --- /dev/null +++ b/graphgym/contrib/train/train_utils.py @@ -0,0 +1,444 @@ +""" +Metrics, other utility, and helper functions. 
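+
+Includes node-embedding keep-ratio schemes, negative-edge sampling, and
+rank-based evaluation metrics (MRR and recall at K) for link prediction.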
+""" +from typing import Dict, List, Optional + +import deepsnap +import numpy as np +import torch +from graphgym.config import cfg +from graphgym.loss import compute_loss +from torch_scatter import scatter_max, scatter_mean, scatter_min +from tqdm import tqdm + + +def get_keep_ratio(existing: torch.Tensor, new: torch.Tensor, + mode: str='linear') -> torch.Tensor: + """ + Get the keep ratio for individual nodes to update node embeddings. + Specifically: + state[v,t] = state[v,t-1]*keep_ratio + new_feature[v,t]*(1-keep_ratio) + + Args: + existing: a tensor of nodes' degrees in G[0], G[1], ..., G[t-1]. + new: a tensor of nodes' degrees in G[t]. + mode: how to compute the keep_ratio. + + Returns: + A tensor with shape (num_nodes,) valued in [0, 1]. + """ + if mode == 'constant': + # This scheme is equivalent to exponential decaying. + ratio = torch.ones_like(existing) + # node observed for the first time, keep_ratio = 0. + ratio[torch.logical_and(existing == 0, new > 0)] = 0 + # take convex combination of old and new embeddings. + # 1/2 can be changed to other values. + ratio[torch.logical_and(existing > 0, new > 0)] = 1 / 2 + # inactive nodes have keep ratio 1, embeddings don't change. + elif mode == 'linear': + # The original method proposed by Jiaxuan. + ratio = existing / (existing + new + 1e-6) + # Following methods aim to shrink the weight of existing + # degrees, help to ensure non-trivial embedding update when the graph + # is large and history is long. + elif mode == 'log': + ratio = torch.log(existing + 1) / ( + torch.log(existing + 1) + new + 1e-6) + elif mode == 'sqrt': + ratio = torch.sqrt(existing) / (torch.sqrt(existing) + new + 1e-6) + else: + raise NotImplementedError(f'Mode {mode} is not supported.') + return ratio + + +def size_of(batch: deepsnap.graph.Graph) -> int: + """Computes how much memory a batch has consumed.""" + total_byte = 0 + for k, v in batch.__dict__.items(): + if isinstance(v, torch.Tensor): + total_byte += v.element_size() * v.nelement() + elif isinstance(v, list): # for node_states. + for sub_v in v: + if isinstance(sub_v, torch.Tensor): + total_byte += sub_v.element_size() * sub_v.nelement() + + return total_byte / (1024 ** 2) # MiB. + + +def move_batch_to_device(batch: deepsnap.graph.Graph, + device: str) -> deepsnap.graph.Graph: + """Moves and collects everything in the batch to the target device.""" + device = torch.device(device) + # This handles node_feature, edge_feature, etc. + batch = batch.to(device) + + for layer in range(len(batch.node_states)): + if torch.is_tensor(batch.node_states[layer]): + batch.node_states[layer] = batch.node_states[layer].to(device) + + if hasattr(batch, 'node_cells'): + # node_cells exist only for LSTM type RNNs. + for layer in range(len(batch.node_cells)): + if torch.is_tensor(batch.node_cells[layer]): + batch.node_cells[layer] = batch.node_cells[layer].to(device) + + return batch + + +def edge_index_difference(edge_include: torch.LongTensor, + edge_except: torch.LongTensor, + num_nodes: int) -> torch.LongTensor: + """Set difference operator, return edges in edge_all but not + in edge_except. + + Args: + edge_all (torch.LongTensor): (2, E1) tensor of edge indices. + edge_except (torch.LongTensor): (2, E2) tensor of edge indices to be + excluded from edge_all. + num_nodes (int): total number of nodes. + + Returns: + torch.LongTensor: Edge indices in edge_include but not in edge_except. + """ + # flatten (i, j) edge representations. 
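+    # (an edge (i, j) is mapped to the unique integer i * num_nodes + j, e.g.
+    # with num_nodes = 10 the edge (2, 3) becomes 23, so the set difference
+    # reduces to a 1-D membership test with np.isin.)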
+ idx_include = edge_include[0] * num_nodes + edge_include[1] + idx_except = edge_except[0] * num_nodes + edge_except[1] + # filter out edges in idx_except. + mask = torch.from_numpy(np.isin(idx_include, idx_except)).to(torch.bool) + idx_kept = idx_include[~mask] + i = idx_kept // num_nodes + j = idx_kept % num_nodes + return torch.stack([i, j], dim=0).long() + + +def gen_negative_edges(edge_index: torch.LongTensor, + num_neg_per_node: int, + num_nodes: int) -> torch.LongTensor: + """Generates a fixed number of negative edges for each node. + + Args: + edge_index (torch.LongTensor): (2, E) array of positive edges. + num_neg_per_node (int): 'approximate' number of negative edges generated + for each source node in edge_index. + num_nodes (int): total number of nodes. + + Returns: + torch.LongTensor: approximate num_nodes * num_neg_per_node + negative edges. + """ + src_lst = torch.unique(edge_index[0]) # get unique senders. + num_neg_per_node = int(1.2 * num_neg_per_node) # add some redundancy. + i = src_lst.repeat_interleave(num_neg_per_node) + j = torch.Tensor(np.random.choice(num_nodes, len(i), replace=True)) + # candidates for negative edges, X candidates from each src. + candidates = torch.stack([i, j], dim=0).long() + # filter out positive edges in candidate. + neg_edge_index = edge_index_difference(candidates, edge_index.to('cpu'), + num_nodes) + return neg_edge_index + + +def compute_src_mrr_and_recall(edge_label_index: torch.LongTensor, + edge_label: torch.LongTensor, + pred_score: torch.Tensor, + recall_k_lst: List[int], + mrr_top_k: Optional[int] = None + ) -> (float, Dict[int, float]): + """ + Computes source-based MRR and recall at K for each source node in + edge_label_index. + + Args: + edge_label_index: combination of positive and negative edges. + edge_label: label of edges in edge_label_index. + pred_score: P(E=positive) for each edge in edge_label_index. + recall_k_lst: to report recall at k for all k in this list. + mrr_top_k: calculating MRR for each source node using mean(1/rank) for + k positive edges with the highest pred_score. Set to None to use + all positive edges. + """ + assert edge_label_index.shape[1] == len(edge_label) == len(pred_score) + + src_lst = torch.unique(edge_label_index[0]) # source nodes to consider. + # edge_label_index were constructed by adding negative edges to every + # node in edge_index[0], thus every node in src_lst has at least one + # positive edge in edge_label_index. + # I.e., src_lst == torch.unique(edge_label_index[0][edge_label == 1]) + + node_level_mrr = [] # store MRR for each node. + node_recall_at = dict((k, []) for k in recall_k_lst) + for src in tqdm(src_lst, leave=False, desc='Node level MRR/Recall'): + # get positive/negative edges emitted from src node. + self_mask = (edge_label_index[0] == src) + self_label = edge_label[self_mask] + self_pred_score = pred_score[self_mask] + + # Alternative implementation. + best = torch.max(self_pred_score[self_label == 1]) + rank = torch.sum(self_pred_score[self_label == 0] >= best) + 1 + # print(pos_edge_rank[0], true, torch.sum(label == 0)) + mrr = float(1 / rank) + node_level_mrr.append(mrr) # mrr for this node. + + for k in recall_k_lst: + recall = _calculate_recall_at_k(self_pred_score, self_label, k) + node_recall_at[k].append(recall) + + # Average over all nodes. 
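+    # (macro averaging: every source node contributes equally, regardless of
+    # how many positive edges it emits.)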
+ macro_recall = dict((k, np.mean(v)) for (k, v) in node_recall_at.items()) + macro_mrr = float(np.mean(node_level_mrr)) + return macro_mrr, macro_recall + + +def _calculate_recall_at_k(pred_score: torch.Tensor, + label: torch.Tensor, + k: int) -> int: + """Computes whether the score of the most confident positive edge is + within the highest k scores. I.e., whether the most confident + positive edge beats at least k most confident negative edges. + + Args: + pred_score: a tensor of scores of predictions. + label: a tensor of labels. + k: get whether successful recall at k. + + Returns: + an indicator whether there is a successful recall at rank k. + """ + neg_score = pred_score[label == 0] + if len(neg_score) == 0: + return 0 + best_pos_score = torch.max(pred_score[label == 1]) + rank = torch.sum(neg_score >= best_pos_score) + 1 + return int(rank <= k) + + +@torch.no_grad() +def fast_batch_mrr_and_recall(edge_label_index: torch.Tensor, + edge_label: torch.Tensor, + pred_score: torch.Tensor, + num_neg_per_node: int, + num_nodes: int + ) -> (float, Dict[int, float]): + """ + A vectorized implementation to compute average rank-based metrics over + all source nodes. + + Args: + edge_label_index: + edge_label: + pred_score: P(edge i is positive) from the model. + num_neg_per_node: number of negative edges per node. + num_nodes: total number of nodes in the graph. + """ + # start = datetime.now() + + # A list of source nodes to consider. + src_lst = torch.unique(edge_label_index[0], sorted=True) + num_users = len(src_lst) + + edge_pos = edge_label_index[:, edge_label == 1] + edge_neg = edge_label_index[:, edge_label == 0] + + # By construction, negative edge index should be sorted by their src nodes. + assert torch.all(edge_neg[0].sort()[0] == edge_neg[0]) + + # Prediction scores of all positive and negative edges. + p_pos = pred_score[edge_label == 1] + p_neg = pred_score[edge_label == 0] + + # For each player src, compute the highest score among all positive edges + # from src. + # We want to compute the rank of this edge. + # Construct an interval of model's performance. + if cfg.metric.mrr_method == 'mean': + best_p_pos = scatter_mean(src=p_pos, index=edge_pos[0], + dim_size=num_nodes) + elif cfg.metric.mrr_method == 'min': + best_p_pos, _ = scatter_min(src=p_pos, index=edge_pos[0], + dim_size=num_nodes) + else: + # The default setting, consider the rank of the most confident edge. + best_p_pos, _ = scatter_max(src=p_pos, index=edge_pos[0], + dim_size=num_nodes) + # best_p_pos has shape (num_nodes), for nodes not in src_lst has value 0. + best_p_pos_by_user = best_p_pos[src_lst] + + # Sanity check. + # src_lst_2, inverse = torch.unique(edge_pos[0], return_inverse=True) + # best_p_pos, _ = scatter_max(p_pos, inverse) + # assert torch.all(best_p_pos_by_user == best_p_pos) + + uni, counts = torch.unique(edge_neg[0], sorted=True, return_counts=True) + # assert torch.all(counts >= num_neg_per_node) + # assert torch.all(uni == src_lst) + # note: edge_neg (src, dst) are sorted by src. + # find index of first occurrence of each src in edge_neg[0]. + # neg edges[0], [1,1,...1, 2, 2, ... 2, 3, ..] + first_occ_idx = torch.cumsum(counts, dim=0) - counts + add = torch.arange(num_neg_per_node, device=first_occ_idx.device) + + # take the first 100 negative edges from each src. 
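+    # (more precisely, the first num_neg_per_node negative edges per source.
+    # Illustrative toy example: if counts = [3, 2] and num_neg_per_node = 2,
+    # then first_occ_idx = [0, 3], add = [0, 1], and
+    # score_idx = [[0, 1], [3, 4]], one row per source node.)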
+ score_idx = first_occ_idx.view(-1, 1) + add.view(1, -1) + + assert torch.all(edge_neg[0][score_idx].float().std(axis=1) == 0) + # Z = edge_neg[0][first_occ_idx - 1] + # A = edge_neg[0][first_occ_idx] + # B = edge_neg[0][first_occ_idx + 1] + # assert torch.all(Z != A) + # assert torch.all(B == A) + + p_neg_by_user = p_neg[score_idx] # (num_users, num_neg_per_node) + compare = (p_neg_by_user >= best_p_pos_by_user.view(num_users, 1)).float() + assert compare.shape == (num_users, num_neg_per_node) + # compare[i, j], for node i, the j-th negative edge's score > p_best. + + # counts 1 + how many negative edge from src has higher score than p_best. + # if there's no such negative edge, rank is 1. + rank_by_user = compare.sum(axis=1) + 1 # (num_users,) + assert rank_by_user.shape == (num_users,) + + mrr = float(torch.mean(1 / rank_by_user)) + # print(f'MRR={mrr}, time taken: {datetime.now() - start}') + # computes recall at k as well + recall_at = dict() + for k in [1, 3, 10]: + recall_at[k] = float((rank_by_user <= k).float().mean()) + + return mrr, recall_at + + +@torch.no_grad() +def report_rank_based_eval(eval_batch, model, num_neg_per_node: int = 1000): + if num_neg_per_node == -1: + # Do not report rank-based metrics, used in debug mode. + return 0, 0, 0, 0 + # Get positive edge indices. + edge_index = eval_batch.edge_label_index[:, eval_batch.edge_label == 1] + edge_index = edge_index.to('cpu') + + neg_edge_index = gen_negative_edges(edge_index, num_neg_per_node, + num_nodes=eval_batch.num_nodes) + + new_edge_label_index = torch.cat((edge_index, neg_edge_index), + dim=1).long() + new_edge_label = torch.cat((torch.ones(edge_index.shape[1]), + torch.zeros(neg_edge_index.shape[1]) + ), dim=0).long() + + # Construct evaluation samples. + eval_batch.edge_label_index = new_edge_label_index + eval_batch.edge_label = new_edge_label + + eval_batch.to(torch.device(cfg.device)) + # move state to gpu + for layer in range(len(eval_batch.node_states)): + if torch.is_tensor(eval_batch.node_states[layer]): + eval_batch.node_states[layer] = eval_batch.node_states[layer].to( + torch.device(cfg.device)) + pred, true = model(eval_batch) + loss, pred_score = compute_loss(pred, true) + + mrr, recall_at = fast_batch_mrr_and_recall(eval_batch.edge_label_index, + eval_batch.edge_label, + pred_score, + num_neg_per_node, + eval_batch.num_nodes) + + # return mrr, 0, 0, 0 + # + # mrr_old, recall_at_old = compute_src_mrr_and_recall( + # eval_batch.edge_label_index, + # eval_batch.edge_label, + # pred_score, + # recall_k_lst=[1, 3, 10], + # mrr_top_k=1) + # + # print(f'Old MRR: {mrr_old: 0.6f}, new MRR {mrr: 0.6f}') + # print( + # f'Old Recall@1: {recall_at_old[1]: 0.6f}, new Recall@1 {recall_at[1]: 0.6f}') + # print( + # f'Old Recall@3: {recall_at_old[3]: 0.6f}, new Recall@3 {recall_at[3]: 0.6f}') + # print( + # f'Old Recall@10: {recall_at_old[10]: 0.6f}, new Recall@10 {recall_at[10]: 0.6f}') + + return mrr, recall_at[1], recall_at[3], recall_at[10] + + +def get_row_MRR(probs, true_classes): + existing_mask = true_classes == 1 + # descending in probability for all edge predictions. + ordered_indices = np.flip(probs.argsort()) + # indicators of positive/negative, in prob desc order. + ordered_existing_mask = existing_mask[ordered_indices] + # [1, 2, ... ][ordered_existing_mask] + # prob rank of positive edges. + existing_ranks = np.arange(1, true_classes.shape[0] + 1, + dtype=np.float)[ordered_existing_mask] + # average 1/rank of positive edges. 
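+    # (equivalent to np.mean(1.0 / existing_ranks).)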
+ MRR = (1 / existing_ranks).sum() / existing_ranks.shape[0] + return MRR + + +@torch.no_grad() +def report_baseline_MRR(eval_batch, model): + # Get positive edge indices. + edge_index = eval_batch.edge_label_index[:, eval_batch.edge_label == 1] + edge_index = edge_index.to('cpu') + num_nodes = eval_batch.num_nodes + src_of_pos_edges = torch.unique(edge_index[0]).numpy() + + all_edges_idx = np.arange(num_nodes) + all_edges_idx = np.array(np.meshgrid(all_edges_idx, + all_edges_idx)).reshape(2, -1) + all_edges_idx = torch.LongTensor(all_edges_idx) + # Get all O(N^2) negative edges. + neg_edge_index = edge_index_difference( + all_edges_idx, edge_index, num_nodes) + # Only keep negative edges share src node with some positive edges. + mask = np.isin(neg_edge_index[0], src_of_pos_edges) + neg_edge_index = neg_edge_index[:, mask] + + new_edge_label_index = torch.cat((edge_index, neg_edge_index), + dim=1).long() + new_edge_label = torch.cat((torch.ones(edge_index.shape[1]), + torch.zeros(neg_edge_index.shape[1]) + ), dim=0).long() + + # Construct evaluation samples. + eval_batch.edge_label_index = new_edge_label_index + eval_batch.edge_label = new_edge_label + + eval_batch.to(torch.device(cfg.device)) + # move state to gpu + for layer in range(len(eval_batch.node_states)): + if torch.is_tensor(eval_batch.node_states[layer]): + eval_batch.node_states[layer] = eval_batch.node_states[layer].to( + torch.device(cfg.device)) + pred, true = model(eval_batch) + loss, pred_score = compute_loss(pred, true) + + probs = pred_score.cpu().numpy().squeeze() + true = true.cpu().numpy() + + xi = new_edge_label_index[0].cpu().numpy() + xj = new_edge_label_index[1].cpu().numpy() + # pred_matrix = coo_matrix((probs, (xi, xj))).toarray() + # true_matrix = coo_matrix((true, (xi, xj))).toarray() + + row_MRRs = [] + for src in src_of_pos_edges: + mask = np.argwhere(xi == src) + pred_row = probs.take(mask).squeeze() + true_row = true.take(mask).squeeze() + row_MRRs.append(get_row_MRR(pred_row, true_row)) + + # for i, pred_row in enumerate(pred_matrix): + # #check if there are any existing edges + # # only evaluate senders with existing edge (of course). + # if np.isin(1, true_matrix[i]): + # row_MRRs.append(get_row_MRR(pred_row, true_matrix[i])) + + avg_MRR = torch.tensor(row_MRRs).mean() + return float(avg_MRR) From 55f9e765eb76a664f2bba7a176e1d17ea3f27c0c Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 03:37:21 -0700 Subject: [PATCH 18/66] move mrr_num_negative_edges to metric field. --- graphgym/contrib/config/roland.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/graphgym/contrib/config/roland.py b/graphgym/contrib/config/roland.py index cf00a50b..62d5b3c3 100644 --- a/graphgym/contrib/config/roland.py +++ b/graphgym/contrib/config/roland.py @@ -12,6 +12,7 @@ def set_cfg_roland(cfg): # ----------------------------------------------------------------------- # # Customized options # ----------------------------------------------------------------------- # + # TODO: add documentation. # Method to update node embedding from old node embedding and new node features. # Options: 'moving_average', 'masked_gru', 'gru' # moving average: new embedding = r * old + (1-r) * node_feature. @@ -46,19 +47,13 @@ def set_cfg_roland(cfg): cfg.remark = '' # Experimental Features, use this name space to save all controls for # experimental features. 
- cfg.experimental = CN() - - # How many negative edges for each node to compute rank-based evaluation - # metrics such as MRR and recall at K. - # E.g., if multiplier = 1000 and a node has 3 positive edges, then we - # compute the MRR using 1000 randomly generated negative edges - # + 3 existing positive edges. - cfg.experimental.rank_eval_multiplier = 1000 + # TODO: consider remove experiment field. + # cfg.experimental = CN() # Only use the first n snapshots (time periods) to train the model. # Empirically, the model learns rich dynamics from only a few periods. # Set to -1 if using all snapshots. - cfg.experimental.restrict_training_set = -1 + # cfg.experimental.restrict_training_set = -1 # Whether to visualize edge attention of GNN layer after training. cfg.experimental.visualize_gnn_layer = False @@ -171,6 +166,14 @@ def set_cfg_roland(cfg): cfg.transaction.keep_ratio = 'linear' cfg.metric = CN() + # How many negative edges for each node to compute rank-based evaluation + # metrics such as MRR and recall at K. + # E.g., if multiplier = 1000 and a node has 3 positive edges, then we + # compute the MRR using 1000 randomly generated negative edges + # + 3 existing positive edges. + # Use 100 ~ 1000 for fast and reliable results. + cfg.metric.mrr_num_negative_edges = 1000 + # how to compute MRR. # available: f = 'min', 'max', 'mean'. # Step 1: get the p* = f(scores of positive edges) @@ -180,9 +183,10 @@ def set_cfg_roland(cfg): # expected MRR(min) <= MRR(mean) <= MRR(max). cfg.metric.mrr_method = 'max' + # TODO: consider remove link_pred_spec field. # Specs for the link prediction task using BSI dataset. # All units are days. - cfg.link_pred_spec = CN() + # cfg.link_pred_spec = CN() # The period of `today`'s increase: how often the system is making forecast. # E.g., when = 1, @@ -192,12 +196,12 @@ def set_cfg_roland(cfg): # When = 7, the system makes prediction every week. # E.g., the system forecasts transactions in upcoming 7 days # on every Monday. - cfg.link_pred_spec.forecast_frequency = 1 + # cfg.link_pred_spec.forecast_frequency = 1 # How many days into the future the model is trained to predict. # The model forecasts transactions in (today, today + forecast_horizon]. # NOTE: forecast_horizon should >= forecast_frequency to cover all days. - cfg.link_pred_spec.forecast_horizon = 7 + # cfg.link_pred_spec.forecast_horizon = 7 register_config('roland', set_cfg_roland) From c014e687900005a827841304f24300eff859e885 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 03:40:18 -0700 Subject: [PATCH 19/66] update --- graphgym/contrib/train/train_utils.py | 394 +++++++++++++++----------- 1 file changed, 225 insertions(+), 169 deletions(-) diff --git a/graphgym/contrib/train/train_utils.py b/graphgym/contrib/train/train_utils.py index bfc5100d..dad12ff7 100644 --- a/graphgym/contrib/train/train_utils.py +++ b/graphgym/contrib/train/train_utils.py @@ -1,19 +1,19 @@ """ Metrics, other utility, and helper functions. """ -from typing import Dict, List, Optional - import deepsnap import numpy as np import torch from graphgym.config import cfg from graphgym.loss import compute_loss from torch_scatter import scatter_max, scatter_mean, scatter_min -from tqdm import tqdm +# TODO: proof-read this file. +# TODO: remove comments. 
-def get_keep_ratio(existing: torch.Tensor, new: torch.Tensor, - mode: str='linear') -> torch.Tensor: +def get_keep_ratio(existing: torch.Tensor, + new: torch.Tensor, + mode: str = 'linear') -> torch.Tensor: """ Get the keep ratio for individual nodes to update node embeddings. Specifically: @@ -78,7 +78,7 @@ def move_batch_to_device(batch: deepsnap.graph.Graph, batch.node_states[layer] = batch.node_states[layer].to(device) if hasattr(batch, 'node_cells'): - # node_cells exist only for LSTM type RNNs. + # node_cells exist only for LSTM type RNNs. for layer in range(len(batch.node_cells)): if torch.is_tensor(batch.node_cells[layer]): batch.node_cells[layer] = batch.node_cells[layer].to(device) @@ -122,7 +122,7 @@ def gen_negative_edges(edge_index: torch.LongTensor, num_neg_per_node (int): 'approximate' number of negative edges generated for each source node in edge_index. num_nodes (int): total number of nodes. - + Returns: torch.LongTensor: approximate num_nodes * num_neg_per_node negative edges. @@ -139,91 +139,91 @@ def gen_negative_edges(edge_index: torch.LongTensor, return neg_edge_index -def compute_src_mrr_and_recall(edge_label_index: torch.LongTensor, - edge_label: torch.LongTensor, - pred_score: torch.Tensor, - recall_k_lst: List[int], - mrr_top_k: Optional[int] = None - ) -> (float, Dict[int, float]): - """ - Computes source-based MRR and recall at K for each source node in - edge_label_index. - - Args: - edge_label_index: combination of positive and negative edges. - edge_label: label of edges in edge_label_index. - pred_score: P(E=positive) for each edge in edge_label_index. - recall_k_lst: to report recall at k for all k in this list. - mrr_top_k: calculating MRR for each source node using mean(1/rank) for - k positive edges with the highest pred_score. Set to None to use - all positive edges. - """ - assert edge_label_index.shape[1] == len(edge_label) == len(pred_score) - - src_lst = torch.unique(edge_label_index[0]) # source nodes to consider. - # edge_label_index were constructed by adding negative edges to every - # node in edge_index[0], thus every node in src_lst has at least one - # positive edge in edge_label_index. - # I.e., src_lst == torch.unique(edge_label_index[0][edge_label == 1]) - - node_level_mrr = [] # store MRR for each node. - node_recall_at = dict((k, []) for k in recall_k_lst) - for src in tqdm(src_lst, leave=False, desc='Node level MRR/Recall'): - # get positive/negative edges emitted from src node. - self_mask = (edge_label_index[0] == src) - self_label = edge_label[self_mask] - self_pred_score = pred_score[self_mask] - - # Alternative implementation. - best = torch.max(self_pred_score[self_label == 1]) - rank = torch.sum(self_pred_score[self_label == 0] >= best) + 1 - # print(pos_edge_rank[0], true, torch.sum(label == 0)) - mrr = float(1 / rank) - node_level_mrr.append(mrr) # mrr for this node. - - for k in recall_k_lst: - recall = _calculate_recall_at_k(self_pred_score, self_label, k) - node_recall_at[k].append(recall) - - # Average over all nodes. - macro_recall = dict((k, np.mean(v)) for (k, v) in node_recall_at.items()) - macro_mrr = float(np.mean(node_level_mrr)) - return macro_mrr, macro_recall - - -def _calculate_recall_at_k(pred_score: torch.Tensor, - label: torch.Tensor, - k: int) -> int: - """Computes whether the score of the most confident positive edge is - within the highest k scores. I.e., whether the most confident - positive edge beats at least k most confident negative edges. - - Args: - pred_score: a tensor of scores of predictions. 
- label: a tensor of labels. - k: get whether successful recall at k. - - Returns: - an indicator whether there is a successful recall at rank k. - """ - neg_score = pred_score[label == 0] - if len(neg_score) == 0: - return 0 - best_pos_score = torch.max(pred_score[label == 1]) - rank = torch.sum(neg_score >= best_pos_score) + 1 - return int(rank <= k) +# def compute_src_mrr_and_recall(edge_label_index: torch.LongTensor, +# edge_label: torch.LongTensor, +# pred_score: torch.Tensor, +# recall_k_lst: List[int], +# mrr_top_k: Optional[int] = None +# ) -> (float, Dict[int, float]): +# """ +# Computes source-based MRR and recall at K for each source node in +# edge_label_index. + +# Args: +# edge_label_index: combination of positive and negative edges. +# edge_label: label of edges in edge_label_index. +# pred_score: P(E=positive) for each edge in edge_label_index. +# recall_k_lst: to report recall at k for all k in this list. +# mrr_top_k: calculating MRR for each source node using mean(1/rank) for +# k positive edges with the highest pred_score. Set to None to use +# all positive edges. +# """ +# assert edge_label_index.shape[1] == len(edge_label) == len(pred_score) + +# src_lst = torch.unique(edge_label_index[0]) # source nodes to consider. +# # edge_label_index were constructed by adding negative edges to every +# # node in edge_index[0], thus every node in src_lst has at least one +# # positive edge in edge_label_index. +# # I.e., src_lst == torch.unique(edge_label_index[0][edge_label == 1]) + +# node_level_mrr = [] # store MRR for each node. +# node_recall_at = dict((k, []) for k in recall_k_lst) +# for src in tqdm(src_lst, leave=False, desc='Node level MRR/Recall'): +# # get positive/negative edges emitted from src node. +# self_mask = (edge_label_index[0] == src) +# self_label = edge_label[self_mask] +# self_pred_score = pred_score[self_mask] + +# # Alternative implementation. +# best = torch.max(self_pred_score[self_label == 1]) +# rank = torch.sum(self_pred_score[self_label == 0] >= best) + 1 +# # print(pos_edge_rank[0], true, torch.sum(label == 0)) +# mrr = float(1 / rank) +# node_level_mrr.append(mrr) # mrr for this node. + +# for k in recall_k_lst: +# recall = _calculate_recall_at_k(self_pred_score, self_label, k) +# node_recall_at[k].append(recall) + +# # Average over all nodes. +# macro_recall = dict((k, np.mean(v)) for (k, v) in node_recall_at.items()) +# macro_mrr = float(np.mean(node_level_mrr)) +# return macro_mrr, macro_recall + + +# def _calculate_recall_at_k(pred_score: torch.Tensor, +# label: torch.Tensor, +# k: int) -> int: +# """Computes whether the score of the most confident positive edge is +# within the highest k scores. I.e., whether the most confident +# positive edge beats at least k most confident negative edges. + +# Args: +# pred_score: a tensor of scores of predictions. +# label: a tensor of labels. +# k: get whether successful recall at k. + +# Returns: +# an indicator whether there is a successful recall at rank k. 
+# """ +# neg_score = pred_score[label == 0] +# if len(neg_score) == 0: +# return 0 +# best_pos_score = torch.max(pred_score[label == 1]) +# rank = torch.sum(neg_score >= best_pos_score) + 1 +# return int(rank <= k) @torch.no_grad() -def fast_batch_mrr_and_recall(edge_label_index: torch.Tensor, - edge_label: torch.Tensor, - pred_score: torch.Tensor, - num_neg_per_node: int, - num_nodes: int - ) -> (float, Dict[int, float]): +def fast_batch_mrr(edge_label_index: torch.Tensor, + edge_label: torch.Tensor, + pred_score: torch.Tensor, + num_neg_per_node: int, + num_nodes: int, + method: str) -> float: """ A vectorized implementation to compute average rank-based metrics over - all source nodes. + all source nodes. Args: edge_label_index: @@ -231,9 +231,10 @@ def fast_batch_mrr_and_recall(edge_label_index: torch.Tensor, pred_score: P(edge i is positive) from the model. num_neg_per_node: number of negative edges per node. num_nodes: total number of nodes in the graph. - """ - # start = datetime.now() + Returns: + the MRR for all nodes. + """ # A list of source nodes to consider. src_lst = torch.unique(edge_label_index[0], sorted=True) num_users = len(src_lst) @@ -252,27 +253,22 @@ def fast_batch_mrr_and_recall(edge_label_index: torch.Tensor, # from src. # We want to compute the rank of this edge. # Construct an interval of model's performance. - if cfg.metric.mrr_method == 'mean': + if method == 'mean': best_p_pos = scatter_mean(src=p_pos, index=edge_pos[0], dim_size=num_nodes) - elif cfg.metric.mrr_method == 'min': + elif method == 'min': best_p_pos, _ = scatter_min(src=p_pos, index=edge_pos[0], dim_size=num_nodes) - else: + elif method == 'max': # The default setting, consider the rank of the most confident edge. best_p_pos, _ = scatter_max(src=p_pos, index=edge_pos[0], dim_size=num_nodes) + else: + raise ValueError(f'Unrecognized method: {method}.') # best_p_pos has shape (num_nodes), for nodes not in src_lst has value 0. best_p_pos_by_user = best_p_pos[src_lst] - # Sanity check. - # src_lst_2, inverse = torch.unique(edge_pos[0], return_inverse=True) - # best_p_pos, _ = scatter_max(p_pos, inverse) - # assert torch.all(best_p_pos_by_user == best_p_pos) - uni, counts = torch.unique(edge_neg[0], sorted=True, return_counts=True) - # assert torch.all(counts >= num_neg_per_node) - # assert torch.all(uni == src_lst) # note: edge_neg (src, dst) are sorted by src. # find index of first occurrence of each src in edge_neg[0]. # neg edges[0], [1,1,...1, 2, 2, ... 2, 3, ..] 
@@ -283,11 +279,6 @@ def fast_batch_mrr_and_recall(edge_label_index: torch.Tensor, score_idx = first_occ_idx.view(-1, 1) + add.view(1, -1) assert torch.all(edge_neg[0][score_idx].float().std(axis=1) == 0) - # Z = edge_neg[0][first_occ_idx - 1] - # A = edge_neg[0][first_occ_idx] - # B = edge_neg[0][first_occ_idx + 1] - # assert torch.all(Z != A) - # assert torch.all(B == A) p_neg_by_user = p_neg[score_idx] # (num_users, num_neg_per_node) compare = (p_neg_by_user >= best_p_pos_by_user.view(num_users, 1)).float() @@ -300,70 +291,66 @@ def fast_batch_mrr_and_recall(edge_label_index: torch.Tensor, assert rank_by_user.shape == (num_users,) mrr = float(torch.mean(1 / rank_by_user)) - # print(f'MRR={mrr}, time taken: {datetime.now() - start}') - # computes recall at k as well - recall_at = dict() - for k in [1, 3, 10]: - recall_at[k] = float((rank_by_user <= k).float().mean()) - - return mrr, recall_at - - -@torch.no_grad() -def report_rank_based_eval(eval_batch, model, num_neg_per_node: int = 1000): - if num_neg_per_node == -1: - # Do not report rank-based metrics, used in debug mode. - return 0, 0, 0, 0 - # Get positive edge indices. - edge_index = eval_batch.edge_label_index[:, eval_batch.edge_label == 1] - edge_index = edge_index.to('cpu') - - neg_edge_index = gen_negative_edges(edge_index, num_neg_per_node, - num_nodes=eval_batch.num_nodes) - - new_edge_label_index = torch.cat((edge_index, neg_edge_index), - dim=1).long() - new_edge_label = torch.cat((torch.ones(edge_index.shape[1]), - torch.zeros(neg_edge_index.shape[1]) - ), dim=0).long() - - # Construct evaluation samples. - eval_batch.edge_label_index = new_edge_label_index - eval_batch.edge_label = new_edge_label - - eval_batch.to(torch.device(cfg.device)) - # move state to gpu - for layer in range(len(eval_batch.node_states)): - if torch.is_tensor(eval_batch.node_states[layer]): - eval_batch.node_states[layer] = eval_batch.node_states[layer].to( - torch.device(cfg.device)) - pred, true = model(eval_batch) - loss, pred_score = compute_loss(pred, true) - - mrr, recall_at = fast_batch_mrr_and_recall(eval_batch.edge_label_index, - eval_batch.edge_label, - pred_score, - num_neg_per_node, - eval_batch.num_nodes) - - # return mrr, 0, 0, 0 - # - # mrr_old, recall_at_old = compute_src_mrr_and_recall( - # eval_batch.edge_label_index, - # eval_batch.edge_label, - # pred_score, - # recall_k_lst=[1, 3, 10], - # mrr_top_k=1) - # - # print(f'Old MRR: {mrr_old: 0.6f}, new MRR {mrr: 0.6f}') - # print( - # f'Old Recall@1: {recall_at_old[1]: 0.6f}, new Recall@1 {recall_at[1]: 0.6f}') - # print( - # f'Old Recall@3: {recall_at_old[3]: 0.6f}, new Recall@3 {recall_at[3]: 0.6f}') - # print( - # f'Old Recall@10: {recall_at_old[10]: 0.6f}, new Recall@10 {recall_at[10]: 0.6f}') - - return mrr, recall_at[1], recall_at[3], recall_at[10] + return mrr + + +# @torch.no_grad() +# def report_rank_based_eval(eval_batch, model, method: str, +# num_neg_per_node: int=1000): +# if num_neg_per_node == -1: +# # Do not report rank-based metrics, used in debug mode. +# return 0, 0, 0, 0 +# # Get positive edge indices. +# edge_index = eval_batch.edge_label_index[:, eval_batch.edge_label == 1] +# edge_index = edge_index.to('cpu') + +# neg_edge_index = gen_negative_edges(edge_index, num_neg_per_node, +# num_nodes=eval_batch.num_nodes) + +# new_edge_label_index = torch.cat((edge_index, neg_edge_index), +# dim=1).long() +# new_edge_label = torch.cat((torch.ones(edge_index.shape[1]), +# torch.zeros(neg_edge_index.shape[1]) +# ), dim=0).long() + +# # Construct evaluation samples. 
+# eval_batch.edge_label_index = new_edge_label_index +# eval_batch.edge_label = new_edge_label + +# eval_batch.to(torch.device(cfg.device)) +# # move state to gpu +# for layer in range(len(eval_batch.node_states)): +# if torch.is_tensor(eval_batch.node_states[layer]): +# eval_batch.node_states[layer] = eval_batch.node_states[layer].to( +# torch.device(cfg.device)) +# pred, true = model(eval_batch) +# loss, pred_score = compute_loss(pred, true) + +# mrr, recall_at = fast_batch_mrr_and_recall(eval_batch.edge_label_index, +# eval_batch.edge_label, +# pred_score, +# num_neg_per_node, +# eval_batch.num_nodes, +# method) + +# # return mrr, 0, 0, 0 +# # +# # mrr_old, recall_at_old = compute_src_mrr_and_recall( +# # eval_batch.edge_label_index, +# # eval_batch.edge_label, +# # pred_score, +# # recall_k_lst=[1, 3, 10], +# # mrr_top_k=1) +# # +# # print(f'Old MRR: {mrr_old: 0.6f}, new MRR {mrr: 0.6f}') +# # print( +# # f'Old Recall@1: {recall_at_old[1]: 0.6f}, new Recall@1 {recall_at[1]: 0.6f}') +# # print( +# # f'Old Recall@3: {recall_at_old[3]: 0.6f}, new Recall@3 {recall_at[3]: 0.6f}') +# # print( +# # f'Old Recall@10: {recall_at_old[10]: 0.6f}, new Recall@10 {recall_at[10]: 0.6f}') + +# return mrr, recall_at[1], recall_at[3], recall_at[10] def get_row_MRR(probs, true_classes): @@ -382,7 +369,8 @@ def get_row_MRR(probs, true_classes): @torch.no_grad() -def report_baseline_MRR(eval_batch, model): +def report_baseline_MRR(eval_batch: deepsnap.graph.Graph, + model: torch.nn.Module) -> float: # Get positive edge indices. edge_index = eval_batch.edge_label_index[:, eval_batch.edge_label == 1] edge_index = edge_index.to('cpu') @@ -442,3 +430,71 @@ def report_baseline_MRR(eval_batch, model): avg_MRR = torch.tensor(row_MRRs).mean() return float(avg_MRR) + + +def compute_MRR(eval_batch: deepsnap.graph.Graph, + model: torch.nn.Module, + num_neg_per_node: int, + method: str) -> float: + """Computes the MRR score on the evaluation batch. + + Args: + eval_batch (deepsnap.graph.Graph): a graph snapshot. + model (torch.nn.Module): a GNN model for this graph snapshot + num_neg_per_node (int): how many negative edges per node required for + computing the MRR score. + For example, if num_neg_per_node = 1000, this method firstly + sample 1,000 negative edges for each source node, and compute the + average rank of positive edges from each source node among these + 1,000 sampled negative edges. + Setting num_neg_per_node = -1 to use all possible negative edges. + method (str): {'min', 'mean', 'max', 'all'} + All methods firstly compute MRR for each source node, and then + average MRRs over all source nodes. + For each source node v, + let P denote scores of all positive edges from v, the rank() + operator computes the rank among all negative edges from v. + 'min' computes 1/rank(min(P)) + 'mean' computes 1/rank(mean(P)) + 'max' computes 1/rank(max(P)) + 'all' computes mean[1/rank(x) for x in P] + """ + if method == 'all': + # NOTE: this method requires iterating over all nodes, which is slow. + assert num_neg_per_node == -1 + return report_baseline_MRR(eval_batch, model) + else: + assert num_neg_per_node > 0 + # Sample negative edges for each node. 
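+        # (num_neg_per_node sampled negatives per source node approximate the
+        # full ranking; cfg.metric.mrr_num_negative_edges configures this.)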
+ edge_index = eval_batch.edge_label_index[:, eval_batch.edge_label == 1] + edge_index = edge_index.to('cpu') + + neg_edge_index = gen_negative_edges(edge_index, num_neg_per_node, + num_nodes=eval_batch.num_nodes) + + new_edge_label_index = torch.cat((edge_index, neg_edge_index), + dim=1).long() + new_edge_label = torch.cat((torch.ones(edge_index.shape[1]), + torch.zeros(neg_edge_index.shape[1]) + ), dim=0).long() + + # Construct evaluation samples. + eval_batch.edge_label_index = new_edge_label_index + eval_batch.edge_label = new_edge_label + + eval_batch.to(torch.device(cfg.device)) + # move state to gpu + for layer in range(len(eval_batch.node_states)): + if torch.is_tensor(eval_batch.node_states[layer]): + eval_batch.node_states[layer] = eval_batch.node_states[layer].to( + torch.device(cfg.device)) + pred, true = model(eval_batch) + loss, pred_score = compute_loss(pred, true) + + mrr = fast_batch_mrr(eval_batch.edge_label_index, + eval_batch.edge_label, + pred_score, + num_neg_per_node, + eval_batch.num_nodes, + method) + return mrr From 2c9ba8c406b58490d593527b4eed26a23d0dff79 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 03:42:00 -0700 Subject: [PATCH 20/66] add file --- graphgym/contrib/train/train_live_update.py | 324 ++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 graphgym/contrib/train/train_live_update.py diff --git a/graphgym/contrib/train/train_live_update.py b/graphgym/contrib/train/train_live_update.py new file mode 100644 index 00000000..75d0dad6 --- /dev/null +++ b/graphgym/contrib/train/train_live_update.py @@ -0,0 +1,324 @@ +""" +The ROLAND training pipeline with live-update. +""" +import copy +import datetime +import logging +import os +from typing import Dict, List, Optional, Tuple + +import deepsnap +import numpy as np +import torch +from graphgym.checkpoint import clean_ckpt +from graphgym.config import cfg +from graphgym.contrib.train import train_utils +from graphgym.loss import compute_loss +from graphgym.optimizer import create_optimizer, create_scheduler +from graphgym.register import register_train +from graphgym.utils.io import makedirs_rm_exist +from graphgym.utils.stats import node_degree +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + + +@torch.no_grad() +def average_state_dict(dict1: dict, dict2: dict, weight: float) -> dict: + """ + Average two model.state_dict() objects, + ut = (1-w)*dict1 + w*dict2 + when dict1, dict2 are model_dicts, this method updates the meta-model. + """ + assert 0 <= weight <= 1 + d1 = copy.deepcopy(dict1) + d2 = copy.deepcopy(dict2) + out = dict() + for key in d1.keys(): + assert isinstance(d1[key], torch.Tensor) + param1 = d1[key].detach().clone() + assert isinstance(d2[key], torch.Tensor) + param2 = d2[key].detach().clone() + out[key] = (1 - weight) * param1 + weight * param2 + return out + + +def precompute_edge_degree_info(dataset: deepsnap.dataset.GraphDataset): + """Pre-computes edge_degree_existing, edge_degree_new and keep ratio + at each snapshot. Inplace modifications. + """ + # Assume all graph snapshots have the same number of nodes. + num_nodes = dataset[0].node_feature.shape[0] + for t in range(len(dataset)): + if t == 0: + # No previous edges for any nodes. 
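+            # (for t > 0 the existing degree accumulates:
+            # node_degree_existing[t] = node_degree_existing[t-1]
+            #                           + node_degree_new[t-1].)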
+ dataset[t].node_degree_existing = torch.zeros(num_nodes) + else: + dataset[t].node_degree_existing \ + = dataset[t - 1].node_degree_existing \ + + dataset[t - 1].node_degree_new + + dataset[t].node_degree_new = node_degree(dataset[t].edge_index, + n=num_nodes) + + dataset[t].keep_ratio = train_utils.get_keep_ratio( + existing=dataset[t].node_degree_existing, + new=dataset[t].node_degree_new, + mode=cfg.transaction.keep_ratio) + dataset[t].keep_ratio = dataset[t].keep_ratio.unsqueeze(-1) + + +@torch.no_grad() +def get_task_batch(dataset: deepsnap.dataset.GraphDataset, + today: int, tomorrow: int, + prev_node_states: Optional[Dict[str, List[torch.Tensor]]] + ) -> deepsnap.graph.Graph: + """ + Construct batch required for the task (today, tomorrow). As defined in + batch's get_item method (used to get edge_label and get_label_index), + edge_label and edge_label_index returned would be different everytime + get_task_batch() is called. + + Moreover, copy node-memories (node_states and node_cells) to the batch. + """ + assert today < tomorrow < len(dataset) + # Get edges for message passing and prediction task. + batch = dataset[today].clone() + batch.edge_label = dataset[tomorrow].edge_label.clone() + batch.edge_label_index = dataset[tomorrow].edge_label_index.clone() + + # Copy previous memory to the batch. + if prev_node_states is not None: + for key, val in prev_node_states.items(): + copied = [x.detach().clone() for x in val] + setattr(batch, key, copied) + + batch = train_utils.move_batch_to_device(batch, cfg.device) + return batch + + +@torch.no_grad() +def update_node_states(model, dataset, task: Tuple[int, int], + prev_node_states: Optional[ + Dict[str, List[torch.Tensor]]] + ) -> Dict[str, List[torch.Tensor]]: + """Perform the provided task and keep track of the latest node_states. + + Example: task = (t, t+1), + the prev_node_states contains node embeddings at time (t-1). + the model perform task (t, t+1): + Input: (node embedding at t - 1, edges at t). + Output: possible transactions at t+1. + the model also generates node embeddings at t. + + after doing task (t, t+1), node_states contains information + from snapshot t. + """ + today, tomorrow = task + batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() + # Let the model modify batch.node_states (and batch.node_cells). + _, _ = model(batch) + # Collect the updated node states. + out = dict() + out['node_states'] = [x.detach().clone() for x in batch.node_states] + if isinstance(batch.node_cells[0], torch.Tensor): + out['node_cells'] = [x.detach().clone() for x in batch.node_cells] + + return out + + +def train_step(model, optimizer, scheduler, dataset, + task: Tuple[int, int], + prev_node_states: Optional[Dict[str, torch.Tensor]] + ) -> dict: + """ + After receiving ground truth from a particular task, update the model by + performing back-propagation. + For example, on day t, the ground truth of task (t-1, t) has been revealed, + train the model using G[t-1] for message passing and label[t] as target. 
+ """ + optimizer.zero_grad() + torch.cuda.empty_cache() + + today, tomorrow = task + model.train() + batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() + + pred, true = model(batch) + loss, pred_score = compute_loss(pred, true) + loss.backward() + optimizer.step() + + scheduler.step() + return {'loss': loss} + + +@torch.no_grad() +def evaluate_step(model, dataset, task: Tuple[int, int], + prev_node_states: Optional[Dict[str, List[torch.Tensor]]], + fast: bool = False) -> dict: + """ + Evaluate model's performance on task = (today, tomorrow) + where today and tomorrow are integers indexing snapshots. + """ + today, tomorrow = task + model.eval() + batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() + + pred, true = model(batch) + loss, pred_score = compute_loss(pred, true) + + if fast: + # skip MRR calculation for internal validation. + return {'loss': loss.item()} + + mrr_batch = get_task_batch(dataset, today, tomorrow, + prev_node_states).clone() + + mrr, rck1, rck3, rck10 = train_utils.report_rank_based_eval( + mrr_batch, model, + num_neg_per_node=cfg.metric.mrr_num_negative_edges) + + return {'loss': loss.item(), 'mrr': mrr, 'rck1': rck1, 'rck3': rck3, + 'rck10': rck10} + + +def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, + **kwargs): + + for dataset in datasets: + # Sometimes edge degree info is already included in dataset. + if not hasattr(dataset[0], 'keep_ratio'): + precompute_edge_degree_info(dataset) + + if cfg.dataset.premade_datasets == 'fresh_save_cache': + if not os.path.exists(f'{cfg.dataset.dir}/cache/'): + os.mkdir(f'{cfg.dataset.dir}/cache/') + cache_path = '{}/cache/cached_datasets_{}_{}_{}.pt'.format( + cfg.dataset.dir, cfg.dataset.format.replace('.tsv', ''), + cfg.transaction.snapshot_freq, + datetime.now().strftime('%Y_%m_%d__%H_%M_%S') + ) + torch.save(datasets, cache_path) + + num_splits = len(loggers) # train/val/test splits. + # range for today in (today, tomorrow) task pairs. + task_range = range(len(datasets[0]) - cfg.transaction.horizon) + + t = datetime.datetime.now().strftime('%b%d_%H-%M-%S') + + # directory to store tensorboard files of this run. + out_dir = cfg.out_dir.replace('/', '\\') + # dir to store all run outputs for the entire batch. + run_dir = 'runs_' + cfg.remark + + print(f'Tensorboard directory: {out_dir}') + # If tensorboard directory exists, this config is in the re-run phase + # of run_batch, replace logs of previous runs with the new one. + makedirs_rm_exist(f'./{run_dir}/{out_dir}') + writer = SummaryWriter(f'./{run_dir}/{out_dir}') + + # save a copy of configuration for later identifications. + with open(f'./{run_dir}/{out_dir}/config.yaml', 'w') as f: + cfg.dump(stream=f) + + prev_node_states = None # no previous state on day 0. + # {'node_states': [Tensor, Tensor], 'node_cells: [Tensor, Tensor]} + + model_init = None # for meta-learning only, a model.state_dict() object. + + for t in tqdm(task_range, desc='snapshot', leave=True): + # current task: t --> t+1. + # (1) Evaluate model's performance on this task, at this time, the + # model has seen no information on t+1, this evaluation is fair. + for i in range(1, num_splits): + perf = evaluate_step(model, datasets[i], (t, t + 1), + prev_node_states) + + writer.add_scalars('val' if i == 1 else 'test', perf, t) + + # (2) Reveal the ground truth of task (t, t+1) and update the model + # to prepare for the next task. + del optimizer, scheduler # use new optimizers. 
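+        # (Recreating the optimizer and scheduler gives every snapshot a fresh
+        #  optimizer state and learning-rate schedule, so fine-tuning on task
+        #  (t, t+1) does not inherit momentum accumulated on earlier tasks.)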
+ optimizer = create_optimizer(model.parameters()) + scheduler = create_scheduler(optimizer) + + # best model's validation loss, training epochs, and state_dict. + best_model = {'val_loss': np.inf, 'train_epoch': 0, 'state': None} + # keep track of how long we have NOT update the best model. + best_model_unchanged = 0 + # after not updating the best model for `tol` epochs, stop. + tol = cfg.train.internal_validation_tolerance + + # internal training loop (intra-snapshot cross-validation). + # choose the best model using current validation set, prepare for + # next task. + + if cfg.meta.is_meta and (model_init is not None): + # For meta-learning, start fine-tuning from the pre-computed + # initialization weight. + model.load_state_dict(copy.deepcopy(model_init)) + + for i in tqdm(range(cfg.optim.max_epoch + 1), desc='live update', + leave=True): + # Start with the un-trained model (i = 0), evaluate the model. + internal_val_perf = evaluate_step(model, datasets[1], + (t, t + 1), + prev_node_states, fast=True) + val_loss = internal_val_perf['loss'] + + if val_loss < best_model['val_loss']: + # replace the best model with the current model. + best_model = {'val_loss': val_loss, 'train_epoch': i, + 'state': copy.deepcopy(model.state_dict())} + best_model_unchanged = 0 + else: + # the current best model has dominated for these epochs. + best_model_unchanged += 1 + + # if (i >= 2 * tol) and (best_model_unchanged >= tol): + if best_model_unchanged >= tol: + # If the best model has not been updated for a while, stop. + break + else: + # Otherwise, keep training. + train_perf = train_step(model, optimizer, scheduler, + datasets[0], (t, t + 1), + prev_node_states) + writer.add_scalars('train', train_perf, t) + + writer.add_scalar('internal_best_val', best_model['val_loss'], t) + writer.add_scalar('best epoch', best_model['train_epoch'], t) + + # (3) Actually perform the update on training set to get node_states + # contains information up to time t. + # Use the best model selected from intra-snapshot cross-validation. + model.load_state_dict(best_model['state']) + + if cfg.meta.is_meta: # update meta-learning's initialization weights. + if model_init is None: # for the first task. + model_init = copy.deepcopy(best_model['state']) + else: # for subsequent task, update init. + if cfg.meta.method == 'moving_average': + new_weight = cfg.meta.alpha + elif cfg.meta.method == 'online_mean': + new_weight = 1 / (t + 1) # for t=1, the second item, 1/2. + else: + raise ValueError(f'Invalid method: {cfg.meta.method}') + + # (1-new_weight)*model_init + new_weight*best_model. 
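+                # Worked example (hypothetical t): with 'online_mean',
+                # model_init stays the plain average of all selected models,
+                # e.g. at t=2:
+                #   2/3 * (M[0] + M[1]) / 2 + 1/3 * M[2] = (M[0]+M[1]+M[2]) / 3.
+                # With 'moving_average', the model selected k tasks ago keeps
+                # weight alpha * (1-alpha)**k (the very first one (1-alpha)**t).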
+ model_init = average_state_dict(model_init, + best_model['state'], + new_weight) + + prev_node_states = update_node_states(model, datasets[0], (t, t + 1), + prev_node_states) + + writer.close() + + if cfg.train.ckpt_clean: + clean_ckpt() + + logging.info('Task done, results saved in {}'.format(cfg.out_dir)) + + +register_train('live_update', train_live_update) From 0cd303c7ca7db64c9b40fb6c082c9c80d6268d0a Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 16:22:37 -0700 Subject: [PATCH 21/66] update --- graphgym/contrib/train/train_live_update.py | 88 +++++++++++---------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/graphgym/contrib/train/train_live_update.py b/graphgym/contrib/train/train_live_update.py index 75d0dad6..d54d5ece 100644 --- a/graphgym/contrib/train/train_live_update.py +++ b/graphgym/contrib/train/train_live_update.py @@ -4,7 +4,6 @@ import copy import datetime import logging -import os from typing import Dict, List, Optional, Tuple import deepsnap @@ -42,6 +41,7 @@ def average_state_dict(dict1: dict, dict2: dict, weight: float) -> dict: return out +@torch.no_grad() def precompute_edge_degree_info(dataset: deepsnap.dataset.GraphDataset): """Pre-computes edge_degree_existing, edge_degree_new and keep ratio at each snapshot. Inplace modifications. @@ -53,6 +53,7 @@ def precompute_edge_degree_info(dataset: deepsnap.dataset.GraphDataset): # No previous edges for any nodes. dataset[t].node_degree_existing = torch.zeros(num_nodes) else: + # degree[ deepsnap.graph.Graph: """ - Construct batch required for the task (today, tomorrow). As defined in - batch's get_item method (used to get edge_label and get_label_index), - edge_label and edge_label_index returned would be different everytime - get_task_batch() is called. + Construct batch required for the task (today, tomorrow). + For current implementation, we use tomorrow = today + 1. + As defined in batch's get_item method (used to get edge_label and + get_label_index), edge_label and edge_label_index returned would be + different everytime get_task_batch() is called. Moreover, copy node-memories (node_states and node_cells) to the batch. + + Lastly, this method moves the created task batch to the appropriate device. """ assert today < tomorrow < len(dataset) # Get edges for message passing and prediction task. @@ -116,10 +120,12 @@ def update_node_states(model, dataset, task: Tuple[int, int], today, tomorrow = task batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() # Let the model modify batch.node_states (and batch.node_cells). - _, _ = model(batch) + # This operation does not track gradient, so should not affect back-prop. + _, _ = model(batch) # Inplace modification on batch. # Collect the updated node states. out = dict() out['node_states'] = [x.detach().clone() for x in batch.node_states] + # If node cells are also used. if isinstance(batch.node_cells[0], torch.Tensor): out['node_cells'] = [x.detach().clone() for x in batch.node_cells] @@ -129,7 +135,7 @@ def update_node_states(model, dataset, task: Tuple[int, int], def train_step(model, optimizer, scheduler, dataset, task: Tuple[int, int], prev_node_states: Optional[Dict[str, torch.Tensor]] - ) -> dict: + ) -> Dict[str, float]: """ After receiving ground truth from a particular task, update the model by performing back-propagation. 
@@ -149,13 +155,13 @@ def train_step(model, optimizer, scheduler, dataset, optimizer.step() scheduler.step() - return {'loss': loss} + return {'loss': loss.item()} @torch.no_grad() def evaluate_step(model, dataset, task: Tuple[int, int], prev_node_states: Optional[Dict[str, List[torch.Tensor]]], - fast: bool = False) -> dict: + fast: bool=False) -> Dict[str, float]: """ Evaluate model's performance on task = (today, tomorrow) where today and tomorrow are integers indexing snapshots. @@ -174,12 +180,13 @@ def evaluate_step(model, dataset, task: Tuple[int, int], mrr_batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() - mrr, rck1, rck3, rck10 = train_utils.report_rank_based_eval( - mrr_batch, model, - num_neg_per_node=cfg.metric.mrr_num_negative_edges) + mrr = train_utils.compute_MRR( + mrr_batch, + model, + num_neg_per_node=cfg.metric.mrr_num_negative_edges, + method=cfg.metric.mrr_method) - return {'loss': loss.item(), 'mrr': mrr, 'rck1': rck1, 'rck3': rck3, - 'rck10': rck10} + return {'loss': loss.item(), 'mrr': mrr} def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, @@ -190,15 +197,15 @@ def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, if not hasattr(dataset[0], 'keep_ratio'): precompute_edge_degree_info(dataset) - if cfg.dataset.premade_datasets == 'fresh_save_cache': - if not os.path.exists(f'{cfg.dataset.dir}/cache/'): - os.mkdir(f'{cfg.dataset.dir}/cache/') - cache_path = '{}/cache/cached_datasets_{}_{}_{}.pt'.format( - cfg.dataset.dir, cfg.dataset.format.replace('.tsv', ''), - cfg.transaction.snapshot_freq, - datetime.now().strftime('%Y_%m_%d__%H_%M_%S') - ) - torch.save(datasets, cache_path) + # if cfg.dataset.premade_datasets == 'fresh_save_cache': + # if not os.path.exists(f'{cfg.dataset.dir}/cache/'): + # os.mkdir(f'{cfg.dataset.dir}/cache/') + # cache_path = '{}/cache/cached_datasets_{}_{}_{}.pt'.format( + # cfg.dataset.dir, cfg.dataset.format.replace('.tsv', ''), + # cfg.transaction.snapshot_freq, + # datetime.now().strftime('%Y_%m_%d__%H_%M_%S') + # ) + # torch.save(datasets, cache_path) num_splits = len(loggers) # train/val/test splits. # range for today in (today, tomorrow) task pairs. @@ -224,13 +231,17 @@ def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, prev_node_states = None # no previous state on day 0. # {'node_states': [Tensor, Tensor], 'node_cells: [Tensor, Tensor]} - model_init = None # for meta-learning only, a model.state_dict() object. + model_meta = None # the state_dict() object of the meta-model. + + # TODO: How to incorporate logger? - for t in tqdm(task_range, desc='snapshot', leave=True): + for t in tqdm(task_range, desc='Snapshot'): # current task: t --> t+1. # (1) Evaluate model's performance on this task, at this time, the # model has seen no information on t+1, this evaluation is fair. + # TODO: modify here to predict on all edges? for i in range(1, num_splits): + # Validation and test edges. perf = evaluate_step(model, datasets[i], (t, t + 1), prev_node_states) @@ -253,13 +264,13 @@ def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, # choose the best model using current validation set, prepare for # next task. - if cfg.meta.is_meta and (model_init is not None): - # For meta-learning, start fine-tuning from the pre-computed - # initialization weight. - model.load_state_dict(copy.deepcopy(model_init)) + if cfg.meta.is_meta and (model_meta is not None): + # For meta-learning, start fine-tuning from the meta-model. 
+ model.load_state_dict(copy.deepcopy(model_meta)) + # Internal training loop. for i in tqdm(range(cfg.optim.max_epoch + 1), desc='live update', - leave=True): + leave=False): # Start with the un-trained model (i = 0), evaluate the model. internal_val_perf = evaluate_step(model, datasets[1], (t, t + 1), @@ -295,20 +306,13 @@ def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, model.load_state_dict(best_model['state']) if cfg.meta.is_meta: # update meta-learning's initialization weights. - if model_init is None: # for the first task. - model_init = copy.deepcopy(best_model['state']) + if model_meta is None: # for the first task. + model_meta = copy.deepcopy(best_model['state']) else: # for subsequent task, update init. - if cfg.meta.method == 'moving_average': - new_weight = cfg.meta.alpha - elif cfg.meta.method == 'online_mean': - new_weight = 1 / (t + 1) # for t=1, the second item, 1/2. - else: - raise ValueError(f'Invalid method: {cfg.meta.method}') - - # (1-new_weight)*model_init + new_weight*best_model. - model_init = average_state_dict(model_init, + # (1-alpha)*model_meta + alpha*best_model. + model_meta = average_state_dict(model_meta, best_model['state'], - new_weight) + cfg.meta.alpha) prev_node_states = update_node_states(model, datasets[0], (t, t + 1), prev_node_states) From bc1b0bc4adf0c92b0284e2ca3e2a1031073683d3 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 16:28:42 -0700 Subject: [PATCH 22/66] add roland feature encoder. --- graphgym/contrib/feature_encoder/roland.py | 115 +++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 graphgym/contrib/feature_encoder/roland.py diff --git a/graphgym/contrib/feature_encoder/roland.py b/graphgym/contrib/feature_encoder/roland.py new file mode 100644 index 00000000..6090843e --- /dev/null +++ b/graphgym/contrib/feature_encoder/roland.py @@ -0,0 +1,115 @@ +import deepsnap +import torch +import torch.nn as nn +from graphgym.config import cfg +from graphgym.register import register_edge_encoder, register_node_encoder + + +class TransactionEdgeEncoder(torch.nn.Module): + r"""A module that encodes edge features in the transaction graph. + + Example: + TransactionEdgeEncoder( + (embedding_list): ModuleList( + (0): Embedding(50, 32) # The first integral edge feature has 50 unique values. + # convert this integral feature to 32 dimensional embedding. + (1): Embedding(8, 32) + (2): Embedding(252, 32) + (3): Embedding(252, 32) + ) + (linear_amount): Linear(in_features=1, out_features=64, bias=True) + (linear_time): Linear(in_features=1, out_features=64, bias=True) + ) + + Initial edge feature dimension = 6 + Final edge embedding dimension = 32 + 32 + 32 + 32 + 64 + 64 = 256 + """ + + def __init__(self, emb_dim: int): + # emb_dim is not used here. + super(TransactionEdgeEncoder, self).__init__() + + self.embedding_list = torch.nn.ModuleList() + # Note: feature_edge_int_num[i] = len(torch.unique(graph.edge_feature[:, i])) + # where i-th edge features are integral. + for num in cfg.transaction.feature_edge_int_num: + emb = torch.nn.Embedding(num, cfg.transaction.feature_int_dim) + torch.nn.init.xavier_uniform_(emb.weight.data) + self.embedding_list.append(emb) + + # Embed non-integral features. 
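+        # (With the example configuration in the docstring above, the final
+        #  edge embedding dimension is
+        #  len(feature_edge_int_num) * feature_int_dim
+        #      + feature_amount_dim + feature_time_dim = 4*32 + 64 + 64 = 256.)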
+ self.linear_amount = nn.Linear(1, cfg.transaction.feature_amount_dim) + self.linear_time = nn.Linear(1, cfg.transaction.feature_time_dim) + # update edge_dim + cfg.dataset.edge_dim = len(cfg.transaction.feature_edge_int_num) \ + * cfg.transaction.feature_int_dim \ + + cfg.transaction.feature_amount_dim \ + + cfg.transaction.feature_time_dim + + def forward(self, batch: deepsnap.batch.Batch) -> deepsnap.batch.Batch: + edge_embedding = [] + for i in range(len(self.embedding_list)): + edge_embedding.append( + self.embedding_list[i](batch.edge_feature[:, i].long()) + ) + # By default, edge_feature[:, -2] contains edge amount, + # edge_feature[:, -1] contains edge time. + edge_embedding.append( + self.linear_amount(batch.edge_feature[:, -2].view(-1, 1)) + ) + edge_embedding.append( + self.linear_time(batch.edge_feature[:, -1].view(-1, 1)) + ) + batch.edge_feature = torch.cat(edge_embedding, dim=1) + return batch + + +register_edge_encoder('roland', TransactionEdgeEncoder) + + +class TransactionNodeEncoder(torch.nn.Module): + r"""A module that encodes node features in the transaction graph. + + Parameters: + num_classes - the number of classes for the embedding mapping to learn + + Example: + 3 unique values for the first integral node feature. + 3 unique values for the second integral node feature. + + cfg.transaction.feature_node_int_num = [3, 3] + cfg.transaction.feature_int_dim = 32 + + TransactionNodeEncoder( + (embedding_list): ModuleList( + (0): Embedding(3, 32) # embed the first node feature to 32-dimensional space. + (1): Embedding(3, 32) # embed the second node feature to 32-dimensional space. + ) + ) + + Initial node feature dimension = 2 + Final node embedding dimension = 32 + 32 = 256 + """ + + def __init__(self, emb_dim: int, num_classes=None): + super(TransactionNodeEncoder, self).__init__() + self.embedding_list = torch.nn.ModuleList() + for i, num in enumerate(cfg.transaction.feature_node_int_num): + emb = torch.nn.Embedding(num, cfg.transaction.feature_int_dim) + torch.nn.init.xavier_uniform_(emb.weight.data) + self.embedding_list.append(emb) + # update encoder_dim + cfg.dataset.encoder_dim = len(cfg.transaction.feature_node_int_num) \ + * cfg.transaction.feature_int_dim + + def forward(self, batch: deepsnap.batch.Batch) -> deepsnap.batch.Batch: + node_embedding = [] + for i in range(len(self.embedding_list)): + node_embedding.append( + self.embedding_list[i](batch.node_feature[:, i].long()) + ) + batch.node_feature = torch.cat(node_embedding, dim=1) + return batch + + +register_node_encoder('roland', TransactionNodeEncoder) From 57984c1500a5d56356f9cf518d55731bd60db1bd Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 16:46:23 -0700 Subject: [PATCH 23/66] update config --- graphgym/contrib/config/roland.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/graphgym/contrib/config/roland.py b/graphgym/contrib/config/roland.py index 62d5b3c3..69bc7d27 100644 --- a/graphgym/contrib/config/roland.py +++ b/graphgym/contrib/config/roland.py @@ -30,12 +30,6 @@ def set_cfg_roland(cfg): # Default to False. cfg.meta.is_meta = False - # choose between 'moving_average' and 'online_mean' - cfg.meta.method = 'moving_average' # TODO: remove, only use moving_average. - # For online mean: - # new_mean = (n-1)/n * old_mean + 1/n * new_value. - # where *_mean corresponds to W_init. - # Weight used in moving average for model parameters. # After fine-tuning the model in period t and get model M[t], # Set W_init = (1-alpha) * W_init + alpha * M[t]. 
@@ -55,9 +49,6 @@ def set_cfg_roland(cfg): # Set to -1 if using all snapshots. # cfg.experimental.restrict_training_set = -1 - # Whether to visualize edge attention of GNN layer after training. - cfg.experimental.visualize_gnn_layer = False - cfg.train.tbptt_freq = 5 cfg.train.internal_validation_tolerance = 5 From 4efdd0cf4fb330198eee203506e2d94c92607333 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 16:46:39 -0700 Subject: [PATCH 24/66] Add general LinearEdgeEncoder. --- graphgym/contrib/feature_encoder/roland.py | 31 ++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/graphgym/contrib/feature_encoder/roland.py b/graphgym/contrib/feature_encoder/roland.py index 6090843e..5a3b4898 100644 --- a/graphgym/contrib/feature_encoder/roland.py +++ b/graphgym/contrib/feature_encoder/roland.py @@ -6,7 +6,7 @@ class TransactionEdgeEncoder(torch.nn.Module): - r"""A module that encodes edge features in the transaction graph. + """A module that encodes edge features in the transaction graph. Example: TransactionEdgeEncoder( @@ -68,7 +68,7 @@ def forward(self, batch: deepsnap.batch.Batch) -> deepsnap.batch.Batch: class TransactionNodeEncoder(torch.nn.Module): - r"""A module that encodes node features in the transaction graph. + """A module that encodes node features in the transaction graph. Parameters: num_classes - the number of classes for the embedding mapping to learn @@ -113,3 +113,30 @@ def forward(self, batch: deepsnap.batch.Batch) -> deepsnap.batch.Batch: register_node_encoder('roland', TransactionNodeEncoder) + + +class LinearEdgeEncoder(torch.nn.Module): + """ + Basic edge encoder for temporal graphs, this encoder does not assume edge dim, + this encoder uses linear layers to contract/expand raw edge features to + dimension cfg.transaction.feature_amount_dim + feature_time_dim for consistency. + """ + def __init__(self, emb_dim: int): + # emb_dim is not used here. + super(LinearEdgeEncoder, self).__init__() + # For consistency, for non-transaction datasets with only timestamp, + # we use the feature amount dimension + time dimension to generate + # the same dimension as transaction datasets. + # TODO: change to feature_time_dim only for better naming? + expected_dim = cfg.transaction.feature_amount_dim \ + + cfg.transaction.feature_time_dim + + self.linear = nn.Linear(cfg.dataset.edge_dim, expected_dim) + cfg.dataset.edge_dim = expected_dim + + def forward(self, batch: deepsnap.batch.Batch) -> deepsnap.batch.Batch: + batch.edge_feature = self.linear(batch.edge_feature) + return batch + + +register_edge_encoder('roland_general', LinearEdgeEncoder) From bc8f2c73ef5fa0a00a466ffed85a61036b48fcfc Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 17:00:05 -0700 Subject: [PATCH 25/66] move average_state_dict and precompute_edge_degree_info to utils. 
--- graphgym/contrib/train/train_live_update.py | 59 +++------------------ graphgym/contrib/train/train_utils.py | 54 ++++++++++++++++++- 2 files changed, 58 insertions(+), 55 deletions(-) diff --git a/graphgym/contrib/train/train_live_update.py b/graphgym/contrib/train/train_live_update.py index d54d5ece..60555125 100644 --- a/graphgym/contrib/train/train_live_update.py +++ b/graphgym/contrib/train/train_live_update.py @@ -21,53 +21,6 @@ from tqdm import tqdm -@torch.no_grad() -def average_state_dict(dict1: dict, dict2: dict, weight: float) -> dict: - """ - Average two model.state_dict() objects, - ut = (1-w)*dict1 + w*dict2 - when dict1, dict2 are model_dicts, this method updates the meta-model. - """ - assert 0 <= weight <= 1 - d1 = copy.deepcopy(dict1) - d2 = copy.deepcopy(dict2) - out = dict() - for key in d1.keys(): - assert isinstance(d1[key], torch.Tensor) - param1 = d1[key].detach().clone() - assert isinstance(d2[key], torch.Tensor) - param2 = d2[key].detach().clone() - out[key] = (1 - weight) * param1 + weight * param2 - return out - - -@torch.no_grad() -def precompute_edge_degree_info(dataset: deepsnap.dataset.GraphDataset): - """Pre-computes edge_degree_existing, edge_degree_new and keep ratio - at each snapshot. Inplace modifications. - """ - # Assume all graph snapshots have the same number of nodes. - num_nodes = dataset[0].node_feature.shape[0] - for t in range(len(dataset)): - if t == 0: - # No previous edges for any nodes. - dataset[t].node_degree_existing = torch.zeros(num_nodes) - else: - # degree[ Dict[str, float]: + fast: bool = False) -> Dict[str, float]: """ Evaluate model's performance on task = (today, tomorrow) where today and tomorrow are integers indexing snapshots. @@ -195,7 +148,7 @@ def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, for dataset in datasets: # Sometimes edge degree info is already included in dataset. if not hasattr(dataset[0], 'keep_ratio'): - precompute_edge_degree_info(dataset) + train_utils.precompute_edge_degree_info(dataset) # if cfg.dataset.premade_datasets == 'fresh_save_cache': # if not os.path.exists(f'{cfg.dataset.dir}/cache/'): @@ -310,9 +263,9 @@ def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, model_meta = copy.deepcopy(best_model['state']) else: # for subsequent task, update init. # (1-alpha)*model_meta + alpha*best_model. - model_meta = average_state_dict(model_meta, - best_model['state'], - cfg.meta.alpha) + model_meta = train_utils.average_state_dict(model_meta, + best_model['state'], + cfg.meta.alpha) prev_node_states = update_node_states(model, datasets[0], (t, t + 1), prev_node_states) diff --git a/graphgym/contrib/train/train_utils.py b/graphgym/contrib/train/train_utils.py index dad12ff7..6caf7ffd 100644 --- a/graphgym/contrib/train/train_utils.py +++ b/graphgym/contrib/train/train_utils.py @@ -1,14 +1,37 @@ """ Metrics, other utility, and helper functions. """ +# TODO: proof-read this file. +# TODO: remove comments. +import copy + import deepsnap import numpy as np import torch from graphgym.config import cfg from graphgym.loss import compute_loss +from graphgym.utils.stats import node_degree from torch_scatter import scatter_max, scatter_mean, scatter_min -# TODO: proof-read this file. -# TODO: remove comments. + + +@torch.no_grad() +def average_state_dict(dict1: dict, dict2: dict, weight: float) -> dict: + """ + Average two model.state_dict() objects, + ut = (1-w)*dict1 + w*dict2 + when dict1, dict2 are model_dicts, this method updates the meta-model. 
+ """ + assert 0 <= weight <= 1 + d1 = copy.deepcopy(dict1) + d2 = copy.deepcopy(dict2) + out = dict() + for key in d1.keys(): + assert isinstance(d1[key], torch.Tensor) + param1 = d1[key].detach().clone() + assert isinstance(d2[key], torch.Tensor) + param2 = d2[key].detach().clone() + out[key] = (1 - weight) * param1 + weight * param2 + return out def get_keep_ratio(existing: torch.Tensor, @@ -52,6 +75,33 @@ def get_keep_ratio(existing: torch.Tensor, return ratio +@torch.no_grad() +def precompute_edge_degree_info(dataset: deepsnap.dataset.GraphDataset): + """Pre-computes edge_degree_existing, edge_degree_new and keep ratio + at each snapshot. Inplace modifications. + """ + # Assume all graph snapshots have the same number of nodes. + num_nodes = dataset[0].node_feature.shape[0] + for t in range(len(dataset)): + if t == 0: + # No previous edges for any nodes. + dataset[t].node_degree_existing = torch.zeros(num_nodes) + else: + # degree[ int: """Computes how much memory a batch has consumed.""" total_byte = 0 From 7bba9d776df5901e6433d0748e76daacb4adf627 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 17:36:20 -0700 Subject: [PATCH 26/66] add training pipeline. --- .../contrib/train/train_live_update_bptt.py | 269 ++++++++++++++++++ 1 file changed, 269 insertions(+) create mode 100644 graphgym/contrib/train/train_live_update_bptt.py diff --git a/graphgym/contrib/train/train_live_update_bptt.py b/graphgym/contrib/train/train_live_update_bptt.py new file mode 100644 index 00000000..e07b8d95 --- /dev/null +++ b/graphgym/contrib/train/train_live_update_bptt.py @@ -0,0 +1,269 @@ +""" +The baseline training (non-incremental) training for live-update scheme. +NOTE: this setup requires extensive GPU memory and could lead to OOM error. +""" +import copy +import datetime +import logging +import os +from typing import Dict, List, Optional, Tuple + +import deepsnap +import numpy as np +import torch +from graphgym.checkpoint import clean_ckpt +from graphgym.config import cfg +from graphgym.contrib.train import train_utils +from graphgym.loss import compute_loss +from graphgym.optimizer import create_optimizer, create_scheduler +from graphgym.register import register_train +from graphgym.utils.io import makedirs_rm_exist +from graphgym.utils.stats import node_degree +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + + +@torch.no_grad() +def get_task_batch(dataset: deepsnap.dataset.GraphDataset, + today: int, tomorrow: int, + prev_node_states: Optional[Dict[str, List[torch.Tensor]]] + ) -> deepsnap.graph.Graph: + """ + Construct batch required for the task (today, tomorrow). As defined in + batch's get_item method (used to get edge_label and get_label_index), + edge_label and edge_label_index returned would be different everytime + get_task_batch() is called. + + Moreover, copy node-memories (node_states and node_cells) to the batch. + """ + assert today < tomorrow < len(dataset) + # Get edges for message passing and prediction task. + batch = dataset[today].clone() + batch.edge_label = dataset[tomorrow].edge_label.clone() + batch.edge_label_index = dataset[tomorrow].edge_label_index.clone() + + # Copy previous memory to the batch. 
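+    # (The copies below are detached, so no gradient ever flows across
+    #  snapshot boundaries through the stored node memories.)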
+ if prev_node_states is not None: + for key, val in prev_node_states.items(): + copied = [x.detach().clone() for x in val] + setattr(batch, key, copied) + + batch = train_utils.move_batch_to_device(batch, cfg.device) + return batch + + +@torch.no_grad() +def update_node_states(model, dataset, task: Tuple[int, int], + prev_node_states: Optional[ + Dict[str, List[torch.Tensor]]] + ) -> Dict[str, List[torch.Tensor]]: + """Perform the provided task and keep track of the latest node_states. + + Example: task = (t, t+1), + the prev_node_states contains node embeddings at time (t-1). + the model perform task (t, t+1): + Input: (node embedding at t - 1, edges at t). + Output: possible transactions at t+1. + the model also generates node embeddings at t. + + after doing task (t, t+1), node_states contains information + from snapshot t. + """ + today, tomorrow = task + batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() + # Let the model modify batch.node_states (and batch.node_cells). + _, _ = model(batch) + # Collect the updated node states. + out = dict() + out['node_states'] = [x.detach().clone() for x in batch.node_states] + if isinstance(batch.node_cells[0], torch.Tensor): + out['node_cells'] = [x.detach().clone() for x in batch.node_cells] + + return out + + +def train_step(model, optimizer, scheduler, dataset, + task: Tuple[int, int]) -> dict: + """ + After receiving ground truth from a particular task, update the model by + performing back-propagation. + For example, on day t, the ground truth of task (t-1, t) has been revealed, + train the model using G[t-1] for message passing and label[t] as target. + """ + optimizer.zero_grad() + torch.cuda.empty_cache() + model.train() + + today, _ = task + + # get loss over time. + total_loss_over_time = torch.tensor(0.0).to(torch.device(cfg.device)) + # iterate from the beginning to compute node_states. + for t in range(today + 1): # (0, 1), (1, 2), ..., (today, today+1). + # perform task (t, t+1), use information up to tomorrow. + new_batch = get_task_batch(dataset, t, t + 1, None).clone() + if t > 0: # manually inherit node states and node cells for LSTM. + new_batch.node_states = batch.node_states + new_batch.node_cells = batch.node_cells + batch = new_batch + pred, true = model(batch) + loss, _ = compute_loss(pred, true) + if t > today - cfg.train.tbptt_freq: + # Perform the truncated version, only accumulate loss for recent + # snapshots. + total_loss_over_time += loss + # get average loss over time. + total_loss_over_time /= (today + 1) + # perform back-prop through time. + total_loss_over_time.backward() + optimizer.step() + + scheduler.step() + return {'loss': total_loss_over_time} + + +@torch.no_grad() +def evaluate_step(model, dataset, task: Tuple[int, int], fast: bool = False + ) -> dict: + """ + Evaluate model's performance on task = (today, tomorrow) + where today and tomorrow are integers indexing snapshots. + """ + today, tomorrow = task + model.eval() + + # Run forward pass to get the latest node states. + for t in range(today): # (0, 1), (1, 2), ...(today-1, today) + # Iterative through snapshots in the past, up to (today-1, today) + new_batch = get_task_batch(dataset, t, t + 1, None).clone() + if t > 0: + new_batch.node_states = batch.node_states + new_batch.node_cells = batch.node_cells + batch = new_batch + # forward pass to update node_states in batch. + _, _ = model(batch) + + # Evaluation. 
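+    # (Node states are not cached across calls in this baseline: the loop
+    #  above replays snapshots 0..today-1 to rebuild them on every call,
+    #  which is what makes this non-incremental setup slow and memory-heavy.)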
+ # (today, today+1) + cur_batch = get_task_batch(dataset, today, tomorrow, None).clone() + if today > 0: + cur_batch.node_states = copy.deepcopy(batch.node_states) + cur_batch.node_cells = copy.deepcopy(batch.node_cells) + + pred, true = model(cur_batch) + loss, _ = compute_loss(pred, true) + + if fast: + # skip MRR calculation for internal validation. + return {'loss': loss.item()} + + mrr_batch = get_task_batch(dataset, today, tomorrow, None).clone() + if today > 0: + mrr_batch.node_states = copy.deepcopy(batch.node_states) + mrr_batch.node_cells = copy.deepcopy(batch.node_cells) + + mrr = train_utils.compute_MRR( + mrr_batch, + model, + num_neg_per_node=cfg.metric.mrr_num_negative_edges, + method=cfg.metric.mrr_method) + + return {'loss': loss.item(), 'mrr': mrr} + + +def train_live_update_bptt(loggers, loaders, model, optimizer, scheduler, datasets, + **kwargs): + for dataset in datasets: + # Sometimes edge degree info is already included in dataset. + if not hasattr(dataset[0], 'keep_ratio'): + train_utils.precompute_edge_degree_info(dataset) + + num_splits = len(loggers) # train/val/test splits. + # range for today in (today, tomorrow) task pairs. + task_range = range(len(datasets[0]) - cfg.transaction.horizon) + + t = datetime.datetime.now().strftime('%b%d_%H-%M-%S') + + # directory to store tensorboard files of this run. + out_dir = cfg.out_dir.replace('/', '\\') + # dir to store all run outputs for the entire batch. + run_dir = 'runs_' + cfg.remark + + print(f'Tensorboard directory: {out_dir}') + # If tensorboard directory exists, this config is in the re-run phase + # of run_batch, replace logs of previous runs with the new one. + makedirs_rm_exist(f'./{run_dir}/{out_dir}') + writer = SummaryWriter(f'./{run_dir}/{out_dir}') + + # save a copy of configuration for later identifications. + with open(f'./{run_dir}/{out_dir}/config.yaml', 'w') as f: + cfg.dump(stream=f) + + for t in tqdm(task_range, desc='Snapshot'): + # current task: t --> t+1. + # (1) Evaluate model's performance on this task, at this time, the + # model has seen no information on t+1, this evaluation is fair. + for i in range(1, num_splits): + perf = evaluate_step(model, datasets[i], (t, t + 1), fast=False) + + writer.add_scalars('val' if i == 1 else 'test', perf, t) + + # (2) Reveal the ground truth of task (t, t+1) and update the model + # to prepare for the next task. + del optimizer, scheduler # use new optimizers. + optimizer = create_optimizer(model.parameters()) + scheduler = create_scheduler(optimizer) + + # best model's validation loss, training epochs, and state_dict. + best_model = {'val_loss': np.inf, 'train_epoch': 0, 'state': None} + # keep track of how long we have NOT update the best model. + best_model_unchanged = 0 + # after not updating the best model for `tol` epochs, stop. + tol = cfg.train.internal_validation_tolerance + + # internal training loop (intra-snapshot cross-validation). + # choose the best model using current validation set, prepare for + # next task. + + for i in tqdm(range(cfg.optim.max_epoch + 1), desc='live update', + leave=False): + # Start with the un-trained model (i = 0), evaluate the model. + internal_val_perf = evaluate_step(model, datasets[1], + (t, t + 1), fast=True) + val_loss = internal_val_perf['loss'] + + if val_loss < best_model['val_loss']: + # replace the best model with the current model. 
+ best_model = {'val_loss': val_loss, 'train_epoch': i, + 'state': copy.deepcopy(model.state_dict())} + best_model_unchanged = 0 + else: + # the current best model has dominated for these epochs. + best_model_unchanged += 1 + + if best_model_unchanged >= tol: + # If the best model has not been updated for a while, stop. + break + else: + # Otherwise, keep training. + train_perf = train_step(model, optimizer, scheduler, + datasets[0], (t, t + 1)) + writer.add_scalars('train', train_perf, t) + + writer.add_scalar('internal_best_val', best_model['val_loss'], t) + writer.add_scalar('best epoch', best_model['train_epoch'], t) + + # (3) Actually perform the update on training set to get node_states + # contains information up to time t. + # Use the best model selected from intra-snapshot cross-validation. + model.load_state_dict(best_model['state']) + + writer.close() + + if cfg.train.ckpt_clean: + clean_ckpt() + + logging.info('Task done, results saved in {}'.format(cfg.out_dir)) + + +register_train('live_update_baseline', train_live_update_bptt) From 0a87698962c7088f683677fe0608c14495e99e54 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 17:51:16 -0700 Subject: [PATCH 27/66] add pipeline --- .../train/train_live_update_fixed_split.py | 281 ++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 graphgym/contrib/train/train_live_update_fixed_split.py diff --git a/graphgym/contrib/train/train_live_update_fixed_split.py b/graphgym/contrib/train/train_live_update_fixed_split.py new file mode 100644 index 00000000..cde96628 --- /dev/null +++ b/graphgym/contrib/train/train_live_update_fixed_split.py @@ -0,0 +1,281 @@ +""" +A pipeline training model using live-update scheme but only evaluates the model +using the last 10% of snapshots, which is the same as conventional chronological +data splitting method. +""" +import copy +import datetime +import logging +import os +from typing import Dict, List, Optional, Tuple + +import deepsnap +import numpy as np +import torch +from graphgym.checkpoint import clean_ckpt +from graphgym.config import cfg +from graphgym.contrib.train import train_utils +from graphgym.loss import compute_loss +from graphgym.optimizer import create_optimizer, create_scheduler +from graphgym.register import register_train +from graphgym.utils.io import makedirs_rm_exist +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + + +@torch.no_grad() +def get_task_batch(dataset: deepsnap.dataset.GraphDataset, + today: int, tomorrow: int, + prev_node_states: Optional[Dict[str, List[torch.Tensor]]] + ) -> deepsnap.graph.Graph: + """ + Construct batch required for the task (today, tomorrow). As defined in + batch's get_item method (used to get edge_label and get_label_index), + edge_label and edge_label_index returned would be different everytime + get_task_batch() is called. + + Moreover, copy node-memories (node_states and node_cells) to the batch. + """ + assert today < tomorrow < len(dataset) + # Get edges for message passing and prediction task. + batch = dataset[today].clone() + batch.edge_label = dataset[tomorrow].edge_label.clone() + batch.edge_label_index = dataset[tomorrow].edge_label_index.clone() + + # Copy previous memory to the batch. 
+ if prev_node_states is not None: + for key, val in prev_node_states.items(): + copied = [x.detach().clone() for x in val] + setattr(batch, key, copied) + + batch = train_utils.move_batch_to_device(batch, cfg.device) + return batch + + +@torch.no_grad() +def update_node_states(model, dataset, task: Tuple[int, int], + prev_node_states: Optional[ + Dict[str, List[torch.Tensor]]] + ) -> Dict[str, List[torch.Tensor]]: + """Perform the provided task and keep track of the latest node_states. + + Example: task = (t, t+1), + the prev_node_states contains node embeddings at time (t-1). + the model perform task (t, t+1): + Input: (node embedding at t - 1, edges at t). + Output: possible transactions at t+1. + the model also generates node embeddings at t. + + after doing task (t, t+1), node_states contains information + from snapshot t. + """ + today, tomorrow = task + batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() + # Let the model modify batch.node_states (and batch.node_cells). + _, _ = model(batch) + # Collect the updated node states. + out = dict() + out['node_states'] = [x.detach().clone() for x in batch.node_states] + if isinstance(batch.node_cells[0], torch.Tensor): + out['node_cells'] = [x.detach().clone() for x in batch.node_cells] + + return out + + +def train_step(model, optimizer, scheduler, dataset, + task: Tuple[int, int], + prev_node_states: Optional[Dict[str, torch.Tensor]] + ) -> dict: + """ + After receiving ground truth from a particular task, update the model by + performing back-propagation. + For example, on day t, the ground truth of task (t-1, t) has been revealed, + train the model using G[t-1] for message passing and label[t] as target. + """ + optimizer.zero_grad() + torch.cuda.empty_cache() + + today, tomorrow = task + model.train() + batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() + + pred, true = model(batch) + loss, pred_score = compute_loss(pred, true) + loss.backward() + optimizer.step() + + scheduler.step() + return {'loss': loss} + + +@torch.no_grad() +def evaluate_step(model, dataset, task: Tuple[int, int], + prev_node_states: Optional[Dict[str, List[torch.Tensor]]], + fast: bool = False) -> dict: + """ + Evaluate model's performance on task = (today, tomorrow) + where today and tomorrow are integers indexing snapshots. + """ + today, tomorrow = task + model.eval() + batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() + + pred, true = model(batch) + loss, pred_score = compute_loss(pred, true) + + if fast: + # skip MRR calculation for internal validation. + return {'loss': loss.item()} + + mrr_batch = get_task_batch(dataset, today, tomorrow, + prev_node_states).clone() + + mrr = train_utils.compute_MRR(mrr_batch, model, -1, 'all') + + return {'loss': loss.item(), 'mrr': mrr} + + +def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, + **kwargs): + + for dataset in datasets: + # Sometimes edge degree info is already included in dataset. 
+ if not hasattr(dataset[0], 'keep_ratio'): + train_utils.precompute_edge_degree_info(dataset) + + if cfg.dataset.premade_datasets == 'fresh_save_cache': + if not os.path.exists(f'{cfg.dataset.dir}/cache/'): + os.mkdir(f'{cfg.dataset.dir}/cache/') + cache_path = '{}/cache/cached_datasets_{}_{}_{}.pt'.format( + cfg.dataset.dir, cfg.dataset.format.replace('.tsv', ''), + cfg.transaction.snapshot_freq, + datetime.now().strftime('%Y_%m_%d__%H_%M_%S') + ) + torch.save(datasets, cache_path) + + num_splits = len(loggers) # train/val/test splits. + # range for today in (today, tomorrow) task pairs. + task_range = range(len(datasets[0]) - cfg.transaction.horizon) + + t = datetime.datetime.now().strftime('%b%d_%H-%M-%S') + + # directory to store tensorboard files of this run. + out_dir = cfg.out_dir.replace('/', '\\') + # dir to store all run outputs for the entire batch. + run_dir = 'runs_' + cfg.remark + + print(f'Tensorboard directory: {out_dir}') + # If tensorboard directory exists, this config is in the re-run phase + # of run_batch, replace logs of previous runs with the new one. + makedirs_rm_exist(f'./{run_dir}/{out_dir}') + writer = SummaryWriter(f'./{run_dir}/{out_dir}') + + # save a copy of configuration for later identifications. + with open(f'./{run_dir}/{out_dir}/config.yaml', 'w') as f: + cfg.dump(stream=f) + + prev_node_states = None # no previous state on day 0. + # {'node_states': [Tensor, Tensor], 'node_cells: [Tensor, Tensor]} + + model_init = None # for meta-learning only, a model.state_dict() object. + for t in tqdm(task_range, desc='snapshot', leave=True): + # current task: t --> t+1. + # (1) Evaluate model's performance on this task, at this time, the + # model has seen no information on t+1, this evaluation is fair. + # Only evaluate the performance within the test set split region. + # Test snapshots are indexed [cfg.train.start_compute_mrr, end]. + perf = evaluate_step(model, datasets[2], (t, t + 1), + prev_node_states, fast=t < cfg.train.start_compute_mrr) + + writer.add_scalars('test', perf, t) + + # (2) Reveal the ground truth of task (t, t+1) and update the model + # to prepare for the next task. + del optimizer, scheduler # use new optimizers. + optimizer = create_optimizer(model.parameters()) + scheduler = create_scheduler(optimizer) + + # best model's validation loss, training epochs, and state_dict. + # The untrained model is the default best model. + best_model = {'val_loss': np.inf, 'train_epoch': 0, + 'state': copy.deepcopy(model.state_dict())} + # keep track of how long we have NOT update the best model. + best_model_unchanged = 0 + # after not updating the best model for `tol` epochs, stop. + tol = cfg.train.internal_validation_tolerance + + # internal training loop (intra-snapshot cross-validation). + # choose the best model using current validation set, prepare for + # next task. + + if cfg.meta.is_meta and (model_init is not None): + # For meta-learning, start fine-tuning from the pre-computed + # initialization weight. + model.load_state_dict(copy.deepcopy(model_init)) + + for i in tqdm(range(cfg.optim.max_epoch + 1), desc='live update', + leave=True): + # Start with the un-trained model (i = 0), evaluate the model. + internal_val_perf = evaluate_step(model, datasets[1], + (t, t + 1), + prev_node_states, fast=True) + val_loss = internal_val_perf['loss'] + + if val_loss < best_model['val_loss']: + # replace the best model with the current model. 
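+                # (Keep a deepcopy of the weights and reset the counter so the
+                #  tolerance check below starts counting from zero again.)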
+ best_model = {'val_loss': val_loss, 'train_epoch': i, + 'state': copy.deepcopy(model.state_dict())} + best_model_unchanged = 0 + else: + # the current best model has dominated for these epochs. + best_model_unchanged += 1 + + # if (i >= 2 * tol) and (best_model_unchanged >= tol): + if best_model_unchanged >= tol: + # If the best model has not been updated for a while, stop. + break + else: + # Otherwise, keep training. + train_perf = train_step(model, optimizer, scheduler, + datasets[0], (t, t + 1), + prev_node_states) + writer.add_scalars('train', train_perf, t) + + writer.add_scalar('internal_best_val', best_model['val_loss'], t) + writer.add_scalar('best epoch', best_model['train_epoch'], t) + + # (3) Actually perform the update on training set to get node_states + # contains information up to time t. + # Use the best model selected from intra-snapshot cross-validation. + # if best_model['state'] is None: + # breakpoint() + model.load_state_dict(best_model['state']) + + if cfg.meta.is_meta: # update meta-learning's initialization weights. + if model_init is None: # for the first task. + model_init = copy.deepcopy(best_model['state']) + else: # for subsequent task, update init. + if cfg.meta.method == 'moving_average': + new_weight = cfg.meta.alpha + elif cfg.meta.method == 'online_mean': + new_weight = 1 / (t + 1) # for t=1, the second item, 1/2. + else: + raise ValueError(f'Invalid method: {cfg.meta.method}') + + # (1-new_weight)*model_init + new_weight*best_model. + model_init = train_utils.average_state_dict(model_init, + best_model['state'], + new_weight) + + prev_node_states = update_node_states(model, datasets[0], (t, t + 1), + prev_node_states) + + writer.close() + + if cfg.train.ckpt_clean: + clean_ckpt() + + logging.info('Task done, results saved in {}'.format(cfg.out_dir)) + + +register_train('live_update_fixed_split', train_live_update) From 34cc91d4ac5ebb1f83c951c6708b79f6b22ca405 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 18:15:27 -0700 Subject: [PATCH 28/66] add --- .../contrib/head/head_large_prediction.py | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 graphgym/contrib/head/head_large_prediction.py diff --git a/graphgym/contrib/head/head_large_prediction.py b/graphgym/contrib/head/head_large_prediction.py new file mode 100644 index 00000000..3b7e99c9 --- /dev/null +++ b/graphgym/contrib/head/head_large_prediction.py @@ -0,0 +1,109 @@ +""" +An improved version of graphgym.models.head.GNNEdgeHead. This head handles +large link prediction tasks by splitting them into chunks to avoid OOM errors. +This is particular useful for computing MRR when a large amount of memory is +needed. + +(Not implemented yet) Alternatively, one may implement head for MRR by all +prediction task to CPU, by doing so, we need sepearate heads for training and +inference (training requires everything including head to be on GPU). +""" +import torch +import torch.nn as nn +from graphgym.config import cfg +from graphgym.models.layer import MLP +from graphgym.register import register_head + + +class LargeGNNEdgeHead(nn.Module): + def __init__(self, dim_in: int, dim_out: int): + # Use dim_in for graph conv, since link prediction dim_out could be + # binary + # E.g. if decoder='dot', link probability is dot product between + # node embeddings, of dimension dim_in + super(LargeGNNEdgeHead, self).__init__() + # module to decode edges from node embeddings + + if cfg.model.edge_decoding == 'concat': + # Only use node features. 
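+            # (The MLP input is dim_in * 2 because the source and destination
+            #  node embeddings are concatenated; the 'edgeconcat' branch below
+            #  additionally appends the raw edge feature, hence
+            #  dim_in * 2 + cfg.dataset.edge_dim.)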
+ self.layer_post_mp = MLP(dim_in * 2, dim_out, + num_layers=cfg.gnn.layers_post_mp, + bias=True) + # requires parameter + self.decode_module = lambda v1, v2: \ + self.layer_post_mp(torch.cat((v1, v2), dim=-1)) + elif cfg.model.edge_decoding == 'edgeconcat': + # Use both node and edge features. + self.layer_post_mp = MLP(dim_in * 2 + cfg.dataset.edge_dim, dim_out, + num_layers=cfg.gnn.layers_post_mp, + bias=True) + # requires parameter + self.decode_module = lambda v1, v2, edge: \ + self.layer_post_mp(torch.cat((v1, v2, edge), dim=-1)) + else: + if dim_out > 1: + raise ValueError( + 'Binary edge decoding ({})is used for multi-class ' + 'edge/link prediction.'.format(cfg.model.edge_decoding)) + self.layer_post_mp = MLP(dim_in, dim_in, + num_layers=cfg.gnn.layers_post_mp, + bias=True) + if cfg.model.edge_decoding == 'dot': + self.decode_module = lambda v1, v2: torch.sum(v1 * v2, dim=-1) + elif cfg.model.edge_decoding == 'cosine_similarity': + self.decode_module = nn.CosineSimilarity(dim=-1) + else: + raise ValueError('Unknown edge decoding {}.'.format( + cfg.model.edge_decoding)) + + def _apply_index(self, batch): + return batch.node_feature[batch.edge_label_index], \ + batch.edge_label + + def forward_pred(self, batch): + # TODO: consider moving this to config. + predict_batch_size = 500000 # depends on GPU memroy size. + num_pred = len(batch.edge_label) + label = batch.edge_label + if num_pred >= predict_batch_size: + # for large prediction tasks, split into chunks. + num_chunks = num_pred // predict_batch_size + 1 + edge_label_index_chunks = torch.chunk( + batch.edge_label_index, num_chunks, dim=1) + gathered_pred = list() + + for edge_label_index in edge_label_index_chunks: + pred = batch.node_feature[edge_label_index] + # node features of the source node of each edge. + nodes_first = pred[0] + nodes_second = pred[1] + if cfg.model.edge_decoding == 'edgeconcat': + raise NotImplementedError + else: + pred = self.decode_module(nodes_first, nodes_second) + gathered_pred.append(pred) + + pred = torch.cat(gathered_pred) + else: + pred, label = self._apply_index(batch) + # node features of the source node of each edge. + nodes_first = pred[0] + nodes_second = pred[1] + if cfg.model.edge_decoding == 'edgeconcat': + edge_feature = torch.index_select( + batch.edge_feature, 0, batch.edge_split_index) + pred = self.decode_module( + nodes_first, nodes_second, edge_feature) + else: + pred = self.decode_module(nodes_first, nodes_second) + return pred, label + + def forward(self, batch): + if cfg.model.edge_decoding != 'concat' and \ + cfg.model.edge_decoding != 'edgeconcat': + batch = self.layer_post_mp(batch) + pred, label = self.forward_pred(batch) + return pred, label + + +register_head('link_pred_large', LargeGNNEdgeHead) From 4826e76b5f7e9810aa9ea0370e7fb58f9ac17f4b Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 18:56:47 -0700 Subject: [PATCH 29/66] update config file. --- graphgym/contrib/config/roland.py | 106 +++++++++++++++--------------- 1 file changed, 52 insertions(+), 54 deletions(-) diff --git a/graphgym/contrib/config/roland.py b/graphgym/contrib/config/roland.py index 69bc7d27..58a32bac 100644 --- a/graphgym/contrib/config/roland.py +++ b/graphgym/contrib/config/roland.py @@ -4,30 +4,43 @@ def set_cfg_roland(cfg): - r''' + """ This function sets the default config value for customized options :return: customized configuration use by the experiment. 
- ''' + """ # ----------------------------------------------------------------------- # # Customized options # ----------------------------------------------------------------------- # - # TODO: add documentation. + + # Use to identify experiments, tensorboard will be saved to this path. + # Options: any string. + cfg.remark = '' + + # ----------------------------------------------------------------------- # + # Additional GNN options. + # ----------------------------------------------------------------------- # # Method to update node embedding from old node embedding and new node features. - # Options: 'moving_average', 'masked_gru', 'gru' - # moving average: new embedding = r * old + (1-r) * node_feature. - # gru: new embedding = GRU(node_feature, old_embedding). - # masked_gru: only apply GRU to active nodes. + # Options: {'moving_average', 'mlp', 'gru'} cfg.gnn.embed_update_method = 'moving_average' - # how many layers to use in the MLP updater. - # default: 1, use a simple linear layer. + # How many layers to use in the MLP updater. + # Options: integers >= 1. + # NOTE: there is a known issue when set to 1, use >= 2 for now. + # Only effective when cfg.gnn.embed_update_method == 'mlp'. cfg.gnn.mlp_update_layers = 2 - + + # What kind of skip-connection to use. + # Options: {'none', 'identity', 'affine'}. + cfg.gnn.skip_connection = 'none' + + # ----------------------------------------------------------------------- # + # Meta-Learning options. + # ----------------------------------------------------------------------- # # For meta-learning. cfg.meta = CN() # Whether to do meta-learning via initialization moving average. - # Default to False. + # Options: {True, False} cfg.meta.is_meta = False # Weight used in moving average for model parameters. @@ -35,36 +48,36 @@ def set_cfg_roland(cfg): # Set W_init = (1-alpha) * W_init + alpha * M[t]. # For the next period, use W_init as the initialization for fine-tune # Set cfg.meta.alpha = 1.0 to recover the original algorithm. + # Options: float between 0.0 and 1.0. cfg.meta.alpha = 0.9 - # Use to identify experiments. - cfg.remark = '' - # Experimental Features, use this name space to save all controls for - # experimental features. - # TODO: consider remove experiment field. - # cfg.experimental = CN() - - # Only use the first n snapshots (time periods) to train the model. - # Empirically, the model learns rich dynamics from only a few periods. - # Set to -1 if using all snapshots. - # cfg.experimental.restrict_training_set = -1 - - cfg.train.tbptt_freq = 5 + # ----------------------------------------------------------------------- # + # Additional GNN options. + # ----------------------------------------------------------------------- # + # How many snapshots for the truncated back-propagation. + # Set to a very large integer to use full-back-prop-through-time + # Options: integers >= 1. + cfg.train.tbptt_freq = 10 + # Early stopping tolerance in live-update. + # Options: integers >= 1. cfg.train.internal_validation_tolerance = 5 # Computing MRR is slow in the baseline setting. # Only start to compute MRR in the test set range after certain time. + # Options: integers >= 0. cfg.train.start_compute_mrr = 0 - - # How to handle node features in AS dataset. - # available: ['one', 'one_hot_id', 'one_hot_degree_global', 'one_hot_degree_local'] - cfg.dataset.AS_node_feature = 'one' # ----------------------------------------------------------------------- # - # Additional dataset option for the BSI dataset. + # Additional dataset options. 
# ----------------------------------------------------------------------- # + + # How to handle node features in AS-733 dataset. + # Options: ['one', 'one_hot_id', 'one_hot_degree_global'] + cfg.dataset.AS_node_feature = 'one' + # Method used to sample negative edges for edge_label_index. + # Options: # 'uniform': all non-existing edges have same probability of being sampled # as negative edges. # 'src': non-existing edges from high-degree nodes are more likely to be @@ -73,10 +86,13 @@ def set_cfg_roland(cfg): # to be sampled as negative edges. cfg.dataset.negative_sample_weight = 'uniform' - # whether to load heterogeneous graphs. + # Whether to load dataset as heterogeneous graphs. + # Options: {True, False}. cfg.dataset.is_hetero = False - # where to put type information. 'append' or 'graph_attribute'. + # Where to put type information. + # Options: {'append', 'graph_attribute'}. + # Only effective if cfg.dataset.is_hetero == True. cfg.dataset.type_info_loc = 'append' # whether to look for and load cached graph. By default (load_cache=False) @@ -93,9 +109,8 @@ def set_cfg_roland(cfg): # are for validation and the last 10% snapshots are for testing. cfg.dataset.split_method = 'default' - cfg.gnn.skip_connection = 'none' # {'none', 'identity', 'affine'} # ----------------------------------------------------------------------- # - # Customized options + # Customized options: `transaction` for ROLAND dynamic graphs. # ----------------------------------------------------------------------- # # example argument group @@ -118,7 +133,6 @@ def set_cfg_roland(cfg): # full or rolling cfg.transaction.history = 'full' - # type of loss: supervised / meta cfg.transaction.loss = 'meta' @@ -156,6 +170,10 @@ def set_cfg_roland(cfg): # and its degree in snapshot t. cfg.transaction.keep_ratio = 'linear' + # ----------------------------------------------------------------------- # + # Customized options: metrics. + # ----------------------------------------------------------------------- # + cfg.metric = CN() # How many negative edges for each node to compute rank-based evaluation # metrics such as MRR and recall at K. @@ -174,25 +192,5 @@ def set_cfg_roland(cfg): # expected MRR(min) <= MRR(mean) <= MRR(max). cfg.metric.mrr_method = 'max' - # TODO: consider remove link_pred_spec field. - # Specs for the link prediction task using BSI dataset. - # All units are days. - # cfg.link_pred_spec = CN() - - # The period of `today`'s increase: how often the system is making forecast. - # E.g., when = 1, - # the system forecasts transactions in upcoming 7 days for everyday. - # One training epoch loops over - # {Jan-1-2020, Jan-2-2020, Jan-3-2020..., Dec-31-2020} - # When = 7, the system makes prediction every week. - # E.g., the system forecasts transactions in upcoming 7 days - # on every Monday. - # cfg.link_pred_spec.forecast_frequency = 1 - - # How many days into the future the model is trained to predict. - # The model forecasts transactions in (today, today + forecast_horizon]. - # NOTE: forecast_horizon should >= forecast_frequency to cover all days. - # cfg.link_pred_spec.forecast_horizon = 7 - register_config('roland', set_cfg_roland) From 22db15d28b2594251b09cab68dd4f0b7b3a6140c Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 18:57:17 -0700 Subject: [PATCH 30/66] remove unused 'one_hot_degree_local' option. 
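The option was unused; the AS-733 loader now accepts only 'one', 'one_hot_id',
and 'one_hot_degree_global' for cfg.dataset.AS_node_feature, matching the
option list documented in graphgym/contrib/config/roland.py.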
--- graphgym/contrib/loader/roland_as.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/graphgym/contrib/loader/roland_as.py b/graphgym/contrib/loader/roland_as.py index 4cab81ad..bcf3b7a1 100644 --- a/graphgym/contrib/loader/roland_as.py +++ b/graphgym/contrib/loader/roland_as.py @@ -114,8 +114,7 @@ def load_generic_dataset(format, name, dataset_dir): scaled_edge_time = 2 * (edge_time.clone() - base) / scale assert cfg.dataset.AS_node_feature in ['one', 'one_hot_id', - 'one_hot_degree_global', - 'one_hot_degree_local'] + 'one_hot_degree_global'] if cfg.dataset.AS_node_feature == 'one': node_feature = torch.ones(num_nodes, 1) From a40de8651c2c7a473189a9d14657e717bef21c2f Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 18:59:58 -0700 Subject: [PATCH 31/66] copy stats utility from GraphGym_dev --- graphgym/utils/stats.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 graphgym/utils/stats.py diff --git a/graphgym/utils/stats.py b/graphgym/utils/stats.py new file mode 100644 index 00000000..f8bb0e29 --- /dev/null +++ b/graphgym/utils/stats.py @@ -0,0 +1,26 @@ +import torch + + +def node_degree(edge_index, n=None, mode='in'): + if mode == 'in': + index = edge_index[0, :] + elif mode == 'out': + index = edge_index[1, :] + else: + index = edge_index.flatten() + n = edge_index.max() + 1 if n is None else n + degree = torch.zeros(n) + ones = torch.ones(index.shape[0]) + return degree.scatter_add_(0, index, ones) + + + + + + + +# edge_index = torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5]]) + +# print(compute_degree(edge_index, mode='in')) +# print(compute_degree(edge_index, mode='out')) +# print(compute_degree(edge_index, mode='both')) From 11162c4187e40e005bd7b6149d907a56533d5b8c Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 19:24:43 -0700 Subject: [PATCH 32/66] add loader template for dynamic graphs. --- graphgym/contrib/loader/roland_template.py | 62 ++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 graphgym/contrib/loader/roland_template.py diff --git a/graphgym/contrib/loader/roland_template.py b/graphgym/contrib/loader/roland_template.py new file mode 100644 index 00000000..901cf329 --- /dev/null +++ b/graphgym/contrib/loader/roland_template.py @@ -0,0 +1,62 @@ +""" +A generic loader for the roland project, modify this template to build +loaders for other financial transaction datasets and dynamic graphs. +NOTE: this script is the trimmed version for homogenous graphs only. +Mar. 22, 2021. +# Search for TODO in this file. +""" +import os +from typing import List + +import deepsnap +import graphgym.contrib.loader.dynamic_graph_utils as utils +from deepsnap.graph import Graph +from graphgym.config import cfg +from graphgym.register import register_loader + + +def load_single_dataset(dataset_dir: str) -> Graph: + # TODO: Load your data here. + node_feature = None # (num_nodes, *) + edge_feature = None # (num_edges, *) + edge_index = None # (2, num_edges) + # edge time should be unix timestmap integers. + edge_time = None # (num_edges) + + graph = Graph( + node_feature=node_feature, + edge_feature=edge_feature, + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + + return graph + + +def load_generic_dataset(format: str, name: str, dataset_dir: str + ) -> List[deepsnap.graph.Graph]: + """Load the dataset as a list of graph snapshots. + + Args: + format (str): format of dataset. + name (str): file name of dataset. 
+ dataset_dir (str): path of dataset, do NOT include the file name, use + the parent directory of dataset file. + + Returns: + List[deepsnap.graph.Graph]: a list of graph snapshots. + """ + # TODO: change the format name. + if format == 'generic': + dataset_dir = os.path.join(dataset_dir, name) + g_all = load_single_dataset(dataset_dir) + snapshot_list = utils.make_graph_snapshot( + g_all, + snapshot_freq=cfg.transaction.snapshot_freq, + is_hetero=cfg.dataset.is_hetero) + return snapshot_list + + +# TODO: don't forget to register the loader. +register_loader('roland_generic', load_generic_dataset) From 71eb77f68292094bad633c970061229a84385d64 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 19:24:53 -0700 Subject: [PATCH 33/66] remove --- graphgym/models/head_mem.py | 130 ------------------------------------ 1 file changed, 130 deletions(-) delete mode 100644 graphgym/models/head_mem.py diff --git a/graphgym/models/head_mem.py b/graphgym/models/head_mem.py deleted file mode 100644 index 3114cc72..00000000 --- a/graphgym/models/head_mem.py +++ /dev/null @@ -1,130 +0,0 @@ -""" GNN heads are the last layer of a GNN right before loss computation. - -They are constructed in the init function of the gnn.GNN. -""" - -import torch -import torch.nn as nn - -from graphgym.config import cfg -from graphgym.models.layer import MLP -from graphgym.models.pooling import pooling_dict - -from graphgym.contrib.head import * -import graphgym.register as register - - -########### Head ############ - -class GNNNodeHead(nn.Module): - '''Head of GNN, node prediction''' - - def __init__(self, dim_in, dim_out): - super(GNNNodeHead, self).__init__() - self.layer_post_mp = MLP(dim_in, dim_out, - num_layers=cfg.gnn.layers_post_mp, bias=True) - - def _apply_index(self, batch): - if batch.node_label_index.shape[0] == batch.node_label.shape[0]: - return batch.node_feature[batch.node_label_index], batch.node_label - else: - return batch.node_feature[batch.node_label_index], \ - batch.node_label[batch.node_label_index] - - def forward(self, batch): - batch = self.layer_post_mp(batch) - pred, label = self._apply_index(batch) - return pred, label - - -class GNNEdgeHead(nn.Module): - '''Head of GNN, edge prediction''' - - def __init__(self, dim_in, dim_out): - ''' Head of Edge and link prediction models. - - Args: - dim_out: output dimension. For binary prediction, dim_out=1. - ''' - # Use dim_in for graph conv, since link prediction dim_out could be - # binary - # E.g. 
if decoder='dot', link probability is dot product between - # node embeddings, of dimension dim_in - super(GNNEdgeHead, self).__init__() - # module to decode edges from node embeddings - - if cfg.model.edge_decoding == 'concat': - self.layer_post_mp = MLP(dim_in * 2, dim_out, - num_layers=cfg.gnn.layers_post_mp, - bias=True) - # requires parameter - self.decode_module = lambda v1, v2: \ - self.layer_post_mp(torch.cat((v1, v2), dim=-1)) - else: - if dim_out > 1: - raise ValueError( - 'Binary edge decoding ({})is used for multi-class ' - 'edge/link prediction.'.format(cfg.model.edge_decoding)) - self.layer_post_mp = MLP(dim_in, dim_in, - num_layers=cfg.gnn.layers_post_mp, - bias=True) - if cfg.model.edge_decoding == 'dot': - self.decode_module = lambda v1, v2: torch.sum(v1 * v2, dim=-1) - elif cfg.model.edge_decoding == 'cosine_similarity': - self.decode_module = nn.CosineSimilarity(dim=-1) - else: - raise ValueError('Unknown edge decoding {}.'.format( - cfg.model.edge_decoding)) - - def _apply_index(self, batch): - return batch.node_feature[batch.edge_label_index], \ - batch.edge_label - - def forward(self, batch): - if cfg.model.edge_decoding != 'concat': - batch = self.layer_post_mp(batch) - pred, label = self._apply_index(batch) - nodes_first = pred[0] - nodes_second = pred[1] - pred = self.decode_module(nodes_first, nodes_second) - return pred, label - - -class GNNGraphHead(nn.Module): - '''Head of GNN, graph prediction - - The optional post_mp layer (specified by cfg.gnn.post_mp) is used - to transform the pooled embedding using an MLP. - ''' - - def __init__(self, dim_in, dim_out): - super(GNNGraphHead, self).__init__() - # todo: PostMP before or after global pooling - self.layer_post_mp = MLP(dim_in, dim_out, - num_layers=cfg.gnn.layers_post_mp, bias=True) - self.pooling_fun = pooling_dict[cfg.model.graph_pooling] - - def _apply_index(self, batch): - return batch.graph_feature, batch.graph_label - - def forward(self, batch): - if cfg.dataset.transform == 'ego': - graph_emb = self.pooling_fun(batch.node_feature, batch.batch, - batch.node_id_index) - else: - graph_emb = self.pooling_fun(batch.node_feature, batch.batch) - graph_emb = self.layer_post_mp(graph_emb) - batch.graph_feature = graph_emb - pred, label = self._apply_index(batch) - return pred, label - - -# Head models for external interface -head_dict = { - 'node': GNNNodeHead, - 'edge': GNNEdgeHead, - 'link_pred': GNNEdgeHead, - 'graph': GNNGraphHead -} - -head_dict = {**register.head_dict, **head_dict} From ffc2cc9d565609b4927f6dc7a96d367c9b3b9ced Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 20:03:58 -0700 Subject: [PATCH 34/66] add GNN recurrent layer --- graphgym/contrib/network/gnn_recurrent.py | 110 ++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 graphgym/contrib/network/gnn_recurrent.py diff --git a/graphgym/contrib/network/gnn_recurrent.py b/graphgym/contrib/network/gnn_recurrent.py new file mode 100644 index 00000000..1d0c562c --- /dev/null +++ b/graphgym/contrib/network/gnn_recurrent.py @@ -0,0 +1,110 @@ +import graphgym.register as register +import torch.nn as nn +import torch.nn.functional as F +from graphgym.config import cfg +from graphgym.contrib.stage import * +from graphgym.init import init_weights +from graphgym.models.act import act_dict +from graphgym.models.feature_augment import Preprocess +from graphgym.models.feature_encoder import (edge_encoder_dict, + node_encoder_dict) +from graphgym.models.head import head_dict +from graphgym.models.layer import 
(BatchNorm1dEdge, BatchNorm1dNode, + GeneralMultiLayer, layer_dict) +from graphgym.models.layer_recurrent import RecurrentGraphLayer +from graphgym.register import register_network + + +def GNNLayer(dim_in: int, dim_out: int, has_act: bool=True, layer_id: int=0): + # General constructor for GNN layer. + return RecurrentGraphLayer(cfg.gnn.layer_type, dim_in, dim_out, + has_act, layer_id=layer_id) + + +def GNNPreMP(dim_in, dim_out): + r'''Constructs preprocessing layers: dim_in --> dim_out --> dim_out --> ... --> dim_out''' + return GeneralMultiLayer('linear', cfg.gnn.layers_pre_mp, + dim_in, dim_out, dim_inner=dim_out, + final_act=True) + + +class GNNStackStage(nn.Module): + def __init__(self, dim_in, dim_out, num_layers): + super(GNNStackStage, self).__init__() + for i in range(num_layers): + d_in = dim_in if i == 0 else dim_out + layer = GNNLayer(d_in, dim_out, layer_id=i) + self.add_module('layer{}'.format(i), layer) + self.dim_out = dim_out + + def forward(self, batch): + for layer in self.children(): + batch = layer(batch) + if cfg.gnn.l2norm: + batch.node_feature = F.normalize(batch.node_feature, p=2, dim=-1) + return batch + + +stage_dict = { + 'stack': GNNStackStage, +} + +stage_dict = {**register.stage_dict, **stage_dict} + + +class GNNRecurrent(nn.Module): + r'''The General GNN model''' + + def __init__(self, dim_in, dim_out, **kwargs): + r'''Initializes the GNN model. + + Args: + dim_in, dim_out: dimensions of in and out channels. + Parameters: + node_encoding_classes - For integer features, gives the number + of possible integer features to map. + ''' + super(GNNRecurrent, self).__init__() + # Stage: inter-layer connections. + GNNStage = stage_dict[cfg.gnn.stage_type] + # Head: prediction head, the final layer. + GNNHead = head_dict[cfg.dataset.task] + + if cfg.dataset.node_encoder: + # Encode integer node features via nn.Embeddings + NodeEncoder = node_encoder_dict[cfg.dataset.node_encoder_name] + self.node_encoder = NodeEncoder(cfg.dataset.encoder_dim) + if cfg.dataset.node_encoder_bn: + self.node_encoder_bn = BatchNorm1dNode(cfg.dataset.encoder_dim) + # Update dim_in to reflect the new dimension fo the node features + dim_in = cfg.dataset.encoder_dim + + if cfg.dataset.edge_encoder: + # Encode integer edge features via nn.Embeddings + EdgeEncoder = edge_encoder_dict[cfg.dataset.edge_encoder_name] + self.edge_encoder = EdgeEncoder(cfg.dataset.encoder_dim) + if cfg.dataset.edge_encoder_bn: + self.edge_encoder_bn = BatchNorm1dEdge(cfg.dataset.edge_dim) + + self.preprocess = Preprocess(dim_in) + d_in = self.preprocess.dim_out + + if cfg.gnn.layers_pre_mp > 0: + self.pre_mp = GNNPreMP(d_in, cfg.gnn.dim_inner) + d_in = cfg.gnn.dim_inner + if cfg.gnn.layers_mp >= 1: + self.mp = GNNStage(dim_in=d_in, + dim_out=cfg.gnn.dim_inner, + num_layers=cfg.gnn.layers_mp) + d_in = self.mp.dim_out + self.post_mp = GNNHead(dim_in=d_in, dim_out=dim_out) + + self.apply(init_weights) + + def forward(self, batch): + for module in self.children(): + batch = module(batch) + return batch + + +register_network('gnn_recurrent', GNNRecurrent) From 61563973aaa9874dc74fd341ea1635fe2a282ad6 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 20:04:27 -0700 Subject: [PATCH 35/66] add gnn recurrent layer. 
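RecurrentGraphLayer (used by the 'gnn_recurrent' network) wraps a standard
message-passing layer together with an embedding updater so that, per layer l
and snapshot t, h[l, t] = Update(h[l, t-1], GNN(h[l-1, t])). The updater is
selected by cfg.gnn.embed_update_method ('moving_average', 'mlp', or 'gru').

For orientation only, a minimal moving-average updater compatible with the
update_dict[...](dim_in, dim_out, layer_id) constructor call could look like
the sketch below; the actual implementations live in graphgym/models/update.py
and may differ, and the per-node keep_ratio attribute is an assumption based
on cfg.transaction.keep_ratio:

    import torch.nn as nn

    class MovingAverageUpdater(nn.Module):
        # Illustrative sketch, not the code in graphgym/models/update.py.
        def __init__(self, dim_in, dim_out, layer_id):
            super().__init__()
            # dim_in/dim_out are unused in this simple variant; kept only to
            # match the update_dict constructor signature.
            self.layer_id = layer_id

        def forward(self, batch):
            # Assumed: keep_ratio is a per-node weight in [0, 1] derived from
            # node degrees (see cfg.transaction.keep_ratio).
            keep = batch.keep_ratio.view(-1, 1)
            prev = batch.node_states[self.layer_id]
            # new state = keep * old state + (1 - keep) * GNN output.
            batch.node_states[self.layer_id] = \
                keep * prev + (1 - keep) * batch.node_feature
            return batch

The 'gru' variant replaces this convex combination with
new_state = GRU(node_feature, old_state).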
--- graphgym/models/layer_recurrent.py | 281 +++++++---------------------- 1 file changed, 68 insertions(+), 213 deletions(-) diff --git a/graphgym/models/layer_recurrent.py b/graphgym/models/layer_recurrent.py index df60700e..50e4cfde 100644 --- a/graphgym/models/layer_recurrent.py +++ b/graphgym/models/layer_recurrent.py @@ -1,26 +1,55 @@ +''' +This file contains wrapper layers and constructors for dynamic/recurrent GNNs. +''' +from graphgym.register import register_layer import torch import torch.nn as nn import torch.nn.functional as F -import torch_geometric as pyg - from graphgym.config import cfg from graphgym.models.act import act_dict -from graphgym.contrib.layer.generalconv import (GeneralConvLayer, - GeneralEdgeConvLayer) - -from graphgym.contrib.layer import * -import graphgym.register as register - - -## General classes -class GeneralLayer(nn.Module): - '''General wrapper for layers''' - - def __init__(self, name, dim_in, dim_out, has_act=True, has_bn=True, - has_l2norm=False, **kwargs): - super(GeneralLayer, self).__init__() +from graphgym.models.layer import layer_dict +from graphgym.models.update import update_dict + + +class RecurrentGraphLayer(nn.Module): + ''' + The recurrent graph layer for snapshot-based dynamic graphs. + This layer requires + (1): a GNN block for message passing. + (2): a node embedding/state update module. + + This layer updates node embedding as the following: + h[l, t] = Update(h[l, t-1], GNN(h[l-1, t])). + + This layer corresponds to a particular l-th layer in multi-layer setting, + the layer id is specified by 'id' in '__init__'. + ''' + def __init__(self, name: str, dim_in: int, dim_out: int, has_act: bool=True, + has_bn: bool=True, has_l2norm: bool=False, layer_id: int=0, + **kwargs): + ''' + Args: + name (str): The name of GNN layer to use for message-passing. + dim_in (int): Dimension of input node feature. + dim_out (int): Dimension of updated embedding. + has_act (bool, optional): Whether to after message passing. + Defaults to True. + has_bn (bool, optional): Whether add batch normalization for + node embedding. Defaults to True. + has_l2norm (bool, optional): Whether to add L2-normalization for + message passing result. Defaults to False. + layer_id (int, optional): The layer id in multi-layer setting. + Defaults to 0. + ''' + super(RecurrentGraphLayer, self).__init__() self.has_l2norm = has_l2norm + if layer_id < 0: + raise ValueError(f'layer_id must be non-negative, got {layer_id}.') + self.layer_id = layer_id has_bn = has_bn and cfg.gnn.batchnorm + self.dim_in = dim_in + self.dim_out = dim_out + # Construct the internal GNN layer. self.layer = layer_dict[name](dim_in, dim_out, bias=not has_bn, **kwargs) layer_wrapper = [] @@ -33,206 +62,32 @@ def __init__(self, name, dim_in, dim_out, has_act=True, has_bn=True, if has_act: layer_wrapper.append(act_dict[cfg.gnn.act]) self.post_layer = nn.Sequential(*layer_wrapper) + # self.update = self.construct_update_block(self.dim_in, self.dim_out, + # self.layer_id) + self.update = update_dict[cfg.gnn.embed_update_method](self.dim_in, + self.dim_out, + self.layer_id) + + def _init_hidden_state(self, batch): + # Initialize all node-states to zero. + if not isinstance(batch.node_states[self.layer_id], torch.Tensor): + batch.node_states[self.layer_id] = torch.zeros( + batch.node_feature.shape[0], self.dim_out).to( + batch.node_feature.device) def forward(self, batch): + # Message passing. 
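+        # The wrapped GNN consumes node_feature from the current snapshot;
+        # its output is batch-normed/activated by post_layer, folded into
+        # node_states[layer_id] by the update module, and written back to
+        # node_feature as this layer's output.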
batch = self.layer(batch) - if isinstance(batch, torch.Tensor): - batch = self.post_layer(batch) - if self.has_l2norm: - batch = F.normalize(batch, p=2, dim=1) - else: - batch.node_feature = self.post_layer(batch.node_feature) - if self.has_l2norm: - batch.node_feature = F.normalize(batch.node_feature, p=2, dim=1) - return batch - - -class GeneralMultiLayer(nn.Module): - '''General wrapper for stack of layers''' - - def __init__(self, name, num_layers, dim_in, dim_out, dim_inner=None, - final_act=True, **kwargs): - super(GeneralMultiLayer, self).__init__() - dim_inner = dim_in if dim_inner is None else dim_inner - for i in range(num_layers): - d_in = dim_in if i == 0 else dim_inner - d_out = dim_out if i == num_layers - 1 else dim_inner - has_act = final_act if i == num_layers - 1 else True - layer = GeneralLayer(name, d_in, d_out, has_act, **kwargs) - self.add_module('Layer_{}'.format(i), layer) - - def forward(self, batch): - for layer in self.children(): - batch = layer(batch) - return batch - - -## Core basic layers -# Input: batch; Output: batch -class Linear(nn.Module): - def __init__(self, dim_in, dim_out, bias=False, **kwargs): - super(Linear, self).__init__() - self.model = nn.Linear(dim_in, dim_out, bias=bias) - - def forward(self, batch): - if isinstance(batch, torch.Tensor): - batch = self.model(batch) - else: - batch.node_feature = self.model(batch.node_feature) - return batch - - -class BatchNorm1dNode(nn.Module): - '''General wrapper for layers''' - - def __init__(self, dim_in): - super(BatchNorm1dNode, self).__init__() - self.bn = nn.BatchNorm1d(dim_in, eps=cfg.bn.eps, momentum=cfg.bn.mom) - - def forward(self, batch): - batch.node_feature = self.bn(batch.node_feature) - return batch - - -class BatchNorm1dEdge(nn.Module): - '''General wrapper for layers''' - - def __init__(self, dim_in): - super(BatchNorm1dEdge, self).__init__() - self.bn = nn.BatchNorm1d(dim_in, eps=cfg.bn.eps, momentum=cfg.bn.mom) - - def forward(self, batch): - batch.edge_feature = self.bn(batch.edge_feature) - return batch - - -class MLP(nn.Module): - def __init__(self, dim_in, dim_out, bias=True, dim_inner=None, - num_layers=2, **kwargs): - ''' - Note: MLP works for 0 layers - ''' - super(MLP, self).__init__() - dim_inner = dim_in if dim_inner is None else dim_inner - layers = [] - if num_layers > 1: - layers.append( - GeneralMultiLayer('linear', num_layers - 1, dim_in, dim_inner, - dim_inner, final_act=True)) - layers.append(Linear(dim_inner, dim_out, bias)) - else: - layers.append(Linear(dim_in, dim_out, bias)) - self.model = nn.Sequential(*layers) - - def forward(self, batch): - if isinstance(batch, torch.Tensor): - batch = self.model(batch) - else: - batch.node_feature = self.model(batch.node_feature) - return batch - - -class GCNConv(nn.Module): - def __init__(self, dim_in, dim_out, bias=False, **kwargs): - super(GCNConv, self).__init__() - self.model = pyg.nn.GCNConv(dim_in, dim_out, bias=bias) - - def forward(self, batch): - batch.node_feature = self.model(batch.node_feature, batch.edge_index) - return batch - - -class SAGEConv(nn.Module): - def __init__(self, dim_in, dim_out, bias=False, **kwargs): - super(SAGEConv, self).__init__() - self.model = pyg.nn.SAGEConv(dim_in, dim_out, bias=bias, concat=True) - - def forward(self, batch): - batch.node_feature = self.model(batch.node_feature, batch.edge_index) - return batch - - -class GATConv(nn.Module): - def __init__(self, dim_in, dim_out, bias=False, **kwargs): - super(GATConv, self).__init__() - self.model = pyg.nn.GATConv(dim_in, dim_out, 
bias=bias) - - def forward(self, batch): - batch.node_feature = self.model(batch.node_feature, batch.edge_index) - return batch - - -class GINConv(nn.Module): - def __init__(self, dim_in, dim_out, bias=False, **kwargs): - super(GINConv, self).__init__() - gin_nn = nn.Sequential(nn.Linear(dim_in, dim_out), nn.ReLU(), - nn.Linear(dim_out, dim_out)) - self.model = pyg.nn.GINConv(gin_nn) - - def forward(self, batch): - batch.node_feature = self.model(batch.node_feature, batch.edge_index) + batch.node_feature = self.post_layer(batch.node_feature) + if self.has_l2norm: + batch.node_feature = F.normalize(batch.node_feature, p=2, dim=1) + + self._init_hidden_state(batch) + # Compute output from updater block. + batch = self.update(batch) + # batch.node_states[self.layer_id] = node_states_new + batch.node_feature = batch.node_states[self.layer_id] return batch -class SplineConv(nn.Module): - def __init__(self, dim_in, dim_out, bias=False, **kwargs): - super(SplineConv, self).__init__() - self.model = pyg.nn.SplineConv(dim_in, dim_out, - dim=1, kernel_size=2, bias=bias) - - def forward(self, batch): - batch.node_feature = self.model(batch.node_feature, batch.edge_index, - batch.edge_feature) - return batch - - -class GeneralConv(nn.Module): - def __init__(self, dim_in, dim_out, bias=False, **kwargs): - super(GeneralConv, self).__init__() - self.model = GeneralConvLayer(dim_in, dim_out, bias=bias) - - def forward(self, batch): - batch.node_feature = self.model(batch.node_feature, batch.edge_index) - return batch - - -class GeneralEdgeConv(nn.Module): - def __init__(self, dim_in, dim_out, bias=False, **kwargs): - super(GeneralEdgeConv, self).__init__() - self.model = GeneralEdgeConvLayer(dim_in, dim_out, bias=bias) - - def forward(self, batch): - batch.node_feature = self.model(batch.node_feature, batch.edge_index, - edge_feature=batch.edge_feature) - return batch - - -class GeneralSampleEdgeConv(nn.Module): - def __init__(self, dim_in, dim_out, bias=False, **kwargs): - super(GeneralSampleEdgeConv, self).__init__() - self.model = GeneralEdgeConvLayer(dim_in, dim_out, bias=bias) - - def forward(self, batch): - edge_mask = torch.rand(batch.edge_index.shape[1]) < cfg.gnn.keep_edge - edge_index = batch.edge_index[:, edge_mask] - edge_feature = batch.edge_feature[edge_mask, :] - batch.node_feature = self.model(batch.node_feature, edge_index, - edge_feature=edge_feature) - return batch - - -layer_dict = { - 'linear': Linear, - 'mlp': MLP, - 'gcnconv': GCNConv, - 'sageconv': SAGEConv, - 'gatconv': GATConv, - 'splineconv': SplineConv, - 'ginconv': GINConv, - 'generalconv': GeneralConv, - 'generaledgeconv': GeneralEdgeConv, - 'generalsampleedgeconv': GeneralSampleEdgeConv, -} - -# register additional convs -layer_dict = {**register.layer_dict, **layer_dict} +register_layer('recurrent_graph_layer', RecurrentGraphLayer) From 57fd6eb0f1d8791524cfcedbf3f0699ac44dbd97 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 21:28:54 -0700 Subject: [PATCH 36/66] add cfg.dataset.link_pred_all_edges option --- graphgym/loader.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/graphgym/loader.py b/graphgym/loader.py index c4e810ac..402e8062 100644 --- a/graphgym/loader.py +++ b/graphgym/loader.py @@ -166,12 +166,23 @@ def transform_before_split(dataset): return dataset -def transform_after_split(datasets): +def transform_after_split(datasets, dataset): ''' Dataset transformation after train/val/test split :param dataset: A list of DeepSNAP dataset objects :return: 
A list of transformed DeepSNAP dataset objects ''' + if cfg.dataset.link_pred_all_edges: + for t in range(len(datasets[2])): + g = datasets[2].graphs[t] + neg = g.negative_sampling(dataset[t].edge_index, + dataset[t].num_nodes, + dataset[t].edge_index.shape[1]) + pos = dataset[t].edge_index + g.edge_label_index = torch.cat((neg, pos), dim=1) + g.edge_label = torch.cat((torch.zeros(neg.shape[1]), + torch.ones(pos.shape[1]))) + if cfg.dataset.transform == 'ego': for split_dataset in datasets: split_dataset.apply_transform(ego_nets, @@ -190,18 +201,18 @@ def transform_after_split(datasets): def create_dataset(): - ## Load dataset + # Load dataset time1 = time.time() if cfg.dataset.format == 'OGB': graphs, splits = load_dataset() else: graphs = load_dataset() - ## Filter graphs + # Filter graphs time2 = time.time() min_node = filter_graphs() - ## Create whole dataset + # Create whole dataset dataset = GraphDataset( graphs, task=cfg.dataset.task, @@ -211,10 +222,10 @@ def create_dataset(): resample_disjoint=cfg.dataset.resample_disjoint, minimum_node_per_graph=min_node) - ## Transform the whole dataset + # Transform the whole dataset dataset = transform_before_split(dataset) - ## Split dataset + # Split dataset time3 = time.time() # Use custom data splits if cfg.dataset.format == 'OGB': @@ -232,14 +243,14 @@ def create_dataset(): for i in range(1, len(datasets)): dataset.edge_negative_sampling_ratio = 1 - ## Transform each split dataset + # Transform each split dataset time4 = time.time() - datasets = transform_after_split(datasets) + datasets = transform_after_split(datasets, dataset) time5 = time.time() logging.info('Load: {:.4}s, Before split: {:.4}s, ' 'Split: {:.4}s, After split: {:.4}s'.format( - time2 - time1, time3 - time2, time4 - time3, time5 - time4)) + time2 - time1, time3 - time2, time4 - time3, time5 - time4)) return datasets From f61cffb4a38646b70b68274e4c59cedc16733600 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 21:29:01 -0700 Subject: [PATCH 37/66] add cfg.dataset.link_pred_all_edges --- graphgym/contrib/config/roland.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/graphgym/contrib/config/roland.py b/graphgym/contrib/config/roland.py index 58a32bac..1e4f242b 100644 --- a/graphgym/contrib/config/roland.py +++ b/graphgym/contrib/config/roland.py @@ -109,6 +109,8 @@ def set_cfg_roland(cfg): # are for validation and the last 10% snapshots are for testing. cfg.dataset.split_method = 'default' + # In the case of live-update, whether to predict all edges at time t+1. + cfg.dataset.link_pred_all_edges = False # ----------------------------------------------------------------------- # # Customized options: `transaction` for ROLAND dynamic graphs. 
# ----------------------------------------------------------------------- # From 910274449a89e39ee6a56fa3f121efe648107be3 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Sun, 6 Jun 2021 21:30:33 -0700 Subject: [PATCH 38/66] remove unused training scheme --- .../contrib/train/train_live_update_bptt.py | 269 ----------------- .../train/train_live_update_fixed_split.py | 281 ------------------ 2 files changed, 550 deletions(-) delete mode 100644 graphgym/contrib/train/train_live_update_bptt.py delete mode 100644 graphgym/contrib/train/train_live_update_fixed_split.py diff --git a/graphgym/contrib/train/train_live_update_bptt.py b/graphgym/contrib/train/train_live_update_bptt.py deleted file mode 100644 index e07b8d95..00000000 --- a/graphgym/contrib/train/train_live_update_bptt.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -The baseline training (non-incremental) training for live-update scheme. -NOTE: this setup requires extensive GPU memory and could lead to OOM error. -""" -import copy -import datetime -import logging -import os -from typing import Dict, List, Optional, Tuple - -import deepsnap -import numpy as np -import torch -from graphgym.checkpoint import clean_ckpt -from graphgym.config import cfg -from graphgym.contrib.train import train_utils -from graphgym.loss import compute_loss -from graphgym.optimizer import create_optimizer, create_scheduler -from graphgym.register import register_train -from graphgym.utils.io import makedirs_rm_exist -from graphgym.utils.stats import node_degree -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm - - -@torch.no_grad() -def get_task_batch(dataset: deepsnap.dataset.GraphDataset, - today: int, tomorrow: int, - prev_node_states: Optional[Dict[str, List[torch.Tensor]]] - ) -> deepsnap.graph.Graph: - """ - Construct batch required for the task (today, tomorrow). As defined in - batch's get_item method (used to get edge_label and get_label_index), - edge_label and edge_label_index returned would be different everytime - get_task_batch() is called. - - Moreover, copy node-memories (node_states and node_cells) to the batch. - """ - assert today < tomorrow < len(dataset) - # Get edges for message passing and prediction task. - batch = dataset[today].clone() - batch.edge_label = dataset[tomorrow].edge_label.clone() - batch.edge_label_index = dataset[tomorrow].edge_label_index.clone() - - # Copy previous memory to the batch. - if prev_node_states is not None: - for key, val in prev_node_states.items(): - copied = [x.detach().clone() for x in val] - setattr(batch, key, copied) - - batch = train_utils.move_batch_to_device(batch, cfg.device) - return batch - - -@torch.no_grad() -def update_node_states(model, dataset, task: Tuple[int, int], - prev_node_states: Optional[ - Dict[str, List[torch.Tensor]]] - ) -> Dict[str, List[torch.Tensor]]: - """Perform the provided task and keep track of the latest node_states. - - Example: task = (t, t+1), - the prev_node_states contains node embeddings at time (t-1). - the model perform task (t, t+1): - Input: (node embedding at t - 1, edges at t). - Output: possible transactions at t+1. - the model also generates node embeddings at t. - - after doing task (t, t+1), node_states contains information - from snapshot t. - """ - today, tomorrow = task - batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() - # Let the model modify batch.node_states (and batch.node_cells). - _, _ = model(batch) - # Collect the updated node states. 
- out = dict() - out['node_states'] = [x.detach().clone() for x in batch.node_states] - if isinstance(batch.node_cells[0], torch.Tensor): - out['node_cells'] = [x.detach().clone() for x in batch.node_cells] - - return out - - -def train_step(model, optimizer, scheduler, dataset, - task: Tuple[int, int]) -> dict: - """ - After receiving ground truth from a particular task, update the model by - performing back-propagation. - For example, on day t, the ground truth of task (t-1, t) has been revealed, - train the model using G[t-1] for message passing and label[t] as target. - """ - optimizer.zero_grad() - torch.cuda.empty_cache() - model.train() - - today, _ = task - - # get loss over time. - total_loss_over_time = torch.tensor(0.0).to(torch.device(cfg.device)) - # iterate from the beginning to compute node_states. - for t in range(today + 1): # (0, 1), (1, 2), ..., (today, today+1). - # perform task (t, t+1), use information up to tomorrow. - new_batch = get_task_batch(dataset, t, t + 1, None).clone() - if t > 0: # manually inherit node states and node cells for LSTM. - new_batch.node_states = batch.node_states - new_batch.node_cells = batch.node_cells - batch = new_batch - pred, true = model(batch) - loss, _ = compute_loss(pred, true) - if t > today - cfg.train.tbptt_freq: - # Perform the truncated version, only accumulate loss for recent - # snapshots. - total_loss_over_time += loss - # get average loss over time. - total_loss_over_time /= (today + 1) - # perform back-prop through time. - total_loss_over_time.backward() - optimizer.step() - - scheduler.step() - return {'loss': total_loss_over_time} - - -@torch.no_grad() -def evaluate_step(model, dataset, task: Tuple[int, int], fast: bool = False - ) -> dict: - """ - Evaluate model's performance on task = (today, tomorrow) - where today and tomorrow are integers indexing snapshots. - """ - today, tomorrow = task - model.eval() - - # Run forward pass to get the latest node states. - for t in range(today): # (0, 1), (1, 2), ...(today-1, today) - # Iterative through snapshots in the past, up to (today-1, today) - new_batch = get_task_batch(dataset, t, t + 1, None).clone() - if t > 0: - new_batch.node_states = batch.node_states - new_batch.node_cells = batch.node_cells - batch = new_batch - # forward pass to update node_states in batch. - _, _ = model(batch) - - # Evaluation. - # (today, today+1) - cur_batch = get_task_batch(dataset, today, tomorrow, None).clone() - if today > 0: - cur_batch.node_states = copy.deepcopy(batch.node_states) - cur_batch.node_cells = copy.deepcopy(batch.node_cells) - - pred, true = model(cur_batch) - loss, _ = compute_loss(pred, true) - - if fast: - # skip MRR calculation for internal validation. - return {'loss': loss.item()} - - mrr_batch = get_task_batch(dataset, today, tomorrow, None).clone() - if today > 0: - mrr_batch.node_states = copy.deepcopy(batch.node_states) - mrr_batch.node_cells = copy.deepcopy(batch.node_cells) - - mrr = train_utils.compute_MRR( - mrr_batch, - model, - num_neg_per_node=cfg.metric.mrr_num_negative_edges, - method=cfg.metric.mrr_method) - - return {'loss': loss.item(), 'mrr': mrr} - - -def train_live_update_bptt(loggers, loaders, model, optimizer, scheduler, datasets, - **kwargs): - for dataset in datasets: - # Sometimes edge degree info is already included in dataset. - if not hasattr(dataset[0], 'keep_ratio'): - train_utils.precompute_edge_degree_info(dataset) - - num_splits = len(loggers) # train/val/test splits. - # range for today in (today, tomorrow) task pairs. 
- task_range = range(len(datasets[0]) - cfg.transaction.horizon) - - t = datetime.datetime.now().strftime('%b%d_%H-%M-%S') - - # directory to store tensorboard files of this run. - out_dir = cfg.out_dir.replace('/', '\\') - # dir to store all run outputs for the entire batch. - run_dir = 'runs_' + cfg.remark - - print(f'Tensorboard directory: {out_dir}') - # If tensorboard directory exists, this config is in the re-run phase - # of run_batch, replace logs of previous runs with the new one. - makedirs_rm_exist(f'./{run_dir}/{out_dir}') - writer = SummaryWriter(f'./{run_dir}/{out_dir}') - - # save a copy of configuration for later identifications. - with open(f'./{run_dir}/{out_dir}/config.yaml', 'w') as f: - cfg.dump(stream=f) - - for t in tqdm(task_range, desc='Snapshot'): - # current task: t --> t+1. - # (1) Evaluate model's performance on this task, at this time, the - # model has seen no information on t+1, this evaluation is fair. - for i in range(1, num_splits): - perf = evaluate_step(model, datasets[i], (t, t + 1), fast=False) - - writer.add_scalars('val' if i == 1 else 'test', perf, t) - - # (2) Reveal the ground truth of task (t, t+1) and update the model - # to prepare for the next task. - del optimizer, scheduler # use new optimizers. - optimizer = create_optimizer(model.parameters()) - scheduler = create_scheduler(optimizer) - - # best model's validation loss, training epochs, and state_dict. - best_model = {'val_loss': np.inf, 'train_epoch': 0, 'state': None} - # keep track of how long we have NOT update the best model. - best_model_unchanged = 0 - # after not updating the best model for `tol` epochs, stop. - tol = cfg.train.internal_validation_tolerance - - # internal training loop (intra-snapshot cross-validation). - # choose the best model using current validation set, prepare for - # next task. - - for i in tqdm(range(cfg.optim.max_epoch + 1), desc='live update', - leave=False): - # Start with the un-trained model (i = 0), evaluate the model. - internal_val_perf = evaluate_step(model, datasets[1], - (t, t + 1), fast=True) - val_loss = internal_val_perf['loss'] - - if val_loss < best_model['val_loss']: - # replace the best model with the current model. - best_model = {'val_loss': val_loss, 'train_epoch': i, - 'state': copy.deepcopy(model.state_dict())} - best_model_unchanged = 0 - else: - # the current best model has dominated for these epochs. - best_model_unchanged += 1 - - if best_model_unchanged >= tol: - # If the best model has not been updated for a while, stop. - break - else: - # Otherwise, keep training. - train_perf = train_step(model, optimizer, scheduler, - datasets[0], (t, t + 1)) - writer.add_scalars('train', train_perf, t) - - writer.add_scalar('internal_best_val', best_model['val_loss'], t) - writer.add_scalar('best epoch', best_model['train_epoch'], t) - - # (3) Actually perform the update on training set to get node_states - # contains information up to time t. - # Use the best model selected from intra-snapshot cross-validation. 
- model.load_state_dict(best_model['state']) - - writer.close() - - if cfg.train.ckpt_clean: - clean_ckpt() - - logging.info('Task done, results saved in {}'.format(cfg.out_dir)) - - -register_train('live_update_baseline', train_live_update_bptt) diff --git a/graphgym/contrib/train/train_live_update_fixed_split.py b/graphgym/contrib/train/train_live_update_fixed_split.py deleted file mode 100644 index cde96628..00000000 --- a/graphgym/contrib/train/train_live_update_fixed_split.py +++ /dev/null @@ -1,281 +0,0 @@ -""" -A pipeline training model using live-update scheme but only evaluates the model -using the last 10% of snapshots, which is the same as conventional chronological -data splitting method. -""" -import copy -import datetime -import logging -import os -from typing import Dict, List, Optional, Tuple - -import deepsnap -import numpy as np -import torch -from graphgym.checkpoint import clean_ckpt -from graphgym.config import cfg -from graphgym.contrib.train import train_utils -from graphgym.loss import compute_loss -from graphgym.optimizer import create_optimizer, create_scheduler -from graphgym.register import register_train -from graphgym.utils.io import makedirs_rm_exist -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm - - -@torch.no_grad() -def get_task_batch(dataset: deepsnap.dataset.GraphDataset, - today: int, tomorrow: int, - prev_node_states: Optional[Dict[str, List[torch.Tensor]]] - ) -> deepsnap.graph.Graph: - """ - Construct batch required for the task (today, tomorrow). As defined in - batch's get_item method (used to get edge_label and get_label_index), - edge_label and edge_label_index returned would be different everytime - get_task_batch() is called. - - Moreover, copy node-memories (node_states and node_cells) to the batch. - """ - assert today < tomorrow < len(dataset) - # Get edges for message passing and prediction task. - batch = dataset[today].clone() - batch.edge_label = dataset[tomorrow].edge_label.clone() - batch.edge_label_index = dataset[tomorrow].edge_label_index.clone() - - # Copy previous memory to the batch. - if prev_node_states is not None: - for key, val in prev_node_states.items(): - copied = [x.detach().clone() for x in val] - setattr(batch, key, copied) - - batch = train_utils.move_batch_to_device(batch, cfg.device) - return batch - - -@torch.no_grad() -def update_node_states(model, dataset, task: Tuple[int, int], - prev_node_states: Optional[ - Dict[str, List[torch.Tensor]]] - ) -> Dict[str, List[torch.Tensor]]: - """Perform the provided task and keep track of the latest node_states. - - Example: task = (t, t+1), - the prev_node_states contains node embeddings at time (t-1). - the model perform task (t, t+1): - Input: (node embedding at t - 1, edges at t). - Output: possible transactions at t+1. - the model also generates node embeddings at t. - - after doing task (t, t+1), node_states contains information - from snapshot t. - """ - today, tomorrow = task - batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() - # Let the model modify batch.node_states (and batch.node_cells). - _, _ = model(batch) - # Collect the updated node states. 
- out = dict() - out['node_states'] = [x.detach().clone() for x in batch.node_states] - if isinstance(batch.node_cells[0], torch.Tensor): - out['node_cells'] = [x.detach().clone() for x in batch.node_cells] - - return out - - -def train_step(model, optimizer, scheduler, dataset, - task: Tuple[int, int], - prev_node_states: Optional[Dict[str, torch.Tensor]] - ) -> dict: - """ - After receiving ground truth from a particular task, update the model by - performing back-propagation. - For example, on day t, the ground truth of task (t-1, t) has been revealed, - train the model using G[t-1] for message passing and label[t] as target. - """ - optimizer.zero_grad() - torch.cuda.empty_cache() - - today, tomorrow = task - model.train() - batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() - - pred, true = model(batch) - loss, pred_score = compute_loss(pred, true) - loss.backward() - optimizer.step() - - scheduler.step() - return {'loss': loss} - - -@torch.no_grad() -def evaluate_step(model, dataset, task: Tuple[int, int], - prev_node_states: Optional[Dict[str, List[torch.Tensor]]], - fast: bool = False) -> dict: - """ - Evaluate model's performance on task = (today, tomorrow) - where today and tomorrow are integers indexing snapshots. - """ - today, tomorrow = task - model.eval() - batch = get_task_batch(dataset, today, tomorrow, prev_node_states).clone() - - pred, true = model(batch) - loss, pred_score = compute_loss(pred, true) - - if fast: - # skip MRR calculation for internal validation. - return {'loss': loss.item()} - - mrr_batch = get_task_batch(dataset, today, tomorrow, - prev_node_states).clone() - - mrr = train_utils.compute_MRR(mrr_batch, model, -1, 'all') - - return {'loss': loss.item(), 'mrr': mrr} - - -def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, - **kwargs): - - for dataset in datasets: - # Sometimes edge degree info is already included in dataset. - if not hasattr(dataset[0], 'keep_ratio'): - train_utils.precompute_edge_degree_info(dataset) - - if cfg.dataset.premade_datasets == 'fresh_save_cache': - if not os.path.exists(f'{cfg.dataset.dir}/cache/'): - os.mkdir(f'{cfg.dataset.dir}/cache/') - cache_path = '{}/cache/cached_datasets_{}_{}_{}.pt'.format( - cfg.dataset.dir, cfg.dataset.format.replace('.tsv', ''), - cfg.transaction.snapshot_freq, - datetime.now().strftime('%Y_%m_%d__%H_%M_%S') - ) - torch.save(datasets, cache_path) - - num_splits = len(loggers) # train/val/test splits. - # range for today in (today, tomorrow) task pairs. - task_range = range(len(datasets[0]) - cfg.transaction.horizon) - - t = datetime.datetime.now().strftime('%b%d_%H-%M-%S') - - # directory to store tensorboard files of this run. - out_dir = cfg.out_dir.replace('/', '\\') - # dir to store all run outputs for the entire batch. - run_dir = 'runs_' + cfg.remark - - print(f'Tensorboard directory: {out_dir}') - # If tensorboard directory exists, this config is in the re-run phase - # of run_batch, replace logs of previous runs with the new one. - makedirs_rm_exist(f'./{run_dir}/{out_dir}') - writer = SummaryWriter(f'./{run_dir}/{out_dir}') - - # save a copy of configuration for later identifications. - with open(f'./{run_dir}/{out_dir}/config.yaml', 'w') as f: - cfg.dump(stream=f) - - prev_node_states = None # no previous state on day 0. - # {'node_states': [Tensor, Tensor], 'node_cells: [Tensor, Tensor]} - - model_init = None # for meta-learning only, a model.state_dict() object. 
- for t in tqdm(task_range, desc='snapshot', leave=True): - # current task: t --> t+1. - # (1) Evaluate model's performance on this task, at this time, the - # model has seen no information on t+1, this evaluation is fair. - # Only evaluate the performance within the test set split region. - # Test snapshots are indexed [cfg.train.start_compute_mrr, end]. - perf = evaluate_step(model, datasets[2], (t, t + 1), - prev_node_states, fast=t < cfg.train.start_compute_mrr) - - writer.add_scalars('test', perf, t) - - # (2) Reveal the ground truth of task (t, t+1) and update the model - # to prepare for the next task. - del optimizer, scheduler # use new optimizers. - optimizer = create_optimizer(model.parameters()) - scheduler = create_scheduler(optimizer) - - # best model's validation loss, training epochs, and state_dict. - # The untrained model is the default best model. - best_model = {'val_loss': np.inf, 'train_epoch': 0, - 'state': copy.deepcopy(model.state_dict())} - # keep track of how long we have NOT update the best model. - best_model_unchanged = 0 - # after not updating the best model for `tol` epochs, stop. - tol = cfg.train.internal_validation_tolerance - - # internal training loop (intra-snapshot cross-validation). - # choose the best model using current validation set, prepare for - # next task. - - if cfg.meta.is_meta and (model_init is not None): - # For meta-learning, start fine-tuning from the pre-computed - # initialization weight. - model.load_state_dict(copy.deepcopy(model_init)) - - for i in tqdm(range(cfg.optim.max_epoch + 1), desc='live update', - leave=True): - # Start with the un-trained model (i = 0), evaluate the model. - internal_val_perf = evaluate_step(model, datasets[1], - (t, t + 1), - prev_node_states, fast=True) - val_loss = internal_val_perf['loss'] - - if val_loss < best_model['val_loss']: - # replace the best model with the current model. - best_model = {'val_loss': val_loss, 'train_epoch': i, - 'state': copy.deepcopy(model.state_dict())} - best_model_unchanged = 0 - else: - # the current best model has dominated for these epochs. - best_model_unchanged += 1 - - # if (i >= 2 * tol) and (best_model_unchanged >= tol): - if best_model_unchanged >= tol: - # If the best model has not been updated for a while, stop. - break - else: - # Otherwise, keep training. - train_perf = train_step(model, optimizer, scheduler, - datasets[0], (t, t + 1), - prev_node_states) - writer.add_scalars('train', train_perf, t) - - writer.add_scalar('internal_best_val', best_model['val_loss'], t) - writer.add_scalar('best epoch', best_model['train_epoch'], t) - - # (3) Actually perform the update on training set to get node_states - # contains information up to time t. - # Use the best model selected from intra-snapshot cross-validation. - # if best_model['state'] is None: - # breakpoint() - model.load_state_dict(best_model['state']) - - if cfg.meta.is_meta: # update meta-learning's initialization weights. - if model_init is None: # for the first task. - model_init = copy.deepcopy(best_model['state']) - else: # for subsequent task, update init. - if cfg.meta.method == 'moving_average': - new_weight = cfg.meta.alpha - elif cfg.meta.method == 'online_mean': - new_weight = 1 / (t + 1) # for t=1, the second item, 1/2. - else: - raise ValueError(f'Invalid method: {cfg.meta.method}') - - # (1-new_weight)*model_init + new_weight*best_model. 
- model_init = train_utils.average_state_dict(model_init, - best_model['state'], - new_weight) - - prev_node_states = update_node_states(model, datasets[0], (t, t + 1), - prev_node_states) - - writer.close() - - if cfg.train.ckpt_clean: - clean_ckpt() - - logging.info('Task done, results saved in {}'.format(cfg.out_dir)) - - -register_train('live_update_fixed_split', train_live_update) From 9f7a5fdd0b2d8bb1527637513a91206daed6c3e2 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 00:14:48 -0700 Subject: [PATCH 39/66] add template for homogenous graphs. --- graphgym/contrib/loader/roland_template.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/graphgym/contrib/loader/roland_template.py b/graphgym/contrib/loader/roland_template.py index 901cf329..b576029e 100644 --- a/graphgym/contrib/loader/roland_template.py +++ b/graphgym/contrib/loader/roland_template.py @@ -10,18 +10,25 @@ import deepsnap import graphgym.contrib.loader.dynamic_graph_utils as utils +import torch from deepsnap.graph import Graph from graphgym.config import cfg from graphgym.register import register_loader def load_single_dataset(dataset_dir: str) -> Graph: - # TODO: Load your data here. - node_feature = None # (num_nodes, *) - edge_feature = None # (num_edges, *) - edge_index = None # (2, num_edges) + # TODO: Load your data from dataset_dir here. + # Example: + num_nodes = 500 + num_node_feature = 16 + num_edges = 10000 + num_edge_feature = 32 + node_feature = torch.rand((num_nodes, num_node_feature)) + edge_feature = torch.rand((num_edges, num_edge_feature)) + edge_index = torch.randint(0, num_nodes - 1, (2, num_edges)) # edge time should be unix timestmap integers. - edge_time = None # (num_edges) + # random generate timestamps from 2021-05-01 to 2021-06-01 + edge_time = torch.randint(1619852450, 1622530850, (num_edges,)).sort()[0] graph = Graph( node_feature=node_feature, @@ -48,7 +55,7 @@ def load_generic_dataset(format: str, name: str, dataset_dir: str List[deepsnap.graph.Graph]: a list of graph snapshots. """ # TODO: change the format name. - if format == 'generic': + if format == 'YOUR_FORMAT_NAME_HERE': dataset_dir = os.path.join(dataset_dir, name) g_all = load_single_dataset(dataset_dir) snapshot_list = utils.make_graph_snapshot( @@ -59,4 +66,4 @@ def load_generic_dataset(format: str, name: str, dataset_dir: str # TODO: don't forget to register the loader. -register_loader('roland_generic', load_generic_dataset) +register_loader('YOUR_LOADER_NAME_HERE', load_generic_dataset) From f0b72f78e6fc2eee9d49bcfe6fa8110fbc8c1412 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 00:21:48 -0700 Subject: [PATCH 40/66] rename layer and don't register. --- graphgym/models/layer_recurrent.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/graphgym/models/layer_recurrent.py b/graphgym/models/layer_recurrent.py index 50e4cfde..c64f039c 100644 --- a/graphgym/models/layer_recurrent.py +++ b/graphgym/models/layer_recurrent.py @@ -1,6 +1,6 @@ -''' +""" This file contains wrapper layers and constructors for dynamic/recurrent GNNs. -''' +""" from graphgym.register import register_layer import torch import torch.nn as nn @@ -11,8 +11,8 @@ from graphgym.models.update import update_dict -class RecurrentGraphLayer(nn.Module): - ''' +class GeneralRecurrentLayer(nn.Module): + """ The recurrent graph layer for snapshot-based dynamic graphs. This layer requires (1): a GNN block for message passing. 
@@ -23,11 +23,11 @@ class RecurrentGraphLayer(nn.Module): This layer corresponds to a particular l-th layer in multi-layer setting, the layer id is specified by 'id' in '__init__'. - ''' + """ def __init__(self, name: str, dim_in: int, dim_out: int, has_act: bool=True, has_bn: bool=True, has_l2norm: bool=False, layer_id: int=0, **kwargs): - ''' + """ Args: name (str): The name of GNN layer to use for message-passing. dim_in (int): Dimension of input node feature. @@ -40,8 +40,8 @@ def __init__(self, name: str, dim_in: int, dim_out: int, has_act: bool=True, message passing result. Defaults to False. layer_id (int, optional): The layer id in multi-layer setting. Defaults to 0. - ''' - super(RecurrentGraphLayer, self).__init__() + """ + super(GeneralRecurrentLayer, self).__init__() self.has_l2norm = has_l2norm if layer_id < 0: raise ValueError(f'layer_id must be non-negative, got {layer_id}.') @@ -62,8 +62,7 @@ def __init__(self, name: str, dim_in: int, dim_out: int, has_act: bool=True, if has_act: layer_wrapper.append(act_dict[cfg.gnn.act]) self.post_layer = nn.Sequential(*layer_wrapper) - # self.update = self.construct_update_block(self.dim_in, self.dim_out, - # self.layer_id) + self.update = update_dict[cfg.gnn.embed_update_method](self.dim_in, self.dim_out, self.layer_id) @@ -88,6 +87,3 @@ def forward(self, batch): # batch.node_states[self.layer_id] = node_states_new batch.node_feature = batch.node_states[self.layer_id] return batch - - -register_layer('recurrent_graph_layer', RecurrentGraphLayer) From 7045cebe66bd499891eec62495dd4dde14a73441 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 02:36:33 -0700 Subject: [PATCH 41/66] rename --- .../{head_large_prediction.py => scalable_link_pred.py} | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) rename graphgym/contrib/head/{head_large_prediction.py => scalable_link_pred.py} (94%) diff --git a/graphgym/contrib/head/head_large_prediction.py b/graphgym/contrib/head/scalable_link_pred.py similarity index 94% rename from graphgym/contrib/head/head_large_prediction.py rename to graphgym/contrib/head/scalable_link_pred.py index 3b7e99c9..c2a08b06 100644 --- a/graphgym/contrib/head/head_large_prediction.py +++ b/graphgym/contrib/head/scalable_link_pred.py @@ -15,13 +15,13 @@ from graphgym.register import register_head -class LargeGNNEdgeHead(nn.Module): +class ScalableLinkPred(nn.Module): def __init__(self, dim_in: int, dim_out: int): # Use dim_in for graph conv, since link prediction dim_out could be # binary # E.g. if decoder='dot', link probability is dot product between # node embeddings, of dimension dim_in - super(LargeGNNEdgeHead, self).__init__() + super(ScalableLinkPred, self).__init__() # module to decode edges from node embeddings if cfg.model.edge_decoding == 'concat': @@ -61,8 +61,7 @@ def _apply_index(self, batch): batch.edge_label def forward_pred(self, batch): - # TODO: consider moving this to config. - predict_batch_size = 500000 # depends on GPU memroy size. 
+ predict_batch_size = cfg.metric.link_pred_batch_size num_pred = len(batch.edge_label) label = batch.edge_label if num_pred >= predict_batch_size: @@ -106,4 +105,4 @@ def forward(self, batch): return pred, label -register_head('link_pred_large', LargeGNNEdgeHead) +register_head('scalable_link_pred', ScalableLinkPred) From 6ef2747595ce83e9b7329a816222442cacdc90ea Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 02:37:55 -0700 Subject: [PATCH 42/66] add cfg.gnn.link_pred_batch_size --- graphgym/contrib/config/roland.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/graphgym/contrib/config/roland.py b/graphgym/contrib/config/roland.py index 1e4f242b..99b5ced5 100644 --- a/graphgym/contrib/config/roland.py +++ b/graphgym/contrib/config/roland.py @@ -34,6 +34,9 @@ def set_cfg_roland(cfg): # Options: {'none', 'identity', 'affine'}. cfg.gnn.skip_connection = 'none' + # The bath size while making link prediction, useful when number of negative + # edges is huge, use a smaller number depends on GPU memroy size.. + cfg.gnn.link_pred_batch_size = 500000 # ----------------------------------------------------------------------- # # Meta-Learning options. # ----------------------------------------------------------------------- # From 267e11dfb2d7d39cb92563e9f2f1cf3b4b7fb0ac Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 02:38:22 -0700 Subject: [PATCH 43/66] fix naming --- graphgym/contrib/head/scalable_link_pred.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphgym/contrib/head/scalable_link_pred.py b/graphgym/contrib/head/scalable_link_pred.py index c2a08b06..6ea875a6 100644 --- a/graphgym/contrib/head/scalable_link_pred.py +++ b/graphgym/contrib/head/scalable_link_pred.py @@ -61,7 +61,7 @@ def _apply_index(self, batch): batch.edge_label def forward_pred(self, batch): - predict_batch_size = cfg.metric.link_pred_batch_size + predict_batch_size = cfg.gnn.link_pred_batch_size num_pred = len(batch.edge_label) label = batch.edge_label if num_pred >= predict_batch_size: From bf28bf13a9796b1e3fafd6cff4d2df6502d13f28 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 12:40:52 -0700 Subject: [PATCH 44/66] add roland loader. --- graphgym/contrib/loader/roland.py | 492 ++++++++++++++++++++++-------- 1 file changed, 357 insertions(+), 135 deletions(-) diff --git a/graphgym/contrib/loader/roland.py b/graphgym/contrib/loader/roland.py index 0e640e77..ed38f30b 100644 --- a/graphgym/contrib/loader/roland.py +++ b/graphgym/contrib/loader/roland.py @@ -1,37 +1,113 @@ """ -A refined version for loading the roland dataset. This version has the -following key points: - -(1) Node's features are determined by their first transaction, so that - payer and payee information are no longer included as a edge features. - - Node features include: - company identity, bank, country, region, Skd, SkdL1, SkdL2, Skis, - SkisL1, SkisL2. - -(2) edge features include: # system, currency, scaled amount (EUR), and - scaled timestamp. - -Mar. 31, 2021 +One single loader for the roland project. 
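+Datasets handled here include the AS-733 autonomous-systems snapshots and the
+BSI-SVT transaction data.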
""" import os -from typing import List, Union +from datetime import datetime +from typing import List import dask.dataframe as dd -import deepsnap import graphgym.contrib.loader.dynamic_graph_utils as utils import numpy as np import pandas as pd import torch -from dask_ml.preprocessing import OrdinalEncoder +from dask_ml.preprocessing import OrdinalEncoder as DaskOrdinalEncoder from deepsnap.graph import Graph from graphgym.config import cfg from graphgym.register import register_loader from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import OrdinalEncoder as SkOrdinalEncoder +from tqdm import tqdm # ============================================================================= -# Configure and instantiate the loader here. +# AS-733 Dataset. +# ============================================================================= + + +def load_AS_dataset(dataset_dir: str) -> Graph: + all_files = [x for x in sorted(os.listdir(dataset_dir)) + if (x.startswith('as') and x.endswith('.txt'))] + assert len(all_files) == 733 + assert all(x.endswith('.txt') for x in all_files) + + def file2timestamp(file_name: str) -> int: + t = file_name.strip('.txt').strip('as') + ts = int(datetime.strptime(t, '%Y%m%d').timestamp()) + return ts + + edge_index_lst, edge_time_lst = list(), list() + all_files = sorted(all_files) + + for graph_file in tqdm(all_files): + today = file2timestamp(graph_file) + graph_file = os.path.join(dataset_dir, graph_file) + + src, dst = list(), list() + with open(graph_file, 'r') as f: + for line in f.readlines(): + if line.startswith('#'): + continue + line = line.strip('\n') + v1, v2 = line.split('\t') + src.append(int(v1)) + dst.append(int(v2)) + + edge_index = np.stack((src, dst)) + edge_index_lst.append(edge_index) + + edge_time = np.ones(edge_index.shape[1]) * today + edge_time_lst.append(edge_time) + + edge_index_raw = np.concatenate(edge_index_lst, axis=1).astype(int) + + num_nodes = len(np.unique(edge_index_raw)) + + # encode node indices to consecutive integers. + node_indices = np.sort(np.unique(edge_index_raw)) + enc = SkOrdinalEncoder(categories=[node_indices, node_indices]) + edge_index = enc.fit_transform(edge_index_raw.transpose()).transpose() + edge_index = torch.Tensor(edge_index).long() + edge_time = torch.Tensor(np.concatenate(edge_time_lst)) + + # Use scaled datetime as edge_feature. + scale = edge_time.max() - edge_time.min() + base = edge_time.min() + scaled_edge_time = 2 * (edge_time.clone() - base) / scale + + assert cfg.dataset.AS_node_feature in ['one', 'one_hot_id', + 'one_hot_degree_global'] + + if cfg.dataset.AS_node_feature == 'one': + node_feature = torch.ones(num_nodes, 1) + elif cfg.dataset.AS_node_feature == 'one_hot_id': + # One hot encoding the node ID. + node_feature = torch.Tensor(np.eye(num_nodes)) + elif cfg.dataset.AS_node_feature == 'one_hot_degree_global': + # undirected graph, use only out degree. + _, node_degree = torch.unique(edge_index[0], sorted=True, + return_counts=True) + node_feature = np.zeros((num_nodes, node_degree.max() + 1)) + node_feature[np.arange(num_nodes), node_degree] = 1 + # 1 ~ 63748 degrees, but only 710 possible levels, exclude all zero + # columns. 
+ non_zero_cols = (node_feature.sum(axis=0) > 0) + node_feature = node_feature[:, non_zero_cols] + node_feature = torch.Tensor(node_feature) + else: + raise NotImplementedError + + g_all = Graph( + node_feature=node_feature, + edge_feature=scaled_edge_time.reshape(-1, 1), + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + + return g_all + + +# ============================================================================= +# BSI-SVT Dataset # ============================================================================= # Required for all graphs. SRC_NODE: str = 'Payer' @@ -82,39 +158,39 @@ def construct_additional_features(df: pd.DataFrame) -> pd.DataFrame: """ Constructs additional features of the transaction dataset. """ - # for p in ('Payer', 'Payee'): - # # %% Location of companies. - # mask = (df[p + 'Country'] != 'SI') - # out_of_country = np.empty(len(df), dtype=object) - # out_of_country[mask] = 'OutOfCountry' - # out_of_country[~mask] = 'InCountry' - # df[p + 'OutOfCountry'] = out_of_country - # - # mask = (df['PayerCountry'] != df['PayeeCountry']) - # missing_mask = np.logical_or(df['PayerCountry'] == 'missing', - # df['PayeeCountry'] == 'missing') - # cross_country = np.empty(len(df), dtype=object) - # cross_country[mask] = 'CrossCountry' - # cross_country[~mask] = 'WithinCountry' - # cross_country[missing_mask] = 'Missing' - # df['CrossCountry'] = cross_country - # - # amount_level = np.empty(len(df), dtype=object) - # mask_small = df['AmountEUR'] < 500 - # mask_medium = np.logical_and(df['AmountEUR'] >= 500, - # df['AmountEUR'] < 1000) - # mask_large = df['AmountEUR'] >= 1000 - # amount_level[mask_small] = '$<500' - # amount_level[mask_medium] = '500<=$<1k' - # amount_level[mask_large] = '$>=1k' - # - # df['AmountLevel'] = amount_level + for p in ('Payer', 'Payee'): + # %% Location of companies. + mask = (df[p + 'Country'] != 'SI') + out_of_country = np.empty(len(df), dtype=object) + out_of_country[mask] = 'OutOfCountry' + out_of_country[~mask] = 'InCountry' + df[p + 'OutOfCountry'] = out_of_country + + mask = (df['PayerCountry'] != df['PayeeCountry']) + missing_mask = np.logical_or(df['PayerCountry'] == 'missing', + df['PayeeCountry'] == 'missing') + cross_country = np.empty(len(df), dtype=object) + cross_country[mask] = 'CrossCountry' + cross_country[~mask] = 'WithinCountry' + cross_country[missing_mask] = 'Missing' + df['CrossCountry'] = cross_country + + amount_level = np.empty(len(df), dtype=object) + mask_small = df['AmountEUR'] < 500 + mask_medium = np.logical_and(df['AmountEUR'] >= 500, + df['AmountEUR'] < 1000) + mask_large = df['AmountEUR'] >= 1000 + amount_level[mask_small] = '$<500' + amount_level[mask_medium] = '500<=$<1k' + amount_level[mask_large] = '$>=1k' + + df['AmountLevel'] = amount_level return df -def load_single_dataset(dataset_dir: str, is_hetero: bool = True, - type_info_loc: str = 'append' - ) -> Graph: +def load_bsi_dataset(dataset_dir: str, is_hetero: bool = False, + type_info_loc: str = 'append' + ) -> Graph: """ Loads a single graph object from tsv file. @@ -130,7 +206,8 @@ def load_single_dataset(dataset_dir: str, is_hetero: bool = True, df_trans = dd.read_csv(dataset_dir, sep='\t', low_memory=False) df_trans = df_trans.fillna('missing') df_trans = df_trans.compute() - df_trans = construct_additional_features(df_trans) + if is_hetero: + df_trans = construct_additional_features(df_trans) df_trans.reset_index(drop=True, inplace=True) # necessary for dask. # a unique values of node-level categorical variables. 
@@ -155,7 +232,7 @@ def load_single_dataset(dataset_dir: str, is_hetero: bool = True, # Encoding categorical variables, the dask_ml.OrdinalEncoder only modify # and encode columns of categorical dtype. - enc = OrdinalEncoder() + enc = DaskOrdinalEncoder() df_encoded = enc.fit_transform(df_trans) df_encoded.reset_index(drop=True, inplace=True) print('Columns encoded to ordinal:') @@ -174,9 +251,10 @@ def load_single_dataset(dataset_dir: str, is_hetero: bool = True, # Prepare for output. edge_feature = torch.Tensor(df_encoded[EDGE_FEATURE_COLS].values) - print('feature_edge_int_num', - [int(torch.max(edge_feature[:, i])) + 1 - for i in range(len(EDGE_FEATURE_COLS) - 2)]) + feature_edge_int_num = [int(torch.max(edge_feature[:, i])) + 1 + for i in range(len(EDGE_FEATURE_COLS) - 2)] + cfg.transaction.feature_edge_int_num = feature_edge_int_num + print('feature_edge_int_num', feature_edge_int_num) edge_index = torch.Tensor( df_encoded[[SRC_NODE, DST_NODE]].values.transpose()).long() # (2, E) @@ -245,76 +323,226 @@ def load_single_dataset(dataset_dir: str, is_hetero: bool = True, return graph +# ============================================================================= +# Bitcoin Dataset. +# ============================================================================= + + +def load_bitcoin_dataset(dataset_dir: str) -> Graph: + df_trans = pd.read_csv(dataset_dir, sep=',', header=None, index_col=None) + df_trans.columns = ['SOURCE', 'TARGET', 'RATING', 'TIME'] + # NOTE: 'SOURCE' and 'TARGET' are not consecutive. + num_nodes = len( + pd.unique(df_trans[['SOURCE', 'TARGET']].to_numpy().ravel())) + + # bitcoin OTC contains decimal numbers, round them. + df_trans['TIME'] = df_trans['TIME'].astype(np.int).astype(np.float) + assert not np.any(pd.isna(df_trans).values) + + time_scaler = MinMaxScaler((0, 2)) + df_trans['TimestampScaled'] = time_scaler.fit_transform( + df_trans['TIME'].values.reshape(-1, 1)) + + edge_feature = torch.Tensor( + df_trans[['RATING', 'TimestampScaled']].values) # (E, edge_dim) + + node_indices = np.sort( + pd.unique(df_trans[['SOURCE', 'TARGET']].to_numpy().ravel())) + enc = SkOrdinalEncoder(categories=[node_indices, node_indices]) + raw_edges = df_trans[['SOURCE', 'TARGET']].values + edge_index = enc.fit_transform(raw_edges).transpose() + edge_index = torch.LongTensor(edge_index) + + # num_nodes = torch.max(edge_index) + 1 + # Use dummy node features. + node_feature = torch.ones(num_nodes, 1).float() + + edge_time = torch.FloatTensor(df_trans['TIME'].values) + + graph = Graph( + node_feature=node_feature, + edge_feature=edge_feature, + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + return graph + + +# ============================================================================= +# Reddit Dataset. +# ============================================================================= + + +def load_reddit_dataset(dataset_dir: str) -> Graph: + df_trans = dd.read_csv(dataset_dir, sep='\t', low_memory=False) + df_trans = df_trans.compute() + assert not np.any(pd.isna(df_trans).values) + df_trans.reset_index(drop=True, inplace=True) # required for dask. + + # Encode src and dst node IDs. + # get unique values of src and dst. 
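+    # SOURCE and TARGET share one categorical vocabulary so that the same
+    # sub-reddit maps to the same integer node ID in either column.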
+ unique_subreddits = pd.unique( + df_trans[['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']].to_numpy().ravel()) + unique_subreddits = np.sort(unique_subreddits) + cate_type = pd.api.types.CategoricalDtype(categories=unique_subreddits, + ordered=True) + df_trans['SOURCE_SUBREDDIT'] = df_trans['SOURCE_SUBREDDIT'].astype( + cate_type) + df_trans['TARGET_SUBREDDIT'] = df_trans['TARGET_SUBREDDIT'].astype( + cate_type) + enc = DaskOrdinalEncoder(columns=['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']) + df_encoded = enc.fit_transform(df_trans) + df_encoded.reset_index(drop=True, inplace=True) + + # Add node feature from the embedding dataset. + node_embedding_dir = os.path.join(cfg.dataset.dir, + 'web-redditEmbeddings-subreddits.csv') + + # index: subreddit name, values: embedding. + df_node = pd.read_csv(node_embedding_dir, header=None, index_col=0) + + # ordinal encoding follows order in unique_subreddits. + # df_encoded['SOURCE_SUBREDDIT'] contains encoded integral values. + # unique_subreddits[df_encoded['SOURCE_SUBREDDIT']] + # tries to reverse encoded_integer --> original subreddit name. + # check if recovered sub-reddit name matched the raw data. + for col in ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']: + assert all(unique_subreddits[df_encoded[col]] == df_trans[col]) + + num_nodes = len(cate_type.categories) + node_feature = torch.ones(size=(num_nodes, 300)) + # for nodes without precomputed embedding, use the average value. + node_feature = node_feature * np.mean(df_node.values) + + # cate_type.categories[i] is encoded to i, by construction. + for i, subreddit in enumerate(cate_type.categories): + if subreddit in df_node.index: + embedding = df_node.loc[subreddit] + node_feature[i, :] = torch.Tensor(embedding.values) + + # Original format: df['TIMESTAMP'][0] = '2013-12-31 16:39:18' + # Convert to unix timestamp (integers). + df_encoded['TIMESTAMP'] = pd.to_datetime(df_encoded['TIMESTAMP'], + format='%Y-%m-%d %H:%M:%S') + df_encoded['TIMESTAMP'] = (df_encoded['TIMESTAMP'] - pd.Timestamp( + '1970-01-01')) // pd.Timedelta('1s') # now integers. + + # Scale edge time. + time_scaler = MinMaxScaler((0, 2)) + df_encoded['TimestampScaled'] = time_scaler.fit_transform( + df_encoded['TIMESTAMP'].values.reshape(-1, 1)) + + # Link sentimental representation (86-dimension). + # comma-separated string: '3.1,5.1,0.0,...' + senti_str_lst = df_encoded['PROPERTIES'].values + edge_senti_embedding = [x.split(',') for x in senti_str_lst] + edge_senti_embedding = np.array(edge_senti_embedding).astype(np.float32) + # (E, 86) + + ef = df_encoded[['TimestampScaled', 'LINK_SENTIMENT']].values + edge_feature = np.concatenate([ef, edge_senti_embedding], axis=1) + edge_feature = torch.Tensor(edge_feature).float() # (E, 88) + + edge_index = torch.Tensor( + df_encoded[['SOURCE_SUBREDDIT', + 'TARGET_SUBREDDIT']].values.transpose()).long() # (2, E) + num_nodes = torch.max(edge_index) + 1 + + edge_time = torch.FloatTensor(df_encoded['TIMESTAMP'].values) + + graph = Graph( + node_feature=node_feature, + edge_feature=edge_feature, + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + + return graph + + +# ============================================================================= +# College Message Dataset. 
+# ============================================================================= + + +def load_college_message_dataset(dataset_dir: str) -> Graph: + df_trans = pd.read_csv(dataset_dir, sep=' ', header=None) + df_trans.columns = ['SRC', 'DST', 'TIMESTAMP'] + assert not np.any(pd.isna(df_trans).values) + df_trans.reset_index(drop=True, inplace=True) + + # Node IDs of this dataset start from 1, re-index to 0-based. + df_trans['SRC'] -= 1 + df_trans['DST'] -= 1 + + print('num of edges:', len(df_trans)) + print('num of nodes:', np.max(df_trans[['SRC', 'DST']].values) + 1) + + time_scaler = MinMaxScaler((0, 2)) + df_trans['TimestampScaled'] = time_scaler.fit_transform( + df_trans['TIMESTAMP'].values.reshape(-1, 1)) + + edge_feature = torch.Tensor( + df_trans[['TimestampScaled']].values).view(-1, 1) + edge_index = torch.Tensor( + df_trans[['SRC', 'DST']].values.transpose()).long() # (2, E) + num_nodes = torch.max(edge_index) + 1 + + node_feature = torch.ones(num_nodes, 1) + + edge_time = torch.FloatTensor(df_trans['TIMESTAMP'].values) + + graph = Graph( + node_feature=node_feature, + edge_feature=edge_feature, + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + + return graph + + +def load_roland_dataset(format: str, name: str, dataset_dir: str + ) -> List[Graph]: + if format == 'roland': + # Load the entire graph from specified dataset. + if name in ['AS-733']: + g_all = load_AS_dataset(os.path.join(dataset_dir, name)) + elif name in ['bsi_svt_2008.tsv']: + # NOTE: only BSI dataset supports hetero graph. + g_all = load_bsi_dataset(os.path.join(dataset_dir, name), + is_hetero=cfg.dataset.is_hetero, + type_info_loc=cfg.dataset.type_info_loc) + elif name in ['bitcoinotc.csv', 'bitcoinalpha.csv']: + g_all = load_bitcoin_dataset(os.path.join(dataset_dir, name)) + elif name in ['reddit-body.tsv', 'reddit-title.tsv']: + g_all = load_reddit_dataset(os.path.join(dataset_dir, name)) + elif name in ['CollegeMsg.txt']: + g_all = load_college_message_dataset( + os.path.join(dataset_dir, name)) + else: + raise ValueError(f'Unsupported filename') + + # Make the graph snapshots. + snapshot_freq = cfg.transaction.snapshot_freq + if snapshot_freq.upper() in ['D', 'W', 'M']: + # Split snapshot using calendar frequency. + snapshot_list = utils.make_graph_snapshot(g_all, + snapshot_freq, + cfg.dataset.is_hetero) + elif snapshot_freq.endswith('s'): + # Split using frequency in terms of seconds. + assert snapshot_freq.endswith('s') + snapshot_freq = int(snapshot_freq.strip('s')) + assert not cfg.dataset.is_hetero, 'Hetero graph is not supported.' + snapshot_list = utils.make_graph_snapshot_by_seconds(g_all, + snapshot_freq) + else: + raise ValueError(f'Unsupported frequency type: {snapshot_freq}') -# def make_graph_snapshot(g_all: Graph, -# snapshot_freq: str, -# is_hetero: bool = True) -> list: -# """ -# Constructs a list of graph snapshots (Graph or HeteroGraph) based -# on g_all and snapshot_freq. -# -# Args: -# g_all: the entire homogenous graph. -# snapshot_freq: snapshot frequency. -# is_hetero: if make heterogeneous graphs. -# """ -# t = g_all.edge_time.numpy().astype(np.int64) -# snapshot_freq = snapshot_freq.upper() -# -# period_split = pd.DataFrame( -# {'Timestamp': t, -# 'TransactionTime': pd.to_datetime(t, unit='s')}, -# index=range(len(g_all.edge_time))) -# -# freq_map = {'D': '%j', # day of year. -# 'W': '%W', # week of year. -# 'M': '%m' # month of year. 
-# } -# -# period_split['Year'] = period_split['TransactionTime'].dt.strftime( -# '%Y').astype(int) -# -# period_split['SubYearFlag'] = period_split['TransactionTime'].dt.strftime( -# freq_map[snapshot_freq]).astype(int) -# -# period2id = period_split.groupby(['Year', 'SubYearFlag']).indices -# # e.g., dictionary w/ key = (2021, 3) and val = array(edges). -# -# periods = sorted(list(period2id.keys())) # ascending order. -# # alternatively, sorted(..., key=lambda x: x[0] + x[1]/1000). -# snapshot_list = list() -# for p in periods: -# # unique IDs of edges in this period. -# period_members = period2id[p] -# -# g_incr = Graph( -# node_feature=g_all.node_feature, -# edge_feature=g_all.edge_feature[period_members, :], -# edge_index=g_all.edge_index[:, period_members], -# edge_time=g_all.edge_time[period_members], -# directed=g_all.directed, -# list_n_type=g_all.list_n_type if is_hetero else None, -# list_e_type=g_all.list_e_type if is_hetero else None, -# ) -# if is_hetero and hasattr(g_all, 'node_type'): -# g_incr.node_type = g_all.node_type -# g_incr.edge_type = g_all.edge_type[period_members] -# snapshot_list.append(g_incr) -# return snapshot_list - - -def load_generic(dataset_dir: str, - snapshot: bool = True, - snapshot_freq: str = None, - is_hetero: bool = False, - type_info_loc: str = 'graph_attribute' - ) -> Union[deepsnap.graph.Graph, List[deepsnap.graph.Graph]]: - g_all = load_single_dataset(dataset_dir, is_hetero=is_hetero, - type_info_loc=type_info_loc) - if not snapshot: - return g_all - else: - snapshot_list = utils.make_graph_snapshot(g_all, snapshot_freq, is_hetero) num_nodes = g_all.edge_index.max() + 1 for g_snapshot in snapshot_list: @@ -322,19 +550,13 @@ def load_generic(dataset_dir: str, g_snapshot.node_cells = [0 for _ in range(cfg.gnn.layers_mp)] g_snapshot.node_degree_existing = torch.zeros(num_nodes) - return snapshot_list - + # Filter small snapshots. + filtered_graphs = list() + for g in snapshot_list: + if g.num_edges >= 10: + filtered_graphs.append(g) -def load_generic_dataset(format, name, dataset_dir): - if format == 'roland_bsi_general': - dataset_dir = os.path.join(dataset_dir, name) - graphs = load_generic(dataset_dir, - snapshot=cfg.transaction.snapshot, - snapshot_freq=cfg.transaction.snapshot_freq, - is_hetero=cfg.dataset.is_hetero, - type_info_loc=cfg.dataset.type_info_loc) - return graphs + return filtered_graphs -# TODO: change name. -register_loader('roland_bsi_v3', load_generic_dataset) +register_loader('roland', load_roland_dataset) From 7d71297249f9b75032ae43051fdc8fa4eb4cddc3 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 12:42:56 -0700 Subject: [PATCH 45/66] update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index a37445a5..4b747366 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ **/data_dir/ run/datasets/data/ +run/results/ +run/runs_*/ **/__pycache__/ **/.ipynb_checkpoints .idea/ From 9e6336f231d2dbb78d973767dedb31a0adb054ca Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 12:45:10 -0700 Subject: [PATCH 46/66] add example yamls. 
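Each YAML describes one live-update experiment and is consumed by the
main_dynamic.py entry point added in the next patch, e.g.

    python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_ucimsg.yaml --repeat 1

The same invocations are collected later in run/run_roland_single.sh.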
--- run/configs/ROLAND/roland_gru_as733.yaml | 70 ++++++++++++++++++ run/configs/ROLAND/roland_gru_btcalpha.yaml | 70 ++++++++++++++++++ run/configs/ROLAND/roland_gru_btcotc.yaml | 70 ++++++++++++++++++ run/configs/ROLAND/roland_gru_redditbody.yaml | 70 ++++++++++++++++++ .../ROLAND/roland_gru_reddittitle.yaml | 70 ++++++++++++++++++ run/configs/ROLAND/roland_gru_ucimsg.yaml | 70 ++++++++++++++++++ run/configs/ROLAND/roland_mlp_bsisvt.yaml | 71 +++++++++++++++++++ 7 files changed, 491 insertions(+) create mode 100644 run/configs/ROLAND/roland_gru_as733.yaml create mode 100644 run/configs/ROLAND/roland_gru_btcalpha.yaml create mode 100644 run/configs/ROLAND/roland_gru_btcotc.yaml create mode 100644 run/configs/ROLAND/roland_gru_redditbody.yaml create mode 100644 run/configs/ROLAND/roland_gru_reddittitle.yaml create mode 100644 run/configs/ROLAND/roland_gru_ucimsg.yaml create mode 100644 run/configs/ROLAND/roland_mlp_bsisvt.yaml diff --git a/run/configs/ROLAND/roland_gru_as733.yaml b/run/configs/ROLAND/roland_gru_as733.yaml new file mode 100644 index 00000000..16901c0a --- /dev/null +++ b/run/configs/ROLAND/roland_gru_as733.yaml @@ -0,0 +1,70 @@ +remark: live_update +out_dir: results +device: auto +metric: + mrr_method: max + mrr_num_negative_edges: 1000 +dataset: + format: roland + name: AS-733 + is_hetero: False + dir: /home/tianyudu/Data/all_datasets + task: link_pred + task_type: classification + transductive: True + split: [0.8, 0.1, 0.1] + augment_feature: [] + augment_feature_dims: [0] + edge_encoder: True + edge_encoder_name: roland_general + edge_dim: 1 + node_encoder: False + link_pred_all_edges: False +transaction: + keep_ratio: linear + snapshot: True + snapshot_freq: D + check_snapshot: False + history: rolling + horizon: 1 + pred_mode: at + loss: supervised + feature_int_dim: 16 + feature_edge_int_num: [] + feature_node_int_num: [] + feature_amount_dim: 16 + feature_time_dim: 16 +train: + batch_size: 32 + eval_period: 20 + ckpt_period: 400 + mode: live_update + internal_validation_tolerance: 5 +meta: + is_meta: True + alpha: 0.5 +model: + type: gnn_recurrent + loss_fun: cross_entropy + edge_decoding: concat +gnn: + embed_update_method: gru + layers_pre_mp: 2 + layers_mp: 2 + layers_post_mp: 2 + dim_inner: 128 + mlp_update_layers: 2 + layer_type: residual_edge_conv + skip_connection: affine + stage_type: stack + batchnorm: True + act: prelu + dropout: 0.0 + agg: add + att_heads: 1 + normalize_adj: False + msg_direction: both +optim: + optimizer: adam + base_lr: 0.03 + max_epoch: 100 \ No newline at end of file diff --git a/run/configs/ROLAND/roland_gru_btcalpha.yaml b/run/configs/ROLAND/roland_gru_btcalpha.yaml new file mode 100644 index 00000000..5b5ed5e7 --- /dev/null +++ b/run/configs/ROLAND/roland_gru_btcalpha.yaml @@ -0,0 +1,70 @@ +remark: live_update +out_dir: results +device: auto +metric: + mrr_method: max + mrr_num_negative_edges: 1000 +dataset: + format: roland + name: bitcoinalpha.csv + is_hetero: False + dir: /home/tianyudu/Data/all_datasets + task: link_pred + task_type: classification + transductive: True + split: [0.8, 0.1, 0.1] + augment_feature: [] + augment_feature_dims: [0] + edge_encoder: True + edge_encoder_name: roland_general + edge_dim: 2 + node_encoder: False + link_pred_all_edges: False +transaction: + keep_ratio: linear + snapshot: True + snapshot_freq: W + check_snapshot: False + history: rolling + horizon: 1 + pred_mode: at + loss: supervised + feature_int_dim: 16 + feature_edge_int_num: [] + feature_node_int_num: [] + feature_amount_dim: 16 + 
feature_time_dim: 16 +train: + batch_size: 32 + eval_period: 20 + ckpt_period: 400 + mode: live_update + internal_validation_tolerance: 5 +meta: + is_meta: True + alpha: 0.8 +model: + type: gnn_recurrent + loss_fun: cross_entropy + edge_decoding: concat +gnn: + embed_update_method: gru + layers_pre_mp: 2 + layers_mp: 2 + layers_post_mp: 2 + dim_inner: 64 + mlp_update_layers: 2 + layer_type: residual_edge_conv + skip_connection: affine + stage_type: stack + batchnorm: False + act: prelu + dropout: 0.0 + agg: add + att_heads: 1 + normalize_adj: False + msg_direction: both +optim: + optimizer: adam + base_lr: 0.003 + max_epoch: 100 \ No newline at end of file diff --git a/run/configs/ROLAND/roland_gru_btcotc.yaml b/run/configs/ROLAND/roland_gru_btcotc.yaml new file mode 100644 index 00000000..fafc3cb1 --- /dev/null +++ b/run/configs/ROLAND/roland_gru_btcotc.yaml @@ -0,0 +1,70 @@ +remark: live_update +out_dir: results +device: auto +metric: + mrr_method: max + mrr_num_negative_edges: 1000 +dataset: + format: roland + name: bitcoinotc.csv + is_hetero: False + dir: /home/tianyudu/Data/all_datasets + task: link_pred + task_type: classification + transductive: True + split: [0.8, 0.1, 0.1] + augment_feature: [] + augment_feature_dims: [0] + edge_encoder: True + edge_encoder_name: roland_general + edge_dim: 2 + node_encoder: False + link_pred_all_edges: False +transaction: + keep_ratio: linear + snapshot: True + snapshot_freq: W + check_snapshot: False + history: rolling + horizon: 1 + pred_mode: at + loss: supervised + feature_int_dim: 16 + feature_edge_int_num: [] + feature_node_int_num: [] + feature_amount_dim: 16 + feature_time_dim: 16 +train: + batch_size: 32 + eval_period: 20 + ckpt_period: 400 + mode: live_update + internal_validation_tolerance: 5 +meta: + is_meta: True + alpha: 0.9 +model: + type: gnn_recurrent + loss_fun: cross_entropy + edge_decoding: concat +gnn: + embed_update_method: gru + layers_pre_mp: 2 + layers_mp: 4 + layers_post_mp: 2 + dim_inner: 64 + mlp_update_layers: 2 + layer_type: residual_edge_conv + skip_connection: affine + stage_type: stack + batchnorm: False + act: prelu + dropout: 0.0 + agg: add + att_heads: 1 + normalize_adj: False + msg_direction: both +optim: + optimizer: adam + base_lr: 0.003 + max_epoch: 100 \ No newline at end of file diff --git a/run/configs/ROLAND/roland_gru_redditbody.yaml b/run/configs/ROLAND/roland_gru_redditbody.yaml new file mode 100644 index 00000000..ee2da59a --- /dev/null +++ b/run/configs/ROLAND/roland_gru_redditbody.yaml @@ -0,0 +1,70 @@ +remark: live_update +out_dir: results +device: auto +metric: + mrr_method: max + mrr_num_negative_edges: 1000 +dataset: + format: roland + name: reddit-body.tsv + is_hetero: False + dir: /home/tianyudu/Data/all_datasets + task: link_pred + task_type: classification + transductive: True + split: [0.8, 0.1, 0.1] + augment_feature: [] + augment_feature_dims: [0] + edge_encoder: True + edge_encoder_name: roland_general + edge_dim: 88 + node_encoder: False + link_pred_all_edges: False +transaction: + keep_ratio: linear + snapshot: True + snapshot_freq: W + check_snapshot: False + history: rolling + horizon: 1 + pred_mode: at + loss: supervised + feature_int_dim: 16 + feature_edge_int_num: [] + feature_node_int_num: [] + feature_amount_dim: 16 + feature_time_dim: 16 +train: + batch_size: 32 + eval_period: 20 + ckpt_period: 400 + mode: live_update + internal_validation_tolerance: 5 +meta: + is_meta: True + alpha: 0.5 +model: + type: gnn_recurrent + loss_fun: cross_entropy + edge_decoding: concat +gnn: + 
embed_update_method: gru + layers_pre_mp: 2 + layers_mp: 2 + layers_post_mp: 2 + dim_inner: 64 + mlp_update_layers: 2 + layer_type: residual_edge_conv + skip_connection: affine + stage_type: stack + batchnorm: True + act: prelu + dropout: 0.0 + agg: add + att_heads: 1 + normalize_adj: False + msg_direction: both +optim: + optimizer: adam + base_lr: 0.003 + max_epoch: 100 \ No newline at end of file diff --git a/run/configs/ROLAND/roland_gru_reddittitle.yaml b/run/configs/ROLAND/roland_gru_reddittitle.yaml new file mode 100644 index 00000000..e48519fd --- /dev/null +++ b/run/configs/ROLAND/roland_gru_reddittitle.yaml @@ -0,0 +1,70 @@ +remark: live_update +out_dir: results +device: auto +metric: + mrr_method: max + mrr_num_negative_edges: 1000 +dataset: + format: roland + name: reddit-title.tsv + is_hetero: False + dir: /home/tianyudu/Data/all_datasets + task: link_pred + task_type: classification + transductive: True + split: [0.8, 0.1, 0.1] + augment_feature: [] + augment_feature_dims: [0] + edge_encoder: True + edge_encoder_name: roland_general + edge_dim: 88 + node_encoder: False + link_pred_all_edges: False +transaction: + keep_ratio: linear + snapshot: True + snapshot_freq: W + check_snapshot: False + history: rolling + horizon: 1 + pred_mode: at + loss: supervised + feature_int_dim: 16 + feature_edge_int_num: [] + feature_node_int_num: [] + feature_amount_dim: 16 + feature_time_dim: 16 +train: + batch_size: 32 + eval_period: 20 + ckpt_period: 400 + mode: live_update + internal_validation_tolerance: 5 +meta: + is_meta: True + alpha: 0.1 +model: + type: gnn_recurrent + loss_fun: cross_entropy + edge_decoding: concat +gnn: + embed_update_method: gru + layers_pre_mp: 2 + layers_mp: 6 + layers_post_mp: 2 + dim_inner: 128 + mlp_update_layers: 2 + layer_type: residual_edge_conv + skip_connection: affine + stage_type: stack + batchnorm: True + act: prelu + dropout: 0.0 + agg: add + att_heads: 1 + normalize_adj: False + msg_direction: both +optim: + optimizer: adam + base_lr: 0.003 + max_epoch: 100 \ No newline at end of file diff --git a/run/configs/ROLAND/roland_gru_ucimsg.yaml b/run/configs/ROLAND/roland_gru_ucimsg.yaml new file mode 100644 index 00000000..441cb0c9 --- /dev/null +++ b/run/configs/ROLAND/roland_gru_ucimsg.yaml @@ -0,0 +1,70 @@ +remark: live_update +out_dir: results +device: auto +metric: + mrr_method: max + mrr_num_negative_edges: 1000 +dataset: + format: roland + name: CollegeMsg.txt + is_hetero: False + dir: /home/tianyudu/Data/all_datasets + task: link_pred + task_type: classification + transductive: True + split: [0.8, 0.1, 0.1] + augment_feature: [] + augment_feature_dims: [0] + edge_encoder: True + edge_encoder_name: roland_general + edge_dim: 1 + node_encoder: False + link_pred_all_edges: False +transaction: + keep_ratio: linear + snapshot: True + snapshot_freq: W + check_snapshot: False + history: rolling + horizon: 1 + pred_mode: at + loss: supervised + feature_int_dim: 16 + feature_edge_int_num: [] + feature_node_int_num: [] + feature_amount_dim: 16 + feature_time_dim: 16 +train: + batch_size: 32 + eval_period: 20 + ckpt_period: 400 + mode: live_update + internal_validation_tolerance: 5 +meta: + is_meta: True + alpha: 0.5 +model: + type: gnn_recurrent + loss_fun: cross_entropy + edge_decoding: concat +gnn: + embed_update_method: gru + layers_pre_mp: 2 + layers_mp: 8 + layers_post_mp: 2 + dim_inner: 64 + mlp_update_layers: 2 + layer_type: residual_edge_conv + skip_connection: affine + stage_type: stack + batchnorm: True + act: prelu + dropout: 0.0 + agg: add + 
att_heads: 1 + normalize_adj: False + msg_direction: both +optim: + optimizer: adam + base_lr: 0.01 + max_epoch: 100 \ No newline at end of file diff --git a/run/configs/ROLAND/roland_mlp_bsisvt.yaml b/run/configs/ROLAND/roland_mlp_bsisvt.yaml new file mode 100644 index 00000000..35949287 --- /dev/null +++ b/run/configs/ROLAND/roland_mlp_bsisvt.yaml @@ -0,0 +1,71 @@ +remark: live_update +out_dir: results +device: auto +metric: + mrr_method: max + mrr_num_negative_edges: 1000 +dataset: + format: roland + name: bsi_svt_2008.tsv + is_hetero: False + dir: /home/tianyudu/Data/all_datasets + task: link_pred + task_type: classification + transductive: True + split: [0.8, 0.1, 0.1] + augment_feature: [] + augment_feature_dims: [0] + edge_encoder: True + edge_encoder_name: roland + edge_dim: 2 + node_encoder: True + node_encoder_name: roland + link_pred_all_edges: False +transaction: + keep_ratio: linear + snapshot: True + snapshot_freq: W + check_snapshot: False + history: rolling + horizon: 1 + pred_mode: at + loss: supervised + feature_int_dim: 16 + feature_edge_int_num: [] + feature_node_int_num: [1018, 33, 13, 23, 5] + feature_amount_dim: 16 + feature_time_dim: 16 +train: + batch_size: 32 + eval_period: 20 + ckpt_period: 400 + mode: live_update + internal_validation_tolerance: 5 +meta: + is_meta: True + alpha: 0.4 +model: + type: gnn_recurrent + loss_fun: cross_entropy + edge_decoding: concat +gnn: + embed_update_method: gru + layers_pre_mp: 2 + layers_mp: 4 + layers_post_mp: 2 + dim_inner: 128 + mlp_update_layers: 2 + layer_type: residual_edge_conv + skip_connection: affine + stage_type: stack + batchnorm: True + act: prelu + dropout: 0.0 + agg: add + att_heads: 1 + normalize_adj: False + msg_direction: both +optim: + optimizer: adam + base_lr: 0.003 + max_epoch: 100 \ No newline at end of file From 998dae07d6c9941f114131be02d3e0fac184adec Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 12:56:02 -0700 Subject: [PATCH 47/66] add training script for dynamic prediction tasks (lvie-update) --- run/main_dynamic.py | 71 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 run/main_dynamic.py diff --git a/run/main_dynamic.py b/run/main_dynamic.py new file mode 100644 index 00000000..cb52e1a3 --- /dev/null +++ b/run/main_dynamic.py @@ -0,0 +1,71 @@ +import logging +import os +import random +import warnings +from datetime import datetime +from itertools import product + +import numpy as np +import torch +from graphgym.cmd_args import parse_args +from graphgym.config import (assert_cfg, cfg, dump_cfg, get_parent_dir, + update_out_dir) +from graphgym.contrib.train import * +from graphgym.loader import create_dataset, create_loader +from graphgym.logger import create_logger, setup_printing +from graphgym.model_builder import create_model +from graphgym.optimizer import create_optimizer, create_scheduler +from graphgym.register import train_dict +from graphgym.train import train +from graphgym.utils.agg_runs import agg_runs +from graphgym.utils.comp_budget import params_count +from graphgym.utils.device import auto_select_device + +os.environ['MPLCONFIGDIR'] = "/tmp" + + +if __name__ == '__main__': + # Load cmd line args + args = parse_args() + # Repeat for different random seeds + for i in range(args.repeat): + # Load config file + cfg.merge_from_file(args.cfg_file) + cfg.merge_from_list(args.opts) + assert_cfg(cfg) + # Set Pytorch environment + torch.set_num_threads(cfg.num_threads) + out_dir_parent = cfg.out_dir + cfg.seed = i + 1 + 
random.seed(cfg.seed) + np.random.seed(cfg.seed) + torch.manual_seed(cfg.seed) + update_out_dir(out_dir_parent, args.cfg_file) + dump_cfg(cfg) + setup_printing() + auto_select_device() + + # Set learning environment + datasets = create_dataset() + + cfg.dataset.num_nodes = datasets[0][0].num_nodes + loaders = create_loader(datasets) + meters = create_logger(datasets, loaders) + + model = create_model(datasets) + # breakpoint() + optimizer = create_optimizer(model.parameters()) + scheduler = create_scheduler(optimizer) + # Print model info + logging.info(model) + logging.info(cfg) + cfg.params = params_count(model) + logging.info('Num parameters: {}'.format(cfg.params)) + # Start training + if cfg.train.mode == 'live_update': + train_dict[cfg.train.mode]( + meters, loaders, model, optimizer, scheduler, datasets=datasets) + + # When being launched in batch mode, mark a yaml as done + if args.mark_done: + os.rename(args.cfg_file, '{}_done'.format(args.cfg_file)) From 89c5197fdcd4155c89bff2d132af3658413e6817 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 12:58:58 -0700 Subject: [PATCH 48/66] remove comments --- graphgym/utils/stats.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/graphgym/utils/stats.py b/graphgym/utils/stats.py index f8bb0e29..4b97b3b6 100644 --- a/graphgym/utils/stats.py +++ b/graphgym/utils/stats.py @@ -12,15 +12,3 @@ def node_degree(edge_index, n=None, mode='in'): degree = torch.zeros(n) ones = torch.ones(index.shape[0]) return degree.scatter_add_(0, index, ones) - - - - - - - -# edge_index = torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5]]) - -# print(compute_degree(edge_index, mode='in')) -# print(compute_degree(edge_index, mode='out')) -# print(compute_degree(edge_index, mode='both')) From 93f7ff367aa2d6ba66d4a02ac2f520d2ccf422d2 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 13:22:18 -0700 Subject: [PATCH 49/66] add template for hetero graphs --- .../contrib/loader/roland_template_hetero.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 graphgym/contrib/loader/roland_template_hetero.py diff --git a/graphgym/contrib/loader/roland_template_hetero.py b/graphgym/contrib/loader/roland_template_hetero.py new file mode 100644 index 00000000..1573a9ba --- /dev/null +++ b/graphgym/contrib/loader/roland_template_hetero.py @@ -0,0 +1,95 @@ +""" +A generic loader for the roland project, modify this template to build +loaders for other financial transaction datasets and dynamic graphs. +NOTE: this script is the trimmed version for homogenous graphs only. +Mar. 22, 2021. +# Search for TODO in this file. +""" +import os +from typing import List + +import deepsnap +import graphgym.contrib.loader.dynamic_graph_utils as utils +import torch +from deepsnap.graph import Graph +from graphgym.config import cfg +from graphgym.register import register_loader + + +def load_single_hetero_dataset(dataset_dir: str, type_info_loc: str) -> Graph: + # TODO: Load your data from dataset_dir here. + # Example: + num_nodes = 500 + num_node_feature = 16 + num_edges = 10000 + num_edge_feature = 32 + node_feature = torch.rand((num_nodes, num_node_feature)) + edge_feature = torch.rand((num_edges, num_edge_feature)) + edge_index = torch.randint(0, num_nodes - 1, (2, num_edges)) + # edge time should be unix timestmap integers. 
+ # random generate timestamps from 2021-05-01 to 2021-06-01 + edge_time = torch.randint(1619852450, 1622530850, (num_edges,)).sort()[0] + + graph = Graph( + node_feature=node_feature, + edge_feature=edge_feature, + edge_index=edge_index, + edge_time=edge_time, + directed=True + ) + + # TODO: additional operations required for heterogeneous graphs. + # Assume there are 3 types of edges. + num_edge_types = 3 + edge_type_int = torch.randint(0, num_edge_types - 1, (num_edges,)).float() + # Assume there are 5 types of nodes. + num_node_types = 5 + node_type_int = torch.randint(0, num_node_types - 1, (num_nodes,)).float() + + if type_info_loc == 'append': + graph.edge_feature = torch.cat((graph.edge_feature, edge_type_int), + dim=1) + graph.node_feature = torch.cat((graph.node_feature, node_type_int), + dim=1) + elif type_info_loc == 'graph_attribute': + graph.node_type = node_type_int.reshape(-1, ) + graph.edge_type = edge_type_int.reshape(-1, ) + else: + raise ValueError(f'Unsupported type info loc: {type_info_loc}') + + # add a list of unique types for reference. + graph.list_n_type = node_type_int.unique().long() + graph.list_e_type = edge_type_int.unique().long() + + return graph + + +def load_generic_dataset(format: str, name: str, dataset_dir: str + ) -> List[deepsnap.graph.Graph]: + """Load the dataset as a list of graph snapshots. + + Args: + format (str): format of dataset. + name (str): file name of dataset. + dataset_dir (str): path of dataset, do NOT include the file name, use + the parent directory of dataset file. + + Returns: + List[deepsnap.graph.Graph]: a list of graph snapshots. + """ + # TODO: change the format name. + if format == 'YOUR_HETERO_FORMAT_NAME_HERE': + assert cfg.dataset.is_hetero + dataset_dir = os.path.join(dataset_dir, name) + g_all = load_single_hetero_dataset( + dataset_dir, + type_info_loc=cfg.dataset.type_info_loc) + snapshot_list = utils.make_graph_snapshot( + g_all, + snapshot_freq=cfg.transaction.snapshot_freq, + is_hetero=cfg.dataset.is_hetero) + return snapshot_list + + +# TODO: don't forget to register the loader. +register_loader('YOUR_HETERO_LOADER_NAME_HERE', load_generic_dataset) From ddf58c68f1a73fe4920ffe924b4bf513490a206e Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 13:22:32 -0700 Subject: [PATCH 50/66] rename --- graphgym/contrib/network/gnn_recurrent.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/graphgym/contrib/network/gnn_recurrent.py b/graphgym/contrib/network/gnn_recurrent.py index 1d0c562c..85667d63 100644 --- a/graphgym/contrib/network/gnn_recurrent.py +++ b/graphgym/contrib/network/gnn_recurrent.py @@ -4,21 +4,20 @@ from graphgym.config import cfg from graphgym.contrib.stage import * from graphgym.init import init_weights -from graphgym.models.act import act_dict from graphgym.models.feature_augment import Preprocess from graphgym.models.feature_encoder import (edge_encoder_dict, node_encoder_dict) from graphgym.models.head import head_dict from graphgym.models.layer import (BatchNorm1dEdge, BatchNorm1dNode, GeneralMultiLayer, layer_dict) -from graphgym.models.layer_recurrent import RecurrentGraphLayer +from graphgym.models.layer_recurrent import GeneralRecurrentLayer from graphgym.register import register_network -def GNNLayer(dim_in: int, dim_out: int, has_act: bool=True, layer_id: int=0): +def GNNLayer(dim_in: int, dim_out: int, has_act: bool = True, layer_id: int = 0): # General constructor for GNN layer. 
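+    # GeneralRecurrentLayer reads its hidden state from
+    # batch.node_states[layer_id], so every message-passing layer is
+    # constructed with its own layer_id.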
- return RecurrentGraphLayer(cfg.gnn.layer_type, dim_in, dim_out, - has_act, layer_id=layer_id) + return GeneralRecurrentLayer(cfg.gnn.layer_type, dim_in, dim_out, + has_act, layer_id=layer_id) def GNNPreMP(dim_in, dim_out): From 2ea233c58ca3ffa6688310f6d2e2becbc735889c Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 13:22:45 -0700 Subject: [PATCH 51/66] remove comment --- graphgym/contrib/train/train_live_update.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/graphgym/contrib/train/train_live_update.py b/graphgym/contrib/train/train_live_update.py index 60555125..aa8a91fd 100644 --- a/graphgym/contrib/train/train_live_update.py +++ b/graphgym/contrib/train/train_live_update.py @@ -150,16 +150,6 @@ def train_live_update(loggers, loaders, model, optimizer, scheduler, datasets, if not hasattr(dataset[0], 'keep_ratio'): train_utils.precompute_edge_degree_info(dataset) - # if cfg.dataset.premade_datasets == 'fresh_save_cache': - # if not os.path.exists(f'{cfg.dataset.dir}/cache/'): - # os.mkdir(f'{cfg.dataset.dir}/cache/') - # cache_path = '{}/cache/cached_datasets_{}_{}_{}.pt'.format( - # cfg.dataset.dir, cfg.dataset.format.replace('.tsv', ''), - # cfg.transaction.snapshot_freq, - # datetime.now().strftime('%Y_%m_%d__%H_%M_%S') - # ) - # torch.save(datasets, cache_path) - num_splits = len(loggers) # train/val/test splits. # range for today in (today, tomorrow) task pairs. task_range = range(len(datasets[0]) - cfg.transaction.horizon) From c03bb12fc6049a3ee9f4da73d129042d7b67e9de Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 13:23:01 -0700 Subject: [PATCH 52/66] add example run files for ROLAND --- run/run_roland_single.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 run/run_roland_single.sh diff --git a/run/run_roland_single.sh b/run/run_roland_single.sh new file mode 100644 index 00000000..b10c5533 --- /dev/null +++ b/run/run_roland_single.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_btcalpha.yaml --repeat 1 + +# python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_btcotc.yaml --repeat 1 + +# python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_ucimsg.yaml --repeat 1 + +# python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_reddittitle.yaml --repeat 1 + +# python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_redditbody.yaml --repeat 1 + +python3 main_dynamic.py --cfg configs/ROLAND/roland_mlp_bsisvt.yaml --repeat 1 \ No newline at end of file From 190f49f3ddcc494d777bb8a030fa6682d324c558 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 15:25:49 -0700 Subject: [PATCH 53/66] add readme --- ROLAND_README.md | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 ROLAND_README.md diff --git a/ROLAND_README.md b/ROLAND_README.md new file mode 100644 index 00000000..3d052c99 --- /dev/null +++ b/ROLAND_README.md @@ -0,0 +1,65 @@ +# Use case: ROLAND: Graph Neural Networks for Dynamic Graphs +Code associated with the ROLAND project. + + +## TODO: add figures to illustrate the ROLAND framework. + +## Datasets +Most of datasets are used in our paper can be found at `https://snap.stanford.edu/data/index.html`. 
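+The commands below download them into `./all_datasets/`, unzip the archives,
+and rename the files to the names expected by the dataset loaders: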
+ +```bash +mkdir ./all_datasets/ +cd ./all_datasets +wget 'https://snap.stanford.edu/data/soc-sign-bitcoinotc.csv.gz' +wget 'https://snap.stanford.edu/data/soc-sign-bitcoinalpha.csv.gz' +wget 'https://snap.stanford.edu/data/as-733.tar.gz' +wget 'https://snap.stanford.edu/data/CollegeMsg.txt.gz' +wget 'https://snap.stanford.edu/data/soc-redditHyperlinks-body.tsv' +wget 'https://snap.stanford.edu/data/soc-redditHyperlinks-title.tsv' +wget 'http://snap.stanford.edu/data/web-redditEmbeddings-subreddits.csv' + +# Unzip files +gunzip CollegeMsg.txt.gz +gunzip soc-sign-bitcoinalpha.csv.gz +gunzip soc-sign-bitcoinotc.csv.gz +tar xf ./as-733.tar.gz + +# Rename files. +mv ./soc-sign-bitcoinotc.csv ./bitcoinotc.csv +mv ./soc-sign-bitcoinalpha.csv ./bitcoinalpha.csv + +mv ./soc-redditHyperlinks-body.tsv ./reddit-body.tsv +mv ./soc-redditHyperlinks-title.tsv ./reddit-title.tsv +``` +## Examples of ROLAND Use Cases +See `./run/run_roland_single.sh` for experiments on all datasets. +To run link-prediction task on `CollegeMsg.txt` dataset: +```bash +cd ./run +python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_ucimsg.yaml --repeat 1 +``` +To explore training result: +```bash +cd ./run +tensorboard --logdir=./runs_live_update --port=6006 +``` + +## Examples on Homogenous Graph Snapshots +Prediction for BitCoin transactions. + +```bash +TODO: add yaml file. +``` + +## Examples on Heterogenous Graph Snapshots +TODO. + +## How to Load Your Own Dataset +Please refer to `./graphgym/contrib/loader/roland_template.py` and `./graphgym/contrib/loader/roland_template_hetero.py` for examples of building loaders. + +## Data Structures for Snapshot-Based Dynamic Graphs + + +## Grid Search +`./run/grids/ROLAND/` +`./` From 18bb3004927d7980ffc19bc7b844b3a320796966 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 7 Jun 2021 15:30:42 -0700 Subject: [PATCH 54/66] add --- ROLAND_README.md | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/ROLAND_README.md b/ROLAND_README.md index 3d052c99..7c688ab4 100644 --- a/ROLAND_README.md +++ b/ROLAND_README.md @@ -32,34 +32,33 @@ mv ./soc-redditHyperlinks-body.tsv ./reddit-body.tsv mv ./soc-redditHyperlinks-title.tsv ./reddit-title.tsv ``` ## Examples of ROLAND Use Cases -See `./run/run_roland_single.sh` for experiments on all datasets. +The ROLAND project focuses on link-predictions for homogenous dynamic graphs. To run link-prediction task on `CollegeMsg.txt` dataset: ```bash cd ./run python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_ucimsg.yaml --repeat 1 ``` -To explore training result: +For other datasets: ```bash -cd ./run -tensorboard --logdir=./runs_live_update --port=6006 -``` +python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_btcalpha.yaml --repeat 1 -## Examples on Homogenous Graph Snapshots -Prediction for BitCoin transactions. +python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_btcotc.yaml --repeat 1 +python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_ucimsg.yaml --repeat 1 + +python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_reddittitle.yaml --repeat 1 + +python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_redditbody.yaml --repeat 1 +``` + +To explore training result: ```bash -TODO: add yaml file. +cd ./run +tensorboard --logdir=./runs_live_update --port=6006 ``` ## Examples on Heterogenous Graph Snapshots -TODO. 
+`Under development` ## How to Load Your Own Dataset Please refer to `./graphgym/contrib/loader/roland_template.py` and `./graphgym/contrib/loader/roland_template_hetero.py` for examples of building loaders. - -## Data Structures for Snapshot-Based Dynamic Graphs - - -## Grid Search -`./run/grids/ROLAND/` -`./` From 5a42cb67138b1700e196ca92e28fbdf811beee4d Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Wed, 9 Jun 2021 22:22:09 -0700 Subject: [PATCH 55/66] remove type_info_loc config --- graphgym/contrib/config/roland.py | 5 ----- graphgym/contrib/loader/roland.py | 20 ++++--------------- .../contrib/loader/roland_template_hetero.py | 18 ++++------------- 3 files changed, 8 insertions(+), 35 deletions(-) diff --git a/graphgym/contrib/config/roland.py b/graphgym/contrib/config/roland.py index 99b5ced5..af6da3fc 100644 --- a/graphgym/contrib/config/roland.py +++ b/graphgym/contrib/config/roland.py @@ -93,11 +93,6 @@ def set_cfg_roland(cfg): # Options: {True, False}. cfg.dataset.is_hetero = False - # Where to put type information. - # Options: {'append', 'graph_attribute'}. - # Only effective if cfg.dataset.is_hetero == True. - cfg.dataset.type_info_loc = 'append' - # whether to look for and load cached graph. By default (load_cache=False) # the loader loads the raw tsv file from disk and cfg.dataset.load_cache = False diff --git a/graphgym/contrib/loader/roland.py b/graphgym/contrib/loader/roland.py index ed38f30b..77e09fe1 100644 --- a/graphgym/contrib/loader/roland.py +++ b/graphgym/contrib/loader/roland.py @@ -188,16 +188,13 @@ def construct_additional_features(df: pd.DataFrame) -> pd.DataFrame: return df -def load_bsi_dataset(dataset_dir: str, is_hetero: bool = False, - type_info_loc: str = 'append' - ) -> Graph: +def load_bsi_dataset(dataset_dir: str, is_hetero: bool = False) -> Graph: """ Loads a single graph object from tsv file. Args: dataset_dir: the path of tsv file to be loaded. is_hetero: whether to load heterogeneous graph. - type_info_loc: 'append' or 'graph_attribute'. Returns: graph: a (homogenous) deepsnap graph object. @@ -306,16 +303,8 @@ def load_bsi_dataset(dataset_dir: str, is_hetero: bool = False, df_trans['EdgeType'].values.reshape(-1, 1)) edge_type_int = torch.FloatTensor(edge_type_int) - if type_info_loc == 'append': - graph.edge_feature = torch.cat((graph.edge_feature, edge_type_int), - dim=1) - graph.node_feature = torch.cat((graph.node_feature, node_type_int), - dim=1) - elif type_info_loc == 'graph_attribute': - graph.node_type = node_type_int.reshape(-1, ) - graph.edge_type = edge_type_int.reshape(-1, ) - else: - raise ValueError(f'Unsupported type info loc: {type_info_loc}') + graph.node_type = node_type_int.reshape(-1,) + graph.edge_type = edge_type_int.reshape(-1,) # add a list of unique types for reference. graph.list_n_type = node_type_int.unique().long() @@ -514,8 +503,7 @@ def load_roland_dataset(format: str, name: str, dataset_dir: str elif name in ['bsi_svt_2008.tsv']: # NOTE: only BSI dataset supports hetero graph. 
g_all = load_bsi_dataset(os.path.join(dataset_dir, name), - is_hetero=cfg.dataset.is_hetero, - type_info_loc=cfg.dataset.type_info_loc) + is_hetero=cfg.dataset.is_hetero) elif name in ['bitcoinotc.csv', 'bitcoinalpha.csv']: g_all = load_bitcoin_dataset(os.path.join(dataset_dir, name)) elif name in ['reddit-body.tsv', 'reddit-title.tsv']: diff --git a/graphgym/contrib/loader/roland_template_hetero.py b/graphgym/contrib/loader/roland_template_hetero.py index 1573a9ba..77bc3049 100644 --- a/graphgym/contrib/loader/roland_template_hetero.py +++ b/graphgym/contrib/loader/roland_template_hetero.py @@ -16,7 +16,7 @@ from graphgym.register import register_loader -def load_single_hetero_dataset(dataset_dir: str, type_info_loc: str) -> Graph: +def load_single_hetero_dataset(dataset_dir: str) -> Graph: # TODO: Load your data from dataset_dir here. # Example: num_nodes = 500 @@ -46,16 +46,8 @@ def load_single_hetero_dataset(dataset_dir: str, type_info_loc: str) -> Graph: num_node_types = 5 node_type_int = torch.randint(0, num_node_types - 1, (num_nodes,)).float() - if type_info_loc == 'append': - graph.edge_feature = torch.cat((graph.edge_feature, edge_type_int), - dim=1) - graph.node_feature = torch.cat((graph.node_feature, node_type_int), - dim=1) - elif type_info_loc == 'graph_attribute': - graph.node_type = node_type_int.reshape(-1, ) - graph.edge_type = edge_type_int.reshape(-1, ) - else: - raise ValueError(f'Unsupported type info loc: {type_info_loc}') + graph.node_type = node_type_int.reshape(-1,) + graph.edge_type = edge_type_int.reshape(-1,) # add a list of unique types for reference. graph.list_n_type = node_type_int.unique().long() @@ -81,9 +73,7 @@ def load_generic_dataset(format: str, name: str, dataset_dir: str if format == 'YOUR_HETERO_FORMAT_NAME_HERE': assert cfg.dataset.is_hetero dataset_dir = os.path.join(dataset_dir, name) - g_all = load_single_hetero_dataset( - dataset_dir, - type_info_loc=cfg.dataset.type_info_loc) + g_all = load_single_hetero_dataset(dataset_dir) snapshot_list = utils.make_graph_snapshot( g_all, snapshot_freq=cfg.transaction.snapshot_freq, From c77ec978adbaa276e5622c768f6a41ef58c58ebc Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Thu, 10 Jun 2021 00:20:36 -0700 Subject: [PATCH 56/66] update loader --- graphgym/contrib/loader/roland.py | 6 +++--- graphgym/contrib/loader/roland_template_hetero.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/graphgym/contrib/loader/roland.py b/graphgym/contrib/loader/roland.py index 77e09fe1..ffa8d94d 100644 --- a/graphgym/contrib/loader/roland.py +++ b/graphgym/contrib/loader/roland.py @@ -131,7 +131,7 @@ def file2timestamp(file_name: str) -> int: # Required for heterogeneous graphs only. # Node and edge features used to define node and edge type in hete GNN. NODE_TYPE_DEFN: List[str] = ['Country'] -EDGE_TYPE_DEFN: List[str] = ['# System'] +EDGE_TYPE_DEFN: List[str] = ['# System', 'AmountLevel'] # Required for graphs with node features only. @@ -303,8 +303,8 @@ def load_bsi_dataset(dataset_dir: str, is_hetero: bool = False) -> Graph: df_trans['EdgeType'].values.reshape(-1, 1)) edge_type_int = torch.FloatTensor(edge_type_int) - graph.node_type = node_type_int.reshape(-1,) - graph.edge_type = edge_type_int.reshape(-1,) + graph.node_type = node_type_int.reshape(-1,).long() + graph.edge_type = edge_type_int.reshape(-1,).long() # add a list of unique types for reference. 
graph.list_n_type = node_type_int.unique().long() diff --git a/graphgym/contrib/loader/roland_template_hetero.py b/graphgym/contrib/loader/roland_template_hetero.py index 77bc3049..fe202a33 100644 --- a/graphgym/contrib/loader/roland_template_hetero.py +++ b/graphgym/contrib/loader/roland_template_hetero.py @@ -46,8 +46,8 @@ def load_single_hetero_dataset(dataset_dir: str) -> Graph: num_node_types = 5 node_type_int = torch.randint(0, num_node_types - 1, (num_nodes,)).float() - graph.node_type = node_type_int.reshape(-1,) - graph.edge_type = edge_type_int.reshape(-1,) + graph.node_type = node_type_int.reshape(-1,).long() + graph.edge_type = edge_type_int.reshape(-1,).long() # add a list of unique types for reference. graph.list_n_type = node_type_int.unique().long() From f0579b4cb109114755c0188d15091f2ba3924444 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Thu, 10 Jun 2021 00:21:12 -0700 Subject: [PATCH 57/66] remove comments --- graphgym/contrib/train/train_utils.py | 88 ++------------------------- 1 file changed, 4 insertions(+), 84 deletions(-) diff --git a/graphgym/contrib/train/train_utils.py b/graphgym/contrib/train/train_utils.py index 6caf7ffd..9117ee76 100644 --- a/graphgym/contrib/train/train_utils.py +++ b/graphgym/contrib/train/train_utils.py @@ -189,81 +189,6 @@ def gen_negative_edges(edge_index: torch.LongTensor, return neg_edge_index -# def compute_src_mrr_and_recall(edge_label_index: torch.LongTensor, -# edge_label: torch.LongTensor, -# pred_score: torch.Tensor, -# recall_k_lst: List[int], -# mrr_top_k: Optional[int] = None -# ) -> (float, Dict[int, float]): -# """ -# Computes source-based MRR and recall at K for each source node in -# edge_label_index. - -# Args: -# edge_label_index: combination of positive and negative edges. -# edge_label: label of edges in edge_label_index. -# pred_score: P(E=positive) for each edge in edge_label_index. -# recall_k_lst: to report recall at k for all k in this list. -# mrr_top_k: calculating MRR for each source node using mean(1/rank) for -# k positive edges with the highest pred_score. Set to None to use -# all positive edges. -# """ -# assert edge_label_index.shape[1] == len(edge_label) == len(pred_score) - -# src_lst = torch.unique(edge_label_index[0]) # source nodes to consider. -# # edge_label_index were constructed by adding negative edges to every -# # node in edge_index[0], thus every node in src_lst has at least one -# # positive edge in edge_label_index. -# # I.e., src_lst == torch.unique(edge_label_index[0][edge_label == 1]) - -# node_level_mrr = [] # store MRR for each node. -# node_recall_at = dict((k, []) for k in recall_k_lst) -# for src in tqdm(src_lst, leave=False, desc='Node level MRR/Recall'): -# # get positive/negative edges emitted from src node. -# self_mask = (edge_label_index[0] == src) -# self_label = edge_label[self_mask] -# self_pred_score = pred_score[self_mask] - -# # Alternative implementation. -# best = torch.max(self_pred_score[self_label == 1]) -# rank = torch.sum(self_pred_score[self_label == 0] >= best) + 1 -# # print(pos_edge_rank[0], true, torch.sum(label == 0)) -# mrr = float(1 / rank) -# node_level_mrr.append(mrr) # mrr for this node. - -# for k in recall_k_lst: -# recall = _calculate_recall_at_k(self_pred_score, self_label, k) -# node_recall_at[k].append(recall) - -# # Average over all nodes. 
-# macro_recall = dict((k, np.mean(v)) for (k, v) in node_recall_at.items()) -# macro_mrr = float(np.mean(node_level_mrr)) -# return macro_mrr, macro_recall - - -# def _calculate_recall_at_k(pred_score: torch.Tensor, -# label: torch.Tensor, -# k: int) -> int: -# """Computes whether the score of the most confident positive edge is -# within the highest k scores. I.e., whether the most confident -# positive edge beats at least k most confident negative edges. - -# Args: -# pred_score: a tensor of scores of predictions. -# label: a tensor of labels. -# k: get whether successful recall at k. - -# Returns: -# an indicator whether there is a successful recall at rank k. -# """ -# neg_score = pred_score[label == 0] -# if len(neg_score) == 0: -# return 0 -# best_pos_score = torch.max(pred_score[label == 1]) -# rank = torch.sum(neg_score >= best_pos_score) + 1 -# return int(rank <= k) - - @torch.no_grad() def fast_batch_mrr(edge_label_index: torch.Tensor, edge_label: torch.Tensor, @@ -343,6 +268,7 @@ def fast_batch_mrr(edge_label_index: torch.Tensor, mrr = float(torch.mean(1 / rank_by_user)) return mrr +# TODO: get recall at k back. # @torch.no_grad() # def report_rank_based_eval(eval_batch, model, method: str, @@ -419,8 +345,8 @@ def get_row_MRR(probs, true_classes): @torch.no_grad() -def report_baseline_MRR(eval_batch: deepsnap.graph.Graph, - model: torch.nn.Module) -> float: +def report_MRR_all(eval_batch: deepsnap.graph.Graph, + model: torch.nn.Module) -> float: # Get positive edge indices. edge_index = eval_batch.edge_label_index[:, eval_batch.edge_label == 1] edge_index = edge_index.to('cpu') @@ -472,12 +398,6 @@ def report_baseline_MRR(eval_batch: deepsnap.graph.Graph, true_row = true.take(mask).squeeze() row_MRRs.append(get_row_MRR(pred_row, true_row)) - # for i, pred_row in enumerate(pred_matrix): - # #check if there are any existing edges - # # only evaluate senders with existing edge (of course). - # if np.isin(1, true_matrix[i]): - # row_MRRs.append(get_row_MRR(pred_row, true_matrix[i])) - avg_MRR = torch.tensor(row_MRRs).mean() return float(avg_MRR) @@ -512,7 +432,7 @@ def compute_MRR(eval_batch: deepsnap.graph.Graph, if method == 'all': # NOTE: this method requires iterating over all nodes, which is slow. assert num_neg_per_node == -1 - return report_baseline_MRR(eval_batch, model) + return report_MRR_all(eval_batch, model) else: assert num_neg_per_node > 0 # Sample negative edges for each node. From 6eabfa9a0625ec261ad11a3344d8964b4706afd6 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Thu, 10 Jun 2021 00:21:30 -0700 Subject: [PATCH 58/66] Add example config yaml for hetero GNN --- run/configs/ROLAND/roland_hetero.yaml | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 run/configs/ROLAND/roland_hetero.yaml diff --git a/run/configs/ROLAND/roland_hetero.yaml b/run/configs/ROLAND/roland_hetero.yaml new file mode 100644 index 00000000..23bf9feb --- /dev/null +++ b/run/configs/ROLAND/roland_hetero.yaml @@ -0,0 +1,70 @@ +remark: live_update +out_dir: results +device: auto +metric: + mrr_method: max + mrr_num_negative_edges: 1000 +dataset: + format: roland + name: bsi_svt_2008.tsv + is_hetero: True + dir: /home/tianyudu/Data/all_datasets + task: link_pred # edge, node. 
+ task_type: classification + transductive: True + split: [0.8, 0.1, 0.1] + augment_feature: [] + augment_feature_dims: [0] + edge_encoder: True + edge_encoder_name: roland_general + edge_dim: 1 + node_encoder: False + link_pred_all_edges: False +transaction: + keep_ratio: linear + snapshot: True + snapshot_freq: W + check_snapshot: False + history: rolling + horizon: 1 + pred_mode: at + loss: supervised + feature_int_dim: 16 + feature_edge_int_num: [] + feature_node_int_num: [] + feature_amount_dim: 16 # * + feature_time_dim: 16 # * +train: + batch_size: 32 + eval_period: 20 + ckpt_period: 400 + mode: live_update + internal_validation_tolerance: 5 # * +meta: + is_meta: True + alpha: 0.5 # * +model: + type: gnn_recurrent + loss_fun: cross_entropy + edge_decoding: concat +gnn: + embed_update_method: gru # * + layers_pre_mp: 2 # * + layers_mp: 8 # * + layers_post_mp: 2 # * + dim_inner: 64 # * + mlp_update_layers: 2 # * + layer_type: residual_edge_conv # * + skip_connection: affine # * + stage_type: stack + batchnorm: True # * + act: prelu + dropout: 0.0 + agg: add # * + att_heads: 1 + normalize_adj: False + msg_direction: both # * +optim: + optimizer: adam + base_lr: 0.01 # * + max_epoch: 100 \ No newline at end of file From 2466f5c87c522dc68b7d4e09bf94cf2e3750a166 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 14 Jun 2021 13:52:39 -0700 Subject: [PATCH 59/66] add examples --- run/grids/ROLAND/example_grid.txt | 7 +++++++ run/run_roland_batch.sh | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 run/grids/ROLAND/example_grid.txt create mode 100644 run/run_roland_batch.sh diff --git a/run/grids/ROLAND/example_grid.txt b/run/grids/ROLAND/example_grid.txt new file mode 100644 index 00000000..d2c3291f --- /dev/null +++ b/run/grids/ROLAND/example_grid.txt @@ -0,0 +1,7 @@ +meta.is_meta is_meta [True] +meta.alpha alpha [0.2,0.4,0.6,0.8,1.0] +gnn.skip_connection skip ['affine','identity','none'] +gnn.embed_update_method update ['gru'] +gnn.layers_mp mp [2,3] +gnn.batchnorm bn [True] +optim.base_lr lr [0.003,0.01,0.03] \ No newline at end of file diff --git a/run/run_roland_batch.sh b/run/run_roland_batch.sh new file mode 100644 index 00000000..398fa830 --- /dev/null +++ b/run/run_roland_batch.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +CONFIG=roland_gru_ucimsg +GRID=example_grid +REPEAT=3 +MAX_JOBS=10 +SLEEP=1 + +python configs_gen.py --config configs/ROLAND/${CONFIG}.yaml \ + --grid grids/ROLAND/${GRID}.txt \ + --out_dir configs +# run batch of configs +# Args: config_dir, num of repeats, max jobs running, sleep time +bash parallel.sh configs/${CONFIG}_grid_${GRID} $REPEAT $MAX_JOBS $SLEEP +# rerun missed / stopped experiments +bash parallel.sh configs/${CONFIG}_grid_${GRID} $REPEAT $MAX_JOBS $SLEEP +# rerun missed / stopped experiments +bash parallel.sh configs/${CONFIG}_grid_${GRID} $REPEAT $MAX_JOBS $SLEEP + +# aggregate results for the batch +python agg_batch.py --dir results/${CONFIG}_grid_${GRID} From f89fbbfd278161da974e12f6150401f065673846 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 14 Jun 2021 14:08:46 -0700 Subject: [PATCH 60/66] update markdown --- ROLAND_README.md | 60 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/ROLAND_README.md b/ROLAND_README.md index 7c688ab4..4b7390e5 100644 --- a/ROLAND_README.md +++ b/ROLAND_README.md @@ -1,13 +1,17 @@ -# Use case: ROLAND: Graph Neural Networks for Dynamic Graphs -Code associated with the ROLAND project. 
+# ROLAND: Graph Neural Networks for Dynamic Graphs +This repository contains the code associated with the ROLAND project, built on top of the GraphGym framework. +You can first walk through the *how-to* sections to run experiments on existing +public datasets. +After understanding how to run and analyze experiments, you can read through the *development topics* to run our pipeline on your own datasets. ## TODO: add figures to illustrate the ROLAND framework. -## Datasets +## How to Download Datasets Most of datasets are used in our paper can be found at `https://snap.stanford.edu/data/index.html`. ```bash +# Or use your own dataset directory. mkdir ./all_datasets/ cd ./all_datasets wget 'https://snap.stanford.edu/data/soc-sign-bitcoinotc.csv.gz' @@ -24,16 +28,23 @@ gunzip soc-sign-bitcoinalpha.csv.gz gunzip soc-sign-bitcoinotc.csv.gz tar xf ./as-733.tar.gz -# Rename files. +# Rename files; this step is required by our loader. +# You can leave the web-redditEmbeddings-subreddits.csv file unchanged. mv ./soc-sign-bitcoinotc.csv ./bitcoinotc.csv mv ./soc-sign-bitcoinalpha.csv ./bitcoinalpha.csv mv ./soc-redditHyperlinks-body.tsv ./reddit-body.tsv mv ./soc-redditHyperlinks-title.tsv ./reddit-title.tsv ``` -## Examples of ROLAND Use Cases +You should expect 740 files, including the zipped `as-733.tar.gz`, when checking with `ls | wc -l`. +The total disk space required is approximately 950 MiB. +## How to Run Single Experiments from Our Paper +**WARNING**: for each `yaml` file in `./run/configs/ROLAND`, you need to update the `dataset.dir` field to the correct path of the datasets downloaded above. + The ROLAND project focuses on link-predictions for homogenous dynamic graphs. -To run link-prediction task on `CollegeMsg.txt` dataset: +Here we demonstrate example runs using the datasets downloaded above. + +To run the link-prediction task on the `CollegeMsg.txt` dataset with default settings: ```bash cd ./run python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_ucimsg.yaml --repeat 1 @@ -50,15 +61,46 @@ python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_reddittitle.yaml --repea python3 main_dynamic.py --cfg configs/ROLAND/roland_gru_redditbody.yaml --repeat 1 ``` +The `--repeat` argument controls the number of random seeds used for each experiment. For example, setting `--repeat 3` runs each experiment three times with three different random seeds. To explore training result: ```bash cd ./run tensorboard --logdir=./runs_live_update --port=6006 ``` +**WARNING** The x-axis of plots in tensorboard is **not** epochs, they are snapshot IDs (e.g., the $i^{th}$ day or the $i^{th}$ week) instead. ## Examples on Heterogenous Graph Snapshots -`Under development` +```bash +Under development. +``` + +## How to Run Grid Search / Batch Experiments +To run grid search / batch experiments, one needs a `main.py` file, a `base_config.yaml`, and a `grid.txt` file. The main and config files are the same as in the single experiment setup above. +Suppose one wants to do link prediction on the `CollegeMsg.txt` dataset with configurations from `configs/ROLAND/roland_gru_ucimsg.yaml` and, in addition, wants to try out (1) *different numbers of GNN message passing layers* and (2) *different learning rates*. +In this case, one can use the following grid file: +```text +# grid.txt, lines starting with # are comments. +gnn.layers_mp mp [2,3,4,5] +optim.base_lr lr [0.003,0.01,0.03] +``` +**WARNING**: the format of each line is crucial: `NAME_IN_YAML SHORT_ALIAS LIST_OF_VALUES` (three fields separated by spaces), and there should **not** be any space inside the list of values.
+ +The `grid.txt` above will generate $4\times 3=12$ different configurations by modifying `gnn.layers_mp` and `optim.base_lr` to the respective levels in the base config file `roland_gru_ucimsg.yaml`. + +Please see `./run/grids/ROLAND/example_grid.txt` for a complete example of a grid-search text file. + +To run the experiment using `example_grid.txt`: +```bash +bash ./run_roland_batch.sh +``` +## How to Export Tensorboard Results to CSV +We provide a simple script to aggregate results from a batch of tensorboard files; please feel free to look into `tabulate_events.py` and modify it. +```bash +# Usage: python3 ./tabulate_events.py <log_dir> <output_csv> +python3 ./tabulate_events.py ./live_update ./out.csv +``` -## How to Load Your Own Dataset -Please refer to `./graphgym/contrib/loader/roland_template.py` and `./graphgym/contrib/loader/roland_template_hetero.py` for examples of building loaders. +## Development Topic: Use Your Own Dataset +We provide two templates for constructing your own datasets; please refer to +(1) `./graphgym/contrib/loader/roland_template.py` and (2) `./graphgym/contrib/loader/roland_template_hetero.py` for examples of building loaders. From f97532188c2a1b71f170ac9aa464fbc5c6de7168 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Mon, 14 Jun 2021 14:08:56 -0700 Subject: [PATCH 61/66] add file --- run/tabulate_events.py | 116 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 run/tabulate_events.py diff --git a/run/tabulate_events.py b/run/tabulate_events.py new file mode 100644 index 00000000..42e640cb --- /dev/null +++ b/run/tabulate_events.py @@ -0,0 +1,116 @@ +""" +A simple utility that generates performance reports for different models on +different datasets. + +This script works for the live-update scheme only; use graphgym's native analysis +tools for the rolling/fixed-split scheme. +""" +import os +import sys +from typing import List + +import numpy as np +import pandas as pd +import yaml +from tensorboard.backend.event_processing.event_accumulator import \ + EventAccumulator +from tqdm import tqdm + + +def squeeze_dict(old_dict: dict) -> dict: + """Squeezes nested dictionary keys. + Example: old_dict['key1'] = {'key2': 'hello'}. + will generate new_dict['key1.key2'] = 'hello'. + """ + new_dict = dict() + for k1 in old_dict.keys(): + if isinstance(old_dict[k1], dict): + for k2 in old_dict[k1].keys(): + new_key = k1 + '.' + k2 + new_dict[new_key] = old_dict[k1][k2] + else: + new_dict[k1] = old_dict[k1] + return new_dict + + +def tabulate_events(logdir: str, variables: List[str]) -> pd.DataFrame: + """ + Generates a pandas dataframe which contains experiments (runs) as its rows; + the returned dataframe contains columns: + (1) File name/path of that run. + (2) Fields required in `variables' from corresponding config.yaml. + (3) Test and validation set performance (MRR and Recall at k). + """ + all_runs = list() + count = 0 # count number of experiment runs processed. + + for run_dir in tqdm(os.listdir(logdir)): + if run_dir.startswith('.'): + # Ignore hidden files. + continue + + if not os.path.isdir(os.path.join(logdir, run_dir)): + # Ignore other things such as generated tables. + print(run_dir) + continue + + count += 1 + + config_dir = os.path.join(logdir, run_dir, 'config.yaml') + with open(config_dir) as file: + config = yaml.full_load(file) + config = squeeze_dict(config) + + current_run = {'run': run_dir} + for var in variables: + # record required variables in config.yaml.
+ current_run[var] = config[var] + + # for metric in ['test_mrr', 'test_rck1', 'test_rck3', 'test_rck10', + # 'test_loss', + # 'val_mrr', 'val_rck1', 'val_rck3', 'val_rck10', + # 'val_loss']: + for metric in ['test_mrr']: + event_path = os.path.join(logdir, run_dir, metric) + # print(f'Processing event file {event_path}') + + ea = EventAccumulator(event_path).Reload() + + tag_values = [] + steps = [] + + x = 'test' if metric.startswith('test') else 'val' + for event in ea.Scalars(x): + # Each (value, step) corresponds to a (value, snapshot). + tag_values.append(event.value) + steps.append(event.step) + + current_run['average_' + metric] = np.mean(tag_values) + # current_run: one row in the aggregated dataset. + all_runs.append(current_run) + print(f'exported {count} experiments.') + return pd.DataFrame(all_runs) + + +if __name__ == '__main__': + # 1. directory of baseline experiment set. + # 2. directory of fine-tuning experiment, our model + all datasets. + # 3. directory of output tables and files. + path, out_dir = sys.argv[1], sys.argv[2] + # fields from config.yaml to be included as columns, + # doesn't hurt to add more columns. + variables = ['dataset.format', 'dataset.name', + 'dataset.AS_node_feature', + 'gnn.layer_type', 'gnn.batchnorm', 'gnn.layers_mp', + 'gnn.layers_post_mp', + 'gnn.layers_pre_mp', + 'gnn.skip_connection', 'gnn.embed_update_method', + 'optim.base_lr', + 'transaction.feature_int_dim', + 'gnn.agg', 'train.mode', + 'gnn.msg_direction', + 'train.internal_validation_tolerance', 'gnn.dim_inner', + 'meta.is_meta', 'meta.method', 'meta.alpha', + 'transaction.snapshot_freq', 'gnn.embed_update_method'] + df = tabulate_events(path, variables) + df.to_csv(out_dir) From 0671c0bbd6ad2df3a8aabeef1400ec5764cf65d0 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Tue, 15 Jun 2021 13:34:25 -0700 Subject: [PATCH 62/66] clean up comments --- graphgym/models/update.py | 80 --------------------------------------- 1 file changed, 80 deletions(-) diff --git a/graphgym/models/update.py b/graphgym/models/update.py index df99d3aa..0d3a6520 100644 --- a/graphgym/models/update.py +++ b/graphgym/models/update.py @@ -98,86 +98,6 @@ def forward(self, batch): return batch -# class MaskedGRUUpdater(nn.Module): -# """ -# Node embedding update block using standard GRU. - -# h[l,t] = GRU(h[l,t-1], h[l-1,t]) -# """ -# def __init__(self, dim_in: int, dim_out: int, layer_id: int): -# # dim_in (dim of X): dimension of input node_feature. -# # dim_out (dim of H): dimension of previous and current hidden states. -# # forward(X, H) --> H. -# super(MaskedGRUUpdater, self).__init__() -# self.layer_id = layer_id -# self.GRU_Z = nn.Sequential( -# nn.Linear(dim_in + dim_out, dim_out, bias=True), -# nn.Sigmoid()) -# # reset gate. -# self.GRU_R = nn.Sequential( -# nn.Linear(dim_in + dim_out, dim_out, bias=True), -# nn.Sigmoid()) -# # new embedding gate. -# self.GRU_H_Tilde = nn.Sequential( -# nn.Linear(dim_in + dim_out, dim_out, bias=True), -# nn.Tanh()) - -# def forward(self, batch): -# H_prev = batch.node_states[self.layer_id] -# X = batch.node_feature -# Z = self.GRU_Z(torch.cat([X, H_prev], dim=1)) -# R = self.GRU_R(torch.cat([X, H_prev], dim=1)) -# H_tilde = self.GRU_H_Tilde(torch.cat([X, R * H_prev], dim=1)) -# H_gru = Z * H_prev + (1 - Z) * H_tilde - -# # Update for active nodes only, use output from GRU. -# keep_mask = (batch.node_degree_new == 0) -# H_out = H_gru -# # Reset inactive nodes' embedding. 
-# H_out[keep_mask, :] = H_prev[keep_mask, :] - -# batch.node_states[self.layer_id] = H_out -# return batch - - -# class MovingAverageGRUUpdater(nn.Module): -# """ -# Node embedding update block using standard GRU. - -# h[l,t] = GRU(h[l,t-1], h[l-1,t]) -# """ -# def __init__(self, dim_in: int, dim_out: int, layer_id: int): -# # dim_in (dim of X): dimension of input node_feature. -# # dim_out (dim of H): dimension of previous and current hidden states. -# # forward(X, H) --> H. -# super(GRUUpdater, self).__init__() -# self.layer_id = layer_id -# self.GRU_Z = nn.Sequential( -# nn.Linear(dim_in + dim_out, dim_out, bias=True), -# nn.Sigmoid()) -# # reset gate. -# self.GRU_R = nn.Sequential( -# nn.Linear(dim_in + dim_out, dim_out, bias=True), -# nn.Sigmoid()) -# # new embedding gate. -# self.GRU_H_Tilde = nn.Sequential( -# nn.Linear(dim_in + dim_out, dim_out, bias=True), -# nn.Tanh()) - -# def forward(self, batch): -# H_prev = batch.node_states[self.layer_id] -# X = batch.node_feature -# Z = self.GRU_Z(torch.cat([X, H_prev], dim=1)) -# R = self.GRU_R(torch.cat([X, H_prev], dim=1)) -# H_tilde = self.GRU_H_Tilde(torch.cat([X, R * H_prev], dim=1)) -# H_gru = Z * H_prev + (1 - Z) * H_tilde - -# H_out = H_prev * batch.keep_ratio + H_gru * (1 - batch.keep_ratio) - -# batch.node_states[self.layer_id] = H_out -# return batch - - update_dict = { 'moving_average': MovingAverageUpdater, 'mlp': MLPUpdater, From 71e0086b72b288bf0fdbe21b5f27bed9db644f29 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Tue, 15 Jun 2021 13:34:57 -0700 Subject: [PATCH 63/66] cleanup comments --- graphgym/contrib/train/train_utils.py | 60 --------------------------- 1 file changed, 60 deletions(-) diff --git a/graphgym/contrib/train/train_utils.py b/graphgym/contrib/train/train_utils.py index 9117ee76..5784147e 100644 --- a/graphgym/contrib/train/train_utils.py +++ b/graphgym/contrib/train/train_utils.py @@ -268,66 +268,6 @@ def fast_batch_mrr(edge_label_index: torch.Tensor, mrr = float(torch.mean(1 / rank_by_user)) return mrr -# TODO: get recall at k back. - -# @torch.no_grad() -# def report_rank_based_eval(eval_batch, model, method: str, -# num_neg_per_node: int=1000): -# if num_neg_per_node == -1: -# # Do not report rank-based metrics, used in debug mode. -# return 0, 0, 0, 0 -# # Get positive edge indices. -# edge_index = eval_batch.edge_label_index[:, eval_batch.edge_label == 1] -# edge_index = edge_index.to('cpu') - -# neg_edge_index = gen_negative_edges(edge_index, num_neg_per_node, -# num_nodes=eval_batch.num_nodes) - -# new_edge_label_index = torch.cat((edge_index, neg_edge_index), -# dim=1).long() -# new_edge_label = torch.cat((torch.ones(edge_index.shape[1]), -# torch.zeros(neg_edge_index.shape[1]) -# ), dim=0).long() - -# # Construct evaluation samples. 
-# eval_batch.edge_label_index = new_edge_label_index -# eval_batch.edge_label = new_edge_label - -# eval_batch.to(torch.device(cfg.device)) -# # move state to gpu -# for layer in range(len(eval_batch.node_states)): -# if torch.is_tensor(eval_batch.node_states[layer]): -# eval_batch.node_states[layer] = eval_batch.node_states[layer].to( -# torch.device(cfg.device)) -# pred, true = model(eval_batch) -# loss, pred_score = compute_loss(pred, true) - -# mrr, recall_at = fast_batch_mrr_and_recall(eval_batch.edge_label_index, -# eval_batch.edge_label, -# pred_score, -# num_neg_per_node, -# eval_batch.num_nodes, -# method) - -# # return mrr, 0, 0, 0 -# # -# # mrr_old, recall_at_old = compute_src_mrr_and_recall( -# # eval_batch.edge_label_index, -# # eval_batch.edge_label, -# # pred_score, -# # recall_k_lst=[1, 3, 10], -# # mrr_top_k=1) -# # -# # print(f'Old MRR: {mrr_old: 0.6f}, new MRR {mrr: 0.6f}') -# # print( -# # f'Old Recall@1: {recall_at_old[1]: 0.6f}, new Recall@1 {recall_at[1]: 0.6f}') -# # print( -# # f'Old Recall@3: {recall_at_old[3]: 0.6f}, new Recall@3 {recall_at[3]: 0.6f}') -# # print( -# # f'Old Recall@10: {recall_at_old[10]: 0.6f}, new Recall@10 {recall_at[10]: 0.6f}') - -# return mrr, recall_at[1], recall_at[3], recall_at[10] - def get_row_MRR(probs, true_classes): existing_mask = true_classes == 1 From bd0527a0b7a7ee9b72ff74ad72e075c6ff5be05e Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Tue, 15 Jun 2021 13:35:56 -0700 Subject: [PATCH 64/66] remove loaders --- graphgym/contrib/loader/roland_as.py | 166 --------------------- graphgym/contrib/loader/roland_btc.py | 182 ----------------------- graphgym/contrib/loader/roland_reddit.py | 174 ---------------------- graphgym/contrib/loader/roland_ucimsg.py | 110 -------------- 4 files changed, 632 deletions(-) delete mode 100644 graphgym/contrib/loader/roland_as.py delete mode 100644 graphgym/contrib/loader/roland_btc.py delete mode 100644 graphgym/contrib/loader/roland_reddit.py delete mode 100644 graphgym/contrib/loader/roland_ucimsg.py diff --git a/graphgym/contrib/loader/roland_as.py b/graphgym/contrib/loader/roland_as.py deleted file mode 100644 index bcf3b7a1..00000000 --- a/graphgym/contrib/loader/roland_as.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -Loader for the Autonomous systems AS-733 dataset. -""" -import os -from datetime import datetime -from typing import List - -import numpy as np -import pandas as pd -import torch -from deepsnap.graph import Graph -from graphgym.config import cfg -from graphgym.register import register_loader -from sklearn.preprocessing import OrdinalEncoder -from tqdm import tqdm - - -def make_graph_snapshot(g_all: Graph, snapshot_freq: str) -> List[Graph]: - t = g_all.edge_time.numpy().astype(np.int64) - snapshot_freq = snapshot_freq.upper() - - period_split = pd.DataFrame( - {'Timestamp': t, - 'TransactionTime': pd.to_datetime(t, unit='s')}, - index=range(len(g_all.edge_time))) - - freq_map = {'D': '%j', # day of year. - 'W': '%W', # week of year. - 'M': '%m' # month of year. - } - - period_split['Year'] = period_split['TransactionTime'].dt.strftime( - '%Y').astype(int) - - period_split['SubYearFlag'] = period_split['TransactionTime'].dt.strftime( - freq_map[snapshot_freq]).astype(int) - - period2id = period_split.groupby(['Year', 'SubYearFlag']).indices - - periods = sorted(list(period2id.keys())) - snapshot_list = list() - - for p in periods: - # unique IDs of edges in this period. 
- period_members = period2id[p] - assert np.all(period_members == np.unique(period_members)) - - g_incr = Graph( - node_feature=g_all.node_feature, - edge_feature=g_all.edge_feature[period_members, :], - edge_index=g_all.edge_index[:, period_members], - edge_time=g_all.edge_time[period_members], - directed=g_all.directed - ) - snapshot_list.append(g_incr) - - snapshot_list.sort(key=lambda x: torch.min(x.edge_time)) - - return snapshot_list - - -def file2timestamp(file_name): - t = file_name.strip('.txt').strip('as') - ts = int(datetime.strptime(t, '%Y%m%d').timestamp()) - return ts - - -def load_generic_dataset(format, name, dataset_dir): - if format == 'as': - all_files = [x for x in sorted(os.listdir(dataset_dir)) - if (x.startswith('as') and x.endswith('.txt'))] - assert len(all_files) == 733 - assert all(x.endswith('.txt') for x in all_files) - - edge_index_lst, edge_time_lst = list(), list() - all_files = sorted(all_files) - # if cfg.train.mode in ['baseline', 'baseline_v2', 'live_update_fixed_split']: - # # The baseline setting in EvolveGCN paper only uses 100 snapshots. - # all_files = all_files[:100] - for graph_file in tqdm(all_files): - today = file2timestamp(graph_file) - graph_file = os.path.join(dataset_dir, graph_file) - - src, dst = list(), list() - with open(graph_file, 'r') as f: - for line in f.readlines(): - if line.startswith('#'): - continue - line = line.strip('\n') - v1, v2 = line.split('\t') - src.append(int(v1)) - dst.append(int(v2)) - - edge_index = np.stack((src, dst)) - edge_index_lst.append(edge_index) - - edge_time = np.ones(edge_index.shape[1]) * today - edge_time_lst.append(edge_time) - - edge_index_raw = np.concatenate(edge_index_lst, axis=1).astype(int) - - num_nodes = len(np.unique(edge_index_raw)) - - # encode node indices to consecutive integers. - node_indices = np.sort(np.unique(edge_index_raw)) - enc = OrdinalEncoder(categories=[node_indices, node_indices]) - edge_index = enc.fit_transform(edge_index_raw.transpose()).transpose() - edge_index = torch.Tensor(edge_index).long() - edge_time = torch.Tensor(np.concatenate(edge_time_lst)) - - # Use scaled datetime as edge_feature. - scale = edge_time.max() - edge_time.min() - base = edge_time.min() - scaled_edge_time = 2 * (edge_time.clone() - base) / scale - - assert cfg.dataset.AS_node_feature in ['one', 'one_hot_id', - 'one_hot_degree_global'] - - if cfg.dataset.AS_node_feature == 'one': - node_feature = torch.ones(num_nodes, 1) - elif cfg.dataset.AS_node_feature == 'one_hot_id': - # One hot encoding the node ID. - node_feature = torch.Tensor(np.eye(num_nodes)) - elif cfg.dataset.AS_node_feature == 'one_hot_degree_global': - # undirected graph, use only out degree. - _, node_degree = torch.unique(edge_index[0], sorted=True, - return_counts=True) - node_feature = np.zeros((num_nodes, node_degree.max() + 1)) - node_feature[np.arange(num_nodes), node_degree] = 1 - # 1 ~ 63748 degrees, but only 710 possible levels, exclude all zero - # columns. 
- non_zero_cols = (node_feature.sum(axis=0) > 0) - node_feature = node_feature[:, non_zero_cols] - node_feature = torch.Tensor(node_feature) - else: - raise NotImplementedError - - g_all = Graph( - node_feature=node_feature, - edge_feature=scaled_edge_time.reshape(-1, 1), - edge_index=edge_index, - edge_time=edge_time, - directed=True - ) - - snapshot_list = make_graph_snapshot(g_all, - cfg.transaction.snapshot_freq) - - for g_snapshot in snapshot_list: - g_snapshot.node_states = [0 for _ in range(cfg.gnn.layers_mp)] - g_snapshot.node_cells = [0 for _ in range(cfg.gnn.layers_mp)] - g_snapshot.node_degree_existing = torch.zeros(num_nodes) - - if cfg.dataset.split_method == 'chronological_temporal': - return snapshot_list - else: - # The default split (80-10-10) requires at least 10 edges each - # snapshot. - filtered_graphs = list() - for g in tqdm(snapshot_list): - if g.num_edges >= 10: - filtered_graphs.append(g) - return filtered_graphs - - -register_loader('roland_as', load_generic_dataset) diff --git a/graphgym/contrib/loader/roland_btc.py b/graphgym/contrib/loader/roland_btc.py deleted file mode 100644 index 58a9884d..00000000 --- a/graphgym/contrib/loader/roland_btc.py +++ /dev/null @@ -1,182 +0,0 @@ -""" -Data loader for bitcoin datasets. -Mar. 27, 2021 -""" -import os -from typing import List, Union - -import deepsnap -import graphgym.contrib.loader.dynamic_graph_utils as utils -import numpy as np -import pandas as pd -import torch -from deepsnap.graph import Graph -from graphgym.config import cfg -from graphgym.register import register_loader -from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder - - -def load_single_dataset(dataset_dir: str) -> Graph: - df_trans = pd.read_csv(dataset_dir, sep=',', header=None, index_col=None) - df_trans.columns = ['SOURCE', 'TARGET', 'RATING', 'TIME'] - # NOTE: 'SOURCE' and 'TARGET' are not consecutive. - num_nodes = len( - pd.unique(df_trans[['SOURCE', 'TARGET']].to_numpy().ravel())) - - # bitcoin OTC contains decimal numbers, round them. - df_trans['TIME'] = df_trans['TIME'].astype(np.int).astype(np.float) - assert not np.any(pd.isna(df_trans).values) - - time_scaler = MinMaxScaler((0, 2)) - df_trans['TimestampScaled'] = time_scaler.fit_transform( - df_trans['TIME'].values.reshape(-1, 1)) - - edge_feature = torch.Tensor( - df_trans[['RATING', 'TimestampScaled']].values) # (E, edge_dim) - # SOURCE and TARGET IDs are already encoded in the csv file. - # edge_index = torch.Tensor( - # df_trans[['SOURCE', 'TARGET']].values.transpose()).long() # (2, E) - - node_indices = np.sort( - pd.unique(df_trans[['SOURCE', 'TARGET']].to_numpy().ravel())) - enc = OrdinalEncoder(categories=[node_indices, node_indices]) - raw_edges = df_trans[['SOURCE', 'TARGET']].values - edge_index = enc.fit_transform(raw_edges).transpose() - edge_index = torch.LongTensor(edge_index) - - # num_nodes = torch.max(edge_index) + 1 - # Use dummy node features. - node_feature = torch.ones(num_nodes, 1).float() - - edge_time = torch.FloatTensor(df_trans['TIME'].values) - - # TODO: add option here. 
- # if cfg.train.mode in ['baseline', 'baseline_v2', 'live_update_fixed_split']: - # edge_feature = torch.cat((edge_feature, edge_feature.clone()), dim=0) - # reversed_idx = torch.stack([edge_index[1], edge_index[0]]).clone() - # edge_index = torch.cat((edge_index, reversed_idx), dim=1) - # edge_time = torch.cat((edge_time, edge_time.clone())) - - graph = Graph( - node_feature=node_feature, - edge_feature=edge_feature, - edge_index=edge_index, - edge_time=edge_time, - directed=True - ) - return graph - - -# def make_graph_snapshot(g_all: Graph, snapshot_freq: str) -> List[Graph]: -# t = g_all.edge_time.numpy().astype(np.int64) -# snapshot_freq = snapshot_freq.upper() - -# period_split = pd.DataFrame( -# {'Timestamp': t, -# 'TransactionTime': pd.to_datetime(t, unit='s')}, -# index=range(len(g_all.edge_time))) - -# freq_map = {'D': '%j', # day of year. -# 'W': '%W', # week of year. -# 'M': '%m' # month of year. -# } - -# period_split['Year'] = period_split['TransactionTime'].dt.strftime( -# '%Y').astype(int) - -# period_split['SubYearFlag'] = period_split['TransactionTime'].dt.strftime( -# freq_map[snapshot_freq]).astype(int) - -# period2id = period_split.groupby(['Year', 'SubYearFlag']).indices - -# periods = sorted(list(period2id.keys())) -# snapshot_list = list() - -# for p in periods: -# # unique IDs of edges in this period. -# period_members = period2id[p] -# assert np.all(period_members == np.unique(period_members)) - -# g_incr = Graph( -# node_feature=g_all.node_feature, -# edge_feature=g_all.edge_feature[period_members, :], -# edge_index=g_all.edge_index[:, period_members], -# edge_time=g_all.edge_time[period_members], -# directed=g_all.directed -# ) -# snapshot_list.append(g_incr) - -# snapshot_list.sort(key=lambda x: torch.min(x.edge_time)) - -# return snapshot_list - - -# def split_by_seconds(g_all, freq_sec: int): -# # Split the entire graph into snapshots. -# split_criterion = g_all.edge_time // freq_sec -# groups = torch.sort(torch.unique(split_criterion))[0] -# snapshot_list = list() -# for t in groups: -# period_members = (split_criterion == t) -# g_incr = Graph( -# node_feature=g_all.node_feature, -# edge_feature=g_all.edge_feature[period_members, :], -# edge_index=g_all.edge_index[:, period_members], -# edge_time=g_all.edge_time[period_members], -# directed=g_all.directed -# ) -# snapshot_list.append(g_incr) -# return snapshot_list - -# TODO: merge these two method. -def load_snapshots(dataset_dir: str, - snapshot: bool = True, - snapshot_freq: str = None - ) -> Union[deepsnap.graph.Graph, - List[deepsnap.graph.Graph]]: - g_all = load_single_dataset(dataset_dir) - if not snapshot: - return g_all - - if snapshot_freq.upper() not in ['D', 'W', 'M']: - # format: '1200000s' - # assume split by seconds (timestamp) as in EvolveGCN paper. - freq = int(snapshot_freq.strip('s')) - snapshot_list = utils.make_graph_snapshot_by_seconds(g_all, freq) - else: - snapshot_list = utils.make_graph_snapshot(g_all, snapshot_freq) - num_nodes = g_all.edge_index.max() + 1 - - for g_snapshot in snapshot_list: - g_snapshot.node_states = [0 for _ in range(cfg.gnn.layers_mp)] - g_snapshot.node_cells = [0 for _ in range(cfg.gnn.layers_mp)] - g_snapshot.node_degree_existing = torch.zeros(num_nodes) - - # check snapshots ordering. 
- prev_end = -1 - for g in snapshot_list: - start, end = torch.min(g.edge_time), torch.max(g.edge_time) - assert prev_end < start <= end - prev_end = end - - return snapshot_list - - -def load_btc_dataset(format: str, name: str, dataset_dir: str): - if format == 'bitcoin': - graphs = load_snapshots(os.path.join(dataset_dir, name), - snapshot=cfg.transaction.snapshot, - snapshot_freq=cfg.transaction.snapshot_freq) - if cfg.dataset.split_method == 'chronological_temporal': - return graphs - else: - # The default split (80-10-10) requires at least 10 edges each - # snapshot. - filtered_graphs = list() - for g in graphs: - if g.num_edges >= 10: - filtered_graphs.append(g) - return filtered_graphs - - -register_loader('roland_btc', load_btc_dataset) diff --git a/graphgym/contrib/loader/roland_reddit.py b/graphgym/contrib/loader/roland_reddit.py deleted file mode 100644 index 37d7e66d..00000000 --- a/graphgym/contrib/loader/roland_reddit.py +++ /dev/null @@ -1,174 +0,0 @@ -import os -from typing import List, Union - -import dask.dataframe as dd -import deepsnap -import graphgym.contrib.loader.dynamic_graph_utils as utils -import numpy as np -import pandas as pd -import torch -from dask_ml.preprocessing import OrdinalEncoder -from deepsnap.graph import Graph -from graphgym.config import cfg -from graphgym.register import register_loader -from sklearn.preprocessing import MinMaxScaler - - -def load_single_dataset(dataset_dir: str) -> Graph: - df_trans = dd.read_csv(dataset_dir, sep='\t', low_memory=False) - df_trans = df_trans.compute() - assert not np.any(pd.isna(df_trans).values) - df_trans.reset_index(drop=True, inplace=True) # required for dask. - - # Encode src and dst node IDs. - # get unique values of src and dst. - unique_subreddits = pd.unique( - df_trans[['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']].to_numpy().ravel()) - unique_subreddits = np.sort(unique_subreddits) - cate_type = pd.api.types.CategoricalDtype(categories=unique_subreddits, - ordered=True) - df_trans['SOURCE_SUBREDDIT'] = df_trans['SOURCE_SUBREDDIT'].astype( - cate_type) - df_trans['TARGET_SUBREDDIT'] = df_trans['TARGET_SUBREDDIT'].astype( - cate_type) - enc = OrdinalEncoder(columns=['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']) - df_encoded = enc.fit_transform(df_trans) - df_encoded.reset_index(drop=True, inplace=True) - - # Add node feature from the embedding dataset. - node_embedding_dir = os.path.join(cfg.dataset.dir, - 'web-redditEmbeddings-subreddits.csv') - - # index: subreddit name, values: embedding. - df_node = pd.read_csv(node_embedding_dir, header=None, index_col=0) - - # ordinal encoding follows order in unique_subreddits. - # df_encoded['SOURCE_SUBREDDIT'] contains encoded integral values. - # unique_subreddits[df_encoded['SOURCE_SUBREDDIT']] - # tries to reverse encoded_integer --> original subreddit name. - # check if recovered sub-reddit name matched the raw data. - for col in ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']: - assert all(unique_subreddits[df_encoded[col]] == df_trans[col]) - - num_nodes = len(cate_type.categories) - node_feature = torch.ones(size=(num_nodes, 300)) - # for nodes without precomputed embedding, use the average value. - node_feature = node_feature * np.mean(df_node.values) - - # cate_type.categories[i] is encoded to i, by construction. 
- for i, subreddit in enumerate(cate_type.categories): - if subreddit in df_node.index: - embedding = df_node.loc[subreddit] - node_feature[i, :] = torch.Tensor(embedding.values) - - # Original format: df['TIMESTAMP'][0] = '2013-12-31 16:39:18' - # Convert to unix timestamp (integers). - df_encoded['TIMESTAMP'] = pd.to_datetime(df_encoded['TIMESTAMP'], - format='%Y-%m-%d %H:%M:%S') - df_encoded['TIMESTAMP'] = (df_encoded['TIMESTAMP'] - pd.Timestamp( - '1970-01-01')) // pd.Timedelta('1s') # now integers. - - # Scale edge time. - time_scaler = MinMaxScaler((0, 2)) - df_encoded['TimestampScaled'] = time_scaler.fit_transform( - df_encoded['TIMESTAMP'].values.reshape(-1, 1)) - - # Link sentimental representation (86-dimension). - # comma-separated string: '3.1,5.1,0.0,...' - senti_str_lst = df_encoded['PROPERTIES'].values - edge_senti_embedding = [x.split(',') for x in senti_str_lst] - edge_senti_embedding = np.array(edge_senti_embedding).astype(np.float32) - # (E, 86) - - ef = df_encoded[['TimestampScaled', 'LINK_SENTIMENT']].values - edge_feature = np.concatenate([ef, edge_senti_embedding], axis=1) - edge_feature = torch.Tensor(edge_feature).float() # (E, 88) - - edge_index = torch.Tensor( - df_encoded[['SOURCE_SUBREDDIT', - 'TARGET_SUBREDDIT']].values.transpose()).long() # (2, E) - num_nodes = torch.max(edge_index) + 1 - - edge_time = torch.FloatTensor(df_encoded['TIMESTAMP'].values) - - graph = Graph( - node_feature=node_feature, - edge_feature=edge_feature, - edge_index=edge_index, - edge_time=edge_time, - directed=True - ) - - return graph - - -# def make_graph_snapshot(g_all: Graph, snapshot_freq: str) -> list: -# t = g_all.edge_time.numpy().astype(np.int64) -# snapshot_freq = snapshot_freq.upper() - -# period_split = pd.DataFrame( -# {'Timestamp': t, -# 'TransactionTime': pd.to_datetime(t, unit='s')}, -# index=range(len(g_all.edge_time))) - -# freq_map = {'D': '%j', # day of year. -# 'W': '%W', # week of year. -# 'M': '%m' # month of year. -# } - -# period_split['Year'] = period_split['TransactionTime'].dt.strftime( -# '%Y').astype(int) - -# period_split['SubYearFlag'] = period_split['TransactionTime'].dt.strftime( -# freq_map[snapshot_freq]).astype(int) - -# period2id = period_split.groupby(['Year', 'SubYearFlag']).indices -# # e.g., dictionary w/ key = (2021, 3) and val = array(edges). - -# periods = sorted(list(period2id.keys())) -# snapshot_list = list() -# for p in periods: -# # unique IDs of edges in this period. 
-# period_members = period2id[p] -# assert np.all(period_members == np.unique(period_members)) - -# g_incr = Graph( -# node_feature=g_all.node_feature, -# edge_feature=g_all.edge_feature[period_members, :], -# edge_index=g_all.edge_index[:, period_members], -# edge_time=g_all.edge_time[period_members], -# directed=g_all.directed -# ) -# snapshot_list.append(g_incr) -# return snapshot_list - - -def load_generic(dataset_dir: str, - snapshot: bool = True, - snapshot_freq: str = None - ) -> Union[deepsnap.graph.Graph, - List[deepsnap.graph.Graph]]: - g_all = load_single_dataset(dataset_dir) - if not snapshot: - return g_all - else: - snapshot_list = utils.make_graph_snapshot(g_all, snapshot_freq) - num_nodes = g_all.edge_index.max() + 1 - - for g_snapshot in snapshot_list: - g_snapshot.node_states = [0 for _ in range(cfg.gnn.layers_mp)] - g_snapshot.node_cells = [0 for _ in range(cfg.gnn.layers_mp)] - g_snapshot.node_degree_existing = torch.zeros(num_nodes) - - return snapshot_list - - -def load_generic_dataset(format, name, dataset_dir): - if format == 'reddit_hyperlink': - graphs = load_generic(os.path.join(dataset_dir, name), - snapshot=cfg.transaction.snapshot, - snapshot_freq=cfg.transaction.snapshot_freq) - return graphs - - -register_loader('roland_reddit_hyperlink', load_generic_dataset) diff --git a/graphgym/contrib/loader/roland_ucimsg.py b/graphgym/contrib/loader/roland_ucimsg.py deleted file mode 100644 index 6ac2b9fc..00000000 --- a/graphgym/contrib/loader/roland_ucimsg.py +++ /dev/null @@ -1,110 +0,0 @@ -""" -Loader for the CollegeMsg temporal network. - -For more information: https://snap.stanford.edu/data/CollegeMsg.html - -Mar. 31, 2021 -""" -import os -from typing import List, Union - -import deepsnap -import numpy as np -import pandas as pd -import torch -from deepsnap.graph import Graph -from sklearn.preprocessing import MinMaxScaler - -from graphgym.config import cfg -import graphgym.contrib.loader.dynamic_graph_utils as utils -from graphgym.register import register_loader - - -def load_single_dataset(dataset_dir: str) -> Graph: - df_trans = pd.read_csv(dataset_dir, sep=' ', header=None) - df_trans.columns = ['SRC', 'DST', 'TIMESTAMP'] - assert not np.any(pd.isna(df_trans).values) - df_trans.reset_index(drop=True, inplace=True) - - # Node IDs of this dataset start from 1, re-index to 0-based. 
- df_trans['SRC'] -= 1 - df_trans['DST'] -= 1 - - print('num of edges:', len(df_trans)) - print('num of nodes:', np.max(df_trans[['SRC', 'DST']].values) + 1) - - time_scaler = MinMaxScaler((0, 2)) - df_trans['TimestampScaled'] = time_scaler.fit_transform( - df_trans['TIMESTAMP'].values.reshape(-1, 1)) - - edge_feature = torch.Tensor( - df_trans[['TimestampScaled']].values).view(-1, 1) - edge_index = torch.Tensor( - df_trans[['SRC', 'DST']].values.transpose()).long() # (2, E) - num_nodes = torch.max(edge_index) + 1 - - node_feature = torch.ones(num_nodes, 1) - - edge_time = torch.FloatTensor(df_trans['TIMESTAMP'].values) - - graph = Graph( - node_feature=node_feature, - edge_feature=edge_feature, - edge_index=edge_index, - edge_time=edge_time, - directed=True - ) - - return graph - - -def load_snapshots(dataset_dir: str, - snapshot: bool = True, - snapshot_freq: str = None - ) -> Union[deepsnap.graph.Graph, - List[deepsnap.graph.Graph]]: - g_all = load_single_dataset(dataset_dir) - if not snapshot: - return g_all - if snapshot_freq.upper() not in ['D', 'W', 'M']: - # format: '1200000s' - assert snapshot_freq.endswith('s') - freq = int(snapshot_freq.strip('s')) - snapshot_list = utils.make_graph_snapshot_by_seconds(g_all, freq) - else: - snapshot_list = utils.make_graph_snapshot(g_all, snapshot_freq, - is_hetero=False) - - num_nodes = g_all.edge_index.max() + 1 - - for g_snapshot in snapshot_list: - g_snapshot.node_states = [0 for _ in range(cfg.gnn.layers_mp)] - g_snapshot.node_cells = [0 for _ in range(cfg.gnn.layers_mp)] - g_snapshot.node_degree_existing = torch.zeros(num_nodes) - - return snapshot_list - - -def load_uci_dataset(format, name, dataset_dir): - if format == 'uci_message': - graphs = load_snapshots(os.path.join(dataset_dir, name), - snapshot=cfg.transaction.snapshot, - snapshot_freq=cfg.transaction.snapshot_freq) - if cfg.dataset.split_method == 'chronological_temporal': - # return graphs with enough number of edges. - filtered_graphs = list() - for g in graphs: - if g.num_edges >= 2: - filtered_graphs.append(g) - return filtered_graphs - else: - # The default split (80-10-10) requires at least 10 edges each - # snapshot. - filtered_graphs = list() - for g in graphs: - if g.num_edges >= 10: - filtered_graphs.append(g) - return filtered_graphs - - -register_loader('roland_uci_message', load_uci_dataset) From 5f4f45fa8b6a9ccff56ff251404238ed6ba7e26a Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Tue, 15 Jun 2021 13:41:44 -0700 Subject: [PATCH 65/66] comment out under-development part. --- ROLAND_README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ROLAND_README.md b/ROLAND_README.md index 4b7390e5..933d02d2 100644 --- a/ROLAND_README.md +++ b/ROLAND_README.md @@ -5,7 +5,7 @@ public datasets. After understanding how to run and analyze experiments, you can read through the *development topics* to run our -## TODO: add figures to illustrate the ROLAND framework. + ## How to Download Datasets Most of datasets are used in our paper can be found at `https://snap.stanford.edu/data/index.html`. @@ -70,10 +70,10 @@ tensorboard --logdir=./runs_live_update --port=6006 ``` **WARNING** The x-axis of plots in tensorboard is **not** epochs, they are snapshot IDs (e.g., the $i^{th}$ day or the $i^{th}$ week) instead. -## Examples on Heterogenous Graph Snapshots + ## How to Run Grid Search / Batch Experiments To run grid search / batch experiments, one needs a `main.py` file, a `base_config.yaml`, and a `grid.txt` file. 
The main and config files are the same as in the single experiment setup above. From c6e35a20b3e687a9f7fdbfbb8229966e42123d75 Mon Sep 17 00:00:00 2001 From: Tianyu Du Date: Tue, 15 Jun 2021 14:00:24 -0700 Subject: [PATCH 66/66] update requirements.txt --- requirements.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index f2111231..0c1a0d8c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,11 +4,15 @@ torch torch-scatter torch-geometric deepsnap +dask_ml +dask[complete] ogb numpy -pandas +pandas>=1.0 scipy scikit-learn matplotlib seaborn -notebook \ No newline at end of file +notebook +tensorboard +tqdm \ No newline at end of file