snap-stanford · luciancahil · Mar 30, 2023 · Mar 30, 2023 · Mar 30, 2023 · Mar 30, 2023
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,13 @@
 run/datasets/data/
 **/__pycache__/
 **/.ipynb_checkpoints
-.idea/
+.idea/
+
+results/
+datasets/
+configs/
+
+example_custom.yaml
+run_custom.sh
+Custom Loader.txt
+example_custom.txt
diff --git a/README.md b/README.md
@@ -313,13 +313,33 @@ Within each directory, (at least) an example is provided, showing how to registe
 Note that new user customized modules may result in new configurations; in these cases, new configuration fields
 can be registered at [`graphgym/contrib/config/`](graphgym/contrib/config).
 
-**Note: Applying to your own datasets.**
+### Applying to your own datasets
 A common use case will be applying GraphGym to your favorite datasets.
 To do so, you may follow our example in 
 [`graphgym/contrib/loader/example.py`](graphgym/contrib/loader/example.py).
 GraphGym currently accepts a list of [NetworkX](https://networkx.org/documentation/stable/index.html) graphs 
 or [PyG](https://pytorch-geometric.readthedocs.io/en/latest/) datasets.
 
+Alternatively:
+1. Save your PyTorch Geometric dataset as a .pt file.
+1. Compress the .pt file into a .zip file.
+1. Upload that .zip file somewhere on the internet, such as Google Drive.
+1. Generate a download link for the zip file; that is, a link that starts a download of the zip file rather than showing a preview.
+1. Alter the "name" section in the yaml file to the following structure: "Custom,[NAME_OF_FILE],[DOWNLOAD_URL]". For example, to run the MNISTSuperpixels dataset, set "name" to "Custom,MNISTSuperPixels,https://data.pyg.org/datasets/MNISTSuperpixels.zip"
+1. Add the dataset class file of your dataset to the run folder, and import it into main.py. Update the dataset's processed_dir attribute to "dataset/name/processed".
+1. Alter the process() function to follow the instructions below.
+
+#### Process function requirements
+1. Your dataset's process function must create 3 tuples: one for training data, one for all data, and one for testing data. Each tuple must contain 2 objects. First, a Data() object containing an x tensor, a y tensor, and an edge_index tensor. Second, a dict that serves as the "slices" array for each of the x tensor, y tensor, and edge_index tensor.
+1. If your dataset is a node classification task, then the 2nd object should just be None.
+1. Your data.x field must be a 2D tensor of type float. There must be as many rows as there are nodes in your entire dataset, and as many columns as there are values in each individual node. If each node only has a single value, then it should only have 1 column.
+1. Your data.y field must be a 1D tensor of type int. It represents the desired target. For graph classification, it must have as many elements as graphs you wish to classify. For node classification, it must have as many elements as nodes.
+1. Your data.edge_index field must be a 2D tensor of type int with 2 rows and as many columns as there are connections in your entire dataset (not just one graph). The ith element in the first row tells you where the ith connection originates, and the ith element in the 2nd row tells you where the ith connection terminates.
+1. Your slices dict must have 3 key-value pairs. The keys must be the following string literals: 'x', 'y', 'edge_index'. The values must be 1D tensors, with the first value for all 3 being 0. All 3 tensors must have one more element than the number of graphs you want to classify. The 2nd value should state the index of the first value that is not in the first graph, the 3rd value should be the index of the first value that is not in the 2nd graph, and so on. For example, if slices['x'] = tensor([0, 100, 200]), that means there are 2 graphs to classify. The first graph should take values 0 to 99 inclusive from the data.x tensor, and the 2nd graph should take values 100 to 199.
+1. Once all these tuples are made, save the training, testing, and all-data tuples using torch.save() as 'train_data.pt', 'test_data.pt', and 'all_data.pt' respectively into the processed folder in the newly created subfolder for your dataset, which is located in run/datasets.
+
+
+
 ### Use case: Design Space for Graph Neural Networks (NeurIPS 2020 Spotlight)
 
 Reproducing experiments in *[Design Space for Graph Neural Networks](https://arxiv.org/abs/2011.08843)*, Jiaxuan You, Rex Ying, Jure Leskovec, **NeurIPS 2020 Spotlight**.

diff --git a/graphgym.egg-info/PKG-INFO b/graphgym.egg-info/PKG-INFO
diff --git a/graphgym.egg-info/SOURCES.txt b/graphgym.egg-info/SOURCES.txt
@@ -0,0 +1,87 @@
+LICENSE
+README.md
+setup.py
+graphgym/__init__.py
+graphgym/checkpoint.py
+graphgym/cmd_args.py
+graphgym/config.py
+graphgym/custom_dataset.py
+graphgym/init.py
+graphgym/loader.py
+graphgym/loader_pyg.py
+graphgym/logger.py
+graphgym/loss.py
+graphgym/model_builder.py
+graphgym/model_builder_pyg.py
+graphgym/optimizer.py
+graphgym/register.py
+graphgym/train.py
+graphgym/train_pyg.py
+graphgym.egg-info/PKG-INFO
+graphgym.egg-info/SOURCES.txt
+graphgym.egg-info/dependency_links.txt
+graphgym.egg-info/requires.txt
+graphgym.egg-info/top_level.txt
+graphgym/contrib/__init__.py
+graphgym/contrib/act/__init__.py
+graphgym/contrib/act/example.py
+graphgym/contrib/config/__init__.py
+graphgym/contrib/config/example.py
+graphgym/contrib/feature_augment/__init__.py
+graphgym/contrib/feature_augment/example.py
+graphgym/contrib/feature_encoder/__init__.py
+graphgym/contrib/feature_encoder/example.py
+graphgym/contrib/head/__init__.py
+graphgym/contrib/head/example.py
+graphgym/contrib/layer/__init__.py
+graphgym/contrib/layer/attconv.py
+graphgym/contrib/layer/example.py
+graphgym/contrib/layer/generalconv.py
+graphgym/contrib/layer/generalconv_ogb.py
+graphgym/contrib/layer/generalconv_v2.py
+graphgym/contrib/layer/idconv.py
+graphgym/contrib/layer/sageinitconv.py
+graphgym/contrib/loader/__init__.py
+graphgym/contrib/loader/example.py
+graphgym/contrib/loss/__init__.py
+graphgym/contrib/loss/example.py
+graphgym/contrib/network/__init__.py
+graphgym/contrib/network/example.py
+graphgym/contrib/optimizer/__init__.py
+graphgym/contrib/optimizer/example.py
+graphgym/contrib/pooling/__init__.py
+graphgym/contrib/pooling/example.py
+graphgym/contrib/stage/__init__.py
+graphgym/contrib/stage/example.py
+graphgym/contrib/train/__init__.py
+graphgym/contrib/train/example.py
+graphgym/contrib/transform/__init__.py
+graphgym/contrib/transform/identity.py
+graphgym/models/__init__.py
+graphgym/models/act.py
+graphgym/models/feature_augment.py
+graphgym/models/feature_encoder.py
+graphgym/models/feature_encoder_pyg.py
+graphgym/models/gnn.py
+graphgym/models/gnn_pyg.py
+graphgym/models/head.py
+graphgym/models/head_pyg.py
+graphgym/models/layer.py
+graphgym/models/layer_pyg.py
+graphgym/models/pooling.py
+graphgym/models/transform.py
+graphgym/utils/__init__.py
+graphgym/utils/agg_runs.py
+graphgym/utils/comp_budget.py
+graphgym/utils/device.py
+graphgym/utils/epoch.py
+graphgym/utils/io.py
+graphgym/utils/plot.py
+graphgym/utils/tools.py
+run/CytokinesDataSet.py
+run/Visualization.py
+run/__init__.py
+run/agg_batch.py
+run/configs_gen.py
+run/main.py
+run/main_pyg.py
diff --git a/graphgym.egg-info/dependency_links.txt b/graphgym.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/graphgym.egg-info/requires.txt b/graphgym.egg-info/requires.txt
@@ -0,0 +1,8 @@
+yacs
+tensorboardx
+torch
+torch-geometric
+networkx
+numpy
+deepsnap
+ogb
diff --git a/graphgym.egg-info/top_level.txt b/graphgym.egg-info/top_level.txt
@@ -0,0 +1,2 @@
+graphgym
+run
diff --git a/graphgym/custom_dataset.py b/graphgym/custom_dataset.py
@@ -0,0 +1,58 @@
+import os
+from typing import Callable, List, Optional
+
+import torch
+
+from torch_geometric.data import (
+    Data,
+    InMemoryDataset,
+    download_url,
+    extract_zip,
+)
+
+
+class custom_dataset(InMemoryDataset):
+    """In-memory dataset whose raw file is fetched from a user-supplied URL.
+
+    The raw data is expected to be a zip archive containing ``<name>.pt``.
+    Three processed files are declared ('train_data.pt', 'test_data.pt',
+    'all_data.pt'); the ``train`` argument selects which one is loaded.
+
+    Args:
+        name: Base name of the raw file; ``name + '.pt'`` is the raw file name.
+        url: Download URL of the zip archive holding the raw file.
+        root: Root directory handed to ``InMemoryDataset``.
+        train: ``True`` loads the training split, ``False`` the test split,
+            and ``-1`` (default) the file holding all data.
+        transform: Forwarded unchanged to ``InMemoryDataset``.
+        pre_transform: Forwarded unchanged to ``InMemoryDataset``.
+        pre_filter: Forwarded unchanged to ``InMemoryDataset``.
+
+    Raises:
+        ValueError: If ``train`` is not one of ``True``, ``False`` or ``-1``.
+    """
+    # Placeholder; every instance overwrites it with the URL given to __init__.
+    url = "temp"
+
+    def __init__(
+        self,
+        name,
+        url,
+        root: str,
+        train: int = -1,
+        transform: Optional[Callable] = None,
+        pre_transform: Optional[Callable] = None,
+        pre_filter: Optional[Callable] = None,
+    ):
+        # These must be set before super().__init__, because the
+        # raw_file_names property and download() read them.
+        self.name = name
+        self.url = url
+        self.train = train
+        super().__init__(root, transform, pre_transform, pre_filter)
+
+        # Equality (not identity) is kept on purpose so that 1/0 keep
+        # behaving like True/False, as they did before.
+        if train == True:  # noqa: E712
+            path = self.processed_paths[0]
+        elif train == False:  # noqa: E712
+            path = self.processed_paths[1]
+        elif train == -1:
+            path = self.processed_paths[2]
+        else:
+            # Previously this fell through and failed later with an
+            # UnboundLocalError on `path`.
+            raise ValueError(
+                "train must be True, False or -1, got {!r}".format(train))
+
+        self.path = path
+        self.data, self.slices = torch.load(path)
+
+    @property
+    def raw_file_names(self) -> str:
+        """Name of the single raw file expected in the raw directory."""
+        return self.name + '.pt'
+
+    @property
+    def processed_file_names(self) -> List[str]:
+        """Processed files, in the order train / test / all."""
+        return ['train_data.pt', 'test_data.pt', 'all_data.pt']
+
+    def download(self):
+        """Download the zip from ``self.url``, unpack it, delete the archive."""
+        path = download_url(self.url, self.raw_dir)
+        extract_zip(path, self.raw_dir)
+        os.unlink(path)
+
+    def process(self):
+        """Load the raw object and delegate to its own ``process()`` method.
+
+        NOTE(review): nothing is written to ``processed_paths`` here, so the
+        loaded object's ``process()`` is presumably responsible for saving
+        the three processed files -- confirm against the dataset class.
+        """
+        # SECURITY: torch.load unpickles the downloaded file, which can run
+        # arbitrary code. Only point `url` at archives you trust.
+        inputs = torch.load(self.raw_paths[0])
+        inputs.process()
+        self.data = inputs
+
+
diff --git a/graphgym/loader.py b/graphgym/loader.py
@@ -13,6 +13,8 @@
                                       MNISTSuperpixels, Planetoid, QM7b,
                                       TUDataset)
 
+from .custom_dataset import custom_dataset
+
 import graphgym.models.feature_augment as preprocess
 import graphgym.register as register
 from graphgym.config import cfg
@@ -27,7 +29,12 @@ def load_pyg(name, dataset_dir):
     :param dataset_dir: data directory
     :return: a list of networkx/deepsnap graphs
     '''
-    dataset_dir = '{}/{}'.format(dataset_dir, name)
+    if not str(name[0:6]) == "Custom":
+        dataset_dir = '{}/{}'.format(dataset_dir, name)
+    else:
+        parts = name.split(",")     # in custom sets, the names field must contain names and urls
+        dataset_dir = '{}/{}'.format(dataset_dir, parts[1])
+
     if name in ['Cora', 'CiteSeer', 'PubMed']:
         dataset_raw = Planetoid(dataset_dir, name)
     elif name[:3] == 'TU_':
@@ -67,6 +74,10 @@ def load_pyg(name, dataset_dir):
         dataset_raw = PPI(dataset_dir)
     elif name == 'QM7b':
         dataset_raw = QM7b(dataset_dir)
+    elif name[0:6] == "Custom":
+        parts = name.split(",")     # in custom sets, the names field must contain names and urls
+        dataset_raw = custom_dataset(root=dataset_dir, name=parts[1], url=parts[2])
+        name = parts[1] # give it a new name
     else:
         raise ValueError('{} not support'.format(name))
     graphs = GraphDataset.pyg_to_graphs(dataset_raw)
@@ -251,7 +262,7 @@ def create_dataset():
     else:
         datasets = dataset.split(transductive=cfg.dataset.transductive,
                                  split_ratio=cfg.dataset.split,
-                                 shuffle=cfg.dataset.shuffle_split)
+                                 shuffle=False)
     # We only change the training negative sampling ratio
     for i in range(1, len(datasets)):
         dataset.edge_negative_sampling_ratio = 1

diff --git a/graphgym/models/gnn.py b/graphgym/models/gnn.py
@@ -9,10 +9,12 @@
 from graphgym.models.feature_augment import Preprocess
 from graphgym.models.feature_encoder import (edge_encoder_dict,
                                              node_encoder_dict)
-from graphgym.models.head import head_dict
+from graphgym.models.head import (head_dict, GNNGraphHead)
 from graphgym.models.layer import (BatchNorm1dEdge, BatchNorm1dNode,
-                                   GeneralLayer, GeneralMultiLayer)
+                                   GeneralLayer, GeneralMultiLayer,
+                                   Linear)
 
+import numpy as np
 
 # Layer
 def GNNLayer(dim_in, dim_out, has_act=True):
@@ -176,6 +178,52 @@ def __init__(self, dim_in, dim_out, **kwargs):
         self.post_mp = GNNHead(dim_in=d_in, dim_out=dim_out)
 
         self.apply(init_weights)
+
+    def get_last_hidden_layer_pooled(self, batch):
+        """Run the network up to the last hidden layer and mean-pool per graph.
+
+        Child modules are applied in order. On reaching the ``GNNGraphHead``,
+        only its nested sub-modules that are not ``Linear`` are applied, and
+        iteration stops there.
+
+        Node features are then cut into consecutive chunks whose length is the
+        node count of ``batch.G[0]``, so every graph in the batch is assumed
+        to have that same number of nodes.
+
+        :param batch: input batch, modified by the modules it passes through
+        :return: (list of per-graph mean feature vectors, ``batch.graph_label``)
+        """
+        for stage in self.children():
+            if not isinstance(stage, GNNGraphHead):
+                batch = stage(batch)
+                continue
+            # Inside the head: run everything except the Linear layers.
+            for sub in stage.children():
+                for sub_sub in sub.children():
+                    for leaf in sub_sub.children():
+                        if isinstance(leaf, Linear):
+                            continue
+                        batch = leaf(batch)
+            break
+
+        labels = batch.graph_label
+        nodes_per_graph = batch.G[0].number_of_nodes()
+        features = batch.node_feature.detach()
+        pooled = [
+            torch.mean(features[start:start + nodes_per_graph], dim=0)
+            for start in range(0, len(batch.node_feature), nodes_per_graph)
+        ]
+
+        return pooled, labels
+
+    # Returns a list with one n x n matrix per graph, where n is the number of
+    # nodes: np.corrcoef treats each row (one node's hidden vector) as a variable.
+    def get_correlations(self, batch):
+        """Correlate the hidden vectors of the nodes within each graph.
+
+        Child modules are applied in order until the ``GNNGraphHead`` is
+        reached; the head itself is not run. Node features are then cut into
+        consecutive chunks whose length is the node count of ``batch.G[0]``,
+        so every graph in the batch is assumed to have that same node count.
+
+        :param batch: input batch, modified by the modules it passes through
+        :return: list of numpy correlation matrices, one per graph
+        """
+        for module in self.children():
+            # don't do the final output layer. Keep the last hidden layer
+            if isinstance(module, GNNGraphHead):
+                break
+            batch = module(batch)
+        num_nodes = batch.G[0].number_of_nodes()
+
+        # .cpu() is required before .numpy() when the model runs on a GPU;
+        # it is a no-op for tensors already on the CPU.
+        node_feature = batch.node_feature.detach().cpu()
+
+        correlationMatricies = []
+        for i in range(0, len(node_feature), num_nodes):
+            relevant_vectors = node_feature[i:(i + num_nodes)].numpy()
+            correlationMatricies.append(np.corrcoef(relevant_vectors))
+
+        return correlationMatricies
 
     def forward(self, batch):
         for module in self.children():

diff --git a/graphgym/models/layer.py b/graphgym/models/layer.py
@@ -214,7 +214,7 @@ def __init__(self, dim_in, dim_out, bias=False, **kwargs):
         self.model = GeneralConvLayer(dim_in, dim_out, bias=bias)
 
     def forward(self, batch):
-        batch.node_feature = self.model(batch.node_feature, batch.edge_index)
+        # Not every batch carries edge weights; fall back to None instead of
+        # raising AttributeError so datasets without them still run.
+        # NOTE(review): presumably the conv layer treats edge_weight=None as
+        # "unweighted" -- confirm against GeneralConvLayer.forward.
+        edge_weight = getattr(batch, 'edge_weights', None)
+        batch.node_feature = self.model(batch.node_feature, batch.edge_index,
+                                        edge_weight=edge_weight)
         return batch
 
 

diff --git a/graphgym/train.py b/graphgym/train.py
@@ -3,7 +3,7 @@
 
 import torch
 
-from graphgym.checkpoint import clean_ckpt, load_ckpt, save_ckpt
+from graphgym.checkpoint import clean_ckpt, load_ckpt, save_ckpt, remove_ckpt
 from graphgym.config import cfg
 from graphgym.loss import compute_loss
 from graphgym.utils.epoch import is_ckpt_epoch, is_eval_epoch
@@ -81,4 +81,6 @@ def train(loggers, loaders, model, optimizer, scheduler):
     if cfg.train.ckpt_clean:
         clean_ckpt()
 
+    remove_ckpt()
+
     logging.info('Task done, results saved in {}'.format(cfg.out_dir))
diff --git a/graphgym/utils/agg_runs.py b/graphgym/utils/agg_runs.py
@@ -150,6 +150,9 @@ def agg_runs(dir, metric_best='auto'):
         dir_out = os.path.join(dir, 'agg', key)
         fname = os.path.join(dir_out, 'best.json')
         dict_to_json(value, fname)
+
+    print("End")
+    print(os.path.join(dir, 'agg'))
     logging.info('Results aggregated across runs saved in {}'.format(
         os.path.join(dir, 'agg')))
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,13 +1,18 @@
-yacs
-tensorboardx
-torch
-torch-geometric
-deepsnap
-ogb
-numpy
-pandas
-scipy
-scikit-learn
-matplotlib
-seaborn
-notebook
+deepsnap==0.2.1
+matplotlib==3.7.0
+networkx==2.8.4
+numpy==1.18.5
+numpy==1.23.5
+ogb==1.3.6
+pandas==1.5.3
+PyYAML==6.0
+PyYAML==6.0.1
+scikit_learn==1.2.1
+seaborn==0.12.2
+setuptools==66.0.0
+setuptools==65.6.3
+tensorboardX==2.6.2
+torch==1.12.1
+torch_geometric==2.3.1
+torch_scatter==2.1.1
+yacs==0.1.8