Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better mask_test_edges function #55

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 54 additions & 97 deletions gae/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,111 +29,68 @@ def construct_feed_dict(adj_normalized, adj, features, placeholders):
return feed_dict


def mask_test_edges(adj, test_percent=30., val_percent=20.):
def mask_test_edges(adj, val_perc = 20., test_perc = 10.):
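# Function to build validation and test sets of held-out positive links, each paired with an equal number of sampled negative links (set sizes are derived from the function's parameters)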
# NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper.

# Remove diagonal elements
adj = adj - sp.dia_matrix((adj.diagonal()[None, :], [0]), shape=adj.shape)
adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
adj.eliminate_zeros()
# Check that diag is zero:
assert adj.diagonal().sum() == 0

edges_positive, _, _ = sparse_to_tuple(adj)
edges_positive = edges_positive[edges_positive[:,1] > edges_positive[:,0],:] # filtering out edges from lower triangle of adjacency matrix
val_edges, val_edges_false, test_edges, test_edges_false = None, None, None, None

# number of positive (and negative) edges in test and val sets:
num_test = int(np.floor(edges_positive.shape[0] / (100. / test_percent)))
num_val = int(np.floor(edges_positive.shape[0] / (100. / val_percent)))

# sample positive edges for test and val sets:
edges_positive_idx = np.arange(edges_positive.shape[0])
np.random.shuffle(edges_positive_idx)
val_edge_idx = edges_positive_idx[:num_val]
test_edge_idx = edges_positive_idx[num_val:(num_val + num_test)]
test_edges = edges_positive[test_edge_idx] # positive test edges
val_edges = edges_positive[val_edge_idx] # positive val edges
train_edges = np.delete(edges_positive, np.hstack([test_edge_idx, val_edge_idx]), axis=0) # positive train edges

# the above strategy for sampling without replacement will not work for sampling negative edges on large graphs, because the pool of negative edges is much much larger due to sparsity
# therefore we'll use the following strategy:
# 1. sample random linear indices from adjacency matrix WITH REPLACEMENT (without replacement is super slow). sample more than we need so we'll probably have enough after all the filtering steps.
# 2. remove any edges that have already been added to the other edge lists
# 3. convert to (i,j) coordinates
# 4. swap i and j where i > j, to ensure they're upper triangle elements
# 5. remove any duplicate elements if there are any
# 6. remove any diagonal elements
# 7. if we don't have enough edges, repeat this process until we get enough

positive_idx, _, _ = sparse_to_tuple(adj) # [i,j] coord pairs for all true edges
positive_idx = positive_idx[:,0]*adj.shape[0] + positive_idx[:,1] # linear indices

test_edges_false = np.empty((0,2),dtype='int64')
idx_test_edges_false = np.empty((0,),dtype='int64')
while len(test_edges_false) < len(test_edges):
# step 1:
idx = np.random.choice(adj.shape[0]**2, 2*(num_test-len(test_edges_false)), replace=True)
# step 2:
idx = idx[~np.in1d(idx,positive_idx,assume_unique=True)]
idx = idx[~np.in1d(idx,idx_test_edges_false,assume_unique=True)]
# step 3:
rowidx = idx // adj.shape[0]
colidx = idx % adj.shape[0]
coords = np.vstack((rowidx,colidx)).transpose()
# step 4:
lowertrimask = coords[:,0] > coords[:,1]
coords[lowertrimask] = coords[lowertrimask][:,::-1]
# step 5:
coords = np.unique(coords,axis=0) # note: coords are now sorted lexicographically
np.random.shuffle(coords) # not any more
# step 6:
coords = coords[coords[:,0]!=coords[:,1]]
# step 7:
coords = coords[:min(num_test,len(idx))]
test_edges_false = np.append(test_edges_false,coords,axis=0)
idx = idx[:min(num_test,len(idx))]
idx_test_edges_false = np.append(idx_test_edges_false, idx)


val_edges_false = np.empty((0,2),dtype='int64')
idx_val_edges_false = np.empty((0,),dtype='int64')
while len(val_edges_false) < len(val_edges):
# step 1:
idx = np.random.choice(adj.shape[0]**2, 2*(num_val-len(val_edges_false)), replace=True)
# step 2:
idx = idx[~np.in1d(idx,positive_idx,assume_unique=True)]
idx = idx[~np.in1d(idx,idx_test_edges_false,assume_unique=True)]
idx = idx[~np.in1d(idx,idx_val_edges_false,assume_unique=True)]
# step 3:
rowidx = idx // adj.shape[0]
colidx = idx % adj.shape[0]
coords = np.vstack((rowidx,colidx)).transpose()
# step 4:
lowertrimask = coords[:,0] > coords[:,1]
coords[lowertrimask] = coords[lowertrimask][:,::-1]
# step 5:
coords = np.unique(coords,axis=0) # note: coords are now sorted lexicographically
np.random.shuffle(coords) # not any more
# step 6:
coords = coords[coords[:,0]!=coords[:,1]]
# step 7:
coords = coords[:min(num_val,len(idx))]
val_edges_false = np.append(val_edges_false,coords,axis=0)
idx = idx[:min(num_val,len(idx))]
idx_val_edges_false = np.append(idx_val_edges_false, idx)

# sanity checks:
train_edges_linear = train_edges[:,0]*adj.shape[0] + train_edges[:,1]
test_edges_linear = test_edges[:,0]*adj.shape[0] + test_edges[:,1]
assert not np.any(np.in1d(idx_test_edges_false, positive_idx))
assert not np.any(np.in1d(idx_val_edges_false, positive_idx))
assert not np.any(np.in1d(val_edges[:,0]*adj.shape[0]+val_edges[:,1], train_edges_linear))
assert not np.any(np.in1d(test_edges_linear, train_edges_linear))
assert not np.any(np.in1d(val_edges[:,0]*adj.shape[0]+val_edges[:,1], test_edges_linear))
assert np.diag(adj.todense()).sum() == 0

adj_triu = sp.triu(adj)
adj_tuple = sparse_to_tuple(adj_triu)
edges = adj_tuple[0]
edges_all = sparse_to_tuple(adj)[0]
num_test = int(np.floor(edges.shape[0] / test_perc))
num_val = int(np.floor(edges.shape[0] / val_perc))

all_edge_idx = list(range(edges.shape[0]))
np.random.shuffle(all_edge_idx)
val_edge_idx = all_edge_idx[:num_val]
test_edge_idx = all_edge_idx[num_val:(num_val + num_test)]
test_edges = edges[test_edge_idx]
val_edges = edges[val_edge_idx]
train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0)

# To generate the negative samples we use the following procedure:
# 1. Construct a full dense matrix
# 2. Remove the already existing edges of the graph leaving only the negative edges
# 3. Shuffle the indexes of the remaining negative edges
# 4. Use the proper amount of edges for validation and testing.

# Step 1.
full_dense = np.ones(adj.shape)
S_full = sp.csr_matrix(full_dense)
# Step 2.
S_negative = S_full - adj
S_negative_triu = sp.triu(S_negative)
idx_false = sparse_to_tuple(S_negative_triu)[0]

# Step 3.
false_edges_idx = list(range(idx_false.shape[0]))
np.random.shuffle(false_edges_idx)

# Step 4.
val_edges_false_idx = false_edges_idx[:num_val]
test_edges_false_idx = false_edges_idx[num_val:(num_val + num_test)]
val_edges_false = idx_false[val_edges_false_idx]
test_edges_false = idx_false[test_edges_false_idx]

def ismember(a, b, tol=5):
    """Return whether at least one row of `a` also occurs as a row of `b`.

    Two rows match when every element-wise difference between them,
    rounded to `tol` decimal places, equals zero.

    Args:
        a: Array-like of candidate rows (assumed 2-D, e.g. (i, j) edge
            pairs -- confirm against callers).
        b: Array-like of reference rows with the same row length as `a`.
        tol: Number of decimal places passed to `np.round` before the
            differences are compared with zero.

    Returns:
        The result of `np.any` (a NumPy boolean): true when any row of `a`
        matches any row of `b`. A NumPy boolean is returned on purpose so
        that `~ismember(...)` acts as a logical negation.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Insert an axis into `b` so the subtraction broadcasts every row of
    # `a` against every row of `b`; for 2-D inputs the intermediate has
    # shape (len(b), len(a), row_length).
    pairwise_diff = a - b[:, None]
    # A pair of rows matches only if all of its components round to zero.
    rows_close = np.all(np.round(pairwise_diff, tol) == 0, axis=-1)
    return np.any(rows_close)

assert ~ismember(test_edges_false, edges_all)
assert ~ismember(val_edges_false, edges_all)
assert ~ismember(val_edges, train_edges)
assert ~ismember(test_edges, train_edges)
assert ~ismember(val_edges, test_edges)

# Edge weights for the rebuilt training graph: one entry of 1 per training edge
data = np.ones(train_edges.shape[0])

# Re-build adj matrix
adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
adj_train = adj_train + adj_train.T

Expand Down