import math
import os
import os.path as osp
import dgl
import numpy as np
import scipy.sparse as sparse
import sklearn.preprocessing as preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
from cogdl.data import download_url
from cogdl.layers.gcc_module import *
from scipy.sparse import linalg
from .. import BaseModel, register_model


def batcher():
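    """Return a collate function that merges a list of (graph_q, graph_k)
    pairs into two batched DGL graphs, one for queries and one for keys;
    used as the ``collate_fn`` of the DataLoader in ``GCC.train``.
    """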
    def batcher_dev(batch):
        graph_q, graph_k = zip(*batch)
        graph_q, graph_k = dgl.batch(graph_q), dgl.batch(graph_k)
        return graph_q, graph_k

    return batcher_dev


def test_moco(train_loader, model, opt):
"""
one epoch training for moco
"""
    model.eval()
    emb_list = []
    for idx, batch in enumerate(train_loader):
        graph_q, graph_k = batch
        bsz = graph_q.batch_size
        graph_q.to(opt.device)
        graph_k.to(opt.device)
        with torch.no_grad():
            feat_q = model(graph_q)
            feat_k = model(graph_k)
        assert feat_q.shape == (bsz, opt.hidden_size)
        emb_list.append(((feat_q + feat_k) / 2).detach().cpu())
    return torch.cat(emb_list)


def eigen_decomposition(n, k, laplacian, hidden_size, retry):
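    """Compute the top-k eigenvectors of ``laplacian`` with ARPACK and
    zero-pad them to ``hidden_size`` columns, retrying with a larger Krylov
    subspace (``ncv``) whenever ARPACK fails to converge.
    """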
    if k <= 0:
        return torch.zeros(n, hidden_size)
    laplacian = laplacian.astype("float64")
    ncv = min(n, max(2 * k + 1, 20))
    # follows https://stackoverflow.com/questions/52386942/scipy-sparse-linalg-eigsh-with-fixed-seed
    v0 = np.random.rand(n).astype("float64")
    for i in range(retry):
        try:
            s, u = linalg.eigsh(laplacian, k=k, which="LA", ncv=ncv, v0=v0)
        except sparse.linalg.eigen.arpack.ArpackError:
            # print("arpack error, retry=", i)
            ncv = min(ncv * 2, n)
            if i + 1 == retry:
                sparse.save_npz("arpack_error_sparse_matrix.npz", laplacian)
                u = torch.zeros(n, k)
        else:
            break
    x = preprocessing.normalize(u, norm="l2")
    x = torch.from_numpy(x.astype("float32"))
    x = F.pad(x, (0, hidden_size - k), "constant", 0)
    return x


def _add_undirected_graph_positional_embedding(g, hidden_size, retry=10):
    """Attach eigenvectors of the normalized graph Laplacian to ``g`` as the
    vertex feature ``pos_undirected``.

    This can be viewed as a generalization of the positional embedding in
    the "Attention Is All You Need" paper: recall that the eigenvectors of
    the normalized Laplacian of a path (line) graph are cos/sin functions.
    See section 2.4 of http://www.cs.yale.edu/homes/spielman/561/2009/lect02-09.pdf
    """
    n = g.number_of_nodes()
    adj = g.adjacency_matrix_scipy(transpose=False, return_edge_ids=False).astype(float)
    norm = sparse.diags(dgl.backend.asnumpy(g.in_degrees()).clip(1) ** -0.5, dtype=float)
    # D^{-1/2} A D^{-1/2}: its top eigenvectors coincide with the bottom
    # eigenvectors of the normalized Laplacian I - D^{-1/2} A D^{-1/2}.
    laplacian = norm * adj * norm
    k = min(n - 2, hidden_size)
    x = eigen_decomposition(n, k, laplacian, hidden_size, retry)
    g.ndata["pos_undirected"] = x.float()
    return g


def _rwr_trace_to_dgl_graph(g, seed, trace, positional_embedding_size, entire_graph=False):
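    """Build a DGL subgraph from random-walk-with-restart traces rooted at
    ``seed``, attach Laplacian positional embeddings, and mark the seed node
    via the binary ``seed`` node feature.
    """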
    subv = torch.unique(torch.cat(trace)).tolist()
    try:
        subv.remove(seed)
    except ValueError:
        pass
    # Put the seed first so it becomes node 0 in the induced subgraph.
    subv = [seed] + subv
    if entire_graph:
        subg = g.subgraph(g.nodes())
    else:
        subg = g.subgraph(subv)

    subg = _add_undirected_graph_positional_embedding(subg, positional_embedding_size)
    subg.ndata["seed"] = torch.zeros(subg.number_of_nodes(), dtype=torch.long)
    if entire_graph:
        subg.ndata["seed"][seed] = 1
    else:
        subg.ndata["seed"][0] = 1
    return subg


class NodeClassificationDataset(object):
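    """Dataset over the nodes of a single graph: each item is a pair of
    correlated subgraphs (graph_q, graph_k) sampled around a node by random
    walk with restart, ready for the GCC encoder.
    """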
    def __init__(
        self,
        data,
        rw_hops=64,
        subgraph_size=64,
        restart_prob=0.8,
        positional_embedding_size=32,
        step_dist=[1.0, 0.0, 0.0],
    ):
        self.rw_hops = rw_hops
        self.subgraph_size = subgraph_size
        self.restart_prob = restart_prob
        self.positional_embedding_size = positional_embedding_size
        self.step_dist = step_dist
        assert positional_embedding_size > 1

        self.data = data
        self.graphs = [self._create_dgl_graph(self.data)]
        self.length = sum([g.number_of_nodes() for g in self.graphs])
        self.total = self.length

    def _create_dgl_graph(self, data):
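        """Convert a cogdl ``data`` object (with ``edge_index``) into a
        read-only undirected DGLGraph by adding each edge in both directions.
        """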
        graph = dgl.DGLGraph()
        src, dst = data.edge_index.tolist()
        num_nodes = int(data.edge_index.max()) + 1
        graph.add_nodes(num_nodes)
        graph.add_edges(src, dst)
        graph.add_edges(dst, src)
        graph.readonly()
        return graph

    def __len__(self):
        return self.length

    def _convert_idx(self, idx):
        graph_idx = 0
        node_idx = idx
        for i in range(len(self.graphs)):
            if node_idx < self.graphs[i].number_of_nodes():
                graph_idx = i
                break
            else:
                node_idx -= self.graphs[i].number_of_nodes()
        return graph_idx, node_idx

    def __getitem__(self, idx):
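        """Sample two correlated subgraphs around node ``idx``: one rooted at
        the node itself and one rooted at the endpoint of a short random walk
        from it (with the default ``step_dist`` both share the same root).
        """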
        graph_idx, node_idx = self._convert_idx(idx)

        step = np.random.choice(len(self.step_dist), 1, p=self.step_dist)[0]
        if step == 0:
            other_node_idx = node_idx
        else:
            other_node_idx = dgl.contrib.sampling.random_walk(
                g=self.graphs[graph_idx], seeds=[node_idx], num_traces=1, num_hops=step
            )[0][0][-1].item()
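        # Walk budget per seed grows with the seed's out-degree (scaled by
        # e / (e - 1) and divided by the restart probability), floored at
        # rw_hops.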
        max_nodes_per_seed = max(
            self.rw_hops,
            int(
                (
                    self.graphs[graph_idx].out_degree(node_idx)
                    * math.e
                    / (math.e - 1)
                    / self.restart_prob
                )
                + 0.5
            ),
        )
        traces = dgl.contrib.sampling.random_walk_with_restart(
            self.graphs[graph_idx],
            seeds=[node_idx, other_node_idx],
            restart_prob=self.restart_prob,
            max_nodes_per_seed=max_nodes_per_seed,
        )
        graph_q = _rwr_trace_to_dgl_graph(
            g=self.graphs[graph_idx],
            seed=node_idx,
            trace=traces[0],
            positional_embedding_size=self.positional_embedding_size,
            entire_graph=hasattr(self, "entire_graph") and self.entire_graph,
        )
        graph_k = _rwr_trace_to_dgl_graph(
            g=self.graphs[graph_idx],
            seed=other_node_idx,
            trace=traces[1],
            positional_embedding_size=self.positional_embedding_size,
            entire_graph=hasattr(self, "entire_graph") and self.entire_graph,
        )
        return graph_q, graph_k


class GraphClassificationDataset(NodeClassificationDataset):
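    """Variant of NodeClassificationDataset for a list of graphs: one sample
    per graph, with walks seeded at the graph's highest-out-degree node and
    the entire graph kept as the subgraph (``entire_graph=True``).
    """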
    def __init__(
        self,
        data,
        rw_hops=64,
        subgraph_size=64,
        restart_prob=0.8,
        positional_embedding_size=32,
        step_dist=[1.0, 0.0, 0.0],
    ):
        self.rw_hops = rw_hops
        self.subgraph_size = subgraph_size
        self.restart_prob = restart_prob
        self.positional_embedding_size = positional_embedding_size
        self.step_dist = step_dist
        self.entire_graph = True
        assert positional_embedding_size > 1

        self.graphs = data
        self.length = len(self.graphs)
        self.total = self.length

    def _convert_idx(self, idx):
        # Each index is a whole graph; seed the walks at its highest
        # out-degree node.
        graph_idx = idx
        node_idx = self.graphs[idx].out_degrees().argmax().item()
        return graph_idx, node_idx


@register_model("gcc")
class GCC(BaseModel):
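    """Pretrained GCC (Graph Contrastive Coding) encoder wrapper: loads a
    MoCo-pretrained graph encoder from ``load_path`` and uses it to produce
    subgraph embeddings for downstream node or graph classification.
    """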
    @staticmethod
    def add_args(parser):
"""Add model-specific arguments to the parser."""
# fmt: off
parser.add_argument("--load-path", type=str, default='./saved/gcc_pretrained.pth')
parser.add_argument("--hidden-size", type=int, default=64)
parser.add_argument("--epoch", type=int, default=0)
# fmt: on

    @classmethod
    def build_model_from_args(cls, args):
        return cls(args.load_path)

    def __init__(self, load_path):
        super(GCC, self).__init__()
        self.load_path = load_path

    def train(self, data):
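        """Load the pretrained GCC checkpoint (downloading it if missing),
        encode ``data`` with the frozen encoder, and return the embeddings
        as a numpy array. No fine-tuning is performed.
        """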
        if not os.path.isfile(self.load_path):
            print("=> no checkpoint found at '{}'".format(self.load_path))
            url = "https://github.com/cenyk1230/gcc-data/raw/master/saved/gcc_pretrained.pth"
            path = osp.dirname(self.load_path)
            name = osp.basename(self.load_path)
            download_url(url, path, name=name)
        print("=> loading checkpoint '{}'".format(self.load_path))
        checkpoint = torch.load(self.load_path, map_location="cpu")
        print(
            "=> loaded successfully '{}' (epoch {})".format(
                self.load_path, checkpoint["epoch"]
            )
        )
        args = checkpoint["opt"]
        args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if isinstance(data, list):
            train_dataset = GraphClassificationDataset(
                data=data,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
        else:
            train_dataset = NodeClassificationDataset(
                data=data,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
        args.batch_size = len(train_dataset)
        train_loader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=args.batch_size,
            collate_fn=batcher(),
            shuffle=False,
            num_workers=args.num_workers,
        )
        # create the encoder and load the pretrained weights (no optimizer
        # is needed; the model is only used for inference)
        model = GraphEncoder(
            positional_embedding_size=args.positional_embedding_size,
            max_node_freq=args.max_node_freq,
            max_edge_freq=args.max_edge_freq,
            max_degree=args.max_degree,
            freq_embedding_size=args.freq_embedding_size,
            degree_embedding_size=args.degree_embedding_size,
            output_dim=args.hidden_size,
            node_hidden_dim=args.hidden_size,
            edge_hidden_dim=args.hidden_size,
            num_layers=args.num_layer,
            num_step_set2set=args.set2set_iter,
            num_layer_set2set=args.set2set_lstm_layer,
            gnn_model=args.model,
            norm=args.norm,
            degree_input=True,
        )
        model = model.to(args.device)
        model.load_state_dict(checkpoint["model"])
        del checkpoint

        emb = test_moco(train_loader, model, args)
        return emb.numpy()
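

# Minimal usage sketch (hypothetical variable names; assumes a cogdl-style
# ``data`` object with an ``edge_index`` tensor, or a list of DGL graphs):
#
#     model = GCC(load_path="./saved/gcc_pretrained.pth")
#     emb = model.train(data)  # -> numpy array of shape (num_items, hidden_size)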