Source code for models.nn.dgl_gcc

import math
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import scipy.sparse as sparse
from scipy.sparse import linalg
import sklearn.preprocessing as preprocessing

import dgl
from cogdl.layers.gcc_module import *

from .. import BaseModel, register_model


def batcher():
    def batcher_dev(batch):
        graph_q, graph_k = zip(*batch)
        graph_q, graph_k = dgl.batch(graph_q), dgl.batch(graph_k)
        return graph_q, graph_k

    return batcher_dev

def test_moco(train_loader, model, opt):
    """Compute MoCo embeddings for every batch (inference only; despite the
    name, no parameters are updated)."""
    model.eval()
    emb_list = []
    for idx, batch in enumerate(train_loader):
        graph_q, graph_k = batch
        bsz = graph_q.batch_size
        graph_q.to(opt.device)
        graph_k.to(opt.device)
        with torch.no_grad():
            feat_q = model(graph_q)
            feat_k = model(graph_k)
        assert feat_q.shape == (bsz, opt.hidden_size)
        # Average the query and key embeddings into a single representation.
        emb_list.append(((feat_q + feat_k) / 2).detach().cpu())
    return torch.cat(emb_list)

def eigen_decomposision(n, k, laplacian, hidden_size, retry):
    if k <= 0:
        return torch.zeros(n, hidden_size)
    laplacian = laplacian.astype("float64")
    ncv = min(n, max(2 * k + 1, 20))
    # follows https://stackoverflow.com/questions/52386942/scipy-sparse-linalg-eigsh-with-fixed-seed
    v0 = np.random.rand(n).astype("float64")
    for i in range(retry):
        try:
            s, u = linalg.eigsh(laplacian, k=k, which="LA", ncv=ncv, v0=v0)
        except sparse.linalg.eigen.arpack.ArpackError:
            # print("arpack error, retry=", i)
            # ARPACK failed to converge: enlarge the Krylov subspace and retry.
            ncv = min(ncv * 2, n)
            if i + 1 == retry:
                # Out of retries: dump the matrix for debugging and fall back
                # to an all-zero embedding.
                sparse.save_npz("arpack_error_sparse_matrix.npz", laplacian)
                u = torch.zeros(n, k)
        else:
            break
    x = preprocessing.normalize(u, norm="l2")
    x = torch.from_numpy(x.astype("float32"))
    # Zero-pad the k eigenvector columns up to hidden_size.
    x = F.pad(x, (0, hidden_size - k), "constant", 0)
    return x

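# Example (a sketch, not part of the original module; numbers are
# illustrative): the normalized adjacency of a 4-node cycle has eigenvalues
# {1, 0, 0, -1}, and asking for the k=2 largest-eigenvalue eigenvectors,
# padded to hidden_size=8, yields a (4, 8) feature matrix. Uses the module's
# existing `scipy.sparse as sparse` import.
#
#   A = sparse.csr_matrix(
#       [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0]], dtype="float64"
#   )
#   D_inv_sqrt = sparse.diags([0.5, 0.5, 0.5, 0.5])  # every node has degree 2
#   x = eigen_decomposision(4, 2, D_inv_sqrt * A * D_inv_sqrt, hidden_size=8, retry=3)
#   print(x.shape)  # torch.Size([4, 8])
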
def _add_undirected_graph_positional_embedding(g, hidden_size, retry=10):
    # We use eigenvectors of the normalized graph Laplacian as vertex features.
    # This can be viewed as a generalization of the positional embedding from
    # the "Attention Is All You Need" paper: recall that the eigenvectors of
    # the normalized Laplacian of a path graph are cos/sin functions.
    # See section 2.4 of http://www.cs.yale.edu/homes/spielman/561/2009/lect02-09.pdf
    n = g.number_of_nodes()
    adj = g.adjacency_matrix_scipy(transpose=False, return_edge_ids=False).astype(float)
    norm = sparse.diags(
        dgl.backend.asnumpy(g.in_degrees()).clip(1) ** -0.5, dtype=float
    )
    laplacian = norm * adj * norm
    k = min(n - 2, hidden_size)
    x = eigen_decomposision(n, k, laplacian, hidden_size, retry)
    g.ndata["pos_undirected"] = x.float()
    return g

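# Illustration (a sketch, not part of the original module): on a 16-node
# undirected path graph, the leading nontrivial eigenvector produced above
# varies smoothly like a cosine over node positions, which is what makes
# these eigenvectors a graph analogue of sinusoidal positional embeddings.
#
#   g = dgl.DGLGraph()
#   g.add_nodes(16)
#   src, dst = list(range(15)), list(range(1, 16))
#   g.add_edges(src + dst, dst + src)  # add both directions
#   g.readonly()
#   g = _add_undirected_graph_positional_embedding(g, hidden_size=8)
#   print(g.ndata["pos_undirected"].shape)  # torch.Size([16, 8])
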
def _rwr_trace_to_dgl_graph(
    g, seed, trace, positional_embedding_size, entire_graph=False
):
    subv = torch.unique(torch.cat(trace)).tolist()
    try:
        subv.remove(seed)
    except ValueError:
        pass
    subv = [seed] + subv
    if entire_graph:
        subg = g.subgraph(g.nodes())
    else:
        subg = g.subgraph(subv)

    subg = _add_undirected_graph_positional_embedding(subg, positional_embedding_size)
    subg.ndata["seed"] = torch.zeros(subg.number_of_nodes(), dtype=torch.long)
    if entire_graph:
        subg.ndata["seed"][seed] = 1
    else:
        subg.ndata["seed"][0] = 1
    return subg

class NodeClassificationDataset(object):
    def __init__(
        self,
        data,
        rw_hops=64,
        subgraph_size=64,
        restart_prob=0.8,
        positional_embedding_size=32,
        step_dist=[1.0, 0.0, 0.0],
    ):
        self.rw_hops = rw_hops
        self.subgraph_size = subgraph_size
        self.restart_prob = restart_prob
        self.positional_embedding_size = positional_embedding_size
        self.step_dist = step_dist
        assert positional_embedding_size > 1

        self.data = data
        self.graphs = [self._create_dgl_graph(self.data)]
        self.length = sum([g.number_of_nodes() for g in self.graphs])
        self.total = self.length
    def _create_dgl_graph(self, data):
        graph = dgl.DGLGraph()
        src, dst = data.edge_index.tolist()
        num_nodes = data.edge_index.max() + 1
        graph.add_nodes(num_nodes)
        graph.add_edges(src, dst)
        graph.add_edges(dst, src)
        graph.readonly()
        return graph
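    # Example (sketch, hypothetical data): for data.edge_index =
    # torch.tensor([[0, 1], [1, 2]]) this builds a readonly 3-node graph with
    # edges 0->1, 1->2 plus the reverse edges 1->0, 2->1.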
    def __len__(self):
        return self.length
    def _convert_idx(self, idx):
        graph_idx = 0
        node_idx = idx
        for i in range(len(self.graphs)):
            if node_idx < self.graphs[i].number_of_nodes():
                graph_idx = i
                break
            else:
                node_idx -= self.graphs[i].number_of_nodes()
        return graph_idx, node_idx
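    # Worked example (illustrative; this dataset holds a single graph, but the
    # loop is generic): if self.graphs held two graphs of 5 and 7 nodes,
    # idx=9 falls past the first graph (9 >= 5), so _convert_idx returns
    # graph_idx=1, node_idx=9-5=4.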
    def __getitem__(self, idx):
        graph_idx, node_idx = self._convert_idx(idx)

        step = np.random.choice(len(self.step_dist), 1, p=self.step_dist)[0]
        if step == 0:
            other_node_idx = node_idx
        else:
            other_node_idx = dgl.contrib.sampling.random_walk(
                g=self.graphs[graph_idx], seeds=[node_idx], num_traces=1, num_hops=step
            )[0][0][-1].item()

        max_nodes_per_seed = max(
            self.rw_hops,
            int(
                (
                    self.graphs[graph_idx].out_degree(node_idx)
                    * math.e
                    / (math.e - 1)
                    / self.restart_prob
                )
                + 0.5
            ),
        )
        traces = dgl.contrib.sampling.random_walk_with_restart(
            self.graphs[graph_idx],
            seeds=[node_idx, other_node_idx],
            restart_prob=self.restart_prob,
            max_nodes_per_seed=max_nodes_per_seed,
        )

        graph_q = _rwr_trace_to_dgl_graph(
            g=self.graphs[graph_idx],
            seed=node_idx,
            trace=traces[0],
            positional_embedding_size=self.positional_embedding_size,
            entire_graph=hasattr(self, "entire_graph") and self.entire_graph,
        )
        graph_k = _rwr_trace_to_dgl_graph(
            g=self.graphs[graph_idx],
            seed=other_node_idx,
            trace=traces[1],
            positional_embedding_size=self.positional_embedding_size,
            entire_graph=hasattr(self, "entire_graph") and self.entire_graph,
        )
        return graph_q, graph_k

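# Note on the max_nodes_per_seed bound in __getitem__ above: the walk budget
# scales with the seed's out-degree. For example (illustrative numbers), with
# out_degree(node) = 4 and restart_prob = 0.8, the degree term is
# 4 * e / (e - 1) / 0.8 ≈ 7.9, so the budget is max(rw_hops=64, 8) = 64;
# rw_hops dominates unless the seed has very high degree.
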
class GraphClassificationDataset(NodeClassificationDataset):
    def __init__(
        self,
        data,
        rw_hops=64,
        subgraph_size=64,
        restart_prob=0.8,
        positional_embedding_size=32,
        step_dist=[1.0, 0.0, 0.0],
    ):
        self.rw_hops = rw_hops
        self.subgraph_size = subgraph_size
        self.restart_prob = restart_prob
        self.positional_embedding_size = positional_embedding_size
        self.step_dist = step_dist
        self.entire_graph = True
        assert positional_embedding_size > 1

        self.graphs = data
        self.length = len(self.graphs)
        self.total = self.length
    def _convert_idx(self, idx):
        graph_idx = idx
        # Use the highest-out-degree node of graph `idx` as the walk seed.
        node_idx = self.graphs[idx].out_degrees().argmax().item()
        return graph_idx, node_idx

@register_model("gcc")
class GCC(BaseModel):
    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--load_path", type=str)
        parser.add_argument("--hidden-size", type=int, default=64)
        parser.add_argument("--epoch", type=int, default=0)
        # fmt: on

    @classmethod
    def build_model_from_args(cls, args):
        return cls(args.load_path)

    def __init__(self, load_path):
        super(GCC, self).__init__()
        self.load_path = load_path
    def train(self, data):
        if os.path.isfile(self.load_path):
            print("=> loading checkpoint '{}'".format(self.load_path))
            checkpoint = torch.load(self.load_path, map_location="cpu")
            print(
                "=> loaded successfully '{}' (epoch {})".format(
                    self.load_path, checkpoint["epoch"]
                )
            )
        else:
            # Without a checkpoint the code below would hit a NameError, so
            # fail fast with an explicit error instead of only printing.
            raise FileNotFoundError(
                "=> no checkpoint found at '{}'".format(self.load_path)
            )
        args = checkpoint["opt"]
        args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if isinstance(data, list):
            train_dataset = GraphClassificationDataset(
                data=data,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
        else:
            train_dataset = NodeClassificationDataset(
                data=data,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
        args.batch_size = len(train_dataset)
        train_loader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=args.batch_size,
            collate_fn=batcher(),
            shuffle=False,
            num_workers=args.num_workers,
        )

        # create model and optimizer
        model = GraphEncoder(
            positional_embedding_size=args.positional_embedding_size,
            max_node_freq=args.max_node_freq,
            max_edge_freq=args.max_edge_freq,
            max_degree=args.max_degree,
            freq_embedding_size=args.freq_embedding_size,
            degree_embedding_size=args.degree_embedding_size,
            output_dim=args.hidden_size,
            node_hidden_dim=args.hidden_size,
            edge_hidden_dim=args.hidden_size,
            num_layers=args.num_layer,
            num_step_set2set=args.set2set_iter,
            num_layer_set2set=args.set2set_lstm_layer,
            gnn_model=args.model,
            norm=args.norm,
            degree_input=True,
        )
        model = model.to(args.device)
        model.load_state_dict(checkpoint["model"])
        del checkpoint

        emb = test_moco(train_loader, model, args)
        return emb.numpy()

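# Usage sketch (hypothetical names; assumes a cogdl-style `data` object with
# an `edge_index` tensor and a GCC checkpoint produced by the original
# pretraining pipeline):
#
#   model = GCC(load_path="saved/gcc_pretrained.pth")  # path is illustrative
#   node_embeddings = model.train(data)  # numpy array, one row per node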