Source code for cogdl.models.nn.dgl_gcc

import math
import os
import os.path as osp

import dgl
import numpy as np
import scipy.sparse as sparse
import sklearn.preprocessing as preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
from cogdl.data import download_url
from cogdl.layers.gcc_module import *
from scipy.sparse import linalg

from .. import BaseModel, register_model


def batcher():
    # Collate function for DataLoader: merge a list of (query, key) graph
    # pairs into two batched DGLGraphs.
    def batcher_dev(batch):
        graph_q, graph_k = zip(*batch)
        graph_q, graph_k = dgl.batch(graph_q), dgl.batch(graph_k)
        return graph_q, graph_k

    return batcher_dev

def test_moco(train_loader, model, opt):
    """One pass over the loader to extract MoCo embeddings (no training)."""
    model.eval()
    emb_list = []
    for idx, batch in enumerate(train_loader):
        graph_q, graph_k = batch
        bsz = graph_q.batch_size
        graph_q.to(opt.device)
        graph_k.to(opt.device)

        with torch.no_grad():
            feat_q = model(graph_q)
            feat_k = model(graph_k)

        assert feat_q.shape == (bsz, opt.hidden_size)
        # Average the query and key views to get one embedding per sample.
        emb_list.append(((feat_q + feat_k) / 2).detach().cpu())
    return torch.cat(emb_list)

def eigen_decomposision(n, k, laplacian, hidden_size, retry):
    if k <= 0:
        return torch.zeros(n, hidden_size)
    laplacian = laplacian.astype("float64")
    ncv = min(n, max(2 * k + 1, 20))
    # follows https://stackoverflow.com/questions/52386942/scipy-sparse-linalg-eigsh-with-fixed-seed
    v0 = np.random.rand(n).astype("float64")
    for i in range(retry):
        try:
            s, u = linalg.eigsh(laplacian, k=k, which="LA", ncv=ncv, v0=v0)
        except sparse.linalg.eigen.arpack.ArpackError:
            # ARPACK failed to converge: enlarge the Lanczos basis and retry;
            # fall back to all-zero features if every retry fails.
            # print("arpack error, retry=", i)
            ncv = min(ncv * 2, n)
            if i + 1 == retry:
                sparse.save_npz("arpack_error_sparse_matrix.npz", laplacian)
                u = torch.zeros(n, k)
        else:
            break
    x = preprocessing.normalize(u, norm="l2")
    x = torch.from_numpy(x.astype("float32"))
    # Zero-pad from k eigenvector columns up to the full hidden size.
    x = F.pad(x, (0, hidden_size - k), "constant", 0)
    return x

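# A minimal usage sketch (not part of the original module; the 4-node path
# graph and the sizes are illustrative assumptions). `eigen_decomposision`
# accepts any symmetric sparse matrix, L2-normalizes the top-k eigenvectors
# row-wise, and zero-pads them from k up to hidden_size columns:
#
#     A = sparse.csr_matrix(np.array([
#         [0, 1, 0, 0],
#         [1, 0, 1, 0],
#         [0, 1, 0, 1],
#         [0, 0, 1, 0],
#     ], dtype="float64"))
#     x = eigen_decomposision(n=4, k=2, laplacian=A, hidden_size=8, retry=3)
#     assert x.shape == (4, 8)  # columns 2..7 are the zero padding
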
def _add_undirected_graph_positional_embedding(g, hidden_size, retry=10):
    # We use eigenvectors of the normalized graph Laplacian as vertex features.
    # It could be viewed as a generalization of positional embedding in the
    # "Attention Is All You Need" paper.
    # Recall that the eigenvectors of the normalized Laplacian of a line graph
    # are cos/sin functions.
    # See section 2.4 of http://www.cs.yale.edu/homes/spielman/561/2009/lect02-09.pdf
    n = g.number_of_nodes()
    adj = g.adjacency_matrix_scipy(transpose=False, return_edge_ids=False).astype(float)
    norm = sparse.diags(dgl.backend.asnumpy(g.in_degrees()).clip(1) ** -0.5, dtype=float)
    laplacian = norm * adj * norm
    k = min(n - 2, hidden_size)
    x = eigen_decomposision(n, k, laplacian, hidden_size, retry)
    g.ndata["pos_undirected"] = x.float()
    return g

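# Clarifying note (added): with D the diagonal degree matrix and A the
# adjacency matrix, the variable `laplacian` above actually holds the
# normalized adjacency N = D^{-1/2} A D^{-1/2}. Since the normalized Laplacian
# is L = I - N, the two share eigenvectors, and taking the *largest*
# eigenvalues of N (`which="LA"` in eigen_decomposision) is equivalent to
# taking the smallest eigenvalues of L. A hedged sketch on a toy 6-node ring,
# assuming the DGL 0.4-era APIs used above:
#
#     g = dgl.DGLGraph()
#     g.add_nodes(6)
#     src = list(range(6)); dst = [(i + 1) % 6 for i in range(6)]
#     g.add_edges(src, dst); g.add_edges(dst, src)
#     g = _add_undirected_graph_positional_embedding(g, hidden_size=4)
#     g.ndata["pos_undirected"].shape  # torch.Size([6, 4])
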
def _rwr_trace_to_dgl_graph(g, seed, trace, positional_embedding_size, entire_graph=False):
    subv = torch.unique(torch.cat(trace)).tolist()
    try:
        subv.remove(seed)
    except ValueError:
        pass
    # Put the seed first so it maps to local node 0 in the induced subgraph.
    subv = [seed] + subv
    if entire_graph:
        subg = g.subgraph(g.nodes())
    else:
        subg = g.subgraph(subv)

    subg = _add_undirected_graph_positional_embedding(subg, positional_embedding_size)
    subg.ndata["seed"] = torch.zeros(subg.number_of_nodes(), dtype=torch.long)
    if entire_graph:
        subg.ndata["seed"][seed] = 1
    else:
        subg.ndata["seed"][0] = 1
    return subg

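# Hedged sketch of what `_rwr_trace_to_dgl_graph` produces (`full_graph` and
# the toy trace tensors are illustrative assumptions): the nodes visited by
# the walk become an induced subgraph whose node order puts the seed first,
# so ndata["seed"] is a one-hot mask marking it.
#
#     trace = [torch.LongTensor([2, 5, 7]), torch.LongTensor([2, 3])]
#     subg = _rwr_trace_to_dgl_graph(
#         g=full_graph, seed=2, trace=trace, positional_embedding_size=32
#     )
#     subg.ndata["seed"]                  # tensor([1, 0, 0, 0]) -- seed at index 0
#     subg.ndata["pos_undirected"].shape  # (subg.number_of_nodes(), 32)
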
class NodeClassificationDataset(object):
    def __init__(
        self,
        data,
        rw_hops=64,
        subgraph_size=64,
        restart_prob=0.8,
        positional_embedding_size=32,
        step_dist=[1.0, 0.0, 0.0],
    ):
        self.rw_hops = rw_hops
        self.subgraph_size = subgraph_size
        self.restart_prob = restart_prob
        self.positional_embedding_size = positional_embedding_size
        self.step_dist = step_dist
        assert positional_embedding_size > 1

        self.data = data
        self.graphs = [self._create_dgl_graph(self.data)]
        self.length = sum([g.number_of_nodes() for g in self.graphs])
        self.total = self.length

    def _create_dgl_graph(self, data):
        graph = dgl.DGLGraph()
        src, dst = data.edge_index.tolist()
        num_nodes = data.edge_index.max() + 1
        graph.add_nodes(num_nodes)
        # Add each edge in both directions to make the graph undirected.
        graph.add_edges(src, dst)
        graph.add_edges(dst, src)
        graph.readonly()
        return graph

    def __len__(self):
        return self.length

    def _convert_idx(self, idx):
        # Map a flat sample index to a (graph index, node index) pair.
        graph_idx = 0
        node_idx = idx
        for i in range(len(self.graphs)):
            if node_idx < self.graphs[i].number_of_nodes():
                graph_idx = i
                break
            else:
                node_idx -= self.graphs[i].number_of_nodes()
        return graph_idx, node_idx

    def __getitem__(self, idx):
        graph_idx, node_idx = self._convert_idx(idx)

        step = np.random.choice(len(self.step_dist), 1, p=self.step_dist)[0]
        if step == 0:
            other_node_idx = node_idx
        else:
            other_node_idx = dgl.contrib.sampling.random_walk(
                g=self.graphs[graph_idx], seeds=[node_idx], num_traces=1, num_hops=step
            )[0][0][-1].item()

        max_nodes_per_seed = max(
            self.rw_hops,
            int(
                (
                    self.graphs[graph_idx].out_degree(node_idx)
                    * math.e
                    / (math.e - 1)
                    / self.restart_prob
                )
                + 0.5
            ),
        )
        traces = dgl.contrib.sampling.random_walk_with_restart(
            self.graphs[graph_idx],
            seeds=[node_idx, other_node_idx],
            restart_prob=self.restart_prob,
            max_nodes_per_seed=max_nodes_per_seed,
        )

        graph_q = _rwr_trace_to_dgl_graph(
            g=self.graphs[graph_idx],
            seed=node_idx,
            trace=traces[0],
            positional_embedding_size=self.positional_embedding_size,
            entire_graph=hasattr(self, "entire_graph") and self.entire_graph,
        )
        graph_k = _rwr_trace_to_dgl_graph(
            g=self.graphs[graph_idx],
            seed=other_node_idx,
            trace=traces[1],
            positional_embedding_size=self.positional_embedding_size,
            entire_graph=hasattr(self, "entire_graph") and self.entire_graph,
        )
        return graph_q, graph_k

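# A hedged end-to-end sketch for the dataset (`build_dataset_from_name` and
# the "cora" name are assumptions, not used by this module): each item is a
# pair of positional-embedded subgraphs sampled by random walk with restart
# around the same neighborhood, and `batcher()` collates items into two
# batched graphs.
#
#     from cogdl.datasets import build_dataset_from_name  # assumed helper
#     data = build_dataset_from_name("cora")[0]
#     dataset = NodeClassificationDataset(data, rw_hops=64, restart_prob=0.8)
#     loader = torch.utils.data.DataLoader(
#         dataset, batch_size=32, collate_fn=batcher(), shuffle=False
#     )
#     graph_q, graph_k = next(iter(loader))
#     graph_q.batch_size  # 32
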
class GraphClassificationDataset(NodeClassificationDataset):
    def __init__(
        self,
        data,
        rw_hops=64,
        subgraph_size=64,
        restart_prob=0.8,
        positional_embedding_size=32,
        step_dist=[1.0, 0.0, 0.0],
    ):
        self.rw_hops = rw_hops
        self.subgraph_size = subgraph_size
        self.restart_prob = restart_prob
        self.positional_embedding_size = positional_embedding_size
        self.step_dist = step_dist
        self.entire_graph = True
        assert positional_embedding_size > 1

        self.graphs = data
        self.length = len(self.graphs)
        self.total = self.length

    def _convert_idx(self, idx):
        # One sample per graph; seed the walk at the highest-degree node.
        graph_idx = idx
        node_idx = self.graphs[idx].out_degrees().argmax().item()
        return graph_idx, node_idx

@register_model("gcc")
class GCC(BaseModel):
    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--load-path", type=str, default='./saved/gcc_pretrained.pth')
        parser.add_argument("--hidden-size", type=int, default=64)
        parser.add_argument("--epoch", type=int, default=0)
        # fmt: on

    @classmethod
    def build_model_from_args(cls, args):
        return cls(args.load_path)

    def __init__(self, load_path):
        super(GCC, self).__init__()
        self.load_path = load_path

    def train(self, data):
        if not os.path.isfile(self.load_path):
            print("=> no checkpoint found at '{}'".format(self.load_path))
            url = "https://github.com/cenyk1230/gcc-data/raw/master/saved/gcc_pretrained.pth"
            path = '/'.join(self.load_path.split('/')[:-1])
            name = self.load_path.split('/')[-1]
            download_url(url, path, name=name)
        print("=> loading checkpoint '{}'".format(self.load_path))
        checkpoint = torch.load(self.load_path, map_location="cpu")
        print(
            "=> loaded successfully '{}' (epoch {})".format(
                self.load_path, checkpoint["epoch"]
            )
        )
        args = checkpoint["opt"]
        args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if isinstance(data, list):
            train_dataset = GraphClassificationDataset(
                data=data,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
        else:
            train_dataset = NodeClassificationDataset(
                data=data,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
        args.batch_size = len(train_dataset)
        train_loader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=args.batch_size,
            collate_fn=batcher(),
            shuffle=False,
            num_workers=args.num_workers,
        )

        # create the encoder (no optimizer is needed: the pretrained
        # checkpoint is only used for inference)
        model = GraphEncoder(
            positional_embedding_size=args.positional_embedding_size,
            max_node_freq=args.max_node_freq,
            max_edge_freq=args.max_edge_freq,
            max_degree=args.max_degree,
            freq_embedding_size=args.freq_embedding_size,
            degree_embedding_size=args.degree_embedding_size,
            output_dim=args.hidden_size,
            node_hidden_dim=args.hidden_size,
            edge_hidden_dim=args.hidden_size,
            num_layers=args.num_layer,
            num_step_set2set=args.set2set_iter,
            num_layer_set2set=args.set2set_lstm_layer,
            gnn_model=args.model,
            norm=args.norm,
            degree_input=True,
        )

        model = model.to(args.device)
        model.load_state_dict(checkpoint["model"])
        del checkpoint

        emb = test_moco(train_loader, model, args)
        return emb.numpy()

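# A hedged usage sketch for the model itself (paths and data are
# illustrative): despite its name, `train` performs no gradient updates -- it
# loads (downloading if necessary) a pretrained GCC checkpoint, encodes every
# node (or every graph, when `data` is a list) once, and returns the
# embeddings.
#
#     model = GCC(load_path="./saved/gcc_pretrained.pth")
#     emb = model.train(data)  # numpy array of shape (len(dataset), hidden_size)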