Source code for cogdl.tasks.unsupervised_graph_classification

import copy
import os
from collections import defaultdict
import numpy as np
from sklearn.utils import shuffle as skshuffle
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F

from cogdl.data import Data, DataLoader
from cogdl.datasets import build_dataset
from cogdl.models import build_model
from . import BaseTask, register_task
from .graph_classification import node_degree_as_feature


@register_task("unsupervised_graph_classification")
class UnsupervisedGraphClassification(BaseTask):
    r"""Unsupervised graph classification task."""

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--lr", type=float, default=0.001)
        parser.add_argument("--num-shuffle", type=int, default=10)
        parser.add_argument("--degree-feature", dest="degree_feature", action="store_true")
        # fmt: on

    def __init__(self, args, dataset=None, model=None):
        super(UnsupervisedGraphClassification, self).__init__(args)
        self.device = args.device_id[0] if not args.cpu else "cpu"

        dataset = build_dataset(args) if dataset is None else dataset
        if "gcc" in args.model:
            self.label = dataset.graph_labels[:, 0]
            self.data = dataset.graph_lists
        else:
            self.label = np.array([data.y for data in dataset])
            self.data = [
                Data(
                    x=data.x,
                    y=data.y,
                    edge_index=data.edge_index,
                    edge_attr=data.edge_attr,
                    pos=data.pos,
                ).apply(lambda x: x.to(self.device))
                for data in dataset
            ]
        args.num_features = dataset.num_features
        args.num_classes = args.hidden_size
        args.use_unsup = True

        if args.degree_feature:
            self.data = node_degree_as_feature(self.data)
            args.num_features = self.data[0].num_features

        self.num_graphs = len(self.data)
        self.num_classes = dataset.num_classes
        # self.label_matrix = np.zeros((self.num_graphs, self.num_classes))
        # self.label_matrix[range(self.num_graphs), np.array([data.y for data in self.data], dtype=int)] = 1

        self.model = build_model(args) if model is None else model
        self.model = self.model.to(self.device)
        self.model_name = args.model
        self.hidden_size = args.hidden_size
        self.num_shuffle = args.num_shuffle
        self.save_dir = args.save_dir
        self.epoch = args.epoch
        self.use_nn = args.model in ("infograph",)

        if self.use_nn:
            self.optimizer = torch.optim.Adam(
                self.model.parameters(), lr=args.lr, weight_decay=args.weight_decay
            )
            self.data_loader = DataLoader(self.data, batch_size=args.batch_size, shuffle=True)
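    # --- Illustrative construction (not part of the original module) ---
    # A minimal sketch of the arguments __init__ reads, assuming an
    # argparse.Namespace is handed in directly; the attribute names come
    # from the code above, the values here are hypothetical:
    #
    #   import argparse
    #   args = argparse.Namespace(
    #       device_id=[0], cpu=True, model="infograph", dataset="mutag",
    #       hidden_size=64, num_shuffle=10, save_dir=".", epoch=20,
    #       degree_feature=False, lr=0.001, weight_decay=5e-4, batch_size=20,
    #   )
    #   task = UnsupervisedGraphClassification(args)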
    def train(self):
        if self.use_nn:
            best_model = None
            best_loss = float("inf")
            epoch_iter = tqdm(range(self.epoch))
            for epoch in epoch_iter:
                loss_n = []
                for batch in self.data_loader:
                    batch = batch.to(self.device)
                    predict, loss = self.model(batch)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    loss_n.append(loss.item())
                loss_n = np.mean(loss_n)
                epoch_iter.set_description(f"Epoch: {epoch:03d}, TrainLoss: {loss_n:.4f}")
                if loss_n < best_loss:
                    best_loss = loss_n
                    best_model = copy.deepcopy(self.model)

            self.model = best_model
            with torch.no_grad():
                self.model.eval()
                prediction = []
                label = []
                for batch in self.data_loader:
                    batch = batch.to(self.device)
                    predict, _ = self.model(batch)
                    prediction.extend(predict.cpu().numpy())
                    label.extend(batch.y.cpu().numpy())
                prediction = np.array(prediction).reshape(len(label), -1)
                label = np.array(label).reshape(-1)
        elif "gcc" in self.model_name:
            prediction = self.model.train(self.data)
            label = self.label
        else:
            prediction, loss = self.model(self.data)
            label = self.label

        if prediction is not None:
            # self.save_emb(prediction)
            return self._evaluate(prediction, label)
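    # --- Expected model interface (assumption, for illustration) ---
    # `train` above drives three model flavours: nn-based models must return
    # an (embedding, loss) pair from forward, gcc-style models expose their
    # own train(data), and the remaining models map the whole dataset to
    # (embeddings, loss) in a single call. A hypothetical nn-based model:
    #
    #   class MyUnsupModel(nn.Module):
    #       def forward(self, batch):
    #           emb = self.encoder(batch)        # (num_graphs_in_batch, hidden_size)
    #           loss = self.unsup_loss(emb, batch)
    #           return emb, loss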
    def save_emb(self, embs):
        name = os.path.join(self.save_dir, self.model_name + "_emb.npy")
        np.save(name, embs)
    def _evaluate(self, embeddings, labels):
        shuffles = []
        for _ in range(self.num_shuffle):
            shuffles.append(skshuffle(embeddings, labels))
        all_results = defaultdict(list)
        training_percents = [0.1, 0.3, 0.5, 0.7, 0.9]

        for training_percent in training_percents:
            for shuf in shuffles:
                training_size = int(training_percent * self.num_graphs)
                X, y = shuf
                X_train = X[:training_size, :]
                y_train = y[:training_size]
                X_test = X[training_size:, :]
                y_test = y[training_size:]
                params = {"C": [1e-3, 1e-2, 1e-1, 1, 10]}
                svc = SVC()
                clf = GridSearchCV(svc, params)
                clf.fit(X_train, y_train)

                preds = clf.predict(X_test)
                # micro-F1 equals accuracy for single-label multi-class data
                accuracy = f1_score(y_test, preds, average="micro")
                all_results[training_percent].append(accuracy)

        result = dict(
            (
                f"Accuracy {train_percent}",
                sum(all_results[train_percent]) / len(all_results[train_percent]),
            )
            for train_percent in sorted(all_results.keys())
        )
        print("Test Acc: ", list(result.values())[-1])
        return result
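# --- Usage sketch (illustrative, not part of the original module) ---
# One way to run this task end to end, assuming cogdl's standard entry
# points `cogdl.options` and `cogdl.tasks.build_task` are available:
#
#   from cogdl import options
#   from cogdl.tasks import build_task
#
#   parser = options.get_training_parser()
#   args = options.parse_args_and_arch(
#       parser,
#       parser.parse_args([
#           "--task", "unsupervised_graph_classification",
#           "--model", "infograph",
#           "--dataset", "mutag",
#       ]),
#   )
#   task = build_task(args)
#   result = task.train()  # e.g. {"Accuracy 0.1": ..., ..., "Accuracy 0.9": ...}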