Unsupervised Node Classification

In this tutorial, we introduce an important task: unsupervised node classification. In this task, we usually train an L2-normalized logistic regression classifier on the learned node representations and use the F1-score to measure performance.
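To make this evaluation protocol concrete, here is a minimal, self-contained sketch with random toy data (not part of the task class defined below): fit a logistic regression classifier on node embeddings and score it with micro-F1. Note that sklearn's LogisticRegression applies an L2 penalty by default.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Toy data: 100 "nodes" with 16-dimensional embeddings and binary labels.
rng = np.random.RandomState(0)
X = rng.rand(100, 16)
y = rng.randint(0, 2, size=100)

# Train on the first half of the nodes, test on the rest.
clf = LogisticRegression()              # L2-regularized by default
clf.fit(X[:50], y[:50])
pred = clf.predict(X[50:])
print(f1_score(y[50:], pred, average="micro"))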

First we define the UnsupervisedNodeClassification class, which has two parameters: hidden-size and num-shuffle. hidden-size is the dimension of the node representations, while num-shuffle is the number of times the data is shuffled during evaluation.

@register_task("unsupervised_node_classification")
class UnsupervisedNodeClassification(BaseTask):
    """Node classification task."""

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--hidden-size", type=int, default=128)
        parser.add_argument("--num-shuffle", type=int, default=5)
        # fmt: on

    def __init__(self, args):
        super(UnsupervisedNodeClassification, self).__init__(args)

Then we build the dataset according to the input graph's type and obtain self.label_matrix.

dataset = build_dataset(args)
self.data = dataset[0]
if issubclass(dataset.__class__.__bases__[0], InMemoryDataset):
    self.num_nodes = self.data.y.shape[0]
    self.num_classes = dataset.num_classes
    self.label_matrix = np.zeros((self.num_nodes, self.num_classes), dtype=int)
    self.label_matrix[range(self.num_nodes), self.data.y] = 1
    self.data.edge_attr = self.data.edge_attr.t()
else:
    self.label_matrix = self.data.y
    self.num_nodes, self.num_classes = self.data.y.shape
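For intuition, the one-hot construction label_matrix[range(self.num_nodes), self.data.y] = 1 behaves as follows on a toy label vector (standalone example, not part of the task class):

import numpy as np

y = np.array([0, 2, 1])                       # toy labels: 3 nodes, 3 classes
label_matrix = np.zeros((3, 3), dtype=int)
label_matrix[range(3), y] = 1
print(label_matrix)
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]]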

After that, we build the model and run model.train(G) to obtain the node representations.

self.model = build_model(args)
self.model_name = args.model
self.hidden_size = args.hidden_size
self.num_shuffle = args.num_shuffle
self.save_dir = args.save_dir
self.enhance = args.enhance
self.args = args
self.is_weighted = self.data.edge_attr is not None


def train(self):
    G = nx.Graph()
    if self.is_weighted:
        edges, weight = (
            self.data.edge_index.t().tolist(),
            self.data.edge_attr.tolist(),
        )
        G.add_weighted_edges_from(
            [(edges[i][0], edges[i][1], weight[0][i]) for i in range(len(edges))]
        )
    else:
        G.add_edges_from(self.data.edge_index.t().tolist())
    embeddings = self.model.train(G)
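As a quick standalone illustration (toy edge_index, independent of the task class), this is how an edge_index tensor of shape [2, num_edges] is turned into a NetworkX edge list:

import networkx as nx
import torch

# Toy edge_index: three undirected edges among four nodes.
edge_index = torch.tensor([[0, 1, 2],
                           [1, 2, 3]])

G = nx.Graph()
G.add_edges_from(edge_index.t().tolist())     # [[0, 1], [1, 2], [2, 3]]
print(G.edges())                              # [(0, 1), (1, 2), (2, 3)]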

The spectral propagation in ProNE can improve the quality of representations learned by other methods, so we can use enhance_emb to boost performance.

    if self.enhance is True:
        embeddings = self.enhance_emb(G, embeddings)

def enhance_emb(self, G, embs):
    A = sp.csr_matrix(nx.adjacency_matrix(G))
    self.args.model = 'prone'
    self.args.step, self.args.theta, self.args.mu = 5, 0.5, 0.2
    model = build_model(self.args)
    embs = model._chebyshev_gaussian(A, embs)
    return embs
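The actual filter is implemented by ProNE's _chebyshev_gaussian. As a rough, simplified sketch of the underlying idea (not ProNE's implementation), propagating embeddings over a row-normalized adjacency matrix mixes each node's representation with that of its neighborhood:

import numpy as np
import scipy.sparse as sp

def simple_propagate(A, embs, steps=5, alpha=0.5):
    # Toy propagation: repeatedly blend each embedding with the mean of its neighbors.
    deg = np.asarray(A.sum(axis=1)).flatten()
    deg[deg == 0] = 1                          # avoid division by zero for isolated nodes
    P = sp.diags(1.0 / deg) @ A                # row-normalized adjacency
    for _ in range(steps):
        embs = (1 - alpha) * embs + alpha * (P @ embs)
    return embs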

Once the embeddings are obtained, we save them to self.save_dir.

    # Map node2id
    features_matrix = np.zeros((self.num_nodes, self.hidden_size))
    for vid, node in enumerate(G.nodes()):
        features_matrix[node] = embeddings[vid]

    self.save_emb(features_matrix)

def save_emb(self, embs):
    name = os.path.join(self.save_dir, self.model_name + '_emb.npy')
    np.save(name, embs)
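The saved array can later be reloaded with np.load; the path below is illustrative (save_dir joined with the model name):

import numpy as np

embs = np.load("saved/deepwalk_emb.npy")       # hypothetical path
print(embs.shape)                              # (num_nodes, hidden_size)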

At last, we evaluate the embeddings by running the classification num_shuffle times under different training ratios, using features_matrix and label_matrix.

    return self._evaluate(features_matrix, label_matrix, self.num_shuffle)

def _evaluate(self, features_matrix, label_matrix, num_shuffle):
    # shuffle, to create train/test groups
    shuffles = []
    for _ in range(num_shuffle):
        shuffles.append(skshuffle(features_matrix, label_matrix))

    # score each train/test group
    all_results = defaultdict(list)
    training_percents = [0.1, 0.3, 0.5, 0.7, 0.9]
    for train_percent in training_percents:
        for shuf in shuffles:

In each shuffle, we split the data into two parts (training and testing) and use LogisticRegression to evaluate.

X, y = shuf

training_size = int(train_percent * self.num_nodes)

X_train = X[:training_size, :]
y_train = y[:training_size, :]

X_test = X[training_size:, :]
y_test = y[training_size:, :]

clf = TopKRanker(LogisticRegression())
clf.fit(X_train, y_train)

# find out how many labels should be predicted
top_k_list = list(map(int, y_test.sum(axis=1).T.tolist()[0]))
preds = clf.predict(X_test, top_k_list)
result = f1_score(y_test, preds, average="micro")
all_results[train_percent].append(result)

A node in the graph may have multiple labels, so we perform multi-label classification with TopKRanker, which is built on top of OneVsRestClassifier.

import numpy as np
import scipy.sparse as sp
from sklearn.multiclass import OneVsRestClassifier

class TopKRanker(OneVsRestClassifier):
    def predict(self, X, top_k_list):
        assert X.shape[0] == len(top_k_list)
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = sp.lil_matrix(probs.shape)

        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            labels = self.classes_[probs_.argsort()[-k:]].tolist()
            for label in labels:
                all_labels[i, label] = 1
        return all_labels
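As a standalone illustration with toy data (not from the tutorial), TopKRanker can be used like this: fit it on a multi-label indicator matrix, then predict exactly as many labels per sample as the ground truth contains.

import numpy as np
from sklearn.linear_model import LogisticRegression

# Toy multi-label data: 6 samples, 4 features, 3 possible labels.
rng = np.random.RandomState(0)
X = rng.rand(6, 4)
y = np.array([[1, 0, 1],
              [0, 1, 0],
              [1, 1, 0],
              [0, 0, 1],
              [1, 0, 0],
              [0, 1, 1]])

clf = TopKRanker(LogisticRegression())
clf.fit(X, y)

top_k_list = y.sum(axis=1).tolist()            # number of labels to predict per sample
preds = clf.predict(X, top_k_list)
print(preds.toarray())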

Finally, we obtain the Micro-F1 scores under different training ratios for each model and dataset.

return dict(
    (
        f"Micro-F1 {train_percent}",
        sum(all_results[train_percent]) / len(all_results[train_percent]),
    )
    for train_percent in sorted(all_results.keys())
)

The full implementation of UnsupervisedNodeClassification is available at https://github.com/THUDM/cogdl/blob/master/cogdl/tasks/unsupervised_node_classification.py.

To run UnsupervisedNodeClassification, we can use the following command:

python scripts/train.py --task unsupervised_node_classification --dataset ppi wikipedia --model deepwalk prone --seed 0 1

Then we get experimental results like this:

Variant                     Micro-F1 0.1    Micro-F1 0.3    Micro-F1 0.5    Micro-F1 0.7    Micro-F1 0.9
('ppi', 'deepwalk')         0.1547±0.0002   0.1846±0.0002   0.2033±0.0015   0.2161±0.0009   0.2243±0.0018
('ppi', 'prone')            0.1777±0.0016   0.2214±0.0020   0.2397±0.0015   0.2486±0.0022   0.2607±0.0096
('wikipedia', 'deepwalk')   0.4255±0.0027   0.4712±0.0005   0.4916±0.0011   0.5011±0.0017   0.5166±0.0043
('wikipedia', 'prone')      0.4834±0.0009   0.5320±0.0020   0.5504±0.0045   0.5586±0.0022   0.5686±0.0072