
【Code Reproduction】SCGC__Simple Contrastive Graph Clustering

Article directory

  • 【Code Reproduction】SCGC__Simple Contrastive Graph Clustering
    • 1. Introduction
    • 2. Preface
    • 3. Reproduce code
      • 3.1 Project framework
      • 3.2 Code files
        • 3.2.1 main.py
        • 3.2.2 model.py
        • 3.2.3 utils.py
        • 3.2.4 opt.py
      • 3.3 Experimental results
    • 4. Reference

1. Introduction

This article reproduces the code of the paper Simple Contrastive Graph Clustering (SCGC).

  • Contrastive learning has attracted extensive attention in deep graph clustering due to its promising performance.
  • However, complex data augmentation and time-consuming graph convolution operations impair the efficiency of these methods.

To address these issues, the authors propose a Simple Contrastive Graph Clustering (SCGC) algorithm that improves existing methods from the perspectives of network architecture, data augmentation, and objective function. In terms of architecture, the network consists of two main parts: preprocessing and the network backbone.

  • As a stand-alone preprocessing step, a simple low-pass denoising operation aggregates neighbor information; the backbone then consists of only two multi-layer perceptrons (MLPs).
  • For data augmentation, the model introduces no complex operations on the graph itself; instead, it constructs two augmented views of the same node through a parameter-unshared Siamese encoder and by directly perturbing the node embeddings.
  • Finally, in terms of the objective function, a novel cross-view structural consistency objective is designed to further improve clustering performance and the discriminative ability of the learned network (sketched right after this list).
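
Concretely, let $Z_1$ and $Z_2$ be the row-normalized outputs of the two encoders and $\hat{A}$ the self-looped adjacency matrix. The objective implemented as cross_view_loss in utils.py below is the mean squared difference between the cross-view similarity matrix and $\hat{A}$:

$$S = Z_1 Z_2^{\top}, \qquad \mathcal{L}_{cv} = \frac{1}{N^2}\sum_{i=1}^{N}\sum_{j=1}^{N}\left(\hat{A}_{ij} - S_{ij}\right)^2$$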

2. Preface

  • For the datasets used in this task, refer to the commonly used graph datasets.
  • For a detailed explanation of the paper, see: SCGC paper explanation.

3. Reproduce code

3.1 Project framework
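
The project consists of four files: main.py (the training and evaluation loop), model.py (the encoder network), utils.py (setup, data loading, losses, and metrics), and opt.py (the command-line arguments).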

3.2 Code files

3.2.1 main.py

import numpy as np
from torch.optim import Adam
from model import *
from utils import *
import opt


def train(X, y, A, a):
    # best metrics observed so far
    opt.args.acc, opt.args.nmi, opt.args.ari, opt.args.f1 = 0, 0, 0, 0

    # self-looped adjacency, used as the target of the cross-view loss
    # (a weights the original adjacency in the target)
    A_sl = a * A + np.eye(A.shape[0])

    # low-pass denoising: pre-propagate the features t times (see get_adjs)
    for adj in get_adjs(A):
        X = adj.dot(X)

    # two parameter-unshared MLP encoders over the smoothed features
    dims = [opt.args.n_input] + opt.args.dims
    model = OUR(opt.args.layers, dims).to(opt.args.device)

    X = numpy_to_torch(X).to(opt.args.device)
    A_sl = numpy_to_torch(A_sl).to(opt.args.device)

    optimizer = Adam(model.parameters(), lr=opt.args.lr)

    for epoch in range(opt.args.epoch):
        # two augmented views of the same nodes
        Z1, Z2 = model(X)
        # fused embedding used for clustering
        Z = (Z1 + Z2) / 2

        # cross-view structural consistency objective
        loss = cross_view_loss(Z1, Z2, A_sl)

        # optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # clustering & evaluation
        acc, nmi, ari, f1, _ = clustering(Z, y)
        if acc > opt.args.acc:
            opt.args.acc, opt.args.nmi, opt.args.ari, opt.args.f1 = acc, nmi, ari, f1
            print(epoch, "ACC: {:.4f},".format(acc), "NMI: {:.4f},".format(nmi),
                  "ARI: {:.4f},".format(ari), "F1: {:.4f}".format(f1))

    return opt.args.acc, opt.args.nmi, opt.args.ari, opt.args.f1


if __name__ == '__main__':
    # initialize hyper-parameters and the random seed
    setup()

    # load features, labels, and the adjacency matrix
    X, y, A = load_graph_data(opt.args.name)

    acc, nmi, ari, f1 = train(X, y, A, 1.0)
    print("ACC: {:.4f},".format(acc), "NMI: {:.4f},".format(nmi),
          "ARI: {:.4f},".format(ari), "F1: {:.4f}".format(f1))

3.2.2 model.py

import torch
import torch.nn as nn
import torch.nn.functional as F
from utils import *


class LinTrans(nn.Module):
    """A stack of linear layers used as one MLP encoder."""

    def __init__(self, layers, dims):
        super(LinTrans, self).__init__()
        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(nn.Linear(dims[i], dims[i + 1]))
        self.act = nn.Sigmoid()
        # self.act = nn.LeakyReLU()

    def scale(self, z):
        # per-row min-max scaling (kept for experiments, unused by default)
        zmax = z.max(dim=1, keepdim=True)[0]
        zmin = z.min(dim=1, keepdim=True)[0]
        z_scaled = (z - zmin) / (zmax - zmin)
        return z_scaled

    def forward(self, x):
        num_layer = len(self.layers)
        out = x
        for i in range(num_layer - 1):
            out = self.act(self.layers[i](out))
        out = self.layers[num_layer - 1](out)
        # out = self.scale(out)
        # L2-normalize each embedding, so dot products are cosine similarities
        out = F.normalize(out)
        return out


class OUR(nn.Module):
    """Two parameter-unshared MLP encoders (the Siamese backbone)."""

    def __init__(self, lt_layers, dims):
        super(OUR, self).__init__()
        self.lt1 = LinTrans(lt_layers, dims)
        self.lt2 = LinTrans(lt_layers, dims)

    def forward(self, X):
        # two views of the same nodes from the two encoders
        Z1, Z2 = self.lt1(X), self.lt2(X)
        return Z1, Z2
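
A minimal sanity check of the backbone (toy shapes; assumes model.py and its dependencies are importable): because forward() ends with F.normalize, every row of Z1 and Z2 has unit L2 norm, so Z1 @ Z2.T is exactly the cosine similarity between the two views.

import torch
from model import OUR

net = OUR(lt_layers=1, dims=[16, 8])  # a single linear layer: 16 -> 8
X = torch.randn(4, 16)                # 4 toy nodes with 16 features each
Z1, Z2 = net(X)
print(torch.allclose(Z1.norm(dim=1), torch.ones(4), atol=1e-5))  # True
S = Z1 @ Z2.T                         # 4 x 4 cross-view cosine similarities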

3.2.3 utils.py

import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing
import torch
import random
import opt
from sklearn import metrics
from munkres import Munkres
import torch.nn.functional as F
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score as ari_score
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score

def setup():
    """
    setup the dataset-specific hyper-parameters
    - name: the name of the dataset
    - device: CPU / GPU
    - seed: random seed
    - n_clusters: number of clusters
    - t: number of low-pass filtering steps
    - lr: learning rate
    Returns: None
    """
    # print("---------------setting---------------")

    setup_seed(opt.args.seed)

    if opt.args.name == 'amap':
        print('amap..........')
        opt.args.n_clusters = 8
        opt.args.t = 5
        opt.args.lr = 1e-5

    elif opt.args.name == 'cite':
        print('cite...')
        opt.args.n_clusters = 6
        opt.args.t = 2
        opt.args.lr = 5e-5
    
    elif opt.args.name == 'cora':
        print('cora..........')
        opt.args.n_clusters = 7
        opt.args.t = 2
        opt.args.lr = 1e-3
    
    elif opt.args.name == 'corafull':
        print('corafull...')
        opt.args.n_clusters = 70
        opt.args.t = 2
        opt.args.lr = 1e-4

    elif opt.args.name == 'bat':
        # opt.args.n_input = 50
        print('bat..........')
        opt.args.n_clusters = 4
        opt.args.t = 3
        opt.args.lr = 1e-3

    elif opt.args.name == 'eat':
        print('eat......')
        opt.args.n_clusters = 4
        opt.args.t = 5
        opt.args.lr = 1e-3
    
    elif opt.args.name == 'uat':
        print('uat...')
        opt.args.n_clusters = 4
        opt.args.t = 3
        opt.args.lr = 1e-3
    
    else:
        print("unknown dataset: {}".format(opt.args.name))
        exit(1)

    opt.args.device = torch.device("cuda:1" if opt.args.cuda else "cpu")  # GPU index 1 when CUDA is requested
    # opt.args.device = torch.device("cpu")

    # print("dataset : {}".format(opt.args.name))
    # print("device : {}".format(opt.args.device))
    # print("random seed : {}".format(opt.args.seed))
    # print("clusters : {}".format(opt.args.n_clusters))
    # print("n_PCA : {}".format(opt.args.n_input))
    # print("learning rate : {:.0e}".format(opt.args.lr))

def setup_seed(seed):
    """
    set random seeds to make results reproducible
    Args:
        seed: random seed
    Returns: None
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def numpy_to_torch(a, sparse=False):
    """
    numpy array to torch tensor
    :param a: the numpy array
    :param sparse: convert to a sparse tensor or not
    :return: torch tensor
    """
    if sparse:
        a = torch.FloatTensor(a).to_sparse()
    else:
        a = torch.FloatTensor(a)
    return a

def torch_to_numpy(t):
    """
    torch tensor to numpy array
    :param t: the torch tensor
    :return: numpy array
    """
    return t.numpy()

def load_graph_data(dataset_name, show_details=False):
    """
    load graph data
    :param dataset_name: the name of the dataset
    :param show_details: whether to print the details of the dataset
    - dataset name
    - features' shape
    - labels' shape
    - adj shape
    - edge num
    - category num
    - category distribution
    :return: the features, labels and adj
    """
    load_path = "../dataset/" + dataset_name + "/" + dataset_name
    feat = np.load(load_path + "_feat.npy", allow_pickle=True)
    label = np.load(load_path + "_label.npy", allow_pickle=True)
    adj = np.load(load_path + "_adj.npy", allow_pickle=True)

    if show_details:
        print("++++++++++++++++++++++++++++++++")
        print("---details of graph dataset---")
        print("++++++++++++++++++++++++++++++++")
        print("dataset name: ", dataset_name)
        print("feature shape: ", feat.shape)
        print("label shape: ", label.shape)
        print("adj shape: ", adj.shape)
        print("undirected edge num: ", int(np.nonzero(adj)[0].shape[0] / 2))
        print("category num: ", max(label) - min(label) + 1)
        print("category distribution: ")
        for i in range(max(label) + 1):
            print("label", i, end=":")
            print(len(label[np.where(label == i)]))
        print("++++++++++++++++++++++++++++++++")

    # optional PCA pre-processing of X
    # pca = PCA(n_components=opt.args.n_input)
    # feat = pca.fit_transform(feat)

    opt.args.n_input = feat.shape[1]

    return feat, label, adj

def gaussian_noised_feature(X):
    """
    add gaussian noise to the attribute matrix X
    Args:
        X: the attribute matrix
    Returns: the noised attribute matrix Y
    """
    N = torch.Tensor(np.random.normal(0, 0.01, X.shape)).to(opt.args.device)
    Y = X + N
    return Y

def gaussian_noised_feature_(X):
    """
    perturb the attribute matrix X with multiplicative gaussian noise
    Args:
        X: the attribute matrix
    Returns: the noised attribute matrix Y
    """
    N = torch.Tensor(np.random.normal(1, 0.01, X.shape)).to(opt.args.device)
    Y = X * N
    return Y
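
Neither noise helper is called by the main.py above; in the paper's augmentation scheme the node embeddings of one view are perturbed directly, roughly as in this hypothetical sketch:

# hypothetical usage: perturb the embeddings rather than the graph
Z1, Z2 = model(X)
Z2 = gaussian_noised_feature(Z2)  # additive noise: Z2 + N, N ~ N(0, 0.01)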

def normalize_adj(adj, self_loop=True, symmetry=False):
    """
    compute the normalized graph Laplacian of the adj matrix
    :param adj: input adj matrix
    :param self_loop: whether to add self-loops
    :param symmetry: symmetric normalization or not
    :return: the normalized Laplacian
    """
    ident = np.eye(adj.shape[0])
    # add self-loops
    if self_loop:
        adj_tmp = adj + ident
    else:
        adj_tmp = adj

    # degree vector and the unnormalized Laplacian L = D - A
    row_sum = adj_tmp.sum(1)
    L = np.diag(row_sum) - adj_tmp

    if symmetry:
        # symmetric normalization: D^{-1/2} L D^{-1/2}
        d1 = np.diag(np.power(row_sum, -0.5))
        norm_L = np.matmul(np.matmul(d1, L), d1)
    else:
        # random-walk normalization: D^{-1} L
        d2 = np.diag(np.power(row_sum, -1))
        norm_L = np.matmul(d2, L)

    return norm_L
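
In matrix form, with self-loops added ($\tilde{A} = A + I$, degree matrix $\tilde{D}$), the two branches return

$$\tilde{L}_{\mathrm{sym}} = \tilde{D}^{-1/2}(\tilde{D} - \tilde{A})\tilde{D}^{-1/2}, \qquad \tilde{L}_{\mathrm{rw}} = \tilde{D}^{-1}(\tilde{D} - \tilde{A})$$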

def get_adjs(adj, norm=True):
    ident = np.eye(adj.shape[0])
    norm_L = normalize_adj(adj, True, norm)
    # filter strengths for each of the t propagation steps
    reg = [1] * opt.args.t
    print('t......', len(reg))
    adjs = []
    for i in range(len(reg)):
        # low-pass graph filter I - reg[i] * L_norm
        adjs.append(ident - (reg[i] * norm_L))
    return adjs
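
With reg[i] = 1, each element of adjs is the graph filter $I - \tilde{L}$, which for the symmetric Laplacian equals $\tilde{D}^{-1/2}\tilde{A}\tilde{D}^{-1/2}$. Applying all t filters in train() therefore performs the low-pass denoising

$$X_s = (I - \tilde{L})^{t} X,$$

aggregating t-hop neighbor information before the features ever reach the MLPs.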

# Calculating loss ------------------------------------------------------------ start
def distance(x, y):
    return torch.sum(torch.square(x - y))

def similarity_loss(edges, Z):
    # mean squared embedding distance over the given edges
    num_edges = len(edges)
    loss_sim = torch.FloatTensor([0.0]).to(opt.args.device)
    for i in range(num_edges):
        loss_sim += distance(Z[edges[i][0]], Z[edges[i][1]])
    return loss_sim / num_edges


def cross_correlation(X, Y):
    # S = X Y^T; with L2-normalized rows this is the cosine similarity matrix
    return torch.mm(X, Y.t())

def cross_view_loss(X, Y, A):
    # cross-view similarity matrix between the two views
    S = cross_correlation(X, Y)
    # mean squared error between S and the self-looped adjacency
    L_cv = (A - S).pow(2).mean()
    return L_cv


def aug_loss(X, Xl, A, Al):
    return - (A - Al).pow(2).mean() - (X - Xl).pow(2).mean()
# Calculating loss ------------------------------------------------------------ end


# Clustering and Evaluation --------------------------------------------------- start
def clustering(Z, y):
    """
    clustering based on the learned embedding
    Args:
        Z: the input embedding
        y: the ground truth
    Returns: acc, nmi, ari, f1, clustering centers
    """
    model = KMeans(n_clusters=opt.args.n_clusters, n_init=20)
    cluster_id = model.fit_predict(Z.data.cpu().numpy())
    acc, nmi, ari, f1 = eva(y, cluster_id, show_details=opt.args.show_training_details)
    return acc, nmi, ari, f1, model.cluster_centers_

def cluster_acc(y_true, y_pred):
    """
    calculate clustering acc and f1-score
    Args:
        y_true: the ground truth
        y_pred: the cluster ids
    Returns: acc and f1-score
    """
    y_true = y_true - np.min(y_true)
    l1 = list(set(y_true))
    num_class1 = len(l1)
    l2 = list(set(y_pred))
    num_class2 = len(l2)
    ind = 0
    # make sure every true class id appears among the predicted ids
    if num_class1 != num_class2:
        for i in l1:
            if i in l2:
                pass
            else:
                y_pred[ind] = i
                ind += 1
    l2 = list(set(y_pred))
    num_class2 = len(l2)
    if num_class1 != num_class2:
        print('error')
        return
    # (true class x predicted cluster) co-occurrence counts
    cost = np.zeros((num_class1, num_class2), dtype=int)
    for i, c1 in enumerate(l1):
        mps = [i1 for i1, e1 in enumerate(y_true) if e1 == c1]
        for j, c2 in enumerate(l2):
            mps_d = [i1 for i1 in mps if y_pred[i1] == c2]
            cost[i][j] = len(mps_d)
    # Hungarian algorithm: best one-to-one mapping between clusters and classes
    m = Munkres()
    cost = cost.__neg__().tolist()
    indexes = m.compute(cost)
    # relabel the predictions according to the optimal mapping
    new_predict = np.zeros(len(y_pred))
    for i, c in enumerate(l1):
        c2 = l2[indexes[i][1]]
        ai = [ind for ind, elm in enumerate(y_pred) if elm == c2]
        new_predict[ai] = c
    acc = metrics.accuracy_score(y_true, new_predict)
    f1_macro = metrics.f1_score(y_true, new_predict, average='macro')
    return acc, f1_macro


def eva(y_true, y_pred, show_details=False):
    """
    evaluate the clustering performance
    Args:
        y_true: the ground truth
        y_pred: the predicted labels
        show_details: whether to print the details
    Returns: acc, nmi, ari, f1
    """
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    ari = ari_score(y_true, y_pred)
    if show_details:
        print(':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi), ', ari {:.4f}'.format(ari),
              ', f1 {:.4f}'.format(f1))
    return acc, nmi, ari, f1
# Clustering and Evaluation --------------------------------------------------- end
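
A small self-contained check of the evaluation path (toy labels; assumes munkres and scikit-learn are installed and utils.py is importable): the Hungarian matching in cluster_acc makes all metrics invariant to a permutation of the cluster ids.

import numpy as np
from utils import eva

y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([2, 2, 0, 0, 1, 1])  # the same partition, ids permuted
acc, nmi, ari, f1 = eva(y_true, y_pred)
print(acc, nmi, ari, f1)               # 1.0 1.0 1.0 1.0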


3.2.4 opt.py

import argparse

parser = argparse.ArgumentParser(description='OUR', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# setting
parser.add_argument('--name', type=str, default="cite")
parser.add_argument('--cuda', type=bool, default=True)
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--lr', type=float, default=1e-4)
parser.add_argument('--layers', type=int, default=1)
parser.add_argument('--dims', type=int, nargs='+', default=[500], help='Number of units in each hidden layer.')
parser.add_argument('--epoch', type=int, default=400)
parser.add_argument('--show_training_details', type=bool, default=False)

# clustering performance: acc, nmi, ari, f1
parser.add_argument('--acc', type=float, default=0)
parser.add_argument('--nmi', type=float, default=0)
parser.add_argument('--ari', type=float, default=0)
parser.add_argument('--f1', type=float, default=0)

args = parser.parse_args()
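
All modules share this single parsed namespace via import opt; setup() attaches the dataset-specific fields (n_clusters, t, lr, device) at runtime, and load_graph_data() fills in n_input. Defaults can be overridden on the command line, for example:

python main.py --name bat --epoch 400

One caveat worth knowing: type=bool is an argparse pitfall, since bool() of any non-empty string is truthy, so --cuda False would still parse as True.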

3.3 Experimental results

4. Reference

【1】https://blog.csdn.net/qq_51392112/article/details/128943812
【2】https://blog.csdn.net/qq_51392112/article/details/129429108