# train.py

import os
import yaml
import glob
import torch
import shutil
import argparse
import numpy as np
import torch.nn as nn
from model import GCN, modularity_loss
from utils import *
from dataset import load_feat, load_labels, FeatureDataset
from metrics import pairwise, bcubed
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from pytorch_metric_learning import samplers
from torch_geometric.loader import NeighborLoader

# CUDA / PyTorch allocator switches, passed through environment variables.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': '1',
    "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:100",
})

# Fixed seed, handed to set_random_seed at import time.
seed = 123
set_random_seed(seed)

def train(model, optimizer, training_loader, config, device):
    """Run one pass over ``training_loader`` and return the averaged loss terms.

    For every batch the features and labels are moved to ``device``, a graph
    is built with ``gen_graph``, the model is evaluated on it, and the loss
    returned by ``modularity_loss`` is back-propagated before the optimizer
    step. No gradient clipping is applied.

    Args:
        model: Network called as ``model(x, edge_index)``.
        optimizer: Optimizer whose gradients are reset and stepped per batch.
        training_loader: Iterable of ``(features, labels)`` batches that also
            supports ``len()``.
        config: Configuration object forwarded to ``gen_graph`` and
            ``modularity_loss``.
        device: Target device for the batch tensors.

    Returns:
        A ``(loss, modularity, neg_pair_loss)`` tuple, each value being the
        sum over batches divided by ``len(training_loader)``.
    """
    loss_total = 0.
    modularity_total = 0.
    neg_pair_total = 0.

    for features, labels in training_loader:
        features = features.to(device)
        labels = labels.to(device)

        # Reset gradients, then build the batch graph and run the forward pass.
        optimizer.zero_grad()
        graph, nbrs = gen_graph(features, config, device)
        embeddings = model(graph.x, graph.edge_index)

        # Loss terms for this batch.
        comm_spmat, neg_comm_spmat = community2spmat(labels, device)
        batch_modularity, batch_neg_pair, batch_loss = modularity_loss(
            embeddings, nbrs, comm_spmat, neg_comm_spmat, config)

        # Backward pass and parameter update.
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        modularity_total += batch_modularity.item()
        neg_pair_total += batch_neg_pair.item()

        # Release cached GPU memory after every batch.
        torch.cuda.empty_cache()

    num_batches = len(training_loader)
    return (loss_total / num_batches,
            modularity_total / num_batches,
            neg_pair_total / num_batches)

def _load_config(config_path):
    """Parse the YAML file at ``config_path`` and return the loaded config.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing is done.
    """
    with open(config_path, 'r') as config_file:
        return yaml.safe_load(config_file)


def _embed_test_nodes(model, test_in_graph, test_loader, out_dim, device):
    """Run ``model`` over every ``test_loader`` batch and gather seed-node outputs.

    Returns a ``(test_in_graph.num_nodes, out_dim)`` tensor on ``device``; the
    rows indexed by each batch's ``input_id`` are overwritten with the model
    output for those nodes.
    """
    out = torch.zeros((test_in_graph.num_nodes, out_dim), device=device)
    for data in test_loader:
        # Position of every seed node (input_id) inside the sampled subgraph (n_id).
        local_idx = (data.input_id[:, None] == data.n_id).nonzero()[:, 1]
        out[data.input_id, :] = model(data.x, data.edge_index)[local_idx]
    torch.cuda.empty_cache()
    return out


def _search_best_tau(out, test_in_graph, test_nbrs, test_ji, val_labels, config, device, taus):
    """Cluster the test embeddings for every tau and return the best result.

    The embedding similarities towards the precomputed neighbours are blended
    with ``test_ji`` using ``config['SIM_LAMBDA']``; for each tau the edges are
    filtered, clusters are produced by ``tree_generation`` /
    ``peak_propagation`` / ``peak_agg``, and pairwise and B-cubed metrics are
    computed against ``val_labels``.

    Returns:
        ``[tau, avg_pre_p, avg_rec_p, fscore_p, avg_pre_b, avg_rec_b, fscore_b]``
        for the tau with the highest pairwise F-score, or ``None`` when no tau
        produced a comparable (non-NaN) score.
    """
    test_graph = Data(x=out, edge_index=test_in_graph.edge_index)

    # Calculate similarities for the same neighbours before GCN inference
    # We calculate row-by-row similarities due to VRAM limitation
    out_sims = torch.zeros_like(test_nbrs, dtype=torch.float32, device=device)
    for i, out_feat in enumerate(out):
        out_sims[i, :] = out[test_nbrs[i]] @ out_feat

    sim_lambda = config['SIM_LAMBDA']
    out_sims = (1 - sim_lambda) * test_ji + sim_lambda * out_sims

    out_spmat = knns2spmat(out_sims, test_nbrs)
    _, values = sparse_mx_to_indices_values(out_spmat)
    test_graph['raw_affine'] = values
    test_graph['w_out'] = out_spmat.sum(dim=1)
    test_graph['w_in'] = out_spmat.sum(dim=0)

    tau_best_fp = -np.inf
    best_metrics = None
    for curr_tau in taus:
        curr_test_graph = test_graph.clone()
        edge_mask = filter_edges(curr_test_graph, tau=curr_tau)
        curr_test_graph.edge_index = curr_test_graph.edge_index[:, edge_mask]
        curr_test_graph['raw_affine'] = curr_test_graph['raw_affine'][edge_mask]

        treeg = tree_generation(curr_test_graph, device)
        _, pred_labels = peak_propagation(treeg)
        cluster_feats = scatter_mean(curr_test_graph.x, pred_labels, dim=0)
        pred_labels = peak_agg(cluster_feats, pred_labels, config, curr_tau)
        print(f'Number of identities: {len(pred_labels.unique())}')

        # Calculate metrics
        pred_np = pred_labels.cpu().numpy()
        avg_pre_p, avg_rec_p, fscore_p = pairwise(val_labels, pred_np)
        avg_pre_b, avg_rec_b, fscore_b = bcubed(val_labels, pred_np)

        if fscore_p > tau_best_fp:
            tau_best_fp = fscore_p
            best_metrics = [curr_tau, avg_pre_p, avg_rec_p, fscore_p, avg_pre_b, avg_rec_b, fscore_b]

    return best_metrics


def main(config_path, device):
    """Train the GCN described by the config and periodically validate it.

    Loads train/validation features and labels, precomputes the validation
    graph (cached under ``ws50/``), trains for ``config['EPOCHS']`` epochs and,
    every 8th epoch, clusters the validation set over a sweep of tau values.
    Scalars are written to TensorBoard in the checkpoint directory and the
    weights with the best pairwise F-score are kept as
    ``model_best-<epoch>.pth``.

    Args:
        config_path: Path of the YAML configuration file.
        device: ``torch.device`` used for the model and tensors.
    """
    # Load config
    config = _load_config(config_path)

    # Load data
    feat_dim = config['FEATURE_DIM']

    train_features = l2norm(load_feat(config['TRAIN_FEATURES'], feat_dim))
    print('Train:')
    print('features shape:', train_features.shape)

    train_labels = load_labels(config['TRAIN_LABELS'])
    print(f'num of labels: {train_labels.shape[0]}')
    print(f'#cls: {len(np.unique(train_labels))}\n')

    val_features = l2norm(load_feat(config['VAL_FEATURES'], feat_dim))
    print('Test:')
    print('features shape:', val_features.shape)
    test_in_graph, test_nbrs = gen_graph(torch.from_numpy(val_features).to(device),
                                         config,
                                         device,
                                         z_score=True)
    test_ji = jaccard_index(test_nbrs,
                            torch.zeros_like(test_nbrs, device=device).float(),
                            1.,
                            test_in_graph['nbrs_bounds'])

    # Cache the validation graph artefacts; they can be reloaded with
    # torch.load instead of being recomputed. The directory is created first
    # so torch.save does not fail on a fresh checkout.
    os.makedirs('ws50', exist_ok=True)
    torch.save(test_in_graph, 'ws50/test_graph.pt')
    torch.save(test_in_graph['nbrs_bounds'], 'ws50/test_nbrs_bounds.pt')
    torch.save(test_nbrs, 'ws50/test_nbrs.pt')
    torch.save(test_ji, 'ws50/test_ji.pt')

    print(f"Median of neighbour bound: {test_in_graph['nbrs_bounds'].median()}")
    print(f"Mean of neighbour bound: {test_in_graph['nbrs_bounds'].float().mean()}")

    test_loader = NeighborLoader(
        test_in_graph,
        num_neighbors=[-1, -1],
        batch_size=512,
        shuffle=False
    )

    val_labels = load_labels(config['VAL_LABELS'])
    print(f'num of labels: {val_labels.shape[0]}')
    print(f'#cls: {len(np.unique(val_labels))}')

    # Create Dataset
    batch_size = config['BATCH_SIZE']
    train_dataset = FeatureDataset(train_features, train_labels)
    sampler = samplers.MPerClassSampler(labels=train_labels,
                                        m=config['SAMPLES_PER_CLASS'],
                                        batch_size=batch_size,
                                        length_before_new_iter=100000)
    dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)

    # Create model
    model = GCN(in_dim=feat_dim, hidden_dim=config['HIDDEN_DIM'], out_dim=config['OUT_DIM'], dropout=config['DROPOUT'])
    print(f'GCN model:\n {model}\n')

    # Checkpoint dir
    model_dir = os.path.join(config['CHECKPOINT_DIR'], config['MODEL_NAME'])
    os.makedirs(model_dir, exist_ok=True)
    tb_writer = SummaryWriter(log_dir=model_dir)  # init tb writer
    shutil.copy(config_path, model_dir)

    # Run training
    epochs = config['EPOCHS']
    lr = config['LR']
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=config['WEIGHT_DECAY'])
    # The scheduler is stepped once per epoch below, so the one-cycle schedule
    # must span exactly `epochs` steps; sizing it per batch would leave the
    # learning rate stuck near its starting value.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                                    max_lr=lr,
                                                    total_steps=epochs,
                                                    div_factor=10)
    model = model.to(device)

    eval_interval = 8
    # Explicit tau grid 0.20, 0.25, ..., 0.70 (both ends included); avoids the
    # rounding-dependent endpoint of a float-stepped range.
    taus = [round(0.2 + 0.05 * step, 2) for step in range(11)]
    best_fp = -np.inf

    try:
        # Epochs are numbered 1..EPOCHS inclusive.
        for epoch in range(1, epochs + 1):
            model.train()
            epoch_loss, modularity, neg_pair_loss = train(model, optimizer, dataloader, config, device)
            scheduler.step()

            # Add to tensorboard
            tb_writer.add_scalar('Train/Loss', epoch_loss, epoch)
            tb_writer.add_scalar('Train/Modularity', modularity, epoch)
            tb_writer.add_scalar('Train/NegPairLoss', neg_pair_loss, epoch)

            # Evaluate
            if epoch % eval_interval != 0:
                continue

            print(f'Epoch: {epoch:05d}, Loss: {epoch_loss:.4f}')

            model.eval()
            with torch.no_grad():
                out = _embed_test_nodes(model, test_in_graph, test_loader, config['OUT_DIM'], device)
                best_metrics = _search_best_tau(out, test_in_graph, test_nbrs, test_ji,
                                                val_labels, config, device, taus)

            if best_metrics is None:
                # Every tau produced a NaN score; nothing to log or checkpoint.
                print('No valid clustering result for this evaluation; skipping.')
                torch.cuda.empty_cache()
                continue

            curr_tau, avg_pre_p, avg_rec_p, fscore_p, avg_pre_b, avg_rec_b, fscore_b = best_metrics

            tb_writer.add_scalar('Val_Metrics/best_tau', curr_tau, epoch)

            print('#pairwise: avg_pre:{:.4f}, avg_rec:{:.4f}, fscore:{:.4f}'.format(avg_pre_p, avg_rec_p, fscore_p))
            tb_writer.add_scalar('Val_Metrics/precision_p', avg_pre_p, epoch)
            tb_writer.add_scalar('Val_Metrics/recall_p', avg_rec_p, epoch)
            tb_writer.add_scalar('Val_Metrics/Fp', fscore_p, epoch)

            print('#bcubed: avg_pre:{:.4f}, avg_rec:{:.4f}, fscore:{:.4f}'.format(avg_pre_b, avg_rec_b, fscore_b))
            tb_writer.add_scalar('Val_Metrics/precision_b', avg_pre_b, epoch)
            tb_writer.add_scalar('Val_Metrics/recall_b', avg_rec_b, epoch)
            tb_writer.add_scalar('Val_Metrics/Fb', fscore_b, epoch)

            # Save the best checkpoint
            if fscore_p > best_fp:
                best_fp = fscore_p
                print(f'\nNew best epoch: {epoch}\n')
                new_best_path = os.path.join(model_dir, f'model_best-{epoch}.pth')
                stale_paths = glob.glob(os.path.join(model_dir, 'model_best-*.pth'))
                # Write the new checkpoint before deleting older ones so a
                # failed save never leaves the directory without a best model.
                torch.save(model.state_dict(), new_best_path)
                for stale_path in stale_paths:
                    if stale_path != new_best_path:
                        os.remove(stale_path)

            # free up gpu memory
            torch.cuda.empty_cache()
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        tb_writer.close()

if __name__ == '__main__':
    # Command-line interface: config location and an opt-out switch for CUDA.
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default='configs/train.yaml',
                        help='Path of the config file')
    parser.add_argument("--no-cuda", action='store_true',
                        help='Do not use GPU resources')
    args = parser.parse_args()

    # Use the GPU only when it was not disabled and CUDA is actually available.
    use_cuda = (not args.no_cuda) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    main(args.config, device)