diff --git a/mlpf/data_cms/genjob_pu55to75.sh b/mlpf/data_cms/genjob_pu55to75.sh index 54ed7c166..75a5866b7 100755 --- a/mlpf/data_cms/genjob_pu55to75.sh +++ b/mlpf/data_cms/genjob_pu55to75.sh @@ -75,11 +75,10 @@ ls -lrt echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py cmsRun step2_phase1_new.py > /dev/null cmsRun step3_phase1_new.py > /dev/null -#cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root -# python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ -# bzip2 -z pfntuple_${SEED}.pkl -# cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ +python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ +bzip2 -z pfntuple_${SEED}.pkl +cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ #copy ROOT outputs #cp step2_phase1_new.root $OUTDIR/$SAMPLE/root/step2_${SEED}.root diff --git a/mlpf/data_cms/postprocessing_jobs.py b/mlpf/data_cms/postprocessing_jobs.py index 685470a3d..6de4866d9 100644 --- a/mlpf/data_cms/postprocessing_jobs.py +++ b/mlpf/data_cms/postprocessing_jobs.py @@ -33,14 +33,16 @@ def write_script(infiles, outfiles): samples = [ - "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi", + "/local/joosep/mlpf/cms/20240823_simcluster/nopu/TTbar_14TeV_TuneCUETP8M1_cfi", + # "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi", + # "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/QCDForPF_14TeV_TuneCUETP8M1_cfi", ] ichunk = 1 for sample in samples: infiles = list(glob.glob(f"{sample}/root/pfntuple*.root")) for infiles_chunk in chunks(infiles, 10): - outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw2/") for inf in infiles_chunk] + outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw/") for inf in infiles_chunk] os.makedirs(os.path.dirname(outfiles_chunk[0]), exist_ok=True) scr = write_script(infiles_chunk, outfiles_chunk) ofname = f"jobscripts/postproc_{ichunk}.sh" diff --git a/mlpf/data_cms/prepare_args.py b/mlpf/data_cms/prepare_args.py index a07892e56..2ab663e01 100644 --- a/mlpf/data_cms/prepare_args.py +++ b/mlpf/data_cms/prepare_args.py @@ -6,15 +6,15 @@ outdir = "/local/joosep/mlpf/cms/20240823_simcluster" samples = [ -# ("TTbar_14TeV_TuneCUETP8M1_cfi", 105000, 110010, "genjob_pu55to75.sh", outdir + "/pu55to75"), + ("TTbar_14TeV_TuneCUETP8M1_cfi", 100000, 120010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 200000, 220010, "genjob_pu55to75.sh", outdir + "/pu55to75"), - ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 305000, "genjob_pu55to75.sh", outdir + "/pu55to75"), + ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 320010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", 500000, 520010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("ZpTT_1500_14TeV_TuneCP5_cfi", 600000, 620010, "genjob_pu55to75.sh", outdir + "/pu55to75"), -# ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 720010, "genjob_pu55to75.sh", outdir + "/pu55to75"), +# ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 705010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("TTbar_14TeV_TuneCUETP8M1_cfi", 702000, 705000, "genjob_nopu.sh", outdir + "/nopu"), -# ("MultiParticlePFGun50_cfi", 800000, 820000, "genjob_nopu.sh", outdir + "/nopu"), +# ("MultiParticlePFGun50_cfi", 800000, 805000, "genjob_nopu.sh", outdir + "/nopu"), # ("VBF_TuneCP5_14TeV_pythia8_cfi", 900000, 920010, 
"genjob_nopu.sh", outdir + "/nopu"), # ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 1000000,1020010, "genjob_nopu.sh", outdir + "/nopu"), @@ -36,6 +36,6 @@ os.makedirs(this_outdir + "/" + samp + "/root", exist_ok=True) for seed in range(seed0, seed1): - p = this_outdir + "/" + samp + "/raw2/pfntuple_{}.pkl.bz2".format(seed) + p = this_outdir + "/" + samp + "/root/pfntuple_{}.root".format(seed) if not os.path.isfile(p): print(f"sbatch --mem-per-cpu 8G --partition main --time 20:00:00 --cpus-per-task 1 scripts/tallinn/cmssw-el8.sh mlpf/data_cms/{script} {samp} {seed}") diff --git a/mlpf/heptfds/cms_pf/qcd.py b/mlpf/heptfds/cms_pf/qcd.py index f772bead8..39ee9d912 100644 --- a/mlpf/heptfds/cms_pf/qcd.py +++ b/mlpf/heptfds/cms_pf/qcd.py @@ -21,7 +21,7 @@ class CmsPfQcd(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_qcd dataset.""" - VERSION = tfds.core.Version("2.1.0") + VERSION = tfds.core.Version("2.3.0") RELEASE_NOTES = { "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", "1.3.1": "Remove PS again", @@ -33,6 +33,7 @@ class CmsPfQcd(tfds.core.GeneratorBasedBuilder): "1.7.1": "Increase stats to 400k events", "2.0.0": "New truth def based primarily on CaloParticles", "2.1.0": "Additional stats", + "2.3.0": "Split CaloParticles along tracks", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_qcd ~/tensorflow_datasets/ diff --git a/mlpf/heptfds/cms_pf/ttbar.py b/mlpf/heptfds/cms_pf/ttbar.py index 077ed97fd..82369f9bc 100644 --- a/mlpf/heptfds/cms_pf/ttbar.py +++ b/mlpf/heptfds/cms_pf/ttbar.py @@ -21,7 +21,7 @@ class CmsPfTtbar(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf dataset.""" - VERSION = tfds.core.Version("2.2.0") + VERSION = tfds.core.Version("2.3.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", @@ -38,6 +38,7 @@ class CmsPfTtbar(tfds.core.GeneratorBasedBuilder): "2.0.0": "New truth def based primarily on CaloParticles", "2.1.0": "Additional stats", "2.2.0": "Split CaloParticles along tracks", + "2.3.0": "Increase stats", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ttbar ~/tensorflow_datasets/ diff --git a/mlpf/jet_utils.py b/mlpf/jet_utils.py index 3a6d58616..8704aa26e 100644 --- a/mlpf/jet_utils.py +++ b/mlpf/jet_utils.py @@ -1,24 +1,24 @@ import numpy as np -import numba +# import numba import awkward import vector -@numba.njit +# @numba.njit def deltaphi(phi1, phi2): diff = phi1 - phi2 return np.arctan2(np.sin(diff), np.cos(diff)) -@numba.njit +# @numba.njit def deltar(eta1, phi1, eta2, phi2): deta = eta1 - eta2 dphi = deltaphi(phi1, phi2) return np.sqrt(deta**2 + dphi**2) -@numba.njit +# @numba.njit def match_jets(jets1, jets2, deltaR_cut): iev = len(jets1) jet_inds_1_ev = [] diff --git a/mlpf/plotting/plot_utils.py b/mlpf/plotting/plot_utils.py index 25875c0d7..c1ec189ff 100644 --- a/mlpf/plotting/plot_utils.py +++ b/mlpf/plotting/plot_utils.py @@ -530,6 +530,16 @@ def plot_jets(yvals, epoch=None, cp_dir=None, comet_experiment=None, title=None, plt.figure() b = np.linspace(0, 1000, 100) + + pt = awkward.to_numpy(awkward.flatten(yvals["jets_target_pt"])) + plt.hist( + pt, + bins=b, + histtype="step", + lw=2, + label="Target", + ) + pt = awkward.to_numpy(awkward.flatten(yvals["jets_cand_pt"])) plt.hist( pt, @@ -580,6 +590,67 @@ def plot_jets(yvals, epoch=None, cp_dir=None, comet_experiment=None, title=None, ) + + 
plt.figure() + b = np.linspace(-5, 5, 100) + eta = awkward.to_numpy(awkward.flatten(yvals["jets_target_eta"])) + plt.hist( + eta, + bins=b, + histtype="step", + lw=2, + label="Target", + ) + + eta = awkward.to_numpy(awkward.flatten(yvals["jets_cand_eta"])) + plt.hist( + eta, + bins=b, + histtype="step", + lw=2, + label="PF", + ) + + eta = awkward.to_numpy(awkward.flatten(yvals["jets_pred_eta"])) + plt.hist( + eta, + bins=b, + histtype="step", + lw=2, + label="MLPF", + ) + + eta = awkward.to_numpy(awkward.flatten(yvals["jets_gen_eta"])) + plt.hist( + eta, + bins=b, + histtype="step", + lw=2, + label="Truth", + ) + + plt.xlabel("jet $\eta$") + plt.ylabel("Jets / bin") + plt.yscale("log") + plt.legend(loc="best") + if title: + plt.title(title) + ax = plt.gca() + ylim = ax.get_ylim() + ax.set_ylim(ylim[0], 10 * ylim[1]) + + if dataset: + EXPERIMENT_LABELS[dataset](ax) + if sample: + sample_label(ax, sample) + + save_img( + "jet_eta.png", + epoch, + cp_dir=cp_dir, + comet_experiment=comet_experiment, + ) + def plot_jet_ratio( yvals, epoch=None, diff --git a/mlpf/pyg/PFDataset.py b/mlpf/pyg/PFDataset.py index 22bcec6a4..410756457 100644 --- a/mlpf/pyg/PFDataset.py +++ b/mlpf/pyg/PFDataset.py @@ -70,6 +70,23 @@ def __getitem__(self, item): ret["ygen"][:, 0][(ret["X"][:, 0] == 10) & (ret["ygen"][:, 0] == 7)] = 2 ret["ygen"][:, 0][(ret["X"][:, 0] == 11) & (ret["ygen"][:, 0] == 7)] = 2 + # set pt for HO which would otherwise be 0 + msk_ho = ret["X"][:, 0] == 10 + eta = ret["X"][:, 2][msk_ho] + e = ret["X"][:, 5][msk_ho] + ret["X"][:, 1][msk_ho] = np.sqrt(e**2 - (np.tanh(eta) * e) ** 2) + + # transform pt -> log(pt / elem pt), same for energy + ret["ygen"][:, 6] = np.log(ret["ygen"][:, 6] / ret["X"][:, 5]) + ret["ygen"][:, 6][np.isnan(ret["ygen"][:, 6])] = 0.0 + ret["ygen"][:, 6][np.isinf(ret["ygen"][:, 6])] = 0.0 + ret["ygen"][:, 6][ret["ygen"][:, 0] == 0] = 0 + + ret["ygen"][:, 2] = np.log(ret["ygen"][:, 2] / ret["X"][:, 1]) + ret["ygen"][:, 2][np.isnan(ret["ygen"][:, 2])] = 0.0 + ret["ygen"][:, 2][np.isinf(ret["ygen"][:, 2])] = 0.0 + ret["ygen"][:, 2][ret["ygen"][:, 0] == 0] = 0 + return ret def __len__(self): @@ -214,10 +231,14 @@ def get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray): dataset.append(ds) dataset = torch.utils.data.ConcatDataset(dataset) + shuffle = split == "train" if world_size > 1: - sampler = torch.utils.data.distributed.DistributedSampler(dataset) + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=shuffle) else: - sampler = torch.utils.data.SequentialSampler(dataset) + if shuffle: + sampler = torch.utils.data.RandomSampler(dataset) + else: + sampler = torch.utils.data.SequentialSampler(dataset) # build dataloaders batch_size = config[f"{split}_dataset"][config["dataset"]][type_]["batch_size"] * config["gpu_batch_multiplier"] diff --git a/mlpf/pyg/inference.py b/mlpf/pyg/inference.py index 8d874c025..65258ac9d 100644 --- a/mlpf/pyg/inference.py +++ b/mlpf/pyg/inference.py @@ -42,13 +42,26 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_m batch = batch.to(rank) ypred = model(batch.X, batch.mask) + # transform log (pt/elempt) -> pt + pred_cls = torch.argmax(ypred[0], axis=-1) + ypred[2][..., 0] = torch.exp(ypred[2][..., 0]) * batch.X[..., 1] + batch.ygen[..., 2] = torch.exp(batch.ygen[..., 2]) * batch.X[..., 1] + + # transform log (E/elemE) -> E + ypred[2][..., 4] = torch.exp(ypred[2][..., 4]) * batch.X[..., 5] + batch.ygen[..., 6] = torch.exp(batch.ygen[..., 6]) * batch.X[..., 5] + + ypred[2][..., 
0][pred_cls == 0] = 0 + ypred[2][..., 4][pred_cls == 0] = 0 + batch.ygen[..., 2][batch.ygen[..., 0] == 0] = 0 + batch.ygen[..., 6][batch.ygen[..., 0] == 0] = 0 + # convert all outputs to float32 in case running in float16 or bfloat16 ypred = tuple([y.to(torch.float32) for y in ypred]) - ygen = unpack_target(batch.ygen.to(torch.float32)) - ycand = unpack_target(batch.ycand.to(torch.float32)) + ygen = unpack_target(batch.ygen.to(torch.float32), model) + ycand = unpack_target(batch.ycand.to(torch.float32), model) ypred = unpack_predictions(ypred) - genjets_msk = batch.genjets[:, :, 0].cpu() != 0 genjets = awkward.unflatten(batch.genjets.cpu().to(torch.float64)[genjets_msk], torch.sum(genjets_msk, axis=1)) genjets = vector.awk( @@ -79,15 +92,18 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_m jets_coll = {} for typ, ydata in zip(["cand", "target"], [ycand, ygen]): clsid = awkward.unflatten(ydata["cls_id"], counts) + pt = awkward.unflatten(ydata["pt"], counts) + eta = awkward.unflatten(ydata["eta"], counts) + phi = awkward.unflatten(ydata["phi"], counts) + e = awkward.unflatten(ydata["energy"], counts) msk = clsid != 0 - p4 = awkward.unflatten(ydata["p4"], counts) vec = vector.awk( awkward.zip( { - "pt": p4[msk][:, :, 0], - "eta": p4[msk][:, :, 1], - "phi": p4[msk][:, :, 2], - "e": p4[msk][:, :, 3], + "pt": pt[msk], + "eta": eta[msk], + "phi": phi[msk], + "e": e[msk], } ) ) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index ccf4523a5..a18a2fde6 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -3,8 +3,57 @@ from .gnn_lsh import CombinedGraphLayer -from torch.nn.attention import SDPBackend, sdpa_kernel from pyg.logger import _logger +import math +import numpy as np + + +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + # From https://github.com/rwightman/pytorch-image-models/blob/ + # 18ec173f95aa220af753358bf860b16b6691edb2/timm/layers/weight_init.py#L8 + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + lo = norm_cdf((a - mean) / std) + up = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2lo-1, 2up-1]. 
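+        # (uniform_ below draws u ~ U[2*lo-1, 2*up-1]; erfinv_ then inverts the
+        # standard normal CDF Phi(x) = (1 + erf(x/sqrt(2)))/2, so the result is a
+        # standard normal truncated to [(a-mean)/std, (b-mean)/std] before the
+        # rescaling by std and shift by mean below.)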
+ tensor.uniform_(2 * lo - 1, 2 * up - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor def get_activation(activation): @@ -21,65 +70,10 @@ def get_activation(activation): return act -class SelfAttentionLayer(nn.Module): - def __init__( - self, - activation="elu", - embedding_dim=128, - num_heads=2, - width=128, - dropout_mha=0.1, - dropout_ff=0.1, - attention_type="efficient", - ): - super(SelfAttentionLayer, self).__init__() - - # to enable manual override for ONNX export - self.enable_ctx_manager = True - - self.attention_type = attention_type - self.act = get_activation(activation) - if self.attention_type == "flash_external": - from flash_attn.modules.mha import MHA - - self.mha = MHA(embedding_dim, num_heads, dropout=dropout_mha) - else: - self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) - self.norm0 = torch.nn.LayerNorm(embedding_dim) - self.norm1 = torch.nn.LayerNorm(embedding_dim) - self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act()) - self.dropout = torch.nn.Dropout(dropout_ff) - _logger.info("using attention_type={}".format(attention_type)) - # params for torch sdp_kernel - self.attn_params = { - "math": [SDPBackend.MATH], - "efficient": [SDPBackend.EFFICIENT_ATTENTION], - "flash": [SDPBackend.FLASH_ATTENTION], - } - - def forward(self, x, mask): - # explicitly call the desired attention mechanism - if self.attention_type == "flash_external": - mha_out = self.mha(x) - else: - if self.enable_ctx_manager: - with sdpa_kernel(self.attn_params[self.attention_type]): - mha_out = self.mha(x, x, x, need_weights=False)[0] - else: - mha_out = self.mha(x, x, x, need_weights=False)[0] - - x = x + mha_out - x = self.norm0(x) - x = x + self.seq(x) - x = self.norm1(x) - x = self.dropout(x) - x = x * mask.unsqueeze(-1) - return x - - class PreLnSelfAttentionLayer(nn.Module): def __init__( self, + name="", activation="elu", embedding_dim=128, num_heads=2, @@ -87,74 +81,80 @@ def __init__( dropout_mha=0.1, dropout_ff=0.1, attention_type="efficient", + learnable_queries=False, + elems_as_queries=False, ): super(PreLnSelfAttentionLayer, self).__init__() + self.name = name - # to enable manual override for ONNX export + # set to False to enable manual override for ONNX export self.enable_ctx_manager = True self.attention_type = attention_type self.act = get_activation(activation) - if self.attention_type == "flash_external": - from flash_attn.modules.mha import MHA - - self.mha = MHA(embedding_dim, num_heads, dropout=dropout_mha) - else: - self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) + self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) self.norm0 = torch.nn.LayerNorm(embedding_dim) self.norm1 = torch.nn.LayerNorm(embedding_dim) self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act()) self.dropout = torch.nn.Dropout(dropout_ff) _logger.info("using attention_type={}".format(attention_type)) # params for torch sdp_kernel - self.attn_params = { - "math": [SDPBackend.MATH], - "efficient": [SDPBackend.EFFICIENT_ATTENTION], - "flash": [SDPBackend.FLASH_ATTENTION], - 
}
-
-    def forward(self, x, mask):
-        x = self.norm0(x)
-
-        # explicitly call the desired attention mechanism
-        if self.attention_type == "flash_external":
-            mha_out = self.mha(x)
+        if self.enable_ctx_manager:
+            from torch.nn.attention import SDPBackend, sdpa_kernel
+
+            self.attn_params = {
+                "math": [SDPBackend.MATH],
+                "efficient": [SDPBackend.EFFICIENT_ATTENTION],
+                "flash": [SDPBackend.FLASH_ATTENTION],
+            }
+
+        self.learnable_queries = learnable_queries
+        self.elems_as_queries = elems_as_queries
+        if self.learnable_queries:
+            self.queries = nn.Parameter(torch.zeros(1, 1, embedding_dim), requires_grad=True)
+            trunc_normal_(self.queries, std=0.02)
+
+        self.save_attention = False
+        self.outdir = ""
+
+    def forward(self, x, mask, initial_embedding):
+        mask_ = mask.unsqueeze(-1)
+        x = self.norm0(x * mask_)
+
+        q = x
+        if self.learnable_queries:
+            q = self.queries.expand(*x.shape) * mask_
+        elif self.elems_as_queries:
+            q = initial_embedding * mask_
+
+        key_padding_mask = None
+        if self.attention_type == "math":
+            key_padding_mask = ~mask
+
+        # default path, for FlashAttn/Math backend
+        if self.enable_ctx_manager:
+            # re-import locally: the import in __init__ is not visible in this scope
+            from torch.nn.attention import sdpa_kernel
+
+            with sdpa_kernel(self.attn_params[self.attention_type]):
+                mha_out = self.mha(q, x, x, need_weights=False, key_padding_mask=key_padding_mask)[0]
+
+            if self.save_attention:
+                att_mat = self.mha(q, x, x, need_weights=True, key_padding_mask=key_padding_mask)[1]
+                att_mat = att_mat.detach().cpu().numpy()
+                np.savez(
+                    open("{}/attn_{}.npz".format(self.outdir, self.name), "wb"),
+                    att=att_mat,
+                    in_proj_weight=self.mha.in_proj_weight.detach().cpu().numpy(),
+                )
+
+        # path for ONNX export
         else:
-            if self.enable_ctx_manager:
-                with sdpa_kernel(self.attn_params[self.attention_type]):
-                    mha_out = self.mha(x, x, x, need_weights=False)[0]
-            else:
-                mha_out = self.mha(x, x, x, need_weights=False)[0]
+            mha_out = self.mha(q, x, x, need_weights=False, key_padding_mask=key_padding_mask)[0]
+
+        mha_out = mha_out * mask_
         mha_out = x + mha_out
         x = self.norm1(mha_out)
         x = mha_out + self.seq(x)
         x = self.dropout(x)
-        x = x * mask.unsqueeze(-1)
-        return x
-
-
-class MambaLayer(nn.Module):
-    def __init__(self, activation="elu", embedding_dim=128, width=128, d_state=16, d_conv=4, expand=2, dropout=0.1):
-        super(MambaLayer, self).__init__()
-        self.act = get_activation(activation)
-        from mamba_ssm import Mamba
-
-        self.mamba = Mamba(
-            d_model=embedding_dim,
-            d_state=d_state,
-            d_conv=d_conv,
-            expand=expand,
-        )
-        self.norm0 = torch.nn.LayerNorm(embedding_dim)
-        self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act())
-        self.dropout = torch.nn.Dropout(dropout)
-
-    def forward(self, x, mask):
-        x = self.mamba(x)
-        x = self.norm0(x + self.seq(x))
-        x = self.dropout(x)
-        x = x * (~mask.unsqueeze(-1))
+        x = x * mask_
         return x
@@ -177,6 +177,12 @@ def __init__(self, mode, embed_dim, width, act, dropout, elemtypes):
         # single output
         if self.mode == "direct" or self.mode == "additive" or self.mode == "multiplicative":
             self.nn = ffn(embed_dim, 1, width, act, dropout)
+        elif self.mode == "direct-elemtype":
+            self.nn = ffn(embed_dim, len(self.elemtypes), width, act, dropout)
+        elif self.mode == "direct-elemtype-split":
+            self.nn = nn.ModuleList()
+            for elem in range(len(self.elemtypes)):
+                self.nn.append(ffn(embed_dim, 1, width, act, dropout))
         # two outputs
         elif self.mode == "linear":
             self.nn = ffn(embed_dim, 2, width, act, dropout)
@@ -185,10 +191,21 @@ def __init__(self, mode, embed_dim, width, act, dropout, elemtypes):
             self.nn2 = ffn(embed_dim, 
len(self.elemtypes), width, act, dropout) def forward(self, elems, x, orig_value): - if self.mode == "direct": nn_out = self.nn(x) return nn_out + elif self.mode == "direct-elemtype": + nn_out = self.nn(x) + elemtype_mask = torch.cat([elems[..., 0:1] == elemtype for elemtype in self.elemtypes], axis=-1) + nn_out = torch.sum(elemtype_mask * nn_out, axis=-1, keepdims=True) + return nn_out + elif self.mode == "direct-elemtype-split": + elem_outs = [] + for elem in range(len(self.elemtypes)): + elem_outs.append(self.nn[elem](x)) + elemtype_mask = torch.cat([elems[..., 0:1] == elemtype for elemtype in self.elemtypes], axis=-1) + elem_outs = torch.cat(elem_outs, axis=-1) + return torch.sum(elem_outs * elemtype_mask, axis=-1, keepdims=True) elif self.mode == "additive": nn_out = self.nn(x) return orig_value + nn_out @@ -220,11 +237,11 @@ def __init__( layernorm=True, conv_type="attention", input_encoding="joint", - pt_mode="additive-elemtype", - eta_mode="additive-elemtype", - sin_phi_mode="additive-elemtype", - cos_phi_mode="additive-elemtype", - energy_mode="additive-elemtype", + pt_mode="linear", + eta_mode="linear", + sin_phi_mode="linear", + cos_phi_mode="linear", + energy_mode="linear", # element types which actually exist in the dataset elemtypes_nonzero=[1, 4, 5, 6, 8, 9, 10, 11], # should the conv layer outputs be concatted (concat) or take the last (last) @@ -245,10 +262,6 @@ def __init__( dropout_conv_id_mha=0.0, dropout_conv_id_ff=0.0, use_pre_layernorm=False, - # mamba specific parameters - d_state=16, - d_conv=4, - expand=2, ): super(MLPF, self).__init__() @@ -273,7 +286,7 @@ def __init__( width = num_heads * head_dim # embedding of the inputs - if num_convs != 0: + if self.num_convs != 0: if self.input_encoding == "joint": self.nn0_id = ffn(self.input_dim, embedding_dim, width, self.act, dropout_ff) self.nn0_reg = ffn(self.input_dim, embedding_dim, width, self.act, dropout_ff) @@ -289,11 +302,11 @@ def __init__( self.conv_id = nn.ModuleList() self.conv_reg = nn.ModuleList() - attention_layer = PreLnSelfAttentionLayer if self.use_pre_layernorm else SelfAttentionLayer - - for i in range(num_convs): + for i in range(self.num_convs): + lastlayer = i == self.num_convs - 1 self.conv_id.append( - attention_layer( + PreLnSelfAttentionLayer( + name="conv_id_{}".format(i), activation=activation, embedding_dim=embedding_dim, num_heads=num_heads, @@ -301,10 +314,13 @@ def __init__( dropout_mha=dropout_conv_id_mha, dropout_ff=dropout_conv_id_ff, attention_type=attention_type, + elems_as_queries=lastlayer, + # learnable_queries=lastlayer, ) ) self.conv_reg.append( - attention_layer( + PreLnSelfAttentionLayer( + name="conv_reg_{}".format(i), activation=activation, embedding_dim=embedding_dim, num_heads=num_heads, @@ -312,18 +328,14 @@ def __init__( dropout_mha=dropout_conv_reg_mha, dropout_ff=dropout_conv_reg_ff, attention_type=attention_type, + elems_as_queries=lastlayer, + # learnable_queries=lastlayer, ) ) - elif self.conv_type == "mamba": - self.conv_id = nn.ModuleList() - self.conv_reg = nn.ModuleList() - for i in range(num_convs): - self.conv_id.append(MambaLayer(activation, embedding_dim, width, d_state, d_conv, expand, dropout_ff)) - self.conv_reg.append(MambaLayer(activation, embedding_dim, width, d_state, d_conv, expand, dropout_ff)) elif self.conv_type == "gnn_lsh": self.conv_id = nn.ModuleList() self.conv_reg = nn.ModuleList() - for i in range(num_convs): + for i in range(self.num_convs): gnn_conf = { "inout_dim": embedding_dim, "bin_size": self.bin_size, @@ -339,16 +351,16 @@ def 
__init__( self.conv_reg.append(CombinedGraphLayer(**gnn_conf)) if self.learned_representation_mode == "concat": - decoding_dim = self.input_dim + num_convs * embedding_dim + decoding_dim = self.num_convs * embedding_dim elif self.learned_representation_mode == "last": - decoding_dim = self.input_dim + embedding_dim + decoding_dim = embedding_dim # DNN that acts on the node level to predict the PID self.nn_binary_particle = ffn(decoding_dim, 2, width, self.act, dropout_ff) self.nn_pid = ffn(decoding_dim, num_classes, width, self.act, dropout_ff) # elementwise DNN for node momentum regression - embed_dim = decoding_dim + 2 + num_classes + embed_dim = decoding_dim self.nn_pt = RegressionOutput(pt_mode, embed_dim, width, self.act, dropout_ff, self.elemtypes_nonzero) self.nn_eta = RegressionOutput(eta_mode, embed_dim, width, self.act, dropout_ff, self.elemtypes_nonzero) self.nn_sin_phi = RegressionOutput(sin_phi_mode, embed_dim, width, self.act, dropout_ff, self.elemtypes_nonzero) @@ -379,18 +391,18 @@ def forward(self, X_features, mask): for num, conv in enumerate(self.conv_id): conv_input = embedding_id if num == 0 else embeddings_id[-1] - out_padded = conv(conv_input, mask) + out_padded = conv(conv_input, mask, embedding_id) embeddings_id.append(out_padded) for num, conv in enumerate(self.conv_reg): conv_input = embedding_reg if num == 0 else embeddings_reg[-1] - out_padded = conv(conv_input, mask) + out_padded = conv(conv_input, mask, embedding_reg) embeddings_reg.append(out_padded) # id input if self.learned_representation_mode == "concat": - final_embedding_id = torch.cat([Xfeat_normed] + embeddings_id, axis=-1) + final_embedding_id = torch.cat(embeddings_id, axis=-1) elif self.learned_representation_mode == "last": - final_embedding_id = torch.cat([Xfeat_normed] + [embeddings_id[-1]], axis=-1) + final_embedding_id = torch.cat([embeddings_id[-1]], axis=-1) if self.use_pre_layernorm: final_embedding_id = self.final_norm_id(final_embedding_id) @@ -402,9 +414,9 @@ def forward(self, X_features, mask): # regression input if self.learned_representation_mode == "concat": - final_embedding_reg = torch.cat([Xfeat_normed] + embeddings_reg + [preds_binary_particle.detach(), preds_pid.detach()], axis=-1) + final_embedding_reg = torch.cat(embeddings_reg, axis=-1) elif self.learned_representation_mode == "last": - final_embedding_reg = torch.cat([Xfeat_normed] + [embeddings_reg[-1]] + [preds_binary_particle.detach(), preds_pid.detach()], axis=-1) + final_embedding_reg = torch.cat([embeddings_reg[-1]], axis=-1) if self.use_pre_layernorm: final_embedding_reg = self.final_norm_reg(final_embedding_reg) @@ -414,7 +426,25 @@ def forward(self, X_features, mask): preds_eta = self.nn_eta(X_features, final_embedding_reg, X_features[..., 2:3]) preds_sin_phi = self.nn_sin_phi(X_features, final_embedding_reg, X_features[..., 3:4]) preds_cos_phi = self.nn_cos_phi(X_features, final_embedding_reg, X_features[..., 4:5]) - preds_energy = self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6]) - preds_momentum = torch.cat([preds_pt, preds_eta, preds_sin_phi, preds_cos_phi, preds_energy], axis=-1) + # ensure created particle has positive mass^2 by computing energy from pt and adding a positive-only correction + pt_real = torch.exp(preds_pt.detach()) * X_features[..., 1:2] + pz_real = pt_real * torch.sinh(preds_eta.detach()) + e_real = torch.log(torch.sqrt(pt_real**2 + pz_real**2) / X_features[..., 5:6]) + e_real[~mask] = 0 + e_real[torch.isinf(e_real)] = 0 + e_real[torch.isnan(e_real)] = 0 + preds_energy = 
e_real + torch.nn.functional.relu(self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6])) + preds_momentum = torch.cat([preds_pt, preds_eta, preds_sin_phi, preds_cos_phi, preds_energy], axis=-1) return preds_binary_particle, preds_pid, preds_momentum + + +def set_save_attention(model, outdir, save_attention): + if isinstance(model, torch.nn.parallel.DistributedDataParallel): + model = model.module + if model.conv_type == "attention": + for iconv in range(model.num_convs): + model.conv_id[iconv].outdir = outdir + model.conv_reg[iconv].outdir = outdir + model.conv_id[iconv].save_attention = save_attention + model.conv_reg[iconv].save_attention = save_attention diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 0100a9a72..677da2a16 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -15,6 +15,10 @@ import sklearn import sklearn.metrics import numpy as np +import pandas +import matplotlib +import matplotlib.pyplot as plt +import glob # comet needs to be imported before torch from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip @@ -44,6 +48,8 @@ import fastjet from pyg.inference import make_plots, run_predictions + +from pyg.mlpf import set_save_attention from pyg.mlpf import MLPF from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders from utils import create_comet_experiment @@ -91,41 +97,53 @@ def mlpf_loss(y, ypred, batch): ypred["cls_id_onehot"] = ypred["cls_id_onehot"].permute((0, 2, 1)) # binary loss for particle / no-particle classification - loss_binary_classification = 100 * loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) + # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) + loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") # compare the particle type, only for cases where there was a true particle - loss_pid_classification = 100 * loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) + loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) loss_pid_classification[y["cls_id"] == 0] *= 0 # compare particle momentum, only for cases where there was a true particle - loss_regression = 10 * torch.nn.functional.huber_loss(ypred["momentum"], y["momentum"], reduction="none") - loss_regression[y["cls_id"] == 0] *= 0 + loss_regression_pt = torch.nn.functional.mse_loss(ypred["pt"], y["pt"], reduction="none") + loss_regression_eta = 1e-2 * torch.nn.functional.mse_loss(ypred["eta"], y["eta"], reduction="none") + loss_regression_sin_phi = 1e-2 * torch.nn.functional.mse_loss(ypred["sin_phi"], y["sin_phi"], reduction="none") + loss_regression_cos_phi = 1e-2 * torch.nn.functional.mse_loss(ypred["cos_phi"], y["cos_phi"], reduction="none") + loss_regression_energy = torch.nn.functional.mse_loss(ypred["energy"], y["energy"], reduction="none") + + loss_regression_pt[y["cls_id"] == 0] *= 0 + loss_regression_eta[y["cls_id"] == 0] *= 0 + loss_regression_sin_phi[y["cls_id"] == 0] *= 0 + loss_regression_cos_phi[y["cls_id"] == 0] *= 0 + loss_regression_energy[y["cls_id"] == 0] *= 0 # set the loss to 0 on padded elements in the batch loss_binary_classification[batch.mask == 0] *= 0 loss_pid_classification[batch.mask == 0] *= 0 - loss_regression[batch.mask == 0] *= 0 + loss_regression_pt[batch.mask == 0] *= 0 + loss_regression_eta[batch.mask == 0] *= 0 + 
loss_regression_sin_phi[batch.mask == 0] *= 0
+    loss_regression_cos_phi[batch.mask == 0] *= 0
+    loss_regression_energy[batch.mask == 0] *= 0
+
+    # average over all target particles
+    loss["Regression_pt"] = loss_regression_pt.sum() / npart
+    loss["Regression_eta"] = loss_regression_eta.sum() / npart
+    loss["Regression_sin_phi"] = loss_regression_sin_phi.sum() / npart
+    loss["Regression_cos_phi"] = loss_regression_cos_phi.sum() / npart
+    loss["Regression_energy"] = loss_regression_energy.sum() / npart
 
     # average over all elements that were not padded
     loss["Classification_binary"] = loss_binary_classification.sum() / nelem
     loss["Classification"] = loss_pid_classification.sum() / nelem
 
-    # normalize loss with stddev to stabilize across batches with very different pt, E distributions
-    mom_normalizer = y["momentum"][y["cls_id"] != 0].std(axis=0)
-    reg_losses = loss_regression[y["cls_id"] != 0]
-
-    # average over all true particles
-    loss["Regression"] = (reg_losses / mom_normalizer).sum() / npart
-
     # in case we are using the 3D-padded mode, we can compute a few additional event-level monitoring losses
     msk_pred_particle = torch.unsqueeze(torch.argmax(ypred["cls_binary"].detach(), axis=1) != 0, axis=-1)
-    # pt * cos_phi
-    px = ypred["momentum"][..., 0:1].detach() * ypred["momentum"][..., 3:4].detach() * msk_pred_particle
-    # pt * sin_phi
-    py = ypred["momentum"][..., 0:1].detach() * ypred["momentum"][..., 2:3].detach() * msk_pred_particle
-    # sum across events
-    pred_met = torch.sum(px, axis=-2) ** 2 + torch.sum(py, axis=-2) ** 2
+    # compute predicted pt from model output
+    pred_pt = torch.unsqueeze(torch.exp(ypred["pt"].detach()) * batch.X[..., 1], axis=-1) * msk_pred_particle
+    pred_px = pred_pt * torch.unsqueeze(ypred["cos_phi"].detach(), axis=-1) * msk_pred_particle
+    pred_py = pred_pt * torch.unsqueeze(ypred["sin_phi"].detach(), axis=-1) * msk_pred_particle
+    # compute MET, sum across particle axis in event
+    pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2)
     loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean()
 
     was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze(
@@ -135,15 +153,31 @@
         axis=-1
     )
 
+    # standardize Wasserstein loss
     std = was_input_true[batch.mask].std(axis=0)
     loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(was_input_pred / std, was_input_true / std).mean()
 
-    loss["Total"] = loss["Classification_binary"] + loss["Classification"] + loss["Regression"]  # + 0.01 * loss["Sliced_Wasserstein_Loss"]
+    # this is the final loss to be optimized
+    loss["Total"] = (
+        loss["Classification_binary"]
+        + loss["Classification"]
+        + loss["Regression_pt"]
+        + loss["Regression_eta"]
+        + loss["Regression_sin_phi"]
+        + loss["Regression_cos_phi"]
+        + loss["Regression_energy"]
+    )
+
+    # store these separately but detached
     loss["Classification_binary"] = loss["Classification_binary"].detach()
     loss["Classification"] = loss["Classification"].detach()
-    loss["Regression"] = loss["Regression"].detach()
+    loss["Regression_pt"] = loss["Regression_pt"].detach()
+    loss["Regression_eta"] = loss["Regression_eta"].detach()
+    loss["Regression_sin_phi"] = loss["Regression_sin_phi"].detach()
+    loss["Regression_cos_phi"] = loss["Regression_cos_phi"].detach()
+    loss["Regression_energy"] = loss["Regression_energy"].detach()
     loss["Sliced_Wasserstein_Loss"] = loss["Sliced_Wasserstein_Loss"].detach()
+
     return loss 
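
(The MET term above is a monitoring quantity: it decodes the regressed
log(pt / elem_pt) back to pt, projects onto x/y with the regressed sin/cos phi,
and sums over the particle axis of each event. A standalone toy sketch of that
computation follows; the shapes and values are invented for illustration and do
not reflect the real batch layout.)

import torch

nbatch, nelem = 2, 4
elem_pt = torch.rand(nbatch, nelem) * 50
log_pt = torch.randn(nbatch, nelem) * 0.1             # model output: log(pt / elem_pt)
phi = torch.rand(nbatch, nelem) * 6.28
is_particle = torch.randint(0, 2, (nbatch, nelem, 1)).bool()  # predicted particle mask

pt = torch.unsqueeze(torch.exp(log_pt) * elem_pt, -1) * is_particle
px = pt * torch.unsqueeze(torch.cos(phi), -1)
py = pt * torch.unsqueeze(torch.sin(phi), -1)
# sum px, py across the particle axis of each event, then take the magnitude
met = torch.sqrt(torch.sum(px, axis=-2) ** 2 + torch.sum(py, axis=-2) ** 2)  # (nbatch, 1)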
@@ -238,6 +272,152 @@ def configure_model_trainable(model, trainable, is_training):
         model.eval()
 
 
+def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, outdir):
+    X = batch.X[batch.mask].cpu()
+    ygen_flat = batch.ygen[batch.mask].cpu()
+    ypred_binary = ypred_raw[0][batch.mask].detach().cpu()
+    ypred_binary_cls = torch.argmax(ypred_binary, axis=-1)
+    ypred_cls = ypred_raw[1][batch.mask].detach().cpu()
+    ypred_p4 = ypred_raw[2][batch.mask].detach().cpu()
+
+    arr = torch.concatenate(
+        [X, ygen_flat, ypred_binary, ypred_cls, ypred_p4],
+        axis=-1,
+    ).numpy()
+    df = pandas.DataFrame(arr)
+    df.to_parquet(f"{outdir}/batch0_epoch{epoch}.parquet")
+
+    if tensorboard_writer:
+        sig_prob = torch.softmax(ypred_binary, axis=-1)[:, 1].to(torch.float32)
+        for xcls in np.unique(X[:, 0]):
+            fig = plt.figure()
+            msk = X[:, 0] == xcls
+            egen = ygen_flat[msk & (ygen_flat[:, 0] != 0), 6]
+            epred = ypred_p4[msk & (ypred_binary_cls != 0), 4]
+            b = np.linspace(-4, 4, 100)
+            plt.hist(egen, bins=b, histtype="step")
+            plt.hist(epred, bins=b, histtype="step")
+            plt.xlabel("log [E/E_elem]")
+            tensorboard_writer.add_figure("energy_elemtype{}".format(int(xcls)), fig, global_step=epoch)
+
+            fig = plt.figure()
+            msk = X[:, 0] == xcls
+            pt_gen = ygen_flat[msk & (ygen_flat[:, 0] != 0), 2]
+            pt_pred = ypred_p4[msk & (ypred_binary_cls != 0), 0]
+            b = np.linspace(-4, 4, 100)
+            plt.hist(pt_gen, bins=b, histtype="step")
+            plt.hist(pt_pred, bins=b, histtype="step")
+            plt.xlabel("log [pt/pt_elem]")
+            tensorboard_writer.add_figure("pt_elemtype{}".format(int(xcls)), fig, global_step=epoch)
+
+            fig = plt.figure(figsize=(5, 5))
+            msk = (X[:, 0] == xcls) & (ygen_flat[:, 0] != 0) & (ypred_binary_cls != 0)
+            egen = ygen_flat[msk, 6]
+            epred = ypred_p4[msk, 4]
+            b = np.linspace(-4, 4, 100)
+            plt.hist2d(egen, epred, bins=b, cmap="Blues")
+            plt.plot([-4, 4], [-4, 4], color="black", ls="--")
+            plt.xlabel("log [E_gen/E_elem]")
+            plt.ylabel("log [E_pred/E_elem]")
+            tensorboard_writer.add_figure("energy_elemtype{}_corr".format(int(xcls)), fig, global_step=epoch)
+
+            fig = plt.figure(figsize=(5, 5))
+            msk = (X[:, 0] == xcls) & (ygen_flat[:, 0] != 0) & (ypred_binary_cls != 0)
+            pt_gen = ygen_flat[msk, 2]
+            pt_pred = ypred_p4[msk, 0]
+            b = np.linspace(-4, 4, 100)
+            plt.hist2d(pt_gen, pt_pred, bins=b, cmap="Blues")
+            plt.plot([-4, 4], [-4, 4], color="black", ls="--")
+            plt.xlabel("log [pt_gen/pt_elem]")
+            plt.ylabel("log [pt_pred/pt_elem]")
+            tensorboard_writer.add_figure("pt_elemtype{}_corr".format(int(xcls)), fig, global_step=epoch)
+
+            fig = plt.figure(figsize=(5, 5))
+            msk = (X[:, 0] == xcls) & (ygen_flat[:, 0] != 0) & (ypred_binary_cls != 0)
+            eta_gen = ygen_flat[msk, 3]
+            eta_pred = ypred_p4[msk, 1]
+            b = np.linspace(-6, 6, 100)
+            plt.hist2d(eta_gen, eta_pred, bins=b, cmap="Blues")
+            plt.plot([-6, 6], [-6, 6], color="black", ls="--")
+            plt.xlabel("eta_gen")
+            plt.ylabel("eta_pred")
+            tensorboard_writer.add_figure("eta_elemtype{}_corr".format(int(xcls)), fig, global_step=epoch)
+
+            fig = plt.figure(figsize=(5, 5))
+            msk = (X[:, 0] == xcls) & (ygen_flat[:, 0] != 0) & (ypred_binary_cls != 0)
+            sphi_gen = ygen_flat[msk, 4]
+            sphi_pred = ypred_p4[msk, 2]
+            b = np.linspace(-2, 2, 100)
+            plt.hist2d(sphi_gen, sphi_pred, bins=b, cmap="Blues")
+            plt.plot([-2, 2], [-2, 2], color="black", ls="--")
+            plt.xlabel("sin_phi_gen")
+            plt.ylabel("sin_phi_pred")
+            tensorboard_writer.add_figure("sphi_elemtype{}_corr".format(int(xcls)), fig, global_step=epoch)
+
+            fig = plt.figure(figsize=(5, 5))
+            msk = (X[:, 0] == xcls) & (ygen_flat[:, 0] != 
0) & (ypred_binary_cls != 0) + cphi_gen = ygen_flat[msk, 5] + cphi_pred = ypred_p4[msk, 3] + b = np.linspace(-2, 2, 100) + plt.hist2d(cphi_gen, cphi_pred, bins=b, cmap="Blues") + plt.plot([-2, 2], [-2, 2], color="black", ls="--") + plt.xlabel("cos_phi_gen") + plt.ylabel("cos_phi_pred") + tensorboard_writer.add_figure("cphi_elemtype{}_corr".format(int(xcls)), fig, global_step=epoch) + + fig = plt.figure() + msk = X[:, 0] == xcls + b = np.linspace(0, 1, 100) + plt.hist(sig_prob[msk & (ygen_flat[:, 0] == 0)], bins=b, histtype="step") + plt.hist(sig_prob[msk & (ygen_flat[:, 0] != 0)], bins=b, histtype="step") + plt.xlabel("particle proba") + tensorboard_writer.add_figure("sig_proba_elemtype{}".format(int(xcls)), fig, global_step=epoch) + + tensorboard_writer.add_histogram("pt_target", torch.clamp(batch.ygen[batch.mask][:, 2], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("pt_pred", torch.clamp(ypred_raw[2][batch.mask][:, 0], -10, 10), global_step=epoch) + ratio = (ypred_raw[2][batch.mask][:, 0] / batch.ygen[batch.mask][:, 2])[batch.ygen[batch.mask][:, 0] != 0] + tensorboard_writer.add_histogram("pt_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) + + tensorboard_writer.add_histogram("eta_target", torch.clamp(batch.ygen[batch.mask][:, 3], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("eta_pred", torch.clamp(ypred_raw[2][batch.mask][:, 1], -10, 10), global_step=epoch) + ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] + tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) + + tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) + ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] + tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) + + tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) + ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] + tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) + + tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) + ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] + tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) + + for attn in sorted(list(glob.glob(f"{outdir}/attn_conv_*.npz"))): + attn_name = os.path.basename(attn).split(".")[0] + attn_matrix = np.load(attn)["att"] + batch_size = min(attn_matrix.shape[0], 8) + fig, axes = plt.subplots(1, batch_size, figsize=((batch_size * 3, 1 * 3))) + if isinstance(axes, matplotlib.axes._axes.Axes): + axes = [axes] + for ibatch in range(batch_size): + plt.sca(axes[ibatch]) + print(attn_matrix[ibatch]) + # plot the attention matrix of the first event in the batch + plt.imshow(attn_matrix[ibatch].T, cmap="Blues", 
norm=matplotlib.colors.LogNorm()) + plt.xticks([]) + plt.yticks([]) + plt.colorbar() + plt.title("event {}, m={:.2E}".format(ibatch, np.mean(attn_matrix[ibatch][attn_matrix[ibatch] > 0]))) + plt.suptitle(attn_name) + tensorboard_writer.add_figure(attn_name, fig, global_step=epoch) + + def train_and_valid( rank, world_size, @@ -255,6 +435,7 @@ def train_and_valid( val_freq=None, dtype=torch.float32, tensorboard_writer=None, + save_attention=False, ): """ Performs training over a given epoch. Will run a validation step every N_STEPS and after the last training batch. @@ -289,21 +470,25 @@ def train_and_valid( cm_id = np.zeros((13, 13)) for itrain, batch in iterator: + set_save_attention(model, outdir, False) batch = batch.to(rank, non_blocking=True) - ygen = unpack_target(batch.ygen) + ygen = unpack_target(batch.ygen, model) num_elems = batch.X[batch.mask].shape[0] num_batch = batch.X.shape[0] with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): if is_train: - ypred = model(batch.X, batch.mask) + ypred_raw = model(batch.X, batch.mask) else: with torch.no_grad(): - ypred = model(batch.X, batch.mask) + # save some attention matrices + if save_attention and (rank == 0 or rank == "cpu") and itrain == 0: + set_save_attention(model, outdir, True) + ypred_raw = model(batch.X, batch.mask) - ypred = unpack_predictions(ypred) + ypred = unpack_predictions(ypred_raw) if not is_train: cm_X_gen += sklearn.metrics.confusion_matrix( @@ -315,6 +500,9 @@ def train_and_valid( cm_id += sklearn.metrics.confusion_matrix( ygen["cls_id"][batch.mask].detach().cpu().numpy(), ypred["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13) ) + # save the events of the first validation batch for quick checks + if (rank == 0 or rank == "cpu") and itrain == 0: + validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, outdir) with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): if is_train: @@ -349,10 +537,6 @@ def train_and_valid( loss_accum = 0.0 extra_state = {"step": step, "lr_schedule_state_dict": lr_schedule.state_dict()} - torch.save( - {"model_state_dict": get_model_state_dict(model), "optimizer_state_dict": optimizer.state_dict()}, - f"{outdir}/step_weights.pth", - ) save_checkpoint(f"{outdir}/step_weights.pth", model, optimizer, extra_state) if not (comet_experiment is None) and (itrain % comet_step_freq == 0): @@ -389,11 +573,19 @@ def train_and_valid( ) intermediate_metrics = dict( loss=intermediate_losses_t["Total"], - reg_loss=intermediate_losses_t["Regression"], + reg_pt_loss=intermediate_losses_t["Regression_pt"], + reg_eta_loss=intermediate_losses_t["Regression_eta"], + reg_sin_phi_loss=intermediate_losses_t["Regression_sin_phi"], + reg_cos_phi_loss=intermediate_losses_t["Regression_cos_phi"], + reg_energy_loss=intermediate_losses_t["Regression_energy"], cls_loss=intermediate_losses_t["Classification"], cls_binary_loss=intermediate_losses_t["Classification_binary"], val_loss=intermediate_losses_v["Total"], - val_reg_loss=intermediate_losses_v["Regression"], + val_reg_pt_loss=intermediate_losses_v["Regression_pt"], + val_reg_eta_loss=intermediate_losses_v["Regression_eta"], + val_reg_sin_phi_loss=intermediate_losses_v["Regression_sin_phi"], + val_reg_cos_phi_loss=intermediate_losses_v["Regression_cos_phi"], + val_reg_energy_loss=intermediate_losses_v["Regression_energy"], val_cls_loss=intermediate_losses_v["Classification"], val_cls_binary_loss=intermediate_losses_v["Classification_binary"], inside_epoch=epoch, 
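
(The per-component regression losses logged above act on log-ratio targets:
PFDataset.__getitem__ encodes pt and energy as log(target / element value) and
predict_one_batch inverts this with exp. A minimal numpy sketch of that round
trip, with toy values rather than the real feature layout:)

import numpy as np

def encode(target, elem):
    # log(target / elem); non-finite entries (elem == 0 or target == 0) are
    # zeroed, mirroring the isnan/isinf handling in PFDataset.__getitem__
    with np.errstate(divide="ignore", invalid="ignore"):
        out = np.log(target / elem)
    out[~np.isfinite(out)] = 0.0
    return out

def decode(pred, elem):
    # inverse transform applied at inference: exp(pred) * elem
    return np.exp(pred) * elem

elem_pt = np.array([1.0, 10.0, 0.0])
true_pt = np.array([2.0, 5.0, 3.0])
enc = encode(true_pt, elem_pt)  # [log 2, -log 2, 0.0]
assert np.allclose(decode(enc, elem_pt), [2.0, 5.0, 0.0])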
@@ -435,6 +627,8 @@ def train_and_valid( if world_size > 1: dist.barrier() + if device_type == "cuda": + torch.cuda.empty_cache() return epoch_loss @@ -458,6 +652,7 @@ def train_mlpf( comet_experiment=None, comet_step_freq=None, val_freq=None, + save_attention=False, ): """ Will run a full training by calling train(). @@ -480,7 +675,17 @@ def train_mlpf( t0_initial = time.time() - losses_of_interest = ["Total", "Classification", "Classification_binary", "Regression"] + losses_of_interest = [ + "Total", + "Classification", + "Classification_binary", + "Regression_pt", + "Regression_eta", + "Regression_sin_phi", + "Regression_cos_phi", + "Regression_energy", + # "Mass", + ] losses = {} losses["train"], losses["valid"] = {}, {} @@ -548,6 +753,7 @@ def train_mlpf( epoch=epoch, dtype=dtype, tensorboard_writer=tensorboard_writer_valid, + save_attention=save_attention, ) t_valid = time.time() @@ -585,11 +791,19 @@ def train_mlpf( # Ray automatically syncs the checkpoint to persistent storage metrics = dict( loss=losses_t["Total"], - reg_loss=losses_t["Regression"], + reg_pt_loss=losses_t["Regression_pt"], + reg_eta_loss=losses_t["Regression_eta"], + reg_sin_phi_loss=losses_t["Regression_sin_phi"], + reg_cos_phi_loss=losses_t["Regression_cos_phi"], + reg_energy_loss=losses_t["Regression_energy"], cls_loss=losses_t["Classification"], cls_binary_loss=losses_t["Classification_binary"], val_loss=losses_v["Total"], - val_reg_loss=losses_v["Regression"], + val_reg_pt_loss=losses_v["Regression_pt"], + val_reg_eta_loss=losses_v["Regression_eta"], + val_reg_sin_phi_loss=losses_v["Regression_sin_phi"], + val_reg_cos_phi_loss=losses_v["Regression_cos_phi"], + val_reg_energy_loss=losses_v["Regression_energy"], val_cls_loss=losses_v["Classification"], val_cls_binary_loss=losses_v["Classification_binary"], epoch=epoch, @@ -795,6 +1009,7 @@ def run(rank, world_size, config, args, outdir, logfile): comet_experiment=comet_experiment, comet_step_freq=config["comet_step_freq"], val_freq=config["val_freq"], + save_attention=config["save_attention"], ) checkpoint = torch.load(f"{outdir}/best_weights.pth", map_location=torch.device(rank)) @@ -803,7 +1018,7 @@ def run(rank, world_size, config, args, outdir, logfile): if not (config["load"] is None): testdir_name = "_" + Path(config["load"]).stem else: - testdir_name = "_bestweights" + testdir_name = "_best_weights" if args.test: for sample in args.test_datasets: @@ -840,8 +1055,10 @@ def run(rank, world_size, config, args, outdir, logfile): if args.dataset == "clic": jetdef = fastjet.JetDefinition(fastjet.ee_genkt_algorithm, 0.7, -1.0) + jet_ptcut = 15 else: jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4) + jet_ptcut = 3 device_type = "cuda" if isinstance(rank, int) else "cpu" with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): @@ -853,7 +1070,7 @@ def run(rank, world_size, config, args, outdir, logfile): sample, outdir, jetdef, - jet_ptcut=15.0, + jet_ptcut=jet_ptcut, jet_match_dr=0.1, dir_name=testdir_name, ) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index aab90741b..6ec64c480 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -144,7 +144,7 @@ Y_FEATURES = ["cls_id", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy"] -def unpack_target(y): +def unpack_target(y, model): ret = {} ret["cls_id"] = y[..., 0].long() ret["charge"] = torch.clamp((y[..., 1] + 1).to(dtype=torch.float32), 0, 2) # -1, 0, 1 -> 0, 1, 2 @@ -172,9 +172,6 @@ def unpack_target(y): def unpack_predictions(preds): ret = {} 
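     # preds is the 3-tuple returned by MLPF.forward:
     # (binary particle/no-particle logits, PID logits, regressed momentum components)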
ret["cls_binary"], ret["cls_id_onehot"], ret["momentum"] = preds - # ret["cls_id_onehot"], ret["momentum"] = preds - - # ret["charge"] = torch.argmax(ret["charge"], axis=1, keepdim=True) - 1 # unpacking ret["pt"] = ret["momentum"][..., 0] @@ -301,7 +298,7 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=1e-5) + lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index 96a18d65c..4110e2dea 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -12,6 +12,9 @@ # comet needs to be imported before torch from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip +os.environ["OPENBLAS_NUM_THREADS"] = "1" +os.environ["OMP_NUM_THREADS"] = "1" + import yaml from pyg.training import device_agnostic_run, override_config, run_hpo, run_ray_training from utils import create_experiment_dir @@ -106,7 +109,10 @@ def get_outdir(resume_training, load): def main(): - # import matplotlib.pyplot as plt + import matplotlib + + matplotlib.use("agg") + # plt.rcParams['text.usetex'] = True args = parser.parse_args() diff --git a/notebooks/cms/cms-simvalidation.ipynb b/notebooks/cms/cms-simvalidation.ipynb index 39cb600d6..aef492ee3 100644 --- a/notebooks/cms/cms-simvalidation.ipynb +++ b/notebooks/cms/cms-simvalidation.ipynb @@ -94,16 +94,6 @@ " os.makedirs(plot_outpath)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "020e1e9d-058e-473b-933d-2c8c965b04ff", - "metadata": {}, - "outputs": [], - "source": [ - "list(glob.glob(\"/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/{}/raw2/*.pkl.bz2\".format(sample)))[:maxfiles][0]" - ] - }, { "cell_type": "code", "execution_count": null, @@ -114,7 +104,7 @@ "pickle_data = sum(\n", " [\n", " pickle.load(bz2.BZ2File(f, \"r\"))\n", - " for f in tqdm.tqdm(list(glob.glob(\"/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/{}/raw2/*.pkl.bz2\".format(sample)))[:maxfiles])\n", + " for f in tqdm.tqdm(list(glob.glob(\"/local/joosep/mlpf/cms/20240823_simcluster/nopu/{}/raw/*.pkl.bz2\".format(sample)))[:maxfiles])\n", " ],\n", " [],\n", ")\n", @@ -210,7 +200,7 @@ "metadata": {}, "outputs": [], "source": [ - "arrs_flat[\"ygen\"][\"pid\"][msk_trk]" + "np.unique(awkward.flatten(arrs_flat[\"ygen\"][\"pid\"][msk_trk]), return_counts=True)" ] }, { @@ -279,7 +269,7 @@ "outputs": [], "source": [ "plt.figure(figsize=(10,10))\n", - "b = np.linspace(0,3,100)\n", + "b = np.linspace(0,10,100)\n", "plt.hist(\n", " awkward.flatten(\n", " jets_coll[\"ygen\"][cmssw_to_ygen[\"ygen\"]].pt / jets_coll[\"cmssw\"][cmssw_to_ygen[\"cmssw\"]].pt\n", @@ -415,7 +405,9 @@ "source": [ "plt.figure()\n", "b = np.linspace(0, 10000, 101)\n", - "plt.hist(awkward.sum(arrs_awk[\"ygen\"][\"e\"], axis=1), bins=b)\n", + "plt.hist(awkward.sum(arrs_awk[\"Xelem\"][\"pt\"], axis=1), bins=b, histtype=\"step\", lw=1)\n", + "plt.hist(awkward.sum(arrs_awk[\"ygen\"][\"pt\"], axis=1), bins=b, histtype=\"step\", lw=1)\n", + "plt.hist(awkward.sum(arrs_awk[\"ycand\"][\"pt\"], axis=1), bins=b, histtype=\"step\", lw=1)\n", "plt.yscale(\"log\")\n", "plt.show()" ] @@ -489,7 +481,7 @@ "source": [ 
"plt.figure(figsize=(12, 10))\n", "ax = plt.axes()\n", - "b = np.logspace(1, 6, 100)\n", + "b = np.logspace(1, 3, 100)\n", "plt.hist2d(\n", " awkward.to_numpy(met(arrs_awk[\"ygen\"][\"pt\"], arrs_awk[\"ygen\"][\"phi\"])),\n", " awkward.to_numpy(met(arrs_awk[\"ycand\"][\"pt\"], arrs_awk[\"ycand\"][\"phi\"])),\n", @@ -518,12 +510,13 @@ "outputs": [], "source": [ "for pid in [\n", - " 0\n", + " 0, 211, 130, 22\n", "]:\n", " if pid == 0:\n", " msk = arrs_flat[\"ygen\"][\"pid\"] != pid\n", " else:\n", " msk = arrs_flat[\"ygen\"][\"pid\"] == pid\n", + " print(np.sum(msk))\n", " data1 = awkward.to_numpy(awkward.flatten(arrs_flat[\"Xelem\"][\"eta\"][msk]))\n", " data2 = awkward.to_numpy(awkward.flatten(arrs_flat[\"ygen\"][\"eta\"][msk]))\n", "\n", @@ -580,20 +573,32 @@ " plt.show()\n", " plt.savefig(plot_outpath + \"truth_vs_pfelement_phi_{}.pdf\".format(pid), bbox_inches=\"tight\")\n", "\n", - "# data1 = awkward.flatten(Xelem_e[msk])\n", - "# data2 = awkward.flatten(ygen_e[msk])\n", - "\n", - "# plt.figure(figsize=(12, 10))\n", - "# ax = plt.axes()\n", - "# plt.hist2d(data2, data1, bins=(np.logspace(-2, 3, 100), np.logspace(-2, 3, 100)), cmap=\"Blues\")\n", - "# plt.xscale(\"log\")\n", - "# plt.yscale(\"log\")\n", - "# plt.colorbar()\n", - "# cms_label(ax)\n", - "# sample_label(ax, \", \" + CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", - "# plt.xlabel(\"Truth $E$\")\n", - "# plt.ylabel(\"PFElement $E$ [GeV]\")\n", - "# plt.savefig(\"truth_vs_pf_e_{}.pdf\".format(pid), bbox_inches=\"tight\")" + " data1 = awkward.to_numpy(awkward.flatten(arrs_flat[\"Xelem\"][\"e\"][msk]))\n", + " data2 = awkward.to_numpy(awkward.flatten(arrs_flat[\"ygen\"][\"e\"][msk]))\n", + " plt.figure(figsize=(12, 10))\n", + " ax = plt.axes()\n", + " plt.hist2d(\n", + " data2,\n", + " data1,\n", + " bins=(np.logspace(0, 3, 100), np.logspace(0, 3, 100)),\n", + " cmap=\"hot_r\",\n", + " norm=matplotlib.colors.Normalize(vmin=0),\n", + " )\n", + " plt.plot([1, 1e3], [1, 1e3], ls=\"--\", color=\"black\")\n", + " plt.xscale(\"log\")\n", + " plt.yscale(\"log\")\n", + " cbar = plt.colorbar(label=\"number of particles / bin\")\n", + " cbar.formatter.set_powerlimits((0, 0))\n", + " cbar.formatter.set_useMathText(True)\n", + " cms_label(ax)\n", + " # if pid == 0:\n", + " # sample_label(ax, sample)\n", + " # else:\n", + " # sample_label(ax, sample, \", \" + CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", + " plt.xlabel(\"Truth $E$\")\n", + " plt.ylabel(\"PFElement $E$ [GeV]\")\n", + " plt.show()\n", + " plt.savefig(plot_outpath + \"truth_vs_pf_e_{}.pdf\".format(pid), bbox_inches=\"tight\")" ] }, { @@ -871,7 +876,7 @@ "# plt.xscale(\"log\")\n", "plt.ylim(0, 1.5 * np.sum([h[0] for h in hs], axis=0).max())\n", "if sample == \"TTbar_14TeV_TuneCUETP8M1_cfi\":\n", - " plt.ylim(0, 1e5)\n", + " plt.ylim(0, 1e5)s\n", "plt.ticklabel_format(style=\"sci\", axis=\"y\", scilimits=(0, 0))\n", "ax.yaxis.major.formatter._useMathText = True\n", "\n", @@ -1097,7 +1102,8 @@ "source": [ "gen_pid = awkward.flatten(arrs_flat[\"ygen\"][\"pid\"][arrs_flat[\"Xelem\"][\"typ\"]==1])\n", "cand_pid = awkward.flatten(arrs_flat[\"ycand\"][\"pid\"][arrs_flat[\"Xelem\"][\"typ\"]==1])\n", - "track_pt = awkward.flatten(arrs_flat[\"Xelem\"][\"pt\"][arrs_flat[\"Xelem\"][\"typ\"]==1])" + "track_pt = awkward.flatten(arrs_flat[\"Xelem\"][\"pt\"][arrs_flat[\"Xelem\"][\"typ\"]==1])\n", + "track_eta = awkward.flatten(arrs_flat[\"Xelem\"][\"eta\"][arrs_flat[\"Xelem\"][\"typ\"]==1])" ] }, { @@ -1113,6 +1119,17 @@ "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "7afc451a-af79-4db4-acd8-f5d806835876", + "metadata": {}, + "outputs": [], + "source": [ + "def midpoints(x):\n", + " return x[:-1] + np.diff(x)/2" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1131,7 +1148,14 @@ " frac_gen = np.sum(gen_pid[msk]!=0) / np.sum(msk)\n", " frac_cand = np.sum(cand_pid[msk]!=0) / np.sum(msk)\n", " fracs_gen.append(frac_gen)\n", - " fracs_cand.append(frac_cand)" + " fracs_cand.append(frac_cand)\n", + "\n", + "plt.figure()\n", + "plt.plot(midpoints(bins), fracs_gen, marker=\"o\", label=\"target\")\n", + "plt.plot(midpoints(bins), fracs_cand, marker=\"o\", label=\"PF\")\n", + "plt.xscale(\"log\")\n", + "plt.legend(loc=\"best\")\n", + "plt.show()" ] }, { @@ -1141,17 +1165,30 @@ "metadata": {}, "outputs": [], "source": [ + "bins = np.linspace(-4, 4, 20)\n", + "fracs_gen = []\n", + "fracs_cand = []\n", + "\n", + "for ibin in range(len(bins)-1):\n", + " b0 = bins[ibin]\n", + " b1 = bins[ibin+1]\n", + " msk = (track_eta >= b0) & (track_eta < b1) & (track_pt>1)\n", + " frac_gen = np.sum(gen_pid[msk]!=0) / np.sum(msk)\n", + " frac_cand = np.sum(cand_pid[msk]!=0) / np.sum(msk)\n", + " fracs_gen.append(frac_gen)\n", + " fracs_cand.append(frac_cand)\n", + "\n", "plt.figure()\n", - "plt.plot(bins[:-1], fracs_gen, marker=\"o\")\n", - "plt.plot(bins[:-1], fracs_cand, marker=\"o\")\n", - "plt.xscale(\"log\")\n", + "plt.plot(midpoints(bins), fracs_gen, marker=\"o\", label=\"target\")\n", + "plt.plot(midpoints(bins), fracs_cand, marker=\"o\", label=\"PF\")\n", + "plt.legend(loc=\"best\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, - "id": "86838aaa-04ae-4e1e-9ba0-ab28164ea947", + "id": "b723800c-3235-452d-9480-d4280534c0c5", "metadata": {}, "outputs": [], "source": [] diff --git a/parameters/pytorch/pyg-cld.yaml b/parameters/pytorch/pyg-cld.yaml new file mode 100644 index 000000000..204689385 --- /dev/null +++ b/parameters/pytorch/pyg-cld.yaml @@ -0,0 +1,120 @@ +backend: pytorch + +dataset: cld +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: gnn_lsh +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: joint #split, joint + pt_mode: linear + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: linear + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 256 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 6 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 16 + num_heads: 32 + attention_type: flash + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: asha # asha, 
hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + cld: + physical: + batch_size: 1 + samples: + cld_edm_ttbar_pf: + version: 2.0.0 + +valid_dataset: + cld: + physical: + batch_size: 1 + samples: + cld_edm_ttbar_pf: + version: 2.0.0 + +test_dataset: + cld_edm_ttbar_pf: + version: 2.0.0 diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml index dd16021c4..a51540683 100644 --- a/parameters/pytorch/pyg-clic.yaml +++ b/parameters/pytorch/pyg-clic.yaml @@ -1,5 +1,6 @@ backend: pytorch +save_attention: yes dataset: clic sort_data: no data_dir: @@ -26,12 +27,12 @@ val_freq: # run an extra validation run every val_freq training steps model: trainable: all learned_representation_mode: last #last, concat - input_encoding: joint #split, joint - pt_mode: linear + input_encoding: split #split, joint + pt_mode: direct-elemtype-split eta_mode: linear sin_phi_mode: linear cos_phi_mode: linear - energy_mode: linear + energy_mode: direct-elemtype-split gnn_lsh: conv_type: gnn_lsh @@ -50,13 +51,13 @@ model: attention: conv_type: attention - num_convs: 12 - dropout_ff: 0.1 + num_convs: 4 + dropout_ff: 0.0 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 dropout_conv_reg_ff: 0.0 - activation: "relu" + activation: "gelu" head_dim: 32 num_heads: 32 attention_type: math diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml index 68e160693..661b1626a 100644 --- a/parameters/pytorch/pyg-cms.yaml +++ b/parameters/pytorch/pyg-cms.yaml @@ -1,5 +1,6 @@ backend: pytorch +save_attention: yes dataset: cms sort_data: yes data_dir: @@ -29,22 +30,22 @@ model: # - nn_pt learned_representation_mode: last #last, concat - input_encoding: joint #split, joint - pt_mode: linear + input_encoding: split #split, joint + pt_mode: direct-elemtype-split eta_mode: linear sin_phi_mode: linear cos_phi_mode: linear - energy_mode: linear + energy_mode: direct-elemtype-split gnn_lsh: conv_type: gnn_lsh embedding_dim: 512 width: 512 - num_convs: 3 + num_convs: 8 dropout_ff: 0.0 activation: "elu" # gnn-lsh specific parameters - bin_size: 640 + bin_size: 320 max_num_bins: 200 distance_dim: 128 layernorm: True @@ -54,14 +55,14 @@ model: attention: conv_type: attention - num_convs: 8 + num_convs: 4 dropout_ff: 0.0 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 16 + activation: "gelu" + head_dim: 32 num_heads: 32 attention_type: flash use_pre_layernorm: True @@ -108,7 +109,9 @@ train_dataset: batch_size: 1 samples: cms_pf_ttbar: - version: 2.2.0 + version: 2.3.0 + cms_pf_qcd: + version: 2.3.0 valid_dataset: cms: @@ -116,7 +119,11 @@ valid_dataset: batch_size: 1 samples: cms_pf_ttbar: - version: 2.2.0 + version: 2.3.0 + cms_pf_qcd: + version: 2.3.0 test_dataset: cms_pf_ttbar: - version: 2.2.0 + version: 2.3.0 + cms_pf_qcd: + version: 2.3.0 diff --git a/scripts/generate_tfds.sh b/scripts/generate_tfds.sh index d8533b658..efed9af31 100755 --- a/scripts/generate_tfds.sh +++ b/scripts/generate_tfds.sh @@ -17,7 +17,7 @@ export CMD="singularity exec -B /local -B /scratch/persistent $IMG tfds build " export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets export 
MANUAL_DIR=/local/joosep/mlpf/cms/20240823_simcluster $CMD mlpf/heptfds/cms_pf/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ttbar.log & -# $CMD mlpf/heptfds/cms_pf/qcd --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd.log & +$CMD mlpf/heptfds/cms_pf/qcd --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd.log & # $CMD mlpf/heptfds/cms_pf/ztt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ztt.log & # $CMD mlpf/heptfds/cms_pf/qcd_high_pt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd_high_pt.log & # $CMD mlpf/heptfds/cms_pf/smst1tttt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_smst1tttt.log & diff --git a/scripts/lumi/clic_bin_size_256.sh b/scripts/lumi/clic_bin_size_256.sh deleted file mode 100644 index fd1612fe5..000000000 --- a/scripts/lumi/clic_bin_size_256.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-clic -#SBATCH --account=project_465000301 -#SBATCH --time=3-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=160G -#SBATCH --gpus-per-task=4 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -export ROCM_PATH=/opt/rocm -#export NCCL_DEBUG=WARN -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -singularity exec \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic/clic_bin_size_256.yaml --plot-freq 1 --num-cpus 32 \ - --batch-multiplier 5 --plot-freq 1 diff --git a/scripts/lumi/clic_bin_size_32.sh b/scripts/lumi/clic_bin_size_32.sh deleted file mode 100755 index f9fae6c14..000000000 --- a/scripts/lumi/clic_bin_size_32.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-clic -#SBATCH --account=project_465000301 -#SBATCH --time=3-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=160G -#SBATCH --gpus-per-task=4 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -export ROCM_PATH=/opt/rocm -#export NCCL_DEBUG=WARN -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - 
-singularity exec \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic/clic_bin_size_32.yaml --plot-freq 1 --num-cpus 32 \ - --batch-multiplier 5 --plot-freq 1 diff --git a/scripts/lumi/clic_bin_size_512.sh b/scripts/lumi/clic_bin_size_512.sh deleted file mode 100644 index cecd82a17..000000000 --- a/scripts/lumi/clic_bin_size_512.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-clic -#SBATCH --account=project_465000301 -#SBATCH --time=3-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=160G -#SBATCH --gpus-per-task=4 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -export ROCM_PATH=/opt/rocm -#export NCCL_DEBUG=WARN -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -singularity exec \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic/clic_bin_size_512.yaml --plot-freq 1 --num-cpus 32 \ - --batch-multiplier 5 --plot-freq 1 diff --git a/scripts/lumi/clic_bin_size_64.sh b/scripts/lumi/clic_bin_size_64.sh deleted file mode 100755 index 02f0343f7..000000000 --- a/scripts/lumi/clic_bin_size_64.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-clic -#SBATCH --account=project_465000301 -#SBATCH --time=3-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=160G -#SBATCH --gpus-per-task=4 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -export ROCM_PATH=/opt/rocm -#export NCCL_DEBUG=WARN -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -singularity exec \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic/clic_bin_size_64.yaml --plot-freq 1 --num-cpus 32 \ - --batch-multiplier 5 
--plot-freq 1 diff --git a/scripts/lumi/clic_bin_size_128.sh b/scripts/lumi/pytorch-clic-1.sh old mode 100644 new mode 100755 similarity index 60% rename from scripts/lumi/clic_bin_size_128.sh rename to scripts/lumi/pytorch-clic-1.sh index 5dc778376..fe01c5286 --- a/scripts/lumi/clic_bin_size_128.sh +++ b/scripts/lumi/pytorch-clic-1.sh @@ -1,21 +1,21 @@ #!/bin/bash -#SBATCH --job-name=mlpf-train-clic +#SBATCH --job-name=mlpf-train #SBATCH --account=project_465000301 #SBATCH --time=3-00:00:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=32 -#SBATCH --mem=160G -#SBATCH --gpus-per-task=4 +#SBATCH --mem=200G +#SBATCH --gpus-per-task=1 #SBATCH --partition=small-g #SBATCH --no-requeue #SBATCH -o logs/slurm-%x-%j-%N.out cd /scratch/project_465000301/particleflow -module load LUMI/22.08 partition/G +module load LUMI/24.03 partition/G -export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg +export IMG=/scratch/project_465000301/pytorch-rocm6.2.simg export PYTHONPATH=hep_tfds export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets #export MIOPEN_DISABLE_CACHE=true @@ -23,14 +23,13 @@ export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages export ROCM_PATH=/opt/rocm -#export NCCL_DEBUG=WARN +#export NCCL_DEBUG=INFO #export MIOPEN_ENABLE_LOGGING=1 #export MIOPEN_ENABLE_LOGGING_CMD=1 #export MIOPEN_LOG_LEVEL=4 +export KERAS_BACKEND=torch -singularity exec \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram +env #TF training singularity exec \ @@ -38,6 +37,7 @@ singularity exec \ -B /scratch/project_465000301 \ -B /tmp \ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic/clic_bin_size_128.yaml --plot-freq 1 --num-cpus 32 \ - --batch-multiplier 5 --plot-freq 1 + --env CUDA_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES \ + $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \ + --data-dir $TFDS_DATA_DIR --config parameters/pytorch/pyg-clic.yaml \ + --train --gpu-batch-multiplier 256 --num-workers 8 --prefetch-factor 100 --checkpoint-freq 1 --conv-type attention --dtype bfloat16 --lr 0.001 diff --git a/scripts/lumi/pytorch.sh b/scripts/lumi/pytorch.sh deleted file mode 100755 index eb9a0c8ae..000000000 --- a/scripts/lumi/pytorch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-cms -#SBATCH --account=project_465000301 -#SBATCH --time=3-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=130G -#SBATCH --gpus-per-task=8 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/lumi-pytorch-rocm.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export ROCM_PATH=/opt/rocm -#export NCCL_DEBUG=WARN -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -env - -singularity exec --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env PYTHONPATH=hep_tfds \ - $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus $SLURM_GPUS_PER_TASK \ - 
--data-dir $TFDS_DATA_DIR --config parameters/pytorch/pyg-cms.yaml \ - --train \ - --conv-type attention --attention-type flash_external \ - --num-epochs 10 --gpu-batch-multiplier 4 --num-workers 1 --prefetch-factor 10 --checkpoint-freq 1 diff --git a/scripts/lumi/train-gpu-1.sh b/scripts/lumi/train-gpu-1.sh deleted file mode 100755 index 01a838bb5..000000000 --- a/scripts/lumi/train-gpu-1.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-cms -#SBATCH --account=project_465000301 -#SBATCH --time=3-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=16 -#SBATCH --mem=160G -#SBATCH --gpus-per-task=1 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12-2024-01-11.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -export ROCM_PATH=/opt/rocm -export NCCL_DEBUG=WARN -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -singularity exec \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3.9 mlpf/pipeline.py hypertune --config parameters/bench/clic-hits-bench.yaml --ntrain 100 --ntest 100 -o hypertuning_100 diff --git a/scripts/lumi/train-gpu-2.sh b/scripts/lumi/train-gpu-2.sh deleted file mode 100755 index a62aa2fbf..000000000 --- a/scripts/lumi/train-gpu-2.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-clic-hits-ln-full -#SBATCH --account=project_465000301 -#SBATCH --time=1-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --mem=130G -#SBATCH --gpus-per-task=2 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic-test.yaml --plot-freq 1 --num-cpus 8 \ - --batch-multiplier 5 --ntrain 50000 --ntest 50000 --benchmark_dir exp_dir - -# --env MIOPEN_USER_DB_PATH=$MIPEN_USER_DB_PATH \ -# --env MIOPEN_CUSTOM_CACHE_DIR=$MIOPEN_CUSTOM_CACHE_DIR \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=7 \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=5 \ diff --git 
a/scripts/lumi/train-gpu-3.sh b/scripts/lumi/train-gpu-3.sh deleted file mode 100755 index 12cc85089..000000000 --- a/scripts/lumi/train-gpu-3.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-clic-hits-ln-full -#SBATCH --account=project_465000301 -#SBATCH --time=1-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --mem=130G -#SBATCH --gpus-per-task=3 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic-test.yaml --plot-freq 1 --num-cpus 8 \ - --batch-multiplier 5 --ntrain 50000 --ntest 50000 --benchmark_dir exp_dir - -# --env MIOPEN_USER_DB_PATH=$MIPEN_USER_DB_PATH \ -# --env MIOPEN_CUSTOM_CACHE_DIR=$MIOPEN_CUSTOM_CACHE_DIR \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=7 \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=5 \ diff --git a/scripts/lumi/train-gpu-4.sh b/scripts/lumi/train-gpu-4.sh deleted file mode 100755 index bcfa28b68..000000000 --- a/scripts/lumi/train-gpu-4.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-cms -#SBATCH --account=project_465000301 -#SBATCH --time=3-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=16 -#SBATCH --mem=160G -#SBATCH --gpus-per-task=4 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -export ROCM_PATH=/opt/rocm -#export NCCL_DEBUG=WARN -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -singularity exec \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/cms-gen.yaml --plot-freq 1 --num-cpus 8 \ - --batch-multiplier 2 --plot-freq -1 --weights experiments/cms-gen_20240108_154245_299103.nid005026/weights/weights-05-4.250307.hdf5 diff --git a/scripts/lumi/train-gpu-5.sh b/scripts/lumi/train-gpu-5.sh deleted file mode 100755 index e6fa1835b..000000000 --- a/scripts/lumi/train-gpu-5.sh +++ /dev/null @@ -1,46 +0,0 @@ 
-#!/bin/bash -#SBATCH --job-name=mlpf-train-clic-hits-ln-full -#SBATCH --account=project_465000301 -#SBATCH --time=1-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --mem=130G -#SBATCH --gpus-per-task=5 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic-test.yaml --plot-freq 1 --num-cpus 8 \ - --batch-multiplier 5 --ntrain 50000 --ntest 50000 --benchmark_dir exp_dir - -# --env MIOPEN_USER_DB_PATH=$MIPEN_USER_DB_PATH \ -# --env MIOPEN_CUSTOM_CACHE_DIR=$MIOPEN_CUSTOM_CACHE_DIR \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=7 \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=5 \ diff --git a/scripts/lumi/train-gpu-6.sh b/scripts/lumi/train-gpu-6.sh deleted file mode 100755 index 965c4ebd0..000000000 --- a/scripts/lumi/train-gpu-6.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-clic-hits-ln-full -#SBATCH --account=project_465000301 -#SBATCH --time=1-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --mem=130G -#SBATCH --gpus-per-task=6 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic-test.yaml --plot-freq 1 --num-cpus 8 \ - --batch-multiplier 5 --ntrain 50000 --ntest 50000 --benchmark_dir exp_dir - -# --env MIOPEN_USER_DB_PATH=$MIPEN_USER_DB_PATH \ -# --env MIOPEN_CUSTOM_CACHE_DIR=$MIOPEN_CUSTOM_CACHE_DIR \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=7 \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=5 \ diff --git a/scripts/lumi/train-gpu-7.sh b/scripts/lumi/train-gpu-7.sh deleted file mode 100755 index 77f563fdd..000000000 --- a/scripts/lumi/train-gpu-7.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-clic-hits-ln-full -#SBATCH 
--account=project_465000301 -#SBATCH --time=1-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --mem=130G -#SBATCH --gpus-per-task=7 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic-test.yaml --plot-freq 1 --num-cpus 8 \ - --batch-multiplier 5 --ntrain 50000 --ntest 50000 --benchmark_dir exp_dir - -# --env MIOPEN_USER_DB_PATH=$MIPEN_USER_DB_PATH \ -# --env MIOPEN_CUSTOM_CACHE_DIR=$MIOPEN_CUSTOM_CACHE_DIR \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=7 \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=5 \ diff --git a/scripts/lumi/train-gpu-8.sh b/scripts/lumi/train-gpu-8.sh deleted file mode 100755 index 3e2f3ea23..000000000 --- a/scripts/lumi/train-gpu-8.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-cms -#SBATCH --account=project_465000301 -#SBATCH --time=3-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=16 -#SBATCH --mem=160G -#SBATCH --gpus-per-task=8 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -#module load LUMI/22.08 partition/G -module load LUMI/23.09 partition/G - -export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -export ROCM_PATH=/opt/rocm -export NCCL_DEBUG=WARN -export MIOPEN_ENABLE_LOGGING=1 -export MIOPEN_ENABLE_LOGGING_CMD=1 -export MIOPEN_LOG_LEVEL=4 - -singularity exec \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/cms-gen.yaml --plot-freq 1 --num-cpus 8 \ - --batch-multiplier 2 --plot-freq -1 --weights experiments/cms-gen_20240108_154245_299103.nid005026/weights/weights-05-4.250307.hdf5 diff --git a/scripts/lumi/train-gpu-ln-full.sh b/scripts/lumi/train-gpu-ln-full.sh deleted file mode 100755 index e94eb0fcc..000000000 --- a/scripts/lumi/train-gpu-ln-full.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-clic-hits-ln-full -#SBATCH --account=project_465000301 -#SBATCH --time=3-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 
-#SBATCH --mem=130G -#SBATCH --gpus-per-task=8 -#SBATCH --partition=small-g -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -cd /scratch/project_465000301/particleflow - -module load LUMI/22.08 partition/G - -export IMG=/scratch/project_465000301/tf-rocm.simg -export PYTHONPATH=hep_tfds -export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets -#export MIOPEN_DISABLE_CACHE=true -export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - -B /tmp \ - --env LD_LIBRARY_PATH=/opt/rocm-5.4.0/lib/ \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic-hits-ln.yaml --plot-freq 1 --num-cpus 8 \ - --batch-multiplier 2 \ - --weights experiments/clic-hits-ln_20230623_090308_368360.nid007329/weights/weights-10-0.163285.hdf5 - -# --env MIOPEN_USER_DB_PATH=$MIPEN_USER_DB_PATH \ -# --env MIOPEN_CUSTOM_CACHE_DIR=$MIOPEN_CUSTOM_CACHE_DIR \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=7 \ -# --env MIOPEN_ENABLE_LOGGING=1 \ -# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ -# --env MIOPEN_LOG_LEVEL=5 \ diff --git a/scripts/tallinn/a100/pytorch-clic-small.sh b/scripts/tallinn/a100/pytorch-clic-small.sh new file mode 100755 index 000000000..3d6989c12 --- /dev/null +++ b/scripts/tallinn/a100/pytorch-clic-small.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --partition gpu +#SBATCH --gres gpu:mig:1 +#SBATCH --mem-per-gpu 50G +#SBATCH -o logs/slurm-%x-%j-%N.out + +IMG=/home/software/singularity/pytorch.simg:2024-08-18 +cd ~/particleflow + +ulimit -n 10000 +singularity exec -B /scratch/persistent --nv \ + --env PYTHONPATH=hep_tfds \ + --env KERAS_BACKEND=torch \ + $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \ + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ + --train --test --make-plots --conv-type attention --num-epochs 5 --gpu-batch-multiplier 16 --num-workers 1 --prefetch-factor 100 --checkpoint-freq 1 --comet --ntrain 1000 --nvalid 1000 --ntest 1000 diff --git a/scripts/tallinn/a100/pytorch-clic.sh b/scripts/tallinn/a100/pytorch-clic.sh index 3d6e59a4f..187c0911a 100755 --- a/scripts/tallinn/a100/pytorch-clic.sh +++ b/scripts/tallinn/a100/pytorch-clic.sh @@ -13,4 +13,4 @@ singularity exec -B /scratch/persistent --nv \ --env KERAS_BACKEND=torch \ $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \ --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ - --train --conv-type attention --num-epochs 100 --gpu-batch-multiplier 256 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet --dtype bfloat16 + --train --test --make-plots --conv-type attention --num-epochs 20 --gpu-batch-multiplier 128 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet --attention-type math --ntrain 50000 --nvalid 1000 --ntest 1000 diff --git a/scripts/tallinn/a100/pytorch-small-eval-clic.sh b/scripts/tallinn/a100/pytorch-small-eval-clic.sh old mode 100755 new mode 100644 index a8a79ad82..60eb8bf56 --- a/scripts/tallinn/a100/pytorch-small-eval-clic.sh +++ b/scripts/tallinn/a100/pytorch-small-eval-clic.sh @@ -1,16 +1,16 @@ #!/bin/bash #SBATCH --partition gpu #SBATCH --gres gpu:mig:1 
 #SBATCH --mem-per-gpu 50G #SBATCH -o logs/slurm-%x-%j-%N.out IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow -WEIGHTS=experiments/pyg-clic_20240830_114129_279460/checkpoints/checkpoint-11-8.095037.pth +WEIGHTS=experiments/pyg-clic_20240910_092302_797928/best_weights.pth singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \ --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ - --test --make-plots --gpu-batch-multiplier 100 --load $WEIGHTS --dtype bfloat16 + --test --make-plots --gpu-batch-multiplier 100 --load $WEIGHTS --dtype bfloat16 --ntest 10000 diff --git a/scripts/tallinn/a100/pytorch-small-eval-cms.sh b/scripts/tallinn/a100/pytorch-small-eval-cms.sh old mode 100755 new mode 100644 index b91a923d3..f5e5fcba6 --- a/scripts/tallinn/a100/pytorch-small-eval-cms.sh +++ b/scripts/tallinn/a100/pytorch-small-eval-cms.sh @@ -7,10 +7,11 @@ IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow -WEIGHTS=experiments/pyg-cms_20240901_194940_868487/checkpoints/checkpoint-02-18.710537.pth +WEIGHTS=experiments/pyg-cms_20240915_162455_135826/checkpoints/checkpoint-01-4.121693.pth +env singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ $IMG python mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ - --test --make-plots --gpu-batch-multiplier 10 --load $WEIGHTS --ntest 5000 --dtype bfloat16 + --test --make-plots --gpu-batch-multiplier 1 --load $WEIGHTS --ntrain 1000 --ntest 1000 --nvalid 1000 --dtype bfloat16 --num-epochs 20 diff --git a/scripts/tallinn/a100/pytorch-small.sh b/scripts/tallinn/a100/pytorch-small.sh index 76feb888a..7f7da851a 100755 --- a/scripts/tallinn/a100/pytorch-small.sh +++ b/scripts/tallinn/a100/pytorch-small.sh @@ -4,14 +4,15 @@ #SBATCH --mem-per-gpu 60G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/pytorch.simg:2024-07-08 +IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow env +ulimit -n 10000 singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ - $IMG python3.10 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \ - --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ - --train --test --make-plots --conv-type attention --attention-type flash --gpu-batch-multiplier 40 --num-workers 1 --prefetch-factor 50 --dtype bfloat16 --checkpoint-freq 1 + $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ + --train --test --make-plots --conv-type attention --attention-type flash --gpu-batch-multiplier 1 --num-workers 1 --prefetch-factor 10 --dtype bfloat16 --checkpoint-freq -1 --ntrain 100 --nvalid 100 --ntest 100 --num-epochs 10 diff --git a/scripts/tallinn/a100/pytorch.sh b/scripts/tallinn/a100/pytorch.sh index 8a0535da2..ca2970b1f 
100755 --- a/scripts/tallinn/a100/pytorch.sh +++ b/scripts/tallinn/a100/pytorch.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --partition gpu #SBATCH --gres gpu:a100:1 -#SBATCH --mem-per-gpu 100G +#SBATCH --mem-per-gpu 250G #SBATCH -o logs/slurm-%x-%j-%N.out IMG=/home/software/singularity/pytorch.simg:2024-08-18 @@ -13,5 +13,5 @@ singularity exec -B /scratch/persistent --nv \ --env KERAS_BACKEND=torch \ $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ - --train --test --make-plots --num-epochs 100 --conv-type attention \ - --gpu-batch-multiplier 10 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet + --train --test --make-plots --num-epochs 5 --conv-type attention \ + --gpu-batch-multiplier 8 --checkpoint-freq 1 --num-workers 8 --prefetch-factor 50 --comet --ntrain 20000 --nvalid 1000 --ntest 1000 diff --git a/scripts/tallinn/rtx/pytorch.sh b/scripts/tallinn/rtx/pytorch.sh index 0ce0280f5..9891d4fb2 100755 --- a/scripts/tallinn/rtx/pytorch.sh +++ b/scripts/tallinn/rtx/pytorch.sh @@ -6,10 +6,11 @@ IMG=/home/software/singularity/pytorch.simg:2024-08-18 -ulimit -n 10000 +ulimit -n 100000 singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 4 \ --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ - --train --test --make-plots --conv-type attention --gpu-batch-multiplier 10 --num-workers 2 --prefetch-factor 100 --attention-type math --dtype float32 + --train --test --make-plots --conv-type attention --gpu-batch-multiplier 10 --num-workers 4 \ + --prefetch-factor 100 --attention-type math --dtype float32 --num-epochs 100 --checkpoint-freq 1 --lr 0.001
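
Note on the notebook hunks above: the met(pt, phi) helper called in the 2D histogram cell is not defined anywhere in this diff. Under the usual definition it is the magnitude of the per-event vector sum of particle transverse momenta; the body below is an assumption for illustration, not code from this PR:

import awkward
import numpy as np

def met(pt, phi):
    # assumed helper, not from this PR: missing transverse energy per event,
    # i.e. the magnitude of the (negative) vector sum of particle pt
    px = awkward.sum(pt * np.cos(phi), axis=1)
    py = awkward.sum(pt * np.sin(phi), axis=1)
    return np.sqrt(px**2 + py**2)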
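
The two binned-fraction cells added to the notebook (vs track pt and vs track eta) share one computation: in each bin, the fraction of tracks associated with a non-zero target (gen) or PF candidate particle. A minimal standalone sketch of that logic, with an empty-bin guard the notebook omits (the bin edges here are illustrative; gen_pid, cand_pid, track_pt are the flat arrays extracted in the notebook):

import numpy as np

def midpoints(x):
    # centers of the bins defined by edges x
    return x[:-1] + np.diff(x) / 2

def matched_fraction(pid, quantity, bins):
    # per-bin fraction of entries with a non-zero particle ID
    fracs = []
    for lo, hi in zip(bins[:-1], bins[1:]):
        msk = (quantity >= lo) & (quantity < hi)
        n = np.sum(msk)
        # the notebook divides by np.sum(msk) directly; guard against empty bins
        fracs.append(np.sum(pid[msk] != 0) / n if n > 0 else np.nan)
    return np.array(fracs)

# illustrative usage, mirroring the notebook cells:
# bins = np.logspace(-1, 2, 21)
# fracs_gen = matched_fraction(gen_pid, track_pt, bins)
# plt.plot(midpoints(bins), fracs_gen, marker="o", label="target")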