Skip to content

Commit

Permalink
clean up loading (#301)
Browse files: browse the repository at this point in the history
* clean up loading

* rerun postprocessing for CLIC clusters with inclusive particles

* faster inference

* some fixes for plotting

* update plotting

* commit config that was used for training

* residual cleanup

* Update pyg-cms.yaml
  • Loading branch information
jpata authored Apr 5, 2024
1 parent 5e13929 commit 8420a5d
Show file tree
Hide file tree
Showing 14 changed files with 543 additions and 404 deletions.
3 changes: 0 additions & 3 deletions mlpf/data_cms/genjob_pu55to75.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,13 @@ PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data_cms/pu_files_local.txt

N=20

env
source /cvmfs/cms.cern.ch/cmsset_default.sh

cd $CMSSWDIR
eval `scramv1 runtime -sh`
which python
which python3

env

cd $WORKDIR

#Generate the MC
Expand Down
15 changes: 7 additions & 8 deletions mlpf/data_cms/prepare_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
outdir = "/local/joosep/mlpf/cms/v3"

samples = [
("TTbar_14TeV_TuneCUETP8M1_cfi", 100000, 105010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 200000, 205010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 305010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi", 400000, 405010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", 500000, 505010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("ZpTT_1500_14TeV_TuneCP5_cfi", 600000, 605010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("TTbar_14TeV_TuneCUETP8M1_cfi", 100000, 120010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 200000, 220010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 320010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi", 400000, 420010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", 500000, 520010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("ZpTT_1500_14TeV_TuneCP5_cfi", 600000, 620010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
("VBF_TuneCP5_14TeV_pythia8_cfi", 1700000,1720010, "genjob_pu55to75.sh", outdir + "/pu55to75"),

("TTbar_14TeV_TuneCUETP8M1_cfi", 700000, 701000, "genjob_nopu.sh", outdir + "/nopu"),
("MultiParticlePFGun50_cfi", 800000, 850000, "genjob_nopu.sh", outdir + "/nopu"),
Expand All @@ -24,8 +25,6 @@
("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1410000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1510000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleTauFlatPt1To1000_cfi", 1600000,1610000, "genjob_nopu.sh", outdir + "/nopu"),

("VBF_TuneCP5_14TeV_pythia8_cfi", 1700000,1705010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
]

if __name__ == "__main__":
Expand Down
105 changes: 33 additions & 72 deletions mlpf/heptfds/cms_pf/cms_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import fastjet
import numpy as np
import vector
from numpy.lib.recfunctions import append_fields

# https://github.com/ahlinist/cmssw/blob/1df62491f48ef964d198f574cdfcccfd17c70425/DataFormats/ParticleFlowReco/interface/PFBlockElement.h#L33
ELEM_LABELS_CMS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Expand Down Expand Up @@ -131,85 +130,47 @@ def prepare_data_cms(fn, with_jet_idx=True):
elif fn.endswith(".pkl.bz2"):
data = pickle.load(bz2.BZ2File(fn, "rb"))

for event in tqdm.tqdm(data):
for event in data:
Xelem = event["Xelem"]
ygen = event["ygen"]
ycand = event["ycand"]

# remove PS and BREM from inputs
msk_ps = (Xelem["typ"] == 2) | (Xelem["typ"] == 3) | (Xelem["typ"] == 7)

Xelem = Xelem[~msk_ps]
ygen = ygen[~msk_ps]
ycand = ycand[~msk_ps]
Xelem = ak.Array(Xelem[~msk_ps])
ygen = ak.Array(ygen[~msk_ps])
ycand = ak.Array(ycand[~msk_ps])

Xelem = append_fields(
Xelem,
"sin_phi",
np.sin(Xelem["phi"]),
)
Xelem = append_fields(
Xelem,
"cos_phi",
np.cos(Xelem["phi"]),
)

Xelem = append_fields(
Xelem,
"typ_idx",
np.array(
[ELEM_LABELS_CMS.index(int(i)) for i in Xelem["typ"]],
dtype=np.float32,
),
)
ygen = append_fields(
ygen,
"typ_idx",
np.array(
[CLASS_LABELS_CMS.index(abs(int(i))) for i in ygen["typ"]],
dtype=np.float32,
),
)
ycand = append_fields(
ycand,
"typ_idx",
np.array(
[CLASS_LABELS_CMS.index(abs(int(i))) for i in ycand["typ"]],
dtype=np.float32,
),
)
Xelem["sin_phi"] = np.sin(Xelem["phi"])
Xelem["cos_phi"] = np.cos(Xelem["phi"])
Xelem["typ_idx"] = np.array([ELEM_LABELS_CMS.index(int(i)) for i in Xelem["typ"]], dtype=np.float32)
ygen["typ_idx"] = np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ygen["typ"]], dtype=np.float32)
ycand["typ_idx"] = np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ycand["typ"]], dtype=np.float32)

y_features = Y_FEATURES[:-1]
if with_jet_idx:
ygen = append_fields(ygen, "jet_idx", np.zeros(ygen["typ"].shape, dtype=np.float32))
ycand = append_fields(
ycand,
"jet_idx",
np.zeros(ycand["typ"].shape, dtype=np.float32),
)
y_features = Y_FEATURES
ygen["jet_idx"] = np.zeros(len(ygen["typ"]), dtype=np.float32)
ycand["jet_idx"] = np.zeros(len(ycand["typ"]), dtype=np.float32)

Xelem_flat = np.stack(
[Xelem[k].view(np.float32).data for k in X_FEATURES],
axis=-1,
Xelem_flat = ak.to_numpy(
np.stack(
[Xelem[k] for k in X_FEATURES],
axis=-1,
)
)
ygen_flat = np.stack(
[ygen[k].view(np.float32).data for k in y_features],
axis=-1,
ygen_flat = ak.to_numpy(
np.stack(
[ygen[k] for k in Y_FEATURES],
axis=-1,
)
)
ycand_flat = np.stack(
[ycand[k].view(np.float32).data for k in y_features],
axis=-1,
ycand_flat = ak.to_numpy(
np.stack(
[ycand[k] for k in Y_FEATURES],
axis=-1,
)
)

# take care of outliers
# Xelem_flat[np.isnan(Xelem_flat)] = 0
# Xelem_flat[np.abs(Xelem_flat) > 1e4] = 0
# ygen_flat[np.isnan(ygen_flat)] = 0
# ygen_flat[np.abs(ygen_flat) > 1e4] = 0
# ycand_flat[np.isnan(ycand_flat)] = 0
# ycand_flat[np.abs(ycand_flat) > 1e4] = 0

X = Xelem_flat
ycand = ycand_flat
ygen = ygen_flat
Expand All @@ -224,13 +185,13 @@ def prepare_data_cms(fn, with_jet_idx=True):
cumsum = np.cumsum(valid) - 1
_, index_mapping = np.unique(cumsum, return_index=True)

pt = ygen[valid, y_features.index("pt")]
eta = ygen[valid, y_features.index("eta")]
pt = ygen[valid, Y_FEATURES.index("pt")]
eta = ygen[valid, Y_FEATURES.index("eta")]
phi = np.arctan2(
ygen[valid, y_features.index("sin_phi")],
ygen[valid, y_features.index("cos_phi")],
ygen[valid, Y_FEATURES.index("sin_phi")],
ygen[valid, Y_FEATURES.index("cos_phi")],
)
e = ygen[valid, y_features.index("e")]
e = ygen[valid, Y_FEATURES.index("e")]
vec = vector.awk(ak.zip({"pt": pt, "eta": eta, "phi": phi, "e": e}))

# cluster jets, sort jet indices in descending order by pt
Expand All @@ -249,8 +210,8 @@ def prepare_data_cms(fn, with_jet_idx=True):
jet_constituents = [
index_mapping[idx] for idx in constituent_idx[jet_idx]
] # map back to constituent index *before* masking
ygen[jet_constituents, y_features.index("jet_idx")] = jet_idx + 1 # jet index starts from 1
ycand[jet_constituents, y_features.index("jet_idx")] = jet_idx + 1
ygen[jet_constituents, Y_FEATURES.index("jet_idx")] = jet_idx + 1 # jet index starts from 1
ycand[jet_constituents, Y_FEATURES.index("jet_idx")] = jet_idx + 1

Xs.append(X)
ygens.append(ygen)
Expand Down
3 changes: 2 additions & 1 deletion mlpf/heptfds/cms_pf/qcd.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
class CmsPfQcd(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for cms_pf_qcd dataset."""

VERSION = tfds.core.Version("1.7.0")
VERSION = tfds.core.Version("1.7.1")
RELEASE_NOTES = {
"1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle",
"1.3.1": "Remove PS again",
Expand All @@ -30,6 +30,7 @@ class CmsPfQcd(tfds.core.GeneratorBasedBuilder):
"1.5.1": "Remove outlier caps",
"1.6.0": "Regenerate with ARRAY_RECORD",
"1.7.0": "Add cluster shape vars",
"1.7.1": "Increase stats to 400k events",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_qcd ~/tensorflow_datasets/
Expand Down
3 changes: 2 additions & 1 deletion mlpf/heptfds/cms_pf/ttbar.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
class CmsPfTtbar(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for cms_pf dataset."""

VERSION = tfds.core.Version("1.7.0")
VERSION = tfds.core.Version("1.7.1")
RELEASE_NOTES = {
"1.0.0": "Initial release.",
"1.1.0": "Add muon type, fix electron GSF association",
Expand All @@ -33,6 +33,7 @@ class CmsPfTtbar(tfds.core.GeneratorBasedBuilder):
"1.5.1": "Remove outlier caps",
"1.6.0": "Regenerate with ARRAY_RECORD",
"1.7.0": "Add cluster shape vars",
"1.7.1": "Increase stats to 400k events",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ttbar ~/tensorflow_datasets/
Expand Down
3 changes: 2 additions & 1 deletion mlpf/heptfds/cms_pf/ztt.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
class CmsPfZtt(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for cms_pf_ztt dataset."""

VERSION = tfds.core.Version("1.7.0")
VERSION = tfds.core.Version("1.7.1")
RELEASE_NOTES = {
"1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle",
"1.3.1": "Remove PS again",
Expand All @@ -30,6 +30,7 @@ class CmsPfZtt(tfds.core.GeneratorBasedBuilder):
"1.5.1": "Remove outlier caps",
"1.6.0": "Regenerate with ARRAY_RECORD",
"1.7.0": "Add cluster shape vars",
"1.7.1": "Increase stats to 400k events",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ztt ~/tensorflow_datasets/
Expand Down
Loading

0 comments on commit 8420a5d

Please sign in to comment.