Skip to content

Commit

Permalink
Update validation scripts to CMSSW_14_1_0 (#323)
Browse files Browse the repository at this point in the history
* update validation for cmssw 14

* it's running

* update dqm for cmssw 14

* update with link

* update recipe

* added runtime plot notebook
  • Loading branch information
jpata authored May 28, 2024
1 parent f45c9bd commit 0791d61
Show file tree
Hide file tree
Showing 9 changed files with 346 additions and 140 deletions.
40 changes: 22 additions & 18 deletions mlpf/data_cms/README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
## Validation data

### ACAT 2022
The MLPF CMSSW results presented at ACAT can be reproduced using the MINIAOD samples from
```
gfal-copy -r root://xrootd.hep.kbfi.ee:1094//store/user/jpata/mlpf/results/acat2022 ./
gfal-copy -r root://xrootd.hep.kbfi.ee:1094//store/user/jpata/mlpf/results/ ./
```

See below for the steps to reproduce these samples.

The resulting plots can be found at:
```
https://jpata.web.cern.ch/jpata/mlpf/results/acat2022_20221004_model40M_revalidation20240523/
https://jpata.web.cern.ch/jpata/mlpf/results/acat2022_20221004_model40M_revalidation_CMSSW14_20240527/
```

## Code setup
Expand All @@ -20,30 +20,35 @@ The following should work on lxplus.
voms-proxy-init -voms cms -valid 192:00
voms-proxy-info
#Initialize SLC7
cmssw-el7
#Initialize EL8
cmssw-el8
export SCRAM_ARCH=slc7_amd64_gcc10
cmsrel CMSSW_12_3_0_pre6
cd CMSSW_12_3_0_pre6/src
export SCRAM_ARCH=el8_amd64_gcc12
cmsrel CMSSW_14_1_0_pre3
cd CMSSW_14_1_0_pre3/src
cmsenv
git cms-init
#checkout the MLPF code
git-cms-merge-topic jpata:pfanalysis_caloparticle
#set the directories we want to check out
echo "/Configuration/Generator/" >> .git/info/sparse-checkout
echo "/IOMC/ParticleGuns/" >> .git/info/sparse-checkout
echo "/RecoParticleFlow/PFProducer/" >> .git/info/sparse-checkout
echo "/Validation/RecoParticleFlow/" >> .git/info/sparse-checkout
#check out the version from the 2022 release
git checkout mlpf_acat2022
#checkout the CMSSW code
git remote add jpata https://github.com/jpata/cmssw.git
git fetch -a jpata
git checkout pfanalysis_caloparticle_CMSSW_14_1_0_pre3_acat2022
#compile
scram b -j4
#download the MLPF model
mkdir -p src/RecoParticleFlow/PFProducer/data/mlpf/
wget https://huggingface.co/jpata/particleflow/resolve/main/cms/acat2022_20221004_model40M/dev.onnx?download=true -O RecoParticleFlow/PFProducer/data/mlpf/dev.onnx
#download the latest MLPF model
mkdir -p RecoParticleFlow/PFProducer/data/mlpf/
wget https://huggingface.co/jpata/particleflow/resolve/main/cms/2024_05_16_attn_model21M/onnx/mlpf_21M_attn2x6x512_bs40_relu_tt_qcd_zh400k_checkpoint25_1xa100_fp32_fused.onnx?download=true -O RecoParticleFlow/PFProducer/data/mlpf/mlpf_21M_attn2x6x512_bs40_relu_tt_qcd_zh400k_checkpoint25_1xa100_fp32_fused.onnx
# must be b786aa6de49b51f703c87533a66326d6
md5sum RecoParticleFlow/PFProducer/data/mlpf/dev.onnx
# must be 57d334c9a5eaa9eb5f1c2708e0fbc5e0
md5sum RecoParticleFlow/PFProducer/data/mlpf/mlpf_21M_attn2x6x512_bs40_relu_tt_qcd_zh400k_checkpoint25_1xa100_fp32_fused.onnx
```

## Running MLPF in CMSSW
Expand Down Expand Up @@ -97,7 +102,6 @@ cd particleflow
./scripts/cmssw/validation_job.sh mlpf scripts/cmssw/qcd_pu.txt QCD_PU 1
./scripts/cmssw/validation_job.sh pf scripts/cmssw/qcd_pu.txt QCD_PU 1
```
Note: the input dataset is only stored on `T2_EE_Estonia`, therefore depending on grid accessibility, it might be slow to access.

The MINIAOD output will be in `$CMSSW_BASE/out/QCD_PU_mlpf` and `$CMSSW_BASE/out/QCD_PU_pf`.

Expand Down
3 changes: 2 additions & 1 deletion mlpf/heptfds/cms_pf/cms_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
"mu",
]
CLASS_NAMES_LONG_CMS = [
"none" "charged hadron",
"none",
"charged hadron",
"neutral hadron",
"hfem",
"hfhad",
Expand Down
249 changes: 249 additions & 0 deletions notebooks/cms/cms-runtimes.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "edf63d45-5656-4b3f-8cd3-244aad8853b1",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c08aa3bd-5ccf-4317-b357-0f91f1a08812",
"metadata": {},
"outputs": [],
"source": [
"s1 = \"\"\"\n",
"timing/gpu_fp32_fused.txt:Nelem=2560 mean_time=6.99 ms stddev_time=2.89 ms mem_used=1678 MB\n",
"timing/gpu_fp32_fused.txt:Nelem=5120 mean_time=16.59 ms stddev_time=0.15 ms mem_used=1946 MB\n",
"timing/gpu_fp32_fused.txt:Nelem=10240 mean_time=53.13 ms stddev_time=0.23 ms mem_used=1946 MB\n",
"\"\"\"\n",
"\n",
"s2 = \"\"\"\n",
"timing/gpu_fp32_unfused.txt:Nelem=2560 mean_time=39.31 ms stddev_time=1.73 ms mem_used=3817 MB\n",
"timing/gpu_fp32_unfused.txt:Nelem=5120 mean_time=130.18 ms stddev_time=6.52 ms mem_used=12407 MB\n",
"timing/gpu_fp32_unfused.txt:Nelem=10240 mean_time=465.09 ms stddev_time=25.82 ms mem_used=46766 MB\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8bac11cf-3b44-4156-97bf-7efc7a2a6da4",
"metadata": {},
"outputs": [],
"source": [
"def parse_str(s):\n",
" data = pd.DataFrame()\n",
" Nelem = []\n",
" mean_time = []\n",
" stddev_time = []\n",
" mem_used = []\n",
" for line in s.strip().split(\"\\n\"):\n",
" elems = line.split(\":\")[1].split()\n",
" print(elems)\n",
" Nelem.append(int(elems[0].split(\"=\")[1]))\n",
" mean_time.append(float(elems[1].split(\"=\")[1]))\n",
" stddev_time.append(float(elems[3].split(\"=\")[1]))\n",
" mem_used.append(float(elems[5].split(\"=\")[1]))\n",
" data[\"Nelem\"] = Nelem\n",
" data[\"mean_time\"] = mean_time\n",
" data[\"stddev_time\"] = stddev_time\n",
" data[\"mem_used\"] = mem_used\n",
" return data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "012a42e2-210e-48af-8419-9cde7acc53c4",
"metadata": {},
"outputs": [],
"source": [
"data_fused = parse_str(s1)\n",
"data_unfused = parse_str(s2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c27f336-174e-430b-9035-80ad60c74e1b",
"metadata": {},
"outputs": [],
"source": [
"plt.errorbar(data_unfused[\"Nelem\"], data_unfused[\"mean_time\"], yerr=data_unfused[\"stddev_time\"], marker=\"o\", label=\"ONNX unfused attention\")\n",
"plt.errorbar(data_fused[\"Nelem\"], data_fused[\"mean_time\"], yerr=data_fused[\"stddev_time\"], marker=\"o\", label=\"ONNX fused attention\")\n",
"plt.xticks(data_fused[\"Nelem\"])\n",
"plt.ylabel(\"Runtime per event [ms]\")\n",
"plt.xlabel(\"Elements per event\")\n",
"plt.title(\"MLPF runtime, 2x6 layers, ONNX backend, A100\")\n",
"plt.legend(loc=\"best\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b05b10e9-acca-4c71-945e-8568ac9ac3b5",
"metadata": {},
"outputs": [],
"source": [
"plt.errorbar(data_unfused[\"Nelem\"], data_unfused[\"mem_used\"], marker=\"o\", label=\"ONNX unfused attention\")\n",
"plt.errorbar(data_fused[\"Nelem\"], data_fused[\"mem_used\"], marker=\"o\", label=\"ONNX fused attention\")\n",
"plt.xticks(data_fused[\"Nelem\"])\n",
"plt.ylabel(\"GPU memory used [MB]\")\n",
"plt.xlabel(\"Elements per event\")\n",
"plt.title(\"MLPF memory, 2x6 layers, ONNX backend, A100\")\n",
"plt.yscale(\"log\")\n",
"plt.legend(loc=\"best\")\n",
"ytick = [1000,2000,10000,20000,40000]\n",
"plt.yticks(ytick, ytick)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30df457b-bd9a-4dd4-a6ea-156dac41b7eb",
"metadata": {},
"outputs": [],
"source": [
"s_cpu_pf = \"\"\"\n",
"log_cpu_pf.txt:TimeModule> 35002 1 particleFlowTmp PFProducer 0.00893436\n",
"log_cpu_pf.txt:TimeModule> 35005 1 particleFlowTmp PFProducer 0.00696006\n",
"log_cpu_pf.txt:TimeModule> 35001 1 particleFlowTmp PFProducer 0.0205714\n",
"log_cpu_pf.txt:TimeModule> 35004 1 particleFlowTmp PFProducer 0.0115013\n",
"log_cpu_pf.txt:TimeModule> 35003 1 particleFlowTmp PFProducer 0.010012\n",
"log_cpu_pf.txt:TimeModule> 35006 1 particleFlowTmp PFProducer 0.00605446\n",
"log_cpu_pf.txt:TimeModule> 35010 1 particleFlowTmp PFProducer 0.0122532\n",
"log_cpu_pf.txt:TimeModule> 35009 1 particleFlowTmp PFProducer 0.0221017\n",
"log_cpu_pf.txt:TimeModule> 35008 1 particleFlowTmp PFProducer 0.00843328\n",
"log_cpu_pf.txt:TimeModule> 35011 1 particleFlowTmp PFProducer 0.0095517\n",
"log_cpu_pf.txt:TimeModule> 35012 1 particleFlowTmp PFProducer 0.00850458\n",
"log_cpu_pf.txt:TimeModule> 35014 1 particleFlowTmp PFProducer 0.0196761\n",
"log_cpu_pf.txt:TimeModule> 35007 1 particleFlowTmp PFProducer 0.00726191\n",
"log_cpu_pf.txt:TimeModule> 35017 1 particleFlowTmp PFProducer 0.0126049\n",
"log_cpu_pf.txt:TimeModule> 35018 1 particleFlowTmp PFProducer 0.00476037\n",
"log_cpu_pf.txt:TimeModule> 35013 1 particleFlowTmp PFProducer 0.0111422\n",
"log_cpu_pf.txt:TimeModule> 35016 1 particleFlowTmp PFProducer 0.0135155\n",
"log_cpu_pf.txt:TimeModule> 35019 1 particleFlowTmp PFProducer 0.00631518\n",
"log_cpu_pf.txt:TimeModule> 35015 1 particleFlowTmp PFProducer 0.00879818\n",
"log_cpu_pf.txt:TimeModule> 35021 1 particleFlowTmp PFProducer 0.0111998\n",
"\"\"\"\n",
"\n",
"s_cpu_mlpf = \"\"\"\n",
"log_cpu.txt:TimeModule> 35002 1 particleFlowTmp MLPFProducer 9.4116\n",
"log_cpu.txt:TimeModule> 35005 1 particleFlowTmp MLPFProducer 8.02389\n",
"log_cpu.txt:TimeModule> 35001 1 particleFlowTmp MLPFProducer 13.4437\n",
"log_cpu.txt:TimeModule> 35004 1 particleFlowTmp MLPFProducer 10.4151\n",
"log_cpu.txt:TimeModule> 35003 1 particleFlowTmp MLPFProducer 12.1385\n",
"log_cpu.txt:TimeModule> 35006 1 particleFlowTmp MLPFProducer 7.06085\n",
"log_cpu.txt:TimeModule> 35010 1 particleFlowTmp MLPFProducer 12.1508\n",
"log_cpu.txt:TimeModule> 35009 1 particleFlowTmp MLPFProducer 13.2121\n",
"log_cpu.txt:TimeModule> 35008 1 particleFlowTmp MLPFProducer 10.3394\n",
"log_cpu.txt:TimeModule> 35011 1 particleFlowTmp MLPFProducer 9.24309\n",
"log_cpu.txt:TimeModule> 35012 1 particleFlowTmp MLPFProducer 9.26367\n",
"log_cpu.txt:TimeModule> 35014 1 particleFlowTmp MLPFProducer 13.2224\n",
"log_cpu.txt:TimeModule> 35007 1 particleFlowTmp MLPFProducer 8.03034\n",
"log_cpu.txt:TimeModule> 35017 1 particleFlowTmp MLPFProducer 12.1319\n",
"log_cpu.txt:TimeModule> 35018 1 particleFlowTmp MLPFProducer 5.83649\n",
"log_cpu.txt:TimeModule> 35013 1 particleFlowTmp MLPFProducer 11.9684\n",
"log_cpu.txt:TimeModule> 35016 1 particleFlowTmp MLPFProducer 10.2273\n",
"log_cpu.txt:TimeModule> 35019 1 particleFlowTmp MLPFProducer 6.9992\n",
"log_cpu.txt:TimeModule> 35015 1 particleFlowTmp MLPFProducer 7.96592\n",
"log_cpu.txt:TimeModule> 35021 1 particleFlowTmp MLPFProducer 11.9789\n",
"\"\"\"\n",
"\n",
"s_gpu_mlpf = \"\"\"\n",
"log_gpu.txt:TimeModule> 35002 1 particleFlowTmp MLPFProducer 0.177305\n",
"log_gpu.txt:TimeModule> 35005 1 particleFlowTmp MLPFProducer 0.0156437\n",
"log_gpu.txt:TimeModule> 35001 1 particleFlowTmp MLPFProducer 0.0187983\n",
"log_gpu.txt:TimeModule> 35004 1 particleFlowTmp MLPFProducer 0.0158696\n",
"log_gpu.txt:TimeModule> 35003 1 particleFlowTmp MLPFProducer 0.0171756\n",
"log_gpu.txt:TimeModule> 35006 1 particleFlowTmp MLPFProducer 0.0125436\n",
"log_gpu.txt:TimeModule> 35010 1 particleFlowTmp MLPFProducer 0.0167758\n",
"log_gpu.txt:TimeModule> 35009 1 particleFlowTmp MLPFProducer 0.0184546\n",
"log_gpu.txt:TimeModule> 35008 1 particleFlowTmp MLPFProducer 0.0161449\n",
"log_gpu.txt:TimeModule> 35011 1 particleFlowTmp MLPFProducer 0.0146528\n",
"log_gpu.txt:TimeModule> 35012 1 particleFlowTmp MLPFProducer 0.0149266\n",
"log_gpu.txt:TimeModule> 35014 1 particleFlowTmp MLPFProducer 0.0183499\n",
"log_gpu.txt:TimeModule> 35007 1 particleFlowTmp MLPFProducer 0.0130879\n",
"log_gpu.txt:TimeModule> 35017 1 particleFlowTmp MLPFProducer 0.0170359\n",
"log_gpu.txt:TimeModule> 35018 1 particleFlowTmp MLPFProducer 0.0111724\n",
"log_gpu.txt:TimeModule> 35013 1 particleFlowTmp MLPFProducer 0.0167873\n",
"log_gpu.txt:TimeModule> 35016 1 particleFlowTmp MLPFProducer 0.0162624\n",
"log_gpu.txt:TimeModule> 35019 1 particleFlowTmp MLPFProducer 0.0118865\n",
"log_gpu.txt:TimeModule> 35015 1 particleFlowTmp MLPFProducer 0.0126998\n",
"log_gpu.txt:TimeModule> 35021 1 particleFlowTmp MLPFProducer 0.0169669\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4d38d03-5980-4c05-9320-84f3801d4d4a",
"metadata": {},
"outputs": [],
"source": [
"def parse_cmssw(s):\n",
" s = s.strip()\n",
" times = []\n",
" for line in s.split(\"\\n\"):\n",
" if len(line)>0:\n",
" time = float(line.split()[-1])*1000.0\n",
" times.append(time)\n",
" return np.array(times)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec4031a8-0730-4a52-9918-bb2e0b918e62",
"metadata": {},
"outputs": [],
"source": [
"times_pf_cpu = parse_cmssw(s_cpu_pf)\n",
"times_mlpf_cpu = parse_cmssw(s_cpu_mlpf)\n",
"times_mlpf_gpu = parse_cmssw(s_gpu_mlpf)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b1ce3d3-083d-49b6-a367-79a9376eac1b",
"metadata": {},
"outputs": [],
"source": [
"plt.bar([0,1], [np.mean(times_pf_cpu[1:]), np.mean(times_mlpf_gpu[1:])], yerr=[np.std(times_pf_cpu[1:]), np.std(times_mlpf_gpu[1:])])\n",
"plt.xticks([0,1], [\"PF on CPU\", \"MLPF-2x6x512-relu on GPU\"])\n",
"plt.ylabel(\"runtime per event [ms]\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 0791d61

Please sign in to comment.