Commit

Merge branch 'update_lp' of https://github.com/rkansal47/HHbbVV into update_lp
rkansal47 committed Aug 1, 2024
2 parents 82dc7cc + e61540f commit 52a2e79
Showing 59 changed files with 629 additions and 358 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -197,3 +197,4 @@ src/HHbbVV/postprocessing/templates_old
src/HHbbVV/postprocessing/outs

paper/plots
temp
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -20,7 +20,7 @@ repos:
rev: "v4.6.0"
hooks:
- id: check-added-large-files
args: ["--maxkb=2000"]
args: ["--maxkb=10000"]
- id: check-case-conflict
- id: check-merge-conflict
- id: check-symlinks
14 changes: 14 additions & 0 deletions README.md
@@ -227,6 +227,14 @@ Or just signal:
python src/condor/submit.py --year 2017 --tag $TAG --samples HH --subsamples GluGluToHHTobbVV_node_cHHH1 --processor skimmer --submit
```


Submitting jobs for all the signals to save only their Lund plane densities:

```bash
for year in 2016APV 2016 2017 2018; do
    python src/condor/submit_from_yaml.py --year $year --tag 24Jul24LundPlaneDensity \
        --processor skimmer --git-branch update_lp \
        --yaml src/condor/submit_configs/skimmer_24_07_24_signal_lp.yaml \
        --site ucsd --submit --no-save-skims --no-inference
done
```


### TaggerInputSkimmer

Applies a loose pre-selection cut, saves ntuples with training inputs.
@@ -313,6 +321,12 @@ In `src/HHbbVV/postprocessing`:
python BDTPreProcessing.py --data-dir "../../../../data/skimmer/Feb24/" --signal-data-dir "../../../../data/skimmer/Jun10/" --plot-dir "../../../plots/BDTPreProcessing/$TAG/" --year "2017" --bdt-data (--control-plots)
```

Running inference with a trained model, e.g.:

```bash
python src/HHbbVV/postprocessing/BDTPreProcessing.py --no-save-data --inference \
    --bdt-preds-dir temp/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta/ \
    --data-dir temp --year 2016 --sig-samples HHbbVV --bg-keys "" --no-data --no-do-jshifts
```
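What the `--inference` flag does is shown in the `BDTPreProcessing.py` diff below: it loads the trained XGBoost classifier and evaluates it on the preprocessed events. A minimal sketch of that step, assuming a DataFrame `bdt_events` holding the BDT input variables (the model path is the script's default `--bdt-model` value):

```python
# Sketch only: `bdt_events` is assumed to be a pandas DataFrame with the
# BDT training variables as columns, in the order the model expects.
import xgboost as xgb

model = xgb.XGBClassifier()
model.load_model(
    "src/HHbbVV/postprocessing/bdt_models/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta.model"
)
preds = model.predict_proba(bdt_events)  # multiclass: one probability per class per event
```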

### BDT Trainings

```bash
(35 changed binary files not shown)
3 changes: 1 addition & 2 deletions src/HHbbVV/hh_vars.py
@@ -259,8 +259,7 @@
("lp_sf_pt_extrap_vars", 100),
("lp_sf_sys_down", 1),
("lp_sf_sys_up", 1),
("lp_sf_dist_down", 1),
("lp_sf_dist_up", 1),
("lp_sf_dist", 1),
("lp_sf_np_down", 1),
("lp_sf_np_up", 1),
("lp_sf_double_matched_event", 1),
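Each entry in this list pairs a scale-factor branch with the number of stored variations, so the change above collapses the two distortion variations into a single `lp_sf_dist` branch. An illustrative expansion of such pairs into flat column names (not necessarily how the repo consumes them):

```python
# Illustrative only: expand (branch, num_variations) pairs into column names.
lp_sf_vars = [
    ("lp_sf_pt_extrap_vars", 100),
    ("lp_sf_sys_down", 1),
    ("lp_sf_sys_up", 1),
    ("lp_sf_dist", 1),
    ("lp_sf_np_down", 1),
    ("lp_sf_np_up", 1),
]

columns = []
for branch, num in lp_sf_vars:
    # multi-variation branches get an index suffix; single ones keep their name
    columns += [f"{branch}_{i}" for i in range(num)] if num > 1 else [branch]

print(len(columns))  # 105
```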
66 changes: 57 additions & 9 deletions src/HHbbVV/postprocessing/BDTPreProcessing.py
@@ -1,14 +1,18 @@
from __future__ import annotations

import argparse
import warnings
from collections import OrderedDict
from copy import copy
from pathlib import Path

import pandas as pd
import postprocessing
import TrainBDT
import utils
from pandas.errors import SettingWithCopyWarning

from HHbbVV import run_utils
from HHbbVV.hh_vars import (
BDT_sample_order,
jec_shifts,
@@ -69,20 +73,27 @@ def main(args):

bdt_data_dir = args.data_dir / "bdt_data"
bdt_data_dir.mkdir(exist_ok=True)
save_bdt_data(
events_dict, bb_masks, BDT_sample_order, bdt_data_dir / f"{args.year}_bdt_data.parquet"
)

for key in copy(BDT_sample_order):
if key not in all_samples:
BDT_sample_order.remove(key)

bdt_events_dict = get_bdt_data(events_dict, bb_masks, BDT_sample_order)

if args.save_data:
save_bdt_data(
bdt_events_dict, BDT_sample_order, bdt_data_dir / f"{args.year}_bdt_data.parquet"
)

if args.inference:
        run_inference(args.year, bdt_events_dict, args.bdt_preds_dir, args.do_jshifts, args.bdt_model)


def save_bdt_data(
def get_bdt_data(
events_dict: dict[str, pd.DataFrame],
bb_masks: dict[str, pd.DataFrame],
BDT_sample_order: str,
out_file: Path,
):
import pyarrow as pa
import pyarrow.parquet as pq

jec_jmsr_vars = []

for var in BDT_data_vars:
@@ -103,6 +114,13 @@ def save_bdt_data(
events["Dataset"] = key
bdt_events_dict.append(events)

return bdt_events_dict


def save_bdt_data(bdt_events_dict: list[pd.DataFrame], BDT_sample_order: list[str], out_file: Path):
import pyarrow as pa
import pyarrow.parquet as pq

print("Saving BDT data to", out_file)

bdt_events = pd.concat(bdt_events_dict, axis=0)
@@ -117,7 +135,37 @@ def save_bdt_data(
f.write(str(sample_order_dict))


def run_inference(
    year: str,
    bdt_events_dict: list[pd.DataFrame],
    bdt_preds_dir: str,
    do_jshifts: bool,
    bdt_model: str,
):
    import xgboost as xgb

    model = xgb.XGBClassifier()
    model.load_model(bdt_model)

bdt_events = pd.concat(bdt_events_dict, axis=0)

TrainBDT.do_inference_year(
model,
bdt_preds_dir,
year,
bdt_events,
TrainBDT.AllTaggerBDTVars,
jec_jmsr_shifts=do_jshifts,
multiclass=True,
)


if __name__ == "__main__":
args = postprocessing.parse_args()
parser = argparse.ArgumentParser()
run_utils.add_bool_arg(parser, "save-data", default=True, help="save preprocessed data")
run_utils.add_bool_arg(parser, "inference", default=False, help="run inference on data")
parser.add_argument(
"--bdt-model",
default="src/HHbbVV/postprocessing/bdt_models/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta.model",
help="path to BDT model, if running inference",
type=str,
)
args = postprocessing.parse_args(parser)
args.data_dir = Path(args.data_dir)
main(args)
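The `run_utils.add_bool_arg` calls above presumably register paired `--flag`/`--no-flag` options, which is what makes commands like `--no-save-data --inference` in the README work. A sketch of that common pattern, under the assumption the repo's helper behaves the same way:

```python
# Assumed implementation of the paired boolean-flag pattern; the repo's actual
# run_utils.add_bool_arg may differ in details.
import argparse


def add_bool_arg(parser, name, default=False, help=""):
    dest = name.replace("-", "_")
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument(f"--{name}", dest=dest, action="store_true", help=help)
    group.add_argument(f"--no-{name}", dest=dest, action="store_false", help=f"don't {help}")
    parser.set_defaults(**{dest: default})


parser = argparse.ArgumentParser()
add_bool_arg(parser, "save-data", default=True, help="save preprocessed data")
print(parser.parse_args(["--no-save-data"]).save_data)  # False
```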
133 changes: 75 additions & 58 deletions src/HHbbVV/postprocessing/TopAnalysis.ipynb
@@ -14,7 +14,7 @@
"from pathlib import Path\n",
"\n",
"from pandas.errors import SettingWithCopyWarning\n",
"from HHbbVV.hh_vars import data_key\n",
"from HHbbVV.hh_vars import data_key, years\n",
"import postprocessing\n",
"\n",
"# ignore these because they don't seem to apply\n",
@@ -51,7 +51,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_dir = Path(\"../../../plots/ttsfs/24Jul24BLDistortion\")\n",
"plot_dir = Path(\"../../../plots/ttsfs/24Jul29Distortion\")\n",
"plot_dir.mkdir(parents=True, exist_ok=True)"
]
},
@@ -362,7 +362,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tesing distortion uncertainty"
"### Testing distortion uncertainty"
]
},
{
@@ -371,35 +371,55 @@
"metadata": {},
"outputs": [],
"source": [
"import uproot\n",
"from tqdm import tqdm\n",
"import pickle\n",
"\n",
"# package_path = Path(__file__).parent.parent.resolve()\n",
"package_path = Path(\"../\").resolve()\n",
"package_path = Path(\"../\")\n",
"\n",
"for key in [\n",
" \"TTToSemiLeptonic\",\n",
" # \"ST_tW_antitop_5f_NoFullyHadronicDecays\",\n",
" # \"ST_tW_top_5f_NoFullyHadronicDecays\",\n",
" # \"ST_s-channel_4f_leptonDecays\",\n",
" # \"ST_t-channel_antitop_4f_InclusiveDecays\",\n",
" # \"ST_t-channel_top_4f_InclusiveDecays\",\n",
"]:\n",
" sig_lp_hist = utils.get_pickles(f\"{signal_data_dir}/{year}/{key}/pickles\", year, key)[\"lp_hist\"]\n",
"for dist_year in tqdm(years[-1:]):\n",
" f = uproot.open(package_path / f\"corrections/lp_ratios/ratio_{dist_year}.root\")\n",
"\n",
" # remove negatives\n",
" sig_lp_hist.values()[sig_lp_hist.values() < 0] = 0\n",
" # 3D histogram: [subjet_pt, ln(0.8/Delta), ln(kT/GeV)]\n",
" mc_nom = f[\"mc_nom\"].to_numpy()\n",
" ratio_edges = mc_nom[1:]\n",
" mc_nom = mc_nom[0]\n",
"\n",
" with (package_path / f\"corrections/lp_ratios/signals/{year}_{key}.hist\").open(\"wb\") as f:\n",
" pickle.dump(sig_lp_hist, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sig_lp_hist.values()[sig_lp_hist.values() <= 0] = 1"
" mc_tot_pt = np.sum(mc_nom, axis=(1, 2), keepdims=True)\n",
" mc_density = mc_nom / mc_tot_pt\n",
" plotting.plot_lund_plane_six(\n",
" mc_density, ratio_edges, name=f\"{plot_dir}/{dist_year}_MC.pdf\", show=False\n",
" )\n",
"\n",
" # ratio_nom = f[\"ratio_nom\"].to_numpy()[0]\n",
"\n",
" for sig in [\"GluGluToHHTobbVV_node_cHHH1\", \"VBF_HHTobbVV_CV_1_C2V_2_C3_1\", \"TTToSemiLeptonic\"]:\n",
" if sig == \"TTToSemiLeptonic\" and dist_year != \"2018\":\n",
" continue\n",
"\n",
" with (package_path / f\"corrections/lp_ratios/signals/{dist_year}_{sig}.hist\").open(\n",
" \"rb\"\n",
" ) as f:\n",
" sig_lp_hist = pickle.load(f)\n",
"\n",
" sig_tot_pt = np.sum(sig_lp_hist.values(), axis=(1, 2), keepdims=True)\n",
" sig_density = sig_lp_hist.values() / sig_tot_pt\n",
"\n",
" mc_sig_ratio = np.nan_to_num(mc_density / sig_density, nan=1.0)\n",
" mc_sig_ratio[mc_sig_ratio == 0] = 1.0\n",
" mc_sig_ratio = np.clip(mc_sig_ratio, 0.2, 5.0)\n",
"\n",
" plotting.plot_lund_plane_six(\n",
" sig_density, ratio_edges, name=f\"{plot_dir}/{dist_year}_{sig}.pdf\", show=False\n",
" )\n",
" plotting.plot_lund_plane_six(\n",
" mc_sig_ratio,\n",
" ratio_edges,\n",
" name=f\"{plot_dir}/{dist_year}_{sig}_ratio.pdf\",\n",
" show=False,\n",
" )\n",
"\n",
" break"
]
},
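The loop above repeats the same normalize-and-clip computation for each signal: both 3D Lund plane histograms are normalized within each subjet-pT slice, their ratio is taken, and undefined or empty bins fall back to 1 (no correction). A compact restatement of those operations:

```python
import numpy as np


def density_ratio(num, den, lo=0.2, hi=5.0):
    """Clipped ratio of two 3D LP histograms [subjet_pt, ln(0.8/Delta), ln(kT/GeV)],
    each normalized per subjet-pT slice; empty/undefined bins map to 1."""
    num_density = num / np.sum(num, axis=(1, 2), keepdims=True)
    den_density = den / np.sum(den, axis=(1, 2), keepdims=True)
    ratio = np.nan_to_num(num_density / den_density, nan=1.0)
    ratio[ratio == 0] = 1.0
    return np.clip(ratio, lo, hi)


# equivalent to the cell above: mc_sig_ratio = density_ratio(mc_nom, sig_lp_hist.values())
```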
{
@@ -408,9 +428,7 @@
"metadata": {},
"outputs": [],
"source": [
"sig_mc_ratio = np.nan_to_num((sig_lp_hist.values() / sig_tot) / (mc_nom / mc_tot), nan=1.0)\n",
"sig_mc_ratio[sig_mc_ratio == 0] = 1.0\n",
"sig_mc_ratio = np.clip(sig_mc_ratio, 0.5, 2.0)"
"sig_lp_hist.values()"
]
},
{
@@ -419,24 +437,14 @@
"metadata": {},
"outputs": [],
"source": [
"import uproot\n",
"\n",
"# initialize lund plane scale factors lookups\n",
"f = uproot.open(package_path / f\"corrections/lp_ratios/ratio_{year[:4]}.root\")\n",
"import hist\n",
"\n",
"# 3D histogram: [subjet_pt, ln(0.8/Delta), ln(kT/GeV)]\n",
"mc_nom = f[\"mc_nom\"].to_numpy()\n",
"ratio_edges = mc_nom[1:]\n",
"mc_nom = mc_nom[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.imshow(sig_lp_hist.values()[0])"
"h = hist.Hist(\n",
" hist.axis.Variable(ratio_edges[0], name=\"subjet_pt\", label=\"Subjet pT [GeV]\"),\n",
" hist.axis.Variable(ratio_edges[1], name=\"logD\", label=\"ln(0.8/Delta)\"),\n",
" hist.axis.Variable(ratio_edges[2], name=\"logkt\", label=\"ln(kT/GeV)\"),\n",
" storage=hist.storage.Weight(),\n",
")"
]
},
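The `hist.storage.Weight()` storage above keeps both the sum of weights and the sum of squared weights per bin, which is what the `h.variances()` call in the next cell reads out. A quick illustration with stand-in data (random numbers, not the analysis inputs):

```python
import numpy as np

# Stand-in fill, only to show the Weight-storage bookkeeping on `h` from above.
rng = np.random.default_rng(42)
n = 1000
h.fill(
    subjet_pt=rng.uniform(300, 600, n),
    logD=rng.uniform(0.5, 5.5, n),
    logkt=rng.uniform(-3.0, 3.0, n),
    weight=rng.normal(1.0, 0.1, n),
)
print(h.variances().shape)  # same bin shape as h.values(): per-bin sum of w^2
```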
{
@@ -445,7 +453,7 @@
"metadata": {},
"outputs": [],
"source": [
"plt.imshow(mc_nom[0])"
"h.variances()"
]
},
{
@@ -454,8 +462,11 @@
"metadata": {},
"outputs": [],
"source": [
"plt.imshow(sig_mc_ratio_pt[5])\n",
"plt.colorbar()"
"mc_tot = np.sum(mc_nom)\n",
"sig_tot = sig_lp_hist.sum()\n",
"sig_mc_ratio = np.clip(\n",
" np.nan_to_num((sig_lp_hist.values() / sig_tot) / (mc_nom / mc_tot), nan=1), 0.5, 2.0\n",
")"
]
},
{
@@ -464,8 +475,8 @@
"metadata": {},
"outputs": [],
"source": [
"mc_tot = np.sum(mc_nom)\n",
"sig_tot = sig_lp_hist.sum()"
"print(np.mean(mc_sig_ratio_pt, axis=(1, 2)))\n",
"print(np.mean(mc_sig_old_ratio_pt, axis=(1, 2)))"
]
},
{
@@ -476,9 +487,10 @@
"source": [
"sig_tot_pt = np.sum(sig_lp_hist.values(), axis=(1, 2), keepdims=True)\n",
"mc_tot_pt = np.sum(mc_nom, axis=(1, 2), keepdims=True)\n",
"sig_mc_ratio_pt = np.nan_to_num((sig_lp_hist.values() / sig_tot_pt) / (mc_nom / mc_tot_pt), nan=1.0)\n",
"sig_mc_ratio_pt[sig_mc_ratio_pt == 0] = 1.0\n",
"sig_mc_ratio_pt = np.clip(sig_mc_ratio_pt, 0.5, 2.0)"
"mc_sig_ratio_pt = np.nan_to_num((mc_nom / mc_tot_pt) / (sig_lp_hist.values() / sig_tot_pt), nan=1.0)\n",
"mc_sig_ratio_pt[mc_sig_ratio_pt == 0] = 1.0\n",
"mc_sig_ratio_pt = np.clip(mc_sig_ratio_pt, 0.5, 2.0)\n",
"plt.imshow(mc_sig_ratio_pt[0])"
]
},
{
@@ -487,9 +499,14 @@
"metadata": {},
"outputs": [],
"source": [
"sig_mc_ratio = np.clip(\n",
" np.nan_to_num((sig_lp_hist.values() / sig_tot) / (mc_nom / mc_tot), nan=1), 0.5, 2.0\n",
")"
"sig_tot_pt = np.sum(sig_old_lp_hist.values(), axis=(1, 2), keepdims=True)\n",
"mc_tot_pt = np.sum(mc_nom, axis=(1, 2), keepdims=True)\n",
"mc_sig_old_ratio_pt = np.nan_to_num(\n",
" (mc_nom / mc_tot_pt) / (sig_old_lp_hist.values() / sig_tot_pt), nan=1.0\n",
")\n",
"mc_sig_old_ratio_pt[mc_sig_old_ratio_pt == 0] = 1.0\n",
"mc_sig_old_ratio_pt = np.clip(mc_sig_old_ratio_pt, 0.5, 2.0)\n",
"plt.imshow(mc_sig_old_ratio_pt[0])"
]
},
