diff --git a/src/HHbbVV/postprocessing/InferenceAnalysis.ipynb b/src/HHbbVV/postprocessing/InferenceAnalysis.ipynb index 44c16a78..8e158d1f 100644 --- a/src/HHbbVV/postprocessing/InferenceAnalysis.ipynb +++ b/src/HHbbVV/postprocessing/InferenceAnalysis.ipynb @@ -58,8 +58,8 @@ "metadata": {}, "outputs": [], "source": [ - "# plot_dir = f\"{MAIN_DIR}/plots/TaggerAnalysis/23Aug28\"\n", - "plot_dir = MAIN_DIR / \"plots/BDT/24Apr9\"\n", + "plot_dir = MAIN_DIR / \"plots/TaggerAnalysis/24May20\"\n", + "# plot_dir = MAIN_DIR / \"plots/BDT/24Apr9\"\n", "plot_dir.mkdir(parents=True, exist_ok=True)\n", "\n", "samples_dir = f\"{MAIN_DIR}/../data/skimmer/24Mar14UpdateData\"\n", @@ -129,12 +129,28 @@ "metadata": {}, "outputs": [], "source": [ + "# (column name, number of subcolumns)\n", + "load_columns = [\n", + " (\"weight\", 1),\n", + " (\"weight_noTrigEffs\", 1),\n", + " (\"ak8FatJetPt\", 2),\n", + " (\"ak8FatJetParticleNetMass\", 2),\n", + " (\"VVFatJetParTMD_THWWvsT\", 1),\n", + " (\"VVFatJetParTMD_probHWW4q\", 1),\n", + " (\"VVFatJetParTMD_probHWW3q\", 1),\n", + " (\"VVFatJetParTMD_probQCD\", 1),\n", + " (\"VVFatJetParTMD_probT\", 1),\n", + " # (\"VVFatJetParticleNet_Th4q\", 2),\n", + "]\n", + "\n", "# Both Jet's Regressed Mass above 50, electron veto\n", - "events_dict = utils.load_samples(\n", + "events_dict = postprocessing.load_samples(\n", " samples_dir,\n", - " {sig_key: samples[sig_key] for sig_key in nonres_sig_keys},\n", + " {key: samples[key] for key in nonres_sig_keys + [\"QCD\", \"TT\"]},\n", " year,\n", " filters=postprocessing.load_filters,\n", + " columns=utils.format_columns(load_columns),\n", + " variations=False,\n", ")" ] }, @@ -144,144 +160,9 @@ "metadata": {}, "outputs": [], "source": [ - "# (column name, number of subcolumns)\n", - "save_columns = [\n", - " (\"weight\", 1),\n", - " (\"ak8FatJetPt\", 2),\n", - " (\"ak8FatJetMsd\", 2),\n", - " (\"ak8FatJetParTMD_THWW4q\", 2),\n", - " (\"ak8FatJetParTMD_probHWW4q\", 2),\n", - " (\"ak8FatJetParTMD_probHWW3q\", 2),\n", - " (\"ak8FatJetParTMD_probQCD\", 2),\n", - " (\"ak8FatJetParTMD_probT\", 2),\n", - " (\"ak8FatJetParticleNet_Th4q\", 2),\n", - "]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Signal Processing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for sig_key, events in list(events_dict.items()):\n", - " sig_dict = {}\n", - " masks = events[\"ak8FatJetHVV\"].astype(bool)\n", - "\n", - " for column, num_idx in save_columns:\n", - " if num_idx == 1:\n", - " sig_dict[column] = np.tile(events[column].values, 2)[masks]\n", - " else:\n", - " sig_dict[column] = np.nan_to_num(events[column].values[masks], copy=True, nan=0)\n", - "\n", - " events_dict[sig_key] = sig_dict" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Background Processing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "full_samples_list = listdir(f\"{samples_dir}/{year}\")\n", - "\n", - "# reformat into (\"column name\", \"idx\") format for reading multiindex columns\n", - "bg_column_labels = []\n", - "for key, num_columns in save_columns:\n", - " for i in range(num_columns):\n", - " bg_column_labels.append(f\"('{key}', '{i}')\")\n", - "\n", - "\n", - "bg_keys = [\"TT\", \"QCD\"]\n", - "# bg_keys = [\"QCD\"]\n", - "\n", - "for bg_key in bg_keys:\n", - " events_dict[bg_key] = {}\n", - " for sample in full_samples_list:\n", - " if bg_key not in sample:\n", - " continue\n", - "\n", - " # doesn't have probT for some reason\n", - " if sample in [\"QCD_HT300to500\", \"QCD_HT200to300\"]:\n", - " continue\n", - "\n", - " if \"HH\" in sample or \"GluGluH\" in sample:\n", - " continue\n", - "\n", - " if not exists(f\"{samples_dir}/{year}/{sample}/parquet\"):\n", - " print(f\"No parquet file for {sample}\")\n", - " continue\n", - "\n", - " print(sample)\n", - "\n", - " with utils.timer():\n", - " events = pd.read_parquet(\n", - " f\"{samples_dir}/{year}/{sample}/parquet\",\n", - " columns=bg_column_labels,\n", - " )\n", - "\n", - " pickles_path = f\"{samples_dir}/{year}/{sample}/pickles\"\n", - " n_events = utils.get_nevents(pickles_path, year, sample)\n", - " events[\"weight\"] /= n_events\n", - "\n", - " for var, num_idx in save_columns:\n", - " if num_idx == 1:\n", - " values = np.tile(events[var].values, 2).reshape(-1)\n", - " else:\n", - " values = np.reshape(events[var].values, -1)\n", - "\n", - " if var in events_dict[bg_key]:\n", - " events_dict[bg_key][var] = np.concatenate(\n", - " (events_dict[bg_key][var], values), axis=0\n", - " )\n", - " else:\n", - " events_dict[bg_key][var] = values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print weighted sample yields\n", - "for sample in events_dict:\n", - " tot_weight = np.sum(events_dict[sample][\"weight\"])\n", - " print(f\"Pre-selection {sample} yield: {tot_weight:.2f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for sample, events in events_dict.items():\n", - " if \"ak8FatJetParTMD_THWWvsT\" not in events:\n", - " events[\"ak8FatJetParTMD_THWWvsT\"] = (\n", - " events[\"ak8FatJetParTMD_probHWW3q\"] + events[\"ak8FatJetParTMD_probHWW4q\"]\n", - " ) / (\n", - " events[\"ak8FatJetParTMD_probHWW3q\"]\n", - " + events[\"ak8FatJetParTMD_probHWW4q\"]\n", - " + events[\"ak8FatJetParTMD_probQCD\"]\n", - " + events[\"ak8FatJetParTMD_probT\"]\n", - " )" + "cutflow = pd.DataFrame(index=list(events_dict.keys()))\n", + "utils.add_to_cutflow(events_dict, \"Preselection\", \"finalWeight\", cutflow)\n", + "cutflow" ] }, { @@ -311,7 +192,7 @@ "\"\"\"\n", "\n", "pt_key = \"Pt\"\n", - "msd_key = \"Msd\"\n", + "msd_key = \"ParticleNetMass\"\n", "var_prefix = \"ak8FatJet\"\n", "\n", "cutvars_dict = {\"Pt\": \"pt\", \"Msd\": \"msoftdrop\"}\n", @@ -322,7 +203,7 @@ " # {pt_key: [300, 1500], msd_key: [110, 140]},\n", "]\n", "\n", - "var_labels = {pt_key: \"pT\", msd_key: \"mSD\"}\n", + "var_labels = {pt_key: r\"$p_T$\", msd_key: r\"$m_{Reg}$\"}\n", "\n", "cuts_dict = {}\n", "cut_labels = {} # labels for plot titles, formatted as \"var1label: [min, max] var2label...\"\n", @@ -363,19 +244,19 @@ "outputs": [], "source": [ "plot_vars = {\n", - " \"th4q\": {\n", - " \"title\": \"ParticleNet Non-MD Th4q\",\n", - " \"score_label\": \"ak8FatJetParticleNet_Th4q\",\n", - " \"colour\": \"orange\",\n", - " },\n", - " \"thvv4q\": {\n", - " \"title\": \"ParT MD THVV\",\n", - " \"score_label\": \"ak8FatJetParTMD_THWW4q\",\n", - " \"colour\": \"green\",\n", - " },\n", + " # \"th4q\": {\n", + " # \"title\": \"ParticleNet Non-MD Th4q\",\n", + " # \"score_label\": \"ak8FatJetParticleNet_Th4q\",\n", + " # \"colour\": \"orange\",\n", + " # },\n", + " # \"thvv4q\": {\n", + " # \"title\": \"ParT MD THVV\",\n", + " # \"score_label\": \"ak8FatJetParTMD_THWW4q\",\n", + " # \"colour\": \"green\",\n", + " # },\n", " \"thvv4qt\": {\n", - " \"title\": \"ParT MD THVV\",\n", - " \"score_label\": \"ak8FatJetParTMD_THWWvsT\",\n", + " \"title\": r\"GloParT $T_{HVV}$\",\n", + " \"score_label\": \"VVFatJetParTMD_THWWvsT\",\n", " \"colour\": \"green\",\n", " },\n", "}" @@ -434,18 +315,19 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.metrics import roc_curve, auc\n", + "from sklearn.metrics import roc_curve, auc, integrate\n", "\n", "rocs = {}\n", "# sig_key = \"HHbbVV\"\n", "bg_keys = [\"TT\", \"QCD\"]\n", - "bg_skip = 4\n", + "bg_skip = 1\n", "\n", "\n", "for cutstr in cut_labels:\n", " # print(cutstr)\n", " rocs[cutstr] = {}\n", - " for sig_key in tqdm(nonres_sig_keys + res_sig_keys):\n", + " # for sig_key in tqdm(nonres_sig_keys + res_sig_keys):\n", + " for sig_key in tqdm(nonres_sig_keys):\n", " rocs[cutstr][sig_key] = {}\n", " sig_cut = cuts_dict[sig_key][cutstr]\n", " bg_cuts = [cuts_dict[bg_key][cutstr] for bg_key in bg_keys]\n", @@ -465,7 +347,6 @@ " for bg_key, bg_cut in zip(bg_keys, bg_cuts)\n", " ],\n", " )\n", - " # print(weights[np.sum(sig_cut):])\n", "\n", " for t, pvars in plot_vars.items():\n", " score_label = pvars[\"score_label\"]\n", @@ -526,10 +407,22 @@ "]\n", "\n", "sig_splits = [\n", - " [\"HHbbVV\"] + [f\"X[{mX}]->H(bb)Y[{mY}](WW)\" for (mX, mY) in mps] for mps in sig_split_points\n", + " [\"HHbbVV\"] # + [f\"X[{mX}]->H(bb)Y[{mY}](WW)\" for (mX, mY) in mps] for mps in sig_split_points\n", "]" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "roc = rocs[cutstr][sig_key][t]\n", + "roc_auc = integrate.trapz(y=roc[\"tpr\"], x=roc[\"fpr\"])\n", + "print(\"AUC:\", roc_auc)\n", + "plotting.rocCurve(roc[\"fpr\"], roc[\"tpr\"], show=True, plot_dir=plot_dir, name=\"THVV\", auc=roc_auc)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/src/HHbbVV/postprocessing/PostProcess.ipynb b/src/HHbbVV/postprocessing/PostProcess.ipynb index fd72a2a4..a34b3d17 100644 --- a/src/HHbbVV/postprocessing/PostProcess.ipynb +++ b/src/HHbbVV/postprocessing/PostProcess.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -82,8 +82,9 @@ "MAIN_DIR = Path(\"../../../\")\n", "samples_dir = MAIN_DIR / \"../data/skimmer/24Mar14UpdateData\"\n", "year = \"2018\"\n", + "bdt_preds_dir = samples_dir / \"24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta/inferences\"\n", "\n", - "date = \"24Apr11\"\n", + "date = \"24May16\"\n", "plot_dir = MAIN_DIR / f\"plots/PostProcessing/{date}/\"\n", "templates_dir = f\"templates/{date}\"\n", "_ = os.system(f\"mkdir -p {plot_dir}\")\n", @@ -106,7 +107,7 @@ "# add both VBF keys just in case (the unused one will be removed below)\n", "nonres_samples = {\n", " \"HHbbVV\": \"GluGluToHHTobbVV_node_cHHH1\",\n", - " # \"qqHH_CV_1_C2V_0_kl_1_HHbbVV\": \"VBF_HHTobbVV_CV_1_C2V_0_C3_1\",\n", + " \"qqHH_CV_1_C2V_0_kl_1_HHbbVV\": \"VBF_HHTobbVV_CV_1_C2V_0_C3_1\",\n", " # \"qqHH_CV_1_C2V_2_kl_1_HHbbVV\": \"VBF_HHTobbVV_CV_1_C2V_2_C3_1\",\n", "}\n", "\n", @@ -143,52 +144,6 @@ "cutflow" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "events_dict[\"QCD\"][\"weight_pileupIDUp\"][0].to_numpy() / events_dict[\"QCD\"][\"finalWeight\"].to_numpy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.hist(\n", - " events_dict[\"QCD\"][\"weight_pileupIDUp\"].to_numpy()\n", - " / events_dict[\"QCD\"][\"finalWeight\"].to_numpy()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "events_dict[\"QCD\"].columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cutflow" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -349,6 +304,52 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check mVV after BDT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for bdt_cut in [0.0, 0.5, 0.9, 0.99, 0.995]:\n", + " sel, _ = utils.make_selection(\n", + " {\"BDTScore\": [bdt_cut, CUT_MAX_VAL]},\n", + " events_dict,\n", + " bb_masks,\n", + " )\n", + "\n", + " hists = postprocessing.control_plots(\n", + " events_dict,\n", + " bb_masks,\n", + " [\"HHbbVV\", \"qqHH_CV_1_C2V_0_kl_1_HHbbVV\"],\n", + " [\n", + " ShapeVar(\n", + " var=\"VVFatJetParticleNetMass\",\n", + " label=r\"$m^{VV}_{reg}$ (GeV)\",\n", + " bins=[20, 50, 250],\n", + " significance_dir=\"bin\",\n", + " )\n", + " ],\n", + " plot_dir / f\"ControlPlots/{year}/\",\n", + " year,\n", + " cutstr=f\"bdtcut_{bdt_cut}_\",\n", + " title=rf\"$BDT_{{ggF}}$ ≥ {bdt_cut}\" if bdt_cut != 0.0 else None,\n", + " hists={},\n", + " bg_keys=bg_keys,\n", + " selection=sel,\n", + " sig_scale_dict={\"HHbbVV\": 1},\n", + " combine_pdf=False,\n", + " show=True,\n", + " log=False,\n", + " )" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -532,21 +533,12 @@ "plt.legend()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list(events_dict[\"HHbbVV\"].columns)" - ] - }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Templates" + "## Templates" ] }, { diff --git a/src/HHbbVV/postprocessing/plotting.py b/src/HHbbVV/postprocessing/plotting.py index 384c18ea..20c5f60e 100644 --- a/src/HHbbVV/postprocessing/plotting.py +++ b/src/HHbbVV/postprocessing/plotting.py @@ -791,6 +791,7 @@ def rocCurve( ylim=None, plot_dir="", name="", + show: bool = False, ): """Plots a ROC curve""" if ylim is None: @@ -815,14 +816,22 @@ def rocCurve( plt.xlabel("Signal efficiency") plt.ylabel("Background efficiency") plt.title(title) + plt.grid(which="major") if auc is not None: plt.legend() plt.xlim(*xlim) plt.ylim(*ylim) - hep.cms.label(data=False, rlabel="") - plt.savefig(f"{plot_dir}/{name}.pdf", bbox_inches="tight") + hep.cms.label(data=False, rlabel="(13 TeV)") + + if len(name): + plt.savefig(plot_dir / f"{name}.pdf", bbox_inches="tight") + + if show: + plt.show() + else: + plt.close() def _find_nearest(array, value): diff --git a/src/HHbbVV/postprocessing/postprocessing.py b/src/HHbbVV/postprocessing/postprocessing.py index fc4c08d9..f2b82cbb 100644 --- a/src/HHbbVV/postprocessing/postprocessing.py +++ b/src/HHbbVV/postprocessing/postprocessing.py @@ -1394,6 +1394,7 @@ def control_plots( weight_key: str = "finalWeight", hists: dict = None, cutstr: str = "", + title: str = None, sig_splits: list[list[str]] = None, bg_keys: list[str] = bg_keys, selection: dict[str, np.ndarray] = None, @@ -1472,6 +1473,7 @@ def control_plots( plot_sig_keys, bg_keys, name=name, + title=title, sig_scale_dict=tsig_scale_dict if not log else None, plot_significance=plot_significance, significance_dir=shape_var.significance_dir, diff --git a/src/HHbbVV/postprocessing/utils.py b/src/HHbbVV/postprocessing/utils.py index 51f98557..67b39238 100644 --- a/src/HHbbVV/postprocessing/utils.py +++ b/src/HHbbVV/postprocessing/utils.py @@ -207,6 +207,18 @@ def add_to_cutflow( ] +def format_columns(columns: list): + """ + Reformat input of (`column name`, `num columns`) into (`column name`, `idx`) format for + reading multiindex columns + """ + ret_columns = [] + for key, num_columns in columns: + for i in range(num_columns): + ret_columns.append(f"('{key}', '{i}')") + return ret_columns + + def getParticles(particle_list, particle_type): """ Finds particles in `particle_list` of type `particle_type`