Merge branch 'main' of https://github.com/rkansal47/HHbbVV
rkansal47 committed May 22, 2024
2 parents 8449160 + 02feab4 commit 013ad1a
Showing 482 changed files with 11,093 additions and 345 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -66,4 +66,4 @@ jobs:
--durations=20
- name: Upload coverage report
uses: codecov/codecov-action@v4.3.0
uses: codecov/codecov-action@v4.4.1
11 changes: 11 additions & 0 deletions README.md
@@ -422,6 +422,17 @@ git clone -b v2.0.0 https://github.com/cms-analysis/CombineHarvester.git Combine
scramv1 b clean; scramv1 b
```

### Packages

To create datacards, you need to use the same cmsenv as above plus these packages:

```bash
pip3 install --upgrade pip
pip3 install rhalphalib
cd /path/to/your/local/HHbbVV/repo
pip3 install -e .
```
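
For a quick sanity check that the install worked, I run something like this (a sketch; it assumes the same cmsenv is active and that the repo installs as the `HHbbVV` package):

```bash
# Confirm both the pip-installed and the editable packages resolve in this environment.
# (Illustrative check only; not part of the datacard workflow itself.)
pip3 show rhalphalib
python3 -c "import rhalphalib, HHbbVV; print('imports OK')"
```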

I also add this to my .bashrc for convenience:

```
4 changes: 1 addition & 3 deletions inference_scans/run_law.sh
@@ -3,7 +3,7 @@

####################################################################################################
# Script for running HH inference 'law' commands
#
#
# Usage:
# 0) snapshot: --snapshot
# 1) limits at point: --limpoint (--vbf) # --vbf runs it for the VBF k2v=0 point
@@ -198,5 +198,3 @@ if [ $impacts = 1 ]; then
--pull-range 3 \
--Snapshot-custom-args="$custom_args"
fi


79 changes: 79 additions & 0 deletions src/HHbbVV/VBF_binder/VBFKinematicsStudyRK.ipynb
@@ -709,6 +709,85 @@
"matching_efficiency(gen_quarks[tsel], sjets)"
]
},
{
"cell_type": "markdown",
"id": "25d84192",
"metadata": {},
"source": [
"Testing more efficient ways of saving the jets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b7e1dfeb",
"metadata": {},
"outputs": [],
"source": [
"jets = vbf_jets[tsel]\n",
"num_jets = 3\n",
"eta_jj_min = 2\n",
"jets = ak.pad_none(jets, num_jets, clip=True)\n",
"eta = jets.eta\n",
"\n",
"etas = []\n",
"i_s = []\n",
"for i in range(num_jets):\n",
" for j in range(i + 1, num_jets):\n",
" etajj = ak.fill_none(np.abs(eta[:, i] - eta[:, j]) >= eta_jj_min, False)\n",
" etas.append(etajj)\n",
" i_s.append([i, j])\n",
"\n",
"inds = np.zeros((len(jets), 2))\n",
"inds[:, 1] += 1\n",
"\n",
"eta_jj_cache = ~etas[0]\n",
"for n in range(1, len(etas)):\n",
" inds[eta_jj_cache * etas[n]] = i_s[n]\n",
" eta_jj_cache = eta_jj_cache * ~etas[n]\n",
"\n",
"i1 = inds[:, 0].astype(int)\n",
"i2 = inds[:, 1].astype(int)\n",
"\n",
"j1 = jets[np.arange(len(jets)), i1]\n",
"j2 = jets[np.arange(len(jets)), i2]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3ca6ce0",
"metadata": {},
"outputs": [],
"source": [
"PAD_VAL = -99999\n",
"num_ak4_jets = 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0de25e6c",
"metadata": {},
"outputs": [],
"source": [
"mask = np.zeros((len(jets), len(jets[0])))\n",
"mask[np.arange(len(jets)), i1] = 1\n",
"mask[np.arange(len(jets)), i2] = 1\n",
"jets[ak.Array(mask.astype(bool))]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "330d7265",
"metadata": {},
"outputs": [],
"source": [
"vbf_jets_ak8_etaminjj = jets[ak.Array(mask.astype(bool))]\n",
"np.reshape(ak.fill_none(vbf_jets_ak8_etaminjj.pt, -PAD_VAL).to_numpy(), (-1, num_ak4_jets))"
]
},
{
"cell_type": "markdown",
"id": "bf5a8917",
41 changes: 23 additions & 18 deletions src/HHbbVV/combine/binder/FTest.ipynb
@@ -13,6 +13,8 @@
"import mplhep as hep\n",
"import matplotlib.ticker as mticker\n",
"from pathlib import Path\n",
"from HHbbVV.postprocessing import utils\n",
"from scipy import stats\n",
"\n",
"plt.style.use(hep.style.CMS)\n",
"hep.style.use(\"CMS\")\n",
@@ -29,7 +31,7 @@
"source": [
"MAIN_DIR = Path(\"../../../../\")\n",
"\n",
"plot_dir = MAIN_DIR / \"plots/FTests/24Apr10ggF\"\n",
"plot_dir = MAIN_DIR / \"plots/FTests/24Apr26NonresBDT995\"\n",
"plot_dir.mkdir(exist_ok=True, parents=True)"
]
},
@@ -43,6 +45,13 @@
" return np.mean(toy_ts >= data_ts)\n",
"\n",
"\n",
"def p_value_fit(data_ts: float, df: float):\n",
"\"\"\"get p-value from chi^2 fit rather than toy values\"\"\"\n",
"x = np.linspace(0.01, 100, 1000)\n",
"cdf = stats.chi2.cdf(x, df)\n",
"return 1 - cdf[utils.find_nearest(x, data_ts)]\n",
"\n",
"\n",
"def F_statistic(\n",
" ts_low: List[float],\n",
" ts_high: list[float],\n",
@@ -71,8 +80,9 @@
"metadata": {},
"outputs": [],
"source": [
"eos_cards_dir = \"/eos/uscms/store/user/rkansal/bbVV/cards/f_tests/24Apr10ggFMP9965/\"\n",
"local_cards_dir = \"/uscms/home/rkansal/hhcombine/cards/f_tests/24Apr10ggFMP9965/\"\n",
"eos_cards_dir = \"/eos/uscms/store/user/rkansal/bbVV/cards/f_tests/24Apr26NonresBDT995/ggf/\"\n",
"local_cards_dir = \"/uscms/home/rkansal/hhcombine/cards/f_tests/24Apr26NonresBDT995/ggf/\"\n",
"# test_orders = [0, 1, 2, 3, 4, 5]\n",
"test_orders = [0, 1, 2]\n",
"test_statistics = {}\n",
"\n",
@@ -85,7 +95,7 @@
"\n",
" # test statistics for toys generated by (o1, o2) order model\n",
" file = uproot.concatenate(\n",
" f\"{eos_cards_dir}/nTF_{nTF}/higgsCombineToys{tlabel}Seed*44.GoodnessOfFit.mH125.*44.root\"\n",
" f\"{eos_cards_dir}/nTF_{nTF}/higgsCombineToys{tlabel}Seed*4.GoodnessOfFit.mH125.*4.root\"\n",
" )\n",
" tdict[\"toys\"][tflabel] = np.array(file[\"limit\"])\n",
"\n",
@@ -164,9 +174,6 @@
"metadata": {},
"outputs": [],
"source": [
"from scipy import stats\n",
"\n",
"\n",
"def plot_tests(\n",
" data_ts: float,\n",
" toy_ts: np.ndarray,\n",
@@ -182,6 +189,7 @@
" # plot_min = min(np.min(toy_ts), data_ts, 0)\n",
" plot_min = 0\n",
" pval = p_value(data_ts, toy_ts)\n",
" res = None\n",
"\n",
" plt.figure(figsize=(12, 8))\n",
" h = plt.hist(\n",
@@ -191,14 +199,14 @@
" histtype=\"step\",\n",
" label=f\"{len(toy_ts)} Toys\",\n",
" )\n",
" plt.axvline(data_ts, color=\"#FF502E\", linestyle=\":\", label=rf\"Data ($p$-value = {pval:.2f})\")\n",
"\n",
" if fit is not None:\n",
" x = np.linspace(plot_min + 0.01, plot_max, 100)\n",
"\n",
" if fit == \"chi2\":\n",
" res = stats.fit(stats.chi2, toy_ts, [(0, 200)])\n",
" pdf = stats.chi2.pdf(x, res.params.df)\n",
" pval = p_value_fit(data_ts, res.params.df)\n",
" label = rf\"$\\chi^2_{{DoF = {res.params.df:.2f}}}$ Fit\"\n",
" elif fit == \"f\":\n",
" pdf = stats.f.pdf(x, 1, fdof2)\n",
@@ -215,6 +223,8 @@
" label=label,\n",
" )\n",
"\n",
" plt.axvline(data_ts, color=\"#FF502E\", linestyle=\":\", label=rf\"Data ($p$-value = {pval:.2f})\")\n",
"\n",
" hep.cms.label(\n",
" \"Work in Progress\",\n",
" data=True,\n",
@@ -227,7 +237,9 @@
" plt.ylabel(\"Number of Toys\")\n",
" plt.xlabel(\"Test Statistics\")\n",
"\n",
" plt.savefig(f\"{plot_dir}/{name}.pdf\", bbox_inches=\"tight\")"
" plt.savefig(f\"{plot_dir}/{name}.pdf\", bbox_inches=\"tight\")\n",
"\n",
" return res"
]
},
{
@@ -244,11 +256,11 @@
"metadata": {},
"outputs": [],
"source": [
"o1 = 1 # order being tested\n",
"o1 = 2 # order being tested\n",
"tlabel = f\"{o1}\"\n",
"\n",
"data_ts, toy_ts = test_statistics[tlabel][\"data\"][tlabel], test_statistics[tlabel][\"toys\"][tlabel]\n",
"plot_tests(data_ts, toy_ts, \"gof\" + tlabel, fit=\"chi2\", bins=20)\n",
"res = plot_tests(data_ts, toy_ts, \"gof\" + tlabel, fit=\"chi2\", bins=20)\n",
"\n",
"ord1 = o1 + 1\n",
"tflabel = f\"{ord1}\"\n",
@@ -288,13 +300,6 @@
" plot_tests(data_ts, toy_ts, f\"f{tlabel}_{tflabel}\", title=f\"({o1}, {o2}) vs. ({ord1}, {ord2})\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
2 changes: 1 addition & 1 deletion src/HHbbVV/combine/binder/PlotScan.ipynb
@@ -56,7 +56,7 @@
"# scan_bdt_wps = [0.6, 0.9, 0.96, 0.99, 0.997, 0.998, 0.999]\n",
"\n",
"scan_txbb_wps = [\"MP\", \"HP\"]\n",
"scan_bdt_wps = [0.9, 0.98, 0.995, 0.996, 0.9965, 0.998]\n",
"scan_bdt_wps = [0.9, 0.98, 0.995, 0.9965, 0.998]\n",
"\n",
"scan_lepton_veto = [\"Hbb\"]\n",
"scan_thww_wps = [0.4, 0.6, 0.8, 0.9, 0.94, 0.96, 0.98]\n",
18 changes: 17 additions & 1 deletion src/HHbbVV/combine/run_blinded.sh
@@ -35,6 +35,7 @@
workspace=0
bfit=0
limits=0
toylimits=0
significance=0
dfit=0
dfit_asimov=0
@@ -52,7 +53,7 @@ mintol=0.1 # --cminDefaultMinimizerTolerance
nonresggf=1
nonresvbf=1

options=$(getopt -o "wblsdrgti" --long "workspace,bfit,limits,significance,dfit,dfitasimov,resonant,noggf,novbf,gofdata,goftoys,impactsi,impactsf:,impactsc:,bias:,seed:,numtoys:,mintol:" -- "$@")
options=$(getopt -o "wblsdrgti" --long "workspace,bfit,limits,significance,dfit,dfitasimov,toylimits,resonant,noggf,novbf,gofdata,goftoys,impactsi,impactsf:,impactsc:,bias:,seed:,numtoys:,mintol:" -- "$@")
eval set -- "$options"

while true; do
@@ -66,6 +67,9 @@ while true; do
-l|--limits)
limits=1
;;
--toylimits)
toylimits=1
;;
-s|--significance)
significance=1
;;
@@ -311,6 +315,18 @@ if [ $limits = 1 ]; then
fi


if [ $toylimits = 1 ]; then
echo "Expected limits (MC Unblinded) using toys"
combine -M HybridNew --LHCmode LHC-limits --saveHybridResult -m 125 -n "" -d ${wsm_snapshot}.root --snapshotName MultiDimFit -v 9 \
${unblindedparams},r=0 -s "$seed" --bypassFrequentistFit --rAbsAcc 5.0 -T 100 --clsAcc 10 \
--floatParameters "${freezeparamsblinded},r" --toysFrequentist --expectedFromGrid 0.500 2>&1 | tee $outsdir/ToysLimits.txt

# combine -M HybridNew --LHCmode LHC-limits --singlePoint 0 --saveHybridResult -m 125 -n "" -d ${wsm_snapshot}.root --snapshotName MultiDimFit -v 9 --saveToys \
# ${unblindedparams},r=0 -s "$seed" --bypassFrequentistFit --rAbsAcc 1.0 -T 100 --clsAcc 10 \
# --floatParameters "${freezeparamsblinded},r" --toysFrequentist 2>&1 | tee $outsdir/ToysLimitsSP.txt
fi


if [ $significance = 1 ]; then
echo "Expected significance (MC Unblinded)"
combine -M Significance -d ${wsm_snapshot}.root -n "" --significance -m 125 --snapshotName MultiDimFit -v 9 \
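
For reference, a hedged usage sketch for the new `--toylimits` option, using only flag names visible in the getopt string above (the working directory, flag combination, and seed value are illustrative assumptions):

```bash
# Hypothetical invocation: build the workspace and the background-only snapshot fit
# first, since the HybridNew toy-limits step reads the MultiDimFit snapshot, then
# run the new toy-based expected-limit step.
cd src/HHbbVV/combine
./run_blinded.sh --workspace --bfit --toylimits --seed 42
```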