feat: Add draft stage of paper
matthewfeickert committed Jun 8, 2024
1 parent 1312327 commit 1580bc8
Showing 18 changed files with 851 additions and 0 deletions.
4 changes: 4 additions & 0 deletions papers/matthew_feickert/acknowledgements.md
@@ -0,0 +1,4 @@
## Acknowledgements

Matthew Feickert, Alexander Held, and Gordon Watts are supported by the U.S. National Science Foundation (NSF) under Cooperative Agreement OAC-1836650 and PHY-2323298 (IRIS-HEP).
Lukas Heinrich and Vangelis Kourlitis are supported by the Excellence Cluster ORIGINS, which is funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Germany's Excellence Strategy - EXC-2094-390783311 and by the German Federal Ministry of Education and Research Project 05H2021 (ErUM-FSP T02) - "Run 3 von ATLAS am LHC: Analysis Infrastructure for the ATLAS Detektor at the LHC".
Binary file added papers/matthew_feickert/banner.png
99 changes: 99 additions & 0 deletions papers/matthew_feickert/code/coffea.py
@@ -0,0 +1,99 @@
import awkward as ak
import hist
import vector
from coffea import processor
from distributed import Client


def get_xsec_weight(sample, infofile):
"""Returns normalization weight for a given sample."""
lumi = 10_000 # pb^-1
xsec_map = infofile.infos[sample] # dictionary with event weighting information
xsec_weight = (lumi * xsec_map["xsec"]) / (xsec_map["sumw"] * xsec_map["red_eff"])
return xsec_weight


def lepton_filter(lep_charge, lep_type):
"""Filters leptons: sum of charges is required to be 0, and sum of lepton types 44/48/52.
Electrons have type 11, muons have 13, so this means 4e/4mu/2e2mu.
"""
sum_lep_charge = ak.sum(lep_charge, axis=1)
sum_lep_type = ak.sum(lep_type, axis=1)
good_lep_type = ak.any(
[sum_lep_type == 44, sum_lep_type == 48, sum_lep_type == 52], axis=0
)
return ak.all([sum_lep_charge == 0, good_lep_type], axis=0)


class HZZAnalysis(processor.ProcessorABC):
"""The coffea processor used in this analysis."""

def __init__(self):
pass

def process(self, events):
        # The process function performs columnar operations on the events
        # passed to it and applies all the corrections and selections to
        # either the simulation or the data (e.g. get_xsec_weight and
        # lepton_filter). All of the event-level selection occurs here,
        # and the function returns accumulators holding the selected data.

vector.register_awkward()
# type of dataset being processed, provided via metadata (comes originally from fileset)
dataset_category = events.metadata["dataset_name"]

# apply a cut to events, based on lepton charge and lepton type
events = events[lepton_filter(events.lep_charge, events.lep_typeid)]

# construct lepton four-vectors
leptons = ak.zip(
{
"pt": events.lep_pt,
"eta": events.lep_eta,
"phi": events.lep_phi,
"energy": events.lep_energy,
},
with_name="Momentum4D",
)

# calculate the 4-lepton invariant mass for each remaining event
# this could also be an expensive calculation using external tools
mllll = (
leptons[:, 0] + leptons[:, 1] + leptons[:, 2] + leptons[:, 3]
).mass / 1000

# create histogram holding outputs, for data just binned in m4l
mllllhist_data = hist.Hist.new.Reg(
num_bins,
bin_edge_low,
bin_edge_high,
name="mllll",
            label=r"$\mathrm{m_{4l}}$ [GeV]",
).Weight() # using weighted storage here for plotting later, but not needed

# three histogram axes for MC: m4l, category, and variation (nominal and
# systematic variations)
mllllhist_MC = (
hist.Hist.new.Reg(
num_bins,
bin_edge_low,
bin_edge_high,
name="mllll",
                label=r"$\mathrm{m_{4l}}$ [GeV]",
)
.StrCat([k for k in fileset.keys() if k != "Data"], name="dataset")
.StrCat(
["nominal", "scaleFactorUP", "scaleFactorDOWN", "m4lUP", "m4lDOWN"],
name="variation",
)
.Weight()
)

# ...
# fill histograms based on dataset_category
# ...

return {"data": mllllhist_data, "MC": mllllhist_MC}

def postprocess(self, accumulator):
pass
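

# A minimal usage sketch of running a processor like HZZAnalysis with coffea's
# Runner API. The fileset layout, the "dataset_name" metadata key, the
# "CollectionTree" tree name for ATLAS PHYSLITE files, and the choice of a
# Dask executor are assumptions here; the actual file lists and the binning
# variables (num_bins, bin_edge_low, bin_edge_high) are defined elsewhere in
# the analysis.
from coffea.nanoevents import PHYSLITESchema

fileset = {
    "Data": {"files": [...], "metadata": {"dataset_name": "Data"}},
    # ... additional simulated datasets ...
}

client = Client()  # distributed client, imported at the top of this file
run = processor.Runner(
    executor=processor.DaskExecutor(client=client),
    schema=PHYSLITESchema,
)
all_histograms = run(
    fileset,
    treename="CollectionTree",
    processor_instance=HZZAnalysis(),
)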
27 changes: 27 additions & 0 deletions papers/matthew_feickert/code/corrections.py
@@ -0,0 +1,27 @@
import awkward as ak

from atlascp import EgammaTools # ATLAS CP tool Python nanobind bindings


def get_corrected_mass(energyCorrectionTool, electrons, sys=None):
electron_vectors = ak.zip(
{
"pt": energyCorrectionTool(electrons, sys=sys).newPt,
"eta": electrons.eta,
"phi": electrons.phi,
"mass": electrons.m,
},
with_name="Momentum4D",
)
return (electron_vectors[:, 0] + electron_vectors[:, 1]).mass / 1000 # GeV


energy_correction_tool = EgammaTools.EgammaCalibrationAndSmearingTool()
# ...
# configure and initialize correction algorithm
# ...
energy_correction_tool.initialize()

corrected_m_Res_UP = get_corrected_mass(
energy_correction_tool, electrons, "Res_up"
).compute()
32 changes: 32 additions & 0 deletions papers/matthew_feickert/code/postfit_plot.py
@@ -0,0 +1,32 @@
import cabinetry
import numpy as np

config = cabinetry.configuration.load("config.yml")

cabinetry.templates.collect(config)
cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing)
workspace = cabinetry.workspace.build(config)

model, data = cabinetry.model_utils.model_and_data(workspace)
fit_results = cabinetry.fit.fit(model, data)

# create post-fit model prediction
postfit_model = cabinetry.model_utils.prediction(model, fit_results=fit_results)

# binning to use in plot
plot_config = {
"Regions": [
{
"Name": "Signal_region",
"Binning": list(np.linspace(bin_edge_low, bin_edge_high, num_bins + 1)),
}
]
}

figure_dict = cabinetry.visualize.data_mc(
postfit_model, data, config=plot_config, save_figure=False
)

# modify x-axis label
fig = figure_dict[0]["figure"]
fig.axes[1].set_xlabel("m4l [GeV]")
12 changes: 12 additions & 0 deletions papers/matthew_feickert/code/prefit_plot.py
@@ -0,0 +1,12 @@
import hist
import mplhep

mplhep.histplot(
all_histograms["data"], histtype="errorbar", color="black", label="Data"
)
hist.Hist.plot1d(
all_histograms["MC"][:, :, "nominal"],
stack=True,
histtype="fill",
color=["purple", "red", "lightblue"],
)
26 changes: 26 additions & 0 deletions papers/matthew_feickert/code/read.py
@@ -0,0 +1,26 @@
from coffea.nanoevents import NanoEventsFactory, PHYSLITESchema


def get_uris_from_cache(): ...


def filter_name(name):
return name in (
"AnalysisElectronsAuxDyn.pt",
"AnalysisElectronsAuxDyn.eta",
"AnalysisElectronsAuxDyn.phi",
"AnalysisElectronsAuxDyn.m",
"AnalysisElectronsAuxDyn.charge",
...,
)


file_uris = get_uris_from_cache(...)

# uproot used internally to read files into Awkward arrays
events_mc = NanoEventsFactory.from_root(
file_uris,
schemaclass=PHYSLITESchema,
uproot_options=dict(filter_name=filter_name),
permit_dask=True,
).events()
5 changes: 5 additions & 0 deletions papers/matthew_feickert/conclusions.md
@@ -0,0 +1,5 @@
## Conclusions

When the Higgs boson was discovered in 2012, the idea of being able to perform real Pythonic data analysis in HEP, let alone _efficient_ analysis, was viewed as unfeasible.
Through investment in the broader Scientific Python ecosystem and development of the domain-specific pieces in the Scikit-HEP organization, the field of particle physics has successfully created a PyHEP ecosystem of robust tooling.
Further investment by the ATLAS collaboration has resulted in new, performant tooling for complex systematic corrections that allows more complete and complex operations to be performed entirely within a Python workflow, helping to further reduce the time to insight for physics analysts.
Binary file added papers/matthew_feickert/figures/postfit_plot.png
Binary file added papers/matthew_feickert/figures/prefit_plot.png
18 changes: 18 additions & 0 deletions papers/matthew_feickert/introduction.md
@@ -0,0 +1,18 @@
## Introduction

<!-- Here we can talk about the ATLAS experiment, the detector, its purpose, the amount of data collected, etc. -->

The field of high energy physics (HEP) is devoted to the study of the fundamental forces of Nature and their interactions with matter.
To study the structure of the Universe on the smallest scales requires the highest energy density environments possible &mdash; similar to those of the early Universe.
These extreme energy density environments are created at the CERN international laboratory, in Geneva, Switzerland, using the Large Hadron Collider (LHC) to collide bunches of billions of protons at a center-of-mass energy of $\sqrt{s} = 13~\mathrm{TeV}$.
The resulting collisions are recorded with building-sized particle detectors positioned around the LHC's $27~\mathrm{km}$ ring that are designed to measure subatomic particle properties.
Given the rarity of the subatomic phenomena of interest, the rate of the beam crossings is a tremendous $40~\mathrm{MHz}$ to maximize the number of high-quality collisions that can be captured and read out by the detectors.
Even with real-time onboard processing ("triggering") of the detector readout to save only the most interesting collisions, experiments like ATLAS [@PERF-2007-01] still produce multiple petabytes of data per year.
These data are then further filtered, through selection criteria on the topology and kinematic quantities of the recorded particle collision "events", into specialized datasets for different kinds of physics analysis.
The final datasets that physicists use for their physics analyses in ATLAS are still on the order of hundreds of terabytes, which poses challenges of compute scale and of analyst time in using them efficiently while maximizing their physics value.

Traditionally, ATLAS and the other LHC experiments have created experiment-specific custom `C++` frameworks to handle all stages of the data processing pipeline, from the initial construction of high-level physics objects from the raw data to the final statistical analyses.
Motivated by the broad success of the Scientific Python ecosystem across many domains of science, the rise of the Scikit-HEP ecosystem of Pythonic tooling for particle physics [@Rodrigues:2020syo;@henry_schreiner-proc-scipy-2022], and community tools produced by the Institute for Research and Innovation in Software for High Energy Physics (IRIS-HEP) [@S2I2HEPSP;@CWPDOC], there has been a community-driven shift in HEP towards the use of the Scientific Python ecosystem for analysis of physics data &mdash; a PyHEP ecosystem [@PyHEP].
The use of dataframes and array programming for data analysis has enhanced the user experience while providing efficient computations without the need of coding optimized low-level routines.
The ATLAS collaboration has further extended this ecosystem of tooling to include high-level custom Python bindings to the low-level `C++` frameworks using `nanobind` [@nanobind].
Collectively, these tools are modernizing the methods with which researchers engage in data analysis at large scale and providing a novel end-to-end analysis ecosystem for the ATLAS collaboration.
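
As a minimal sketch of this array-programming style (the lepton kinematics, field names, and values below are illustrative assumptions rather than ATLAS data), a four-lepton selection and invariant-mass calculation can be expressed as whole-array operations with `awkward` and `vector`, without explicit Python loops over events:

```python
import awkward as ak
import vector

vector.register_awkward()

# illustrative jagged arrays of lepton kinematics (one variable-length list per event), in GeV
leptons = ak.zip(
    {
        "pt": ak.Array([[45.1, 32.7, 20.3, 15.8], [60.2, 41.0]]),
        "eta": ak.Array([[0.3, -1.2, 2.1, 0.7], [-0.5, 1.8]]),
        "phi": ak.Array([[0.1, 2.9, -1.4, 0.6], [3.0, -2.2]]),
        "mass": ak.Array([[0.000511] * 4, [0.106] * 2]),
    },
    with_name="Momentum4D",
)

# columnar selection: keep only events with at least four leptons
four_lep_events = leptons[ak.num(leptons) >= 4]

# four-lepton invariant mass for each selected event
m4l = (
    four_lep_events[:, 0]
    + four_lep_events[:, 1]
    + four_lep_events[:, 2]
    + four_lep_events[:, 3]
).mass
```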
27 changes: 27 additions & 0 deletions papers/matthew_feickert/main.md
@@ -0,0 +1,27 @@
---
# Ensure that this title is the same as the one in `myst.yml`
title: How the Scientific Python ecosystem helps answer fundamental questions of the Universe
abstract: |
The ATLAS experiment at CERN explores vast amounts of physics data to answer the most fundamental questions of the Universe.
The prevalence of Python in scientific computing motivated ATLAS to adopt it for its data analysis workflows while enhancing users' experience.
  This paper describes to a broad audience how a large scientific collaboration leverages the power of the Scientific Python ecosystem to tackle domain-specific challenges and advance our understanding of the Cosmos.
  Through a simplified example of the renowned Higgs boson discovery, readers will gain insight into how Python libraries are used to discriminate a signal from immense noise through tasks such as data cleaning, feature engineering, statistical interpretation, and visualization at scale.
exports:
- format: pdf
template: eartharxiv
output: exports/scipy_proceedings_draft.pdf
---
:::{include} introduction.md
:::

:::{include} scientific-python-ecosystem.md
:::

:::{include} uncovering-higgs.md
:::

:::{include} conclusions.md
:::

:::{include} acknowledgements.md
:::