diff --git a/papers/matthew_feickert/acknowledgements.md b/papers/matthew_feickert/acknowledgements.md
new file mode 100644
index 0000000000..02f89b5d14
--- /dev/null
+++ b/papers/matthew_feickert/acknowledgements.md
@@ -0,0 +1,4 @@
+## Acknowledgements
+
+Matthew Feickert, Alexander Held, and Gordon Watts are supported by the U.S. National Science Foundation (NSF) under Cooperative Agreement OAC-1836650 and PHY-2323298 (IRIS-HEP).
+Lukas Heinrich and Vangelis Kourlitis are supported by the Excellence Cluster ORIGINS, which is funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Germany's Excellence Strategy - EXC-2094-390783311 and by the German Federal Ministry of Education and Research Project 05H2021 (ErUM-FSP T02) - "Run 3 von ATLAS am LHC: Analysis Infrastructure for the ATLAS Detektor at the LHC".
diff --git a/papers/matthew_feickert/banner.png b/papers/matthew_feickert/banner.png
new file mode 100644
index 0000000000..e6a793bd6c
Binary files /dev/null and b/papers/matthew_feickert/banner.png differ
diff --git a/papers/matthew_feickert/code/coffea.py b/papers/matthew_feickert/code/coffea.py
new file mode 100644
index 0000000000..dc35dad3da
--- /dev/null
+++ b/papers/matthew_feickert/code/coffea.py
@@ -0,0 +1,99 @@
+import awkward as ak
+import hist
+import vector
+from coffea import processor
+from distributed import Client
+
+
+def get_xsec_weight(sample, infofile):
+    """Returns normalization weight for a given sample."""
+    lumi = 10_000  # pb^-1
+    xsec_map = infofile.infos[sample]  # dictionary with event weighting information
+    xsec_weight = (lumi * xsec_map["xsec"]) / (xsec_map["sumw"] * xsec_map["red_eff"])
+    return xsec_weight
+
+
+def lepton_filter(lep_charge, lep_type):
+    """Filters leptons: sum of charges is required to be 0, and sum of lepton types 44/48/52.
+    Electrons have type 11, muons have 13, so this means 4e/4mu/2e2mu.
+    """
+    sum_lep_charge = ak.sum(lep_charge, axis=1)
+    sum_lep_type = ak.sum(lep_type, axis=1)
+    good_lep_type = ak.any(
+        [sum_lep_type == 44, sum_lep_type == 48, sum_lep_type == 52], axis=0
+    )
+    return ak.all([sum_lep_charge == 0, good_lep_type], axis=0)
+
+
+class HZZAnalysis(processor.ProcessorABC):
+    """The coffea processor used in this analysis."""
+
+    def __init__(self):
+        pass
+
+    def process(self, events):
+        # The process method performs columnar operations on the events
+        # passed to it and applies all the corrections and selections to
+        # either the simulation or the data (e.g. get_xsec_weight and
+        # lepton_filter). All of the event-level data selection occurs
+        # here, and accumulators with the selections are returned.
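+        #
+        # (The names num_bins, bin_edge_low, bin_edge_high, and the fileset
+        # dictionary used below are assumed to be defined at module scope in
+        # the full analysis; their definitions are omitted from this excerpt.)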
+
+        vector.register_awkward()
+        # type of dataset being processed, provided via metadata (comes originally from fileset)
+        dataset_category = events.metadata["dataset_name"]
+
+        # apply a cut to events, based on lepton charge and lepton type
+        events = events[lepton_filter(events.lep_charge, events.lep_typeid)]
+
+        # construct lepton four-vectors
+        leptons = ak.zip(
+            {
+                "pt": events.lep_pt,
+                "eta": events.lep_eta,
+                "phi": events.lep_phi,
+                "energy": events.lep_energy,
+            },
+            with_name="Momentum4D",
+        )
+
+        # calculate the 4-lepton invariant mass for each remaining event
+        # this could also be an expensive calculation using external tools
+        mllll = (
+            leptons[:, 0] + leptons[:, 1] + leptons[:, 2] + leptons[:, 3]
+        ).mass / 1000
+
+        # create histogram holding outputs, for data just binned in m4l
+        mllllhist_data = hist.Hist.new.Reg(
+            num_bins,
+            bin_edge_low,
+            bin_edge_high,
+            name="mllll",
+            label=r"$\mathrm{m_{4l}}$ [GeV]",
+        ).Weight()  # using weighted storage here for plotting later, but not needed
+
+        # three histogram axes for MC: m4l, category, and variation (nominal and
+        # systematic variations)
+        mllllhist_MC = (
+            hist.Hist.new.Reg(
+                num_bins,
+                bin_edge_low,
+                bin_edge_high,
+                name="mllll",
+                label=r"$\mathrm{m_{4l}}$ [GeV]",
+            )
+            .StrCat([k for k in fileset.keys() if k != "Data"], name="dataset")
+            .StrCat(
+                ["nominal", "scaleFactorUP", "scaleFactorDOWN", "m4lUP", "m4lDOWN"],
+                name="variation",
+            )
+            .Weight()
+        )
+
+        # ...
+        # fill histograms based on dataset_category
+        # ...
+
+        return {"data": mllllhist_data, "MC": mllllhist_MC}
+
+    def postprocess(self, accumulator):
+        pass
diff --git a/papers/matthew_feickert/code/corrections.py b/papers/matthew_feickert/code/corrections.py
new file mode 100644
index 0000000000..85f543d988
--- /dev/null
+++ b/papers/matthew_feickert/code/corrections.py
@@ -0,0 +1,27 @@
+import awkward as ak
+
+from atlascp import EgammaTools  # ATLAS CP tool Python nanobind bindings
+
+
+def get_corrected_mass(energyCorrectionTool, electrons, sys=None):
+    electron_vectors = ak.zip(
+        {
+            "pt": energyCorrectionTool(electrons, sys=sys).newPt,
+            "eta": electrons.eta,
+            "phi": electrons.phi,
+            "mass": electrons.m,
+        },
+        with_name="Momentum4D",
+    )
+    return (electron_vectors[:, 0] + electron_vectors[:, 1]).mass / 1000  # GeV
+
+
+energy_correction_tool = EgammaTools.EgammaCalibrationAndSmearingTool()
+# ...
+# configure and initialize correction algorithm
+# ...
+energy_correction_tool.initialize()
+
+corrected_m_Res_UP = get_corrected_mass(
+    energy_correction_tool, electrons, "Res_up"
+).compute()
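+
+# (electrons is assumed to have been loaded earlier, e.g. as in read.py;
+# the same pattern gives the nominal mass and the other variations, e.g.
+# get_corrected_mass(energy_correction_tool, electrons).compute())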
diff --git a/papers/matthew_feickert/code/postfit_plot.py b/papers/matthew_feickert/code/postfit_plot.py
new file mode 100644
index 0000000000..1cb4355540
--- /dev/null
+++ b/papers/matthew_feickert/code/postfit_plot.py
@@ -0,0 +1,32 @@
+import cabinetry
+import numpy as np
+
+config = cabinetry.configuration.load("config.yml")
+
+cabinetry.templates.collect(config)
+cabinetry.templates.postprocess(config)  # optional post-processing (e.g. smoothing)
+workspace = cabinetry.workspace.build(config)
+
+model, data = cabinetry.model_utils.model_and_data(workspace)
+fit_results = cabinetry.fit.fit(model, data)
+
+# create post-fit model prediction
+postfit_model = cabinetry.model_utils.prediction(model, fit_results=fit_results)
+
+# binning to use in plot
+plot_config = {
+    "Regions": [
+        {
+            "Name": "Signal_region",
+            "Binning": list(np.linspace(bin_edge_low, bin_edge_high, num_bins + 1)),
+        }
+    ]
+}
+
+figure_dict = cabinetry.visualize.data_mc(
+    postfit_model, data, config=plot_config, save_figure=False
+)
+
+# modify x-axis label
+fig = figure_dict[0]["figure"]
+fig.axes[1].set_xlabel("m4l [GeV]")
diff --git a/papers/matthew_feickert/code/prefit_plot.py b/papers/matthew_feickert/code/prefit_plot.py
new file mode 100644
index 0000000000..c965ea5021
--- /dev/null
+++ b/papers/matthew_feickert/code/prefit_plot.py
@@ -0,0 +1,12 @@
+import hist
+import mplhep
+
+mplhep.histplot(
+    all_histograms["data"], histtype="errorbar", color="black", label="Data"
+)
+hist.Hist.plot1d(
+    all_histograms["MC"][:, :, "nominal"],
+    stack=True,
+    histtype="fill",
+    color=["purple", "red", "lightblue"],
+)
diff --git a/papers/matthew_feickert/code/read.py b/papers/matthew_feickert/code/read.py
new file mode 100644
index 0000000000..e01b2a2955
--- /dev/null
+++ b/papers/matthew_feickert/code/read.py
@@ -0,0 +1,26 @@
+from coffea.nanoevents import NanoEventsFactory, PHYSLITESchema
+
+
+def get_uris_from_cache(): ...
+
+
+def filter_name(name):
+    return name in (
+        "AnalysisElectronsAuxDyn.pt",
+        "AnalysisElectronsAuxDyn.eta",
+        "AnalysisElectronsAuxDyn.phi",
+        "AnalysisElectronsAuxDyn.m",
+        "AnalysisElectronsAuxDyn.charge",
+        ...,
+    )
+
+
+file_uris = get_uris_from_cache(...)
+
+# uproot used internally to read files into Awkward arrays
+events_mc = NanoEventsFactory.from_root(
+    file_uris,
+    schemaclass=PHYSLITESchema,
+    uproot_options=dict(filter_name=filter_name),
+    permit_dask=True,
+).events()
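+
+# with permit_dask=True the returned events are lazy Dask collections:
+# subsequent operations build a task graph that only runs on .compute()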
diff --git a/papers/matthew_feickert/conclusions.md b/papers/matthew_feickert/conclusions.md
new file mode 100644
index 0000000000..ab7c56be9f
--- /dev/null
+++ b/papers/matthew_feickert/conclusions.md
@@ -0,0 +1,5 @@
+## Conclusions
+
+When the Higgs boson was discovered in 2012, the idea of being able to perform real Pythonic data analysis in HEP, let alone _efficient_ analysis, was viewed as unfeasible.
+Through investment in the broader Scientific Python ecosystem, and development of the domain-specific pieces in the Scikit-HEP organization, the field of particle physics successfully created a PyHEP ecosystem of robust tooling.
+Further investment by the ATLAS collaboration has resulted in new performant tooling for complex systematic corrections that will allow more complete and complex operations to be performed entirely within a Python workflow, helping to further reduce the time to insight for physics analysts.
diff --git a/papers/matthew_feickert/figures/Zee_mc_systematics.png b/papers/matthew_feickert/figures/Zee_mc_systematics.png
new file mode 100644
index 0000000000..a373486fa0
Binary files /dev/null and b/papers/matthew_feickert/figures/Zee_mc_systematics.png differ
diff --git a/papers/matthew_feickert/figures/access_layer_diagram.png b/papers/matthew_feickert/figures/access_layer_diagram.png
new file mode 100644
index 0000000000..16914e4110
Binary files /dev/null and b/papers/matthew_feickert/figures/access_layer_diagram.png differ
diff --git a/papers/matthew_feickert/figures/postfit_plot.png b/papers/matthew_feickert/figures/postfit_plot.png
new file mode 100644
index 0000000000..e565917d3e
Binary files /dev/null and b/papers/matthew_feickert/figures/postfit_plot.png differ
diff --git a/papers/matthew_feickert/figures/prefit_plot.png b/papers/matthew_feickert/figures/prefit_plot.png
new file mode 100644
index 0000000000..6dae915444
Binary files /dev/null and b/papers/matthew_feickert/figures/prefit_plot.png differ
diff --git a/papers/matthew_feickert/introduction.md b/papers/matthew_feickert/introduction.md
new file mode 100644
index 0000000000..9588d47255
--- /dev/null
+++ b/papers/matthew_feickert/introduction.md
@@ -0,0 +1,18 @@
+## Introduction
+
+The field of high energy physics (HEP) is devoted to the study of the fundamental forces of Nature and their interactions with matter.
+Studying the structure of the Universe on the smallest scales requires the highest energy density environments possible — similar to those of the early Universe.
+These extreme energy density environments are created at the CERN international laboratory, in Geneva, Switzerland, using the Large Hadron Collider (LHC) to collide bunches of billions of protons at a center-of-mass energy of $\sqrt{s} = 13~\mathrm{TeV}$.
+The resulting collisions are recorded with building-sized particle detectors positioned around the LHC's $27~\mathrm{km}$ ring that are designed to measure subatomic particle properties.
+Given the rarity of the subatomic phenomena of interest, the rate of the beam crossings is a tremendous $40~\mathrm{MHz}$ to maximize the number of high quality collisions that can be captured and read out by the detectors.
+Even with real-time onboard processing ("triggering") of the experiment detector readout to save only the most interesting collisions, detectors like the ATLAS experiment [@PERF-2007-01] still produce multiple petabytes of data per year.
+These data are then further filtered through selection criteria on the topology and kinematic quantities of the particle collision "events" recorded into specialized datasets for different kinds of physics analysis.
+The final datasets that physicists use in their physics analyses in ATLAS are still on the order of hundreds of terabytes, which poses challenges of compute scale and of analyst time to use them efficiently while maximizing physics value.
+
+Traditionally, ATLAS and the other LHC experiments have created experiment-specific custom `C++` frameworks to handle all stages of the data processing pipeline, from the initial construction of high-level physics objects from the raw data to the final statistical analyses.
+Motivated by the broad success of the Scientific Python ecosystem across many domains of science, and the rise of the Scikit-HEP ecosystem of Pythonic tooling for particle physics [@Rodrigues:2020syo;@henry_schreiner-proc-scipy-2022] and community tools produced by the Institute for Research and Innovation in Software for High Energy Physics (IRIS-HEP) [@S2I2HEPSP;@CWPDOC], there has been a community-driven shift in HEP towards use of the Scientific Python ecosystem for analysis of physics data — a PyHEP ecosystem [@PyHEP].
+The use of dataframes and array programming for data analysis has enhanced the user experience while providing efficient computations without the need to code optimized low-level routines.
+The ATLAS collaboration has further extended this ecosystem of tooling to include high-level custom Python bindings to the low-level `C++` frameworks using `nanobind` [@nanobind].
+Collectively, these tools are modernizing the methods by which researchers engage in data analysis at large scale and are providing a novel end-to-end analysis ecosystem for the ATLAS collaboration.
diff --git a/papers/matthew_feickert/main.md b/papers/matthew_feickert/main.md
new file mode 100644
index 0000000000..b73136f2af
--- /dev/null
+++ b/papers/matthew_feickert/main.md
@@ -0,0 +1,27 @@
+---
+# Ensure that this title is the same as the one in `myst.yml`
+title: How the Scientific Python ecosystem helps answer fundamental questions of the Universe
+abstract: |
+  The ATLAS experiment at CERN explores vast amounts of physics data to answer the most fundamental questions of the Universe.
+  The prevalence of Python in scientific computing motivated ATLAS to adopt it for its data analysis workflows while enhancing users' experience.
+  This paper describes to a broad audience how a large scientific collaboration leverages the power of the Scientific Python ecosystem to tackle domain-specific challenges and advance our understanding of the Cosmos.
+  Through a simplified example of the renowned Higgs boson discovery, readers will gain insights into the utilization of Python libraries to discriminate a signal from immense noise, through tasks such as data cleaning, feature engineering, statistical interpretation and visualization at scale.
+exports:
+  - format: pdf
+    template: eartharxiv
+    output: exports/scipy_proceedings_draft.pdf
+---
+:::{include} introduction.md
+:::
+
+:::{include} scientific-python-ecosystem.md
+:::
+
+:::{include} uncovering-higgs.md
+:::
+
+:::{include} conclusions.md
+:::
+
+:::{include} acknowledgements.md
+:::
diff --git a/papers/matthew_feickert/mybib.bib b/papers/matthew_feickert/mybib.bib
new file mode 100644
index 0000000000..91297336d0
--- /dev/null
+++ b/papers/matthew_feickert/mybib.bib
@@ -0,0 +1,388 @@
+# ATLAS
+@article{PERF-2007-01,
+    author = "{ATLAS Collaboration}",
+    title = "{The ATLAS Experiment at the CERN Large Hadron Collider}",
+    journal = "JINST",
+    volume = "3",
+    year = "2008",
+    pages = "S08003",
+    doi = "10.1088/1748-0221/3/08/S08003",
+}
+
+# Scikit-HEP
+@article{Rodrigues:2020syo,
+    author = "Rodrigues, Eduardo and others",
+    editor = "Doglioni, C. and Kim, D. and Stewart, G. A. and Silvestris, L. and Jackson, P. and Kamleh, W.",
+    title = "{The Scikit HEP Project -- overview and prospects}",
+    eprint = "2007.03577",
+    archivePrefix = "arXiv",
+    primaryClass = "physics.comp-ph",
+    doi = "10.1051/epjconf/202024506028",
+    journal = "EPJ Web Conf.",
+    volume = "245",
+    pages = "06028",
+    year = "2020"
+}
+
+# IRIS-HEP
+@article{S2I2HEPSP,
+    author = "Elmer, Peter and Neubauer, Mark and Sokoloff, Michael D.",
+    title = "{Strategic Plan for a Scientific Software Innovation Institute (S2I2) for High Energy Physics}",
+    year = "2017",
+    note = "[arXiv 1712.06592] \url{https://arxiv.org/abs/1712.06592}",
+    archiveprefix = "arXiv",
+    eprint = "1712.06592",
+    primaryclass = "physics.comp-ph",
+}
+
+@article{CWPDOC,
+    author = "Albrecht, Johannes and others",
+    journal = "Comput. Softw. Big Sci.",
+    title = "{A Roadmap for HEP Software and Computing R\&D for the 2020s}",
+    year = "2019",
+    number = "1",
+    pages = "7",
+    volume = "3",
+    archiveprefix = "arXiv",
+    collaboration = "HEP Software Foundation",
+    doi = "10.1007/s41781-018-0018-8",
+    eprint = "1712.06982",
+    primaryclass = "physics.comp-ph",
+    reportnumber = "HSF-CWP-2017-01, HSF-CWP-2017-001, FERMILAB-PUB-17-607-CD",
+}
+
+# PyHEP
+@misc{PyHEP,
+    author = "{HEP Software Foundation}",
+    year = {2024},
+    title = "{Python in HEP (PyHEP) HSF Working Group}",
+    url = {https://hepsoftwarefoundation.org/workinggroups/pyhep.html}
+}
+
+# Scikit-HEP packaging SciPy 2022 proceedings
+@inproceedings{henry_schreiner-proc-scipy-2022,
+    author = {{H}enry {S}chreiner and {J}im {P}ivarski and {E}duardo {R}odrigues},
+    title = {{A}wkward {P}ackaging: building {S}cikit-{H}{E}{P}},
+    booktitle = {{P}roceedings of the 21st {P}ython in {S}cience {C}onference},
+    pages = {115 - 120},
+    year = {2022},
+    editor = {{M}eghann {A}garwal and {C}hris {C}alloway and {D}illon {N}iederhut and {D}avid {S}hupe},
+    doi = {10.25080/majora-212e5952-012}
+}
+
+# nanobind
+@misc{nanobind,
+    author = {Wenzel Jakob},
+    year = {2022},
+    note = {https://github.com/wjakob/nanobind},
+    title = {nanobind: tiny and efficient C++/Python bindings},
+    url = {https://github.com/wjakob/nanobind}
+}
+
+# Awkward Array
+@software{Awkward_Array_zenodo,
+    author = {Pivarski, Jim and Osborne, Ianna and Ifrim, Ioana and Schreiner, Henry and Hollands, Angus and Biswas, Anish and Das, Pratyush and Roy Choudhury, Santam and Smith, Nicholas and Goyal, Manasvi},
+    doi = {10.5281/zenodo.4341376},
+    month = oct,
+    title = {{Awkward Array}},
+    year = {2018}
+}
+
+# uproot
+@software{Uproot_zenodo,
+    author = {Pivarski, Jim and Schreiner, Henry and Hollands, Angus and Das, Pratyush and Kothari, Kush and Roy, Aryan and Ling, Jerry and Smith, Nicholas and Burr, Chris and Stark, Giordon},
+    doi = {10.5281/zenodo.4340632},
+    month = sep,
+    title = {{Uproot}},
+    year = {2017}
+}
+
+# Dask
+@manual{Dask,
+    title = {Dask: Library for dynamic task scheduling},
+    author = {{Dask Development Team}},
+    year = {2016},
+    url = {https://dask.pydata.org},
+}
+
+# dask-awkward
+@software{Dask-awkward,
+    author = "{dask-awkward Development Team}",
+    title = {{dask-awkward}},
+    year = {2022},
+    url = {https://github.com/dask-contrib/dask-awkward},
+}
+
+# boost-histogram
+@software{Boost-histogram_zenodo,
+    author = {Schreiner, Henry and Dembinski, Hans and Goel, Aman and Gohil, Jay and Liu, Shuo and Eschle, Jonas and Maji, Chanchal and Novak, Andrzej and Burr, Chris and Davis, Doug and Lieret, Kilian and Gizdov, Konstantin and Cranmer, Kyle and Feickert, Matthew and Grimaud, Pierre},
+    doi = {10.5281/zenodo.3492034},
+    title = {{boost-histogram}},
+    year = {2018}
+}
+
+# dask-histogram
+@software{Dask-histogram,
+    author = "{dask-histogram Development Team}",
+    title = {{dask-histogram}},
+    year = {2021},
+    url = {https://github.com/dask-contrib/dask-histogram},
+}
+
+# hist
+@software{hist_zenodo,
+    author = {Schreiner, Henry and Liu, Shuo and Goel, Aman},
+    doi = {10.5281/zenodo.4057112},
+    license = {BSD-3-Clause},
+    title = {{hist}},
+    url = {https://github.com/scikit-hep/hist}
+}
+
+# coffea
+@software{coffea_zenodo,
+    author = {Gray, Lindsey and Smith, Nicholas and Novak, Andrzej and Fackeldey, Peter and Tovar, Benjamin and Chen, Yi-Mu and Watts, Gordon and Krommydas, Iason},
+    doi = {10.5281/zenodo.3266454},
+    title = {{coffea}},
+    url = {https://github.com/CoffeaTeam/coffea},
+    year = {2019}
+}
+
+# mplhep
+@software{mplhep_zenodo,
+    author = {Novak, Andrzej and Schreiner, Henry and Feickert, Matthew},
+    doi = {10.5281/zenodo.6807166},
+    license = {MIT},
+    month = apr,
+    title = {{mplhep}},
+    url = {https://github.com/scikit-hep/mplhep},
+    year = {2020}
+}
+
+# pyhf
+@software{pyhf_zenodo,
+    author = {Lukas Heinrich and Matthew Feickert and Giordon Stark},
+    title = "{pyhf: v0.7.6}",
+    version = {0.7.6},
+    doi = {10.5281/zenodo.1169739},
+    url = {https://doi.org/10.5281/zenodo.1169739},
+    note = {https://github.com/scikit-hep/pyhf/releases/tag/v0.7.6}
+}
+
+@article{pyhf_joss,
+    doi = {10.21105/joss.02823},
+    url = {https://doi.org/10.21105/joss.02823},
+    year = {2021},
+    publisher = {The Open Journal},
+    volume = {6},
+    number = {58},
+    pages = {2823},
+    author = {Lukas Heinrich and Matthew Feickert and Giordon Stark and Kyle Cranmer},
+    title = {pyhf: pure-Python implementation of HistFactory statistical models},
+    journal = {Journal of Open Source Software}
+}
+
+# cabinetry
+@software{cabinetry_zenodo,
+    author = {Alexander Held and Matthew Feickert},
+    title = "{cabinetry}",
+    doi = {10.5281/zenodo.4742752},
+    url = {https://doi.org/10.5281/zenodo.4742752},
+    note = {https://github.com/scikit-hep/cabinetry}
+}
+
+# iminuit
+@software{iminuit_zenodo,
+    author = {Dembinski, Hans and Ongmongkolkul, Piti and Deil, Christoph and Schreiner, Henry and Feickert, Matthew and Burr, Chris and Watson, Jason and Rost, Fabian and Pearce, Alex and Geiger, Lukas and Abdelmotteleb, Ahmed and Desai, Aman and Wiedemann, Bernhard M. and Gohlke, Christoph and Sanders, Jeremy and Drotleff, Jonas and Eschle, Jonas and Neste, Ludwig and Gorelli, Marco Edward and Baak, Max and Eliachevitch, Michael and Zapata, Omar},
+    license = {MIT},
+    doi = {10.5281/zenodo.3949207},
+    title = "{iminuit}",
+    url = {https://github.com/scikit-hep/iminuit}
+}
+
+# Columnar analysis with ATLAS data
+@article{Hartmann:2021qzp,
+    author = {Hartmann, Nikolai and Elmsheuser, Johannes and Duckeck, G\"unter},
+    collaboration = "ATLAS Software and Computing",
+    title = "{Columnar data analysis with ATLAS analysis formats}",
+    reportNumber = "ATL-SOFT-PROC-2021-015",
+    doi = "10.1051/epjconf/202125103001",
+    journal = "EPJ Web Conf.",
+    volume = "251",
+    pages = "03001",
+    year = "2021"
+}
A", + volume = "389", + pages = "81--86", + year = "1997" +} + +# Athena +@software{Athena_zenodo, + author = "{ATLAS Collaboration}", + doi = {10.5281/zenodo.4772550}, + title = {{Athena}}, + year = {2021} +} + +# ATLAS software note +@booklet{ATL-SOFT-PUB-2021-001, + author = "{ATLAS Collaboration}", + title = "{The ATLAS Collaboration Software and Firmware}", + howpublished = "{ATL-SOFT-PUB-2021-001}", + url = "https://cds.cern.ch/record/2767187", + year = "2021", +} + +# NumPy +@article{numpy, + author = {Harris, Charles R. and Millman, K. Jarrod and van der Walt, Stéfan J. and Gommers, Ralf and Virtanen, Pauli and Cournapeau, David and Wieser, Eric and Taylor, Julian and Berg, Sebastian and Smith, Nathaniel J. and Kern, Robert and Picus, Matti and Hoyer, Stephan and van Kerkwijk, Marten H. and Brett, Matthew and Haldane, Allan and del Río, Jaime Fernández and Wiebe, Mark and Peterson, Pearu and Gérard-Marchant, Pierre and Sheppard, Kevin and Reddy, Tyler and Weckesser, Warren and Abbasi, Hameer and Gohlke, Christoph and Oliphant, Travis E.}, + publisher = {Springer Science and Business Media {LLC}}, + doi = {https://doi.org/10.1038/s41586-020-2649-2}, + date = {2020-09}, + year = {2020}, + journal = {Nature}, + number = {7825}, + pages = {357--362}, + title = {Array programming with {NumPy}}, + volume = {585}, +} + +# Matplotlib +@article{matplotlib, + abstract = {Matplotlib is a 2D graphics package used for Python for application development, interactive scripting, and publication-quality image generation across user interfaces and operating systems.}, + author = {Hunter, J. D.}, + publisher = {IEEE COMPUTER SOC}, + year = {2007}, + doi = {https://doi.org/10.1109/MCSE.2007.55}, + journal = {Computing in Science \& Engineering}, + number = {3}, + pages = {90--95}, + title = {Matplotlib: A 2D graphics environment}, + volume = {9}, +} + +# ATLAS Higgs discovery +@article{HIGG-2012-27, + author = "{ATLAS Collaboration}", + title = "{Observation of a new particle in the search for the Standard Model Higgs boson with the ATLAS detector at the LHC}", + journal = "Phys. Lett. B", + volume = "716", + year = "2012", + pages = "1", + doi = "10.1016/j.physletb.2012.08.020", + reportNumber = "CERN-PH-EP-2012-218", + eprint = "1207.7214", + archivePrefix = "arXiv", + primaryClass = "hep-ex", +} + +# CMS Higgs discovery +@article{CMS-HIG-12-028, + author = "{CMS Collaboration}", + title = "{Observation of a new boson at a mass of 125 GeV with the CMS experiment at the LHC}", + journal = "Phys. Lett. 
B", + volume = "716", + year = "2012", + pages = "30", + doi = "10.1016/j.physletb.2012.08.021", + reportNumber = "CERN-PH-EP-2012-220", + eprint = "1207.7235", + archivePrefix = "arXiv", + primaryClass = "hep-ex", +} + +# ACAT 2024 poster +@techreport{Kourlitis:2890478, + author = "Kourlitis, Vangelis and Vigl, Matthias and Krumnack, Nils + Erik and Feickert, Matthew and Stark, Giordon Holtsberg and + Heinrich, Lukas Alexander and Watts, Gordon and Held, + Alexander and Hartmann, Nikolai", + title = "{Using Legacy ATLAS C++ Calibration Tools in Modern + Columnar Analysis Environments - Poster for ACAT 2024}", + institution = "CERN", + reportNumber = "ATL-COM-SOFT-2024-014", + address = "Geneva", + year = "2024", + url = "https://cds.cern.ch/record/2890478", + note = "22nd International Workshop on Advanced Computing and + Analysis Techniques in Physics Research, Stony Brook, Us 11 + - 15 Mar 2024", +} + +# ATLAS open data +@misc{ATLAS-open-data, + author = "{ATLAS Collaboration}", + title = "{ATLAS 13 TeV samples collection at least four leptons (electron or muon), for 2020 Open Data release}", + year = "2020", + howpublished = "{CERN Open Data Portal}", + doi = "10.7483/OPENDATA.ATLAS.2Y1T.TLGL", +} + +# These references may be helpful: + +@inproceedings{jupyter, + abstract = {It is increasingly necessary for researchers in all fields to write computer code, and in order to reproduce research results, it is important that this code is published. We present Jupyter notebooks, a document format for publishing code, results and explanations in a form that is both readable and executable. We discuss various tools and use cases for notebook documents.}, + author = {Kluyver, Thomas and Ragan-Kelley, Benjamin and Pérez, Fernando and Granger, Brian and Bussonnier, Matthias and Frederic, Jonathan and Kelley, Kyle and Hamrick, Jessica and Grout, Jason and Corlay, Sylvain and Ivanov, Paul and Avila, Damián and Abdalla, Safia and Willing, Carol and {Jupyter development team}}, + editor = {Loizides, Fernando and Scmidt, Birgit}, + location = {Netherlands}, + publisher = {IOS Press}, + url = {https://eprints.soton.ac.uk/403913/}, + booktitle = {Positioning and Power in Academic Publishing: Players, Agents and Agendas}, + year = {2016}, + pages = {87--90}, + title = {Jupyter Notebooks - a publishing format for reproducible computational workflows}, +} + + +@misc{pandas1, + author = {{The Pandas Development Team}}, + title = {pandas-dev/pandas: Pandas}, + month = feb, + year = {2020}, + publisher = {Zenodo}, + version = {latest}, + url = {https://doi.org/10.5281/zenodo.3509134}, +} + +@inproceedings{pandas2, + author = {Wes McKinney}, + title = {{D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython}, + booktitle = {{P}roceedings of the 9th {P}ython in {S}cience {C}onference}, + pages = {56 - 61}, + year = {2010}, + editor = {{S}t\'efan van der {W}alt and {J}arrod {M}illman}, + doi = {https://doi.org/10.25080/Majora-92bf1922-00a}, +} + +@article{scipy, + author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and + Haberland, Matt and Reddy, Tyler and Cournapeau, David and + Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and + Bright, Jonathan and {van der Walt}, St{\'e}fan J. and + Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and + Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and + Kern, Robert and Larson, Eric and Carey, C J and + Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. 
+
+@article{scipy,
+    author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and Haberland, Matt and Reddy, Tyler and Cournapeau, David and Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and Bright, Jonathan and {van der Walt}, St{\'e}fan J. and Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and Kern, Robert and Larson, Eric and Carey, C J and Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and Harris, Charles R. and Archibald, Anne M. and Ribeiro, Ant{\^o}nio H. and Pedregosa, Fabian and {van Mulbregt}, Paul and {SciPy 1.0 Contributors}},
+    title = {{{SciPy} 1.0: Fundamental Algorithms for Scientific Computing in Python}},
+    journal = {Nature Methods},
+    year = {2020},
+    volume = {17},
+    pages = {261--272},
+    adsurl = {https://rdcu.be/b08Wh},
+    doi = {10.1038/s41592-019-0686-2},
+}
diff --git a/papers/matthew_feickert/myst.yml b/papers/matthew_feickert/myst.yml
new file mode 100644
index 0000000000..fd6f3b1713
--- /dev/null
+++ b/papers/matthew_feickert/myst.yml
@@ -0,0 +1,94 @@
+version: 1
+project:
+  # Update this to match `scipy-2024-<folder>` - the folder should be `<firstname_surname>`
+  id: scipy-2024-matthew_feickert
+  # Ensure your title is the same as in your `main.md`
+  title: How the Scientific Python ecosystem helps answer fundamental questions of the Universe
+  # Authors should have affiliations, emails and ORCIDs if available
+  authors:
+    - name: Matthew Feickert
+      email: matthew.feickert@cern.ch
+      orcid: 0000-0003-4124-7862
+      affiliations:
+        - University of Wisconsin--Madison
+      corresponding: true
+    - name: Nikolai Hartmann
+      email: nikolai.hartmann@cern.ch
+      orcid: 0000-0003-0047-2908
+      affiliations:
+        - Ludwig Maximilians Universität
+    - name: Lukas Heinrich
+      email: lukas.heinrich@cern.ch
+      orcid: 0000-0002-4048-7584
+      affiliations:
+        - Technical University of Munich
+    - name: Alexander Held
+      email: alexander.held@cern.ch
+      orcid: 0000-0002-8924-5885
+      affiliations:
+        - University of Wisconsin--Madison
+    - name: Vangelis Kourlitis
+      email: evangelos.kourlitis@cern.ch
+      orcid: 0000-0001-6568-2047
+      affiliations:
+        - Technical University of Munich
+    - name: Nils Krumnack
+      email: nils.erik.krumnack@cern.ch
+      affiliations:
+        - Iowa State University
+    - name: Giordon Stark
+      email: kratsg@gmail.com
+      orcid: 0000-0001-6616-3433
+      affiliations:
+        - Santa Cruz Institute for Particle Physics
+    - name: Matthias Vigl
+      email: matthias.vigl@cern.ch
+      orcid: 0000-0003-2281-3822
+      affiliations:
+        - Technical University of Munich
+    - name: Gordon Watts
+      email: gwatts@uw.edu
+      orcid: 0000-0002-0753-7308
+      affiliations:
+        - University of Washington
+  keywords:
+    - ATLAS
+    - particle physics
+    - Scikit-HEP
+  # Add the abbreviations that you use in your paper here
+  abbreviations:
+    CP: Combined Performance
+    EDM: Event Data Model
+    HEP: High Energy Physics
+    IRIS-HEP: Institute for Research and Innovation in Software for High Energy Physics
+    LHC: Large Hadron Collider
+  # It is possible to explicitly ignore the `doi-exists` check for certain citation keys
+  error_rules:
+    - rule: doi-exists
+      severity: ignore
+      keys:
+        - S2I2HEPSP
+        - PyHEP
+        - nanobind
+        - ATL-SOFT-PUB-2021-001
+        - Dask
+        - Dask-awkward
+        - Dask-histogram
+        - Kourlitis:2890478
+  # A banner will be generated for you on publication, this is a placeholder
+  banner: banner.png
+  # The rest of the information shouldn't be modified
+  subject: Research Article
+  open_access: true
+  license: CC-BY-4.0
+  venue: Scipy 2024
+  date: 2024-07-10
+  # Exclude source files to keep them from appearing as supporting documents
+  exclude:
+    - introduction.md
+    - scientific-python-ecosystem.md
+    - uncovering-higgs.md
+    - conclusions.md
+    - acknowledgements.md
+site:
+  template: article-theme
diff --git a/papers/matthew_feickert/scientific-python-ecosystem.md b/papers/matthew_feickert/scientific-python-ecosystem.md
new file mode 100644
index 0000000000..e61d21df87
--- /dev/null
+++ b/papers/matthew_feickert/scientific-python-ecosystem.md
@@ -0,0 +1,33 @@
+## Employing the Scientific Python ecosystem
+
+The multiple stages of physics data processing and analysis map onto different parts of the Scientific Python ecosystem.
+This begins with the highly-structured but jagged nature of the event data in HEP.
+The data structure of each event consists of variable-length lists of physics objects (e.g. electrons, collections of tracks from charged objects).
+To study the properties of the physics objects in a statistical manner, a fixed event analysis procedure is repeated over billions of events.
+This has traditionally motivated the use of "event loops" that implicitly construct event-level quantities of interest and leverage the `C++` compiler to produce efficient iterative code.
+This precedent made it difficult to take advantage of the array programming paradigms common in Scientific Python, built on NumPy [@numpy] vector operations.
+The Scikit-HEP library Awkward Array [@Awkward_Array_zenodo] offers a path forward by providing NumPy-like idioms for nested, variable-sized (JSON-like) data and also brings analysts into an array programming paradigm [@Hartmann:2021qzp], as illustrated in the toy example below.
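+
+As a minimal illustration of these idioms, consider per-event lists of lepton transverse momenta (a toy example with made-up values, not analysis data):
+
+```python
+import awkward as ak
+
+# three events containing 2, 0, and 3 leptons, respectively
+lep_pt = ak.Array([[45.1, 31.7], [], [62.0, 24.3, 18.9]])
+
+ak.num(lep_pt, axis=1)  # leptons per event: [2, 0, 3]
+ak.sum(lep_pt, axis=1)  # scalar sum per event: [76.8, 0.0, 105.2]
+lep_pt[lep_pt > 30.0]  # jagged selection: [[45.1, 31.7], [], [62.0]]
+```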
+
+With the ability to operate on HEP data structures in an array programming — or "columnar" — approach, the next step is to be able to read and write with the HEP domain-specific ROOT [@Brun:1997pa] file format.
+This is accomplished with the `uproot` library [@Uproot_zenodo], which allows for efficient transformation of ROOT data to NumPy or Awkward arrays.
+The data is then filtered through kinematic and physics-signature motivated selections using Awkward manipulations and queries to create array collections that contain the passing events.
+Through intense detector characterization and calibration efforts, the ATLAS collaboration has developed robust methods and tooling to apply corrections to the data and evaluate systematic uncertainties.
+One example is corrections to the signal collected by specific calorimeter subsystems, along with the systematic uncertainties due to imperfect knowledge of those subsystems.
+Given the custom nature of the detector and correction implementations, these corrections are implemented in custom `C++` libraries in the ATLAS software framework, Athena [@ATL-SOFT-PUB-2021-001;@Athena_zenodo].
+To expose these `C++` libraries to the Pythonic tooling layer, custom Python bindings were written using `nanobind` for high efficiency, as seen in @fig:access_layer_diagram.
+
+:::{figure} figures/access_layer_diagram.png
+:label: fig:access_layer_diagram
+
+The data access abstract interface from the high-level user-facing Python API to the ATLAS Event Data Model (EDM) access library that exposes the shared ATLAS combined performance (CP) tools for reconstruction, identification, and measurement of physics objects.
+The interface takes advantage of `nanobind`'s efficient bindings and zero-copy exchange protocols to achieve viable performance. [@Kourlitis:2890478]
+:::
+
+To contend with the extreme data volume, efficient distributed computing is an essential requirement.
+Given the success of Dask [@Dask] in the Scientific Python ecosystem, and its ability to be deployed across both traditional batch systems and cloud-based infrastructure with Kubernetes, the Scikit-HEP ecosystem has built extensions to Dask that allow for native Dask collections of Awkward arrays [@Dask-awkward] and computing multidimensional `boost-histogram` objects [@Boost-histogram_zenodo] with Dask collections [@Dask-histogram].
+Using Dask and these extensions, the data selection and systematic correction workflow can be horizontally scaled out across ATLAS collaboration compute resources to provide the data throughput necessary to make analysis feasible.
+This is often achieved through the high-level `coffea` columnar analysis framework [@coffea_zenodo], which is designed to integrate with Dask and these HEP-specific Dask extensions, as sketched below.
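+
+The sketch below illustrates this lazy, partitioned programming model with a toy example (the values and partitioning are illustrative only, not analysis code):
+
+```python
+import awkward as ak
+import dask_awkward as dak
+
+# a toy jagged array split into two partitions, standing in for file chunks
+lep_pt = dak.from_awkward(ak.Array([[45.1, 31.7], [], [62.0, 24.3]]), npartitions=2)
+
+max_pt = dak.max(lep_pt, axis=1)  # builds a task graph; nothing is computed yet
+result = max_pt.compute()  # executes across the available Dask workers
+```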
+``` + +### Cleaning and selecting data + +Once the data is in Awkward arrays, additional selections need to be applied before it can be analyzed. +In this particular example, the simulation events need to be normalized to the amount of events in data. +Event selections that correspond to the physics quantities of interest also need to be enforced. +The final event topology of interest for the physics decay structure will have four charged leptons grouped in two opposite flavor lepton pairs (so that the total charge is zero, as the Higgs and the $Z$-bosons are electrically neutral). + +These selection can then be implemented in an analysis specific `coffea` processor, and then the processor can be executed used a Dask executor to horizontally scale out the analysis selection across the available compute. + +```{include} code/coffea.py +:lang: python +:caption: A `coffea` processor designed to make physics motivated event selections to create accumulators of the 4-lepton invariant mass. +``` + +The example shown uses a simple selection process, but the methods used can use complex feature engineering that involve machine learning methods to calculate optimal discriminants. + +### Measurement uncertainties + +One of the most expensive operations that happens during the event selections is the computation of systematic variations of the events to accommodate for imperfect knowledge of the detector systems. +This in practice requires applying complex, experiment specific corrections to each event, using algorithms implemented in `C++`. +Historically these tools were implemented for an event loop processing paradigm, but with recent tooling additions, as shown in @fig:access_layer_diagram, efficient on-the-fly systematic corrections can be computed for array programming paradigms. + +The following provides an example of high level Python APIs that provide handles to these tools to use in the workflows described so far. +These tools are efficient enough to be able to apply multiple systematic variations in analysis workflows, as seen in @fig:Zee_mc_systematics. + +```{include} code/corrections.py +:lang: python +:caption: Simplified example of what the Python API for a systematic correction tool with a columnar implementation looks like. +``` + +:::{figure} figures/Zee_mc_systematics.png +:label: fig:Zee_mc_systematics + +Example of the reconstructed dilepton invariant mass distribution in simulation with the electron reconstruction and identification efficiency scale factor (SF) and corrections to the energy resolution (res) energy scale (scale) computed on-the-fly using the `nanobind` Python bindings to the ATLAS `C++` correction tools. +The total variation in the systematic corrections is plotted as a hashed band. [@Kourlitis:2890478] +::: + +### The "discovery" plot + +After running the `coffea` processors, the resulting data from the selections is accumulated into `boost-histogram` objects, as seen visualized in @fig:prefit_plot. + + +```{include} code/prefit_plot.py +:lang: python +:caption: Using `mplhep`, `hist`, and `matplotlib` the post-processed histograms of the simulation and the data are visualized in advance of any statistical inference of best-fit model parameters. +``` + +:::{figure} figures/prefit_plot.png +:label: fig:prefit_plot + +Using `mplhep`, `hist`, and `matplotlib` the post-processed histograms of the simulation and the data are visualized in advance of any statistical inference of best-fit model parameters. 
+::: + +These histograms are then serialized into files with `uproot` and used by the statistical modeling and inference libraries `pyhf` [@pyhf_zenodo;@pyhf_joss] and `cabinetry` [@cabinetry_zenodo] to build binned statistical models and efficiently fit the models to the observed data using vectorized computations and the optimization library `iminuit` [@iminuit_zenodo] for full uncertainties on all model parameters. +The resulting best-fit model parameters — such as the scale factor on the signal component of the model corresponding to the normalization on the Higgs contributions — are visualized in @fig:postfit_plot, where good agreement between the model predictions and the data is observed. +The signal component, clearly visible above the additional "background" components of the model, are Higgs boson events, with an observed count in agreement with theoretical expectations. +_TODO: Quantify results._ + +```{include} code/postfit_plot.py +:lang: python +:caption: Using `cabinetry`, `pyhf`, and `matplotlib` the data and the post-fit model prediction are visualized. +``` + +:::{figure} figures/postfit_plot.png +:label: fig:postfit_plot + +Using `cabinetry`, `pyhf`, and `matplotlib` the data and the post-fit model prediction are visualized. +:::