analysis2_manuscript.tex

\documentclass[hidelinks]{article}
\usepackage[letterpaper,margin=1.0in]{geometry}
\usepackage[utf8]{inputenc}
\pagenumbering{arabic}
\usepackage{authblk}
\usepackage{graphicx}
\usepackage[singlelinecheck=false]{caption} % singlelinecheck makes single line caption left aligned instead of centered
\usepackage{subcaption}
\usepackage{amsmath}
\usepackage{fancyhdr}
\usepackage{longtable}
\usepackage{booktabs}
% hyperlinks
\usepackage{hyperref}
\usepackage{wrapfig}
\usepackage{xspace}
\usepackage{mathrsfs}
\usepackage{graphicx}
\usepackage{lipsum}
\usepackage{makecell}


\pagestyle{fancy}
\fancyhead[R]{\textbf{\stdpopsim with selection}}

% for highlighting text
\usepackage{xcolor}
\usepackage{soul}

% bibliography
\usepackage[
    backend=biber,
    natbib=true,
    style=authoryear,
    ]{biblatex}
\addbibresource{references.bib}

%\usepackage[round]{natbib}   % omit 'round' option if you prefer square brackets
%\bibliographystyle{plainnat}


\newcommand{\Stdpopsim}{\texttt{Stdpopsim}\xspace}
\newcommand{\stdpopsim}{\texttt{stdpopsim}\xspace}

%commands to format figure and table references in the supplement
\newcommand{\beginsupplement}{%
        \fancyhead[L]{Supplemental Material}
        \setcounter{table}{0}
        \renewcommand{\thetable}{S\arabic{table}}%
        \setcounter{figure}{0}
        \renewcommand{\thefigure}{S\arabic{figure}}%
     }
\newcommand{\stopsupplement}{%
        \setcounter{table}{0}
        \renewcommand{\thetable}{\arabic{table}}%
        \setcounter{figure}{0}
        \renewcommand{\thefigure}{\arabic{figure}}%
     }

\makeatletter
\newcommand{\labelname}[1]{\def\@currentlabelname{#1}}
\makeatother

\newcommand{\tskit}{\texttt{tskit}\xspace}

% add commands for msmc2, stairway plot, gone, and smc++
\newcommand{\msmc}{\texttt{msmc2}\xspace}
\newcommand{\stairway}{\texttt{stairwayplot}\xspace}
\newcommand{\gone}{\texttt{GONE}\xspace}
\newcommand{\smcpp}{\texttt{SMC++}\xspace}

% add commands for the DFE inference methods
\newcommand{\dfe}{\texttt{dfe-alpha}\xspace}
\newcommand{\polydfe}{\texttt{polyDFE}\xspace}
\newcommand{\dadicli}{\texttt{dadi-cli}\xspace}
\newcommand{\grapes}{\texttt{GRAPES}\xspace}

% add commands for the sweep detection methods
\newcommand{\sweepfinder}{\texttt{sweepfinder2}\xspace}
\newcommand{\diploshic}{\texttt{diploshic}\xspace}

% Avoid pandoc bug when there are lists in the body.
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}

\newcommand{\rngcomment}[1]{\textcolor{red}{RNG: #1}}

\newcommand{\kelcomment}[1]{\textcolor{blue}{KEL: #1}}

\title{Accesible, realistic genome simulation with selection using \stdpopsim}

% \author[1,+]{M. Elise Lauterbur}
% \author[3,*]{Ariella L. Gladstein}
\author[4,*]{Graham Gower}
\author[5*]{Nathaniel S. Pope}
\author[5*]{Murillo F. Rodrigues}
\author[5*]{Silas Tittes}
\author[34*]{Linh N. Tran}
\author[6]{Georgia Tsambos}

\author[2]{Maria Izabel A. Cavassim}
% \author[5,7]{Jeff Adrion}
% \author[5]{Saurabh Belsare}
% \author[8]{Arjun Biddanda}
% \author[5]{Victoria Caudill}
% \author[9]{Jean Cury}
% \author[10]{Ignacio Echevarria}
% \author[11]{Benjamin C. Haller}
% \author[12,13]{Ahmed R. Hasan}
\author[14,15]{Xin Huang}
% \author[16]{Leonardo Nicola Martin Iasi}
% \author[17]{Ekaterina Noskova}
% \author[18]{Jana Obšteter}
% \author[19]{Vitor Antonio Corrêa Pavinato}
% \author[20,21]{Alice Pearson}
% \author[22,23]{David Peede}
% \author[24]{Manolo F. Perez}
\author[5]{Chris C. R. Smith}
% \author[25]{Jeffrey P. Spence}
% \author[5]{Anastasia Teterina}

\author[5]{Scott T. Small}
\author[5]{Clara T. Rehmann}
% \author[26]{Per Unneberg}
% \author[27]{Juan Manuel Vazquez}
% \author[28]{Ryan K. Waples}
% \author[29]{Anthony Wilder Wohns}
% \author[30]{Yan Wong}
% \author[31]{Franz Baumdicker}
% \author[32]{Reed A. Cartwright}
% \author[33]{Gregor Gorjanc}
\author[4]{Kirk E. Lohmueller}
\author[34]{Ryan N. Gutenkunst}
\author[30]{Jerome Kelleher}
\author[35]{Aaron P. Ragsdale}

\author[37]{Daniel R. Schrider}
\author[38]{Ilan Gronau}
\author[5,36]{Peter L. Ralph}
\author[5]{Andrew D. Kern}


 \affil[*]{\small{These authors contributed equally to the paper.}}
% \affil[+]{\small{Corresponding authors: lauterbur@gmail.com ; ilan.gronau@runi.ac.il.}}
% \affil[1]{\small{Department of Ecology and Evolutionary Biology, University of Arizona, Tucson AZ 85719, USA}}
% \affil[2]{\small{Department of Ecology and Evolutionary Biology, University of California, Los Angeles, Los Angeles CA, USA}}
% \affil[3]{\small{Embark Veterinary, Inc., Boston MA 02111, USA}}
 \affil[4]{\small{Section for Molecular Ecology and Evolution, Globe Institute, University of Copenhagen, Denmark}}
 \affil[5]{\small{Institute of Ecology and Evolution, University of Oregon, Eugene OR 97402, USA}}
% \affil[6]{\small{School of Mathematics and Statistics, University of Melbourne, Australia}}
% \affil[7]{\small{AncestryDNA, San Francisco CA 94107, USA}}
% \affil[8]{\small{54Gene, Inc., Washington DC 20005, USA}}
% \affil[9]{\small{Université Paris-Saclay, CNRS, INRIA, Laboratoire Interdisciplinaire des Sciences du Numérique, UMR 9015 Orsay, France}}
% \affil[10]{\small{School of Life Sciences, University of Glasgow, Glasgow, UK}}
% \affil[11]{\small{Department of Computational Biology, Cornell University, Ithaca NY, USA}}
% \affil[12]{\small{Department of Cell and Systems Biology, University of Toronto, Toronto ON, Canada}}
% \affil[13]{\small{Department of Biology, University of Toronto Mississauga, Mississauga ON, Canada}}
\affil[14]{\small{Department of Evolutionary Anthropology, University of Vienna, Vienna, Austria}}
\affil[15]{\small{Human Evolution and Archaeological Sciences (HEAS), University of Vienna, Vienna, Austria}}
% \affil[16]{\small{Department of Evolutionary Genetics, Max Planck Institute for Evolutionary Anthropology, Leipzig, Germany}}
% \affil[17]{\small{Computer Technologies Laboratory, ITMO University, St Petersburg, Russia}}
% \affil[18]{\small{Agricultural Institute of Slovenia, Department of Animal Science, Ljubljana, Slovenia}}
% \affil[19]{\small{Entomology Department, The Ohio State University, Wooster OH, USA}}
% \affil[20]{\small{Department of Genetics, University of Cambridge, Cambridge, UK}}
% \affil[21]{\small{Department of Zoology, University of Cambridge, Cambridge, UK}}
% \affil[22]{\small{Department of Ecology, Evolution, and Organismal Biology, Brown University, Providence RI, USA}}
% \affil[23]{\small{Center for Computational Molecular Biology, Brown University, Providence RI, USA}}
% \affil[24]{\small{Department of Genetics and Evolution, Federal University of Sao Carlos, Sao Carlos 13565905, Brazil}}
% \affil[25]{\small{Department of Genetics, Stanford University School of Medicine, Stanford CA 94305, USA}}
% \affil[26]{\small{Department of Cell and Molecular Biology, National Bioinformatics Infrastructure Sweden, Science for Life Laboratory, Uppsala University, Husargatan 3, SE-752 37 Uppsala, Sweden}}
% \affil[27]{\small{Department of Integrative Biology, University of California, Berkeley, Berkeley CA, USA}}
% \affil[28]{\small{Department of Biostatistics, University of Washington, Seattle WA, USA}}
% \affil[29]{\small{Broad Institute of MIT and Harvard, Cambridge MA 02142, USA}}
\affil[30]{\small{Big Data Institute, Li Ka Shing Centre for Health Information and Discovery, University of Oxford, Oxford OX3 7LF, UK}}
% \affil[31]{\small{Cluster of Excellence - Controlling Microbes to Fight Infections, Eberhard Karls Universität Tübingen, Tübingen, Baden-Württemberg, Germany}}
% \affil[32]{\small{School of Life Sciences and The Biodesign Institute, Arizona State University, Tempe AZ, USA}}
% \affil[33]{\small{The Roslin Institute and Royal (Dick) School of Veterinary Studies, University of Edinburgh, Edinburgh EH25 9RG, UK}}
\affil[34]{\small{Department of Molecular and Cellular Biology, University of Arizona, Tucson AZ 85721, USA}}
\affil[35]{\small{Department of Integrative Biology, University of Wisconsin-Madison, Madison WI, USA}}
\affil[36]{\small{Department of Mathematics, University of Oregon, Eugene OR 97402, USA}}
\affil[37]{\small{Department of Genetics, University of North Carolina at Chapel Hill, Chapel Hill NC 27599, USA}}
\affil[38]{\small{Efi Arazi School of Computer Science, Reichman University, Herzliya, Israel}}

\date{\small{\today{}}}

\begin{document}

\maketitle


\section*{Abstract}
    \label{abstract}
    Natural selection is a fundamental evolutionary force that shapes patterns of genetic variation across species. 
    However, simulating realistic models that incorporate both selection and complex demographic histories
    is challenging, limiting our ability to benchmark statistical methods aimed at detecting selection and explore theoretical predictions.
    \stdpopsim is a tool for facilitating the simulation of a contiually expanding catalog of population genetic models.
    Here we present a major extension to the \stdpopsim{} framework that enables simulation of various modes
    of natural selection, including background selection, selective sweeps, and arbitrary distributions of fitness effects (DFE).
    This extension maintains \stdpopsim's core principles of reproducibility and standardization while adding support
    for species-specific genomic annotations and published DFE estimates. 
    We demonstrate the utility of this framework by benchmarking methods for demographic inference,
    DFE estimation, and selective sweep detection across different species and scenarios. 
    Our results highlight how selection can bias demographic inference, 
    reveal the sensitivity of DFE inference methods to model assumptions, 
    and show how genomic features like recombination rate and the density of functional sequence influence power to detect selective sweeps.
    This extension to \stdpopsim{} provides a powerful new resource for the population genetics community
    to explore the interplay between selection and other evolutionary forces in a reproducible framework.

\section*{Introduction}
    \label{introduction}
    % natural selection
    Natural selection is a fundamental force in evolution, shaping the
    genetic diversity of populations and driving the adaptation of
    species to their environments. The effects of natural selection
    on genetic variation are complex, and can be difficult to disentangle
    from other evolutionary processes such as mutation, recombination,
    and genetic drift \citep[e.g.,][]{gillespie1991causes}.
    For instance, changes in population size can lead to fluctuations
    in genetic diversity across a recombining chromosome 
    that can mimic the effects of selection \citep{simonsen1995properties},
    and lead to spurious inferences about the strength and targets of genetic adaptation
    \citep{simonsen1995properties,akey2004population,nielsen2005genomic}.
    In turn, selection can confound our ability to infer demographic 
    history from allele frequencies \citep{ewing2016consequences,schrider2016effects} and
    estimates of inverse coalescent rate \citep{schrider2016effects, johri2021impact, cousins2024accurate}.
    Thus, it is imperative to jointly account for the effects of selection
    and demography when inferring evolutionary history from genetic data \citep{sheehan2016deep,johri2020toward}.
    However, this is a challenging task from a modeling perspective \citep{johri2022prospect}. % MFR this is a bit unclear

    % simulation
    To meet the growing need for the interpretation,
    analysis, and exploration of realistic and complex evolutionary models
    the field of population genetics has increasingly turned to simulation.
    Simulation in population genetics has a long history 
    including both backward-in-time coalescent simulations
    \citep{kingman1982genealogy,hudson1983testing, hudson1990gene}
    and forward-in-time simulations of complex demography and selection
    \citep[e.g.,][]{gillespie1984molecular,thornton2014c++, haller2019slim}.  \kelcomment{maybe also cite https://doi.org/10.1093/bioinformatics/btn522}
    The development of simulation tools has been driven by the need to
    understand the effects of complex evolutionary processes on genetic
    variation \citep[e.g.,][]{galloway2020few},\kelcomment{also cite: PMID: 32487519 } to provide a null model for hypothesis testing
    \citep[e.g.,][]{hudson1992statistical,hudson1994evidence,sabeti2002detecting},
    to explore the power and limitations of statistical methods \citep[e.g.,][]{przeworski2002signature},
    and increasingly to provide a basis for machine learning and other
    simulation-based inference methods \citep[e.g.,][]{beaumont2002approximate,pavlidis2010searching,lin2011distinguishing,kern2018diplos,mughal2019localizing,sanchez2021deep,wang2021automatic}. \kelcomment{also cite: PMID: 36617238 }
    While this is so, joint simulation of complex demography and selection
    is challenging,  requires a deep understanding of the underlying
    evolutionary processes, and necessitates a number of parameter choices including
    the strength of selection and the
    recombination rate, as well as a parameterized demographic model.
    This is a daunting task for many researchers, and can be a barrier to
    the adoption of simulation-based methods in population genetics.
    Furthermore, different simulation and inference tools may use different structures
    and parameter scalings for their models.

    % stdpopsim
    In light of the complexities associated with estimating and simulating
    population genetic models, it is no surprise that
    a lingering challenge with simulation in population genetics has been
    reproducibility and the ability to share and compare results among 
    researchers \citep[e.g.,][]{ragsdale2020lessons}.
    This challenge has been addressed in part by the development
    of community resources for sharing and distributing simulation software
    via the \stdpopsim project \citep{adrion2020community}. \stdpopsim
    provides a standardized interface for accessing a wide range of
    population genetic models, and has begun to be be widely adopted by the community. %maybe add citations here?
    While that is so, the original version of \stdpopsim did not include
    models of selection, which is a major limitation for more empirical
    applications of population genetic simulation. In particular, modeling selection
    through simulation is critical for understanding processes such
    as adaptation, the effects of selective sweeps \citep[e.g.][]{braverman1995hitchhiking,fay2000hitchhiking,przeworski2002signature,przeworski2005signature,schrider2015soft}, and the impact of
    background selection on genetic diversity
    \citep[e.g.][]{charlesworth1993effect,charlesworth1995pattern,williamson2002genealogy,ewing2016consequences,torres2020temporal}.
    Ideally, one would like
    to have a single, unified framework for simulating both neutral and
    non-neutral evolutionary processes, and to be able to compare the
    results of these simulations to empirical data in a manner that is
    both accessible to a wide range of researchers and highly reproducible. 
    Further the framework should include the complex realities of 
    genomes, including heterogenous recombination rates, 
    variation in the size and density of functional elements, and
    non-equilibrium demographic histories. Ideally, one should be able
    to model all of these features using estimates/annotations obtained
    from a species of interest \citep[e.g.][]{schrider2020background,rodrigues2024shared}.

    % goals
    In this manuscript, we provide an overview of a major new addition
    to \stdpopsim---the inclusion of models of selection.
    We begin by describing the models of selection that we have implemented,
    as well as the parameter choices that are available to the user.
    This includes models of background selection, selective sweeps, and
    models of the distribution of fitness effects (DFE).
    Further we describe how these models can be combined with genomic
    annotations that are specific to a given species available
    through the \stdpopsim API, and combined with a parameterized model of
    demography and recombination to provide a realistic simulation of
    genetic variation in a population.
    We then provide a series of examples of how these models can be used
    to benchmark and compare the performance of different methods for
    inferring demographic history, the targets of recent positive selection,
    and the distribution of fitness effects from genetic data. 

\section*{Implementing Selection in \stdpopsim}
    \label{selection}

    % implementing selection
    \stdpopsim previously provided a wide range of models of neutral
    demographic history from roughly two dozen species \citep{lauterbur2023expanding}.
    This was accomplished through the use of a standardized interface
    and data structure that allows for curated addition of new
    species and models, and can use either
    msprime \citep{Baumdicker2022} or SLiM \citep{haller2019slim}
    as the backend engine for efficient population genetic simulation.
    To extend \stdpopsim's functionality to include models of selection, 
    we introduce an important new class representing a specfic distribution of fitness effects
    to the \stdpopsim API: the \texttt{DFE} class. 
    This class provides an interface for specifying
    a distribution of the fitness effects of new mutations, and
    is added to a \texttt{Contig} object to provide a description
    of the fitness effects of mutations along a chromosome.
    A \texttt{DFE} object can apply to specified intervals within a chromosome,
    and can be combined with other \texttt{DFE} objects to provide 
    a rich model of how selection may vary along a chromosome. 
    Once specified, a model of selection can then be run using \stdpopsim's standard interfaces
    but currently is limited to using SLiM simulation engine, which
    is perhaps the most flexible simulation engine for modeling selection currently available.
    

    \begin{wrapfigure}{l}{0.5\textwidth}
        \vspace{-0.0cm}
        \includegraphics[width=\linewidth]{figures/schematics/catalog.pdf}
        \caption{\label{fig:schematic}
        \textbf{(A)} A schematic of the \stdpopsim catalog and the different components that
        can be used in simulations. 
        \textbf{(B)} The python API and \textbf{(C)} the command line interface.
        The user specifies a species, a portion of
        the genome to simulate, and optionally a genetic map, a model of
        demography, and a model of selection that itself is composed of a
        distribution of fitness effects (DFE) and a set of functional
        annotations. \rngcomment{I find the images too small to read the text comfortably.}}
    \end{wrapfigure}
    \hfill
    
    As with other \stdpopsim models, when incorporating selection we aim to hew as closely as possible
    to realistic genomes, and provide public annotations of specices' functional genomic
    elements that can be used along with a \texttt{DFE} object to
    readily provide realistic model of selection for a specific genome and set of
    populations. A schematic of the \stdpopsim catalog is shown in Figure \ref{fig:schematic}.
    From a user's perspective, one can specify a species, a portion of the genome to simulate,
    a genetic map if available, a model of demography, and a model of selection that is
    composed of a set of annotations and a \texttt{DFE} object, all in an automated and user-friendly fashion. 


    The vast majority of published DFE estimates are based on a limited number of species,
    so \stdpopsim currently has implemented published DFEs from
    four species: \textit{Arabidopsis thaleana} \citep{huber2018gene}, \textit{Drosophila melanogaster} \citep{ragsdale2016triallelic,huber2017determining},
    humans \citep{huber2017determining,kim2017inference}, and the vaquita porpoise \textit{Phocoena sinus} \citep{robinson2022critically}.
    While the DFEs are from a limited number of sepecies, these DFEs can be applied to other species
    in the catalog. For instance, one could simulate a butterfly species with a gamma-distributed DFE originally
    estimated from humans, or a human population with a DFE estimated from \textit{Drosophila}.
    Furthermore, the user can specify a custom DFE and provide their own annotations
    of functional elements to simulate selection in a species for which we do not yet have 
    a published DFE included in the catalog. This flexibility allows for a wide range of
    models of selection to be simulated. %, and for the results of these simulations to be
    %compared to empirical data or used as a null model for hypothesis testing.
    % DRS: commenting that stuff out because there are obviously other use cases that could be
    % listed here (e.g. ML/ABC, and if we don't know enough about a species to do that then
    % we probably have no business doing hypothesis testing either, which could be just as
    % misleading or maybe even more so), so the simplest solution is to cut this as it is
    % not needed here.
   
    % sweep interface
    In addition to the \texttt{DFE} class, we have also implemented a class (\texttt{stdpopsim.ext.selective\_sweep})
    that enables selective sweeps to be simulated in \stdpopsim.
    This class augments a demographic model with an ``extended event''
    which conditions on the introduction of a selected mutation at a given time and position
    and with a given selection coefficient. Further the user can specify the minimum frequency
    of the selected allele at the time of sampling. As these extended events are implemented
    on top of the existing \stdpopsim API, they can be combined with other models of selection
    and demography to provide a rich model of the combined effects of multiple disparate evolutionary processes
    on genetic variation in a population.
    

    % current numbers of DFEs / species / etc
    % DRS: are we giving this a version number?
    At this release the \stdpopsim catalog comprises 24 species (i.e. genome representations)
    for which we have implemented 28 demographic models, have 37 genetic maps, and 7 DFEs. %someone should double check the numbers
    Additions to the catalog are ongoing---we point the reader to our previously published 
    report detailing this effort \citep{lauterbur2023expanding}. We welcome contributions from the
    community and encourage those interested in contributing to visit https://popsim-consortium.github.io/stdpopsim-docs
    or contact one of the PopSim Consortium members for guidance.


\section*{Example Applications with Selection}
    \label{applications}
    In this section, we present a series of examples of how the new models of selection
    in \stdpopsim can be used to benchmark different
    methods for population genetic inference.
    We focus on three main areas: demographic inference in the 
    single and multi-population settings, inference of the DFE,
    and the detection of selective sweeps, particularly in the context of realistic recombination maps and background selection.

    \section*{Inference of $N_e(t)$ in the context of selection}
    One of the most common applications of population genetic inference is to estimate
    the effective population size over time, $N_e(t)$, from genetic data. This can be done
    using a variety of methods, including the pairwise sequentially Markovian coalescent
    (PSMC) \citep{li2011inference}, \kelcomment{mabye also cite Gutenkunst 2009 Dadi paper here} through use of the site frequency spectrum (SFS) \citep{liu2020stairway},
    as well as through identity by descent information \citep{santiago2020recent}. 
    While these methods perform well under neutral evolution, because of its effects on
    patterns of genetic variation, natural selection on linked sites can bias estimates of $N_e(t)$
    away from true census population sizes simply because selection increases the 
    rate of coalescence in the genome \citep[e.g.][]{schrider2016effects}. 

    Using \stdpopsim, we can easily simulate genetic data under a model of selection and then 
    examine the performance of different methods for inferring $N_e(t)$ from these simulated 
    data. In Figure \ref{fig:1pop-human-demography}, we show the results of such a benchmark
    where we have simulated genomes under a model of human out of Africa (OOA) demography
    with and without purifying selection acting on exons.
    For this simulation we have used
    a genetic map from the HapMap Project \citep{international2007second} along with a
    DFE for deleterious alleles (estimated by \cite{huber2017determining})
    that acts on exonic \kelcomment{do you mean exonic, or nonsynonymous?} positions defined from the HAVANA group release 104 for the human genome.
    % DRS: Maybe we should say something about how easy it was to do these simulations, and maybe even show
    % some example code somewhere to prove our point? Could help emphasize the utility of this resource.
    These simulations were then used to estimate $N_e(t)$ using four methods: 1) \msmc \citep{Schiffels2020}, 
    2) \stairway \citep{liu2020stairway}, 3) \gone \citep{santiago2020recent}, and 4) \smcpp \citep{terhorst2017robust}.
    In this and what follows we performed 3 replicate whole genome simulations, so uncertainty in estimates might
    be roughly assessed.
    However, as a thorough benchmarking is not the main purpose of this manuscript, we saved computational resources by not conducting more simulation replicates.
    
    In Figure \ref{fig:1pop-human-demography}, we show six panels, each of which shows the estimated $N_e(t)$
    from data simulated under the five-population model of human out-of-Africa demography
    from Ragsdale \textit{et al.} (2019)
    with samples taken from three modern geographic groups (CEU, CHB, and YRI from top to bottom).
    The left column shows the results of $N_e(t)$ estimates made from simulations without selection, 
    while the right column shows the results
    from simulations with deleterious mutations occurring in exons \rngcomment{(with inference based on simulated synonymous data from exons? or from whole genome?)}.
    In each panel we show the true $N_e(t)$ in black, and the estimated $N_e(t)$
    in various colors for the four methods. Dotted lines on the top row are to visually reinforce that inference is done
    from a model without selection. We see that these methods generally perform reasonably well in the absence of selection, 
    although GONE seems to overestimate $N_e(t)$ for the CHB sample and to a lesser extent in the CEU sample.
    Presumably this could be due to patterns of linkage disequilibrium that result from the combination
    of bottleneck and growth in these populations.
    In the presence of selection, different methods behave differently. 
    For instance stairwayplot, which uses the SFS to infer $N_e(t)$, 
    is reasonably close to the true $N_e(t)$ both with and without selection,
    although it seems to infer significant population size changes that are not part of the true history.
    Likewise \smcpp seems to be robust to the presence of selection,
    but the timing of population size changes is slightly inaccurate. 
    \msmc seems to be the most affected by the presence of selection, 
    estimates of $N_e(t)$ varying quite a bit from the true $N_e(t)$ particularly in the very recent past,
    although estimates from \msmc have quite a bit of added noise even in the absence of selection.

    % TODO: legend here needs to be changed. The dotted line indicates no selection, the solid selection


    \begin{figure}[t]
        \centering
        \includegraphics[width=\textwidth]{figures/HomSap/OOA/estimated_Ne_t_final}
        \caption{
        \label{fig:1pop-human-demography}
        Performance of methods to infer $N_e(t)$ from a human out-of-Africa model \citep{ragsdale2019models}
        with and without purifying selection on exons. The left column shows estimates of $N_e(t)$ from simulations
        without selection, while the right column shows estimates of $N_e(t)$ from simulations with a gamma-distributed   
        DFE acting on exons. In each panel we show the true $N_e(t)$ in black, and the estimated $N_e(t)$ from four methods:    
        \msmc \citep{Schiffels2020}, \stairway \citep{liu2020stairway}, \gone \citep{santiago2020recent}, and \smcpp \citep{terhorst2017robust}.  
        }
    \end{figure}
    %DRS-fig: reminder to update the legend to just say ``exons'' instead of ensembl_havana_whatever...

    To highlight the ease of comparisons between different species and models enabled by \stdpopsim,
    we also performed a similar benchmark using the vaquita porpoise \textit{Phocoena sinus}.
    In Figure \ref{fig:1pop-vaquita-demography}, we show the results of $N_e(t)$ inference
    where we simulated genomes under a model of vaquita demography from \textcite{robinson2022critically}.
    For these simulations, we used the genome structure and exon annotations from the vaquita genome assembly
    and a DFE from \textcite{robinson2022critically} that acts on exons, however no genetic map 
    is currently available, so recombination rates are assumed to be constant across the genome.
    The demographic model is a simple two-epoch model where population size has declined in the recent past. 
    In this case, we see that all methods perform well both in the absence and presence of selection,
    with a close match between the true $N_e(t)$ and the estimated $N_e(t)$ in all cases.
    % MFR: Should we say somewhere that people use to expect from BGS? That is, that it can be seen as a reduction in local rates of coalescences and thus that we would expect these methods to be somewhat downwardly biased in the presence of BGS?


\begin{figure}[t]
    \centering
    \includegraphics[width=\textwidth]{figures/PhoSin/Vaquita2Epoch_1R22/estimated_Ne_t_final}
    \caption{
    \label{fig:1pop-vaquita-demography}
    Performance of methods to infer $N_e(t)$ from simulations of the vaquita porpoise genome under a single population
    model of declining population size \citep{robinson2022critically} with and without background selection on exons. 
    The left panel shows estimates of $N_e(t)$ from simulations
    without selection, while the right panel shows estimates of $N_e(t)$ from simulations with a gamma-distributed   
    DFE acting on exons. In each panel we show the true $N_e(t)$ in black, and the estimated $N_e(t)$ from four methods:    
    \msmc \citep{Schiffels2020}, \stairway \citep{liu2020stairway}, \gone \citep{santiago2020recent}, and \smcpp \citep{terhorst2017robust}.  
    }
\end{figure}

\section*{Estimation of the Distribution of Fitness effects}
    \label{dfe}
    Another common application of population genetic inference is to estimate the distribution of fitness effects (DFE) of new mutations
    from genetic data. Here, we show how the new models of selection in \stdpopsim can be used to benchmark and compare
    the performance of different methods for inferring the DFE. We performed simulations using the 
    human and vaquita porpoise models described above, with complex demographic histories and models of selection acting on exons.
    
    A challenge when comparing quantitative DFE estimates among tools and with simulations is differing conventions of factors of two in terms of how the selection coefficient ($s$) is defined.
    The simulator SLiM and the inference tool \grapes define the selection coefficient $s$ such that a homozygote has fitness $1+s$.
    On the other hand, \dadicli and \polydfe define the selection coefficient such that a homozygote has fitness $1+ 2s$. %https://academic.oup.com/genetics/article/207/3/1103/5930676
    Moreover, all three inference tools directly infer the distribution of the selection coefficient scaled by an inferred ancestral effective population size $N_e$, but they assume different scaling.
    \dadicli infers the distribution of $2 N_e s$ while \polydfe and \grapes infer the distribution of $4 N_e s$. %https://academic.oup.com/genetics/article/207/3/1103/5930676, https://academic.oup.com/genetics/article/216/2/559/6066183
    For comparison, here we have normalized all inference results to the convention of SLiM.
           		
    % Add citation to fastDFE as the successor to polyDFE: https://academic.oup.com/mbe/article/41/5/msae070/7641109 ?


    \begin{figure}[htbp]
        \centering
        \includegraphics[width=\linewidth]{figures/HomSap/OOA/HomSap_OutOfAfricaArchaicAdmixture_5R19_Gamma_K17_ensembl_havana_104_exons_DFE_plot}
        \caption{A comparison of methods for inferring the distribution of fitness effects (DFE) from genetic data.
        Simulations were performed using a human out-of-Africa model \citep{ragsdale2019models} with a gamma-distributed DFE
        acting on exons parameterized by a mean selection coefficient and a shape parameter. Estimates of the 
        parameters of the DFE are shown in the left ($\lvert E(s) \rvert $) and right hand panels (shape) respectively.
        The human model has three extant populations (CEU, CHB, and YRI), and parameter estimates from each
        sample are shown in different colors.
        Estimates from three different methods are shown: 1) \grapes \cite{galtier2016adaptive}, 2) \polydfe \citep{tataru2020polydfe},
        and 3) \dadicli \citep{Huang2023}.
}
        \label{fig:homsap-dfe.ooa}
    \end{figure}

    In Figures \ref{fig:homsap-dfe.constant} and \ref{fig:homsap-dfe.ooa} we show a comparison of estimates
    from three software packages for inferring the DFE from genetic data:
    \dadicli \citep{Huang2023}, \polydfe \citep{tataru2020polydfe}, 
    and \grapes \citep{galtier2016adaptive}.
    In each case we note that the DFE is estimated from segregating sites only,
    without the use of substitution data, as a current limitation of \stdpopsim
    is that we are not able to simulate an outgroup sequence to estimate
    divergence. 
    Figure \ref{fig:homsap-dfe.constant} shows estimates from a single, constant size population of humans, 
    while Figure \ref{fig:homsap-dfe.ooa} shows estimates from a model of human out-of-Africa demography.
    In a constant size population, all methods perform reasonably well, although \dadicli seems to
    perform slightly worse than the other methods examined, at least for the small number of replicates we simulated here.
    Notably only \polydfe overlapped the true mean selection coefficient, whereas 
    the other methods do not. 
    For the shape parameter in constant size simulations \grapes and \polydfe are roughly
    equivalent in their performance whereas \dadicli seems to yield overestimates. 
    
    In the out-of-Africa model (Fig.~\ref{fig:homsap-dfe.ooa}) the performance of the methods is more mixed. 
    In particular, variation in accuracy of the parameters estimated in each
    sample population depends on the methods used, and each method seems to 
    have its own biases with respect to the way in which population history 
    affects parameter estimates. For instance, for \dadicli we see that estimates of 
    the shape parameter are most accurate in the YRI sample, while for \polydfe
    estimates are least accurate in the YRI sample. 
    With respect to the strength of selection, as in constant size populations,
    all methods are performing reasonably well, although \polydfe seems to
    lead in terms of accuracy.
\kelcomment{I'm a little surprised by the non-great performance of Dadi-cli in here....I know that folks have spent a long time working on this analysis and debugging it, so I don't want be negative or delay things. That said, I think it would be good to plot the empirical (in the case simulated) SFS along with the SFSs from the MLE models. I would also do this for the demogrpahic models as well as the DFE models}

    %add PhoSin DFE figure here
    \begin{figure}
        \centering
        \includegraphics[width=0.8\textwidth]{figures/PhoSin/Vaquita2Epoch_1R22/PhoSin_Vaquita2Epoch_1R22_Gamma_R22_Phocoena_sinus.mPhoSin1.pri.110_exons_DFE_plot.pdf}
        \caption{
        \label{fig:vaquita-dfe}
        A comparison of methods for inferring the distribution of fitness effects (DFE) from genetic data.
        Simulations were performed using a model of vaquita porpoise demography \citep{robinson2022critically} with a gamma-distributed DFE
        acting on exons parameterized by a mean selection coefficient, a shape parameter, and a relationship between selection and dominance. Estimates of the 
        parameters of the DFE are shown in the left ($\lvert E(s) \rvert $) and right hand panels (shape) respectively.
        This vaquita model has a single extant population, and parameter estimates from each
        of the three different methods are shown: 1) \grapes \cite{galtier2016adaptive}, 2) \polydfe \citep{tataru2020polydfe},
        and 3) \dadicli \citep{Huang2023,kim2017inference}.
        C) Shown is the simulated distribution of $2 h s$, where $h$ is the dominance coefficient, along with the simulated distribution of $s$ and the median inferred DFEs. The distribution of $2 h s$ is multimodal because the simulated relationship between $h$ and $s$ is not continuous.
        }
    \end{figure}
    
    In Figures \ref{fig:vaquita-dfe.constant} and \ref{fig:vaquita-dfe} we show a comparison of estimates
    from the same three DFE-inference packages on simulations of the
    vaquita porpoise genome. Importantly, these simulations use the vaquita demogrpahic model shown in Fig. \ref{fig:1pop-vaquita-demography} and a DFE that includes recessive deleterious mutations, as suggested in \cite{robinson2022critically}. All methods
    perform uniformly worse that in the human genome simulations, with consistent underestimation of the mean selection coefficient    
    and overestimation of the shape parameter. This is not surprising as a mis-specified model is being fit to the data. All DFE inference methods used here assume an addive model (no dominance, $h = 1/2$) while the simulated data from the vaquita  was generated from a DFE in which more deleterious mutations are more recessive ($h < 1/2$)
   
    Reassuringly, for all three software packages, the inferred mean and shape of the distribution of selection coefficients 
    is consistent with the mean and shape of the distribution of $2 h s$, the product of the dominance coefficient
    and the selection coefficient (Fig.~\ref{fig:vaquita-dfe}C).
    This is likely because deleterious alleles are typically at low frequency and thus heterozygous, 
    where their selective effect is $h s$, and all these tools assume additivity  ($h = 1/2$).
   \kelcomment{I think we should move this next sentence to the Discussion and expand on it a bit....update on 1/10--I moved this to the Discussion and fleshed it out} 

\section*{Detection of Selective Sweeps}
    \label{sweeps}
    We next highlight how the new models of selection in \stdpopsim can be used to benchmark and compare the performance of different methods for detecting selective sweeps from genetic data.
    Here, we performed simulations of human chromosome 1 under the three population out-of-Africa model from \cite{gutenkunst2009inferring} with purifying selection acting on exons under the gamma-distributed DFE described above, the HapMap II genetic map \citep{international2007second}, but also include sweeps.
    In particular, we introduced a beneficial mutation at a given location (that was varied) in the genome at a given time, and with a moderately strong selection coefficient ($s = 0.03$; $2Ns \sim 600$).
    For each sweep location, we performed 1000 replicate simulations, sampling from them only if the selected allele reached a frequency of 0.95 or greater.
    Rather than simulate the entire chromosome arm, replicate simulations were performed on 5cM regions of the chromosome at each of 100 different locations.
    As recombination rate varies across the genome, the size of the region in base pairs simulated varied, however the recombination distance, which is critical for the linked effects of a sweep, was held constant.
 

    %sweeep power figure
    \begin{figure}
        \centering
        \includegraphics[width=0.8 \textwidth]{figures/sweeps/chr1_power.pdf}
        \caption{
        Power to detect selective sweeps at 100 locations along human
        chromosome 1 under a three population out-of-Africa model
        \citep{gutenkunst2009inferring} when using null models with and without
        purifying selection acting on exons under the gamma-distributed DFE
        described above.  The HapMap II genetic map
        \citep{international2007second} was used in each simulation.  Single
        beneficial mutations were introduced at each location with a selection
        coefficient of $s = 0.03$, and only sampled if they reached a terminal
        frequency of 0.95 or greater.  Three methods for detecting sweeps were
        used: 1) \sweepfinder \citep{degiorgio2016sweepfinder2} (top row), 2)
        \diploshic \citep{kern2018diplos} (middle row), and 3) an empirical
        cutoff of $\pi$ to find regions of reduced diversity (bottom row).
        Power (true positive rate) is shown for these methods for the CEU and YRI
        samples (left and right respectively).  Under each method we show
        heatmaps of exon density and local recombination rate along the
        chromosome.  Power is shown as the proportion of replicates in which a
        sweep was detected at each location.  To define
        thresholds for calling a sweep, two different null models were used: a
        neutral model (blue lines), and a model with purifying selection acting
        on exons (red lines).
        }
        \label{fig:chr1_power}
    \end{figure}
    
    In Figure \ref{fig:chr1_power}, we show the power to detect selective sweeps at 100 locations along human chromosome 1 from
    the simulations described above. We tested three methods for detecting sweeps: 1) \sweepfinder \citep{degiorgio2016sweepfinder2},
    2) \diploshic \citep{kern2018diplos}, and 3) an empirical cutoff of $\pi$ to find regions of reduced diversity.
    A few things are immediately apparent from this figure. First, the power to detect sweeps is generally higher in the YRI sample
    than in the CEU sample, which is consistent with the fact that the YRI sample has a larger effective population size and thus both stronger selection and
    more variation to detect a sweep. Further, the population size of YRI has been relatively constant over the last 100,000 years,
    thus the out-of-Africa bottleneck has does not mute the signal of a sweep as much as in the CEU sample. This is consistent with
    what is known about the effects of demography on the power to detect sweeps \citep[e.g.,][]{simonsen1995properties}.
    Second, the power to detect sweeps is slightly lower when we use a null model with purifying selection acting on exons (consistent with \cite{schrider2020background}) but the effect is not
    as strong as one might expect. 
    \rngcomment{I'm confused by the terminology ``null model'' throughout this section. Are we always testing on simulated data that includes background selection, and ``null model'' refers to how the methods are tuned? Or are we comparing power on data simulated with and without background selection?} This is likely because the strength of selection acting on exons is relatively weak, and the
    sweep detection methods we have used are relatively robust to the effects of background selection. 
    Finally and most importantly, the power to detect sweeps varies considerably among regions of the genome. 
    Indeed variation in power along the chromosome is much greater than the variation in power between the CEU and YRI samples or between
    models with or without purifying selection. This is likely due to the effects of recombination rate heterogeniety as well
    as the density of functional elements along the chromosome.

    Using this set of simulations, we can also compare the power of different methods to detect sweeps.
    In general, the power of the methods varies with respect to the population history of the sample.
    For instance \sweepfinder has marginally higher power to detect sweeps in the CEU sample than \diploshic,
    likely as a results of the fact that we here trained \diploshic on simulations under a constant size population,
    while the population size history of CEU is characterized by a strong recent bottleneck.
    Conversely, \diploshic has higher power to detect sweeps in the YRI sample than \sweepfinder,
    likely because that population size history more closely matches the population size history used to train \diploshic.
    \sweepfinder power remains more constant along the chromosome, while the $\pi$ cutoff method varies wildly. 

    We further explored the performance of the methods by plotting their receiver operating characteristic (ROC) curves.
    In Figure \ref{fig:roc-curves}, we show the ROC curves for the three methods for detecting sweeps.
    False positive rates were established using neutral simulations that contained no sweeps
    but were conditioned on the same population history as the CEU and YRI samples. \rngcomment{Did these simulations contain background selection?}
    These suggest that the $\pi$ cutoff is a surprisingly effective method for detecting sweeps,
    in these genomic regions at least, but that it lags behind \diploshic and \sweepfinder in terms of accuracy in the YRI sample.
    %While this is not exhaustive benchmarking, we aim to give a sense to the reader
    %of how \stdpopsim can be used to benchmark the performance of different methods easily and efficiently.
    These ROC curves, however, hide the phenomenon of high variation in power (see Figure \ref{fig:chr1_power}) to detect sweeps along a realistic looking chromosome.
    That is, although in aggregate the $\pi$ cutoff may seem effective, its power is highly heterogeneous across the chromosome.


    \begin{figure}
        \centering
        \includegraphics[width=0.8 \textwidth]{figures/sweeps/relationship_power_cM.pdf}
        \caption{
        Power to detect selective sweeps at 100 locations along human
        chromosome 1 under a three population out-of-Africa model
        \citep{gutenkunst2009inferring}  when using null models with and
        without purifying selection as a function of local recombination rate.
        Simulations are as described in Figure \ref{fig:chr1_power}.  True
        positive rate is shown for these methods for samples from the CEU and
        YRI samples (left and right respectively).  To define thresholds for
        calling a sweep, two different null models were used: a neutral model
        (blue lines), and a model with purifying selection acting on exons (red
        lines).  Points represent the power at each location and trend lines
        along with 95\% confidence intervals are shown fit to those points.
        }
        \label{fig:power-recomb}
    \end{figure}

    To further dissect the reasons for high variance (along the chromosome) in power to detect sweeps,
    we show in Figure \ref{fig:power-recomb} the relationship between power and local recombination rate for our sweep simulations.
    We see that power to detect sweeps is
    a decreasing function of local recombination rate for \sweepfinder and $\pi$ outliers,
    which is consistent with sweeps having a larger genomic footprint in regions of low
    recombination.
    \diploshic on the other hand shows a slight increase in power with recombination rates,
    probably because it is designed to distinguish regions where a sweep happened from regions linked to a sweep.
    We also see that the presence of purifying selection acting on exons has a relatively small effect on the power to detect sweeps,    
    but one that is present across the entire range of recombinations rates observed in human chromosome 1. 
    Unsurprisingly, the empirical cutoff of $\pi$ to detect sweeps is the less robust to the effects of recombination rate than the other two methods,
    which are specifically designed to find selective sweeps. 

    In Figure \ref{fig:power-exon}, we show the relationship between power and local exon density for our sweep simulations.
    There is little relationship between power (across all methods) and exon density,
    which is somewhat surprising given that background selection should somewhat interfere with the sweep signature.
    It is unclear why that is so, but the effects of realistic background selection may not be as similar to sweeps as previously thought \citep{schrider2020background}.

\section*{Discussion}
    \label{Discussion}
    In this paper, we have presented an important new addition to the \stdpopsim{} library
    that allows for simulating genetic variation in the presence of selection.
    We have demonstrated the utility of this new API by showing how it can be used to benchmark
    the performance of different methods for demographic history inference, the inference of the distribution
    of fitness effects of new mutations, and the detection of selective sweeps.
    In general, this is an important step forward for the field of population genetics,
    as it allows for a wide range of models of selection to be easily simulated and compared to
    empirical data, with reproducibility, computational efficiency, and rigor.

    One important finding we made using the new stdpopsim selection API is that inference of the DFE is sensitive to the dominance of deleterious mutations. All three of the inference methods of the DFE applied here assume that deleterious mutations act in an additive fashion (h=0.5) by default. However, the simulated DFE for the vaquita genome included highly recessive mutations. Specifically, these simulations used a DFE where mutations with $s>0.01$ are quite recessive, with $h<=0.01$ and where mutations with $0.001<s<0.01$ have $h=0.1$. We found that the three DFE inference methods underestimated $E[s]$ and overestimated the shape parameter (Fig 4). This performance is not surprising as decreasing h for a given distribution of s will lead to a decrease in hs, or an apparent decrease in the amount of selection. Fitting a mis-specified DFE that assumes $h=0.5$ will then lead to underestimates of $E[s]$. Indeed, the distribution of $hs$ is much more accurately estimated (Fig 4b). Overall, these results are consistent with prior work showing that it is challenging to separately infer the DFE of $s$ along with $h$ (cite PMID: 24830675; PMID: 39302992; PMID: 34951958 ) and that a large proportion (>5%) of recessive deleterious mutations can confound inferences of $s$ (PMID: 37074880). 
Given the plethora of evidence for recessive deleterious mutations (cite PMID: 4630587, PMID: 39114967 , PMID: 30013096 , PMID: 21098719 ), these results suggest the need for care in DFE inference. Kyriazis & Lohmueller (PMID: 39302992 ) inferred the DFE of $s$ for nonsynonymous mutations in humans when considering different values of $h$. Intriguingly, additive models show a satisfactory fit to the SFS of nonsynonymous variants in humans (Kyriazis et al) and more complex models of dominance do not substantially improve the fit. Nevertheless, models including recessive deleterious mutations, like those simulated here for the vaquita genome, show a similar fit to the SFS and may be more biologically realistic than fully additive models, given the evidence for the presence of recessive deleterious mutations. More broadly, new computational approaches are needed to precisely quantify the DFE and the covariance of $h$ and $s$.


    Although we have shown how the new API can be used to benchmark the performance of different methods,
    the benchmarks shown here merely scratch the surface of what can be done with the new API.
    We envision that, moving forward, many researchers will use
    the new API to simulate data under a wide range of models of selection, and to develop
    new methods for the analysis of genetic data in the presence of selection. 
    The additions to \stdpopsim described here further expand this tool's utility for both 
    characterizing the patterns of diversity expected under a wide range of population genetic
    models, and for benchmarking methods for the analysis of genetic data
    on common ground
    %, using the same simulations and parameterizations of models of selection
    % DRS: this sentence got pretty bloated with my edits so I commented part of it out
    in a way that was not possible before.

\section*{Methods}
    \label{methods}

    \subsection*{Simulations}
    In this study we performed simulations of the human and vaquita porpoise genomes
    using the \stdpopsim{} framework. As stated above, simulations did or did not include a model 
    of (negative) selection acting on exonic regions, matched to the human and vaquita porpoise genomes.
    Three replicates of whole-genome simulations were performed for each species, with 
    a sample size of 100 diploid individuals. Demographic models are as stated above, each
    using the exon annotations from the human and vaquita porpoise genomes respectively.
    For simulations with selection, we sampled 10 diploid individuals from each of two populations (YRI and CEU). 
    To decrease runtimes, we used a SLiM scaling factor of 2 with a burn-in of 2N generations. \kelcomment{I'm confused about the 100 diploid individuals samples vs. the 10 diploid individuals sampled. Did some analyses use 100 and others use 10? That's fine, but maybe we can specify that more clearly...}

    \subsection*{Demographic inference}
    We assessed the accuracy of demographic inferences from
    \msmc \citep{Schiffels2020}, \stairway \citep{liu2020stairway}, \gone \citep{santiago2020recent}, and \smcpp \citep{terhorst2017robust}.
    Each of these methods was run using the same set of simulations in replicate.
    In each case, the non-recombining portion
    of the genome was masked to remove any bias due to recombination rate % DRS: might need more detail here
    variation. Comparisons between inferences made with and without selection were 
    of particular interest. For \msmc, we used a random sample of $N=6$ genomes with 20
    iterations of the EM algorithm. For \gone, we set max\_snps to 500000,
    the number of generations to 2000, and the number of bins to 400, all representing default settings.
    

    \subsection*{Distribution of fitness effects inference}
    We used three software packages to infer the distribution of fitness effects (DFE) from genetic data:
    \dadicli \citep{Huang2023,kim2017inference}, \polydfe \citep{tataru2020polydfe}, and \grapes \citep{galtier2016adaptive}.
    Each software package was run using the same set of simulations.
    For these analyses we used the exon annotations from the human and vaquita porpoise genomes respectively.
    A key distinction between our analyses and those done in practice
    is that we have complete knowledge of which sites are under selection in the simulation,
    thus our inferences is a best case scenario. Empirical inference of the DFE is a much
    harder problem as some mutations assumed to be free of selection
    may in reality be under selection. As above, all methods were run using default settings.
    \kelcomment{I think we need a lot more detail here...how big was the coding sequence length? How was demogrpahic inference done for the different models via the different programs? How was the unscaling of E[s] done?, etc}

    \subsection*{Selective sweeps detection}
    We ran simulations of human chromosome 1 using the three population out-of-Africa model
    (identified as OutOfAfrica\_3G09 in the stdpopsim library; \cite{gutenkunst2009inferring})
    and the genetic map estimated in \citet{international2007second} (HapMapII\_GRCh38).
    For computational efficiency, we simulated a 5Mb region flanking 100 evenly distributed points along the chromosome 1.
    We generated 1000 replicates of each of three classes of simulations:
    (i) neutral (with the specified demography and genetic map),
    (ii) background selection, in which we applied the DFE estimated in \citet{kim2017inference} (Gamma\_K17) to exon annotations (taken from Ensembl; ensembl\_havana\_104\_exons),
    and (iii) sweep with background selection, in which we simulated the hard sweep as described with the addition of constraint in exonic regions (same as for \emph{ii}).
    In the sweep simulations, we placed a single beneficial mutation in the center of the simulated region with a selection coefficient of $s = 0.03$, and only sampled if the beneficial mutation reached terminal frequency of 0.95 or greater.

    We then benchmarked the performance of three methods for detecting sweeps: 1) \sweepfinder \citep{degiorgio2016sweepfinder2},
    2) \diploshic \citep{kern2018diplos}, and 3) an empirical cutoff for nucleotide diversity ($\pi$) to find regions of reduced diversity.
    \diploshic was trained on data simulated using \texttt{discoal} \citep{kern2016discoal}.
    Nucleotide diversity was computed directly from the underlying tree sequences using \tskit \citep{ralph2020efficiently} over 10 equally sized and non-overlapping subwindows.
    The composite likelihood ratio (output by \sweepfinder) was computed at 21 uniformly distributed points along the simulated region.
    To train the \diploshic classifier, 
    For training simulations, we simulated 20 haploid genomic regions of 1.21Mbp
    with a constant population size of 10,000 individuals, a mutation rate of 2e-8, and a recombination rate of 2e-8.
    For \texttt{discoal} simulations that included selective sweeps,
    selection coefficients were drawn from a uniform distribution between 0.001 and 0.10.
    The time of the sweep was chosen from a uniform distribution between 0 and 0.01 Ne generations in the past.
    For soft sweeps, the frequency at which selection was introduced was drawn from a uniform distribution between 1e-4 and 0.1.
    All \texttt{discoal} training simulations were done under a model of equilibrium population size, 
    so we expected it to perform suboptimally given the variation in population size in the out-of-Africa model.

    To call sweeps, we empirically determined the distribution of each statistic (nucleotide diversity, \sweepfinder's CLR, and \diploshic's probability of sweep) under a null model and defined the top $5\%$ quantile as a cutoff to call a sweep (though regions with extremely low recombination rate were excluded from the computation).
    We considered two null models: neutral and background selection (as explained before).
    Power (or true positive rate) to detect a sweep is the proportion of simulations with a sweep that were correctly classified as a sweep for each method and at each of the 100 locations along chromosome 1.

\section*{Data availability}\label{data_availability}


\section*{Acknowledgments}\label{acknowledgements}

\section*{Funding}
    \label{funding}
    Multiple funding sources supported this work.
    MFR, ST, NP, and ADK were supported in part by NIH award R35GM148253.
    RNG and LNT were supported in part by NIH award R35GM149235.

    %other funding sources?
\printbibliography

%%%%%%%% supplementary material
\clearpage
\beginsupplement

\section*{Supplementary Material}

% constant size DFE figure
\begin{figure}[h]
    \centering
    \includegraphics[width=\textwidth]{figures/HomSap/Constant/HomSap_Constant_Gamma_K17_ensembl_havana_104_exons_DFE_plot}
    \caption{
    Performance of methods to infer the distribution of fitness effects (DFE) from genetic data.
    Simulations were performed using a human constant size model with a gamma-distributed DFE
    acting on exons parameterized by a mean selection coefficient and a shape parameter. On each row we show the true parameter 
    as a black line, and the estimates as boxplots from replicate simulations. The human model has three extant populations (CEU, CHB, and YRI).
    %DRS-fig: only one population is shown here, but 3 referenced in the legend. Wrong legend or wrong fig?
    %DRS-fig: If it is the right fig, get rid of the pop0 legend inset
    Each row of the figure shows estimates from a different method: 1) GRAPES \citep{galtier2016adaptive}, 2) polyDFE \citep{tataru2020polydfe}, and 3) \dadicli \citep{Huang2023}.
    }
    \label{fig:homsap-dfe.constant}    
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=\textwidth]{figures/PhoSin/Constant/PhoSin_Constant_Gamma_R22_Phocoena_sinus.mPhoSin1.pri.110_exons_DFE_plot.pdf}
    \caption{
    \label{fig:vaquita-dfe.constant}
    A comparison of methods for inferring the distribution of fitness effects (DFE) from genetic data.
    Simulations were performed using a model of vaquita porpoise genome under constant population size with a gamma-distributed DFE
    acting on exons parameterized by a mean selection coefficient and a shape parameter. Estimates of the 
    parameters of the DFE are shown in the left ($\lvert E(s) \rvert $) and right hand panels (shape) respectively.
    This vaquita model has a single extant population, and parameter estimates from each
    of the three different methods are shown: 1) GRAPES \citep{galtier2016adaptive}, 2) polyDFE \citep{tataru2020polydfe},
    and 3) \dadicli \citep{Huang2023}.}
    %DRS-fig: remove the pop 0 legend again
\end{figure}
% DRS-fig: ditch the unnecessary legends from panels A and B. Also, are we deciding which version of panel C to include here? I guess I prefer
% to bottom one but with the legend of the top one (``Original s DFE'' sounds a little clunky).

% constant all site size DFE figure
\begin{figure}[h]
    \centering
    \includegraphics[width=\textwidth]{figures/HomSap/Constant/HomSap_Constant_Gamma_K17_all_sites_DFE_plot}
    \caption{
    \label{fig:homsap-dfe.constant.all_sites}
    Performance of methods to infer the distribution of fitness effects (DFE) from genetic data
    under constant population size  where all sites along the simulated chromosome can experience selected and neutral mutation
    as opposed to exonic regions only.
    }
    %DRS-fig: human? vaquita? need more info in this figure legend, and also get rid of the pop 0 thing again
\end{figure}


\begin{table}[ht]
\centering
\small
\caption{\bf{Performance of DFE methods across the simulation scenarios}. 
Bold rows show the lowest mean absolute error (MAE) for the two DFE parameters
for each species.}
\begin{tabular}{lllrrrrrr}
\toprule
species ID & demography & annotation & \makecell{MAE \\ $E|s|$ \\ dadi-cli} & \makecell{MAE \\ $E|s|$ \\ grapes} & \makecell{MAE \\ $E|s|$ \\ polyDFE} & \makecell{MAE \\ shape \\ dadi-cli} & \makecell{MAE \\ shape \\ grapes} & \makecell{MAE \\ shape \\ polyDFE} \\
\midrule
HomSap & Constant & all sites & 0.0027 & 0.0111 & \bf{0.0018} & \bf{0.00071} & 0.029 & 0.014 \\
HomSap & Constant & exons & \bf{0.0012} & 0.0101 & 0.0023 & 0.030 & 0.0086 & \bf{0.0068} \\
HomSap & OOA 5R19 & all sites & 0.012 & 0.012 & \bf{0.0074} & \bf{0.027} & 0.055 & 0.035 \\
HomSap & OOA 5R19 & exons & 0.014 & 0.0111 & \bf{0.0072} & 0.031 & 0.051 & \bf{0.024} \\
PhoSin & Constant & all sites & \bf{0.024} & 0.025 & 0.024 & 0.21 & 0.24 & \bf{0.19} \\
PhoSin & Constant & exons & 0.024 & 0.025 & 0.024 & \bf{0.23} & 0.25 & 0.23 \\
PhoSin & Vaquita2Epoch 1R22 & all sites & \bf{0.024} & 0.025 & 0.024 & 0.20 & 0.23 & \bf{0.18} \\
PhoSin & Vaquita2Epoch 1R22 & exons & 0.024 & 0.025 & \bf{0.024} & \bf{0.18} & 0.23 & 0.21 \\
\bottomrule
\end{tabular}
\label{tab:dfe_table}
\end{table}


% roc curves for sweeps
\begin{figure}
    \centering
    \includegraphics[width=0.8 \textwidth]{figures/sweeps/roc_neutral_null.pdf}
    \caption{
    \label{fig:roc-curves}
    Receiver operating characteristic (ROC) curves for the three methods for detecting sweeps.
    False positive rates were established using neutral simulations that contained no sweeps
    but were conditioned on the same population history as the CEU and YRI samples.
    }
    %DRS-fig: can we include AUCs here?
\end{figure}


%sweep power vs exon density
\begin{figure}
    \centering
    \includegraphics[width=0.8 \textwidth]{figures/sweeps/relationship_power_exon.pdf}
    \caption{
    Power to detect selective sweeps at 100 locations along human chromosome 1 under a three-population out-of-Africa model \citep{gutenkunst2009inferring}  
    with and without purifying selection as a function of local exon density.
    Simulations are as described in Figure \ref{fig:chr1_power}.
    Each panel shows the results of simulations with purifying selection acting on exons (red; labelled BGS)
    and without purifying selection (blue; labelled Neutral).
    True positive rate is shown for these methods for the CEU and YRI samples (left and right respectively).
    Points represent the power at each location and trend lines along with 95\% confidence intervals are shown fit to those points.
    }
    \label{fig:power-exon}
\end{figure}


\stopsupplement
\end{document}