Merge pull request #44 from kbonney/joss_paper

Adding joss paper and joss action
sandialabs · May 17, 2023 · e5af076 · e5af076
2 parents 7bbcf7e + f78b48b
commit e5af076
Show file tree

Hide file tree

Showing 4 changed files with 293 additions and 0 deletions.
diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml
@@ -0,0 +1,23 @@
+# Builds a draft PDF of the JOSS paper on every push and publishes it as a
+# downloadable workflow artifact.
+on: [push]
+
+jobs:
+  paper:
+    runs-on: ubuntu-latest
+    name: Paper Draft
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Build draft PDF
+        uses: openjournals/openjournals-draft-action@master
+        with:
+          journal: joss
+          # This should be the path to the paper within your repo.
+          paper-path: paper.md
+      - name: Upload
+        # upload-artifact v1-v3 have been retired by GitHub; v4 is required
+        # for the upload step to keep working.
+        uses: actions/upload-artifact@v4
+        with:
+          name: paper
+          # This is the output path where Pandoc will write the compiled
+          # PDF. Note, this should be the same directory as the input
+          # paper.md
+          path: paper.pdf
diff --git a/paper.bib b/paper.bib
@@ -0,0 +1,187 @@
+@techreport{deceglie2018rdtools,
+  title={{RdTools}: an open source {Python} library for {PV} degradation analysis},
+  author={Deceglie, Michael G and Jordan, Dirk and Nag, Ambarish and Deline, Christopher A and Shinn, Adam},
+  year={2018},
+  institution={National Renewable Energy Lab. (NREL), Golden, CO (United States)}
+}
+
+@article{gunda2020machine,
+  title={A machine learning evaluation of maintenance records for common failure modes in {PV} inverters},
+  author={Gunda, Thushara and Hackett, Sean and Kraus, Laura and Downs, Christopher and Jones, Ryan and McNalley, Christopher and Bolen, Michael and Walker, Andy},
+  journal={IEEE Access},
+  volume={8},
+  pages={211610--211620},
+  year={2020},
+  doi={10.1109/ACCESS.2020.3039182},
+  publisher={IEEE}
+}
+
+@article{holmgren2018pvlib,
+  author  = {Holmgren, William F and Hansen, Clifford W and Mikofski, Mark A},
+  title   = {pvlib python: A python package for modeling solar energy systems},
+  journal = {Journal of Open Source Software},
+  year    = {2018},
+  volume  = {3},
+  number  = {29},
+  pages   = {884},
+  doi     = {10.21105/joss.00884}
+}
+
+@article{hopwood2020neural,
+  title={Neural network-based classification of string-level {IV} curves from physically-induced failures of photovoltaic modules},
+  author={Hopwood, Michael W and Gunda, Thushara and Seigneur, Hubert and Walters, Joseph},
+  journal={IEEE Access},
+  volume={8},
+  pages={161480--161487},
+  year={2020},
+  doi={10.1109/ACCESS.2020.3021577},
+  publisher={IEEE}
+}
+
+@article{hopwood2022classification,
+  title={Classification of Photovoltaic Failures with Hidden {Markov} Modeling, an Unsupervised Statistical Approach},
+  author={Hopwood, Michael W and Patel, Lekha and Gunda, Thushara},
+  journal={Energies},
+  volume={15},
+  number={14},
+  pages={5104},
+  year={2022},
+  doi={10.3390/en15145104},
+  publisher={MDPI}
+}
+
+@article{hopwood2022generation,
+  author    = {Hopwood, Michael W and Gunda, Thushara},
+  title     = {Generation of Data-Driven Expected Energy Models for Photovoltaic Systems},
+  journal   = {Applied Sciences},
+  publisher = {MDPI},
+  year      = {2022},
+  volume    = {12},
+  number    = {4},
+  pages     = {1872},
+  doi       = {10.3390/app12041872}
+}
+
+@article{hopwood2022physics,
+  title={Physics-Based Method for Generating Fully Synthetic {IV} Curve Training Datasets for Machine Learning Classification of {PV} Failures},
+  author={Hopwood, Michael W and Stein, Joshua S and Braid, Jennifer L and Seigneur, Hubert P},
+  journal={Energies},
+  volume={15},
+  number={14},
+  pages={5085},
+  year={2022},
+  doi={10.3390/en15145085},
+  publisher={MDPI}
+}
+
+@inproceedings{mckinney2010data,
+  title={Data Structures for Statistical Computing in {Python}},
+  author={McKinney, Wes},
+  editor={van der Walt, St{\'e}fan and Millman, Jarrod},
+  booktitle={Proceedings of the 9th {Python} in {Science} {Conference}},
+  pages={56--61},
+  year={2010},
+  doi={10.25080/Majora-92bf1922-00a},
+  address={Austin, TX}
+}
+
+@inproceedings{mendoza2021pvops,
+  title={{pvOps}: Improving operational assessments through data fusion},
+  author={Mendoza, Hector and Hopwood, Michael and Gunda, Thushara},
+  booktitle={2021 {IEEE} 48th Photovoltaic Specialists Conference ({PVSC})},
+  pages={0112--0119},
+  year={2021},
+  doi={10.1109/PVSC43889.2021.9518439},
+  organization={IEEE}
+}
+
+@software{reback2020pandas,
+  author    = {{The pandas development team}},
+  title     = {pandas-dev/pandas: Pandas},
+  publisher = {Zenodo},
+  version   = {latest},
+  year      = {2020},
+  month     = feb,
+  doi       = {10.5281/zenodo.3509134},
+  url       = {https://doi.org/10.5281/zenodo.3509134}
+}
+
+
+@inproceedings{pierce2020identifying,
+  author       = {Pierce, Benjamin G and Karimi, Ahmad Maroof and Liu, JiQi and French, Roger H and Braid, Jennifer L},
+  title        = {Identifying Degradation Modes of Photovoltaic Modules Using Unsupervised Machine Learning on Electroluminescense Images},
+  booktitle    = {2020 47th IEEE Photovoltaic Specialists Conference (PVSC)},
+  organization = {IEEE},
+  year         = {2020},
+  pages        = {1850--1855},
+  doi          = {10.1109/PVSC45281.2020.9301021}
+}
+
+@techreport{klise2016performance,
+  title={Performance Monitoring using {Pecos} ({V}. 0.1)},
+  author={Klise, Katherine A and Stein, Joshua S},
+  year={2016},
+  institution={Sandia National Laboratories},
+  doi = {10.2172/1734479}
+}
+
+@online{plotly2015,
+  author    = {{Plotly Technologies Inc.}},
+  title     = {Collaborative data science},
+  publisher = {Plotly Technologies Inc.},
+  address   = {Montreal, QC},
+  year      = {2015},
+  url       = {https://plot.ly}
+}
+
+@article{waskom2021seaborn,
+  author    = {Waskom, Michael L.},
+  title     = {seaborn: statistical data visualization},
+  journal   = {Journal of Open Source Software},
+  publisher = {The Open Journal},
+  year      = {2021},
+  volume    = {6},
+  number    = {60},
+  pages     = {3021},
+  doi       = {10.21105/joss.03021},
+  url       = {https://doi.org/10.21105/joss.03021}
+}
+
+@article{hunter2007matplotlib,
+  title     = {{Matplotlib}: A {2D} graphics environment},
+  author    = {Hunter, J. D.},
+  year      = 2007,
+  journal   = {Computing in Science \& Engineering},
+  volume    = {9},
+  number    = {3},
+  pages     = {90--95},
+  publisher = {IEEE COMPUTER SOC},
+  doi       = {10.1109/MCSE.2007.55}
+}
+
+@book{bird2009nltk,
+  title = {Natural Language Processing with {Python}},
+  author = {Steven Bird and Ewan Klein and Edward Loper},
+  year = 2009,
+  publisher = {O'Reilly Media}
+}
+
+@misc{chollet2015keras,
+  author       = {Chollet, Fran\c{c}ois and others},
+  title        = {Keras},
+  howpublished = {\url{https://keras.io}},
+  year         = {2015}
+}
+
+@article{pedregosa2011sklearn,
+  author  = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and
+             Michel, V. and Thirion, B. and Grisel, O. and
+             Blondel, M. and Prettenhofer, P. and Weiss, R. and
+             Dubourg, V. and Vanderplas, J. and Passos, A. and
+             Cournapeau, D. and Brucher, M. and Perrot, M. and
+             Duchesnay, E.},
+  title   = {Scikit-learn: Machine Learning in {P}ython},
+  journal = {Journal of Machine Learning Research},
+  year    = {2011},
+  volume  = {12},
+  pages   = {2825--2830}
+}
diff --git a/paper.md b/paper.md
@@ -0,0 +1,83 @@
+---
+title: 'pvOps: a Python package for empirical analysis of photovoltaic field data'
+tags:
+  - Python
+  - photovoltaic
+  - time series 
+  - machine learning
+  - natural language processing
+authors:
+  - name: Kirk L. Bonney
+    corresponding: true # (This is how to denote the corresponding author)
+    orcid: 0009-0006-2383-1634
+    affiliation: 1 
+  - name: Thushara Gunda
+    orcid: 0000-0003-1945-4064
+    affiliation: 1 
+  - name: Michael W. Hopwood
+    orcid: 0000-0001-6190-1767 
+    affiliation: 2 
+  - name: Hector Mendoza
+    orcid: 0009-0007-5812-606X
+    affiliation: 1 
+  - name: Nicole D. Jackson
+    orcid: 0000-0002-3814-9906
+    affiliation: 1
+affiliations:
+ - name: Sandia National Laboratories, USA
+   index: 1
+ - name: University of Central Florida, USA
+   index: 2
+date: 4 April 2023
+bibliography: paper.bib
+---
+
+<!--[pvOps](figures/pvops_full_logo.svg) Perhaps we could ask the journal if there's a way to include the pvOps icon in the title. I'm ok with it being excluded.-->
+
+[GitHub repository]: https://github.com/sandialabs/pvOps
+[package documentation]: https://pvops.readthedocs.io/en/latest/
+
+# Summary
+
+The purpose of `pvOps` is to support empirical evaluations of data collected in the field related to the operations and maintenance (O&M) of photovoltaic (PV) power plants. `pvOps` presently contains modules that address the diversity of field data, including text-based maintenance logs, current-voltage (IV) curves, and timeseries of production information. The package functions leverage machine learning, visualization, and other techniques to enable cleaning, processing, and fusion of these datasets. These capabilities are intended to facilitate easier evaluation of field patterns and extraction of relevant insights to support reliability-related decision-making for PV sites. The open-source code, examples, and instructions for installing the package through PyPI can be accessed through the [GitHub repository]. 
+
+# Statement of Need
+
+Continued interest in PV deployment across the world has resulted in increased awareness of needs associated with managing reliability and performance of these systems during operation. Current open-source packages for PV analysis focus on theoretical evaluations of solar power simulations (e.g., `pvlib`; [@holmgren2018pvlib]), specific use cases of empirical evaluations (e.g., `RdTools`; [@deceglie2018rdtools] and `Pecos`; [@klise2016performance] for degradation analysis), or analysis of electroluminescence images (e.g., `PVimage`; [@pierce2020identifying]). However, a general package that can support data-driven, exploratory evaluations of diverse field-collected information is currently lacking. To address this gap, we present `pvOps`, an open-source Python package that can be used by researchers and industry analysts alike to evaluate different types of data routinely collected during PV field operations.
+
+PV data collected in the field varies greatly in structure (i.e., timeseries and text records) and quality (i.e., completeness and consistency). The data available for analysis is frequently semi-structured. Furthermore, the level of detail collected between different owners/operators might vary. For example, some may capture a general start and end time for an associated event whereas others might include additional time details for different resolution activities. This diversity in data types and structures often leads to data being under-utilized due to the amount of manual processing required. To address these issues, `pvOps` provides a suite of data processing, cleaning, and visualization methods to leverage insights across a broad range of data types, including operations and maintenance records,  production timeseries, and IV curves. The functions within `pvOps` enable users to better parse available data to understand patterns in outages and production losses. 
+
+# Package Overview 
+The following table summarizes the four modules within `pvOps` by presenting: the type of data they analyze, example data features, and highlights of relevant functions. 
+
+<!-- Module | Type of data | Example data features | Highlights of functions
+------- | ------ | --------- | -----------
+text | O&M records | - *timestamps* string or datetime <br> - *issue description* string (unstructured) <br> - *classification* string (structured) | - fill data gaps in dates and categorical records <br> - visualize word clusters and patterns over time
+timeseries | Production data | - site: integer or string <br> - *timestamp* string or datetime <br> - *power production* numeric <br> - *irradiance* numeric | - estimate expected energy with multiple models <br> - evaluate inverter clipping
+text2time | O&M records and <br> production data | see entries for `text` and <br>  `timeseries` modules above | - analyze overlaps between O&M and production (timeseries) records <br> - visualize overlaps between O&M records and production data
+iv | IV records | - *current* 1D array <br> - *voltage* 1D array <br> - *irradiance* numeric <br> - *temperature* numeric | - *simulate* IV curves with physical faults <br> - extract diode parameters from IV curves <br> - classify faults using IV curves -->
+
+![table](table.png)
+
+The functions within each module can be used to build pipelines that integrate relevant data processing, fusion, and visualization capabilities to support user end goals. For example, a user with IV curve data could build a pipeline that leverages functions within the `iv` module to process and extract diode parameters within IV curves as well as train models to support classifications based on fault type. A pipeline could also be built that leverages functions across modules if a user has access to multiple types of data (e.g., both O&M and production records). A sample end-to-end workflow using `pvOps` modules could be:
+
+1. Use functions within the `text` module to systematically review data quality issues within O&M records, train a machine learning model on available records, and use the model to estimate possible labels for missing entries
+2. Leverage the functions within the `timeseries` module to establish a basis for evaluating possible production losses: use machine learning to develop a custom expected energy model for a given time series of irradiance and system size details, use a pre-trained expected energy model [@hopwood2022generation], or apply industry standard equations
+3. Couple outputs from the above two analyses (using functions in the `text2time` module) based on timestamps to develop summaries and visualizations of production impacts observed during these periods
+
+The [package documentation] for `pvOps` provides thorough examples exploring the various capabilities of each module. Additional details about the `iv` module capabilities are captured in [@hopwood2020neural; @hopwood2022physics], while more information about the design and development of the `text`, `timeseries`, and `text2time` modules is captured in [@mendoza2021pvops]. Key package dependencies of `pvOps` include `pandas` [@reback2020pandas], `sklearn` [@pedregosa2011sklearn], `nltk` [@bird2009nltk], and `keras` [@chollet2015keras] for analysis and `matplotlib` [@hunter2007matplotlib], `seaborn` [@waskom2021seaborn], and `plotly` [@plotly2015] for visualization.
+
+# Ongoing Development
+The `pvOps` functionality and documentation continue to be improved and updated as new empirical techniques are identified. For example, research efforts have demonstrated the utility of natural language processing techniques (e.g., topic modeling) and survival analyses to support evaluation of patterns in O&M records [@gunda2020machine]. Additional statistical methods, such as Hidden Markov Modeling, have also been successfully used to support classification of failures within production data [@hopwood2022classification]. These and other capabilities will continue to be added to the package to improve its utility for supporting empirical analyses of field data.
+
+# CRediT Authorship Statement
+
+<!-- see: https://www.elsevier.com/authors/policies-and-guidelines/credit-author-statement -->
+
+KLB: Writing - Original Draft; TG: Conceptualization, Writing - Original Draft; MWH: Writing - Review & Editing; HM: Writing - Review & Editing; NDJ: Conceptualization, Funding Acquisition, Project Administration, Supervision, Writing - Review & Editing.
+
+# Acknowledgements
+This material is supported by the U.S. Department of Energy, Office of Energy Efficiency and Renewable Energy - Solar Energy Technologies Office. Sandia National Laboratories is a multimission laboratory managed and operated by National Technology and Engineering Solutions of Sandia LLC, a wholly owned subsidiary of Honeywell International Inc., for the U.S. Department of Energy’s National Nuclear Security Administration under contract DE-NA0003525.
+
+# References
+
+<!-- These will be formally checked and built during the review process -->
diff --git a/table.png b/table.png