diff --git a/.github/workflows/paper.yml b/.github/workflows/paper.yml new file mode 100644 index 00000000..d04aeefe --- /dev/null +++ b/.github/workflows/paper.yml @@ -0,0 +1,20 @@ +name: Draft PDF +on: [push, workflow_dispatch] + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + paper-path: paper/paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + path: paper/paper.pdf diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e3d66b52..18a885b9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,6 +40,7 @@ repos: rev: v2.3.0 hooks: - id: codespell + args: ["-LOnd"] exclude: ^docs/usage/intro.ipynb$ - repo: https://github.com/rbubley/mirrors-prettier diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 00000000..be442799 --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,203 @@ +@Article{ harris:2020, + title = {Array programming with {NumPy}}, + author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. + van der Walt and Ralf Gommers and Pauli Virtanen and David + Cournapeau and Eric Wieser and Julian Taylor and Sebastian + Berg and Nathaniel J. Smith and Robert Kern and Matti Picus + and Stephan Hoyer and Marten H. van Kerkwijk and Matthew + Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del + R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre + G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and + Warren Weckesser and Hameer Abbasi and Christoph Gohlke and + Travis E. 
Oliphant}, + year = {2020}, + month = sep, + journal = {Nature}, + volume = {585}, + number = {7825}, + pages = {357--362}, + doi = {10.1038/s41586-020-2649-2}, + publisher = {Springer Science and Business Media {LLC}}, + url = {https://doi.org/10.1038/s41586-020-2649-2} +} + +@software{Gray:2023, +author = {Gray, Lindsey and Smith, Nicholas and Novak, Andrzej and Fackeldey, Peter and Tovar, Benjamin and Chen, Yi-Mu and Watts, Gordon and Krommydas, Iason}, +doi = {10.5281/zenodo.7733568}, +month = mar, +title = {{coffea}}, +url = {https://github.com/CoffeaTeam/coffea}, +version = {0.7.21}, +year = {2023} +} + +@software{Pivarski:2018, +author = {Pivarski, Jim and Osborne, Ianna and Ifrim, Ioana and Schreiner, Henry and Hollands, Angus and Biswas, Anish and Das, Pratyush and Roy Choudhury, Santam and Smith, Nicholas and Goyal, Manasvi}, +doi = {10.5281/zenodo.4341376}, +month = oct, +title = {{Awkward Array}}, +year = {2018} +} + +@inproceedings{rocklin:2015, + title={Dask: Parallel computation with blocked algorithms and task scheduling}, + author={Rocklin, Matthew}, + booktitle={Proceedings of the 14th python in science conference}, + number={130-136}, + year={2015}, + organization={Citeseer} +} + +@inproceedings{lam:2015, + title={Numba: A llvm-based python jit compiler}, + author={Lam, Siu Kwan and Pitrou, Antoine and Seibert, Stanley}, + booktitle={Proceedings of the Second Workshop on the LLVM Compiler Infrastructure in HPC}, + pages={1--6}, + year={2015} +} + +@article{Meurer:2017, + title = {SymPy: symbolic computing in Python}, + author = {Meurer, Aaron and Smith, Christopher P. and Paprocki, Mateusz and \v{C}ert\'{i}k, Ond\v{r}ej and Kirpichev, Sergey B. and Rocklin, Matthew and Kumar, AMiT and Ivanov, Sergiu and Moore, Jason K. and Singh, Sartaj and Rathnayake, Thilina and Vig, Sean and Granger, Brian E. and Muller, Richard P. and Bonazzi, Francesco and Gupta, Harsh and Vats, Shivam and Johansson, Fredrik and Pedregosa, Fabian and Curry, Matthew J. 
and Terrel, Andy R. and Rou\v{c}ka, \v{S}t\v{e}p\'{a}n and Saboo, Ashutosh and Fernando, Isuru and Kulal, Sumith and Cimrman, Robert and Scopatz, Anthony}, + year = 2017, + month = jan, + keywords = {Python, Computer algebra system, Symbolics}, + abstract = { + SymPy is an open source computer algebra system written in pure Python. It is built with a focus on extensibility and ease of use, through both interactive and programmatic applications. These characteristics have led SymPy to become a popular symbolic library for the scientific Python ecosystem. This paper presents the architecture of SymPy, a description of its features, and a discussion of select submodules. The supplementary material provide additional examples and further outline details of the architecture and features of SymPy. + }, + volume = 3, + pages = {e103}, + journal = {PeerJ Computer Science}, + issn = {2376-5992}, + url = {https://doi.org/10.7717/peerj-cs.103}, + doi = {10.7717/peerj-cs.103} +} + +@article{Kling:2023, + title={FLArE up dark sectors with EM form factors at the LHC forward physics facility}, + volume={987}, + ISSN={0550-3213}, + url={http://dx.doi.org/10.1016/j.nuclphysb.2023.116103}, + DOI={10.1016/j.nuclphysb.2023.116103}, + journal={Nuclear Physics B}, + publisher={Elsevier BV}, + author={Kling, Felix and Kuo, Jui-Lin and Trojanowski, Sebastian and Tsai, Yu-Dai}, + year={2023}, + month=feb, pages={116103} } + +@article{Held:2024, + author = "Held, Alexander and Kauffman, Elliott and Shadura, Oksana and Wightman, Andrew", + title = "{Physics analysis for the HL-LHC: Concepts and pipelines in practice with the Analysis Grand Challenge}", + eprint = "2401.02766", + archivePrefix = "arXiv", + primaryClass = "hep-ex", + doi = "10.1051/epjconf/202429506016", + journal = "EPJ Web Conf.", + volume = "295", + pages = "06016", + year = "2024" +} + +@InProceedings{Qu:2022, + author = "Qu, Huilin and Li, Congqiao and Qian, Sitian", + title = "{Particle Transformer} for Jet Tagging", + 
booktitle = "{Proceedings of the 39th International Conference on Machine Learning}", + pages = "18281--18292", + year = "2022", + eprint = "2202.03772", + archivePrefix = "arXiv", + primaryClass = "hep-ph" +} + +@article{Brehmer:2020, + author = "Brehmer, Johann and Kling, Felix and Espejo, Irina and Cranmer, Kyle", + title = "{MadMiner: Machine learning-based inference for particle physics}", + journal = "Comput. Softw. Big Sci.", + volume = "4", + year = "2020", + number = "1", + pages = "3", + doi = "10.1007/s41781-020-0035-2", + eprint = "1907.10621", + archivePrefix = "arXiv", + primaryClass = "hep-ph", + SLACcitation = "%%CITATION = ARXIV:1907.10621;%%" +} + +@software{aryan:2023, + author = {Aryan Roy and + Jim Pivarski and + Chris Papageorgakis and + Javier Duarte and + Lindsey Gray and + Henry Schreiner and + Raghav Kansal and + Matthew Feickert and + Kilian Lieret and + ssrothman}, + title = {scikit-hep/fastjet}, + month = jan, + year = 2023, + publisher = {Zenodo}, + doi = {10.5281/zenodo.7504167}, + url = {https://doi.org/10.5281/zenodo.7504167} +} + +@software{spyral-utils:2024, + author = {Gordon McCann}, + title = "{spyral-utils}", + url = {https://github.com/ATTPC/spyral-utils}, +} + +@software{weaver-core:2024, + author = {Huilin Qu and Javier Duarte and Stephen Chao and sunwayihep}, + title = "{weaver-core}", + url = {https://github.com/hqucms/weaver-core}, +} + +@software{pylhe, + author = {Lukas Heinrich and Matthew Feickert and Eduardo Rodrigues}, + title = "{pylhe}", + doi = {10.5281/zenodo.1217031}, + url = {https://github.com/scikit-hep/pylhe}, +} + +@software{root:2020, + author = {Rene Brun and + Fons Rademakers and + Philippe Canal and + Axel Naumann and + Olivier Couet and + Lorenzo Moneta and + Vassil Vassilev and + Sergey Linev and + Danilo Piparo and + Gerardo GANIS and + Bertrand Bellenot and + Enrico Guiraud and + Guilherme Amadio and + wverkerke and + Pere Mato and + TimurP and + Matevž Tadel and + wlav and + Enric Tejedor and + 
+ Jakob Blomer and + Andrei Gheata and + Stephan Hageboeck and + Stefan Roiser and + marsupial and + Stefan Wunsch and + Oksana Shadura and + Anirudha Bose and + CristinaCristescu and + Xavier Valls and + Raphael Isemann}, + title = {root-project/root: v6.18/02}, + month = jun, + year = 2020, + publisher = {Zenodo}, + version = {v6-18-02}, + doi = {10.5281/zenodo.3895860}, + url = {https://doi.org/10.5281/zenodo.3895860} +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 00000000..498c02e4 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,93 @@ +--- +title: "Vector: JIT-compilable mathematical manipulations of ragged Lorentz vectors" +tags: + - Python + - vector algebra + - high energy physics +authors: + - name: Saransh Chopra + orcid: 0000-0003-3046-7675 + equal-contrib: true + affiliation: "1, 2" + - name: Henry Schreiner + orcid: 0000-0002-7833-783X + equal-contrib: true + affiliation: 2 + - name: Jim Pivarski + orcid: 0000-0002-6649-343X + equal-contrib: true + corresponding: true + affiliation: 2 + +affiliations: + - name: University College London + index: 1 + - name: Princeton University + index: 2 +date: 12 October 2024 +bibliography: paper.bib +--- + +# Summary + +Mathematical manipulation of vectors is a crucial component of data analysis +pipelines in high energy physics, enabling physicists to transform raw data +into meaningful results that can be visualized. More specifically, high energy +physicists work with 2D and 3D Euclidean vectors, and 4D Lorentz vectors that +can be used as physical quantities, such as position, momentum, and forces. +Given that high energy physics data is not uniform, the vector manipulation +frameworks or libraries are expected to work readily on non-uniform or ragged +data, data with variable-sized rows (or a nested data structure with variable-sized +entries); thus, the library is expected to perform operations on an entire +ragged structure in a minimal number of passes. 
Furthermore, optimizing memory usage and +processing time has become essential with the increasing computational demands +at the LHC. Vector is a Python library for creating and manipulating 2D, 3D, +and Lorentz vectors, especially arrays of vectors, to solve common physics +problems in a NumPy-like [@harris:2020] way. The library enables physicists to +operate on high energy physics data in a high-level language without +compromising speed. The library is already in use at the LHC and is a part of +frameworks, like Coffea [@Gray:2023], employed by physicists across multiple +high energy physics experiments. + +# Statement of need + +Vector is one of the few Lorentz vector libraries providing a Pythonic interface +but a compiled (through Awkward Array [@Pivarski:2018]) computational backend. +Vector integrates seamlessly with the existing high energy physics +ecosystem and the broader scientific Python ecosystem, including libraries like +Dask [@rocklin:2015] and Numba [@lam:2015]. The library implements a variety of +backends for several purposes. Although vector was written with high energy +physics in mind, it is a general-purpose library that can be used for any +scientific or engineering application. The library houses a set of diverse +backends, 3 numerical backends for experimental physicists and 1 symbolic +backend for theoretical physicists. These backends include: + +- a pure Python object (builtin) backend for scalar computations, +- a NumPy backend for computations on regular collection-type data, +- a SymPy [@Meurer:2017] backend for symbolic computations, and +- an Awkward backend for computations on ragged collection-type data. + +There also exist implementations of the Object and the Awkward backend in Numba +for just-in-time compilable operations. Further, support for JAX and Dask is +provided through the Awkward backend, which enables vector functionalities to +support automatic differentiation and parallel computing. 
+ +## Impact + +Besides PyROOT's LorentzVectors and TLorentzVector [@root:2020], vector has +become a popular choice for mathematical manipulations in Python-based high energy +physics analysis pipelines. Along with being utilized directly in +analysis pipelines at the LHC and other experiments [@Kling:2023; @Held:2024; @Qu:2022], +the library is being used as a dependency in user-facing frameworks, such as +Coffea, MadMiner [@Brehmer:2020], FastJet [@aryan:2023], Spyral [@spyral-utils:2024], +Weaver [@weaver-core:2024], and pylhe [@pylhe]. The library is also used in multiple +teaching materials for graduate courses and workshops. Finally, given the generic +nature of the library, it is often used in non-high-energy-physics use cases. + +# Acknowledgements + +The work on vector was supported by NSF cooperative agreements OAC-1836650 +(IRIS-HEP) and PHY-2323298 (IRIS-HEP). We would additionally like to thank the +contributors of vector and the Scikit-HEP community for their support. + +# References