@inproceedings{jetleyMassivelyParallelCosmological2008,
address = {Miami, FL, USA},
title = {Massively parallel cosmological simulations with {ChaNGa}},
isbn = {978-1-4244-1693-6},
url = {http://ieeexplore.ieee.org/document/4536319/},
doi = {10.1109/IPDPS.2008.4536319},
booktitle = {2008 {IEEE} {International} {Symposium} on {Parallel} and {Distributed} {Processing}},
publisher = {IEEE},
author = {Jetley, Pritish and Gioachin, Filippo and Mendes, Celso and Kale, Laxmikant V. and Quinn, Thomas},
month = apr,
year = {2008},
note = {ISSN: 1530-2075},
pages = {1--12}
}
@article{bedorfSparseOctreeGravitational2012,
title = {A sparse octree gravitational {N}-body code that runs entirely on the {GPU} processor},
volume = {231},
issn = {0021-9991},
url = {http://www.sciencedirect.com/science/article/pii/S0021999111007364},
doi = {10.1016/j.jcp.2011.12.024},
number = {7},
urldate = {2020-07-29},
journal = {Journal of Computational Physics},
author = {Bédorf, Jeroen and Gaburov, Evghenii and Portegies Zwart, Simon},
month = apr,
year = {2012},
keywords = {GPU, N-body, Gravity, Hierarchical, Parallel, Tree-code},
pages = {2825--2839}
}
@article{BoardSchulten2000,
author = {J. Board and K. Schulten},
doi = {10.1109/5992.814662},
journal = {Computing in Science \& Engineering},
keywords = {FMM},
month = {January/February},
number = {1},
pages = {76--79},
title = {The Fast Multipole Algorithm},
volume = {2},
year = {2000}
}
@article{yingKernelindependentAdaptiveFast2004,
title = {A kernel-independent adaptive fast multipole algorithm in two and three dimensions},
volume = {196},
issn = {00219991},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0021999103006090},
doi = {10.1016/j.jcp.2003.11.021},
abstract = {We present a new fast multipole method for particle simulations. The main feature of our algorithm is that it does not require the implementation of multipole expansions of the underlying kernel, and it is based only on kernel evaluations. Instead of using analytic expansions to represent the potential generated by sources inside a box of the hierarchical FMM tree, we use a continuous distribution of an equivalent density on a surface enclosing the box. To find this equivalent density, we match its potential to the potential of the original sources at a surface, in the far field, by solving local Dirichlet-type boundary value problems. The far-field evaluations are sparsified with singular value decomposition in 2D or fast Fourier transforms in 3D. We have tested the new method on the single and double layer operators for the Laplacian, the modified Laplacian, the Stokes, the modified Stokes, the Navier, and the modified Navier operators in two and three dimensions. Our numerical results indicate that our method compares very well with the best known implementations of the analytic FMM method for both the Laplacian and modified Laplacian kernels. Its advantage is the (relative) simplicity of the implementation and its immediate extension to more general kernels.},
language = {en},
number = {2},
urldate = {2019-07-30},
journal = {Journal of Computational Physics},
author = {Ying, Lexing and Biros, George and Zorin, Denis},
month = may,
year = {2004},
pages = {591--626}
}
@article{GreengardRokhlin1987,
author = {L. Greengard and V. Rokhlin},
doi = {10.1016/0021-9991(87)90140-9},
journal = {Journal of Computational Physics},
keywords = {FMM},
number = {2},
pages = {325--348},
title = {A Fast Algorithm for Particle Simulations},
volume = {73},
year = {1987}
}
@article{cruz2011petfmm,
title={PetFMM—A dynamically load-balancing parallel fast multipole library},
author={Cruz, Felipe A. and Knepley, Matthew G. and Barba, Lorena A.},
journal={International Journal for Numerical Methods in Engineering},
volume={85},
number={4},
pages={403--428},
year={2011},
doi={10.1002/nme.2972},
publisher={Wiley Online Library}
}
@incollection{yokota2011gems,
title={Treecode and fast multipole method for N-body simulation with CUDA},
author={Yokota, Rio and Barba, Lorena A.},
booktitle={GPU Computing Gems Emerald Edition},
pages={113--132},
year={2011},
doi={10.1016/B978-0-12-384988-5.00009-7},
publisher={Elsevier}
}
@misc{BarbaYokota2012-figshare,
author = {Lorena A. Barba and Rio Yokota},
doi = {10.6084/m9.figshare.92166.v1},
howpublished = {Poster on \textbf{Figshare}, under CC-BY license, \href{https://dx.doi.org/10.6084/m9.figshare.92166.v1}{https://dx.doi.org/10.6084/m9.figshare.92166.v1}},
keywords = {figshare},
month = may,
title = {{ExaFMM: An open source library for Fast Multipole Methods aimed towards Exascale systems}},
year = {2012}
}
@article{YokotaBarba2011a,
author = {Yokota, R. and Barba, L.~A.},
journal = {The International Journal of High Performance Computing Applications},
doi = {10.1177/1094342011429952},
note = {Preprint on \href{http://arxiv.org/abs/1106.2176}{http://arxiv.org/abs/1106.2176}},
title = {A tuned and scalable fast multipole method as a preeminent algorithm for exascale systems},
year = {2012}
}
@article{yokotaFMMBasedDual2013,
title = {An {FMM} {Based} on {Dual} {Tree} {Traversal} for {Many}-{Core} {Architectures}},
volume = {7},
issn = {1748-3026},
url = {https://doi.org/10.1260/1748-3018.7.3.301},
doi = {10.1260/1748-3018.7.3.301},
abstract = {The present work attempts to integrate the independent efforts in the fast N-body community to create the fastest N-body library for many-core and heterogenous architectures. Focus is placed on low accuracy optimizations, in response to the recent interest to use FMM as a preconditioner for sparse linear solvers. A direct comparison with other state-of-the-art fast N-body codes demonstrates that orders of magnitude increase in performance can be achieved by careful selection of the optimal algorithm and low-level optimization of the code. The current N-body solver uses a fast multipole method with an efficient strategy for finding the list of cell-cell interactions by a dual tree traversal. A task-based threading model is used to maximize thread-level parallelism and intra-node load-balancing. In order to extract the full potential of the SIMD units on the latest CPUs, the inner kernels are optimized using AVX instructions.},
language = {en},
number = {3},
urldate = {2019-07-30},
journal = {Journal of Algorithms \& Computational Technology},
author = {Yokota, R.},
month = sep,
year = {2013},
pages = {301--324}
}
@article{malhotraPVFMMParallelKernel2015,
title = {{PVFMM}: {A} {Parallel} {Kernel} {Independent} {FMM} for {Particle} and {Volume} {Potentials}},
volume = {18},
issn = {1815-2406, 1991-7120},
shorttitle = {{PVFMM}},
url = {https://www.cambridge.org/core/journals/communications-in-computational-physics/article/pvfmm-a-parallel-kernel-independent-fmm-for-particle-and-volume-potentials/365109A4C15B126CD2A184F767D4C957},
doi = {10.4208/cicp.020215.150515sw},
abstract = {We describe our implementation of a parallel fast multipole method for evaluating potentials for discrete and continuous source distributions. The first requires summation over the source points and the second requiring integration over a continuous source density. Both problems require $O(N^2)$ complexity when computed directly; however, can be accelerated to $O(N)$ time using FMM. In our PVFMM software library, we use kernel independent FMM and this allows us to compute potentials for a wide range of elliptic kernels. Our method is high order, adaptive and scalable. In this paper, we discuss several algorithmic improvements and performance optimizations including cache locality, vectorization, shared memory parallelism and use of coprocessors. Our distributed memory implementation uses space-filling curve for partitioning data and a hypercube communication scheme. We present convergence results for Laplace, Stokes and Helmholtz (low wavenumber) kernels for both particle and volume FMM. We measure efficiency of our method in terms of CPU cycles per unknown for different accuracies and different kernels. We also demonstrate scalability of our implementation up to several thousand processor cores on the Stampede platform at the Texas Advanced Computing Center.},
language = {en},
number = {3},
urldate = {2019-07-30},
journal = {Communications in Computational Physics},
author = {Malhotra, Dhairya and Biros, George},
month = sep,
year = {2015},
keywords = {31-04, 35J05, 65Y05, 65Y20, Fast multipole method, N-body problems, potential theory},
pages = {808--830}
}
@inproceedings{blanchardScalFMMGenericParallel2015,
address = {Salt Lake City, United States},
title = {{ScalFMM}: {A} {Generic} {Parallel} {Fast} {Multipole} {Library}},
shorttitle = {{ScalFMM}},
url = {https://hal.inria.fr/hal-01135253},
abstract = {ScalFMM (Parallel Fast Multipole Library for Large Scale Simulations) offers all the functionalities needed to perform large parallel simulations while enabling an easy customization of the simulation components: kernels, particles and cells. We will present how we use our library on two kinds of application involving boundary integral representations of physical fields. The first one implements isotropic dislocation kernels for Dislocation Dynamics and the second a time dependent kernel for acoustic problems.},
urldate = {2019-12-27},
booktitle = {{SIAM} {Conference} on {Computational} {Science} and {Engineering} ({SIAM} {CSE} 2015)},
author = {Blanchard, Pierre and Bramas, Bérenger and Coulaud, Olivier and Darve, Eric and Dupuy, Laurent and Etcheverry, Arnaud and Sylvand, Guillaume},
month = mar,
year = {2015},
keywords = {BEM, Dislocation Dynamics, FMM, HPC, ScalFMM, Time Domain}
}
@inproceedings{choiCPUGPUHybrid2014,
address = {Salt Lake City, UT, USA},
series = {{GPGPU}-7},
title = {A {CPU}-{GPU} {Hybrid} {Implementation} and {Model}-{Driven} {Scheduling} of the {Fast} {Multipole} {Method}},
isbn = {978-1-4503-2766-4},
url = {https://doi.org/10.1145/2588768.2576787},
doi = {10.1145/2588768.2576787},
abstract = {This paper presents an optimized CPU--GPU hybrid implementation and a GPU performance model for the kernel-independent fast multipole method (FMM). We implement an optimized kernel-independent FMM for GPUs, and combine it with our previous CPU implementation to create a hybrid CPU+GPU FMM kernel. When compared to another highly optimized GPU implementation, our implementation achieves as much as a 1.9× speedup. We then extend our previous lower bound analyses of FMM for CPUs to include GPUs. This yields a model for predicting the execution times of the different phases of FMM. Using this information, we estimate the execution times of a set of static hybrid schedules on a given system, which allows us to automatically choose the schedule that yields the best performance. In the best case, we achieve a speedup of 1.5× compared to our GPU-only implementation, despite the large difference in computational powers of CPUs and GPUs. We comment on one consequence of having such performance models, which is to enable speculative predictions about FMM scalability on future systems.},
urldate = {2020-06-21},
booktitle = {Proceedings of {Workshop} on {General} {Purpose} {Processing} {Using} {GPUs}},
publisher = {Association for Computing Machinery},
author = {Choi, Jee and Chandramowlishwaran, Aparna and Madduri, Kamesh and Vuduc, Richard},
month = mar,
year = {2014},
keywords = {exascale, fast multipole method, GPU, hybrid, multicore, performance model},
pages = {64--71}
}
@article{smigajSolvingBoundaryIntegral2015,
title = {Solving {Boundary} {Integral} {Problems} with {BEM}++},
volume = {41},
issn = {00983500},
url = {http://dl.acm.org/citation.cfm?doid=2732672.2590830},
doi = {10.1145/2590830},
language = {en},
number = {2},
urldate = {2019-10-03},
journal = {ACM Transactions on Mathematical Software},
author = {Śmigaj, Wojciech and Betcke, Timo and Arridge, Simon and Phillips, Joel and Schweiger, Martin},
month = feb,
year = {2015},
pages = {1--40}
}
@misc{pybind11,
author = {Wenzel Jakob and Jason Rhinelander and Dean Moldovan},
year = {2017},
note = {https://github.com/pybind/pybind11},
title = {pybind11 -- Seamless operability between C++11 and Python}
}
@article{Betcke2021,
doi = {10.21105/joss.02879},
url = {https://doi.org/10.21105/joss.02879},
year = {2021},
publisher = {The Open Journal},
volume = {6},
number = {59},
pages = {2879},
author = {Timo Betcke and Matthew W. Scroggs},
title = {Bempp-cl: A fast Python based just-in-time compiling boundary element library},
journal = {Journal of Open Source Software}
}
@article{bramasTBFMMGenericParallel2020,
title = {{TBFMM}: {A} {C}++ generic and parallel fast multipole method library},
volume = {5},
issn = {2475-9066},
shorttitle = {{TBFMM}},
url = {https://joss.theoj.org/papers/10.21105/joss.02444},
doi = {10.21105/joss.02444},
language = {en},
number = {56},
urldate = {2021-02-24},
journal = {Journal of Open Source Software},
author = {Bramas, Berenger},
month = dec,
year = {2020},
pages = {2444}
}
@article{agullo2014task,
title = {Task-based FMM for multicore architectures},
author = {Agullo, Emmanuel and Bramas, B{\'e}renger and Coulaud, Olivier and Darve, Eric and Messner, Matthias and Takahashi, Toru},
journal = {SIAM Journal on Scientific Computing},
doi = {10.1137/130915662},
volume = {36},
number = {1},
pages = {C66--C93},
year = {2014},
publisher = {SIAM}
}
@misc{wangETal2021,
title={High-productivity, high-performance workflow for virus-scale electrostatic simulations with {Bempp-Exafmm}},
author={Tingyu Wang and Christopher D. Cooper and Timo Betcke and Lorena A. Barba},
year={2021},
eprint={2103.01048},
archivePrefix={arXiv},
primaryClass={physics.comp-ph}
}