@inproceedings{jetleyMassivelyParallelCosmological2008,
address = {Miami, FL, USA},
title = {Massively parallel cosmological simulations with {ChaNGa}},
isbn = {978-1-4244-1693-6},
url = {http://ieeexplore.ieee.org/document/4536319/},
doi = {10.1109/IPDPS.2008.4536319},
booktitle = {2008 {IEEE} {International} {Symposium} on {Parallel} and {Distributed} {Processing}},
publisher = {IEEE},
author = {Jetley, Pritish and Gioachin, Filippo and Mendes, Celso and Kale, Laxmikant V. and Quinn, Thomas},
month = apr,
year = {2008},
note = {ISSN: 1530-2075},
pages = {1--12}
}
@article{bedorfSparseOctreeGravitational2012,
title = {A sparse octree gravitational {N}-body code that runs entirely on the {GPU} processor},
volume = {231},
issn = {0021-9991},
url = {http://www.sciencedirect.com/science/article/pii/S0021999111007364},
doi = {10.1016/j.jcp.2011.12.024},
number = {7},
urldate = {2020-07-29},
journal = {Journal of Computational Physics},
author = {Bédorf, Jeroen and Gaburov, Evghenii and Portegies Zwart, Simon},
month = apr,
year = {2012},
keywords = {GPU, N-body, Gravity, Hierarchical, Parallel, Tree-code},
pages = {2825--2839}
}
@article{BoardSchulten2000,
author = {J. Board and K. Schulten},
doi = {10.1109/5992.814662},
journal = {Computing in Science \& Engineering},
keywords = {FMM},
month = {January/February},
number = {1},
pages = {76--79},
title = {The Fast Multipole Algorithm},
volume = {2},
year = {2000}
}
@article{yingKernelindependentAdaptiveFast2004,
title = {A kernel-independent adaptive fast multipole algorithm in two and three dimensions},
volume = {196},
issn = {00219991},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0021999103006090},
doi = {10.1016/j.jcp.2003.11.021},
abstract = {We present a new fast multipole method for particle simulations. The main feature of our algorithm is that it does not require the implementation of multipole expansions of the underlying kernel, and it is based only on kernel evaluations. Instead of using analytic expansions to represent the potential generated by sources inside a box of the hierarchical FMM tree, we use a continuous distribution of an equivalent density on a surface enclosing the box. To find this equivalent density, we match its potential to the potential of the original sources at a surface, in the far field, by solving local Dirichlet-type boundary value problems. The far-field evaluations are sparsified with singular value decomposition in 2D or fast Fourier transforms in 3D. We have tested the new method on the single and double layer operators for the Laplacian, the modified Laplacian, the Stokes, the modified Stokes, the Navier, and the modified Navier operators in two and three dimensions. Our numerical results indicate that our method compares very well with the best known implementations of the analytic FMM method for both the Laplacian and modified Laplacian kernels. Its advantage is the (relative) simplicity of the implementation and its immediate extension to more general kernels.},
language = {en},
number = {2},
urldate = {2019-07-30},
journal = {Journal of Computational Physics},
author = {Ying, Lexing and Biros, George and Zorin, Denis},
month = may,
year = {2004},
pages = {591--626}
}
@article{GreengardRokhlin1987,
author = {L. Greengard and V. Rokhlin},
doi = {10.1016/0021-9991(87)90140-9},
journal = {Journal of Computational Physics},
keywords = {FMM},
number = {2},
pages = {325--348},
title = {A Fast Algorithm for Particle Simulations},
volume = {73},
year = {1987}
}
@article{cruz2011petfmm,
title={PetFMM—A dynamically load-balancing parallel fast multipole library},
author={Cruz, Felipe A. and Knepley, Matthew G. and Barba, Lorena A.},
journal={International Journal for Numerical Methods in Engineering},
volume={85},
number={4},
pages={403--428},
year={2011},
doi={10.1002/nme.2972},
publisher={Wiley Online Library}
}
@incollection{yokota2011gems,
title={Treecode and fast multipole method for N-body simulation with CUDA},
author={Yokota, Rio and Barba, Lorena A.},
booktitle={GPU Computing Gems Emerald Edition},
pages={113--132},
year={2011},
doi={10.1016/B978-0-12-384988-5.00009-7},
publisher={Elsevier}
}
@misc{BarbaYokota2012-figshare,
author = {Lorena A. Barba and Rio Yokota},
doi = {10.6084/m9.figshare.92166.v1},
howpublished = {Poster on \textbf{Figshare}, under CC-BY license, \href{https://dx.doi.org/10.6084/m9.figshare.92166.v1}{https://dx.doi.org/10.6084/m9.figshare.92166.v1}},
keywords = {figshare},
month = may,
title = {{ExaFMM: An open source library for Fast Multipole Methods aimed towards Exascale systems}},
year = {2012}
}
@article{YokotaBarba2011a,
author = {Yokota, R. and Barba, L.~A.},
journal = {The International Journal of High Performance Computing Applications},
doi = {10.1177/1094342011429952},
note = {Preprint on \href{http://arxiv.org/abs/1106.2176}{http://arxiv.org/abs/1106.2176}},
title = {A tuned and scalable fast multipole method as a preeminent algorithm for exascale systems},
year = {2012}
}
@article{yokotaFMMBasedDual2013,
title = {An {FMM} {Based} on {Dual} {Tree} {Traversal} for {Many}-{Core} {Architectures}},
volume = {7},
issn = {1748-3026},
url = {https://doi.org/10.1260/1748-3018.7.3.301},
doi = {10.1260/1748-3018.7.3.301},
abstract = {The present work attempts to integrate the independent efforts in the fast N-body community to create the fastest N-body library for many-core and heterogenous architectures. Focus is placed on low accuracy optimizations, in response to the recent interest to use FMM as a preconditioner for sparse linear solvers. A direct comparison with other state-of-the-art fast N-body codes demonstrates that orders of magnitude increase in performance can be achieved by careful selection of the optimal algorithm and low-level optimization of the code. The current N-body solver uses a fast multipole method with an efficient strategy for finding the list of cell-cell interactions by a dual tree traversal. A task-based threading model is used to maximize thread-level parallelism and intra-node load-balancing. In order to extract the full potential of the SIMD units on the latest CPUs, the inner kernels are optimized using AVX instructions.},
language = {en},
number = {3},
urldate = {2019-07-30},
journal = {Journal of Algorithms \& Computational Technology},
author = {Yokota, R.},
month = sep,
year = {2013},
pages = {301--324}
}
@article{malhotraPVFMMParallelKernel2015,
title = {{PVFMM}: {A} {Parallel} {Kernel} {Independent} {FMM} for {Particle} and {Volume} {Potentials}},
volume = {18},
issn = {1815-2406, 1991-7120},
shorttitle = {{PVFMM}},
url = {https://www.cambridge.org/core/journals/communications-in-computational-physics/article/pvfmm-a-parallel-kernel-independent-fmm-for-particle-and-volume-potentials/365109A4C15B126CD2A184F767D4C957},
doi = {10.4208/cicp.020215.150515sw},
abstract = {We describe our implementation of a parallel fast multipole method for evaluating potentials for discrete and continuous source distributions. The first requires summation over the source points and the second requiring integration over a continuous source density. Both problems require $O(N^2)$ complexity when computed directly; however, can be accelerated to $O(N)$ time using FMM. In our PVFMM software library, we use kernel independent FMM and this allows us to compute potentials for a wide range of elliptic kernels. Our method is high order, adaptive and scalable. In this paper, we discuss several algorithmic improvements and performance optimizations including cache locality, vectorization, shared memory parallelism and use of coprocessors. Our distributed memory implementation uses space-filling curve for partitioning data and a hypercube communication scheme. We present convergence results for Laplace, Stokes and Helmholtz (low wavenumber) kernels for both particle and volume FMM. We measure efficiency of our method in terms of CPU cycles per unknown for different accuracies and different kernels. We also demonstrate scalability of our implementation up to several thousand processor cores on the Stampede platform at the Texas Advanced Computing Center.},
language = {en},
number = {3},
urldate = {2019-07-30},
journal = {Communications in Computational Physics},
author = {Malhotra, Dhairya and Biros, George},
month = sep,
year = {2015},
keywords = {31-04, 35J05, 65Y05, 65Y20, Fast multipole method, N-body problems, potential theory},
pages = {808--830}
}
@inproceedings{blanchardScalFMMGenericParallel2015,
address = {Salt Lake City, United States},
title = {{ScalFMM}: {A} {Generic} {Parallel} {Fast} {Multipole} {Library}},
shorttitle = {{ScalFMM}},
url = {https://hal.inria.fr/hal-01135253},
abstract = {ScalFMM (Parallel Fast Multipole Library for Large Scale Simulations) offers all the functionalities needed to perform large parallel simulations while enabling an easy customization of the simulation components: kernels, particles and cells. We will present how we use our library on two kinds of application involving boundary integral representations of physical fields. The first one implements isotropic dislocation kernels for Dislocation Dynamics and the second a time dependent kernel for acoustic problems.},
urldate = {2019-12-27},
booktitle = {{SIAM} {Conference} on {Computational} {Science} and {Engineering} ({SIAM} {CSE} 2015)},
author = {Blanchard, Pierre and Bramas, Bérenger and Coulaud, Olivier and Darve, Eric and Dupuy, Laurent and Etcheverry, Arnaud and Sylvand, Guillaume},
month = mar,
year = {2015},
keywords = {BEM, Dislocation Dynamics, FMM, HPC, ScalFMM, Time Domain}
}
@inproceedings{choiCPUGPUHybrid2014,
address = {Salt Lake City, UT, USA},
series = {{GPGPU}-7},
title = {A {CPU}-{GPU} {Hybrid} {Implementation} and {Model}-{Driven} {Scheduling} of the {Fast} {Multipole} {Method}},
isbn = {978-1-4503-2766-4},
url = {https://doi.org/10.1145/2588768.2576787},
doi = {10.1145/2588768.2576787},
abstract = {This paper presents an optimized CPU--GPU hybrid implementation and a GPU performance model for the kernel-independent fast multipole method (FMM). We implement an optimized kernel-independent FMM for GPUs, and combine it with our previous CPU implementation to create a hybrid CPU+GPU FMM kernel. When compared to another highly optimized GPU implementation, our implementation achieves as much as a 1.9× speedup. We then extend our previous lower bound analyses of FMM for CPUs to include GPUs. This yields a model for predicting the execution times of the different phases of FMM. Using this information, we estimate the execution times of a set of static hybrid schedules on a given system, which allows us to automatically choose the schedule that yields the best performance. In the best case, we achieve a speedup of 1.5× compared to our GPU-only implementation, despite the large difference in computational powers of CPUs and GPUs. We comment on one consequence of having such performance models, which is to enable speculative predictions about FMM scalability on future systems.},
urldate = {2020-06-21},
booktitle = {Proceedings of {Workshop} on {General} {Purpose} {Processing} {Using} {GPUs}},
publisher = {Association for Computing Machinery},
author = {Choi, Jee and Chandramowlishwaran, Aparna and Madduri, Kamesh and Vuduc, Richard},
month = mar,
year = {2014},
keywords = {exascale, fast multipole method, GPU, hybrid, multicore, performance model},
pages = {64--71}
}
@article{smigajSolvingBoundaryIntegral2015,
title = {Solving {Boundary} {Integral} {Problems} with {BEM}++},
volume = {41},
issn = {00983500},
url = {http://dl.acm.org/citation.cfm?doid=2732672.2590830},
doi = {10.1145/2590830},
language = {en},
number = {2},
urldate = {2019-10-03},
journal = {ACM Transactions on Mathematical Software},
author = {Śmigaj, Wojciech and Betcke, Timo and Arridge, Simon and Phillips, Joel and Schweiger, Martin},
month = feb,
year = {2015},
pages = {1--40}
}
@misc{pybind11,
author = {Wenzel Jakob and Jason Rhinelander and Dean Moldovan},
year = {2017},
note = {https://github.com/pybind/pybind11},
title = {pybind11 -- Seamless operability between C++11 and Python}
}
@article{Betcke2021,
doi = {10.21105/joss.02879},
url = {https://doi.org/10.21105/joss.02879},
year = {2021},
publisher = {The Open Journal},
volume = {6},
number = {59},
pages = {2879},
author = {Timo Betcke and Matthew W. Scroggs},
title = {Bempp-cl: A fast Python based just-in-time compiling boundary element library},
journal = {Journal of Open Source Software}
}
@article{bramasTBFMMGenericParallel2020,
title = {{TBFMM}: {A} {C}++ generic and parallel fast multipole method library},
volume = {5},
issn = {2475-9066},
shorttitle = {{TBFMM}},
url = {https://joss.theoj.org/papers/10.21105/joss.02444},
doi = {10.21105/joss.02444},
language = {en},
number = {56},
urldate = {2021-02-24},
journal = {Journal of Open Source Software},
author = {Bramas, Berenger},
month = dec,
year = {2020},
pages = {2444}
}
@article{agullo2014task,
title = {Task-based FMM for multicore architectures},
author = {Agullo, Emmanuel and Bramas, B{\'e}renger and Coulaud, Olivier and Darve, Eric and Messner, Matthias and Takahashi, Toru},
journal = {SIAM Journal on Scientific Computing},
doi = {10.1137/130915662},
volume = {36},
number = {1},
pages = {C66--C93},
year = {2014},
publisher = {SIAM}
}
@misc{wangETal2021,
title={High-productivity, high-performance workflow for virus-scale electrostatic simulations with {Bempp-Exafmm}},
author={Tingyu Wang and Christopher D. Cooper and Timo Betcke and Lorena A. Barba},
year={2021},
eprint={2103.01048},
archivePrefix={arXiv},
primaryClass={physics.comp-ph}
}