From 466f909a8272719818dc3b5626d2edf2891cde36 Mon Sep 17 00:00:00 2001 From: Nick Thompson Date: Mon, 8 Jan 2024 17:18:29 -0800 Subject: [PATCH] Fix race conditions in differential evolution Through a combination of silly mistakes, I missed a pile of race conditions in the OpenMP threading. Switch to C++ threading. Note that this change requires serial generation of trial vectors. Hopefully I can figure out to parallelize the generation of trial vectors to reduce the serial section a la Ahmdahl's law, while simultaneously keeping thread sanitizer happy. --- .gitignore | 1 + README.md | 6 +- doc/math.qbk | 3 + .../differential_evolution.qbk | 20 +- doc/roots/roots_overview.qbk | 1 - example/differential_evolution.cpp | 6 +- .../boost/math/optimization/detail/common.hpp | 165 +++++++++ .../optimization/differential_evolution.hpp | 231 ++++++++++++ .../math/tools/differential_evolution.hpp | 329 ------------------ test/differential_evolution_test.cpp | 105 +++--- test/test_functions_for_optimization.hpp | 77 ++++ 11 files changed, 549 insertions(+), 395 deletions(-) rename doc/{roots => optimization}/differential_evolution.qbk (88%) create mode 100644 include/boost/math/optimization/detail/common.hpp create mode 100644 include/boost/math/optimization/differential_evolution.hpp delete mode 100644 include/boost/math/tools/differential_evolution.hpp create mode 100644 test/test_functions_for_optimization.hpp diff --git a/.gitignore b/.gitignore index 5ff21efb64..6435addffd 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ cmake-build-debug/* .cmake/* build.ninja .ninja* +a.out diff --git a/README.md b/README.md index 07c6c4604b..b795613c46 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,13 @@ All the implementations are fully generic and support the use of arbitrary "real These functions also provide the basis of support for the TR1 special functions. 
-### Root Finding and Function Minimisation +### Root Finding A comprehensive set of root-finding algorithms over the real line, both with derivatives and derivative free. -Also function minimisation via Brent's Method. +### Optimization + +Minimization of cost functions via Brent's method and differential evolution. ### Polynomials and Rational Functions diff --git a/doc/math.qbk b/doc/math.qbk index f4e80a39a7..ddefdd1586 100644 --- a/doc/math.qbk +++ b/doc/math.qbk @@ -724,6 +724,9 @@ and as a CD ISBN 0-9504833-2-X 978-0-9504833-2-0, Classification 519.2-dc22. [mathpart root_finding Root Finding \& Minimization Algorithms] [include roots/roots_overview.qbk] [endmathpart] [/mathpart roots Root Finding Algorithms] +[mathpart optimization Optimization] +[include optimization/differential_evolution.qbk] +[endmathpart] [/mathpart optimization Optimization] [mathpart poly Polynomials and Rational Functions] [include internals/polynomial.qbk] diff --git a/doc/roots/differential_evolution.qbk b/doc/optimization/differential_evolution.qbk similarity index 88% rename from doc/roots/differential_evolution.qbk rename to doc/optimization/differential_evolution.qbk index 23e0e32aa3..78edcda662 100644 --- a/doc/roots/differential_evolution.qbk +++ b/doc/optimization/differential_evolution.qbk @@ -10,16 +10,16 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) [heading Synopsis] `` - #include + #include - namespace boost::math::tools { + namespace boost::math::optimization { template struct differential_evolution_parameters { using Real = typename ArgumentContainer::value_type; ArgumentContainer lower_bounds; ArgumentContainer upper_bounds; - Real F = static_cast(0.65); - double crossover_ratio = 0.5; + Real mutation_factor = static_cast(0.65); + double crossover_probability = 0.5; // Population in each generation: size_t NP = 200; size_t max_generations = 1000; @@ -47,8 +47,8 @@ We justify this design choice by reference to the "No free lunch" theorem of Wol 
`lower_bounds`: A container representing the lower bounds of the optimization space along each dimension. The `.size()` of the bounds should return the dimension of the problem. `upper_bounds`: A container representing the upper bounds of the optimization space along each dimension. It should have the same size of `lower_bounds`, and each element should be >= the corresponding element of `lower_bounds`. - `F`: The scale factor controlling the rate at which the population evolves (default is 0.65). - `crossover_ratio`: The crossover ratio determining the trade-off between exploration and exploitation (default is 0.5). + `mutation_factor`: The F or scale factor controlling the rate at which the population evolves (default is 0.65). + `crossover_probability`: The crossover probability determining the trade-off between exploration and exploitation (default is 0.5). `NP`: The population size (default is 200). Parallelization occurs over the population, so this should be "large". `max_generations`: The maximum number of generations for the optimization (default is 1000). `threads`: The number of threads to use for parallelization (default is the hardware concurrency). If the objective function is already multithreaded, then this should be set to 1 to prevent oversubscription. @@ -64,7 +64,7 @@ The most robust way of decreasing the probability of getting stuck in a local mi template ArgumentContainer differential_evolution(const Func cost_function, differential_evolution_parameters const &de_params, - URBG &g, + URBG &gen, std::invoke_result_t value_to_reach = std::numeric_limits>::quiet_NaN(), std::atomic *cancellation = nullptr, std::vector>> *queries = nullptr, @@ -102,6 +102,12 @@ Supported termination criteria are explicit requests for termination, value-to-r Price, Storn, and Lampinen, Section 2.8 also list population statistics and lack of accepted trials over many generations as sensible termination criteria. These could be supported if there is demand. 
+Parallelization with `std::thread` does have overhead—especially for very fast function calls. +We found that the function call needs to be roughly a microsecond for unambiguous parallelization wins. +However, we have not provided conditional parallelization, as computationally inexpensive cost functions are the exception, not the rule.
+/* + * Copyright Nick Thompson, 2024 + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef BOOST_MATH_OPTIMIZATION_DETAIL_COMMON_HPP +#define BOOST_MATH_OPTIMIZATION_DETAIL_COMMON_HPP +#include +#include +#include +#include +#include +#include + +namespace boost::math::optimization::detail { + +template struct has_resize : std::false_type {}; + +template +struct has_resize().resize(size_t{}))>> : std::true_type {}; + +template constexpr bool has_resize_v = has_resize::value; + +template +void validate_bounds(ArgumentContainer const &lower_bounds, ArgumentContainer const &upper_bounds) { + using std::isfinite; + std::ostringstream oss; + if (lower_bounds.size() == 0) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": The dimension of the problem cannot be zero."; + throw std::domain_error(oss.str()); + } + if (upper_bounds.size() != lower_bounds.size()) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": There must be the same number of lower bounds as upper bounds, but given "; + oss << upper_bounds.size() << " upper bounds, and " << lower_bounds.size() << " lower bounds."; + throw std::domain_error(oss.str()); + } + for (size_t i = 0; i < lower_bounds.size(); ++i) { + auto lb = lower_bounds[i]; + auto ub = upper_bounds[i]; + if (lb > ub) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": The upper bound must be greater than or equal to the lower bound, but the upper bound is " << ub + << " and the lower is " << lb << "."; + throw std::domain_error(oss.str()); + } + if (!isfinite(lb)) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": The lower bound must be finite, but got " << lb << "."; + oss << " For infinite bounds, emulate with std::numeric_limits::lower() or use a standard infinite->finite " + "transform."; + throw 
std::domain_error(oss.str()); + } + if (!isfinite(ub)) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": The upper bound must be finite, but got " << ub << "."; + oss << " For infinite bounds, emulate with std::numeric_limits::max() or use a standard infinite->finite " + "transform."; + throw std::domain_error(oss.str()); + } + } +} + +template +std::vector random_initial_population(ArgumentContainer const &lower_bounds, + ArgumentContainer const &upper_bounds, + size_t initial_population_size, URBG &&gen) { + using Real = typename ArgumentContainer::value_type; + constexpr bool has_resize = detail::has_resize_v; + std::vector population(initial_population_size); + auto const dimension = lower_bounds.size(); + for (size_t i = 0; i < population.size(); ++i) { + if constexpr (has_resize) { + population[i].resize(dimension); + } else { + // Argument type must be known at compile-time; like std::array: + if (population[i].size() != dimension) { + std::ostringstream oss; + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": For containers which do not have resize, the default size must be the same as the dimension, "; + oss << "but the default container size is " << population[i].size() << " and the dimension of the problem is " + << dimension << "."; + oss << " The function argument container type is " << typeid(ArgumentContainer).name() << ".\n"; + throw std::runtime_error(oss.str()); + } + } + } + + // Why don't we provide an option to initialize with (say) a Gaussian distribution? + // > If the optimum's location is fairly well known, + // > a Gaussian distribution may prove somewhat faster, although it + // > may also increase the probability that the population will converge prematurely. + // > In general, uniform distributions are preferred, since they best reflect + // > the lack of knowledge about the optimum's location. 
+ // - Differential Evolution: A Practical Approach to Global Optimization + // That said, scipy uses Latin Hypercube sampling and says self-avoiding sequences are preferable. + // So this is something that could be investigated and potentially improved. + using std::uniform_real_distribution; + uniform_real_distribution dis(Real(0), Real(1)); + for (size_t i = 0; i < population.size(); ++i) { + for (size_t j = 0; j < dimension; ++j) { + auto const &lb = lower_bounds[j]; + auto const &ub = upper_bounds[j]; + population[i][j] = lb + dis(gen) * (ub - lb); + } + } + + return population; +} + +template +void validate_initial_guess(ArgumentContainer const &initial_guess, ArgumentContainer const &lower_bounds, + ArgumentContainer const &upper_bounds) { + std::ostringstream oss; + auto const dimension = lower_bounds.size(); + if (initial_guess.size() != dimension) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": The initial guess must have the same dimensions as the problem,"; + oss << ", but the problem size is " << dimension << " and the initial guess has " << initial_guess.size() + << " elements."; + throw std::domain_error(oss.str()); + } + for (size_t i = 0; i < dimension; ++i) { + auto lb = lower_bounds[i]; + auto ub = upper_bounds[i]; + if (!isfinite(initial_guess[i])) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": At index " << i << ", the initial guess is " << initial_guess[i] + << ", make sure all elements of the initial guess are finite."; + throw std::domain_error(oss.str()); + } + if (initial_guess[i] < lb || initial_guess[i] > ub) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": At index " << i << " the initial guess " << initial_guess[i] << " is not in the bounds [" << lb << ", " + << ub << "]."; + throw std::domain_error(oss.str()); + } + } +} + +// Return indices corresponding to the minimum function values. 
+template std::vector best_indices(std::vector const &function_values) { + using std::isnan; + const size_t n = function_values.size(); + std::vector indices(n); + for (size_t i = 0; i < n; ++i) { + indices[i] = i; + } + + std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { + if (isnan(function_values[a])) { + return false; + } + if (isnan(function_values[b])) { + return true; + } + return function_values[a] < function_values[b]; + }); + return indices; +} + +} // namespace boost::math::optimization::detail +#endif \ No newline at end of file diff --git a/include/boost/math/optimization/differential_evolution.hpp b/include/boost/math/optimization/differential_evolution.hpp new file mode 100644 index 0000000000..36ccfa781d --- /dev/null +++ b/include/boost/math/optimization/differential_evolution.hpp @@ -0,0 +1,231 @@ +/* + * Copyright Nick Thompson, 2024 + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef BOOST_MATH_OPTIMIZATION_DIFFERENTIAL_EVOLUTION_HPP +#define BOOST_MATH_OPTIMIZATION_DIFFERENTIAL_EVOLUTION_HPP +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace boost::math::optimization { + +// Storn, R., Price, K. (1997). Differential evolution-a simple and efficient heuristic for global optimization over +// continuous spaces. +// Journal of global optimization, 11, 341-359. 
+// See: +// https://www.cp.eng.chula.ac.th/~prabhas//teaching/ec/ec2012/storn_price_de.pdf + +// We provide the parameters in a struct-there are too many of them and they are too unwieldy to pass individually: +template struct differential_evolution_parameters { + using Real = typename ArgumentContainer::value_type; + ArgumentContainer lower_bounds; + ArgumentContainer upper_bounds; + // mutation factor is also called scale factor or just F in the literature: + Real mutation_factor = static_cast(0.65); + double crossover_probability = 0.5; + // Population in each generation: + size_t NP = 500; + size_t max_generations = 1000; + ArgumentContainer const *initial_guess = nullptr; + unsigned threads = std::thread::hardware_concurrency(); +}; + +template +void validate_differential_evolution_parameters(differential_evolution_parameters const &de_params) { + using std::isfinite; + using std::isnan; + std::ostringstream oss; + detail::validate_bounds(de_params.lower_bounds, de_params.upper_bounds); + if (de_params.NP < 4) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": The population size must be at least 4, but requested population size of " << de_params.NP << "."; + throw std::invalid_argument(oss.str()); + } + // From: "Differential Evolution: A Practical Approach to Global Optimization (Natural Computing Series)" + // > The scale factor, F in (0,1+), is a positive real number that controls the rate at which the population evolves. + // > While there is no upper limit on F, effective values are seldom greater than 1.0. + // ... + // Also see "Limits on F", Section 2.5.1: + // > This discontinuity at F = 1 reduces the number of mutants by half and can result in erratic convergence... 
+ auto F = de_params.mutation_factor; + if (isnan(F) || F >= 1 || F <= 0) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": F in (0, 1) is required, but got F=" << F << "."; + throw std::domain_error(oss.str()); + } + if (de_params.max_generations < 1) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": There must be at least one generation."; + throw std::invalid_argument(oss.str()); + } + if (de_params.initial_guess) { + detail::validate_initial_guess(*de_params.initial_guess, de_params.lower_bounds, de_params.upper_bounds); + } + if (de_params.threads == 0) { + oss << __FILE__ << ":" << __LINE__ << ":" << __func__; + oss << ": There must be at least one thread."; + throw std::invalid_argument(oss.str()); + } +} + +template +ArgumentContainer differential_evolution( + const Func cost_function, differential_evolution_parameters const &de_params, URBG &gen, + std::invoke_result_t target_value = + std::numeric_limits>::quiet_NaN(), + std::atomic *cancellation = nullptr, + std::vector>> *queries = nullptr, + std::atomic> *current_minimum_cost = nullptr) { + using Real = typename ArgumentContainer::value_type; + using ResultType = std::invoke_result_t; + using std::clamp; + using std::isnan; + using std::round; + using std::uniform_real_distribution; + validate_differential_evolution_parameters(de_params); + const size_t dimension = de_params.lower_bounds.size(); + auto NP = de_params.NP; + auto population = detail::random_initial_population(de_params.lower_bounds, de_params.upper_bounds, NP, gen); + if (de_params.initial_guess) { + population[0] = *de_params.initial_guess; + } + std::vector cost(NP, std::numeric_limits::quiet_NaN()); + std::atomic target_attained = false; + // This mutex is only used if the queries are stored: + std::mutex mt; + + std::vector thread_pool; + auto const threads = de_params.threads; + for (size_t j = 0; j < threads; ++j) { + // Note that if some members of the population take way longer to 
compute, + // then this parallelization strategy is very suboptimal. + // However, we tried using std::async (which should be robust to this particular problem), + // but the overhead was just totally unacceptable on ARM Macs (the only platform tested). + // As the economists say "there are no solutions, only tradeoffs". + thread_pool.emplace_back([&, j]() { + for (size_t i = j; i < cost.size(); i += threads) { + cost[i] = cost_function(population[i]); + if (current_minimum_cost && cost[i] < *current_minimum_cost) { + *current_minimum_cost = cost[i]; + } + if (queries) { + std::scoped_lock lock(mt); + queries->push_back(std::make_pair(population[i], cost[i])); + } + if (!isnan(target_value) && cost[i] <= target_value) { + target_attained = true; + } + } + }); + } + for (auto &thread : thread_pool) { + thread.join(); + } + + std::vector trial_vectors(NP); + for (size_t i = 0; i < NP; ++i) { + if constexpr (detail::has_resize_v) { + trial_vectors[i].resize(dimension); + } + } + + for (size_t generation = 0; generation < de_params.max_generations; ++generation) { + if (cancellation && *cancellation) { + break; + } + if (target_attained) { + break; + } + + // You might be tempted to parallelize the generation of trial vectors. + // Here's the deal: Reproducibly generating random numbers is a nightmare. + // I first tried seeding thread-local random number generators with the global generator, + // but ThreadSanitizer didn't like it. I *suspect* there's a way around this, but + // even if it's formally threadsafe, there's still a lot of effort to get it computationally reproducible. 
+ uniform_real_distribution unif01(Real(0), Real(1)); + for (size_t i = 0; i < NP; ++i) { + size_t r1, r2, r3; + do { + r1 = gen() % NP; + } while (r1 == i); + do { + r2 = gen() % NP; + } while (r2 == i || r2 == r1); + do { + r3 = gen() % NP; + } while (r3 == i || r3 == r2 || r3 == r1); + for (size_t j = 0; j < dimension; ++j) { + // See equation (4) of the reference: + auto guaranteed_changed_idx = gen() % dimension; + if (unif01(gen) < de_params.crossover_probability || j == guaranteed_changed_idx) { + auto tmp = population[r1][j] + de_params.mutation_factor * (population[r2][j] - population[r3][j]); + auto const &lb = de_params.lower_bounds[j]; + auto const &ub = de_params.upper_bounds[j]; + // Some others recommend regenerating the indices rather than clamping; + // I dunno seems like it could get stuck regenerating . . . + trial_vectors[i][j] = clamp(tmp, lb, ub); + } else { + trial_vectors[i][j] = population[i][j]; + } + } + } + + thread_pool.resize(0); + for (size_t j = 0; j < threads; ++j) { + thread_pool.emplace_back([&, j]() { + for (size_t i = j; i < cost.size(); i += threads) { + if (target_attained) { + return; + } + if (cancellation && *cancellation) { + return; + } + auto const trial_cost = cost_function(trial_vectors[i]); + if (isnan(trial_cost)) { + continue; + } + if (queries) { + std::scoped_lock lock(mt); + queries->push_back(std::make_pair(trial_vectors[i], trial_cost)); + } + if (trial_cost < cost[i] || isnan(cost[i])) { + cost[i] = trial_cost; + if (!isnan(target_value) && cost[i] <= target_value) { + target_attained = true; + } + if (current_minimum_cost && cost[i] < *current_minimum_cost) { + *current_minimum_cost = cost[i]; + } + population[i] = trial_vectors[i]; + } + } + }); + } + for (auto &thread : thread_pool) { + thread.join(); + } + } + + auto it = std::min_element(cost.begin(), cost.end()); + return population[std::distance(cost.begin(), it)]; +} + +} // namespace boost::math::optimization +#endif diff --git 
a/include/boost/math/tools/differential_evolution.hpp b/include/boost/math/tools/differential_evolution.hpp deleted file mode 100644 index 61066c5c82..0000000000 --- a/include/boost/math/tools/differential_evolution.hpp +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright Nick Thompson, 2023 - * Use, modification and distribution are subject to the - * Boost Software License, Version 1.0. (See accompanying file - * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - */ -#ifndef BOOST_MATH_TOOLS_DIFFERENTIAL_EVOLUTION_HPP -#define BOOST_MATH_TOOLS_DIFFERENTIAL_EVOLUTION_HPP -#include -#include -#include -#include -#include -#if __has_include() -#include -#endif -#include -#include -#include -#include -#include -#include -#include - -namespace boost::math::tools { - -namespace detail { - -template struct has_resize : std::false_type {}; - -template -struct has_resize().resize(size_t{}))>> : std::true_type {}; - -template constexpr bool has_resize_v = has_resize::value; - -} // namespace detail - -// Storn, R., Price, K. (1997). Differential evolution-a simple and efficient heuristic for global optimization over -// continuous spaces. -// Journal of global optimization, 11, 341-359. 
-// See: -// https://www.cp.eng.chula.ac.th/~prabhas//teaching/ec/ec2012/storn_price_de.pdf - -// We provide the parameters in a struct-there are too many of them and they are too unwieldy to pass individually: -template struct differential_evolution_parameters { - using Real = typename ArgumentContainer::value_type; - ArgumentContainer lower_bounds; - ArgumentContainer upper_bounds; - Real F = static_cast(0.65); - double crossover_ratio = 0.5; - // Population in each generation: - size_t NP = 200; - - size_t max_generations = 1000; -#if defined(_OPENMP) - size_t threads = std::thread::hardware_concurrency(); -#else - size_t threads = 1; -#endif - ArgumentContainer const *initial_guess = nullptr; -}; - -template -void validate_differential_evolution_parameters(differential_evolution_parameters const &de_params) { - using std::isfinite; - using std::isnan; - std::ostringstream oss; - if (de_params.threads == 0) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": Requested zero threads to perform the calculation, but at least 1 is required."; - throw std::invalid_argument(oss.str()); - } - if (de_params.lower_bounds.size() == 0) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": The dimension of the problem cannot be zero."; - throw std::domain_error(oss.str()); - } - if (de_params.upper_bounds.size() != de_params.lower_bounds.size()) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": There must be the same number of lower bounds as upper bounds, but given "; - oss << de_params.upper_bounds.size() << " upper bounds, and " << de_params.lower_bounds.size() << " lower bounds."; - throw std::domain_error(oss.str()); - } - for (size_t i = 0; i < de_params.lower_bounds.size(); ++i) { - auto lb = de_params.lower_bounds[i]; - auto ub = de_params.upper_bounds[i]; - if (lb > ub) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": The upper bound must be greater than or equal to the lower bound, but 
the upper bound is " << ub - << " and the lower is " << lb << "."; - throw std::domain_error(oss.str()); - } - if (!isfinite(lb)) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": The lower bound must be finite, but got " << lb << "."; - oss << " For infinite bounds, emulate with std::numeric_limits::lower() or use a standard infinite->finite " - "transform."; - throw std::domain_error(oss.str()); - } - if (!isfinite(ub)) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": The upper bound must be finite, but got " << ub << "."; - oss << " For infinite bounds, emulate with std::numeric_limits::max() or use a standard infinite->finite " - "transform."; - throw std::domain_error(oss.str()); - } - } - if (de_params.NP < 4) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": The population size must be at least 4, but requested population size of " << de_params.NP << "."; - throw std::invalid_argument(oss.str()); - } - if (de_params.threads > de_params.NP) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": There must be more individuals in the population than threads."; - throw std::invalid_argument(oss.str()); - } - // From: "Differential Evolution: A Practical Approach to Global Optimization (Natural Computing Series)" - // > The scale factor, F in (0,1+), is a positive real number that controls the rate at which the population evolves. - // > While there is no upper limit on F, effective values are seldom greater than 1.0. - // ... - // Also see "Limits on F", Section 2.5.1: - // > This discontinuity at F = 1 reduces the number of mutants by half and can result in erratic convergence... 
- auto F = de_params.F; - if (isnan(F) || F >= 1 || F <= 0) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": F in (0, 1) is required, but got F=" << F << "."; - throw std::domain_error(oss.str()); - } - if (de_params.max_generations < 1) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": There must be at least one generation."; - throw std::invalid_argument(oss.str()); - } - if (de_params.initial_guess) { - auto dimension = de_params.lower_bounds.size(); - if (de_params.initial_guess->size() != dimension) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": The initial guess must have the same dimensions as the problem,"; - oss << ", but the problem size is " << dimension << " and the initial guess has " - << de_params.initial_guess->size() << " elements."; - throw std::domain_error(oss.str()); - } - auto const &guess = *de_params.initial_guess; - for (size_t i = 0; i < dimension; ++i) { - auto lb = de_params.lower_bounds[i]; - auto ub = de_params.upper_bounds[i]; - if (guess[i] < lb || guess[i] > ub) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": At index " << i << " the initial guess " << guess[i] << " is not in the bounds [" << lb << ", " << ub - << "]."; - throw std::domain_error(oss.str()); - } - } - } -#if !defined(_OPENMP) - if (de_params.threads != 1) { - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": If OpenMP is not available, then there algorithm must run on a single thread, but requested " - << de_params.threads << " threads."; - throw std::invalid_argument(oss.str()); - } -#endif -} - -template -ArgumentContainer differential_evolution( - const Func cost_function, differential_evolution_parameters const &de_params, URBG &g, - std::invoke_result_t target_value = - std::numeric_limits>::quiet_NaN(), - std::atomic *cancellation = nullptr, - std::vector>> *queries = nullptr, - std::atomic> *current_minimum_cost = nullptr) { - using Real = typename 
ArgumentContainer::value_type; - validate_differential_evolution_parameters(de_params); - constexpr bool has_resize = detail::has_resize_v; - - using ResultType = std::invoke_result_t; - using std::clamp; - using std::isnan; - using std::round; - auto const NP = de_params.NP; - std::vector population(NP); - auto const dimension = de_params.lower_bounds.size(); - for (size_t i = 0; i < population.size(); ++i) { - if constexpr (has_resize) { - population[i].resize(dimension); - } else { - // Argument type must be known at compile-time; like std::array: - if (population[i].size() != dimension) { - std::ostringstream oss; - oss << __FILE__ << ":" << __LINE__ << ":" << __func__; - oss << ": For containers which do not have resize, the default size must be the same as the dimension, "; - oss << "but the default container size is " << population[i].size() << " and the dimension of the problem is " - << dimension << "."; - oss << " The function argument container type is " << typeid(ArgumentContainer).name() << ".\n"; - throw std::runtime_error(oss.str()); - } - } - } - // Why don't we provide an option to initialize with (say) a Gaussian distribution? - // > If the optimum's location is fairly well known, - // > a Gaussian distribution may prove somewhat faster, although it - // > may also increase the probability that the population will converge prematurely. - // > In general, uniform distributions are preferred, since they best reflect - // > the lack of knowledge about the optimum's location. - // - Differential Evolution: A Practical Approach to Global Optimization - // That said, scipy uses Latin Hypercube sampling and says self-avoiding sequences are preferable. - // So this is something that could be investigated and potentially improved. 
- using std::uniform_real_distribution; - uniform_real_distribution dis(Real(0), Real(1)); - for (size_t i = 0; i < population.size(); ++i) { - for (size_t j = 0; j < dimension; ++j) { - auto const &lb = de_params.lower_bounds[j]; - auto const &ub = de_params.upper_bounds[j]; - population[i][j] = lb + dis(g) * (ub - lb); - } - } - if (de_params.initial_guess) { - population[0] = *de_params.initial_guess; - } - - std::atomic target_attained = false; - std::vector cost(NP, std::numeric_limits::quiet_NaN()); -#if defined(_OPENMP) -#pragma omp parallel for num_threads(de_params.threads) -#endif - for (size_t i = 0; i < cost.size(); ++i) { - cost[i] = cost_function(population[i]); - if (!isnan(target_value) && cost[i] <= target_value) { - target_attained = true; - } - if (current_minimum_cost && cost[i] < *current_minimum_cost) { - *current_minimum_cost = cost[i]; - } - if (queries) { -#if defined(_OPENMP) // get rid of -Wunknown-pragmas when OpenMP is not available: -#pragma omp critical -#endif - queries->push_back(std::make_pair(population[i], cost[i])); - } - } - - // This probably won't show up on any performance metrics, but just convert everything to integer ops: - const auto crossover_int = - static_cast(round(static_cast((URBG::max)() - (URBG::min)()) * de_params.crossover_ratio)); - std::vector generators(de_params.threads); - for (size_t i = 0; i < de_params.threads; ++i) { - generators[i].seed(g()); - } - for (size_t generation = 0; generation < de_params.max_generations; ++generation) { - if (cancellation && *cancellation) { - break; - } - if (target_attained) { - break; - } -#if defined(_OPENMP) -#pragma omp parallel for num_threads(de_params.threads) -#endif - for (size_t i = 0; i < NP; ++i) { -#if defined(_OPENMP) - size_t thread_idx = omp_get_thread_num(); -#else - size_t thread_idx = 0; -#endif - auto &gen = generators[thread_idx]; - size_t r1, r2, r3; - do { - r1 = gen() % NP; - } while (r1 == i); - do { - r2 = gen() % NP; - } while (r2 == i || r2 == 
r1); - do { - r3 = gen() % NP; - } while (r3 == i || r3 == r2 || r3 == r1); - // Hopefully the compiler optimizes this so that it's not allocated on every iteration: - ArgumentContainer trial_vector; - if constexpr (has_resize) { - trial_vector.resize(dimension); - } - for (size_t j = 0; j < dimension; ++j) { - // See equation (4) of the reference: - auto guaranteed_changed_idx = gen() % NP; - if (gen() < crossover_int || j == guaranteed_changed_idx) { - auto tmp = population[r1][j] + de_params.F * (population[r2][j] - population[r3][j]); - auto const &lb = de_params.lower_bounds[j]; - auto const &ub = de_params.upper_bounds[j]; - // Some others recommend regenerating the indices rather than clamping; - // I dunno seems like it could get stuck regenerating . . . - trial_vector[j] = clamp(tmp, lb, ub); - } else { - trial_vector[j] = population[i][j]; - } - } - auto trial_cost = cost_function(trial_vector); - if (queries) { -#if defined(_OPENMP) -#pragma omp critical -#endif - queries->push_back(std::make_pair(trial_vector, trial_cost)); - } - - if (isnan(trial_cost)) { - continue; - } - if (trial_cost < cost[i] || isnan(cost[i])) { - std::swap(population[i], trial_vector); - cost[i] = trial_cost; - if (!isnan(target_value) && cost[i] <= target_value) { - target_attained = true; - // In a single-threaded context, I'd put a break statement here, - // but OpenMP does not allow break statements in for loops. - // We'll just have to wait until the end of this generation. 
- } - if (current_minimum_cost && cost[i] < *current_minimum_cost) { - *current_minimum_cost = cost[i]; - } - } - } - } - - auto it = std::min_element(cost.begin(), cost.end()); - return population[std::distance(cost.begin(), it)]; -} - -} // namespace boost::math::tools -#endif diff --git a/test/differential_evolution_test.cpp b/test/differential_evolution_test.cpp index ffa54c0925..1267568015 100644 --- a/test/differential_evolution_test.cpp +++ b/test/differential_evolution_test.cpp @@ -6,29 +6,15 @@ */ #include "math_unit_test.hpp" -#include -#include +#include "test_functions_for_optimization.hpp" +#include #include -using boost::math::constants::e; -using boost::math::constants::two_pi; -using boost::math::tools::differential_evolution; -using boost::math::tools::differential_evolution_parameters; -using std::cbrt; -using std::cos; -using std::exp; -using std::sqrt; - -// Taken from: https://en.wikipedia.org/wiki/Test_functions_for_optimization -template Real ackley(std::array const &v) { - Real x = v[0]; - Real y = v[1]; - Real arg1 = -sqrt((x * x + y * y) / 2) / 5; - Real arg2 = cos(two_pi() * x) + cos(two_pi() * y); - return -20 * exp(arg1) - exp(arg2 / 2) + 20 + e(); -} +using boost::math::optimization::differential_evolution; +using boost::math::optimization::differential_evolution_parameters; template void test_ackley() { + std::cout << "Testing differential evolution on the Ackley function . . .\n"; using ArgType = std::array; auto de_params = differential_evolution_parameters(); de_params.lower_bounds = {-5, -5}; @@ -53,13 +39,8 @@ template void test_ackley() { CHECK_EQUAL(local_minima[1], Real(0)); } -template auto rosenbrock_saddle(std::array const &v) { - auto x = v[0]; - auto y = v[1]; - return 100 * (x * x - y) * (x * x - y) + (1 - x) * (1 - x); -} - template void test_rosenbrock_saddle() { + std::cout << "Testing differential evolution on the Rosenbrock saddle . . 
.\n"; using ArgType = std::array; auto de_params = differential_evolution_parameters(); de_params.lower_bounds = {0.5, 0.5}; @@ -78,16 +59,8 @@ template void test_rosenbrock_saddle() { CHECK_GE(std::abs(local_minima[0] - Real(1)), std::sqrt(std::numeric_limits::epsilon())); } -template Real rastrigin(std::vector const &v) { - Real A = 10; - Real y = 10 * v.size(); - for (auto x : v) { - y += x * x - A * cos(two_pi() * x); - } - return y; -} - template void test_rastrigin() { + std::cout << "Testing differential evolution on the Rastrigin function . . .\n"; using ArgType = std::vector; auto de_params = differential_evolution_parameters(); de_params.lower_bounds.resize(8, static_cast(-5.12)); @@ -104,43 +77,69 @@ template void test_rastrigin() { CHECK_LE(rastrigin(local_minima), target_value); } -double sphere(std::vector const &v) { - double r = 0.0; - for (auto x : v) { - double x_ = static_cast(x); - r += x_ * x_; - } - if (r >= 1) { - return std::numeric_limits::quiet_NaN(); - } - return r; -} - // Tests NaN return types and return type != input type: void test_sphere() { + std::cout << "Testing differential evolution on the sphere function . . 
.\n"; using ArgType = std::vector; auto de_params = differential_evolution_parameters(); - de_params.lower_bounds.resize(8, -1); - de_params.upper_bounds.resize(8, 1); + de_params.lower_bounds.resize(3, -1); + de_params.upper_bounds.resize(3, 1); de_params.NP *= 10; de_params.max_generations *= 10; + de_params.crossover_probability = 0.9; + double target_value = 1e-8; + de_params.threads = 1; std::mt19937_64 gen(56789); - auto local_minima = differential_evolution(sphere, de_params, gen); + auto local_minima = differential_evolution(sphere, de_params, gen, target_value); + CHECK_LE(sphere(local_minima), target_value); + // Check computational reproducibility: + gen.seed(56789); + auto local_minima_2 = differential_evolution(sphere, de_params, gen, target_value); + for (size_t i = 0; i < local_minima.size(); ++i) { + CHECK_EQUAL(local_minima[i], local_minima_2[i]); + } +} + +template +void test_three_hump_camel() { + std::cout << "Testing differential evolution on the three hump camel . . .\n"; + using ArgType = std::array; + auto de_params = differential_evolution_parameters(); + de_params.lower_bounds[0] = -5.0; + de_params.lower_bounds[1] = -5.0; + de_params.upper_bounds[0] = 5.0; + de_params.upper_bounds[1] = 5.0; + std::mt19937_64 gen(56789); + auto local_minima = differential_evolution(three_hump_camel, de_params, gen); for (auto x : local_minima) { CHECK_ABSOLUTE_ERROR(0.0f, x, 2e-4f); } } -#define GCC_COMPILER (defined(__GNUC__) && !defined(__clang__)) +template +void test_beale() { + std::cout << "Testing differential evolution on the Beale function . . 
.\n"; + using ArgType = std::array; + auto de_params = differential_evolution_parameters(); + de_params.lower_bounds[0] = -5.0; + de_params.lower_bounds[1] = -5.0; + de_params.upper_bounds[0]= 5.0; + de_params.upper_bounds[1]= 5.0; + std::mt19937_64 gen(56789); + auto local_minima = differential_evolution(beale, de_params, gen); + CHECK_ABSOLUTE_ERROR(Real(3), local_minima[0], Real(2e-4)); + CHECK_ABSOLUTE_ERROR(Real(1)/Real(2), local_minima[1], Real(2e-4)); +} int main() { - // GCC<=8 rejects the function call syntax we use here. - // Just do a workaround: -#if !defined(GCC_COMPILER) || (defined(GCC_COMPILER) && __GNUC__ >= 9) + +#if defined(__clang__) || defined(_MSC_VER) test_ackley(); test_ackley(); test_rosenbrock_saddle(); test_rastrigin(); + test_three_hump_camel(); + test_beale(); #endif test_sphere(); return boost::math::test::report_errors(); diff --git a/test/test_functions_for_optimization.hpp b/test/test_functions_for_optimization.hpp new file mode 100644 index 0000000000..46b9779af1 --- /dev/null +++ b/test/test_functions_for_optimization.hpp @@ -0,0 +1,77 @@ +/* + * Copyright Nick Thompson, 2024 + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. 
(See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef TEST_FUNCTIONS_FOR_OPTIMIZATION_HPP +#define TEST_FUNCTIONS_FOR_OPTIMIZATION_HPP +#include + +// Taken from: https://en.wikipedia.org/wiki/Test_functions_for_optimization +template Real ackley(std::array const &v) { + using std::sqrt; + using std::cos; + using std::exp; + using boost::math::constants::two_pi; + using boost::math::constants::e; + Real x = v[0]; + Real y = v[1]; + Real arg1 = -sqrt((x * x + y * y) / 2) / 5; + Real arg2 = cos(two_pi() * x) + cos(two_pi() * y); + return -20 * exp(arg1) - exp(arg2 / 2) + 20 + e(); +} + +template auto rosenbrock_saddle(std::array const &v) { + auto x = v[0]; + auto y = v[1]; + return 100 * (x * x - y) * (x * x - y) + (1 - x) * (1 - x); +} + + +template Real rastrigin(std::vector const &v) { + using std::cos; + using boost::math::constants::two_pi; + Real A = 10; + Real y = 10 * v.size(); + for (auto x : v) { + y += x * x - A * cos(two_pi() * x); + } + return y; +} + +// Useful for testing return-type != scalar argument type, +// and robustness to NaNs: +double sphere(std::vector const &v) { + double r = 0.0; + for (auto x : v) { + double x_ = static_cast(x); + r += x_ * x_; + } + if (r >= 1) { + return std::numeric_limits::quiet_NaN(); + } + return r; +} + +template +Real three_hump_camel(std::array const & v) { + Real x = v[0]; + Real y = v[1]; + auto xsq = x*x; + return 2*xsq - (1 + Real(1)/Real(20))*xsq*xsq + xsq*xsq*xsq/6 + x*y + y*y; +} + +// Minima occurs at (3, 1/2) with value 0: +template +Real beale(std::array const & v) { + Real x = v[0]; + Real y = v[1]; + Real t1 = Real(3)/Real(2) -x + x*y; + Real t2 = Real(9)/Real(4) -x + x*y*y; + Real t3 = Real(21)/Real(8) -x + x*y*y*y; + return t1*t1 + t2*t2 + t3*t3; +} + + +#endif