Skip to content

Commit

Permalink
CP-53335, topology: do not raise exception when loading invalid dista…
Browse files Browse the repository at this point in the history
…nce matrices (NUMA) (#6249)

Unreachable nodes do not contain any CPUs, and therefore VCPUs cannot be
scheduled on them. They marked with a value of (2ˆ32) - 1. Instead of
failing
to produce a NUMA object that allows for scheduling, create an object
that
contains only schedulable NUMA nodes. This means changing how the
datastructures node_cpus and candidates are created to ignore the
unreachable
ones.

Fixes two minor potential issues (distances being NaN, and adding
duplicates to candidates, which adds to running time); and separates the
xenopsd unit tests into 3 binaries for ease of testing.

Fixes #6240
  • Loading branch information
psafont authored Feb 3, 2025
2 parents b81ff9d + 9badc14 commit 3234be2
Show file tree
Hide file tree
Showing 7 changed files with 297 additions and 151 deletions.
189 changes: 118 additions & 71 deletions ocaml/xenopsd/lib/topology.ml
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,8 @@

module D = Debug.Make (struct let name = "topology" end)

open D

module CPUSet = struct
include Set.Make (struct
type t = int

let compare (x : int) (y : int) = compare x y
end)
include Set.Make (Int)

let pp_dump = Fmt.using to_seq Fmt.(Dump.seq int)

Expand Down Expand Up @@ -95,27 +89,32 @@ let seq_range a b =
let rec next i () = if i = b then Seq.Nil else Seq.Cons (i, next (i + 1)) in
next a

(** [gen_2n n] Generates all non-empty subsets of the set of [n] nodes. *)
let seq_gen_2n n =
let seq_filteri p s =
let rec loop i s () =
match s () with
| Seq.Nil ->
Seq.Nil
| Cons (hd, s) ->
if p i hd then
Cons (hd, loop (i + 1) s)
else
loop (i + 1) s ()
in
loop 0 s

(** [seq_all_subsets n] Generates all non-empty subsets of the [nodes] set. *)
let seq_all_subsets nodes =
let n = Seq.length nodes in
(* A node can either be present in the output or not, so use a loop [1, 2^n)
and have the [i]th bit determine that. *)
let of_mask i =
seq_range 0 n |> Seq.filter (fun bit -> (i lsr bit) land 1 = 1)
in
let of_mask i = nodes |> seq_filteri (fun bit _ -> (i lsr bit) land 1 = 1) in
seq_range 1 (1 lsl n) |> Seq.map of_mask

(** [seq_sort ~cmp s] sorts [s] in a temporary place using [cmp]. *)
let seq_sort ~cmp s =
let a = Array.of_seq s in
Array.fast_sort cmp a ; Array.to_seq a

(** [seq_append a b] is the sequence [a] followed by [b] *)
let seq_append (a : 'a Seq.t) (b : 'a Seq.t) =
let rec next v () =
match v () with Seq.Nil -> b () | Seq.Cons (x, xs) -> Seq.Cons (x, next xs)
in
next a

module NUMA = struct
type node = Node of int

Expand All @@ -125,31 +124,43 @@ module NUMA = struct
let compare (Node a) (Node b) = compare a b
end)

(* -1 in 32 bits *)
let unreachable_distance = 0xFFFFFFFF

let self_distance = 10

(* no mutation is exposed in the interface, therefore this is immutable *)
type t = {
distances: int array array
; cpu_to_node: node array
; node_cpus: CPUSet.t array
; all: CPUSet.t
; node_usage: int array
(** Usage across nodes is meant to be balanced when choosing candidates for a VM *)
; candidates: (float * node Seq.t) Seq.t
(** Sequence of all subsets of nodes and the average distance within
the subset, sorted by the latter in increasing order. *)
}

let node_of_int i = Node i

let node_distances d nodes =
let dists =
nodes |> Seq.flat_map (fun n1 -> nodes |> Seq.map (fun n2 -> d.(n1).(n2)))
in
let count, max_dist, sum_dist =
Seq.fold_left
(fun (count, maxv, sum) e -> (count + 1, max maxv e, sum + e))
(0, min_int, 0) dists
in
(* We want to minimize maximum distance first, and average distance next.
When running the VM we don't know which pCPU it'll end up using, and want
to limit the worst case performance. *)
((max_dist, float sum_dist /. float count), nodes)
if Seq.is_empty nodes then
None
else
let dists =
nodes
|> Seq.flat_map (fun n1 -> nodes |> Seq.map (fun n2 -> d.(n1).(n2)))
in
let count, max_dist, sum_dist =
Seq.fold_left
(fun (count, maxv, sum) e -> (count + 1, max maxv e, sum + e))
(0, min_int, 0) dists
in
(* We want to minimize maximum distance first, and average distance next.
When running the VM we don't know which pCPU it'll end up using, and want
to limit the worst case performance. *)
Some ((max_dist, float sum_dist /. float count), nodes)

let dist_cmp (a1, _) (b1, _) = compare a1 b1

Expand All @@ -159,60 +170,96 @@ module NUMA = struct
[n*multiply ... n*multiply + multiply-1], except we always the add the
single NUMA node combinations. *)
(* make sure that single NUMA nodes are always present in the combinations *)
let single_nodes =
let distance_to_candidate d = (d, float d) in
let valid_nodes =
seq_range 0 (Array.length d)
|> Seq.map (fun i -> ((10, 10.0), Seq.return i))
|> Seq.filter_map (fun i ->
let self = d.(i).(i) in
if self <> unreachable_distance then
Some i
else
None
)
in
let numa_nodes = Array.length d in
let numa_nodes = Seq.length valid_nodes in
let nodes =
if numa_nodes > 16 then
(* try just the single nodes, and give up (use all nodes otherwise) to
avoid exponential running time. We could do better here, e.g. by
if numa_nodes > 16 then (
(* Avoid generating too many candidates because of the exponential
running time. We could do better here, e.g. by
reducing the matrix *)
single_nodes
else
numa_nodes
|> seq_gen_2n
|> Seq.map (node_distances d)
|> seq_append single_nodes
D.info
"%s: More than 16 valid NUMA nodes detected, considering only \
individual nodes."
__FUNCTION__ ;
valid_nodes
|> Seq.map (fun i ->
let self = d.(i).(i) in
(distance_to_candidate self, Seq.return i)
)
) else
valid_nodes |> seq_all_subsets |> Seq.filter_map (node_distances d)
in
nodes
|> seq_sort ~cmp:dist_cmp
|> Seq.map (fun ((_, avg), nodes) -> (avg, Seq.map (fun n -> Node n) nodes))

let pp_dump_distances = Fmt.(int |> Dump.array |> Dump.array)

let make ~distances ~cpu_to_node =
debug "Distances: %s" (Fmt.to_to_string pp_dump_distances distances) ;
debug "CPU2Node: %s" (Fmt.to_to_string Fmt.(Dump.array int) cpu_to_node) ;
let ( let* ) = Option.bind in
let node_cpus = Array.map (fun _ -> CPUSet.empty) distances in

(* nothing can be scheduled on unreachable nodes, remove them from the
node_cpus *)
Array.iteri
(fun i node -> node_cpus.(node) <- CPUSet.add i node_cpus.(node))
cpu_to_node ;
Array.iteri
(fun i row ->
let d = distances.(i).(i) in
if d <> 10 then
invalid_arg
(Printf.sprintf "NUMA distance from node to itself must be 10: %d" d) ;
Array.iteri
(fun _ d ->
if d < 10 then
invalid_arg (Printf.sprintf "NUMA distance must be >= 10: %d" d)
)
row
(fun i node ->
let self = distances.(node).(node) in
if self <> unreachable_distance then
node_cpus.(node) <- CPUSet.add i node_cpus.(node)
)
distances ;
let all = Array.fold_left CPUSet.union CPUSet.empty node_cpus in
let candidates = gen_candidates distances in
{
cpu_to_node ;

let* () =
if Array.for_all (fun cpus -> CPUSet.is_empty cpus) node_cpus then (
D.info
"Not enabling NUMA: the ACPI SLIT only contains unreachable nodes." ;
None
) else
Some ()
in

let numa_matrix_is_reasonable =
distances
; cpu_to_node= Array.map node_of_int cpu_to_node
; node_cpus
; all
; node_usage= Array.map (fun _ -> 0) distances
; candidates
}
|> Array.to_seqi
|> Seq.for_all (fun (i, row) ->
let d = distances.(i).(i) in
(d = unreachable_distance || d = self_distance)
&& Array.for_all
(fun d -> d >= self_distance || d = unreachable_distance)
row
)
in

let* () =
if not numa_matrix_is_reasonable then (
D.info
"Not enabling NUMA: the ACPI SLIT table contains values that are \
invalid." ;
None
) else
Some ()
in

let candidates = gen_candidates distances in

let all = Array.fold_left CPUSet.union CPUSet.empty node_cpus in
Some
{
distances
; cpu_to_node= Array.map node_of_int cpu_to_node
; node_cpus
; all
; node_usage= Array.map (fun _ -> 0) distances
; candidates
}

let distance t (Node a) (Node b) = t.distances.(a).(b)

Expand Down
4 changes: 1 addition & 3 deletions ocaml/xenopsd/lib/topology.mli
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ module NUMA : sig
(** A NUMA node index. Distinct from an int to avoid mixing with CPU numbers *)
type node = private Node of int

val make : distances:int array array -> cpu_to_node:int array -> t
val make : distances:int array array -> cpu_to_node:int array -> t option
(** [make distances cpu_to_node] stores the topology. [distances] is a square
matrix [d] where [d.(i).(j)] is an approximation to how much slower it is
to access memory from node [j] when running on node [i]. Distances are
Expand All @@ -97,8 +97,6 @@ module NUMA : sig
NUMA nodes without any CPUs are accepted (to handle hard affinities).
Raises [invalid_arg] if above constraints are not met
A typical matrix might look like this:
10 21
Expand Down
20 changes: 15 additions & 5 deletions ocaml/xenopsd/test/dune
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
(test
(name test)
(modes exe)
(modules :standard \ test_cpuid test_topology)
(package xapi-tools)
(libraries
alcotest
cpuid

fmt
result
rpclib.core
Expand All @@ -15,15 +13,27 @@
xapi-idl.xen.interface.types
xapi-log
xapi-stdext-pervasives
xapi-test-utils
xapi_xenopsd
xenstore_transport.unix
)
(preprocess
(per_module ((pps ppx_deriving_rpc) Test))
)
)

(test
(name test_cpuid)
(modules test_cpuid)
(package xapi-tools)
(libraries alcotest cpuid xapi-test-utils xapi_xenopsd)
)

(test
(name test_topology)
(modules test_topology)
(package xapi-tools)
(libraries alcotest fmt xapi-log xapi_xenopsd)
)

(rule
(alias runtest)
(package xapi-tools)
Expand Down
2 changes: 1 addition & 1 deletion ocaml/xenopsd/test/test.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1032,4 +1032,4 @@ let _ =
)
in
Debug.log_to_stdout () ;
Alcotest.run "xenops test" ([suite; Test_topology.suite] @ Test_cpuid.tests)
Alcotest.run "xenops test" [suite]
2 changes: 2 additions & 0 deletions ocaml/xenopsd/test/test_cpuid.ml
Original file line number Diff line number Diff line change
Expand Up @@ -327,3 +327,5 @@ let tests =
; ("equality", Equality.tests)
; ("comparisons", Comparisons.tests)
]

let () = Alcotest.run "CPUID" tests
Loading

0 comments on commit 3234be2

Please sign in to comment.