From 38c3eb57de4772ec8a05332df2f98436f2a2b211 Mon Sep 17 00:00:00 2001 From: Rob Hoes Date: Thu, 23 Jan 2025 17:58:24 +0000 Subject: [PATCH 1/3] CA-403759: Initialise licensing after no-other-masters check When the coordinator restarts, the no-other-masters check in the startup sequence does two things for each pool member: 1. It checks that the host agrees that it is not the coordinator. 2. It unblocks the host's master_connection thread, which is likely waiting for a reconnection delay to expire, which may be up to 256 seconds (exponential backoff is used). The delay is interrupted to immediately unblock DB calls. Licensing initialisation comes earlier in the startup sequence, but under certain circumstances makes calls to other hosts, in particular after an upgrade. At this time, hosts may still be blocked on the master_connection for up to 256 s, which adds an unnecessary delay to the coordinator's startup sequence and therefore to the usability of the API. Address this by reversing the order of the two startup actions. 
Signed-off-by: Rob Hoes --- ocaml/xapi/xapi.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xapi/xapi.ml b/ocaml/xapi/xapi.ml index fd5c0650266..f4e9a6f0f68 100644 --- a/ocaml/xapi/xapi.ml +++ b/ocaml/xapi/xapi.ml @@ -1218,7 +1218,6 @@ let server_init () = , [] , Monitor_master.update_configuration_from_master ) - ; ("Initialising licensing", [], handle_licensing) ; ( "message_hook_thread" , [Startup.NoExnRaising] , Xapi_message.start_message_hook_thread ~__context @@ -1252,6 +1251,7 @@ let server_init () = , [Startup.OnlyMaster] , check_no_other_masters ) + ; ("Initialising licensing", [], handle_licensing) ; ( "Registering periodic functions" , [] , fun () -> Xapi_periodic_scheduler_init.register ~__context From f78df2ec41c2486367ffc53edba705d300bdb26b Mon Sep 17 00:00:00 2001 From: Rob Hoes Date: Wed, 22 Jan 2025 18:20:26 +0000 Subject: [PATCH 2/3] master_connection: remove unreachable case Signed-off-by: Rob Hoes --- ocaml/database/master_connection.ml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ocaml/database/master_connection.ml b/ocaml/database/master_connection.ml index ed9bfbd2826..8a06e5cb66a 100644 --- a/ocaml/database/master_connection.ml +++ b/ocaml/database/master_connection.ml @@ -248,11 +248,7 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) : "Connection to master died: time taken so far in this call '%f'; will \ %s" time_sofar - ( if !connection_timeout < 0. then - "never timeout" - else - Printf.sprintf "timeout after '%f'" !connection_timeout - ) ; + (Printf.sprintf "timeout after '%f'" !connection_timeout) ; if time_sofar > !connection_timeout && !connection_timeout >= 0. 
then if !restart_on_connection_timeout then ( debug "Exceeded timeout for retrying master connection: restarting xapi" ; From f346848b12b9ae8a40829993c4b8fabca00384b2 Mon Sep 17 00:00:00 2001 From: Rob Hoes Date: Wed, 22 Jan 2025 18:26:29 +0000 Subject: [PATCH 3/3] master_connection: logging once is enough Signed-off-by: Rob Hoes --- ocaml/database/master_connection.ml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ocaml/database/master_connection.ml b/ocaml/database/master_connection.ml index 8a06e5cb66a..d7faff1cd62 100644 --- a/ocaml/database/master_connection.ml +++ b/ocaml/database/master_connection.ml @@ -235,14 +235,11 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) : let time_sofar = Unix.gettimeofday () -. time_call_started in if !connection_timeout < 0. then ( if not !surpress_no_timeout_logs then ( - debug - "Connection to master died. I will continue to retry indefinitely \ - (supressing future logging of this message)." ; error "Connection to master died. I will continue to retry indefinitely \ - (supressing future logging of this message)." - ) ; - surpress_no_timeout_logs := true + (supressing future logging of this message)." ; + surpress_no_timeout_logs := true + ) ) else debug "Connection to master died: time taken so far in this call '%f'; will \