Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CA-404062: Wrongly restart xapi when receiving HTTP errors #6205

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 54 additions & 58 deletions ocaml/database/master_connection.ml
Original file line number Diff line number Diff line change
@@ -214,6 +214,55 @@ let do_db_xml_rpc_persistent_with_reopen ~host ~path (req : string) :
else if !backoff_delay > 256.0 then
backoff_delay := 256.0
in
let reconnect () =
(* RPC failed - there's no way we can recover from this so try reopening connection every 2s + backoff delay *)
( match !my_connection with
| None ->
()
| Some st_proc -> (
my_connection := None ;
(* don't want to try closing multiple times *)
try Stunnel.disconnect st_proc with _ -> ()
)
) ;
let time_sofar = Unix.gettimeofday () -. time_call_started in
if !connection_timeout < 0. then (
if not !surpress_no_timeout_logs then (
debug
"Connection to master died. I will continue to retry indefinitely \
(supressing future logging of this message)." ;
error
"Connection to master died. I will continue to retry indefinitely \
(supressing future logging of this message)."
) ;
surpress_no_timeout_logs := true
) else
debug
"Connection to master died: time taken so far in this call '%f'; will \
%s"
time_sofar
( if !connection_timeout < 0. then
"never timeout"
else
Printf.sprintf "timeout after '%f'" !connection_timeout
) ;
if time_sofar > !connection_timeout && !connection_timeout >= 0. then
if !restart_on_connection_timeout then (
debug "Exceeded timeout for retrying master connection: restarting xapi" ;
!Db_globs.restart_fn ()
) else (
debug
"Exceeded timeout for retrying master connection: raising \
Cannot_connect_to_master" ;
raise Cannot_connect_to_master
) ;
debug "Sleeping %f seconds before retrying master connection..."
!backoff_delay ;
Thread.delay !backoff_delay ;
update_backoff_delay () ;
try open_secure_connection () with _ -> ()
(* oh well, maybe nextime... *)
in
while not !write_ok do
try
let req_string = req in
@@ -259,66 +308,13 @@ let do_db_xml_rpc_persistent_with_reopen ~host ~path (req : string) :
Db_globs.http_limit_max_rpc_size ;
debug "Re-raising exception to caller." ;
raise Http_svr.Client_requested_size_over_limit
(* TODO: This http exception handler caused CA-36936 and can probably be removed now that there's backoff delay in the generic handler _ below *)
| Http_client.Http_error (http_code, err_msg) ->
error
"Received HTTP error %s (%s) from master. This suggests our master \
address is wrong. Sleeping for %.0fs and then executing restart_fn."
http_code err_msg
!Db_globs.permanent_master_failure_retry_interval ;
Thread.delay !Db_globs.permanent_master_failure_retry_interval ;
!Db_globs.restart_fn ()
| e -> (
error "Received HTTP error %s (%s) from the coordinator." http_code
err_msg ;
reconnect ()
| e ->
error "Caught %s" (Printexc.to_string e) ;
(* RPC failed - there's no way we can recover from this so try reopening connection every 2s + backoff delay *)
( match !my_connection with
| None ->
()
| Some st_proc -> (
my_connection := None ;
(* don't want to try closing multiple times *)
try Stunnel.disconnect st_proc with _ -> ()
)
) ;
let time_sofar = Unix.gettimeofday () -. time_call_started in
if !connection_timeout < 0. then (
if not !surpress_no_timeout_logs then (
debug
"Connection to master died. I will continue to retry \
indefinitely (supressing future logging of this message)." ;
error
"Connection to master died. I will continue to retry \
indefinitely (supressing future logging of this message)."
) ;
surpress_no_timeout_logs := true
) else
debug
"Connection to master died: time taken so far in this call '%f'; \
will %s"
time_sofar
( if !connection_timeout < 0. then
"never timeout"
else
Printf.sprintf "timeout after '%f'" !connection_timeout
) ;
if time_sofar > !connection_timeout && !connection_timeout >= 0. then
if !restart_on_connection_timeout then (
debug
"Exceeded timeout for retrying master connection: restarting xapi" ;
!Db_globs.restart_fn ()
) else (
debug
"Exceeded timeout for retrying master connection: raising \
Cannot_connect_to_master" ;
raise Cannot_connect_to_master
) ;
debug "Sleeping %f seconds before retrying master connection..."
!backoff_delay ;
Thread.delay !backoff_delay ;
update_backoff_delay () ;
try open_secure_connection () with _ -> ()
(* oh well, maybe nextime... *)
)
reconnect ()
done ;
!result