From 190fd0b9a62b5039e6ff51e6d3c93155c48df9d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Mat=C4=9Bj=C4=8Dek?= Date: Sat, 28 Dec 2024 19:34:26 +0100 Subject: [PATCH] More robust waiting for the process death MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Reverted usage of ProcessHandle.onExit.get as it doesn't work in containers which don't have strict reaper. The zombie project is considered as alive and get then hangs forever. - Added waitpid, however it is not installed everywhere - if it is missing, we sleep for 1 second instead. That should be enough so the operating system could do the cleanup. Signed-off-by: David Matějček --- .../universal/process/ProcessUtils.java | 47 +++++++++---------- .../enterprise/v3/admin/RestartServer.java | 2 +- .../enterprise/v3/admin/StartServerHook.java | 18 ++++--- 3 files changed, 30 insertions(+), 37 deletions(-) diff --git a/nucleus/common/common-util/src/main/java/com/sun/enterprise/universal/process/ProcessUtils.java b/nucleus/common/common-util/src/main/java/com/sun/enterprise/universal/process/ProcessUtils.java index 26b40498e8e..f938e934110 100644 --- a/nucleus/common/common-util/src/main/java/com/sun/enterprise/universal/process/ProcessUtils.java +++ b/nucleus/common/common-util/src/main/java/com/sun/enterprise/universal/process/ProcessUtils.java @@ -34,9 +34,6 @@ import java.time.Duration; import java.time.Instant; import java.util.Optional; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import java.util.function.Supplier; import static com.sun.enterprise.util.StringUtils.ok; @@ -76,23 +73,7 @@ private ProcessUtils() { * @return true if the handle was not found or exited before timeout. False otherwise. */ public static boolean waitWhileIsAlive(final long pid, Duration timeout, boolean printDots) { - Optional handle = ProcessHandle.of(pid); - if (handle.isEmpty()) { - return true; - } - final DotPrinter dotPrinter = DotPrinter.startWaiting(printDots); - try { - handle.get().onExit().get(timeout.toSeconds(), TimeUnit.SECONDS); - return true; - } catch (TimeoutException e) { - LOG.log(TRACE, "Timeout while waiting for exit of process with id " + pid + ". Returning false.", e); - return false; - } catch (InterruptedException | ExecutionException e) { - LOG.log(TRACE, "Exception while waiting for exit of process with id " + pid + ". Returning true.", e); - return true; - } finally { - DotPrinter.stopWaiting(dotPrinter); - } + return waitFor(() -> !isAlive(pid), timeout, printDots); } @@ -115,13 +96,28 @@ public static boolean isAlive(final File pidFile) { */ public static boolean isAlive(final long pid) { Optional handle = ProcessHandle.of(pid); - if (handle.isEmpty()) { - return false; - } - if (!handle.get().isAlive()) { + return handle.isPresent() ? isAlive(handle.get()) : false; + } + + + /** + * The {@link Process#isAlive()} returns true even for zombies so we implemented + * this method which considers zombies as dead. + * + * @param process + * @return true if the process with is alive. + */ + public static boolean isAlive(final ProcessHandle process) { + if (!process.isAlive()) { return false; } - Info info = handle.get().info(); + // This is a trick to avoid zombies on some systems (ie containers on Jenkins) + // Operating system there does the cleanup much later, so we can still access + // zombies to process their output despite for us would be better we would + // not see them any more. + // The ProcessHandle.onExit blocks forever for zombies in docker containers + // without proper process reaper. + final Info info = process.info(); if (info.commandLine().isEmpty() && !(OS.isWindowsForSure() && info.command().isPresent())) { LOG.log(TRACE, "Could not retrieve command line for the pid {0}," + " therefore we assume that the process stopped."); @@ -130,7 +126,6 @@ public static boolean isAlive(final long pid) { return true; } - /** * Blocks until the endpoint closes the connection or timeout comes first. * diff --git a/nucleus/core/kernel/src/main/java/com/sun/enterprise/v3/admin/RestartServer.java b/nucleus/core/kernel/src/main/java/com/sun/enterprise/v3/admin/RestartServer.java index 6214fe54d4f..a0c663715ce 100644 --- a/nucleus/core/kernel/src/main/java/com/sun/enterprise/v3/admin/RestartServer.java +++ b/nucleus/core/kernel/src/main/java/com/sun/enterprise/v3/admin/RestartServer.java @@ -104,7 +104,6 @@ protected final void doExecute(AdminCommandContext context) { } scheduleReincarnation(); } - // else we just return a special int from System.exit() gfKernel.stop(); } catch (Exception e) { throw new Error(strings.get("restart.server.failure"), e); @@ -116,6 +115,7 @@ protected final void doExecute(AdminCommandContext context) { } else { restartType = debug ? RESTART_DEBUG_ON : RESTART_DEBUG_OFF; } + // return a special int from System.exit() System.exit(restartType); } diff --git a/nucleus/core/kernel/src/main/java/com/sun/enterprise/v3/admin/StartServerHook.java b/nucleus/core/kernel/src/main/java/com/sun/enterprise/v3/admin/StartServerHook.java index 43ae119d848..34f9a5c73fc 100644 --- a/nucleus/core/kernel/src/main/java/com/sun/enterprise/v3/admin/StartServerHook.java +++ b/nucleus/core/kernel/src/main/java/com/sun/enterprise/v3/admin/StartServerHook.java @@ -188,7 +188,7 @@ private static PrintStream getLogFileOld(Instant startTime) { } - private static ProcessBuilder prepareStartup(Instant now, String classpath, String[] sysprops, String classname, + private static ProcessBuilder prepareStartup(Instant startTime, String classpath, String[] sysprops, String classname, String[] args) { final Path javaExecutable = detectJavaExecutable(); final List cmdline = new ArrayList<>(); @@ -217,21 +217,19 @@ private static ProcessBuilder prepareStartup(Instant now, String classpath, Stri // To avoid conflict of the debug port used both by old and new JVM, // we will force waiting for the end of the old JVM. outerCommand = new ArrayList<>(); + outerCommand.add("nohup"); outerCommand.add("bash"); outerCommand.add("-c"); - outerCommand.add("tail --pid=" + ProcessHandle.current().pid() + " -f /dev/null && " - + cmdline.stream().collect(Collectors.joining(" "))); + // waitpid is not everywhere. + outerCommand.add("(waitpid -e " + ProcessHandle.current().pid() + " || sleep 1) && '" + + cmdline.stream().collect(Collectors.joining("' '")) + + (LOG_RESTART ? "' > '" + LOGDIR.resolve("restart-" + startTime + "-new.log'") : "'")); } final ProcessBuilder builder = new ProcessBuilder(outerCommand); builder.directory(new File(System.getProperty("user.dir"))); - if (LOG_RESTART) { - builder.redirectErrorStream(true); - builder.redirectOutput(LOGDIR.resolve("restart-" + now + "-new.log").toFile()); - } else { - builder.redirectError(Redirect.DISCARD); - builder.redirectOutput(Redirect.DISCARD); - } + builder.redirectError(Redirect.DISCARD); + builder.redirectOutput(Redirect.DISCARD); return builder; }