From 0ef023ba3731f456bbce1da46a7afdb355fef6b9 Mon Sep 17 00:00:00 2001 From: Benny Kopilov Date: Tue, 3 Dec 2024 21:44:39 +0200 Subject: [PATCH] Add filtered controller logs when timeout during installation. During CI runs we got timeout failures without any root cause for failures. The default timeout is 3600 seconds. We are going to increase total masters and operators installed and probably May hit on timeout and the current timers are not enough. Added support for filtering last messages from assisted controller log when timeout occured. In case controller logs exists on timeout we extract last error/ warning and report them as part of the raise exception We expect the see additional info when bubbled up timeout exception and get more details from the raised exception . --- .../test_infra/helper_classes/cluster.py | 31 +++++++---- .../test_infra/utils/logs_utils.py | 54 ++++++++++++++++++- 2 files changed, 75 insertions(+), 10 deletions(-) diff --git a/src/assisted_test_infra/test_infra/helper_classes/cluster.py b/src/assisted_test_infra/test_infra/helper_classes/cluster.py index 990c6bba14c..1a0f2f4a85a 100644 --- a/src/assisted_test_infra/test_infra/helper_classes/cluster.py +++ b/src/assisted_test_infra/test_infra/helper_classes/cluster.py @@ -6,14 +6,18 @@ from collections import Counter from typing import List, Optional, Set -import waiting -import yaml from assisted_service_client import models from assisted_service_client.models.operator_type import OperatorType + +import consts + from junit_report import JunitTestCase from netaddr import IPAddress, IPNetwork -import consts +import waiting + +import yaml + from assisted_test_infra.test_infra import BaseClusterConfig, BaseInfraEnvConfig, ClusterName, exceptions, utils from assisted_test_infra.test_infra.controllers.load_balancer_controller import LoadBalancerController from assisted_test_infra.test_infra.helper_classes.base_cluster import BaseCluster @@ -22,6 +26,7 @@ from assisted_test_infra.test_infra.helper_classes.nodes import Nodes from assisted_test_infra.test_infra.tools import terraform_utils from assisted_test_infra.test_infra.utils import logs_utils, network_utils, operators_utils +from assisted_test_infra.test_infra.utils.logs_utils import filter_controllers_logs from assisted_test_infra.test_infra.utils.waiting import ( wait_till_all_hosts_are_in_status, wait_till_all_hosts_use_agent_image, @@ -689,12 +694,20 @@ def start_install_and_wait_for_installed( fall_on_pending_status=False, ): self.start_install() - if wait_for_hosts: - self.wait_for_hosts_to_install(fall_on_pending_status=fall_on_pending_status) - if wait_for_operators: - self.wait_for_operators_to_finish() - if wait_for_cluster_install: - self.wait_for_install() + try: + if wait_for_hosts: + self.wait_for_hosts_to_install(fall_on_pending_status=fall_on_pending_status) + if wait_for_operators: + self.wait_for_operators_to_finish() + if wait_for_cluster_install: + self.wait_for_install() + except Exception as e: + filtered_messages = filter_controllers_logs(self.api_client.client, self.id, + filters=["level=warning", "level=error"]) + # In case of error or timeout we will add last filter_controllers_logs with more info + e.filter_message = str(filtered_messages) + raise Exception(e.__class__.__name__, e.__dict__) + if download_kubeconfig: self.download_kubeconfig() diff --git a/src/assisted_test_infra/test_infra/utils/logs_utils.py b/src/assisted_test_infra/test_infra/utils/logs_utils.py index 718e6ef93c3..0363cefcdd3 100644 --- a/src/assisted_test_infra/test_infra/utils/logs_utils.py +++ b/src/assisted_test_infra/test_infra/utils/logs_utils.py @@ -2,17 +2,22 @@ import copy import hashlib import os +import random import shutil import tarfile import time from pathlib import Path +from string import ascii_lowercase from tempfile import TemporaryDirectory -import waiting +from assisted_service_client.rest import ApiException from consts import NUMBER_OF_MASTERS + from service_client import log +import waiting + OC_DOWNLOAD_LOGS_INTERVAL = 5 * 60 OC_DOWNLOAD_LOGS_TIMEOUT = 60 * 60 MAX_LOG_SIZE_BYTES = 1_000_000 @@ -342,3 +347,50 @@ def _are_logs_in_status(client, cluster_id, statuses, check_host_logs_only=False except BaseException: log.exception("Failed to get cluster %s log info", cluster_id) return False + + +def filter_controllers_logs(client, cluster_id, filters, last_messages=5): + """Get controller log file when possible when master's in finalized stage and filter messages. + + Used to detect error or warning when timeout expires , will be added to the exception last known errors/warning + :param client: + :param cluster_id: + :param filters: list of string to match from assisted + :param last_messages: + :return: + """ + installer_controller = 'assisted-installer-controller.logs' + tmp_file = f"/tmp/{''.join(random.choice(ascii_lowercase) for _ in range(10))}.tar.gz" + filtered_content = [] + try: + log.info(f"Collecting controllers info from {installer_controller}") + controller_data = client.v2_download_cluster_logs(cluster_id, logs_type="controller", + _preload_content=False).data + with open(tmp_file, "wb") as _file: + log.info(f"Write controllers content to file {tmp_file}") + _file.write(controller_data) + + with tarfile.open(tmp_file, 'r') as tar: + log.info(f"Extract {tmp_file} and create object content") + content = tar.extractfile(installer_controller).readlines() + reversed_iterator = reversed(content) + content_lines = [(next(reversed_iterator).decode("utf-8")) for _ in range(last_messages)] + + # filtered content - check if one of the filters in a line + log.info(f"Searching filters in object content {content_lines}") + for line in content_lines: + for f in filters: + if f in line: + filtered_content.append(line) + + except ApiException as api_error: + log.info("Failed to get controller logs %s", api_error) + + except Exception as e: + log.info("Error during controller log filtering %s", str(e)) + finally: + try: + os.remove(tmp_file) + except FileNotFoundError: + log.info("Unable to remove tmp file %s", tmp_file) + return filtered_content