Skip to content

Commit

Permalink
Add filtered controller logs when timeout during installation.
Browse files Browse the repository at this point in the history
During CI runs we got timeout failures without any root cause for failures.
The default timeout is 3600 seconds.

We are going to increase total masters and operators installed and probably
May hit on timeout and the current timers are not enough.

Added support for filtering last messages from assisted controller log when timeout occured.
In case controller logs exists on timeout we extract last error/ warning and report them
as part of the raise exception

We expect the see additional info when bubbled up timeout exception and get more details
from the raised exception
.
  • Loading branch information
bkopilov committed Dec 4, 2024
1 parent 4e9d8f3 commit e7c61ba
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 10 deletions.
31 changes: 22 additions & 9 deletions src/assisted_test_infra/test_infra/helper_classes/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,18 @@
from collections import Counter
from typing import List, Optional, Set

import waiting
import yaml
from assisted_service_client import models
from assisted_service_client.models.operator_type import OperatorType

import consts

from junit_report import JunitTestCase
from netaddr import IPAddress, IPNetwork

import consts
import waiting

import yaml

from assisted_test_infra.test_infra import BaseClusterConfig, BaseInfraEnvConfig, ClusterName, exceptions, utils
from assisted_test_infra.test_infra.controllers.load_balancer_controller import LoadBalancerController
from assisted_test_infra.test_infra.helper_classes.base_cluster import BaseCluster
Expand All @@ -22,6 +26,7 @@
from assisted_test_infra.test_infra.helper_classes.nodes import Nodes
from assisted_test_infra.test_infra.tools import terraform_utils
from assisted_test_infra.test_infra.utils import logs_utils, network_utils, operators_utils
from assisted_test_infra.test_infra.utils.logs_utils import filter_controllers_logs
from assisted_test_infra.test_infra.utils.waiting import (
wait_till_all_hosts_are_in_status,
wait_till_all_hosts_use_agent_image,
Expand Down Expand Up @@ -689,12 +694,20 @@ def start_install_and_wait_for_installed(
fall_on_pending_status=False,
):
self.start_install()
if wait_for_hosts:
self.wait_for_hosts_to_install(fall_on_pending_status=fall_on_pending_status)
if wait_for_operators:
self.wait_for_operators_to_finish()
if wait_for_cluster_install:
self.wait_for_install()
try:
if wait_for_hosts:
self.wait_for_hosts_to_install(fall_on_pending_status=fall_on_pending_status)
if wait_for_operators:
self.wait_for_operators_to_finish()
if wait_for_cluster_install:
self.wait_for_install()
except Exception as e:
filtered_messages = filter_controllers_logs(self.api_client.client, self.id,
filters=["level=warning", "level=error"])
# In case of error or timeout we will add last filter_controllers_logs with more info
e.filter_message = str(filtered_messages)
raise Exception(e.__class__.__name__, e.__dict__)

if download_kubeconfig:
self.download_kubeconfig()

Expand Down
54 changes: 53 additions & 1 deletion src/assisted_test_infra/test_infra/utils/logs_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,22 @@
import copy
import hashlib
import os
import random
import shutil
import tarfile
import time
from pathlib import Path
from string import ascii_lowercase
from tempfile import TemporaryDirectory

import waiting
from assisted_service_client.rest import ApiException

from consts import NUMBER_OF_MASTERS

from service_client import log

import waiting

OC_DOWNLOAD_LOGS_INTERVAL = 5 * 60
OC_DOWNLOAD_LOGS_TIMEOUT = 60 * 60
MAX_LOG_SIZE_BYTES = 1_000_000
Expand Down Expand Up @@ -342,3 +347,50 @@ def _are_logs_in_status(client, cluster_id, statuses, check_host_logs_only=False
except BaseException:
log.exception("Failed to get cluster %s log info", cluster_id)
return False


def filter_controllers_logs(client, cluster_id, filters, last_messages=10):
"""Get controller log file when possible when master's in finalized stage and filter messages.
Used to detect error or warning when timeout expires , will be added to the exception last known errors/warning
:param client:
:param cluster_id:
:param filters: list of string to match from assisted
:param last_messages:
:return:
"""
installer_controller = 'assisted-installer-controller.logs'
tmp_file = f"/tmp/{''.join(random.choice(ascii_lowercase) for _ in range(10))}.tar.gz"
filtered_content = []
try:
log.info(f"Collecting controllers info from {installer_controller}")
controller_data = client.v2_download_cluster_logs(cluster_id, logs_type="controller",
_preload_content=False).data
with open(tmp_file, "wb") as _file:
log.info(f"Write controllers content to file {tmp_file}")
_file.write(controller_data)

with tarfile.open(tmp_file, 'r') as tar:
log.info(f"Extract {tmp_file} and create object content")
content = tar.extractfile(installer_controller).readlines()
reversed_iterator = reversed(content)
content_lines = [(next(reversed_iterator).decode("utf-8")) for _ in range(last_messages)]

# filtered content - check if one of the filters in a line
log.info(f"Searching filters in object content {content_lines}")
for line in content_lines:
for f in filters:
if f in line:
filtered_content.append(line)

except ApiException as api_error:
log.info("Failed to get controller logs %s", api_error)

except Exception as e:
log.info("Error during controller log filtering %s", str(e))
finally:
try:
os.remove(tmp_file)
except FileNotFoundError:
log.info("Unable to remove tmp file %s", tmp_file)
return filtered_content

0 comments on commit e7c61ba

Please sign in to comment.