diff --git a/lib/runtest.py b/lib/runtest.py index f4367f27..9cc03efe 100644 --- a/lib/runtest.py +++ b/lib/runtest.py @@ -1,12 +1,10 @@ import os import sys -import re import runpy import signal import traceback import tempfile import urllib3 -import yaml from pathlib import Path from lib import util, results @@ -24,27 +22,8 @@ # this is not a problem for Beaker, which captures test output separately and # actually cares about the exit code of the test script def _setup_timeout_handling(): - metadata_yaml = os.environ['TMT_TEST_METADATA'] # exception if undefined - with open(metadata_yaml) as f: - test_metadata = yaml.safe_load(f) - - if 'duration' in test_metadata: - duration_str = test_metadata['duration'] - match = re.fullmatch(r'([0-9]+)([a-z]+)', duration_str) - if not match: - results.report_and_exit('error', note=f"duration '{duration_str}' has invalid format") - length, unit = match.groups() - if unit == 'm': - duration = int(length)*60 - elif unit == 'h': - duration = int(length)*60*60 - elif unit == 'd': - duration = int(length)*60*60*24 - else: - duration = int(length) - else: - # use TMT's default of 5m - duration = 300 + metadata = util.TestMetadata() + duration = metadata.duration_seconds() # leave 10 seconds for our alarm timeout code duration -= 10 diff --git a/lib/util/__init__.py b/lib/util/__init__.py index 0d80abfa..51b40042 100644 --- a/lib/util/__init__.py +++ b/lib/util/__init__.py @@ -6,14 +6,15 @@ # so we could add the libdir to PATH and PYTHONPATH libdir = Path(inspect.getfile(inspect.currentframe())).parent.parent -from .content import * # noqa -from .backup import * # noqa -from .dedent import * # noqa -from .environment import * # noqa -from .httpsrv import * # noqa -from .log import * # noqa -from .old_content import * # noqa -from .rpmpack import * # noqa -from .sanitization import * # noqa -from .ssh import * # noqa -from .subprocess import * # noqa +from .content import * # noqa +from .backup import * # noqa +from .dedent import * 
class TestMetadata(dict):
    """
    Dictionary holding the metadata of the currently running test.

    The contents are loaded from the YAML file named by the
    TMT_TEST_METADATA environment variable; the top-level mapping of that
    file becomes the items of this dict.
    """

    # seconds per 'duration' unit suffix; a bare number counts as seconds
    _UNIT_SECONDS = {'': 1, 's': 1, 'm': 60, 'h': 60*60, 'd': 60*60*24}

    def __init__(self):
        metadata_yaml = os.environ['TMT_TEST_METADATA']  # exception if undefined
        with open(metadata_yaml) as f:
            test_metadata = yaml.safe_load(f)
        # an empty YAML document loads as None, which dict.update() rejects
        self.update(test_metadata or {})

    # return 'TestMetadata' for .copy(), not 'dict'
    def copy(self):
        """Return a shallow copy that keeps the TestMetadata type."""
        return copy_mod.copy(self)

    def duration_seconds(self):
        """
        Return the test 'duration' converted to whole seconds.

        The value is one or more whitespace-separated '<number><unit>'
        components which are summed up ('90s', '5m', '1h 30m'); the unit
        is one of s, m, h, d and may be omitted, meaning seconds.

        When no 'duration' is set, TMT's default of 5 minutes is returned.

        Raises RuntimeError if the value cannot be parsed, including when
        it uses an unknown unit, rather than silently treating it as seconds.
        """
        if 'duration' not in self:
            # use TMT's default of 5m
            return 300

        raw = self['duration']
        # YAML may hand us a plain integer (duration: 300), so parse its
        # string form instead of failing inside the regex
        components = str(raw).split()
        if not components:
            raise RuntimeError(f"'duration' has invalid format: {raw}")

        total = 0
        for component in components:
            match = re.fullmatch(r'([0-9]+)([a-z]*)', component)
            if not match or match.group(2) not in self._UNIT_SECONDS:
                raise RuntimeError(f"'duration' has invalid format: {raw}")
            length, unit = match.groups()
            total += int(length) * self._UNIT_SECONDS[unit]
        return total
-253,12 +255,15 @@ def __init__(self, template=TEMPLATE, packages=PACKAGES, partitions=None): self.ks = template self.appends = [] self.packages = packages - self.partitions = partitions if partitions else [] + self.partitions = partitions def assemble(self): - partitions_block = '\n'.join( - (f'part {mountpoint} --size={size}' for mountpoint, size in self.partitions), - ) + if self.partitions: + partitions_block = '\n'.join( + (f'part {mountpoint} --size={size}' for mountpoint, size in self.partitions), + ) + else: + partitions_block = 'part / --size=1 --grow' appends_block = '\n'.join(self.appends) packages_block = '\n'.join(self.packages) packages_block = f'%packages\n{packages_block}\n%end' @@ -337,7 +342,8 @@ class Guest: Set a 'tag' (string) to a unique name you would like to share across tests that use snapshots - the .can_be_snapshotted() function will return True when it finds an already installed guest using the same tag. - Tag-less guests cannot be shared across tests. + Tag-less guests can be used only for snapshotting within the same test + and should not be shared across tests. 
""" # custom post-install setup to allow smooth login and qemu-qa command execution @@ -353,7 +359,7 @@ class Guest: ] def __init__(self, tag=None, *, name=GUEST_NAME): - self.tag = tag + self.tag = tag or str(uuid.uuid4()) self.name = name self.ipaddr = None self.ssh_keyfile_path = f'{GUEST_IMG_DIR}/{name}.sshkey' @@ -469,8 +475,7 @@ def install(self, location=None, kickstart=None, rpmpack=None, disk_format='raw' # installed system doesn't need as much RAM, alleviate swap pressure set_domain_memory(self.name, 2000) - if self.tag is not None: - self.install_ready_path.write_text(self.tag) + self.install_ready_path.write_text(self.tag) self.orig_disk_path = disk_path self.orig_disk_format = disk_format @@ -553,8 +558,7 @@ def prepare_for_snapshot(self): # modify its built-in XML to point to a snapshot-style disk path set_state_image_disk(self.state_file_path, self.snapshot_path, 'qcow2') - if self.tag is not None: - self.snapshot_ready_path.write_text(self.tag) + self.snapshot_ready_path.write_text(self.tag) def _restore_snapshotted(self): # reused guest from another test, install() or prepare_for_snapshot() @@ -612,9 +616,13 @@ def snapshotted(self): self._destroy_snapshotted() @contextlib.contextmanager - def booted(self): + def booted(self, *, safe_shutdown=False): """ Just boot the guest, ready it for communication. + + With 'safe_shutdown', guarantee that the guest shuts down cleanly. + This is useful for setup-style use cases where the test wants to modify + the guest before taking a snapshot. 
""" self.start() self.ipaddr = wait_for_ifaddr(self.name) @@ -623,20 +631,21 @@ def booted(self): try: yield self finally: - if os.environ.get('CONTEST_LEAVE_GUEST_RUNNING') == '1': - self._log_leave_running_notice() + if safe_shutdown: + util.log(f"shutting down {self.name} (safely)") + self.shutdown() else: - try: - util.log(f"shutting down {self.name}") - self.shutdown() - except TimeoutError: - util.log(f"shutdown timed out, destroying {self.name}") - self.destroy() - - def _do_ssh(self, *cmd, func=util.subprocess_run, capture=False, **run_args): - if capture: - run_args['stdout'] = PIPE - run_args['stderr'] = PIPE + if os.environ.get('CONTEST_LEAVE_GUEST_RUNNING') == '1': + self._log_leave_running_notice() + else: + try: + util.log(f"shutting down {self.name}") + self.shutdown() + except TimeoutError: + util.log(f"shutdown timed out, destroying {self.name}") + self.destroy() + + def _do_ssh(self, *cmd, func=util.subprocess_run, **run_args): ssh_cmdline = [ 'ssh', '-q', '-i', self.ssh_keyfile_path, '-o', 'BatchMode=yes', '-o', 'StrictHostKeyChecking=no', '-o', 'UserKnownHostsFile=/dev/null', diff --git a/scanning/oscap-debug/helgrind.fmf b/scanning/oscap-debug/helgrind.fmf new file mode 100644 index 00000000..5faef8bf --- /dev/null +++ b/scanning/oscap-debug/helgrind.fmf @@ -0,0 +1,5 @@ +summary: Runs oscap via valgrind - helgrind +test: python3 -m lib.runtest ./helgrind.py +duration: 4h +require+: + - valgrind diff --git a/scanning/oscap-debug/helgrind.py b/scanning/oscap-debug/helgrind.py new file mode 100755 index 00000000..ab343918 --- /dev/null +++ b/scanning/oscap-debug/helgrind.py @@ -0,0 +1,26 @@ +#!/usr/bin/python3 + +from lib import util, results + + +profile = 'cis_workstation_l1' + +extra_debuginfos = [ + 'glibc', + 'openscap-scanner', + 'xmlsec1', + 'xmlsec1-openssl', + 'libtool-ltdl', + 'openssl-libs', +] + +util.subprocess_run(['dnf', '-y', 'debuginfo-install', *extra_debuginfos], check=True) + +oscap_cmd = [ + 'valgrind', '--tool=helgrind', 
#!/usr/bin/python3

# Repeatedly run an oscap scan limited to sysctl_* rules, trying to reproduce
# a freeze. If a scan does not finish within 'oscap_timeout', attach gdb to
# it, dump a core file + backtrace of all threads and report them as logs.

import time
import signal
import subprocess

from lib import util, results, oscap


start_time = time.monotonic()

profile = 'anssi_bp28_high'

# sysctl rules only take about 1-2 seconds
oscap_timeout = 10

# unselect all rules in the specified profile, except for
# sysctl_* rules
ds = oscap.global_ds()
rules = ds.profiles[profile].rules
rules = {rule for rule in rules if not rule.startswith('sysctl_')}
oscap.unselect_rules(util.get_datastream(), 'scan-ds.xml', rules)

extra_debuginfos = [
    'glibc',
    'openscap-scanner',
    'xmlsec1',
    'xmlsec1-openssl',
    'libtool-ltdl',
    'openssl-libs',
]

util.subprocess_run(['dnf', '-y', 'debuginfo-install', *extra_debuginfos], check=True)

# gdb commands executed once attached to a frozen oscap process
with open('gdb.script', 'w') as f:
    f.write(util.dedent('''
        generate-core-file oscap.core
        set logging file oscap-bt.txt
        set logging overwrite on
        set logging redirect on
        set logging enabled on
        thread apply all bt
        set logging enabled off
    '''))

oscap_cmd = [
    'oscap', 'xccdf', 'eval', '--profile', profile, '--progress', 'scan-ds.xml',
]

# run for all of the configured test duration, minus 600 seconds for safety
# (running gdb, compressing corefile which takes forever, etc.)
metadata = util.TestMetadata()
duration = metadata.duration_seconds() - oscap_timeout - 600
util.log(f"trying to freeze oscap for {duration} total seconds")

attempt = 0
while time.monotonic() - start_time < duration:
    # count every iteration up front - the 'continue' paths below never reach
    # the end of the loop body, and would otherwise reuse the same
    # 'attempt:N' result name for the next run
    attempt += 1

    oscap_proc = util.subprocess_Popen(oscap_cmd)

    try:
        returncode = oscap_proc.wait(oscap_timeout)
        if returncode not in [0, 2]:
            results.report(
                'fail', f'attempt:{attempt}', f"oscap failed with {returncode}",
            )
            continue

    except subprocess.TimeoutExpired:
        # figure out the PID of the (most recently started) oscap process
        pgrep = util.subprocess_run(
            ['pgrep', '-n', 'oscap'],
            stdout=subprocess.PIPE, universal_newlines=True,
        )
        if pgrep.returncode != 0:
            results.report(
                'warn',
                f'attempt:{attempt}',
                f"pgrep returned {pgrep.returncode}, oscap probably just finished "
                "and we hit a rare race, moving on",
            )
            continue

        oscap_pid = pgrep.stdout.strip()

        # attach gdb to that PID
        util.subprocess_run(
            ['gdb', '-n', '-batch', '-x', 'gdb.script', '-p', oscap_pid],
            check=True,
        )

        util.subprocess_run(['xz', '-e', '-9', 'oscap.core'], check=True)
        results.report(
            'fail', f'attempt:{attempt}', "oscap froze, gdb output available",
            logs=['oscap.core.xz', 'oscap-bt.txt'],
        )
        break

    finally:
        # runs on every path (normal, continue, break, exception), so no
        # oscap process is ever left behind
        oscap_proc.send_signal(signal.SIGKILL)
        oscap_proc.wait()

    results.report('pass', f'attempt:{attempt}')

results.report_and_exit()
#!/usr/bin/python3

# Install a guest VM and repeatedly run an oscap scan inside it, trying to
# reproduce a freeze. If a scan does not finish within 'oscap_timeout',
# attach gdb to it inside the guest, dump a core file + backtrace of all
# threads, download them and report them as logs.

import time
import subprocess
import tempfile

from lib import util, results, virt


profile = 'cis_workstation_l1'

# cis_workstation_l1 takes about 4-5 seconds to scan
oscap_timeout = 30

extra_packages = [
    'gdb',
    #'@Server with GUI',  # uncomment to test with GUI
]
extra_debuginfos = [
    'glibc',
    'openscap-scanner',
    'xmlsec1',
    'xmlsec1-openssl',
    'libtool-ltdl',
    'openssl-libs',
]

start_time = time.monotonic()

virt.Host.setup()
g = virt.Guest()
ks = virt.Kickstart()
# build a new list instead of using '+=' - Kickstart() stores the list it was
# given (by default the one used as its 'packages' default argument), and
# extending that in place would leak our extras into every other Kickstart
ks.packages = [*ks.packages, *extra_packages]
g.install(kickstart=ks)

with g.booted():
    # copy our datastream to the guest
    g.copy_to(util.get_datastream(), 'scan-ds.xml')
    # install debugsource / debuginfo
    g.ssh(' '.join(['dnf', '-y', 'debuginfo-install', *extra_debuginfos]), check=True)
    # prepare gdb script
    with tempfile.NamedTemporaryFile(mode='w+t') as f:
        f.write(util.dedent('''
            generate-core-file oscap.core
            set logging file oscap-bt.txt
            set logging overwrite on
            set logging redirect on
            set logging enabled on
            thread apply all bt
            set logging enabled off
        '''))
        # make sure the contents are on disk before copying the file by name
        f.flush()
        g.copy_to(f.name, 'gdb.script')

    # run for all of the configured test duration, minus 600 seconds for safety
    # (running gdb, compressing corefile which takes forever, etc.)
    metadata = util.TestMetadata()
    duration = metadata.duration_seconds() - oscap_timeout - 600
    util.log(f"trying to freeze oscap for {duration} total seconds")

    oscap_cmd = f'oscap xccdf eval --profile {profile} --progress scan-ds.xml'

    attempt = 0
    while time.monotonic() - start_time < duration:
        # count every iteration up front - the 'continue' paths below never
        # reach the end of the loop body, and would otherwise reuse the same
        # 'attempt:N' result name for the next run
        attempt += 1

        oscap_proc = g.ssh(oscap_cmd, func=util.subprocess_Popen)

        try:
            returncode = oscap_proc.wait(oscap_timeout)
            if returncode not in [0, 2]:
                results.report(
                    'fail', f'attempt:{attempt}', f"oscap failed with {returncode}",
                )
                continue

        except subprocess.TimeoutExpired:
            # figure out oscap PID on the remote system
            pgrep = g.ssh('pgrep -n oscap', stdout=subprocess.PIPE, universal_newlines=True)
            if pgrep.returncode != 0:
                results.report(
                    'warn',
                    f'attempt:{attempt}',
                    f"pgrep returned {pgrep.returncode}, oscap probably just finished "
                    "and we hit a rare race, moving on",
                )
                continue

            oscap_pid = pgrep.stdout.strip()

            # attach gdb to that PID
            g.ssh(f'gdb -n -batch -x gdb.script -p {oscap_pid}', check=True)

            # and download its results
            g.copy_from('oscap.core')
            g.copy_from('oscap-bt.txt')
            util.subprocess_run(['xz', '-e', '-9', 'oscap.core'], check=True)
            results.report(
                'fail', f'attempt:{attempt}', "oscap froze, gdb output available",
                logs=['oscap.core.xz', 'oscap-bt.txt'],
            )
            break

        finally:
            # runs on every path (normal, continue, break, exception), so the
            # local ssh client process is always cleaned up
            oscap_proc.terminate()
            oscap_proc.wait()

        results.report('pass', f'attempt:{attempt}')

results.report_and_exit()