Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Investigate flakiness #436

Closed
wants to merge 59 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
91909d5
Empty commit
javierdelapuente Jan 27, 2025
3acd674
try 2
javierdelapuente Jan 27, 2025
7cc974f
try 3
javierdelapuente Jan 27, 2025
263067f
try 4
javierdelapuente Jan 27, 2025
a13c42d
try 5
javierdelapuente Jan 27, 2025
048446e
put more logs
javierdelapuente Jan 27, 2025
b8b3d34
test
javierdelapuente Jan 28, 2025
44ccf70
another try
javierdelapuente Jan 28, 2025
cdd6cb5
another one
javierdelapuente Jan 28, 2025
853a8a8
put some logs...
javierdelapuente Jan 28, 2025
4aed1a2
let's see if I can get something from op workflows
javierdelapuente Jan 29, 2025
b5d8a84
another try with os pass updated
javierdelapuente Jan 29, 2025
147313e
if the image builder does not go to active or goes to error, fail
javierdelapuente Jan 29, 2025
0c3f50c
more reasonable timeouts
javierdelapuente Jan 29, 2025
6e4fb54
empty
javierdelapuente Jan 29, 2025
370ba7e
more timeouts updated
javierdelapuente Jan 29, 2025
1620146
other one
javierdelapuente Jan 29, 2025
ce01650
increase create server timeout
javierdelapuente Jan 29, 2025
c9cf847
a bit more info about the command
javierdelapuente Jan 29, 2025
b861db2
add maintenance status in debug-ssh
javierdelapuente Jan 29, 2025
8b7c5af
increase timeout
javierdelapuente Jan 29, 2025
6ae8c2e
bad ssh run
javierdelapuente Jan 29, 2025
c197149
change call debug ssh
javierdelapuente Jan 29, 2025
bc7638d
let's try again
javierdelapuente Jan 30, 2025
e568678
clean messages
javierdelapuente Jan 30, 2025
6345b57
add snap watch for auto refresh
javierdelapuente Jan 30, 2025
3b0c952
better logging
javierdelapuente Jan 30, 2025
0e3adbf
a few more tries
javierdelapuente Jan 30, 2025
4a13d80
try two
javierdelapuente Jan 30, 2025
d06501d
what is the path
javierdelapuente Jan 30, 2025
0c0c2c4
forked repo full name
javierdelapuente Jan 30, 2025
e71aa72
just in case autorefresh is creating issues
javierdelapuente Jan 30, 2025
097c98d
more logs to see if they work
javierdelapuente Jan 30, 2025
8a59b7b
fix linting
javierdelapuente Jan 30, 2025
fdd2b85
reconcile on more cases
javierdelapuente Jan 31, 2025
0480e85
remove comment I do not know what it means
javierdelapuente Jan 31, 2025
05e3282
a few more logs to see where it gets stuck
javierdelapuente Jan 31, 2025
36e49f2
add log on image relation joined
javierdelapuente Jan 31, 2025
08c01c3
a couple of logs more
javierdelapuente Jan 31, 2025
4caeaf9
do not consider deleted for reconciling
javierdelapuente Jan 31, 2025
883dc06
increase a timeout
javierdelapuente Jan 31, 2025
398a3eb
just for curiosity
javierdelapuente Jan 31, 2025
d143e55
more logs
javierdelapuente Jan 31, 2025
a571016
another try
javierdelapuente Feb 3, 2025
de19c71
use a newer version of the image-builder
javierdelapuente Feb 3, 2025
46ac8da
Revert "use a newer version of the image-builder"
javierdelapuente Feb 3, 2025
5a76dd1
use the latest image builder
javierdelapuente Feb 4, 2025
d882f55
give it a bit more time
javierdelapuente Feb 4, 2025
9835025
let's try the last version
javierdelapuente Feb 6, 2025
d8bafb5
Merge branch 'main' into investigate-flakiness
javierdelapuente Feb 6, 2025
9b91fb9
go for the latest version of the image-builder
javierdelapuente Feb 6, 2025
4e7dbef
app-channel not a config anymore
javierdelapuente Feb 6, 2025
5d8915d
try with revision 54
javierdelapuente Feb 6, 2025
75cf7fd
rev 55
javierdelapuente Feb 6, 2025
26d3c33
revision history must be >1
javierdelapuente Feb 6, 2025
fd1eb10
try to fix the leaking of keypairs in image builder
javierdelapuente Feb 6, 2025
4dfc9db
dump debug logs on failure
javierdelapuente Feb 7, 2025
daf4c29
Merge branch 'main' into investigate-flakiness
javierdelapuente Feb 7, 2025
cb79f9a
Merge branch 'main' into investigate-flakiness
javierdelapuente Feb 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/integration_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
# INTEGRATION_TEST_ARGS to operator-workflows automatically.
integration-tests:
name: Integration test with juju 3.1
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@dump-debug-logs-on-failure
secrets: inherit
with:
juju-channel: 3.1/stable
Expand All @@ -31,7 +31,7 @@ jobs:
test-timeout: 90
openstack-interface-tests-private-endpoint:
name: openstack interface test using private-endpoint
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@dump-debug-logs-on-failure
secrets: inherit
with:
juju-channel: 3.6/stable
Expand All @@ -44,7 +44,7 @@ jobs:
self-hosted-runner-label: stg-private-endpoint
openstack-integration-tests-private-endpoint:
name: Integration test using private-endpoint
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@dump-debug-logs-on-failure
secrets: inherit
with:
juju-channel: 3.6/stable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,13 @@ def _reconcile_non_reactive(self, expected_quantity: int) -> _ReconcileResult:
"""
delete_metric_stats = None
metric_stats = self._manager.cleanup()
runners = self._manager.get_runners()
# JAVI TODO FILTER DELETED. THIS WILL GENERATE QUOTA ISSUES :(
from github_runner_manager.manager.cloud_runner_manager import CloudRunnerState
runners = self._manager.get_runners(
github_states = None,
cloud_states = [s for s in CloudRunnerState if s != CloudRunnerState.DELETED],
)
# runners = self._manager.get_runners()
logger.info("Reconcile runners from %s to %s", len(runners), expected_quantity)
runner_diff = expected_quantity - len(runners)
if runner_diff > 0:
Expand Down Expand Up @@ -313,8 +319,7 @@ def _issue_reconciliation_metric(

try:

metric_events.issue_event(
metric_events.Reconciliation(
event = metric_events.Reconciliation(
timestamp=time.time(),
flavor=reconcile_metric_data.flavor,
crashed_runners=reconcile_metric_data.metric_stats.get(
Expand All @@ -327,6 +332,8 @@ def _issue_reconciliation_metric(
duration=reconcile_metric_data.end_timestamp
- reconcile_metric_data.start_timestamp,
)
)
metric_events.issue_event(event)
logger.info("JAVI METRIC EVENTS: %s,", event)
logger.info("JAVI RUNNER LIST: %s,", reconcile_metric_data.runner_list)
except IssueMetricEventError:
logger.exception("Failed to issue Reconciliation metric")
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,11 @@ def _run_health_check_cloud_init(
"""
result: invoke.runners.Result = _execute_ssh_command(ssh_conn, "cloud-init status")
if not result.ok:
logger.warning("cloud-init status command failed on %s: %s.", server_name, result.stderr)
logger.error("cloud-init status command failed on %s: %s.", server_name, result.stderr)
cloud_init_log_output_result = _execute_ssh_command(ssh_conn, "cat /var/log/cloud-init-output.log")
logger.error("/var/log/cloud-init-output.log stdout: %s", cloud_init_log_output_result.stdout)
cloud_init_log_result = _execute_ssh_command(ssh_conn, "cat /var/log/cloud-init.log")
logger.error("/var/log/cloud-init.log stdout: %s", cloud_init_log_result.stdout)
return False

if CloudInitStatus.DONE in result.stdout:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,10 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None:
logger.warning(
"cloud-init status command failed on %s: %s.", instance.server_name, result.stderr
)
# Best-effort dump of both cloud-init logs before the RunnerStartError is raised.
# Both reads pass the same warn/timeout options to ssh_conn.run; logger.error only
# receives the format string and its argument (it rejects unknown keyword arguments).
cloud_init_log_output_result = ssh_conn.run("cat /var/log/cloud-init-output.log", warn=True, timeout=60)
logger.error("/var/log/cloud-init-output.log stdout: %s", cloud_init_log_output_result.stdout)
cloud_init_log_result = ssh_conn.run("cat /var/log/cloud-init.log", warn=True, timeout=60)
logger.error("/var/log/cloud-init.log stdout: %s", cloud_init_log_result.stdout)
raise RunnerStartError(f"Runner startup process not found on {instance.server_name}")
# A short running job may have already completed and exited the runner, hence check the
# condition via cloud-init status check.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ snap watch --last=auto-refresh?

{% if aproxy_address %}
snap install aproxy --edge
snap refresh --hold=2h
snap watch --last=auto-refresh?
snap set aproxy proxy={{ aproxy_address }} listen=:54969
cat << EOF > /etc/nftables.conf
define default-ip = $(ip route get $(ip route show 0.0.0.0/0 | grep -oP 'via \K\S+') | grep -oP 'src \K\S+')
Expand Down
12 changes: 12 additions & 0 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,7 @@ def _on_upgrade_charm(self, _: UpgradeCharmEvent) -> None:
@catch_charm_errors
def _on_config_changed(self, _: ConfigChangedEvent) -> None:
"""Handle the configuration change."""
logger.info("JAVI CHARM _ON_CONFIG_CHANGED")
state = self._setup_state()
self._set_reconcile_timer()

Expand All @@ -355,6 +356,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None:
if not self._get_set_image_ready_status():
return
if flush_and_reconcile:
logger.info("JAVI CHARM _ON_CONFIG_CHANGED FLUSH RECONCILE")
logger.info("Flush and reconcile on config-changed")
runner_scaler = self._get_runner_scaler(state)
runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE)
Expand All @@ -363,16 +365,19 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None:
@catch_charm_errors
def _on_reconcile_runners(self, _: ReconcileRunnersEvent) -> None:
    """Trigger a reconciliation when the reconcile-runners event fires.

    Args:
        _: The reconcile runners event; its content is not used.
    """
    logger.info("JAVI CHARM _on_reconcile_runners")
    self._trigger_reconciliation()

@catch_charm_errors
def _on_database_created(self, _: ops.RelationEvent) -> None:
    """Trigger a reconciliation when the MongoDB database created event fires.

    Args:
        _: The relation event; its content is not used.
    """
    logger.info("JAVI CHARM _on_database_created")
    self._trigger_reconciliation()

@catch_charm_errors
def _on_endpoints_changed(self, _: ops.RelationEvent) -> None:
    """Trigger a reconciliation when the MongoDB endpoints changed event fires.

    Args:
        _: The relation event; its content is not used.
    """
    logger.info("JAVI CHARM _on_endpoints_changed")
    self._trigger_reconciliation()

def _trigger_reconciliation(self) -> None:
Expand All @@ -392,6 +397,7 @@ def _on_check_runners_action(self, event: ActionEvent) -> None:
Args:
event: The event fired on check_runners action.
"""
logger.info("JAVI CHARM _on_check_runners_action")
state = self._setup_state()

runner_scaler = self._get_runner_scaler(state)
Expand All @@ -414,6 +420,7 @@ def _on_reconcile_runners_action(self, event: ActionEvent) -> None:
Args:
event: Action event of reconciling the runner.
"""
logger.info("JAVI CHARM _on_reconcile_runners_action")
self.unit.status = MaintenanceStatus("Reconciling runners")
state = self._setup_state()

Expand Down Expand Up @@ -441,6 +448,7 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None:
Args:
event: Action event of flushing all runners.
"""
logger.info("JAVI CHARM _on_flush_runners_action")
state = self._setup_state()

# Flushing mode not implemented for OpenStack yet.
Expand All @@ -465,6 +473,7 @@ def _on_update_dependencies_action(self, event: ActionEvent) -> None:
Args:
event: Action event of updating dependencies.
"""
logger.info("JAVI CHARM _on_update_dependencies_action")
# No dependencies managed by the charm for OpenStack-based runners.
event.set_results({"flush": False})

Expand Down Expand Up @@ -522,6 +531,7 @@ def _apt_install(self, packages: Sequence[str]) -> None:
@catch_charm_errors
def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None:
"""Handle debug ssh relation changed event."""
logger.info("JAVI CHARM _on_debug_ssh_relation_changed")
self.unit.status = MaintenanceStatus("Added debug-ssh relation")
state = self._setup_state()

Expand All @@ -534,6 +544,7 @@ def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None:
@catch_charm_errors
def _on_image_relation_joined(self, _: ops.RelationJoinedEvent) -> None:
"""Handle image relation joined event."""
logger.info("JAVI CHARM _on_image_relation_joined")
state = self._setup_state()

clouds_yaml = state.charm_config.openstack_clouds_yaml
Expand All @@ -545,6 +556,7 @@ def _on_image_relation_joined(self, _: ops.RelationJoinedEvent) -> None:
@catch_charm_errors
def _on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None:
"""Handle image relation changed event."""
logger.info("JAVI CHARM _on_image_relation_changed")
state = self._setup_state()
self.unit.status = MaintenanceStatus("Update image for runners")

Expand Down
46 changes: 43 additions & 3 deletions tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,23 @@ def runner_manager_github_client(token: str) -> GithubClient:
return GithubClient(token=token)


@pytest_asyncio.fixture(scope="module")
async def openstack_model_proxy(
    openstack_http_proxy: str,
    openstack_https_proxy: str,
    openstack_no_proxy: str,
    model: Model,
) -> None:
    """Configure the model with the OpenStack proxy values and the logging config.

    Args:
        openstack_http_proxy: Value stored under the "juju-http-proxy" model config key.
        openstack_https_proxy: Value stored under the "juju-https-proxy" model config key.
        openstack_no_proxy: Value stored under the "juju-no-proxy" model config key.
        model: The model whose configuration is updated.
    """
    model_settings = {
        "juju-http-proxy": openstack_http_proxy,
        "juju-https-proxy": openstack_https_proxy,
        "juju-no-proxy": openstack_no_proxy,
        "logging-config": "<root>=INFO;unit=DEBUG",
    }
    await model.set_config(model_settings)


@pytest_asyncio.fixture(scope="module")
async def app_no_runner(
model: Model,
Expand Down Expand Up @@ -432,10 +449,32 @@ async def app_openstack_runner_fixture(
)
await model.integrate(f"{image_builder.name}:image", f"{application.name}:image")
await model.wait_for_idle(
apps=[application.name, image_builder.name], status=ACTIVE, timeout=20 * 60
apps=[application.name, image_builder.name], status=ACTIVE, timeout=30 * 60
)

return application
# better use test-mode charm config... but let's see
command = "find /var/lib/juju -type f -name 'constants.py' -exec sed -i 's/^CREATE_SERVER_TIMEOUT = .*/CREATE_SERVER_TIMEOUT = 900/gI' {} \\;"
run_actions = await application.run(command)
logging.info("JAVI run_actions %s", run_actions)
for action_result in run_actions.actions:
logging.info("JAVI action_result %s", action_result)
action = action_result.action
logging.info("JAVI action %s", action)
# no comment...
action_id = action.tag
if action_id.startswith("action-"):
# strip the action- part of "action-<num>" tag
action_id = action_id[7:]
action = await model._wait_for_new("action", action_id)
result = await action.wait()
logging.info("JAVI output of one unit of CREATE_SERVER_TIMEOUT %s", result.results)

yield application
try:
logging.info("JAVI after yield in app_openstack_runner_fixture")
# get_file_content(unit, filename)
except Exception:
logging.exception("JAVI something failed after yield")


@pytest_asyncio.fixture(scope="module", name="app_scheduled_events")
Expand Down Expand Up @@ -525,7 +564,7 @@ async def tmate_ssh_server_app_fixture(
"""tmate-ssh-server charm application related to GitHub-Runner app charm."""
tmate_app: Application = await model.deploy("tmate-ssh-server", channel="edge")
await app_no_wait_tmate.relate("debug-ssh", f"{tmate_app.name}:debug-ssh")
await model.wait_for_idle(apps=[tmate_app.name], status=ACTIVE, timeout=60 * 30)
await model.wait_for_idle(apps=[tmate_app.name], status=ACTIVE, timeout=60 * 20)

return tmate_app

Expand Down Expand Up @@ -627,6 +666,7 @@ async def app_with_forked_repo(
Test should ensure it returns with the application in a good state and has
one runner.
"""
logging.info("JAVI forked_github_repository.full_name: %s", forked_github_repository.full_name)
await basic_app.set_config({PATH_CONFIG_NAME: forked_github_repository.full_name})

return basic_app
Expand Down
6 changes: 4 additions & 2 deletions tests/integration/helpers/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,11 @@ async def reconcile(app: Application, model: Model) -> None:
app: The GitHub Runner Charm app to reconcile the runners for.
model: The machine charm model.
"""
logger.info("JAVI calling common.reconcile")
action = await app.units[0].run_action("reconcile-runners")
await action.wait()
await model.wait_for_idle(apps=[app.name], status=ACTIVE)
# JAVI TODO put this timeout a bit bigger than the new create build timeout
await model.wait_for_idle(apps=[app.name], status=ACTIVE, timeout=16 * 60)


async def deploy_github_runner_charm(
Expand Down Expand Up @@ -324,7 +326,7 @@ async def wait_for_completion(run: WorkflowRun, conclusion: str) -> None:
"""
await wait_for(
partial(_is_workflow_run_complete, run=run),
timeout=60 * 30,
timeout=60 * 20,
check_interval=60,
)
# The run object is updated by _is_workflow_run_complete function above.
Expand Down
65 changes: 65 additions & 0 deletions tests/integration/helpers/openstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# See LICENSE file for licensing details.
import logging
import secrets
import threading
from asyncio import sleep
from typing import Optional, TypedDict

Expand All @@ -17,6 +18,50 @@
logger = logging.getLogger(__name__)


async def javi_wait_for_idle(openstack_connection, model, *args, **kwargs) -> None:
    """Wait for the model to become idle while periodically logging the OpenStack servers.

    A background thread calls ``openstack_connection.list_servers()`` every 20 seconds and
    logs the name, status and timestamps of every server, so that a slow or stuck
    ``model.wait_for_idle`` can be correlated with the state of the servers. The thread is
    stopped and joined once ``model.wait_for_idle`` returns or raises.

    Args:
        openstack_connection: OpenStack connection object; only ``list_servers()`` is used.
        model: Object whose ``wait_for_idle`` coroutine is awaited.
        args: Positional arguments forwarded to ``model.wait_for_idle``.
        kwargs: Keyword arguments forwarded to ``model.wait_for_idle``.
    """
    logger.info("javi_wait_for_idle")
    stop_requested = threading.Event()

    def _log_openstack() -> None:
        """Log the server list every 20 seconds and once more after stop is requested."""
        while True:
            # wait() returns True once the event is set and False when the 20s timeout expires.
            end_loop = stop_requested.wait(20)
            # The connection is shared with the caller; this is probably not thread safe.
            try:
                servers = openstack_connection.list_servers()
            except Exception:
                logger.exception("JAVI in log openstack thread")
                # Re-raise to end the logging thread; this does not propagate to the caller.
                raise
            logger.info(" [ runner list ]")
            for runner in servers:
                logger.info(
                    " [ runner %s ] status %s created %s updated %s",
                    runner.name,
                    runner.status,
                    runner.created_at,
                    runner.updated_at,
                )
            if end_loop:
                break

    # Create and start the thread before entering the try block, so the finally clause
    # never touches a thread that was not created or not started.
    log_thread = threading.Thread(target=_log_openstack)
    log_thread.start()
    try:
        await model.wait_for_idle(*args, **kwargs)
    finally:
        stop_requested.set()
        log_thread.join()


class OpenStackInstanceHelper:
"""Helper class to interact with OpenStack instances."""

Expand Down Expand Up @@ -179,6 +224,26 @@ async def get_runner_name(self, unit: Unit) -> str:
assert len(runners) == 1
return runners[0].name

def log_runners(self, unit: Unit) -> None:
    """Log all runners found for the given unit.

    One INFO line with name, status and creation/update timestamps is written for each
    runner returned by ``self._get_runners(unit)``, framed by a start and an end marker.

    Args:
        unit: The GitHub Runner Charm unit whose runners are logged.
    """
    runners = self._get_runners(unit)
    logger.info("[ list of runners for unit %s]", unit)
    for runner in runners:
        logger.info(
            " [ runner %s ] status %s created %s updated %s",
            runner.name,
            runner.status,
            runner.created_at,
            runner.updated_at,
        )
    # The end marker takes the unit as argument; without it a literal "%s" is logged.
    logger.info("[ end list of runners for unit %s]", unit)

async def delete_single_runner(self, unit: Unit) -> None:
"""Delete the only runner.

Expand Down
Loading
Loading