From db1195a697fc1ce992e112c2a68aff4220760834 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 26 Oct 2023 10:04:37 -0400 Subject: [PATCH 01/70] Add gitleaks.toml for rh-gitleaks (#510) Add a .gitleaks.toml file to avoid the false positive leak for the example certificate when deploying for Elasticsearch. --- .gitleaks.toml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .gitleaks.toml diff --git a/.gitleaks.toml b/.gitleaks.toml new file mode 100644 index 000000000..1c2a4ed0f --- /dev/null +++ b/.gitleaks.toml @@ -0,0 +1,8 @@ +[allowlist] + description = "Global Allowlist" + + # Ignore based on any subset of the file path + paths = [ + # Ignore all example certs + '''roles\/servicetelemetry\/vars\/dummy_user_certs\.yml''' + ] From 9d7be76f186ffef4b4f579f43fecb352d417d437 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 31 Oct 2023 18:43:47 +0000 Subject: [PATCH 02/70] [stf-collect-logs] Move describe build|pod from ci/ to the role (#505) --- build/stf-collect-logs/tasks/main.yml | 24 ++++++++++++++++++++++-- ci/post-collect_logs.yml | 19 ------------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/build/stf-collect-logs/tasks/main.yml b/build/stf-collect-logs/tasks/main.yml index dde52761f..07c602840 100644 --- a/build/stf-collect-logs/tasks/main.yml +++ b/build/stf-collect-logs/tasks/main.yml @@ -54,8 +54,28 @@ ansible.builtin.command: cmd: | oc -n {{ namespace }} get pods > {{ logfile_dir }}/post_oc_get_pods.log 2>&1 - echo "Additional information" >> {{ logfile_dir }}/post_oc_get_pods.log - oc -n {{ namespace }} describe pods >> {{ logfile_dir }}/post_oc_get_pods.log 2>&1 + ignore_errors: true + retries: 3 + delay: 10 + +- name: "Describe non-completed, non-running pods" + ansible.builtin.shell: + cmd: | + for pod in $(oc get pods | grep -v NAME | grep -v Running | awk '{ print $1 }'); + do + oc -n {{ namespace }} describe pod $pod > {{ logfile_dir }}/post_oc_describe_pod_${pod}.log 2>&1 + done + ignore_errors: 
true + retries: 3 + delay: 10 + +- name: "Describe builds" + ansible.builtin.shell: + cmd: | + for build in $(oc -n {{ namespace }} get builds -o json | jq -r '.items[].metadata.name'); + do + oc -n {{ namespace }} describe build $build > {{ logfile_dir }}/post_oc_describe_build_${build}.log 2>&1 + done ignore_errors: true retries: 3 delay: 10 diff --git a/ci/post-collect_logs.yml b/ci/post-collect_logs.yml index 58552b618..78526cd1b 100644 --- a/ci/post-collect_logs.yml +++ b/ci/post-collect_logs.yml @@ -42,25 +42,6 @@ ansible.builtin.import_role: name: '../build/stf-collect-logs' - - name: "Get pods and describe non-completed, non-running pods" - ansible.builtin.shell: - cmd: | - echo "*** oc get pods ***" > {{ logfile_dir }}/oc_get_pods.log 2>&1 - oc -n {{ namespace }} get pods >> {{ logfile_dir }}/oc_get_pods.log 2>&1 - - for pod in $(oc get pods | grep -v NAME | grep -v Running | awk '{ print $1 }'); - do - oc -n {{ namespace }} describe pod $pod > {{ logfile_dir }}/post_oc_describe_pod_${pod}.log 2>&1 - done - ignore_errors: true - retries: 3 - delay: 10 - - - name: "Get build details" - ansible.builtin.shell: - cmd: | - for build in $(oc -n {{ namespace }} get builds -o json| jq -r '.items[].metadata.name'); do oc -n {{ namespace }} describe build $build > {{ logfile_dir }}/post_oc_describe_build_${build}.log 2>&1; done - - name: "Copy generated logs" ansible.builtin.shell: | cp {{ ansible_env.HOME }}/*.log . From 1772184ac63ef82306c8086271e7c0f4c043e676 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Wed, 1 Nov 2023 19:50:26 +0000 Subject: [PATCH 03/70] [stf-run-ci] Fix check to include bool filter (#511) Update the check to use bool filter instead of a bare var. By default, ansible parses vars as strings, and without the | bool filter, this check is invalid, as it will always resolve to true, since it is a non-empty string. Other instances of the same check did this, but this one was missed. 
--- build/stf-run-ci/tasks/setup_stf_from_bundles.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index 8439dce4a..b4883c7c5 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -64,7 +64,7 @@ data: cert.pem: "{{ lookup('file', 'CA.pem') | b64encode }}" -- when: setup_bundle_registry_tls_ca +- when: setup_bundle_registry_tls_ca | bool name: Patch the default service account to use our pull secret kubernetes.core.k8s_json_patch: kind: ServiceAccount From bacb1fb889def2c62d97cf1a806809eb747a3e97 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Thu, 2 Nov 2023 15:58:31 +0000 Subject: [PATCH 04/70] [allow_skip_clone] Allow skipping of the cloning stages (#512) * [allow_skip_clone] Use _dir instead of hardcoding all directories relative to base_dir This will allow configuration of the repo clone destination, so we can use pre-cloned dirs instead of explicitly cloning the dirs each time. This is essential for CI systems like zuul, that set-up the repos with particular versions/branches prior to running the test scripts. * [zuul] List the other infrawatch repos as required for the job * [zuul] Set the {sgo,sg-bridge,sg-core,prometheus-webhook-snmp}_dir vars Add in the repo dir locations where the repos should be pre-cloned by zuul * Replace base_dir with sto_dir * set sto_dir relative to base_dir if it isn't already set * [ci] use absolute dir for requirements.txt * [ci] Update sto_dir using explicit reference zuul.project.src_dir refers to the current project dir. When using the jobs in another infrawatch project, this becomes invalid. 
Instead, sto_dir is explicitly set using zuul.projects[].src_dir, the same way that the other repo dirs are set in vars-zuul-common --------- Co-authored-by: Chris Sibbitt --- .zuul.yaml | 5 ++ build/stf-run-ci/defaults/main.yml | 1 + build/stf-run-ci/tasks/clone_repos.yml | 50 ++++++++++++++----- build/stf-run-ci/tasks/main.yml | 20 ++++++-- .../tasks/setup_stf_local_build.yml | 19 +++---- ci/deploy_stf.yml | 2 +- ci/post-collect_logs.yml | 2 +- ci/prepare.yml | 4 +- ci/test_stf.yml | 2 +- ci/vars-zuul-common.yml | 5 ++ 10 files changed, 78 insertions(+), 32 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 91d848359..6d6f72ac6 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -12,6 +12,11 @@ required-projects: - name: openstack-k8s-operators/ci-framework override-checkout: main + - name: github.com/infrawatch/smart-gateway-operator + - name: github.com/infrawatch/sg-bridge + - name: github.com/infrawatch/sg-core + - name: github.com/infrawatch/prometheus-webhook-snmp + pre-run: - ci/prepare.yml run: diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index da9834ecf..a6c5ee184 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -67,5 +67,6 @@ prometheus_webhook_snmp_repository: https://github.com/infrawatch/prometheus-web base_dir: '' +clone_repos: true setup_bundle_registry_auth: true setup_bundle_registry_tls_ca: true diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index 2bb2871bf..5f0131832 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ b/build/stf-run-ci/tasks/clone_repos.yml @@ -3,60 +3,84 @@ # NOTE: since you can't loop against blocks (and we're using them for failure # # recovery when the request branch doesn't exist) we have to define each # of these separately rather than using a loop. 
+ +- name: Check if the {{ sgo_dir }} already exists + ansible.builtin.stat: + path: "{{ sgo_dir }}" + register: check_sgo_dir + - name: Get Smart Gateway Operator + when: not check_sgo_dir.stat.exists block: - name: Try cloning same-named branch or override branch from SGO repository ansible.builtin.git: repo: "{{ sgo_repository }}" - dest: "{{ base_dir }}/working/smart-gateway-operator" - version: "{{ sgo_branch | default(branch, true) }}" - force: true + dest: "{{ sgo_dir }}" + version: "{{ version_branches.sgo | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sgo }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/smart-gateway-operator - dest: "{{ base_dir }}/working/smart-gateway-operator" + dest: "{{ sgo_dir }}" version: "{{ version_branches.sgo }}" +- name: Check if the {{ sg_core_dir }} already exists + ansible.builtin.stat: + path: "{{ sg_core_dir }}" + register: check_sg_core_dir + - name: Get sg-core + when: not check_sg_core_dir.stat.exists block: - name: Try cloning same-named branch or override branch from sg-core repository ansible.builtin.git: repo: "{{ sg_core_repository }}" - dest: "{{ base_dir }}/working/sg-core" - version: "{{ sg_core_branch | default(branch, true) }}" + dest: "{{ sg_core_dir }}" + version: "{{ version_branches.sg_core | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_core }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/sg-core - dest: "{{ base_dir }}/working/sg-core" + dest: "{{ sg_core_dir }}" version: "{{ version_branches.sg_core }}" +- name: Check if the {{ sg_bridge_dir }} already exists + ansible.builtin.stat: + path: "{{ sg_bridge_dir }}" + register: check_sg_bridge_dir + - name: Get sg-bridge + when: not check_sg_bridge_dir.stat.exists block: - name: Try cloning same-named branch or override branch from sg-bridge 
repository ansible.builtin.git: repo: "{{ sg_bridge_repository }}" - dest: "{{ base_dir }}/working/sg-bridge" - version: "{{ sg_bridge_branch | default(branch, true) }}" + dest: "{{ sg_bridge_dir }}" + version: "{{ version_branches.sg_bridge | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_bridge }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/sg-bridge - dest: "{{ base_dir }}/working/sg-bridge" + dest: "{{ sg_bridge_dir }}" version: "{{ version_branches.sg_bridge }}" +- name: Check if the {{ prometheus_webhook_snmp_dir }} already exists + ansible.builtin.stat: + path: "{{ prometheus_webhook_snmp_dir }}" + register: check_prometheus_webhook_snmp_dir + - name: Get prometheus-webhook-snmp + when: not check_prometheus_webhook_snmp_dir.stat.exists block: - name: Try cloning same-named branch or override branch from prometheus-webhook-snmp repository ansible.builtin.git: repo: "{{ prometheus_webhook_snmp_repository }}" - dest: "{{ base_dir }}/working/prometheus-webhook-snmp" - version: "{{ prometheus_webhook_snmp_branch | default(branch, true) }}" + dest: "{{ prometheus_webhook_snmp_dir }}" + version: "{{ version_branches.prometheus_webhook_snmp | default(branch, true) }}" rescue: - name: "Get {{ version_branches.prometheus_webhook_snmp }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/prometheus-webhook-snmp - dest: "{{ base_dir }}/working/prometheus-webhook-snmp" + dest: "{{ prometheus_webhook_snmp_dir }}" version: "{{ version_branches.prometheus_webhook_snmp }}" diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index bd0821959..7d64ff56e 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -4,6 +4,7 @@ # -- initial setup - name: Setup default values ansible.builtin.set_fact: + # The branch should be removed, we should assume 
that everything is checked out to the right place. branch: "{{ working_branch | default('master') }}" namespace: "{{ namespace if namespace is defined else (working_namespace | default('service-telemetry'))}}" @@ -68,6 +69,14 @@ base_dir: "{{ playbook_dir }}" when: base_dir | length == 0 +- name: Set the repo destination dirs, if not provided + ansible.builtin.set_fact: + sto_dir: "{{ sto_dir if sto_dir is defined else base_dir + '/..' }}" + sgo_dir: "{{ sgo_dir if sgo_dir is defined else base_dir + '/working/smart-gateway-operator' }}" + sg_core_dir: "{{ sg_core_dir if sg_core_dir is defined else base_dir + '/working/sg-core' }}" + sg_bridge_dir: "{{ sg_bridge_dir if sg_bridge_dir is defined else base_dir + '/working/sg-bridge'}}" + prometheus_webhook_snmp_dir: "{{ prometheus_webhook_snmp_dir if prometheus_webhook_snmp_dir is defined else base_dir + '/working/prometheus-webhook-snmp' }}" + - name: Get operator_sdk_v0 (build bundles) ansible.builtin.command: cmd: "./get_operator_sdk.sh {{ operator_sdk_v0 }}" @@ -97,6 +106,7 @@ - create_builds block: - name: Setup supporting repositories + when: clone_repos | bool ansible.builtin.include_tasks: clone_repos.yml tags: - clone @@ -104,11 +114,11 @@ - name: Create base build list ansible.builtin.set_fact: build_list: - - {name: service-telemetry-operator, dockerfile_path: build/Dockerfile, image_reference_name: sto_image_path, working_build_dir: "{{ base_dir }}/../"} - - {name: smart-gateway-operator, dockerfile_path: build/Dockerfile, image_reference_name: sgo_image_path, working_build_dir: "{{ base_dir }}/working/smart-gateway-operator"} - - {name: sg-core, dockerfile_path: build/Dockerfile, image_reference_name: sg_core_image_path, working_build_dir: "{{ base_dir }}/working/sg-core"} - - {name: sg-bridge, dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: "{{ base_dir }}/working/sg-bridge"} - - {name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, 
image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: "{{ base_dir }}/working/prometheus-webhook-snmp"} + - {name: service-telemetry-operator, dockerfile_path: build/Dockerfile, image_reference_name: sto_image_path, working_build_dir: "{{ sto_dir }}"} + - {name: smart-gateway-operator, dockerfile_path: build/Dockerfile, image_reference_name: sgo_image_path, working_build_dir: "{{ sgo_dir }}"} + - {name: sg-core, dockerfile_path: build/Dockerfile, image_reference_name: sg_core_image_path, working_build_dir: "{{ sg_core_dir }}"} + - {name: sg-bridge, dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: "{{ sg_bridge_dir }}"} + - {name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: "{{ prometheus_webhook_snmp_dir }}"} - ansible.builtin.debug: var: build_list diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml index 40774223f..ffe2e7f6a 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -6,7 +6,7 @@ - block: - name: Generate Smart Gateway Operator CSV ansible.builtin.shell: - chdir: "{{ base_dir }}/working/smart-gateway-operator/build" + chdir: "{{ sgo_dir }}/build" cmd: | LOGFILE="{{ logfile_dir }}/sgo_gen_bundle.log" \ OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ @@ -28,7 +28,7 @@ - name: Replace namespace in SGO role binding ansible.builtin.replace: - path: "{{ base_dir }}/working/smart-gateway-operator/deploy/role_binding.yaml" + path: "{{ sgo_dir }}/deploy/role_binding.yaml" regexp: 'placeholder' replace: '{{ namespace }}' @@ -42,7 +42,8 @@ block: - name: Load Smart Gateway Operator RBAC ansible.builtin.command: - cmd: oc apply -f {{ base_dir }}/working/smart-gateway-operator/deploy/{{ item }} -n "{{ namespace }}" + cmd: oc apply -f ./deploy/{{ item }} -n 
"{{ namespace }}" + chdir: "{{ sgo_dir }}" loop: - service_account.yaml - role.yaml @@ -57,7 +58,7 @@ - block: - name: Generate Service Telemetry Operator CSV ansible.builtin.shell: - chdir: "{{ base_dir }}" + chdir: "{{ sto_dir }}/build" cmd: | LOGFILE="{{ logfile_dir }}/sto_gen_bundle.log" \ OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ @@ -76,7 +77,7 @@ - name: Replace namespace in STO role binding ansible.builtin.replace: - path: "{{ base_dir }}/../deploy/role_binding.yaml" + path: "{{ sto_dir }}/deploy/role_binding.yaml" regexp: 'placeholder' replace: '{{ namespace }}' @@ -90,8 +91,8 @@ block: - name: Load Service Telemetry Operator RBAC ansible.builtin.command: - cmd: oc apply -f ../deploy/{{ item }} -n "{{ namespace }}" - chdir: "{{ base_dir }}" + cmd: oc apply -f ./deploy/{{ item }} -n "{{ namespace }}" + chdir: "{{ sto_dir }}" loop: - service_account.yaml - role.yaml @@ -105,5 +106,5 @@ # cleanup - name: Revert local change to role_binding.yaml ansible.builtin.shell: - cmd: git checkout -- "{{ base_dir }}/../deploy/role_binding.yaml" - chdir: "{{ base_dir }}" + cmd: git checkout -- deploy/role_binding.yaml + chdir: "{{ sto_dir }}" diff --git a/ci/deploy_stf.yml b/ci/deploy_stf.yml index 170e8590a..cd1e4d2ed 100644 --- a/ci/deploy_stf.yml +++ b/ci/deploy_stf.yml @@ -4,7 +4,7 @@ tasks: - name: "Set the sto_dir if it isn't already set" ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' when: sto_dir | default('') | length == 0 - name: "Get vars common to all jobs" diff --git a/ci/post-collect_logs.yml b/ci/post-collect_logs.yml index 78526cd1b..4340f2eec 100644 --- a/ci/post-collect_logs.yml +++ b/ci/post-collect_logs.yml @@ -16,7 +16,7 @@ tasks: - name: "Set the sto_dir if it isn't already set" ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir 
}}' + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' when: sto_dir | default('') | length == 0 - name: "Get vars common to all jobs" diff --git a/ci/prepare.yml b/ci/prepare.yml index 7b65362d6..02be5114c 100644 --- a/ci/prepare.yml +++ b/ci/prepare.yml @@ -4,7 +4,7 @@ tasks: - name: "Set the sto_dir if it isn't already set" ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' when: sto_dir | default('') | length == 0 - name: "Get vars common to all jobs" @@ -23,7 +23,7 @@ - name: "Install pre-reqs from pip" ansible.builtin.pip: - requirements: "build/stf-run-ci/requirements.txt" + requirements: "{{ sto_dir }}/build/stf-run-ci/requirements.txt" chdir: "{{ sto_dir }}" state: present diff --git a/ci/test_stf.yml b/ci/test_stf.yml index 7f196e860..4fcec7c13 100644 --- a/ci/test_stf.yml +++ b/ci/test_stf.yml @@ -4,7 +4,7 @@ tasks: - name: "Set the sto_dir if it isn't already set" ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' when: sto_dir | default('') | length == 0 - name: "Get vars common to all jobs" diff --git a/ci/vars-zuul-common.yml b/ci/vars-zuul-common.yml index dfd64e7ad..e435df462 100644 --- a/ci/vars-zuul-common.yml +++ b/ci/vars-zuul-common.yml @@ -5,3 +5,8 @@ setup_bundle_registry_auth: false __service_telemetry_transports_qdr_auth: none base_dir: "{{ sto_dir }}/build" logfile_dir: "{{ ansible_user_dir }}/zuul-output/logs/controller" +clone_repos: false +sgo_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/smart-gateway-operator'].src_dir }}" +sg_core_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/sg-core'].src_dir }}" +sg_bridge_dir: "{{ 
ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/sg-bridge'].src_dir }}" +prometheus_webhook_snmp_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/prometheus-webhook-snmp'].src_dir }}" From acaffe6b581dd2314d4e70e8e9c0c947f3f0f875 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Fri, 3 Nov 2023 17:16:40 -0400 Subject: [PATCH 05/70] Fix qdr auth one_time_upgrade label check (#518) * Fix qdr auth one_time_upgrade label check * Fix incorrect variable naming on one_time_upgrade label check * Adjust QDR authentication password generation (#520) Adjust the passwords being generated for QDR authentication since certain characters (such as colon) will cause a failure in the parsing routine within qpid-dispatch. Updates the lookup function to only use ascii_letters and digits and increases the length to 32 characters. --------- Co-authored-by: Leif Madsen --- roles/servicetelemetry/tasks/component_qdr.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 64489ff74..4610c4be9 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -154,9 +154,9 @@ block: - name: Get QDR BasicAuth secret k8s_info: - api_version: interconnectedcloud.github.io/v1alpha1 - kind: Interconnect - name: "{{ ansible_operator_meta.name }}-interconnect" + api_version: v1 + kind: Secret + name: "{{ ansible_operator_meta.name }}-interconnect-users" namespace: "{{ ansible_operator_meta.namespace }}" register: _qdr_basicauth_object @@ -172,9 +172,9 @@ labels: stf_one_time_upgrade: "{{ lookup('pipe', 'date +%s') }}" stringData: - guest: "{{ lookup('password', '/dev/null') }}" + guest: "{{ lookup('password', '/dev/null chars=ascii_letters,digits length=32') }}" when: - - _qdr_basicauth_object.resources[0] is defined and _qdr_basicauth_object[0].metadata.labels.stf_one_time_upgrade is not 
defined + - _qdr_basicauth_object.resources[0] is defined and _qdr_basicauth_object.resources[0].metadata.labels.stf_one_time_upgrade is not defined - name: Set default Interconnect manifest set_fact: From 0786bc8656c2e3cae6285c9e1dff1e7c69277638 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Mon, 6 Nov 2023 13:49:57 +0000 Subject: [PATCH 06/70] Add docs for skip_clone (#515) * [allow_skip_clone] Add docs for clone_repos and *_dir vars * Align README table column spacing (#516) * Align README table column spacing * Update build/stf-run-ci/README.md --------- Co-authored-by: Emma Foley --------- Co-authored-by: Leif Madsen --- build/stf-run-ci/README.md | 88 ++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 41 deletions(-) diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 353a8f81f..664e57e7a 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -15,47 +15,53 @@ Primarily this means a running CodeReady Container system has been provided. Not all variables are listed here, but these are the most common ones you might choose to override: -| Parameter name | Values | Default | Description | -| ------------------------------ | ------------ | --------- | ------------------------------------ | -| `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | -| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. Also see `working_branch`, `sg_branch`, `sgo_branch` | -| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | -| `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles and index image. 
| -| `__service_telemetry_bundle_image_path` | | `quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head` | Image path to Service Telemetry Operator bundle | -| `__smart_gateway_bundle_image_path` | | `quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head` | Image path to Smart Gateway Operator bundle | -| `setup_bundle_registry_tls_ca` | {true,false} | true | Whether to setup or not a TLS CA cert for the bundle registry access | -| `setup_bundle_registry_auth` | {true,false} | true | Whether to setup or not the auth for the bundle registry access | -| `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | -| `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | -| `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | -| `sg_bridge_branch` | | master | Which Smart Gateway Bridge git branch to checkout | -| `prometheus_webhook_snmp_branch` | | master | Which Prometheus webhook snmp branch to checkout | -| `sgo_repository` | | https://github.com/infrawatch/smart-gateway-operator | Which Smart Gateway Operator git repository to clone | -| `sg_core_repository` | | https://github.com/infrawatch/sg-core | Which Smart Gateway Core git repository to clone | -| `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | -| `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | -| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | -| `__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum 
duration is 1h) | -| `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | -| `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | -| `__service_telemetry_metrics_enabled` | {true,false} | true | Whether to enable metrics support in ServiceTelemetry | -| `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable ephemeral storage support in ServiceTelemetry | -| `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | -| `__service_telemetry_snmptraps_enabled` | {true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | -| `__service_telemetry_snmptraps_community` | | `public` | Set the SNMP community to send traps to. Defaults to public | -| `__service_telemetry_snmptraps_target` | | `192.168.24.254` | Set the SNMP target to send traps to. Defaults to 192.168.24.254 | -| `__service_telemetry_snmptraps_retries` | | 5 | Set the SNMP retry count for traps. Defaults to 5 | -| `__service_telemetry_snmptraps_port` | | 162 | Set the SNMP target port for traps. Defaults to 162 | -| `__service_telemetry_snmptraps_timeout` | | 1 | Set the SNMP retry timeout (in seconds). Defaults to 1 | -| `__service_telemetry_alert_oid_label` | | oid | The alert label name to look for oid value. Default to oid. | -| `__service_telemetry_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. | -| `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | -| `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. 
| -| `__service_telemetry_observability_strategy` | | `use_redhat` | Which observability strategy to use for deployment. Default is 'use_redhat'. Also supported are 'use_hybrid', 'use_community', and 'none' | -| `__service_telemetry_transports_qdr_auth` | {'none', 'basic'} | `none` | Which auth method to use for QDR. Can be 'none' or 'basic'. Note: 'basic' is not yet supported in smoketests. | -| `__service_telemetry_transports_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | -| `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | -| `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | +| Parameter name | Values | Default | Description | +| ------------------------------ | ------------ | --------- | ------------------------------------ | +| `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | +| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. Also see `working_branch`, `sg_branch`, `sgo_branch` | +| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | +| `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles and index image. 
| +| `__service_telemetry_bundle_image_path` | | `quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head` | Image path to Service Telemetry Operator bundle | +| `__smart_gateway_bundle_image_path` | | `quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head` | Image path to Smart Gateway Operator bundle | +| `setup_bundle_registry_tls_ca` | {true,false} | true | Whether to setup or not a TLS CA cert for the bundle registry access | +| `setup_bundle_registry_auth` | {true,false} | true | Whether to setup or not the auth for the bundle registry access | +| `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | +| `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | +| `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | +| `sg_bridge_branch` | | master | Which Smart Gateway Bridge git branch to checkout | +| `prometheus_webhook_snmp_branch` | | master | Which Prometheus webhook snmp branch to checkout | +| `sgo_repository` | | https://github.com/infrawatch/smart-gateway-operator | Which Smart Gateway Operator git repository to clone | +| `sg_core_repository` | | https://github.com/infrawatch/sg-core | Which Smart Gateway Core git repository to clone | +| `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | +| `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | +| `clone_repos` | {true, false} | true | Whether to clone the repos. If false, the repos will not be cloned, and the user will need to specify a value for `sto_dir`. The location of the other repos may need to be specified as well. (see relevant sections). 
| +| `sto_dir` | | `{{ playbook_dir }}/..` | The location of the service-telemetry-operator directory (needed to set the other repo paths) | +| `sgo_dir` | | `{{ sto_dir }}/build/working/smart-gateway-operator` | The directory to clone smart-gateway-operator into (when clone_repos == true) or the location of the the repo (when clone_repos == false) | +| `sg_core_dir` | | `{{ sto_dir }}/build/working/sg-core` | See description of sgo_dir | +| `sg_bridge_dir` | | `{{ sto_dir }}/build/working/sg-bridge` | See description of sgo_dir | +| `prometheus_webhook_snmp_dir` | | `{{ sto_dir }}/build/working/prometheus-webhook-snmp` | See description of sgo_dir | +| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum duration is 1h) | +| `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | +| `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | +| `__service_telemetry_metrics_enabled` | {true,false} | true | Whether to enable metrics support in ServiceTelemetry | +| `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable ephemeral storage support in ServiceTelemetry | +| `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | +| `__service_telemetry_snmptraps_enabled` | {true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | +| `__service_telemetry_snmptraps_community` | | `public` | Set the SNMP community to 
send traps to. Defaults to public | +| `__service_telemetry_snmptraps_target` | | `192.168.24.254` | Set the SNMP target to send traps to. Defaults to 192.168.24.254 | +| `__service_telemetry_snmptraps_retries` | | 5 | Set the SNMP retry count for traps. Defaults to 5 | +| `__service_telemetry_snmptraps_port` | | 162 | Set the SNMP target port for traps. Defaults to 162 | +| `__service_telemetry_snmptraps_timeout` | | 1 | Set the SNMP retry timeout (in seconds). Defaults to 1 | +| `__service_telemetry_alert_oid_label` | | oid | The alert label name to look for oid value. Default to oid. | +| `__service_telemetry_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. | +| `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | +| `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | +| `__service_telemetry_observability_strategy` | | `use_redhat` | Which observability strategy to use for deployment. Default is 'use_redhat'. Also supported are 'use_hybrid', 'use_community', and 'none' | +| `__service_telemetry_transports_qdr_auth` | {'none', 'basic'} | `none` | Which auth method to use for QDR. Can be 'none' or 'basic'. Note: 'basic' is not yet supported in smoketests. 
| +| `__service_telemetry_transports_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | +| `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | # Example Playbook From 14694d4ed0b87b67f7aaaaa397dde2d5826d3be5 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 7 Nov 2023 17:00:33 +0000 Subject: [PATCH 07/70] [zuul] Add STO to required repos (#524) It appears that STO is not included explicitly when running jobs from SGO [1]. This will be the case in all the other repos. This change explicitly adds it, in case it's not already included by zuul. [1] https://review.rdoproject.org/zuul/build/edd8f17bfdac4360a94186b46c4cea3f --- .zuul.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.zuul.yaml b/.zuul.yaml index 6d6f72ac6..20009eefe 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -12,6 +12,7 @@ required-projects: - name: openstack-k8s-operators/ci-framework override-checkout: main + - name: github.com/infrawatch/service-telemetry-operator - name: github.com/infrawatch/smart-gateway-operator - name: github.com/infrawatch/sg-bridge - name: github.com/infrawatch/sg-core From 37b6f035d6ff44a39598aacf812b3b893bafda7e Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Wed, 8 Nov 2023 11:36:52 -0500 Subject: [PATCH 08/70] QDR Auth in smoketest (#525) * QDR Auth in smoketest * Added qdr-test as a mock of the OSP-side QDR * Connection from qdr-test -> default-interconnect is TLS+Auth * Collectors point at qdr-test instead of default-interconnect directly * Much more realistic than the existing setup * Eliminated a substitution in sensubility config * Used default QDR basic auth in
Jenkinsfile --- Jenkinsfile | 1 - tests/smoketest/collectd-sensubility.conf | 2 +- .../smoketest/minimal-collectd.conf.template | 4 +- tests/smoketest/qdr-test.conf.yaml.template | 66 +++++++++++++++++++ tests/smoketest/qdr-test.yaml | 52 +++++++++++++++ tests/smoketest/smoketest.sh | 20 +++--- .../smoketest_ceilometer_entrypoint.sh | 6 +- 7 files changed, 133 insertions(+), 18 deletions(-) create mode 100644 tests/smoketest/qdr-test.conf.yaml.template create mode 100644 tests/smoketest/qdr-test.yaml diff --git a/Jenkinsfile b/Jenkinsfile index f3a13d571..f94b64b1e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -36,7 +36,6 @@ spec: strategy: ephemeral transports: qdr: - auth: none enabled: true deploymentSize: 1 web: diff --git a/tests/smoketest/collectd-sensubility.conf b/tests/smoketest/collectd-sensubility.conf index 0cc773f21..4604e2e85 100644 --- a/tests/smoketest/collectd-sensubility.conf +++ b/tests/smoketest/collectd-sensubility.conf @@ -10,7 +10,7 @@ worker_count=2 checks={"check-container-health":{"command":"cat /healthcheck.log","handlers":[],"interval":3,"occurrences":3,"refresh":90,"standalone":true}} [amqp1] -connection=amqp://default-interconnect.<>.svc:5671 +connection=amqp://qdr-test:5672 results_channel=sensubility/cloud1-telemetry client_name=smoketest.redhat.com results_format=smartgateway diff --git a/tests/smoketest/minimal-collectd.conf.template b/tests/smoketest/minimal-collectd.conf.template index e6cf09189..ac0a6475a 100644 --- a/tests/smoketest/minimal-collectd.conf.template +++ b/tests/smoketest/minimal-collectd.conf.template @@ -11,8 +11,8 @@ LoadPlugin cpu LoadPlugin amqp1 - Host "default-interconnect" - Port "5671" + Host "qdr-test" + Port "5672" Address "collectd" Format JSON diff --git a/tests/smoketest/qdr-test.conf.yaml.template b/tests/smoketest/qdr-test.conf.yaml.template new file mode 100644 index 000000000..24b758214 --- /dev/null +++ b/tests/smoketest/qdr-test.conf.yaml.template @@ -0,0 +1,66 @@ +apiVersion: v1 +kind: 
ConfigMap +metadata: + name: qdr-test-config +data: + qdrouterd.conf: | + router { + mode: edge + id: qdr-test.smoketest + workerThreads: 2 + saslConfigDir: /etc/sasl2 + saslConfigName: qdrouterd + } + + sslProfile { + name: sslProfile + caCertFile: /etc/pki/tls/certs/ca.crt + } + + listener { + host: 0.0.0.0 + port: 5672 + authenticatePeer: false + saslMechanisms: ANONYMOUS + } + + connector { + host: default-interconnect + port: 5671 + role: edge + saslPassword: pass:<> + saslUsername: guest@default-interconnect + sslProfile: sslProfile + verifyHostname: false + } + + address { + prefix: unicast + distribution: closest + } + + address { + prefix: exclusive + distribution: closest + } + + address { + prefix: broadcast + distribution: multicast + } + + address { + distribution: multicast + prefix: collectd + } + + address { + distribution: multicast + prefix: anycast/ceilometer + } + + log { + module: DEFAULT + enable: info+ + includeTimestamp: true + } diff --git a/tests/smoketest/qdr-test.yaml b/tests/smoketest/qdr-test.yaml new file mode 100644 index 000000000..3e6366cc2 --- /dev/null +++ b/tests/smoketest/qdr-test.yaml @@ -0,0 +1,52 @@ +apiVersion: v1 +kind: Pod +metadata: + annotations: + openshift.io/scc: restricted-v2 + name: qdr-test + labels: + qdr: qdr-test +spec: + containers: + - name: qdr + image: quay.io/tripleowallabycentos9/openstack-qdrouterd:current-tripleo + imagePullPolicy: IfNotPresent + command: ['/usr/sbin/qdrouterd','-c','/etc/qpid-dispatch/qdrouterd.conf'] + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + ports: + - containerPort: 5672 + name: amqp + protocol: TCP + volumeMounts: + - mountPath: /etc/pki/tls/certs/ + name: default-interconnect-selfsigned-cert + - mountPath: /etc/qpid-dispatch/ + name: qdr-test-config + resources: {} + volumes: + - name: default-interconnect-selfsigned-cert + secret: + defaultMode: 420 + secretName: default-interconnect-selfsigned + - name: qdr-test-config + configMap: + 
defaultMode: 420 + name: qdr-test-config + +--- + +apiVersion: v1 +kind: Service +metadata: + name: qdr-test +spec: + ports: + - name: amqp + port: 5672 + targetPort: amqp + selector: + qdr: qdr-test diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 2909e694f..29510a837 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -27,13 +27,6 @@ if [ "${OC_CLIENT_VERSION_Y}" -lt "${OC_CLIENT_VERSION_Y_REQUIRED}" ] || [ "${OC exit 1 fi -if [ "$(oc get stf default -o=jsonpath='{.spec.transports.qdr.auth}')" != "none" ]; then - echo "*** QDR authentication is currently not supported in smoketests." - echo "To disable it, use: oc patch stf default --patch '{\"spec\":{\"transports\":{\"qdr\":{\"auth\":\"none\"}}}}' --type=merge" - echo "For more info: https://github.com/infrawatch/service-telemetry-operator/pull/492" - exit 1 -fi - CLEANUP=${CLEANUP:-true} SMOKETEST_VERBOSE=${SMOKETEST_VERBOSE:-true} @@ -57,18 +50,23 @@ ELASTICSEARCH_AUTH_PASS=$(oc get secret elasticsearch-es-elastic-user -ogo-templ echo "*** [INFO] Getting Prometheus authentication password" PROMETHEUS_AUTH_PASS=$(oc get secret default-prometheus-htpasswd -ogo-template='{{ .data.password | base64decode }}') -echo "*** [INFO] Setting namepsace for collectd-sensubility config" -sed "s/<>/${OCP_PROJECT}/g" "${REL}/collectd-sensubility.conf" > /tmp/collectd-sensubility.conf - echo "*** [INFO] Creating configmaps..." 
oc delete configmap/stf-smoketest-healthcheck-log configmap/stf-smoketest-collectd-config configmap/stf-smoketest-sensubility-config configmap/stf-smoketest-collectd-entrypoint-script configmap/stf-smoketest-ceilometer-publisher configmap/stf-smoketest-ceilometer-entrypoint-script job/stf-smoketest || true oc create configmap stf-smoketest-healthcheck-log --from-file "${REL}/healthcheck.log" oc create configmap stf-smoketest-collectd-config --from-file "${REL}/minimal-collectd.conf.template" -oc create configmap stf-smoketest-sensubility-config --from-file /tmp/collectd-sensubility.conf +oc create configmap stf-smoketest-sensubility-config --from-file "${REL}/collectd-sensubility.conf" oc create configmap stf-smoketest-collectd-entrypoint-script --from-file "${REL}/smoketest_collectd_entrypoint.sh" oc create configmap stf-smoketest-ceilometer-publisher --from-file "${REL}/ceilometer_publish.py" oc create configmap stf-smoketest-ceilometer-entrypoint-script --from-file "${REL}/smoketest_ceilometer_entrypoint.sh" +echo "*** [INFO] Creating Mock OSP Metrics QDR router..." +oc delete pod qdr-test +oc delete service qdr-test +oc delete configmap qdr-test-config +AMQP_PASS=$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d) +oc create -f <(sed -e "s/<>/${AMQP_PASS}/;" "${REL}/qdr-test.conf.yaml.template") +oc create -f "${REL}/qdr-test.yaml" + echo "*** [INFO] Creating smoketest jobs..." 
oc delete job -l app=stf-smoketest for NAME in "${CLOUDNAMES[@]}"; do diff --git a/tests/smoketest/smoketest_ceilometer_entrypoint.sh b/tests/smoketest/smoketest_ceilometer_entrypoint.sh index 8e2ac7f6f..adf3a9046 100644 --- a/tests/smoketest/smoketest_ceilometer_entrypoint.sh +++ b/tests/smoketest/smoketest_ceilometer_entrypoint.sh @@ -13,11 +13,11 @@ POD=$(hostname) echo "*** [INFO] My pod is: ${POD}" # Run ceilometer_publisher script -python3 /ceilometer_publish.py default-interconnect:5671 'driver=amqp&topic=cloud1-metering' 'driver=amqp&topic=cloud1-event' +python3 /ceilometer_publish.py qdr-test:5672 'driver=amqp&topic=cloud1-metering' 'driver=amqp&topic=cloud1-event' # Sleeping to produce data -echo "*** [INFO] Sleeping for 20 seconds to produce all metrics and events" -sleep 20 +echo "*** [INFO] Sleeping for 30 seconds to produce all metrics and events" +sleep 30 echo "*** [INFO] List of metric names for debugging..." curl -sk -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/label/__name__/values" 2>&2 | tee /tmp/label_names From d12aa38b1950bd5158ec5f9ba4f95ca6c24500c0 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Wed, 8 Nov 2023 11:56:22 -0500 Subject: [PATCH 09/70] QDR Auth for infrared 17.1 script (#517) * QDR Auth for infrared 17.1 script * Fix missing substitution for AMQP_PASS in infrared script --- tests/infrared/17.1/README.md | 1 + tests/infrared/17.1/infrared-openstack.sh | 12 ++++++++---- tests/infrared/17.1/stf-connectors.yaml.template | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/infrared/17.1/README.md b/tests/infrared/17.1/README.md index 15bcf37a9..0db5c0734 100644 --- a/tests/infrared/17.1/README.md +++ b/tests/infrared/17.1/README.md @@ -6,6 +6,7 @@ OCP_ROUTE_IP="10.0.100.50" \ CA_CERT_FILE_CONTENT="$(oc get secret/default-interconnect-selfsigned -o jsonpath='{.data.ca\.crt}' | base64 -d)" \ AMQP_HOST="$(oc get route default-interconnect-5671 -ojsonpath='{.spec.host}')" \ +AMQP_PASS="$(oc 
get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d)" \ ENABLE_STF_CONNECTORS=true \ ENABLE_GNOCCHI_CONNECTORS=false \ CONTROLLER_MEMORY="24000" \ diff --git a/tests/infrared/17.1/infrared-openstack.sh b/tests/infrared/17.1/infrared-openstack.sh index 9743a0081..a55de8894 100755 --- a/tests/infrared/17.1/infrared-openstack.sh +++ b/tests/infrared/17.1/infrared-openstack.sh @@ -1,12 +1,11 @@ #!/usr/bin/env bash set -e -# Usage: -# VIRTHOST=my.big.hypervisor.net -# ./infrared-openstack.sh +# Usage: See README.md VIRTHOST=${VIRTHOST:-localhost} AMQP_HOST=${AMQP_HOST:-stf-default-interconnect-5671-service-telemetry.apps-crc.testing} AMQP_PORT=${AMQP_PORT:-443} +AMQP_PASS=${AMQP_PASS:-} SSH_KEY="${SSH_KEY:-${HOME}/.ssh/id_rsa}" NTP_SERVER="${NTP_SERVER:-clock.redhat.com,10.5.27.10,10.11.160.238}" CLOUD_NAME="${CLOUD_NAME:-cloud1}" @@ -97,7 +96,7 @@ ir_create_undercloud() { } stf_create_config() { - sed -r "s/<>/${AMQP_HOST}/;s/<>/${AMQP_PORT}/;s/<>/${CLOUD_NAME}/;s%<>%${CA_CERT_FILE_CONTENT//$'\n'/<@@@>}%;s/<@@@>/\n /g" ${STF_ENVIRONMENT_TEMPLATE} > outputs/stf-connectors.yaml + sed -r "s/<>/${AMQP_HOST}/;s/<>/${AMQP_PORT}/;s/<>/${AMQP_PASS}/;s/<>/${CLOUD_NAME}/;s%<>%${CA_CERT_FILE_CONTENT//$'\n'/<@@@>}%;s/<@@@>/\n /g" ${STF_ENVIRONMENT_TEMPLATE} > outputs/stf-connectors.yaml } gnocchi_create_config() { @@ -167,6 +166,11 @@ if [ -z "${CA_CERT_FILE_CONTENT}" ]; then exit 1 fi +if [ -z "${AMQP_PASS}" ]; then + echo "AMQP_PASS must be set and passed to the deployment, or QDR will fail to connect." 
+ exit 1 +fi + time if ${TEMPEST_ONLY}; then echo "-- Running tempest tests" ir_run_tempest diff --git a/tests/infrared/17.1/stf-connectors.yaml.template b/tests/infrared/17.1/stf-connectors.yaml.template index 1031e097b..1dfa26827 100644 --- a/tests/infrared/17.1/stf-connectors.yaml.template +++ b/tests/infrared/17.1/stf-connectors.yaml.template @@ -16,6 +16,8 @@ custom_templates: role: edge verifyHostname: false sslProfile: sslProfile + saslUsername: guest@default-interconnect + saslPassword: pass:<> MetricsQdrSSLProfiles: - name: sslProfile From 09044b9e59105220ade9c858ad4450bba2c4de22 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Mon, 13 Nov 2023 13:31:17 +0000 Subject: [PATCH 10/70] [zuul] Define a project template for stf-crc-jobs (#514) * [allow_skip_clone] Use _dir instead of hardcoding all directories relative to base_dir This will allow configuration of the repo clone destination, so we can use pre-cloned dirs instead of explicitly cloning the dirs each time. This is essential for CI systems like zuul, that set-up the repos with particular versions/branches prior to running the test scripts. * [zuul] List the other infrawatch repos as required for the job * [zuul] Set the {sgo,sg-bridge,sg-core,prometheus-webhook-snmp}_dir vars Add in the repo dir locations where the repos should be pre-cloned by zuul * Replace base_dir with sto_dir * set sto_dir relative to base_dir is it isn't already set * [ci] use absolute dir for requirements.txt * [ci] Update sto_dir using explicit reference zuul.project.src_dir refers to the current project dir. When using the jobs in another infrawatch project, this becomes invalid. 
Instead, sto_dir is explicitly set using zuul.projects[].src_dir, the same way that the other repo dirs are set in vars-zuul-common * [zuul] Define a project template for stf-crc-jobs Instead of listing all the jobs for each project in-repo, and needing to update the list every time that a new job is added, the project template can be updated and the changes propagated to the other infrawatch projects * [zuul] don't enable using the template * Revert "[zuul] don't enable using the template" This reverts commit 56e2009773d13587db890a6d6ca22d30f485c9cb. --------- Co-authored-by: Chris Sibbitt --- .zuul.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 20009eefe..bc3d54a7d 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -48,9 +48,16 @@ vars: scenario: "local_build" -- project: - name: infrawatch/service-telemetry-operator +- project-template: + name: stf-crc-jobs + description: | + STF CRC jobs that build and deploy STF github-check: jobs: - stf-crc-latest-nightly_bundles - stf-crc-latest-local_build + +- project: + name: infrawatch/service-telemetry-operator + templates: + - stf-crc-jobs From 16b8197ed3d0413f652c73a8e309f88f46d635ac Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 13 Nov 2023 15:54:09 -0500 Subject: [PATCH 11/70] Restart QDR after changing the password (#530) * Restart QDR after changing the password * Fixes bug reported here: https://github.com/infrawatch/service-telemetry-operator/pull/517#issuecomment-1794919985 * Avoids an extra manual step when changing password * Would affect users who upgrade from earlier STF and subsequently enable basic auth * Also users who need to change their passwords * Fixing ansible lint * Update roles/servicetelemetry/tasks/component_qdr.yml * Adjust QDR restarts to account for HA * [smoketest] Wait for qdr-test to be Running * [smoketest] Wait for QDR password upgrade * Remove zuul QDR auth override --- ci/vars-zuul-common.yml | 1 -
.../servicetelemetry/tasks/component_qdr.yml | 48 +++++++++++++------ tests/smoketest/smoketest.sh | 10 +++- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/ci/vars-zuul-common.yml b/ci/vars-zuul-common.yml index e435df462..12308a590 100644 --- a/ci/vars-zuul-common.yml +++ b/ci/vars-zuul-common.yml @@ -2,7 +2,6 @@ namespace: "service-telemetry" setup_bundle_registry_tls_ca: false setup_bundle_registry_auth: false -__service_telemetry_transports_qdr_auth: none base_dir: "{{ sto_dir }}/build" logfile_dir: "{{ ansible_user_dir }}/zuul-output/logs/controller" clone_repos: false diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 4610c4be9..0ddbb03f4 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -160,21 +160,41 @@ namespace: "{{ ansible_operator_meta.namespace }}" register: _qdr_basicauth_object - # Because https://github.com/interconnectedcloud/qdr-operator/blob/576d2b33dac71437ea2b165caaaf6413220767fe/pkg/controller/interconnect/interconnect_controller.go#L634 - - name: Perform a one-time upgrade to the default generated password for QDR BasicAuth - k8s: - definition: - kind: Secret - apiVersion: v1 - metadata: - name: "{{ ansible_operator_meta.name }}-interconnect-users" + - when: + - _qdr_basicauth_object.resources[0] is defined and _qdr_basicauth_object.resources[0].metadata.labels.stf_one_time_upgrade is not defined + block: + # Because https://github.com/interconnectedcloud/qdr-operator/blob/576d2b33dac71437ea2b165caaaf6413220767fe/pkg/controller/interconnect/interconnect_controller.go#L634 + - name: Perform a one-time upgrade to the default generated password for QDR BasicAuth + k8s: + definition: + kind: Secret + apiVersion: v1 + metadata: + name: "{{ ansible_operator_meta.name }}-interconnect-users" + namespace: "{{ ansible_operator_meta.namespace }}" + labels: + stf_one_time_upgrade: "{{ lookup('pipe', 'date 
+%s') }}" + stringData: + guest: "{{ lookup('password', '/dev/null chars=ascii_letters,digits length=32') }}" + + # label_selectors on the k8s object need kubernetes.core>=2.2.0 + - name: Get the list of QDR pods + k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ ansible_operator_meta.namespace }}" + label_selectors: + - application={{ ansible_operator_meta.name }}-interconnect + register: _qdr_pod + + - name: Restart QDR pods to pick up new password + k8s: + state: absent + api_version: v1 + kind: Pod namespace: "{{ ansible_operator_meta.namespace }}" - labels: - stf_one_time_upgrade: "{{ lookup('pipe', 'date +%s') }}" - stringData: - guest: "{{ lookup('password', '/dev/null chars=ascii_letters,digits length=32') }}" - when: - - _qdr_basicauth_object.resources[0] is defined and _qdr_basicauth_object.resources[0].metadata.labels.stf_one_time_upgrade is not defined + name: "{{ item.metadata.name }}" + loop: "{{ _qdr_pod.resources }}" - name: Set default Interconnect manifest set_fact: diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 29510a837..caaeb4e88 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -59,14 +59,20 @@ oc create configmap stf-smoketest-collectd-entrypoint-script --from-file "${REL} oc create configmap stf-smoketest-ceilometer-publisher --from-file "${REL}/ceilometer_publish.py" oc create configmap stf-smoketest-ceilometer-entrypoint-script --from-file "${REL}/smoketest_ceilometer_entrypoint.sh" -echo "*** [INFO] Creating Mock OSP Metrics QDR router..." +echo "*** [INFO] Waiting for QDR password upgrade" +AMQP_PASS='' +while [ ${#AMQP_PASS} -lt 32 ]; do AMQP_PASS=$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d); sleep 3; done + +echo "*** [INFO] Creating Mock OSP Metrics QDR..." 
oc delete pod qdr-test oc delete service qdr-test oc delete configmap qdr-test-config -AMQP_PASS=$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d) oc create -f <(sed -e "s/<>/${AMQP_PASS}/;" "${REL}/qdr-test.conf.yaml.template") oc create -f "${REL}/qdr-test.yaml" +echo -e "\n* [INFO] Waiting for OSP Metrics QDR pod to be Running\n" +oc wait --for=jsonpath='{.status.phase}'=Running pod/qdr-test + echo "*** [INFO] Creating smoketest jobs..." oc delete job -l app=stf-smoketest for NAME in "${CLOUDNAMES[@]}"; do From d3d8ee5e881a0897387038ad050dc1cc1df33599 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 14 Nov 2023 19:08:23 +0000 Subject: [PATCH 12/70] [zuul] Add jobs to test with different versions of OCP (#432) * Add crc_ocp_bundle value to select OCP version * zuul: add log collection post-task to get crc logs * Add ocp v13 and a timeout to the job --- .zuul.yaml | 51 +++++++++++++++++++++++++++++++++------- ci/post-collect_logs.yml | 2 +- 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index bc3d54a7d..521b016c2 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -17,7 +17,6 @@ - name: github.com/infrawatch/sg-bridge - name: github.com/infrawatch/sg-core - name: github.com/infrawatch/prometheus-webhook-snmp - pre-run: - ci/prepare.yml run: @@ -33,31 +32,67 @@ crc_parameters: '--memory 16000 --disk-size 80 --cpus 6' # Increase from 14336 - job: - name: stf-crc-latest-nightly_bundles + name: stf-crc-nightly_bundles parent: stf-base - description: - Deploy STF nightly bundles + abstract: true + description: | + Deploy STF using the nightly bundles vars: scenario: "nightly_bundles" - job: - name: stf-crc-latest-local_build + name: stf-crc-local_build parent: stf-base + abstract: true description: | Build images locally and deploy STF vars: scenario: "local_build" +- job: + name: stf-crc-ocp_412-nightly_bundles + parent: stf-crc-nightly_bundles + description: | + Deploy STF using the nightly bundles 
on OCP 4.12 + vars: + crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.12.13/crc_libvirt_4.12.13_amd64.crcbundle' + +- job: + name: stf-crc-ocp_413-nightly_bundles + parent: stf-crc-nightly_bundles + description: | + Deploy STF using the nightly bundles on OCP 4.13 + vars: + crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.13.14/crc_libvirt_4.13.14_amd64.crcbundle' + +- job: + name: stf-crc-ocp_412-local_build + parent: stf-crc-local_build + description: | + Build images locally and deploy STF on OCP 4.12 + vars: + crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.12.13/crc_libvirt_4.12.13_amd64.crcbundle' + +- job: + name: stf-crc-ocp_413-local_build + parent: stf-crc-local_build + description: | + Build images locally and deploy STF on OCP 4.13 + vars: + crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.13.14/crc_libvirt_4.13.14_amd64.crcbundle' + - project-template: name: stf-crc-jobs description: | STF CRC jobs that build and deploy STF github-check: jobs: - - stf-crc-latest-nightly_bundles - - stf-crc-latest-local_build + - stf-crc-ocp_412-nightly_bundles + - stf-crc-ocp_412-local_build + - stf-crc-ocp_413-nightly_bundles + - stf-crc-ocp_413-local_build - project: name: infrawatch/service-telemetry-operator templates: - - stf-crc-jobs + - stf-crc-jobs \ No newline at end of file diff --git a/ci/post-collect_logs.yml b/ci/post-collect_logs.yml index 4340f2eec..11b27f109 100644 --- a/ci/post-collect_logs.yml +++ b/ci/post-collect_logs.yml @@ -46,7 +46,7 @@ ansible.builtin.shell: | cp {{ ansible_env.HOME }}/*.log . 
args: - chdir: "{{ ansible_user_dir }}/zuul-output/logs/controller" + chdir: "{{ logfile_dir }}" changed_when: true ignore_errors: true From f3cbf32adc762b23cc5f4d6c5152ca11c1dd2361 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 14 Nov 2023 14:43:27 -0500 Subject: [PATCH 13/70] Update README for 17.1 IR test (#533) * Update README for 17.1 IR test Update the 17.1 infrared test script README to show how to deploy a virtualized workload on the deployed overcloud infrastructure. Helps with testing by providing additional telemetry to STF required in certain dashboards. * Update tests/infrared/17.1/README.md Co-authored-by: Chris Sibbitt * Update tests/infrared/17.1/README.md --------- Co-authored-by: Chris Sibbitt --- tests/infrared/17.1/README.md | 74 +++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 13 deletions(-) diff --git a/tests/infrared/17.1/README.md b/tests/infrared/17.1/README.md index 0db5c0734..b1ab53ac3 100644 --- a/tests/infrared/17.1/README.md +++ b/tests/infrared/17.1/README.md @@ -2,16 +2,64 @@ ## Basic deployment -```bash -OCP_ROUTE_IP="10.0.100.50" \ -CA_CERT_FILE_CONTENT="$(oc get secret/default-interconnect-selfsigned -o jsonpath='{.data.ca\.crt}' | base64 -d)" \ -AMQP_HOST="$(oc get route default-interconnect-5671 -ojsonpath='{.spec.host}')" \ -AMQP_PASS="$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d)" \ -ENABLE_STF_CONNECTORS=true \ -ENABLE_GNOCCHI_CONNECTORS=false \ -CONTROLLER_MEMORY="24000" \ -COMPUTE_CPU="6" \ -COMPUTE_MEMORY="24000" \ -LIBVIRT_DISKPOOL="/home/libvirt/images" \ -./infrared-openstack.sh -``` +A basic deployment can be deployed and connected to an existing STF deployment automatically after logging into the OpenShift cluster hosting STF from the host system. + +### Prequisites + +* Logged into the host system where you'll deploy the virtualized OpenStack infrastructure. +* Installed infrared and cloned the infrawatch/service-telemetry-operator repository. 
+* DNS resolution (or `/etc/hosts` entry) of the OpenShift cluster API endpoint. +* Downloaded the `oc` binary, made it executable, and placed in $PATH. +* Logged into the OpenShift hosting STF and changed to the `service-telemetry` project from the host system. + +### Procedure + +* Deploy the overcloud using the infrared-openstack.sh script: + ```bash + OCP_ROUTE_IP="10.0.111.41" \ + CA_CERT_FILE_CONTENT="$(oc get secret/default-interconnect-selfsigned -o jsonpath='{.data.ca\.crt}' | base64 -d)" \ + AMQP_HOST="$(oc get route default-interconnect-5671 -ojsonpath='{.spec.host}')" \ + AMQP_PASS="$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d)" \ + ENABLE_STF_CONNECTORS=true \ + ENABLE_GNOCCHI_CONNECTORS=false \ + CONTROLLER_MEMORY="24000" \ + COMPUTE_CPU="6" \ + COMPUTE_MEMORY="24000" \ + LIBVIRT_DISKPOOL="/home/libvirt/images" \ + ./infrared-openstack.sh + ``` + +## Running a test workload + +You can run a test workload on the deployed overcloud by logging into the undercloud and completing some additional setup to allow for virtual machine workloads to run.
+ +### Procedure + +* Login to the undercloud from the host system: + ```bash + ir ssh undercloud-0 + ``` +* Complete the deployment of a private network, router, and other aspects to allow the virtual machine to be deployed: + ```bash + source overcloudrc + export PRIVATE_NETWORK_CIDR=192.168.100.0/24 + openstack flavor create --ram 512 --disk 1 --vcpu 1 --public tiny + curl -L -O https://download.cirros-cloud.net/0.5.0/cirros-0.5.0-x86_64-disk.img + openstack image create cirros --container-format bare --disk-format qcow2 --public --file cirros-0.5.0-x86_64-disk.img + openstack keypair create --public-key ~/.ssh/id_rsa.pub default + openstack security group create basic + openstack security group rule create basic --protocol tcp --dst-port 22:22 --remote-ip 0.0.0.0/0 + openstack security group rule create --protocol icmp basic + openstack security group rule create --protocol udp --dst-port 53:53 basic + openstack network create --internal private + openstack subnet create private-net \ + --subnet-range $PRIVATE_NETWORK_CIDR \ + --network private + openstack router create vrouter + openstack router set vrouter --external-gateway public + openstack router add subnet vrouter private-net + openstack server create --flavor tiny --image cirros --key-name default --security-group basic --network private myserver + until [ "$(openstack server list --name myserver --column Status --format value)" = "ACTIVE" ]; do echo "Waiting for server to be ACTIVE..."; sleep 10; done + openstack server add floating ip myserver $(openstack floating ip create public --format json | jq .floating_ip_address | tr -d '"') + openstack server list + ``` From cba3874b69a20e72ce8939ac49c9616c16711392 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 16 Nov 2023 11:19:54 -0500 Subject: [PATCH 14/70] Support OCP v4.12 through v4.14 (#535) Support STF 1.5.3 starting at OpenShift version 4.12 due to incompatibility with 4.11 due to dependency requirements. 
Our primary target is support of OCP EUS releases. Closes: STF-1632 --- deploy/olm-catalog/service-telemetry-operator/Dockerfile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index cbe2ccbf3..871edc3c1 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="v4.11-v4.14" +LABEL com.redhat.openshift.versions="v4.12-v4.14" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ From 2cccdc6a25f288a7401f16ad3ee11ae63edb59a2 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 17 Nov 2023 13:48:44 +0000 Subject: [PATCH 15/70] [stf-collect-logs] Add ignore_errors to task (#529) The "Question the deployment" task didn't have ignore_errors: true set, so when the task fails, the play is finished. This means that we don't get to the "copy logs" task and can't see the job logs in zuul. 
ignore_errors is set to true to be consistent with other tasks --- build/stf-collect-logs/tasks/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/build/stf-collect-logs/tasks/main.yml b/build/stf-collect-logs/tasks/main.yml index 07c602840..25d9681b5 100644 --- a/build/stf-collect-logs/tasks/main.yml +++ b/build/stf-collect-logs/tasks/main.yml @@ -47,6 +47,7 @@ oc -n {{ namespace }} get csv | grep service-telemetry-operator >> {{ logfile_dir }}/post_question_deployment.log 2>&1 oc -n {{ namespace }} get csv $(oc -n {{ namespace }} get csv | grep "service-telemetry-operator" | awk '{ print $1}') -oyaml >> {{ logfile_dir }}/post_question_deployment.log 2>&1 register: output + ignore_errors: true retries: 3 delay: 10 From 2fc9c6ca8693571f6550a3caf32390c548ed6515 Mon Sep 17 00:00:00 2001 From: Marihan Girgis <102027102+mgirgisf@users.noreply.github.com> Date: Fri, 17 Nov 2023 15:33:16 +0100 Subject: [PATCH 16/70] Mgirgisf/stf 1580/fix log commands (#526) * update stf-collect-logs tasks * Update log path * solve log bugs in stf-run-ci tasks * create log directory --- build/run-ci.yaml | 9 ++++- build/stf-collect-logs/tasks/main.yml | 45 ++++++--------------- build/stf-collect-logs/vars/main.yml | 6 +++ build/stf-run-ci/tasks/main.yml | 6 +-- build/stf-run-ci/tasks/preflight_checks.yml | 7 +++- 5 files changed, 35 insertions(+), 38 deletions(-) diff --git a/build/run-ci.yaml b/build/run-ci.yaml index bfd07c3cb..a81fcd8aa 100644 --- a/build/run-ci.yaml +++ b/build/run-ci.yaml @@ -7,9 +7,14 @@ - name: Run the STF CI system import_role: name: stf-run-ci - + + - name: Create Log directory + file: + path: "{{ playbook_dir }}/working/logs" + state: directory + - name: Collect the logs import_role: name: stf-collect-logs vars: - logfile_dir: "{{ playbook_dir }}/" + logfile_dir: "{{ playbook_dir }}/working/logs/" diff --git a/build/stf-collect-logs/tasks/main.yml b/build/stf-collect-logs/tasks/main.yml index 25d9681b5..10fb0c97b 100644 --- 
a/build/stf-collect-logs/tasks/main.yml +++ b/build/stf-collect-logs/tasks/main.yml @@ -1,33 +1,22 @@ --- -- name: "Get builds" - ansible.builtin.shell: - cmd: | - echo "*** [INFO] Showing oc get builds" > {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - oc -n {{ namespace }} get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - echo "*** [INFO] Showing oc get builds -oyaml" >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - oc -n {{ namespace }} get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - cat {{ logfile_dir }}/post_oc_get_builds.log +- name: "Get resources logs [ Builds, Subscriptions, Image, Imagestreams, Pods ]" + ansible.builtin.shell: | + for resource in {{ resource_types|join(' ') }}; do + log_file="{{ logfile_dir }}/post_oc_get_$resource.log" + echo "*** [INFO] Showing oc get '$resource'" > "$log_file" 2>&1 + oc -n {{ namespace }} get "$resource" >> "$log_file" 2>&1 + echo "[INFO] oc get '$resource' -oyaml" >> "$log_file" 2>&1 + oc -n {{ namespace }} get "$resource" -oyaml >> "$log_file" 2>&1 + done + delay: 10 + retries: 3 ignore_errors: true changed_when: false -- name: "Get subscription details" +- name: "Get Additional Information details" ansible.builtin.shell: cmd: | - oc -n {{ namespace }} get subscriptions > {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 - oc -n {{ namespace }} describe subscription service-telemetry-operator >> {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 - ignore_errors: true - -- name: "Get image infos" - ansible.builtin.shell: - cmd: | - echo "[INFO] oc get images" > {{ logfile_dir }}/post_oc_get_images.log 2>&1 - oc -n {{ namespace }} get images >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - echo "[INFO] oc get imagestreams" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - oc -n {{ namespace }} get imagestream >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - echo "[INFO] oc get imagestream -oyaml" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - oc -n {{ namespace }} 
get imagestream -oyaml >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - retries: 3 - delay: 10 + oc -n {{ namespace }} describe subscription service-telemetry-operator >> {{ logfile_dir }}/post_oc_describe_subscriptions_STO.log 2>&1 ignore_errors: true - name: "Get STO info" @@ -51,14 +40,6 @@ retries: 3 delay: 10 -- name: "Get pods" - ansible.builtin.command: - cmd: | - oc -n {{ namespace }} get pods > {{ logfile_dir }}/post_oc_get_pods.log 2>&1 - ignore_errors: true - retries: 3 - delay: 10 - - name: "Describe non-completed, non-running pods" ansible.builtin.shell: cmd: | diff --git a/build/stf-collect-logs/vars/main.yml b/build/stf-collect-logs/vars/main.yml index 5197b0284..dbf668d77 100644 --- a/build/stf-collect-logs/vars/main.yml +++ b/build/stf-collect-logs/vars/main.yml @@ -1,2 +1,8 @@ --- # vars file for stf-collect-logs +resource_types: + - builds + - subscriptions + - images + - imagestream + - pods \ No newline at end of file diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 7d64ff56e..503dd279c 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -216,11 +216,11 @@ - name: Validate system is operational ansible.builtin.shell: | OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ __service_telemetry_observability_strategy }}" timeout 1200 "{{ base_dir }}/validate_deployment.sh" >> {{ logfile_dir }}/validate_deployment.log 2>&1 + cat {{ logfile_dir }}/validate_deployment.log args: executable: /bin/bash register: validate_deployment - name: Show the result of the validate_deployment script - ansible.builtin.shell: - cmd: | - cat {{ logfile_dir }}/validate_deployment.log + ansible.builtin.debug: + var: validate_deployment.stdout diff --git a/build/stf-run-ci/tasks/preflight_checks.yml b/build/stf-run-ci/tasks/preflight_checks.yml index 5c68b5405..1902c0793 100644 --- a/build/stf-run-ci/tasks/preflight_checks.yml +++ b/build/stf-run-ci/tasks/preflight_checks.yml @@ -19,8 +19,13 @@ 
ansible.builtin.command: cmd: | oc describe csv $(oc get csv | grep "service-telemetry-operator" | awk '{print $1}') > {{ logfile_dir }}/oc_get_csv_sto.log 2>&1 - cat {{ logfile_dir }} + cat {{ logfile_dir }}/oc_get_csv_sto.log + register: csv_sto + - name: "Show service-telemetry-operator CSV information" + ansible.builtin.debug: + var: csv_sto.stdout + - name: "Show fail message if CSV isn't Succeeded after the alotted time" ansible.builtin.fail: msg: "Service Telemetry Operator CSV not Succeeded after 10 minutes. Check {{ logfile_dir }}/oc_get_csv_sto.log for more information" From 77dea87dec0656c48d0c7e0b9a09d0fe826425aa Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Mon, 20 Nov 2023 09:43:03 -0500 Subject: [PATCH 17/70] Adjust Operator dependency version requirements (#538) Adjust the operator package dependency requirements to align to known required versions. Primarily reduce the version of openshift-cert-manager from 1.10 to 1.7 in order to support the tech-preview channel which was previously used. Lowering the version requirement allows for the openshift-cert-manager-operator installed previously to be used during the STF 1.5.2 to 1.5.3 update, removing the update from being blocked. 
Related: STF-1636 --- .../service-telemetry-operator/metadata/properties.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml index 2a0d93436..2d950b055 100644 --- a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml @@ -27,7 +27,7 @@ properties: - failureMessage: Package openshift-cert-manager-operator is needed for AMQ Interconnect setup package: packageName: openshift-cert-manager-operator - versionRange: '>=1.10.0' + versionRange: '>=1.7.0' - type: olm.constraint value: failureMessage: Require Prometheus backend for data storage of metrics for Service Telemetry Framework @@ -38,7 +38,7 @@ properties: versionRange: '>=0.56.0' - package: packageName: observability-operator - versionRange: '>=0.0.1' + versionRange: '>=0.0.25' - package: packageName: cluster-observability-operator - versionRange: '>=0.0.1' + versionRange: '>=0.1.0' From 6ec92f3f3b727d75cf5f22a0790b84fb711a7ca6 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 21 Nov 2023 14:22:09 -0500 Subject: [PATCH 18/70] Clean up stf-run-ci for OCP 4.12 minimum version (#539) Update the stf-run-ci base setup to no longer need testing against OCP 4.10 and earlier, meaning we can rely on a single workflow for installation. Also update the deployment to use cluster-observability-operator via the redhat-operators CatalogSource for installation via use_redhat and use_hybrid strategies. 
--- build/stf-run-ci/tasks/setup_base.yml | 52 +++------------------------ 1 file changed, 5 insertions(+), 47 deletions(-) diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index cf9c92fdf..3ad34f1f7 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -28,47 +28,7 @@ targetNamespaces: - "{{ namespace }}" -# deploy cert-manager from tech-preview when using versions of OCP < 4.12 -- when: not __deploy_from_index_enabled | bool and ocp_ver.stdout is version ('4.12', '<') - block: - - name: Create openshift-cert-manager-operator namespace - kubernetes.core.k8s: - definition: - apiVersion: project.openshift.io/v1 - kind: Project - metadata: - name: openshift-cert-manager-operator - spec: - finalizers: - - kubernetes - - - name: Create openshift-cert-manager-operator OperatorGroup - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1 - kind: OperatorGroup - metadata: - name: openshift-cert-manager-operator - namespace: openshift-cert-manager-operator - spec: {} - - - name: Subscribe to Cert Manager for OpenShift Operator - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - name: openshift-cert-manager-operator - namespace: openshift-cert-manager-operator - spec: - channel: "tech-preview" - installPlanApproval: Automatic - name: openshift-cert-manager-operator - source: redhat-operators - sourceNamespace: openshift-marketplace - -# deploy cert-manager from stable-v1 in 4.12 and later using namespace scoped operator -- when: not __deploy_from_index_enabled | bool and ocp_ver.stdout is version ('4.12', '>=') +- when: not __deploy_from_index_enabled | bool block: - name: Subscribe to Cert Manager for OpenShift Operator kubernetes.core.k8s: @@ -87,8 +47,6 @@ source: redhat-operators sourceNamespace: openshift-marketplace -- when: not __deploy_from_index_enabled | bool - block: - name: Subscribe to AMQ 
Interconnect Operator kubernetes.core.k8s: definition: @@ -129,13 +87,13 @@ metadata: labels: operators.coreos.com/observability-operator.openshift-operators: "" - name: observability-operator + name: cluster-observability-operator namespace: openshift-operators spec: - channel: stable + channel: development installPlanApproval: Automatic - name: observability-operator - source: community-operators + name: cluster-observability-operator + source: redhat-operators sourceNamespace: openshift-marketplace when: - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] From cd256468e714197337a3cd551fa22d191139c8ce Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 24 Nov 2023 11:28:12 +0000 Subject: [PATCH 19/70] [zuul] Add job to build locally and do an index-based deployment (#495) * [zuul] Add job to build locally and do an index-based deployment --- .zuul.yaml | 31 +++++++++- build/stf-run-ci/tasks/create_catalog.yml | 73 ++++++++++++++--------- ci/vars-local_build-index_deploy.yml | 6 ++ 3 files changed, 81 insertions(+), 29 deletions(-) create mode 100644 ci/vars-local_build-index_deploy.yml diff --git a/.zuul.yaml b/.zuul.yaml index 521b016c2..2c1b66282 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -49,6 +49,15 @@ vars: scenario: "local_build" +- job: + name: stf-crc-local_build-index_deploy + parent: stf-base + abstract: true + description: | + Build STF locally and deploy from index + vars: + scenario: "local_build-index_deploy" + - job: name: stf-crc-ocp_412-nightly_bundles parent: stf-crc-nightly_bundles @@ -81,6 +90,22 @@ vars: crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.13.14/crc_libvirt_4.13.14_amd64.crcbundle' +- job: + name: stf-crc-ocp_412-local_build-index_deploy + parent: stf-crc-local_build-index_deploy + description: | + Build STF locally and deploy from index on OCP 4.12 + vars: + crc_ocp_bundle: 
'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.12.13/crc_libvirt_4.12.13_amd64.crcbundle' + +- job: + name: stf-crc-ocp_413-local_build-index_deploy + parent: stf-crc-local_build-index_deploy + description: | + Build STF locally and deploy from index on OCP 4.13 + vars: + crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.13.14/crc_libvirt_4.13.14_amd64.crcbundle' + - project-template: name: stf-crc-jobs description: | @@ -89,10 +114,12 @@ jobs: - stf-crc-ocp_412-nightly_bundles - stf-crc-ocp_412-local_build + - stf-crc-ocp_412-local_build-index_deploy - stf-crc-ocp_413-nightly_bundles - stf-crc-ocp_413-local_build - + - stf-crc-ocp_413-local_build-index_deploy + - project: name: infrawatch/service-telemetry-operator templates: - - stf-crc-jobs \ No newline at end of file + - stf-crc-jobs diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index 6a464afd9..99ad398d1 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -38,9 +38,7 @@ register: index_dockercfg_secret ignore_errors: true -# There's an error when the requested resource doesn't exist, so check the rc -- when: index_dockercfg_secret.rc != 0 - block: +- block: - name: Create config.json to import as Secret ansible.builtin.template: variable_start_string: "<<" @@ -58,9 +56,32 @@ register: ose_op_registry_is ignore_errors: true -- name: Create ImageStream for ose-operator-registry - ansible.builtin.command: oc import-image -n {{ namespace }} ose-operator-registry:{{ default_operator_registry_image_tag }} --from={{ default_operator_registry_image_base }}:{{ default_operator_registry_image_tag }} --confirm +- name: Set the operator_registry_image + ansible.builtin.set_fact: + operator_registry_image: "{{ default_operator_registry_image_base }}:{{ default_operator_registry_image_tag }}" + + # --show-multiarch=true is used because you get 
an error (and rc!=0) when you query a multi-arch image without specifying the arch, even when the image exists +- name: "Try to get the image info for the operator registry image" + ansible.builtin.command: + cmd: oc image info --show-multiarch=true "{{ operator_registry_image }}" + ignore_errors: true + register: image_info + +- name: Test alternative operator image + ansible.builtin.set_fact: + operator_registry_image: "quay.io/openshift/origin-operator-registry:4.13" + when: image_info.rc != 0 + +- name: Create ImageStream for ose-operator-registry, if it doesn't already exist + ansible.builtin.command: + cmd: | + oc import-image -n {{ namespace }} ose-operator-registry:{{ default_operator_registry_image_tag }} --from={{ operator_registry_image }} --confirm when: ose_op_registry_is.rc != 0 + register: create_ose_is + +- name: Show the image stream + ansible.builtin.debug: + var: create_ose_is - name: Delete the existing imagestream, if it exists ansible.builtin.command: oc delete imagestream -n {{ namespace }} service-telemetry-framework-index @@ -75,8 +96,16 @@ register: stf_index_imagestream ignore_errors: true -- when: stf_index_imagestream.rc != 0 - name: Create BuildConfig for service-telemetry-framework-index +- name: Show STF index image stream + ansible.builtin.debug: + var: stf_index_imagestream + +- name: Create index.yaml base for index image + ansible.builtin.template: + src: index-yaml.j2 + dest: "{{ base_dir }}/working/service-telemetry-framework-index/index.yaml" + +- name: Create BuildConfig for service-telemetry-framework-index kubernetes.core.k8s: definition: apiVersion: build.openshift.io/v1 @@ -102,7 +131,8 @@ dockerfile: | # The base image is expected to contain # /bin/opm (with a serve subcommand) and /bin/grpc_health_probe - FROM {{default_operator_registry_image_base}}:{{default_operator_registry_image_tag}} + + FROM {{ operator_registry_image }} COPY --chmod=666 index.yaml /configs/ @@ -121,7 +151,7 @@ dockerStrategy: from: kind: 
ImageStreamTag - name: "ose-operator-registry:{{default_operator_registry_image_tag}}" + name: "ose-operator-registry:{{ default_operator_registry_image_tag }}" volumes: - mounts: - destinationPath: /opt/app-root/auth @@ -134,24 +164,13 @@ type: Docker successfulBuildsHistoryLimit: 5 -- name: Get builds of service-telemetry-framework-index - kubernetes.core.k8s_info: - api_version: build.openshift.io/v1 - kind: Build - namespace: "{{ namespace }}" - label_selectors: - - "build=service-telemetry-framework-index" - register: index_builds - -- when: index_builds.resources | length == 0 - block: - - name: Create index.yaml base for index image - ansible.builtin.template: - src: index-yaml.j2 - dest: "{{ base_dir }}/working/service-telemetry-framework-index/index.yaml" - - - name: Build service-telemetry-framework-index - ansible.builtin.command: oc start-build -n "{{ namespace }}" service-telemetry-framework-index --wait --from-dir {{ base_dir }}/working/service-telemetry-framework-index +- name: Build service-telemetry-framework-index + ansible.builtin.command: oc start-build -n "{{ namespace }}" service-telemetry-framework-index --wait --follow --from-dir {{ base_dir }}/working/service-telemetry-framework-index + register: build_result + ignore_errors: true + retries: 3 + delay: 10 + until: build_result.rc == 0 - name: Create CloudOps CatalogSource kubernetes.core.k8s: diff --git a/ci/vars-local_build-index_deploy.yml b/ci/vars-local_build-index_deploy.yml new file mode 100644 index 000000000..0404049b4 --- /dev/null +++ b/ci/vars-local_build-index_deploy.yml @@ -0,0 +1,6 @@ +--- +#ansible-playbook --extra-vars __local_build_enabled=true -e __deploy_from_index_enabled=true --extra-vars working_branch="$(git rev-parse --abbrev-ref HEAD)" --extra-vars __service_telemetry_storage_ephemeral_enabled=true --extra-vars __service_telemetry_observability_strategy=use_redhat ./run-ci.yaml +__local_build_enabled: true +__deploy_from_index_enabled: true 
+__service_telemetry_ephemeral_enabled: true +__service_telemetry_observability_strategy: use_redhat From ba9c918ec3e5344a599a911c8ef616ae88c55227 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 24 Nov 2023 15:17:23 -0500 Subject: [PATCH 20/70] Only require Interconnect and Smart Gateway (#541) * Only require Interconnect and Smart Gateway Update the dependency management within Service Telemetry Operator to only require AMQ Interconnect and Smart Gateway Operator, which is enough to deploy STF with observabilityStrategy: none. Other Operators can be installed in order to satisfy data storage of telemetry and events. Installation of cert-manager is also required, but needs to be pre-installed similar to Cluster Observability Operator, either as a cluster-scoped operator with the tech-preview channel, or a single time on the cluster as a namespace scoped operator, which is how the stable-v1 channel installs. Documentation will be updated to adjust for this change. Related: STF-1636 * Perform CI update to match docs install changes (#542) * Perform CI update to match docs install changes Update the stf-run-ci scripting to match the documented installation procedures which landed in https://github.com/infrawatch/documentation/pull/513. These changes are also reflected in #541. * Update build/stf-run-ci/tasks/setup_base.yml Co-authored-by: Emma Foley --------- Co-authored-by: Emma Foley * Also drop cert-manager project The cert-manager project gets created with workload items when deploying the cert-manager from the cert-manager-operator project. When removing cert-manager this project is not cleaned up, so we need to delete it as well. 
--------- Co-authored-by: Emma Foley --- build/stf-run-ci/tasks/create_catalog.yml | 2 +- build/stf-run-ci/tasks/pre-clean.yml | 7 +- build/stf-run-ci/tasks/preflight_checks.yml | 2 +- build/stf-run-ci/tasks/setup_base.yml | 162 +++++++++++------- build/stf-run-ci/tasks/setup_stf.yml | 15 +- .../metadata/properties.yaml | 23 --- 6 files changed, 113 insertions(+), 98 deletions(-) diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index 99ad398d1..feed3b56f 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -189,4 +189,4 @@ securityContextConfig: legacy updateStrategy: registryPoll: - interval: 1m + interval: 5m diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index 8e6df8bef..712d188bf 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -122,7 +122,6 @@ name: smart-gateway-operator-catalog namespace: "{{ namespace }}" -# Remove the cert manager since we install it as part of the CI/documented pre-install process - name: Remove openshift-cert-manager-operator namespace kubernetes.core.k8s: state: absent @@ -131,7 +130,11 @@ apiVersion: project.openshift.io/v1 kind: Project metadata: - name: openshift-cert-manager-operator + name: "{{ item }}" + loop: + - openshift-cert-manager-operator + - cert-manager-operator + - cert-manager - name: Remove Elasticsearch ignore_errors: true diff --git a/build/stf-run-ci/tasks/preflight_checks.yml b/build/stf-run-ci/tasks/preflight_checks.yml index 1902c0793..cd42c76c9 100644 --- a/build/stf-run-ci/tasks/preflight_checks.yml +++ b/build/stf-run-ci/tasks/preflight_checks.yml @@ -25,7 +25,7 @@ - name: "Show service-telemetry-operator CSV information" ansible.builtin.debug: var: csv_sto.stdout - + - name: "Show fail message if CSV isn't Succeeded after the alotted time" ansible.builtin.fail: msg: "Service Telemetry Operator CSV not Succeeded after 10 
minutes. Check {{ logfile_dir }}/oc_get_csv_sto.log for more information" diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 3ad34f1f7..bb6667184 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -16,6 +16,7 @@ - disabled: false name: community-operators +# documented procedure: https://infrawatch.github.io/documentation/#deploying-service-telemetry-operator_assembly-installing-the-core-components-of-stf - name: Create OperatorGroup for service-telemetry kubernetes.core.k8s: definition: @@ -28,57 +29,7 @@ targetNamespaces: - "{{ namespace }}" -- when: not __deploy_from_index_enabled | bool - block: - - name: Subscribe to Cert Manager for OpenShift Operator - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - labels: - operators.coreos.com/openshift-cert-manager-operator.service-telemetry: "" - name: openshift-cert-manager-operator-stable-v1-redhat-operators-openshift-marketplace - namespace: "{{ namespace }}" - spec: - channel: stable-v1 - installPlanApproval: Automatic - name: openshift-cert-manager-operator - source: redhat-operators - sourceNamespace: openshift-marketplace - - - name: Subscribe to AMQ Interconnect Operator - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - name: amq7-interconnect-operator - namespace: "{{ namespace }}" - spec: - channel: 1.10.x - installPlanApproval: Automatic - name: amq7-interconnect-operator - source: redhat-operators - sourceNamespace: openshift-marketplace - - - name: Subscribe to Prometheus Operator - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - name: prometheus - namespace: "{{ namespace }}" - spec: - channel: beta - installPlanApproval: Automatic - name: prometheus - source: community-operators - sourceNamespace: openshift-marketplace 
- when: - - __service_telemetry_observability_strategy == "use_community" - +# documented procedure: https://infrawatch.github.io/documentation/#deploying-observability-operator_assembly-installing-the-core-components-of-stf - name: Subscribe to Red Hat Obervability Operator kubernetes.core.k8s: definition: @@ -98,27 +49,108 @@ when: - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] -- name: Subscribe to Elastic Cloud on Kubernetes Operator +# undocumented procedure, used for testing updates or old deployment models +- name: Subscribe to Prometheus Operator kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - name: elasticsearch-eck-operator-certified + name: prometheus namespace: "{{ namespace }}" spec: - channel: stable + channel: beta installPlanApproval: Automatic - name: elasticsearch-eck-operator-certified - source: certified-operators + name: prometheus + source: community-operators sourceNamespace: openshift-marketplace + when: + - __service_telemetry_observability_strategy == "use_community" + +# documented procedure: https://infrawatch.github.io/documentation/#deploying-certificate-manager-for-openshift-operator_assembly-installing-the-core-components-of-stf +- block: + - name: Create project for cert-manager for Red Hat OpenShift + kubernetes.core.k8s: + definition: + apiVersion: project.openshift.io/v1 + kind: Project + metadata: + name: cert-manager-operator + spec: + finalizers: + - kubernetes + + - name: Create OperatorGroup for cert-manager for Red hat OpenShift + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: cert-manager-operator + namespace: cert-manager-operator + spec: + targetNamespaces: + - cert-manager-operator + upgradeStrategy: Default + + - name: Subscribe to cert-manager for Red Hat OpenShift Operator + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: 
Subscription + metadata: + labels: + operators.coreos.com/openshift-cert-manager-operator.service-telemetry: "" + name: openshift-cert-manager-operator + namespace: cert-manager-operator + spec: + channel: stable-v1 + installPlanApproval: Automatic + name: openshift-cert-manager-operator + source: redhat-operators + sourceNamespace: openshift-marketplace + +# installed by properties.yaml definition as of STF 1.5.3 +- when: not __deploy_from_index_enabled | bool + block: + - name: Subscribe to AMQ Interconnect Operator + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: amq7-interconnect-operator + namespace: "{{ namespace }}" + spec: + channel: 1.10.x + installPlanApproval: Automatic + name: amq7-interconnect-operator + source: redhat-operators + sourceNamespace: openshift-marketplace + +# undocumented procedure: used for backwards compatilibity verification +- block: + - name: Subscribe to Elastic Cloud on Kubernetes Operator + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: elasticsearch-eck-operator-certified + namespace: "{{ namespace }}" + spec: + channel: stable + installPlanApproval: Automatic + name: elasticsearch-eck-operator-certified + source: certified-operators + sourceNamespace: openshift-marketplace -- name: Wait for Elasticsearch CRD to appear - kubernetes.core.k8s_info: - api_version: apiextensions.k8s.io/v1 - kind: CustomResourceDefinition - name: elasticsearches.elasticsearch.k8s.elastic.co - register: eckCRD - until: eckCRD.resources[0] is defined - retries: 5 - delay: 30 + - name: Wait for Elasticsearch CRD to appear + kubernetes.core.k8s_info: + api_version: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + name: elasticsearches.elasticsearch.k8s.elastic.co + register: eckCRD + until: eckCRD.resources[0] is defined + retries: 5 + delay: 30 diff --git a/build/stf-run-ci/tasks/setup_stf.yml 
b/build/stf-run-ci/tasks/setup_stf.yml index e76eb1734..ffe366f6f 100644 --- a/build/stf-run-ci/tasks/setup_stf.yml +++ b/build/stf-run-ci/tasks/setup_stf.yml @@ -9,7 +9,7 @@ namespace: openshift-marketplace spec: displayName: InfraWatch Operators - image: quay.io/infrawatch-operators/infrawatch-catalog:unstable + image: quay.io/infrawatch-operators/infrawatch-catalog:nightly publisher: InfraWatch sourceType: grpc updateStrategy: @@ -49,6 +49,14 @@ sourceNamespace: openshift-marketplace when: service_telemetry_operator_subscription_manifest is not defined +# enable catalogsource +- name: Enable InfraWatch Catalog Source + kubernetes.core.k8s: + definition: + '{{ infrawatch_catalog_source_manifest }}' + +# subscribe to the Operators from the defined CatalogSource sources. +# STO will automatically install SGO via dependencies but pre-subscribe in case deployment from different CatalogSources is specified in an override (for testing purposes). - name: Subscribe to Smart Gateway Operator kubernetes.core.k8s: definition: @@ -58,8 +66,3 @@ kubernetes.core.k8s: definition: '{{ service_telemetry_operator_subscription_manifest }}' - -- name: Enable InfraWatch Catalog Source - kubernetes.core.k8s: - definition: - '{{ infrawatch_catalog_source_manifest }}' diff --git a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml index 2d950b055..5ffce5254 100644 --- a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml @@ -19,26 +19,3 @@ properties: package: packageName: amq7-interconnect-operator versionRange: '>=1.10.0' - - type: olm.constraint - value: - failureMessage: Require certificate management for Service Telemetry Framework - all: - constraints: - - failureMessage: Package openshift-cert-manager-operator is needed for AMQ Interconnect setup - package: - packageName: 
openshift-cert-manager-operator - versionRange: '>=1.7.0' - - type: olm.constraint - value: - failureMessage: Require Prometheus backend for data storage of metrics for Service Telemetry Framework - any: - constraints: - - package: - packageName: prometheus - versionRange: '>=0.56.0' - - package: - packageName: observability-operator - versionRange: '>=0.0.25' - - package: - packageName: cluster-observability-operator - versionRange: '>=0.1.0' From 8ffbe5a6fb0be0e369bba7852d8e08ed52f6001c Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 1 Dec 2023 16:10:38 +0000 Subject: [PATCH 21/70] [stf-run-ci] Explicitly check the validate_deployment was successful (#545) In [1], the validate_deployment step is successful, despite the deployment not being successful. This causes the job to time out because the following steps continue to run despite an invalid state. To get the expected behaviour, the output should be checked for a string indicating success. i.e. * [info] CI Build complete. You can now run tests. [2] shows the output for a successful run. [1] https://review.rdoproject.org/zuul/build/245ae63e41884dc09353d938ec9058d7/console#5/0/144/controller [2] https://review.rdoproject.org/zuul/build/802432b23da24649b818985b7b1633bb/console#5/0/82/controller --- build/stf-run-ci/tasks/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 503dd279c..90fe03684 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -220,7 +220,8 @@ args: executable: /bin/bash register: validate_deployment + failed_when: validate_deployment.stdout_lines[-1] != "* [info] CI Build complete. You can now run tests."
- name: Show the result of the validate_deployment script ansible.builtin.debug: - var: validate_deployment.stdout + var: validate_deployment.stdout_lines[-1] From 0d5ed29e6b2570a3caaa062355d576d2d38a691b Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 7 Dec 2023 15:18:34 -0500 Subject: [PATCH 22/70] Implement dashboard management (#548) * Implement dashboard management Implement a new configuration option graphing.grafana.dashboards.enabled which results in dashboards objects being created for the Grafana Operator. Previously loading dashboards would be done manually via 'oc apply' using instructions from documentation. The new CRD parameters to the ServiceTelemetry object allows the Service Telemetry Operator to now make the GrafanaDashboard objects directly. Related: OSPRH-825 * Drop unnecessary cluster roles * Update CSV for owned parameter --- .../infra.watch_servicetelemetrys_crd.yaml | 9 +- ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 5 +- .../infra.watch_servicetelemetrys_crd.yaml | 10 +- ...emetry-operator.clusterserviceversion.yaml | 6 +- roles/servicetelemetry/defaults/main.yml | 6 +- .../files/memcached-dashboard.json | 1513 ++++++++++++ .../files/rhos-cloud-dashboard.json | 1752 +++++++++++++ .../files/rhos-dashboard.json | 2179 +++++++++++++++++ .../files/virtual-machine-view.json | 1112 +++++++++ .../tasks/component_grafana.yml | 71 + 10 files changed, 6657 insertions(+), 6 deletions(-) create mode 100644 roles/servicetelemetry/files/memcached-dashboard.json create mode 100644 roles/servicetelemetry/files/rhos-cloud-dashboard.json create mode 100644 roles/servicetelemetry/files/rhos-dashboard.json create mode 100644 roles/servicetelemetry/files/virtual-machine-view.json diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index 286d2c74b..c29fe03ce 100644 --- a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -289,7 
+289,7 @@ spec: description: Whether to disable the Grafana signout menu type: boolean ingressEnabled: - description: Enable ingress access to Grafana + description: Whether to enable ingress access to Grafana type: boolean adminPassword: description: Grafana admin password @@ -301,6 +301,13 @@ spec: baseImage: description: Path to the base container image used to instantiate a Grafana instance type: string + dashboards: + description: Dashboard configurations for Grafana + properties: + enabled: + description: Whether to enable built-in dashboards provided by Service Telemetry Framework + type: boolean + type: object type: object type: object cloudsRemoveOnMissing: diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 8b4cf7142..ee728e2f4 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -79,10 +79,13 @@ spec: graphing: enabled: false grafana: - ingressEnabled: false + ingressEnabled: true adminPassword: secret adminUser: root disableSignoutMenu: false + baseImage: registry.redhat.io/rhel8/grafana:7 + dashboards: + enabled: true transports: qdr: enabled: true diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index f26cbc7b9..545ffd994 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -396,11 +396,19 @@ spec: description: Path to the base container image used to instantiate a Grafana instance type: string + dashboards: + description: Dashboard configurations for Grafana + properties: + enabled: + description: Whether to enable built-in dashboards provided + by Service Telemetry Framework + 
type: boolean + type: object disableSignoutMenu: description: Whether to disable the Grafana signout menu type: boolean ingressEnabled: - description: Enable ingress access to Grafana + description: Whether to enable ingress access to Grafana type: boolean type: object type: object diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 1e1fdc092..1052f8ba3 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -119,8 +119,12 @@ metadata: "grafana": { "adminPassword": "secret", "adminUser": "root", + "baseImage": "registry.redhat.io/rhel8/grafana:7", + "dashboards": { + "enabled": true + }, "disableSignoutMenu": false, - "ingressEnabled": false + "ingressEnabled": true } }, "highAvailability": { diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index e8e92d855..263480e00 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -82,11 +82,13 @@ servicetelemetry_defaults: graphing: enabled: false grafana: - ingress_enabled: false + ingress_enabled: true admin_password: secret admin_user: root disable_signout_menu: false - base_image: docker.io/grafana/grafana:8.1.2 + base_image: registry.redhat.io/rhel8/grafana:7 + dashboards: + enabled: true # 'clouds' object is not partially updatable like other objects. If 'clouds' # object is defined then the default is overwritten. 
diff --git a/roles/servicetelemetry/files/memcached-dashboard.json b/roles/servicetelemetry/files/memcached-dashboard.json new file mode 100644 index 000000000..e68a439b9 --- /dev/null +++ b/roles/servicetelemetry/files/memcached-dashboard.json @@ -0,0 +1,1513 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Tracking dashboard for memcached service", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 2, + "iteration": 1698247048278, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 16, + "panels": [], + "title": "Availability and connections", + "type": "row" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_connections{service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Current connections", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + 
}, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_uptime{service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_items{service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Items", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "8.0.4", + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_connections_total{service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Connection rate (1m)", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_total_events_total{service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Max connections reached", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + 
"y": 11 + }, + "id": 14, + "panels": [], + "title": "System metrics", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:89", + "alias": "/Rx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_octets_tx_total{service=~\".+-$clouds-.+\"}[1m])", + "hide": false, + "interval": "", + "legendFormat": "Tx {{ host }}", + "refId": "B" + }, + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_octets_rx_total{service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "Rx {{ host }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Transfer rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:62", + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:63", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + 
"alignLevel": null + } + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 42, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "collectd_libpodstats_pod_memory{plugin_instance=\"memcached\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Memory", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + 
"overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "collectd_libpodstats_pod_cpu_percent{plugin_instance=\"memcached\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "CPU percent", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 46, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_libpodstats_pod_cpu_time_total{plugin_instance=\"memcached\",service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "CPU time", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 22, + "panels": [], + "title": "Cache performance", + 
"type": "row" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 36, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "(collectd_memcached_df_free{service=~\".+-$clouds-.+\"} + collectd_memcached_df_used{service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Total cache available", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": 
"collectd_memcached_df_used{service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Cache usage over time", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_df_used{service=~\".+-$clouds-.+\"} / (collectd_memcached_df_free{service=~\".+-$clouds-.+\"} + collectd_memcached_df_used{service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Cache utilization", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 33, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_command_total{type_instance=\"get\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", 
+ "refId": "A" + } + ], + "title": "Total gets", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 35, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_command_total{type_instance=\"set\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Total sets", + "type": "stat" + }, + { + "datasource": null, + "description": "This is a calculated metric: get_hits / cmd_get. 
It indicates how efficient your Memcached server is.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_ops_total{type_instance=\"hits\",service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Hit rate", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + 
"overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_ops_total{type_instance=\"misses\",service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Miss rate", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 32, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_command_total{type_instance=\"flush\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Total flushes", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 40, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": 
"auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_ops_total{type_instance=\"evictions\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Total evictions", + "type": "stat" + }, + { + "datasource": null, + "description": "The flush_all command invalidates all items in the database. This operation incurs a performance penalty and shouldn’t take place in production, so check your debug scripts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_command_total{type_instance=\"flush\",service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Flush rate", + "type": "timeseries" + }, + { + "datasource": null, + "description": "An eviction is when an item that still has time to live is removed from the cache because a brand new item needs to be allocated.\nThe item is selected with a pseudo-LRU mechanism.\nA high number of evictions coupled 
with a low hit rate means your application is setting a large number of keys that are never used again.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 26, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_ops_total{type_instance=\"evictions\",service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Eviction rate", + "type": "timeseries" + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "isNone": true, + "selected": true, + "text": "None", + "value": "" + }, + "datasource": null, + "definition": "label_values(collectd_memcached_percent, service)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "cloud", + "multi": false, + "name": "clouds", + "options": [], + "query": { + "query": "label_values(collectd_memcached_percent, service)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/.+-(.+)-coll-meter/", + 
"skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Memcached View", + "uid": "VHbfxjinz", + "version": 3 +} diff --git a/roles/servicetelemetry/files/rhos-cloud-dashboard.json b/roles/servicetelemetry/files/rhos-cloud-dashboard.json new file mode 100644 index 000000000..d3ed49146 --- /dev/null +++ b/roles/servicetelemetry/files/rhos-cloud-dashboard.json @@ -0,0 +1,1752 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 3, + "iteration": 1695784064538, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": true, + "tags": [ + "cloud-dashboards" + ], + "targetBlank": true, + "title": "Cloud Dashboards", + "tooltip": "", + "type": "dashboards", + "url": "" + } + ], + "panels": [ + { + "cacheTimeout": null, + "cards": { + "cardPadding": 0, + "cardRound": null + }, + "color": { + "cardColor": "#37872D", + "colorScale": "linear", + "colorScheme": "interpolateReds", + "exponent": 0.5, + "max": 1, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 15, + "x": 0, + "y": 0 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 21, + "interval": "10m", + "legend": { + "show": false + }, + "links": [], + "pluginVersion": "6.5.1", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": " avg(sensubility_container_health_status{process=\"glance_api\", service=~\".+-$clouds-.+\"})", 
+ "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "glance_api", + "refId": "D" + }, + { + "alias": "", + "bucketAggs": [ + { + "field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"nova_api\", service=~\".+-$clouds-.+\"})", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "nova_api", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + }, + { + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"heat_api\", service=~\".+-$clouds-.+\"})", + "format": "heatmap", + "hide": false, + "interval": "", + "legendFormat": "heat_api", + "refId": "B" + }, + { + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"neutron_api\", service=~\".+-$clouds-.+\"})", + "format": "heatmap", + "hide": false, + "interval": "", + "legendFormat": "neutron_api", + "refId": "C" + }, + { + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"placement_api\", service=~\".+-$clouds-.+\"})", + "format": "heatmap", + "hide": false, + "interval": "", + "legendFormat": "placement_api", + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "2m", + "yAxis": { + "decimals": null, + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "middle", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + 
"overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 9, + "x": 15, + "y": 0 + }, + "hiddenSeries": false, + "id": 42, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk($top, sum by (plugin_instance, host) (collectd_libpodstats_pod_memory{service=~\".+-$clouds-.+\"}))", + "legendFormat": "{{plugin_instance}} on {{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Top $top Memory Consumers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 9, + "x": 15, + "y": 6 + }, + "hiddenSeries": false, + "id": 43, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + 
"percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk($top, avg_over_time(collectd_libpodstats_pod_cpu_percent{service=~\".+-$clouds-.+\"}[10m]))", + "legendFormat": "{{plugin_instance}} on {{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Top $top CPU Consumers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:169", + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": null, + "show": true + }, + { + "$$hashKey": "object:170", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 8 + }, + "id": 29, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + 
"field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"placement_api\", service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime placement_api", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 8 + }, + "id": 30, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"neutron_api\", service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime neutron_api", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 8 + }, + "id": 31, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"heat_api\", service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime heat_api", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 8 + }, + "id": 26, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", 
+ "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"nova_api\", service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime nova_api", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 8 + }, + "id": 32, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"glance_api\", service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "", + "metrics": [ + { + "id": "1", + 
"type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime glance_api", + "type": "stat" + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 6, + "panels": [], + "title": "Service Resource Usage", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "collectd_libpodstats_pod_cpu_percent{plugin_instance=\"horizon\", service=~\".+-$clouds-.+\"}", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Horizon CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:235", + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": null, + "show": true + }, + { + "$$hashKey": "object:236", + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": 
null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 13 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "collectd_libpodstats_pod_memory{plugin_instance=\"horizon\", service=~\".+-$clouds-.+\"}", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Horizon Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": 
false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (host) (collectd_libpodstats_pod_cpu_percent{plugin_instance=~\"nova.*\", service=~\".+-$clouds-.+\"})", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nova CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:293", + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": null, + "show": true + }, + { + "$$hashKey": "object:294", + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + 
"steppedLine": false, + "targets": [ + { + "expr": "sum by (host) (collectd_libpodstats_pod_memory{plugin_instance=~\"nova.*\", service=~\".+-$clouds-.+\"})", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nova Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 28 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (host) (collectd_libpodstats_pod_cpu_percent{plugin_instance=~\"ceilometer.*\", service=~\".+-$clouds-.+\"})", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ceilometer CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + 
"value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:465", + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": null, + "show": true + }, + { + "$$hashKey": "object:466", + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 28 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (host) (collectd_libpodstats_pod_memory{plugin_instance=~\"ceilometer.*\", service=~\".+-$clouds-.+\"})", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ceilometer Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 23, + "panels": [], + "title": "Instances", + "type": "row" + }, + { + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a", + "#4040a0" + ], + "datasource": "STFPrometheus", + "description": "Click instance for drill down view", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 17, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "polystat": { + "animationSpeed": 2500, + "columnAutoSize": true, + "columns": "", + "defaultClickThrough": "", + "defaultClickThroughNewTab": false, + "defaultClickThroughSanitize": false, + "displayLimit": 100, + "fontAutoColor": true, + "fontAutoScale": true, + "fontSize": 12, + "fontType": "Roboto", + "globalDecimals": 2, + "globalDisplayMode": "all", + "globalDisplayTextTriggeredEmpty": "OK", + "globalOperatorName": "avg", + "globalUnitFormat": "short", + "gradientEnabled": true, + "hexagonSortByDirection": 1, + "hexagonSortByField": "name", + "maxMetrics": 0, + "polygonBorderColor": "black", + "polygonBorderSize": 2, + "polygonGlobalFillColor": "#FFF899", + "radius": "", + "radiusAutoSize": true, + "rowAutoSize": true, + "rows": "", + "shape": "hexagon_pointed_top", + "tooltipDisplayMode": "all", + "tooltipDisplayTextTriggeredEmpty": "OK", + "tooltipFontSize": 12, + "tooltipFontType": "Roboto", + "tooltipPrimarySortDirection": 2, + "tooltipPrimarySortField": "thresholdLevel", + "tooltipSecondarySortDirection": 2, + "tooltipSecondarySortField": "value", + "tooltipTimestampEnabled": true, + "valueEnabled": true + }, + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "repeat": 
"projects", + "repeatDirection": "h", + "savedComposites": [], + "savedOverrides": [], + "targets": [ + { + "exemplar": true, + "expr": "sum by (resource, plugin_instance) (label_replace(collectd_virt_memory{service=~\".+-$clouds-.+\"}, \"resource\", \"$1\", \"host\", \".+:(.+):.+\")) + on(resource) group_right(plugin_instance) ceilometer_cpu{project=\"$projects\", service=~\".+-$clouds-.+\"}", + "instant": true, + "interval": "", + "legendFormat": "{{plugin_instance}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Project $projects", + "type": "grafana-polystat-panel", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ] + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [ + "cloud-dashboards" + ], + "templating": { + "list": [ + { + "allValue": null, + "datasource": null, + "definition": "label_values(collectd_cpu_percent, service)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "cloud", + "multi": false, + "name": "clouds", + "options": [], + "query": { + "query": "label_values(collectd_cpu_percent, service)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/.+-(.+)-coll-meter/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "STFPrometheus", + "definition": "label_values(ceilometer_cpu{service=~\".+-$clouds-.+\"},project)", + "description": null, + "error": null, + "hide": 2, + "includeAll": true, + "label": null, + "multi": false, + "name": "projects", + "options": [], + "query": { + "query": "label_values(ceilometer_cpu{service=~\".+-$clouds-.+\"},project)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + 
"tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "5", + "value": "5" + }, + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "top", + "options": [ + { + "selected": false, + "text": "1", + "value": "1" + }, + { + "selected": false, + "text": "2", + "value": "2" + }, + { + "selected": false, + "text": "3", + "value": "3" + }, + { + "selected": false, + "text": "4", + "value": "4" + }, + { + "selected": true, + "text": "5", + "value": "5" + }, + { + "selected": false, + "text": "6", + "value": "6" + }, + { + "selected": false, + "text": "7", + "value": "7" + }, + { + "selected": false, + "text": "8", + "value": "8" + }, + { + "selected": false, + "text": "9", + "value": "9" + }, + { + "selected": false, + "text": "10", + "value": "10" + }, + { + "selected": false, + "text": "11", + "value": "11" + }, + { + "selected": false, + "text": "12", + "value": "12" + }, + { + "selected": false, + "text": "13", + "value": "13" + }, + { + "selected": false, + "text": "14", + "value": "14" + }, + { + "selected": false, + "text": "15", + "value": "15" + }, + { + "selected": false, + "text": "16", + "value": "16" + }, + { + "selected": false, + "text": "17", + "value": "17" + }, + { + "selected": false, + "text": "18", + "value": "18" + }, + { + "selected": false, + "text": "19", + "value": "19" + }, + { + "selected": false, + "text": "20", + "value": "20" + } + ], + "query": "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Cloud View", + "uid": "IHqhpjPZz", + "version": 15 +} diff --git a/roles/servicetelemetry/files/rhos-dashboard.json 
b/roles/servicetelemetry/files/rhos-dashboard.json new file mode 100644 index 000000000..871f02366 --- /dev/null +++ b/roles/servicetelemetry/files/rhos-dashboard.json @@ -0,0 +1,2179 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 2, + "iteration": 1695783546006, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 68, + "panels": [], + "title": "Quickview", + "type": "row" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "text": "Node Active" + }, + "1": { + "text": "Node Inactive" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#37872D", + "value": null + }, + { + "color": "#C4162A", + "value": 1 + }, + { + "color": "#C4162A", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 2, + "x": 0, + "y": 1 + }, + "id": 33, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "absent({host = '$hosts', service=~\".+-$clouds-.+\"}) or label_replace(vector(0), \"host\", \"$hosts\", \"host\", \".*\")", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + 
"refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "Time node has been operational", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "dtdurations" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 2, + "x": 2, + "y": 1 + }, + "id": 31, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "collectd_uptime{host=\"$hosts\", service=~\".+-$clouds-.+\"}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "0.0%", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "#d44a3a", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 4, + "y": 1 + }, + "id": 19, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + 
"showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "sum(collectd_cpu_percent{type_instance!=\"idle\", host=\"$hosts\", service=~\".+-$clouds-.+\"}) / count(sum by (host,plugin_instance) (collectd_cpu_percent{host=\"$hosts\", service=~\".+-$clouds-.+\"}))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "0.0%", + "type": 1, + "value": "null" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "#d44a3a", + "value": 0.8 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 7, + "y": 1 + }, + "id": 44, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "sum(collectd_memory{type_instance=\"used\",host=\"$hosts\", service=~\".+-$clouds-.+\"})/ sum(collectd_memory{host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{memory}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + 
"value": "null" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 10, + "y": 1 + }, + "id": 41, + "links": [], + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "sum(collectd_df_df_complex{host=\"$hosts\",type_instance=\"used\", service=~\".+-$clouds-.+\"}) by (plugin_instance) / sum(collectd_df_df_complex{host=\"$hosts\", service=~\".+-$clouds-.+\"}) by (plugin_instance)", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{plugin_instance}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "File Systems", + "type": "bargauge" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 13, + "y": 1 + }, + "id": 54, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "delta" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": 
"sum(collectd_interface_if_errors_rx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}) + sum(collectd_interface_if_errors_tx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{plugin_instance}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Interface Errors", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "Load average represents the average number of running and un-interruptable processes residing in the kernel's execution queue. \n\nTypically, short term, midterm, and long term series give running averages of 1m, 5m, and 15m, respectively. ", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 1 + }, + "hiddenSeries": false, + "id": 35, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "collectd_load_shortterm{host=\"$hosts\", service=~\".+-$clouds-.+\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "short term", + "refId": "A" + }, + { + "expr": "collectd_load_midterm{host=\"$hosts\", service=~\".+-$clouds-.+\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "mid term", + "refId": "B" + }, + { + "expr": "collectd_load_longterm{host=\"$hosts\", service=~\".+-$clouds-.+\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "long term", + "refId": 
"C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Load Average", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": "Processes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 37, + "panels": [], + "title": "Network Interfaces", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 48, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_interface_if_octets_rx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "legendFormat": "Rx {{plugin_instance}}", + "refId": "A" + }, + { + "expr": 
"rate(collectd_interface_if_octets_tx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "legendFormat": "Tx {{plugin_instance}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Data", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 7 + }, + "hiddenSeries": false, + "id": 56, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_interface_if_errors_rx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Rx {{plugin_instance}}", + "refId": "A" + }, + { + "expr": "rate(collectd_interface_if_errors_tx_total{host=\"$hosts\", 
service=~\".+-$clouds-.+\"}[10m])", + "legendFormat": "Tx {{plugin_instance}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Error Rates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "errors/s", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 7 + }, + "hiddenSeries": false, + "id": 53, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_interface_if_dropped_rx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Rx {{plugin_instance}}", + "refId": "A" + }, + { + "expr": "rate(collectd_interface_if_dropped_tx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "legendFormat": "Tx 
{{plugin_instance}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Drop Rates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 21, + "panels": [], + "title": "CPU", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "Average non-idle CPU activity of all cores on node", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(collectd_cpu_percent{type_instance!=\"idle\", host=\"$hosts\", service=~\".+-$clouds-.+\"}) / count(sum by (type_instance) (collectd_cpu_percent{type_instance!=\"idle\",host=\"$hosts\", service=~\".+-$clouds-.+\"}))", + "format": 
"time_series", + "intervalFactor": 1, + "legendFormat": "Total", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Aggr. Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "Shows average time spent for each activity across all cores", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(collectd_cpu_percent{type_instance!=\"idle\", host=\"$hosts\", service=~\".+-$clouds-.+\"}) by (type_instance) / count(collectd_cpu_percent{host=\"$hosts\", service=~\".+-$clouds-.+\"}) by (type_instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{type_instance}}", + "refId": "A" + } + ], + "thresholds": [], + 
"timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Aggr. Usage by Type", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 25, + "panels": [], + "title": "Memory", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "decimals": null, + "description": "Memory used on node", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 25 + }, + "hiddenSeries": false, + "id": 27, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(collectd_memory{type_instance=\"used\",host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "total", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory", + 
"tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 9, + "x": 12, + "y": 25 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(collectd_hugepages_vmpage_number{type_instance=\"used\",host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{hugepages}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Huge Pages", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": 
"short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "0.0%", + "type": 1, + "value": "null" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.8 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 25 + }, + "id": 71, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "sum(collectd_hugepages_vmpage_number{type_instance=\"used\",host=\"$hosts\", service=~\".+-$clouds-.+\"}) / sum(collectd_hugepages_vmpage_number{host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{hugepages}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Huge Pages (%)", + "type": "gauge" + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 11, + "panels": [], + "title": "File System", + "type": "row" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 31 + }, + "id": 51, + "links": [], + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "sum by (plugin_instance) (collectd_df_df_inodes{type_instance=\"used\", host=\"$hosts\", service=~\".+-$clouds-.+\"}) / sum by (plugin_instance) (collectd_df_df_inodes{host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{plugin_instance}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Inode Usage", + "type": "bargauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 18, + "x": 6, + "y": 31 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (plugin_instance) (collectd_df_df_complex{type_instance!~\"free\",host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{plugin_instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File System Usage", + "tooltip": { + 
"shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 39 + }, + "id": 70, + "panels": [], + "title": "Disk", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "decimals": 2, + "description": "10m rolling average", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(collectd_disk_disk_octets_read_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "read", + "refId": "B" + }, + { + "expr": "sum(rate(collectd_disk_disk_octets_write_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "write", + "refId": 
"C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "decimals": 2, + "description": "Approximate percentage of total disk bandwidth being used.\n\nWeighted I/O includes the backlog that may be accumulating.", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 40 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(collectd_disk_disk_io_time_io_time_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[1h]))/1000", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "i/o", + "refId": "A" + }, + { + "expr": "sum(rate(collectd_disk_disk_io_time_weighted_io_time_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[1h]))/1000", + 
"legendFormat": "weighted i/o", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "decimals": null, + "description": "", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 46 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(collectd_disk_disk_ops_read_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "sum(rate(collectd_disk_disk_ops_write_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "write", + 
"refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operations/s", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "Average time each I/O operation took to complete. Per the collectd disk plugin docs (https://collectd.org/wiki/index.php/Plugin:Disk), this average is not very accurate.", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 46 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(collectd_disk_disk_time_read_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "sum(rate(collectd_disk_disk_time_write_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + 
"format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "write", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg. I/O Operation Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "datasource": null, + "definition": "label_values(collectd_cpu_percent, service)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "cloud", + "multi": false, + "name": "clouds", + "options": [], + "query": { + "query": "label_values(collectd_cpu_percent, service)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/.+-(.+)-coll-meter/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "datasource": "STFPrometheus", + "definition": "label_values(collectd_cpu_percent{service=~\".+-$clouds-.+\"}, host)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "node", + "multi": false, + "name": "hosts", + "options": [], + "query": { + "query": "label_values(collectd_cpu_percent{service=~\".+-$clouds-.+\"}, host)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, 
+ "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Infrastructure Node View", + "uid": "1F1OJZEWz", + "version": 4 +} diff --git a/roles/servicetelemetry/files/virtual-machine-view.json b/roles/servicetelemetry/files/virtual-machine-view.json new file mode 100644 index 000000000..0d5b4a191 --- /dev/null +++ b/roles/servicetelemetry/files/virtual-machine-view.json @@ -0,0 +1,1112 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 4, + "iteration": 1695785660982, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 8, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "count((ceilometer_cpu{project=\"$project\"}))", + "instant": true, + "interval": "", + 
"legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Virtual Machine Instances", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "8.0.4", + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(collectd_virt_percent, \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (avg by (resource,project) (ceilometer_cpu{project=\"$project\"}) * 0)", + "format": "time_series", + "interval": "", + "legendFormat": "{{ plugin_instance }} on {{ node }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "VM CPU %", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + 
"legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "8.0.4", + "targets": [ + { + "exemplar": true, + "expr": "rate(ceilometer_cpu{project=\"$project\"}[1m])", + "interval": "", + "legendFormat": "{{ resource }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Time for Instances", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "displayMode": "auto", + "filterable": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "plugin_instance" + }, + "properties": [ + { + "id": "displayName", + "value": "Virtual Machines" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/Value/" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + 
}, + { + "id": "custom.align", + "value": "auto" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 3 + }, + "id": 16, + "options": { + "showHeader": true + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "(ceilometer_cpu{project=\"$project\"}) + on (resource) group_right(project) label_replace(label_replace(collectd_virt_virt_cpu_total_total, \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"Node\", \"$1\", \"host\", \".+:.+:(.+)\")", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "{{ plugin_instance }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + }, + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "plugin_instance", + "Node" + ] + } + } + } + ], + "transparent": true, + "type": "table" + }, + { + "datasource": null, + "description": "Memory utilization of that allocated to the virtual machine.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 50, + "gradientMode": "opacity", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "links": [], + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 6 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + 
"mode": "single" + } + }, + "pluginVersion": "8.0.4", + "targets": [ + { + "exemplar": true, + "expr": "(label_replace(label_replace(collectd_virt_memory, \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") / 1000000) / on (resource) group_left (project) ceilometer_memory_usage{project=\"$project\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ plugin_instance }} [{{ type_instance }}] on {{ node }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "VM Memory Utilization (Allocated)", + "type": "timeseries" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Virtual machine disk operations rate (in operations/second)", + "fieldConfig": { + "defaults": { + "links": [], + "unit": "decbytes" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 6 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_disk_ops_read_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Read {{ plugin_instance }} disk {{ type_instance }}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_disk_ops_write_total[1m]), \"resource\", 
\"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Write {{ plugin_instance }} disk {{ type_instance }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "VM Disk Operations Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Virtual machine network dropped packet rate (in packets-per-second)", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 12 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:140", + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_dropped_rx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), 
\"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Rx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_dropped_tx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Tx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "VM Network Dropped Packet Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Virtual machine network error rate (in packets-per-second)", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 12 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:201", + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_errors_rx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Rx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_errors_tx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Tx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "VM Network Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Virtual machine disk throughput rate (in bytes)", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 17 + }, + "hiddenSeries": false, + "id": 13, + 
"legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:322", + "alias": "/Write/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_disk_octets_read_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Read {{ plugin_instance }} disk {{ type_instance }}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_disk_octets_write_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Write {{ plugin_instance }} disk {{ type_instance }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "VM Disk Throughput Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + 
"bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Virtual machine network throughput rate (in bytes)", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 17 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:383", + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_octets_rx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Rx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_octets_tx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Tx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "VM Network Throughput Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": 
true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "datasource": null, + "definition": "label_values(service)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "cloud", + "multi": false, + "name": "clouds", + "options": [], + "query": { + "query": "label_values(service)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/.+-(.+)-coll-meter/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "datasource": null, + "definition": "label_values(ceilometer_cpu{service=~\".+-$clouds-.+\"}, project)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "project", + "multi": false, + "name": "project", + "options": [], + "query": { + "query": "label_values(ceilometer_cpu{service=~\".+-$clouds-.+\"}, project)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Virtual Machine View", + "uid": "JJzvn8mnz", + "version": 4 +} diff --git a/roles/servicetelemetry/tasks/component_grafana.yml b/roles/servicetelemetry/tasks/component_grafana.yml index 068507610..df012b70a 100644 --- 
a/roles/servicetelemetry/tasks/component_grafana.yml +++ b/roles/servicetelemetry/tasks/component_grafana.yml @@ -115,3 +115,74 @@ state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' definition: '{{ ds_manifest }}' + + - name: Load Cloud Overview Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: rhos-cloud-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: rhos-cloud-dashboard.json + plugins: + - name: grafana-polystat-panel + version: "1.2.11" + json: | + {{ lookup('file', 'rhos-cloud-dashboard.json') | string }} + + - name: Load Infrastructure Overview Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: rhos-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: rhos-dashboard.json + json: | + {{ lookup('file', 'rhos-dashboard.json') | string }} + + - name: Load Memcached Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: memcached-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: memcached-dashboard.json + json: | + {{ lookup('file', 'memcached-dashboard.json') | string }} + + - name: Load Virtual Machine View Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + 
apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: virtual-machine-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: virtual-machine-view.json + json: | + {{ lookup('file', 'virtual-machine-view.json') | string }} From b29d023e6d9c438fa40adee15282eb630e4027f4 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Thu, 7 Dec 2023 17:21:24 -0500 Subject: [PATCH 23/70] Remove basic-auth method from grafana (#550) * Only openshift auth will be allowed --- .../infra.watch_servicetelemetrys_crd.yaml | 7 --- ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 2 - .../infra.watch_servicetelemetrys_crd.yaml | 7 --- ...emetry-operator.clusterserviceversion.yaml | 2 - roles/servicetelemetry/defaults/main.yml | 2 - .../tasks/component_grafana.yml | 48 ------------------- .../templates/manifest_grafana.j2 | 9 +--- 7 files changed, 1 insertion(+), 76 deletions(-) diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index c29fe03ce..37a847303 100644 --- a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -291,13 +291,6 @@ spec: ingressEnabled: description: Whether to enable ingress access to Grafana type: boolean - adminPassword: - description: Grafana admin password - type: string - format: password - adminUser: - description: Grafana admin user - type: string baseImage: description: Path to the base container image used to instantiate a Grafana instance type: string diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index ee728e2f4..dc9ddda59 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -80,8 +80,6 @@ spec: enabled: false grafana: ingressEnabled: true - 
adminPassword: secret - adminUser: root disableSignoutMenu: false baseImage: registry.redhat.io/rhel8/grafana:7 dashboards: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index 545ffd994..23efd2236 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -385,13 +385,6 @@ spec: grafana: description: Grafana related configuration properties: - adminPassword: - description: Grafana admin password - format: password - type: string - adminUser: - description: Grafana admin user - type: string baseImage: description: Path to the base container image used to instantiate a Grafana instance diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 1052f8ba3..6e758b6f3 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -117,8 +117,6 @@ metadata: "graphing": { "enabled": false, "grafana": { - "adminPassword": "secret", - "adminUser": "root", "baseImage": "registry.redhat.io/rhel8/grafana:7", "dashboards": { "enabled": true diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 263480e00..fc1cd8a91 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -83,8 +83,6 @@ servicetelemetry_defaults: enabled: false grafana: ingress_enabled: true - admin_password: secret - admin_user: root disable_signout_menu: 
false base_image: registry.redhat.io/rhel8/grafana:7 dashboards: diff --git a/roles/servicetelemetry/tasks/component_grafana.yml b/roles/servicetelemetry/tasks/component_grafana.yml index df012b70a..7eff9174f 100644 --- a/roles/servicetelemetry/tasks/component_grafana.yml +++ b/roles/servicetelemetry/tasks/component_grafana.yml @@ -7,54 +7,6 @@ kind: Route name: 'grafana-route' -- name: Check for existing grafana htpasswd secret - no_log: true - k8s_info: - api_version: v1 - kind: Secret - namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ ansible_operator_meta.name }}-grafana-htpasswd' - register: grafana_htpasswd_secret - -- block: - - name: Parse current Grafana htpasswd salt from secret - no_log: true - set_fact: - grafana_htpasswd_salt: "{{ ((grafana_htpasswd_secret.resources[0].data.auth | b64decode).split('$')[-1])[0:22] }}" - rescue: - - name: Generate initial Grafana htpasswd bcrypt string from grafana.admin_password - no_log: true - set_fact: - init_grafana_htpasswd_bcrypt_string: "{{ (servicetelemetry_vars.graphing.grafana.admin_password | password_hash('bcrypt') | replace('$2b$','$2y$', 1)) }}" - - - name: Read newly generated Grafana htpasswd salt - no_log: true - set_fact: - grafana_htpasswd_salt: "{{ (init_grafana_htpasswd_bcrypt_string.split('$')[-1])[0:22] }}" - always: - - name: Generate Grafana htpasswd bcrypt string from grafana.adminPassword using salt - no_log: true - set_fact: - grafana_htpasswd_bcrypt_string: "{{ (servicetelemetry_vars.graphing.grafana.admin_password | password_hash('bcrypt', grafana_htpasswd_salt) | replace('$2b$','$2y$', 1)) }}" - - - name: Generate Grafana auth string from grafana.adminUser and grafana_htpasswd_bcrypt_string - no_log: true - set_fact: - grafana_htpasswd_auth_string: "{{ servicetelemetry_vars.graphing.grafana.admin_user }}:{{ grafana_htpasswd_bcrypt_string }}" - -- name: Create or patch htpasswd secret for grafana admin - no_log: false - k8s: - definition: - api_version: v1 - kind: Secret - 
metadata: - name: '{{ ansible_operator_meta.name }}-grafana-htpasswd' - namespace: '{{ ansible_operator_meta.namespace }}' - type: Opaque - stringData: - auth: '{{ grafana_htpasswd_auth_string }}' - - name: Lookup template debug: msg: "{{ lookup('template', './manifest_grafana.j2') | from_yaml }}" diff --git a/roles/servicetelemetry/templates/manifest_grafana.j2 b/roles/servicetelemetry/templates/manifest_grafana.j2 index 792f7065c..8b176b103 100644 --- a/roles/servicetelemetry/templates/manifest_grafana.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana.j2 @@ -7,9 +7,6 @@ spec: serviceAccount: annotations: serviceaccounts.openshift.io/oauth-redirectreference.primary: '{{ grafana_oauth_redir_ref | to_json }}' - deployment: - annotations: - hash-of-creds-to-force-restart-if-changed: {{ grafana_htpasswd_auth_string | b64encode }} baseImage: {{ servicetelemetry_vars.graphing.grafana.base_image }} ingress: enabled: {{ servicetelemetry_vars.graphing.grafana.ingress_enabled }} @@ -40,13 +37,12 @@ spec: - -provider=openshift - -pass-basic-auth=false - -https-address=:3002 - - -htpasswd-file=/etc/proxy/htpasswd/auth - -tls-cert=/etc/tls/private/tls.crt - -tls-key=/etc/tls/private/tls.key - -upstream=http://localhost:3000 - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=grafana-serviceaccount - - '-openshift-sar={"resource": "namespaces", "verb": "get"}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafana", "group":"integreatly.org", "verb":"get"}' - -openshift-ca=/etc/pki/tls/cert.pem - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt ports: @@ -58,12 +54,9 @@ spec: name: secret-{{ ansible_operator_meta.name }}-grafana-proxy-tls - mountPath: /etc/proxy/secrets name: secret-{{ ansible_operator_meta.name }}-session-secret - - mountPath: /etc/proxy/htpasswd - name: secret-{{ ansible_operator_meta.name }}-grafana-htpasswd secrets: - '{{ ansible_operator_meta.name 
}}-grafana-proxy-tls' - '{{ ansible_operator_meta.name }}-session-secret' - - '{{ ansible_operator_meta.name }}-grafana-htpasswd' service: ports: - name: web From 0f94fd577617aee6a85fc4141f98ebdfc49a9f92 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Thu, 7 Dec 2023 16:10:03 -0500 Subject: [PATCH 24/70] Adjust Alertmanager SAR to be more specific * This matches recent changes in prometheus[1] and grafana[2] [1] https://github.com/infrawatch/service-telemetry-operator/pull/549/files#diff-2cf84bcf66f12393c86949ec0d3f16c473a650173d55549bb02556d23aa22bd2R46 [2] https://github.com/infrawatch/service-telemetry-operator/pull/550/files#diff-ae71801975adb4f8dd4aa5479a66ad46e46f17de40f9d147b2e09e13ce26633eR45 --- .../tasks/component_prometheus.yml | 18 ++++++++++++------ .../templates/manifest_alertmanager.j2 | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/roles/servicetelemetry/tasks/component_prometheus.yml b/roles/servicetelemetry/tasks/component_prometheus.yml index 2e865abd2..f506dae80 100644 --- a/roles/servicetelemetry/tasks/component_prometheus.yml +++ b/roles/servicetelemetry/tasks/component_prometheus.yml @@ -42,12 +42,6 @@ - subjectaccessreviews verbs: - create - - apiGroups: - - "" - resources: - - namespaces - verbs: - - get - name: Setup ClusterRoleBinding for Prometheus block: @@ -123,6 +117,18 @@ - securitycontextconstraints verbs: - use + - apiGroups: + - '{{ prometheus_operator_api_string | replace("/v1","") }}' + resources: + - alertmanagers + verbs: + - get + - apiGroups: + - smartgateway.infra.watch + resources: + - smartgateways + verbs: + - get - name: Setup RoleBinding for Prometheus block: diff --git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 2465ee43f..5b53cc592 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -26,8 +26,8 @@ spec: - 
-upstream=http://localhost:9093/ - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=alertmanager-stf - - '-openshift-sar={"resource": "namespaces", "verb": "get"}' - - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get"}}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' + - '-openshift-delegate-urls={"/": {"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' ports: - containerPort: 9095 name: https From 28ce38e323306b2d4c0b94975a1b9ac5f319a297 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 11 Dec 2023 19:34:33 -0500 Subject: [PATCH 25/70] Revert "Adjust Alertmanager SAR to be more specific" This reverts commit 0f94fd577617aee6a85fc4141f98ebdfc49a9f92. --- .../tasks/component_prometheus.yml | 18 ++++++------------ .../templates/manifest_alertmanager.j2 | 4 ++-- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/roles/servicetelemetry/tasks/component_prometheus.yml b/roles/servicetelemetry/tasks/component_prometheus.yml index f506dae80..2e865abd2 100644 --- a/roles/servicetelemetry/tasks/component_prometheus.yml +++ b/roles/servicetelemetry/tasks/component_prometheus.yml @@ -42,6 +42,12 @@ - subjectaccessreviews verbs: - create + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get - name: Setup ClusterRoleBinding for Prometheus block: @@ -117,18 +123,6 @@ - securitycontextconstraints verbs: - use - - apiGroups: - - '{{ prometheus_operator_api_string | replace("/v1","") }}' - resources: - - alertmanagers - verbs: - - get - - apiGroups: - - smartgateway.infra.watch - resources: - - smartgateways - verbs: - - get - name: Setup RoleBinding for Prometheus block: diff --git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 
b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 5b53cc592..2465ee43f 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -26,8 +26,8 @@ spec: - -upstream=http://localhost:9093/ - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=alertmanager-stf - - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' - - '-openshift-delegate-urls={"/": {"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' + - '-openshift-sar={"resource": "namespaces", "verb": "get"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get"}}' ports: - containerPort: 9095 name: https From 5189c0a0d6ccf026e3fcafd88180a04d0c439606 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 12 Dec 2023 15:23:08 -0500 Subject: [PATCH 26/70] Auth to prometheus using token instead of basicauth (#549) * Auth to prometheus using token instead of basicauth * Add present/absent logic to prometheus-reader resources * s/password/token in smoketest output * [zuul] Make nightly_bundles jobs non-voting (#551) --------- Co-authored-by: Emma Foley --- .zuul.yaml | 10 ++-- .../tasks/component_grafana.yml | 10 ++-- .../tasks/component_prometheus.yml | 40 ------------- .../tasks/component_prometheus_reader.yml | 58 +++++++++++++++++++ roles/servicetelemetry/tasks/main.yml | 2 + .../templates/manifest_grafana_ds.j2 | 5 +- .../templates/manifest_prometheus.j2 | 9 +-- tests/smoketest/smoketest.sh | 6 +- .../smoketest_ceilometer_entrypoint.sh | 6 +- .../smoketest_collectd_entrypoint.sh | 8 +-- tests/smoketest/smoketest_job.yaml.template | 8 +-- 11 files changed, 90 insertions(+), 72 deletions(-) create mode 100644 
roles/servicetelemetry/tasks/component_prometheus_reader.yml diff --git a/.zuul.yaml b/.zuul.yaml index 2c1b66282..994d65327 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -39,7 +39,7 @@ Deploy STF using the nightly bundles vars: scenario: "nightly_bundles" - + - job: name: stf-crc-local_build parent: stf-base @@ -112,13 +112,15 @@ STF CRC jobs that build and deploy STF github-check: jobs: - - stf-crc-ocp_412-nightly_bundles + - stf-crc-ocp_412-nightly_bundles: + voting: false - stf-crc-ocp_412-local_build - stf-crc-ocp_412-local_build-index_deploy - - stf-crc-ocp_413-nightly_bundles + - stf-crc-ocp_413-nightly_bundles: + voting: false - stf-crc-ocp_413-local_build - stf-crc-ocp_413-local_build-index_deploy - + - project: name: infrawatch/service-telemetry-operator templates: diff --git a/roles/servicetelemetry/tasks/component_grafana.yml b/roles/servicetelemetry/tasks/component_grafana.yml index 7eff9174f..e5c9ba989 100644 --- a/roles/servicetelemetry/tasks/component_grafana.yml +++ b/roles/servicetelemetry/tasks/component_grafana.yml @@ -34,18 +34,18 @@ namespace: '{{ ansible_operator_meta.namespace }}' register: serving_certs_ca - - name: Retrieve prometheus secret + - name: Retrieve prometheus reader token k8s_info: api_version: v1 kind: Secret namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' - register: prometheus_secret + name: stf-prometheus-reader-token + register: prometheus_reader_secret - - name: Decode prometheus password + - name: Decode prometheus reader token no_log: true set_fact: - prom_basicauth_passwd: '{{ prometheus_secret.resources[0].data.password | b64decode }}' + prometheus_reader_token: '{{ prometheus_reader_secret.resources[0].data.token | b64decode }}' # Lookup existing datasources - name: Remove legacy datasources diff --git a/roles/servicetelemetry/tasks/component_prometheus.yml b/roles/servicetelemetry/tasks/component_prometheus.yml index 2e865abd2..ac65044d5 100644 --- 
a/roles/servicetelemetry/tasks/component_prometheus.yml +++ b/roles/servicetelemetry/tasks/component_prometheus.yml @@ -173,46 +173,6 @@ name: prometheus-k8s-{{ ansible_operator_meta.namespace }} namespace: '{{ ansible_operator_meta.namespace }}' -- name: Check for existing prometheus htpasswd user secret - k8s_info: - api_version: v1 - kind: Secret - namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' - register: prometheus_htpasswd - -- name: Create a new prometheus password if it doesn't exist yet - when: prometheus_htpasswd.resources|length == 0 - block: - - name: Set prometheus htpasswd - no_log: true - set_fact: - prom_basicauth_passwd: "{{ lookup('password', '/dev/null') }}" - - - name: Create htpasswd secret # Contains both the htpasswd version and plaintext for lookup - no_log: true - k8s: - definition: - api_version: v1 - kind: Secret - metadata: - name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' - namespace: '{{ ansible_operator_meta.namespace }}' - type: Opaque - stringData: - auth: 'internal:{{ prom_basicauth_passwd | password_hash("bcrypt") | replace("$2b$","$2y$", 1)}}' - password: '{{ prom_basicauth_passwd }}' - tags: - - skip_ansible_lint - - - name: Re-register new object for use in the annotation - k8s_info: - api_version: v1 - kind: Secret - namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' - register: prometheus_htpasswd - - name: Lookup template debug: msg: "{{ lookup('template', './manifest_prometheus.j2') | from_yaml }}" diff --git a/roles/servicetelemetry/tasks/component_prometheus_reader.yml b/roles/servicetelemetry/tasks/component_prometheus_reader.yml new file mode 100644 index 000000000..6cbee8b42 --- /dev/null +++ b/roles/servicetelemetry/tasks/component_prometheus_reader.yml @@ -0,0 +1,58 @@ +- name: Create ServiceAccount/stf-prometheus-reader + k8s: + state: '{{ "present" if 
servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + apiVersion: v1 + kind: ServiceAccount + metadata: + name: stf-prometheus-reader + namespace: '{{ ansible_operator_meta.namespace }}' + +- name: Create prometheus-reader Role + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-reader + namespace: '{{ ansible_operator_meta.namespace }}' + rules: + - apiGroups: + - '{{ prometheus_operator_api_string | replace("/v1","") }}' + resources: + - prometheus + verbs: + - get + namespaces: + - '{{ ansible_operator_meta.namespace }}' + +- name: Create prometheus-reader RoleBinding for stf-prometheus-reader + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: stf-prometheus-reader + namespace: '{{ ansible_operator_meta.namespace }}' + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-reader + subjects: + - kind: ServiceAccount + name: stf-prometheus-reader + +- name: Create an access token for stf-prometheus-reader + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + apiVersion: v1 + kind: Secret + metadata: + name: stf-prometheus-reader-token + namespace: '{{ ansible_operator_meta.namespace }}' + annotations: + kubernetes.io/service-account.name: stf-prometheus-reader + type: kubernetes.io/service-account-token diff --git a/roles/servicetelemetry/tasks/main.yml b/roles/servicetelemetry/tasks/main.yml index bc33df647..dc3e881c7 100644 --- a/roles/servicetelemetry/tasks/main.yml +++ b/roles/servicetelemetry/tasks/main.yml @@ -47,6 +47,8 @@ - block: - name: Create Prometheus instance include_tasks: component_prometheus.yml + - name: 
Create Prometheus read-only user + include_tasks: component_prometheus_reader.yml # --> alerting - name: Create Alertmanager instance diff --git a/roles/servicetelemetry/templates/manifest_grafana_ds.j2 b/roles/servicetelemetry/templates/manifest_grafana_ds.j2 index d0f0478d1..a453b311a 100644 --- a/roles/servicetelemetry/templates/manifest_grafana_ds.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana_ds.j2 @@ -12,14 +12,13 @@ spec: jsonData: timeInterval: 5s tlsAuthWithCACert: true + httpHeaderName1: 'Authorization' name: STFPrometheus type: prometheus url: 'https://{{ ansible_operator_meta.name }}-prometheus-proxy.{{ ansible_operator_meta.namespace }}.svc:9092' version: 1 - basicAuth: true - basicAuthUser: internal secureJsonData: - basicAuthPassword: '{{ prom_basicauth_passwd }}' + httpHeaderValue1: 'Bearer {{prometheus_reader_token}}' tlsCACert: | {{ serving_certs_ca.resources[0].data['service-ca.crt'] | indent(10) }} {% endif %} diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index 2bdf408b9..e9eb63786 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -20,7 +20,6 @@ spec: labels: prometheus: '{{ ansible_operator_meta.name }}' annotations: - hash-of-entire-htpasswd-secret-to-force-restart-if-changed: {{ prometheus_htpasswd | sha1 }} {% if servicetelemetry_vars.alerting.enabled %} alerting: alertmanagers: @@ -42,10 +41,11 @@ spec: - -tls-cert=/etc/tls/private/tls.crt - -tls-key=/etc/tls/private/tls.key - -upstream=http://localhost:9090/ - - -htpasswd-file=/etc/proxy/htpasswd/auth - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=prometheus-stf - - '-openshift-sar={"resource": "namespaces", "verb": "get"}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheus", "group":"{{ prometheus_operator_api_string | 
replace("/v1","") }}", "verb":"get"}' + - '-openshift-delegate-urls={"/":{"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheus", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' + ports: - containerPort: 9092 name: https @@ -55,14 +55,11 @@ spec: name: secret-{{ ansible_operator_meta.name }}-prometheus-proxy-tls - mountPath: /etc/proxy/secrets name: secret-{{ ansible_operator_meta.name }}-session-secret - - mountPath: /etc/proxy/htpasswd - name: secret-{{ ansible_operator_meta.name }}-prometheus-htpasswd configMaps: - serving-certs-ca-bundle secrets: - '{{ ansible_operator_meta.name }}-prometheus-proxy-tls' - '{{ ansible_operator_meta.name }}-session-secret' - - '{{ ansible_operator_meta.name }}-prometheus-htpasswd' {% if servicetelemetry_vars.backends.metrics.prometheus.storage.strategy == "persistent" %} storage: volumeClaimTemplate: diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index caaeb4e88..177157fe3 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -47,8 +47,8 @@ echo "*** [INFO] Working in project ${OCP_PROJECT}" echo "*** [INFO] Getting ElasticSearch authentication password" ELASTICSEARCH_AUTH_PASS=$(oc get secret elasticsearch-es-elastic-user -ogo-template='{{ .data.elastic | base64decode }}') -echo "*** [INFO] Getting Prometheus authentication password" -PROMETHEUS_AUTH_PASS=$(oc get secret default-prometheus-htpasswd -ogo-template='{{ .data.password | base64decode }}') +echo "*** [INFO] Getting Prometheus authentication token" +PROMETHEUS_AUTH_TOKEN=$(oc create token stf-prometheus-reader) echo "*** [INFO] Creating configmaps..." 
oc delete configmap/stf-smoketest-healthcheck-log configmap/stf-smoketest-collectd-config configmap/stf-smoketest-sensubility-config configmap/stf-smoketest-collectd-entrypoint-script configmap/stf-smoketest-ceilometer-publisher configmap/stf-smoketest-ceilometer-entrypoint-script job/stf-smoketest || true @@ -76,7 +76,7 @@ oc wait --for=jsonpath='{.status.phase}'=Running pod/qdr-test echo "*** [INFO] Creating smoketest jobs..." oc delete job -l app=stf-smoketest for NAME in "${CLOUDNAMES[@]}"; do - oc create -f <(sed -e "s/<>/${NAME}/;s/<>/${ELASTICSEARCH_AUTH_PASS}/;s/<>/${PROMETHEUS_AUTH_PASS}/" ${REL}/smoketest_job.yaml.template) + oc create -f <(sed -e "s/<>/${NAME}/;s/<>/${ELASTICSEARCH_AUTH_PASS}/;s/<>/${PROMETHEUS_AUTH_TOKEN}/" ${REL}/smoketest_job.yaml.template) done echo "*** [INFO] Triggering an alertmanager notification..." diff --git a/tests/smoketest/smoketest_ceilometer_entrypoint.sh b/tests/smoketest/smoketest_ceilometer_entrypoint.sh index adf3a9046..0fc6f232f 100644 --- a/tests/smoketest/smoketest_ceilometer_entrypoint.sh +++ b/tests/smoketest/smoketest_ceilometer_entrypoint.sh @@ -5,7 +5,7 @@ set +e PROMETHEUS=${PROMETHEUS:-"https://default-prometheus-proxy:9092"} ELASTICSEARCH=${ELASTICSEARCH:-"https://elasticsearch-es-http:9200"} ELASTICSEARCH_AUTH_PASS=${ELASTICSEARCH_AUTH_PASS:-""} -PROMETHEUS_AUTH_PASS=${PROMETHEUS_AUTH_PASS:-""} +PROMETHEUS_AUTH_TOKEN=${PROMETHEUS_AUTH_TOKEN:-""} CLOUDNAME=${CLOUDNAME:-"smoke1"} POD=$(hostname) @@ -20,14 +20,14 @@ echo "*** [INFO] Sleeping for 30 seconds to produce all metrics and events" sleep 30 echo "*** [INFO] List of metric names for debugging..." 
-curl -sk -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/label/__name__/values" 2>&2 | tee /tmp/label_names +curl -sk -H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}" -g "${PROMETHEUS}/api/v1/label/__name__/values" 2>&2 | tee /tmp/label_names echo; echo # Checks that the metrics actually appear in prometheus echo "*** [INFO] Checking for recent image metrics..." echo "[DEBUG] Running the curl command to return a query" -curl -k -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=ceilometer_image_size' 2>&1 | grep '"result":\[{"metric":{"__name__":"ceilometer_image_size"' +curl -k -H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=ceilometer_image_size' 2>&1 | grep '"result":\[{"metric":{"__name__":"ceilometer_image_size"' metrics_result=$? echo "[DEBUG] Set metrics_result to $metrics_result" diff --git a/tests/smoketest/smoketest_collectd_entrypoint.sh b/tests/smoketest/smoketest_collectd_entrypoint.sh index d7f5132e8..d0dd800c3 100755 --- a/tests/smoketest/smoketest_collectd_entrypoint.sh +++ b/tests/smoketest/smoketest_collectd_entrypoint.sh @@ -5,7 +5,7 @@ set +e PROMETHEUS=${PROMETHEUS:-"https://default-prometheus-proxy:9092"} ELASTICSEARCH=${ELASTICSEARCH:-"https://elasticsearch-es-http:9200"} ELASTICSEARCH_AUTH_PASS=${ELASTICSEARCH_AUTH_PASS:-""} -PROMETHEUS_AUTH_PASS=${PROMETHEUS_AUTH_PASS:-""} +PROMETHEUS_AUTH_TOKEN=${PROMETHEUS_AUTH_TOKEN:-""} CLOUDNAME=${CLOUDNAME:-"smoke1"} POD=$(hostname) @@ -37,12 +37,12 @@ sleep 30 echo "*** [INFO] List of metric names for debugging..." 
-curl -k -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/label/__name__/values" 2>&2 | tee /tmp/label_names +curl -k -H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}" -g "${PROMETHEUS}/api/v1/label/__name__/values" 2>&2 | tee /tmp/label_names echo; echo # Checks that the metrics actually appear in prometheus echo "*** [INFO] Checking for recent CPU metrics..." -curl -k -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=collectd_cpu_total{container="sg-core",plugin_instance="0",type_instance="user",service="default-cloud1-coll-meter",host="'"${POD}"'"}[1m]' 2>&2 | tee /tmp/query_output +curl -k -H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=collectd_cpu_total{container="sg-core",plugin_instance="0",type_instance="user",service="default-cloud1-coll-meter",host="'"${POD}"'"}[1m]' 2>&2 | tee /tmp/query_output echo; echo # The egrep exit code is the result of the test and becomes the container/pod/job exit code @@ -53,7 +53,7 @@ echo; echo # Checks that the metrics actually appear in prometheus echo "*** [INFO] Checking for recent healthcheck metrics..." -curl -k -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=sensubility_container_health_status{container="sg-core",service="default-cloud1-sens-meter",host="'"${POD}"'"}[1m]' 2>&2 | tee /tmp/query_output +curl -k -H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}" -g "${PROMETHEUS}/api/v1/query?" 
--data-urlencode 'query=sensubility_container_health_status{container="sg-core",service="default-cloud1-sens-meter",host="'"${POD}"'"}[1m]' 2>&2 | tee /tmp/query_output echo; echo # The egrep exit code is the result of the test and becomes the container/pod/job exit code diff --git a/tests/smoketest/smoketest_job.yaml.template b/tests/smoketest/smoketest_job.yaml.template index 4a9c20cc9..12626f3b2 100644 --- a/tests/smoketest/smoketest_job.yaml.template +++ b/tests/smoketest/smoketest_job.yaml.template @@ -22,8 +22,8 @@ spec: value: <> - name: ELASTICSEARCH_AUTH_PASS value: "<>" - - name: PROMETHEUS_AUTH_PASS - value: "<>" + - name: PROMETHEUS_AUTH_TOKEN + value: "<>" - name: OBSERVABILITY_STRATEGY value: "<>" volumeMounts: @@ -51,8 +51,8 @@ spec: value: <> - name: ELASTICSEARCH_AUTH_PASS value: "<>" - - name: PROMETHEUS_AUTH_PASS - value: "<>" + - name: PROMETHEUS_AUTH_TOKEN + value: "<>" - name: OBSERVABILITY_STRATEGY value: "<>" volumeMounts: From 073548ec1bbfc5f9550155488b8de6d69ef4b3ec Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Wed, 13 Dec 2023 09:31:01 -0500 Subject: [PATCH 27/70] Fix branch co-ordination in stf-run-ci (#555) I think it got broken by an oops recently[1]. Since that change, working_branch (`branch` at that point) is never used because version_branches.sgo has a default value. This breaks the branch co-ordination in Jenkins[2] and in local testing[3]. 
[1] https://github.com/infrawatch/service-telemetry-operator/pull/512/files#diff-c073fe1e346d08112920aa0bbc8a7453bbd3032b7a9b09ae8cbc70df4db4ea2dR19 [2] https://github.com/infrawatch/service-telemetry-operator/blob/0f94fd577617aee6a85fc4141f98ebdfc49a9f92/Jenkinsfile#L157 [3] https://github.com/infrawatch/service-telemetry-operator/blob/0f94fd577617aee6a85fc4141f98ebdfc49a9f92/README.md?plain=1#L62 --- build/stf-run-ci/tasks/clone_repos.yml | 8 ++++---- build/stf-run-ci/tasks/main.yml | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index 5f0131832..632170676 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ b/build/stf-run-ci/tasks/clone_repos.yml @@ -16,7 +16,7 @@ ansible.builtin.git: repo: "{{ sgo_repository }}" dest: "{{ sgo_dir }}" - version: "{{ version_branches.sgo | default(branch, true) }}" + version: "{{ sgo_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sgo }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: @@ -36,7 +36,7 @@ ansible.builtin.git: repo: "{{ sg_core_repository }}" dest: "{{ sg_core_dir }}" - version: "{{ version_branches.sg_core | default(branch, true) }}" + version: "{{ sg_core_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_core }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: @@ -56,7 +56,7 @@ ansible.builtin.git: repo: "{{ sg_bridge_repository }}" dest: "{{ sg_bridge_dir }}" - version: "{{ version_branches.sg_bridge | default(branch, true) }}" + version: "{{ sg_bridge_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_bridge }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: @@ -76,7 +76,7 @@ ansible.builtin.git: repo: "{{ prometheus_webhook_snmp_repository }}" dest: "{{ prometheus_webhook_snmp_dir }}" - version: "{{ 
version_branches.prometheus_webhook_snmp | default(branch, true) }}" + version: "{{ prometheus_webhook_snmp_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.prometheus_webhook_snmp }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 90fe03684..52d35fa1e 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -4,7 +4,6 @@ # -- initial setup - name: Setup default values ansible.builtin.set_fact: - # The branch should be removed, we should assume that everything is checked out to the right place. branch: "{{ working_branch | default('master') }}" namespace: "{{ namespace if namespace is defined else (working_namespace | default('service-telemetry'))}}" From 6798b4168251610d8cb47c9f4a0dc281b3d361a7 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Thu, 14 Dec 2023 13:35:47 -0500 Subject: [PATCH 28/70] Adjust Alertmanager SAR to be more specific (#553) * This matches recent changes in prometheus[1] and grafana[2] [1] https://github.com/infrawatch/service-telemetry-operator/pull/549/files#diff-2cf84bcf66f12393c86949ec0d3f16c473a650173d55549bb02556d23aa22bd2R46 [2] https://github.com/infrawatch/service-telemetry-operator/pull/550/files#diff-ae71801975adb4f8dd4aa5479a66ad46e46f17de40f9d147b2e09e13ce26633eR45 --- .../tasks/component_prometheus.yml | 18 ++++++++++++------ .../templates/manifest_alertmanager.j2 | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/roles/servicetelemetry/tasks/component_prometheus.yml b/roles/servicetelemetry/tasks/component_prometheus.yml index ac65044d5..601a28247 100644 --- a/roles/servicetelemetry/tasks/component_prometheus.yml +++ b/roles/servicetelemetry/tasks/component_prometheus.yml @@ -42,12 +42,6 @@ - subjectaccessreviews verbs: - create - - apiGroups: - - "" - resources: - - namespaces - verbs: - - get - name: Setup 
ClusterRoleBinding for Prometheus block: @@ -123,6 +117,18 @@ - securitycontextconstraints verbs: - use + - apiGroups: + - '{{ prometheus_operator_api_string | replace("/v1","") }}' + resources: + - alertmanagers + verbs: + - get + - apiGroups: + - smartgateway.infra.watch + resources: + - smartgateways + verbs: + - get - name: Setup RoleBinding for Prometheus block: diff --git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 2465ee43f..5b53cc592 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -26,8 +26,8 @@ spec: - -upstream=http://localhost:9093/ - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=alertmanager-stf - - '-openshift-sar={"resource": "namespaces", "verb": "get"}' - - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get"}}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' + - '-openshift-delegate-urls={"/": {"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' ports: - containerPort: 9095 name: https From 99221fbffe2d658161c00fae5905aebe0d989ffa Mon Sep 17 00:00:00 2001 From: Miguel Garcia Date: Wed, 3 Jan 2024 17:34:12 +0100 Subject: [PATCH 29/70] Add optional spec.replaces field to CSV for update graph compliance The way we generate our CSVs uses OLM's skipRange functionality. This is fine, but using only this leads to older versions becoming unavailable after the fact -- see the warning at [1]. By adding an optional spec.replaces to our CSV we allow update testing as well as actual production updates for downstream builds that leverage it. 
Populating the field requires knowledge of the latest-released bundle, so we take it from an environment variable to be provided by the builder. If this is unset we don't include the spec.replaces field at all -- leaving previous behavior unchanged. Resolves #559 Related: STF-1658 [1] https://olm.operatorframework.io/docs/concepts/olm-architecture/operator-catalog/creating-an-update-graph/#skiprange --- build/generate_bundle.sh | 9 +++++++++ ...service-telemetry-operator.clusterserviceversion.yaml | 1 + 2 files changed, 10 insertions(+) diff --git a/build/generate_bundle.sh b/build/generate_bundle.sh index 14a635cf5..12eea7c2d 100755 --- a/build/generate_bundle.sh +++ b/build/generate_bundle.sh @@ -35,6 +35,15 @@ generate_bundle() { ${OPERATOR_SDK} generate bundle --verbose --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" >> ${LOGFILE} 2>&1 popd > /dev/null 2>&1 + # CSVs without a spec.replaces field are valid, so fall back to those if + # latest released version is unknown. + # Placeholder value is validated by operator-sdk during local bundle + # generation and so needs to conform to RFC1123. 
+ if [[ -n "$BUNDLE_LATEST_RELEASED_VERSION" ]]; then + REPLACE_REGEX="$REPLACE_REGEX;s#---bundle-latest-released-version#${BUNDLE_LATEST_RELEASED_VERSION}#g" + else sed -i '/---bundle-latest-released-version/d' "${WORKING_DIR}/manifests/${OPERATOR_NAME}.clusterserviceversion.yaml" + fi + sed -i -E "${REPLACE_REGEX}" "${WORKING_DIR}/manifests/${OPERATOR_NAME}.clusterserviceversion.yaml" } diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 6e758b6f3..63e94c60f 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -460,4 +460,5 @@ spec: minKubeVersion: 1.23.0 provider: name: Red Hat + replaces: service-telemetry-operator.v---bundle-latest-released-version version: 1.99.0 From c008c6e811cd088819076a518e36af03a5f4c086 Mon Sep 17 00:00:00 2001 From: Marihan Girgis <102027102+mgirgisf@users.noreply.github.com> Date: Tue, 9 Jan 2024 19:29:53 +0100 Subject: [PATCH 30/70] Stop using ephemeral storage for testing (#547) Update the __service_telemetry_storage_persistent_storage_class to use CRC PVs Use the default value (false) for __service_telemetry_storage_ephemeral_enabled --- build/stf-collect-logs/tasks/main.yml | 17 +++++++++++++++++ ci/vars-local_build-index_deploy.yml | 3 +-- ci/vars-local_build.yml | 1 - ci/vars-nightly_bundles.yml | 1 - ci/vars-zuul-common.yml | 1 + 5 files changed, 19 insertions(+), 4 deletions(-) diff --git a/build/stf-collect-logs/tasks/main.yml b/build/stf-collect-logs/tasks/main.yml index 10fb0c97b..347d07f37 100644 --- a/build/stf-collect-logs/tasks/main.yml +++ b/build/stf-collect-logs/tasks/main.yml @@ -61,3 +61,20 @@ ignore_errors: true retries: 3 delay: 10 + +- name: 
"Get PV and PVC information details" + ansible.builtin.shell: + cmd: | + oc -n {{ namespace }} get pv >> {{ logfile_dir }}/post_pv.log 2>&1 + oc -n {{ namespace }} get pvc >> {{ logfile_dir }}/post_pvc.log 2>&1 + ignore_errors: true + +- name: "Get SGO,STO and QDR logs" + ansible.builtin.shell: + cmd: | + oc -n {{ namespace }} logs $(oc -n {{ namespace }} get pod -l name=service-telemetry-operator -o jsonpath='{.items[].metadata.name}') >> {{ logfile_dir }}/logs_sto.log 2>&1 + oc -n {{ namespace }} logs $(oc -n {{ namespace }} get pod -l app=smart-gateway-operator -o jsonpath='{.items[].metadata.name}') >> {{ logfile_dir }}/logs_sgo.log 2>&1 + oc -n {{ namespace }} logs $(oc -n {{ namespace }} get pod -l qdr -o jsonpath='{.items[].metadata.name}') >> {{ logfile_dir }}/logs_qdr.log 2>&1 + ignore_errors: true + retries: 3 + delay: 10 \ No newline at end of file diff --git a/ci/vars-local_build-index_deploy.yml b/ci/vars-local_build-index_deploy.yml index 0404049b4..ed9acc624 100644 --- a/ci/vars-local_build-index_deploy.yml +++ b/ci/vars-local_build-index_deploy.yml @@ -1,6 +1,5 @@ --- -#ansible-playbook --extra-vars __local_build_enabled=true -e __deploy_from_index_enabled=true --extra-vars working_branch="$(git rev-parse --abbrev-ref HEAD)" --extra-vars __service_telemetry_storage_ephemeral_enabled=true --extra-vars __service_telemetry_observability_strategy=use_redhat ./run-ci.yaml +#ansible-playbook --extra-vars __local_build_enabled=true -e __deploy_from_index_enabled=true --extra-vars working_branch="$(git rev-parse --abbrev-ref HEAD)" --extra-vars __service_telemetry_observability_strategy=use_redhat ./run-ci.yaml __local_build_enabled: true __deploy_from_index_enabled: true -__service_telemetry_ephemeral_enabled: true __service_telemetry_observability_strategy: use_redhat diff --git a/ci/vars-local_build.yml b/ci/vars-local_build.yml index 3126605a4..206e2b327 100644 --- a/ci/vars-local_build.yml +++ b/ci/vars-local_build.yml @@ -2,4 +2,3 @@ __deploy_stf: 
true __local_build_enabled: true __service_telemetry_snmptraps_enabled: true -__service_telemetry_storage_ephemeral_enabled: true diff --git a/ci/vars-nightly_bundles.yml b/ci/vars-nightly_bundles.yml index ca49656f3..26572fe9c 100644 --- a/ci/vars-nightly_bundles.yml +++ b/ci/vars-nightly_bundles.yml @@ -4,4 +4,3 @@ __local_build_enabled: false __deploy_from_bundles_enabled: true -__service_telemetry_storage_ephemeral_enabled: true diff --git a/ci/vars-zuul-common.yml b/ci/vars-zuul-common.yml index 12308a590..de0f17613 100644 --- a/ci/vars-zuul-common.yml +++ b/ci/vars-zuul-common.yml @@ -9,3 +9,4 @@ sgo_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/smart-g sg_core_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/sg-core'].src_dir }}" sg_bridge_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/sg-bridge'].src_dir }}" prometheus_webhook_snmp_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/prometheus-webhook-snmp'].src_dir }}" +__service_telemetry_storage_persistent_storage_class: "crc-csi-hostpath-provisioner" \ No newline at end of file From 665577e07b83492c0204e2321d4115a1c01249f9 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Sat, 13 Jan 2024 10:06:50 +0000 Subject: [PATCH 31/70] [zuul] Use extracted CRC nodes in stf-base (#531) * [zuul] Update base job for stf-base * Add in required projects: dataplane-operator, infra-operator, openstack-operator * Remove nodeset from stf-base it overrides the nodeset set in the base job. 
The nodeset is going to be used to select the OCP version * [zuul] define nodesets for easy reuse * Define the nodeset * Rename the base * Select OCP version with the nodeset * [zuul] Add a login command to get initial kubeconfig file * [stf-run-ci] Add retries to pre-clean * Update galaxy requirements * [ci] Add retry to login command * [ci] Configure kubeconfig for rhol_crc role * Apply suggestions from code review * Zuul: Update how we get the initial kubeconfig (#558) * use ci-framework infra playbook * add make targets to do set-up * link the kubeconfig files * Remove pre-get_kubeconfig.yml; the script is no longer used * [ci] Add common-tasks.yml to cover the tasks that setup every play (#556) * [zuul] Update the labels used for extracted CRC * Remove non-default cifmw_rhol_crc_kubeconfig value --- .zuul.yaml | 128 +++++++++++++++++++---- build/stf-run-ci/tasks/create_builds.yml | 2 - build/stf-run-ci/tasks/pre-clean.yml | 2 + ci/common-tasks.yml | 13 +++ ci/deploy_stf.yml | 14 +-- ci/post-collect_logs.yml | 14 +-- ci/pre-2node.yml | 33 ++++++ ci/prepare.yml | 16 +-- ci/test_stf.yml | 14 +-- 9 files changed, 166 insertions(+), 70 deletions(-) create mode 100644 ci/common-tasks.yml create mode 100644 ci/pre-2node.yml diff --git a/.zuul.yaml b/.zuul.yaml index 994d65327..732fa94fc 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -1,8 +1,83 @@ --- +- nodeset: + name: stf-crc_extracted-ocp412 + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-19-0-xxl + +- nodeset: + name: stf-crc_extracted-ocp413 + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-28-0-xxl + +- nodeset: + name: stf-crc_extracted-ocp414 + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-30-0-xxl + +# Based on the 2-node job cookbook at 
https://github.com/openstack-k8s-operators/ci-framework/blob/main/docs/source/cookbooks/zuul-job-nodeset.md +- job: + name: stf-base-2node + parent: podified-multinode-edpm-deployment-crc + abstract: true + required-projects: + - name: github.com/openstack-k8s-operators/dataplane-operator + override-checkout: main + - name: github.com/openstack-k8s-operators/infra-operator + override-checkout: main + - name: github.com/openstack-k8s-operators/openstack-operator + override-checkout: main + - name: github.com/openstack-k8s-operators/openstack-must-gather + override-checkout: main + pre-run: + - ci/pre-2node.yml + vars: + cifmw_deploy_edpm: false + podified_validation: true + cifmw_run_tests: false + extra-vars: + crc_ci_bootstrap_networking: + networks: + default: + range: 192.168.122.0/24 + mtu: 1500 + internal-api: + vlan: 20 + range: 172.17.0.0/24 + storage: + vlan: 21 + range: 172.18.0.0/24 + tenant: + vlan: 22 + range: 172.19.0.0/24 + instances: + controller: + networks: + default: + ip: 192.168.122.11 + crc: + networks: + default: + ip: 192.168.122.10 + internal-api: + ip: 172.17.0.5 + storage: + ip: 172.18.0.5 + tenant: + ip: 172.19.0.5 + - job: name: stf-base # defined in: https://review.rdoproject.org/cgit/config/tree/zuul.d/_jobs-crc.yaml - parent: base-simple-crc + parent: stf-base-2node abstract: true description: | Run the stf-run-ci role, and then test stf @@ -24,12 +99,8 @@ - ci/test_stf.yml post-run: - ci/post-collect_logs.yml - nodeset: centos-9-crc-xxl # The default (~30 minutes) is not enough to run through all the job stages timeout: 3600 - vars: - # Pass vars to crc cli https://review.rdoproject.org/cgit/config/tree/playbooks/crc/simple-start.yaml#n30 - crc_parameters: '--memory 16000 --disk-size 80 --cpus 6' # Increase from 14336 - job: name: stf-crc-nightly_bundles @@ -63,48 +134,63 @@ parent: stf-crc-nightly_bundles description: | Deploy STF using the nightly bundles on OCP 4.12 - vars: - crc_ocp_bundle: 
'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.12.13/crc_libvirt_4.12.13_amd64.crcbundle' + nodeset: stf-crc_extracted-ocp412 - job: name: stf-crc-ocp_413-nightly_bundles parent: stf-crc-nightly_bundles description: | Deploy STF using the nightly bundles on OCP 4.13 - vars: - crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.13.14/crc_libvirt_4.13.14_amd64.crcbundle' + nodeset: stf-crc_extracted-ocp413 + +- job: + name: stf-crc-ocp_414-nightly_bundles + parent: stf-crc-nightly_bundles + description: | + Deploy STF using the nightly bundles on OCP 4.14 + nodeset: stf-crc_extracted-ocp414 - job: name: stf-crc-ocp_412-local_build parent: stf-crc-local_build description: | Build images locally and deploy STF on OCP 4.12 - vars: - crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.12.13/crc_libvirt_4.12.13_amd64.crcbundle' + nodeset: stf-crc_extracted-ocp412 - job: name: stf-crc-ocp_413-local_build parent: stf-crc-local_build description: | Build images locally and deploy STF on OCP 4.13 - vars: - crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.13.14/crc_libvirt_4.13.14_amd64.crcbundle' + nodeset: stf-crc_extracted-ocp413 + +- job: + name: stf-crc-ocp_414-local_build + parent: stf-crc-local_build + description: | + Build images locally and deploy STF on OCP 4.14 + nodeset: stf-crc_extracted-ocp414 - job: name: stf-crc-ocp_412-local_build-index_deploy parent: stf-crc-local_build-index_deploy description: | Build STF locally and deploy from index on OCP 4.12 - vars: - crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.12.13/crc_libvirt_4.12.13_amd64.crcbundle' + nodeset: stf-crc_extracted-ocp412 - job: name: stf-crc-ocp_413-local_build-index_deploy parent: stf-crc-local_build-index_deploy description: | Build STF locally and deploy from index on OCP 4.13 - vars: - 
crc_ocp_bundle: 'https://mirror.openshift.com/pub/openshift-v4/clients/crc/bundles/openshift/4.13.14/crc_libvirt_4.13.14_amd64.crcbundle' + nodeset: stf-crc_extracted-ocp413 + +- job: + name: stf-crc-ocp_414-local_build-index_deploy + parent: stf-crc-local_build-index_deploy + description: | + Build STF locally and deploy from index on OCP 4.14 + nodeset: stf-crc_extracted-ocp414 - project-template: name: stf-crc-jobs @@ -114,12 +200,16 @@ jobs: - stf-crc-ocp_412-nightly_bundles: voting: false - - stf-crc-ocp_412-local_build - - stf-crc-ocp_412-local_build-index_deploy - stf-crc-ocp_413-nightly_bundles: voting: false + - stf-crc-ocp_414-nightly_bundles: + voting: false + - stf-crc-ocp_412-local_build - stf-crc-ocp_413-local_build + - stf-crc-ocp_414-local_build + - stf-crc-ocp_412-local_build-index_deploy - stf-crc-ocp_413-local_build-index_deploy + - stf-crc-ocp_414-local_build-index_deploy - project: name: infrawatch/service-telemetry-operator diff --git a/build/stf-run-ci/tasks/create_builds.yml b/build/stf-run-ci/tasks/create_builds.yml index e54b77cb9..8d287b121 100644 --- a/build/stf-run-ci/tasks/create_builds.yml +++ b/build/stf-run-ci/tasks/create_builds.yml @@ -23,7 +23,6 @@ - name: Kill first build since it will always fail (triggered on BuildConfig creation) ansible.builtin.shell: sleep 10 ; oc delete build {{ artifact.name }}-1 -n "{{ namespace }}" - ignore_errors: true retries: 3 delay: 10 register: kill_build @@ -34,7 +33,6 @@ ansible.builtin.command: oc start-build {{ artifact.name }} -n "{{ namespace }}" --follow --wait --from-dir "{{ artifact.working_build_dir }}" register: build_results when: build_lookup.resources | length == 0 - ignore_errors: true retries: 3 delay: 10 until: build_results.rc == 0 diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index 712d188bf..87e649ddd 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -135,6 +135,8 @@ - 
openshift-cert-manager-operator - cert-manager-operator - cert-manager + retries: 3 + delay: 10 - name: Remove Elasticsearch ignore_errors: true diff --git a/ci/common-tasks.yml b/ci/common-tasks.yml new file mode 100644 index 000000000..40ff4ad1e --- /dev/null +++ b/ci/common-tasks.yml @@ -0,0 +1,13 @@ +--- +- name: "Set the sto_dir if it isn't already set" + ansible.builtin.set_fact: + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' + when: sto_dir | default('') | length == 0 + +- name: "Get vars common to all jobs" + ansible.builtin.include_vars: + file: "vars-zuul-common.yml" + +- name: "Get scenario-specific vars" + ansible.builtin.include_vars: + file: "vars-{{ scenario }}.yml" diff --git a/ci/deploy_stf.yml b/ci/deploy_stf.yml index cd1e4d2ed..b90683f6a 100644 --- a/ci/deploy_stf.yml +++ b/ci/deploy_stf.yml @@ -2,18 +2,8 @@ - name: "Deploy STF" hosts: controller tasks: - - name: "Set the sto_dir if it isn't already set" - ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' - when: sto_dir | default('') | length == 0 - - - name: "Get vars common to all jobs" - ansible.builtin.include_vars: - file: "vars-zuul-common.yml" - - - name: "Get scenario-specific vars" - ansible.builtin.include_vars: - file: "vars-{{ scenario }}.yml" + - name: "Setup play vars" + ansible.builtin.include_tasks: "common-tasks.yml" - name: "Log into the cluster" ansible.builtin.import_role: diff --git a/ci/post-collect_logs.yml b/ci/post-collect_logs.yml index 11b27f109..c37b512df 100644 --- a/ci/post-collect_logs.yml +++ b/ci/post-collect_logs.yml @@ -14,18 +14,8 @@ name: Collect logs on the controller gather_facts: false tasks: - - name: "Set the sto_dir if it isn't already set" - ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' - when: 
sto_dir | default('') | length == 0 - - - name: "Get vars common to all jobs" - ansible.builtin.include_vars: - file: "vars-zuul-common.yml" - - - name: "Get scenario-specific vars" - ansible.builtin.include_vars: - file: "vars-{{ scenario }}.yml" + - name: "Setup play vars" + ansible.builtin.include_tasks: "common-tasks.yml" - name: "Create log dir" ansible.builtin.file: diff --git a/ci/pre-2node.yml b/ci/pre-2node.yml new file mode 100644 index 000000000..1d44b5f40 --- /dev/null +++ b/ci/pre-2node.yml @@ -0,0 +1,33 @@ +--- +- name: "Do pre-work to get kubeconfig" + hosts: controller + vars: + ci_framework_dir: "{{ ansible_user_dir }}/{{ zuul.projects['github.com/openstack-k8s-operators/ci-framework'].src_dir }}" + environment: + PATH: "~/.crc/bin:~/.crc/bin/oc:~/bin:{{ ansible_env.PATH }}" + tasks: + - name: "Set the sto_dir if it isn't already set" + ansible.builtin.set_fact: + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' + when: sto_dir | default('') | length == 0 + + - name: "Run bootstrap playbook" + ansible.builtin.shell: + cmd: | + ansible-playbook -e@{{ ansible_user_dir }}/ci-framework-data/artifacts/parameters/zuul-params.yml {{ ci_framework_dir }}/playbooks/01-bootstrap.yml + chdir: "{{ ci_framework_dir }}" + + - name: Run ci_framework infra playbook + ansible.builtin.shell: + cmd: | + ansible-playbook -e cifmw_use_opn=false -e cifmw_use_devscripts=false -e cifmw_basedir={{ ansible_user_dir }}/ci-framework-data/ -e cifmw_openshift_setup_skip_internal_registry_tls_verify=true playbooks/02-infra.yml + chdir: "{{ ci_framework_dir }}" + + - name: Run make targets for setup + community.general.make: + chdir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/openstack-k8s-operators/ci-framework"].src_dir }}' + target: "{{ item }}" + with_items: + - setup_tests + - setup_molecule + diff --git a/ci/prepare.yml b/ci/prepare.yml index 02be5114c..9557d34b1 100644 --- a/ci/prepare.yml +++ 
b/ci/prepare.yml @@ -2,18 +2,8 @@ - name: "Prepare the environment for running stf" hosts: controller tasks: - - name: "Set the sto_dir if it isn't already set" - ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' - when: sto_dir | default('') | length == 0 - - - name: "Get vars common to all jobs" - ansible.builtin.include_vars: - file: "vars-zuul-common.yml" - - - name: "Get scenario-specific vars" - ansible.builtin.include_vars: - file: "vars-{{ scenario }}.yml" + - name: "Setup play vars" + ansible.builtin.include_tasks: "common-tasks.yml" - name: "Update pip" ansible.builtin.pip: @@ -33,7 +23,7 @@ name: "{{ item }}" with_items: - "kubernetes.core:2.3.2" - - "community.general:6.2.0" + - "community.general" - name: "Log into the cluster" ansible.builtin.import_role: diff --git a/ci/test_stf.yml b/ci/test_stf.yml index 4fcec7c13..493775a3f 100644 --- a/ci/test_stf.yml +++ b/ci/test_stf.yml @@ -2,18 +2,8 @@ - name: "Run tests to verify that STF runs as expected" hosts: controller tasks: - - name: "Set the sto_dir if it isn't already set" - ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' - when: sto_dir | default('') | length == 0 - - - name: "Get vars common to all jobs" - ansible.builtin.include_vars: - file: "vars-zuul-common.yml" - - - name: "Get scenario-specific vars" - ansible.builtin.include_vars: - file: "vars-{{ scenario }}.yml" + - name: "Setup play vars" + ansible.builtin.include_tasks: "common-tasks.yml" - name: "Log into the cluster" ansible.builtin.import_role: From f2a24ef7bdd6f124f2851f44c39fb254e7d00cac Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Mon, 15 Jan 2024 15:59:36 -0500 Subject: [PATCH 32/70] Implement support for Grafana Operator v5 (#561) * Implement support for Grafana Operator v5 Implement changes to support Grafana Operator v5 when the new 
grafana.integreatly.org CRD is available. Use the new CRDs as default when they are available. Fallover to deploying with Grafana Operator v4 when the Grafana Operator v5 CRDs are not available, thereby providing backwards compatibility to allow administrators time to migrate. Additionally, the polystat plugin has been removed from the rhos-cloud dashboard due to compatibility issues with grafana-cli usage when dynamically loading plugins. Usage of Grafana Operator v5 is also a target for disconnected support, and dynamically loading plugins in these environments is expected to be a problem. Related: OSPRH-2577 Closes: STF-1667 * Default Grafana role set to Admin In order to match the previous (Grafana Operator v4) role, set auto_assign_org_role to the Admin value. Default is Viewer. --- ...emetry-operator.clusterserviceversion.yaml | 1 + deploy/role.yaml | 1 + .../files/rhos-cloud-dashboard.json | 119 ------ .../tasks/component_grafana.yml | 362 ++++++++++++------ roles/servicetelemetry/tasks/main.yml | 8 +- .../manifest_grafana_ds_prometheus.j2 | 24 ++ .../templates/manifest_grafana_v5.j2 | 97 +++++ 7 files changed, 376 insertions(+), 236 deletions(-) create mode 100644 roles/servicetelemetry/templates/manifest_grafana_ds_prometheus.j2 create mode 100644 roles/servicetelemetry/templates/manifest_grafana_v5.j2 diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 63e94c60f..610a279a4 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -365,6 +365,7 @@ spec: - monitoring.coreos.com - monitoring.rhobs - elasticsearch.k8s.elastic.co + - grafana.integreatly.org - integreatly.org resources: - '*' 
diff --git a/deploy/role.yaml b/deploy/role.yaml index 6e22854e4..cdade2ce7 100644 --- a/deploy/role.yaml +++ b/deploy/role.yaml @@ -120,6 +120,7 @@ rules: - monitoring.coreos.com - monitoring.rhobs - elasticsearch.k8s.elastic.co + - grafana.integreatly.org - integreatly.org resources: - '*' diff --git a/roles/servicetelemetry/files/rhos-cloud-dashboard.json b/roles/servicetelemetry/files/rhos-cloud-dashboard.json index d3ed49146..47525b043 100644 --- a/roles/servicetelemetry/files/rhos-cloud-dashboard.json +++ b/roles/servicetelemetry/files/rhos-cloud-dashboard.json @@ -1418,125 +1418,6 @@ "align": false, "alignLevel": null } - }, - { - "collapsed": false, - "datasource": "STFPrometheus", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, - "id": 23, - "panels": [], - "title": "Instances", - "type": "row" - }, - { - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a", - "#4040a0" - ], - "datasource": "STFPrometheus", - "description": "Click instance for drill down view", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 5, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 17, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "polystat": { - "animationSpeed": 2500, - "columnAutoSize": true, - "columns": "", - "defaultClickThrough": "", - "defaultClickThroughNewTab": false, - "defaultClickThroughSanitize": false, - "displayLimit": 100, - "fontAutoColor": true, - "fontAutoScale": true, - "fontSize": 12, - "fontType": "Roboto", - "globalDecimals": 2, - "globalDisplayMode": "all", - "globalDisplayTextTriggeredEmpty": "OK", - "globalOperatorName": "avg", - "globalUnitFormat": "short", - "gradientEnabled": true, - "hexagonSortByDirection": 1, - "hexagonSortByField": "name", - "maxMetrics": 0, - "polygonBorderColor": "black", - "polygonBorderSize": 2, - "polygonGlobalFillColor": "#FFF899", - "radius": "", - 
"radiusAutoSize": true, - "rowAutoSize": true, - "rows": "", - "shape": "hexagon_pointed_top", - "tooltipDisplayMode": "all", - "tooltipDisplayTextTriggeredEmpty": "OK", - "tooltipFontSize": 12, - "tooltipFontType": "Roboto", - "tooltipPrimarySortDirection": 2, - "tooltipPrimarySortField": "thresholdLevel", - "tooltipSecondarySortDirection": 2, - "tooltipSecondarySortField": "value", - "tooltipTimestampEnabled": true, - "valueEnabled": true - }, - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "repeat": "projects", - "repeatDirection": "h", - "savedComposites": [], - "savedOverrides": [], - "targets": [ - { - "exemplar": true, - "expr": "sum by (resource, plugin_instance) (label_replace(collectd_virt_memory{service=~\".+-$clouds-.+\"}, \"resource\", \"$1\", \"host\", \".+:(.+):.+\")) + on(resource) group_right(plugin_instance) ceilometer_cpu{project=\"$projects\", service=~\".+-$clouds-.+\"}", - "instant": true, - "interval": "", - "legendFormat": "{{plugin_instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Project $projects", - "type": "grafana-polystat-panel", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ] } ], "refresh": "1m", diff --git a/roles/servicetelemetry/tasks/component_grafana.yml b/roles/servicetelemetry/tasks/component_grafana.yml index e5c9ba989..7bdd939ee 100644 --- a/roles/servicetelemetry/tasks/component_grafana.yml +++ b/roles/servicetelemetry/tasks/component_grafana.yml @@ -1,30 +1,9 @@ -- name: Construct oauth redirect reference - set_fact: - grafana_oauth_redir_ref: - kind: OAuthRedirectReference - apiVersion: v1 - reference: - kind: Route - name: 'grafana-route' - -- name: Lookup template - debug: - msg: "{{ lookup('template', './manifest_grafana.j2') | from_yaml }}" - -- name: Set default Grafana manifest - set_fact: - grafana_manifest: "{{ lookup('template', './manifest_grafana.j2') | from_yaml }}" - when: grafana_manifest is not defined - 
-- name: Create an instance of Grafana - k8s: - state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' - definition: - '{{ grafana_manifest }}' - +# dashboard setup first looks for Grafana Operator v5 CRDs. If existing, prefer setup with v5. +# If v5 doesn't exist, then try v4. Don't create objects for v4 if v5 CRDs exist. - when: servicetelemetry_vars.graphing.enabled block: - when: servicetelemetry_vars.backends.metrics.prometheus.enabled + name: Get auth data for datasources to Prometheus block: - name: Retrieve configmap for OAUTH CA certs k8s_info: @@ -47,94 +26,247 @@ set_fact: prometheus_reader_token: '{{ prometheus_reader_secret.resources[0].data.token | b64decode }}' - # Lookup existing datasources - - name: Remove legacy datasources - k8s: - api_version: integreatly.org/v1alpha1 - name: '{{ ansible_operator_meta.name }}-ds-prometheus' - kind: GrafanaDataSource - namespace: '{{ ansible_operator_meta.namespace }}' - state: absent - - # NOTE: this can fail if you enable grafana without prometheus due to missing resources referenced in the template - - name: Set datasources - set_fact: - ds_manifest: "{{ lookup('template', './manifest_grafana_ds.j2') | from_yaml }}" - when: ds_manifest is not defined - - - name: Create the datasources - k8s: - state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' - definition: - '{{ ds_manifest }}' - - - name: Load Cloud Overview Dashboard - k8s: - state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' - definition: - apiVersion: integreatly.org/v1alpha1 - kind: GrafanaDashboard - metadata: - labels: - app: grafana - stf_owner: "{{ ansible_operator_meta.name }}" - name: rhos-cloud-dashboard-1 - namespace: "{{ ansible_operator_meta.namespace }}" - spec: - name: rhos-cloud-dashboard.json - plugins: - - name: grafana-polystat-panel - version: "1.2.11" - json: | - {{ lookup('file', 'rhos-cloud-dashboard.json') | string }} - - - name: 
Load Infrastructure Overview Dashboard - k8s: - state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' - definition: - apiVersion: integreatly.org/v1alpha1 - kind: GrafanaDashboard - metadata: - labels: - app: grafana - stf_owner: "{{ ansible_operator_meta.name }}" - name: rhos-dashboard-1 - namespace: "{{ ansible_operator_meta.namespace }}" - spec: - name: rhos-dashboard.json - json: | - {{ lookup('file', 'rhos-dashboard.json') | string }} - - - name: Load Memcached Dashboard - k8s: - state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' - definition: - apiVersion: integreatly.org/v1alpha1 - kind: GrafanaDashboard - metadata: - labels: - app: grafana - stf_owner: "{{ ansible_operator_meta.name }}" - name: memcached-dashboard-1 - namespace: "{{ ansible_operator_meta.namespace }}" - spec: - name: memcached-dashboard.json - json: | - {{ lookup('file', 'memcached-dashboard.json') | string }} - - - name: Load Virtual Machine View Dashboard - k8s: - state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' - definition: - apiVersion: integreatly.org/v1alpha1 - kind: GrafanaDashboard - metadata: - labels: - app: grafana - stf_owner: "{{ ansible_operator_meta.name }}" - name: virtual-machine-dashboard-1 - namespace: "{{ ansible_operator_meta.namespace }}" - spec: - name: virtual-machine-view.json - json: | - {{ lookup('file', 'virtual-machine-view.json') | string }} +#---- deploy Grafana with v5 Operator (preferred) + - when: has_grafana_integreatly_api + name: Deploying with Grafana Operator v5 + block: + - name: Construct oauth redirect reference + set_fact: + grafana_oauth_redir_ref: + kind: OAuthRedirectReference + apiVersion: v1 + reference: + kind: Route + name: '{{ ansible_operator_meta.name }}-grafana-route' + + - name: Lookup template + debug: + msg: "{{ lookup('template', './manifest_grafana_v5.j2') | from_yaml }}" + + - name: Set 
default Grafana manifest (Grafana Operator v5) + set_fact: + grafana_manifest: "{{ lookup('template', './manifest_grafana_v5.j2') | from_yaml }}" + when: grafana_manifest is not defined + + - name: Create an instance of Grafana (Grafana Operator v5) + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' + definition: + '{{ grafana_manifest }}' + + # NOTE: we only provide events forwarding with STF. We don't use events + # in dashboards, so there is no need to create an Elasticsearch + # datasource. + - when: servicetelemetry_vars.backends.metrics.prometheus.enabled + name: Create Grafana datasource for Prometheus + block: + - name: Set datasource for Prometheus + set_fact: + ds_manifest: "{{ lookup('template', './manifest_grafana_ds_prometheus.j2') | from_yaml }}" + when: ds_manifest is not defined + + - name: Create the datasource for Prometheus + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' + definition: + '{{ ds_manifest }}' + + - name: Load Cloud Overview Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: grafana.integreatly.org/v1beta1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: rhos-cloud-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + instanceSelector: + matchLabels: + dashboards: "stf" + name: rhos-cloud-dashboard.json + json: | + {{ lookup('file', 'rhos-cloud-dashboard.json') | string }} + + - name: Load Infrastructure Overview Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: grafana.integreatly.org/v1beta1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: rhos-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace 
}}" + spec: + instanceSelector: + matchLabels: + dashboards: "stf" + name: rhos-dashboard.json + json: | + {{ lookup('file', 'rhos-dashboard.json') | string }} + + - name: Load Memcached Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: grafana.integreatly.org/v1beta1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: memcached-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + instanceSelector: + matchLabels: + dashboards: "stf" + name: memcached-dashboard.json + json: | + {{ lookup('file', 'memcached-dashboard.json') | string }} + + - name: Load Virtual Machine View Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: grafana.integreatly.org/v1beta1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: virtual-machine-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + instanceSelector: + matchLabels: + dashboards: "stf" + name: virtual-machine-view.json + json: | + {{ lookup('file', 'virtual-machine-view.json') | string }} + +#---- deploy Grafana with v4 Operator if v5 CRDs are not available (legacy deployments) + - when: has_integreatly_api and not has_grafana_integreatly_api + name: Deploying with Grafana Operator v4 + block: + - name: Construct oauth redirect reference + set_fact: + grafana_oauth_redir_ref: + kind: OAuthRedirectReference + apiVersion: v1 + reference: + kind: Route + name: 'grafana-route' + + - name: Lookup template + debug: + msg: "{{ lookup('template', './manifest_grafana.j2') | from_yaml }}" + + - name: Set default Grafana manifest (Grafana Operator v4) + set_fact: + grafana_manifest: "{{ lookup('template', './manifest_grafana.j2') | from_yaml }}" + when: grafana_manifest is not 
defined + + - name: Create an instance of Grafana (Grafana Operator v4) + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' + definition: + '{{ grafana_manifest }}' + + - name: Remove legacy datasources + k8s: + api_version: integreatly.org/v1alpha1 + name: '{{ ansible_operator_meta.name }}-ds-prometheus' + kind: GrafanaDataSource + namespace: '{{ ansible_operator_meta.namespace }}' + state: absent + + # NOTE: This can fail if you enable grafana without prometheus due + # to missing resources referenced in the template. The v1alpha1 CRD + # of GrafanaDatasources uses a list, so logic would need to be + # added to the template directly checking for parameters set in + # ServiceTelemetry. + - name: Set datasources + set_fact: + ds_manifest: "{{ lookup('template', './manifest_grafana_ds.j2') | from_yaml }}" + when: ds_manifest is not defined + + - name: Create the datasources + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' + definition: + '{{ ds_manifest }}' + + - name: Load Cloud Overview Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: rhos-cloud-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: rhos-cloud-dashboard.json + json: | + {{ lookup('file', 'rhos-cloud-dashboard.json') | string }} + + - name: Load Infrastructure Overview Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: rhos-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: rhos-dashboard.json + 
json: | + {{ lookup('file', 'rhos-dashboard.json') | string }} + + - name: Load Memcached Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: memcached-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: memcached-dashboard.json + json: | + {{ lookup('file', 'memcached-dashboard.json') | string }} + + - name: Load Virtual Machine View Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: virtual-machine-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: virtual-machine-view.json + json: | + {{ lookup('file', 'virtual-machine-view.json') | string }} diff --git a/roles/servicetelemetry/tasks/main.yml b/roles/servicetelemetry/tasks/main.yml index dc3e881c7..3bef4a10f 100644 --- a/roles/servicetelemetry/tasks/main.yml +++ b/roles/servicetelemetry/tasks/main.yml @@ -87,12 +87,16 @@ loop_var: this_cloud # --> graphing -- name: Check if we have integreatly.org API +- name: Check if we have integreatly.org API (Grafana Operator v4) set_fact: has_integreatly_api: "{{ True if 'integreatly.org' in api_groups else False }}" +- name: Check if we have grafana.integreatly.org API (Grafana Operator v5) + set_fact: + has_grafana_integreatly_api: "{{ True if 'grafana.integreatly.org' in api_groups else False }}" + - when: - - has_integreatly_api | bool + - (has_integreatly_api | bool) or (has_grafana_integreatly_api | bool) name: Start graphing component plays include_tasks: component_grafana.yml diff --git 
a/roles/servicetelemetry/templates/manifest_grafana_ds_prometheus.j2 b/roles/servicetelemetry/templates/manifest_grafana_ds_prometheus.j2 new file mode 100644 index 000000000..473389cf8 --- /dev/null +++ b/roles/servicetelemetry/templates/manifest_grafana_ds_prometheus.j2 @@ -0,0 +1,24 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: {{ ansible_operator_meta.name }}-ds-stf-prometheus + namespace: {{ ansible_operator_meta.namespace }} +spec: + instanceSelector: + matchLabels: + dashboards: "stf" + datasource: + name: STFPrometheus + type: prometheus + access: proxy + url: 'https://{{ ansible_operator_meta.name }}-prometheus-proxy.{{ ansible_operator_meta.namespace }}.svc:9092' + isDefault: true + editable: true + jsonData: + 'timeInterval': "5s" + 'tlsAuthWithCACert': true + 'httpHeaderName1': 'Authorization' + secureJsonData: + 'httpHeaderValue1': 'Bearer {{prometheus_reader_token}}' + 'tlsCACert': | + {{ serving_certs_ca.resources[0].data['service-ca.crt'] | indent(8) }} diff --git a/roles/servicetelemetry/templates/manifest_grafana_v5.j2 b/roles/servicetelemetry/templates/manifest_grafana_v5.j2 new file mode 100644 index 000000000..278e452ff --- /dev/null +++ b/roles/servicetelemetry/templates/manifest_grafana_v5.j2 @@ -0,0 +1,97 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + name: {{ ansible_operator_meta.name }}-grafana + namespace: {{ ansible_operator_meta.namespace }} + labels: + dashboards: "stf" +spec: + serviceAccount: + metadata: + annotations: + serviceaccounts.openshift.io/oauth-redirectreference.primary: '{{ grafana_oauth_redir_ref | to_json }}' +{% if servicetelemetry_vars.graphing.grafana.ingress_enabled is defined and servicetelemetry_vars.graphing.grafana.ingress_enabled %} + route: + spec: + port: + targetPort: web + tls: + termination: reencrypt + to: + kind: Service + name: {{ ansible_operator_meta.name }}-grafana-service + weight: 100 + wildcardPolicy: None +{% endif %} + 
client: + preferIngress: false + config: + auth: + disable_signout_menu: "{{ servicetelemetry_vars.graphing.grafana.disable_signout_menu }}" + disable_login_form: "True" + auth.anonymous: + enabled: "True" + auth.proxy: + enabled: "True" + enable_login_token: "True" + header_property: "username" + header_name: "X-Forwarded-User" + log: + level: warn + mode: "console" + users: + auto_assign_org_role: Admin + deployment: + spec: + template: + spec: + volumes: + - name: 'secret-{{ ansible_operator_meta.name }}-grafana-proxy-tls' + secret: + secretName: '{{ ansible_operator_meta.name }}-grafana-proxy-tls' + - name: 'secret-{{ ansible_operator_meta.name }}-session-secret' + secret: + secretName: '{{ ansible_operator_meta.name }}-session-secret' + containers: + - name: oauth-proxy + image: {{ oauth_proxy_image }} + args: + - '-provider=openshift' + - '-pass-basic-auth=false' + - '-https-address=:3002' + - '-http-address=' + - '-email-domain=*' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafana", "group":"grafana.integreatly.org", "verb":"get"}' + - '-upstream=http://localhost:3000' + - '-tls-cert=/etc/tls/private/tls.crt' + - '-tls-key=/etc/tls/private/tls.key' + - '-client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token' + - '-cookie-secret-file=/etc/proxy/secrets/session_secret' + - '-openshift-service-account={{ ansible_operator_meta.name }}-grafana-sa' + - '-openshift-ca=/etc/pki/tls/cert.pem' + - '-openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt' + - '-skip-auth-regex=^/metrics' + ports: + - containerPort: 3002 + name: https + protocol: TCP + resources: { } + volumeMounts: + - mountPath: /etc/tls/private + name: secret-{{ ansible_operator_meta.name }}-grafana-proxy-tls + - mountPath: /etc/proxy/secrets + name: secret-{{ ansible_operator_meta.name }}-session-secret +{% if servicetelemetry_vars.graphing.grafana.base_image is defined %} + - name: grafana + image: {{ 
servicetelemetry_vars.graphing.grafana.base_image }} +{% endif %} + service: + metadata: + annotations: + service.alpha.openshift.io/serving-cert-secret-name: {{ ansible_operator_meta.name }}-grafana-proxy-tls + spec: + ports: + - name: web + port: 3002 + protocol: TCP + targetPort: https From 20fb41094d43c2839912eeebe5a8eaba70fad549 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Mon, 15 Jan 2024 17:33:18 -0500 Subject: [PATCH 33/70] Remove old vendored operator_sdk/util collection (#563) Remove the old 0.1.0 vendored collection operator_sdk/util from the upstream Dockerfile and repository. Instead use the default operator_sdk/util in the base image which is a newer version of 0.4.0. We only use the util collection for one call to k8s_status when ephemeral storage is enabled. The newer collection also provides a k8s_event module which could be useful in the future. Closes: STF-1683 --- build/Dockerfile | 1 - .../operator_sdk/util/.gitignore | 1 - .../operator_sdk/util/FILES.json | 89 ---- .../operator_sdk/util/LICENSE | 201 --------- .../operator_sdk/util/MANIFEST.json | 36 -- .../operator_sdk/util/README.md | 40 -- .../operator_sdk/util/demo/README.md | 26 -- .../operator_sdk/util/demo/playbook.yml | 15 - .../operator_sdk/util/plugins/README.md | 31 -- .../util/plugins/modules/k8s_status.py | 404 ------------------ .../util/plugins/modules/requeue_after.py | 93 ---- 11 files changed, 937 deletions(-) delete mode 100644 collections/ansible_collections/operator_sdk/util/.gitignore delete mode 100644 collections/ansible_collections/operator_sdk/util/FILES.json delete mode 100644 collections/ansible_collections/operator_sdk/util/LICENSE delete mode 100644 collections/ansible_collections/operator_sdk/util/MANIFEST.json delete mode 100644 collections/ansible_collections/operator_sdk/util/README.md delete mode 100644 collections/ansible_collections/operator_sdk/util/demo/README.md delete mode 100644 collections/ansible_collections/operator_sdk/util/demo/playbook.yml 
delete mode 100644 collections/ansible_collections/operator_sdk/util/plugins/README.md delete mode 100644 collections/ansible_collections/operator_sdk/util/plugins/modules/k8s_status.py delete mode 100644 collections/ansible_collections/operator_sdk/util/plugins/modules/requeue_after.py diff --git a/build/Dockerfile b/build/Dockerfile index da2b7508f..a236fb9c8 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -18,4 +18,3 @@ USER 1001 # copy in required artifacts for the operator COPY watches.yaml ${HOME}/watches.yaml COPY roles/ ${HOME}/roles/ -COPY collections/ ${HOME}/.ansible/collections/ diff --git a/collections/ansible_collections/operator_sdk/util/.gitignore b/collections/ansible_collections/operator_sdk/util/.gitignore deleted file mode 100644 index dd4b63d94..000000000 --- a/collections/ansible_collections/operator_sdk/util/.gitignore +++ /dev/null @@ -1 +0,0 @@ -operator_sdk-util-*.tar.gz diff --git a/collections/ansible_collections/operator_sdk/util/FILES.json b/collections/ansible_collections/operator_sdk/util/FILES.json deleted file mode 100644 index df11b5cf5..000000000 --- a/collections/ansible_collections/operator_sdk/util/FILES.json +++ /dev/null @@ -1,89 +0,0 @@ -{ - "files": [ - { - "name": ".", - "ftype": "dir", - "chksum_type": null, - "chksum_sha256": null, - "format": 1 - }, - { - "name": ".gitignore", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "1e87175a024a4cf4bf7b3a5fa623d046351a591d5cee549d8ca8c30ec669a013", - "format": 1 - }, - { - "name": "plugins", - "ftype": "dir", - "chksum_type": null, - "chksum_sha256": null, - "format": 1 - }, - { - "name": "plugins/modules", - "ftype": "dir", - "chksum_type": null, - "chksum_sha256": null, - "format": 1 - }, - { - "name": "plugins/modules/requeue_after.py", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "16da24bb189ab5b48a2071ae21449bbd4ee332787ed62c24c094acbe64e7248e", - "format": 1 - }, - { - "name": "plugins/modules/k8s_status.py", - "ftype": 
"file", - "chksum_type": "sha256", - "chksum_sha256": "fabd8a42babf96433569e4ff9887c9a163d7adab68df94d4712f7d6dbb8c1030", - "format": 1 - }, - { - "name": "plugins/README.md", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "58b5a167904c91786df167dd097ab76aed73ffba6cc746a3624c2a5bbf62ef6f", - "format": 1 - }, - { - "name": "LICENSE", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "c71d239df91726fc519c6eb72d318ec65820627232b2f796219e87dcf35d0ab4", - "format": 1 - }, - { - "name": "README.md", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "2d8e64d77e0a8202ce2ec6dd36b30df06840e41ab9ade86b3a55908181d322b2", - "format": 1 - }, - { - "name": "demo", - "ftype": "dir", - "chksum_type": null, - "chksum_sha256": null, - "format": 1 - }, - { - "name": "demo/playbook.yml", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "fe562128e0c234462e315568b7ef38657d601976c74a87d903c31d8ddc4ff907", - "format": 1 - }, - { - "name": "demo/README.md", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "5b6b9137f90122c4fac758a5089cd01bfdc7cf007a379e21ceb3ff1f82aafd55", - "format": 1 - } - ], - "format": 1 -} \ No newline at end of file diff --git a/collections/ansible_collections/operator_sdk/util/LICENSE b/collections/ansible_collections/operator_sdk/util/LICENSE deleted file mode 100644 index 261eeb9e9..000000000 --- a/collections/ansible_collections/operator_sdk/util/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/collections/ansible_collections/operator_sdk/util/MANIFEST.json b/collections/ansible_collections/operator_sdk/util/MANIFEST.json deleted file mode 100644 index 6a4b9836c..000000000 --- a/collections/ansible_collections/operator_sdk/util/MANIFEST.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "collection_info": { - "namespace": "operator_sdk", - "name": "util", - "version": "0.1.0", - "authors": [ - "Austin Macdonald ", - "Fabian von Feilitzsch ", - "Venkat Ramaraju " - ], - "readme": "README.md", - "tags": [ - "operator_sdk", - "kubernetes", - "k8s", - "k8s_status", - "ansible_operator" - ], - "description": "This is a collection of Ansible assets used by the Operator SDK. 
https://github.com/operator-framework/operator-sdk", - "license": [], - "license_file": "LICENSE", - "dependencies": {}, - "repository": "https://github.com/operator-framework/operator-sdk-ansible-util", - "documentation": "https://github.com/operator-framework/operator-sdk-ansible-util", - "homepage": "https://github.com/operator-framework/operator-sdk-ansible-util", - "issues": "https://github.com/operator-framework/operator-sdk-ansible-util/issues" - }, - "file_manifest_file": { - "name": "FILES.json", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "d7240f7df82fd9bfe60801b40703ad185932cba629271a99aad3657406c81eb0", - "format": 1 - }, - "format": 1 -} \ No newline at end of file diff --git a/collections/ansible_collections/operator_sdk/util/README.md b/collections/ansible_collections/operator_sdk/util/README.md deleted file mode 100644 index f4140831c..000000000 --- a/collections/ansible_collections/operator_sdk/util/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Ansible Collection - operator_sdk.util - -A collection of Ansible assets for use with Ansible-based operators -built with the [operator-sdk](https://github.com/operator-framework/operator-sdk/). - - https://galaxy.ansible.com/operator_sdk/util - - -## Installation - -#### From Galaxy - -``` -ansible-galaxy collection install operator_sdk.util -``` - -#### Local - -``` -ansible-galaxy collection install operator_sdk-util-0.0.1.tar.gz -p ~/.ansible/collections -``` - -## Developer Docs - -### Build and Publish Collection - -Before building the collection, edit `galaxy.yml` and update the -version. 
- -**Build the collection:** - -``` -$ ansible-galaxy collection build -``` - -**Publish the collection:** - -``` -ansible-galaxy collection publish operator_sdk-util-0.0.0.tar.gz --api-key=$GALAXY_API_KEY -``` diff --git a/collections/ansible_collections/operator_sdk/util/demo/README.md b/collections/ansible_collections/operator_sdk/util/demo/README.md deleted file mode 100644 index 5ccc732e5..000000000 --- a/collections/ansible_collections/operator_sdk/util/demo/README.md +++ /dev/null @@ -1,26 +0,0 @@ -ansible-playbook -i localhost demo/playbook.yml - -```yaml -$ kubectl get memcacheds example-memcached -o yaml - -apiVersion: cache.example.com/v1alpha1 -kind: Memcached - name: example-memcached - namespace: default - selfLink: /apis/cache.example.com/v1alpha1/namespaces/default/memcacheds/example-memcached - uid: 2a94ff2b-84e0-40ce-8b5e-2b7e4d2bc0e2 -status: - conditions: - - ansibleResult: - changed: 0 - completion: 2019-10-16T13:23:21.64021 - failures: 0 - ok: 3 - skipped: 0 - lastTransitionTime: "2019-10-15T13:26:58Z" - message: Awaiting next reconciliation - reason: Successful - status: "True" - type: Running - diditwork: why yes it did -``` diff --git a/collections/ansible_collections/operator_sdk/util/demo/playbook.yml b/collections/ansible_collections/operator_sdk/util/demo/playbook.yml deleted file mode 100644 index 8127ec5de..000000000 --- a/collections/ansible_collections/operator_sdk/util/demo/playbook.yml +++ /dev/null @@ -1,15 +0,0 @@ -- hosts: localhost - # Syntax option 1 - collections: - - operator_sdk.util - tasks: - - k8s_status: - # Syntax option 2 - # tasks: - # - operator_sdk.util.k8s_status: - api_version: cache.example.com/v1alpha1 - kind: Memcached - name: example-memcached - namespace: default - status: - diditwork: "yes it did" diff --git a/collections/ansible_collections/operator_sdk/util/plugins/README.md b/collections/ansible_collections/operator_sdk/util/plugins/README.md deleted file mode 100644 index 7e9e2f1fc..000000000 --- 
a/collections/ansible_collections/operator_sdk/util/plugins/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Collections Plugins Directory - -This directory can be used to ship various plugins inside an Ansible collection. Each plugin is placed in a folder that -is named after the type of plugin it is in. It can also include the `module_utils` and `modules` directory that -would contain module utils and modules respectively. - -Here is an example directory of the majority of plugins currently supported by Ansible: - -``` -└── plugins - ├── action - ├── become - ├── cache - ├── callback - ├── cliconf - ├── connection - ├── filter - ├── httpapi - ├── inventory - ├── lookup - ├── module_utils - ├── modules - ├── netconf - ├── shell - ├── strategy - ├── terminal - ├── test - └── vars -``` - -A full list of plugin types can be found at [Working With Plugins](https://docs.ansible.com/ansible/devel/plugins/plugins.html). \ No newline at end of file diff --git a/collections/ansible_collections/operator_sdk/util/plugins/modules/k8s_status.py b/collections/ansible_collections/operator_sdk/util/plugins/modules/k8s_status.py deleted file mode 100644 index ed7827db7..000000000 --- a/collections/ansible_collections/operator_sdk/util/plugins/modules/k8s_status.py +++ /dev/null @@ -1,404 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -from __future__ import absolute_import, division, print_function - -import re -import copy - -from ansible.module_utils.k8s.common import AUTH_ARG_SPEC, COMMON_ARG_SPEC, KubernetesAnsibleModule - -try: - from openshift.dynamic.exceptions import DynamicApiError -except ImportError as exc: - class KubernetesException(Exception): - pass - - -__metaclass__ = type - -ANSIBLE_METADATA = {'metadata_version': '1.1', - 'status': ['preview'], - 'supported_by': 'community'} - -DOCUMENTATION = ''' - -module: k8s_status - -short_description: Update the status for a Kubernetes API resource - -version_added: "2.7" - -author: "Fabian von Feilitzsch (@fabianvf)" - 
-description: - - Sets the status field on a Kubernetes API resource. Only should be used if you are using Ansible to - implement a controller for the resource being modified. - -options: - status: - type: dict - description: - - A object containing `key: value` pairs that will be set on the status object of the specified resource. - - One of I(status) or I(conditions) is required. - conditions: - type: list - description: - - A list of condition objects that will be set on the status.conditions field of the specified resource. - - Unless I(force) is C(true) the specified conditions will be merged with the conditions already set on the status field of the specified resource. - - Each element in the list will be validated according to the conventions specified in the - [Kubernetes API conventions document](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status). - - 'The fields supported for each condition are: - `type` (required), - `status` (required, one of "True", "False", "Unknown"), - `reason` (single CamelCase word), - `message`, - `lastHeartbeatTime` (RFC3339 datetime string), and - `lastTransitionTime` (RFC3339 datetime string).' - - One of I(status) or I(conditions) is required.' - api_version: - description: - - Use to specify the API version. Use in conjunction with I(kind), I(name), and I(namespace) to identify a - specific object. - required: yes - aliases: - - api - - version - kind: - description: - - Use to specify an object model. Use in conjunction with I(api_version), I(name), and I(namespace) to identify a - specific object. - required: yes - name: - description: - - Use to specify an object name. Use in conjunction with I(api_version), I(kind) and I(namespace) to identify a - specific object. - required: yes - namespace: - description: - - Use to specify an object namespace. Use in conjunction with I(api_version), I(kind), and I(name) - to identify a specific object. 
- force: - description: - - If set to C(True), the status will be set using `PUT` rather than `PATCH`, replacing the full status object. - default: false - type: bool - host: - description: - - Provide a URL for accessing the API. Can also be specified via K8S_AUTH_HOST environment variable. - api_key: - description: - - Token used to authenticate with the API. Can also be specified via K8S_AUTH_API_KEY environment variable. - kubeconfig: - description: - - Path to an instance Kubernetes config file. If not provided, and no other connection - options are provided, the openshift client will attempt to load the default - configuration file from I(~/.kube/config.json). Can also be specified via K8S_AUTH_KUBECONFIG environment - variable. - context: - description: - - The name of a context found in the config file. Can also be specified via K8S_AUTH_CONTEXT environment variable. - username: - description: - - Provide a username for authenticating with the API. Can also be specified via K8S_AUTH_USERNAME environment - variable. - password: - description: - - Provide a password for authenticating with the API. Can also be specified via K8S_AUTH_PASSWORD environment - variable. - cert_file: - description: - - Path to a certificate used to authenticate with the API. Can also be specified via K8S_AUTH_CERT_FILE environment - variable. - key_file: - description: - - Path to a key file used to authenticate with the API. Can also be specified via K8S_AUTH_KEY_FILE environment - variable. - ssl_ca_cert: - description: - - Path to a CA certificate used to authenticate with the API. Can also be specified via K8S_AUTH_SSL_CA_CERT - environment variable. - verify_ssl: - description: - - "Whether or not to verify the API server's SSL certificates. Can also be specified via K8S_AUTH_VERIFY_SSL - environment variable." 
- type: bool - -requirements: - - "python >= 3.7" - - "openshift >= 0.8.1" - - "PyYAML >= 3.11" -''' - -EXAMPLES = ''' -- name: Set custom status fields on TestCR - k8s_status: - api_version: apps.example.com/v1alpha1 - kind: TestCR - name: my-test - namespace: testing - status: - hello: world - custom: entries - -- name: Update the standard condition of an Ansible Operator - k8s_status: - api_version: apps.example.com/v1alpha1 - kind: TestCR - name: my-test - namespace: testing - conditions: - - type: Running - status: "True" - reason: MigrationStarted - message: "Migration from v2 to v3 has begun" - lastTransitionTime: "{{ ansible_date_time.iso8601 }}" - -- name: | - Create custom conditions. WARNING: The default Ansible Operator status management - will never overwrite custom conditions, so they will persist indefinitely. If you - want the values to change or be removed, you will need to clean them up manually. - k8s_status: - conditions: - - type: Available - status: "False" - reason: PingFailed - message: "The service did not respond to a ping" - -''' - -RETURN = ''' -result: - description: - - If a change was made, will return the patched object, otherwise returns the instance object. - returned: success - type: complex - contains: - api_version: - description: The versioned schema of this representation of an object. - returned: success - type: str - kind: - description: Represents the REST resource this object represents. - returned: success - type: str - metadata: - description: Standard object metadata. Includes name, namespace, annotations, labels, etc. - returned: success - type: complex - spec: - description: Specific attributes of the object. Will vary based on the I(api_version) and I(kind). - returned: success - type: complex - status: - description: Current status details for the object. 
- returned: success - type: complex -''' - - -def condition_array(conditions): - - VALID_KEYS = ['type', 'status', 'reason', 'message', 'lastHeartbeatTime', 'lastTransitionTime'] - REQUIRED = ['type', 'status'] - CAMEL_CASE = re.compile(r'^(?:[A-Z]*[a-z]*)+$') - RFC3339_datetime = re.compile(r'^\d{4}-\d\d-\d\dT\d\d:\d\d(:\d\d)?(\.\d+)?(([+-]\d\d:\d\d)|Z)$') - - def validate_condition(condition): - if not isinstance(condition, dict): - raise ValueError('`conditions` must be a list of objects') - if isinstance(condition.get('status'), bool): - condition['status'] = 'True' if condition['status'] else 'False' - - for key in condition.keys(): - if key not in VALID_KEYS: - raise ValueError('{} is not a valid field for a condition, accepted fields are {}'.format(key, VALID_KEYS)) - for key in REQUIRED: - if not condition.get(key): - raise ValueError('Condition `{}` must be set'.format(key)) - - if condition['status'] not in ['True', 'False', 'Unknown']: - raise ValueError('Condition `status` must be one of ["True", "False", "Unknown"], not {}'.format(condition['status'])) - - if condition.get('reason') and not re.match(CAMEL_CASE, condition['reason']): - raise ValueError('Condition `reason` must be a single, CamelCase word') - - for key in ['lastHeartBeatTime', 'lastTransitionTime']: - if condition.get(key) and not re.match(RFC3339_datetime, condition[key]): - raise ValueError('`{}` must be a RFC3339 compliant datetime string'.format(key)) - - return condition - - return [validate_condition(c) for c in conditions] - - -STATUS_ARG_SPEC = { - 'status': { - 'type': 'dict', - 'required': False - }, - 'conditions': { - 'type': condition_array, - 'required': False - } -} - - -def main(): - KubernetesAnsibleStatusModule().execute_module() - - -class KubernetesAnsibleStatusModule(KubernetesAnsibleModule): - - def __init__(self, *args, **kwargs): - KubernetesAnsibleModule.__init__( - self, *args, - supports_check_mode=True, - **kwargs - ) - self.kind = self.params.get('kind') - 
self.api_version = self.params.get('api_version') - self.name = self.params.get('name') - self.namespace = self.params.get('namespace') - self.force = self.params.get('force') - - self.status = self.params.get('status') or {} - self.conditions = self.params.get('conditions') or [] - - if self.conditions and self.status and self.status.get('conditions'): - raise ValueError("You cannot specify conditions in both the `status` and `conditions` parameters") - - if self.conditions: - self.status['conditions'] = self.conditions - - def execute_module(self): - self.client = self.get_api_client() - - resource = self.find_resource(self.kind, self.api_version, fail=True) - if 'status' not in resource.subresources: - self.fail_json(msg='Resource {}.{} does not support the status subresource'.format(resource.api_version, resource.kind)) - - try: - instance = resource.get(name=self.name, namespace=self.namespace).to_dict() - except DynamicApiError as exc: - self.fail_json(msg='Failed to retrieve requested object: {0}'.format(exc), - error=exc.summary()) - # Make sure status is at least initialized to an empty dict - instance['status'] = instance.get('status', {}) - - if self.force: - self.exit_json(**self.replace(resource, instance)) - else: - self.exit_json(**self.patch(resource, instance)) - - def replace(self, resource, instance): - if self.status == instance['status']: - return {'result': instance, 'changed': False} - instance['status'] = self.status - try: - result = resource.status.replace(body=instance).to_dict(), - except DynamicApiError as exc: - self.fail_json(msg='Failed to replace status: {}'.format(exc), error=exc.summary()) - - return { - 'result': result, - 'changed': True - } - - def clean_last_transition_time(self, status): - '''clean_last_transition_time removes lastTransitionTime attribute from each status.conditions[*] (from old conditions). - It returns copy of status with updated conditions. 
Copy of status is returned, because if new conditions - are subset of old conditions, then module would return conditions without lastTransitionTime. Updated status - should be used only for check in object_contains function, not for next updates, because otherwise it can create - a mess with lastTransitionTime attribute. - - If new conditions don't contain lastTransitionTime and they are different from old conditions - (e.g. they have different status), conditions are updated and kubernetes should sets lastTransitionTime - field during update. If new conditions contain lastTransitionTime, then conditions are updated. - - Parameters: - status (dict): dictionary, which contains conditions list - - Returns: - dict: copy of status with updated conditions - ''' - updated_old_status = copy.deepcopy(status) - - for item in updated_old_status.get('conditions', []): - if 'lastTransitionTime' in item: - del item['lastTransitionTime'] - - return updated_old_status - - def patch(self, resource, instance): - # Remove lastTransitionTime from status.conditions[*] and use updated_old_status only for check in object_contains function. - # Updates of conditions should be done only with original data not with updated_old_status. 
- updated_old_status = self.clean_last_transition_time(instance['status']) - if self.object_contains(updated_old_status, self.status): - return {'result': instance, 'changed': False} - instance['status'] = self.merge_status(instance['status'], self.status) - try: - result = resource.status.patch(body=instance, content_type='application/merge-patch+json').to_dict() - except DynamicApiError as exc: - self.fail_json(msg='Failed to replace status: {}'.format(exc), error=exc.summary()) - - return { - 'result': result, - 'changed': True - } - - def merge_status(self, old, new): - old_conditions = old.get('conditions', []) - new_conditions = new.get('conditions', []) - if not (old_conditions and new_conditions): - return new - - merged = copy.deepcopy(old_conditions) - - for condition in new_conditions: - idx = self.get_condition_idx(merged, condition['type']) - if idx is not None: - merged[idx] = condition - else: - merged.append(condition) - new['conditions'] = merged - return new - - def get_condition_idx(self, conditions, name): - for i, condition in enumerate(conditions): - if condition.get('type') == name: - return i - return None - - def object_contains(self, obj, subset): - def dict_is_subset(obj, subset): - return all([mapping.get(type(obj.get(k)), mapping['default'])(obj.get(k), v) for (k, v) in subset.items()]) - - def list_is_subset(obj, subset): - return all(item in obj for item in subset) - - def values_match(obj, subset): - return obj == subset - - mapping = { - dict: dict_is_subset, - list: list_is_subset, - tuple: list_is_subset, - 'default': values_match - } - - return dict_is_subset(obj, subset) - - @property - def argspec(self): - args = copy.deepcopy(COMMON_ARG_SPEC) - args.pop('state') - args.pop('resource_definition') - args.pop('src') - args.update(AUTH_ARG_SPEC) - args.update(STATUS_ARG_SPEC) - return args - - -if __name__ == '__main__': - main() diff --git a/collections/ansible_collections/operator_sdk/util/plugins/modules/requeue_after.py 
b/collections/ansible_collections/operator_sdk/util/plugins/modules/requeue_after.py deleted file mode 100644 index 8485a2288..000000000 --- a/collections/ansible_collections/operator_sdk/util/plugins/modules/requeue_after.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -from __future__ import absolute_import, division, print_function -from ansible.module_utils.basic import AnsibleModule -import re - -__metaclass__ = type - -ANSIBLE_METADATA = {'metadata_version': '1.1', - 'status': ['preview'], - 'supported_by': 'community'} - -DOCUMENTATION = ''' -module: requeue_after -short_description: Tells the controller to re-trigger reconciliation after the specified time -version_added: "0.1" -author: "Venkat Ramaraju (@VenkatRamaraju)" -description: - - Tells the controller to pause reconciliation and resume reconciliation after a specified amounts of time. - If the requeue_reconciliation period is set to 't', reconciliation will occur in intervals of 't'. - -options: - time: - type: str - description: - - A string containing a time period that will be set on the returned JSON object and then used to requeue - reconciliation of an event. Time can be specified in any combination of hours, minutes, and seconds. -''' - -EXAMPLES = ''' -- name: "Running the requeue_after module" - requeue_after: - time: 24h - -- name: "Running the requeue_after module" - requeue_after: - time: 30m - -- name: "Running the requeue_after module" - requeue_after: - time: 5s -''' - -RETURN = ''' -result: - description: - - If a requeue period was specified under 'time' when calling the requeue_after period from the module, - this module will return a JSON object. - returned: success - contains: - _ansible_no_log: - description: This is a boolean. If it’s True then the playbook specified no_log (in a task’s parameters or as - a play parameter). - returned: success - type: boolean - changed: - description: A boolean indicating if the task had to make changes. 
- returned: success - type: boolean - invocation: - description: Information on how the module was invoked. - returned: success - type: map - period: - description: A time value read in from a playbook that specifies how long the reconciliation should be - requeued after. - returned: success - type: str -''' - - -def requeue_after(): - module = AnsibleModule(argument_spec={ - 'time': {'type': 'str', 'required': True}, - }) - - if not re.match("^[hms0-9]*$", module.params['time']): - module.fail_json(msg="invalid time input") - - result = dict( - period=module.params['time'], - ) - - module.exit_json(**result) - - -def main(): - requeue_after() - - -if __name__ == '__main__': - main() From 55b89e297067f88e878d0672edcf69c3a762bc68 Mon Sep 17 00:00:00 2001 From: Marihan Girgis <102027102+mgirgisf@users.noreply.github.com> Date: Wed, 17 Jan 2024 19:35:28 +0100 Subject: [PATCH 34/70] Add nightly_bundle jobs to periodic pipeline (#564) The nightly_bundle jobs will run once a day --- .zuul.yaml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 732fa94fc..a418c6506 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -198,12 +198,6 @@ STF CRC jobs that build and deploy STF github-check: jobs: - - stf-crc-ocp_412-nightly_bundles: - voting: false - - stf-crc-ocp_413-nightly_bundles: - voting: false - - stf-crc-ocp_414-nightly_bundles: - voting: false - stf-crc-ocp_412-local_build - stf-crc-ocp_413-local_build - stf-crc-ocp_414-local_build @@ -215,3 +209,8 @@ name: infrawatch/service-telemetry-operator templates: - stf-crc-jobs + periodic: + jobs: + - stf-crc-ocp_412-nightly_bundles + - stf-crc-ocp_413-nightly_bundles + - stf-crc-ocp_414-nightly_bundles From 35476e16338337deae706977f497b2decd74e00b Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 19 Jan 2024 14:48:15 -0500 Subject: [PATCH 35/70] Remove hard-coded Prometheus version in template (#565) Remove the hard-coded Prometheus version in the Prometheus template when 
using observabilityStrategy use_redhat, which uses Cluster Observability Operator to manage the Prometheus instance requests. Previously this value was hard-coded to prevent a potential rollback when moving from Community Prometheus Operator to Cluster Observability Operator. Resolves: JIRA#OSPRH-2140 --- roles/servicetelemetry/templates/manifest_prometheus.j2 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index e9eb63786..66f2d5a8d 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -7,7 +7,11 @@ metadata: name: '{{ ansible_operator_meta.name }}' namespace: '{{ ansible_operator_meta.namespace }}' spec: +{% if observability_strategy != "use_community" %} + version: null +{% else %} version: v2.43.0 +{% endif %} replicas: {{ servicetelemetry_vars.backends.metrics.prometheus.deployment_size }} ruleSelector: {} securityContext: {} From f900181cebc3198570036c0c7d624782aa0d81f5 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Mon, 29 Jan 2024 18:37:32 +0100 Subject: [PATCH 36/70] Set features.operators.openshift.io/disconnected to True (#570) STF can now be deployed in disconnected mode. This change updates the features.operators.openshift.io/disconnected annotation to reflect this. 
--- .../service-telemetry-operator.clusterserviceversion.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 610a279a4..025db7f7e 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -156,7 +156,7 @@ metadata: features.operators.openshift.io/cnf: "false" features.operators.openshift.io/cni: "false" features.operators.openshift.io/csi: "false" - features.operators.openshift.io/disconnected: "false" + features.operators.openshift.io/disconnected: "true" features.operators.openshift.io/fips-compliant: "false" features.operators.openshift.io/proxy-aware: "false" features.operators.openshift.io/tls-profiles: "false" From 164f0ca5d6a2b262efcc0ea2265ffc2ca1cd96e1 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Thu, 1 Feb 2024 11:31:28 +0000 Subject: [PATCH 37/70] [stf-run-ci] Update validation check for bundle URLs (#571) * [stf-run-ci] Update validation check for bundle URLs An empty string passed as the bundle URL will pass the existing test of "is defined" and "is not None" and still be invalid. 
The validation for the bundle URL can be done in one check per var: * If the var is undefined, it becomes "", and the check fails, because of length * If the var is None, there's an error because None does not have a length * If the var is an empty string, the check fails because of the length This simplifies the check and improves readability --- build/stf-run-ci/tasks/setup_stf_from_bundles.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index b4883c7c5..f549fc209 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -90,10 +90,10 @@ - name: "Ensure that the bundle paths are set." ansible.builtin.assert: that: - - '__smart_gateway_bundle_image_path is defined and __smart_gateway_bundle_image_path != None' - - '__service_telemetry_bundle_image_path is defined and __service_telemetry_bundle_image_path != None' + - '__smart_gateway_bundle_image_path | default("") | length > 0' + - '__service_telemetry_bindle_image_path | default("") | length > 0' fail_msg: "Bundle path(s) not set. __smart_gateway_bundle_image_path is '{{ __smart_gateway_bundle_image_path }}' and __service_telemetry_bundle_image_path is '{{ __service_telemetry_bundle_image_path }}'. Both values need to be set." - success_msg: "Bundle paths are defined and not None" + success_msg: "Bundle paths are defined, are not None and have a non-zero-length" - name: Deploy SGO via OLM bundle ansible.builtin.shell: From 28183df3ad5200351d495b7f823a6d0d15dd0897 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Sun, 11 Feb 2024 00:35:20 -0500 Subject: [PATCH 38/70] Prefer Grafana 9 workload (#575) Prefer usage of Grafana 9 container image from RHCC. Grafana 7 is EOL upstream and receives no security support. Prefer use of Grafana 9 which is still supported. 
--- deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml | 2 +- .../service-telemetry-operator.clusterserviceversion.yaml | 2 +- roles/servicetelemetry/defaults/main.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index dc9ddda59..468347c9a 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -81,7 +81,7 @@ spec: grafana: ingressEnabled: true disableSignoutMenu: false - baseImage: registry.redhat.io/rhel8/grafana:7 + baseImage: registry.redhat.io/rhel8/grafana:9 dashboards: enabled: true transports: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 025db7f7e..00cec8767 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -117,7 +117,7 @@ metadata: "graphing": { "enabled": false, "grafana": { - "baseImage": "registry.redhat.io/rhel8/grafana:7", + "baseImage": "registry.redhat.io/rhel8/grafana:9", "dashboards": { "enabled": true }, diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index fc1cd8a91..e1a6c395e 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -84,7 +84,7 @@ servicetelemetry_defaults: grafana: ingress_enabled: true disable_signout_menu: false - base_image: registry.redhat.io/rhel8/grafana:7 + base_image: registry.redhat.io/rhel8/grafana:9 dashboards: enabled: true From dd6e1b39b2759d8e276a6b7e55b95fb680bcf868 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: 
Thu, 15 Feb 2024 14:04:20 +0000 Subject: [PATCH 39/70] Fix typo in check for setup_from_bundles (#576) --- build/stf-run-ci/tasks/setup_stf_from_bundles.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index f549fc209..cdb09be85 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -91,7 +91,7 @@ ansible.builtin.assert: that: - '__smart_gateway_bundle_image_path | default("") | length > 0' - - '__service_telemetry_bindle_image_path | default("") | length > 0' + - '__service_telemetry_bundle_image_path | default("") | length > 0' fail_msg: "Bundle path(s) not set. __smart_gateway_bundle_image_path is '{{ __smart_gateway_bundle_image_path }}' and __service_telemetry_bundle_image_path is '{{ __service_telemetry_bundle_image_path }}'. Both values need to be set." success_msg: "Bundle paths are defined, are not None and have a non-zero-length" From 85a32c0fc1eea49fb8c4882d8d122c42f23ee37e Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 16 Feb 2024 12:13:09 -0500 Subject: [PATCH 40/70] Add related images for Prometheus and Alertmanager (#578) * Add related images for Prometheus and Alertmanager Add support for RELATED_IMAGES for other artifacts we need in a default deployment when installing via disconnected. These images are required to be specifically called out in disconnected environments as the Prometheus and Alertmanager controllers in the Cluster Observability Operator (COO) don't use the RELATED_IMAGE environment variables specified in the downstream CSV for COO, so we need to specify them ourselves and then deploy Prometheus and Alertmanager using the spec.image reference in the Custom Resource. 
Closes: JIRA#STF-1713 * Place related images in correct deploy manifest * Use grafana_path_image by default Remove the baseImage configuration for Grafana from the example CR so that in the OLM UI (OCP console) the baseImage isn't a populated value by default. Use the value from grafana_image_path as the preferred default as it will be populated from the RELATED_IMAGES_GRAFANA_IMAGE environment variable, which then itself has a default of the base_image value in defaults/main.yml. Only if the administrator sets a specific value for baseImage should we deploy with that image. We do this to allow for a better disconnected environment experience out of the box. * Drop setting Grafana related images For upstream, there isn't a good option that is on quay.io (which doesn't have quotas), and since it's out of scope for this effort, I'm just going to drop it and revert back to how it worked before. * Revert removal of Grafana default image path * Fix issue with alertmanager related image tag var --- build/generate_bundle.sh | 2 +- build/metadata.sh | 4 ++++ .../service-telemetry-operator.clusterserviceversion.yaml | 4 ++++ deploy/operator.yaml | 4 ++++ roles/servicetelemetry/tasks/pre.yml | 4 +++- roles/servicetelemetry/templates/manifest_alertmanager.j2 | 3 +++ roles/servicetelemetry/templates/manifest_prometheus.j2 | 1 + 7 files changed, 20 insertions(+), 2 deletions(-) diff --git a/build/generate_bundle.sh b/build/generate_bundle.sh index 12eea7c2d..3e77bab2d 100755 --- a/build/generate_bundle.sh +++ b/build/generate_bundle.sh @@ -29,7 +29,7 @@ generate_dockerfile() { } generate_bundle() { - 
REPLACE_REGEX="s#<>#${CREATED_DATE}#g;s#<>#${OPERATOR_IMAGE}#g;s#<>#${OPERATOR_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY_TAG}#g;s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#1.99.0#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${OPERATOR_DOCUMENTATION_URL}#g;s#<>#${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND}#g" + REPLACE_REGEX="s#<>#${CREATED_DATE}#g;s#<>#${OPERATOR_IMAGE}#g;s#<>#${OPERATOR_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_TAG}#g;s#<>#${RELATED_IMAGE_ALERTMANAGER}#g;s#<>#${RELATED_IMAGE_ALERTMANAGER_TAG}#g;s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#1.99.0#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${OPERATOR_DOCUMENTATION_URL}#g;s#<>#${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND}#g" pushd "${REL}/../" > /dev/null 2>&1 ${OPERATOR_SDK} generate bundle --verbose --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" >> ${LOGFILE} 2>&1 diff --git a/build/metadata.sh b/build/metadata.sh index 759892400..0bc917288 100644 --- a/build/metadata.sh +++ b/build/metadata.sh @@ -21,6 +21,10 @@ RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP:-q RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG:-latest} RELATED_IMAGE_OAUTH_PROXY=${RELATED_IMAGE_OAUTH_PROXY:-quay.io/openshift/origin-oauth-proxy} RELATED_IMAGE_OAUTH_PROXY_TAG=${RELATED_IMAGE_OAUTH_PROXY_TAG:-latest} +RELATED_IMAGE_PROMETHEUS=${RELATED_IMAGE_PROMETHEUS:-quay.io/prometheus/prometheus} +RELATED_IMAGE_PROMETHEUS_TAG=${RELATED_IMAGE_PROMETHEUS_TAG:-latest} +RELATED_IMAGE_ALERTMANAGER=${RELATED_IMAGE_ALERTMANAGER:-quay.io/prometheus/alertmanager} 
+RELATED_IMAGE_ALERTMANAGER_TAG=${RELATED_IMAGE_ALERTMANAGER_TAG:-latest} BUNDLE_PATH=${BUNDLE_PATH:-deploy/olm-catalog/service-telemetry-operator} BUNDLE_CHANNELS=${BUNDLE_CHANNELS:-unstable} BUNDLE_DEFAULT_CHANNEL=${BUNDLE_DEFAULT_CHANNEL:-unstable} diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 00cec8767..0392f58a3 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -295,6 +295,10 @@ spec: value: <>:<> - name: RELATED_IMAGE_OAUTH_PROXY_IMAGE value: <>:<> + - name: RELATED_IMAGE_PROMETHEUS_IMAGE + value: <>:<> + - name: RELATED_IMAGE_ALERTMANAGER + value: <>:<> image: <>:<> imagePullPolicy: Always name: operator diff --git a/deploy/operator.yaml b/deploy/operator.yaml index c56c11daa..b82ea4cf5 100644 --- a/deploy/operator.yaml +++ b/deploy/operator.yaml @@ -37,6 +37,10 @@ spec: value: <>:<> - name: RELATED_IMAGE_OAUTH_PROXY_IMAGE value: <>:<> + - name: RELATED_IMAGE_PROMETHEUS_IMAGE + value: <>:<> + - name: RELATED_IMAGE_ALERTMANAGER + value: <>:<> volumes: - name: runner emptyDir: {} diff --git a/roles/servicetelemetry/tasks/pre.yml b/roles/servicetelemetry/tasks/pre.yml index 38477b02b..6b771dec0 100644 --- a/roles/servicetelemetry/tasks/pre.yml +++ b/roles/servicetelemetry/tasks/pre.yml @@ -33,8 +33,10 @@ - name: "Set supporting container image paths" set_fact: - prometheus_webhook_snmp_container_image_path: "{{ lookup('env', 'RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_IMAGE') | default('quay.io/infrawatch/prometheus-webhook-snmp:latest', true) }}" # noqa 204 + alertmanager_image_path: "{{ lookup('env', 'RELATED_IMAGE_ALERTMANAGER_IMAGE') | 
default('quay.io/prometheus/alertmanager:latest', true) }}" # noqa 204 oauth_proxy_image: "{{ lookup('env', 'RELATED_IMAGE_OAUTH_PROXY_IMAGE') | default('quay.io/openshift/origin-oauth-proxy:latest', true) }}" # noqa 204 + prometheus_webhook_snmp_container_image_path: "{{ lookup('env', 'RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_IMAGE') | default('quay.io/infrawatch/prometheus-webhook-snmp:latest', true) }}" # noqa 204 + prometheus_image_path: "{{ lookup('env', 'RELATED_IMAGE_PROMETHEUS_IMAGE') | default('quay.io/prometheus/prometheus:latest', true) }}" # noqa 204 - name: Adjust defaults when highAvailability.enabled is true block: diff --git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 5b53cc592..4e2287fe9 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -7,6 +7,9 @@ metadata: name: '{{ ansible_operator_meta.name }}' namespace: '{{ ansible_operator_meta.namespace }}' spec: +{% if observability_strategy != "use_community" %} + image: {{ alertmanager_image_path }} +{% endif %} replicas: {{ servicetelemetry_vars.alerting.alertmanager.deployment_size }} serviceAccountName: alertmanager-stf serviceMonitorSelector: diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index 66f2d5a8d..d9610b056 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -9,6 +9,7 @@ metadata: spec: {% if observability_strategy != "use_community" %} version: null + image: {{ prometheus_image_path }} {% else %} version: v2.43.0 {% endif %} From c9c40654e6756cd709c53f642f6bfac634164442 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 22:23:27 +0000 Subject: [PATCH 41/70] Bump ansible-core from 2.12.10 to 
2.15.9 in /build/stf-run-ci (#573) Bumps [ansible-core](https://github.com/ansible/ansible) from 2.12.10 to 2.15.9. - [Release notes](https://github.com/ansible/ansible/releases) - [Commits](https://github.com/ansible/ansible/compare/v2.12.10...v2.15.9) --- updated-dependencies: - dependency-name: ansible-core dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Leif Madsen --- build/stf-run-ci/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/stf-run-ci/requirements.txt b/build/stf-run-ci/requirements.txt index 70c742e0e..607e7d87c 100644 --- a/build/stf-run-ci/requirements.txt +++ b/build/stf-run-ci/requirements.txt @@ -5,4 +5,4 @@ requests_oauthlib==1.3.0 oauthlib==3.2.2 kubernetes==24.2.0 openshift==0.13.1 -ansible-core==2.12.10 +ansible-core==2.15.9 From 6568039b83e0caaa557af0a462407077e3c1176c Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Mon, 26 Feb 2024 10:20:36 -0500 Subject: [PATCH 42/70] Fix env var naming issue (#581) * Fix env var naming issue Fix an environment variable naming issue in the CSV for STO when attempting to install alertmanager disconnected. The env var being looked up should have a postfix of _IMAGE to match the other env vars. Found in testing by vkmc. * Run operator-sdk generate bundle Run the generate bundle command to make sure everything is in sync. Fixes CI found issue in previous commit. 
--- .../service-telemetry-operator.clusterserviceversion.yaml | 2 +- deploy/operator.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 0392f58a3..4ec0f7f5c 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -297,7 +297,7 @@ spec: value: <>:<> - name: RELATED_IMAGE_PROMETHEUS_IMAGE value: <>:<> - - name: RELATED_IMAGE_ALERTMANAGER + - name: RELATED_IMAGE_ALERTMANAGER_IMAGE value: <>:<> image: <>:<> imagePullPolicy: Always diff --git a/deploy/operator.yaml b/deploy/operator.yaml index b82ea4cf5..a6a940791 100644 --- a/deploy/operator.yaml +++ b/deploy/operator.yaml @@ -39,7 +39,7 @@ spec: value: <>:<> - name: RELATED_IMAGE_PROMETHEUS_IMAGE value: <>:<> - - name: RELATED_IMAGE_ALERTMANAGER + - name: RELATED_IMAGE_ALERTMANAGER_IMAGE value: <>:<> volumes: - name: runner From 853e4574b5d3c93faa2b9001f8bf18f0a9eddc9e Mon Sep 17 00:00:00 2001 From: Alex Yefimov <126113326+ayefimov-1@users.noreply.github.com> Date: Thu, 29 Feb 2024 12:35:55 -0500 Subject: [PATCH 43/70] Update smoketest.sh (#583) Changed default-alertmanager URL to v2 --- tests/smoketest/smoketest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 177157fe3..b4a8db29f 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -90,7 +90,7 @@ fi # create the alert using startsAt which in theory may cause trigger to be faster echo "*** [INFO] Create alert" -oc delete pod -l run=curl ; oc run curl --wait --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c 
"curl -v -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"status\":\"firing\",\"labels\":{\"alertname\":\"smoketest\",\"severity\":\"warning\"},\"startsAt\":\"$(date --rfc-3339=seconds | sed 's/ /T/')\"}]' https://default-alertmanager-proxy:9095/api/v1/alerts" +oc delete pod -l run=curl ; oc run curl --wait --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -v -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"status\":\"firing\",\"labels\":{\"alertname\":\"smoketest\",\"severity\":\"warning\"},\"startsAt\":\"$(date --rfc-3339=seconds | sed 's/ /T/')\"}]' https://default-alertmanager-proxy:9095/api/v2/alerts" oc wait --for=jsonpath='{.status.phase}'=Succeeded pod/curl oc logs curl From 6941ce0608151e01e7e6b9fd11a27e855ebf9612 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Wed, 20 Mar 2024 11:37:35 +0100 Subject: [PATCH 44/70] Skip installing deps when installing from bundles (#586) Dependencies for STF are being handled by OLM since STF 1.5.3. Related change https://github.com/infrawatch/service-telemetry-operator/pull/423 Except for Prometheus/COO and cert-manager (since those are scoped cluster-wide and OLM cannot automatically resolve conflicts) Related change https://github.com/infrawatch/service-telemetry-operator/pull/472 So there is no need to pre subscribe to those when deploying for index or bundles. It is required, though, for local installs. The only operator that was on this condition is the AMQ Interconnect operator. So this change updates the check to reflect this. 
--- build/stf-run-ci/tasks/setup_base.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index bb6667184..1856bd34d 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -30,7 +30,7 @@ - "{{ namespace }}" # documented procedure: https://infrawatch.github.io/documentation/#deploying-observability-operator_assembly-installing-the-core-components-of-stf -- name: Subscribe to Red Hat Obervability Operator +- name: Subscribe to Red Hat Observability Operator kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 @@ -111,7 +111,7 @@ sourceNamespace: openshift-marketplace # installed by properties.yaml definition as of STF 1.5.3 -- when: not __deploy_from_index_enabled | bool +- when: __local_build_enabled | bool block: - name: Subscribe to AMQ Interconnect Operator kubernetes.core.k8s: From ceebd9e9221ac7e0d9dfe562ec97d10d227e9754 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 22 Mar 2024 11:52:11 +0000 Subject: [PATCH 45/70] [zuul] Remove OCP 4.13 jobs (#587) OCP 4.13 jobs were added as a way to test ocp-latest before 4.14 was released --- .zuul.yaml | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index a418c6506..636f0cf44 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -7,14 +7,6 @@ - name: crc label: coreos-crc-extracted-2-19-0-xxl -- nodeset: - name: stf-crc_extracted-ocp413 - nodes: - - name: controller - label: cloud-centos-9-stream-tripleo-vexxhost - - name: crc - label: coreos-crc-extracted-2-28-0-xxl - - nodeset: name: stf-crc_extracted-ocp414 nodes: @@ -136,13 +128,6 @@ Deploy STF using the nightly bundles on OCP 4.12 nodeset: stf-crc_extracted-ocp412 -- job: - name: stf-crc-ocp_413-nightly_bundles - parent: stf-crc-nightly_bundles - description: | - Deploy STF using the nightly bundles on OCP 4.13 - nodeset: stf-crc_extracted-ocp413 - - 
job: name: stf-crc-ocp_414-nightly_bundles parent: stf-crc-nightly_bundles @@ -157,13 +142,6 @@ Build images locally and deploy STF on OCP 4.12 nodeset: stf-crc_extracted-ocp412 -- job: - name: stf-crc-ocp_413-local_build - parent: stf-crc-local_build - description: | - Build images locally and deploy STF on OCP 4.13 - nodeset: stf-crc_extracted-ocp413 - - job: name: stf-crc-ocp_414-local_build parent: stf-crc-local_build @@ -178,13 +156,6 @@ Build STF locally and deploy from index on OCP 4.12 nodeset: stf-crc_extracted-ocp412 -- job: - name: stf-crc-ocp_413-local_build-index_deploy - parent: stf-crc-local_build-index_deploy - description: | - Build STF locally and deploy from index on OCP 4.13 - nodeset: stf-crc_extracted-ocp413 - - job: name: stf-crc-ocp_414-local_build-index_deploy parent: stf-crc-local_build-index_deploy @@ -199,10 +170,8 @@ github-check: jobs: - stf-crc-ocp_412-local_build - - stf-crc-ocp_413-local_build - stf-crc-ocp_414-local_build - stf-crc-ocp_412-local_build-index_deploy - - stf-crc-ocp_413-local_build-index_deploy - stf-crc-ocp_414-local_build-index_deploy - project: @@ -212,5 +181,4 @@ periodic: jobs: - stf-crc-ocp_412-nightly_bundles - - stf-crc-ocp_413-nightly_bundles - stf-crc-ocp_414-nightly_bundles From c85d9a4d83f6aa4b1da7f582a23f3b28834e1276 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 5 Apr 2024 11:37:53 +0100 Subject: [PATCH 46/70] [Jenkins] Remove jenkins config and deployment files (#585) We are now using Zuul to replace Jenkins --- .jenkins/Dockerfile | 33 ---- .jenkins/README.md | 55 ------- .jenkins/agent/Dockerfile | 14 -- .jenkins/agent/README.md | 29 ---- .jenkins/agent/install-sdk.sh | 7 - .jenkins/deploy/casc-configmap.yaml | 144 ------------------ .jenkins/deploy/deploy-smee.yaml | 49 ------ .jenkins/deploy/jenkins-deploy.yaml | 43 ------ .jenkins/deploy/sa-rbac.yaml | 21 --- .jenkins/deploy/service-route.yaml | 35 ----- Jenkinsfile | 228 ---------------------------- 11 files changed, 658 deletions(-) delete 
mode 100644 .jenkins/Dockerfile delete mode 100644 .jenkins/README.md delete mode 100644 .jenkins/agent/Dockerfile delete mode 100644 .jenkins/agent/README.md delete mode 100755 .jenkins/agent/install-sdk.sh delete mode 100644 .jenkins/deploy/casc-configmap.yaml delete mode 100644 .jenkins/deploy/deploy-smee.yaml delete mode 100644 .jenkins/deploy/jenkins-deploy.yaml delete mode 100644 .jenkins/deploy/sa-rbac.yaml delete mode 100644 .jenkins/deploy/service-route.yaml delete mode 100644 Jenkinsfile diff --git a/.jenkins/Dockerfile b/.jenkins/Dockerfile deleted file mode 100644 index 9930d6c57..000000000 --- a/.jenkins/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -FROM jenkins/jenkins:latest -RUN jenkins-plugin-cli -p ant:latest \ - antisamy-markup-formatter:latest \ - authorize-project:latest \ - build-timeout:latest \ - blueocean:latest \ - cloudbees-folder:latest \ - configuration-as-code:latest \ - credentials-binding:latest \ - email-ext:latest \ - git:latest \ - github-branch-source:latest \ - github-scm-trait-notification-context:latest \ - gradle:latest \ - ldap:latest \ - mailer:latest \ - matrix-auth:latest \ - pam-auth:latest \ - pipeline-github-lib:latest \ - pipeline-stage-view:latest \ - role-strategy:latest \ - ssh-slaves:latest \ - timestamper:latest \ - workflow-aggregator:latest \ - ws-cleanup:latest \ - kubernetes-credentials-provider:latest \ - kubernetes:latest \ - openshift-client:latest \ - openshift-login:latest \ - job-dsl:latest \ - ansible:latest \ - ansicolor:latest \ - github-pr-comment-build:latest diff --git a/.jenkins/README.md b/.jenkins/README.md deleted file mode 100644 index f39f1b771..000000000 --- a/.jenkins/README.md +++ /dev/null @@ -1,55 +0,0 @@ -# Jenkins artifacts for STF -Use these to run a jenkins in Openshift that will do CI tests on STF - -## Start a new project for CI -``` -oc new-project ci -``` - -## Build the Jenkins master image -``` -oc new-build --binary=true --name=jenkins -oc start-build jenkins --from-file 
./Dockerfile --follow=true -``` - -## Build the Jenkins agent image -NOTE: The operator-sdk version in the Dockerfile is the latest version that will work with the STO build scripts -``` -cd agent -oc new-build --binary=true --name=jenkins-agent -oc start-build jenkins-agent --from-file ./Dockerfile --follow=true -cd .. -``` - -## Set your local secret stuff -You'll need to get/generate a GitHub App ID & Privkey and put the key in gh-app-privkey.pem -``` -GH_APPID= -GH_ORG= - -oc create secret generic github-app-key --from-literal=owner=${GH_ORG} --from-literal=appID=${GH_APPID} --from-literal=privateKey="$(cat ./gh-app-privkey.pem)" -oc annotate secret/github-app-key jenkins.io/credentials-description="gh-app-key" -oc label secret/github-app-key jenkins.io/credentials-type=gitHubApp -``` - -## Deploy all the things -``` -oc apply -f deploy/service-route.yaml - -export SMEE_CHANNEL= #(just the slug, not the whole URL) -export GH_ORG= -export JENKINS_URL=$(oc get route jenkins -ojsonpath='{.spec.host}') -# This is for labelling the status that is returned to github -export OCP_VERSION= # e.g. 4.14 - -for f in deploy/*; do - envsubst < "${f}" | oc apply -f - -done -``` - -## Access the console and load the jobs -`xdg-open https://$JENKINS_URL` - -The Jenkins master pod is configured to use OpenShift SSO. To login as an admin, use the host cluster's "kubeadmin" credentials. - -After logging in, navigate to your organization from the home panel and press the "Scan Organization Now" button. This will discover all projects in the organization that have valid Jenkinsfiles in them. 
diff --git a/.jenkins/agent/Dockerfile b/.jenkins/agent/Dockerfile deleted file mode 100644 index ab7f97cb3..000000000 --- a/.jenkins/agent/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM quay.io/openshift/origin-jenkins-agent-base:latest - -# pass --build-arg OC_CLIENT_VERSION= to build stage to change client version -ARG OC_CLIENT_VERSION="4.13" - -RUN curl -LO "https://github.com/operator-framework/operator-sdk/releases/download/v0.19.4/operator-sdk-v0.19.4-x86_64-linux-gnu" && \ - chmod +x operator-sdk-v0.19.4-x86_64-linux-gnu && mv operator-sdk-v0.19.4-x86_64-linux-gnu /usr/local/bin/operator-sdk -RUN dnf install -y ansible golang python38 && \ - dnf groupinstall -y "Development Tools" -y && \ - alternatives --set python /usr/bin/python3.8 && \ - python -m pip install openshift kubernetes "ansible-core~=2.12" && \ - ansible-galaxy collection install -f 'kubernetes.core:>=2.2.0' community.general -RUN curl -LO "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest-$OC_CLIENT_VERSION/openshift-client-linux.tar.gz" && \ - tar -xv -C /usr/local/bin -f openshift-client-linux.tar.gz diff --git a/.jenkins/agent/README.md b/.jenkins/agent/README.md deleted file mode 100644 index dbef269fc..000000000 --- a/.jenkins/agent/README.md +++ /dev/null @@ -1,29 +0,0 @@ -The Jenkins agent pod is used to run all Jenkins pipelines for the Service Telemetry Framework. - -# Build in OpenShift - -```bash -oc new-build --binary=true --name=jenkins-agent -oc start-build jenkins-agent --from-dir . -``` - -You can override the default `oc` client version being installed by overriding the default argument `OC_CLIENT_VERSION` from the `Dockerfile`. - -```bash -oc new-build --build-arg OC_CLIENT_VERSION=4.10 --binary=true --name=jenkins-agent -oc start-build jenkins-agent --from-dir . 
-``` - -Builds will be available in-cluster at the address: `image-registry.openshift-image-registry.svc:5000//jenkins-agent:latest` - -# Build with Podman/Docker - -```bash -podman build -t jenkins-agent:latest . -``` - -You can override the default `oc` client version being installed by overriding the default argument `OC_CLIENT_VERSION` from the `Dockerfile`. - -```bash -podman build --build-arg OC_CLIENT_VERSION=4.10 -t jenkins-agent:latest . -``` diff --git a/.jenkins/agent/install-sdk.sh b/.jenkins/agent/install-sdk.sh deleted file mode 100755 index fe4813198..000000000 --- a/.jenkins/agent/install-sdk.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# NOTE: any version of operator-sdk later than v0.19.4 is incompatable with the build scripts - -SDK_FULL_NAME="operator-sdk-v0.19.4-x86_64-linux-gnu" -curl -LO "https://github.com/operator-framework/operator-sdk/releases/download/v0.19.4/$SDK_FULL_NAME" -chmod +x "$SDK_FULL_NAME" && mv "$SDK_FULL_NAME" /usr/local/bin/operator-sdk diff --git a/.jenkins/deploy/casc-configmap.yaml b/.jenkins/deploy/casc-configmap.yaml deleted file mode 100644 index ab372d230..000000000 --- a/.jenkins/deploy/casc-configmap.yaml +++ /dev/null @@ -1,144 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: jenkins-casc - namespace: ci -data: - casc.yaml: | - unclassified: - location: - url: "https://${JENKINS_URL}" - jenkins: - numExecutors: 1 - remotingSecurity: - enabled: true - - # permissions for anonymous - revisit is moved to public internet rather than behind firewall - authorizationStrategy: - globalMatrix: - permissions: - - "Job/Build:anonymous" - - "Job/Discover:anonymous" - - "Job/Read:anonymous" - - "Job/Cancel:anonymous" - - "Overall/Read:anonymous" - clouds: - - kubernetes: - name: kubernetes - containerCapStr: 100 - containerCap: 1 - jenkinsUrl: "http://jenkins:8080/" - templates: - - name: ocp-agent - label: ocp-agent - namespace: ci - serviceAccount: jenkins - containers: - - name: exec - workingDir: 
"/home/jenkins/agent" - image: image-registry.openshift-image-registry.svc:5000/ci/jenkins-agent:latest - alwaysPullImage: true - command: sleep - args: infinity - - security: - queueItemAuthenticator: - authenticators: - - global: - strategy: triggeringUsersAuthorizationStrategy - - # organization config - jobs: - - script: > - organizationFolder('${GH_ORG}') { - description("${GH_ORG} GitHub Organization") - displayName('${GH_ORG}') - - organizations { - github { - apiUri("https://api.github.com") - repoOwner("${GH_ORG}") - credentialsId("github-app-key") - } - } - configure { node -> - def traits = node / navigators / 'org.jenkinsci.plugins.github__branch__source.GitHubSCMNavigator' / traits - - // Discover branches - traits << 'org.jenkinsci.plugins.github__branch__source.BranchDiscoveryTrait' { - strategyId(1) - // Values - // 1 : Exclude branches that are also filed as PRs - // 2 : Only branches that are also filed as PRs - // 3 : All branches - } - // Discover pull requests from origin - traits << 'org.jenkinsci.plugins.github__branch__source.OriginPullRequestDiscoveryTrait' { - strategyId(1) - // Values - // 1 : Merging the pull request with the current target branch revision - // 2 : The current pull request revision - // 3 : Both the current pull request revision and the pull request merged with the current target branch revision - } - // Discover pull requests from forks - traits << 'org.jenkinsci.plugins.github__branch__source.ForkPullRequestDiscoveryTrait' { - strategyId(1) - // Values - // 1 : Merging the pull request with the current target branch revision - // 2 : The current pull request revision - // 3 : Both the current pull request revision and the pull request merged with the current target branch revision - - trustID('1') - // Values - // 0 : Everyone - // 1 : Forks in the same account - // 2 : Nobody - } - // Custom Github Notification Context; https://github.com/jenkinsci/github-scm-trait-notification-context-plugin - traits << 
'org.jenkinsci.plugins.githubScmTraitNotificationContext.NotificationContextTrait' { - contextLabel("continuous-integration/jenkins/ocp-${OCP_VERSION}") - typeSuffix(true) - } - } - - // "Project Recognizers" - projectFactories { - workflowMultiBranchProjectFactory { - scriptPath 'Jenkinsfile' - } - } - - // "Orphaned Item Strategy" - orphanedItemStrategy { - discardOldItems { - daysToKeep(-1) - numToKeep(-1) - } - } - - // "Scan Organization Folder Triggers" : 1 day - // We need to configure this stuff by hand because JobDSL only allow 'periodic(int min)' for now - configure { node -> - node / triggers / 'com.cloudbees.hudson.plugins.folder.computed.PeriodicFolderTrigger' { - spec('H H * * *') - interval(86400000) - } - } - - // set webhook triggers and suppress automatic issue triggering - // for now, we want trigger - def commentTriggerPhrase = '^test$|^retest$|^recheck$' - configure { node -> - node / strategy(class: 'jenkins.branch.DefaultBranchPropertyStrategy') { - properties(class: 'java.util.Arrays$${ESCAPEDOLLAR}ArrayList') { - def s = a(class: 'jenkins.branch.BranchProperty-array') - s / 'jenkins.branch.NoTriggerBranchProperty' {} - s / 'com.adobe.jenkins.github__pr__comment__build.TriggerPRCommentBranchProperty'(plugin: 'github-pr-comment-build@2.3') { - commentBody(commentTriggerPhrase) - } - // s / 'com.adobe.jenkins.github__pr__comment__build.TriggerPRCommentBranchProperty' - // s / 'com.adobe.jenkins.github__pr__comment__build.TriggerPRReviewBranchProperty'(plugin: 'github-pr-comment-build@2.3') - } - } - } - } diff --git a/.jenkins/deploy/deploy-smee.yaml b/.jenkins/deploy/deploy-smee.yaml deleted file mode 100644 index 4afa9531a..000000000 --- a/.jenkins/deploy/deploy-smee.yaml +++ /dev/null @@ -1,49 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: smee-client -spec: - progressDeadlineSeconds: 600 - replicas: 1 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: smee-client - strategy: - rollingUpdate: - maxSurge: 25% - 
maxUnavailable: 25% - type: RollingUpdate - template: - metadata: - creationTimestamp: null - labels: - name: smee-client - spec: - containers: - - args: - - --url - - "https://smee.io/${SMEE_CHANNEL}" - - --target - - "http://jenkins:8080/github-webhook/" - env: - - name: WATCH_NAMESPACE - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - - name: POD_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.name - image: quay.io/infrawatch/smee:latest - imagePullPolicy: Always - name: smee-client - resources: {} - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - dnsPolicy: ClusterFirst - restartPolicy: Always - schedulerName: default-scheduler - terminationGracePeriodSeconds: 30 diff --git a/.jenkins/deploy/jenkins-deploy.yaml b/.jenkins/deploy/jenkins-deploy.yaml deleted file mode 100644 index b2cbb01d0..000000000 --- a/.jenkins/deploy/jenkins-deploy.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: jenkins - namespace: ci -spec: - selector: - matchLabels: - app: jenkins - replicas: 1 - template: - metadata: - labels: - app: jenkins - spec: - serviceAccountName: jenkins - containers: - - name: jenkins - image: >- - image-registry.openshift-image-registry.svc:5000/ci/jenkins:latest - ports: - - name: http-port - containerPort: 8080 - - name: jnlp-port - containerPort: 50000 - volumeMounts: - - name: jenkins-home - mountPath: /var/jenkins_home - - name: jenkins-casc - mountPath: /var/jenkins_config - env: - - name: OPENSHIFT_ENABLE_OAUTH - value: "true" - - name: JAVA_OPTS - value: -Djenkins.install.runSetupWizard=false - - name: CASC_JENKINS_CONFIG - value: /var/jenkins_config/casc.yaml - volumes: - - name: jenkins-home - emptyDir: {} - - name: jenkins-casc - configMap: - name: jenkins-casc diff --git a/.jenkins/deploy/sa-rbac.yaml b/.jenkins/deploy/sa-rbac.yaml deleted file mode 100644 index a18a5fabf..000000000 --- a/.jenkins/deploy/sa-rbac.yaml +++ 
/dev/null @@ -1,21 +0,0 @@ ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: jenkins - annotations: - serviceaccounts.openshift.io/oauth-redirectreference.jenkins: "{\"kind\":\"OAuthRedirectReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Route\",\"name\":\"jenkins\"}}" ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: admin-access - namespace: default -roleRef: - kind: ClusterRole - name: cluster-admin - apiGroup: rbac.authorization.k8s.io -subjects: - - kind: ServiceAccount - name: jenkins - namespace: ci diff --git a/.jenkins/deploy/service-route.yaml b/.jenkins/deploy/service-route.yaml deleted file mode 100644 index d8fe3e53b..000000000 --- a/.jenkins/deploy/service-route.yaml +++ /dev/null @@ -1,35 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: jenkins - namespace: ci -spec: - selector: - app: jenkins - ports: - - protocol: TCP - name: jnlp - port: 50000 - targetPort: 50000 - - protocol: TCP - name: http - port: 8080 - targetPort: 8080 ---- -kind: Route -apiVersion: route.openshift.io/v1 -metadata: - name: jenkins - namespace: ci -spec: - to: - kind: Service - name: jenkins - weight: 100 - port: - targetPort: http - tls: - termination: edge - insecureEdgeTerminationPolicy: Redirect - wildcardPolicy: None diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index f94b64b1e..000000000 --- a/Jenkinsfile +++ /dev/null @@ -1,228 +0,0 @@ -#!/usr/bin/env groovy - - -def tested_files = "build/.*|deploy/.*|roles/.*|tests/smoketest/.*|Makefile|watches.yaml|Jenkinsfile" - -// can't just use BUILD_TAG because qdr operator limits name of resources to 60 chars -def namespace = env.JOB_BASE_NAME + '-' + env.BUILD_NUMBER -namespace = namespace.toLowerCase() -namespace = namespace.replaceAll('\\.', '-') - -def stf_resource = """ -apiVersion: infra.watch/v1beta1 -kind: ServiceTelemetry -metadata: - name: default - namespace: ${namespace} -spec: - observabilityStrategy: use_redhat - 
alerting: - alertmanager: - storage: - strategy: ephemeral - receivers: - snmpTraps: - enabled: true - backends: - events: - elasticsearch: - enabled: true - storage: - strategy: ephemeral - metrics: - prometheus: - enabled: true - storage: - strategy: ephemeral - transports: - qdr: - enabled: true - deploymentSize: 1 - web: - enabled: false - elasticsearchManifest: | - apiVersion: elasticsearch.k8s.elastic.co/v1 - kind: Elasticsearch - metadata: - name: elasticsearch - namespace: $namespace - spec: - version: 7.16.1 - volumeClaimDeletePolicy: DeleteOnScaledownAndClusterDeletion - http: - tls: - certificate: - secretName: 'elasticsearch-es-cert' - nodeSets: - - config: - node.roles: - - master - - data - - ingest - node.store.allow_mmap: true - count: 1 - name: default - podTemplate: - metadata: - labels: - tuned.openshift.io/elasticsearch: elasticsearch - spec: - containers: - - name: elasticsearch - resources: - limits: - cpu: '2' - memory: 2Gi - requests: - cpu: '1' - memory: 1Gi - volumes: - - emptyDir: {} - name: elasticsearch-data -""" - -def working_branch = "master" - -pipeline { - agent { - kubernetes { - inheritFrom 'ocp-agent' - defaultContainer 'exec' - } - } - environment { - run_ci = sh(script: "git fetch origin ${env.CHANGE_TARGET} && git diff --name-only origin/${env.CHANGE_TARGET} | egrep \"${tested_files}\"", returnStatus: true) - } - stages { - stage('Clone Upstream') { - when { - environment name: 'run_ci', value: '0' - } - steps { - dir('service-telemetry-operator') { - catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') { - checkout scm - script { - working_branch = sh(script: 'git ls-remote --heads origin | grep $(git rev-parse HEAD) | cut -d / -f 3-', returnStdout: true).toString().trim() - if (!working_branch) { - // in this case, a merge with the base branch was required thus we use the second to last commit - // to find the original topic branch name - working_branch = sh(script: 'git ls-remote --heads origin | grep $(git rev-parse 
HEAD~1) | cut -d / -f 3-', returnStdout: true).toString().trim() - } - } - sh "git checkout -b ${working_branch}" - } - } - } - } - stage('Create project') { - when { - environment name: 'run_ci', value: '0' - expression { - currentBuild.result == null - } - } - steps { - dir('service-telemetry-operator') { - catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') { - script { - openshift.withCluster() { - openshift.newProject(namespace) - } - } - } - } - } - } - stage('Build STF Containers') { - when { - environment name: 'run_ci', value: '0' - expression { - currentBuild.result == null - } - } - steps { - dir('service-telemetry-operator') { - catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') { - ansiColor('xterm') { - ansiblePlaybook( - playbook: 'build/run-ci.yaml', - colorized: true, - extraVars: [ - "namespace": namespace, - "__deploy_stf": "false", - "__local_build_enabled": "true", - "__service_telemetry_snmptraps_enabled": "true", - "__service_telemetry_storage_ephemeral_enabled": "true", - "working_branch":"${working_branch}" - ] - ) - } - } - } - } - } - stage('Deploy STF Object') { - when { - environment name: 'run_ci', value: '0' - expression { - currentBuild.result == null - } - } - steps { - dir('service-telemetry-operator') { - catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') { - script { - openshift.withCluster() { - openshift.withProject(namespace) { - timeout(time: 800, unit: 'SECONDS') { - openshift.create(stf_resource) - sh "OCP_PROJECT=${namespace} VALIDATION_SCOPE=use_redhat ./build/validate_deployment.sh" - } - } - } - } - } - } - } - } - stage('Run Smoketest') { - when { - environment name: 'run_ci', value: '0' - expression { - currentBuild.result == null - } - } - steps { - dir('service-telemetry-operator') { - catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') { - sh "OCP_PROJECT=${namespace} ./tests/smoketest/smoketest.sh" - } - } - } - } - stage('Cleanup') { - when { - environment name: 'run_ci', value: '0' 
- } - steps { - dir('service-telemetry-operator') { - script { - openshift.withCluster(){ - openshift.selector("project/${namespace}").delete() - } - } - } - } - post { - always { - script { - if ( currentBuild.result != null && currentBuild.result != 'SUCCESS' ) { - currentBuild.result = 'FAILURE' - } - } - } - } - } - } -} From fc28f2f77192cb81cea8fd5aa95bf2d51edcb7e8 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 19 Apr 2024 17:15:06 +0100 Subject: [PATCH 47/70] [stf-run-ci] Remove the unused is_crc and ocp_ver vars (#589) --- build/stf-run-ci/tasks/main.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 52d35fa1e..4f9e8b8f6 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -41,14 +41,6 @@ kind: Node register: node_info -- name: Get OCP version - ansible.builtin.shell: oc version -o yaml | grep openshiftVersion | awk '{print $2}' - register: ocp_ver - -- name: Find out if we are using crc by looking at the node hostnames - ansible.builtin.set_fact: - is_crc: "{{ True if 'crc' in node_info.resources[0].metadata.labels[\"kubernetes.io/hostname\"] else False }}" - # -- prepare environment and cleanup - name: Clean up any existing global artifacts ansible.builtin.include_tasks: pre-clean.yml From f7825ba421f1482472daffd5653cb10551496885 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Fri, 26 Apr 2024 15:38:37 +0200 Subject: [PATCH 48/70] Bump actions checkout to v4 (#594) * Bump actions checkout to v4.1.3 Node.js 16 actions are deprecated. We need to update to Node.js 20, which is included in actions/checkout@v4. 
Also bumps helm kind-action to v1.10.0 --- .github/workflows/main.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2b04f59a5..1adcade8c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4.1.3 - name: Install Ansible run: python -m pip install 'ansible <= 2.9' @@ -26,7 +26,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4.1.3 - name: Install Ansible run: python -m pip install 'ansible' @@ -46,7 +46,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4.1.3 - name: Get operator-sdk image 0.19.4 run: curl --output operator-sdk -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk-$RELEASE_VERSION-x86_64-linux-gnu @@ -69,7 +69,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4.1.3 - name: Verify image builds run: docker build --tag infrawatch/service-telemetry-operator:latest --file build/Dockerfile . 
@@ -82,7 +82,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4.1.3 - name: Get operator-sdk image 0.19.4 run: curl --output operator-sdk -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk-$RELEASE_VERSION-x86_64-linux-gnu @@ -110,7 +110,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4.1.3 # prepare environment to buld the bundle - name: Get operator-sdk image 0.19.4 @@ -146,7 +146,7 @@ jobs: run: operator-sdk-$RELEASE_VERSION bundle validate --verbose /tmp/bundle - name: Create KinD cluster to execute scorecard tests - uses: helm/kind-action@v1.4.0 + uses: helm/kind-action@v1.10.0 # perform scorecard checks against a KinD cluster - name: Check scorecord validation From 0873cb4d9d5afdbc888d9df4fcc2c7b4c88acaf8 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Thu, 2 May 2024 18:30:41 +0100 Subject: [PATCH 49/70] Add documentation about the Zuul jobs (#567) * Describe the Zuul jobs * Update the job description in .zuul.yaml * Update FAQs and level of detail * Add info about triggering jobs Co-authored-by: Chris Sibbitt Co-authored-by: Leif Madsen --- .zuul.yaml | 9 ++-- ci/README.md | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 ci/README.md diff --git a/.zuul.yaml b/.zuul.yaml index 636f0cf44..f81d81ffb 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -15,10 +15,13 @@ - name: crc label: coreos-crc-extracted-2-30-0-xxl -# Based on the 2-node job cookbook at https://github.com/openstack-k8s-operators/ci-framework/blob/main/docs/source/cookbooks/zuul-job-nodeset.md - job: name: stf-base-2node parent: podified-multinode-edpm-deployment-crc + description: | + A base job for STF that deploys extracted CRC with a 2-node topology. 
+ This job is based on the cookbook example in: https://github.com/openstack-k8s-operators/ci-framework/blob/main/docs/source/cookbooks/zuul-job-nodeset.md + All non-abstract jobs that inherit from this must have a nodeset configured. abstract: true required-projects: - name: github.com/openstack-k8s-operators/dataplane-operator @@ -68,11 +71,11 @@ - job: name: stf-base - # defined in: https://review.rdoproject.org/cgit/config/tree/zuul.d/_jobs-crc.yaml parent: stf-base-2node abstract: true description: | Run the stf-run-ci role, and then test stf + Any non-abstract jobs that inherit from this must pass a `scenario` var. roles: # adds in dependent roles i.e. put it in the role path - zuul: github.com/openstack-k8s-operators/ci-framework # These are the additional repos that zuul will clone @@ -99,7 +102,7 @@ parent: stf-base abstract: true description: | - Deploy STF using the nightly bundles + Deploy stf using the nightly bundles vars: scenario: "nightly_bundles" diff --git a/ci/README.md b/ci/README.md new file mode 100644 index 000000000..93e9e60f2 --- /dev/null +++ b/ci/README.md @@ -0,0 +1,133 @@ +# service-telemetry-operator CI playbooks + +The playbooks in this directory are used by zuul jobs, which are defined in ../.zuul.yaml. + +## Job descriptions + +### PR jobs + +There are 6 jobs run on every PR that is targeting `master`. +These are reported under the `rdoproject.org/github-check` check. + +Two scenarios run: +- `local_build`, which builds the STF images and deploys by creating a STF object. +- `local_build-index_deploy`, which builds the images and does an index-based deployment + +Each of these scenarios run across the following OCP versions: +- 4.12 +- 4.13 +- 4.14 + +### Periodic jobs + +The `nightly_bundles` jobs are run nightly. These jobs deploy STF using the nightly builds published to quay.io. +The same three versions of OCP are used. 
+ +## Job hierarchy + +The jobs in this repo have two base jobs: + +- `stf-base-2node` +- `stf-base` + +These two base jobs are split according to purpose: infrastructure provisioning and STF deployment. + +`stf-base-2node` inherits from jobs defined in [ci-framework](http://github.com/openstack-k8s-operators/ci-framework), [rdo-jobs](https://review.rdoproject.org/cgit/rdo-jobs/) and [rdo/config](https://review.rdoproject.org/cgit/config/) repos. +This job configures the hosts used for running the jobs. +It is expected that `stf-base-2node` should not be modified unless there are changes to the upstream jobs. + +`stf-base` inherits from `stf-base-2node`, and defines the stf-specific parts of the jobs (prepare hosts, build STF images, deploy STF, test STF). + +These jobs are [abstract](https://zuul-ci.org/docs/zuul/latest/config/job.html#attr-job.abstract) and cannot be run directly, however, they contain the plumbing that allows the deployment scenario and OCP version to be configured. + +The scenario (`nightly_bundles`, `local_build`, `local_build-index_deploy`) is selected by passing a `scenario` [var to the job](https://zuul-ci.org/docs/zuul/latest/config/job.html#attr-job.vars). +The OCP version is selected by changing the nodeset that is used in the job. + +The jobs are named to describe the combination of scenario and OCP version that is used in the job. +The naming convention is `stf-crc-ocp_<ocp_version>-<scenario>`, e.g. `stf-crc-ocp_413-local_build` + +## OCP version selection + +The OCP version selection is done by specifying the `nodeset` to be used by the job. +The `nodesets` are defined in `.zuul.yaml`. Each nodeset corresponds to a different version of OCP. +Each nodeset contains two hosts: `crc` and `controller`. +All ansible playbooks are run against `controller`. + +The rest of this section provides further detail on the OCP version selection, and how it relates to CRC and the deployment topology. + +The nodesets select the hosts based on labels in zuul.
+The labels available in zuul are shown on the [RDO Zuul labels tab](https://review.rdoproject.org/zuul/labels). + +The labels used for the nodesets are `coreos-crc-extracted-<crc_version>-<size>` (for example, `coreos-crc-extracted-2-30-0-xxl`). +The “extracted” CRC describes the way that the job deploys and interacts with CRC. + +Usually, CRC is run using the `crc start` command, which creates a VM on your host which runs the OCP cloud. +In Zuul, the provisioned hosts are also virtual machines, so running `crc start` would result in a VM in a VM. This nested virtualisation causes some performance issues. + +The `extracted` deployment tries to address the performance issues associated with nested virtualisation. The infrastructure is more complicated than nested. +The `coreos-crc-extracted-...` labels provide a VM with an extracted CRC VM image, so that the CRC VM can be booted directly by the cloud provider. The `crc` VM is not accessed directly, but via a second `controller` VM, on which tests are run. The `stf-base-2node` job includes a network configuration to make sure the controller can communicate with the OCP deployment in CRC. + +The name of each nodeset corresponds to the version of OCP that is deployed by the CRC image. + +## Adding new jobs + +If a new job needs to be added, it should inherit from `stf-base` (or one of its child-jobs) which includes common tasks for setting up STF. The new jobs should have minimal configuration lines; either the `scenario` var is passed, which selects a vars file for stf-run-ci, to change its configuration, or the nodeset should be updated, which selects the OCP version. + +Below is an example of how to add a job. Take note of how the `scenario` var and the `nodeset` are passed. + + - job: + name: stf-crc-nightly_bundles + parent: stf-base + abstract: true + description: | + Example of a job that extends the `stf-base` job, and passes the `nightly_bundles` scenario var. This job does NOT have a nodeset defined so it must be abstract.
+ vars: + scenario: "nightly_bundles" + + + - job: + name: stf-crc-ocp_414-nightly_bundles + parent: stf-crc-nightly_bundles + description: | + Example of a job defining a nodeset to be used. + Since this job derives from a job with a scenario, it can be run directly. + nodeset: stf-crc_extracted-ocp414 + +All non-abstract jobs inheriting from `stf-base` must pass a `scenario` var to work correctly. There is no default value for the `scenario`. +All non-abstract jobs defined in this repo must have a `nodeset` to run correctly. Specifically, the nodeset must include nodes called `controller` and `crc`. This requirement comes from the `stf-base-2node` job. + +Once a new job is defined, it should be added to a project or to the `stf-crc-jobs` [template](https://zuul-ci.org/docs/zuul/latest/config/project.html#project-template) in `.zuul.yaml`. +Any job added to a project is run only against changes to that project. +Any job added to the `stf-crc-jobs` project template is run in the other repos across the infrawatch org. + +## Troubleshooting + +## FAQ + +### How does Zuul work across branches? +Each branch has its own zuul configuration. The configuration for a particular branch lives on that branch. +To run jobs on a branch, the `.zuul.yaml` file needs to exist on that branch. + +### How does Zuul decide which branches to check out? + +- For the repo-in-test, zuul checks out the dev branch. +- For all other required repos, zuul checks out the branch with the same name as the target (usually master, sometimes stable*) +- If `branch-override` option is specified in the job definition, then that branch is checked out instead of the default. +- When you use `Depends-On`, it checks out the branch in the referenced PR/changeset. + +### How do I test dependent patches? +If you're working on a change that involves PRs to multiple repos (which are tested by Zuul), you can add a `Depends-On: <PR URL>` line to the PR description of your change.
+ +You can use `Depends-On` to reference a change in any repo that zuul knows about (i.e. included in `project.yaml` in RDO in this case). + +### How do I add Zuul to a new repo? +The Zuul instance we use is hosted by RDO. In order for jobs to be run on a new repo, the following criteria must be met: +- The `softwarefactory-project-zuul` github app must also be added to the organisation (this is already done for infrawatch). +- The repo must be configured in [rdo/config](https://review.rdoproject.org/cgit/config/tree/zuul/rdo.yaml). An example of adding a repo is [here](https://review.rdoproject.org/r/c/config/+/51666). +- The `softwarefactory-project-zuul` app must have repository access configured for the repo you want to add. This setting can be found in organisation/infrawatch -> settings -> Github Apps. + +### How do I configure job triggers? +In Zuul, jobs themselves don't have triggers. Triggers are configured per-pipeline. +Each job needs to be added to a pipeline to run. + +RDO Zuul defines the [pipelines that we can use](https://review.rdoproject.org/cgit/config/tree/zuul.d/pipelines.yaml).
From 9023f0ebad0fa46869b4eb44b4f3aeb2331889dd Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Fri, 3 May 2024 10:57:21 +0200 Subject: [PATCH 50/70] Set a default image for ElasticSearch to use (#593) * Set a default image for ElasticSearch to use Currently we are not selecting the image for ElasticSearch that the ElasticSearch Operator pulls, we are allowing the default (in ElasticSearch registry) to be used Add the option to select the ElasticSearch image to use Also bumps the version to ElasticSearch 7.17.20, since 7.16.1 has several vulnerabilities * Fix ElasticSearch image and version setting The ElasticSearch CRD expects a version (X.Y.Z format) and a URL without version for the location of the image Correct how we set this in the default variables --- build/stf-run-ci/defaults/main.yml | 3 ++- build/stf-run-ci/templates/manifest_elasticsearch.j2 | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index a6c5ee184..32763c48d 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -36,7 +36,8 @@ __smart_gateway_bundle_image_path: "quay.io/infrawatch-operators/smart-gateway-o default_operator_registry_image_base: registry.redhat.io/openshift4/ose-operator-registry default_operator_registry_image_tag: v4.13 -elasticsearch_version: 7.16.1 +elasticsearch_version: 7.17.20 +elasticsearch_image: registry.connect.redhat.com/elastic/elasticsearch sgo_image_tag: latest sto_image_tag: latest diff --git a/build/stf-run-ci/templates/manifest_elasticsearch.j2 b/build/stf-run-ci/templates/manifest_elasticsearch.j2 index e2e50c6f4..0a59a046e 100644 --- a/build/stf-run-ci/templates/manifest_elasticsearch.j2 +++ b/build/stf-run-ci/templates/manifest_elasticsearch.j2 @@ -49,4 +49,5 @@ spec: certificateAuthorities: {} updateStrategy: changeBudget: {} - version: {{ elasticsearch_version }} \ No newline at end of file + version: {{ 
elasticsearch_version }} + image: {{ elasticsearch_image }}:{{ elasticsearch_version }} From 8a6ed20eaa3a775e0aa5e16e08c3d22f7aa7d34b Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Fri, 3 May 2024 11:50:53 +0200 Subject: [PATCH 51/70] Fail earlier when registry creds are not set (#592) * Fail earlier when registry creds are not set Move the credential setup for the internal registry up in the execution and perform a simple check with the "oc image info" command to fail earlier in case the credentials haven't been set properly * Add assert to the internal registry creds check Enhance debugging experience by adding more information when trying to access to the required bundles in the internal registry * Change "internal registry" for "bundles registry" Use a more accurate term when refering to the registry in which the bundles are located when doing the early registry access check * Keep logic for checking bundle registry creds Maintain the conditionals when checking the config for the bundle registry credentials and cert --- build/stf-run-ci/tasks/main.yml | 4 + .../stf-run-ci/tasks/setup_registry_auth.yml | 104 ++++++++++++++++++ .../tasks/setup_stf_from_bundles.yml | 86 --------------- 3 files changed, 108 insertions(+), 86 deletions(-) create mode 100644 build/stf-run-ci/tasks/setup_registry_auth.yml diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 4f9e8b8f6..eec4313cb 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -47,6 +47,10 @@ tags: - pre-clean +- name: Set up bundle registry credentials (deploy from bundles) + ansible.builtin.include_tasks: setup_registry_auth.yml + when: __deploy_from_bundles_enabled | bool or setup_bundle_registry_auth | bool + - name: Setup supporting Operator subscriptions ansible.builtin.include_tasks: setup_base.yml tags: diff --git a/build/stf-run-ci/tasks/setup_registry_auth.yml b/build/stf-run-ci/tasks/setup_registry_auth.yml new file mode 
100644 index 000000000..bd33b8194 --- /dev/null +++ b/build/stf-run-ci/tasks/setup_registry_auth.yml @@ -0,0 +1,104 @@ +- name: Update Pull Secret with bundle registry credentials + when: setup_bundle_registry_auth | bool + block: + - name: Get existing Pull Secret from openshift config + kubernetes.core.k8s_info: + api_version: v1 + kind: Secret + namespace: openshift-config + name: pull-secret + register: pull_secret + + - name: Decode docker config json + ansible.builtin.set_fact: + dockerconfigjson: "{{ pull_secret.resources[0].data['.dockerconfigjson'] | b64decode }}" + + - name: Merge registry creds into auth section of docker config + ansible.builtin.set_fact: + new_dockerauths: "{{ dockerconfigjson['auths'] | combine( { + pull_secret_registry:{ + 'auth': (pull_secret_user ~ ':' ~ pull_secret_pass) | b64encode + } + }) }}" + + - name: Create new docker config + ansible.builtin.set_fact: + new_dockerconfigjson: "{{ dockerconfigjson | combine({'auths': new_dockerauths}) }}" + + - name: Create Pull Secret for bundle registry access (in the local namespace) + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Secret + type: kubernetes.io/dockerconfigjson + metadata: + name: pull-secret + namespace: "{{ namespace }}" + data: + .dockerconfigjson: "{{ new_dockerconfigjson | tojson | b64encode }}" + + - name: Create Pull Secret for bundle registry access (in the global namespace) + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Secret + type: kubernetes.io/dockerconfigjson + metadata: + name: pull-secret + namespace: openshift-config + data: + .dockerconfigjson: "{{ new_dockerconfigjson | tojson | b64encode }}" + +- name: Create registry CA Cert + when: setup_bundle_registry_tls_ca | bool + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Secret + type: Opaque + metadata: + name: registry-tls-ca + namespace: "{{ namespace }}" + data: + cert.pem: "{{ lookup('file', 'CA.pem') | 
b64encode }}" + +- name: Patch the default service account to use our pull secret + when: setup_bundle_registry_tls_ca | bool + kubernetes.core.k8s_json_patch: + kind: ServiceAccount + namespace: "{{ namespace }}" + name: default + patch: + - op: add + path: /imagePullSecrets + value: + - name: pull-secret + +- name: Ensure that the bundle paths are set + ansible.builtin.assert: + that: + - '__smart_gateway_bundle_image_path | default("") | length > 0' + - '__service_telemetry_bundle_image_path | default("") | length > 0' + fail_msg: "Bundle path(s) not set. __smart_gateway_bundle_image_path is '{{ __smart_gateway_bundle_image_path }}' and __service_telemetry_bundle_image_path is '{{ __service_telemetry_bundle_image_path }}'. Both values need to be set." + success_msg: "Bundle paths are defined, are not None and have a non-zero-length." + +- name: Try to access to the STO bundle + ansible.builtin.command: oc image info {{ __service_telemetry_bundle_image_path }} + register: sto_bundle_info + ignore_errors: true + +- name: Try to access to the SGO bundle + ansible.builtin.command: oc image info {{ __smart_gateway_bundle_image_path }} + register: sgo_bundle_info + ignore_errors: true + +- name: Check successful read access to STO and SGO bundles in the internal registry + ansible.builtin.assert: + that: + - sto_bundle_info.rc != 0 + - sgo_bundle_info.rc != 0 + fail_msg: "Bundles couldn't be retrieved. Check configuration for the bundles registry and retry." + success_msg: "Bundles were correctly retrieved from the registry." 
diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index cdb09be85..9406ad278 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -1,81 +1,3 @@ -- when: setup_bundle_registry_auth | bool - block: - - name: Get existing Pull Secret from openshift config - kubernetes.core.k8s_info: - api_version: v1 - kind: Secret - namespace: openshift-config - name: pull-secret - register: pull_secret - - - name: Decode docker config json - ansible.builtin.set_fact: - dockerconfigjson: "{{ pull_secret.resources[0].data['.dockerconfigjson'] | b64decode }}" - - - name: Merge registry creds into auth section of docker config - ansible.builtin.set_fact: - new_dockerauths: "{{ dockerconfigjson['auths'] | combine( { - pull_secret_registry:{ - 'auth': (pull_secret_user ~ ':' ~ pull_secret_pass) | b64encode - } - }) }}" - - - name: Create new docker config - ansible.builtin.set_fact: - new_dockerconfigjson: "{{ dockerconfigjson | combine({'auths': new_dockerauths}) }}" - - - name: Create Pull Secret for bundle registry access (in the local namespace) - kubernetes.core.k8s: - state: present - definition: - apiVersion: v1 - kind: Secret - type: kubernetes.io/dockerconfigjson - metadata: - name: pull-secret - namespace: "{{ namespace }}" - data: - .dockerconfigjson: "{{ new_dockerconfigjson | tojson | b64encode }}" - - - name: Create Pull Secret for bundle registry access (in the global namespace) - kubernetes.core.k8s: - state: present - definition: - apiVersion: v1 - kind: Secret - type: kubernetes.io/dockerconfigjson - metadata: - name: pull-secret - namespace: openshift-config - data: - .dockerconfigjson: "{{ new_dockerconfigjson | tojson | b64encode }}" - -- when: setup_bundle_registry_tls_ca | bool - name: Create registry CA Cert - kubernetes.core.k8s: - state: present - definition: - apiVersion: v1 - kind: Secret - type: Opaque - metadata: - name: 
registry-tls-ca - namespace: "{{ namespace }}" - data: - cert.pem: "{{ lookup('file', 'CA.pem') | b64encode }}" - -- when: setup_bundle_registry_tls_ca | bool - name: Patch the default service account to use our pull secret - kubernetes.core.k8s_json_patch: - kind: ServiceAccount - namespace: "{{ namespace }}" - name: default - patch: - - op: add - path: /imagePullSecrets - value: - - name: pull-secret - # When the task is skipped, pull_secret is still defined. It is set to the task output i.e. # "pull_secret": { # "changed": false, @@ -87,14 +9,6 @@ ansible.builtin.set_fact: pull_secret: '' -- name: "Ensure that the bundle paths are set." - ansible.builtin.assert: - that: - - '__smart_gateway_bundle_image_path | default("") | length > 0' - - '__service_telemetry_bundle_image_path | default("") | length > 0' - fail_msg: "Bundle path(s) not set. __smart_gateway_bundle_image_path is '{{ __smart_gateway_bundle_image_path }}' and __service_telemetry_bundle_image_path is '{{ __service_telemetry_bundle_image_path }}'. Both values need to be set." - success_msg: "Bundle paths are defined, are not None and have a non-zero-length" - - name: Deploy SGO via OLM bundle ansible.builtin.shell: cmd: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }} --verbose run bundle {{ __smart_gateway_bundle_image_path }} {% if pull_secret | length > 0 %} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca {% endif %} --namespace={{ namespace }} --timeout 600s" From acdb05f95e4159f2c333d51c169338d831955c28 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Fri, 10 May 2024 15:02:46 +0200 Subject: [PATCH 52/70] Change STO/SGO bundles sanity check (#595) * Change STO/SGO bundles sanity check stdout for "oc image info" should be checked instead of the rc to make sure that the correct bundle is being used * Drop the "successful read bundles" assertion Make checks simpler and more granular by using the fail module instead of the assertion for each of the bundles. 
* Update build/stf-run-ci/tasks/setup_registry_auth.yml Co-authored-by: Chris Sibbitt --------- Co-authored-by: Chris Sibbitt --- .../stf-run-ci/tasks/setup_registry_auth.yml | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/build/stf-run-ci/tasks/setup_registry_auth.yml b/build/stf-run-ci/tasks/setup_registry_auth.yml index bd33b8194..5c096fdcc 100644 --- a/build/stf-run-ci/tasks/setup_registry_auth.yml +++ b/build/stf-run-ci/tasks/setup_registry_auth.yml @@ -90,15 +90,27 @@ register: sto_bundle_info ignore_errors: true +- name: Print STO bundle info + ansible.builtin.debug: + msg: + - "{{ sto_bundle_info }}" + +- name: Fail if the expected STO bundle was not correctly fetched + ansible.builtin.fail: + msg: "The requested STO bundle couldn't be retrieved from the bundle registry. Check configuration for the bundles registry and retry." + when: "__service_telemetry_bundle_image_path not in sto_bundle_info.stdout" + - name: Try to access to the SGO bundle ansible.builtin.command: oc image info {{ __smart_gateway_bundle_image_path }} register: sgo_bundle_info ignore_errors: true -- name: Check successful read access to STO and SGO bundles in the internal registry - ansible.builtin.assert: - that: - - sto_bundle_info.rc != 0 - - sgo_bundle_info.rc != 0 - fail_msg: "Bundles couldn't be retrieved. Check configuration for the bundles registry and retry." - success_msg: "Bundles were correctly retrieved from the registry." +- name: Print SGO bundle info + ansible.builtin.debug: + msg: + - "{{ sgo_bundle_info }}" + +- name: Fail is the expected SGO bundle was not correctly fetched + ansible.builtin.fail: + msg: "The requested SGO bundle couldn't be retrieved from the bundle registry. Check configuration for the bundles registry and retry." 
+ when: "__smart_gateway_bundle_image_path not in sgo_bundle_info.stdout" From 69b8fe5d242874bc244e489e3d587202cb5a41b5 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Tue, 21 May 2024 09:49:59 +0200 Subject: [PATCH 53/70] Add __deploy_disconnected option (#591) * Add __deploy_disconnected option This new option of stf-run-ci allows the utilization of the setup_base and setup_stf roles for the purpose of deploying STF on OCP Disconnected. It is expected that this is run in an environment in which the catalogsources have been defined and the precise catalogsource for each of the dependency operators needs to be passed to the role. For this, the catalogsource for each of the operators can be modified with the new parameters redhat_operators, community_operators and certified_operators. Default values for these are set to the previous values to keep backwards compatibility * Small README fix in the __disconnected_deploy opt * Small README fix in the __disconnected_deploy opt * Define infrawatch_operators in default * Add a fail check when __disconnected_deploy is set For now, we should fail when __disconnected_deploy is set with other deployment options.
At the moment, we deploy in OCP disconnected by using pre catalog generated by the oc-mirror tool * Pass fail conditions as list instead of a oneliner Make conditions easier to read by passing them as a list instead of a single line See https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_conditionals.html#conditionals-based-on-ansible-facts --- build/stf-run-ci/README.md | 1 + build/stf-run-ci/defaults/main.yml | 9 +++++++++ build/stf-run-ci/tasks/main.yml | 20 +++++++++++++++++--- build/stf-run-ci/tasks/setup_base.yml | 11 ++++++----- build/stf-run-ci/tasks/setup_stf.yml | 17 ++++++++++------- 5 files changed, 43 insertions(+), 15 deletions(-) diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 664e57e7a..b9b1afbc0 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -21,6 +21,7 @@ choose to override: | `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. Also see `working_branch`, `sg_branch`, `sgo_branch` | | `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | | `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles and index image. 
| +| `__disconnected_deploy` | {true,false} | false | Whether to deploy on a disconnected cluster | | `__service_telemetry_bundle_image_path` | | `quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head` | Image path to Service Telemetry Operator bundle | | `__smart_gateway_bundle_image_path` | | `quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head` | Image path to Smart Gateway Operator bundle | | `setup_bundle_registry_tls_ca` | {true,false} | true | Whether to setup or not a TLS CA cert for the bundle registry access | diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 32763c48d..5001dd2c0 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -10,6 +10,8 @@ __deploy_from_bundles_enabled: false __deploy_from_index_enabled: false __deploy_stf: true +__disconnected_deploy: false + __service_telemetry_events_certificates_endpoint_cert_duration: 70080h __service_telemetry_events_certificates_ca_cert_duration: 70080h __service_telemetry_events_enabled: true @@ -54,6 +56,13 @@ pull_secret_registry: pull_secret_user: pull_secret_pass: +redhat_operators: redhat-operators +community_operators: community-operators +certified_operators: certified-operators +infrawatch_operators: infrawatch-operators + +stf_channel: unstable + # used when building images to default to correct version branch for STF subcomponents per STF version version_branches: sgo: master diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index eec4313cb..73508fedc 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -24,17 +24,30 @@ - name: Fail on mutually exclusive flags ansible.builtin.fail: msg: __deploy_from_bundles_enabled not currently supported with __local_build_enabled (but should be) - when: __local_build_enabled | bool and __deploy_from_bundles_enabled | bool + when: + - __local_build_enabled | bool + - 
__deploy_from_bundles_enabled | bool - name: Fail when deploying from index image and local build disabled ansible.builtin.fail: msg: __deploy_from_index_enabled must also have __local_build_enabled - when: __deploy_from_index_enabled | bool and not __local_build_enabled | bool + when: + - __deploy_from_index_enabled | bool + - not __local_build_enabled | bool - name: Fail when deploying from index images and deployment from bundles also requested (mutually exclusive methods) ansible.builtin.fail: msg: __deploy_from_index_enabled can not be used with __deploy_from_bundles_enabled - when: __deploy_from_index_enabled | bool and __deploy_from_bundles_enabled | bool + when: + - __deploy_from_index_enabled | bool + - __deploy_from_bundles_enabled | bool + +- name: Fail when disconnected deploy and other deployment options also requested + ansible.builtin.fail: + msg: __disconnected_deploy cannot be used if __deploy_from_bundles_enabled, __deploy_from_index_enabled or __local_build_enabled + when: + - __disconnected_deploy | bool + - __deploy_from_bundles_enabled | bool or __deploy_from_index_enabled | bool or __local_build_enabled | bool - name: Get the list of nodes kubernetes.core.k8s_info: @@ -73,6 +86,7 @@ prometheus_webhook_snmp_dir: "{{ prometheus_webhook_snmp_dir if prometheus_webhook_snmp_dir is defined else base_dir + '/working/prometheus-webhook-snmp' }}" - name: Get operator_sdk_v0 (build bundles) + when: __local_build_enabled | bool ansible.builtin.command: cmd: "./get_operator_sdk.sh {{ operator_sdk_v0 }}" creates: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 1856bd34d..443d7cb76 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -1,5 +1,6 @@ --- - name: Setup OperatorHub dependencies + when: not __disconnected_deploy | bool kubernetes.core.k8s: definition: apiVersion: config.openshift.io/v1 @@ -44,7 
+45,7 @@ channel: development installPlanApproval: Automatic name: cluster-observability-operator - source: redhat-operators + source: "{{ redhat_operators }}" sourceNamespace: openshift-marketplace when: - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] @@ -62,7 +63,7 @@ channel: beta installPlanApproval: Automatic name: prometheus - source: community-operators + source: "{{ community_operators }}" sourceNamespace: openshift-marketplace when: - __service_telemetry_observability_strategy == "use_community" @@ -107,7 +108,7 @@ channel: stable-v1 installPlanApproval: Automatic name: openshift-cert-manager-operator - source: redhat-operators + source: "{{ redhat_operators }}" sourceNamespace: openshift-marketplace # installed by properties.yaml definition as of STF 1.5.3 @@ -125,7 +126,7 @@ channel: 1.10.x installPlanApproval: Automatic name: amq7-interconnect-operator - source: redhat-operators + source: "{{ redhat_operators }}" sourceNamespace: openshift-marketplace # undocumented procedure: used for backwards compatilibity verification @@ -142,7 +143,7 @@ channel: stable installPlanApproval: Automatic name: elasticsearch-eck-operator-certified - source: certified-operators + source: "{{ certified_operators }}" sourceNamespace: openshift-marketplace - name: Wait for Elasticsearch CRD to appear diff --git a/build/stf-run-ci/tasks/setup_stf.yml b/build/stf-run-ci/tasks/setup_stf.yml index ffe366f6f..819d74809 100644 --- a/build/stf-run-ci/tasks/setup_stf.yml +++ b/build/stf-run-ci/tasks/setup_stf.yml @@ -1,5 +1,8 @@ --- - name: Set default InfraWatch OperatorSource manifest + when: + - __deploy_from_index_enabled | bool + - infrawatch_catalog_source_manifest is not defined ansible.builtin.set_fact: infrawatch_catalog_source_manifest: | apiVersion: operators.coreos.com/v1alpha1 @@ -15,9 +18,9 @@ updateStrategy: registryPoll: interval: 30m - when: infrawatch_catalog_source_manifest is not defined - name: Set default Smart Gateway Operator 
Subscription manifest + when: smart_gateway_operator_subscription_manifest is not defined ansible.builtin.set_fact: smart_gateway_operator_subscription_manifest: | apiVersion: operators.coreos.com/v1alpha1 @@ -26,14 +29,14 @@ name: smart-gateway-operator namespace: "{{ namespace }}" spec: - channel: unstable + channel: "{{ stf_channel }}" installPlanApproval: Automatic name: smart-gateway-operator - source: infrawatch-operators + source: "{{ infrawatch_operators }}" sourceNamespace: openshift-marketplace - when: smart_gateway_operator_subscription_manifest is not defined - name: Set default Service Telemetry Operator Subscription manifest + when: service_telemetry_operator_subscription_manifest is not defined ansible.builtin.set_fact: service_telemetry_operator_subscription_manifest: | apiVersion: operators.coreos.com/v1alpha1 @@ -42,15 +45,15 @@ name: service-telemetry-operator namespace: "{{ namespace }}" spec: - channel: unstable + channel: "{{ stf_channel }}" installPlanApproval: Automatic name: service-telemetry-operator - source: infrawatch-operators + source: "{{ infrawatch_operators }}" sourceNamespace: openshift-marketplace - when: service_telemetry_operator_subscription_manifest is not defined # enable catalogsource - name: Enable InfraWatch Catalog Source + when: __deploy_from_index_enabled | bool kubernetes.core.k8s: definition: '{{ infrawatch_catalog_source_manifest }}' From 2695610ef0f7acc374fb002963455eb16eab4343 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 28 May 2024 06:19:59 -0400 Subject: [PATCH 54/70] Limit port 5672 to localhost (#596) --- roles/servicetelemetry/tasks/component_qdr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 0ddbb03f4..1285ae059 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -246,6 +246,7 @@ sslProfile: inter-router listeners: - port: 5672 
+ host: 127.0.0.1 - expose: {{ servicetelemetry_vars.transports.qdr.web.enabled }} http: true port: 8672 From 429cc3bad77e337877f62f271f92a2a45725c7ea Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Wed, 29 May 2024 20:11:51 +0100 Subject: [PATCH 55/70] [zuul] Give OCP 4.12 jobs more resources (#600) * [zuul] Give OCP 4.12 jobs more resources * [zuul] Add new nodeset for OCP4.12 Editing the existing nodeset caused an error because the nodeset definition on master was different. * Add multiple 412 jobs with different nodesets --- .zuul.yaml | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index f81d81ffb..75d9ce781 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -7,6 +7,14 @@ - name: crc label: coreos-crc-extracted-2-19-0-xxl +- nodeset: + name: stf-crc_extracted-ocp412-new + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-19-0-3xl + - nodeset: name: stf-crc_extracted-ocp414 nodes: @@ -129,7 +137,7 @@ parent: stf-crc-nightly_bundles description: | Deploy STF using the nightly bundles on OCP 4.12 - nodeset: stf-crc_extracted-ocp412 + nodeset: stf-crc_extracted-ocp412-new - job: name: stf-crc-ocp_414-nightly_bundles @@ -143,7 +151,7 @@ parent: stf-crc-local_build description: | Build images locally and deploy STF on OCP 4.12 - nodeset: stf-crc_extracted-ocp412 + nodeset: stf-crc_extracted-ocp412-new - job: name: stf-crc-ocp_414-local_build @@ -157,7 +165,7 @@ parent: stf-crc-local_build-index_deploy description: | Build STF locally and deploy from index on OCP 4.12 - nodeset: stf-crc_extracted-ocp412 + nodeset: stf-crc_extracted-ocp412-new - job: name: stf-crc-ocp_414-local_build-index_deploy @@ -185,3 +193,39 @@ jobs: - stf-crc-ocp_412-nightly_bundles - stf-crc-ocp_414-nightly_bundles + github-check: + jobs: + - stf-crc-ocp_412-local_build + - stf-crc-ocp_412-local_build-index_deploy + - 
stf-crc-ocp_412-local_build-index_deploy: + nodeset: + #name: 3xl + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-19-0-3xl + - stf-crc-ocp_412-local_build-index_deploy: + nodeset: + #name: 4xlargdde + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-19-0-4xlarge + - stf-crc-ocp_412-local_build-index_deploy: + nodeset: + #name: 5xlarge + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-19-0-5xlarge + - stf-crc-ocp_412-local_build-index_deploy: + nodeset: + #name: 6xlarge + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-19-0-6xlarge From e7c231e54eb93bb94afcbd58e859aa85ad65cf64 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 31 May 2024 12:43:01 +0100 Subject: [PATCH 56/70] Revert "[zuul] Give OCP 4.12 jobs more resources (#600)" (#603) This change is blocking the gates, due to jobs being queued indefinitely. This reverts commit 429cc3bad77e337877f62f271f92a2a45725c7ea. 
--- .zuul.yaml | 50 +++----------------------------------------------- 1 file changed, 3 insertions(+), 47 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 75d9ce781..f81d81ffb 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -7,14 +7,6 @@ - name: crc label: coreos-crc-extracted-2-19-0-xxl -- nodeset: - name: stf-crc_extracted-ocp412-new - nodes: - - name: controller - label: cloud-centos-9-stream-tripleo-vexxhost - - name: crc - label: coreos-crc-extracted-2-19-0-3xl - - nodeset: name: stf-crc_extracted-ocp414 nodes: @@ -137,7 +129,7 @@ parent: stf-crc-nightly_bundles description: | Deploy STF using the nightly bundles on OCP 4.12 - nodeset: stf-crc_extracted-ocp412-new + nodeset: stf-crc_extracted-ocp412 - job: name: stf-crc-ocp_414-nightly_bundles @@ -151,7 +143,7 @@ parent: stf-crc-local_build description: | Build images locally and deploy STF on OCP 4.12 - nodeset: stf-crc_extracted-ocp412-new + nodeset: stf-crc_extracted-ocp412 - job: name: stf-crc-ocp_414-local_build @@ -165,7 +157,7 @@ parent: stf-crc-local_build-index_deploy description: | Build STF locally and deploy from index on OCP 4.12 - nodeset: stf-crc_extracted-ocp412-new + nodeset: stf-crc_extracted-ocp412 - job: name: stf-crc-ocp_414-local_build-index_deploy @@ -193,39 +185,3 @@ jobs: - stf-crc-ocp_412-nightly_bundles - stf-crc-ocp_414-nightly_bundles - github-check: - jobs: - - stf-crc-ocp_412-local_build - - stf-crc-ocp_412-local_build-index_deploy - - stf-crc-ocp_412-local_build-index_deploy: - nodeset: - #name: 3xl - nodes: - - name: controller - label: cloud-centos-9-stream-tripleo-vexxhost - - name: crc - label: coreos-crc-extracted-2-19-0-3xl - - stf-crc-ocp_412-local_build-index_deploy: - nodeset: - #name: 4xlargdde - nodes: - - name: controller - label: cloud-centos-9-stream-tripleo-vexxhost - - name: crc - label: coreos-crc-extracted-2-19-0-4xlarge - - stf-crc-ocp_412-local_build-index_deploy: - nodeset: - #name: 5xlarge - nodes: - - name: controller - label: 
cloud-centos-9-stream-tripleo-vexxhost - - name: crc - label: coreos-crc-extracted-2-19-0-5xlarge - - stf-crc-ocp_412-local_build-index_deploy: - nodeset: - #name: 6xlarge - nodes: - - name: controller - label: cloud-centos-9-stream-tripleo-vexxhost - - name: crc - label: coreos-crc-extracted-2-19-0-6xlarge From b46f215d5a45a4b0b3703c3c7e54d2a4316ba02f Mon Sep 17 00:00:00 2001 From: chaturvedi-kna <63336082+chaturvedi-kna@users.noreply.github.com> Date: Mon, 10 Jun 2024 22:33:57 +0530 Subject: [PATCH 57/70] Enable QDR port 5671 to listen on both IPv4 and IPv6 (#599) * Make port 5671 listen on all interfaces (IPv4 and IPv6) * Update roles/servicetelemetry/tasks/component_qdr.yml Co-authored-by: Chris Sibbitt --------- Co-authored-by: chaturvedi.kna@ril.com Co-authored-by: Chris Sibbitt Co-authored-by: Emma Foley --- roles/servicetelemetry/tasks/component_qdr.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 1285ae059..e045a290c 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -227,7 +227,6 @@ prefix: ceilometer edgeListeners: - expose: true - host: 0.0.0.0 port: 5671 {% if servicetelemetry_vars.transports.qdr.auth == "basic" %} saslMechanisms: PLAIN From b4050b3123aa32281c1f845150a7a54d23087981 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Tue, 18 Jun 2024 16:17:33 +0200 Subject: [PATCH 58/70] Lower ElasticSearch memory requirements (#605) Lower ES memory reqs so we don't exhaust resources in CI environments. This is reproducible in OCP 4.12 jobs. 
--- build/stf-run-ci/templates/manifest_elasticsearch.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/stf-run-ci/templates/manifest_elasticsearch.j2 b/build/stf-run-ci/templates/manifest_elasticsearch.j2 index 0a59a046e..ebd0dc9d0 100644 --- a/build/stf-run-ci/templates/manifest_elasticsearch.j2 +++ b/build/stf-run-ci/templates/manifest_elasticsearch.j2 @@ -33,10 +33,10 @@ spec: resources: limits: cpu: "2" - memory: 4Gi + memory: 2Gi requests: cpu: "1" - memory: 4Gi + memory: 2Gi volumes: - emptyDir: {} name: elasticsearch-data From e6be5e0bb8cf43d32d2ba2a419f71470b549ac05 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Jun 2024 19:39:00 +0200 Subject: [PATCH 59/70] --- (#598) updated-dependencies: - dependency-name: requests dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Victoria Martinez de la Cruz --- build/stf-run-ci/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/stf-run-ci/requirements.txt b/build/stf-run-ci/requirements.txt index 607e7d87c..ecdb60633 100644 --- a/build/stf-run-ci/requirements.txt +++ b/build/stf-run-ci/requirements.txt @@ -1,5 +1,5 @@ # https://stackoverflow.com/questions/64073422/importerror-cannot-import-name-oauth1session-from-requests-oauthlib -requests==2.31.0 +requests==2.32.0 requests_oauthlib==1.3.0 # https://github.com/domainaware/parsedmarc/issues/318 oauthlib==3.2.2 From 8d0688a1a62be169a0216f5a3a714c0b488a4a5d Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Wed, 19 Jun 2024 11:44:20 -0400 Subject: [PATCH 60/70] Fully disable qdr web by default (#606) --- roles/servicetelemetry/tasks/component_qdr.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 
e045a290c..0182b9161 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -246,9 +246,11 @@ listeners: - port: 5672 host: 127.0.0.1 - - expose: {{ servicetelemetry_vars.transports.qdr.web.enabled }} + {% if servicetelemetry_vars.transports.qdr.web.enabled == "true" %} + - expose: true http: true port: 8672 + {% endif %} sslProfiles: - caCert: {{ ansible_operator_meta.name }}-interconnect-openstack-ca credentials: {{ ansible_operator_meta.name }}-interconnect-openstack-credentials From 43726ee4a7650a2e120893ee4d48fd6fe2b64bed Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Wed, 10 Jul 2024 16:10:13 -0400 Subject: [PATCH 61/70] Use scrapeconfig instead of servicemonitor (#607) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Move from ServiceMonitor to ScrapeConfig * Clean up legacy servicemonitors * Emulate servicemonitor-compatible labels * Update roles/servicetelemetry/tasks/component_scrapeconfig.yml Co-authored-by: Jaromír Wysoglad --- README.md | 1 + ...emetry-operator.clusterserviceversion.yaml | 13 +++ deploy/role.yaml | 4 + .../tasks/base_smartgateway.yml | 4 +- .../tasks/component_scrapeconfig.yml | 93 +++++++++++++++++++ .../tasks/component_servicemonitor.yml | 52 ----------- .../templates/manifest_alertmanager.j2 | 2 +- .../templates/manifest_prometheus.j2 | 2 +- tests/smoketest/smoketest.sh | 4 +- 9 files changed, 117 insertions(+), 58 deletions(-) create mode 100644 roles/servicetelemetry/tasks/component_scrapeconfig.yml delete mode 100644 roles/servicetelemetry/tasks/component_servicemonitor.yml diff --git a/README.md b/README.md index e336242b5..757073711 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ loaded for you. 
* smartgatewayCollectdEventsManifest * smartgatewayCeilometerEventsManifest * servicemonitorManifest +* scrapeconfigManifest ## Development diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 4ec0f7f5c..953bb7739 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -203,6 +203,15 @@ spec: - kind: ServiceMonitors name: servicemonitors.monitoring.coreos.com version: v1 + - kind: ScrapeConfigs + name: scrapeconfigs.monitoring.coreos.com + version: v1alpha1 + - kind: ServiceMonitors + name: servicemonitors.monitoring.rhobs + version: v1 + - kind: ScrapeConfigs + name: scrapeconfigs.monitoring.rhobs + version: v1alpha1 version: v1beta1 description: Service Telemetry Operator for monitoring clouds displayName: Service Telemetry Operator @@ -378,17 +387,21 @@ spec: - apiGroups: - monitoring.coreos.com resources: + - scrapeconfigs - servicemonitors verbs: - get - create + - delete - apiGroups: - monitoring.rhobs resources: + - scrapeconfigs - servicemonitors verbs: - get - create + - delete - apiGroups: - apps resourceNames: diff --git a/deploy/role.yaml b/deploy/role.yaml index cdade2ce7..ba2dfbbe8 100644 --- a/deploy/role.yaml +++ b/deploy/role.yaml @@ -129,17 +129,21 @@ rules: - apiGroups: - monitoring.coreos.com resources: + - scrapeconfigs - servicemonitors verbs: - get - create + - delete - apiGroups: - monitoring.rhobs resources: + - scrapeconfigs - servicemonitors verbs: - get - create + - delete - apiGroups: - apps resourceNames: diff --git a/roles/servicetelemetry/tasks/base_smartgateway.yml b/roles/servicetelemetry/tasks/base_smartgateway.yml index 4d0cfdafd..7077e928d 100644 --- 
a/roles/servicetelemetry/tasks/base_smartgateway.yml +++ b/roles/servicetelemetry/tasks/base_smartgateway.yml @@ -6,8 +6,8 @@ k8s: definition: "{{ lookup('template', manifest) | from_yaml }}" -- name: Deploy SG-specific ServiceMonitor for metrics SGs - include_tasks: component_servicemonitor.yml +- name: Deploy SG-specific ScrapeConfig for metrics SGs + include_tasks: component_scrapeconfig.yml when: - data_type == 'metrics' - has_monitoring_api | bool diff --git a/roles/servicetelemetry/tasks/component_scrapeconfig.yml b/roles/servicetelemetry/tasks/component_scrapeconfig.yml new file mode 100644 index 000000000..b05a5ac03 --- /dev/null +++ b/roles/servicetelemetry/tasks/component_scrapeconfig.yml @@ -0,0 +1,93 @@ +- name: Look up prometheus-stf SA to get auth secret name + k8s_info: + api_version: v1 + kind: ServiceAccount + namespace: '{{ ansible_operator_meta.namespace }}' + name: prometheus-stf + register: service_account + +- name: Look up auth secret to get token secret name + k8s_info: + api_version: v1 + kind: Secret + namespace: '{{ ansible_operator_meta.namespace }}' + name: '{{ service_account.resources[0].secrets[0].name }}' + register: auth_secret + +- name: Create SG-specific Scrape Config manifest + set_fact: + sg_specific_scrapeconfig_manifest: | + apiVersion: {{ prometheus_operator_api_string | replace("/v1","/v1alpha1") }} + kind: ScrapeConfig + metadata: + labels: + app: smart-gateway + name: '{{ this_smartgateway }}' + namespace: '{{ ansible_operator_meta.namespace }}' + spec: + authorization: + type: bearer + credentials: + name: '{{ auth_secret.resources[0].metadata.annotations['openshift.io/token-secret.name'] }}' + key: token + metricRelabelings: + - action: labeldrop + regex: pod + - action: labeldrop + regex: namespace + - action: labeldrop + regex: instance + - action: replace + regex: '.*/(.*)$' + replacement: $1 + sourceLabels: [job] + targetLabel: service + - action: labeldrop + regex: job + - action: labeldrop + regex: publisher + - 
action: replace + targetLabel: container + replacement: sg-core + - action: replace + targetLabel: endpoint + replacement: prom-https + scheme: HTTPS + scrapeInterval: {{ servicetelemetry_vars.backends.metrics.prometheus.scrape_interval }} + staticConfigs: + - targets: + - '{{ this_smartgateway }}.{{ ansible_operator_meta.namespace }}.svc:8083' + tlsConfig: + ca: + configMap: + name: serving-certs-ca-bundle + key: service-ca.crt + serverName: '{{ this_smartgateway }}.{{ ansible_operator_meta.namespace }}.svc' + +- name: Create ScrapeConfig to scrape Smart Gateway + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + '{{ sg_specific_scrapeconfig_manifest }}' + +- name: Create additional ScrapeConfig if provided + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + '{{ scrapeconfig_manifest }}' + when: scrapeconfig_manifest is defined + +- name: Create additional ServiceMonitor if provided (legacy) + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + '{{ servicemonitor_manifest }}' + when: servicemonitor_manifest is defined + +- name: Remove (legacy) default ServiceMonitors + k8s: + state: absent + api_version: '{{ prometheus_operator_api_string }}' + kind: ServiceMonitor + namespace: '{{ ansible_operator_meta.namespace }}' + name: '{{ this_smartgateway }}' \ No newline at end of file diff --git a/roles/servicetelemetry/tasks/component_servicemonitor.yml b/roles/servicetelemetry/tasks/component_servicemonitor.yml deleted file mode 100644 index 753116c46..000000000 --- a/roles/servicetelemetry/tasks/component_servicemonitor.yml +++ /dev/null @@ -1,52 +0,0 @@ -- name: Create SG-specific Service Monitor manifest - set_fact: - sg_specific_servicemonitor_manifest: | - apiVersion: {{ prometheus_operator_api_string }} - kind: ServiceMonitor - metadata: - labels: - 
app: smart-gateway - name: '{{ this_smartgateway }}' - namespace: '{{ ansible_operator_meta.namespace }}' - spec: - endpoints: - - interval: {{ servicetelemetry_vars.backends.metrics.prometheus.scrape_interval }} - metricRelabelings: - - action: labeldrop - regex: pod - sourcelabels: [] - - action: labeldrop - regex: namespace - sourcelabels: [] - - action: labeldrop - regex: instance - sourcelabels: [] - - action: labeldrop - regex: job - sourcelabels: [] - - action: labeldrop - regex: publisher - sourcelabels: [] - port: prom-https - scheme: https - tlsConfig: - caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt - serverName: "{{ this_smartgateway }}.{{ ansible_operator_meta.namespace }}.svc" - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - selector: - matchLabels: - app: smart-gateway - smart-gateway: "{{ this_smartgateway }}" - -- name: Create ServiceMonitor to scrape Smart Gateway - k8s: - state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' - definition: - '{{ sg_specific_servicemonitor_manifest }}' - -- name: Create additional serviceMonitor if provided - k8s: - state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' - definition: - '{{ servicemonitor_manifest }}' - when: servicemonitor_manifest is defined diff --git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 4e2287fe9..70a6d68a3 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -12,7 +12,7 @@ spec: {% endif %} replicas: {{ servicetelemetry_vars.alerting.alertmanager.deployment_size }} serviceAccountName: alertmanager-stf - serviceMonitorSelector: + scrapeConfigSelector: matchLabels: app: smart-gateway listenLocal: true diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 
b/roles/servicetelemetry/templates/manifest_prometheus.j2 index d9610b056..cd3680883 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -17,7 +17,7 @@ spec: ruleSelector: {} securityContext: {} serviceAccountName: prometheus-stf - serviceMonitorSelector: + scrapeConfigSelector: matchLabels: app: smart-gateway listenLocal: true diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index b4a8db29f..c8bfd62c3 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -129,8 +129,8 @@ echo "*** [INFO] Showing oc get all..." oc get all echo -echo "*** [INFO] Showing servicemonitors..." -oc get servicemonitors.monitoring.rhobs -o yaml +echo "*** [INFO] Showing scrapeconfigs..." +oc get scrapeconfigs.monitoring.rhobs -o yaml echo if [ "$SMOKETEST_VERBOSE" = "true" ]; then From e82ab70a02db01982aaa516abaf7c218d9f383c0 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Thu, 11 Jul 2024 14:47:56 +0200 Subject: [PATCH 62/70] Create index image from pre-built bundles (#597) * Create index image from pre-built bundles Deploy from bundles can now be used along with deploy from index enabled. This way, an index image can be created with specific pre built bundles and deploy STF with those. 
* Fix set_fact syntax for STO/SGO bundle info dicts * Add ImageStreams for STO and SGO * Fix minor typo in STO/SGO bundle info dict * Set correct STO and SGO bundles path We need to use the internal registry address to build the index image that will serve as base for the catalog * Drop build number from SGO and STO bundles We only need the tag when creating the index image Keeping this value will generate errors * Get STO and SGO tags from the bundle path * Fix typo on SGO and STO images path * Update README with bundle + index instructions Add more details on how to deploy from index with pre-built bundles * Use stf_channel on the deploy_stf step Define this value to stable-1.5 when deploying from bundles and index, unstable (the default) if bundles have been built locally. Also, honor the sgo and sto bundles tags. Ideally, we would update the sgo and sto image path to be only the path (without the tag) and define the tag separately. In this case, it is the responsibility of the user to make sure that the tag in the image path and the tag provided in sto and sgo image path match when deploying from externally built bundles and index. * Update missing reference to SGO bundles channel * Update some logic paths to simplify the execution * Honor the stf_channel variable Nightly bundles use the "unstable" channel while RH catalog bundles use "stable-1.5" channel We should use nightly bundles "unstable" by default and pass "stable-1.5" when deploying from RH catalog * Get opm for getting bundles information We need opm to get the bundle version when deploying from index with pre-built bundles Get opm following the same logic we use for operator-sdk * Get the operator bundle version for STO and SGO Inspect the provided bundles with opm render and obtain the operator bundle version for each of them.
* Set a default version for opm Let's stick to latest-4.14 for now, with the option of updating to a different one if needed * Fix typo when executing opm render * Revert "Fix typo when executing opm render" This reverts commit 1050227747d741b2d9f1c11885bd2eb91fb5ee73. * Revert "Set a default version for opm" This reverts commit d6a51df8b2b5883e19a29c00abd1b0d6286c53af. * Revert "Get the operator bundle version for STO and SGO" This reverts commit c1d9db36574f3ddc2fa46a5cd1a7eff379d9b24f. * Revert "Get opm for getting bundles information" This reverts commit 642df50f8171b5709638c407cbc552df49068f27. * Get STO and SGO operator bundle version using oc oc image info command provides operator bundle version information. Use this to get the proper version numbers when building the index image * Get STO and SGO bundle versions from info Parse the oc image info output to get the STO and SGO bundle versions * Add extra logic to avoid variables overwrite sto_bundle_info and sgo_bundle_info is overriding even though the step is skipped Add an extra logic check to avoid variables override --- build/stf-run-ci/README.md | 30 ++++++++++-- build/stf-run-ci/tasks/create_catalog.yml | 56 ++++++++++++++++++++++- build/stf-run-ci/tasks/main.yml | 29 ++---------- 3 files changed, 86 insertions(+), 29 deletions(-) diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index b9b1afbc0..ecc4fd38e 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -20,7 +20,7 @@ choose to override: | `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | | `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. 
Also see `working_branch`, `sg_branch`, `sgo_branch` | | `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | -| `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles and index image. | +| `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles/OLM bundles and index image. | | `__disconnected_deploy` | {true,false} | false | Whether to deploy on a disconnected cluster | | `__service_telemetry_bundle_image_path` | | `quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head` | Image path to Service Telemetry Operator bundle | | `__smart_gateway_bundle_image_path` | | `quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head` | Image path to Smart Gateway Operator bundle | @@ -83,9 +83,10 @@ choose to override: You can deploy Service Telemetry Framework using this role in a few configuration methods: -* local build artifacts from Git repository cloned locally -* local build artifacts, local bundle artifacts, and Subscription via OLM using locally built index image -* standard deployment using Subscription and OLM +* local build artifacts from Git repository cloned locally (local build) +* local build artifacts, local bundle artifacts, and Subscription via OLM using locally built index image (local build + deploy from index) +* externally built bundle artifacts and Subscription via OLM using locally built index image (deploy from bundles + deploy from index) +* standard deployment using Subscription and OLM (deploy from bundles) * supporting components but no instance of Service Telemetry Operator ## Basic deployment @@ -134,6 +135,27 @@ You can perform a deployment using OLM and a Subscription from locally built art ansible-playbook -e __local_build_enabled=true -e __deploy_from_index_enabled=true run-ci.yaml ``` +## Deployment with pre-build bundles 
and index + +Instead of relying on the operator-sdk to deploy from selected bundles using the "operator-sdk run bundle" utility, +you can perform a deployment using OLM and a Subscription to a locally created index image like this: + +```sh +ansible-playbook -e __local_build_enabled=false -e __deploy_from_bundles_enabled=true \ + -e __deploy_from_index_enabled=true \ + -e __service_telemetry_bundle_image_path=<registry>/<namespace>/stf-service-telemetry-operator-bundle:<tag> \ + -e __smart_gateway_bundle_image_path=<registry>/<namespace>/stf-smart-gateway-operator-bundle:<tag> \ + -e pull_secret_registry=<registry> \ + -e pull_secret_user=<username> \ + -e pull_secret_pass=<password> \ + run-ci.yaml +``` + +Since you will fetch the selected images from a bundle registry, it is required that you have all the required +access credentials for the desired registry correctly configured. Check the "Deployment with pre-build bundles" +docs above to get more information about this. + + # License Apache v2.0 diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index feed3b56f..828f0c905 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -7,11 +7,65 @@ # Updating to use stdout_lines[-1] so that any additional info that gets added to generate_bundles (e.g. 
for debug) doesn't break this task # Adding from_json so that the JSON output is parsed into a dictionary -- name: Create info variables from bundle generation output +- name: Create info variables from bundle generation output (local build) + when: __local_build_enabled | bool and not __deploy_from_bundles_enabled | bool ansible.builtin.set_fact: sto_bundle_info: "{{ generate_bundle_sto.stdout_lines[-1] | from_json }}" sgo_bundle_info: "{{ generate_bundle_sgo.stdout_lines[-1] | from_json }}" +- name: Create info variables from provided pre-built bundles (deploy from bundles) + when: __deploy_from_bundles_enabled | bool and not __local_build_enabled | bool + block: + - name: Get STO operator bundle info + ansible.builtin.command: oc image info {{ __service_telemetry_bundle_image_path }} + register: sto_prebuilt_image_info + + - name: Get SGO operator bundle info + ansible.builtin.command: oc image info {{ __smart_gateway_bundle_image_path }} + register: sgo_prebuilt_image_info + + - name: Get STO and SGO bundle versions + ansible.builtin.set_fact: + sto_prebuilt_bundle_version: "{{ sto_prebuilt_image_info.stdout_lines[-1] | split('=') | last }}" + sgo_prebuilt_bundle_version: "{{ sgo_prebuilt_image_info.stdout_lines[-1] | split('=') | last }}" + + - name: Set info variables from provided pre-built bundles + ansible.builtin.set_fact: + sto_bundle_info: + 'bundle_default_channel': "{{ stf_channel }}" + 'bundle_channels': "{{ stf_channel }}" + 'operator_bundle_version': "{{ sto_prebuilt_bundle_version }}" + sgo_bundle_info: + 'bundle_default_channel': "{{ stf_channel }}" + 'bundle_channels': "{{ stf_channel }}" + 'operator_bundle_version': "{{ sgo_prebuilt_bundle_version }}" + +- name: Show STO and SGO bundle info that will used in the index image + ansible.builtin.debug: + msg: + - "{{ sto_bundle_info }}" + - "{{ sgo_bundle_info }}" + +- name: Create ImageStream for STO and SGO (deploying from bundles) + when: __deploy_from_bundles_enabled | bool + block: + - name: 
Set correct STO and SGO bundle paths when deploying from index with pre-built bundles + ansible.builtin.set_fact: + sto_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-operator-bundle:{{ sto_bundle_image_tag }}" + sgo_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/smart-gateway-operator-bundle:{{ sgo_bundle_image_tag }}" + + - name: Create ImageStream for STO + ansible.builtin.command: + cmd: | + oc import-image -n {{ namespace }} service-telemetry-operator-bundle:{{ sto_bundle_image_tag }} --from={{ __service_telemetry_bundle_image_path }} --confirm --insecure + register: sto_is + + - name: Create ImageStream for SGO + ansible.builtin.command: + cmd: | + oc import-image -n {{ namespace }} smart-gateway-operator-bundle:{{ sgo_bundle_image_tag }} --from={{ __smart_gateway_bundle_image_path }} --confirm --insecure + register: sgo_is + - name: Get the builder-dockercfg Secret name ansible.builtin.command: oc get secret -n {{ namespace }} --field-selector='type==kubernetes.io/dockercfg' -ojsonpath='{.items[?(@.metadata.annotations.kubernetes\.io/service-account\.name=="builder")].metadata.name}' register: secret_builder_dockercfg_name diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 73508fedc..df29982ab 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -28,27 +28,6 @@ - __local_build_enabled | bool - __deploy_from_bundles_enabled | bool -- name: Fail when deploying from index image and local build disabled - ansible.builtin.fail: - msg: __deploy_from_index_enabled must also have __local_build_enabled - when: - - __deploy_from_index_enabled | bool - - not __local_build_enabled | bool - -- name: Fail when deploying from index images and deployment from bundles also requested (mutually exclusive methods) - ansible.builtin.fail: - msg: __deploy_from_index_enabled can not be used with __deploy_from_bundles_enabled - when: - - 
__deploy_from_index_enabled | bool - - __deploy_from_bundles_enabled | bool - -- name: Fail when disconnected deploy and other deployment options also requested - ansible.builtin.fail: - msg: __disconnected_deploy cannot be used if __deploy_from_bundles_enabled, __deploy_from_index_enabled or __local_build_enabled - when: - - __disconnected_deploy | bool - - __deploy_from_bundles_enabled | bool or __deploy_from_index_enabled | bool or __local_build_enabled | bool - - name: Get the list of nodes kubernetes.core.k8s_info: kind: Node @@ -159,7 +138,7 @@ pod-security.kubernetes.io/audit: restricted pod-security.kubernetes.io/warn: restricted -- when: __deploy_from_index_enabled | bool +- when: __deploy_from_index_enabled | bool and __local_build_enabled | bool tags: - create_bundles block: @@ -180,11 +159,13 @@ tags: - build +- when: __deploy_from_index_enabled | bool + block: - name: Create file-based catalog ansible.builtin.include_tasks: create_catalog.yml # -- deploy -- when: not __local_build_enabled | bool +- when: not __local_build_enabled | bool and not __deploy_from_index_enabled | bool block: - name: Setup Service Telemetry Framework from supplied bundle URLs ansible.builtin.include_tasks: setup_stf_from_bundles.yml @@ -206,7 +187,7 @@ name: service-telemetry-operator namespace: "{{ namespace }}" spec: - channel: unstable + channel: "{{ stf_channel }}" installPlanApproval: Automatic name: service-telemetry-operator source: service-telemetry-framework-operators From 9206fa454d7fea741eaa176498f66599d4dff7ee Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Fri, 12 Jul 2024 09:10:57 +0200 Subject: [PATCH 63/70] Add a Zuul job for nightly bundles and index deploy (#608) * Add a Zuul job for nightly bundles and index deploy * Minor typo fix on zuul.yaml The job needs to use nightly_bundles-index_deploy vars --- .zuul.yaml | 17 +++++++++++++++++ ci/vars-nightly_bundles-index_deploy.yml | 5 +++++ 2 files changed, 22 insertions(+) create mode 100644 
ci/vars-nightly_bundles-index_deploy.yml diff --git a/.zuul.yaml b/.zuul.yaml index f81d81ffb..935ed4683 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -124,6 +124,15 @@ vars: scenario: "local_build-index_deploy" +- job: + name: stf-crc-nightly_bundles-index_deploy + parent: stf-base + abstract: true + description: | + Create an index image using nightly bundles and deploy STF from it + vars: + scenario: "nightly_bundles-index_deploy" + - job: name: stf-crc-ocp_412-nightly_bundles parent: stf-crc-nightly_bundles @@ -166,6 +175,13 @@ Build STF locally and deploy from index on OCP 4.14 nodeset: stf-crc_extracted-ocp414 +- job: + name: stf-crc-ocp_414-nightly_bundles-index_deploy + parent: stf-crc-nightly_bundles-index_deploy + description: | + Create an index image using nightly bundles and deploy STF from it on OCP 4.14 + nodeset: stf-crc_extracted-ocp414 + - project-template: name: stf-crc-jobs description: | @@ -176,6 +192,7 @@ - stf-crc-ocp_414-local_build - stf-crc-ocp_412-local_build-index_deploy - stf-crc-ocp_414-local_build-index_deploy + - stf-crc-ocp_414-nightly_bundles-index_deploy - project: name: infrawatch/service-telemetry-operator diff --git a/ci/vars-nightly_bundles-index_deploy.yml b/ci/vars-nightly_bundles-index_deploy.yml new file mode 100644 index 000000000..a50563158 --- /dev/null +++ b/ci/vars-nightly_bundles-index_deploy.yml @@ -0,0 +1,5 @@ +--- +# ansible-playbook -e __local_build_enabled=false -e __deploy_from_index_enabled=true -e __deploy_from_bundles_enabled=true -e __service_telemetry_bundle_image_path=quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head -e __smart_gateway_bundle_image_path=quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head --skip-tags bundle_registry_tls_ca --skip-tags bundle_registry_auth build/run-ci.yaml +__local_build_enabled: false +__deploy_from_bundles_enabled: true +__deploy_from_index_enabled: true From fe6ee95e419600807a0c66578edc6ee8321fe2a8 Mon Sep 17 00:00:00 2001 
From: Daniel Pawlik <3049495+danpawlik@users.noreply.github.com> Date: Wed, 17 Jul 2024 19:19:19 +0200 Subject: [PATCH 64/70] Update nodesets label (#613) The CI jobs do not need to be forced to use a single cloud provider in the CI. This commit sets a label that is defined for multiple cloud providers in Zuul. The available labels can be found at [1]. [1] https://review.rdoproject.org/zuul/labels --- .zuul.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 935ed4683..6f0ee1c7d 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -3,7 +3,7 @@ name: stf-crc_extracted-ocp412 nodes: - name: controller - label: cloud-centos-9-stream-tripleo-vexxhost + label: cloud-centos-9-stream-tripleo - name: crc label: coreos-crc-extracted-2-19-0-xxl @@ -11,7 +11,7 @@ name: stf-crc_extracted-ocp414 nodes: - name: controller - label: cloud-centos-9-stream-tripleo-vexxhost + label: cloud-centos-9-stream-tripleo - name: crc label: coreos-crc-extracted-2-30-0-xxl From 8853c3cc3576711af91cc51c289823e32c6d0129 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Thu, 18 Jul 2024 05:24:14 -0400 Subject: [PATCH 65/70] Fix oauth SARs for interactive login (#612) - Doesn't work unless resource name is plural form - The "group" property is called "resourceAPIGroup" in SARs -"to avoid confusion with the 'groups' field when inlined"[1] [1] https://docs.openshift.com/container-platform/4.14/rest_api/authorization_apis/subjectaccessreview-authorization-openshift-io-v1.html --- roles/servicetelemetry/tasks/component_prometheus_reader.yml | 2 +- roles/servicetelemetry/templates/manifest_alertmanager.j2 | 2 +- roles/servicetelemetry/templates/manifest_grafana.j2 | 2 +- roles/servicetelemetry/templates/manifest_grafana_v5.j2 | 2 +- roles/servicetelemetry/templates/manifest_prometheus.j2 | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/roles/servicetelemetry/tasks/component_prometheus_reader.yml 
b/roles/servicetelemetry/tasks/component_prometheus_reader.yml index 6cbee8b42..c86f021fc 100644 --- a/roles/servicetelemetry/tasks/component_prometheus_reader.yml +++ b/roles/servicetelemetry/tasks/component_prometheus_reader.yml @@ -21,7 +21,7 @@ - apiGroups: - '{{ prometheus_operator_api_string | replace("/v1","") }}' resources: - - prometheus + - prometheuses verbs: - get namespaces: diff --git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 70a6d68a3..c24dcd603 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -29,7 +29,7 @@ spec: - -upstream=http://localhost:9093/ - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=alertmanager-stf - - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "resourceAPIGroup":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' - '-openshift-delegate-urls={"/": {"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' ports: - containerPort: 9095 diff --git a/roles/servicetelemetry/templates/manifest_grafana.j2 b/roles/servicetelemetry/templates/manifest_grafana.j2 index 8b176b103..b7fdd6a4c 100644 --- a/roles/servicetelemetry/templates/manifest_grafana.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana.j2 @@ -42,7 +42,7 @@ spec: - -upstream=http://localhost:3000 - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=grafana-serviceaccount - - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafana", 
"group":"integreatly.org", "verb":"get"}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafanas", "resourceAPIGroup":"integreatly.org", "verb":"get"}' - -openshift-ca=/etc/pki/tls/cert.pem - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt ports: diff --git a/roles/servicetelemetry/templates/manifest_grafana_v5.j2 b/roles/servicetelemetry/templates/manifest_grafana_v5.j2 index 278e452ff..4c775c411 100644 --- a/roles/servicetelemetry/templates/manifest_grafana_v5.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana_v5.j2 @@ -61,7 +61,7 @@ spec: - '-https-address=:3002' - '-http-address=' - '-email-domain=*' - - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafana", "group":"grafana.integreatly.org", "verb":"get"}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafanas", "resourceAPIGroup":"grafana.integreatly.org", "verb":"get"}' - '-upstream=http://localhost:3000' - '-tls-cert=/etc/tls/private/tls.crt' - '-tls-key=/etc/tls/private/tls.key' diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index cd3680883..ad9fff789 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -48,8 +48,8 @@ spec: - -upstream=http://localhost:9090/ - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=prometheus-stf - - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheus", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' - - '-openshift-delegate-urls={"/":{"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheus", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace 
}}","resource": "prometheuses", "resourceAPIGroup":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' + - '-openshift-delegate-urls={"/":{"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheuses", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' ports: - containerPort: 9092 From f815e2c336f43009b7f02417cee8016711fdfb25 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Thu, 18 Jul 2024 16:12:11 +0100 Subject: [PATCH 66/70] Revert "Update nodesets label (#613)" (#621) This reverts commit fe6ee95e419600807a0c66578edc6ee8321fe2a8. --- .zuul.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 6f0ee1c7d..935ed4683 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -3,7 +3,7 @@ name: stf-crc_extracted-ocp412 nodes: - name: controller - label: cloud-centos-9-stream-tripleo + label: cloud-centos-9-stream-tripleo-vexxhost - name: crc label: coreos-crc-extracted-2-19-0-xxl @@ -11,7 +11,7 @@ name: stf-crc_extracted-ocp414 nodes: - name: controller - label: cloud-centos-9-stream-tripleo + label: cloud-centos-9-stream-tripleo-vexxhost - name: crc label: coreos-crc-extracted-2-30-0-xxl From d1bf0425e1952e010e7914e7fdcb00c2bd15b834 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Thu, 25 Jul 2024 04:24:21 -0400 Subject: [PATCH 67/70] Create our own token for prometheus-stf SA (#623) * In OCP 4.16, these are no long created by default See https://docs.openshift.com/container-platform/4.16/release_notes/ocp-4-16-release-notes.html#ocp-4-16-deprecated-features_release-notes (Legacy service account API token secrets are no longer generated for each service account) --- .../tasks/component_scrapeconfig.yml | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/roles/servicetelemetry/tasks/component_scrapeconfig.yml b/roles/servicetelemetry/tasks/component_scrapeconfig.yml index b05a5ac03..274472262 100644 --- 
a/roles/servicetelemetry/tasks/component_scrapeconfig.yml +++ b/roles/servicetelemetry/tasks/component_scrapeconfig.yml @@ -1,18 +1,15 @@ -- name: Look up prometheus-stf SA to get auth secret name - k8s_info: - api_version: v1 - kind: ServiceAccount - namespace: '{{ ansible_operator_meta.namespace }}' - name: prometheus-stf - register: service_account - -- name: Look up auth secret to get token secret name - k8s_info: - api_version: v1 - kind: Secret - namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ service_account.resources[0].secrets[0].name }}' - register: auth_secret +- name: Create an access token for prometheus-stf to use in scrapeconfigs + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + apiVersion: v1 + kind: Secret + metadata: + name: prometheus-stf-token + namespace: '{{ ansible_operator_meta.namespace }}' + annotations: + kubernetes.io/service-account.name: prometheus-stf + type: kubernetes.io/service-account-token - name: Create SG-specific Scrape Config manifest set_fact: @@ -28,7 +25,7 @@ authorization: type: bearer credentials: - name: '{{ auth_secret.resources[0].metadata.annotations['openshift.io/token-secret.name'] }}' + name: prometheus-stf-token key: token metricRelabelings: - action: labeldrop From 417131b934eefb44d80828edfd23187b42effff6 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 30 Jul 2024 07:16:39 +0100 Subject: [PATCH 68/70] [zuul] Add 4.16 jobs (#609) * [zuul] Add 4.16 jobs * Add local_build variation * [zuul] Update OCP 4.16 nodeset to use biggest flavor This is to see if sg-core just needs more resource to build. Eventually, the size will be reduced again to not-the-largest-available. 
* Update the nodeset label for CRC 2.39 * Update supported openshift versions to 4.16 * fix typo * [stf-collect-logs] Add oc get deployment to logs * [zuul] Update 4.16 node to use any cloud provider * [zuul] Only run the nightly bundles job All the jobs are currently failing for the same reason, disable all but the fastest one to save resources * fix typo on stf-collect-logs * Re-add vexxhost * [tmp] Check whether failures are due to the new OCP version * Update .zuul.yaml Switching from nightly bundles to local builds in order to test 4.16 fixes * test all 4.16 jobs * Re-enable all 4.16 jobs and remove 4.12 * Remove commented lines * Un-update stf-collectd-logs and update ci/README --------- Co-authored-by: Chris Sibbitt --- .zuul.yaml | 52 +++++++++++-------- ci/README.md | 3 +- .../service-telemetry-operator/Dockerfile.in | 2 +- .../metadata/properties.yaml | 2 +- 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 935ed4683..10a508aca 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -1,19 +1,19 @@ --- - nodeset: - name: stf-crc_extracted-ocp412 + name: stf-crc_extracted-ocp414 nodes: - name: controller label: cloud-centos-9-stream-tripleo-vexxhost - name: crc - label: coreos-crc-extracted-2-19-0-xxl + label: coreos-crc-extracted-2-30-0-xxl - nodeset: - name: stf-crc_extracted-ocp414 + name: stf-crc_extracted-ocp416 nodes: - name: controller label: cloud-centos-9-stream-tripleo-vexxhost - name: crc - label: coreos-crc-extracted-2-30-0-xxl + label: coreos-crc-extracted-2-39-0-3xl - job: name: stf-base-2node @@ -133,13 +133,6 @@ vars: scenario: "nightly_bundles-index_deploy" -- job: - name: stf-crc-ocp_412-nightly_bundles - parent: stf-crc-nightly_bundles - description: | - Deploy STF using the nightly bundles on OCP 4.12 - nodeset: stf-crc_extracted-ocp412 - - job: name: stf-crc-ocp_414-nightly_bundles parent: stf-crc-nightly_bundles @@ -148,11 +141,11 @@ nodeset: stf-crc_extracted-ocp414 - job: - name: 
stf-crc-ocp_412-local_build - parent: stf-crc-local_build + name: stf-crc-ocp_416-nightly_bundles + parent: stf-crc-nightly_bundles description: | - Build images locally and deploy STF on OCP 4.12 - nodeset: stf-crc_extracted-ocp412 + Deploy STF using the nightly bundles on OCP 4.16 + nodeset: stf-crc_extracted-ocp416 - job: name: stf-crc-ocp_414-local_build @@ -162,11 +155,11 @@ nodeset: stf-crc_extracted-ocp414 - job: - name: stf-crc-ocp_412-local_build-index_deploy - parent: stf-crc-local_build-index_deploy + name: stf-crc-ocp_416-local_build + parent: stf-crc-local_build description: | - Build STF locally and deploy from index on OCP 4.12 - nodeset: stf-crc_extracted-ocp412 + Build STF locally and deploy from index on OCP 4.16 + nodeset: stf-crc_extracted-ocp416 - job: name: stf-crc-ocp_414-local_build-index_deploy @@ -175,6 +168,13 @@ Build STF locally and deploy from index on OCP 4.14 nodeset: stf-crc_extracted-ocp414 +- job: + name: stf-crc-ocp_416-local_build-index_deploy + parent: stf-crc-local_build-index_deploy + description: | + Build STF locally and deploy from index on OCP 4.16 + nodeset: stf-crc_extracted-ocp416 + - job: name: stf-crc-ocp_414-nightly_bundles-index_deploy parent: stf-crc-nightly_bundles-index_deploy @@ -182,17 +182,25 @@ Create an index image using nightly bundles and deploy STF from it on OCP 4.14 nodeset: stf-crc_extracted-ocp414 +- job: + name: stf-crc-ocp_416-nightly_bundles-index_deploy + parent: stf-crc-nightly_bundles-index_deploy + description: | + Create an index image using nightly bundles and deploy STF from it on OCP 4.16 + nodeset: stf-crc_extracted-ocp416 + - project-template: name: stf-crc-jobs description: | STF CRC jobs that build and deploy STF github-check: jobs: - - stf-crc-ocp_412-local_build - stf-crc-ocp_414-local_build - - stf-crc-ocp_412-local_build-index_deploy + - stf-crc-ocp_416-local_build - stf-crc-ocp_414-local_build-index_deploy + - stf-crc-ocp_416-local_build-index_deploy - 
stf-crc-ocp_414-nightly_bundles-index_deploy + - stf-crc-ocp_416-nightly_bundles-index_deploy - project: name: infrawatch/service-telemetry-operator @@ -200,5 +208,5 @@ - stf-crc-jobs periodic: jobs: - - stf-crc-ocp_412-nightly_bundles - stf-crc-ocp_414-nightly_bundles + - stf-crc-ocp_416-nightly_bundles diff --git a/ci/README.md b/ci/README.md index 93e9e60f2..2c559f013 100644 --- a/ci/README.md +++ b/ci/README.md @@ -14,9 +14,8 @@ Two scenarios run: - `local_build-index_deploy`, which builds the images and does an index-based deployment Each of these scenarios run across the following OCP versions: -- 4.12 -- 4.13 - 4.14 +- 4.16 ### Periodic jobs diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index 871edc3c1..9fd49e86b 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="v4.12-v4.14" +LABEL com.redhat.openshift.versions="v4.12-v4.16" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ diff --git a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml index 5ffce5254..7dd691a01 100644 --- a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml @@ -1,6 +1,6 @@ properties: - type: olm.maxOpenShiftVersion - value: "4.14" + value: "4.16" - type: olm.constraint value: failureMessage: Require Smart Gateway for Service Telemetry Framework From 
e1c312419a285494391a85adef1c5b984f2948e6 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Fri, 16 Aug 2024 15:30:03 +0200 Subject: [PATCH 69/70] Update smoketest images (#625) Update repositories for openstack-collectd and openstack-ceilometer-notification --- tests/smoketest/smoketest_job.yaml.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/smoketest/smoketest_job.yaml.template b/tests/smoketest/smoketest_job.yaml.template index 12626f3b2..d1a4846fa 100644 --- a/tests/smoketest/smoketest_job.yaml.template +++ b/tests/smoketest/smoketest_job.yaml.template @@ -14,7 +14,7 @@ spec: restartPolicy: Never containers: - name: smoketest-collectd - image: quay.io/tripleomaster/openstack-collectd:current-tripleo + image: quay.io/tripleomastercentos9/openstack-collectd:current-tripleo command: - /smoketest_collectd_entrypoint.sh env: @@ -43,7 +43,7 @@ spec: allowPrivilegeEscalation: false - name: smoketest-ceilometer - image: quay.io/tripleomaster/openstack-ceilometer-notification:current-tripleo + image: quay.io/tripleomastercentos9/openstack-ceilometer-notification:current-tripleo command: - /smoketest_ceilometer_entrypoint.sh env: From b046d5635d70d4100f98ea47edc8c305a477cbad Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Thu, 22 Aug 2024 19:52:09 +0200 Subject: [PATCH 70/70] Use bundle version instead of latest as tag (#626) * Use bundle version instead of latest as tag And use external registry instead of internal when creating the index image with bundles (no need to use the internal one) * Revert use external registry instead of internal We need to update the tags in the internal registry, since the default points to latest --- build/stf-run-ci/tasks/create_catalog.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index 828f0c905..6c2b5b6cc 100644 --- 
a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -49,23 +49,23 @@ - name: Create ImageStream for STO and SGO (deploying from bundles) when: __deploy_from_bundles_enabled | bool block: - - name: Set correct STO and SGO bundle paths when deploying from index with pre-built bundles - ansible.builtin.set_fact: - sto_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-operator-bundle:{{ sto_bundle_image_tag }}" - sgo_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/smart-gateway-operator-bundle:{{ sgo_bundle_image_tag }}" - - name: Create ImageStream for STO ansible.builtin.command: cmd: | - oc import-image -n {{ namespace }} service-telemetry-operator-bundle:{{ sto_bundle_image_tag }} --from={{ __service_telemetry_bundle_image_path }} --confirm --insecure + oc import-image -n {{ namespace }} service-telemetry-operator-bundle:{{ sto_bundle_info.operator_bundle_version }} --from={{ __service_telemetry_bundle_image_path }} --confirm --insecure register: sto_is - name: Create ImageStream for SGO ansible.builtin.command: cmd: | - oc import-image -n {{ namespace }} smart-gateway-operator-bundle:{{ sgo_bundle_image_tag }} --from={{ __smart_gateway_bundle_image_path }} --confirm --insecure + oc import-image -n {{ namespace }} smart-gateway-operator-bundle:{{ sgo_bundle_info.operator_bundle_version }} --from={{ __smart_gateway_bundle_image_path }} --confirm --insecure register: sgo_is + - name: Set correct STO and SGO bundle paths when deploying from index with pre-built bundles + ansible.builtin.set_fact: + sto_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-operator-bundle:{{ sto_bundle_info.operator_bundle_version }}" + sgo_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/smart-gateway-operator-bundle:{{ sgo_bundle_info.operator_bundle_version }}" + - name: Get the builder-dockercfg Secret name ansible.builtin.command: oc get 
secret -n {{ namespace }} --field-selector='type==kubernetes.io/dockercfg' -ojsonpath='{.items[?(@.metadata.annotations.kubernetes\.io/service-account\.name=="builder")].metadata.name}' register: secret_builder_dockercfg_name