From 1b9c8c8a6d4cf8791387e1a7846f661957595b11 Mon Sep 17 00:00:00 2001 From: khalillilahk <147730849+khalillilahk@users.noreply.github.com> Date: Mon, 30 Sep 2024 22:55:33 +0200 Subject: [PATCH] Add ability to mute all errors (mainly due to access rights) coming from process scraper of the hostmetricsreceiver (#34981) **Description:** We are currently encountering an issue with the `process` scraper in the `hostmetricsreceiver`, primarily due to access rights restrictions for certain processes, such as system processes. This is resulting in a large number of verbose error logs. Most of them are coming from the `process.open_file_descriptors` metric but we have errors coming from other metrics as well. In order to solve this issue, we added a flag `mute_process_all_errors` that mutes errors coming from the process scraper metrics, as these errors are predominantly associated with processes that we should not be monitoring anyway. **Link to tracking Issue:** https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/20435 **Testing:** Added unit tests **Documentation:** **Errors**: - Permission denied errors: ``` go.opentelemetry.io/collector/receiver@v0.90.1/scraperhelper/scrapercontroller.go:176 2024-09-02T17:24:10.341+0200 error scraping metrics {"kind": "receiver", "name": "hostmetrics/linux/localhost", "data_type": "metrics", "error": "error reading open file descriptor count for process \"systemd\" (pid 1): open /proc/1/fd: permission denied; ``` - File not found errors: ``` go.opentelemetry.io/collector/receiver@v0.90.1/scraperhelper/scrapercontroller.go:176 2024-09-02T17:25:38.688+0200 error scraperhelper/scrapercontroller.go:200 Error scraping metrics {"kind": "receiver", "name": "hostmetrics/process", "data_type": "metrics", "error": "error reading cpu times for process \"java\" (pid 466650): open /proc/466650/stat: no such file or directory; error reading memory info for process \"java\" (pid 466650): open /proc/466650/statm: no 
such file or directory; error reading thread info for process \"java\" (pid 466650): open /proc/466650/status: no such file or directory; error reading cpu times for process \"java\" (pid 474774): open /proc/474774/stat: no such file or directory; error reading memory info for process \"java\" (pid 474774): open /proc/474774/statm: no such file or directory; error reading thread info for process \"java\" (pid 474774): open /proc/474774/status: no such file or directory; error reading cpu times for process \"java\" (pid 481780): open /proc/481780/stat: no such file or directory; error reading memory info for process \"java\" (pid 481780): open /proc/481780/statm: no such file or directory; error reading thread info for process \"java\" (pid 481780): open /proc/481780/status: no such file or directory", "scraper": "process"} ``` **Config**: ``` receiver hostmetrics/process: collection_interval: ${PROCESSES_COLLECTION_INTERVAL}s scrapers: process: mute_process_name_error: true mute_process_exe_error: true mute_process_io_error: true mute_process_user_error: true resource_attributes: # disable non_used default attributes process.command: enabled: false process.command_line: enabled: false process.executable.path: enabled: false process.owner: enabled: false process.parent_pid: enabled: false metrics: # disable non-used default metrics process.cpu.time: enabled: false process.memory.virtual: enabled: false # enable used optional metrics process.cpu.utilization: enabled: true process.open_file_descriptors: enabled: true process.threads: enabled: true ``` --- .../hostmetricsreceiver-mute-all-errors.yaml | 27 ++++++++++++++++++ receiver/hostmetricsreceiver/README.md | 13 +++++---- .../internal/scraper/processscraper/config.go | 15 ++++++++-- .../scraper/processscraper/process_scraper.go | 4 +++ .../processscraper/process_scraper_test.go | 28 ++++++++++++++++++- 5 files changed, 77 insertions(+), 10 deletions(-) create mode 100644 
.chloggen/hostmetricsreceiver-mute-all-errors.yaml diff --git a/.chloggen/hostmetricsreceiver-mute-all-errors.yaml b/.chloggen/hostmetricsreceiver-mute-all-errors.yaml new file mode 100644 index 000000000000..585928d258af --- /dev/null +++ b/.chloggen/hostmetricsreceiver-mute-all-errors.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: hostmetricsreceiver + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add ability to mute all errors (mainly due to access rights) coming from process scraper of the hostmetricsreceiver + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [20435] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [] diff --git a/receiver/hostmetricsreceiver/README.md b/receiver/hostmetricsreceiver/README.md index 1098fbe6f4ca..d4ab9b7e6a94 100644 --- a/receiver/hostmetricsreceiver/README.md +++ b/receiver/hostmetricsreceiver/README.md @@ -114,6 +114,7 @@ process: : names: [ , ... 
] match_type: + mute_process_all_errors: mute_process_name_error: mute_process_exe_error: mute_process_io_error: @@ -123,12 +124,12 @@ process: ``` The following settings are optional: - -- `mute_process_name_error` (default: false): mute the error encountered when trying to read a process name the collector does not have permission to read -- `mute_process_io_error` (default: false): mute the error encountered when trying to read IO metrics of a process the collector does not have permission to read -- `mute_process_cgroup_error` (default: false): mute the error encountered when trying to read the cgroup of a process the collector does not have permission to read -- `mute_process_exe_error` (default: false): mute the error encountered when trying to read the executable path of a process the collector does not have permission to read (Linux only) -- `mute_process_user_error` (default: false): mute the error encountered when trying to read a uid which doesn't exist on the system, eg. is owned by a user that only exists in a container. +- `mute_process_all_errors` (default: false): mute all the errors encountered when trying to read metrics of a process. When this flag is enabled, there is no need to activate any other error suppression flags. +- `mute_process_name_error` (default: false): mute the error encountered when trying to read a process name the collector does not have permission to read. This flag is ignored when `mute_process_all_errors` is set to true as all errors are muted. +- `mute_process_io_error` (default: false): mute the error encountered when trying to read IO metrics of a process the collector does not have permission to read. This flag is ignored when `mute_process_all_errors` is set to true as all errors are muted. +- `mute_process_cgroup_error` (default: false): mute the error encountered when trying to read the cgroup of a process the collector does not have permission to read. 
This flag is ignored when `mute_process_all_errors` is set to true as all errors are muted. +- `mute_process_exe_error` (default: false): mute the error encountered when trying to read the executable path of a process the collector does not have permission to read (Linux only). This flag is ignored when `mute_process_all_errors` is set to true as all errors are muted. +- `mute_process_user_error` (default: false): mute the error encountered when trying to read a uid which doesn't exist on the system, eg. is owned by a user that only exists in a container. This flag is ignored when `mute_process_all_errors` is set to true as all errors are muted. ## Advanced Configuration diff --git a/receiver/hostmetricsreceiver/internal/scraper/processscraper/config.go b/receiver/hostmetricsreceiver/internal/scraper/processscraper/config.go index 76003bf72ce0..60c3ea0b8535 100644 --- a/receiver/hostmetricsreceiver/internal/scraper/processscraper/config.go +++ b/receiver/hostmetricsreceiver/internal/scraper/processscraper/config.go @@ -22,29 +22,38 @@ type Config struct { Include MatchConfig `mapstructure:"include"` Exclude MatchConfig `mapstructure:"exclude"` + // MuteProcessAllErrors is a flag that will mute all the errors encountered when trying to read metrics of a process. + // When this flag is enabled, there is no need to activate any other error suppression flags. + MuteProcessAllErrors bool `mapstructure:"mute_process_all_errors,omitempty"` + // MuteProcessNameError is a flag that will mute the error encountered when trying to read a process name the // collector does not have permission to read. // See https://github.com/open-telemetry/opentelemetry-collector/issues/3004 for more information. + // This flag is ignored when MuteProcessAllErrors is set to true as all errors are muted. 
MuteProcessNameError bool `mapstructure:"mute_process_name_error,omitempty"` // MuteProcessIOError is a flag that will mute the error encountered when trying to read IO metrics of a process // the collector does not have permission to read. + // This flag is ignored when MuteProcessAllErrors is set to true as all errors are muted. MuteProcessIOError bool `mapstructure:"mute_process_io_error,omitempty"` // MuteProcessCgroupError is a flag that will mute the error encountered when trying to read the cgroup of a process // the collector does not have permission to read. + // This flag is ignored when MuteProcessAllErrors is set to true as all errors are muted. MuteProcessCgroupError bool `mapstructure:"mute_process_cgroup_error,omitempty"` // MuteProcessExeError is a flag that will mute the error encountered when trying to read the executable path of a process - // the collector does not have permission to read (Linux) + // the collector does not have permission to read (Linux). + // This flag is ignored when MuteProcessAllErrors is set to true as all errors are muted. MuteProcessExeError bool `mapstructure:"mute_process_exe_error,omitempty"` // MuteProcessUserError is a flag that will mute the error encountered when trying to read uid which - // doesn't exist on the system, eg. is owned by user existing in container only + // doesn't exist on the system, eg. is owned by user existing in container only. + // This flag is ignored when MuteProcessAllErrors is set to true as all errors are muted. MuteProcessUserError bool `mapstructure:"mute_process_user_error,omitempty"` // ScrapeProcessDelay is used to indicate the minimum amount of time a process must be running - // before metrics are scraped for it. The default value is 0 seconds (0s) + // before metrics are scraped for it. The default value is 0 seconds (0s). 
ScrapeProcessDelay time.Duration `mapstructure:"scrape_process_delay"` } diff --git a/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper.go b/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper.go index 3b9a4bbee701..0a918a112ba1 100644 --- a/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper.go +++ b/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper.go @@ -187,6 +187,10 @@ func (s *scraper) scrape(ctx context.Context) (pmetric.Metrics, error) { } } + if s.config.MuteProcessAllErrors { + return s.mb.Emit(), nil + } + return s.mb.Emit(), errs.Combine() } diff --git a/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper_test.go b/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper_test.go index b1a232e5d7cd..4d983b8a664e 100644 --- a/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper_test.go +++ b/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper_test.go @@ -1003,6 +1003,7 @@ func TestScrapeMetrics_MuteErrorFlags(t *testing.T) { muteProcessExeError bool muteProcessIOError bool muteProcessUserError bool + muteProcessAllErrors bool skipProcessNameError bool omitConfigField bool expectedError string @@ -1093,6 +1094,30 @@ func TestScrapeMetrics_MuteErrorFlags(t *testing.T) { return 4 }(), }, + { + name: "All Process Errors Muted", + muteProcessNameError: false, + muteProcessExeError: false, + muteProcessIOError: false, + muteProcessUserError: false, + muteProcessAllErrors: true, + expectedCount: 0, + }, + { + name: "Process User Error Enabled and All Process Errors Muted", + muteProcessUserError: false, + skipProcessNameError: true, + muteProcessExeError: true, + muteProcessNameError: true, + muteProcessAllErrors: true, + expectedCount: func() int { + if runtime.GOOS == "darwin" { + // disk.io is not collected on darwin + return 3 + } + return 4 + }(), + }, } for _, 
test := range testCases { @@ -1106,6 +1131,7 @@ func TestScrapeMetrics_MuteErrorFlags(t *testing.T) { config.MuteProcessExeError = test.muteProcessExeError config.MuteProcessIOError = test.muteProcessIOError config.MuteProcessUserError = test.muteProcessUserError + config.MuteProcessAllErrors = test.muteProcessAllErrors } scraper, err := newProcessScraper(receivertest.NewNopSettings(), config) require.NoError(t, err, "Failed to create process scraper: %v", err) @@ -1135,7 +1161,7 @@ func TestScrapeMetrics_MuteErrorFlags(t *testing.T) { assert.Equal(t, test.expectedCount, md.MetricCount()) - if config.MuteProcessNameError && config.MuteProcessExeError && config.MuteProcessUserError { + if (config.MuteProcessNameError && config.MuteProcessExeError && config.MuteProcessUserError) || config.MuteProcessAllErrors { assert.NoError(t, err) } else { assert.EqualError(t, err, test.expectedError)