From f6610dfcfc87ae9731ee70de317c0a5d8cbdf8c8 Mon Sep 17 00:00:00 2001 From: Paul Maidment Date: Thu, 27 Feb 2025 14:42:58 +0200 Subject: [PATCH] MGMT-19840: Gather operational metrics from installercache The intent of this PR is to trace the following statistics, implemented as counts and incremented from applicable parts of the solution. counterDescriptionInstallerCachePrunedHardlink "Counts the number of times the installercache pruned a hardlink for being too old" counterDescriptionInstallerCacheGetReleaseOK "Counts the number of times that a release was fetched succesfully" counterDescriptionInstallerCacheGetReleaseTimeout "Counts the number of times that a release timed out or had the context cancelled" counterDescriptionInstallerCacheGetReleaseError "Counts the number of times that a release fetch resulted in error" counterDescriptionInstallerCacheReleaseCached "Counts the number of times that a release was found in the cache" counterDescriptionInstallerCacheReleaseExtracted "Counts the number of times that a release was extracted" counterDescriptionInstallerCacheTryEviction "Counts the number of times that the eviction function was called" counterDescriptionInstallerCacheReleaseEvicted "Counts the number of times that a release was evicted" This, combined with the event based metrics gathered in #7156 should provide enough information to track the behaviour of the cache. --- cmd/main.go | 2 +- internal/ignition/installmanifests_test.go | 20 ++- internal/installercache/installercache.go | 33 +++-- .../installercache/installercache_test.go | 102 +++++++++------ internal/metrics/metricsManager.go | 120 ++++++++++++++---- internal/metrics/mock_metrics_manager_api.go | 84 ++++++++++-- 6 files changed, 269 insertions(+), 92 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index e2a1c6094c..799e0f5507 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -492,7 +492,7 @@ func main() { Options.BMConfig.S3EndpointURL = newUrl Options.InstallerCacheConfig.CacheDir = filepath.Join(Options.GeneratorConfig.GetWorkingDirectory(), "installercache") - installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, diskStatsHelper, log) + installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, metricsManager, diskStatsHelper, log) failOnError(err, "failed to instantiate installercache") generator := generator.New(log, objectHandler, Options.GeneratorConfig, providerRegistry, manifestsApi, eventsHandler, installerCache) diff --git a/internal/ignition/installmanifests_test.go b/internal/ignition/installmanifests_test.go index b84bfd9341..37a8699266 100644 --- a/internal/ignition/installmanifests_test.go +++ b/internal/ignition/installmanifests_test.go @@ -94,6 +94,7 @@ var _ = Describe("Bootstrap Ignition Update", func() { manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler *eventsapi.MockHandler installerCache *installercache.Installers + metricsAPI *metrics.MockAPI ) BeforeEach(func() { @@ -105,12 +106,13 @@ var _ = Describe("Bootstrap Ignition Update", func() { err1 = os.WriteFile(examplePath, []byte(bootstrap1), 0600) Expect(err1).NotTo(HaveOccurred()) ctrl = gomock.NewController(GinkgoT()) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) mockS3Client = s3wrapper.NewMockAPI(ctrl) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) @@ -262,6 +264,7 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6 ctrl *gomock.Controller manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -286,12 +289,13 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6 ctrl = gomock.NewController(GinkgoT()) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) @@ -456,6 +460,7 @@ var _ = Describe("createHostIgnitions", func() { workDir string manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -476,13 +481,14 @@ var _ = Describe("createHostIgnitions", func() { mockS3Client = s3wrapper.NewMockAPI(ctrl) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) cluster = testCluster() installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) @@ -1779,6 +1785,7 @@ var _ = Describe("Bare metal host generation", func() { ctrl *gomock.Controller manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -1789,13 +1796,14 @@ var _ = Describe("Bare metal host generation", func() { ctrl = gomock.NewController(GinkgoT()) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), ReleaseFetchRetryInterval: 1 * time.Microsecond, } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) @@ -1889,6 +1897,7 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() { ctrl *gomock.Controller manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -1920,13 +1929,14 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() { ctrl = gomock.NewController(GinkgoT()) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), ReleaseFetchRetryInterval: 1 * time.Microsecond, } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) diff --git a/internal/installercache/installercache.go b/internal/installercache/installercache.go index 24f9efc5ac..bcd8d95628 100644 --- a/internal/installercache/installercache.go +++ b/internal/installercache/installercache.go @@ -36,6 +36,7 @@ type Installers struct { eventsHandler eventsapi.Handler diskStatsHelper metrics.DiskStatsHelper config Config + metricsAPI metrics.API } type Size int64 @@ -116,7 +117,7 @@ func (rl *Release) Cleanup(ctx context.Context) error { } // New constructs an installer cache with a given storage capacity -func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) { +func New(config Config, eventsHandler eventsapi.Handler, metricsAPI metrics.API, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) { if config.MaxCapacity > 0 && config.MaxReleaseSize == 0 { return nil, fmt.Errorf("config.MaxReleaseSize (%d bytes) must not be zero", config.MaxReleaseSize) } @@ -128,6 +129,7 @@ func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics eventsHandler: eventsHandler, diskStatsHelper: diskStatsHelper, config: config, + metricsAPI: metricsAPI, }, nil } @@ -138,14 +140,25 @@ func (i *Installers) Get(ctx context.Context, releaseID, releaseIDMirror, pullSe for { select { case <-ctx.Done(): - return nil, ctx.Err() + err := ctx.Err() + if err == context.DeadlineExceeded { + i.metricsAPI.InstallerCacheGetReleaseTimeout(releaseID) + } + return nil, err default: - release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID) + majorMinorVersion, err := ocRelease.GetMajorMinorVersion(i.log, releaseID, releaseIDMirror, pullSecret) + if err != nil { + i.log.Warnf("unable to get majorMinorVersion to record metric for %s falling back to full URI", releaseID) + majorMinorVersion = releaseID + } + release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID, majorMinorVersion) if err == nil { + i.metricsAPI.InstallerCacheGetReleaseOK(majorMinorVersion) return release, nil } _, isCapacityError := err.(*errorInsufficientCacheCapacity) if !isCapacityError { + i.metricsAPI.InstallerCacheGetReleaseError(releaseID) return nil, errors.Wrapf(err, "failed to get installer path for release %s", releaseID) } time.Sleep(i.config.ReleaseFetchRetryInterval) @@ -161,14 +174,16 @@ func (i *Installers) getDiskUsageIncludingHardlinks() (uint64, error) { return usedBytes, nil } -func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release) (extractDuration float64, cached bool, err error) { +func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release, majorMinorVersion string) (extractDuration float64, cached bool, err error) { _, err = os.Stat(path) if err == nil { + i.metricsAPI.InstallerCacheGetReleaseCached(majorMinorVersion, true) return 0, true, nil // release was found in the cache } if !os.IsNotExist(err) { return 0, false, err } + i.metricsAPI.InstallerCacheGetReleaseCached(majorMinorVersion, false) usedBytes, err := i.getDiskUsageIncludingHardlinks() if err != nil && !os.IsNotExist(err) { return 0, false, errors.Wrapf(err, "could not determine disk usage information for cache dir %s", i.config.CacheDir) @@ -184,7 +199,7 @@ func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pu return time.Since(extractStartTime).Seconds(), false, nil } -func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocRelease oc.Release, ocpVersion string, clusterID strfmt.UUID) (*Release, error) { +func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocRelease oc.Release, ocpVersion string, clusterID strfmt.UUID, majorMinorVersion string) (*Release, error) { i.Lock() defer i.Unlock() @@ -199,7 +214,7 @@ func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocReleas if err != nil { return nil, err } - release.extractDuration, release.cached, err = i.extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion, ocRelease) + release.extractDuration, release.cached, err = i.extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion, ocRelease, majorMinorVersion) if err != nil { return nil, err } @@ -310,8 +325,10 @@ func (i *Installers) evictFile(filePath string) error { i.log.Infof("evicting binary file %s due to storage pressure", filePath) err := os.Remove(filePath) if err != nil { + i.metricsAPI.InstallerCacheReleaseEvicted(false) return err } + i.metricsAPI.InstallerCacheReleaseEvicted(true) // if the parent directory was left empty, // remove it to avoid dangling directories parentDir := path.Dir(filePath) @@ -334,9 +351,9 @@ func (i *Installers) pruneExpiredHardLinks(links []*fileInfo, gracePeriod time.D grace := graceTime.Unix() if finfo.info.ModTime().Unix() < grace { i.log.Infof("attempting to prune hard link %s", finfo.path) - err := os.Remove(finfo.path) - if err != nil { + if err := os.Remove(finfo.path); err != nil { i.log.WithError(err).Errorf("failed to prune hard link %s", finfo.path) + continue } } } diff --git a/internal/installercache/installercache_test.go b/internal/installercache/installercache_test.go index 9e285e64d4..393d1b9bac 100644 --- a/internal/installercache/installercache_test.go +++ b/internal/installercache/installercache_test.go @@ -71,6 +71,7 @@ var _ = Describe("installer cache", func() { manager *Installers cacheDir string eventsHandler *eventsapi.MockHandler + metricsAPI *metrics.MockAPI ctx context.Context diskStatsHelper metrics.DiskStatsHelper ) @@ -85,17 +86,17 @@ var _ = Describe("installer cache", func() { } BeforeEach(func() { - ctrl = gomock.NewController(GinkgoT()) diskStatsHelper = metrics.NewOSDiskStatsHelper(logrus.New()) mockRelease = oc.NewMockRelease(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) var err error cacheDir, err = os.MkdirTemp("/tmp", "cacheDir") Expect(err).NotTo(HaveOccurred()) Expect(os.Mkdir(filepath.Join(cacheDir, "quay.io"), 0755)).To(Succeed()) Expect(os.Mkdir(filepath.Join(filepath.Join(cacheDir, "quay.io"), "release-dev"), 0755)).To(Succeed()) - manager, err = New(getInstallerCacheConfig(12, 5), eventsHandler, diskStatsHelper, logrus.New()) + manager, err = New(getInstallerCacheConfig(12, 5), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).NotTo(HaveOccurred()) ctx = context.TODO() }) @@ -111,7 +112,7 @@ var _ = Describe("installer cache", func() { ).AnyTimes() } - mockReleaseCalls := func(releaseID string, version string) { + mockReleaseCalls := func(releaseID string, version string, expectedMajorMinorVersion string) { workdir := filepath.Join(manager.config.CacheDir, "quay.io", "release-dev") fname := filepath.Join(workdir, releaseID) @@ -131,15 +132,21 @@ var _ = Describe("installer cache", func() { mockRelease.EXPECT().Extract(gomock.Any(), releaseID, gomock.Any(), manager.config.CacheDir, gomock.Any(), version). DoAndReturn(writeMockedReleaseToDisk).AnyTimes() + + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(expectedMajorMinorVersion).Times(1) } - testGet := func(releaseID, version string, clusterID strfmt.UUID, expectCached bool) (string, string) { + testGet := func(releaseID, version string, clusterID strfmt.UUID, expectCached bool, expectedMajorMinorVersion string) (string, string) { workdir := filepath.Join(manager.config.CacheDir, "quay.io", "release-dev") fname := filepath.Join(workdir, releaseID) if !expectCached { - mockReleaseCalls(releaseID, version) + mockReleaseCalls(releaseID, version, expectedMajorMinorVersion) } expectEventsSent() + mockReleaseCalls(releaseID, version, expectedMajorMinorVersion) + expectEventsSent() + mockRelease.EXPECT().GetMajorMinorVersion(gomock.Any(), releaseID, gomock.Any(), gomock.Any()).Return(expectedMajorMinorVersion, nil).Times(1) + metricsAPI.EXPECT().InstallerCacheGetReleaseCached(expectedMajorMinorVersion, gomock.Any()).AnyTimes() l, err := manager.Get(ctx, releaseID, "mirror", "pull-secret", mockRelease, version, clusterID) Expect(err).ShouldNot(HaveOccurred()) Expect(l.releaseID).To(Equal(releaseID)) @@ -157,9 +164,10 @@ var _ = Describe("installer cache", func() { } type test struct { - releaseID string - version string - clusterID strfmt.UUID + releaseID string + version string + clusterID strfmt.UUID + majorMinorVersion string } getUsedBytesForDirectory := func(directory string) uint64 { @@ -188,7 +196,11 @@ var _ = Describe("installer cache", func() { runTest := func(t test, manager *Installers) (*Release, error) { expectEventsSent() - mockReleaseCalls(t.releaseID, t.version) + mockReleaseCalls(t.releaseID, t.version, t.majorMinorVersion) + mockRelease.EXPECT().GetMajorMinorVersion(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(t.majorMinorVersion, nil).AnyTimes() + metricsAPI.EXPECT().InstallerCacheGetReleaseCached(t.majorMinorVersion, gomock.Any()).AnyTimes() + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(t.majorMinorVersion).AnyTimes() + metricsAPI.EXPECT().InstallerCacheReleaseEvicted(gomock.Any()).AnyTimes() return manager.Get(ctx, t.releaseID, "mirror", "pull-secret", mockRelease, t.version, t.clusterID) } @@ -221,7 +233,7 @@ var _ = Describe("installer cache", func() { // returns the first error encountered or nil if no error encountered. runParallelTest := func(maxCapacity int64, maxReleaseSize int64, tests []test) error { var err error - manager, err = New(getInstallerCacheConfig(maxCapacity, maxReleaseSize), eventsHandler, diskStatsHelper, getLogger()) + manager, err = New(getInstallerCacheConfig(maxCapacity, maxReleaseSize), eventsHandler, metricsAPI, diskStatsHelper, getLogger()) Expect(err).ToNot(HaveOccurred()) var wg sync.WaitGroup var reportedError error @@ -244,11 +256,11 @@ var _ = Describe("installer cache", func() { maxCapacity := int64(10) maxReleaseSize := int64(5) err := runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, }) Expect(err).ToNot(HaveOccurred()) // Now measure disk usage, we should be under the cache size @@ -259,11 +271,11 @@ var _ = Describe("installer cache", func() { maxCapacity := int64(25) maxReleaseSize := int64(5) err := runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.18.11-x86_64", version: "4.18.11"}, - {releaseID: "4.19.11-x86_64", version: "4.19.11"}, - {releaseID: "4.20.11-x86_64", version: "4.20.11"}, - {releaseID: "4.21.11-x86_64", version: "4.21.11"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.18.11-x86_64", version: "4.18.11", majorMinorVersion: "4.17"}, + {releaseID: "4.19.11-x86_64", version: "4.19.11", majorMinorVersion: "4.17"}, + {releaseID: "4.20.11-x86_64", version: "4.20.11", majorMinorVersion: "4.17"}, + {releaseID: "4.21.11-x86_64", version: "4.21.11", majorMinorVersion: "4.17"}, }) Expect(err).ToNot(HaveOccurred()) // Now measure disk usage, we should be under the cache size @@ -274,8 +286,8 @@ var _ = Describe("installer cache", func() { maxCapacity := int64(5) maxReleaseSize := int64(5) err := runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.18.11-x86_64", version: "4.18.11"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.18.11-x86_64", version: "4.18.11", majorMinorVersion: "4.18"}, }) Expect(err).ToNot(HaveOccurred()) // Now measure disk usage, we should be under the cache size @@ -284,41 +296,41 @@ var _ = Describe("installer cache", func() { // Now assert that a retry would work, there should be enough space for another release // use a brand new release ID to prove we are not hitting cache here. err = runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.19.11-x86_64", version: "4.19.11"}, + {releaseID: "4.19.11-x86_64", version: "4.19.11", majorMinorVersion: "4.19"}, }) Expect(err).ToNot(HaveOccurred()) }) It("Should raise error on construction if max release size is larger than cache and cache is enabled", func() { - _, err := New(getInstallerCacheConfig(5, 10), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(5, 10), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(Equal("config.MaxReleaseSize (10 bytes) must not be greater than config.MaxCapacity (5 bytes)")) }) It("Should raise error on construction if max release size is zero and cache is enabled", func() { - _, err := New(getInstallerCacheConfig(5, 0), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(5, 0), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(Equal("config.MaxReleaseSize (0 bytes) must not be zero")) }) It("Should not raise error on construction if max release size is larger than cache and cache eviction is disabled", func() { - _, err := New(getInstallerCacheConfig(0, 10), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(0, 10), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).ToNot(HaveOccurred()) }) It("Should not raise error on construction if max release size is zero and cache eviction is disabled", func() { - _, err := New(getInstallerCacheConfig(0, 0), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(0, 0), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).ToNot(HaveOccurred()) }) It("when cache limit is zero - eviction is skipped", func() { var err error - manager, err = New(getInstallerCacheConfig(0, 5), eventsHandler, diskStatsHelper, logrus.New()) + manager, err = New(getInstallerCacheConfig(0, 5), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).ToNot(HaveOccurred()) clusterId := strfmt.UUID(uuid.New().String()) - r1, _ := testGet("4.8", "4.8.0", clusterId, false) - r2, _ := testGet("4.9", "4.9.0", clusterId, false) - r3, _ := testGet("4.10", "4.10.0", clusterId, false) + r1, _ := testGet("4.8", "4.8.0", clusterId, false, "4.8") + r2, _ := testGet("4.9", "4.9.0", clusterId, false, "4.9") + r3, _ := testGet("4.10", "4.10.0", clusterId, false, "4.10") By("verify that the no file was deleted") _, err = os.Stat(r1) @@ -329,12 +341,14 @@ var _ = Describe("installer cache", func() { Expect(os.IsNotExist(err)).To(BeFalse()) }) - It("exising files access time is updated", func() { + It("existing files access time is updated", func() { + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(gomock.Any()).AnyTimes() clusterId := strfmt.UUID(uuid.New().String()) - _, _ = testGet("4.8", "4.8.0", clusterId, false) - r2, _ := testGet("4.9", "4.9.0", clusterId, false) - r1, _ := testGet("4.8", "4.8.0", clusterId, true) - r3, _ := testGet("4.10", "4.10.0", clusterId, false) + _, _ = testGet("4.8", "4.8.0", clusterId, false, "4.8") + r2, _ := testGet("4.9", "4.9.0", clusterId, false, "4.9") + r1, _ := testGet("4.8", "4.8.0", clusterId, true, "4.8") + metricsAPI.EXPECT().InstallerCacheReleaseEvicted(true).Times(1) + r3, _ := testGet("4.10", "4.10.0", clusterId, false, "4.10") By("verify that the oldest file was deleted") _, err := os.Stat(r1) @@ -348,10 +362,12 @@ var _ = Describe("installer cache", func() { }) It("evicts the oldest file", func() { + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(gomock.Any()).AnyTimes() clusterId := strfmt.UUID(uuid.New().String()) - r1, _ := testGet("4.8", "4.8.0", clusterId, false) - r2, _ := testGet("4.9", "4.9.0", clusterId, false) - r3, _ := testGet("4.10", "4.10.0", clusterId, false) + r1, _ := testGet("4.8", "4.8.0", clusterId, false, "4.8") + r2, _ := testGet("4.9", "4.9.0", clusterId, false, "4.9") + metricsAPI.EXPECT().InstallerCacheReleaseEvicted(true).Times(1) + r3, _ := testGet("4.10", "4.10.0", clusterId, false, "4.10") By("verify that the oldest file was deleted") _, err := os.Stat(r1) @@ -366,11 +382,16 @@ var _ = Describe("installer cache", func() { }) It("extracts a release", func() { + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(gomock.Any()).AnyTimes() releaseID := "4.10-orig" releaseMirrorID := "" version := "4.10.0" + majorMinorVersion := "4.10" clusterID := strfmt.UUID(uuid.NewString()) - mockReleaseCalls(releaseID, version) + mockRelease.EXPECT().GetMajorMinorVersion(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(majorMinorVersion, nil).AnyTimes() + mockReleaseCalls(releaseID, version, majorMinorVersion) + metricsAPI.EXPECT().InstallerCacheGetReleaseCached(majorMinorVersion, false) + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(majorMinorVersion).Times(1) l, err := manager.Get(ctx, releaseID, releaseMirrorID, "pull-secret", mockRelease, version, clusterID) Expect(err).ShouldNot(HaveOccurred()) Expect(l.releaseID).To(Equal(releaseID)) @@ -389,7 +410,6 @@ var _ = Describe("installer cache", func() { numberOfLinks := 10 numberOfExpiredLinks := 5 - directory, err := os.MkdirTemp("", "testPruneExpiredHardLinks") Expect(err).ToNot(HaveOccurred()) diff --git a/internal/metrics/metricsManager.go b/internal/metrics/metricsManager.go index 57b8d7235f..787e61b411 100644 --- a/internal/metrics/metricsManager.go +++ b/internal/metrics/metricsManager.go @@ -3,6 +3,7 @@ package metrics import ( "context" "encoding/json" + "fmt" "time" "github.com/alecthomas/units" @@ -37,8 +38,12 @@ const ( counterClusterValidationFailed = "assisted_installer_cluster_validation_is_in_failed_status_on_cluster_deletion" counterClusterValidationChanged = "assisted_installer_cluster_validation_failed_after_success_before_installation" counterFilesystemUsagePercentage = "assisted_installer_filesystem_usage_percentage" - histogramMonitoredHostsDurationMs = "assisted_installer_monitored_hosts_duration_ms" - histogramMonitoredClustersDurationMs = "assisted_installer_monitored_clusters_duration_ms" + counterMonitoredHosts = "assisted_installer_monitored_hosts" + counterMonitoredClusters = "assisted_installer_monitored_clusters" + counterInstallerCacheGetRelease = "assisted_installer_cache_get_release" + counterInstallerCacheReleaseCached = "assisted_installer_cache_get_release_cached" + counterInstallerCacheReleaseEvicted = "assisted_installer_cache_release_evicted" + counterInstallerCacheGetReleaseCached = "assisted_installer_cache_release_cached" ) const ( @@ -59,8 +64,11 @@ const ( counterDescriptionClusterValidationFailed = "Number of cluster validation errors" counterDescriptionClusterValidationChanged = "Number of cluster validations that already succeed but start to fail again" counterDescriptionFilesystemUsagePercentage = "The percentage of the filesystem usage by the service" - histogramDescriptionMonitoredHostsDurationMs = "Histogram/sum/count of monitored hosts duration (ms)" - histogramDescriptionMonitoredClustersDurationMs = "Histogram/sum/count of monitored clusters duration (ms)" + counterDescriptionMonitoredHosts = "Number of hosts monitored by host monitor" + counterDescriptionMonitoredClusters = "Number of clusters monitored by cluster monitor" + counterDescriptionInstallerCacheGetRelease = "Counts the number of times a release was attempted, with the outcome as a label, cache status as label" + counterDescriptionInstallerCacheReleaseEvicted = "Counts the number of times that a release was evicted, label with success or fail of eviction" + counterDescriptionInstallerCacheGetReleaseCached = "Counts the number of times that a release was either cached or not cached" ) const ( @@ -77,6 +85,15 @@ const ( imageLabel = "imageName" hosts = "hosts" clusters = "clusters" + labelStatus = "status" + labelValueOK = "ok" + labelValueError = "error" + labelCacheHit = "hit" + labelReleaseID = "releaseId" + labelSucceess = "succeess" + labelErrorType = "errorType" + labelValueTimeout = "timeout" + labelValueOther = "other" ) type API interface { @@ -92,8 +109,13 @@ type API interface { DiskSyncDuration(syncDuration int64) ImagePullStatus(imageName, resultStatus string, downloadRate float64) FileSystemUsage(usageInPercentage float64) - MonitoredHostsDurationMs(monitoredHostsMillis float64) - MonitoredClustersDurationMs(monitoredClustersMillis float64) + MonitoredHostsCount(monitoredHosts int64) + MonitoredClusterCount(monitoredClusters int64) + InstallerCacheGetReleaseOK(releaseID string) + InstallerCacheGetReleaseTimeout(releaseID string) + InstallerCacheGetReleaseError(releaseID string) + InstallerCacheReleaseEvicted(succeeded bool) + InstallerCacheGetReleaseCached(releaseId string, cacheHit bool) } type MetricsManager struct { @@ -117,9 +139,13 @@ type MetricsManager struct { serviceLogicClusterValidationFailed *prometheus.CounterVec serviceLogicClusterValidationChanged *prometheus.CounterVec serviceLogicFilesystemUsagePercentage *prometheus.GaugeVec - serviceLogicMonitoredHostsDurationMs *prometheus.HistogramVec - serviceLogicMonitoredClustersDurationMs *prometheus.HistogramVec - collectors []prometheus.Collector + serviceLogicMonitoredHosts *prometheus.GaugeVec + serviceLogicMonitoredClusters *prometheus.GaugeVec + serviceLogicInstallerCacheGetRelease *prometheus.CounterVec + serviceLogicInstallerCacheReleaseEvicted *prometheus.CounterVec + serviceLogicInstallerCacheGetReleaseCached *prometheus.CounterVec + + collectors []prometheus.Collector } var _ API = &MetricsManager{} @@ -275,21 +301,43 @@ func NewMetricsManager(registry prometheus.Registerer, eventsHandler eventsapi.H }, []string{}, ), - serviceLogicMonitoredHostsDurationMs: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + serviceLogicMonitoredHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: subsystem, - Name: histogramMonitoredHostsDurationMs, - Help: histogramDescriptionMonitoredHostsDurationMs, - Buckets: []float64{10, 100, 200, 500, 1000, 10000, 30000}, - }, []string{}), + Name: counterMonitoredHosts, + Help: counterDescriptionMonitoredHosts, + }, []string{hosts}), - serviceLogicMonitoredClustersDurationMs: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + serviceLogicMonitoredClusters: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: subsystem, - Name: histogramMonitoredClustersDurationMs, - Help: histogramDescriptionMonitoredClustersDurationMs, - Buckets: []float64{10, 100, 200, 500, 1000, 10000, 30000}, - }, []string{}), + Name: counterMonitoredClusters, + Help: counterDescriptionMonitoredClusters, + }, []string{hosts}), + + serviceLogicInstallerCacheGetRelease: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: counterInstallerCacheGetRelease, + Help: counterDescriptionInstallerCacheGetRelease, + }, []string{labelStatus, labelReleaseID}), + + serviceLogicInstallerCacheReleaseEvicted: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: counterInstallerCacheReleaseEvicted, + Help: counterDescriptionInstallerCacheReleaseEvicted, + }, []string{labelStatus}), + + serviceLogicInstallerCacheGetReleaseCached: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: counterInstallerCacheGetReleaseCached, + Help: counterDescriptionInstallerCacheGetReleaseCached, + }, []string{labelReleaseID, labelCacheHit}), } m.collectors = append(m.collectors, newDirectoryUsageCollector(metricsManagerConfig.DirectoryUsageMonitorConfig.Directories, diskStatsHelper, log)) @@ -312,8 +360,10 @@ func NewMetricsManager(registry prometheus.Registerer, eventsHandler eventsapi.H m.serviceLogicClusterValidationChanged, m.serviceLogicClusterImagePullStatus, m.serviceLogicFilesystemUsagePercentage, - m.serviceLogicMonitoredHostsDurationMs, - m.serviceLogicMonitoredClustersDurationMs, + m.serviceLogicMonitoredHosts, + m.serviceLogicMonitoredClusters, + m.serviceLogicInstallerCacheGetRelease, + m.serviceLogicInstallerCacheReleaseEvicted, ) for _, collector := range m.collectors { @@ -477,14 +527,34 @@ func (m *MetricsManager) FileSystemUsage(usageInPercentage float64) { m.serviceLogicFilesystemUsagePercentage.WithLabelValues().Set(usageInPercentage) } -func (m *MetricsManager) MonitoredHostsDurationMs(monitoredHostsMs float64) { - m.serviceLogicMonitoredHostsDurationMs.WithLabelValues().Observe(monitoredHostsMs) +func (m *MetricsManager) MonitoredHostsCount(monitoredHosts int64) { + m.serviceLogicMonitoredHosts.WithLabelValues(hosts).Set(float64(monitoredHosts)) } -func (m *MetricsManager) MonitoredClustersDurationMs(monitoredClustersMs float64) { - m.serviceLogicMonitoredClustersDurationMs.WithLabelValues().Observe(monitoredClustersMs) +func (m *MetricsManager) MonitoredClusterCount(monitoredClusters int64) { + m.serviceLogicMonitoredClusters.WithLabelValues(clusters).Set(float64(monitoredClusters)) } func bytesToGib(bytes int64) int64 { return bytes / int64(units.GiB) } + +func (m *MetricsManager) InstallerCacheGetReleaseCached(releaseId string, cacheHit bool) { + m.serviceLogicInstallerCacheGetReleaseCached.WithLabelValues(releaseId, fmt.Sprintf("%t", cacheHit)).Inc() +} + +func (m *MetricsManager) InstallerCacheGetReleaseOK(releaseId string) { + m.serviceLogicInstallerCacheGetRelease.WithLabelValues(labelValueOK, releaseId).Inc() +} + +func (m *MetricsManager) InstallerCacheGetReleaseTimeout(releaseId string) { + m.serviceLogicInstallerCacheGetRelease.WithLabelValues(labelValueTimeout, releaseId).Inc() +} + +func (m *MetricsManager) InstallerCacheGetReleaseError(releaseId string) { + m.serviceLogicInstallerCacheGetRelease.WithLabelValues(labelValueError, releaseId).Inc() +} + +func (m *MetricsManager) InstallerCacheReleaseEvicted(succeeded bool) { + m.serviceLogicInstallerCacheReleaseEvicted.WithLabelValues(fmt.Sprintf("%t", succeeded)).Inc() +} diff --git a/internal/metrics/mock_metrics_manager_api.go b/internal/metrics/mock_metrics_manager_api.go index 26e3e09791..17f3ee599e 100644 --- a/internal/metrics/mock_metrics_manager_api.go +++ b/internal/metrics/mock_metrics_manager_api.go @@ -169,28 +169,88 @@ func (mr *MockAPIMockRecorder) InstallationStarted() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallationStarted", reflect.TypeOf((*MockAPI)(nil).InstallationStarted)) } -// MonitoredClustersDurationMs mocks base method. -func (m *MockAPI) MonitoredClustersDurationMs(monitoredClustersMillis float64) { +// InstallerCacheGetReleaseCached mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseCached(releaseId string, cacheHit bool) { m.ctrl.T.Helper() - m.ctrl.Call(m, "MonitoredClustersDurationMs", monitoredClustersMillis) + m.ctrl.Call(m, "InstallerCacheGetReleaseCached", releaseId, cacheHit) } -// MonitoredClustersDurationMs indicates an expected call of MonitoredClustersDurationMs. -func (mr *MockAPIMockRecorder) MonitoredClustersDurationMs(monitoredClustersMillis interface{}) *gomock.Call { +// InstallerCacheGetReleaseCached indicates an expected call of InstallerCacheGetReleaseCached. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseCached(releaseId, cacheHit interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MonitoredClustersDurationMs", reflect.TypeOf((*MockAPI)(nil).MonitoredClustersDurationMs), monitoredClustersMillis) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseCached", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseCached), releaseId, cacheHit) } -// MonitoredHostsDurationMs mocks base method. -func (m *MockAPI) MonitoredHostsDurationMs(monitoredHostsMillis float64) { +// InstallerCacheGetReleaseError mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseError(releaseID string) { m.ctrl.T.Helper() - m.ctrl.Call(m, "MonitoredHostsDurationMs", monitoredHostsMillis) + m.ctrl.Call(m, "InstallerCacheGetReleaseError", releaseID) } -// MonitoredHostsDurationMs indicates an expected call of MonitoredHostsDurationMs. -func (mr *MockAPIMockRecorder) MonitoredHostsDurationMs(monitoredHostsMillis interface{}) *gomock.Call { +// InstallerCacheGetReleaseError indicates an expected call of InstallerCacheGetReleaseError. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseError(releaseID interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MonitoredHostsDurationMs", reflect.TypeOf((*MockAPI)(nil).MonitoredHostsDurationMs), monitoredHostsMillis) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseError", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseError), releaseID) +} + +// InstallerCacheGetReleaseOK mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseOK(releaseID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheGetReleaseOK", releaseID) +} + +// InstallerCacheGetReleaseOK indicates an expected call of InstallerCacheGetReleaseOK. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseOK(releaseID interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseOK", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseOK), releaseID) +} + +// InstallerCacheGetReleaseTimeout mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseTimeout(releaseID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheGetReleaseTimeout", releaseID) +} + +// InstallerCacheGetReleaseTimeout indicates an expected call of InstallerCacheGetReleaseTimeout. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseTimeout(releaseID interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseTimeout", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseTimeout), releaseID) +} + +// InstallerCacheReleaseEvicted mocks base method. +func (m *MockAPI) InstallerCacheReleaseEvicted(succeeded bool) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheReleaseEvicted", succeeded) +} + +// InstallerCacheReleaseEvicted indicates an expected call of InstallerCacheReleaseEvicted. +func (mr *MockAPIMockRecorder) InstallerCacheReleaseEvicted(succeeded interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheReleaseEvicted", reflect.TypeOf((*MockAPI)(nil).InstallerCacheReleaseEvicted), succeeded) +} + +// MonitoredClusterCount mocks base method. +func (m *MockAPI) MonitoredClusterCount(monitoredClusters int64) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "MonitoredClusterCount", monitoredClusters) +} + +// MonitoredClusterCount indicates an expected call of MonitoredClusterCount. +func (mr *MockAPIMockRecorder) MonitoredClusterCount(monitoredClusters interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MonitoredClusterCount", reflect.TypeOf((*MockAPI)(nil).MonitoredClusterCount), monitoredClusters) +} + +// MonitoredHostsCount mocks base method. +func (m *MockAPI) MonitoredHostsCount(monitoredHosts int64) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "MonitoredHostsCount", monitoredHosts) +} + +// MonitoredHostsCount indicates an expected call of MonitoredHostsCount. +func (mr *MockAPIMockRecorder) MonitoredHostsCount(monitoredHosts interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MonitoredHostsCount", reflect.TypeOf((*MockAPI)(nil).MonitoredHostsCount), monitoredHosts) } // ReportHostInstallationMetrics mocks base method.