From 9b6070c3f79ea39808bbd54af337385b679f1cef Mon Sep 17 00:00:00 2001 From: Paul Maidment Date: Thu, 6 Feb 2025 14:46:35 +0200 Subject: [PATCH] MGMT-19840: Gather operational metrics from installercache The intent of this PR is to trace the following statistics, implemented as counts and incremented from applicable parts of the solution. counterDescriptionInstallerCachePrunedHardlink "Counts the number of times the installercache pruned a hardlink for being too old" counterDescriptionInstallerCacheGetReleaseOK "Counts the number of times that a release was fetched succesfully" counterDescriptionInstallerCacheGetReleaseTimeout "Counts the number of times that a release timed out or had the context cancelled" counterDescriptionInstallerCacheGetReleaseError "Counts the number of times that a release fetch resulted in error" counterDescriptionInstallerCacheReleaseCached "Counts the number of times that a release was found in the cache" counterDescriptionInstallerCacheReleaseExtracted "Counts the number of times that a release was extracted" counterDescriptionInstallerCacheTryEviction "Counts the number of times that the eviction function was called" counterDescriptionInstallerCacheReleaseEvicted "Counts the number of times that a release was evicted" This, combined with the event based metrics gathered in openshift#7156 should provide enough information to track the behaviour of the cache. --- cmd/main.go | 2 +- internal/ignition/installmanifests_test.go | 20 +++- internal/installercache/installercache.go | 33 ++++-- .../installercache/installercache_test.go | 102 +++++++++++------- internal/metrics/metricsManager.go | 74 ++++++++++++- internal/metrics/mock_metrics_manager_api.go | 60 +++++++++++ 6 files changed, 235 insertions(+), 56 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index e2a1c6094ca..799e0f55078 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -492,7 +492,7 @@ func main() { Options.BMConfig.S3EndpointURL = newUrl Options.InstallerCacheConfig.CacheDir = filepath.Join(Options.GeneratorConfig.GetWorkingDirectory(), "installercache") - installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, diskStatsHelper, log) + installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, metricsManager, diskStatsHelper, log) failOnError(err, "failed to instantiate installercache") generator := generator.New(log, objectHandler, Options.GeneratorConfig, providerRegistry, manifestsApi, eventsHandler, installerCache) diff --git a/internal/ignition/installmanifests_test.go b/internal/ignition/installmanifests_test.go index b84bfd93413..37a86992668 100644 --- a/internal/ignition/installmanifests_test.go +++ b/internal/ignition/installmanifests_test.go @@ -94,6 +94,7 @@ var _ = Describe("Bootstrap Ignition Update", func() { manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler *eventsapi.MockHandler installerCache *installercache.Installers + metricsAPI *metrics.MockAPI ) BeforeEach(func() { @@ -105,12 +106,13 @@ var _ = Describe("Bootstrap Ignition Update", func() { err1 = os.WriteFile(examplePath, []byte(bootstrap1), 0600) Expect(err1).NotTo(HaveOccurred()) ctrl = gomock.NewController(GinkgoT()) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) mockS3Client = s3wrapper.NewMockAPI(ctrl) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) @@ -262,6 +264,7 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6 ctrl *gomock.Controller manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -286,12 +289,13 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6 ctrl = gomock.NewController(GinkgoT()) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) @@ -456,6 +460,7 @@ var _ = Describe("createHostIgnitions", func() { workDir string manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -476,13 +481,14 @@ var _ = Describe("createHostIgnitions", func() { mockS3Client = s3wrapper.NewMockAPI(ctrl) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) cluster = testCluster() installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) @@ -1779,6 +1785,7 @@ var _ = Describe("Bare metal host generation", func() { ctrl *gomock.Controller manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -1789,13 +1796,14 @@ var _ = Describe("Bare metal host generation", func() { ctrl = gomock.NewController(GinkgoT()) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), ReleaseFetchRetryInterval: 1 * time.Microsecond, } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) @@ -1889,6 +1897,7 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() { ctrl *gomock.Controller manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -1920,13 +1929,14 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() { ctrl = gomock.NewController(GinkgoT()) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), ReleaseFetchRetryInterval: 1 * time.Microsecond, } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) diff --git a/internal/installercache/installercache.go b/internal/installercache/installercache.go index 24f9efc5ac8..bcd8d95628d 100644 --- a/internal/installercache/installercache.go +++ b/internal/installercache/installercache.go @@ -36,6 +36,7 @@ type Installers struct { eventsHandler eventsapi.Handler diskStatsHelper metrics.DiskStatsHelper config Config + metricsAPI metrics.API } type Size int64 @@ -116,7 +117,7 @@ func (rl *Release) Cleanup(ctx context.Context) error { } // New constructs an installer cache with a given storage capacity -func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) { +func New(config Config, eventsHandler eventsapi.Handler, metricsAPI metrics.API, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) { if config.MaxCapacity > 0 && config.MaxReleaseSize == 0 { return nil, fmt.Errorf("config.MaxReleaseSize (%d bytes) must not be zero", config.MaxReleaseSize) } @@ -128,6 +129,7 @@ func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics eventsHandler: eventsHandler, diskStatsHelper: diskStatsHelper, config: config, + metricsAPI: metricsAPI, }, nil } @@ -138,14 +140,25 @@ func (i *Installers) Get(ctx context.Context, releaseID, releaseIDMirror, pullSe for { select { case <-ctx.Done(): - return nil, ctx.Err() + err := ctx.Err() + if err == context.DeadlineExceeded { + i.metricsAPI.InstallerCacheGetReleaseTimeout(releaseID) + } + return nil, err default: - release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID) + majorMinorVersion, err := ocRelease.GetMajorMinorVersion(i.log, releaseID, releaseIDMirror, pullSecret) + if err != nil { + i.log.Warnf("unable to get majorMinorVersion to record metric for %s falling back to full URI", releaseID) + majorMinorVersion = releaseID + } + release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID, majorMinorVersion) if err == nil { + i.metricsAPI.InstallerCacheGetReleaseOK(majorMinorVersion) return release, nil } _, isCapacityError := err.(*errorInsufficientCacheCapacity) if !isCapacityError { + i.metricsAPI.InstallerCacheGetReleaseError(releaseID) return nil, errors.Wrapf(err, "failed to get installer path for release %s", releaseID) } time.Sleep(i.config.ReleaseFetchRetryInterval) @@ -161,14 +174,16 @@ func (i *Installers) getDiskUsageIncludingHardlinks() (uint64, error) { return usedBytes, nil } -func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release) (extractDuration float64, cached bool, err error) { +func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release, majorMinorVersion string) (extractDuration float64, cached bool, err error) { _, err = os.Stat(path) if err == nil { + i.metricsAPI.InstallerCacheGetReleaseCached(majorMinorVersion, true) return 0, true, nil // release was found in the cache } if !os.IsNotExist(err) { return 0, false, err } + i.metricsAPI.InstallerCacheGetReleaseCached(majorMinorVersion, false) usedBytes, err := i.getDiskUsageIncludingHardlinks() if err != nil && !os.IsNotExist(err) { return 0, false, errors.Wrapf(err, "could not determine disk usage information for cache dir %s", i.config.CacheDir) @@ -184,7 +199,7 @@ func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pu return time.Since(extractStartTime).Seconds(), false, nil } -func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocRelease oc.Release, ocpVersion string, clusterID strfmt.UUID) (*Release, error) { +func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocRelease oc.Release, ocpVersion string, clusterID strfmt.UUID, majorMinorVersion string) (*Release, error) { i.Lock() defer i.Unlock() @@ -199,7 +214,7 @@ func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocReleas if err != nil { return nil, err } - release.extractDuration, release.cached, err = i.extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion, ocRelease) + release.extractDuration, release.cached, err = i.extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion, ocRelease, majorMinorVersion) if err != nil { return nil, err } @@ -310,8 +325,10 @@ func (i *Installers) evictFile(filePath string) error { i.log.Infof("evicting binary file %s due to storage pressure", filePath) err := os.Remove(filePath) if err != nil { + i.metricsAPI.InstallerCacheReleaseEvicted(false) return err } + i.metricsAPI.InstallerCacheReleaseEvicted(true) // if the parent directory was left empty, // remove it to avoid dangling directories parentDir := path.Dir(filePath) @@ -334,9 +351,9 @@ func (i *Installers) pruneExpiredHardLinks(links []*fileInfo, gracePeriod time.D grace := graceTime.Unix() if finfo.info.ModTime().Unix() < grace { i.log.Infof("attempting to prune hard link %s", finfo.path) - err := os.Remove(finfo.path) - if err != nil { + if err := os.Remove(finfo.path); err != nil { i.log.WithError(err).Errorf("failed to prune hard link %s", finfo.path) + continue } } } diff --git a/internal/installercache/installercache_test.go b/internal/installercache/installercache_test.go index 9e285e64d42..393d1b9bac3 100644 --- a/internal/installercache/installercache_test.go +++ b/internal/installercache/installercache_test.go @@ -71,6 +71,7 @@ var _ = Describe("installer cache", func() { manager *Installers cacheDir string eventsHandler *eventsapi.MockHandler + metricsAPI *metrics.MockAPI ctx context.Context diskStatsHelper metrics.DiskStatsHelper ) @@ -85,17 +86,17 @@ var _ = Describe("installer cache", func() { } BeforeEach(func() { - ctrl = gomock.NewController(GinkgoT()) diskStatsHelper = metrics.NewOSDiskStatsHelper(logrus.New()) mockRelease = oc.NewMockRelease(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) var err error cacheDir, err = os.MkdirTemp("/tmp", "cacheDir") Expect(err).NotTo(HaveOccurred()) Expect(os.Mkdir(filepath.Join(cacheDir, "quay.io"), 0755)).To(Succeed()) Expect(os.Mkdir(filepath.Join(filepath.Join(cacheDir, "quay.io"), "release-dev"), 0755)).To(Succeed()) - manager, err = New(getInstallerCacheConfig(12, 5), eventsHandler, diskStatsHelper, logrus.New()) + manager, err = New(getInstallerCacheConfig(12, 5), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).NotTo(HaveOccurred()) ctx = context.TODO() }) @@ -111,7 +112,7 @@ var _ = Describe("installer cache", func() { ).AnyTimes() } - mockReleaseCalls := func(releaseID string, version string) { + mockReleaseCalls := func(releaseID string, version string, expectedMajorMinorVersion string) { workdir := filepath.Join(manager.config.CacheDir, "quay.io", "release-dev") fname := filepath.Join(workdir, releaseID) @@ -131,15 +132,21 @@ var _ = Describe("installer cache", func() { mockRelease.EXPECT().Extract(gomock.Any(), releaseID, gomock.Any(), manager.config.CacheDir, gomock.Any(), version). DoAndReturn(writeMockedReleaseToDisk).AnyTimes() + + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(expectedMajorMinorVersion).Times(1) } - testGet := func(releaseID, version string, clusterID strfmt.UUID, expectCached bool) (string, string) { + testGet := func(releaseID, version string, clusterID strfmt.UUID, expectCached bool, expectedMajorMinorVersion string) (string, string) { workdir := filepath.Join(manager.config.CacheDir, "quay.io", "release-dev") fname := filepath.Join(workdir, releaseID) if !expectCached { - mockReleaseCalls(releaseID, version) + mockReleaseCalls(releaseID, version, expectedMajorMinorVersion) } expectEventsSent() + mockReleaseCalls(releaseID, version, expectedMajorMinorVersion) + expectEventsSent() + mockRelease.EXPECT().GetMajorMinorVersion(gomock.Any(), releaseID, gomock.Any(), gomock.Any()).Return(expectedMajorMinorVersion, nil).Times(1) + metricsAPI.EXPECT().InstallerCacheGetReleaseCached(expectedMajorMinorVersion, gomock.Any()).AnyTimes() l, err := manager.Get(ctx, releaseID, "mirror", "pull-secret", mockRelease, version, clusterID) Expect(err).ShouldNot(HaveOccurred()) Expect(l.releaseID).To(Equal(releaseID)) @@ -157,9 +164,10 @@ var _ = Describe("installer cache", func() { } type test struct { - releaseID string - version string - clusterID strfmt.UUID + releaseID string + version string + clusterID strfmt.UUID + majorMinorVersion string } getUsedBytesForDirectory := func(directory string) uint64 { @@ -188,7 +196,11 @@ var _ = Describe("installer cache", func() { runTest := func(t test, manager *Installers) (*Release, error) { expectEventsSent() - mockReleaseCalls(t.releaseID, t.version) + mockReleaseCalls(t.releaseID, t.version, t.majorMinorVersion) + mockRelease.EXPECT().GetMajorMinorVersion(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(t.majorMinorVersion, nil).AnyTimes() + metricsAPI.EXPECT().InstallerCacheGetReleaseCached(t.majorMinorVersion, gomock.Any()).AnyTimes() + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(t.majorMinorVersion).AnyTimes() + metricsAPI.EXPECT().InstallerCacheReleaseEvicted(gomock.Any()).AnyTimes() return manager.Get(ctx, t.releaseID, "mirror", "pull-secret", mockRelease, t.version, t.clusterID) } @@ -221,7 +233,7 @@ var _ = Describe("installer cache", func() { // returns the first error encountered or nil if no error encountered. runParallelTest := func(maxCapacity int64, maxReleaseSize int64, tests []test) error { var err error - manager, err = New(getInstallerCacheConfig(maxCapacity, maxReleaseSize), eventsHandler, diskStatsHelper, getLogger()) + manager, err = New(getInstallerCacheConfig(maxCapacity, maxReleaseSize), eventsHandler, metricsAPI, diskStatsHelper, getLogger()) Expect(err).ToNot(HaveOccurred()) var wg sync.WaitGroup var reportedError error @@ -244,11 +256,11 @@ var _ = Describe("installer cache", func() { maxCapacity := int64(10) maxReleaseSize := int64(5) err := runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, }) Expect(err).ToNot(HaveOccurred()) // Now measure disk usage, we should be under the cache size @@ -259,11 +271,11 @@ var _ = Describe("installer cache", func() { maxCapacity := int64(25) maxReleaseSize := int64(5) err := runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.18.11-x86_64", version: "4.18.11"}, - {releaseID: "4.19.11-x86_64", version: "4.19.11"}, - {releaseID: "4.20.11-x86_64", version: "4.20.11"}, - {releaseID: "4.21.11-x86_64", version: "4.21.11"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.18.11-x86_64", version: "4.18.11", majorMinorVersion: "4.17"}, + {releaseID: "4.19.11-x86_64", version: "4.19.11", majorMinorVersion: "4.17"}, + {releaseID: "4.20.11-x86_64", version: "4.20.11", majorMinorVersion: "4.17"}, + {releaseID: "4.21.11-x86_64", version: "4.21.11", majorMinorVersion: "4.17"}, }) Expect(err).ToNot(HaveOccurred()) // Now measure disk usage, we should be under the cache size @@ -274,8 +286,8 @@ var _ = Describe("installer cache", func() { maxCapacity := int64(5) maxReleaseSize := int64(5) err := runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.18.11-x86_64", version: "4.18.11"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.18.11-x86_64", version: "4.18.11", majorMinorVersion: "4.18"}, }) Expect(err).ToNot(HaveOccurred()) // Now measure disk usage, we should be under the cache size @@ -284,41 +296,41 @@ var _ = Describe("installer cache", func() { // Now assert that a retry would work, there should be enough space for another release // use a brand new release ID to prove we are not hitting cache here. err = runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.19.11-x86_64", version: "4.19.11"}, + {releaseID: "4.19.11-x86_64", version: "4.19.11", majorMinorVersion: "4.19"}, }) Expect(err).ToNot(HaveOccurred()) }) It("Should raise error on construction if max release size is larger than cache and cache is enabled", func() { - _, err := New(getInstallerCacheConfig(5, 10), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(5, 10), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(Equal("config.MaxReleaseSize (10 bytes) must not be greater than config.MaxCapacity (5 bytes)")) }) It("Should raise error on construction if max release size is zero and cache is enabled", func() { - _, err := New(getInstallerCacheConfig(5, 0), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(5, 0), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(Equal("config.MaxReleaseSize (0 bytes) must not be zero")) }) It("Should not raise error on construction if max release size is larger than cache and cache eviction is disabled", func() { - _, err := New(getInstallerCacheConfig(0, 10), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(0, 10), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).ToNot(HaveOccurred()) }) It("Should not raise error on construction if max release size is zero and cache eviction is disabled", func() { - _, err := New(getInstallerCacheConfig(0, 0), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(0, 0), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).ToNot(HaveOccurred()) }) It("when cache limit is zero - eviction is skipped", func() { var err error - manager, err = New(getInstallerCacheConfig(0, 5), eventsHandler, diskStatsHelper, logrus.New()) + manager, err = New(getInstallerCacheConfig(0, 5), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).ToNot(HaveOccurred()) clusterId := strfmt.UUID(uuid.New().String()) - r1, _ := testGet("4.8", "4.8.0", clusterId, false) - r2, _ := testGet("4.9", "4.9.0", clusterId, false) - r3, _ := testGet("4.10", "4.10.0", clusterId, false) + r1, _ := testGet("4.8", "4.8.0", clusterId, false, "4.8") + r2, _ := testGet("4.9", "4.9.0", clusterId, false, "4.9") + r3, _ := testGet("4.10", "4.10.0", clusterId, false, "4.10") By("verify that the no file was deleted") _, err = os.Stat(r1) @@ -329,12 +341,14 @@ var _ = Describe("installer cache", func() { Expect(os.IsNotExist(err)).To(BeFalse()) }) - It("exising files access time is updated", func() { + It("existing files access time is updated", func() { + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(gomock.Any()).AnyTimes() clusterId := strfmt.UUID(uuid.New().String()) - _, _ = testGet("4.8", "4.8.0", clusterId, false) - r2, _ := testGet("4.9", "4.9.0", clusterId, false) - r1, _ := testGet("4.8", "4.8.0", clusterId, true) - r3, _ := testGet("4.10", "4.10.0", clusterId, false) + _, _ = testGet("4.8", "4.8.0", clusterId, false, "4.8") + r2, _ := testGet("4.9", "4.9.0", clusterId, false, "4.9") + r1, _ := testGet("4.8", "4.8.0", clusterId, true, "4.8") + metricsAPI.EXPECT().InstallerCacheReleaseEvicted(true).Times(1) + r3, _ := testGet("4.10", "4.10.0", clusterId, false, "4.10") By("verify that the oldest file was deleted") _, err := os.Stat(r1) @@ -348,10 +362,12 @@ var _ = Describe("installer cache", func() { }) It("evicts the oldest file", func() { + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(gomock.Any()).AnyTimes() clusterId := strfmt.UUID(uuid.New().String()) - r1, _ := testGet("4.8", "4.8.0", clusterId, false) - r2, _ := testGet("4.9", "4.9.0", clusterId, false) - r3, _ := testGet("4.10", "4.10.0", clusterId, false) + r1, _ := testGet("4.8", "4.8.0", clusterId, false, "4.8") + r2, _ := testGet("4.9", "4.9.0", clusterId, false, "4.9") + metricsAPI.EXPECT().InstallerCacheReleaseEvicted(true).Times(1) + r3, _ := testGet("4.10", "4.10.0", clusterId, false, "4.10") By("verify that the oldest file was deleted") _, err := os.Stat(r1) @@ -366,11 +382,16 @@ var _ = Describe("installer cache", func() { }) It("extracts a release", func() { + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(gomock.Any()).AnyTimes() releaseID := "4.10-orig" releaseMirrorID := "" version := "4.10.0" + majorMinorVersion := "4.10" clusterID := strfmt.UUID(uuid.NewString()) - mockReleaseCalls(releaseID, version) + mockRelease.EXPECT().GetMajorMinorVersion(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(majorMinorVersion, nil).AnyTimes() + mockReleaseCalls(releaseID, version, majorMinorVersion) + metricsAPI.EXPECT().InstallerCacheGetReleaseCached(majorMinorVersion, false) + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(majorMinorVersion).Times(1) l, err := manager.Get(ctx, releaseID, releaseMirrorID, "pull-secret", mockRelease, version, clusterID) Expect(err).ShouldNot(HaveOccurred()) Expect(l.releaseID).To(Equal(releaseID)) @@ -389,7 +410,6 @@ var _ = Describe("installer cache", func() { numberOfLinks := 10 numberOfExpiredLinks := 5 - directory, err := os.MkdirTemp("", "testPruneExpiredHardLinks") Expect(err).ToNot(HaveOccurred()) diff --git a/internal/metrics/metricsManager.go b/internal/metrics/metricsManager.go index 6e5111f2802..787e61b4117 100644 --- a/internal/metrics/metricsManager.go +++ b/internal/metrics/metricsManager.go @@ -3,6 +3,7 @@ package metrics import ( "context" "encoding/json" + "fmt" "time" "github.com/alecthomas/units" @@ -39,6 +40,10 @@ const ( counterFilesystemUsagePercentage = "assisted_installer_filesystem_usage_percentage" counterMonitoredHosts = "assisted_installer_monitored_hosts" counterMonitoredClusters = "assisted_installer_monitored_clusters" + counterInstallerCacheGetRelease = "assisted_installer_cache_get_release" + counterInstallerCacheReleaseCached = "assisted_installer_cache_get_release_cached" + counterInstallerCacheReleaseEvicted = "assisted_installer_cache_release_evicted" + counterInstallerCacheGetReleaseCached = "assisted_installer_cache_release_cached" ) const ( @@ -61,6 +66,9 @@ const ( counterDescriptionFilesystemUsagePercentage = "The percentage of the filesystem usage by the service" counterDescriptionMonitoredHosts = "Number of hosts monitored by host monitor" counterDescriptionMonitoredClusters = "Number of clusters monitored by cluster monitor" + counterDescriptionInstallerCacheGetRelease = "Counts the number of times a release was attempted, with the outcome as a label, cache status as label" + counterDescriptionInstallerCacheReleaseEvicted = "Counts the number of times that a release was evicted, label with success or fail of eviction" + counterDescriptionInstallerCacheGetReleaseCached = "Counts the number of times that a release was either cached or not cached" ) const ( @@ -77,6 +85,15 @@ const ( imageLabel = "imageName" hosts = "hosts" clusters = "clusters" + labelStatus = "status" + labelValueOK = "ok" + labelValueError = "error" + labelCacheHit = "hit" + labelReleaseID = "releaseId" + labelSucceess = "succeess" + labelErrorType = "errorType" + labelValueTimeout = "timeout" + labelValueOther = "other" ) type API interface { @@ -94,6 +111,11 @@ type API interface { FileSystemUsage(usageInPercentage float64) MonitoredHostsCount(monitoredHosts int64) MonitoredClusterCount(monitoredClusters int64) + InstallerCacheGetReleaseOK(releaseID string) + InstallerCacheGetReleaseTimeout(releaseID string) + InstallerCacheGetReleaseError(releaseID string) + InstallerCacheReleaseEvicted(succeeded bool) + InstallerCacheGetReleaseCached(releaseId string, cacheHit bool) } type MetricsManager struct { @@ -119,7 +141,11 @@ type MetricsManager struct { serviceLogicFilesystemUsagePercentage *prometheus.GaugeVec serviceLogicMonitoredHosts *prometheus.GaugeVec serviceLogicMonitoredClusters *prometheus.GaugeVec - collectors []prometheus.Collector + serviceLogicInstallerCacheGetRelease *prometheus.CounterVec + serviceLogicInstallerCacheReleaseEvicted *prometheus.CounterVec + serviceLogicInstallerCacheGetReleaseCached *prometheus.CounterVec + + collectors []prometheus.Collector } var _ API = &MetricsManager{} @@ -288,6 +314,30 @@ func NewMetricsManager(registry prometheus.Registerer, eventsHandler eventsapi.H Name: counterMonitoredClusters, Help: counterDescriptionMonitoredClusters, }, []string{hosts}), + + serviceLogicInstallerCacheGetRelease: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: counterInstallerCacheGetRelease, + Help: counterDescriptionInstallerCacheGetRelease, + }, []string{labelStatus, labelReleaseID}), + + serviceLogicInstallerCacheReleaseEvicted: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: counterInstallerCacheReleaseEvicted, + Help: counterDescriptionInstallerCacheReleaseEvicted, + }, []string{labelStatus}), + + serviceLogicInstallerCacheGetReleaseCached: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: counterInstallerCacheGetReleaseCached, + Help: counterDescriptionInstallerCacheGetReleaseCached, + }, []string{labelReleaseID, labelCacheHit}), } m.collectors = append(m.collectors, newDirectoryUsageCollector(metricsManagerConfig.DirectoryUsageMonitorConfig.Directories, diskStatsHelper, log)) @@ -312,6 +362,8 @@ func NewMetricsManager(registry prometheus.Registerer, eventsHandler eventsapi.H m.serviceLogicFilesystemUsagePercentage, m.serviceLogicMonitoredHosts, m.serviceLogicMonitoredClusters, + m.serviceLogicInstallerCacheGetRelease, + m.serviceLogicInstallerCacheReleaseEvicted, ) for _, collector := range m.collectors { @@ -486,3 +538,23 @@ func (m *MetricsManager) MonitoredClusterCount(monitoredClusters int64) { func bytesToGib(bytes int64) int64 { return bytes / int64(units.GiB) } + +func (m *MetricsManager) InstallerCacheGetReleaseCached(releaseId string, cacheHit bool) { + m.serviceLogicInstallerCacheGetReleaseCached.WithLabelValues(releaseId, fmt.Sprintf("%t", cacheHit)).Inc() +} + +func (m *MetricsManager) InstallerCacheGetReleaseOK(releaseId string) { + m.serviceLogicInstallerCacheGetRelease.WithLabelValues(labelValueOK, releaseId).Inc() +} + +func (m *MetricsManager) InstallerCacheGetReleaseTimeout(releaseId string) { + m.serviceLogicInstallerCacheGetRelease.WithLabelValues(labelValueTimeout, releaseId).Inc() +} + +func (m *MetricsManager) InstallerCacheGetReleaseError(releaseId string) { + m.serviceLogicInstallerCacheGetRelease.WithLabelValues(labelValueError, releaseId).Inc() +} + +func (m *MetricsManager) InstallerCacheReleaseEvicted(succeeded bool) { + m.serviceLogicInstallerCacheReleaseEvicted.WithLabelValues(fmt.Sprintf("%t", succeeded)).Inc() +} diff --git a/internal/metrics/mock_metrics_manager_api.go b/internal/metrics/mock_metrics_manager_api.go index 7d6163539e9..17f3ee599e1 100644 --- a/internal/metrics/mock_metrics_manager_api.go +++ b/internal/metrics/mock_metrics_manager_api.go @@ -169,6 +169,66 @@ func (mr *MockAPIMockRecorder) InstallationStarted() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallationStarted", reflect.TypeOf((*MockAPI)(nil).InstallationStarted)) } +// InstallerCacheGetReleaseCached mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseCached(releaseId string, cacheHit bool) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheGetReleaseCached", releaseId, cacheHit) +} + +// InstallerCacheGetReleaseCached indicates an expected call of InstallerCacheGetReleaseCached. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseCached(releaseId, cacheHit interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseCached", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseCached), releaseId, cacheHit) +} + +// InstallerCacheGetReleaseError mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseError(releaseID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheGetReleaseError", releaseID) +} + +// InstallerCacheGetReleaseError indicates an expected call of InstallerCacheGetReleaseError. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseError(releaseID interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseError", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseError), releaseID) +} + +// InstallerCacheGetReleaseOK mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseOK(releaseID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheGetReleaseOK", releaseID) +} + +// InstallerCacheGetReleaseOK indicates an expected call of InstallerCacheGetReleaseOK. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseOK(releaseID interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseOK", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseOK), releaseID) +} + +// InstallerCacheGetReleaseTimeout mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseTimeout(releaseID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheGetReleaseTimeout", releaseID) +} + +// InstallerCacheGetReleaseTimeout indicates an expected call of InstallerCacheGetReleaseTimeout. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseTimeout(releaseID interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseTimeout", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseTimeout), releaseID) +} + +// InstallerCacheReleaseEvicted mocks base method. +func (m *MockAPI) InstallerCacheReleaseEvicted(succeeded bool) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheReleaseEvicted", succeeded) +} + +// InstallerCacheReleaseEvicted indicates an expected call of InstallerCacheReleaseEvicted. +func (mr *MockAPIMockRecorder) InstallerCacheReleaseEvicted(succeeded interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheReleaseEvicted", reflect.TypeOf((*MockAPI)(nil).InstallerCacheReleaseEvicted), succeeded) +} + // MonitoredClusterCount mocks base method. func (m *MockAPI) MonitoredClusterCount(monitoredClusters int64) { m.ctrl.T.Helper()