diff --git a/cmd/main.go b/cmd/main.go index e2a1c6094c..799e0f5507 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -492,7 +492,7 @@ func main() { Options.BMConfig.S3EndpointURL = newUrl Options.InstallerCacheConfig.CacheDir = filepath.Join(Options.GeneratorConfig.GetWorkingDirectory(), "installercache") - installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, diskStatsHelper, log) + installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, metricsManager, diskStatsHelper, log) failOnError(err, "failed to instantiate installercache") generator := generator.New(log, objectHandler, Options.GeneratorConfig, providerRegistry, manifestsApi, eventsHandler, installerCache) diff --git a/internal/ignition/installmanifests_test.go b/internal/ignition/installmanifests_test.go index b84bfd9341..37a8699266 100644 --- a/internal/ignition/installmanifests_test.go +++ b/internal/ignition/installmanifests_test.go @@ -94,6 +94,7 @@ var _ = Describe("Bootstrap Ignition Update", func() { manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler *eventsapi.MockHandler installerCache *installercache.Installers + metricsAPI *metrics.MockAPI ) BeforeEach(func() { @@ -105,12 +106,13 @@ var _ = Describe("Bootstrap Ignition Update", func() { err1 = os.WriteFile(examplePath, []byte(bootstrap1), 0600) Expect(err1).NotTo(HaveOccurred()) ctrl = gomock.NewController(GinkgoT()) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) mockS3Client = s3wrapper.NewMockAPI(ctrl) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) @@ -262,6 +264,7 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6 ctrl *gomock.Controller manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -286,12 +289,13 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6 ctrl = gomock.NewController(GinkgoT()) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) @@ -456,6 +460,7 @@ var _ = Describe("createHostIgnitions", func() { workDir string manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -476,13 +481,14 @@ var _ = Describe("createHostIgnitions", func() { mockS3Client = s3wrapper.NewMockAPI(ctrl) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) cluster = testCluster() installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) @@ -1779,6 +1785,7 @@ var _ = Describe("Bare metal host generation", func() { ctrl *gomock.Controller manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -1789,13 +1796,14 @@ var _ = Describe("Bare metal host generation", func() { ctrl = gomock.NewController(GinkgoT()) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), ReleaseFetchRetryInterval: 1 * time.Microsecond, } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) @@ -1889,6 +1897,7 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() { ctrl *gomock.Controller manifestsAPI *manifestsapi.MockManifestsAPI eventsHandler eventsapi.Handler + metricsAPI *metrics.MockAPI installerCache *installercache.Installers ) @@ -1920,13 +1929,14 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() { ctrl = gomock.NewController(GinkgoT()) manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) installerCacheConfig := installercache.Config{ CacheDir: filepath.Join(workDir, "some-dir", "installercache"), MaxCapacity: installercache.Size(5), MaxReleaseSize: installercache.Size(5), ReleaseFetchRetryInterval: 1 * time.Microsecond, } - installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) + installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New()) Expect(err).NotTo(HaveOccurred()) }) diff --git a/internal/installercache/installercache.go b/internal/installercache/installercache.go index 24f9efc5ac..bcd8d95628 100644 --- a/internal/installercache/installercache.go +++ b/internal/installercache/installercache.go @@ -36,6 +36,7 @@ type Installers struct { eventsHandler eventsapi.Handler diskStatsHelper metrics.DiskStatsHelper config Config + metricsAPI metrics.API } type Size int64 @@ -116,7 +117,7 @@ func (rl *Release) Cleanup(ctx context.Context) error { } // New constructs an installer cache with a given storage capacity -func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) { +func New(config Config, eventsHandler eventsapi.Handler, metricsAPI metrics.API, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) { if config.MaxCapacity > 0 && config.MaxReleaseSize == 0 { return nil, fmt.Errorf("config.MaxReleaseSize (%d bytes) must not be zero", config.MaxReleaseSize) } @@ -128,6 +129,7 @@ func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics eventsHandler: eventsHandler, diskStatsHelper: diskStatsHelper, config: config, + metricsAPI: metricsAPI, }, nil } @@ -138,14 +140,25 @@ func (i *Installers) Get(ctx context.Context, releaseID, releaseIDMirror, pullSe for { select { case <-ctx.Done(): - return nil, ctx.Err() + err := ctx.Err() + if err == context.DeadlineExceeded { + i.metricsAPI.InstallerCacheGetReleaseTimeout(releaseID) + } + return nil, err default: - release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID) + majorMinorVersion, err := ocRelease.GetMajorMinorVersion(i.log, releaseID, releaseIDMirror, pullSecret) + if err != nil { + i.log.Warnf("unable to get majorMinorVersion to record metric for %s falling back to full URI", releaseID) + majorMinorVersion = releaseID + } + release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID, majorMinorVersion) if err == nil { + i.metricsAPI.InstallerCacheGetReleaseOK(majorMinorVersion) return release, nil } _, isCapacityError := err.(*errorInsufficientCacheCapacity) if !isCapacityError { + i.metricsAPI.InstallerCacheGetReleaseError(releaseID) return nil, errors.Wrapf(err, "failed to get installer path for release %s", releaseID) } time.Sleep(i.config.ReleaseFetchRetryInterval) @@ -161,14 +174,16 @@ func (i *Installers) getDiskUsageIncludingHardlinks() (uint64, error) { return usedBytes, nil } -func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release) (extractDuration float64, cached bool, err error) { +func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release, majorMinorVersion string) (extractDuration float64, cached bool, err error) { _, err = os.Stat(path) if err == nil { + i.metricsAPI.InstallerCacheGetReleaseCached(majorMinorVersion, true) return 0, true, nil // release was found in the cache } if !os.IsNotExist(err) { return 0, false, err } + i.metricsAPI.InstallerCacheGetReleaseCached(majorMinorVersion, false) usedBytes, err := i.getDiskUsageIncludingHardlinks() if err != nil && !os.IsNotExist(err) { return 0, false, errors.Wrapf(err, "could not determine disk usage information for cache dir %s", i.config.CacheDir) @@ -184,7 +199,7 @@ func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pu return time.Since(extractStartTime).Seconds(), false, nil } -func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocRelease oc.Release, ocpVersion string, clusterID strfmt.UUID) (*Release, error) { +func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocRelease oc.Release, ocpVersion string, clusterID strfmt.UUID, majorMinorVersion string) (*Release, error) { i.Lock() defer i.Unlock() @@ -199,7 +214,7 @@ func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocReleas if err != nil { return nil, err } - release.extractDuration, release.cached, err = i.extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion, ocRelease) + release.extractDuration, release.cached, err = i.extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion, ocRelease, majorMinorVersion) if err != nil { return nil, err } @@ -310,8 +325,10 @@ func (i *Installers) evictFile(filePath string) error { i.log.Infof("evicting binary file %s due to storage pressure", filePath) err := os.Remove(filePath) if err != nil { + i.metricsAPI.InstallerCacheReleaseEvicted(false) return err } + i.metricsAPI.InstallerCacheReleaseEvicted(true) // if the parent directory was left empty, // remove it to avoid dangling directories parentDir := path.Dir(filePath) @@ -334,9 +351,9 @@ func (i *Installers) pruneExpiredHardLinks(links []*fileInfo, gracePeriod time.D grace := graceTime.Unix() if finfo.info.ModTime().Unix() < grace { i.log.Infof("attempting to prune hard link %s", finfo.path) - err := os.Remove(finfo.path) - if err != nil { + if err := os.Remove(finfo.path); err != nil { i.log.WithError(err).Errorf("failed to prune hard link %s", finfo.path) + continue } } } diff --git a/internal/installercache/installercache_test.go b/internal/installercache/installercache_test.go index 9e285e64d4..393d1b9bac 100644 --- a/internal/installercache/installercache_test.go +++ b/internal/installercache/installercache_test.go @@ -71,6 +71,7 @@ var _ = Describe("installer cache", func() { manager *Installers cacheDir string eventsHandler *eventsapi.MockHandler + metricsAPI *metrics.MockAPI ctx context.Context diskStatsHelper metrics.DiskStatsHelper ) @@ -85,17 +86,17 @@ var _ = Describe("installer cache", func() { } BeforeEach(func() { - ctrl = gomock.NewController(GinkgoT()) diskStatsHelper = metrics.NewOSDiskStatsHelper(logrus.New()) mockRelease = oc.NewMockRelease(ctrl) eventsHandler = eventsapi.NewMockHandler(ctrl) + metricsAPI = metrics.NewMockAPI(ctrl) var err error cacheDir, err = os.MkdirTemp("/tmp", "cacheDir") Expect(err).NotTo(HaveOccurred()) Expect(os.Mkdir(filepath.Join(cacheDir, "quay.io"), 0755)).To(Succeed()) Expect(os.Mkdir(filepath.Join(filepath.Join(cacheDir, "quay.io"), "release-dev"), 0755)).To(Succeed()) - manager, err = New(getInstallerCacheConfig(12, 5), eventsHandler, diskStatsHelper, logrus.New()) + manager, err = New(getInstallerCacheConfig(12, 5), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).NotTo(HaveOccurred()) ctx = context.TODO() }) @@ -111,7 +112,7 @@ var _ = Describe("installer cache", func() { ).AnyTimes() } - mockReleaseCalls := func(releaseID string, version string) { + mockReleaseCalls := func(releaseID string, version string, expectedMajorMinorVersion string) { workdir := filepath.Join(manager.config.CacheDir, "quay.io", "release-dev") fname := filepath.Join(workdir, releaseID) @@ -131,15 +132,21 @@ var _ = Describe("installer cache", func() { mockRelease.EXPECT().Extract(gomock.Any(), releaseID, gomock.Any(), manager.config.CacheDir, gomock.Any(), version). DoAndReturn(writeMockedReleaseToDisk).AnyTimes() + + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(expectedMajorMinorVersion).Times(1) } - testGet := func(releaseID, version string, clusterID strfmt.UUID, expectCached bool) (string, string) { + testGet := func(releaseID, version string, clusterID strfmt.UUID, expectCached bool, expectedMajorMinorVersion string) (string, string) { workdir := filepath.Join(manager.config.CacheDir, "quay.io", "release-dev") fname := filepath.Join(workdir, releaseID) if !expectCached { - mockReleaseCalls(releaseID, version) + mockReleaseCalls(releaseID, version, expectedMajorMinorVersion) } expectEventsSent() + mockReleaseCalls(releaseID, version, expectedMajorMinorVersion) + expectEventsSent() + mockRelease.EXPECT().GetMajorMinorVersion(gomock.Any(), releaseID, gomock.Any(), gomock.Any()).Return(expectedMajorMinorVersion, nil).Times(1) + metricsAPI.EXPECT().InstallerCacheGetReleaseCached(expectedMajorMinorVersion, gomock.Any()).AnyTimes() l, err := manager.Get(ctx, releaseID, "mirror", "pull-secret", mockRelease, version, clusterID) Expect(err).ShouldNot(HaveOccurred()) Expect(l.releaseID).To(Equal(releaseID)) @@ -157,9 +164,10 @@ var _ = Describe("installer cache", func() { } type test struct { - releaseID string - version string - clusterID strfmt.UUID + releaseID string + version string + clusterID strfmt.UUID + majorMinorVersion string } getUsedBytesForDirectory := func(directory string) uint64 { @@ -188,7 +196,11 @@ var _ = Describe("installer cache", func() { runTest := func(t test, manager *Installers) (*Release, error) { expectEventsSent() - mockReleaseCalls(t.releaseID, t.version) + mockReleaseCalls(t.releaseID, t.version, t.majorMinorVersion) + mockRelease.EXPECT().GetMajorMinorVersion(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(t.majorMinorVersion, nil).AnyTimes() + metricsAPI.EXPECT().InstallerCacheGetReleaseCached(t.majorMinorVersion, gomock.Any()).AnyTimes() + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(t.majorMinorVersion).AnyTimes() + metricsAPI.EXPECT().InstallerCacheReleaseEvicted(gomock.Any()).AnyTimes() return manager.Get(ctx, t.releaseID, "mirror", "pull-secret", mockRelease, t.version, t.clusterID) } @@ -221,7 +233,7 @@ var _ = Describe("installer cache", func() { // returns the first error encountered or nil if no error encountered. runParallelTest := func(maxCapacity int64, maxReleaseSize int64, tests []test) error { var err error - manager, err = New(getInstallerCacheConfig(maxCapacity, maxReleaseSize), eventsHandler, diskStatsHelper, getLogger()) + manager, err = New(getInstallerCacheConfig(maxCapacity, maxReleaseSize), eventsHandler, metricsAPI, diskStatsHelper, getLogger()) Expect(err).ToNot(HaveOccurred()) var wg sync.WaitGroup var reportedError error @@ -244,11 +256,11 @@ var _ = Describe("installer cache", func() { maxCapacity := int64(10) maxReleaseSize := int64(5) err := runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, }) Expect(err).ToNot(HaveOccurred()) // Now measure disk usage, we should be under the cache size @@ -259,11 +271,11 @@ var _ = Describe("installer cache", func() { maxCapacity := int64(25) maxReleaseSize := int64(5) err := runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.18.11-x86_64", version: "4.18.11"}, - {releaseID: "4.19.11-x86_64", version: "4.19.11"}, - {releaseID: "4.20.11-x86_64", version: "4.20.11"}, - {releaseID: "4.21.11-x86_64", version: "4.21.11"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.18.11-x86_64", version: "4.18.11", majorMinorVersion: "4.17"}, + {releaseID: "4.19.11-x86_64", version: "4.19.11", majorMinorVersion: "4.17"}, + {releaseID: "4.20.11-x86_64", version: "4.20.11", majorMinorVersion: "4.17"}, + {releaseID: "4.21.11-x86_64", version: "4.21.11", majorMinorVersion: "4.17"}, }) Expect(err).ToNot(HaveOccurred()) // Now measure disk usage, we should be under the cache size @@ -274,8 +286,8 @@ var _ = Describe("installer cache", func() { maxCapacity := int64(5) maxReleaseSize := int64(5) err := runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.17.11-x86_64", version: "4.17.11"}, - {releaseID: "4.18.11-x86_64", version: "4.18.11"}, + {releaseID: "4.17.11-x86_64", version: "4.17.11", majorMinorVersion: "4.17"}, + {releaseID: "4.18.11-x86_64", version: "4.18.11", majorMinorVersion: "4.18"}, }) Expect(err).ToNot(HaveOccurred()) // Now measure disk usage, we should be under the cache size @@ -284,41 +296,41 @@ var _ = Describe("installer cache", func() { // Now assert that a retry would work, there should be enough space for another release // use a brand new release ID to prove we are not hitting cache here. err = runParallelTest(maxCapacity, maxReleaseSize, []test{ - {releaseID: "4.19.11-x86_64", version: "4.19.11"}, + {releaseID: "4.19.11-x86_64", version: "4.19.11", majorMinorVersion: "4.19"}, }) Expect(err).ToNot(HaveOccurred()) }) It("Should raise error on construction if max release size is larger than cache and cache is enabled", func() { - _, err := New(getInstallerCacheConfig(5, 10), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(5, 10), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(Equal("config.MaxReleaseSize (10 bytes) must not be greater than config.MaxCapacity (5 bytes)")) }) It("Should raise error on construction if max release size is zero and cache is enabled", func() { - _, err := New(getInstallerCacheConfig(5, 0), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(5, 0), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(Equal("config.MaxReleaseSize (0 bytes) must not be zero")) }) It("Should not raise error on construction if max release size is larger than cache and cache eviction is disabled", func() { - _, err := New(getInstallerCacheConfig(0, 10), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(0, 10), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).ToNot(HaveOccurred()) }) It("Should not raise error on construction if max release size is zero and cache eviction is disabled", func() { - _, err := New(getInstallerCacheConfig(0, 0), eventsHandler, diskStatsHelper, logrus.New()) + _, err := New(getInstallerCacheConfig(0, 0), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).ToNot(HaveOccurred()) }) It("when cache limit is zero - eviction is skipped", func() { var err error - manager, err = New(getInstallerCacheConfig(0, 5), eventsHandler, diskStatsHelper, logrus.New()) + manager, err = New(getInstallerCacheConfig(0, 5), eventsHandler, metricsAPI, diskStatsHelper, logrus.New()) Expect(err).ToNot(HaveOccurred()) clusterId := strfmt.UUID(uuid.New().String()) - r1, _ := testGet("4.8", "4.8.0", clusterId, false) - r2, _ := testGet("4.9", "4.9.0", clusterId, false) - r3, _ := testGet("4.10", "4.10.0", clusterId, false) + r1, _ := testGet("4.8", "4.8.0", clusterId, false, "4.8") + r2, _ := testGet("4.9", "4.9.0", clusterId, false, "4.9") + r3, _ := testGet("4.10", "4.10.0", clusterId, false, "4.10") By("verify that the no file was deleted") _, err = os.Stat(r1) @@ -329,12 +341,14 @@ var _ = Describe("installer cache", func() { Expect(os.IsNotExist(err)).To(BeFalse()) }) - It("exising files access time is updated", func() { + It("existing files access time is updated", func() { + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(gomock.Any()).AnyTimes() clusterId := strfmt.UUID(uuid.New().String()) - _, _ = testGet("4.8", "4.8.0", clusterId, false) - r2, _ := testGet("4.9", "4.9.0", clusterId, false) - r1, _ := testGet("4.8", "4.8.0", clusterId, true) - r3, _ := testGet("4.10", "4.10.0", clusterId, false) + _, _ = testGet("4.8", "4.8.0", clusterId, false, "4.8") + r2, _ := testGet("4.9", "4.9.0", clusterId, false, "4.9") + r1, _ := testGet("4.8", "4.8.0", clusterId, true, "4.8") + metricsAPI.EXPECT().InstallerCacheReleaseEvicted(true).Times(1) + r3, _ := testGet("4.10", "4.10.0", clusterId, false, "4.10") By("verify that the oldest file was deleted") _, err := os.Stat(r1) @@ -348,10 +362,12 @@ var _ = Describe("installer cache", func() { }) It("evicts the oldest file", func() { + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(gomock.Any()).AnyTimes() clusterId := strfmt.UUID(uuid.New().String()) - r1, _ := testGet("4.8", "4.8.0", clusterId, false) - r2, _ := testGet("4.9", "4.9.0", clusterId, false) - r3, _ := testGet("4.10", "4.10.0", clusterId, false) + r1, _ := testGet("4.8", "4.8.0", clusterId, false, "4.8") + r2, _ := testGet("4.9", "4.9.0", clusterId, false, "4.9") + metricsAPI.EXPECT().InstallerCacheReleaseEvicted(true).Times(1) + r3, _ := testGet("4.10", "4.10.0", clusterId, false, "4.10") By("verify that the oldest file was deleted") _, err := os.Stat(r1) @@ -366,11 +382,16 @@ var _ = Describe("installer cache", func() { }) It("extracts a release", func() { + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(gomock.Any()).AnyTimes() releaseID := "4.10-orig" releaseMirrorID := "" version := "4.10.0" + majorMinorVersion := "4.10" clusterID := strfmt.UUID(uuid.NewString()) - mockReleaseCalls(releaseID, version) + mockRelease.EXPECT().GetMajorMinorVersion(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(majorMinorVersion, nil).AnyTimes() + mockReleaseCalls(releaseID, version, majorMinorVersion) + metricsAPI.EXPECT().InstallerCacheGetReleaseCached(majorMinorVersion, false) + metricsAPI.EXPECT().InstallerCacheGetReleaseOK(majorMinorVersion).Times(1) l, err := manager.Get(ctx, releaseID, releaseMirrorID, "pull-secret", mockRelease, version, clusterID) Expect(err).ShouldNot(HaveOccurred()) Expect(l.releaseID).To(Equal(releaseID)) @@ -389,7 +410,6 @@ var _ = Describe("installer cache", func() { numberOfLinks := 10 numberOfExpiredLinks := 5 - directory, err := os.MkdirTemp("", "testPruneExpiredHardLinks") Expect(err).ToNot(HaveOccurred()) diff --git a/internal/metrics/metricsManager.go b/internal/metrics/metricsManager.go index 57b8d7235f..787e61b411 100644 --- a/internal/metrics/metricsManager.go +++ b/internal/metrics/metricsManager.go @@ -3,6 +3,7 @@ package metrics import ( "context" "encoding/json" + "fmt" "time" "github.com/alecthomas/units" @@ -37,8 +38,12 @@ const ( counterClusterValidationFailed = "assisted_installer_cluster_validation_is_in_failed_status_on_cluster_deletion" counterClusterValidationChanged = "assisted_installer_cluster_validation_failed_after_success_before_installation" counterFilesystemUsagePercentage = "assisted_installer_filesystem_usage_percentage" - histogramMonitoredHostsDurationMs = "assisted_installer_monitored_hosts_duration_ms" - histogramMonitoredClustersDurationMs = "assisted_installer_monitored_clusters_duration_ms" + counterMonitoredHosts = "assisted_installer_monitored_hosts" + counterMonitoredClusters = "assisted_installer_monitored_clusters" + counterInstallerCacheGetRelease = "assisted_installer_cache_get_release" + counterInstallerCacheReleaseCached = "assisted_installer_cache_get_release_cached" + counterInstallerCacheReleaseEvicted = "assisted_installer_cache_release_evicted" + counterInstallerCacheGetReleaseCached = "assisted_installer_cache_release_cached" ) const ( @@ -59,8 +64,11 @@ const ( counterDescriptionClusterValidationFailed = "Number of cluster validation errors" counterDescriptionClusterValidationChanged = "Number of cluster validations that already succeed but start to fail again" counterDescriptionFilesystemUsagePercentage = "The percentage of the filesystem usage by the service" - histogramDescriptionMonitoredHostsDurationMs = "Histogram/sum/count of monitored hosts duration (ms)" - histogramDescriptionMonitoredClustersDurationMs = "Histogram/sum/count of monitored clusters duration (ms)" + counterDescriptionMonitoredHosts = "Number of hosts monitored by host monitor" + counterDescriptionMonitoredClusters = "Number of clusters monitored by cluster monitor" + counterDescriptionInstallerCacheGetRelease = "Counts the number of times a release was attempted, with the outcome as a label, cache status as label" + counterDescriptionInstallerCacheReleaseEvicted = "Counts the number of times that a release was evicted, label with success or fail of eviction" + counterDescriptionInstallerCacheGetReleaseCached = "Counts the number of times that a release was either cached or not cached" ) const ( @@ -77,6 +85,15 @@ const ( imageLabel = "imageName" hosts = "hosts" clusters = "clusters" + labelStatus = "status" + labelValueOK = "ok" + labelValueError = "error" + labelCacheHit = "hit" + labelReleaseID = "releaseId" + labelSucceess = "succeess" + labelErrorType = "errorType" + labelValueTimeout = "timeout" + labelValueOther = "other" ) type API interface { @@ -92,8 +109,13 @@ type API interface { DiskSyncDuration(syncDuration int64) ImagePullStatus(imageName, resultStatus string, downloadRate float64) FileSystemUsage(usageInPercentage float64) - MonitoredHostsDurationMs(monitoredHostsMillis float64) - MonitoredClustersDurationMs(monitoredClustersMillis float64) + MonitoredHostsCount(monitoredHosts int64) + MonitoredClusterCount(monitoredClusters int64) + InstallerCacheGetReleaseOK(releaseID string) + InstallerCacheGetReleaseTimeout(releaseID string) + InstallerCacheGetReleaseError(releaseID string) + InstallerCacheReleaseEvicted(succeeded bool) + InstallerCacheGetReleaseCached(releaseId string, cacheHit bool) } type MetricsManager struct { @@ -117,9 +139,13 @@ type MetricsManager struct { serviceLogicClusterValidationFailed *prometheus.CounterVec serviceLogicClusterValidationChanged *prometheus.CounterVec serviceLogicFilesystemUsagePercentage *prometheus.GaugeVec - serviceLogicMonitoredHostsDurationMs *prometheus.HistogramVec - serviceLogicMonitoredClustersDurationMs *prometheus.HistogramVec - collectors []prometheus.Collector + serviceLogicMonitoredHosts *prometheus.GaugeVec + serviceLogicMonitoredClusters *prometheus.GaugeVec + serviceLogicInstallerCacheGetRelease *prometheus.CounterVec + serviceLogicInstallerCacheReleaseEvicted *prometheus.CounterVec + serviceLogicInstallerCacheGetReleaseCached *prometheus.CounterVec + + collectors []prometheus.Collector } var _ API = &MetricsManager{} @@ -275,21 +301,43 @@ func NewMetricsManager(registry prometheus.Registerer, eventsHandler eventsapi.H }, []string{}, ), - serviceLogicMonitoredHostsDurationMs: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + serviceLogicMonitoredHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: subsystem, - Name: histogramMonitoredHostsDurationMs, - Help: histogramDescriptionMonitoredHostsDurationMs, - Buckets: []float64{10, 100, 200, 500, 1000, 10000, 30000}, - }, []string{}), + Name: counterMonitoredHosts, + Help: counterDescriptionMonitoredHosts, + }, []string{hosts}), - serviceLogicMonitoredClustersDurationMs: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + serviceLogicMonitoredClusters: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: subsystem, - Name: histogramMonitoredClustersDurationMs, - Help: histogramDescriptionMonitoredClustersDurationMs, - Buckets: []float64{10, 100, 200, 500, 1000, 10000, 30000}, - }, []string{}), + Name: counterMonitoredClusters, + Help: counterDescriptionMonitoredClusters, + }, []string{hosts}), + + serviceLogicInstallerCacheGetRelease: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: counterInstallerCacheGetRelease, + Help: counterDescriptionInstallerCacheGetRelease, + }, []string{labelStatus, labelReleaseID}), + + serviceLogicInstallerCacheReleaseEvicted: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: counterInstallerCacheReleaseEvicted, + Help: counterDescriptionInstallerCacheReleaseEvicted, + }, []string{labelStatus}), + + serviceLogicInstallerCacheGetReleaseCached: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: counterInstallerCacheGetReleaseCached, + Help: counterDescriptionInstallerCacheGetReleaseCached, + }, []string{labelReleaseID, labelCacheHit}), } m.collectors = append(m.collectors, newDirectoryUsageCollector(metricsManagerConfig.DirectoryUsageMonitorConfig.Directories, diskStatsHelper, log)) @@ -312,8 +360,10 @@ func NewMetricsManager(registry prometheus.Registerer, eventsHandler eventsapi.H m.serviceLogicClusterValidationChanged, m.serviceLogicClusterImagePullStatus, m.serviceLogicFilesystemUsagePercentage, - m.serviceLogicMonitoredHostsDurationMs, - m.serviceLogicMonitoredClustersDurationMs, + m.serviceLogicMonitoredHosts, + m.serviceLogicMonitoredClusters, + m.serviceLogicInstallerCacheGetRelease, + m.serviceLogicInstallerCacheReleaseEvicted, ) for _, collector := range m.collectors { @@ -477,14 +527,34 @@ func (m *MetricsManager) FileSystemUsage(usageInPercentage float64) { m.serviceLogicFilesystemUsagePercentage.WithLabelValues().Set(usageInPercentage) } -func (m *MetricsManager) MonitoredHostsDurationMs(monitoredHostsMs float64) { - m.serviceLogicMonitoredHostsDurationMs.WithLabelValues().Observe(monitoredHostsMs) +func (m *MetricsManager) MonitoredHostsCount(monitoredHosts int64) { + m.serviceLogicMonitoredHosts.WithLabelValues(hosts).Set(float64(monitoredHosts)) } -func (m *MetricsManager) MonitoredClustersDurationMs(monitoredClustersMs float64) { - m.serviceLogicMonitoredClustersDurationMs.WithLabelValues().Observe(monitoredClustersMs) +func (m *MetricsManager) MonitoredClusterCount(monitoredClusters int64) { + m.serviceLogicMonitoredClusters.WithLabelValues(clusters).Set(float64(monitoredClusters)) } func bytesToGib(bytes int64) int64 { return bytes / int64(units.GiB) } + +func (m *MetricsManager) InstallerCacheGetReleaseCached(releaseId string, cacheHit bool) { + m.serviceLogicInstallerCacheGetReleaseCached.WithLabelValues(releaseId, fmt.Sprintf("%t", cacheHit)).Inc() +} + +func (m *MetricsManager) InstallerCacheGetReleaseOK(releaseId string) { + m.serviceLogicInstallerCacheGetRelease.WithLabelValues(labelValueOK, releaseId).Inc() +} + +func (m *MetricsManager) InstallerCacheGetReleaseTimeout(releaseId string) { + m.serviceLogicInstallerCacheGetRelease.WithLabelValues(labelValueTimeout, releaseId).Inc() +} + +func (m *MetricsManager) InstallerCacheGetReleaseError(releaseId string) { + m.serviceLogicInstallerCacheGetRelease.WithLabelValues(labelValueError, releaseId).Inc() +} + +func (m *MetricsManager) InstallerCacheReleaseEvicted(succeeded bool) { + m.serviceLogicInstallerCacheReleaseEvicted.WithLabelValues(fmt.Sprintf("%t", succeeded)).Inc() +} diff --git a/internal/metrics/mock_metrics_manager_api.go b/internal/metrics/mock_metrics_manager_api.go index 26e3e09791..17f3ee599e 100644 --- a/internal/metrics/mock_metrics_manager_api.go +++ b/internal/metrics/mock_metrics_manager_api.go @@ -169,28 +169,88 @@ func (mr *MockAPIMockRecorder) InstallationStarted() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallationStarted", reflect.TypeOf((*MockAPI)(nil).InstallationStarted)) } -// MonitoredClustersDurationMs mocks base method. -func (m *MockAPI) MonitoredClustersDurationMs(monitoredClustersMillis float64) { +// InstallerCacheGetReleaseCached mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseCached(releaseId string, cacheHit bool) { m.ctrl.T.Helper() - m.ctrl.Call(m, "MonitoredClustersDurationMs", monitoredClustersMillis) + m.ctrl.Call(m, "InstallerCacheGetReleaseCached", releaseId, cacheHit) } -// MonitoredClustersDurationMs indicates an expected call of MonitoredClustersDurationMs. -func (mr *MockAPIMockRecorder) MonitoredClustersDurationMs(monitoredClustersMillis interface{}) *gomock.Call { +// InstallerCacheGetReleaseCached indicates an expected call of InstallerCacheGetReleaseCached. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseCached(releaseId, cacheHit interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MonitoredClustersDurationMs", reflect.TypeOf((*MockAPI)(nil).MonitoredClustersDurationMs), monitoredClustersMillis) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseCached", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseCached), releaseId, cacheHit) } -// MonitoredHostsDurationMs mocks base method. -func (m *MockAPI) MonitoredHostsDurationMs(monitoredHostsMillis float64) { +// InstallerCacheGetReleaseError mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseError(releaseID string) { m.ctrl.T.Helper() - m.ctrl.Call(m, "MonitoredHostsDurationMs", monitoredHostsMillis) + m.ctrl.Call(m, "InstallerCacheGetReleaseError", releaseID) } -// MonitoredHostsDurationMs indicates an expected call of MonitoredHostsDurationMs. -func (mr *MockAPIMockRecorder) MonitoredHostsDurationMs(monitoredHostsMillis interface{}) *gomock.Call { +// InstallerCacheGetReleaseError indicates an expected call of InstallerCacheGetReleaseError. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseError(releaseID interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MonitoredHostsDurationMs", reflect.TypeOf((*MockAPI)(nil).MonitoredHostsDurationMs), monitoredHostsMillis) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseError", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseError), releaseID) +} + +// InstallerCacheGetReleaseOK mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseOK(releaseID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheGetReleaseOK", releaseID) +} + +// InstallerCacheGetReleaseOK indicates an expected call of InstallerCacheGetReleaseOK. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseOK(releaseID interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseOK", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseOK), releaseID) +} + +// InstallerCacheGetReleaseTimeout mocks base method. +func (m *MockAPI) InstallerCacheGetReleaseTimeout(releaseID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheGetReleaseTimeout", releaseID) +} + +// InstallerCacheGetReleaseTimeout indicates an expected call of InstallerCacheGetReleaseTimeout. +func (mr *MockAPIMockRecorder) InstallerCacheGetReleaseTimeout(releaseID interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheGetReleaseTimeout", reflect.TypeOf((*MockAPI)(nil).InstallerCacheGetReleaseTimeout), releaseID) +} + +// InstallerCacheReleaseEvicted mocks base method. +func (m *MockAPI) InstallerCacheReleaseEvicted(succeeded bool) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "InstallerCacheReleaseEvicted", succeeded) +} + +// InstallerCacheReleaseEvicted indicates an expected call of InstallerCacheReleaseEvicted. +func (mr *MockAPIMockRecorder) InstallerCacheReleaseEvicted(succeeded interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallerCacheReleaseEvicted", reflect.TypeOf((*MockAPI)(nil).InstallerCacheReleaseEvicted), succeeded) +} + +// MonitoredClusterCount mocks base method. +func (m *MockAPI) MonitoredClusterCount(monitoredClusters int64) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "MonitoredClusterCount", monitoredClusters) +} + +// MonitoredClusterCount indicates an expected call of MonitoredClusterCount. +func (mr *MockAPIMockRecorder) MonitoredClusterCount(monitoredClusters interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MonitoredClusterCount", reflect.TypeOf((*MockAPI)(nil).MonitoredClusterCount), monitoredClusters) +} + +// MonitoredHostsCount mocks base method. +func (m *MockAPI) MonitoredHostsCount(monitoredHosts int64) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "MonitoredHostsCount", monitoredHosts) +} + +// MonitoredHostsCount indicates an expected call of MonitoredHostsCount. +func (mr *MockAPIMockRecorder) MonitoredHostsCount(monitoredHosts interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MonitoredHostsCount", reflect.TypeOf((*MockAPI)(nil).MonitoredHostsCount), monitoredHosts) } // ReportHostInstallationMetrics mocks base method.