Skip to content

Commit

Permalink
MGMT-19840: Gather operational metrics from installercache
Browse files Browse the repository at this point in the history
The intent of this PR is to track the following statistics, implemented as counters that are incremented from the applicable parts of the solution.

	counterDescriptionInstallerCachePrunedHardlink           "Counts the number of times the installercache pruned a hardlink for being too old"
	counterDescriptionInstallerCacheGetReleaseOK             "Counts the number of times that a release was fetched successfully"
	counterDescriptionInstallerCacheGetReleaseTimeout        "Counts the number of times that a release timed out or had the context cancelled"
	counterDescriptionInstallerCacheGetReleaseError          "Counts the number of times that a release fetch resulted in error"
	counterDescriptionInstallerCacheReleaseCached            "Counts the number of times that a release was found in the cache"
	counterDescriptionInstallerCacheReleaseExtracted         "Counts the number of times that a release was extracted"
	counterDescriptionInstallerCacheTryEviction              "Counts the number of times that the eviction function was called"
	counterDescriptionInstallerCacheReleaseEvicted           "Counts the number of times that a release was evicted"

This, combined with the event-based metrics gathered in #7156, should provide enough information to track the behaviour of the cache.
  • Loading branch information
paul-maidment committed Feb 27, 2025
1 parent 86a016c commit 7b79d30
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 56 deletions.
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ func main() {
Options.BMConfig.S3EndpointURL = newUrl

Options.InstallerCacheConfig.CacheDir = filepath.Join(Options.GeneratorConfig.GetWorkingDirectory(), "installercache")
installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, diskStatsHelper, log)
installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, metricsManager, diskStatsHelper, log)
failOnError(err, "failed to instantiate installercache")

generator := generator.New(log, objectHandler, Options.GeneratorConfig, providerRegistry, manifestsApi, eventsHandler, installerCache)
Expand Down
20 changes: 15 additions & 5 deletions internal/ignition/installmanifests_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ var _ = Describe("Bootstrap Ignition Update", func() {
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler *eventsapi.MockHandler
installerCache *installercache.Installers
metricsAPI *metrics.MockAPI
)

BeforeEach(func() {
Expand All @@ -105,12 +106,13 @@ var _ = Describe("Bootstrap Ignition Update", func() {
err1 = os.WriteFile(examplePath, []byte(bootstrap1), 0600)
Expect(err1).NotTo(HaveOccurred())
ctrl = gomock.NewController(GinkgoT())
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
mockS3Client = s3wrapper.NewMockAPI(ctrl)
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
Expand Down Expand Up @@ -262,6 +264,7 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6
ctrl *gomock.Controller
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand All @@ -286,12 +289,13 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6
ctrl = gomock.NewController(GinkgoT())
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down Expand Up @@ -456,6 +460,7 @@ var _ = Describe("createHostIgnitions", func() {
workDir string
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand All @@ -476,13 +481,14 @@ var _ = Describe("createHostIgnitions", func() {
mockS3Client = s3wrapper.NewMockAPI(ctrl)
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
cluster = testCluster()
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down Expand Up @@ -1779,6 +1785,7 @@ var _ = Describe("Bare metal host generation", func() {
ctrl *gomock.Controller
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand All @@ -1789,13 +1796,14 @@ var _ = Describe("Bare metal host generation", func() {
ctrl = gomock.NewController(GinkgoT())
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
ReleaseFetchRetryInterval: 1 * time.Microsecond,
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down Expand Up @@ -1889,6 +1897,7 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() {
ctrl *gomock.Controller
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand Down Expand Up @@ -1920,13 +1929,14 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() {
ctrl = gomock.NewController(GinkgoT())
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
ReleaseFetchRetryInterval: 1 * time.Microsecond,
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down
33 changes: 25 additions & 8 deletions internal/installercache/installercache.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ type Installers struct {
eventsHandler eventsapi.Handler
diskStatsHelper metrics.DiskStatsHelper
config Config
metricsAPI metrics.API
}

type Size int64
Expand Down Expand Up @@ -116,7 +117,7 @@ func (rl *Release) Cleanup(ctx context.Context) error {
}

// New constructs an installer cache with a given storage capacity
func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) {
func New(config Config, eventsHandler eventsapi.Handler, metricsAPI metrics.API, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) {
if config.MaxCapacity > 0 && config.MaxReleaseSize == 0 {
return nil, fmt.Errorf("config.MaxReleaseSize (%d bytes) must not be zero", config.MaxReleaseSize)
}
Expand All @@ -128,6 +129,7 @@ func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics
eventsHandler: eventsHandler,
diskStatsHelper: diskStatsHelper,
config: config,
metricsAPI: metricsAPI,
}, nil
}

Expand All @@ -138,14 +140,25 @@ func (i *Installers) Get(ctx context.Context, releaseID, releaseIDMirror, pullSe
for {
select {
case <-ctx.Done():
return nil, ctx.Err()
err := ctx.Err()
if err == context.DeadlineExceeded {
i.metricsAPI.InstallerCacheGetReleaseTimeout(releaseID)
}
return nil, err
default:
release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID)
majorMinorVersion, err := ocRelease.GetMajorMinorVersion(i.log, releaseID, releaseIDMirror, pullSecret)
if err != nil {
i.log.Warnf("unable to get majorMinorVersion to record metric for %s falling back to full URI", releaseID)
majorMinorVersion = releaseID
}
release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID, majorMinorVersion)
if err == nil {
i.metricsAPI.InstallerCacheGetReleaseOK(majorMinorVersion)
return release, nil
}
_, isCapacityError := err.(*errorInsufficientCacheCapacity)
if !isCapacityError {
i.metricsAPI.InstallerCacheGetReleaseError(releaseID)
return nil, errors.Wrapf(err, "failed to get installer path for release %s", releaseID)
}
time.Sleep(i.config.ReleaseFetchRetryInterval)
Expand All @@ -161,14 +174,16 @@ func (i *Installers) getDiskUsageIncludingHardlinks() (uint64, error) {
return usedBytes, nil
}

func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release) (extractDuration float64, cached bool, err error) {
func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release, majorMinorVersion string) (extractDuration float64, cached bool, err error) {
_, err = os.Stat(path)
if err == nil {
i.metricsAPI.InstallerCacheGetReleaseCached(majorMinorVersion, true)
return 0, true, nil // release was found in the cache
}
if !os.IsNotExist(err) {
return 0, false, err
}
i.metricsAPI.InstallerCacheGetReleaseCached(majorMinorVersion, false)
usedBytes, err := i.getDiskUsageIncludingHardlinks()
if err != nil && !os.IsNotExist(err) {
return 0, false, errors.Wrapf(err, "could not determine disk usage information for cache dir %s", i.config.CacheDir)
Expand All @@ -184,7 +199,7 @@ func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pu
return time.Since(extractStartTime).Seconds(), false, nil
}

func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocRelease oc.Release, ocpVersion string, clusterID strfmt.UUID) (*Release, error) {
func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocRelease oc.Release, ocpVersion string, clusterID strfmt.UUID, majorMinorVersion string) (*Release, error) {
i.Lock()
defer i.Unlock()

Expand All @@ -199,7 +214,7 @@ func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocReleas
if err != nil {
return nil, err
}
release.extractDuration, release.cached, err = i.extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion, ocRelease)
release.extractDuration, release.cached, err = i.extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion, ocRelease, majorMinorVersion)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -310,8 +325,10 @@ func (i *Installers) evictFile(filePath string) error {
i.log.Infof("evicting binary file %s due to storage pressure", filePath)
err := os.Remove(filePath)
if err != nil {
i.metricsAPI.InstallerCacheReleaseEvicted(false)
return err
}
i.metricsAPI.InstallerCacheReleaseEvicted(true)
// if the parent directory was left empty,
// remove it to avoid dangling directories
parentDir := path.Dir(filePath)
Expand All @@ -334,9 +351,9 @@ func (i *Installers) pruneExpiredHardLinks(links []*fileInfo, gracePeriod time.D
grace := graceTime.Unix()
if finfo.info.ModTime().Unix() < grace {
i.log.Infof("attempting to prune hard link %s", finfo.path)
err := os.Remove(finfo.path)
if err != nil {
if err := os.Remove(finfo.path); err != nil {
i.log.WithError(err).Errorf("failed to prune hard link %s", finfo.path)
continue
}
}
}
Expand Down
Loading

0 comments on commit 7b79d30

Please sign in to comment.