Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MGMT-19840: Gather operational metrics from installercache #7281

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ func main() {
Options.BMConfig.S3EndpointURL = newUrl

Options.InstallerCacheConfig.CacheDir = filepath.Join(Options.GeneratorConfig.GetWorkingDirectory(), "installercache")
installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, diskStatsHelper, log)
installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, metricsManager, diskStatsHelper, log)
failOnError(err, "failed to instantiate installercache")

generator := generator.New(log, objectHandler, Options.GeneratorConfig, providerRegistry, manifestsApi, eventsHandler, installerCache)
Expand Down
20 changes: 15 additions & 5 deletions internal/ignition/installmanifests_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ var _ = Describe("Bootstrap Ignition Update", func() {
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler *eventsapi.MockHandler
installerCache *installercache.Installers
metricsAPI *metrics.MockAPI
)

BeforeEach(func() {
Expand All @@ -105,12 +106,13 @@ var _ = Describe("Bootstrap Ignition Update", func() {
err1 = os.WriteFile(examplePath, []byte(bootstrap1), 0600)
Expect(err1).NotTo(HaveOccurred())
ctrl = gomock.NewController(GinkgoT())
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
mockS3Client = s3wrapper.NewMockAPI(ctrl)
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
Expand Down Expand Up @@ -262,6 +264,7 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6
ctrl *gomock.Controller
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand All @@ -286,12 +289,13 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6
ctrl = gomock.NewController(GinkgoT())
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down Expand Up @@ -456,6 +460,7 @@ var _ = Describe("createHostIgnitions", func() {
workDir string
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand All @@ -476,13 +481,14 @@ var _ = Describe("createHostIgnitions", func() {
mockS3Client = s3wrapper.NewMockAPI(ctrl)
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
cluster = testCluster()
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down Expand Up @@ -1779,6 +1785,7 @@ var _ = Describe("Bare metal host generation", func() {
ctrl *gomock.Controller
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand All @@ -1789,13 +1796,14 @@ var _ = Describe("Bare metal host generation", func() {
ctrl = gomock.NewController(GinkgoT())
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
ReleaseFetchRetryInterval: 1 * time.Microsecond,
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down Expand Up @@ -1889,6 +1897,7 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() {
ctrl *gomock.Controller
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand Down Expand Up @@ -1920,13 +1929,14 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() {
ctrl = gomock.NewController(GinkgoT())
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
ReleaseFetchRetryInterval: 1 * time.Microsecond,
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down
33 changes: 25 additions & 8 deletions internal/installercache/installercache.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ type Installers struct {
eventsHandler eventsapi.Handler
diskStatsHelper metrics.DiskStatsHelper
config Config
metricsAPI metrics.API
}

type Size int64
Expand Down Expand Up @@ -116,7 +117,7 @@ func (rl *Release) Cleanup(ctx context.Context) error {
}

// New constructs an installer cache with a given storage capacity
func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) {
func New(config Config, eventsHandler eventsapi.Handler, metricsAPI metrics.API, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) {
if config.MaxCapacity > 0 && config.MaxReleaseSize == 0 {
return nil, fmt.Errorf("config.MaxReleaseSize (%d bytes) must not be zero", config.MaxReleaseSize)
}
Expand All @@ -128,6 +129,7 @@ func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics
eventsHandler: eventsHandler,
diskStatsHelper: diskStatsHelper,
config: config,
metricsAPI: metricsAPI,
}, nil
}

Expand All @@ -138,14 +140,25 @@ func (i *Installers) Get(ctx context.Context, releaseID, releaseIDMirror, pullSe
for {
select {
case <-ctx.Done():
return nil, ctx.Err()
err := ctx.Err()
if err == context.DeadlineExceeded {
i.metricsAPI.InstallerCacheGetReleaseTimeout(releaseID)
}
return nil, err
default:
release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID)
majorMinorVersion, err := ocRelease.GetMajorMinorVersion(i.log, releaseID, releaseIDMirror, pullSecret)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be done outside the for/select loop.

if err != nil {
i.log.Warnf("unable to get majorMinorVersion to record metric for %s falling back to full URI", releaseID)
majorMinorVersion = releaseID
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed, releaseID should not be used as the value. We can set "unknown" or "undefined" or something similar if we cannot extract the value from the release (which should never happen anyway).

}
release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID, majorMinorVersion)
if err == nil {
i.metricsAPI.InstallerCacheGetReleaseOK(majorMinorVersion)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We agreed on having 2 metrics:

  • assisted_installer_cache_get_release{hit="(true|false)"}
  • assisted_installer_cache_get_release_error{error="(timeout|whatever|other/unknown/undefined)"}

I still think the error metric won't be of much (or any) use, but we definitely do not want to count all requests there. My understanding was that hits + misses + errors should add up to the total number of requests to this function.

Did I misunderstand anything?

return release, nil
}
_, isCapacityError := err.(*errorInsufficientCacheCapacity)
if !isCapacityError {
i.metricsAPI.InstallerCacheGetReleaseError(releaseID)
return nil, errors.Wrapf(err, "failed to get installer path for release %s", releaseID)
}
time.Sleep(i.config.ReleaseFetchRetryInterval)
Expand All @@ -161,14 +174,16 @@ func (i *Installers) getDiskUsageIncludingHardlinks() (uint64, error) {
return usedBytes, nil
}

func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release) (extractDuration float64, cached bool, err error) {
func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release, majorMinorVersion string) (extractDuration float64, cached bool, err error) {
_, err = os.Stat(path)
if err == nil {
i.metricsAPI.InstallerCacheGetReleaseCached(majorMinorVersion, true)
return 0, true, nil // release was found in the cache
}
if !os.IsNotExist(err) {
return 0, false, err
}
i.metricsAPI.InstallerCacheGetReleaseCached(majorMinorVersion, false)
usedBytes, err := i.getDiskUsageIncludingHardlinks()
if err != nil && !os.IsNotExist(err) {
return 0, false, errors.Wrapf(err, "could not determine disk usage information for cache dir %s", i.config.CacheDir)
Expand All @@ -184,7 +199,7 @@ func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pu
return time.Since(extractStartTime).Seconds(), false, nil
}

func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocRelease oc.Release, ocpVersion string, clusterID strfmt.UUID) (*Release, error) {
func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocRelease oc.Release, ocpVersion string, clusterID strfmt.UUID, majorMinorVersion string) (*Release, error) {
i.Lock()
defer i.Unlock()

Expand All @@ -199,7 +214,7 @@ func (i *Installers) get(releaseID, releaseIDMirror, pullSecret string, ocReleas
if err != nil {
return nil, err
}
release.extractDuration, release.cached, err = i.extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion, ocRelease)
release.extractDuration, release.cached, err = i.extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion, ocRelease, majorMinorVersion)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -310,8 +325,10 @@ func (i *Installers) evictFile(filePath string) error {
i.log.Infof("evicting binary file %s due to storage pressure", filePath)
err := os.Remove(filePath)
if err != nil {
i.metricsAPI.InstallerCacheReleaseEvicted(false)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we measure this in the outer method evict()? https://github.com/openshift/assisted-service/pull/7281/files#diff-cf73de10b668273452bf6aeb594f0385d3f3ea5b1a851c0f577d2d8989e6cccdR320

If we really want to record the number of files evicted, we should change the metric type to a bucketed one.

return err
}
i.metricsAPI.InstallerCacheReleaseEvicted(true)
// if the parent directory was left empty,
// remove it to avoid dangling directories
parentDir := path.Dir(filePath)
Expand All @@ -334,9 +351,9 @@ func (i *Installers) pruneExpiredHardLinks(links []*fileInfo, gracePeriod time.D
grace := graceTime.Unix()
if finfo.info.ModTime().Unix() < grace {
i.log.Infof("attempting to prune hard link %s", finfo.path)
err := os.Remove(finfo.path)
if err != nil {
if err := os.Remove(finfo.path); err != nil {
i.log.WithError(err).Errorf("failed to prune hard link %s", finfo.path)
continue
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the purpose of this continue?

}
}
}
Expand Down
Loading