Remove benchmark generate corpus command (#1553)

* remove benchmark generate coprus command * minor fixes in benhcmark rally * make check-statis * fix docs for benchmark rally command
elastic · Nov 13, 2023 · b2251c5 · b2251c5
1 parent 2c241f1
commit b2251c5
Show file tree

Hide file tree

Showing 13 changed files with 79 additions and 576 deletions.
diff --git a/README.md b/README.md
@@ -165,16 +165,6 @@ These benchmarks allow you to benchmark an integration end to end.
 
 For details on how to configure system benchmarks for a package, review the [HOWTO guide](./docs/howto/system_benchmarking.md).
 
-### `elastic-package benchmark generate-corpus`
-
-_Context: package_
-
-
-*BEWARE*: this command is in beta and it's behaviour may change in the future.
-Use this command to generate benchmarks corpus data for a package.
-Currently, only data for what we have related assets on https://github.com/elastic/elastic-integration-corpus-generator-tool are supported.
-For details on how to run this command, review the [HOWTO guide](./docs/howto/generate_corpus.md).
-
 ### `elastic-package benchmark pipeline`
 
 _Context: package_

diff --git a/cmd/benchmark.go b/cmd/benchmark.go
@@ -12,9 +12,6 @@ import (
 	"strings"
 	"time"
 
-	"github.com/dustin/go-humanize"
-
-	"github.com/elastic/elastic-package/internal/corpusgenerator"
 	"github.com/elastic/elastic-package/internal/elasticsearch"
 	"github.com/elastic/elastic-package/internal/install"
 	"github.com/elastic/elastic-package/internal/logger"
@@ -36,12 +33,6 @@ import (
 	"github.com/elastic/elastic-package/internal/testrunner"
 )
 
-const generateLongDescription = `
-*BEWARE*: this command is in beta and it's behaviour may change in the future.
-Use this command to generate benchmarks corpus data for a package.
-Currently, only data for what we have related assets on https://github.com/elastic/elastic-integration-corpus-generator-tool are supported.
-For details on how to run this command, review the [HOWTO guide](./docs/howto/generate_corpus.md).`
-
 const benchLongDescription = `Use this command to run benchmarks on a package. Currently, the following types of benchmarks are available:
 
 #### Pipeline Benchmarks
@@ -80,9 +71,6 @@ func setupBenchmarkCommand() *cobraext.Command {
 	systemCmd := getSystemCommand()
 	cmd.AddCommand(systemCmd)
 
-	generateCorpusCmd := getGenerateCorpusCommand()
-	cmd.AddCommand(generateCorpusCmd)
-
 	return cobraext.NewCommand(cmd, cobraext.ContextPackage)
 }
 
@@ -257,16 +245,6 @@ func rallyCommandAction(cmd *cobra.Command, args []string) error {
 		return cobraext.FlagParsingError(err, cobraext.BenchNameFlagName)
 	}
 
-	deferCleanup, err := cmd.Flags().GetDuration(cobraext.DeferCleanupFlagName)
-	if err != nil {
-		return cobraext.FlagParsingError(err, cobraext.DeferCleanupFlagName)
-	}
-
-	metricsInterval, err := cmd.Flags().GetDuration(cobraext.BenchMetricsIntervalFlagName)
-	if err != nil {
-		return cobraext.FlagParsingError(err, cobraext.BenchMetricsIntervalFlagName)
-	}
-
 	dataReindex, err := cmd.Flags().GetBool(cobraext.BenchReindexToMetricstoreFlagName)
 	if err != nil {
 		return cobraext.FlagParsingError(err, cobraext.BenchReindexToMetricstoreFlagName)
@@ -314,8 +292,6 @@ func rallyCommandAction(cmd *cobra.Command, args []string) error {
 	withOpts := []rally.OptionFunc{
 		rally.WithVariant(variant),
 		rally.WithBenchmarkName(benchName),
-		rally.WithDeferCleanup(deferCleanup),
-		rally.WithMetricsInterval(metricsInterval),
 		rally.WithDataReindexing(dataReindex),
 		rally.WithPackageRootPath(packageRootPath),
 		rally.WithESAPI(esClient.API),
@@ -496,75 +472,6 @@ func systemCommandAction(cmd *cobra.Command, args []string) error {
 	return nil
 }
 
-func getGenerateCorpusCommand() *cobra.Command {
-	generateCorpusCmd := &cobra.Command{
-		Use:   "generate-corpus",
-		Short: "Generate benchmarks corpus data for the package",
-		Long:  generateLongDescription,
-		Args:  cobra.NoArgs,
-		RunE:  generateDataStreamCorpusCommandAction,
-	}
-
-	generateCorpusCmd.Flags().StringP(cobraext.PackageFlagName, cobraext.PackageFlagShorthand, "", cobraext.PackageFlagDescription)
-	generateCorpusCmd.Flags().StringP(cobraext.GenerateCorpusDataSetFlagName, cobraext.GenerateCorpusDataSetFlagShorthand, "", cobraext.GenerateCorpusDataSetFlagDescription)
-	generateCorpusCmd.Flags().StringP(cobraext.GenerateCorpusSizeFlagName, cobraext.GenerateCorpusSizeFlagShorthand, "", cobraext.GenerateCorpusSizeFlagDescription)
-	generateCorpusCmd.Flags().StringP(cobraext.GenerateCorpusCommitFlagName, cobraext.GenerateCorpusCommitFlagShorthand, "main", cobraext.GenerateCorpusCommitFlagDescription)
-	generateCorpusCmd.Flags().StringP(cobraext.GenerateCorpusRallyTrackOutputDirFlagName, cobraext.GenerateCorpusRallyTrackOutputDirFlagShorthand, "", cobraext.GenerateCorpusRallyTrackOutputDirFlagDescription)
-
-	return generateCorpusCmd
-}
-
-func generateDataStreamCorpusCommandAction(cmd *cobra.Command, _ []string) error {
-	packageName, err := cmd.Flags().GetString(cobraext.PackageFlagName)
-	if err != nil {
-		return cobraext.FlagParsingError(err, cobraext.PackageFlagName)
-	}
-
-	dataSetName, err := cmd.Flags().GetString(cobraext.GenerateCorpusDataSetFlagName)
-	if err != nil {
-		return cobraext.FlagParsingError(err, cobraext.GenerateCorpusDataSetFlagName)
-	}
-
-	totSize, err := cmd.Flags().GetString(cobraext.GenerateCorpusSizeFlagName)
-	if err != nil {
-		return cobraext.FlagParsingError(err, cobraext.GenerateCorpusSizeFlagName)
-	}
-
-	totSizeInBytes, err := humanize.ParseBytes(totSize)
-	if err != nil {
-		return cobraext.FlagParsingError(err, cobraext.GenerateCorpusSizeFlagName)
-	}
-
-	commit, err := cmd.Flags().GetString(cobraext.GenerateCorpusCommitFlagName)
-	if err != nil {
-		return cobraext.FlagParsingError(err, cobraext.GenerateCorpusCommitFlagName)
-	}
-
-	if len(commit) == 0 {
-		commit = "main"
-	}
-
-	rallyTrackOutputDir, err := cmd.Flags().GetString(cobraext.GenerateCorpusRallyTrackOutputDirFlagName)
-	if err != nil {
-		return cobraext.FlagParsingError(err, cobraext.GenerateCorpusRallyTrackOutputDirFlagName)
-	}
-
-	genLibClient := corpusgenerator.NewClient(commit)
-	generator, err := corpusgenerator.NewGenerator(genLibClient, packageName, dataSetName, totSizeInBytes)
-	if err != nil {
-		return fmt.Errorf("can't generate benchmarks data corpus for data stream: %w", err)
-	}
-
-	// TODO: we need a way to extract the type from the package and dataset, currently hardcode to `metrics`
-	dataStream := fmt.Sprintf("metrics-%s.%s-default", packageName, dataSetName)
-	err = corpusgenerator.RunGenerator(generator, dataStream, rallyTrackOutputDir)
-	if err != nil {
-		return fmt.Errorf("can't generate benchmarks data corpus for data stream: %w", err)
-	}
-
-	return nil
-}
-
 func initializeESMetricsClient(ctx context.Context) (*elasticsearch.Client, error) {
 	address := os.Getenv(benchcommon.ESMetricstoreHostEnv)
 	user := os.Getenv(benchcommon.ESMetricstoreUsernameEnv)

diff --git a/docs/howto/generate_corpus.md b/docs/howto/generate_corpus.md
diff --git a/docs/howto/rally_benchmarking.md b/docs/howto/rally_benchmarking.md
@@ -1,4 +1,4 @@
-# HOWTO: Writing system benchmarks for a package
+# HOWTO: Writing rally benchmarks for a package
 
 ## Introduction
 Elastic Packages are comprised of data streams. A rally benchmark runs `esrally` track with a corpus of data into an Elasticsearch data stream, and reports rally stats as well as retrieving performance metrics from the Elasticsearch nodes.
@@ -10,11 +10,11 @@ Conceptually, running a rally benchmark involves the following steps:
 1. Deploy the Elastic Stack, including Elasticsearch, Kibana, and the Elastic Agent(s). This step takes time so it should typically be done once as a pre-requisite to running a system benchmark scenario.
 1. Install a package that configures its assets for every data stream in the package.
 1. Metrics collections from the cluster starts. (**TODO**: record metrics from all Elastic Agents involved using the `system` integration.)
-1. Send the collected metrics to the ES Metricstore if set.
 1. Generate data (it uses the [corpus-generator-tool](https://github.com/elastic/elastic-integration-corpus-generator-tool))
 1. Run an `esrally` track with the corpus of generated data. `esrally` must be installed on the system where the `elastic-package` is run and available in the `PATH`. 
 1. Wait for the `esrally` track to be executed.
 1. Metrics collection ends and a summary report is created.
+1. Send the collected metrics to the ES Metricstore if set.
 1. Delete test artifacts.
 1. Optionally reindex all ingested data into the ES Metricstore for further analysis.
 1. **TODO**: Optionally compare results against another benchmark run.
@@ -60,7 +60,6 @@ Example:
 description: Benchmark 20000 events ingested
 data_stream:
    name: testds
-warmup_time_period: 10s
 corpora:
    generator:
       total_events: 900000
@@ -275,7 +274,7 @@ In the directory of the `rally-track-output-dir` flag two files are saved:
 Both files are required to replay the rally benchmark. The first file references the second in its content.
 The command to run for replaying the track is the following:
 ```shell
-rally --target-hosts='{"default":["%es_cluster_host:es_cluster_port%"]}' --track-path=%path/to/saved-track-json% --client-options='{"default":{"basic_auth_user":"%es_user%","basic_auth_password":"%es_user%","use_ssl":true,"verify_certs":false}}' --pipeline=benchmark-only 
+esrally --target-hosts='{"defauelt":["%es_cluster_host:es_cluster_port%"]}' --track-path=%path/to/saved-track-json% --client-options='{"default":{"basic_auth_user":"%es_user%","basic_auth_password":"%es_user%","use_ssl":true,"verify_certs":false}}' --pipeline=benchmark-only 
 ```
 
 Please refer to [esrally CLI reference](https://esrally.readthedocs.io/en/stable/command_line_reference.html) for more details.

diff --git a/internal/benchrunner/runners/rally/options.go b/internal/benchrunner/runners/rally/options.go
@@ -67,18 +67,6 @@ func WithBenchmarkName(name string) OptionFunc {
 	}
 }
 
-func WithDeferCleanup(d time.Duration) OptionFunc {
-	return func(opts *Options) {
-		opts.DeferCleanup = d
-	}
-}
-
-func WithMetricsInterval(d time.Duration) OptionFunc {
-	return func(opts *Options) {
-		opts.MetricsInterval = d
-	}
-}
-
 func WithDataReindexing(b bool) OptionFunc {
 	return func(opts *Options) {
 		opts.ReindexData = b

diff --git a/internal/benchrunner/runners/rally/runner.go b/internal/benchrunner/runners/rally/runner.go
@@ -17,13 +17,13 @@ import (
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"text/template"
 	"time"
 
 	"github.com/elastic/elastic-package/internal/packages/installer"
 
 	"github.com/magefile/mage/sh"
 
-	"github.com/elastic/elastic-package/internal/corpusgenerator"
 	"github.com/elastic/elastic-package/internal/stack"
 
 	"github.com/google/uuid"
@@ -51,6 +51,42 @@ const (
 
 	// BenchType defining rally benchmark
 	BenchType benchrunner.Type = "rally"
+
+	rallyTrackTemplate = `{% import "rally.helpers" as rally with context %}
+{
+  "version": 2,
+  "description": "Track for [[.DataStream]]",
+  "datastream": [
+    {
+      "name": "[[.DataStream]]",
+      "body": "[[.CorpusFilename]]"
+    }
+  ],
+  "corpora": [
+    {
+      "name": "[[.CorpusFilename]]",
+      "documents": [
+        {
+          "target-data-stream": "[[.DataStream]]",
+          "source-file": "[[.CorpusFilename]]",
+          "document-count": [[.CorpusDocsCount]],
+          "uncompressed-bytes": [[.CorpusSizeInBytes]]
+        }
+      ]
+    }
+  ],
+  "schedule": [
+    {
+      "operation": {
+        "operation-type": "bulk",
+        "bulk-size": {{bulk_size | default(5000)}},
+        "ingest-percentage": {{ingest_percentage | default(100)}}
+      },
+      "clients": {{bulk_indexing_clients | default(8)}}
+    }
+  ]
+}
+`
 )
 
 var ErrDryRun = errors.New("dry run: rally benchmark not executed")
@@ -524,7 +560,7 @@ func (r *runner) runGenerator(destDir string) error {
 		return fmt.Errorf("cannot not create rally track file: %w", err)
 	}
 	r.trackFile = trackFile.Name()
-	rallyTrackContent, err := corpusgenerator.GenerateRallyTrack(r.runtimeDataStream, corpusFile, corpusDocsCount)
+	rallyTrackContent, err := generateRallyTrack(r.runtimeDataStream, corpusFile, corpusDocsCount)
 	if err != nil {
 		return fmt.Errorf("cannot not generate rally track content: %w", err)
 	}
@@ -909,3 +945,34 @@ func createRunID() string {
 func getDataStreamPath(packageRoot, dataStream string) string {
 	return filepath.Join(packageRoot, "data_stream", dataStream)
 }
+
+func generateRallyTrack(dataStream string, corpusFile *os.File, corpusDocsCount uint64) ([]byte, error) {
+	t := template.New("rallytrack")
+
+	parsedTpl, err := t.Delims("[[", "]]").Parse(rallyTrackTemplate)
+	if err != nil {
+		return nil, fmt.Errorf("error while parsing rally track template: %w", err)
+	}
+
+	fi, err := corpusFile.Stat()
+	if err != nil {
+		return nil, fmt.Errorf("error with stat on rally corpus file: %w", err)
+	}
+
+	corpusSizeInBytes := fi.Size()
+
+	buf := new(bytes.Buffer)
+	templateData := map[string]any{
+		"DataStream":        dataStream,
+		"CorpusFilename":    filepath.Base(corpusFile.Name()),
+		"CorpusDocsCount":   corpusDocsCount,
+		"CorpusSizeInBytes": corpusSizeInBytes,
+	}
+
+	err = parsedTpl.Execute(buf, templateData)
+	if err != nil {
+		return nil, fmt.Errorf("error on parsin on rally track template: %w", err)
+	}
+
+	return buf.Bytes(), nil
+}
diff --git a/internal/benchrunner/runners/rally/scenario.go b/internal/benchrunner/runners/rally/scenario.go
@@ -7,7 +7,6 @@ package rally
 import (
 	"errors"
 	"fmt"
-	"os"
 	"path/filepath"
 
 	"github.com/elastic/go-ucfg/yaml"
@@ -56,7 +55,7 @@ func readConfig(path, scenario, packageName, packageVersion string) (*scenario,
 	configPath := filepath.Join(path, devPath, fmt.Sprintf("%s.yml", scenario))
 	c := defaultConfig()
 	cfg, err := yaml.NewConfigWithFile(configPath)
-	if err != nil && !errors.Is(err, os.ErrNotExist) {
+	if err != nil {
 		return nil, fmt.Errorf("can't load benchmark configuration: %s: %w", configPath, err)
 	}
 
@@ -69,5 +68,9 @@ func readConfig(path, scenario, packageName, packageVersion string) (*scenario,
 	c.Package = packageName
 	c.Version = packageVersion
 
+	if c.DataStream.Name == "" {
+		return nil, errors.New("can't read data stream name from benchmark configuration: empty")
+	}
+
 	return c, nil
 }