Add Vegeta rates / targets to SLA in performance tests (#14429)

* Add Vegeta rates / targets to SLA in performance tests

  Signed-off-by: pingjiang <[email protected]>

* dataplane-probe use total request

  Signed-off-by: pingjiang <[email protected]>

* fix after review

  Signed-off-by: pingjiang <[email protected]>

* delete useless condition

  Signed-off-by: pingjiang <[email protected]>

* fix after review

  Signed-off-by: pingjiang <[email protected]>

* add a deviation to vegeta total requests test

  Signed-off-by: pingjiang <[email protected]>

* add threshold in vegeta total requests check

  Signed-off-by: pingjiang <[email protected]>

---------

Signed-off-by: pingjiang <[email protected]>
knative · Jan 15, 2024 · 8162fe2 · 8162fe2
1 parent e5602d7
commit 8162fe2
Show file tree

Hide file tree

Showing 5 changed files with 52 additions and 10 deletions.
diff --git a/test/performance/benchmarks/dataplane-probe/main.go b/test/performance/benchmarks/dataplane-probe/main.go
@@ -159,7 +159,7 @@ LOOP:
 	influxReporter.AddDataPointsForMetrics(metricResults, benchmarkName)
 	_ = vegeta.NewTextReporter(metricResults).Report(os.Stdout)
 
-	if err := checkSLA(metricResults, t.slaMin, t.slaMax); err != nil {
+	if err := checkSLA(metricResults, t.slaMin, t.slaMax, rate, *duration); err != nil {
 		// make sure to still write the stats
 		influxReporter.FlushAndShutdown()
 		log.Fatalf(err.Error())
@@ -168,7 +168,7 @@ LOOP:
 	log.Println("Dataplane probe test finished")
 }
 
-func checkSLA(results *vegeta.Metrics, slaMin time.Duration, slaMax time.Duration) error {
+func checkSLA(results *vegeta.Metrics, slaMin time.Duration, slaMax time.Duration, rate vegeta.ConstantPacer, duration time.Duration) error {
 	// SLA 1: The p95 latency hitting the target has to be between the range defined
 	// in the target map on top.
 	if results.Latencies.P95 >= slaMin && results.Latencies.P95 <= slaMax {
@@ -177,5 +177,12 @@ func checkSLA(results *vegeta.Metrics, slaMin time.Duration, slaMax time.Duratio
 		return fmt.Errorf("SLA 1 failed. P95 latency is not in %d-%dms time range: %s", slaMin, slaMax, results.Latencies.P95)
 	}
 
+	// SLA 2: the total request count reported by vegeta must equal the configured rate multiplied by the test duration.
+	if results.Requests == uint64(rate.Rate(time.Second)*duration.Seconds()) {
+		log.Printf("SLA 2 passed. vegeta total request is %d", results.Requests)
+	} else {
+		return fmt.Errorf("SLA 2 failed. vegeta total request is %d, expected total request is %f", results.Requests, rate.Rate(time.Second)*duration.Seconds())
+	}
+
 	return nil
 }
diff --git a/test/performance/benchmarks/load-test/main.go b/test/performance/benchmarks/load-test/main.go
@@ -104,7 +104,7 @@ func main() {
 	influxReporter.AddDataPointsForMetrics(metricResults, benchmarkName)
 	_ = vegeta.NewTextReporter(metricResults).Report(os.Stdout)
 
-	if err := checkSLA(metricResults); err != nil {
+	if err := checkSLA(metricResults, pacers, durations); err != nil {
 		// make sure to still write the stats
 		influxReporter.FlushAndShutdown()
 		log.Fatalf(err.Error())
@@ -156,7 +156,7 @@ func processResults(ctx context.Context, results <-chan *vegeta.Result, reporter
 	}
 }
 
-func checkSLA(results *vegeta.Metrics) error {
+func checkSLA(results *vegeta.Metrics, pacers []vegeta.Pacer, durations []time.Duration) error {
 	// SLA 1: the p95 latency has to be over the 0->3k stepped burst
 	// falls in the +15ms range (we sleep 100 ms, so 100-115ms).
 	// This includes a mix of cold-starts and steady state (once the autoscaling decisions have leveled off).
@@ -183,5 +183,18 @@ func checkSLA(results *vegeta.Metrics) error {
 		return fmt.Errorf("SLA 3 failed. Errors occurred: %d", len(results.Errors))
 	}
 
+	// SLA 4: the vegeta total request count must reach the expected total (sum of rate * duration over all pacers), allowing a shortfall of up to 0.1%.
+	var expectedSum float64
+	var expectedRequests uint64
+	for i := 0; i < len(pacers); i++ {
+		expectedSum = expectedSum + pacers[i].Rate(time.Second)*durations[i].Seconds()
+	}
+	expectedRequests = uint64(expectedSum)
+	if results.Requests >= expectedRequests-(expectedRequests/1000) {
+		log.Printf("SLA 4 passed. total requests is %d, expected threshold is %d", results.Requests, expectedRequests-(expectedRequests/1000))
+	} else {
+		return fmt.Errorf("SLA 4 failed. total requests is %d, expected threshold is %d", results.Requests, expectedRequests-(expectedRequests/1000))
+	}
+
 	return nil
 }
diff --git a/test/performance/benchmarks/real-traffic-test/main.go b/test/performance/benchmarks/real-traffic-test/main.go
@@ -182,7 +182,7 @@ LOOP:
 	influxReporter.AddDataPointsForMetrics(metricResults, benchmarkName)
 	_ = vegeta.NewTextReporter(metricResults).Report(os.Stdout)
 
-	if err := checkSLA(metricResults); err != nil {
+	if err := checkSLA(metricResults, rate); err != nil {
 		cleanup()
 		influxReporter.FlushAndShutdown()
 		log.Fatal(err.Error())
@@ -289,13 +289,20 @@ func getRandomBool() bool {
 	return rand.Intn(2) == 1
 }
 
-func checkSLA(results *vegeta.Metrics) error {
+func checkSLA(results *vegeta.Metrics, rate vegeta.ConstantPacer) error {
 	// SLA 1: All requests should pass successfully.
 	if len(results.Errors) == 0 {
 		log.Println("SLA 1 passed. No errors occurred")
 	} else {
 		return fmt.Errorf("SLA 1 failed. Errors occurred: %d", len(results.Errors))
 	}
 
+	// SLA 2: the request rate reported by vegeta must equal the configured rate.
+	if results.Rate == rate.Rate(time.Second) {
+		log.Printf("SLA 2 passed. vegeta rate is %f", rate.Rate(time.Second))
+	} else {
+		return fmt.Errorf("SLA 2 failed. vegeta rate is %f, expected Rate is %f", results.Rate, rate.Rate(time.Second))
+	}
+
 	return nil
 }
diff --git a/test/performance/benchmarks/rollout-probe/main.go b/test/performance/benchmarks/rollout-probe/main.go
@@ -21,6 +21,7 @@ import (
 	"flag"
 	"fmt"
 	"log"
+	"math"
 	"net/http"
 	"os"
 	"strings"
@@ -210,7 +211,7 @@ LOOP:
 	influxReporter.AddDataPointsForMetrics(metricResults, benchmarkName)
 	_ = vegeta.NewTextReporter(metricResults).Report(os.Stdout)
 
-	if err := checkSLA(metricResults); err != nil {
+	if err := checkSLA(metricResults, rate); err != nil {
 		// make sure to still write the stats
 		influxReporter.FlushAndShutdown()
 		log.Fatalf(err.Error())
@@ -219,7 +220,7 @@ LOOP:
 	log.Println("Load test finished")
 }
 
-func checkSLA(results *vegeta.Metrics) error {
+func checkSLA(results *vegeta.Metrics, rate vegeta.ConstantPacer) error {
 	// SLA 1: The p95 latency hitting a Knative Service
 	// going through either JUST the queue-proxy or BOTH the activator and queue-proxy
 	// falls in the +10ms range. Given that we sleep 100ms, the SLA is between 100-110ms.
@@ -229,5 +230,12 @@ func checkSLA(results *vegeta.Metrics) error {
 		return fmt.Errorf("SLA 1 failed. P95 latency is not in 100-110ms time range: %s", results.Latencies.P95)
 	}
 
+	// SLA 2: the request rate reported by vegeta, rounded to the nearest integer, must equal the configured rate.
+	if math.Round(results.Rate) == rate.Rate(time.Second) {
+		log.Printf("SLA 2 passed. vegeta rate is %f", rate.Rate(time.Second))
+	} else {
+		return fmt.Errorf("SLA 2 failed. vegeta rate is %f, expected Rate is %f", results.Rate, rate.Rate(time.Second))
+	}
+
 	return nil
 }
diff --git a/test/performance/benchmarks/scale-from-zero/main.go b/test/performance/benchmarks/scale-from-zero/main.go
@@ -164,7 +164,7 @@ func main() {
 	_ = vegeta.NewTextReporter(metricResults).Report(os.Stdout)
 
 	sla := slas[*parallelCount]
-	if err := checkSLA(metricResults, sla.p95min, sla.p95max, sla.latencyMax); err != nil {
+	if err := checkSLA(metricResults, sla.p95min, sla.p95max, sla.latencyMax, *parallelCount); err != nil {
 		// make sure to still write the stats
 		influxReporter.FlushAndShutdown()
 		log.Fatalf(err.Error())
@@ -343,7 +343,7 @@ func runScaleFromZero(ctx context.Context, clients *test.Clients, idx int, ro *v
 	}
 }
 
-func checkSLA(results *vegeta.Metrics, p95min time.Duration, p95max time.Duration, latencyMax time.Duration) error {
+func checkSLA(results *vegeta.Metrics, p95min time.Duration, p95max time.Duration, latencyMax time.Duration, parallel int) error {
 	// SLA 1: The p95 latency hitting the target has to be between the range defined
 	if results.Latencies.P95 >= p95min && results.Latencies.P95 <= p95max {
 		log.Printf("SLA 1 passed. P95 latency is in %d-%dms time range", p95min, p95max)
@@ -358,5 +358,12 @@ func checkSLA(results *vegeta.Metrics, p95min time.Duration, p95max time.Duratio
 		return fmt.Errorf("SLA 2 failed. Max latency is higher than %dms: %s", latencyMax, results.Latencies.Max)
 	}
 
+	// SLA 3: the vegeta total request count must equal the number of ksvcs that are scaled from zero in parallel.
+	if results.Requests == uint64(parallel) {
+		log.Printf("SLA 3 passed. total requests is %d", results.Requests)
+	} else {
+		return fmt.Errorf("SLA 3 failed. total requests is %d, expected total requests is %d", results.Requests, uint64(parallel))
+	}
+
 	return nil
 }