Cross event transformation #331

Closed
5 changes: 3 additions & 2 deletions cmd/aws/cli/main.go
@@ -19,13 +19,14 @@ import (
pubsubsource "github.com/snowplow/snowbridge/pkg/source/pubsub"
sqssource "github.com/snowplow/snowbridge/pkg/source/sqs"
stdinsource "github.com/snowplow/snowbridge/pkg/source/stdin"
"github.com/snowplow/snowbridge/pkg/transform/transformconfig"
"github.com/snowplow/snowbridge/pkg/transform/batch/batchtransformconfig"
"github.com/snowplow/snowbridge/pkg/transform/single/transformconfig"
)

func main() {
// Make a slice of SourceConfigPairs supported for this build
sourceConfigPairs := []config.ConfigurationPair{stdinsource.ConfigPair, sqssource.ConfigPair,
pubsubsource.ConfigPair, kafkasource.ConfigPair, kinesissource.ConfigPair}

cli.RunCli(sourceConfigPairs, transformconfig.SupportedTransformations)
cli.RunCli(sourceConfigPairs, transformconfig.SupportedTransformations, batchtransformconfig.SupportedTransformations)
}
29 changes: 23 additions & 6 deletions cmd/cli/cli.go
@@ -35,8 +35,10 @@ import (
"github.com/snowplow/snowbridge/pkg/source/sourceiface"
"github.com/snowplow/snowbridge/pkg/target/targetiface"
"github.com/snowplow/snowbridge/pkg/telemetry"
"github.com/snowplow/snowbridge/pkg/transform"
"github.com/snowplow/snowbridge/pkg/transform/transformconfig"
batchtransform "github.com/snowplow/snowbridge/pkg/transform/batch"
"github.com/snowplow/snowbridge/pkg/transform/batch/batchtransformconfig"
transform "github.com/snowplow/snowbridge/pkg/transform/single"
"github.com/snowplow/snowbridge/pkg/transform/single/transformconfig"
)

const (
@@ -47,7 +49,11 @@ const (
)

// RunCli runs the app
func RunCli(supportedSources []config.ConfigurationPair, supportedTransformations []config.ConfigurationPair) {
func RunCli(
supportedSources []config.ConfigurationPair,
supportedTransformations []config.ConfigurationPair,
supportedBatchTransformations []config.ConfigurationPair,
) {
cfg, sentryEnabled, err := cmd.Init()
if err != nil {
exitWithError(err, sentryEnabled)
@@ -95,6 +101,11 @@ func RunCli(supportedSources []config.ConfigurationPair, supportedTransformation
return err
}

btr, err := batchtransformconfig.GetBatchTransformations(cfg, batchtransformconfig.SupportedTransformations)
if err != nil {
return err
}

t, err := cfg.GetTarget()
if err != nil {
return err
@@ -158,7 +169,7 @@ func RunCli(supportedSources []config.ConfigurationPair, supportedTransformation

// Callback functions for the source to leverage when writing data
sf := sourceiface.SourceFunctions{
WriteToTarget: sourceWriteFunc(t, ft, tr, o),
WriteToTarget: sourceWriteFunc(t, ft, tr, btr, o),
}

// Read is a long running process and will only return when the source
@@ -189,7 +200,13 @@ func RunCli(supportedSources []config.ConfigurationPair, supportedTransformation
// 4. Observing these results
//
// All with retry logic baked in to remove any of this handling from the implementations
func sourceWriteFunc(t targetiface.Target, ft failureiface.Failure, tr transform.TransformationApplyFunction, o *observer.Observer) func(messages []*models.Message) error {
func sourceWriteFunc(
t targetiface.Target,
ft failureiface.Failure,
tr transform.TransformationApplyFunction,
btr batchtransform.BatchTransformationApplyFunction,
o *observer.Observer,
) func(messages []*models.Message) error {
return func(messages []*models.Message) error {

// Apply transformations
@@ -211,7 +228,7 @@ func sourceWriteFunc(t targetiface.Target, ft failureiface.Failure, tr transform
messagesToSend := transformed.Result

res, err := retry.ExponentialWithInterface(5, time.Second, "target.Write", func() (interface{}, error) {
res, err := t.Write(messagesToSend)
res, err := t.Write(messagesToSend, btr)
Contributor:

I wonder whether there is an alternative so that the target does not need to know about batch transformations. Are there other reasons for this besides the chunking and possible group-by?

Collaborator (author):

I started with a design that segmented them, but it left things very messy because the target needs to be aware of the transformation in order to decide how to send the data.

Similarly, the dynamic headers feature leaves us with a challenge here: it must necessarily group data by headers before a request template is created.

From a configuration perspective, if this logic is upstream of the target, it seems very easy to break the target by configuring a separate feature.

I don't know if it's the best design, but it's what I came up with as an attempt to reconcile this with the concept of solving for batch transformations more generically.
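The signature change under discussion can be sketched in a minimal, self-contained form. The type names mirror the diff, but the simplified struct shapes and the toy target below are assumptions for illustration, not snowbridge's real implementation:

```go
package main

import "fmt"

// Simplified stand-ins for snowbridge's models (assumed shapes).
type Message struct{ Data []byte }

type MessageBatch struct{ OriginalMessages []*Message }

// BatchTransformationApplyFunction groups messages into batches;
// the real signature in snowbridge may differ.
type BatchTransformationApplyFunction func([]*Message) []*MessageBatch

// Target now receives the batch transformation in Write, so the target
// controls when grouping happens relative to chunking or templating.
type Target interface {
	Write(messages []*Message, btr BatchTransformationApplyFunction) (int, error)
}

type printTarget struct{}

func (printTarget) Write(messages []*Message, btr BatchTransformationApplyFunction) (int, error) {
	// Default: one batch containing everything.
	batches := []*MessageBatch{{OriginalMessages: messages}}
	if btr != nil { // failure targets pass nil, as in pkg/failure/snowplow.go
		batches = btr(messages)
	}
	sent := 0
	for _, b := range batches {
		sent += len(b.OriginalMessages)
	}
	return sent, nil
}

func main() {
	msgs := []*Message{{Data: []byte("a")}, {Data: []byte("b")}, {Data: []byte("c")}}
	// A trivial batch transformation: one batch per message.
	perMessage := func(in []*Message) []*MessageBatch {
		out := make([]*MessageBatch, 0, len(in))
		for _, m := range in {
			out = append(out, &MessageBatch{OriginalMessages: []*Message{m}})
		}
		return out
	}
	var tgt printTarget
	n, _ := tgt.Write(msgs, perMessage)
	fmt.Println(n) // 3
	n, _ = tgt.Write(msgs, nil)
	fmt.Println(n) // 3
}
```

Passing the function into Write is what lets the target sequence grouping relative to its own send logic, which is the trade-off the thread is weighing.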


o.TargetWrite(res)
messagesToSend = res.Failed
5 changes: 3 additions & 2 deletions cmd/main/cli/main.go
@@ -18,7 +18,8 @@ import (
pubsubsource "github.com/snowplow/snowbridge/pkg/source/pubsub"
sqssource "github.com/snowplow/snowbridge/pkg/source/sqs"
stdinsource "github.com/snowplow/snowbridge/pkg/source/stdin"
"github.com/snowplow/snowbridge/pkg/transform/transformconfig"
"github.com/snowplow/snowbridge/pkg/transform/batch/batchtransformconfig"
"github.com/snowplow/snowbridge/pkg/transform/single/transformconfig"
)

func main() {
@@ -28,5 +29,5 @@ func main() {
kafkasource.ConfigPair, pubsubsource.ConfigPair,
}

cli.RunCli(sourceConfigPairs, transformconfig.SupportedTransformations)
cli.RunCli(sourceConfigPairs, transformconfig.SupportedTransformations, batchtransformconfig.SupportedTransformations)
}
21 changes: 11 additions & 10 deletions config/config.go
@@ -46,16 +46,17 @@ type Config struct {

// configurationData for holding all configuration options
type configurationData struct {
Source *component `hcl:"source,block" envPrefix:"SOURCE_"`
Target *component `hcl:"target,block" envPrefix:"TARGET_"`
FailureTarget *failureConfig `hcl:"failure_target,block"`
Sentry *sentryConfig `hcl:"sentry,block"`
StatsReceiver *statsConfig `hcl:"stats_receiver,block"`
Transformations []*component `hcl:"transform,block"`
LogLevel string `hcl:"log_level,optional" env:"LOG_LEVEL"`
UserProvidedID string `hcl:"user_provided_id,optional" env:"USER_PROVIDED_ID"`
DisableTelemetry bool `hcl:"disable_telemetry,optional" env:"DISABLE_TELEMETRY"`
License *licenseConfig `hcl:"license,block"`
Source *component `hcl:"source,block" envPrefix:"SOURCE_"`
Target *component `hcl:"target,block" envPrefix:"TARGET_"`
FailureTarget *failureConfig `hcl:"failure_target,block"`
Sentry *sentryConfig `hcl:"sentry,block"`
StatsReceiver *statsConfig `hcl:"stats_receiver,block"`
Transformations []*component `hcl:"transform,block"`
BatchTransformations []*component `hcl:"batch_transform,block"`
LogLevel string `hcl:"log_level,optional" env:"LOG_LEVEL"`
UserProvidedID string `hcl:"user_provided_id,optional" env:"USER_PROVIDED_ID"`
DisableTelemetry bool `hcl:"disable_telemetry,optional" env:"DISABLE_TELEMETRY"`
License *licenseConfig `hcl:"license,block"`
}
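Based on the `batch_transform,block` HCL tag added above, a user-facing configuration block would presumably look like the following. The block name comes from the struct tag; the inner `use` layout mirrors how snowbridge's other component blocks are configured, and the transformation name shown is hypothetical:

```hcl
# Hypothetical sketch only: "batch_transform" is taken from the HCL tag in
# configurationData; the inner block name and options are illustrative.
batch_transform {
  use "example_batch_transformation" {
    # options for the specific batch transformation would go here
  }
}
```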

// component is a type to abstract over configuration blocks.
8 changes: 4 additions & 4 deletions docs/configuration_transformations_docs_test.go
@@ -21,10 +21,10 @@ import (
"github.com/hashicorp/hcl/v2/gohcl"
"github.com/snowplow/snowbridge/assets"
"github.com/snowplow/snowbridge/config"
"github.com/snowplow/snowbridge/pkg/transform"
"github.com/snowplow/snowbridge/pkg/transform/engine"
"github.com/snowplow/snowbridge/pkg/transform/filter"
"github.com/snowplow/snowbridge/pkg/transform/transformconfig"
transform "github.com/snowplow/snowbridge/pkg/transform/single"
"github.com/snowplow/snowbridge/pkg/transform/single/engine"
"github.com/snowplow/snowbridge/pkg/transform/single/filter"
"github.com/snowplow/snowbridge/pkg/transform/single/transformconfig"
"github.com/stretchr/testify/assert"
)

4 changes: 2 additions & 2 deletions pkg/failure/snowplow.go
@@ -79,7 +79,7 @@ func (d *SnowplowFailure) WriteInvalid(invalid []*models.Message) (*models.Targe
transformed = append(transformed, tMsg)
}

return d.target.Write(transformed)
return d.target.Write(transformed, nil)
}

// WriteOversized will handle the conversion of oversized messages into failure
@@ -114,7 +114,7 @@ func (d *SnowplowFailure) WriteOversized(maximumAllowedSizeBytes int, oversized
transformed = append(transformed, tMsg)
}

return d.target.Write(transformed)
return d.target.Write(transformed, nil)
}

// Open manages opening the underlying target
3 changes: 2 additions & 1 deletion pkg/failure/snowplow_test.go
@@ -19,6 +19,7 @@ import (

"github.com/snowplow/snowbridge/pkg/models"
"github.com/snowplow/snowbridge/pkg/testutil"
batchtransform "github.com/snowplow/snowbridge/pkg/transform/batch"
)

// --- Test FailureTarget
@@ -27,7 +28,7 @@ type TestFailureTarget struct {
onWrite func(messages []*models.Message) (*models.TargetWriteResult, error)
}

func (t *TestFailureTarget) Write(messages []*models.Message) (*models.TargetWriteResult, error) {
func (t *TestFailureTarget) Write(messages []*models.Message, btf batchtransform.BatchTransformationApplyFunction) (*models.TargetWriteResult, error) {
return t.onWrite(messages)
}

30 changes: 30 additions & 0 deletions pkg/models/batch_transformation.go
@@ -0,0 +1,30 @@
/**
* Copyright (c) 2020-present Snowplow Analytics Ltd.
* All rights reserved.
*
* This software is made available by Snowplow Analytics, Ltd.,
* under the terms of the Snowplow Limited Use License Agreement, Version 1.0
* located at https://docs.snowplow.io/limited-use-license-1.0
* BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION
* OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT.
*/

package models

import "time"

// MessageBatch houses batches of messages, for batch transformations to operate across
type MessageBatch struct {
OriginalMessages []*Message // Most targets will use the data from here, but where we have an HTTP templating transformation, we would use this to ack batches of messages
jbeemster marked this conversation as resolved.
BatchData []byte // Where we template http requests, we use this to define the body of the request
HTTPHeaders map[string]string // For dynamic headers feature
Member:

I am not a huge fan of having a target-specific implementation mixed in with a generic model here. Is there another way to carry this data through without it being specifically mapped to HTTP?

Would something like Parameters map[string]interface{}, where in HTTP you can then access an HTTPHeaders key within the Parameters, work to abstract this away?

Collaborator (author):

I don't love it either. This actually comes from having implemented the dynamic header transformation - which has the same problem for the message model.

I considered that way of doing things but it's a trade-off - the downside of that approach is that you have a more obscure API and the target's logic depends on that specific key, but the API defines it as being anything.

I experimented with other things we could do but didn't find an elegant solution (yet), and it didn't feel like it serves the project well to labour on it for too long.

Right now we only have one thing that needs to do this, so my thinking was that this will do for the moment but when we need to design for further similar things we should revisit the api design.

I'm not massively opposed to doing what you suggest either - I just haven't given up on finding something better.

Member:

Fair point! As long as the seed of "maybe we should change this" is planted I am fine with it staying where it is.

Collaborator (author):

Not even that much of a maybe to be honest! Just needs to percolate a bit. Perhaps we will even see the answer when the rest of this refactor falls into place.

TimeRequestStarted time.Time
TimeRequestFinished time.Time
}

// BatchTransformationResult houses the result of a batch transformation
type BatchTransformationResult struct {
Success []*MessageBatch
Invalid []*Message
Oversized []*Message
}
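The group-by-headers behaviour discussed in the thread above can be sketched against these models. This is a hypothetical batch transformation, not code from the PR: the simplified structs stand in for the real models, and the header-grouping logic is an illustration of why HTTPHeaders lives on MessageBatch (messages sharing headers must land in the same batch before a request template is applied):

```go
package main

import "fmt"

// Simplified stand-ins for the snowbridge models (assumed shapes).
type Message struct {
	Data        []byte
	HTTPHeaders map[string]string
}

type MessageBatch struct {
	OriginalMessages []*Message
	HTTPHeaders      map[string]string
}

// BatchTransformationFunction mirrors the signature used in pkg/target/common.go:
// batches in; successful batches, invalid messages, oversized messages out.
type BatchTransformationFunction func([]*MessageBatch) ([]*MessageBatch, []*Message, []*Message)

// groupByHeader splits each incoming batch so that every outgoing batch
// contains only messages with identical headers.
func groupByHeader(in []*MessageBatch) ([]*MessageBatch, []*Message, []*Message) {
	out := []*MessageBatch{}
	for _, batch := range in {
		groups := map[string]*MessageBatch{}
		order := []string{} // preserve first-seen order of header sets
		for _, m := range batch.OriginalMessages {
			// fmt.Sprint prints map keys in sorted order, so equal header
			// sets produce equal keys; crude, but fine for a sketch.
			key := fmt.Sprint(m.HTTPHeaders)
			g, ok := groups[key]
			if !ok {
				g = &MessageBatch{HTTPHeaders: m.HTTPHeaders}
				groups[key] = g
				order = append(order, key)
			}
			g.OriginalMessages = append(g.OriginalMessages, m)
		}
		for _, key := range order {
			out = append(out, groups[key])
		}
	}
	return out, nil, nil
}

func main() {
	a := map[string]string{"X-Id": "1"}
	b := map[string]string{"X-Id": "2"}
	in := []*MessageBatch{{OriginalMessages: []*Message{
		{Data: []byte("m1"), HTTPHeaders: a},
		{Data: []byte("m2"), HTTPHeaders: b},
		{Data: []byte("m3"), HTTPHeaders: a},
	}}}
	batches, _, _ := groupByHeader(in)
	fmt.Println(len(batches))                     // 2
	fmt.Println(len(batches[0].OriginalMessages)) // 2
}
```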
47 changes: 47 additions & 0 deletions pkg/target/common.go
@@ -0,0 +1,47 @@
/**
* Copyright (c) 2020-present Snowplow Analytics Ltd.
* All rights reserved.
*
* This software is made available by Snowplow Analytics, Ltd.,
* under the terms of the Snowplow Limited Use License Agreement, Version 1.0
* located at https://docs.snowplow.io/limited-use-license-1.0
* BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION
* OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT.
*/

package target

import (
"github.com/snowplow/snowbridge/pkg/models"
batchtransform "github.com/snowplow/snowbridge/pkg/transform/batch"
)

// chunkBatcherWithConfig returns a batch transformation which incorporates models.GetChunkedMessages() into the batch transformation model.
// It is done this way in order to pass GetChunkedMessages its config within the confines of the BatchTransformation design
func chunkBatcherWithConfig(chunkSize int, maxMessageByteSize int, maxChunkByteSize int) batchtransform.BatchTransformationFunction {

// chunkBatcher is a batch transformation which incorporates models.GetChunkedMessages() into the batch transformation model,
// preserving the original logic and ownership of the function.
chunkBatcher := func(batchesIn []*models.MessageBatch) ([]*models.MessageBatch, []*models.Message, []*models.Message) {
oversizedOut := make([]*models.Message, 0)
chunkedBatches := make([]*models.MessageBatch, 0)

for _, batch := range batchesIn {
chunks, oversized := models.GetChunkedMessages(batch.OriginalMessages, chunkSize, maxMessageByteSize, maxChunkByteSize)
Contributor:

GetChunkedMessages now feels more like it belongs to the target interface. Do you think this (it actually becoming a target method) could help move the batch transformation logic upstream of target.Write?

Collaborator (author):

I don't think so - the problem is that where we have templating, this logic must occur before the templater.


oversizedOut = append(oversizedOut, oversized...)

for _, chunk := range chunks {
asBatch := &models.MessageBatch{
OriginalMessages: chunk,
}

chunkedBatches = append(chunkedBatches, asBatch)
}

}
return chunkedBatches, nil, oversizedOut
}

return chunkBatcher
}
4 changes: 3 additions & 1 deletion pkg/target/eventhub.go
@@ -23,6 +23,7 @@ import (
log "github.com/sirupsen/logrus"

"github.com/snowplow/snowbridge/pkg/models"
batchtransform "github.com/snowplow/snowbridge/pkg/transform/batch"
)

// EventHubConfig holds a config object for Azure EventHub
@@ -147,9 +148,10 @@ func AdaptEventHubTargetFunc(f func(c *EventHubConfig) (*EventHubTarget, error))
}
}

func (eht *EventHubTarget) Write(messages []*models.Message) (*models.TargetWriteResult, error) {
func (eht *EventHubTarget) Write(messages []*models.Message, batchTransformFunc batchtransform.BatchTransformationApplyFunction) (*models.TargetWriteResult, error) {
eht.log.Debugf("Writing %d messages to stream ...", len(messages))

// TODO: Replace this with the new Chunker Batch Transformation - should be a post function.
chunks, oversized := models.GetChunkedMessages(
messages,
eht.chunkMessageLimit, // Max Chunk size (number of messages)