From f2acba3afe9402ee22c40aa68f6d150ee7f9f172 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Wed, 15 Jan 2025 14:34:54 +0100 Subject: [PATCH] feat: partition mask (#383) * feat(partition): add venom test * feat(partition): create empty partition mask * feat(partition): test partition conditions * feat(partition): exec active partition * fix(partition): partitions must be ordered * feat(partition): update docs --- CHANGELOG.md | 4 + README.md | 30 +++++++ internal/app/pimo/pimo.go | 2 + pkg/model/model.go | 7 ++ pkg/partition/partition.go | 142 ++++++++++++++++++++++++++++++ schema/v1/pimo.schema.json | 39 ++++++++ test/suites/masking_partition.yml | 45 ++++++++++ 7 files changed, 269 insertions(+) create mode 100644 pkg/partition/partition.go create mode 100644 test/suites/masking_partition.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 97f33458..b43ac96f 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,10 @@ Types of changes - `Fixed` for any bug fixes. - `Security` in case of vulnerabilities. +## [1.30.0] + +- `Added` mask `partition` to handle fields containing different types of values by applying distinct transformations + ## [1.29.1] - `Fixed` mock command ignores global seed flag diff --git a/README.md b/README.md index 28ba5d21..ea823c44 100755 --- a/README.md +++ b/README.md @@ -165,6 +165,7 @@ The following types of masks can be used : * [`replacement`](#replacement) is to mask a data with another data from the jsonline. * [`pipe`](#pipe) is a mask to handle complex nested array structures, it can read an array as an object stream and process it with a sub-pipeline. * [`apply`](#apply) process selected data with a sub-pipeline. + * [`partitions`](#partitions) will rely on conditions to identify specific cases. * [`luhn`](#luhn) can generate valid numbers using the Luhn algorithm (e.g. french SIRET or SIREN). * [`markov`](#markov) can generate pseudo text based on a sample text. 
* [`findInCSV`](#findincsv) get one or multiple csv lines which matched with Json entry value from CSV files. @@ -1069,6 +1070,35 @@ By default, if not specified otherwise, these classes will be used (input -> out [Return to list of masks](#possible-masks) +### Partitions + +[![Try it](https://img.shields.io/badge/-Try%20it%20in%20PIMO%20Play-brightgreen)](https://cgi-fr.github.io/pimo-play/#c=G4UwTgzglg9gdgLgAQCICMKBQBbAhhAayjgHMFNMkkBaJCEAGxAGMAXGMcq7pAKwngAHXKwAWyFLgCuYjlh55CXHkmFhWUDfAjKVNJHFzYQEkHigN5e7gHdRIREgDkAbxcA6abLBIAPkgATEAAzaQZWVBQ-JGwpCFYAJRASEAAPAFkRZlFUAD0AbVxqAC8AXQBqAAFCkoqAHTr3GrLygBIogF8Op0prKjEHXT79Zm1WXDhWCQn4AE9sSoCYczh3UewrPVpDYwkoAM3rO0HnN08ZUQ5ooNCpcMjo2PiklIysnJQCgAZqAE4K9pILo9YZIAaIXqg2ijODxCZTVBfJHIlGHHjbIwmVAwAZgNEqcFDPrQsbw6ZwOYbTBAA&i=N4KABGBECGCuAuALA9gJ0gLjAbXBKApgLbQCWANgAIAmyJpAdgHQDGdkANHhJAIwBMAZgAsAVgBsnblABSyRAzAARZAUh4AuiAC+QA) + +The partition mask will rely on conditions to identify specific cases and apply a defined list of masks for each case. 
Example configuration: + +```yaml +- selector: + jsonpath: "ID" + mask: + partitions: # only the first active condition will execute + - name: case1 + when: '{{ regexMatch "P[A-Z]{3}[0-9]{3}" .ID }}' + then: + # List of masks for case 1 + - constant: "this is case 1" + - name: case2 + when: '{{ regexMatch "G[0-9]{11}" .ID }}' + then: + # List of masks for case 2 + - constant: "this is case 2" + - name: default # a case with no "when" condition is always active, use it last as a fallback + then: + # List of masks for unrecognized cases + - constant: "this is another case" +``` + +[Return to list of masks](#possible-masks) + ### FindInCSV [![Try it](https://img.shields.io/badge/-Try%20it%20in%20PIMO%20Play-brightgreen)](https://cgi-fr.github.io/pimo-play/#c=G4UwTgzglg9gdgLgAQCICMKBQBbAhhAayjgHMFMkkBaJCEAGxAGMAXGMcyrpAKwngAOuFgAtkKYgDMYWbnkIRO3aklwATNUnGzlNScTUBJOAGEAygDUlyrgFcwUcSJYsBigPTuSUCCwB03qK2AEa2dGBM8CwgcP6R2O64YNje9IwQ7mgAnAAswUySkgDMAKwADGVoIADsIMElRbgATLgAHME5OVXtTUwAbO5guADu7llNTRX5Zbh91UVqJUwgTWhoM7jqOSWdIGp9TDmVZZJ9rdXuAjAEINjwfkwQwDo2lCAAHrisALLCTGIUV7cR7AZAAcgA3hCABQGD5IPyoAAqAE8BCAkBgAJRIAA+SHoMGG4CQAF9SWDAUC3rEwCjxFC-Cw0SAAPpockvV48L5MJJqazUkHgxkAOVw2Ax+MJxLAZIpVOpMRYdIZEL8cAlUpl4E51K4ipsH3RrD24mEVEY+BYVHgIC5NhEIHU4GQKtsIENyhVUGwbrAHswQA&i=N4KABGBEAuCeAOBTA+gRkgLigMwJYCdFIAacKAOwEMBbIrSAY0v1vIBNF9IQBfIA) diff --git a/internal/app/pimo/pimo.go b/internal/app/pimo/pimo.go index 2232938d..a541cc42 100755 --- a/internal/app/pimo/pimo.go +++ b/internal/app/pimo/pimo.go @@ -45,6 +45,7 @@ import ( "github.com/cgi-fr/pimo/pkg/markov" "github.com/cgi-fr/pimo/pkg/model" "github.com/cgi-fr/pimo/pkg/parquet" + "github.com/cgi-fr/pimo/pkg/partition" "github.com/cgi-fr/pimo/pkg/pipe" "github.com/cgi-fr/pimo/pkg/randdate" "github.com/cgi-fr/pimo/pkg/randdura" @@ -343,6 +344,7 @@ func injectMaskFactories() []model.MaskFactory { sequence.Factory, sha3.Factory, apply.Factory, + partition.Factory, } } diff --git a/pkg/model/model.go b/pkg/model/model.go index fcf4c4c2..ad5a48b0 100755 --- 
a/pkg/model/model.go +++ b/pkg/model/model.go @@ -241,6 +241,12 @@ type ApplyType struct { URI string `yaml:"uri" json:"uri" jsonschema_description:"URI of the mask resource"` } +type PartitionType struct { + Name string `yaml:"name" json:"name" jsonschema_description:"name of the partition"` + When string `yaml:"when,omitempty" json:"when,omitempty" jsonschema_description:"template to execute, if true the condition is active"` + Then []MaskType `yaml:"then" json:"then" jsonschema_description:"list of masks to execute if the condition is active"` +} + type MaskType struct { Add Entry `yaml:"add,omitempty" json:"add,omitempty" jsonschema:"oneof_required=Add,title=Add Mask,description=Add a new field in the JSON stream"` AddTransient Entry `yaml:"add-transient,omitempty" json:"add-transient,omitempty" jsonschema:"oneof_required=AddTransient,title=Add Transient Mask" jsonschema_description:"Add a new temporary field, that will not show in the JSON output"` @@ -280,6 +286,7 @@ type MaskType struct { Sequence SequenceType `yaml:"sequence,omitempty" json:"sequence,omitempty" jsonschema:"oneof_required=Sequence,title=Sequence Mask" jsonschema_description:"Generate a sequenced ID that follows specified format"` Sha3 Sha3Type `yaml:"sha3,omitempty" json:"sha3,omitempty" jsonschema:"oneof_required=Sha3,title=Sha3 Mask" jsonschema_description:"Generate a variable-length crytographic hash (collision resistant)"` Apply ApplyType `yaml:"apply,omitempty" json:"apply,omitempty" jsonschema:"oneof_required=Apply,title=Apply Mask" jsonschema_description:"Call external masking file"` + Partition []PartitionType `yaml:"partitions,omitempty" json:"partitions,omitempty" jsonschema:"oneof_required=Partition,title=Partition Mask" jsonschema_description:"Identify specific cases and apply a defined list of masks for each case"` } type Masking struct { diff --git a/pkg/partition/partition.go b/pkg/partition/partition.go new file mode 100644 index 00000000..8cc4ecd3 --- /dev/null +++ 
b/pkg/partition/partition.go @@ -0,0 +1,142 @@ +package partition + +import ( + "bytes" + "hash/fnv" + tmpl "text/template" + + "github.com/cgi-fr/pimo/pkg/template" + + "github.com/cgi-fr/pimo/pkg/model" + "github.com/rs/zerolog/log" +) + +type MaskEngine struct { + partitions []Partition + seed int64 + seeder model.Seeder +} + +type Partition struct { + name string + when *template.Engine + exec model.Pipeline +} + +func buildDefinition(masks []model.MaskType, globalSeed int64) model.Definition { + definition := model.Definition{ + Version: "1", + Seed: globalSeed, + Functions: nil, + Masking: []model.Masking{}, + Caches: nil, + } + + for _, mask := range masks { + definition.Masking = append(definition.Masking, model.Masking{ + Selector: model.SelectorType{Jsonpath: "."}, + Mask: mask, + }) + } + + return definition +} + +// NewMask returns a MaskEngine built from the given partition definitions. +func NewMask(partitions []model.PartitionType, caches map[string]model.Cache, fns tmpl.FuncMap, seed int64, seeder model.Seeder, seedField string) (MaskEngine, error) { + parts := []Partition{} + + // Build one pipeline per partition; a nil template marks a partition without a "when" condition. + for _, partition := range partitions { + template, err := template.NewEngine(partition.When, fns, seed, seedField) + if err != nil { + return MaskEngine{}, err + } + + if partition.When == "" { + template = nil + } + + definition := buildDefinition(partition.Then, seed) + pipeline := model.NewPipeline(nil) + pipeline, _, err = model.BuildPipeline(pipeline, definition, caches, fns, "", "") + if err != nil { + return MaskEngine{}, err + } + + parts = append(parts, Partition{ + name: partition.Name, + when: template, + exec: pipeline, + }) + } + + return MaskEngine{parts, seed, seeder}, nil +} + +func execPipeline(pipeline model.Pipeline, e model.Entry) (model.Entry, error) { + var result []model.Entry + + err := pipeline. + WithSource(model.NewSourceFromSlice([]model.Dictionary{model.NewDictionary().With(".", e)})). 
+ // Process(model.NewCounterProcessWithCallback("internal", 1, updateContext)). + AddSink(model.NewSinkToSlice(&result)). + Run() + if err != nil { + return nil, err + } + + if len(result) == 0 { + return nil, nil + } + + return result[0], nil +} + +func (me MaskEngine) Mask(e model.Entry, context ...model.Dictionary) (model.Entry, error) { + log.Info().Msg("Mask partition") + + // evaluate partitions in declaration order; only the first active one is executed + for _, partition := range me.partitions { + var output bytes.Buffer + + if partition.when != nil { + if err := partition.when.Execute(&output, context[0].UnpackUnordered()); err != nil { + return nil, err + } + } else { + output.WriteString("true") + } + + if output.String() == "true" { + log.Info().Msgf("Mask partition - executing partition %s", partition.name) + + result, err := execPipeline(partition.exec, e) + if err != nil { + return e, err + } + + return result, nil + } + } + + return e, nil +} + +// Factory creates a partition mask from a configuration. +func Factory(conf model.MaskFactoryConfiguration) (model.MaskEngine, bool, error) { + if len(conf.Masking.Mask.Partition) > 0 { + seeder := model.NewSeeder(conf.Masking.Seed.Field, conf.Seed) + + // set different seeds for different jsonpaths + h := fnv.New64a() + h.Write([]byte(conf.Masking.Selector.Jsonpath)) + conf.Seed += int64(h.Sum64()) //nolint:gosec + mask, err := NewMask(conf.Masking.Mask.Partition, conf.Cache, conf.Functions, conf.Seed, seeder, conf.Masking.Seed.Field) + if err != nil { + return mask, true, err + } + return mask, true, nil + } + return nil, false, nil +} diff --git a/schema/v1/pimo.schema.json b/schema/v1/pimo.schema.json index 88a4efca..bfc49234 100644 --- a/schema/v1/pimo.schema.json +++ b/schema/v1/pimo.schema.json @@ -584,6 +584,12 @@ "apply" ], "title": "Apply" + }, + { + "required": [ + "partitions" + ], + "title": "Partition" } ], "properties": { @@ -778,6 +784,14 @@ "$ref": "#/$defs/ApplyType", "title": "Apply Mask", "description": "Call external masking file" + }, + "partitions": { + 
"items": { + "$ref": "#/$defs/PartitionType" + }, + "type": "array", + "title": "Partition Mask", + "description": "Identify specific cases and apply a defined list of masks for each case" } }, "additionalProperties": false, @@ -877,6 +891,31 @@ "name" ] }, + "PartitionType": { + "properties": { + "name": { + "type": "string", + "description": "name of the partition" + }, + "when": { + "type": "string", + "description": "template to execute, if true the condition is active" + }, + "then": { + "items": { + "$ref": "#/$defs/MaskType" + }, + "type": "array", + "description": "list of masks to execute if the condition is active" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "name", + "then" + ] + }, "PipeType": { "properties": { "masking": { diff --git a/test/suites/masking_partition.yml b/test/suites/masking_partition.yml new file mode 100644 index 00000000..abab8228 --- /dev/null +++ b/test/suites/masking_partition.yml @@ -0,0 +1,45 @@ +name: partition mask +testcases: +- name: simple partition with default case + steps: + - script: |- + cat > masking.yml <