diff --git a/src/adapter/etl-adapter-elasticsearch/src/Flow/ETL/Adapter/Elasticsearch/EntryIdFactory/HashIdFactory.php b/src/adapter/etl-adapter-elasticsearch/src/Flow/ETL/Adapter/Elasticsearch/EntryIdFactory/HashIdFactory.php index 4742b31b5..926bd43b8 100644 --- a/src/adapter/etl-adapter-elasticsearch/src/Flow/ETL/Adapter/Elasticsearch/EntryIdFactory/HashIdFactory.php +++ b/src/adapter/etl-adapter-elasticsearch/src/Flow/ETL/Adapter/Elasticsearch/EntryIdFactory/HashIdFactory.php @@ -5,7 +5,7 @@ namespace Flow\ETL\Adapter\Elasticsearch\EntryIdFactory; use Flow\ETL\Adapter\Elasticsearch\IdFactory; -use Flow\ETL\Exception\InvalidArgumentException; +use Flow\ETL\Hash\{Algorithm, NativePHPHash}; use Flow\ETL\Row; use Flow\ETL\Row\Entry; @@ -16,32 +16,28 @@ final class HashIdFactory implements IdFactory */ private array $entryNames; - private string $hashName = 'xxh128'; + private Algorithm $hashAlgorithm; public function __construct(string ...$entryNames) { $this->entryNames = $entryNames; + $this->hashAlgorithm = new NativePHPHash(); } public function create(Row $row) : Entry { return new Entry\StringEntry( 'id', - \hash( - $this->hashName, + $this->hashAlgorithm->hash( \implode(':', \array_map(fn (string $name) : string => (string) $row->valueOf($name), $this->entryNames)) ) ); } - public function withAlgorithm(string $hashName) : self + public function withAlgorithm(Algorithm $algorithm) : self { - if (!\in_array($hashName, \hash_algos(), true)) { - throw InvalidArgumentException::because('Unsupported hash algorithm name provided: ' . $hashName . ', did you mean: ' . \implode(', ', \hash_algos())); - } - $factory = new self(...$this->entryNames); - $factory->hashName = $hashName; + $factory->hashAlgorithm = $algorithm; return $factory; } diff --git a/src/adapter/etl-adapter-elasticsearch/tests/Flow/ETL/Adapter/Elasticsearch/Tests/Unit/EntryIdFactory/HashIdFactoryTest.php b/src/adapter/etl-adapter-elasticsearch/tests/Flow/ETL/Adapter/Elasticsearch/Tests/Unit/EntryIdFactory/HashIdFactoryTest.php index 4b7025620..59b56e173 100644 --- a/src/adapter/etl-adapter-elasticsearch/tests/Flow/ETL/Adapter/Elasticsearch/Tests/Unit/EntryIdFactory/HashIdFactoryTest.php +++ b/src/adapter/etl-adapter-elasticsearch/tests/Flow/ETL/Adapter/Elasticsearch/Tests/Unit/EntryIdFactory/HashIdFactoryTest.php @@ -6,7 +6,7 @@ use function Flow\ETL\DSL\str_entry; use Flow\ETL\Adapter\Elasticsearch\EntryIdFactory\HashIdFactory; -use Flow\ETL\Exception\InvalidArgumentException; +use Flow\ETL\Hash\NativePHPHash; use Flow\ETL\Row; use PHPUnit\Framework\TestCase; @@ -29,7 +29,7 @@ public function test_create_row() : void public function test_create_row_with_different_hash() : void { - $factory = (new HashIdFactory('first_name', 'last_name'))->withAlgorithm('sha1'); + $factory = (new HashIdFactory('first_name', 'last_name'))->withAlgorithm(new NativePHPHash('sha1')); self::assertEquals( new Row\Entry\StringEntry( @@ -41,12 +41,4 @@ public function test_create_row_with_different_hash() : void ) ); } - - public function test_invalid_hash_algorithm_name() : void - { - $this->expectException(InvalidArgumentException::class); - $this->expectExceptionMessage('Unsupported hash algorithm name provided: whatever, did you mean: '); - - (new HashIdFactory('first_name'))->withAlgorithm('whatever'); - } } diff --git a/src/core/etl/src/Flow/ETL/DSL/functions.php b/src/core/etl/src/Flow/ETL/DSL/functions.php index 7d0402a15..3d084942c 100644 --- a/src/core/etl/src/Flow/ETL/DSL/functions.php +++ b/src/core/etl/src/Flow/ETL/DSL/functions.php @@ -42,6 +42,8 @@ Flow, FlowContext, Formatter, + Hash\Algorithm, + Hash\NativePHPHash, Join\Comparison, Join\Comparison\Equal, Join\Comparison\Identical, @@ -593,9 +595,9 @@ function concat(ScalarFunction ...$functions) : Concat return new Concat(...$functions); } -function hash(ScalarFunction $function, string $algorithm = 'xxh128', bool $binary = false, array $options = []) : Hash +function hash(ScalarFunction $function, Algorithm $algorithm = new NativePHPHash()) : Hash { - return new Hash($function, $algorithm, $binary, $options); + return new Hash($function, $algorithm); } function cast(ScalarFunction $function, string|Type $type) : Cast diff --git a/src/core/etl/src/Flow/ETL/Function/Hash.php b/src/core/etl/src/Flow/ETL/Function/Hash.php index 96aa58395..530d0fb1f 100644 --- a/src/core/etl/src/Flow/ETL/Function/Hash.php +++ b/src/core/etl/src/Flow/ETL/Function/Hash.php @@ -4,19 +4,15 @@ namespace Flow\ETL\Function; +use Flow\ETL\Hash\{Algorithm, NativePHPHash}; use Flow\ETL\Row; final class Hash extends ScalarFunctionChain { public function __construct( private readonly ScalarFunction $ref, - private readonly string $algorithm = 'xxh128', - private readonly bool $binary = false, - private readonly array $options = [] + private readonly Algorithm $algorithm = new NativePHPHash(), ) { - if (!\in_array($this->algorithm, \hash_algos(), true)) { - throw new \InvalidArgumentException(\sprintf('Hash algorithm "%s" is not supported', $this->algorithm)); - } } public function eval(Row $row) : ?string @@ -27,8 +23,8 @@ public function eval(Row $row) : ?string return match ($value) { null => null, default => match (\gettype($value)) { - 'array', 'object' => \hash($this->algorithm, \serialize($value), $this->binary, $this->options), - default => \hash($this->algorithm, (string) $value, $this->binary, $this->options), + 'array', 'object' => $this->algorithm->hash(\serialize($value)), + default => $this->algorithm->hash((string) $value), } }; } diff --git a/src/core/etl/src/Flow/ETL/Function/ScalarFunctionChain.php b/src/core/etl/src/Flow/ETL/Function/ScalarFunctionChain.php index 7b50fe651..df8c7ad0d 100644 --- a/src/core/etl/src/Flow/ETL/Function/ScalarFunctionChain.php +++ b/src/core/etl/src/Flow/ETL/Function/ScalarFunctionChain.php @@ -10,6 +10,7 @@ use Flow\ETL\Function\ArrayExpand\ArrayExpand; use Flow\ETL\Function\ArraySort\Sort; use Flow\ETL\Function\Between\Boundary; +use Flow\ETL\Hash\{Algorithm, NativePHPHash}; use Flow\ETL\PHP\Type\Type; use Flow\ETL\Row\Entry; @@ -150,9 +151,9 @@ public function greaterThanEqual(ScalarFunction $ref) : self return new GreaterThanEqual($this, $ref); } - public function hash(string $algorithm = 'xxh128', bool $binary = false, array $options = []) : self + public function hash(Algorithm $algorithm = new NativePHPHash()) : self { - return new Hash($this, $algorithm, $binary, $options); + return new Hash($this, $algorithm); } public function isEven() : self diff --git a/src/core/etl/src/Flow/ETL/GroupBy.php b/src/core/etl/src/Flow/ETL/GroupBy.php index e1be73f66..1251c532d 100644 --- a/src/core/etl/src/Flow/ETL/GroupBy.php +++ b/src/core/etl/src/Flow/ETL/GroupBy.php @@ -7,6 +7,7 @@ use function Flow\ETL\DSL\array_to_rows; use Flow\ETL\Exception\{InvalidArgumentException, RuntimeException}; use Flow\ETL\Function\AggregatingFunction; +use Flow\ETL\Hash\NativePHPHash; use Flow\ETL\Row\{Reference, References}; final class GroupBy @@ -198,6 +199,6 @@ private function hash(array $values) : string } } - return \hash('xxh128', \implode('', $stringValues)); + return NativePHPHash::xxh128(\implode('', $stringValues)); } } diff --git a/src/core/etl/src/Flow/ETL/Hash/NativePHPHash.php b/src/core/etl/src/Flow/ETL/Hash/NativePHPHash.php index 7e7ee2947..646368912 100644 --- a/src/core/etl/src/Flow/ETL/Hash/NativePHPHash.php +++ b/src/core/etl/src/Flow/ETL/Hash/NativePHPHash.php @@ -13,6 +13,11 @@ public function __construct(private string $algorithm = 'xxh128', private bool $ } } + public static function xxh128(string $string) : string + { + return (new self('xxh128'))->hash($string); + } + public function hash(string $value) : string { return \hash($this->algorithm, $value, $this->binary, $this->options); diff --git a/src/core/etl/src/Flow/ETL/Pipeline/PartitioningPipeline.php b/src/core/etl/src/Flow/ETL/Pipeline/PartitioningPipeline.php index 0628ddf88..81533e90a 100644 --- a/src/core/etl/src/Flow/ETL/Pipeline/PartitioningPipeline.php +++ b/src/core/etl/src/Flow/ETL/Pipeline/PartitioningPipeline.php @@ -8,11 +8,13 @@ use Flow\ETL\Exception\InvalidArgumentException; use Flow\ETL\Extractor\CollectingExtractor; use Flow\ETL\Row\Reference; -use Flow\ETL\{Extractor, FlowContext, Loader, Pipeline, Transformer}; +use Flow\ETL\{Extractor, FlowContext, Hash\Algorithm, Hash\NativePHPHash, Loader, Pipeline, Transformer}; use Flow\Filesystem\Partition; final class PartitioningPipeline implements Pipeline { + private readonly Algorithm $hashAlgorithm; + /** * @param Pipeline $pipeline * @param array $partitionBy @@ -28,6 +30,7 @@ public function __construct( if (!\count($this->partitionBy)) { throw new InvalidArgumentException('PartitioningPipeline requires at least one partitionBy entry'); } + $this->hashAlgorithm = new NativePHPHash(); } public function add(Loader|Transformer $pipe) : Pipeline @@ -56,7 +59,7 @@ public function process(FlowContext $context) : \Generator $rows = $partitionedRows->sortBy(...$this->orderBy); - $partitionId = \hash('xxh128', $context->config->id() . '_' . \implode('_', \array_map( + $partitionId = $this->hashAlgorithm->hash($context->config->id() . '_' . \implode('_', \array_map( static fn (Partition $partition) : string => $partition->id(), $partitionedRows->partitions()->toArray() ))); diff --git a/src/core/etl/src/Flow/ETL/Row.php b/src/core/etl/src/Flow/ETL/Row.php index fbbd1aaf1..697f7f542 100644 --- a/src/core/etl/src/Flow/ETL/Row.php +++ b/src/core/etl/src/Flow/ETL/Row.php @@ -5,6 +5,7 @@ namespace Flow\ETL; use Flow\ETL\Exception\InvalidArgumentException; +use Flow\ETL\Hash\{Algorithm, NativePHPHash}; use Flow\ETL\Row\{Entries, Entry, Reference, References, Schema}; final class Row @@ -54,19 +55,15 @@ public function has(string|Reference $ref) : bool return $this->entries->has($ref); } - public function hash(string $algorithm = 'xxh128', bool $binary = false, array $options = []) : string + public function hash(Algorithm $algorithm = new NativePHPHash()) : string { - if (!\in_array($algorithm, \hash_algos(), true)) { - throw new \InvalidArgumentException(\sprintf('Hashing algorithm "%s" is not supported', $algorithm)); - } - $string = ''; foreach ($this->entries->sort()->all() as $entry) { $string .= $entry->name() . $entry->toString(); } - return \hash($algorithm, $string, $binary, $options); + return $algorithm->hash($string); } public function isEqual(self $row) : bool diff --git a/src/core/etl/src/Flow/ETL/Rows.php b/src/core/etl/src/Flow/ETL/Rows.php index 5ff8fd892..a5babc232 100644 --- a/src/core/etl/src/Flow/ETL/Rows.php +++ b/src/core/etl/src/Flow/ETL/Rows.php @@ -6,6 +6,7 @@ use function Flow\ETL\DSL\{array_to_rows, row}; use Flow\ETL\Exception\{DuplicatedEntriesException, InvalidArgumentException, RuntimeException}; +use Flow\ETL\Hash\{Algorithm, NativePHPHash}; use Flow\ETL\Join\Expression; use Flow\ETL\Row\CartesianProduct; use Flow\ETL\Row\Comparator\NativeComparator; @@ -267,19 +268,15 @@ public function getIterator() : \Iterator return new \ArrayIterator($this->rows); } - public function hash(string $algorithm = 'xxh128', bool $binary = false, array $options = []) : string + public function hash(Algorithm $algorithm = new NativePHPHash()) : string { - $hashes = []; - - if (!\in_array($algorithm, \hash_algos(), true)) { - throw new \InvalidArgumentException(\sprintf('Hashing algorithm "%s" is not supported', $algorithm)); - } + $hash = ''; foreach ($this->rows as $row) { - $hashes[] = $row->hash($algorithm, $binary, $options); + $hash .= $row->hash($algorithm); } - return \hash($algorithm, \implode('', $hashes), $binary, $options); + return $algorithm->hash($hash); } public function isPartitioned() : bool diff --git a/src/core/etl/src/Flow/ETL/Transformer/DropDuplicatesTransformer.php b/src/core/etl/src/Flow/ETL/Transformer/DropDuplicatesTransformer.php index d996efa92..cab2fda19 100644 --- a/src/core/etl/src/Flow/ETL/Transformer/DropDuplicatesTransformer.php +++ b/src/core/etl/src/Flow/ETL/Transformer/DropDuplicatesTransformer.php @@ -7,7 +7,7 @@ use Flow\ETL\Exception\InvalidArgumentException; use Flow\ETL\Row\Reference; use Flow\ETL\Transformer\DropDuplicates\Hashes; -use Flow\ETL\{FlowContext, Rows, Transformer}; +use Flow\ETL\{FlowContext, Hash\Algorithm, Hash\NativePHPHash, Rows, Transformer}; final class DropDuplicatesTransformer implements Transformer { @@ -18,6 +18,8 @@ final class DropDuplicatesTransformer implements Transformer */ private array $entries; + private Algorithm $hashAlgorithm; + public function __construct(string|Reference ...$entries) { if ([] === $entries) { @@ -26,6 +28,7 @@ public function __construct(string|Reference ...$entries) $this->entries = $entries; $this->deduplication = new Hashes(); + $this->hashAlgorithm = new NativePHPHash(); } public function transform(Rows $rows, FlowContext $context) : Rows @@ -43,7 +46,7 @@ public function transform(Rows $rows, FlowContext $context) : Rows } } - $hash = \hash('xxh128', \serialize($values)); + $hash = $this->hashAlgorithm->hash(\serialize($values)); if (!$this->deduplication->exists($hash)) { $newRows[] = $row; diff --git a/src/core/etl/tests/Flow/ETL/Tests/Integration/Function/HashTest.php b/src/core/etl/tests/Flow/ETL/Tests/Integration/Function/HashTest.php index b0c0c3904..d8ddb534a 100644 --- a/src/core/etl/tests/Flow/ETL/Tests/Integration/Function/HashTest.php +++ b/src/core/etl/tests/Flow/ETL/Tests/Integration/Function/HashTest.php @@ -6,6 +6,7 @@ use function Flow\ETL\DSL\{from_array, ref, to_memory}; use Flow\ETL\Flow; +use Flow\ETL\Hash\NativePHPHash; use Flow\ETL\Memory\ArrayMemory; use PHPUnit\Framework\Attributes\DataProvider; use PHPUnit\Framework\TestCase; @@ -52,7 +53,7 @@ public function test_hash_with_different_algorithm() : void ] ) ) - ->withEntry('hash', ref('key')->hash('sha512')) + ->withEntry('hash', ref('key')->hash(new NativePHPHash('sha512'))) ->write(to_memory($memory = new ArrayMemory())) ->run(); diff --git a/src/core/etl/tests/Flow/ETL/Tests/Unit/Function/HashTest.php b/src/core/etl/tests/Flow/ETL/Tests/Unit/Function/HashTest.php index ee47f5259..44ac9a4d1 100644 --- a/src/core/etl/tests/Flow/ETL/Tests/Unit/Function/HashTest.php +++ b/src/core/etl/tests/Flow/ETL/Tests/Unit/Function/HashTest.php @@ -5,6 +5,7 @@ namespace Flow\ETL\Tests\Unit\Function; use function Flow\ETL\DSL\{array_entry, concat, datetime_entry, hash, lit, ref, str_entry}; +use Flow\ETL\Hash\NativePHPHash; use Flow\ETL\Row; use PHPUnit\Framework\TestCase; @@ -21,8 +22,8 @@ public function test_hashing_array_value() : void public function test_hashing_concat() : void { self::assertSame( - \hash('xxh128', 'test_test'), - hash(concat(ref('value'), lit('_'), ref('value')))->eval(Row::create(str_entry('value', 'test'))) + NativePHPHash::xxh128('test_test'), + hash(concat(ref('value'), lit('_'), ref('value')), new NativePHPHash('xxh128'))->eval(Row::create(str_entry('value', 'test'))) ); } diff --git a/src/core/etl/tests/Flow/ETL/Tests/Unit/Hash/NativePHPHashTest.php b/src/core/etl/tests/Flow/ETL/Tests/Unit/Hash/NativePHPHashTest.php new file mode 100644 index 000000000..aa1bf3a24 --- /dev/null +++ b/src/core/etl/tests/Flow/ETL/Tests/Unit/Hash/NativePHPHashTest.php @@ -0,0 +1,36 @@ +hash('test'), + ); + } + + public function test_support_sha512_hash() : void + { + static::assertSame( + 'ee26b0dd4af7e749aa1a8ee3c10ae9923f618980772e473f8819a5d4940e0db27ac185f8a0e1d5f84f88bc887fd67b143732c304cc5fa9ad8e6f57f50028a8ff', + (new NativePHPHash('sha512'))->hash('test') + ); + + } +}