Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use generic hash algorithm #1122

Merged
merged 6 commits into from
Jul 20, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
namespace Flow\ETL\Adapter\Elasticsearch\EntryIdFactory;

use Flow\ETL\Adapter\Elasticsearch\IdFactory;
use Flow\ETL\Exception\InvalidArgumentException;
use Flow\ETL\Hash\{Algorithm, NativePHPHash};
use Flow\ETL\Row;
use Flow\ETL\Row\Entry;

Expand All @@ -16,32 +16,28 @@ final class HashIdFactory implements IdFactory
*/
private array $entryNames;

private string $hashName = 'xxh128';
private Algorithm $hashAlgorithm;

/**
 * @param string ...$entryNames names of the row entries whose values are combined into the generated id
 */
public function __construct(string ...$entryNames)
{
    // No arguments are passed, so NativePHPHash falls back to its own default algorithm.
    $this->hashAlgorithm = new NativePHPHash();
    $this->entryNames = $entryNames;
}

public function create(Row $row) : Entry
{
return new Entry\StringEntry(
'id',
\hash(
$this->hashName,
$this->hashAlgorithm->hash(
\implode(':', \array_map(fn (string $name) : string => (string) $row->valueOf($name), $this->entryNames))
)
);
}

public function withAlgorithm(string $hashName) : self
public function withAlgorithm(Algorithm $algorithm) : self
{
if (!\in_array($hashName, \hash_algos(), true)) {
throw InvalidArgumentException::because('Unsupported hash algorithm name provided: ' . $hashName . ', did you mean: ' . \implode(', ', \hash_algos()));
}

$factory = new self(...$this->entryNames);
$factory->hashName = $hashName;
$factory->hashAlgorithm = $algorithm;

return $factory;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

use function Flow\ETL\DSL\str_entry;
use Flow\ETL\Adapter\Elasticsearch\EntryIdFactory\HashIdFactory;
use Flow\ETL\Exception\InvalidArgumentException;
use Flow\ETL\Hash\NativePHPHash;
use Flow\ETL\Row;
use PHPUnit\Framework\TestCase;

Expand All @@ -29,7 +29,7 @@ public function test_create_row() : void

public function test_create_row_with_different_hash() : void
{
$factory = (new HashIdFactory('first_name', 'last_name'))->withAlgorithm('sha1');
$factory = (new HashIdFactory('first_name', 'last_name'))->withAlgorithm(new NativePHPHash('sha1'));

self::assertEquals(
new Row\Entry\StringEntry(
Expand All @@ -41,12 +41,4 @@ public function test_create_row_with_different_hash() : void
)
);
}

public function test_invalid_hash_algorithm_name() : void
{
$this->expectException(InvalidArgumentException::class);
$this->expectExceptionMessage('Unsupported hash algorithm name provided: whatever, did you mean: ');

(new HashIdFactory('first_name'))->withAlgorithm('whatever');
}
}
6 changes: 4 additions & 2 deletions src/core/etl/src/Flow/ETL/DSL/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
Flow,
FlowContext,
Formatter,
Hash\Algorithm,
Hash\NativePHPHash,
Join\Comparison,
Join\Comparison\Equal,
Join\Comparison\Identical,
Expand Down Expand Up @@ -593,9 +595,9 @@ function concat(ScalarFunction ...$functions) : Concat
return new Concat(...$functions);
}

function hash(ScalarFunction $function, string $algorithm = 'xxh128', bool $binary = false, array $options = []) : Hash
function hash(ScalarFunction $function, Algorithm $algorithm = new NativePHPHash()) : Hash
norberttech marked this conversation as resolved.
Show resolved Hide resolved
{
return new Hash($function, $algorithm, $binary, $options);
return new Hash($function, $algorithm);
}

function cast(ScalarFunction $function, string|Type $type) : Cast
Expand Down
12 changes: 4 additions & 8 deletions src/core/etl/src/Flow/ETL/Function/Hash.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,15 @@

namespace Flow\ETL\Function;

use Flow\ETL\Hash\{Algorithm, NativePHPHash};
use Flow\ETL\Row;

final class Hash extends ScalarFunctionChain
{
public function __construct(
private readonly ScalarFunction $ref,
private readonly string $algorithm = 'xxh128',
private readonly bool $binary = false,
private readonly array $options = []
private readonly Algorithm $algorithm = new NativePHPHash(),
) {
if (!\in_array($this->algorithm, \hash_algos(), true)) {
throw new \InvalidArgumentException(\sprintf('Hash algorithm "%s" is not supported', $this->algorithm));
}
}

public function eval(Row $row) : ?string
Expand All @@ -27,8 +23,8 @@ public function eval(Row $row) : ?string
return match ($value) {
null => null,
default => match (\gettype($value)) {
'array', 'object' => \hash($this->algorithm, \serialize($value), $this->binary, $this->options),
default => \hash($this->algorithm, (string) $value, $this->binary, $this->options),
'array', 'object' => $this->algorithm->hash(\serialize($value)),
default => $this->algorithm->hash((string) $value),
}
};
}
Expand Down
5 changes: 3 additions & 2 deletions src/core/etl/src/Flow/ETL/Function/ScalarFunctionChain.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
use Flow\ETL\Function\ArrayExpand\ArrayExpand;
use Flow\ETL\Function\ArraySort\Sort;
use Flow\ETL\Function\Between\Boundary;
use Flow\ETL\Hash\{Algorithm, NativePHPHash};
use Flow\ETL\PHP\Type\Type;
use Flow\ETL\Row\Entry;

Expand Down Expand Up @@ -150,9 +151,9 @@ public function greaterThanEqual(ScalarFunction $ref) : self
return new GreaterThanEqual($this, $ref);
}

public function hash(string $algorithm = 'xxh128', bool $binary = false, array $options = []) : self
public function hash(Algorithm $algorithm = new NativePHPHash()) : self
{
return new Hash($this, $algorithm, $binary, $options);
return new Hash($this, $algorithm);
}

public function isEven() : self
Expand Down
3 changes: 2 additions & 1 deletion src/core/etl/src/Flow/ETL/GroupBy.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
use function Flow\ETL\DSL\array_to_rows;
use Flow\ETL\Exception\{InvalidArgumentException, RuntimeException};
use Flow\ETL\Function\AggregatingFunction;
use Flow\ETL\Hash\NativePHPHash;
use Flow\ETL\Row\{Reference, References};

final class GroupBy
Expand Down Expand Up @@ -198,6 +199,6 @@ private function hash(array $values) : string
}
}

return \hash('xxh128', \implode('', $stringValues));
return NativePHPHash::xxh128(\implode('', $stringValues));
}
}
5 changes: 5 additions & 0 deletions src/core/etl/src/Flow/ETL/Hash/NativePHPHash.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ public function __construct(private string $algorithm = 'xxh128', private bool $
}
}

/**
 * Shortcut for hashing a string with the 'xxh128' algorithm.
 *
 * Builds a fresh instance passing only the algorithm name, so every other
 * constructor argument keeps its default, and delegates to hash().
 */
public static function xxh128(string $string) : string
{
    $hasher = new self('xxh128');

    return $hasher->hash($string);
}

public function hash(string $value) : string
{
return \hash($this->algorithm, $value, $this->binary, $this->options);
Expand Down
7 changes: 5 additions & 2 deletions src/core/etl/src/Flow/ETL/Pipeline/PartitioningPipeline.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
use Flow\ETL\Exception\InvalidArgumentException;
use Flow\ETL\Extractor\CollectingExtractor;
use Flow\ETL\Row\Reference;
use Flow\ETL\{Extractor, FlowContext, Loader, Pipeline, Transformer};
use Flow\ETL\{Extractor, FlowContext, Hash\Algorithm, Hash\NativePHPHash, Loader, Pipeline, Transformer};
use Flow\Filesystem\Partition;

final class PartitioningPipeline implements Pipeline
{
private readonly Algorithm $hashAlgorithm;

/**
* @param Pipeline $pipeline
* @param array<Reference> $partitionBy
Expand All @@ -28,6 +30,7 @@ public function __construct(
if (!\count($this->partitionBy)) {
throw new InvalidArgumentException('PartitioningPipeline requires at least one partitionBy entry');
}
$this->hashAlgorithm = new NativePHPHash();
}

public function add(Loader|Transformer $pipe) : Pipeline
Expand Down Expand Up @@ -56,7 +59,7 @@ public function process(FlowContext $context) : \Generator

$rows = $partitionedRows->sortBy(...$this->orderBy);

$partitionId = \hash('xxh128', $context->config->id() . '_' . \implode('_', \array_map(
$partitionId = $this->hashAlgorithm->hash($context->config->id() . '_' . \implode('_', \array_map(
static fn (Partition $partition) : string => $partition->id(),
$partitionedRows->partitions()->toArray()
)));
Expand Down
9 changes: 3 additions & 6 deletions src/core/etl/src/Flow/ETL/Row.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
namespace Flow\ETL;

use Flow\ETL\Exception\InvalidArgumentException;
use Flow\ETL\Hash\{Algorithm, NativePHPHash};
use Flow\ETL\Row\{Entries, Entry, Reference, References, Schema};

final class Row
Expand Down Expand Up @@ -54,19 +55,15 @@ public function has(string|Reference $ref) : bool
return $this->entries->has($ref);
}

public function hash(string $algorithm = 'xxh128', bool $binary = false, array $options = []) : string
public function hash(Algorithm $algorithm = new NativePHPHash()) : string
{
if (!\in_array($algorithm, \hash_algos(), true)) {
throw new \InvalidArgumentException(\sprintf('Hashing algorithm "%s" is not supported', $algorithm));
}

$string = '';

foreach ($this->entries->sort()->all() as $entry) {
$string .= $entry->name() . $entry->toString();
}

return \hash($algorithm, $string, $binary, $options);
return $algorithm->hash($string);
}

public function isEqual(self $row) : bool
Expand Down
11 changes: 4 additions & 7 deletions src/core/etl/src/Flow/ETL/Rows.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

use function Flow\ETL\DSL\{array_to_rows, row};
use Flow\ETL\Exception\{DuplicatedEntriesException, InvalidArgumentException, RuntimeException};
use Flow\ETL\Hash\{Algorithm, NativePHPHash};
use Flow\ETL\Join\Expression;
use Flow\ETL\Row\CartesianProduct;
use Flow\ETL\Row\Comparator\NativeComparator;
Expand Down Expand Up @@ -267,19 +268,15 @@ public function getIterator() : \Iterator
return new \ArrayIterator($this->rows);
}

public function hash(string $algorithm = 'xxh128', bool $binary = false, array $options = []) : string
public function hash(Algorithm $algorithm = new NativePHPHash()) : string
{
$hashes = [];

if (!\in_array($algorithm, \hash_algos(), true)) {
throw new \InvalidArgumentException(\sprintf('Hashing algorithm "%s" is not supported', $algorithm));
}

foreach ($this->rows as $row) {
$hashes[] = $row->hash($algorithm, $binary, $options);
$hashes[] = $row->hash($algorithm);
}

return \hash($algorithm, \implode('', $hashes), $binary, $options);
return $algorithm->hash(\implode('', $hashes));
norberttech marked this conversation as resolved.
Show resolved Hide resolved
}

public function isPartitioned() : bool
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
use Flow\ETL\Exception\InvalidArgumentException;
use Flow\ETL\Row\Reference;
use Flow\ETL\Transformer\DropDuplicates\Hashes;
use Flow\ETL\{FlowContext, Rows, Transformer};
use Flow\ETL\{FlowContext, Hash\Algorithm, Hash\NativePHPHash, Rows, Transformer};

final class DropDuplicatesTransformer implements Transformer
{
Expand All @@ -18,6 +18,8 @@ final class DropDuplicatesTransformer implements Transformer
*/
private array $entries;

private Algorithm $hashAlgorithm;

public function __construct(string|Reference ...$entries)
{
if ([] === $entries) {
Expand All @@ -26,6 +28,7 @@ public function __construct(string|Reference ...$entries)

$this->entries = $entries;
$this->deduplication = new Hashes();
$this->hashAlgorithm = new NativePHPHash();
}

public function transform(Rows $rows, FlowContext $context) : Rows
Expand All @@ -43,7 +46,7 @@ public function transform(Rows $rows, FlowContext $context) : Rows
}
}

$hash = \hash('xxh128', \serialize($values));
$hash = $this->hashAlgorithm->hash(\serialize($values));

if (!$this->deduplication->exists($hash)) {
$newRows[] = $row;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

use function Flow\ETL\DSL\{from_array, ref, to_memory};
use Flow\ETL\Flow;
use Flow\ETL\Hash\NativePHPHash;
use Flow\ETL\Memory\ArrayMemory;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;
Expand Down Expand Up @@ -52,7 +53,7 @@ public function test_hash_with_different_algorithm() : void
]
)
)
->withEntry('hash', ref('key')->hash('sha512'))
->withEntry('hash', ref('key')->hash(new NativePHPHash('sha512')))
->write(to_memory($memory = new ArrayMemory()))
->run();

Expand Down
5 changes: 3 additions & 2 deletions src/core/etl/tests/Flow/ETL/Tests/Unit/Function/HashTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
namespace Flow\ETL\Tests\Unit\Function;

use function Flow\ETL\DSL\{array_entry, concat, datetime_entry, hash, lit, ref, str_entry};
use Flow\ETL\Hash\NativePHPHash;
use Flow\ETL\Row;
use PHPUnit\Framework\TestCase;

Expand All @@ -21,8 +22,8 @@ public function test_hashing_array_value() : void
public function test_hashing_concat() : void
{
self::assertSame(
\hash('xxh128', 'test_test'),
hash(concat(ref('value'), lit('_'), ref('value')))->eval(Row::create(str_entry('value', 'test')))
NativePHPHash::xxh128('test_test'),
hash(concat(ref('value'), lit('_'), ref('value')), new NativePHPHash('xxh128'))->eval(Row::create(str_entry('value', 'test')))
);
}

Expand Down
36 changes: 36 additions & 0 deletions src/core/etl/tests/Flow/ETL/Tests/Unit/Hash/NativePHPHashTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

declare(strict_types=1);

namespace Flow\ETL\Tests\Unit\Hash;

use Flow\ETL\Hash\NativePHPHash;
use PHPUnit\Framework\TestCase;

/**
 * Unit tests for NativePHPHash: the static xxh128 shortcut, the algorithm used
 * when the constructor is called without arguments, and an explicitly chosen algorithm.
 */
final class NativePHPHashTest extends TestCase
{
    public function test_hashing_string_using_xxh128_by_default() : void
    {
        // Construct without arguments on purpose: this is the only test that
        // pins the constructor's default algorithm instead of the static shortcut.
        self::assertSame(
            '6c78e0e3bd51d358d01e758642b85fb8',
            (new NativePHPHash())->hash('test'),
        );
    }

    public function test_hashing_xxh128_by_static_call() : void
    {
        self::assertSame(
            '6c78e0e3bd51d358d01e758642b85fb8',
            NativePHPHash::xxh128('test'),
        );
    }

    public function test_support_sha512_hash() : void
    {
        self::assertSame(
            'ee26b0dd4af7e749aa1a8ee3c10ae9923f618980772e473f8819a5d4940e0db27ac185f8a0e1d5f84f88bc887fd67b143732c304cc5fa9ad8e6f57f50028a8ff',
            (new NativePHPHash('sha512'))->hash('test'),
        );
    }
}
Loading