-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added automated detection of CSV separator and enclosure
- Loading branch information
1 parent
d6a2ab7
commit f3b5b23
Showing
8 changed files
with
320 additions
and
118 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
94 changes: 94 additions & 0 deletions
94
src/adapter/etl-adapter-csv/src/Flow/ETL/Adapter/CSV/Detector/Option.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Flow\ETL\Adapter\CSV\Detector; | ||
|
||
use Flow\ETL\Exception\InvalidArgumentException; | ||
|
||
/**
 * A single candidate CSV dialect: separator, enclosure and escape character.
 *
 * Sample lines are fed in through parse(); isValid() and score() then report
 * how well this candidate fits the lines parsed so far.
 */
final class Option
{
    private const COLUMN_SCORE_WEIGHT = 100_000;

    private const COLUMNS_LENGTH_WEIGHT = 10_000;

    /**
     * Rows parsed so far, one str_getcsv() result per parsed line.
     *
     * @var array<int, array<int, null|string>>
     */
    private array $rows;

    /**
     * @param string $separator single character used to split columns
     * @param string $enclosure single character used to wrap column values
     * @param string $escape escape character forwarded to str_getcsv()
     *
     * @throws InvalidArgumentException when separator or enclosure is not exactly one character long
     */
    public function __construct(
        public string $separator,
        public string $enclosure,
        public string $escape = '\\'
    ) {
        if (\mb_strlen($this->separator) !== 1) {
            throw new InvalidArgumentException('Separator must be a single character');
        }

        if (\mb_strlen($this->enclosure) !== 1) {
            throw new InvalidArgumentException('Enclosure must be a single character');
        }

        $this->rows = [];
    }

    /**
     * Tells whether every parsed row has the same number of columns and that
     * number is not 1 (a single column means the separator never matched).
     *
     * Returns true when nothing has been parsed yet.
     */
    public function isValid() : bool
    {
        $columnsCount = null;

        foreach ($this->rows as $row) {
            $rowColumns = \count($row);

            if ($columnsCount === null) {
                // The first row defines the expected number of columns.
                $columnsCount = $rowColumns;

                continue;
            }

            if ($columnsCount !== $rowColumns) {
                return false;
            }
        }

        return $columnsCount !== 1;
    }

    /**
     * Parses one line with this candidate's separator, enclosure and escape
     * character and stores the resulting row.
     */
    public function parse(string $line) : void
    {
        // Pass the escape explicitly: otherwise the configured value is ignored
        // and str_getcsv() falls back to its own default.
        $this->rows[] = \str_getcsv($line, $this->separator, $this->enclosure, $this->escape);
    }

    /**
     * Returns a fresh candidate with the same separator, enclosure and escape
     * character but without any parsed rows.
     */
    public function reset() : self
    {
        return new self($this->separator, $this->enclosure, $this->escape);
    }

    /**
     * Scores this candidate: 0 when there are no rows or the rows are not
     * valid, otherwise a value dominated by the number of columns plus a
     * smaller bonus that shrinks as the total length of string columns grows.
     */
    public function score() : int
    {
        if (!\count($this->rows) || !$this->isValid()) {
            return 0;
        }

        $columnScore = \count($this->rows[0]) * self::COLUMN_SCORE_WEIGHT;

        $totalLength = 0;

        foreach ($this->rows as $row) {
            foreach ($row as $column) {
                // str_getcsv() may yield null columns; only strings contribute length.
                if (\is_string($column)) {
                    $totalLength += \mb_strlen($column);
                }
            }
        }

        // +1 keeps the division defined when every column is empty.
        $lengthScore = (int) \round((1 / ($totalLength + 1) * self::COLUMNS_LENGTH_WEIGHT));

        return $columnScore + $lengthScore;
    }
}
85 changes: 85 additions & 0 deletions
85
src/adapter/etl-adapter-csv/src/Flow/ETL/Adapter/CSV/Detector/Options.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Flow\ETL\Adapter\CSV\Detector; | ||
|
||
use Flow\ETL\Adapter\CSV\Exception\CantDetectCSVOptions; | ||
|
||
/**
 * A collection of candidate CSV options that are parsed, filtered and ranked
 * together.
 */
final class Options
{
    /**
     * @var array<Option>
     */
    private array $options;

    /**
     * @param array<Option> $options
     */
    public function __construct(array $options)
    {
        $this->options = $options;
    }

    /**
     * Builds the collection of every separator/enclosure combination taken
     * from the built-in candidate lists.
     */
    public static function all() : self
    {
        $separators = [',', "\t", ';', '|', ' ', '_', '-', ':', '~', '@', '#', '$', '%', '^', '&', '*', '(', ')', '+', '=', '?', '!', '\\', '/', '.', '>', '<'];
        $enclosures = ['"', "'"];

        $options = [];

        foreach ($separators as $separator) {
            foreach ($enclosures as $enclosure) {
                $options[] = new Option($separator, $enclosure);
            }
        }

        return new self($options);
    }

    /**
     * Returns the option with the highest score; on ties the earliest option
     * in the collection wins.
     *
     * @throws CantDetectCSVOptions when the collection is empty
     */
    public function best() : Option
    {
        $best = null;
        $bestScore = 0;

        foreach ($this->options as $option) {
            // Score each option once and remember the current best score
            // instead of recomputing it on every comparison.
            $score = $option->score();

            if ($best === null || $score > $bestScore) {
                $best = $option;
                $bestScore = $score;
            }
        }

        if ($best === null) {
            throw new CantDetectCSVOptions('No best option found');
        }

        return $best;
    }

    /**
     * Returns a new collection containing only the options that report
     * themselves as valid.
     */
    public function onlyValid() : self
    {
        // array_filter() preserves keys; re-index so the collection stays a list.
        return new self(\array_values(
            \array_filter($this->options, static fn (Option $option) : bool => $option->isValid())
        ));
    }

    /**
     * Feeds the given line to every option in the collection.
     */
    public function parse(string $line) : void
    {
        foreach ($this->options as $option) {
            $option->parse($line);
        }
    }

    /**
     * Returns a new collection built from the result of reset() on every option.
     */
    public function reset() : self
    {
        $options = [];

        foreach ($this->options as $option) {
            $options[] = $option->reset();
        }

        return new self($options);
    }
}
11 changes: 11 additions & 0 deletions
11
src/adapter/etl-adapter-csv/src/Flow/ETL/Adapter/CSV/Exception/CantDetectCSVOptions.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Flow\ETL\Adapter\CSV\Exception; | ||
|
||
use Flow\ETL\Exception\RuntimeException; | ||
|
||
/**
 * Raised by the CSV options detector when no candidate option is available
 * to pick as the best one.
 */
final class CantDetectCSVOptions extends RuntimeException
{
}
Oops, something went wrong.