Skip to content

Commit

Permalink
Added schema inferring example
Browse files Browse the repository at this point in the history
  • Loading branch information
norberttech committed Feb 4, 2024
1 parent 811264c commit 5fb70b4
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 15 deletions.
3 changes: 2 additions & 1 deletion examples/topics/phar/data_frame/code.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
use function Flow\ETL\DSL\to_output;

// flow.phar run examples/topics/phar/data_frame/code.php

// When executing a data processing pipeline through the phar, make sure not to use any trigger, like ->run();
// triggering is handled by the phar internally.
return df()
->read(from_rows(rows(
row(int_entry('id', 1), array_entry('array', ['a' => 1, 'b' => 2, 'c' => 3])),
Expand Down
31 changes: 31 additions & 0 deletions examples/topics/schema/inferring/code.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?php

declare(strict_types=1);

use function Flow\ETL\Adapter\CSV\from_csv;
use function Flow\ETL\DSL\df;
use function Flow\ETL\DSL\schema_from_json;
use function Flow\ETL\DSL\schema_to_json;
use function Flow\ETL\DSL\to_output;
use Flow\ETL\Loader\StreamLoader\Output;

require __DIR__ . '/../../../autoload.php';

// Both paths are used more than once below, so name them once.
$datasetPath = __DIR__ . '/input/dataset.csv';
$schemaPath = __DIR__ . '/output/schema.json';

if (!\file_exists($schemaPath)) {
    // No cached schema yet: derive it from the dataset and store it as JSON so later runs can reuse it.
    $schema = df()
        ->read(from_csv($datasetPath))
        ->limit(100) // Limiting the number of rows to read will speed up the process but might bring less accurate results
        ->autoCast()
        ->schema();

    // Caching is best-effort: if the write fails the in-memory $schema is still used below.
    \file_put_contents($schemaPath, schema_to_json($schema));
} else {
    $schemaJson = \file_get_contents($schemaPath);

    // file_get_contents() returns false on failure; with strict_types enabled, passing that
    // to schema_from_json() would fail with an unrelated TypeError, so report the real cause instead.
    if ($schemaJson === false) {
        throw new \RuntimeException("Unable to read cached schema file: {$schemaPath}");
    }

    $schema = schema_from_json($schemaJson);
}

// Reading schemaless data formats with a predefined schema can significantly improve performance
df()
    ->read(from_csv($datasetPath, schema: $schema))
    ->collect()
    ->write(to_output(truncate: false, output: Output::rows_and_schema))
    ->run();
21 changes: 21 additions & 0 deletions examples/topics/schema/inferring/input/dataset.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
1,8cC6B5992C0309c,Acevedo LLC,https://www.donovan.com/,Holy See (Vatican City State),Multi-channeled bottom-line core,2019,Graphic Design / Web Design,7070
2,ec094061FeaF7Bc,Walls-Mcdonald,http://arias-willis.net/,Lithuania,Compatible encompassing groupware,2005,Utilities,8156
3,DAcC5dbc58946A7,Gregory PLC,http://www.lynch-hoover.net/,Tokelau,Multi-channeled intangible help-desk,2019,Leisure / Travel,6121
4,8Dd7beDa37FbeD0,"Byrd, Patterson and Knox",https://www.james-velez.net/,Netherlands,Pre-emptive national function,1982,Furniture,3494
5,a3b5c54AEC163e4,Mcdowell-Hopkins,http://fuentes.com/,Mayotte,Cloned bifurcated solution,2016,Online Publishing,36
6,fDfEBeFDaEb59Af,Hayden and Sons,https://www.shaw-mooney.info/,Belize,Persistent mobile task-force,1978,Insurance,7010
7,752ef90Eae1f7f5,Castro LLC,http://wilkinson.com/,Jamaica,Advanced value-added definition,2008,Outsourcing / Offshoring,2526
8,B1D4c5CA34f9992,"Barajas, Baird and Shaw",http://www.jordan-harvey.com/,United States of America,Stand-alone bandwidth-monitored algorithm,2000,Wholesale,4478
9,Cfa1a44106faD4B,"Lucas, Galloway and Benjamin",http://silva.info/,Western Sahara,Persevering leadingedge ability,1990,Retail Industry,8223
10,C08fcf292AB17DF,"Barker, Hubbard and Bennett",http://www.allen.biz/,Mauritania,Decentralized fault-tolerant functionalities,2014,Museums / Institutions,7716
11,94B9bEedc626820,Underwood-Mitchell,https://www.leonard.com/,Italy,Compatible dynamic support,1992,Fine Art,4564
12,FE42dEd40f5DfD8,"Lester, Ochoa and Franco",http://www.munoz.com/,Timor-Leste,Vision-oriented dynamic conglomeration,2014,Motion Pictures / Film,8075
13,1F861fAbeDdCFea,"Arias, Jackson and Hester",https://hardin-thompson.com/,Algeria,Switchable maximized synergy,1980,Utilities,1319
14,456de7dE1ab18ca,Riggs and Sons,http://klein-benton.info/,Czech Republic,Object-based discrete orchestration,2012,Law Enforcement,4946
15,457bcfFF18A7DD2,Stanley LLC,https://bowman.com/,Eritrea,Self-enabling 24/7 groupware,1984,Executive Office,4980
16,5B5ea5aea34dc5F,Page-Ware,http://lam-soto.com/,Togo,Realigned mobile groupware,1991,Entertainment / Movie Production,1307
17,A66F35C298Dfd82,"Garner, Melton and Burgess",https://mathews-knox.com/,Guinea-Bissau,Automated 5thgeneration complexity,2003,E - Learning,9038
18,EdAC2EF13734E0B,Andersen-Fuentes,http://www.mann.com/,Oman,Ameliorated coherent database,1991,Textiles,6436
19,dD1612190b24B12,Ford-Rice,https://peterson-irwin.com/,Turks and Caicos Islands,Sharable intangible leverage,1971,Computer / Network Security,3038
20,992CAdffccEebEa,Collins-Figueroa,http://www.holt-bartlett.info/,Mongolia,Realigned multi-state installation,1985,Aviation / Aerospace,9420
2 changes: 2 additions & 0 deletions examples/topics/schema/inferring/output/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
14 changes: 7 additions & 7 deletions examples/topics/schema/validate/code.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,18 @@

require __DIR__ . '/../../../autoload.php';

$schema = schema(
int_schema('id', $nullable = false),
str_schema('name', $nullable = true),
bool_schema('active', $nullable = false, Metadata::empty()->add('key', 'value')),
);

df()
->read(from_array([
['id' => 1, 'name' => 'Product 1', 'active' => true],
['id' => 2, 'name' => 'Product 2', 'active' => false],
['id' => 3, 'name' => 'Product 3', 'active' => true],
]))
->validate(
schema(
int_schema('id', $nullable = false),
str_schema('name', $nullable = true),
bool_schema('active', $nullable = false, Metadata::empty()->add('key', 'value')),
)
)
->validate($schema)
->write(to_output(false, Output::rows_and_schema))
->run();
9 changes: 2 additions & 7 deletions examples/topics/window_functions/dens_rank/code.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,5 @@
)
->withEntry('rank', dense_rank()->over(window()->partitionBy(ref('department'))->orderBy(ref('salary')->desc())))
->sortBy(ref('department'), ref('rank'))
->write(to_output(false));

if ($_ENV['FLOW_PHAR_APP'] ?? false) {
return $df;
}

$df->run();
->write(to_output(false))
->run();

0 comments on commit 5fb70b4

Please sign in to comment.