From 5fb70b477cdcac3866d93741602f530f29b6b6aa Mon Sep 17 00:00:00 2001 From: Norbert Orzechowicz Date: Sun, 4 Feb 2024 20:30:21 +0100 Subject: [PATCH] Added schema inferring example --- examples/topics/phar/data_frame/code.php | 3 +- examples/topics/schema/inferring/code.php | 31 +++++++++++++++++++ .../topics/schema/inferring/input/dataset.csv | 21 +++++++++++++ .../topics/schema/inferring/output/.gitignore | 2 ++ examples/topics/schema/validate/code.php | 14 ++++----- .../window_functions/dens_rank/code.php | 9 ++---- 6 files changed, 65 insertions(+), 15 deletions(-) create mode 100644 examples/topics/schema/inferring/code.php create mode 100644 examples/topics/schema/inferring/input/dataset.csv create mode 100644 examples/topics/schema/inferring/output/.gitignore diff --git a/examples/topics/phar/data_frame/code.php b/examples/topics/phar/data_frame/code.php index 4b19e8163..a272dde91 100644 --- a/examples/topics/phar/data_frame/code.php +++ b/examples/topics/phar/data_frame/code.php @@ -13,7 +13,8 @@ use function Flow\ETL\DSL\to_output; // flow.phar run examples/topics/phar/data_frame/code.php - +// when executing data processing pipeline through phar make sure to not use any trigger, like ->run(); +// this is handled by the phar internally. return df() ->read(from_rows(rows( row(int_entry('id', 1), array_entry('array', ['a' => 1, 'b' => 2, 'c' => 3])), diff --git a/examples/topics/schema/inferring/code.php b/examples/topics/schema/inferring/code.php new file mode 100644 index 000000000..25fe44c6b --- /dev/null +++ b/examples/topics/schema/inferring/code.php @@ -0,0 +1,31 @@ +<?php + +declare(strict_types=1); + +use function Flow\ETL\Adapter\CSV\from_csv; +use function Flow\ETL\DSL\df; +use function Flow\ETL\DSL\schema_from_json; +use function Flow\ETL\DSL\schema_to_json; +use function Flow\ETL\DSL\to_output; +use Flow\ETL\Loader\StreamLoader\Output; + +require __DIR__ . '/../../../autoload.php'; + +if (!\file_exists(__DIR__ . '/output/schema.json')) { + $schema = df() + ->read(from_csv(__DIR__ . '/input/dataset.csv')) + ->limit(100) // Limiting the number of rows to read will speed up the process but might bring less accurate results + ->autoCast() + ->schema(); + + \file_put_contents(__DIR__ . '/output/schema.json', schema_to_json($schema)); +} else { + $schema = schema_from_json(\file_get_contents(__DIR__ . 
'/output/schema.json')); +} + +// Reading schemaless data formats with predefined schema can significantly improve performance +df() + ->read(from_csv(__DIR__ . '/input/dataset.csv', schema: $schema)) + ->collect() + ->write(to_output(truncate: false, output: Output::rows_and_schema)) + ->run(); diff --git a/examples/topics/schema/inferring/input/dataset.csv b/examples/topics/schema/inferring/input/dataset.csv new file mode 100644 index 000000000..fb7d4947f --- /dev/null +++ b/examples/topics/schema/inferring/input/dataset.csv @@ -0,0 +1,21 @@ +Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees +1,8cC6B5992C0309c,Acevedo LLC,https://www.donovan.com/,Holy See (Vatican City State),Multi-channeled bottom-line core,2019,Graphic Design / Web Design,7070 +2,ec094061FeaF7Bc,Walls-Mcdonald,http://arias-willis.net/,Lithuania,Compatible encompassing groupware,2005,Utilities,8156 +3,DAcC5dbc58946A7,Gregory PLC,http://www.lynch-hoover.net/,Tokelau,Multi-channeled intangible help-desk,2019,Leisure / Travel,6121 +4,8Dd7beDa37FbeD0,"Byrd, Patterson and Knox",https://www.james-velez.net/,Netherlands,Pre-emptive national function,1982,Furniture,3494 +5,a3b5c54AEC163e4,Mcdowell-Hopkins,http://fuentes.com/,Mayotte,Cloned bifurcated solution,2016,Online Publishing,36 +6,fDfEBeFDaEb59Af,Hayden and Sons,https://www.shaw-mooney.info/,Belize,Persistent mobile task-force,1978,Insurance,7010 +7,752ef90Eae1f7f5,Castro LLC,http://wilkinson.com/,Jamaica,Advanced value-added definition,2008,Outsourcing / Offshoring,2526 +8,B1D4c5CA34f9992,"Barajas, Baird and Shaw",http://www.jordan-harvey.com/,United States of America,Stand-alone bandwidth-monitored algorithm,2000,Wholesale,4478 +9,Cfa1a44106faD4B,"Lucas, Galloway and Benjamin",http://silva.info/,Western Sahara,Persevering leadingedge ability,1990,Retail Industry,8223 +10,C08fcf292AB17DF,"Barker, Hubbard and Bennett",http://www.allen.biz/,Mauritania,Decentralized fault-tolerant 
functionalities,2014,Museums / Institutions,7716 +11,94B9bEedc626820,Underwood-Mitchell,https://www.leonard.com/,Italy,Compatible dynamic support,1992,Fine Art,4564 +12,FE42dEd40f5DfD8,"Lester, Ochoa and Franco",http://www.munoz.com/,Timor-Leste,Vision-oriented dynamic conglomeration,2014,Motion Pictures / Film,8075 +13,1F861fAbeDdCFea,"Arias, Jackson and Hester",https://hardin-thompson.com/,Algeria,Switchable maximized synergy,1980,Utilities,1319 +14,456de7dE1ab18ca,Riggs and Sons,http://klein-benton.info/,Czech Republic,Object-based discrete orchestration,2012,Law Enforcement,4946 +15,457bcfFF18A7DD2,Stanley LLC,https://bowman.com/,Eritrea,Self-enabling 24/7 groupware,1984,Executive Office,4980 +16,5B5ea5aea34dc5F,Page-Ware,http://lam-soto.com/,Togo,Realigned mobile groupware,1991,Entertainment / Movie Production,1307 +17,A66F35C298Dfd82,"Garner, Melton and Burgess",https://mathews-knox.com/,Guinea-Bissau,Automated 5thgeneration complexity,2003,E - Learning,9038 +18,EdAC2EF13734E0B,Andersen-Fuentes,http://www.mann.com/,Oman,Ameliorated coherent database,1991,Textiles,6436 +19,dD1612190b24B12,Ford-Rice,https://peterson-irwin.com/,Turks and Caicos Islands,Sharable intangible leverage,1971,Computer / Network Security,3038 +20,992CAdffccEebEa,Collins-Figueroa,http://www.holt-bartlett.info/,Mongolia,Realigned multi-state installation,1985,Aviation / Aerospace,9420 \ No newline at end of file diff --git a/examples/topics/schema/inferring/output/.gitignore b/examples/topics/schema/inferring/output/.gitignore new file mode 100644 index 000000000..c96a04f00 --- /dev/null +++ b/examples/topics/schema/inferring/output/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file diff --git a/examples/topics/schema/validate/code.php b/examples/topics/schema/validate/code.php index 6c245cbf7..cba1c10bc 100644 --- a/examples/topics/schema/validate/code.php +++ b/examples/topics/schema/validate/code.php @@ -14,18 +14,18 @@ require __DIR__ . 
'/../../../autoload.php'; +$schema = schema( + int_schema('id', $nullable = false), + str_schema('name', $nullable = true), + bool_schema('active', $nullable = false, Metadata::empty()->add('key', 'value')), +); + df() ->read(from_array([ ['id' => 1, 'name' => 'Product 1', 'active' => true], ['id' => 2, 'name' => 'Product 2', 'active' => false], ['id' => 3, 'name' => 'Product 3', 'active' => true], ])) - ->validate( - schema( - int_schema('id', $nullable = false), - str_schema('name', $nullable = true), - bool_schema('active', $nullable = false, Metadata::empty()->add('key', 'value')), - ) - ) + ->validate($schema) ->write(to_output(false, Output::rows_and_schema)) ->run(); diff --git a/examples/topics/window_functions/dens_rank/code.php b/examples/topics/window_functions/dens_rank/code.php index 7dcd06f90..9d7224fdc 100644 --- a/examples/topics/window_functions/dens_rank/code.php +++ b/examples/topics/window_functions/dens_rank/code.php @@ -24,10 +24,5 @@ ) ->withEntry('rank', dense_rank()->over(window()->partitionBy(ref('department'))->orderBy(ref('salary')->desc()))) ->sortBy(ref('department'), ref('rank')) - ->write(to_output(false)); - -if ($_ENV['FLOW_PHAR_APP'] ?? false) { - return $df; -} - -$df->run(); + ->write(to_output(false)) + ->run();