diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex index da27b437e..148baa695 100644 --- a/lib/explorer/polars_backend/data_frame.ex +++ b/lib/explorer/polars_backend/data_frame.ex @@ -195,11 +195,6 @@ defmodule Explorer.PolarsBackend.DataFrame do {columns, with_projection} = column_names_or_projection(columns) - dtypes_list = - if not Enum.empty?(dtypes) do - Map.to_list(dtypes) - end - df = Native.df_load_csv( contents, @@ -212,7 +207,7 @@ defmodule Explorer.PolarsBackend.DataFrame do delimiter, true, columns, - dtypes_list, + Map.to_list(dtypes), encoding, nil_values, parse_dates, diff --git a/native/explorer/src/dataframe/io.rs b/native/explorer/src/dataframe/io.rs index 676b36c68..aea24dfa9 100644 --- a/native/explorer/src/dataframe/io.rs +++ b/native/explorer/src/dataframe/io.rs @@ -47,7 +47,13 @@ pub fn df_from_csv( _ => CsvEncoding::Utf8, }; - let dataframe = CsvReadOptions::default() + let read_options = if dtypes.is_empty() { + CsvReadOptions::default() + } else { + CsvReadOptions::default().with_schema_overwrite(Some(schema_from_dtypes_pairs(dtypes)?)) + }; + + let dataframe = read_options .with_infer_schema_length(infer_schema_length) .with_has_header(has_header) .with_n_rows(stop_after_n_rows) @@ -56,7 +62,6 @@ pub fn df_from_csv( .with_projection(projection.map(Arc::new)) .with_rechunk(do_rechunk) .with_columns(column_names.map(Arc::from)) - .with_schema_overwrite(Some(schema_from_dtypes_pairs(dtypes)?)) .with_parse_options( CsvParseOptions::default() .with_encoding(encoding) @@ -152,7 +157,7 @@ pub fn df_load_csv( delimiter_as_byte: u8, do_rechunk: bool, column_names: Option>, - dtypes: Option>, + dtypes: Vec<(&str, ExSeriesDtype)>, encoding: &str, null_vals: Vec, parse_dates: bool, @@ -165,9 +170,10 @@ pub fn df_load_csv( let cursor = Cursor::new(binary.as_slice()); - let read_options = match dtypes { - Some(val) => CsvReadOptions::default().with_schema(Some(schema_from_dtypes_pairs(val)?)), - None => CsvReadOptions::default(), + let read_options = if dtypes.is_empty() { + CsvReadOptions::default() + } else { + CsvReadOptions::default().with_schema_overwrite(Some(schema_from_dtypes_pairs(dtypes)?)) }; let dataframe = read_options diff --git a/test/explorer/data_frame/csv_test.exs b/test/explorer/data_frame/csv_test.exs index 8e4fe5925..ac5579c5b 100644 --- a/test/explorer/data_frame/csv_test.exs +++ b/test/explorer/data_frame/csv_test.exs @@ -78,6 +78,130 @@ defmodule Explorer.DataFrame.CSVTest do assert city[13] == "Aberdeen, Aberdeen City, UK" end + test "load_csv/2 dtypes - all as strings" do + csv = + """ + id,first_name,last_name,email,gender,ip_address,salary,latitude,longitude + 1,Torey,Geraghty,email@shutterfly.com,Male,119.110.38.172,14036.68,38.9187037,-76.9611991 + 2,Nevin,Mandrake,email@ovh.net,Male,161.2.124.233,32530.27,41.4176872,-8.7653155 + 3,Melisenda,Guiso,email@wp.com,Female,192.152.64.134,9177.8,21.3772424,110.2485736 + 4,Noble,Doggett,email@springer.com,Male,252.234.29.244,20328.76,37.268428,55.1487513 + 5,Janaya,Claypoole,email@infoseek.co.jp,Female,150.191.214.252,21442.93,15.3553417,120.5293228 + 6,Sarah,Hugk,email@bbb.org,Female,211.158.246.13,79709.16,28.168408,120.482198 + 7,Ulberto,Simenon,email@unblog.fr,Male,206.56.108.90,16248.98,48.4046776,-0.9746208 + 8,Kevon,Lingner,email@dyndns.org,Male,181.71.212.116,7497.64,-23.351784,-47.6931718 + 9,Sada,Garbert,email@flavors.me,Female,170.42.190.231,15969.95,30.3414125,114.1543243 + 10,Salmon,Shoulders,email@prweb.com,Male,68.138.106.143,19996.71,49.2152833,17.7687416 + """ + + headers = ~w(id first_name last_name email gender ip_address salary latitude longitude) + + # Out of order on purpose. + df = DF.load_csv!(csv, dtypes: for(l <- Enum.shuffle(headers), do: {l, :string})) + + assert DF.names(df) == headers + + assert DF.to_columns(df, atom_keys: true) == %{ + email: [ + "email@shutterfly.com", + "email@ovh.net", + "email@wp.com", + "email@springer.com", + "email@infoseek.co.jp", + "email@bbb.org", + "email@unblog.fr", + "email@dyndns.org", + "email@flavors.me", + "email@prweb.com" + ], + first_name: [ + "Torey", + "Nevin", + "Melisenda", + "Noble", + "Janaya", + "Sarah", + "Ulberto", + "Kevon", + "Sada", + "Salmon" + ], + gender: [ + "Male", + "Male", + "Female", + "Male", + "Female", + "Female", + "Male", + "Male", + "Female", + "Male" + ], + id: ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"], + ip_address: [ + "119.110.38.172", + "161.2.124.233", + "192.152.64.134", + "252.234.29.244", + "150.191.214.252", + "211.158.246.13", + "206.56.108.90", + "181.71.212.116", + "170.42.190.231", + "68.138.106.143" + ], + last_name: [ + "Geraghty", + "Mandrake", + "Guiso", + "Doggett", + "Claypoole", + "Hugk", + "Simenon", + "Lingner", + "Garbert", + "Shoulders" + ], + latitude: [ + "38.9187037", + "41.4176872", + "21.3772424", + "37.268428", + "15.3553417", + "28.168408", + "48.4046776", + "-23.351784", + "30.3414125", + "49.2152833" + ], + longitude: [ + "-76.9611991", + "-8.7653155", + "110.2485736", + "55.1487513", + "120.5293228", + "120.482198", + "-0.9746208", + "-47.6931718", + "114.1543243", + "17.7687416" + ], + salary: [ + "14036.68", + "32530.27", + "9177.8", + "20328.76", + "21442.93", + "79709.16", + "16248.98", + "7497.64", + "15969.95", + "19996.71" + ] + } + end + def assert_csv(type, csv_value, parsed_value, from_csv_options) do data = "column\n#{csv_value}\n" # parsing should work as expected @@ -182,6 +306,131 @@ defmodule Explorer.DataFrame.CSVTest do } end + @tag :tmp_dir + test "dtypes - all as strings", config do + csv = + tmp_csv(config.tmp_dir, """ + id,first_name,last_name,email,gender,ip_address,salary,latitude,longitude + 1,Torey,Geraghty,email@shutterfly.com,Male,119.110.38.172,14036.68,38.9187037,-76.9611991 + 2,Nevin,Mandrake,email@ovh.net,Male,161.2.124.233,32530.27,41.4176872,-8.7653155 + 3,Melisenda,Guiso,email@wp.com,Female,192.152.64.134,9177.8,21.3772424,110.2485736 + 4,Noble,Doggett,email@springer.com,Male,252.234.29.244,20328.76,37.268428,55.1487513 + 5,Janaya,Claypoole,email@infoseek.co.jp,Female,150.191.214.252,21442.93,15.3553417,120.5293228 + 6,Sarah,Hugk,email@bbb.org,Female,211.158.246.13,79709.16,28.168408,120.482198 + 7,Ulberto,Simenon,email@unblog.fr,Male,206.56.108.90,16248.98,48.4046776,-0.9746208 + 8,Kevon,Lingner,email@dyndns.org,Male,181.71.212.116,7497.64,-23.351784,-47.6931718 + 9,Sada,Garbert,email@flavors.me,Female,170.42.190.231,15969.95,30.3414125,114.1543243 + 10,Salmon,Shoulders,email@prweb.com,Male,68.138.106.143,19996.71,49.2152833,17.7687416 + """) + + headers = ~w(id first_name last_name email gender ip_address salary latitude longitude) + + # Out of order on purpose. + df = DF.from_csv!(csv, dtypes: for(l <- Enum.shuffle(headers), do: {l, :string})) + + assert DF.names(df) == headers + + assert DF.to_columns(df, atom_keys: true) == %{ + email: [ + "email@shutterfly.com", + "email@ovh.net", + "email@wp.com", + "email@springer.com", + "email@infoseek.co.jp", + "email@bbb.org", + "email@unblog.fr", + "email@dyndns.org", + "email@flavors.me", + "email@prweb.com" + ], + first_name: [ + "Torey", + "Nevin", + "Melisenda", + "Noble", + "Janaya", + "Sarah", + "Ulberto", + "Kevon", + "Sada", + "Salmon" + ], + gender: [ + "Male", + "Male", + "Female", + "Male", + "Female", + "Female", + "Male", + "Male", + "Female", + "Male" + ], + id: ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"], + ip_address: [ + "119.110.38.172", + "161.2.124.233", + "192.152.64.134", + "252.234.29.244", + "150.191.214.252", + "211.158.246.13", + "206.56.108.90", + "181.71.212.116", + "170.42.190.231", + "68.138.106.143" + ], + last_name: [ + "Geraghty", + "Mandrake", + "Guiso", + "Doggett", + "Claypoole", + "Hugk", + "Simenon", + "Lingner", + "Garbert", + "Shoulders" + ], + latitude: [ + "38.9187037", + "41.4176872", + "21.3772424", + "37.268428", + "15.3553417", + "28.168408", + "48.4046776", + "-23.351784", + "30.3414125", + "49.2152833" + ], + longitude: [ + "-76.9611991", + "-8.7653155", + "110.2485736", + "55.1487513", + "120.5293228", + "120.482198", + "-0.9746208", + "-47.6931718", + "114.1543243", + "17.7687416" + ], + salary: [ + "14036.68", + "32530.27", + "9177.8", + "20328.76", + "21442.93", + "79709.16", + "16248.98", + "7497.64", + "15969.95", + "19996.71" + ] + } + end + @tag :tmp_dir test "dtypes - parse datetime", config do csv =