From 64839810e957ab03dff9f52712481269acca5393 Mon Sep 17 00:00:00 2001 From: Shivam Thapar Date: Wed, 20 Nov 2024 14:32:56 -0500 Subject: [PATCH] Fix Issue #169 - NA is still converted to NaN even in a string column Copied from this PR: https://github.com/go-gota/gota/pull/175 --- dataframe/dataframe.go | 55 ++++++++++++++++++++----------------- dataframe/dataframe_test.go | 43 +++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 25 deletions(-) diff --git a/dataframe/dataframe.go b/dataframe/dataframe.go index 51d38f6..a68a78f 100644 --- a/dataframe/dataframe.go +++ b/dataframe/dataframe.go @@ -305,14 +305,14 @@ func (df DataFrame) Subset(indexes series.Indexes) DataFrame { // SelectIndexes are the supported indexes used for the DataFrame.Select method. Currently supported are: // -// int // Matches the given index number -// []int // Matches all given index numbers -// []bool // Matches all columns marked as true -// string // Matches the column with the matching column name -// []string // Matches all columns with the matching column names -// Series [Int] // Same as []int -// Series [Bool] // Same as []bool -// Series [String] // Same as []string +// int // Matches the given index number +// []int // Matches all given index numbers +// []bool // Matches all columns marked as true +// string // Matches the column with the matching column name +// []string // Matches all columns with the matching column names +// Series [Int] // Same as []int +// Series [Bool] // Same as []bool +// Series [String] // Same as []string type SelectIndexes interface{} // Select the given DataFrame columns @@ -382,7 +382,7 @@ func (df DataFrame) Drop(indexes SelectIndexes) DataFrame { const KEY_ERROR = "KEY_ERROR" -//GroupBy Group dataframe by columns +// GroupBy Group dataframe by columns func (df DataFrame) GroupBy(colnames ...string) *Groups { if len(colnames) <= 0 { return nil @@ -434,7 +434,7 @@ func (df DataFrame) GroupBy(colnames ...string) *Groups { return groups } 
-//AggregationType Aggregation method type +// AggregationType Aggregation method type type AggregationType int //go:generate stringer -type=AggregationType -linecomment @@ -448,7 +448,7 @@ const ( Aggregation_COUNT // COUNT ) -//Groups : structure generated by groupby +// Groups : structure generated by groupby type Groups struct { groups map[string]DataFrame colnames []string @@ -1039,23 +1039,23 @@ func WithComments(b rune) LoadOption { // // Examples: // -// // field will be ignored -// field int +// // field will be ignored +// field int // -// // Field will be ignored -// Field int `dataframe:"-"` +// // Field will be ignored +// Field int `dataframe:"-"` // -// // Field will be parsed with column name Field and type int -// Field int +// // Field will be parsed with column name Field and type int +// Field int // -// // Field will be parsed with column name `field_column` and type int. -// Field int `dataframe:"field_column"` +// // Field will be parsed with column name `field_column` and type int. +// Field int `dataframe:"field_column"` // -// // Field will be parsed with column name `field` and type string. -// Field int `dataframe:"field,string"` +// // Field will be parsed with column name `field` and type string. +// Field int `dataframe:"field,string"` // -// // Field will be parsed with column name `Field` and type string. -// Field int `dataframe:",string"` +// // Field will be parsed with column name `Field` and type string. +// Field int `dataframe:",string"` // // If the struct tags and the given LoadOptions contradict each other, the later // will have preference over the former. 
@@ -1219,17 +1219,22 @@ func LoadRecords(records [][]string, options ...LoadOption) DataFrame { types := make([]series.Type, len(headers)) rawcols := make([][]string, len(headers)) for i, colname := range headers { + t, useCustomType := cfg.types[colname] rawcol := make([]string, len(records)) for j := 0; j < len(records); j++ { rawcol[j] = records[j][i] + if useCustomType && t == series.String { + // skip the conversion when using custom string type + continue + } if findInStringSlice(rawcol[j], cfg.nanValues) != -1 { rawcol[j] = "NaN" } } rawcols[i] = rawcol - t, ok := cfg.types[colname] - if !ok { + // try to auto detect the data type + if !useCustomType { t = cfg.defaultType if cfg.detectTypes { if l, err := findType(rawcol); err == nil { diff --git a/dataframe/dataframe_test.go b/dataframe/dataframe_test.go index 6cb0c2b..2ca8f70 100644 --- a/dataframe/dataframe_test.go +++ b/dataframe/dataframe_test.go @@ -1420,6 +1420,49 @@ Spain,2012-02-01,66,555.42,00241 } } +// test case for issue #169 +func TestReadCSV_Issue169(t *testing.T) { + // Load the data from a CSV string and try to infer the type of the + // columns, but NA won't be converted to NaN when data type is specified + // as string. 
+ const ExampleData = ` +Country,Region,Date,Age,Amount,Id +"United States",NA,2012-02-01,50,112.1,01234 +"United States",US,2012-02-01,32,321.31,54320 +"United Kingdom",GB,2012-02-01,17,18.2,12345 +"United States",NA,2012-02-01,32,321.31,54320 +"United States","NA",2012-02-01,17,321.31,54320 +"United Kingdom",GB,2012-02-01,NA,18.2,12345 +"United States",NA,2012-02-01,32,321.31,54320 +Spain,EU,2012-02-01,66,555.42,00241 +` + + df := ReadCSV( + strings.NewReader(ExampleData), + WithTypes(map[string]series.Type{ + "Region": series.String, + "Age": series.String, + }), + ) + + if df.Err != nil { + t.Errorf("Expected success, got error: %v", df.Err) + } + + for _, v := range df.Col("Region").Records() { + if v == "NaN" { + t.Errorf("Expected not to convert NA to NaN, but it does") + } + } + + for _, v := range df.Col("Age").Records() { + if v == "NaN" { + t.Errorf("Expected not to convert NA to NaN, but it does") + } + } + +} + func TestReadJSON(t *testing.T) { table := []struct { jsonStr string