Fix Issue go-gota#169 - NA is still converted to NaN even in a string column

Copied from this PR: go-gota#175
runway · Nov 20, 2024 · 6483981 · 6483981
1 parent f705409
commit 6483981
Show file tree

Hide file tree

Showing 2 changed files with 73 additions and 25 deletions.
diff --git a/dataframe/dataframe.go b/dataframe/dataframe.go
@@ -305,14 +305,14 @@ func (df DataFrame) Subset(indexes series.Indexes) DataFrame {
 
 // SelectIndexes are the supported indexes used for the DataFrame.Select method. Currently supported are:
 //
-//     int              // Matches the given index number
-//     []int            // Matches all given index numbers
-//     []bool           // Matches all columns marked as true
-//     string           // Matches the column with the matching column name
-//     []string         // Matches all columns with the matching column names
-//     Series [Int]     // Same as []int
-//     Series [Bool]    // Same as []bool
-//     Series [String]  // Same as []string
+//	int              // Matches the given index number
+//	[]int            // Matches all given index numbers
+//	[]bool           // Matches all columns marked as true
+//	string           // Matches the column with the matching column name
+//	[]string         // Matches all columns with the matching column names
+//	Series [Int]     // Same as []int
+//	Series [Bool]    // Same as []bool
+//	Series [String]  // Same as []string
 type SelectIndexes interface{}
 
 // Select the given DataFrame columns
@@ -382,7 +382,7 @@ func (df DataFrame) Drop(indexes SelectIndexes) DataFrame {
 
 const KEY_ERROR = "KEY_ERROR"
 
-//GroupBy Group dataframe by columns
+// GroupBy Group dataframe by columns
 func (df DataFrame) GroupBy(colnames ...string) *Groups {
 	if len(colnames) <= 0 {
 		return nil
@@ -434,7 +434,7 @@ func (df DataFrame) GroupBy(colnames ...string) *Groups {
 	return groups
 }
 
-//AggregationType Aggregation method type
+// AggregationType Aggregation method type
 type AggregationType int
 
 //go:generate stringer -type=AggregationType -linecomment
@@ -448,7 +448,7 @@ const (
 	Aggregation_COUNT                             // COUNT
 )
 
-//Groups : structure generated by groupby
+// Groups : structure generated by groupby
 type Groups struct {
 	groups      map[string]DataFrame
 	colnames    []string
@@ -1039,23 +1039,23 @@ func WithComments(b rune) LoadOption {
 //
 // Examples:
 //
-//    // field will be ignored
-//    field int
+//	// field will be ignored
+//	field int
 //
-//    // Field will be ignored
-//    Field int `dataframe:"-"`
+//	// Field will be ignored
+//	Field int `dataframe:"-"`
 //
-//    // Field will be parsed with column name Field and type int
-//    Field int
+//	// Field will be parsed with column name Field and type int
+//	Field int
 //
-//    // Field will be parsed with column name `field_column` and type int.
-//    Field int `dataframe:"field_column"`
+//	// Field will be parsed with column name `field_column` and type int.
+//	Field int `dataframe:"field_column"`
 //
-//    // Field will be parsed with column name `field` and type string.
-//    Field int `dataframe:"field,string"`
+//	// Field will be parsed with column name `field` and type string.
+//	Field int `dataframe:"field,string"`
 //
-//    // Field will be parsed with column name `Field` and type string.
-//    Field int `dataframe:",string"`
+//	// Field will be parsed with column name `Field` and type string.
+//	Field int `dataframe:",string"`
 //
 // If the struct tags and the given LoadOptions contradict each other, the later
 // will have preference over the former.
@@ -1219,17 +1219,22 @@ func LoadRecords(records [][]string, options ...LoadOption) DataFrame {
 	types := make([]series.Type, len(headers))
 	rawcols := make([][]string, len(headers))
 	for i, colname := range headers {
+		t, useCustomType := cfg.types[colname]
 		rawcol := make([]string, len(records))
 		for j := 0; j < len(records); j++ {
 			rawcol[j] = records[j][i]
+			if useCustomType && t == series.String {
+				// skip the NaN conversion when the column is explicitly typed as string
+				continue
+			}
 			if findInStringSlice(rawcol[j], cfg.nanValues) != -1 {
 				rawcol[j] = "NaN"
 			}
 		}
 		rawcols[i] = rawcol
 
-		t, ok := cfg.types[colname]
-		if !ok {
+		// try to auto detect the data type
+		if !useCustomType {
 			t = cfg.defaultType
 			if cfg.detectTypes {
 				if l, err := findType(rawcol); err == nil {

diff --git a/dataframe/dataframe_test.go b/dataframe/dataframe_test.go
@@ -1420,6 +1420,49 @@ Spain,2012-02-01,66,555.42,00241
 	}
 }
 
+// TestReadCSV_Issue169 is a regression test for issue #169: NA must not become NaN in columns explicitly typed as String.
+func TestReadCSV_Issue169(t *testing.T) {
+	// Load the data from a CSV string with the Region and Age columns
+	// explicitly typed as String via WithTypes, then verify that none of
+	// their records were rewritten to "NaN".
+	const ExampleData = `
+Country,Region,Date,Age,Amount,Id
+"United States",NA,2012-02-01,50,112.1,01234
+"United States",US,2012-02-01,32,321.31,54320
+"United Kingdom",GB,2012-02-01,17,18.2,12345
+"United States",NA,2012-02-01,32,321.31,54320
+"United States","NA",2012-02-01,17,321.31,54320
+"United Kingdom",GB,2012-02-01,NA,18.2,12345
+"United States",NA,2012-02-01,32,321.31,54320
+Spain,EU,2012-02-01,66,555.42,00241
+`
+
+	df := ReadCSV(
+		strings.NewReader(ExampleData),
+		WithTypes(map[string]series.Type{
+			"Region": series.String,
+			"Age":    series.String,
+		}),
+	)
+
+	if df.Err != nil {
+		t.Errorf("Expected success, got error: %v", df.Err)
+	}
+
+	for _, v := range df.Col("Region").Records() {
+		if v == "NaN" {
+			t.Errorf("Expected not to convert NA to NaN, but it does")
+		}
+	}
+
+	for _, v := range df.Col("Age").Records() {
+		if v == "NaN" {
+			t.Errorf("Expected not to convert NA to NaN, but it does")
+		}
+	}
+
+}
+
 func TestReadJSON(t *testing.T) {
 	table := []struct {
 		jsonStr string