Skip to content

Commit

Permalink
Fix Issue go-gota#169 - NA is still converted to NaN even in a string…
Browse files Browse the repository at this point in the history
… column

Copied from this PR:
go-gota#175
  • Loading branch information
shivamthapar committed Nov 20, 2024
1 parent f705409 commit 6483981
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 25 deletions.
55 changes: 30 additions & 25 deletions dataframe/dataframe.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,14 +305,14 @@ func (df DataFrame) Subset(indexes series.Indexes) DataFrame {

// SelectIndexes are the supported indexes used for the DataFrame.Select method. Currently supported are:
//
// int // Matches the given index number
// []int // Matches all given index numbers
// []bool // Matches all columns marked as true
// string // Matches the column with the matching column name
// []string // Matches all columns with the matching column names
// Series [Int] // Same as []int
// Series [Bool] // Same as []bool
// Series [String] // Same as []string
//	int              // Matches the given index number
//	[]int            // Matches all given index numbers
//	[]bool           // Matches all columns marked as true
//	string           // Matches the column with the matching column name
//	[]string         // Matches all columns with the matching column names
//	Series [Int]     // Same as []int
//	Series [Bool]    // Same as []bool
//	Series [String]  // Same as []string
type SelectIndexes interface{}

// Select the given DataFrame columns
Expand Down Expand Up @@ -382,7 +382,7 @@ func (df DataFrame) Drop(indexes SelectIndexes) DataFrame {

const KEY_ERROR = "KEY_ERROR"

//GroupBy Group dataframe by columns
// GroupBy Group dataframe by columns
func (df DataFrame) GroupBy(colnames ...string) *Groups {
if len(colnames) <= 0 {
return nil
Expand Down Expand Up @@ -434,7 +434,7 @@ func (df DataFrame) GroupBy(colnames ...string) *Groups {
return groups
}

//AggregationType Aggregation method type
// AggregationType identifies an aggregation method. Its string form is
// generated by the stringer tool (see the go:generate directive below).
type AggregationType int

//go:generate stringer -type=AggregationType -linecomment
Expand All @@ -448,7 +448,7 @@ const (
Aggregation_COUNT // COUNT
)

//Groups : structure generated by groupby
// Groups : structure generated by groupby
type Groups struct {
groups map[string]DataFrame
colnames []string
Expand Down Expand Up @@ -1039,23 +1039,23 @@ func WithComments(b rune) LoadOption {
//
// Examples:
//
// // field will be ignored
// field int
// // field will be ignored
// field int
//
// // Field will be ignored
// Field int `dataframe:"-"`
// // Field will be ignored
// Field int `dataframe:"-"`
//
// // Field will be parsed with column name Field and type int
// Field int
// // Field will be parsed with column name Field and type int
// Field int
//
// // Field will be parsed with column name `field_column` and type int.
// Field int `dataframe:"field_column"`
// // Field will be parsed with column name `field_column` and type int.
// Field int `dataframe:"field_column"`
//
// // Field will be parsed with column name `field` and type string.
// Field int `dataframe:"field,string"`
// // Field will be parsed with column name `field` and type string.
// Field int `dataframe:"field,string"`
//
// // Field will be parsed with column name `Field` and type string.
// Field int `dataframe:",string"`
// // Field will be parsed with column name `Field` and type string.
// Field int `dataframe:",string"`
//
// If the struct tags and the given LoadOptions contradict each other, the latter
// will have preference over the former.
Expand Down Expand Up @@ -1219,17 +1219,22 @@ func LoadRecords(records [][]string, options ...LoadOption) DataFrame {
types := make([]series.Type, len(headers))
rawcols := make([][]string, len(headers))
for i, colname := range headers {
t, useCustomType := cfg.types[colname]
rawcol := make([]string, len(records))
for j := 0; j < len(records); j++ {
rawcol[j] = records[j][i]
if useCustomType && t == series.String {
// skip the conversion when using custom string type
continue
}
if findInStringSlice(rawcol[j], cfg.nanValues) != -1 {
rawcol[j] = "NaN"
}
}
rawcols[i] = rawcol

t, ok := cfg.types[colname]
if !ok {
// try to auto detect the data type
if !useCustomType {
t = cfg.defaultType
if cfg.detectTypes {
if l, err := findType(rawcol); err == nil {
Expand Down
43 changes: 43 additions & 0 deletions dataframe/dataframe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1420,6 +1420,49 @@ Spain,2012-02-01,66,555.42,00241
}
}

// TestReadCSV_Issue169 is the regression test for issue #169: when a column
// is explicitly typed as series.String through WithTypes, cells that match
// the configured NaN markers (here "NA") must be kept verbatim instead of
// being rewritten to "NaN".
func TestReadCSV_Issue169(t *testing.T) {
	// Region carries "NA" both unquoted and quoted; Age carries a single
	// "NA" among otherwise numeric-looking values.
	const ExampleData = `
Country,Region,Date,Age,Amount,Id
"United States",NA,2012-02-01,50,112.1,01234
"United States",US,2012-02-01,32,321.31,54320
"United Kingdom",GB,2012-02-01,17,18.2,12345
"United States",NA,2012-02-01,32,321.31,54320
"United States","NA",2012-02-01,17,321.31,54320
"United Kingdom",GB,2012-02-01,NA,18.2,12345
"United States",NA,2012-02-01,32,321.31,54320
Spain,EU,2012-02-01,66,555.42,00241
`

	df := ReadCSV(
		strings.NewReader(ExampleData),
		WithTypes(map[string]series.Type{
			"Region": series.String,
			"Age":    series.String,
		}),
	)

	// Nothing below is meaningful if loading failed, so stop right away.
	if df.Err != nil {
		t.Fatalf("Expected success, got error: %v", df.Err)
	}

	// Number of literal "NA" cells present in ExampleData for each column
	// that was forced to the string type.
	wantNA := map[string]int{
		"Region": 4,
		"Age":    1,
	}

	for _, colname := range []string{"Region", "Age"} {
		gotNA := 0
		for row, v := range df.Col(colname).Records() {
			switch v {
			case "NaN":
				t.Errorf("column %q row %d: expected not to convert NA to NaN, but it does", colname, row)
			case "NA":
				gotNA++
			}
		}
		// Guard against the cells being dropped or blanked rather than
		// preserved: the original "NA" text must still be there.
		if gotNA != wantNA[colname] {
			t.Errorf("column %q: expected %d preserved NA values, got %d", colname, wantNA[colname], gotNA)
		}
	}
}

func TestReadJSON(t *testing.T) {
table := []struct {
jsonStr string
Expand Down

0 comments on commit 6483981

Please sign in to comment.