From 84d7fa22fd6e78086a413c91f5482fbf2eaae48b Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Sat, 31 Mar 2018 21:02:40 -0700 Subject: [PATCH 1/7] Issue #37 - ability to convert column to specific types given by the user --- src/CIDict.jl | 42 ++++++++++++++++++++++++++++++++++++++++++ src/SASLib.jl | 47 ++++++++++++++++++++++++++++++++++++++++++++--- src/Types.jl | 3 +++ test/runtests.jl | 31 +++++++++++++++++++++++++++++++ 4 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 src/CIDict.jl diff --git a/src/CIDict.jl b/src/CIDict.jl new file mode 100644 index 0000000..0a6fffb --- /dev/null +++ b/src/CIDict.jl @@ -0,0 +1,42 @@ +# Case insensitive Dict - a simple wrapper over Dict + +struct CIDict{K, T} + + dct::Dict{K, T} + + # type checking + check(K) = K <: AbstractString || K <: Symbol || + throw(ArgumentError("Key must be Symbol or String type")) + + # constructors + CIDict{K, T}() where {K,T} = (check(K); new(Dict{K,T}())) + CIDict{K, T}(d::Dict{K,T}) where {K,T} = begin + check(K) + d2 = Dict{K,T}() + for k in keys(d) + d2[lcase(k)] = d[k] + end + new(d2) + end +end + +lcase(s::Symbol) = Symbol(lowercase(String(s))) +lcase(s::AbstractString) = lowercase(s) + +Base.getindex(d::CIDict, s::Symbol) = d.dct[lcase(s)] +Base.getindex(d::CIDict, s::String) = d.dct[lcase(s)] + +Base.setindex!(d::CIDict, v, s::Symbol) = d.dct[lcase(s)] = v +Base.setindex!(d::CIDict, v, s::String) = d.dct[lcase(s)] = v + +Base.haskey(d::CIDict, s::Symbol) = haskey(d.dct, lcase(s)) +Base.haskey(d::CIDict, s::String) = haskey(d.dct, lcase(s)) + +Base.keys(d::CIDict) = keys(d.dct) +Base.values(d::CIDict) = values(d.dct) + +Base.start(d::CIDict) = start(d.dct) +Base.next(d::CIDict, i::Int) = next(d.dct, i) +Base.done(d::CIDict, i::Int) = done(d.dct, i) + +Base.show(io::IO, d::CIDict) = show(io, d.dct) diff --git a/src/SASLib.jl b/src/SASLib.jl index 87994c5..06a031f 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -12,6 +12,7 @@ import Base: show, size include("constants.jl") include("utils.jl") include("ObjectPool.jl") +include("CIDict.jl") include("Types.jl") include("ResultSet.jl") include("Metadata.jl") @@ -32,6 +33,7 @@ function _open(config::ReaderConfig) handler.current_page = 0 _get_properties(handler) _parse_metadata(handler) + _post_metadata_handler(handler) return handler end @@ -43,6 +45,7 @@ open(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), + column_types::Dict = Dict{Symbol,DataType}(), verbose_level::Int64 = 1) Open a SAS7BDAT data file. Returns a `SASLib.Handler` object that can be used in @@ -55,9 +58,11 @@ function open(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), + column_types::Dict = Dict{Symbol,DataType}(), verbose_level::Int64 = 1) return _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates, - include_columns, exclude_columns, string_array_fn, number_array_fn, verbose_level)) + include_columns, exclude_columns, string_array_fn, number_array_fn, + column_types, verbose_level)) end """ @@ -97,6 +102,7 @@ readsas(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), + column_types::Dict = Dict{Symbol,DataType}(), verbose_level::Int64 = 1) Read a SAS7BDAT file. @@ -135,6 +141,9 @@ For numeric columns, you may specify your own array constructors using the `number_array_fn` parameter. Perhaps you have a different kind of array to store the values e.g. SharedArray. +Specify `column_type` argument if any conversion is required. It should +be a Dict, mapping column symbol to a data type. + For debugging purpose, `verbose_level` may be set to a value higher than 1. Verbose level 0 will output nothing to the console, essentially a total quiet option. @@ -146,11 +155,13 @@ function readsas(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), + column_types::Dict = Dict{Symbol,DataType}(), verbose_level::Int64 = 1) handler = nothing try handler = _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates, - include_columns, exclude_columns, string_array_fn, number_array_fn, verbose_level)) + include_columns, exclude_columns, string_array_fn, number_array_fn, + column_types, verbose_level)) return read(handler) finally isdefined(handler, :string_decoder) && Base.close(handler.string_decoder) @@ -390,6 +401,20 @@ function _parse_metadata(handler) end end +# Do this after finish reading metadata but before reading data +function _post_metadata_handler(handler) + + # save a copy of column types in a case insensitive dict + handler.column_types_dict = CIDict{Symbol,DataType}(handler.config.column_types) + + # check column_types + for k in keys(handler.config.column_types) + if !case_insensitive_in(k, handler.column_symbols) + Compat.@warn("Unknown column symbol ($k) in column_types. Ignored.") + end + end +end + function _process_page_meta(handler) # println3(handler, "IN: _process_page_meta") _read_page_header(handler) @@ -1006,7 +1031,7 @@ function _chunk_to_dataframe(handler, nrows) rslt[name] = datetime_from_float(rslt[name]) end end - + convert_column_type_if_needed!(handler, rslt, name) elseif ty == column_type_string # println(" String: size=$(size(handler.string_chunk))") # println(" String: column $j, name $name, size=$(size(handler.string_chunk[js, :]))") @@ -1018,6 +1043,22 @@ function _chunk_to_dataframe(handler, nrows) return rslt end +# If the user specified a type for the column, try to convert the column data. +function convert_column_type_if_needed!(handler, rslt, name) + if haskey(handler.column_types_dict, name) + type_wanted = handler.column_types_dict[name] + #println("$name exists in config.column_types, type_wanted=$type_wanted") + if type_wanted != Float64 + try + converted_data = convert.(type_wanted, rslt[name]) + rslt[name] = converted_data + catch ex + Compat.@warn("Unable to convert column to type $type_wanted, error=$ex") + end + end + end +end + # Simple loop that reads data row-by-row. function read_data(handler, nrows) # println("IN: read_data, nrows=$nrows") diff --git a/src/Types.jl b/src/Types.jl index 1c3452f..0bf0abc 100644 --- a/src/Types.jl +++ b/src/Types.jl @@ -15,6 +15,7 @@ struct ReaderConfig exclude_columns::Vector string_array_fn::Dict{Symbol, Function} number_array_fn::Dict{Symbol, Function} + column_types::Dict{Symbol, DataType} verbose_level::Int64 end @@ -108,6 +109,8 @@ mutable struct Handler string_decoder_buffer::IOBuffer string_decoder::StringDecoder + column_types_dict::CIDict{Symbol,DataType} + Handler(config::ReaderConfig) = new( Base.open(config.filename), config) diff --git a/test/runtests.jl b/test/runtests.jl index dbcd0c2..62bbf87 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,6 +10,10 @@ readfile(dir, file; kwargs...) = readsas(getpath(dir, file); kwargs...) openfile(dir, file; kwargs...) = SASLib.open(getpath(dir, file), kwargs...) getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...) +# Struct used for column type conversion test case below +struct YearStr year::String end +Base.convert(::Type{YearStr}, v::Float64) = YearStr(string(round(Int, v))) + @testset "SASLib" begin @testset "object pool" begin @@ -301,6 +305,33 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...) end + # column type conversion + @testset "user specified column types" begin + + # normal use case + rs = readfile("data_pandas", "productsales.sas7bdat"; + verbose_level = 0, column_types = Dict(:YEAR => Int16, :QUARTER => Int8)) + @test eltype(rs[:YEAR]) == Int16 + @test eltype(rs[:QUARTER]) == Int8 + + # error handling - warn() when a column cannot be converted + rs = readfile("data_pandas", "productsales.sas7bdat"; + verbose_level = 0, column_types = Dict(:YEAR => Int8, :QUARTER => Int8)) + @test eltype(rs[:YEAR]) == Float64 + @test eltype(rs[:QUARTER]) == Int8 + #TODO expect warning for :YEAR conversion + + # case insensitive column symbol + rs = readfile("data_pandas", "productsales.sas7bdat"; + verbose_level = 0, column_types = Dict(:Quarter => Int8)) + @test eltype(rs[:QUARTER]) == Int8 + + # conversion to custom types + rs = readfile("data_pandas", "productsales.sas7bdat"; + verbose_level = 0, column_types = Dict(:Year => YearStr)) + @test eltype(rs[:YEAR]) == YearStr + end + # see output; keep this for coverage reason @testset "verbosity" begin rs = readfile("data_pandas", "test1.sas7bdat"; verbose_level = 2) From 74f0ddd56727e89453e46e0e943856839f1bea98 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Sat, 31 Mar 2018 21:35:04 -0700 Subject: [PATCH 2/7] CIDict - added length function and unit tests --- src/CIDict.jl | 2 ++ test/runtests.jl | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/src/CIDict.jl b/src/CIDict.jl index 0a6fffb..ade48d8 100644 --- a/src/CIDict.jl +++ b/src/CIDict.jl @@ -40,3 +40,5 @@ Base.next(d::CIDict, i::Int) = next(d.dct, i) Base.done(d::CIDict, i::Int) = done(d.dct, i) Base.show(io::IO, d::CIDict) = show(io, d.dct) + +Base.length(d::CIDict) = length(d.dct) diff --git a/test/runtests.jl b/test/runtests.jl index 62bbf87..cc75662 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -52,6 +52,47 @@ Base.convert(::Type{YearStr}, v::Float64) = YearStr(string(round(Int, v))) @test_throws BoundsError z[1:300] = 1:300 end + @testset "case insensitive dict" begin + function testdict(lowercase_key, mixedcase_key, second_lowercase_key) + + T = typeof(lowercase_key) + d = CIDict{T,Int}() + + # getindex/setindex! + d[lowercase_key] = 99 + @test d[lowercase_key] == 99 + @test d[mixedcase_key] == 99 + d[mixedcase_key] = 88 # should replace original value + @test length(d) == 1 # still 1 element + @test d[lowercase_key] == 88 + @test d[mixedcase_key] == 88 + + # haskey + @test haskey(d, lowercase_key) == true + @test haskey(d, mixedcase_key) == true + + # iteration + d[second_lowercase_key] = 77 + ks = T[] + vs = Int[] + for (k,v) in d + push!(ks, k) + push!(vs, v) + end + @test ks == [lowercase_key, second_lowercase_key] + @test vs == [88, 77] + + # keys/values + @test collect(keys(d)) == [lowercase_key, second_lowercase_key] + @test collect(values(d)) == [88, 77] + + # show + @test show(d) == nothing + end + testdict(:abc, :ABC, :def) + testdict("abc", "ABC", "def") + end + @testset "open and close" begin handler = openfile("data_pandas", "test1.sas7bdat") @test typeof(handler) == SASLib.Handler @@ -174,7 +215,7 @@ Base.convert(::Type{YearStr}, v::Float64) = YearStr(string(round(Int, v))) @test rs[1,:ACTUAL] ≈ 200.0 # display related - @test typeof(show(rs)) == Void + @test show(rs) == nothing @test SASLib.sizestr(rs) == "1440 rows x 10 columns" end @@ -192,7 +233,7 @@ Base.convert(::Type{YearStr}, v::Float64) = YearStr(string(round(Int, v))) @test md.columnsinfo[1] == Pair(:Column1, Float64) md = getmetadata("data_pandas", "productsales.sas7bdat") - @test typeof(show(md)) == Void + @test show(md) == nothing println() # Deal with v0.6/v0.7 difference @@ -230,7 +271,7 @@ Base.convert(::Type{YearStr}, v::Float64) = YearStr(string(round(Int, v))) handler = openfile("data_AHS2013", "topical.sas7bdat") rs = SASLib.read(handler, 1000) @test size(rs) == (1000, 114) - @test typeof(show(handler)) == Void + @test show(handler) == nothing SASLib.close(handler) # @test result[:page_count] == 10 # @test result[:page_length] == 16384 From 2554f43f84c0699b35333887f655b90a67a74958 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Sat, 31 Mar 2018 21:42:04 -0700 Subject: [PATCH 3/7] fixed CIDict test --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index cc75662..71aa595 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -56,7 +56,7 @@ Base.convert(::Type{YearStr}, v::Float64) = YearStr(string(round(Int, v))) function testdict(lowercase_key, mixedcase_key, second_lowercase_key) T = typeof(lowercase_key) - d = CIDict{T,Int}() + d = SASLib.CIDict{T,Int}() # getindex/setindex! d[lowercase_key] = 99 From ef496a8a5366491da96c3b2f6968b7f59eea747f Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Sat, 7 Apr 2018 12:41:54 -0700 Subject: [PATCH 4/7] added column type conversion to user guide --- README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/README.md b/README.md index 5f4ea5f..e7a900b 100644 --- a/README.md +++ b/README.md @@ -317,6 +317,25 @@ julia> A[1:5,:] 656.0 646.0 ``` +### Column Type Conversion + +Often, you want a column to be an integer but the SAS7BDAT stores everything as Float64. Specifying the `column_type` argument does the conversion for you. + +``` +julia> rs = readsas("productsales.sas7bdat", column_types=Dict(:ACTUAL=>Int)) +Read productsales.sas7bdat with size 1440 x 10 in 0.08043 seconds +SASLib.ResultSet (1440 rows x 10 columns) +Columns 1:ACTUAL, 2:PREDICT, 3:COUNTRY, 4:REGION, 5:DIVISION, 6:PRODTYPE, 7:PRODUCT, 8:QUARTER, 9:YEAR, 10:MONTH +1: 925, 850.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-01-01 +2: 999, 297.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-02-01 +3: 608, 846.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-03-01 +4: 642, 533.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 2.0, 1993.0, 1993-04-01 +5: 656, 646.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 2.0, 1993.0, 1993-05-01 + +julia> typeof(rs[:ACTUAL]) +Array{Int64,1} +``` + ### File Metadata You may obtain meta data for a SAS data file using the `metadata` function. From 0017d04015e550a3e8350d908ae5a6c12c4775b5 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Sat, 7 Apr 2018 12:42:13 -0700 Subject: [PATCH 5/7] updated show function --- src/CIDict.jl | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/CIDict.jl b/src/CIDict.jl index ade48d8..21a3355 100644 --- a/src/CIDict.jl +++ b/src/CIDict.jl @@ -39,6 +39,15 @@ Base.start(d::CIDict) = start(d.dct) Base.next(d::CIDict, i::Int) = next(d.dct, i) Base.done(d::CIDict, i::Int) = done(d.dct, i) -Base.show(io::IO, d::CIDict) = show(io, d.dct) - Base.length(d::CIDict) = length(d.dct) + +issym(x) = typeof(x) == Symbol + +function Base.show(io::IO, d::SASLib.CIDict) + print(io, "CIDict(") + for (i, (k,v)) in enumerate(d.dct) + i > 1 && print(io, ", ") + print(io, issym(k) ? ":" : "", k, " => ", v) + end + print(io, ")") +end From ed1618586b0323c07e575661de5cf42c44a0d653 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Tue, 10 Apr 2018 11:20:02 -0700 Subject: [PATCH 6/7] support Union column type --- src/SASLib.jl | 13 ++++++------- src/Types.jl | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 06a031f..f9c1ce3 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -45,7 +45,7 @@ open(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), - column_types::Dict = Dict{Symbol,DataType}(), + column_types::Dict = Dict{Symbol,Type}(), verbose_level::Int64 = 1) Open a SAS7BDAT data file. Returns a `SASLib.Handler` object that can be used in @@ -58,7 +58,7 @@ function open(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), - column_types::Dict = Dict{Symbol,DataType}(), + column_types::Dict = Dict{Symbol,Type}(), verbose_level::Int64 = 1) return _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates, include_columns, exclude_columns, string_array_fn, number_array_fn, @@ -102,7 +102,7 @@ readsas(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), - column_types::Dict = Dict{Symbol,DataType}(), + column_types::Dict = Dict{Symbol,Type}(), verbose_level::Int64 = 1) Read a SAS7BDAT file. @@ -155,7 +155,7 @@ function readsas(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), - column_types::Dict = Dict{Symbol,DataType}(), + column_types::Dict = Dict{Symbol,Type}(), verbose_level::Int64 = 1) handler = nothing try @@ -405,7 +405,7 @@ end function _post_metadata_handler(handler) # save a copy of column types in a case insensitive dict - handler.column_types_dict = CIDict{Symbol,DataType}(handler.config.column_types) + handler.column_types_dict = CIDict{Symbol,Type}(handler.config.column_types) # check column_types for k in keys(handler.config.column_types) @@ -1050,8 +1050,7 @@ function convert_column_type_if_needed!(handler, rslt, name) #println("$name exists in config.column_types, type_wanted=$type_wanted") if type_wanted != Float64 try - converted_data = convert.(type_wanted, rslt[name]) - rslt[name] = converted_data + rslt[name] = convert(Vector{type_wanted}, rslt[name]) catch ex Compat.@warn("Unable to convert column to type $type_wanted, error=$ex") end diff --git a/src/Types.jl b/src/Types.jl index 0bf0abc..5fc68ae 100644 --- a/src/Types.jl +++ b/src/Types.jl @@ -15,7 +15,7 @@ struct ReaderConfig exclude_columns::Vector string_array_fn::Dict{Symbol, Function} number_array_fn::Dict{Symbol, Function} - column_types::Dict{Symbol, DataType} + column_types::Dict{Symbol, Type} verbose_level::Int64 end @@ -109,7 +109,7 @@ mutable struct Handler string_decoder_buffer::IOBuffer string_decoder::StringDecoder - column_types_dict::CIDict{Symbol,DataType} + column_types_dict::CIDict{Symbol,Type} Handler(config::ReaderConfig) = new( Base.open(config.filename), From 563b321853bf185bc00d5b77aa598981b4d5f633 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Tue, 10 Apr 2018 22:42:33 -0700 Subject: [PATCH 7/7] added union type test --- test/runtests.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 71aa595..f545658 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -371,6 +371,13 @@ Base.convert(::Type{YearStr}, v::Float64) = YearStr(string(round(Int, v))) rs = readfile("data_pandas", "productsales.sas7bdat"; verbose_level = 0, column_types = Dict(:Year => YearStr)) @test eltype(rs[:YEAR]) == YearStr + + # test Union type + let T = Union{Int,Missing} + rs = readfile("data_pandas", "productsales.sas7bdat"; + verbose_level = 0, column_types = Dict(:Year => T)) + @test eltype(rs[:YEAR]) == T + end end # see output; keep this for coverage reason