diff --git a/README.md b/README.md index 5f4ea5f..e7a900b 100644 --- a/README.md +++ b/README.md @@ -317,6 +317,25 @@ julia> A[1:5,:] 656.0 646.0 ``` +### Column Type Conversion + +Often, you want a column to be an integer, but the SAS7BDAT format stores numbers as Float64. Specifying the `column_types` argument does the conversion for you. + +``` +julia> rs = readsas("productsales.sas7bdat", column_types=Dict(:ACTUAL=>Int)) +Read productsales.sas7bdat with size 1440 x 10 in 0.08043 seconds +SASLib.ResultSet (1440 rows x 10 columns) +Columns 1:ACTUAL, 2:PREDICT, 3:COUNTRY, 4:REGION, 5:DIVISION, 6:PRODTYPE, 7:PRODUCT, 8:QUARTER, 9:YEAR, 10:MONTH +1: 925, 850.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-01-01 +2: 999, 297.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-02-01 +3: 608, 846.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-03-01 +4: 642, 533.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 2.0, 1993.0, 1993-04-01 +5: 656, 646.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 2.0, 1993.0, 1993-05-01 + +julia> typeof(rs[:ACTUAL]) +Array{Int64,1} +``` + ### File Metadata You may obtain meta data for a SAS data file using the `metadata` function. 
diff --git a/src/CIDict.jl b/src/CIDict.jl new file mode 100644 index 0000000..21a3355 --- /dev/null +++ b/src/CIDict.jl @@ -0,0 +1,53 @@ +# Case-insensitive Dict: a thin wrapper over Dict that lowercases Symbol/String keys on lookup and insertion + +struct CIDict{K, T} + + dct::Dict{K, T} + + # type checking: only string-like or Symbol keys can be lowercased + check(K) = K <: AbstractString || K <: Symbol || + throw(ArgumentError("Key must be Symbol or String type")) + + # constructors (the Dict form re-keys every entry through lcase) + CIDict{K, T}() where {K,T} = (check(K); new(Dict{K,T}())) + CIDict{K, T}(d::Dict{K,T}) where {K,T} = begin + check(K) + d2 = Dict{K,T}() + for k in keys(d) + d2[lcase(k)] = d[k] + end + new(d2) + end +end + +lcase(s::Symbol) = Symbol(lowercase(String(s))) +lcase(s::AbstractString) = lowercase(s) + +Base.getindex(d::CIDict, s::Symbol) = d.dct[lcase(s)] +Base.getindex(d::CIDict, s::AbstractString) = d.dct[lcase(s)] + +Base.setindex!(d::CIDict, v, s::Symbol) = d.dct[lcase(s)] = v +Base.setindex!(d::CIDict, v, s::AbstractString) = d.dct[lcase(s)] = v + +Base.haskey(d::CIDict, s::Symbol) = haskey(d.dct, lcase(s)) +Base.haskey(d::CIDict, s::AbstractString) = haskey(d.dct, lcase(s)) + +Base.keys(d::CIDict) = keys(d.dct) +Base.values(d::CIDict) = values(d.dct) + +Base.start(d::CIDict) = start(d.dct) +Base.next(d::CIDict, i::Int) = next(d.dct, i) +Base.done(d::CIDict, i::Int) = done(d.dct, i) + +Base.length(d::CIDict) = length(d.dct) + +issym(x) = x isa Symbol + +function Base.show(io::IO, d::SASLib.CIDict) + print(io, "CIDict(") + for (i, (k,v)) in enumerate(d.dct) + i > 1 && print(io, ", ") + print(io, issym(k) ? 
":" : "", k, " => ", v) + end + print(io, ")") +end diff --git a/src/SASLib.jl b/src/SASLib.jl index 87994c5..f9c1ce3 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -12,6 +12,7 @@ import Base: show, size include("constants.jl") include("utils.jl") include("ObjectPool.jl") +include("CIDict.jl") include("Types.jl") include("ResultSet.jl") include("Metadata.jl") @@ -32,6 +33,7 @@ function _open(config::ReaderConfig) handler.current_page = 0 _get_properties(handler) _parse_metadata(handler) + _post_metadata_handler(handler) return handler end @@ -43,6 +45,7 @@ open(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), + column_types::Dict = Dict{Symbol,Type}(), verbose_level::Int64 = 1) Open a SAS7BDAT data file. Returns a `SASLib.Handler` object that can be used in @@ -55,9 +58,11 @@ function open(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), + column_types::Dict = Dict{Symbol,Type}(), verbose_level::Int64 = 1) return _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates, - include_columns, exclude_columns, string_array_fn, number_array_fn, verbose_level)) + include_columns, exclude_columns, string_array_fn, number_array_fn, + column_types, verbose_level)) end """ @@ -97,6 +102,7 @@ readsas(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), + column_types::Dict = Dict{Symbol,Type}(), verbose_level::Int64 = 1) Read a SAS7BDAT file. @@ -135,6 +141,9 @@ For numeric columns, you may specify your own array constructors using the `number_array_fn` parameter. Perhaps you have a different kind of array to store the values e.g. SharedArray. +Specify the `column_types` argument if any column needs type conversion. It should +be a Dict mapping each column symbol to the desired data type. + For debugging purpose, `verbose_level` may be set to a value higher than 1. 
Verbose level 0 will output nothing to the console, essentially a total quiet option. @@ -146,11 +155,13 @@ function readsas(filename::AbstractString; exclude_columns::Vector = [], string_array_fn::Dict = Dict(), number_array_fn::Dict = Dict(), + column_types::Dict = Dict{Symbol,Type}(), verbose_level::Int64 = 1) handler = nothing try handler = _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates, - include_columns, exclude_columns, string_array_fn, number_array_fn, verbose_level)) + include_columns, exclude_columns, string_array_fn, number_array_fn, + column_types, verbose_level)) return read(handler) finally isdefined(handler, :string_decoder) && Base.close(handler.string_decoder) @@ -390,6 +401,20 @@ function _parse_metadata(handler) end end +# Run this after the metadata has been read but before any data is read +function _post_metadata_handler(handler) + + # save a copy of column types in a case insensitive dict + handler.column_types_dict = CIDict{Symbol,Type}(handler.config.column_types) + + # check column_types: warn about any key not found among the column symbols + for k in keys(handler.config.column_types) + if !case_insensitive_in(k, handler.column_symbols) + Compat.@warn("Unknown column symbol ($k) in column_types. Ignored.") + end + end +end + function _process_page_meta(handler) # println3(handler, "IN: _process_page_meta") _read_page_header(handler) @@ -1006,7 +1031,7 @@ function _chunk_to_dataframe(handler, nrows) rslt[name] = datetime_from_float(rslt[name]) end end - + convert_column_type_if_needed!(handler, rslt, name) elseif ty == column_type_string # println(" String: size=$(size(handler.string_chunk))") # println(" String: column $j, name $name, size=$(size(handler.string_chunk[js, :]))") @@ -1018,6 +1043,21 @@ function _chunk_to_dataframe(handler, nrows) return rslt end +# If the user specified a type for the column, try to convert the column data; on failure, warn and leave the column unchanged. 
+function convert_column_type_if_needed!(handler, rslt, name) + if haskey(handler.column_types_dict, name) + type_wanted = handler.column_types_dict[name] + # Float64 is skipped below; numeric columns presumably already hold Float64 values (TODO confirm) + if type_wanted != Float64 + try + rslt[name] = convert(Vector{type_wanted}, rslt[name]) + catch ex + Compat.@warn("Unable to convert column $name to type $type_wanted, error=$ex") + end + end + end +end + # Simple loop that reads data row-by-row. function read_data(handler, nrows) # println("IN: read_data, nrows=$nrows") diff --git a/src/Types.jl b/src/Types.jl index 1c3452f..5fc68ae 100644 --- a/src/Types.jl +++ b/src/Types.jl @@ -15,6 +15,7 @@ struct ReaderConfig exclude_columns::Vector string_array_fn::Dict{Symbol, Function} number_array_fn::Dict{Symbol, Function} + column_types::Dict{Symbol, Type} verbose_level::Int64 end @@ -108,6 +109,8 @@ mutable struct Handler string_decoder_buffer::IOBuffer string_decoder::StringDecoder + column_types_dict::CIDict{Symbol,Type} + Handler(config::ReaderConfig) = new( Base.open(config.filename), config) diff --git a/test/runtests.jl b/test/runtests.jl index dbcd0c2..f545658 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,6 +10,10 @@ readfile(dir, file; kwargs...) = readsas(getpath(dir, file); kwargs...) openfile(dir, file; kwargs...) = SASLib.open(getpath(dir, file), kwargs...) getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...) +# Struct used for column type conversion test case below +struct YearStr year::String end +Base.convert(::Type{YearStr}, v::Float64) = YearStr(string(round(Int, v))) + @testset "SASLib" begin @testset "object pool" begin @@ -48,6 +52,47 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...) 
@test_throws BoundsError z[1:300] = 1:300 end + @testset "case insensitive dict" begin + function testdict(lowercase_key, mixedcase_key, second_lowercase_key) + + T = typeof(lowercase_key) + d = SASLib.CIDict{T,Int}() + + # getindex/setindex! + d[lowercase_key] = 99 + @test d[lowercase_key] == 99 + @test d[mixedcase_key] == 99 + d[mixedcase_key] = 88 # should replace original value + @test length(d) == 1 # still 1 element + @test d[lowercase_key] == 88 + @test d[mixedcase_key] == 88 + + # haskey + @test haskey(d, lowercase_key) == true + @test haskey(d, mixedcase_key) == true + + # iteration (Dict order is unspecified, so compare as key => value pairs) + d[second_lowercase_key] = 77 + ks = T[] + vs = Int[] + for (k,v) in d + push!(ks, k) + push!(vs, v) + end + @test length(ks) == 2 + @test Dict(zip(ks, vs)) == Dict(lowercase_key => 88, second_lowercase_key => 77) + + # keys/values (order-independent) + @test Set(keys(d)) == Set([lowercase_key, second_lowercase_key]) + @test sort(collect(values(d))) == [77, 88] + + # show + @test show(d) === nothing + end + testdict(:abc, :ABC, :def) + testdict("abc", "ABC", "def") + end + @testset "open and close" begin handler = openfile("data_pandas", "test1.sas7bdat") @test typeof(handler) == SASLib.Handler @@ -170,7 +215,7 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...) @test rs[1,:ACTUAL] ≈ 200.0 # display related - @test typeof(show(rs)) == Void + @test show(rs) === nothing @test SASLib.sizestr(rs) == "1440 rows x 10 columns" end @@ -188,7 +233,7 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...) @test md.columnsinfo[1] == Pair(:Column1, Float64) md = getmetadata("data_pandas", "productsales.sas7bdat") - @test typeof(show(md)) == Void + @test show(md) === nothing println() # Deal with v0.6/v0.7 difference @@ -226,7 +271,7 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...) 
handler = openfile("data_AHS2013", "topical.sas7bdat") rs = SASLib.read(handler, 1000) @test size(rs) == (1000, 114) - @test typeof(show(handler)) == Void + @test show(handler) === nothing SASLib.close(handler) # @test result[:page_count] == 10 # @test result[:page_length] == 16384 @@ -301,6 +346,40 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...) end + # column type conversion + @testset "user specified column types" begin + + # normal use case + rs = readfile("data_pandas", "productsales.sas7bdat"; + verbose_level = 0, column_types = Dict(:YEAR => Int16, :QUARTER => Int8)) + @test eltype(rs[:YEAR]) == Int16 + @test eltype(rs[:QUARTER]) == Int8 + + # error handling - warn() when a column cannot be converted + rs = readfile("data_pandas", "productsales.sas7bdat"; + verbose_level = 0, column_types = Dict(:YEAR => Int8, :QUARTER => Int8)) + @test eltype(rs[:YEAR]) == Float64 + @test eltype(rs[:QUARTER]) == Int8 + #TODO expect warning for :YEAR conversion + + # case insensitive column symbol + rs = readfile("data_pandas", "productsales.sas7bdat"; + verbose_level = 0, column_types = Dict(:Quarter => Int8)) + @test eltype(rs[:QUARTER]) == Int8 + + # conversion to custom types + rs = readfile("data_pandas", "productsales.sas7bdat"; + verbose_level = 0, column_types = Dict(:Year => YearStr)) + @test eltype(rs[:YEAR]) == YearStr + + # test Union type + let T = Union{Int,Missing} + rs = readfile("data_pandas", "productsales.sas7bdat"; + verbose_level = 0, column_types = Dict(:Year => T)) + @test eltype(rs[:YEAR]) == T + end + end + # see output; keep this for coverage reason @testset "verbosity" begin rs = readfile("data_pandas", "test1.sas7bdat"; verbose_level = 2)