Skip to content

Commit

Permalink
Merge pull request #42 from tk3369/tk/user-specified-column-type
Browse files Browse the repository at this point in the history
Ability to convert column to specific types given by the user
  • Loading branch information
tk3369 authored Apr 25, 2018
2 parents 4c4f11d + 563b321 commit 57bfdcc
Show file tree
Hide file tree
Showing 5 changed files with 200 additions and 6 deletions.
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,25 @@ julia> A[1:5,:]
656.0 646.0
```

### Column Type Conversion

Often, you want a column to be an integer, but the SAS7BDAT format stores numeric values as Float64. Specifying the `column_types` argument does the conversion for you.

```
julia> rs = readsas("productsales.sas7bdat", column_types=Dict(:ACTUAL=>Int))
Read productsales.sas7bdat with size 1440 x 10 in 0.08043 seconds
SASLib.ResultSet (1440 rows x 10 columns)
Columns 1:ACTUAL, 2:PREDICT, 3:COUNTRY, 4:REGION, 5:DIVISION, 6:PRODTYPE, 7:PRODUCT, 8:QUARTER, 9:YEAR, 10:MONTH
1: 925, 850.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-01-01
2: 999, 297.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-02-01
3: 608, 846.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-03-01
4: 642, 533.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 2.0, 1993.0, 1993-04-01
5: 656, 646.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 2.0, 1993.0, 1993-05-01
julia> typeof(rs[:ACTUAL])
Array{Int64,1}
```

### File Metadata

You may obtain metadata for a SAS data file using the `metadata` function.
Expand Down
53 changes: 53 additions & 0 deletions src/CIDict.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Case insensitive Dict - a simple wrapper over Dict

struct CIDict{K, T}

    # wrapped Dict; the constructors and setindex! store keys lower-cased
    dct::Dict{K, T}

    # only string and Symbol key types are accepted
    check(K) = K <: AbstractString || K <: Symbol ||
        throw(ArgumentError("Key must be Symbol or String type"))

    # create an empty case insensitive dict
    function CIDict{K, T}() where {K,T}
        check(K)
        return new(Dict{K,T}())
    end

    # copy an existing Dict, lower-casing every key on the way in
    function CIDict{K, T}(d::Dict{K,T}) where {K,T}
        check(K)
        lowered = Dict{K,T}()
        for (k, v) in d
            lowered[lcase(k)] = v
        end
        return new(lowered)
    end
end

# Lower-case a key: a Symbol yields a Symbol, a string yields a string.
function lcase(s::Symbol)
    return Symbol(lowercase(string(s)))
end
function lcase(s::AbstractString)
    return lowercase(s)
end

# Look up the value stored under key `s`, ignoring the case of `s`.
# Throws whatever the wrapped Dict throws for a missing key.
# String keys are accepted as any AbstractString (e.g. a SubString), in line
# with `lcase` and with the key type check done by the CIDict constructors.
Base.getindex(d::CIDict, s::Symbol) = d.dct[lcase(s)]
Base.getindex(d::CIDict, s::AbstractString) = d.dct[lcase(s)]

# Store `v` under the lower-cased form of key `s`, so keys that differ only
# by case share a single entry.
# String keys are accepted as any AbstractString (e.g. a SubString), in line
# with `lcase` and with the key type check done by the CIDict constructors.
Base.setindex!(d::CIDict, v, s::Symbol) = d.dct[lcase(s)] = v
Base.setindex!(d::CIDict, v, s::AbstractString) = d.dct[lcase(s)] = v

# Return whether key `s` is present, ignoring the case of `s`.
# String keys are accepted as any AbstractString (e.g. a SubString), in line
# with `lcase` and with the key type check done by the CIDict constructors.
Base.haskey(d::CIDict, s::Symbol) = haskey(d.dct, lcase(s))
Base.haskey(d::CIDict, s::AbstractString) = haskey(d.dct, lcase(s))

# Expose the keys and the values of the wrapped Dict.
function Base.keys(d::CIDict)
    return keys(d.dct)
end
function Base.values(d::CIDict)
    return values(d.dct)
end

# Iteration (start/next/done protocol) is delegated to the wrapped Dict,
# so iterating a CIDict yields the same items as iterating `d.dct`.
function Base.start(d::CIDict)
    return start(d.dct)
end
function Base.next(d::CIDict, i::Int)
    return next(d.dct, i)
end
function Base.done(d::CIDict, i::Int)
    return done(d.dct, i)
end

# Number of entries held by the wrapped Dict.
function Base.length(d::CIDict)
    return length(d.dct)
end

# true when `x` is a Symbol, false for anything else
issym(x) = isa(x, Symbol)

# Print the dict as `CIDict(k1 => v1, k2 => v2)`; Symbol keys are shown with
# a leading colon, other keys are printed as-is.
function Base.show(io::IO, d::SASLib.CIDict)
    print(io, "CIDict(")
    needsep = false
    for (k, v) in d.dct
        # separator goes before every entry except the first
        needsep && print(io, ", ")
        print(io, issym(k) ? ":" : "", k, " => ", v)
        needsep = true
    end
    print(io, ")")
end
46 changes: 43 additions & 3 deletions src/SASLib.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import Base: show, size
include("constants.jl")
include("utils.jl")
include("ObjectPool.jl")
include("CIDict.jl")
include("Types.jl")
include("ResultSet.jl")
include("Metadata.jl")
Expand All @@ -32,6 +33,7 @@ function _open(config::ReaderConfig)
handler.current_page = 0
_get_properties(handler)
_parse_metadata(handler)
_post_metadata_handler(handler)
return handler
end

Expand All @@ -43,6 +45,7 @@ open(filename::AbstractString;
exclude_columns::Vector = [],
string_array_fn::Dict = Dict(),
number_array_fn::Dict = Dict(),
column_types::Dict = Dict{Symbol,Type}(),
verbose_level::Int64 = 1)
Open a SAS7BDAT data file. Returns a `SASLib.Handler` object that can be used in
Expand All @@ -55,9 +58,11 @@ function open(filename::AbstractString;
exclude_columns::Vector = [],
string_array_fn::Dict = Dict(),
number_array_fn::Dict = Dict(),
column_types::Dict = Dict{Symbol,Type}(),
verbose_level::Int64 = 1)
return _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates,
include_columns, exclude_columns, string_array_fn, number_array_fn, verbose_level))
include_columns, exclude_columns, string_array_fn, number_array_fn,
column_types, verbose_level))
end

"""
Expand Down Expand Up @@ -97,6 +102,7 @@ readsas(filename::AbstractString;
exclude_columns::Vector = [],
string_array_fn::Dict = Dict(),
number_array_fn::Dict = Dict(),
column_types::Dict = Dict{Symbol,Type}(),
verbose_level::Int64 = 1)
Read a SAS7BDAT file.
Expand Down Expand Up @@ -135,6 +141,9 @@ For numeric columns, you may specify your own array constructors using
the `number_array_fn` parameter. Perhaps you have a different kind of
array to store the values e.g. SharedArray.
Specify the `column_types` argument if any conversion is required. It should
be a Dict that maps column symbols to the desired data types.
For debugging purpose, `verbose_level` may be set to a value higher than 1.
Verbose level 0 will output nothing to the console, essentially a total quiet
option.
Expand All @@ -146,11 +155,13 @@ function readsas(filename::AbstractString;
exclude_columns::Vector = [],
string_array_fn::Dict = Dict(),
number_array_fn::Dict = Dict(),
column_types::Dict = Dict{Symbol,Type}(),
verbose_level::Int64 = 1)
handler = nothing
try
handler = _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates,
include_columns, exclude_columns, string_array_fn, number_array_fn, verbose_level))
include_columns, exclude_columns, string_array_fn, number_array_fn,
column_types, verbose_level))
return read(handler)
finally
isdefined(handler, :string_decoder) && Base.close(handler.string_decoder)
Expand Down Expand Up @@ -390,6 +401,20 @@ function _parse_metadata(handler)
end
end

# Runs once the metadata has been read and before any data is read.
function _post_metadata_handler(handler)

    requested = handler.config.column_types

    # keep the user-specified column types in a case insensitive dict
    handler.column_types_dict = CIDict{Symbol,Type}(requested)

    # warn about every requested column that is not found (ignoring case)
    # among handler.column_symbols; such entries have no effect
    for k in keys(requested)
        case_insensitive_in(k, handler.column_symbols) && continue
        Compat.@warn("Unknown column symbol ($k) in column_types. Ignored.")
    end
end

function _process_page_meta(handler)
# println3(handler, "IN: _process_page_meta")
_read_page_header(handler)
Expand Down Expand Up @@ -1006,7 +1031,7 @@ function _chunk_to_dataframe(handler, nrows)
rslt[name] = datetime_from_float(rslt[name])
end
end

convert_column_type_if_needed!(handler, rslt, name)
elseif ty == column_type_string
# println(" String: size=$(size(handler.string_chunk))")
# println(" String: column $j, name $name, size=$(size(handler.string_chunk[js, :]))")
Expand All @@ -1018,6 +1043,21 @@ function _chunk_to_dataframe(handler, nrows)
return rslt
end

# If the user specified a type for the column, try to convert the column data.
#
# `handler.column_types_dict` is consulted for `name`. When a type other than
# Float64 is requested, `rslt[name]` is replaced by the converted vector.
# A failed conversion is not fatal: a warning that names the column is logged
# and `rslt[name]` is left as it was.
function convert_column_type_if_needed!(handler, rslt, name)
    haskey(handler.column_types_dict, name) || return
    type_wanted = handler.column_types_dict[name]

    # Float64 requests are skipped -- presumably because numeric columns are
    # already Float64 at this point (TODO confirm against the caller)
    type_wanted != Float64 || return

    try
        rslt[name] = convert(Vector{type_wanted}, rslt[name])
    catch ex
        # include the column name so the failing entry can be identified when
        # several columns were requested
        Compat.@warn("Unable to convert column $name to type $type_wanted, error=$ex")
    end
end

# Simple loop that reads data row-by-row.
function read_data(handler, nrows)
# println("IN: read_data, nrows=$nrows")
Expand Down
3 changes: 3 additions & 0 deletions src/Types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ struct ReaderConfig
exclude_columns::Vector
string_array_fn::Dict{Symbol, Function}
number_array_fn::Dict{Symbol, Function}
column_types::Dict{Symbol, Type}
verbose_level::Int64
end

Expand Down Expand Up @@ -108,6 +109,8 @@ mutable struct Handler
string_decoder_buffer::IOBuffer
string_decoder::StringDecoder

column_types_dict::CIDict{Symbol,Type}

Handler(config::ReaderConfig) = new(
Base.open(config.filename),
config)
Expand Down
85 changes: 82 additions & 3 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ readfile(dir, file; kwargs...) = readsas(getpath(dir, file); kwargs...)
# Open a test data file, forwarding keyword arguments to `SASLib.open`.
# The `;` before `kwargs...` is required: with a `,` the keyword pairs would
# be splatted as positional arguments instead of being passed as keywords.
openfile(dir, file; kwargs...) = SASLib.open(getpath(dir, file); kwargs...)
# Fetch metadata for a test data file, forwarding keyword arguments to
# `metadata`. The `;` before `kwargs...` is required: with a `,` the keyword
# pairs would be splatted as positional arguments instead of keywords.
getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file); kwargs...)

# Custom wrapper type used by the column type conversion test case below:
# a Float64 value is rounded to an integer and stored as its string form.
struct YearStr
    year::String
end
function Base.convert(::Type{YearStr}, v::Float64)
    return YearStr(string(round(Int, v)))
end

@testset "SASLib" begin

@testset "object pool" begin
Expand Down Expand Up @@ -48,6 +52,47 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...)
@test_throws BoundsError z[1:300] = 1:300
end

@testset "case insensitive dict" begin
    # Exercise CIDict for one key type. `lowercase_key` and `mixedcase_key`
    # differ only by case; `second_lowercase_key` is a distinct key.
    function testdict(lowercase_key, mixedcase_key, second_lowercase_key)

        T = typeof(lowercase_key)
        d = SASLib.CIDict{T,Int}()

        # getindex/setindex!
        d[lowercase_key] = 99
        @test d[lowercase_key] == 99
        @test d[mixedcase_key] == 99
        d[mixedcase_key] = 88       # should replace original value
        @test length(d) == 1        # still 1 element
        @test d[lowercase_key] == 88
        @test d[mixedcase_key] == 88

        # haskey
        @test haskey(d, lowercase_key) == true
        @test haskey(d, mixedcase_key) == true

        # iteration -- Dict iteration order is unspecified, so the collected
        # pairs are compared without assuming any particular order
        d[second_lowercase_key] = 77
        expected = Dict(lowercase_key => 88, second_lowercase_key => 77)
        ks = T[]
        vs = Int[]
        for (k,v) in d
            push!(ks, k)
            push!(vs, v)
        end
        @test length(ks) == 2
        @test Dict(zip(ks, vs)) == expected

        # keys/values (order independent as well)
        @test Set(keys(d)) == Set(keys(expected))
        @test sort(collect(values(d))) == [77, 88]
        # keys and values must line up with each other
        @test Dict(zip(keys(d), values(d))) == expected

        # show
        @test show(d) === nothing
    end
    testdict(:abc, :ABC, :def)
    testdict("abc", "ABC", "def")
end

@testset "open and close" begin
handler = openfile("data_pandas", "test1.sas7bdat")
@test typeof(handler) == SASLib.Handler
Expand Down Expand Up @@ -170,7 +215,7 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...)
@test rs[1,:ACTUAL] 200.0

# display related
@test typeof(show(rs)) == Void
@test show(rs) == nothing
@test SASLib.sizestr(rs) == "1440 rows x 10 columns"
end

Expand All @@ -188,7 +233,7 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...)
@test md.columnsinfo[1] == Pair(:Column1, Float64)

md = getmetadata("data_pandas", "productsales.sas7bdat")
@test typeof(show(md)) == Void
@test show(md) == nothing
println()

# Deal with v0.6/v0.7 difference
Expand Down Expand Up @@ -226,7 +271,7 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...)
handler = openfile("data_AHS2013", "topical.sas7bdat")
rs = SASLib.read(handler, 1000)
@test size(rs) == (1000, 114)
@test typeof(show(handler)) == Void
@test show(handler) == nothing
SASLib.close(handler)
# @test result[:page_count] == 10
# @test result[:page_length] == 16384
Expand Down Expand Up @@ -301,6 +346,40 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...)

end

# column type conversion
@testset "user specified column types" begin

    # read the product sales sample quietly with the given column type mapping
    readsales(types) = readfile("data_pandas", "productsales.sas7bdat";
        verbose_level = 0, column_types = types)

    # normal use case
    rs = readsales(Dict(:YEAR => Int16, :QUARTER => Int8))
    @test eltype(rs[:YEAR]) == Int16
    @test eltype(rs[:QUARTER]) == Int8

    # error handling - warn() when a column cannot be converted;
    # the column that fails to convert is expected to stay Float64
    rs = readsales(Dict(:YEAR => Int8, :QUARTER => Int8))
    @test eltype(rs[:YEAR]) == Float64
    @test eltype(rs[:QUARTER]) == Int8
    #TODO expect warning for :YEAR conversion

    # case insensitive column symbol
    rs = readsales(Dict(:Quarter => Int8))
    @test eltype(rs[:QUARTER]) == Int8

    # conversion to custom types
    rs = readsales(Dict(:Year => YearStr))
    @test eltype(rs[:YEAR]) == YearStr

    # test Union type
    let T = Union{Int,Missing}
        rs = readsales(Dict(:Year => T))
        @test eltype(rs[:YEAR]) == T
    end
end

# see output; keep this for coverage reason
@testset "verbosity" begin
rs = readfile("data_pandas", "test1.sas7bdat"; verbose_level = 2)
Expand Down

0 comments on commit 57bfdcc

Please sign in to comment.