Skip to content

Commit

Permalink
ResultSet/Metadata changes initial commit for beta testing (re: #29, #30
Browse files Browse the repository at this point in the history
, #31)
  • Loading branch information
tk3369 committed Mar 7, 2018
1 parent f217006 commit 2b9e47c
Show file tree
Hide file tree
Showing 6 changed files with 358 additions and 231 deletions.
3 changes: 3 additions & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ julia 0.6
StringEncodings
Missings
Compat
IteratorInterfaceExtensions
TableTraits
TableTraitsUtils
32 changes: 32 additions & 0 deletions src/Metadata.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
export metadata

"""
Metadata is a lightweight, file-level summary of a parsed SAS data file,
constructed by `metadata(::Handler)`.
"""
struct Metadata
    filename::AbstractString
    encoding::AbstractString    # file character encoding, e.g. "ISO8859-1"
    endianness::Symbol          # :LittleEndian or :BigEndian
    compression::Symbol         # :RDC, :RLE, or :none (see `metadata`)
    pagesize::Int               # page length in bytes
    npages::Int                 # number of pages in the file
    nrows::Int
    ncols::Int
    # One (column symbol => element type) pair per column; the element
    # type is either Float64 or String (see `metadata`).
    columnsinfo::Vector{Pair{Symbol, DataType}}
end

"""
    metadata(h::Handler)

Build a `Metadata` summary from an open file `Handler`.
"""
function metadata(h::Handler)
    # Pair each column symbol with its Julia element type: decimal
    # columns become Float64, everything else String.
    colinfo = [h.column_symbols[k] =>
                   (h.column_types[k] == column_type_decimal ? Float64 : String)
               for k in 1:h.column_count]

    # Translate the low-level compression code into a symbol.
    if h.compression == compression_method_rle
        comp = :RLE
    elseif h.compression == compression_method_rdc
        comp = :RDC
    else
        comp = :none
    end

    Metadata(
        h.config.filename,
        h.file_encoding,
        h.file_endianness,
        comp,
        h.page_length,
        h.page_count,
        h.row_count,
        h.column_count,
        colinfo)
end
123 changes: 123 additions & 0 deletions src/ResultSet.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
using IteratorInterfaceExtensions, TableTraits, TableTraitsUtils

import Base.size

"""
ResultSet is the primary object that represents data returned from
reading a SAS data file. ResultSet implements the Base.Iteration
interface as well as the IterableTables.jl interface.
*Fields*
- `columns`: a vector of columns, each being a vector itself
- `names`: a vector of column symbols
- `size`: a tuple (nrows, ncols)
*Accessors*
- `columns(::ResultSet)`
- `names(::ResultSet)`
- `size(::ResultSet)`
- `size(::ResultSet, dim::Integer)`
*Single Row/Column Indexing*
- `rs[i]` returns a tuple for row `i`
- `rs[:c]` returns a vector for column with symbol `c`
*Multi Row/Column Indexing*
- `rs[i:j]` returns a view of ResultSet with rows between `i` and `j`
- `rs[c...]` returns a view of ResultSet with columns specified
*Cell Indexing*
- `rs[i,j]` returns a single value for row `i` column `j`
- `rs[i,c]` returns a single value for row `i` column symbol `c`
- Specific cell can be assigned with the above indexing methods
"""
struct ResultSet
columns::AbstractVector{AbstractVector}
names::AbstractVector{Symbol}
size::NTuple{2, Int}
end

# exports
export columns

# Accessor functions for the three ResultSet fields.
columns(rs::ResultSet) = rs.columns
Base.names(rs::ResultSet) = rs.names
Base.size(rs::ResultSet) = rs.size
Base.size(rs::ResultSet, i::Integer) = rs.size[i]

# Dimensions as a human-readable string, e.g. "10 rows x 3 columns".
sizestr(rs::ResultSet) = string(size(rs, 1), " rows x ", size(rs, 2), " columns")

# Map a column symbol to its positional index within `rs.names`;
# raises an error when the symbol is not present.
function symindex(rs::ResultSet, s::Symbol)
    for (idx, nm) in enumerate(rs.names)
        nm == s && return idx
    end
    error("column symbol not found: $s")
end

# Direct cell access by (row, column-index) or (row, column-symbol).
# Note: columns are stored column-major, so the column is indexed first.
Base.getindex(rs::ResultSet, i::Integer, j::Integer) = rs.columns[j][i]
Base.getindex(rs::ResultSet, i::Integer, s::Symbol) = rs.columns[symindex(rs, s)][i]
# Cell assignment, mirroring the two getindex forms above.
Base.setindex!(rs::ResultSet, val, i::Integer, j::Integer) = rs.columns[j][i] = val
Base.setindex!(rs::ResultSet, val, i::Integer, s::Symbol) = rs.columns[symindex(rs, s)][i] = val

# Return a single row as a tuple
Base.getindex(rs::ResultSet, i::Integer) = Tuple([c[i] for c in rs.columns])

# Return a single column as a vector (the stored vector itself, not a copy)
Base.getindex(rs::ResultSet, c::Symbol) = rs.columns[symindex(rs, c)]

# Index by row range => returns a new ResultSet whose columns are
# `view`s into the original columns (no data copied).
function Base.getindex(rs::ResultSet, r::UnitRange{Int})
    ResultSet([view(c, r) for c in rs.columns], rs.names, (length(r), size(rs, 2)))
end

# Index by column symbols => returns a new ResultSet with just those
# columns.  Columns keep their original left-to-right order regardless
# of the order of `ss`, and symbols not present are silently ignored.
function Base.getindex(rs::ResultSet, ss::Symbol...)
    keep = [idx for (idx, nm) in enumerate(rs.names) if nm in ss]
    ResultSet(rs.columns[keep], rs.names[keep], (size(rs, 1), length(keep)))
end

# Iteration protocol (Julia 0.6 start/done/next; the state is the row
# number).  Each iteration yields one row as a tuple via rs[i].
Base.start(rs::ResultSet) = 1
Base.done(rs::ResultSet, i::Int) = i > size(rs, 1)
Base.next(rs::ResultSet, i::Int) = (rs[i], i+1)

# Display a summary of the ResultSet: its dimensions, the (possibly
# truncated) column list, and up to the first 5 rows / 10 columns.
function Base.show(io::IO, rs::ResultSet)
    println(io, "ResultSet (", sizestr(rs), ")")
    max_rows = 5
    max_cols = 10
    n = min(size(rs, 1), max_rows)
    m = min(size(rs, 2), max_cols)
    print(io, "Columns ")
    for j in 1:m
        j > 1 && print(io, ", ")
        print(io, j, ":", rs.names[j])
    end
    # Mark a truncated column list (original printed an empty string here).
    m < length(rs.names) && print(io, " …")
    println(io)
    for i in 1:n
        print(io, i, ": ")
        for j in 1:m
            # BUG FIX: original called print(", ") without `io`, sending
            # the separator to stdout instead of the target stream.
            j > 1 && print(io, ", ")
            print(io, rs.columns[j][i])
        end
        println(io)
    end
    # Mark truncated rows (original printed only a blank line here).
    n < size(rs, 1) && println(io, "…")
end

# IterableTables.jl integration: declare ResultSet iterable/queryable.
IteratorInterfaceExtensions.isiterable(::ResultSet) = true

TableTraits.isiterabletable(::ResultSet) = true

# Return a row iterator over the stored columns; the concrete iterator
# type is produced by TableTraitsUtils.create_tableiterator.
function IteratorInterfaceExtensions.getiterator(rs::ResultSet)
    TableTraitsUtils.create_tableiterator(rs.columns, rs.names)
end
160 changes: 11 additions & 149 deletions src/SASLib.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,121 +11,9 @@ import Base.show
include("constants.jl")
include("utils.jl")
include("ObjectPool.jl")

# Exception carrying a message about a malformed or unexpected file format.
struct FileFormatError <: Exception
    message::AbstractString
end

# Exception carrying a message about invalid reader configuration.
struct ConfigError <: Exception
    message::AbstractString
end

# User-provided settings that control how a SAS file is read.
struct ReaderConfig
    filename::AbstractString
    encoding::AbstractString            # requested character encoding
    chunk_size::Int64                   # rows per chunk — TODO confirm semantics of 0
    convert_dates::Bool
    include_columns::Vector             # columns to read; NOTE(review): presumably
    exclude_columns::Vector             #   mutually exclusive with exclude — confirm
    string_array_fn::Dict{Symbol, Function}   # per-column string array constructors
    number_array_fn::Dict{Symbol, Function}   # per-column numeric array constructors
    verbose_level::Int64                # higher = chattier (see println1 / read_chunk)
end

# Per-column metadata parsed from the file.
struct Column
    id::Int64
    name::AbstractString
    label::Vector{UInt8}    # raw bytes; original author questioned this type ("really?")
    format::AbstractString  # SAS format string
    coltype::UInt8          # low-level column type code
    length::Int64           # field length in bytes
end

# Pointer to a subheader within a page.
# technically these fields may have lower precision (need casting?)
struct SubHeaderPointer
    offset::Int64       # byte offset of the subheader
    length::Int64       # subheader length in bytes
    compression::Int64  # compression code
    shtype::Int64       # subheader type code
end

# Mutable state for reading a single SAS7BDAT file.  The inner
# constructor initializes only `io` and `config`; every other field is
# populated incrementally as the header and pages are parsed.
mutable struct Handler
    io::IOStream
    config::ReaderConfig

    # Column-level metadata gathered while parsing headers.
    compression::UInt8
    column_names_strings::Vector{Vector{UInt8}}
    column_names::Vector{AbstractString}
    column_symbols::Vector{Symbol}
    column_types::Vector{UInt8}
    column_formats::Vector{AbstractString}
    columns::Vector{Column}

    # column indices being read/returned
    # tuple of column index, column symbol, column type
    column_indices::Vector{Tuple{Int64, Symbol, UInt8}}

    # Current read position within the file / page.
    current_page_data_subheader_pointers::Vector{SubHeaderPointer}
    cached_page::Vector{UInt8}
    column_data_lengths::Vector{Int64}
    column_data_offsets::Vector{Int64}
    current_row_in_file_index::Int64
    current_row_in_page_index::Int64

    # Endianness of the file vs. the running system; byte_swap is
    # presumably set when they differ — confirm where it is assigned.
    file_endianness::Symbol
    sys_endianness::Symbol
    byte_swap::Bool

    U64::Bool                           # NOTE(review): presumably 64-bit format flag — confirm
    int_length::Int8
    page_bit_offset::Int8
    subheader_pointer_length::UInt8

    # File-header descriptive fields.
    file_encoding::AbstractString
    platform::AbstractString
    name::Union{AbstractString,Vector{UInt8}}
    file_type::Union{AbstractString,Vector{UInt8}}

    date_created::DateTime
    date_modified::DateTime

    header_length::Int64
    page_length::Int64
    page_count::Int64
    sas_release::Union{AbstractString,Vector{UInt8}}
    server_type::Union{AbstractString,Vector{UInt8}}
    os_version::Union{AbstractString,Vector{UInt8}}
    os_name::Union{AbstractString,Vector{UInt8}}

    # Row/column geometry from the header subheaders.
    row_length::Int64
    row_count::Int64
    col_count_p1::Int64
    col_count_p2::Int64
    mix_page_row_count::Int64
    lcs::Int64
    lcp::Int64

    # State of the page currently being processed.
    current_page_type::Int64
    current_page_block_count::Int64 # number of records in current page
    current_page_subheaders_count::Int64
    column_count::Int64
    # creator_proc::Union{Void, Vector{UInt8}}

    # Per-chunk output buffers keyed by column symbol.
    byte_chunk::Dict{Symbol, Vector{UInt8}}
    string_chunk::Dict{Symbol, AbstractArray{String,1}}
    current_row_in_chunk_index::Int64

    current_page::Int64
    vendor::UInt8
    use_base_transcoder::Bool   # use Base string decoding instead of StringDecoder

    # Reusable decoder state for non-base transcoding.
    string_decoder_buffer::IOBuffer
    string_decoder::StringDecoder

    # Opens the file immediately; all remaining fields stay undefined
    # until parsing fills them in.
    Handler(config::ReaderConfig) = new(
        Base.open(config.filename),
        config)
end
include("Types.jl")
include("ResultSet.jl")
include("Metadata.jl")

function _open(config::ReaderConfig)
# println("Opening $(config.filename)")
Expand Down Expand Up @@ -181,7 +69,7 @@ function read(handler::Handler, nrows=0)
# println("Reading $(handler.config.filename)")
elapsed = @elapsed result = read_chunk(handler, nrows)
elapsed = round(elapsed, 5)
println1(handler, "Read $(handler.config.filename) with size $(result[:nrows]) x $(result[:ncols]) in $elapsed seconds")
println1(handler, "Read $(handler.config.filename) with size $(size(result, 1)) x $(size(result, 2)) in $elapsed seconds")
return result
end

Expand Down Expand Up @@ -986,42 +874,16 @@ function read_chunk(handler, nrows=0)
handler.current_row_in_chunk_index = 0

perf_read_data = @elapsed read_data(handler, nrows)

perf_chunk_to_data_frame = @elapsed rslt = _chunk_to_dataframe(handler, nrows)

# here column symbols contains only ones for columns that are actually read
if handler.config.verbose_level > 1
println("Read data in ", perf_read_data, " msec")
println("Converted data in ", perf_chunk_to_data_frame, " msec")
end

column_symbols = [sym for (k, sym, ty) in handler.column_indices]
column_names = String.(column_symbols)
column_types = [eltype(typeof(rslt[sym])) for (k, sym, ty) in handler.column_indices]
column_info = [(
k,
sym,
ty == column_type_string ? :String : :Number,
eltype(typeof(rslt[sym])),
typeof(rslt[sym])
) for (k, sym, ty) in handler.column_indices]

return Dict(
:data => rslt,
:nrows => nrows,
:ncols => length(column_symbols),
:filename => handler.config.filename,
:page_count => handler.current_page,
:page_length => Int64(handler.page_length),
:file_encoding => handler.file_encoding,
:file_endianness => handler.file_endianness,
:system_endianness => handler.sys_endianness,
:column_offsets => handler.column_data_offsets,
:column_lengths => handler.column_data_lengths,
:column_types => column_types,
:column_symbols => column_symbols,
:column_names => column_names,
:column_info => column_info,
:compression => compressionstring(handler),
:perf_read_data => perf_read_data,
:perf_type_conversion => perf_chunk_to_data_frame,
:process_id => myid()
)
return ResultSet([rslt[s] for s in column_symbols], column_symbols,
(nrows, length(column_symbols)))
end

# not extremely efficient but is a safe way to do it
Expand Down
Loading

0 comments on commit 2b9e47c

Please sign in to comment.