Skip to content

Commit

Permalink
Add EOL array to csv parser to convey end of line bit
Browse files Browse the repository at this point in the history
  • Loading branch information
pityka committed Feb 9, 2024
1 parent 9f5b25b commit f95c548
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,12 @@ class CsvCheck extends Specification with ScalaCheck {

"csv99" in {
val data =
s"""-------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf--------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf---------,-,----,-,--,------,$crlf---------,-,----,-,-,,"""
s"""-------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf--------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf---------,-,----,-,--,-----A,$crlf?--------,-,----,-,-,,"""

println(data.toCharArray().map(_.toInt).toList.grouped(20).toList.mkString("\n"))
val src = ByteChannel(data)

println(CsvParser
.parseFromChannel[String](src, bufferSize = 20))
// frame.colAt(0) must_== Series("a", "", "")
CsvParser
.parseFromChannel[String](src, bufferSize = 20).toOption.get._1.numRows must_== 7
1 must_== 1
}

Expand Down
5 changes: 3 additions & 2 deletions saddle-core/src/main/scala/org/saddle/csv/CsvParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,8 @@ object CsvParser {
s: Array[Char],
from: Array[Int],
to: Array[Int],
len: Int
len: Int,
eol: Array[Int]
): org.saddle.io.csv.Control = {
var i = 0

Expand Down Expand Up @@ -344,7 +345,7 @@ object CsvParser {
}
}

if (toi < 0) {
if (toi < 0 || eol(i) < 0) {
if (line == 0 && !emptyLoc && headerLocFields != locs.length) {
error = true
errorString =
Expand Down
47 changes: 29 additions & 18 deletions saddle-io/src/main/scala/org/saddle/io/csv/DataBuffer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1028,7 +1028,8 @@ private[csv] object DataBuffer {
recordSeparatorMask: BitSet,
quoteMask: BitSet,
from: Array[Int],
to: Array[Int]
to: Array[Int],
eol: Array[Int]
): Int = {
var i = 0
from(i) = 0
Expand All @@ -1052,6 +1053,7 @@ private[csv] object DataBuffer {
}
}
to(i) = (next + offset) * p
eol(i) = p

i += 1
from(i) = next + 1
Expand All @@ -1068,7 +1070,8 @@ private[csv] object DataBuffer {
recordSeparatorMask: BitSet,
quoteMask: BitSet,
from: Array[Int],
to: Array[Int]
to: Array[Int],
eol: Array[Int]
): Int = {
var i = 0
from(i) = 0
Expand All @@ -1091,6 +1094,7 @@ private[csv] object DataBuffer {
}
}
to(i) = (next + offset) * p
eol(i) = p

i += 1
from(i) = next + 1
Expand Down Expand Up @@ -1121,8 +1125,8 @@ private[csv] sealed trait DataBuffer {
* arrays. Elements present in the index array after this length
* should not be read.
*/
def nextBatch: (Array[Char], Array[Int], Array[Int], Int)
def emitRest: (Array[Char], Array[Int], Array[Int], Int)
def nextBatch: (Array[Char], Array[Int], Array[Int], Int, Array[Int])
def emitRest: (Array[Char], Array[Int], Array[Int], Int, Array[Int])
}

private[csv] final class DataBuffer1(
Expand All @@ -1141,6 +1145,7 @@ private[csv] final class DataBuffer1(

private val outputFrom = Array.ofDim[Int](bufferSize + 2)
private val outputTo = Array.ofDim[Int](bufferSize + 2)
private val outputEol = Array.ofDim[Int](bufferSize + 2)
private val quoteMask = BitSet.allocate(bufferSize)
private val lfMask = BitSet.allocate(bufferSize)
private val fieldSeparatorMask = BitSet.allocate(bufferSize)
Expand All @@ -1159,29 +1164,31 @@ private[csv] final class DataBuffer1(

final def nextBatch = {
filledNewData = false
(outputChars, outputFrom, outputTo, outputLength)
(outputChars, outputFrom, outputTo, outputLength, outputEol)
}

final def emitRest =
if (outputLength >= -1) {
if (filledNewData || !lineClosed) {
val from = outputFrom(outputLength)
val to = outputTo(outputLength)
val eol = outputEol(outputLength)
outputFrom(0) = from
outputTo(0) = to
outputEol(0) = eol
// check unclosed quotes
if (from >= 1 && outputChars(from - 1) == quoteChar) {
if (to > from && outputChars(to - 1) == quoteChar) {
outputTo(0) -= 1
(outputChars, outputFrom, outputTo, 1)
(outputChars, outputFrom, outputTo, 1, outputEol)
} else {
(outputChars, outputFrom, outputTo, -2)
(outputChars, outputFrom, outputTo, -2, outputEol)
}
} else (outputChars, outputFrom, outputTo, 1)
} else (outputChars, outputFrom, outputTo, 1, outputEol)
} else {
(outputChars, outputFrom, outputTo, 0)
(outputChars, outputFrom, outputTo, 0, outputEol)
}
} else ((outputChars, outputFrom, outputTo, outputLength))
} else ((outputChars, outputFrom, outputTo, outputLength, outputEol))

private def fillBuffer(): Boolean = {
if (!data.hasNext) {
Expand Down Expand Up @@ -1243,7 +1250,8 @@ private[csv] final class DataBuffer1(
recordSeparatorMask = recordSeparatorMask,
quoteMask = quoteMask,
from = outputFrom,
to = outputTo
to = outputTo,
eol = outputEol
)

if (recordSeparatorMask.contains(next.limit() - 1)) {
Expand Down Expand Up @@ -1282,6 +1290,7 @@ private[csv] final class DataBuffer2(

private val outputFrom = Array.ofDim[Int](bufferSize + 2)
private val outputTo = Array.ofDim[Int](bufferSize + 2)
private val outputEol = Array.ofDim[Int](bufferSize + 2)
private val quoteMask = BitSet.allocate(bufferSize)
private val crMask = BitSet.allocate(bufferSize)
private val lfMask = BitSet.allocate(bufferSize)
Expand All @@ -1299,29 +1308,30 @@ private[csv] final class DataBuffer2(

final def nextBatch = {
filledNewData = false
(outputChars, outputFrom, outputTo, outputLength)
(outputChars, outputFrom, outputTo, outputLength, outputEol)
}

final def emitRest =
if (outputLength >= -1) {
if (filledNewData || !lineClosed) {
outputFrom(0) = outputFrom(outputLength)
outputTo(0) = outputTo(outputLength)
outputEol(0) = outputEol(outputLength)
val from = outputFrom(0)
val to = outputTo(0)
// check unclosed quotes
if (from >= 1 && outputChars(from - 1) == quoteChar) {
if (to > from && outputChars(to - 1) == quoteChar) {
outputTo(0) -= 1
(outputChars, outputFrom, outputTo, 1)
(outputChars, outputFrom, outputTo, 1, outputEol)
} else {
(outputChars, outputFrom, outputTo, -2)
(outputChars, outputFrom, outputTo, -2, outputEol)
}
} else (outputChars, outputFrom, outputTo, 1)
} else (outputChars, outputFrom, outputTo, 1, outputEol)
} else {
(outputChars, outputFrom, outputTo, 0)
(outputChars, outputFrom, outputTo, 0, outputEol)
}
} else (outputChars, outputFrom, outputTo, outputLength)
} else (outputChars, outputFrom, outputTo, outputLength, outputEol)

private def fillBuffer(): Boolean = {
if (!data.hasNext) false
Expand Down Expand Up @@ -1383,7 +1393,8 @@ private[csv] final class DataBuffer2(
recordSeparatorMask = recordSeparatorMask,
quoteMask = quoteMask,
from = outputFrom,
to = outputTo
to = outputTo,
eol = outputEol
)

if (recordSeparatorMask.contains(next.limit() - 1)) {
Expand Down
14 changes: 9 additions & 5 deletions saddle-io/src/main/scala/org/saddle/io/csv/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ package object csv {
* @param to
* Int array of indices of end (exclusive) offsets of tokens in the
* `char` array. Owned by the parser.
* @param eol
* Int array of end-of-line markers (true if == -1) for the tokens in the
* `char` array. Owned by the parser.
* @param len
* Number of tokens. Do not read the index arrays beyond this number.
* @return
Expand All @@ -40,7 +43,8 @@ package object csv {
chars: Array[Char],
from: Array[Int],
to: Array[Int],
len: Int
len: Int,
eol: Array[Int]
): Control
}

Expand Down Expand Up @@ -104,8 +108,8 @@ package object csv {
var done = false
var errorString = ""
while (data.hasNext && !error && !done) {
val (chars, from, to, len) = data.nextBatch
callback(chars, from, to, len) match {
val (chars, from, to, len, eol) = data.nextBatch
callback(chars, from, to, len, eol) match {
case Done => done = true
case Next => ()
case Error(err) =>
Expand All @@ -120,8 +124,8 @@ package object csv {
}

if (!error && !done) {
val (chars, from, to, len) = data.emitRest
callback(chars, from, to, len) match {
val (chars, from, to, len, eol) = data.emitRest
callback(chars, from, to, len, eol) match {
case Error(err) =>
error = true
errorString = err
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,17 +147,19 @@ class DataBufferSpec extends Specification with ScalaCheck {
val buffer =
new org.saddle.io.csv.DataBuffer1(iter, '"', ',', '[', 4)
buffer.hasNext must_== true
val (chars: Array[Char], from, to, len) = buffer.nextBatch
val (chars: Array[Char], from, to, len, eol) = buffer.nextBatch

chars.toList must_== List(97, 44, 97, 0)
from.toList must_== List(0, 2, -1, 0, 0, 0)
to.toList must_== List(1, 3, 0, 0, 0, 0)
eol.toList must_== List(1, 1, 1, 1, 1, 1)
len must_== 1
buffer.hasNext must_== false
val (chars2, from2, to2, len2) = buffer.emitRest
val (chars2, from2, to2, len2, eol2) = buffer.emitRest
chars2.toList must_== List(97, 44, 97, 0)
from2.toList must_== List(2, 2, -1, 0, 0, 0)
to2.toList must_== List(3, 3, 0, 0, 0, 0)
eol2.toList must_== List(1, 1, 1, 1, 1, 1)
len2 must_== 1

1 must_== 1
Expand Down
24 changes: 21 additions & 3 deletions saddle-io/src/test/scala/org/saddle/io/csv/testcallbacks.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,26 @@ package org.saddle.io.csv

object TestCallbacks {
val noop = new Callback {
def apply(s: Array[Char], from: Array[Int], to: Array[Int], len: Int) =
def apply(
s: Array[Char],
from: Array[Int],
to: Array[Int],
len: Int,
eol: Array[Int]
) =
if (len >= 0) Next else Error("Unclosed")
}
}
class BufferCallback extends Callback {
val buffer = scala.collection.mutable.ArrayBuffer[String]()
def toList = buffer.toList
def apply(s: Array[Char], from: Array[Int], to: Array[Int], len: Int) = {
def apply(
s: Array[Char],
from: Array[Int],
to: Array[Int],
len: Int,
eol: Array[Int]
) = {
var i = 0
if (len < 0) Error("Unclosed quote")
else {
Expand All @@ -40,7 +52,13 @@ class BufferCallback extends Callback {
class ForeachCallback(t: (CharSequence, Int) => Unit) extends Callback {
val buffer = scala.collection.mutable.ArrayBuffer[String]()
def toList = buffer.toList
def apply(s: Array[Char], from: Array[Int], to: Array[Int], len: Int) = {
def apply(
s: Array[Char],
from: Array[Int],
to: Array[Int],
len: Int,
eol: Array[Int]
) = {
var i = 0
var loc = 0
if (len < 0) Error("Unclosed quote")
Expand Down

0 comments on commit f95c548

Please sign in to comment.