diff --git a/saddle-core-jvm-test/src/test/scala/org/saddle/csv/CsvCheck.scala b/saddle-core-jvm-test/src/test/scala/org/saddle/csv/CsvCheck.scala index 8ac9cffb..b5173214 100644 --- a/saddle-core-jvm-test/src/test/scala/org/saddle/csv/CsvCheck.scala +++ b/saddle-core-jvm-test/src/test/scala/org/saddle/csv/CsvCheck.scala @@ -25,14 +25,12 @@ class CsvCheck extends Specification with ScalaCheck { "csv99" in { val data = - s"""-------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf--------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf---------,-,----,-,--,------,$crlf---------,-,----,-,-,,""" + s"""-------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf--------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf---------,-,----,-,--,,---$crlf---------,-,----,-,--,-----A,$crlf?--------,-,----,-,-,,""" - println(data.toCharArray().map(_.toInt).toList.grouped(20).toList.mkString("\n")) val src = ByteChannel(data) - println(CsvParser - .parseFromChannel[String](src, bufferSize = 20)) - // frame.colAt(0) must_== Series("a", "", "") + CsvParser + .parseFromChannel[String](src, bufferSize = 20).toOption.get._1.numRows must_== 7 1 must_== 1 } diff --git a/saddle-core/src/main/scala/org/saddle/csv/CsvParser.scala b/saddle-core/src/main/scala/org/saddle/csv/CsvParser.scala index 78753726..226b71f7 100644 --- a/saddle-core/src/main/scala/org/saddle/csv/CsvParser.scala +++ b/saddle-core/src/main/scala/org/saddle/csv/CsvParser.scala @@ -304,7 +304,8 @@ object CsvParser { s: Array[Char], from: Array[Int], to: Array[Int], - len: Int + len: Int, + eol: Array[Int] ): org.saddle.io.csv.Control = { var i = 0 @@ -344,7 +345,7 @@ object CsvParser { } } - if (toi < 0) { + if (toi < 0 || eol(i) < 0) { if (line == 0 && !emptyLoc && headerLocFields != locs.length) { error = true errorString = diff --git a/saddle-io/src/main/scala/org/saddle/io/csv/DataBuffer.scala b/saddle-io/src/main/scala/org/saddle/io/csv/DataBuffer.scala index 
d776f2a5..d4583645 100644 --- a/saddle-io/src/main/scala/org/saddle/io/csv/DataBuffer.scala +++ b/saddle-io/src/main/scala/org/saddle/io/csv/DataBuffer.scala @@ -1028,7 +1028,8 @@ private[csv] object DataBuffer { recordSeparatorMask: BitSet, quoteMask: BitSet, from: Array[Int], - to: Array[Int] + to: Array[Int], + eol: Array[Int] ): Int = { var i = 0 from(i) = 0 @@ -1052,6 +1053,7 @@ private[csv] object DataBuffer { } } to(i) = (next + offset) * p + eol(i) = p i += 1 from(i) = next + 1 @@ -1068,7 +1070,8 @@ private[csv] object DataBuffer { recordSeparatorMask: BitSet, quoteMask: BitSet, from: Array[Int], - to: Array[Int] + to: Array[Int], + eol: Array[Int] ): Int = { var i = 0 from(i) = 0 @@ -1091,6 +1094,7 @@ private[csv] object DataBuffer { } } to(i) = (next + offset) * p + eol(i) = p i += 1 from(i) = next + 1 @@ -1121,8 +1125,8 @@ private[csv] sealed trait DataBuffer { * arrays. Elements present in the index array after this length are * should not get read. */ - def nextBatch: (Array[Char], Array[Int], Array[Int], Int) - def emitRest: (Array[Char], Array[Int], Array[Int], Int) + def nextBatch: (Array[Char], Array[Int], Array[Int], Int, Array[Int]) + def emitRest: (Array[Char], Array[Int], Array[Int], Int, Array[Int]) } private[csv] final class DataBuffer1( @@ -1141,6 +1145,7 @@ private[csv] final class DataBuffer1( private val outputFrom = Array.ofDim[Int](bufferSize + 2) private val outputTo = Array.ofDim[Int](bufferSize + 2) + private val outputEol = Array.ofDim[Int](bufferSize + 2) private val quoteMask = BitSet.allocate(bufferSize) private val lfMask = BitSet.allocate(bufferSize) private val fieldSeparatorMask = BitSet.allocate(bufferSize) @@ -1159,7 +1164,7 @@ private[csv] final class DataBuffer1( final def nextBatch = { filledNewData = false - (outputChars, outputFrom, outputTo, outputLength) + (outputChars, outputFrom, outputTo, outputLength, outputEol) } final def emitRest = @@ -1167,21 +1172,23 @@ private[csv] final class DataBuffer1( if (filledNewData 
|| !lineClosed) { val from = outputFrom(outputLength) val to = outputTo(outputLength) + val eol = outputEol(outputLength) outputFrom(0) = from outputTo(0) = to + outputEol(0) = eol // check unclosed quotes if (from >= 1 && outputChars(from - 1) == quoteChar) { if (to > from && outputChars(to - 1) == quoteChar) { outputTo(0) -= 1 - (outputChars, outputFrom, outputTo, 1) + (outputChars, outputFrom, outputTo, 1, outputEol) } else { - (outputChars, outputFrom, outputTo, -2) + (outputChars, outputFrom, outputTo, -2, outputEol) } - } else (outputChars, outputFrom, outputTo, 1) + } else (outputChars, outputFrom, outputTo, 1, outputEol) } else { - (outputChars, outputFrom, outputTo, 0) + (outputChars, outputFrom, outputTo, 0, outputEol) } - } else ((outputChars, outputFrom, outputTo, outputLength)) + } else ((outputChars, outputFrom, outputTo, outputLength, outputEol)) private def fillBuffer(): Boolean = { if (!data.hasNext) { @@ -1243,7 +1250,8 @@ private[csv] final class DataBuffer1( recordSeparatorMask = recordSeparatorMask, quoteMask = quoteMask, from = outputFrom, - to = outputTo + to = outputTo, + eol = outputEol ) if (recordSeparatorMask.contains(next.limit() - 1)) { @@ -1282,6 +1290,7 @@ private[csv] final class DataBuffer2( private val outputFrom = Array.ofDim[Int](bufferSize + 2) private val outputTo = Array.ofDim[Int](bufferSize + 2) + private val outputEol = Array.ofDim[Int](bufferSize + 2) private val quoteMask = BitSet.allocate(bufferSize) private val crMask = BitSet.allocate(bufferSize) private val lfMask = BitSet.allocate(bufferSize) @@ -1299,7 +1308,7 @@ private[csv] final class DataBuffer2( final def nextBatch = { filledNewData = false - (outputChars, outputFrom, outputTo, outputLength) + (outputChars, outputFrom, outputTo, outputLength, outputEol) } final def emitRest = @@ -1307,21 +1316,22 @@ private[csv] final class DataBuffer2( if (filledNewData || !lineClosed) { outputFrom(0) = outputFrom(outputLength) outputTo(0) = outputTo(outputLength) + 
outputEol(0) = outputEol(outputLength) val from = outputFrom(0) val to = outputTo(0) // check unclosed quotes if (from >= 1 && outputChars(from - 1) == quoteChar) { if (to > from && outputChars(to - 1) == quoteChar) { outputTo(0) -= 1 - (outputChars, outputFrom, outputTo, 1) + (outputChars, outputFrom, outputTo, 1, outputEol) } else { - (outputChars, outputFrom, outputTo, -2) + (outputChars, outputFrom, outputTo, -2, outputEol) } - } else (outputChars, outputFrom, outputTo, 1) + } else (outputChars, outputFrom, outputTo, 1, outputEol) } else { - (outputChars, outputFrom, outputTo, 0) + (outputChars, outputFrom, outputTo, 0, outputEol) } - } else (outputChars, outputFrom, outputTo, outputLength) + } else (outputChars, outputFrom, outputTo, outputLength, outputEol) private def fillBuffer(): Boolean = { if (!data.hasNext) false @@ -1383,7 +1393,8 @@ private[csv] final class DataBuffer2( recordSeparatorMask = recordSeparatorMask, quoteMask = quoteMask, from = outputFrom, - to = outputTo + to = outputTo, + eol = outputEol ) if (recordSeparatorMask.contains(next.limit() - 1)) { diff --git a/saddle-io/src/main/scala/org/saddle/io/csv/package.scala b/saddle-io/src/main/scala/org/saddle/io/csv/package.scala index 1fbfd66c..319c4552 100644 --- a/saddle-io/src/main/scala/org/saddle/io/csv/package.scala +++ b/saddle-io/src/main/scala/org/saddle/io/csv/package.scala @@ -30,6 +30,9 @@ package object csv { * @param to * Int array of indices of end (exclusive) offsets of tokens in the * `char` array. Owned by the parser. + * @param eol + * Int array of end-of-line markers, one per token (a token ends a line if + * its marker is negative, i.e. == -1). Owned by the parser. * @param len * Number of tokens. Do not read the index arrays beyond this number. 
* @return @@ -40,7 +43,8 @@ package object csv { chars: Array[Char], from: Array[Int], to: Array[Int], - len: Int + len: Int, + eol: Array[Int] ): Control } @@ -104,8 +108,8 @@ package object csv { var done = false var errorString = "" while (data.hasNext && !error && !done) { - val (chars, from, to, len) = data.nextBatch - callback(chars, from, to, len) match { + val (chars, from, to, len, eol) = data.nextBatch + callback(chars, from, to, len, eol) match { case Done => done = true case Next => () case Error(err) => @@ -120,8 +124,8 @@ package object csv { } if (!error && !done) { - val (chars, from, to, len) = data.emitRest - callback(chars, from, to, len) match { + val (chars, from, to, len, eol) = data.emitRest + callback(chars, from, to, len, eol) match { case Error(err) => error = true errorString = err diff --git a/saddle-io/src/test/scala/org/saddle/io/csv/databuffer.test.scala b/saddle-io/src/test/scala/org/saddle/io/csv/databuffer.test.scala index aab79956..21465591 100644 --- a/saddle-io/src/test/scala/org/saddle/io/csv/databuffer.test.scala +++ b/saddle-io/src/test/scala/org/saddle/io/csv/databuffer.test.scala @@ -147,17 +147,19 @@ class DataBufferSpec extends Specification with ScalaCheck { val buffer = new org.saddle.io.csv.DataBuffer1(iter, '"', ',', '[', 4) buffer.hasNext must_== true - val (chars: Array[Char], from, to, len) = buffer.nextBatch + val (chars: Array[Char], from, to, len, eol) = buffer.nextBatch chars.toList must_== List(97, 44, 97, 0) from.toList must_== List(0, 2, -1, 0, 0, 0) to.toList must_== List(1, 3, 0, 0, 0, 0) + eol.toList must_== List(1, 1, 1, 1, 1, 1) len must_== 1 buffer.hasNext must_== false - val (chars2, from2, to2, len2) = buffer.emitRest + val (chars2, from2, to2, len2, eol2) = buffer.emitRest chars2.toList must_== List(97, 44, 97, 0) from2.toList must_== List(2, 2, -1, 0, 0, 0) to2.toList must_== List(3, 3, 0, 0, 0, 0) + eol2.toList must_== List(1, 1, 1, 1, 1, 1) len2 must_== 1 1 must_== 1 diff --git 
a/saddle-io/src/test/scala/org/saddle/io/csv/testcallbacks.scala b/saddle-io/src/test/scala/org/saddle/io/csv/testcallbacks.scala index 4222db8c..6c7e4274 100644 --- a/saddle-io/src/test/scala/org/saddle/io/csv/testcallbacks.scala +++ b/saddle-io/src/test/scala/org/saddle/io/csv/testcallbacks.scala @@ -16,14 +16,26 @@ package org.saddle.io.csv object TestCallbacks { val noop = new Callback { - def apply(s: Array[Char], from: Array[Int], to: Array[Int], len: Int) = + def apply( + s: Array[Char], + from: Array[Int], + to: Array[Int], + len: Int, + eol: Array[Int] + ) = if (len >= 0) Next else Error("Unclosed") } } class BufferCallback extends Callback { val buffer = scala.collection.mutable.ArrayBuffer[String]() def toList = buffer.toList - def apply(s: Array[Char], from: Array[Int], to: Array[Int], len: Int) = { + def apply( + s: Array[Char], + from: Array[Int], + to: Array[Int], + len: Int, + eol: Array[Int] + ) = { var i = 0 if (len < 0) Error("Unclosed quote") else { @@ -40,7 +52,13 @@ class BufferCallback extends Callback { class ForeachCallback(t: (CharSequence, Int) => Unit) extends Callback { val buffer = scala.collection.mutable.ArrayBuffer[String]() def toList = buffer.toList - def apply(s: Array[Char], from: Array[Int], to: Array[Int], len: Int) = { + def apply( + s: Array[Char], + from: Array[Int], + to: Array[Int], + len: Int, + eol: Array[Int] + ) = { var i = 0 var loc = 0 if (len < 0) Error("Unclosed quote")