
Imagenet Pipeline #120

Merged: 11 commits, May 19, 2015
2 changes: 2 additions & 0 deletions src/main/scala/loaders/ImageNetLoader.scala
@@ -9,6 +9,8 @@ import utils.LabeledImage
 */

 object ImageNetLoader {

+  val NUM_CLASSES = 1000
+
   /**
    * Loads images from @dataPath and associates images with the labels provided in @labelPath
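As an aside, a class-count constant like this is typically used to size label vectors downstream. A minimal sketch of that usage, with a hypothetical one-hot helper that is not part of this diff:

import breeze.linalg.DenseVector

// Hypothetical helper, assuming 0-based ImageNet labels: build a one-hot
// label vector of length ImageNetLoader.NUM_CLASSES (1000).
def oneHotLabel(label: Int, numClasses: Int = 1000): DenseVector[Double] = {
  val v = DenseVector.zeros[Double](numClasses)
  v(label) = 1.0
  v
}

val y = oneHotLabel(42) // 1.0 at index 42, zeros elsewhere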
6 changes: 3 additions & 3 deletions src/main/scala/nodes/images/LCSExtractor.scala
@@ -90,7 +90,7 @@ class LCSExtractor(
       c = c + 1
     }

-    val lcsValues = new DenseMatrix[Float](numPoolsX * numPoolsY, numLCSValues)
+    val lcsValues = new DenseMatrix[Float](numLCSValues, numPoolsX * numPoolsY)

     var lcsIdx = 0
     // Start at strideStart in (x, y) and
@@ -114,10 +114,10 @@
       for (nx <- subPatchRange;
            ny <- subPatchRange) {
         // lcsValues(lcsIdx) = means(c).get((xPos + nx), (yPos + ny), 0)
-        lcsValues(xKeyPoint * numPoolsY + yKeyPoint, lcsIdx) =
+        lcsValues(lcsIdx, xKeyPoint * numPoolsY + yKeyPoint) =
           means(c).get((xPos + nx), (yPos + ny), 0).toFloat
         lcsIdx = lcsIdx + 1
-        lcsValues(xKeyPoint * numPoolsY + yKeyPoint, lcsIdx) =
+        lcsValues(lcsIdx, xKeyPoint * numPoolsY + yKeyPoint) =
           stds(c).get((xPos + nx), (yPos + ny), 0).toFloat
         lcsIdx = lcsIdx + 1
       }
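The substance of this change is a layout transpose: each LCS descriptor now occupies a column of a numLCSValues x (numPoolsX * numPoolsY) matrix, so column-oriented consumers (such as a column sampler) see one descriptor per column. A minimal Breeze sketch with toy dimensions, assuming the indexing convention above:

import breeze.linalg.{DenseMatrix, DenseVector}

val numLCSValues = 8                 // toy descriptor length
val (numPoolsX, numPoolsY) = (3, 3)  // toy pooling grid

// One descriptor per column, as in the new layout.
val lcsValues = new DenseMatrix[Float](numLCSValues, numPoolsX * numPoolsY)

// Writing entry lcsIdx of the descriptor for pooling cell (xKeyPoint, yKeyPoint):
val (xKeyPoint, yKeyPoint, lcsIdx) = (1, 2, 5)
lcsValues(lcsIdx, xKeyPoint * numPoolsY + yKeyPoint) = 1.0f

// Reading a whole descriptor back is now a single column slice.
val descriptor: DenseVector[Float] = lcsValues(::, xKeyPoint * numPoolsY + yKeyPoint)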
13 changes: 10 additions & 3 deletions src/main/scala/nodes/learning/BlockLinearMapper.scala
@@ -136,7 +136,7 @@ class BlockLeastSquaresEstimator(blockSize: Int, numIter: Int, lambda: Double =
     val b = RowPartitionedMatrix.fromArray(
       labelScaler.apply(trainingLabels).map(_.toArray)).cache()
     val numRows = Some(b.numRows())
-    val numCols = Some(trainingFeatures.head.first().length.toLong)
+    val numCols = Some(blockSize.toLong)

     // NOTE: This will cause trainingFeatures to be evaluated twice,
     // which might not be optimal if it's not cached.
@@ -159,8 +159,15 @@
   override def fit(
       trainingFeatures: RDD[DenseVector[Double]],
       trainingLabels: RDD[DenseVector[Double]]): BlockLinearMapper = {
-    val vectorSplitter = new VectorSplitter(blockSize)
+    fit(trainingFeatures, trainingLabels, None)
Contributor: Is it a problem to have a single version of these with None, or does it break the Estimator API?

Contributor (author): Yeah, I tried it and it breaks the API.

+  }
+
+  def fit(
+      trainingFeatures: RDD[DenseVector[Double]],
+      trainingLabels: RDD[DenseVector[Double]],
+      numFeaturesOpt: Option[Int]): BlockLinearMapper = {
+    val vectorSplitter = new VectorSplitter(blockSize, numFeaturesOpt)
     val featureBlocks = vectorSplitter.apply(trainingFeatures)
     fit(featureBlocks, trainingLabels)
   }
 }
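The point of the new three-argument overload: VectorSplitter otherwise calls in.first to learn the feature dimension, which triggers a Spark job on every fit. Callers that already know the dimension can pass it in. A hedged usage sketch, where the variable names and the 65536-dimension figure are illustrative assumptions:

import breeze.linalg.DenseVector
import org.apache.spark.rdd.RDD

def trainModel(
    features: RDD[DenseVector[Double]],
    labels: RDD[DenseVector[Double]]): BlockLinearMapper = {
  val solver = new BlockLeastSquaresEstimator(blockSize = 4096, numIter = 10)
  // Passing Some(...) skips the in.first call used to infer the feature count.
  solver.fit(features, labels, Some(65536))
}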
13 changes: 10 additions & 3 deletions src/main/scala/nodes/learning/BlockWeightedLeastSquares.scala
@@ -45,7 +45,14 @@ class BlockWeightedLeastSquaresEstimator(
   override def fit(
       trainingFeatures: RDD[DenseVector[Double]],
       trainingLabels: RDD[DenseVector[Double]]): BlockLinearMapper = {
-    val trainingFeaturesSplit = new VectorSplitter(blockSize).apply(trainingFeatures)
+    fit(trainingFeatures, trainingLabels, None)
+  }
+
+  def fit(
+      trainingFeatures: RDD[DenseVector[Double]],
+      trainingLabels: RDD[DenseVector[Double]],
+      numFeaturesOpt: Option[Int]): BlockLinearMapper = {
+    val trainingFeaturesSplit = new VectorSplitter(blockSize, numFeaturesOpt).apply(trainingFeatures)
     fit(trainingFeaturesSplit, trainingLabels)
   }

@@ -97,9 +104,9 @@ object BlockWeightedLeastSquaresEstimator extends Logging {
     }.collect():_*)

     // Initialize models to zero here. Each model is a (W, b)
-    // NOTE: We get first element from every training block here
     val models = trainingFeatures.map { block =>
-      val blockSize = block.first.length
+      // TODO: This assumes uniform block sizes. We should check the number of columns
+      // in each block to ensure safety.
       DenseMatrix.zeros[Double](blockSize, nClasses)
     }.toArray

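The TODO above flags a real edge case: when the feature dimension is not a multiple of blockSize, VectorSplitter's last block comes out narrower, while every zero-initialized model uses the full blockSize. A self-contained sketch of the mismatch, under those assumptions:

// Block widths produced for numFeatures = 10 and blockSize = 4:
val (numFeatures, blockSize) = (10, 4)
val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt
val widths = (0 until numBlocks).map { b =>
  math.min(blockSize, numFeatures - b * blockSize)
}
// widths == Vector(4, 4, 2): the final block has only 2 columns, while its
// zero-initialized model above is allocated with blockSize = 4 rows.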
13 changes: 8 additions & 5 deletions src/main/scala/nodes/stats/Sampling.scala
@@ -8,9 +8,13 @@ import pipelines.{FunctionNode, Transformer}
  * Given a collection of dense matrices, this will generate a sample of `numSamples` columns from the entire set.
  * @param numSamples
  */
-class ColumnSampler(numSamples: Int) extends Transformer[DenseMatrix[Float], DenseVector[Float]] {
-  override def apply(in: RDD[DenseMatrix[Float]]): RDD[DenseVector[Float]] = {
-    val numImgs = in.count.toInt
+class ColumnSampler(
+    numSamples: Int,
+    numImgsOpt: Option[Int] = None)
+  extends FunctionNode[RDD[DenseMatrix[Float]], RDD[DenseVector[Float]]] {
Contributor: Why is this FunctionNode and not a Transformer?

Contributor: Nevermind, I get it.

+  def apply(in: RDD[DenseMatrix[Float]]): RDD[DenseVector[Float]] = {
+    val numImgs = numImgsOpt.getOrElse(in.count.toInt)
     val samplesPerImage = numSamples / numImgs

     in.flatMap(mat => {
@@ -20,7 +24,6 @@ class ColumnSampler(numSamples: Int) extends Transformer[DenseMatrix[Float], DenseVector[Float]] {
     })
   }

-  def apply(in: DenseMatrix[Float]): DenseVector[Float] = ???
 }

 /**
@@ -31,4 +34,4 @@ class Sampler[T](val size: Int, val seed: Int = 42) extends FunctionNode[RDD[T], Array[T]] {
   def apply(in: RDD[T]): Array[T] = {
     in.takeSample(false, size, seed)
   }
 }
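Locally, the per-image work of ColumnSampler amounts to drawing samplesPerImage random columns from each matrix; note that numSamples / numImgs is integer division, so the total sample count rounds down when it does not divide evenly. A non-RDD sketch of that inner step, with a hypothetical helper name:

import breeze.linalg.{DenseMatrix, DenseVector}
import scala.util.Random

// Hypothetical local analogue of the flatMap body: sample n distinct
// columns from one matrix of descriptors.
def sampleColumns(mat: DenseMatrix[Float], n: Int): Seq[DenseVector[Float]] = {
  val picked = Random.shuffle((0 until mat.cols).toList).take(n)
  picked.map(c => mat(::, c).copy)
}

val cols = sampleColumns(DenseMatrix.rand(8, 100).map(_.toFloat), n = 5)
// cols: 5 distinct 8-dimensional column vectors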
10 changes: 8 additions & 2 deletions src/main/scala/nodes/stats/SignedHellingerMapper.scala
@@ -1,6 +1,6 @@
 package nodes.stats

-import breeze.linalg.DenseVector
+import breeze.linalg.{DenseVector, DenseMatrix}
 import breeze.numerics._
 import pipelines.Transformer
@@ -13,4 +13,10 @@ object SignedHellingerMapper extends Transformer[DenseVector[Double], DenseVector[Double]] {
   def apply(in: DenseVector[Double]): DenseVector[Double] = {
     signum(in) :* sqrt(abs(in))
   }
 }

+object BatchSignedHellingerMapper extends Transformer[DenseMatrix[Float], DenseMatrix[Float]] {
+  def apply(in: DenseMatrix[Float]): DenseMatrix[Float] = {
Contributor: I see a pattern emerging - perhaps we want to support Vector or Matrix in NumericTransformer?

+    in.map(x => (math.signum(x) * math.sqrt(math.abs(x))).toFloat)
+  }
+}
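Both mappers implement the signed Hellinger map x -> sign(x) * sqrt(|x|), which keeps the sign while compressing magnitudes. A quick self-contained check that the vector and batch forms agree:

import breeze.linalg.{DenseMatrix, DenseVector}
import breeze.numerics.{abs, signum, sqrt}

// Vector form, as in SignedHellingerMapper:
val v = DenseVector(4.0, -9.0, 0.25)
val mappedV = signum(v) :* sqrt(abs(v)) // DenseVector(2.0, -3.0, 0.5)

// Batch (matrix) form, element-wise as in BatchSignedHellingerMapper:
val m = DenseMatrix((4.0f, -9.0f), (0.25f, -1.0f))
val mappedM = m.map(x => (math.signum(x) * math.sqrt(math.abs(x))).toFloat)
// mappedM == DenseMatrix((2.0f, -3.0f), (0.5f, -1.0f))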
10 changes: 7 additions & 3 deletions src/main/scala/nodes/util/VectorSplitter.scala
@@ -7,9 +7,13 @@ import pipelines.FunctionNode
 /**
  * This transformer splits the input vector into a number of blocks.
  */
-class VectorSplitter(blockSize: Int) extends FunctionNode[RDD[DenseVector[Double]], Seq[RDD[DenseVector[Double]]]] {
+class VectorSplitter(
+    blockSize: Int,
+    numFeaturesOpt: Option[Int] = None)
+  extends FunctionNode[RDD[DenseVector[Double]], Seq[RDD[DenseVector[Double]]]] {

   override def apply(in: RDD[DenseVector[Double]]): Seq[RDD[DenseVector[Double]]] = {
-    val numFeatures = in.first.length
+    val numFeatures = numFeaturesOpt.getOrElse(in.first.length)
     val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt
     (0 until numBlocks).map { blockNum =>
       in.map { vec =>
@@ -20,7 +24,7 @@ class VectorSplitter(blockSize: Int) extends FunctionNode[RDD[DenseVector[Double]], Seq[RDD[DenseVector[Double]]]] {
   }

   def splitVector(in: DenseVector[Double]): Seq[DenseVector[Double]] = {
-    val numFeatures = in.length
+    val numFeatures = numFeaturesOpt.getOrElse(in.length)
     val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt
     (0 until numBlocks).map { blockNum =>
       // Explicitly call toArray, as breeze's slice is lazy
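For concreteness, here is what the splitting logic does to a single length-10 vector with blockSize = 4, written as a self-contained Breeze snippet that mirrors splitVector:

import breeze.linalg.DenseVector

val in = DenseVector((1 to 10).map(_.toDouble).toArray)
val blockSize = 4
val numBlocks = math.ceil(in.length.toDouble / blockSize).toInt
val blocks = (0 until numBlocks).map { blockNum =>
  val end = math.min((blockNum + 1) * blockSize, in.length)
  // Explicitly materialize with toArray, since breeze's slice is lazy.
  DenseVector(in(blockNum * blockSize until end).toArray)
}
// blocks.map(_.length) == Vector(4, 4, 2)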