
Imagenet Pipeline #120

Merged: 11 commits, May 19, 2015
2 changes: 2 additions & 0 deletions src/main/scala/loaders/ImageNetLoader.scala
@@ -9,6 +9,8 @@ import utils.LabeledImage
 */

 object ImageNetLoader {

+  val NUM_CLASSES = 1000
+
   /**
    * Loads images from @dataPath and associates images with the labels provided in @labelPath
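As an aside, a class-count constant like this is typically used to size label vectors downstream. A minimal sketch of that usage, with a hypothetical one-hot helper that is not part of this diff:

import breeze.linalg.DenseVector

// Hypothetical helper, assuming 0-based ImageNet labels: build a one-hot
// label vector of length ImageNetLoader.NUM_CLASSES (1000).
def oneHotLabel(label: Int, numClasses: Int = 1000): DenseVector[Double] = {
  val v = DenseVector.zeros[Double](numClasses)
  v(label) = 1.0
  v
}

val y = oneHotLabel(42) // 1.0 at index 42, zeros elsewhere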
6 changes: 3 additions & 3 deletions src/main/scala/nodes/images/LCSExtractor.scala
@@ -90,7 +90,7 @@ class LCSExtractor(
       c = c + 1
     }

-    val lcsValues = new DenseMatrix[Float](numPoolsX * numPoolsY, numLCSValues)
+    val lcsValues = new DenseMatrix[Float](numLCSValues, numPoolsX * numPoolsY)

     var lcsIdx = 0
     // Start at strideStart in (x, y) and
@@ -114,10 +114,10 @@
       for (nx <- subPatchRange;
            ny <- subPatchRange) {
         // lcsValues(lcsIdx) = means(c).get((xPos + nx), (yPos + ny), 0)
-        lcsValues(xKeyPoint * numPoolsY + yKeyPoint, lcsIdx) =
+        lcsValues(lcsIdx, xKeyPoint * numPoolsY + yKeyPoint) =
           means(c).get((xPos + nx), (yPos + ny), 0).toFloat
         lcsIdx = lcsIdx + 1
-        lcsValues(xKeyPoint * numPoolsY + yKeyPoint, lcsIdx) =
+        lcsValues(lcsIdx, xKeyPoint * numPoolsY + yKeyPoint) =
           stds(c).get((xPos + nx), (yPos + ny), 0).toFloat
         lcsIdx = lcsIdx + 1
       }
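The substance of this change is a layout transpose: each LCS descriptor now occupies a column of a numLCSValues x (numPoolsX * numPoolsY) matrix, so column-oriented consumers (such as a column sampler) see one descriptor per column. A minimal Breeze sketch with toy dimensions, assuming the indexing convention above:

import breeze.linalg.{DenseMatrix, DenseVector}

val numLCSValues = 8                 // toy descriptor length
val (numPoolsX, numPoolsY) = (3, 3)  // toy pooling grid

// One descriptor per column, as in the new layout.
val lcsValues = new DenseMatrix[Float](numLCSValues, numPoolsX * numPoolsY)

// Writing entry lcsIdx of the descriptor for pooling cell (xKeyPoint, yKeyPoint):
val (xKeyPoint, yKeyPoint, lcsIdx) = (1, 2, 5)
lcsValues(lcsIdx, xKeyPoint * numPoolsY + yKeyPoint) = 1.0f

// Reading a whole descriptor back is now a single column slice.
val descriptor: DenseVector[Float] = lcsValues(::, xKeyPoint * numPoolsY + yKeyPoint)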
13 changes: 10 additions & 3 deletions src/main/scala/nodes/learning/BlockLinearMapper.scala
@@ -136,7 +136,7 @@ class BlockLeastSquaresEstimator(blockSize: Int, numIter: Int, lambda: Double =
     val b = RowPartitionedMatrix.fromArray(
       labelScaler.apply(trainingLabels).map(_.toArray)).cache()
     val numRows = Some(b.numRows())
-    val numCols = Some(trainingFeatures.head.first().length.toLong)
+    val numCols = Some(blockSize.toLong)

     // NOTE: This will cause trainingFeatures to be evaluated twice,
     // which might not be optimal if it's not cached.
@@ -159,8 +159,15 @@
   override def fit(
       trainingFeatures: RDD[DenseVector[Double]],
       trainingLabels: RDD[DenseVector[Double]]): BlockLinearMapper = {
-    val vectorSplitter = new VectorSplitter(blockSize)
+    fit(trainingFeatures, trainingLabels, None)
Contributor: Is it a problem to have a single version of these with None, or does it break the Estimator API?

Contributor (author): Yeah, I tried it and it breaks the API.

+  }
+
+  def fit(
+      trainingFeatures: RDD[DenseVector[Double]],
+      trainingLabels: RDD[DenseVector[Double]],
+      numFeaturesOpt: Option[Int]): BlockLinearMapper = {
+    val vectorSplitter = new VectorSplitter(blockSize, numFeaturesOpt)
     val featureBlocks = vectorSplitter.apply(trainingFeatures)
     fit(featureBlocks, trainingLabels)
   }
 }
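The point of the new three-argument overload: VectorSplitter otherwise calls in.first to learn the feature dimension, which triggers a Spark job on every fit. Callers that already know the dimension can pass it in. A hedged usage sketch, where the variable names and the 65536-dimension figure are illustrative assumptions:

import breeze.linalg.DenseVector
import org.apache.spark.rdd.RDD

def trainModel(
    features: RDD[DenseVector[Double]],
    labels: RDD[DenseVector[Double]]): BlockLinearMapper = {
  val solver = new BlockLeastSquaresEstimator(blockSize = 4096, numIter = 10)
  // Passing Some(...) skips the in.first call used to infer the feature count.
  solver.fit(features, labels, Some(65536))
}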
13 changes: 10 additions & 3 deletions src/main/scala/nodes/learning/BlockWeightedLeastSquares.scala
@@ -45,7 +45,14 @@ class BlockWeightedLeastSquaresEstimator(
   override def fit(
       trainingFeatures: RDD[DenseVector[Double]],
       trainingLabels: RDD[DenseVector[Double]]): BlockLinearMapper = {
-    val trainingFeaturesSplit = new VectorSplitter(blockSize).apply(trainingFeatures)
+    fit(trainingFeatures, trainingLabels, None)
+  }
+
+  def fit(
+      trainingFeatures: RDD[DenseVector[Double]],
+      trainingLabels: RDD[DenseVector[Double]],
+      numFeaturesOpt: Option[Int]): BlockLinearMapper = {
+    val trainingFeaturesSplit = new VectorSplitter(blockSize, numFeaturesOpt).apply(trainingFeatures)
     fit(trainingFeaturesSplit, trainingLabels)
   }

@@ -97,9 +104,9 @@ object BlockWeightedLeastSquaresEstimator extends Logging {
     }.collect():_*)

     // Initialize models to zero here. Each model is a (W, b)
-    // NOTE: We get first element from every training block here
     val models = trainingFeatures.map { block =>
-      val blockSize = block.first.length
+      // TODO: This assumes uniform block sizes. We should check the number of columns
+      // in each block to ensure safety.
       DenseMatrix.zeros[Double](blockSize, nClasses)
     }.toArray

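The TODO above flags a real edge case: when the feature dimension is not a multiple of blockSize, VectorSplitter's last block comes out narrower, while every zero-initialized model uses the full blockSize. A self-contained sketch of the mismatch, under those assumptions:

// Block widths produced for numFeatures = 10 and blockSize = 4:
val (numFeatures, blockSize) = (10, 4)
val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt
val widths = (0 until numBlocks).map { b =>
  math.min(blockSize, numFeatures - b * blockSize)
}
// widths == Vector(4, 4, 2): the final block has only 2 columns, while its
// zero-initialized model above is allocated with blockSize = 4 rows.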
13 changes: 8 additions & 5 deletions src/main/scala/nodes/stats/Sampling.scala
@@ -8,9 +8,13 @@ import pipelines.{FunctionNode, Transformer}
  * Given a collection of dense matrices, this will generate a sample of `numSamples` columns from the entire set.
  * @param numSamples
  */
-class ColumnSampler(numSamples: Int) extends Transformer[DenseMatrix[Float], DenseVector[Float]] {
-  override def apply(in: RDD[DenseMatrix[Float]]): RDD[DenseVector[Float]] = {
-    val numImgs = in.count.toInt
+class ColumnSampler(
+    numSamples: Int,
+    numImgsOpt: Option[Int] = None)
+  extends FunctionNode[RDD[DenseMatrix[Float]], RDD[DenseVector[Float]]] {
Contributor: Why is this FunctionNode and not a Transformer?

Contributor: Nevermind, I get it.

+  def apply(in: RDD[DenseMatrix[Float]]): RDD[DenseVector[Float]] = {
+    val numImgs = numImgsOpt.getOrElse(in.count.toInt)
     val samplesPerImage = numSamples / numImgs

     in.flatMap(mat => {
@@ -20,7 +24,6 @@ class ColumnSampler(numSamples: Int) extends Transformer[DenseMatrix[Float], DenseVector[Float]] {
     })
   }

-  def apply(in: DenseMatrix[Float]): DenseVector[Float] = ???
 }

 /**
@@ -31,4 +34,4 @@ class Sampler[T](val size: Int, val seed: Int = 42) extends FunctionNode[RDD[T], Array[T]] {
   def apply(in: RDD[T]): Array[T] = {
     in.takeSample(false, size, seed)
   }
 }
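Locally, the per-image work of ColumnSampler amounts to drawing samplesPerImage random columns from each matrix; note that numSamples / numImgs is integer division, so the total sample count rounds down when it does not divide evenly. A non-RDD sketch of that inner step, with a hypothetical helper name:

import breeze.linalg.{DenseMatrix, DenseVector}
import scala.util.Random

// Hypothetical local analogue of the flatMap body: sample n distinct
// columns from one matrix of descriptors.
def sampleColumns(mat: DenseMatrix[Float], n: Int): Seq[DenseVector[Float]] = {
  val picked = Random.shuffle((0 until mat.cols).toList).take(n)
  picked.map(c => mat(::, c).copy)
}

val cols = sampleColumns(DenseMatrix.rand(8, 100).map(_.toFloat), n = 5)
// cols: 5 distinct 8-dimensional column vectors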
10 changes: 8 additions & 2 deletions src/main/scala/nodes/stats/SignedHellingerMapper.scala
@@ -1,6 +1,6 @@
 package nodes.stats

-import breeze.linalg.DenseVector
+import breeze.linalg.{DenseVector, DenseMatrix}
 import breeze.numerics._
 import pipelines.Transformer
@@ -13,4 +13,10 @@ object SignedHellingerMapper extends Transformer[DenseVector[Double], DenseVector[Double]] {
   def apply(in: DenseVector[Double]): DenseVector[Double] = {
     signum(in) :* sqrt(abs(in))
   }
 }

+object BatchSignedHellingerMapper extends Transformer[DenseMatrix[Float], DenseMatrix[Float]] {
+  def apply(in: DenseMatrix[Float]): DenseMatrix[Float] = {
Contributor: I see a pattern emerging - perhaps we want to support Vector or Matrix in NumericTransformer?

+    in.map(x => (math.signum(x) * math.sqrt(math.abs(x))).toFloat)
+  }
+}
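Both mappers implement the signed Hellinger map x -> sign(x) * sqrt(|x|), which keeps the sign while compressing magnitudes. A quick self-contained check that the vector and batch forms agree:

import breeze.linalg.{DenseMatrix, DenseVector}
import breeze.numerics.{abs, signum, sqrt}

// Vector form, as in SignedHellingerMapper:
val v = DenseVector(4.0, -9.0, 0.25)
val mappedV = signum(v) :* sqrt(abs(v)) // DenseVector(2.0, -3.0, 0.5)

// Batch (matrix) form, element-wise as in BatchSignedHellingerMapper:
val m = DenseMatrix((4.0f, -9.0f), (0.25f, -1.0f))
val mappedM = m.map(x => (math.signum(x) * math.sqrt(math.abs(x))).toFloat)
// mappedM == DenseMatrix((2.0f, -3.0f), (0.5f, -1.0f))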
10 changes: 7 additions & 3 deletions src/main/scala/nodes/util/VectorSplitter.scala
@@ -7,9 +7,13 @@ import pipelines.FunctionNode
 /**
  * This transformer splits the input vector into a number of blocks.
  */
-class VectorSplitter(blockSize: Int) extends FunctionNode[RDD[DenseVector[Double]], Seq[RDD[DenseVector[Double]]]] {
+class VectorSplitter(
+    blockSize: Int,
+    numFeaturesOpt: Option[Int] = None)
+  extends FunctionNode[RDD[DenseVector[Double]], Seq[RDD[DenseVector[Double]]]] {

   override def apply(in: RDD[DenseVector[Double]]): Seq[RDD[DenseVector[Double]]] = {
-    val numFeatures = in.first.length
+    val numFeatures = numFeaturesOpt.getOrElse(in.first.length)
     val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt
     (0 until numBlocks).map { blockNum =>
       in.map { vec =>
@@ -20,7 +24,7 @@ class VectorSplitter(blockSize: Int) extends FunctionNode[RDD[DenseVector[Double]], Seq[RDD[DenseVector[Double]]]] {
   }

   def splitVector(in: DenseVector[Double]): Seq[DenseVector[Double]] = {
-    val numFeatures = in.length
+    val numFeatures = numFeaturesOpt.getOrElse(in.length)
     val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt
     (0 until numBlocks).map { blockNum =>
       // Explicitly call toArray, as breeze's slice is lazy
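For concreteness, here is what the splitting logic does to a single length-10 vector with blockSize = 4, written as a self-contained Breeze snippet that mirrors splitVector:

import breeze.linalg.DenseVector

val in = DenseVector((1 to 10).map(_.toDouble).toArray)
val blockSize = 4
val numBlocks = math.ceil(in.length.toDouble / blockSize).toInt
val blocks = (0 until numBlocks).map { blockNum =>
  val end = math.min((blockNum + 1) * blockSize, in.length)
  // Explicitly materialize with toArray, since breeze's slice is lazy.
  DenseVector(in(blockNum * blockSize until end).toArray)
}
// blocks.map(_.length) == Vector(4, 4, 2)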