
Commit

Improved quickstart
- improved Quickstart.scala
- minor fix in IntervalExpression
- added docs/.gitignore to remove generated apidocs and copied reference/sql_functions
sumwale committed Oct 27, 2021
1 parent 06bbac6 commit 05ebab4
Showing 5 changed files with 47 additions and 28 deletions.
@@ -80,8 +80,8 @@ case class IntervalExpression(children: Seq[Expression], units: Seq[Long])
    var months = 0
    var micros = 0L
    unit match {
-     case -1 => months = v.toInt * 12
-     case -2 => months = v.toInt
+     case -1L => months = v.toInt * 12
+     case -2L => months = v.toInt
      case _ => micros = v * unit
    }
    new CalendarInterval(months, micros)
@@ -100,7 +100,7 @@ case class IntervalExpression(children: Seq[Expression], units: Seq[Long])
       |${childGen.code}
       |$intervalClass ${ev.value};
       |${doGenCodeSingle(childGen.value, childIsNull, ev.value,
-         units.head.toString, months, micros, intervalClass)}
+         s"${units.head}L", months, micros, intervalClass)}
    """.stripMargin
    if (childIsNull == "false") {
      ev.copy(code = code, isNull = "false")
@@ -153,8 +153,8 @@ case class IntervalExpression(children: Seq[Expression], units: Seq[Long])
       |} else {
       |  int $months = 0;
       |  long $micros = 0L;
-      |  if ($unit == -1) $months = ((int)$value) * 12;
-      |  else if ($unit == -2) $months = (int)$value;
+      |  if ($unit == -1L) $months = ((int)$value) * 12;
+      |  else if ($unit == -2L) $months = (int)$value;
       |  else $micros = $value * $unit;
       |  $result = new $intervalClass($months, $micros);
       |}
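A note on the IntervalExpression change above: the commit message calls it only a "minor fix", but a plausible reading is that it guards the code generator against invalid Java literals. Assuming `units` carries microsecond multipliers such as Spark's `CalendarInterval.MICROS_PER_HOUR` (3600000000, which exceeds `Int.MaxValue`), splicing `units.head.toString` into the generated source emits a bare literal that javac rejects as an out-of-range int; the appended `L` turns it into a valid Java long literal, and the `case -1L`/`case -2L` patterns likewise align the Scala match with its `Long` scrutinee. A minimal sketch of the effect (illustration only, not the repository's code):

    // Splicing a Scala Long into generated Java source.
    // 3600000000 exceeds Int.MaxValue, so the Java literal needs an 'L' suffix.
    val unit = 3600000000L

    // Before the fix: emits `value * 3600000000`, which javac rejects
    // ("integer number too large").
    val before = s"long micros = value * ${unit.toString};"

    // After the fix: emits `value * 3600000000L`, a valid long literal.
    val after = s"long micros = value * ${unit}L;"

    println(before) // long micros = value * 3600000000;
    println(after)  // long micros = value * 3600000000L;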
2 changes: 2 additions & 0 deletions docs/.gitignore
@@ -0,0 +1,2 @@
+apidocs/
+reference/sql_functions/
1 change: 0 additions & 1 deletion docs/reference/.gitignore

This file was deleted.

60 changes: 39 additions & 21 deletions examples/quickstart/scripts/Quickstart.scala
@@ -17,53 +17,71 @@
 /**
  * This script demonstrates the performance difference between Spark and SnappyData.
  * To execute this script on Spark you can use the command below:
- * ./spark-shell --driver-memory 4g --master local[*] --packages "TIBCOSoftware:snappydata:1.3.0-s_2.11" \
- *     -i Quickstart.scala
- *
- * To execute this script on spark you can use same command as above without specifying packages
- * as follows:
- * ./bin/spark-shell --driver-memory=4g --driver-java-options="-XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:+CMSClassUnloadingEnabled -XX:MaxNewSize=1g -i Quickstart.scala
+ * ./spark-shell --driver-memory 4g --master local[*] \
+ *     --packages "TIBCOSoftware:snappydata:1.3.0-s_2.11" -i Quickstart.scala
+ *
+ * Or you can execute this script on SnappyData's Spark distribution with the same command as
+ * above, without requiring packages, as follows:
+ *
+ * ./bin/spark-shell --driver-memory=4g --driver-java-options="-XX:+UseConcMarkSweepGC \
+ *     -XX:+UseParNewGC -XX:+CMSClassUnloadingEnabled -XX:MaxNewSize=1g" -i Quickstart.scala
 */

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

-//Benchmark function that will execute a function and returns time taken to execute that function
-def benchmark(name: String, times: Int = 5, warmups: Int = 3)(f: => Unit) : Double = {
-  for (i <- 1 to warmups) {
+// Benchmark function that executes a given function and returns the average time taken per run
+def benchmark(name: String, times: Int = 5, warmups: Int = 3)(f: => Unit): Double = {
+  for (_ <- 1 to warmups) {
f
}
val startTime = System.nanoTime
-  for (i <- 1 to times) {
+  for (_ <- 1 to times) {
f
}
val endTime = System.nanoTime
val timeTaken = (endTime - startTime).toDouble / (times * 1000000.0)
// scalastyle:off println
println()
println(s"Average time taken in $name for $times runs: $timeTaken millis")
println()
// scalastyle:on println
timeTaken
}

-//Create Dataframe can register temp table
-var testDF = spark.range(100000000).selectExpr("id", "concat('sym', cast((id % 100) as STRING)) as sym")
-testDF.cache
+sc.setLogLevel("ERROR")
+// Create DataFrame and register a temp table
+var testDF = spark.range(100000000).selectExpr("id",
+  "concat('sym', cast((id % 100) as STRING)) as sym")
+testDF.cache()
testDF.createOrReplaceTempView("sparkCacheTable")


-val timeTakenSpark = benchmark("Spark perf") {spark.sql("select sym, avg(id) from sparkCacheTable group by sym").collect()}
+val timeTakenSpark = benchmark("Spark perf") {
+  spark.sql("select sym, avg(id) from sparkCacheTable group by sym").collect()
+}


testDF.unpersist()
System.gc()
System.runFinalization()


-//Create SnappySession to execute queries from spark
+// Create SnappySession to execute queries from Spark
val snappy = new org.apache.spark.sql.SnappySession(spark.sparkContext)
-testDF = snappy.range(100000000).selectExpr("id", "concat('sym', cast((id % 100) as varchar(10))) as sym")
+testDF = snappy.range(100000000).selectExpr("id",
+  "concat('sym', cast((id % 100) as varchar(10))) as sym")

snappy.sql("drop table if exists snappyTable")
snappy.sql("create table snappyTable (id bigint not null, sym varchar(10) not null) using column")
benchmark("Snappy insert perf", 1, 0) {testDF.write.insertInto("snappyTable") }
benchmark("Snappy insert perf", 1, 0) {
testDF.write.insertInto("snappyTable")
}

+// GROUP BY on a single key is faster with the older implementation when the result set is small
+snappy.conf.set("snappydata.sql.useOptimizedHashAggregateForSingleKey", "false")
+// Direct collect for GROUP BY at the driver avoids an EXCHANGE when the result set is small
+snappy.conf.set("snappydata.sql.useDriverCollectForGroupBy", "true")

val timeTakenSnappy = benchmark("Snappy perf") {
snappy.sql("select sym, avg(id) from snappyTable group by sym").collect()
}

val timeTakenSnappy = benchmark("Snappy perf") {snappy.sql("select sym, avg(id) from snappyTable group by sym").collect()}
System.exit(0)
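The script prints each timing separately and then exits; if one wanted the relative speedup as well, a minimal follow-up sketch (hypothetical, not part of this commit) could be placed before the `System.exit(0)` call, where both `timeTakenSpark` and `timeTakenSnappy` are still in scope:

    // Hypothetical addition: report the relative speedup of the two runs.
    // Both values are average milliseconds per run, as returned by benchmark().
    val speedup = timeTakenSpark / timeTakenSnappy
    // scalastyle:off println
    println(f"SnappyData was $speedup%.2fx faster than Spark's cached table for this query")
    // scalastyle:on println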
2 changes: 1 addition & 1 deletion snappy-connectors

