
Commit

Improved quickstart
- improved Quickstart.scala
- minor fix in IntervalExpression
- added docs/.gitignore to remove generated apidocs and copied reference/sql_functions
sumwale committed Oct 27, 2021
1 parent 06bbac6 commit 05ebab4
Showing 5 changed files with 47 additions and 28 deletions.
@@ -80,8 +80,8 @@ case class IntervalExpression(children: Seq[Expression], units: Seq[Long])
    var months = 0
    var micros = 0L
    unit match {
-     case -1 => months = v.toInt * 12
-     case -2 => months = v.toInt
+     case -1L => months = v.toInt * 12
+     case -2L => months = v.toInt
      case _ => micros = v * unit
    }
    new CalendarInterval(months, micros)
@@ -100,7 +100,7 @@ case class IntervalExpression(children: Seq[Expression], units: Seq[Long])
       |${childGen.code}
       |$intervalClass ${ev.value};
       |${doGenCodeSingle(childGen.value, childIsNull, ev.value,
-         units.head.toString, months, micros, intervalClass)}
+         s"${units.head}L", months, micros, intervalClass)}
    """.stripMargin
    if (childIsNull == "false") {
      ev.copy(code = code, isNull = "false")
@@ -153,8 +153,8 @@ case class IntervalExpression(children: Seq[Expression], units: Seq[Long])
       |} else {
       |  int $months = 0;
       |  long $micros = 0L;
-      |  if ($unit == -1) $months = ((int)$value) * 12;
-      |  else if ($unit == -2) $months = (int)$value;
+      |  if ($unit == -1L) $months = ((int)$value) * 12;
+      |  else if ($unit == -2L) $months = (int)$value;
       |  else $micros = $value * $unit;
       |  $result = new $intervalClass($months, $micros);
       |}
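A note on the IntervalExpression change above: the commit message calls it only a "minor fix", but a plausible reading is that it guards the code generator against invalid Java literals. Assuming `units` carries microsecond multipliers such as Spark's `CalendarInterval.MICROS_PER_HOUR` (3600000000, which exceeds `Int.MaxValue`), splicing `units.head.toString` into the generated source emits a bare literal that javac rejects as an out-of-range int; the appended `L` turns it into a valid Java long literal, and the `case -1L`/`case -2L` patterns likewise align the Scala match with its `Long` scrutinee. A minimal sketch of the effect (illustration only, not the repository's code):

    // Splicing a Scala Long into generated Java source.
    // 3600000000 exceeds Int.MaxValue, so the Java literal needs an 'L' suffix.
    val unit = 3600000000L

    // Before the fix: emits `value * 3600000000`, which javac rejects
    // ("integer number too large").
    val before = s"long micros = value * ${unit.toString};"

    // After the fix: emits `value * 3600000000L`, a valid long literal.
    val after = s"long micros = value * ${unit}L;"

    println(before) // long micros = value * 3600000000;
    println(after)  // long micros = value * 3600000000L;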
2 changes: 2 additions & 0 deletions docs/.gitignore
@@ -0,0 +1,2 @@
+apidocs/
+reference/sql_functions/
1 change: 0 additions & 1 deletion docs/reference/.gitignore

This file was deleted.

60 changes: 39 additions & 21 deletions examples/quickstart/scripts/Quickstart.scala
@@ -17,53 +17,71 @@
 /**
  * This script demonstrates the performance difference between Spark and SnappyData.
  * To execute this script on Spark you can use the command below:
- * ./spark-shell --driver-memory 4g --master local[*] --packages "TIBCOSoftware:snappydata:1.3.0-s_2.11" \
- *     -i Quickstart.scala
- *
- * To execute this script on spark you can use same command as above without specifying packages
- * as follows:
- * ./bin/spark-shell --driver-memory=4g --driver-java-options="-XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:+CMSClassUnloadingEnabled -XX:MaxNewSize=1g -i Quickstart.scala
+ * ./spark-shell --driver-memory 4g --master local[*] \
+ *     --packages "TIBCOSoftware:snappydata:1.3.0-s_2.11" -i Quickstart.scala
+ *
+ * Or you can execute this script on SnappyData's Spark distribution with the same command as
+ * above, without requiring packages, as follows:
+ *
+ * ./bin/spark-shell --driver-memory=4g --driver-java-options="-XX:+UseConcMarkSweepGC \
+ *     -XX:+UseParNewGC -XX:+CMSClassUnloadingEnabled -XX:MaxNewSize=1g" -i Quickstart.scala
 */

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

-//Benchmark function that will execute a function and returns time taken to execute that function
-def benchmark(name: String, times: Int = 5, warmups: Int = 3)(f: => Unit) : Double = {
-  for (i <- 1 to warmups) {
+// Benchmark function that executes a given function and returns the average time taken per run
+def benchmark(name: String, times: Int = 5, warmups: Int = 3)(f: => Unit): Double = {
+  for (_ <- 1 to warmups) {
f
}
val startTime = System.nanoTime
-  for (i <- 1 to times) {
+  for (_ <- 1 to times) {
f
}
val endTime = System.nanoTime
val timeTaken = (endTime - startTime).toDouble / (times * 1000000.0)
// scalastyle:off println
println()
println(s"Average time taken in $name for $times runs: $timeTaken millis")
println()
// scalastyle:on println
timeTaken
}

-//Create Dataframe can register temp table
-var testDF = spark.range(100000000).selectExpr("id", "concat('sym', cast((id % 100) as STRING)) as sym")
-testDF.cache
+sc.setLogLevel("ERROR")
+// Create DataFrame and register a temp table
+var testDF = spark.range(100000000).selectExpr("id",
+  "concat('sym', cast((id % 100) as STRING)) as sym")
+testDF.cache()
testDF.createOrReplaceTempView("sparkCacheTable")


-val timeTakenSpark = benchmark("Spark perf") {spark.sql("select sym, avg(id) from sparkCacheTable group by sym").collect()}
+val timeTakenSpark = benchmark("Spark perf") {
+  spark.sql("select sym, avg(id) from sparkCacheTable group by sym").collect()
+}


testDF.unpersist()
System.gc()
System.runFinalization()


-//Create SnappySession to execute queries from spark
+// Create SnappySession to execute queries from Spark
val snappy = new org.apache.spark.sql.SnappySession(spark.sparkContext)
-testDF = snappy.range(100000000).selectExpr("id", "concat('sym', cast((id % 100) as varchar(10))) as sym")
+testDF = snappy.range(100000000).selectExpr("id",
+  "concat('sym', cast((id % 100) as varchar(10))) as sym")

snappy.sql("drop table if exists snappyTable")
snappy.sql("create table snappyTable (id bigint not null, sym varchar(10) not null) using column")
benchmark("Snappy insert perf", 1, 0) {testDF.write.insertInto("snappyTable") }
benchmark("Snappy insert perf", 1, 0) {
testDF.write.insertInto("snappyTable")
}

+// GROUP BY on a single key is faster with the older implementation when the result set is small
+snappy.conf.set("snappydata.sql.useOptimizedHashAggregateForSingleKey", "false")
+// Direct collect for GROUP BY at the driver avoids an EXCHANGE when the result set is small
+snappy.conf.set("snappydata.sql.useDriverCollectForGroupBy", "true")

val timeTakenSnappy = benchmark("Snappy perf") {
snappy.sql("select sym, avg(id) from snappyTable group by sym").collect()
}

val timeTakenSnappy = benchmark("Snappy perf") {snappy.sql("select sym, avg(id) from snappyTable group by sym").collect()}
System.exit(0)
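The script prints each timing separately and then exits; if one wanted the relative speedup as well, a minimal follow-up sketch (hypothetical, not part of this commit) could be placed before the `System.exit(0)` call, where both `timeTakenSpark` and `timeTakenSnappy` are still in scope:

    // Hypothetical addition: report the relative speedup of the two runs.
    // Both values are average milliseconds per run, as returned by benchmark().
    val speedup = timeTakenSpark / timeTakenSnappy
    // scalastyle:off println
    println(f"SnappyData was $speedup%.2fx faster than Spark's cached table for this query")
    // scalastyle:on println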
2 changes: 1 addition & 1 deletion snappy-connectors

