Commit

First release before packaging for pip
javadba committed Sep 14, 2019
1 parent 2f7c05d commit d5c4a69
Showing 3 changed files with 100 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,6 +3,8 @@ __pycache__/
*.py[cod]
*$py.class

*.sw*

# C extensions
*.so

2 changes: 2 additions & 0 deletions pyspark_utils/__init__.py
@@ -0,0 +1,2 @@
from pyspark_utils.pyspark_utils import p, setupSpark, getSpark, describeWithNulls, toPandas, sparkSql, fromCsvSpark, cols
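
With this re-export in place, a consumer can pull the helpers straight from the package root. A minimal sketch (assuming the package is importable and a local Spark installation is available; the sample DataFrame is only for illustration):

import pandas as pd
from pyspark_utils import getSpark, describeWithNulls

spark = getSpark()                       # lazily creates a local SparkSession
describeWithNulls(pd.DataFrame({'a': [1, 2, None], 'b': ['x', 'y', 'z']}))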

96 changes: 96 additions & 0 deletions pyspark_utils/pyspark_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import os

import pandas as pd
from pyspark.sql import SparkSession

def p(msg, o=None):
    """Print a message, optionally followed by the repr of an object."""
    omsg = ": %s" % repr(o) if o is not None else ""
    print("%s%s\n" % (msg, omsg))

def setupSpark():
    """Configure pandas display options and create a local SparkSession."""
    os.environ["PYSPARK_SUBMIT_ARGS"] = "pyspark-shell"
    pd.set_option('display.max_rows', 20)
    pd.set_option('display.max_colwidth', None)   # None = no truncation (negative values are deprecated in newer pandas)
    pd.set_option('display.max_columns', None)
    pd.set_option('expand_frame_repr', False)
    spark = SparkSession.builder.appName("pyspark_utils").master("local").getOrCreate()
    return spark

# Get the SparkSession, creating it on first use
#
def getSpark():
    global spark
    if 'spark' not in globals():
        spark = setupSpark()
    return spark

def describeWithNulls(df, doPrint=True):
    """Like pandas describe(), plus a 'Nulls' column counting missing values per column."""
    # Example input: pd.DataFrame({'a': [1,2,3], 'b': ['a','b','c'], 'c': [99.5,11.2,433.1], 'd': [123,'abc',None]})
    desc = df.describe()  # returns a DataFrame with the stats in the row index
    combo = pd.concat([df.isna().sum(), desc.T], axis=1)
    combo.columns = ['Nulls'] + list(desc.index)
    if doPrint:
        p(combo.head(100))
    return combo

# Read a Parquet file into a Spark DataFrame and also create a pandas DataFrame
#
def toPandas(path, tname, sql=None, count=False):
    df = getSpark().read.parquet(path)
    if count: p(tname + ": count=" + str(df.count()))
    df.createOrReplaceTempView(tname)
    if sql is not None:
        df = getSpark().sql(sql)
        df.createOrReplaceTempView(tname)
    pdf = df.toPandas()
    describeWithNulls(pdf)
    return df, pdf

# Run a Spark SQL query and return both the Spark and pandas DataFrames
#
def sparkSql(sql, tname=None, count=True, describe=False):
    sdf = getSpark().sql(sql)
    if count and tname: p(tname + ": count=" + str(sdf.count()))
    if tname: sdf.createOrReplaceTempView(tname)
    pdf = sdf.toPandas()
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # print the full result without truncation
        print(pdf)
    if describe:
        describeWithNulls(pdf)
    return sdf, pdf

# Read a CSV file and create Spark and pandas DataFrames
#
def fromCsvSpark(path, tname, header=True, count=True, sql=None):
    p('Reading from %s..' % path)
    df = getSpark().read.csv(path, header=header)
    if count: p(tname + ": count=" + str(df.count()))
    df.createOrReplaceTempView(tname)
    if sql is not None:
        df = getSpark().sql(sql)
        df.createOrReplaceTempView(tname)
    pdf = df.toPandas()
    describeWithNulls(pdf)
    return df, pdf

# Return selected columns of pandas DF
#
def cols(df, cnames):
    cnames = cnames if isinstance(cnames, list) else [cnames]
    return df.loc[:, cnames]

# Tokenization Spark UDF: split on a comma delimiter and strip each token
#
def tokenize(txt):
    if txt is None:
        return None
    # Always return a list so the result matches an ArrayType UDF return type
    return [tok.strip() for tok in txt.split(',')]
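# Illustrative examples (added for clarity, assuming the list-normalized return above):
#   tokenize('a, b,c')  ->  ['a', 'b', 'c']
#   tokenize('abc')     ->  ['abc']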

if __name__ == '__main__':
    # Sample snippet showing how to register tokenize as a Spark UDF
    sampleCode = """
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf
tokenize_udf = udf(tokenize, ArrayType(StringType()))
spark.udf.register("tokenize", tokenize_udf)
"""
    print(sampleCode)
