
Commit 8700297

Author: vatsal mevada
[SNAP-3165] Instantiate SnappySession only when catalogImplementation is in-memory when running the pyspark shell (#191)

## What changes were proposed in this pull request?

We initialize a `SparkSession` as well as a `SnappySession` when starting the pyspark shell. Previously, `SparkSession` and `SparkContext` were always initialized with Hive support enabled, irrespective of the value of the `spark.sql.catalogImplementation` config. With these changes, we check the value of `spark.sql.catalogImplementation` and do not enable Hive support when this property is explicitly set to `in-memory`. A `SnappySession` is initialized only when the catalog implementation is `in-memory`, to avoid the failure reported in SNAP-3165. Later we can add support for the hive catalog implementation for Python with `SnappySession`.
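In effect, the shell bootstrap now follows the pattern below. This is a condensed sketch of the logic in the diff that follows, not the full shell.py; the `'hive'` default passed to `conf.get` mirrors Spark's fallback when the property is unset.

```python
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
# Spark falls back to 'hive' when spark.sql.catalogImplementation is unset
catalog = conf.get('spark.sql.catalogImplementation', 'hive').lower()

if catalog == 'hive':
    # previous behaviour: always build the session with Hive support
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()
else:
    # explicit in-memory catalog: plain SparkSession; shell.py then
    # layers a SnappySession on top of its SparkContext
    spark = SparkSession.builder.getOrCreate()
```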
1 parent 840a4b3 commit 8700297

File tree

1 file changed (+23 -9 lines)

python/pyspark/shell.py

Lines changed: 23 additions & 9 deletions
```diff
@@ -47,6 +47,8 @@
 import py4j
 
 import pyspark
+
+from pyspark import SparkConf
 from pyspark.context import SparkContext
 from pyspark.sql import SparkSession, SQLContext
 from pyspark.sql.snappy import SnappySession
@@ -57,25 +59,36 @@
 
 SparkContext._ensure_initialized()
 
+conf = SparkConf()
+catalogImplementation = conf.get('spark.sql.catalogImplementation', 'hive').lower()
 try:
-    # Try to access HiveConf, it will raise exception if Hive is not added
-    SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
-    spark = SparkSession.builder\
-        .enableHiveSupport()\
-        .getOrCreate()
+    if catalogImplementation == 'hive':
+        # Try to access HiveConf, it will raise exception if Hive is not added
+        SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
+        spark = SparkSession.builder\
+            .enableHiveSupport()\
+            .getOrCreate()
+    else:
+        spark = SparkSession.builder.getOrCreate()
 except py4j.protocol.Py4JError:
     spark = SparkSession.builder.getOrCreate()
 except TypeError:
     spark = SparkSession.builder.getOrCreate()
 
 
 sc = spark.sparkContext
-snappy = SnappySession(sc)
-sql = snappy.sql
+if catalogImplementation == 'in-memory':
+    snappy = SnappySession(sc)
+    sql = snappy.sql
+else:
+    sql = spark.sql
 atexit.register(lambda: sc.stop())
 
 # for compatibility
-sqlContext = snappy._wrapped
+if catalogImplementation == 'in-memory':
+    sqlContext = snappy._wrapped
+else:
+    sqlContext = spark._wrapped
 sqlCtx = sqlContext
 
 print("""Welcome to
@@ -90,7 +103,8 @@
     platform.python_build()[0],
     platform.python_build()[1]))
 print("SparkSession available as 'spark'.")
-print("SnappySession available as 'snappy'.")
+if catalogImplementation == 'in-memory':
+    print("SnappySession available as 'snappy'.")
 
 # The ./bin/pyspark script stores the old PYTHONSTARTUP value in OLD_PYTHONSTARTUP,
 # which allows us to execute the user's PYTHONSTARTUP file:
```
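For example, in a shell started with the property set explicitly (`--conf` is the standard Spark launch option; the query itself is purely illustrative), both bindings come from the SnappySession:

```python
# Launched as: bin/pyspark --conf spark.sql.catalogImplementation=in-memory
# shell.py then binds 'snappy' to a SnappySession and 'sql' to snappy.sql.
snappy.sql("SELECT 1 AS id").show()
sql("SELECT 1 AS id").show()   # same as snappy.sql
```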
