
Commit 7f6f8b2

Start working on adding previous run data (#114)
* Start working on adding previous run data
* Fix up load previous run a bit
* Drop large target size ex.
* line breaks on log ops make sense to me.
* in CI we don't have previous jobs
* Use modern spark-testing-base for Python and use session not SQLCtx.
1 parent 61c0b31 · commit 7f6f8b2

File tree

4 files changed: +53, -8 lines

4 files changed

+53
-8
lines changed
load_previous_run_data.py

Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+import os
+import tempfile
+
+
+class LoadPreviousRunData(object):
+    def __init__(self, session):
+        self.session = session
+
+    def find_oldest_id(self, local_path):
+        """Find the oldest Spark job since it's probably not being updated."""
+        directories = os.listdir(local_path)
+        return min(directories, key=lambda x: os.path.getmtime(f"{local_path}/{x}"))
+
+    def do_magic(self):
+        local_path = "/tmp/spark-events"
+        event_log_path = f"file://{local_path}"
+        application_id = self.find_oldest_id(local_path)
+        return self.load_json_records(event_log_path, application_id)
+
+    # tag::load[]
+    def load_json_records(self, event_log_path, application_id):
+        print(f"Loading {application_id}")
+        full_log_path = f"{event_log_path}/{application_id}"
+        df = self.session.read.json(full_log_path)
+        special_events = df.filter(
+            (df["Event"] == "SparkListenerExecutorAdded")
+            | (df["Event"] == "SparkListenerJobEnd")
+        )
+        special_events.show()
+
+    # end::load[]
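For anyone trying the loader end to end: Spark only writes event logs when event logging is enabled, and it expects the log directory to already exist. Below is a minimal driver sketch, assuming the new module is importable as load_previous_run_data and a local PySpark 3.5 install; spark.eventLog.enabled and spark.eventLog.dir are standard Spark settings, everything else here is illustrative.

import os

from pyspark.sql import SparkSession

from load_previous_run_data import LoadPreviousRunData  # assumed import path

local_path = "/tmp/spark-events"
os.makedirs(local_path, exist_ok=True)  # Spark won't create the log dir itself

session = (
    SparkSession.builder.master("local[2]")
    .appName("event-log-demo")  # illustrative app name
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", f"file://{local_path}")
    .getOrCreate()
)

# Run a trivial job so this application shows up in the event log.
session.range(10).count()

# Load the oldest logged application and show its executor-added/job-end events.
LoadPreviousRunData(session).do_magic()

Note that on a machine with no prior runs, the oldest entry may be the still-running application's own in-progress log, which read.json can still parse.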
Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+from pyspark.sql.session import SparkSession
+import os
+import tempfile
+
+from sparktestingbase.sqltestcase import SQLTestCase
+from .load_previous_run_data import LoadPreviousRunData
+
+
+class TestLoadPreviousRunData(SQLTestCase):
+    def test_do_magic(self):
+        lprd = LoadPreviousRunData(self.session)
+        try:
+            lprd.do_magic()
+        except FileNotFoundError:
+            print("No previous jobs")

python/requirements.txt

Lines changed: 1 addition & 2 deletions

@@ -1,11 +1,10 @@
 spark-testing-base
 pandas
 pyarrow
-pyspark<3.5
+pyspark==3.5.0
 pyspark-asyncactions
 pandera
 pandera[pyspark]
 spark-expectations>=1.0
 venv-pack
-delta-spark
 requests
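Since the examples now target an exact pin rather than a <3.5 range, a quick interpreter-side check (a sketch, not part of the commit) can catch a stale environment before running anything:

# Fail fast if the environment drifted from the pyspark==3.5.0 pin.
import pyspark

assert pyspark.__version__ == "3.5.0", f"unexpected PySpark {pyspark.__version__}"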

python/tox.ini

Lines changed: 6 additions & 6 deletions

@@ -4,7 +4,7 @@ isolated_build = True
 requires = tox-conda
 envlist =
     isort
-    py39
+    py310
     black
     mypy
     flake8
@@ -13,9 +13,9 @@ skip_missing_interpeters = true
 
 [gh-actions]
 python =
-    3.9: py39
+    # 3.9: py39
     # We need a new version of PySpark w/3.10 support.
-    # 3.10: py310
+    3.10: py310
 
 [testenv]
 setenv =
@@ -29,9 +29,9 @@ extras =
 deps =
     pytest
     isort==4.3.21
-    pyspark
+    pyspark==3.5.0
     flake8
-    spark-testing-base
+    spark-testing-base>=0.11.1
     -rrequirements.txt
 commands =
     pytest examples \
@@ -56,7 +56,7 @@ deps =
 [testenv:flake8]
 extras = tests
 skipsdist = True
-commands = flake8 --ignore=F403,E402,F401,F405 examples
+commands = flake8 --ignore=F403,E402,F401,F405,W503 examples
 allowlist_externals = flake8
 
 [testenv:mypy]
