Changes from 1 commit
85 commits
45c7bc5
Adding the proto and the dependencies
grundprinzip Jun 16, 2022
c2c6de0
Trying to fix aarch64
grundprinzip Jul 5, 2022
034a44d
Moving connect to its own module
grundprinzip Jul 7, 2022
0cedf00
[SparkConnect] Spark Connect Planner
grundprinzip Jul 4, 2022
5d4b8dd
adding the planner
grundprinzip Jul 25, 2022
506fc44
adding the python code
grundprinzip Jul 26, 2022
0f6a687
fixing tests
grundprinzip Jul 26, 2022
9adf4bd
Adding some very basic unit tests
grundprinzip Jul 29, 2022
ed2fe24
adding explain to the service
grundprinzip Jul 29, 2022
786efbb
adding apache header and small col function
grundprinzip Jul 31, 2022
8693efb
more apache headers
grundprinzip Jul 31, 2022
d336b2a
Adding some more tests
grundprinzip Jul 31, 2022
dbe3d1f
adding some more doc
grundprinzip Jul 31, 2022
3ea55ca
some basic tests for literals
grundprinzip Jul 31, 2022
127a492
Fixing the pom files
grundprinzip Aug 29, 2022
4c63000
Moving to grpc-java
grundprinzip Aug 29, 2022
b713969
Fixing some python tests
grundprinzip Aug 29, 2022
0547e07
Fixing more python tests
grundprinzip Aug 29, 2022
074e1f7
making connect build by default
grundprinzip Aug 29, 2022
e77a018
fixing sbt build
grundprinzip Aug 29, 2022
21224ec
More sbt stuff
grundprinzip Aug 30, 2022
e22975f
More sbt stuff
grundprinzip Aug 30, 2022
9601449
More sbt stuff
grundprinzip Aug 31, 2022
b3ab663
properly shaded spark connect build
grundprinzip Sep 2, 2022
8968f1e
SBT build and testing works
grundprinzip Sep 4, 2022
87a50d4
Python linting
grundprinzip Sep 4, 2022
fefc84a
Restricting enaling Spark Connect to sepcific tests only and fixing t…
grundprinzip Sep 4, 2022
193e6b0
scala 2.13 fix
grundprinzip Sep 4, 2022
e1f862e
Fixin python lint issues
grundprinzip Sep 4, 2022
d5b6002
Fixin python lint issues
grundprinzip Sep 4, 2022
8bb03d2
Marking all classes in Spark Connect as experimental
grundprinzip Sep 4, 2022
265aca3
Fixing style
grundprinzip Sep 4, 2022
d6f64e8
test infra
grundprinzip Sep 5, 2022
3d00bb0
Missing file for disabling mypy checks
grundprinzip Sep 5, 2022
3525eab
Trying to add a python package the right way
grundprinzip Sep 5, 2022
7e1dd58
Removing grpc/protobuf from pypy
grundprinzip Sep 5, 2022
fa9e85e
Disabling pypy for spark connect
grundprinzip Sep 5, 2022
cf6b19a
Adding licence files
grundprinzip Sep 5, 2022
16bf911
Fixing a bug in UDF handling for Python version selection
grundprinzip Sep 5, 2022
762104a
Trailing newlines
grundprinzip Sep 5, 2022
ca4cfbb
Adding dependency manaifest
grundprinzip Sep 5, 2022
79bbfd6
Moving to classs
grundprinzip Sep 6, 2022
ec1221b
Merge branch 'master' into spark-connect-grpc-shaded
grundprinzip Sep 6, 2022
63d8ebc
dependencies
grundprinzip Sep 6, 2022
70ad818
M2 cache bomb
grundprinzip Sep 6, 2022
ae75ca4
More m2 removal
grundprinzip Sep 6, 2022
7e615f7
Revert "More m2 removal"
HyukjinKwon Sep 7, 2022
8effefb
Revert "M2 cache bomb"
HyukjinKwon Sep 7, 2022
aa68119
Add Python 3.7
HyukjinKwon Sep 7, 2022
fcefe9e
Revert "Add Python 3.7"
HyukjinKwon Sep 7, 2022
ff8e4ad
Merge remote-tracking branch 'upstream/master' into HEAD
HyukjinKwon Sep 7, 2022
b2f6548
Python 3.7
HyukjinKwon Sep 7, 2022
feb0233
Revert "Python 3.7"
HyukjinKwon Sep 7, 2022
4b02eb3
Tweaking memory consumption of SBT
grundprinzip Sep 7, 2022
aec7e3d
Disabling paralell execution for SBT in doc build
grundprinzip Sep 7, 2022
e4485c3
Avoiding copying the shaded jar in doc build
grundprinzip Sep 8, 2022
4ef5a35
Sbt doc build still
grundprinzip Sep 8, 2022
e13b955
test package build
grundprinzip Sep 8, 2022
6c96205
only test package build
grundprinzip Sep 9, 2022
d1e3c13
using exact pyspark build options
grundprinzip Sep 9, 2022
9d43765
Moving things around
grundprinzip Sep 9, 2022
448fcc8
Desperate attempts
grundprinzip Sep 9, 2022
b3d2d8c
Java 11 anyone?
grundprinzip Sep 9, 2022
b2c6bfd
Revert "Java 11 anyone?"
grundprinzip Sep 13, 2022
11ad9b6
disable tests on assembly run
grundprinzip Sep 13, 2022
7617b82
Adding additional tests
grundprinzip Sep 14, 2022
ce4900b
format + slight python api change
grundprinzip Sep 14, 2022
28e7741
update on readme and import
grundprinzip Sep 14, 2022
8223615
Properly catching exceptions and removing stray debug user
grundprinzip Sep 18, 2022
8bf14f2
scalastyle
grundprinzip Sep 18, 2022
87ba6f2
doc fix
grundprinzip Sep 18, 2022
45f23e0
addrsessing review comments
grundprinzip Sep 20, 2022
4971980
removing embedded protos for 3p google
grundprinzip Sep 20, 2022
4aafab8
renaming
grundprinzip Sep 20, 2022
38b69ce
Adding @Since annotation
grundprinzip Sep 20, 2022
07b0ec8
Fixing python test with the right package
grundprinzip Sep 20, 2022
f47b8e9
fixing build error due to package refactoring
grundprinzip Sep 20, 2022
b57cbd2
Sql -> SQL
grundprinzip Sep 21, 2022
77470cd
Scala review comments
grundprinzip Sep 22, 2022
ee13ae2
Build file review
grundprinzip Sep 22, 2022
51be506
Python review comments
grundprinzip Sep 22, 2022
549a10e
Python review comments
grundprinzip Sep 22, 2022
b0608f3
black fmt
grundprinzip Sep 22, 2022
279faf4
Addressing review comments
grundprinzip Sep 23, 2022
e5e2347
Addressing review comments
grundprinzip Sep 23, 2022
Fixin python lint issues
grundprinzip committed Sep 4, 2022
commit e1f862eeba1468a36e3b88585e6c4507f55d0c8a
1 change: 1 addition & 0 deletions dev/lint-python
@@ -129,6 +129,7 @@ function mypy_examples_test {
MYPY_REPORT=$( (MYPYPATH=python $MYPY_BUILD \
--config-file python/mypy.ini \
--exclude "mllib/*" \
--exclude "sql/connect/proto/*" \
examples/src/main/python/) 2>&1)

MYPY_STATUS=$?
2 changes: 2 additions & 0 deletions dev/tox.ini
@@ -51,4 +51,6 @@ exclude =
python/pyspark/worker.pyi,
python/pyspark/java_gateway.pyi,
dev/ansible-for-test-node/*,
python/pyspark/sql/connect/proto/*,
python/venv/*,
max-line-length = 100
2 changes: 1 addition & 1 deletion python/pyspark/sql/connect/__init__.py
@@ -15,4 +15,4 @@
# limitations under the License.
#

from pyspark.sql.connect.data_frame import DataFrame
from pyspark.sql.connect.data_frame import DataFrame # noqa: F401
2 changes: 1 addition & 1 deletion python/pyspark/sql/connect/client.py
@@ -124,7 +124,7 @@ def register_udf(self, function, return_type) -> str:
req.user_context.user_id = self._user_id
req.plan.command.create_function.CopyFrom(fun)

result = self._execute_and_fetch(req)
self._execute_and_fetch(req)
return name

def _build_metrics(self, metrics: "pb2.Response.Metrics") -> typing.List[PlanMetrics]:
5 changes: 3 additions & 2 deletions python/pyspark/sql/connect/column.py
@@ -55,7 +55,8 @@ def __init__(self, value: PrimitiveType) -> None: # type: ignore[name-defined]
def to_plan(self, session: "RemoteSparkSession") -> proto.Expression:
"""Converts the literal expression to the literal in proto.

TODO This method always assumes the largest type and can thus create weird interpretations of the literal."""
TODO This method always assumes the largest type and can thus
create weird interpretations of the literal."""
value_type = type(self._value)
exp = proto.Expression()
if value_type is int:
@@ -97,7 +98,7 @@ def from_qualified_name(cls, name) -> "ColumnRef":

def __init__(self, *parts: str) -> None: # type: ignore[name-defined]
super().__init__()
self._parts: List[str] = list(filter(lambda x: not x is None, list(parts)))
self._parts: List[str] = list(filter(lambda x: x is not None, list(parts)))

def name(self) -> str:
"""Returns the qualified name of the column reference."""
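A quick standalone sketch (not part of the commit): the E714 fix in column.py swaps "not x is None" for the idiomatic "x is not None"; both filters behave identically. The values below are illustrative only.

parts = ("catalog", None, "schema", None, "table")
old_style = list(filter(lambda x: not x is None, parts))  # noqa: E714
new_style = list(filter(lambda x: x is not None, parts))
assert old_style == new_style == ["catalog", "schema", "table"]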
4 changes: 2 additions & 2 deletions python/pyspark/sql/connect/data_frame.py
@@ -133,7 +133,7 @@ def columns(self) -> List[str]:
"""Returns the list of columns of the current data frame."""
if self._plan is None:
return []
if not "columns" in self._cache and self._plan is not None:
if "columns" not in self._cache and self._plan is not None:
pdd = self.limit(0).collect()
# Translate to standard pytho array
self._cache["columns"] = pdd.columns.values
@@ -210,7 +210,7 @@ def where(self, condition):

def _get_alias(self):
p = self._plan
while not p is None:
while p is not None:
if isinstance(p, plan.Project) and p.alias:
return p.alias
p = p._child
1 change: 0 additions & 1 deletion python/pyspark/sql/connect/functions.py
@@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import TYPE_CHECKING
from pyspark.sql.connect.column import ColumnRef, LiteralExpression
from pyspark.sql.connect.column import PrimitiveType

21 changes: 13 additions & 8 deletions python/pyspark/sql/connect/plan.py
@@ -15,11 +15,7 @@
# limitations under the License.
#

import base64
from calendar import c
from typing import (
AnyStr,
Dict,
List,
Optional,
Sequence,
@@ -165,7 +161,7 @@ def plan(self, session: "RemoteSparkSession") -> proto.Relation:
for c in self._raw_columns
] # [self.unresolved_attr(*x) for x in self.columns]
common = proto.RelationCommon()
if not self.alias is None:
if self.alias is not None:
common.alias = self.alias

plan = proto.Relation()
@@ -348,7 +344,10 @@ def plan(self, session: "RemoteSparkSession") -> proto.Relation:

def print(self, indent=0) -> str:
c_buf = self._child.print(indent + LogicalPlan.INDENT) if self._child else ""
return f"{self._i(indent)}<Sort columns={self.grouping_cols} measures={self.measures}>\n{c_buf}"
return (
f"{self._i(indent)}<Sort columns={self.grouping_cols}"
f"measures={self.measures}>\n{c_buf}"
)

def _repr_html_(self):
return f"""
@@ -388,7 +387,10 @@ def print(self, indent=0) -> str:
i = self._i(indent)
o = self._i(indent + LogicalPlan.INDENT)
n = indent + LogicalPlan.INDENT * 2
return f"""{i}<Join on={self.on} how={self.how}>\n{o}left=\n{self.left.print(n)}\n{o}right=\n{self.right.print(n)}"""
return (
f"{i}<Join on={self.on} how={self.how}>\n{o}"
f"left=\n{self.left.print(n)}\n{o}right=\n{self.right.print(n)}"
)

def _repr_html_(self):
return f"""
@@ -420,7 +422,10 @@ def print(self, indent=0) -> str:
i = self._i(indent)
o = self._i(indent + LogicalPlan.INDENT)
n = indent + LogicalPlan.INDENT * 2
return f"""{i}UnionAll\n{o}child1=\n{self._child.print(n)}\n{o}child2=\n{self.other.print(n)}"""
return (
f"{i}UnionAll\n{o}child1=\n{self._child.print(n)}"
f"\n{o}child2=\n{self.other.print(n)}"
)

def _repr_html_(self) -> str:
assert self._child is not None
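The plan.py reformatting above relies on implicit concatenation of adjacent f-string literals to stay under the 100-character limit; a minimal standalone sketch with illustrative values:

on, how = "id", "inner"
single_line = f"<Join on={on} how={how}>"
split_lines = (
    f"<Join on={on} "
    f"how={how}>"
)
# Adjacent literals are joined at compile time, so the output is unchanged.
assert single_line == split_lines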
9 changes: 7 additions & 2 deletions python/pyspark/sql/tests/connect/test_plan_only.py
@@ -28,7 +28,9 @@ class SparkConnectTestsPlanOnly(PlanOnlyTestFixture):
generation but do not call Spark."""

def test_simple_project(self):
read_table = lambda x: DataFrame.withPlan(Read(x), self.connect)
def read_table(x):
return DataFrame.withPlan(Read(x), self.connect)

self.connect.set_hook("readTable", read_table)

plan = self.connect.readTable(self.tbl_name)._plan.collect(self.connect)
@@ -46,9 +48,12 @@ def udf_mock(*args, **kwargs):
expr = u("ThisCol", "ThatCol", "OtherCol")
self.assertTrue(isinstance(expr, UserDefinedFunction))
u_plan = expr.to_plan(self.connect)
assert u_plan is not None

def test_all_the_plans(self):
read_table = lambda x: DataFrame.withPlan(Read(x), self.connect)
def read_table(x):
return DataFrame.withPlan(Read(x), self.connect)

self.connect.set_hook("readTable", read_table)

df = self.connect.readTable(self.tbl_name)
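The test changes above replace lambdas bound to names with local def functions (flake8 E731) without changing behavior; a self-contained sketch with placeholder logic:

def make_reader(wrap):
    # lambda form flagged by E731:
    #   read_table = lambda x: wrap(x)
    # equivalent def form:
    def read_table(x):
        return wrap(x)
    return read_table

reader = make_reader(str.upper)
assert reader("people") == "PEOPLE"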
20 changes: 9 additions & 11 deletions python/pyspark/sql/tests/connect/test_spark_connect.py
@@ -14,26 +14,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Any
import uuid
import unittest
import tempfile
import os
import shutil

from pyspark.sql import SparkSession, Row
from pyspark.sql.connect.client import RemoteSparkSession
from pyspark.sql.connect.function_builder import udf, UserDefinedFunction
from pyspark.sql.connect.function_builder import udf
from pyspark.testing.utils import ReusedPySparkTestCase

import py4j


class SparkConnectSQLTestCase(ReusedPySparkTestCase):
"""Parent test fixture class for all Spark Connect related
test cases."""

@classmethod
def setUpClass(cls):
def setUpClass(cls: Any) -> None:
ReusedPySparkTestCase.setUpClass()
cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
cls.hive_available = True
@@ -46,7 +43,7 @@ def setUpClass(cls):
cls.spark_connect_test_data()

@classmethod
def spark_connect_test_data(cls):
def spark_connect_test_data(cls: Any) -> None:
# Setup Remote Spark Session
cls.tbl_name = f"tbl{uuid.uuid4()}".replace("-", "")
cls.connect = RemoteSparkSession(port=15002)
@@ -57,22 +54,23 @@ def spark_connect_test_data(cls):


class SparkConnectTests(SparkConnectSQLTestCase):
def test_simple_read(self):
def test_simple_read(self) -> None:
"""Tests that we can access the Spark Connect GRPC service locally."""
df = self.connect.readTable(self.tbl_name)
data = df.limit(10).collect()
# Check that the limit is applied
assert len(data.index) == 10

def test_simple_udf(self):
def conv_udf(x):
def test_simple_udf(self) -> None:
def conv_udf(x) -> str:
return "Martin"

u = udf(conv_udf)
df = self.connect.readTable(self.tbl_name)
result = df.select(u(df.id)).collect()
assert result is not None

def test_simple_explain_string(self):
def test_simple_explain_string(self) -> None:
df = self.connect.readTable(self.tbl_name).limit(10)
result = df.explain()
assert len(result) > 0
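A condensed usage sketch based on the tests above; it assumes a Spark Connect server listening on port 15002 (as in the fixture) and an existing table, so it is not runnable on its own:

from pyspark.sql.connect.client import RemoteSparkSession

connect = RemoteSparkSession(port=15002)
df = connect.readTable("tbl_example")  # table name is a placeholder
data = df.limit(10).collect()          # the tests treat the result as pandas-like
print(len(data.index))                 # 10 once the limit is applied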
5 changes: 4 additions & 1 deletion python/pyspark/sql/tests/connect/utils/__init__.py
@@ -14,4 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.sql.tests.connect.utils.spark_connect_test_utils import PlanOnlyTestFixture

from pyspark.sql.tests.connect.utils.spark_connect_test_utils import ( # noqa: F401
PlanOnlyTestFixture, # noqa: F401
) # noqa: F401
python/pyspark/sql/tests/connect/utils/spark_connect_test_utils.py
@@ -14,26 +14,27 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Any, Dict
import functools
import unittest
import uuid


class MockRemoteSession:
def __init__(self):
self.hooks = {}
def __init__(self) -> None:
self.hooks: Dict[str, Any] = {}

def set_hook(self, name, hook):
def set_hook(self, name: str, hook: Any) -> None:
self.hooks[name] = hook

def __getattr__(self, item):
if not item in self.hooks:
def __getattr__(self, item: str) -> Any:
if item not in self.hooks:
raise LookupError(f"{item} is not defined as a method hook in MockRemoteSession")
return functools.partial(self.hooks[item])


class PlanOnlyTestFixture(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
def setUpClass(cls: Any) -> None:
cls.connect = MockRemoteSession()
cls.tbl_name = f"tbl{uuid.uuid4()}".replace("-", "")
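MockRemoteSession above routes unknown attribute access through registered hooks; a self-contained sketch of that pattern (class and hook names here are illustrative, not the real fixture):

import functools
from typing import Any, Dict


class HookedSession:
    def __init__(self) -> None:
        self.hooks: Dict[str, Any] = {}

    def set_hook(self, name: str, hook: Any) -> None:
        self.hooks[name] = hook

    def __getattr__(self, item: str) -> Any:
        # __getattr__ only fires when normal lookup fails, so registered
        # hooks behave like ordinary methods on the session object.
        if item not in self.hooks:
            raise LookupError(f"{item} is not defined as a method hook")
        return functools.partial(self.hooks[item])


session = HookedSession()
session.set_hook("readTable", lambda name: f"plan for {name}")
assert session.readTable("people") == "plan for people"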
13 changes: 7 additions & 6 deletions python/run-tests.py
@@ -114,9 +114,10 @@ def run_individual_python_test(target_dir, test_name, pyspark_python, keep_test_
additional_config = []
if test_name.startswith("pyspark.sql.tests.connect"):
# Adding Spark Connect JAR and Config
additional_config += ["--conf",
"spark.plugins=org.apache.spark.sql.sparkconnect.service.SparkConnectPlugin"]

additional_config += [
"--conf",
"spark.plugins=org.apache.spark.sql.sparkconnect.service.SparkConnectPlugin"
]

# Also override the JVM's temp directory by setting driver and executor options.
java_options = "-Djava.io.tmpdir={0}".format(tmp_dir)
@@ -125,11 +126,11 @@
"--conf", "spark.driver.extraJavaOptions='{0}'".format(java_options),
"--conf", "spark.executor.extraJavaOptions='{0}'".format(java_options),
"--conf", "spark.sql.warehouse.dir='{0}'".format(metastore_dir),
] + additional_config + [
"pyspark-shell"
]
env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)
spark_args += additional_config
spark_args += ["pyspark-shell"]

env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)

output_prefix = get_valid_filename(pyspark_python + "__" + test_name + "__").lstrip("_")
# Delete is always set to False since the cleanup will be either done by removing the
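After this commit the runner builds the submit arguments in three steps: the base --conf options, then the Spark Connect plugin config, then the trailing pyspark-shell token. A simplified sketch of that assembly (directories and values below are placeholders, not the real config):

java_options = "-Djava.io.tmpdir=/tmp/spark-tests"
spark_args = [
    "--conf", "spark.driver.extraJavaOptions='{0}'".format(java_options),
    "--conf", "spark.executor.extraJavaOptions='{0}'".format(java_options),
    "--conf", "spark.sql.warehouse.dir='/tmp/spark-warehouse'",
]
additional_config = [
    "--conf",
    "spark.plugins=org.apache.spark.sql.sparkconnect.service.SparkConnectPlugin",
]
spark_args += additional_config
spark_args += ["pyspark-shell"]

env = {"PYSPARK_SUBMIT_ARGS": " ".join(spark_args)}
print(env["PYSPARK_SUBMIT_ARGS"])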