Decouple PySpark core API to pyspark.core package
HyukjinKwon committed Apr 3, 2024
commit 3817a45767493d594d6b2bb942bcd299364dfc14
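In short: the JVM-backed core modules (rdd, context, conf, broadcast, files, status) move from the top level of the pyspark package into a new pyspark.core package; internal imports and doctests are updated, and the bundled examples switch to the stable top-level re-exports. A rough sketch of what the move means for user code, assuming a full install where pyspark.core is present (both spellings resolve to the same class via the re-exports in python/pyspark/__init__.py below):

    # Preferred public spelling, still re-exported from the top-level package:
    from pyspark import RDD
    # New internal location after this commit (the old pyspark.rdd path is gone):
    from pyspark.core.rdd import RDD as CoreRDD
    assert RDD is CoreRDD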
12 changes: 6 additions & 6 deletions dev/sparktestsupport/modules.py
@@ -430,17 +430,17 @@ def __hash__(self):
     source_file_regexes=["python/(?!pyspark/(ml|mllib|sql|streaming))"],
     python_test_goals=[
         # doctests
-        "pyspark.rdd",
-        "pyspark.context",
-        "pyspark.conf",
-        "pyspark.broadcast",
+        "pyspark.core.rdd",
+        "pyspark.core.context",
+        "pyspark.core.conf",
+        "pyspark.core.broadcast",
         "pyspark.accumulators",
-        "pyspark.files",
+        "pyspark.core.files",
         "pyspark.serializers",
         "pyspark.profiler",
         "pyspark.shuffle",
         "pyspark.taskcontext",
-        "pyspark.util",
+        "pyspark.core.util",
         # unittests
         "pyspark.tests.test_appsubmit",
         "pyspark.tests.test_broadcast",
2 changes: 1 addition & 1 deletion docs/rdd-programming-guide.md
@@ -1321,7 +1321,7 @@ method. The code below shows this:

 {% highlight python %}
 >>> broadcastVar = sc.broadcast([1, 2, 3])
-<pyspark.broadcast.Broadcast object at 0x102789f10>
+<pyspark.core.broadcast.Broadcast object at 0x102789f10>
 
 >>> broadcastVar.value
 [1, 2, 3]
2 changes: 1 addition & 1 deletion examples/src/main/python/avro_inputformat.py
@@ -47,7 +47,7 @@
 from typing import Any, Tuple
 
 from functools import reduce
-from pyspark.rdd import RDD
+from pyspark import RDD
 from pyspark.sql import SparkSession
 
 if __name__ == "__main__":
2 changes: 1 addition & 1 deletion examples/src/main/python/parquet_inputformat.py
@@ -32,7 +32,7 @@
 import sys
 from typing import Any, Tuple
 
-from pyspark.rdd import RDD
+from pyspark import RDD
 from pyspark.sql import SparkSession
 
 if __name__ == "__main__":
2 changes: 1 addition & 1 deletion examples/src/main/python/sort.py
@@ -18,7 +18,7 @@
 import sys
 from typing import Tuple
 
-from pyspark.rdd import RDD
+from pyspark import RDD
 from pyspark.sql import SparkSession


@@ -34,7 +34,7 @@
 from typing import Tuple
 
 from pyspark import SparkContext
-from pyspark.rdd import RDD
+from pyspark import RDD
 from pyspark.streaming import DStream, StreamingContext


@@ -40,10 +40,7 @@
 import sys
 from typing import List, Tuple
 
-from pyspark import SparkContext
-from pyspark.accumulators import Accumulator
-from pyspark.broadcast import Broadcast
-from pyspark.rdd import RDD
+from pyspark import SparkContext, Accumulator, Broadcast, RDD
 from pyspark.streaming import StreamingContext


3 changes: 1 addition & 2 deletions examples/src/main/python/streaming/sql_network_wordcount.py
@@ -30,8 +30,7 @@
 import sys
 import datetime
 
-from pyspark import SparkConf, SparkContext
-from pyspark.rdd import RDD
+from pyspark import SparkConf, SparkContext, RDD
 from pyspark.streaming import StreamingContext
 from pyspark.sql import Row, SparkSession

18 changes: 12 additions & 6 deletions python/pyspark/__init__.py
@@ -49,14 +49,17 @@
 from functools import wraps
 from typing import cast, Any, Callable, TypeVar, Union
 
-from pyspark.conf import SparkConf
-from pyspark.rdd import RDD, RDDBarrier
-from pyspark.files import SparkFiles
-from pyspark.status import StatusTracker, SparkJobInfo, SparkStageInfo
+try:
+    from pyspark.core.conf import SparkConf
+    from pyspark.core.rdd import RDD, RDDBarrier
+    from pyspark.core.files import SparkFiles
+    from pyspark.core.status import StatusTracker, SparkJobInfo, SparkStageInfo
+    from pyspark.core.broadcast import Broadcast
+except ImportError:
+    pass
 from pyspark.util import InheritableThread, inheritable_thread_target
 from pyspark.storagelevel import StorageLevel
 from pyspark.accumulators import Accumulator, AccumulatorParam
-from pyspark.broadcast import Broadcast
 from pyspark.serializers import MarshalSerializer, CPickleSerializer
 from pyspark.taskcontext import TaskContext, BarrierTaskContext, BarrierTaskInfo
 from pyspark.profiler import Profiler, BasicProfiler
@@ -106,7 +109,10 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:


 # To avoid circular dependencies
-from pyspark.context import SparkContext
+try:
+    from pyspark.core.context import SparkContext
+except ImportError:
+    pass
 
 # for back compatibility
 from pyspark.sql import SQLContext, HiveContext, Row  # noqa: F401
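The try/except ImportError guards above presumably let python/pyspark/__init__.py import cleanly even when the JVM-backed pyspark.core package is absent from a distribution, while keeping the familiar top-level names on a full install. A minimal sketch of the same probing pattern for downstream code (illustrative only; the HAS_CORE flag is not part of PySpark):

    # Detect whether the JVM-backed core modules are available in this install.
    try:
        from pyspark.core.context import SparkContext  # new location after this commit
        HAS_CORE = True
    except ImportError:  # e.g. a slimmed distribution without pyspark.core
        HAS_CORE = False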
2 changes: 1 addition & 1 deletion python/pyspark/accumulators.py
@@ -333,7 +333,7 @@ def _start_update_server(auth_token: str) -> AccumulatorServer:
 if __name__ == "__main__":
     import doctest
 
-    from pyspark.context import SparkContext
+    from pyspark.core.context import SparkContext
 
     globs = globals().copy()
     # The small batch size here ensures that we see multiple batches,
16 changes: 16 additions & 0 deletions python/pyspark/core/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
11 changes: 5 additions & 6 deletions python/pyspark/broadcast.py → python/pyspark/core/broadcast.py
@@ -37,9 +37,8 @@
     Union,
 )
 
-from pyspark.java_gateway import local_connect_and_auth
 from pyspark.serializers import ChunkedStream, pickle_protocol
-from pyspark.util import print_exec
+from pyspark.util import print_exec, local_connect_and_auth
 from pyspark.errors import PySparkRuntimeError
 
 if TYPE_CHECKING:
@@ -56,7 +55,7 @@


 def _from_id(bid: int) -> "Broadcast[Any]":
-    from pyspark.broadcast import _broadcastRegistry
+    from pyspark.core.broadcast import _broadcastRegistry
 
     if bid not in _broadcastRegistry:
         raise PySparkRuntimeError(
@@ -367,13 +366,13 @@ def clear(self) -> None:
 def _test() -> None:
     import doctest
     from pyspark.sql import SparkSession
-    import pyspark.broadcast
+    import pyspark.core.broadcast
 
-    globs = pyspark.broadcast.__dict__.copy()
+    globs = pyspark.core.broadcast.__dict__.copy()
     spark = SparkSession.builder.master("local[4]").appName("broadcast tests").getOrCreate()
     globs["spark"] = spark
 
-    (failure_count, test_count) = doctest.testmod(pyspark.broadcast, globs=globs)
+    (failure_count, test_count) = doctest.testmod(pyspark.core.broadcast, globs=globs)
     spark.stop()
     if failure_count:
         sys.exit(-1)
14 changes: 7 additions & 7 deletions python/pyspark/conf.py → python/pyspark/core/conf.py
@@ -60,11 +60,11 @@ class SparkConf:

     Examples
     --------
-    >>> from pyspark.conf import SparkConf
-    >>> from pyspark.context import SparkContext
+    >>> from pyspark.core.conf import SparkConf
+    >>> from pyspark.core.context import SparkContext
     >>> conf = SparkConf()
     >>> conf.setMaster("local").setAppName("My app")
-    <pyspark.conf.SparkConf object at ...>
+    <pyspark.core.conf.SparkConf object at ...>
     >>> conf.get("spark.master")
     'local'
     >>> conf.get("spark.app.name")
@@ -79,13 +79,13 @@

     >>> conf = SparkConf(loadDefaults=False)
     >>> conf.setSparkHome("/path")
-    <pyspark.conf.SparkConf object at ...>
+    <pyspark.core.conf.SparkConf object at ...>
     >>> conf.get("spark.home")
     '/path'
     >>> conf.setExecutorEnv("VAR1", "value1")
-    <pyspark.conf.SparkConf object at ...>
+    <pyspark.core.conf.SparkConf object at ...>
    >>> conf.setExecutorEnv(pairs = [("VAR3", "value3"), ("VAR4", "value4")])
-    <pyspark.conf.SparkConf object at ...>
+    <pyspark.core.conf.SparkConf object at ...>
     >>> conf.get("spark.executorEnv.VAR1")
     'value1'
     >>> print(conf.toDebugString())
@@ -124,7 +124,7 @@ def __init__(
         if _jconf:
             self._jconf = _jconf
         else:
-            from pyspark.context import SparkContext
+            from pyspark.core.context import SparkContext
 
             _jvm = _jvm or SparkContext._jvm

15 changes: 8 additions & 7 deletions python/pyspark/context.py → python/pyspark/core/context.py
@@ -48,10 +48,10 @@

 from pyspark import accumulators
 from pyspark.accumulators import Accumulator
-from pyspark.broadcast import Broadcast, BroadcastPickleRegistry
-from pyspark.conf import SparkConf
-from pyspark.files import SparkFiles
-from pyspark.java_gateway import launch_gateway, local_connect_and_auth
+from pyspark.core.broadcast import Broadcast, BroadcastPickleRegistry
+from pyspark.core.conf import SparkConf
+from pyspark.core.files import SparkFiles
+from pyspark.java_gateway import launch_gateway
 from pyspark.serializers import (
     CPickleSerializer,
     BatchedSerializer,
@@ -64,10 +64,11 @@
 )
 from pyspark.storagelevel import StorageLevel
 from pyspark.resource.information import ResourceInformation
-from pyspark.rdd import RDD, _load_from_socket
+from pyspark.core.rdd import RDD
+from pyspark.util import _load_from_socket, local_connect_and_auth
 from pyspark.taskcontext import TaskContext
 from pyspark.traceback_utils import CallSite, first_spark_call
-from pyspark.status import StatusTracker
+from pyspark.core.status import StatusTracker
 from pyspark.profiler import ProfilerCollector, BasicProfiler, UDFBasicProfiler, MemoryProfiler
 from pyspark.errors import PySparkRuntimeError
 from py4j.java_gateway import is_instance_of, JavaGateway, JavaObject, JVMView
@@ -144,7 +145,7 @@ class SparkContext:

     Examples
     --------
-    >>> from pyspark.context import SparkContext
+    >>> from pyspark.core.context import SparkContext
     >>> sc = SparkContext('local', 'test')
     >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
     Traceback (most recent call last):
2 changes: 1 addition & 1 deletion python/pyspark/files.py → python/pyspark/core/files.py
@@ -135,7 +135,7 @@ def getRootDirectory(cls) -> str:

         Examples
         --------
-        >>> from pyspark.files import SparkFiles
+        >>> from pyspark.core.files import SparkFiles
         >>> SparkFiles.getRootDirectory() # doctest: +SKIP
         '.../spark-a904728e-08d3-400c-a872-cfd82fd6dcd2/userFiles-648cf6d6-bb2c-4f53-82bd-e658aba0c5de'
         """