From 43019db973eac2e89f77bfdbd5d15fdea3a3050a Mon Sep 17 00:00:00 2001
From: Shuai Lin
Date: Thu, 1 Dec 2016 01:09:24 +0800
Subject: [PATCH 1/6] [SPARK-18652] Include the data in pyspark package.

---
 python/MANIFEST.in | 1 +
 python/setup.py    | 9 +++++++++
 2 files changed, 10 insertions(+)

diff --git a/python/MANIFEST.in b/python/MANIFEST.in
index bbcce1baa439..cf773712e5bc 100644
--- a/python/MANIFEST.in
+++ b/python/MANIFEST.in
@@ -17,6 +17,7 @@
 global-exclude *.py[cod] __pycache__ .DS_Store
 recursive-include deps/jars *.jar
 graft deps/bin
+recursive-include deps/data *
 recursive-include deps/examples *.py
 recursive-include lib *.zip
 include README.md
diff --git a/python/setup.py b/python/setup.py
index 625aea04073f..a13a024a98cd 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -69,9 +69,11 @@
 EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
 SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
+DATA_PATH = os.path.join(SPARK_HOME, "data")
 SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
 JARS_TARGET = os.path.join(TEMP_PATH, "jars")
 EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
+DATA_TARGET = os.path.join(TEMP_PATH, "data")

 # Check and see if we are under the spark path in which case we need to build the symlink farm.
@@ -114,11 +116,13 @@ def _supports_symlinks():
             os.symlink(JARS_PATH, JARS_TARGET)
             os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
             os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
+            os.symlink(DATA_PATH, DATA_TARGET)
         else:
             # For windows fall back to the slower copytree
             copytree(JARS_PATH, JARS_TARGET)
             copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
             copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
+            copytree(DATA_PATH, DATA_TARGET)
     else:
         # If we are not inside of SPARK_HOME verify we have the required symlink farm
         if not os.path.exists(JARS_TARGET):
@@ -161,18 +165,21 @@ def _supports_symlinks():
               'pyspark.jars',
               'pyspark.python.pyspark',
               'pyspark.python.lib',
+              'pyspark.data',
               'pyspark.examples.src.main.python'],
         include_package_data=True,
         package_dir={
             'pyspark.jars': 'deps/jars',
             'pyspark.bin': 'deps/bin',
             'pyspark.python.lib': 'lib',
+            'pyspark.data': 'deps/data',
             'pyspark.examples.src.main.python': 'deps/examples',
         },
         package_data={
             'pyspark.jars': ['*.jar'],
             'pyspark.bin': ['*'],
             'pyspark.python.lib': ['*.zip'],
+            'pyspark.data': ['*'],
             'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
         scripts=scripts,
         license='http://www.apache.org/licenses/LICENSE-2.0',
@@ -202,8 +209,10 @@ def _supports_symlinks():
         os.remove(os.path.join(TEMP_PATH, "jars"))
         os.remove(os.path.join(TEMP_PATH, "bin"))
         os.remove(os.path.join(TEMP_PATH, "examples"))
+        os.remove(os.path.join(TEMP_PATH, "data"))
     else:
         rmtree(os.path.join(TEMP_PATH, "jars"))
         rmtree(os.path.join(TEMP_PATH, "bin"))
         rmtree(os.path.join(TEMP_PATH, "examples"))
+        rmtree(os.path.join(TEMP_PATH, "data"))
     os.rmdir(TEMP_PATH)
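The package_dir mapping above installs deps/data as the pyspark.data package,
so a pip-installed PySpark ends up with the example datasets under
<site-packages>/pyspark/data. A minimal sketch of locating them at runtime,
assuming that installed layout (the snippet is illustrative, not part of the
patch):

    import os
    import pyspark

    # package_dir maps 'pyspark.data' -> 'deps/data', which installs as
    # <site-packages>/pyspark/data next to the rest of the package.
    data_dir = os.path.join(os.path.dirname(pyspark.__file__), "data")
    for name in sorted(os.listdir(data_dir)):
        print(os.path.join(data_dir, name))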
From 9735e201bbfd0fefedcfaf5ad86c5a4716ee6392 Mon Sep 17 00:00:00 2001
From: Shuai Lin
Date: Thu, 1 Dec 2016 01:36:02 +0800
Subject: [PATCH 2/6] Also include the third-party licenses in pyspark package.

---
 python/MANIFEST.in |  1 +
 python/setup.py    | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/python/MANIFEST.in b/python/MANIFEST.in
index cf773712e5bc..4827ec614946 100644
--- a/python/MANIFEST.in
+++ b/python/MANIFEST.in
@@ -21,3 +21,4 @@ recursive-include deps/data *
 recursive-include deps/examples *.py
 recursive-include lib *.zip
 include README.md
+include deps/licenses/*.txt
diff --git a/python/setup.py b/python/setup.py
index a13a024a98cd..d66d58a6e76a 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -74,7 +74,10 @@
 JARS_TARGET = os.path.join(TEMP_PATH, "jars")
 EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
 DATA_TARGET = os.path.join(TEMP_PATH, "data")
+LICENSES_PATH = os.path.join(SPARK_HOME, "licenses")
+LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses")

+data_files = glob.glob(os.path.join(LICENSES_PATH, "*"))

 # Check and see if we are under the spark path in which case we need to build the symlink farm.
 # This is important because we only want to build the symlink farm while under Spark otherwise we
@@ -117,12 +120,14 @@ def _supports_symlinks():
             os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
             os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
             os.symlink(DATA_PATH, DATA_TARGET)
+            os.symlink(LICENSES_PATH, LICENSES_TARGET)
         else:
             # For windows fall back to the slower copytree
             copytree(JARS_PATH, JARS_TARGET)
             copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
             copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
             copytree(DATA_PATH, DATA_TARGET)
+            copytree(LICENSES_PATH, LICENSES_TARGET)
     else:
         # If we are not inside of SPARK_HOME verify we have the required symlink farm
         if not os.path.exists(JARS_TARGET):
@@ -166,6 +171,7 @@
               'pyspark.python.pyspark',
               'pyspark.python.lib',
               'pyspark.data',
+              'pyspark.licenses',
               'pyspark.examples.src.main.python'],
         include_package_data=True,
         package_dir={
@@ -173,6 +179,7 @@
             'pyspark.bin': 'deps/bin',
             'pyspark.python.lib': 'lib',
             'pyspark.data': 'deps/data',
+            'pyspark.licenses': 'deps/licenses',
             'pyspark.examples.src.main.python': 'deps/examples',
         },
         package_data={
@@ -180,7 +187,9 @@
             'pyspark.jars': ['*.jar'],
             'pyspark.bin': ['*'],
             'pyspark.python.lib': ['*.zip'],
             'pyspark.data': ['*'],
+            'pyspark.licenses': ['*.txt'],
             'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
+        data_files=[('', data_files)],
         scripts=scripts,
         license='http://www.apache.org/licenses/LICENSE-2.0',
         install_requires=['py4j==0.10.4'],
@@ -210,9 +219,11 @@
         os.remove(os.path.join(TEMP_PATH, "bin"))
         os.remove(os.path.join(TEMP_PATH, "examples"))
         os.remove(os.path.join(TEMP_PATH, "data"))
+        os.remove(os.path.join(TEMP_PATH, "licenses"))
     else:
         rmtree(os.path.join(TEMP_PATH, "jars"))
         rmtree(os.path.join(TEMP_PATH, "bin"))
         rmtree(os.path.join(TEMP_PATH, "examples"))
         rmtree(os.path.join(TEMP_PATH, "data"))
+        rmtree(os.path.join(TEMP_PATH, "licenses"))
     os.rmdir(TEMP_PATH)

From ab51ae3b2ec0297f25c77bdd6d4a924fefe23f59 Mon Sep 17 00:00:00 2001
From: Shuai Lin
Date: Thu, 1 Dec 2016 01:45:37 +0800
Subject: [PATCH 3/6] Use graft instead of include

---
 python/MANIFEST.in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/MANIFEST.in b/python/MANIFEST.in
index 4827ec614946..3b8c99ef9cc4 100644
--- a/python/MANIFEST.in
+++ b/python/MANIFEST.in
@@ -17,8 +17,8 @@
 global-exclude *.py[cod] __pycache__ .DS_Store
 recursive-include deps/jars *.jar
 graft deps/bin
-recursive-include deps/data *
+graft deps/data
+graft deps/licenses
 recursive-include deps/examples *.py
 recursive-include lib *.zip
 include README.md
-include deps/licenses/*.txt
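In MANIFEST.in terms, "graft deps/data" pulls the entire tree into the source
distribution and behaves like "recursive-include deps/data *" without naming a
pattern; PATCH 5 below narrows both rules back to explicit extensions. One way
to verify what actually lands in the sdist, assuming a tarball built with
"python setup.py sdist" (the file name below is hypothetical):

    import tarfile

    # List every data/licenses file packed into the sdist; adjust the
    # tarball name to whatever version was actually built.
    with tarfile.open("dist/pyspark-2.1.0.dev0.tar.gz") as sdist:
        for member in sdist.getnames():
            if "/deps/data/" in member or "/deps/licenses/" in member:
                print(member)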
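PATCH 5 drops the data_files=[('', data_files)] argument for a reason worth
noting: under distutils' default install scheme, a relative (here empty)
target directory is resolved against the installation prefix, so the license
files would land in the root of the Python environment rather than inside the
package. Shipping them as package_data of pyspark.licenses keeps them under
the package tree. A sketch of the two destinations, assuming an installed
build of this series:

    import os
    import sys

    import pyspark

    # Where data_files=[('', ...)] would have installed the licenses:
    print(sys.prefix)
    # Where package_data puts them instead, inside the package itself:
    print(os.path.join(os.path.dirname(pyspark.__file__), "licenses"))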