Commit 8c1e1ff

Merge branch 'file-handle-optimizations' into hive-compat-suite-speedup
2 parents fac08d5 + c7caa5c commit 8c1e1ff


124 files changed (+1939, -1857 lines)


R/create-docs.sh

Lines changed: 1 addition & 4 deletions
@@ -30,10 +30,7 @@ set -e
 export FWDIR="$(cd "`dirname "$0"`"; pwd)"
 pushd $FWDIR
 
-# Generate Rd file
-Rscript -e 'library(devtools); devtools::document(pkg="./pkg", roclets=c("rd"))'
-
-# Install the package
+# Install the package (this will also generate the Rd files)
 ./install-dev.sh
 
 # Now create HTML files

R/install-dev.sh

Lines changed: 8 additions & 1 deletion
@@ -34,5 +34,12 @@ LIB_DIR="$FWDIR/lib"
 
 mkdir -p $LIB_DIR
 
-# Install R
+pushd $FWDIR
+
+# Generate Rd files if devtools is installed
+Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
+
+# Install SparkR to $LIB_DIR
 R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/
+
+popd
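
For reference, a standalone R sketch of the check install-dev.sh now performs; the condition and the devtools::document() call are taken from the hunk above, while the message() fallback is an illustrative addition, not part of the script:

# Generate Rd files only when devtools is available, so a plain install
# still succeeds on machines without devtools (run from the R/ directory).
if ("devtools" %in% rownames(installed.packages())) {
  library(devtools)
  devtools::document(pkg = "./pkg", roclets = c("rd"))
} else {
  message("devtools not installed; skipping Rd generation")  # illustrative only
}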

R/pkg/R/SQLContext.R

Lines changed: 10 additions & 4 deletions
@@ -452,7 +452,7 @@ dropTempTable <- function(sqlContext, tableName) {
 #' df <- read.df(sqlContext, "path/to/file.json", source = "json")
 #' }
 
-read.df <- function(sqlContext, path = NULL, source = NULL, ...) {
+read.df <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
     options[['path']] <- path
@@ -462,15 +462,21 @@ read.df <- function(sqlContext, path = NULL, source = NULL, ...) {
     source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
                           "org.apache.spark.sql.parquet")
   }
-  sdf <- callJMethod(sqlContext, "load", source, options)
+  if (!is.null(schema)) {
+    stopifnot(class(schema) == "structType")
+    sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sqlContext, source,
+                       schema$jobj, options)
+  } else {
+    sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sqlContext, source, options)
+  }
   dataFrame(sdf)
 }
 
 #' @aliases loadDF
 #' @export
 
-loadDF <- function(sqlContext, path = NULL, source = NULL, ...) {
-  read.df(sqlContext, path, source, ...)
+loadDF <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
+  read.df(sqlContext, path, source, schema, ...)
 }
 
 #' Create an external table
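
A minimal SparkR usage sketch of the new schema argument to read.df()/loadDF(), mirroring the test added in test_sparkSQL.R below; sqlContext and jsonPath are assumed to already exist in the session:

# Build an explicit schema instead of letting the JSON source infer one.
schema <- structType(structField("name", type = "string"),
                     structField("age", type = "double"))

# read.df() (and its loadDF() alias) now pass the schema through to SQLUtils.loadDF.
df <- read.df(sqlContext, jsonPath, source = "json", schema = schema)
dtypes(df)  # expected: list(c("name", "string"), c("age", "double"))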

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 13 additions & 0 deletions
@@ -504,6 +504,19 @@ test_that("read.df() from json file", {
   df <- read.df(sqlContext, jsonPath, "json")
   expect_true(inherits(df, "DataFrame"))
   expect_true(count(df) == 3)
+
+  # Check if we can apply a user defined schema
+  schema <- structType(structField("name", type = "string"),
+                       structField("age", type = "double"))
+
+  df1 <- read.df(sqlContext, jsonPath, "json", schema)
+  expect_true(inherits(df1, "DataFrame"))
+  expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double")))
+
+  # Run the same with loadDF
+  df2 <- loadDF(sqlContext, jsonPath, "json", schema)
+  expect_true(inherits(df2, "DataFrame"))
+  expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double")))
 })
 
 test_that("write.df() as parquet file", {

bin/pyspark

Lines changed: 1 addition & 15 deletions
@@ -17,24 +17,10 @@
 # limitations under the License.
 #
 
-# Figure out where Spark is installed
 export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
 
 source "$SPARK_HOME"/bin/load-spark-env.sh
-
-function usage() {
-  if [ -n "$1" ]; then
-    echo $1
-  fi
-  echo "Usage: ./bin/pyspark [options]" 1>&2
-  "$SPARK_HOME"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  exit $2
-}
-export -f usage
-
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  usage
-fi
+export _SPARK_CMD_USAGE="Usage: ./bin/pyspark [options]"
 
 # In Spark <= 1.1, setting IPYTHON=1 would cause the driver to be launched using the `ipython`
 # executable, while the worker would still be launched using PYSPARK_PYTHON.

bin/pyspark2.cmd

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ rem Figure out where the Spark framework is installed
 set SPARK_HOME=%~dp0..
 
 call %SPARK_HOME%\bin\load-spark-env.cmd
+set _SPARK_CMD_USAGE=Usage: bin\pyspark.cmd [options]
 
 rem Figure out which Python to use.
 if "x%PYSPARK_DRIVER_PYTHON%"=="x" (

bin/spark-class

Lines changed: 1 addition & 12 deletions
@@ -16,18 +16,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-set -e
 
 # Figure out where Spark is installed
 export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
 
 . "$SPARK_HOME"/bin/load-spark-env.sh
 
-if [ -z "$1" ]; then
-  echo "Usage: spark-class <class> [<args>]" 1>&2
-  exit 1
-fi
-
 # Find the java binary
 if [ -n "${JAVA_HOME}" ]; then
   RUNNER="${JAVA_HOME}/bin/java"
@@ -98,9 +92,4 @@ CMD=()
 while IFS= read -d '' -r ARG; do
   CMD+=("$ARG")
 done < <("$RUNNER" -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@")
-
-if [ "${CMD[0]}" = "usage" ]; then
-  "${CMD[@]}"
-else
-  exec "${CMD[@]}"
-fi
+exec "${CMD[@]}"

bin/spark-shell

Lines changed: 1 addition & 14 deletions
@@ -29,20 +29,7 @@ esac
 set -o posix
 
 export FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
-
-usage() {
-  if [ -n "$1" ]; then
-    echo "$1"
-  fi
-  echo "Usage: ./bin/spark-shell [options]"
-  "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  exit "$2"
-}
-export -f usage
-
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  usage "" 0
-fi
+export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]"
 
 # SPARK-4161: scala does not assume use of the java classpath,
 # so we need to add the "-Dscala.usejavacp=true" flag manually. We

bin/spark-shell2.cmd

Lines changed: 2 additions & 19 deletions
@@ -18,12 +18,7 @@ rem limitations under the License.
 rem
 
 set SPARK_HOME=%~dp0..
-
-echo "%*" | findstr " \<--help\> \<-h\>" >nul
-if %ERRORLEVEL% equ 0 (
-  call :usage
-  exit /b 0
-)
+set _SPARK_CMD_USAGE=Usage: .\bin\spark-shell.cmd [options]
 
 rem SPARK-4161: scala does not assume use of the java classpath,
 rem so we need to add the "-Dscala.usejavacp=true" flag manually. We
@@ -37,16 +32,4 @@ if "x%SPARK_SUBMIT_OPTS%"=="x" (
 set SPARK_SUBMIT_OPTS="%SPARK_SUBMIT_OPTS% -Dscala.usejavacp=true"
 
 :run_shell
-call %SPARK_HOME%\bin\spark-submit2.cmd --class org.apache.spark.repl.Main %*
-set SPARK_ERROR_LEVEL=%ERRORLEVEL%
-if not "x%SPARK_LAUNCHER_USAGE_ERROR%"=="x" (
-  call :usage
-  exit /b 1
-)
-exit /b %SPARK_ERROR_LEVEL%
-
-:usage
-echo %SPARK_LAUNCHER_USAGE_ERROR%
-echo "Usage: .\bin\spark-shell.cmd [options]" >&2
-call %SPARK_HOME%\bin\spark-submit2.cmd --help 2>&1 | findstr /V "Usage" 1>&2
-goto :eof
+%SPARK_HOME%\bin\spark-submit2.cmd --class org.apache.spark.repl.Main %*

bin/spark-sql

Lines changed: 2 additions & 37 deletions
@@ -17,41 +17,6 @@
 # limitations under the License.
 #
 
-#
-# Shell script for starting the Spark SQL CLI
-
-# Enter posix mode for bash
-set -o posix
-
-# NOTE: This exact class name is matched downstream by SparkSubmit.
-# Any changes need to be reflected there.
-export CLASS="org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver"
-
-# Figure out where Spark is installed
 export FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
-
-function usage {
-  if [ -n "$1" ]; then
-    echo "$1"
-  fi
-  echo "Usage: ./bin/spark-sql [options] [cli option]"
-  pattern="usage"
-  pattern+="\|Spark assembly has been built with Hive"
-  pattern+="\|NOTE: SPARK_PREPEND_CLASSES is set"
-  pattern+="\|Spark Command: "
-  pattern+="\|--help"
-  pattern+="\|======="
-
-  "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  echo
-  echo "CLI options:"
-  "$FWDIR"/bin/spark-class "$CLASS" --help 2>&1 | grep -v "$pattern" 1>&2
-  exit "$2"
-}
-export -f usage
-
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  usage "" 0
-fi
-
-exec "$FWDIR"/bin/spark-submit --class "$CLASS" "$@"
+export _SPARK_CMD_USAGE="Usage: ./bin/spark-sql [options] [cli option]"
+exec "$FWDIR"/bin/spark-submit --class org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver "$@"
