diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 000000000000..2260493b46ab
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1 @@
+sql-configs.html
diff --git a/docs/configuration.md b/docs/configuration.md
index 2febfe9744d5..6d7ac1f67edf 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -2399,47 +2399,15 @@ the driver or executor, or, in the absence of that value, the number of cores av
 Please refer to the [Security](security.html) page for available options on how to secure different
 Spark subsystems.
 
-### Spark SQL
-
-Running the SET -v command will show the entire list of the SQL configuration.
-
-<div class="codetabs">
-<div data-lang="scala"  markdown="1">
-
-{% highlight scala %}
-// spark is an existing SparkSession
-spark.sql("SET -v").show(numRows = 200, truncate = false)
-{% endhighlight %}
-
-</div>
-
-<div data-lang="java"  markdown="1">
-
-{% highlight java %}
-// spark is an existing SparkSession
-spark.sql("SET -v").show(200, false);
-{% endhighlight %}
-</div>
-
-<div data-lang="python"  markdown="1">
-
-{% highlight python %}
-# spark is an existing SparkSession
-spark.sql("SET -v").show(n=200, truncate=False)
-{% endhighlight %}
-
-</div>
-
-<div data-lang="r"  markdown="1">
-
-{% highlight r %}
-sparkR.session()
-properties <- sql("SET -v")
-showDF(properties, numRows = 200, truncate = FALSE)
-{% endhighlight %}
+{% for static_file in site.static_files %}
+  {% if static_file.name == 'sql-configs.html' %}
+### Spark SQL
 
-</div>
-</div>
+ {% include_relative sql-configs.html %} + {% break %} + {% endif %} +{% endfor %} ### Spark Streaming diff --git a/sql/README.md b/sql/README.md index 67e3225e2c27..ae5ebd1d7537 100644 --- a/sql/README.md +++ b/sql/README.md @@ -9,4 +9,4 @@ Spark SQL is broken up into four subprojects: - Hive Support (sql/hive) - Includes extensions that allow users to write queries using a subset of HiveQL and access data from a Hive Metastore using Hive SerDes. There are also wrappers that allow users to run queries that include Hive UDFs, UDAFs, and UDTFs. - HiveServer and CLI support (sql/hive-thriftserver) - Includes support for the SQL CLI (bin/spark-sql) and a HiveServer2 (for JDBC/ODBC) compatible server. -Running `./sql/create-docs.sh` generates SQL documentation for built-in functions under `sql/site`. +Running `./sql/create-docs.sh` generates SQL documentation for built-in functions under `sql/site`, and SQL configuration documentation that gets included as part of `configuration.md` in the main `docs` directory. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 3ad3416256c7..3362af267ca2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -324,11 +324,11 @@ object SQLConf { .doc("Configures the maximum size in bytes for a table that will be broadcast to all worker " + "nodes when performing a join. By setting this value to -1 broadcasting can be disabled. " + "Note that currently statistics are only supported for Hive Metastore tables where the " + - "command ANALYZE TABLE <tableName> COMPUTE STATISTICS noscan has been " + + "command `ANALYZE TABLE COMPUTE STATISTICS noscan` has been " + "run, and file-based data source tables where the statistics are computed directly on " + "the files of data.") .bytesConf(ByteUnit.BYTE) - .createWithDefault(10L * 1024 * 1024) + .createWithDefaultString("10MB") val LIMIT_SCALE_UP_FACTOR = buildConf("spark.sql.limit.scaleUpFactor") .internal() @@ -393,7 +393,7 @@ object SQLConf { s"an effect when '${ADAPTIVE_EXECUTION_ENABLED.key}' and " + s"'${REDUCE_POST_SHUFFLE_PARTITIONS_ENABLED.key}' is enabled.") .bytesConf(ByteUnit.BYTE) - .createWithDefault(64 * 1024 * 1024) + .createWithDefaultString("64MB") val SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS = buildConf("spark.sql.adaptive.shuffle.maxNumPostShufflePartitions") @@ -427,7 +427,7 @@ object SQLConf { .doc("Configures the minimum size in bytes for a partition that is considered as a skewed " + "partition in adaptive skewed join.") .bytesConf(ByteUnit.BYTE) - .createWithDefault(64 * 1024 * 1024) + .createWithDefaultString("64MB") val ADAPTIVE_EXECUTION_SKEWED_PARTITION_FACTOR = buildConf("spark.sql.adaptive.optimizeSkewedJoin.skewedPartitionFactor") @@ -761,7 +761,7 @@ object SQLConf { val BROADCAST_TIMEOUT = buildConf("spark.sql.broadcastTimeout") .doc("Timeout in seconds for the broadcast wait time in broadcast joins.") .timeConf(TimeUnit.SECONDS) - .createWithDefault(5 * 60) + .createWithDefaultString(s"${5 * 60}") // This is only used for the thriftserver val THRIFTSERVER_POOL = buildConf("spark.sql.thriftserver.scheduler.pool") @@ -821,7 +821,7 @@ object SQLConf { .createWithDefault(true) val BUCKETING_MAX_BUCKETS = buildConf("spark.sql.sources.bucketing.maxBuckets") - .doc("The maximum number of buckets allowed. 
Defaults to 100000") + .doc("The maximum number of buckets allowed.") .intConf .checkValue(_ > 0, "the value of spark.sql.sources.bucketing.maxBuckets must be greater than 0") .createWithDefault(100000) @@ -1013,7 +1013,7 @@ object SQLConf { "This configuration is effective only when using file-based sources such as Parquet, JSON " + "and ORC.") .bytesConf(ByteUnit.BYTE) - .createWithDefault(128 * 1024 * 1024) // parquet.block.size + .createWithDefaultString("128MB") // parquet.block.size val FILES_OPEN_COST_IN_BYTES = buildConf("spark.sql.files.openCostInBytes") .internal() @@ -1152,7 +1152,8 @@ object SQLConf { val VARIABLE_SUBSTITUTE_ENABLED = buildConf("spark.sql.variable.substitute") - .doc("This enables substitution using syntax like ${var} ${system:var} and ${env:var}.") + .doc("This enables substitution using syntax like `${var}`, `${system:var}`, " + + "and `${env:var}`.") .booleanConf .createWithDefault(true) @@ -1162,7 +1163,7 @@ object SQLConf { .doc("Enable two-level aggregate hash map. When enabled, records will first be " + "inserted/looked-up at a 1st-level, small, fast map, and then fallback to a " + "2nd-level, larger, slower map when 1st level is full or keys cannot be found. " + - "When disabled, records go directly to the 2nd level. Defaults to true.") + "When disabled, records go directly to the 2nd level.") .booleanConf .createWithDefault(true) @@ -1316,10 +1317,10 @@ object SQLConf { val STREAMING_STOP_TIMEOUT = buildConf("spark.sql.streaming.stopTimeout") - .doc("How long to wait for the streaming execution thread to stop when calling the " + - "streaming query's stop() method in milliseconds. 0 or negative values wait indefinitely.") + .doc("How long to wait in milliseconds for the streaming execution thread to stop when " + + "calling the streaming query's stop() method. 0 or negative values wait indefinitely.") .timeConf(TimeUnit.MILLISECONDS) - .createWithDefault(0L) + .createWithDefaultString("0") val STREAMING_NO_DATA_PROGRESS_EVENT_INTERVAL = buildConf("spark.sql.streaming.noDataProgressEventInterval") @@ -1602,10 +1603,10 @@ object SQLConf { val PANDAS_UDF_BUFFER_SIZE = buildConf("spark.sql.pandas.udf.buffer.size") .doc( - s"Same as ${BUFFER_SIZE} but only applies to Pandas UDF executions. If it is not set, " + - s"the fallback is ${BUFFER_SIZE}. Note that Pandas execution requires more than 4 bytes. " + - "Lowering this value could make small Pandas UDF batch iterated and pipelined; however, " + - "it might degrade performance. See SPARK-27870.") + s"Same as `${BUFFER_SIZE.key}` but only applies to Pandas UDF executions. If it is not " + + s"set, the fallback is `${BUFFER_SIZE.key}`. Note that Pandas execution requires more " + + "than 4 bytes. Lowering this value could make small Pandas UDF batch iterated and " + + "pipelined; however, it might degrade performance. See SPARK-27870.") .fallbackConf(BUFFER_SIZE) val PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME = @@ -2020,7 +2021,7 @@ object SQLConf { .checkValue(i => i >= 0 && i <= ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH, "Invalid " + "value for 'spark.sql.maxPlanStringLength'. 
Length must be a valid string length " + "(nonnegative and shorter than the maximum size).") - .createWithDefault(ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) + .createWithDefaultString(s"${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}") val SET_COMMAND_REJECTS_SPARK_CORE_CONFS = buildConf("spark.sql.legacy.setCommandRejectsSparkCoreConfs") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala index b232aa18c816..bf3055d5e3e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.ExpressionInfo import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.execution.{ExplainMode, QueryExecution} import org.apache.spark.sql.execution.arrow.ArrowConverters +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.DataType private[sql] object PythonSQLUtils { @@ -39,6 +40,12 @@ private[sql] object PythonSQLUtils { FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray } + def listSQLConfigs(): Array[(String, String, String)] = { + val conf = new SQLConf() + // Py4J doesn't seem to translate Seq well, so we convert to an Array. + conf.getAllDefinedConfs.toArray + } + /** * Python callable function to read a file in Arrow stream format and create a [[RDD]] * using each serialized ArrowRecordBatch as a partition. diff --git a/sql/create-docs.sh b/sql/create-docs.sh index 4353708d22f7..44aa877332fd 100755 --- a/sql/create-docs.sh +++ b/sql/create-docs.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# Script to create SQL API docs. This requires `mkdocs` and to build +# Script to create SQL API and config docs. This requires `mkdocs` and to build # Spark first. After running this script the html docs can be found in # $SPARK_HOME/sql/site @@ -39,14 +39,16 @@ fi pushd "$FWDIR" > /dev/null -# Now create the markdown file rm -fr docs mkdir docs -echo "Generating markdown files for SQL documentation." -"$SPARK_HOME/bin/spark-submit" gen-sql-markdown.py -# Now create the HTML files -echo "Generating HTML files for SQL documentation." +echo "Generating SQL API Markdown files." +"$SPARK_HOME/bin/spark-submit" gen-sql-api-docs.py + +echo "Generating SQL configuration table HTML file." +"$SPARK_HOME/bin/spark-submit" gen-sql-config-docs.py + +echo "Generating HTML files for SQL API documentation." mkdocs build --clean rm -fr docs diff --git a/sql/gen-sql-markdown.py b/sql/gen-sql-api-docs.py similarity index 96% rename from sql/gen-sql-markdown.py rename to sql/gen-sql-api-docs.py index e0529f831061..4feee7ad5257 100644 --- a/sql/gen-sql-markdown.py +++ b/sql/gen-sql-api-docs.py @@ -15,10 +15,11 @@ # limitations under the License. 
 #
 
-import sys
 import os
 from collections import namedtuple
 
+from pyspark.java_gateway import launch_gateway
+
 ExpressionInfo = namedtuple(
     "ExpressionInfo", "className name usage arguments examples note since deprecated")
 
@@ -219,8 +220,7 @@ def generate_sql_markdown(jvm, path):
 
 
 if __name__ == "__main__":
-    from pyspark.java_gateway import launch_gateway
-
     jvm = launch_gateway().jvm
-    markdown_file_path = "%s/docs/index.md" % os.path.dirname(sys.argv[0])
+    spark_root_dir = os.path.dirname(os.path.dirname(__file__))
+    markdown_file_path = os.path.join(spark_root_dir, "sql/docs/index.md")
     generate_sql_markdown(jvm, markdown_file_path)
diff --git a/sql/gen-sql-config-docs.py b/sql/gen-sql-config-docs.py
new file mode 100644
index 000000000000..04f5a850c998
--- /dev/null
+++ b/sql/gen-sql-config-docs.py
@@ -0,0 +1,117 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import re
+from collections import namedtuple
+from textwrap import dedent
+
+# To avoid adding a new direct dependency, we import markdown from within mkdocs.
+from mkdocs.structure.pages import markdown
+from pyspark.java_gateway import launch_gateway
+
+SQLConfEntry = namedtuple(
+    "SQLConfEntry", ["name", "default", "description"])
+
+
+def get_public_sql_configs(jvm):
+    sql_configs = [
+        SQLConfEntry(
+            name=_sql_config._1(),
+            default=_sql_config._2(),
+            description=_sql_config._3(),
+        )
+        for _sql_config in jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listSQLConfigs()
+    ]
+    return sql_configs
+
+
+def generate_sql_configs_table(sql_configs, path):
+    """
+    Generates an HTML table at `path` that lists all public SQL
+    configuration options.
+
+    The table will look something like this:
+
+    ```html
+    <table class="table">
+    <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+
+    <tr>
+        <td><code>spark.sql.adaptive.enabled</code></td>
+        <td>false</td>
+        <td><p>When true, enable adaptive query execution.</p></td>
+    </tr>
+
+    ...
+
+    </table>
+    ```
+    """
+    value_reference_pattern = re.compile(r"^<value of (\S*)>$")
+
+    with open(path, 'w') as f:
+        f.write(dedent(
+            """
+            <table class="table">
+            <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+            """
+        ))
+        for config in sorted(sql_configs, key=lambda x: x.name):
+            if config.default == "<undefined>":
+                default = "(none)"
+            elif config.default.startswith("<value of "):
+                referenced_config_name = value_reference_pattern.match(config.default).group(1)
+                default = "(value of <code>{}</code>)".format(referenced_config_name)
+            else:
+                default = config.default
+
+            if default.startswith("<"):
+                raise Exception(
+                    "Unhandled reference in SQL config docs. Config '{name}' "
+                    "has default '{default}' that looks like an HTML tag."
+                    .format(
+                        name=config.name,
+                        default=default,
+                    )
+                )
+
+            f.write(dedent(
+                """
+                <tr>
+                    <td><code>{name}</code></td>
+                    <td>{default}</td>
+                    <td>{description}</td>
+                </tr>
+                """
+                .format(
+                    name=config.name,
+                    default=default,
+                    description=markdown.markdown(config.description),
+                )
+            ))
+        f.write("</table>\n")
+
+
+if __name__ == "__main__":
+    jvm = launch_gateway().jvm
+    sql_configs = get_public_sql_configs(jvm)
+
+    spark_root_dir = os.path.dirname(os.path.dirname(__file__))
+    sql_configs_table_path = os.path.join(spark_root_dir, "docs/sql-configs.html")
+
+    generate_sql_configs_table(sql_configs, path=sql_configs_table_path)