
Commit be46fed

MaxGekk authored and Jackey Lee committed
[SPARK-26066][SQL] Move truncatedString to sql/catalyst and add spark.sql.debug.maxToStringFields conf
## What changes were proposed in this pull request?

In the PR, I propose:
- a new SQL config `spark.sql.debug.maxToStringFields` to control the maximum number of fields up to which `truncatedString` cuts its input sequences.
- moving `truncatedString` out of `core` to `sql/catalyst`, because it is used only in the `sql/catalyst` packages, for restricting the number of fields converted to strings from `TreeNode` and from expressions of `StructType`.

## How was this patch tested?

Added a test to `QueryExecutionSuite` to check that `spark.sql.debug.maxToStringFields` impacts the behavior of `truncatedString`.

Closes apache#23039 from MaxGekk/truncated-string-catalyst.

Lead-authored-by: Maxim Gekk <maxim.gekk@databricks.com>
Co-authored-by: Maxim Gekk <max.gekk@gmail.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
1 parent 4ad162a commit be46fed
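In practice, the new config replaces the old core setting `spark.debug.maxToStringFields` that was read through `SparkEnv`. A minimal sketch of how the new knob could be set when building a session (the app name and column counts are illustrative, not part of this commit):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical session: raise the truncation limit from its default of 25
// so that wide schemas and plans print all of their fields.
val spark = SparkSession.builder()
  .appName("truncation-demo")  // illustrative name
  .master("local[*]")
  .config("spark.sql.debug.maxToStringFields", 100)
  .getOrCreate()

// With the default of 25, a 30-column DataFrame's schema string would end in
// "... 6 more fields"; with the limit raised to 100 it prints in full.
```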

File tree

24 files changed, +156 -103 lines changed

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 0 additions & 48 deletions

```diff
@@ -31,7 +31,6 @@ import java.security.SecureRandom
 import java.util.{Locale, Properties, Random, UUID}
 import java.util.concurrent._
 import java.util.concurrent.TimeUnit.NANOSECONDS
-import java.util.concurrent.atomic.AtomicBoolean
 import java.util.zip.GZIPInputStream
 
 import scala.annotation.tailrec
@@ -93,53 +92,6 @@ private[spark] object Utils extends Logging {
   private val MAX_DIR_CREATION_ATTEMPTS: Int = 10
   @volatile private var localRootDirs: Array[String] = null
 
-  /**
-   * The performance overhead of creating and logging strings for wide schemas can be large. To
-   * limit the impact, we bound the number of fields to include by default. This can be overridden
-   * by setting the 'spark.debug.maxToStringFields' conf in SparkEnv.
-   */
-  val DEFAULT_MAX_TO_STRING_FIELDS = 25
-
-  private[spark] def maxNumToStringFields = {
-    if (SparkEnv.get != null) {
-      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
-    } else {
-      DEFAULT_MAX_TO_STRING_FIELDS
-    }
-  }
-
-  /** Whether we have warned about plan string truncation yet. */
-  private val truncationWarningPrinted = new AtomicBoolean(false)
-
-  /**
-   * Format a sequence with semantics similar to calling .mkString(). Any elements beyond
-   * maxNumToStringFields will be dropped and replaced by a "... N more fields" placeholder.
-   *
-   * @return the trimmed and formatted string.
-   */
-  def truncatedString[T](
-      seq: Seq[T],
-      start: String,
-      sep: String,
-      end: String,
-      maxNumFields: Int = maxNumToStringFields): String = {
-    if (seq.length > maxNumFields) {
-      if (truncationWarningPrinted.compareAndSet(false, true)) {
-        logWarning(
-          "Truncated the string representation of a plan since it was too large. This " +
-          "behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.")
-      }
-      val numFields = math.max(0, maxNumFields - 1)
-      seq.take(numFields).mkString(
-        start, sep, sep + "... " + (seq.length - numFields) + " more fields" + end)
-    } else {
-      seq.mkString(start, sep, end)
-    }
-  }
-
-  /** Shorthand for calling truncatedString() without start or end strings. */
-  def truncatedString[T](seq: Seq[T], sep: String): String = truncatedString(seq, "", sep, "")
-
   /** Serialize an object using Java serialization */
   def serialize[T](o: T): Array[Byte] = {
     val bos = new ByteArrayOutputStream()
```
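For downstream callers, the removal above means the helper now lives in the catalyst util package. A hedged before/after sketch of the migration (the sample sequence is illustrative):

```scala
// Before this commit (now removed):
//   org.apache.spark.util.Utils.truncatedString(cols, "[", ", ", "]")

// After this commit:
import org.apache.spark.sql.catalyst.util.truncatedString

val cols = Seq("a", "b", "c")          // illustrative input
truncatedString(cols, "[", ", ", "]")  // "[a, b, c]" under the default limit
```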

core/src/test/scala/org/apache/spark/util/UtilsSuite.scala

Lines changed: 0 additions & 8 deletions

```diff
@@ -45,14 +45,6 @@ import org.apache.spark.scheduler.SparkListener
 
 class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
 
-  test("truncatedString") {
-    assert(Utils.truncatedString(Nil, "[", ", ", "]", 2) == "[]")
-    assert(Utils.truncatedString(Seq(1, 2), "[", ", ", "]", 2) == "[1, 2]")
-    assert(Utils.truncatedString(Seq(1, 2, 3), "[", ", ", "]", 2) == "[1, ... 2 more fields]")
-    assert(Utils.truncatedString(Seq(1, 2, 3), "[", ", ", "]", -5) == "[, ... 3 more fields]")
-    assert(Utils.truncatedString(Seq(1, 2, 3), ", ") == "1, 2, 3")
-  }
-
   test("timeConversion") {
     // Test -1
     assert(Utils.timeStringAsSeconds("-1") === -1)
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -25,9 +25,9 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.expressions.codegen.Block._
 import org.apache.spark.sql.catalyst.trees.TreeNode
+import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
-import org.apache.spark.util.Utils
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // This file defines the basic expression abstract classes in Catalyst.
@@ -237,7 +237,7 @@ abstract class Expression extends TreeNode[Expression] {
 
   override def simpleString: String = toString
 
-  override def toString: String = prettyName + Utils.truncatedString(
+  override def toString: String = prettyName + truncatedString(
     flatArguments.toSeq, "(", ", ", ")")
 
   /**
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -24,8 +24,8 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning}
+import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.types._
-import org.apache.spark.util.Utils
 import org.apache.spark.util.random.RandomSampler
 
 /**
@@ -485,7 +485,7 @@ case class With(child: LogicalPlan, cteRelations: Seq[(String, SubqueryAlias)])
   override def output: Seq[Attribute] = child.output
 
   override def simpleString: String = {
-    val cteAliases = Utils.truncatedString(cteRelations.map(_._1), "[", ", ", "]")
+    val cteAliases = truncatedString(cteRelations.map(_._1), "[", ", ", "]")
     s"CTE $cteAliases"
   }
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala

Lines changed: 5 additions & 5 deletions

```diff
@@ -37,9 +37,9 @@ import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.JoinType
 import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning}
+import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.types._
 import org.apache.spark.storage.StorageLevel
-import org.apache.spark.util.Utils
 
 /** Used by [[TreeNode.getNodeNumbered]] when traversing the tree for a given number */
 private class MutableInt(var i: Int)
@@ -440,10 +440,10 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
     case tn: TreeNode[_] => tn.simpleString :: Nil
     case seq: Seq[Any] if seq.toSet.subsetOf(allChildren.asInstanceOf[Set[Any]]) => Nil
     case iter: Iterable[_] if iter.isEmpty => Nil
-    case seq: Seq[_] => Utils.truncatedString(seq, "[", ", ", "]") :: Nil
-    case set: Set[_] => Utils.truncatedString(set.toSeq, "{", ", ", "}") :: Nil
+    case seq: Seq[_] => truncatedString(seq, "[", ", ", "]") :: Nil
+    case set: Set[_] => truncatedString(set.toSeq, "{", ", ", "}") :: Nil
     case array: Array[_] if array.isEmpty => Nil
-    case array: Array[_] => Utils.truncatedString(array, "[", ", ", "]") :: Nil
+    case array: Array[_] => truncatedString(array, "[", ", ", "]") :: Nil
     case null => Nil
     case None => Nil
     case Some(null) => Nil
@@ -664,7 +664,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
       t.forall(_.isInstanceOf[Partitioning]) || t.forall(_.isInstanceOf[DataType]) =>
       JArray(t.map(parseToJson).toList)
     case t: Seq[_] if t.length > 0 && t.head.isInstanceOf[String] =>
-      JString(Utils.truncatedString(t, "[", ", ", "]"))
+      JString(truncatedString(t, "[", ", ", "]"))
     case t: Seq[_] => JNull
     case m: Map[_, _] => JNull
     // if it's a scala object, we can simply keep the full class path.
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala

Lines changed: 36 additions & 1 deletion

```diff
@@ -19,13 +19,16 @@ package org.apache.spark.sql.catalyst
 
 import java.io._
 import java.nio.charset.StandardCharsets
+import java.util.concurrent.atomic.AtomicBoolean
 
+import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{NumericType, StringType}
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.Utils
 
-package object util {
+package object util extends Logging {
 
   /** Silences output to stderr or stdout for the duration of f */
   def quietly[A](f: => A): A = {
@@ -167,6 +170,38 @@ package object util {
     builder.toString()
   }
 
+  /** Whether we have warned about plan string truncation yet. */
+  private val truncationWarningPrinted = new AtomicBoolean(false)
+
+  /**
+   * Format a sequence with semantics similar to calling .mkString(). Any elements beyond
+   * maxNumToStringFields will be dropped and replaced by a "... N more fields" placeholder.
+   *
+   * @return the trimmed and formatted string.
+   */
+  def truncatedString[T](
+      seq: Seq[T],
+      start: String,
+      sep: String,
+      end: String,
+      maxNumFields: Int = SQLConf.get.maxToStringFields): String = {
+    if (seq.length > maxNumFields) {
+      if (truncationWarningPrinted.compareAndSet(false, true)) {
+        logWarning(
+          "Truncated the string representation of a plan since it was too large. This " +
+          s"behavior can be adjusted by setting '${SQLConf.MAX_TO_STRING_FIELDS.key}'.")
+      }
+      val numFields = math.max(0, maxNumFields - 1)
+      seq.take(numFields).mkString(
+        start, sep, sep + "... " + (seq.length - numFields) + " more fields" + end)
+    } else {
+      seq.mkString(start, sep, end)
+    }
+  }
+
+  /** Shorthand for calling truncatedString() without start or end strings. */
+  def truncatedString[T](seq: Seq[T], sep: String): String = truncatedString(seq, "", sep, "")
+
   /* FIX ME
   implicit class debugLogging(a: Any) {
     def debugLogging() {
```
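To make the truncation semantics concrete, a small sketch of the moved helper's behavior (the expected strings follow directly from the implementation above and mirror the new UtilSuite below):

```scala
import org.apache.spark.sql.catalyst.util.truncatedString

// With an explicit limit of 2, only maxNumFields - 1 = 1 element is kept and
// the remaining 2 are summarized in the placeholder.
truncatedString(Seq(1, 2, 3), "[", ", ", "]", 2)  // "[1, ... 2 more fields]"

// With no explicit limit, the default now comes from
// SQLConf.get.maxToStringFields (spark.sql.debug.maxToStringFields, default 25)
// rather than from SparkEnv's spark.debug.maxToStringFields.
truncatedString(Seq(1, 2, 3), "[", ", ", "]")     // "[1, 2, 3]"
```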

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 9 additions & 0 deletions

```diff
@@ -1594,6 +1594,13 @@ object SQLConf {
       "WHERE, which does not follow SQL standard.")
     .booleanConf
     .createWithDefault(false)
+
+  val MAX_TO_STRING_FIELDS = buildConf("spark.sql.debug.maxToStringFields")
+    .doc("Maximum number of fields of sequence-like entries can be converted to strings " +
+      "in debug output. Any elements beyond the limit will be dropped and replaced by a" +
+      """ "... N more fields" placeholder.""")
+    .intConf
+    .createWithDefault(25)
 }
 
 /**
@@ -2009,6 +2016,8 @@ class SQLConf extends Serializable with Logging {
 
   def integralDivideReturnLong: Boolean = getConf(SQLConf.LEGACY_INTEGRALDIVIDE_RETURN_LONG)
 
+  def maxToStringFields: Int = getConf(SQLConf.MAX_TO_STRING_FIELDS)
+
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */
```
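Since the limit is now a SQL config rather than a SparkEnv conf, it can also be adjusted per session at runtime. A brief sketch (assumes an existing `spark` session):

```scala
// Assumes `spark` is an active SparkSession.
spark.conf.set("spark.sql.debug.maxToStringFields", 100)

// Equivalently via SQL:
spark.sql("SET spark.sql.debug.maxToStringFields=100")
```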

sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -27,7 +27,7 @@ import org.apache.spark.SparkException
 import org.apache.spark.annotation.Stable
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, InterpretedOrdering}
 import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, LegacyTypeStringParser}
-import org.apache.spark.sql.catalyst.util.quoteIdentifier
+import org.apache.spark.sql.catalyst.util.{quoteIdentifier, truncatedString}
 import org.apache.spark.util.Utils
 
 /**
@@ -346,7 +346,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
 
   override def simpleString: String = {
     val fieldTypes = fields.view.map(field => s"${field.name}:${field.dataType.simpleString}")
-    Utils.truncatedString(fieldTypes, "struct<", ",", ">")
+    truncatedString(fieldTypes, "struct<", ",", ">")
   }
 
   override def catalogString: String = {
```
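The effect on `StructType.simpleString` can be seen with a schema wider than the default limit. A hedged sketch (field names are illustrative):

```scala
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// 30 fields against the default limit of 25: 24 fields are kept and the
// remaining 6 are collapsed into the placeholder.
val wide = StructType((1 to 30).map(i => StructField(s"c$i", IntegerType)))
wide.simpleString
// e.g. "struct<c1:int,c2:int,...,c24:int,... 6 more fields>" (middle fields elided here)
```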
sql/catalyst/src/test/scala/org/apache/spark/sql/util/UtilSuite.scala

Lines changed: 31 additions & 0 deletions

```diff
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.util
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.util.truncatedString
+
+class UtilSuite extends SparkFunSuite {
+  test("truncatedString") {
+    assert(truncatedString(Nil, "[", ", ", "]", 2) == "[]")
+    assert(truncatedString(Seq(1, 2), "[", ", ", "]", 2) == "[1, 2]")
+    assert(truncatedString(Seq(1, 2, 3), "[", ", ", "]", 2) == "[1, ... 2 more fields]")
+    assert(truncatedString(Seq(1, 2, 3), "[", ", ", "]", -5) == "[, ... 3 more fields]")
+    assert(truncatedString(Seq(1, 2, 3), ", ") == "1, 2, 3")
+  }
+}
```

sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala

Lines changed: 3 additions & 2 deletions

```diff
@@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, UnknownPartitioning}
+import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat => ParquetSource}
 import org.apache.spark.sql.execution.metric.SQLMetrics
@@ -56,8 +57,8 @@ trait DataSourceScanExec extends LeafExecNode with CodegenSupport {
       case (key, value) =>
        key + ": " + StringUtils.abbreviate(redact(value), 100)
    }
-    val metadataStr = Utils.truncatedString(metadataEntries, " ", ", ", "")
-    s"$nodeNamePrefix$nodeName${Utils.truncatedString(output, "[", ",", "]")}$metadataStr"
+    val metadataStr = truncatedString(metadataEntries, " ", ", ", "")
+    s"$nodeNamePrefix$nodeName${truncatedString(output, "[", ",", "]")}$metadataStr"
   }
 
   override def verboseString: String = redact(super.verboseString)
```
