[SPARK-21617][SQL] Store correct table metadata when altering schema in Hive metastore. #18849
HiveExternalCatalog.alterTableSchema takes a shortcut by modifying the raw Hive table metadata instead of the full Spark view of the table, which means it needs to know whether the table is Hive-compatible. For compatible tables, the existing "replace the schema" code is the correct path, except that an exception on that path should surface as an error rather than trigger a retry in a different format. For non-compatible tables, Spark should only update the table properties and leave the schema stored in the raw table untouched.

Because Spark doesn't explicitly record whether a table is Hive-compatible, a new table property was added to make that explicit. The code also tries to detect old data source (DS) tables that lack the property and do the right thing for them.

These changes also uncovered a problem with how case-sensitive DS tables were being saved to the Hive metastore: the metastore is case-insensitive, yet the code treated such tables as Hive-compatible whenever the data source had a Hive counterpart (e.g. parquet). In that scenario, the schema could be corrupted by a later update from Spark if columns conflicted once case was ignored. The change fixes this by marking case-sensitive DS tables as not Hive-compatible.
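To make the case-sensitivity problem concrete, here is a minimal, hypothetical sketch (not taken from this patch; the table and column names are invented) of the kind of table that triggers it. With `spark.sql.caseSensitive=true`, Spark accepts columns whose names differ only by case, but the Hive metastore compares identifiers case-insensitively, so persisting such a schema in Hive-compatible form risks corrupting it on a later schema update; with this change, such tables are persisted in the Spark SQL specific format instead.

```scala
// Hypothetical spark-shell sketch, assuming a Hive-enabled Spark build; names are invented.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("case-sensitive-ds-table-sketch")
  .config("spark.sql.caseSensitive", "true") // Spark now treats `value` and `VALUE` as distinct
  .enableHiveSupport()                       // table metadata is stored in the Hive metastore
  .getOrCreate()

// A Parquet-backed data source table whose columns collide once case is ignored.
// Spark's case-sensitive analysis accepts it, but the case-insensitive metastore
// cannot represent it faithfully in Hive-compatible form.
spark.sql("CREATE TABLE mixed_case_tbl (value INT, VALUE STRING) USING parquet")

// A later schema change goes through HiveExternalCatalog.alterTableSchema; if the table had
// been stored as Hive-compatible, the conflicting column names could corrupt the stored schema.
spark.sql("ALTER TABLE mixed_case_tbl ADD COLUMNS (extra DOUBLE)")
```

Whether the CREATE TABLE above is accepted depends on the case-sensitivity setting; the snippet only illustrates why the patch stops treating case-sensitive DS tables as Hive-compatible.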
HiveExternalCatalog.scala

```diff
@@ -32,7 +32,8 @@ import org.apache.thrift.TException

 import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.internal.Logging
-import org.apache.spark.sql.AnalysisException
+import org.apache.spark.internal.config.ConfigEntry
+import org.apache.spark.sql.{AnalysisException, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException
 import org.apache.spark.sql.catalyst.catalog._
@@ -43,7 +44,7 @@ import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.PartitioningUtils
 import org.apache.spark.sql.hive.client.HiveClient
-import org.apache.spark.sql.internal.HiveSerDe
+import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}
 import org.apache.spark.sql.internal.StaticSQLConf._
 import org.apache.spark.sql.types.{DataType, StructType}
@@ -257,6 +258,20 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
     }
   }

+  /**
+   * Retrieve a configuration value for the current active session, if any.
+   */
+  private def currentSessionConf[T](entry: ConfigEntry[T]): T = {
+    SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession).map { session =>
+      session.conf.get(entry)
+    }.getOrElse {
+      // If there's no active session, try to read from the SparkConf object instead. Normally
+      // there should be an active session, but unit tests invoke methods on the catalog directly,
+      // so that might not be true in some cases.
+      conf.get(entry)
+    }
+  }
+
   private def createDataSourceTable(table: CatalogTable, ignoreIfExists: Boolean): Unit = {
     // data source table always have a provider, it's guaranteed by `DDLUtils.isDatasourceTable`.
     val provider = table.provider.get
@@ -288,6 +303,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
     // bucket specification to empty. Note that partition columns are retained, so that we can
     // call partition-related Hive API later.
     def newSparkSQLSpecificMetastoreTable(): CatalogTable = {
+      val hiveCompatible = Map(DATASOURCE_HIVE_COMPATIBLE -> "false")
       table.copy(
         // Hive only allows directory paths as location URIs while Spark SQL data source tables
         // also allow file paths. For non-hive-compatible format, we should not set location URI
@@ -297,11 +313,12 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
           properties = storagePropsWithLocation),
         schema = table.partitionSchema,
         bucketSpec = None,
-        properties = table.properties ++ tableProperties)
+        properties = table.properties ++ tableProperties ++ hiveCompatible)
     }

     // converts the table metadata to Hive compatible format, i.e. set the serde information.
     def newHiveCompatibleMetastoreTable(serde: HiveSerDe): CatalogTable = {
+      val hiveCompatible = Map(DATASOURCE_HIVE_COMPATIBLE -> "true")
       val location = if (table.tableType == EXTERNAL) {
         // When we hit this branch, we are saving an external data source table with hive
         // compatible format, which means the data source is file-based and must have a `path`.
@@ -320,7 +337,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
           serde = serde.serde,
           properties = storagePropsWithLocation
         ),
-        properties = table.properties ++ tableProperties)
+        properties = table.properties ++ tableProperties ++ hiveCompatible)
     }

     val qualifiedTableName = table.identifier.quotedString
@@ -342,6 +359,12 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
           "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. "
         (None, message)

+      case _ if currentSessionConf(SQLConf.CASE_SENSITIVE) =>
+        val message =
+          s"Persisting case sensitive data source table $qualifiedTableName into " +
+            "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. "
+        (None, message)
+
       case Some(serde) =>
         val message =
           s"Persisting file based data source table $qualifiedTableName into " +
@@ -386,6 +409,12 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
    * can be used as table properties later.
    */
   private def tableMetaToTableProps(table: CatalogTable): mutable.Map[String, String] = {
+    tableMetaToTableProps(table, table.schema)
+  }
+
+  private def tableMetaToTableProps(
+      table: CatalogTable,
+      schema: StructType): mutable.Map[String, String] = {
     val partitionColumns = table.partitionColumnNames
     val bucketSpec = table.bucketSpec
@@ -394,7 +423,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
     // property. In this case, we split the JSON string and store each part as a separate table
     // property.
     val threshold = conf.get(SCHEMA_STRING_LENGTH_THRESHOLD)
-    val schemaJsonString = table.schema.json
+    val schemaJsonString = schema.json
     // Split the JSON string.
     val parts = schemaJsonString.grouped(threshold).toSeq
     properties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString)
@@ -611,30 +640,39 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
   override def alterTableSchema(db: String, table: String, schema: StructType): Unit = withClient {
     requireTableExists(db, table)
     val rawTable = getRawTable(db, table)
-    val withNewSchema = rawTable.copy(schema = schema)
-    verifyColumnNames(withNewSchema)
-    // Add table metadata such as table schema, partition columns, etc. to table properties.
-    val updatedTable = withNewSchema.copy(
-      properties = withNewSchema.properties ++ tableMetaToTableProps(withNewSchema))
-
-    // If it's a data source table, make sure the original schema is left unchanged; the
-    // actual schema is recorded as a table property.
-    val tableToStore = if (DDLUtils.isDatasourceTable(updatedTable)) {
-      updatedTable.copy(schema = rawTable.schema)
+    val updatedProperties = rawTable.properties ++ tableMetaToTableProps(rawTable, schema)
+
+    // Detect whether this is a Hive-compatible table.
+    val provider = rawTable.properties.get(DATASOURCE_PROVIDER)
+    val isHiveCompatible = if (provider.isDefined && provider != Some(DDLUtils.HIVE_PROVIDER)) {
+      rawTable.properties.get(DATASOURCE_HIVE_COMPATIBLE) match {
+        case Some(value) =>
+          value.toBoolean
+        case _ =>
+          // If the property is not set, the table may have been created by an old version
+          // of Spark. Those versions set a "path" property in the table's storage descriptor
+          // for non-Hive-compatible tables, so use that to detect compatibility.
+          rawTable.storage.properties.get("path").isDefined
+      }
     } else {
-      updatedTable
+      // All non-DS tables are treated as regular Hive tables.
+      true
     }

-    try {
-      client.alterTable(tableToStore)
-    } catch {
-      case NonFatal(e) =>
-        val warningMessage =
-          s"Could not alter schema of table ${rawTable.identifier.quotedString} in a Hive " +
-            "compatible way. Updating Hive metastore in Spark SQL specific format."
-        logWarning(warningMessage, e)
-        client.alterTable(updatedTable.copy(schema = tableToStore.partitionSchema))
+    val updatedTable = if (isHiveCompatible) {
+      val _updated = rawTable.copy(properties = updatedProperties, schema = schema)
+      verifyColumnNames(_updated)
+      _updated
+    } else {
+      // If the table is not Hive-compatible, the schema of the table should not be overwritten
+      // with the updated schema. The previous value stored in the metastore should be preserved;
+      // that will be either the table's original partition schema, or a placeholder schema
+      // inserted by the Hive client wrapper if the partition schema was empty.
+      rawTable.copy(properties = updatedProperties)
     }
+
+    client.alterTable(updatedTable)
   }

   override def alterTableStats(
@@ -1202,6 +1240,7 @@ object HiveExternalCatalog {
   val DATASOURCE_SCHEMA_PARTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "partCol."
   val DATASOURCE_SCHEMA_BUCKETCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "bucketCol."
   val DATASOURCE_SCHEMA_SORTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "sortCol."
+  val DATASOURCE_HIVE_COMPATIBLE = SPARK_SQL_PREFIX + "hive.compatibility"

   val STATISTICS_PREFIX = SPARK_SQL_PREFIX + "statistics."
   val STATISTICS_TOTAL_SIZE = STATISTICS_PREFIX + "totalSize"
```
HiveClientImpl.scala

```diff
@@ -414,10 +414,7 @@ private[hive] class HiveClientImpl(
       unsupportedFeatures += "partitioned view"
     }

-    val properties = Option(h.getParameters).map(_.asScala.toMap).getOrElse(Map())
-
-    val provider = properties.get(HiveExternalCatalog.DATASOURCE_PROVIDER)
-      .orElse(Some(DDLUtils.HIVE_PROVIDER))
+    val properties = Option(h.getParameters).map(_.asScala.toMap).orNull

     // Hive-generated Statistics are also recorded in ignoredProperties
     val ignoredProperties = scala.collection.mutable.Map.empty[String, String]
@@ -472,7 +469,6 @@
         throw new AnalysisException("Hive index table is not supported.")
       },
       schema = schema,
-      provider = provider,
       partitionColumnNames = partCols.map(_.name),
       // If the table is written by Spark, we will put bucketing information in table properties,
       // and will always overwrite the bucket spec in hive metastore by the bucketing information
@@ -913,7 +909,13 @@
     }
     // after SPARK-19279, it is not allowed to create a hive table with an empty schema,
     // so here we should not add a default col schema
-    if (schema.isEmpty && DDLUtils.isDatasourceTable(table)) {
+    // Because HiveExternalCatalog sometimes writes back "raw" tables that have not been
+    // completely translated to Spark's view, the provider information needs to be looked
+    // up in two places.
+    val provider = table.provider.orElse(
+      table.properties.get(HiveExternalCatalog.DATASOURCE_PROVIDER))
+    if (schema.isEmpty && provider != Some(DDLUtils.HIVE_PROVIDER)) {
       // This is a hack to preserve existing behavior. Before Spark 2.0, we do not
       // set a default serde here (this was done in Hive), and so if the user provides
       // an empty schema Hive would automatically populate the schema with a single
```

Review comment (Member), on the `// after SPARK-19279 ...` comment above: It looks like this comment needs to move accordingly?

Review comment: This would be a good idea if we had done it from the first version. But now, for backward compatibility, we have to handle the case without this special flag on the read path anyway, so I can't see the point of having this flag.