Skip to content
Merged
6 changes: 6 additions & 0 deletions core/dbt/context/exceptions_jinja.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
PropertyYMLError,
NotImplementedError,
RelationWrongTypeError,
ContractError,
)


Expand Down Expand Up @@ -65,6 +66,10 @@ def raise_compiler_error(msg, node=None) -> NoReturn:
raise CompilationError(msg, node)


def raise_contract_error(yaml_columns, sql_columns) -> NoReturn:
raise ContractError(yaml_columns, sql_columns)


def raise_database_error(msg, node=None) -> NoReturn:
raise DbtDatabaseError(msg, node)

Expand Down Expand Up @@ -119,6 +124,7 @@ def relation_wrong_type(relation, expected_type, model=None) -> NoReturn:
raise_invalid_property_yml_version,
raise_not_implemented,
relation_wrong_type,
raise_contract_error,
]
}

Expand Down
16 changes: 16 additions & 0 deletions core/dbt/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2136,6 +2136,22 @@ def get_message(self) -> str:
return msg


class ContractError(CompilationError):
def __init__(self, yaml_columns, sql_columns):
self.yaml_columns = yaml_columns
self.sql_columns = sql_columns
super().__init__(msg=self.get_message())

def get_message(self) -> str:
msg = (
"Please ensure the name, data_type, and number of columns in your `yml` file "
"match the columns in your SQL file.\n"
f"Schema File Columns: {self.yaml_columns}\n"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be useful to sort these both alphabetically (message only) so that it's easier for the user to spot the difference.

f"SQL File Columns: {self.sql_columns}"
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Were you going to update this as related to #7147?

return msg


# not modifying these since rpc should be deprecated soon
class UnknownAsyncIDException(Exception):
CODE = 10012
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,31 +37,29 @@
{#--Obtain the column schema provided by the schema file by generating an 'empty schema' query from the model's columns. #}
{%- set schema_file_provided_columns = get_column_schema_from_query(get_empty_schema_sql(model['columns'])) -%}

{#-- Build compiler error msg #}
{%- set sql_file_provided_columns_formatted = format_columns(sql_file_provided_columns) -%}
{%- set schema_file_provided_columns_formatted = format_columns(schema_file_provided_columns) -%}
{%- set compiler_error_msg = 'Please ensure the name, data_type, and number of columns in your `yml` file match the columns in your SQL file.\nSchema File Columns: ' ~ (schema_file_provided_columns_formatted|trim) ~ '\n\nSQL File Columns: ' ~ (sql_file_provided_columns_formatted|trim) ~ ' ' -%}

{#-- For compiler error msg #}
{%- set sql_columns = (format_columns(sql_file_provided_columns)|trim) -%}
{%- set yaml_columns = (format_columns(schema_file_provided_columns)|trim) -%}

{%- if sql_file_provided_columns|length != schema_file_provided_columns|length -%}
{%- do exceptions.raise_compiler_error(compiler_error_msg) -%}
{%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%}
{%- endif -%}

{%- for sql_col in sql_file_provided_columns -%}
{%- set yaml_col = [] -%}
{%- for schema_col in schema_file_provided_columns -%}
{%- if schema_col.name == sql_col.name -%}
{%- do yaml_col.append(schema_col) -%}
{% break %}
{%- break -%}
{%- endif -%}
{%- endfor -%}
{%- if not yaml_col -%}
{#-- Column with name not found in yaml --#}
{%- do exceptions.raise_compiler_error(compiler_error_msg) -%}
{%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%}
{%- endif -%}
{%- if sql_col.dtype != yaml_col[0].dtype -%}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously column types were compared based on their formatted representation, which could be data platform specific as implemented by the <adapter>__format_column macro.

For example, BigQuery's format_column implementation compares data_type values by column.data_type as opposed to column.dtype in order to make comparisons on nested data types.

Strictly comparing SQL and yml data_type values by dtype would allow the following contract to be accepted in BigQuery:

SELECT
STRUCT("test" AS name, [1.0,2.0] AS laps) as some_struct
models:
  - name: test_schema
    config:
      contract: true
    columns:
      - name: some_struct
        data_type: STRUCT<name FLOAT64, laps STRING> #wrong! but accepted because dtype == STRUCT for both SQL and schema.yml

One workaround would be to do this comparison using format_column instead, i.e: adapter.dispatch('format_column', 'dbt')(sql_col) != adapter.dispatch('format_column', 'dbt')(yaml_col[0]). This would also ensure the comparison and error messaging are using consistent logic. An alternative would be to implement a default__compare_data_types macro to enable adapter-specific implementations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked at using format_column but the default implementation of that just uses the dtype. Is that different in the other adapters?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't actually see any implementations of format_column in the adapters.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I was on the wrong bigquery branch. Bigquery is the only adapter that implements it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we include the formatted column in the Column structures returned by "get_column_schema_from_query"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or concatenate the other parts of the column structure for comparison?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a lot of logic to put in a jinja template. Could we put this in a python function and then wrap that python function in this macro? I'm cutting corners, but this is what's in my head:

columns_spec_ddl.sql

{% macro assert_columns_equivalent(sql) %}

  {% set sql_schema = get_column_schema_from_query(sql) %}

  {% set model_contract = model_contract(model['columns']) %}

  {% do assert_schema_meets_contract(sql_schema, model_contract) %}

dbt/adapters/base/impl.py (for lack of a better spot)

# these are defined elsewhere, but look something like this
ModelContract = List[ColumnContract]
Schema = List[Column]


def model_contract(model) -> ModelContract:
    # I assume we have a way of creating a model contract from a `schema.yml` file
    return ModelContract(model)


def assert_schema_meets_contract(schema: Schema, model_contract: ModelContract)
    if len(schema) != len(model_contract):
        raise ContractError(msg)
    for schema_column, contract_column in zip(sorted(schema), sorted(model_contract)):
        try:
            assert schema_column.name == contract_column.name
            assert schema_column.dtype == contract_column.dtype
        except AssertionError:
            raise ContractError(msg)

I think the python version would be much easier to unit test.

{#-- Column data types don't match --#}
{%- do exceptions.raise_compiler_error(compiler_error_msg) -%}
{%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%}
{%- endif -%}
{%- endfor -%}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
select
{% for column in model['columns'] %}
{{ column }}{{ ", " if not loop.last }}
{%- endfor %}
{% endfor %}
from (
{{ sql }}
) as model_subq
Expand Down