diff --git a/.github/workflows/docker-feast-source.yml b/.github/workflows/docker-feast-source.yml new file mode 100644 index 00000000000000..cc5ebf943b5213 --- /dev/null +++ b/.github/workflows/docker-feast-source.yml @@ -0,0 +1,74 @@ +name: docker-feast-source docker +on: + push: + branches: + - master + paths-ignore: + - 'docs/**' + - '**.md' + pull_request: + branches: + - master + paths: + - 'metadata-ingestion/src/datahub/ingestion/source/feast_image/**' + - '.github/workflows/docker-feast-source.yml' + # NOTE: 'paths_ignore' is not a valid filter key ('paths-ignore'), and GitHub + # Actions does not allow combining 'paths' with 'paths-ignore' on one event; + # the 'paths' whitelist above already excludes '**.md' and '**.env' anyway. + release: + types: [published, edited] + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + publish: ${{ steps.publish.outputs.publish }} + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Compute Tag + id: tag + run: | + echo "GITHUB_REF: $GITHUB_REF" + SHORT_SHA=$(git rev-parse --short "$GITHUB_SHA") + echo "SHORT_SHA: $SHORT_SHA" + TAG=$(echo ${GITHUB_REF} | sed -e "s,refs/heads/master,latest\,${SHORT_SHA},g" -e 's,refs/tags/,,g' -e 's,refs/pull/\([0-9]*\).*,pr\1,g') + echo "tag=$TAG" + echo "::set-output name=tag::$TAG" + - name: Check whether publishing enabled + id: publish + env: + ENABLE_PUBLISH: ${{ secrets.ORG_DOCKER_PASSWORD }} + run: | + echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" + echo "::set-output name=publish::${{ env.ENABLE_PUBLISH != '' }}" + push_to_registries: + name: Build and Push Docker Image to Docker Hub + runs-on: ubuntu-latest + if: ${{ needs.setup.outputs.publish == 'true' }} + needs: setup + steps: + - name: Check out the repo + uses: actions/checkout@v2 + - name: Docker meta + id: docker_meta + uses: crazy-max/ghaction-docker-meta@v1 + with: + # list of Docker images to use as base name for tags + images: | + acryldata/datahub-ingestion-feast-wrapper + # add git short SHA as Docker tag + tag-custom: ${{ needs.setup.outputs.tag }} + tag-custom-only: true + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + 
username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.ORG_DOCKER_PASSWORD }} + - name: Build and Push image + uses: docker/build-push-action@v2 + with: + context: ./metadata-ingestion/src/datahub/ingestion/source/feast_image + tags: ${{ steps.docker_meta.outputs.tags }} + push: ${{ needs.setup.outputs.publish == 'true' }} diff --git a/gms/api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/gms/api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index d4174980301feb..807110e180970b 100644 --- a/gms/api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/gms/api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -842,10 +842,12 @@ "name" : "MLFeatureDataType", "namespace" : "com.linkedin.common", "doc" : "MLFeature Data Type", - "symbols" : [ "USELESS", "NOMINAL", "ORDINAL", "BINARY", "COUNT", "TIME", "INTERVAL", "IMAGE", "VIDEO", "AUDIO", "TEXT", "MAP", "SEQUENCE", "SET" ], + "symbols" : [ "USELESS", "NOMINAL", "ORDINAL", "BINARY", "COUNT", "TIME", "INTERVAL", "IMAGE", "VIDEO", "AUDIO", "TEXT", "MAP", "SEQUENCE", "SET", "CONTINUOUS", "BYTE", "UNKNOWN" ], "symbolDocs" : { "AUDIO" : "Audio Data", "BINARY" : "Binary data is discrete data that can be in only one of two categories — either yes or no, 1 or 0, off or on, etc", + "BYTE" : "Bytes data are binary-encoded values that can represent complex objects.", + "CONTINUOUS" : "Continuous data are made of uncountable values, often the result of a measurement such as height, weight, age etc.", "COUNT" : "Count data is discrete whole number data — no negative numbers here.\nCount data often has many small values, such as zero and one.", "IMAGE" : "Image Data", "INTERVAL" : "Interval data has equal spaces between the numbers and does not represent a temporal pattern.\nExamples include percentages, temperatures, and income.", @@ -856,6 +858,7 @@ "SET" : "Set Data Type ex: set, frozenset", "TEXT" : "Text Data", "TIME" : "Time data is a cyclical, repeating 
continuous form of data.\nThe relevant time features can be any period— daily, weekly, monthly, annual, etc.", + "UNKNOWN" : "Unknown data are data that we don't know the type for.", "USELESS" : "Useless data is unique, discrete data with no potential relationship with the outcome variable.\nA useless feature has high cardinality. An example would be bank account numbers that were generated randomly.", "VIDEO" : "Video Data" } @@ -2641,8 +2644,8 @@ "type" : "com.linkedin.common.Urn", "doc" : "Standardized platform urn for the model", "Searchable" : { - "boostScore" : 0.1, - "fieldType" : "TEXT_PARTIAL" + "addToFilters" : true, + "fieldType" : "URN" } }, { "name" : "name", @@ -3065,6 +3068,94 @@ "keyAspect" : "mlModelKey", "name" : "mlModel" } + }, { + "type" : "record", + "name" : "MLPrimaryKeySnapshot", + "fields" : [ { + "name" : "urn", + "type" : "com.linkedin.common.Urn", + "doc" : "URN for the entity the metadata snapshot is associated with." + }, { + "name" : "aspects", + "type" : { + "type" : "array", + "items" : { + "type" : "typeref", + "name" : "MLPrimaryKeyAspect", + "namespace" : "com.linkedin.metadata.aspect", + "doc" : "A union of all supported metadata aspects for a MLPrimaryKey", + "ref" : [ { + "type" : "record", + "name" : "MLPrimaryKeyKey", + "namespace" : "com.linkedin.metadata.key", + "doc" : "Key for an MLPrimaryKey", + "fields" : [ { + "name" : "featureNamespace", + "type" : "string", + "doc" : "Namespace for the primary key", + "Searchable" : { + "addToFilters" : true, + "fieldType" : "TEXT_PARTIAL" + } + }, { + "name" : "name", + "type" : "string", + "doc" : "Name of the primary key", + "Searchable" : { + "boostScore" : 8.0, + "enableAutocomplete" : true, + "fieldType" : "TEXT_PARTIAL" + } + } ], + "Aspect" : { + "name" : "mlPrimaryKeyKey" + } + }, { + "type" : "record", + "name" : "MLPrimaryKeyProperties", + "namespace" : "com.linkedin.ml.metadata", + "doc" : "Properties associated with a MLPrimaryKey", + "fields" : [ { + "name" : 
"description", + "type" : "string", + "doc" : "Documentation of the MLPrimaryKey", + "optional" : true + }, { + "name" : "dataType", + "type" : "com.linkedin.common.MLFeatureDataType", + "doc" : "Data Type of the MLPrimaryKey", + "optional" : true + }, { + "name" : "version", + "type" : "com.linkedin.common.VersionTag", + "doc" : "Version of the MLPrimaryKey", + "optional" : true + }, { + "name" : "sources", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "Source of the MLPrimaryKey", + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataset" ], + "name" : "DerivedFrom" + } + } + } ], + "Aspect" : { + "name" : "mlPrimaryKeyProperties" + } + }, "com.linkedin.common.Ownership", "com.linkedin.common.InstitutionalMemory", "com.linkedin.common.Status", "com.linkedin.common.Deprecation" ] + } + }, + "doc" : "The list of metadata aspects associated with the MLPrimaryKey. Depending on the use case, this can either be all, or a selection, of supported aspects." 
+ } ], + "Entity" : { + "keyAspect" : "mlPrimaryKeyKey", + "name" : "mlPrimaryKey" + } }, { "type" : "record", "name" : "MLFeatureSnapshot", @@ -3085,15 +3176,23 @@ "type" : "record", "name" : "MLFeatureKey", "namespace" : "com.linkedin.metadata.key", - "doc" : "Key for an ML model", + "doc" : "Key for an MLFeature", "fields" : [ { "name" : "featureNamespace", "type" : "string", - "doc" : "Namespace for the feature" + "doc" : "Namespace for the feature", + "Searchable" : { + "fieldType" : "TEXT_PARTIAL" + } }, { "name" : "name", "type" : "string", - "doc" : "Name of the feature" + "doc" : "Name of the feature", + "Searchable" : { + "boostScore" : 8.0, + "enableAutocomplete" : true, + "fieldType" : "TEXT_PARTIAL" + } } ], "Aspect" : { "name" : "mlFeatureKey" @@ -3118,6 +3217,20 @@ "type" : "com.linkedin.common.VersionTag", "doc" : "Version of the MLFeature", "optional" : true + }, { + "name" : "sources", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "Source of the MLFeature", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataset" ], + "name" : "DerivedFrom" + } + } } ], "Aspect" : { "name" : "mlFeatureProperties" @@ -3125,12 +3238,109 @@ }, "com.linkedin.common.Ownership", "com.linkedin.common.InstitutionalMemory", "com.linkedin.common.Status", "com.linkedin.common.Deprecation", "com.linkedin.common.BrowsePaths" ] } }, - "doc" : "The list of metadata aspects associated with the MLModel. Depending on the use case, this can either be all, or a selection, of supported aspects." + "doc" : "The list of metadata aspects associated with the MLFeature. Depending on the use case, this can either be all, or a selection, of supported aspects." 
} ], "Entity" : { "keyAspect" : "mlFeatureKey", "name" : "mlFeature" } + }, { + "type" : "record", + "name" : "MLFeatureTableSnapshot", + "fields" : [ { + "name" : "urn", + "type" : "com.linkedin.common.Urn", + "doc" : "URN for the entity the metadata snapshot is associated with." + }, { + "name" : "aspects", + "type" : { + "type" : "array", + "items" : { + "type" : "typeref", + "name" : "MLFeatureTableAspect", + "namespace" : "com.linkedin.metadata.aspect", + "doc" : "A union of all supported metadata aspects for a MLFeatureTable", + "ref" : [ { + "type" : "record", + "name" : "MLFeatureTableKey", + "namespace" : "com.linkedin.metadata.key", + "doc" : "Key for an MLFeatureTable", + "fields" : [ { + "name" : "platform", + "type" : "com.linkedin.common.Urn", + "doc" : "Data platform urn associated with the feature table", + "Relationship" : { + "entityTypes" : [ "dataPlatform" ], + "name" : "SourcePlatform" + }, + "Searchable" : { + "addToFilters" : true, + "fieldType" : "URN" + } + }, { + "name" : "name", + "type" : "string", + "doc" : "Name of the feature table", + "Searchable" : { + "boostScore" : 8.0, + "enableAutocomplete" : true, + "fieldType" : "TEXT_PARTIAL" + } + } ], + "Aspect" : { + "name" : "mlFeatureTableKey" + } + }, { + "type" : "record", + "name" : "MLFeatureTableProperties", + "namespace" : "com.linkedin.ml.metadata", + "doc" : "Properties associated with a MLFeatureTable", + "fields" : [ { + "name" : "description", + "type" : "string", + "doc" : "Documentation of the MLFeatureTable", + "optional" : true + }, { + "name" : "mlFeatures", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of features contained in the feature table", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "mlFeature" ], + "name" : "Contains" + } + } + }, { + "name" : "mlPrimaryKeys", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of primary keys in the feature table 
(if multiple, assumed to act as a composite key)", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "mlPrimaryKey" ], + "name" : "KeyedBy" + } + } + } ], + "Aspect" : { + "name" : "mlFeatureTableProperties" + } + }, "com.linkedin.common.Ownership", "com.linkedin.common.InstitutionalMemory", "com.linkedin.common.Status", "com.linkedin.common.Deprecation" ] + } + }, + "doc" : "The list of metadata aspects associated with the MLFeatureTable. Depending on the use case, this can either be all, or a selection, of supported aspects." + } ], + "Entity" : { + "keyAspect" : "mlFeatureTableKey", + "name" : "mlFeatureTable" + } }, { "type" : "record", "name" : "TagSnapshot", @@ -3346,7 +3556,7 @@ } ] } } ] - }, "com.linkedin.glossary.GlossaryNodeInfo", "com.linkedin.glossary.GlossaryTermInfo", "com.linkedin.identity.CorpGroupInfo", "com.linkedin.identity.CorpUserEditableInfo", "com.linkedin.identity.CorpUserInfo", "com.linkedin.metadata.aspect.ChartAspect", "com.linkedin.metadata.aspect.CorpGroupAspect", "com.linkedin.metadata.aspect.CorpUserAspect", "com.linkedin.metadata.aspect.DashboardAspect", "com.linkedin.metadata.aspect.DataFlowAspect", "com.linkedin.metadata.aspect.DataJobAspect", "com.linkedin.metadata.aspect.DataPlatformAspect", "com.linkedin.metadata.aspect.DataProcessAspect", "com.linkedin.metadata.aspect.DatasetAspect", "com.linkedin.metadata.aspect.GlossaryNodeAspect", "com.linkedin.metadata.aspect.GlossaryTermAspect", "com.linkedin.metadata.aspect.MLFeatureAspect", "com.linkedin.metadata.aspect.MLModelAspect", "com.linkedin.metadata.aspect.TagAspect", "com.linkedin.metadata.key.ChartKey", "com.linkedin.metadata.key.CorpGroupKey", "com.linkedin.metadata.key.CorpUserKey", "com.linkedin.metadata.key.DashboardKey", "com.linkedin.metadata.key.DataFlowKey", "com.linkedin.metadata.key.DataJobKey", "com.linkedin.metadata.key.DataPlatformKey", "com.linkedin.metadata.key.DataProcessKey", "com.linkedin.metadata.key.DatasetKey", 
"com.linkedin.metadata.key.GlossaryNodeKey", "com.linkedin.metadata.key.GlossaryTermKey", "com.linkedin.metadata.key.MLFeatureKey", "com.linkedin.metadata.key.MLModelKey", "com.linkedin.metadata.key.TagKey", { + }, "com.linkedin.glossary.GlossaryNodeInfo", "com.linkedin.glossary.GlossaryTermInfo", "com.linkedin.identity.CorpGroupInfo", "com.linkedin.identity.CorpUserEditableInfo", "com.linkedin.identity.CorpUserInfo", "com.linkedin.metadata.aspect.ChartAspect", "com.linkedin.metadata.aspect.CorpGroupAspect", "com.linkedin.metadata.aspect.CorpUserAspect", "com.linkedin.metadata.aspect.DashboardAspect", "com.linkedin.metadata.aspect.DataFlowAspect", "com.linkedin.metadata.aspect.DataJobAspect", "com.linkedin.metadata.aspect.DataPlatformAspect", "com.linkedin.metadata.aspect.DataProcessAspect", "com.linkedin.metadata.aspect.DatasetAspect", "com.linkedin.metadata.aspect.GlossaryNodeAspect", "com.linkedin.metadata.aspect.GlossaryTermAspect", "com.linkedin.metadata.aspect.MLFeatureAspect", "com.linkedin.metadata.aspect.MLFeatureTableAspect", "com.linkedin.metadata.aspect.MLModelAspect", "com.linkedin.metadata.aspect.MLPrimaryKeyAspect", "com.linkedin.metadata.aspect.TagAspect", "com.linkedin.metadata.key.ChartKey", "com.linkedin.metadata.key.CorpGroupKey", "com.linkedin.metadata.key.CorpUserKey", "com.linkedin.metadata.key.DashboardKey", "com.linkedin.metadata.key.DataFlowKey", "com.linkedin.metadata.key.DataJobKey", "com.linkedin.metadata.key.DataPlatformKey", "com.linkedin.metadata.key.DataProcessKey", "com.linkedin.metadata.key.DatasetKey", "com.linkedin.metadata.key.GlossaryNodeKey", "com.linkedin.metadata.key.GlossaryTermKey", "com.linkedin.metadata.key.MLFeatureKey", "com.linkedin.metadata.key.MLFeatureTableKey", "com.linkedin.metadata.key.MLModelKey", "com.linkedin.metadata.key.MLPrimaryKeyKey", "com.linkedin.metadata.key.TagKey", { "type" : "record", "name" : "AggregationMetadata", "namespace" : "com.linkedin.metadata.query", @@ -3606,7 +3816,7 @@ }, "doc" : "The 
order to sort the results i.e. ASCENDING or DESCENDING" } ] - }, "com.linkedin.metadata.query.SortOrder", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", "com.linkedin.schema.ArrayType", "com.linkedin.schema.BinaryJsonSchema", "com.linkedin.schema.BooleanType", "com.linkedin.schema.BytesType", "com.linkedin.schema.DatasetFieldForeignKey", "com.linkedin.schema.DateType", "com.linkedin.schema.EditableSchemaFieldInfo", 
"com.linkedin.schema.EditableSchemaMetadata", "com.linkedin.schema.EnumType", "com.linkedin.schema.EspressoSchema", "com.linkedin.schema.FixedType", "com.linkedin.schema.ForeignKeySpec", "com.linkedin.schema.KafkaSchema", "com.linkedin.schema.KeyValueSchema", "com.linkedin.schema.MapType", "com.linkedin.schema.MySqlDDL", "com.linkedin.schema.NullType", "com.linkedin.schema.NumberType", "com.linkedin.schema.OracleDDL", "com.linkedin.schema.OrcSchema", "com.linkedin.schema.OtherSchema", "com.linkedin.schema.PrestoDDL", "com.linkedin.schema.RecordType", "com.linkedin.schema.SchemaField", "com.linkedin.schema.SchemaFieldDataType", "com.linkedin.schema.SchemaMetadata", "com.linkedin.schema.SchemaMetadataKey", "com.linkedin.schema.Schemaless", "com.linkedin.schema.StringType", "com.linkedin.schema.TimeType", "com.linkedin.schema.UnionType", "com.linkedin.schema.UrnForeignKey", "com.linkedin.tag.TagProperties" ], + }, "com.linkedin.metadata.query.SortOrder", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", 
"com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", "com.linkedin.schema.ArrayType", "com.linkedin.schema.BinaryJsonSchema", "com.linkedin.schema.BooleanType", "com.linkedin.schema.BytesType", "com.linkedin.schema.DatasetFieldForeignKey", "com.linkedin.schema.DateType", "com.linkedin.schema.EditableSchemaFieldInfo", "com.linkedin.schema.EditableSchemaMetadata", "com.linkedin.schema.EnumType", "com.linkedin.schema.EspressoSchema", "com.linkedin.schema.FixedType", "com.linkedin.schema.ForeignKeySpec", "com.linkedin.schema.KafkaSchema", "com.linkedin.schema.KeyValueSchema", "com.linkedin.schema.MapType", "com.linkedin.schema.MySqlDDL", "com.linkedin.schema.NullType", "com.linkedin.schema.NumberType", "com.linkedin.schema.OracleDDL", "com.linkedin.schema.OrcSchema", "com.linkedin.schema.OtherSchema", "com.linkedin.schema.PrestoDDL", "com.linkedin.schema.RecordType", "com.linkedin.schema.SchemaField", "com.linkedin.schema.SchemaFieldDataType", "com.linkedin.schema.SchemaMetadata", "com.linkedin.schema.SchemaMetadataKey", "com.linkedin.schema.Schemaless", "com.linkedin.schema.StringType", "com.linkedin.schema.TimeType", "com.linkedin.schema.UnionType", "com.linkedin.schema.UrnForeignKey", "com.linkedin.tag.TagProperties" ], 
"schema" : { "name" : "entities", "namespace" : "com.linkedin.entity", diff --git a/gms/api/src/main/snapshot/com.linkedin.ml.mlModels.snapshot.json b/gms/api/src/main/snapshot/com.linkedin.ml.mlModels.snapshot.json index a1d1ca5bd502c1..356c3e029a0263 100644 --- a/gms/api/src/main/snapshot/com.linkedin.ml.mlModels.snapshot.json +++ b/gms/api/src/main/snapshot/com.linkedin.ml.mlModels.snapshot.json @@ -481,8 +481,8 @@ "type" : "com.linkedin.common.Urn", "doc" : "Standardized platform urn for the model", "Searchable" : { - "boostScore" : 0.1, - "fieldType" : "TEXT_PARTIAL" + "addToFilters" : true, + "fieldType" : "URN" } }, { "name" : "name", diff --git a/li-utils/src/main/pegasus/com/linkedin/common/MLFeatureDataType.pdl b/li-utils/src/main/pegasus/com/linkedin/common/MLFeatureDataType.pdl index 86b142b3fcdc77..23ebc3aa355984 100644 --- a/li-utils/src/main/pegasus/com/linkedin/common/MLFeatureDataType.pdl +++ b/li-utils/src/main/pegasus/com/linkedin/common/MLFeatureDataType.pdl @@ -80,4 +80,19 @@ enum MLFeatureDataType { * Set Data Type ex: set, frozenset */ SET + + /** + * Continuous data are made of uncountable values, often the result of a measurement such as height, weight, age etc. + */ + CONTINUOUS + + /** + * Bytes data are binary-encoded values that can represent complex objects. + */ + BYTE + + /** + * Unknown data are data that we don't know the type for. 
+ */ + UNKNOWN } diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index a7561249fe94cb..e829e4198224f4 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -37,6 +37,7 @@ We use a plugin architecture so that you can install only the dependencies you a | console | _included by default_ | Console sink | | athena | `pip install 'acryl-datahub[athena]'` | AWS Athena source | | bigquery | `pip install 'acryl-datahub[bigquery]'` | BigQuery source | +| feast | `pip install 'acryl-datahub[feast]'` | Feast source | | glue | `pip install 'acryl-datahub[glue]'` | AWS Glue source | | hive | `pip install 'acryl-datahub[hive]'` | Hive source | | mssql | `pip install 'acryl-datahub[mssql]'` | SQL Server source | @@ -401,6 +402,27 @@ source: # options is same as above ``` +### Feast `feast` + +**Note: Feast ingestion requires Docker to be installed.** + +Extracts: + +- List of feature tables (modeled as `MLFeatureTable`s), features (`MLFeature`s), and entities (`MLPrimaryKey`s) +- Column types associated with each feature and entity + +Note: this uses a separate Docker container to extract Feast's metadata into a JSON file, which is then +parsed to DataHub's native objects. This was done because of a dependency conflict in the `feast` module. 
+ +```yml +source: + type: feast + config: + core_url: localhost:6565 # default + env: "PROD" # Optional, default is "PROD" + use_local_build: False # Whether to build Feast ingestion image locally, default is False +``` + ### Google BigQuery `bigquery` Extracts: diff --git a/metadata-ingestion/examples/recipes/feast_to_datahub.yml b/metadata-ingestion/examples/recipes/feast_to_datahub.yml new file mode 100644 index 00000000000000..2eb169631a59a8 --- /dev/null +++ b/metadata-ingestion/examples/recipes/feast_to_datahub.yml @@ -0,0 +1,11 @@ +source: + type: feast + config: + core_url: localhost:6565 # default + env: "PROD" # Optional, default is "PROD" + use_local_build: False # Whether to build Feast ingestion image locally, default is False + +sink: + type: "datahub-rest" + config: + server: "http://localhost:8080" diff --git a/metadata-ingestion/scripts/update_golden_files.sh b/metadata-ingestion/scripts/update_golden_files.sh index 3e3ed6500c882c..11279d7bfd2283 100755 --- a/metadata-ingestion/scripts/update_golden_files.sh +++ b/metadata-ingestion/scripts/update_golden_files.sh @@ -12,6 +12,7 @@ cp tmp/test_ldap_ingest0/ldap_mces.json tests/integration/ldap/ldap_mce_golden.j cp tmp/test_mysql_ingest0/mysql_mces.json tests/integration/mysql/mysql_mce_golden.json cp tmp/test_mssql_ingest0/mssql_mces.json tests/integration/sql_server/mssql_mce_golden.json cp tmp/test_mongodb_ingest0/mongodb_mces.json tests/integration/mongodb/mongodb_mce_golden.json +cp tmp/test_feast_ingest0/feast_mces.json tests/integration/feast/feast_mce_golden.json # Print success message. 
set +x diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index c70025ff596161..3e0716aa1dae6a 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -70,26 +70,27 @@ def get_long_description(): "sqlalchemy": sql_common, "athena": sql_common | {"PyAthena[SQLAlchemy]"}, "bigquery": sql_common | {"pybigquery >= 0.6.0"}, + "druid": sql_common | {"pydruid>=0.6.2"}, + "feast": {"docker"}, + "glue": {"boto3"}, "hive": sql_common | { # Acryl Data maintains a fork of PyHive, which adds support for table comments # and column comments, and also releases HTTP and HTTPS transport schemes. "acryl-pyhive[hive]>=0.6.6" }, + "ldap": {"python-ldap>=2.4"}, + "looker": {"looker-sdk==21.6.0"}, + "lookml": {"lkml>=1.1.0", "sql-metadata==1.12.0"}, + "mongodb": {"pymongo>=3.11"}, "mssql": sql_common | {"sqlalchemy-pytds>=0.3"}, "mssql-odbc": sql_common | {"pyodbc"}, "mysql": sql_common | {"pymysql>=1.0.2"}, + "oracle": sql_common | {"cx_Oracle"}, "postgres": sql_common | {"psycopg2-binary", "GeoAlchemy2"}, "redshift": sql_common | {"sqlalchemy-redshift", "psycopg2-binary", "GeoAlchemy2"}, "snowflake": sql_common | {"snowflake-sqlalchemy"}, - "oracle": sql_common | {"cx_Oracle"}, - "ldap": {"python-ldap>=2.4"}, - "looker": {"looker-sdk==21.6.0"}, - "lookml": {"lkml>=1.1.0", "sql-metadata==1.12.0"}, - "druid": sql_common | {"pydruid>=0.6.2"}, - "mongodb": {"pymongo>=3.11"}, "superset": {"requests"}, - "glue": {"boto3"}, } all_exclude_plugins: Set[str] = { @@ -140,6 +141,7 @@ def get_long_description(): "mysql", "mssql", "mongodb", + "feast", "ldap", "looker", "glue", @@ -178,6 +180,7 @@ def get_long_description(): "bigquery = datahub.ingestion.source.bigquery:BigQuerySource", "dbt = datahub.ingestion.source.dbt:DBTSource", "druid = datahub.ingestion.source.druid:DruidSource", + "feast = datahub.ingestion.source.feast:FeastSource", "glue = datahub.ingestion.source.glue:GlueSource", "hive = datahub.ingestion.source.hive:HiveSource", "kafka = 
datahub.ingestion.source.kafka:KafkaSource", diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index abfa9ce1ff409b..d25f62b3795a55 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -52,6 +52,26 @@ def make_data_job_urn( ) +def make_ml_primary_key_urn(feature_table_name: str, primary_key_name: str) -> str: + + return f"urn:li:mlPrimaryKey:({feature_table_name},{primary_key_name})" + + +def make_ml_feature_urn( + feature_table_name: str, + feature_name: str, +) -> str: + + return f"urn:li:mlFeature:({feature_table_name},{feature_name})" + + +def make_ml_feature_table_urn(platform: str, feature_table_name: str) -> str: + + return ( + f"urn:li:mlFeatureTable:(urn:li:dataPlatform:{platform},{feature_table_name})" + ) + + def make_lineage_mce( upstream_urns: List[str], downstream_urn: str, diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast.py b/metadata-ingestion/src/datahub/ingestion/source/feast.py new file mode 100644 index 00000000000000..52ad9b099f918b --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/feast.py @@ -0,0 +1,305 @@ +import json +import os +import tempfile +from dataclasses import dataclass, field +from shlex import quote +from typing import Dict, Iterable, List + +import docker + +import datahub.emitter.mce_builder as builder +from datahub.configuration.common import ConfigModel +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.source import Source, SourceReport +from datahub.ingestion.source.metadata_common import MetadataWorkUnit +from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType +from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import ( + MLFeatureSnapshot, + MLFeatureTableSnapshot, + MLPrimaryKeySnapshot, +) +from datahub.metadata.com.linkedin.pegasus2avro.mxe import 
MetadataChangeEvent +from datahub.metadata.schema_classes import ( + MLFeaturePropertiesClass, + MLFeatureTablePropertiesClass, + MLPrimaryKeyPropertiesClass, +) + +# map Feast types to DataHub classes +_field_type_mapping: Dict[str, str] = { + "BYTES": MLFeatureDataType.BYTE, + "STRING": MLFeatureDataType.TEXT, + "INT32": MLFeatureDataType.ORDINAL, + "INT64": MLFeatureDataType.ORDINAL, + "DOUBLE": MLFeatureDataType.CONTINUOUS, + "FLOAT": MLFeatureDataType.CONTINUOUS, + "BOOL": MLFeatureDataType.BINARY, + "UNIX_TIMESTAMP": MLFeatureDataType.TIME, + "BYTES_LIST": MLFeatureDataType.SEQUENCE, + "STRING_LIST": MLFeatureDataType.SEQUENCE, + "INT32_LIST": MLFeatureDataType.SEQUENCE, + "INT64_LIST": MLFeatureDataType.SEQUENCE, + "DOUBLE_LIST": MLFeatureDataType.SEQUENCE, + "FLOAT_LIST": MLFeatureDataType.SEQUENCE, + "BOOL_LIST": MLFeatureDataType.SEQUENCE, + "UNIX_TIMESTAMP_LIST": MLFeatureDataType.SEQUENCE, +} + +DEFAULT_ENV = "PROD" + +# image to use for initial feast extraction +HOSTED_FEAST_IMAGE = "acryldata/datahub-ingestion-feast-wrapper" + + +class FeastConfig(ConfigModel): + core_url: str = "localhost:6565" + env: str = DEFAULT_ENV + use_local_build: bool = False + + +@dataclass +class FeastSourceReport(SourceReport): + filtered: List[str] = field(default_factory=list) + + def report_dropped(self, name: str) -> None: + self.filtered.append(name) + + +@dataclass +class FeastSource(Source): + config: FeastConfig + report: FeastSourceReport + + def __init__(self, ctx: PipelineContext, config: FeastConfig): + super().__init__(ctx) + self.config = config + self.report = FeastSourceReport() + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> "FeastSource": + config = FeastConfig.parse_obj(config_dict) + return cls(ctx, config) + + def get_field_type(self, field_type: str, parent_name: str) -> str: + """ + Maps types encountered in Feast to corresponding schema types. 
+ + Parameters + ---------- + field_type: + type of a Feast object + parent_name: + name of table (for logging) + """ + enum_type = _field_type_mapping.get(field_type) + + if enum_type is None: + self.report.report_warning( + parent_name, f"unable to map type {field_type} to metadata schema" + ) + enum_type = MLFeatureDataType.UNKNOWN + + return enum_type + + def get_entity_wu(self, ingest_table, ingest_entity): + """ + Generate an MLPrimaryKey workunit for a Feast entity. + + Parameters + ---------- + ingest_table: + ingested Feast table + ingest_entity: + ingested Feast entity + """ + + # create snapshot instance for the entity + entity_snapshot = MLPrimaryKeySnapshot( + urn=builder.make_ml_primary_key_urn( + ingest_table["name"], ingest_entity["name"] + ), + aspects=[], + ) + + entity_sources = [] + + if ingest_entity["batch_source"] is not None: + entity_sources.append( + builder.make_dataset_urn( + ingest_entity["batch_source_platform"], + ingest_entity["batch_source_name"], + self.config.env, + ) + ) + + if ingest_entity["stream_source"] is not None: + entity_sources.append( + builder.make_dataset_urn( + ingest_entity["stream_source_platform"], + ingest_entity["stream_source_name"], + self.config.env, + ) + ) + + # append entity name and type + entity_snapshot.aspects.append( + MLPrimaryKeyPropertiesClass( + description=ingest_entity["description"], + dataType=self.get_field_type( + ingest_entity["type"], ingest_entity["name"] + ), + sources=entity_sources, + ) + ) + + # make the MCE and workunit + mce = MetadataChangeEvent(proposedSnapshot=entity_snapshot) + return MetadataWorkUnit(id=ingest_entity["name"], mce=mce) + + def get_feature_wu(self, ingest_table, ingest_feature): + """ + Generate an MLFeature workunit for a Feast feature. 
+ + Parameters + ---------- + ingest_table: + ingested Feast table + ingest_feature: + ingested Feast feature + """ + + # create snapshot instance for the feature + feature_snapshot = MLFeatureSnapshot( + urn=builder.make_ml_feature_urn( + ingest_table["name"], ingest_feature["name"] + ), + aspects=[], + ) + + feature_sources = [] + + if ingest_feature["batch_source"] is not None: + feature_sources.append( + builder.make_dataset_urn( + ingest_feature["batch_source_platform"], + ingest_feature["batch_source_name"], + self.config.env, + ) + ) + + if ingest_feature["stream_source"] is not None: + feature_sources.append( + builder.make_dataset_urn( + ingest_feature["stream_source_platform"], + ingest_feature["stream_source_name"], + self.config.env, + ) + ) + + # append feature name and type + feature_snapshot.aspects.append( + MLFeaturePropertiesClass( + dataType=self.get_field_type( + ingest_feature["type"], ingest_feature["name"] + ), + sources=feature_sources, + ) + ) + + # make the MCE and workunit + mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot) + return MetadataWorkUnit(id=ingest_feature["name"], mce=mce) + + def get_feature_table_wu(self, ingest_table): + """ + Generate an MLFeatureTable workunit for a Feast feature table. 
+ + Parameters + ---------- + ingest_table: + ingested Feast table + """ + + featuretable_snapshot = MLFeatureTableSnapshot( + urn=builder.make_ml_feature_table_urn("feast", ingest_table["name"]), + aspects=[], + ) + + featuretable_snapshot.aspects.append( + MLFeatureTablePropertiesClass( + mlFeatures=[ + builder.make_ml_feature_urn( + ingest_table["name"], + feature["name"], + ) + for feature in ingest_table["features"] + ], + # a feature table can have multiple primary keys, which then act as a composite key + mlPrimaryKeys=[ + builder.make_ml_primary_key_urn( + ingest_table["name"], entity["name"] + ) + for entity in ingest_table["entities"] + ], + ) + ) + + # make the MCE and workunit + mce = MetadataChangeEvent(proposedSnapshot=featuretable_snapshot) + return MetadataWorkUnit(id=ingest_table["name"], mce=mce) + + def get_workunits(self) -> Iterable[MetadataWorkUnit]: + with tempfile.NamedTemporaryFile(suffix=".json") as tf: + + docker_client = docker.from_env() + + feast_image = HOSTED_FEAST_IMAGE + + # build the image locally if specified + if self.config.use_local_build: + dirname = os.path.dirname(__file__) + image_directory = os.path.join(dirname, "feast_image/") + + image, _ = docker_client.images.build(path=image_directory) + + feast_image = image.id + + docker_client.containers.run( + feast_image, + f"python3 ingest.py --core_url={quote(self.config.core_url)} --output_path=/out.json", + # allow the image to access the core URL if on host + network_mode="host", + # mount the tempfile so the Docker image has access + volumes={ + tf.name: {"bind": "/out.json", "mode": "rw"}, + }, + ) + + ingest = json.load(tf) + + # ingest tables + for ingest_table in ingest: + + # ingest entities in table + for ingest_entity in ingest_table["entities"]: + + wu = self.get_entity_wu(ingest_table, ingest_entity) + self.report.report_workunit(wu) + yield wu + + # ingest features in table + for ingest_feature in ingest_table["features"]: + + wu = 
self.get_feature_wu(ingest_table, ingest_feature) + self.report.report_workunit(wu) + yield wu + + wu = self.get_feature_table_wu(ingest_table) + self.report.report_workunit(wu) + yield wu + + def get_report(self) -> FeastSourceReport: + return self.report + + def close(self): + return diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast_image/Dockerfile b/metadata-ingestion/src/datahub/ingestion/source/feast_image/Dockerfile new file mode 100644 index 00000000000000..0f02a704dbe83a --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/feast_image/Dockerfile @@ -0,0 +1,19 @@ +# Use the official lightweight Python image. +# https://hub.docker.com/_/python +FROM python:3.8-slim-buster + +# Allow statements and log messages to immediately appear +ENV PYTHONUNBUFFERED True +# Disable since it will slow things down +ENV FEAST_TELEMETRY False + +# Copy local code to the container image. +ENV APP_HOME /app +WORKDIR $APP_HOME +COPY requirements.txt ./ + +# install dependencies +RUN python -m pip install -r requirements.txt + +# Copy everything else +COPY . 
./ \ No newline at end of file diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast_image/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/feast_image/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast_image/ingest.py b/metadata-ingestion/src/datahub/ingestion/source/feast_image/ingest.py new file mode 100644 index 00000000000000..7c6371f61d4ed8 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/feast_image/ingest.py @@ -0,0 +1,138 @@ +import json + +import click +from feast import Client +from feast.data_source import BigQuerySource, FileSource, KafkaSource, KinesisSource + + +@click.command( + context_settings=dict( + ignore_unknown_options=True, + allow_extra_args=True, + ) +) +@click.option("--core_url", required=True, type=str, help="Feast core URL") +@click.option( + "--output_path", + required=False, + default=None, + type=str, + help="Path to write output JSON file to", +) +def cli(core_url, output_path): + + client = Client(core_url=core_url) + + tables = client.list_feature_tables() + + # sort tables by name for consistent outputs + tables = sorted(tables, key=lambda x: x.name) + + parsed_tables = [] + + for table in tables: + + # sort entities by name for consistent outputs + entities = sorted(table.entities) + + batch_source = None + stream_source = None + + # platform and name for constructing URN later on + batch_source_platform = "unknown" + stream_source_platform = "unknown" + batch_source_name = "unknown" + stream_source_name = "unknown" + + if isinstance(table.batch_source, BigQuerySource): + batch_source = "BigQuerySource" + batch_source_platform = "bigquery" + batch_source_name = table.batch_source.bigquery_options.table_ref + + if isinstance(table.batch_source, FileSource): + batch_source = "FileSource" + batch_source_platform = "file" + + # replace slashes because the react frontend can't parse them correctly + 
batch_source_name = table.batch_source.file_options.file_url.replace( + "/", "." + ) + + # replace redundant file prefix + if batch_source_name.startswith("file:.."): + batch_source_name = batch_source_name[7:] + + if isinstance(table.stream_source, KafkaSource): + stream_source = "KafkaSource" + stream_source_platform = "kafka" + stream_source_name = table.stream_source.kafka_options.topic + + if isinstance(table.stream_source, KinesisSource): + stream_source = "KinesisSource" + stream_source_platform = "kinesis" + stream_source_name = f"{table.stream_source.kinesis_options.region}-{table.stream_source.kinesis_options.stream_name}" + + # currently unused in MCE outputs, but useful for debugging + stream_source_config = table.to_dict()["spec"].get("streamSource") + batch_source_config = table.to_dict()["spec"]["batchSource"] + + raw_entities = [ + client.get_entity(entity_name) for entity_name in table.entities + ] + raw_entities = sorted(raw_entities, key=lambda x: x.name) + + source_info = { + "batch_source": batch_source, + "stream_source": stream_source, + "batch_source_config": batch_source_config, + "stream_source_config": stream_source_config, + "batch_source_platform": batch_source_platform, + "stream_source_platform": stream_source_platform, + "batch_source_name": batch_source_name, + "stream_source_name": stream_source_name, + } + + # sort entities by name for consistent outputs + entities = sorted( + [ + { + "name": x.name, + "type": x.value_type.name, + "description": x.description, + **source_info, + } + for x in raw_entities + ], + key=lambda x: x["name"], + ) + + # sort features by name for consistent outputs + features = sorted( + [ + {"name": x.name, "type": x.dtype.name, **source_info} + for x in table.features + ], + key=lambda x: x["name"], + ) + + parsed_tables.append( + { + "name": table.name, + "entities": entities, + "features": features, + } + ) + + if output_path is not None: + + with open(output_path, "w") as f: + json.dump(parsed_tables, 
f) + + else: + + print(parsed_tables) + + +if __name__ == "__main__": + + cli() diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast_image/requirements.txt b/metadata-ingestion/src/datahub/ingestion/source/feast_image/requirements.txt new file mode 100644 index 00000000000000..1c41862b0b9a7b --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/feast_image/requirements.txt @@ -0,0 +1,39 @@ +attrs==21.2.0; python_full_version >= "3.7.0" +cachetools==4.2.2; python_version >= "3.5" and python_version < "4.0" and python_full_version >= "3.7.0" +certifi==2020.12.5; python_full_version >= "3.7.0" +chardet==4.0.0; python_full_version >= "3.7.0" +click==7.1.2; python_full_version >= "3.7.0" +colorama==0.4.4; python_full_version >= "3.7.0" +fastavro==0.22.13; python_full_version >= "3.7.0" +feast==0.10.5; python_full_version >= "3.7.0" +google-api-core==1.28.0; python_full_version >= "3.7.0" +google-auth==1.30.1; python_full_version >= "3.7.0" +googleapis-common-protos==1.52.0; python_full_version >= "3.7.0" +grpcio==1.38.0; python_full_version >= "3.7.0" +idna==2.10; python_full_version >= "3.7.0" +jinja2==3.0.1; python_version >= "3.6" and python_full_version >= "3.7.0" +jsonschema==3.2.0; python_full_version >= "3.7.0" +markupsafe==2.0.1; python_version >= "3.6" and python_full_version >= "3.7.0" +mmh3==3.0.0; python_full_version >= "3.7.0" +numpy==1.20.3; python_version >= "3.7" and python_full_version >= "3.7.1" +packaging==20.9; python_full_version >= "3.7.0" +pandas==1.2.4; python_full_version >= "3.7.1" +pandavro==1.5.2; python_full_version >= "3.7.0" +protobuf==3.17.1; python_full_version >= "3.7.0" +pyarrow==4.0.0; python_version >= "3.6" and python_full_version >= "3.7.0" +pyasn1-modules==0.2.8; python_full_version >= "3.7.0" +pyasn1==0.4.8; python_version >= "3.6" and python_version < "4" and python_full_version >= "3.7.0" +pydantic==1.8.2; python_full_version >= "3.7.0" +pyparsing==2.4.7; python_full_version >= "3.7.0" 
+pyrsistent==0.17.3; python_version >= "3.5" and python_full_version >= "3.7.0" +python-dateutil==2.8.1; python_full_version >= "3.7.1" +pytz==2021.1; python_full_version >= "3.7.1" +pyyaml==5.3.1; python_full_version >= "3.7.0" +requests==2.25.1; python_full_version >= "3.7.0" +rsa==4.7.2; python_version >= "3.6" and python_version < "4" and python_full_version >= "3.7.0" +six==1.16.0; python_full_version >= "3.7.1" +tabulate==0.8.9; python_full_version >= "3.7.0" +toml==0.10.2; python_full_version >= "3.7.0" +tqdm==4.61.0; python_full_version >= "3.7.0" +typing-extensions==3.10.0.0; python_full_version >= "3.7.0" +urllib3==1.26.4; python_full_version >= "3.7.0" and python_version < "4" diff --git a/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py b/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py index 55840ba8a5f1f7..48353e186fbf72 100644 --- a/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +++ b/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py @@ -10,7 +10,9 @@ from ......schema_classes import GlossaryNodeKeyClass from ......schema_classes import GlossaryTermKeyClass from ......schema_classes import MLFeatureKeyClass +from ......schema_classes import MLFeatureTableKeyClass from ......schema_classes import MLModelKeyClass +from ......schema_classes import MLPrimaryKeyKeyClass from ......schema_classes import TagKeyClass @@ -26,5 +28,7 @@ GlossaryNodeKey = GlossaryNodeKeyClass GlossaryTermKey = GlossaryTermKeyClass MLFeatureKey = MLFeatureKeyClass +MLFeatureTableKey = MLFeatureTableKeyClass MLModelKey = MLModelKeyClass +MLPrimaryKeyKey = MLPrimaryKeyKeyClass TagKey = TagKeyClass diff --git a/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/metadata/snapshot/__init__.py b/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/metadata/snapshot/__init__.py index 
68864b1cb63f28..9da58c6eba3cf5 100644 --- a/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/metadata/snapshot/__init__.py +++ b/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/metadata/snapshot/__init__.py @@ -10,7 +10,9 @@ from ......schema_classes import GlossaryNodeSnapshotClass from ......schema_classes import GlossaryTermSnapshotClass from ......schema_classes import MLFeatureSnapshotClass +from ......schema_classes import MLFeatureTableSnapshotClass from ......schema_classes import MLModelSnapshotClass +from ......schema_classes import MLPrimaryKeySnapshotClass from ......schema_classes import TagSnapshotClass @@ -26,5 +28,7 @@ GlossaryNodeSnapshot = GlossaryNodeSnapshotClass GlossaryTermSnapshot = GlossaryTermSnapshotClass MLFeatureSnapshot = MLFeatureSnapshotClass +MLFeatureTableSnapshot = MLFeatureTableSnapshotClass MLModelSnapshot = MLModelSnapshotClass +MLPrimaryKeySnapshot = MLPrimaryKeySnapshotClass TagSnapshot = TagSnapshotClass diff --git a/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py b/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py index 4055b9a91012d8..b8ffc51e21100e 100644 --- a/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +++ b/metadata-ingestion/src/datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py @@ -6,9 +6,11 @@ from ......schema_classes import IntendedUseClass from ......schema_classes import IntendedUserTypeClass from ......schema_classes import MLFeaturePropertiesClass +from ......schema_classes import MLFeatureTablePropertiesClass from ......schema_classes import MLModelFactorPromptsClass from ......schema_classes import MLModelFactorsClass from ......schema_classes import MLModelPropertiesClass +from ......schema_classes import MLPrimaryKeyPropertiesClass from ......schema_classes import MetricsClass from ......schema_classes import 
QuantitativeAnalysesClass from ......schema_classes import SourceCodeClass @@ -25,9 +27,11 @@ IntendedUse = IntendedUseClass IntendedUserType = IntendedUserTypeClass MLFeatureProperties = MLFeaturePropertiesClass +MLFeatureTableProperties = MLFeatureTablePropertiesClass MLModelFactorPrompts = MLModelFactorPromptsClass MLModelFactors = MLModelFactorsClass MLModelProperties = MLModelPropertiesClass +MLPrimaryKeyProperties = MLPrimaryKeyPropertiesClass Metrics = MetricsClass QuantitativeAnalyses = QuantitativeAnalysesClass SourceCode = SourceCodeClass diff --git a/metadata-ingestion/src/datahub/metadata/schema.avsc b/metadata-ingestion/src/datahub/metadata/schema.avsc index 52f89a291e8c15..76be1563a4a72c 100644 --- a/metadata-ingestion/src/datahub/metadata/schema.avsc +++ b/metadata-ingestion/src/datahub/metadata/schema.avsc @@ -2853,8 +2853,8 @@ "type": "string", "doc": "Standardized platform urn for the model", "Searchable": { - "boostScore": 0.1, - "fieldType": "TEXT_PARTIAL" + "addToFilters": true, + "fieldType": "URN" }, "java": { "class": "com.linkedin.pegasus2avro.common.urn.Urn" @@ -3577,7 +3577,7 @@ }, { "type": "record", - "name": "MLFeatureSnapshot", + "name": "MLPrimaryKeySnapshot", "namespace": "com.linkedin.pegasus2avro.metadata.snapshot", "fields": [ { @@ -3585,7 +3585,7 @@ "type": "string", "doc": "URN for the entity the metadata snapshot is associated with.", "java": { - "class": "com.linkedin.pegasus2avro.common.urn.MLFeatureUrn" + "class": "com.linkedin.pegasus2avro.common.urn.Urn" } }, { @@ -3595,30 +3595,39 @@ "items": [ { "type": "record", - "name": "MLFeatureKey", + "name": "MLPrimaryKeyKey", "namespace": "com.linkedin.pegasus2avro.metadata.key", - "doc": "Key for an ML model", + "doc": "Key for an MLPrimaryKey", "fields": [ { "name": "featureNamespace", "type": "string", - "doc": "Namespace for the feature" + "doc": "Namespace for the primary key", + "Searchable": { + "addToFilters": true, + "fieldType": "TEXT_PARTIAL" + } }, { "name": "name", 
"type": "string", - "doc": "Name of the feature" + "doc": "Name of the primary key", + "Searchable": { + "boostScore": 8.0, + "enableAutocomplete": true, + "fieldType": "TEXT_PARTIAL" + } } ], "Aspect": { - "name": "mlFeatureKey" + "name": "mlPrimaryKeyKey" } }, { "type": "record", - "name": "MLFeatureProperties", + "name": "MLPrimaryKeyProperties", "namespace": "com.linkedin.pegasus2avro.ml.metadata", - "doc": "Properties associated with a MLFeature", + "doc": "Properties associated with a MLPrimaryKey", "fields": [ { "name": "description", @@ -3626,7 +3635,7 @@ "null", "string" ], - "doc": "Documentation of the MLFeature", + "doc": "Documentation of the MLPrimaryKey", "default": null }, { @@ -3652,11 +3661,16 @@ "TEXT", "MAP", "SEQUENCE", - "SET" + "SET", + "CONTINUOUS", + "BYTE", + "UNKNOWN" ], "symbolDocs": { "AUDIO": "Audio Data", "BINARY": "Binary data is discrete data that can be in only one of two categories \u2014 either yes or no, 1 or 0, off or on, etc", + "BYTE": "Bytes data are binary-encoded values that can represent complex objects.", + "CONTINUOUS": "Continuous data are made of uncountable values, often the result of a measurement such as height, weight, age etc.", "COUNT": "Count data is discrete whole number data \u2014 no negative numbers here.\nCount data often has many small values, such as zero and one.", "IMAGE": "Image Data", "INTERVAL": "Interval data has equal spaces between the numbers and does not represent a temporal pattern.\nExamples include percentages, temperatures, and income.", @@ -3667,11 +3681,127 @@ "SET": "Set Data Type ex: set, frozenset", "TEXT": "Text Data", "TIME": "Time data is a cyclical, repeating continuous form of data.\nThe relevant time features can be any period\u2014 daily, weekly, monthly, annual, etc.", + "UNKNOWN": "Unknown data are data that we don't know the type for.", "USELESS": "Useless data is unique, discrete data with no potential relationship with the outcome variable.\nA useless feature has high 
cardinality. An example would be bank account numbers that were generated randomly.", "VIDEO": "Video Data" } } ], + "doc": "Data Type of the MLPrimaryKey", + "default": null + }, + { + "name": "version", + "type": [ + "null", + "com.linkedin.pegasus2avro.common.VersionTag" + ], + "doc": "Version of the MLPrimaryKey", + "default": null + }, + { + "name": "sources", + "type": { + "type": "array", + "items": "string" + }, + "doc": "Source of the MLPrimaryKey", + "Relationship": { + "/*": { + "entityTypes": [ + "dataset" + ], + "name": "DerivedFrom" + } + } + } + ], + "Aspect": { + "name": "mlPrimaryKeyProperties" + } + }, + "com.linkedin.pegasus2avro.common.Ownership", + "com.linkedin.pegasus2avro.common.InstitutionalMemory", + "com.linkedin.pegasus2avro.common.Status", + "com.linkedin.pegasus2avro.common.Deprecation" + ] + }, + "doc": "The list of metadata aspects associated with the MLPrimaryKey. Depending on the use case, this can either be all, or a selection, of supported aspects." + } + ], + "Entity": { + "keyAspect": "mlPrimaryKeyKey", + "name": "mlPrimaryKey" + } + }, + { + "type": "record", + "name": "MLFeatureSnapshot", + "namespace": "com.linkedin.pegasus2avro.metadata.snapshot", + "fields": [ + { + "name": "urn", + "type": "string", + "doc": "URN for the entity the metadata snapshot is associated with.", + "java": { + "class": "com.linkedin.pegasus2avro.common.urn.MLFeatureUrn" + } + }, + { + "name": "aspects", + "type": { + "type": "array", + "items": [ + { + "type": "record", + "name": "MLFeatureKey", + "namespace": "com.linkedin.pegasus2avro.metadata.key", + "doc": "Key for an MLFeature", + "fields": [ + { + "name": "featureNamespace", + "type": "string", + "doc": "Namespace for the feature", + "Searchable": { + "fieldType": "TEXT_PARTIAL" + } + }, + { + "name": "name", + "type": "string", + "doc": "Name of the feature", + "Searchable": { + "boostScore": 8.0, + "enableAutocomplete": true, + "fieldType": "TEXT_PARTIAL" + } + } + ], + "Aspect": { + 
"name": "mlFeatureKey" + } + }, + { + "type": "record", + "name": "MLFeatureProperties", + "namespace": "com.linkedin.pegasus2avro.ml.metadata", + "doc": "Properties associated with a MLFeature", + "fields": [ + { + "name": "description", + "type": [ + "null", + "string" + ], + "doc": "Documentation of the MLFeature", + "default": null + }, + { + "name": "dataType", + "type": [ + "null", + "com.linkedin.pegasus2avro.common.MLFeatureDataType" + ], "doc": "Data Type of the MLFeature", "default": null }, @@ -3683,6 +3813,26 @@ ], "doc": "Version of the MLFeature", "default": null + }, + { + "name": "sources", + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "doc": "Source of the MLFeature", + "default": null, + "Relationship": { + "/*": { + "entityTypes": [ + "dataset" + ], + "name": "DerivedFrom" + } + } } ], "Aspect": { @@ -3696,7 +3846,7 @@ "com.linkedin.pegasus2avro.common.BrowsePaths" ] }, - "doc": "The list of metadata aspects associated with the MLModel. Depending on the use case, this can either be all, or a selection, of supported aspects." + "doc": "The list of metadata aspects associated with the MLFeature. Depending on the use case, this can either be all, or a selection, of supported aspects." 
} ], "Entity": { @@ -3704,6 +3854,137 @@ "name": "mlFeature" } }, + { + "type": "record", + "name": "MLFeatureTableSnapshot", + "namespace": "com.linkedin.pegasus2avro.metadata.snapshot", + "fields": [ + { + "name": "urn", + "type": "string", + "doc": "URN for the entity the metadata snapshot is associated with.", + "java": { + "class": "com.linkedin.pegasus2avro.common.urn.Urn" + } + }, + { + "name": "aspects", + "type": { + "type": "array", + "items": [ + { + "type": "record", + "name": "MLFeatureTableKey", + "namespace": "com.linkedin.pegasus2avro.metadata.key", + "doc": "Key for an MLFeatureTable", + "fields": [ + { + "name": "platform", + "type": "string", + "doc": "Data platform urn associated with the feature table", + "Relationship": { + "entityTypes": [ + "dataPlatform" + ], + "name": "SourcePlatform" + }, + "Searchable": { + "addToFilters": true, + "fieldType": "URN" + }, + "java": { + "class": "com.linkedin.pegasus2avro.common.urn.Urn" + } + }, + { + "name": "name", + "type": "string", + "doc": "Name of the feature table", + "Searchable": { + "boostScore": 8.0, + "enableAutocomplete": true, + "fieldType": "TEXT_PARTIAL" + } + } + ], + "Aspect": { + "name": "mlFeatureTableKey" + } + }, + { + "type": "record", + "name": "MLFeatureTableProperties", + "namespace": "com.linkedin.pegasus2avro.ml.metadata", + "doc": "Properties associated with a MLFeatureTable", + "fields": [ + { + "name": "description", + "type": [ + "null", + "string" + ], + "doc": "Documentation of the MLFeatureTable", + "default": null + }, + { + "name": "mlFeatures", + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "doc": "List of features contained in the feature table", + "default": null, + "Relationship": { + "/*": { + "entityTypes": [ + "mlFeature" + ], + "name": "Contains" + } + } + }, + { + "name": "mlPrimaryKeys", + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "doc": "List of primary keys in the feature table (if multiple, 
assumed to act as a composite key)", + "default": null, + "Relationship": { + "/*": { + "entityTypes": [ + "mlPrimaryKey" + ], + "name": "KeyedBy" + } + } + } + ], + "Aspect": { + "name": "mlFeatureTableProperties" + } + }, + "com.linkedin.pegasus2avro.common.Ownership", + "com.linkedin.pegasus2avro.common.InstitutionalMemory", + "com.linkedin.pegasus2avro.common.Status", + "com.linkedin.pegasus2avro.common.Deprecation" + ] + }, + "doc": "The list of metadata aspects associated with the MLFeatureTable. Depending on the use case, this can either be all, or a selection, of supported aspects." + } + ], + "Entity": { + "keyAspect": "mlFeatureTableKey", + "name": "mlFeatureTable" + } + }, { "type": "record", "name": "TagSnapshot", diff --git a/metadata-ingestion/src/datahub/metadata/schema_classes.py b/metadata-ingestion/src/datahub/metadata/schema_classes.py index 3845658a7a819b..5fea1d6d3ea2e4 100644 --- a/metadata-ingestion/src/datahub/metadata/schema_classes.py +++ b/metadata-ingestion/src/datahub/metadata/schema_classes.py @@ -1103,6 +1103,15 @@ class MLFeatureDataTypeClass(object): """Set Data Type ex: set, frozenset""" SET = "SET" + """Continuous data are made of uncountable values, often the result of a measurement such as height, weight, age etc.""" + CONTINUOUS = "CONTINUOUS" + + """Bytes data are binary-encoded values that can represent complex objects.""" + BYTE = "BYTE" + + """Unknown data are data that we don't know the type for.""" + UNKNOWN = "UNKNOWN" + class OwnerClass(DictWrapper): """Ownership information""" @@ -3533,7 +3542,7 @@ def name(self, value: str) -> None: class MLFeatureKeyClass(DictWrapper): - """Key for an ML model""" + """Key for an MLFeature""" RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.MLFeatureKey") def __init__(self, @@ -3581,6 +3590,55 @@ def name(self, value: str) -> None: self._inner_dict['name'] = value +class MLFeatureTableKeyClass(DictWrapper): + """Key for an MLFeatureTable""" + + RECORD_SCHEMA = 
get_schema_type("com.linkedin.pegasus2avro.metadata.key.MLFeatureTableKey") + def __init__(self, + platform: str, + name: str, + ): + super().__init__() + + self.platform = platform + self.name = name + + @classmethod + def construct_with_defaults(cls) -> "MLFeatureTableKeyClass": + self = cls.construct({}) + self._restore_defaults() + + return self + + def _restore_defaults(self) -> None: + self.platform = str() + self.name = str() + + + @property + def platform(self) -> str: + """Getter: Data platform urn associated with the feature table""" + return self._inner_dict.get('platform') # type: ignore + + + @platform.setter + def platform(self, value: str) -> None: + """Setter: Data platform urn associated with the feature table""" + self._inner_dict['platform'] = value + + + @property + def name(self) -> str: + """Getter: Name of the feature table""" + return self._inner_dict.get('name') # type: ignore + + + @name.setter + def name(self, value: str) -> None: + """Setter: Name of the feature table""" + self._inner_dict['name'] = value + + class MLModelKeyClass(DictWrapper): """Key for an ML model""" @@ -3645,6 +3703,55 @@ def origin(self, value: Union[str, "FabricTypeClass"]) -> None: self._inner_dict['origin'] = value +class MLPrimaryKeyKeyClass(DictWrapper): + """Key for an MLPrimaryKey""" + + RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.MLPrimaryKeyKey") + def __init__(self, + featureNamespace: str, + name: str, + ): + super().__init__() + + self.featureNamespace = featureNamespace + self.name = name + + @classmethod + def construct_with_defaults(cls) -> "MLPrimaryKeyKeyClass": + self = cls.construct({}) + self._restore_defaults() + + return self + + def _restore_defaults(self) -> None: + self.featureNamespace = str() + self.name = str() + + + @property + def featureNamespace(self) -> str: + """Getter: Namespace for the primary key""" + return self._inner_dict.get('featureNamespace') # type: ignore + + + @featureNamespace.setter + def 
featureNamespace(self, value: str) -> None: + """Setter: Namespace for the primary key""" + self._inner_dict['featureNamespace'] = value + + + @property + def name(self) -> str: + """Getter: Name of the primary key""" + return self._inner_dict.get('name') # type: ignore + + + @name.setter + def name(self, value: str) -> None: + """Setter: Name of the primary key""" + self._inner_dict['name'] = value + + class TagKeyClass(DictWrapper): """Key for a Tag""" @@ -4257,13 +4364,62 @@ def urn(self, value: str) -> None: @property def aspects(self) -> List[Union["MLFeatureKeyClass", "MLFeaturePropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass", "BrowsePathsClass"]]: - """Getter: The list of metadata aspects associated with the MLModel. Depending on the use case, this can either be all, or a selection, of supported aspects.""" + """Getter: The list of metadata aspects associated with the MLFeature. Depending on the use case, this can either be all, or a selection, of supported aspects.""" return self._inner_dict.get('aspects') # type: ignore @aspects.setter def aspects(self, value: List[Union["MLFeatureKeyClass", "MLFeaturePropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass", "BrowsePathsClass"]]) -> None: - """Setter: The list of metadata aspects associated with the MLModel. Depending on the use case, this can either be all, or a selection, of supported aspects.""" + """Setter: The list of metadata aspects associated with the MLFeature. Depending on the use case, this can either be all, or a selection, of supported aspects.""" + self._inner_dict['aspects'] = value + + +class MLFeatureTableSnapshotClass(DictWrapper): + # No docs available. 
+ + RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureTableSnapshot") + def __init__(self, + urn: str, + aspects: List[Union["MLFeatureTableKeyClass", "MLFeatureTablePropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass"]], + ): + super().__init__() + + self.urn = urn + self.aspects = aspects + + @classmethod + def construct_with_defaults(cls) -> "MLFeatureTableSnapshotClass": + self = cls.construct({}) + self._restore_defaults() + + return self + + def _restore_defaults(self) -> None: + self.urn = str() + self.aspects = list() + + + @property + def urn(self) -> str: + """Getter: URN for the entity the metadata snapshot is associated with.""" + return self._inner_dict.get('urn') # type: ignore + + + @urn.setter + def urn(self, value: str) -> None: + """Setter: URN for the entity the metadata snapshot is associated with.""" + self._inner_dict['urn'] = value + + + @property + def aspects(self) -> List[Union["MLFeatureTableKeyClass", "MLFeatureTablePropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass"]]: + """Getter: The list of metadata aspects associated with the MLFeatureTable. Depending on the use case, this can either be all, or a selection, of supported aspects.""" + return self._inner_dict.get('aspects') # type: ignore + + + @aspects.setter + def aspects(self, value: List[Union["MLFeatureTableKeyClass", "MLFeatureTablePropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass"]]) -> None: + """Setter: The list of metadata aspects associated with the MLFeatureTable. Depending on the use case, this can either be all, or a selection, of supported aspects.""" self._inner_dict['aspects'] = value @@ -4316,6 +4472,55 @@ def aspects(self, value: List[Union["MLModelKeyClass", "OwnershipClass", "MLMode self._inner_dict['aspects'] = value +class MLPrimaryKeySnapshotClass(DictWrapper): + # No docs available. 
+ + RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.snapshot.MLPrimaryKeySnapshot") + def __init__(self, + urn: str, + aspects: List[Union["MLPrimaryKeyKeyClass", "MLPrimaryKeyPropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass"]], + ): + super().__init__() + + self.urn = urn + self.aspects = aspects + + @classmethod + def construct_with_defaults(cls) -> "MLPrimaryKeySnapshotClass": + self = cls.construct({}) + self._restore_defaults() + + return self + + def _restore_defaults(self) -> None: + self.urn = str() + self.aspects = list() + + + @property + def urn(self) -> str: + """Getter: URN for the entity the metadata snapshot is associated with.""" + return self._inner_dict.get('urn') # type: ignore + + + @urn.setter + def urn(self, value: str) -> None: + """Setter: URN for the entity the metadata snapshot is associated with.""" + self._inner_dict['urn'] = value + + + @property + def aspects(self) -> List[Union["MLPrimaryKeyKeyClass", "MLPrimaryKeyPropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass"]]: + """Getter: The list of metadata aspects associated with the MLPrimaryKey. Depending on the use case, this can either be all, or a selection, of supported aspects.""" + return self._inner_dict.get('aspects') # type: ignore + + + @aspects.setter + def aspects(self, value: List[Union["MLPrimaryKeyKeyClass", "MLPrimaryKeyPropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass"]]) -> None: + """Setter: The list of metadata aspects associated with the MLPrimaryKey. 
Depending on the use case, this can either be all, or a selection, of supported aspects.""" + self._inner_dict['aspects'] = value + + class TagSnapshotClass(DictWrapper): """A metadata snapshot for a specific dataset entity.""" @@ -4767,12 +4972,14 @@ def __init__(self, description: Union[None, str]=None, dataType: Union[None, Union[str, "MLFeatureDataTypeClass"]]=None, version: Union[None, "VersionTagClass"]=None, + sources: Union[None, List[str]]=None, ): super().__init__() self.description = description self.dataType = dataType self.version = version + self.sources = sources @classmethod def construct_with_defaults(cls) -> "MLFeaturePropertiesClass": @@ -4785,6 +4992,7 @@ def _restore_defaults(self) -> None: self.description = self.RECORD_SCHEMA.field_map["description"].default self.dataType = self.RECORD_SCHEMA.field_map["dataType"].default self.version = self.RECORD_SCHEMA.field_map["version"].default + self.sources = self.RECORD_SCHEMA.field_map["sources"].default @property @@ -4823,6 +5031,82 @@ def version(self, value: Union[None, "VersionTagClass"]) -> None: self._inner_dict['version'] = value + @property + def sources(self) -> Union[None, List[str]]: + """Getter: Source of the MLFeature""" + return self._inner_dict.get('sources') # type: ignore + + + @sources.setter + def sources(self, value: Union[None, List[str]]) -> None: + """Setter: Source of the MLFeature""" + self._inner_dict['sources'] = value + + +class MLFeatureTablePropertiesClass(DictWrapper): + """Properties associated with a MLFeatureTable""" + + RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties") + def __init__(self, + description: Union[None, str]=None, + mlFeatures: Union[None, List[str]]=None, + mlPrimaryKeys: Union[None, List[str]]=None, + ): + super().__init__() + + self.description = description + self.mlFeatures = mlFeatures + self.mlPrimaryKeys = mlPrimaryKeys + + @classmethod + def construct_with_defaults(cls) -> 
"MLFeatureTablePropertiesClass": + self = cls.construct({}) + self._restore_defaults() + + return self + + def _restore_defaults(self) -> None: + self.description = self.RECORD_SCHEMA.field_map["description"].default + self.mlFeatures = self.RECORD_SCHEMA.field_map["mlFeatures"].default + self.mlPrimaryKeys = self.RECORD_SCHEMA.field_map["mlPrimaryKeys"].default + + + @property + def description(self) -> Union[None, str]: + """Getter: Documentation of the MLFeatureTable""" + return self._inner_dict.get('description') # type: ignore + + + @description.setter + def description(self, value: Union[None, str]) -> None: + """Setter: Documentation of the MLFeatureTable""" + self._inner_dict['description'] = value + + + @property + def mlFeatures(self) -> Union[None, List[str]]: + """Getter: List of features contained in the feature table""" + return self._inner_dict.get('mlFeatures') # type: ignore + + + @mlFeatures.setter + def mlFeatures(self, value: Union[None, List[str]]) -> None: + """Setter: List of features contained in the feature table""" + self._inner_dict['mlFeatures'] = value + + + @property + def mlPrimaryKeys(self) -> Union[None, List[str]]: + """Getter: List of primary keys in the feature table (if multiple, assumed to act as a composite key)""" + return self._inner_dict.get('mlPrimaryKeys') # type: ignore + + + @mlPrimaryKeys.setter + def mlPrimaryKeys(self, value: Union[None, List[str]]) -> None: + """Setter: List of primary keys in the feature table (if multiple, assumed to act as a composite key)""" + self._inner_dict['mlPrimaryKeys'] = value + + class MLModelFactorPromptsClass(DictWrapper): """Prompts which affect the performance of the MLModel""" @@ -5069,6 +5353,85 @@ def tags(self, value: List[str]) -> None: self._inner_dict['tags'] = value +class MLPrimaryKeyPropertiesClass(DictWrapper): + """Properties associated with a MLPrimaryKey""" + + RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.ml.metadata.MLPrimaryKeyProperties") + def 
__init__(self, + sources: List[str], + description: Union[None, str]=None, + dataType: Union[None, Union[str, "MLFeatureDataTypeClass"]]=None, + version: Union[None, "VersionTagClass"]=None, + ): + super().__init__() + + self.description = description + self.dataType = dataType + self.version = version + self.sources = sources + + @classmethod + def construct_with_defaults(cls) -> "MLPrimaryKeyPropertiesClass": + self = cls.construct({}) + self._restore_defaults() + + return self + + def _restore_defaults(self) -> None: + self.description = self.RECORD_SCHEMA.field_map["description"].default + self.dataType = self.RECORD_SCHEMA.field_map["dataType"].default + self.version = self.RECORD_SCHEMA.field_map["version"].default + self.sources = list() + + + @property + def description(self) -> Union[None, str]: + """Getter: Documentation of the MLPrimaryKey""" + return self._inner_dict.get('description') # type: ignore + + + @description.setter + def description(self, value: Union[None, str]) -> None: + """Setter: Documentation of the MLPrimaryKey""" + self._inner_dict['description'] = value + + + @property + def dataType(self) -> Union[None, Union[str, "MLFeatureDataTypeClass"]]: + """Getter: Data Type of the MLPrimaryKey""" + return self._inner_dict.get('dataType') # type: ignore + + + @dataType.setter + def dataType(self, value: Union[None, Union[str, "MLFeatureDataTypeClass"]]) -> None: + """Setter: Data Type of the MLPrimaryKey""" + self._inner_dict['dataType'] = value + + + @property + def version(self) -> Union[None, "VersionTagClass"]: + """Getter: Version of the MLPrimaryKey""" + return self._inner_dict.get('version') # type: ignore + + + @version.setter + def version(self, value: Union[None, "VersionTagClass"]) -> None: + """Setter: Version of the MLPrimaryKey""" + self._inner_dict['version'] = value + + + @property + def sources(self) -> List[str]: + """Getter: Source of the MLPrimaryKey""" + return self._inner_dict.get('sources') # type: ignore + + + 
@sources.setter + def sources(self, value: List[str]) -> None: + """Setter: Source of the MLPrimaryKey""" + self._inner_dict['sources'] = value + + class MetricsClass(DictWrapper): """Metrics to be featured for the MLModel.""" @@ -5297,7 +5660,7 @@ class MetadataChangeEventClass(DictWrapper): RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.mxe.MetadataChangeEvent") def __init__(self, - proposedSnapshot: Union["ChartSnapshotClass", "CorpGroupSnapshotClass", "CorpUserSnapshotClass", "DashboardSnapshotClass", "DataFlowSnapshotClass", "DataJobSnapshotClass", "DatasetSnapshotClass", "DataProcessSnapshotClass", "DataPlatformSnapshotClass", "MLModelSnapshotClass", "MLFeatureSnapshotClass", "TagSnapshotClass", "GlossaryTermSnapshotClass", "GlossaryNodeSnapshotClass"], + proposedSnapshot: Union["ChartSnapshotClass", "CorpGroupSnapshotClass", "CorpUserSnapshotClass", "DashboardSnapshotClass", "DataFlowSnapshotClass", "DataJobSnapshotClass", "DatasetSnapshotClass", "DataProcessSnapshotClass", "DataPlatformSnapshotClass", "MLModelSnapshotClass", "MLPrimaryKeySnapshotClass", "MLFeatureSnapshotClass", "MLFeatureTableSnapshotClass", "TagSnapshotClass", "GlossaryTermSnapshotClass", "GlossaryNodeSnapshotClass"], auditHeader: Union[None, "KafkaAuditHeaderClass"]=None, proposedDelta: None=None, ): @@ -5333,13 +5696,13 @@ def auditHeader(self, value: Union[None, "KafkaAuditHeaderClass"]) -> None: @property - def proposedSnapshot(self) -> Union["ChartSnapshotClass", "CorpGroupSnapshotClass", "CorpUserSnapshotClass", "DashboardSnapshotClass", "DataFlowSnapshotClass", "DataJobSnapshotClass", "DatasetSnapshotClass", "DataProcessSnapshotClass", "DataPlatformSnapshotClass", "MLModelSnapshotClass", "MLFeatureSnapshotClass", "TagSnapshotClass", "GlossaryTermSnapshotClass", "GlossaryNodeSnapshotClass"]: + def proposedSnapshot(self) -> Union["ChartSnapshotClass", "CorpGroupSnapshotClass", "CorpUserSnapshotClass", "DashboardSnapshotClass", "DataFlowSnapshotClass", 
"DataJobSnapshotClass", "DatasetSnapshotClass", "DataProcessSnapshotClass", "DataPlatformSnapshotClass", "MLModelSnapshotClass", "MLPrimaryKeySnapshotClass", "MLFeatureSnapshotClass", "MLFeatureTableSnapshotClass", "TagSnapshotClass", "GlossaryTermSnapshotClass", "GlossaryNodeSnapshotClass"]: """Getter: Snapshot of the proposed metadata change. Include only the aspects affected by the change in the snapshot.""" return self._inner_dict.get('proposedSnapshot') # type: ignore @proposedSnapshot.setter - def proposedSnapshot(self, value: Union["ChartSnapshotClass", "CorpGroupSnapshotClass", "CorpUserSnapshotClass", "DashboardSnapshotClass", "DataFlowSnapshotClass", "DataJobSnapshotClass", "DatasetSnapshotClass", "DataProcessSnapshotClass", "DataPlatformSnapshotClass", "MLModelSnapshotClass", "MLFeatureSnapshotClass", "TagSnapshotClass", "GlossaryTermSnapshotClass", "GlossaryNodeSnapshotClass"]) -> None: + def proposedSnapshot(self, value: Union["ChartSnapshotClass", "CorpGroupSnapshotClass", "CorpUserSnapshotClass", "DashboardSnapshotClass", "DataFlowSnapshotClass", "DataJobSnapshotClass", "DatasetSnapshotClass", "DataProcessSnapshotClass", "DataPlatformSnapshotClass", "MLModelSnapshotClass", "MLPrimaryKeySnapshotClass", "MLFeatureSnapshotClass", "MLFeatureTableSnapshotClass", "TagSnapshotClass", "GlossaryTermSnapshotClass", "GlossaryNodeSnapshotClass"]) -> None: """Setter: Snapshot of the proposed metadata change. 
Include only the aspects affected by the change in the snapshot.""" self._inner_dict['proposedSnapshot'] = value @@ -6826,7 +7189,9 @@ def description(self, value: Union[None, str]) -> None: 'com.linkedin.pegasus2avro.metadata.key.GlossaryNodeKey': GlossaryNodeKeyClass, 'com.linkedin.pegasus2avro.metadata.key.GlossaryTermKey': GlossaryTermKeyClass, 'com.linkedin.pegasus2avro.metadata.key.MLFeatureKey': MLFeatureKeyClass, + 'com.linkedin.pegasus2avro.metadata.key.MLFeatureTableKey': MLFeatureTableKeyClass, 'com.linkedin.pegasus2avro.metadata.key.MLModelKey': MLModelKeyClass, + 'com.linkedin.pegasus2avro.metadata.key.MLPrimaryKeyKey': MLPrimaryKeyKeyClass, 'com.linkedin.pegasus2avro.metadata.key.TagKey': TagKeyClass, 'com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot': ChartSnapshotClass, 'com.linkedin.pegasus2avro.metadata.snapshot.CorpGroupSnapshot': CorpGroupSnapshotClass, @@ -6840,7 +7205,9 @@ def description(self, value: Union[None, str]) -> None: 'com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot': GlossaryNodeSnapshotClass, 'com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot': GlossaryTermSnapshotClass, 'com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot': MLFeatureSnapshotClass, + 'com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureTableSnapshot': MLFeatureTableSnapshotClass, 'com.linkedin.pegasus2avro.metadata.snapshot.MLModelSnapshot': MLModelSnapshotClass, + 'com.linkedin.pegasus2avro.metadata.snapshot.MLPrimaryKeySnapshot': MLPrimaryKeySnapshotClass, 'com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot': TagSnapshotClass, 'com.linkedin.pegasus2avro.ml.metadata.BaseData': BaseDataClass, 'com.linkedin.pegasus2avro.ml.metadata.CaveatDetails': CaveatDetailsClass, @@ -6850,9 +7217,11 @@ def description(self, value: Union[None, str]) -> None: 'com.linkedin.pegasus2avro.ml.metadata.IntendedUse': IntendedUseClass, 'com.linkedin.pegasus2avro.ml.metadata.IntendedUserType': IntendedUserTypeClass, 
'com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties': MLFeaturePropertiesClass, + 'com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties': MLFeatureTablePropertiesClass, 'com.linkedin.pegasus2avro.ml.metadata.MLModelFactorPrompts': MLModelFactorPromptsClass, 'com.linkedin.pegasus2avro.ml.metadata.MLModelFactors': MLModelFactorsClass, 'com.linkedin.pegasus2avro.ml.metadata.MLModelProperties': MLModelPropertiesClass, + 'com.linkedin.pegasus2avro.ml.metadata.MLPrimaryKeyProperties': MLPrimaryKeyPropertiesClass, 'com.linkedin.pegasus2avro.ml.metadata.Metrics': MetricsClass, 'com.linkedin.pegasus2avro.ml.metadata.QuantitativeAnalyses': QuantitativeAnalysesClass, 'com.linkedin.pegasus2avro.ml.metadata.SourceCode': SourceCodeClass, @@ -6955,7 +7324,9 @@ def description(self, value: Union[None, str]) -> None: 'GlossaryNodeKey': GlossaryNodeKeyClass, 'GlossaryTermKey': GlossaryTermKeyClass, 'MLFeatureKey': MLFeatureKeyClass, + 'MLFeatureTableKey': MLFeatureTableKeyClass, 'MLModelKey': MLModelKeyClass, + 'MLPrimaryKeyKey': MLPrimaryKeyKeyClass, 'TagKey': TagKeyClass, 'ChartSnapshot': ChartSnapshotClass, 'CorpGroupSnapshot': CorpGroupSnapshotClass, @@ -6969,7 +7340,9 @@ def description(self, value: Union[None, str]) -> None: 'GlossaryNodeSnapshot': GlossaryNodeSnapshotClass, 'GlossaryTermSnapshot': GlossaryTermSnapshotClass, 'MLFeatureSnapshot': MLFeatureSnapshotClass, + 'MLFeatureTableSnapshot': MLFeatureTableSnapshotClass, 'MLModelSnapshot': MLModelSnapshotClass, + 'MLPrimaryKeySnapshot': MLPrimaryKeySnapshotClass, 'TagSnapshot': TagSnapshotClass, 'BaseData': BaseDataClass, 'CaveatDetails': CaveatDetailsClass, @@ -6979,9 +7352,11 @@ def description(self, value: Union[None, str]) -> None: 'IntendedUse': IntendedUseClass, 'IntendedUserType': IntendedUserTypeClass, 'MLFeatureProperties': MLFeaturePropertiesClass, + 'MLFeatureTableProperties': MLFeatureTablePropertiesClass, 'MLModelFactorPrompts': MLModelFactorPromptsClass, 'MLModelFactors': MLModelFactorsClass, 
'MLModelProperties': MLModelPropertiesClass, + 'MLPrimaryKeyProperties': MLPrimaryKeyPropertiesClass, 'Metrics': MetricsClass, 'QuantitativeAnalyses': QuantitativeAnalysesClass, 'SourceCode': SourceCodeClass, diff --git a/metadata-ingestion/tests/integration/feast/core/core.yml b/metadata-ingestion/tests/integration/feast/core/core.yml new file mode 100644 index 00000000000000..517b2649ca2504 --- /dev/null +++ b/metadata-ingestion/tests/integration/feast/core/core.yml @@ -0,0 +1,5 @@ +spring: + datasource: + url: jdbc:postgresql://${DB_HOST:127.0.0.1}:${DB_PORT:5432}/${DB_DATABASE:postgres} + username: ${DB_USERNAME:postgres} + password: ${DB_PASSWORD:password} diff --git a/metadata-ingestion/tests/integration/feast/docker-compose.yml b/metadata-ingestion/tests/integration/feast/docker-compose.yml new file mode 100644 index 00000000000000..35bc8b3916999d --- /dev/null +++ b/metadata-ingestion/tests/integration/feast/docker-compose.yml @@ -0,0 +1,42 @@ +version: "3.7" + +services: + core: + image: gcr.io/kf-feast/feast-core:develop + container_name: "testfeast" + volumes: + - ./core/core.yml:/etc/feast/application.yml + environment: + DB_HOST: db + # restart: on-failure + depends_on: + - db + ports: + - 6565:6565 + command: + - java + - -jar + - /opt/feast/feast-core.jar + - --spring.config.location=classpath:/application.yml,file:/etc/feast/application.yml + + setup: + container_name: "testfeast_setup" + # build from the same Feast image used for ingestion + build: ../../../src/datahub/ingestion/source/feast_image/ + volumes: + - ./make_tests.py:/app/make_tests.py + - ./wait-for-it.sh:/app/wait-for-it.sh + depends_on: + - core + # listen to this port once test cases have been imported, so test script can see when done + ports: + - 6789:6789 + # wait for Feast to start, then insert test data + command: ./wait-for-it.sh testfeast:6565 -- python3 make_tests.py + + db: + image: postgres:12-alpine + environment: + POSTGRES_PASSWORD: password + ports: + - "5432:5432" 
diff --git a/metadata-ingestion/tests/integration/feast/feast_mce_golden.json b/metadata-ingestion/tests/integration/feast/feast_mce_golden.json new file mode 100644 index 00000000000000..2956bdd9dcdac0 --- /dev/null +++ b/metadata-ingestion/tests/integration/feast/feast_mce_golden.json @@ -0,0 +1,502 @@ +[ +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLPrimaryKeySnapshot": { + "urn": "urn:li:mlPrimaryKey:(test_feature_table_all_feature_dtypes,dummy_entity_1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLPrimaryKeyProperties": { + "description": "Dummy entity 1", + "dataType": "TEXT", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLPrimaryKeySnapshot": { + "urn": "urn:li:mlPrimaryKey:(test_feature_table_all_feature_dtypes,dummy_entity_2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLPrimaryKeyProperties": { + "description": "Dummy entity 2", + "dataType": "ORDINAL", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BOOL_LIST_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "SEQUENCE", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": 
"urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BOOL_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "BINARY", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BYTES_LIST_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "SEQUENCE", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BYTES_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "BYTE", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_DOUBLE_LIST_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "SEQUENCE", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": 
"urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_DOUBLE_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "CONTINUOUS", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_FLOAT_LIST_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "SEQUENCE", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_FLOAT_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "CONTINUOUS", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_INT32_LIST_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "SEQUENCE", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": 
"urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_INT32_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "ORDINAL", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_INT64_LIST_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "SEQUENCE", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_INT64_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "ORDINAL", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_STRING_LIST_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "SEQUENCE", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": 
"urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_STRING_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "TEXT", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureTableSnapshot": { + "urn": "urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_all_feature_dtypes)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties": { + "description": null, + "mlFeatures": [ + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BOOL_LIST_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BOOL_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BYTES_LIST_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BYTES_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_DOUBLE_LIST_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_DOUBLE_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_FLOAT_LIST_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_FLOAT_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_INT32_LIST_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_INT32_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_INT64_LIST_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_INT64_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_STRING_LIST_feature)", + "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_STRING_feature)" + ], + "mlPrimaryKeys": [ + "urn:li:mlPrimaryKey:(test_feature_table_all_feature_dtypes,dummy_entity_1)", + 
"urn:li:mlPrimaryKey:(test_feature_table_all_feature_dtypes,dummy_entity_2)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLPrimaryKeySnapshot": { + "urn": "urn:li:mlPrimaryKey:(test_feature_table_no_labels,dummy_entity_2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLPrimaryKeyProperties": { + "description": "Dummy entity 2", + "dataType": "ORDINAL", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_no_labels,test_BYTES_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "BYTE", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureTableSnapshot": { + "urn": "urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_no_labels)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties": { + "description": null, + "mlFeatures": [ + "urn:li:mlFeature:(test_feature_table_no_labels,test_BYTES_feature)" + ], + "mlPrimaryKeys": [ + "urn:li:mlPrimaryKey:(test_feature_table_no_labels,dummy_entity_2)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLPrimaryKeySnapshot": { + "urn": "urn:li:mlPrimaryKey:(test_feature_table_single_feature,dummy_entity_1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLPrimaryKeyProperties": { + "description": "Dummy entity 1", + 
"dataType": "TEXT", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureSnapshot": { + "urn": "urn:li:mlFeature:(test_feature_table_single_feature,test_BYTES_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "description": null, + "dataType": "BYTE", + "version": null, + "sources": [ + "urn:li:dataset:(urn:li:dataPlatform:file,feast.*,PROD)" + ] + } + } + ] + } + }, + "proposedDelta": null +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureTableSnapshot": { + "urn": "urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_single_feature)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties": { + "description": null, + "mlFeatures": [ + "urn:li:mlFeature:(test_feature_table_single_feature,test_BYTES_feature)" + ], + "mlPrimaryKeys": [ + "urn:li:mlPrimaryKey:(test_feature_table_single_feature,dummy_entity_1)" + ] + } + } + ] + } + }, + "proposedDelta": null +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/feast/make_tests.py b/metadata-ingestion/tests/integration/feast/make_tests.py new file mode 100644 index 00000000000000..3d5edf4a47e781 --- /dev/null +++ b/metadata-ingestion/tests/integration/feast/make_tests.py @@ -0,0 +1,106 @@ +import socket + +from feast import Client +from feast.data_format import ParquetFormat +from feast.data_source import FileSource +from feast.entity import Entity +from feast.feature import Feature +from feast.feature_table import FeatureTable +from feast.value_type import ValueType + +if __name__ == "__main__": + + test_client = Client(core_url="testfeast:6565") + + # create dummy entity since Feast demands it + entity_1 = Entity( + name="dummy_entity_1", + 
description="Dummy entity 1", + value_type=ValueType.STRING, + labels={"key": "val"}, + ) + + # create dummy entity since Feast demands it + entity_2 = Entity( + name="dummy_entity_2", + description="Dummy entity 2", + value_type=ValueType.INT32, + labels={"key": "val"}, + ) + + # commit entities + test_client.apply([entity_1, entity_2]) + + # dummy file source + batch_source = FileSource( + file_format=ParquetFormat(), + file_url="file://feast/*", + event_timestamp_column="ts_col", + created_timestamp_column="timestamp", + date_partition_column="date_partition_col", + ) + + # first feature table for testing, with all of Feast's datatypes + table_1 = FeatureTable( + name="test_feature_table_all_feature_dtypes", + features=[ + Feature(name="test_BYTES_feature", dtype=ValueType.BYTES), + Feature(name="test_STRING_feature", dtype=ValueType.STRING), + Feature(name="test_INT32_feature", dtype=ValueType.INT32), + Feature(name="test_INT64_feature", dtype=ValueType.INT64), + Feature(name="test_DOUBLE_feature", dtype=ValueType.DOUBLE), + Feature(name="test_FLOAT_feature", dtype=ValueType.FLOAT), + Feature(name="test_BOOL_feature", dtype=ValueType.BOOL), + Feature(name="test_BYTES_LIST_feature", dtype=ValueType.BYTES_LIST), + Feature(name="test_STRING_LIST_feature", dtype=ValueType.STRING_LIST), + Feature(name="test_INT32_LIST_feature", dtype=ValueType.INT32_LIST), + Feature(name="test_INT64_LIST_feature", dtype=ValueType.INT64_LIST), + Feature(name="test_DOUBLE_LIST_feature", dtype=ValueType.DOUBLE_LIST), + Feature(name="test_FLOAT_LIST_feature", dtype=ValueType.FLOAT_LIST), + Feature(name="test_BOOL_LIST_feature", dtype=ValueType.BOOL_LIST), + ], + entities=["dummy_entity_1", "dummy_entity_2"], + labels={"team": "matchmaking"}, + batch_source=batch_source, + ) + + # second feature table for testing, with just a single feature + table_2 = FeatureTable( + name="test_feature_table_single_feature", + features=[ + Feature(name="test_BYTES_feature", dtype=ValueType.BYTES), + ], 
+ entities=["dummy_entity_1"], + labels={"team": "matchmaking"}, + batch_source=batch_source, + ) + + # third feature table for testing, no labels + table_3 = FeatureTable( + name="test_feature_table_no_labels", + features=[ + Feature(name="test_BYTES_feature", dtype=ValueType.BYTES), + ], + entities=["dummy_entity_2"], + labels={}, + batch_source=batch_source, + ) + + # commit the tables to the feature store + test_client.apply([table_1, table_2, table_3]) + + print("make_tests.py setup finished") + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + + # listen to port 6789 once done so test script knows when to start ingestion + server_address = ("localhost", 6789) + sock.bind(server_address) + + sock.listen(1) + + print("make_tests.py listening on 6789") + + while True: + # Wait for a connection + connection, client_address = sock.accept() diff --git a/metadata-ingestion/tests/integration/feast/serving/online-serving.yml b/metadata-ingestion/tests/integration/feast/serving/online-serving.yml new file mode 100644 index 00000000000000..f0cb272a2a924b --- /dev/null +++ b/metadata-ingestion/tests/integration/feast/serving/online-serving.yml @@ -0,0 +1,6 @@ +COMPOSE_PROJECT_NAME=feast +FEAST_VERSION=develop +FEAST_CORE_CONFIG=./core/core.yml +FEAST_ONLINE_SERVING_CONFIG=./serving/online-serving.yml +GCP_SERVICE_ACCOUNT=./gcp-service-accounts/placeholder.json +INGESTION_JAR_PATH=https://storage.googleapis.com/feast-jobs/spark/ingestion/feast-ingestion-spark-develop.jar diff --git a/metadata-ingestion/tests/integration/feast/test_feast.py b/metadata-ingestion/tests/integration/feast/test_feast.py new file mode 100644 index 00000000000000..24eb4dc09e7176 --- /dev/null +++ b/metadata-ingestion/tests/integration/feast/test_feast.py @@ -0,0 +1,51 @@ +import pytest + +from datahub.ingestion.run.pipeline import Pipeline +from tests.test_helpers import mce_helpers + +# from datahub.ingestion.run.pipeline import Pipeline +# from tests.test_helpers import mce_helpers 
+from tests.test_helpers.docker_helpers import wait_for_port + + +# make sure that mock_time is excluded here because it messes with feast +@pytest.mark.slow +def test_feast_ingest(docker_compose_runner, pytestconfig, tmp_path): + test_resources_dir = pytestconfig.rootpath / "tests/integration/feast" + + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "feast" + ) as docker_services: + wait_for_port(docker_services, "testfeast", 6565) + + # container listens to this port once test cases have been setup + wait_for_port(docker_services, "testfeast_setup", 6789) + + # Run the metadata ingestion pipeline. + pipeline = Pipeline.create( + { + "run_id": "feast-test", + "source": { + "type": "feast", + "config": { + "core_url": "localhost:6565", + "use_local_build": True, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/feast_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status() + + # Verify the output. + output = mce_helpers.load_json_file(str(tmp_path / "feast_mces.json")) + golden = mce_helpers.load_json_file( + str(test_resources_dir / "feast_mce_golden.json") + ) + mce_helpers.assert_mces_equal(output, golden) diff --git a/metadata-ingestion/tests/integration/feast/wait-for-it.sh b/metadata-ingestion/tests/integration/feast/wait-for-it.sh new file mode 100755 index 00000000000000..5b551220d9ee5c --- /dev/null +++ b/metadata-ingestion/tests/integration/feast/wait-for-it.sh @@ -0,0 +1,183 @@ +#!/usr/bin/env bash +# Use this script to test if a given TCP host/port are available +# from https://github.com/vishnubob/wait-for-it + +WAITFORIT_cmdname=${0##*/} + +echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi } + +usage() +{ + cat << USAGE >&2 +Usage: + $WAITFORIT_cmdname host:port [-s] [-t timeout] [-- command args] + -h HOST | --host=HOST Host or IP under test + -p PORT | --port=PORT TCP port under test + Alternatively, you specify the host and port as 
host:port + -s | --strict Only execute subcommand if the test succeeds + -q | --quiet Don't output any status messages + -t TIMEOUT | --timeout=TIMEOUT + Timeout in seconds, zero for no timeout + -- COMMAND ARGS Execute command with args after the test finishes +USAGE + exit 1 +} + +wait_for() +{ + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + echoerr "$WAITFORIT_cmdname: waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + else + echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout" + fi + WAITFORIT_start_ts=$(date +%s) + while : + do + if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then + nc -z $WAITFORIT_HOST $WAITFORIT_PORT + WAITFORIT_result=$? + else + (echo -n > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1 + WAITFORIT_result=$? + fi + if [[ $WAITFORIT_result -eq 0 ]]; then + WAITFORIT_end_ts=$(date +%s) + echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds" + break + fi + sleep 1 + done + return $WAITFORIT_result +} + +wait_for_wrapper() +{ + # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 + if [[ $WAITFORIT_QUIET -eq 1 ]]; then + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --quiet --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + else + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + fi + WAITFORIT_PID=$! + trap "kill -INT -$WAITFORIT_PID" INT + wait $WAITFORIT_PID + WAITFORIT_RESULT=$? 
+ if [[ $WAITFORIT_RESULT -ne 0 ]]; then + echoerr "$WAITFORIT_cmdname: timeout occurred after waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + fi + return $WAITFORIT_RESULT +} + +# process arguments +while [[ $# -gt 0 ]] +do + case "$1" in + *:* ) + WAITFORIT_hostport=(${1//:/ }) + WAITFORIT_HOST=${WAITFORIT_hostport[0]} + WAITFORIT_PORT=${WAITFORIT_hostport[1]} + shift 1 + ;; + --child) + WAITFORIT_CHILD=1 + shift 1 + ;; + -q | --quiet) + WAITFORIT_QUIET=1 + shift 1 + ;; + -s | --strict) + WAITFORIT_STRICT=1 + shift 1 + ;; + -h) + WAITFORIT_HOST="$2" + if [[ $WAITFORIT_HOST == "" ]]; then break; fi + shift 2 + ;; + --host=*) + WAITFORIT_HOST="${1#*=}" + shift 1 + ;; + -p) + WAITFORIT_PORT="$2" + if [[ $WAITFORIT_PORT == "" ]]; then break; fi + shift 2 + ;; + --port=*) + WAITFORIT_PORT="${1#*=}" + shift 1 + ;; + -t) + WAITFORIT_TIMEOUT="$2" + if [[ $WAITFORIT_TIMEOUT == "" ]]; then break; fi + shift 2 + ;; + --timeout=*) + WAITFORIT_TIMEOUT="${1#*=}" + shift 1 + ;; + --) + shift + WAITFORIT_CLI=("$@") + break + ;; + --help) + usage + ;; + *) + echoerr "Unknown argument: $1" + usage + ;; + esac +done + +if [[ "$WAITFORIT_HOST" == "" || "$WAITFORIT_PORT" == "" ]]; then + echoerr "Error: you need to provide a host and port to test." + usage +fi + +WAITFORIT_TIMEOUT=${WAITFORIT_TIMEOUT:-15} +WAITFORIT_STRICT=${WAITFORIT_STRICT:-0} +WAITFORIT_CHILD=${WAITFORIT_CHILD:-0} +WAITFORIT_QUIET=${WAITFORIT_QUIET:-0} + +# Check to see if timeout is from busybox? 
+WAITFORIT_TIMEOUT_PATH=$(type -p timeout) +WAITFORIT_TIMEOUT_PATH=$(realpath $WAITFORIT_TIMEOUT_PATH 2>/dev/null || readlink -f $WAITFORIT_TIMEOUT_PATH) + +WAITFORIT_BUSYTIMEFLAG="" +if [[ $WAITFORIT_TIMEOUT_PATH =~ "busybox" ]]; then + WAITFORIT_ISBUSY=1 + # Check if busybox timeout uses -t flag + # (recent Alpine versions don't support -t anymore) + if timeout &>/dev/stdout | grep -q -e '-t '; then + WAITFORIT_BUSYTIMEFLAG="-t" + fi +else + WAITFORIT_ISBUSY=0 +fi + +if [[ $WAITFORIT_CHILD -gt 0 ]]; then + wait_for + WAITFORIT_RESULT=$? + exit $WAITFORIT_RESULT +else + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + wait_for_wrapper + WAITFORIT_RESULT=$? + else + wait_for + WAITFORIT_RESULT=$? + fi +fi + +if [[ $WAITFORIT_CLI != "" ]]; then + if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then + echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess" + exit $WAITFORIT_RESULT + fi + exec "${WAITFORIT_CLI[@]}" +else + exit $WAITFORIT_RESULT +fi \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/ldap/ldap_mce_golden.json b/metadata-ingestion/tests/integration/ldap/ldap_mce_golden.json index 63ae119b6d5c78..79bc57ddbfc829 100644 --- a/metadata-ingestion/tests/integration/ldap/ldap_mce_golden.json +++ b/metadata-ingestion/tests/integration/ldap/ldap_mce_golden.json @@ -1,125 +1,125 @@ [ - { +{ "auditHeader": null, "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.CorpGroupSnapshot": { - "urn": "urn:li:corpGroup:simpons-group", - "aspects": [ - { - "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { - "email": "simpons-group", - "admins": [], - "members": [], - "groups": [] - } - } - ] - } + "com.linkedin.pegasus2avro.metadata.snapshot.CorpGroupSnapshot": { + "urn": "urn:li:corpGroup:simpons-group", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { + "email": "simpons-group", + "admins": [], + "members": [], + "groups": [] + } + } + ] + } }, "proposedDelta": null - }, - { 
+}, +{ "auditHeader": null, "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { - "urn": "urn:li:corpuser:bsimpson", - "aspects": [ - { - "com.linkedin.pegasus2avro.identity.CorpUserInfo": { - "active": true, - "displayName": "Bart Simpson", - "email": "bsimpson", - "title": "Mr. Boss", - "managerUrn": null, - "departmentId": null, - "departmentName": null, - "firstName": "Bart", - "lastName": "Simpson", - "fullName": "Bart Simpson", - "countryCode": null - } - } - ] - } + "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { + "urn": "urn:li:corpuser:bsimpson", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpUserInfo": { + "active": true, + "displayName": "Bart Simpson", + "email": "bsimpson", + "title": "Mr. Boss", + "managerUrn": null, + "departmentId": null, + "departmentName": null, + "firstName": "Bart", + "lastName": "Simpson", + "fullName": "Bart Simpson", + "countryCode": null + } + } + ] + } }, "proposedDelta": null - }, - { +}, +{ "auditHeader": null, "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { - "urn": "urn:li:corpuser:hsimpson", - "aspects": [ - { - "com.linkedin.pegasus2avro.identity.CorpUserInfo": { - "active": true, - "displayName": "Homer Simpson", - "email": "hsimpson", - "title": "Mr. Everything", - "managerUrn": "urn:li:corpuser:bsimpson", - "departmentId": null, - "departmentName": "1001", - "firstName": "Homer", - "lastName": "Simpson", - "fullName": "Homer Simpson", - "countryCode": null - } - } - ] - } + "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { + "urn": "urn:li:corpuser:hsimpson", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpUserInfo": { + "active": true, + "displayName": "Homer Simpson", + "email": "hsimpson", + "title": "Mr. 
Everything", + "managerUrn": "urn:li:corpuser:bsimpson", + "departmentId": null, + "departmentName": "1001", + "firstName": "Homer", + "lastName": "Simpson", + "fullName": "Homer Simpson", + "countryCode": null + } + } + ] + } }, "proposedDelta": null - }, - { +}, +{ "auditHeader": null, "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { - "urn": "urn:li:corpuser:lsimpson", - "aspects": [ - { - "com.linkedin.pegasus2avro.identity.CorpUserInfo": { - "active": true, - "displayName": "Lisa Simpson", - "email": "lsimpson", - "title": null, - "managerUrn": null, - "departmentId": null, - "departmentName": null, - "firstName": "Lisa", - "lastName": "Simpson", - "fullName": "Lisa Simpson", - "countryCode": null - } - } - ] - } + "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { + "urn": "urn:li:corpuser:lsimpson", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpUserInfo": { + "active": true, + "displayName": "Lisa Simpson", + "email": "lsimpson", + "title": null, + "managerUrn": null, + "departmentId": null, + "departmentName": null, + "firstName": "Lisa", + "lastName": "Simpson", + "fullName": "Lisa Simpson", + "countryCode": null + } + } + ] + } }, "proposedDelta": null - }, - { +}, +{ "auditHeader": null, "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { - "urn": "urn:li:corpuser:msimpson", - "aspects": [ - { - "com.linkedin.pegasus2avro.identity.CorpUserInfo": { - "active": true, - "displayName": "Maggie Simpson", - "email": "msimpson", - "title": null, - "managerUrn": null, - "departmentId": null, - "departmentName": null, - "firstName": "Maggie", - "lastName": "Simpson", - "fullName": "Maggie Simpson", - "countryCode": null - } - } - ] - } + "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { + "urn": "urn:li:corpuser:msimpson", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpUserInfo": { + "active": true, + "displayName": "Maggie 
Simpson", + "email": "msimpson", + "title": null, + "managerUrn": null, + "departmentId": null, + "departmentName": null, + "firstName": "Maggie", + "lastName": "Simpson", + "fullName": "Maggie Simpson", + "countryCode": null + } + } + ] + } }, "proposedDelta": null - } -] +} +] \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLFeatureTableAspect.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLFeatureTableAspect.pdl new file mode 100644 index 00000000000000..e63d43e9372b09 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLFeatureTableAspect.pdl @@ -0,0 +1,20 @@ +namespace com.linkedin.metadata.aspect + +import com.linkedin.metadata.key.MLFeatureTableKey +import com.linkedin.common.InstitutionalMemory +import com.linkedin.common.Ownership +import com.linkedin.common.Status +import com.linkedin.ml.metadata.MLFeatureTableProperties +import com.linkedin.common.Deprecation + +/** + * A union of all supported metadata aspects for a MLFeatureTable + */ +typeref MLFeatureTableAspect = union[ + MLFeatureTableKey, + MLFeatureTableProperties, + Ownership, + InstitutionalMemory, + Status, + Deprecation +] diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLPrimaryKeyAspect.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLPrimaryKeyAspect.pdl new file mode 100644 index 00000000000000..60711df7516145 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLPrimaryKeyAspect.pdl @@ -0,0 +1,20 @@ +namespace com.linkedin.metadata.aspect + +import com.linkedin.metadata.key.MLPrimaryKeyKey +import com.linkedin.common.InstitutionalMemory +import com.linkedin.common.Ownership +import com.linkedin.common.Status +import com.linkedin.ml.metadata.MLPrimaryKeyProperties +import com.linkedin.common.Deprecation + +/** + * A union of all supported metadata aspects for a MLPrimaryKey + */ +typeref 
MLPrimaryKeyAspect = union[ + MLPrimaryKeyKey, + MLPrimaryKeyProperties, + Ownership, + InstitutionalMemory, + Status, + Deprecation +] diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl index 584ff46a632c4f..579f1966977a97 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl @@ -1,7 +1,7 @@ namespace com.linkedin.metadata.key /** - * Key for an ML model + * Key for an MLFeature */ @Aspect = { "name": "mlFeatureKey" @@ -11,10 +11,18 @@ record MLFeatureKey { /** * Namespace for the feature */ - featureNamespace: string + @Searchable = { + "fieldType": "TEXT_PARTIAL", + } + featureNamespace: string /** * Name of the feature */ - name: string + @Searchable = { + "fieldType": "TEXT_PARTIAL", + "enableAutocomplete": true, + "boostScore": 8.0 + } + name: string } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl new file mode 100644 index 00000000000000..c29b8f621c316e --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl @@ -0,0 +1,34 @@ +namespace com.linkedin.metadata.key + +import com.linkedin.common.Urn + +/** + * Key for an MLFeatureTable + */ +@Aspect = { + "name": "mlFeatureTableKey" +} +record MLFeatureTableKey { + /** + * Data platform urn associated with the feature table + */ + @Searchable = { + "fieldType": "URN", + "addToFilters": true + } + @Relationship = { + "name": "SourcePlatform", + "entityTypes": [ "dataPlatform" ] + } + platform: Urn + + /** + * Name of the feature table + */ + @Searchable = { + "fieldType": "TEXT_PARTIAL", + "enableAutocomplete": true, + "boostScore": 8.0 + } + name: string +} \ No newline at end of file 
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl index ef0460efe112e6..2fca3ceba0f75f 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl @@ -14,8 +14,8 @@ record MLModelKey { * Standardized platform urn for the model */ @Searchable = { - "fieldType": "TEXT_PARTIAL", - "boostScore": 0.1 + "fieldType": "URN", + "addToFilters": true } platform: Urn diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl new file mode 100644 index 00000000000000..a923fed979fbfe --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl @@ -0,0 +1,30 @@ +namespace com.linkedin.metadata.key + +import com.linkedin.common.Urn + +/** + * Key for an MLPrimaryKey + */ +@Aspect = { + "name": "mlPrimaryKeyKey" +} +record MLPrimaryKeyKey { + /** + * Namespace for the primary key + */ + @Searchable = { + "fieldType": "TEXT_PARTIAL", + "addToFilters": true + } + featureNamespace: string + + /** + * Name of the primary key + */ + @Searchable = { + "fieldType": "TEXT_PARTIAL", + "enableAutocomplete": true, + "boostScore": 8.0 + } + name: string +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLFeatureSnapshot.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLFeatureSnapshot.pdl index a72253c30a23be..0eddb93324bfed 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLFeatureSnapshot.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLFeatureSnapshot.pdl @@ -15,7 +15,7 @@ record MLFeatureSnapshot { urn: MLFeatureUrn /** - * The list of metadata aspects associated with the MLModel. 
Depending on the use case, this can either be all, or a selection, of supported aspects. + * The list of metadata aspects associated with the MLFeature. Depending on the use case, this can either be all, or a selection, of supported aspects. */ aspects: array[MLFeatureAspect] } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLFeatureTableSnapshot.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLFeatureTableSnapshot.pdl new file mode 100644 index 00000000000000..40d8fbdd2234fa --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLFeatureTableSnapshot.pdl @@ -0,0 +1,21 @@ +namespace com.linkedin.metadata.snapshot + +import com.linkedin.common.Urn +import com.linkedin.metadata.aspect.MLFeatureTableAspect + +@Entity = { + "name": "mlFeatureTable", + "keyAspect": "mlFeatureTableKey" +} +record MLFeatureTableSnapshot { + + /** + * URN for the entity the metadata snapshot is associated with. + */ + urn: Urn + + /** + * The list of metadata aspects associated with the MLFeatureTable. Depending on the use case, this can either be all, or a selection, of supported aspects. + */ + aspects: array[MLFeatureTableAspect] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLPrimaryKeySnapshot.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLPrimaryKeySnapshot.pdl new file mode 100644 index 00000000000000..cf99ee4a2a723a --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLPrimaryKeySnapshot.pdl @@ -0,0 +1,21 @@ +namespace com.linkedin.metadata.snapshot + +import com.linkedin.common.Urn +import com.linkedin.metadata.aspect.MLPrimaryKeyAspect + +@Entity = { + "name": "mlPrimaryKey", + "keyAspect": "mlPrimaryKeyKey" +} +record MLPrimaryKeySnapshot { + + /** + * URN for the entity the metadata snapshot is associated with. + */ + urn: Urn + + /** + * The list of metadata aspects associated with the MLPrimaryKey. 
Depending on the use case, this can either be all, or a selection, of supported aspects. + */ + aspects: array[MLPrimaryKeyAspect] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/Snapshot.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/Snapshot.pdl index c9a20560bc8a13..ec66542613efdc 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/Snapshot.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/Snapshot.pdl @@ -14,7 +14,9 @@ typeref Snapshot = union[ DataProcessSnapshot, DataPlatformSnapshot, MLModelSnapshot, + MLPrimaryKeySnapshot, MLFeatureSnapshot, + MLFeatureTableSnapshot, TagSnapshot, GlossaryTermSnapshot, GlossaryNodeSnapshot diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl index 56114edcd7f5bd..936b178677dcae 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl @@ -1,5 +1,6 @@ namespace com.linkedin.ml.metadata +import com.linkedin.common.Urn import com.linkedin.common.MLFeatureDataType import com.linkedin.common.VersionTag @@ -25,4 +26,15 @@ record MLFeatureProperties { * Version of the MLFeature */ version: optional VersionTag + + /** + * Source of the MLFeature + */ + @Relationship = { + "/*": { + "name": "DerivedFrom", + "entityTypes": [ "dataset" ] + } + } + sources: optional array[Urn] } diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureTableProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureTableProperties.pdl new file mode 100644 index 00000000000000..7666e8296cf9be --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureTableProperties.pdl @@ -0,0 +1,39 @@ +namespace com.linkedin.ml.metadata + +import 
com.linkedin.common.Urn + +/** + * Properties associated with a MLFeatureTable + */ +@Aspect = { + "name": "mlFeatureTableProperties" +} +record MLFeatureTableProperties { + + /** + * Documentation of the MLFeatureTable + */ + description: optional string + + /** + * List of features contained in the feature table + */ + @Relationship = { + "/*": { + "name": "Contains", + "entityTypes": [ "mlFeature" ] + } + } + mlFeatures: optional array[Urn] + + /** + * List of primary keys in the feature table (if multiple, assumed to act as a composite key) + */ + @Relationship = { + "/*": { + "name": "KeyedBy", + "entityTypes": [ "mlPrimaryKey" ] + } + } + mlPrimaryKeys: optional array[Urn] +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLPrimaryKeyProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLPrimaryKeyProperties.pdl new file mode 100644 index 00000000000000..adc037f2511603 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLPrimaryKeyProperties.pdl @@ -0,0 +1,40 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.Urn +import com.linkedin.common.MLFeatureDataType +import com.linkedin.common.VersionTag + +/** + * Properties associated with a MLPrimaryKey + */ +@Aspect = { + "name": "mlPrimaryKeyProperties" +} +record MLPrimaryKeyProperties { + + /** + * Documentation of the MLPrimaryKey + */ + description: optional string + + /** + * Data Type of the MLPrimaryKey + */ + dataType: optional MLFeatureDataType + + /** + * Version of the MLPrimaryKey + */ + version: optional VersionTag + + /** + * Source of the MLPrimaryKey + */ + @Relationship = { + "/*": { + "name": "DerivedFrom", + "entityTypes": [ "dataset" ] + } + } + sources: array[Urn] +} diff --git a/metadata-models/src/test/java/com/linkedin/metadata/ModelValidation.java b/metadata-models/src/test/java/com/linkedin/metadata/ModelValidation.java index 46df99410df132..c9a77d72ff79b6 100644 --- 
a/metadata-models/src/test/java/com/linkedin/metadata/ModelValidation.java +++ b/metadata-models/src/test/java/com/linkedin/metadata/ModelValidation.java @@ -10,7 +10,6 @@ import com.linkedin.metadata.validator.RelationshipValidator; import com.linkedin.metadata.validator.SnapshotValidator; import java.io.IOException; -import java.util.Collection; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -67,7 +66,6 @@ public void validateSnapshots() throws Exception { assertFalse("Failed to find any snapshots", snapshots.isEmpty()); snapshots.forEach(SnapshotValidator::validateSnapshotSchema); - SnapshotValidator.validateUniqueUrn((Collection>) snapshots); } @Test