diff --git a/datahub-web-react/src/images/feastlogo.png b/datahub-web-react/src/images/feastlogo.png new file mode 100644 index 00000000000000..8d8bd306684ccc Binary files /dev/null and b/datahub-web-react/src/images/feastlogo.png differ diff --git a/datahub-web-react/src/images/sagemakerlogo.png b/datahub-web-react/src/images/sagemakerlogo.png new file mode 100644 index 00000000000000..599f4f218790a3 Binary files /dev/null and b/datahub-web-react/src/images/sagemakerlogo.png differ diff --git a/gms/api/src/main/snapshot/com.linkedin.datajob.dataJobs.snapshot.json b/gms/api/src/main/snapshot/com.linkedin.datajob.dataJobs.snapshot.json index 1f419dc8b3cb10..0c814cfa9237ac 100644 --- a/gms/api/src/main/snapshot/com.linkedin.datajob.dataJobs.snapshot.json +++ b/gms/api/src/main/snapshot/com.linkedin.datajob.dataJobs.snapshot.json @@ -460,8 +460,8 @@ "PIG" : "Pig type is for running Pig jobs.", "SQL" : "SQL is for running Presto, mysql queries etc" } - } ], - "doc" : "Datajob type" + }, "string" ], + "doc" : "Datajob type\n**NOTE**: AzkabanJobType is deprecated. Please use strings instead." 
}, { "name" : "flowUrn", "type" : "com.linkedin.common.DataFlowUrn", @@ -471,6 +471,25 @@ "entityTypes" : [ "dataFlow" ], "name" : "IsPartOf" } + }, { + "name" : "status", + "type" : { + "type" : "enum", + "name" : "JobStatus", + "doc" : "Job statuses", + "symbols" : [ "STARTING", "IN_PROGRESS", "STOPPING", "STOPPED", "COMPLETED", "FAILED", "UNKNOWN" ], + "symbolDocs" : { + "COMPLETED" : "Jobs with successful completion.", + "FAILED" : "Jobs that have failed.", + "IN_PROGRESS" : "Jobs currently running.", + "STARTING" : "Jobs being initialized.", + "STOPPED" : "Jobs that have stopped.", + "STOPPING" : "Jobs being stopped.", + "UNKNOWN" : "Jobs with unknown status (either unmappable or unavailable)" + } + }, + "doc" : "Status of the job", + "optional" : true } ], "Aspect" : { "name" : "dataJobInfo" @@ -586,7 +605,7 @@ "doc" : "Editable properties", "optional" : true } ] - }, "com.linkedin.datajob.DataJobInfo", "com.linkedin.datajob.DataJobInputOutput", "com.linkedin.datajob.DataJobKey", "com.linkedin.datajob.EditableDataJobProperties", "com.linkedin.datajob.azkaban.AzkabanJobType", { + }, "com.linkedin.datajob.DataJobInfo", "com.linkedin.datajob.DataJobInputOutput", "com.linkedin.datajob.DataJobKey", "com.linkedin.datajob.EditableDataJobProperties", "com.linkedin.datajob.JobStatus", "com.linkedin.datajob.azkaban.AzkabanJobType", { "type" : "typeref", "name" : "DataJobAspect", "namespace" : "com.linkedin.metadata.aspect", diff --git a/gms/api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/gms/api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 652e81a3e6115e..4e9b4fec3b0d8b 100644 --- a/gms/api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/gms/api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -978,8 +978,8 @@ "PIG" : "Pig type is for running Pig jobs.", "SQL" : "SQL is for running Presto, mysql queries etc" } - } ], - "doc" : "Datajob type" + }, "string" ], + "doc" : "Datajob type\n**NOTE**: 
AzkabanJobType is deprecated. Please use strings instead." }, { "name" : "flowUrn", "type" : "com.linkedin.common.DataFlowUrn", @@ -989,6 +989,25 @@ "entityTypes" : [ "dataFlow" ], "name" : "IsPartOf" } + }, { + "name" : "status", + "type" : { + "type" : "enum", + "name" : "JobStatus", + "doc" : "Job statuses", + "symbols" : [ "STARTING", "IN_PROGRESS", "STOPPING", "STOPPED", "COMPLETED", "FAILED", "UNKNOWN" ], + "symbolDocs" : { + "COMPLETED" : "Jobs with successful completion.", + "FAILED" : "Jobs that have failed.", + "IN_PROGRESS" : "Jobs currently running.", + "STARTING" : "Jobs being initialized.", + "STOPPED" : "Jobs that have stopped.", + "STOPPING" : "Jobs being stopped.", + "UNKNOWN" : "Jobs with unknown status (either unmappable or unavailable)" + } + }, + "doc" : "Status of the job", + "optional" : true } ], "Aspect" : { "name" : "dataJobInfo" @@ -1058,7 +1077,7 @@ "Aspect" : { "name" : "dataJobInputOutput" } - }, "com.linkedin.datajob.azkaban.AzkabanJobType", { + }, "com.linkedin.datajob.JobStatus", "com.linkedin.datajob.azkaban.AzkabanJobType", { "type" : "record", "name" : "DatasetDeprecation", "namespace" : "com.linkedin.dataset", @@ -2392,6 +2411,7 @@ "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", "doc" : "Properties associated with a ML Model", + "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { "name" : "description", "type" : "string", diff --git a/gms/api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/gms/api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 9d5cf0eec096c3..4835763bc8b0ab 100644 --- a/gms/api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/gms/api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -1253,8 +1253,8 @@ "PIG" : "Pig type is for running Pig jobs.", "SQL" : "SQL is for running Presto, mysql queries etc" } - } ], - "doc" : "Datajob type" + }, "string" ], + "doc" : "Datajob type\n**NOTE**: AzkabanJobType is 
deprecated. Please use strings instead." }, { "name" : "flowUrn", "type" : "com.linkedin.common.DataFlowUrn", @@ -1264,6 +1264,25 @@ "entityTypes" : [ "dataFlow" ], "name" : "IsPartOf" } + }, { + "name" : "status", + "type" : { + "type" : "enum", + "name" : "JobStatus", + "doc" : "Job statuses", + "symbols" : [ "STARTING", "IN_PROGRESS", "STOPPING", "STOPPED", "COMPLETED", "FAILED", "UNKNOWN" ], + "symbolDocs" : { + "COMPLETED" : "Jobs with successful completion.", + "FAILED" : "Jobs that have failed.", + "IN_PROGRESS" : "Jobs currently running.", + "STARTING" : "Jobs being initialized.", + "STOPPED" : "Jobs that have stopped.", + "STOPPING" : "Jobs being stopped.", + "UNKNOWN" : "Jobs with unknown status (either unmappable or unavailable)" + } + }, + "doc" : "Status of the job", + "optional" : true } ], "Aspect" : { "name" : "dataJobInfo" @@ -1371,7 +1390,7 @@ "Aspect" : { "name" : "editableDataJobProperties" } - }, "com.linkedin.datajob.azkaban.AzkabanJobType", { + }, "com.linkedin.datajob.JobStatus", "com.linkedin.datajob.azkaban.AzkabanJobType", { "type" : "record", "name" : "DataPlatformInfo", "namespace" : "com.linkedin.dataplatform", @@ -2792,6 +2811,7 @@ "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", "doc" : "Properties associated with a ML Model", + "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { "name" : "description", "type" : "string", @@ -3452,7 +3472,7 @@ "Aspect" : { "name" : "mlFeatureTableProperties" } - }, "com.linkedin.common.Ownership", "com.linkedin.common.InstitutionalMemory", "com.linkedin.common.Status", "com.linkedin.common.Deprecation" ] + }, "com.linkedin.common.Ownership", "com.linkedin.common.InstitutionalMemory", "com.linkedin.common.Status", "com.linkedin.common.Deprecation", "com.linkedin.common.BrowsePaths" ] } }, "doc" : "The list of metadata aspects associated with the MLFeatureTable. Depending on the use case, this can either be all, or a selection, of supported aspects." 
diff --git a/gms/api/src/main/snapshot/com.linkedin.ml.mlModels.snapshot.json b/gms/api/src/main/snapshot/com.linkedin.ml.mlModels.snapshot.json index 24c686f4802619..0c8fd1b3ec2b32 100644 --- a/gms/api/src/main/snapshot/com.linkedin.ml.mlModels.snapshot.json +++ b/gms/api/src/main/snapshot/com.linkedin.ml.mlModels.snapshot.json @@ -141,6 +141,20 @@ "name" : "cost" } }, "com.linkedin.common.CostType", "com.linkedin.common.CostValue", { + "type" : "record", + "name" : "CustomProperties", + "namespace" : "com.linkedin.common", + "doc" : "Misc. properties about an entity.", + "fields" : [ { + "name" : "customProperties", + "type" : { + "type" : "map", + "values" : "string" + }, + "doc" : "Custom property bag.", + "default" : { } + } ] + }, { "type" : "typeref", "name" : "DataPlatformUrn", "namespace" : "com.linkedin.common", @@ -522,6 +536,7 @@ "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", "doc" : "Properties associated with a ML Model", + "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { "name" : "description", "type" : "string", diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 564a0050adb70d..4083aac71f4289 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -387,6 +387,18 @@ source: aws_secret_access_key: # Optional. aws_session_token: # Optional. aws_role: # Optional (Role chaining supported by using a sorted list). 
+ + extract_feature_groups: True # if feature groups should be ingested, default True + extract_models: True # if models should be ingested, default True + extract_jobs: # if jobs should be ingested, default True for all + auto_ml: True + compilation: True + edge_packaging: True + hyper_parameter_tuning: True + labeling: True + processing: True + training: True + transform: True ``` ### Snowflake `snowflake` diff --git a/metadata-ingestion/examples/mce_files/bootstrap_mce.json b/metadata-ingestion/examples/mce_files/bootstrap_mce.json index a98a61cdf7bf9d..7ca11823be0136 100644 --- a/metadata-ingestion/examples/mce_files/bootstrap_mce.json +++ b/metadata-ingestion/examples/mce_files/bootstrap_mce.json @@ -1,1904 +1,1935 @@ [ - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { - "urn": "urn:li:corpuser:datahub", - "aspects": [ - { - "com.linkedin.pegasus2avro.identity.CorpUserInfo": { - "active": true, - "displayName": { - "string": "Data Hub" - }, - "email": "datahub@linkedin.com", - "title": { - "string": "CEO" - }, - "managerUrn": null, - "departmentId": null, - "departmentName": null, - "firstName": null, - "lastName": null, - "fullName": { - "string": "Data Hub" - }, - "countryCode": null - } - } - ] + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { + "urn": "urn:li:corpuser:datahub", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpUserInfo": { + "active": true, + "displayName": { + "string": "Data Hub" + }, + "email": "datahub@linkedin.com", + "title": { + "string": "CEO" + }, + "managerUrn": null, + "departmentId": null, + "departmentName": null, + "firstName": null, + "lastName": null, + "fullName": { + "string": "Data Hub" + }, + "countryCode": null } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { 
- "urn": "urn:li:corpuser:jdoe", - "aspects": [ - { - "com.linkedin.pegasus2avro.identity.CorpUserInfo": { - "active": true, - "displayName": { - "string": "John Doe" - }, - "email": "jdoe@linkedin.com", - "title": { - "string": "Software Engineer" - }, - "managerUrn": null, - "departmentId": null, - "departmentName": null, - "firstName": null, - "lastName": null, - "fullName": { - "string": "John Doe" - }, - "countryCode": null - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.CorpUserSnapshot": { + "urn": "urn:li:corpuser:jdoe", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpUserInfo": { + "active": true, + "displayName": { + "string": "John Doe" + }, + "email": "jdoe@linkedin.com", + "title": { + "string": "Software Engineer" + }, + "managerUrn": null, + "departmentId": null, + "departmentName": null, + "firstName": null, + "lastName": null, + "fullName": { + "string": "John Doe" + }, + "countryCode": null } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.CorpGroupSnapshot": { - "urn": "urn:li:corpGroup:jdoe", - "aspects": [ - { - "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { - "email": "jdoe@linkedin.com", - "admins": ["urn:li:corpuser:jdoe", "urn:li:corpuser:datahub"], - "members": ["urn:li:corpuser:jdoe", "urn:li:corpuser:datahub"], - "groups": [] - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.CorpGroupSnapshot": { + "urn": "urn:li:corpGroup:jdoe", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { + "email": "jdoe@linkedin.com", + "admins": ["urn:li:corpuser:jdoe", "urn:li:corpuser:datahub"], + "members": ["urn:li:corpuser:jdoe", "urn:li:corpuser:datahub"], + "groups": [] } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, 
- "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.CorpGroupSnapshot": { - "urn": "urn:li:corpGroup:bfoo", - "aspects": [ - { - "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { - "email": "bfoo@linkedin.com", - "admins": ["urn:li:corpuser:jdoe", "urn:li:corpuser:datahub"], - "members": ["urn:li:corpuser:jdoe", "urn:li:corpuser:datahub"], - "groups": ["urn:li:corpGroup:jdoe"] - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.CorpGroupSnapshot": { + "urn": "urn:li:corpGroup:bfoo", + "aspects": [ + { + "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { + "email": "bfoo@linkedin.com", + "admins": ["urn:li:corpuser:jdoe", "urn:li:corpuser:datahub"], + "members": ["urn:li:corpuser:jdoe", "urn:li:corpuser:datahub"], + "groups": ["urn:li:corpGroup:jdoe"] } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": ["/prod/kafka/SampleKafkaDataset"] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "description": null, - "uri": null, - "tags": [ - "myfaketags" - ], - "customProperties": { - "prop1": "fakeprop", - "prop2": "pikachu" - } - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:jdoe", - "type": "DATAOWNER", - "source": null - }, - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - }, - { - "com.linkedin.pegasus2avro.common.InstitutionalMemory": { - "elements": [ - { - "url": "https://www.linkedin.com", - "description": "Sample doc", - "createStamp": { - "time": 
1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.EditableSchemaMetadata": { - "created": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "deleted": null, - "editableSchemaFieldInfo": [ - { - "fieldPath": "field_foo", - "globalTags": { "tags": [{ "tag": "urn:li:tag:Legacy" }] } - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "SampleKafkaSchema", - "platform": "urn:li:dataPlatform:kafka", - "version": 0, - "created": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.KafkaSchema": { - "documentSchema": "{\"type\":\"record\",\"name\":\"SampleKafkaSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Kafka dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" - } - }, - "fields": [ - { - "fieldPath": "field_foo_2", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Foo field description" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "varchar(100)", - "globalTags": { "tags": [{ "tag": "urn:li:tag:NeedsDocumentation" }] }, - "recursive": false - }, - { - "fieldPath": "field_bar", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Bar field description" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "boolean", - "recursive": false - } - ], - "primaryKeys": null, - 
"foreignKeysSpecs": null - } + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": ["/prod/kafka/SampleKafkaDataset"] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "description": null, + "uri": null, + "tags": ["myfaketags"], + "customProperties": { + "prop1": "fakeprop", + "prop2": "pikachu" + } + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER", + "source": null + }, + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.common.InstitutionalMemory": { + "elements": [ + { + "url": "https://www.linkedin.com", + "description": "Sample doc", + "createStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.EditableSchemaMetadata": { + "created": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "deleted": null, + "editableSchemaFieldInfo": [ + { + "fieldPath": "field_foo", + "globalTags": { "tags": [{ "tag": "urn:li:tag:Legacy" }] } + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "SampleKafkaSchema", + "platform": "urn:li:dataPlatform:kafka", + "version": 0, + "created": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", 
+ "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.KafkaSchema": { + "documentSchema": "{\"type\":\"record\",\"name\":\"SampleKafkaSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Kafka dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" + } + }, + "fields": [ + { + "fieldPath": "field_foo_2", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Foo field description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} } - ] + }, + "nativeDataType": "varchar(100)", + "globalTags": { + "tags": [{ "tag": "urn:li:tag:NeedsDocumentation" }] + }, + "recursive": false + }, + { + "fieldPath": "field_bar", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Bar field description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": ["/prod/hdfs/SampleHdfsDataset"] - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:jdoe", - "type": "DATAOWNER", - "source": null - }, - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1581407189000, - 
"actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)", - "type": "TRANSFORMED" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.InstitutionalMemory": { - "elements": [ - { - "url": "https://www.linkedin.com", - "description": "Sample doc", - "createStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "SampleHdfsSchema", - "platform": "urn:li:dataPlatform:hdfs", - "version": 0, - "created": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.KafkaSchema": { - "documentSchema": "{\"type\":\"record\",\"name\":\"SampleHdfsSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample HDFS dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" - } - }, - "fields": [ - { - "fieldPath": "shipment_info", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Shipment info description" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "varchar(100)", - "recursive": false - }, - { - "fieldPath": "shipment_info.date", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Shipment info date description" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} - } - }, - "nativeDataType": "Date", - "recursive": false - }, - { - "fieldPath": "shipment_info.target", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Shipment info target description" - }, - "type": { - "type": { 
- "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "text", - "recursive": false - }, - { - "fieldPath": "shipment_info.destination", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Shipment info destination description" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "varchar(100)", - "recursive": false - }, - { - "fieldPath": "shipment_info.geo_info", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Shipment info geo_info description" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "varchar(100)", - "recursive": false - }, - { - "fieldPath": "shipment_info.geo_info.lat", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Shipment info geo_info lat" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "float", - "recursive": false - }, - { - "fieldPath": "shipment_info.geo_info.lng", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Shipment info geo_info lng" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "float", - "recursive": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null - } + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": ["/prod/hdfs/SampleHdfsDataset"] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER", + "source": null + }, + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + 
"time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.InstitutionalMemory": { + "elements": [ + { + "url": "https://www.linkedin.com", + "description": "Sample doc", + "createStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "SampleHdfsSchema", + "platform": "urn:li:dataPlatform:hdfs", + "version": 0, + "created": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.KafkaSchema": { + "documentSchema": "{\"type\":\"record\",\"name\":\"SampleHdfsSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample HDFS dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" + } + }, + "fields": [ + { + "fieldPath": "shipment_info", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Shipment info description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} } - ] + }, + "nativeDataType": "varchar(100)", + "recursive": false + }, + { + "fieldPath": "shipment_info.date", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Shipment info date description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + 
"nativeDataType": "Date", + "recursive": false + }, + { + "fieldPath": "shipment_info.target", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Shipment info target description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false + }, + { + "fieldPath": "shipment_info.destination", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Shipment info destination description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "varchar(100)", + "recursive": false + }, + { + "fieldPath": "shipment_info.geo_info", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Shipment info geo_info description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "varchar(100)", + "recursive": false + }, + { + "fieldPath": "shipment_info.geo_info.lat", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Shipment info geo_info lat" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false + }, + { + "fieldPath": "shipment_info.geo_info.lng", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Shipment info geo_info lng" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:jdoe", - "type": "DATAOWNER", - 
"source": null - }, - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)", - "type": "TRANSFORMED" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.InstitutionalMemory": { - "elements": [ - { - "url": "https://www.linkedin.com", - "description": "Sample doc", - "createStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "SampleHiveSchema", - "platform": "urn:li:dataPlatform:hive", - "version": 0, - "created": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.KafkaSchema": { - "documentSchema": "{\"type\":\"record\",\"name\":\"SampleHiveSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Hive dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" - } - }, - "fields": [ - { - "fieldPath": "field_foo", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Foo field description" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "varchar(100)", - "recursive": false - }, - { - "fieldPath": "field_bar", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Bar field 
description" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "boolean", - "recursive": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null - } - }, - { - "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Legacy" }] - } + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER", + "source": null + }, + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.InstitutionalMemory": { + "elements": [ + { + "url": "https://www.linkedin.com", + "description": "Sample doc", + "createStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "SampleHiveSchema", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + 
"com.linkedin.pegasus2avro.schema.KafkaSchema": { + "documentSchema": "{\"type\":\"record\",\"name\":\"SampleHiveSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Hive dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" + } + }, + "fields": [ + { + "fieldPath": "field_foo", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Foo field description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} } - ] + }, + "nativeDataType": "varchar(100)", + "recursive": false + }, + { + "fieldPath": "field_bar", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Bar field description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null } - }, - "proposedDelta": null + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [{ "tag": "urn:li:tag:Legacy" }] + } + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:jdoe", - "type": "DATAOWNER", - "source": null - }, - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)", - "type": "TRANSFORMED" - } - ] - } - }, - { - 
"com.linkedin.pegasus2avro.common.InstitutionalMemory": { - "elements": [ - { - "url": "https://www.linkedin.com", - "description": "Sample doc", - "createStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "SampleHiveSchema", - "platform": "urn:li:dataPlatform:hive", - "version": 0, - "created": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.KafkaSchema": { - "documentSchema": "{\"type\":\"record\",\"name\":\"SampleHiveSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Hive dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" - } - }, - "fields": [ - { - "fieldPath": "event_name", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Name of your logging event" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "varchar(100)", - "recursive": false - }, - { - "fieldPath": "event_data", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Data of your event" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "boolean", - "recursive": false - }, - { - "fieldPath": "timestamp", - "jsonPath": null, - "nullable": false, - "description": { - "string": "TS the event was ingested" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "boolean", - "recursive": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null - } + "proposedDelta": null + }, + { + 
"auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER", + "source": null + }, + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.InstitutionalMemory": { + "elements": [ + { + "url": "https://www.linkedin.com", + "description": "Sample doc", + "createStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "SampleHiveSchema", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.KafkaSchema": { + "documentSchema": "{\"type\":\"record\",\"name\":\"SampleHiveSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Hive dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" + } + }, + "fields": [ + { + "fieldPath": "event_name", + "jsonPath": null, + "nullable": 
false, + "description": { + "string": "Name of your logging event" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} } - ] + }, + "nativeDataType": "varchar(100)", + "recursive": false + }, + { + "fieldPath": "event_data", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Data of your event" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false + }, + { + "fieldPath": "timestamp", + "jsonPath": null, + "nullable": false, + "description": { + "string": "TS the event was ingested" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:jdoe", - "type": "DATAOWNER", - "source": null - }, - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", - "type": "TRANSFORMED" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.InstitutionalMemory": { - "elements": [ - { - "url": "https://www.linkedin.com", - "description": "Sample doc", - "createStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - 
"impersonator": null - } - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "SampleHiveSchema", - "platform": "urn:li:dataPlatform:hive", - "version": 0, - "created": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.KafkaSchema": { - "documentSchema": "{\"type\":\"record\",\"name\":\"SampleHiveSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Hive dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" - } - }, - "fields": [ - { - "fieldPath": "user_id", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Id of the user created" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "varchar(100)", - "recursive": false - }, - { - "fieldPath": "user_name", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Name of the user who signed up" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "boolean", - "recursive": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null - } + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER", + "source": null + }, + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": 
"urn:li:corpuser:jdoe", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.InstitutionalMemory": { + "elements": [ + { + "url": "https://www.linkedin.com", + "description": "Sample doc", + "createStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "SampleHiveSchema", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.KafkaSchema": { + "documentSchema": "{\"type\":\"record\",\"name\":\"SampleHiveSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Hive dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" + } + }, + "fields": [ + { + "fieldPath": "user_id", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Id of the user created" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} } - ] + }, + "nativeDataType": "varchar(100)", + "recursive": false + }, + { + "fieldPath": "user_name", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Name of the user who signed up" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false + } + ], + 
"primaryKeys": null, + "foreignKeysSpecs": null } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:jdoe", - "type": "DATAOWNER", - "source": null - }, - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", - "type": "TRANSFORMED" - }, - { - "auditStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)", - "type": "TRANSFORMED" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.InstitutionalMemory": { - "elements": [ - { - "url": "https://www.linkedin.com", - "description": "Sample doc", - "createStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "SampleHiveSchema", - "platform": "urn:li:dataPlatform:hive", - "version": 0, - "created": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.KafkaSchema": { - "documentSchema": 
"{\"type\":\"record\",\"name\":\"SampleHiveSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Hive dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" - } - }, - "fields": [ - { - "fieldPath": "user_id", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Id of the user deleted" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "varchar(100)", - "recursive": false - }, - { - "fieldPath": "user_name", - "jsonPath": null, - "nullable": false, - "description": { - "string": "Name of the user who was deleted" - }, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "boolean", - "recursive": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null - } + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER", + "source": null + }, + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)", + "type": "TRANSFORMED" + } + ] + 
} + }, + { + "com.linkedin.pegasus2avro.common.InstitutionalMemory": { + "elements": [ + { + "url": "https://www.linkedin.com", + "description": "Sample doc", + "createStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "SampleHiveSchema", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.KafkaSchema": { + "documentSchema": "{\"type\":\"record\",\"name\":\"SampleHiveSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Hive dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" + } + }, + "fields": [ + { + "fieldPath": "user_id", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Id of the user deleted" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} } - ] + }, + "nativeDataType": "varchar(100)", + "recursive": false + }, + { + "fieldPath": "user_name", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Name of the user who was deleted" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { 
- "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:datahub", - "impersonator": null - } - } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "name": "User Creations", - "description": "Constructs the fct_users_created from logging_events", - "type": "SQL", - "flowUrn": "urn:li:dataFlow:(airflow,dag_abc,PROD)" - } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)" - ], - "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)" - ] - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:datahub", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "name": "User Creations", + "description": "Constructs the fct_users_created from logging_events", + "type": "SQL", + "flowUrn": "urn:li:dataFlow:(airflow,dag_abc,PROD)" + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)" + ] } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)", - "aspects": [ - { - 
"com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:datahub", - "impersonator": null - } - } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "name": "User Deletions", - "description": "Constructs the fct_users_deleted from logging_events", - "type": "SQL", - "flowUrn": "urn:li:dataFlow:(airflow,dag_abc,PROD)" - } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)" - ], - "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)" - ], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)" - ] - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:datahub", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "name": "User Deletions", + "description": "Constructs the fct_users_deleted from logging_events", + "type": "SQL", + "flowUrn": "urn:li:dataFlow:(airflow,dag_abc,PROD)" } - }, - "proposedDelta": null + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)" + ] + } 
+ } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { - "urn": "urn:li:dataFlow:(airflow,dag_abc,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:datahub", - "impersonator": null - } - } - }, - { - "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { - "name": "Users", - "description": "Constructs the fct_users_deleted and fct_users_created tables", - "project": null - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(airflow,dag_abc,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:datahub", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "name": "Users", + "description": "Constructs the fct_users_deleted and fct_users_created tables", + "project": null } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { - "urn": "urn:li:chart:(looker,baz1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.chart.ChartInfo": { - "title": "Baz Chart 1", - "description": "Baz Chart 1", - "lastModified": { - "created": { - "time": 0, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:datahub", - "impersonator": null - }, - "deleted": null - }, - "chartUrl": null, - "inputs": ["urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)"], 
- "type": null, - "access": null, - "lastRefreshed": null - } - }, - { - "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Legacy" }] - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(looker,baz1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "title": "Baz Chart 1", + "description": "Baz Chart 1", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:datahub", + "impersonator": null + }, + "deleted": null + }, + "chartUrl": null, + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)" + ], + "type": null, + "access": null, + "lastRefreshed": null } - }, - "proposedDelta": null + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [{ "tag": "urn:li:tag:Legacy" }] + } + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { - "urn": "urn:li:chart:(looker,baz2)", - "aspects": [ - { - "com.linkedin.pegasus2avro.chart.ChartInfo": { - "title": "Baz Chart 2", - "description": "Baz Chart 2", - "lastModified": { - "created": { - "time": 0, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:datahub", - "impersonator": null - }, - "deleted": null - }, - "chartUrl": null, - "inputs": { - "array": [ - { - "string": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)" - } - ] - }, - "type": null, - "access": null, - "lastRefreshed": null - } - } + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(looker,baz2)", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.chart.ChartInfo": { + "title": "Baz Chart 2", + "description": "Baz Chart 2", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:datahub", + "impersonator": null + }, + "deleted": null + }, + "chartUrl": null, + "inputs": { + "array": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)" + } ] + }, + "type": null, + "access": null, + "lastRefreshed": null } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { - "urn": "urn:li:dashboard:(looker,baz)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpGroup:bfoo", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - }, - { - "com.linkedin.pegasus2avro.dashboard.DashboardInfo": { - "title": "Baz Dashboard", - "description": "Baz Dashboard", - "customProperties": { - "prop1": "fakeprop", - "prop2": "pikachu" - }, - "charts": [ - "urn:li:chart:(looker,baz1)", - "urn:li:chart:(looker,baz2)" - ], - "lastModified": { - "created": { - "time": 0, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:datahub", - "impersonator": null - }, - "deleted": null - }, - "dashboardUrl": null, - "access": null, - "lastRefreshed": null - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { + "urn": "urn:li:dashboard:(looker,baz)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpGroup:bfoo", + "type": "DATAOWNER", + "source": null + } + ], + 
"lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } } - }, - "proposedDelta": null + }, + { + "com.linkedin.pegasus2avro.dashboard.DashboardInfo": { + "title": "Baz Dashboard", + "description": "Baz Dashboard", + "customProperties": { + "prop1": "fakeprop", + "prop2": "pikachu" + }, + "charts": [ + "urn:li:chart:(looker,baz1)", + "urn:li:chart:(looker,baz2)" + ], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:datahub", + "impersonator": null + }, + "deleted": null + }, + "dashboardUrl": null, + "access": null, + "lastRefreshed": null + } + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.MLModelSnapshot": { - "urn": "urn:li:mlModel:(urn:li:dataPlatform:science,scienceModel,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:jdoe", - "type": "DATAOWNER" - }, - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:jdoe" - } - } - }, - { - "com.linkedin.pegasus2avro.ml.metadata.MLModelProperties": { - "description": "A sample model for predicting some outcome.", - "date": null, - "tags": ["Sample"], - "version": null, - "type": "Naive Bayes classifier" - } - }, - { - "com.linkedin.pegasus2avro.ml.metadata.TrainingData": { - "trainingData": [ - { - "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,pageViewsHive,PROD)", - "motivation": "For science!", - "preProcessing": [ - "Aggregation" - ] - } - ] - } - }, - { - "com.linkedin.pegasus2avro.ml.metadata.EvaluationData": { - "evaluationData": [ - { - "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,pageViewsHive,PROD)" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.InstitutionalMemory": { - "elements": [ - { - "url": 
"https://www.linkedin.com", - "description": "Sample doc", - "createStamp": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - ] - } - }, - { - "com.linkedin.pegasus2avro.ml.metadata.IntendedUse": { - "primaryUses" : [ - "Sample Model", - "Primary Use" - ], - "primaryUsers": [ - "ENTERPRISE" - ], - "outOfScopeUses": [ - "Production Deployment" - ] - } - }, - { - "com.linkedin.pegasus2avro.ml.metadata.Metrics": { - "performanceMeasures": ["performanceMeasures"], - "decisionThreshold": ["decisionThreshold"] - } - }, - { - "com.linkedin.pegasus2avro.ml.metadata.EthicalConsiderations": { - "data": ["data"], - "humanLife": ["humanLife"], - "mitigations": ["mitigations"], - "risksAndHarms": ["risksAndHarms"], - "useCases": ["useCases"] - } - }, - { - "com.linkedin.pegasus2avro.ml.metadata.CaveatsAndRecommendations": { - "recommendations": "recommendations", - "idealDatasetCharacteristics": [ - "idealDatasetCharacteristics" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "boolean" : false - } - }, - { - "com.linkedin.pegasus2avro.common.Cost": { - "costType": "ORG_COST_TYPE", - "cost": { - "fieldDiscriminator": "costCode", - "costCode": "sampleCostCode" - } - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLModelSnapshot": { + "urn": "urn:li:mlModel:(urn:li:dataPlatform:science,scienceModel,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER" + }, + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:jdoe" + } + } + }, + { + "com.linkedin.pegasus2avro.ml.metadata.MLModelProperties": { + "description": "A sample model for predicting some outcome.", + "date": null, + "tags": ["Sample"], + "version": null, + "type": "Naive Bayes classifier" } - }, - 
"proposedDelta": null + }, + { + "com.linkedin.pegasus2avro.ml.metadata.TrainingData": { + "trainingData": [ + { + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,pageViewsHive,PROD)", + "motivation": "For science!", + "preProcessing": ["Aggregation"] + } + ] + } + }, + { + "com.linkedin.pegasus2avro.ml.metadata.EvaluationData": { + "evaluationData": [ + { + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,pageViewsHive,PROD)" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.InstitutionalMemory": { + "elements": [ + { + "url": "https://www.linkedin.com", + "description": "Sample doc", + "createStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + ] + } + }, + { + "com.linkedin.pegasus2avro.ml.metadata.IntendedUse": { + "primaryUses": ["Sample Model", "Primary Use"], + "primaryUsers": ["ENTERPRISE"], + "outOfScopeUses": ["Production Deployment"] + } + }, + { + "com.linkedin.pegasus2avro.ml.metadata.Metrics": { + "performanceMeasures": ["performanceMeasures"], + "decisionThreshold": ["decisionThreshold"] + } + }, + { + "com.linkedin.pegasus2avro.ml.metadata.EthicalConsiderations": { + "data": ["data"], + "humanLife": ["humanLife"], + "mitigations": ["mitigations"], + "risksAndHarms": ["risksAndHarms"], + "useCases": ["useCases"] + } + }, + { + "com.linkedin.pegasus2avro.ml.metadata.CaveatsAndRecommendations": { + "recommendations": "recommendations", + "idealDatasetCharacteristics": ["idealDatasetCharacteristics"] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "boolean": false + } + }, + { + "com.linkedin.pegasus2avro.common.Cost": { + "costType": "ORG_COST_TYPE", + "cost": { + "fieldDiscriminator": "costCode", + "costCode": "sampleCostCode" + } + } + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { - "urn": "urn:li:tag:Legacy", - "aspects": [ - { - "com.linkedin.pegasus2avro.tag.TagProperties": { - "name": 
"Legacy", - "description": "Indicates the dataset is no longer supported" - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:jdoe", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { + "urn": "urn:li:tag:Legacy", + "aspects": [ + { + "com.linkedin.pegasus2avro.tag.TagProperties": { + "name": "Legacy", + "description": "Indicates the dataset is no longer supported" } - }, - "proposedDelta": null + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { - "urn": "urn:li:tag:NeedsDocumentation", - "aspects": [ - { - "com.linkedin.pegasus2avro.tag.TagProperties": { - "name": "NeedsDocumentation", - "description": "Indicates the data element needs documentation" - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:jdoe", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 1581407189000, - "actor": "urn:li:corpuser:jdoe", - "impersonator": null - } - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { + "urn": "urn:li:tag:NeedsDocumentation", + "aspects": [ + { + "com.linkedin.pegasus2avro.tag.TagProperties": { + "name": "NeedsDocumentation", + "description": "Indicates the data element needs documentation" + } + }, + { + 
"com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:adlsGen1", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": "/", - "name": "adlsGen1", - "displayName": "Azure Data Lake (Gen 1)", - "type": "FILE_SYSTEM", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/adlslogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:adlsGen1", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": "/", + "name": "adlsGen1", + "displayName": "Azure Data Lake (Gen 1)", + "type": "FILE_SYSTEM", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/adlslogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:adlsGen2", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": "/", - "name": "adlsGen2", - "displayName": "Azure Data Lake (Gen 2)", - "type": "FILE_SYSTEM", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/adlslogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + 
"com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:adlsGen2", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": "/", + "name": "adlsGen2", + "displayName": "Azure Data Lake (Gen 2)", + "type": "FILE_SYSTEM", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/adlslogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:ambry", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "ambry", - "displayName": "Ambry", - "type": "OBJECT_STORE" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:ambry", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "ambry", + "displayName": "Ambry", + "type": "OBJECT_STORE" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:couchbase", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "couchbase", - "displayName": "Couchbase", - "type": "KEY_VALUE_STORE", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/couchbaselogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:couchbase", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "couchbase", + "displayName": "Couchbase", + "type": "KEY_VALUE_STORE", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/couchbaselogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:external", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "external", - "displayName": "External Source", - "type": "OTHERS" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:external", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "external", + "displayName": "External Source", + "type": "OTHERS" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:hdfs", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": "/", - "name": "hdfs", - "displayName": "HDFS", - "type": "FILE_SYSTEM", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/hadooplogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:hdfs", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": "/", + "name": "hdfs", + "displayName": "HDFS", + "type": "FILE_SYSTEM", + "logoUrl": 
"https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/hadooplogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:hive", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "hive", - "displayName": "Hive", - "type": "FILE_SYSTEM", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/hivelogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:hive", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "hive", + "displayName": "Hive", + "type": "FILE_SYSTEM", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/hivelogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:s3", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": "/", - "name": "s3", - "displayName": "AWS S3", - "type": "FILE_SYSTEM", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/s3.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:s3", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": "/", + "name": "s3", + "displayName": "AWS S3", + "type": "FILE_SYSTEM", + "logoUrl": 
"https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/s3.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:kafka", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "kafka", - "displayName": "Kafka", - "type": "MESSAGE_BROKER", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/kafkalogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:kafka", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "kafka", + "displayName": "Kafka", + "type": "MESSAGE_BROKER", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/kafkalogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:kusto", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "kusto", - "displayName": "Kusto", - "type": "OLAP_DATASTORE", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/kustologo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:kusto", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "kusto", + "displayName": "Kusto", + "type": 
"OLAP_DATASTORE", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/kustologo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:mongodb", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "mongodb", - "displayName": "MongoDB", - "type": "KEY_VALUE_STORE", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/mongodblogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:mongodb", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "mongodb", + "displayName": "MongoDB", + "type": "KEY_VALUE_STORE", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/mongodblogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:mysql", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "mysql", - "displayName": "MySQL", - "type": "RELATIONAL_DB", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/mysqllogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:mysql", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": 
"mysql", + "displayName": "MySQL", + "type": "RELATIONAL_DB", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/mysqllogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:oracle", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "oracle", - "displayName": "Oracle", - "type": "RELATIONAL_DB", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/oraclelogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:oracle", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "oracle", + "displayName": "Oracle", + "type": "RELATIONAL_DB", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/oraclelogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:pinot", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "pinot", - "displayName": "Pinot", - "type": "OLAP_DATASTORE", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/pinotlogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:pinot", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + 
"datasetNameDelimiter": ".", + "name": "pinot", + "displayName": "Pinot", + "type": "OLAP_DATASTORE", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/pinotlogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:postgres", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "postgres", - "displayName": "PostgreSQL", - "type": "RELATIONAL_DB", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/postgreslogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:postgres", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "postgres", + "displayName": "PostgreSQL", + "type": "RELATIONAL_DB", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/postgreslogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:presto", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "presto", - "displayName": "Presto", - "type": "QUERY_ENGINE", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/prestologo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:presto", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "presto", + "displayName": "Presto", + "type": "QUERY_ENGINE", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/prestologo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:teradata", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "teradata", - "displayName": "Teradata", - "type": "RELATIONAL_DB", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/teradatalogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:teradata", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "teradata", + "displayName": "Teradata", + "type": "RELATIONAL_DB", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/teradatalogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:voldemort", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "voldemort", - "displayName": "Voldemort", - "type": "KEY_VALUE_STORE" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:voldemort", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "voldemort", + "displayName": "Voldemort", + "type": "KEY_VALUE_STORE" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:snowflake", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "snowflake", - "displayName": "Snowflake", - "type": "RELATIONAL_DB", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/snowflakelogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:snowflake", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "snowflake", + "displayName": "Snowflake", + "type": "RELATIONAL_DB", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/snowflakelogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:redshift", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "redshift", - "displayName": "Redshift", - "type": "RELATIONAL_DB", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/redshiftlogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:redshift", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "redshift", + "displayName": "Redshift", + "type": "RELATIONAL_DB", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/redshiftlogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:mssql", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "mssql", - "displayName": "SQL Server", - "type": "RELATIONAL_DB", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/mssqllogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:mssql", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "mssql", + "displayName": "SQL Server", + "type": "RELATIONAL_DB", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/mssqllogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:bigquery", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "bigquery", - "displayName": "BigQuery", - "type": "RELATIONAL_DB", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/bigquerylogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + 
"urn": "urn:li:dataPlatform:bigquery", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "bigquery", + "displayName": "BigQuery", + "type": "RELATIONAL_DB", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/bigquerylogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:druid", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "druid", - "displayName": "Druid", - "type": "OLAP_DATASTORE", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/druidlogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:druid", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "druid", + "displayName": "Druid", + "type": "OLAP_DATASTORE", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/druidlogo.png" } - }, - "proposedDelta": null + } + ] + } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { - "urn": "urn:li:dataPlatform:looker", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { - "datasetNameDelimiter": ".", - "name": "looker", - "displayName": "Looker", - "type": "OTHERS", - "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/lookerlogo.png" - } - } - ] + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + 
"com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:looker", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "looker", + "displayName": "Looker", + "type": "OTHERS", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/lookerlogo.png" + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:feast", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "feast", + "displayName": "Feast", + "type": "OTHERS", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/feastlogo.png" + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:sagemaker", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "sagemaker", + "displayName": "SageMaker", + "type": "OTHERS", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/sagemakerlogo.png" } - }, - "proposedDelta": null - } + } + ] + } + }, + "proposedDelta": null + } ] diff --git a/metadata-ingestion/examples/mce_files/data_platforms.json b/metadata-ingestion/examples/mce_files/data_platforms.json index d80668a7f75348..c400dc6c401214 100644 --- a/metadata-ingestion/examples/mce_files/data_platforms.json +++ b/metadata-ingestion/examples/mce_files/data_platforms.json @@ -475,5 +475,45 @@ } }, "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + 
"urn": "urn:li:dataPlatform:feast", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "feast", + "displayName": "Feast", + "type": "OTHERS", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/feastlogo.png" + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:sagemaker", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": ".", + "name": "sagemaker", + "displayName": "SageMaker", + "type": "OTHERS", + "logoUrl": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/sagemakerlogo.png" + } + } + ] + } + }, + "proposedDelta": null } ] diff --git a/metadata-ingestion/examples/recipes/file_to_datahub_rest.yml b/metadata-ingestion/examples/recipes/file_to_datahub_rest.yml index 75c14800ce0c52..d8798a9b2e5987 100644 --- a/metadata-ingestion/examples/recipes/file_to_datahub_rest.yml +++ b/metadata-ingestion/examples/recipes/file_to_datahub_rest.yml @@ -7,4 +7,4 @@ source: sink: type: "datahub-rest" config: - server: 'http://localhost:8080' + server: "http://localhost:8080" diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 52a665b24e7cf0..cfd504fd234c27 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -73,6 +73,11 @@ def make_ml_feature_table_urn(platform: str, feature_table_name: str) -> str: ) +def make_ml_model_urn(platform: str, model_name: str, env: str) -> str: + + return f"urn:li:mlModel:(urn:li:dataPlatform:{platform},{model_name},{env})" + + def make_lineage_mce( upstream_urns: List[str], downstream_urn: str, diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/aws_common.py b/metadata-ingestion/src/datahub/ingestion/source/aws_common.py index 5ce8a7eed1f1f1..dd5116e13d0abc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws_common.py @@ -86,3 +86,19 @@ def get_client(self, service: str) -> boto3.client: ) else: return boto3.client(service, region_name=self.aws_region) + + +def make_s3_urn(s3_uri: str, env: str, suffix: Optional[str] = None) -> str: + + if not s3_uri.startswith("s3://"): + raise ValueError("S3 URIs should begin with 's3://'") + # remove S3 prefix (s3://) + s3_name = s3_uri[5:] + + if s3_name.endswith("/"): + s3_name = s3_name[:-1] + + if suffix is not None: + return f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name}_{suffix},{env})" + + return f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name},{env})" diff --git a/metadata-ingestion/src/datahub/ingestion/source/glue.py b/metadata-ingestion/src/datahub/ingestion/source/glue.py index 0c8c1e9329726e..760c1d90ca2e0f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/glue.py @@ -10,7 +10,7 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws_common import AwsSourceConfig +from datahub.ingestion.source.aws_common import AwsSourceConfig, make_s3_urn from datahub.metadata.com.linkedin.pegasus2avro.common import Status from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent @@ -142,15 +142,11 @@ def get_dataflow_s3_names( # if data object is S3 bucket if node_args.get("connection_type") == "s3": - # remove S3 prefix (s3://) - s3_name = node_args["connection_options"]["path"][5:] - - if 
s3_name.endswith("/"): - s3_name = s3_name[:-1] + s3_uri = node_args["connection_options"]["path"] extension = node_args.get("format") - yield s3_name, extension + yield s3_uri, extension def process_dataflow_node( self, @@ -179,20 +175,18 @@ def process_dataflow_node( # if data object is S3 bucket elif node_args.get("connection_type") == "s3": - # remove S3 prefix (s3://) - s3_name = node_args["connection_options"]["path"][5:] - - if s3_name.endswith("/"): - s3_name = s3_name[:-1] + s3_uri = node_args["connection_options"]["path"] # append S3 format if different ones exist - if len(s3_formats[s3_name]) > 1: - node_urn = f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name}_{node_args.get('format')},{self.env})" + if len(s3_formats[s3_uri]) > 1: + node_urn = make_s3_urn( + s3_uri, + self.env, + suffix=node_args.get("format"), + ) else: - node_urn = ( - f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name},{self.env})" - ) + node_urn = make_s3_urn(s3_uri, self.env) dataset_snapshot = DatasetSnapshot( urn=node_urn, @@ -235,7 +229,7 @@ def process_dataflow_graph( self, dataflow_graph: Dict[str, Any], flow_urn: str, - s3_names: typing.DefaultDict[str, Set[Union[str, None]]], + s3_formats: typing.DefaultDict[str, Set[Union[str, None]]], ) -> Tuple[Dict[str, Dict[str, Any]], List[str], List[MetadataChangeEvent]]: """ Prepare a job's DAG for ingestion. @@ -245,6 +239,8 @@ def process_dataflow_graph( Job DAG returned from get_dataflow_graph() flow_urn: URN of the flow (i.e. the AWS Glue job itself). 
+ s3_formats: + Map from s3 URIs to formats used (for deduplication purposes) """ new_dataset_ids: List[str] = [] @@ -256,7 +252,7 @@ def process_dataflow_graph( for node in dataflow_graph["DagNodes"]: nodes[node["Id"]] = self.process_dataflow_node( - node, flow_urn, new_dataset_ids, new_dataset_mces, s3_names + node, flow_urn, new_dataset_ids, new_dataset_mces, s3_formats ) # traverse edges to fill in node properties diff --git a/metadata-ingestion/src/datahub/ingestion/source/sagemaker.py b/metadata-ingestion/src/datahub/ingestion/source/sagemaker.py index 3bc87053e4b6a8..3ebf9ba8adbb17 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sagemaker.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sagemaker.py @@ -1,42 +1,17 @@ -from dataclasses import dataclass -from dataclasses import field as dataclass_field -from typing import Any, Dict, Iterable, List +from typing import Iterable -import datahub.emitter.mce_builder as builder from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.source import Source, SourceReport +from datahub.ingestion.api.source import Source from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws_common import AwsSourceConfig -from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType -from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import ( - MLFeatureSnapshot, - MLFeatureTableSnapshot, - MLPrimaryKeySnapshot, +from datahub.ingestion.source.sagemaker_processors.common import ( + SagemakerSourceConfig, + SagemakerSourceReport, ) -from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent -from datahub.metadata.schema_classes import ( - MLFeaturePropertiesClass, - MLFeatureTablePropertiesClass, - MLPrimaryKeyPropertiesClass, +from datahub.ingestion.source.sagemaker_processors.feature_groups import ( + FeatureGroupProcessor, ) - - -class SagemakerSourceConfig(AwsSourceConfig): - @property - def 
sagemaker_client(self): - return self.get_client("sagemaker") - - -@dataclass -class SagemakerSourceReport(SourceReport): - tables_scanned = 0 - filtered: List[str] = dataclass_field(default_factory=list) - - def report_table_scanned(self) -> None: - self.tables_scanned += 1 - - def report_table_dropped(self, table: str) -> None: - self.filtered.append(table) +from datahub.ingestion.source.sagemaker_processors.jobs import JobProcessor +from datahub.ingestion.source.sagemaker_processors.models import ModelProcessor class SagemakerSource(Source): @@ -55,244 +30,34 @@ def create(cls, config_dict, ctx): config = SagemakerSourceConfig.parse_obj(config_dict) return cls(config, ctx) - def get_all_feature_groups(self) -> List[Dict[str, Any]]: - """ - List all feature groups in SageMaker. - """ - - feature_groups = [] - - # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.list_feature_groups - paginator = self.sagemaker_client.get_paginator("list_feature_groups") - for page in paginator.paginate(): - feature_groups += page["FeatureGroupSummaries"] - - return feature_groups - - def get_feature_group_details(self, feature_group_name: str) -> Dict[str, Any]: - """ - Get details of a feature group (including list of component features). 
- """ - - # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_feature_group - feature_group = self.sagemaker_client.describe_feature_group( - FeatureGroupName=feature_group_name - ) - - # use falsy fallback since AWS stubs require this to be a string in tests - next_token = feature_group.get("NextToken", "") - - # paginate over feature group features - while next_token: - next_features = self.sagemaker_client.describe_feature_group( - FeatureGroupName=feature_group_name, NextToken=next_token - ) - feature_group["FeatureDefinitions"].append( - next_features["FeatureDefinitions"] - ) - next_token = feature_group.get("NextToken", "") - - return feature_group - - def get_feature_group_wu( - self, feature_group_details: Dict[str, Any] - ) -> MetadataWorkUnit: - """ - Generate an MLFeatureTable workunit for a SageMaker feature group. - - Parameters - ---------- - feature_group_details: - ingested SageMaker feature group from get_feature_group_details() - """ - - feature_group_name = feature_group_details["FeatureGroupName"] - - feature_group_snapshot = MLFeatureTableSnapshot( - urn=builder.make_ml_feature_table_urn("sagemaker", feature_group_name), - aspects=[], - ) - - feature_group_snapshot.aspects.append( - MLFeatureTablePropertiesClass( - description=feature_group_details.get("Description"), - # non-primary key features - mlFeatures=[ - builder.make_ml_feature_urn( - feature_group_name, - feature["FeatureName"], - ) - for feature in feature_group_details["FeatureDefinitions"] - if feature["FeatureName"] - != feature_group_details["RecordIdentifierFeatureName"] - ], - mlPrimaryKeys=[ - builder.make_ml_primary_key_urn( - feature_group_name, - feature_group_details["RecordIdentifierFeatureName"], - ) - ], - # additional metadata - customProperties={ - "arn": feature_group_details["FeatureGroupArn"], - "creation_time": str(feature_group_details["CreationTime"]), - "status": 
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """
    Yield workunits for each SageMaker entity type enabled in the source config.

    Feature groups, models and jobs are delegated, in that order, to
    FeatureGroupProcessor, ModelProcessor and JobProcessor respectively.
    """

    # extract feature groups if specified
    if self.source_config.extract_feature_groups:

        feature_group_processor = FeatureGroupProcessor(
            sagemaker_client=self.sagemaker_client, env=self.env, report=self.report
        )
        yield from feature_group_processor.get_workunits()

    # extract models if specified
    if self.source_config.extract_models:

        model_processor = ModelProcessor(
            sagemaker_client=self.sagemaker_client, env=self.env, report=self.report
        )
        yield from model_processor.get_workunits()

    # extract jobs if specified
    # extract_jobs may be a bool or a per-job-type dict. An identity check is used
    # (rather than truthiness) so that an empty dict still enables extraction.
    # None is treated as disabled, consistent with the two flags above, and because
    # JobProcessor.get_all_jobs() only handles True or a dict.
    job_type_filter = self.source_config.extract_jobs
    if job_type_filter is not None and job_type_filter is not False:

        job_processor = JobProcessor(
            sagemaker_client=self.sagemaker_client,
            env=self.env,
            report=self.report,
            job_type_filter=job_type_filter,
        )
        yield from job_processor.get_workunits()
def get_feature_group_details(self, feature_group_name: str) -> Dict[str, Any]:
    """
    Get details of a feature group (including list of component features).

    The first describe_feature_group response is returned, with the
    FeatureDefinitions of every following page merged into its
    "FeatureDefinitions" list.

    Parameters
    ----------
        feature_group_name:
            name of the feature group to describe
    """

    # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_feature_group
    feature_group = self.sagemaker_client.describe_feature_group(
        FeatureGroupName=feature_group_name
    )

    # use falsy fallback since AWS stubs require this to be a string in tests
    next_token = feature_group.get("NextToken", "")

    # paginate over feature group features
    while next_token:
        next_features = self.sagemaker_client.describe_feature_group(
            FeatureGroupName=feature_group_name, NextToken=next_token
        )
        # extend (not append) so the merged list stays flat
        feature_group.setdefault("FeatureDefinitions", []).extend(
            next_features.get("FeatureDefinitions", [])
        )
        # advance using the token of the page just fetched; reading it from the
        # first response would never change and the loop would not terminate
        next_token = next_features.get("NextToken", "")

    return feature_group
def get_feature_type(self, aws_type: str, feature_name: str) -> str:
    """
    Translate a SageMaker feature type into the metadata schema's feature data type.

    Looks ``aws_type`` up in ``field_type_mappings``. When there is no entry,
    a warning keyed by ``feature_name`` is reported and
    ``MLFeatureDataType.UNKNOWN`` is returned.
    """

    known_type = self.field_type_mappings.get(aws_type)

    if known_type is not None:
        return known_type

    # no mapping available: surface it in the report and fall back to UNKNOWN
    self.report.report_warning(
        feature_name, f"unable to map type {aws_type} to metadata schema"
    )
    return MLFeatureDataType.UNKNOWN
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """
    Yield workunits for all SageMaker feature groups.

    For each listed feature group, one workunit is yielded per entry of its
    "FeatureDefinitions", followed by one workunit for the group itself.
    Every workunit is registered on the report before it is yielded.
    """

    for summary in self.get_all_feature_groups():

        details = self.get_feature_group_details(summary["FeatureGroupName"])

        # individual features (and the primary key) come first
        for feature_definition in details["FeatureDefinitions"]:
            self.report.report_feature_scanned()
            feature_wu = self.get_feature_wu(details, feature_definition)
            self.report.report_workunit(feature_wu)
            yield feature_wu

        # then the feature group that contains them
        self.report.report_feature_group_scanned()
        group_wu = self.get_feature_group_wu(details)
        self.report.report_workunit(group_wu)
        yield group_wu
corresponding to job name + describe_name_key: str + # field in job description response corresponding to job ARN + describe_arn_key: str + # field in job description response corresponding to job status + describe_status_key: str + # job-specific mapping from boto3 status strings to DataHub-native enum + status_map: Dict[str, str] + + # name of function for processing job for ingestion + processor: str + + +# map from SageMaker job code to metadata on API access commands, fields, and processors +SAGEMAKER_JOB_TYPES = { + "auto_ml": SageMakerJobType( + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.list_auto_ml_jobs + list_command="list_auto_ml_jobs", + list_key="AutoMLJobSummaries", + list_name_key="AutoMLJobName", + list_arn_key="AutoMLJobArn", + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_auto_ml_job + describe_command="describe_auto_ml_job", + describe_name_key="AutoMLJobName", + describe_arn_key="AutoMLJobArn", + describe_status_key="AutoMLJobStatus", + status_map={ + "Completed": JobStatusClass.COMPLETED, + "InProgress": JobStatusClass.IN_PROGRESS, + "Failed": JobStatusClass.FAILED, + "Stopped": JobStatusClass.STOPPED, + "Stopping": JobStatusClass.STOPPING, + }, + processor="process_auto_ml_job", + ), + "compilation": SageMakerJobType( + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.list_compilation_jobs + list_command="list_compilation_jobs", + list_key="CompilationJobSummaries", + list_name_key="CompilationJobName", + list_arn_key="CompilationJobArn", + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_compilation_job + describe_command="describe_compilation_job", + describe_name_key="CompilationJobName", + describe_arn_key="CompilationJobArn", + describe_status_key="CompilationJobStatus", + 
status_map={ + "INPROGRESS": JobStatusClass.IN_PROGRESS, + "COMPLETED": JobStatusClass.COMPLETED, + "FAILED": JobStatusClass.FAILED, + "STARTING": JobStatusClass.STARTING, + "STOPPING": JobStatusClass.STOPPING, + "STOPPED": JobStatusClass.STOPPED, + }, + processor="process_compilation_job", + ), + "edge_packaging": SageMakerJobType( + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.list_edge_packaging_jobs + list_command="list_edge_packaging_jobs", + list_key="EdgePackagingJobSummaries", + list_name_key="EdgePackagingJobName", + list_arn_key="EdgePackagingJobArn", + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_edge_packaging_job + describe_command="describe_edge_packaging_job", + describe_name_key="EdgePackagingJobName", + describe_arn_key="EdgePackagingJobArn", + describe_status_key="EdgePackagingJobStatus", + status_map={ + "INPROGRESS": JobStatusClass.IN_PROGRESS, + "COMPLETED": JobStatusClass.COMPLETED, + "FAILED": JobStatusClass.FAILED, + "STARTING": JobStatusClass.STARTING, + "STOPPING": JobStatusClass.STOPPING, + "STOPPED": JobStatusClass.STOPPED, + }, + processor="process_edge_packaging_job", + ), + "hyper_parameter_tuning": SageMakerJobType( + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.list_hyper_parameter_tuning_jobs + list_command="list_hyper_parameter_tuning_jobs", + list_key="HyperParameterTuningJobSummaries", + list_name_key="HyperParameterTuningJobName", + list_arn_key="HyperParameterTuningJobArn", + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_hyper_parameter_tuning_job + describe_command="describe_hyper_parameter_tuning_job", + describe_name_key="HyperParameterTuningJobName", + describe_arn_key="HyperParameterTuningJobArn", + describe_status_key="HyperParameterTuningJobStatus", + 
status_map={ + "InProgress": JobStatusClass.IN_PROGRESS, + "Completed": JobStatusClass.COMPLETED, + "Failed": JobStatusClass.FAILED, + "Stopping": JobStatusClass.STOPPING, + "Stopped": JobStatusClass.STOPPED, + }, + processor="process_hyper_parameter_tuning_job", + ), + "labeling": SageMakerJobType( + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.list_labeling_jobs + list_command="list_labeling_jobs", + list_key="LabelingJobSummaryList", + list_name_key="LabelingJobName", + list_arn_key="LabelingJobArn", + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_labeling_job + describe_command="describe_labeling_job", + describe_name_key="LabelingJobName", + describe_arn_key="LabelingJobArn", + describe_status_key="LabelingJobStatus", + status_map={ + "Initializing": JobStatusClass.STARTING, + "InProgress": JobStatusClass.IN_PROGRESS, + "Completed": JobStatusClass.COMPLETED, + "Failed": JobStatusClass.FAILED, + "Stopping": JobStatusClass.STOPPING, + "Stopped": JobStatusClass.STOPPED, + }, + processor="process_labeling_job", + ), + "processing": SageMakerJobType( + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.list_processing_jobs + list_command="list_processing_jobs", + list_key="ProcessingJobSummaries", + list_name_key="ProcessingJobName", + list_arn_key="ProcessingJobArn", + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_processing_job + describe_command="describe_processing_job", + describe_name_key="ProcessingJobName", + describe_arn_key="ProcessingJobArn", + describe_status_key="ProcessingJobStatus", + status_map={ + "InProgress": JobStatusClass.IN_PROGRESS, + "Completed": JobStatusClass.COMPLETED, + "Failed": JobStatusClass.FAILED, + "Stopping": JobStatusClass.STOPPING, + "Stopped": JobStatusClass.STOPPED, 
def make_sagemaker_flow_urn(job_type: str, job_name: str, env: str) -> str:
    """
    Build the data flow URN for a SageMaker job.

    The flow id is the job type and job name joined by a colon, the
    orchestrator is "sagemaker", and ``env`` is passed as the cluster.
    """

    flow_id = f"{job_type}:{job_name}"

    return mce_builder.make_data_flow_urn(
        orchestrator="sagemaker",
        flow_id=flow_id,
        cluster=env,
    )
@dataclass
class SageMakerJob:
    """
    Intermediate job representation for storing result of initial ingestion from raw API response.

    Produced by first-pass ingestion and basis for subsequent extraction.
    """

    # snapshot of the job; JobProcessor.get_workunits() indexes processed jobs by its `urn`
    # and appends the input/output aspect to it before emitting
    job_snapshot: DataJobSnapshotClass
    # job name; combined with job_type to build the per-job flow URN
    job_name: str
    job_arn: str
    # job type passed to make_sagemaker_flow_urn()
    # (presumably a key of SAGEMAKER_JOB_TYPES; confirm against the processors)
    job_type: str
    # datasets read/written by the job: keys are emitted as dataset URNs,
    # values are stringified into the dataset's customProperties
    input_datasets: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    output_datasets: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    # jobs feeding this one; emitted as inputDatajobs
    input_jobs: Set[str] = field(default_factory=set)
    # we resolve output jobs to input ones after processing
    output_jobs: Set[str] = field(default_factory=set)
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """
    Yield workunits for SageMaker jobs and the datasets they reference.

    Runs in three passes:

    1. describe every listed job and run its type-specific processor;
    2. turn each job's output jobs into inputs of those downstream jobs and
       collect all input/output datasets;
    3. after yielding one workunit per dataset, yield a flow workunit and a
       job workunit for every job, in sorted URN order.
    """

    jobs = self.get_all_jobs()

    processed_jobs: Dict[str, SageMakerJob] = {}

    # first pass: process jobs and collect datasets used
    for job in jobs:

        job_type = SAGEMAKER_JOB_TYPES[job["type"]]
        job_name = job[job_type.list_name_key]

        job_details = self.get_job_details(job_name, job["type"])

        processed_job = getattr(self, job_type.processor)(job_details)
        processed_jobs[processed_job.job_snapshot.urn] = processed_job

    all_datasets = {}

    # second pass:
    # - move output jobs to inputs
    # - aggregate i/o datasets
    for job_urn in sorted(processed_jobs):
        processed_job = processed_jobs[job_urn]

        for output_job_urn in processed_job.output_jobs:
            downstream_job = processed_jobs.get(output_job_urn)

            if downstream_job is None:
                # the referenced job was not ingested, so there is nothing
                # to attach the lineage edge to
                self.report.report_warning(
                    job_urn,
                    f"output job {output_job_urn} was not ingested; skipping lineage edge",
                )
                continue

            # the current job is the upstream of its output job
            downstream_job.input_jobs.add(job_urn)

        all_datasets.update(processed_job.input_datasets)
        all_datasets.update(processed_job.output_datasets)

    # yield datasets
    for dataset_urn, dataset in all_datasets.items():

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )
        dataset_snapshot.aspects.append(
            DatasetPropertiesClass(
                customProperties={k: str(v) for k, v in dataset.items()},
                tags=[],
            )
        )
        dataset_mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        dataset_wu = MetadataWorkUnit(
            id=dataset_urn,
            mce=dataset_mce,
        )
        self.report.report_dataset_scanned()
        self.report.report_workunit(dataset_wu)
        yield dataset_wu

    # third pass: construct and yield MCEs
    for job_urn in sorted(processed_jobs):

        processed_job = processed_jobs[job_urn]
        job_snapshot = processed_job.job_snapshot

        flow_urn = make_sagemaker_flow_urn(
            processed_job.job_type, processed_job.job_name, self.env
        )

        # create flow for each job
        flow_mce = MetadataChangeEvent(
            proposedSnapshot=DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    DataFlowInfoClass(
                        name=processed_job.job_name,
                    ),
                ],
            )
        )
        flow_wu = MetadataWorkUnit(
            id=flow_urn,
            mce=flow_mce,
        )
        self.report.report_workunit(flow_wu)
        yield flow_wu

        # sorted for a stable aspect
        job_snapshot.aspects.append(
            DataJobInputOutputClass(
                inputDatasets=sorted(processed_job.input_datasets),
                outputDatasets=sorted(processed_job.output_datasets),
                inputDatajobs=sorted(processed_job.input_jobs),
            )
        )

        job_mce = MetadataChangeEvent(proposedSnapshot=job_snapshot)
        job_wu = MetadataWorkUnit(
            id=job_urn,
            mce=job_mce,
        )
        self.report.report_job_scanned()
        self.report.report_workunit(job_wu)
        yield job_wu
def process_auto_ml_job(self, job: Dict[str, Any]) -> SageMakerJob:
    """
    Process outputs from Boto3 describe_auto_ml_job()

    Every InputDataConfig entry whose S3 data source carries an S3Uri becomes
    an input dataset; OutputDataConfig.S3OutputPath, when present, becomes the
    single output dataset.

    See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_auto_ml_job
    """

    JOB_TYPE = "auto_ml"

    input_datasets = {}

    for channel in job.get("InputDataConfig", []):
        s3_source = channel.get("DataSource", {}).get("S3DataSource")

        # skip channels without an S3 source or without a URI
        if s3_source is None or "S3Uri" not in s3_source:
            continue

        s3_uri = s3_source["S3Uri"]
        input_datasets[make_s3_urn(s3_uri, self.env)] = {
            "dataset_type": "s3",
            "uri": s3_uri,
            "datatype": s3_source.get("S3DataType"),
        }

    output_datasets = {}

    s3_output_path = job.get("OutputDataConfig", {}).get("S3OutputPath")

    if s3_output_path is not None:
        output_datasets[make_s3_urn(s3_output_path, self.env)] = {
            "dataset_type": "s3",
            "uri": s3_output_path,
        }

    snapshot, name, arn = self.create_common_job_snapshot(job, JOB_TYPE)

    return SageMakerJob(
        job_name=name,
        job_arn=arn,
        job_type=JOB_TYPE,
        job_snapshot=snapshot,
        input_datasets=input_datasets,
        output_datasets=output_datasets,
    )


def process_compilation_job(self, job: Dict[str, Any]) -> SageMakerJob:
    """
    Process outputs from Boto3 describe_compilation_job()

    InputConfig.S3Uri (if any) is recorded as the input dataset and
    OutputConfig.S3OutputLocation (if any) as the output dataset, each with
    the framework / target details found alongside it.

    See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_compilation_job
    """

    JOB_TYPE = "compilation"

    input_datasets = {}

    input_config: Optional[Dict[str, Any]] = job.get("InputConfig")

    if input_config is not None and "S3Uri" in input_config:
        model_uri = input_config["S3Uri"]
        input_datasets[make_s3_urn(model_uri, self.env)] = {
            "dataset_type": "s3",
            "uri": model_uri,
            "framework": input_config.get("Framework"),
            "framework_version": input_config.get("FrameworkVersion"),
        }

    output_datasets = {}

    output_config: Optional[Dict[str, Any]] = job.get("OutputConfig")

    if output_config is not None and "S3OutputLocation" in output_config:
        output_uri = output_config["S3OutputLocation"]
        output_datasets[make_s3_urn(output_uri, self.env)] = {
            "dataset_type": "s3",
            "uri": output_uri,
            "target_device": output_config.get("TargetDevice"),
            "target_platform": output_config.get("TargetPlatform"),
        }

    snapshot, name, arn = self.create_common_job_snapshot(job, JOB_TYPE)

    return SageMakerJob(
        job_name=name,
        job_arn=arn,
        job_type=JOB_TYPE,
        job_snapshot=snapshot,
        input_datasets=input_datasets,
        output_datasets=output_datasets,
    )
def process_hyper_parameter_tuning_job(
    self,
    job: Dict[str, Any],
) -> SageMakerJob:
    """
    Process outputs from Boto3 describe_hyper_parameter_tuning_job()

    Each entry of TrainingJobDefinitions is looked up by its DefinitionName
    among the training jobs recorded in self.name_to_arn. Matches are added
    as output jobs; misses are reported as warnings.

    See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_hyper_parameter_tuning_job
    """

    JOB_TYPE = "hyper_parameter_tuning"

    name: str = job["HyperParameterTuningJobName"]
    arn: str = job["HyperParameterTuningJobArn"]

    training_jobs = set()

    for training_job in job.get("TrainingJobDefinitions", []):

        # self.name_to_arn is keyed by (job type, job name)
        full_job_name = ("training", training_job["DefinitionName"])

        if full_job_name not in self.name_to_arn:
            self.report.report_warning(
                name,
                f"Unable to find ARN for training job {training_job['DefinitionName']} produced by hyperparameter tuning job {arn}",
            )
            continue

        training_urn = make_sagemaker_job_urn(
            "training",
            training_job["DefinitionName"],
            self.name_to_arn[full_job_name],
            self.env,
        )
        training_jobs.add(training_urn)

    snapshot, snapshot_name, snapshot_arn = self.create_common_job_snapshot(
        job,
        JOB_TYPE,
    )

    return SageMakerJob(
        job_name=snapshot_name,
        job_arn=snapshot_arn,
        job_type=JOB_TYPE,
        job_snapshot=snapshot,
        output_jobs=training_jobs,
    )
def process_processing_job(self, job: Dict[str, Any]) -> SageMakerJob:
    """
    Process outputs from Boto3 describe_processing_job()

    Collects:
    - upstream jobs referenced by AutoMLJobArn / TrainingJobArn, when their
      ARN is known to self.arn_to_name;
    - S3 inputs listed under ProcessingInputs;
    - S3 and feature-group outputs listed under ProcessingOutputConfig.Outputs.

    See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_processing_job
    """

    JOB_TYPE = "processing"

    input_jobs = set()

    # a processing job may reference the AutoML and/or training job that launched it
    for upstream_arn in (job.get("AutoMLJobArn"), job.get("TrainingJobArn")):

        if upstream_arn is None:
            continue

        upstream_type, upstream_name = self.arn_to_name.get(
            upstream_arn, (None, None)
        )

        # only link jobs whose ARN was seen while listing jobs
        if upstream_type is not None and upstream_name is not None:
            input_jobs.add(
                make_sagemaker_job_urn(
                    upstream_type, upstream_name, upstream_arn, self.env
                )
            )

    input_datasets = {}

    # tolerate responses without any inputs, like the other processors do
    inputs: List[Dict[str, Any]] = job.get("ProcessingInputs", [])

    for input_config in inputs:

        input_name = input_config["InputName"]

        input_s3 = input_config.get("S3Input", {})
        input_s3_uri = input_s3.get("S3Uri")

        if input_s3_uri is not None:

            input_datasets[make_s3_urn(input_s3_uri, self.env)] = {
                "dataset_type": "s3",
                "uri": input_s3_uri,
                "datatype": input_s3.get("S3DataType"),
                "mode": input_s3.get("S3InputMode"),
                "distribution_type": input_s3.get("S3DataDistributionType"),
                "compression": input_s3.get("S3CompressionType"),
                "name": input_name,
            }

        # TODO: ingest Athena and Redshift data sources
        # We don't do this at the moment because we need to parse the QueryString SQL
        # in order to get the tables used (otherwise we just have databases)

        # input_athena = input_config.get("DatasetDefinition", {}).get(
        #     "AthenaDatasetDefinition", {}
        # )

        # input_redshift = input_config.get("DatasetDefinition", {}).get(
        #     "RedshiftDatasetDefinition", {}
        # )

    outputs: List[Dict[str, Any]] = job.get("ProcessingOutputConfig", {}).get(
        "Outputs", []
    )

    output_datasets = {}

    for output in outputs:
        output_name = output["OutputName"]

        output_s3_uri = output.get("S3Output", {}).get("S3Uri")
        if output_s3_uri is not None:
            output_datasets[make_s3_urn(output_s3_uri, self.env)] = {
                "dataset_type": "s3",
                "uri": output_s3_uri,
                "name": output_name,
            }

        output_feature_group = output.get("FeatureStoreOutput", {}).get(
            "FeatureGroupName"
        )
        if output_feature_group is not None:
            output_datasets[
                mce_builder.make_ml_feature_table_urn(
                    "sagemaker", output_feature_group
                )
            ] = {
                "dataset_type": "sagemaker_feature_group",
            }

    job_snapshot, job_name, job_arn = self.create_common_job_snapshot(
        job,
        JOB_TYPE,
    )

    return SageMakerJob(
        job_name=job_name,
        job_arn=job_arn,
        job_type=JOB_TYPE,
        job_snapshot=job_snapshot,
        input_datasets=input_datasets,
        # previously computed but never passed on, so outputs were silently dropped
        output_datasets=output_datasets,
        input_jobs=input_jobs,
    )


def process_training_job(self, job: Dict[str, Any]) -> SageMakerJob:
    """
    Process outputs from Boto3 describe_training_job()

    S3 sources under InputDataConfig become input datasets. The model output
    path, checkpoint location, debugger/profiler/TensorBoard output paths and
    per-rule output paths all become output datasets.

    See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_training_job
    """

    JOB_TYPE = "training"

    input_datasets = {}

    input_data_configs = job.get("InputDataConfig", [])

    for config in input_data_configs:

        data_source = config.get("DataSource", {})

        s3_source = data_source.get("S3DataSource", {})
        s3_uri = s3_source.get("S3Uri")

        if s3_uri is not None:
            input_datasets[make_s3_urn(s3_uri, self.env)] = {
                "dataset_type": "s3",
                "uri": s3_uri,
                # key is "S3DataType" (capital T); the misspelt lookup always gave None
                "datatype": s3_source.get("S3DataType"),
                "distribution_type": s3_source.get("S3DataDistributionType"),
                "attribute_names": s3_source.get("AttributeNames"),
                "channel_name": config.get("ChannelName"),
            }

    output_s3_uri = job.get("OutputDataConfig", {}).get("S3OutputPath")
    checkpoint_s3_uri = job.get("CheckpointConfig", {}).get("S3Uri")
    debug_s3_path = job.get("DebugHookConfig", {}).get("S3OutputPath")
    tensorboard_output_path = job.get("TensorBoardOutputConfig", {}).get(
        "S3OutputPath"
    )
    profiler_output_path = job.get("ProfilerConfig", {}).get("S3OutputPath")

    debug_rule_configs = job.get("DebugRuleConfigurations", [])
    processed_debug_configs = [
        rule_config.get("S3OutputPath") for rule_config in debug_rule_configs
    ]
    profiler_rule_configs = job.get("ProfilerRuleConfigurations", [])
    processed_profiler_configs = [
        rule_config.get("S3OutputPath") for rule_config in profiler_rule_configs
    ]

    output_datasets = {}

    # process all output datasets at once
    # (loop variable deliberately distinct from output_s3_uri to avoid shadowing it)
    for candidate_uri in [
        output_s3_uri,
        checkpoint_s3_uri,
        debug_s3_path,
        tensorboard_output_path,
        profiler_output_path,
        *processed_debug_configs,
        *processed_profiler_configs,
    ]:

        if candidate_uri is not None:
            output_datasets[make_s3_urn(candidate_uri, self.env)] = {
                "dataset_type": "s3",
                "uri": candidate_uri,
            }

    job_snapshot, job_name, job_arn = self.create_common_job_snapshot(
        job,
        JOB_TYPE,
    )

    return SageMakerJob(
        job_name=job_name,
        job_arn=job_arn,
        job_type=JOB_TYPE,
        job_snapshot=job_snapshot,
        input_datasets=input_datasets,
        output_datasets=output_datasets,
    )
@dataclass
class ModelProcessor:
    """
    Turns the models listed by a SageMaker client into metadata work units.
    """

    # client exposing get_paginator("list_models") and describe_model()
    sagemaker_client: Any
    # environment passed through to the model URN builder
    env: str
    # report that receives scanned-model and work-unit notifications
    report: SagemakerSourceReport

    def get_all_models(self) -> List[Dict[str, Any]]:
        """
        List all models in SageMaker.

        Concatenates the "Models" entry of every page returned by the
        list_models paginator.
        """

        # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.list_models
        pages = self.sagemaker_client.get_paginator("list_models").paginate()

        collected = []
        for page in pages:
            collected.extend(page["Models"])

        return collected

    def get_model_details(self, model_name: str) -> Dict[str, Any]:
        """
        Get details of a model.
        """

        # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_model
        details = self.sagemaker_client.describe_model(ModelName=model_name)
        return details

    def get_model_wu(self, model_details: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Build the work unit for one model from its describe_model response.

        ModelName feeds the URN and work-unit id, CreationTime (or the current
        time when absent) feeds the date in milliseconds, and every other key
        is stringified into the custom properties.
        """

        # params to remove since we extract them
        redundant_fields = {"ModelName", "CreationTime"}

        model_urn = builder.make_ml_model_urn(
            "sagemaker", model_details["ModelName"], self.env
        )

        creation_time = model_details.get("CreationTime", datetime.now())
        creation_millis = int(creation_time.timestamp() * 1000)

        extra_properties = {}
        for key, value in model_details.items():
            if key in redundant_fields:
                continue
            extra_properties[key] = str(value)

        properties = MLModelPropertiesClass(
            date=creation_millis,
            customProperties=extra_properties,
        )
        model_snapshot = MLModelSnapshot(urn=model_urn, aspects=[properties])

        # make the MCE and workunit
        return MetadataWorkUnit(
            id=f'{model_details["ModelName"]}',
            mce=MetadataChangeEvent(proposedSnapshot=model_snapshot),
        )

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        """
        Yield one work unit per model, in ascending ModelArn order.
        """

        # sort models for consistency
        ordered_models = sorted(
            self.get_all_models(), key=lambda entry: entry["ModelArn"]
        )

        for listed_model in ordered_models:

            details = self.get_model_details(listed_model["ModelName"])

            self.report.report_model_scanned()
            work_unit = self.get_model_wu(details)
            self.report.report_workunit(work_unit)
            yield work_unit
DataFlowInfo = DataFlowInfoClass @@ -16,4 +17,5 @@ DataJobInputOutput = DataJobInputOutputClass EditableDataFlowProperties = EditableDataFlowPropertiesClass EditableDataJobProperties = EditableDataJobPropertiesClass +JobStatus = JobStatusClass # fmt: on diff --git a/metadata-ingestion/src/datahub/metadata/schema.avsc b/metadata-ingestion/src/datahub/metadata/schema.avsc index 0bfaee0ccfd87d..6b6691c4dae6f2 100644 --- a/metadata-ingestion/src/datahub/metadata/schema.avsc +++ b/metadata-ingestion/src/datahub/metadata/schema.avsc @@ -1607,10 +1607,11 @@ "GLUE" ], "doc": "The various types of support azkaban jobs" - } + }, + "string" ], "name": "type", - "doc": "Datajob type" + "doc": "Datajob type\n**NOTE**: AzkabanJobType is deprecated. Please use strings instead." }, { "Relationship": { @@ -1629,6 +1630,38 @@ "name": "flowUrn", "default": null, "doc": "DataFlow urn that this job is part of" + }, + { + "type": [ + "null", + { + "type": "enum", + "symbolDocs": { + "COMPLETED": "Jobs with successful completion.", + "FAILED": "Jobs that have failed.", + "IN_PROGRESS": "Jobs currently running.", + "STARTING": "Jobs being initialized.", + "STOPPED": "Jobs that have stopped.", + "STOPPING": "Jobs being stopped.", + "UNKNOWN": "Jobs with unknown status (either unmappable or unavailable)" + }, + "name": "JobStatus", + "namespace": "com.linkedin.pegasus2avro.datajob", + "symbols": [ + "STARTING", + "IN_PROGRESS", + "STOPPING", + "STOPPED", + "COMPLETED", + "FAILED", + "UNKNOWN" + ], + "doc": "Job statuses" + } + ], + "name": "status", + "default": null, + "doc": "Status of the job" } ], "doc": "Information about a Data processing job" @@ -3255,6 +3288,15 @@ "name": "MLModelProperties", "namespace": "com.linkedin.pegasus2avro.ml.metadata", "fields": [ + { + "type": { + "type": "map", + "values": "string" + }, + "name": "customProperties", + "default": {}, + "doc": "Custom property bag." 
+ }, { "Searchable": { "fieldType": "TEXT", @@ -4355,7 +4397,8 @@ "com.linkedin.pegasus2avro.common.Ownership", "com.linkedin.pegasus2avro.common.InstitutionalMemory", "com.linkedin.pegasus2avro.common.Status", - "com.linkedin.pegasus2avro.common.Deprecation" + "com.linkedin.pegasus2avro.common.Deprecation", + "com.linkedin.pegasus2avro.common.BrowsePaths" ] }, "name": "aspects", diff --git a/metadata-ingestion/src/datahub/metadata/schema_classes.py b/metadata-ingestion/src/datahub/metadata/schema_classes.py index 523622741a4d54..1578de5523ef3e 100644 --- a/metadata-ingestion/src/datahub/metadata/schema_classes.py +++ b/metadata-ingestion/src/datahub/metadata/schema_classes.py @@ -1873,11 +1873,12 @@ class DataJobInfoClass(DictWrapper): RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.datajob.DataJobInfo") def __init__(self, name: str, - type: Union[str, "AzkabanJobTypeClass"], + type: Union[Union[str, "AzkabanJobTypeClass"], str], customProperties: Optional[Dict[str, str]]=None, externalUrl: Union[None, str]=None, description: Union[None, str]=None, flowUrn: Union[None, str]=None, + status: Union[None, Union[str, "JobStatusClass"]]=None, ): super().__init__() @@ -1891,6 +1892,7 @@ def __init__(self, self.description = description self.type = type self.flowUrn = flowUrn + self.status = status @classmethod def construct_with_defaults(cls) -> "DataJobInfoClass": @@ -1906,6 +1908,7 @@ def _restore_defaults(self) -> None: self.description = self.RECORD_SCHEMA.field_map["description"].default self.type = AzkabanJobTypeClass.COMMAND self.flowUrn = self.RECORD_SCHEMA.field_map["flowUrn"].default + self.status = self.RECORD_SCHEMA.field_map["status"].default @property @@ -1953,13 +1956,15 @@ def description(self, value: Union[None, str]) -> None: @property - def type(self) -> Union[str, "AzkabanJobTypeClass"]: - """Getter: Datajob type""" + def type(self) -> Union[Union[str, "AzkabanJobTypeClass"], str]: + """Getter: Datajob type + **NOTE**: AzkabanJobType is 
deprecated. Please use strings instead.""" return self._inner_dict.get('type') # type: ignore @type.setter - def type(self, value: Union[str, "AzkabanJobTypeClass"]) -> None: - """Setter: Datajob type""" + def type(self, value: Union[Union[str, "AzkabanJobTypeClass"], str]) -> None: + """Setter: Datajob type + **NOTE**: AzkabanJobType is deprecated. Please use strings instead.""" self._inner_dict['type'] = value @@ -1974,6 +1979,17 @@ def flowUrn(self, value: Union[None, str]) -> None: self._inner_dict['flowUrn'] = value + @property + def status(self) -> Union[None, Union[str, "JobStatusClass"]]: + """Getter: Status of the job""" + return self._inner_dict.get('status') # type: ignore + + @status.setter + def status(self, value: Union[None, Union[str, "JobStatusClass"]]) -> None: + """Setter: Status of the job""" + self._inner_dict['status'] = value + + class DataJobInputOutputClass(DictWrapper): """Information about the inputs and outputs of a Data processing job""" @@ -2203,6 +2219,32 @@ def description(self, value: Union[None, str]) -> None: self._inner_dict['description'] = value +class JobStatusClass(object): + """Job statuses""" + + + """Jobs being initialized.""" + STARTING = "STARTING" + + """Jobs currently running.""" + IN_PROGRESS = "IN_PROGRESS" + + """Jobs being stopped.""" + STOPPING = "STOPPING" + + """Jobs that have stopped.""" + STOPPED = "STOPPED" + + """Jobs with successful completion.""" + COMPLETED = "COMPLETED" + + """Jobs that have failed.""" + FAILED = "FAILED" + + """Jobs with unknown status (either unmappable or unavailable)""" + UNKNOWN = "UNKNOWN" + + class AzkabanJobTypeClass(object): """The various types of support azkaban jobs""" @@ -4663,7 +4705,7 @@ class MLFeatureTableSnapshotClass(DictWrapper): RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureTableSnapshot") def __init__(self, urn: str, - aspects: List[Union["MLFeatureTableKeyClass", "MLFeatureTablePropertiesClass", "OwnershipClass", 
"InstitutionalMemoryClass", "StatusClass", "DeprecationClass"]], + aspects: List[Union["MLFeatureTableKeyClass", "MLFeatureTablePropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass", "BrowsePathsClass"]], ): super().__init__() @@ -4694,12 +4736,12 @@ def urn(self, value: str) -> None: @property - def aspects(self) -> List[Union["MLFeatureTableKeyClass", "MLFeatureTablePropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass"]]: + def aspects(self) -> List[Union["MLFeatureTableKeyClass", "MLFeatureTablePropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass", "BrowsePathsClass"]]: """Getter: The list of metadata aspects associated with the MLFeatureTable. Depending on the use case, this can either be all, or a selection, of supported aspects.""" return self._inner_dict.get('aspects') # type: ignore @aspects.setter - def aspects(self, value: List[Union["MLFeatureTableKeyClass", "MLFeatureTablePropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass"]]) -> None: + def aspects(self, value: List[Union["MLFeatureTableKeyClass", "MLFeatureTablePropertiesClass", "OwnershipClass", "InstitutionalMemoryClass", "StatusClass", "DeprecationClass", "BrowsePathsClass"]]) -> None: """Setter: The list of metadata aspects associated with the MLFeatureTable. 
Depending on the use case, this can either be all, or a selection, of supported aspects.""" self._inner_dict['aspects'] = value @@ -5494,6 +5536,7 @@ class MLModelPropertiesClass(DictWrapper): RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.ml.metadata.MLModelProperties") def __init__(self, + customProperties: Optional[Dict[str, str]]=None, description: Union[None, str]=None, date: Union[None, int]=None, version: Union[None, "VersionTagClass"]=None, @@ -5504,6 +5547,11 @@ def __init__(self, ): super().__init__() + if customProperties is None: + # default: {} + self.customProperties = dict() + else: + self.customProperties = customProperties self.description = description self.date = date self.version = version @@ -5524,6 +5572,7 @@ def construct_with_defaults(cls) -> "MLModelPropertiesClass": return self def _restore_defaults(self) -> None: + self.customProperties = dict() self.description = self.RECORD_SCHEMA.field_map["description"].default self.date = self.RECORD_SCHEMA.field_map["date"].default self.version = self.RECORD_SCHEMA.field_map["version"].default @@ -5533,6 +5582,17 @@ def _restore_defaults(self) -> None: self.tags = list() + @property + def customProperties(self) -> Dict[str, str]: + """Getter: Custom property bag.""" + return self._inner_dict.get('customProperties') # type: ignore + + @customProperties.setter + def customProperties(self, value: Dict[str, str]) -> None: + """Setter: Custom property bag.""" + self._inner_dict['customProperties'] = value + + @property def description(self) -> Union[None, str]: """Getter: Documentation of the MLModel""" @@ -7708,6 +7768,7 @@ def userEmail(self, value: Union[None, str]) -> None: 'com.linkedin.pegasus2avro.datajob.DataJobInputOutput': DataJobInputOutputClass, 'com.linkedin.pegasus2avro.datajob.EditableDataFlowProperties': EditableDataFlowPropertiesClass, 'com.linkedin.pegasus2avro.datajob.EditableDataJobProperties': EditableDataJobPropertiesClass, + 
'com.linkedin.pegasus2avro.datajob.JobStatus': JobStatusClass, 'com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType': AzkabanJobTypeClass, 'com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo': DataPlatformInfoClass, 'com.linkedin.pegasus2avro.dataplatform.PlatformType': PlatformTypeClass, @@ -7854,6 +7915,7 @@ def userEmail(self, value: Union[None, str]) -> None: 'DataJobInputOutput': DataJobInputOutputClass, 'EditableDataFlowProperties': EditableDataFlowPropertiesClass, 'EditableDataJobProperties': EditableDataJobPropertiesClass, + 'JobStatus': JobStatusClass, 'AzkabanJobType': AzkabanJobTypeClass, 'DataPlatformInfo': DataPlatformInfoClass, 'PlatformType': PlatformTypeClass, diff --git a/metadata-ingestion/src/datahub/metadata/schemas/MetadataAuditEvent.avsc b/metadata-ingestion/src/datahub/metadata/schemas/MetadataAuditEvent.avsc index 7d5b1e67d34286..98f157e591f3b5 100644 --- a/metadata-ingestion/src/datahub/metadata/schemas/MetadataAuditEvent.avsc +++ b/metadata-ingestion/src/datahub/metadata/schemas/MetadataAuditEvent.avsc @@ -1596,9 +1596,10 @@ "PIG": "Pig type is for running Pig jobs.", "SQL": "SQL is for running Presto, mysql queries etc" } - } + }, + "string" ], - "doc": "Datajob type" + "doc": "Datajob type\n**NOTE**: AzkabanJobType is deprecated. Please use strings instead." 
}, { "name": "flowUrn", @@ -1617,6 +1618,37 @@ "java": { "class": "com.linkedin.pegasus2avro.common.urn.DataFlowUrn" } + }, + { + "name": "status", + "type": [ + "null", + { + "type": "enum", + "name": "JobStatus", + "doc": "Job statuses", + "symbols": [ + "STARTING", + "IN_PROGRESS", + "STOPPING", + "STOPPED", + "COMPLETED", + "FAILED", + "UNKNOWN" + ], + "symbolDocs": { + "COMPLETED": "Jobs with successful completion.", + "FAILED": "Jobs that have failed.", + "IN_PROGRESS": "Jobs currently running.", + "STARTING": "Jobs being initialized.", + "STOPPED": "Jobs that have stopped.", + "STOPPING": "Jobs being stopped.", + "UNKNOWN": "Jobs with unknown status (either unmappable or unavailable)" + } + } + ], + "doc": "Status of the job", + "default": null } ], "Aspect": { @@ -3208,6 +3240,15 @@ "namespace": "com.linkedin.pegasus2avro.ml.metadata", "doc": "Properties associated with a ML Model", "fields": [ + { + "name": "customProperties", + "type": { + "type": "map", + "values": "string" + }, + "doc": "Custom property bag.", + "default": {} + }, { "name": "description", "type": [ @@ -4300,7 +4341,8 @@ "com.linkedin.pegasus2avro.common.Ownership", "com.linkedin.pegasus2avro.common.InstitutionalMemory", "com.linkedin.pegasus2avro.common.Status", - "com.linkedin.pegasus2avro.common.Deprecation" + "com.linkedin.pegasus2avro.common.Deprecation", + "com.linkedin.pegasus2avro.common.BrowsePaths" ] }, "doc": "The list of metadata aspects associated with the MLFeatureTable. Depending on the use case, this can either be all, or a selection, of supported aspects." 
diff --git a/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc b/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc index 8a8dcf3eb31272..842f18b2e14a07 100644 --- a/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc +++ b/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc @@ -1595,9 +1595,10 @@ "PIG": "Pig type is for running Pig jobs.", "SQL": "SQL is for running Presto, mysql queries etc" } - } + }, + "string" ], - "doc": "Datajob type" + "doc": "Datajob type\n**NOTE**: AzkabanJobType is deprecated. Please use strings instead." }, { "name": "flowUrn", @@ -1616,6 +1617,37 @@ "java": { "class": "com.linkedin.pegasus2avro.common.urn.DataFlowUrn" } + }, + { + "name": "status", + "type": [ + "null", + { + "type": "enum", + "name": "JobStatus", + "doc": "Job statuses", + "symbols": [ + "STARTING", + "IN_PROGRESS", + "STOPPING", + "STOPPED", + "COMPLETED", + "FAILED", + "UNKNOWN" + ], + "symbolDocs": { + "COMPLETED": "Jobs with successful completion.", + "FAILED": "Jobs that have failed.", + "IN_PROGRESS": "Jobs currently running.", + "STARTING": "Jobs being initialized.", + "STOPPED": "Jobs that have stopped.", + "STOPPING": "Jobs being stopped.", + "UNKNOWN": "Jobs with unknown status (either unmappable or unavailable)" + } + } + ], + "doc": "Status of the job", + "default": null } ], "Aspect": { @@ -3207,6 +3239,15 @@ "namespace": "com.linkedin.pegasus2avro.ml.metadata", "doc": "Properties associated with a ML Model", "fields": [ + { + "name": "customProperties", + "type": { + "type": "map", + "values": "string" + }, + "doc": "Custom property bag.", + "default": {} + }, { "name": "description", "type": [ @@ -4299,7 +4340,8 @@ "com.linkedin.pegasus2avro.common.Ownership", "com.linkedin.pegasus2avro.common.InstitutionalMemory", "com.linkedin.pegasus2avro.common.Status", - "com.linkedin.pegasus2avro.common.Deprecation" + "com.linkedin.pegasus2avro.common.Deprecation", + 
"com.linkedin.pegasus2avro.common.BrowsePaths" ] }, "doc": "The list of metadata aspects associated with the MLFeatureTable. Depending on the use case, this can either be all, or a selection, of supported aspects." diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json index 010bfb1d255b1d..30f5d538b4513d 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json @@ -523,9 +523,10 @@ "name": "test-job-2:Filter-Transform0", "description": null, "type": { - "com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType": "GLUE" + "string": "GLUE" }, - "flowUrn": null + "flowUrn": null, + "status": null } }, { @@ -560,9 +561,10 @@ "name": "test-job-2:ApplyMapping-Transform1", "description": null, "type": { - "com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType": "GLUE" + "string": "GLUE" }, - "flowUrn": null + "flowUrn": null, + "status": null } }, { @@ -597,9 +599,10 @@ "name": "test-job-2:ApplyMapping-Transform2", "description": null, "type": { - "com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType": "GLUE" + "string": "GLUE" }, - "flowUrn": null + "flowUrn": null, + "status": null } }, { @@ -635,9 +638,10 @@ "name": "test-job-2:Join-Transform3", "description": null, "type": { - "com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType": "GLUE" + "string": "GLUE" }, - "flowUrn": null + "flowUrn": null, + "status": null } }, { @@ -672,9 +676,10 @@ "name": "test-job-2:ApplyMapping-Transform4", "description": null, "type": { - "com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType": "GLUE" + "string": "GLUE" }, - "flowUrn": null + "flowUrn": null, + "status": null } }, { @@ -707,9 +712,10 @@ "name": "test-job-2:ApplyMapping-Transform5", "description": null, "type": { - "com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType": "GLUE" + "string": "GLUE" }, - "flowUrn": null + "flowUrn": null, + "status": null } }, { 
@@ -774,9 +780,10 @@ "name": "test-job-2:SplitFields-Transform0", "description": null, "type": { - "com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType": "GLUE" + "string": "GLUE" }, - "flowUrn": null + "flowUrn": null, + "status": null } }, { @@ -811,9 +818,10 @@ "name": "test-job-2:ApplyMapping-Transform1", "description": null, "type": { - "com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType": "GLUE" + "string": "GLUE" }, - "flowUrn": null + "flowUrn": null, + "status": null } }, { @@ -848,9 +856,10 @@ "name": "test-job-2:FillMissingValues-Transform2", "description": null, "type": { - "com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType": "GLUE" + "string": "GLUE" }, - "flowUrn": null + "flowUrn": null, + "status": null } }, { @@ -885,9 +894,10 @@ "name": "test-job-2:SelectFields-Transform3", "description": null, "type": { - "com.linkedin.pegasus2avro.datajob.azkaban.AzkabanJobType": "GLUE" + "string": "GLUE" }, - "flowUrn": null + "flowUrn": null, + "status": null } }, { diff --git a/metadata-ingestion/tests/unit/sagemaker/sagemaker_mces_golden.json b/metadata-ingestion/tests/unit/sagemaker/sagemaker_mces_golden.json index 06ec895eb219dd..54d924b86529a3 100644 --- a/metadata-ingestion/tests/unit/sagemaker/sagemaker_mces_golden.json +++ b/metadata-ingestion/tests/unit/sagemaker/sagemaker_mces_golden.json @@ -71,6 +71,13 @@ "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureTableSnapshot": { "urn": "urn:li:mlFeatureTable:(urn:li:dataPlatform:sagemaker,test-2)", "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "sagemaker/test-2" + ] + } + }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties": { "customProperties": { @@ -175,6 +182,13 @@ "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureTableSnapshot": { "urn": "urn:li:mlFeatureTable:(urn:li:dataPlatform:sagemaker,test-1)", "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "sagemaker/test-1" + ] + } + }, { 
"com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties": { "customProperties": { @@ -261,6 +275,13 @@ "com.linkedin.pegasus2avro.metadata.snapshot.MLFeatureTableSnapshot": { "urn": "urn:li:mlFeatureTable:(urn:li:dataPlatform:sagemaker,test)", "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "sagemaker/test" + ] + } + }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties": { "customProperties": { @@ -282,5 +303,1254 @@ } }, "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLModelSnapshot": { + "urn": "urn:li:mlModel:(urn:li:dataPlatform:sagemaker,the-first-model,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLModelProperties": { + "customProperties": { + "PrimaryContainer": "{'ContainerHostname': 'string', 'Image': 'string', 'ImageConfig': {'RepositoryAccessMode': 'Platform', 'RepositoryAuthConfig': {'RepositoryCredentialsProviderArn': 'string'}}, 'Mode': 'SingleModel', 'ModelDataUrl': 'string', 'Environment': {'string': 'string'}, 'ModelPackageName': 'string', 'MultiModelConfig': {'ModelCacheSetting': 'Enabled'}}", + "Containers": "[{'ContainerHostname': 'string', 'Image': 'string', 'ImageConfig': {'RepositoryAccessMode': 'Platform', 'RepositoryAuthConfig': {'RepositoryCredentialsProviderArn': 'string'}}, 'Mode': 'SingleModel', 'ModelDataUrl': 'string', 'Environment': {'string': 'string'}, 'ModelPackageName': 'string', 'MultiModelConfig': {'ModelCacheSetting': 'Enabled'}}]", + "InferenceExecutionConfig": "{'Mode': 'Serial'}", + "ExecutionRoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMaker-ExecutionRole-20210614T104201", + "VpcConfig": "{'SecurityGroupIds': ['string'], 'Subnets': ['string']}", + "ModelArn": "arn:aws:sagemaker:us-west-2:123412341234:model/the-first-model", + "EnableNetworkIsolation": "True" + }, + "description": null, + "date": 1420070400000, + "version": null, + "type": null, 
+ "hyperParameters": null, + "mlFeatures": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.MLModelSnapshot": { + "urn": "urn:li:mlModel:(urn:li:dataPlatform:sagemaker,the-second-model,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.ml.metadata.MLModelProperties": { + "customProperties": { + "PrimaryContainer": "{'ContainerHostname': 'string', 'Image': 'string', 'ImageConfig': {'RepositoryAccessMode': 'Platform', 'RepositoryAuthConfig': {'RepositoryCredentialsProviderArn': 'string'}}, 'Mode': 'MultiModel', 'ModelDataUrl': 'string', 'Environment': {'string': 'string'}, 'ModelPackageName': 'string', 'MultiModelConfig': {'ModelCacheSetting': 'Disabled'}}", + "Containers": "[{'ContainerHostname': 'string', 'Image': 'string', 'ImageConfig': {'RepositoryAccessMode': 'Vpc', 'RepositoryAuthConfig': {'RepositoryCredentialsProviderArn': 'string'}}, 'Mode': 'SingleModel', 'ModelDataUrl': 'string', 'Environment': {'string': 'string'}, 'ModelPackageName': 'string', 'MultiModelConfig': {'ModelCacheSetting': 'Disabled'}}]", + "InferenceExecutionConfig": "{'Mode': 'Serial'}", + "ExecutionRoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMaker-ExecutionRole-20210614T104201", + "VpcConfig": "{'SecurityGroupIds': ['string'], 'Subnets': ['string']}", + "ModelArn": "arn:aws:sagemaker:us-west-2:123412341234:model/the-second-model", + "EnableNetworkIsolation": "False" + }, + "description": null, + "date": 1420070400000, + "version": null, + "type": null, + "hyperParameters": null, + "mlFeatures": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-input-bucket/file.txt,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + 
"customProperties": { + "dataset_type": "s3", + "uri": "s3://auto-ml-job-input-bucket/file.txt", + "datatype": "ManifestFile" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-output-bucket/file.txt,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://auto-ml-job-output-bucket/file.txt" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/input-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://compilation-job-bucket/input-config.tar.gz", + "framework": "TENSORFLOW", + "framework_version": "string" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/output-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://compilation-job-bucket/output-config.tar.gz", + "target_device": "lambda", + "target_platform": "{'Os': 'ANDROID', 'Arch': 'X86_64', 'Accelerator': 'INTEL_GRAPHICS'}" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + 
"proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/model-artifact.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://edge-packaging-bucket/model-artifact.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/output-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://edge-packaging-bucket/output-config.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/data-source.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://labeling-job/data-source.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/category-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://labeling-job/category-config.tar.gz" + }, + 
"externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-dataset.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://labeling-job/output-dataset.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://labeling-job/output-config.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,processing-job/input-data.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://processing-job/input-data.tar.gz", + "datatype": "ManifestFile", + "mode": "Pipe", + "distribution_type": "FullyReplicated", + "compression": "None", + "name": "string" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:s3,training-job/input-dataset.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://training-job/input-dataset.tar.gz", + "datatype": "None", + "distribution_type": "FullyReplicated", + "attribute_names": "['string']", + "channel_name": "string" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/output-data.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://training-job/output-data.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/checkpoint-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://training-job/checkpoint-config.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-hook-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://training-job/debug-hook-config.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": 
null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/tensorboard-output-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://training-job/tensorboard-output-config.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://training-job/profiler-config.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-rule-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://training-job/debug-rule-config.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-rule-config.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": 
"s3://training-job/profiler-rule-config.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/input-data-source.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://transform-job/input-data-source.tar.gz", + "datatype": "ManifestFile", + "compression": "None", + "split": "None" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/output.tar.gz,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "dataset_type": "s3", + "uri": "s3://transform-job/output.tar.gz" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(sagemaker,auto_ml:an-auto-ml-job,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": {}, + "externalUrl": null, + "name": "an-auto-ml-job", + "description": null, + "project": null + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,auto_ml:an-auto-ml-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:auto-ml-job/an-auto-ml-job)", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "AutoMLJobName": "an-auto-ml-job", + "AutoMLJobArn": "arn:aws:sagemaker:us-west-2:123412341234:auto-ml-job/an-auto-ml-job", + "InputDataConfig": "[{'DataSource': {'S3DataSource': {'S3DataType': 'ManifestFile', 'S3Uri': 's3://auto-ml-job-input-bucket/file.txt'}}, 'CompressionType': 'None', 'TargetAttributeName': 'some-name'}]", + "OutputDataConfig": "{'KmsKeyId': 'some-key-id', 'S3OutputPath': 's3://auto-ml-job-output-bucket/file.txt'}", + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "AutoMLJobObjective": "{'MetricName': 'Accuracy'}", + "ProblemType": "BinaryClassification", + "AutoMLJobConfig": "{'CompletionCriteria': {'MaxCandidates': 123, 'MaxRuntimePerTrainingJobInSeconds': 123, 'MaxAutoMLJobRuntimeInSeconds': 123}, 'SecurityConfig': {'VolumeKmsKeyId': 'string', 'EnableInterContainerTrafficEncryption': True, 'VpcConfig': {'SecurityGroupIds': ['string'], 'Subnets': ['string']}}}", + "CreationTime": "2015-01-01 00:00:00+00:00", + "EndTime": "2015-01-01 00:00:00+00:00", + "LastModifiedTime": "2015-01-01 00:00:00+00:00", + "FailureReason": "string", + "PartialFailureReasons": "[{'PartialFailureMessage': 'string'}]", + "BestCandidate": "{'CandidateName': 'string', 'FinalAutoMLJobObjectiveMetric': {'Type': 'Maximize', 'MetricName': 'Accuracy', 'Value': 1.0}, 'ObjectiveStatus': 'Succeeded', 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'string', 'CandidateStepName': 'string'}], 'CandidateStatus': 'Completed', 'InferenceContainers': [{'Image': 'string', 'ModelDataUrl': 'string', 'Environment': {'string': 'string'}}], 'CreationTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'EndTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'LastModifiedTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'FailureReason': 'string', 
'CandidateProperties': {'CandidateArtifactLocations': {'Explainability': 'string'}}}", + "AutoMLJobStatus": "Completed", + "AutoMLJobSecondaryStatus": "Starting", + "GenerateCandidateDefinitionsOnly": "True", + "AutoMLJobArtifacts": "{'CandidateDefinitionNotebookLocation': 'string', 'DataExplorationNotebookLocation': 'string'}", + "ResolvedAttributes": "{'AutoMLJobObjective': {'MetricName': 'Accuracy'}, 'ProblemType': 'BinaryClassification', 'CompletionCriteria': {'MaxCandidates': 123, 'MaxRuntimePerTrainingJobInSeconds': 123, 'MaxAutoMLJobRuntimeInSeconds': 123}}", + "ModelDeployConfig": "{'AutoGenerateEndpointName': True, 'EndpointName': 'string'}", + "ModelDeployResult": "{'EndpointName': 'string'}", + "jobType": "auto_ml" + }, + "externalUrl": null, + "name": "an-auto-ml-job", + "description": null, + "type": { + "string": "SAGEMAKER" + }, + "flowUrn": null, + "status": "COMPLETED" + } + }, + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "auto_ml/an-auto-ml-job" + ] + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-input-bucket/file.txt,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-output-bucket/file.txt,PROD)" + ], + "inputDatajobs": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(sagemaker,compilation:a-compilation-job,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": {}, + "externalUrl": null, + "name": "a-compilation-job", + "description": null, + "project": null + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": 
"urn:li:dataJob:(urn:li:dataFlow:(sagemaker,compilation:a-compilation-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:compilation-job/a-compilation-job)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "CompilationJobName": "a-compilation-job", + "CompilationJobArn": "arn:aws:sagemaker:us-west-2:123412341234:compilation-job/a-compilation-job", + "CompilationJobStatus": "INPROGRESS", + "CompilationStartTime": "2015-01-01 00:00:00+00:00", + "CompilationEndTime": "2015-01-01 00:00:00+00:00", + "StoppingCondition": "{'MaxRuntimeInSeconds': 123, 'MaxWaitTimeInSeconds': 123}", + "InferenceImage": "string", + "CreationTime": "2015-01-01 00:00:00+00:00", + "LastModifiedTime": "2015-01-01 00:00:00+00:00", + "FailureReason": "string", + "ModelArtifacts": "{'S3ModelArtifacts': 's3://compilation-job-bucket/model-artifacts.tar.gz'}", + "ModelDigests": "{'ArtifactDigest': 'string'}", + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "InputConfig": "{'S3Uri': 's3://compilation-job-bucket/input-config.tar.gz', 'DataInputConfig': 'string', 'Framework': 'TENSORFLOW', 'FrameworkVersion': 'string'}", + "OutputConfig": "{'S3OutputLocation': 's3://compilation-job-bucket/output-config.tar.gz', 'TargetDevice': 'lambda', 'TargetPlatform': {'Os': 'ANDROID', 'Arch': 'X86_64', 'Accelerator': 'INTEL_GRAPHICS'}, 'CompilerOptions': 'string', 'KmsKeyId': 'string'}", + "VpcConfig": "{'SecurityGroupIds': ['string'], 'Subnets': ['string']}", + "jobType": "compilation" + }, + "externalUrl": null, + "name": "a-compilation-job", + "description": null, + "type": { + "string": "SAGEMAKER" + }, + "flowUrn": null, + "status": "IN_PROGRESS" + } + }, + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "compilation/a-compilation-job" + ] + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/input-config.tar.gz,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/output-config.tar.gz,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,compilation:a-compilation-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:compilation-job/a-compilation-job)" + ] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(sagemaker,edge_packaging:an-edge-packaging-job,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": {}, + "externalUrl": null, + "name": "an-edge-packaging-job", + "description": null, + "project": null + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,edge_packaging:an-edge-packaging-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:edge-packaging-job/an-edge-packaging-job)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "EdgePackagingJobArn": "arn:aws:sagemaker:us-west-2:123412341234:edge-packaging-job/an-edge-packaging-job", + "EdgePackagingJobName": "an-edge-packaging-job", + "CompilationJobName": "a-compilation-job", + "ModelName": "string", + "ModelVersion": "string", + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "OutputConfig": "{'S3OutputLocation': 's3://edge-packaging-bucket/output-config.tar.gz', 'KmsKeyId': 'string', 'PresetDeploymentType': 'GreengrassV2Component', 'PresetDeploymentConfig': 'string'}", + "ResourceKey": "string", + "EdgePackagingJobStatus": "STARTING", + "EdgePackagingJobStatusMessage": "string", + "CreationTime": "2015-01-01 
00:00:00+00:00", + "LastModifiedTime": "2015-01-01 00:00:00+00:00", + "ModelArtifact": "s3://edge-packaging-bucket/model-artifact.tar.gz", + "ModelSignature": "string", + "PresetDeploymentOutput": "{'Type': 'GreengrassV2Component', 'Artifact': 'arn:aws:sagemaker:us-west-2:123412341234:edge-packaging-job/some-artifact', 'Status': 'COMPLETED', 'StatusMessage': 'string'}", + "jobType": "edge_packaging" + }, + "externalUrl": null, + "name": "an-edge-packaging-job", + "description": null, + "type": { + "string": "SAGEMAKER" + }, + "flowUrn": null, + "status": "STARTING" + } + }, + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "edge_packaging/an-edge-packaging-job" + ] + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/model-artifact.tar.gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/output-config.tar.gz,PROD)" + ], + "inputDatajobs": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(sagemaker,hyper_parameter_tuning:a-hyper-parameter-tuning-job,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": {}, + "externalUrl": null, + "name": "a-hyper-parameter-tuning-job", + "description": null, + "project": null + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,hyper_parameter_tuning:a-hyper-parameter-tuning-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:hyper-parameter-tuning-job/a-hyper-parameter-tuning-job)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + 
"HyperParameterTuningJobName": "a-hyper-parameter-tuning-job", + "HyperParameterTuningJobArn": "arn:aws:sagemaker:us-west-2:123412341234:hyper-parameter-tuning-job/a-hyper-parameter-tuning-job", + "HyperParameterTuningJobConfig": "{'Strategy': 'Bayesian', 'HyperParameterTuningJobObjective': {'Type': 'Maximize', 'MetricName': 'string'}, 'ResourceLimits': {'MaxNumberOfTrainingJobs': 123, 'MaxParallelTrainingJobs': 123}, 'ParameterRanges': {'IntegerParameterRanges': [{'Name': 'string', 'MinValue': 'string', 'MaxValue': 'string', 'ScalingType': 'Auto'}], 'ContinuousParameterRanges': [{'Name': 'string', 'MinValue': 'string', 'MaxValue': 'string', 'ScalingType': 'Auto'}], 'CategoricalParameterRanges': [{'Name': 'string', 'Values': ['string']}]}, 'TrainingJobEarlyStoppingType': 'Off', 'TuningJobCompletionCriteria': {'TargetObjectiveMetricValue': 1.0}}", + "TrainingJobDefinition": "{'DefinitionName': 'string', 'TuningObjective': {'Type': 'Maximize', 'MetricName': 'string'}, 'HyperParameterRanges': {'IntegerParameterRanges': [{'Name': 'string', 'MinValue': 'string', 'MaxValue': 'string', 'ScalingType': 'Auto'}], 'ContinuousParameterRanges': [{'Name': 'string', 'MinValue': 'string', 'MaxValue': 'string', 'ScalingType': 'Auto'}], 'CategoricalParameterRanges': [{'Name': 'string', 'Values': ['string']}]}, 'StaticHyperParameters': {'string': 'string'}, 'AlgorithmSpecification': {'TrainingImage': 'string', 'TrainingInputMode': 'Pipe', 'AlgorithmName': 'string', 'MetricDefinitions': [{'Name': 'string', 'Regex': 'string'}]}, 'RoleArn': 'arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole', 'InputDataConfig': [{'ChannelName': 'string', 'DataSource': {'S3DataSource': {'S3DataType': 'ManifestFile', 'S3Uri': 's3://hyper-parameter-tuning-job/data-source.tar.gz', 'S3DataDistributionType': 'FullyReplicated', 'AttributeNames': ['string']}, 'FileSystemDataSource': {'FileSystemId': 'abcdefgihjklmnopqrstuvwxyz', 'FileSystemAccessMode': 'rw', 
'FileSystemType': 'EFS', 'DirectoryPath': 'string'}}, 'ContentType': 'string', 'CompressionType': 'None', 'RecordWrapperType': 'None', 'InputMode': 'Pipe', 'ShuffleConfig': {'Seed': 123}}], 'VpcConfig': {'SecurityGroupIds': ['string'], 'Subnets': ['string']}, 'OutputDataConfig': {'KmsKeyId': 'string', 'S3OutputPath': 's3://hyper-parameter-tuning-job/data-output.tar.gz'}, 'ResourceConfig': {'InstanceType': 'ml.m4.xlarge', 'InstanceCount': 123, 'VolumeSizeInGB': 123, 'VolumeKmsKeyId': 'string'}, 'StoppingCondition': {'MaxRuntimeInSeconds': 123, 'MaxWaitTimeInSeconds': 123}, 'EnableNetworkIsolation': True, 'EnableInterContainerTrafficEncryption': True, 'EnableManagedSpotTraining': True, 'CheckpointConfig': {'S3Uri': 's3://hyper-parameter-tuning-job/checkpoint-config.tar.gz', 'LocalPath': 'string'}, 'RetryStrategy': {'MaximumRetryAttempts': 123}}", + "TrainingJobDefinitions": "[{'DefinitionName': 'string', 'TuningObjective': {'Type': 'Maximize', 'MetricName': 'string'}, 'HyperParameterRanges': {'IntegerParameterRanges': [{'Name': 'string', 'MinValue': 'string', 'MaxValue': 'string', 'ScalingType': 'Auto'}], 'ContinuousParameterRanges': [{'Name': 'string', 'MinValue': 'string', 'MaxValue': 'string', 'ScalingType': 'Auto'}], 'CategoricalParameterRanges': [{'Name': 'string', 'Values': ['string']}]}, 'StaticHyperParameters': {'string': 'string'}, 'AlgorithmSpecification': {'TrainingImage': 'string', 'TrainingInputMode': 'Pipe', 'AlgorithmName': 'string', 'MetricDefinitions': [{'Name': 'string', 'Regex': 'string'}]}, 'RoleArn': 'arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole', 'InputDataConfig': [{'ChannelName': 'string', 'DataSource': {'S3DataSource': {'S3DataType': 'ManifestFile', 'S3Uri': 's3://hyper-parameter-tuning-job/data-source.tar.gz', 'S3DataDistributionType': 'FullyReplicated', 'AttributeNames': ['string']}, 'FileSystemDataSource': {'FileSystemId': 'abcdefgihjklmnopqrstuvwxyz', 'FileSystemAccessMode': 'rw', 
'FileSystemType': 'EFS', 'DirectoryPath': 'string'}}, 'ContentType': 'string', 'CompressionType': 'None', 'RecordWrapperType': 'None', 'InputMode': 'Pipe', 'ShuffleConfig': {'Seed': 123}}], 'VpcConfig': {'SecurityGroupIds': ['string'], 'Subnets': ['string']}, 'OutputDataConfig': {'KmsKeyId': 'string', 'S3OutputPath': 's3://hyper-parameter-tuning-job/data-output.tar.gz'}, 'ResourceConfig': {'InstanceType': 'ml.m4.xlarge', 'InstanceCount': 123, 'VolumeSizeInGB': 123, 'VolumeKmsKeyId': 'string'}, 'StoppingCondition': {'MaxRuntimeInSeconds': 123, 'MaxWaitTimeInSeconds': 123}, 'EnableNetworkIsolation': True, 'EnableInterContainerTrafficEncryption': True, 'EnableManagedSpotTraining': True, 'CheckpointConfig': {'S3Uri': 's3://hyper-parameter-tuning-job/checkpoint-config.tar.gz', 'LocalPath': 'string'}, 'RetryStrategy': {'MaximumRetryAttempts': 123}}]", + "HyperParameterTuningJobStatus": "Completed", + "CreationTime": "2015-01-01 00:00:00+00:00", + "HyperParameterTuningEndTime": "2015-01-01 00:00:00+00:00", + "LastModifiedTime": "2015-01-01 00:00:00+00:00", + "TrainingJobStatusCounters": "{'Completed': 123, 'InProgress': 123, 'RetryableError': 123, 'NonRetryableError': 123, 'Stopped': 123}", + "ObjectiveStatusCounters": "{'Succeeded': 123, 'Pending': 123, 'Failed': 123}", + "BestTrainingJob": "{'TrainingJobDefinitionName': 'string', 'TrainingJobName': 'string', 'TrainingJobArn': 'string', 'TuningJobName': 'string', 'CreationTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'TrainingStartTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'TrainingEndTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'TrainingJobStatus': 'InProgress', 'TunedHyperParameters': {'string': 'string'}, 'FailureReason': 'string', 'FinalHyperParameterTuningJobObjectiveMetric': {'Type': 'Maximize', 'MetricName': 'string', 'Value': 1.0}, 'ObjectiveStatus': 'Succeeded'}", + "OverallBestTrainingJob": "{'TrainingJobDefinitionName': 
'string', 'TrainingJobName': 'string', 'TrainingJobArn': 'string', 'TuningJobName': 'string', 'CreationTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'TrainingStartTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'TrainingEndTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'TrainingJobStatus': 'InProgress', 'TunedHyperParameters': {'string': 'string'}, 'FailureReason': 'string', 'FinalHyperParameterTuningJobObjectiveMetric': {'Type': 'Maximize', 'MetricName': 'string', 'Value': 1.0}, 'ObjectiveStatus': 'Succeeded'}", + "WarmStartConfig": "{'ParentHyperParameterTuningJobs': [{'HyperParameterTuningJobName': 'string'}], 'WarmStartType': 'IdenticalDataAndAlgorithm'}", + "FailureReason": "string", + "jobType": "hyper_parameter_tuning" + }, + "externalUrl": null, + "name": "a-hyper-parameter-tuning-job", + "description": null, + "type": { + "string": "SAGEMAKER" + }, + "flowUrn": null, + "status": "COMPLETED" + } + }, + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "hyper_parameter_tuning/a-hyper-parameter-tuning-job" + ] + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(sagemaker,labeling:a-labeling-job,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": {}, + "externalUrl": null, + "name": "a-labeling-job", + "description": null, + "project": null + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": 
"urn:li:dataJob:(urn:li:dataFlow:(sagemaker,labeling:a-labeling-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:labeling-job/a-labeling-job)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "LabelingJobStatus": "Initializing", + "LabelCounters": "{'TotalLabeled': 123, 'HumanLabeled': 123, 'MachineLabeled': 123, 'FailedNonRetryableError': 123, 'Unlabeled': 123}", + "FailureReason": "string", + "CreationTime": "2015-01-01 00:00:00+00:00", + "LastModifiedTime": "2015-01-01 00:00:00+00:00", + "JobReferenceCode": "string", + "LabelingJobName": "a-labeling-job", + "LabelingJobArn": "arn:aws:sagemaker:us-west-2:123412341234:labeling-job/a-labeling-job", + "LabelAttributeName": "string", + "InputConfig": "{'DataSource': {'S3DataSource': {'ManifestS3Uri': 's3://labeling-job/data-source.tar.gz'}, 'SnsDataSource': {'SnsTopicArn': 'string'}}, 'DataAttributes': {'ContentClassifiers': ['FreeOfPersonallyIdentifiableInformation', 'FreeOfAdultContent']}}", + "OutputConfig": "{'S3OutputPath': 's3://labeling-job/output-config.tar.gz', 'KmsKeyId': 'string', 'SnsTopicArn': 'string'}", + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "LabelCategoryConfigS3Uri": "s3://labeling-job/category-config.tar.gz", + "StoppingConditions": "{'MaxHumanLabeledObjectCount': 123, 'MaxPercentageOfInputDatasetLabeled': 123}", + "LabelingJobAlgorithmsConfig": "{'LabelingJobAlgorithmSpecificationArn': 'string', 'InitialActiveLearningModelArn': 'arn:aws:sagemaker:us-west-2:123412341234:labeling-job/initial-active-learning-model', 'LabelingJobResourceConfig': {'VolumeKmsKeyId': 'string'}}", + "HumanTaskConfig": "{'WorkteamArn': 'string', 'UiConfig': {'UiTemplateS3Uri': 's3://labeling-job/ui-config.tar.gz', 'HumanTaskUiArn': 'string'}, 'PreHumanTaskLambdaArn': 'string', 'TaskKeywords': ['string'], 'TaskTitle': 'string', 'TaskDescription': 'string', 'NumberOfHumanWorkersPerDataObject': 123, 
'TaskTimeLimitInSeconds': 123, 'TaskAvailabilityLifetimeInSeconds': 123, 'MaxConcurrentTaskCount': 123, 'AnnotationConsolidationConfig': {'AnnotationConsolidationLambdaArn': 'string'}, 'PublicWorkforceTaskPrice': {'AmountInUsd': {'Dollars': 123, 'Cents': 123, 'TenthFractionsOfACent': 123}}}", + "Tags": "[{'Key': 'string', 'Value': 'string'}]", + "LabelingJobOutput": "{'OutputDatasetS3Uri': 's3://labeling-job/output-dataset.tar.gz', 'FinalActiveLearningModelArn': 'arn:aws:sagemaker:us-west-2:123412341234:labeling-job/final-active-learning-model'}", + "jobType": "labeling" + }, + "externalUrl": null, + "name": "a-labeling-job", + "description": null, + "type": { + "string": "SAGEMAKER" + }, + "flowUrn": null, + "status": "STARTING" + } + }, + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "labeling/a-labeling-job" + ] + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/category-config.tar.gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/data-source.tar.gz,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-config.tar.gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-dataset.tar.gz,PROD)" + ], + "inputDatajobs": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(sagemaker,processing:a-processing-job,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": {}, + "externalUrl": null, + "name": "a-processing-job", + "description": null, + "project": null + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": 
"urn:li:dataJob:(urn:li:dataFlow:(sagemaker,processing:a-processing-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:processing-job/a-processing-job)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "ProcessingJobName": "a-processing-job", + "ProcessingJobArn": "arn:aws:sagemaker:us-west-2:123412341234:processing-job/a-processing-job", + "ProcessingInputs": "[{'InputName': 'string', 'AppManaged': True, 'S3Input': {'S3Uri': 's3://processing-job/input-data.tar.gz', 'LocalPath': 'string', 'S3DataType': 'ManifestFile', 'S3InputMode': 'Pipe', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}, 'DatasetDefinition': {'AthenaDatasetDefinition': {'Catalog': 'athena-catalog', 'Database': 'athena-database', 'QueryString': 'athena-query-string', 'WorkGroup': 'athena-work-group', 'OutputS3Uri': 's3://processing-job/athena-output.tar.gz', 'KmsKeyId': 'string', 'OutputFormat': 'PARQUET', 'OutputCompression': 'GZIP'}, 'RedshiftDatasetDefinition': {'ClusterId': 'redshift-cluster', 'Database': 'redshift-database', 'DbUser': 'redshift-db-user', 'QueryString': 'redshift-query-string', 'ClusterRoleArn': 'arn:aws:sagemaker:us-west-2:123412341234:processing-job/redshift-cluster', 'OutputS3Uri': 's3://processing-job/redshift-output.tar.gz', 'KmsKeyId': 'string', 'OutputFormat': 'PARQUET', 'OutputCompression': 'None'}, 'LocalPath': 'string', 'DataDistributionType': 'FullyReplicated', 'InputMode': 'Pipe'}}]", + "ProcessingOutputConfig": "{'Outputs': [{'OutputName': 'string', 'S3Output': {'S3Uri': 's3://processing-job/processing-output.tar.gz', 'LocalPath': 'string', 'S3UploadMode': 'Continuous'}, 'FeatureStoreOutput': {'FeatureGroupName': 'string'}, 'AppManaged': True}], 'KmsKeyId': 'string'}", + "ProcessingResources": "{'ClusterConfig': {'InstanceCount': 123, 'InstanceType': 'ml.t3.medium', 'VolumeSizeInGB': 123, 'VolumeKmsKeyId': 'string'}}", + "StoppingCondition": "{'MaxRuntimeInSeconds': 123}", + 
"AppSpecification": "{'ImageUri': 'string', 'ContainerEntrypoint': ['string'], 'ContainerArguments': ['string']}", + "Environment": "{'string': 'string'}", + "NetworkConfig": "{'EnableInterContainerTrafficEncryption': True, 'EnableNetworkIsolation': True, 'VpcConfig': {'SecurityGroupIds': ['string'], 'Subnets': ['string']}}", + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "ExperimentConfig": "{'ExperimentName': 'string', 'TrialName': 'string', 'TrialComponentDisplayName': 'string'}", + "ProcessingJobStatus": "InProgress", + "ExitMessage": "string", + "FailureReason": "string", + "ProcessingEndTime": "2015-01-01 00:00:00+00:00", + "ProcessingStartTime": "2015-01-01 00:00:00+00:00", + "LastModifiedTime": "2015-01-01 00:00:00+00:00", + "CreationTime": "2015-01-01 00:00:00+00:00", + "MonitoringScheduleArn": "string", + "AutoMLJobArn": "arn:aws:sagemaker:us-west-2:123412341234:auto-ml-job/an-auto-ml-job", + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123412341234:training-job/a-training-job", + "jobType": "processing" + }, + "externalUrl": null, + "name": "a-processing-job", + "description": null, + "type": { + "string": "SAGEMAKER" + }, + "flowUrn": null, + "status": "IN_PROGRESS" + } + }, + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "processing/a-processing-job" + ] + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,processing-job/input-data.tar.gz,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,auto_ml:an-auto-ml-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:auto-ml-job/an-auto-ml-job)", + "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,training:a-training-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:training-job/a-training-job)" + ] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + 
"com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(sagemaker,training:a-training-job,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": {}, + "externalUrl": null, + "name": "a-training-job", + "description": null, + "project": null + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,training:a-training-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:training-job/a-training-job)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "TrainingJobName": "a-training-job", + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123412341234:training-job/a-training-job", + "TuningJobArn": "string", + "LabelingJobArn": "string", + "AutoMLJobArn": "string", + "ModelArtifacts": "{'S3ModelArtifacts': 's3://training-job/model-artifact.tar.gz'}", + "TrainingJobStatus": "InProgress", + "SecondaryStatus": "Starting", + "FailureReason": "string", + "HyperParameters": "{'string': 'string'}", + "AlgorithmSpecification": "{'TrainingImage': 'string', 'AlgorithmName': 'string', 'TrainingInputMode': 'Pipe', 'MetricDefinitions': [{'Name': 'string', 'Regex': 'string'}], 'EnableSageMakerMetricsTimeSeries': True}", + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "InputDataConfig": "[{'ChannelName': 'string', 'DataSource': {'S3DataSource': {'S3DataType': 'ManifestFile', 'S3Uri': 's3://training-job/input-dataset.tar.gz', 'S3DataDistributionType': 'FullyReplicated', 'AttributeNames': ['string']}, 'FileSystemDataSource': {'FileSystemId': 'abcdefgihjklmnopqrstuvwxyz', 'FileSystemAccessMode': 'rw', 'FileSystemType': 'EFS', 'DirectoryPath': 'string'}}, 'ContentType': 'string', 'CompressionType': 'None', 'RecordWrapperType': 'None', 
'InputMode': 'Pipe', 'ShuffleConfig': {'Seed': 123}}]", + "OutputDataConfig": "{'KmsKeyId': 'string', 'S3OutputPath': 's3://training-job/output-data.tar.gz'}", + "ResourceConfig": "{'InstanceType': 'ml.m4.xlarge', 'InstanceCount': 123, 'VolumeSizeInGB': 123, 'VolumeKmsKeyId': 'string'}", + "VpcConfig": "{'SecurityGroupIds': ['string'], 'Subnets': ['string']}", + "StoppingCondition": "{'MaxRuntimeInSeconds': 123, 'MaxWaitTimeInSeconds': 123}", + "CreationTime": "2015-01-01 00:00:00+00:00", + "TrainingStartTime": "2015-01-01 00:00:00+00:00", + "TrainingEndTime": "2015-01-01 00:00:00+00:00", + "LastModifiedTime": "2015-01-01 00:00:00+00:00", + "SecondaryStatusTransitions": "[{'Status': 'Starting', 'StartTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'EndTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'StatusMessage': 'string'}]", + "FinalMetricDataList": "[{'MetricName': 'string', 'Value': 1.0, 'Timestamp': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)}]", + "EnableNetworkIsolation": "True", + "EnableInterContainerTrafficEncryption": "True", + "EnableManagedSpotTraining": "True", + "CheckpointConfig": "{'S3Uri': 's3://training-job/checkpoint-config.tar.gz', 'LocalPath': 'string'}", + "TrainingTimeInSeconds": "123", + "BillableTimeInSeconds": "123", + "DebugHookConfig": "{'LocalPath': 'string', 'S3OutputPath': 's3://training-job/debug-hook-config.tar.gz', 'HookParameters': {'string': 'string'}, 'CollectionConfigurations': [{'CollectionName': 'string', 'CollectionParameters': {'string': 'string'}}]}", + "ExperimentConfig": "{'ExperimentName': 'string', 'TrialName': 'string', 'TrialComponentDisplayName': 'string'}", + "DebugRuleConfigurations": "[{'RuleConfigurationName': 'string', 'LocalPath': 'string', 'S3OutputPath': 's3://training-job/debug-rule-config.tar.gz', 'RuleEvaluatorImage': 'string', 'InstanceType': 'ml.t3.medium', 'VolumeSizeInGB': 123, 'RuleParameters': {'string': 'string'}}]", + 
"TensorBoardOutputConfig": "{'LocalPath': 'string', 'S3OutputPath': 's3://training-job/tensorboard-output-config.tar.gz'}", + "DebugRuleEvaluationStatuses": "[{'RuleConfigurationName': 'string', 'RuleEvaluationJobArn': 'string', 'RuleEvaluationStatus': 'InProgress', 'StatusDetails': 'string', 'LastModifiedTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)}]", + "ProfilerConfig": "{'S3OutputPath': 's3://training-job/profiler-config.tar.gz', 'ProfilingIntervalInMilliseconds': 123, 'ProfilingParameters': {'string': 'string'}}", + "ProfilerRuleConfigurations": "[{'RuleConfigurationName': 'string', 'LocalPath': 'string', 'S3OutputPath': 's3://training-job/profiler-rule-config.tar.gz', 'RuleEvaluatorImage': 'string', 'InstanceType': 'ml.t3.medium', 'VolumeSizeInGB': 123, 'RuleParameters': {'string': 'string'}}]", + "ProfilerRuleEvaluationStatuses": "[{'RuleConfigurationName': 'string', 'RuleEvaluationJobArn': 'string', 'RuleEvaluationStatus': 'InProgress', 'StatusDetails': 'string', 'LastModifiedTime': datetime.datetime(2015, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)}]", + "ProfilingStatus": "Enabled", + "RetryStrategy": "{'MaximumRetryAttempts': 123}", + "Environment": "{'string': 'string'}", + "jobType": "training" + }, + "externalUrl": null, + "name": "a-training-job", + "description": null, + "type": { + "string": "SAGEMAKER" + }, + "flowUrn": null, + "status": "IN_PROGRESS" + } + }, + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "training/a-training-job" + ] + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/input-dataset.tar.gz,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/checkpoint-config.tar.gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-hook-config.tar.gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-rule-config.tar.gz,PROD)", + 
"urn:li:dataset:(urn:li:dataPlatform:s3,training-job/output-data.tar.gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-config.tar.gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-rule-config.tar.gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/tensorboard-output-config.tar.gz,PROD)" + ], + "inputDatajobs": [] + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(sagemaker,transform:a-transform-job,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": {}, + "externalUrl": null, + "name": "a-transform-job", + "description": null, + "project": null + } + } + ] + } + }, + "proposedDelta": null + }, + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,transform:a-transform-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:transform-job/a-transform-job)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "TransformJobName": "a-transform-job", + "TransformJobArn": "arn:aws:sagemaker:us-west-2:123412341234:transform-job/a-transform-job", + "TransformJobStatus": "InProgress", + "FailureReason": "string", + "ModelName": "string", + "MaxConcurrentTransforms": "123", + "ModelClientConfig": "{'InvocationsTimeoutInSeconds': 123, 'InvocationsMaxRetries': 123}", + "MaxPayloadInMB": "123", + "BatchStrategy": "MultiRecord", + "Environment": "{'string': 'string'}", + "TransformInput": "{'DataSource': {'S3DataSource': {'S3DataType': 'ManifestFile', 'S3Uri': 's3://transform-job/input-data-source.tar.gz'}}, 'ContentType': 'string', 'CompressionType': 'None', 'SplitType': 'None'}", + "TransformOutput": "{'S3OutputPath': 's3://transform-job/output.tar.gz', 'Accept': 
'string', 'AssembleWith': 'None', 'KmsKeyId': 'string'}", + "TransformResources": "{'InstanceType': 'ml.m4.xlarge', 'InstanceCount': 123, 'VolumeKmsKeyId': 'string'}", + "CreationTime": "2015-01-01 00:00:00+00:00", + "TransformStartTime": "2015-01-01 00:00:00+00:00", + "TransformEndTime": "2015-01-01 00:00:00+00:00", + "LabelingJobArn": "arn:aws:sagemaker:us-west-2:123412341234:labeling-job/a-labeling-job", + "AutoMLJobArn": "arn:aws:sagemaker:us-west-2:123412341234:auto-ml-job/an-auto-ml-job", + "DataProcessing": "{'InputFilter': 'string', 'OutputFilter': 'string', 'JoinSource': 'Input'}", + "ExperimentConfig": "{'ExperimentName': 'string', 'TrialName': 'string', 'TrialComponentDisplayName': 'string'}", + "jobType": "transform" + }, + "externalUrl": null, + "name": "a-transform-job", + "description": null, + "type": { + "string": "SAGEMAKER" + }, + "flowUrn": null, + "status": "IN_PROGRESS" + } + }, + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "transform/a-transform-job" + ] + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/input-data-source.tar.gz,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/output.tar.gz,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,auto_ml:an-auto-ml-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:auto-ml-job/an-auto-ml-job)", + "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,labeling:a-labeling-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:labeling-job/a-labeling-job)" + ] + } + } + ] + } + }, + "proposedDelta": null } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py b/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py index b4aea5e917357a..7283b6e611a092 100644 --- a/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py +++ 
b/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py @@ -5,12 +5,17 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.sagemaker import SagemakerSource, SagemakerSourceConfig +from datahub.ingestion.source.sagemaker_processors.jobs import SAGEMAKER_JOB_TYPES from tests.test_helpers import mce_helpers from tests.unit.test_sagemaker_source_stubs import ( describe_feature_group_response_1, describe_feature_group_response_2, describe_feature_group_response_3, + describe_model_response_1, + describe_model_response_2, + job_stubs, list_feature_groups_response, + list_models_response, ) FROZEN_TIME = "2020-04-14 07:00:00" @@ -57,6 +62,44 @@ def test_sagemaker_ingest(tmp_path, pytestconfig): }, ) + sagemaker_stubber.add_response( + "list_models", + list_models_response, + {}, + ) + + sagemaker_stubber.add_response( + "describe_model", + describe_model_response_1, + {"ModelName": "the-first-model"}, + ) + + sagemaker_stubber.add_response( + "describe_model", + describe_model_response_2, + {"ModelName": "the-second-model"}, + ) + + for job_type, job in job_stubs.items(): + + job_info = SAGEMAKER_JOB_TYPES[job_type] + + sagemaker_stubber.add_response( + job_info.list_command, + job["list"], + {}, + ) + + for job_type, job in job_stubs.items(): + + job_info = SAGEMAKER_JOB_TYPES[job_type] + + sagemaker_stubber.add_response( + job_info.describe_command, + job["describe"], + {job_info.describe_name_key: job["describe_name"]}, + ) + mce_objects = [ wu.mce.to_obj() for wu in sagemaker_source_instance.get_workunits() ] diff --git a/metadata-ingestion/tests/unit/test_rest_sink.py b/metadata-ingestion/tests/unit/test_rest_sink.py index e7b40f30427189..f3d58956eb90f7 100644 --- a/metadata-ingestion/tests/unit/test_rest_sink.py +++ b/metadata-ingestion/tests/unit/test_rest_sink.py @@ -186,9 +186,7 @@ "customProperties": {}, "name": "User Deletions", "description": "Constructs the fct_users_deleted from logging_events", - "type": { - 
"com.linkedin.datajob.azkaban.AzkabanJobType": "SQL" - }, + "type": {"string": "SQL"}, } } ], diff --git a/metadata-ingestion/tests/unit/test_sagemaker_source_stubs.py b/metadata-ingestion/tests/unit/test_sagemaker_source_stubs.py index aebf0e524acc15..12da689522f402 100644 --- a/metadata-ingestion/tests/unit/test_sagemaker_source_stubs.py +++ b/metadata-ingestion/tests/unit/test_sagemaker_source_stubs.py @@ -1,29 +1,28 @@ -import datetime +from datetime import datetime, timezone list_feature_groups_response = { "FeatureGroupSummaries": [ { "FeatureGroupName": "test-2", "FeatureGroupArn": "arn:aws:sagemaker:us-west-2:123412341234:feature-group/test-2", - "CreationTime": datetime.datetime(2021, 6, 24, 9, 48, 37, 35000), + "CreationTime": datetime(2021, 6, 24, 9, 48, 37, 35000), "FeatureGroupStatus": "Created", }, { "FeatureGroupName": "test-1", "FeatureGroupArn": "arn:aws:sagemaker:us-west-2:123412341234:feature-group/test-1", - "CreationTime": datetime.datetime(2021, 6, 23, 13, 58, 10, 264000), + "CreationTime": datetime(2021, 6, 23, 13, 58, 10, 264000), "FeatureGroupStatus": "Created", }, { "FeatureGroupName": "test", "FeatureGroupArn": "arn:aws:sagemaker:us-west-2:123412341234:feature-group/test", - "CreationTime": datetime.datetime(2021, 6, 14, 11, 3, 0, 803000), + "CreationTime": datetime(2021, 6, 14, 11, 3, 0, 803000), "FeatureGroupStatus": "Created", }, ], "NextToken": "", } - describe_feature_group_response_1 = { "FeatureGroupArn": "arn:aws:sagemaker:us-west-2:123412341234:feature-group/test-2", "FeatureGroupName": "test-2", @@ -34,7 +33,7 @@ {"FeatureName": "some-feature-2", "FeatureType": "Integral"}, {"FeatureName": "some-feature-3", "FeatureType": "Fractional"}, ], - "CreationTime": datetime.datetime(2021, 6, 24, 9, 48, 37, 35000), + "CreationTime": datetime(2021, 6, 24, 9, 48, 37, 35000), "OnlineStoreConfig": {"EnableOnlineStore": True}, "OfflineStoreConfig": { "S3StorageConfig": { @@ -53,7 +52,6 @@ "Description": "Yet another test feature group", 
"NextToken": "", } - describe_feature_group_response_2 = { "FeatureGroupArn": "arn:aws:sagemaker:us-west-2:123412341234:feature-group/test-1", "FeatureGroupName": "test-1", @@ -65,13 +63,12 @@ {"FeatureName": "height", "FeatureType": "Fractional"}, {"FeatureName": "time", "FeatureType": "String"}, ], - "CreationTime": datetime.datetime(2021, 6, 23, 13, 58, 10, 264000), + "CreationTime": datetime(2021, 6, 23, 13, 58, 10, 264000), "OnlineStoreConfig": {"EnableOnlineStore": True}, "FeatureGroupStatus": "Created", "Description": "First test feature group", "NextToken": "", } - describe_feature_group_response_3 = { "FeatureGroupArn": "arn:aws:sagemaker:us-west-2:123412341234:feature-group/test", "FeatureGroupName": "test", @@ -82,7 +79,7 @@ {"FeatureName": "feature_2", "FeatureType": "Integral"}, {"FeatureName": "feature_3", "FeatureType": "Fractional"}, ], - "CreationTime": datetime.datetime( + "CreationTime": datetime( 2021, 6, 14, @@ -95,3 +92,1238 @@ "FeatureGroupStatus": "Created", "NextToken": "", } + +auto_ml_job_name = "an-auto-ml-job" +auto_ml_job_arn = "arn:aws:sagemaker:us-west-2:123412341234:auto-ml-job/an-auto-ml-job" +list_auto_ml_jobs_response = { + "AutoMLJobSummaries": [ + { + "AutoMLJobName": auto_ml_job_name, + "AutoMLJobArn": auto_ml_job_arn, + "AutoMLJobStatus": "Completed", + "AutoMLJobSecondaryStatus": "Starting", + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "EndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "FailureReason": "string", + "PartialFailureReasons": [ + {"PartialFailureMessage": "string"}, + ], + }, + ], +} +describe_auto_ml_job_response = { + "AutoMLJobName": auto_ml_job_name, + "AutoMLJobArn": auto_ml_job_arn, + "InputDataConfig": [ + { + "DataSource": { + "S3DataSource": { + "S3DataType": "ManifestFile", # 'ManifestFile'|'S3Prefix' + "S3Uri": "s3://auto-ml-job-input-bucket/file.txt", + } + }, + "CompressionType": "None", # 'None'|'Gzip' + 
"TargetAttributeName": "some-name", + }, + ], + "OutputDataConfig": { + "KmsKeyId": "some-key-id", + "S3OutputPath": "s3://auto-ml-job-output-bucket/file.txt", + }, + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "AutoMLJobObjective": { + "MetricName": "Accuracy", # 'Accuracy'|'MSE'|'F1'|'F1macro'|'AUC' + }, + "ProblemType": "BinaryClassification", # 'BinaryClassification'|'MulticlassClassification'|'Regression' + "AutoMLJobConfig": { + "CompletionCriteria": { + "MaxCandidates": 123, + "MaxRuntimePerTrainingJobInSeconds": 123, + "MaxAutoMLJobRuntimeInSeconds": 123, + }, + "SecurityConfig": { + "VolumeKmsKeyId": "string", + "EnableInterContainerTrafficEncryption": True, # True|False + "VpcConfig": { + "SecurityGroupIds": [ + "string", + ], + "Subnets": [ + "string", + ], + }, + }, + }, + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "EndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "FailureReason": "string", + "PartialFailureReasons": [ + {"PartialFailureMessage": "string"}, + ], + "BestCandidate": { + "CandidateName": "string", + "FinalAutoMLJobObjectiveMetric": { + "Type": "Maximize", # "Maximize" | "Minimize" + "MetricName": "Accuracy", # "Accuracy" | "MSE" | "F1" | "F1macro" | "AUC" + "Value": 1.0, + }, + "ObjectiveStatus": "Succeeded", # "Succeeded" | "Pending" | "Failed" + "CandidateSteps": [ + { + "CandidateStepType": "AWS::SageMaker::TrainingJob", + # "AWS::SageMaker::TrainingJob" + # | "AWS::SageMaker::TransformJob" + # | "AWS::SageMaker::ProcessingJob", + "CandidateStepArn": "string", + "CandidateStepName": "string", + }, + ], + "CandidateStatus": "Completed", + # "Completed" + # | "InProgress" + # | "Failed" + # | "Stopped" + # | "Stopping" + "InferenceContainers": [ + { + "Image": "string", + "ModelDataUrl": "string", + "Environment": {"string": "string"}, + }, + ], + "CreationTime": datetime(2015, 1, 1, 
tzinfo=timezone.utc), + "EndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "FailureReason": "string", + "CandidateProperties": { + "CandidateArtifactLocations": {"Explainability": "string"} + }, + }, + "AutoMLJobStatus": "Completed", # "Completed" | "InProgress" | "Failed" | "Stopped" | "Stopping" + "AutoMLJobSecondaryStatus": "Starting", + # "Starting" + # | "AnalyzingData" + # | "FeatureEngineering" + # | "ModelTuning" + # | "MaxCandidatesReached" + # | "Failed" + # | "Stopped" + # | "MaxAutoMLJobRuntimeReached" + # | "Stopping" + # | "CandidateDefinitionsGenerated" + # | "GeneratingExplainabilityReport" + # | "Completed" + # | "ExplainabilityError" + # | "DeployingModel" + # | "ModelDeploymentError" + "GenerateCandidateDefinitionsOnly": True, # True | False + "AutoMLJobArtifacts": { + "CandidateDefinitionNotebookLocation": "string", + "DataExplorationNotebookLocation": "string", + }, + "ResolvedAttributes": { + "AutoMLJobObjective": { + "MetricName": "Accuracy", # "Accuracy" | "MSE" | "F1" | "F1macro" | "AUC" + }, + "ProblemType": "BinaryClassification", + # "BinaryClassification" + # | "MulticlassClassification" + # | "Regression", + "CompletionCriteria": { + "MaxCandidates": 123, + "MaxRuntimePerTrainingJobInSeconds": 123, + "MaxAutoMLJobRuntimeInSeconds": 123, + }, + }, + "ModelDeployConfig": { + "AutoGenerateEndpointName": True, # True | False + "EndpointName": "string", + }, + "ModelDeployResult": {"EndpointName": "string"}, +} + +compilation_job_name = "a-compilation-job" +compilation_job_arn = ( + "arn:aws:sagemaker:us-west-2:123412341234:compilation-job/a-compilation-job" +) +list_compilation_jobs_response = { + "CompilationJobSummaries": [ + { + "CompilationJobName": compilation_job_name, + "CompilationJobArn": compilation_job_arn, + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "CompilationStartTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "CompilationEndTime": 
datetime(2015, 1, 1, tzinfo=timezone.utc), + "CompilationTargetDevice": "lambda", + "CompilationTargetPlatformOs": "ANDROID", + "CompilationTargetPlatformArch": "X86_64", + "CompilationTargetPlatformAccelerator": "INTEL_GRAPHICS", + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "CompilationJobStatus": "INPROGRESS", + }, + ], +} +describe_compilation_job_response = { + "CompilationJobName": compilation_job_name, + "CompilationJobArn": compilation_job_arn, + "CompilationJobStatus": "INPROGRESS", # 'INPROGRESS'|'COMPLETED'|'FAILED'|'STARTING'|'STOPPING'|'STOPPED' + "CompilationStartTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "CompilationEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "StoppingCondition": {"MaxRuntimeInSeconds": 123, "MaxWaitTimeInSeconds": 123}, + "InferenceImage": "string", + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "FailureReason": "string", + "ModelArtifacts": { + "S3ModelArtifacts": "s3://compilation-job-bucket/model-artifacts.tar.gz" + }, + "ModelDigests": {"ArtifactDigest": "string"}, + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "InputConfig": { + "S3Uri": "s3://compilation-job-bucket/input-config.tar.gz", + "DataInputConfig": "string", + "Framework": "TENSORFLOW", # 'TENSORFLOW'|'KERAS'|'MXNET'|'ONNX'|'PYTORCH'|'XGBOOST'|'TFLITE'|'DARKNET'|'SKLEARN' + "FrameworkVersion": "string", + }, + "OutputConfig": { + "S3OutputLocation": "s3://compilation-job-bucket/output-config.tar.gz", + "TargetDevice": "lambda", + "TargetPlatform": { + "Os": "ANDROID", # 'ANDROID'|'LINUX' + "Arch": "X86_64", # 'X86_64'|'X86'|'ARM64'|'ARM_EABI'|'ARM_EABIHF' + "Accelerator": "INTEL_GRAPHICS", # 'INTEL_GRAPHICS'|'MALI'|'NVIDIA' + }, + "CompilerOptions": "string", + "KmsKeyId": "string", + }, + "VpcConfig": { + "SecurityGroupIds": [ + "string", + ], + "Subnets": [ + "string", + ], + }, +} + 
+edge_packaging_job_name = "an-edge-packaging-job" +edge_packaging_job_arn = ( + "arn:aws:sagemaker:us-west-2:123412341234:edge-packaging-job/an-edge-packaging-job" +) +list_edge_packaging_jobs_response = { + "EdgePackagingJobSummaries": [ + { + "EdgePackagingJobName": edge_packaging_job_name, + "EdgePackagingJobArn": edge_packaging_job_arn, + "EdgePackagingJobStatus": "STARTING", + "CompilationJobName": "string", + "ModelName": "string", + "ModelVersion": "string", + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + }, + ], +} +describe_edge_packaging_job_response = { + "EdgePackagingJobArn": edge_packaging_job_arn, + "EdgePackagingJobName": edge_packaging_job_name, + "CompilationJobName": compilation_job_name, + "ModelName": "string", + "ModelVersion": "string", + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "OutputConfig": { + "S3OutputLocation": "s3://edge-packaging-bucket/output-config.tar.gz", + "KmsKeyId": "string", + "PresetDeploymentType": "GreengrassV2Component", + "PresetDeploymentConfig": "string", + }, + "ResourceKey": "string", + "EdgePackagingJobStatus": "STARTING", # 'STARTING'|'INPROGRESS'|'COMPLETED'|'FAILED'|'STOPPING'|'STOPPED' + "EdgePackagingJobStatusMessage": "string", + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "ModelArtifact": "s3://edge-packaging-bucket/model-artifact.tar.gz", + "ModelSignature": "string", + "PresetDeploymentOutput": { + "Type": "GreengrassV2Component", + "Artifact": "arn:aws:sagemaker:us-west-2:123412341234:edge-packaging-job/some-artifact", + "Status": "COMPLETED", # 'COMPLETED'|'FAILED' + "StatusMessage": "string", + }, +} + +hyper_parameter_tuning_job_name = "a-hyper-parameter-tuning-job" +hyper_parameter_tuning_job_arn = 
"arn:aws:sagemaker:us-west-2:123412341234:hyper-parameter-tuning-job/a-hyper-parameter-tuning-job" +list_hyper_parameter_tuning_jobs_response = { + "HyperParameterTuningJobSummaries": [ + { + "HyperParameterTuningJobName": hyper_parameter_tuning_job_name, + "HyperParameterTuningJobArn": hyper_parameter_tuning_job_arn, + "HyperParameterTuningJobStatus": "Completed", + "Strategy": "Bayesian", + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "HyperParameterTuningEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingJobStatusCounters": { + "Completed": 123, + "InProgress": 123, + "RetryableError": 123, + "NonRetryableError": 123, + "Stopped": 123, + }, + "ObjectiveStatusCounters": { + "Succeeded": 123, + "Pending": 123, + "Failed": 123, + }, + "ResourceLimits": { + "MaxNumberOfTrainingJobs": 123, + "MaxParallelTrainingJobs": 123, + }, + }, + ], +} +describe_hyper_parameter_tuning_job_response = { + "HyperParameterTuningJobName": hyper_parameter_tuning_job_name, + "HyperParameterTuningJobArn": hyper_parameter_tuning_job_arn, + "HyperParameterTuningJobConfig": { + "Strategy": "Bayesian", # 'Bayesian'|'Random' + "HyperParameterTuningJobObjective": { + "Type": "Maximize", # 'Maximize'|'Minimize' + "MetricName": "string", + }, + "ResourceLimits": { + "MaxNumberOfTrainingJobs": 123, + "MaxParallelTrainingJobs": 123, + }, + "ParameterRanges": { + "IntegerParameterRanges": [ + { + "Name": "string", + "MinValue": "string", + "MaxValue": "string", + "ScalingType": "Auto", # 'Auto'|'Linear'|'Logarithmic'|'ReverseLogarithmic' + }, + ], + "ContinuousParameterRanges": [ + { + "Name": "string", + "MinValue": "string", + "MaxValue": "string", + "ScalingType": "Auto", # 'Auto'|'Linear'|'Logarithmic'|'ReverseLogarithmic' + }, + ], + "CategoricalParameterRanges": [ + { + "Name": "string", + "Values": [ + "string", + ], + }, + ], + }, + "TrainingJobEarlyStoppingType": "Off", # 'Off'|'Auto' + 
"TuningJobCompletionCriteria": {"TargetObjectiveMetricValue": 1.0}, + }, + "TrainingJobDefinition": { + "DefinitionName": "string", + "TuningObjective": { + "Type": "Maximize", # "Maximize" | "Minimize" + "MetricName": "string", + }, + "HyperParameterRanges": { + "IntegerParameterRanges": [ + { + "Name": "string", + "MinValue": "string", + "MaxValue": "string", + "ScalingType": "Auto", # 'Auto'|'Linear'|'Logarithmic'|'ReverseLogarithmic' + }, + ], + "ContinuousParameterRanges": [ + { + "Name": "string", + "MinValue": "string", + "MaxValue": "string", + "ScalingType": "Auto", # 'Auto'|'Linear'|'Logarithmic'|'ReverseLogarithmic' + }, + ], + "CategoricalParameterRanges": [ + { + "Name": "string", + "Values": [ + "string", + ], + }, + ], + }, + "StaticHyperParameters": {"string": "string"}, + "AlgorithmSpecification": { + "TrainingImage": "string", + "TrainingInputMode": "Pipe", # 'Pipe'|'File' + "AlgorithmName": "string", + "MetricDefinitions": [ + {"Name": "string", "Regex": "string"}, + ], + }, + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "InputDataConfig": [ + { + "ChannelName": "string", + "DataSource": { + "S3DataSource": { + "S3DataType": "ManifestFile", # 'ManifestFile'|'S3Prefix'|'AugmentedManifestFile' + "S3Uri": "s3://hyper-parameter-tuning-job/data-source.tar.gz", + "S3DataDistributionType": "FullyReplicated", # 'FullyReplicated'|'ShardedByS3Key' + "AttributeNames": [ + "string", + ], + }, + "FileSystemDataSource": { + "FileSystemId": "abcdefgihjklmnopqrstuvwxyz", + "FileSystemAccessMode": "rw", # 'rw'|'ro' + "FileSystemType": "EFS", # 'EFS'|'FSxLustre' + "DirectoryPath": "string", + }, + }, + "ContentType": "string", + "CompressionType": "None", # 'None'|'Gzip' + "RecordWrapperType": "None", # 'None'|'RecordIO' + "InputMode": "Pipe", # 'Pipe'|'File' + "ShuffleConfig": {"Seed": 123}, + }, + ], + "VpcConfig": { + "SecurityGroupIds": [ + "string", + ], + "Subnets": [ + "string", + ], + }, + 
"OutputDataConfig": { + "KmsKeyId": "string", + "S3OutputPath": "s3://hyper-parameter-tuning-job/data-output.tar.gz", + }, + "ResourceConfig": { + "InstanceType": "ml.m4.xlarge", + "InstanceCount": 123, + "VolumeSizeInGB": 123, + "VolumeKmsKeyId": "string", + }, + "StoppingCondition": {"MaxRuntimeInSeconds": 123, "MaxWaitTimeInSeconds": 123}, + "EnableNetworkIsolation": True, # True|False + "EnableInterContainerTrafficEncryption": True, # True|False + "EnableManagedSpotTraining": True, # True|False + "CheckpointConfig": { + "S3Uri": "s3://hyper-parameter-tuning-job/checkpoint-config.tar.gz", + "LocalPath": "string", + }, + "RetryStrategy": {"MaximumRetryAttempts": 123}, + }, + "TrainingJobDefinitions": [ + { + "DefinitionName": "string", + "TuningObjective": { + "Type": "Maximize", # 'Maximize'|'Minimize' + "MetricName": "string", + }, + "HyperParameterRanges": { + "IntegerParameterRanges": [ + { + "Name": "string", + "MinValue": "string", + "MaxValue": "string", + "ScalingType": "Auto", # 'Auto'|'Linear'|'Logarithmic'|'ReverseLogarithmic' + }, + ], + "ContinuousParameterRanges": [ + { + "Name": "string", + "MinValue": "string", + "MaxValue": "string", + "ScalingType": "Auto", # 'Auto'|'Linear'|'Logarithmic'|'ReverseLogarithmic' + }, + ], + "CategoricalParameterRanges": [ + { + "Name": "string", + "Values": [ + "string", + ], + }, + ], + }, + "StaticHyperParameters": {"string": "string"}, + "AlgorithmSpecification": { + "TrainingImage": "string", + "TrainingInputMode": "Pipe", # 'Pipe'|'File' + "AlgorithmName": "string", + "MetricDefinitions": [ + {"Name": "string", "Regex": "string"}, + ], + }, + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "InputDataConfig": [ + { + "ChannelName": "string", + "DataSource": { + "S3DataSource": { + "S3DataType": "ManifestFile", # 'ManifestFile'|'S3Prefix'|'AugmentedManifestFile' + "S3Uri": "s3://hyper-parameter-tuning-job/data-source.tar.gz", + "S3DataDistributionType": 
"FullyReplicated", # 'FullyReplicated'|'ShardedByS3Key' + "AttributeNames": [ + "string", + ], + }, + "FileSystemDataSource": { + "FileSystemId": "abcdefgihjklmnopqrstuvwxyz", + "FileSystemAccessMode": "rw", # 'rw'|'ro' + "FileSystemType": "EFS", # 'EFS'|'FSxLustre' + "DirectoryPath": "string", + }, + }, + "ContentType": "string", + "CompressionType": "None", # 'None'|'Gzip' + "RecordWrapperType": "None", # 'None'|'RecordIO' + "InputMode": "Pipe", # 'Pipe'|'File' + "ShuffleConfig": {"Seed": 123}, + }, + ], + "VpcConfig": { + "SecurityGroupIds": [ + "string", + ], + "Subnets": [ + "string", + ], + }, + "OutputDataConfig": { + "KmsKeyId": "string", + "S3OutputPath": "s3://hyper-parameter-tuning-job/data-output.tar.gz", + }, + "ResourceConfig": { + "InstanceType": "ml.m4.xlarge", + "InstanceCount": 123, + "VolumeSizeInGB": 123, + "VolumeKmsKeyId": "string", + }, + "StoppingCondition": { + "MaxRuntimeInSeconds": 123, + "MaxWaitTimeInSeconds": 123, + }, + "EnableNetworkIsolation": True, # True|False + "EnableInterContainerTrafficEncryption": True, # True|False + "EnableManagedSpotTraining": True, # True|False + "CheckpointConfig": { + "S3Uri": "s3://hyper-parameter-tuning-job/checkpoint-config.tar.gz", + "LocalPath": "string", + }, + "RetryStrategy": {"MaximumRetryAttempts": 123}, + }, + ], + "HyperParameterTuningJobStatus": "Completed", # 'Completed'|'InProgress'|'Failed'|'Stopped'|'Stopping' + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "HyperParameterTuningEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingJobStatusCounters": { + "Completed": 123, + "InProgress": 123, + "RetryableError": 123, + "NonRetryableError": 123, + "Stopped": 123, + }, + "ObjectiveStatusCounters": {"Succeeded": 123, "Pending": 123, "Failed": 123}, + "BestTrainingJob": { + "TrainingJobDefinitionName": "string", + "TrainingJobName": "string", + "TrainingJobArn": "string", + "TuningJobName": "string", + 
"CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingStartTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingJobStatus": "InProgress", # 'InProgress'|'Completed'|'Failed'|'Stopping'|'Stopped' + "TunedHyperParameters": {"string": "string"}, + "FailureReason": "string", + "FinalHyperParameterTuningJobObjectiveMetric": { + "Type": "Maximize", # 'Maximize'|'Minimize' + "MetricName": "string", + "Value": 1.0, + }, + "ObjectiveStatus": "Succeeded", # 'Succeeded'|'Pending'|'Failed' + }, + "OverallBestTrainingJob": { + "TrainingJobDefinitionName": "string", + "TrainingJobName": "string", + "TrainingJobArn": "string", + "TuningJobName": "string", + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingStartTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingJobStatus": "InProgress", # 'InProgress'|'Completed'|'Failed'|'Stopping'|'Stopped' + "TunedHyperParameters": {"string": "string"}, + "FailureReason": "string", + "FinalHyperParameterTuningJobObjectiveMetric": { + "Type": "Maximize", # 'Maximize'|'Minimize' + "MetricName": "string", + "Value": 1.0, + }, + "ObjectiveStatus": "Succeeded", # 'Succeeded'|'Pending'|'Failed' + }, + "WarmStartConfig": { + "ParentHyperParameterTuningJobs": [ + {"HyperParameterTuningJobName": "string"}, + ], + "WarmStartType": "IdenticalDataAndAlgorithm", # 'IdenticalDataAndAlgorithm'|'TransferLearning' + }, + "FailureReason": "string", +} + +labeling_job_name = "a-labeling-job" +labeling_job_arn = ( + "arn:aws:sagemaker:us-west-2:123412341234:labeling-job/a-labeling-job" +) +list_labeling_jobs_response = { + "LabelingJobSummaryList": [ + { + "LabelingJobName": labeling_job_name, + "LabelingJobArn": labeling_job_arn, + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LabelingJobStatus": 
"Initializing", + "LabelCounters": { + "TotalLabeled": 123, + "HumanLabeled": 123, + "MachineLabeled": 123, + "FailedNonRetryableError": 123, + "Unlabeled": 123, + }, + "WorkteamArn": "string", + "PreHumanTaskLambdaArn": "string", + "AnnotationConsolidationLambdaArn": "string", + "FailureReason": "string", + "LabelingJobOutput": { + "OutputDatasetS3Uri": "s3://labeling-job/output-dataset.tar.gz", + "FinalActiveLearningModelArn": "arn:aws:sagemaker:us-west-2:123412341234:labeling-job/final-active-learning-model", + }, + "InputConfig": { + "DataSource": { + "S3DataSource": {"ManifestS3Uri": "string"}, + "SnsDataSource": {"SnsTopicArn": "string"}, + }, + "DataAttributes": { + "ContentClassifiers": [ + "FreeOfPersonallyIdentifiableInformation", + "FreeOfAdultContent", + ] + }, + }, + }, + ], +} +describe_labeling_job_response = { + "LabelingJobStatus": "Initializing", # 'Initializing'|'InProgress'|'Completed'|'Failed'|'Stopping'|'Stopped' + "LabelCounters": { + "TotalLabeled": 123, + "HumanLabeled": 123, + "MachineLabeled": 123, + "FailedNonRetryableError": 123, + "Unlabeled": 123, + }, + "FailureReason": "string", + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "JobReferenceCode": "string", + "LabelingJobName": labeling_job_name, + "LabelingJobArn": labeling_job_arn, + "LabelAttributeName": "string", + "InputConfig": { + "DataSource": { + "S3DataSource": {"ManifestS3Uri": "s3://labeling-job/data-source.tar.gz"}, + "SnsDataSource": {"SnsTopicArn": "string"}, + }, + "DataAttributes": { + "ContentClassifiers": [ + "FreeOfPersonallyIdentifiableInformation", + "FreeOfAdultContent", + ] + }, + }, + "OutputConfig": { + "S3OutputPath": "s3://labeling-job/output-config.tar.gz", + "KmsKeyId": "string", + "SnsTopicArn": "string", + }, + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "LabelCategoryConfigS3Uri": 
"s3://labeling-job/category-config.tar.gz", + "StoppingConditions": { + "MaxHumanLabeledObjectCount": 123, + "MaxPercentageOfInputDatasetLabeled": 123, + }, + "LabelingJobAlgorithmsConfig": { + "LabelingJobAlgorithmSpecificationArn": "string", + "InitialActiveLearningModelArn": "arn:aws:sagemaker:us-west-2:123412341234:labeling-job/initial-active-learning-model", + "LabelingJobResourceConfig": {"VolumeKmsKeyId": "string"}, + }, + "HumanTaskConfig": { + "WorkteamArn": "string", + "UiConfig": { + "UiTemplateS3Uri": "s3://labeling-job/ui-config.tar.gz", + "HumanTaskUiArn": "string", + }, + "PreHumanTaskLambdaArn": "string", + "TaskKeywords": [ + "string", + ], + "TaskTitle": "string", + "TaskDescription": "string", + "NumberOfHumanWorkersPerDataObject": 123, + "TaskTimeLimitInSeconds": 123, + "TaskAvailabilityLifetimeInSeconds": 123, + "MaxConcurrentTaskCount": 123, + "AnnotationConsolidationConfig": {"AnnotationConsolidationLambdaArn": "string"}, + "PublicWorkforceTaskPrice": { + "AmountInUsd": {"Dollars": 123, "Cents": 123, "TenthFractionsOfACent": 123} + }, + }, + "Tags": [ + {"Key": "string", "Value": "string"}, + ], + "LabelingJobOutput": { + "OutputDatasetS3Uri": "s3://labeling-job/output-dataset.tar.gz", + "FinalActiveLearningModelArn": "arn:aws:sagemaker:us-west-2:123412341234:labeling-job/final-active-learning-model", + }, +} + +training_job_name = "a-training-job" +training_job_arn = ( + "arn:aws:sagemaker:us-west-2:123412341234:training-job/a-training-job" +) +list_training_jobs_response = { + "TrainingJobSummaries": [ + { + "TrainingJobName": training_job_name, + "TrainingJobArn": training_job_arn, + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingJobStatus": "InProgress", + }, + ], +} +describe_training_job_response = { + "TrainingJobName": training_job_name, + "TrainingJobArn": training_job_arn, + 
"TuningJobArn": "string", + "LabelingJobArn": "string", + "AutoMLJobArn": "string", + "ModelArtifacts": {"S3ModelArtifacts": "s3://training-job/model-artifact.tar.gz"}, + "TrainingJobStatus": "InProgress", # 'InProgress'|'Completed'|'Failed'|'Stopping'|'Stopped' + "SecondaryStatus": "Starting", # 'Starting'|'LaunchingMLInstances'|'PreparingTrainingStack'|'Downloading'|'DownloadingTrainingImage'|'Training'|'Uploading'|'Stopping'|'Stopped'|'MaxRuntimeExceeded'|'Completed'|'Failed'|'Interrupted'|'MaxWaitTimeExceeded'|'Updating'|'Restarting' + "FailureReason": "string", + "HyperParameters": {"string": "string"}, + "AlgorithmSpecification": { + "TrainingImage": "string", + "AlgorithmName": "string", + "TrainingInputMode": "Pipe", # 'Pipe'|'File' + "MetricDefinitions": [ + {"Name": "string", "Regex": "string"}, + ], + "EnableSageMakerMetricsTimeSeries": True, # True|False + }, + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "InputDataConfig": [ + { + "ChannelName": "string", + "DataSource": { + "S3DataSource": { + "S3DataType": "ManifestFile", # 'ManifestFile'|'S3Prefix'|'AugmentedManifestFile' + "S3Uri": "s3://training-job/input-dataset.tar.gz", + "S3DataDistributionType": "FullyReplicated", # 'FullyReplicated'|'ShardedByS3Key' + "AttributeNames": [ + "string", + ], + }, + "FileSystemDataSource": { + "FileSystemId": "abcdefgihjklmnopqrstuvwxyz", + "FileSystemAccessMode": "rw", # 'rw'|'ro' + "FileSystemType": "EFS", # 'EFS'|'FSxLustre', + "DirectoryPath": "string", + }, + }, + "ContentType": "string", + "CompressionType": "None", # 'None'|'Gzip' + "RecordWrapperType": "None", # 'None'|'RecordIO' + "InputMode": "Pipe", # 'Pipe'|'File' + "ShuffleConfig": {"Seed": 123}, + }, + ], + "OutputDataConfig": { + "KmsKeyId": "string", + "S3OutputPath": "s3://training-job/output-data.tar.gz", + }, + "ResourceConfig": { + "InstanceType": "ml.m4.xlarge", + "InstanceCount": 123, + "VolumeSizeInGB": 123, + "VolumeKmsKeyId": 
"string", + }, + "VpcConfig": { + "SecurityGroupIds": [ + "string", + ], + "Subnets": [ + "string", + ], + }, + "StoppingCondition": {"MaxRuntimeInSeconds": 123, "MaxWaitTimeInSeconds": 123}, + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingStartTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TrainingEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "SecondaryStatusTransitions": [ + { + "Status": "Starting", # 'Starting'|'LaunchingMLInstances'|'PreparingTrainingStack'|'Downloading'|'DownloadingTrainingImage'|'Training'|'Uploading'|'Stopping'|'Stopped'|'MaxRuntimeExceeded'|'Completed'|'Failed'|'Interrupted'|'MaxWaitTimeExceeded'|'Updating'|'Restarting' + "StartTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "EndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "StatusMessage": "string", + }, + ], + "FinalMetricDataList": [ + { + "MetricName": "string", + "Value": 1.0, + "Timestamp": datetime(2015, 1, 1, tzinfo=timezone.utc), + }, + ], + "EnableNetworkIsolation": True, # True|False + "EnableInterContainerTrafficEncryption": True, # True|False + "EnableManagedSpotTraining": True, # True|False + "CheckpointConfig": { + "S3Uri": "s3://training-job/checkpoint-config.tar.gz", + "LocalPath": "string", + }, + "TrainingTimeInSeconds": 123, + "BillableTimeInSeconds": 123, + "DebugHookConfig": { + "LocalPath": "string", + "S3OutputPath": "s3://training-job/debug-hook-config.tar.gz", + "HookParameters": {"string": "string"}, + "CollectionConfigurations": [ + {"CollectionName": "string", "CollectionParameters": {"string": "string"}}, + ], + }, + "ExperimentConfig": { + "ExperimentName": "string", + "TrialName": "string", + "TrialComponentDisplayName": "string", + }, + "DebugRuleConfigurations": [ + { + "RuleConfigurationName": "string", + "LocalPath": "string", + "S3OutputPath": "s3://training-job/debug-rule-config.tar.gz", + "RuleEvaluatorImage": "string", + "InstanceType": 
"ml.t3.medium", + "VolumeSizeInGB": 123, + "RuleParameters": {"string": "string"}, + }, + ], + "TensorBoardOutputConfig": { + "LocalPath": "string", + "S3OutputPath": "s3://training-job/tensorboard-output-config.tar.gz", + }, + "DebugRuleEvaluationStatuses": [ + { + "RuleConfigurationName": "string", + "RuleEvaluationJobArn": "string", + "RuleEvaluationStatus": "InProgress", # 'InProgress'|'NoIssuesFound'|'IssuesFound'|'Error'|'Stopping'|'Stopped' + "StatusDetails": "string", + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + }, + ], + "ProfilerConfig": { + "S3OutputPath": "s3://training-job/profiler-config.tar.gz", + "ProfilingIntervalInMilliseconds": 123, + "ProfilingParameters": {"string": "string"}, + }, + "ProfilerRuleConfigurations": [ + { + "RuleConfigurationName": "string", + "LocalPath": "string", + "S3OutputPath": "s3://training-job/profiler-rule-config.tar.gz", + "RuleEvaluatorImage": "string", + "InstanceType": "ml.t3.medium", + "VolumeSizeInGB": 123, + "RuleParameters": {"string": "string"}, + }, + ], + "ProfilerRuleEvaluationStatuses": [ + { + "RuleConfigurationName": "string", + "RuleEvaluationJobArn": "string", + "RuleEvaluationStatus": "InProgress", # 'InProgress'|'NoIssuesFound'|'IssuesFound'|'Error'|'Stopping'|'Stopped' + "StatusDetails": "string", + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + }, + ], + "ProfilingStatus": "Enabled", # 'Enabled'|'Disabled' + "RetryStrategy": {"MaximumRetryAttempts": 123}, + "Environment": {"string": "string"}, +} + +processing_job_name = "a-processing-job" +processing_job_arn = ( + "arn:aws:sagemaker:us-west-2:123412341234:processing-job/a-processing-job" +) +list_processing_jobs_response = { + "ProcessingJobSummaries": [ + { + "ProcessingJobName": processing_job_name, + "ProcessingJobArn": processing_job_arn, + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "ProcessingEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, 
tzinfo=timezone.utc), + "ProcessingJobStatus": "InProgress", + "FailureReason": "string", + "ExitMessage": "string", + }, + ], +} +describe_processing_job_response = { + "ProcessingJobName": processing_job_name, + "ProcessingJobArn": processing_job_arn, + "ProcessingInputs": [ + { + "InputName": "string", + "AppManaged": True, # True|False + "S3Input": { + "S3Uri": "s3://processing-job/input-data.tar.gz", + "LocalPath": "string", + "S3DataType": "ManifestFile", # 'ManifestFile'|'S3Prefix' + "S3InputMode": "Pipe", # 'Pipe'|'File' + "S3DataDistributionType": "FullyReplicated", # 'FullyReplicated'|'ShardedByS3Key' + "S3CompressionType": "None", # 'None'|'Gzip' + }, + "DatasetDefinition": { + "AthenaDatasetDefinition": { + "Catalog": "athena-catalog", + "Database": "athena-database", + "QueryString": "athena-query-string", + "WorkGroup": "athena-work-group", + "OutputS3Uri": "s3://processing-job/athena-output.tar.gz", + "KmsKeyId": "string", + "OutputFormat": "PARQUET", # 'PARQUET'|'ORC'|'AVRO'|'JSON'|'TEXTFILE' + "OutputCompression": "GZIP", # 'GZIP'|'SNAPPY'|'ZLIB' + }, + "RedshiftDatasetDefinition": { + "ClusterId": "redshift-cluster", + "Database": "redshift-database", + "DbUser": "redshift-db-user", + "QueryString": "redshift-query-string", + "ClusterRoleArn": "arn:aws:sagemaker:us-west-2:123412341234:processing-job/redshift-cluster", + "OutputS3Uri": "s3://processing-job/redshift-output.tar.gz", + "KmsKeyId": "string", + "OutputFormat": "PARQUET", # 'PARQUET'|'CSV' + "OutputCompression": "None", # 'None'|'GZIP'|'BZIP2'|'ZSTD'|'SNAPPY' + }, + "LocalPath": "string", + "DataDistributionType": "FullyReplicated", # 'FullyReplicated'|'ShardedByS3Key' + "InputMode": "Pipe", # 'Pipe'|'File' + }, + }, + ], + "ProcessingOutputConfig": { + "Outputs": [ + { + "OutputName": "string", + "S3Output": { + "S3Uri": "s3://processing-job/processing-output.tar.gz", + "LocalPath": "string", + "S3UploadMode": "Continuous", # 'Continuous'|'EndOfJob' + }, + "FeatureStoreOutput": 
{"FeatureGroupName": "string"}, + "AppManaged": True, # True|False + }, + ], + "KmsKeyId": "string", + }, + "ProcessingResources": { + "ClusterConfig": { + "InstanceCount": 123, + "InstanceType": "ml.t3.medium", + "VolumeSizeInGB": 123, + "VolumeKmsKeyId": "string", + } + }, + "StoppingCondition": {"MaxRuntimeInSeconds": 123}, + "AppSpecification": { + "ImageUri": "string", + "ContainerEntrypoint": [ + "string", + ], + "ContainerArguments": [ + "string", + ], + }, + "Environment": {"string": "string"}, + "NetworkConfig": { + "EnableInterContainerTrafficEncryption": True, # True|False + "EnableNetworkIsolation": True, # True|False + "VpcConfig": { + "SecurityGroupIds": [ + "string", + ], + "Subnets": [ + "string", + ], + }, + }, + "RoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole", + "ExperimentConfig": { + "ExperimentName": "string", + "TrialName": "string", + "TrialComponentDisplayName": "string", + }, + "ProcessingJobStatus": "InProgress", # 'InProgress'|'Completed'|'Failed'|'Stopping'|'Stopped' + "ExitMessage": "string", + "FailureReason": "string", + "ProcessingEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "ProcessingStartTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "MonitoringScheduleArn": "string", + "AutoMLJobArn": auto_ml_job_arn, + "TrainingJobArn": training_job_arn, +} + +transform_job_name = "a-transform-job" +transform_job_arn = ( + "arn:aws:sagemaker:us-west-2:123412341234:transform-job/a-transform-job" +) +list_transform_jobs_response = { + "TransformJobSummaries": [ + { + "TransformJobName": transform_job_name, + "TransformJobArn": transform_job_arn, + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TransformEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LastModifiedTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TransformJobStatus": 
"InProgress", + "FailureReason": "string", + }, + ], +} +describe_transform_job_response = { + "TransformJobName": transform_job_name, + "TransformJobArn": transform_job_arn, + "TransformJobStatus": "InProgress", + # 'InProgress' |'Completed'|'Failed'|'Stopping'|'Stopped' + "FailureReason": "string", + "ModelName": "string", + "MaxConcurrentTransforms": 123, + "ModelClientConfig": { + "InvocationsTimeoutInSeconds": 123, + "InvocationsMaxRetries": 123, + }, + "MaxPayloadInMB": 123, + "BatchStrategy": "MultiRecord", # 'MultiRecord'|'SingleRecord' + "Environment": {"string": "string"}, + "TransformInput": { + "DataSource": { + "S3DataSource": { + "S3DataType": "ManifestFile", # "ManifestFile" | "S3Prefix" | "AugmentedManifestFile" + "S3Uri": "s3://transform-job/input-data-source.tar.gz", + } + }, + "ContentType": "string", + "CompressionType": "None", # "None" | "Gzip" + "SplitType": "None", # "None" | "Line" | "RecordIO" | "TFRecord" + }, + "TransformOutput": { + "S3OutputPath": "s3://transform-job/output.tar.gz", + "Accept": "string", + "AssembleWith": "None", # "None" | "Line" + "KmsKeyId": "string", + }, + "TransformResources": { + "InstanceType": "ml.m4.xlarge", + "InstanceCount": 123, + "VolumeKmsKeyId": "string", + }, + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TransformStartTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "TransformEndTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "LabelingJobArn": labeling_job_arn, + "AutoMLJobArn": auto_ml_job_arn, + "DataProcessing": { + "InputFilter": "string", + "OutputFilter": "string", + "JoinSource": "Input", # "Input" | "None" + }, + "ExperimentConfig": { + "ExperimentName": "string", + "TrialName": "string", + "TrialComponentDisplayName": "string", + }, +} + +job_stubs = { + "auto_ml": { + "list": list_auto_ml_jobs_response, + "describe": describe_auto_ml_job_response, + "describe_name": auto_ml_job_name, + }, + "compilation": { + "list": list_compilation_jobs_response, + "describe": 
describe_compilation_job_response, + "describe_name": compilation_job_name, + }, + "edge_packaging": { + "list": list_edge_packaging_jobs_response, + "describe": describe_edge_packaging_job_response, + "describe_name": edge_packaging_job_name, + }, + "hyper_parameter_tuning": { + "list": list_hyper_parameter_tuning_jobs_response, + "describe": describe_hyper_parameter_tuning_job_response, + "describe_name": hyper_parameter_tuning_job_name, + }, + "labeling": { + "list": list_labeling_jobs_response, + "describe": describe_labeling_job_response, + "describe_name": labeling_job_name, + }, + "processing": { + "list": list_processing_jobs_response, + "describe": describe_processing_job_response, + "describe_name": processing_job_name, + }, + "training": { + "list": list_training_jobs_response, + "describe": describe_training_job_response, + "describe_name": training_job_name, + }, + "transform": { + "list": list_transform_jobs_response, + "describe": describe_transform_job_response, + "describe_name": transform_job_name, + }, +} + +list_models_response = { + "Models": [ + { + "ModelName": "the-first-model", + "ModelArn": "arn:aws:sagemaker:us-west-2:123412341234:model/the-first-model", + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + }, + { + "ModelName": "the-second-model", + "ModelArn": "arn:aws:sagemaker:us-west-2:123412341234:model/the-second-model", + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + }, + ], +} +describe_model_response_1 = { + "ModelName": "the-first-model", + "PrimaryContainer": { + "ContainerHostname": "string", + "Image": "string", + "ImageConfig": { + "RepositoryAccessMode": "Platform", # 'Platform'|'Vpc' + "RepositoryAuthConfig": {"RepositoryCredentialsProviderArn": "string"}, + }, + "Mode": "SingleModel", # 'SingleModel'|'MultiModel' + "ModelDataUrl": "string", + "Environment": {"string": "string"}, + "ModelPackageName": "string", + "MultiModelConfig": { + "ModelCacheSetting": "Enabled", # 'Enabled'|'Disabled' + }, + 
}, + "Containers": [ + { + "ContainerHostname": "string", + "Image": "string", + "ImageConfig": { + "RepositoryAccessMode": "Platform", # 'Platform'|'Vpc' + "RepositoryAuthConfig": {"RepositoryCredentialsProviderArn": "string"}, + }, + "Mode": "SingleModel", # 'SingleModel'|'MultiModel' + "ModelDataUrl": "string", + "Environment": {"string": "string"}, + "ModelPackageName": "string", + "MultiModelConfig": { + "ModelCacheSetting": "Enabled", # 'Enabled'|'Disabled' + }, + }, + ], + "InferenceExecutionConfig": { + "Mode": "Serial", # 'Serial'|'Direct' + }, + "ExecutionRoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMaker-ExecutionRole-20210614T104201", + "VpcConfig": { + "SecurityGroupIds": [ + "string", + ], + "Subnets": [ + "string", + ], + }, + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "ModelArn": "arn:aws:sagemaker:us-west-2:123412341234:model/the-first-model", + "EnableNetworkIsolation": True, # True | False +} +describe_model_response_2 = { + "ModelName": "the-second-model", + "PrimaryContainer": { + "ContainerHostname": "string", + "Image": "string", + "ImageConfig": { + "RepositoryAccessMode": "Platform", # 'Platform'|'Vpc' + "RepositoryAuthConfig": {"RepositoryCredentialsProviderArn": "string"}, + }, + "Mode": "MultiModel", # 'SingleModel'|'MultiModel' + "ModelDataUrl": "string", + "Environment": {"string": "string"}, + "ModelPackageName": "string", + "MultiModelConfig": { + "ModelCacheSetting": "Disabled", # 'Enabled'|'Disabled' + }, + }, + "Containers": [ + { + "ContainerHostname": "string", + "Image": "string", + "ImageConfig": { + "RepositoryAccessMode": "Vpc", # 'Platform'|'Vpc' + "RepositoryAuthConfig": {"RepositoryCredentialsProviderArn": "string"}, + }, + "Mode": "SingleModel", # 'SingleModel'|'MultiModel' + "ModelDataUrl": "string", + "Environment": {"string": "string"}, + "ModelPackageName": "string", + "MultiModelConfig": { + "ModelCacheSetting": "Disabled", # 'Enabled'|'Disabled' + }, + }, + ], + 
"InferenceExecutionConfig": { + "Mode": "Serial", # 'Serial'|'Direct' + }, + "ExecutionRoleArn": "arn:aws:iam::123412341234:role/service-role/AmazonSageMaker-ExecutionRole-20210614T104201", + "VpcConfig": { + "SecurityGroupIds": [ + "string", + ], + "Subnets": [ + "string", + ], + }, + "CreationTime": datetime(2015, 1, 1, tzinfo=timezone.utc), + "ModelArn": "arn:aws:sagemaker:us-west-2:123412341234:model/the-second-model", + "EnableNetworkIsolation": False, # True | False +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl index 1342e6fefe9391..a41b688a921e65 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl @@ -34,8 +34,9 @@ record DataJobInfo includes CustomProperties, ExternalReference { /** * Datajob type + * **NOTE**: AzkabanJobType is deprecated. Please use strings instead. */ - type: union[AzkabanJobType] + type: union[AzkabanJobType, string] /** * DataFlow urn that this job is part of @@ -45,4 +46,9 @@ record DataJobInfo includes CustomProperties, ExternalReference { "entityTypes": [ "dataFlow" ] } flowUrn: optional DataFlowUrn + + /** + * Status of the job + */ + status: optional JobStatus } diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/JobStatus.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/JobStatus.pdl new file mode 100644 index 00000000000000..375cefcadf6010 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/JobStatus.pdl @@ -0,0 +1,42 @@ +namespace com.linkedin.datajob + +/** + * Job statuses + */ +enum JobStatus { + + /** + * Jobs being initialized. + */ + STARTING + + /** + * Jobs currently running. + */ + IN_PROGRESS + + /** + * Jobs being stopped. + */ + STOPPING + + /** + * Jobs that have stopped. + */ + STOPPED + + /** + * Jobs with successful completion. 
+ */ + COMPLETED + + /** + * Jobs that have failed. + */ + FAILED + + /** + * Jobs with unknown status (either unmappable or unavailable) + */ + UNKNOWN +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLFeatureTableAspect.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLFeatureTableAspect.pdl index e63d43e9372b09..528734722bb0cd 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLFeatureTableAspect.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLFeatureTableAspect.pdl @@ -6,6 +6,7 @@ import com.linkedin.common.Ownership import com.linkedin.common.Status import com.linkedin.ml.metadata.MLFeatureTableProperties import com.linkedin.common.Deprecation +import com.linkedin.common.BrowsePaths /** * A union of all supported metadata aspects for a MLFeatureTable @@ -16,5 +17,6 @@ typeref MLFeatureTableAspect = union[ Ownership, InstitutionalMemory, Status, - Deprecation + Deprecation, + BrowsePaths ] diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl index 4065cae276a243..5696e7dd02340a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl @@ -3,6 +3,7 @@ namespace com.linkedin.ml.metadata import com.linkedin.common.MLFeatureUrn import com.linkedin.common.Time import com.linkedin.common.VersionTag +import com.linkedin.common.CustomProperties /** * Properties associated with a ML Model @@ -10,7 +11,7 @@ import com.linkedin.common.VersionTag @Aspect = { "name": "mlModelProperties" } -record MLModelProperties { +record MLModelProperties includes CustomProperties { /** * Documentation of the MLModel