Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ec8a385
Create common AWS config
kevinhu Jun 23, 2021
d3bf612
Init sagemaker
kevinhu Jun 23, 2021
3db0a58
Common AWS dependencies
kevinhu Jun 23, 2021
d282559
Get features in feature group
kevinhu Jun 23, 2021
8f455c9
Ingest feature groups
kevinhu Jun 23, 2021
2bfb882
Add example ingestion config
kevinhu Jun 23, 2021
5bff9f4
Fix feature ingestion
kevinhu Jun 23, 2021
44ecb58
Append Glue data catalog source
kevinhu Jun 23, 2021
d660a9b
Handle primary key ingestion
kevinhu Jun 24, 2021
0259845
Init tests and stubs
kevinhu Jun 24, 2021
cd4d233
Add sagemaker golden
kevinhu Jun 24, 2021
4ff8434
Clean up golden
kevinhu Jun 24, 2021
8971109
Add descriptions and filter primary keys
kevinhu Jun 24, 2021
9133c85
Include custom fields in feature tables
kevinhu Jun 24, 2021
777f7df
Add sagemaker custom properties
kevinhu Jun 24, 2021
3722726
Merge
kevinhu Jun 24, 2021
149584a
Cleanup
kevinhu Jun 24, 2021
fb70c0b
Fix old references
kevinhu Jun 24, 2021
1c248c3
Add test stub with offline store
kevinhu Jun 24, 2021
3a4012e
Update custom properties
kevinhu Jun 24, 2021
3b575b1
Merge
kevinhu Jun 25, 2021
ffcd8cc
Merge branch 'master' of github.com:kevinhu/datahub into sagemaker-fe…
kevinhu Jun 25, 2021
768393e
Refactor
kevinhu Jun 25, 2021
4bc4601
Merge branch 'master' of github.com:kevinhu/datahub into sagemaker-fe…
kevinhu Jun 28, 2021
63841e4
Update comments
kevinhu Jun 28, 2021
30564cc
Merge branch 'master' of github.com:kevinhu/datahub into sagemaker-fe…
kevinhu Jun 29, 2021
0bbe932
Merge
kevinhu Jun 29, 2021
8f96239
Fix imports order
kevinhu Jun 29, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Init sagemaker
  • Loading branch information
kevinhu committed Jun 23, 2021
commit d3bf61248a0f04bfcc282152cd1f5c7117d93820
22 changes: 22 additions & 0 deletions metadata-ingestion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ We use a plugin architecture so that you can install only the dependencies you a
| oracle | `pip install 'acryl-datahub[oracle]'` | Oracle source |
| postgres | `pip install 'acryl-datahub[postgres]'` | Postgres source |
| redshift | `pip install 'acryl-datahub[redshift]'` | Redshift source |
| sagemaker | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source |
| sqlalchemy | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
| snowflake | `pip install 'acryl-datahub[snowflake]'` | Snowflake source |
| superset | `pip install 'acryl-datahub[superset]'` | Superset source |
Expand Down Expand Up @@ -344,6 +345,27 @@ source:
# options is same as above
```

### AWS SageMaker `sagemaker`

Extracts:

- Feature groups (support for models and jobs coming soon!)

```yml
source:
  type: sagemaker
  config:
    aws_region: # aws_region_name, e.g. "eu-west-1"
    env: # environment for the DatasetSnapshot URN, one of "DEV", "EI", "PROD" or "CORP". Defaults to "PROD".

    # Credentials. If not specified here, these are picked up according to boto3 rules.
    # (see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html)
    aws_access_key_id: # Optional.
    aws_secret_access_key: # Optional.
    aws_session_token: # Optional.
    aws_role: # Optional (Role chaining supported by using a sorted list).
```

### Snowflake `snowflake`

Extracts:
Expand Down
8 changes: 0 additions & 8 deletions metadata-ingestion/src/datahub/ingestion/source/aws_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,3 @@ def get_client(self, service: str) -> boto3.client:
)
else:
return boto3.client(service, region_name=self.aws_region)

@property
def glue_client(self):
    """Client for the "glue" service, obtained from ``get_client`` on each access."""
    service_name = "glue"
    return self.get_client(service_name)

@property
def s3_client(self):
    """Client for the "s3" service, obtained from ``get_client`` on each access."""
    service_name = "s3"
    return self.get_client(service_name)
8 changes: 8 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ class GlueSourceConfig(AwsSourceConfig):

extract_transforms: Optional[bool] = True

@property
def glue_client(self):
    """Return whatever ``get_client`` yields for the "glue" service.

    Nothing is cached here: every attribute access calls ``get_client`` again.
    """
    client = self.get_client("glue")
    return client

@property
def s3_client(self):
    """Return whatever ``get_client`` yields for the "s3" service.

    Nothing is cached here: every attribute access calls ``get_client`` again.
    """
    client = self.get_client("s3")
    return client


@dataclass
class GlueSourceReport(SourceReport):
Expand Down
56 changes: 56 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/sagemaker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from dataclasses import dataclass
from dataclasses import field as dataclass_field
from typing import List

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.source.aws_common import AwsSourceConfig


class SagemakerSourceConfig(AwsSourceConfig):
    """Configuration for the SageMaker source.

    Extends the shared AWS configuration with a convenience accessor for a
    SageMaker client.
    """

    @property
    def sagemaker_client(self):
        """Return a client for the "sagemaker" service.

        The client is built by ``get_client`` (expected to come from
        ``AwsSourceConfig``) each time the property is read.
        """
        # The service identifier must be spelled exactly "sagemaker"; the
        # previous value "samgemaker" is not a service boto3 knows about.
        return self.get_client("sagemaker")


@dataclass
class SagemakerSourceReport(SourceReport):
    """Counters and lists collected while the SageMaker source runs."""

    # Names passed to report_table_dropped(), in call order.
    filtered: List[str] = dataclass_field(default_factory=list)
    # Number of report_table_scanned() calls. The annotation is what makes
    # @dataclass treat this as a real per-instance field (an un-annotated
    # ``tables_scanned = 0`` is only a plain class attribute). It is declared
    # after ``filtered`` so the positional order of the pre-existing
    # constructor arguments does not change.
    tables_scanned: int = 0

    def report_table_scanned(self) -> None:
        """Increment the scanned counter by one."""
        self.tables_scanned += 1

    def report_table_dropped(self, table: str) -> None:
        """Record ``table`` in the ``filtered`` list."""
        self.filtered.append(table)


class SagemakerSource(Source):
    """Ingestion source that reads feature groups from AWS SageMaker."""

    source_config: SagemakerSourceConfig
    # Annotation only: each instance creates its own report in __init__, so no
    # report object is shared between instances through the class.
    report: SagemakerSourceReport

    def __init__(self, config: SagemakerSourceConfig, ctx: PipelineContext):
        """Store the configuration and obtain the SageMaker client from it.

        Args:
            config: Parsed SageMaker source configuration.
            ctx: Pipeline context forwarded to the base ``Source``.
        """
        super().__init__(ctx)
        self.source_config = config
        self.report = SagemakerSourceReport()
        self.sagemaker_client = config.sagemaker_client
        self.env = config.env

    @classmethod
    def create(cls, config_dict, ctx):
        """Build a source from a raw configuration mapping.

        Args:
            config_dict: Raw configuration, parsed with
                ``SagemakerSourceConfig.parse_obj``.
            ctx: Pipeline context passed to the constructor.

        Returns:
            A new instance of ``cls``.
        """
        config = SagemakerSourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_all_feature_groups(self):
        """
        List all feature groups in SageMaker.

        Returns:
            A list with the ``FeatureGroupSummaries`` entries of every page
            returned by the ``list_feature_groups`` paginator, in page order.
        """

        feature_groups = []

        # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.list_feature_groups
        paginator = self.sagemaker_client.get_paginator("list_feature_groups")
        for page in paginator.paginate():
            feature_groups.extend(page["FeatureGroupSummaries"])

        return feature_groups


# Backward-compatible alias: the class was originally (mis)named after the
# Glue source it was copied from.
GlueSource = SagemakerSource