Skip to content

Commit 1494205

Browse files
committed
Add UNKNOWN data type
1 parent 4e8cb2e commit 1494205

File tree

5 files changed

+14
-3
lines changed

5 files changed

+14
-3
lines changed

gms/api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -842,7 +842,7 @@
842 842
"name" : "MLFeatureDataType",
843 843
"namespace" : "com.linkedin.common",
844 844
"doc" : "MLFeature Data Type",
845-
"symbols" : [ "USELESS", "NOMINAL", "ORDINAL", "BINARY", "COUNT", "TIME", "INTERVAL", "IMAGE", "VIDEO", "AUDIO", "TEXT", "MAP", "SEQUENCE", "SET", "CONTINUOUS", "BYTE" ],
845+
"symbols" : [ "USELESS", "NOMINAL", "ORDINAL", "BINARY", "COUNT", "TIME", "INTERVAL", "IMAGE", "VIDEO", "AUDIO", "TEXT", "MAP", "SEQUENCE", "SET", "CONTINUOUS", "BYTE", "UNKNOWN" ],
846 846
"symbolDocs" : {
847 847
"AUDIO" : "Audio Data",
848 848
"BINARY" : "Binary data is discrete data that can be in only one of two categories — either yes or no, 1 or 0, off or on, etc",
@@ -858,6 +858,7 @@
858 858
"SET" : "Set Data Type ex: set, frozenset",
859 859
"TEXT" : "Text Data",
860 860
"TIME" : "Time data is a cyclical, repeating continuous form of data.\nThe relevant time features can be any period— daily, weekly, monthly, annual, etc.",
861+
"UNKNOWN" : "Unknown data are data that we don't know the type for.",
861 862
"USELESS" : "Useless data is unique, discrete data with no potential relationship with the outcome variable.\nA useless feature has high cardinality. An example would be bank account numbers that were generated randomly.",
862 863
"VIDEO" : "Video Data"
863 864
}

li-utils/src/main/pegasus/com/linkedin/common/MLFeatureDataType.pdl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,9 @@ enum MLFeatureDataType {
90 90
* Bytes data are binary-encoded values that can represent complex objects.
91 91
*/
92 92
BYTE
93+
94+
/**
95+
* Unknown data are data that we don't know the type for.
96+
*/
97+
UNKNOWN
93 98
}

metadata-ingestion/src/datahub/ingestion/source/feast.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def get_field_type(self, field_type: str, parent_name: str) -> str:
99 99
self.report.report_warning(
100 100
parent_name, f"unable to map type {field_type} to metadata schema"
101 101
)
102-
enum_type = MLFeatureDataType.USELESS
102+
enum_type = MLFeatureDataType.UNKNOWN
103 103

104 104
return enum_type
105 105

metadata-ingestion/src/datahub/metadata/schema.avsc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3664,7 +3664,8 @@
3664 3664
"SEQUENCE",
3665 3665
"SET",
3666 3666
"CONTINUOUS",
3667-
"BYTE"
3667+
"BYTE",
3668+
"UNKNOWN"
3668 3669
],
3669 3670
"symbolDocs": {
3670 3671
"AUDIO": "Audio Data",
@@ -3681,6 +3682,7 @@
3681 3682
"SET": "Set Data Type ex: set, frozenset",
3682 3683
"TEXT": "Text Data",
3683 3684
"TIME": "Time data is a cyclical, repeating continuous form of data.\nThe relevant time features can be any period\u2014 daily, weekly, monthly, annual, etc.",
3685+
"UNKNOWN": "Unknown data are data that we don't know the type for.",
3684 3686
"USELESS": "Useless data is unique, discrete data with no potential relationship with the outcome variable.\nA useless feature has high cardinality. An example would be bank account numbers that were generated randomly.",
3685 3687
"VIDEO": "Video Data"
3686 3688
}

metadata-ingestion/src/datahub/metadata/schema_classes.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1109,6 +1109,9 @@ class MLFeatureDataTypeClass(object):
1109 1109
"""Bytes data are binary-encoded values that can represent complex objects."""
1110 1110
BYTE = "BYTE"
1111 1111

1112+
"""Unknown data are data that we don't know the type for."""
1113+
UNKNOWN = "UNKNOWN"
1114+
1112 1115

1113 1116
class OwnerClass(DictWrapper):
1114 1117
"""Ownership information"""

0 commit comments

Comments
 (0)