Skip to content

Commit d9a38d4

Browse files
committed
2 parents 8e0f9ba + 9be6d2b commit d9a38d4

File tree

80 files changed

+5038
-9363
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+5038
-9363
lines changed

datahub-web-react/src/app/browse/BrowseResultCard.tsx

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { Card, Row, Space, Typography } from 'antd';
33
import { Link } from 'react-router-dom';
44
import { ArrowRightOutlined, FolderOutlined } from '@ant-design/icons';
55
import styled from 'styled-components';
6+
import { singularizeCollectionName } from '../entity/shared/utils';
67

78
const styles = {
89
row: { padding: 8 },
@@ -28,6 +29,10 @@ export interface BrowseResultProps {
2829
}
2930

3031
export default function BrowseResultCard({ url, count, name, type, onClick }: BrowseResultProps) {
32+
let displayType = type;
33+
if (count === 1) {
34+
displayType = singularizeCollectionName(type);
35+
}
3136
return (
3237
<Link to={url} onClick={onClick}>
3338
<ResultCard hoverable>
@@ -41,7 +46,7 @@ export default function BrowseResultCard({ url, count, name, type, onClick }: Br
4146
<Space>
4247
{count && (
4348
<Typography.Text strong>
44-
{count} {type}
49+
{count} {displayType}
4550
</Typography.Text>
4651
)}
4752
<ArrowRightOutlined />

datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ export class GlossaryTermEntity implements Entity<GlossaryTerm> {
4040

4141
getPathName = () => 'glossary';
4242

43-
getCollectionName = () => 'Business Glossary';
43+
getCollectionName = () => 'Glossary Terms';
4444

4545
getEntityName = () => 'Glossary Term';
4646

datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/getTopNQueries.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,16 @@ export default function getTopNQueries(responseSize: number, buckets?: Maybe<Usa
66
return response;
77
}
88

9+
const unique = (value: string | null, index: number, self): boolean => {
10+
return self.indexOf(value) === index;
11+
};
12+
913
for (let i = 0; i < buckets.length; i++) {
1014
const bucket = buckets[i];
1115

1216
if (bucket?.metrics?.topSqlQueries && bucket?.metrics?.topSqlQueries !== null) {
1317
response = [...response, ...bucket?.metrics?.topSqlQueries];
14-
response = response.filter(Boolean);
18+
response = response.filter(Boolean).filter(unique);
1519
if (response.length >= responseSize) {
1620
return response.slice(0, responseSize);
1721
}

datahub-web-react/src/app/entity/shared/utils.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,16 @@ export const truncate = (length: number, input?: string | null) => {
1313
}
1414
return input;
1515
};
16+
17+
export const singularizeCollectionName = (collectionName: string): string => {
18+
if (!collectionName) {
19+
return collectionName;
20+
}
21+
22+
const lastChar = collectionName[collectionName.length - 1];
23+
if (lastChar === 's') {
24+
return collectionName.slice(0, -1);
25+
}
26+
27+
return collectionName;
28+
};

datahub-web-react/src/app/shared/tags/TagTermGroup.tsx

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ const TagLink = styled(Link)`
3838
margin-bottom: 8px;
3939
`;
4040

41+
const NoElementButton = styled(Button)`
42+
:not(:last-child) {
43+
margin-right: 8px;
44+
}
45+
`;
46+
4147
export default function TagTermGroup({
4248
uneditableTags,
4349
editableTags,
@@ -203,7 +209,7 @@ export default function TagTermGroup({
203209
</Typography.Paragraph>
204210
)}
205211
{canAddTag && (uneditableTags?.tags?.length || 0) + (editableTags?.tags?.length || 0) < 10 && (
206-
<Button
212+
<NoElementButton
207213
type={showEmptyMessage && tagsEmpty ? 'default' : 'text'}
208214
onClick={() => {
209215
setAddModalType(EntityType.Tag);
@@ -213,11 +219,11 @@ export default function TagTermGroup({
213219
>
214220
<PlusOutlined />
215221
Add Tag
216-
</Button>
222+
</NoElementButton>
217223
)}
218224
{canAddTerm &&
219225
(uneditableGlossaryTerms?.terms?.length || 0) + (editableGlossaryTerms?.terms?.length || 0) < 10 && (
220-
<Button
226+
<NoElementButton
221227
type={showEmptyMessage && tagsEmpty ? 'default' : 'text'}
222228
onClick={() => {
223229
setAddModalType(EntityType.GlossaryTerm);
@@ -227,7 +233,7 @@ export default function TagTermGroup({
227233
>
228234
<PlusOutlined />
229235
Add Term
230-
</Button>
236+
</NoElementButton>
231237
)}
232238
{showAddModal && !!entityUrn && !!entityType && (
233239
<AddTagTermModal

docs/modeling/metadata-model.md

Lines changed: 206 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,4 +331,209 @@ to see an example of a timeseries aspect.
331331
Because timeseries aspects are updated on a frequent basis, ingests of these aspects go straight to elastic search (
332332
instead of being stored in local DB).
333333

334-
You can retrieve timeseries aspects using the "aspects?action=getTimeseriesAspectValues" end point.
334+
You can retrieve timeseries aspects using the "aspects?action=getTimeseriesAspectValues" end point.
335+
336+
#### Aggregatable Timeseries aspects
337+
Being able to perform SQL like *group by + aggregate* operations on the timeseries aspects is a very natural use-case for
338+
this kind of data (dataset profiles, usage statistics etc.). This section describes how to define, ingest and perform an
339+
aggregation query against a timeseries aspect.
340+
341+
##### Defining a new aggregatable Timeseries aspect.
342+
343+
The *@TimeseriesField* and the *@TimeseriesFieldCollection* are two new annotations that can be attached to a field of
344+
a *Timeseries aspect* that allows it to be part of an aggregatable query. The kinds of aggregations allowed on these
345+
annotated fields depends on the type of the field, as well as the kind of aggregation, as
346+
described [here](#Performing-an-aggregation-on-a-Timeseries-aspect).
347+
348+
* `@TimeseriesField = {}` - this annotation can be used with any type of non-collection type field of the aspect such as
349+
primitive types and records (see the fields *stat*, *strStat* and *strArray* fields
350+
of [TestEntityProfile.pdl](https://github.com/linkedin/datahub/blob/master/test-models/src/main/pegasus/com/datahub/test/TestEntityProfile.pdl)).
351+
352+
* The `@TimeseriesFieldCollection {"key":"<name of the key field of collection item type>"}` annotation allows for
353+
aggregation support on the items of a collection type (supported only for the array type collections for now), where the
354+
value of `"key"` is the name of the field in the collection item type that will be used to specify the group-by clause (
355+
see *userCounts* and *fieldCounts* fields of [DatasetUsageStatistics.pdl](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetUsageStatistics.pdl)).
356+
357+
In addition to defining the new aspect with appropriate Timeseries annotations,
358+
the [entity-registry.yml](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/resources/entity-registry.yml)
359+
file needs to be updated as well. Just add the new aspect name under the list of aspects against the appropriate entity as shown below, such as `datasetUsageStatistics` for the aspect DatasetUsageStatistics.
360+
```yaml
361+
entities:
362+
- name: dataset
363+
keyAspect: datasetKey
364+
aspects:
365+
- datasetProfile
366+
- datasetUsageStatistics
367+
```
368+
369+
##### Ingesting a Timeseries aspect
370+
The timeseries aspects can be ingested via the GSM REST endpoint `/aspects?action=ingestProposal` or via the python API.
371+
372+
Example1: Via GSM REST API using curl.
373+
374+
```shell
375+
curl --location --request POST 'http://localhost:8080/aspects?action=ingestProposal' \
376+
--header 'X-RestLi-Protocol-Version: 2.0.0' \
377+
--header 'Content-Type: application/json' \
378+
--data-raw '{
379+
"proposal" : {
380+
"entityType": "dataset",
381+
"entityUrn" : "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)",
382+
"changeType" : "UPSERT",
383+
"aspectName" : "datasetUsageStatistics",
384+
"aspect" : {
385+
"value" : "{ \"timestampMillis\":1629840771000,\"uniqueUserCount\" : 10, \"totalSqlQueries\": 20, \"fieldCounts\": [ {\"fieldPath\": \"col1\", \"count\": 20}, {\"fieldPath\" : \"col2\", \"count\": 5} ]}",
386+
"contentType": "application/json"
387+
}
388+
}
389+
}'
390+
```
391+
Example2: Via Python API to Kafka(or REST)
392+
```python
393+
from datahub.metadata.schema_classes import (
394+
ChangeTypeClass,
395+
DatasetFieldUsageCountsClass,
396+
DatasetUsageStatisticsClass,
397+
)
398+
from datahub.emitter.kafka_emitter import DatahubKafkaEmitter
399+
from datahub.emitter.rest_emitter import DatahubRestEmitter
400+
401+
usageStats = DatasetUsageStatisticsClass(
402+
timestampMillis=1629840771000,
403+
uniqueUserCount=10,
404+
totalSqlQueries=20,
405+
fieldCounts=[
406+
DatasetFieldUsageCountsClass(
407+
fieldPath="col1",
408+
count=10
409+
)
410+
]
411+
)
412+
413+
mcpw = MetadataChangeProposalWrapper(
414+
entityType="dataset",
415+
aspectName="datasetUsageStatistics",
416+
changeType=ChangeTypeClass.UPSERT,
417+
entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)",
418+
aspect=usageStats,
419+
)
420+
421+
# Instantiate appropriate emitter (kafk_emitter/rest_emitter)
422+
my_emitter = DatahubKafkaEmitter("""<config>""")
423+
my_emitter.emit(mcpw)
424+
```
425+
426+
##### Performing an aggregation on a Timeseries aspect.
427+
428+
Aggreations on timeseries aspects can be performed by the GSM REST API for `/analytics?action=getTimeseriesStats` which
429+
accepts the following params.
430+
* `entityName` - The name of the entity the aspect is associated with.
431+
* `aspectName` - The name of the aspect.
432+
* `filter` - Any pre-filtering criteria before grouping and aggregations are performed.
433+
* `metrics` - A list of aggregation specification. The `fieldPath` member of an aggregation specification refers to the
434+
field name against which the aggregation needs to be performed, and the `aggregationType` specifies the kind of aggregation.
435+
* `buckets` - A list of grouping bucket specifications. Each grouping bucket has a `key` field that refers to the field
436+
to use for grouping. The `type` field specifies the kind of grouping bucket.
437+
438+
We support three kinds of aggregations that can be specified in an aggregation query on the Timeseries annotated fields.
439+
The values that `aggregationType` can take are
440+
441+
* `LATEST`: The latest value of the field in each bucket. Supported for any type of field.
442+
* `SUM`: The cumulative sum of the field in each bucket. Supported only for integral types.
443+
* `CARDINALITY`: The number of unique values or the cardinality of the set in each bucket. Supported for string and
444+
record types.
445+
446+
We support two types of grouping for defining the buckets to perform aggregations against.
447+
448+
* `DATE_GROUPING_BUCKET`: Allows for creating time-based buckets such as by second, minute, hour, day, week, month,
449+
quarter, year etc. Should be used in conjunction with a timestamp field whose value is in milliseconds since *epoch*.
450+
The `timeWindowSize` param specifies the date histogram bucket width.
451+
* `STRING_GROUPING_BUCKET`: Allows for creating buckets grouped by the unique values of a field. Should be used in
452+
conjunction with a string type field always.
453+
454+
The API returns a generic SQL like table as the `table` member of the output that contains the results of
455+
the `group-by/aggregate` query, in addition to echoing the input params.
456+
457+
* `columnNames`: the names of the table columns. The group-by `key` names appear in the same order as they are specified
458+
in the request. Aggregation specifications follow the grouping fields in the same order as specified in the request,
459+
and will be named `<agg_name>_<fieldPath>`.
460+
* `columnTypes`: the data types of the columns.
461+
* `rows`: the data values, each row corresponding to the respective bucket(s).
462+
463+
Example1: Latest unique user count for each day.
464+
```shell
465+
# QUERY
466+
curl --location --request POST 'http://localhost:8080/analytics?action=getTimeseriesStats' \
467+
--header 'X-RestLi-Protocol-Version: 2.0.0' \
468+
--header 'Content-Type: application/json' \
469+
--data-raw '{
470+
"entityName": "dataset",
471+
"aspectName": "datasetUsageStatistics",
472+
"filter": {
473+
"criteria": []
474+
},
475+
"metrics": [
476+
{
477+
"fieldPath": "uniqueUserCount",
478+
"aggregationType": "LATEST"
479+
}
480+
],
481+
"buckets": [
482+
{
483+
"key": "timestampMillis",
484+
"type": "DATE_GROUPING_BUCKET",
485+
"timeWindowSize": {
486+
"multiple": 1,
487+
"unit": "DAY"
488+
}
489+
}
490+
]
491+
}'
492+
493+
# SAMPLE RESPOSNE
494+
{
495+
"value": {
496+
"filter": {
497+
"criteria": []
498+
},
499+
"aspectName": "datasetUsageStatistics",
500+
"entityName": "dataset",
501+
"groupingBuckets": [
502+
{
503+
"type": "DATE_GROUPING_BUCKET",
504+
"timeWindowSize": {
505+
"multiple": 1,
506+
"unit": "DAY"
507+
},
508+
"key": "timestampMillis"
509+
}
510+
],
511+
"aggregationSpecs": [
512+
{
513+
"fieldPath": "uniqueUserCount",
514+
"aggregationType": "LATEST"
515+
}
516+
],
517+
"table": {
518+
"columnNames": [
519+
"timestampMillis",
520+
"latest_uniqueUserCount"
521+
],
522+
"rows": [
523+
[
524+
"1631491200000",
525+
"1"
526+
]
527+
],
528+
"columnTypes": [
529+
"long",
530+
"int"
531+
]
532+
}
533+
}
534+
}
535+
```
536+
For more examples on the complex types of group-by/aggregations, refer to the tests in the group `getAggregatedStats` of [ElasticSearchTimeseriesAspectServiceTest.java](https://github.com/linkedin/datahub/blob/master/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java).
537+
538+
539+

entity-registry/src/main/java/com/linkedin/metadata/models/AspectSpec.java

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,29 @@ public class AspectSpec {
1414
private final AspectAnnotation _aspectAnnotation;
1515
private final Map<String, SearchableFieldSpec> _searchableFieldSpecs;
1616
private final Map<String, RelationshipFieldSpec> _relationshipFieldSpecs;
17+
private final Map<String, TimeseriesFieldSpec> _timeseriesFieldSpecs;
18+
private final Map<String, TimeseriesFieldCollectionSpec> _timeseriesFieldCollectionSpecs;
1719

1820
// Classpath & Pegasus-specific: Temporary.
1921
private final RecordDataSchema _schema;
2022

2123
public AspectSpec(@Nonnull final AspectAnnotation aspectAnnotation,
2224
@Nonnull final List<SearchableFieldSpec> searchableFieldSpecs,
23-
@Nonnull final List<RelationshipFieldSpec> relationshipFieldSpecs, final RecordDataSchema schema) {
25+
@Nonnull final List<RelationshipFieldSpec> relationshipFieldSpecs,
26+
@Nonnull final List<TimeseriesFieldSpec> timeseriesFieldSpecs,
27+
@Nonnull final List<TimeseriesFieldCollectionSpec> timeseriesFieldCollectionSpecs,
28+
final RecordDataSchema schema) {
2429
_aspectAnnotation = aspectAnnotation;
2530
_searchableFieldSpecs = searchableFieldSpecs.stream()
2631
.collect(Collectors.toMap(spec -> spec.getPath().toString(), spec -> spec, (val1, val2) -> val1));
2732
_relationshipFieldSpecs = relationshipFieldSpecs.stream()
2833
.collect(Collectors.toMap(spec -> spec.getPath().toString(), spec -> spec, (val1, val2) -> val1));
34+
_timeseriesFieldSpecs = timeseriesFieldSpecs.stream()
35+
.collect(Collectors.toMap(spec -> spec.getTimeseriesFieldAnnotation().getStatName(), spec -> spec,
36+
(val1, val2) -> val1));
37+
_timeseriesFieldCollectionSpecs = timeseriesFieldCollectionSpecs.stream()
38+
.collect(Collectors.toMap(spec -> spec.getTimeseriesFieldCollectionAnnotation().getCollectionName(), spec -> spec,
39+
(val1, val2) -> val1));
2940
_schema = schema;
3041
}
3142

@@ -45,6 +56,14 @@ public Map<String, RelationshipFieldSpec> getRelationshipFieldSpecMap() {
4556
return _relationshipFieldSpecs;
4657
}
4758

59+
public Map<String, TimeseriesFieldSpec> getTimeseriesFieldSpecMap() {
60+
return _timeseriesFieldSpecs;
61+
}
62+
63+
public Map<String, TimeseriesFieldCollectionSpec> getTimeseriesFieldCollectionSpecMap() {
64+
return _timeseriesFieldCollectionSpecs;
65+
}
66+
4867
public List<SearchableFieldSpec> getSearchableFieldSpecs() {
4968
return new ArrayList<>(_searchableFieldSpecs.values());
5069
}
@@ -53,6 +72,14 @@ public List<RelationshipFieldSpec> getRelationshipFieldSpecs() {
5372
return new ArrayList<>(_relationshipFieldSpecs.values());
5473
}
5574

75+
public List<TimeseriesFieldSpec> getTimeseriesFieldSpecs() {
76+
return new ArrayList<>(_timeseriesFieldSpecs.values());
77+
}
78+
79+
public List<TimeseriesFieldCollectionSpec> getTimeseriesFieldCollectionSpecs() {
80+
return new ArrayList<>(_timeseriesFieldCollectionSpecs.values());
81+
}
82+
5683
public RecordDataSchema getPegasusSchema() {
5784
return _schema;
5885
}

0 commit comments

Comments
 (0)