Skip to content
Prev Previous commit
Next Next commit
field-level usage stats in gms
  • Loading branch information
hsheth2 committed Jun 25, 2021
commit 6721d246e550021c79f5061588343f8fcec3333c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import com.linkedin.restli.server.annotations.ActionParam;
import com.linkedin.restli.server.annotations.RestLiSimpleResource;
import com.linkedin.restli.server.resources.SimpleResourceTemplate;
import com.linkedin.usage.FieldUsageCounts;
import com.linkedin.usage.FieldUsageCountsArray;
import com.linkedin.usage.UsageAggregation;
import com.linkedin.usage.UsageAggregationArray;
import com.linkedin.usage.UsageQueryResult;
Expand Down Expand Up @@ -124,6 +126,30 @@ public Task<UsageQueryResult> query(@ActionParam(PARAM_RESOURCE) @Nonnull String
}
}

// Compute aggregations for field usage counts.
{
Map<String, Integer> fieldAgg = new HashMap<>();
buckets.forEach((bucket) -> {
Optional.ofNullable(bucket.getMetrics().getFields()).ifPresent(fieldUsageCounts -> {
fieldUsageCounts.forEach((fieldCount -> {
String key = fieldCount.getFieldName();
int count = fieldAgg.getOrDefault(key, 0);
count += fieldCount.getCount();
fieldAgg.put(key, count);
}));
});
});

if (!fieldAgg.isEmpty()) {
FieldUsageCountsArray fields = new FieldUsageCountsArray();
fields.addAll(fieldAgg.entrySet().stream().map((mapping) -> new FieldUsageCounts()
.setFieldName(mapping.getKey())
.setCount(mapping.getValue())).collect(Collectors.toList()));
aggregations.setFields(fields);
}
}


return new UsageQueryResult()
.setBuckets(buckets)
.setAggregations(aggregations);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def _aggregate_access_events(
agg_bucket.add_read_entry(
event.email,
event.query_text,
[colRef.columnName for colRef in object.columns],
[colRef.columnName.lower() for colRef in object.columns],
)

return datasets
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from datahub.configuration.common import ConfigModel
from datahub.ingestion.api.workunit import UsageStatsWorkUnit
from datahub.metadata.schema_classes import (
FieldUsageCountsClass,
UsageAggregationClass,
UsageAggregationMetricsClass,
UserUsageCountsClass,
Expand Down Expand Up @@ -90,6 +91,13 @@ def make_usage_workunit(
topSqlQueries=[
query for query, _ in self.queryFreq.most_common(top_n_queries)
],
fields=[
FieldUsageCountsClass(
fieldName=column,
count=count,
)
for column, count in self.columnFreq.most_common()
]
),
),
)
Expand Down
Loading