From 8af3d7a89120b6a9ae9813057bb7353bb9a5d111 Mon Sep 17 00:00:00 2001 From: Surbhi Garg Date: Thu, 11 Sep 2025 10:19:40 +0530 Subject: [PATCH] fix: increment connectivity metrics only for certain error code --- src/metrics/constants.ts | 4 ++ src/metrics/metrics-tracer.ts | 11 +++- test/metrics/metrics-tracer.ts | 19 ++++-- test/metrics/metrics.ts | 111 +++++++++++++++++++++++++++------ 4 files changed, 121 insertions(+), 24 deletions(-) diff --git a/src/metrics/constants.ts b/src/metrics/constants.ts index 959eeb3d8..0da5600d8 100644 --- a/src/metrics/constants.ts +++ b/src/metrics/constants.ts @@ -56,6 +56,10 @@ export const METRIC_LABELS = new Set([ METRIC_LABEL_KEY_METHOD, METRIC_LABEL_KEY_STATUS, ]); +export const CONNECTIVITY_ERROR_STATUSES = new Set([ + 'DEADLINE_EXCEEDED', + 'CANCELLED', +]); // Metric names export const METRIC_NAME_OPERATION_LATENCIES = 'operation_latencies'; diff --git a/src/metrics/metrics-tracer.ts b/src/metrics/metrics-tracer.ts index ede4c7f7e..9722f52cc 100644 --- a/src/metrics/metrics-tracer.ts +++ b/src/metrics/metrics-tracer.ts @@ -16,6 +16,7 @@ import {status as Status} from '@grpc/grpc-js'; import {Counter, Histogram} from '@opentelemetry/api'; import {MetricsTracerFactory} from './metrics-tracer-factory'; import { + CONNECTIVITY_ERROR_STATUSES, METRIC_LABEL_KEY_DATABASE, METRIC_LABEL_KEY_METHOD, METRIC_LABEL_KEY_STATUS, @@ -334,7 +335,8 @@ export class MetricsTracer { * Increments the GFE connectivity error count metric. */ public recordGfeConnectivityErrorCount(statusCode: Status) { - if (!this.enabled) return; + if (!this.enabled || !CONNECTIVITY_ERROR_STATUSES.has(Status[statusCode])) + return; const attributes = {...this._clientAttributes}; attributes[METRIC_LABEL_KEY_STATUS] = Status[statusCode]; this._instrumentGfeConnectivityErrorCount?.add(1, attributes); @@ -344,7 +346,12 @@ export class MetricsTracer { * Increments the AFE connectivity error count metric. */ public recordAfeConnectivityErrorCount(statusCode: Status) { - if (!this.enabled || !Spanner.isAFEServerTimingEnabled()) return; + if ( + !this.enabled || + !Spanner.isAFEServerTimingEnabled() || + !CONNECTIVITY_ERROR_STATUSES.has(Status[statusCode]) + ) + return; const attributes = {...this._clientAttributes}; attributes[METRIC_LABEL_KEY_STATUS] = Status[statusCode]; this._instrumentAfeConnectivityErrorCount?.add(1, attributes); diff --git a/test/metrics/metrics-tracer.ts b/test/metrics/metrics-tracer.ts index 87bd90a2e..32885d8d9 100644 --- a/test/metrics/metrics-tracer.ts +++ b/test/metrics/metrics-tracer.ts @@ -165,11 +165,16 @@ describe('MetricsTracer', () => { }); describe('recordGfeConnectivityErrorCount', () => { - it('should increment GFE error counter if enabled', () => { - tracer.recordGfeConnectivityErrorCount(Status.OK); + it('should increment GFE error counter if enabled and connectivity error status code', () => { + tracer.recordGfeConnectivityErrorCount(Status.DEADLINE_EXCEEDED); assert.strictEqual(fakeGfeCounter.add.calledOnce, true); }); + it('should not increment GFE error counter if enabled and non-connectivity error status code', () => { + tracer.recordGfeConnectivityErrorCount(Status.OK); + assert.strictEqual(fakeGfeCounter.add.calledOnce, false); + }); + it('should not increment if disabled', () => { tracer.enabled = false; tracer.recordGfeConnectivityErrorCount(Status.OK); @@ -213,12 +218,18 @@ describe('MetricsTracer', () => { process.env['SPANNER_DISABLE_AFE_SERVER_TIMING'] = 'false'; }); - it('should increment AFE error counter if enabled', () => { + it('should increment AFE error counter if enabled and connectivity error status code', () => { tracer.enabled = true; - tracer.recordAfeConnectivityErrorCount(Status.OK); + tracer.recordAfeConnectivityErrorCount(Status.DEADLINE_EXCEEDED); assert.strictEqual(fakeAfeCounter.add.calledOnce, true); }); + it('should increment AFE error counter if enabled and non-connectivity error status code', () => { + tracer.enabled = true; + tracer.recordAfeConnectivityErrorCount(Status.OK); + assert.strictEqual(fakeAfeCounter.add.calledOnce, false); + }); + it('should not increment if metrics are disabled', () => { tracer.enabled = false; tracer.recordAfeConnectivityErrorCount(Status.OK); diff --git a/test/metrics/metrics.ts b/test/metrics/metrics.ts index cf567928f..0bdda4101 100644 --- a/test/metrics/metrics.ts +++ b/test/metrics/metrics.ts @@ -426,7 +426,7 @@ describe('Test metrics with mock server', () => { ); }); - it('should create connectivity error count metric if GFE/AFE latency is not in header', async () => { + it('should not create connectivity error count metric if GFE/AFE latency is not in header and status is not connectivity error', async () => { gfeStub = sandbox .stub(MetricsTracer.prototype, 'extractGfeLatency') .callsFake(() => null); @@ -453,18 +453,23 @@ describe('Test metrics with mock server', () => { resourceMetrics, METRIC_NAME_ATTEMPT_LATENCIES, ); - const connectivityErrorCountData = getMetricData( - resourceMetrics, - METRIC_NAME_GFE_CONNECTIVITY_ERROR_COUNT, - ); - const afeConnectivityErrorCountData = getMetricData( - resourceMetrics, - METRIC_NAME_AFE_CONNECTIVITY_ERROR_COUNT, - ); - // Verify GFE AFE latency doesn't exist + // Verify GFE AFE latency and connectity error metricsdoesn't exist assert.ok(!hasMetricData(resourceMetrics, METRIC_NAME_GFE_LATENCIES)); assert.ok(!hasMetricData(resourceMetrics, METRIC_NAME_AFE_LATENCIES)); + assert.ok( + !hasMetricData( + resourceMetrics, + METRIC_NAME_GFE_CONNECTIVITY_ERROR_COUNT, + ), + ); + assert.ok( + !hasMetricData( + resourceMetrics, + METRIC_NAME_AFE_CONNECTIVITY_ERROR_COUNT, + ), + ); + const methods = ['batchCreateSessions', 'executeStreamingSql']; methods.forEach(method => { const attributes = { @@ -480,17 +485,87 @@ describe('Test metrics with mock server', () => { getAggregatedValue(operationLatenciesData, attributes); assert.strictEqual(getAggregatedValue(attemptCountData, attributes), 1); getAggregatedValue(attemptLatenciesData, attributes); + }); + }); - // Verify that GFE AFE connectivity error count increased - assert.strictEqual( - getAggregatedValue(connectivityErrorCountData, attributes), - 1, - ); + it('should create connectivity error count metric if GFE/AFE latency is not in header and status is connectivity error', async () => { + gfeStub = sandbox + .stub(MetricsTracer.prototype, 'extractGfeLatency') + .callsFake(() => null); + afeStub = sandbox + .stub(MetricsTracer.prototype, 'extractAfeLatency') + .callsFake(() => null); + const err = { + message: 'Cancelled', + code: grpc.status.CANCELLED, + } as MockError; + spannerMock.setExecutionTime( + spannerMock.commit, + SimulatedExecutionTime.ofError(err), + ); + const database = newTestDatabase(); + try { + await database.runTransactionAsync(async tx => { + await tx.run(selectSql); + await tx.commit(); + }); + } catch (e) { assert.strictEqual( - getAggregatedValue(afeConnectivityErrorCountData, attributes), - 1, + (e as grpc.ServiceError).code, + grpc.status.CANCELLED, ); - }); + } + const {resourceMetrics} = await reader.collect(); + + const operationCountData = getMetricData( + resourceMetrics, + METRIC_NAME_OPERATION_COUNT, + ); + const attemptCountData = getMetricData( + resourceMetrics, + METRIC_NAME_ATTEMPT_COUNT, + ); + const operationLatenciesData = getMetricData( + resourceMetrics, + METRIC_NAME_OPERATION_LATENCIES, + ); + const attemptLatenciesData = getMetricData( + resourceMetrics, + METRIC_NAME_ATTEMPT_LATENCIES, + ); + const connectivityErrorCountData = getMetricData( + resourceMetrics, + METRIC_NAME_GFE_CONNECTIVITY_ERROR_COUNT, + ); + const afeConnectivityErrorCountData = getMetricData( + resourceMetrics, + METRIC_NAME_AFE_CONNECTIVITY_ERROR_COUNT, + ); + + // Verify GFE AFE latency doesn't exist + assert.ok(!hasMetricData(resourceMetrics, METRIC_NAME_GFE_LATENCIES)); + assert.ok(!hasMetricData(resourceMetrics, METRIC_NAME_AFE_LATENCIES)); + const attributes = { + ...commonAttributes, + database: `database-${dbCounter}`, + method: 'commit', + status: 'CANCELLED', + }; + // Verify attempt and operational metrics are unaffected + assert.strictEqual(getAggregatedValue(operationCountData, attributes), 1); + getAggregatedValue(operationLatenciesData, attributes); + assert.strictEqual(getAggregatedValue(attemptCountData, attributes), 1); + getAggregatedValue(attemptLatenciesData, attributes); + + // Verify that GFE AFE connectivity error count increased + assert.strictEqual( + getAggregatedValue(connectivityErrorCountData, attributes), + 1, + ); + assert.strictEqual( + getAggregatedValue(afeConnectivityErrorCountData, attributes), + 1, + ); }); it('should increase attempts on retries for non streaming calls with gax options', async () => {