diff --git a/README.md b/README.md index cd86ede..886a307 100644 --- a/README.md +++ b/README.md @@ -92,9 +92,9 @@ AIO_TELEMETRY_DISABLED=yes aio app deploy Telemetry is **best-effort**: events are not persisted when the proxy is down or the network fails. -On **`postrun`**, any in-memory metrics from earlier hooks in the same command are merged with the `postrun` metric and the combined batch is handed off to a **fire-and-forget detached subprocess** (`src/flush-worker.js`). The parent CLI spawns the worker and immediately `unref()`s it, so the CLI can exit without waiting for the HTTP POST. If the POST fails (network error or non-2xx response), the batch is dropped; telemetry must not block or slow normal CLI use. +A flush happens on a **terminal event** — `postrun` (command succeeded), `command-error` (command threw), or `command-not-found`. oclif runs `postrun` only on success, so `command-error`/`command-not-found` must flush on their own; otherwise error telemetry (the most valuable signal) would be buffered and silently dropped on exit. On a terminal event, any in-memory metrics from earlier hooks in the same command are merged with the terminal metric and the combined batch is handed off to a **fire-and-forget detached subprocess** (`src/flush-worker.js`). The parent CLI spawns the worker and immediately `unref()`s it, so the CLI can exit without waiting for the HTTP POST. If the POST fails (network error or non-2xx response), the batch is dropped; telemetry must not block or slow normal CLI use. -Non-`postrun` events (for example `command-error`, `telemetry-notice`) are held in an **in-memory buffer** until that flush. If the process exits before `postrun` (crash, `SIGKILL`), buffered events are lost. The buffer is cleared when telemetry is disabled or when `init` runs again (new command session). +Non-terminal events (for example `telemetry-notice`) are held in an **in-memory buffer** until the next terminal event flushes them. If the process exits before any terminal event (crash, `SIGKILL`), buffered events are lost. The buffer is cleared when telemetry is disabled or when `init` runs again (new command session). ## Agent detection @@ -134,7 +134,7 @@ Example shape of the metric payload: "value": 1, "timestamp": 1673404918437, "attributes": { - "eventType": "postrun", + "eventName": "postrun", "eventData": "{}", "cliVersion": "@adobe/aio-cli@11.0.2", "clientId": 264421030538, diff --git a/src/telemetry-lib.js b/src/telemetry-lib.js index 12c113f..662490d 100644 --- a/src/telemetry-lib.js +++ b/src/telemetry-lib.js @@ -26,9 +26,18 @@ function isEnvTelemetryDisabled () { let isDisabledForCommand = false -/** Metrics for non-`postrun` events in the current command; merged into one POST on `postrun`. */ +/** Metrics for non-terminal events in the current command; merged into the POST on a terminal event. */ const pendingCommandMetrics = [] +/** Events with no `postrun` after them; each flushes immediately so error telemetry isn't lost. */ +const TERMINAL_EVENTS = ['postrun', 'command-error', 'command-not-found'] + +/** Events that mean the command did not succeed (so `commandSuccess` is false). */ +const FAILED_COMMAND_EVENTS = ['command-error', 'command-not-found'] + +/** Set by `command-not-found` so we can drop the host's immediate `command-error` rethrow of the same typo. */ +let commandNotFoundFired = false + /** * Detects GitHub Copilot Chat on PATH via the extension id in globalStorage paths (any OS path separator). * @@ -173,9 +182,10 @@ function formatEventDataAttribute (eventData) { } /** - * Records a telemetry event. Non-`postrun` metrics are held in memory and sent in a single - * batched POST when `postrun` runs. When enabled, the flush worker is detached so the CLI - * never waits on the network; failed deliveries are dropped (no disk queue). + * Records a telemetry event. Non-terminal metrics are held in memory and sent in a single + * batched POST on the next terminal event (`postrun`, `command-error`, `command-not-found`). + * When enabled, the flush worker is detached so the CLI never waits on the network; failed + * deliveries are dropped (no disk queue). * * @param {string} eventType prerun, postrun, command-error, command-not-found, telemetry * @param {object|string|number|undefined} [rawEventData] Optional hook payload (e.g. `{ message }` on errors). @@ -204,14 +214,16 @@ async function trackEvent (eventType, rawEventData = {}) { value: 1, timestamp, attributes: { - eventType, + // NB: the wire attribute is `eventName`, not `eventType` — `eventType` is a New Relic + // reserved word and is dropped on Metric API ingest, so it is unqueryable in NRQL. + eventName: eventType, eventData: formatEventDataAttribute(eventData), cliVersion: rootCliVersion, clientId, command: prerunEvent.command, commandDuration: timestamp - prerunEvent.start, commandFlags: prerunEvent.flags.toString(), - commandSuccess: eventType !== 'command-error', + commandSuccess: !FAILED_COMMAND_EVENTS.includes(eventType), nodeVersion: process.version, osNameVersion, invocation_context: /* istanbul ignore next */ invocationContext.isAgent ? 'agent' : 'human', @@ -219,11 +231,21 @@ async function trackEvent (eventType, rawEventData = {}) { } } - if (eventType !== 'postrun') { + // Non-terminal events buffer; terminal events flush (oclif runs no postrun after error/not-found). + if (!TERMINAL_EVENTS.includes(eventType)) { pendingCommandMetrics.push(metric) return } + // A typo fires command-not-found, then the host rethrows it as command-error; drop that rethrow. + if (eventType === 'command-not-found') { + commandNotFoundFired = true + } else if (eventType === 'command-error' && commandNotFoundFired) { + commandNotFoundFired = false + debug('dropping command-error that rethrows a command-not-found (same typo)') + return + } + const mergedMetrics = [...pendingCommandMetrics, metric] pendingCommandMetrics.length = 0 const mergedBody = JSON.stringify([{ metrics: mergedMetrics }]) @@ -253,12 +275,15 @@ async function trackEvent (eventType, rawEventData = {}) { */ function trackPrerun (command, flags, start) { prerunEvent = { command, flags, start } + // A real command is now running (e.g. an accepted "did you mean" suggestion), so its errors count. + commandNotFoundFired = false } module.exports = { getInvocationContext, init: (versionString, binName, remoteConf = {}) => { pendingCommandMetrics.length = 0 + commandNotFoundFired = false global.commandHookStartTime = Date.now() rootCliVersion = versionString postUrl = remoteConf.postUrl || process.env.AIO_TELEMETRY_POST_URL || DEFAULT_TELEMETRY_POST_URL diff --git a/test/flush-worker.test.js b/test/flush-worker.test.js index f47a15c..5dd9c18 100644 --- a/test/flush-worker.test.js +++ b/test/flush-worker.test.js @@ -16,7 +16,7 @@ const fetch = createFetch() const { main } = require('../src/flush-worker') const PROXY = 'https://telemetry-proxy.example/api/v1/web/dx-excshell-1/telemetry' -const METRIC = { name: 'aio.cli.telemetry', type: 'gauge', value: 1, timestamp: 1000, attributes: { eventType: 'postrun' } } +const METRIC = { name: 'aio.cli.telemetry', type: 'gauge', value: 1, timestamp: 1000, attributes: { eventName: 'postrun' } } const BODY = JSON.stringify([{ metrics: [METRIC] }]) const flushArg = (body = BODY, extraHeaders) => { const base = { body, postUrl: PROXY } diff --git a/test/hooks.test.js b/test/hooks.test.js index aa24115..2009a1b 100644 --- a/test/hooks.test.js +++ b/test/hooks.test.js @@ -41,32 +41,32 @@ describe('hook interfaces', () => { noticeSpy.mockRestore() }) - test('command-error', async () => { + // oclif does not run `postrun` after a command throws, so the error hook itself must flush. + test('command-error flushes immediately (no postrun follows an error)', async () => { config.get.mockImplementation((key) => (String(key).includes('optOut') ? false : 'clientid')) telemetryLib.init('name@0.0.1', 'aio', mockPackageJson.aioTelemetry) const hook = require('../src/hooks/command-error') expect(typeof hook).toBe('function') await hook({ message: 'msg' }) - expect(spawn).not.toHaveBeenCalled() - await telemetryLib.trackEvent('postrun') expect(spawn).toHaveBeenCalledTimes(1) const flushPayload = JSON.parse(spawn.mock.calls[0][1][1]) const body = JSON.parse(flushPayload.body) - expect(body[0].metrics.map((m) => m.attributes.eventType)).toEqual(['command-error', 'postrun']) + expect(body[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-error']) + expect(body[0].metrics[0].attributes.commandSuccess).toBe(false) }) - test('command-not-found', async () => { + // command-not-found is terminal too: no command runs, so no postrun. + test('command-not-found flushes immediately', async () => { config.get.mockImplementation((key) => (String(key).includes('optOut') ? false : 'clientid')) telemetryLib.init('name@0.0.1', 'aio', mockPackageJson.aioTelemetry) const hook = require('../src/hooks/command-not-found') expect(typeof hook).toBe('function') await hook({ id: 'id' }) - expect(spawn).not.toHaveBeenCalled() - await telemetryLib.trackEvent('postrun') expect(spawn).toHaveBeenCalledTimes(1) const flushPayloadNf = JSON.parse(spawn.mock.calls[0][1][1]) const bodyNf = JSON.parse(flushPayloadNf.body) - expect(bodyNf[0].metrics.map((m) => m.attributes.eventType)).toEqual(['command-not-found', 'postrun']) + expect(bodyNf[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-not-found']) + expect(bodyNf[0].metrics[0].attributes.commandSuccess).toBe(false) }) test('init shows one-time notice on first run', async () => { @@ -83,7 +83,7 @@ describe('hook interfaces', () => { expect(spawn).toHaveBeenCalledTimes(1) const flushPayloadAcc = JSON.parse(spawn.mock.calls[0][1][1]) const bodyAcc = JSON.parse(flushPayloadAcc.body) - expect(bodyAcc[0].metrics.map((m) => m.attributes.eventType)).toEqual(['telemetry-notice', 'postrun']) + expect(bodyAcc[0].metrics.map((m) => m.attributes.eventName)).toEqual(['telemetry-notice', 'postrun']) expect(bodyAcc[0].metrics[0].attributes.eventData).toBe('shown') process.env = preEnv }) @@ -158,7 +158,7 @@ describe('hook interfaces', () => { expect(spawn).toHaveBeenCalledTimes(1) const flushPayloadCe = JSON.parse(spawn.mock.calls[0][1][1]) const bodyCe = JSON.parse(flushPayloadCe.body) - expect(bodyCe[0].metrics.map((m) => m.attributes.eventType)).toEqual(['telemetry-custom-event', 'postrun']) + expect(bodyCe[0].metrics.map((m) => m.attributes.eventName)).toEqual(['telemetry-custom-event', 'postrun']) }) test('postrun', async () => { @@ -170,7 +170,7 @@ describe('hook interfaces', () => { expect(spawn).toHaveBeenCalledTimes(1) const flushPayload = JSON.parse(spawn.mock.calls[0][1][1]) expect(flushPayload.headers).toBeUndefined() - expect(flushPayload.body).toContain('"eventType":"postrun"') + expect(flushPayload.body).toContain('"eventName":"postrun"') }) /** diff --git a/test/telemetry-lib.test.js b/test/telemetry-lib.test.js index f4b2485..df40047 100644 --- a/test/telemetry-lib.test.js +++ b/test/telemetry-lib.test.js @@ -78,8 +78,77 @@ describe('telemetry-lib', () => { const flushPayload = JSON.parse(spawn.mock.calls[0][1][1]) const body = JSON.parse(flushPayload.body) expect(body[0].metrics).toHaveLength(2) - expect(body[0].metrics[0].attributes.eventType).toBe('telemetry-custom-event') - expect(body[0].metrics[1].attributes.eventType).toBe('postrun') + expect(body[0].metrics[0].attributes.eventName).toBe('telemetry-custom-event') + expect(body[0].metrics[1].attributes.eventName).toBe('postrun') + }) + + test('terminal command-error flushes buffered events without a postrun', async () => { + config.get.mockReturnValue('clientidxyz') + telemetryLib.init('a@4', 'binErr', {}) + await telemetryLib.trackEvent('telemetry-custom-event') + expect(spawn).not.toHaveBeenCalled() + await telemetryLib.trackEvent('command-error', { message: 'boom' }) + expect(spawn).toHaveBeenCalledTimes(1) + const flushPayload = JSON.parse(spawn.mock.calls[0][1][1]) + const body = JSON.parse(flushPayload.body) + expect(body[0].metrics.map((m) => m.attributes.eventName)).toEqual(['telemetry-custom-event', 'command-error']) + }) + + test('a typo (command-not-found then host command-error) is recorded as a single command-not-found', async () => { + config.get.mockReturnValue('clientidxyz') + telemetryLib.init('a@4', 'binOnce', {}) + // a typo fires command-not-found (from oclif) and then command-error (rethrown by the host) + await telemetryLib.trackEvent('command-not-found', 'bogus') + await telemetryLib.trackEvent('command-error', { message: 'Run aio help' }) + expect(spawn).toHaveBeenCalledTimes(1) + const flushPayload = JSON.parse(spawn.mock.calls[0][1][1]) + const body = JSON.parse(flushPayload.body) + expect(body[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-not-found']) + }) + + test('only the first command-error after a not-found is suppressed; a later one still flushes', async () => { + config.get.mockReturnValue('clientidxyz') + telemetryLib.init('a@4', 'binReset', {}) + await telemetryLib.trackEvent('command-not-found', 'bogus') // flush #1 + await telemetryLib.trackEvent('command-error', { message: 'Run aio help' }) // dropped: the typo rethrow + await telemetryLib.trackEvent('command-error', { message: 'unrelated later error' }) // flush #2 + expect(spawn).toHaveBeenCalledTimes(2) + expect(JSON.parse(JSON.parse(spawn.mock.calls[0][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-not-found']) + expect(JSON.parse(JSON.parse(spawn.mock.calls[1][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-error']) + }) + + test('"Did you mean ...? Yes": not-found then the suggested command runs and its postrun flushes', async () => { + config.get.mockReturnValue('clientidxyz') + telemetryLib.init('a@4', 'binYes', {}) + await telemetryLib.trackEvent('command-not-found', 'telemetryd') // flush #1 + telemetryLib.trackPrerun('telemetry', [], Date.now()) // suggestion accepted -> command runs + await telemetryLib.trackEvent('postrun') // flush #2 + expect(spawn).toHaveBeenCalledTimes(2) + expect(JSON.parse(JSON.parse(spawn.mock.calls[1][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['postrun']) + }) + + test('"Did you mean ...? Yes" then the suggested command errors: that command-error is NOT masked', async () => { + config.get.mockReturnValue('clientidxyz') + telemetryLib.init('a@4', 'binYesErr', {}) + await telemetryLib.trackEvent('command-not-found', 'telemetryd') // flush #1 + telemetryLib.trackPrerun('telemetry', [], Date.now()) // suggestion accepted -> command runs (clears the flag) + await telemetryLib.trackEvent('command-error', { message: 'the real command failed' }) // flush #2 + expect(spawn).toHaveBeenCalledTimes(2) + expect(JSON.parse(JSON.parse(spawn.mock.calls[1][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-error']) + }) + + test('nested commands in one process each flush (no masking): two postruns => two flushes', async () => { + // A command that calls config.runCommand re-fires prerun/postrun for the child; init runs once. + // The child postrun must NOT suppress the parent terminal event. + config.get.mockReturnValue('clientidxyz') + telemetryLib.init('a@4', 'binNested', {}) + telemetryLib.trackPrerun('app:clean', [], Date.now()) + await telemetryLib.trackEvent('postrun') // child (e.g. app:clean) succeeds + telemetryLib.trackPrerun('app:undeploy', [], Date.now()) + await telemetryLib.trackEvent('command-error', { message: 'parent failed after child succeeded' }) + expect(spawn).toHaveBeenCalledTimes(2) + expect(JSON.parse(JSON.parse(spawn.mock.calls[0][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['postrun']) + expect(JSON.parse(JSON.parse(spawn.mock.calls[1][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-error']) }) test('postrun adds durationMs to eventData when the hook omits payload and prerunTimer is set', async () => {