Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ AIO_TELEMETRY_DISABLED=yes aio app deploy

Telemetry is **best-effort**: events are not persisted when the proxy is down or the network fails.

On **`postrun`**, any in-memory metrics from earlier hooks in the same command are merged with the `postrun` metric and the combined batch is handed off to a **fire-and-forget detached subprocess** (`src/flush-worker.js`). The parent CLI spawns the worker and immediately `unref()`s it, so the CLI can exit without waiting for the HTTP POST. If the POST fails (network error or non-2xx response), the batch is dropped; telemetry must not block or slow normal CLI use.
A flush happens on a **terminal event**`postrun` (command succeeded), `command-error` (command threw), or `command-not-found`. oclif runs `postrun` only on success, so `command-error`/`command-not-found` must flush on their own; otherwise error telemetry (the most valuable signal) would be buffered and silently dropped on exit. On a terminal event, any in-memory metrics from earlier hooks in the same command are merged with the terminal metric and the combined batch is handed off to a **fire-and-forget detached subprocess** (`src/flush-worker.js`). The parent CLI spawns the worker and immediately `unref()`s it, so the CLI can exit without waiting for the HTTP POST. If the POST fails (network error or non-2xx response), the batch is dropped; telemetry must not block or slow normal CLI use.

Non-`postrun` events (for example `command-error`, `telemetry-notice`) are held in an **in-memory buffer** until that flush. If the process exits before `postrun` (crash, `SIGKILL`), buffered events are lost. The buffer is cleared when telemetry is disabled or when `init` runs again (new command session).
Non-terminal events (for example `telemetry-notice`) are held in an **in-memory buffer** until the next terminal event flushes them. If the process exits before any terminal event (crash, `SIGKILL`), buffered events are lost. The buffer is cleared when telemetry is disabled or when `init` runs again (new command session).

## Agent detection

Expand Down Expand Up @@ -134,7 +134,7 @@ Example shape of the metric payload:
"value": 1,
"timestamp": 1673404918437,
"attributes": {
"eventType": "postrun",
"eventName": "postrun",
"eventData": "{}",
"cliVersion": "@adobe/aio-cli@11.0.2",
"clientId": 264421030538,
Expand Down
39 changes: 32 additions & 7 deletions src/telemetry-lib.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,18 @@ function isEnvTelemetryDisabled () {

let isDisabledForCommand = false

/** Metrics for non-`postrun` events in the current command; merged into one POST on `postrun`. */
/** Metrics for non-terminal events in the current command; merged into the POST on a terminal event. */
const pendingCommandMetrics = []

/** Events with no `postrun` after them; each flushes immediately so error telemetry isn't lost. */
const TERMINAL_EVENTS = ['postrun', 'command-error', 'command-not-found']

/** Events that mean the command did not succeed (so `commandSuccess` is false). */
const FAILED_COMMAND_EVENTS = ['command-error', 'command-not-found']

/** Set by `command-not-found` so we can drop the host's immediate `command-error` rethrow of the same typo. */
let commandNotFoundFired = false

/**
* Detects GitHub Copilot Chat on PATH via the extension id in globalStorage paths (any OS path separator).
*
Expand Down Expand Up @@ -173,9 +182,10 @@ function formatEventDataAttribute (eventData) {
}

/**
* Records a telemetry event. Non-`postrun` metrics are held in memory and sent in a single
* batched POST when `postrun` runs. When enabled, the flush worker is detached so the CLI
* never waits on the network; failed deliveries are dropped (no disk queue).
* Records a telemetry event. Non-terminal metrics are held in memory and sent in a single
* batched POST on the next terminal event (`postrun`, `command-error`, `command-not-found`).
* When enabled, the flush worker is detached so the CLI never waits on the network; failed
* deliveries are dropped (no disk queue).
*
* @param {string} eventType prerun, postrun, command-error, command-not-found, telemetry
* @param {object|string|number|undefined} [rawEventData] Optional hook payload (e.g. `{ message }` on errors).
Expand Down Expand Up @@ -204,26 +214,38 @@ async function trackEvent (eventType, rawEventData = {}) {
value: 1,
timestamp,
attributes: {
eventType,
// NB: the wire attribute is `eventName`, not `eventType` — `eventType` is a New Relic
// reserved word and is dropped on Metric API ingest, so it is unqueryable in NRQL.
eventName: eventType,
eventData: formatEventDataAttribute(eventData),
cliVersion: rootCliVersion,
clientId,
command: prerunEvent.command,
commandDuration: timestamp - prerunEvent.start,
commandFlags: prerunEvent.flags.toString(),
commandSuccess: eventType !== 'command-error',
commandSuccess: !FAILED_COMMAND_EVENTS.includes(eventType),
nodeVersion: process.version,
osNameVersion,
invocation_context: /* istanbul ignore next */ invocationContext.isAgent ? 'agent' : 'human',
agent_name: /* istanbul ignore next */ invocationContext.agentName || 'unknown'
}
}

if (eventType !== 'postrun') {
// Non-terminal events buffer; terminal events flush (oclif runs no postrun after error/not-found).
if (!TERMINAL_EVENTS.includes(eventType)) {
pendingCommandMetrics.push(metric)
return
}

// A typo fires command-not-found, then the host rethrows it as command-error; drop that rethrow.
if (eventType === 'command-not-found') {
commandNotFoundFired = true
} else if (eventType === 'command-error' && commandNotFoundFired) {
commandNotFoundFired = false
debug('dropping command-error that rethrows a command-not-found (same typo)')
return
Comment thread
mgar marked this conversation as resolved.
}

const mergedMetrics = [...pendingCommandMetrics, metric]
pendingCommandMetrics.length = 0
const mergedBody = JSON.stringify([{ metrics: mergedMetrics }])
Expand Down Expand Up @@ -253,12 +275,15 @@ async function trackEvent (eventType, rawEventData = {}) {
*/
function trackPrerun (command, flags, start) {
prerunEvent = { command, flags, start }
// A real command is now running (e.g. an accepted "did you mean" suggestion), so its errors count.
commandNotFoundFired = false
}

module.exports = {
getInvocationContext,
init: (versionString, binName, remoteConf = {}) => {
pendingCommandMetrics.length = 0
commandNotFoundFired = false
global.commandHookStartTime = Date.now()
rootCliVersion = versionString
postUrl = remoteConf.postUrl || process.env.AIO_TELEMETRY_POST_URL || DEFAULT_TELEMETRY_POST_URL
Expand Down
2 changes: 1 addition & 1 deletion test/flush-worker.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ const fetch = createFetch()
const { main } = require('../src/flush-worker')

const PROXY = 'https://telemetry-proxy.example/api/v1/web/dx-excshell-1/telemetry'
const METRIC = { name: 'aio.cli.telemetry', type: 'gauge', value: 1, timestamp: 1000, attributes: { eventType: 'postrun' } }
const METRIC = { name: 'aio.cli.telemetry', type: 'gauge', value: 1, timestamp: 1000, attributes: { eventName: 'postrun' } }
const BODY = JSON.stringify([{ metrics: [METRIC] }])
const flushArg = (body = BODY, extraHeaders) => {
const base = { body, postUrl: PROXY }
Expand Down
22 changes: 11 additions & 11 deletions test/hooks.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,32 +41,32 @@ describe('hook interfaces', () => {
noticeSpy.mockRestore()
})

test('command-error', async () => {
// oclif does not run `postrun` after a command throws, so the error hook itself must flush.
test('command-error flushes immediately (no postrun follows an error)', async () => {
config.get.mockImplementation((key) => (String(key).includes('optOut') ? false : 'clientid'))
telemetryLib.init('name@0.0.1', 'aio', mockPackageJson.aioTelemetry)
const hook = require('../src/hooks/command-error')
expect(typeof hook).toBe('function')
await hook({ message: 'msg' })
expect(spawn).not.toHaveBeenCalled()
await telemetryLib.trackEvent('postrun')
expect(spawn).toHaveBeenCalledTimes(1)
const flushPayload = JSON.parse(spawn.mock.calls[0][1][1])
const body = JSON.parse(flushPayload.body)
expect(body[0].metrics.map((m) => m.attributes.eventType)).toEqual(['command-error', 'postrun'])
expect(body[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-error'])
expect(body[0].metrics[0].attributes.commandSuccess).toBe(false)
})

test('command-not-found', async () => {
// command-not-found is terminal too: no command runs, so no postrun.
test('command-not-found flushes immediately', async () => {
config.get.mockImplementation((key) => (String(key).includes('optOut') ? false : 'clientid'))
telemetryLib.init('name@0.0.1', 'aio', mockPackageJson.aioTelemetry)
const hook = require('../src/hooks/command-not-found')
expect(typeof hook).toBe('function')
await hook({ id: 'id' })
expect(spawn).not.toHaveBeenCalled()
await telemetryLib.trackEvent('postrun')
expect(spawn).toHaveBeenCalledTimes(1)
const flushPayloadNf = JSON.parse(spawn.mock.calls[0][1][1])
const bodyNf = JSON.parse(flushPayloadNf.body)
expect(bodyNf[0].metrics.map((m) => m.attributes.eventType)).toEqual(['command-not-found', 'postrun'])
expect(bodyNf[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-not-found'])
expect(bodyNf[0].metrics[0].attributes.commandSuccess).toBe(false)
})

test('init shows one-time notice on first run', async () => {
Expand All @@ -83,7 +83,7 @@ describe('hook interfaces', () => {
expect(spawn).toHaveBeenCalledTimes(1)
const flushPayloadAcc = JSON.parse(spawn.mock.calls[0][1][1])
const bodyAcc = JSON.parse(flushPayloadAcc.body)
expect(bodyAcc[0].metrics.map((m) => m.attributes.eventType)).toEqual(['telemetry-notice', 'postrun'])
expect(bodyAcc[0].metrics.map((m) => m.attributes.eventName)).toEqual(['telemetry-notice', 'postrun'])
expect(bodyAcc[0].metrics[0].attributes.eventData).toBe('shown')
process.env = preEnv
})
Expand Down Expand Up @@ -158,7 +158,7 @@ describe('hook interfaces', () => {
expect(spawn).toHaveBeenCalledTimes(1)
const flushPayloadCe = JSON.parse(spawn.mock.calls[0][1][1])
const bodyCe = JSON.parse(flushPayloadCe.body)
expect(bodyCe[0].metrics.map((m) => m.attributes.eventType)).toEqual(['telemetry-custom-event', 'postrun'])
expect(bodyCe[0].metrics.map((m) => m.attributes.eventName)).toEqual(['telemetry-custom-event', 'postrun'])
})

test('postrun', async () => {
Expand All @@ -170,7 +170,7 @@ describe('hook interfaces', () => {
expect(spawn).toHaveBeenCalledTimes(1)
const flushPayload = JSON.parse(spawn.mock.calls[0][1][1])
expect(flushPayload.headers).toBeUndefined()
expect(flushPayload.body).toContain('"eventType":"postrun"')
expect(flushPayload.body).toContain('"eventName":"postrun"')
})

/**
Expand Down
73 changes: 71 additions & 2 deletions test/telemetry-lib.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,77 @@ describe('telemetry-lib', () => {
const flushPayload = JSON.parse(spawn.mock.calls[0][1][1])
const body = JSON.parse(flushPayload.body)
expect(body[0].metrics).toHaveLength(2)
expect(body[0].metrics[0].attributes.eventType).toBe('telemetry-custom-event')
expect(body[0].metrics[1].attributes.eventType).toBe('postrun')
expect(body[0].metrics[0].attributes.eventName).toBe('telemetry-custom-event')
expect(body[0].metrics[1].attributes.eventName).toBe('postrun')
})

test('terminal command-error flushes buffered events without a postrun', async () => {
config.get.mockReturnValue('clientidxyz')
telemetryLib.init('a@4', 'binErr', {})
await telemetryLib.trackEvent('telemetry-custom-event')
expect(spawn).not.toHaveBeenCalled()
await telemetryLib.trackEvent('command-error', { message: 'boom' })
expect(spawn).toHaveBeenCalledTimes(1)
const flushPayload = JSON.parse(spawn.mock.calls[0][1][1])
const body = JSON.parse(flushPayload.body)
expect(body[0].metrics.map((m) => m.attributes.eventName)).toEqual(['telemetry-custom-event', 'command-error'])
})

test('a typo (command-not-found then host command-error) is recorded as a single command-not-found', async () => {
config.get.mockReturnValue('clientidxyz')
telemetryLib.init('a@4', 'binOnce', {})
// a typo fires command-not-found (from oclif) and then command-error (rethrown by the host)
await telemetryLib.trackEvent('command-not-found', 'bogus')
await telemetryLib.trackEvent('command-error', { message: 'Run aio help' })
expect(spawn).toHaveBeenCalledTimes(1)
const flushPayload = JSON.parse(spawn.mock.calls[0][1][1])
const body = JSON.parse(flushPayload.body)
expect(body[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-not-found'])
})

test('only the first command-error after a not-found is suppressed; a later one still flushes', async () => {
config.get.mockReturnValue('clientidxyz')
telemetryLib.init('a@4', 'binReset', {})
await telemetryLib.trackEvent('command-not-found', 'bogus') // flush #1
await telemetryLib.trackEvent('command-error', { message: 'Run aio help' }) // dropped: the typo rethrow
await telemetryLib.trackEvent('command-error', { message: 'unrelated later error' }) // flush #2
expect(spawn).toHaveBeenCalledTimes(2)
expect(JSON.parse(JSON.parse(spawn.mock.calls[0][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-not-found'])
expect(JSON.parse(JSON.parse(spawn.mock.calls[1][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-error'])
})

test('"Did you mean ...? Yes": not-found then the suggested command runs and its postrun flushes', async () => {
config.get.mockReturnValue('clientidxyz')
telemetryLib.init('a@4', 'binYes', {})
await telemetryLib.trackEvent('command-not-found', 'telemetryd') // flush #1
telemetryLib.trackPrerun('telemetry', [], Date.now()) // suggestion accepted -> command runs
await telemetryLib.trackEvent('postrun') // flush #2
expect(spawn).toHaveBeenCalledTimes(2)
expect(JSON.parse(JSON.parse(spawn.mock.calls[1][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['postrun'])
})

test('"Did you mean ...? Yes" then the suggested command errors: that command-error is NOT masked', async () => {
config.get.mockReturnValue('clientidxyz')
telemetryLib.init('a@4', 'binYesErr', {})
await telemetryLib.trackEvent('command-not-found', 'telemetryd') // flush #1
telemetryLib.trackPrerun('telemetry', [], Date.now()) // suggestion accepted -> command runs (clears the flag)
await telemetryLib.trackEvent('command-error', { message: 'the real command failed' }) // flush #2
expect(spawn).toHaveBeenCalledTimes(2)
expect(JSON.parse(JSON.parse(spawn.mock.calls[1][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-error'])
})

test('nested commands in one process each flush (no masking): two postruns => two flushes', async () => {
// A command that calls config.runCommand re-fires prerun/postrun for the child; init runs once.
// The child postrun must NOT suppress the parent terminal event.
config.get.mockReturnValue('clientidxyz')
telemetryLib.init('a@4', 'binNested', {})
telemetryLib.trackPrerun('app:clean', [], Date.now())
await telemetryLib.trackEvent('postrun') // child (e.g. app:clean) succeeds
telemetryLib.trackPrerun('app:undeploy', [], Date.now())
await telemetryLib.trackEvent('command-error', { message: 'parent failed after child succeeded' })
expect(spawn).toHaveBeenCalledTimes(2)
expect(JSON.parse(JSON.parse(spawn.mock.calls[0][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['postrun'])
expect(JSON.parse(JSON.parse(spawn.mock.calls[1][1][1]).body)[0].metrics.map((m) => m.attributes.eventName)).toEqual(['command-error'])
})

test('postrun adds durationMs to eventData when the hook omits payload and prerunTimer is set', async () => {
Expand Down
Loading