Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions .claude/scripts/sync-upstream-skills.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Re-vendor upstream Claude skills from NVIDIA-NeMo/Evaluator at a pinned SHA.
#
# Scope: only skills we vendor verbatim (launching-evals, accessing-mlflow).
# The `evaluation` skill is a *modified* fork of upstream nel-assistant and is
# NOT managed by this script — update it manually when pulling upstream changes.
#
# Usage:
# .claude/scripts/sync-upstream-skills.sh # re-vendor at the pinned SHA
# UPSTREAM_SHA=<sha> .claude/scripts/sync-upstream-skills.sh # bump to a new SHA
#
# Requires: gh, base64, awk. Run from the repo root.
#
# The script overwrites .claude/skills/<skill>/ with upstream contents and
# re-applies our provenance lines into each SKILL.md frontmatter. If you have
# local changes to a vendored skill, they will be lost — that is expected,
# since vendored-verbatim skills should not be modified locally.

set -euo pipefail

# Pinned upstream commit. Bump this (or pass UPSTREAM_SHA=...) when syncing.
readonly DEFAULT_SHA="01899f89e8f31116efbca56e8f87fbd8513e24ac"
SHA="${UPSTREAM_SHA:-$DEFAULT_SHA}"
SHORT_SHA="${SHA:0:7}"
readonly SHA SHORT_SHA

readonly UPSTREAM_REPO="NVIDIA-NeMo/Evaluator"
readonly UPSTREAM_BASE="packages/nemo-evaluator-launcher/.claude/skills"
readonly DEST_BASE=".claude/skills"

# Fail fast on missing tools (the header promises gh/base64/awk) instead of
# dying mid-sync with a confusing "command not found" after partial deletion.
for tool in gh base64 awk; do
  command -v "$tool" >/dev/null 2>&1 || {
    echo "error: required tool '$tool' not found in PATH" >&2
    exit 1
  }
done

if [[ ! -d "$DEST_BASE" ]]; then
  echo "error: run from the repo root (expected $DEST_BASE/ to exist)" >&2
  exit 1
fi

echo "Syncing upstream skills from $UPSTREAM_REPO @ $SHORT_SHA"

# List one level of an upstream skill directory as "<type>\t<name>" lines.
#
# Arguments:
#   $1 - skill name (directory under $UPSTREAM_BASE)
#   $2 - optional subpath within the skill; omit to list the skill root
#
# Fix: the subpath used to be mandatory and was always appended as "/$path",
# which produced a trailing-slash URL when listing the skill root — likely
# why this helper went unused. It now works for both root and nested dirs.
fetch_tree() {
  local skill="$1"
  local path="${2:-}"

  local remote="$UPSTREAM_BASE/$skill"
  if [[ -n "$path" ]]; then
    remote="$remote/$path"
  fi

  gh api "repos/$UPSTREAM_REPO/contents/$remote?ref=$SHA" \
    -q '.[] | "\(.type)\t\(.name)"'
}

# Download one upstream file at the pinned SHA into the working tree.
#
# Arguments:
#   $1 - path within the upstream repo
#   $2 - local destination path (parent directories are created)
fetch_file() {
  local remote_path="$1"
  local local_path="$2"

  mkdir -p "$(dirname "$local_path")"

  # The GitHub contents API returns the blob base64-encoded in .content.
  gh api "repos/$UPSTREAM_REPO/contents/$remote_path?ref=$SHA" -q '.content' \
    | base64 -d > "$local_path"
}

# Mirror one vendored skill (or a subdirectory of it) from upstream into
# $DEST_BASE, walking the GitHub contents API one directory at a time.
#
# Arguments:
#   $1 - skill name
#   $2 - optional subdirectory relative to the skill root (used by recursion)
fetch_skill_recursive() {
  local skill="$1"
  local subpath="${2:-}"

  local remote="$UPSTREAM_BASE/$skill"
  if [[ -n "$subpath" ]]; then
    remote="$remote/$subpath"
  fi

  # One "<type>\t<name>" line per directory entry.
  local listing
  listing=$(gh api "repos/$UPSTREAM_REPO/contents/$remote?ref=$SHA" -q '.[] | "\(.type)\t\(.name)"')

  local entry_type entry_name rel child_dst
  while IFS=$'\t' read -r entry_type entry_name; do
    if [[ -n "$subpath" ]]; then
      rel="$subpath/$entry_name"
    else
      rel="$entry_name"
    fi

    case "$entry_type" in
      file)
        child_dst="$DEST_BASE/$skill/$rel"
        echo " fetch: $child_dst"
        fetch_file "$UPSTREAM_BASE/$skill/$rel" "$child_dst"
        ;;
      dir)
        # Descend into the subdirectory; other entry types (e.g. symlinks,
        # submodules) are intentionally skipped, as before.
        fetch_skill_recursive "$skill" "$rel"
        ;;
    esac
  done <<< "$listing"
}

# Inject our provenance lines into a SKILL.md frontmatter, right after the
# `description:` line. Idempotent — removes any existing provenance block first.
#
# Arguments:
#   $1 - skill directory name under $DEST_BASE
#   $2 - optional extra note; '|'-separated segments become one '# ' comment
#        line each, appended after the standard provenance lines
#
# Fix: previously, a SKILL.md without a `description:` line was rewritten
# with no provenance at all and the script still reported success. Now the
# awk exits non-zero in that case; the file is left untouched, the temp file
# is removed, and the function returns 1 with a clear error.
inject_provenance() {
  local skill="$1"
  local extra_note="${2:-}"
  local path="$DEST_BASE/$skill/SKILL.md"

  if awk -v sha="$SHA" -v short="$SHORT_SHA" -v skill="$skill" -v extra="$extra_note" '
    BEGIN { in_fm = 0; injected = 0; fm_end_seen = 0 }
    # Skip any pre-existing provenance or license lines we own
    /^license: Apache-2\.0$/ && in_fm && !fm_end_seen { next }
    /^# Vendored verbatim/ && in_fm && !fm_end_seen { next }
    /^# https:\/\/github\.com\/NVIDIA-NeMo\/Evaluator\/tree\// && in_fm && !fm_end_seen { next }
    /^# To re-sync:/ && in_fm && !fm_end_seen { next }
    /^# Note: this skill depends on the mlflow-mcp/ && in_fm && !fm_end_seen { next }
    /^# configured in the user/ && in_fm && !fm_end_seen { next }
    {
      print
      # Track frontmatter boundaries: first "---" opens, second closes.
      if ($0 == "---") {
        if (in_fm == 0) { in_fm = 1 }
        else { in_fm = 0; fm_end_seen = 1 }
      }
      if (in_fm && !injected && $0 ~ /^description: /) {
        print "license: Apache-2.0"
        print "# Vendored verbatim from NVIDIA NeMo Evaluator (commit " short ")"
        print "# https://github.com/NVIDIA-NeMo/Evaluator/tree/" sha "/packages/nemo-evaluator-launcher/.claude/skills/" skill
        print "# To re-sync: .claude/scripts/sync-upstream-skills.sh"
        if (extra != "") {
          n = split(extra, lines, "\\|")
          for (i = 1; i <= n; i++) print "# " lines[i]
        }
        injected = 1
      }
    }
    # Non-zero exit if there was no description line to anchor the block on.
    END { exit injected ? 0 : 1 }
  ' "$path" > "$path.tmp"; then
    mv "$path.tmp" "$path"
  else
    rm -f "$path.tmp"
    echo "error: no 'description:' line in $path frontmatter; provenance not injected" >&2
    return 1
  fi
}

# Re-vendor each verbatim-vendored skill, then stamp provenance into its
# SKILL.md. The `evaluation` skill is a modified fork and is not touched.
for vendored in launching-evals accessing-mlflow; do
  echo ""
  echo "== $vendored =="

  # ${DEST_BASE:?} aborts rather than rm -rf'ing from an empty prefix.
  rm -rf "${DEST_BASE:?}/$vendored"
  fetch_skill_recursive "$vendored"

  if [[ "$vendored" == "accessing-mlflow" ]]; then
    # This skill carries an extra note about its MCP server dependency.
    inject_provenance "$vendored" \
      "Note: this skill depends on the mlflow-mcp MCP server (https://github.com/kkruglik/mlflow-mcp)|configured in the user's Claude Code setup."
  else
    inject_provenance "$vendored"
  fi
done

echo ""
echo "Done. Review with: git diff $DEST_BASE/launching-evals $DEST_BASE/accessing-mlflow"
echo "If the SHA changed, update DEFAULT_SHA at the top of this script before committing."
104 changes: 104 additions & 0 deletions .claude/skills/accessing-mlflow/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
---
name: accessing-mlflow
description: Query and browse evaluation results stored in MLflow. Use when the user wants to look up runs by invocation ID, compare metrics across models, fetch artifacts (configs, logs, results), or set up the MLflow MCP server. ALWAYS triggers on mentions of MLflow, experiment results, run comparison, invocation IDs in the context of results, or MLflow MCP setup.
license: Apache-2.0
# Vendored verbatim from NVIDIA NeMo Evaluator (commit 01899f8)
# https://github.com/NVIDIA-NeMo/Evaluator/tree/01899f89e8f31116efbca56e8f87fbd8513e24ac/packages/nemo-evaluator-launcher/.claude/skills/accessing-mlflow
# To re-sync: .claude/scripts/sync-upstream-skills.sh
# Note: this skill depends on the mlflow-mcp MCP server (https://github.com/kkruglik/mlflow-mcp)
# configured in the user's Claude Code setup.
---

# Accessing MLflow

## MCP Server

[mlflow-mcp](https://github.com/kkruglik/mlflow-mcp) gives agents direct access to MLflow — query runs, compare metrics, browse artifacts, all through natural language.

## ID Convention

When the user provides a hex ID (e.g. `71f3f3199ea5e1f0`) without specifying what it is, assume it is an **invocation_id** (not an MLflow run_id). An invocation_id identifies a launcher invocation and is stored as both a tag and a param on MLflow runs. One invocation can produce multiple MLflow runs (one per task). You may need to search across multiple experiments if you don't know which experiment the run belongs to.

## Querying Runs

```python
# Find runs by invocation_id
MLflow:search_runs_by_tags(experiment_id, {"invocation_id": "<invocation_id>"})

# Query for example model/task runs
MLflow:query_runs(experiment_id, "tags.model LIKE '%<model>%'")
MLflow:query_runs(experiment_id, "tags.task_name LIKE '%<task_name>%'")

# Get a config from run's artifacts
MLflow:get_artifact_content(run_id, "config.yml")

# Get nested stats from run's artifacts
MLflow:get_artifact_content(run_id, "artifacts/eval_factory_metrics.json")
```

NOTE: You WILL NOT find PENDING, RUNNING, KILLED, or FAILED runs in MLflow! Only SUCCESSFUL runs are exported to MLflow.

## Workflow Tips

When comparing metrics across runs, fetch the data via MCP, then run the computation in Python for exact results rather than doing math in-context:

```bash
uv run --with pandas python3 << 'EOF'
import pandas as pd
# ... compute deltas, averages, etc.
EOF
```

## Artifacts Structure

```
<harness>.<task>/
├── artifacts/
│ ├── config.yml # Fully resolved config used during the evaluation
│ ├── launcher_unresolved_config.yaml # Unresolved config passed to the launcher
│ ├── results.yml # All results in YAML format
│ ├── eval_factory_metrics.json # Runtime stats (latency, token count, memory)
│ ├── report.html # Request-Response Pairs samples in HTML format (if enabled)
│ └── report.json # Request-Response Pairs samples in JSON format (if enabled)
└── logs/
├── client-*.log # Evaluation client
├── server-*-N.log # Deployment per node
├── slurm-*.log # Slurm job
└── proxy-*.log # Request proxy
```

## Troubleshooting

If the MLflow MCP server fails to load or its tools are unavailable:

1. **`uvx` not found** — install [uv](https://docs.astral.sh/uv/getting-started/installation/):
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```
2. **MCP server not configured** — add the config and restart the agent:

**For Claude Code** — add to `.claude/settings.json` (project or user level), under `"mcpServers"`:
```json
"MLflow": {
"command": "uvx",
"args": ["mlflow-mcp"],
"env": {
"MLFLOW_TRACKING_URI": "https://<your-mlflow-server>/"
}
}
```

**For Cursor** — edit `~/.cursor/mcp.json` (Settings > Tools & MCP > New MCP Server):
```json
{
"mcpServers": {
"MLflow": {
"command": "uvx",
"args": ["mlflow-mcp"],
"env": {
"MLFLOW_TRACKING_URI": "https://<your-mlflow-server>/"
}
}
}
}
```
60 changes: 60 additions & 0 deletions .claude/skills/common/credentials.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Credentials Setup

Tokens and registry credentials that ModelOpt workflows need across local and cluster environments. Not SLURM-specific — referenced from PTQ, deployment, evaluation, and slurm-setup skills.

## HuggingFace token (`HF_TOKEN`)

Required for gated models (e.g., Llama, Mistral, some Nemotron variants) and gated datasets (e.g., GPQA, HLE).

Generate at <https://huggingface.co/settings/tokens>, then export:

```bash
export HF_TOKEN=hf_...
```

Persist in `~/.bashrc` or a project-local `.env` file. For remote clusters, check whether the cluster's shell config already sets it: `ssh <cluster-login> 'env | grep -c HF_TOKEN'`.

## NGC API key (for `nvcr.io`)

Required for pulling NGC images (`nvcr.io/nvidia/pytorch:...`, `nvcr.io/nvidia/vllm:...`) via Docker, `srun --container-image`, or enroot.

Generate at <https://ngc.nvidia.com/setup/api-key>.

### Docker

```bash
echo '<NGC_API_KEY>' | docker login nvcr.io -u '$oauthtoken' --password-stdin
```

### Enroot (SLURM / pyxis)

Add an entry to `~/.config/enroot/.credentials` on the cluster. The file may already hold credentials for other registries — **append rather than overwrite**:

```bash
mkdir -p ~/.config/enroot
CREDS=~/.config/enroot/.credentials
touch "$CREDS"
grep -q '^machine nvcr.io ' "$CREDS" || \
echo 'machine nvcr.io login $oauthtoken password <NGC_API_KEY>' >> "$CREDS"
chmod 600 "$CREDS"
```

> **Note**: `$oauthtoken` is a **literal string** required by NGC, not a shell variable. Do not replace it and do not let your shell expand it — the single quotes above keep it literal.

Without this, `srun --container-image=nvcr.io/...` fails with `401 Unauthorized` when the compute node tries to pull.

## Docker Hub login

Only needed if you hit rate limits pulling public images:

```bash
docker login
```

## Summary

| Credential | Used for | Set via |
|---|---|---|
| `HF_TOKEN` | Gated HF models / datasets | Env var (`export HF_TOKEN=...`) or `.env` |
| NGC API key | `nvcr.io` image pulls | `docker login` or `~/.config/enroot/.credentials` |
| Docker Hub | Rate-limited public image pulls | `docker login` |
10 changes: 10 additions & 0 deletions .claude/skills/common/remote-execution.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@ clusters:
default_cluster: my-cluster
```

### Staging checkpoints from your workstation

Workstation filesystems (`/home/scratch.*`, local NFS) are **not** mounted on the cluster. If a checkpoint was produced on your workstation, copy it to the cluster's own storage before submitting any job that references it — NEL and SLURM do NOT sync checkpoints automatically.

```bash
rsync -av /path/to/local/checkpoint <cluster-login>:<cluster-workspace>/checkpoints/
```

Use the `workspace` path from your cluster config as the destination. Compute nodes on a given cluster share the same storage as its login node, so once staged, the path works everywhere on that cluster.

See `.claude/clusters.yaml.example` for a fully annotated example with multiple cluster types.

---
Expand Down
4 changes: 4 additions & 0 deletions .claude/skills/common/slurm-setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ srun \
"
```

### Container registry credentials (pyxis)

If `srun --container-image` uses an image from a private registry (e.g., `nvcr.io/nvidia/...`), pyxis/enroot needs registry credentials on the cluster in `~/.config/enroot/.credentials`. See `skills/common/credentials.md` for the NGC / Docker / HF token setup. Without this, `srun` fails with `401 Unauthorized` when the compute node pulls.

Submit and capture the job ID:

```bash
Expand Down
17 changes: 17 additions & 0 deletions .claude/skills/common/workspace-management.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,19 @@ rsync -a --quiet \
"$MODELOPT_REPO_DIR/" "$MODELOPT_WORKSPACE_ROOT/<name>/"
```

## Cross-Skill Workspace Flow

Workspaces carry over across the PTQ → Deploy → Eval pipeline. Each stage adds to the same directory:

```text
workspaces/model-name-format/
output/ ← PTQ: quantized checkpoint
eval_results/ ← Evaluation: NEL artifacts (results.yml per task)
eval_config.yaml ← Evaluation: NEL config
scripts/ ← Deployment/PTQ: custom run scripts
logs/ ← All: SLURM job logs
```

## Example Flow

```text
Expand All @@ -104,6 +117,10 @@ User: "deploy the model I just quantized"
Agent: ls workspaces/ → sees "qwen3-0.6b-nvfp4"
→ reuse, find checkpoint at workspaces/qwen3-0.6b-nvfp4/output/

User: "evaluate the quantized model on MMLU and GSM8K"
Agent: ls workspaces/ → sees "qwen3-0.6b-nvfp4"
→ reuse, write eval_config.yaml, results to workspaces/qwen3-0.6b-nvfp4/eval_results/

User: "now quantize Llama-3.1-8B with fp8"
Agent: ls workspaces/ → no llama
→ mkdir workspaces/llama-3.1-8b-fp8
Expand Down
4 changes: 3 additions & 1 deletion .claude/skills/evaluation/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ license: Apache-2.0

You're an expert in NeMo Evaluator Launcher! Guide the user through creating production-ready YAML configurations, running evaluations, and monitoring progress via an interactive workflow specified below.

### Workspace (multi-user / Slack bot)
### Workspace and Pipeline Integration

If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Check for existing workspaces — especially if evaluating a model from a prior PTQ or deployment step. Reuse the existing workspace so you have access to the quantized checkpoint and any code modifications.

This skill is often the final stage of the PTQ → Deploy → Eval pipeline. If the model required runtime patches during deployment (transformers upgrade, framework source fixes), carry those patches into the NEL config via `deployment.command`.

### Workflow

```text
Expand Down
Loading
Loading