diff --git a/.github/workflows/testrail-ff-tests-deduplication.yml b/.github/workflows/testrail-ff-tests-deduplication.yml new file mode 100644 index 0000000..3d80099 --- /dev/null +++ b/.github/workflows/testrail-ff-tests-deduplication.yml @@ -0,0 +1,205 @@ +name: TestRail Test Case Deduplication + +on: + workflow_dispatch: + schedule: + - cron: "0 9 * * 1" # Every Monday at 9am UTC + +env: + BUCKET: mobile-reports + BUCKET_PREFIX: public/testrail-ff-test-deduplication + DEFAULT_DIR: ./testrail/testcases-deduplication + STORAGE_URL_PREFIX: https://console.cloud.google.com/storage/browser + BQ_DATASET: testops_stats + BQ_TABLE: testrail_deduplication_runs + +jobs: + deduplication: + name: Deduplication — ${{ matrix.project_name }} + runs-on: ubuntu-24.04 + defaults: + run: + working-directory: ${{ env.DEFAULT_DIR }} + + strategy: + fail-fast: false + matrix: + include: + - project_id: '14' + project_name: firefox-ios + suite_id: '45443' + - project_id: '59' + project_name: fenix + suite_id: '3192' + - project_id: '27' + project_name: focus-ios + suite_id: '5291' + - project_id: '48' + project_name: focus-android + suite_id: '1028' + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.11' + + - name: Get sentence-transformers version + id: st-version + run: | + version=$(grep '^sentence-transformers' requirements.txt | sed 's/[^0-9.]//g') + echo "version=$version" >> $GITHUB_OUTPUT + + - name: Cache sentence-transformers model + uses: actions/cache@v5 + with: + path: ~/.cache/huggingface + key: sentence-transformers-all-MiniLM-L6-v2-${{ steps.st-version.outputs.version }}-${{ runner.os }} + + - name: Install dependencies + run: pip install -r requirements.txt + + - name: Fetch test cases from TestRail + env: + TESTRAIL_HOST: ${{ secrets.TESTRAIL_HOST }} + TESTRAIL_USERNAME: ${{ secrets.TESTRAIL_USERNAME }} + TESTRAIL_PASSWORD: ${{ secrets.TESTRAIL_PASSWORD }} + run: | + 
python3 fetch_testrail_export.py \ + --project-id "${{ matrix.project_id }}" \ + --suite-id "${{ matrix.suite_id }}" \ + --output testrail_export.xlsx + + - name: Run deduplication pipeline + run: | + python3 run_all.py testrail_export.xlsx --output-dir ./output + + - name: Set run metadata + run: | + echo "today=$(date '+%Y-%m-%d')" >> $GITHUB_ENV + echo "project_id=${{ matrix.project_id }}" >> $GITHUB_ENV + echo "project_name=${{ matrix.project_name }}" >> $GITHUB_ENV + + - name: Establish Google Cloud connection + uses: google-github-actions/auth@v3 + with: + credentials_json: ${{ secrets.GCLOUD_AUTH }} + + - name: Upload results to GCS + id: upload-results + uses: google-github-actions/upload-cloud-storage@v3 + with: + path: ${{ env.DEFAULT_DIR }}/output + destination: ${{ env.BUCKET }}/${{ env.BUCKET_PREFIX }}/${{ matrix.project_name }}/${{ env.today }} + glob: '*.csv' + parent: false + + - name: Query previous stats from BigQuery + run: | + bq_result=$(bq query \ + --project_id=moz-mobile-tools \ + --use_legacy_sql=false \ + --format=json \ + "SELECT exact_duplicate_cases, similar_pairs, high_priority_similar_pairs, total_cases + FROM \`moz-mobile-tools.${{ env.BQ_DATASET }}.${{ env.BQ_TABLE }}\` + WHERE project_id = '${{ matrix.project_id }}' + ORDER BY run_date DESC + LIMIT 1" 2>/dev/null || echo "[]") + + python3 - << PYEOF + import json, os, sys + raw = """${bq_result}""" + rows = [] + try: + rows = json.loads(raw.strip()) + except (json.JSONDecodeError, ValueError): + pass + with open(os.environ["GITHUB_ENV"], "a") as f: + if rows: + row = rows[0] + f.write(f"prev_exact={row.get('exact_duplicate_cases', 0)}\n") + f.write(f"prev_similar={row.get('similar_pairs', 0)}\n") + f.write(f"prev_high_priority_similar={row.get('high_priority_similar_pairs', 0)}\n") + f.write(f"prev_total={row.get('total_cases', 0)}\n") + f.write("has_prev_data=true\n") + else: + 
f.write("prev_exact=0\nprev_similar=0\nprev_high_priority_similar=0\nprev_total=0\nhas_prev_data=false\n") + PYEOF + + - name: Insert current stats into BigQuery + run: | + python3 insert_bq_stats.py \ + --output-dir ./output \ + --project-id "${{ matrix.project_id }}" \ + --project-name "${{ matrix.project_name }}" \ + --run-date "${{ env.today }}" \ + --github-run-id "${{ github.run_id }}" \ + --bq-project moz-mobile-tools \ + --bq-dataset ${{ env.BQ_DATASET }} \ + --bq-table ${{ env.BQ_TABLE }} + + - name: Build Slack payloads + run: | + python3 build_slack_payloads.py \ + --output-dir . \ + --today "${{ env.today }}" \ + --project-id "${{ matrix.project_id }}" \ + --project-name "${{ matrix.project_name }}" \ + --gcs-url "${{ env.GCS_URL }}" \ + --run-url "${{ env.RUN_URL }}" + env: + GCS_URL: ${{ env.STORAGE_URL_PREFIX }}/${{ env.BUCKET }}/${{ env.BUCKET_PREFIX }}/${{ matrix.project_name }}/${{ env.today }}/ + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + + - name: Write job summary + run: | + echo "## TestRail Deduplication Report" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Field | Value |" >> $GITHUB_STEP_SUMMARY + echo "|-------|-------|" >> $GITHUB_STEP_SUMMARY + echo "| Project | ${{ matrix.project_name }} (ID: ${{ matrix.project_id }}) |" >> $GITHUB_STEP_SUMMARY + echo "| Date | ${{ env.today }} |" >> $GITHUB_STEP_SUMMARY + echo "| Total cases | ${{ env.current_total }} |" >> $GITHUB_STEP_SUMMARY + echo "| Exact duplicates | ${{ env.current_exact }} |" >> $GITHUB_STEP_SUMMARY + echo "| Similar pairs | ${{ env.current_similar }} |" >> $GITHUB_STEP_SUMMARY + echo "| Duplicate rate | ${{ env.current_rate }} |" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "[Download results from GCS](${{ env.STORAGE_URL_PREFIX }}/${{ env.BUCKET }}/${{ env.BUCKET_PREFIX }}/${{ matrix.project_name }}/${{ env.today }}/)" >> $GITHUB_STEP_SUMMARY + + - name: Notify Slack — weekly digest 
+ if: success() + uses: slackapi/slack-github-action@v3.0.1 + with: + webhook: ${{ secrets.SLACK_WEBHOOK_URL_TEST_ALERTS_SANDBOX }} + webhook-type: incoming-webhook + payload-file-path: ${{ env.DEFAULT_DIR }}/slack-digest.json + + - name: Notify Slack — spike alert + if: success() && env.send_spike == 'true' + uses: slackapi/slack-github-action@v3.0.1 + with: + webhook: ${{ secrets.SLACK_WEBHOOK_URL_TEST_ALERTS_SANDBOX }} + webhook-type: incoming-webhook + payload-file-path: ${{ env.DEFAULT_DIR }}/slack-spike.json + + - name: Notify Slack (failure) + if: failure() + uses: slackapi/slack-github-action@v3.0.1 + with: + webhook: ${{ secrets.SLACK_WEBHOOK_URL_TEST_ALERTS_SANDBOX }} + webhook-type: incoming-webhook + payload: | + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":x: *TestRail Deduplication failed* (${{ matrix.project_name }})\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>" + } + } + ] + } \ No newline at end of file diff --git a/testrail/testcases-deduplication/.gitignore b/testrail/testcases-deduplication/.gitignore new file mode 100644 index 0000000..1f60ff8 --- /dev/null +++ b/testrail/testcases-deduplication/.gitignore @@ -0,0 +1,34 @@ +# Virtual environment +venv/ +.venv/ + +# Generated output files (contain sensitive TestRail data — do not commit) +output/ +duplicates_exact.csv +similar_pairs.csv +WORK_LIST_EXACT.csv +WORK_LIST_PERFECT_MATCHES.csv +WORK_LIST_SIMILAR_HIGH_PRIORITY.csv +PRIORITIZED_DUPLICATES.csv +ACTION_ITEMS.csv +analysis_stats.json + +# Input files (TestRail exports contain product test coverage data) +*.xlsx +*.xls + +# Reports generated locally +DEDUPLICATION_REPORT.md + +# Python +__pycache__/ +*.pyc +*.pyo +.pytest_cache/ + +# sentence-transformers model cache (if stored locally) +.cache/ +models/ + +# OS +.DS_Store diff --git a/testrail/testcases-deduplication/HOW_TO_USE.md b/testrail/testcases-deduplication/HOW_TO_USE.md new file mode 100644 
index 0000000..dc4c049 --- /dev/null +++ b/testrail/testcases-deduplication/HOW_TO_USE.md @@ -0,0 +1,348 @@ +# How to Use the Duplicate Detection Results + +**Updated:** 2026-04-21 + +--- + +## Running the Pipeline + +### Full pipeline (recommended) + +```bash +cd testrail/testcases-deduplication +pip install -r requirements.txt + +python run_all.py /path/to/testrail_export.xlsx +``` + +This runs all three steps and writes output files to the script directory by default. + +### Custom output directory + +Use `--output-dir` to control where output CSVs are written — useful for CI or when you want to keep results from different runs separate: + +```bash +python run_all.py /path/to/export.xlsx --output-dir /tmp/dedup-2024-01-15 +``` + +### Adjust detection thresholds + +```bash +python run_all.py export.xlsx \ + --dup-threshold 0.92 \ + --sim-threshold 0.85 \ + --overlap-threshold 0.80 +``` + +| Flag | Default | Description | +|------|---------|-------------| +| `--dup-threshold` | 0.90 | Minimum similarity to label a pair as `semantic_duplicate` | +| `--sim-threshold` | 0.80 | Minimum similarity to include a pair in the report at all | +| `--overlap-threshold` | 0.80 | Minimum step overlap to set `shares_most_steps = True` | + +### Run individual steps + +Each step can also be run standalone if you already have intermediate CSVs: + +```bash +python generate-work-list.py --output-dir /path/to/results +python export-priority-list.py --output-dir /path/to/results +``` + +> **Note:** Output CSVs contain TestRail test case data — do not commit them to the repository. They are covered by `.gitignore`. + +--- + +## Automated Monitoring (GitHub Action) + +The workflow `testrail-ff-tests-deduplication.yml` runs automatically every **Monday at 9am UTC** and can also be triggered manually from GitHub Actions. + +### What it does + +1. Fetches test cases from TestRail via API +2. Runs the full deduplication pipeline +3. 
Uploads result CSVs to GCS (`mobile-reports/public/testrail-ff-test-deduplication/{project}/{date}/`) +4. Stores run stats in BigQuery (`moz-mobile-tools.testops_stats.testrail_deduplication_runs`) +5. Sends Slack notifications to `#mobile-alerts-sandbox` + +### Slack notifications + +There are three types of notifications: + +#### 1. Weekly digest (every Monday, on success) + +Sent after every successful run. Shows the current stats with week-over-week deltas so you can track trends at a glance. + +``` +🔍 TestRail Deduplication — 2026-04-28 + +Project: firefox-ios (ID: 14) +Total cases: 1,620 (+5 vs last week) +Exact duplicates: 42 (+3 vs last week) +Similar pairs: 310 (no change) +Duplicate rate: 2.6% +Download results from GCS +``` + +#### 2. Spike alert (only when exact duplicates jump by more than 10) + +Sent in addition to the digest when there's a significant increase in exact duplicates — this usually means a bulk import or copy-paste of test cases happened. + +``` +⚠️ Duplicate spike detected — firefox-ios + +Exact duplicates jumped by 23 this week (42 → 65) +Project: firefox-ios (ID: 14) | Date: 2026-04-28 +Download results · View run +``` + +> This alert is skipped on the very first run (no previous data to compare against). + +#### 3. 
Failure notification (any step fails) + +``` +❌ TestRail Deduplication failed (firefox-ios) +View run +``` + +### Triggering manually + +Go to **Actions → TestRail Test Case Deduplication → Run workflow** and click **Run workflow**. The workflow takes no inputs — it runs the full matrix of projects: +- **Projects covered**: 14 (Firefox iOS), 59 (Fenix), 27 (Focus iOS), 48 (Focus Android) +- **Suites**: each project uses the suite ID configured in the workflow matrix + +--- + +## Quick Start Guide (reviewing results) + +### Step 1: Open the Data Files + +Two CSV files contain the raw data: + +#### duplicates_exact.csv +- Each row is a test case that belongs to a duplicate group +- All cases with the same `duplicate_group_id` are exact duplicates +- **Action:** Keep one test per group, archive the rest + +#### similar_pairs.csv +- Each row is a pair of similar tests with their similarity scores +- **Action:** Review high-similarity pairs (≥95%) for potential consolidation + +### Step 2: Understand the Columns + +#### duplicates_exact.csv + +| Column | What it means | +|--------|---------------| +| **_case_id** | TestRail case ID (e.g., C2575167) | +| **_title** | Test case title | +| **duplicate_group_id** | Group identifier - same ID = exact duplicates | + +**Usage:** +1. Sort by `duplicate_group_id` +2. For each group, choose one test to keep +3. 
Archive all others in that group + +#### similar_pairs.csv + +| Column | What it means | +|--------|---------------| +| **case_id_1**, **case_id_2** | The two test case IDs being compared | +| **title_1**, **title_2** | Their titles | +| **similarity** | Semantic similarity score (0.0-1.0) | +| **step_overlap** | Percentage of shared steps (0.0-1.0) | +| **relation** | "semantic_duplicate" (≥90%) or "similar" (80-90%) | +| **shares_most_steps** | True if ≥80% of steps are identical | + +**Usage:** +```python +# In Excel/Sheets, filter by: +similarity >= 0.95 # High priority duplicates +relation == "semantic_duplicate" # Strong duplicate candidates +shares_most_steps == TRUE # Tests with identical execution +``` + +--- + +## Action Plan + +### Phase 1: Address Exact Duplicates + +**Goal:** Review and archive all exact duplicate groups + +**Priority Order** (use `WORK_LIST_EXACT.csv`): + +1. Start with the largest groups (4+ duplicates) — biggest savings per group +2. Then medium groups (3 duplicates) +3. Finally the 2-duplicate groups + +For each group: keep the case with the **lowest ID** (usually the original), archive the rest. + +### Phase 2: Review High Similarity Cases + +**Goal:** Identify consolidation opportunities in near-duplicates + +**Priority** (use `WORK_LIST_PERFECT_MATCHES.csv` and `WORK_LIST_SIMILAR_HIGH_PRIORITY.csv`): + +1. **Perfect semantic matches (100% similarity)** — treat as exact duplicates; likely differ only in formatting +2. **Near-perfect matches (95-99%)** — review manually; small differences may be intentional +3. **High step overlap (≥80%)** — consider parameterization if tests differ only by a variable + +### Phase 3: Pattern Analysis + +**Goal:** Understand root causes to prevent future duplication + +**Tasks:** +1. Identify which sections/suites have the most duplicates +2. Map duplicates to test creation periods (bulk imports, copy-paste) +3. 
Create process guidelines to prevent recurrence + +--- + +## How to Archive Tests in TestRail + +### Option 1: Individual Archive +1. Open the test case in TestRail +2. Click "Edit" +3. Check the "Archived" checkbox +4. Add a comment: "Archived - exact duplicate of [CASE_ID]" +5. Save + +### Option 2: Bulk Archive +1. Go to the test suite in TestRail +2. Select multiple test cases (checkbox selection) +3. Click "Bulk Update" +4. Set "Archived" = Yes +5. Add comment: "Archived - duplicate cleanup [DATE]" +6. Apply + +### Best Practices +- ✅ Always add a comment explaining why you archived +- ✅ Reference the test you're keeping +- ✅ Archive rather than delete (can be restored) +- ✅ Verify automation coverage before archiving +- ✅ Update any test runs or plans that reference archived cases + +--- + +## Example Walkthrough + +### Example 1: Exact Duplicate Group + +**From duplicates_exact.csv:** +``` +_case_id, _title, duplicate_group_id +C1000001, "Select and save System auto theme", 5 +C1000045, "Select and save System auto theme", 5 +C1000089, "Select and save System auto theme", 5 +``` + +**Steps:** +1. Open all 3 cases in TestRail +2. Verify they are truly identical (check steps, expected results) +3. Choose to keep: **C1000001** (lowest ID = original) +4. Archive C1000045 with comment: "Archived - exact duplicate of C1000001" +5. Archive C1000089 with comment: "Archived - exact duplicate of C1000001" +6. Result: 2 fewer test cases ✅ + +### Example 2: High Similarity Pair + +**From similar_pairs.csv:** +``` +case_id_1, title_1, case_id_2, title_2, similarity, step_overlap +C1000010, "Verify CFR displayed - bottom toolbar", C1000011, "Verify CFR displayed - top toolbar", 1.000, 1.000 +``` + +**Steps:** +1. Open both cases +2. Review: They test the same thing with different toolbar positions +3. **Decision:** These are intentional variants - KEEP BOTH +4. 
No action needed + +**Alternative scenario:** +If the toolbar position isn't critical, consider: +- Create one parameterized test: "Verify CFR displayed [toolbar_position]" +- Archive both originals +- Result: 1 test instead of 2 ✅ + +--- + +## Using the Data Files + +### In Excel/Google Sheets + +**duplicates_exact.csv:** +1. Open in Excel +2. Sort by `duplicate_group_id` +3. For each group: + - Highlight the row you'll KEEP (lowest case_id) + - Mark others for archive +4. Track progress with a "Status" column + +**similar_pairs.csv:** +1. Open in Excel +2. Add filters to all columns +3. Filter: `similarity >= 0.95` +4. Sort by `similarity` descending +5. Review top matches first + +### In Python + +```python +import pandas as pd + +# Load exact duplicates +exact = pd.read_csv('duplicates_exact.csv') + +# Find largest groups +group_sizes = exact.groupby('duplicate_group_id').size() +largest_groups = group_sizes[group_sizes >= 3].sort_values(ascending=False) + +# Load similar pairs +similar = pd.read_csv('similar_pairs.csv') + +# High priority: near-perfect matches +high_priority = similar[similar['similarity'] >= 0.95].sort_values('similarity', ascending=False) + +# Cases that share most steps +high_overlap = similar[similar['shares_most_steps'] == True] + +# Semantic duplicates only +duplicates = similar[similar['relation'] == 'semantic_duplicate'] +``` + +--- + +## FAQs + +### Q: Which test should I keep if they're all identical? +**A:** Keep the test that is linked in GitHub (e.g., referenced by automation) to avoid creating a PR just to modify the link. + +### Q: What if one duplicate has automation coverage and the other doesn't? +**A:** Keep the one with automation coverage, or migrate the automation to the version you want to keep before archiving. + +### Q: What if the tests are 100% similar but have different titles? +**A:** Review carefully - they might be testing subtly different scenarios. If truly identical, consolidate to one and update the title to be more descriptive. 
+ +### Q: Should I delete or archive? +**A:** **Always archive** (never delete). Archiving allows you to restore if you make a mistake or discover the "duplicate" was actually testing something different. + +### Q: What if I disagree with a suggested duplicate? +**A:** Trust your judgment! The script uses semantic analysis, but you have domain knowledge. If two tests seem similar but test different things, keep both and document why. + +### Q: How do I prevent future duplicates? +**A:** See "Priority 4" in DEDUPLICATION_REPORT.md for prevention strategies: +- Search before creating new tests +- Use consistent naming conventions +- Run quarterly deduplication audits +- Consider automated duplicate detection in CI/CD + +--- + +## Need Help? + +If you encounter issues or have questions: + +1. Check the script source: `find-duplicates.py` +2. Re-run the pipeline with updated data: `python run_all.py export.xlsx` +3. Adjust thresholds if results seem off (see the table above) diff --git a/testrail/testcases-deduplication/build_slack_payloads.py b/testrail/testcases-deduplication/build_slack_payloads.py new file mode 100644 index 0000000..bcb07db --- /dev/null +++ b/testrail/testcases-deduplication/build_slack_payloads.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +""" +Build Slack JSON payload files for the deduplication weekly digest and spike alert. + +Reads stats from environment variables (set by insert_bq_stats.py / GITHUB_ENV) +and writes slack-digest.json and optionally slack-spike.json to the output directory. + +Usage: + python build_slack_payloads.py \ + --output-dir . \ + --today 2026-04-24 \ + --project-id 14 \ + --project-name firefox-ios \ + --gcs-url https://console.cloud.google.com/storage/browser/... \ + --run-url https://github.com/... 
+""" +import argparse +import json +import os +import sys +from pathlib import Path + + +def delta_str(current: int, previous: int, has_prev_data: bool) -> str: + if not has_prev_data: + return "" + diff = current - previous + if diff > 0: + return f" _(+{diff} vs last week)_" + if diff < 0: + return f" _({diff} vs last week)_" + return " _(no change)_" + + +def main(): + parser = argparse.ArgumentParser(description="Build Slack payload JSON files.") + parser.add_argument("--output-dir", default=".", help="Directory to write slack-*.json files") + parser.add_argument("--today", required=True) + parser.add_argument("--project-id", required=True) + parser.add_argument("--project-name", required=True) + parser.add_argument("--gcs-url", required=True) + parser.add_argument("--run-url", required=True) + args = parser.parse_args() + + output_dir = Path(args.output_dir) + + # Current stats (written to GITHUB_ENV by insert_bq_stats.py) + current_total = int(os.environ.get("current_total", 0)) + current_exact = int(os.environ.get("current_exact", 0)) + current_high_priority_similar = int(os.environ.get("current_high_priority_similar", 0)) + current_rate = float(os.environ.get("current_rate", 0)) + + # Previous stats (written to GITHUB_ENV by the BQ query step) + prev_exact = int(os.environ.get("prev_exact", 0)) + prev_high_priority_similar = int(os.environ.get("prev_high_priority_similar", 0)) + prev_total = int(os.environ.get("prev_total", 0)) + has_prev_data = os.environ.get("has_prev_data", "false") == "true" + + digest_text = ( + f"*Project:* {args.project_name} (ID: {args.project_id})\n" + f"*Total cases:* {current_total}{delta_str(current_total, prev_total, has_prev_data)}\n" + f"*Exact duplicates:* {current_exact}{delta_str(current_exact, prev_exact, has_prev_data)}\n" + f"*High-priority similar pairs:* {current_high_priority_similar}" + f"{delta_str(current_high_priority_similar, prev_high_priority_similar, has_prev_data)}\n" + f"*Duplicate rate:* 
{current_rate:.1%}\n" + f"<{args.gcs_url}|Download results from GCS>" + ) + + digest_payload = { + "blocks": [ + {"type": "header", "text": {"type": "plain_text", "text": f":mag: TestRail Deduplication — {args.today}"}}, + {"type": "section", "text": {"type": "mrkdwn", "text": digest_text}}, + ] + } + (output_dir / "slack-digest.json").write_text(json.dumps(digest_payload)) + print(f"Written slack-digest.json") + + delta_exact = current_exact - prev_exact + send_spike = has_prev_data and delta_exact > 10 + + github_env = os.environ.get("GITHUB_ENV") + if github_env: + with open(github_env, "a") as f: + f.write(f"send_spike={'true' if send_spike else 'false'}\n") + + if send_spike: + spike_payload = { + "blocks": [ + {"type": "header", "text": {"type": "plain_text", "text": f":warning: Duplicate spike detected — {args.project_name}"}}, + {"type": "section", "text": {"type": "mrkdwn", "text": ( + f"*Exact duplicates jumped by {delta_exact}* this week " + f"({prev_exact} \u2192 {current_exact})\n" + f"*Project:* {args.project_name} (ID: {args.project_id}) | *Date:* {args.today}\n" + f"<{args.gcs_url}|Download results> \u00b7 <{args.run_url}|View run>" + )}}, + ] + } + (output_dir / "slack-spike.json").write_text(json.dumps(spike_payload)) + print(f"Written slack-spike.json (spike detected: +{delta_exact})") + else: + print("No spike detected — slack-spike.json not written") + + +if __name__ == "__main__": + main() diff --git a/testrail/testcases-deduplication/export-priority-list.py b/testrail/testcases-deduplication/export-priority-list.py new file mode 100644 index 0000000..111a496 --- /dev/null +++ b/testrail/testcases-deduplication/export-priority-list.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +""" +Export filtered priority lists to CSV for easy review in Excel +""" +import argparse +import os +import re +import sys +import pandas as pd + +def load_csv(path: str) -> pd.DataFrame: + """Load a CSV file, returning an empty DataFrame if the file doesn't exist.""" + 
if not os.path.exists(path): + print(f"Warning: {path} not found — skipping (no results for this category).") + return pd.DataFrame() + return pd.read_csv(path) + + +def case_id_sort_key(case_id: str) -> int: + """Extract numeric part of a case ID like 'C12345' for correct numeric sorting.""" + match = re.search(r'\d+', str(case_id)) + return int(match.group()) if match else 0 + + +def export_priority_lists(): + # Load data + exact = load_csv('duplicates_exact.csv') + similar = load_csv('similar_pairs.csv') + + if exact.empty and similar.empty: + raise RuntimeError("No duplicate data found. Run find-duplicates.py first.") + + # 1. Exact duplicates - sorted by group size + if exact.empty: + print("⚠️ No exact duplicates — skipping WORK_LIST_EXACT.csv") + else: + group_sizes = exact.groupby('duplicate_group_id').size() + exact['group_size'] = exact['duplicate_group_id'].map(group_sizes) + exact = exact.sort_values(['group_size', 'duplicate_group_id', '_case_id'], ascending=[False, True, True]) + + # Add decision columns (sort numerically by case ID to pick the lowest-numbered case to keep) + # Use Series.groupby to avoid pandas 2.2+ include_groups deprecation + group_keep = ( + exact.groupby('duplicate_group_id')['_case_id'] + .apply(lambda ids: sorted(ids.tolist(), key=case_id_sort_key)[0]) + ) + group_archive = ( + exact.groupby('duplicate_group_id')['_case_id'] + .apply(lambda ids: ', '.join(sorted(ids.tolist(), key=case_id_sort_key)[1:])) + ) + exact['KEEP'] = exact['duplicate_group_id'].map(group_keep) + exact['ARCHIVE'] = exact['duplicate_group_id'].map(group_archive) + + # Reorder columns + section_col = '_section' if '_section' in exact.columns else None + base_cols = ['duplicate_group_id', 'group_size', '_case_id', '_title'] + if section_col: + base_cols.append(section_col) + base_cols += ['KEEP', 'ARCHIVE'] + exact = exact[base_cols] + new_names = ['Group_ID', 'Group_Size', 'Case_ID', 'Title'] + if section_col: + new_names.append('Section') + new_names += 
['Suggested_KEEP', 'Suggested_ARCHIVE'] + exact.columns = new_names + + # Add empty tracking columns + exact['Decision'] = '' + exact['Status'] = 'TODO' + exact['Notes'] = '' + + # Save + exact.to_csv('WORK_LIST_EXACT.csv', index=False) + print(f"✅ Created WORK_LIST_EXACT.csv ({len(exact)} cases in {exact['Group_ID'].nunique()} groups)") + + # 2. High priority similar pairs (>= 95% similarity) + if similar.empty: + print("⚠️ No similar pairs — skipping WORK_LIST_SIMILAR_HIGH_PRIORITY.csv and WORK_LIST_PERFECT_MATCHES.csv") + else: + high_sim = similar[similar['similarity'] >= 0.95].copy() + high_sim = high_sim.sort_values('similarity', ascending=False) + + # Add decision columns + high_sim['Suggested_Action'] = '' + high_sim['Decision'] = '' + high_sim['Status'] = 'TODO' + high_sim['Notes'] = '' + + # Reorder columns (include section if present) + base_cols = ['case_id_1', 'title_1'] + if 'section_1' in high_sim.columns: + base_cols.append('section_1') + base_cols += ['case_id_2', 'title_2'] + if 'section_2' in high_sim.columns: + base_cols.append('section_2') + base_cols += ['similarity', 'step_overlap', 'relation', 'shares_most_steps', + 'Suggested_Action', 'Decision', 'Status', 'Notes'] + high_sim = high_sim[base_cols] + + high_sim.to_csv('WORK_LIST_SIMILAR_HIGH_PRIORITY.csv', index=False) + print(f"✅ Created WORK_LIST_SIMILAR_HIGH_PRIORITY.csv ({len(high_sim)} pairs)") + + # 3. 
Perfect matches (100% similarity) - these are basically exact duplicates + perfect = similar[similar['similarity'] == 1.0].copy() + perfect = perfect.sort_values('step_overlap', ascending=False) + + # Keep the case with the lower numeric ID + perfect['Suggested_KEEP'] = perfect.apply( + lambda r: r['case_id_1'] if case_id_sort_key(r['case_id_1']) <= case_id_sort_key(r['case_id_2']) else r['case_id_2'], + axis=1, + ) + perfect['Suggested_ARCHIVE'] = perfect.apply( + lambda r: r['case_id_2'] if case_id_sort_key(r['case_id_1']) <= case_id_sort_key(r['case_id_2']) else r['case_id_1'], + axis=1, + ) + perfect['Decision'] = '' + perfect['Status'] = 'TODO' + perfect['Notes'] = '' + + base_cols = ['case_id_1', 'title_1'] + if 'section_1' in perfect.columns: + base_cols.append('section_1') + base_cols += ['case_id_2', 'title_2'] + if 'section_2' in perfect.columns: + base_cols.append('section_2') + base_cols += ['similarity', 'step_overlap', 'Suggested_KEEP', 'Suggested_ARCHIVE', + 'Decision', 'Status', 'Notes'] + perfect = perfect[base_cols] + + perfect.to_csv('WORK_LIST_PERFECT_MATCHES.csv', index=False) + print(f"✅ Created WORK_LIST_PERFECT_MATCHES.csv ({len(perfect)} pairs)") + + # Summary + print("\n" + "="*60) + print("WORK LISTS CREATED") + print("="*60) + if not exact.empty and 'Group_ID' in exact.columns: + print("\n1. WORK_LIST_EXACT.csv") + print(f" - {len(exact)} cases in {exact['Group_ID'].nunique()} groups") + print(f" - Suggested savings: ~{len(exact) - exact['Group_ID'].nunique()} cases") + if not similar.empty: + print("\n2. WORK_LIST_PERFECT_MATCHES.csv") + print(f" - {len(perfect)} pairs with 100% similarity") + print(f" - Suggested savings: ~{len(perfect)} cases") + print("\n3. 
WORK_LIST_SIMILAR_HIGH_PRIORITY.csv") + print(f" - {len(high_sim)} pairs with ≥95% similarity") + print(f" - Review and decide case by case") + print("\n💡 TIP: Open these in Excel and use filters/sorting to prioritize") + print("="*60) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Export priority work lists from deduplication results.") + parser.add_argument( + "--output-dir", default=".", + help="Directory containing input CSVs and where work lists will be written (default: current directory)" + ) + cli_args = parser.parse_args() + os.chdir(cli_args.output_dir) + try: + export_priority_lists() + except RuntimeError as e: + print(f"Error: {e}") + sys.exit(1) diff --git a/testrail/testcases-deduplication/fetch_testrail_export.py b/testrail/testcases-deduplication/fetch_testrail_export.py new file mode 100644 index 0000000..95a0be7 --- /dev/null +++ b/testrail/testcases-deduplication/fetch_testrail_export.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +""" +Fetch test cases from the TestRail API and export as xlsx for deduplication analysis. + +Reads credentials from environment variables: + TESTRAIL_HOST — e.g. 
"yourcompany.testrail.io" + TESTRAIL_USERNAME — API user email + TESTRAIL_PASSWORD — API key or password + +Usage: + python fetch_testrail_export.py --project-id 14 --output export.xlsx + python fetch_testrail_export.py --project-id 14 --suite-id 123 --output export.xlsx +""" +import argparse +import os +import sys + +import pandas as pd +import requests +from requests.auth import HTTPBasicAuth + + +def testrail_client() -> tuple[str, HTTPBasicAuth]: + host = os.environ.get("TESTRAIL_HOST", "").strip().rstrip("/") + username = os.environ.get("TESTRAIL_USERNAME", "") + password = os.environ.get("TESTRAIL_PASSWORD", "") + + if not all([host, username, password]): + print("Error: TESTRAIL_HOST, TESTRAIL_USERNAME and TESTRAIL_PASSWORD must be set.") + sys.exit(1) + + base_url = f"{host}/index.php?/api/v2" + return base_url, HTTPBasicAuth(username, password) + + +REQUEST_TIMEOUT = 30 # seconds + + +def api_get(base_url: str, auth: HTTPBasicAuth, endpoint: str, params: dict = None) -> dict: + url = f"{base_url}/{endpoint}" + resp = requests.get(url, auth=auth, params=params or {}, timeout=REQUEST_TIMEOUT) + resp.raise_for_status() + return resp.json() + + +def fetch_cases(base_url: str, auth: HTTPBasicAuth, project_id: str, suite_id: str = None) -> list[dict]: + """Fetch all test cases for a project (paginated, 250 per page).""" + cases = [] + offset = 0 + limit = 250 + + while True: + params = {"limit": limit, "offset": offset} + if suite_id: + params["suite_id"] = suite_id + + data = api_get(base_url, auth, f"get_cases/{project_id}", params) + batch = data.get("cases", []) + cases.extend(batch) + + if len(batch) < limit: + break + offset += len(batch) # advance by page size, not cumulative total + + return cases + + +def fetch_sections(base_url: str, auth: HTTPBasicAuth, project_id: str, suite_id: str = None) -> dict[int, str]: + """Return a mapping of section_id → section name (paginated, 250 per page).""" + sections = [] + offset = 0 + limit = 250 + + while True: + 
params = {"limit": limit, "offset": offset} + if suite_id: + params["suite_id"] = suite_id + + data = api_get(base_url, auth, f"get_sections/{project_id}", params) + batch = data.get("sections", []) + sections.extend(batch) + + if len(batch) < limit: + break + offset += len(batch) + + return {s["id"]: s["name"] for s in sections} + + +def format_steps(steps_list: list[dict], field: str) -> str: + """Convert [{content, expected}, ...] to a numbered string (TestRail xlsx export format).""" + if not steps_list: + return "" + return "\n".join(f"{i}. {step.get(field, '')}" for i, step in enumerate(steps_list, 1)) + + +def build_xlsx(cases: list[dict], sections: dict[int, str], output_path: str) -> None: + rows = [] + for case in cases: + steps_raw = case.get("custom_steps_separated") or [] + rows.append({ + "ID": f"C{case['id']}", + "Title": case.get("title", ""), + "Section": sections.get(case.get("section_id"), ""), + "Steps (Step)": format_steps(steps_raw, "content"), + "Steps (Expected Result)": format_steps(steps_raw, "expected"), + }) + + df = pd.DataFrame(rows) + df.to_excel(output_path, index=False) + print(f"Exported {len(df)} test cases to {output_path}") + + +def main(): + parser = argparse.ArgumentParser( + description="Fetch TestRail test cases and export as xlsx for deduplication." + ) + parser.add_argument("--project-id", required=True, help="TestRail project ID") + parser.add_argument("--suite-id", default=None, help="TestRail suite ID (optional, fetches all suites if omitted)") + parser.add_argument("--output", default="testrail_export.xlsx", help="Output xlsx file path") + args = parser.parse_args() + + base_url, auth = testrail_client() + + print(f"Fetching test cases for project {args.project_id}...") + cases = fetch_cases(base_url, auth, args.project_id, args.suite_id) + print(f"Fetched {len(cases)} test cases") + + if not cases: + print("No test cases found. 
Check project ID and suite ID.") + sys.exit(1) + + print("Fetching section names...") + sections = fetch_sections(base_url, auth, args.project_id, args.suite_id) + + build_xlsx(cases, sections, args.output) + + +if __name__ == "__main__": + main() diff --git a/testrail/testcases-deduplication/find-duplicates.py b/testrail/testcases-deduplication/find-duplicates.py new file mode 100644 index 0000000..4a6aaae --- /dev/null +++ b/testrail/testcases-deduplication/find-duplicates.py @@ -0,0 +1,451 @@ +import argparse +import json +import pandas as pd +import re +import unicodedata +from difflib import SequenceMatcher +from sentence_transformers import SentenceTransformer +from sklearn.neighbors import NearestNeighbors + + +# ---------- Configuration ---------- +EXACT_OUTPUT = "duplicates_exact.csv" +SIMILAR_OUTPUT = "similar_pairs.csv" + +# Thresholds +SEMANTIC_DUP_THRESHOLD = 0.90 # >= this is considered a strong duplicate +SEMANTIC_SIM_THRESHOLD = 0.80 # >= this is considered very similar +STEP_OVERLAP_THRESHOLD = 0.80 # % of common steps to mark "shares most steps" + + +# ---------- Text utilities ---------- + +HTML_TAG_RE = re.compile(r"<.*?>", re.DOTALL) + +def strip_html(text: str) -> str: + """Remove HTML tags from text.""" + text = HTML_TAG_RE.sub(" ", text) + return text + +def normalize_text(text: str) -> str: + """ + Normalize text by: + - Stripping HTML tags + - Converting to lowercase + - Normalizing Unicode + - Removing numbering (1. 2) etc.) from beginning of lines + - Collapsing whitespace + """ + if not isinstance(text, str): + return "" + + # Step 1: Strip whitespace and HTML first (before expensive operations) + text = text.strip() + text = strip_html(text) + + # Step 2: Normalize Unicode and lowercase + text = unicodedata.normalize("NFKC", text) + text = text.lower() + + # Step 3: Remove numbering like "1. ", "2) ", "- ", etc. at the beginning of lines + # This also handles double numbering like "1. 1. 
text" + lines = [] + for line in text.splitlines(): + # Remove one or more number patterns at the start (handles "1. 1. text") + line = re.sub(r"^(\s*\d+\s*[\.\)]\s*)+", " ", line) + # Remove bullet points + line = re.sub(r"^\s*[-•]\s*", " ", line) + lines.append(line.strip()) + text = " ".join(l for l in lines if l) + + # Step 4: Collapse whitespace + text = re.sub(r"\s+", " ", text) + return text.strip() + + +def split_numbered_items(text: str): + """ + Converts something like: + '1. step one\n2. step two' -> ['step one', 'step two'] + Also handles formats like '1.
step one
\n2.step two
' + """ + if not isinstance(text, str): + return [] + + # First remove HTML + text = strip_html(text) + + # Split by patterns "n. " or "n) " at the beginning of line + # Add a fictitious newline at the beginning to simplify the split + text = "\n" + text + pieces = re.split(r"\n\s*\d+\s*[\.\)]\s*", text) + # pieces[0] will be what's before the first numbering, we ignore it + steps = [normalize_text(p) for p in pieces[1:]] + return [s for s in steps if s] + + +def parse_notes_steps_and_expected(notes: str): + """ + From a Notes block with: + Step Description: + ... + Expected Result: + ... + extracts two lists: [desc1, desc2,...], [exp1, exp2,...] + """ + if not isinstance(notes, str): + return [], [] + + # Normalize line breaks + text = notes.replace("\r\n", "\n").replace("\r", "\n") + + # Split by 'Step Description:' + chunks = text.split("Step Description:") + descs = [] + exps = [] + + for chunk in chunks[1:]: # the first piece is before the first description + parts = chunk.split("Expected Result:", 1) + desc = parts[0] + exp = parts[1] if len(parts) > 1 else "" + descs.append(normalize_text(desc)) + exps.append(normalize_text(exp)) + + return descs, exps + + +def parse_testrail_steps(steps_text: str, expected_text: str): + """ + Parse TestRail's 'Steps (Step)' and 'Steps (Expected Result)' columns. + These come in format: + Steps: "1. step one\n2. step two\n..." + Expected: "1. expected one\n2. expected two\n..." 
+ Returns: ([step1, step2, ...], [expected1, expected2, ...]) + """ + steps = split_numbered_items(steps_text) if isinstance(steps_text, str) else [] + expected = split_numbered_items(expected_text) if isinstance(expected_text, str) else [] + return steps, expected + + +def step_similarity(a: str, b: str) -> float: + return SequenceMatcher(None, a, b).ratio() + + +def compute_step_overlap(steps1, steps2, min_ratio=0.85): + """ + Calculates the % of common steps by pairing each step from steps1 + with the most similar one from steps2 that exceeds min_ratio. + Uses min() to detect when shorter tests are contained in longer ones. + """ + if not steps1 or not steps2: + return 0.0 + + used_j = set() + matches = 0 + + for s1 in steps1: + best_ratio = 0.0 + best_j = None + for j, s2 in enumerate(steps2): + if j in used_j: + continue + ratio = step_similarity(s1, s2) + if ratio > best_ratio: + best_ratio = ratio + best_j = j + if best_ratio >= min_ratio: + matches += 1 + used_j.add(best_j) + + overlap = matches / min(len(steps1), len(steps2)) + return overlap + + +# ---------- Data loading and normalization ---------- + +def load_and_normalize(path: str) -> pd.DataFrame: + # Try different header rows to find the right format + raw = None + for header_row in [0, 2]: + try: + test_df = pd.read_excel(path, header=header_row, nrows=1) + # Check if we have ID or Unnamed: 1 column with case ID pattern + if "ID" in test_df.columns or "Unnamed: 1" in test_df.columns: + raw = pd.read_excel(path, header=header_row) + break + except Exception: + continue + + if raw is None: + raise ValueError("Could not determine Excel file structure") + + # Rename key columns for readability (only if they don't already exist) + rename_map = {} + if "Unnamed: 1" in raw.columns and "CaseID" not in raw.columns: + rename_map["Unnamed: 1"] = "CaseID" + if "Unnamed: 2" in raw.columns and "Title" not in raw.columns: + rename_map["Unnamed: 2"] = "TestTitle" + + if rename_map: + raw = 
raw.rename(columns=rename_map) + + # Determine which column has the Case ID + if "ID" in raw.columns: + case_id_col = "ID" + elif "CaseID" in raw.columns: + case_id_col = "CaseID" + elif "Unnamed: 1" in raw.columns: + case_id_col = "Unnamed: 1" + else: + raise ValueError("Could not find Case ID column") + + if "Title" in raw.columns: + title_col = "Title" + elif "TestTitle" in raw.columns: + title_col = "TestTitle" + elif "Unnamed: 2" in raw.columns: + title_col = "Unnamed: 2" + else: + raise ValueError("Could not find Title column") + + # Keep only rows that have a CaseID + raw = raw[raw[case_id_col].notna()].copy() + + # Extract lists of steps and expected results + step_lists = [] + expected_lists = [] + + # Store CaseID, Title, and Section for later use + raw["_case_id"] = raw[case_id_col] + raw["_title"] = raw[title_col] + + # Extract Section column (TestRail exports use "Section" or "Section Hierarchy") + if "Section" in raw.columns: + raw["_section"] = raw["Section"].fillna("").astype(str) + elif "Section Hierarchy" in raw.columns: + raw["_section"] = raw["Section Hierarchy"].fillna("").astype(str) + else: + raw["_section"] = "" + + for _, row in raw.iterrows(): + # Try different sources for steps in priority order: + # 1) TestRail's standard 'Steps (Step)' and 'Steps (Expected Result)' columns + steps_col = row.get("Steps (Step)") + expected_col = row.get("Steps (Expected Result)") + + if pd.notna(steps_col): + steps, expected = parse_testrail_steps(steps_col, expected_col) + else: + # 2) Try to parse Notes (Step Description / Expected Result format) + notes = row.get("Notes") + descs, exps = parse_notes_steps_and_expected(notes) + if descs: + steps = descs + expected = exps + else: + # 3) Fallback to Section Description + section_desc = row.get("Section Description") + steps = split_numbered_items(str(section_desc)) if pd.notna(section_desc) else [] + expected = [] + + # 4) If still no expected results, try the global Expected Result column + if not 
expected: + expected_global = row.get("Expected Result") + if pd.notna(expected_global) and isinstance(expected_global, str): + expected = [normalize_text(expected_global)] + + step_lists.append(steps) + expected_lists.append(expected) + + raw["steps_list"] = step_lists + raw["expected_list"] = expected_lists + + # Canonical text fields + raw["canonical_title"] = raw["_title"].fillna("").apply(lambda x: normalize_text(str(x))) + raw["canonical_steps"] = raw["steps_list"].apply( + lambda lst: " | ".join(normalize_text(s) for s in lst) + ) + raw["canonical_expected"] = raw["expected_list"].apply( + lambda lst: " | ".join(normalize_text(s) for s in lst) + ) + + def build_full(row): + return ( + f"title: {row['canonical_title']}\n" + f"steps: {row['canonical_steps']}\n" + f"expected: {row['canonical_expected']}" + ) + + raw["canonical_full_text"] = raw.apply(build_full, axis=1) + + return raw + + +# ---------- Exact duplicates ---------- + +def find_exact_duplicates(df: pd.DataFrame) -> pd.DataFrame: + df["exact_key"] = df["canonical_full_text"] + dup_groups = df.groupby("exact_key").filter(lambda g: len(g) > 1).copy() + # Assign a group ID for each set of duplicates + dup_groups["duplicate_group_id"] = dup_groups.groupby("exact_key").ngroup() + return dup_groups + + +# ---------- Semantic similarity ---------- + +def compute_semantic_pairs(df: pd.DataFrame) -> pd.DataFrame: + texts = df["canonical_full_text"].tolist() + case_ids = df["_case_id"].tolist() + titles = df["_title"].fillna("").tolist() + sections = df["_section"].fillna("").tolist() + + # Local sentence-transformers model + print("Loading embeddings model...") + model = SentenceTransformer("all-MiniLM-L6-v2") + print("Generating embeddings...") + embeddings = model.encode(texts, batch_size=32, show_progress_bar=True) + + # Nearest neighbors + print("Searching for similar neighbors...") + nn = NearestNeighbors(metric="cosine", algorithm="brute") + nn.fit(embeddings) + + # Include the point itself 
(n_neighbors=21 -> 1 self + 20 neighbors for medium datasets) + distances, indices = nn.kneighbors(embeddings, n_neighbors=min(21, len(embeddings))) + + rows = [] + n = len(df) + + for i in range(n): + for k in range(1, indices.shape[1]): # skip neighbor 0 (itself) + j = indices[i, k] + if i >= j: + continue # avoid duplicating pairs (i,j) and (j,i) + + dist = distances[i, k] + sim = 1.0 - dist + + if sim < SEMANTIC_SIM_THRESHOLD: + continue + + steps1 = df.iloc[i]["steps_list"] + steps2 = df.iloc[j]["steps_list"] + overlap = compute_step_overlap(steps1, steps2) + + label = "similar" + if sim >= SEMANTIC_DUP_THRESHOLD: + label = "semantic_duplicate" + + shares_most_steps = overlap >= STEP_OVERLAP_THRESHOLD + + rows.append({ + "case_id_1": case_ids[i], + "title_1": titles[i], + "section_1": sections[i], + "case_id_2": case_ids[j], + "title_2": titles[j], + "section_2": sections[j], + "similarity": round(float(sim), 4), + "step_overlap": round(float(overlap), 4), + "relation": label, + "shares_most_steps": shares_most_steps, + }) + + return pd.DataFrame(rows) + + +# ---------- Main ---------- + +def parse_args(): + parser = argparse.ArgumentParser( + description="Detect duplicate and similar test cases from a TestRail Excel export." 
+ ) + parser.add_argument( + "input_xlsx", + help="Path to the TestRail Excel export (.xlsx)", + ) + parser.add_argument( + "--exact-output", + default=EXACT_OUTPUT, + help=f"Output CSV for exact duplicates (default: {EXACT_OUTPUT})", + ) + parser.add_argument( + "--similar-output", + default=SIMILAR_OUTPUT, + help=f"Output CSV for similar pairs (default: {SIMILAR_OUTPUT})", + ) + parser.add_argument( + "--dup-threshold", + type=float, + default=SEMANTIC_DUP_THRESHOLD, + help=f"Semantic similarity threshold for 'duplicate' label (default: {SEMANTIC_DUP_THRESHOLD})", + ) + parser.add_argument( + "--sim-threshold", + type=float, + default=SEMANTIC_SIM_THRESHOLD, + help=f"Minimum similarity threshold to report a pair (default: {SEMANTIC_SIM_THRESHOLD})", + ) + parser.add_argument( + "--overlap-threshold", + type=float, + default=STEP_OVERLAP_THRESHOLD, + help=f"Step overlap threshold for 'shares_most_steps' flag (default: {STEP_OVERLAP_THRESHOLD})", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + # Apply CLI overrides to module-level thresholds + global SEMANTIC_DUP_THRESHOLD, SEMANTIC_SIM_THRESHOLD, STEP_OVERLAP_THRESHOLD + SEMANTIC_DUP_THRESHOLD = args.dup_threshold + SEMANTIC_SIM_THRESHOLD = args.sim_threshold + STEP_OVERLAP_THRESHOLD = args.overlap_threshold + + try: + print("Loading and normalizing data...") + df = load_and_normalize(args.input_xlsx) + + print(f"Total test cases loaded: {len(df)}") + + if len(df) == 0: + print("Warning: No test cases found in the input file.") + return + + print("Searching for exact duplicates...") + exact_dups = find_exact_duplicates(df) + if not exact_dups.empty: + exact_dups[["_case_id", "_title", "_section", "duplicate_group_id"]].to_csv(args.exact_output, index=False) + print(f"Exact duplicates saved to {args.exact_output}") + else: + print("No exact duplicates found.") + + print("Searching for similar tests (semantic + steps)...") + similar_pairs = compute_semantic_pairs(df) + if not 
similar_pairs.empty: + similar_pairs.to_csv(args.similar_output, index=False) + print(f"Similar pairs saved to {args.similar_output}") + else: + print("No similar pairs found with the defined thresholds.") + + # Save stats for downstream scripts + stats = { + "total_cases": len(df), + "input_file": args.input_xlsx, + } + with open("analysis_stats.json", "w") as f: + json.dump(stats, f, indent=2) + print(f"Stats saved to analysis_stats.json") + + except FileNotFoundError: + print(f"Error: Input file '{args.input_xlsx}' not found.") + except KeyError as e: + print(f"Error: Required column not found in Excel file: {e}") + except Exception as e: + print(f"Error: {e}") + + +if __name__ == "__main__": + main() diff --git a/testrail/testcases-deduplication/generate-work-list.py b/testrail/testcases-deduplication/generate-work-list.py new file mode 100644 index 0000000..5240883 --- /dev/null +++ b/testrail/testcases-deduplication/generate-work-list.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +""" +Generate prioritized work list for reviewing duplicates +""" +import json +import os +import pandas as pd + + +def load_csv(path: str) -> pd.DataFrame: + """Load a CSV file, returning an empty DataFrame if the file doesn't exist.""" + if not os.path.exists(path): + print(f"Warning: {path} not found — skipping (no results for this category).") + return pd.DataFrame() + return pd.read_csv(path) + + +def generate_work_list(): + # Load data + exact = load_csv('duplicates_exact.csv') + similar = load_csv('similar_pairs.csv') + + if exact.empty and similar.empty: + raise RuntimeError("No duplicate data found. 
Run find-duplicates.py first.") + + # Load total case count from stats file (written by find-duplicates.py) + total_cases = 0 + if os.path.exists('analysis_stats.json'): + with open('analysis_stats.json') as f: + stats = json.load(f) + total_cases = stats.get('total_cases', 0) + + print("=" * 80) + print("WORK LIST - TEST CASE DEDUPLICATION") + print("=" * 80) + print() + + # PART 1: Exact Duplicates + print("PHASE 1: EXACT DUPLICATES") + print("-" * 80) + + total_to_archive = 0 + + if exact.empty: + print(" No exact duplicates found.") + else: + group_sizes = exact.groupby('duplicate_group_id').size().sort_values(ascending=False) + priority = 1 + + # Large groups first (4+) + print("\n🔴 PRIORITY 1: Large groups (4+ duplicates)") + print() + for group_id, size in group_sizes[group_sizes >= 4].items(): + group = exact[exact['duplicate_group_id'] == group_id].sort_values('_case_id') + title = group.iloc[0]['_title'] + case_ids = list(group['_case_id'].values) + + print(f"{priority}. Group {group_id}: {size} duplicates") + print(f" Title: {title}") + print(f" ✅ KEEP: {case_ids[0]} (lowest ID)") + print(f" 🗑️ ARCHIVE: {', '.join(case_ids[1:])}") + print(f" Savings: {size - 1} test cases") + print() + + priority += 1 + total_to_archive += size - 1 + + # Medium groups (3) + print("\n🟠 PRIORITY 2: Medium groups (3 duplicates)") + print() + for group_id, size in group_sizes[group_sizes == 3].items(): + group = exact[exact['duplicate_group_id'] == group_id].sort_values('_case_id') + title = group.iloc[0]['_title'] + case_ids = list(group['_case_id'].values) + + print(f"{priority}. Group {group_id}: {title[:60]}") + print(f" ✅ KEEP: {case_ids[0]}") + print(f" 🗑️ ARCHIVE: {', '.join(case_ids[1:])}") + print() + + priority += 1 + total_to_archive += size - 1 + + if priority > 25: # Limit output + remaining = len(group_sizes[group_sizes == 3]) - (priority - len(group_sizes[group_sizes >= 4]) - 1) + if remaining > 0: + print(f" ... 
and {remaining} more groups of 3") + break + + # Small groups (2) + print(f"\n🟡 PRIORITY 3: Small groups (2 duplicates)") + print(f" {(group_sizes == 2).sum()} groups") + print(f" Savings: {(group_sizes == 2).sum()} test cases") + print() + + total_to_archive += (group_sizes == 2).sum() + + print(f"\n📊 PHASE 1 SUMMARY:") + print(f" Total groups: {len(group_sizes)}") + print(f" Total cases to archive: {total_to_archive}") + print() + + # PART 2: High Similarity + print("\n" + "=" * 80) + print("PHASE 2: HIGH SIMILARITY PAIRS") + print("-" * 80) + + if similar.empty: + print(" No similar pairs found.") + else: + perfect = similar[similar['similarity'] == 1.0] + print(f"\n🔴 PRIORITY 1: Perfect semantic matches (100%)") + print(f" {len(perfect)} pairs") + print(f" Estimated savings: ~{len(perfect) // 2} test cases") + print() + + print(" Top 5 examples:") + for i, (_, row) in enumerate(perfect.head(5).iterrows(), 1): + print(f" {i}. {row['case_id_1']} vs {row['case_id_2']}") + print(f" {row['title_1'][:60]}") + print() + + near_perfect = similar[(similar['similarity'] >= 0.95) & (similar['similarity'] < 1.0)] + print(f"🟠 PRIORITY 2: Near-perfect matches (95-100%)") + print(f" {len(near_perfect)} pairs") + print(f" Recommended: Review top 50") + print(f" Estimated savings: ~{len(near_perfect) // 3} test cases") + print() + + high_overlap = similar[(similar['similarity'] >= 0.90) & (similar['step_overlap'] >= 0.8)] + print(f"🟡 PRIORITY 3: High step overlap (≥90% sim, ≥80% overlap)") + print(f" {len(high_overlap)} pairs") + print(f" These share most execution steps") + print() + + # Compute Phase 2 estimate from real data (perfect + ~1/3 of near-perfect) + if not similar.empty: + perfect_savings = len(similar[similar['similarity'] == 1.0]) // 2 + near_perfect_savings = len(similar[(similar['similarity'] >= 0.95) & (similar['similarity'] < 1.0)]) // 3 + phase2_estimate = perfect_savings + near_perfect_savings + else: + phase2_estimate = 0 + + # Overall summary + print("\n" 
+ "=" * 80) + print("ESTIMATED TOTALS") + print("-" * 80) + print(f"Phase 1 (Exact): ~{total_to_archive} cases") + if phase2_estimate > 0: + print(f"Phase 2 (Similar): ~{phase2_estimate} cases") + estimated_total = total_to_archive + phase2_estimate + if total_cases > 0: + pct = estimated_total / total_cases * 100 + print(f"GRAND TOTAL: ~{estimated_total} cases ({pct:.1f}% reduction out of {total_cases} total)") + else: + print(f"GRAND TOTAL: ~{estimated_total} cases") + print("=" * 80) + + +if __name__ == "__main__": + import argparse + import sys + parser = argparse.ArgumentParser(description="Generate prioritized work list from deduplication results.") + parser.add_argument( + "--output-dir", default=".", + help="Directory containing duplicates_exact.csv and similar_pairs.csv (default: current directory)" + ) + cli_args = parser.parse_args() + os.chdir(cli_args.output_dir) + try: + generate_work_list() + except RuntimeError as e: + print(f"Error: {e}") + sys.exit(1) diff --git a/testrail/testcases-deduplication/insert_bq_stats.py b/testrail/testcases-deduplication/insert_bq_stats.py new file mode 100644 index 0000000..906f03c --- /dev/null +++ b/testrail/testcases-deduplication/insert_bq_stats.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +Compute deduplication stats from output CSVs, write them to GITHUB_ENV, +and insert a row into BigQuery. 
+ +Usage: + python insert_bq_stats.py \ + --output-dir ./output \ + --project-id 14 \ + --project-name firefox-ios \ + --run-date 2026-04-23 \ + --github-run-id 12345678 \ + --bq-project moz-mobile-tools \ + --bq-dataset testops_stats \ + --bq-table testrail_deduplication_runs +""" +import argparse +import csv +import json +import os +import subprocess +import sys +from pathlib import Path + + +HIGH_PRIORITY_THRESHOLD = 0.95 + + +def compute_stats(output_dir: Path) -> dict: + total = 0 + exact = 0 + similar = 0 + high_priority_similar = 0 + + stats_file = output_dir / "analysis_stats.json" + if stats_file.exists(): + total = json.loads(stats_file.read_text()).get("total_cases", 0) + + exact_file = output_dir / "duplicates_exact.csv" + if exact_file.exists(): + with exact_file.open() as f: + exact = sum(1 for _ in csv.DictReader(f)) + + similar_file = output_dir / "similar_pairs.csv" + if similar_file.exists(): + with similar_file.open() as f: + for row in csv.DictReader(f): + similar += 1 + try: + if float(row["similarity"]) >= HIGH_PRIORITY_THRESHOLD: + high_priority_similar += 1 + except (ValueError, KeyError): + pass + + duplicate_rate = round(exact / total, 4) if total > 0 else 0.0 + + return { + "total": total, + "exact": exact, + "similar": similar, + "high_priority_similar": high_priority_similar, + "duplicate_rate": duplicate_rate, + } + + +def write_github_env(stats: dict) -> None: + github_env = os.environ.get("GITHUB_ENV") + if not github_env: + return + with open(github_env, "a") as f: + f.write(f"current_total={stats['total']}\n") + f.write(f"current_exact={stats['exact']}\n") + f.write(f"current_similar={stats['similar']}\n") + f.write(f"current_high_priority_similar={stats['high_priority_similar']}\n") + f.write(f"current_rate={stats['duplicate_rate']}\n") + + +def insert_bigquery(stats: dict, args: argparse.Namespace) -> None: + payload = json.dumps({ + "run_date": args.run_date, + "project_id": args.project_id, + "project_name": args.project_name, 
+ "total_cases": stats["total"], + "exact_duplicate_cases": stats["exact"], + "similar_pairs": stats["similar"], + "high_priority_similar_pairs": stats["high_priority_similar"], + "duplicate_rate": stats["duplicate_rate"], + "github_run_id": args.github_run_id, + }) + + result = subprocess.run( + ["bq", "insert", + f"--project_id={args.bq_project}", + f"{args.bq_dataset}.{args.bq_table}"], + input=payload.encode(), + capture_output=True, + ) + + if result.returncode != 0: + print(f"BigQuery insert failed: {result.stderr.decode()}", file=sys.stderr) + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser(description="Compute dedup stats and insert into BigQuery.") + parser.add_argument("--output-dir", required=True, help="Directory with output CSVs") + parser.add_argument("--project-id", required=True, help="TestRail project ID") + parser.add_argument("--project-name", required=True, help="Project name (e.g. firefox-ios)") + parser.add_argument("--run-date", required=True, help="Run date (YYYY-MM-DD)") + parser.add_argument("--github-run-id", required=True, help="GitHub Actions run ID") + parser.add_argument("--bq-project", default="moz-mobile-tools") + parser.add_argument("--bq-dataset", default="testops_stats") + parser.add_argument("--bq-table", default="testrail_deduplication_runs") + args = parser.parse_args() + + output_dir = Path(args.output_dir) + if not output_dir.exists(): + print(f"Error: output directory '{output_dir}' does not exist.", file=sys.stderr) + sys.exit(1) + + stats = compute_stats(output_dir) + + print(f"Total cases: {stats['total']}") + print(f"Exact duplicates: {stats['exact']}") + print(f"Similar pairs: {stats['similar']}") + print(f"High-priority similar pairs:{stats['high_priority_similar']}") + print(f"Duplicate rate: {stats['duplicate_rate']:.1%}") + + write_github_env(stats) + insert_bigquery(stats, args) + print("BigQuery insert successful.") + + +if __name__ == "__main__": + main() diff --git 
a/testrail/testcases-deduplication/requirements.txt b/testrail/testcases-deduplication/requirements.txt new file mode 100644 index 0000000..14b61de --- /dev/null +++ b/testrail/testcases-deduplication/requirements.txt @@ -0,0 +1,5 @@ +pandas>=2.0.0 +openpyxl>=3.1.0 +requests>=2.32.0 +sentence-transformers>=2.2.0 +scikit-learn>=1.3.0 diff --git a/testrail/testcases-deduplication/run_all.py b/testrail/testcases-deduplication/run_all.py new file mode 100644 index 0000000..3db3fc7 --- /dev/null +++ b/testrail/testcases-deduplication/run_all.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +""" +Full deduplication pipeline: find duplicates → generate work list → export CSVs. + +Usage: + python run_all.py /path/to/testrail_export.xlsx + python run_all.py /path/to/export.xlsx --sim-threshold 0.85 +""" +import argparse +import importlib.util +import json +import os +import sys +from pathlib import Path + +SCRIPT_DIR = Path(__file__).parent + + +def load_module(filename: str): + """Load a module from a hyphen-named file in the same directory as this script.""" + path = SCRIPT_DIR / filename + module_name = filename.removesuffix(".py").replace("-", "_") + spec = importlib.util.spec_from_file_location(module_name, path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def run_find_duplicates(input_xlsx: str, dup_threshold: float, sim_threshold: float, overlap_threshold: float): + fd = load_module("find-duplicates.py") + + # Set thresholds before calling any function (they are module-level constants) + fd.SEMANTIC_DUP_THRESHOLD = dup_threshold + fd.SEMANTIC_SIM_THRESHOLD = sim_threshold + fd.STEP_OVERLAP_THRESHOLD = overlap_threshold + + print("Loading and normalizing data...") + df = fd.load_and_normalize(input_xlsx) + print(f"Total test cases loaded: {len(df)}") + + if len(df) == 0: + print("Warning: No test cases found in the input file.") + return + + print("Searching for exact duplicates...") + exact_dups = 
fd.find_exact_duplicates(df) + if not exact_dups.empty: + exact_dups[["_case_id", "_title", "_section", "duplicate_group_id"]].to_csv("duplicates_exact.csv", index=False) + print("Exact duplicates saved to duplicates_exact.csv") + else: + print("No exact duplicates found.") + + print("Searching for similar tests (semantic + steps)...") + similar_pairs = fd.compute_semantic_pairs(df) + if not similar_pairs.empty: + similar_pairs.to_csv("similar_pairs.csv", index=False) + print("Similar pairs saved to similar_pairs.csv") + else: + print("No similar pairs found with the defined thresholds.") + + stats = {"total_cases": len(df), "input_file": input_xlsx} + with open("analysis_stats.json", "w") as f: + json.dump(stats, f, indent=2) + print("Stats saved to analysis_stats.json") + + +def run_generate_work_list(): + gwl = load_module("generate-work-list.py") + gwl.generate_work_list() + + +def run_export_priority_lists(): + epl = load_module("export-priority-list.py") + epl.export_priority_lists() + + +def main(): + parser = argparse.ArgumentParser( + description="Run the full test case deduplication pipeline." 
+ ) + parser.add_argument("input_xlsx", help="Path to the TestRail Excel export (.xlsx)") + parser.add_argument( + "--dup-threshold", type=float, default=0.90, + help="Semantic similarity threshold for 'duplicate' label (default: 0.90)" + ) + parser.add_argument( + "--sim-threshold", type=float, default=0.80, + help="Minimum similarity threshold to report a pair (default: 0.80)" + ) + parser.add_argument( + "--overlap-threshold", type=float, default=0.80, + help="Step overlap threshold for 'shares_most_steps' flag (default: 0.80)" + ) + parser.add_argument( + "--output-dir", default=str(SCRIPT_DIR), + help="Directory where output CSVs will be written (default: script directory)" + ) + args = parser.parse_args() + + # Resolve both paths before chdir, in case they are relative to the caller's CWD + input_xlsx = str(Path(args.input_xlsx).resolve()) + output_dir = Path(args.output_dir).resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + # All sub-scripts use relative paths for their CSVs — chdir so they resolve to output_dir + os.chdir(output_dir) + + print(f"\n{'='*60}") + print(" Step 1/3: Finding duplicates") + print(f"{'='*60}") + run_find_duplicates(input_xlsx, args.dup_threshold, args.sim_threshold, args.overlap_threshold) + + for name, fn in [ + ("Step 2/3: Generating work list", run_generate_work_list), + ("Step 3/3: Exporting priority CSVs", run_export_priority_lists), + ]: + print(f"\n{'='*60}") + print(f" {name}") + print(f"{'='*60}") + try: + fn() + except RuntimeError as e: + print(f" Skipped: {e}") + + print(f"\n{'='*60}") + print(" Pipeline complete.") + print(f" Output directory: {output_dir}") + print(" Output files:") + print(" - duplicates_exact.csv") + print(" - similar_pairs.csv") + print(" - WORK_LIST_EXACT.csv") + print(" - WORK_LIST_PERFECT_MATCHES.csv") + print(" - WORK_LIST_SIMILAR_HIGH_PRIORITY.csv") + print(" - analysis_stats.json") + print(f"{'='*60}\n") + + +if __name__ == "__main__": + main()