generated from AOSSIE-Org/Template-Repo
feat(retriever): add hybrid retrieval engine pipeline with lexical + semantic ranking #23
Open

Chethan-Regala wants to merge 11 commits into AOSSIE-Org:main from Chethan-Regala:feat/hybrid-retrieval-engine (base: main)

Changes from all commits (11 commits):
- e541316 feat(retriever): initialize hybrid retrieval engine module
- 1d5c77f feat(retriever): define hybrid retrieval type contracts
- 89088f3 feat(retriever): implement cosine similarity utility for semantic ran…
- a74780b feat(retriever): implement hybrid ranking scorer
- 40eebe3 feat(retriever): implement hybrid retrieval engine pipeline
- 8c1c758 chore(retriever): add demo runner for hybrid retrieval engine
- 3ff5fb7 chore(retriever): expose public API for hybrid retrieval module
- 90b4fee Hybrid retrieval engine hardened
- 58ad570 fix(retriever): normalize scores, refine hybrid ranking, and clean TS…
- ad44737 fix(retriever): validate hybrid weights and clarify retrieval limit b…
- fdab2e5 fix(wrapped error)
Diff (appears to be the repo's `.gitignore`; add/remove markers were not preserved in the extract):

```
@@ -1,3 +1,7 @@
dist/
# Node.js
node_modules/

## Core latex/pdflatex auxiliary files:
*.aux
*.lof
```
New file (the `retriever` package manifest, `@@ -0,0 +1,13 @@`):

```json
{
  "name": "retriever",
  "version": "1.0.0",
  "description": "Hybrid retrieval engine for Smart Notes",
  "main": "dist/index.js",
  "scripts": {
    "build": "tsc"
  },
  "devDependencies": {
    "@types/node": "^20.0.0",
    "typescript": "^5.0.0"
  }
}
```
New file `CosineSimilarity.ts` (`@@ -0,0 +1,106 @@`):

````ts
/**
 * @file CosineSimilarity.ts
 * @description Utility functions for computing cosine similarity between
 * dense embedding vectors. Used by the hybrid retrieval engine to measure
 * semantic closeness between a query embedding and stored chunk embeddings.
 */

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Computes the Euclidean magnitude (L2 norm) of a vector.
 *
 * The magnitude is defined as:
 * ```
 * ||v|| = sqrt(v[0]² + v[1]² + ... + v[n-1]²)
 * ```
 *
 * @param v - A dense numeric vector.
 * @returns The non-negative scalar magnitude of `v`.
 */
export function vectorMagnitude(v: number[]): number {
  let sumOfSquares = 0;
  for (let i = 0; i < v.length; i++) {
    sumOfSquares += v[i] * v[i];
  }
  return Math.sqrt(sumOfSquares);
}

// ---------------------------------------------------------------------------
// Core
// ---------------------------------------------------------------------------

/**
 * Computes the cosine similarity between two dense numeric vectors.
 *
 * Cosine similarity measures the cosine of the angle between two vectors in
 * an inner-product space. It is widely used in semantic search to gauge how
 * similar two embedding vectors are, regardless of their magnitude:
 *
 * ```
 * cosineSimilarity(A, B) = (A · B) / (||A|| × ||B||)
 * ```
 *
 * The result lies in the range **[-1, 1]**:
 * - `1` → vectors point in the same direction (identical semantics)
 * - `0` → vectors are orthogonal (unrelated)
 * - `-1` → vectors point in opposite directions
 *
 * For typical text embeddings produced by transformer models the practical
 * range is **[0, 1]**.
 *
 * ### Validation
 * - Throws a `RangeError` if `a` and `b` have different lengths, because the
 *   dot product is undefined for vectors of unequal dimension.
 * - Returns `0` if either vector has zero magnitude to avoid division by zero;
 *   a zero vector carries no directional information, so similarity is
 *   treated as neutral.
 *
 * ### Performance
 * The dot product and both sum-of-squares accumulators are computed in a
 * **single pass** over the vectors, avoiding extra allocations or iterations.
 *
 * @param a - First dense numeric vector (e.g. a query embedding).
 * @param b - Second dense numeric vector (e.g. a chunk embedding).
 * @returns Cosine similarity in the range [-1, 1], or 0 if either vector
 *   has zero magnitude.
 * @throws {RangeError} When `a` and `b` have different lengths.
 *
 * @example
 * ```ts
 * cosineSimilarity([1, 0, 0], [1, 0, 0]); // 1
 * cosineSimilarity([1, 0], [0, 1]);       // 0
 * ```
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new RangeError(
      `cosineSimilarity: vectors must have the same length ` +
        `(got ${a.length} and ${b.length}).`
    );
  }

  let dotProduct = 0;
  let sumOfSquaresA = 0;
  let sumOfSquaresB = 0;

  // Single pass: accumulate dot product and both magnitudes simultaneously.
  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    sumOfSquaresA += a[i] * a[i];
    sumOfSquaresB += b[i] * b[i];
  }

  const magnitudeA = Math.sqrt(sumOfSquaresA);
  const magnitudeB = Math.sqrt(sumOfSquaresB);

  // Guard against division by zero for zero-magnitude vectors.
  if (magnitudeA === 0 || magnitudeB === 0) {
    return 0;
  }

  // Clamp to [-1, 1] to correct for floating-point drift.
  return Math.max(-1, Math.min(1, dotProduct / (magnitudeA * magnitudeB)));
}
````
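As a quick sanity check, the utility can be exercised on toy vectors. The snippet below inlines the same logic as the PR's `cosineSimilarity` so it runs standalone, without the project's module layout:

```typescript
// Inlined from the PR's CosineSimilarity.ts so the example is self-contained.
function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new RangeError(
      `cosineSimilarity: vectors must have the same length (got ${a.length} and ${b.length}).`
    );
  }
  let dot = 0;
  let sumA = 0;
  let sumB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    sumA += a[i] * a[i];
    sumB += b[i] * b[i];
  }
  const magA = Math.sqrt(sumA);
  const magB = Math.sqrt(sumB);
  if (magA === 0 || magB === 0) return 0; // zero vector carries no direction
  return Math.max(-1, Math.min(1, dot / (magA * magB)));
}

// Parallel vectors score 1 regardless of magnitude; orthogonal vectors score 0.
console.log(cosineSimilarity([2, 0, 0], [5, 0, 0])); // 1
console.log(cosineSimilarity([1, 0], [0, 1]));       // 0
console.log(cosineSimilarity([1, 2, 3], [2, 4, 6])); // ≈ 1 (same direction)
console.log(cosineSimilarity([0, 0], [1, 1]));       // 0 (zero-magnitude guard)
```

The final clamp matters in practice: accumulated floating-point error can push the ratio marginally outside [-1, 1], which would otherwise leak into downstream score blending.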
New file `HybridScorer.ts` (`@@ -0,0 +1,152 @@`):

````ts
/**
 * @file HybridScorer.ts
 * @description Hybrid ranking utility for the Smart Notes retrieval engine.
 *
 * Combines lexical relevance scores (e.g. BM25 from SQLite FTS5) with semantic
 * similarity scores (cosine similarity between query and chunk embeddings) into
 * a single blended ranking signal.
 */

import type {
  SearchCandidate,
  SearchResult,
  HybridScoreWeights,
} from "./types";

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/**
 * Tolerance used when validating that alpha + beta ≈ 1.
 * Accounts for normal IEEE-754 floating-point rounding.
 */
const WEIGHT_SUM_TOLERANCE = 1e-6;

// ---------------------------------------------------------------------------
// Core
// ---------------------------------------------------------------------------

/**
 * Combines lexical and semantic scores into a ranked list of {@link SearchResult}s.
 *
 * ### Hybrid Ranking
 * Pure lexical search (BM25 / FTS) excels at exact keyword matching but misses
 * paraphrases and synonyms. Pure semantic search captures conceptual similarity
 * but can surface results that share no keywords with the query. Hybrid ranking
 * blends both signals to get the best of both worlds.
 *
 * ### Scoring Formula
 * Lexical scores are first min-max normalized across the candidate set so they
 * are comparable with cosine similarities. For each candidate at index `i`:
 * ```
 * finalScore = (alpha × semanticScore[i]) + (beta × normalizedLexicalScore[i])
 * ```
 * `alpha` and `beta` must be finite, non-negative, and sum to `1.0` within a
 * small floating-point tolerance; anything else is rejected with a `RangeError`.
 *
 * Results are returned **sorted in descending order** by `finalScore` so that
 * the most relevant chunk appears first.
 *
 * @param candidates - Lexical search candidates produced by the retrieval store.
 *   Each carries a `lexicalScore` and identifying metadata.
 * @param semanticScores - Cosine similarity scores in the same order as `candidates`.
 *   `semanticScores[i]` must correspond to `candidates[i]`.
 * @param weights - Blending weights `{ alpha, beta }` applied to the
 *   semantic and lexical scores respectively.
 *
 * @returns An array of {@link SearchResult} objects sorted by `finalScore` (desc).
 *
 * @throws {RangeError} When `candidates` and `semanticScores` have different
 *   lengths (a 1-to-1 correspondence is required for correct scoring), or when
 *   the weights are non-finite, negative, or do not sum to 1.
 *
 * @example
 * ```ts
 * const results = scoreHybridResults(candidates, semanticScores, { alpha: 0.7, beta: 0.3 });
 * console.log(results[0].finalScore); // highest scoring chunk
 * ```
 */
export function scoreHybridResults(
  candidates: SearchCandidate[],
  semanticScores: number[],
  weights: HybridScoreWeights
): SearchResult[] {
  // ------------------------------------------------------------------
  // Validation
  // ------------------------------------------------------------------

  if (candidates.length !== semanticScores.length) {
    throw new RangeError(
      `scoreHybridResults: candidates and semanticScores must have the same length ` +
        `(got ${candidates.length} candidates and ${semanticScores.length} scores).`
    );
  }

  if (!Number.isFinite(weights.alpha) || !Number.isFinite(weights.beta)) {
    throw new RangeError(
      `scoreHybridResults: weights.alpha and weights.beta must be finite numbers ` +
        `(got ${weights.alpha} and ${weights.beta}).`
    );
  }

  if (weights.alpha < 0 || weights.beta < 0) {
    throw new RangeError(
      `scoreHybridResults: weights.alpha and weights.beta must be non-negative ` +
        `(got ${weights.alpha} and ${weights.beta}).`
    );
  }

  const weightSum = weights.alpha + weights.beta;
  if (Math.abs(weightSum - 1) > WEIGHT_SUM_TOLERANCE) {
    throw new RangeError(
      `scoreHybridResults: weights.alpha (${weights.alpha}) + weights.beta (${weights.beta}) ` +
        `must sum to 1 (got ${weightSum}).`
    );
  }

  // ------------------------------------------------------------------
  // Scoring
  // ------------------------------------------------------------------

  const minLexicalScore = candidates.reduce(
    (minScore: number, candidate: SearchCandidate): number =>
      Math.min(minScore, candidate.lexicalScore),
    Number.POSITIVE_INFINITY
  );
  const maxLexicalScore = candidates.reduce(
    (maxScore: number, candidate: SearchCandidate): number =>
      Math.max(maxScore, candidate.lexicalScore),
    Number.NEGATIVE_INFINITY
  );
  const range = maxLexicalScore - minLexicalScore;

  const results: SearchResult[] = candidates.map(
    (candidate: SearchCandidate, i: number): SearchResult => {
      const semanticScore = semanticScores[i];
      const lexicalScore = candidate.lexicalScore;
      const normalizedLexicalScore =
        range > 0 ? (lexicalScore - minLexicalScore) / range : 0;
      const finalScore =
        weights.alpha * semanticScore +
        weights.beta * normalizedLexicalScore;

      return {
        chunkId: candidate.chunkId,
        notePath: candidate.notePath,
        text: candidate.text,
        lexicalScore,
        semanticScore,
        finalScore,
      };
    }
  );

  // ------------------------------------------------------------------
  // Ranking — descending by finalScore
  // ------------------------------------------------------------------

  results.sort(
    (a: SearchResult, b: SearchResult): number => b.finalScore - a.finalScore
  );

  return results;
}
````
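To see the normalization and blending arithmetic in isolation, here is a condensed, self-contained sketch of the same scoring path (validation omitted). The `SearchCandidate` and `SearchResult` shapes are minimal stand-ins for the module's `./types`, which is not shown in this diff:

```typescript
// Minimal stand-ins for the types imported from "./types" (assumed shapes).
interface SearchCandidate {
  chunkId: string;
  notePath: string;
  text: string;
  lexicalScore: number;
}
interface SearchResult extends SearchCandidate {
  semanticScore: number;
  finalScore: number;
}

// Condensed scoring path: min-max normalize lexical scores so they are
// comparable with cosine similarities, blend, then sort descending.
function blend(
  candidates: SearchCandidate[],
  semanticScores: number[],
  alpha: number,
  beta: number
): SearchResult[] {
  const lex = candidates.map((c) => c.lexicalScore);
  const min = Math.min(...lex);
  const range = Math.max(...lex) - min;
  return candidates
    .map((c, i) => ({
      ...c,
      semanticScore: semanticScores[i],
      finalScore:
        alpha * semanticScores[i] +
        beta * (range > 0 ? (c.lexicalScore - min) / range : 0),
    }))
    .sort((a, b) => b.finalScore - a.finalScore);
}

const candidates: SearchCandidate[] = [
  { chunkId: "c1", notePath: "a.md", text: "exact keyword hit", lexicalScore: 12.0 },
  { chunkId: "c2", notePath: "b.md", text: "paraphrase, few keywords", lexicalScore: 2.0 },
];
// c2 shares no keywords but is semantically closer; with alpha = 0.7 it wins:
//   c1: 0.7 * 0.2 + 0.3 * 1.0 = 0.44
//   c2: 0.7 * 0.9 + 0.3 * 0.0 = 0.63
const ranked = blend(candidates, [0.2, 0.9], 0.7, 0.3);
console.log(ranked.map((r) => r.chunkId)); // ["c2", "c1"]
```

This illustrates why the min-max step matters: raw BM25 scores (here 12.0 vs 2.0) live on an unbounded scale and would otherwise dominate the [0, 1] cosine similarities.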