Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
dist/
# Node.js
node_modules/

## Core latex/pdflatex auxiliary files:
*.aux
*.lof
Expand Down
48 changes: 48 additions & 0 deletions apps/retriever/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions apps/retriever/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"name": "retriever",
"version": "1.0.0",
"description": "Hybrid retrieval engine for Smart Notes",
"main": "dist/index.js",
"scripts": {
"build": "tsc"
},
"devDependencies": {
"@types/node": "^20.0.0",
"typescript": "^5.0.0"
}
}
106 changes: 106 additions & 0 deletions apps/retriever/src/CosineSimilarity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/**
* @file CosineSimilarity.ts
* @description Utility functions for computing cosine similarity between
* dense embedding vectors. Used by the hybrid retrieval engine to measure
* semantic closeness between a query embedding and stored chunk embeddings.
*/

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Computes the Euclidean magnitude (L2 norm) of a vector.
 *
 * Defined as `||v|| = sqrt(v[0]² + v[1]² + ... + v[n-1]²)`.
 *
 * @param v - A dense numeric vector.
 * @returns The non-negative scalar magnitude of `v` (0 for an empty vector).
 */
export function vectorMagnitude(v: number[]): number {
  // Fold the squared components left-to-right, then take the square root.
  const sumOfSquares = v.reduce(
    (acc: number, component: number): number => acc + component * component,
    0
  );
  return Math.sqrt(sumOfSquares);
}

// ---------------------------------------------------------------------------
// Core
// ---------------------------------------------------------------------------

/**
 * Computes the cosine similarity between two dense numeric vectors.
 *
 * Cosine similarity is the cosine of the angle between two vectors in an
 * inner-product space, commonly used in semantic search to compare embedding
 * vectors independently of their magnitudes:
 *
 * ```
 * cosineSimilarity(A, B) = (A · B) / (||A|| × ||B||)
 * ```
 *
 * The result lies in **[-1, 1]**:
 * - `1` → same direction (identical semantics)
 * - `0` → orthogonal (unrelated)
 * - `-1` → opposite directions
 *
 * For typical transformer text embeddings the practical range is **[0, 1]**.
 *
 * ### Validation
 * - Throws a `RangeError` when `a` and `b` differ in length (the dot product
 *   is undefined for vectors of unequal dimension).
 * - Returns `0` when either vector has zero magnitude: a zero vector carries
 *   no direction, so similarity is treated as neutral rather than dividing
 *   by zero.
 *
 * ### Performance
 * The dot product and both sum-of-squares accumulators are gathered in a
 * single pass with no intermediate allocations.
 *
 * @param a - First dense numeric vector (e.g. a query embedding).
 * @param b - Second dense numeric vector (e.g. a chunk embedding).
 * @returns Cosine similarity in [-1, 1], or 0 if either vector has zero
 *          magnitude.
 * @throws {RangeError} When `a` and `b` have different lengths.
 *
 * @example
 * ```ts
 * cosineSimilarity([1, 0, 0], [1, 0, 0]); // 1
 * cosineSimilarity([1, 0], [0, 1]);       // 0
 * ```
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new RangeError(
      `cosineSimilarity: vectors must have the same length ` +
      `(got ${a.length} and ${b.length}).`
    );
  }

  const dimension = a.length;
  let dot = 0;
  let sqNormA = 0;
  let sqNormB = 0;

  // One pass over both vectors: dot product plus both squared norms.
  for (let idx = 0; idx < dimension; idx++) {
    const x = a[idx];
    const y = b[idx];
    dot += x * y;
    sqNormA += x * x;
    sqNormB += y * y;
  }

  const normA = Math.sqrt(sqNormA);
  const normB = Math.sqrt(sqNormB);

  // A zero-magnitude vector has no direction; report neutral similarity.
  if (normA === 0 || normB === 0) {
    return 0;
  }

  // Clamp to [-1, 1] to correct for floating-point drift.
  const raw = dot / (normA * normB);
  return raw > 1 ? 1 : raw < -1 ? -1 : raw;
}
152 changes: 152 additions & 0 deletions apps/retriever/src/HybridScorer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
/**
* @file HybridScorer.ts
* @description Hybrid ranking utility for the Smart Notes retrieval engine.
*
* Combines lexical relevance scores (e.g. BM25 from SQLite FTS5) with semantic
* similarity scores (cosine similarity between query and chunk embeddings) into
* a single blended ranking signal.
*/

import type {
SearchCandidate,
SearchResult,
HybridScoreWeights,
} from "./types";

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/**
 * Tolerance used when validating that alpha + beta ≈ 1.
 * Accounts for normal IEEE-754 floating-point rounding.
 */
const WEIGHT_SUM_TOLERANCE = 1e-6;

// ---------------------------------------------------------------------------
// Core
// ---------------------------------------------------------------------------

/**
 * Combines lexical and semantic scores into a ranked list of {@link SearchResult}s.
 *
 * ### Hybrid Ranking
 * Pure lexical search (BM25 / FTS) excels at exact keyword matching but misses
 * paraphrases and synonyms. Pure semantic search captures conceptual similarity
 * but can surface results that share no keywords with the query. Hybrid ranking
 * blends both signals to get the best of both worlds.
 *
 * ### Scoring Formula
 * Raw lexical scores (e.g. BM25) are unbounded, so they are first min-max
 * normalized across the candidate set to [0, 1] before blending. For each
 * candidate at index `i`:
 * ```
 * normLex[i]    = (lexicalScore[i] - minLex) / (maxLex - minLex)   // 0 if all equal
 * finalScore[i] = (alpha × semanticScore[i]) + (beta × normLex[i])
 * ```
 * `alpha` and `beta` must be finite, non-negative, and sum to `1.0` (within a
 * tolerance of 1e-6); otherwise a `RangeError` is thrown.
 *
 * Results are returned **sorted in descending order** by `finalScore` so that
 * the most relevant chunk appears first. The raw (un-normalized)
 * `lexicalScore` is echoed back on each result for debugging/telemetry.
 *
 * @param candidates - Lexical search candidates produced by the retrieval store.
 *                     Each carries a `lexicalScore` and identifying metadata.
 * @param semanticScores - Cosine similarity scores in the same order as `candidates`.
 *                         `semanticScores[i]` must correspond to `candidates[i]`.
 * @param weights - Blending weights `{ alpha, beta }` applied to the
 *                  semantic and normalized lexical scores respectively.
 *
 * @returns An array of {@link SearchResult} objects sorted by `finalScore` (desc).
 *          Empty input yields an empty array.
 *
 * @throws {RangeError} When `candidates` and `semanticScores` have different lengths,
 *                      as a 1-to-1 correspondence is required for correct scoring.
 * @throws {RangeError} When either weight is non-finite or negative, or when
 *                      `alpha + beta` deviates from 1 by more than 1e-6.
 *
 * @example
 * ```ts
 * const results = scoreHybridResults(candidates, semanticScores, { alpha: 0.7, beta: 0.3 });
 * console.log(results[0].finalScore); // highest scoring chunk
 * ```
 */
export function scoreHybridResults(
  candidates: SearchCandidate[],
  semanticScores: number[],
  weights: HybridScoreWeights
): SearchResult[] {
  // ------------------------------------------------------------------
  // Validation
  // ------------------------------------------------------------------

  if (candidates.length !== semanticScores.length) {
    throw new RangeError(
      `scoreHybridResults: candidates and semanticScores must have the same length ` +
      `(got ${candidates.length} candidates and ${semanticScores.length} scores).`
    );
  }

  if (!Number.isFinite(weights.alpha) || !Number.isFinite(weights.beta)) {
    throw new RangeError(
      `scoreHybridResults: weights.alpha and weights.beta must be finite numbers ` +
      `(got ${weights.alpha} and ${weights.beta}).`
    );
  }

  if (weights.alpha < 0 || weights.beta < 0) {
    throw new RangeError(
      `scoreHybridResults: weights.alpha and weights.beta must be non-negative ` +
      `(got ${weights.alpha} and ${weights.beta}).`
    );
  }

  const weightSum = weights.alpha + weights.beta;
  if (Math.abs(weightSum - 1) > WEIGHT_SUM_TOLERANCE) {
    throw new RangeError(
      `scoreHybridResults: weights.alpha (${weights.alpha}) + weights.beta (${weights.beta}) ` +
      `must sum to 1 (got ${weightSum}).`
    );
  }

  // ------------------------------------------------------------------
  // Scoring
  // ------------------------------------------------------------------

  // Min/max of the raw lexical scores, gathered in a single pass.
  let minLexicalScore = Number.POSITIVE_INFINITY;
  let maxLexicalScore = Number.NEGATIVE_INFINITY;
  for (const candidate of candidates) {
    minLexicalScore = Math.min(minLexicalScore, candidate.lexicalScore);
    maxLexicalScore = Math.max(maxLexicalScore, candidate.lexicalScore);
  }
  const range = maxLexicalScore - minLexicalScore;

  const results: SearchResult[] = candidates.map(
    (candidate: SearchCandidate, i: number): SearchResult => {
      const semanticScore = semanticScores[i];
      const lexicalScore = candidate.lexicalScore;
      // When all lexical scores are equal (range === 0) they carry no ranking
      // information, so the normalized lexical contribution is 0.
      const normalizedLexicalScore =
        range > 0 ? (lexicalScore - minLexicalScore) / range : 0;
      const finalScore =
        weights.alpha * semanticScore +
        weights.beta * normalizedLexicalScore;

      return {
        chunkId: candidate.chunkId,
        notePath: candidate.notePath,
        text: candidate.text,
        lexicalScore,
        semanticScore,
        finalScore,
      };
    }
  );

  // ------------------------------------------------------------------
  // Ranking — descending by finalScore
  // ------------------------------------------------------------------

  results.sort(
    (a: SearchResult, b: SearchResult): number => b.finalScore - a.finalScore
  );

  return results;
}
Loading
Loading