Files
homepage/scripts/embed-nutrition-db.ts
Alexander Bocken 7e1181461e recipes: nutrition calculator with BLS/USDA matching, manual overwrites, and skip
Dual-source nutrition system using BLS (German, primary) and USDA (English, fallback)
with ML embedding matching (multilingual-e5-small / all-MiniLM-L6-v2), hybrid
substring-first search, and position-aware scoring heuristics.

Includes per-recipe and global manual ingredient overwrites, ingredient skip/exclude,
referenced recipe nutrition (base refs + anchor tags), section-name dedup,
amino acid tracking, and reactive client-side calculator with NutritionSummary component.
2026-04-01 13:00:55 +02:00

61 lines
2.2 KiB
TypeScript

/**
* Pre-computes sentence embeddings for all USDA nutrition DB entries using
* all-MiniLM-L6-v2 via @huggingface/transformers.
*
* Run with: pnpm exec vite-node scripts/embed-nutrition-db.ts
*
* Outputs: src/lib/data/nutritionEmbeddings.json
* Format: { model, dimensions, count, entries: [{ fdcId, name, vector: number[384] }] }
*/
import { writeFileSync } from 'fs';
import { resolve } from 'path';
import { pipeline } from '@huggingface/transformers';
import { NUTRITION_DB } from '../src/lib/data/nutritionDb';
/** Model identifier handed to the feature-extraction pipeline. */
const MODEL_NAME = 'Xenova/all-MiniLM-L6-v2';

/** Number of DB entries grouped together per progress-reporting batch. */
const BATCH_SIZE = 64;

/** Location of the generated embeddings file, relative to the working directory. */
const OUTPUT_RELATIVE_PATH = 'src/lib/data/nutritionEmbeddings.json';

/** Absolute path the embeddings JSON is written to. */
const OUTPUT_PATH = resolve(OUTPUT_RELATIVE_PATH);
/**
 * Embeds the name of every NUTRITION_DB entry with the MODEL_NAME
 * feature-extraction pipeline and writes the result to OUTPUT_PATH as JSON.
 *
 * Output shape: { model, dimensions, count, entries: [{ fdcId, name, vector }] }.
 * Each vector is mean-pooled, normalized, and rounded to 4 decimal places.
 *
 * The pipeline is disposed even when embedding or writing fails; the error
 * then propagates to the caller.
 */
async function main(): Promise<void> {
  console.log('=== Nutrition DB Embedding Generation ===\n');
  console.log(`Entries to embed: ${NUTRITION_DB.length}`);
  console.log(`Model: ${MODEL_NAME}`);
  console.log(`Loading model (first run downloads ~23MB)...\n`);

  const embedder = await pipeline('feature-extraction', MODEL_NAME, {
    dtype: 'q8',
  });

  try {
    const entries: { fdcId: number; name: string; vector: number[] }[] = [];
    const totalBatches = Math.ceil(NUTRITION_DB.length / BATCH_SIZE);

    for (let i = 0; i < NUTRITION_DB.length; i += BATCH_SIZE) {
      const batch = NUTRITION_DB.slice(i, i + BATCH_SIZE);
      const batchNum = Math.floor(i / BATCH_SIZE) + 1;
      process.stdout.write(`\r Batch ${batchNum}/${totalBatches} (${i + batch.length}/${NUTRITION_DB.length})`);

      // Names are embedded one at a time; the batch grouping only paces the
      // progress line above.
      for (const item of batch) {
        const result = await embedder(item.name, { pooling: 'mean', normalize: true });
        // result.data is a Float32Array — round to 4 decimal places to save space
        const vector = Array.from(result.data as Float32Array).map(v => Math.round(v * 10000) / 10000);
        entries.push({ fdcId: item.fdcId, name: item.name, vector });
      }
    }

    console.log('\n\nWriting embeddings...');

    // Report the dimensionality actually produced so the metadata stays
    // correct if MODEL_NAME changes; 384 is kept as the value for an empty DB.
    const dimensions = entries[0]?.vector.length ?? 384;
    const output = { model: MODEL_NAME, dimensions, count: entries.length, entries };

    // Serialize once and reuse the string for both the write and the size report.
    const serialized = JSON.stringify(output);
    writeFileSync(OUTPUT_PATH, serialized, 'utf-8');

    const fileSizeMB = (Buffer.byteLength(serialized) / 1024 / 1024).toFixed(1);
    console.log(`Written ${entries.length} embeddings to ${OUTPUT_PATH} (${fileSizeMB}MB)`);
  } finally {
    // Release the pipeline on both success and failure.
    await embedder.dispose();
  }
}
// Script entry point: run main() and exit with status 1 if it rejects.
void (async () => {
  try {
    await main();
  } catch (failure) {
    console.error('Embedding generation failed:', failure);
    process.exit(1);
  }
})();