Dual-source nutrition system using BLS (German, primary) and USDA (English, fallback) with ML embedding matching (multilingual-e5-small / all-MiniLM-L6-v2), hybrid substring-first search, and position-aware scoring heuristics. Includes per-recipe and global manual ingredient overwrites, ingredient skip/exclude, referenced recipe nutrition (base refs + anchor tags), section-name dedup, amino acid tracking, and reactive client-side calculator with NutritionSummary component.
61 lines
2.2 KiB
TypeScript
61 lines
2.2 KiB
TypeScript
/**
|
|
* Pre-computes sentence embeddings for all USDA nutrition DB entries using
|
|
* all-MiniLM-L6-v2 via @huggingface/transformers.
|
|
*
|
|
* Run with: pnpm exec vite-node scripts/embed-nutrition-db.ts
|
|
*
|
|
* Outputs: src/lib/data/nutritionEmbeddings.json
|
|
* Format: { model, dimensions, count, entries: [{ fdcId, name, vector: number[384] }] }
|
|
*/
|
|
import { writeFileSync } from 'fs';
|
|
import { resolve } from 'path';
|
|
import { pipeline } from '@huggingface/transformers';
|
|
import { NUTRITION_DB } from '../src/lib/data/nutritionDb';
|
|
|
|
/** Destination of the generated embeddings JSON; resolved against the current working directory. */
const OUTPUT_PATH = resolve('src/lib/data/nutritionEmbeddings.json');

/** Model identifier passed to the feature-extraction pipeline. */
const MODEL_NAME = 'Xenova/all-MiniLM-L6-v2';

/** Number of DB entries grouped under one progress-report step. */
const BATCH_SIZE = 64;
|
|
|
|
/**
 * Embeds the name of every NUTRITION_DB entry with the feature-extraction
 * pipeline and writes the vectors to OUTPUT_PATH as a single JSON document.
 *
 * Output shape: { model, dimensions, count, entries: [{ fdcId, name, vector }] }.
 * Each vector is requested with mean pooling and normalization, then rounded
 * to 4 decimal places to keep the file small.
 *
 * @returns A promise that settles once the file is written and the pipeline
 *          has been disposed. Rejects if model loading, embedding, or the
 *          file write fails.
 */
async function main(): Promise<void> {
  console.log('=== Nutrition DB Embedding Generation ===\n');
  console.log(`Entries to embed: ${NUTRITION_DB.length}`);
  console.log(`Model: ${MODEL_NAME}`);
  console.log(`Loading model (first run downloads ~23MB)...\n`);

  const embedder = await pipeline('feature-extraction', MODEL_NAME, {
    dtype: 'q8',
  });

  try {
    const entries: { fdcId: number; name: string; vector: number[] }[] = [];
    const totalBatches = Math.ceil(NUTRITION_DB.length / BATCH_SIZE);

    for (let i = 0; i < NUTRITION_DB.length; i += BATCH_SIZE) {
      const batch = NUTRITION_DB.slice(i, i + BATCH_SIZE);
      const batchNum = Math.floor(i / BATCH_SIZE) + 1;
      // `\r` rewrites the same terminal line so progress stays on one row.
      process.stdout.write(`\r  Batch ${batchNum}/${totalBatches} (${i + batch.length}/${NUTRITION_DB.length})`);

      // Names are embedded one at a time; the batch only sets how often
      // the progress line is refreshed.
      for (const item of batch) {
        const result = await embedder(item.name, { pooling: 'mean', normalize: true });
        // result.data is a Float32Array — round to 4 decimal places to save space
        const vector = Array.from(result.data as Float32Array).map(v => Math.round(v * 10000) / 10000);
        entries.push({ fdcId: item.fdcId, name: item.name, vector });
      }
    }

    console.log('\n\nWriting embeddings...');

    // Report the dimensionality actually produced so the metadata stays
    // correct if MODEL_NAME changes; 384 is kept as the value for an empty DB.
    const dimensions = entries[0]?.vector.length ?? 384;
    const output = { model: MODEL_NAME, dimensions, count: entries.length, entries };

    // Serialize once and reuse the string for both the write and the size report.
    const serialized = JSON.stringify(output);
    writeFileSync(OUTPUT_PATH, serialized, 'utf-8');

    const fileSizeMB = (Buffer.byteLength(serialized) / 1024 / 1024).toFixed(1);
    console.log(`Written ${entries.length} embeddings to ${OUTPUT_PATH} (${fileSizeMB}MB)`);
  } finally {
    // Dispose the pipeline even when embedding or writing fails.
    await embedder.dispose();
  }
}
|
|
|
|
// Script entry point: run main(); on rejection, log the error and exit with status 1.
main().catch((err: unknown) => {
  console.error('Embedding generation failed:', err);
  process.exit(1);
});
|