recipes: nutrition calculator with BLS/USDA matching, manual overwrites, and skip
Dual-source nutrition system using BLS (German, primary) and USDA (English, fallback) with ML embedding matching (multilingual-e5-small / all-MiniLM-L6-v2), hybrid substring-first search, and position-aware scoring heuristics. Includes per-recipe and global manual ingredient overwrites, ingredient skip/exclude, referenced recipe nutrition (base refs + anchor tags), section-name dedup, amino acid tracking, and reactive client-side calculator with NutritionSummary component.
This commit is contained in:
61
scripts/embed-bls-db.ts
Normal file
61
scripts/embed-bls-db.ts
Normal file
@@ -0,0 +1,61 @@
|
||||
/**
|
||||
* Pre-compute sentence embeddings for BLS German food names.
|
||||
* Uses multilingual-e5-small for good German language understanding.
|
||||
*
|
||||
* Run: pnpm exec vite-node scripts/embed-bls-db.ts
|
||||
*/
|
||||
import { pipeline } from '@huggingface/transformers';
|
||||
import { writeFileSync } from 'fs';
|
||||
import { resolve } from 'path';
|
||||
|
||||
// blsDb is a generated module, so it is pulled in with a dynamic import.
const blsModule = await import('../src/lib/data/blsDb');
const BLS_DB = blsModule.BLS_DB;

// Model id handed to the feature-extraction pipeline.
const MODEL_NAME = 'Xenova/multilingual-e5-small';
// Destination of the generated embeddings; the relative path is resolved
// against the current working directory.
const OUTPUT_FILE = resolve('src/lib/data/blsEmbeddings.json');
|
||||
|
||||
/**
 * Embeds every BLS food name and writes the result to OUTPUT_FILE as JSON.
 *
 * Each entry's `nameDe` is prefixed with "passage: ", run through the
 * feature-extraction pipeline (mean pooling, normalized), and the resulting
 * vector components are rounded to four decimal places. The output object
 * records the model name, vector dimensionality, entry count and the entries.
 *
 * @returns A promise that resolves once the JSON file has been written.
 */
async function main(): Promise<void> {
  console.log(`Loading model ${MODEL_NAME}...`);
  const embedder = await pipeline('feature-extraction', MODEL_NAME, {
    dtype: 'q8',
  });

  console.log(`Embedding ${BLS_DB.length} BLS entries...`);

  const entries: { blsCode: string; name: string; vector: number[] }[] = [];
  // Entries are embedded one at a time; the batch size only sets how often
  // progress is checked.
  const batchSize = 32;

  for (let i = 0; i < BLS_DB.length; i += batchSize) {
    const batch = BLS_DB.slice(i, i + batchSize);

    for (const item of batch) {
      // e5 models require "passage: " prefix for documents
      const result = await embedder(`passage: ${item.nameDe}`, { pooling: 'mean', normalize: true });
      // Round to 4 decimals to keep the JSON output compact.
      const vector = Array.from(result.data as Float32Array).map(v => Math.round(v * 10000) / 10000);

      entries.push({
        blsCode: item.blsCode,
        name: item.nameDe,
        vector,
      });
    }

    const processed = Math.min(i + batchSize, BLS_DB.length);
    // Log whenever a multiple of 500 was just crossed, and always after the
    // final batch so the last progress line shows the full count.
    if ((i + batchSize) % 500 < batchSize || processed === BLS_DB.length) {
      console.log(` ${processed}/${BLS_DB.length}`);
    }
  }

  const output = {
    model: MODEL_NAME,
    // Fall back to 384 only when nothing was embedded at all.
    dimensions: entries[0]?.vector.length ?? 384,
    count: entries.length,
    entries,
  };

  const json = JSON.stringify(output);
  writeFileSync(OUTPUT_FILE, json, 'utf-8');

  // Measure encoded bytes, not UTF-16 code units: names containing non-ASCII
  // characters take more than one byte each in the written file.
  const sizeMb = Buffer.byteLength(json, 'utf-8') / 1024 / 1024;
  console.log(`Written ${OUTPUT_FILE} (${sizeMb.toFixed(1)}MB, ${entries.length} entries)`);
}
|
||||
|
||||
// Log the failure and mark the process as failed, so the shell or any calling
// script does not see a broken run as a success (exit code 0).
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user