recipes: nutrition calculator with BLS/USDA matching, manual overwrites, and skip

Dual-source nutrition system using BLS (German, primary) and USDA (English, fallback) with ML embedding matching (multilingual-e5-small / all-MiniLM-L6-v2), hybrid substring-first search, and position-aware scoring heuristics. Includes per-recipe and global manual ingredient overwrites, ingredient skip/exclude, referenced recipe nutrition (base refs + anchor tags), section-name dedup, amino acid tracking, and reactive client-side calculator with NutritionSummary component.
2026-04-01 13:00:52 +02:00
parent 3cafe8955a
commit 7e1181461e
30 changed files with 722384 additions and 12 deletions
--- a/scripts/import-usda-nutrition.ts
+++ b/scripts/import-usda-nutrition.ts
@@ -0,0 +1,371 @@
+/**
+ * Imports USDA FoodData Central data (SR Legacy + Foundation Foods) and generates
+ * a typed nutrition database for the recipe calorie calculator.
+ *
+ * Run with: pnpm exec vite-node scripts/import-usda-nutrition.ts
+ *
+ * Downloads bulk CSV data from USDA FDC, filters to relevant food categories,
+ * extracts macro/micronutrient data per 100g, and outputs src/lib/data/nutritionDb.ts
+ */
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs';
+import { resolve } from 'path';
+
+const DATA_DIR = resolve('data/usda'); // cache for downloaded zips and extracted datasets; relative to the working directory
+const OUTPUT_PATH = resolve('src/lib/data/nutritionDb.ts'); // generated TypeScript module written by main()
+
+/**
+ * USDA FDC bulk download URLs, one zip archive per dataset.
+ * The last path segment of each URL is used as the cached archive's file name.
+ */
+const USDA_URLS = {
+	srLegacy: 'https://fdc.nal.usda.gov/fdc-datasets/FoodData_Central_sr_legacy_food_csv_2018-04.zip',
+	foundation: 'https://fdc.nal.usda.gov/fdc-datasets/FoodData_Central_foundation_food_csv_2024-10-31.zip',
+};
+
+/**
+ * Nutrient IDs we care about, mapped to the field name used in the generated records.
+ *
+ * Keys are compared against the `nutrient_id` column of food_nutrient.csv; rows with
+ * any other nutrient id are ignored during import. Every field name listed here is
+ * emitted for every food, defaulting to 0 when the dataset has no value for it.
+ * NOTE(review): units are whatever USDA reports for each nutrient id — confirm against
+ * the FDC nutrient table before relying on the unit hints in the inline comments.
+ */
+const NUTRIENT_IDS: Record<number, string> = {
+	1008: 'calories',
+	1003: 'protein',
+	1004: 'fat',
+	1258: 'saturatedFat',
+	1005: 'carbs',
+	1079: 'fiber',
+	1063: 'sugars',
+	// Minerals
+	1087: 'calcium',
+	1089: 'iron',
+	1090: 'magnesium',
+	1091: 'phosphorus',
+	1092: 'potassium',
+	1093: 'sodium',
+	1095: 'zinc',
+	// Vitamins
+	1106: 'vitaminA',    // RAE (mcg)
+	1162: 'vitaminC',
+	1114: 'vitaminD',    // D2+D3 (mcg)
+	1109: 'vitaminE',
+	1185: 'vitaminK',
+	1165: 'thiamin',
+	1166: 'riboflavin',
+	1167: 'niacin',
+	1175: 'vitaminB6',
+	1178: 'vitaminB12',
+	1177: 'folate',
+	// Other
+	1253: 'cholesterol',
+	// Amino acids (g/100g)
+	1212: 'isoleucine',
+	1213: 'leucine',
+	1214: 'lysine',
+	1215: 'methionine',
+	1217: 'phenylalanine',
+	1211: 'threonine',
+	1210: 'tryptophan',
+	1219: 'valine',
+	1221: 'histidine',
+	1222: 'alanine',
+	1220: 'arginine',
+	1223: 'asparticAcid',
+	1216: 'cysteine',
+	1224: 'glutamicAcid',
+	1225: 'glycine',
+	1226: 'proline',
+	1227: 'serine',
+	1218: 'tyrosine',
+};
+
+/**
+ * Food categories to include (SR Legacy food_category_id descriptions).
+ *
+ * Each food's `food_category_id` is resolved to a description via food_category.csv and
+ * the food is kept only when that description is an exact member of this set. Foods
+ * whose category id cannot be resolved get an empty description and are dropped.
+ */
+const INCLUDED_CATEGORIES = new Set([
+	'Dairy and Egg Products',
+	'Spices and Herbs',
+	'Baby Foods',
+	'Fats and Oils',
+	'Poultry Products',
+	'Soups, Sauces, and Gravies',
+	'Sausages and Luncheon Meats',
+	'Breakfast Cereals',
+	'Fruits and Fruit Juices',
+	'Pork Products',
+	'Vegetables and Vegetable Products',
+	'Nut and Seed Products',
+	'Beef Products',
+	'Beverages',
+	'Finfish and Shellfish Products',
+	'Legumes and Legume Products',
+	'Lamb, Veal, and Game Products',
+	'Baked Products',
+	'Sweets',
+	'Cereal Grains and Pasta',
+	'Snacks',
+	'Restaurant Foods',
+]);
+
+type NutrientData = Record<string, number>; // nutrient field name (a NUTRIENT_IDS value) -> amount read from food_nutrient.csv
+
+interface RawFood { // one kept row of food.csv
+	fdcId: number; // parsed from the fdc_id column
+	description: string; // food name as given in the dataset
+	categoryId: number; // parsed from food_category_id; 0 when that column is blank
+	category: string; // category description resolved through food_category.csv
+}
+
+interface Portion { // one household measure for a food, from food_portion.csv
+	description: string; // label built from the row's amount and modifier / portion_description
+	grams: number; // gram_weight rounded to two decimals
+}
+
+/**
+ * Splits a single CSV line into its fields.
+ *
+ * Commas inside double-quoted sections do not split, the surrounding quotes are removed,
+ * and a doubled quote (`""`) inside a quoted section yields one literal `"`.
+ * No trimming is applied to the fields.
+ *
+ * @param line - One line of CSV text, without its line terminator.
+ * @returns The field values in order; an empty line yields a single empty field.
+ */
+function parseCSVLine(line: string): string[] {
+	const out: string[] = [];
+	let buf = '';
+	let quoted = false;
+	let pos = 0;
+
+	while (pos < line.length) {
+		const c = line.charAt(pos);
+		pos++;
+
+		if (c === '"') {
+			// `pos` already points at the following character; charAt() returns '' past the end.
+			if (quoted && line.charAt(pos) === '"') {
+				buf += '"';
+				pos++; // consume the second quote of the escaped pair
+			} else {
+				quoted = !quoted;
+			}
+			continue;
+		}
+
+		if (c === ',' && !quoted) {
+			out.push(buf);
+			buf = '';
+			continue;
+		}
+
+		buf += c;
+	}
+
+	// The final field has no trailing comma to flush it.
+	out.push(buf);
+	return out;
+}
+
+/**
+ * Reads a CSV file into row objects keyed by the names in its header line.
+ *
+ * Blank lines are skipped. Columns missing from a row are filled with `''`; fields beyond
+ * the header count are dropped. The file is split into lines before fields are parsed,
+ * so quoted fields that contain line breaks are not supported.
+ *
+ * @param filePath - Path of the CSV file to read.
+ * @returns One record per data line, or `[]` when the file is missing or empty
+ *          (a missing file is reported with a warning rather than an error).
+ */
+async function readCSV(filePath: string): Promise<Record<string, string>[]> {
+	if (!existsSync(filePath)) {
+		console.warn(`  File not found: ${filePath}`);
+		return [];
+	}
+
+	// Drop a leading UTF-8 byte-order mark so it does not become part of the first header name.
+	const content = readFileSync(filePath, 'utf-8').replace(/^\uFEFF/, '');
+	// Accept LF and CRLF alike: splitting on '\n' alone would leave a '\r' glued to the last
+	// header and to the last field of every row, so lookups by column name would miss.
+	const lines = content.split(/\r?\n/).filter(l => l.trim());
+	if (lines.length === 0) return [];
+
+	const headers = parseCSVLine(lines[0]);
+	const rows: Record<string, string>[] = [];
+
+	for (let i = 1; i < lines.length; i++) {
+		const fields = parseCSVLine(lines[i]);
+		const row: Record<string, string> = {};
+		for (let j = 0; j < headers.length; j++) {
+			row[headers[j]] = fields[j] ?? '';
+		}
+		rows.push(row);
+	}
+
+	return rows;
+}
+
+/**
+ * Makes sure the zip archive at `url` has been downloaded and unpacked into `targetDir`.
+ *
+ * The archive is cached in DATA_DIR under the last path segment of the URL and is only
+ * fetched when that file does not exist yet. After a successful extraction a `.done`
+ * marker containing `ok` is written into `targetDir`; when that marker is present the
+ * call returns without doing any work.
+ *
+ * @param url - URL of the zip archive to download.
+ * @param targetDir - Directory that receives the archive contents (paths are flattened by `unzip -j`).
+ * @throws Error when the download response is not OK, or when the `unzip` command fails.
+ */
+async function downloadAndExtract(url: string, targetDir: string): Promise<void> {
+	const zipName = url.split('/').pop()!;
+	const zipPath = resolve(DATA_DIR, zipName);
+	const markerPath = resolve(targetDir, '.done');
+
+	// Test for the marker itself, not just the directory: the directory is created before
+	// the download starts, so after an interrupted run it exists without a marker and
+	// reading the marker unconditionally would throw ENOENT instead of retrying.
+	if (existsSync(markerPath) && readFileSync(markerPath, 'utf-8').trim() === 'ok') {
+		console.log(`  Already extracted: ${targetDir}`);
+		return;
+	}
+
+	mkdirSync(targetDir, { recursive: true });
+
+	if (!existsSync(zipPath)) {
+		console.log(`  Downloading ${zipName}...`);
+		const response = await fetch(url);
+		if (!response.ok) throw new Error(`Download failed: ${response.status} ${response.statusText}`);
+
+		const buffer = Buffer.from(await response.arrayBuffer());
+		writeFileSync(zipPath, buffer);
+		console.log(`  Downloaded ${(buffer.length / 1024 / 1024).toFixed(1)}MB`);
+	}
+
+	console.log(`  Extracting to ${targetDir}...`);
+	const { execFileSync } = await import('child_process');
+	// Pass arguments as an array so no shell is involved and paths containing quotes,
+	// `$` or backticks are handed to unzip verbatim.
+	execFileSync('unzip', ['-o', '-j', zipPath, '-d', targetDir], { stdio: 'pipe' });
+	writeFileSync(markerPath, 'ok');
+}
+
+/**
+ * Loads one extracted USDA dataset directory into in-memory lookup maps.
+ *
+ * Reads food_category.csv, food.csv, food_nutrient.csv and food_portion.csv from
+ * `datasetDir`, in that order. Only foods whose category description is a member of
+ * INCLUDED_CATEGORIES are kept; nutrient and portion rows are kept only for those foods,
+ * and nutrient rows only for ids listed in NUTRIENT_IDS.
+ *
+ * @param datasetDir - Directory containing the extracted CSV files.
+ * @param label - Dataset name shown in the progress output.
+ * @returns `foods`, `nutrients` and `portions`, each a Map keyed by the parsed `fdc_id`.
+ */
+async function importDataset(datasetDir: string, label: string) {
+	console.log(`\nProcessing ${label}...`);
+
+	// Category id (kept as the raw CSV string) -> category description
+	const categoryById = new Map<string, string>();
+	const categoryCsv = await readCSV(resolve(datasetDir, 'food_category.csv'));
+	categoryCsv.forEach(row => {
+		categoryById.set(row['id'], row['description']);
+	});
+
+	// Foods, restricted to the included categories
+	const foods = new Map<number, RawFood>();
+	const foodCsv = await readCSV(resolve(datasetDir, 'food.csv'));
+	for (const row of foodCsv) {
+		const rawCategoryId = row['food_category_id'];
+		const category = categoryById.get(rawCategoryId) || '';
+		if (!INCLUDED_CATEGORIES.has(category)) {
+			continue;
+		}
+
+		const fdcId = parseInt(row['fdc_id']);
+		const categoryId = parseInt(rawCategoryId || '0');
+		foods.set(fdcId, { fdcId, description: row['description'], categoryId, category });
+	}
+	console.log(`  Found ${foods.size} foods in included categories`);
+
+	// Nutrient amounts per food, keyed by the field names from NUTRIENT_IDS
+	const nutrients = new Map<number, NutrientData>();
+	const nutrientCsv = await readCSV(resolve(datasetDir, 'food_nutrient.csv'));
+	for (const row of nutrientCsv) {
+		const fdcId = parseInt(row['fdc_id']);
+		if (!foods.has(fdcId)) {
+			continue;
+		}
+
+		const fieldName = NUTRIENT_IDS[parseInt(row['nutrient_id'])];
+		if (!fieldName) {
+			continue;
+		}
+
+		// The per-food record is created as soon as a tracked nutrient row is seen,
+		// even if its amount turns out to be unparsable.
+		let perFood = nutrients.get(fdcId);
+		if (perFood === undefined) {
+			perFood = {};
+			nutrients.set(fdcId, perFood);
+		}
+
+		const amount = parseFloat(row['amount'] || '0');
+		if (!isNaN(amount)) {
+			perFood[fieldName] = amount;
+		}
+	}
+	console.log(`  Loaded nutrients for ${nutrients.size} foods`);
+
+	// Household portions per food
+	const portions = new Map<number, Portion[]>();
+	const portionCsv = await readCSV(resolve(datasetDir, 'food_portion.csv'));
+	for (const row of portionCsv) {
+		const fdcId = parseInt(row['fdc_id']);
+		if (!foods.has(fdcId)) {
+			continue;
+		}
+
+		// Rows without a usable, non-zero gram weight carry no information.
+		const gramWeight = parseFloat(row['gram_weight'] || '0');
+		if (isNaN(gramWeight) || gramWeight === 0) {
+			continue;
+		}
+
+		// Label: "<modifier>" for an amount of exactly 1, "<amount> <modifier>" otherwise,
+		// and "<amount> unit" when the row names no modifier at all.
+		const amount = parseFloat(row['amount'] || '1');
+		const modifier = row['modifier'] || row['portion_description'] || '';
+		let label: string;
+		if (!modifier) {
+			label = `${amount} unit`;
+		} else if (amount === 1) {
+			label = modifier;
+		} else {
+			label = `${amount} ${modifier}`;
+		}
+
+		let list = portions.get(fdcId);
+		if (list === undefined) {
+			list = [];
+			portions.set(fdcId, list);
+		}
+		list.push({ description: label, grams: Math.round(gramWeight * 100) / 100 });
+	}
+	console.log(`  Loaded portions for ${portions.size} foods`);
+
+	return { foods, nutrients, portions };
+}
+
+/**
+ * Produces a complete nutrient record containing every field named in NUTRIENT_IDS.
+ *
+ * @param data - Amounts collected for one food, or `undefined` when none were found.
+ * @returns A record with one entry per tracked field, rounded to two decimals;
+ *          fields that are missing (or falsy) in `data` come out as 0.
+ */
+function buildNutrientRecord(data: NutrientData | undefined): Record<string, number> {
+	const record: Record<string, number> = {};
+	Object.values(NUTRIENT_IDS).forEach(name => {
+		const raw = data?.[name] || 0;
+		record[name] = Math.round(raw * 100) / 100;
+	});
+	return record;
+}
+
+/**
+ * Script entry point: downloads both USDA datasets, merges them into one list of foods,
+ * writes the generated TypeScript module to OUTPUT_PATH and prints a per-category summary.
+ *
+ * Foods are merged by lower-cased description. SR Legacy is added first and Foundation
+ * Foods second, so a Foundation entry replaces an SR Legacy entry with the same name.
+ * Foods without any tracked nutrient row are left out.
+ */
+async function main() {
+	console.log('=== USDA Nutrition Database Import ===\n');
+
+	mkdirSync(DATA_DIR, { recursive: true });
+
+	// Fetch and unpack both archives
+	const srDir = resolve(DATA_DIR, 'sr_legacy');
+	const foundationDir = resolve(DATA_DIR, 'foundation');
+	await downloadAndExtract(USDA_URLS.srLegacy, srDir);
+	await downloadAndExtract(USDA_URLS.foundation, foundationDir);
+
+	// Load both datasets
+	const sr = await importDataset(srDir, 'SR Legacy');
+	const foundation = await importDataset(foundationDir, 'Foundation Foods');
+
+	type MergedFood = {
+		fdcId: number;
+		name: string;
+		category: string;
+		per100g: Record<string, number>;
+		portions: Portion[];
+	};
+	const merged = new Map<string, MergedFood>();
+
+	// Lowest priority first: a later dataset overwrites earlier entries with the same key.
+	for (const dataset of [sr, foundation]) {
+		for (const [fdcId, food] of dataset.foods) {
+			const rawNutrients = dataset.nutrients.get(fdcId);
+			// Skip entries with no nutrient data at all
+			if (rawNutrients === undefined) {
+				continue;
+			}
+
+			merged.set(food.description.toLowerCase(), {
+				fdcId,
+				name: food.description,
+				category: food.category,
+				per100g: buildNutrientRecord(rawNutrients),
+				portions: dataset.portions.get(fdcId) || [],
+			});
+		}
+	}
+
+	console.log(`\nMerged total: ${merged.size} unique foods`);
+
+	// Order by name so the generated file has a predictable layout
+	const entries = Array.from(merged.values());
+	entries.sort((left, right) => left.name.localeCompare(right.name));
+
+	// Render the generated module
+	const generatedOn = new Date().toISOString().split('T')[0];
+	const serialized = JSON.stringify(entries, null, '\t');
+	const tsContent = `// Auto-generated from USDA FoodData Central (SR Legacy + Foundation Foods)
+// Generated: ${generatedOn}
+// Do not edit manually — regenerate with: pnpm exec vite-node scripts/import-usda-nutrition.ts
+
+import type { NutritionPer100g } from '$types/types';
+
+export type NutritionEntry = {
+	fdcId: number;
+	name: string;
+	category: string;
+	per100g: NutritionPer100g;
+	portions: { description: string; grams: number }[];
+};
+
+export const NUTRITION_DB: NutritionEntry[] = ${serialized};
+`;
+
+	writeFileSync(OUTPUT_PATH, tsContent, 'utf-8');
+	console.log(`\nWritten ${entries.length} entries to ${OUTPUT_PATH}`);
+
+	// Summary: number of entries per category, largest first
+	const perCategory = new Map<string, number>();
+	entries.forEach(({ category }) => {
+		perCategory.set(category, (perCategory.get(category) || 0) + 1);
+	});
+
+	console.log('\nCategory breakdown:');
+	const ranked = Array.from(perCategory).sort(([, countA], [, countB]) => countB - countA);
+	for (const [name, total] of ranked) {
+		console.log(`  ${name}: ${total}`);
+	}
+}
+
+/** Logs the failure reason and terminates the process with exit code 1. */
+function reportImportFailure(reason: unknown): void {
+	console.error('Import failed:', reason);
+	process.exit(1);
+}
+
+// Run the import; any rejection from main() ends the script with a non-zero exit code.
+main().catch(reportImportFailure);