/**
 * Duplicate detection for extracted objects.
 *
 * Two objects are duplicates when they share both `type` and `id`.
 * `detectDuplicates` reports such groups so the caller can ask the user to choose;
 * the `resolveDuplicates*` helpers pick a winner automatically.
 */
import type { ExtractedObject } from './metadataExtractor'

/** A set of objects of one type that all share the same ID. */
export interface DuplicateGroup<T extends ExtractedObject = ExtractedObject> {
  id: string
  type: T['type']
  objects: T[]
}

/** A duplicate group plus a human-readable message describing it. */
export interface DuplicateWarning {
  type: ExtractedObject['type']
  id: string
  objects: ExtractedObject[]
  message: string
}

/**
 * Bucket objects by a derived key.
 * Buckets appear in first-seen key order, and objects keep their input order within a bucket.
 */
function groupObjectsBy<K>(
  objects: readonly ExtractedObject[],
  keyOf: (obj: ExtractedObject) => K,
): Map<K, ExtractedObject[]> {
  const groups = new Map<K, ExtractedObject[]>()
  for (const obj of objects) {
    const key = keyOf(obj)
    const bucket = groups.get(key)
    if (bucket) {
      bucket.push(obj)
    } else {
      groups.set(key, [obj])
    }
  }
  return groups
}

/**
 * Group objects by type and ID to detect duplicates.
 *
 * @param objects - Objects to inspect; the array is not modified.
 * @returns One warning per (type, ID) pair that occurs more than once, ordered by
 *   first appearance of the type and then first appearance of the ID within that type.
 *   Returns an empty array when there are no duplicates.
 */
export function detectDuplicates(objects: ExtractedObject[]): DuplicateWarning[] {
  const warnings: DuplicateWarning[] = []

  const byType = groupObjectsBy(objects, (obj) => obj.type)
  for (const [type, typeObjects] of byType) {
    const byId = groupObjectsBy(typeObjects, (obj) => obj.id)
    for (const [id, idObjects] of byId) {
      // A single object under an ID is not a duplicate.
      if (idObjects.length <= 1) {
        continue
      }
      warnings.push({
        type,
        id,
        objects: idObjects,
        // Only the first 16 characters of the ID are shown to keep the message short.
        message: `Found ${idObjects.length} objects of type "${type}" with the same ID "${id.substring(0, 16)}...". Please choose which one to keep.`,
      })
    }
  }

  return warnings
}

/**
 * Resolve duplicates by keeping only the first object for each (type, ID) pair.
 * This is a simple resolution - in production, you'd want user interaction.
 *
 * @param objects - Objects to deduplicate; the array is not modified.
 * @returns The input objects in their original order, with later duplicates removed.
 *   A console warning is emitted for every dropped object.
 */
export function resolveDuplicatesSimple(objects: ExtractedObject[]): ExtractedObject[] {
  // Only membership matters, so a Set of composite keys is enough.
  const seen = new Set<string>()
  const resolved: ExtractedObject[] = []

  for (const obj of objects) {
    const key = `${obj.type}:${obj.id}`
    if (seen.has(key)) {
      // Keep the first one, skip duplicates
      console.warn(`Duplicate detected for ${obj.type} with ID ${obj.id.substring(0, 16)}... \nKeeping first occurrence.`)
      continue
    }
    seen.add(key)
    resolved.push(obj)
  }

  return resolved
}

/**
 * Resolve duplicates by keeping, for each (type, ID) pair, the object whose `eventId`
 * sorts last under `localeCompare`.
 *
 * NOTE: this is only a stand-in for "most recent". No timestamp is consulted here;
 * whether `eventId` ordering correlates with creation time is an assumption to confirm.
 * In production, compare the actual `created_at` of the underlying events instead.
 *
 * @param objects - Objects to deduplicate; the array is not modified.
 * @returns One object per (type, ID) pair, ordered by first appearance of the pair.
 *   When several objects tie on `eventId`, the earliest in the input wins.
 *   A console warning is emitted for every pair that had duplicates.
 */
export function resolveDuplicatesByDate(objects: ExtractedObject[]): ExtractedObject[] {
  const byKey = groupObjectsBy(objects, (obj) => `${obj.type}:${obj.id}`)
  const resolved: ExtractedObject[] = []

  for (const [key, group] of byKey) {
    const hasDuplicates = group.length > 1
    if (hasDuplicates) {
      // Descending order puts the "latest" eventId first; `group` is a local
      // bucket, so sorting in place does not touch the caller's array.
      group.sort((a, b) => b.eventId.localeCompare(a.eventId))
    }

    const winner = group[0]
    if (!winner) {
      continue
    }
    resolved.push(winner)
    if (hasDuplicates) {
      console.warn(`Resolved ${group.length} duplicates for ${key} by keeping most recent`)
    }
  }

  return resolved
}