story-research-zapwall/lib/duplicateDetector.ts
2026-01-08 21:49:57 +01:00

134 lines
3.6 KiB
TypeScript

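/**
 * Detect objects that share the same ID within a single object type.
 * Each set of duplicates is reported as a warning whose message asks the user
 * to choose which object to keep; resolver helpers that pick one automatically
 * are also provided.
 */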
import type { ExtractedObject } from './metadataExtractor'
/**
 * A typed group of extracted objects, generic over the concrete object type `T`.
 *
 * NOTE(review): nothing in the visible part of this file uses this interface, so
 * the field descriptions below are inferred from its name and from the parallel
 * `DuplicateWarning` shape - confirm against the callers.
 */
export interface DuplicateGroup<T extends ExtractedObject> {
// Presumably the ID shared by every entry in `objects`.
id: string
// The `type` discriminator of `T`.
type: T['type']
// Presumably all objects of this type that carry `id`.
objects: T[]
}
/**
 * One duplicate report, as produced by `detectDuplicates`: several objects of the
 * same type were found carrying the same ID.
 */
export interface DuplicateWarning {
// Object type the colliding objects have in common.
type: ExtractedObject['type']
// The full (untruncated) ID shared by the colliding objects.
id: string
// Every object found with this type and ID, in input order (always more than one).
objects: ExtractedObject[]
// Human-readable description of the collision; embeds at most the first 16 characters of `id`.
message: string
}
/**
 * Find objects that collide on ID within the same object type.
 *
 * Objects are indexed by type and then by ID. Every (type, ID) pair that is
 * carried by more than one object yields one warning containing all of the
 * colliding objects in input order.
 *
 * Warnings are ordered by the first appearance of each type in `objects`, and
 * within a type by the first appearance of each ID.
 *
 * @param objects - Extracted objects to inspect; the array is not modified.
 * @returns One warning per duplicated (type, ID) pair; empty when there are none.
 */
export function detectDuplicates(objects: ExtractedObject[]): DuplicateWarning[] {
  // Two-level index: object type -> object ID -> every object carrying that pair.
  // Maps iterate in insertion order, which fixes the order of the warnings.
  const index = new Map<ExtractedObject['type'], Map<string, ExtractedObject[]>>()
  for (const candidate of objects) {
    let idsForType = index.get(candidate.type)
    if (idsForType === undefined) {
      idsForType = new Map<string, ExtractedObject[]>()
      index.set(candidate.type, idsForType)
    }
    const sharers = idsForType.get(candidate.id)
    if (sharers === undefined) {
      idsForType.set(candidate.id, [candidate])
    } else {
      sharers.push(candidate)
    }
  }

  const warnings: DuplicateWarning[] = []
  for (const [type, idsForType] of index) {
    for (const [id, sharers] of idsForType) {
      // An ID held by a single object is not a duplicate.
      if (sharers.length < 2) {
        continue
      }
      warnings.push({
        type,
        id,
        objects: sharers,
        // Only the first 16 characters of the ID are shown to keep the message short.
        message: `Found ${sharers.length} objects of type "${type}" with the same ID "${id.substring(0, 16)}...". Please choose which one to keep.`,
      })
    }
  }
  return warnings
}
/**
 * Drop duplicates by keeping the first object seen for each `type:id` pair.
 *
 * This is a deliberately simple policy with no user interaction: later objects
 * with an already-seen type and ID are discarded, and a console warning is
 * emitted for each one.
 *
 * @param objects - Extracted objects, possibly containing duplicates; not modified.
 * @returns A new array with the surviving objects in their original relative order.
 */
export function resolveDuplicatesSimple(objects: ExtractedObject[]): ExtractedObject[] {
  // Composite `type:id` keys that already have a surviving object.
  const claimedKeys = new Set<string>()
  return objects.filter((candidate) => {
    const compositeKey = `${candidate.type}:${candidate.id}`
    if (claimedKeys.has(compositeKey)) {
      // A previous object already owns this key; report and drop this one.
      console.warn(`Duplicate detected for ${candidate.type} with ID ${candidate.id.substring(0, 16)}... Keeping first occurrence.`)
      return false
    }
    claimedKeys.add(compositeKey)
    return true
  })
}
/**
 * Resolve duplicates by keeping a single "most recent" object per `type:id` pair.
 *
 * Objects are grouped by `${type}:${id}`. The result contains exactly one object
 * per group, ordered by the first appearance of each group in `objects`. A
 * console warning is emitted for every group that held more than one object.
 * The input array is not modified.
 *
 * How "most recent" is decided between two objects of the same group:
 * - If `getCreatedAt` is supplied and returns two different finite numbers, the
 *   object with the larger timestamp wins.
 * - Otherwise (no accessor, a missing/non-finite timestamp, or equal timestamps)
 *   the object whose `eventId` sorts last under `localeCompare` wins.
 * - If that comparison is also a tie, the object that appears first in
 *   `objects` is kept.
 *
 * NOTE: the `eventId` fallback is only a stand-in for recency. An event ID does
 * not necessarily correlate with creation time, so pass `getCreatedAt` whenever
 * real timestamps (e.g. the event's `created_at`) are available.
 *
 * @param objects - Extracted objects, possibly containing duplicates.
 * @param getCreatedAt - Optional accessor returning an object's creation time as
 *   a number (any unit, as long as it is consistent), or `undefined` if unknown.
 * @returns A new array with one object per `type:id` pair.
 */
export function resolveDuplicatesByDate(
  objects: ExtractedObject[],
  getCreatedAt?: (obj: ExtractedObject) => number | undefined,
): ExtractedObject[] {
  // True when `candidate` should replace `current` as the group's winner.
  const isNewer = (candidate: ExtractedObject, current: ExtractedObject): boolean => {
    const candidateTime = getCreatedAt?.(candidate)
    const currentTime = getCreatedAt?.(current)
    if (
      typeof candidateTime === 'number' &&
      typeof currentTime === 'number' &&
      Number.isFinite(candidateTime) &&
      Number.isFinite(currentTime) &&
      candidateTime !== currentTime
    ) {
      return candidateTime > currentTime
    }
    // Strictly greater only: on a tie the earlier object stays the winner.
    return candidate.eventId.localeCompare(current.eventId) > 0
  }

  // `type:id` -> current winner and group size. Map iteration follows insertion
  // order, so groups come out in order of first appearance.
  const winners = new Map<string, { best: ExtractedObject; count: number }>()
  for (const obj of objects) {
    const key = `${obj.type}:${obj.id}`
    const entry = winners.get(key)
    if (entry === undefined) {
      winners.set(key, { best: obj, count: 1 })
      continue
    }
    entry.count += 1
    if (isNewer(obj, entry.best)) {
      entry.best = obj
    }
  }

  const resolved: ExtractedObject[] = []
  for (const [key, { best, count }] of winners) {
    resolved.push(best)
    if (count > 1) {
      console.warn(`Resolved ${count} duplicates for ${key} by keeping most recent`)
    }
  }
  return resolved
}