134 lines
3.6 KiB
TypeScript
134 lines
3.6 KiB
TypeScript
/**
 * Detects objects of the same type that share an ID.
 *
 * When duplicates are found, callers can either warn the user and ask them to
 * choose which object to keep, or resolve the duplicates automatically.
 */
|
|
|
|
import type { ExtractedObject } from './metadataExtractor'
|
|
|
|
/**
 * A set of objects of one type that all carry the same ID.
 *
 * @typeParam T - The concrete object type held by the group.
 */
export interface DuplicateGroup<T extends ExtractedObject> {
  /** ID shared by every entry in `objects`. */
  id: string
  /** Value of the `type` field for the grouped objects. */
  type: T['type']
  /** The grouped objects themselves. */
  objects: T[]
}
|
|
|
|
/**
 * Describes one duplicate found by `detectDuplicates`: several objects of the
 * same type sharing one ID, plus a message suitable for showing to the user.
 */
export interface DuplicateWarning {
  /** The `type` shared by all objects in this warning. */
  type: ExtractedObject['type']
  /** The full (untruncated) ID shared by all objects in this warning. */
  id: string
  /** Every object that carries this type/ID combination. */
  objects: ExtractedObject[]
  /** Human-readable description of the conflict. */
  message: string
}
|
|
|
|
/**
 * Groups objects by type and ID and reports every ID that occurs more than
 * once within the same type.
 *
 * Objects with the same ID but different types are not considered duplicates.
 * Warnings are emitted in first-seen order: types in the order they first
 * appear in `objects`, and within a type, IDs in the order they first appear.
 * Each warning's `objects` list keeps the input order.
 *
 * @param objects - Objects to inspect; the array is not modified.
 * @returns One warning per duplicated (type, ID) pair; empty when there are none.
 */
export function detectDuplicates(objects: ExtractedObject[]): DuplicateWarning[] {
  // Longest ID prefix shown in the user-facing message.
  const maxShownIdLength = 16

  // type -> id -> objects. Map iteration follows insertion order, which is
  // what gives the warnings their first-seen ordering.
  const groupsByType = new Map<ExtractedObject['type'], Map<string, ExtractedObject[]>>()

  for (const obj of objects) {
    let groupsById = groupsByType.get(obj.type)
    if (groupsById === undefined) {
      groupsById = new Map<string, ExtractedObject[]>()
      groupsByType.set(obj.type, groupsById)
    }

    const group = groupsById.get(obj.id)
    if (group === undefined) {
      groupsById.set(obj.id, [obj])
    } else {
      group.push(obj)
    }
  }

  const warnings: DuplicateWarning[] = []

  for (const [type, groupsById] of groupsByType.entries()) {
    for (const [id, group] of groupsById.entries()) {
      if (group.length < 2) {
        continue
      }

      // Only add the ellipsis when the ID really was shortened; otherwise the
      // message would suggest a longer ID than the one the user has to pick.
      const shownId =
        id.length > maxShownIdLength ? `${id.substring(0, maxShownIdLength)}...` : id

      warnings.push({
        type,
        id,
        objects: group,
        message: `Found ${group.length} objects of type "${type}" with the same ID "${shownId}". Please choose which one to keep.`,
      })
    }
  }

  return warnings
}
|
|
|
|
/**
 * Resolves duplicates by keeping only the first object seen for each
 * type/ID combination and dropping every later one.
 *
 * This is a simple, non-interactive resolution; a warning is logged for each
 * dropped object. The relative order of the kept objects matches the input.
 *
 * @param objects - Objects to de-duplicate; the array is not modified.
 * @returns A new array containing the first occurrence of each type/ID pair.
 */
export function resolveDuplicatesSimple(objects: ExtractedObject[]): ExtractedObject[] {
  // Only membership matters here, so a Set of composite keys is sufficient.
  const seenKeys = new Set<string>()
  const resolved: ExtractedObject[] = []

  for (const obj of objects) {
    const key = `${obj.type}:${obj.id}`

    if (seenKeys.has(key)) {
      // A later occurrence of an already-kept type/ID pair: report and skip.
      console.warn(`Duplicate detected for ${obj.type} with ID ${obj.id.substring(0, 16)}... Keeping first occurrence.`)
      continue
    }

    seenKeys.add(key)
    resolved.push(obj)
  }

  return resolved
}
|
|
|
|
/**
 * Resolves duplicates by keeping, for each type/ID combination, the object
 * whose `eventId` string is greatest.
 *
 * NOTE(review): despite the name, no timestamp is consulted here. `eventId`
 * is used as a stand-in for creation time; whether event IDs actually
 * correlate with creation time needs to be confirmed. For a true
 * "most recent wins" policy, compare the events' real `created_at` values.
 *
 * `eventId` values are compared by UTF-16 code units rather than with
 * `localeCompare`, so the chosen object does not depend on the runtime's
 * locale or collation data. When several objects in a group share the greatest
 * `eventId`, the one that appears first in `objects` is kept.
 *
 * The result contains one object per type/ID pair, ordered by where each pair
 * first appears in the input. A warning is logged for every pair that had more
 * than one candidate.
 *
 * @param objects - Objects to de-duplicate; the array is not modified.
 * @returns A new array with exactly one object per type/ID pair.
 */
export function resolveDuplicatesByDate(objects: ExtractedObject[]): ExtractedObject[] {
  // Composite key -> all objects sharing that type and ID, in input order.
  const groups = new Map<string, ExtractedObject[]>()

  for (const obj of objects) {
    const key = `${obj.type}:${obj.id}`
    const group = groups.get(key)
    if (group === undefined) {
      groups.set(key, [obj])
    } else {
      group.push(obj)
    }
  }

  const resolved: ExtractedObject[] = []

  for (const [key, group] of groups.entries()) {
    // A linear scan for the maximum; the strict `>` keeps the earliest
    // candidate on ties.
    let winner: ExtractedObject | undefined
    for (const candidate of group) {
      if (winner === undefined || candidate.eventId > winner.eventId) {
        winner = candidate
      }
    }

    if (winner === undefined) {
      // Unreachable in practice: every group is created with one element.
      continue
    }

    resolved.push(winner)
    if (group.length > 1) {
      console.warn(`Resolved ${group.length} duplicates for ${key} by keeping most recent`)
    }
  }

  return resolved
}
|