def _build_report(records, kept_records, removed_records):
    """Collect duplicate and conflict information for logging/tests.

    Args:
        records: Passed unchanged to ``_build_conflict_details`` and
            ``_find_conflicts``.
        kept_records: Passed unchanged to ``_build_conflict_details``.
        removed_records: Iterable of mappings. Each must provide
            ``source_name``, ``original_setting`` and ``normalized_value``;
            ``duplicate_of_source_name``, ``kept_original_setting`` and
            ``kept_normalized_value`` are read with ``.get`` and default to
            ``None`` when absent.

    Returns:
        A dict with these keys:

        * ``removed_duplicates`` - one summary dict per removed record.
        * ``removed_identical`` - reduced summaries (source/setting fields
          only) of removals whose normalized value equals the kept one.
        * ``conflicting_reassignments`` / ``removed_overridden`` - two
          separate lists holding the summaries whose normalized value
          differs from the kept one.
        * ``conflict_details`` - result of ``_build_conflict_details``.
        * ``conflicts`` - result of ``_find_conflicts``.
        * ``duplicates_in_source`` - ``{source: count}`` for identical
          removals whose kept counterpart has the same source.
        * ``duplicates_across_sources`` - list of
          ``{"source", "duplicate_of_source", "count"}`` dicts, sorted by
          ``(source, duplicate_of_source)``.

    Raises:
        KeyError: If a removed record lacks one of the required keys.
    """
    unknown = "<unknown>"

    removed_duplicates = [
        {
            "source": record["source_name"],
            "duplicate_of_source": record.get("duplicate_of_source_name"),
            "setting": record["original_setting"],
            "kept_setting": record.get("kept_original_setting"),
            "normalized_value": record["normalized_value"],
            "kept_normalized_value": record.get("kept_normalized_value"),
        }
        for record in removed_records
    ]

    # Split the summaries once: identical removals get a reduced view,
    # everything else is a removal whose value differed from the kept one.
    removed_identical = []
    overridden = []
    for entry in removed_duplicates:
        if entry["normalized_value"] == entry.get("kept_normalized_value"):
            removed_identical.append(
                {
                    "source": entry["source"],
                    "duplicate_of_source": entry.get("duplicate_of_source"),
                    "setting": entry["setting"],
                    "kept_setting": entry.get("kept_setting"),
                }
            )
        else:
            overridden.append(entry)

    duplicates_in_source = {}
    duplicates_across_sources = {}

    for entry in removed_identical:
        # Both keys are always present in the reduced summary, so a
        # ``.get`` default would never fire; a missing value shows up as
        # None.  Map None to the placeholder explicitly so the counters
        # never mix None with strings (which would make the sort below
        # raise TypeError).
        source_name = entry["source"]
        if source_name is None:
            source_name = unknown
        duplicate_of_source = entry["duplicate_of_source"]
        if duplicate_of_source is None:
            duplicate_of_source = unknown

        if duplicate_of_source == source_name:
            duplicates_in_source[source_name] = (
                duplicates_in_source.get(source_name, 0) + 1
            )
            continue

        source_pair = (source_name, duplicate_of_source)
        duplicates_across_sources[source_pair] = (
            duplicates_across_sources.get(source_pair, 0) + 1
        )

    conflict_details = _build_conflict_details(records, kept_records)

    return {
        "removed_duplicates": removed_duplicates,
        "removed_identical": removed_identical,
        "conflicting_reassignments": overridden,
        # Independent list object so mutating one key's list does not
        # affect the other, as with the two separate lists built before.
        "removed_overridden": list(overridden),
        "conflict_details": conflict_details,
        "conflicts": _find_conflicts(records),
        "duplicates_in_source": duplicates_in_source,
        "duplicates_across_sources": [
            {
                "source": source_name,
                "duplicate_of_source": duplicate_of_source,
                "count": duplicate_count,
            }
            for (source_name, duplicate_of_source), duplicate_count
            in sorted(duplicates_across_sources.items())
        ],
    }
437
438