# dupfinder.py

#!/usr/bin/env python3
"""
Music duplicate finder - combines three detection methods:

1. Track number prefix duplicates
 "4-04 The Spirit of Radio.mp3" vs "04 The Spirit of Radio.mp3"

2. Partial title matching within the same artist
 "The Spirit of Radio.mp3" vs "The Spirit of Radio (live in Manchester).mp3"

3. Artist name normalization
 "Killers" and "The Killers" treated as the same artist

Run without --delete to preview. Add --delete to remove duplicates.

Usage:
 python3 dupfinder.py /path/to/music
 python3 dupfinder.py /path/to/music --delete
"""

import os
import re
import argparse

# File extensions (lowercase, leading dot included) that count as audio.
AUDIO_EXTENSIONS = (
    '.mp3',
    '.flac',
    '.m4a',
    '.ogg',
    '.wav',
    '.aac',
)

# Leading track-number prefix such as "04 " or "4-04 ": digits, an optional
# dash/space, optional further digits, another optional dash/space, then any
# remaining whitespace.
TRACK_PREFIX = re.compile(r'^\d+[-\s]?\d*[-\s]?\s*')

# Leading article ("the", "a" or "an") followed by whitespace, in any case.
ARTICLE_PREFIX = re.compile(r'^(the|a|an)\s+', flags=re.IGNORECASE)

# ─── Helpers ────────────────────────────────────────────────────────────────

def strip_track_prefix(filename):
    """Remove the leading track number from a filename.

    Returns a ``(bare_title, extension)`` tuple: the name without its
    extension and without the leading track-number prefix (surrounding
    whitespace stripped), plus the lower-cased extension including the dot.

    If removing the prefix would leave nothing -- the whole name is numeric,
    e.g. "1979.mp3" -- the un-stripped name is used as the title instead of
    collapsing to an empty string.
    """
    name, ext = os.path.splitext(filename)
    stripped = TRACK_PREFIX.sub('', name).strip()
    if not stripped:
        # A purely numeric name is the only identity the file has; an empty
        # title would make unrelated files look identical to later passes.
        stripped = name.strip()
    return stripped, ext.lower()

def normalize_title(title):
    """Return a loose comparison key for a title.

    Every character that is neither a word character nor whitespace is
    removed, then the result is lower-cased and stripped of surrounding
    whitespace. Inner whitespace is left as is.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', title)
    lowered = without_punctuation.lower()
    return lowered.strip()

def normalize_artist(name):
    """Return a comparison key for an artist folder name.

    A leading article matched by ARTICLE_PREFIX is removed, then the result
    is stripped of surrounding whitespace and lower-cased.
    """
    without_article = ARTICLE_PREFIX.sub('', name)
    trimmed = without_article.strip()
    return trimmed.lower()

def is_audio(filename):
    """Return True when the filename ends with one of AUDIO_EXTENSIONS (case-insensitive)."""
    lowered = filename.lower()
    return lowered.endswith(AUDIO_EXTENSIONS)

# ─── File collection ────────────────────────────────────────────────────────

def collect_files(folder):
    """Walk ``folder`` recursively and describe every audio file found.

    Returns a list of dicts, one per audio file, with the keys 'path',
    'filename', 'bare' (title without track prefix), 'normalized'
    (comparison key derived from 'bare') and 'ext' (lower-cased extension).
    """
    collected = []
    for current_dir, _subdirs, names in os.walk(folder):
        for name in names:
            if is_audio(name):
                title, extension = strip_track_prefix(name)
                collected.append(dict(
                    path=os.path.join(current_dir, name),
                    filename=name,
                    bare=title,
                    normalized=normalize_title(title),
                    ext=extension,
                ))
    return collected

def group_artist_folders(music_dir):
    """Bucket the immediate sub-directories of ``music_dir`` by artist key.

    Directories whose names normalize to the same key (for example
    'Killers' and 'The Killers') land in the same bucket. Non-directory
    entries are ignored.

    Returns a list of buckets; each bucket is a list of directory paths.
    """
    grouped = {}
    for item in os.scandir(music_dir):
        if item.is_dir():
            artist_key = normalize_artist(item.name)
            if artist_key not in grouped:
                grouped[artist_key] = []
            grouped[artist_key].append(item.path)
    return list(grouped.values())

# ─── Duplicate detection ─────────────────────────────────────────────────────

def find_track_prefix_dupes(files):
    """Find files in the same folder that share a normalized title.

    Typical hit: '04 Title.mp3' vs '4-04 Title.mp3', where only the track
    number prefix differs.

    ``files`` is a list of dicts with at least the keys 'path' and
    'normalized'. Files whose normalized title is empty are skipped: an
    empty title carries no information, so grouping on it would report
    unrelated files (e.g. '01.mp3' and '02.mp3') as duplicates of each other.

    NOTE(review): the extension is not part of the grouping key, so
    'Title.mp3' and 'Title.flac' in one folder are reported as duplicates --
    confirm this is intended.

    Returns a list of groups; each group is a list of two or more file dicts.
    Groups are ordered by folder, then by title, in order of first appearance.
    """
    by_folder = {}
    for f in files:
        title = f['normalized']
        if not title:
            continue
        folder = os.path.dirname(f['path'])
        by_folder.setdefault(folder, {}).setdefault(title, []).append(f)

    duplicates = []
    for titles in by_folder.values():
        for group in titles.values():
            if len(group) > 1:
                duplicates.append(group)

    return duplicates

def find_partial_title_dupes(files):
    """Group files whose normalized title contains another file's title.

    Example: 'The Spirit of Radio' is contained in
    'The Spirit of Radio (live in Manchester)'.

    Files are visited from shortest to longest normalized title. Each
    unclaimed file with a non-empty title gathers every later unclaimed file
    whose title contains its own as a substring; a file belongs to at most
    one group. Containment is checked on characters, not on whole words.

    Returns a list of groups, each starting with the shortest title.
    """
    by_length = sorted(files, key=lambda item: len(item['normalized']))
    claimed = set()
    groups = []

    for index, base in enumerate(by_length):
        base_title = base['normalized']
        if base['path'] in claimed or not base_title:
            continue

        matches = []
        for candidate in by_length[index + 1:]:
            if candidate['path'] not in claimed and base_title in candidate['normalized']:
                matches.append(candidate)
                claimed.add(candidate['path'])

        if matches:
            claimed.add(base['path'])
            groups.append([base, *matches])

    return groups

def pick_keeper_by_prefix(group):
    """Order a duplicate group so the preferred file comes first.

    Ranking by filename prefix: a two-digit number followed by whitespace
    ('04 Title') is best, a disc-track prefix ('4-04 Title') comes next, and
    anything else is last. Files with equal rank keep their input order.

    Returns a new sorted list; the input list is not modified.
    """
    def rank(entry):
        name = entry['filename']
        if re.match(r'^\d+-\d+', name):
            return 1
        return 0 if re.match(r'^\d{2}\s', name) else 2

    ordered = list(group)
    ordered.sort(key=rank)
    return ordered

# ─── Main ───────────────────────────────────────────────────────────────────

def _rank_partial_group(group):
    """Order a partial-title group best-first: shortest title, then simplest prefix."""
    # pick_keeper_by_prefix orders by prefix style; the stable sort on title
    # length then makes length the primary key and prefix style the tie-break.
    return sorted(pick_keeper_by_prefix(group), key=lambda f: len(f['normalized']))


def main():
    """Command-line entry point: scan a music directory and report duplicates.

    Artist folders directly under ``music_dir`` are grouped by normalized
    name, and both detection passes run on each group's files. Without
    ``--delete`` only a preview is printed; with it, every file listed as
    DELETE is removed.

    Keeper selection depends on the pass: track-prefix groups keep the file
    with the simplest prefix, partial-title groups keep the shortest title.
    A file already marked for deletion by an earlier group is dropped from
    later groups, so it is reported, counted and removed only once.
    """
    parser = argparse.ArgumentParser(
        description='Find and remove duplicate music files.'
    )
    parser.add_argument('music_dir', help='Path to your music directory')
    parser.add_argument('--delete', action='store_true',
                        help='Actually delete duplicates (default is preview only)')
    args = parser.parse_args()

    music_dir = os.path.abspath(args.music_dir)
    if not os.path.isdir(music_dir):
        # Exit with a usage error instead of a traceback from os.scandir.
        parser.error(f"not a directory: {music_dir}")

    print(f"Scanning: {music_dir}")
    print(f"Mode: {'DELETE' if args.delete else 'PREVIEW ONLY'}\n")

    artist_groups = group_artist_folders(music_dir)
    total_would_delete = 0
    total_deleted = 0

    for group_folders in sorted(artist_groups, key=lambda g: os.path.basename(g[0]).lower()):

        # Collect all files across all folders in this artist group
        all_files = []
        for folder in group_folders:
            all_files.extend(collect_files(folder))

        if not all_files:
            continue

        # Run both detection passes; each pass ranks its groups keeper-first
        # with its own policy.
        ranked_groups = [
            pick_keeper_by_prefix(g) for g in find_track_prefix_dupes(all_files)
        ]
        ranked_groups.extend(
            _rank_partial_group(g) for g in find_partial_title_dupes(all_files)
        )

        if not ranked_groups:
            continue

        artist_label = ' / '.join(os.path.basename(f) for f in group_folders)

        # Flag if this group has multiple artist folders (name normalization hit)
        if len(group_folders) > 1:
            print(f"Artist (merged): {artist_label}")
        else:
            print(f"Artist: {artist_label}")

        # Paths already listed for deletion within this artist group. The two
        # passes can report the same files, so later groups skip them.
        marked = set()

        for ranked in ranked_groups:
            remaining = [f for f in ranked if f['path'] not in marked]
            if len(remaining) < 2:
                continue

            keeper, to_delete = remaining[0], remaining[1:]

            print(f" KEEP: {keeper['bare']}{keeper['ext']}")
            print(f" ({keeper['path']})")

            for dup in to_delete:
                marked.add(dup['path'])
                print(f" DELETE: {dup['bare']}{dup['ext']}")
                print(f" ({dup['path']})")
                total_would_delete += 1

                if args.delete:
                    try:
                        os.remove(dup['path'])
                    except OSError as e:
                        # Best effort: report the failure and keep going.
                        print(f" ERROR: {e}")
                    else:
                        total_deleted += 1
        print()

    if args.delete:
        print(f"Done. Deleted {total_deleted} file(s).")
    else:
        print(f"Preview complete. {total_would_delete} file(s) would be deleted.")
        print("Run with --delete to actually remove them.")

# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
 main()