Sunday, January 25, 2009

perl script for iTunes near-duplicates

The "show duplicates" feature of iTunes is nice, but is fairly strict in its matching. I wrote a perl script that compares all of the words in the song name, artist, and album, then outputs a list of potential duplicates along with the percentage match. Catches situations like "First Time (feat. Robin Beck)" by "Sunblock" against "First Time" by "Sunblock feat. Robin Beck".

#!/usr/bin/perl -w
use strict;

my %tracks = ();

my $tid;
my $name;
my $artist;
my $album;
my $kind;
my $genre;

while (<>) {
if (/Track ID<\/key>(\d+)/) {
$tid = $1;
} elsif (/Name<\/key>([^<]+)/) {
$name = $1;
$name =~ s/[^A-Za-z ]//g;
} elsif (/Artist<\/key>([^<]+)/) {
$artist = $1;
$artist =~ s/[^A-Za-z ]//g;
} elsif (/Album<\/key>([^<]+)/) {
$album = $1;
$album =~ s/[^A-Za-z ]//g;
} elsif (/Kind<\/key>([^<]+)/) {
$kind = $1;
} elsif (/Genre<\/key>([^<]+)/) {
$genre = $1;
} elsif (/<\/dict>/) {
if ($kind eq 'MPEG audio file' && $genre ne 'Podcast') {
$tracks{$tid}{'name'} = $name;
$tracks{$tid}{'artist'} = $artist;
$tracks{$tid}{'album'} = $album;
@{ $tracks{$tid}{'words'} } = uniqit($name, $artist, $album);
#print "($name) in ($album) by ($artist)\n";
}
}
}

my @tid1 = keys %tracks;
foreach my $tid1 (@tid1) {
foreach my $tid2 (keys %tracks) {
next if $tid1 == $tid2;
next if $tracks{$tid1}{'artist'} eq $tracks{$tid2}{'artist'};
next if $tracks{$tid1}{'album'} eq $tracks{$tid2}{'album'};
# compare the list of words for some percentage match
my @w1 = @{ $tracks{$tid1}{'words'}};
my @w2 = @{ $tracks{$tid2}{'words'}};
my $percsame = int(percsame(\@w1, \@w2));
# only print if sufficiently similar
print "$percsame: (" . join(" ", sort @w1) . ") and (" .
join(" ", sort @w2) . ")\n" if $percsame >= 30;
}
# avoid matching this one again
delete $tracks{$tid1};
}

sub uniqit {
my ($a, $b, $c) = @_;
my %words = ();
# split a, b, and c by spaces
# put into hash, return keys()
map { $words{$_} = 1 } split / +/, $a;
map { $words{$_} = 1 } split / +/, $b;
map { $words{$_} = 1 } split / +/, $c;
#print "## Returning: " . join(", ", keys %words) . "\n";
return keys %words;
}

sub percsame {
my ($wl1, $wl2) = @_;
my @words1 = @$wl1;
my @words2 = @$wl2;
#print "percsame: " . join(",", @words1) . " with " . join(",", @words2) . "\n";
my @allwords = (@words1, @words2);
my $nwords = scalar(@allwords);
my %hash = ();
map { $hash{$_}++ } @allwords;
my $same = 0;
foreach my $word (keys %hash) {
++$same if $hash{$word} > 1;
}
return 100 * $same / $nwords;
}