#!/usr/bin/perl
use strict;
use warnings;
# findDupeFiles:
# This script attempts to identify which files might be duplicates.
# It searches specified directories for files with a given suffix
# and reports on files that have the same MD5 digest.
# The suffix or suffixes to be searched for are specified by the first
# command-line argument - each suffix separated from the next by a vertical bar.
# The subsequent command-line arguments specify the directories to be searched.
# If no directories are specified on the command-line,
# it searches the current directory.
# Files whose names start with "._" are ignored.
#
# Cameron Hayne (macdev@hayne.net) January 2006
#
#
# Examples of use:
# ----------------
# findDupeFiles '.aif|.aiff' AAA BBB CCC
# would look for duplicates among all the files with ".aif" or ".aiff" suffixes
# under the directories AAA, BBB, and CCC
#
# findDupeFiles '.aif|.aiff'
# would look for duplicates among all the files with ".aif" or ".aiff" suffixes
# under the current directory
#
# findDupeFiles '' AAA BBB CCC
# would look for duplicates among all the files (no matter what suffix)
# under the directories AAA, BBB, and CCC
#
# findDupeFiles
# would look for duplicates among all the files (no matter what suffix)
# under the current directory
# -----------------------------------------------------------------------------
use File::Find;
use File::stat;
use Digest::MD5;
# makeSuffixMatcher: builds the subroutine used for matching suffixes
# Parameter: the suffix specification - one or more suffixes separated
#            by vertical bars - e.g. ".mp3|.aiff"
# Returns: a reference to a subroutine that returns 1 if the name in $_ ends
#          with one of the suffixes (ignoring case) and 0 otherwise,
#          or undef if the specification contains no suffixes at all
sub makeSuffixMatcher
{
    my ($suffixSpec) = @_;
    my @suffixes = split(/\|/, $suffixSpec);
    return unless @suffixes;

    # The suffixes are literal text, not patterns, so regex metacharacters
    # are quoted. Without this the "." in ".aif" matches any character
    # (so "waif" would be accepted) and a suffix like ".c++" is not even
    # a valid pattern.
    my $alternatives = join('|', map { quotemeta($_) } @suffixes);

    # compiled once; \z anchors at the very end of the name
    my $suffixRegex = qr/(?:$alternatives)\z/i;
    return sub { return /$suffixRegex/ ? 1 : 0 };
}

my $matchSomeSuffix; # reference to a subroutine for matching suffixes
if (defined($ARGV[0]))
{
    # the list of desired suffixes is supplied in $ARGV[0]
    # separated by vertical bars - e.g. ".mp3|.aiff"
    # Note that if $ARGV[0] is '', then no matcher is created
    # and all files will be looked at
    $matchSomeSuffix = makeSuffixMatcher($ARGV[0]);
    shift @ARGV;
}

# if no dirs supplied as command-line args, we search the current directory
my @searchDirs = @ARGV ? @ARGV : ".";

# verify that these are in fact directories
foreach my $dir (@searchDirs)
{
    die "\"$dir\" is not a directory\n" unless -d $dir;
}

# global variable holding hash of arrays of dupes:
# each key is an MD5 digest, each value a reference to an array
# of the file-info hashes recorded for that digest
my %filesByMd5;
# calcMd5: returns the MD5 digest of the given file
# Parameter: the name of the file
# Returns: the digest as a string of hexadecimal digits,
#          or the string "unsupported" if the name refers to a directory
# Dies if the file cannot be opened.
sub calcMd5($)
{
    my ($filename) = @_;
    if (-d $filename)
    {
        # doing MD5 on a directory is not supported
        return "unsupported"; # we need to return something
    }

    # The three-argument form of 'open' takes $filename purely as a name,
    # so names such as "-" or "&foo", or names with leading or trailing
    # whitespace, are opened as ordinary files instead of being read as
    # part of the open mode.
    open(my $fh, '<', $filename)
        or die "Unable to open file \"$filename\": $!\n";
    binmode($fh); # just in case we're on Windows!
    my $md5 = Digest::MD5->new->addfile($fh)->hexdigest;
    close($fh);
    return $md5;
}
# checkFile: invoked from the 'find' routine on each file or directory in turn
# Reads the entry's name from $_ and its directory from $File::Find::dir.
# A plain file that passes the name filters is recorded in %filesByMd5
# under its MD5 digest, together with its directory, name and size.
# A file that cannot be examined is reported on STDERR and skipped.
sub checkFile()
{
    my $filename = $_;
    return unless -f $filename; # only interested in files, not directories
    return if $filename =~ /^\._/; # ignore files whose names start with "._"
    if (defined($matchSomeSuffix))
    {
        # the matcher tests the name held in $_
        return unless $matchSomeSuffix->();
    }

    my $dirname = $File::Find::dir;
    my $statInfo = stat($filename);
    unless ($statInfo)
    {
        warn "Can't stat file \"$dirname/$filename\": $!\n";
        return;
    }

    # calcMd5 dies when the file cannot be opened. Trap that here so that
    # one unreadable file is skipped, just like a file that cannot be
    # stat'ed, instead of aborting the whole traversal.
    my $md5 = eval { calcMd5($filename) };
    unless (defined($md5))
    {
        warn "Skipping \"$dirname/$filename\": $@";
        return;
    }

    my $fileInfo = {
        'dirname' => $dirname,
        'filename' => $filename,
        'size' => $statInfo->size,
        'md5' => $md5,
    };
    push(@{$filesByMd5{$md5}}, $fileInfo);
}
MAIN:
{
    # traverse the directories and build up lists of dupes in %filesByMd5
    find(\&checkFile, @searchDirs);

    # for each set of dupes, print the full path to the files
    # (the sets are visited in sorted digest order so that the order of
    # the sets does not depend on the hash's internal ordering)
    my $numDupes = 0;
    my $numDupeBytes = 0;
    foreach my $key (sort keys %filesByMd5)
    {
        my @dupList = @{$filesByMd5{$key}};
        my $numCopies = scalar(@dupList);
        next unless $numCopies > 1;

        my $size = -1; # -1 means "size of this set not yet known"
        foreach my $fileInfo (@dupList)
        {
            my $dirname = $fileInfo->{dirname};
            my $filename = $fileInfo->{filename};
            if ($size == -1)
            {
                $size = $fileInfo->{size};
            }
            elsif ($size != $fileInfo->{size}) # sanity check
            {
                # reported on STDERR so that these messages do not end up
                # among the "rm" lines when STDOUT is redirected to a file
                warn "File sizes don't match!\n";
                warn "previous: $size current: $fileInfo->{size}\n";
            }
            # Every copy in the set gets a line, including the first one.
            # NOTE(review): the path is printed as-is, without any quoting;
            # names containing spaces or shell metacharacters will need
            # attention before the output is run as a script.
            print "rm $dirname/$filename\n";
        }
        #print "----------\n";
        $numDupes += ($numCopies - 1);
        $numDupeBytes += ($size * ($numCopies - 1));
    }

    # totals are only used by the summary lines below, which are disabled
    my $numDupeMegabytes = sprintf("%.1f", $numDupeBytes / (1024 * 1024));
    #print "Number of duplicate files: $numDupes\n";
    #print "Megabytes duplicated: $numDupeMegabytes\n";
}
I used it to match test-case output against a predefined set and find out which cases failed. I modified the script to remove the "----------" separator lines after every match and to prepend "rm" to every file name, so that I can redirect the output to a .bat file and remove the common files.
Usage:
perl findDupeFiles.pl '.aif|.aiff' AAA BBB CCC > rm.bat