# HG changeset patch # User David Bryant # Date 1302831467 -34200 # Node ID b87e2e0a046aeeaf0995754170b6ddab875bf0f7 # Parent 9cc6c428fdbe6dfd036c13d634ac74710c4c03bd Cleanup of duplicates.d diff -r 9cc6c428fdbe -r b87e2e0a046a doodle/utils/prog/duplicates.d --- a/doodle/utils/prog/duplicates.d Thu Apr 14 19:10:46 2011 +0930 +++ b/doodle/utils/prog/duplicates.d Fri Apr 15 11:07:47 2011 +0930 @@ -9,25 +9,34 @@ import core.sys.posix.dirent; import std.md5; +// This program recursively processes files in a list +// of directories, computing an MD5 digest on each file +// and then informing the user of files with duplicate content. +// Only duplicate files over a certain size are reported. + class DuplicateFinder { - this(in string dir) { + this(in string[] dirs) { // First pass to gather the number of files and bytes + // so that we are able to convey progress to the user writeln("Accumulating total bytes / files"); uint total_files = 0; try { - foreach (string name; dirEntries(dir, SpanMode.depth, false)) { - try { - if (isFile(name)) { - _total_bytes += getSize(name); - ++total_files; + foreach (string dir; dirs) { + foreach (string name; dirEntries(dir, SpanMode.depth, false)) { + try { + if (isFile(name)) { + _total_bytes += getSize(name); + ++total_files; + } } - } - catch (Exception ex) { - writefln("Skipping %s", name); - //writefln("Exception %s", ex); + catch (Exception ex) { + writefln("Skipping %s", name); + //writefln("Exception %s", ex); + // TODO accumulate errors and print after traversal + } } } } @@ -37,30 +46,42 @@ } writefln("Files %s, bytes %s", total_files, _total_bytes); - writeln("Accumulating MD5 sums"); + + // Go through the files again, but this time + // compute the MD5 digests and build our data structures + + writeln("Accumulating MD5 digests"); - foreach (string name; dirEntries(dir, SpanMode.depth, false)) { - if (isFile(name)) { - try { - //writefln("MD5'ing %s", name); - compute_md5(name); - } - catch (ErrnoException ex) { - 
//writefln("Skipping file: %s, %s", name, ex); - //writefln("(errno) Skipping file: %s", name); - // TODO accumulate errors and print after traversal is complete + foreach (string dir; dirs) { + foreach (string name; dirEntries(dir, SpanMode.depth, false)) { + if (isFile(name)) { + try { + //writefln("MD5'ing %s", name); + compute_md5(name); + } + catch (ErrnoException ex) { + //writefln("Skipping file: %s, %s", name, ex); + //writefln("(errno) Skipping file: %s", name); + // TODO accumulate errors and print after traversal is complete + } } } } writefln(""); - writeln("Sorting keys"); + // Sort our duplicate digests by size so that we print + // the biggest duplicate file offenders first + + writeln("Sorting duplicate digests by size"); ubyte[16][] keys = _duplicate_digests.keys; bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; } sort!(compare_by_size)(keys); + // Print the results out to the user, in descending order + // of file size + writeln("Printing results"); foreach (digest; keys) { @@ -103,15 +124,13 @@ _current_byte += bytes; double progress = cast(double)_current_byte / cast(double)_total_bytes; if (progress - _last_progress > 0.0005) { - writef("\rProgress %3.1f%%", 100.0 * progress); + writef("\rProgress %.1f%%", 100.0 * progress); std.stdio.stdout.flush(); _last_progress = progress; } - } void compute_md5(in string filename) { - //writefln("%s", filename); auto file = File(filename, "r"); scope(exit) file.close; @@ -124,30 +143,26 @@ context.update(buffer); } context.finish(digest); - //writefln("%s: %s", digestToString(digest), filename); if (FileInfo * file_info = (digest in _file_info_map)) { - // duplicate + // This is a duplicate digest, append the subsequent name file_info.names ~= filename; - assert(file_info.names.length > 1); + // Record the duplicate as an offender if its size meets or exceeds the threshold if (file_info.size >= SIZE_THRESHOLD) { _duplicate_digests[digest] =
true; } } else { - // unseen + // We have not seen this digest before _file_info_map[digest] = FileInfo(getSize(filename), filename); - //writefln("%s", _file_info_map.length); } } } } int main(string[] args) { - foreach (string arg; args[1..$]) { - new DuplicateFinder(arg); - } + new DuplicateFinder(args[1..$]); return 0; }