# HG changeset patch
# User David Bryant
# Date 1303220463 -34200
# Node ID c566cdbccaeb3ad2b392611470710bcc69253ad9
# Parent 31c27f4f3bbc48663eb49a61ab27e97fdb87b795
Added dupes, the rewrite.

diff -r 31c27f4f3bbc -r c566cdbccaeb doodle/utils/prog/dupes.d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doodle/utils/prog/dupes.d	Tue Apr 19 23:11:03 2011 +0930
@@ -0,0 +1,163 @@
+import std.stdio;
+import std.string;
+import std.exception;
+import std.random;
+import std.algorithm;
+import std.file;
+import std.c.stdio;
+import std.c.string;
+import std.cstream;
+import core.sys.posix.dirent;
+import std.md5;
+
+
+class DuplicateFinder {
+    this(in string[] dirs) {
+        writefln("Accumulating files");
+
+        string last_name;
+
+        // Recursively collect every regular file at or above the size threshold
+        foreach (string dir; dirs) {
+            try {
+                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
+                    last_name = name;
+                    try {
+                        if (isFile(name)) {
+                            ulong size = getSize(name);
+                            if (size >= SIZE_THRESHOLD) {
+                                _file_array ~= FileInfo(name, size);
+                            }
+                        }
+                    }
+                    catch (Exception ex) {
+                        writefln("Skipping %s", name);
+                        //writefln("Exception %s", ex);
+                        // TODO accumulate errors and print after traversal
+                    }
+                }
+            }
+            catch (FileException ex) {
+                // ignore
+                writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
+            }
+        }
+
+        writefln("Processing %s files", _file_array.length);
+
+        // Group files by size; only sizes seen more than once can hold duplicates
+        uint[][ulong] size_to_file_indices;
+        bool[ulong]   duplicate_sizes;
+
+        foreach (index, file; _file_array) {
+            //writefln("%s %s %s", index, file.name, file.size);
+
+            if (uint[]* indices = (file.size in size_to_file_indices)) {
+                if (indices.length == 1) {
+                    // Second time we've seen a file of this size,
+                    // record it in the duplicate_sizes array
+                    duplicate_sizes[file.size] = true;
+                }
+
+                (*indices) ~= index;
+            }
+            else {
+                size_to_file_indices[file.size] = [ index ];
+            }
+        }
+
+        writefln("Number of sizes with multiple files %s", duplicate_sizes.length);
+
+        // Within each size group, compare MD5 digests to find true duplicates
+        foreach (size; duplicate_sizes.keys) {
+            uint[] indices = size_to_file_indices[size];
+            //writefln("For size %s there are %s files", size, indices.length);
+
+            uint[][ubyte[16]] digest_to_indices;
+
+            foreach (index; indices) {
+                FileInfo file_info = _file_array[index];
+
+                try {
+                    ubyte[16] digest = compute_md5(file_info.name);
+
+                    if (uint[]* duplicate_indices = (digest in digest_to_indices)) {
+                        // A true duplicate:
+                        // same size and same digest as a file seen earlier
+                        (*duplicate_indices) ~= index;
+                    }
+                    else {
+                        digest_to_indices[digest] ~= index;
+                    }
+                }
+                catch (ErrnoException ex) {
+                    //writefln("Skipping: %s", file_info.name);
+                }
+
+                //writefln("\t%s", file_info.name);
+            }
+
+            foreach (indices2; digest_to_indices) {
+                if (indices2.length > 1) {
+                    // List the duplicates
+                    foreach (index; indices2) {
+                        FileInfo file_info = _file_array[index];
+                        writefln("%s %s", file_info.size, file_info.name);
+                    }
+                    writefln("");
+                }
+            }
+        }
+
+        writefln("Done\n");
+    }
+
+    ubyte[16] compute_md5(in string name) {
+        ubyte[16] digest;
+
+        auto file = File(name, "r");
+        scope(exit) file.close;
+
+        MD5_CTX context;
+        context.start();
+        { // Block 1:
+            // Compute the digest of at most the first MD5_AMOUNT bytes
+            ulong amount = 0;
+            foreach (ubyte[] buffer; chunks(file, 1024)) {
+                context.update(buffer);
+                //bytes_chewed(buffer.length);
+                amount += buffer.length;
+                if (amount >= MD5_AMOUNT) {
+                    break;
+                }
+            }
+        }
+        context.finish(digest);
+
+        return digest;
+    }
+
+    private {
+        immutable ulong KILO = 1 << 10;
+        immutable ulong MEGA = 1 << 20;
+
+        immutable ulong SIZE_THRESHOLD = 100 * KILO;
+        immutable ulong MD5_AMOUNT     = 10 * KILO;
+
+        struct FileInfo {
+            this(in string name_, in ulong size_) {
+                name = name_;
+                size = size_;
+            }
+
+            string name;
+            ulong  size;
+        };
+
+        FileInfo[] _file_array;
+    }
+}
+
+int main(string[] args) {
+    new DuplicateFinder(args[1..$]);
+
+    return 0;
+}
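
Usage sketch (not part of the changeset; the commands below are an assumption based
on the main() signature, using a D2 toolchain contemporary with std.md5):

    dmd dupes.d
    ./dupes ~/photos /mnt/backup/photos

Each command-line argument is treated as a directory root. Files of 100 KiB or more
are grouped by size, and the MD5 digest of the first 10 KiB of each same-sized
candidate is compared to report likely duplicates.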