changeset 117:c566cdbccaeb

Added dupes, the rewrite.
author David Bryant <bagnose@gmail.com>
date Tue, 19 Apr 2011 23:11:03 +0930
parents 31c27f4f3bbc
children 94233d54e16a
files doodle/utils/prog/dupes.d
diffstat 1 files changed, 163 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doodle/utils/prog/dupes.d	Tue Apr 19 23:11:03 2011 +0930
@@ -0,0 +1,163 @@
+import std.stdio;
+import std.string;
+import std.exception;
+import std.random;
+import std.algorithm;
+import std.file;
+import std.c.stdio;
+import std.c.string;
+import std.cstream;
+import core.sys.posix.dirent;
+import std.md5;
+
+
+class DuplicateFinder {
+    this(in string[] dirs) {
+        writefln("Accumulating files");
+
+        string last_name;
+
+        foreach (string dir; dirs) {
+            try {
+                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
+                    last_name = name;
+                    try {
+                        if (isFile(name)) {
+                            ulong size = getSize(name);
+                            if (size >= SIZE_THRESHOLD) {
+                                _file_array ~= FileInfo(name, size);
+                            }
+                        }
+                    }
+                    catch (Exception ex) {
+                        writefln("Skipping %s", name);
+                        //writefln("Exception %s", ex);
+                        // TODO accumulate errors and print after traversal
+                    }
+                }
+            }
+            catch (FileException ex) {
+                // dirEntries can throw mid-traversal; keep the files gathered so far
+                writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
+            }
+        }
+
+        writefln("Processing %s files", _file_array.length);
+
+        uint[][ulong] size_to_file_indices;
+        bool[ulong]   duplicate_sizes;
+
+        foreach (index, file; _file_array) {
+            //writefln("%s %s %s", index, file.name, file.size);
+
+            if (uint[] * indices = (file.size in size_to_file_indices)) {
+                if (indices.length == 1) {
+                    // Second time we've seen a file of this size,
+                    // record it in the duplicate_sizes array
+                    duplicate_sizes[file.size] = true;
+                }
+
+                (*indices) ~= index;
+            }
+            else {
+                size_to_file_indices[file.size] = [ index ];
+            }
+        }
+
+        writefln("Number of sizes with multiple files %s", duplicate_sizes.length);
+
+        foreach (size; duplicate_sizes.keys) {
+            uint[] indices = size_to_file_indices[size];
+            //writefln("For size %s there are %s files", size, indices.length);
+
+            uint[][ubyte[16]] digest_to_indices;
+
+            foreach (index; indices) {
+                FileInfo file_info = _file_array[index];
+
+                try {
+                    ubyte[16] digest = compute_md5(file_info.name);
+
+                    if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
+                        // Same size AND same (partial) digest:
+                        // treat as a true duplicate of this bucket
+
+                        (*duplicate_indices) ~= index;
+                    }
+                    else {
+                        digest_to_indices[digest] = [ index ];
+                    }
+                }
+                catch (ErrnoException ex) {
+                    //writefln("Skipping: %s", file_info.name);
+                }
+
+                //writefln("\t%s", file_info.name);
+            }
+
+            foreach (indices2; digest_to_indices) {
+                if (indices2.length > 1) {
+                    // List only this digest group's files (was: all same-size files)
+                    foreach (index; indices2) {
+                        FileInfo file_info = _file_array[index];
+                        writefln("%s %s", file_info.size, file_info.name);
+                    }
+                    writefln("");
+                }
+            }
+        }
+
+        writefln("Done\n");
+    }
+
+    ubyte[16] compute_md5(in string name) {
+        ubyte[16] digest;
+
+        auto file = File(name, "r");
+        scope(exit) file.close;
+
+        MD5_CTX context;
+        context.start();
+        { // Hash only the first MD5_AMOUNT bytes:
+            // a partial digest is enough to discriminate candidates cheaply
+            ulong amount = 0;
+            foreach (ubyte[] buffer; chunks(file, 1024)) {
+                context.update(buffer);
+                //bytes_chewed(buffer.length);
+                amount += buffer.length;
+                if (amount >= MD5_AMOUNT) {
+                    break;
+                }
+            }
+        }
+        context.finish(digest);
+
+        return digest;
+    }
+
+    private {
+        immutable ulong KILO = 1 << 10;
+        immutable ulong MEGA = 1 << 20;
+
+        immutable ulong SIZE_THRESHOLD = 100 * KILO;
+        immutable ulong MD5_AMOUNT     = 10 * KILO;
+
+        struct FileInfo {
+            this(in string name_, in ulong size_) {
+                name = name_;
+                size = size_;
+            }
+
+            string name;
+            ulong  size;
+        };
+
+        FileInfo[] _file_array;
+    }
+}
+
+int main(string[] args) {
+    // Every command-line argument names a directory tree to scan for duplicates.
+    auto finder = new DuplicateFinder(args[1..$]);
+    return 0;
+}