changeset 121:f1cf62339ed5

More tweaking
author David Bryant <>
date Sun, 24 Apr 2011 16:42:26 +0930
parents c275f26399c6
children 403c34305a39
files doodle/utils/prog/dupes.d
diffstat 1 files changed, 119 insertions(+), 51 deletions(-) [+]
line wrap: on
line diff
--- a/doodle/utils/prog/dupes.d	Fri Apr 22 00:06:07 2011 +0930
+++ b/doodle/utils/prog/dupes.d	Sun Apr 24 16:42:26 2011 +0930
@@ -1,33 +1,83 @@
 import std.stdio;
 import std.string;
 import std.exception;
+import std.algorithm;
 import std.file;
 import std.md5;
 import std.getopt;
 import std.conv;
+import std.ctype;
 import std.c.stdlib;
+ulong string_to_size(string s) {
+    // Convert strings to sizes, eg:
+    //   "50"   -> 50
+    //   "80B"  -> 80
+    //   "10K"  -> 10240
+    //   "1M"   -> 1048576
+    // Throws ConvException
+    immutable map = [ 'B':1UL, 'K':1UL<<10, 'M':1UL<<20, 'G':1UL<<30, 'T':1UL<<40 ];
+    if (s.length == 0) {
+        throw new ConvException("Empty string");
+    }
+    else {
+        ulong multiplier = 1;
+        if (isalpha(s[$-1])) {
+            immutable ulong * m = (s[$-1] in map);
+            if (m) {
+                multiplier = *m;
+            }
+            else {
+                throw new ConvException(format("Bad size unit character: %s", s[$-1]));
+            }
+            s = s[0..$-1];
+        }
+        return multiplier * to!ulong(s);
+    }
+string size_to_string(in ulong size) {
+    /+
+    immutable array = [ 'B', 'K', 'M', 'G', 'T' ];
+    size_t index = 0;
+    foreach (i, c; array) {
+        if (size / (1UL << i
+        writefln("%s %s", i, c);
+    }
+    +/
+    return format("%sK", size / 1024);
 void find_duplicates(in string[] dirs,
                      in ulong    file_size,
                      in ulong    digest_size,
                      bool        verbose) {
-    static ubyte[16] compute_md5(in string name, in ulong max_bytes) {
+    static ubyte[16] compute_md5(in string filename, in ulong max_bytes) {
+        size_t chunk_size = min(max_bytes, 4096 * 1024);
         ubyte[16] digest;
-        auto file = File(name, "r");
+        auto file = File(filename, "r");
         scope(exit) file.close;
         MD5_CTX context;
         ulong byte_count = 0;
-        foreach (ubyte[] buffer; chunks(file, 1024)) {
+        foreach (ubyte[] buffer; chunks(file, chunk_size)) {
             byte_count += buffer.length;
             if (byte_count >= max_bytes) {
         return digest;
@@ -42,30 +92,34 @@
     writefln("Accumulating file list");
-    string last_name;
     foreach (string dir; dirs) {
-        try {
-            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-                last_name = name;
-                try {
-                    if (!isSymLink(name) && isFile(name)) {
-                        ulong size = getSize(name);
-                        if (size >= file_size) {
-                            file_array ~= FileInfo(name, size);
+        if (isDir(dir)) {
+            string last_entry;
+            try {
+                foreach (string filename; dirEntries(dir, SpanMode.depth, false)) {
+                    last_entry = filename;
+                    try {
+                        if (!isSymLink(filename) && isFile(filename)) {
+                            ulong size = getSize(filename);
+                            if (size >= file_size) {
+                                file_array ~= FileInfo(filename, size);
+                            }
-                }
-                catch (Exception ex) {
-                    writefln("Skipping %s", name);
-                    //writefln("Exception %s", ex);
-                    // TODO accumulate errors and print after traversal
+                    catch (Exception ex) {
+                        writefln("Skipping %s", filename);
+                        //writefln("Exception %s", ex);
+                        // TODO accumulate errors and print after traversal
+                    }
+            catch (FileException ex) {
+                // ignore
+                writefln("Error, dirEntries bailed out after: %s. Continuing anyway", last_entry);
+            }
-        catch (FileException ex) {
-            // ignore
-            writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
+        else {
+            writefln("Not a dir: %s", dir);
@@ -93,7 +147,9 @@
     writefln("Number of files of duplicate size %s", duplicate_sizes.length);
-    foreach (size; duplicate_sizes.keys) {
+    ulong total_waste = 0;
+    foreach_reverse (size; duplicate_sizes.keys.sort) {
         uint[] indices = size_to_file_indices[size];
         //writefln("For size %s there are %s files", size, indices.length);
@@ -125,49 +181,61 @@
         foreach (indices2; digest_to_indices) {
             if (indices2.length > 1) {
                 // List the duplicates
-                foreach (index; indices) {
+                foreach (i, index; indices) {
                     FileInfo file_info = file_array[index];
-                    writefln("%s %s", file_info.size, file_info.name);
+                    if (i == 0) {
+                        writefln("%s", size_to_string(file_info.size));
+                        total_waste += file_info.size;
+                    }
+                    writefln("    %s", file_info.name);
-    writefln("Done");
+    writefln("Done, total waste: %s", size_to_string(total_waste));
 int main(string[] args) {
-    immutable ulong KILO = 1 << 10;
-    immutable ulong MEGA = 1 << 20;
-    immutable ulong GIGA = 1 << 30;
+    ulong file_size;
+    ulong digest_size;
+    bool  verbose;
-    /*
-    static ulong parse_size_string(in string[] s) {
-        if (s.length == 0) {
-            throw new ConvException
+    try {
+        void help(in string) {
+            writefln("Usage: dupes [OPTION]... DIR...\n"
+                     "Recursively locate duplicate files in a list of directories\n"
+                     "\n"
+                     "Options\n"
+                     " -d, --digest-size=SIZE     size of digest used for comparison\n"
+                     " -f, --file-size=SIZE       minimum size of files searched for duplication\n"
+                     " -v, --verbose              be verbose\n"
+                     "     --help                 display this help and exit\n"
+                     "\n"
+                     "SIZE is an integer, optionally followed by K, M, G, T");
+            exit(1);
+        string file_size_string   = "100K";
+        string digest_size_string = "100K";
+        getopt(args,
+               "file-size|f",   &file_size_string,
+               "digest-size|d", &digest_size_string,
+               "verbose|v",     &verbose,
+               "help",          &help);
+        file_size   = string_to_size(file_size_string);
+        digest_size = string_to_size(digest_size_string);
-    */
-    void help(in string) {
-        writefln("Help");
-        exit(1);
+    catch (ConvException ex) {
+        writefln("Conversion error: %s", ex);
+        exit(2);
-    ulong file_size   = 100 * KILO;
-    ulong digest_size =  10 * KILO;
-    bool  verbose     = false;
-    try {
-         getopt(args,
-                "file-size|f",   &file_size,
-                "digest-size|d", &digest_size,
-                "verbose|v",     &verbose,
-                "help|h",        &help);
-    }
-    catch (ConvException ex) {
+    if (verbose) {
+        writefln("file-size=%s, digest-size=%s", size_to_string(file_size), size_to_string(digest_size));
     find_duplicates(args[1..$], file_size, digest_size, verbose);