changeset 113:9cc6c428fdbe

Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls that are no longer needed. Memory usage is still excessive.
author David Bryant <bagnose@gmail.com>
date Thu, 14 Apr 2011 19:10:46 +0930
parents b569d7d5064f
children b87e2e0a046a
files doodle/utils/prog/duplicates.d
diffstat 1 files changed, 80 insertions(+), 34 deletions(-) [+]
line wrap: on
line diff
--- a/doodle/utils/prog/duplicates.d	Thu Apr 14 11:27:17 2011 +0930
+++ b/doodle/utils/prog/duplicates.d	Thu Apr 14 19:10:46 2011 +0930
@@ -1,5 +1,7 @@
 import std.stdio;
 import std.string;
+import std.exception;
+import std.algorithm;
 import std.file;
 import std.c.stdio;
 import std.c.string;
@@ -9,19 +11,71 @@
 
 class DuplicateFinder {
     this(in string dir) {
-        recurse_directory(dir.dup);
+        // First pass: walk the tree under dir, counting regular files and summing their sizes into _total_bytes (the denominator used by bytes_chewed for progress output)
+
+        writeln("Accumulating total bytes / files");
+
+        uint total_files = 0;
 
-        writefln("\n");
+        try {
+            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
+                try {
+                    if (isFile(name)) {
+                        _total_bytes += getSize(name);
+                        ++total_files;
+                    }
+                }
+                catch (Exception ex) {  // a failure on one entry only drops that entry from the totals
+                    writefln("Skipping %s", name);
+                    //writefln("Exception %s", ex);
+                }
+            }
+        }
+        catch (FileException ex) {
+            // Best effort: the traversal itself failed; report it and carry on with whatever was counted so far
+            writefln("dirEntries bailed out. Continuing anyway");
+        }
+
+        writefln("Files %s, bytes %s", total_files, _total_bytes);
+        writeln("Accumulating MD5 sums");
 
-        foreach (digest; _duplicate_digests.keys) {
-            writefln("%s", digestToString(digest));
+        foreach (string name; dirEntries(dir, SpanMode.depth, false)) {  // Second pass: hash every regular file. NOTE(review): unlike the first pass, this traversal and the isFile() call below are outside any try block - confirm that is intended
+            if (isFile(name)) {
+                try {
+                    //writefln("MD5'ing %s", name);
+                    compute_md5(name);
+                }
+                catch (ErrnoException ex) {  // silently skipped for now; only ErrnoException is handled here
+                    //writefln("Skipping file: %s, %s", name, ex);
+                    //writefln("(errno) Skipping file: %s", name);
+                    // TODO accumulate errors and print after traversal is complete
+                }
+            }
+        }
+
+        writefln("");  // ends the "\rProgress ..." line that bytes_chewed keeps rewriting
+
+        writeln("Sorting keys");
+
+        ubyte[16][] keys = _duplicate_digests.keys;  // only digests recorded in the duplicate set are reported
+        bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; }  // "greater than" as the ordering predicate: largest files first
+        sort!(compare_by_size)(keys);
+
+        writeln("Printing results");
+
+        foreach (digest; keys) {
             auto file_info = _file_info_map[digest];
+            /*
             writefln("Size %s, Count %s, Digest %s",
                      file_info.size, file_info.names.length, digestToString(digest));
+                     */
+            writefln("Size %s, Count %s", file_info.size, file_info.names.length);  // one header line per digest, then each file name tab-indented
             foreach (name; file_info.names) {
                 writefln("\t%s", name);
             }
         }
+
+        writeln("Done");
     }
 
     private {
@@ -41,9 +95,24 @@
         bool[ubyte[16]]     _duplicate_digests;             // set of all duplicate digests
         FileInfo[ubyte[16]] _file_info_map;                 // map of digest to file info
 
-        void compute_md5(in char[] filename, in ulong filesize) {
+        ulong               _total_bytes;
+        ulong               _current_byte;
+        double              _last_progress = -1.0;
+
+        void bytes_chewed(ulong bytes) {  // Record that `bytes` more bytes were hashed and redraw the progress line when it has advanced noticeably
+            _current_byte += bytes;
+            if (_total_bytes == 0) return;  // first pass counted nothing: no meaningful percentage, and dividing would print inf/nan
+            double progress = cast(double)_current_byte / cast(double)_total_bytes;  // fraction of the first-pass byte total processed so far
+            if (progress - _last_progress > 0.0005) {  // throttle output: redraw only after a further 0.05% of the total
+                writef("\rProgress %3.1f%%", 100.0 * progress);  // "\r" rewrites the same console line in place
+                std.stdio.stdout.flush();  // no newline is written, so flush explicitly to make the update visible
+                _last_progress = progress;
+            }
+        }
+
+        void compute_md5(in string filename) {
             //writefln("%s", filename);
-            auto file = File(filename.idup, "r");
+            auto file = File(filename, "r");
             scope(exit) file.close;
 
             ubyte[16] digest;
@@ -51,14 +120,16 @@
             MD5_CTX context;
             context.start();
             foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
+                bytes_chewed(buffer.length);
                 context.update(buffer);
             }
             context.finish(digest);
-            writefln("%s: %s", digestToString(digest), filename);
+            //writefln("%s: %s", digestToString(digest), filename);
 
             if (FileInfo * file_info = (digest in _file_info_map)) {
                 // duplicate
-                file_info.names ~= filename.idup;
+                file_info.names ~= filename;
+                assert(file_info.names.length > 1);
 
                 if (file_info.size >= SIZE_THRESHOLD) {
                     _duplicate_digests[digest] = true;
@@ -66,35 +137,10 @@
             }
             else {
                 // unseen
-                _duplicate_digests[digest] = true;
-                _file_info_map[digest] = FileInfo(filesize, filename.idup);
+                _file_info_map[digest] = FileInfo(getSize(filename), filename);
                 //writefln("%s", _file_info_map.length);
             }
         }
-
-        bool entry_callback(DirEntry * de) {
-            //writefln("File: %s", de.name);
-
-            if (de.isdir) {
-                recurse_directory(de.name);
-            }
-            else if (de.isfile) {
-                compute_md5(de.name, de.size);
-            }
-
-            return true;
-        }
-
-        void recurse_directory(in char[] dirname) {
-            //writefln("Dir: %s", dirname);
-
-            try {
-                listdir(dirname, &entry_callback);
-            }
-            catch (FileException ex) {
-                //writefln("Skipping: %s", dirname);
-            }
-        }
     }
 }