changeset 114:b87e2e0a046a

Cleanup of duplicates.d
author David Bryant <bagnose@gmail.com>
date Fri, 15 Apr 2011 11:07:47 +0930
parents 9cc6c428fdbe
children d7330cc52622
files doodle/utils/prog/duplicates.d
diffstat 1 files changed, 48 insertions(+), 33 deletions(-) [+]
line wrap: on
line diff
--- a/doodle/utils/prog/duplicates.d	Thu Apr 14 19:10:46 2011 +0930
+++ b/doodle/utils/prog/duplicates.d	Fri Apr 15 11:07:47 2011 +0930
@@ -9,25 +9,34 @@
 import core.sys.posix.dirent;
 import std.md5;
 
+// This program recursively processes files in a list
+// of directories, computing an MD5 digest on each file
+// and then informing the user of files with duplicate content.
+// Only duplicate files over a certain size are reported.
+
 class DuplicateFinder {
-    this(in string dir) {
+    this(in string[] dirs) {
         // First pass to gather the number of files and bytes
+        // so that we are able to convey progress to the user
 
         writeln("Accumulating total bytes / files");
 
         uint total_files = 0;
 
         try {
-            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-                try {
-                    if (isFile(name)) {
-                        _total_bytes += getSize(name);
-                        ++total_files;
+            foreach (string dir; dirs) {
+                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
+                    try {
+                        if (isFile(name)) {
+                            _total_bytes += getSize(name);
+                            ++total_files;
+                        }
                     }
-                }
-                catch (Exception ex) {
-                    writefln("Skipping %s", name);
-                    //writefln("Exception %s", ex);
+                    catch (Exception ex) {
+                        writefln("Skipping %s", name);
+                        //writefln("Exception %s", ex);
+                        // TODO accumulate errors and print after traversal
+                    }
                 }
             }
         }
@@ -37,30 +46,42 @@
         }
 
         writefln("Files %s, bytes %s", total_files, _total_bytes);
-        writeln("Accumulating MD5 sums");
+
+        // Go through the files again, but this time
+        // compute the MD5 digests and build our data structures
+
+        writeln("Accumulating MD5 digests");
 
-        foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-            if (isFile(name)) {
-                try {
-                    //writefln("MD5'ing %s", name);
-                    compute_md5(name);
-                }
-                catch (ErrnoException ex) {
-                    //writefln("Skipping file: %s, %s", name, ex);
-                    //writefln("(errno) Skipping file: %s", name);
-                    // TODO accumulate errors and print after traversal is complete
+        foreach (string dir; dirs) {
+            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
+                if (isFile(name)) {
+                    try {
+                        //writefln("MD5'ing %s", name);
+                        compute_md5(name);
+                    }
+                    catch (ErrnoException ex) {
+                        //writefln("Skipping file: %s, %s", name, ex);
+                        //writefln("(errno) Skipping file: %s", name);
+                        // TODO accumulate errors and print after traversal is complete
+                    }
                 }
             }
         }
 
         writefln("");
 
-        writeln("Sorting keys");
+        // Sort our duplicate digests by size so that we print
+        // the biggest duplicate file offenders first
+
+        writeln("Sorting duplicate digests by size");
 
         ubyte[16][] keys = _duplicate_digests.keys;
         bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; }
         sort!(compare_by_size)(keys);
 
+        // Print the results out to the user, in descending order
+        // of file size
+
         writeln("Printing results");
 
         foreach (digest; keys) {
@@ -103,15 +124,13 @@
             _current_byte += bytes;
             double progress = cast(double)_current_byte / cast(double)_total_bytes;
             if (progress - _last_progress > 0.0005) {
-                writef("\rProgress %3.1f%%", 100.0 * progress);
+                writef("\rProgress %.1f%%", 100.0 * progress);
                 std.stdio.stdout.flush();
                 _last_progress = progress;
             }
-
         }
 
         void compute_md5(in string filename) {
-            //writefln("%s", filename);
             auto file = File(filename, "r");
             scope(exit) file.close;
 
@@ -124,30 +143,26 @@
                 context.update(buffer);
             }
             context.finish(digest);
-            //writefln("%s: %s", digestToString(digest), filename);
 
             if (FileInfo * file_info = (digest in _file_info_map)) {
-                // duplicate
+                // This is a duplicate digest; append the subsequent name
                 file_info.names ~= filename;
-                assert(file_info.names.length > 1);
 
+                // Record the duplicate as an offender if its size exceeds the threshold
                 if (file_info.size >= SIZE_THRESHOLD) {
                     _duplicate_digests[digest] = true;
                 }
             }
             else {
-                // unseen
+                // We have not seen this digest before
                 _file_info_map[digest] = FileInfo(getSize(filename), filename);
-                //writefln("%s", _file_info_map.length);
             }
         }
     }
 }
 
 int main(string[] args) {
-    foreach (string arg; args[1..$]) {
-        new DuplicateFinder(arg);
-    }
+    new DuplicateFinder(args[1..$]);
 
     return 0;
 }