changeset 116:31c27f4f3bbc

Speed up scanning: digest only the first 100 KiB of files above a 100 KiB size threshold, and show the current filename in the progress output.
author David Bryant <bagnose@gmail.com>
date Sun, 17 Apr 2011 23:23:29 +0930
parents d7330cc52622
children c566cdbccaeb
files doodle/utils/prog/duplicates.d
diffstat 1 files changed, 19 insertions(+), 5 deletions(-)
--- a/doodle/utils/prog/duplicates.d	Sat Apr 16 19:48:33 2011 +0930
+++ b/doodle/utils/prog/duplicates.d	Sun Apr 17 23:23:29 2011 +0930
@@ -15,6 +15,14 @@
 // and then informing the user of files with duplicate content.
 // Only duplicate files over a certain size are reported.
 
+// Thoughts:
+// Apply a size threshold so we only consider files large enough to care about.
+// Accumulate an array of all files above that threshold.
+// Only files with matching sizes can possibly be duplicates,
+// so bucket the candidate files by size.
+// Then compute the md5sum of the first N bytes of each file whose
+// size is shared by at least one other file.
+
 class DuplicateFinder {
     this(in string[] dirs) {
         // First pass to gather the number of files and bytes
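
The comment block added above amounts to a two-stage filter: first bucket
files by size, then hash only files whose size is shared. A minimal sketch
of the bucketing stage in present-day D (collectBySize is a hypothetical
helper; the threshold parameter mirrors the SIZE_THRESHOLD constant below):

    import std.file : DirEntry, SpanMode, dirEntries;

    // Hypothetical helper: group candidate files by size. Only files of
    // equal size can possibly have identical content.
    string[][ulong] collectBySize(string dir, ulong sizeThreshold) {
        string[][ulong] bySize;
        foreach (DirEntry entry; dirEntries(dir, SpanMode.depth)) {
            if (entry.isFile && entry.size >= sizeThreshold) {
                bySize[entry.size] ~= entry.name;
            }
        }
        return bySize;
    }

Only buckets holding two or more names need any hashing at all; a file with
a unique size cannot have a duplicate.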
@@ -116,8 +124,8 @@
             string[] names;
         };
 
-        //static const ulong SIZE_THRESHOLD = 1_000;
-        static const ulong SIZE_THRESHOLD = 0;
+        static const ulong SIZE_THRESHOLD = 100 * 1_024;
+        static const ulong AMOUNT_SUMMED  = 100 * 1_024;
 
         bool[ubyte[16]]     _duplicate_digests;             // set of all duplicate digests
         FileInfo[ubyte[16]] _file_info_map;                 // map of digest to file info
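
With these constants the scan ignores files under 100 * 1_024 = 102_400
bytes (100 KiB) outright, and digests at most the first 100 KiB of each
remaining file. Note that equal prefixes do not prove equal content: two
files agreeing on their first 100 KiB could still differ later, so a
prefix-digest match marks a duplicate candidate rather than a confirmed
duplicate. This revision does not add a full-content confirmation pass.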
@@ -131,7 +139,7 @@
                 _current_byte += bytes;
                 double progress = cast(double)_current_byte / cast(double)_total_bytes;
                 if (progress - _last_progress > 0.0005) {
-                    writef("\rProgress %.1f%%", 100.0 * progress);
+                    writef("\rProgress %.1f%%  %s           ", 100.0 * progress, filename);
                     std.stdio.stdout.flush();
                     _last_progress = progress;
                 }
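
The progress line relies on '\r' returning the cursor to column 0 without
erasing anything, so the run of trailing spaces after %s is what blanks out
the tail of a previously printed, longer filename. A minimal sketch of the
same idiom (showProgress is a hypothetical helper):

    import std.stdio : stdout, writef;

    void showProgress(double progress, string filename) {
        // '\r' rewinds the cursor but does not clear the line, so pad
        // with spaces to cover remnants of the previous, longer name.
        writef("\rProgress %.1f%%  %s           ", 100.0 * progress, filename);
        stdout.flush();
    }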
@@ -150,10 +158,16 @@
                 context.start();
                 { // Block 1:
                     // Compute the actual digest
-                    foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
+                    ulong amount = 0;
+                    foreach (ubyte[] buffer; chunks(file, 1024)) {
                         context.update(buffer);
-                        bytes_chewed(buffer.length);
+                        //bytes_chewed(buffer.length);
+                        amount += buffer.length;
+                        if (amount >= AMOUNT_SUMMED) {
+                            break;
+                        }
                     }
+                    bytes_chewed(size);
                 }
                 context.finish(digest);
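
The rewritten loop reads 1 KiB chunks and stops once AMOUNT_SUMMED bytes
have been fed to the digest; bytes_chewed(size) then credits the file's
whole size to the progress counter, since the skipped remainder is never
read. A self-contained sketch of the same prefix-hashing idea, using the
current std.digest.md API rather than the older std.md5 context seen in
the diff (partialDigest is a hypothetical helper):

    import std.digest.md : MD5;
    import std.stdio : File;

    // Hypothetical helper: digest only the first `limit` bytes of a file.
    // Files whose prefix digests collide are duplicate candidates.
    ubyte[16] partialDigest(string path, ulong limit) {
        MD5 md5;
        md5.start();
        ulong amount = 0;
        auto file = File(path, "rb");
        foreach (ubyte[] chunk; file.byChunk(1024)) {
            md5.put(chunk);
            amount += chunk.length;
            if (amount >= limit) break; // stop once the prefix is hashed
        }
        return md5.finish();
    }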