changeset 115:d7330cc52622

Added instructions to duplicates.d on the smallest changes required to trigger (or avoid) the memory blowout. Interestingly, the blowout only occurs when compiled with -m32, not with -m64.
author David Bryant <bagnose@gmail.com>
date Sat, 16 Apr 2011 19:48:33 +0930
parents b87e2e0a046a
children 31c27f4f3bbc
files doodle/utils/prog/duplicates.d doodle/utils/prog/md5_test.d
diffstat 2 files changed, 108 insertions(+), 36 deletions(-) [+]
line wrap: on
line diff
--- a/doodle/utils/prog/duplicates.d	Fri Apr 15 11:07:47 2011 +0930
+++ b/doodle/utils/prog/duplicates.d	Sat Apr 16 19:48:33 2011 +0930
@@ -1,6 +1,7 @@
 import std.stdio;
 import std.string;
 import std.exception;
+import std.random;
 import std.algorithm;
 import std.file;
 import std.c.stdio;
@@ -54,16 +55,19 @@
 
         foreach (string dir; dirs) {
             foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-                if (isFile(name)) {
-                    try {
+                try {
+                    if (isFile(name)) {
                         //writefln("MD5'ing %s", name);
-                        compute_md5(name);
+                        compute_md5(name, getSize(name));
                     }
-                    catch (ErrnoException ex) {
-                        //writefln("Skipping file: %s, %s", name, ex);
-                        //writefln("(errno) Skipping file: %s", name);
-                        // TODO accumulate errors and print after traversal is complete
-                    }
+                }
+                catch (FileException ex) {
+                    writefln("Skipping %s", name);
+                }
+                catch (ErrnoException ex) {
+                    //writefln("Skipping file: %s, %s", name, ex);
+                    //writefln("(errno) Skipping file: %s", name);
+                    // TODO accumulate errors and print after traversal is complete
                 }
             }
         }
@@ -84,6 +88,8 @@
 
         writeln("Printing results");
 
+        writefln("Number of duplicate files: %s", _duplicate_digests.length);
+
         foreach (digest; keys) {
             auto file_info = _file_info_map[digest];
             /*
@@ -120,43 +126,70 @@
         ulong               _current_byte;
         double              _last_progress = -1.0;
 
-        void bytes_chewed(ulong bytes) {
-            _current_byte += bytes;
-            double progress = cast(double)_current_byte / cast(double)_total_bytes;
-            if (progress - _last_progress > 0.0005) {
-                writef("\rProgress %.1f%%", 100.0 * progress);
-                std.stdio.stdout.flush();
-                _last_progress = progress;
+        void compute_md5(in string filename, in ulong size) {
+            void bytes_chewed(ulong bytes) {
+                _current_byte += bytes;
+                double progress = cast(double)_current_byte / cast(double)_total_bytes;
+                if (progress - _last_progress > 0.0005) {
+                    writef("\rProgress %.1f%%", 100.0 * progress);
+                    std.stdio.stdout.flush();
+                    _last_progress = progress;
+                }
             }
-        }
-
-        void compute_md5(in string filename) {
-            auto file = File(filename, "r");
-            scope(exit) file.close;
 
             ubyte[16] digest;
 
-            MD5_CTX context;
-            context.start();
-            foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
-                bytes_chewed(buffer.length);
-                context.update(buffer);
-            }
-            context.finish(digest);
+            // If Block 1 and Block 2 are both uncommented then there is a memory explosion.
+            // However, if either one is commented out then there isn't...
+
+            {
+                auto file = File(filename, "r");
+                scope(exit) file.close;
+
+                MD5_CTX context;
+                context.start();
+                { // Block 1:
+                    // Compute the actual digest
+                    foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
+                        context.update(buffer);
+                        bytes_chewed(buffer.length);
+                    }
+                }
+                context.finish(digest);
 
-            if (FileInfo * file_info = (digest in _file_info_map)) {
-                // This is a duplicate digest, append the subsequent name
-                file_info.names ~= filename;
+                /+
+                { // Block 1 alternative:
+                    // Create a random digest
+                    digest = make_random_digest;
+                    bytes_chewed(size);
+                }
+                +/
+            }
 
-                // Record the duplicate as an offender if its size exceeds the threshold
-                if (file_info.size >= SIZE_THRESHOLD) {
-                    _duplicate_digests[digest] = true;
+            { // Block 2:
+                // Update the data structures
+                if (FileInfo * file_info = (digest in _file_info_map)) {
+                    // This is a duplicate digest, append the subsequent name
+                    file_info.names ~= filename;
+
+                    // Record the duplicate as an offender if its size exceeds the threshold
+                    if (file_info.size >= SIZE_THRESHOLD) {
+                        _duplicate_digests[digest] = true;
+                    }
+                }
+                else {
+                    // We have not seen this digest before
+                    _file_info_map[digest] = FileInfo(size, filename);
                 }
             }
-            else {
-                // We have not seen this digest before
-                _file_info_map[digest] = FileInfo(getSize(filename), filename);
+        }
+
+        ubyte[16] make_random_digest() {
+            ubyte[16] digest;
+            foreach (ref a; digest) {
+                a = cast(ubyte)uniform(0, 256);
             }
+            return digest;
         }
     }
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doodle/utils/prog/md5_test.d	Sat Apr 16 19:48:33 2011 +0930
@@ -0,0 +1,39 @@
+import std.md5;
+import std.stdio;
+import std.file;
+import std.exception;
+
+int main(in string[] args) {
+    ulong file_count = 0;
+
+    foreach (string dir; args[1..$]) {
+        foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
+            try {
+                ubyte[16] digest;
+
+                //writefln("Doing file: %s", name);
+                ++file_count;
+                writef("\rFile num: %s  ", file_count);
+
+                auto file = File(name, "r");
+                scope(exit) file.close;
+
+                MD5_CTX context;
+                context.start();
+                foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
+                    //bytes_chewed(buffer.length);
+                    context.update(buffer);
+                }
+                context.finish(digest);
+            }
+            catch (FileException ex) {
+                writefln("File exception: %s", name);
+            }
+            catch (ErrnoException ex) {
+                writefln("Errno exception: %s", name);
+            }
+        }
+    }
+
+    return 0;
+}