changeset 120:c275f26399c6

Tinkerings
author David Bryant <bagnose@gmail.com>
date Fri, 22 Apr 2011 00:06:07 +0930
parents 8343c1dafac6
children f1cf62339ed5
files doodle/utils/prog/dupes.d doodle/utils/prog/duplicates.d doodle/utils/prog/hash_test.d doodle/utils/prog/md5_test.d
diffstat 4 files changed, 42 insertions(+), 299 deletions(-) [+]
line wrap: on
line diff
--- a/doodle/utils/prog/dupes.d	Thu Apr 21 18:12:13 2011 +0930
+++ b/doodle/utils/prog/dupes.d	Fri Apr 22 00:06:07 2011 +0930
@@ -3,14 +3,14 @@
 import std.exception;
 import std.file;
 import std.md5;
+import std.getopt;
+import std.conv;
+import std.c.stdlib;
 
-void find_duplicates(in string[] dirs) {
-    immutable ulong KILO = 1 << 10;
-    immutable ulong MEGA = 1 << 20;
-
-    immutable ulong SIZE_THRESHOLD = 100 * KILO;
-    immutable ulong MD5_AMOUNT     = 10 * KILO;
-
+void find_duplicates(in string[] dirs,
+                     in ulong    file_size,
+                     in ulong    digest_size,
+                     bool        verbose) {
     static ubyte[16] compute_md5(in string name, in ulong max_bytes) {
         ubyte[16] digest;
 
@@ -51,7 +51,7 @@
                 try {
                     if (!isSymLink(name) && isFile(name)) {
                         ulong size = getSize(name);
-                        if (size >= SIZE_THRESHOLD) {
+                        if (size >= file_size) {
                             file_array ~= FileInfo(name, size);
                         }
                     }
@@ -103,7 +103,7 @@
             const FileInfo file_info = file_array[index];
 
             try {
-                ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT);
+                ubyte[16] digest = compute_md5(file_info.name, digest_size);
 
                 if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
                     // A true duplicate
@@ -138,7 +138,39 @@
 }
 
 int main(string[] args) {
-    find_duplicates(args[1..$]);
+    immutable ulong KILO = 1 << 10;
+    immutable ulong MEGA = 1 << 20;
+    immutable ulong GIGA = 1 << 30;
+
+    /*
+    static ulong parse_size_string(in string[] s) {
+        if (s.length == 0) {
+            throw new ConvException("empty size string");
+        }
+    }
+    */
+
+    void help(in string) {
+        writefln("Help");
+        exit(1);
+    }
+
+    ulong file_size   = 100 * KILO;
+    ulong digest_size =  10 * KILO;
+    bool  verbose     = false;
+
+    try {
+         getopt(args,
+                "file-size|f",   &file_size,
+                "digest-size|d", &digest_size,
+                "verbose|v",     &verbose,
+                "help|h",        &help);
+    }
+    catch (ConvException ex) {
+        writefln("Bad option value: %s", ex.msg); exit(2);
+    }
+
+    find_duplicates(args[1..$], file_size, digest_size, verbose);
 
     return 0;
 }
--- a/doodle/utils/prog/duplicates.d	Thu Apr 21 18:12:13 2011 +0930
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,215 +0,0 @@
-import std.stdio;
-import std.string;
-import std.exception;
-import std.random;
-import std.algorithm;
-import std.file;
-import std.c.stdio;
-import std.c.string;
-import std.cstream;
-import core.sys.posix.dirent;
-import std.md5;
-
-// This program recursively processes files in a list
-// of directories, computing an MD5 digest on each file
-// and then informing the user of files with duplicate content.
-// Only duplicate files over a certain size are reported.
-
-// Thoughts:
-// Size threshold of files we care about:
-// Accumulate array of all files above the size threshold.
-// Only files that have matching sizes can possibly be duplicates.
-// Hash them on size?
-// Compute md5sum of first N bytes for files where there is more than
-// one of the same size
-
-class DuplicateFinder {
-    this(in string[] dirs) {
-        // First pass to gather the number of files and bytes
-        // so that we are able to convey progress to the user
-
-        writeln("Accumulating total bytes / files");
-
-        uint total_files = 0;
-
-        try {
-            foreach (string dir; dirs) {
-                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-                    try {
-                        if (isFile(name)) {
-                            _total_bytes += getSize(name);
-                            ++total_files;
-                        }
-                    }
-                    catch (Exception ex) {
-                        writefln("Skipping %s", name);
-                        //writefln("Exception %s", ex);
-                        // TODO accumulate errors and print after traversal
-                    }
-                }
-            }
-        }
-        catch (FileException ex) {
-            // ignore
-            writefln("dirEntries bailed out. Continuing anyway");
-        }
-
-        writefln("Files %s, bytes %s", total_files, _total_bytes);
-
-        // Go through the files again, but this time
-        // compute the MD5 digests and build our data structures
-
-        writeln("Accumulating MD5 digests");
-
-        foreach (string dir; dirs) {
-            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-                try {
-                    if (isFile(name)) {
-                        //writefln("MD5'ing %s", name);
-                        compute_md5(name, getSize(name));
-                    }
-                }
-                catch (FileException ex) {
-                    writefln("Skipping %s", name);
-                }
-                catch (ErrnoException ex) {
-                    //writefln("Skipping file: %s, %s", name, ex);
-                    //writefln("(errno) Skipping file: %s", name);
-                    // TODO accumulate errors and print after traversal is complete
-                }
-            }
-        }
-
-        writefln("");
-
-        // Sort our duplicate digests by size so that we print
-        // the biggest duplicate file offenders first
-
-        writeln("Sorting duplicate digests by size");
-
-        ubyte[16][] keys = _duplicate_digests.keys;
-        bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; }
-        sort!(compare_by_size)(keys);
-
-        // Print the results out the user, in descending order
-        // of file size
-
-        writeln("Printing results");
-
-        writefln("Number of duplicate files: %s", _duplicate_digests.length);
-
-        foreach (digest; keys) {
-            auto file_info = _file_info_map[digest];
-            /*
-            writefln("Size %s, Count %s, Digest %s",
-                     file_info.size, file_info.names.length, digestToString(digest));
-                     */
-            writefln("Size %s, Count %s", file_info.size, file_info.names.length);
-            foreach (name; file_info.names) {
-                writefln("\t%s", name);
-            }
-        }
-
-        writeln("Done");
-    }
-
-    private {
-        struct FileInfo {
-            this(in ulong size_, string first_name) {
-                size   = size_;
-                names ~= first_name;
-            }
-
-            ulong    size;
-            string[] names;
-        };
-
-        static const ulong SIZE_THRESHOLD = 100 * 1_024;
-        static const ulong AMOUNT_SUMMED  = 100 * 1_024;
-
-        bool[ubyte[16]]     _duplicate_digests;             // set of all duplicate digests
-        FileInfo[ubyte[16]] _file_info_map;                 // map of digest to file info
-
-        ulong               _total_bytes;
-        ulong               _current_byte;
-        double              _last_progress = -1.0;
-
-        void compute_md5(in string filename, in ulong size) {
-            void bytes_chewed(ulong bytes) {
-                _current_byte += bytes;
-                double progress = cast(double)_current_byte / cast(double)_total_bytes;
-                if (progress - _last_progress > 0.0005) {
-                    writef("\rProgress %.1f%%  %s           ", 100.0 * progress, filename);
-                    std.stdio.stdout.flush();
-                    _last_progress = progress;
-                }
-            }
-
-            ubyte[16] digest;
-
-            // If Block 1 and Block 2 are both uncommented then there is a memory explosion.
-            // However, if either one is commented out there there isn't...
-
-            {
-                auto file = File(filename, "r");
-                scope(exit) file.close;
-
-                MD5_CTX context;
-                context.start();
-                { // Block 1:
-                    // Compute the actual digest
-                    ulong amount = 0;
-                    foreach (ubyte[] buffer; chunks(file, 1024)) {
-                        context.update(buffer);
-                        //bytes_chewed(buffer.length);
-                        amount += buffer.length;
-                        if (amount >= AMOUNT_SUMMED) {
-                            break;
-                        }
-                    }
-                    bytes_chewed(size);
-                }
-                context.finish(digest);
-
-                /+
-                { // Block 1 alternative:
-                    // Create a random digest
-                    digest = make_random_digest;
-                    bytes_chewed(size);
-                }
-                +/
-            }
-
-            { // Block 2:
-                // Update the data structures
-                if (FileInfo * file_info = (digest in _file_info_map)) {
-                    // This is a duplicate digest, append the subsequent name
-                    file_info.names ~= filename;
-
-                    // Record the duplicate as an offender if its size exceeds the threshold
-                    if (file_info.size >= SIZE_THRESHOLD) {
-                        _duplicate_digests[digest] = true;
-                    }
-                }
-                else {
-                    // We have not seen this digest before
-                    _file_info_map[digest] = FileInfo(size, filename);
-                }
-            }
-        }
-
-        ubyte[16] make_random_digest() {
-            ubyte[16] digest;
-            foreach (ref a; digest) {
-                a = cast(ubyte)uniform(0, 256);
-            }
-            return digest;
-        }
-    }
-}
-
-int main(string[] args) {
-    new DuplicateFinder(args[1..$]);
-
-    return 0;
-}
--- a/doodle/utils/prog/hash_test.d	Thu Apr 21 18:12:13 2011 +0930
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,35 +0,0 @@
-import std.stdio;
-import std.random;
-import std.string;
-
-int main(string[] args) {
-    /*
-    struct S {
-        string[] names;
-    }
-    */
-
-    bool[ubyte[16]] aa;
-
-    for (int count = 0; ; ++count) {
-        ubyte[16] digest;
-        foreach (ref a; digest) {
-            a = cast(ubyte)uniform(0, 256);
-        }
-
-        /*
-        auto s = S();
-        s.names ~= "hello";
-        aa[digest] = s;
-        */
-        aa[digest] = true;
-
-        if (count % 10000 == 0) {
-            writefln("%s %3s", count, digest);
-        }
-
-        //writefln("%s %s", count, digest);
-    }
-
-    return 0;
-}
--- a/doodle/utils/prog/md5_test.d	Thu Apr 21 18:12:13 2011 +0930
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-import std.md5;
-import std.stdio;
-import std.file;
-import std.exception;
-
-int main(in string[] args) {
-    ulong file_count = 0;
-
-    foreach (string dir; args[1..$]) {
-        foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-            try {
-                ubyte[16] digest;
-
-                //writefln("Doing file: %s", name);
-                ++file_count;
-                writef("\rFile num: %s  ", file_count);
-
-                auto file = File(name, "r");
-                scope(exit) file.close;
-
-                MD5_CTX context;
-                context.start();
-                foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
-                    //bytes_chewed(buffer.length);
-                    context.update(buffer);
-                }
-                context.finish(digest);
-            }
-            catch (FileException ex) {
-                writefln("File exception: %s", name);
-            }
-            catch (ErrnoException ex) {
-                writefln("Errno exception: %s", name);
-            }
-        }
-    }
-
-    return 0;
-}