changeset 112:b569d7d5064f

Added some utilities that are a work in progress.
author David Bryant <bagnose@gmail.com>
date Thu, 14 Apr 2011 11:27:17 +0930
parents 0387a790e619
children 9cc6c428fdbe
files doodle/utils/prog/duplicates.d doodle/utils/prog/hash_test.d
diffstat 2 files changed, 142 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doodle/utils/prog/duplicates.d	Thu Apr 14 11:27:17 2011 +0930
@@ -0,0 +1,107 @@
+import std.stdio;
+import std.string;
+import std.file;
+import std.c.stdio;
+import std.c.string;
+import std.cstream;
+import core.sys.posix.dirent;
+import std.md5;
+
+// Recursively MD5-hashes the files under a directory and reports those sharing a digest.
+class DuplicateFinder {
+    // Scans `dir` recursively, then prints one report entry per duplicated digest.
+    this(in string dir) {
+        recurse_directory(dir.dup);
+
+        writefln("\n");
+
+        foreach (digest; _duplicate_digests.keys) {
+            writefln("%s", digestToString(digest));
+            auto file_info = _file_info_map[digest];
+            writefln("Size %s, Count %s, Digest %s",
+                     file_info.size, file_info.names.length, digestToString(digest));
+            foreach (name; file_info.names) {
+                writefln("\t%s", name);
+            }
+        }
+    }
+
+    private {
+        // Size and paths of all files seen so far that share one digest.
+        struct FileInfo {
+            this(in ulong size_, string first_name) {
+                size   = size_;
+                names ~= first_name;
+            }
+            ulong    size;      // size recorded for the first file with this digest
+            string[] names;     // every path seen with this digest
+        };
+
+        // Minimum file size for a repeated digest to be reported (0 = report all).
+        //static const ulong SIZE_THRESHOLD = 1_000;
+        static const ulong SIZE_THRESHOLD = 0;
+
+        bool[ubyte[16]]     _duplicate_digests;             // set of digests seen more than once
+        FileInfo[ubyte[16]] _file_info_map;                 // map of digest to file info
+
+        // Hashes `filename` in 4 MiB chunks and records it under its digest; a digest
+        // is flagged as duplicated only once a second file produces it.
+        void compute_md5(in char[] filename, in ulong filesize) {
+            auto file = File(filename.idup, "r");
+            scope(exit) file.close;
+
+            ubyte[16] digest;
+            MD5_CTX context;
+            context.start();
+            foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
+                context.update(buffer);
+            }
+            context.finish(digest);
+            writefln("%s: %s", digestToString(digest), filename);
+
+            if (FileInfo * file_info = (digest in _file_info_map)) {
+                // Digest already recorded: this file duplicates an earlier one.
+                file_info.names ~= filename.idup;
+
+                if (file_info.size >= SIZE_THRESHOLD) {
+                    _duplicate_digests[digest] = true;
+                }
+            }
+            else {
+                // First file with this digest: record it, but do not flag it as a
+                // duplicate, otherwise every scanned file would be reported.
+                _file_info_map[digest] = FileInfo(filesize, filename.idup);
+            }
+        }
+
+        // listdir callback: recurse into directories, hash regular files; true = keep listing.
+        bool entry_callback(DirEntry * de) {
+            if (de.isdir) {
+                recurse_directory(de.name);
+            }
+            else if (de.isfile) {
+                compute_md5(de.name, de.size);
+            }
+
+            return true;
+        }
+
+        // Runs entry_callback over `dirname`; directories raising FileException are skipped.
+        void recurse_directory(in char[] dirname) {
+            try {
+                listdir(dirname, &entry_callback);
+            }
+            catch (FileException ex) {
+                // Best effort: ignore this directory and carry on with the scan.
+            }
+        }
+    }
+}
+
+int main(string[] args) {
+    // Scan each directory named on the command line (args[0] is the program).
+    foreach (root; args[1 .. $])
+        new DuplicateFinder(root);
+
+    return 0;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doodle/utils/prog/hash_test.d	Thu Apr 14 11:27:17 2011 +0930
@@ -0,0 +1,35 @@
+import std.stdio;
+import std.random;
+import std.string;
+
+int main(string[] args) {
+    /*
+    struct S {
+        string[] names;
+    }
+    */
+    // Associative array keyed by 16-byte static arrays, used as a set.
+    bool[ubyte[16]] seen;
+    int iteration = 0;
+
+    // No exit condition: keeps inserting random keys until interrupted.
+    while (true) {
+        ubyte[16] key;
+        foreach (ref octet; key)
+            octet = cast(ubyte)uniform(0, 256);
+
+        /*
+        auto s = S();
+        s.names ~= "hello";
+        seen[key] = s;
+        */
+        seen[key] = true;
+        // Progress line on every 1000th iteration.
+        if (iteration % 1000 == 0)
+            writefln("%s %s", iteration, key);
+        //writefln("%s %s", iteration, key);
+        ++iteration;
+    }
+
+    return 0;
+}