# HG changeset patch
# User David Bryant
# Date 1302746237 -34200
# Node ID b569d7d5064fd1f4932aa4c11f381dcd1c6918e5
# Parent  0387a790e619be9705222ee960c7ea0b6b6a7690
Added some utilities that are a work in progress.

diff -r 0387a790e619 -r b569d7d5064f doodle/utils/prog/duplicates.d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doodle/utils/prog/duplicates.d	Thu Apr 14 11:27:17 2011 +0930
@@ -0,0 +1,107 @@
+import std.stdio;
+import std.string;
+import std.file;
+import std.c.stdio;
+import std.c.string;
+import std.cstream;
+import core.sys.posix.dirent;
+import std.md5;
+
+class DuplicateFinder {
+    this(in string dir) {
+        recurse_directory(dir.dup);
+
+        writefln("\n");
+
+        foreach (digest; _duplicate_digests.keys) {
+            writefln("%s", digestToString(digest));
+            auto file_info = _file_info_map[digest];
+            writefln("Size %s, Count %s, Digest %s",
+                     file_info.size, file_info.names.length, digestToString(digest));
+            foreach (name; file_info.names) {
+                writefln("\t%s", name);
+            }
+        }
+    }
+
+    private {
+        struct FileInfo {
+            this(in ulong size_, string first_name) {
+                size = size_;
+                names ~= first_name;
+            }
+
+            ulong size;
+            string[] names;
+        };
+
+        //static const ulong SIZE_THRESHOLD = 1_000;
+        static const ulong SIZE_THRESHOLD = 0;
+
+        bool[ubyte[16]] _duplicate_digests;    // set of all duplicate digests
+        FileInfo[ubyte[16]] _file_info_map;    // map of digest to file info
+
+        void compute_md5(in char[] filename, in ulong filesize) {
+            //writefln("%s", filename);
+            auto file = File(filename.idup, "r");
+            scope(exit) file.close;
+
+            ubyte[16] digest;
+
+            MD5_CTX context;
+            context.start();
+            foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
+                context.update(buffer);
+            }
+            context.finish(digest);
+            writefln("%s: %s", digestToString(digest), filename);
+
+            if (FileInfo * file_info = (digest in _file_info_map)) {
+                // duplicate
+                file_info.names ~= filename.idup;
+
+                if (file_info.size >= SIZE_THRESHOLD) {
+                    _duplicate_digests[digest] = true;
+                }
+            }
+            else {
+                // unseen
+                _duplicate_digests[digest] = true;
+                _file_info_map[digest] = FileInfo(filesize, filename.idup);
+                //writefln("%s", _file_info_map.length);
+            }
+        }
+
+        bool entry_callback(DirEntry * de) {
+            //writefln("File: %s", de.name);
+
+            if (de.isdir) {
+                recurse_directory(de.name);
+            }
+            else if (de.isfile) {
+                compute_md5(de.name, de.size);
+            }
+
+            return true;
+        }
+
+        void recurse_directory(in char[] dirname) {
+            //writefln("Dir: %s", dirname);
+
+            try {
+                listdir(dirname, &entry_callback);
+            }
+            catch (FileException ex) {
+                //writefln("Skipping: %s", dirname);
+            }
+        }
+    }
+}
+
+int main(string[] args) {
+    foreach (string arg; args[1..$]) {
+        new DuplicateFinder(arg);
+    }
+
+    return 0;
+}
diff -r 0387a790e619 -r b569d7d5064f doodle/utils/prog/hash_test.d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doodle/utils/prog/hash_test.d	Thu Apr 14 11:27:17 2011 +0930
@@ -0,0 +1,35 @@
+import std.stdio;
+import std.random;
+import std.string;
+
+int main(string[] args) {
+    /*
+    struct S {
+        string[] names;
+    }
+    */
+
+    bool[ubyte[16]] aa;
+
+    for (int count = 0; ; ++count) {
+        ubyte[16] digest;
+        foreach (ref a; digest) {
+            a = cast(ubyte)uniform(0, 256);
+        }
+
+        /*
+        auto s = S();
+        s.names ~= "hello";
+        aa[digest] = s;
+        */
+        aa[digest] = true;
+
+        if (count % 1000 == 0) {
+            writefln("%s %s", count, digest);
+        }
+
+        //writefln("%s %s", count, digest);
+    }
+
+    return 0;
+}
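
Note (not part of the commit): duplicates.d is written against the Phobos of its day; std.md5, std.cstream, and std.file's listdir have since been deprecated and removed. The sketch below is a rough equivalent using the current std.digest.md5 and std.file.dirEntries APIs, assuming a reasonably recent DMD/Phobos. Unlike the constructor above, which ends up printing every file it hashed because the "unseen" branch also inserts into _duplicate_digests, the sketch reports only digests shared by two or more paths. Names such as filesByDigest and the 4 MiB chunk size are illustrative, not taken from the repository.

// sketch_duplicates.d (illustrative only, not part of the patch)
import std.digest : toHexString;
import std.digest.md5 : MD5;
import std.file : DirEntry, SpanMode, dirEntries;
import std.stdio : File, writefln;

void main(string[] args)
{
    // Content digest -> every path whose contents hashed to it.
    string[][ubyte[16]] filesByDigest;

    foreach (dir; args[1 .. $])
    {
        foreach (DirEntry entry; dirEntries(dir, SpanMode.depth))
        {
            if (!entry.isFile)
                continue;

            try
            {
                MD5 context;
                context.start();
                // Hash in 4 MiB chunks, as the patch does, to bound memory use.
                foreach (chunk; File(entry.name, "rb").byChunk(4 * 1024 * 1024))
                    context.put(chunk);
                filesByDigest[context.finish()] ~= entry.name;
            }
            catch (Exception)
            {
                // Unreadable entry: skip it, mirroring the patch's empty catch block.
            }
        }
    }

    // Report only digests shared by two or more files, i.e. real duplicates.
    foreach (digest, names; filesByDigest)
    {
        if (names.length < 2)
            continue;
        writefln("%s (%s files)", toHexString(digest), names.length);
        foreach (name; names)
            writefln("\t%s", name);
    }
}

As hash_test.d exercises, a ubyte[16] static array works directly as an associative-array key, so the digest only needs converting to hex text at report time.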