# HG changeset patch # User David Bryant # Date 1303396567 -34200 # Node ID c275f26399c6c2f5fd450fc8c34b50c50c8a5c75 # Parent 8343c1dafac6b1928fc62447b5eeb81b00651688 Tinkerings diff -r 8343c1dafac6 -r c275f26399c6 doodle/utils/prog/dupes.d --- a/doodle/utils/prog/dupes.d Thu Apr 21 18:12:13 2011 +0930 +++ b/doodle/utils/prog/dupes.d Fri Apr 22 00:06:07 2011 +0930 @@ -3,14 +3,14 @@ import std.exception; import std.file; import std.md5; +import std.getopt; +import std.conv; +import std.c.stdlib; -void find_duplicates(in string[] dirs) { - immutable ulong KILO = 1 << 10; - immutable ulong MEGA = 1 << 20; - - immutable ulong SIZE_THRESHOLD = 100 * KILO; - immutable ulong MD5_AMOUNT = 10 * KILO; - +void find_duplicates(in string[] dirs, + in ulong file_size, + in ulong digest_size, + bool verbose) { static ubyte[16] compute_md5(in string name, in ulong max_bytes) { ubyte[16] digest; @@ -51,7 +51,7 @@ try { if (!isSymLink(name) && isFile(name)) { ulong size = getSize(name); - if (size >= SIZE_THRESHOLD) { + if (size >= file_size) { file_array ~= FileInfo(name, size); } } @@ -103,7 +103,7 @@ const FileInfo file_info = file_array[index]; try { - ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT); + ubyte[16] digest = compute_md5(file_info.name, digest_size); if (uint[] * duplicate_indices = (digest in digest_to_indices)) { // A true duplicate @@ -138,7 +138,39 @@ } int main(string[] args) { - find_duplicates(args[1..$]); + immutable ulong KILO = 1 << 10; + immutable ulong MEGA = 1 << 20; + immutable ulong GIGA = 1 << 30; + + /* + static ulong parse_size_string(in string[] s) { + if (s.length == 0) { + throw new ConvException + } + } + */ + + void help(in string) { + writefln("Help"); + exit(1); + } + + ulong file_size = 100 * KILO; + ulong digest_size = 10 * KILO; + bool verbose = false; + + try { + getopt(args, + "file-size|f", &file_size, + "digest-size|d", &digest_size, + "verbose|v", &verbose, + "help|h", &help); + } + catch (ConvException ex) { + + } + + find_duplicates(args[1..$], file_size, digest_size, verbose); return 0; } diff -r 8343c1dafac6 -r c275f26399c6 doodle/utils/prog/duplicates.d --- a/doodle/utils/prog/duplicates.d Thu Apr 21 18:12:13 2011 +0930 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,215 +0,0 @@ -import std.stdio; -import std.string; -import std.exception; -import std.random; -import std.algorithm; -import std.file; -import std.c.stdio; -import std.c.string; -import std.cstream; -import core.sys.posix.dirent; -import std.md5; - -// This program recursively processes files in a list -// of directories, computing an MD5 digest on each file -// and then informing the user of files with duplicate content. -// Only duplicate files over a certain size are reported. - -// Thoughts: -// Size threshold of files we care about: -// Accumulate array of all files above the size threshold. -// Only files that have matching sizes can possibly be duplicates. -// Hash them on size? -// Compute md5sum of first N bytes for files where there is more than -// one of the same size - -class DuplicateFinder { - this(in string[] dirs) { - // First pass to gather the number of files and bytes - // so that we are able to convey progress to the user - - writeln("Accumulating total bytes / files"); - - uint total_files = 0; - - try { - foreach (string dir; dirs) { - foreach (string name; dirEntries(dir, SpanMode.depth, false)) { - try { - if (isFile(name)) { - _total_bytes += getSize(name); - ++total_files; - } - } - catch (Exception ex) { - writefln("Skipping %s", name); - //writefln("Exception %s", ex); - // TODO accumulate errors and print after traversal - } - } - } - } - catch (FileException ex) { - // ignore - writefln("dirEntries bailed out. Continuing anyway"); - } - - writefln("Files %s, bytes %s", total_files, _total_bytes); - - // Go through the files again, but this time - // compute the MD5 digests and build our data structures - - writeln("Accumulating MD5 digests"); - - foreach (string dir; dirs) { - foreach (string name; dirEntries(dir, SpanMode.depth, false)) { - try { - if (isFile(name)) { - //writefln("MD5'ing %s", name); - compute_md5(name, getSize(name)); - } - } - catch (FileException ex) { - writefln("Skipping %s", name); - } - catch (ErrnoException ex) { - //writefln("Skipping file: %s, %s", name, ex); - //writefln("(errno) Skipping file: %s", name); - // TODO accumulate errors and print after traversal is complete - } - } - } - - writefln(""); - - // Sort our duplicate digests by size so that we print - // the biggest duplicate file offenders first - - writeln("Sorting duplicate digests by size"); - - ubyte[16][] keys = _duplicate_digests.keys; - bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; } - sort!(compare_by_size)(keys); - - // Print the results out the user, in descending order - // of file size - - writeln("Printing results"); - - writefln("Number of duplicate files: %s", _duplicate_digests.length); - - foreach (digest; keys) { - auto file_info = _file_info_map[digest]; - /* - writefln("Size %s, Count %s, Digest %s", - file_info.size, file_info.names.length, digestToString(digest)); - */ - writefln("Size %s, Count %s", file_info.size, file_info.names.length); - foreach (name; file_info.names) { - writefln("\t%s", name); - } - } - - writeln("Done"); - } - - private { - struct FileInfo { - this(in ulong size_, string first_name) { - size = size_; - names ~= first_name; - } - - ulong size; - string[] names; - }; - - static const ulong SIZE_THRESHOLD = 100 * 1_024; - static const ulong AMOUNT_SUMMED = 100 * 1_024; - - bool[ubyte[16]] _duplicate_digests; // set of all duplicate digests - FileInfo[ubyte[16]] _file_info_map; // map of digest to file info - - ulong _total_bytes; - ulong _current_byte; - double _last_progress = -1.0; - - void compute_md5(in string filename, in ulong size) { - void bytes_chewed(ulong bytes) { - _current_byte += bytes; - double progress = cast(double)_current_byte / cast(double)_total_bytes; - if (progress - _last_progress > 0.0005) { - writef("\rProgress %.1f%% %s ", 100.0 * progress, filename); - std.stdio.stdout.flush(); - _last_progress = progress; - } - } - - ubyte[16] digest; - - // If Block 1 and Block 2 are both uncommented then there is a memory explosion. - // However, if either one is commented out there there isn't... - - { - auto file = File(filename, "r"); - scope(exit) file.close; - - MD5_CTX context; - context.start(); - { // Block 1: - // Compute the actual digest - ulong amount = 0; - foreach (ubyte[] buffer; chunks(file, 1024)) { - context.update(buffer); - //bytes_chewed(buffer.length); - amount += buffer.length; - if (amount >= AMOUNT_SUMMED) { - break; - } - } - bytes_chewed(size); - } - context.finish(digest); - - /+ - { // Block 1 alternative: - // Create a random digest - digest = make_random_digest; - bytes_chewed(size); - } - +/ - } - - { // Block 2: - // Update the data structures - if (FileInfo * file_info = (digest in _file_info_map)) { - // This is a duplicate digest, append the subsequent name - file_info.names ~= filename; - - // Record the duplicate as an offender if its size exceeds the threshold - if (file_info.size >= SIZE_THRESHOLD) { - _duplicate_digests[digest] = true; - } - } - else { - // We have not seen this digest before - _file_info_map[digest] = FileInfo(size, filename); - } - } - } - - ubyte[16] make_random_digest() { - ubyte[16] digest; - foreach (ref a; digest) { - a = cast(ubyte)uniform(0, 256); - } - return digest; - } - } -} - -int main(string[] args) { - new DuplicateFinder(args[1..$]); - - return 0; -} diff -r 8343c1dafac6 -r c275f26399c6 doodle/utils/prog/hash_test.d --- a/doodle/utils/prog/hash_test.d Thu Apr 21 18:12:13 2011 +0930 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -import std.stdio; -import std.random; -import std.string; - -int main(string[] args) { - /* - struct S { - string[] names; - } - */ - - bool[ubyte[16]] aa; - - for (int count = 0; ; ++count) { - ubyte[16] digest; - foreach (ref a; digest) { - a = cast(ubyte)uniform(0, 256); - } - - /* - auto s = S(); - s.names ~= "hello"; - aa[digest] = s; - */ - aa[digest] = true; - - if (count % 10000 == 0) { - writefln("%s %3s", count, digest); - } - - //writefln("%s %s", count, digest); - } - - return 0; -} diff -r 8343c1dafac6 -r c275f26399c6 doodle/utils/prog/md5_test.d --- a/doodle/utils/prog/md5_test.d Thu Apr 21 18:12:13 2011 +0930 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ -import std.md5; -import std.stdio; -import std.file; -import std.exception; - -int main(in string[] args) { - ulong file_count = 0; - - foreach (string dir; args[1..$]) { - foreach (string name; dirEntries(dir, SpanMode.depth, false)) { - try { - ubyte[16] digest; - - //writefln("Doing file: %s", name); - ++file_count; - writef("\rFile num: %s ", file_count); - - auto file = File(name, "r"); - scope(exit) file.close; - - MD5_CTX context; - context.start(); - foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { - //bytes_chewed(buffer.length); - context.update(buffer); - } - context.finish(digest); - } - catch (FileException ex) { - writefln("File exception: %s", name); - } - catch (ErrnoException ex) { - writefln("Errno exception: %s", name); - } - } - } - - return 0; -}