# HG changeset patch # User David Bryant # Date 1302774046 -34200 # Node ID 9cc6c428fdbe6dfd036c13d634ac74710c4c03bd # Parent b569d7d5064fd1f4932aa4c11f381dcd1c6918e5 Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls no longer needed. Still blows the hood on memory usage. diff -r b569d7d5064f -r 9cc6c428fdbe doodle/utils/prog/duplicates.d --- a/doodle/utils/prog/duplicates.d Thu Apr 14 11:27:17 2011 +0930 +++ b/doodle/utils/prog/duplicates.d Thu Apr 14 19:10:46 2011 +0930 @@ -1,5 +1,7 @@ import std.stdio; import std.string; +import std.exception; +import std.algorithm; import std.file; import std.c.stdio; import std.c.string; @@ -9,19 +11,71 @@ class DuplicateFinder { this(in string dir) { - recurse_directory(dir.dup); + // First pass to gather the number of files and bytes + + writeln("Accumulating total bytes / files"); + + uint total_files = 0; - writefln("\n"); + try { + foreach (string name; dirEntries(dir, SpanMode.depth, false)) { + try { + if (isFile(name)) { + _total_bytes += getSize(name); + ++total_files; + } + } + catch (Exception ex) { + writefln("Skipping %s", name); + //writefln("Exception %s", ex); + } + } + } + catch (FileException ex) { + // ignore + writefln("dirEntries bailed out. Continuing anyway"); + } + + writefln("Files %s, bytes %s", total_files, _total_bytes); + writeln("Accumulating MD5 sums"); - foreach (digest; _duplicate_digests.keys) { - writefln("%s", digestToString(digest)); + foreach (string name; dirEntries(dir, SpanMode.depth, false)) { + if (isFile(name)) { + try { + //writefln("MD5'ing %s", name); + compute_md5(name); + } + catch (ErrnoException ex) { + //writefln("Skipping file: %s, %s", name, ex); + //writefln("(errno) Skipping file: %s", name); + // TODO accumulate errors and print after traversal is complete + } + } + } + + writefln(""); + + writeln("Sorting keys"); + + ubyte[16][] keys = _duplicate_digests.keys; + bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; } + sort!(compare_by_size)(keys); + + writeln("Printing results"); + + foreach (digest; keys) { auto file_info = _file_info_map[digest]; + /* writefln("Size %s, Count %s, Digest %s", file_info.size, file_info.names.length, digestToString(digest)); + */ + writefln("Size %s, Count %s", file_info.size, file_info.names.length); foreach (name; file_info.names) { writefln("\t%s", name); } } + + writeln("Done"); } private { @@ -41,9 +95,24 @@ bool[ubyte[16]] _duplicate_digests; // set of all duplicate digests FileInfo[ubyte[16]] _file_info_map; // map of digest to file info - void compute_md5(in char[] filename, in ulong filesize) { + ulong _total_bytes; + ulong _current_byte; + double _last_progress = -1.0; + + void bytes_chewed(ulong bytes) { + _current_byte += bytes; + double progress = cast(double)_current_byte / cast(double)_total_bytes; + if (progress - _last_progress > 0.0005) { + writef("\rProgress %3.1f%%", 100.0 * progress); + std.stdio.stdout.flush(); + _last_progress = progress; + } + + } + + void compute_md5(in string filename) { //writefln("%s", filename); - auto file = File(filename.idup, "r"); + auto file = File(filename, "r"); scope(exit) file.close; ubyte[16] digest; @@ -51,14 +120,16 @@ MD5_CTX context; context.start(); foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { + bytes_chewed(buffer.length); context.update(buffer); } context.finish(digest); - writefln("%s: %s", digestToString(digest), filename); + //writefln("%s: %s", digestToString(digest), filename); if (FileInfo * file_info = (digest in _file_info_map)) { // duplicate - file_info.names ~= filename.idup; + file_info.names ~= filename; + assert(file_info.names.length > 1); if (file_info.size >= SIZE_THRESHOLD) { _duplicate_digests[digest] = true; @@ -66,35 +137,10 @@ } else { // unseen - _duplicate_digests[digest] = true; - _file_info_map[digest] = FileInfo(filesize, filename.idup); + _file_info_map[digest] = FileInfo(getSize(filename), filename); //writefln("%s", _file_info_map.length); } } - - bool entry_callback(DirEntry * de) { - //writefln("File: %s", de.name); - - if (de.isdir) { - recurse_directory(de.name); - } - else if (de.isfile) { - compute_md5(de.name, de.size); - } - - return true; - } - - void recurse_directory(in char[] dirname) { - //writefln("Dir: %s", dirname); - - try { - listdir(dirname, &entry_callback); - } - catch (FileException ex) { - //writefln("Skipping: %s", dirname); - } - } } }