Mercurial > projects > doodle
view doodle/utils/prog/duplicates.d @ 113:9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
no longer needed. Still blows the hood on memory usage.
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Thu, 14 Apr 2011 19:10:46 +0930 |
parents | b569d7d5064f |
children | b87e2e0a046a |
line wrap: on
line source
import std.stdio;
import std.string;
import std.exception;
import std.algorithm;
import std.file;
import std.c.stdio;
import std.c.string;
import std.cstream;
import core.sys.posix.dirent;
import std.md5;

/// Walks a directory tree, MD5-hashes every regular file, and reports
/// groups of files whose digests (and therefore, almost certainly,
/// contents) are identical. All work happens in the constructor.
class DuplicateFinder {
    this(in string dir) {
        // First pass: count files and total bytes so the hashing pass
        // can show percentage progress.
        writeln("Accumulating total bytes / files");
        uint total_files = 0;
        try {
            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                try {
                    if (isFile(name)) {
                        _total_bytes += getSize(name);
                        ++total_files;
                    }
                }
                catch (Exception ex) {
                    // Unreadable entry (permissions, dangling symlink, ...):
                    // skip it rather than aborting the whole scan.
                    writefln("Skipping %s", name);
                }
            }
        }
        catch (FileException ex) {
            // dirEntries itself can throw mid-iteration; report and carry on.
            writefln("dirEntries bailed out. Continuing anyway");
        }
        writefln("Files %s, bytes %s", total_files, _total_bytes);

        // Second pass: hash each file and bucket paths by digest.
        writeln("Accumulating MD5 sums");
        try {
            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                try {
                    if (isFile(name)) {
                        compute_md5(name);
                    }
                }
                catch (Exception ex) {
                    // FIX: the original guarded only compute_md5, and only
                    // against ErrnoException, while isFile() sat outside the
                    // try -- a stat failure that pass one tolerated would
                    // abort pass two. Skip the entry like the first pass does.
                    // TODO accumulate errors and print after traversal is complete
                }
            }
        }
        catch (FileException ex) {
            // FIX: mirror the first pass -- dirEntries may bail out here too,
            // and previously that aborted the run after the counting was done.
            writefln("dirEntries bailed out. Continuing anyway");
        }
        writefln("");

        // Order the duplicate groups largest-file-first for the report.
        writeln("Sorting keys");
        ubyte[16][] keys = _duplicate_digests.keys;
        bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) {
            return _file_info_map[a].size > _file_info_map[b].size;
        }
        sort!(compare_by_size)(keys);

        writeln("Printing results");
        foreach (digest; keys) {
            auto file_info = _file_info_map[digest];
            writefln("Size %s, Count %s",
                     file_info.size,
                     file_info.names.length);
            foreach (name; file_info.names) {
                writefln("\t%s", name);
            }
        }

        writeln("Done");
    }

    private {
        // File size plus every path observed with a given digest.
        struct FileInfo {
            this(in ulong size_, string first_name) {
                size = size_;
                names ~= first_name;
            }
            ulong size;
            string[] names;
        };

        // Only duplicate groups whose file size meets this threshold are
        // recorded for the report.
        //static const ulong SIZE_THRESHOLD = 1_000;
        static const ulong SIZE_THRESHOLD = 0;

        bool[ubyte[16]] _duplicate_digests;   // set of all duplicate digests
        FileInfo[ubyte[16]] _file_info_map;   // map of digest to file info
        ulong _total_bytes;                   // grand total from the first pass
        ulong _current_byte;                  // bytes hashed so far
        double _last_progress = -1.0;         // last fraction painted

        // Advance the progress meter by `bytes` and repaint the console
        // line once it has moved by more than 0.05%.
        void bytes_chewed(ulong bytes) {
            _current_byte += bytes;
            // FIX: guard the division when the tree contained no readable
            // bytes. The original only "worked" because comparisons against
            // the resulting NaN are false; make the intent explicit.
            if (_total_bytes == 0) {
                return;
            }
            double progress = cast(double)_current_byte / cast(double)_total_bytes;
            if (progress - _last_progress > 0.0005) {
                writef("\rProgress %3.1f%%", 100.0 * progress);
                std.stdio.stdout.flush();
                _last_progress = progress;
            }
        }

        // Hash `filename` in 4 MiB chunks (feeding the progress meter),
        // then record the digest, flagging it as a duplicate on the
        // second and later sightings.
        void compute_md5(in string filename) {
            auto file = File(filename, "r");
            scope(exit) file.close;

            ubyte[16] digest;
            MD5_CTX context;
            context.start();
            foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
                bytes_chewed(buffer.length);
                context.update(buffer);
            }
            context.finish(digest);

            if (FileInfo * file_info = (digest in _file_info_map)) {
                // Seen this content before: remember the additional path.
                file_info.names ~= filename;
                assert(file_info.names.length > 1);
                if (file_info.size >= SIZE_THRESHOLD) {
                    _duplicate_digests[digest] = true;
                }
            }
            else {
                // First sighting of this content.
                _file_info_map[digest] = FileInfo(getSize(filename), filename);
            }
        }
    }
}

/// Treat each command-line argument as a directory root and scan it.
int main(string[] args) {
    foreach (string arg; args[1..$]) {
        new DuplicateFinder(arg);
    }
    return 0;
}