Mercurial > projects > doodle
view doodle/utils/prog/dupes.d @ 118:94233d54e16a
Cleanup of dupes
| author | David Bryant <bagnose@gmail.com> |
|---|---|
| date | Wed, 20 Apr 2011 22:37:16 +0930 |
| parents | c566cdbccaeb |
| children | 8343c1dafac6 |
line wrap: on
line source
import std.stdio; import std.string; import std.exception; import std.file; import std.md5; class DuplicateFinder { this(in string[] dirs) { FileInfo[] _file_array; writefln("Accumulating files"); string last_name; foreach (string dir; dirs) { try { foreach (string name; dirEntries(dir, SpanMode.depth, false)) { last_name = name; try { if (isFile(name)) { ulong size = getSize(name); if (size >= SIZE_THRESHOLD) { _file_array ~= FileInfo(name, size); } } } catch (Exception ex) { writefln("Skipping %s", name); //writefln("Exception %s", ex); // TODO accumulate errors and print after traversal } } } catch (FileException ex) { // ignore writefln("dirEntries bailed out (%s). Continuing anyway", last_name); } } writefln("Processing %s files", _file_array.length); uint[][ulong] size_to_file_indices; bool[ulong] duplicate_sizes; foreach (index, file; _file_array) { //writefln("%s %s %s", index, file.name, file.size); if (uint[] * indices = (file.size in size_to_file_indices)) { if (indices.length == 1) { // Second time we've seen a file of this size, // record it in the duplicate_sizes array duplicate_sizes[file.size] = true; } (*indices) ~= index; } else { size_to_file_indices[file.size] = [ index ]; } } writefln("Number of files of duplicate size %s", duplicate_sizes.length); foreach (size; duplicate_sizes.keys) { uint[] indices = size_to_file_indices[size]; //writefln("For size %s there are %s files", size, indices.length); uint[][ubyte[16]] digest_to_indices; foreach (index; indices) { FileInfo file_info = _file_array[index]; try { ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT); if (uint[] * duplicate_indices = (digest in digest_to_indices)) { // A true duplicate // index and index2 are the same (*duplicate_indices) ~= index; } else { digest_to_indices[digest] ~= index; } } catch (ErrnoException ex) { //writefln("Skipping: %s", file_info.name); } //writefln("\t%s", file_info.name); } foreach (indices2; digest_to_indices) { if (indices2.length > 1) { // List 
the duplicates foreach (index; indices) { FileInfo file_info = _file_array[index]; writefln("%s %s", file_info.size, file_info.name); } writefln(""); } } } writefln("Done\n"); } struct FileInfo { this(in string name_, in ulong size_) { name = name_; size = size_; } string name; ulong size; } immutable ulong KILO = 1 << 10; immutable ulong MEGA = 1 << 20; immutable ulong SIZE_THRESHOLD = 100 * KILO; immutable ulong MD5_AMOUNT = 10 * KILO; static ubyte[16] compute_md5(in string name, in ulong max_bytes) { ubyte[16] digest; auto file = File(name, "r"); scope(exit) file.close; MD5_CTX context; context.start(); ulong byte_count = 0; foreach (ubyte[] buffer; chunks(file, 1024)) { context.update(buffer); byte_count += buffer.length; if (byte_count >= max_bytes) { break; } } context.finish(digest); return digest; } } int main(string[] args) { new DuplicateFinder(args[1..$]); return 0; }