Mercurial > projects > doodle
view doodle/utils/prog/dupes.d @ 120:c275f26399c6
Tinkerings
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Fri, 22 Apr 2011 00:06:07 +0930 |
parents | 8343c1dafac6 |
children | f1cf62339ed5 |
line wrap: on
line source
import std.stdio;
import std.string;
import std.exception;
import std.file;
import std.md5;
import std.getopt;
import std.conv;
import std.c.stdlib;

/// Scan `dirs` recursively and print groups of files that appear to be
/// duplicates.
///
/// A file is a candidate only if it is at least `file_size` bytes long.
/// Candidates of equal size are then grouped by the MD5 digest of their
/// first `digest_size` bytes; groups with two or more members are reported.
/// `verbose` is accepted for interface compatibility but currently unused.
void find_duplicates(in string[] dirs,
                     in ulong file_size,
                     in ulong digest_size,
                     bool verbose) {
    // MD5 of (at most) the first max_bytes of the named file.
    // Reads in 1 KiB chunks so large files are never slurped whole.
    static ubyte[16] compute_md5(in string name, in ulong max_bytes) {
        ubyte[16] digest;
        auto file = File(name, "r");
        scope(exit) file.close;

        MD5_CTX context;
        context.start();
        ulong byte_count = 0;
        foreach (ubyte[] buffer; chunks(file, 1024)) {
            context.update(buffer);
            byte_count += buffer.length;
            if (byte_count >= max_bytes) {
                break;
            }
        }
        context.finish(digest);
        return digest;
    }

    struct FileInfo {
        string name;    // full path as yielded by dirEntries
        ulong  size;    // size in bytes at scan time
    }

    FileInfo[] file_array;

    writefln("Accumulating file list");

    string last_name;
    foreach (string dir; dirs) {
        try {
            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                last_name = name;
                try {
                    // Skip symlinks so a file reached two ways isn't
                    // counted as its own duplicate.
                    if (!isSymLink(name) && isFile(name)) {
                        ulong size = getSize(name);
                        if (size >= file_size) {
                            file_array ~= FileInfo(name, size);
                        }
                    }
                } catch (Exception ex) {
                    writefln("Skipping %s", name);
                    // TODO accumulate errors and print after traversal
                }
            }
        } catch (FileException ex) {
            // dirEntries can throw mid-walk (e.g. unreadable directory);
            // report where we got to and continue with the next root.
            // NOTE: "\n" preserves the embedded newline from the original
            // message byte-for-byte.
            writefln("dirEntries bailed out (%s).\nContinuing anyway", last_name);
        }
    }

    writefln("Processing %s files", file_array.length);

    // Pass 1: bucket file indices by size.  Only sizes seen more than
    // once can possibly hold duplicates, so remember those separately.
    uint[][ulong] size_to_file_indices;
    bool[ulong]   duplicate_sizes;

    foreach (index, file; file_array) {
        if (uint[] * indices = (file.size in size_to_file_indices)) {
            if (indices.length == 1) {
                // Second file of this size: the size becomes a
                // duplicate candidate.
                duplicate_sizes[file.size] = true;
            }
            (*indices) ~= index;
        } else {
            size_to_file_indices[file.size] = [ index ];
        }
    }

    writefln("Number of files of duplicate size %s", duplicate_sizes.length);

    // Pass 2: within each candidate size, group files by the digest of
    // their leading bytes and report every group of two or more.
    foreach (size; duplicate_sizes.keys) {
        uint[] indices = size_to_file_indices[size];

        uint[][ubyte[16]] digest_to_indices;

        foreach (index; indices) {
            const FileInfo file_info = file_array[index];
            try {
                ubyte[16] digest = compute_md5(file_info.name, digest_size);
                if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
                    // Same size AND same leading-bytes digest: a duplicate.
                    (*duplicate_indices) ~= index;
                } else {
                    digest_to_indices[digest] ~= index;
                }
            } catch (ErrnoException ex) {
                // Unreadable file: best-effort skip, as in the scan phase.
            }
        }

        foreach (indices2; digest_to_indices) {
            if (indices2.length > 1) {
                // BUG FIX: iterate the digest-matched group (indices2).
                // The original iterated `indices` (every file of this
                // size), mislabelling non-duplicates as duplicates.
                foreach (index; indices2) {
                    FileInfo file_info = file_array[index];
                    writefln("%s %s", file_info.size, file_info.name);
                }
                writefln("");
            }
        }
    }

    writefln("Done");
}

/// Command-line entry point: parse options, then run the duplicate scan
/// over the remaining (directory) arguments.
int main(string[] args) {
    immutable ulong KILO = 1 << 10;
    immutable ulong MEGA = 1 << 20;
    immutable ulong GIGA = 1 << 30;

    void help(in string) {
        writefln("Help");
        exit(1);
    }

    ulong file_size   = 100 * KILO;  // minimum size for a file to be considered
    ulong digest_size = 10 * KILO;   // leading bytes fed to MD5 per file
    bool  verbose     = false;

    try {
        getopt(args,
               "file-size|f",   &file_size,
               "digest-size|d", &digest_size,
               "verbose|v",     &verbose,
               "help|h",        &help);
    } catch (ConvException ex) {
        // Previously swallowed silently and ran with default settings;
        // report the malformed argument and fail instead.
        writefln("Error parsing arguments: %s", ex.msg);
        return 1;
    }

    find_duplicates(args[1..$], file_size, digest_size, verbose);

    return 0;
}