Mercurial > projects > doodle
diff doodle/main/dupes.d @ 139:e33f37b14893 default tip
Port to 'no-more-make' https://github.com/GrahamStJack/no-more-make
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Sun, 30 Sep 2012 15:41:25 +0930 |
parents | doodle/main/util/dupes.d@89e8b0d92f36 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doodle/main/dupes.d Sun Sep 30 15:41:25 2012 +0930 @@ -0,0 +1,246 @@ +import std.stdio; +import std.string; +import std.exception; +import std.algorithm; +import std.file; +import std.md5; +import std.getopt; +import std.conv; +import std.ascii; +import std.c.stdlib; + +ulong string_to_size(string s) { + // Convert strings to sizes, eg: + // "50" -> 50 + // "80B" -> 80 + // "10K" -> 10240 + // "1M" -> 1048576 + // Throws ConvException + + immutable map = [ 'B':1UL, 'K':1UL<<10, 'M':1UL<<20, 'G':1UL<<30, 'T':1UL<<40 ]; + + if (s.length == 0) { + throw new ConvException("Empty string"); + } + else { + ulong multiplier = 1; + + if (isAlpha(s[$-1])) { + immutable ulong * m = (s[$-1] in map); + + if (m) { + multiplier = *m; + } + else { + throw new ConvException(format("Bad size unit character: %s", s[$-1])); + } + + s = s[0..$-1]; + } + + return multiplier * to!ulong(s); + } +} + +string size_to_string(in ulong size) { + /+ + immutable array = [ 'B', 'K', 'M', 'G', 'T' ]; + size_t index = 0; + + foreach (i, c; array) { + if (size / (1UL << i + + writefln("%s %s", i, c); + } + +/ + + return format("%sK", size / 1024); +} + +void find_duplicates(in string[] dirs, + in ulong file_size, + in ulong digest_size, + bool verbose) { + static ubyte[16] compute_md5(in string filename, in ulong max_bytes) { + size_t chunk_size = min(max_bytes, 4096 * 1024); + ubyte[16] digest; + + auto file = File(filename, "r"); + scope(exit) file.close(); + + MD5_CTX context; + context.start(); + ulong byte_count = 0; + foreach (ubyte[] buffer; chunks(file, chunk_size)) { + context.update(buffer); + byte_count += buffer.length; + if (byte_count >= max_bytes) { + break; + } + } + context.finish(digest); + + return digest; + } + + struct FileInfo { + string name; + ulong size; + } + + FileInfo[] file_array; + + writefln("Accumulating file list"); + + foreach (string dir; dirs) { + if (isDir(dir)) { + string last_entry; + try { + foreach (string filename; dirEntries(dir, SpanMode.depth, false)) { + last_entry = filename; + try { + if (!isSymlink(filename) && isFile(filename)) { + ulong size = getSize(filename); + if (size >= file_size) { + file_array ~= FileInfo(filename, size); + } + } + } + catch (Exception ex) { + writefln("Skipping %s", filename); + //writefln("Exception %s", ex); + // TODO accumulate errors and print after traversal + } + } + } + catch (FileException ex) { + // ignore + writefln("Error, dirEntries bailed out after: %s. Continuing anyway", last_entry); + } + } + else { + writefln("Not a dir: %s", dir); + } + } + + writefln("Processing %s files", file_array.length); + + uint[][ulong] size_to_file_indices; + bool[ulong] duplicate_sizes; + + foreach (uint index, file; file_array) { + //writefln("%s %s %s", index, file.name, file.size); + + if (uint[] * indices = (file.size in size_to_file_indices)) { + if (indices.length == 1) { + // Second time we've seen a file of this size, + // record it in the duplicate_sizes array + duplicate_sizes[file.size] = true; + } + + (*indices) ~= index; + } + else { + size_to_file_indices[file.size] = [ index ]; + } + } + + writefln("Number of files of duplicate size %s", duplicate_sizes.length); + + ulong total_waste = 0; + + foreach_reverse (size; duplicate_sizes.keys.sort) { + uint[] indices = size_to_file_indices[size]; + //writefln("For size %s there are %s files", size, indices.length); + + uint[][ubyte[16]] digest_to_indices; + + foreach (index; indices) { + const FileInfo file_info = file_array[index]; + + try { + ubyte[16] digest = compute_md5(file_info.name, digest_size); + + if (uint[] * duplicate_indices = (digest in digest_to_indices)) { + // A true duplicate + // index and index2 are the same + + (*duplicate_indices) ~= index; + } + else { + digest_to_indices[digest] ~= index; + } + } + catch (ErrnoException ex) { + //writefln("Skipping: %s", file_info.name); + } + + //writefln("\t%s", file_info.name); + } + + foreach (indices2; digest_to_indices) { + if (indices2.length > 1) { + // List the duplicates + foreach (i, index; indices) { + FileInfo file_info = file_array[index]; + if (i == 0) { + writefln("%s", size_to_string(file_info.size)); + total_waste += file_info.size; + } + writefln(" %s", file_info.name); + } + writefln(""); + } + } + } + + writefln("Done, total waste: %s", size_to_string(total_waste)); +} + +int main(string[] args) { + ulong file_size; + ulong digest_size; + bool verbose; + + try { + string file_size_string = "100K"; + string digest_size_string = "100K"; + + void help(in string) { + writefln("Usage: dupes [OPTION]... DIR...\n" + "Recursively locate duplicate files in a list of directories\n" + "\n" + "Options\n" + " -d, --digest-size=SIZE size of digest used for comparison [%s]\n" + " -f, --file-size=SIZE minimum size of files searched for duplication [%s]\n" + " -v, --verbose be verbose\n" + " --help display this help and exit\n" + "\n" + "SIZE is an integer, optionally followed by K, M, G, T", + file_size_string, + digest_size_string); + exit(1); + } + + getopt(args, + "file-size|f", &file_size_string, + "digest-size|d", &digest_size_string, + "verbose|v", &verbose, + "help", &help); + + file_size = string_to_size(file_size_string); + digest_size = string_to_size(digest_size_string); + } + catch (ConvException ex) { + writefln("Conversion error: %s", ex); + exit(2); + } + + if (verbose) { + writefln("file-size=%s, digest-size=%s", size_to_string(file_size), size_to_string(digest_size)); + } + + find_duplicates(args[1..$], file_size, digest_size, verbose); + + return 0; +}