# HG changeset patch
# User David Bryant
# Date 1303629146 -34200
# Node ID f1cf62339ed5b1d7d2c1ef854c0bce5802bbdab7
# Parent  c275f26399c6c2f5fd450fc8c34b50c50c8a5c75
More tweaking

diff -r c275f26399c6 -r f1cf62339ed5 doodle/utils/prog/dupes.d
--- a/doodle/utils/prog/dupes.d Fri Apr 22 00:06:07 2011 +0930
+++ b/doodle/utils/prog/dupes.d Sun Apr 24 16:42:26 2011 +0930
@@ -1,33 +1,83 @@
 import std.stdio;
 import std.string;
 import std.exception;
+import std.algorithm;
 import std.file;
 import std.md5;
 import std.getopt;
 import std.conv;
+import std.ctype;
 import std.c.stdlib;
 
+ulong string_to_size(string s) {
+    // Convert strings to sizes, eg:
+    //   "50"  -> 50
+    //   "80B" -> 80
+    //   "10K" -> 10240
+    //   "1M"  -> 1048576
+    // Throws ConvException
+
+    immutable map = [ 'B':1UL, 'K':1UL<<10, 'M':1UL<<20, 'G':1UL<<30, 'T':1UL<<40 ];
+
+    if (s.length == 0) {
+        throw new ConvException("Empty string");
+    }
+    else {
+        ulong multiplier = 1;
+
+        if (isalpha(s[$-1])) {
+            immutable ulong * m = (s[$-1] in map);
+
+            if (m) {
+                multiplier = *m;
+            }
+            else {
+                throw new ConvException(format("Bad size unit character: %s", s[$-1]));
+            }
+
+            s = s[0..$-1];
+        }
+
+        return multiplier * to!ulong(s);
+    }
+}
+
+string size_to_string(in ulong size) {
+    /+
+    immutable array = [ 'B', 'K', 'M', 'G', 'T' ];
+    size_t index = 0;
+
+    foreach (i, c; array) {
+        if (size / (1UL << i
+
+        writefln("%s %s", i, c);
+    }
+    +/
+
+    return format("%sK", size / 1024);
+}
+
 void find_duplicates(in string[] dirs, in ulong file_size, in ulong digest_size, bool verbose) {
-    static ubyte[16] compute_md5(in string name, in ulong max_bytes) {
+    static ubyte[16] compute_md5(in string filename, in ulong max_bytes) {
+        size_t chunk_size = min(max_bytes, 4096 * 1024);
         ubyte[16] digest;
-        auto file = File(name, "r");
+        auto file = File(filename, "r");
         scope(exit) file.close;
         MD5_CTX context;
         context.start();
         ulong byte_count = 0;
-        foreach (ubyte[] buffer; chunks(file, 1024)) {
+        foreach (ubyte[] buffer; chunks(file, chunk_size)) {
             context.update(buffer);
             byte_count += buffer.length;
             if (byte_count >= max_bytes) {
                 break;
             }
         }
-
         context.finish(digest);
         return digest;
@@ -42,30 +92,34 @@
 
     writefln("Accumulating file list");
 
-    string last_name;
-
     foreach (string dir; dirs) {
-        try {
-            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-                last_name = name;
-                try {
-                    if (!isSymLink(name) && isFile(name)) {
-                        ulong size = getSize(name);
-                        if (size >= file_size) {
-                            file_array ~= FileInfo(name, size);
+        if (isDir(dir)) {
+            string last_entry;
+            try {
+                foreach (string filename; dirEntries(dir, SpanMode.depth, false)) {
+                    last_entry = filename;
+                    try {
+                        if (!isSymLink(filename) && isFile(filename)) {
+                            ulong size = getSize(filename);
+                            if (size >= file_size) {
+                                file_array ~= FileInfo(filename, size);
+                            }
                         }
                     }
-                }
-                catch (Exception ex) {
-                    writefln("Skipping %s", name);
-                    //writefln("Exception %s", ex);
-                    // TODO accumulate errors and print after traversal
+                    catch (Exception ex) {
+                        writefln("Skipping %s", filename);
+                        //writefln("Exception %s", ex);
+                        // TODO accumulate errors and print after traversal
+                    }
                 }
            }
+            catch (FileException ex) {
+                // ignore
+                writefln("Error, dirEntries bailed out after: %s. Continuing anyway", last_entry);
+            }
         }
-        catch (FileException ex) {
-            // ignore
-            writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
+        else {
+            writefln("Not a dir: %s", dir);
         }
     }
 
@@ -93,7 +147,9 @@
 
     writefln("Number of files of duplicate size %s", duplicate_sizes.length);
 
-    foreach (size; duplicate_sizes.keys) {
+    ulong total_waste = 0;
+
+    foreach_reverse (size; duplicate_sizes.keys.sort) {
         uint[] indices = size_to_file_indices[size];
 
         //writefln("For size %s there are %s files", size, indices.length);
@@ -125,49 +181,61 @@
         foreach (indices2; digest_to_indices) {
             if (indices2.length > 1) {
                 // List the duplicates
-                foreach (index; indices) {
+                foreach (i, index; indices) {
                     FileInfo file_info = file_array[index];
-                    writefln("%s %s", file_info.size, file_info.name);
+                    if (i == 0) {
+                        writefln("%s", size_to_string(file_info.size));
+                        total_waste += file_info.size;
+                    }
+                    writefln(" %s", file_info.name);
                 }
                 writefln("");
            }
         }
     }
 
-    writefln("Done");
+    writefln("Done, total waste: %s", size_to_string(total_waste));
 }
 
 int main(string[] args) {
-    immutable ulong KILO = 1 << 10;
-    immutable ulong MEGA = 1 << 20;
-    immutable ulong GIGA = 1 << 30;
+    ulong file_size;
+    ulong digest_size;
+    bool verbose;
 
-    /*
-    static ulong parse_size_string(in string[] s) {
-        if (s.length == 0) {
-            throw new ConvException
+    try {
+        void help(in string) {
+            writefln("Usage: dupes [OPTION]... DIR...\n"
+                     "Recursively locate duplicate files in a list of directories\n"
+                     "\n"
+                     "Options\n"
+                     "  -d, --digest-size=SIZE   size of digest used for comparison\n"
+                     "  -f, --file-size=SIZE     minimum size of files searched for duplication\n"
+                     "  -v, --verbose            be verbose\n"
+                     "      --help               display this help and exit\n"
+                     "\n"
+                     "SIZE is an integer, optionally followed by K, M, G, T");
+            exit(1);
        }
+
+        string file_size_string = "100K";
+        string digest_size_string = "100K";
+
+        getopt(args,
+               "file-size|f", &file_size_string,
+               "digest-size|d", &digest_size_string,
+               "verbose|v", &verbose,
+               "help", &help);
+
+        file_size = string_to_size(file_size_string);
+        digest_size = string_to_size(digest_size_string);
     }
-    */
-
-    void help(in string) {
-        writefln("Help");
-        exit(1);
+    catch (ConvException ex) {
+        writefln("Conversion error: %s", ex);
+        exit(2);
     }
 
-    ulong file_size = 100 * KILO;
-    ulong digest_size = 10 * KILO;
-    bool verbose = false;
-
-    try {
-        getopt(args,
-               "file-size|f", &file_size,
-               "digest-size|d", &digest_size,
-               "verbose|v", &verbose,
-               "help|h", &help);
-    }
-    catch (ConvException ex) {
-
+    if (verbose) {
+        writefln("file-size=%s, digest-size=%s", size_to_string(file_size), size_to_string(digest_size));
     }
 
     find_duplicates(args[1..$], file_size, digest_size, verbose);