doodle/utils/prog/duplicates.d @ 115:d7330cc52622
Added instructions to duplicates.d on the smallest changes
required to trigger/untrigger the memory blowout.
Interestingly, the blowout only occurs when compiled with -m32, not -m64.
author   | David Bryant <bagnose@gmail.com>
date     | Sat, 16 Apr 2011 19:48:33 +0930
parents  | b87e2e0a046a
children | 31c27f4f3bbc
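The commit message pins the blowout to 32-bit builds. Assuming the reference dmd compiler (the toolchain is not named on this page), the comparison is between building with dmd -m32 duplicates.d and dmd -m64 duplicates.d, then running the resulting binaries over the same directory tree.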
import std.stdio;
import std.string;
import std.exception;
import std.random;
import std.algorithm;
import std.file;
import std.c.stdio;
import std.c.string;
import std.cstream;
import core.sys.posix.dirent;
import std.md5;

// This program recursively processes files in a list
// of directories, computing an MD5 digest on each file
// and then informing the user of files with duplicate content.
// Only duplicate files over a certain size are reported.

class DuplicateFinder {
    this(in string[] dirs) {
        // First pass to gather the number of files and bytes
        // so that we are able to convey progress to the user

        writeln("Accumulating total bytes / files");

        uint total_files = 0;

        try {
            foreach (string dir; dirs) {
                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                    try {
                        if (isFile(name)) {
                            _total_bytes += getSize(name);
                            ++total_files;
                        }
                    }
                    catch (Exception ex) {
                        writefln("Skipping %s", name);
                        //writefln("Exception %s", ex);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
        }
        catch (FileException ex) {
            // ignore
            writefln("dirEntries bailed out. Continuing anyway");
        }

        writefln("Files %s, bytes %s", total_files, _total_bytes);

        // Go through the files again, but this time
        // compute the MD5 digests and build our data structures

        writeln("Accumulating MD5 digests");

        foreach (string dir; dirs) {
            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                try {
                    if (isFile(name)) {
                        //writefln("MD5'ing %s", name);
                        compute_md5(name, getSize(name));
                    }
                }
                catch (FileException ex) {
                    writefln("Skipping %s", name);
                }
                catch (ErrnoException ex) {
                    //writefln("Skipping file: %s, %s", name, ex);
                    //writefln("(errno) Skipping file: %s", name);
                    // TODO accumulate errors and print after traversal is complete
                }
            }
        }

        writefln("");

        // Sort our duplicate digests by size so that we print
        // the biggest duplicate file offenders first

        writeln("Sorting duplicate digests by size");

        ubyte[16][] keys = _duplicate_digests.keys;

        bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) {
            return _file_info_map[a].size > _file_info_map[b].size;
        }

        sort!(compare_by_size)(keys);

        // Print the results out to the user, in descending order
        // of file size

        writeln("Printing results");

        writefln("Number of duplicate files: %s", _duplicate_digests.length);

        foreach (digest; keys) {
            auto file_info = _file_info_map[digest];

            /*
            writefln("Size %s, Count %s, Digest %s",
                     file_info.size, file_info.names.length, digestToString(digest));
            */

            writefln("Size %s, Count %s", file_info.size, file_info.names.length);

            foreach (name; file_info.names) {
                writefln("\t%s", name);
            }
        }

        writeln("Done");
    }

    private {
        struct FileInfo {
            this(in ulong size_, string first_name) {
                size = size_;
                names ~= first_name;
            }

            ulong size;
            string[] names;
        }

        //static const ulong SIZE_THRESHOLD = 1_000;
        static const ulong SIZE_THRESHOLD = 0;

        bool[ubyte[16]] _duplicate_digests;   // set of all duplicate digests
        FileInfo[ubyte[16]] _file_info_map;   // map of digest to file info

        ulong _total_bytes;
        ulong _current_byte;
        double _last_progress = -1.0;

        void compute_md5(in string filename, in ulong size) {
            void bytes_chewed(ulong bytes) {
                _current_byte += bytes;
                double progress = cast(double)_current_byte / cast(double)_total_bytes;
                if (progress - _last_progress > 0.0005) {
                    writef("\rProgress %.1f%%", 100.0 * progress);
                    std.stdio.stdout.flush();
                    _last_progress = progress;
                }
            }

            ubyte[16] digest;

            // If Block 1 and Block 2 are both uncommented then there is a memory explosion.
            // However, if either one is commented out then there isn't...
            {
                auto file = File(filename, "r");
                scope(exit) file.close;

                MD5_CTX context;
                context.start();

                {
                    // Block 1:
                    // Compute the actual digest
                    foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
                        context.update(buffer);
                        bytes_chewed(buffer.length);
                    }
                }

                context.finish(digest);

                /+
                {
                    // Block 1 alternative:
                    // Create a random digest
                    digest = make_random_digest;
                    bytes_chewed(size);
                }
                +/
            }

            {
                // Block 2:
                // Update the data structures
                if (FileInfo * file_info = (digest in _file_info_map)) {
                    // This is a duplicate digest, append the subsequent name
                    file_info.names ~= filename;

                    // Record the duplicate as an offender if its size exceeds the threshold
                    if (file_info.size >= SIZE_THRESHOLD) {
                        _duplicate_digests[digest] = true;
                    }
                }
                else {
                    // We have not seen this digest before
                    _file_info_map[digest] = FileInfo(size, filename);
                }
            }
        }

        ubyte[16] make_random_digest() {
            ubyte[16] digest;
            foreach (ref a; digest) {
                a = cast(ubyte)uniform(0, 256);
            }
            return digest;
        }
    }
}

int main(string[] args) {
    new DuplicateFinder(args[1..$]);
    return 0;
}
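The Block 1 / Block 2 comments in compute_md5 isolate the trigger: chunked file reads combined with insertion into an associative array keyed by the digest. Below is a minimal sketch distilled from this file (an illustration, not part of the repository) that keeps only those two pieces. A plausible but unconfirmed explanation for the -m32-only behaviour is the conservative garbage collector misreading 32-bit word patterns in the large chunk buffers as pointers and pinning dead memory; on 64-bit, accidental pointer-like patterns are far rarer.

import std.stdio;
import std.md5;

void main(string[] args) {
    bool[ubyte[16]] seen;   // digest-keyed AA, as in Block 2

    foreach (name; args[1..$]) {
        auto file = File(name, "r");
        scope(exit) file.close;

        MD5_CTX context;
        context.start();

        // Block 1 analogue: stream the file in large chunks
        foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
            context.update(buffer);
        }

        ubyte[16] digest;
        context.finish(digest);

        seen[digest] = true;   // Block 2 analogue: AA insertion
    }

    writefln("%s distinct digests", seen.length);
}

If the sketch faithfully captures the pattern, then per the commit message a -m32 build should show unbounded memory growth over a large file set while a -m64 build should not, and deleting either the chunk loop or the AA insertion should make the growth disappear, mirroring the effect of commenting out Block 1 or Block 2 above.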