view doodle/utils/prog/duplicates.d @ 115:d7330cc52622

Added instructions to duplicates.d on the smallest changes required to trigger/untrigger the memory blowout. Interestingly, the blowout only occurs when compiled with -m32, not -m64.
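(Presumably reproduced with something along the lines of "dmd -m32 duplicates.d" versus "dmd -m64 duplicates.d"; the exact build command is not recorded in this changeset.)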
author David Bryant <bagnose@gmail.com>
date Sat, 16 Apr 2011 19:48:33 +0930
parents b87e2e0a046a
children 31c27f4f3bbc

import std.stdio;
import std.string;
import std.exception;
import std.random;
import std.algorithm;
import std.file;
import std.c.stdio;
import std.c.string;
import std.cstream;
import core.sys.posix.dirent;
import std.md5;

// This program recursively processes files in a list
// of directories, computing an MD5 digest on each file
// and then informing the user of files with duplicate content.
// Only duplicate files over a certain size are reported.
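//
// Hypothetical usage (the paths are placeholders):
//
//     ./duplicates ~/photos /mnt/backup
//
// Each group of files with identical content is printed, largest first.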

class DuplicateFinder {
    this(in string[] dirs) {
        // First pass to gather the number of files and bytes
        // so that we are able to convey progress to the user

        writeln("Accumulating total bytes / files");

        uint total_files = 0;

        try {
            foreach (string dir; dirs) {
                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                    try {
                        if (isFile(name)) {
                            _total_bytes += getSize(name);
                            ++total_files;
                        }
                    }
                    catch (Exception ex) {
                        writefln("Skipping %s", name);
                        //writefln("Exception %s", ex);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
        }
        catch (FileException ex) {
            // ignore
            writefln("dirEntries bailed out. Continuing anyway");
        }

        writefln("Files %s, bytes %s", total_files, _total_bytes);

        // Go through the files again, but this time
        // compute the MD5 digests and build our data structures

        writeln("Accumulating MD5 digests");

        foreach (string dir; dirs) {
            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                try {
                    if (isFile(name)) {
                        //writefln("MD5'ing %s", name);
                        compute_md5(name, getSize(name));
                    }
                }
                catch (FileException ex) {
                    writefln("Skipping %s", name);
                }
                catch (ErrnoException ex) {
                    //writefln("Skipping file: %s, %s", name, ex);
                    //writefln("(errno) Skipping file: %s", name);
                    // TODO accumulate errors and print after traversal is complete
                }
            }
        }

        writefln("");

        // Sort our duplicate digests by size so that we print
        // the biggest duplicate file offenders first

        writeln("Sorting duplicate digests by size");

        ubyte[16][] keys = _duplicate_digests.keys;
        bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) {
            return _file_info_map[a].size > _file_info_map[b].size;
        }
        sort!(compare_by_size)(keys);

        // Print the results out to the user, in descending order
        // of file size

        writeln("Printing results");

        writefln("Number of duplicate files: %s", _duplicate_digests.length);

        foreach (digest; keys) {
            auto file_info = _file_info_map[digest];
            /*
            writefln("Size %s, Count %s, Digest %s",
                     file_info.size, file_info.names.length, digestToString(digest));
                     */
            writefln("Size %s, Count %s", file_info.size, file_info.names.length);
            foreach (name; file_info.names) {
                writefln("\t%s", name);
            }
        }

        writeln("Done");
    }

    private {
        struct FileInfo {
            this(in ulong size_, string first_name) {
                size   = size_;
                names ~= first_name;
            }

            ulong    size;
            string[] names;
        }

        //static const ulong SIZE_THRESHOLD = 1_000;
        static const ulong SIZE_THRESHOLD = 0;

        bool[ubyte[16]]     _duplicate_digests;             // set of all duplicate digests
        FileInfo[ubyte[16]] _file_info_map;                 // map of digest to file info

        ulong               _total_bytes;
        ulong               _current_byte;
        double              _last_progress = -1.0;

        void compute_md5(in string filename, in ulong size) {
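            // Progress callback: accumulates the bytes hashed so far and
            // repaints the single-line percentage whenever it has advanced
            // by at least 0.05%.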
            void bytes_chewed(ulong bytes) {
                _current_byte += bytes;
                double progress = cast(double)_current_byte / cast(double)_total_bytes;
                if (progress - _last_progress > 0.0005) {
                    writef("\rProgress %.1f%%", 100.0 * progress);
                    std.stdio.stdout.flush();
                    _last_progress = progress;
                }
            }

            ubyte[16] digest;

            // If Block 1 and Block 2 are both uncommented then there is a memory explosion.
            // However, if either one is commented out then there isn't...
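            // (Unverified hypothesis: with -m32 the conservative GC is much
            // more likely to misinterpret unrelated 32-bit values as pointers
            // into the 4 MiB chunk buffers and keep them alive; the far
            // sparser 64-bit address space makes such accidental collisions
            // rare. Nothing in this changeset confirms this.)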

            {
                auto file = File(filename, "r");
                scope(exit) file.close;

                MD5_CTX context;
                context.start();
                { // Block 1:
                    // Compute the actual digest
                    foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
                        context.update(buffer);
                        bytes_chewed(buffer.length);
                    }
                }
                context.finish(digest);

                /+
                { // Block 1 alternative:
                    // Create a random digest
                    digest = make_random_digest;
                    bytes_chewed(size);
                }
                +/
            }

            { // Block 2:
                // Update the data structures
                if (FileInfo * file_info = (digest in _file_info_map)) {
                    // This is a duplicate digest, append the subsequent name
                    file_info.names ~= filename;

                    // Record the duplicate as an offender if its size meets or exceeds the threshold
                    if (file_info.size >= SIZE_THRESHOLD) {
                        _duplicate_digests[digest] = true;
                    }
                }
                else {
                    // We have not seen this digest before
                    _file_info_map[digest] = FileInfo(size, filename);
                }
            }
        }

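        // Fabricates a random 16-byte digest. Only used by the commented-out
        // "Block 1 alternative" above, to take real hashing out of the equation.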
        ubyte[16] make_random_digest() {
            ubyte[16] digest;
            foreach (ref a; digest) {
                a = cast(ubyte)uniform(0, 256);
            }
            return digest;
        }
    }
}

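// Every command-line argument is treated as a directory to scan recursively.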
int main(string[] args) {
    new DuplicateFinder(args[1..$]);

    return 0;
}