view doodle/utils/prog/duplicates.d @ 113:9cc6c428fdbe

Rewrote duplicates.d based on dirEntries. Removed all the idup/dup calls that are no longer needed. Still blows up on memory usage.
author David Bryant <bagnose@gmail.com>
date Thu, 14 Apr 2011 19:10:46 +0930

import std.stdio;
import std.exception;
import std.algorithm;
import std.file;
import std.md5;

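/// Finds files with identical content under a directory tree by comparing
/// MD5 digests of their contents, and prints each duplicate group.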
class DuplicateFinder {
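    /// Scans `dir` in two passes: the first totals files and bytes so a
    /// progress meter can be shown, the second MD5s every file; duplicate
    /// groups are then reported, largest first.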
    this(in string dir) {
        // First pass to gather the number of files and bytes

        writeln("Accumulating total bytes / files");

        uint total_files = 0;

        try {
            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                try {
                    if (isFile(name)) {
                        _total_bytes += getSize(name);
                        ++total_files;
                    }
                }
                catch (Exception ex) {
                    writefln("Skipping %s", name);
                    //writefln("Exception %s", ex);
                }
            }
        }
        catch (FileException ex) {
            // ignore
            writefln("dirEntries bailed out. Continuing anyway");
        }

        writefln("Files %s, bytes %s", total_files, _total_bytes);
        writeln("Accumulating MD5 sums");

        foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
            try {
                if (isFile(name)) {
                    //writefln("MD5'ing %s", name);
                    compute_md5(name);
                }
            }
            catch (Exception ex) {
                // isFile and File() can both throw on unreadable entries;
                // skip them, as the first pass does
                //writefln("Skipping file: %s, %s", name, ex);
                // TODO accumulate errors and print after traversal is complete
            }
        }

        writeln();          // finish the progress meter's line

        writeln("Sorting keys");

        ubyte[16][] keys = _duplicate_digests.keys;
        // Order the duplicate groups by file size, largest first
        bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) {
            return _file_info_map[a].size > _file_info_map[b].size;
        }
        sort!(compare_by_size)(keys);

        writeln("Printing results");

        foreach (digest; keys) {
            auto file_info = _file_info_map[digest];
            /*
            writefln("Size %s, Count %s, Digest %s",
                     file_info.size, file_info.names.length, digestToString(digest));
                     */
            writefln("Size %s, Count %s", file_info.size, file_info.names.length);
            foreach (name; file_info.names) {
                writefln("\t%s", name);
            }
        }

        writeln("Done");
    }

    private {
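        /// A file's size plus every path at which that content was seen.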
        struct FileInfo {
            this(in ulong size_, string first_name) {
                size   = size_;
                names ~= first_name;
            }

            ulong    size;
            string[] names;
        }

        // Only report duplicates whose size is at least this many bytes
        //static const ulong SIZE_THRESHOLD = 1_000;
        static const ulong SIZE_THRESHOLD = 0;

        bool[ubyte[16]]     _duplicate_digests;             // set of all duplicate digests
        FileInfo[ubyte[16]] _file_info_map;                 // map of digest to file info, one entry per file scanned

        ulong               _total_bytes;                   // bytes found by the first pass
        ulong               _current_byte;                  // bytes hashed so far by the second pass
        double              _last_progress = -1.0;          // progress fraction last printed

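        // Update the progress meter, re-printing it once it has advanced by
        // more than 0.05% of the total.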
        void bytes_chewed(ulong bytes) {
            _current_byte += bytes;
            if (_total_bytes == 0) return;      // avoid division by zero on an empty tree
            double progress = cast(double)_current_byte / cast(double)_total_bytes;
            if (progress - _last_progress > 0.0005) {
                writef("\rProgress %3.1f%%", 100.0 * progress);
                std.stdio.stdout.flush();
                _last_progress = progress;
            }
        }

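        // MD5 the file in 4 MiB chunks, feeding the progress meter, then
        // record the digest; a digest seen a second time marks a duplicate set.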
        void compute_md5(in string filename) {
            //writefln("%s", filename);
            auto file = File(filename, "r");
            scope(exit) file.close();

            ubyte[16] digest;

            MD5_CTX context;
            context.start();
            foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
                bytes_chewed(buffer.length);
                context.update(buffer);
            }
            context.finish(digest);
            //writefln("%s: %s", digestToString(digest), filename);

            if (FileInfo* file_info = digest in _file_info_map) {
                // duplicate
                file_info.names ~= filename;
                assert(file_info.names.length > 1);

                if (file_info.size >= SIZE_THRESHOLD) {
                    _duplicate_digests[digest] = true;
                }
            }
            else {
                // unseen
                _file_info_map[digest] = FileInfo(getSize(filename), filename);
                //writefln("%s", _file_info_map.length);
            }
        }
    }
}

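// Scans each directory named on the command line for duplicate files.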
int main(string[] args) {
    foreach (string arg; args[1..$]) {
        new DuplicateFinder(arg);
    }

    return 0;
}