// doodle/utils/prog/dupes.d @ 117:c566cdbccaeb
// "Added dupes, the rewrite."
// David Bryant <bagnose@gmail.com>, Tue, 19 Apr 2011 23:11:03 +0930

import std.stdio;
import std.exception;
import std.file;
import std.md5;


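// Finds duplicate files under a set of directories.
//
// Strategy: recursively collect every file of at least SIZE_THRESHOLD bytes,
// bucket the files by size (files of different sizes cannot be duplicates),
// then hash the leading MD5_AMOUNT bytes of each same-size candidate and
// report the groups whose digests match.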
class DuplicateFinder {
    this(in string[] dirs) {
        writefln("Accumulating files");

        string last_name;

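        // Pass 1: walk each directory tree and record the name and size of
        // every sufficiently large file.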
        foreach (string dir; dirs) {
            try {
                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                    last_name = name;
                    try {
                        if (isFile(name)) {
                            ulong size = getSize(name);
                            if (size >= SIZE_THRESHOLD) {
                                _file_array ~= FileInfo(name, size);
                            }
                        }
                    }
                    catch (Exception ex) {
                        writefln("Skipping %s", name);
                        //writefln("Exception %s", ex);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
            catch (FileException ex) {
                // ignore
                writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
            }
        }

        writefln("Processing %s files", _file_array.length);

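        // Pass 2: bucket file indices by size; any size seen more than once
        // marks a group of candidate duplicates.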
        size_t[][ulong] size_to_file_indices;
        bool[ulong]     duplicate_sizes;

        foreach (index, file; _file_array) {
            //writefln("%s %s %s", index, file.name, file.size);

            if (size_t[]* indices = (file.size in size_to_file_indices)) {
                if (indices.length == 1) {
                    // Second time we've seen a file of this size,
                    // record it in the duplicate_sizes array
                    duplicate_sizes[file.size] = true;
                }

                (*indices) ~= index;
            }
            else {
                size_to_file_indices[file.size] = [ index ];
            }
        }

        writefln("Number of files of duplicate size %s", duplicate_sizes.length);

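        // Pass 3: within each candidate group, bucket files by the digest of
        // their leading bytes and report buckets holding two or more files.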
        foreach (size; duplicate_sizes.keys) {
            size_t[] indices = size_to_file_indices[size];
            //writefln("For size %s there are %s files", size, indices.length);

            size_t[][ubyte[16]] digest_to_indices;

            foreach (index; indices) {
                FileInfo file_info = _file_array[index];

                try {
                    ubyte[16] digest = compute_md5(file_info.name);

                    if (size_t[]* duplicate_indices = (digest in digest_to_indices)) {
                        // A true duplicate: same size and same digest as a
                        // file seen earlier, so record this index too.
                        (*duplicate_indices) ~= index;
                    }
                    else {
                        digest_to_indices[digest] ~= index;
                    }
                }
                catch (ErrnoException ex) {
                    //writefln("Skipping: %s", file_info.name);
                }

                //writefln("\t%s", file_info.name);
            }

            // Report each group of two or more files that share a digest
            foreach (indices2; digest_to_indices) {
                if (indices2.length > 1) {
                    // List the duplicates
                    foreach (index; indices2) {
                        FileInfo file_info = _file_array[index];
                        writefln("%s %s", file_info.size, file_info.name);
                    }
                    writefln("");
                }
            }
        }

        writefln("Done\n");
    }

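    // Compute the MD5 digest of at most the first MD5_AMOUNT bytes of the
    // named file. Throws ErrnoException if the file cannot be opened or read.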
    ubyte[16] compute_md5(in string name) {
        ubyte[16] digest;

        auto file = File(name, "r");
        scope(exit) file.close;

        MD5_CTX context;
        context.start();
        {
            // Hash at most the first MD5_AMOUNT bytes of the file
            ulong amount = 0;
            foreach (ubyte[] buffer; chunks(file, 1024)) {
                context.update(buffer);
                //bytes_chewed(buffer.length);
                amount += buffer.length;
                if (amount >= MD5_AMOUNT) {
                    break;
                }
            }
        }
        context.finish(digest);

        return digest;
    }

    private {
        immutable ulong KILO = 1 << 10;
        immutable ulong MEGA = 1 << 20;

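        // Files smaller than SIZE_THRESHOLD are ignored, and only the first
        // MD5_AMOUNT bytes of each candidate are hashed, so files differing
        // only beyond that prefix may still be reported as duplicates.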
        immutable ulong SIZE_THRESHOLD = 100 * KILO;
        immutable ulong MD5_AMOUNT     = 10 * KILO;

        struct FileInfo {
            this(in string name_, in ulong size_) {
                name = name_;
                size = size_;
            }

            string name;
            ulong  size;
        }

        FileInfo[] _file_array;
    }
}

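// Usage: dupes DIR [DIR ...]
// Every command-line argument is treated as a directory to scan.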
int main(string[] args) {
    new DuplicateFinder(args[1..$]);

    return 0;
}