view doodle/utils/prog/dupes.d @ 120:c275f26399c6

Tinkerings
author David Bryant <bagnose@gmail.com>
date Fri, 22 Apr 2011 00:06:07 +0930
parents 8343c1dafac6
children f1cf62339ed5

import std.stdio;
import std.string;
import std.exception;
import std.file;
import std.md5;
import std.getopt;
import std.conv;
import std.c.stdlib;

void find_duplicates(in string[] dirs,
                     in ulong    file_size,
                     in ulong    digest_size,
                     bool        verbose) {
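    // Strategy: pass 1 collects regular files of at least file_size bytes
    // under dirs and buckets them by size; pass 2 MD5-digests the first
    // digest_size bytes of each file in a same-size bucket and reports groups
    // sharing a digest. (verbose is currently unused.)

    // Digest at most max_bytes of the file so that large files need not be
    // read in full.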
    static ubyte[16] compute_md5(in string name, in ulong max_bytes) {
        ubyte[16] digest;

        auto file = File(name, "r");
        scope(exit) file.close;

        MD5_CTX context;
        context.start();
        ulong byte_count = 0;
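        // Read the file in 1 KiB chunks and stop once max_bytes have been
        // hashed; the digest may include data up to the end of the chunk in
        // which max_bytes is reached.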
        foreach (ubyte[] buffer; chunks(file, 1024)) {
            context.update(buffer);
            byte_count += buffer.length;
            if (byte_count >= max_bytes) {
                break;
            }
        }

        context.finish(digest);

        return digest;
    }

    struct FileInfo {
        string name;
        ulong  size;
    }

    FileInfo[] file_array;

    writefln("Accumulating file list");

    string last_name;

    foreach (string dir; dirs) {
        try {
            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                last_name = name;
                try {
                    if (!isSymLink(name) && isFile(name)) {
                        ulong size = getSize(name);
                        if (size >= file_size) {
                            file_array ~= FileInfo(name, size);
                        }
                    }
                }
                catch (Exception ex) {
                    writefln("Skipping %s", name);
                    //writefln("Exception %s", ex);
                    // TODO accumulate errors and print after traversal
                }
            }
        }
        catch (FileException ex) {
            // Directory traversal failed part-way through; report and move on
            writefln("dirEntries bailed out after %s. Continuing anyway", last_name);
        }
    }

    writefln("Processing %s files", file_array.length);

    uint[][ulong] size_to_file_indices;
    bool[ulong]   duplicate_sizes;
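    // size_to_file_indices maps a file size to the indices (into file_array)
    // of the files with that size; duplicate_sizes records, as a set, the
    // sizes seen more than once.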

    foreach (index, file; file_array) {
        //writefln("%s %s %s", index, file.name, file.size);

        if (uint[] * indices = (file.size in size_to_file_indices)) {
            if (indices.length == 1) {
                // Second time we've seen a file of this size;
                // record the size in the duplicate_sizes set
                duplicate_sizes[file.size] = true;
            }

            (*indices) ~= index;
        }
        else {
            size_to_file_indices[file.size] = [ index ];
        }
    }

    writefln("Number of sizes with more than one file: %s", duplicate_sizes.length);

    foreach (size; duplicate_sizes.keys) {
        uint[] indices = size_to_file_indices[size];
        //writefln("For size %s there are %s files", size, indices.length);

        uint[][ubyte[16]] digest_to_indices;

        foreach (index; indices) {
            const FileInfo file_info = file_array[index];

            try {
                ubyte[16] digest = compute_md5(file_info.name, digest_size);

                if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
                    // Same size and same leading-bytes digest: add this file
                    // to the existing duplicate group
                    (*duplicate_indices) ~= index;
                }
                else {
                    digest_to_indices[digest] ~= index;
                }
            }
            catch (ErrnoException ex) {
                //writefln("Skipping: %s", file_info.name);
            }

            //writefln("\t%s", file_info.name);
        }

        foreach (duplicate_group; digest_to_indices) {
            if (duplicate_group.length > 1) {
                // List this group of duplicates
                foreach (index; duplicate_group) {
                    FileInfo file_info = file_array[index];
                    writefln("%s %s", file_info.size, file_info.name);
                }
                writefln("");
            }
        }
    }

    writefln("Done");
}
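
// Example invocation (a sketch; assumes the binary is built as "dupes" and the
// directories are placeholders):
//
//   dmd dupes.d
//   ./dupes --file-size=1048576 --digest-size=4096 ~/photos ~/backups
//
// This prints groups of same-size files whose leading 4096 bytes share an MD5
// digest.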

int main(string[] args) {
    immutable ulong KILO = 1 << 10;
    immutable ulong MEGA = 1 << 20;
    immutable ulong GIGA = 1 << 30;

    /*
    // TODO: parse size strings with K/M/G suffixes (e.g. "100K")
    static ulong parse_size_string(in string s) {
        if (s.length == 0) {
            throw new ConvException("empty size string");
        }
        // ...
    }
    */

    void help(in string) {
        writefln("Usage: dupes [options] DIR...");
        writefln("  --file-size|-f    minimum size of files to consider, in bytes");
        writefln("  --digest-size|-d  number of leading bytes to digest, in bytes");
        writefln("  --verbose|-v      verbose output");
        writefln("  --help|-h         print this help and exit");
        exit(1);
    }

    ulong file_size   = 100 * KILO;
    ulong digest_size =  10 * KILO;
    bool  verbose     = false;
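    // Defaults: skip files smaller than 100 KiB and digest only the first
    // 10 KiB of each candidate.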

    try {
         getopt(args,
                "file-size|f",   &file_size,
                "digest-size|d", &digest_size,
                "verbose|v",     &verbose,
                "help|h",        &help);
    }
    catch (ConvException ex) {
        writefln("Bad option value (%s)", ex.msg);
        exit(1);
    }

    find_duplicates(args[1..$], file_size, digest_size, verbose);

    return 0;
}