view doodle/utils/prog/duplicates.d @ 112:b569d7d5064f

Added some utilities that are a work in progress.
author David Bryant <bagnose@gmail.com>
date Thu, 14 Apr 2011 11:27:17 +0930
parents
children 9cc6c428fdbe
line wrap: on
line source

import std.stdio;
import std.string;
import std.file;
import std.c.stdio;
import std.c.string;
import std.cstream;
import core.sys.posix.dirent;
import std.md5;

// Walks a directory tree, computes the MD5 digest of every regular file found,
// and prints each group of files that share the same digest.
class DuplicateFinder {
    // Scans `dir` recursively, then writes the duplicate report to stdout.
    // All of the work happens in the constructor; the object has no public
    // methods.
    this(in string dir) {
        recurse_directory(dir.dup);

        writefln("\n");

        // Only digests that were seen more than once are in the set.
        foreach (digest; _duplicate_digests.keys) {
            writefln("%s", digestToString(digest));
            auto file_info = _file_info_map[digest];
            writefln("Size %s, Count %s, Digest %s",
                     file_info.size, file_info.names.length, digestToString(digest));
            foreach (name; file_info.names) {
                writefln("\t%s", name);
            }
        }
    }

    private {
        // Everything recorded about one distinct digest: the size reported
        // for the first file that produced it, and the names of all files
        // that hashed to it.
        struct FileInfo {
            this(in ulong size_, string first_name) {
                size   = size_;
                names ~= first_name;
            }

            ulong    size;
            string[] names;
        }

        // Duplicates whose recorded size is below this value are not reported.
        // With the value 0 every duplicate is reported.
        //static const ulong SIZE_THRESHOLD = 1_000;
        static const ulong SIZE_THRESHOLD = 0;

        bool[ubyte[16]]     _duplicate_digests;             // set of all duplicate digests
        FileInfo[ubyte[16]] _file_info_map;                 // map of digest to file info

        // Hashes the file `filename`, prints "<digest>: <filename>", and
        // records the file under its digest. `filesize` is stored only the
        // first time a digest is seen.
        void compute_md5(in char[] filename, in ulong filesize) {
            //writefln("%s", filename);
            auto file = File(filename.idup, "r");
            scope(exit) file.close();

            ubyte[16] digest;

            // Feed the file to MD5 in 4 MiB chunks so that large files are
            // never held in memory in full.
            MD5_CTX context;
            context.start();
            foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
                context.update(buffer);
            }
            context.finish(digest);
            writefln("%s: %s", digestToString(digest), filename);

            if (FileInfo * file_info = (digest in _file_info_map)) {
                // duplicate: this digest has been produced by an earlier file
                file_info.names ~= filename.idup;

                if (file_info.size >= SIZE_THRESHOLD) {
                    _duplicate_digests[digest] = true;
                }
            }
            else {
                // unseen: remember the file, but do NOT mark the digest as a
                // duplicate -- a digest only becomes a duplicate when a second
                // file produces it (handled in the branch above).
                _file_info_map[digest] = FileInfo(filesize, filename.idup);
                //writefln("%s", _file_info_map.length);
            }
        }

        // Callback handed to listdir: descends into directories and hashes
        // regular files. Entries that are neither are ignored. Always returns
        // true.
        bool entry_callback(DirEntry * de) {
            //writefln("File: %s", de.name);

            if (de.isdir) {
                recurse_directory(de.name);
            }
            else if (de.isfile) {
                compute_md5(de.name, de.size);
            }

            return true;
        }

        // Lists `dirname` and dispatches every entry to entry_callback.
        // A FileException raised while listing is swallowed on purpose, so
        // that one failing directory does not abort the whole scan.
        void recurse_directory(in char[] dirname) {
            //writefln("Dir: %s", dirname);

            try {
                listdir(dirname, &entry_callback);
            }
            catch (FileException ex) {
                //writefln("Skipping: %s", dirname);
            }
        }
    }
}

// Program entry point. Every command-line argument after the first element
// (conventionally the program name) is treated as a directory and scanned by
// constructing a DuplicateFinder for it. Always returns 0.
int main(string[] args) {
    auto directories = args[1..$];

    foreach (string directory; directories) {
        // Constructing the finder performs the scan and prints the report.
        new DuplicateFinder(directory);
    }

    return 0;
}