annotate doodle/utils/prog/duplicates.d @ 112:b569d7d5064f

Added some utilities that are a work in progress.
author David Bryant <bagnose@gmail.com>
date Thu, 14 Apr 2011 11:27:17 +0930
parents
children 9cc6c428fdbe
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
1 import std.stdio;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
2 import std.string;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
3 import std.file;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
4 import std.c.stdio;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
5 import std.c.string;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
6 import std.cstream;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
7 import core.sys.posix.dirent;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
8 import std.md5;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
9
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
/**
 * Finds files with identical content beneath a directory.
 *
 * Every regular file reachable from the starting directory is hashed with
 * MD5. Files that share a digest are grouped together, and the groups that
 * contain more than one file are printed once the walk has finished.
 */
class DuplicateFinder {
    /**
     * Scans `dir` recursively and prints a report of the duplicates found.
     *
     * Params:
     *     dir = directory at which the recursive scan starts
     */
    this(in string dir) {
        recurse_directory(dir.dup);

        writefln("\n");

        // Only digests that were seen at least twice are in this set.
        foreach (digest; _duplicate_digests.keys) {
            writefln("%s", digestToString(digest));
            auto file_info = _file_info_map[digest];
            writefln("Size %s, Count %s, Digest %s",
                     file_info.size, file_info.names.length, digestToString(digest));
            foreach (name; file_info.names) {
                writefln("\t%s", name);
            }
        }
    }

    private {
        /// Size and names of all files that share one digest.
        struct FileInfo {
            this(in ulong size_, string first_name) {
                size = size_;
                names ~= first_name;
            }

            ulong size;      // size recorded for the first file seen with this digest
            string[] names;  // every file seen with this digest, in discovery order
        }

        // Duplicates whose recorded size is below this threshold are not reported.
        //static const ulong SIZE_THRESHOLD = 1_000;
        static const ulong SIZE_THRESHOLD = 0;

        bool[ubyte[16]] _duplicate_digests;    // set of all duplicate digests
        FileInfo[ubyte[16]] _file_info_map;    // map of digest to file info

        /**
         * Hashes one file and records it under its MD5 digest.
         *
         * The first file seen with a given digest only creates the map entry.
         * A digest is added to the duplicate set when a second (or later)
         * file with the same digest turns up and the recorded size meets
         * SIZE_THRESHOLD.
         *
         * Params:
         *     filename = path of the file to hash
         *     filesize = size of the file as reported by the directory entry
         */
        void compute_md5(in char[] filename, in ulong filesize) {
            //writefln("%s", filename);
            auto file = File(filename.idup, "r");
            scope(exit) file.close();

            ubyte[16] digest;

            // Feed the file through MD5 in 4 MiB chunks so that large files
            // are never held in memory in full.
            MD5_CTX context;
            context.start();
            foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
                context.update(buffer);
            }
            context.finish(digest);
            writefln("%s: %s", digestToString(digest), filename);

            if (FileInfo * file_info = (digest in _file_info_map)) {
                // duplicate
                file_info.names ~= filename.idup;

                if (file_info.size >= SIZE_THRESHOLD) {
                    _duplicate_digests[digest] = true;
                }
            }
            else {
                // unseen: remember the file, but do not flag the digest as a
                // duplicate until another file with the same digest appears
                _file_info_map[digest] = FileInfo(filesize, filename.idup);
                //writefln("%s", _file_info_map.length);
            }
        }

        /**
         * Callback handed to listdir: descends into directories and hashes
         * regular files. Other kinds of entry are ignored.
         *
         * Returns: always true, so that the directory listing continues.
         */
        bool entry_callback(DirEntry * de) {
            //writefln("File: %s", de.name);

            if (de.isdir) {
                recurse_directory(de.name);
            }
            else if (de.isfile) {
                try {
                    compute_md5(de.name, de.size);
                }
                catch (Exception ex) {
                    // A file that cannot be opened or read must not abort the
                    // whole scan; report it and carry on with the next entry.
                    stderr.writefln("Skipping %s: %s", de.name, ex.msg);
                }
            }

            return true;
        }

        /**
         * Lists `dirname`, passing every entry to entry_callback.
         *
         * A directory that cannot be listed is skipped without a message.
         */
        void recurse_directory(in char[] dirname) {
            //writefln("Dir: %s", dirname);

            try {
                listdir(dirname, &entry_callback);
            }
            catch (FileException ex) {
                // Deliberate best effort: skip directories that cannot be listed.
                //writefln("Skipping: %s", dirname);
            }
        }
    }
}
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
100
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
/**
 * Program entry point.
 *
 * Runs a separate duplicate scan for each directory named on the command
 * line; args[0] (the program name) is skipped.
 *
 * Returns: 0 in all cases.
 */
int main(string[] args) {
    auto directories = args[1 .. $];

    foreach (string directory; directories) {
        // Constructing the finder performs the scan and prints its report.
        new DuplicateFinder(directory);
    }

    return 0;
}