annotate doodle/utils/prog/dupes.d @ 117:c566cdbccaeb

Added dupes, the rewrite.
author David Bryant <bagnose@gmail.com>
date Tue, 19 Apr 2011 23:11:03 +0930
parents
children 94233d54e16a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
117
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
1 import std.stdio;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
2 import std.string;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
3 import std.exception;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
4 import std.random;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
5 import std.algorithm;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
6 import std.file;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
7 import std.c.stdio;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
8 import std.c.string;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
9 import std.cstream;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
10 import core.sys.posix.dirent;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
11 import std.md5;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
12
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
13
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
/**
 * Scans directory trees and reports groups of files that appear to be
 * duplicates of one another.
 *
 * All of the work is done by the constructor, which prints its progress and
 * its findings to standard output.
 *
 * Candidate duplicates are found in two passes:
 *   1. files are grouped by exact size (files smaller than SIZE_THRESHOLD
 *      are ignored);
 *   2. within each size group holding more than one file, files are grouped
 *      by the MD5 digest of their leading bytes (see compute_md5).
 *
 * Because only the leading bytes are hashed, a reported group means "same
 * size and same leading bytes"; it is not a byte-for-byte comparison.
 */
class DuplicateFinder {
    /**
     * Walks every directory in dirs, then prints each group of probable
     * duplicates as "<size> <name>" lines followed by an empty line.
     *
     * Entries that cannot be examined are reported with "Skipping ..." and
     * ignored. If the traversal of a directory fails part-way, a message is
     * printed and the scan carries on with the next directory.
     *
     * Params:
     *   dirs = root directories to scan recursively
     */
    this(in string[] dirs) {
        writefln("Accumulating files");

        foreach (string dir; dirs) {
            // Last path handed out by dirEntries for this root. It starts as
            // the root itself so that a failure before the first entry still
            // names something meaningful in the message below.
            string last_name = dir;

            try {
                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                    last_name = name;
                    try {
                        if (isFile(name)) {
                            ulong size = getSize(name);
                            if (size >= SIZE_THRESHOLD) {
                                _file_array ~= FileInfo(name, size);
                            }
                        }
                    }
                    catch (Exception ex) {
                        writefln("Skipping %s", name);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
            catch (FileException ex) {
                // The traversal itself failed; keep what was gathered so far.
                writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
            }
        }

        writefln("Processing %s files", _file_array.length);

        // Indices into _file_array, keyed by file size. size_t is used because
        // that is the type of the foreach index below; a narrower element type
        // does not accept it on 64-bit targets.
        size_t[][ulong] size_to_file_indices;
        // Set of sizes that are shared by at least two files.
        bool[ulong] duplicate_sizes;

        foreach (index, file; _file_array) {
            if (auto same_size = (file.size in size_to_file_indices)) {
                if ((*same_size).length == 1) {
                    // Second time we've seen a file of this size,
                    // record it in the duplicate_sizes set
                    duplicate_sizes[file.size] = true;
                }

                (*same_size) ~= index;
            }
            else {
                size_to_file_indices[file.size] = [ index ];
            }
        }

        // Note: the value printed is the number of distinct sizes that occur
        // more than once, not a count of files.
        writefln("Number of files of duplicate size %s", duplicate_sizes.length);

        foreach (size; duplicate_sizes.keys) {
            size_t[] indices = size_to_file_indices[size];

            // Indices of the files of this size, keyed by digest.
            size_t[][ubyte[16]] digest_to_indices;

            foreach (index; indices) {
                FileInfo file_info = _file_array[index];

                try {
                    ubyte[16] digest = compute_md5(file_info.name);

                    // Appending through a missing key creates the entry, so
                    // first and subsequent sightings are handled alike.
                    digest_to_indices[digest] ~= index;
                }
                catch (ErrnoException ex) {
                    // The file could not be opened or read: it is silently
                    // left out of the comparison.
                }
            }

            foreach (indices2; digest_to_indices) {
                if (indices2.length > 1) {
                    // List only the files that share this digest, not every
                    // file that merely has the same size.
                    foreach (index; indices2) {
                        FileInfo file_info = _file_array[index];
                        writefln("%s %s", file_info.size, file_info.name);
                    }
                    writefln("");
                }
            }
        }

        writefln("Done\n");
    }

    /**
     * Computes an MD5 digest over the beginning of a file.
     *
     * The file is read in 1024-byte chunks and reading stops once at least
     * MD5_AMOUNT bytes have been consumed, so only the head of a large file
     * contributes to the digest.
     *
     * Params:
     *   name = path of the file to read
     *
     * Returns: the 16-byte MD5 digest of the bytes that were read
     *
     * Throws: ErrnoException if the file cannot be opened or read
     */
    ubyte[16] compute_md5(in string name) {
        ubyte[16] digest;

        // Binary mode: the contents are hashed as raw bytes.
        auto file = File(name, "rb");
        scope(exit) file.close();

        MD5_CTX context;
        context.start();

        ulong amount = 0;
        foreach (ubyte[] buffer; chunks(file, 1024)) {
            context.update(buffer);
            amount += buffer.length;
            if (amount >= MD5_AMOUNT) {
                break;
            }
        }

        context.finish(digest);

        return digest;
    }

    private {
        // Explicitly static so the constants are shared compile-time values
        // rather than per-instance fields.
        static immutable ulong KILO = 1 << 10;
        static immutable ulong MEGA = 1 << 20;

        /// Files smaller than this many bytes are not considered at all.
        static immutable ulong SIZE_THRESHOLD = 100 * KILO;
        /// compute_md5 stops reading once this many bytes have been hashed.
        static immutable ulong MD5_AMOUNT = 10 * KILO;

        /// Path and size of one file found during the scan.
        struct FileInfo {
            this(in string name_, in ulong size_) {
                name = name_;
                size = size_;
            }

            string name;
            ulong size;
        }

        /// Every file at or above SIZE_THRESHOLD, in the order encountered.
        FileInfo[] _file_array;
    }
}
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
158
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
/**
 * Program entry point.
 *
 * Every command-line argument after the program name is treated as a
 * directory to scan. Constructing a DuplicateFinder performs the scan and
 * prints the results.
 *
 * Returns: always 0
 */
int main(string[] args) {
    // args[0] is the program name; everything after it is a directory.
    string[] directories = args[1 .. $];

    new DuplicateFinder(directories);

    return 0;
}