Mercurial > projects > doodle
annotate doodle/utils/prog/dupes.d @ 123:0d427170a805
Move to 64-bit
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Wed, 04 May 2011 22:19:44 +0930 |
parents | f1cf62339ed5 |
children | 89016abde9fe |
rev | line source |
---|---|
117 | 1 import std.stdio; |
2 import std.string; | |
3 import std.exception; | |
121 | 4 import std.algorithm; |
117 | 5 import std.file; |
6 import std.md5; | |
120 | 7 import std.getopt; |
8 import std.conv; | |
121 | 9 import std.ctype; |
120 | 10 import std.c.stdlib; |
117 | 11 |
ulong string_to_size(string s) {
    // Convert a human-readable size string to a byte count, eg:
    //   "50"  -> 50
    //   "80B" -> 80
    //   "10K" -> 10240
    //   "1M"  -> 1048576
    //
    // The optional trailing unit must be one of B, K, M, G, T (upper case).
    //
    // Throws ConvException if the string is empty, the unit character is
    // not recognised, the numeric part cannot be parsed by to!ulong, or
    // the scaled result does not fit in a ulong.

    immutable map = [ 'B':1UL, 'K':1UL<<10, 'M':1UL<<20, 'G':1UL<<30, 'T':1UL<<40 ];

    if (s.length == 0) {
        throw new ConvException("Empty string");
    }

    ulong multiplier = 1;

    if (isalpha(s[$-1])) {
        immutable ulong * m = (s[$-1] in map);

        if (!m) {
            throw new ConvException(format("Bad size unit character: %s", s[$-1]));
        }

        multiplier = *m;

        // Drop the unit so only the numeric part remains.
        s = s[0..$-1];
    }

    immutable ulong count = to!ulong(s);

    // Unsigned multiplication wraps silently, so reject values that would
    // overflow instead of returning a bogus small size.
    if (count > ulong.max / multiplier) {
        throw new ConvException(format("Size out of range: %s x %s", count, multiplier));
    }

    return multiplier * count;
}
44 | |
string size_to_string(in ulong size) {
    // Render a byte count for display.
    //
    // Sizes below 1024 bytes are shown exactly with a "B" suffix (so they
    // no longer all collapse to "0K"); anything larger is shown as a whole
    // number of kibibytes, rounded down, with a "K" suffix.

    if (size < 1024) {
        return format("%sB", size);
    }

    return format("%sK", size / 1024);
}
59 | |
void find_duplicates(in string[] dirs,
                     in ulong file_size,
                     in ulong digest_size,
                     bool verbose) {
    // Walk each directory in dirs, collect regular (non-symlink) files of
    // at least file_size bytes, group them by size, and within each size
    // group report the files whose MD5 digest over the first digest_size
    // bytes is identical.
    //
    // Note that files are only compared by size plus a digest of their
    // leading bytes, so a reported group is a likely duplicate rather than
    // a byte-for-byte verified one.
    //
    // When verbose is set, files that cannot be read while hashing are
    // reported instead of being skipped silently.

    // MD5 of (roughly) the first max_bytes bytes of filename. Data is read
    // in chunks of at most 4MiB, so for larger max_bytes the digest may
    // cover up to one chunk beyond the limit.
    static ubyte[16] compute_md5(in string filename, in ulong max_bytes) {
        ubyte[16] digest;

        // Binary mode: the digest must be over the raw bytes.
        auto file = File(filename, "rb");
        scope(exit) file.close;

        MD5_CTX context;
        context.start();

        // A zero limit would otherwise ask chunks() for zero-sized chunks;
        // in that case hash no data at all.
        if (max_bytes > 0) {
            size_t chunk_size = min(max_bytes, 4096 * 1024);
            ulong byte_count = 0;

            foreach (ubyte[] buffer; chunks(file, chunk_size)) {
                context.update(buffer);
                byte_count += buffer.length;
                if (byte_count >= max_bytes) {
                    break;
                }
            }
        }

        context.finish(digest);

        return digest;
    }

    struct FileInfo {
        string name;
        ulong  size;
    }

    FileInfo[] file_array;

    writefln("Accumulating file list");

    foreach (string dir; dirs) {
        // NOTE(review): isDir is expected to throw FileException for a path
        // that does not exist; treat that the same as "not a directory"
        // rather than aborting the whole run.
        bool is_dir = false;
        try {
            is_dir = isDir(dir);
        }
        catch (FileException ex) {
            is_dir = false;
        }

        if (is_dir) {
            string last_entry;
            try {
                foreach (string filename; dirEntries(dir, SpanMode.depth, false)) {
                    last_entry = filename;
                    try {
                        if (!isSymLink(filename) && isFile(filename)) {
                            ulong size = getSize(filename);
                            if (size >= file_size) {
                                file_array ~= FileInfo(filename, size);
                            }
                        }
                    }
                    catch (Exception ex) {
                        writefln("Skipping %s", filename);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
            catch (FileException ex) {
                // The traversal itself failed; keep what was gathered so far.
                writefln("Error, dirEntries bailed out after: %s. Continuing anyway", last_entry);
            }
        }
        else {
            writefln("Not a dir: %s", dir);
        }
    }

    writefln("Processing %s files", file_array.length);

    // size -> indices into file_array of every file with that size
    uint[][ulong] size_to_file_indices;
    // set of sizes shared by two or more files
    bool[ulong] duplicate_sizes;

    foreach (uint index, file; file_array) {
        if (uint[] * indices = (file.size in size_to_file_indices)) {
            if (indices.length == 1) {
                // Second time we've seen a file of this size,
                // record it in the duplicate_sizes array
                duplicate_sizes[file.size] = true;
            }

            (*indices) ~= index;
        }
        else {
            size_to_file_indices[file.size] = [ index ];
        }
    }

    // The value printed is the number of distinct sizes shared by more
    // than one file.
    writefln("Number of files of duplicate size %s", duplicate_sizes.length);

    ulong total_waste = 0;

    // Largest sizes first.
    foreach_reverse (size; duplicate_sizes.keys.sort) {
        uint[] indices = size_to_file_indices[size];

        // digest -> indices of the same-sized files sharing that digest
        uint[][ubyte[16]] digest_to_indices;

        foreach (index; indices) {
            const FileInfo file_info = file_array[index];

            try {
                ubyte[16] digest = compute_md5(file_info.name, digest_size);

                // Appending through the index creates the entry on first use.
                digest_to_indices[digest] ~= index;
            }
            catch (ErrnoException ex) {
                if (verbose) {
                    writefln("Skipping: %s", file_info.name);
                }
            }
        }

        foreach (indices2; digest_to_indices) {
            if (indices2.length > 1) {
                // List only the files that share this digest, not every
                // file that merely has the same size.
                writefln("%s", size_to_string(size));

                // Every copy beyond the first is wasted space.
                total_waste += size * (indices2.length - 1);

                foreach (index; indices2) {
                    writefln(" %s", file_array[index].name);
                }
                writefln("");
            }
        }
    }

    writefln("Done, total waste: %s", size_to_string(total_waste));
}
199 | |
int main(string[] args) {
    // Entry point: parse the command line, then search the directories
    // that remain in args for duplicate files.
    //
    // Returns 0 on success and 2 when the options cannot be parsed.
    // --help prints the usage text and exits the process with status 1.

    ulong file_size;
    ulong digest_size;
    bool  verbose;

    try {
        // getopt callback for --help; terminates the process.
        void help(in string) {
            writefln("Usage: dupes [OPTION]... DIR...\n"
                     "Recursively locate duplicate files in a list of directories\n"
                     "\n"
                     "Options\n"
                     " -d, --digest-size=SIZE size of digest used for comparison\n"
                     " -f, --file-size=SIZE minimum size of files searched for duplication\n"
                     " -v, --verbose be verbose\n"
                     " --help display this help and exit\n"
                     "\n"
                     "SIZE is an integer, optionally followed by K, M, G, T");
            exit(1);
        }

        string file_size_string   = "100K";
        string digest_size_string = "100K";

        getopt(args,
               "file-size|f",   &file_size_string,
               "digest-size|d", &digest_size_string,
               "verbose|v",     &verbose,
               "help",          &help);

        file_size   = string_to_size(file_size_string);
        digest_size = string_to_size(digest_size_string);
    }
    catch (ConvException ex) {
        writefln("Conversion error: %s", ex);
        return 2;
    }
    catch (Exception ex) {
        // getopt reports problems such as an unrecognised option with a
        // plain Exception; show the message instead of letting it escape.
        writefln("Error: %s", ex.msg);
        return 2;
    }

    if (verbose) {
        writefln("file-size=%s, digest-size=%s", size_to_string(file_size), size_to_string(digest_size));
    }

    // Everything after the program name is treated as a directory to scan
    // (getopt is documented to remove the options it recognises from args).
    find_duplicates(args[1..$], file_size, digest_size, verbose);

    return 0;
}