Mercurial > projects > doodle
comparison doodle/main/dupes.d @ 139:e33f37b14893 default tip
Port to 'no-more-make' https://github.com/GrahamStJack/no-more-make
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Sun, 30 Sep 2012 15:41:25 +0930 |
parents | doodle/main/util/dupes.d@89e8b0d92f36 |
children |
comparison
equal
deleted
inserted
replaced
138:a1c2b56cb44d | 139:e33f37b14893 |
---|---|
1 import std.stdio; | |
2 import std.string; | |
3 import std.exception; | |
4 import std.algorithm; | |
5 import std.file; | |
6 import std.md5; | |
7 import std.getopt; | |
8 import std.conv; | |
9 import std.ascii; | |
10 import std.c.stdlib; | |
11 | |
ulong string_to_size(string s) {
    // Convert a human-readable size string to a byte count, eg:
    //   "50"  -> 50
    //   "80B" -> 80
    //   "10K" -> 10240
    //   "1M"  -> 1048576
    // The unit suffix (B, K, M, G, T) is accepted in either case,
    // so "10k" == "10K".
    // Throws: ConvException on an empty string, an unknown unit
    // character, or a malformed number.

    immutable map = [ 'B':1UL, 'K':1UL<<10, 'M':1UL<<20, 'G':1UL<<30, 'T':1UL<<40 ];

    if (s.length == 0) {
        throw new ConvException("Empty string");
    }
    else {
        ulong multiplier = 1;

        if (isAlpha(s[$-1])) {
            // Normalise the suffix so lower-case units are accepted too.
            immutable ulong * m = (cast(char)toUpper(s[$-1]) in map);

            if (m) {
                multiplier = *m;
            }
            else {
                throw new ConvException(format("Bad size unit character: %s", s[$-1]));
            }

            s = s[0..$-1];
        }

        return multiplier * to!ulong(s);
    }
}
44 | |
string size_to_string(in ulong size) {
    // Render a byte count as a whole number of kibibytes, eg 2048 -> "2K".
    // Truncates, so anything under 1024 bytes renders as "0K".
    immutable ulong kib = size / 1024;
    return to!string(kib) ~ "K";
}
59 | |
void find_duplicates(in string[] dirs,
                     in ulong file_size,
                     in ulong digest_size,
                     bool verbose) {
    // Recursively scan 'dirs' and report groups of duplicate files on
    // stdout.  A file is considered only if it is a regular (non-symlink)
    // file of at least 'file_size' bytes; candidates are grouped first by
    // exact size, then by the MD5 digest of their first 'digest_size'
    // bytes.  Finishes by printing the total wasted space.
    // NOTE(review): 'verbose' is currently unused in this function —
    // confirm whether extra progress output was intended.

    // MD5 of the first 'max_bytes' of 'filename', read in bounded chunks
    // so large files are never loaded whole.
    static ubyte[16] compute_md5(in string filename, in ulong max_bytes) {
        size_t chunk_size = min(max_bytes, 4096 * 1024);
        ubyte[16] digest;

        auto file = File(filename, "r");
        scope(exit) file.close();

        MD5_CTX context;
        context.start();
        ulong byte_count = 0;
        foreach (ubyte[] buffer; chunks(file, chunk_size)) {
            context.update(buffer);
            byte_count += buffer.length;
            if (byte_count >= max_bytes) {
                break;
            }
        }
        context.finish(digest);

        return digest;
    }

    struct FileInfo {
        string name;    // path as yielded by dirEntries
        ulong  size;    // size in bytes at scan time
    }

    FileInfo[] file_array;

    writefln("Accumulating file list");

    foreach (string dir; dirs) {
        // Guard with exists(): isDir() throws on a non-existent path,
        // which previously aborted the whole scan.
        if (exists(dir) && isDir(dir)) {
            string last_entry;
            try {
                foreach (string filename; dirEntries(dir, SpanMode.depth, false)) {
                    last_entry = filename;
                    try {
                        if (!isSymlink(filename) && isFile(filename)) {
                            ulong size = getSize(filename);
                            if (size >= file_size) {
                                file_array ~= FileInfo(filename, size);
                            }
                        }
                    }
                    catch (Exception ex) {
                        // Unreadable entry (permissions etc): skip it and
                        // keep traversing.
                        writefln("Skipping %s", filename);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
            catch (FileException ex) {
                // dirEntries can abort mid-traversal; keep what we found.
                writefln("Error, dirEntries bailed out after: %s. Continuing anyway", last_entry);
            }
        }
        else {
            writefln("Not a dir: %s", dir);
        }
    }

    writefln("Processing %s files", file_array.length);

    // Map file size -> indices into file_array.  Sizes seen more than
    // once are recorded in duplicate_sizes (an AA used as a set).
    uint[][ulong] size_to_file_indices;
    bool[ulong] duplicate_sizes;

    foreach (uint index, file; file_array) {
        if (uint[] * indices = (file.size in size_to_file_indices)) {
            if (indices.length == 1) {
                // Second file of this size: it is now a duplicate candidate.
                duplicate_sizes[file.size] = true;
            }

            (*indices) ~= index;
        }
        else {
            size_to_file_indices[file.size] = [ index ];
        }
    }

    // NOTE: this counts distinct sizes with >1 file, not files.
    writefln("Number of files of duplicate size %s", duplicate_sizes.length);

    ulong total_waste = 0;

    // Largest sizes first, so the worst offenders are printed first.
    foreach_reverse (size; duplicate_sizes.keys.sort) {
        uint[] indices = size_to_file_indices[size];

        // Group the same-size files by digest of their leading bytes.
        uint[][ubyte[16]] digest_to_indices;

        foreach (index; indices) {
            const FileInfo file_info = file_array[index];

            try {
                ubyte[16] digest = compute_md5(file_info.name, digest_size);

                if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
                    // A true duplicate of an earlier same-size file.
                    (*duplicate_indices) ~= index;
                }
                else {
                    digest_to_indices[digest] ~= index;
                }
            }
            catch (ErrnoException ex) {
                // Unreadable file: leave it out of its group.
            }
        }

        foreach (group; digest_to_indices) {
            if (group.length > 1) {
                // BUG FIX: list the digest-matched group, not 'indices'
                // (every file of this size), which over-reported duplicates.
                foreach (i, index; group) {
                    const FileInfo file_info = file_array[index];
                    if (i == 0) {
                        writefln("%s", size_to_string(file_info.size));
                    }
                    writefln(" %s", file_info.name);
                }
                // BUG FIX: every copy beyond the first is waste, so count
                // size * (n - 1) rather than a single copy per group.
                total_waste += size * (group.length - 1);
                writefln("");
            }
        }
    }

    writefln("Done, total waste: %s", size_to_string(total_waste));
}
199 | |
int main(string[] args) {
    // Entry point: parse options, resolve the human-readable size strings
    // to byte counts, then run the duplicate scan over the remaining
    // (directory) arguments.
    ulong min_size_bytes;
    ulong digest_bytes;
    bool verbose;

    try {
        string min_size_text = "100K";
        string digest_text = "100K";

        // Handler wired to --help below: print usage (showing the current
        // defaults) and terminate with status 1.
        void print_usage(in string) {
            writefln("Usage: dupes [OPTION]... DIR...\n"
                     "Recursively locate duplicate files in a list of directories\n"
                     "\n"
                     "Options\n"
                     " -d, --digest-size=SIZE size of digest used for comparison [%s]\n"
                     " -f, --file-size=SIZE minimum size of files searched for duplication [%s]\n"
                     " -v, --verbose be verbose\n"
                     " --help display this help and exit\n"
                     "\n"
                     "SIZE is an integer, optionally followed by K, M, G, T",
                     min_size_text,
                     digest_text);
            exit(1);
        }

        getopt(args,
               "file-size|f", &min_size_text,
               "digest-size|d", &digest_text,
               "verbose|v", &verbose,
               "help", &print_usage);

        // May throw ConvException on malformed SIZE arguments.
        min_size_bytes = string_to_size(min_size_text);
        digest_bytes = string_to_size(digest_text);
    }
    catch (ConvException ex) {
        writefln("Conversion error: %s", ex);
        exit(2);
    }

    if (verbose) {
        writefln("file-size=%s, digest-size=%s", size_to_string(min_size_bytes), size_to_string(digest_bytes));
    }

    find_duplicates(args[1..$], min_size_bytes, digest_bytes, verbose);

    return 0;
}