comparison doodle/main/dupes.d @ 139:e33f37b14893 default tip

Port to 'no-more-make' https://github.com/GrahamStJack/no-more-make
author David Bryant <bagnose@gmail.com>
date Sun, 30 Sep 2012 15:41:25 +0930
parents doodle/main/util/dupes.d@89e8b0d92f36
children
comparison
equal deleted inserted replaced
138:a1c2b56cb44d 139:e33f37b14893
1 import std.stdio;
2 import std.string;
3 import std.exception;
4 import std.algorithm;
5 import std.file;
6 import std.md5;
7 import std.getopt;
8 import std.conv;
9 import std.ascii;
10 import std.c.stdlib;
11
12 ulong string_to_size(string s) {
13 // Convert strings to sizes, eg:
14 // "50" -> 50
15 // "80B" -> 80
16 // "10K" -> 10240
17 // "1M" -> 1048576
18 // Throws ConvException
19
20 immutable map = [ 'B':1UL, 'K':1UL<<10, 'M':1UL<<20, 'G':1UL<<30, 'T':1UL<<40 ];
21
22 if (s.length == 0) {
23 throw new ConvException("Empty string");
24 }
25 else {
26 ulong multiplier = 1;
27
28 if (isAlpha(s[$-1])) {
29 immutable ulong * m = (s[$-1] in map);
30
31 if (m) {
32 multiplier = *m;
33 }
34 else {
35 throw new ConvException(format("Bad size unit character: %s", s[$-1]));
36 }
37
38 s = s[0..$-1];
39 }
40
41 return multiplier * to!ulong(s);
42 }
43 }
44
45 string size_to_string(in ulong size) {
46 /+
47 immutable array = [ 'B', 'K', 'M', 'G', 'T' ];
48 size_t index = 0;
49
50 foreach (i, c; array) {
51 if (size / (1UL << i
52
53 writefln("%s %s", i, c);
54 }
55 +/
56
57 return format("%sK", size / 1024);
58 }
59
60 void find_duplicates(in string[] dirs,
61 in ulong file_size,
62 in ulong digest_size,
63 bool verbose) {
64 static ubyte[16] compute_md5(in string filename, in ulong max_bytes) {
65 size_t chunk_size = min(max_bytes, 4096 * 1024);
66 ubyte[16] digest;
67
68 auto file = File(filename, "r");
69 scope(exit) file.close();
70
71 MD5_CTX context;
72 context.start();
73 ulong byte_count = 0;
74 foreach (ubyte[] buffer; chunks(file, chunk_size)) {
75 context.update(buffer);
76 byte_count += buffer.length;
77 if (byte_count >= max_bytes) {
78 break;
79 }
80 }
81 context.finish(digest);
82
83 return digest;
84 }
85
86 struct FileInfo {
87 string name;
88 ulong size;
89 }
90
91 FileInfo[] file_array;
92
93 writefln("Accumulating file list");
94
95 foreach (string dir; dirs) {
96 if (isDir(dir)) {
97 string last_entry;
98 try {
99 foreach (string filename; dirEntries(dir, SpanMode.depth, false)) {
100 last_entry = filename;
101 try {
102 if (!isSymlink(filename) && isFile(filename)) {
103 ulong size = getSize(filename);
104 if (size >= file_size) {
105 file_array ~= FileInfo(filename, size);
106 }
107 }
108 }
109 catch (Exception ex) {
110 writefln("Skipping %s", filename);
111 //writefln("Exception %s", ex);
112 // TODO accumulate errors and print after traversal
113 }
114 }
115 }
116 catch (FileException ex) {
117 // ignore
118 writefln("Error, dirEntries bailed out after: %s. Continuing anyway", last_entry);
119 }
120 }
121 else {
122 writefln("Not a dir: %s", dir);
123 }
124 }
125
126 writefln("Processing %s files", file_array.length);
127
128 uint[][ulong] size_to_file_indices;
129 bool[ulong] duplicate_sizes;
130
131 foreach (uint index, file; file_array) {
132 //writefln("%s %s %s", index, file.name, file.size);
133
134 if (uint[] * indices = (file.size in size_to_file_indices)) {
135 if (indices.length == 1) {
136 // Second time we've seen a file of this size,
137 // record it in the duplicate_sizes array
138 duplicate_sizes[file.size] = true;
139 }
140
141 (*indices) ~= index;
142 }
143 else {
144 size_to_file_indices[file.size] = [ index ];
145 }
146 }
147
148 writefln("Number of files of duplicate size %s", duplicate_sizes.length);
149
150 ulong total_waste = 0;
151
152 foreach_reverse (size; duplicate_sizes.keys.sort) {
153 uint[] indices = size_to_file_indices[size];
154 //writefln("For size %s there are %s files", size, indices.length);
155
156 uint[][ubyte[16]] digest_to_indices;
157
158 foreach (index; indices) {
159 const FileInfo file_info = file_array[index];
160
161 try {
162 ubyte[16] digest = compute_md5(file_info.name, digest_size);
163
164 if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
165 // A true duplicate
166 // index and index2 are the same
167
168 (*duplicate_indices) ~= index;
169 }
170 else {
171 digest_to_indices[digest] ~= index;
172 }
173 }
174 catch (ErrnoException ex) {
175 //writefln("Skipping: %s", file_info.name);
176 }
177
178 //writefln("\t%s", file_info.name);
179 }
180
181 foreach (indices2; digest_to_indices) {
182 if (indices2.length > 1) {
183 // List the duplicates
184 foreach (i, index; indices) {
185 FileInfo file_info = file_array[index];
186 if (i == 0) {
187 writefln("%s", size_to_string(file_info.size));
188 total_waste += file_info.size;
189 }
190 writefln(" %s", file_info.name);
191 }
192 writefln("");
193 }
194 }
195 }
196
197 writefln("Done, total waste: %s", size_to_string(total_waste));
198 }
199
200 int main(string[] args) {
201 ulong file_size;
202 ulong digest_size;
203 bool verbose;
204
205 try {
206 string file_size_string = "100K";
207 string digest_size_string = "100K";
208
209 void help(in string) {
210 writefln("Usage: dupes [OPTION]... DIR...\n"
211 "Recursively locate duplicate files in a list of directories\n"
212 "\n"
213 "Options\n"
214 " -d, --digest-size=SIZE size of digest used for comparison [%s]\n"
215 " -f, --file-size=SIZE minimum size of files searched for duplication [%s]\n"
216 " -v, --verbose be verbose\n"
217 " --help display this help and exit\n"
218 "\n"
219 "SIZE is an integer, optionally followed by K, M, G, T",
220 file_size_string,
221 digest_size_string);
222 exit(1);
223 }
224
225 getopt(args,
226 "file-size|f", &file_size_string,
227 "digest-size|d", &digest_size_string,
228 "verbose|v", &verbose,
229 "help", &help);
230
231 file_size = string_to_size(file_size_string);
232 digest_size = string_to_size(digest_size_string);
233 }
234 catch (ConvException ex) {
235 writefln("Conversion error: %s", ex);
236 exit(2);
237 }
238
239 if (verbose) {
240 writefln("file-size=%s, digest-size=%s", size_to_string(file_size), size_to_string(digest_size));
241 }
242
243 find_duplicates(args[1..$], file_size, digest_size, verbose);
244
245 return 0;
246 }