Mercurial > projects > doodle
comparison doodle/utils/prog/dupes.d @ 121:f1cf62339ed5
More tweaking
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Sun, 24 Apr 2011 16:42:26 +0930 |
parents | c275f26399c6 |
children | 0d427170a805 |
comparison
equal
deleted
inserted
replaced
120:c275f26399c6 | 121:f1cf62339ed5 |
---|---|
1 import std.stdio; | 1 import std.stdio; |
2 import std.string; | 2 import std.string; |
3 import std.exception; | 3 import std.exception; |
4 import std.algorithm; | |
4 import std.file; | 5 import std.file; |
5 import std.md5; | 6 import std.md5; |
6 import std.getopt; | 7 import std.getopt; |
7 import std.conv; | 8 import std.conv; |
9 import std.ctype; | |
8 import std.c.stdlib; | 10 import std.c.stdlib; |
11 | |
12 ulong string_to_size(string s) { | |
13 // Convert strings to sizes, eg: | |
14 // "50" -> 50 | |
15 // "80B" -> 80 | |
16 // "10K" -> 10240 | |
17 // "1M" -> 1048576 | |
18 // Throws ConvException | |
19 | |
20 immutable map = [ 'B':1UL, 'K':1UL<<10, 'M':1UL<<20, 'G':1UL<<30, 'T':1UL<<40 ]; | |
21 | |
22 if (s.length == 0) { | |
23 throw new ConvException("Empty string"); | |
24 } | |
25 else { | |
26 ulong multiplier = 1; | |
27 | |
28 if (isalpha(s[$-1])) { | |
29 immutable ulong * m = (s[$-1] in map); | |
30 | |
31 if (m) { | |
32 multiplier = *m; | |
33 } | |
34 else { | |
35 throw new ConvException(format("Bad size unit character: %s", s[$-1])); | |
36 } | |
37 | |
38 s = s[0..$-1]; | |
39 } | |
40 | |
41 return multiplier * to!ulong(s); | |
42 } | |
43 } | |
44 | |
45 string size_to_string(in ulong size) { | |
46 /+ | |
47 immutable array = [ 'B', 'K', 'M', 'G', 'T' ]; | |
48 size_t index = 0; | |
49 | |
50 foreach (i, c; array) { | |
51 if (size / (1UL << i | |
52 | |
53 writefln("%s %s", i, c); | |
54 } | |
55 +/ | |
56 | |
57 return format("%sK", size / 1024); | |
58 } | |
9 | 59 |
10 void find_duplicates(in string[] dirs, | 60 void find_duplicates(in string[] dirs, |
11 in ulong file_size, | 61 in ulong file_size, |
12 in ulong digest_size, | 62 in ulong digest_size, |
13 bool verbose) { | 63 bool verbose) { |
14 static ubyte[16] compute_md5(in string name, in ulong max_bytes) { | 64 static ubyte[16] compute_md5(in string filename, in ulong max_bytes) { |
65 size_t chunk_size = min(max_bytes, 4096 * 1024); | |
15 ubyte[16] digest; | 66 ubyte[16] digest; |
16 | 67 |
17 auto file = File(name, "r"); | 68 auto file = File(filename, "r"); |
18 scope(exit) file.close; | 69 scope(exit) file.close; |
19 | 70 |
20 MD5_CTX context; | 71 MD5_CTX context; |
21 context.start(); | 72 context.start(); |
22 ulong byte_count = 0; | 73 ulong byte_count = 0; |
23 foreach (ubyte[] buffer; chunks(file, 1024)) { | 74 foreach (ubyte[] buffer; chunks(file, chunk_size)) { |
24 context.update(buffer); | 75 context.update(buffer); |
25 byte_count += buffer.length; | 76 byte_count += buffer.length; |
26 if (byte_count >= max_bytes) { | 77 if (byte_count >= max_bytes) { |
27 break; | 78 break; |
28 } | 79 } |
29 } | 80 } |
30 | |
31 context.finish(digest); | 81 context.finish(digest); |
32 | 82 |
33 return digest; | 83 return digest; |
34 } | 84 } |
35 | 85 |
40 | 90 |
41 FileInfo[] file_array; | 91 FileInfo[] file_array; |
42 | 92 |
43 writefln("Accumulating file list"); | 93 writefln("Accumulating file list"); |
44 | 94 |
45 string last_name; | |
46 | |
47 foreach (string dir; dirs) { | 95 foreach (string dir; dirs) { |
48 try { | 96 if (isDir(dir)) { |
49 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { | 97 string last_entry; |
50 last_name = name; | 98 try { |
51 try { | 99 foreach (string filename; dirEntries(dir, SpanMode.depth, false)) { |
52 if (!isSymLink(name) && isFile(name)) { | 100 last_entry = filename; |
53 ulong size = getSize(name); | 101 try { |
54 if (size >= file_size) { | 102 if (!isSymLink(filename) && isFile(filename)) { |
55 file_array ~= FileInfo(name, size); | 103 ulong size = getSize(filename); |
104 if (size >= file_size) { | |
105 file_array ~= FileInfo(filename, size); | |
106 } | |
56 } | 107 } |
57 } | 108 } |
58 } | 109 catch (Exception ex) { |
59 catch (Exception ex) { | 110 writefln("Skipping %s", filename); |
60 writefln("Skipping %s", name); | 111 //writefln("Exception %s", ex); |
61 //writefln("Exception %s", ex); | 112 // TODO accumulate errors and print after traversal |
62 // TODO accumulate errors and print after traversal | 113 } |
63 } | 114 } |
64 } | 115 } |
65 } | 116 catch (FileException ex) { |
66 catch (FileException ex) { | 117 // ignore |
67 // ignore | 118 writefln("Error, dirEntries bailed out after: %s. Continuing anyway", last_entry); |
68 writefln("dirEntries bailed out (%s). Continuing anyway", last_name); | 119 } |
120 } | |
121 else { | |
122 writefln("Not a dir: %s", dir); | |
69 } | 123 } |
70 } | 124 } |
71 | 125 |
72 writefln("Processing %s files", file_array.length); | 126 writefln("Processing %s files", file_array.length); |
73 | 127 |
91 } | 145 } |
92 } | 146 } |
93 | 147 |
94 writefln("Number of files of duplicate size %s", duplicate_sizes.length); | 148 writefln("Number of files of duplicate size %s", duplicate_sizes.length); |
95 | 149 |
96 foreach (size; duplicate_sizes.keys) { | 150 ulong total_waste = 0; |
151 | |
152 foreach_reverse (size; duplicate_sizes.keys.sort) { | |
97 uint[] indices = size_to_file_indices[size]; | 153 uint[] indices = size_to_file_indices[size]; |
98 //writefln("For size %s there are %s files", size, indices.length); | 154 //writefln("For size %s there are %s files", size, indices.length); |
99 | 155 |
100 uint[][ubyte[16]] digest_to_indices; | 156 uint[][ubyte[16]] digest_to_indices; |
101 | 157 |
123 } | 179 } |
124 | 180 |
125 foreach (indices2; digest_to_indices) { | 181 foreach (indices2; digest_to_indices) { |
126 if (indices2.length > 1) { | 182 if (indices2.length > 1) { |
127 // List the duplicates | 183 // List the duplicates |
128 foreach (index; indices) { | 184 foreach (i, index; indices) { |
129 FileInfo file_info = file_array[index]; | 185 FileInfo file_info = file_array[index]; |
130 writefln("%s %s", file_info.size, file_info.name); | 186 if (i == 0) { |
187 writefln("%s", size_to_string(file_info.size)); | |
188 total_waste += file_info.size; | |
189 } | |
190 writefln(" %s", file_info.name); | |
131 } | 191 } |
132 writefln(""); | 192 writefln(""); |
133 } | 193 } |
134 } | 194 } |
135 } | 195 } |
136 | 196 |
137 writefln("Done"); | 197 writefln("Done, total waste: %s", size_to_string(total_waste)); |
138 } | 198 } |
139 | 199 |
140 int main(string[] args) { | 200 int main(string[] args) { |
141 immutable ulong KILO = 1 << 10; | 201 ulong file_size; |
142 immutable ulong MEGA = 1 << 20; | 202 ulong digest_size; |
143 immutable ulong GIGA = 1 << 30; | 203 bool verbose; |
144 | |
145 /* | |
146 static ulong parse_size_string(in string[] s) { | |
147 if (s.length == 0) { | |
148 throw new ConvException | |
149 } | |
150 } | |
151 */ | |
152 | |
153 void help(in string) { | |
154 writefln("Help"); | |
155 exit(1); | |
156 } | |
157 | |
158 ulong file_size = 100 * KILO; | |
159 ulong digest_size = 10 * KILO; | |
160 bool verbose = false; | |
161 | 204 |
162 try { | 205 try { |
163 getopt(args, | 206 void help(in string) { |
164 "file-size|f", &file_size, | 207 writefln("Usage: dupes [OPTION]... DIR...\n" |
165 "digest-size|d", &digest_size, | 208 "Recursively locate duplicate files in a list of directories\n" |
166 "verbose|v", &verbose, | 209 "\n" |
167 "help|h", &help); | 210 "Options\n" |
211 " -d, --digest-size=SIZE size of digest used for comparison\n" | |
212 " -f, --file-size=SIZE minimum size of files searched for duplication\n" | |
213 " -v, --verbose be verbose\n" | |
214 " --help display this help and exit\n" | |
215 "\n" | |
216 "SIZE is an integer, optionally followed by K, M, G, T"); | |
217 exit(1); | |
218 } | |
219 | |
220 string file_size_string = "100K"; | |
221 string digest_size_string = "100K"; | |
222 | |
223 getopt(args, | |
224 "file-size|f", &file_size_string, | |
225 "digest-size|d", &digest_size_string, | |
226 "verbose|v", &verbose, | |
227 "help", &help); | |
228 | |
229 file_size = string_to_size(file_size_string); | |
230 digest_size = string_to_size(digest_size_string); | |
168 } | 231 } |
169 catch (ConvException ex) { | 232 catch (ConvException ex) { |
170 | 233 writefln("Conversion error: %s", ex); |
234 exit(2); | |
235 } | |
236 | |
237 if (verbose) { | |
238 writefln("file-size=%s, digest-size=%s", size_to_string(file_size), size_to_string(digest_size)); | |
171 } | 239 } |
172 | 240 |
173 find_duplicates(args[1..$], file_size, digest_size, verbose); | 241 find_duplicates(args[1..$], file_size, digest_size, verbose); |
174 | 242 |
175 return 0; | 243 return 0; |