comparison doodle/utils/prog/dupes.d @ 121:f1cf62339ed5

More tweaking
author David Bryant <bagnose@gmail.com>
date Sun, 24 Apr 2011 16:42:26 +0930
parents c275f26399c6
children 0d427170a805
comparison
equal deleted inserted replaced
120:c275f26399c6 121:f1cf62339ed5
1 import std.stdio; 1 import std.stdio;
2 import std.string; 2 import std.string;
3 import std.exception; 3 import std.exception;
4 import std.algorithm;
4 import std.file; 5 import std.file;
5 import std.md5; 6 import std.md5;
6 import std.getopt; 7 import std.getopt;
7 import std.conv; 8 import std.conv;
9 import std.ctype;
8 import std.c.stdlib; 10 import std.c.stdlib;
11
12 ulong string_to_size(string s) {
13 // Convert strings to sizes, eg:
14 // "50" -> 50
15 // "80B" -> 80
16 // "10K" -> 10240
17 // "1M" -> 1048576
18 // Throws ConvException
19
20 immutable map = [ 'B':1UL, 'K':1UL<<10, 'M':1UL<<20, 'G':1UL<<30, 'T':1UL<<40 ];
21
22 if (s.length == 0) {
23 throw new ConvException("Empty string");
24 }
25 else {
26 ulong multiplier = 1;
27
28 if (isalpha(s[$-1])) {
29 immutable ulong * m = (s[$-1] in map);
30
31 if (m) {
32 multiplier = *m;
33 }
34 else {
35 throw new ConvException(format("Bad size unit character: %s", s[$-1]));
36 }
37
38 s = s[0..$-1];
39 }
40
41 return multiplier * to!ulong(s);
42 }
43 }
44
45 string size_to_string(in ulong size) {
46 /+
47 immutable array = [ 'B', 'K', 'M', 'G', 'T' ];
48 size_t index = 0;
49
50 foreach (i, c; array) {
51 if (size / (1UL << i
52
53 writefln("%s %s", i, c);
54 }
55 +/
56
57 return format("%sK", size / 1024);
58 }
9 59
10 void find_duplicates(in string[] dirs, 60 void find_duplicates(in string[] dirs,
11 in ulong file_size, 61 in ulong file_size,
12 in ulong digest_size, 62 in ulong digest_size,
13 bool verbose) { 63 bool verbose) {
14 static ubyte[16] compute_md5(in string name, in ulong max_bytes) { 64 static ubyte[16] compute_md5(in string filename, in ulong max_bytes) {
65 size_t chunk_size = min(max_bytes, 4096 * 1024);
15 ubyte[16] digest; 66 ubyte[16] digest;
16 67
17 auto file = File(name, "r"); 68 auto file = File(filename, "r");
18 scope(exit) file.close; 69 scope(exit) file.close;
19 70
20 MD5_CTX context; 71 MD5_CTX context;
21 context.start(); 72 context.start();
22 ulong byte_count = 0; 73 ulong byte_count = 0;
23 foreach (ubyte[] buffer; chunks(file, 1024)) { 74 foreach (ubyte[] buffer; chunks(file, chunk_size)) {
24 context.update(buffer); 75 context.update(buffer);
25 byte_count += buffer.length; 76 byte_count += buffer.length;
26 if (byte_count >= max_bytes) { 77 if (byte_count >= max_bytes) {
27 break; 78 break;
28 } 79 }
29 } 80 }
30
31 context.finish(digest); 81 context.finish(digest);
32 82
33 return digest; 83 return digest;
34 } 84 }
35 85
40 90
41 FileInfo[] file_array; 91 FileInfo[] file_array;
42 92
43 writefln("Accumulating file list"); 93 writefln("Accumulating file list");
44 94
45 string last_name;
46
47 foreach (string dir; dirs) { 95 foreach (string dir; dirs) {
48 try { 96 if (isDir(dir)) {
49 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { 97 string last_entry;
50 last_name = name; 98 try {
51 try { 99 foreach (string filename; dirEntries(dir, SpanMode.depth, false)) {
52 if (!isSymLink(name) && isFile(name)) { 100 last_entry = filename;
53 ulong size = getSize(name); 101 try {
54 if (size >= file_size) { 102 if (!isSymLink(filename) && isFile(filename)) {
55 file_array ~= FileInfo(name, size); 103 ulong size = getSize(filename);
104 if (size >= file_size) {
105 file_array ~= FileInfo(filename, size);
106 }
56 } 107 }
57 } 108 }
58 } 109 catch (Exception ex) {
59 catch (Exception ex) { 110 writefln("Skipping %s", filename);
60 writefln("Skipping %s", name); 111 //writefln("Exception %s", ex);
61 //writefln("Exception %s", ex); 112 // TODO accumulate errors and print after traversal
62 // TODO accumulate errors and print after traversal 113 }
63 } 114 }
64 } 115 }
65 } 116 catch (FileException ex) {
66 catch (FileException ex) { 117 // ignore
67 // ignore 118 writefln("Error, dirEntries bailed out after: %s. Continuing anyway", last_entry);
68 writefln("dirEntries bailed out (%s). Continuing anyway", last_name); 119 }
120 }
121 else {
122 writefln("Not a dir: %s", dir);
69 } 123 }
70 } 124 }
71 125
72 writefln("Processing %s files", file_array.length); 126 writefln("Processing %s files", file_array.length);
73 127
91 } 145 }
92 } 146 }
93 147
94 writefln("Number of files of duplicate size %s", duplicate_sizes.length); 148 writefln("Number of files of duplicate size %s", duplicate_sizes.length);
95 149
96 foreach (size; duplicate_sizes.keys) { 150 ulong total_waste = 0;
151
152 foreach_reverse (size; duplicate_sizes.keys.sort) {
97 uint[] indices = size_to_file_indices[size]; 153 uint[] indices = size_to_file_indices[size];
98 //writefln("For size %s there are %s files", size, indices.length); 154 //writefln("For size %s there are %s files", size, indices.length);
99 155
100 uint[][ubyte[16]] digest_to_indices; 156 uint[][ubyte[16]] digest_to_indices;
101 157
123 } 179 }
124 180
125 foreach (indices2; digest_to_indices) { 181 foreach (indices2; digest_to_indices) {
126 if (indices2.length > 1) { 182 if (indices2.length > 1) {
127 // List the duplicates 183 // List the duplicates
128 foreach (index; indices) { 184 foreach (i, index; indices) {
129 FileInfo file_info = file_array[index]; 185 FileInfo file_info = file_array[index];
130 writefln("%s %s", file_info.size, file_info.name); 186 if (i == 0) {
187 writefln("%s", size_to_string(file_info.size));
188 total_waste += file_info.size;
189 }
190 writefln(" %s", file_info.name);
131 } 191 }
132 writefln(""); 192 writefln("");
133 } 193 }
134 } 194 }
135 } 195 }
136 196
137 writefln("Done"); 197 writefln("Done, total waste: %s", size_to_string(total_waste));
138 } 198 }
139 199
140 int main(string[] args) { 200 int main(string[] args) {
141 immutable ulong KILO = 1 << 10; 201 ulong file_size;
142 immutable ulong MEGA = 1 << 20; 202 ulong digest_size;
143 immutable ulong GIGA = 1 << 30; 203 bool verbose;
144
145 /*
146 static ulong parse_size_string(in string[] s) {
147 if (s.length == 0) {
148 throw new ConvException
149 }
150 }
151 */
152
153 void help(in string) {
154 writefln("Help");
155 exit(1);
156 }
157
158 ulong file_size = 100 * KILO;
159 ulong digest_size = 10 * KILO;
160 bool verbose = false;
161 204
162 try { 205 try {
163 getopt(args, 206 void help(in string) {
164 "file-size|f", &file_size, 207 writefln("Usage: dupes [OPTION]... DIR...\n"
165 "digest-size|d", &digest_size, 208 "Recursively locate duplicate files in a list of directories\n"
166 "verbose|v", &verbose, 209 "\n"
167 "help|h", &help); 210 "Options\n"
211 " -d, --digest-size=SIZE size of digest used for comparison\n"
212 " -f, --file-size=SIZE minimum size of files searched for duplication\n"
213 " -v, --verbose be verbose\n"
214 " --help display this help and exit\n"
215 "\n"
216 "SIZE is an integer, optionally followed by K, M, G, T");
217 exit(1);
218 }
219
220 string file_size_string = "100K";
221 string digest_size_string = "100K";
222
223 getopt(args,
224 "file-size|f", &file_size_string,
225 "digest-size|d", &digest_size_string,
226 "verbose|v", &verbose,
227 "help", &help);
228
229 file_size = string_to_size(file_size_string);
230 digest_size = string_to_size(digest_size_string);
168 } 231 }
169 catch (ConvException ex) { 232 catch (ConvException ex) {
170 233 writefln("Conversion error: %s", ex);
234 exit(2);
235 }
236
237 if (verbose) {
238 writefln("file-size=%s, digest-size=%s", size_to_string(file_size), size_to_string(digest_size));
171 } 239 }
172 240
173 find_duplicates(args[1..$], file_size, digest_size, verbose); 241 find_duplicates(args[1..$], file_size, digest_size, verbose);
174 242
175 return 0; 243 return 0;