Mercurial > projects > doodle
annotate doodle/utils/prog/dupes.d @ 120:c275f26399c6
Tinkerings
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Fri, 22 Apr 2011 00:06:07 +0930 |
parents | 8343c1dafac6 |
children | f1cf62339ed5 |
rev | line source |
---|---|
117 | 1 import std.stdio; |
2 import std.string; | |
3 import std.exception; | |
4 import std.file; | |
5 import std.md5; | |
120 | 6 import std.getopt; |
7 import std.conv; | |
8 import std.c.stdlib; | |
117 | 9 |
/**
 * Walk the given directory trees and print groups of files that look like
 * duplicates of one another.
 *
 * Candidate files are first grouped by size; within each group of equal-sized
 * files an MD5 digest of the leading part of each file is computed, and files
 * whose digests match are printed together as "<size> <name>" lines followed
 * by a blank line.
 *
 * Note that only the beginning of each file is hashed (see digest_size), so a
 * reported group means "same size and same leading bytes", not a byte-for-byte
 * comparison of the whole file.
 *
 * Params:
 *   dirs        = directories to traverse (depth-first, symlinks to
 *                 directories are not followed).
 *   file_size   = minimum size in bytes for a file to be considered.
 *   digest_size = approximate number of leading bytes to hash per file; the
 *                 file is read in 1024-byte chunks and hashing stops after the
 *                 first chunk that reaches this count.
 *   verbose     = when true, additionally report files that were skipped
 *                 because of an error.
 */
void find_duplicates(in string[] dirs,
                     in ulong    file_size,
                     in ulong    digest_size,
                     bool        verbose) {
    // MD5 of the leading bytes of the named file. Reads 1024-byte chunks and
    // stops once at least max_bytes have been fed to the digest (so at least
    // one chunk is always hashed for a non-empty file).
    static ubyte[16] compute_md5(in string name, in ulong max_bytes) {
        ubyte[16] digest;

        // Binary mode: the digest must be computed over the raw bytes, with
        // no platform-specific text translation.
        auto file = File(name, "rb");
        scope(exit) file.close;

        MD5_CTX context;
        context.start();
        ulong byte_count = 0;
        foreach (ubyte[] buffer; chunks(file, 1024)) {
            context.update(buffer);
            byte_count += buffer.length;
            if (byte_count >= max_bytes) {
                break;
            }
        }

        context.finish(digest);

        return digest;
    }

    // A candidate file and its size in bytes.
    struct FileInfo {
        string name;
        ulong  size;
    }

    FileInfo[] file_array;

    writefln("Accumulating file list");

    // Most recent path seen during traversal, used only for diagnostics.
    string last_name;

    foreach (string dir; dirs) {
        // Seed with the directory itself so the diagnostic below is
        // meaningful even if traversal fails before yielding any entry.
        last_name = dir;

        try {
            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                last_name = name;
                try {
                    if (!isSymLink(name) && isFile(name)) {
                        ulong size = getSize(name);
                        if (size >= file_size) {
                            file_array ~= FileInfo(name, size);
                        }
                    }
                }
                catch (Exception ex) {
                    // Best effort: an unreadable entry must not abort the scan.
                    writefln("Skipping %s", name);
                    if (verbose) {
                        writefln("Exception %s", ex.msg);
                    }
                    // TODO accumulate errors and print after traversal
                }
            }
        }
        catch (FileException ex) {
            // Traversal of this tree failed part-way; keep what was gathered.
            writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
        }
    }

    writefln("Processing %s files", file_array.length);

    // size -> indices into file_array of all files with that size.
    // size_t matches the index type produced by foreach over an array, so
    // this compiles on both 32-bit and 64-bit targets.
    size_t[][ulong] size_to_file_indices;
    // Set of sizes shared by at least two files.
    bool[ulong]     duplicate_sizes;

    foreach (index, file; file_array) {
        if (size_t[] * indices = (file.size in size_to_file_indices)) {
            if ((*indices).length == 1) {
                // Second time we've seen a file of this size,
                // record it in the duplicate_sizes set
                duplicate_sizes[file.size] = true;
            }

            (*indices) ~= index;
        }
        else {
            size_to_file_indices[file.size] = [ index ];
        }
    }

    writefln("Number of files of duplicate size %s", duplicate_sizes.length);

    foreach (size; duplicate_sizes.keys) {
        size_t[] indices = size_to_file_indices[size];

        // digest -> indices of the equal-sized files producing that digest.
        size_t[][ubyte[16]] digest_to_indices;

        foreach (index; indices) {
            const FileInfo file_info = file_array[index];

            try {
                ubyte[16] digest = compute_md5(file_info.name, digest_size);
                digest_to_indices[digest] ~= index;
            }
            catch (ErrnoException ex) {
                // File could not be opened/read; leave it out of the groups.
                if (verbose) {
                    writefln("Skipping: %s", file_info.name);
                }
            }
        }

        foreach (group; digest_to_indices) {
            if (group.length > 1) {
                // List only the files that share this digest (not every
                // file that merely shares the size).
                foreach (index; group) {
                    FileInfo file_info = file_array[index];
                    writefln("%s %s", file_info.size, file_info.name);
                }
                writefln("");
            }
        }
    }

    writefln("Done");
}
139 | |
/**
 * Command-line entry point.
 *
 * Options (parsed with std.getopt):
 *   --file-size,   -f  minimum file size in bytes to consider (default 100 KiB)
 *   --digest-size, -d  number of leading bytes to hash per file (default 10 KiB)
 *   --verbose,     -v  flag passed through to find_duplicates
 *   --help,        -h  print help and exit with status 1
 *
 * Every remaining argument is treated as a directory to scan.
 *
 * Returns: 0 on success, 1 if an option value could not be converted.
 */
int main(string[] args) {
    immutable ulong KILO = 1 << 10;
    immutable ulong MEGA = 1 << 20;     // currently unused
    immutable ulong GIGA = 1 << 30;     // currently unused

    // TODO: add a parse_size_string helper so that sizes can be given in a
    // friendlier form than a raw byte count (presumably with K/M/G suffixes,
    // given the constants above -- confirm intent).

    // getopt callback for --help; terminates the process.
    void help(in string) {
        writefln("Help");
        exit(1);
    }

    ulong file_size   = 100 * KILO;
    ulong digest_size = 10 * KILO;
    bool  verbose     = false;

    try {
        getopt(args,
               "file-size|f",   &file_size,
               "digest-size|d", &digest_size,
               "verbose|v",     &verbose,
               "help|h",        &help);
    }
    catch (ConvException ex) {
        // A malformed numeric option must not be silently ignored: running
        // with defaults the user did not ask for would be misleading.
        stderr.writefln("Invalid option value: %s", ex.msg);
        return 1;
    }

    // getopt removes the options it recognised; args[0] is the program name.
    find_duplicates(args[1..$], file_size, digest_size, verbose);

    return 0;
}
176 } |