Mercurial > projects > doodle
annotate doodle/utils/prog/dupes.d @ 119:8343c1dafac6
Make it compile under latest GtkD. Rewrote dupes.d as a single function
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Thu, 21 Apr 2011 18:12:13 +0930 |
parents | 94233d54e16a |
children | c275f26399c6 |
rev | line source |
---|---|
117 | 1 import std.stdio; |
2 import std.string; | |
3 import std.exception; | |
4 import std.file; | |
5 import std.md5; | |
6 | |
119
8343c1dafac6
Make it compile under latest GtkD. Rewrote dupes.d as a single function
David Bryant <bagnose@gmail.com>
parents:
118
diff
changeset
|
/**
 * Scan directory trees for files that are probably duplicates of each other
 * and print each group of suspects to stdout.
 *
 * The search runs in three passes:
 *   1. Walk every directory in `dirs` (depth-first, symlinks skipped) and
 *      record each regular file of at least SIZE_THRESHOLD bytes.
 *   2. Group the recorded files by exact size; only sizes shared by two or
 *      more files are examined further.
 *   3. Within each such size group, hash the leading MD5_AMOUNT bytes of
 *      every file and report the files whose digests collide.
 *
 * NOTE: only a prefix of each file is hashed, so a reported group means
 * "same size and same first MD5_AMOUNT bytes", not a byte-for-byte match.
 *
 * Params:
 *   dirs = root directories to search. An empty list yields no groups.
 *
 * Errors are handled on a best-effort basis: entries that cannot be
 * inspected are reported as "Skipping ...", a directory walk that aborts is
 * reported and the next directory is tried, and files that cannot be opened
 * for hashing are silently left out of the results.
 */
void find_duplicates(in string[] dirs) {
    immutable ulong KILO = 1 << 10;

    // Files smaller than this are not considered at all.
    immutable ulong SIZE_THRESHOLD = 100 * KILO;
    // Number of leading bytes of each candidate fed to MD5.
    immutable ulong MD5_AMOUNT     = 10 * KILO;

    /*
     * Return the MD5 digest of (roughly) the first `max_bytes` bytes of the
     * file `name`. Data is consumed in 1024-byte chunks and reading stops
     * after the chunk that reaches or passes `max_bytes`, so up to one
     * chunk's worth of extra data may be included.
     *
     * The caller in this function catches ErrnoException to cope with files
     * that cannot be opened.
     */
    static ubyte[16] compute_md5(in string name, in ulong max_bytes) {
        ubyte[16] digest;

        // Binary mode: we hash raw bytes, so text-mode translation (which
        // some platforms apply for "r") must not alter what we read.
        auto file = File(name, "rb");
        scope(exit) file.close();

        MD5_CTX context;
        context.start();

        ulong byte_count = 0;
        foreach (ubyte[] buffer; chunks(file, 1024)) {
            context.update(buffer);
            byte_count += buffer.length;
            if (byte_count >= max_bytes) {
                break;
            }
        }

        context.finish(digest);

        return digest;
    }

    // One candidate file: its path and its size in bytes.
    struct FileInfo {
        string name;
        ulong  size;
    }

    FileInfo[] file_array;

    writefln("Accumulating file list");

    // Most recent path seen by the directory walk; used only for the
    // diagnostic printed when the walk itself aborts.
    string last_name;

    foreach (string dir; dirs) {
        // Start from the directory itself so that a walk which fails before
        // yielding anything does not report a stale path from a previous
        // directory.
        last_name = dir;

        try {
            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                last_name = name;
                try {
                    if (!isSymLink(name) && isFile(name)) {
                        ulong size = getSize(name);
                        if (size >= SIZE_THRESHOLD) {
                            file_array ~= FileInfo(name, size);
                        }
                    }
                }
                catch (Exception ex) {
                    // A single unreadable entry must not stop the walk.
                    writefln("Skipping %s", name);
                    // TODO accumulate errors and print after traversal
                }
            }
        }
        catch (FileException ex) {
            // The iteration itself failed; give up on this root only.
            writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
        }
    }

    writefln("Processing %s files", file_array.length);

    // size in bytes -> indices into file_array of the files with that size.
    // Indices are size_t to match the type of the foreach index below.
    size_t[][ulong] size_to_file_indices;
    // Set of sizes that occur more than once.
    bool[ulong]     duplicate_sizes;

    foreach (index, file; file_array) {
        if (size_t[] * indices = (file.size in size_to_file_indices)) {
            if ((*indices).length == 1) {
                // Second time we've seen a file of this size,
                // record it in the duplicate_sizes set.
                duplicate_sizes[file.size] = true;
            }

            (*indices) ~= index;
        }
        else {
            size_to_file_indices[file.size] = [ index ];
        }
    }

    // The value printed is the number of distinct sizes shared by 2+ files.
    writefln("Number of files of duplicate size %s", duplicate_sizes.length);

    foreach (size; duplicate_sizes.keys) {
        size_t[] indices = size_to_file_indices[size];

        // digest -> indices of the same-size files producing that digest.
        size_t[][ubyte[16]] digest_to_indices;

        foreach (index; indices) {
            const FileInfo file_info = file_array[index];

            try {
                ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT);

                // Appending through a missing key creates the entry, so the
                // first and subsequent files with a digest are handled alike.
                digest_to_indices[digest] ~= index;
            }
            catch (ErrnoException ex) {
                // Deliberately quiet: a file we cannot open is simply left
                // out of the comparison.
            }
        }

        foreach (indices2; digest_to_indices) {
            if (indices2.length > 1) {
                // List only the files that share this digest (not every
                // file that merely has the same size).
                foreach (index; indices2) {
                    FileInfo file_info = file_array[index];
                    writefln("%s %s", file_info.size, file_info.name);
                }
                writefln("");
            }
        }
    }

    writefln("Done");
}
139 | |
/**
 * Program entry point: treats every command-line argument after the first
 * element of `args` as a directory to search and hands the list to
 * find_duplicates. Always returns 0.
 */
int main(string[] args) {
    // Drop args[0]; the remaining elements are the directories to scan.
    auto directories = args[1 .. $];

    find_duplicates(directories);

    return 0;
}