annotate doodle/utils/prog/dupes.d @ 118:94233d54e16a

Cleanup of dupes
author David Bryant <bagnose@gmail.com>
date Wed, 20 Apr 2011 22:37:16 +0930
parents c566cdbccaeb
children 8343c1dafac6
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
117
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
1 import std.stdio;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
2 import std.string;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
3 import std.exception;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
4 import std.file;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
5 import std.md5;
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
6
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
/**
 * Finds probable duplicate files beneath a set of directories.
 *
 * All work is done by the constructor, which reports progress and results on
 * stdout via writefln. Two files are reported as duplicates when they have the
 * same size and the same MD5 digest over their leading MD5_AMOUNT bytes, so a
 * reported match is "very likely", not byte-for-byte verified.
 */
class DuplicateFinder {
    /**
     * Traverses each directory in `dirs`, groups the files found by size and
     * then by partial MD5 digest, and prints every group with more than one
     * member.
     *
     * Files smaller than SIZE_THRESHOLD are ignored. Entries that cannot be
     * examined are reported with "Skipping ..." and traversal carries on.
     *
     * Params:
     *     dirs = directories to traverse
     */
    this(in string[] dirs) {
        FileInfo[] files;

        writefln("Accumulating files");

        // Remembered so the outer handler can say roughly where a traversal
        // was abandoned.
        string last_name;

        foreach (string dir; dirs) {
            try {
                // NOTE(review): the third argument is presumably std.file's
                // follow-symlinks flag -- confirm against the Phobos version in use.
                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                    last_name = name;
                    try {
                        if (isFile(name)) {
                            ulong size = getSize(name);
                            if (size >= SIZE_THRESHOLD) {
                                files ~= FileInfo(name, size);
                            }
                        }
                    }
                    catch (Exception ex) {
                        writefln("Skipping %s", name);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
            catch (FileException ex) {
                // The directory iteration itself failed; give up on this root
                // but keep whatever was collected so far.
                writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
            }
        }

        writefln("Processing %s files", files.length);

        // Indices into `files`, keyed by file size. size_t is used because
        // that is the type foreach yields for an array index; a narrower type
        // would not accept it on 64-bit targets.
        size_t[][ulong] size_to_file_indices;
        // Set of sizes shared by at least two files.
        bool[ulong] duplicate_sizes;

        foreach (index, file; files) {
            if (size_t[] * same_size = (file.size in size_to_file_indices)) {
                if ((*same_size).length == 1) {
                    // Second time we've seen a file of this size,
                    // record it in the duplicate_sizes set
                    duplicate_sizes[file.size] = true;
                }

                (*same_size) ~= index;
            }
            else {
                size_to_file_indices[file.size] = [ index ];
            }
        }

        writefln("Number of files of duplicate size %s", duplicate_sizes.length);

        foreach (size; duplicate_sizes.keys) {
            size_t[] indices = size_to_file_indices[size];

            // Within one size class, bucket the files by partial digest.
            size_t[][ubyte[16]] digest_to_indices;

            foreach (index; indices) {
                FileInfo file_info = files[index];

                try {
                    ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT);

                    // Appending through a missing key creates the entry, so
                    // first and subsequent sightings are handled alike.
                    digest_to_indices[digest] ~= index;
                }
                catch (ErrnoException ex) {
                    // The file could not be opened or read; leave it out of
                    // the comparison rather than aborting the whole run.
                }
            }

            foreach (matching; digest_to_indices) {
                if (matching.length > 1) {
                    // List only the files that share this digest. Iterating
                    // the whole size class here would report unrelated files
                    // that merely happen to have the same size.
                    foreach (index; matching) {
                        FileInfo file_info = files[index];
                        writefln("%s %s", file_info.size, file_info.name);
                    }
                    writefln("");
                }
            }
        }

        writefln("Done\n");
    }

    // Path and size of one candidate file.
    struct FileInfo {
        this(in string name_, in ulong size_) {
            name = name_;
            size = size_;
        }

        string name;
        ulong  size;
    }

    // Byte-count units. MEGA is not referenced elsewhere in this class.
    static immutable ulong KILO = 1 << 10;
    static immutable ulong MEGA = 1 << 20;

    // Files smaller than this many bytes are not considered at all.
    static immutable ulong SIZE_THRESHOLD = 100 * KILO;
    // Upper bound passed to compute_md5 as max_bytes.
    static immutable ulong MD5_AMOUNT     = 10 * KILO;

    /**
     * Computes an MD5 digest over the leading part of a file.
     *
     * The file is read in 1024-byte chunks; reading stops after the first
     * chunk that brings the running total to `max_bytes` or more, or at end
     * of file, whichever comes first.
     *
     * Params:
     *     name      = path of the file to read
     *     max_bytes = number of bytes after which reading stops
     *
     * Returns: the 16-byte MD5 digest of the bytes that were read.
     */
    static ubyte[16] compute_md5(in string name, in ulong max_bytes) {
        ubyte[16] digest;

        auto file = File(name, "r");
        scope(exit) file.close();

        MD5_CTX context;
        context.start();

        ulong byte_count = 0;
        foreach (ubyte[] buffer; chunks(file, 1024)) {
            context.update(buffer);
            byte_count += buffer.length;
            if (byte_count >= max_bytes) {
                break;
            }
        }

        context.finish(digest);

        return digest;
    }
}
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
146
c566cdbccaeb Added dupes, the rewrite.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
// Program entry point. Every command-line argument after the program name is
// handed to DuplicateFinder as a directory to scan; the scan itself runs
// inside the DuplicateFinder constructor. Always returns 0.
int main(string[] args) {
    const string[] dirs = args[1 .. $];

    new DuplicateFinder(dirs);

    return 0;
}