comparison doodle/utils/prog/dupes.d @ 117:c566cdbccaeb

Added dupes, the rewrite.
author David Bryant <bagnose@gmail.com>
date Tue, 19 Apr 2011 23:11:03 +0930
parents
children 94233d54e16a
import std.stdio;
import std.string;
import std.exception;
import std.random;
import std.algorithm;
import std.file;
import std.c.stdio;
import std.c.string;
import std.cstream;
import core.sys.posix.dirent;
import std.md5;
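
// DuplicateFinder scans the directories given to its constructor and reports
// files that appear to be duplicates.  The strategy is two-phase: first bucket
// files by size (cheap), then, for sizes shared by more than one file, compare
// an MD5 digest of each file's first MD5_AMOUNT bytes.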
class DuplicateFinder {
    this(in string[] dirs) {
        writefln("Accumulating files");

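        // Remember the most recently visited entry so it can be named in the
        // error message if dirEntries bails out part-way through a directory.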
        string last_name;

        foreach (string dir; dirs) {
            try {
                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                    last_name = name;
                    try {
                        if (isFile(name)) {
                            ulong size = getSize(name);
                            if (size >= SIZE_THRESHOLD) {
                                _file_array ~= FileInfo(name, size);
                            }
                        }
                    }
                    catch (Exception ex) {
                        writefln("Skipping %s", name);
                        //writefln("Exception %s", ex);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
            catch (FileException ex) {
                // ignore
                writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
            }
        }

        writefln("Processing %s files", _file_array.length);

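        // Phase one: bucket every candidate file by size.  A size that is
        // seen more than once is recorded in duplicate_sizes; only those
        // buckets are worth hashing in phase two.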
        size_t[][ulong] size_to_file_indices;
        bool[ulong] duplicate_sizes;

        foreach (index, file; _file_array) {
            //writefln("%s %s %s", index, file.name, file.size);

            if (size_t[] * indices = (file.size in size_to_file_indices)) {
                if (indices.length == 1) {
                    // Second time we've seen a file of this size,
                    // record it in the duplicate_sizes array
                    duplicate_sizes[file.size] = true;
                }

                (*indices) ~= index;
            }
            else {
                size_to_file_indices[file.size] = [ index ];
            }
        }

        writefln("Number of sizes shared by two or more files: %s", duplicate_sizes.length);

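        // Phase two: for every size with more than one file, group the files
        // by the MD5 digest of their leading bytes (see compute_md5 below)
        // and report any group that still contains more than one file.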
        foreach (size; duplicate_sizes.keys) {
            size_t[] indices = size_to_file_indices[size];
            //writefln("For size %s there are %s files", size, indices.length);

            size_t[][ubyte[16]] digest_to_indices;

            foreach (index; indices) {
                FileInfo file_info = _file_array[index];

                try {
                    ubyte[16] digest = compute_md5(file_info.name);

                    if (size_t[] * duplicate_indices = (digest in digest_to_indices)) {
                        // Same size and same digest - treat it as a duplicate

                        (*duplicate_indices) ~= index;
                    }
                    else {
                        digest_to_indices[digest] ~= index;
                    }
                }
                catch (ErrnoException ex) {
                    //writefln("Skipping: %s", file_info.name);
                }

                //writefln("\t%s", file_info.name);
            }

            foreach (indices2; digest_to_indices) {
                if (indices2.length > 1) {
                    // List the duplicates
                    foreach (index; indices2) {
                        FileInfo file_info = _file_array[index];
                        writefln("%s %s", file_info.size, file_info.name);
                    }
                    writefln("");
                }
            }
        }

        writefln("Done\n");
    }

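    // compute_md5 hashes only the first MD5_AMOUNT bytes of the file (read in
    // 1 KiB chunks), so very large files are not read in full.  Files that
    // share a prefix but differ later would therefore be reported as
    // duplicates; raise MD5_AMOUNT if that trade-off is not acceptable.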
    ubyte[16] compute_md5(in string name) {
        ubyte[16] digest;

        auto file = File(name, "r");
        scope(exit) file.close;

        MD5_CTX context;
        context.start();
        { // Block 1:
            // Compute the actual digest
            ulong amount = 0;
            foreach (ubyte[] buffer; chunks(file, 1024)) {
                context.update(buffer);
                //bytes_chewed(buffer.length);
                amount += buffer.length;
                if (amount >= MD5_AMOUNT) {
                    break;
                }
            }
        }
        context.finish(digest);

        return digest;
    }

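    // The std.md5 module used above was later superseded by std.digest.md5.
    // The following is a rough, untested sketch (not part of this changeset)
    // of the same prefix-hashing loop written against that newer API; it is
    // disabled with version(none) so it has no effect on the build.
    version (none) {
        ubyte[16] compute_md5_digest(in string name) {
            import std.digest.md5 : MD5;

            auto file = File(name, "r");
            scope(exit) file.close();

            MD5 context;
            context.start();

            ulong amount = 0;
            foreach (ubyte[] buffer; file.byChunk(1024)) {
                context.put(buffer);
                amount += buffer.length;
                if (amount >= MD5_AMOUNT) {
                    break;
                }
            }

            return context.finish();
        }
    }
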
    private {
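        // Tuning constants: files smaller than SIZE_THRESHOLD are ignored,
        // and at most MD5_AMOUNT bytes of each candidate are hashed.
        // (MEGA is currently unused.)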
        immutable ulong KILO = 1 << 10;
        immutable ulong MEGA = 1 << 20;

        immutable ulong SIZE_THRESHOLD = 100 * KILO;
        immutable ulong MD5_AMOUNT = 10 * KILO;

        struct FileInfo {
            this(in string name_, in ulong size_) {
                name = name_;
                size = size_;
            }

            string name;
            ulong size;
        }

        FileInfo[] _file_array;
    }
}

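// Example invocation (the binary name "dupes" is just an assumption from the
// file name):
//
//     dupes ~/photos /mnt/backup
//
// Every command-line argument is treated as a directory root to scan.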
int main(string[] args) {
    new DuplicateFinder(args[1..$]);

    return 0;
}