117
|
1 import std.stdio;
|
|
2 import std.string;
|
|
3 import std.exception;
|
|
4 import std.random;
|
|
5 import std.algorithm;
|
|
6 import std.file;
|
|
7 import std.c.stdio;
|
|
8 import std.c.string;
|
|
9 import std.cstream;
|
|
10 import core.sys.posix.dirent;
|
|
11 import std.md5;
|
|
12
|
|
13
|
|
/**
 * Finds probable duplicate files beneath a set of directories.
 *
 * All work is done by the constructor, which reports its progress and its
 * findings on standard output:
 *
 *   1. every regular file of at least SIZE_THRESHOLD bytes is collected;
 *   2. the collected files are grouped by exact size;
 *   3. within each size group that has more than one member, the files are
 *      grouped by the MD5 digest of their leading bytes (see compute_md5);
 *   4. every digest group with more than one member is printed.
 *
 * Because only the leading MD5_AMOUNT bytes are hashed, two files that have
 * the same size and the same leading bytes are reported as duplicates even
 * if they differ further on.
 */
class DuplicateFinder {
    /**
     * Scans the given directories and prints the groups of duplicate files.
     *
     * Params:
     *   dirs = root directories to traverse recursively
     */
    this(in string[] dirs) {
        writefln("Accumulating files");

        // Most recent path seen; only used to report where a traversal stopped.
        string last_name;

        foreach (string dir; dirs) {
            // Start from the root itself so that a failure before the first
            // entry does not report a stale name left over from another root.
            last_name = dir;

            try {
                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                    last_name = name;

                    try {
                        if (isFile(name)) {
                            ulong size = getSize(name);
                            if (size >= SIZE_THRESHOLD) {
                                _file_array ~= FileInfo(name, size);
                            }
                        }
                    }
                    catch (Exception ex) {
                        // Best effort: an entry that cannot be examined is
                        // reported and skipped, the traversal continues.
                        writefln("Skipping %s", name);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
            catch (FileException ex) {
                // The traversal of this root is abandoned; files gathered so
                // far are kept and the remaining roots are still processed.
                writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
            }
        }

        writefln("Processing %s files", _file_array.length);

        // Indices into _file_array, keyed by file size. size_t matches the
        // type of the foreach index below on both 32- and 64-bit targets.
        size_t[][ulong] size_to_file_indices;
        // Set of sizes shared by at least two files.
        bool[ulong] duplicate_sizes;

        foreach (index, file; _file_array) {
            if (size_t[]* indices = (file.size in size_to_file_indices)) {
                if ((*indices).length == 1) {
                    // Second time we've seen a file of this size,
                    // record it in the duplicate_sizes set
                    duplicate_sizes[file.size] = true;
                }

                (*indices) ~= index;
            }
            else {
                size_to_file_indices[file.size] = [ index ];
            }
        }

        // Note: the value printed is the number of distinct sizes that occur
        // more than once, not a count of files.
        writefln("Number of files of duplicate size %s", duplicate_sizes.length);

        foreach (size; duplicate_sizes.keys) {
            size_t[] indices = size_to_file_indices[size];

            // Indices of the same-sized files, keyed by digest.
            size_t[][ubyte[16]] digest_to_indices;

            foreach (index; indices) {
                FileInfo file_info = _file_array[index];

                try {
                    ubyte[16] digest = compute_md5(file_info.name);

                    // Appending through the index expression creates the
                    // entry when the digest has not been seen before.
                    digest_to_indices[digest] ~= index;
                }
                catch (ErrnoException ex) {
                    // Deliberately silent: a file that cannot be opened or
                    // read is simply left out of the comparison.
                }
            }

            foreach (indices2; digest_to_indices) {
                if (indices2.length > 1) {
                    // List only the files that share this digest, not every
                    // file that merely has the same size.
                    foreach (index; indices2) {
                        FileInfo file_info = _file_array[index];
                        writefln("%s %s", file_info.size, file_info.name);
                    }
                    writefln("");
                }
            }
        }

        writefln("Done\n");
    }

    /**
     * Computes the MD5 digest of the leading part of a file.
     *
     * The file is read in 1024-byte chunks and reading stops as soon as at
     * least MD5_AMOUNT bytes have been consumed, or at end of file.
     *
     * Params:
     *   name = path of the file to hash
     *
     * Returns: the 16-byte MD5 digest of the bytes that were read
     *
     * Throws: ErrnoException if the file cannot be opened
     */
    ubyte[16] compute_md5(in string name) {
        ubyte[16] digest;

        // Binary mode: the raw bytes must be hashed without any text-mode
        // translation on platforms that distinguish the two.
        auto file = File(name, "rb");
        scope(exit) file.close();

        MD5_CTX context;
        context.start();

        ulong amount = 0;
        foreach (ubyte[] buffer; chunks(file, 1024)) {
            context.update(buffer);
            amount += buffer.length;
            if (amount >= MD5_AMOUNT) {
                break;
            }
        }

        context.finish(digest);

        return digest;
    }

    private {
        immutable ulong KILO = 1 << 10;
        immutable ulong MEGA = 1 << 20;

        // Files smaller than this many bytes are ignored altogether.
        immutable ulong SIZE_THRESHOLD = 100 * KILO;
        // Number of leading bytes of each file that are fed to MD5.
        immutable ulong MD5_AMOUNT = 10 * KILO;

        // Path and size of one collected file.
        struct FileInfo {
            this(in string name_, in ulong size_) {
                name = name_;
                size = size_;
            }

            string name;
            ulong size;
        }

        // Every collected file, in traversal order; the index maps built in
        // the constructor refer to positions in this array.
        FileInfo[] _file_array;
    }
}
|
|
158
|
|
/**
 * Program entry point: every command-line argument after the program name
 * is handed to DuplicateFinder as a directory to scan.
 *
 * Returns: always 0
 */
int main(string[] args) {
    // Drop args[0], the program name.
    string[] directories = args[1 .. $];

    // The constructor performs the scan and prints the results.
    new DuplicateFinder(directories);

    return 0;
}
|