117
|
1 import std.stdio;
|
|
2 import std.string;
|
|
3 import std.exception;
|
|
4 import std.file;
|
|
5 import std.md5;
|
|
6
|
|
class DuplicateFinder {
    /// Scans the given directory trees for probable duplicate files and
    /// prints each group to stdout. Files are grouped first by size (cheap),
    /// then by an MD5 digest of their leading MD5_AMOUNT bytes (cheaper than
    /// hashing whole files; note: a shared prefix digest is a strong hint,
    /// not proof, of full-file equality).
    this(in string[] dirs) {
        FileInfo[] _file_array;

        writefln("Accumulating files");

        // Remember the last entry reached so we can report where a
        // mid-iteration dirEntries failure happened.
        string last_name;

        foreach (string dir; dirs) {
            try {
                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                    last_name = name;
                    try {
                        if (isFile(name)) {
                            ulong size = getSize(name);
                            // Ignore small files entirely; they are cheap to
                            // keep and dominate the candidate count otherwise.
                            if (size >= SIZE_THRESHOLD) {
                                _file_array ~= FileInfo(name, size);
                            }
                        }
                    }
                    catch (Exception ex) {
                        // Unstat-able entry (permissions, broken link, ...).
                        writefln("Skipping %s", name);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
            catch (FileException ex) {
                // dirEntries can throw mid-iteration (e.g. unreadable
                // subdirectory); report where we got to and keep going.
                writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
            }
        }

        writefln("Processing %s files", _file_array.length);

        // size -> indices into _file_array holding files of that size.
        uint[][ulong] size_to_file_indices;
        // Set of sizes seen more than once; only these can hold duplicates.
        bool[ulong] duplicate_sizes;

        foreach (index, file; _file_array) {
            if (uint[] * indices = (file.size in size_to_file_indices)) {
                if (indices.length == 1) {
                    // Second file of this size: mark the size as a
                    // duplicate candidate (exactly once).
                    duplicate_sizes[file.size] = true;
                }

                (*indices) ~= index;
            }
            else {
                size_to_file_indices[file.size] = [ index ];
            }
        }

        writefln("Number of files of duplicate size %s", duplicate_sizes.length);

        foreach (size; duplicate_sizes.keys) {
            uint[] indices = size_to_file_indices[size];

            // Within one size class, bucket files by prefix digest.
            uint[][ubyte[16]] digest_to_indices;

            foreach (index; indices) {
                FileInfo file_info = _file_array[index];

                try {
                    ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT);

                    // ~= on a missing AA key creates the entry, so no
                    // separate membership test is needed.
                    digest_to_indices[digest] ~= index;
                }
                catch (ErrnoException ex) {
                    // File vanished or became unreadable since traversal;
                    // best-effort: skip it silently.
                }
            }

            foreach (indices2; digest_to_indices) {
                if (indices2.length > 1) {
                    // List only the files sharing THIS digest. (Bug fix:
                    // the original iterated `indices` — every file of this
                    // size — printing non-duplicates, once per digest group.)
                    foreach (index; indices2) {
                        FileInfo file_info = _file_array[index];
                        writefln("%s %s", file_info.size, file_info.name);
                    }
                    writefln("");
                }
            }
        }

        writefln("Done\n");
    }

    /// Name/size record for one candidate file.
    struct FileInfo {
        this(in string name_, in ulong size_) {
            name = name_;
            size = size_;
        }

        string name;
        ulong size;
    }

    immutable ulong KILO = 1 << 10;
    immutable ulong MEGA = 1 << 20;

    /// Files smaller than this are never considered.
    immutable ulong SIZE_THRESHOLD = 100 * KILO;
    /// At most this many leading bytes of each file are hashed.
    immutable ulong MD5_AMOUNT = 10 * KILO;

    /// Returns the MD5 digest of (approximately) the first max_bytes bytes
    /// of the named file; hashing stops at the first chunk boundary at or
    /// past max_bytes. Throws (e.g. ErrnoException) if the file cannot be
    /// opened or read.
    static ubyte[16] compute_md5(in string name, in ulong max_bytes) {
        ubyte[16] digest;

        auto file = File(name, "r");
        scope(exit) file.close;

        MD5_CTX context;
        context.start();
        ulong byte_count = 0;
        foreach (ubyte[] buffer; chunks(file, 1024)) {
            context.update(buffer);
            byte_count += buffer.length;
            if (byte_count >= max_bytes) {
                // Prefix digest only: stop once enough bytes are hashed.
                break;
            }
        }

        context.finish(digest);

        return digest;
    }
}
|
|
146
|
|
/// Entry point: each command-line argument names a directory root to scan.
/// All scanning and reporting happens in the DuplicateFinder constructor.
int main(string[] args) {
    auto roots = args[1 .. $];
    new DuplicateFinder(roots);
    return 0;
}
|