Mercurial > projects > doodle
comparison doodle/utils/prog/dupes.d @ 117:c566cdbccaeb
Added dupes, the rewrite.
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Tue, 19 Apr 2011 23:11:03 +0930 |
parents | |
children | 94233d54e16a |
comparison
equal
deleted
inserted
replaced
116:31c27f4f3bbc | 117:c566cdbccaeb |
---|---|
1 import std.stdio; | |
2 import std.string; | |
3 import std.exception; | |
4 import std.random; | |
5 import std.algorithm; | |
6 import std.file; | |
7 import std.c.stdio; | |
8 import std.c.string; | |
9 import std.cstream; | |
10 import core.sys.posix.dirent; | |
11 import std.md5; | |
12 | |
13 | |
/**
 * Scans directory trees and prints groups of files that appear to be
 * duplicates of one another.
 *
 * All of the work is done by the constructor:
 *   1. collect every regular file of at least SIZE_THRESHOLD bytes,
 *   2. bucket the collected files by size,
 *   3. within each bucket holding more than one file, bucket again by an
 *      MD5 digest of the leading part of each file (see compute_md5),
 *   4. print every digest bucket that holds more than one file.
 *
 * Because only a prefix of each file is hashed, a reported group means
 * "same size and same leading bytes", not a byte-for-byte comparison.
 */
class DuplicateFinder {
    /**
     * Runs the whole scan and writes the report to stdout.
     *
     * Params:
     *   dirs = root directories to traverse recursively.
     *
     * Files that cannot be examined are skipped rather than aborting the
     * scan.
     */
    this(in string[] dirs) {
        writefln("Accumulating files");

        // Remembered only so the error message below can say roughly where
        // a failed traversal had got to.
        string last_name;

        foreach (string dir; dirs) {
            try {
                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                    last_name = name;
                    try {
                        if (isFile(name)) {
                            ulong size = getSize(name);
                            if (size >= SIZE_THRESHOLD) {
                                _file_array ~= FileInfo(name, size);
                            }
                        }
                    }
                    catch (Exception ex) {
                        writefln("Skipping %s", name);
                        //writefln("Exception %s", ex);
                        // TODO accumulate errors and print after traversal
                    }
                }
            }
            catch (FileException ex) {
                // The traversal of this root stopped early; carry on with
                // the next root directory.
                writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
            }
        }

        writefln("Processing %s files", _file_array.length);

        // Indices into _file_array, grouped by file size.
        size_t[][ulong] size_to_file_indices;
        // Set of sizes shared by at least two files.
        bool[ulong] duplicate_sizes;

        foreach (index, file; _file_array) {
            //writefln("%s %s %s", index, file.name, file.size);

            if (size_t[] * indices = (file.size in size_to_file_indices)) {
                if ((*indices).length == 1) {
                    // Second time we've seen a file of this size,
                    // record it in the duplicate_sizes array
                    duplicate_sizes[file.size] = true;
                }

                (*indices) ~= index;
            }
            else {
                size_to_file_indices[file.size] = [ index ];
            }
        }

        writefln("Number of files of duplicate size %s", duplicate_sizes.length);

        foreach (size; duplicate_sizes.keys) {
            size_t[] indices = size_to_file_indices[size];
            //writefln("For size %s there are %s files", size, indices.length);

            // Indices of the same-sized files, grouped by digest.
            size_t[][ubyte[16]] digest_to_indices;

            foreach (index; indices) {
                FileInfo file_info = _file_array[index];

                try {
                    ubyte[16] digest = compute_md5(file_info.name);

                    // Appending through the index expression creates the
                    // entry on first use, so no separate lookup is needed.
                    digest_to_indices[digest] ~= index;
                }
                catch (ErrnoException ex) {
                    // Deliberately quiet: a file that cannot be read is
                    // simply left out of the comparison.
                    //writefln("Skipping: %s", file_info.name);
                }

                //writefln("\t%s", file_info.name);
            }

            foreach (indices2; digest_to_indices) {
                if (indices2.length > 1) {
                    // List only the files that share this digest, not every
                    // file that merely has the same size.
                    foreach (index; indices2) {
                        FileInfo file_info = _file_array[index];
                        writefln("%s %s", file_info.size, file_info.name);
                    }
                    writefln("");
                }
            }
        }

        writefln("Done\n");
    }

    /**
     * Computes an MD5 digest over the leading part of a file.
     *
     * The file is read in 1024-byte chunks and reading stops once at least
     * MD5_AMOUNT bytes have been fed to the digest, so for larger files only
     * a prefix contributes to the result.
     *
     * Params:
     *   name = path of the file to hash.
     *
     * Returns: the 16-byte MD5 digest of the bytes that were read.
     *
     * The constructor catches ErrnoException around this call; presumably
     * that is what opening or reading the file raises on failure.
     */
    ubyte[16] compute_md5(in string name) {
        ubyte[16] digest;

        auto file = File(name, "r");
        scope(exit) file.close();

        MD5_CTX context;
        context.start();
        {
            // Feed chunks to the digest until enough bytes have been seen
            // or the file is exhausted.
            ulong amount = 0;
            foreach (ubyte[] buffer; chunks(file, 1024)) {
                context.update(buffer);
                //bytes_chewed(buffer.length);
                amount += buffer.length;
                if (amount >= MD5_AMOUNT) {
                    break;
                }
            }
        }
        context.finish(digest);

        return digest;
    }

    private {
        // Declared static so these are class-level constants rather than
        // per-instance fields.
        static immutable ulong KILO = 1 << 10;
        static immutable ulong MEGA = 1 << 20;

        // Files smaller than this many bytes are ignored entirely.
        static immutable ulong SIZE_THRESHOLD = 100 * KILO;
        // Hashing stops once at least this many bytes have been digested.
        static immutable ulong MD5_AMOUNT     = 10 * KILO;

        // Path and size of one candidate file.
        struct FileInfo {
            this(in string name_, in ulong size_) {
                name = name_;
                size = size_;
            }

            string name;
            ulong  size;
        }

        // Every file that passed the size threshold, in traversal order.
        FileInfo[] _file_array;
    }
}
158 | |
/**
 * Program entry point.
 *
 * Every command-line argument after the program name is handed to
 * DuplicateFinder as a directory to scan; constructing the finder performs
 * the scan and prints the report.
 *
 * Returns: always 0.
 */
int main(string[] args) {
    // args[0] is the program name; the remainder are the directories.
    string[] directories = args[1 .. $];

    new DuplicateFinder(directories);

    return 0;
}