Mercurial > projects > doodle
annotate doodle/utils/prog/duplicates.d @ 114:b87e2e0a046a
Cleanup of duplicates.d
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Fri, 15 Apr 2011 11:07:47 +0930 |
parents | 9cc6c428fdbe |
children | d7330cc52622 |
rev | line source |
---|---|
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
1 import std.stdio; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
2 import std.string; |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
3 import std.exception; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
4 import std.algorithm; |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
5 import std.file; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
6 import std.c.stdio; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
7 import std.c.string; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
8 import std.cstream; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
9 import core.sys.posix.dirent; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
10 import std.md5; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
11 |
114 | 12 // This program recursively processes files in a list |
13 // of directories, computing an MD5 digest on each file | |
14 // and then informing the user of files with duplicate content. | |
15 // Only duplicate files over a certain size are reported. | |
16 | |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
17 class DuplicateFinder { |
114 | 18 this(in string[] dirs) { |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
19 // First pass to gather the number of files and bytes |
114 | 20 // so that we are able to convey progress to the user |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
21 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
22 writeln("Accumulating total bytes / files"); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
23 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
24 uint total_files = 0; |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
25 |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
26 try { |
114 | 27 foreach (string dir; dirs) { |
28 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { | |
29 try { | |
30 if (isFile(name)) { | |
31 _total_bytes += getSize(name); | |
32 ++total_files; | |
33 } | |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
34 } |
114 | 35 catch (Exception ex) { |
36 writefln("Skipping %s", name); | |
37 //writefln("Exception %s", ex); | |
38 // TODO accumulate errors and print after traversal | |
39 } | |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
40 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
41 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
42 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
43 catch (FileException ex) { |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
44 // ignore |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
45 writefln("dirEntries bailed out. Continuing anyway"); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
46 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
47 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
48 writefln("Files %s, bytes %s", total_files, _total_bytes); |
114 | 49 |
50 // Go through the files again, but this time | |
51 // compute the MD5 digests and build our data structures | |
52 | |
53 writeln("Accumulating MD5 digests"); | |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
54 |
114 | 55 foreach (string dir; dirs) { |
56 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { | |
57 if (isFile(name)) { | |
58 try { | |
59 //writefln("MD5'ing %s", name); | |
60 compute_md5(name); | |
61 } | |
62 catch (ErrnoException ex) { | |
63 //writefln("Skipping file: %s, %s", name, ex); | |
64 //writefln("(errno) Skipping file: %s", name); | |
65 // TODO accumulate errors and print after traversal is complete | |
66 } | |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
67 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
68 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
69 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
70 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
71 writefln(""); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
72 |
114 | 73 // Sort our duplicate digests by size so that we print |
74 // the biggest duplicate file offenders first | |
75 | |
76 writeln("Sorting duplicate digests by size"); | |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
77 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
78 ubyte[16][] keys = _duplicate_digests.keys; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
79 bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
80 sort!(compare_by_size)(keys); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
81 |
114 | 82 // Print the results out to the user, in descending order |
83 // of file size | |
84 | |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
85 writeln("Printing results"); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
86 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
87 foreach (digest; keys) { |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
88 auto file_info = _file_info_map[digest]; |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
89 /* |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
90 writefln("Size %s, Count %s, Digest %s", |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
91 file_info.size, file_info.names.length, digestToString(digest)); |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
92 */ |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
93 writefln("Size %s, Count %s", file_info.size, file_info.names.length); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
94 foreach (name; file_info.names) { |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
95 writefln("\t%s", name); |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
96 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
97 } |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
98 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
99 writeln("Done"); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
100 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
101 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
102 private { |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
103 struct FileInfo { |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
104 this(in ulong size_, string first_name) { |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
105 size = size_; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
106 names ~= first_name; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
107 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
108 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
109 ulong size; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
110 string[] names; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
111 }; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
112 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
113 //static const ulong SIZE_THRESHOLD = 1_000; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
114 static const ulong SIZE_THRESHOLD = 0; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
115 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
116 bool[ubyte[16]] _duplicate_digests; // set of all duplicate digests |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
117 FileInfo[ubyte[16]] _file_info_map; // map of digest to file info |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
118 |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
119 ulong _total_bytes; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
120 ulong _current_byte; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
121 double _last_progress = -1.0; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
122 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
123 void bytes_chewed(ulong bytes) { |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
124 _current_byte += bytes; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
125 double progress = cast(double)_current_byte / cast(double)_total_bytes; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
126 if (progress - _last_progress > 0.0005) { |
114 | 127 writef("\rProgress %.1f%%", 100.0 * progress); |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
128 std.stdio.stdout.flush(); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
129 _last_progress = progress; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
130 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
131 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
132 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
133 void compute_md5(in string filename) { |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
134 auto file = File(filename, "r"); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
135 scope(exit) file.close; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
136 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
137 ubyte[16] digest; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
138 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
139 MD5_CTX context; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
140 context.start(); |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
141 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
142 bytes_chewed(buffer.length); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
143 context.update(buffer); |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
144 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
145 context.finish(digest); |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
146 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
147 if (FileInfo * file_info = (digest in _file_info_map)) { |
114 | 148 // This is a duplicate digest, append the subsequent name |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
149 file_info.names ~= filename; |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
150 |
114 | 151 // Record the duplicate as an offender if its size meets or exceeds the threshold |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
152 if (file_info.size >= SIZE_THRESHOLD) { |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
153 _duplicate_digests[digest] = true; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
154 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
155 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
156 else { |
114 | 157 // We have not seen this digest before |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
158 _file_info_map[digest] = FileInfo(getSize(filename), filename); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
159 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
160 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
161 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
162 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
163 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
164 int main(string[] args) { |
114 | 165 new DuplicateFinder(args[1..$]); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
166 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
167 return 0; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
168 } |