Mercurial > projects > doodle
comparison doodle/utils/prog/duplicates.d @ 114:b87e2e0a046a
Cleanup of duplicates.d
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Fri, 15 Apr 2011 11:07:47 +0930 |
parents | 9cc6c428fdbe |
children | d7330cc52622 |
comparison
equal
deleted
inserted
replaced
113:9cc6c428fdbe | 114:b87e2e0a046a |
---|---|
7 import std.c.string; | 7 import std.c.string; |
8 import std.cstream; | 8 import std.cstream; |
9 import core.sys.posix.dirent; | 9 import core.sys.posix.dirent; |
10 import std.md5; | 10 import std.md5; |
11 | 11 |
12 // This program recursively processes files in a list | |
13 // of directories, computing an MD5 digest on each file | |
14 // and then informing the user of files with duplicate content. | |
15 // Only duplicate files over a certain size are reported. | |
16 | |
12 class DuplicateFinder { | 17 class DuplicateFinder { |
13 this(in string dir) { | 18 this(in string[] dirs) { |
14 // First pass to gather the number of files and bytes | 19 // First pass to gather the number of files and bytes |
20 // so that we are able to convey progress to the user | |
15 | 21 |
16 writeln("Accumulating total bytes / files"); | 22 writeln("Accumulating total bytes / files"); |
17 | 23 |
18 uint total_files = 0; | 24 uint total_files = 0; |
19 | 25 |
20 try { | 26 try { |
21 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { | 27 foreach (string dir; dirs) { |
22 try { | 28 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { |
23 if (isFile(name)) { | 29 try { |
24 _total_bytes += getSize(name); | 30 if (isFile(name)) { |
25 ++total_files; | 31 _total_bytes += getSize(name); |
32 ++total_files; | |
33 } | |
26 } | 34 } |
27 } | 35 catch (Exception ex) { |
28 catch (Exception ex) { | 36 writefln("Skipping %s", name); |
29 writefln("Skipping %s", name); | 37 //writefln("Exception %s", ex); |
30 //writefln("Exception %s", ex); | 38 // TODO accumulate errors and print after traversal |
39 } | |
31 } | 40 } |
32 } | 41 } |
33 } | 42 } |
34 catch (FileException ex) { | 43 catch (FileException ex) { |
35 // ignore | 44 // ignore |
36 writefln("dirEntries bailed out. Continuing anyway"); | 45 writefln("dirEntries bailed out. Continuing anyway"); |
37 } | 46 } |
38 | 47 |
39 writefln("Files %s, bytes %s", total_files, _total_bytes); | 48 writefln("Files %s, bytes %s", total_files, _total_bytes); |
40 writeln("Accumulating MD5 sums"); | |
41 | 49 |
42 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { | 50 // Go through the files again, but this time |
43 if (isFile(name)) { | 51 // compute the MD5 digests and build our data structures |
44 try { | 52 |
45 //writefln("MD5'ing %s", name); | 53 writeln("Accumulating MD5 digests"); |
46 compute_md5(name); | 54 |
47 } | 55 foreach (string dir; dirs) { |
48 catch (ErrnoException ex) { | 56 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { |
49 //writefln("Skipping file: %s, %s", name, ex); | 57 if (isFile(name)) { |
50 //writefln("(errno) Skipping file: %s", name); | 58 try { |
51 // TODO accumulate errors and print after traversal is complete | 59 //writefln("MD5'ing %s", name); |
60 compute_md5(name); | |
61 } | |
62 catch (ErrnoException ex) { | |
63 //writefln("Skipping file: %s, %s", name, ex); | |
64 //writefln("(errno) Skipping file: %s", name); | |
65 // TODO accumulate errors and print after traversal is complete | |
66 } | |
52 } | 67 } |
53 } | 68 } |
54 } | 69 } |
55 | 70 |
56 writefln(""); | 71 writefln(""); |
57 | 72 |
58 writeln("Sorting keys"); | 73 // Sort our duplicate digests by size so that we print |
74 // the biggest duplicate file offenders first | |
75 | |
76 writeln("Sorting duplicate digests by size"); | |
59 | 77 |
60 ubyte[16][] keys = _duplicate_digests.keys; | 78 ubyte[16][] keys = _duplicate_digests.keys; |
61 bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; } | 79 bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; } |
62 sort!(compare_by_size)(keys); | 80 sort!(compare_by_size)(keys); |
81 | |
82 // Print the results out to the user, in descending order | |
83 // of file size | |
63 | 84 |
64 writeln("Printing results"); | 85 writeln("Printing results"); |
65 | 86 |
66 foreach (digest; keys) { | 87 foreach (digest; keys) { |
67 auto file_info = _file_info_map[digest]; | 88 auto file_info = _file_info_map[digest]; |
101 | 122 |
102 void bytes_chewed(ulong bytes) { | 123 void bytes_chewed(ulong bytes) { |
103 _current_byte += bytes; | 124 _current_byte += bytes; |
104 double progress = cast(double)_current_byte / cast(double)_total_bytes; | 125 double progress = cast(double)_current_byte / cast(double)_total_bytes; |
105 if (progress - _last_progress > 0.0005) { | 126 if (progress - _last_progress > 0.0005) { |
106 writef("\rProgress %3.1f%%", 100.0 * progress); | 127 writef("\rProgress %.1f%%", 100.0 * progress); |
107 std.stdio.stdout.flush(); | 128 std.stdio.stdout.flush(); |
108 _last_progress = progress; | 129 _last_progress = progress; |
109 } | 130 } |
110 | |
111 } | 131 } |
112 | 132 |
113 void compute_md5(in string filename) { | 133 void compute_md5(in string filename) { |
114 //writefln("%s", filename); | |
115 auto file = File(filename, "r"); | 134 auto file = File(filename, "r"); |
116 scope(exit) file.close; | 135 scope(exit) file.close; |
117 | 136 |
118 ubyte[16] digest; | 137 ubyte[16] digest; |
119 | 138 |
122 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { | 141 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { |
123 bytes_chewed(buffer.length); | 142 bytes_chewed(buffer.length); |
124 context.update(buffer); | 143 context.update(buffer); |
125 } | 144 } |
126 context.finish(digest); | 145 context.finish(digest); |
127 //writefln("%s: %s", digestToString(digest), filename); | |
128 | 146 |
129 if (FileInfo * file_info = (digest in _file_info_map)) { | 147 if (FileInfo * file_info = (digest in _file_info_map)) { |
130 // duplicate | 148 // This is a duplicate digest, append the subsequent name |
131 file_info.names ~= filename; | 149 file_info.names ~= filename; |
132 assert(file_info.names.length > 1); | |
133 | 150 |
151 // Record the duplicate as an offender if its size exceeds the threshold | |
134 if (file_info.size >= SIZE_THRESHOLD) { | 152 if (file_info.size >= SIZE_THRESHOLD) { |
135 _duplicate_digests[digest] = true; | 153 _duplicate_digests[digest] = true; |
136 } | 154 } |
137 } | 155 } |
138 else { | 156 else { |
139 // unseen | 157 // We have not seen this digest before |
140 _file_info_map[digest] = FileInfo(getSize(filename), filename); | 158 _file_info_map[digest] = FileInfo(getSize(filename), filename); |
141 //writefln("%s", _file_info_map.length); | |
142 } | 159 } |
143 } | 160 } |
144 } | 161 } |
145 } | 162 } |
146 | 163 |
147 int main(string[] args) { | 164 int main(string[] args) { |
148 foreach (string arg; args[1..$]) { | 165 new DuplicateFinder(args[1..$]); |
149 new DuplicateFinder(arg); | |
150 } | |
151 | 166 |
152 return 0; | 167 return 0; |
153 } | 168 } |