comparison doodle/utils/prog/duplicates.d @ 114:b87e2e0a046a

Cleanup of duplicates.d
author David Bryant <bagnose@gmail.com>
date Fri, 15 Apr 2011 11:07:47 +0930
parents 9cc6c428fdbe
children d7330cc52622
comparison of 113:9cc6c428fdbe and 114:b87e2e0a046a
--- a/doodle/utils/prog/duplicates.d
+++ b/doodle/utils/prog/duplicates.d
@@ -7,61 +7,82 @@
 import std.c.string;
 import std.cstream;
 import core.sys.posix.dirent;
 import std.md5;
 
+// This program recursively processes files in a list
+// of directories, computing an MD5 digest on each file
+// and then informing the user of files with duplicate content.
+// Only duplicate files over a certain size are reported.
+
 class DuplicateFinder {
-    this(in string dir) {
+    this(in string[] dirs) {
         // First pass to gather the number of files and bytes
+        // so that we are able to convey progress to the user
 
         writeln("Accumulating total bytes / files");
 
         uint total_files = 0;
 
         try {
-            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-                try {
-                    if (isFile(name)) {
-                        _total_bytes += getSize(name);
-                        ++total_files;
+            foreach (string dir; dirs) {
+                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
+                    try {
+                        if (isFile(name)) {
+                            _total_bytes += getSize(name);
+                            ++total_files;
+                        }
                     }
-                }
-                catch (Exception ex) {
-                    writefln("Skipping %s", name);
-                    //writefln("Exception %s", ex);
+                    catch (Exception ex) {
+                        writefln("Skipping %s", name);
+                        //writefln("Exception %s", ex);
+                        // TODO accumulate errors and print after traversal
+                    }
                 }
             }
         }
         catch (FileException ex) {
             // ignore
             writefln("dirEntries bailed out. Continuing anyway");
         }
 
         writefln("Files %s, bytes %s", total_files, _total_bytes);
-        writeln("Accumulating MD5 sums");
 
-        foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-            if (isFile(name)) {
-                try {
-                    //writefln("MD5'ing %s", name);
-                    compute_md5(name);
-                }
-                catch (ErrnoException ex) {
-                    //writefln("Skipping file: %s, %s", name, ex);
-                    //writefln("(errno) Skipping file: %s", name);
-                    // TODO accumulate errors and print after traversal is complete
+        // Go through the files again, but this time
+        // compute the MD5 digests and build our data structures
+
+        writeln("Accumulating MD5 digests");
+
+        foreach (string dir; dirs) {
+            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
+                if (isFile(name)) {
+                    try {
+                        //writefln("MD5'ing %s", name);
+                        compute_md5(name);
+                    }
+                    catch (ErrnoException ex) {
+                        //writefln("Skipping file: %s, %s", name, ex);
+                        //writefln("(errno) Skipping file: %s", name);
+                        // TODO accumulate errors and print after traversal is complete
+                    }
                 }
             }
         }
 
         writefln("");
 
-        writeln("Sorting keys");
+        // Sort our duplicate digests by size so that we print
+        // the biggest duplicate file offenders first
+
+        writeln("Sorting duplicate digests by size");
 
         ubyte[16][] keys = _duplicate_digests.keys;
         bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; }
         sort!(compare_by_size)(keys);
 
+        // Print the results out the user, in descending order
+        // of file size
+
         writeln("Printing results");
 
         foreach (digest; keys) {
             auto file_info = _file_info_map[digest];
@@ -101,19 +122,17 @@
 
         void bytes_chewed(ulong bytes) {
             _current_byte += bytes;
             double progress = cast(double)_current_byte / cast(double)_total_bytes;
             if (progress - _last_progress > 0.0005) {
-                writef("\rProgress %3.1f%%", 100.0 * progress);
+                writef("\rProgress %.1f%%", 100.0 * progress);
                 std.stdio.stdout.flush();
                 _last_progress = progress;
             }
-
         }
 
         void compute_md5(in string filename) {
-            //writefln("%s", filename);
             auto file = File(filename, "r");
             scope(exit) file.close;
 
             ubyte[16] digest;
 
@@ -122,32 +141,28 @@
             foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
                 bytes_chewed(buffer.length);
                 context.update(buffer);
             }
             context.finish(digest);
-            //writefln("%s: %s", digestToString(digest), filename);
 
             if (FileInfo * file_info = (digest in _file_info_map)) {
-                // duplicate
+                // This is a duplicate digest, append the subsequent name
                 file_info.names ~= filename;
-                assert(file_info.names.length > 1);
 
+                // Record the duplicate as an offender if its size exceeds the threshold
                 if (file_info.size >= SIZE_THRESHOLD) {
                     _duplicate_digests[digest] = true;
                 }
             }
             else {
-                // unseen
+                // We have not seen this digest before
                 _file_info_map[digest] = FileInfo(getSize(filename), filename);
-                //writefln("%s", _file_info_map.length);
             }
         }
     }
 }
 
 int main(string[] args) {
-    foreach (string arg; args[1..$]) {
-        new DuplicateFinder(arg);
-    }
+    new DuplicateFinder(args[1..$]);
 
     return 0;
 }
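Both traversal passes keep a TODO about accumulating errors and printing them once traversal is complete, rather than interleaving them with the progress output. A minimal sketch of one way that could look; the names _errors, record_error and report_errors are hypothetical and not part of this changeset:

import std.stdio;
import std.string;

// Hypothetical deferred-error reporting, as suggested by the TODO comments.
string[] _errors;

void record_error(in string name, Exception ex) {
    // Remember the failure instead of interrupting the progress display
    _errors ~= format("%s: %s", name, ex.msg);
}

void report_errors() {
    // Print everything that was skipped once traversal is complete
    if (_errors.length > 0) {
        writefln("Skipped %s entries:", _errors.length);
        foreach (msg; _errors) {
            writefln("    %s", msg);
        }
    }
}

void main() {
    // Example usage: record a failure during traversal, report at the end
    try {
        throw new Exception("permission denied");
    }
    catch (Exception ex) {
        record_error("/some/file", ex);
    }
    report_errors();
}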