Mercurial > projects > doodle
comparison doodle/utils/prog/duplicates.d @ 115:d7330cc52622
Added instructions to duplicates.d on the smallest changes
required to trigger/untrigger the memory blowout.
Interestingly the blowout only occurs when compiled with -m32, not -m64.
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Sat, 16 Apr 2011 19:48:33 +0930 |
parents | b87e2e0a046a |
children | 31c27f4f3bbc |
comparison
equal
deleted
inserted
replaced
114:b87e2e0a046a | 115:d7330cc52622 |
---|---|
1 import std.stdio; | 1 import std.stdio; |
2 import std.string; | 2 import std.string; |
3 import std.exception; | 3 import std.exception; |
4 import std.random; | |
4 import std.algorithm; | 5 import std.algorithm; |
5 import std.file; | 6 import std.file; |
6 import std.c.stdio; | 7 import std.c.stdio; |
7 import std.c.string; | 8 import std.c.string; |
8 import std.cstream; | 9 import std.cstream; |
52 | 53 |
53 writeln("Accumulating MD5 digests"); | 54 writeln("Accumulating MD5 digests"); |
54 | 55 |
55 foreach (string dir; dirs) { | 56 foreach (string dir; dirs) { |
56 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { | 57 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { |
57 if (isFile(name)) { | 58 try { |
58 try { | 59 if (isFile(name)) { |
59 //writefln("MD5'ing %s", name); | 60 //writefln("MD5'ing %s", name); |
60 compute_md5(name); | 61 compute_md5(name, getSize(name)); |
61 } | 62 } |
62 catch (ErrnoException ex) { | 63 } |
63 //writefln("Skipping file: %s, %s", name, ex); | 64 catch (FileException ex) { |
64 //writefln("(errno) Skipping file: %s", name); | 65 writefln("Skipping %s", name); |
65 // TODO accumulate errors and print after traversal is complete | 66 } |
66 } | 67 catch (ErrnoException ex) { |
68 //writefln("Skipping file: %s, %s", name, ex); | |
69 //writefln("(errno) Skipping file: %s", name); | |
70 // TODO accumulate errors and print after traversal is complete | |
67 } | 71 } |
68 } | 72 } |
69 } | 73 } |
70 | 74 |
71 writefln(""); | 75 writefln(""); |
81 | 85 |
82 // Print the results out the user, in descending order | 86 // Print the results out the user, in descending order |
83 // of file size | 87 // of file size |
84 | 88 |
85 writeln("Printing results"); | 89 writeln("Printing results"); |
90 | |
91 writefln("Number of duplicate files: %s", _duplicate_digests.length); | |
86 | 92 |
87 foreach (digest; keys) { | 93 foreach (digest; keys) { |
88 auto file_info = _file_info_map[digest]; | 94 auto file_info = _file_info_map[digest]; |
89 /* | 95 /* |
90 writefln("Size %s, Count %s, Digest %s", | 96 writefln("Size %s, Count %s, Digest %s", |
118 | 124 |
119 ulong _total_bytes; | 125 ulong _total_bytes; |
120 ulong _current_byte; | 126 ulong _current_byte; |
121 double _last_progress = -1.0; | 127 double _last_progress = -1.0; |
122 | 128 |
123 void bytes_chewed(ulong bytes) { | 129 void compute_md5(in string filename, in ulong size) { |
124 _current_byte += bytes; | 130 void bytes_chewed(ulong bytes) { |
125 double progress = cast(double)_current_byte / cast(double)_total_bytes; | 131 _current_byte += bytes; |
126 if (progress - _last_progress > 0.0005) { | 132 double progress = cast(double)_current_byte / cast(double)_total_bytes; |
127 writef("\rProgress %.1f%%", 100.0 * progress); | 133 if (progress - _last_progress > 0.0005) { |
128 std.stdio.stdout.flush(); | 134 writef("\rProgress %.1f%%", 100.0 * progress); |
129 _last_progress = progress; | 135 std.stdio.stdout.flush(); |
130 } | 136 _last_progress = progress; |
131 } | 137 } |
132 | 138 } |
133 void compute_md5(in string filename) { | |
134 auto file = File(filename, "r"); | |
135 scope(exit) file.close; | |
136 | 139 |
137 ubyte[16] digest; | 140 ubyte[16] digest; |
138 | 141 |
139 MD5_CTX context; | 142 // If Block 1 and Block 2 are both uncommented then there is a memory explosion. |
140 context.start(); | 143 // However, if either one is commented out then there isn't... |
141 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { | 144 |
142 bytes_chewed(buffer.length); | 145 { |
143 context.update(buffer); | 146 auto file = File(filename, "r"); |
144 } | 147 scope(exit) file.close; |
145 context.finish(digest); | 148 |
146 | 149 MD5_CTX context; |
147 if (FileInfo * file_info = (digest in _file_info_map)) { | 150 context.start(); |
148 // This is a duplicate digest, append the subsequent name | 151 { // Block 1: |
149 file_info.names ~= filename; | 152 // Compute the actual digest |
150 | 153 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { |
151 // Record the duplicate as an offender if its size exceeds the threshold | 154 context.update(buffer); |
152 if (file_info.size >= SIZE_THRESHOLD) { | 155 bytes_chewed(buffer.length); |
153 _duplicate_digests[digest] = true; | 156 } |
154 } | 157 } |
155 } | 158 context.finish(digest); |
156 else { | 159 |
157 // We have not seen this digest before | 160 /+ |
158 _file_info_map[digest] = FileInfo(getSize(filename), filename); | 161 { // Block 1 alternative: |
159 } | 162 // Create a random digest |
163 digest = make_random_digest; | |
164 bytes_chewed(size); | |
165 } | |
166 +/ | |
167 } | |
168 | |
169 { // Block 2: | |
170 // Update the data structures | |
171 if (FileInfo * file_info = (digest in _file_info_map)) { | |
172 // This is a duplicate digest, append the subsequent name | |
173 file_info.names ~= filename; | |
174 | |
175 // Record the duplicate as an offender if its size exceeds the threshold | |
176 if (file_info.size >= SIZE_THRESHOLD) { | |
177 _duplicate_digests[digest] = true; | |
178 } | |
179 } | |
180 else { | |
181 // We have not seen this digest before | |
182 _file_info_map[digest] = FileInfo(size, filename); | |
183 } | |
184 } | |
185 } | |
186 | |
187 ubyte[16] make_random_digest() { | |
188 ubyte[16] digest; | |
189 foreach (ref a; digest) { | |
190 a = cast(ubyte)uniform(0, 256); | |
191 } | |
192 return digest; | |
160 } | 193 } |
161 } | 194 } |
162 } | 195 } |
163 | 196 |
164 int main(string[] args) { | 197 int main(string[] args) { |