comparison doodle/utils/prog/duplicates.d @ 115:d7330cc52622

Added instructions to duplicates.d on the smallest changes required to trigger/untrigger the memory blowout. Interestingly the blowout only occurs when compiled with -m32, not -m64.
author David Bryant <bagnose@gmail.com>
date Sat, 16 Apr 2011 19:48:33 +0930
parents b87e2e0a046a
children 31c27f4f3bbc
comparison
equal deleted inserted replaced
114:b87e2e0a046a 115:d7330cc52622
1 import std.stdio; 1 import std.stdio;
2 import std.string; 2 import std.string;
3 import std.exception; 3 import std.exception;
4 import std.random;
4 import std.algorithm; 5 import std.algorithm;
5 import std.file; 6 import std.file;
6 import std.c.stdio; 7 import std.c.stdio;
7 import std.c.string; 8 import std.c.string;
8 import std.cstream; 9 import std.cstream;
52 53
53 writeln("Accumulating MD5 digests"); 54 writeln("Accumulating MD5 digests");
54 55
55 foreach (string dir; dirs) { 56 foreach (string dir; dirs) {
56 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { 57 foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
57 if (isFile(name)) { 58 try {
58 try { 59 if (isFile(name)) {
59 //writefln("MD5'ing %s", name); 60 //writefln("MD5'ing %s", name);
60 compute_md5(name); 61 compute_md5(name, getSize(name));
61 } 62 }
62 catch (ErrnoException ex) { 63 }
63 //writefln("Skipping file: %s, %s", name, ex); 64 catch (FileException ex) {
64 //writefln("(errno) Skipping file: %s", name); 65 writefln("Skipping %s", name);
65 // TODO accumulate errors and print after traversal is complete 66 }
66 } 67 catch (ErrnoException ex) {
68 //writefln("Skipping file: %s, %s", name, ex);
69 //writefln("(errno) Skipping file: %s", name);
70 // TODO accumulate errors and print after traversal is complete
67 } 71 }
68 } 72 }
69 } 73 }
70 74
71 writefln(""); 75 writefln("");
81 85
82 // Print the results out to the user, in descending order 86 // Print the results out to the user, in descending order
83 // of file size 87 // of file size
84 88
85 writeln("Printing results"); 89 writeln("Printing results");
90
91 writefln("Number of duplicate files: %s", _duplicate_digests.length);
86 92
87 foreach (digest; keys) { 93 foreach (digest; keys) {
88 auto file_info = _file_info_map[digest]; 94 auto file_info = _file_info_map[digest];
89 /* 95 /*
90 writefln("Size %s, Count %s, Digest %s", 96 writefln("Size %s, Count %s, Digest %s",
118 124
119 ulong _total_bytes; 125 ulong _total_bytes;
120 ulong _current_byte; 126 ulong _current_byte;
121 double _last_progress = -1.0; 127 double _last_progress = -1.0;
122 128
123 void bytes_chewed(ulong bytes) { 129 void compute_md5(in string filename, in ulong size) {
124 _current_byte += bytes; 130 void bytes_chewed(ulong bytes) {
125 double progress = cast(double)_current_byte / cast(double)_total_bytes; 131 _current_byte += bytes;
126 if (progress - _last_progress > 0.0005) { 132 double progress = cast(double)_current_byte / cast(double)_total_bytes;
127 writef("\rProgress %.1f%%", 100.0 * progress); 133 if (progress - _last_progress > 0.0005) {
128 std.stdio.stdout.flush(); 134 writef("\rProgress %.1f%%", 100.0 * progress);
129 _last_progress = progress; 135 std.stdio.stdout.flush();
130 } 136 _last_progress = progress;
131 } 137 }
132 138 }
133 void compute_md5(in string filename) {
134 auto file = File(filename, "r");
135 scope(exit) file.close;
136 139
137 ubyte[16] digest; 140 ubyte[16] digest;
138 141
139 MD5_CTX context; 142 // If Block 1 and Block 2 are both uncommented then there is a memory explosion.
140 context.start(); 143 // However, if either one is commented out then there isn't...
141 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { 144
142 bytes_chewed(buffer.length); 145 {
143 context.update(buffer); 146 auto file = File(filename, "r");
144 } 147 scope(exit) file.close;
145 context.finish(digest); 148
146 149 MD5_CTX context;
147 if (FileInfo * file_info = (digest in _file_info_map)) { 150 context.start();
148 // This is a duplicate digest, append the subsequent name 151 { // Block 1:
149 file_info.names ~= filename; 152 // Compute the actual digest
150 153 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
151 // Record the duplicate as an offender if its size exceeds the threshold 154 context.update(buffer);
152 if (file_info.size >= SIZE_THRESHOLD) { 155 bytes_chewed(buffer.length);
153 _duplicate_digests[digest] = true; 156 }
154 } 157 }
155 } 158 context.finish(digest);
156 else { 159
157 // We have not seen this digest before 160 /+
158 _file_info_map[digest] = FileInfo(getSize(filename), filename); 161 { // Block 1 alternative:
159 } 162 // Create a random digest
163 digest = make_random_digest;
164 bytes_chewed(size);
165 }
166 +/
167 }
168
169 { // Block 2:
170 // Update the data structures
171 if (FileInfo * file_info = (digest in _file_info_map)) {
172 // This is a duplicate digest, append the subsequent name
173 file_info.names ~= filename;
174
175 // Record the duplicate as an offender if its size exceeds the threshold
176 if (file_info.size >= SIZE_THRESHOLD) {
177 _duplicate_digests[digest] = true;
178 }
179 }
180 else {
181 // We have not seen this digest before
182 _file_info_map[digest] = FileInfo(size, filename);
183 }
184 }
185 }
186
187 ubyte[16] make_random_digest() {
188 ubyte[16] digest;
189 foreach (ref a; digest) {
190 a = cast(ubyte)uniform(0, 256);
191 }
192 return digest;
160 } 193 }
161 } 194 }
162 } 195 }
163 196
164 int main(string[] args) { 197 int main(string[] args) {