Mercurial > projects > doodle
annotate doodle/utils/prog/duplicates.d @ 115:d7330cc52622
Added instructions to duplicates.d on the smallest changes
required to trigger/untrigger the memory blowout.
Interestingly the blowout only occurs when compiled with -m32, not -m64.
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Sat, 16 Apr 2011 19:48:33 +0930 |
parents | b87e2e0a046a |
children | 31c27f4f3bbc |
rev | line source |
---|---|
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
1 import std.stdio; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
2 import std.string; |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
3 import std.exception; |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
4 import std.random; |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
5 import std.algorithm; |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
6 import std.file; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
7 import std.c.stdio; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
8 import std.c.string; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
9 import std.cstream; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
10 import core.sys.posix.dirent; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
11 import std.md5; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
12 |
114 | 13 // This program recursively processes files in a list |
14 // of directories, computing an MD5 digest on each file | |
15 // and then informing the user of files with duplicate content. | |
16 // Only duplicate files over a certain size are reported. | |
17 | |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
18 class DuplicateFinder { |
114 | 19 this(in string[] dirs) { |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
20 // First pass to gather the number of files and bytes |
114 | 21 // so that we are able to convey progress to the user |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
22 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
23 writeln("Accumulating total bytes / files"); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
24 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
25 uint total_files = 0; |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
26 |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
27 try { |
114 | 28 foreach (string dir; dirs) { |
29 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { | |
30 try { | |
31 if (isFile(name)) { | |
32 _total_bytes += getSize(name); | |
33 ++total_files; | |
34 } | |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
35 } |
114 | 36 catch (Exception ex) { |
37 writefln("Skipping %s", name); | |
38 //writefln("Exception %s", ex); | |
39 // TODO accumulate errors and print after traversal | |
40 } | |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
41 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
42 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
43 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
44 catch (FileException ex) { |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
45 // ignore |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
46 writefln("dirEntries bailed out. Continuing anyway"); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
47 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
48 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
49 writefln("Files %s, bytes %s", total_files, _total_bytes); |
114 | 50 |
51 // Go through the files again, but this time | |
52 // compute the MD5 digests and build our data structures | |
53 | |
54 writeln("Accumulating MD5 digests"); | |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
55 |
114 | 56 foreach (string dir; dirs) { |
57 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { | |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
58 try { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
59 if (isFile(name)) { |
114 | 60 //writefln("MD5'ing %s", name); |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
61 compute_md5(name, getSize(name)); |
114 | 62 } |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
63 } |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
64 catch (FileException ex) { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
65 writefln("Skipping %s", name); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
66 } |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
67 catch (ErrnoException ex) { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
68 //writefln("Skipping file: %s, %s", name, ex); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
69 //writefln("(errno) Skipping file: %s", name); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
70 // TODO accumulate errors and print after traversal is complete |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
71 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
72 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
73 } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
74 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
75 writefln(""); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
76 |
114 | 77 // Sort our duplicate digests by size so that we print |
78 // the biggest duplicate file offenders first | |
79 | |
80 writeln("Sorting duplicate digests by size"); | |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
81 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
82 ubyte[16][] keys = _duplicate_digests.keys; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
83 bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; } |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
84 sort!(compare_by_size)(keys); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
85 |
114 | 86 // Print the results out to the user, in descending order |
87 // of file size | |
88 | |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
89 writeln("Printing results"); |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
90 |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
91 writefln("Number of duplicate files: %s", _duplicate_digests.length); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
92 |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
93 foreach (digest; keys) { |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
94 auto file_info = _file_info_map[digest]; |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
95 /* |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
96 writefln("Size %s, Count %s, Digest %s", |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
97 file_info.size, file_info.names.length, digestToString(digest)); |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
98 */ |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
99 writefln("Size %s, Count %s", file_info.size, file_info.names.length); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
100 foreach (name; file_info.names) { |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
101 writefln("\t%s", name); |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
102 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
103 } |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
104 |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
105 writeln("Done"); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
106 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
107 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
108 private { |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
109 struct FileInfo { |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
110 this(in ulong size_, string first_name) { |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
111 size = size_; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
112 names ~= first_name; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
113 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
114 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
115 ulong size; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
116 string[] names; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
117 }; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
118 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
119 //static const ulong SIZE_THRESHOLD = 1_000; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
120 static const ulong SIZE_THRESHOLD = 0; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
121 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
122 bool[ubyte[16]] _duplicate_digests; // set of all duplicate digests |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
123 FileInfo[ubyte[16]] _file_info_map; // map of digest to file info |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
124 |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
125 ulong _total_bytes; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
126 ulong _current_byte; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
127 double _last_progress = -1.0; |
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
128 |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
129 void compute_md5(in string filename, in ulong size) { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
130 void bytes_chewed(ulong bytes) { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
131 _current_byte += bytes; |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
132 double progress = cast(double)_current_byte / cast(double)_total_bytes; |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
133 if (progress - _last_progress > 0.0005) { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
134 writef("\rProgress %.1f%%", 100.0 * progress); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
135 std.stdio.stdout.flush(); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
136 _last_progress = progress; |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
137 } |
113
9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents:
112
diff
changeset
|
138 } |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
139 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
140 ubyte[16] digest; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
141 |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
142 // If Block 1 and Block 2 are both uncommented then there is a memory explosion. |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
143 // However, if either one is commented out then there isn't... |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
144 |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
145 { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
146 auto file = File(filename, "r"); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
147 scope(exit) file.close; |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
148 |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
149 MD5_CTX context; |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
150 context.start(); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
151 { // Block 1: |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
152 // Compute the actual digest |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
153 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
154 context.update(buffer); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
155 bytes_chewed(buffer.length); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
156 } |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
157 } |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
158 context.finish(digest); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
159 |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
160 /+ |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
161 { // Block 1 alternative: |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
162 // Create a random digest |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
163 digest = make_random_digest; |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
164 bytes_chewed(size); |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
165 } |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
166 +/ |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
167 } |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
168 |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
169 { // Block 2: |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
170 // Update the data structures |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
171 if (FileInfo * file_info = (digest in _file_info_map)) { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
172 // This is a duplicate digest, append the subsequent name |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
173 file_info.names ~= filename; |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
174 |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
175 // Record the duplicate as an offender if its size exceeds the threshold |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
176 if (file_info.size >= SIZE_THRESHOLD) { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
177 _duplicate_digests[digest] = true; |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
178 } |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
179 } |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
180 else { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
181 // We have not seen this digest before |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
182 _file_info_map[digest] = FileInfo(size, filename); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
183 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
184 } |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
185 } |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
186 |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
187 ubyte[16] make_random_digest() { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
188 ubyte[16] digest; |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
189 foreach (ref a; digest) { |
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
190 a = cast(ubyte)uniform(0, 256); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
191 } |
115
d7330cc52622
Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents:
114
diff
changeset
|
192 return digest; |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
193 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
194 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
195 } |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
196 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
197 int main(string[] args) { |
114 | 198 new DuplicateFinder(args[1..$]); |
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
199 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
200 return 0; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
201 } |