annotate doodle/utils/prog/duplicates.d @ 115:d7330cc52622

Added instructions to duplicates.d on the smallest changes required to trigger/untrigger the memory blowout. Interestingly the blowout only occurs when compiled with -m32, not -m64.
author David Bryant <bagnose@gmail.com>
date Sat, 16 Apr 2011 19:48:33 +0930
parents b87e2e0a046a
children 31c27f4f3bbc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
1 import std.stdio;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
2 import std.string;
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
3 import std.exception;
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
4 import std.random;
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
5 import std.algorithm;
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
6 import std.file;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
7 import std.c.stdio;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
8 import std.c.string;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
9 import std.cstream;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
10 import core.sys.posix.dirent;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
11 import std.md5;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
12
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
13 // This program recursively processes files in a list
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
14 // of directories, computing an MD5 digest on each file
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
15 // and then informing the user of files with duplicate content.
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
16 // Only duplicate files over a certain size are reported.
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
17
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
18 class DuplicateFinder {
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
19 this(in string[] dirs) {
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
20 // First pass to gather the number of files and bytes
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
21 // so that we are able to convey progress to the user
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
22
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
23 writeln("Accumulating total bytes / files");
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
24
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
25 uint total_files = 0;
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
26
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
27 try {
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
28 foreach (string dir; dirs) {
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
29 foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
30 try {
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
31 if (isFile(name)) {
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
32 _total_bytes += getSize(name);
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
33 ++total_files;
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
34 }
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
35 }
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
36 catch (Exception ex) {
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
37 writefln("Skipping %s", name);
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
38 //writefln("Exception %s", ex);
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
39 // TODO accumulate errors and print after traversal
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
40 }
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
41 }
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
42 }
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
43 }
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
44 catch (FileException ex) {
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
45 // ignore
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
46 writefln("dirEntries bailed out. Continuing anyway");
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
47 }
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
48
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
49 writefln("Files %s, bytes %s", total_files, _total_bytes);
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
50
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
51 // Go through the files again, but this time
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
52 // compute the MD5 digests and build our data structures
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
53
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
54 writeln("Accumulating MD5 digests");
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
55
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
56 foreach (string dir; dirs) {
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
57 foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
58 try {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
59 if (isFile(name)) {
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
60 //writefln("MD5'ing %s", name);
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
61 compute_md5(name, getSize(name));
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
62 }
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
63 }
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
64 catch (FileException ex) {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
65 writefln("Skipping %s", name);
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
66 }
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
67 catch (ErrnoException ex) {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
68 //writefln("Skipping file: %s, %s", name, ex);
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
69 //writefln("(errno) Skipping file: %s", name);
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
70 // TODO accumulate errors and print after traversal is complete
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
71 }
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
72 }
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
73 }
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
74
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
75 writefln("");
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
76
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
77 // Sort our duplicate digests by size so that we print
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
78 // the biggest duplicate file offenders first
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
79
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
80 writeln("Sorting duplicate digests by size");
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
81
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
82 ubyte[16][] keys = _duplicate_digests.keys;
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
83 bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; }
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
84 sort!(compare_by_size)(keys);
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
85
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
86 // Print the results out to the user, in descending order
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
87 // of file size
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
88
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
89 writeln("Printing results");
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
90
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
91 writefln("Number of duplicate files: %s", _duplicate_digests.length);
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
92
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
93 foreach (digest; keys) {
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
94 auto file_info = _file_info_map[digest];
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
95 /*
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
96 writefln("Size %s, Count %s, Digest %s",
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
97 file_info.size, file_info.names.length, digestToString(digest));
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
98 */
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
99 writefln("Size %s, Count %s", file_info.size, file_info.names.length);
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
100 foreach (name; file_info.names) {
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
101 writefln("\t%s", name);
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
102 }
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
103 }
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
104
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
105 writeln("Done");
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
106 }
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
107
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
108 private {
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
109 struct FileInfo {
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
110 this(in ulong size_, string first_name) {
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
111 size = size_;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
112 names ~= first_name;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
113 }
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
114
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
115 ulong size;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
116 string[] names;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
117 };
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
118
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
119 //static const ulong SIZE_THRESHOLD = 1_000;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
120 static const ulong SIZE_THRESHOLD = 0;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
121
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
122 bool[ubyte[16]] _duplicate_digests; // set of all duplicate digests
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
123 FileInfo[ubyte[16]] _file_info_map; // map of digest to file info
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
124
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
125 ulong _total_bytes;
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
126 ulong _current_byte;
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
127 double _last_progress = -1.0;
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
128
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
129 void compute_md5(in string filename, in ulong size) {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
130 void bytes_chewed(ulong bytes) {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
131 _current_byte += bytes;
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
132 double progress = cast(double)_current_byte / cast(double)_total_bytes;
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
133 if (progress - _last_progress > 0.0005) {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
134 writef("\rProgress %.1f%%", 100.0 * progress);
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
135 std.stdio.stdout.flush();
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
136 _last_progress = progress;
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
137 }
113
9cc6c428fdbe Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
David Bryant <bagnose@gmail.com>
parents: 112
diff changeset
138 }
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
139
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
140 ubyte[16] digest;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
141
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
142 // If Block 1 and Block 2 are both uncommented then there is a memory explosion.
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
143 // However, if either one is commented out then there isn't...
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
144
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
145 {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
146 auto file = File(filename, "r");
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
147 scope(exit) file.close;
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
148
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
149 MD5_CTX context;
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
150 context.start();
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
151 { // Block 1:
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
152 // Compute the actual digest
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
153 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
154 context.update(buffer);
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
155 bytes_chewed(buffer.length);
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
156 }
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
157 }
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
158 context.finish(digest);
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
159
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
160 /+
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
161 { // Block 1 alternative:
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
162 // Create a random digest
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
163 digest = make_random_digest;
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
164 bytes_chewed(size);
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
165 }
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
166 +/
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
167 }
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
168
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
169 { // Block 2:
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
170 // Update the data structures
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
171 if (FileInfo * file_info = (digest in _file_info_map)) {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
172 // This is a duplicate digest, append the subsequent name
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
173 file_info.names ~= filename;
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
174
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
175 // Record the duplicate as an offender if its size exceeds the threshold
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
176 if (file_info.size >= SIZE_THRESHOLD) {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
177 _duplicate_digests[digest] = true;
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
178 }
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
179 }
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
180 else {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
181 // We have not seen this digest before
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
182 _file_info_map[digest] = FileInfo(size, filename);
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
183 }
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
184 }
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
185 }
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
186
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
187 ubyte[16] make_random_digest() {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
188 ubyte[16] digest;
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
189 foreach (ref a; digest) {
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
190 a = cast(ubyte)uniform(0, 256);
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
191 }
115
d7330cc52622 Added instructions to duplicates.d on the smallest changes
David Bryant <bagnose@gmail.com>
parents: 114
diff changeset
192 return digest;
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
193 }
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
194 }
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
195 }
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
196
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
197 int main(string[] args) {
114
b87e2e0a046a Cleanup of duplicates.d
David Bryant <bagnose@gmail.com>
parents: 113
diff changeset
198 new DuplicateFinder(args[1..$]);
112
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
199
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
200 return 0;
b569d7d5064f Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff changeset
201 }