Mercurial > projects > doodle
comparison doodle/utils/prog/duplicates.d @ 113:9cc6c428fdbe
Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls
that are no longer needed. Memory usage is still excessive.
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Thu, 14 Apr 2011 19:10:46 +0930 |
parents | b569d7d5064f |
children | b87e2e0a046a |
comparison
equal
deleted
inserted
replaced
112:b569d7d5064f | 113:9cc6c428fdbe |
---|---|
1 import std.stdio; | 1 import std.stdio; |
2 import std.string; | 2 import std.string; |
3 import std.exception; | |
4 import std.algorithm; | |
3 import std.file; | 5 import std.file; |
4 import std.c.stdio; | 6 import std.c.stdio; |
5 import std.c.string; | 7 import std.c.string; |
6 import std.cstream; | 8 import std.cstream; |
7 import core.sys.posix.dirent; | 9 import core.sys.posix.dirent; |
8 import std.md5; | 10 import std.md5; |
9 | 11 |
10 class DuplicateFinder { | 12 class DuplicateFinder { |
11 this(in string dir) { | 13 this(in string dir) { |
12 recurse_directory(dir.dup); | 14 // First pass to gather the number of files and bytes |
13 | 15 |
14 writefln("\n"); | 16 writeln("Accumulating total bytes / files"); |
15 | 17 |
16 foreach (digest; _duplicate_digests.keys) { | 18 uint total_files = 0; |
17 writefln("%s", digestToString(digest)); | 19 |
20 try { | |
21 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { | |
22 try { | |
23 if (isFile(name)) { | |
24 _total_bytes += getSize(name); | |
25 ++total_files; | |
26 } | |
27 } | |
28 catch (Exception ex) { | |
29 writefln("Skipping %s", name); | |
30 //writefln("Exception %s", ex); | |
31 } | |
32 } | |
33 } | |
34 catch (FileException ex) { | |
35 // ignore | |
36 writefln("dirEntries bailed out. Continuing anyway"); | |
37 } | |
38 | |
39 writefln("Files %s, bytes %s", total_files, _total_bytes); | |
40 writeln("Accumulating MD5 sums"); | |
41 | |
42 foreach (string name; dirEntries(dir, SpanMode.depth, false)) { | |
43 if (isFile(name)) { | |
44 try { | |
45 //writefln("MD5'ing %s", name); | |
46 compute_md5(name); | |
47 } | |
48 catch (ErrnoException ex) { | |
49 //writefln("Skipping file: %s, %s", name, ex); | |
50 //writefln("(errno) Skipping file: %s", name); | |
51 // TODO accumulate errors and print after traversal is complete | |
52 } | |
53 } | |
54 } | |
55 | |
56 writefln(""); | |
57 | |
58 writeln("Sorting keys"); | |
59 | |
60 ubyte[16][] keys = _duplicate_digests.keys; | |
61 bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; } | |
62 sort!(compare_by_size)(keys); | |
63 | |
64 writeln("Printing results"); | |
65 | |
66 foreach (digest; keys) { | |
18 auto file_info = _file_info_map[digest]; | 67 auto file_info = _file_info_map[digest]; |
68 /* | |
19 writefln("Size %s, Count %s, Digest %s", | 69 writefln("Size %s, Count %s, Digest %s", |
20 file_info.size, file_info.names.length, digestToString(digest)); | 70 file_info.size, file_info.names.length, digestToString(digest)); |
71 */ | |
72 writefln("Size %s, Count %s", file_info.size, file_info.names.length); | |
21 foreach (name; file_info.names) { | 73 foreach (name; file_info.names) { |
22 writefln("\t%s", name); | 74 writefln("\t%s", name); |
23 } | 75 } |
24 } | 76 } |
77 | |
78 writeln("Done"); | |
25 } | 79 } |
26 | 80 |
27 private { | 81 private { |
28 struct FileInfo { | 82 struct FileInfo { |
29 this(in ulong size_, string first_name) { | 83 this(in ulong size_, string first_name) { |
39 static const ulong SIZE_THRESHOLD = 0; | 93 static const ulong SIZE_THRESHOLD = 0; |
40 | 94 |
41 bool[ubyte[16]] _duplicate_digests; // set of all duplicate digests | 95 bool[ubyte[16]] _duplicate_digests; // set of all duplicate digests |
42 FileInfo[ubyte[16]] _file_info_map; // map of digest to file info | 96 FileInfo[ubyte[16]] _file_info_map; // map of digest to file info |
43 | 97 |
44 void compute_md5(in char[] filename, in ulong filesize) { | 98 ulong _total_bytes; |
99 ulong _current_byte; | |
100 double _last_progress = -1.0; | |
101 | |
102 void bytes_chewed(ulong bytes) { | |
103 _current_byte += bytes; | |
104 double progress = cast(double)_current_byte / cast(double)_total_bytes; | |
105 if (progress - _last_progress > 0.0005) { | |
106 writef("\rProgress %3.1f%%", 100.0 * progress); | |
107 std.stdio.stdout.flush(); | |
108 _last_progress = progress; | |
109 } | |
110 | |
111 } | |
112 | |
113 void compute_md5(in string filename) { | |
45 //writefln("%s", filename); | 114 //writefln("%s", filename); |
46 auto file = File(filename.idup, "r"); | 115 auto file = File(filename, "r"); |
47 scope(exit) file.close; | 116 scope(exit) file.close; |
48 | 117 |
49 ubyte[16] digest; | 118 ubyte[16] digest; |
50 | 119 |
51 MD5_CTX context; | 120 MD5_CTX context; |
52 context.start(); | 121 context.start(); |
53 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { | 122 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { |
123 bytes_chewed(buffer.length); | |
54 context.update(buffer); | 124 context.update(buffer); |
55 } | 125 } |
56 context.finish(digest); | 126 context.finish(digest); |
57 writefln("%s: %s", digestToString(digest), filename); | 127 //writefln("%s: %s", digestToString(digest), filename); |
58 | 128 |
59 if (FileInfo * file_info = (digest in _file_info_map)) { | 129 if (FileInfo * file_info = (digest in _file_info_map)) { |
60 // duplicate | 130 // duplicate |
61 file_info.names ~= filename.idup; | 131 file_info.names ~= filename; |
132 assert(file_info.names.length > 1); | |
62 | 133 |
63 if (file_info.size >= SIZE_THRESHOLD) { | 134 if (file_info.size >= SIZE_THRESHOLD) { |
64 _duplicate_digests[digest] = true; | 135 _duplicate_digests[digest] = true; |
65 } | 136 } |
66 } | 137 } |
67 else { | 138 else { |
68 // unseen | 139 // unseen |
69 _duplicate_digests[digest] = true; | 140 _file_info_map[digest] = FileInfo(getSize(filename), filename); |
70 _file_info_map[digest] = FileInfo(filesize, filename.idup); | |
71 //writefln("%s", _file_info_map.length); | 141 //writefln("%s", _file_info_map.length); |
72 } | |
73 } | |
74 | |
75 bool entry_callback(DirEntry * de) { | |
76 //writefln("File: %s", de.name); | |
77 | |
78 if (de.isdir) { | |
79 recurse_directory(de.name); | |
80 } | |
81 else if (de.isfile) { | |
82 compute_md5(de.name, de.size); | |
83 } | |
84 | |
85 return true; | |
86 } | |
87 | |
88 void recurse_directory(in char[] dirname) { | |
89 //writefln("Dir: %s", dirname); | |
90 | |
91 try { | |
92 listdir(dirname, &entry_callback); | |
93 } | |
94 catch (FileException ex) { | |
95 //writefln("Skipping: %s", dirname); | |
96 } | 142 } |
97 } | 143 } |
98 } | 144 } |
99 } | 145 } |
100 | 146 |