comparison doodle/utils/prog/duplicates.d @ 113:9cc6c428fdbe

Rewrote duplicates.d based on dirEntries. Removed all idup/dup calls that are no longer needed. Memory usage is still excessive.
author David Bryant <bagnose@gmail.com>
date Thu, 14 Apr 2011 19:10:46 +0930
parents b569d7d5064f
children b87e2e0a046a
comparison
equal deleted inserted replaced
112:b569d7d5064f 113:9cc6c428fdbe
1 import std.stdio; 1 import std.stdio;
2 import std.string; 2 import std.string;
3 import std.exception;
4 import std.algorithm;
3 import std.file; 5 import std.file;
4 import std.c.stdio; 6 import std.c.stdio;
5 import std.c.string; 7 import std.c.string;
6 import std.cstream; 8 import std.cstream;
7 import core.sys.posix.dirent; 9 import core.sys.posix.dirent;
8 import std.md5; 10 import std.md5;
9 11
10 class DuplicateFinder { 12 class DuplicateFinder {
11 this(in string dir) { 13 this(in string dir) {
12 recurse_directory(dir.dup); 14 // First pass to gather the number of files and bytes
13 15
14 writefln("\n"); 16 writeln("Accumulating total bytes / files");
15 17
16 foreach (digest; _duplicate_digests.keys) { 18 uint total_files = 0;
17 writefln("%s", digestToString(digest)); 19
20 try {
21 foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
22 try {
23 if (isFile(name)) {
24 _total_bytes += getSize(name);
25 ++total_files;
26 }
27 }
28 catch (Exception ex) {
29 writefln("Skipping %s", name);
30 //writefln("Exception %s", ex);
31 }
32 }
33 }
34 catch (FileException ex) {
35 // ignore
36 writefln("dirEntries bailed out. Continuing anyway");
37 }
38
39 writefln("Files %s, bytes %s", total_files, _total_bytes);
40 writeln("Accumulating MD5 sums");
41
42 foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
43 if (isFile(name)) {
44 try {
45 //writefln("MD5'ing %s", name);
46 compute_md5(name);
47 }
48 catch (ErrnoException ex) {
49 //writefln("Skipping file: %s, %s", name, ex);
50 //writefln("(errno) Skipping file: %s", name);
51 // TODO accumulate errors and print after traversal is complete
52 }
53 }
54 }
55
56 writefln("");
57
58 writeln("Sorting keys");
59
60 ubyte[16][] keys = _duplicate_digests.keys;
61 bool compare_by_size(const ref ubyte[16] a, const ref ubyte[16] b) { return _file_info_map[a].size > _file_info_map[b].size; }
62 sort!(compare_by_size)(keys);
63
64 writeln("Printing results");
65
66 foreach (digest; keys) {
18 auto file_info = _file_info_map[digest]; 67 auto file_info = _file_info_map[digest];
68 /*
19 writefln("Size %s, Count %s, Digest %s", 69 writefln("Size %s, Count %s, Digest %s",
20 file_info.size, file_info.names.length, digestToString(digest)); 70 file_info.size, file_info.names.length, digestToString(digest));
71 */
72 writefln("Size %s, Count %s", file_info.size, file_info.names.length);
21 foreach (name; file_info.names) { 73 foreach (name; file_info.names) {
22 writefln("\t%s", name); 74 writefln("\t%s", name);
23 } 75 }
24 } 76 }
77
78 writeln("Done");
25 } 79 }
26 80
27 private { 81 private {
28 struct FileInfo { 82 struct FileInfo {
29 this(in ulong size_, string first_name) { 83 this(in ulong size_, string first_name) {
39 static const ulong SIZE_THRESHOLD = 0; 93 static const ulong SIZE_THRESHOLD = 0;
40 94
41 bool[ubyte[16]] _duplicate_digests; // set of all duplicate digests 95 bool[ubyte[16]] _duplicate_digests; // set of all duplicate digests
42 FileInfo[ubyte[16]] _file_info_map; // map of digest to file info 96 FileInfo[ubyte[16]] _file_info_map; // map of digest to file info
43 97
44 void compute_md5(in char[] filename, in ulong filesize) { 98 ulong _total_bytes;
99 ulong _current_byte;
100 double _last_progress = -1.0;
101
102 void bytes_chewed(ulong bytes) {
103 _current_byte += bytes;
104 double progress = cast(double)_current_byte / cast(double)_total_bytes;
105 if (progress - _last_progress > 0.0005) {
106 writef("\rProgress %3.1f%%", 100.0 * progress);
107 std.stdio.stdout.flush();
108 _last_progress = progress;
109 }
110
111 }
112
113 void compute_md5(in string filename) {
45 //writefln("%s", filename); 114 //writefln("%s", filename);
46 auto file = File(filename.idup, "r"); 115 auto file = File(filename, "r");
47 scope(exit) file.close; 116 scope(exit) file.close;
48 117
49 ubyte[16] digest; 118 ubyte[16] digest;
50 119
51 MD5_CTX context; 120 MD5_CTX context;
52 context.start(); 121 context.start();
53 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) { 122 foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
123 bytes_chewed(buffer.length);
54 context.update(buffer); 124 context.update(buffer);
55 } 125 }
56 context.finish(digest); 126 context.finish(digest);
57 writefln("%s: %s", digestToString(digest), filename); 127 //writefln("%s: %s", digestToString(digest), filename);
58 128
59 if (FileInfo * file_info = (digest in _file_info_map)) { 129 if (FileInfo * file_info = (digest in _file_info_map)) {
60 // duplicate 130 // duplicate
61 file_info.names ~= filename.idup; 131 file_info.names ~= filename;
132 assert(file_info.names.length > 1);
62 133
63 if (file_info.size >= SIZE_THRESHOLD) { 134 if (file_info.size >= SIZE_THRESHOLD) {
64 _duplicate_digests[digest] = true; 135 _duplicate_digests[digest] = true;
65 } 136 }
66 } 137 }
67 else { 138 else {
68 // unseen 139 // unseen
69 _duplicate_digests[digest] = true; 140 _file_info_map[digest] = FileInfo(getSize(filename), filename);
70 _file_info_map[digest] = FileInfo(filesize, filename.idup);
71 //writefln("%s", _file_info_map.length); 141 //writefln("%s", _file_info_map.length);
72 }
73 }
74
75 bool entry_callback(DirEntry * de) {
76 //writefln("File: %s", de.name);
77
78 if (de.isdir) {
79 recurse_directory(de.name);
80 }
81 else if (de.isfile) {
82 compute_md5(de.name, de.size);
83 }
84
85 return true;
86 }
87
88 void recurse_directory(in char[] dirname) {
89 //writefln("Dir: %s", dirname);
90
91 try {
92 listdir(dirname, &entry_callback);
93 }
94 catch (FileException ex) {
95 //writefln("Skipping: %s", dirname);
96 } 142 }
97 } 143 }
98 } 144 }
99 } 145 }
100 146