comparison doodle/utils/prog/dupes.d @ 119:8343c1dafac6

Make it compile under latest GtkD. Rewrote dupes.d as a single function.

author:   David Bryant <bagnose@gmail.com>
date:     Thu, 21 Apr 2011 18:12:13 +0930
parents:  94233d54e16a
children: c275f26399c6
comparing 118:94233d54e16a with 119:8343c1dafac6
 import std.string;
 import std.exception;
 import std.file;
 import std.md5;

-class DuplicateFinder {
-    this(in string[] dirs) {
-        FileInfo[] _file_array;
-
-        writefln("Accumulating files");
-
-        string last_name;
-
-        foreach (string dir; dirs) {
-            try {
-                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
-                    last_name = name;
-                    try {
-                        if (isFile(name)) {
-                            ulong size = getSize(name);
-                            if (size >= SIZE_THRESHOLD) {
-                                _file_array ~= FileInfo(name, size);
-                            }
-                        }
-                    }
-                    catch (Exception ex) {
-                        writefln("Skipping %s", name);
-                        //writefln("Exception %s", ex);
-                        // TODO accumulate errors and print after traversal
-                    }
-                }
-            }
-            catch (FileException ex) {
-                // ignore
-                writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
-            }
-        }
-
-        writefln("Processing %s files", _file_array.length);
-
-        uint[][ulong] size_to_file_indices;
-        bool[ulong] duplicate_sizes;
-
-        foreach (index, file; _file_array) {
-            //writefln("%s %s %s", index, file.name, file.size);
-
-            if (uint[] * indices = (file.size in size_to_file_indices)) {
-                if (indices.length == 1) {
-                    // Second time we've seen a file of this size,
-                    // record it in the duplicate_sizes array
-                    duplicate_sizes[file.size] = true;
-                }
-
-                (*indices) ~= index;
-            }
-            else {
-                size_to_file_indices[file.size] = [ index ];
-            }
-        }
-
-        writefln("Number of files of duplicate size %s", duplicate_sizes.length);
-
-        foreach (size; duplicate_sizes.keys) {
-            uint[] indices = size_to_file_indices[size];
-            //writefln("For size %s there are %s files", size, indices.length);
-
-            uint[][ubyte[16]] digest_to_indices;
-
-            foreach (index; indices) {
-                FileInfo file_info = _file_array[index];
-
-                try {
-                    ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT);
-
-                    if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
-                        // A true duplicate
-                        // index and index2 are the same
-
-                        (*duplicate_indices) ~= index;
-                    }
-                    else {
-                        digest_to_indices[digest] ~= index;
-                    }
-                }
-                catch (ErrnoException ex) {
-                    //writefln("Skipping: %s", file_info.name);
-                }
-
-                //writefln("\t%s", file_info.name);
-            }
-
-            foreach (indices2; digest_to_indices) {
-                if (indices2.length > 1) {
-                    // List the duplicates
-                    foreach (index; indices) {
-                        FileInfo file_info = _file_array[index];
-                        writefln("%s %s", file_info.size, file_info.name);
-                    }
-                    writefln("");
-                }
-            }
-        }
-
-        writefln("Done\n");
-    }
-
-    struct FileInfo {
-        this(in string name_, in ulong size_) {
-            name = name_;
-            size = size_;
-        }
-
-        string name;
-        ulong size;
-    }
-
+void find_duplicates(in string[] dirs) {
     immutable ulong KILO = 1 << 10;
     immutable ulong MEGA = 1 << 20;

     immutable ulong SIZE_THRESHOLD = 100 * KILO;
     immutable ulong MD5_AMOUNT = 10 * KILO;
 [... 17 unchanged lines omitted ...]

         context.finish(digest);

         return digest;
     }
+
+    struct FileInfo {
+        string name;
+        ulong size;
+    }
+
+    FileInfo[] file_array;
+
+    writefln("Accumulating file list");
+
+    string last_name;
+
+    foreach (string dir; dirs) {
+        try {
+            foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
+                last_name = name;
+                try {
+                    if (!isSymLink(name) && isFile(name)) {
+                        ulong size = getSize(name);
+                        if (size >= SIZE_THRESHOLD) {
+                            file_array ~= FileInfo(name, size);
+                        }
+                    }
+                }
+                catch (Exception ex) {
+                    writefln("Skipping %s", name);
+                    //writefln("Exception %s", ex);
+                    // TODO accumulate errors and print after traversal
+                }
+            }
+        }
+        catch (FileException ex) {
+            // ignore
+            writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
+        }
+    }
+
+    writefln("Processing %s files", file_array.length);
+
+    uint[][ulong] size_to_file_indices;
+    bool[ulong] duplicate_sizes;
+
+    foreach (index, file; file_array) {
+        //writefln("%s %s %s", index, file.name, file.size);
+
+        if (uint[] * indices = (file.size in size_to_file_indices)) {
+            if (indices.length == 1) {
+                // Second time we've seen a file of this size,
+                // record it in the duplicate_sizes array
+                duplicate_sizes[file.size] = true;
+            }
+
+            (*indices) ~= index;
+        }
+        else {
+            size_to_file_indices[file.size] = [ index ];
+        }
+    }
+
+    writefln("Number of files of duplicate size %s", duplicate_sizes.length);
+
+    foreach (size; duplicate_sizes.keys) {
+        uint[] indices = size_to_file_indices[size];
+        //writefln("For size %s there are %s files", size, indices.length);
+
+        uint[][ubyte[16]] digest_to_indices;
+
+        foreach (index; indices) {
+            const FileInfo file_info = file_array[index];
+
+            try {
+                ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT);
+
+                if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
+                    // A true duplicate
+                    // index and index2 are the same
+
+                    (*duplicate_indices) ~= index;
+                }
+                else {
+                    digest_to_indices[digest] ~= index;
+                }
+            }
+            catch (ErrnoException ex) {
+                //writefln("Skipping: %s", file_info.name);
+            }
+
+            //writefln("\t%s", file_info.name);
+        }
+
+        foreach (indices2; digest_to_indices) {
+            if (indices2.length > 1) {
+                // List the duplicates
+                foreach (index; indices) {
+                    FileInfo file_info = file_array[index];
+                    writefln("%s %s", file_info.size, file_info.name);
+                }
+                writefln("");
+            }
+        }
+    }
+
+    writefln("Done");
 }

 int main(string[] args) {
-    new DuplicateFinder(args[1..$]);
+    find_duplicates(args[1..$]);

     return 0;
 }
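
A few notes on the code in this change. Both the old and new traversal loops carry a "TODO accumulate errors and print after traversal" comment. A minimal sketch of one way to honour it, using the same std.file walk as above (the real per-file work is stubbed out with getSize), is to collect the failing paths and report them once the walk has finished:

    import std.file : FileException, SpanMode, dirEntries, getSize, isFile;
    import std.stdio : writefln;

    void main(string[] args) {
        string[] skipped;   // paths that could not be examined

        foreach (dir; args[1..$]) {
            try {
                foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
                    try {
                        if (isFile(name)) {
                            getSize(name);   // stand-in for the real per-file work
                        }
                    }
                    catch (Exception ex) {
                        skipped ~= name;     // remember it rather than printing mid-walk
                    }
                }
            }
            catch (FileException ex) {
                writefln("dirEntries bailed out (%s). Continuing anyway", ex.msg);
            }
        }

        // Report the accumulated failures in one block at the end.
        foreach (name; skipped) {
            writefln("Skipped %s", name);
        }
    }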
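The size-bucketing pass relies on a D associative-array idiom that the rewrite keeps: the in operator yields a pointer to the stored value, or null when the key is absent, so a bucket can be tested and grown with a single lookup. A self-contained illustration, mirroring the names used in dupes.d:

    import std.stdio : writeln;

    void main() {
        uint[][ulong] size_to_file_indices;

        size_to_file_indices[100] = [ 0 ];

        // "key in aa" evaluates to uint[]*: null if the key is missing,
        // otherwise a pointer through which the array can be grown in place.
        if (uint[]* indices = (100 in size_to_file_indices)) {
            (*indices) ~= 1;
        }
        else {
            size_to_file_indices[100] = [ 1 ];
        }

        writeln(size_to_file_indices[100]);   // prints [0, 1]
    }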
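The comparison elides the body of compute_md5, but the visible tail (context.finish(digest)) matches the MD5_CTX API of std.md5, a module later deprecated and removed from Phobos in favour of std.digest.md. A hypothetical modern equivalent of the helper, assuming from its call site that it hashes only the first MD5_AMOUNT bytes of each file, might read:

    import std.digest.md : md5Of;
    import std.stdio : File;

    // Hypothetical stand-in for the elided compute_md5: hash only the
    // first `amount` bytes, as the MD5_AMOUNT call site implies.
    ubyte[16] compute_md5(in string name, in ulong amount) {
        auto file = File(name, "rb");
        auto buffer = new ubyte[cast(size_t) amount];
        return md5Of(file.rawRead(buffer));
    }

    void main(string[] args) {
        import std.digest : toHexString;
        import std.stdio : writeln;

        foreach (name; args[1..$]) {
            writeln(toHexString(compute_md5(name, 10 * 1024)), "  ", name);
        }
    }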
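Finally, a quirk that survives the rewrite unchanged: when a digest group contains duplicates, the printing loop iterates indices (every file in the size bucket) rather than indices2 (the files whose digests actually matched), so a single match lists all same-sized files. If that is unintended, the fix is confined to the loop variable, sketched here as a drop-in replacement for the listing pass:

    import std.stdio : writefln;

    struct FileInfo { string name; ulong size; }

    // Corrected listing pass: print each digest group, not the whole size bucket.
    void list_duplicates(in uint[][ubyte[16]] digest_to_indices,
                         in FileInfo[] file_array) {
        foreach (indices2; digest_to_indices) {
            if (indices2.length > 1) {
                foreach (index; indices2) {   // was: foreach (index; indices)
                    writefln("%s %s", file_array[index].size, file_array[index].name);
                }
                writefln("");
            }
        }
    }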