comparison doodle/utils/prog/dupes.d @ 119:8343c1dafac6

Make it compile under latest GtkD. Rewrote dupes.d as a single function
author David Bryant <bagnose@gmail.com>
date Thu, 21 Apr 2011 18:12:13 +0930
parents 94233d54e16a
children c275f26399c6
comparison
equal deleted inserted replaced
118:94233d54e16a 119:8343c1dafac6
2 import std.string; 2 import std.string;
3 import std.exception; 3 import std.exception;
4 import std.file; 4 import std.file;
5 import std.md5; 5 import std.md5;
6 6
7 class DuplicateFinder { 7 void find_duplicates(in string[] dirs) {
8 this(in string[] dirs) {
9 FileInfo[] _file_array;
10
11 writefln("Accumulating files");
12
13 string last_name;
14
15 foreach (string dir; dirs) {
16 try {
17 foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
18 last_name = name;
19 try {
20 if (isFile(name)) {
21 ulong size = getSize(name);
22 if (size >= SIZE_THRESHOLD) {
23 _file_array ~= FileInfo(name, size);
24 }
25 }
26 }
27 catch (Exception ex) {
28 writefln("Skipping %s", name);
29 //writefln("Exception %s", ex);
30 // TODO accumulate errors and print after traversal
31 }
32 }
33 }
34 catch (FileException ex) {
35 // ignore
36 writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
37 }
38 }
39
40 writefln("Processing %s files", _file_array.length);
41
42 uint[][ulong] size_to_file_indices;
43 bool[ulong] duplicate_sizes;
44
45 foreach (index, file; _file_array) {
46 //writefln("%s %s %s", index, file.name, file.size);
47
48 if (uint[] * indices = (file.size in size_to_file_indices)) {
49 if (indices.length == 1) {
50 // Second time we've seen a file of this size,
51 // record it in the duplicate_sizes array
52 duplicate_sizes[file.size] = true;
53 }
54
55 (*indices) ~= index;
56 }
57 else {
58 size_to_file_indices[file.size] = [ index ];
59 }
60 }
61
62 writefln("Number of files of duplicate size %s", duplicate_sizes.length);
63
64 foreach (size; duplicate_sizes.keys) {
65 uint[] indices = size_to_file_indices[size];
66 //writefln("For size %s there are %s files", size, indices.length);
67
68 uint[][ubyte[16]] digest_to_indices;
69
70 foreach (index; indices) {
71 FileInfo file_info = _file_array[index];
72
73 try {
74 ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT);
75
76 if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
77 // A true duplicate
78 // index and index2 are the same
79
80 (*duplicate_indices) ~= index;
81 }
82 else {
83 digest_to_indices[digest] ~= index;
84 }
85 }
86 catch (ErrnoException ex) {
87 //writefln("Skipping: %s", file_info.name);
88 }
89
90 //writefln("\t%s", file_info.name);
91 }
92
93 foreach (indices2; digest_to_indices) {
94 if (indices2.length > 1) {
95 // List the duplicates
96 foreach (index; indices) {
97 FileInfo file_info = _file_array[index];
98 writefln("%s %s", file_info.size, file_info.name);
99 }
100 writefln("");
101 }
102 }
103 }
104
105 writefln("Done\n");
106 }
107
108 struct FileInfo {
109 this(in string name_, in ulong size_) {
110 name = name_;
111 size = size_;
112 }
113
114 string name;
115 ulong size;
116 }
117
118 immutable ulong KILO = 1 << 10; 8 immutable ulong KILO = 1 << 10;
119 immutable ulong MEGA = 1 << 20; 9 immutable ulong MEGA = 1 << 20;
120 10
121 immutable ulong SIZE_THRESHOLD = 100 * KILO; 11 immutable ulong SIZE_THRESHOLD = 100 * KILO;
122 immutable ulong MD5_AMOUNT = 10 * KILO; 12 immutable ulong MD5_AMOUNT = 10 * KILO;
140 30
141 context.finish(digest); 31 context.finish(digest);
142 32
143 return digest; 33 return digest;
144 } 34 }
35
36 struct FileInfo {
37 string name;
38 ulong size;
39 }
40
41 FileInfo[] file_array;
42
43 writefln("Accumulating file list");
44
45 string last_name;
46
47 foreach (string dir; dirs) {
48 try {
49 foreach (string name; dirEntries(dir, SpanMode.depth, false)) {
50 last_name = name;
51 try {
52 if (!isSymLink(name) && isFile(name)) {
53 ulong size = getSize(name);
54 if (size >= SIZE_THRESHOLD) {
55 file_array ~= FileInfo(name, size);
56 }
57 }
58 }
59 catch (Exception ex) {
60 writefln("Skipping %s", name);
61 //writefln("Exception %s", ex);
62 // TODO accumulate errors and print after traversal
63 }
64 }
65 }
66 catch (FileException ex) {
67 // ignore
68 writefln("dirEntries bailed out (%s). Continuing anyway", last_name);
69 }
70 }
71
72 writefln("Processing %s files", file_array.length);
73
74 uint[][ulong] size_to_file_indices;
75 bool[ulong] duplicate_sizes;
76
77 foreach (index, file; file_array) {
78 //writefln("%s %s %s", index, file.name, file.size);
79
80 if (uint[] * indices = (file.size in size_to_file_indices)) {
81 if (indices.length == 1) {
82 // Second time we've seen a file of this size,
83 // record it in the duplicate_sizes array
84 duplicate_sizes[file.size] = true;
85 }
86
87 (*indices) ~= index;
88 }
89 else {
90 size_to_file_indices[file.size] = [ index ];
91 }
92 }
93
94 writefln("Number of files of duplicate size %s", duplicate_sizes.length);
95
96 foreach (size; duplicate_sizes.keys) {
97 uint[] indices = size_to_file_indices[size];
98 //writefln("For size %s there are %s files", size, indices.length);
99
100 uint[][ubyte[16]] digest_to_indices;
101
102 foreach (index; indices) {
103 const FileInfo file_info = file_array[index];
104
105 try {
106 ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT);
107
108 if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
109 // A true duplicate
110 // index and index2 are the same
111
112 (*duplicate_indices) ~= index;
113 }
114 else {
115 digest_to_indices[digest] ~= index;
116 }
117 }
118 catch (ErrnoException ex) {
119 //writefln("Skipping: %s", file_info.name);
120 }
121
122 //writefln("\t%s", file_info.name);
123 }
124
125 foreach (indices2; digest_to_indices) {
126 if (indices2.length > 1) {
127 // List the duplicates
128 foreach (index; indices) {
129 FileInfo file_info = file_array[index];
130 writefln("%s %s", file_info.size, file_info.name);
131 }
132 writefln("");
133 }
134 }
135 }
136
137 writefln("Done");
145 } 138 }
146 139
147 int main(string[] args) { 140 int main(string[] args) {
148 new DuplicateFinder(args[1..$]); 141 find_duplicates(args[1..$]);
149 142
150 return 0; 143 return 0;
151 } 144 }