Mercurial > projects > doodle
comparison doodle/utils/prog/dupes.d @ 118:94233d54e16a
Cleanup of dupes
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Wed, 20 Apr 2011 22:37:16 +0930 |
parents | c566cdbccaeb |
children | 8343c1dafac6 |
comparison
equal
deleted
inserted
replaced
117:c566cdbccaeb | 118:94233d54e16a |
---|---|
1 import std.stdio; | 1 import std.stdio; |
2 import std.string; | 2 import std.string; |
3 import std.exception; | 3 import std.exception; |
4 import std.random; | |
5 import std.algorithm; | |
6 import std.file; | 4 import std.file; |
7 import std.c.stdio; | |
8 import std.c.string; | |
9 import std.cstream; | |
10 import core.sys.posix.dirent; | |
11 import std.md5; | 5 import std.md5; |
12 | |
13 | 6 |
14 class DuplicateFinder { | 7 class DuplicateFinder { |
15 this(in string[] dirs) { | 8 this(in string[] dirs) { |
9 FileInfo[] _file_array; | |
10 | |
16 writefln("Accumulating files"); | 11 writefln("Accumulating files"); |
17 | 12 |
18 string last_name; | 13 string last_name; |
19 | 14 |
20 foreach (string dir; dirs) { | 15 foreach (string dir; dirs) { |
74 | 69 |
75 foreach (index; indices) { | 70 foreach (index; indices) { |
76 FileInfo file_info = _file_array[index]; | 71 FileInfo file_info = _file_array[index]; |
77 | 72 |
78 try { | 73 try { |
79 ubyte[16] digest = compute_md5(file_info.name); | 74 ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT); |
80 | 75 |
81 if (uint[] * duplicate_indices = (digest in digest_to_indices)) { | 76 if (uint[] * duplicate_indices = (digest in digest_to_indices)) { |
82 // A true duplicate | 77 // A true duplicate |
83 // index and index2 are the same | 78 // index and index2 are the same |
84 | 79 |
108 } | 103 } |
109 | 104 |
110 writefln("Done\n"); | 105 writefln("Done\n"); |
111 } | 106 } |
112 | 107 |
113 ubyte[16] compute_md5(in string name) { | 108 struct FileInfo { |
109 this(in string name_, in ulong size_) { | |
110 name = name_; | |
111 size = size_; | |
112 } | |
113 | |
114 string name; | |
115 ulong size; | |
116 } | |
117 | |
118 immutable ulong KILO = 1 << 10; | |
119 immutable ulong MEGA = 1 << 20; | |
120 | |
121 immutable ulong SIZE_THRESHOLD = 100 * KILO; | |
122 immutable ulong MD5_AMOUNT = 10 * KILO; | |
123 | |
124 static ubyte[16] compute_md5(in string name, in ulong max_bytes) { | |
114 ubyte[16] digest; | 125 ubyte[16] digest; |
115 | 126 |
116 auto file = File(name, "r"); | 127 auto file = File(name, "r"); |
117 scope(exit) file.close; | 128 scope(exit) file.close; |
118 | 129 |
119 MD5_CTX context; | 130 MD5_CTX context; |
120 context.start(); | 131 context.start(); |
121 { // Block 1: | 132 ulong byte_count = 0; |
122 // Compute the actual digest | 133 foreach (ubyte[] buffer; chunks(file, 1024)) { |
123 ulong amount = 0; | 134 context.update(buffer); |
124 foreach (ubyte[] buffer; chunks(file, 1024)) { | 135 byte_count += buffer.length; |
125 context.update(buffer); | 136 if (byte_count >= max_bytes) { |
126 //bytes_chewed(buffer.length); | 137 break; |
127 amount += buffer.length; | |
128 if (amount >= MD5_AMOUNT) { | |
129 break; | |
130 } | |
131 } | 138 } |
132 } | 139 } |
140 | |
133 context.finish(digest); | 141 context.finish(digest); |
134 | 142 |
135 return digest; | 143 return digest; |
136 } | |
137 | |
138 private { | |
139 immutable ulong KILO = 1 << 10; | |
140 immutable ulong MEGA = 1 << 20; | |
141 | |
142 immutable ulong SIZE_THRESHOLD = 100 * KILO; | |
143 immutable ulong MD5_AMOUNT = 10 * KILO; | |
144 | |
145 struct FileInfo { | |
146 this(in string name_, in ulong size_) { | |
147 name = name_; | |
148 size = size_; | |
149 } | |
150 | |
151 string name; | |
152 ulong size; | |
153 }; | |
154 | |
155 FileInfo[] _file_array; | |
156 } | 144 } |
157 } | 145 } |
158 | 146 |
159 int main(string[] args) { | 147 int main(string[] args) { |
160 new DuplicateFinder(args[1..$]); | 148 new DuplicateFinder(args[1..$]); |