comparison doodle/utils/prog/dupes.d @ 118:94233d54e16a

Cleanup of dupes
author David Bryant <bagnose@gmail.com>
date Wed, 20 Apr 2011 22:37:16 +0930
parents c566cdbccaeb
children 8343c1dafac6
comparison
legend: equal | deleted | inserted | replaced
117:c566cdbccaeb 118:94233d54e16a
1 import std.stdio; 1 import std.stdio;
2 import std.string; 2 import std.string;
3 import std.exception; 3 import std.exception;
4 import std.random;
5 import std.algorithm;
6 import std.file; 4 import std.file;
7 import std.c.stdio;
8 import std.c.string;
9 import std.cstream;
10 import core.sys.posix.dirent;
11 import std.md5; 5 import std.md5;
12
13 6
14 class DuplicateFinder { 7 class DuplicateFinder {
15 this(in string[] dirs) { 8 this(in string[] dirs) {
9 FileInfo[] _file_array;
10
16 writefln("Accumulating files"); 11 writefln("Accumulating files");
17 12
18 string last_name; 13 string last_name;
19 14
20 foreach (string dir; dirs) { 15 foreach (string dir; dirs) {
74 69
75 foreach (index; indices) { 70 foreach (index; indices) {
76 FileInfo file_info = _file_array[index]; 71 FileInfo file_info = _file_array[index];
77 72
78 try { 73 try {
79 ubyte[16] digest = compute_md5(file_info.name); 74 ubyte[16] digest = compute_md5(file_info.name, MD5_AMOUNT);
80 75
81 if (uint[] * duplicate_indices = (digest in digest_to_indices)) { 76 if (uint[] * duplicate_indices = (digest in digest_to_indices)) {
82 // A true duplicate 77 // A true duplicate
83 // index and index2 are the same 78 // index and index2 are the same
84 79
108 } 103 }
109 104
110 writefln("Done\n"); 105 writefln("Done\n");
111 } 106 }
112 107
113 ubyte[16] compute_md5(in string name) { 108 struct FileInfo {
109 this(in string name_, in ulong size_) {
110 name = name_;
111 size = size_;
112 }
113
114 string name;
115 ulong size;
116 }
117
118 immutable ulong KILO = 1 << 10;
119 immutable ulong MEGA = 1 << 20;
120
121 immutable ulong SIZE_THRESHOLD = 100 * KILO;
122 immutable ulong MD5_AMOUNT = 10 * KILO;
123
124 static ubyte[16] compute_md5(in string name, in ulong max_bytes) {
114 ubyte[16] digest; 125 ubyte[16] digest;
115 126
116 auto file = File(name, "r"); 127 auto file = File(name, "r");
117 scope(exit) file.close; 128 scope(exit) file.close;
118 129
119 MD5_CTX context; 130 MD5_CTX context;
120 context.start(); 131 context.start();
121 { // Block 1: 132 ulong byte_count = 0;
122 // Compute the actual digest 133 foreach (ubyte[] buffer; chunks(file, 1024)) {
123 ulong amount = 0; 134 context.update(buffer);
124 foreach (ubyte[] buffer; chunks(file, 1024)) { 135 byte_count += buffer.length;
125 context.update(buffer); 136 if (byte_count >= max_bytes) {
126 //bytes_chewed(buffer.length); 137 break;
127 amount += buffer.length;
128 if (amount >= MD5_AMOUNT) {
129 break;
130 }
131 } 138 }
132 } 139 }
140
133 context.finish(digest); 141 context.finish(digest);
134 142
135 return digest; 143 return digest;
136 }
137
138 private {
139 immutable ulong KILO = 1 << 10;
140 immutable ulong MEGA = 1 << 20;
141
142 immutable ulong SIZE_THRESHOLD = 100 * KILO;
143 immutable ulong MD5_AMOUNT = 10 * KILO;
144
145 struct FileInfo {
146 this(in string name_, in ulong size_) {
147 name = name_;
148 size = size_;
149 }
150
151 string name;
152 ulong size;
153 };
154
155 FileInfo[] _file_array;
156 } 144 }
157 } 145 }
158 146
159 int main(string[] args) { 147 int main(string[] args) {
160 new DuplicateFinder(args[1..$]); 148 new DuplicateFinder(args[1..$]);