Mercurial > projects > doodle
annotate doodle/utils/prog/duplicates.d @ 112:b569d7d5064f
Added some utilities that are a work in progress.
author | David Bryant <bagnose@gmail.com> |
---|---|
date | Thu, 14 Apr 2011 11:27:17 +0930 |
parents | |
children | 9cc6c428fdbe |
rev | line source |
---|---|
112
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
1 import std.stdio; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
2 import std.string; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
3 import std.file; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
4 import std.c.stdio; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
5 import std.c.string; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
6 import std.cstream; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
7 import core.sys.posix.dirent; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
8 import std.md5; |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
9 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
/**
 * Walks a directory tree, computes the MD5 digest of every regular file and
 * prints a report of the digests that were seen on more than one file.
 *
 * All of the work happens in the constructor: progress lines
 * ("<digest>: <filename>") are written to stdout while scanning, followed by
 * one report section per duplicated digest.
 */
class DuplicateFinder {
    /**
     * Scans `dir` recursively and prints the duplicate report to stdout.
     *
     * Params:
     *   dir = root directory of the scan
     */
    this(in string dir) {
        // `dir` converts implicitly to `in char[]`; no copy is required.
        recurse_directory(dir);

        // Separates the per-file progress lines from the report below.
        writefln("\n");

        foreach (digest; _duplicate_digests.keys) {
            writefln("%s", digestToString(digest));
            auto file_info = _file_info_map[digest];
            writefln("Size %s, Count %s, Digest %s",
                     file_info.size, file_info.names.length, digestToString(digest));
            foreach (name; file_info.names) {
                writefln("\t%s", name);
            }
        }
    }

    private {
        /// Everything recorded about one distinct digest.
        struct FileInfo {
            this(in ulong size_, string first_name) {
                size = size_;
                names ~= first_name;
            }

            ulong size;        // size reported for the first file seen with this digest
            string[] names;    // every file name seen with this digest, in discovery order
        }

        // Duplicates whose recorded size is below this value are not reported.
        // 0 reports everything; an earlier revision of this line used 1_000.
        static const ulong SIZE_THRESHOLD = 0;

        bool[ubyte[16]] _duplicate_digests;     // set of all duplicate digests
        FileInfo[ubyte[16]] _file_info_map;     // map of digest to file info

        /**
         * Hashes `filename` with MD5 and records it under its digest.
         *
         * The first file seen with a given digest only creates the FileInfo
         * entry. A digest is added to `_duplicate_digests` from the second
         * sighting onwards, and only when the recorded size reaches
         * SIZE_THRESHOLD.
         *
         * A file that cannot be opened or read is reported on stderr and
         * skipped, so that one unreadable file does not abort the whole scan.
         *
         * Params:
         *   filename = path of the file to hash
         *   filesize = size to record if this is the first file with its digest
         */
        void compute_md5(in char[] filename, in ulong filesize) {
            ubyte[16] digest;

            try {
                auto file = File(filename.idup, "r");
                scope(exit) file.close();

                MD5_CTX context;
                context.start();
                // Feed the file through the hash in 4 MiB chunks so that large
                // files are never held in memory in one piece.
                foreach (ubyte[] buffer; chunks(file, 4096 * 1024)) {
                    context.update(buffer);
                }
                context.finish(digest);
            }
            catch (Exception ex) {
                // Fully qualified: std.c.stdio also declares a `stderr`.
                std.stdio.stderr.writefln("Skipping file %s: %s", filename, ex.msg);
                return;
            }

            writefln("%s: %s", digestToString(digest), filename);

            if (FileInfo * file_info = (digest in _file_info_map)) {
                // Seen before: this digest now names at least two files.
                file_info.names ~= filename.idup;

                if (file_info.size >= SIZE_THRESHOLD) {
                    _duplicate_digests[digest] = true;
                }
            }
            else {
                // First sighting: remember it, but it is not a duplicate yet.
                _file_info_map[digest] = FileInfo(filesize, filename.idup);
            }
        }

        /**
         * Per-entry callback handed to listdir: descends into directories and
         * hashes regular files. Entries that are neither are ignored.
         *
         * Returns: always true (presumably "keep iterating" - confirm against
         * the std.file.listdir documentation).
         */
        bool entry_callback(DirEntry * de) {
            if (de.isdir) {
                recurse_directory(de.name);
            }
            else if (de.isfile) {
                compute_md5(de.name, de.size);
            }

            return true;
        }

        /**
         * Passes every entry of `dirname` to entry_callback.
         *
         * A directory that raises FileException is reported on stderr and
         * skipped; the scan carries on with the remaining entries of the
         * caller's directory.
         */
        void recurse_directory(in char[] dirname) {
            try {
                listdir(dirname, &entry_callback);
            }
            catch (FileException ex) {
                // Fully qualified: std.c.stdio also declares a `stderr`.
                std.stdio.stderr.writefln("Skipping directory %s: %s", dirname, ex.msg);
            }
        }
    }
}
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
100 |
b569d7d5064f
Added some utilities that are a work in progress.
David Bryant <bagnose@gmail.com>
parents:
diff
changeset
|
/**
 * Program entry point: runs one independent duplicate scan per command-line
 * argument.
 *
 * Params:
 *   args = args[0] is the program name; each later element is a directory to scan
 *
 * Returns: always 0
 */
int main(string[] args) {
    auto directories = args[1 .. $];

    // Constructing a DuplicateFinder performs the scan and prints its report,
    // so the resulting object is intentionally discarded.
    foreach (string directory; directories) {
        new DuplicateFinder(directory);
    }

    return 0;
}