Commit | Line | Data |
---|---|---|
cce97c27 SK |
1 | open Printf |
2 | ||
948ee900 SK |
3 | module Array = ArrayLabels |
4 | module List = ListLabels | |
5c0100d2 | 5 | module StrSet = Set.Make(String) |
e09dff7f | 6 | module Unix = UnixLabels |
cce97c27 SK |
7 | |
8 | module Stream : sig | |
948ee900 | 9 | type 'a t |
e13e9ef5 SK |
10 | |
11 | val create : (unit -> 'a option) -> 'a t | |
12 | ||
948ee900 | 13 | val iter : 'a t -> f:('a -> unit) -> unit |
8673c3a5 | 14 | |
a9a56d74 SK |
15 | val map : 'a t -> f:('a -> 'b) -> 'b t |
16 | ||
17 | val filter : 'a t -> f:('a -> bool) -> 'a t | |
18 | ||
8673c3a5 | 19 | val concat : ('a t) list -> 'a t |
cce97c27 SK |
20 | end = struct |
21 | module S = Stream | |
22 | ||
948ee900 | 23 | type 'a t = |
a9a56d74 | 24 | {mutable streams : ('a S.t) list} |
948ee900 | 25 | |
e13e9ef5 | 26 | let create f = |
a9a56d74 SK |
27 | {streams = [S.from (fun _ -> f ())]} |
28 | ||
29 | let rec next t = | |
30 | match t.streams with | |
31 | | [] -> | |
32 | None | |
33 | | s :: streams -> | |
34 | (match S.next s with | |
35 | | exception Stream.Failure -> | |
36 | t.streams <- streams; | |
37 | next t | |
38 | | x -> | |
39 | Some x | |
40 | ) | |
41 | ||
42 | let map t ~f = | |
43 | create (fun () -> | |
44 | match next t with | |
45 | | None -> None | |
46 | | Some x -> Some (f x) | |
47 | ) | |
48 | ||
49 | let filter t ~f = | |
50 | let rec filter () = | |
51 | match next t with | |
52 | | None -> | |
53 | None | |
54 | | Some x when f x -> | |
55 | Some x | |
56 | | Some _ -> | |
57 | filter () | |
58 | in | |
59 | create filter | |
e13e9ef5 SK |
60 | |
61 | let iter t ~f = | |
a9a56d74 | 62 | List.iter t.streams ~f:(S.iter f) |
8673c3a5 SK |
63 | |
64 | let concat ts = | |
a9a56d74 | 65 | {streams = List.concat (List.map ts ~f:(fun {streams} -> streams))} |
e13e9ef5 SK |
66 | end |
67 | ||
68 | module In_channel : sig | |
69 | val lines : in_channel -> string Stream.t | |
70 | end = struct | |
71 | let lines ic = | |
72 | Stream.create (fun () -> | |
73 | match input_line ic with | |
74 | | exception End_of_file -> | |
75 | None | |
76 | | line -> | |
77 | Some line | |
78 | ) | |
79 | end | |
80 | ||
a9a56d74 SK |
81 | module File : sig |
82 | type t = | |
83 | { path : string | |
84 | ; size : int | |
85 | } | |
86 | ||
87 | val find : string -> t Stream.t | |
88 | (** Find all files in the directory tree, starting from the given root path *) | |
89 | ||
90 | val lookup : string Stream.t -> t Stream.t | |
91 | (** Lookup file info for given paths *) | |
5c0100d2 SK |
92 | |
93 | module Set : sig include Set.S with type elt := t end | |
e13e9ef5 | 94 | end = struct |
a9a56d74 SK |
95 | type t = |
96 | { path : string | |
97 | ; size : int | |
98 | } | |
99 | ||
5c0100d2 SK |
100 | let compare {path=p1; _} {path=p2; _} = |
101 | Stdlib.compare p1 p2 | |
102 | ||
a9a56d74 SK |
103 | let lookup paths = |
104 | Stream.map paths ~f:(fun path -> | |
105 | let {Unix.st_size = size; _} = Unix.lstat path in | |
106 | {path; size} | |
107 | ) | |
108 | ||
109 | let find root = | |
948ee900 SK |
110 | let dirs = Queue.create () in |
111 | let files = Queue.create () in | |
948ee900 SK |
112 | let explore parent = |
113 | Array.iter (Sys.readdir parent) ~f:(fun child -> | |
114 | let path = Filename.concat parent child in | |
a9a56d74 | 115 | let {Unix.st_kind = file_kind; st_size; _} = Unix.lstat path in |
948ee900 SK |
116 | match file_kind with |
117 | | Unix.S_REG -> | |
a9a56d74 SK |
118 | let file = {path; size = st_size} in |
119 | Queue.add file files | |
948ee900 SK |
120 | | Unix.S_DIR -> |
121 | Queue.add path dirs | |
122 | | Unix.S_CHR | |
123 | | Unix.S_BLK | |
124 | | Unix.S_LNK | |
125 | | Unix.S_FIFO | |
126 | | Unix.S_SOCK -> | |
127 | () | |
128 | ) | |
129 | in | |
1f130f74 | 130 | explore root; |
c66266c6 SK |
131 | let rec next () = |
132 | match Queue.is_empty files, Queue.is_empty dirs with | |
133 | | false, _ -> Some (Queue.take files) | |
134 | | true , true -> None | |
135 | | true , false -> | |
136 | explore (Queue.take dirs); | |
137 | next () | |
138 | in | |
139 | Stream.create next | |
5c0100d2 SK |
140 | |
141 | module Set = Set.Make(struct | |
142 | type elt = t | |
143 | type t = elt | |
144 | let compare = compare | |
145 | end) | |
cce97c27 SK |
146 | end |
147 | ||
948ee900 | 148 | type input = |
cfcdf90a SK |
149 | | Stdin |
150 | | Directories of string list | |
948ee900 | 151 | |
e09dff7f SK |
152 | type output = |
153 | | Stdout | |
154 | | Directory of string | |
155 | ||
1253df34 SK |
156 | type opt = |
157 | { input : input | |
158 | ; output : output | |
159 | ; ignore : Str.regexp option | |
7a0486be | 160 | ; sample : int |
1253df34 SK |
161 | } |
162 | ||
a9a56d74 SK |
163 | type count = |
164 | { considered : int ref | |
165 | ; empty : int ref | |
166 | ; ignored : int ref | |
167 | ; unique_size : int ref | |
7a0486be | 168 | ; unique_sample : int ref |
a9a56d74 SK |
169 | ; hashed : int ref |
170 | } | |
171 | ||
172 | let make_input_stream input ignore count = | |
173 | let input = | |
174 | match input with | |
175 | | Stdin -> | |
176 | File.lookup (In_channel.lines stdin) | |
177 | | Directories paths -> | |
178 | let paths = StrSet.elements (StrSet.of_list paths) in | |
179 | Stream.concat (List.map paths ~f:File.find) | |
180 | in | |
181 | Stream.filter input ~f:(fun {File.path; size} -> | |
182 | incr count.considered; | |
183 | let empty = size = 0 in | |
184 | let ignored = | |
185 | match ignore with | |
186 | | Some regexp when (Str.string_match regexp path 0) -> | |
187 | true | |
188 | | Some _ | None -> | |
189 | false | |
190 | in | |
191 | if empty then incr count.empty; | |
192 | if ignored then incr count.ignored; | |
193 | (not empty) && (not ignored) | |
194 | ) | |
e09dff7f SK |
195 | |
196 | let make_output_fun = function | |
197 | | Stdout -> | |
5c0100d2 SK |
198 | fun digest n_files files -> |
199 | printf "%s %d\n%!" (Digest.to_hex digest) n_files; | |
200 | List.iter (File.Set.elements files) ~f:(fun {File.path; _} -> | |
201 | printf " %S\n%!" path | |
202 | ) | |
e09dff7f | 203 | | Directory dir -> |
5c0100d2 | 204 | fun digest _ files -> |
e09dff7f SK |
205 | let digest = Digest.to_hex digest in |
206 | let dir = Filename.concat dir (String.sub digest 0 2) in | |
207 | Unix.mkdir dir ~perm:0o700; | |
208 | let oc = open_out (Filename.concat dir digest) in | |
5c0100d2 | 209 | List.iter (File.Set.elements files) ~f:(fun {File.path; _} -> |
e09dff7f SK |
210 | output_string oc (sprintf "%S\n%!" path) |
211 | ); | |
212 | close_out oc | |
213 | ||
7a0486be SK |
214 | let sample path ~len = |
215 | let buf = Bytes.make len ' ' in | |
216 | let ic = open_in_bin path in | |
217 | let rec read pos len = | |
218 | assert (len >= 0); | |
219 | if len = 0 then | |
220 | () | |
221 | else begin | |
222 | let chunk_size = input ic buf pos len in | |
223 | if chunk_size = 0 then (* EOF *) | |
224 | () | |
225 | else | |
226 | read (pos + chunk_size) (len - chunk_size) | |
227 | end | |
228 | in | |
229 | read 0 len; | |
230 | close_in ic; | |
231 | Bytes.to_string buf | |
232 | ||
233 | let main {input; output; ignore; sample = sample_len} = | |
a9a56d74 SK |
234 | let t0 = Sys.time () in |
235 | let count = | |
236 | { considered = ref 0 | |
237 | ; empty = ref 0 | |
238 | ; ignored = ref 0 | |
239 | ; unique_size = ref 0 | |
240 | ; hashed = ref 0 | |
7a0486be | 241 | ; unique_sample = ref 0 |
a9a56d74 SK |
242 | } |
243 | in | |
e09dff7f | 244 | let output = make_output_fun output in |
a9a56d74 | 245 | let input = make_input_stream input ignore count in |
5c0100d2 SK |
246 | let files_by_size = Hashtbl.create 1_000_000 in |
247 | let files_by_sample = Hashtbl.create 1_000_000 in | |
248 | let files_by_digest = Hashtbl.create 1_000_000 in | |
249 | let process tbl ~group ~file = | |
250 | let count, files = | |
251 | match Hashtbl.find_opt tbl group with | |
a9a56d74 | 252 | | None -> |
5c0100d2 SK |
253 | (0, File.Set.empty) |
254 | | Some (n, files) -> | |
255 | (n, files) | |
a9a56d74 | 256 | in |
5c0100d2 | 257 | Hashtbl.replace tbl group (count + 1, File.Set.add file files) |
34107832 | 258 | in |
8c54ccb8 SK |
259 | (* TODO: Make a nice(r) abstraction to re-assemble pieces in the pipeline: |
260 | * | |
261 | * from input to files_by_size | |
262 | * from files_by_size to files_by_sample | |
263 | * from files_by_sample to files_by_digest | |
264 | * from files_by_digest to output | |
265 | * | |
266 | * input |> files_by_size |> files_by_sample |> files_by_digest |> output | |
267 | *) | |
5c0100d2 SK |
268 | Stream.iter input ~f:(fun ({File.size; _} as file) -> |
269 | process files_by_size ~group:size ~file | |
cce97c27 | 270 | ); |
a9a56d74 | 271 | Hashtbl.iter |
5c0100d2 | 272 | (fun _ (n, files) -> |
a9a56d74 SK |
273 | (* Skip files with unique sizes *) |
274 | if n > 1 then | |
5c0100d2 SK |
275 | File.Set.iter |
276 | (fun ({File.path; _} as file) -> | |
277 | process | |
278 | files_by_sample | |
279 | ~group:(sample path ~len:sample_len) | |
280 | ~file | |
a9a56d74 | 281 | ) |
5c0100d2 | 282 | files |
a9a56d74 SK |
283 | else |
284 | incr count.unique_size; | |
285 | ) | |
5c0100d2 | 286 | files_by_size; |
7a0486be | 287 | Hashtbl.iter |
5c0100d2 | 288 | (fun _ (n, files) -> |
7a0486be SK |
289 | (* Skip files with unique samples *) |
290 | if n > 1 then | |
5c0100d2 SK |
291 | File.Set.iter |
292 | (fun ({File.path; _} as file) -> | |
7a0486be | 293 | incr count.hashed; |
5c0100d2 | 294 | process files_by_digest ~group:(Digest.file path) ~file |
7a0486be | 295 | ) |
5c0100d2 | 296 | files |
7a0486be SK |
297 | else |
298 | incr count.unique_sample; | |
299 | ) | |
5c0100d2 SK |
300 | files_by_sample; |
301 | Hashtbl.iter | |
302 | (fun d (n, files) -> | |
303 | if n > 1 then | |
304 | output d n files | |
305 | ) | |
306 | files_by_digest; | |
f16e6ff1 | 307 | let t1 = Sys.time () in |
a9a56d74 SK |
308 | eprintf "Time : %f seconds\n%!" (t1 -. t0); |
309 | eprintf "Considered : %d\n%!" !(count.considered); | |
310 | eprintf "Hashed : %d\n%!" !(count.hashed); | |
311 | eprintf "Skipped due to 0 size : %d\n%!" !(count.empty); | |
312 | eprintf "Skipped due to unique size : %d\n%!" !(count.unique_size); | |
7a0486be | 313 | eprintf "Skipped due to unique sample : %d\n%!" !(count.unique_sample); |
a9a56d74 | 314 | eprintf "Ignored due to regex match : %d\n%!" !(count.ignored) |
cce97c27 | 315 | |
1253df34 SK |
316 | let get_opt () : opt = |
317 | let assert_ test x msg = | |
318 | if not (test x) then begin | |
319 | eprintf "%s\n%!" msg; | |
e09dff7f SK |
320 | exit 1 |
321 | end | |
322 | in | |
1253df34 SK |
323 | let assert_file_exists path = |
324 | assert_ Sys.file_exists path (sprintf "File does not exist: %S" path) | |
325 | in | |
e09dff7f | 326 | let assert_file_is_dir path = |
1253df34 | 327 | assert_ Sys.is_directory path (sprintf "File is not a directory: %S" path) |
e09dff7f | 328 | in |
1253df34 SK |
329 | let input = ref Stdin in |
330 | let output = ref Stdout in | |
331 | let ignore = ref None in | |
7a0486be | 332 | let sample = ref 256 in |
e09dff7f SK |
333 | let spec = |
334 | [ ( "-out" | |
335 | , Arg.String (fun path -> | |
336 | assert_file_exists path; | |
337 | assert_file_is_dir path; | |
338 | output := Directory path | |
8673c3a5 | 339 | ) |
e09dff7f SK |
340 | , " Output to this directory instead of stdout." |
341 | ) | |
34107832 SK |
342 | ; ( "-ignore" |
343 | , Arg.String (fun regexp -> ignore := Some (Str.regexp regexp)) | |
344 | , " Ignore file paths which match this regexp pattern (see Str module)." | |
345 | ) | |
7a0486be SK |
346 | ; ( "-sample" |
347 | , Arg.Set_int sample | |
348 | , (sprintf " Byte size of file samples to use. Default: %d" !sample) | |
349 | ) | |
e09dff7f SK |
350 | ] |
351 | in | |
352 | Arg.parse | |
353 | (Arg.align spec) | |
354 | (fun path -> | |
355 | assert_file_exists path; | |
356 | assert_file_is_dir path; | |
357 | match !input with | |
cfcdf90a SK |
358 | | Stdin -> |
359 | input := Directories [path] | |
360 | | Directories paths -> | |
361 | input := Directories (path :: paths) | |
61a05dbb SK |
362 | ) |
363 | ""; | |
7a0486be SK |
364 | assert_ |
365 | (fun x -> x > 0) | |
366 | !sample | |
367 | (sprintf "Sample size cannot be negative: %d" !sample); | |
1253df34 SK |
368 | { input = !input |
369 | ; output = !output | |
370 | ; ignore = !ignore | |
7a0486be | 371 | ; sample = !sample |
1253df34 SK |
372 | } |
373 | ||
374 | let () = | |
375 | main (get_opt ()) |