Commit | Line | Data |
---|---|---|
cce97c27 SK |
1 | open Printf |
2 | ||
948ee900 SK |
3 | module Array = ArrayLabels |
4 | module List = ListLabels | |
5c0100d2 | 5 | module StrSet = Set.Make(String) |
e09dff7f | 6 | module Unix = UnixLabels |
cce97c27 | 7 | |
1013fbcd SK |
(* Metrics: mutable counters recording how much work each stage of the
   de-duplication pipeline performed, plus [report], which pretty-prints
   the totals to stderr at the end of a run. *)
module Metrics : sig
  type t

  val init
    : unit -> t
  val report
    : t
    -> time_all:float
    -> time_group_by_size:float
    -> time_group_by_head:float
    -> time_group_by_digest:float
    -> unit

  val file_considered
    : t -> size:int -> unit
  val file_ignored
    : t -> size:int -> unit
  val file_empty
    : t -> unit
  val file_sampled
    : t -> unit
  val chunk_read
    : t -> size:int -> unit
  val file_unique_size
    : t -> size:int -> unit
  val file_unique_sample
    : t -> size:int -> unit
  val file_hashed
    : t -> size:int -> unit
  val digest
    : t -> unit
  val redundant_data
    : t -> size:int -> unit
end = struct
  (* One ref cell per statistic; every counter is updated in place. *)
  type t =
    { considered_files    : int ref
    ; considered_bytes    : int ref
    ; empty               : int ref
    ; ignored_files       : int ref
    ; ignored_bytes       : int ref
    ; unique_size_files   : int ref
    ; unique_size_bytes   : int ref
    ; unique_sample_files : int ref
    ; unique_sample_bytes : int ref
    ; sampled_files       : int ref
    ; sampled_bytes       : int ref
    ; hashed_files        : int ref
    ; hashed_bytes        : int ref
    ; digests             : int ref
    ; redundant_data      : int ref
    }

  (* All counters start at zero. *)
  let init () =
    { considered_files    = ref 0
    ; considered_bytes    = ref 0
    ; empty               = ref 0
    ; ignored_files       = ref 0
    ; ignored_bytes       = ref 0
    ; unique_size_files   = ref 0
    ; unique_size_bytes   = ref 0
    ; unique_sample_files = ref 0
    ; unique_sample_bytes = ref 0
    ; sampled_files       = ref 0
    ; sampled_bytes       = ref 0
    ; hashed_files        = ref 0
    ; hashed_bytes        = ref 0
    ; digests             = ref 0
    ; redundant_data      = ref 0
    }

  (* Accumulate [addend] into the counter cell [sum]. *)
  let add sum addend =
    sum := !sum + addend

  let file_considered t ~size =
    incr t.considered_files;
    add t.considered_bytes size

  let file_ignored t ~size =
    incr t.ignored_files;
    add t.ignored_bytes size

  let file_empty t =
    incr t.empty

  let chunk_read t ~size =
    add t.sampled_bytes size

  let file_sampled t =
    incr t.sampled_files

  let file_unique_size t ~size =
    incr t.unique_size_files;
    add t.unique_size_bytes size

  let file_unique_sample t ~size =
    incr t.unique_sample_files;
    add t.unique_sample_bytes size

  let file_hashed t ~size =
    incr t.hashed_files;
    add t.hashed_bytes size

  let digest t =
    incr t.digests

  let redundant_data t ~size =
    add t.redundant_data size

  (* Print every counter, converting byte totals to gigabytes. *)
  let report
      t
      ~time_all
      ~time_group_by_size
      ~time_group_by_head
      ~time_group_by_digest
    =
    let b_to_mb b = (float_of_int b) /. 1024. /. 1024. in
    let b_to_gb b = (b_to_mb b) /. 1024. in
    Printf.eprintf "Time : %8.2f seconds\n%!"
      time_all;
    Printf.eprintf "Considered : %8d files %6.2f Gb\n%!"
      !(t.considered_files)
      (b_to_gb !(t.considered_bytes));
    Printf.eprintf "Sampled : %8d files %6.2f Gb\n%!"
      !(t.sampled_files)
      (b_to_gb !(t.sampled_bytes));
    Printf.eprintf "Hashed : %8d files %6.2f Gb %6.2f seconds\n%!"
      !(t.hashed_files)
      (b_to_gb !(t.hashed_bytes))
      time_group_by_digest;
    Printf.eprintf "Digests : %8d\n%!"
      !(t.digests);
    Printf.eprintf "Duplicates (Hashed - Digests): %8d files %6.2f Gb\n%!"
      (!(t.hashed_files) - !(t.digests))
      (b_to_gb !(t.redundant_data));
    Printf.eprintf "Skipped due to 0 size : %8d files\n%!" !(t.empty);
    Printf.eprintf "Skipped due to unique size : %8d files %6.2f Gb %6.2f seconds\n%!"
      !(t.unique_size_files)
      (b_to_gb !(t.unique_size_bytes))
      time_group_by_size;
    Printf.eprintf "Skipped due to unique sample : %8d files %6.2f Gb %6.2f seconds\n%!"
      !(t.unique_sample_files)
      (b_to_gb !(t.unique_sample_bytes))
      time_group_by_head;
    Printf.eprintf "Ignored due to regex match : %8d files %6.2f Gb\n%!"
      !(t.ignored_files)
      (b_to_gb !(t.ignored_bytes))
end
155 | ||
156 | module M = Metrics | |
157 | ||
(* Stream: a lazy-sequence abstraction (backed by the legacy stdlib
   [Stream] module) extended with grouping and a fork-based parallel
   map over worker processes. *)
module Stream : sig
  type 'a t

  val create : (unit -> 'a option) -> 'a t

  val of_queue : 'a Queue.t -> 'a t

  val iter : 'a t -> f:('a -> unit) -> unit

  val bag_map : 'a t -> njobs:int -> f:('a -> 'b) -> ('a * 'b) t
  (** Parallel map with arbitrarily-reordered elements. *)

  val map : 'a t -> f:('a -> 'b) -> 'b t

  val filter : 'a t -> f:('a -> bool) -> 'a t

  val concat : ('a t) list -> 'a t

  val group_by : 'a t -> f:('a -> 'b) -> ('b * int * 'a list) t
end = struct
  module S = Stream  (* the stdlib stream module being wrapped *)

  (* A stream is a list of underlying streams, consumed front-to-back. *)
  type 'a t =
    {mutable streams : ('a S.t) list}

  (* Worker -> master messages of the fork/pipe protocol. *)
  type ('input, 'output) msg_from_vassal =
    | Ready of int                         (* worker [i] wants its first job *)
    | Result of (int * ('input * 'output)) (* worker [i] finished one job *)
    | Exiting of int                       (* worker [i] is shutting down *)

  (* Master -> worker messages: [Job None] means "no more work, quit". *)
  type 'input msg_from_lord =
    | Job of 'input option

  let create f =
    {streams = [S.from (fun _ -> f ())]}

  let of_queue q =
    create (fun () ->
      try Some (Queue.take q) with
      | Queue.Empty -> None
    )

  (* Pull the next element, discarding exhausted underlying streams. *)
  let rec next t =
    match t.streams with
    | [] ->
      None
    | s :: rest ->
      (try Some (S.next s) with
       | Stream.Failure ->
         t.streams <- rest;
         next t)

  let map t ~f =
    create (fun () ->
      match next t with
      | Some x -> Some (f x)
      | None   -> None
    )

  let filter t ~f =
    let rec pull () =
      match next t with
      | None -> None
      | Some x -> if f x then Some x else pull ()
    in
    create pull

  let iter t ~f =
    List.iter t.streams ~f:(S.iter f)

  let concat ts =
    {streams = List.concat (List.map ts ~f:(fun {streams} -> streams))}

  (* Drain the stream into a hash table keyed by [f], then re-emit one
     (key, member-count, members) triple per group. Strict: consumes the
     whole input before producing anything. *)
  let group_by t ~f =
    let tbl = Hashtbl.create 1_000_000 in
    iter t ~f:(fun x ->
      let key = f x in
      let entry =
        match Hashtbl.find_opt tbl key with
        | None          -> (1, [x])
        | Some (n, xs)  -> (succ n, x :: xs)
      in
      Hashtbl.replace tbl key entry
    );
    let groups = Queue.create () in
    Hashtbl.iter
      (fun key (n, xs) -> Queue.add (key, n, xs) groups)
      tbl;
    of_queue groups

  (* Marshal-based message framing over a pipe. *)
  module Ipc : sig
    val send : out_channel -> 'a -> unit
    val recv : in_channel -> 'a
  end = struct
    let send oc msg =
      Marshal.to_channel oc msg [];
      flush oc

    let recv ic =
      Marshal.from_channel ic
    end

  (* Master process loop: hand out jobs on demand, collect results,
     then reap all worker processes before returning the result stream. *)
  let lord t ~njobs ~vassals ~ic ~ocs =
    Printf.eprintf "[debug] [lord] started\n%!";
    let active_vassals = ref njobs in
    let results = Queue.create () in
    let rec dispatch () =
      let msg : ('input, 'output) msg_from_vassal = Ipc.recv ic in
      match msg with
      | Exiting i ->
        close_out ocs.(i);
        decr active_vassals;
        (* Stop only once every worker has said goodbye. *)
        if !active_vassals > 0 then dispatch ()
      | Ready i ->
        Ipc.send ocs.(i) (Job (next t));
        dispatch ()
      | Result (i, result) ->
        Queue.add result results;
        Ipc.send ocs.(i) (Job (next t));
        dispatch ()
    in
    let rec wait = function
      | [] -> ()
      | pids ->
        let pid, _process_status = Unix.wait () in
        (* TODO: handle process_status *)
        wait (List.filter pids ~f:(fun p -> p <> pid))
    in
    dispatch ();
    close_in ic;
    wait vassals;
    of_queue results

  (* Worker process loop: announce readiness, then alternate between
     receiving a job and sending back its result. Never returns. *)
  let vassal i ~f ~vassal_pipe_r ~lord_pipe_w =
    Printf.eprintf "[debug] [vassal %d] started\n%!" i;
    let ic = Unix.in_channel_of_descr vassal_pipe_r in
    let oc = Unix.out_channel_of_descr lord_pipe_w in
    let rec work msg =
      Ipc.send oc msg;
      let reply : 'input msg_from_lord = Ipc.recv ic in
      match reply with
      | Job (Some x) -> work (Result (i, (x, f x)))
      | Job None     -> Ipc.send oc (Exiting i)
    in
    work (Ready i);
    close_in ic;
    close_out oc;
    exit 0

  (* Fork [njobs] workers wired to the master over pipes.
     NOTE(review): children inherit the other workers' pipe fds without
     closing them; the explicit Ready/Exiting protocol (rather than EOF)
     is what makes shutdown work — confirm before restructuring. *)
  let bag_map t ~njobs ~f =
    let lord_pipe_r, lord_pipe_w = Unix.pipe () in
    let vassal_pipes   = Array.init njobs ~f:(fun _ -> Unix.pipe ()) in
    let vassal_pipes_r = Array.map vassal_pipes ~f:(fun (r, _) -> r) in
    let vassal_pipes_w = Array.map vassal_pipes ~f:(fun (_, w) -> w) in
    let vassals = ref [] in
    for i = 0 to njobs - 1 do
      begin match Unix.fork () with
      | 0 ->
        (* Child: become worker [i]; [vassal] never returns. *)
        Unix.close lord_pipe_r;
        vassal i ~f ~lord_pipe_w ~vassal_pipe_r:vassal_pipes_r.(i)
      | pid ->
        vassals := pid :: !vassals
      end
    done;
    Unix.close lord_pipe_w;
    lord
      t
      ~njobs
      ~vassals:!vassals
      ~ic:(Unix.in_channel_of_descr lord_pipe_r)
      ~ocs:(Array.map vassal_pipes_w ~f:Unix.out_channel_of_descr)
end
346 | ||
(* In_channel: expose an input channel as a stream of its lines. *)
module In_channel : sig
  val lines : in_channel -> string Stream.t
end = struct
  (* Each pull reads one line; [None] once the channel is exhausted. *)
  let lines ic =
    Stream.create (fun () ->
      try Some (input_line ic) with
      | End_of_file -> None
    )
end
359 | ||
a9a56d74 SK |
(* File: discovery of candidate files and the cheap filters that discard
   files which cannot possibly have a duplicate. *)
module File : sig
  type t =
    { path : string
    ; size : int
    }

  val find : string -> t Stream.t
  (** Find all files in the directory tree, starting from the given root path *)

  val lookup : string Stream.t -> t Stream.t
  (** Lookup file info for given paths *)

  val filter_out_unique_sizes : t Stream.t -> metrics:M.t -> t Stream.t
  val filter_out_unique_heads : t Stream.t -> len:int -> metrics:M.t -> t Stream.t
end = struct
  type t =
    { path : string
    ; size : int
    }

  (* Stat each incoming path to attach its size. *)
  let lookup paths =
    Stream.map paths ~f:(fun path ->
      let {Unix.st_size = size; _} = Unix.lstat path in
      {path; size}
    )

  (* Breadth-first traversal of the tree rooted at [root]: directories are
     queued for later exploration, regular files are emitted, and special
     files (devices, symlinks, fifos, sockets) are skipped. *)
  let find root =
    let dirs  = Queue.create () in
    let files = Queue.create () in
    let explore parent =
      Array.iter (Sys.readdir parent) ~f:(fun child ->
        let path = Filename.concat parent child in
        let {Unix.st_kind = file_kind; st_size; _} = Unix.lstat path in
        match file_kind with
        | Unix.S_REG ->
          let file = {path; size = st_size} in
          Queue.add file files
        | Unix.S_DIR ->
          Queue.add path dirs
        | Unix.S_CHR
        | Unix.S_BLK
        | Unix.S_LNK
        | Unix.S_FIFO
        | Unix.S_SOCK ->
          ()
      )
    in
    explore root;
    let rec next () =
      match Queue.is_empty files, Queue.is_empty dirs with
      | false, _     -> Some (Queue.take files)
      | true , true  -> None
      | true , false ->
        explore (Queue.take dirs);
        next ()
    in
    Stream.create next

  (* Keep only members of groups with more than one member; singleton
     groups are handed to [handle_singleton] (for metrics accounting). *)
  let filter_out_singletons files ~group ~handle_singleton =
    let q = Queue.create () in
    Stream.iter (Stream.group_by files ~f:group) ~f:(fun g ->
      let (_, n, members) = g in
      if n > 1 then
        List.iter members ~f:(fun m -> Queue.add m q)
      else
        handle_singleton g
    );
    Stream.of_queue q

  (* A file whose size is unique cannot have an identical twin. *)
  let filter_out_unique_sizes files ~metrics =
    filter_out_singletons
      files
      ~group:(fun {size; _} -> size)
      ~handle_singleton:(fun (size, _, _) -> M.file_unique_size metrics ~size)

  (* Read (up to) the first [len] bytes of [path]. If the file is shorter
     than [len], the rest of the buffer keeps its ' ' padding, so files of
     equal content still produce equal samples.
     BUG FIX: the channel is now closed even when a read raises
     (previously [Sys_error] mid-read leaked the file descriptor). *)
  let head path ~len ~metrics =
    let buf = Bytes.make len ' ' in
    let ic = open_in_bin path in
    let rec read pos remaining =
      assert (remaining >= 0);
      if remaining > 0 then begin
        let chunk_size = input ic buf pos remaining in
        M.chunk_read metrics ~size:chunk_size;
        if chunk_size > 0 then (* 0 means EOF *)
          read (pos + chunk_size) (remaining - chunk_size)
      end
    in
    (match read 0 len with
     | () -> close_in ic
     | exception e -> close_in_noerr ic; raise e);
    Bytes.to_string buf

  (* A file whose head sample is unique cannot have an identical twin. *)
  let filter_out_unique_heads files ~len ~metrics =
    filter_out_singletons
      files
      ~group:(fun {path; _} ->
        M.file_sampled metrics;
        head path ~len ~metrics
      )
      ~handle_singleton:(fun (_, _, files) ->
        let {size; _} = List.hd files in (* Guaranteed non-empty *)
        M.file_unique_sample metrics ~size
      )
end
467 | ||
948ee900 | 468 | type input = |
cfcdf90a SK |
469 | | Stdin |
470 | | Directories of string list | |
948ee900 | 471 | |
e09dff7f SK |
472 | type output = |
473 | | Stdout | |
474 | | Directory of string | |
475 | ||
1253df34 SK |
476 | type opt = |
477 | { input : input | |
478 | ; output : output | |
9d01fa28 | 479 | ; ignore : string -> bool |
7a0486be | 480 | ; sample : int |
7c17443d | 481 | ; njobs : int |
1253df34 SK |
482 | } |
483 | ||
1013fbcd | 484 | let make_input_stream input ignore ~metrics = |
a9a56d74 SK |
485 | let input = |
486 | match input with | |
487 | | Stdin -> | |
488 | File.lookup (In_channel.lines stdin) | |
489 | | Directories paths -> | |
490 | let paths = StrSet.elements (StrSet.of_list paths) in | |
491 | Stream.concat (List.map paths ~f:File.find) | |
492 | in | |
493 | Stream.filter input ~f:(fun {File.path; size} -> | |
1013fbcd | 494 | M.file_considered metrics ~size; |
a9a56d74 | 495 | let empty = size = 0 in |
9d01fa28 | 496 | let ignored = ignore path in |
1013fbcd | 497 | if empty then M.file_empty metrics; |
9d01fa28 | 498 | if ignored then M.file_ignored metrics ~size; |
a9a56d74 SK |
499 | (not empty) && (not ignored) |
500 | ) | |
e09dff7f SK |
501 | |
(* Build the function that emits one duplicate group:
   [output digest n_files files]. Stdout mode prints the group inline;
   Directory mode writes one file per digest under a 2-hex-char bucket. *)
let make_output_fun = function
  | Stdout ->
    fun digest n_files files ->
      printf "%s %d\n%!" (Digest.to_hex digest) n_files;
      List.iter files ~f:(fun {File.path; _} ->
        printf " %S\n%!" path
      )
  | Directory dir ->
    fun digest _ files ->
      let digest = Digest.to_hex digest in
      (* Bucket by the first two hex chars to keep directories small. *)
      let dir = Filename.concat dir (String.sub digest 0 2) in
      (* BUG FIX: many digests share a 2-char prefix, so the bucket
         directory usually already exists — EEXIST must not abort. *)
      (match Unix.mkdir dir ~perm:0o700 with
       | () -> ()
       | exception Unix.Unix_error (Unix.EEXIST, _, _) -> ());
      let oc = open_out (Filename.concat dir digest) in
      List.iter files ~f:(fun {File.path; _} ->
        output_string oc (sprintf "%S\n%!" path)
      );
      close_out oc
519 | ||
7c17443d SK |
(* Wall-clock timestamp in seconds, used for per-phase timing. *)
let time =
  Unix.gettimeofday
522 | ||
(* Run the whole pipeline: enumerate candidate files, discard files that
   cannot have a duplicate (unique size, then unique head sample), hash
   the survivors (optionally in parallel), emit groups sharing a digest,
   and report metrics with per-phase timings. *)
let main {input; output; ignore; sample = sample_len; njobs} =
  let t0_all = time () in
  let metrics = M.init () in
  let output = make_output_fun output in
  let input = make_input_stream input ignore ~metrics in
  (* TODO: Make a nice(r) abstraction to re-assemble pieces in the pipeline:
   *
   * from input to files_by_size
   * from files_by_size to files_by_sample
   * from files_by_sample to files_by_digest
   * from files_by_digest to output
   *
   * input |> files_by_size |> files_by_sample |> files_by_digest |> output
   *)

  let files = input in

  let t0_group_by_size = time () in
  eprintf "[debug] filtering-out files with unique size\n%!";
  let files = File.filter_out_unique_sizes files ~metrics in
  let t1_group_by_size = time () in

  let t0_group_by_sample = t1_group_by_size in
  eprintf "[debug] filtering-out files with unique heads\n%!";
  let files = File.filter_out_unique_heads files ~len:sample_len ~metrics in
  let t1_group_by_sample = time () in

  let t0_group_by_digest = t1_group_by_sample in
  eprintf "[debug] hashing\n%!";
  let groups =
    if njobs > 1 then
      (* Hash in worker processes, then regroup (file, digest) pairs by
         digest; hashed-file metrics are recorded as groups are consumed. *)
      let digests =
        Stream.bag_map files ~njobs ~f:(fun {File.path; _} -> Digest.file path)
      in
      Stream.map (Stream.group_by digests ~f:(fun (_, d) -> d)) ~f:(
        fun (digest, n, file_digest_pairs) ->
          let files =
            List.map file_digest_pairs ~f:(fun (file, _) ->
              M.file_hashed metrics ~size:file.File.size;
              file
            )
          in
          (digest, n, files)
      )
    else
      Stream.group_by files ~f:(fun {File.path; size} ->
        M.file_hashed metrics ~size;
        Digest.file path
      )
  in
  let t1_group_by_digest = time () in

  eprintf "[debug] reporting\n%!";
  Stream.iter groups ~f:(fun (d, n, files) ->
    M.digest metrics;
    (* BUG FIX: of [n] identical copies, only [n - 1] are redundant.
       The previous [n * size] overstated redundant bytes and disagreed
       with the report's "Hashed - Digests" duplicate file count. *)
    if n > 1 then
      M.redundant_data metrics ~size:((n - 1) * (List.hd files).File.size);
    output d n files
  );

  let t1_all = time () in

  M.report metrics
    ~time_all:            (t1_all -. t0_all)
    ~time_group_by_size:  (t1_group_by_size -. t0_group_by_size)
    ~time_group_by_head:  (t1_group_by_sample -. t0_group_by_sample)
    ~time_group_by_digest:(t1_group_by_digest -. t0_group_by_digest)
cce97c27 | 590 | |
1253df34 SK |
(* Parse command-line options, validating paths and the sample size.
   Anonymous arguments are directory roots to scan; with none given,
   paths are read from stdin. *)
let get_opt () : opt =
  (* Exit with [msg] on stderr when [test x] fails. *)
  let assert_ test x msg =
    if not (test x) then begin
      eprintf "%s\n%!" msg;
      exit 1
    end
  in
  let assert_file_exists path =
    assert_ Sys.file_exists path (sprintf "File does not exist: %S" path)
  in
  let assert_file_is_dir path =
    assert_ Sys.is_directory path (sprintf "File is not a directory: %S" path)
  in
  let input  = ref Stdin in
  let output = ref Stdout in
  let ignore = ref (fun _ -> false) in
  let sample = ref 512 in
  let njobs  = ref 8 in
  let spec =
    [ ( "-out"
      , Arg.String (fun path ->
          assert_file_exists path;
          assert_file_is_dir path;
          output := Directory path
        )
      , " Output to this directory instead of stdout."
      )
    ; ( "-ignore"
      , Arg.String (fun regexp ->
          let regexp = Str.regexp regexp in
          ignore := fun string -> Str.string_match regexp string 0)
      , " Ignore file paths which match this regexp pattern (see Str module)."
      )
    ; ( "-sample"
      , Arg.Set_int sample
      , (sprintf " Byte size of file samples to use. Default: %d" !sample)
      )
    ; ( "-j"
      , Arg.Set_int njobs
      , (sprintf " Number of parallel jobs. Default: %d" !njobs)
      )
    ]
  in
  Arg.parse
    (Arg.align spec)
    (fun path ->
      assert_file_exists path;
      assert_file_is_dir path;
      match !input with
      | Stdin ->
        input := Directories [path]
      | Directories paths ->
        input := Directories (path :: paths)
    )
    "";
  (* BUG FIX: the guard rejects zero as well as negatives, so the
     message now says "must be positive" rather than "cannot be negative". *)
  assert_
    (fun x -> x > 0)
    !sample
    (sprintf "Sample size must be positive: %d" !sample);
  { input  = !input
  ; output = !output
  ; ignore = !ignore
  ; sample = !sample
  ; njobs  = !njobs
  }
656 | ||
(* Program entry point: parse CLI options, then run the pipeline. *)
let () =
  get_opt () |> main