+ Stream.filter input ~f:(fun {File.path; size} ->
+ M.file_considered metrics ~size;
+ let empty = size = 0 in
+ let ignored = ignore path in
+ if empty then M.file_empty metrics;
+ if ignored then M.file_ignored metrics ~size;
+ (not empty) && (not ignored)
+ )
+
+(* [make_output_fun out] builds the reporting function (digest -> n_files ->
+   files -> unit) for one group of files sharing a digest.
+
+   [Stdout]: print "<hex-digest> <count>" followed by each quoted path,
+   flushing after every line.
+
+   [Directory dir]: write the quoted paths to [dir]/<p>/<hex-digest>, where
+   <p> is the first two hex characters of the digest (git-object-style
+   bucketing to keep directories small). *)
+let make_output_fun = function
+  | Stdout ->
+      fun digest n_files files ->
+        printf "%s %d\n%!" (Digest.to_hex digest) n_files;
+        List.iter files ~f:(fun {File.path; _} ->
+          printf " %S\n%!" path
+        )
+  | Directory dir ->
+      fun digest _ files ->
+        let digest = Digest.to_hex digest in
+        let dir = Filename.concat dir (String.sub digest 0 2) in
+        (* Several digests can share the same 2-char prefix, so the bucket
+           directory may already exist; treat EEXIST as success instead of
+           letting Unix_error escape and abort the whole run. *)
+        (try Unix.mkdir dir ~perm:0o700 with
+         | Unix.Unix_error (Unix.EEXIST, _, _) -> ());
+        let oc = open_out (Filename.concat dir digest) in
+        List.iter files ~f:(fun {File.path; _} ->
+          (* "%!" is a no-op in sprintf (there is no channel to flush);
+             kept so the format string's output stays byte-identical. *)
+          output_string oc (sprintf "%S\n%!" path)
+        );
+        close_out oc
+
+(* Wall-clock timestamp in seconds since the Unix epoch, as a float.
+   Point-free alias; same [unit -> float] interface as before. *)
+let time_wall = Unix.gettimeofday
+
+(* Processor (CPU) time consumed by this process, in seconds.
+   Point-free alias; same [unit -> float] interface as before. *)
+let time_proc = Sys.time
+
+let main {input; output; ignore; sample = sample_len; njobs; delim_null} =
+ let wt0_all = time_wall () in
+ let pt0_all = time_proc () in
+ let metrics = M.init () in
+ let output = make_output_fun output in
+ let input = make_input_stream input ignore ~metrics ~delim_null in
+ (* TODO: Make a nice(r) abstraction to re-assemble pieces in the pipeline:
+ *
+ * from input to files_by_size
+ * from files_by_size to files_by_sample
+ * from files_by_sample to files_by_digest
+ * from files_by_digest to output
+ *
+ * input |> files_by_size |> files_by_sample |> files_by_digest |> output
+ *)
+
+ let files = input in
+
+ let wt0_group_by_size = time_wall () in
+ let pt0_group_by_size = time_proc () in
+ eprintf "[debug] filtering-out files with unique size\n%!";
+ let files = File.filter_out_unique_sizes files ~metrics in
+ let pt1_group_by_size = time_proc () in
+ let wt1_group_by_size = time_wall () in
+
+ let wt0_group_by_sample = wt1_group_by_size in
+ let pt0_group_by_sample = pt1_group_by_size in
+ eprintf "[debug] filtering-out files with unique heads\n%!";
+ let files =
+ if njobs > 1 then begin
+ let q = Queue.create () in
+ files
+ |> Stream.bag_map ~njobs ~f:(File.head ~len:sample_len ~metrics)
+ |> Stream.group_by ~f:snd
+ |> Stream.map ~f:(fun (d, n, pairs) -> (d, n, List.map pairs ~f:fst))
+ |> Stream.filter ~f:(fun (_, n, _) -> n > 1)
+ |> Stream.iter ~f:(fun (_, _, fs) -> List.iter fs ~f:(fun f -> Queue.add f q))
+ ;
+ Stream.of_queue q
+ end else
+ File.filter_out_unique_heads files ~len:sample_len ~metrics
+ in
+ let pt1_group_by_sample = time_proc () in
+ let wt1_group_by_sample = time_wall () in
+
+ let wt0_group_by_digest = wt1_group_by_sample in
+ let pt0_group_by_digest = pt1_group_by_sample in
+ eprintf "[debug] hashing\n%!";
+ let groups =
+ if njobs > 1 then
+ let with_digests =
+ Stream.bag_map files ~njobs ~f:(fun {File.path; _} -> Digest.file path)