- Stream.iter paths ~f:(fun path ->
- incr path_count;
- try
- let digest = Digest.file path in
- let paths =
- match Hashtbl.find_opt paths_by_digest digest with
- | None ->
- StrSet.empty
- | Some paths ->
- paths
- in
- Hashtbl.replace paths_by_digest digest (StrSet.add path paths)
- with Sys_error e ->
- eprintf "WARNING: Failed to process %S: %S\n%!" path e
+ (* Statistics record for this run. Every field is a fresh [int ref]
+ * initialised to 0. Most categories come as a [*_files] / [*_bytes] pair;
+ * [empty] and [digests] are single counters.
+ *
+ * The record is passed to [make_input_stream] below. Where each counter is
+ * incremented is not visible in this hunk.
+ *)
+ let count =
+ { considered_files = ref 0
+ ; considered_bytes = ref 0
+ ; empty = ref 0
+ ; ignored_files = ref 0
+ ; ignored_bytes = ref 0
+ ; unique_size_files = ref 0
+ ; unique_size_bytes = ref 0
+ ; sampled_files = ref 0
+ ; sampled_bytes = ref 0
+ ; hashed_files = ref 0
+ ; hashed_bytes = ref 0
+ ; unique_sample_files = ref 0
+ ; unique_sample_bytes = ref 0
+ ; digests = ref 0
+ }
+ in
+ (* Shadow [output] and [input] with the values built by [make_output_fun]
+ * and [make_input_stream]; from here on those names refer to the results,
+ * not to the earlier bindings.
+ *
+ * NOTE(review): [ignore] is presumably a local binding (an ignore rule)
+ * that shadows [Stdlib.ignore] -- confirm against the enclosing function.
+ *)
+ let output = make_output_fun output in
+ let input = make_input_stream input ignore count in
+ (* One table per grouping stage named in the TODO below (size, sample,
+ * digest). 1_000_000 is only the initial size passed to [Hashtbl.create].
+ *)
+ let files_by_size = Hashtbl.create 1_000_000 in
+ let files_by_sample = Hashtbl.create 1_000_000 in
+ let files_by_digest = Hashtbl.create 1_000_000 in
+ (* [process tbl ~group ~file] records [file] under key [group] in [tbl].
+  *
+  * Each binding of [tbl] is a pair [(n, files)]: [n] goes up by one on every
+  * call for that key, and [files] is a [File.Set.t] to which [file] is added.
+  * A key seen for the first time starts from [(0, File.Set.empty)], so its
+  * first binding is [(1, {file})].
+  *
+  * NOTE(review): [n] is bumped unconditionally. If [File.Set.add] leaves the
+  * set unchanged for an element that is already present (as stdlib sets do),
+  * [n] counts calls rather than distinct members -- confirm that the same
+  * file is never processed twice for one key.
+  *)
+ let process tbl ~group ~file =
+   let entry =
+     match Hashtbl.find_opt tbl group with
+     | Some (seen, members) ->
+       (seen + 1, File.Set.add file members)
+     | None ->
+       (1, File.Set.add file File.Set.empty)
+   in
+   Hashtbl.replace tbl group entry
+ in
+ (* TODO: Make a nice(r) abstraction to re-assemble pieces in the pipeline:
+ *
+ * from input to files_by_size
+ * from files_by_size to files_by_sample
+ * from files_by_sample to files_by_digest
+ * from files_by_digest to output
+ *
+ * input |> files_by_size |> files_by_sample |> files_by_digest |> output
+ *)
+ let t0_group_by_size = Sys.time () in
+ Stream.iter input ~f:(fun ({File.size; _} as file) ->
+ process files_by_size ~group:size ~file