--- /dev/null
+module Array = ArrayLabels
+module List = ListLabels
+module Unix = UnixLabels
+
+module M = Metrics
+
+type t =
+ { path : string
+ ; size : int
+ }
+
+let lookup paths =
+ Stream.map paths ~f:(fun path ->
+ let {Unix.st_size = size; _} = Unix.lstat path in
+ {path; size}
+ )
+
+let find root =
+ let dirs = Queue.create () in
+ let files = Queue.create () in
+ let explore parent =
+ Array.iter (Sys.readdir parent) ~f:(fun child ->
+ let path = Filename.concat parent child in
+ let {Unix.st_kind = file_kind; st_size; _} = Unix.lstat path in
+ match file_kind with
+ | Unix.S_REG ->
+ let file = {path; size = st_size} in
+ Queue.add file files
+ | Unix.S_DIR ->
+ Queue.add path dirs
+ | Unix.S_CHR
+ | Unix.S_BLK
+ | Unix.S_LNK
+ | Unix.S_FIFO
+ | Unix.S_SOCK ->
+ ()
+ )
+ in
+ explore root;
+ let rec next () =
+ match Queue.is_empty files, Queue.is_empty dirs with
+ | false, _ -> Some (Queue.take files)
+ | true , true -> None
+ | true , false ->
+ explore (Queue.take dirs);
+ next ()
+ in
+ Stream.create next
+
+let filter_out_singletons files ~group ~handle_singleton =
+ let q = Queue.create () in
+ Stream.iter (Stream.group_by files ~f:group) ~f:(fun group ->
+ let (_, n, members) = group in
+ if n > 1 then
+ List.iter members ~f:(fun m -> Queue.add m q)
+ else
+ handle_singleton group
+ );
+ Stream.of_queue q
+
+let filter_out_unique_sizes files ~metrics =
+ filter_out_singletons
+ files
+ ~group:(fun {size; _} -> size)
+ ~handle_singleton:(fun (size, _, _) -> M.file_unique_size metrics ~size)
+
+let head {path; _} ~len ~metrics =
+ M.file_sampled metrics;
+ let buf = Bytes.make len ' ' in
+ let ic = open_in_bin path in
+ let rec read pos len =
+ assert (len >= 0);
+ if len = 0 then
+ ()
+ else begin
+ let chunk_size = input ic buf pos len in
+ M.chunk_read metrics ~size:chunk_size;
+ if chunk_size = 0 then (* EOF *)
+ ()
+ else
+ read (pos + chunk_size) (len - chunk_size)
+ end
+ in
+ read 0 len;
+ close_in ic;
+ Bytes.to_string buf
+
+let filter_out_unique_heads files ~len ~metrics =
+ filter_out_singletons
+ files
+ ~group:(head ~len ~metrics)
+ ~handle_singleton:(fun (_, _, files) ->
+ let {size; _} = List.hd files in (* Guaranteed non-empty *)
+ M.file_unique_sample metrics ~size
+ )