module Array = ArrayLabels module List = ListLabels module Unix = UnixLabels module M = Metrics type t = { path : string ; size : int } let lookup paths = Stream.map paths ~f:(fun path -> let {Unix.st_size = size; _} = Unix.lstat path in {path; size} ) let find root = let dirs = Queue.create () in let files = Queue.create () in let explore parent = Array.iter (Sys.readdir parent) ~f:(fun child -> let path = Filename.concat parent child in let {Unix.st_kind = file_kind; st_size; _} = Unix.lstat path in match file_kind with | Unix.S_REG -> let file = {path; size = st_size} in Queue.add file files | Unix.S_DIR -> Queue.add path dirs | Unix.S_CHR | Unix.S_BLK | Unix.S_LNK | Unix.S_FIFO | Unix.S_SOCK -> () ) in explore root; let rec next () = match Queue.is_empty files, Queue.is_empty dirs with | false, _ -> Some (Queue.take files) | true , true -> None | true , false -> explore (Queue.take dirs); next () in Stream.create next let filter_out_singletons files ~group ~handle_singleton = let q = Queue.create () in Stream.iter (Stream.group_by files ~f:group) ~f:(fun group -> let (_, n, members) = group in if n > 1 then List.iter members ~f:(fun m -> Queue.add m q) else handle_singleton group ); Stream.of_queue q let filter_out_unique_sizes files ~metrics = filter_out_singletons files ~group:(fun {size; _} -> size) ~handle_singleton:(fun (size, _, _) -> M.file_unique_size metrics ~size) let head {path; _} ~len ~metrics = M.file_sampled metrics; let buf = Bytes.make len ' ' in let ic = open_in_bin path in let rec read pos len = assert (len >= 0); if len = 0 then () else begin let chunk_size = input ic buf pos len in M.chunk_read metrics ~size:chunk_size; if chunk_size = 0 then (* EOF *) () else read (pos + chunk_size) (len - chunk_size) end in read 0 len; close_in ic; Bytes.to_string buf let filter_out_unique_heads files ~len ~metrics = filter_out_singletons files ~group:(head ~len ~metrics) ~handle_singleton:(fun (_, _, files) -> let {size; _} = List.hd files in (* Guaranteed non-empty *) M.file_unique_sample metrics ~size )