X-Git-Url: https://git.xandkar.net/?p=dups.git;a=blobdiff_plain;f=lib%2Ffile.ml;fp=lib%2Ffile.ml;h=df7e60868d420009d32f5747f64ff24339eada1c;hp=0000000000000000000000000000000000000000;hb=ddcbda0046a598d55746850e15d4fa99b3998ce0;hpb=21e1d14c1e23d2c586ebe1480add8e9d87e7ad7a diff --git a/lib/file.ml b/lib/file.ml new file mode 100644 index 0000000..df7e608 --- /dev/null +++ b/lib/file.ml @@ -0,0 +1,95 @@ +module Array = ArrayLabels +module List = ListLabels +module Unix = UnixLabels + +module M = Metrics + +type t = + { path : string + ; size : int + } + +let lookup paths = + Stream.map paths ~f:(fun path -> + let {Unix.st_size = size; _} = Unix.lstat path in + {path; size} + ) + +let find root = + let dirs = Queue.create () in + let files = Queue.create () in + let explore parent = + Array.iter (Sys.readdir parent) ~f:(fun child -> + let path = Filename.concat parent child in + let {Unix.st_kind = file_kind; st_size; _} = Unix.lstat path in + match file_kind with + | Unix.S_REG -> + let file = {path; size = st_size} in + Queue.add file files + | Unix.S_DIR -> + Queue.add path dirs + | Unix.S_CHR + | Unix.S_BLK + | Unix.S_LNK + | Unix.S_FIFO + | Unix.S_SOCK -> + () + ) + in + explore root; + let rec next () = + match Queue.is_empty files, Queue.is_empty dirs with + | false, _ -> Some (Queue.take files) + | true , true -> None + | true , false -> + explore (Queue.take dirs); + next () + in + Stream.create next + +let filter_out_singletons files ~group ~handle_singleton = + let q = Queue.create () in + Stream.iter (Stream.group_by files ~f:group) ~f:(fun group -> + let (_, n, members) = group in + if n > 1 then + List.iter members ~f:(fun m -> Queue.add m q) + else + handle_singleton group + ); + Stream.of_queue q + +let filter_out_unique_sizes files ~metrics = + filter_out_singletons + files + ~group:(fun {size; _} -> size) + ~handle_singleton:(fun (size, _, _) -> M.file_unique_size metrics ~size) + +let head {path; _} ~len ~metrics = + M.file_sampled metrics; + let buf = Bytes.make len ' ' in + let ic = open_in_bin path in + let rec read pos len = + assert (len >= 0); + if len = 0 then + () + else begin + let chunk_size = input ic buf pos len in + M.chunk_read metrics ~size:chunk_size; + if chunk_size = 0 then (* EOF *) + () + else + read (pos + chunk_size) (len - chunk_size) + end + in + read 0 len; + close_in ic; + Bytes.to_string buf + +let filter_out_unique_heads files ~len ~metrics = + filter_out_singletons + files + ~group:(head ~len ~metrics) + ~handle_singleton:(fun (_, _, files) -> + let {size; _} = List.hd files in (* Guaranteed non-empty *) + M.file_unique_sample metrics ~size + )