| 1 | module Array = ArrayLabels |
| 2 | module List = ListLabels |
| 3 | module Unix = UnixLabels |
| 4 | |
| 5 | module M = Metrics |
| 6 | |
| 7 | type t = |
| 8 | { path : string |
| 9 | ; size : int |
| 10 | } |
| 11 | |
| 12 | let lookup paths = |
| 13 | Stream.map paths ~f:(fun path -> |
| 14 | let {Unix.st_size = size; _} = Unix.lstat path in |
| 15 | {path; size} |
| 16 | ) |
| 17 | |
| 18 | let find root = |
| 19 | let dirs = Queue.create () in |
| 20 | let files = Queue.create () in |
| 21 | let explore parent = |
| 22 | Array.iter (Sys.readdir parent) ~f:(fun child -> |
| 23 | let path = Filename.concat parent child in |
| 24 | let {Unix.st_kind = file_kind; st_size; _} = Unix.lstat path in |
| 25 | match file_kind with |
| 26 | | Unix.S_REG -> |
| 27 | let file = {path; size = st_size} in |
| 28 | Queue.add file files |
| 29 | | Unix.S_DIR -> |
| 30 | Queue.add path dirs |
| 31 | | Unix.S_CHR |
| 32 | | Unix.S_BLK |
| 33 | | Unix.S_LNK |
| 34 | | Unix.S_FIFO |
| 35 | | Unix.S_SOCK -> |
| 36 | () |
| 37 | ) |
| 38 | in |
| 39 | explore root; |
| 40 | let rec next () = |
| 41 | match Queue.is_empty files, Queue.is_empty dirs with |
| 42 | | false, _ -> Some (Queue.take files) |
| 43 | | true , true -> None |
| 44 | | true , false -> |
| 45 | explore (Queue.take dirs); |
| 46 | next () |
| 47 | in |
| 48 | Stream.create next |
| 49 | |
| 50 | let filter_out_singletons files ~group ~handle_singleton = |
| 51 | let q = Queue.create () in |
| 52 | Stream.iter (Stream.group_by files ~f:group) ~f:(fun group -> |
| 53 | let (_, n, members) = group in |
| 54 | if n > 1 then |
| 55 | List.iter members ~f:(fun m -> Queue.add m q) |
| 56 | else |
| 57 | handle_singleton group |
| 58 | ); |
| 59 | Stream.of_queue q |
| 60 | |
| 61 | let filter_out_unique_sizes files ~metrics = |
| 62 | filter_out_singletons |
| 63 | files |
| 64 | ~group:(fun {size; _} -> size) |
| 65 | ~handle_singleton:(fun (size, _, _) -> M.file_unique_size metrics ~size) |
| 66 | |
| 67 | let head {path; _} ~len ~metrics = |
| 68 | M.file_sampled metrics; |
| 69 | let buf = Bytes.make len ' ' in |
| 70 | let ic = open_in_bin path in |
| 71 | let rec read pos len = |
| 72 | assert (len >= 0); |
| 73 | if len = 0 then |
| 74 | () |
| 75 | else begin |
| 76 | let chunk_size = input ic buf pos len in |
| 77 | M.chunk_read metrics ~size:chunk_size; |
| 78 | if chunk_size = 0 then (* EOF *) |
| 79 | () |
| 80 | else |
| 81 | read (pos + chunk_size) (len - chunk_size) |
| 82 | end |
| 83 | in |
| 84 | read 0 len; |
| 85 | close_in ic; |
| 86 | Bytes.to_string buf |
| 87 | |
| 88 | let filter_out_unique_heads files ~len ~metrics = |
| 89 | filter_out_singletons |
| 90 | files |
| 91 | ~group:(head ~len ~metrics) |
| 92 | ~handle_singleton:(fun (_, _, files) -> |
| 93 | let {size; _} = List.hd files in (* Guaranteed non-empty *) |
| 94 | M.file_unique_sample metrics ~size |
| 95 | ) |