3 module Array = ArrayLabels
4 module List = ListLabels
5 module StrSet= Set.Make(String)
6 module Unix = UnixLabels
11 val create : (unit -> 'a option) -> 'a t
13 val iter : 'a t -> f:('a -> unit) -> unit
15 val map : 'a t -> f:('a -> 'b) -> 'b t
17 val filter : 'a t -> f:('a -> bool) -> 'a t
19 val concat : ('a t) list -> 'a t
24 {mutable streams : ('a S.t) list}
27 {streams = [S.from (fun _ -> f ())]}
35 | exception Stream.Failure ->
46 | Some x -> Some (f x)
62 List.iter t.streams ~f:(S.iter f)
65 {streams = List.concat (List.map ts ~f:(fun {streams} -> streams))}
68 module In_channel : sig
69 val lines : in_channel -> string Stream.t
72 Stream.create (fun () ->
73 match input_line ic with
74 | exception End_of_file ->
87 val find : string -> t Stream.t
88 (** Find all files in the directory tree, starting from the given root path *)
90 val lookup : string Stream.t -> t Stream.t
91 (** Lookup file info for given paths *)
99 Stream.map paths ~f:(fun path ->
100 let {Unix.st_size = size; _} = Unix.lstat path in
105 let dirs = Queue.create () in
106 let files = Queue.create () in
108 Array.iter (Sys.readdir parent) ~f:(fun child ->
109 let path = Filename.concat parent child in
110 let {Unix.st_kind = file_kind; st_size; _} = Unix.lstat path in
113 let file = {path; size = st_size} in
127 match Queue.is_empty files, Queue.is_empty dirs with
128 | false, _ -> Some (Queue.take files)
129 | true , true -> None
131 explore (Queue.take dirs);
139 | Directories of string list
143 | Directory of string
146 { considered : int ref
149 ; unique_size : int ref
153 let make_input_stream input ignore count =
157 File.lookup (In_channel.lines stdin)
158 | Directories paths ->
159 let paths = StrSet.elements (StrSet.of_list paths) in
160 Stream.concat (List.map paths ~f:File.find)
162 Stream.filter input ~f:(fun {File.path; size} ->
163 incr count.considered;
164 let empty = size = 0 in
167 | Some regexp when (Str.string_match regexp path 0) ->
172 if empty then incr count.empty;
173 if ignored then incr count.ignored;
174 (not empty) && (not ignored)
177 let make_output_fun = function
179 fun digest n_paths paths ->
180 printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
181 List.iter (StrSet.elements paths) ~f:(printf " %S\n%!")
183 fun digest _ paths ->
184 let digest = Digest.to_hex digest in
185 let dir = Filename.concat dir (String.sub digest 0 2) in
186 Unix.mkdir dir ~perm:0o700;
187 let oc = open_out (Filename.concat dir digest) in
188 List.iter (StrSet.elements paths) ~f:(fun path ->
189 output_string oc (sprintf "%S\n%!" path)
193 let main input output ignore =
194 let t0 = Sys.time () in
199 ; unique_size = ref 0
203 let output = make_output_fun output in
204 let input = make_input_stream input ignore count in
205 let paths_by_size = Hashtbl.create 1_000_000 in
206 let paths_by_digest = Hashtbl.create 1_000_000 in
207 let process tbl path ~f =
210 match Hashtbl.find_opt tbl key with
216 Hashtbl.replace tbl key (count + 1, StrSet.add path paths)
218 Stream.iter input ~f:(fun {File.path; size} ->
219 process paths_by_size path ~f:(fun _ -> size)
223 (* Skip files with unique sizes *)
228 process paths_by_digest path ~f:Digest.file
232 incr count.unique_size;
235 Hashtbl.iter (fun d (n, ps) -> if n > 1 then output d n ps) paths_by_digest;
236 let t1 = Sys.time () in
237 eprintf "Time : %f seconds\n%!" (t1 -. t0);
238 eprintf "Considered : %d\n%!" !(count.considered);
239 eprintf "Hashed : %d\n%!" !(count.hashed);
240 eprintf "Skipped due to 0 size : %d\n%!" !(count.empty);
241 eprintf "Skipped due to unique size : %d\n%!" !(count.unique_size);
242 eprintf "Ignored due to regex match : %d\n%!" !(count.ignored)
245 let input = ref Stdin in
246 let output = ref Stdout in
247 let ignore = ref None in
248 let assert_file_exists path =
249 if Sys.file_exists path then
252 eprintf "File does not exist: %S\n%!" path;
256 let assert_file_is_dir path =
257 if Sys.is_directory path then
260 eprintf "File is not a directory: %S\n%!" path;
266 , Arg.String (fun path ->
267 assert_file_exists path;
268 assert_file_is_dir path;
269 output := Directory path
271 , " Output to this directory instead of stdout."
274 , Arg.String (fun regexp -> ignore := Some (Str.regexp regexp))
275 , " Ignore file paths which match this regexp pattern (see Str module)."
282 assert_file_exists path;
283 assert_file_is_dir path;
286 input := Directories [path]
287 | Directories paths ->
288 input := Directories (path :: paths)
291 main !input !output !ignore