3 module Array = ArrayLabels
4 module List = ListLabels
5 module StrSet= Set.Make(String)
6 module Unix = UnixLabels
11 val create : (unit -> 'a option) -> 'a t
13 val iter : 'a t -> f:('a -> unit) -> unit
15 val concat : ('a t) list -> 'a t
23 [S.from (fun _ -> f ())]
26 List.iter t ~f:(S.iter f)
32 module In_channel : sig
33 val lines : in_channel -> string Stream.t
36 Stream.create (fun () ->
37 match input_line ic with
38 | exception End_of_file ->
45 module Directory_tree : sig
46 val find_files : string -> string Stream.t
49 let dirs = Queue.create () in
50 let files = Queue.create () in
52 Array.iter (Sys.readdir parent) ~f:(fun child ->
53 let path = Filename.concat parent child in
54 let {Unix.st_kind = file_kind; _} = Unix.lstat path in
70 match Queue.is_empty files, Queue.is_empty dirs with
71 | false, _ -> Some (Queue.take files)
74 explore (Queue.take dirs);
82 | Directories of string list
88 let make_input_stream = function
90 In_channel.lines stdin
91 | Directories paths ->
92 let paths = StrSet.elements (StrSet.of_list paths) in
93 Stream.concat (List.map paths ~f:Directory_tree.find_files)
95 let make_output_fun = function
97 fun digest n_paths paths ->
98 printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
99 List.iter (StrSet.elements paths) ~f:(printf " %S\n%!")
101 fun digest _ paths ->
102 let digest = Digest.to_hex digest in
103 let dir = Filename.concat dir (String.sub digest 0 2) in
104 Unix.mkdir dir ~perm:0o700;
105 let oc = open_out (Filename.concat dir digest) in
106 List.iter (StrSet.elements paths) ~f:(fun path ->
107 output_string oc (sprintf "%S\n%!" path)
111 let main input output ignore =
112 let output = make_output_fun output in
113 let input = make_input_stream input in
114 let paths_by_digest = Hashtbl.create 1_000_000 in
115 let path_count = ref 0 in
116 let t0 = Sys.time () in
119 let digest = Digest.file path in
121 match Hashtbl.find_opt paths_by_digest digest with
127 Hashtbl.replace paths_by_digest digest (count + 1, StrSet.add path paths)
129 eprintf "WARNING: Failed to process %S: %S\n%!" path e
131 Stream.iter input ~f:(fun path ->
134 | Some regexp when (Str.string_match regexp path 0) ->
139 Hashtbl.iter (fun d (n, ps) -> if n > 1 then output d n ps) paths_by_digest;
140 let t1 = Sys.time () in
141 eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0)
144 let input = ref Stdin in
145 let output = ref Stdout in
146 let ignore = ref None in
147 let assert_file_exists path =
148 if Sys.file_exists path then
151 eprintf "File does not exist: %S\n%!" path;
155 let assert_file_is_dir path =
156 if Sys.is_directory path then
159 eprintf "File is not a directory: %S\n%!" path;
165 , Arg.String (fun path ->
166 assert_file_exists path;
167 assert_file_is_dir path;
168 output := Directory path
170 , " Output to this directory instead of stdout."
173 , Arg.String (fun regexp -> ignore := Some (Str.regexp regexp))
174 , " Ignore file paths which match this regexp pattern (see Str module)."
181 assert_file_exists path;
182 assert_file_is_dir path;
185 input := Directories [path]
186 | Directories paths ->
187 input := Directories (path :: paths)
190 main !input !output !ignore