Refactor CLI options gathering
[dups.git] / dups.ml
CommitLineData
cce97c27
SK
1open Printf
2
948ee900
SK
3module Array = ArrayLabels
4module List = ListLabels
8673c3a5 5module StrSet= Set.Make(String)
e09dff7f 6module Unix = UnixLabels
cce97c27
SK
7
8module Stream : sig
948ee900 9 type 'a t
e13e9ef5
SK
10
11 val create : (unit -> 'a option) -> 'a t
12
948ee900 13 val iter : 'a t -> f:('a -> unit) -> unit
8673c3a5 14
a9a56d74
SK
15 val map : 'a t -> f:('a -> 'b) -> 'b t
16
17 val filter : 'a t -> f:('a -> bool) -> 'a t
18
8673c3a5 19 val concat : ('a t) list -> 'a t
cce97c27
SK
20end = struct
21 module S = Stream
22
948ee900 23 type 'a t =
a9a56d74 24 {mutable streams : ('a S.t) list}
948ee900 25
e13e9ef5 26 let create f =
a9a56d74
SK
27 {streams = [S.from (fun _ -> f ())]}
28
29 let rec next t =
30 match t.streams with
31 | [] ->
32 None
33 | s :: streams ->
34 (match S.next s with
35 | exception Stream.Failure ->
36 t.streams <- streams;
37 next t
38 | x ->
39 Some x
40 )
41
42 let map t ~f =
43 create (fun () ->
44 match next t with
45 | None -> None
46 | Some x -> Some (f x)
47 )
48
49 let filter t ~f =
50 let rec filter () =
51 match next t with
52 | None ->
53 None
54 | Some x when f x ->
55 Some x
56 | Some _ ->
57 filter ()
58 in
59 create filter
e13e9ef5
SK
60
61 let iter t ~f =
a9a56d74 62 List.iter t.streams ~f:(S.iter f)
8673c3a5
SK
63
64 let concat ts =
a9a56d74 65 {streams = List.concat (List.map ts ~f:(fun {streams} -> streams))}
e13e9ef5
SK
66end
67
68module In_channel : sig
69 val lines : in_channel -> string Stream.t
70end = struct
71 let lines ic =
72 Stream.create (fun () ->
73 match input_line ic with
74 | exception End_of_file ->
75 None
76 | line ->
77 Some line
78 )
79end
80
a9a56d74
SK
81module File : sig
82 type t =
83 { path : string
84 ; size : int
85 }
86
87 val find : string -> t Stream.t
88 (** Find all files in the directory tree, starting from the given root path *)
89
90 val lookup : string Stream.t -> t Stream.t
91 (** Lookup file info for given paths *)
e13e9ef5 92end = struct
a9a56d74
SK
93 type t =
94 { path : string
95 ; size : int
96 }
97
98 let lookup paths =
99 Stream.map paths ~f:(fun path ->
100 let {Unix.st_size = size; _} = Unix.lstat path in
101 {path; size}
102 )
103
104 let find root =
948ee900
SK
105 let dirs = Queue.create () in
106 let files = Queue.create () in
948ee900
SK
107 let explore parent =
108 Array.iter (Sys.readdir parent) ~f:(fun child ->
109 let path = Filename.concat parent child in
a9a56d74 110 let {Unix.st_kind = file_kind; st_size; _} = Unix.lstat path in
948ee900
SK
111 match file_kind with
112 | Unix.S_REG ->
a9a56d74
SK
113 let file = {path; size = st_size} in
114 Queue.add file files
948ee900
SK
115 | Unix.S_DIR ->
116 Queue.add path dirs
117 | Unix.S_CHR
118 | Unix.S_BLK
119 | Unix.S_LNK
120 | Unix.S_FIFO
121 | Unix.S_SOCK ->
122 ()
123 )
124 in
1f130f74 125 explore root;
c66266c6
SK
126 let rec next () =
127 match Queue.is_empty files, Queue.is_empty dirs with
128 | false, _ -> Some (Queue.take files)
129 | true , true -> None
130 | true , false ->
131 explore (Queue.take dirs);
132 next ()
133 in
134 Stream.create next
cce97c27
SK
135end
136
948ee900 137type input =
cfcdf90a
SK
138 | Stdin
139 | Directories of string list
948ee900 140
e09dff7f
SK
141type output =
142 | Stdout
143 | Directory of string
144
1253df34
SK
145type opt =
146 { input : input
147 ; output : output
148 ; ignore : Str.regexp option
149 }
150
a9a56d74
SK
151type count =
152 { considered : int ref
153 ; empty : int ref
154 ; ignored : int ref
155 ; unique_size : int ref
156 ; hashed : int ref
157 }
158
159let make_input_stream input ignore count =
160 let input =
161 match input with
162 | Stdin ->
163 File.lookup (In_channel.lines stdin)
164 | Directories paths ->
165 let paths = StrSet.elements (StrSet.of_list paths) in
166 Stream.concat (List.map paths ~f:File.find)
167 in
168 Stream.filter input ~f:(fun {File.path; size} ->
169 incr count.considered;
170 let empty = size = 0 in
171 let ignored =
172 match ignore with
173 | Some regexp when (Str.string_match regexp path 0) ->
174 true
175 | Some _ | None ->
176 false
177 in
178 if empty then incr count.empty;
179 if ignored then incr count.ignored;
180 (not empty) && (not ignored)
181 )
e09dff7f
SK
182
183let make_output_fun = function
184 | Stdout ->
185 fun digest n_paths paths ->
186 printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
187 List.iter (StrSet.elements paths) ~f:(printf " %S\n%!")
188 | Directory dir ->
189 fun digest _ paths ->
190 let digest = Digest.to_hex digest in
191 let dir = Filename.concat dir (String.sub digest 0 2) in
192 Unix.mkdir dir ~perm:0o700;
193 let oc = open_out (Filename.concat dir digest) in
194 List.iter (StrSet.elements paths) ~f:(fun path ->
195 output_string oc (sprintf "%S\n%!" path)
196 );
197 close_out oc
198
1253df34 199let main {input; output; ignore} =
a9a56d74
SK
200 let t0 = Sys.time () in
201 let count =
202 { considered = ref 0
203 ; empty = ref 0
204 ; ignored = ref 0
205 ; unique_size = ref 0
206 ; hashed = ref 0
207 }
208 in
e09dff7f 209 let output = make_output_fun output in
a9a56d74
SK
210 let input = make_input_stream input ignore count in
211 let paths_by_size = Hashtbl.create 1_000_000 in
cce97c27 212 let paths_by_digest = Hashtbl.create 1_000_000 in
a9a56d74
SK
213 let process tbl path ~f =
214 let key = f path in
215 let count, paths =
216 match Hashtbl.find_opt tbl key with
217 | None ->
218 (0, StrSet.empty)
219 | Some (n, paths) ->
220 (n, paths)
221 in
222 Hashtbl.replace tbl key (count + 1, StrSet.add path paths)
34107832 223 in
a9a56d74
SK
224 Stream.iter input ~f:(fun {File.path; size} ->
225 process paths_by_size path ~f:(fun _ -> size)
cce97c27 226 );
a9a56d74
SK
227 Hashtbl.iter
228 (fun _ (n, paths) ->
229 (* Skip files with unique sizes *)
230 if n > 1 then
231 StrSet.iter
232 (fun path ->
233 incr count.hashed;
234 process paths_by_digest path ~f:Digest.file
235 )
236 paths
237 else
238 incr count.unique_size;
239 )
240 paths_by_size;
e09dff7f 241 Hashtbl.iter (fun d (n, ps) -> if n > 1 then output d n ps) paths_by_digest;
f16e6ff1 242 let t1 = Sys.time () in
a9a56d74
SK
243 eprintf "Time : %f seconds\n%!" (t1 -. t0);
244 eprintf "Considered : %d\n%!" !(count.considered);
245 eprintf "Hashed : %d\n%!" !(count.hashed);
246 eprintf "Skipped due to 0 size : %d\n%!" !(count.empty);
247 eprintf "Skipped due to unique size : %d\n%!" !(count.unique_size);
248 eprintf "Ignored due to regex match : %d\n%!" !(count.ignored)
cce97c27 249
1253df34
SK
250let get_opt () : opt =
251 let assert_ test x msg =
252 if not (test x) then begin
253 eprintf "%s\n%!" msg;
e09dff7f
SK
254 exit 1
255 end
256 in
1253df34
SK
257 let assert_file_exists path =
258 assert_ Sys.file_exists path (sprintf "File does not exist: %S" path)
259 in
e09dff7f 260 let assert_file_is_dir path =
1253df34 261 assert_ Sys.is_directory path (sprintf "File is not a directory: %S" path)
e09dff7f 262 in
1253df34
SK
263 let input = ref Stdin in
264 let output = ref Stdout in
265 let ignore = ref None in
e09dff7f
SK
266 let spec =
267 [ ( "-out"
268 , Arg.String (fun path ->
269 assert_file_exists path;
270 assert_file_is_dir path;
271 output := Directory path
8673c3a5 272 )
e09dff7f
SK
273 , " Output to this directory instead of stdout."
274 )
34107832
SK
275 ; ( "-ignore"
276 , Arg.String (fun regexp -> ignore := Some (Str.regexp regexp))
277 , " Ignore file paths which match this regexp pattern (see Str module)."
278 )
e09dff7f
SK
279 ]
280 in
281 Arg.parse
282 (Arg.align spec)
283 (fun path ->
284 assert_file_exists path;
285 assert_file_is_dir path;
286 match !input with
cfcdf90a
SK
287 | Stdin ->
288 input := Directories [path]
289 | Directories paths ->
290 input := Directories (path :: paths)
61a05dbb
SK
291 )
292 "";
1253df34
SK
293 { input = !input
294 ; output = !output
295 ; ignore = !ignore
296 }
297
298let () =
299 main (get_opt ())
This page took 0.038504 seconds and 4 git commands to generate.