Sample file heads and skip unique ones
[dups.git] / dups.ml
1 open Printf
2
3 module Array = ArrayLabels
4 module List = ListLabels
5 module StrSet= Set.Make(String)
6 module Unix = UnixLabels
7
8 module Stream : sig
9 type 'a t
10
11 val create : (unit -> 'a option) -> 'a t
12
13 val iter : 'a t -> f:('a -> unit) -> unit
14
15 val map : 'a t -> f:('a -> 'b) -> 'b t
16
17 val filter : 'a t -> f:('a -> bool) -> 'a t
18
19 val concat : ('a t) list -> 'a t
20 end = struct
21 module S = Stream
22
23 type 'a t =
24 {mutable streams : ('a S.t) list}
25
26 let create f =
27 {streams = [S.from (fun _ -> f ())]}
28
29 let rec next t =
30 match t.streams with
31 | [] ->
32 None
33 | s :: streams ->
34 (match S.next s with
35 | exception Stream.Failure ->
36 t.streams <- streams;
37 next t
38 | x ->
39 Some x
40 )
41
42 let map t ~f =
43 create (fun () ->
44 match next t with
45 | None -> None
46 | Some x -> Some (f x)
47 )
48
49 let filter t ~f =
50 let rec filter () =
51 match next t with
52 | None ->
53 None
54 | Some x when f x ->
55 Some x
56 | Some _ ->
57 filter ()
58 in
59 create filter
60
61 let iter t ~f =
62 List.iter t.streams ~f:(S.iter f)
63
64 let concat ts =
65 {streams = List.concat (List.map ts ~f:(fun {streams} -> streams))}
66 end
67
68 module In_channel : sig
69 val lines : in_channel -> string Stream.t
70 end = struct
71 let lines ic =
72 Stream.create (fun () ->
73 match input_line ic with
74 | exception End_of_file ->
75 None
76 | line ->
77 Some line
78 )
79 end
80
81 module File : sig
82 type t =
83 { path : string
84 ; size : int
85 }
86
87 val find : string -> t Stream.t
88 (** Find all files in the directory tree, starting from the given root path *)
89
90 val lookup : string Stream.t -> t Stream.t
91 (** Lookup file info for given paths *)
92 end = struct
93 type t =
94 { path : string
95 ; size : int
96 }
97
98 let lookup paths =
99 Stream.map paths ~f:(fun path ->
100 let {Unix.st_size = size; _} = Unix.lstat path in
101 {path; size}
102 )
103
104 let find root =
105 let dirs = Queue.create () in
106 let files = Queue.create () in
107 let explore parent =
108 Array.iter (Sys.readdir parent) ~f:(fun child ->
109 let path = Filename.concat parent child in
110 let {Unix.st_kind = file_kind; st_size; _} = Unix.lstat path in
111 match file_kind with
112 | Unix.S_REG ->
113 let file = {path; size = st_size} in
114 Queue.add file files
115 | Unix.S_DIR ->
116 Queue.add path dirs
117 | Unix.S_CHR
118 | Unix.S_BLK
119 | Unix.S_LNK
120 | Unix.S_FIFO
121 | Unix.S_SOCK ->
122 ()
123 )
124 in
125 explore root;
126 let rec next () =
127 match Queue.is_empty files, Queue.is_empty dirs with
128 | false, _ -> Some (Queue.take files)
129 | true , true -> None
130 | true , false ->
131 explore (Queue.take dirs);
132 next ()
133 in
134 Stream.create next
135 end
136
137 type input =
138 | Stdin
139 | Directories of string list
140
141 type output =
142 | Stdout
143 | Directory of string
144
145 type opt =
146 { input : input
147 ; output : output
148 ; ignore : Str.regexp option
149 ; sample : int
150 }
151
152 type count =
153 { considered : int ref
154 ; empty : int ref
155 ; ignored : int ref
156 ; unique_size : int ref
157 ; unique_sample : int ref
158 ; hashed : int ref
159 }
160
161 let make_input_stream input ignore count =
162 let input =
163 match input with
164 | Stdin ->
165 File.lookup (In_channel.lines stdin)
166 | Directories paths ->
167 let paths = StrSet.elements (StrSet.of_list paths) in
168 Stream.concat (List.map paths ~f:File.find)
169 in
170 Stream.filter input ~f:(fun {File.path; size} ->
171 incr count.considered;
172 let empty = size = 0 in
173 let ignored =
174 match ignore with
175 | Some regexp when (Str.string_match regexp path 0) ->
176 true
177 | Some _ | None ->
178 false
179 in
180 if empty then incr count.empty;
181 if ignored then incr count.ignored;
182 (not empty) && (not ignored)
183 )
184
185 let make_output_fun = function
186 | Stdout ->
187 fun digest n_paths paths ->
188 printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
189 List.iter (StrSet.elements paths) ~f:(printf " %S\n%!")
190 | Directory dir ->
191 fun digest _ paths ->
192 let digest = Digest.to_hex digest in
193 let dir = Filename.concat dir (String.sub digest 0 2) in
194 Unix.mkdir dir ~perm:0o700;
195 let oc = open_out (Filename.concat dir digest) in
196 List.iter (StrSet.elements paths) ~f:(fun path ->
197 output_string oc (sprintf "%S\n%!" path)
198 );
199 close_out oc
200
201 let sample path ~len =
202 let buf = Bytes.make len ' ' in
203 let ic = open_in_bin path in
204 let rec read pos len =
205 assert (len >= 0);
206 if len = 0 then
207 ()
208 else begin
209 let chunk_size = input ic buf pos len in
210 if chunk_size = 0 then (* EOF *)
211 ()
212 else
213 read (pos + chunk_size) (len - chunk_size)
214 end
215 in
216 read 0 len;
217 close_in ic;
218 Bytes.to_string buf
219
220 let main {input; output; ignore; sample = sample_len} =
221 let t0 = Sys.time () in
222 let count =
223 { considered = ref 0
224 ; empty = ref 0
225 ; ignored = ref 0
226 ; unique_size = ref 0
227 ; hashed = ref 0
228 ; unique_sample = ref 0
229 }
230 in
231 let output = make_output_fun output in
232 let input = make_input_stream input ignore count in
233 let paths_by_size = Hashtbl.create 1_000_000 in
234 let paths_by_sample = Hashtbl.create 1_000_000 in
235 let paths_by_digest = Hashtbl.create 1_000_000 in
236 let process tbl path ~f =
237 let key = f path in
238 let count, paths =
239 match Hashtbl.find_opt tbl key with
240 | None ->
241 (0, StrSet.empty)
242 | Some (n, paths) ->
243 (n, paths)
244 in
245 Hashtbl.replace tbl key (count + 1, StrSet.add path paths)
246 in
247 Stream.iter input ~f:(fun {File.path; size} ->
248 process paths_by_size path ~f:(fun _ -> size)
249 );
250 Hashtbl.iter
251 (fun _ (n, paths) ->
252 (* Skip files with unique sizes *)
253 if n > 1 then
254 StrSet.iter
255 (fun path ->
256 process paths_by_sample path ~f:(sample ~len:sample_len)
257 )
258 paths
259 else
260 incr count.unique_size;
261 )
262 paths_by_size;
263 Hashtbl.iter
264 (fun _ (n, paths) ->
265 (* Skip files with unique samples *)
266 if n > 1 then
267 StrSet.iter
268 (fun path ->
269 incr count.hashed;
270 process paths_by_digest path ~f:Digest.file
271 )
272 paths
273 else
274 incr count.unique_sample;
275 )
276 paths_by_sample;
277 Hashtbl.iter (fun d (n, ps) -> if n > 1 then output d n ps) paths_by_digest;
278 let t1 = Sys.time () in
279 eprintf "Time : %f seconds\n%!" (t1 -. t0);
280 eprintf "Considered : %d\n%!" !(count.considered);
281 eprintf "Hashed : %d\n%!" !(count.hashed);
282 eprintf "Skipped due to 0 size : %d\n%!" !(count.empty);
283 eprintf "Skipped due to unique size : %d\n%!" !(count.unique_size);
284 eprintf "Skipped due to unique sample : %d\n%!" !(count.unique_sample);
285 eprintf "Ignored due to regex match : %d\n%!" !(count.ignored)
286
287 let get_opt () : opt =
288 let assert_ test x msg =
289 if not (test x) then begin
290 eprintf "%s\n%!" msg;
291 exit 1
292 end
293 in
294 let assert_file_exists path =
295 assert_ Sys.file_exists path (sprintf "File does not exist: %S" path)
296 in
297 let assert_file_is_dir path =
298 assert_ Sys.is_directory path (sprintf "File is not a directory: %S" path)
299 in
300 let input = ref Stdin in
301 let output = ref Stdout in
302 let ignore = ref None in
303 let sample = ref 256 in
304 let spec =
305 [ ( "-out"
306 , Arg.String (fun path ->
307 assert_file_exists path;
308 assert_file_is_dir path;
309 output := Directory path
310 )
311 , " Output to this directory instead of stdout."
312 )
313 ; ( "-ignore"
314 , Arg.String (fun regexp -> ignore := Some (Str.regexp regexp))
315 , " Ignore file paths which match this regexp pattern (see Str module)."
316 )
317 ; ( "-sample"
318 , Arg.Set_int sample
319 , (sprintf " Byte size of file samples to use. Default: %d" !sample)
320 )
321 ]
322 in
323 Arg.parse
324 (Arg.align spec)
325 (fun path ->
326 assert_file_exists path;
327 assert_file_is_dir path;
328 match !input with
329 | Stdin ->
330 input := Directories [path]
331 | Directories paths ->
332 input := Directories (path :: paths)
333 )
334 "";
335 assert_
336 (fun x -> x > 0)
337 !sample
338 (sprintf "Sample size cannot be negative: %d" !sample);
339 { input = !input
340 ; output = !output
341 ; ignore = !ignore
342 ; sample = !sample
343 }
344
345 let () =
346 main (get_opt ())
This page took 0.061823 seconds and 5 git commands to generate.