0bcc83a8a312f52ff3dbd91dab32493e67538dd8
[dups.git] / dups.ml
1 open Printf
2
3 module Array = ArrayLabels
4 module List = ListLabels
5
(** Minimal pull-based stream of values.

    This is a self-contained implementation: it does not rely on the
    standard library's [Stream] module, which was deprecated in OCaml 4.14
    and removed in OCaml 5.0. *)
module Stream : sig
  (** A one-shot stream of ['a] values. *)
  type 'a t

  (** [create next] is the stream whose elements are the successive
      [Some x] results of calling [next ()]. The first [None] marks the end
      of the stream; [next] is never called again after that. *)
  val create : (unit -> 'a option) -> 'a t

  (** [iter t ~f] applies [f] to every remaining element of [t], in order,
      consuming the stream. Iterating an exhausted stream does nothing. *)
  val iter : 'a t -> f:('a -> unit) -> unit
end = struct
  type 'a t =
    { next : unit -> 'a option
    ; mutable exhausted : bool
      (* Set once [next] has returned [None], so that the end of the stream
         is sticky, as it was with [Stdlib.Stream.from]. *)
    }

  let create next =
    { next; exhausted = false }

  let iter t ~f =
    let rec loop () =
      if not t.exhausted then
        match t.next () with
        | None ->
          t.exhausted <- true
        | Some x ->
          f x;
          loop ()
    in
    loop ()
end
24
(** Line-oriented reading from input channels. *)
module In_channel : sig
  (** [lines ic] is the stream of lines read from [ic] with [input_line].
      The stream ends when [input_line] raises [End_of_file]. *)
  val lines : in_channel -> string Stream.t
end = struct
  let lines ic =
    (* One pull of the stream = one attempt to read a line. *)
    let read_line () =
      try Some (input_line ic) with End_of_file -> None
    in
    Stream.create read_line
end
37
(** Recursive directory traversal. *)
module Directory : sig
  (** [find_files root] is the stream of paths of all regular files found
      under [root], discovered breadth-first. Directories are explored
      lazily, as the stream is consumed. Symbolic links are not followed,
      and devices, FIFOs and sockets are skipped.

      Consuming the stream may raise [Sys_error] (from [Sys.readdir]) or
      [Unix.Unix_error] (from [Unix.lstat]). *)
  val find_files : string -> string Stream.t
end = struct
  let find_files root =
    (* Directories waiting to be explored and files waiting to be emitted. *)
    let dirs = Queue.create () in
    let files = Queue.create () in
    Queue.add root dirs;
    (* Sort every entry of [parent] into the appropriate queue. *)
    let explore parent =
      Array.iter (Sys.readdir parent) ~f:(fun child ->
        let path = Filename.concat parent child in
        (* [lstat], not [stat]: a symlink is reported as [S_LNK] and is
           therefore ignored rather than followed. *)
        let {Unix.st_kind = file_kind; _} = Unix.lstat path in
        match file_kind with
        | Unix.S_REG ->
          Queue.add path files
        | Unix.S_DIR ->
          Queue.add path dirs
        | Unix.S_CHR
        | Unix.S_BLK
        | Unix.S_LNK
        | Unix.S_FIFO
        | Unix.S_SOCK ->
          ()
      )
    in
    (* Keep exploring queued directories until a file is available or there
       is nothing left to explore. Exploring just one directory per pull is
       not enough: a directory without regular files (e.g. one that holds
       only subdirectories) would leave [files] empty and end the stream
       while [dirs] still has work queued. *)
    let rec next () =
      if not (Queue.is_empty files) then
        Some (Queue.take files)
      else if Queue.is_empty dirs then
        None
      else begin
        explore (Queue.take dirs);
        next ()
      end
    in
    Stream.create next
end
81
(** Where the candidate file paths come from: [Root_path dir] means the
    regular files found under [dir]; [Paths_on_stdin] means one path per
    line read from standard input. *)
type input = Root_path of string | Paths_on_stdin
85
(** [main input] computes the digest of every path supplied by [input],
    groups the paths by digest, and prints to standard output each digest
    shared by more than one path, followed by those paths. Paths whose
    digest cannot be computed are reported on standard error and skipped.
    A summary (number of paths seen and elapsed [Sys.time]) is written to
    standard error at the end. *)
let main input =
  let candidates =
    match input with
    | Root_path root -> Directory.find_files root
    | Paths_on_stdin -> In_channel.lines stdin
  in
  let groups = Hashtbl.create 1_000_000 in
  let seen = ref 0 in
  let started = Sys.time () in
  (* Count the path, then file it under its digest. *)
  let record path =
    incr seen;
    match Digest.file path with
    | exception Sys_error msg ->
      eprintf "WARNING: Failed to process %S: %S\n%!" path msg
    | digest ->
      let previous =
        match Hashtbl.find_opt groups digest with
        | Some known -> known
        | None -> []
      in
      Hashtbl.replace groups digest (path :: previous)
  in
  Stream.iter candidates ~f:record;
  (* Only digests shared by at least two paths are worth printing. *)
  let report digest members =
    match List.length members with
    | 0 | 1 ->
      ()
    | count ->
      printf "%s %d\n%!" (Digest.to_hex digest) count;
      List.iter members ~f:(fun member -> printf " %s\n%!" member)
  in
  Hashtbl.iter report groups;
  let finished = Sys.time () in
  eprintf "Processed %d files in %f seconds.\n%!" !seen (finished -. started)
121
(* Entry point. With no command-line argument, paths are read from standard
   input; an anonymous argument selects a root directory to scan instead
   (if several are given, the last one is used). *)
let () =
  let selected = ref Paths_on_stdin in
  let use_root path = selected := Root_path path in
  Arg.parse [] use_root "";
  main !selected
This page took 0.056398 seconds and 4 git commands to generate.