941d8fa34d1e43a0e0f8b0c656faccc4a6607194
[dups.git] / dups.ml
1 open Printf
2
3 module Array = ArrayLabels
4 module List = ListLabels
5 module StrSet= Set.Make(String)
6
7 module Stream : sig
8 type 'a t
9
10 val create : (unit -> 'a option) -> 'a t
11
12 val iter : 'a t -> f:('a -> unit) -> unit
13
14 val concat : ('a t) list -> 'a t
15 end = struct
16 module S = Stream
17
18 type 'a t =
19 ('a S.t) list
20
21 let create f =
22 [S.from (fun _ -> f ())]
23
24 let iter t ~f =
25 List.iter t ~f:(S.iter f)
26
27 let concat ts =
28 List.concat ts
29 end
30
31 module In_channel : sig
32 val lines : in_channel -> string Stream.t
33 end = struct
34 let lines ic =
35 Stream.create (fun () ->
36 match input_line ic with
37 | exception End_of_file ->
38 None
39 | line ->
40 Some line
41 )
42 end
43
44 module Directory_tree : sig
45 val find_files : string -> string Stream.t
46 end = struct
47 let find_files root =
48 let dirs = Queue.create () in
49 let files = Queue.create () in
50 let explore parent =
51 Array.iter (Sys.readdir parent) ~f:(fun child ->
52 let path = Filename.concat parent child in
53 let {Unix.st_kind = file_kind; _} = Unix.lstat path in
54 match file_kind with
55 | Unix.S_REG ->
56 Queue.add path files
57 | Unix.S_DIR ->
58 Queue.add path dirs
59 | Unix.S_CHR
60 | Unix.S_BLK
61 | Unix.S_LNK
62 | Unix.S_FIFO
63 | Unix.S_SOCK ->
64 ()
65 )
66 in
67 explore root;
68 let rec next () =
69 match Queue.is_empty files, Queue.is_empty dirs with
70 | false, _ -> Some (Queue.take files)
71 | true , true -> None
72 | true , false ->
73 explore (Queue.take dirs);
74 next ()
75 in
76 Stream.create next
77 end
78
79 type input =
80 | Root_paths of string list
81 | Paths_on_stdin
82
83 let main input =
84 let paths =
85 match input with
86 | Paths_on_stdin ->
87 In_channel.lines stdin
88 | Root_paths paths ->
89 let paths = StrSet.elements (StrSet.of_list paths) in
90 Stream.concat (List.map paths ~f:Directory_tree.find_files)
91 in
92 let paths_by_digest = Hashtbl.create 1_000_000 in
93 let path_count = ref 0 in
94 let t0 = Sys.time () in
95 Stream.iter paths ~f:(fun path ->
96 incr path_count;
97 try
98 let digest = Digest.file path in
99 let paths =
100 match Hashtbl.find_opt paths_by_digest digest with
101 | None ->
102 StrSet.empty
103 | Some paths ->
104 paths
105 in
106 Hashtbl.replace paths_by_digest digest (StrSet.add path paths)
107 with Sys_error e ->
108 eprintf "WARNING: Failed to process %S: %S\n%!" path e
109 );
110 Hashtbl.iter
111 (fun digest paths ->
112 let n_paths = StrSet.cardinal paths in
113 if n_paths > 1 then begin
114 printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
115 List.iter (StrSet.elements paths) ~f:(printf " %s\n%!")
116 end
117 )
118 paths_by_digest;
119 let t1 = Sys.time () in
120 eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0)
121
122 let () =
123 let input = ref Paths_on_stdin in
124 Arg.parse
125 []
126 (function
127 | path when Sys.file_exists path ->
128 (match !input with
129 | Paths_on_stdin ->
130 input := Root_paths [path]
131 | Root_paths paths ->
132 input := Root_paths (path :: paths)
133 )
134 | path ->
135 eprintf "File does not exist: %S\n%!" path;
136 exit 1
137 )
138 "";
139 main !input
This page took 0.051181 seconds and 4 git commands to generate.