Quote outputted paths
[dups.git] / dups.ml
CommitLineData
cce97c27
SK
1open Printf
2
948ee900
SK
3module Array = ArrayLabels
4module List = ListLabels
8673c3a5 5module StrSet= Set.Make(String)
cce97c27
SK
6
7module Stream : sig
948ee900 8 type 'a t
e13e9ef5
SK
9
10 val create : (unit -> 'a option) -> 'a t
11
948ee900 12 val iter : 'a t -> f:('a -> unit) -> unit
8673c3a5
SK
13
14 val concat : ('a t) list -> 'a t
cce97c27
SK
15end = struct
16 module S = Stream
17
948ee900 18 type 'a t =
8673c3a5 19 ('a S.t) list
948ee900 20
e13e9ef5 21 let create f =
8673c3a5 22 [S.from (fun _ -> f ())]
e13e9ef5
SK
23
24 let iter t ~f =
8673c3a5
SK
25 List.iter t ~f:(S.iter f)
26
27 let concat ts =
28 List.concat ts
e13e9ef5
SK
29end
30
31module In_channel : sig
32 val lines : in_channel -> string Stream.t
33end = struct
34 let lines ic =
35 Stream.create (fun () ->
36 match input_line ic with
37 | exception End_of_file ->
38 None
39 | line ->
40 Some line
41 )
42end
43
7b7a6b7f 44module Directory_tree : sig
e13e9ef5
SK
45 val find_files : string -> string Stream.t
46end = struct
47 let find_files root =
948ee900
SK
48 let dirs = Queue.create () in
49 let files = Queue.create () in
948ee900
SK
50 let explore parent =
51 Array.iter (Sys.readdir parent) ~f:(fun child ->
52 let path = Filename.concat parent child in
53 let {Unix.st_kind = file_kind; _} = Unix.lstat path in
54 match file_kind with
55 | Unix.S_REG ->
56 Queue.add path files
57 | Unix.S_DIR ->
58 Queue.add path dirs
59 | Unix.S_CHR
60 | Unix.S_BLK
61 | Unix.S_LNK
62 | Unix.S_FIFO
63 | Unix.S_SOCK ->
64 ()
65 )
66 in
1f130f74 67 explore root;
c66266c6
SK
68 let rec next () =
69 match Queue.is_empty files, Queue.is_empty dirs with
70 | false, _ -> Some (Queue.take files)
71 | true , true -> None
72 | true , false ->
73 explore (Queue.take dirs);
74 next ()
75 in
76 Stream.create next
cce97c27
SK
77end
78
948ee900 79type input =
8673c3a5 80 | Root_paths of string list
948ee900
SK
81 | Paths_on_stdin
82
83let main input =
84 let paths =
85 match input with
8673c3a5
SK
86 | Paths_on_stdin ->
87 In_channel.lines stdin
88 | Root_paths paths ->
89 let paths = StrSet.elements (StrSet.of_list paths) in
90 Stream.concat (List.map paths ~f:Directory_tree.find_files)
948ee900 91 in
cce97c27 92 let paths_by_digest = Hashtbl.create 1_000_000 in
f16e6ff1
SK
93 let path_count = ref 0 in
94 let t0 = Sys.time () in
948ee900 95 Stream.iter paths ~f:(fun path ->
f16e6ff1 96 incr path_count;
cce97c27
SK
97 try
98 let digest = Digest.file path in
99 let paths =
100 match Hashtbl.find_opt paths_by_digest digest with
101 | None ->
8673c3a5 102 StrSet.empty
cce97c27
SK
103 | Some paths ->
104 paths
105 in
8673c3a5 106 Hashtbl.replace paths_by_digest digest (StrSet.add path paths)
cce97c27
SK
107 with Sys_error e ->
108 eprintf "WARNING: Failed to process %S: %S\n%!" path e
109 );
110 Hashtbl.iter
111 (fun digest paths ->
8673c3a5 112 let n_paths = StrSet.cardinal paths in
cce97c27
SK
113 if n_paths > 1 then begin
114 printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
e4a6d3c9 115 List.iter (StrSet.elements paths) ~f:(printf " %S\n%!")
cce97c27
SK
116 end
117 )
f16e6ff1
SK
118 paths_by_digest;
119 let t1 = Sys.time () in
120 eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0)
cce97c27
SK
121
122let () =
948ee900 123 let input = ref Paths_on_stdin in
61a05dbb
SK
124 Arg.parse
125 []
126 (function
127 | path when Sys.file_exists path ->
8673c3a5
SK
128 (match !input with
129 | Paths_on_stdin ->
130 input := Root_paths [path]
131 | Root_paths paths ->
132 input := Root_paths (path :: paths)
133 )
61a05dbb
SK
134 | path ->
135 eprintf "File does not exist: %S\n%!" path;
136 exit 1
137 )
138 "";
948ee900 139 main !input
This page took 0.030781 seconds and 4 git commands to generate.