Add option to ignore filepaths matching a pattern
[dups.git] / dups.ml
CommitLineData
cce97c27
SK
1open Printf
2
948ee900
SK
3module Array = ArrayLabels
4module List = ListLabels
8673c3a5 5module StrSet= Set.Make(String)
e09dff7f 6module Unix = UnixLabels
cce97c27
SK
7
8module Stream : sig
948ee900 9 type 'a t
e13e9ef5
SK
10
11 val create : (unit -> 'a option) -> 'a t
12
948ee900 13 val iter : 'a t -> f:('a -> unit) -> unit
8673c3a5
SK
14
15 val concat : ('a t) list -> 'a t
cce97c27
SK
16end = struct
17 module S = Stream
18
948ee900 19 type 'a t =
8673c3a5 20 ('a S.t) list
948ee900 21
e13e9ef5 22 let create f =
8673c3a5 23 [S.from (fun _ -> f ())]
e13e9ef5
SK
24
25 let iter t ~f =
8673c3a5
SK
26 List.iter t ~f:(S.iter f)
27
28 let concat ts =
29 List.concat ts
e13e9ef5
SK
30end
31
32module In_channel : sig
33 val lines : in_channel -> string Stream.t
34end = struct
35 let lines ic =
36 Stream.create (fun () ->
37 match input_line ic with
38 | exception End_of_file ->
39 None
40 | line ->
41 Some line
42 )
43end
44
7b7a6b7f 45module Directory_tree : sig
e13e9ef5
SK
46 val find_files : string -> string Stream.t
47end = struct
48 let find_files root =
948ee900
SK
49 let dirs = Queue.create () in
50 let files = Queue.create () in
948ee900
SK
51 let explore parent =
52 Array.iter (Sys.readdir parent) ~f:(fun child ->
53 let path = Filename.concat parent child in
54 let {Unix.st_kind = file_kind; _} = Unix.lstat path in
55 match file_kind with
56 | Unix.S_REG ->
57 Queue.add path files
58 | Unix.S_DIR ->
59 Queue.add path dirs
60 | Unix.S_CHR
61 | Unix.S_BLK
62 | Unix.S_LNK
63 | Unix.S_FIFO
64 | Unix.S_SOCK ->
65 ()
66 )
67 in
1f130f74 68 explore root;
c66266c6
SK
69 let rec next () =
70 match Queue.is_empty files, Queue.is_empty dirs with
71 | false, _ -> Some (Queue.take files)
72 | true , true -> None
73 | true , false ->
74 explore (Queue.take dirs);
75 next ()
76 in
77 Stream.create next
cce97c27
SK
78end
79
948ee900 80type input =
cfcdf90a
SK
81 | Stdin
82 | Directories of string list
948ee900 83
e09dff7f
SK
84type output =
85 | Stdout
86 | Directory of string
87
88let make_input_stream = function
cfcdf90a 89 | Stdin ->
e09dff7f 90 In_channel.lines stdin
cfcdf90a 91 | Directories paths ->
e09dff7f
SK
92 let paths = StrSet.elements (StrSet.of_list paths) in
93 Stream.concat (List.map paths ~f:Directory_tree.find_files)
94
95let make_output_fun = function
96 | Stdout ->
97 fun digest n_paths paths ->
98 printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
99 List.iter (StrSet.elements paths) ~f:(printf " %S\n%!")
100 | Directory dir ->
101 fun digest _ paths ->
102 let digest = Digest.to_hex digest in
103 let dir = Filename.concat dir (String.sub digest 0 2) in
104 Unix.mkdir dir ~perm:0o700;
105 let oc = open_out (Filename.concat dir digest) in
106 List.iter (StrSet.elements paths) ~f:(fun path ->
107 output_string oc (sprintf "%S\n%!" path)
108 );
109 close_out oc
110
34107832 111let main input output ignore =
e09dff7f
SK
112 let output = make_output_fun output in
113 let input = make_input_stream input in
cce97c27 114 let paths_by_digest = Hashtbl.create 1_000_000 in
f16e6ff1
SK
115 let path_count = ref 0 in
116 let t0 = Sys.time () in
34107832 117 let process path =
cce97c27
SK
118 try
119 let digest = Digest.file path in
e09dff7f 120 let count, paths =
cce97c27
SK
121 match Hashtbl.find_opt paths_by_digest digest with
122 | None ->
e09dff7f
SK
123 (0, StrSet.empty)
124 | Some (n, paths) ->
125 (n, paths)
cce97c27 126 in
e09dff7f 127 Hashtbl.replace paths_by_digest digest (count + 1, StrSet.add path paths)
cce97c27
SK
128 with Sys_error e ->
129 eprintf "WARNING: Failed to process %S: %S\n%!" path e
34107832
SK
130 in
131 Stream.iter input ~f:(fun path ->
132 incr path_count;
133 match ignore with
134 | Some regexp when (Str.string_match regexp path 0) ->
135 ()
136 | Some _ | None ->
137 process path
cce97c27 138 );
e09dff7f 139 Hashtbl.iter (fun d (n, ps) -> if n > 1 then output d n ps) paths_by_digest;
f16e6ff1
SK
140 let t1 = Sys.time () in
141 eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0)
cce97c27
SK
142
143let () =
cfcdf90a 144 let input = ref Stdin in
e09dff7f 145 let output = ref Stdout in
34107832 146 let ignore = ref None in
e09dff7f
SK
147 let assert_file_exists path =
148 if Sys.file_exists path then
149 ()
150 else begin
151 eprintf "File does not exist: %S\n%!" path;
152 exit 1
153 end
154 in
155 let assert_file_is_dir path =
156 if Sys.is_directory path then
157 ()
158 else begin
159 eprintf "File is not a directory: %S\n%!" path;
160 exit 1
161 end
162 in
163 let spec =
164 [ ( "-out"
165 , Arg.String (fun path ->
166 assert_file_exists path;
167 assert_file_is_dir path;
168 output := Directory path
8673c3a5 169 )
e09dff7f
SK
170 , " Output to this directory instead of stdout."
171 )
34107832
SK
172 ; ( "-ignore"
173 , Arg.String (fun regexp -> ignore := Some (Str.regexp regexp))
174 , " Ignore file paths which match this regexp pattern (see Str module)."
175 )
e09dff7f
SK
176 ]
177 in
178 Arg.parse
179 (Arg.align spec)
180 (fun path ->
181 assert_file_exists path;
182 assert_file_is_dir path;
183 match !input with
cfcdf90a
SK
184 | Stdin ->
185 input := Directories [path]
186 | Directories paths ->
187 input := Directories (path :: paths)
61a05dbb
SK
188 )
189 "";
34107832 190 main !input !output !ignore
This page took 0.035871 seconds and 4 git commands to generate.