84c070722b4226a11403da85d665b2159041f684
[dups.git] / dups.ml
1 open Printf
2
3 module Array = ArrayLabels
4 module List = ListLabels
5
6 module Stream : sig
7 type 'a t
8
9 val create : (unit -> 'a option) -> 'a t
10
11 val iter : 'a t -> f:('a -> unit) -> unit
12 end = struct
13 module S = Stream
14
15 type 'a t =
16 'a S.t
17
18 let create f =
19 S.from (fun _ -> f ())
20
21 let iter t ~f =
22 S.iter f t
23 end
24
25 module In_channel : sig
26 val lines : in_channel -> string Stream.t
27 end = struct
28 let lines ic =
29 Stream.create (fun () ->
30 match input_line ic with
31 | exception End_of_file ->
32 None
33 | line ->
34 Some line
35 )
36 end
37
38 module Directory_tree : sig
39 val find_files : string -> string Stream.t
40 end = struct
41 let find_files root =
42 let dirs = Queue.create () in
43 let files = Queue.create () in
44 let explore parent =
45 Array.iter (Sys.readdir parent) ~f:(fun child ->
46 let path = Filename.concat parent child in
47 let {Unix.st_kind = file_kind; _} = Unix.lstat path in
48 match file_kind with
49 | Unix.S_REG ->
50 Queue.add path files
51 | Unix.S_DIR ->
52 Queue.add path dirs
53 | Unix.S_CHR
54 | Unix.S_BLK
55 | Unix.S_LNK
56 | Unix.S_FIFO
57 | Unix.S_SOCK ->
58 ()
59 )
60 in
61 explore root;
62 let rec next () =
63 match Queue.is_empty files, Queue.is_empty dirs with
64 | false, _ -> Some (Queue.take files)
65 | true , true -> None
66 | true , false ->
67 explore (Queue.take dirs);
68 next ()
69 in
70 Stream.create next
71 end
72
73 type input =
74 | Root_path of string
75 | Paths_on_stdin
76
77 let main input =
78 let paths =
79 match input with
80 | Paths_on_stdin -> In_channel.lines stdin
81 | Root_path root -> Directory_tree.find_files root
82 in
83 let paths_by_digest = Hashtbl.create 1_000_000 in
84 let path_count = ref 0 in
85 let t0 = Sys.time () in
86 Stream.iter paths ~f:(fun path ->
87 incr path_count;
88 try
89 let digest = Digest.file path in
90 let paths =
91 match Hashtbl.find_opt paths_by_digest digest with
92 | None ->
93 []
94 | Some paths ->
95 paths
96 in
97 Hashtbl.replace paths_by_digest digest (path :: paths)
98 with Sys_error e ->
99 eprintf "WARNING: Failed to process %S: %S\n%!" path e
100 );
101 Hashtbl.iter
102 (fun digest paths ->
103 let n_paths = List.length paths in
104 if n_paths > 1 then begin
105 printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
106 List.iter paths ~f:(fun path -> printf " %s\n%!" path)
107 end
108 )
109 paths_by_digest;
110 let t1 = Sys.time () in
111 eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0)
112
113 let () =
114 let input = ref Paths_on_stdin in
115 Arg.parse
116 []
117 (function
118 | path when Sys.file_exists path ->
119 input := Root_path path
120 | path ->
121 eprintf "File does not exist: %S\n%!" path;
122 exit 1
123 )
124 "";
125 main !input
This page took 0.044333 seconds and 3 git commands to generate.