Rename dupfiles to dups
[dups.git] / dups.ml
CommitLineData
cce97c27
SK
1open Printf
2
948ee900
SK
3module Array = ArrayLabels
4module List = ListLabels
cce97c27
SK
5
6module Stream : sig
948ee900
SK
7 type 'a t
8 val lines : in_channel -> string t
9 val rec_file_paths : root:string -> string t
10 val iter : 'a t -> f:('a -> unit) -> unit
cce97c27
SK
11end = struct
12 module S = Stream
13
948ee900
SK
14 type 'a t =
15 'a S.t
16
17 let rec_file_paths ~root =
18 let dirs = Queue.create () in
19 let files = Queue.create () in
20 Queue.add root dirs;
21 let explore parent =
22 Array.iter (Sys.readdir parent) ~f:(fun child ->
23 let path = Filename.concat parent child in
24 let {Unix.st_kind = file_kind; _} = Unix.lstat path in
25 match file_kind with
26 | Unix.S_REG ->
27 Queue.add path files
28 | Unix.S_DIR ->
29 Queue.add path dirs
30 | Unix.S_CHR
31 | Unix.S_BLK
32 | Unix.S_LNK
33 | Unix.S_FIFO
34 | Unix.S_SOCK ->
35 ()
36 )
37 in
38 let next_dir () =
39 match Queue.take dirs with
40 | exception Queue.Empty ->
41 ()
42 | dir ->
43 explore dir
44 in
45 let next_file () =
46 match Queue.take files with
47 | exception Queue.Empty ->
48 None
49 | file_path ->
50 Some file_path
51 in
52 S.from (fun _ ->
53 next_dir ();
54 next_file ()
55 )
56
57 let lines ic =
cce97c27
SK
58 S.from (fun _ ->
59 match input_line ic with
60 | exception End_of_file ->
61 None
62 | line ->
63 Some line
64 )
65
66 let iter t ~f =
67 S.iter f t
cce97c27
SK
68end
69
948ee900
SK
70type input =
71 | Root_path of string
72 | Paths_on_stdin
73
74let main input =
75 let paths =
76 match input with
77 | Paths_on_stdin -> Stream.lines stdin
78 | Root_path root -> Stream.rec_file_paths ~root
79 in
cce97c27 80 let paths_by_digest = Hashtbl.create 1_000_000 in
f16e6ff1
SK
81 let path_count = ref 0 in
82 let t0 = Sys.time () in
948ee900 83 Stream.iter paths ~f:(fun path ->
f16e6ff1 84 incr path_count;
cce97c27
SK
85 try
86 let digest = Digest.file path in
87 let paths =
88 match Hashtbl.find_opt paths_by_digest digest with
89 | None ->
90 []
91 | Some paths ->
92 paths
93 in
94 Hashtbl.replace paths_by_digest digest (path :: paths)
95 with Sys_error e ->
96 eprintf "WARNING: Failed to process %S: %S\n%!" path e
97 );
98 Hashtbl.iter
99 (fun digest paths ->
100 let n_paths = List.length paths in
101 if n_paths > 1 then begin
102 printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
103 List.iter paths ~f:(fun path -> printf " %s\n%!" path)
104 end
105 )
f16e6ff1
SK
106 paths_by_digest;
107 let t1 = Sys.time () in
108 eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0)
cce97c27
SK
109
110let () =
948ee900
SK
111 let input = ref Paths_on_stdin in
112 Arg.parse [] (fun path -> input := Root_path path) "";
113 main !input
This page took 0.024132 seconds and 4 git commands to generate.