Commit | Line | Data |
---|---|---|
cce97c27 SK |
1 | open Printf |
2 | ||
948ee900 SK |
3 | module Array = ArrayLabels |
4 | module List = ListLabels | |
cce97c27 SK |
5 | |
6 | module Stream : sig | |
948ee900 SK |
7 | type 'a t |
8 | val lines : in_channel -> string t | |
9 | val rec_file_paths : root:string -> string t | |
10 | val iter : 'a t -> f:('a -> unit) -> unit | |
cce97c27 SK |
11 | end = struct |
12 | module S = Stream | |
13 | ||
948ee900 SK |
14 | type 'a t = |
15 | 'a S.t | |
16 | ||
17 | let rec_file_paths ~root = | |
18 | let dirs = Queue.create () in | |
19 | let files = Queue.create () in | |
20 | Queue.add root dirs; | |
21 | let explore parent = | |
22 | Array.iter (Sys.readdir parent) ~f:(fun child -> | |
23 | let path = Filename.concat parent child in | |
24 | let {Unix.st_kind = file_kind; _} = Unix.lstat path in | |
25 | match file_kind with | |
26 | | Unix.S_REG -> | |
27 | Queue.add path files | |
28 | | Unix.S_DIR -> | |
29 | Queue.add path dirs | |
30 | | Unix.S_CHR | |
31 | | Unix.S_BLK | |
32 | | Unix.S_LNK | |
33 | | Unix.S_FIFO | |
34 | | Unix.S_SOCK -> | |
35 | () | |
36 | ) | |
37 | in | |
38 | let next_dir () = | |
39 | match Queue.take dirs with | |
40 | | exception Queue.Empty -> | |
41 | () | |
42 | | dir -> | |
43 | explore dir | |
44 | in | |
45 | let next_file () = | |
46 | match Queue.take files with | |
47 | | exception Queue.Empty -> | |
48 | None | |
49 | | file_path -> | |
50 | Some file_path | |
51 | in | |
52 | S.from (fun _ -> | |
53 | next_dir (); | |
54 | next_file () | |
55 | ) | |
56 | ||
57 | let lines ic = | |
cce97c27 SK |
58 | S.from (fun _ -> |
59 | match input_line ic with | |
60 | | exception End_of_file -> | |
61 | None | |
62 | | line -> | |
63 | Some line | |
64 | ) | |
65 | ||
66 | let iter t ~f = | |
67 | S.iter f t | |
cce97c27 SK |
68 | end |
69 | ||
948ee900 SK |
70 | type input = |
71 | | Root_path of string | |
72 | | Paths_on_stdin | |
73 | ||
74 | let main input = | |
75 | let paths = | |
76 | match input with | |
77 | | Paths_on_stdin -> Stream.lines stdin | |
78 | | Root_path root -> Stream.rec_file_paths ~root | |
79 | in | |
cce97c27 | 80 | let paths_by_digest = Hashtbl.create 1_000_000 in |
f16e6ff1 SK |
81 | let path_count = ref 0 in |
82 | let t0 = Sys.time () in | |
948ee900 | 83 | Stream.iter paths ~f:(fun path -> |
f16e6ff1 | 84 | incr path_count; |
cce97c27 SK |
85 | try |
86 | let digest = Digest.file path in | |
87 | let paths = | |
88 | match Hashtbl.find_opt paths_by_digest digest with | |
89 | | None -> | |
90 | [] | |
91 | | Some paths -> | |
92 | paths | |
93 | in | |
94 | Hashtbl.replace paths_by_digest digest (path :: paths) | |
95 | with Sys_error e -> | |
96 | eprintf "WARNING: Failed to process %S: %S\n%!" path e | |
97 | ); | |
98 | Hashtbl.iter | |
99 | (fun digest paths -> | |
100 | let n_paths = List.length paths in | |
101 | if n_paths > 1 then begin | |
102 | printf "%s %d\n%!" (Digest.to_hex digest) n_paths; | |
103 | List.iter paths ~f:(fun path -> printf " %s\n%!" path) | |
104 | end | |
105 | ) | |
f16e6ff1 SK |
106 | paths_by_digest; |
107 | let t1 = Sys.time () in | |
108 | eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0) | |
cce97c27 SK |
109 | |
110 | let () = | |
948ee900 SK |
111 | let input = ref Paths_on_stdin in |
112 | Arg.parse [] (fun path -> input := Root_path path) ""; | |
113 | main !input |