| 1 | open Printf |
| 2 | |
| 3 | module Array = ArrayLabels |
| 4 | module List = ListLabels |
| 5 | |
| 6 | module Stream : sig |
| 7 | type 'a t |
| 8 | val lines : in_channel -> string t |
| 9 | val rec_file_paths : root:string -> string t |
| 10 | val iter : 'a t -> f:('a -> unit) -> unit |
| 11 | end = struct |
| 12 | module S = Stream |
| 13 | |
| 14 | type 'a t = |
| 15 | 'a S.t |
| 16 | |
| 17 | let rec_file_paths ~root = |
| 18 | let dirs = Queue.create () in |
| 19 | let files = Queue.create () in |
| 20 | Queue.add root dirs; |
| 21 | let explore parent = |
| 22 | Array.iter (Sys.readdir parent) ~f:(fun child -> |
| 23 | let path = Filename.concat parent child in |
| 24 | let {Unix.st_kind = file_kind; _} = Unix.lstat path in |
| 25 | match file_kind with |
| 26 | | Unix.S_REG -> |
| 27 | Queue.add path files |
| 28 | | Unix.S_DIR -> |
| 29 | Queue.add path dirs |
| 30 | | Unix.S_CHR |
| 31 | | Unix.S_BLK |
| 32 | | Unix.S_LNK |
| 33 | | Unix.S_FIFO |
| 34 | | Unix.S_SOCK -> |
| 35 | () |
| 36 | ) |
| 37 | in |
| 38 | let next_dir () = |
| 39 | match Queue.take dirs with |
| 40 | | exception Queue.Empty -> |
| 41 | () |
| 42 | | dir -> |
| 43 | explore dir |
| 44 | in |
| 45 | let next_file () = |
| 46 | match Queue.take files with |
| 47 | | exception Queue.Empty -> |
| 48 | None |
| 49 | | file_path -> |
| 50 | Some file_path |
| 51 | in |
| 52 | S.from (fun _ -> |
| 53 | next_dir (); |
| 54 | next_file () |
| 55 | ) |
| 56 | |
| 57 | let lines ic = |
| 58 | S.from (fun _ -> |
| 59 | match input_line ic with |
| 60 | | exception End_of_file -> |
| 61 | None |
| 62 | | line -> |
| 63 | Some line |
| 64 | ) |
| 65 | |
| 66 | let iter t ~f = |
| 67 | S.iter f t |
| 68 | end |
| 69 | |
| 70 | type input = |
| 71 | | Root_path of string |
| 72 | | Paths_on_stdin |
| 73 | |
| 74 | let main input = |
| 75 | let paths = |
| 76 | match input with |
| 77 | | Paths_on_stdin -> Stream.lines stdin |
| 78 | | Root_path root -> Stream.rec_file_paths ~root |
| 79 | in |
| 80 | let paths_by_digest = Hashtbl.create 1_000_000 in |
| 81 | let path_count = ref 0 in |
| 82 | let t0 = Sys.time () in |
| 83 | Stream.iter paths ~f:(fun path -> |
| 84 | incr path_count; |
| 85 | try |
| 86 | let digest = Digest.file path in |
| 87 | let paths = |
| 88 | match Hashtbl.find_opt paths_by_digest digest with |
| 89 | | None -> |
| 90 | [] |
| 91 | | Some paths -> |
| 92 | paths |
| 93 | in |
| 94 | Hashtbl.replace paths_by_digest digest (path :: paths) |
| 95 | with Sys_error e -> |
| 96 | eprintf "WARNING: Failed to process %S: %S\n%!" path e |
| 97 | ); |
| 98 | Hashtbl.iter |
| 99 | (fun digest paths -> |
| 100 | let n_paths = List.length paths in |
| 101 | if n_paths > 1 then begin |
| 102 | printf "%s %d\n%!" (Digest.to_hex digest) n_paths; |
| 103 | List.iter paths ~f:(fun path -> printf " %s\n%!" path) |
| 104 | end |
| 105 | ) |
| 106 | paths_by_digest; |
| 107 | let t1 = Sys.time () in |
| 108 | eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0) |
| 109 | |
| 110 | let () = |
| 111 | let input = ref Paths_on_stdin in |
| 112 | Arg.parse [] (fun path -> input := Root_path path) ""; |
| 113 | main !input |