| 1 | open Printf |
| 2 | |
| 3 | module Array = ArrayLabels |
| 4 | module List = ListLabels |
| 5 | module StrSet= Set.Make(String) |
| 6 | |
| 7 | module Stream : sig |
| 8 | type 'a t |
| 9 | |
| 10 | val create : (unit -> 'a option) -> 'a t |
| 11 | |
| 12 | val iter : 'a t -> f:('a -> unit) -> unit |
| 13 | |
| 14 | val concat : ('a t) list -> 'a t |
| 15 | end = struct |
| 16 | module S = Stream |
| 17 | |
| 18 | type 'a t = |
| 19 | ('a S.t) list |
| 20 | |
| 21 | let create f = |
| 22 | [S.from (fun _ -> f ())] |
| 23 | |
| 24 | let iter t ~f = |
| 25 | List.iter t ~f:(S.iter f) |
| 26 | |
| 27 | let concat ts = |
| 28 | List.concat ts |
| 29 | end |
| 30 | |
| 31 | module In_channel : sig |
| 32 | val lines : in_channel -> string Stream.t |
| 33 | end = struct |
| 34 | let lines ic = |
| 35 | Stream.create (fun () -> |
| 36 | match input_line ic with |
| 37 | | exception End_of_file -> |
| 38 | None |
| 39 | | line -> |
| 40 | Some line |
| 41 | ) |
| 42 | end |
| 43 | |
| 44 | module Directory_tree : sig |
| 45 | val find_files : string -> string Stream.t |
| 46 | end = struct |
| 47 | let find_files root = |
| 48 | let dirs = Queue.create () in |
| 49 | let files = Queue.create () in |
| 50 | let explore parent = |
| 51 | Array.iter (Sys.readdir parent) ~f:(fun child -> |
| 52 | let path = Filename.concat parent child in |
| 53 | let {Unix.st_kind = file_kind; _} = Unix.lstat path in |
| 54 | match file_kind with |
| 55 | | Unix.S_REG -> |
| 56 | Queue.add path files |
| 57 | | Unix.S_DIR -> |
| 58 | Queue.add path dirs |
| 59 | | Unix.S_CHR |
| 60 | | Unix.S_BLK |
| 61 | | Unix.S_LNK |
| 62 | | Unix.S_FIFO |
| 63 | | Unix.S_SOCK -> |
| 64 | () |
| 65 | ) |
| 66 | in |
| 67 | explore root; |
| 68 | let rec next () = |
| 69 | match Queue.is_empty files, Queue.is_empty dirs with |
| 70 | | false, _ -> Some (Queue.take files) |
| 71 | | true , true -> None |
| 72 | | true , false -> |
| 73 | explore (Queue.take dirs); |
| 74 | next () |
| 75 | in |
| 76 | Stream.create next |
| 77 | end |
| 78 | |
| 79 | type input = |
| 80 | | Root_paths of string list |
| 81 | | Paths_on_stdin |
| 82 | |
| 83 | let main input = |
| 84 | let paths = |
| 85 | match input with |
| 86 | | Paths_on_stdin -> |
| 87 | In_channel.lines stdin |
| 88 | | Root_paths paths -> |
| 89 | let paths = StrSet.elements (StrSet.of_list paths) in |
| 90 | Stream.concat (List.map paths ~f:Directory_tree.find_files) |
| 91 | in |
| 92 | let paths_by_digest = Hashtbl.create 1_000_000 in |
| 93 | let path_count = ref 0 in |
| 94 | let t0 = Sys.time () in |
| 95 | Stream.iter paths ~f:(fun path -> |
| 96 | incr path_count; |
| 97 | try |
| 98 | let digest = Digest.file path in |
| 99 | let paths = |
| 100 | match Hashtbl.find_opt paths_by_digest digest with |
| 101 | | None -> |
| 102 | StrSet.empty |
| 103 | | Some paths -> |
| 104 | paths |
| 105 | in |
| 106 | Hashtbl.replace paths_by_digest digest (StrSet.add path paths) |
| 107 | with Sys_error e -> |
| 108 | eprintf "WARNING: Failed to process %S: %S\n%!" path e |
| 109 | ); |
| 110 | Hashtbl.iter |
| 111 | (fun digest paths -> |
| 112 | let n_paths = StrSet.cardinal paths in |
| 113 | if n_paths > 1 then begin |
| 114 | printf "%s %d\n%!" (Digest.to_hex digest) n_paths; |
| 115 | List.iter (StrSet.elements paths) ~f:(printf " %s\n%!") |
| 116 | end |
| 117 | ) |
| 118 | paths_by_digest; |
| 119 | let t1 = Sys.time () in |
| 120 | eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0) |
| 121 | |
| 122 | let () = |
| 123 | let input = ref Paths_on_stdin in |
| 124 | Arg.parse |
| 125 | [] |
| 126 | (function |
| 127 | | path when Sys.file_exists path -> |
| 128 | (match !input with |
| 129 | | Paths_on_stdin -> |
| 130 | input := Root_paths [path] |
| 131 | | Root_paths paths -> |
| 132 | input := Root_paths (path :: paths) |
| 133 | ) |
| 134 | | path -> |
| 135 | eprintf "File does not exist: %S\n%!" path; |
| 136 | exit 1 |
| 137 | ) |
| 138 | ""; |
| 139 | main !input |