| 1 | open Printf |
| 2 | |
| 3 | module Array = ArrayLabels |
| 4 | module List = ListLabels |
| 5 | |
| 6 | module Stream : sig |
| 7 | type 'a t |
| 8 | |
| 9 | val create : (unit -> 'a option) -> 'a t |
| 10 | |
| 11 | val iter : 'a t -> f:('a -> unit) -> unit |
| 12 | end = struct |
| 13 | module S = Stream |
| 14 | |
| 15 | type 'a t = |
| 16 | 'a S.t |
| 17 | |
| 18 | let create f = |
| 19 | S.from (fun _ -> f ()) |
| 20 | |
| 21 | let iter t ~f = |
| 22 | S.iter f t |
| 23 | end |
| 24 | |
| 25 | module In_channel : sig |
| 26 | val lines : in_channel -> string Stream.t |
| 27 | end = struct |
| 28 | let lines ic = |
| 29 | Stream.create (fun () -> |
| 30 | match input_line ic with |
| 31 | | exception End_of_file -> |
| 32 | None |
| 33 | | line -> |
| 34 | Some line |
| 35 | ) |
| 36 | end |
| 37 | |
| 38 | module Directory : sig |
| 39 | val find_files : string -> string Stream.t |
| 40 | end = struct |
| 41 | let find_files root = |
| 42 | let dirs = Queue.create () in |
| 43 | let files = Queue.create () in |
| 44 | let explore parent = |
| 45 | Array.iter (Sys.readdir parent) ~f:(fun child -> |
| 46 | let path = Filename.concat parent child in |
| 47 | let {Unix.st_kind = file_kind; _} = Unix.lstat path in |
| 48 | match file_kind with |
| 49 | | Unix.S_REG -> |
| 50 | Queue.add path files |
| 51 | | Unix.S_DIR -> |
| 52 | Queue.add path dirs |
| 53 | | Unix.S_CHR |
| 54 | | Unix.S_BLK |
| 55 | | Unix.S_LNK |
| 56 | | Unix.S_FIFO |
| 57 | | Unix.S_SOCK -> |
| 58 | () |
| 59 | ) |
| 60 | in |
| 61 | explore root; |
| 62 | let rec next () = |
| 63 | match Queue.is_empty files, Queue.is_empty dirs with |
| 64 | | false, _ -> Some (Queue.take files) |
| 65 | | true , true -> None |
| 66 | | true , false -> |
| 67 | explore (Queue.take dirs); |
| 68 | next () |
| 69 | in |
| 70 | Stream.create next |
| 71 | end |
| 72 | |
| 73 | type input = |
| 74 | | Root_path of string |
| 75 | | Paths_on_stdin |
| 76 | |
| 77 | let main input = |
| 78 | let paths = |
| 79 | match input with |
| 80 | | Paths_on_stdin -> In_channel.lines stdin |
| 81 | | Root_path root -> Directory.find_files root |
| 82 | in |
| 83 | let paths_by_digest = Hashtbl.create 1_000_000 in |
| 84 | let path_count = ref 0 in |
| 85 | let t0 = Sys.time () in |
| 86 | Stream.iter paths ~f:(fun path -> |
| 87 | incr path_count; |
| 88 | try |
| 89 | let digest = Digest.file path in |
| 90 | let paths = |
| 91 | match Hashtbl.find_opt paths_by_digest digest with |
| 92 | | None -> |
| 93 | [] |
| 94 | | Some paths -> |
| 95 | paths |
| 96 | in |
| 97 | Hashtbl.replace paths_by_digest digest (path :: paths) |
| 98 | with Sys_error e -> |
| 99 | eprintf "WARNING: Failed to process %S: %S\n%!" path e |
| 100 | ); |
| 101 | Hashtbl.iter |
| 102 | (fun digest paths -> |
| 103 | let n_paths = List.length paths in |
| 104 | if n_paths > 1 then begin |
| 105 | printf "%s %d\n%!" (Digest.to_hex digest) n_paths; |
| 106 | List.iter paths ~f:(fun path -> printf " %s\n%!" path) |
| 107 | end |
| 108 | ) |
| 109 | paths_by_digest; |
| 110 | let t1 = Sys.time () in |
| 111 | eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0) |
| 112 | |
| 113 | let () = |
| 114 | let input = ref Paths_on_stdin in |
| 115 | Arg.parse [] (fun path -> |
| 116 | if Sys.file_exists path then |
| 117 | input := Root_path path |
| 118 | else begin |
| 119 | eprintf "File does not exist: %S\n%!" path; |
| 120 | exit 1 |
| 121 | end |
| 122 | ) ""; |
| 123 | main !input |