(* dupfiles: report groups of files whose contents are identical.

   Input is a list of file paths, one per line, read from the file named on
   the command line or from stdin when no file is given.  Every path is
   hashed with [Digest.file]; each digest shared by more than one path is
   printed together with the paths that produced it.

   Recovered from the git blobdiff that introduced dupfiles.ml (new file,
   blob 42647b7), whose text had been collapsed onto a single line. *)

open Printf

module List = ListLabels

(** Line-oriented iteration over input channels.

    This module deliberately does not depend on the standard library's
    [Stream] module, which is no longer shipped with the compiler since
    OCaml 5.0. *)
module Stream : sig
  (** [lines ic ~f] applies [f] to each line read from [ic] with
      [input_line], in order, and returns once [End_of_file] is reached.
      Exceptions raised by [f] are propagated to the caller. *)
  val lines : in_channel -> f:(string -> unit) -> unit
end = struct
  let rec lines ic ~f =
    match input_line ic with
    | exception End_of_file -> ()
    | line ->
      (* The recursive call sits in a match arm, outside the scope of the
         exception handler, so it is a tail call and the stack does not grow
         with the number of lines. *)
      f line;
      lines ic ~f
end

(** [index_paths ic] reads one path per line from [ic] and returns a table
    mapping each content digest to the paths that produced it.  Within one
    entry the most recently read path comes first.

    A path for which [Digest.file] raises [Sys_error] is reported on stderr
    and left out of the table. *)
let index_paths ic =
  let paths_by_digest = Hashtbl.create 1_000_000 in
  Stream.lines ic ~f:(fun path ->
    try
      let digest = Digest.file path in
      let paths =
        match Hashtbl.find_opt paths_by_digest digest with
        | None -> []
        | Some paths -> paths
      in
      Hashtbl.replace paths_by_digest digest (path :: paths)
    with Sys_error e ->
      eprintf "WARNING: Failed to process %S: %S\n%!" path e
  );
  paths_by_digest

(** [report paths_by_digest] prints, for every digest bound to more than one
    path, a header line with the hexadecimal digest and the number of paths,
    followed by one line per path.  Groups are printed in [Hashtbl.iter]
    order. *)
let report paths_by_digest =
  Hashtbl.iter
    (fun digest paths ->
      let n_paths = List.length paths in
      if n_paths > 1 then begin
        printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
        List.iter paths ~f:(fun path -> printf " %s\n%!" path)
      end
    )
    paths_by_digest

(** [main ic] reads paths from [ic] and prints every group of paths whose
    files have the same digest. *)
let main ic =
  report (index_paths ic)

let () =
  let ic = ref stdin in
  let use_file filename =
    (* Open the new file before releasing the old one, so that a failing
       [open_in] leaves the current channel untouched. *)
    let next = open_in filename in
    (* Only the last file named on the command line is read; close the one
       it replaces instead of leaking its descriptor.  Physical comparison
       is intended: stdin itself must stay open until [main] has run. *)
    if !ic != stdin then close_in !ic;
    ic := next
  in
  let usage =
    sprintf
      "Usage: %s [FILE]\n\
       Read file paths, one per line, from FILE (default: stdin) and print \
       groups of paths with identical content."
      Sys.argv.(0)
  in
  Arg.parse [] use_file usage;
  match main !ic with
  | () -> close_in !ic
  | exception e ->
    (* Release the channel on failure too, then let the error surface. *)
    close_in_noerr !ic;
    raise e