From cce97c27face42237f2b3757c91ad6e29685d54a Mon Sep 17 00:00:00 2001
From: Siraaj Khandkar
Date: Wed, 14 Nov 2018 09:08:34 -0500
Subject: [PATCH] Initial prototype works mostly smoothly, except one problem:
 because we treat each input line as a filename - the filenames which contain
 newline characters are seen as multiple filenames which do not exist.

---
Review note: the blob hashes on the "index" lines and the commit hash on the
first line are those of the original submission; they were not recomputed
after the review edits to .gitignore, Makefile and dupfiles.ml.

 .gitignore   |  3 +++
 Makefile     | 16 +++++++++++
 dupfiles.ml  | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 dupfiles.mli |  0
 4 files changed, 95 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 dupfiles.ml
 create mode 100644 dupfiles.mli

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..47ad301
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+_build/
+*.native
+*.swp
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..d79ad63
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,16 @@
+TARGET := dupfiles.native
+
+.PHONY: all build clean run
+
+all:
+	@$(MAKE) -s clean
+	@$(MAKE) -s build
+
+build:
+	@ocamlbuild -cflags '-w A' $(TARGET)
+
+clean:
+	@ocamlbuild -clean
+
+run:
+	@find ~ -type f | grep -vF '/.git/' | ./$(TARGET)
diff --git a/dupfiles.ml b/dupfiles.ml
new file mode 100644
index 0000000..42647b7
--- /dev/null
+++ b/dupfiles.ml
@@ -0,0 +1,76 @@
+(* dupfiles: report groups of files whose contents share a digest.
+ *
+ * File paths are read one per line from the file named on the command line,
+ * or from stdin when no file is named.  Every path is digested with
+ * [Digest.file]; each digest shared by two or more paths is then printed
+ * together with those paths.
+ *
+ * Known limitation: input is split on newlines, so a path that itself
+ * contains a newline is seen as several paths which do not exist. *)
+
+open Printf
+
+module List = ListLabels
+
+(* Line-by-line reading of an input channel.  This is a plain recursive loop
+ * rather than a wrapper around the standard Stream module, because that
+ * module was deprecated in OCaml 4.14 and removed from the standard library
+ * in OCaml 5.0. *)
+module Stream : sig
+  (* [lines ic ~f] calls [f] on each line of [ic], in order, until EOF. *)
+  val lines : in_channel -> f:(string -> unit) -> unit
+end = struct
+  let rec lines ic ~f =
+    match input_line ic with
+    | exception End_of_file ->
+        ()
+    | line ->
+        f line;
+        (* Still a tail call: the exception case only guards [input_line]. *)
+        lines ic ~f
+end
+
+(* [main ic] reads paths from [ic], groups them by content digest and, for
+ * each digest shared by two or more paths, prints the hex digest and the
+ * path count followed by one line per path.  A path that cannot be
+ * processed is reported on stderr as a warning and skipped. *)
+let main ic =
+  (* Digest of file contents -> paths seen so far with that digest. *)
+  let paths_by_digest = Hashtbl.create 1_000_000 in
+  Stream.lines ic ~f:(fun path ->
+    try
+      let digest = Digest.file path in
+      let paths =
+        match Hashtbl.find_opt paths_by_digest digest with
+        | None ->
+            []
+        | Some paths ->
+            paths
+      in
+      Hashtbl.replace paths_by_digest digest (path :: paths)
+    with Sys_error e ->
+      eprintf "WARNING: Failed to process %S: %S\n%!" path e
+  );
+  Hashtbl.iter
+    (fun digest paths ->
+      let n_paths = List.length paths in
+      if n_paths > 1 then begin
+        printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
+        List.iter paths ~f:(fun path -> printf " %s\n%!" path)
+      end
+    )
+    paths_by_digest
+
+(* Entry point: read paths from the file given as an anonymous argument, or
+ * from stdin when there is none. *)
+let () =
+  let ic = ref stdin in
+  Arg.parse []
+    (fun filename ->
+      (* Only the last file named is read; close any earlier one instead of
+       * leaking its descriptor. *)
+      if !ic != stdin then close_in !ic;
+      ic := open_in filename)
+    "";
+  main !ic;
+  close_in !ic
diff --git a/dupfiles.mli b/dupfiles.mli
new file mode 100644
index 0000000..e69de29
-- 
2.20.1