Initial prototype
author Siraaj Khandkar <siraaj@khandkar.net>
Wed, 14 Nov 2018 14:08:34 +0000 (09:08 -0500)
committer Siraaj Khandkar <siraaj@khandkar.net>
Wed, 14 Nov 2018 14:08:34 +0000 (09:08 -0500)
Works mostly smoothly, except for one problem: because we treat each input
line as a filename, filenames which contain newline characters are seen as
multiple filenames which do not exist.

.gitignore [new file with mode: 0644]
Makefile [new file with mode: 0644]
dupfiles.ml [new file with mode: 0644]
dupfiles.mli [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..47ad301
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+_build/
+*.native
+*.native
+*.swp
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..d79ad63
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,16 @@
+# Native-code executable produced by ocamlbuild.
+TARGET := dupfiles.native
+
+.PHONY: all build clean run
+
+# Default target: clean, then build.
+all:
+	@$(MAKE) -s clean
+	@$(MAKE) -s build
+
+# '-w A' turns on all compiler warnings.
+build:
+	@ocamlbuild -cflags '-w A' $(TARGET)
+
+clean:
+	@ocamlbuild -clean
+
+# Feed every regular file under the home directory to the program, skipping
+# paths that contain a ".git" directory component.  The dot is escaped so
+# that it matches only a literal '.', and plain grep replaces the
+# obsolescent egrep.
+run:
+	@find ~ -type f | grep -v '/\.git/' | ./$(TARGET)
diff --git a/dupfiles.ml b/dupfiles.ml
new file mode 100644 (file)
index 0000000..42647b7
--- /dev/null
+++ b/dupfiles.ml
@@ -0,0 +1,56 @@
+open Printf
+
+module List = ListLabels
+
+(* Line-oriented input helper.  This local module shadows the standard
+   library's [Stream] for the rest of the file; only [lines] is exposed. *)
+module Stream : sig
+  (** [lines ic ~f] calls [f] on each line read from [ic], in order, until
+      end of input.  As with [input_line], the newline terminator is not
+      part of the string passed to [f]. *)
+  val lines : in_channel -> f:(string -> unit) -> unit
+end = struct
+  (* A plain loop instead of the standard library's [Stream.from] and
+     [Stream.iter]: that module was deprecated in OCaml 4.14 and removed in
+     5.0.  The recursive call sits outside the scope of the exception
+     handler, so it is a tail call and the loop uses constant stack. *)
+  let rec lines ic ~(f : string -> unit) =
+    match input_line ic with
+    | exception End_of_file ->
+        ()
+    | line ->
+        f line;
+        lines ic ~f
+end
+
+(* Reads file paths from [ic], one per line, groups them by the digest of
+   each file's contents, and prints every group of two or more paths to
+   stdout.  Paths whose digest cannot be computed are reported on stderr
+   and skipped. *)
+let main ic =
+  let table = Hashtbl.create 1_000_000 in
+  (* Adds [path] to the group of paths sharing its content digest. *)
+  let record path =
+    match Digest.file path with
+    | exception Sys_error msg ->
+        eprintf "WARNING: Failed to process %S: %S\n%!" path msg
+    | digest ->
+        let seen =
+          match Hashtbl.find_opt table digest with
+          | Some group -> group
+          | None -> []
+        in
+        Hashtbl.replace table digest (path :: seen)
+  in
+  (* Prints one group: a header with the hex digest and the group size,
+     then one indented path per line.  Groups of fewer than two paths are
+     not duplicates and print nothing. *)
+  let report digest group =
+    match group with
+    | [] | [_] ->
+        ()
+    | _ :: _ :: _ ->
+        printf "%s %d\n%!" (Digest.to_hex digest) (List.length group);
+        List.iter group ~f:(fun path -> printf "    %s\n%!" path)
+  in
+  Stream.lines ic ~f:record;
+  Hashtbl.iter report table
+
+(* Entry point.  Paths are read from the file named by the single optional
+   command-line argument, or from stdin when no argument is given. *)
+let () =
+  let input = ref None in
+  Arg.parse []
+    (fun filename ->
+      (* Only remember the name here.  Opening each argument as it was seen
+         leaked every channel but the last when several were given, and
+         silently ignored the earlier ones; extra arguments are now
+         rejected through [Arg.Bad]. *)
+      match !input with
+      | None ->
+          input := Some filename
+      | Some _ ->
+          raise (Arg.Bad (sprintf "unexpected extra argument %S" filename)))
+    "usage: dupfiles.native [FILE]  (reads paths from stdin if FILE is omitted)";
+  let ic =
+    match !input with
+    | None ->
+        stdin
+    | Some filename ->
+        (try open_in filename
+         with Sys_error e ->
+           eprintf "ERROR: Failed to open %S: %S\n%!" filename e;
+           exit 1)
+  in
+  (* Close the channel even when [main] raises, then re-raise. *)
+  match main ic with
+  | () ->
+      close_in ic
+  | exception e ->
+      close_in_noerr ic;
+      raise e
diff --git a/dupfiles.mli b/dupfiles.mli
new file mode 100644 (file)
index 0000000..e69de29
This page took 0.030695 seconds and 4 git commands to generate.