X-Git-Url: https://git.xandkar.net/?p=dups.git;a=blobdiff_plain;f=dupfiles.ml;fp=dupfiles.ml;h=0000000000000000000000000000000000000000;hp=8a555e8246b908c9a90ca1b1de43518633fb7933;hb=03db9aee3a879fdb6aef6b40d86b9aca3898a376;hpb=0a1c6f40315940659978d3d576cc7c6415c27d81 diff --git a/dupfiles.ml b/dupfiles.ml deleted file mode 100644 index 8a555e8..0000000 --- a/dupfiles.ml +++ /dev/null @@ -1,113 +0,0 @@ -open Printf - -module Array = ArrayLabels -module List = ListLabels - -module Stream : sig - type 'a t - val lines : in_channel -> string t - val rec_file_paths : root:string -> string t - val iter : 'a t -> f:('a -> unit) -> unit -end = struct - module S = Stream - - type 'a t = - 'a S.t - - let rec_file_paths ~root = - let dirs = Queue.create () in - let files = Queue.create () in - Queue.add root dirs; - let explore parent = - Array.iter (Sys.readdir parent) ~f:(fun child -> - let path = Filename.concat parent child in - let {Unix.st_kind = file_kind; _} = Unix.lstat path in - match file_kind with - | Unix.S_REG -> - Queue.add path files - | Unix.S_DIR -> - Queue.add path dirs - | Unix.S_CHR - | Unix.S_BLK - | Unix.S_LNK - | Unix.S_FIFO - | Unix.S_SOCK -> - () - ) - in - let next_dir () = - match Queue.take dirs with - | exception Queue.Empty -> - () - | dir -> - explore dir - in - let next_file () = - match Queue.take files with - | exception Queue.Empty -> - None - | file_path -> - Some file_path - in - S.from (fun _ -> - next_dir (); - next_file () - ) - - let lines ic = - S.from (fun _ -> - match input_line ic with - | exception End_of_file -> - None - | line -> - Some line - ) - - let iter t ~f = - S.iter f t -end - -type input = - | Root_path of string - | Paths_on_stdin - -let main input = - let paths = - match input with - | Paths_on_stdin -> Stream.lines stdin - | Root_path root -> Stream.rec_file_paths ~root - in - let paths_by_digest = Hashtbl.create 1_000_000 in - let path_count = ref 0 in - let t0 = Sys.time () in - Stream.iter paths ~f:(fun path -> - incr path_count; - try - let digest = Digest.file path in - let paths = - match Hashtbl.find_opt paths_by_digest digest with - | None -> - [] - | Some paths -> - paths - in - Hashtbl.replace paths_by_digest digest (path :: paths) - with Sys_error e -> - eprintf "WARNING: Failed to process %S: %S\n%!" path e - ); - Hashtbl.iter - (fun digest paths -> - let n_paths = List.length paths in - if n_paths > 1 then begin - printf "%s %d\n%!" (Digest.to_hex digest) n_paths; - List.iter paths ~f:(fun path -> printf " %s\n%!" path) - end - ) - paths_by_digest; - let t1 = Sys.time () in - eprintf "Processed %d files in %f seconds.\n%!" !path_count (t1 -. t0) - -let () = - let input = ref Paths_on_stdin in - Arg.parse [] (fun path -> input := Root_path path) ""; - main !input