module Array = ArrayLabels
module List = ListLabels
-module StrSet= Set.Make(String)
+module StrSet = Set.Make(String)
module Unix = UnixLabels
module Stream : sig
val lookup : string Stream.t -> t Stream.t
(** Lookup file info for given paths *)
+
+ module Set : sig include Set.S with type elt := t end
end = struct
type t =
{ path : string
; size : int
}
+ let compare {path=p1; _} {path=p2; _} =
+ Stdlib.compare p1 p2
+
let lookup paths =
Stream.map paths ~f:(fun path ->
let {Unix.st_size = size; _} = Unix.lstat path in
next ()
in
Stream.create next
+
+ module Set = Set.Make(struct
+ type elt = t
+ type t = elt
+ let compare = compare
+ end)
end
type input =
| Stdout
| Directory of string
+type opt =
+ { input : input
+ ; output : output
+ ; ignore : Str.regexp option
+ ; sample : int
+ }
+
type count =
- { considered : int ref
- ; empty : int ref
- ; ignored : int ref
- ; unique_size : int ref
- ; hashed : int ref
+ { considered_files : int ref
+ ; considered_bytes : int ref
+ ; empty : int ref
+ ; ignored_files : int ref
+ ; ignored_bytes : int ref
+ ; unique_size_files : int ref
+ ; unique_size_bytes : int ref
+ ; unique_sample_files : int ref
+ ; unique_sample_bytes : int ref
+ ; sampled_files : int ref
+ ; sampled_bytes : int ref
+ ; hashed_files : int ref
+ ; hashed_bytes : int ref
+ ; digests : int ref
}
+let add sum addend =
+ sum := !sum + addend
+
let make_input_stream input ignore count =
let input =
match input with
Stream.concat (List.map paths ~f:File.find)
in
Stream.filter input ~f:(fun {File.path; size} ->
- incr count.considered;
+ incr count.considered_files;
+ add count.considered_bytes size;
let empty = size = 0 in
let ignored =
match ignore with
| Some regexp when (Str.string_match regexp path 0) ->
+ incr count.ignored_files;
+ add count.ignored_bytes size;
true
| Some _ | None ->
false
in
if empty then incr count.empty;
- if ignored then incr count.ignored;
(not empty) && (not ignored)
)
let make_output_fun = function
| Stdout ->
- fun digest n_paths paths ->
- printf "%s %d\n%!" (Digest.to_hex digest) n_paths;
- List.iter (StrSet.elements paths) ~f:(printf " %S\n%!")
+ fun digest n_files files ->
+ printf "%s %d\n%!" (Digest.to_hex digest) n_files;
+ List.iter (File.Set.elements files) ~f:(fun {File.path; _} ->
+ printf " %S\n%!" path
+ )
| Directory dir ->
- fun digest _ paths ->
+ fun digest _ files ->
let digest = Digest.to_hex digest in
let dir = Filename.concat dir (String.sub digest 0 2) in
Unix.mkdir dir ~perm:0o700;
let oc = open_out (Filename.concat dir digest) in
- List.iter (StrSet.elements paths) ~f:(fun path ->
+ List.iter (File.Set.elements files) ~f:(fun {File.path; _} ->
output_string oc (sprintf "%S\n%!" path)
);
close_out oc
-let main input output ignore =
+let sample path ~len ~count =
+ let buf = Bytes.make len ' ' in
+ let ic = open_in_bin path in
+ let rec read pos len =
+ assert (len >= 0);
+ if len = 0 then
+ ()
+ else begin
+ let chunk_size = input ic buf pos len in
+ add count.sampled_bytes chunk_size;
+ if chunk_size = 0 then (* EOF *)
+ ()
+ else
+ read (pos + chunk_size) (len - chunk_size)
+ end
+ in
+ read 0 len;
+ close_in ic;
+ Bytes.to_string buf
+
+let main {input; output; ignore; sample = sample_len} =
let t0 = Sys.time () in
let count =
- { considered = ref 0
- ; empty = ref 0
- ; ignored = ref 0
- ; unique_size = ref 0
- ; hashed = ref 0
+ { considered_files = ref 0
+ ; considered_bytes = ref 0
+ ; empty = ref 0
+ ; ignored_files = ref 0
+ ; ignored_bytes = ref 0
+ ; unique_size_files = ref 0
+ ; unique_size_bytes = ref 0
+ ; sampled_files = ref 0
+ ; sampled_bytes = ref 0
+ ; hashed_files = ref 0
+ ; hashed_bytes = ref 0
+ ; unique_sample_files = ref 0
+ ; unique_sample_bytes = ref 0
+ ; digests = ref 0
}
in
let output = make_output_fun output in
let input = make_input_stream input ignore count in
- let paths_by_size = Hashtbl.create 1_000_000 in
- let paths_by_digest = Hashtbl.create 1_000_000 in
- let process tbl path ~f =
- let key = f path in
- let count, paths =
- match Hashtbl.find_opt tbl key with
+ let files_by_size = Hashtbl.create 1_000_000 in
+ let files_by_sample = Hashtbl.create 1_000_000 in
+ let files_by_digest = Hashtbl.create 1_000_000 in
+ let process tbl ~group ~file =
+ let count, files =
+ match Hashtbl.find_opt tbl group with
| None ->
- (0, StrSet.empty)
- | Some (n, paths) ->
- (n, paths)
+ (0, File.Set.empty)
+ | Some (n, files) ->
+ (n, files)
in
- Hashtbl.replace tbl key (count + 1, StrSet.add path paths)
+ Hashtbl.replace tbl group (count + 1, File.Set.add file files)
in
- Stream.iter input ~f:(fun {File.path; size} ->
- process paths_by_size path ~f:(fun _ -> size)
+ (* TODO: Make a nice(r) abstraction to re-assemble pieces in the pipeline:
+ *
+ * from input to files_by_size
+ * from files_by_size to files_by_sample
+ * from files_by_sample to files_by_digest
+ * from files_by_digest to output
+ *
+ * input |> files_by_size |> files_by_sample |> files_by_digest |> output
+ *)
+ let t0_group_by_size = Sys.time () in
+ Stream.iter input ~f:(fun ({File.size; _} as file) ->
+ process files_by_size ~group:size ~file
);
+ let t1_group_by_size = Sys.time () in
+ let t0_group_by_sample = Sys.time () in
Hashtbl.iter
- (fun _ (n, paths) ->
+ (fun _ (n, files) ->
(* Skip files with unique sizes *)
if n > 1 then
- StrSet.iter
- (fun path ->
- incr count.hashed;
- process paths_by_digest path ~f:Digest.file
+ File.Set.iter
+ (fun ({File.path; _} as file) ->
+ incr count.sampled_files;
+ process
+ files_by_sample
+ ~group:(sample path ~len:sample_len ~count)
+ ~file
)
- paths
+ files
else
- incr count.unique_size;
+ File.Set.iter
+ (fun {File.size; _} ->
+ incr count.unique_size_files;
+ add count.unique_size_bytes size
+ )
+ files
)
- paths_by_size;
- Hashtbl.iter (fun d (n, ps) -> if n > 1 then output d n ps) paths_by_digest;
+ files_by_size;
+ let t1_group_by_sample = Sys.time () in
+ let t0_group_by_digest = Sys.time () in
+ Hashtbl.iter
+ (fun _ (n, files) ->
+ (* Skip files with unique samples *)
+ if n > 1 then
+ File.Set.iter
+ (fun ({File.path; size} as file) ->
+ incr count.hashed_files;
+ add count.hashed_bytes size;
+ process files_by_digest ~group:(Digest.file path) ~file
+ )
+ files
+ else
+ File.Set.iter
+ (fun {File.size; _} ->
+ incr count.unique_sample_files;
+ add count.unique_sample_bytes size;
+ )
+ files
+ )
+ files_by_sample;
+ let t1_group_by_digest = Sys.time () in
+ Hashtbl.iter
+ (fun d (n, files) ->
+ incr count.digests;
+ if n > 1 then
+ output d n files
+ )
+ files_by_digest;
let t1 = Sys.time () in
- eprintf "Time : %f seconds\n%!" (t1 -. t0);
- eprintf "Considered : %d\n%!" !(count.considered);
- eprintf "Hashed : %d\n%!" !(count.hashed);
- eprintf "Skipped due to 0 size : %d\n%!" !(count.empty);
- eprintf "Skipped due to unique size : %d\n%!" !(count.unique_size);
- eprintf "Ignored due to regex match : %d\n%!" !(count.ignored)
+ let b_to_mb b = (float_of_int b) /. 1024. /. 1024. in
+ let b_to_gb b = (b_to_mb b) /. 1024. in
+ eprintf "Time : %8.2f seconds\n%!" (t1 -. t0);
+ eprintf "Considered : %8d files %6.2f Gb\n%!"
+ !(count.considered_files)
+ (b_to_gb !(count.considered_bytes));
+ eprintf "Sampled : %8d files %6.2f Gb\n%!"
+ !(count.sampled_files)
+ (b_to_gb !(count.sampled_bytes));
+ eprintf "Hashed : %8d files %6.2f Gb %6.2f seconds\n%!"
+ !(count.hashed_files)
+ (b_to_gb !(count.hashed_bytes))
+ (t1_group_by_digest -. t0_group_by_digest);
+ eprintf "Digests : %8d\n%!"
+ !(count.digests);
+ eprintf "Duplicates (Hashed - Digests): %8d\n%!"
+ (!(count.hashed_files) - !(count.digests));
+ eprintf "Skipped due to 0 size : %8d files\n%!" !(count.empty);
+ eprintf "Skipped due to unique size : %8d files %6.2f Gb %6.2f seconds\n%!"
+ !(count.unique_size_files)
+ (b_to_gb !(count.unique_size_bytes))
+ (t1_group_by_size -. t0_group_by_size);
+ eprintf "Skipped due to unique sample : %8d files %6.2f Gb %6.2f seconds\n%!"
+ !(count.unique_sample_files)
+ (b_to_gb !(count.unique_sample_bytes))
+ (t1_group_by_sample -. t0_group_by_sample);
+ eprintf "Ignored due to regex match : %8d files %6.2f Gb\n%!"
+ !(count.ignored_files)
+ (b_to_gb !(count.ignored_bytes))
-let () =
- let input = ref Stdin in
- let output = ref Stdout in
- let ignore = ref None in
- let assert_file_exists path =
- if Sys.file_exists path then
- ()
- else begin
- eprintf "File does not exist: %S\n%!" path;
+let get_opt () : opt =
+ let assert_ test x msg =
+ if not (test x) then begin
+ eprintf "%s\n%!" msg;
exit 1
end
in
+ let assert_file_exists path =
+ assert_ Sys.file_exists path (sprintf "File does not exist: %S" path)
+ in
let assert_file_is_dir path =
- if Sys.is_directory path then
- ()
- else begin
- eprintf "File is not a directory: %S\n%!" path;
- exit 1
- end
+ assert_ Sys.is_directory path (sprintf "File is not a directory: %S" path)
in
+ let input = ref Stdin in
+ let output = ref Stdout in
+ let ignore = ref None in
+ let sample = ref 256 in
let spec =
[ ( "-out"
, Arg.String (fun path ->
, Arg.String (fun regexp -> ignore := Some (Str.regexp regexp))
, " Ignore file paths which match this regexp pattern (see Str module)."
)
+ ; ( "-sample"
+ , Arg.Set_int sample
+ , (sprintf " Byte size of file samples to use. Default: %d" !sample)
+ )
]
in
Arg.parse
input := Directories (path :: paths)
)
"";
- main !input !output !ignore
+ assert_
+ (fun x -> x > 0)
+ !sample
+ (sprintf "Sample size cannot be negative: %d" !sample);
+ { input = !input
+ ; output = !output
+ ; ignore = !ignore
+ ; sample = !sample
+ }
+
+let () =
+ main (get_opt ())