: t -> size:int -> unit
val digest
: t -> unit
+ val redundant_data
+ : t -> size:int -> unit
end = struct
type t =
{ considered_files : int ref
; hashed_files : int ref
; hashed_bytes : int ref
; digests : int ref
+ ; redundant_data : int ref
}
let init () =
; unique_sample_files = ref 0
; unique_sample_bytes = ref 0
; digests = ref 0
+ ; redundant_data = ref 0
}
let add sum addend =
let digest t =
incr t.digests
+ let redundant_data t ~size =
+ add t.redundant_data size
+
let report
t
~time_all
time_group_by_digest;
eprintf "Digests : %8d\n%!"
!(t.digests);
- eprintf "Duplicates (Hashed - Digests): %8d\n%!"
- (!(t.hashed_files) - !(t.digests));
+ eprintf "Duplicates (Hashed - Digests): %8d files %6.2f Gb\n%!"
+ (!(t.hashed_files) - !(t.digests))
+ (b_to_gb !(t.redundant_data));
eprintf "Skipped due to 0 size : %8d files\n%!" !(t.empty);
eprintf "Skipped due to unique size : %8d files %6.2f Gb %6.2f seconds\n%!"
!(t.unique_size_files)
type opt =
{ input : input
; output : output
- ; ignore : Str.regexp option
+ ; ignore : string -> bool
; sample : int
}
Stream.filter input ~f:(fun {File.path; size} ->
M.file_considered metrics ~size;
let empty = size = 0 in
+ let ignored = ignore path in
if empty then M.file_empty metrics;
- let ignored =
- match ignore with
- | Some regexp when (Str.string_match regexp path 0) ->
- M.file_ignored metrics ~size;
- true
- | Some _ | None ->
- false
- in
+ if ignored then M.file_ignored metrics ~size;
(not empty) && (not ignored)
)
Stream.iter groups ~f:(fun (d, n, files) ->
M.digest metrics;
- if n > 1 then output d n files
+ (* Only groups with more than one file are duplicates. The explicit
+    [begin ... end] is required: [if c then a; b] parses as
+    [(if c then a); b], which would call [output] for every group,
+    including groups with a single file. *)
+ if n > 1 then begin
+ (* NOTE(review): this counts all [n] copies; the report pairs this figure
+    with "Hashed - Digests" files, which may call for [n - 1] -- confirm. *)
+ M.redundant_data metrics ~size:(n * (List.hd files).File.size);
+ output d n files
+ end
);
let t1_all = Sys.time () in
in
let input = ref Stdin in
let output = ref Stdout in
- let ignore = ref None in
+ let ignore = ref (fun _ -> false) in
let sample = ref 256 in
let spec =
[ ( "-out"
, " Output to this directory instead of stdout."
)
; ( "-ignore"
- , Arg.String (fun regexp -> ignore := Some (Str.regexp regexp))
+ , Arg.String (fun regexp ->
+ let regexp = Str.regexp regexp in
+ ignore := fun string -> Str.string_match regexp string 0)
, " Ignore file paths which match this regexp pattern (see Str module)."
)
; ( "-sample"