X-Git-Url: https://git.xandkar.net/?p=dups.git;a=blobdiff_plain;f=dups.ml;h=a2e0181b5ae84cad13245337df20cb989395fd25;hp=14132a6d72bc23b43425f01f4d95af250c92faac;hb=389dccafe4b8d3353db36f0cab3de5b9ea8de9ce;hpb=1013fbcdb1a09c416c45ad419bc1316de82d89d6 diff --git a/dups.ml b/dups.ml index 14132a6..a2e0181 100644 --- a/dups.ml +++ b/dups.ml @@ -36,6 +36,8 @@ module Metrics : sig : t -> size:int -> unit val digest : t -> unit + val redundant_data + : t -> size:int -> unit end = struct type t = { considered_files : int ref @@ -52,6 +54,7 @@ end = struct ; hashed_files : int ref ; hashed_bytes : int ref ; digests : int ref + ; redundant_data : int ref } let init () = @@ -69,6 +72,7 @@ end = struct ; unique_sample_files = ref 0 ; unique_sample_bytes = ref 0 ; digests = ref 0 + ; redundant_data = ref 0 } let add sum addend = @@ -106,6 +110,9 @@ end = struct let digest t = incr t.digests + let redundant_data t ~size = + add t.redundant_data size + let report t ~time_all @@ -129,8 +136,9 @@ end = struct time_group_by_digest; eprintf "Digests : %8d\n%!" !(t.digests); - eprintf "Duplicates (Hashed - Digests): %8d\n%!" - (!(t.hashed_files) - !(t.digests)); + eprintf "Duplicates (Hashed - Digests): %8d files %6.2f Gb\n%!" + (!(t.hashed_files) - !(t.digests)) + (b_to_gb !(t.redundant_data)); eprintf "Skipped due to 0 size : %8d files\n%!" !(t.empty); eprintf "Skipped due to unique size : %8d files %6.2f Gb %6.2f seconds\n%!" !(t.unique_size_files) @@ -454,7 +462,9 @@ let main {input; output; ignore; sample = sample_len} = Stream.iter groups ~f:(fun (d, n, files) -> M.digest metrics; - if n > 1 then output d n files + if n > 1 then + M.redundant_data metrics ~size:(n * (List.hd files).File.size); + output d n files ); let t1_all = Sys.time () in