From 389dccafe4b8d3353db36f0cab3de5b9ea8de9ce Mon Sep 17 00:00:00 2001
From: Siraaj Khandkar
Date: Sun, 25 Nov 2018 23:55:09 -0500
Subject: [PATCH 1/1] Count redundant data size

---
Review notes (this section is ignored by git am):

  * Metrics gains a redundant_data counter (signature entry, record
    field, initialiser and accumulator function), and the report prints
    it, converted with b_to_gb, next to the duplicate-file count.
  * In main, the body of "if n > 1" is wrapped in begin/end. In OCaml
    "if c then a; b" parses as "(if c then a); b", so without the
    wrapper "output d n files" would run for every digest group,
    including groups that contain a single, unique file.
  * The accumulated size is (n - 1) * size rather than n * size: one
    copy of each group has to be kept, so only the remaining n - 1 are
    redundant. This also matches the file count printed on the same
    report line (Hashed - Digests, i.e. n - 1 per group).
  * The post-image blob id on the "index" line predates these two
    adjustments.

 dups.ml | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/dups.ml b/dups.ml
index 14132a6..a2e0181 100644
--- a/dups.ml
+++ b/dups.ml
@@ -36,6 +36,8 @@ module Metrics : sig
     : t -> size:int -> unit
   val digest
     : t -> unit
+  val redundant_data
+    : t -> size:int -> unit
 end = struct
   type t =
     { considered_files : int ref
@@ -52,6 +54,7 @@ end = struct
     ; hashed_files : int ref
     ; hashed_bytes : int ref
     ; digests : int ref
+    ; redundant_data : int ref
     }
 
   let init () =
@@ -69,6 +72,7 @@ end = struct
     ; unique_sample_files = ref 0
     ; unique_sample_bytes = ref 0
     ; digests = ref 0
+    ; redundant_data = ref 0
     }
 
   let add sum addend =
@@ -106,6 +110,9 @@ end = struct
   let digest t =
     incr t.digests
 
+  let redundant_data t ~size =
+    add t.redundant_data size
+
   let report
       t
       ~time_all
@@ -129,8 +136,9 @@ end = struct
       time_group_by_digest;
     eprintf "Digests : %8d\n%!"
       !(t.digests);
-    eprintf "Duplicates (Hashed - Digests): %8d\n%!"
-      (!(t.hashed_files) - !(t.digests));
+    eprintf "Duplicates (Hashed - Digests): %8d files %6.2f Gb\n%!"
+      (!(t.hashed_files) - !(t.digests))
+      (b_to_gb !(t.redundant_data));
     eprintf "Skipped due to 0 size : %8d files\n%!" !(t.empty);
     eprintf "Skipped due to unique size : %8d files %6.2f Gb %6.2f seconds\n%!"
       !(t.unique_size_files)
@@ -454,7 +462,14 @@ let main {input; output; ignore; sample = sample_len} =
 
   Stream.iter groups ~f:(fun (d, n, files) ->
     M.digest metrics;
-    if n > 1 then output d n files
+    (* begin/end is required: without it only the first statement is
+       guarded and [output] would run for unique files too. *)
+    if n > 1 then begin
+      (* One copy is the original; only the other n - 1 are redundant. *)
+      M.redundant_data metrics
+        ~size:((n - 1) * (List.hd files).File.size);
+      output d n files
+    end
   );
 
   let t1_all = Sys.time () in
-- 
2.20.1