X-Git-Url: https://git.xandkar.net/?p=dups.git;a=blobdiff_plain;f=dups.ml;h=7c03773b48030904b1c5d100e5c840f014ae4340;hp=14132a6d72bc23b43425f01f4d95af250c92faac;hb=9d01fa28f425150b98f7759d6ff6dd646a41e41a;hpb=1013fbcdb1a09c416c45ad419bc1316de82d89d6 diff --git a/dups.ml b/dups.ml index 14132a6..7c03773 100644 --- a/dups.ml +++ b/dups.ml @@ -36,6 +36,8 @@ module Metrics : sig : t -> size:int -> unit val digest : t -> unit + val redundant_data + : t -> size:int -> unit end = struct type t = { considered_files : int ref @@ -52,6 +54,7 @@ end = struct ; hashed_files : int ref ; hashed_bytes : int ref ; digests : int ref + ; redundant_data : int ref } let init () = @@ -69,6 +72,7 @@ end = struct ; unique_sample_files = ref 0 ; unique_sample_bytes = ref 0 ; digests = ref 0 + ; redundant_data = ref 0 } let add sum addend = @@ -106,6 +110,9 @@ end = struct let digest t = incr t.digests + let redundant_data t ~size = + add t.redundant_data size + let report t ~time_all @@ -129,8 +136,9 @@ end = struct time_group_by_digest; eprintf "Digests : %8d\n%!" !(t.digests); - eprintf "Duplicates (Hashed - Digests): %8d\n%!" - (!(t.hashed_files) - !(t.digests)); + eprintf "Duplicates (Hashed - Digests): %8d files %6.2f Gb\n%!" + (!(t.hashed_files) - !(t.digests)) + (b_to_gb !(t.redundant_data)); eprintf "Skipped due to 0 size : %8d files\n%!" !(t.empty); eprintf "Skipped due to unique size : %8d files %6.2f Gb %6.2f seconds\n%!" 
!(t.unique_size_files) @@ -372,7 +380,7 @@ type output = type opt = { input : input ; output : output - ; ignore : Str.regexp option + ; ignore : string -> bool ; sample : int } @@ -388,15 +396,9 @@ let make_input_stream input ignore ~metrics = Stream.filter input ~f:(fun {File.path; size} -> M.file_considered metrics ~size; let empty = size = 0 in + let ignored = ignore path in if empty then M.file_empty metrics; - let ignored = - match ignore with - | Some regexp when (Str.string_match regexp path 0) -> - M.file_ignored metrics ~size; - true - | Some _ | None -> - false - in + if ignored then M.file_ignored metrics ~size; (not empty) && (not ignored) ) @@ -454,7 +456,11 @@ let main {input; output; ignore; sample = sample_len} = Stream.iter groups ~f:(fun (d, n, files) -> M.digest metrics; - if n > 1 then output d n files + (* Keep both effects under the guard: ';' binds looser than 'if', so without begin/end the output call would run for every digest group. *) + if n > 1 then begin + M.redundant_data metrics ~size:(n * (List.hd files).File.size); + output d n files + end ); let t1_all = Sys.time () in @@ -480,7 +486,7 @@ let get_opt () : opt = in let input = ref Stdin in let output = ref Stdout in - let ignore = ref None in + let ignore = ref (fun _ -> false) in let sample = ref 256 in let spec = [ ( "-out" @@ -492,7 +498,9 @@ let get_opt () : opt = , " Output to this directory instead of stdout." ) ; ( "-ignore" - , Arg.String (fun regexp -> ignore := Some (Str.regexp regexp)) + , Arg.String (fun regexp -> + let regexp = Str.regexp regexp in + ignore := fun string -> Str.string_match regexp string 0) , " Ignore file paths which match this regexp pattern (see Str module)." ) ; ( "-sample"