12 changes: 6 additions & 6 deletions deduplication/__main__.py
@@ -6,20 +6,20 @@
if args.mode == "bloom":
if args.single:
assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"
-dedup_single_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing)
+dedup_single_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, args.skip_minhashing, clear=args.clear)
elif args.multi:
-dedup_multi_bloom(args.input, args.minhash_dir, args.num, args.fp, args.output_file, args.name, args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing)
+dedup_multi_bloom(args.input, args.minhash_dir, args.num, args.fp, args.output_file, args.name, args.sim_threshold, args.num_perm, args.save_dir, args.skip_minhashing, clear=args.clear)
else:
assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"
-dedup_single_file_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing)
+dedup_single_file_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, args.skip_minhashing, clear=args.clear)
else:
if args.single:
assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"
-dedup_single_lsh(args.input[0], args.minhash_dir[0], args.output_file, args.name[0], args.sim_threshold, args.num_perm, redis_port=args.redis_port, compute_minhashes=not args.skip_minhashing)
+dedup_single_lsh(args.input[0], args.minhash_dir[0], args.output_file, args.name[0], args.sim_threshold, args.num_perm, redis_port=args.redis_port, skip_minhashing=args.skip_minhashing)
elif args.multi:
-dedup_multi_lsh(args.input, args.minhash_dir, args.output_file, args.name, args.sim_threshold, args.num_perm, redis_port=args.redis_port, compute_minhashes=not args.skip_minhashing)
+dedup_multi_lsh(args.input, args.minhash_dir, args.output_file, args.name, args.sim_threshold, args.num_perm, redis_port=args.redis_port, skip_minhashing=args.skip_minhashing)
else:
assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"
-dedup_single_file_lsh(args.input[0], args.minhash_dir[0], args.output_file, args.name[0], args.sim_threshold, args.num_perm, redis_port=args.redis_port, compute_minhashes=not args.skip_minhashing)
+dedup_single_file_lsh(args.input[0], args.minhash_dir[0], args.output_file, args.name[0], args.sim_threshold, args.num_perm, redis_port=args.redis_port, skip_minhashing=args.skip_minhashing)
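For orientation, here is a hedged sketch of what the updated bloom/`--single` branch amounts to when driven directly from Python instead of the CLI. The paths and corpus name are made up, and the meanings of the `num`/`fp` positionals (item count and false-positive rate) are assumptions based on their names; only the keyword names `skip_minhashing` and `clear` come from this diff.

```python
# Hypothetical direct call mirroring the updated dispatch in __main__.py.
from deduplication.workflows import dedup_single_bloom

dedup_single_bloom(
    "data/corpus_a/",        # input_dir (illustrative path)
    "minhashes/corpus_a/",   # minhash_dir (illustrative path)
    1_000_000,               # num -- assumed to be the expected item count
    0.001,                   # fp -- assumed to be the Bloom filter false-positive rate
    "duplicates.csv",        # output_file (illustrative)
    "corpus_a",              # corpus name (illustrative)
    sim_threshold=0.8,
    n_hash_funcs=128,
    save_dir="./bloom_state/",
    skip_minhashing=False,   # False => MinHashes are computed as part of the run
    clear=True,              # wipe stale .bf/.csv files in save_dir before starting
)
```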


2 changes: 2 additions & 0 deletions deduplication/args.py
@@ -51,11 +51,13 @@ def parse_args():
"--sim-threshold",
help="Jaccard Similarity threshold for deduplication, should be in [0, 1]. Default is 0.8",
default=0.8,
+type=float,
)
parser.add_argument(
"--num-perm",
help="Number of hash functions for MinHashing. Default is 128",
default=128,
+type=int,
)
parser.add_argument(
"--mode",
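The two added `type=` arguments matter because argparse only applies a type to values supplied on the command line; without one, the Python default stays a float/int but any user-supplied value arrives as a string. A minimal sketch of the difference, using the flag spellings from args.py:

```python
import argparse

p = argparse.ArgumentParser()
p.add_argument("--sim-threshold", default=0.8)        # pre-PR: no type=, CLI values stay strings
p.add_argument("--num-perm", default=128, type=int)   # post-PR: CLI values are converted

ns = p.parse_args(["--sim-threshold", "0.9", "--num-perm", "256"])
print(type(ns.sim_threshold), ns.sim_threshold)   # <class 'str'> 0.9 -- numeric comparisons would misbehave
print(type(ns.num_perm), ns.num_perm)             # <class 'int'> 256
```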
25 changes: 13 additions & 12 deletions deduplication/workflows.py
@@ -17,7 +17,7 @@ def dedup_single_lsh(
n_hash_funcs: int = 128,
redis_name: str = b"tpc",
redis_port: int = 6379,
-compute_minhashes: bool = True,
+skip_minhashing: bool = False,
):
lsh_params = {
"threshold": sim_threshold,
@@ -29,7 +29,7 @@
},
}

-if compute_minhashes:
+if not skip_minhashing:
m = MinHasher(input_dir, minhash_dir, n_hash_funcs)
m.process()
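Seen from a caller's side, only the keyword changed; the default behaviour (MinHash first, then LSH dedup) is the same. A hedged usage sketch with made-up paths, assuming a Redis server is reachable on the given port:

```python
from deduplication.workflows import dedup_single_lsh

dedup_single_lsh(
    "data/corpus_a/",        # input_dir (illustrative)
    "minhashes/corpus_a/",   # minhash_dir (illustrative)
    "duplicates_a.csv",      # output_file (illustrative)
    "corpus_a",              # corpus name (illustrative)
    sim_threshold=0.8,
    n_hash_funcs=128,
    redis_port=6379,
    skip_minhashing=True,    # reuse MinHashes already written to minhash_dir
)
```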

@@ -48,7 +48,7 @@ def dedup_multi_lsh(
n_hash_funcs: int = 128,
redis_name: str = b"tpc",
redis_port: int = 6379,
-compute_minhashes: bool = True,
+skip_minhashing: bool = False,
):
assert len(input_dirs) == len(minhash_dirs) == len(corpus_names), \
f"Expected len(input_dirs) == len(minhash_dirs) == len(corpus_names), got {len(input_dirs)}, {len(minhash_dirs)}, {len(corpus_names)}"
@@ -63,7 +63,7 @@
n_hash_funcs,
redis_name,
redis_port,
-compute_minhashes,
+skip_minhashing,
)


@@ -76,7 +76,7 @@ def dedup_single_file_lsh(
n_hash_funcs: int = 128,
redis_name: str = b"tpc",
redis_port: int = 6379,
-compute_minhashes: bool = True,
+skip_minhashing: bool = False,
):
lsh_params = {
"threshold": sim_threshold,
@@ -88,7 +88,7 @@
},
}

-if compute_minhashes:
+if not skip_minhashing:
m = MinHasher(None, minhash_dir, n_hash_funcs)
m.compute_minhash_for_file(input_file)

@@ -105,6 +105,7 @@ def clear_dir(save_dir):
if os.path.exists(save_dir):
rm_files = [os.path.join(save_dir, f) for f in os.listdir(save_dir) if ".bf" in f or '.csv' in f]
for f in rm_files:
print(f"Clearing {f}...")
os.remove(f)
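clear_dir deletes every file in save_dir whose name contains ".bf" or ".csv" (Bloom filter state and result CSVs) and, with this change, logs each removal. A throwaway illustration, assuming clear_dir is importable from deduplication.workflows:

```python
import os
import tempfile

from deduplication.workflows import clear_dir

save_dir = tempfile.mkdtemp()
for name in ("corpus_a.bf", "duplicates.csv", "notes.txt"):
    open(os.path.join(save_dir, name), "w").close()

clear_dir(save_dir)                  # prints "Clearing .../corpus_a.bf..." and ".../duplicates.csv..."
print(sorted(os.listdir(save_dir)))  # ['notes.txt'] -- only non-.bf/.csv files survive
```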


@@ -119,7 +120,7 @@ def dedup_single_bloom(
sim_threshold: float = 0.8,
n_hash_funcs: int = 128,
save_dir: str = "./",
-compute_minhashes: bool = True,
+skip_minhashing: bool = False,
clear: bool = False,
):
if clear:
@@ -133,7 +134,7 @@
"save_dir": save_dir
}

-if compute_minhashes:
+if not skip_minhashing:
m = MinHasher(input_dir, minhash_dir, n_hash_funcs)
m.process()

@@ -153,7 +154,7 @@ def dedup_multi_bloom(
sim_threshold: float = 0.8,
n_hash_funcs: int = 128,
save_dir: str = "./",
-compute_minhashes: bool = True,
+skip_minhashing: bool = False,
clear: bool = False,
):
assert len(input_dirs) == len(minhash_dirs) == len(corpus_names), \
@@ -173,7 +174,7 @@
sim_threshold,
n_hash_funcs,
save_dir,
-compute_minhashes,
+skip_minhashing,
clear=False
)
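One detail worth calling out in dedup_multi_bloom: save_dir is cleared at most once, up front, and every per-corpus call receives clear=False, so Bloom filter state accumulated from earlier corpora is not wiped partway through the run. A sketch of that pattern; the wrapper's surrounding loop and parameter plumbing are reconstructed here as a hypothetical helper, not copied verbatim from the hidden lines:

```python
from deduplication.workflows import clear_dir, dedup_single_bloom

def dedup_many_bloom(input_dirs, minhash_dirs, num, fp, output_file, corpus_names,
                     sim_threshold=0.8, n_hash_funcs=128, save_dir="./",
                     skip_minhashing=False, clear=False):
    if clear:
        clear_dir(save_dir)  # wipe stale .bf/.csv state once, before any corpus runs
    for input_dir, minhash_dir, name in zip(input_dirs, minhash_dirs, corpus_names):
        dedup_single_bloom(
            input_dir, minhash_dir, num, fp, output_file, name,
            sim_threshold, n_hash_funcs, save_dir, skip_minhashing,
            clear=False,  # never re-clear inside the loop
        )
```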

@@ -187,7 +188,7 @@ def dedup_single_file_bloom(
sim_threshold: float = 0.8,
n_hash_funcs: int = 128,
save_dir: str = "./",
-compute_minhashes: bool = True,
+skip_minhashing: bool = False,
clear: bool = False,
):
if clear:
@@ -201,7 +202,7 @@
"save_dir": save_dir
}

-if compute_minhashes:
+if not skip_minhashing:
m = MinHasher(None, minhash_dir, n_hash_funcs)
m.compute_minhash_for_file(input_file)
