Practical-Formal-Methods · jiradeto · Apr 6, 2022 · Apr 11, 2022 · Apr 11, 2022 · Apr 11, 2022
diff --git a/common/experiment_utils.py b/common/experiment_utils.py
@@ -72,6 +72,12 @@ def get_oss_fuzz_corpora_filestore_path():
     return posixpath.join(get_experiment_filestore_path(), 'oss_fuzz_corpora')
 
 
+def get_random_seed_corpora_filestore_path():
+    """Returns the filestore path under which random seed corpora are kept."""
+    experiment_filestore = get_experiment_filestore_path()
+    return posixpath.join(experiment_filestore, 'random_seed_corpora')
+
+
 def get_dispatcher_instance_name(experiment: str) -> str:
     """Returns a dispatcher instance name for an experiment."""
     return 'd-%s' % experiment

diff --git a/experiment/resources/runner-startup-script-template.sh b/experiment/resources/runner-startup-script-template.sh
@@ -46,6 +46,7 @@ docker run \
 -e NO_SEEDS={{no_seeds}} \
 -e NO_DICTIONARIES={{no_dictionaries}} \
 -e OSS_FUZZ_CORPUS={{oss_fuzz_corpus}} \
+-e RANDOM_SEED_CORPUS_DIR={{random_seed_corpus_dir}} \
 -e DOCKER_REGISTRY={{docker_registry}} {% if not local_experiment %}-e CLOUD_PROJECT={{cloud_project}} -e CLOUD_COMPUTE_ZONE={{cloud_compute_zone}} {% endif %}\
 -e EXPERIMENT_FILESTORE={{experiment_filestore}} {% if local_experiment %}-v {{experiment_filestore}}:{{experiment_filestore}} {% endif %}\
 -e REPORT_FILESTORE={{report_filestore}} {% if local_experiment %}-v {{report_filestore}}:{{report_filestore}} {% endif %}\

diff --git a/experiment/run_experiment.py b/experiment/run_experiment.py
@@ -22,6 +22,7 @@
 import sys
 import tarfile
 import tempfile
+import zipfile
 from typing import Dict, List
 
 import jinja2
@@ -63,6 +64,9 @@
     'gs://{project}-backup.clusterfuzz-external.appspot.com/corpus/'
     'libFuzzer/{fuzz_target}/public.zip')
 
+# Maximum size in bytes allowed for a single seed corpus file (for AFL).
+CORPUS_ELEMENT_BYTES_LIMIT = 1 * 1024 * 1024
+
 
 def read_and_validate_experiment_config(config_filename: str) -> Dict:
     """Reads |config_filename|, validates it, finds as many errors as possible,
@@ -148,6 +152,48 @@ def get_directories(parent_dir):
     ]
 
 
+# pylint: disable=too-many-locals
+def validate_and_pack_random_seed_corpus(random_seed_corpus_dir, benchmarks):
+    """Validates the user-provided seed corpus and zips it per benchmark.
+
+    Writes |random_seed_corpus_dir|/<benchmark>.zip for each benchmark. Raises
+    ValidationError if a benchmark has no directory or no usable corpus file."""
+    if not os.path.isdir(random_seed_corpus_dir):
+        raise ValidationError('Corpus location "%s" is invalid.' %
+                              random_seed_corpus_dir)
+
+    for benchmark in benchmarks:
+        benchmark_corpus_dir = os.path.join(random_seed_corpus_dir, benchmark)
+        if not os.path.exists(benchmark_corpus_dir):
+            raise ValidationError('Random seed corpus directory for '
+                                  'benchmark "%s" does not exist.' % benchmark)
+        if not os.path.isdir(benchmark_corpus_dir):
+            raise ValidationError('Seed corpus of benchmark "%s" must be '
+                                  'a directory.' % benchmark)
+        if not os.listdir(benchmark_corpus_dir):
+            raise ValidationError('Seed corpus of benchmark "%s" is empty.' %
+                                  benchmark)
+
+        valid_corpus_files = []
+        for root, _, files in os.walk(benchmark_corpus_dir):
+            for filename in files:
+                file_path = os.path.join(root, filename)
+                # Non-regular files (e.g. dangling symlinks) cannot be seeds.
+                if not os.path.isfile(file_path):
+                    continue
+                if 0 < os.path.getsize(file_path) <= CORPUS_ELEMENT_BYTES_LIMIT:
+                    valid_corpus_files.append(file_path)
+        if not valid_corpus_files:
+            raise ValidationError('No valid corpus files for "%s"' % benchmark)
+
+        archive_path = os.path.join(random_seed_corpus_dir, f'{benchmark}.zip')
+        with zipfile.ZipFile(archive_path, 'w') as archive:
+            # Sorted so that the archive layout is reproducible across runs.
+            for file_path in sorted(valid_corpus_files):
+                parent_dir = os.path.join(os.path.dirname(file_path), '..')
+                archive.write(file_path, os.path.relpath(file_path, parent_dir))
+
+
 def validate_benchmarks(benchmarks: List[str]):
     """Parses and validates list of benchmarks."""
     benchmark_types = set()
@@ -220,7 +266,8 @@ def start_experiment(  # pylint: disable=too-many-arguments
         concurrent_builds=None,
         measurers_cpus=None,
         runners_cpus=None,
-        use_branch_coverage=False):
+        use_branch_coverage=False,
+        random_seed_corpus_dir=None):
     """Start a fuzzer benchmarking experiment."""
     if not allow_uncommitted_changes:
         check_no_uncommitted_changes()
@@ -250,6 +297,12 @@ def start_experiment(  # pylint: disable=too-many-arguments
     # 12GB is just the amount that KLEE needs, use this default to make KLEE
     # experiments easier to run.
     config['runner_memory'] = config.get('runner_memory', '12GB')
+
+    config['random_seed_corpus_dir'] = random_seed_corpus_dir
+    if config['random_seed_corpus_dir']:
+        validate_and_pack_random_seed_corpus(config['random_seed_corpus_dir'],
+                                             benchmarks)
+
     return start_experiment_from_full_config(config)
 
 
@@ -332,6 +385,16 @@ def filter_file(tar_info):
         for benchmark in config['benchmarks']:
             add_oss_fuzz_corpus(benchmark, oss_fuzz_corpora_dir)
 
+    if config['random_seed_corpus_dir']:
+        for benchmark in config['benchmarks']:
+            benchmark_corpus_archive_path = os.path.join(
+                config['random_seed_corpus_dir'], f'{benchmark}.zip')
+            filestore_utils.cp(
+                benchmark_corpus_archive_path,
+                experiment_utils.get_random_seed_corpora_filestore_path() + '/',
+                recursive=True,
+                parallel=True)
+
 
 class BaseDispatcher:
     """Class representing the dispatcher."""
@@ -524,6 +587,10 @@ def main():
                         '--runners-cpus',
                         help='Cpus available to the runners.',
                         required=False)
+    parser.add_argument('-rs',
+                        '--random-seed-corpus-dir',
+                        help='Path to the random seed corpus',
+                        required=False)
 
     all_fuzzers = fuzzer_utils.get_fuzzer_names()
     parser.add_argument('-f',
@@ -593,6 +660,14 @@ def main():
         parser.error('The sum of runners and measurers cpus is greater than the'
                      ' available cpu cores (%d)' % os.cpu_count())
 
+    if args.random_seed_corpus_dir:
+        if args.no_seeds:
+            parser.error('Cannot enable options "random_seed_corpus_dir" and '
+                         '"no_seeds" at the same time')
+        if args.oss_fuzz_corpus:
+            parser.error('Cannot enable options "random_seed_corpus_dir" and '
+                         '"oss_fuzz_corpus" at the same time')
+
     start_experiment(args.experiment_name,
                      args.experiment_config,
                      args.benchmarks,
@@ -605,7 +680,8 @@ def main():
                      concurrent_builds=concurrent_builds,
                      measurers_cpus=measurers_cpus,
                      runners_cpus=runners_cpus,
-                     use_branch_coverage=args.use_branch_coverage)
+                     use_branch_coverage=args.use_branch_coverage,
+                     random_seed_corpus_dir=args.random_seed_corpus_dir)
     return 0
 
 

diff --git a/experiment/runner.py b/experiment/runner.py
@@ -27,6 +27,7 @@
 import threading
 import time
 import zipfile
+import random
 
 from common import benchmark_config
 from common import environment
@@ -115,6 +116,20 @@ def get_clusterfuzz_seed_corpus_path(fuzz_target_path):
     return seed_corpus_path if os.path.exists(seed_corpus_path) else None
 
 
+def _unpack_random_seed_corpus(corpus_directory):
+    """Replaces the contents of |corpus_directory| with a single file picked at
+    random from the user-provided seed corpus archive of this benchmark."""
+    shutil.rmtree(corpus_directory)
+    os.mkdir(corpus_directory)
+    benchmark = environment.get('BENCHMARK')
+    filestore_dir = experiment_utils.get_random_seed_corpora_filestore_path()
+    archive_path = posixpath.join(filestore_dir, f'{benchmark}.zip')
+    # NOTE(review): ZipFile needs a local path; verify for gs:// filestores.
+    with zipfile.ZipFile(archive_path) as archive:
+        picked = random.choice(archive.infolist())
+        archive.extract(picked, corpus_directory)
+
+
 def _unpack_clusterfuzz_seed_corpus(fuzz_target_path, corpus_directory):
     """If a clusterfuzz seed corpus archive is available, unpack it into the
     corpus directory if it exists. Copied from unpack_seed_corpus in
@@ -172,7 +187,10 @@ def run_fuzzer(max_total_time, log_filename):
         logs.error('Fuzz target binary not found.')
         return
 
-    _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus)
+    if environment.get('RANDOM_SEED_CORPUS_DIR'):
+        _unpack_random_seed_corpus(input_corpus)
+    else:
+        _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus)
     _clean_seed_corpus(input_corpus)
 
     if max_total_time is None:

diff --git a/experiment/scheduler.py b/experiment/scheduler.py
@@ -717,6 +717,7 @@ def render_startup_script_template(instance_name: str, fuzzer: str,
         'oss_fuzz_corpus': experiment_config['oss_fuzz_corpus'],
         'num_cpu_cores': experiment_config['runner_num_cpu_cores'],
         'cpuset': CPUSET,
+        'random_seed_corpus_dir': experiment_config['random_seed_corpus_dir'],
     }
 
     if not local_experiment:

diff --git a/experiment/test_data/experiment-config.yaml b/experiment/test_data/experiment-config.yaml
@@ -31,6 +31,7 @@ git_hash: "git-hash"
 no_seeds: false
 no_dictionaries: false
 oss_fuzz_corpus: false
+random_seed_corpus_dir: null
 description: "Test experiment"
 concurrent_builds: null
 runners_cpus: null

diff --git a/experiment/test_run_experiment.py b/experiment/test_run_experiment.py
@@ -202,6 +202,7 @@ def test_copy_resources_to_bucket(tmp_path):
         'experiment': 'experiment',
         'benchmarks': ['libxslt_xpath'],
         'oss_fuzz_corpus': True,
+        'random_seed_corpus_dir': None,
     }
     try:
         with mock.patch('common.filestore_utils.cp') as mocked_filestore_cp:

diff --git a/experiment/test_scheduler.py b/experiment/test_scheduler.py
@@ -118,6 +118,7 @@ def test_create_trial_instance(benchmark, expected_image, expected_target,
 -e NO_SEEDS=False \\
 -e NO_DICTIONARIES=False \\
 -e OSS_FUZZ_CORPUS=False \\
+-e RANDOM_SEED_CORPUS_DIR=None \\
 -e DOCKER_REGISTRY=gcr.io/fuzzbench -e CLOUD_PROJECT=fuzzbench -e CLOUD_COMPUTE_ZONE=us-central1-a \\
 -e EXPERIMENT_FILESTORE=gs://experiment-data \\
 -e REPORT_FILESTORE=gs://web-reports \\