2 changes: 1 addition & 1 deletion .pylintrc
@@ -123,7 +123,7 @@ no-docstring-rgx=__.*__
 [FORMAT]
 
 # Maximum number of characters on a single line.
-max-line-length=130
+max-line-length=150
 
 # Maximum number of lines in a module
 max-module-lines=1000
40 changes: 40 additions & 0 deletions README.md
@@ -15,6 +15,7 @@ Table of Contents
* [bind_condor.sh](#bind_condorsh)
  * [Usage](#usage-1)
  * [Setting up bindings](#setting-up-bindings)
* [get_files_on_disk.py](#get_files_on_diskpy)
* [tunn](#tunn)
  * [Detailed usage](#detailed-usage)
  * [Web browser usage](#web-browser-usage)
@@ -214,6 +215,45 @@ In this particular case, it is necessary to upgrade `pip` because the Python ver
**NOTE**: These recipes only install the bindings for Python3. (Python2 was still the default in `CMSSW_10_6_X`.)
You will need to make sure any scripts using the bindings are compatible with Python3.

## `get_files_on_disk.py`

This script automates the process of querying Rucio to find only the files in a CMS data or MC sample that are currently hosted on disk.
(The most general form of this functionality is not currently available from other CMS database tools such as `dasgoclient`.)

There are two major use cases for this tool:
1. Finding AOD (or earlier formats such as RECO or RAW) files for testing or development. (AOD samples are not hosted on disk by default, so typically only small subsets of a sample will be transferred to disk for temporary usage.)
2. Obtaining file lists for premixed pileup samples for private MC production. (Premixed pileup input samples are no longer fully hosted on disk because of resource limitations.)

A fraction of each premixed pileup sample is subscribed to disk by the central production team, and the corresponding list of files is synced to cvmfs.
By default, this script simply copies this cached information.
This is the most stable and preferred approach; deviate from it only if absolutely necessary.
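
As an illustration, the mapping file on cvmfs pairs each dataset name with the file holding its cached list, one whitespace-separated pair per line. This format is inferred from the `getCache` parsing logic added in this PR; the dataset and file names below are hypothetical:
```
/Neutrino_E-10_gun/ExampleCampaign_PremixRAW-v1/GEN-SIM-DIGI-RAW Neutrino_E-10_gun_ExampleCampaign.txt
```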

This script should *not* be run in batch jobs, as that can lead to an inadvertent distributed denial-of-service disruption of the CMS data management system.
The script will actively try to prevent you from running it in batch jobs.
Please run the script locally before submitting your jobs, and ship the resulting information as part of the job input files, e.g. as sketched below.
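
For instance, a job payload could read a list generated beforehand with the `-o` option instead of querying Rucio itself. A minimal sketch, assuming the list was saved as `pileup_files.txt` (a hypothetical name) and included with the job inputs:
```python
# Minimal sketch: consume a pre-generated file list inside a job,
# rather than calling get_files_on_disk.py from the job itself.
# "pileup_files.txt" is a hypothetical name produced with the -o option.
with open("pileup_files.txt") as flist:
    input_files = [line.strip() for line in flist if line.strip()]
print(f"Running over {len(input_files)} input files")
```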

The available options for this script are:
```
usage: get_files_on_disk.py [-h] [-a [ALLOW ...] | -b [BLOCK ...]] [-o OUTFILE] [-u USER] [-v] [--no-cache] dataset

Find all available files (those hosted on disk) for a given dataset

positional arguments:
  dataset               dataset to query

optional arguments:
  -h, --help            show this help message and exit
  -a [ALLOW ...], --allow [ALLOW ...]
                        allow only these sites (default: None)
  -b [BLOCK ...], --block [BLOCK ...]
                        block these sites (default: None)
  -o OUTFILE, --outfile OUTFILE
                        write to this file instead of stdout (default: None)
  -u USER, --user USER  username for rucio (default: [user])
  -v, --verbose         print extra information (site list) (default: False)
  --no-cache            do not use cached file lists from cvmfs (default: False)
```
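
For example, the following invocation (with a hypothetical premixed pileup dataset name) writes the list of on-disk files to `pileup_files.txt`:
```
python3 get_files_on_disk.py -o pileup_files.txt /Neutrino_E-10_gun/ExampleCampaign_PremixRAW-v1/GEN-SIM-DIGI-RAW
```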

## `tunn`

A simple utility to create and manage SSH tunnels.
46 changes: 42 additions & 4 deletions get_files_on_disk.py
@@ -62,11 +62,48 @@ def sitecond(site):
     sys.path.pop(0)
     return filelist, sitelist
 
-def main(dataset, user, outfile=None, verbose=False, allow=None, block=None):
+def getCache(dataset, verbose=False):
+    """Gets cached file lists from cvmfs for pileup samples"""
+    filelist = None
+    cache_dir = "/cvmfs/cms.cern.ch/offcomp-prod/premixPUlist/"
+    cache_map_file = "pileup_mapping.txt"
+    cache_map_path = os.path.join(cache_dir, cache_map_file)
+    if os.path.isfile(cache_map_path):
+        cache_map = {}
+        with open(cache_map_path, 'r') as mapfile: # pylint: disable=unspecified-encoding
+            for line in mapfile:
+                line = line.rstrip()
+                linesplit = line.split()
+                if len(linesplit)==2:
+                    cache_map[linesplit[0]] = linesplit[1]
+
+        if dataset in cache_map:
+            cache_file = cache_map[dataset]
+            cache_file_path = os.path.join(cache_dir, cache_file)
+            if verbose:
+                print(f"Loading from cache: {cache_file_path}")
+            with open(cache_file_path, 'r') as cfile: # pylint: disable=unspecified-encoding
+                filelist = [line.rstrip() for line in cfile]
+
+    return filelist
+
+def main(dataset, user, outfile=None, verbose=False, allow=None, block=None, cache=True):
     """Prints file list and site list"""
-    filelist, sitelist = getHosted(dataset, user, allow=allow, block=block)
+    filelist = None
+    sitelist = None
 
-    if verbose:
+    if cache:
+        if not allow and not block:
+            filelist = getCache(dataset, verbose)
+        # cache does not consider allow or block lists, so disable if they are requested
+        else:
+            if verbose:
+                print("Disabling cache because allow and/or block lists are specified")
+
+    if not filelist:
+        filelist, sitelist = getHosted(dataset, user, allow=allow, block=block)
+
+    if verbose and sitelist:
         print("Site list:")
         print("\n".join(f'{k}: {v}' for k,v in sitelist.items()))
 
@@ -86,7 +123,8 @@ def main(dataset, user, outfile=None, verbose=False, allow=None, block=None):
     parser.add_argument("-o","--outfile",type=str,default=None,help="write to this file instead of stdout")
     parser.add_argument("-u","--user",type=str,default=default_user,help="username for rucio")
     parser.add_argument("-v","--verbose",default=False,action="store_true",help="print extra information (site list)")
+    parser.add_argument("--no-cache",default=False,action="store_true",help="do not use cached file lists from cvmfs")
     parser.add_argument("dataset",type=str,help="dataset to query")
     args = parser.parse_args()
 
-    main(args.dataset, args.user, outfile=args.outfile, verbose=args.verbose, allow=args.allow, block=args.block)
+    main(args.dataset, args.user, outfile=args.outfile, verbose=args.verbose, allow=args.allow, block=args.block, cache=not args.no_cache)