# defaults.yaml
# (set in config)
# Default configuration settings for merging
# Please create your own config files instead of modifying this one directly!
# Angle brackets indicate the expected type of each setting, and will be ignored in user configs
# See https://dune.github.io/merge-utils/configuration.html for details on the type checking
version: ~ # Specify the merge-utils package version (~ = null, i.e. unset)
input: # Settings controlling how input files are gathered
  mode: <opt(dids, files, dataset, query)> # How to interpret 'inputs': DIDs, local files, a dataset, or a MetaCat query
  inputs: [] # List of inputs
  search_dirs: <set> # List of directories to search for metadata files
  namespace: <str> # Namespace override for local input files without metadata
  skip: <int> # Skip a number of input files, overridden by '--skip #'
  limit: <int> # Limit the number of input files, overridden by '--limit #'
  tag: <str> # Specify a tag to identify outputs, overridden by '--tag TAG'
  comment: <str> # Add a comment to output metadata, overridden by '--comment COMMENT'
  streaming: True # Stream files from remote sites instead of making a local copy
output: # Settings controlling what outputs are produced and where they go
  mode: <opt(merge, validate, metadata, dids, replicas, pfns, rses)> # Whether to run merging or just validate/list metadata, DIDs, replicas, PFNs, or RSEs
  local: False # Run merging locally instead of submitting to JustIN
  name: "{core.run_type}_{dune.campaign}_{dune.config_file}_{core.application.name}" # Output name template; {...} placeholders are metadata keys
  namespace: <str> # Optionally specify a namespace different from the parents
  grandparents: False # List the parents of the input files as the parents of the merged file
  tmp_dir: "{PKG}/tmp" # Directory for generated job scripts
  out_dir: "/pnfs/dune/scratch/users/{$USER}/merge_test" # Local output directory
  batch: # Settings for batch merging
    lifetime: 1000 # Lifetime of output files (in days)
    rse: <str> # Save merged files to a specific RSE
  scratch: # Settings for temporary files from 2-stage merging
    namespace: <str> # Optionally specify a different namespace for temporary files
    lifetime: 30 # Lifetime of temporary files (in days)
  grouping: # Settings for how input files are grouped into the final merged files
    mode: <opt(size, count)> # Group inputs by total size or by file count
    target: 10.0 # Target size (in GB) or number of files
    equalize: True # Try to equalize the size of the merged files
#metadata:
# optional: # These metadata keys are optional (overrides required and conditional keys)
# - "dune_mc.geometry_version"
# - "dune_mc.gen_fcl_filename"
validation: # Settings for checking input files before merging
  batch_size: 100 # Number of files to query metacat about at once
  concurrency: 10 # Number of threads to use for checking replicas
  fast_fail: True # Stop processing files as soon as one batch fails validation
  check_fids: True # Make sure parent FIDs exist in MetaCat (DIDs are always checked)
  handling: # How to handle files with errors
    # Each per-error-type entry may be set to 'default' to defer to the 'default' mode
    default: <opt(quit,skip,gap)> # Default handling mode
    duplicate: <opt(default,quit,skip,gap)> # Duplicated files
    no_metadata: <opt(default,quit,skip,gap)> # Files without metadata
    undeclared: <opt(default,quit,skip,gap)> # Files without a MetaCat record
    retired: <opt(default,quit,skip,gap)> # Retired files
    invalid: <opt(default,quit,skip,gap)> # Metadata failed validation
    no_replicas: <opt(default,quit,skip,gap)> # Files without any replicas
    unreachable: <opt(default,quit,skip,gap)> # Replicas exist but exceed max distance
    inconsistent: <opt(default,quit,skip,gap)> # Metadata inconsistent with other files
    already_done: <opt(include,quit,skip,gap)> # Files that have been merged in a previous job
checksums: # Checksum algorithms for output files
  - "adler32" # Adler32 should be the default checksum
sites: # Settings for site/replica selection
  justin_url: "https://justin-ui-fnal.dune.hep.ac.uk" # Base URL of the JustIN service
  max_distance: 1000.0 # Distances range from 0 to 101
  # NOTE(review): the offsets below can push effective distances outside the 0-101
  # base range, which is presumably why max_distance is set well above 101 — confirm
  default: "US_FNAL-FermiGrid" # Default site (eg for stage 2 jobs)
  site_distances: # Distance offsets for merging sites
    default: .inf # Do not allow merging except at specified sites
    "US_FNAL-FermiGrid": -5.0 # Increase priority
    "CERN": 0.0
  rse_distances: # Distance offsets for specific RSEs
    disk: 0.0 # Disk RSEs are preferred
    tape: 100.0 # Tape RSEs are penalized
    "DUNE_US_FNAL_DISK_STAGE": -5.0 # Increase priority
  dcache: # DCache RSEs have a staging penalty for nearline files
    "FNAL_DCACHE":
      url: "root://fndcadoor.fnal.gov:1094/pnfs/fnal.gov/usr/dune/tape_backed/dunepro"
      staging: 10.0 # Distance penalty for staging files from this site
local: # Settings for identifying the local site
  site: <str> # Local site name (presumably auto-detected via 'hosts' if unset — verify)
  hosts: # Map of hostname glob -> site name
    "*.fnal.gov": "US_FNAL-FermiGrid"
  xrootd: # Per-site map of xrootd URL prefix -> local filesystem path
    "US_FNAL-FermiGrid":
      "root://fndcadoor.fnal.gov:1094/pnfs/fnal.gov/usr/": "/pnfs/"
      "root://fndca1.fnal.gov:1094/pnfs/fnal.gov/usr/": "/pnfs/"
method: # Settings for the merging method itself
  method_name: auto # Can be auto, a specific default method, or the path to a custom script
  cmd: <str> # Optionally specify merging command, eg. '{script} {cfg} {output} {inputs}'
  cfg: <str> # Optionally specify a config file for merging
  script: <str> # Optionally specify a custom merging script
  transform: <str> # For transform jobs, specify "app_family.app_name"
  outputs: <list(output_file)> # List of output files produced by the merging method
  dependencies: <set> # Optionally specify additional files required for merging
  environment: # Software environment for merge jobs
    dunesw_version: <str> # DUNE software version, defaults to DUNESW_VERSION env var
    dunesw_qualifier: <str> # DUNE software qualifier, defaults to DUNE_QUALIFIER env var
    vars: <map> # Additional environment variables to set
    image: <str> # Apptainer image to use for batch jobs
    products: <set> # Extra products to load in the environment
  chunks: # Limits on merging individual chunks
    max_count: 100 # Maximum number of files to merge at once
    min_count: 2 # Minimum number of files to merge at once
    max_size: 20.0 # Maximum space available for merging (in GB)
standard_methods: # Built-in merging methods, matched using 'cond' in reverse order
  # Fallback: bundle any file format into a tar archive
  - method_name: "tar"
    cond: "True" # Always matches if no other method matches first
    script: "merge_tar.py"
    outputs:
      - name: "{NAME}_merged_{UUID}.tar"
        metadata:
          core.file_format: "tar" # Override the inputs' file format in output metadata
  # Merge plain ROOT files with hadd
  - method_name: "hadd"
    cond: "'{core.file_format}' in ['root', 'rootntuple', 'tfile']"
    cmd: "hadd -f {output} {inputs}"
    outputs:
      - name: "{NAME}_merged_{UUID}.root"
  # Merge art/ROOT files with lar ('-n -1' presumably means no event limit — verify)
  - method_name: "lar"
    cond: "'{core.file_format}' in ['artroot']"
    cmd: "lar -c {cfg} -n -1 -o {output} {inputs}"
    cfg: "artcat.fcl"
    dependencies: # Extra fcl files required by the lar config
      - "messageService.fcl"
      - "minimalMessageService.fcl"
      - "standardMessageDestinations.fcl"
    outputs:
      - name: "{NAME}_merged_{UUID}.root"
  # Merge HDF5 files with a dedicated script
  - method_name: "hdf5"
    cond: "'{core.file_format}' in ['hdf5']"
    script: "merge_hdf5.py"
    cfg: "hdf5.yaml"
    outputs:
      - name: "{NAME}_merged_{UUID}.hdf5"
job: # Populated at runtime (not intended to be set in user configs)
  timestamp: <str> # Job timestamp
  dir: <str> # Job directory
  config_files: <list> # Config files loaded for this job
schema: # Used to check validity of user config files
  type_defs: # Named types, referenced in angle brackets elsewhere (eg <list(output_file)>)
    output_file:
      name: <str> # Output file name, should contain '{NAME}' and '{UUID}'
      metadata: <map> # Per-file metadata overrides
      tmp_metadata: <map> # Metadata for temporary files in multi-stage merges
      rename: <str> # File name produced by merging method, if different from spec
      pass2: <str> # Basic merging method for pass 2+ of a transform job
      size: <size_spec> sum # File size estimate. Can be sum, avg, or an explicit value
      size_min: <size_spec> 0 # Minimum size for output file (for error checking)
      checklist: <path> # Optional checklist file to verify the output file contents
    merging_method:
      method_name: <str> # Name of method, used for selection and in output metadata
      cond: <cond> # Condition to auto-select this method based on input metadata
      cmd: <str> # Bash command to run the merging method
      cfg: <str> # Optional config file to use for the merging method
      script: <str> # Optional script to run the merging method
      transform: <str> # Optional application name for transform jobs
      dependencies: <set> # Optionally specify additional files required for merging
      metadata: <map> # Per-method metadata overrides
      outputs: <list(output_file)> # Output files produced by this method
    dcache_site:
      url: <str> # Base URL for accessing files on this DCache site
      staging: <float> # Distance penalty for staging files from this site
  key_defs: # Type overrides for specific config keys
    output.tmp_dir: <path>
    # NOTE(review): out_dir appears at output.out_dir above, not output.local.out_dir
    # — confirm this key path against the config loader
    output.local.out_dir: <path>
    local.hosts: <map>
    local.xrootd: <map(map)>
    standard_methods: <list(merging_method)>
    sites.site_distances: <map(float)>
    sites.site_distances[default]: <float>
    sites.rse_distances: <map(float)>
    sites.rse_distances[disk]: <float>
    sites.rse_distances[tape]: <float>
    sites.dcache: <map(dcache_site)>