# defaults.yaml
#
# Default settings, set in the package configuration.

# Default configuration settings for merging
# Please create your own config files instead of modifying this one directly!
# Angle brackets indicate the expected type of each setting, and will be ignored in user configs
# See https://dune.github.io/merge-utils/configuration.html for details on the type checking

version: ~  # Specify the merge-utils package version

input:
    mode: <opt(dids, files, dataset, query)>
    inputs: []              # List of inputs
    search_dirs: <set>      # List of directories to search for metadata files
    namespace: <str>        # Namespace override for local input files without metadata
    skip: <int>             # Skip a number of input files, overridden by '--skip #'
    limit: <int>            # Limit the number of input files, overridden by '--limit #'
    tag: <str>              # Specify a tag to identify outputs, overridden by '--tag TAG'
    comment: <str>          # Add a comment to output metadata, overridden by '--comment COMMENT'
    streaming: True         # Stream files from remote sites instead of making a local copy

output:
    mode: <opt(merge, validate, metadata, dids, replicas, pfns, rses)>  # Whether to run merging or just validate/list metadata, DIDs, replicas, PFNs, or RSEs
    local: False          # Run merging locally instead of submitting to JustIN
    name: "{core.run_type}_{dune.campaign}_{dune.config_file}_{core.application.name}"
    namespace: <str>      # Optionally specify a namespace different from the parents
    grandparents: False   # List the parents of the input files as the parents of the merged file
    tmp_dir: "{PKG}/tmp"  # Directory for generated job scripts
    out_dir: "/pnfs/dune/scratch/users/{$USER}/merge_test" # Local output directory
    batch:                # Settings for batch merging
        lifetime: 1000    # Lifetime of output files (in days)
        rse: <str>        # Save merged files to a specific RSE
    scratch:              # Settings for temporary files from 2-stage merging
        namespace: <str>  # Optionally specify a different namespace for temporary files
        lifetime: 30      # Lifetime of temporary files (in days)
    grouping:             # Settings for how input files are grouped into the final merged files
        mode: <opt(size, count)>
        target: 10.0      # Target size (in GB) or number of files
        equalize: True    # Try to equalize the size of the merged files

#metadata:
#    optional:         # These metadata keys are optional (overrides required and conditional keys)
#      - "dune_mc.geometry_version"
#      - "dune_mc.gen_fcl_filename"

validation:
    batch_size: 100   # Number of files to query metacat about at once
    concurrency: 10   # Number of threads to use for checking replicas
    fast_fail: True   # Stop processing files as soon as one batch fails validation
    check_fids: True  # Make sure parent FIDs exist in MetaCat (DIDs are always checked)
    handling:         # How to handle files with errors
        default:      <opt(quit,skip,gap)>         # Default handling mode
        duplicate:    <opt(default,quit,skip,gap)> # Duplicated files
        no_metadata:  <opt(default,quit,skip,gap)> # Files without metadata
        undeclared:   <opt(default,quit,skip,gap)> # Files without a MetaCat record
        retired:      <opt(default,quit,skip,gap)> # Retired files
        invalid:      <opt(default,quit,skip,gap)> # Metadata failed validation
        no_replicas:  <opt(default,quit,skip,gap)> # Files without any replicas
        unreachable:  <opt(default,quit,skip,gap)> # Replicas exist but exceed max distance
        inconsistent: <opt(default,quit,skip,gap)> # Metadata inconsistent with other files
        already_done: <opt(include,quit,skip,gap)> # Files that have been merged in a previous job
    checksums:
      - "adler32"     # Adler32 should be the default checksum

sites:
    justin_url: "https://justin-ui-fnal.dune.hep.ac.uk"
    default: "US_FNAL-FermiGrid"          # Default site (eg for stage 2 jobs)
    max_distance: 1000.0                  # Distances range from 0 to 101
    site_distances:                       # Distance offsets for merging sites
        default: .inf                     # Do not allow merging except at specified sites
        "US_FNAL-FermiGrid": -5.0         # Increase priority
        "CERN": 0.0
    rse_distances:                        # Distance offsets for specific RSEs
        disk: 0.0                         # Disk RSEs are preferred
        tape: 100.0                       # Tape RSEs are penalized
        "DUNE_US_FNAL_DISK_STAGE": -5.0   # Increase priority 
    dcache:                               # DCache RSEs have a staging penalty for nearline files
        "FNAL_DCACHE":
            url: "root://fndcadoor.fnal.gov:1094/pnfs/fnal.gov/usr/dune/tape_backed/dunepro"
            staging: 10.0

local:
    site: <str>
    hosts:
        "*.fnal.gov": "US_FNAL-FermiGrid"
    xrootd:
        "US_FNAL-FermiGrid":
            "root://fndcadoor.fnal.gov:1094/pnfs/fnal.gov/usr/": "/pnfs/"
            "root://fndca1.fnal.gov:1094/pnfs/fnal.gov/usr/": "/pnfs/"

method:
    method_name: auto             # Can be auto, a specific default method, or the path to a custom script
    cmd: <str>                    # Optionally specify merging command, eg. '{script} {cfg} {output} {inputs}'
    cfg: <str>                    # Optionally specify a config file for merging
    script: <str>                 # Optionally specify a custom merging script
    transform: <str>              # For transform jobs, specify "app_family.app_name"
    outputs: <list(output_file)>  # List of output files produced by the merging method      
    dependencies: <set>           # Optionally specify additional files required for merging
    environment:
        dunesw_version: <str>     # DUNE software version, defaults to DUNESW_VERSION env var
        dunesw_qualifier: <str>   # DUNE software qualifier, defaults to DUNE_QUALIFIER env var
        vars: <map>               # Additional environment variables to set
        image: <str>              # Apptainer image to use for batch jobs
        products: <set>           # Extra products to load in the environment
    chunks:                       # Limits on merging individual chunks
        max_count: 100            # Maximum number of files to merge at once
        min_count: 2              # Minimum number of files to merge at once
        max_size: 20.0            # Maximum space available for merging (in GB)

standard_methods: # Built-in merging methods, matched using 'cond' in reverse order
  - method_name: "tar"
    cond: "True"  # Always matches if no other method matches first
    script: "merge_tar.py"
    outputs:
      - name: "{NAME}_merged_{UUID}.tar"
        metadata:
            core.file_format: "tar"
  - method_name: "hadd"
    cond: "'{core.file_format}' in ['root', 'rootntuple', 'tfile']"
    cmd: "hadd -f {output} {inputs}"
    outputs:
      - name: "{NAME}_merged_{UUID}.root"
  - method_name: "lar"
    cond: "'{core.file_format}' in ['artroot']"
    cmd: "lar -c {cfg} -n -1 -o {output} {inputs}"
    cfg: "artcat.fcl"
    dependencies:
      - "messageService.fcl"
      - "minimalMessageService.fcl"
      - "standardMessageDestinations.fcl"
    outputs:
      - name: "{NAME}_merged_{UUID}.root"
  - method_name: "hdf5"
    cond: "'{core.file_format}' in ['hdf5']"
    script: "merge_hdf5.py"
    cfg: "hdf5.yaml"
    outputs:
      - name: "{NAME}_merged_{UUID}.hdf5"
      
job:        # Populated at runtime
    timestamp: <str>
    dir: <str>
    config_files: <list>

schema:     # Used to check validity of user config files
    type_defs:
        output_file:
            name: <str>             # Output file name, should contain '{NAME}' and '{UUID}'
            metadata: <map>         # Per-file metadata overrides
            tmp_metadata: <map>     # Metadata for temporary files in multi-stage merges
            rename: <str>           # File name produced by merging method, if different from spec
            pass2: <str>            # Basic merging method for pass 2+ of a transform job
            size: <size_spec> sum   # File size estimate. Can be sum, avg, or an explicit value
            size_min: <size_spec> 0 # Minimum size for output file (for error checking)
            checklist: <path>       # Optional checklist file to verify the output file contents
        merging_method:
            method_name: <str>      # Name of method, used for selection and in output metadata
            cond: <cond>            # Condition to auto-select this method based on input metadata
            cmd: <str>              # Bash command to run the merging method
            cfg: <str>              # Optional config file to use for the merging method
            script: <str>           # Optional script to run the merging method
            transform: <str>        # Optional application name for transform jobs
            dependencies: <set>     # Optionally specify additional files required for merging
            metadata: <map>         # Per-method metadata overrides
            outputs: <list(output_file)>
        dcache_site:
            url: <str>              # Base URL for accessing files on this DCache site
            staging: <float>        # Distance penalty for staging files from this site
    key_defs:
        output.tmp_dir: <path>
        output.local.out_dir: <path>
        local.hosts: <map>
        local.xrootd: <map(map)>
        standard_methods: <list(merging_method)>
        sites.site_distances: <map(float)>
        sites.site_distances[default]: <float>
        sites.rse_distances: <map(float)>
        sites.rse_distances[disk]: <float>
        sites.rse_distances[tape]: <float>
        sites.dcache: <map(dcache_site)>