merge_set

Container for a set of files to be merged

class merge_utils.merge_set.MergeChunk(skip: int | None = None, limit: int | None = None, files: list | None = None)[source]

Class to keep track of a chunk of files for merging

property chunk_id: list[int]

Get the chunk indices for the chunk

inputs(output_id=None) list[str][source]

Get the list of input files

Parameters:

output_id – individual output stream for pass 2+

Returns:

list of input file paths or DIDs

make_child(files: list) MergeChunk[source]

Make a child chunk with the given files

make_name(name: str, chunk: list[int]) str[source]

Get the name for a chunk output

property metadata: dict

Get the metadata for the chunk

property namespace: str

Get the namespace for the chunk

outputs(output_id=None) list[dict][source]

Get the list of output file specifications for the chunk

Parameters:

output_id – individual output stream for pass 2+

Returns:

list of output specifications

property parents: list[str]

Get the list of parent DIDs

settings(output_id=None) dict[source]

Get the merging settings for the chunk

Parameters:

output_id – individual output stream for pass 2+

Returns:

settings dictionary

spec(output_id=None) dict[source]

Get the merging specification dictionary for a given output stream

Parameters:

output_id – individual output stream for pass 2+

Returns:

merging specification dictionary

property specs: list[dict]

Get the list of merging specification dictionaries for all output streams

Returns:

list of merging specification dictionaries

property tier: int

Get the tier for the chunk

class merge_utils.merge_set.MergeFile(data: dict)[source]

A generic data file with metadata

Initialize the MergeFile with a metadata dictionary

property did: str

The file DID (namespace:name)

property file_format

The file format (core.file_format)

get_fields(fields: list) tuple[source]

Get the namespace and specified metadata values from the file

Parameters:

fields – list of metadata fields to extract

Returns:

tuple of values for each field

property good: bool

Check if the file has no errors

property name: str

The file name

property namespace: str

The file namespace

set_parents(parents: Iterable) None[source]

Set the parent FIDs for the file, checking for any missing FIDs

validate() None[source]

Check for errors or invalid metadata

class merge_utils.merge_set.MergeFileError(value)[source]

Enumeration of possible file error flags

classmethod critical() MergeFileError[source]

Get the set of errors that are considered critical

property first: MergeFileError

Get the first error in the enumeration

property group: bool

Check if the file should count towards grouping

property handling: str

Get the error handling method from the configuration

class merge_utils.merge_set.MergeSet[source]

Class to keep track of a set of files for merging

add(skip: int, files: Iterable) list[source]

Add a batch of files to the set.

Parameters:

files – collection of dictionaries with file metadata

Returns:

list of good MergeFile objects that were added

property all_files: list[MergeFile]

List of all MergeFile objects in the set, including bad files

at(idx: int) MergeFile[source]

Get a file by its index in the set, raising an error if not found.

Parameters:

idx – index of the file

Returns:

MergeFile object

check_consistency() list[str][source]

Pick out the largest consistent subset of files and log any inconsistencies.

Returns:

list of log messages about inconsistent files

check_errors(final: bool = False) None[source]

Check and log errors in the set.

Parameters:

final – print final summary of errors even if bad files are allowed

property end_idx: int

Get the index of the end of the set (one past the last file)

property enum: Generator[tuple[int, MergeFile], None, None]

Generator of (index, MergeFile) for all files in the set

property enum_good: Generator[tuple[int, MergeFile], None, None]

Generator of (index, MergeFile) for good files in the set

get_by_did(did: str) MergeFile | None[source]

Get a file by its DID.

Parameters:

did – DID of the file

Returns:

MergeFile object or None if not found

get_by_idx(idx: int) MergeFile | None[source]

Get a file by its index in the set.

Parameters:

idx – index of the file

Returns:

MergeFile object or None if not found

get_slice(start: int = 0, end: int = 0, step: int = 1) list[MergeFile][source]

Get a slice of files by their indices.

Parameters:
  • start – starting index of the slice

  • end – ending index of the slice (exclusive)

  • step – step size for the slice

Returns:

list of MergeFile objects

property good_files: list[MergeFile]

List of good MergeFile objects in the set

group_by_count(count: int) list[int][source]

Group input files by count

Parameters:

count – Number of files to group

Returns:

List of group divisions

group_by_size(indices: list[int]) list[int][source]

Group input files by size

Parameters:

indices – Indices of files to group

Returns:

List of group divisions

groups() Generator[MergeChunk, None, None][source]

Split the files into groups for merging

insert(idx: int, file: MergeFile) None[source]

Insert a file at the specified index.

Parameters:
  • idx – index of the file

  • file – MergeFile object to set at the index

set_error(dids: Iterable[str], error: MergeFileError) None[source]

Mark files as having a specific error.

Parameters:
  • dids – list of file DIDs to mark

  • error – MergeFileError to set