Skip to content

Implementation Details

Technical Overview

This document provides technical details about the nf-proteindesign pipeline implementation, including design decisions, container specifications, and development guidelines.

Container Strategy

Base Images

The pipeline uses specialized containers for each component:

Containers:
  boltzgen: "ghcr.io/flouwuenne/boltzgen:latest"
  proteinmpnn: "ghcr.io/flouwuenne/proteinmpnn:latest"
  ipsae: "ghcr.io/flouwuenne/ipsae:latest"
  prodigy: "ghcr.io/flouwuenne/prodigy:latest"  

GPU Support

Boltzgen requires CUDA 11.8 or newer:

FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
# The CUDA runtime image does not ship Python, so install it (and pip) first.
RUN apt-get update \
    && apt-get install -y --no-install-recommends python3 python3-pip \
    && rm -rf /var/lib/apt/lists/*
# The cu118 wheel index matches the CUDA 11.8 base image.
RUN python3 -m pip install --no-cache-dir torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118

Code Organization

Directory Structure

nf-proteindesign/
├── main.nf                              # Main entry point with mode detection
├── nextflow.config                      # Pipeline configuration
├── conf/
│   ├── base.config                     # Base resource settings
│   ├── modules.config                  # Module-specific configuration
│   ├── test.config                     # Test profile configuration
│   └── test_full.config                # Full test profile
├── workflows/
│   └── protein_design.nf               # Unified workflow handling all modes
├── modules/local/
│   ├── boltzgen_run.nf
│   ├── convert_cif_to_pdb.nf
│   ├── collect_design_files.nf
│   ├── proteinmpnn_optimize.nf
│   ├── ipsae_calculate.nf
│   ├── prodigy_predict.nf
│   └── consolidate_metrics.nf
├── bin/
│   ├── convert_cif_to_pdb.py          # CIF to PDB conversion
│   ├── collect_boltzgen_outputs.py    # Collect Boltzgen results
│   ├── consolidate_metrics.py         # Generate unified metrics report
│   └── create_design_yaml.py          # Generate design YAML files
└── assets/
    ├── schema_input_design.json        # Design mode samplesheet schema
    └── test_data/                       # Test datasets
        ├── egfr_*_design.yaml          # Pre-made design YAMLs
        ├── 2VSM.cif                     # Test structure
        └── samplesheet_design_*.csv     # Test samplesheets

Helper Scripts

Samplesheet Validation

#!/usr/bin/env python3
"""
Validates samplesheet format and content.
"""

import sys
import csv
from pathlib import Path

def validate_samplesheet(file_path):
    """Validate samplesheet CSV format.

    Checks that the header row of the CSV contains every required column.

    Args:
        file_path: Path to the samplesheet CSV file.

    Returns:
        True when all required columns are present.

    Raises:
        SystemExit: If the file has no header row or a required column
            is missing; the exit message names the first missing column.
    """
    required_columns = ['sample_id', 'design_yaml']

    # newline='' is the open mode the csv module recommends for CSV files.
    with open(file_path, newline='') as f:
        reader = csv.DictReader(f)
        # fieldnames is None for an empty file; treat that as "no columns"
        # so the caller gets the normal missing-column message.
        headers = reader.fieldnames or []

    # Check required columns
    for col in required_columns:
        if col not in headers:
            sys.exit(f"Missing required column: {col}")

    print("Valid design mode samplesheet")

    return True

if __name__ == '__main__':
    # Exit with a usage message instead of an IndexError when the
    # samplesheet path is missing (or extra arguments are given).
    if len(sys.argv) != 2:
        sys.exit(f"Usage: {Path(sys.argv[0]).name} <samplesheet.csv>")
    validate_samplesheet(sys.argv[1])

Testing

Test Configuration

// conf/test.config
// Small parameter set used by the `test` profile.
params {
    // NOTE(review): the directory tree lists test samplesheets as
    // assets/test_data/samplesheet_design_*.csv — confirm this path exists.
    input = 'test_data/samplesheet_test.csv'
    outdir = 'test_results'
    n_samples = 5
    // Resource ceilings for the test run
    max_cpus = 4
    max_memory = 16.GB
}

Running Tests

# Quick test
nextflow run main.nf -profile test,docker

# Full test suite
nextflow run tests/ -profile test,docker

Best Practices

Process Definition

// Template process: runs `tool` on one input file per sample, publishes
// everything under output/ and keeps the combined stdout/stderr log.
process EXAMPLE_PROCESS {
    tag "$sample"           // Show sample name in logs
    label 'gpu'            // Apply resource label
    publishDir "${params.outdir}/${sample}",
        mode: 'copy'       // Copy instead of symlink

    input:
    tuple val(sample), path(input_file)

    output:
    tuple val(sample), path("output/*"), emit: results
    path "*.log", emit: logs

    script:
    """
    # A pipeline's exit status is that of its last command (tee), so enable
    # pipefail to make a failing tool fail the task.
    set -o pipefail

    # Make sure the directory captured by the output glob exists.
    mkdir -p output

    tool --input ${input_file} \
         --output output/ \
         --threads ${task.cpus} \
         2>&1 | tee process.log
    """
}

Error Handling

// Example: drop samples for which the process emitted no files,
// logging a warning for each one instead of failing the run.
workflow {
    main:
        PROCESS(input_ch)
            .map { sample, files ->
                // Warn and mark the sample for removal when nothing was produced
                if (files.isEmpty()) {
                    log.warn "No output for sample: ${sample}"
                    return null
                }
                return [sample, files]
            }
            // Remove the null markers returned above
            .filter { it != null }
}

Channel Management

Creating Channels

// From samplesheet (columns: sample_id, design_yaml)
Channel
    .fromPath(params.input)
    .splitCsv(header: true)
    .map { row ->
        // The samplesheet's ID column is `sample_id` (the name the
        // validation script requires); `row.sample` would be null.
        [row.sample_id, file(row.design_yaml)]
    }
    .set { design_ch }

// From file patterns
Channel
    .fromPath("${params.outdir}/*/final_ranked_designs/*.cif")
    .map { cif ->
        // Layout is <outdir>/<sample>/final_ranked_designs/<name>.cif, so the
        // sample directory is two levels above the file (three levels up is
        // the output directory itself).
        def sample = cif.parent.parent.name
        [sample, cif]
    }
    .set { results_ch }

Combining Channels

// Pair design and metadata items that share the same first element (the sample ID)
combined_ch = design_ch.join(metadata_ch, by: 0)

// Combine all
// `Channel.of(a, b)` would emit the two channel objects themselves;
// `mix` merges the items of both channels into one channel.
design_ch
    .mix(metadata_ch)
    .flatten()
    .collect()
    .set { all_inputs }

Configuration Management

Parameter Validation

// nextflow.config
params {
    // Validate parameters
    validate_params = true
}

// Aborts with an error when `num_designs` or `budget` is below 1.
// NOTE(review): neither parameter gets a default in this snippet — confirm
// they are declared elsewhere, otherwise the comparisons run against null.
// NOTE(review): `error` is a pipeline-script function — confirm this code is
// meant to live in nextflow.config rather than main.nf.
def validateParameters() {
    if (params.num_designs < 1) {
        error "num_designs must be >= 1"
    }
    if (params.budget < 1) {
        error "budget must be >= 1"
    }
}

// Validation can be switched off by setting validate_params to false
if (params.validate_params) {
    validateParameters()
}

Profile Inheritance

profiles {
    // Sets the default container for all processes
    base {
        process.container = 'ubuntu:22.04'
    }

    // Loads the settings in conf/base.config, then enables Docker.
    // NOTE(review): this includes the conf/base.config file, not the `base`
    // profile above — select both with `-profile base,docker` if the default
    // container is also wanted.
    docker {
        includeConfig 'conf/base.config'
        docker.enabled = true
        // Passed to `docker run`: expose all host GPUs to the container
        docker.runOptions = '--gpus all'
    }
}

Performance Optimization

Resource Allocation

process {
    // Dynamic resource allocation
    // Each request is a closure multiplied by task.attempt, so a retried task
    // asks for more: 8 CPUs / 32 GB / 24 h on attempt 1, up to
    // 24 CPUs / 96 GB / 72 h on attempt 3 (the last one allowed by maxRetries = 2).
    withLabel: gpu {
        cpus = { 8 * task.attempt }
        memory = { 32.GB * task.attempt }
        time = { 24.h * task.attempt }
        // Re-submit a failed task instead of stopping the run
        errorStrategy = 'retry'
        maxRetries = 2
    }
}

Caching Strategy

# Enable Nextflow caching
nextflow run main.nf -resume

# Clear cache if needed
nextflow clean -f

Debugging

Enable Debug Mode

# Generate execution reports (trace file, timeline, HTML report)
nextflow run main.nf -with-trace -with-timeline -with-report

# Debug specific processes
nextflow run main.nf -process.debug true

Inspect Work Directory

# Find failed process
grep 'FAILED' .nextflow.log

# Check work directory
cd work/ab/cd1234...
cat .command.log
cat .command.err

Documentation

Module Documentation

/**
 * BOLTZGEN_RUN: Execute Boltzgen protein design
 *
 * @input tuple(sample_id, design_yaml)
 * @output tuple(sample_id, designs_dir)
 * @param params.n_samples Number of designs to generate
 * @param params.timesteps Diffusion timesteps
 */
process BOLTZGEN_RUN {
    // Process implementation
}

Version Control

Release Process

  1. Update version in nextflow.config
  2. Update CHANGELOG.md
  3. Create git tag
  4. Push containers to registry
  5. Create GitHub release
# Tag release
git tag -a v1.0.0 -m "Release version 1.0.0"
git push origin v1.0.0

Further Reading


Contributing

See the GitHub repository for contribution guidelines.