diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 199344f5..372071ee 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,5 @@ - ### Checklist -- [ ] Pull request details were added to CHANGELOG.md -- [ ] `parameter_meta` for each task is up to date. +- [ ] Pull request details were added to CHANGELOG.md. +- [ ] Documentation was updated (if required). +- [ ] `parameter_meta` was added/updated (if required). +- [ ] Submodule branches are on develop or a tagged commit. diff --git a/.github/lint-environment.yml b/.github/lint-environment.yml new file mode 100644 index 00000000..63b538fc --- /dev/null +++ b/.github/lint-environment.yml @@ -0,0 +1,9 @@ +name: biowdl-lint +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - cromwell + - wdl-aid + - miniwdl diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..7ef19e58 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,93 @@ +name: Linting + +on: + pull_request: + paths-ignore: + - "docs/**" + +defaults: + run: + # This is needed for miniconda, see: + # https://github.com/marketplace/actions/setup-miniconda#important + shell: bash -l {0} + +jobs: + lint: + runs-on: ubuntu-latest + name: Linting checks + steps: + - uses: actions/checkout@v2.3.4 + with: + submodules: recursive + + - name: Set cache date + run: echo "DATE=$(date +'%Y%m%d')" >> $GITHUB_ENV + + - name: Cache conda environment + # Use an always-upload cache to prevent solving the conda environment again and again when linting fails. + uses: pat-s/always-upload-cache@v2.1.5 + env: + # Increase this value to manually invalidate the cache. + CACHE_NUMBER: 0 + with: + path: /usr/share/miniconda/envs/biowdl-lint + key: ${{runner.os}}-biowdl-lint-${{ env.CACHE_NUMBER }}-${{env.DATE}}-${{ hashFiles('.github/lint-environment.yml') }} + id: env_cache + + # Use the built-in conda. This is the fastest installation. It may not be + # the fastest at solving environments, but the package cache mitigates that + # problem, so it is the fastest option for all runs where a cache + # hit occurs. + - name: Install miniconda + uses: conda-incubator/setup-miniconda@v2.1.1 + with: + channels: conda-forge,bioconda,defaults + channel-priority: strict + auto-activate-base: false + use-only-tar-bz2: true # Needed for proper caching according to the documentation. + # activate-environment is broken! This always seems to create a new environment. + # Activation is therefore done separately. + + - name: Create test environment if no cache is present + run: conda env create -n biowdl-lint -f .github/lint-environment.yml + if: steps.env_cache.outputs.cache-hit != 'true' + + - name: Activate test environment + # The new PATH should be passed to the environment; otherwise it won't register. + run: | + conda activate biowdl-lint + echo "PATH=$PATH" >> $GITHUB_ENV + + - name: Fetch develop branch for comparisons + run: git fetch --depth=1 origin develop + + - name: Run womtool validate + # Only check files that have changed from the base reference. + # Womtool validate is very slow, so this saves a lot of time.
+ run: | + set -x + for WDL_FILE in $(git diff --name-only origin/${{github.base_ref}} | grep -E '\.wdl$'); do + womtool validate $WDL_FILE + done + - name: Run miniwdl check + run: | + set -x + bash -c 'miniwdl check $(git ls-files *.wdl)' + + - name: Check copyright headers + run: | + set -x + for WDL_FILE in $(git diff --name-only origin/${{github.base_ref}} | grep -E '\.wdl$'); do + grep Copyright $WDL_FILE || bash -c "echo No copyright header in $WDL_FILE && exit 1" + done + - name: Check parameter_meta for inputs + run: | + set -x + for WDL_FILE in $(git diff --name-only origin/${{github.base_ref}} | grep -E '\.wdl$'); do + wdl-aid --strict $WDL_FILE > /dev/null 2> wdl-aid_stderr || + if grep -z 'ValueError: Missing parameter_meta for inputs:' wdl-aid_stderr + then + exit 1 + fi + done diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 396b998f..00000000 --- a/.travis.yml +++ /dev/null @@ -1,22 +0,0 @@ -# We use conda to install cromwell. - -language: python - -python: - - 3.6 - -before_install: - # Install conda - - export MINICONDA=${HOME}/miniconda - - export PATH=${MINICONDA}/bin:${PATH} - - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - - bash miniconda.sh -b -f -p ${MINICONDA} - - conda config --set always_yes yes - - conda config --add channels defaults - - conda config --add channels bioconda - - conda config --add channels conda-forge - -install: - - conda install --file requirements-test.txt - -script: bash scripts/biowdl_lint.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c3a3744..1c5c35aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,12 +2,384 @@ Changelog ========== + +version 6.0.0-dev +--------------------------- ++ bedtools.Sort: bumped container version to permit use of `faidx`. ++ Add a task for bcftools norm. ++ Add support for outputting compressed files to snpeff and snpsift. ++ Fixed an issue with the parameter_meta section of bcftools annotate + which caused wdlTools to error on parsing the file. ++ Updated the bcftools view task with an input for an index file. ++ Updated the bcftools view task to allow specifying a region. ++ Added a task for SnpSift filter. ++ Updated the snpEff task to allow setting the `-no-upstream` flag. ++ Update the vt task to allow a filter expression and to compress and index the output. ++ MultiQC image updated to version 1.28. ++ Samtools merge now has options for merging RG and PG headers. ++ Samtools merge default thread count increased based on the number of files. ++ Update docker images in samtools.wdl. ++ Add threads and compression levels to applicable tasks in samtools. Default to + compression level 1. ++ samtools BgzipAndIndex and Tabix "type" parameter changed to "preset" to match + the name of the flag. ++ Unused javaXmx parameter removed from samtools DictAndFaidx. ++ Update Picard images. ++ Add Mosdepth task. ++ pbmm2 loses the sort parameter. Output is now always sorted. ++ pbmm2 gets an unmapped parameter. ++ Allow pbmm2 to work with a set output prefix for the BAM file. ++ Update pbmm2 docker container to version 1.17. ++ Add VEP task. ++ Add Sequali task. ++ Add Clair3 task. ++ Add Modkit task. ++ Modify minimap2 task to accept uBAM input, including transfer of methylation + tags. Also sort the BAM output file by coordinate. ++ Update DeepVariant container and update resource requirements. ++ rtg Format and VcfEval tasks now handle reference as an array of files to enable caching.
++ Added --select-genotype and --exclude-filtered flags to GATK SelectVariants. ++ Use softlinks to localise the database for centrifuge. ++ Added the FastqFilter task. ++ Added a new input `revcomp` to cutadapt to set the `--revcomp` flag; it defaults to `false`. ++ Added `samtools.Quickcheck` to allow failing on truncated files early. ++ Fixed a bug whereby `samtools.Fastq` could produce out-of-sync R1/R2 when used with an unsorted bam input. `samtools collate` is now used by default to group reads by read name in order to avoid this issue. ++ New samtools task: split. ++ Update `bedtools.Intersect` to support `-wa`, `-wb`, and `-s` flags. ++ Add `biopet.ValidateFastq` to check your fastq files for pairing and other correctness. ++ **Breaking**: `samtools.Fastq` now requires defining your singleton read location. This only affects you if you were previously using this task with only a single output read file. ++ Deprecate `modkit.Pileup`'s bedGraph option; it is now output by default. ++ Add support for filterThreshold/filterPercent for `modkit.Pileup`. ++ Add `modkit.Summary` task. ++ Disable MultiQC's one-click GDPR-dataleak (AI summary) button by passing `--no-ai` by default. ++ Support providing additional reports to MultiQC in workflow configuration. ++ Update clair3 version from 1.0.11 to 1.1.0. ++ Improve whatshap runtime/memory usage for our cluster. ++ Add `Modkit.SampleProbs`. + +version 5.2.0 +--------------------------- ++ Update cutadapt version to 4.4. ++ Update FastQC version to 0.12.1. + +version 5.1.0 +--------------------------- ++ Add the `highlyDiploidPercentage` and `somaticMinPuritySpread` inputs to the + HMFtools PURPLE task. ++ Add a task for fastp. ++ Add a task for picard CollectInsertSizeMetrics. ++ Increased the timeMinutes runtime attribute for manta (somatic and germline) to `2880`. ++ Add a task for GRIDSS somatic filtering. ++ Add a task to generate a panel of normals BED and BEDPE file for GRIDSS. ++ Add a task to filter a GRIDSS PON. ++ Add a task for delly somatic filtering. ++ Delly CallSV's `bamFile` and `bamIndex` inputs are now arrays of files, allowing + for multiple samples to be included. ++ Add `samples` input to bcftools view to select samples included in the output vcf. ++ Add a separatorChar input to the tagUmi task. ++ Bug fix: Add space between flag and the value provided for macs2. ++ Add optional inputs to macs2, aiming to allow adherence to ENCODE ATAC-seq settings. Inputs added: + + nomodel + + gensz + + extsize + + shiftsize + + pval_thres + + bdg + + keepdup + + callsummits ++ Update samtools image to version 1.16. ++ Add targetsFile input for samtools View. ++ Mateclever's runtime attribute defaults were changed to: + + memory: `"250GiB"` + + timeMinutes: `2880` ++ Clever's Prediction task's runtime attribute defaults were changed to: + + memory: `"80GiB"` + + timeMinutes: `2200` ++ The GRIDSS AnnotateSvTypes task now also removes the second breakend of + the breakpoints and single breakends. This better prepares the output + to be passed into SURVIVOR. ++ Updated SURVIVOR version to 1.0.7. ++ Add a combined samtools dict and samtools faidx task. ++ Add a BWA index task. ++ Move all memory notation to `KiB`, `MiB` and `GiB` from `K`, `M` and `G` + previously. The WDL spec clearly distinguishes between SI and binary + notations. Since Java always takes `K`, `M` and `G` to mean `KiB`, `MiB` and + `GiB`, Java tasks such as GATK, FastQC and Picard will always + receive enough memory now.
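+ For example (illustrative values, following the convention above): a task + that sets `memory = "4GiB"` and `javaXmx = "4G"` now reserves at least the + 4 GiB that Java will actually claim, since Java reads `4G` as 4 GiB.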
++ Purple's `somaticRainfallPlot` output is now optional and included in + the `plots` output as well. ++ Bedtools coverage's timeMinutes now defaults to `320`. ++ Gridss' runtime attribute defaults were changed to: + + jvmHeapSizeGb: `64` + + nonJvmMemoryGb: `10` + + threads: `12` ++ Virusbreakend's runtime attribute defaults were changed to: + + threads: `12` + + timeMinutes: `320` ++ Cobalt's timeMinutes now defaults to `480`. ++ Orange's timeMinutes now defaults to `10`. ++ Sage's runtime attributes were changed to: + + threads: `32` + + javaXmx: `"16G"` + + memory: `"20G"` + + timeMinutes: `720` ++ Sambamba's timeMinutes now defaults to `320`. ++ Added a task for CupGenerateReport. ++ Updated Cuppa to version 1.6. ++ Added a task for Gripss. ++ Fixed the HealthChecker task's determination of the `succeeded` output + value. ++ Updated Linx to version 1.18. ++ Added a task for LinxVisualization. ++ Added a task for HMFtools Orange. ++ Added a task for HMFtools Pave. ++ Updated Purple to version 3.2. ++ Added plot and table outputs of Sage to task outputs. ++ Updated virus-interpreter to version 1.2. ++ Updated Peach to version 1.5. ++ Added a task to add SVTYPE annotations to GRIDSS results + (`AnnotateSvTypes`). ++ The GRIDSS task will now run tabix separately if GRIDSS doesn't + produce a vcf index. ++ Add a script to extract UMIs from the read name and add them as + a BAM tag for each BAM record. The script is in umi.BamReadNameToUmiTag. ++ Add fgbio.AnnotateBamWithUmis. ++ Add picard.UmiAwareMarkDuplicatesWithMateCigar. ++ Added a task for SnpEff. ++ Adjusted runtime settings for sambamba Markdup. ++ Added a task for sambamba Flagstat. ++ Added a task for Picard CollectWgsMetrics. ++ Added a task for Peach. ++ Added tasks for HMFtools: + + Amber + + Cobalt + + Cuppa + + CuppaChart + + GripssApplicationKt + + GripssHardFilterApplicationKt + + HealthChecker + + Linx + + Protect + + Purple + + Sage + + VirusInterpreter ++ Added a task for VirusBreakend. ++ Added a task for GridssAnnotateVcfRepeatmasker. ++ Bumped GRIDSS version to 2.12.2. ++ Adjusted GRIDSS runtime settings. ++ Added optional inputs to GRIDSS: + + blacklistBed + + gridssProperties ++ Added a task for GRIDSS AnnotateInsertedSequence. ++ Added a task for ExtractSigPredictHRD. ++ Added a task for DeconstructSigs. ++ Added option useSoftclippingForSupplementary (default false) to + BWA mem. ++ Adjusted BWA mem runtime settings. ++ Added a task for bedtools coverage. ++ Added a task for bcftools filter. ++ Adjusted runtime settings for bcftools annotate. ++ Added optional inputs to bcftools annotate: + + inputFileIndex + + annsFileIndex ++ Update parameter_meta for macs2. ++ Add sample position in array task. + +version 5.0.2 +--------------------------- ++ Bumped ScatterRegions container to 1.0.0. + +version 5.0.1 +--------------------------- ++ Smoove: enable genotyping. ++ Add runtime memory to a number of tasks. + +version 5.0.0 +--------------------------- ++ Update CPAT to version 3.0.4. + + Changed the `outFilePath` input to `outputPrefix`. ++ Survivor: Change integer to string literal in boolean parameters. ++ Samtools: Add mkdir line to `Fastq` task. ++ Add new parameters from CCS version 6.0.0 and add two new outputs: + `ccs_report.txt` & `zmw_metrics.json.gz`. ++ Change CutAdapt memory to `5G`. ++ Increase multiqc base time from 5 to 10. ++ Update biowdl-input-converter to version 0.3. ++ Update minimap2 to version 2.20. ++ Update lima to version 2.2.0. ++ Update ccs to version 6.0.0.
++ Update bam2fastx to version 1.3.1. ++ Add memory values to GffCompare, GffRead and CPAT. ++ GffCompare: Make the `referenceAnnotation` input optional. ++ Stringtie: Add the `minimumCoverage` input. ++ UMI-tools: Update default dockerImage to use umitools v1.1.1 with correct + samtools version (1.10). ++ UMI-tools: Re-introduce samtools indexing. ++ UMI-tools: Update default dockerImage to use umitools v1.1.1. ++ UMI-tools dedup: Add tempdir. ++ Bcftools view: Add options for filtering (include, exclude, excludeUncalled). ++ Duphold: Add `duphold.wdl`. ++ Add new wdl file prepareShiny.wdl for creating input files for the shiny app. ++ mergePacBio: Rename `mergedReport` to `outputPathMergedReport`. ++ Lima: Fix copy commands. ++ Fixed the `size` call in the default for gffread's timeMinutes, to retrieve + GBs instead of bytes. ++ Update stringtie to version 1.3.6. ++ Update Lima to version 2.0.0. ++ Update IsoSeq3 to version 3.4.0. ++ Update samtools to version 1.11. ++ Update Picard to version 2.23.8. ++ Update NanoPlot to version 1.32.1. ++ Update MultiQC to version 1.9. ++ ~Update StringTie to version 2.1.4.~ ++ Complete `parameter_meta` for tasks missing the outputs section. ++ DeepVariant: Add an optional input for the gvcf index. ++ Samtools: `Sort` task now has `threads` in runtime instead of `1`. ++ Picard: Add parameter_meta to `SortSam`. ++ pbmm2: Add parameter_meta for `sample`. ++ Centrifuge: Rename output in task `KReport` to `KrakenReport` to resolve + name collision with task name. ++ Bwa & bwa-mem2: Add parameter_meta for `outputHla`. ++ Multiqc: Removed WDL_AID excludes of "finished" & "dependencies" inputs. ++ Bam2fastx: Add localisation of input files to Bam2Fasta task. ++ Lima: `cores` input has been renamed to `threads` to match tool naming. ++ isoseq3: `cores` input has been renamed to `threads` to match tool naming. ++ CCS: `cores` input has been renamed to `threads` to match tool naming. ++ Add PacBio preprocessing specific tasks `mergePacBio` & `ccsChunks`. ++ CCS: Update CCS to version 5. ++ deepvariant: Add task for DeepVariant. ++ gatk: Make intervals optional for GenotypeGVCFs. ++ isoseq3: Add required bam index input to isoseq3. ++ pbbam: Add task for indexing PacBio bam files. ++ picard: Add CollectHsMetrics and CollectVariantCallingMetrics. ++ Samtools: Add `threads` to parameter meta for Merge task. ++ bcftools: add tmpDir input to specify temporary directory when sorting. ++ bcftools: remove outputType and implement indexing based on output + file extension. ++ NanoPack: Add parameter_meta to NanoPlot task. ++ Centrifuge: Remove metrics file from classification (which causes the + summary report to be empty). + https://github.com/DaehwanKimLab/centrifuge/issues/83 ++ Add NanoPlot and NanoQC tasks. ++ Centrifuge: Add `timeMinutes` to `Classify` task and remove unnecessary + downloading tasks (alternative is refseqtools). ++ collect-columns: updated docker image to version 1.0.0 and added the + `sumOnDuplicateId` input (defaults to false). ++ survivor: replace integer boolean type with a logical true or false value. ++ vt: Add option to ignore masked reference. ++ bcftools: add sorting and annotation. ++ Bam2fastx: Input bam and index are now arrays. ++ Lima: Remove globs from outputs. ++ Updated task gridss.wdl: add --jvmheap parameter. ++ A bwa-mem2 task was created with the same interface (including usePostalt) + as the bwa mem task. ++ bwa mem and bwa kit are now one task. The usePostalt boolean can be used to + switch the postalt script on and off.
++ Added a task for GRIDSS. ++ Add wdl file for PacBio's bam2fastx tool. + +version 4.0.0 +--------------------------- ++ Picard MergeVcf now uses compression level 1 by default. ++ bwa mem, bwa mem+kit and hisat2 have their samtools sort threads tweaked. The + number of threads is now related to the number of threads on the aligner. + Using more threads reduces the chance of the samtools sort pipe getting + blocked if it's full. ++ Renamed a few inputs in centrifuge.wdl, isoseq3.wdl, talon.wdl, + transcriptclean.wdl to be more descriptive. ++ Renamed outputs of tasks used in the TALON-WDL, PacBio-subreads-processing & + sequence-classification pipelines. ++ Reworked bcf2vcf task into bcftools view task. ++ Removed the redundant format flag from the htseq interface. This is + autodetected in newer versions of htseq. ++ Update docker images for samtools, bcftools, picard, GATK, cutadapt, htseq + and chunked-scatter. ++ Default docker images for bwa, bwakit and hisat2 updated to include samtools + 1.10. ++ Alignment tasks (STAR, Hisat2, BWA) now produce BAM files at level 1 + compression. ++ Hisat2 task has added controls for samtools. ++ Alignment tasks no longer produce BAM indexes as these are not needed + by the markduplicates step. ++ Picard Markduplicates now uses 7G of RAM just like in GATK's best practice + example pipeline. ++ Picard SortSam added as a task. ++ Md5 files are no longer created by default on Picard tasks that generate + BAM files. ++ Changed PicardMarkduplicates to use COMPRESSION_LEVEL=1 by default with + the htsjdk deflater. + This makes the task finish in 32% less time at the cost of an 8% larger BAM + file. ++ Added sambamba markdup and sambamba sort. NOTE: samtools sort is more + efficient and is recommended. ++ Correctly represent samtools' inconsistent use of the threads flag. + Sometimes it means 'threads', sometimes 'additional threads'. + BioWDL tasks now use only threads. The `threads - 1` conversion is + applied where necessary for samtools tools that use additional threads. ++ Updated BWA MEM and BWA KIT tasks to use samtools sort version 1.10 for + sorting the BAM file. ++ Updated memory requirements on bcftools Stats, bwa mem, bwakit, GATK + ApplyBQSR, GATK BaseRecalibrator, GATK GatherBqsrReports, Gatk + HaplotypeCaller, Picard CollectMultipleMetrics, Picard GatherBamFiles, + samtools Flagstat, samtools sort and bcftools stats. ++ TALON: Update `FilterTalonTranscripts` to new version, which removes the + pairingsFile and replaces this with datasetsFile. ++ TALON: Add `GetSpliceJunctions` & `LabelReads` tasks. ++ TALON: Update to version 5.0. ++ Add tasks for pbmm2, the PacBio wrapper for minimap2. ++ Update the image for chunked-scatter and make use of new features from 0.2.0. ++ Tuned resource requirements for GATK VariantEval, MultiQC, Picard metrics and + STAR. ++ Added a new task for [scatter-regions](https://github.com/biowdl/chunked-scatter) + that replaces biopet-scatterregions. ++ The FastQC task now talks to Java directly instead of using the included + Perl wrapper for FastQC. This has the advantage that memory and threads can + be set independently. A rather high maximum heap size of 1750MB (Xmx1750M) + was set, as OOM errors occurred frequently on some fastqs. ++ STAR: Add options regarding alignment score (also relative to read length) + for tweaking when processing rRNA-depleted samples. ++ TALON: Update `minimumIdentity` to correct type (float, was integer) + & set new default according to developers (0.8, was 0).
++ Added GATK VariantEval task. ++ Added a log output for STAR. ++ Added report output to Hisat2. ++ Added output with all reports to gffcompare. ++ Change MultiQC inputs. It now accepts an array of report files. It does not + need access to a folder with the reports anymore. MultiQC can now be used + as a normal WDL task without hacks. ++ Picard: Make all outputs in `CollectMultipleMetrics` optional. This ensures + the task will not fail if one of the metrics is set to false. ++ The struct `BowtieIndex` was removed, as it has become obsolete. ++ The task `ReorderGlobbedScatters` was removed, as it has become obsolete. ++ Adjusted the memory settings of many tools, especially Java tools. + They should now more accurately represent actual memory usage (as + opposed to virtual memory). ++ Added `-XX:ParallelGCThreads=1` to the Java options of Java tasks. ++ Added `timeMinutes` input to many tasks; this indicates the maximum + number of minutes that the job will run. The associated runtime + attribute is `time_minutes`, which can be used to inform + a scheduler (e.g. slurm) of the run time of the job. ++ Added STAR GenomeGenerate task. ++ GATK.HaplotypeCaller: Add `--dont-use-soft-clipped-bases` and + `--standard-min-confidence-threshold-for-calling` options. These are + required for RNA-seq variant calling according to GATK best practices. ++ Samtools: Fix quotations in sort command. ++ Samtools SortByName is now called Sort. ++ Generalize the sort task to also sort by position, instead of just read name. ++ Add CreateSequenceDictionary task to picard. ++ Add faidx task to samtools. ++ Isoseq3: Remove dirname command from output folder creation step. ++ Isoseq3: Requires more memory by default; it is now 2G. ++ Isoseq3: Remove cp commands and other bash magic; file naming is now + handled by the pipeline. ++ Lima: Replace mv command with cp. ++ Add WDL task for smoove (lumpy) sv-caller. + version 3.1.0 --------------------------- + Default threads for BWA in bwa.Kit task: 4. Samtools sort in the @@ -15,17 +387,17 @@ version 3.1.0 + Lima: Add missing output to parameter_meta. + Lima: Remove outputPrefix variable from output section. + Isoseq3: Make sure stderr log file from Refine is unique and not overwritten. -+ Isoseq3: Add workaround in Refine for glob command not locating files in output directory. ++ Isoseq3: Add workaround in Refine for glob command not locating files + in output directory. + Isoseq3: Fix --min-polya-length argument syntax. + Lima: Add workaround for glob command not locating files in output directory. + CCS: Add missing backslash. + Cutadapt now explicitly calls the `--compression-level` flag with compression - level 1 to prevent cutadapt from using very high gzip compression level 6 + level 1 to prevent cutadapt from using very high gzip compression level 6 that uses 400% more cpu time. + Update default docker image for cutadapt and fastqc. + Default number of cores for cutadapt and bwamem to 4 cores. - version 3.0.0 --------------------------- + Add optional input umiSeparator in umi-tools dedup task. @@ -36,7 +408,7 @@ version 3.0.0 + Allow setting the `--emit-ref-confidence` flag for HaplotypeCaller. + Add `--output-mode` flag to HaplotypeCaller. + Added rtg.Format and rtg.VcfEval tasks. -+ Added gatk.SelectVariants and gatk.VariantFiltration tasks. ++ Added gatk.SelectVariants and gatk.VariantFiltration tasks. + Fixed a bug where the output directory was not created for bwa.Kit. + Add vt task for variant normalization and decomposition.
+ Update WDL task Picard (Add task RenameSample). @@ -53,17 +425,20 @@ version 3.0.0 biopet.ScatterRegions now always returns correctly ordered scatters. + Add tasks for umi-tools dedup and extract. + Add `GenomicsDBImport` task for GATK. -+ Add `annotationGroups` input to `GenotypeGVCFs` to allow setting multiple ++ Add `annotationGroups` input to `GenotypeGVCFs` to allow setting multiple annotation groups. The `StandardAnnotation` group is still used as default. + GenotypeGVCFs, only allow one input GVCF file, as the tool also only allows - one input file. -+ Rename HaplotypeCallerGVCF to HaplotypeCaller. Add `gvcf` option to set + one input file. ++ Rename HaplotypeCallerGVCF to HaplotypeCaller. Add `gvcf` option to set whether output should be a GVCF. + Centrifuge: Add Krona task specific to Centrifuge. -+ Centrifuge: Fix Centrifuge tests, where sometimes the index files could still not be located. ++ Centrifuge: Fix Centrifuge tests, where sometimes the index files could + still not be located. + Update parameter_meta for TALON, Centrifuge and Minimap2. -+ Centrifuge: Fix issue where Centrifuge Inspect did not get the correct index files location. -+ Add `minimumContigLength` input to PlotDenoisedCopyRatios and PlotModeledSegments. ++ Centrifuge: Fix issue where Centrifuge Inspect did not get the correct + index files location. ++ Add `minimumContigLength` input to PlotDenoisedCopyRatios + and PlotModeledSegments. + Add `commonVariantSitesIndex` input to CollectAllelicCounts. + Centrifuge: Fix issue where Centrifuge could not locate index files. + Increase default memory of BWA mem to 32G (was 16G). @@ -82,11 +457,11 @@ version 3.0.0 + PreprocessIntervals + Add common.TextToFile task. + Add bedtools.Intersect. -+ Add `-o pipefail` to bedtools.MergeBedFiles to prevent errors in BED files ++ Add `-o pipefail` to bedtools.MergeBedFiles to prevent errors in BED files from going unnoticed. + Centrifuge: Fix -1/-U options for single end data. + Add bedtools.Complement, bedtools.Merge, and add a task to combine multiple - bed files called bedtools.MergeBedFiles. This task combines bedtools merge + bed files called bedtools.MergeBedFiles. This task combines bedtools merge and sort. + Change `g` parameter on bedtools.Sort to `genome`. + Add `ploidity` and `excludeIntervalList` to gatk.HaplotypeCallerGvcf. @@ -99,11 +474,13 @@ version 3.0.0 + Removed the "extraArgs" input from FilterMutectCalls. + Removed unused "verbose" and "quiet" inputs from multiqc. + Added parameter_meta sections to a variety of tasks. -+ Picard's BedToIntervalList outputPath input is now optional (with a default of "regions.interval_list"). ++ Picard's BedToIntervalList outputPath input is now + optional (with a default of "regions.interval_list"). + TALON: Fix SQLite error concerning database/disk space being full. + Update htseq to default image version 0.11.2. + Update biowdl-input-converter in common.wdl to version 0.2.1. -+ Update TALON section to now include the new annotation file output, and add config file creation to the TALON task. ++ Update TALON section to now include the new annotation file output, and + add config file creation to the TALON task. + Removed unused inputs (trimPrimer and format) for cutadapt. + Various minor command tweaks to increase stability. + Fixed unused inputs in bedtools sort (inputs are now used). @@ -116,7 +493,8 @@ version 2.1.0 + Updated biowdl-input-converter version. + GATK CombineGVCFs memory was tripled to prevent it from using a lot of CPU in Garbage Collection mode. 
-+ Updated parameter_meta sections for Minimap2 and TranscriptClean to wdl-aid format. ++ Updated parameter_meta sections for Minimap2 and TranscriptClean to + wdl-aid format. + Updated cores variable for TALON, the default is now 4. + Updated TALON to version 4.4. + Added parameter_meta sections to the following tools: @@ -133,11 +511,15 @@ version 2.1.0 version 2.0.0 --------------------------- + TranscriptClean: Update TranscriptClean to version 2.0.2. -+ Memory runtime attributes are now Strings indicating total memory, as opposed to Ints indicating memory per core. -+ Memory inputs for most tasks are now Strings, remaining Int memory inputs are renamed to "memoryGb". -+ Use the biowdl-input-converter container for JsonToYaml, to reduce the amount of containers needed. -+ Add biowdl-input-converter and remove SampleConfigToSampleReadgroupLists which it replaces. -+ GATK.GenotypeGVCFs: Increased memoryMultiplier from 2.0 to 3.0 . ++ Memory runtime attributes are now Strings indicating total memory, as + opposed to Ints indicating memory per core. ++ Memory inputs for most tasks are now Strings, remaining Int memory inputs + are renamed to "memoryGb". ++ Use the biowdl-input-converter container for JsonToYaml, to reduce the + amount of containers needed. ++ Add biowdl-input-converter and remove SampleConfigToSampleReadgroupLists + which it replaces. ++ GATK.GenotypeGVCFs: Increased memoryMultiplier from 2.0 to 3.0. + Minimap2: Add -k option to minimap2 mapping. + Added bwakit task. + Minimap2: Add the option for --MD tag. @@ -147,10 +529,10 @@ version 1.0.0 --------------------------- + Common: Add "SampleConfigToSampleReadgroupLists" task. + MultiQC: the "interactive" input is now set to true by default. -+ Removed deprecated tasks: - + bioconda.installPrefix - + mergecounts.MergeCounts -+ GATK.BaseRecalibrator: "knownIndelsSitesVCFs" and "knownIndelsSitesVCFIndexes" are no longer optional, but now have a default of "[]". ++ Removed deprecated tasks: bioconda.installPrefix, mergecounts.MergeCounts ++ GATK.BaseRecalibrator: "knownIndelsSitesVCFs" + and "knownIndelsSitesVCFIndexes" are no longer optional, but + now have a default of "[]". + Removed BWA index task. + Removed unused "picardJar" input from bwa.wdl. + All inputs to bedtools Sort are now reflected in the generated command. @@ -166,17 +548,25 @@ version 1.0.0 + Fastqsplitter: use version 1.1. + Picard: Use version 2.20.5 of the biocontainer as this includes the R dependency. + Common: Update dockerTag to dockerImage. -+ GATK: Add CombineVariants task that allows, e.g., to merge VCFs from different callers. -+ Mutect2: Add GATK tasks related to variant filtering (LearnReadOrientationModel, MergeStats, GetPileupSummaries, CalculateContamination and FilterMutectCalls). -+ Mutect2: Add "--germline-resource" and "--f1r2-tar-gz" inputs, requiring an update to GATK 4.1.2.0. ++ GATK: Add CombineVariants task that allows, e.g., to merge VCFs + from different callers. ++ Mutect2: Add GATK tasks related to variant + filtering (LearnReadOrientationModel, MergeStats, GetPileupSummaries, + CalculateContamination and FilterMutectCalls). ++ Mutect2: Add "--germline-resource" and "--f1r2-tar-gz" inputs, requiring + an update to GATK 4.1.2.0. + Mutect2: Add necessary missing index attribute for panel of normals. + MultiQC: Add memory variable to multiqc task. -+ GATK: SplitNCigarReads, BaseRecalibration and ApplyBQSR do no longer need regions files as required inputs. 
-+ VarDict: Add user definable flags (-M, -A, -Q, -d, -v, -f) to the paired VCF filtering script. -+ Cutadapt: If the output is a gzipped file, compress with level 1 (instead of default 6). ++ GATK: SplitNCigarReads, BaseRecalibration and ApplyBQSR no longer need + regions files as required inputs. ++ VarDict: Add user-definable flags (-M, -A, -Q, -d, -v, -f) to the paired + VCF filtering script. ++ Cutadapt: If the output is a gzipped file, compress with + level 1 (instead of the default 6). + Cutadapt: Fix issues with read2output when using single-end reads. + Add feature type, idattr and additional attributes to htseq-count. + Added allow-contain option to bowtie. + Added a changelog to keep track of changes. -+ Added sortByName task in samtools to support more memory efficient execution of HTSeqCount. ++ Added sortByName task in samtools to support more memory-efficient + execution of HTSeqCount. + Removed the bam index from HTSeqCount's inputs. diff --git a/CPAT.wdl b/CPAT.wdl index 098d9ca6..b96ea0d7 100644 --- a/CPAT.wdl +++ b/CPAT.wdl @@ -23,25 +23,32 @@ version 1.0 task CPAT { input { File gene - String outFilePath + String outputPrefix File hex File logitModel + File? referenceGenome - File? referenceGenomeIndex # Should be added as input if - # CPAT should not index the reference genome. + # Should be added as input if CPAT should not index the + # reference genome. + File? referenceGenomeIndex Array[String]? startCodons Array[String]? stopCodons - String dockerImage = "biocontainers/cpat:v1.2.4_cv1" + + String memory = "4GiB" + Int timeMinutes = 10 + ceil(size(gene, "GiB") * 30) + String dockerImage = "quay.io/biocontainers/cpat:3.0.4--py39hcbe4a3b_0" } - # Some WDL magic in the command section to properly output the start and stopcodons to the command. - # select_first is needed in order to convert the optional arrays to non-optionals. + # Some WDL magic in the command section to properly output the start and + # stop codons to the command. + # select_first is needed in order to convert the optional arrays + # to non-optionals. command { set -e - mkdir -p "$(dirname ~{outFilePath})" + mkdir -p "$(dirname ~{outputPrefix})" cpat.py \ --gene ~{gene} \ - --outfile ~{outFilePath} \ + --outfile ~{outputPrefix} \ --hex ~{hex} \ --logitModel ~{logitModel} \ ~{"--ref " + referenceGenome} \ @@ -50,25 +57,32 @@ task CPAT { } output { - File outFile = outFilePath + File orfSeqs = "~{outputPrefix}.ORF_seqs.fa" + File orfProb = "~{outputPrefix}.ORF_prob.tsv" + File orfProbBest = "~{outputPrefix}.ORF_prob.best.tsv" + File noOrf = "~{outputPrefix}.no_ORF.txt" + File rScript = "~{outputPrefix}.r" } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs gene: {description: "Equivalent to CPAT's `--gene` option.", category: "required"} - outFilePath: {description: "Equivalent to CPAT's `--outfile` option.", category: "required"} + outputPrefix: {description: "Equivalent to CPAT's `--outfile` option.", category: "required"} hex: {description: "Equivalent to CPAT's `--hex` option.", category: "required"} logitModel: {description: "Equivalent to CPAT's `--logitModel` option.", category: "required"} referenceGenome: {description: "Equivalent to CPAT's `--ref` option.", category: "advanced"} - referenceGenomeIndex: {description: "The index of the reference. Should be added as input if CPAT should not index the reference genome.", - category: "advanced"} + referenceGenomeIndex: {description: "The index of the reference.
Should be added as input if CPAT should not index the reference genome.", category: "advanced"} startCodons: {description: "Equivalent to CPAT's `--start` option.", category: "advanced"} stopCodons: {description: "Equivalent to CPAT's `--stop` option.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } diff --git a/LICENSE b/LICENSE index 37eeade5..b1f2b679 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,3 @@ -MIT License - Copyright (c) 2017 Leiden University Medical Center Permission is hereby granted, free of charge, to any person obtaining a copy @@ -9,8 +7,8 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, diff --git a/README.md b/README.md index 246e3814..2c80e317 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,21 @@ # Tasks - -This repository contains the WDL task definitions used in the various +This repository contains the WDL task definitions used in the various [Biowdl](https://github.com/biowdl) workflows and pipelines. - ## Documentation - -Documentation for this workflow can be found -[here](https://biowdl.github.io/tasks/). +Documentation for this repository can be +found [here](https://biowdl.github.io/tasks/). ## About -These tasks are part of [Biowdl](https://github.com/biowdl) -developed by [the SASC team](http://sasc.lumc.nl/). +These tasks are part of [Biowdl](https://github.com/biowdl) developed by the +SASC team at [Leiden University Medical Center](https://www.lumc.nl/). ## Contact -

-For any question related to these tasks, please use the -github issue tracker -or contact - the SASC team directly at: +For any questions related to Tasks, please use the +GitHub issue tracker +or contact the SASC team directly at: + sasc@lumc.nl.

diff --git a/VERSION b/VERSION index fd2a0186..03f488b0 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.1.0 +5.3.0 diff --git a/bam2fastx.wdl b/bam2fastx.wdl new file mode 100644 index 00000000..62827fd9 --- /dev/null +++ b/bam2fastx.wdl @@ -0,0 +1,157 @@ +version 1.0 + +# Copyright (c) 2020 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task Bam2Fasta { + input { + Array[File]+ bam + Array[File]+ bamIndex + String outputPrefix + Int compressionLevel = 1 + Boolean splitByBarcode = false + + String? seqIdPrefix + + String memory = "2GiB" + Int timeMinutes = 15 + String dockerImage = "quay.io/biocontainers/bam2fastx:1.3.1--hf05d43a_1" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPrefix})" + + # Localise the bam and pbi files so they are next to each other in the + # current folder. + bamFiles="" + for bamFile in ~{sep=" " bam} + do + ln $bamFile . + bamFiles=$bamFiles" $(basename $bamFile)" + done + + for index in ~{sep=" " bamIndex} + do + ln $index . + done + + bam2fasta \ + --output ~{outputPrefix} \ + -c ~{compressionLevel} \ + ~{true="--split-barcodes" false="" splitByBarcode} \ + ~{"--seqid-prefix " + seqIdPrefix} \ + $bamFiles + } + + output { + File fastaFile = outputPrefix + ".fasta.gz" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + bam: {description: "The input pacbio bam file(s).", category: "required"} + bamIndex: {description: "The .pbi index for the input file(s).", category: "required"} + outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} + compressionLevel: {description: "Gzip compression level [1-9]", category: "advanced"} + splitByBarcode: {description: "Split output into multiple fasta files, by barcode pairs.", category: "advanced"} + seqIdPrefix: {description: "Prefix for sequence IDs in headers.", category: "advanced"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + fastaFile: {description: "The fasta output file."} + } +} + +task Bam2Fastq { + input { + Array[File]+ bam + Array[File]+ bamIndex + String outputPrefix + Int compressionLevel = 1 + Boolean splitByBarcode = false + + String? seqIdPrefix + + String memory = "2GiB" + Int timeMinutes = 15 + String dockerImage = "quay.io/biocontainers/bam2fastx:1.3.1--hf05d43a_1" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPrefix})" + + # Localise the bam and pbi files so they are next to each other in the + # current folder. + bamFiles="" + for bamFile in ~{sep=" " bam} + do + ln $bamFile . + bamFiles=$bamFiles" $(basename $bamFile)" + done + + for index in ~{sep=" " bamIndex} + do + ln $index . + done + + bam2fastq \ + --output ~{outputPrefix} \ + -c ~{compressionLevel} \ + ~{true="--split-barcodes" false="" splitByBarcode} \ + ~{"--seqid-prefix " + seqIdPrefix} \ + $bamFiles + } + + output { + File fastqFile = outputPrefix + ".fastq.gz" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + bam: {description: "The input pacbio bam file(s).", category: "required"} + bamIndex: {description: "The .pbi index for the input file(s).", category: "required"} + outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} + compressionLevel: {description: "Gzip compression level [1-9]", category: "advanced"} + splitByBarcode: {description: "Split output into multiple fastq files, by barcode pairs.", category: "advanced"} + seqIdPrefix: {description: "Prefix for sequence IDs in headers.", category: "advanced"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + fastqFile: {description: "The fastq output file."} + } +} diff --git a/bcftools.wdl b/bcftools.wdl index 122fcdd1..31c7db13 100644 --- a/bcftools.wdl +++ b/bcftools.wdl @@ -1,7 +1,5 @@ version 1.0 -# MIT License -# # Copyright (c) 2018 Leiden University Medical Center # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -22,30 +20,456 @@ version 1.0 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -task Bcf2Vcf { +task Annotate { + input { + Array[String] columns = [] + Boolean force = false + Boolean keepSites = false + Boolean noVersion = false + Array[String] samples = [] + Boolean singleOverlaps = false + Array[String] removeAnns = [] + File inputFile + File? inputFileIndex + String outputPath = "output.vcf.gz" + + File? annsFile + File? annsFileIndex + String? collapse + String? exclude + File? headerLines + String? newId + String? include + String? markSites + String? regions + File? regionsFile + File? renameChrs + File? 
samplesFile + + Int threads = 0 + String memory = "4GiB" + Int timeMinutes = 60 + ceil(size(inputFile, "G")) + String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2" + } + + Boolean compressed = basename(outputPath) != basename(outputPath, ".gz") + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + bcftools annotate \ + -o ~{outputPath} \ + -O ~{true="z" false="v" compressed} \ + ~{"--annotations " + annsFile} \ + ~{"--collapse " + collapse} \ + ~{true="--columns" false="" length(columns) > 0} ~{sep="," columns} \ + ~{"--exclude " + exclude} \ + ~{true="--force" false="" force} \ + ~{"--header-lines " + headerLines} \ + ~{"--set-id " + newId} \ + ~{"--include " + include} \ + ~{true="--keep-sites" false="" keepSites} \ + ~{"--mark-sites " + markSites} \ + ~{true="--no-version" false="" noVersion} \ + ~{"--regions " + regions} \ + ~{"--regions-file " + regionsFile} \ + ~{"--rename-chrs " + renameChrs} \ + ~{true="--samples" false="" length(samples) > 0} ~{sep="," samples} \ + ~{"--samples-file " + samplesFile} \ + ~{true="--single-overlaps" false="" singleOverlaps} \ + ~{true="--remove" false="" length(removeAnns) > 0} ~{sep="," removeAnns} \ + ~{inputFile} + + ~{if compressed then 'bcftools index --tbi ~{outputPath}' else ''} + } + + output { + File outputVcf = outputPath + File? outputVcfIndex = outputPath + ".tbi" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + columns: {description: "Comma-separated list of columns or tags to carry over from the annotation file (see man page for details).", category: "advanced"} + force: {description: "Continue even when parsing errors, such as undefined tags, are encountered.", category: "advanced"} + keepSites: {description: "Keep sites which do not pass -i and -e expressions instead of discarding them.", category: "advanced"} + noVersion: {description: "Do not append version and command line information to the output VCF header.", category: "advanced"} + samples: {description: "List of samples for sample stats, \"-\" to include all samples.", category: "advanced"} + singleOverlaps: {description: "Keep memory requirements low when working with very large annotation files.", category: "advanced"} + removeAnns: {description: "List of annotations to remove (see man page for details).", category: "advanced"} + inputFile: {description: "A vcf or bcf file.", category: "required"} + inputFileIndex: {description: "The index for the input vcf or bcf.", category: "common"} + outputPath: {description: "The location the output VCF file should be written.", category: "common"} + annsFile: {description: "Bgzip-compressed and tabix-indexed file with annotations (see man page for details).", category: "common"} + annsFileIndex: {description: "The index for annsFile.", category: "common"} + collapse: {description: "Treat records as identical for the given type: `snps`, `indels`, `both`, `all`, `some` or `none` (see man page for details).", category: "advanced"} + exclude: {description: "Exclude sites for which the expression is true (see man page for details).", category: "advanced"} + headerLines: {description: "Lines to append to the VCF header (see man page for details).", category: "advanced"} + newId: {description: "Assign ID on the fly (e.g.
--set-id +'%CHROM\\_%POS').", category: "advanced"} + include: {description: "Select sites for which the expression is true (see man page for details).", category: "advanced"} + markSites: {description: "Annotate sites which are present ('+') or absent ('-') in the -a file with a new INFO/TAG flag.", category: "advanced"} + regions: {description: "Restrict to comma-separated list of regions.", category: "advanced"} + regionsFile: {description: "Restrict to regions listed in a file.", category: "advanced"} + renameChrs: {description: "Rename chromosomes according to the map in file (see man page for details).", category: "advanced"} + samplesFile: {description: "File of samples to include.", category: "advanced"} + threads: {description: "Number of extra decompression threads [0].", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputVcf: {description: "Annotated VCF file."} + outputVcfIndex: {description: "Index of the annotated VCF file."} + } +} + +task Filter { + input { + File vcf + File vcfIndex + String? include + String? exclude + String? softFilter + String outputPath = "./filtered.vcf.gz" + + String memory = "256MiB" + Int timeMinutes = 1 + ceil(size(vcf, "G")) + String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + bcftools \ + filter \ + ~{"-i " + include} \ + ~{"-e " + exclude} \ + ~{"-s " + softFilter} \ + ~{vcf} \ + -O z \ + -o ~{outputPath} + bcftools index --tbi ~{outputPath} + } + + output { + File outputVcf = outputPath + File outputVcfIndex = outputPath + ".tbi" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + vcf: {description: "The VCF file to operate on.", category: "required"} + vcfIndex: {description: "The index for the VCF file.", category: "required"} + include: {description: "Equivalent to the `-i` option.", category: "common"} + exclude: {description: "Equivalent to the `-e` option.", category: "common"} + softFilter: {description: "Equivalent to the `-s` option.", category: "common"} + outputPath: {description: "The location the output VCF file should be written.", category: "common"} + + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + } +} + +task Norm { + input { + File inputFile + File? inputFileIndex + String outputPath = "output.vcf.gz" + + File? fasta + String? 
regions + Boolean splitMultiallelicSites = false + + String memory = "4GiB" + Int timeMinutes = 1 + ceil(size(inputFile, "G")) + Int diskGb = ceil(2.1 * size(inputFile, "G") + size(fasta, "G")) + String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2" + } + + Boolean compressed = basename(outputPath) != basename(outputPath, ".gz") + + command { + set -e + ls ~{inputFile} ~{inputFileIndex} # dxCompiler localization workaround + + mkdir -p "$(dirname ~{outputPath})" + bcftools norm \ + -o ~{outputPath} \ + -O ~{true="z" false="v" compressed} \ + ~{"--regions " + regions} \ + ~{"--fasta " + fasta} \ + ~{if splitMultiallelicSites then "--multiallelics -both" else ""} \ + ~{inputFile} + + ~{if compressed then "bcftools index --tbi ~{outputPath}" else ""} + } + + output { + File outputVcf = outputPath + File? outputVcfIndex = outputPath + ".tbi" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + disks: "local-disk ~{diskGb} SSD" # Based on an example in dxCompiler docs + } + + parameter_meta { + # inputs + inputFile: {description: "A vcf or bcf file.", category: "required"} + outputPath: {description: "The location the output VCF file should be written.", category: "common"} + fasta: {description: "Equivalent to bcftools norm's `--fasta` option.", category: "advanced"} + regions: {description: "Equivalent to bcftools norm's `--regions` option.", category: "advanced"} + splitMultiallelicSites: {description: "Whether multiallelic lines should be split up.", category: "advanced"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + diskGb: {description: "The amount of disk space needed for this job in GiB.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputVcf: {description: "Normalized VCF file."} + outputVcfIndex: {description: "Index of the normalized VCF file."} + } +} + +task Sort { + input { + File inputFile + String outputPath = "output.vcf.gz" + String tmpDir = "./sorting-tmp" + + String memory = "5GiB" + Int timeMinutes = 1 + ceil(size(inputFile, "G")) * 5 + String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2" + } + + Boolean compressed = basename(outputPath) != basename(outputPath, ".gz") + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" ~{tmpDir} + bcftools sort \ + -o ~{outputPath} \ + -O ~{true="z" false="v" compressed} \ + -T ~{tmpDir} \ + ~{inputFile} + + ~{if compressed then 'bcftools index --tbi ~{outputPath}' else ''} + } + + output { + File outputVcf = outputPath + File? outputVcfIndex = outputPath + ".tbi" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputFile: {description: "A vcf or bcf file.", category: "required"} + outputPath: {description: "The location the output VCF file should be written.", category: "common"} + tmpDir: {description: "The location of the temporary files during the bcftools sorting.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task.
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputVcf: {description: "Sorted VCF file."} + outputVcfIndex: {description: "Index of sorted VCF file."} + } +} + +task Stats { input { - File bcf - String outputPath = "./bcftools/SV.vcf" - String dockerImage = "quay.io/biocontainers/bcftools:1.9--ha228f0b_3" + File inputVcf + File inputVcfIndex + String outputPath = basename(inputVcf) + ".stats" + Boolean firstAlleleOnly = false + Boolean splitByID = false + Array[String] samples = [] + Boolean verbose = false + + File? compareVcf + File? compareVcfIndex + String? afBins + String? afTag + String? collapse + String? depth + String? exclude + File? exons + String? applyFilters + File? fastaRef + File? fastaRefIndex + String? include + String? regions + File? regionsFile + File? samplesFile + String? targets + File? targetsFile + String? userTsTv + + Int threads = 0 + String memory = "256MiB" + Int timeMinutes = 1 + 2* ceil(size(select_all([inputVcf, compareVcf]), "G")) # TODO: Estimate, 2 minutes per GB, refine later. + String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2" } command { set -e + mkdir -p $(dirname ~{outputPath}) + bcftools stats \ + ~{"--af-bins " + afBins} \ + ~{"--af-tag " + afTag} \ + ~{true="--1st-allele-only" false="" firstAlleleOnly} \ + ~{"--collapse " + collapse} \ + ~{"--depth " + depth} \ + ~{"--exclude " + exclude} \ + ~{"--exons " + exons} \ + ~{"--apply-filters " + applyFilters} \ + ~{"--fasta-ref " + fastaRef} \ + ~{"--include " + include} \ + ~{true="--split-by-ID" false="" splitByID} \ + ~{"--regions " + regions} \ + ~{"--regions-file " + regionsFile} \ + ~{true="--samples" false="" length(samples) > 0} ~{sep="," samples} \ + ~{"--samples-file " + samplesFile} \ + ~{"--targets " + targets} \ + ~{"--targets-file " + targetsFile} \ + ~{"--user-tstv " + userTsTv} \ + --threads ~{threads} \ + ~{true="--verbose" false="" verbose} \ + ~{inputVcf} ~{compareVcf} > ~{outputPath} + } + + output { + File stats = outputPath + } + + runtime { + cpu: threads + 1 + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputVcf: {description: "The VCF to be analysed.", category: "required"} + inputVcfIndex: {description: "The index for the input VCF.", category: "required"} + outputPath: {description: "The location the output VCF file should be written.", category: "common"} + firstAlleleOnly: {description: "Include only 1st allele at multiallelic sites.", category: "advanced"} + splitByID: {description: "Collect stats for sites with ID separately (known vs novel).", category: "advanced"} + samples: {description: "List of samples for sample stats, \"-\" to include all samples.", category: "advanced"} + verbose: {description: "Produce verbose per-site and per-sample output.", category: "advanced"} + compareVcf: {description: "When inputVcf and compareVCF are given, the program generates separate stats for intersection and the complements. 
By default only sites are compared; samples must be given to also include sample columns.", category: "common"} + compareVcfIndex: {description: "Index for the compareVcf.", category: "common"} + afBins: {description: "Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\n0.5\n1).", category: "advanced"} + afTag: {description: "Allele frequency tag to use, by default estimated from AN,AC or GT.", category: "advanced"} + collapse: {description: "Treat records as identical for the given type: `snps`, `indels`, `both`, `all`, `some` or `none` (see man page for details).", category: "advanced"} + depth: {description: "Depth distribution: min,max,bin size [0,500,1].", category: "advanced"} + exclude: {description: "Exclude sites for which the expression is true (see man page for details).", category: "advanced"} + exons: {description: "Tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed).", category: "advanced"} + applyFilters: {description: "Require at least one of the listed FILTER strings (e.g. \"PASS,.\").", category: "advanced"} + fastaRef: {description: "Faidx indexed reference sequence file to determine INDEL context.", category: "advanced"} + fastaRefIndex: {description: "Index file (.fai) for fastaRef. Must be supplied if fastaRef is supplied.", category: "advanced"} + include: {description: "Select sites for which the expression is true (see man page for details).", category: "advanced"} + regions: {description: "Restrict to comma-separated list of regions.", category: "advanced"} + regionsFile: {description: "Restrict to regions listed in a file.", category: "advanced"} + samplesFile: {description: "File of samples to include.", category: "advanced"} + targets: {description: "Similar to regions but streams rather than index-jumps.", category: "advanced"} + targetsFile: {description: "Similar to regionsFile but streams rather than index-jumps.", category: "advanced"} + userTsTv: {description: "Collect Ts/Tv stats for any tag using the given binning [0:1:100].", category: "advanced"} + threads: {description: "Number of extra decompression threads [0].", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + stats: {description: "Text file with stats, suitable for machine processing; it can be plotted using plot-vcfstats."} + } +} + +task View { + input { + File inputFile + File? inputFileIndex + String outputPath = "output.vcf" + Boolean excludeUncalled = false + + String? exclude + String? include + String? 
region + Array[String] samples = [] + + String memory = "256MiB" + Int timeMinutes = 1 + ceil(size(inputFile, "G")) + String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2" + } + + Boolean compressed = basename(outputPath) != basename(outputPath, ".gz") + + command { set -e + ls ~{inputFile} ~{inputFileIndex} # dxCompiler localization workaround + mkdir -p "$(dirname ~{outputPath})" - bcftools view ~{bcf} -O v -o ~{outputPath} + bcftools view \ + ~{"--exclude " + exclude} \ + ~{"--include " + include} \ + ~{true="--exclude-uncalled" false="" excludeUncalled} \ + ~{if length(samples) > 0 then "-s" else ""} ~{sep="," samples} \ + -o ~{outputPath} \ + -O ~{true="z" false="v" compressed} \ + ~{inputFile} \ + ~{region} + + ~{if compressed then 'bcftools index --tbi ~{outputPath}' else ''} } output { File outputVcf = outputPath + File? outputVcfIndex = outputPath + ".tbi" } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - bcf: {description: "The generated BCF from an SV caller", category: "required"} + # inputs + inputFile: {description: "A vcf or bcf file.", category: "required"} + inputFileIndex: {description: "The index for the input file.", category: "common"} outputPath: {description: "The location the output VCF file should be written.", category: "common"} + include: {description: "Select sites for which the expression is true (see man page for details).", category: "advanced"} + exclude: {description: "Exclude sites for which the expression is true (see man page for details).", category: "advanced"} + region: {description: "The region to retrieve from the VCF file.", category: "common"} + excludeUncalled: {description: "Exclude sites without a called genotype (see man page for details).", category: "advanced"} + samples: {description: "A list of sample names to include.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputVcf: {description: "VCF file."} + outputVcfIndex: {description: "Index of VCF file."} } } diff --git a/bedtools.wdl b/bedtools.wdl index 4f39e2a8..64fccc7b 100644 --- a/bedtools.wdl +++ b/bedtools.wdl @@ -24,8 +24,11 @@ task Complement { input { File faidx File inputBed - String dockerImage = "quay.io/biocontainers/bedtools:2.23.0--hdbcaa40_3" String outputBed = basename(inputBed, "\.bed") + ".complement.bed" + + String memory = "~{512 + ceil(size([inputBed, faidx], "MiB"))}MiB" + Int timeMinutes = 1 + ceil(size([inputBed, faidx], "G")) + String dockerImage = "quay.io/biocontainers/bedtools:2.23.0--hdbcaa40_3" } # Use a fasta index file to get the genome sizes. And convert that to the @@ -44,20 +47,71 @@ task Complement { } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - faidx: {description: "The fasta index (.fai) file from which to extract the genome sizes", - category: "required"} - inputBed: {description: "The inputBed to complement", - category: "required"} - outputBed: {description: "The path to write the output to", - category: "advanced"} - dockerImage: { - description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", - category: "advanced" - } + # inputs + faidx: {description: "The fasta index (.fai) file from which to extract the genome sizes.", category: "required"} + inputBed: {description: "The inputBed to complement.", category: "required"} + outputBed: {description: "The path to write the output to.", category: "advanced"} + memory: {description: "The amount of memory needed for the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + complementBed: {description: "All intervals in a genome that are not covered by at least one interval in the input file."} + } +} + +task Coverage { + input { + File genomeFile + File a + File? aIndex + File b + File? bIndex + String outputPath = "./coverage.tsv" + + String memory = "8GiB" + Int timeMinutes = 320 + String dockerImage = "quay.io/biocontainers/bedtools:2.30.0--h7d7f7ad_2" + } + + command { + bedtools coverage \ + -sorted \ + -g ~{genomeFile} \ + -a ~{a} \ + -b ~{b} \ + -d \ + > ~{outputPath} + } + + output { + File coverageTsv = outputPath + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + genomeFile: {description: "A file listing the chromosomes and their lengths.", category: "required"} + a: {description: "The file containing the regions for which the coverage will be counted.", category: "required"} + aIndex: {description: "An index for the file given as `a`.", category: "common"} + b: {description: "The file in which the coverage will be counted. Likely a BAM file.", category: "required"} + bIndex: {description: "An index for the file given as `b`.", category: "common"} + outputPath: {description: "The path the output will be written to.", category: "common"} + + memory: {description: "The amount of memory needed for the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + } } @@ -65,10 +119,14 @@ task Merge { input { File inputBed String outputBed = "merged.bed" + + String memory = "~{512 + ceil(size(inputBed, "MiB"))}MiB" + Int timeMinutes = 1 + ceil(size(inputBed, "G")) String dockerImage = "quay.io/biocontainers/bedtools:2.23.0--hdbcaa40_3" } command { + set -e bedtools merge -i ~{inputBed} > ~{outputBed} } @@ -77,18 +135,21 @@ task Merge { } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - inputBed: {description: "The bed to merge", - category: "required"} - outputBed: {description: "The path to write the output to", - category: "advanced"} - dockerImage: { - description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", - category: "advanced" - } + # inputs + inputBed: {description: "The bed to merge.", category: "required"} + outputBed: {description: "The path to write the output to.", category: "advanced"} + memory: {description: "The amount of memory needed for the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + mergedBed: {description: "Merged bed file."} } } @@ -97,6 +158,9 @@ task MergeBedFiles { input { Array[File]+ bedFiles String outputBed = "merged.bed" + + String memory = "~{512 + ceil(size(bedFiles, "MiB"))}MiB" + Int timeMinutes = 1 + ceil(size(bedFiles, "G")) String dockerImage = "quay.io/biocontainers/bedtools:2.23.0--hdbcaa40_3" } @@ -111,17 +175,21 @@ task MergeBedFiles { } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } + parameter_meta { - bedFiles: {description: "The bed files to merge", - category: "required"} - outputBed: {description: "The path to write the output to", - category: "advanced"} - dockerImage: { - description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced" - } + # inputs + bedFiles: {description: "The bed files to merge.", category: "required"} + outputBed: {description: "The path to write the output to.", category: "advanced"} + memory: {description: "The amount of memory needed for the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + mergedBed: {description: "Merged bed file."} } } @@ -134,10 +202,14 @@ task Sort { Boolean chrThenSizeD = false Boolean chrThenScoreA = false Boolean chrThenScoreD = false + String outputBed = "output.sorted.bed" + File? genome File? 
faidx - String outputBed = "output.sorted.bed" - String dockerImage = "quay.io/biocontainers/bedtools:2.23.0--hdbcaa40_3" + + String memory = "~{512 + ceil(size(inputBed, "MiB"))}MiB" + Int timeMinutes = 1 + ceil(size(inputBed, "GiB")) + String dockerImage = "quay.io/biocontainers/bedtools:2.31.1--hf5e1c6e_2" } command { @@ -152,7 +224,7 @@ task Sort { ~{true="-chrThenScoreA" false="" chrThenScoreA} \ ~{true="-chrThenScoreD" false="" chrThenScoreD} \ ~{"-g " + genome} \ - ~{"-faidx" + faidx} \ + ~{"-faidx " + faidx} \ > ~{outputBed} } @@ -161,28 +233,64 @@ task Sort { } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } + + parameter_meta { + # inputs + inputBed: {description: "The bed to sort.", category: "required"} + sizeA: {description: "Sort by feature size in ascending order.", category: "common"} + sizeD: {description: "Sort by feature size in descending order.", category: "common"} + chrThenSizeA: {description: "Sort by chromosome (asc), then by feature size (asc).", category: "common"} + chrThenSizeD: {description: "Sort by chromosome (asc), then by feature size (desc).", category: "common"} + chrThenScoreA: {description: "Sort by chromosome (asc), then by score (asc).", category: "common"} + chrThenScoreD: {description: "Sort by chromosome (asc), then by score (desc).", category: "common"} + outputBed: {description: "The path to write the output to.", category: "advanced"} + genome: {description: "Define sort order by order of tab-delimited file with chromosome names in the first column.", category: "advanced"} + faidx: {description: "Define sort order by the chromosome order of a fasta index (.fai) file.", category: "advanced"} + memory: {description: "The amount of memory needed for the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + sortedBed: {description: "The sorted bed file."} + } } task Intersect { input { File regionsA File regionsB - # Giving a faidx file will set the sorted option. - File? faidx String outputBed = "intersect.bed" - String dockerImage = "quay.io/biocontainers/bedtools:2.23.0--hdbcaa40_3" + + File? faidx # Giving a faidx file will set the sorted option. + + Boolean writeA = false + Boolean writeB = false + Boolean stranded = false + Boolean nonamecheck = false + + String memory = "~{512 + ceil(size([regionsA, regionsB], "MiB"))}MiB" + Int timeMinutes = 1 + ceil(size([regionsA, regionsB], "GiB")) + String dockerImage = "quay.io/biocontainers/bedtools:2.31.1--hf5e1c6e_2" } + Boolean sorted = defined(faidx) command { set -e + mkdir -p "$(dirname ~{outputBed})" ~{"cut -f1,2 " + faidx} ~{true="> sorted.genome" false ="" sorted} bedtools intersect \ -a ~{regionsA} \ -b ~{regionsB} \ + ~{true="-wa" false="" writeA} \ + ~{true="-wb" false="" writeB} \ + ~{true="-s" false="" stranded} \ ~{true="-sorted" false="" sorted} \ + ~{true="-nonamecheck" false="" nonamecheck} \ + ~{true="-g sorted.genome" false="" sorted} \ > ~{outputBed} } @@ -192,21 +300,28 @@ task Intersect { } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - faidx: {description: "The fasta index (.fai) file that is used to create the genome file required for sorted output. 
Implies sorted option.", - category: "common"} - regionsA: {description: "Region file a to intersect", - category: "required"} - regionsB: {description: "Region file b to intersect", - category: "required"} - outputBed: {description: "The path to write the output to", - category: "advanced"} - dockerImage: { - description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced" - } + # inputs + regionsA: {description: "Region file a to intersect.", category: "required"} + regionsB: {description: "Region file b to intersect.", category: "required"} + outputBed: {description: "The path to write the output to.", category: "advanced"} + faidx: {description: "The fasta index (.fai) file that is used to create the genome file required for sorted output. Implies sorted option.", category: "common"} + + writeA: {description: "Write the original entry in A for each overlap.", category: "advanced"} + writeB: {description: "Write the original entry in B for each overlap. Useful for knowing what A overlaps.", category: "advanced"} + stranded: {description: "Force “strandedness”. That is, only report hits in B that overlap A on the same strand. By default, overlaps are reported without respect to strand.", category: "advanced"} + nonamecheck: {description: "Disable the bedtools intersect name check. This is used to catch chr1 vs chr01 or chr1 vs 1 naming inconsistencies. However, it throws an error for GIAB hg38 which has capital letters. https://github.com/arq5x/bedtools2/issues/648", category: "advanced"} + + memory: {description: "The amount of memory needed for the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + intersectedBed: {description: "The intersected bed file."} } } diff --git a/biopet.wdl b/biopet.wdl new file mode 100644 index 00000000..ea8a36c8 --- /dev/null +++ b/biopet.wdl @@ -0,0 +1,60 @@ +version 1.0 + +# Copyright (c) 2025 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +task ValidateFastq { + input { + File inputRead1 + File? 
inputRead2 + + String memory = "1GiB" + Int timeMinutes = 5 + ceil(size(inputRead1, "GiB")) + String dockerImage = "quay.io/biocontainers/biopet-validatefastq:0.1.1--hdfd78af_3" + } + + command { + set -e + java -jar /usr/local/share/biopet-validatefastq-0.1.1-3/validatefastq-assembly-0.1.1.jar \ + --fastq1 ~{inputRead1} \ + ~{"--fastq2 " + inputRead2} + } + + output { + } + + runtime { + cpu: 1 + memory: memory + docker: dockerImage + time_minutes: timeMinutes + } + + parameter_meta { + # inputs + inputRead1: {description: "The location of the first FASTQ file (first reads for pairs, in case of paired-end sequencing).", category: "required"} + inputRead2: {description: "The location of the paired end reads.", category: "common"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + } +} diff --git a/biopet/bamstats.wdl b/biopet/bamstats.wdl deleted file mode 100644 index 7def9aec..00000000 --- a/biopet/bamstats.wdl +++ /dev/null @@ -1,69 +0,0 @@ -version 1.0 - -# Copyright (c) 2017 Leiden University Medical Center -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import "../common.wdl" as common - -task Generate { - input { - String? preCommand - File? toolJar - IndexedBamFile bam - File? bedFile - Boolean scatterMode = false - Boolean onlyUnmapped = false - Boolean tsvOutputs = false - String outputDir - Reference? 
reference - - String memory = "16G" - String javaXmx = "8G" - } - - File referenceFasta = if defined(reference) then select_first([reference]).fasta else "" - - String toolCommand = if defined(toolJar) - then "java -Xmx~{javaXmx} -jar " + toolJar - else "biopet-bamstats -Xmx~{javaXmx}" - - command { - set -e -o pipefail - ~{preCommand} - mkdir -p ~{outputDir} - ~{toolCommand} Generate \ - --bam ~{bam.file} \ - ~{"--bedFile " + bedFile} \ - ~{true="--reference" false="" defined(reference)} ~{referenceFasta} \ - ~{true="--onlyUnmapped" false="" onlyUnmapped} \ - ~{true="--scatterMode" false="" scatterMode} \ - ~{true="--tsvOutputs" false="" tsvOutputs} \ - --outputDir ~{outputDir} - } - - output { - File json = outputDir + "/bamstats.json" - File summaryJson = outputDir + "/bamstats.summary.json" - } - - runtime { - memory: memory - } -} \ No newline at end of file diff --git a/biopet/biopet.wdl b/biopet/biopet.wdl deleted file mode 100644 index ec64fb4b..00000000 --- a/biopet/biopet.wdl +++ /dev/null @@ -1,560 +0,0 @@ -version 1.0 - -# Copyright (c) 2017 Leiden University Medical Center -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import "../common.wdl" - -task BaseCounter { - input { - String? preCommand - File? 
toolJar - IndexedBamFile bam - File refFlat - String outputDir - String prefix - - String memory = "14G" - String javaXmx = "4G" - } - - String toolCommand = if defined(toolJar) - then "java -Xmx~{javaXmx} -jar " + toolJar - else "biopet-basecounter -Xmx~{javaXmx}" - - command { - set -e -o pipefail - mkdir -p ~{outputDir} - ~{preCommand} - ~{toolCommand} \ - -b ~{bam.file} \ - -r ~{refFlat} \ - -o ~{outputDir} \ - -p ~{prefix} - } - - output { - File exonAntisense = outputDir + "/" + prefix + ".base.exon.antisense.counts" - File exon = outputDir + "/" + prefix + ".base.exon.counts" - File exonMergeAntisense = outputDir + "/" + prefix + ".base.exon.merge.antisense.counts" - File exonMerge = outputDir + "/" + prefix + ".base.exon.merge.counts" - File exonMergeSense = outputDir + "/" + prefix + ".base.exon.merge.sense.counts" - File exonSense = outputDir + "/" + prefix + ".base.exon.sense.counts" - File geneAntisense = outputDir + "/" + prefix + ".base.gene.antisense.counts" - File gene = outputDir + "/" + prefix + ".base.gene.counts" - File geneExonicAntisense = outputDir + "/" + prefix + ".base.gene.exonic.antisense.counts" - File geneExonic = outputDir + "/" + prefix + ".base.gene.exonic.counts" - File geneExonicSense = outputDir + "/" + prefix + ".base.gene.exonic.sense.counts" - File geneIntronicAntisense = outputDir + "/" + prefix + ".base.gene.intronic.antisense.counts" - File geneIntronic = outputDir + "/" + prefix + ".base.gene.intronic.counts" - File geneIntronicSense = outputDir + "/" + prefix + ".base.gene.intronic.sense.counts" - File geneSense = outputDir + "/" + prefix + ".base.gene.sense.counts" - File intronAntisense = outputDir + "/" + prefix + ".base.intron.antisense.counts" - File intron = outputDir + "/" + prefix + ".base.intron.counts" - File intronMergeAntisense = outputDir + "/" + prefix + ".base.intron.merge.antisense.counts" - File intronMerge = outputDir + "/" + prefix + ".base.intron.merge.counts" - File intronMergeSense = outputDir + "/" + prefix + ".base.intron.merge.sense.counts" - File intronSense = outputDir + "/" + prefix + ".base.intron.sense.counts" - File metaExonsNonStranded = outputDir + "/" + prefix + ".base.metaexons.non_stranded.counts" - File metaExonsStrandedAntisense = outputDir + "/" + prefix + ".base.metaexons.stranded.antisense.counts" - File metaExonsStranded = outputDir + "/" + prefix + ".base.metaexons.stranded.counts" - File metaExonsStrandedSense = outputDir + "/" + prefix + ".base.metaexons.stranded.sense.counts" - File transcriptAntisense = outputDir + "/" + prefix + ".base.transcript.antisense.counts" - File transcript = outputDir + "/" + prefix + ".base.transcript.counts" - File transcriptExonicAntisense = outputDir + "/" + prefix + ".base.transcript.exonic.antisense.counts" - File transcriptExonic = outputDir + "/" + prefix + ".base.transcript.exonic.counts" - File transcriptExonicSense = outputDir + "/" + prefix + ".base.transcript.exonic.sense.counts" - File transcriptIntronicAntisense = outputDir + "/" + prefix + ".base.transcript.intronic.antisense.counts" - File transcriptIntronic = outputDir + "/" + prefix + ".base.transcript.intronic.counts" - File transcriptIntronicSense = outputDir + "/" + prefix + ".base.transcript.intronic.sense.counts" - File transcriptSense = outputDir + "/" + prefix + ".base.transcript.sense.counts" - } - - runtime { - memory: memory - } -} - -task ExtractAdaptersFastqc { - input { - File inputFile - String outputDir - String adapterOutputFilePath = outputDir + "/adapter.list" - String 
contamsOutputFilePath = outputDir + "/contaminations.list" - Boolean? skipContams - File? knownContamFile - File? knownAdapterFile - Float? adapterCutoff - Boolean? outputAsFasta - - String memory = "40G" # This is ridiculous, but needed due to vmem monitoring on SGE. - String javaXmx = "8G" - String dockerImage = "quay.io/biocontainers/biopet-extractadaptersfastqc:0.2--1" - } - - command { - set -e - mkdir -p ~{outputDir} - biopet-extractadaptersfastqc -Xmx~{javaXmx} \ - --inputFile ~{inputFile} \ - ~{"--adapterOutputFile " + adapterOutputFilePath } \ - ~{"--contamsOutputFile " + contamsOutputFilePath } \ - ~{"--knownContamFile " + knownContamFile} \ - ~{"--knownAdapterFile " + knownAdapterFile} \ - ~{"--adapterCutoff " + adapterCutoff} \ - ~{true="--skipContams" false="" skipContams} \ - ~{true="--outputAsFasta" false="" outputAsFasta} - } - - output { - File adapterOutputFile = adapterOutputFilePath - File contamsOutputFile = contamsOutputFilePath - Array[String] adapterList = read_lines(adapterOutputFile) - Array[String] contamsList = read_lines(contamsOutputFile) - } - - runtime { - memory: memory - docker: dockerImage - } -} - -task FastqSplitter { - input { - String? preCommand - File inputFastq - Array[String]+ outputPaths - File? toolJar - - String memory = "12G" - String javaXmx = "4G" - String dockerImage = "quay.io/biocontainers/biopet-fastqsplitter:0.1--2" - } - - command { - set -e - mkdir -p $(dirname ~{sep=') $(dirname ' outputPaths}) - biopet-fastqsplitter -Xmx~{javaXmx} \ - -I ~{inputFastq} \ - -o ~{sep=' -o ' outputPaths} - } - - output { - Array[File] chunks = outputPaths - } - - runtime { - memory: memory - docker: dockerImage - } -} - -task FastqSync { - input { - String? preCommand - FastqPair refFastq - FastqPair inputFastq - String out1path - String out2path - File? toolJar - - String memory = "10G" - String javaXmx = "4G" - } - - String toolCommand = if defined(toolJar) - then "java -Xmx~{javaXmx} -jar " + toolJar - else "biopet-fastqsync -Xmx~{javaXmx}" - - command { - set -e -o pipefail - ~{preCommand} - mkdir -p $(dirname ~{out1path}) $(dirname ~{out2path}) - ~{toolCommand} \ - --in1 ~{inputFastq.R1} \ - --in2 ~{inputFastq.R2} \ - --ref1 ~{refFastq.R1} \ - --ref2 ~{refFastq.R2} \ - --out1 ~{out1path} \ - --out2 ~{out2path} - } - - output { - FastqPair out1 = object { - R1: out1path, - R2: out2path - } - } - - runtime { - memory: memory - } -} - -task ReorderGlobbedScatters { - input { - Array[File]+ scatters - - # Should not be changed from the main pipeline. As it should not influence results. - # The 3.7-slim container is 143 mb on the filesystem. 3.7 is 927 mb. - # The slim container is sufficient for this small task. - String dockerImage = "python:3.7-slim" - } - - command <<< - set -e - # Copy all the scatter files to the CWD so the output matches paths in - # the cwd. - for file in ~{sep=" " scatters} - do cp $file . 
- done - python << CODE - from os.path import basename - scatters = ['~{sep="','" scatters}'] - splitext = [basename(x).split(".") for x in scatters] - splitnum = [x.split("-") + [y] for x,y in splitext] - ordered = sorted(splitnum, key=lambda x: int(x[1])) - merged = ["{}-{}.{}".format(x[0],x[1],x[2]) for x in ordered] - for x in merged: - print(x) - CODE - >>> - - output { - Array[File] reorderedScatters = read_lines(stdout()) - } - - runtime { - docker: dockerImage - # 4 gigs of memory to be able to build the docker image in singularity - memory: "4G" - } - - parameter_meta { - scatters: {description: "The files which should be ordered.", category: "required"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} - } -} - -task ScatterRegions { - input { - File referenceFasta - File referenceFastaDict - Int? scatterSize - File? regions - Boolean notSplitContigs = false - File? bamFile - File? bamIndex - - String memory = "24G" - String javaXmx = "8G" - String dockerImage = "quay.io/biocontainers/biopet-scatterregions:0.2--0" - } - - # OutDirPath must be defined here because the glob process relies on - # linking. This path must be in the containers filesystem, otherwise the - # linking does not work. - String outputDirPath = "scatters" - - command <<< - set -e -o pipefail - mkdir -p ~{outputDirPath} - biopet-scatterregions -Xmx~{javaXmx} \ - -R ~{referenceFasta} \ - -o ~{outputDirPath} \ - ~{"-s " + scatterSize} \ - ~{"-L " + regions} \ - ~{"--bamFile " + bamFile} \ - ~{true="--notSplitContigs" false="" notSplitContigs} - - # Glob messes with order of scatters (10 comes before 1), which causes - # problems at gatherGvcfs - # Therefore we reorder the scatters with python. - python << CODE - import os - scatters = os.listdir("~{outputDirPath}") - splitext = [ x.split(".") for x in scatters] - splitnum = [x.split("-") + [y] for x,y in splitext] - ordered = sorted(splitnum, key=lambda x: int(x[1])) - merged = ["~{outputDirPath}/{}-{}.{}".format(x[0],x[1],x[2]) for x in ordered] - for x in merged: - print(x) - CODE - >>> - - output { - Array[File] scatters = read_lines(stdout()) - } - - runtime { - docker: dockerImage - memory: memory - } - - parameter_meta { - referenceFasta: {description: "The reference fasta file.", category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", - category: "required"} - scatterSize: {description: "Equivalent to biopet scatterregions' `-s` option.", category: "common"} - regions: {description: "The regions to be scattered.", category: "advanced"} - notSplitContigs: {description: "Equivalent to biopet scatterregions' `--notSplitContigs` flag.", - category: "advanced"} - bamFile: {description: "Equivalent to biopet scatterregions' `--bamfile` option.", - category: "advanced"} - bamIndex: {description: "The index for the bamfile given through bamFile.", category: "advanced"} - - memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} - } -} - -task ValidateAnnotation { - input { - File? refRefflat - File? 
gtfFile - Reference reference - - String memory = "9G" - String javaXmx = "3G" - String dockerImage = "quay.io/biocontainers/biopet-validateannotation:0.1--0" - } - - command { - biopet-validateannotation -Xmx~{javaXmx} \ - ~{"-r " + refRefflat} \ - ~{"-g " + gtfFile} \ - -R ~{reference.fasta} - } - - output { - File stderr = stderr() - } - - runtime { - memory: memory - docker: dockerImage - } -} - -task ValidateFastq { - input { - File read1 - File? read2 - String memory = "9G" - String javaXmx = "3G" - String dockerImage = "quay.io/biocontainers/biopet-validatefastq:0.1.1--1" - } - - command { - biopet-validatefastq -Xmx~{javaXmx} \ - --fastq1 ~{read1} \ - ~{"--fastq2 " + read2} - } - - output { - File stderr = stderr() - } - - runtime { - memory: memory - docker: dockerImage - } -} - -task ValidateVcf { - input { - IndexedVcfFile vcf - Reference reference - String memory = "9G" - String javaXmx = "3G" - String dockerImage = "quay.io/biocontainers/biopet-validatevcf:0.1--0" - } - - command { - biopet-validatevcf -Xmx~{javaXmx} \ - -i ~{vcf.file} \ - -R ~{reference.fasta} - } - - output { - File stderr = stderr() - } - - runtime { - memory: memory - docker: dockerImage - } -} - -task VcfStats { - input { - IndexedVcfFile vcf - Reference reference - String outputDir - File? intervals - Array[String]+? infoTags - Array[String]+? genotypeTags - Int? sampleToSampleMinDepth - Int? binSize - Int? maxContigsInSingleJob - Boolean writeBinStats = false - Int localThreads = 1 - Boolean notWriteContigStats = false - Boolean skipGeneral = false - Boolean skipGenotype = false - Boolean skipSampleDistributions = false - Boolean skipSampleCompare = false - String? sparkMaster - Int? sparkExecutorMemory - Array[String]+? sparkConfigValues - - String dockerImage = "quay.io/biocontainers/biopet-vcfstats:1.2--0" - String memory = "12G" - String javaXmx = "4G" - } - - command { - set -e - mkdir -p ~{outputDir} - biopet-vcfstats -Xmx~{javaXmx} \ - -I ~{vcf.file} \ - -R ~{reference.fasta} \ - -o ~{outputDir} \ - -t ~{localThreads} \ - ~{"--intervals " + intervals} \ - ~{true="--infoTag" false="" defined(infoTags)} ~{sep=" --infoTag " infoTags} \ - ~{true="--genotypeTag" false="" defined(genotypeTags)} ~{sep=" --genotypeTag " - genotypeTags} \ - ~{"--sampleToSampleMinDepth " + sampleToSampleMinDepth} \ - ~{"--binSize " + binSize} \ - ~{"--maxContigsInSingleJob " + maxContigsInSingleJob} \ - ~{true="--writeBinStats" false="" writeBinStats} \ - ~{true="--notWriteContigStats" false="" notWriteContigStats} \ - ~{true="--skipGeneral" false="" skipGeneral} \ - ~{true="--skipGenotype" false="" skipGenotype} \ - ~{true="--skipSampleDistributions" false="" skipSampleDistributions} \ - ~{true="--skipSampleCompare" false="" skipSampleCompare} \ - ~{"--sparkMaster " + sparkMaster} \ - ~{"--sparkExecutorMemory " + sparkExecutorMemory} \ - ~{true="--sparkConfigValue" false="" defined(sparkConfigValues)} ~{ - sep=" --sparkConfigValue" sparkConfigValues} - } - - output { - File? general = outputDir + "/general.tsv" - File? genotype = outputDir + "/genotype.tsv" - File? sampleDistributionAvailableAggregate = outputDir + - "/sample_distributions/Available.aggregate.tsv" - File? sampleDistributionAvailable = outputDir + "/sample_distributions/Available.tsv" - File? sampleDistributionCalledAggregate = outputDir + - "/sample_distributions/Called.aggregate.tsv" - File? sampleDistributionCalled = outputDir + "/sample_distributions/Called.tsv" - File? 
sampleDistributionFilteredAggregate = outputDir + - "/sample_distributions/Filtered.aggregate.tsv" - File? sampleDistributionFiltered = outputDir + "/sample_distributions/Filtered.tsv" - File? sampleDistributionHetAggregate = outputDir + "/sample_distributions/Het.aggregate.tsv" - File? sampleDistributionHetNoNRefAggregate = outputDir + - "/sample_distributions/HetNonRef.aggregate.tsv" - File? sampleDistributionHetNonRef = outputDir + "/sample_distributions/HetNonRef.tsv" - File? sampleDistributionHet = outputDir + "/sample_distributions/Het.tsv" - File? sampleDistributionHomAggregate = outputDir + "/sample_distributions/Hom.aggregate.tsv" - File? sampleDistributionHomRefAggregate = outputDir + - "/sample_distributions/HomRef.aggregate.tsv" - File? sampleDistributionHomRef = outputDir + "/sample_distributions/HomRef.tsv" - File? sampleDistributionHom = outputDir + "/sample_distributions/Hom.tsv" - File? sampleDistributionHomVarAggregate = outputDir + - "/sample_distributions/HomVar.aggregate.tsv" - File? sampleDistributionHomVar = outputDir + "/sample_distributions/HomVar.tsv" - File? sampleDistributionMixedAggregate = outputDir + - "/sample_distributions/Mixed.aggregate.tsv" - File? sampleDistributionMixed = outputDir + "/sample_distributions/Mixed.tsv" - File? sampleDistributionNoCallAggregate = outputDir + - "/sample_distributions/NoCall.aggregate.tsv" - File? sampleDistributionNoCall = outputDir + "/sample_distributions/NoCall.tsv" - File? sampleDistributionNonInformativeAggregate = outputDir + - "/sample_distributions/NonInformative.aggregate.tsv" - File? sampleDistributionNonInformative = outputDir + - "/sample_distributions/NonInformative.tsv" - File? sampleDistributionToalAggregate = outputDir + - "/sample_distributions/Total.aggregate.tsv" - File? sampleDistributionTotal = outputDir + "/sample_distributions/Total.tsv" - File? sampleDistributionVariantAggregate = outputDir + - "/sample_distributions/Variant.aggregate.tsv" - File? sampleDistributionVariant = outputDir + "/sample_distributions/Variant.tsv" - File? sampleCompareAlleleAbs = outputDir + "/sample_compare/allele.abs.tsv" - File? sampleCompareAlleleNonRefAbs = outputDir + "/sample_compare/allele.non_ref.abs.tsv" - File? sampleCompareAlleleRefAbs = outputDir + "/sample_compare/allele.ref.abs.tsv" - File? sampleCompareAlleleRel = outputDir + "/sample_compare/allele.rel.tsv" - File? sampleCompareGenotypeAbs = outputDir + "/sample_compare/genotype.abs.tsv" - File? sampleCompareGenotypeNonRefAbs = outputDir + - "/sample_compare/genotype.non_ref.abs.tsv" - File? sampleCompareGenotypeRefAbs = outputDir + "/sample_compare/genotype.ref.abs.tsv" - File? 
sampleCompareGenotypeRel = outputDir + "/sample_compare/genotype.rel.tsv" - # A glob is easier, but duplicates all the outputs - Array[File] allStats = select_all([ - general, - genotype, - sampleDistributionAvailableAggregate, - sampleDistributionAvailable, - sampleDistributionCalledAggregate, - sampleDistributionCalled, - sampleDistributionFilteredAggregate, - sampleDistributionFiltered, - sampleDistributionHetAggregate, - sampleDistributionHetNoNRefAggregate, - sampleDistributionHetNonRef, - sampleDistributionHet, - sampleDistributionHomAggregate, - sampleDistributionHomRefAggregate, - sampleDistributionHomRef, - sampleDistributionHom, - sampleDistributionHomVarAggregate, - sampleDistributionHomVar, - sampleDistributionMixedAggregate, - sampleDistributionMixed, - sampleDistributionNoCallAggregate, - sampleDistributionNoCall, - sampleDistributionNonInformativeAggregate, - sampleDistributionNonInformative, - sampleDistributionToalAggregate, - sampleDistributionTotal, - sampleDistributionVariantAggregate, - sampleDistributionVariant, - sampleCompareAlleleAbs, - sampleCompareAlleleNonRefAbs, - sampleCompareAlleleRefAbs, - sampleCompareAlleleRel, - sampleCompareGenotypeAbs, - sampleCompareGenotypeNonRefAbs, - sampleCompareGenotypeRefAbs, - sampleCompareGenotypeRel - ]) - } - - runtime { - cpu: localThreads - memory: memory - docker: dockerImage - } -} diff --git a/biopet/sampleconfig.wdl b/biopet/sampleconfig.wdl deleted file mode 100644 index 0fbd466a..00000000 --- a/biopet/sampleconfig.wdl +++ /dev/null @@ -1,140 +0,0 @@ -version 1.0 - -# Copyright (c) 2017 Leiden University Medical Center -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import "../common.wdl" as common - -task SampleConfig { - input { - File? toolJar - String? preCommand - Array[File]+ inputFiles - String keyFilePath - String? sample - String? library - String? readgroup - String? jsonOutputPath - String? tsvOutputPath - - String memory = "8G" - String javaXmx = "16G" - } - - String toolCommand = if defined(toolJar) - then "java -Xmx~{javaXmx} -jar " + toolJar - else "biopet-sampleconfig -Xmx~{javaXmx}" - - command { - set -e -o pipefail - ~{preCommand} - mkdir -p . 
~{"$(dirname " + jsonOutputPath + ")"} ~{"$(dirname " + tsvOutputPath + ")"} - ~{toolCommand} \ - -i ~{sep="-i " inputFiles} \ - ~{"--sample " + sample} \ - ~{"--library " + library} \ - ~{"--readgroup " + readgroup} \ - ~{"--jsonOutput " + jsonOutputPath} \ - ~{"--tsvOutput " + tsvOutputPath} \ - > ~{keyFilePath} - } - - output { - File keysFile = keyFilePath - File? jsonOutput = jsonOutputPath - File? tsvOutput = tsvOutputPath - } - - runtime { - memory: memory - } -} - -task SampleConfigCromwellArrays { - input { - File? toolJar - String? preCommand - Array[File]+ inputFiles - String outputPath - - String memory = "8G" - String javaXmx = "4G" - } - - String toolCommand = if defined(toolJar) - then "java -Xmx~{javaXmx} -jar " + toolJar - else "biopet-sampleconfig -Xmx~{javaXmx}" - - command { - set -e -o pipefail - ~{preCommand} - mkdir -p $(dirname ~{outputPath}) - ~{toolCommand} CromwellArrays \ - -i ~{sep="-i " inputFiles} \ - ~{"-o " + outputPath} - } - - output { - File outputFile = outputPath - } - - runtime { - memory: memory - } -} - -task CaseControl { - input { - File? toolJar - String? preCommand - Array[File]+ inputFiles - Array[File]+ inputIndexFiles - Array[File]+ sampleConfigs - String outputPath - String controlTag = "control" - - String memory = "8G" - String javaXmx = "4G" - } - - String toolCommand = if defined(toolJar) - then "java -Xmx~{javaXmx} -jar " + toolJar - else "biopet-sampleconfig -Xmx~{javaXmx}" - - command { - set -e -o pipefail - ~{preCommand} - mkdir -p $(dirname ~{outputPath}) - ~{toolCommand} CaseControl \ - -i ~{sep=" -i " inputFiles} \ - -s ~{sep=" -s " sampleConfigs} \ - ~{"-o " + outputPath} \ - ~{"--controlTag " + controlTag} - } - - output { - File outputFile = outputPath - CaseControls caseControls = read_json(outputFile) - } - - runtime { - memory: memory - } -} diff --git a/biopet/seqstat.wdl b/biopet/seqstat.wdl deleted file mode 100644 index 6694a759..00000000 --- a/biopet/seqstat.wdl +++ /dev/null @@ -1,63 +0,0 @@ -version 1.0 - -# Copyright (c) 2017 Leiden University Medical Center -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import "../common.wdl" as common - -task Generate { - input { - String? preCommand - File? 
toolJar - FastqPair fastq - String outputFile - String sample - String library - String readgroup - - String memory = "10G" - String javaXmx = "4G" - } - - String toolCommand = if defined(toolJar) - then "java -Xmx~{javaXmx} -jar " + toolJar - else "biopet-seqstat -Xmx~{javaXmx}" - - command { - set -e -o pipefail - ~{preCommand} - mkdir -p $(dirname ~{outputFile}) - ~{toolCommand} Generate \ - --fastqR1 ~{fastq.R1} \ - ~{"--fastqR2 " + fastq.R2} \ - --output ~{outputFile} \ - ~{"--sample " + sample} \ - ~{"--library " + library } \ - ~{"--readgroup " + readgroup } - } - - output { - File json = outputFile - } - - runtime { - memory: memory - } -} \ No newline at end of file diff --git a/biowdl.wdl b/biowdl.wdl index 7aa68b27..f891618e 100644 --- a/biowdl.wdl +++ b/biowdl.wdl @@ -31,7 +31,10 @@ task InputConverter { Boolean skipFileCheck=true Boolean checkFileMd5sums=false Boolean old=false - String dockerImage = "quay.io/biocontainers/biowdl-input-converter:0.2.1--py_0" + + String memory = "128MiB" + Int timeMinutes = 1 + String dockerImage = "quay.io/biocontainers/biowdl-input-converter:0.3.0--pyhdfd78af_0" } command <<< @@ -50,19 +53,23 @@ task InputConverter { } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs samplesheet: {description: "The samplesheet to be processed.", category: "required"} - outputFile: {description: "The location the JSON representation of the samplesheet should be written to.", - category: "advanced"} - skipFileCheck: {description: "Whether or not the existance of the files mentioned in the samplesheet should be checked.", - category: "advanced"} - checkFileMd5sums: {description: "Whether or not the MD5 sums of the files mentioned in the samplesheet should be checked.", - category: "advanced"} + outputFile: {description: "The location the JSON representation of the samplesheet should be written to.", category: "advanced"} + skipFileCheck: {description: "Whether or not the existence of the files mentioned in the samplesheet should be checked.", category: "advanced"} + checkFileMd5sums: {description: "Whether or not the MD5 sums of the files mentioned in the samplesheet should be checked.", category: "advanced"} old: {description: "Whether or not the old samplesheet format should be used.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + memory: {description: "The amount of memory needed for the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + json: {description: "JSON file version of the input sample sheet."} } } diff --git a/bowtie.wdl b/bowtie.wdl index 18fd6146..7e817594 100644 --- a/bowtie.wdl +++ b/bowtie.wdl @@ -1,7 +1,5 @@ version 1.0 -# MIT License -# # Copyright (c) 2018 Leiden University Medical Center # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -28,29 +26,31 @@ task Bowtie { Array[File] readsDownstream = [] String outputPath = "mapped.bam" Array[File]+ indexFiles - Int? seedmms - Int? seedlen - Int? k Boolean best = false Boolean strata = false Boolean allowContain = false + + Int? seedmms + Int? seedlen + Int? k String? 
samRG - Int threads = 1 - String memory = "16G" String picardXmx = "4G" + Int threads = 1 + String memory = "~{5 + ceil(size(indexFiles, "GiB"))}GiB" + Int timeMinutes = 1 + ceil(size(flatten([readsUpstream, readsDownstream]), "G") * 300 / threads) # Image contains bowtie=1.2.2 and picard=2.9.2 String dockerImage = "quay.io/biocontainers/mulled-v2-bfe71839265127576d3cd749c056e7b168308d56:1d8bec77b352cdcf3e9ff3d20af238b33ed96eae-0" } # Assume fastq input with -q flag. - # The output always needs to be SAM as it is piped into Picard SortSam + # The output always needs to be SAM as it is piped into Picard SortSam. # Hence, the --sam flag is used. - command { set -e -o pipefail mkdir -p "$(dirname ~{outputPath})" - bowtie -q \ + bowtie \ + -q \ --sam \ ~{"--seedmms " + seedmms} \ ~{"--seedlen " + seedlen} \ @@ -78,32 +78,31 @@ task Bowtie { runtime { cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs readsUpstream: {description: "The first-/single-end fastq files.", category: "required"} readsDownstream: {description: "The second-end fastq files.", category: "common"} outputPath: {description: "The location the output BAM file should be written to.", category: "common"} indexFiles: {description: "The index files for bowtie.", category: "required"} - seedmms: {description: "Equivalent to bowtie's `--seedmms` option.", category: "advanced"} - seedlen: {description: "Equivalent to bowtie's `--seedlen` option.", category: "advanced"} - k: {description: "Equivalent to bowtie's `-k` option.", category: "advanced"} best: {description: "Equivalent to bowtie's `--best` flag.", category: "advanced"} strata: {description: "Equivalent to bowtie's `--strata` flag.", category: "advanced"} allowContain: {description: "Equivalent to bowtie's `--allow-contain` flag.", category: "advanced"} + seedmms: {description: "Equivalent to bowtie's `--seedmms` option.", category: "advanced"} + seedlen: {description: "Equivalent to bowtie's `--seedlen` option.", category: "advanced"} + k: {description: "Equivalent to bowtie's `-k` option.", category: "advanced"} samRG: {description: "Equivalent to bowtie's `--sam-RG` option.", category: "advanced"} - - picardXmx: {description: "The maximum memory available to the picard (used for sorting the output). Should be lower than `memory` to accommodate JVM overhead and bowtie's memory usage.", - category: "advanced"} + picardXmx: {description: "The maximum memory available to picard (used for sorting the output). Should be lower than `memory` to accommodate JVM overhead and bowtie's memory usage.", category: "advanced"} threads: {description: "The number of threads to use.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "Output alignment file."} + outputBamIndex: {description: "Index of output alignment file."} } } - -struct BowtieIndex { - File fasta - Array[File] indexFiles -} \ No newline at end of file diff --git a/bwa-mem2.wdl b/bwa-mem2.wdl new file mode 100644 index 00000000..b3db0ad1 --- /dev/null +++ b/bwa-mem2.wdl @@ -0,0 +1,118 @@ +version 1.0 + +# Copyright (c) 2017 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task Mem { + input { + File read1 + File? read2 + BwaIndex bwaIndex + String outputPrefix + Boolean sixtyFour = false + Boolean usePostalt = false + Int sortMemoryPerThreadGb = 2 + Int compressionLevel = 1 + + String? readgroup + Int? sortThreads + + Int threads = 4 + Int? memoryGb + Int timeMinutes = 1 + ceil(size([read1, read2], "GiB") * 220 / threads) + # Contains bwa-mem2 2.0 bwakit 0.7.17.dev1 and samtools 1.10. + String dockerImage = "quay.io/biocontainers/mulled-v2-6a15c99309c82b345497d24489bee67bbb76c2f6:1c9c3227b9bf825a8dc9726a25701aa23c0b1f12-0" + } + + # Samtools sort may block the pipe while it is writing data to disk. + # This can lead to cpu underutilization. + # 1 thread if threads is 1. For 2-4 threads 2 sort threads. 3 sort threads for 5-8 threads. + Int estimatedSortThreads = if threads == 1 then 1 else 1 + ceil(threads / 4.0) + Int totalSortThreads = select_first([sortThreads, estimatedSortThreads]) + # BWA-mem2's index files contain 2 BWT indexes of which only one is used. .2bit64 is used by default and + # .8bit32 is used for avx2. + # The larger one of these is the .8bit32 index. Since we do not know beforehand which one is used, we need to accommodate for that. + # The .8bit32 index alone accounts for 57.5% of the index files, and since bwa-mem2 uses slightly more memory than the index, + # we put it at 62% as a safety factor. That means the memory usage for bwa-mem2 will be 53G for a human genome, resulting in 60G total + # on 8 cores with samtools using 3 sort threads. + Int estimatedMemoryGb = 1 + ceil(size(bwaIndex.indexFiles, "G") * 0.62) + sortMemoryPerThreadGb * totalSortThreads + + # The bwa postalt script is commented out as soon as usePostalt = false. + # This hack was tested with bash, dash and ash. It seems that comments in between pipes work for all of them. 
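+ # A worked example of the estimates above (illustrative numbers, not measurements): with threads = 8 the estimate is 1 + ceil(8 / 4.0) = 3 sort threads, and with ~80 GiB of index files and the default sortMemoryPerThreadGb = 2, estimatedMemoryGb = 1 + ceil(80 * 0.62) + 2 * 3 = 57.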
+ command { + set -e + mkdir -p "$(dirname ~{outputPrefix})" + bwa-mem2 mem \ + -t ~{threads} \ + ~{"-R '" + readgroup}~{true="'" false="" defined(readgroup)} \ + ~{bwaIndex.fastaFile} \ + ~{read1} \ + ~{read2} \ + 2> ~{outputPrefix}.log.bwamem | \ + ~{true="" false="#" usePostalt} bwa-postalt.js -p ~{outputPrefix}.hla ~{bwaIndex.fastaFile}~{true=".64.alt" false=".alt" sixtyFour} | \ + samtools sort \ + ~{"-@ " + totalSortThreads} \ + -m ~{sortMemoryPerThreadGb}G \ + -l ~{compressionLevel} \ + - \ + -o ~{outputPrefix}.aln.bam + } + + output { + File outputBam = outputPrefix + ".aln.bam" + File? outputHla = outputPrefix + ".hla" + } + + runtime { + # One extra thread for bwa-postalt + samtools is not needed. + # These only use 5-10% of compute power and not always simultaneously. + cpu: threads + memory: "~{select_first([memoryGb, estimatedMemoryGb])}GiB" + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + read1: {description: "The first-end fastq file.", category: "required"} + read2: {description: "The second-end fastq file.", category: "common"} + bwaIndex: {description: "The BWA index, including (optionally) a .alt file.", category: "required"} + outputPrefix: {description: "The prefix of the output files, including any parent directories.", category: "required"} + sixtyFour: {description: "Whether or not the index uses the '.64' suffixes.", category: "common"} + usePostalt: {description: "Whether to use the postalt script from bwa kit."} + sortMemoryPerThreadGb: {description: "The amount of memory for each sorting thread in gigabytes.", category: "advanced"} + compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"} + readgroup: {description: "A readgroup identifier.", category: "common"} + sortThreads: {description: "The number of threads to use for sorting.", category: "advanced"} + threads: {description: "The number of threads to use for alignment.", category: "advanced"} + memoryGb: {description: "The amount of memory this job will use in gigabytes.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "The produced BAM file."} + outputHla: {description: "The produced HLA file."} + } +} + +struct BwaIndex { + File fastaFile + Array[File] indexFiles +} diff --git a/bwa.wdl b/bwa.wdl index fec2b09f..66b8e8cc 100644 --- a/bwa.wdl +++ b/bwa.wdl @@ -21,115 +21,68 @@ version 1.0 # SOFTWARE. task Mem { - input { - File read1 - File? read2 - BwaIndex bwaIndex - String outputPath - String? readgroup - - Int threads = 4 - String memory = "32G" - String picardXmx = "4G" - # A mulled container is needed to have both picard and bwa in one container. 
- # This container contains: picard (2.18.7), bwa (0.7.17-r1188) - String dockerImage = "quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0" - } - - command { - set -e -o pipefail - mkdir -p "$(dirname ~{outputPath})" - bwa mem \ - ~{"-t " + threads} \ - ~{"-R '" + readgroup}~{true="'" false="" defined(readgroup)} \ - ~{bwaIndex.fastaFile} \ - ~{read1} \ - ~{read2} \ - | picard -Xmx~{picardXmx} SortSam \ - INPUT=/dev/stdin \ - OUTPUT=~{outputPath} \ - SORT_ORDER=coordinate \ - CREATE_INDEX=true - } - - output { - File outputBam = outputPath - File outputBamIndex = sub(outputPath, "\.bam$", ".bai") - } - - runtime { - cpu: threads - memory: memory - docker: dockerImage - } - - parameter_meta { - read1: {description: "The first or single end fastq file.", category: "required"} - read2: {description: "The second end fastq file.", category: "common"} - bwaIndex: {description: "The BWA index files.", category: "required"} - outputPath: {description: "The location the output BAM file should be written to.", category: "required"} - readgroup: {description: "The readgroup to be assigned to the reads. See BWA mem's `-R` option.", category: "common"} - - threads: {description: "The number of threads to use.", category: "advanced"} - memory: {description: "The amount of memory this job will use.", category: "advanced"} - picardXmx: {description: "The maximum memory available to picard SortSam. Should be lower than `memory` to accommodate JVM overhead and BWA mem's memory usage.", - category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} - } -} - -task Kit { input { File read1 File? read2 BwaIndex bwaIndex String outputPrefix - String? readgroup Boolean sixtyFour = false + Boolean usePostalt = false + Boolean useSoftclippingForSupplementary = false + Int sortMemoryPerThreadGb = 2 + Int compressionLevel = 1 - Int threads = 4 - # Samtools uses *additional* threads. So by default this option should - # not be used. + String? readgroup Int? sortThreads - # Compression uses zlib. Higher than level 2 causes enormous slowdowns. - # GATK/Picard default is level 2. - String sortMemoryPerThread = "4G" - Int compressionLevel = 1 - String memory = "32G" - String dockerImage = "biocontainers/bwakit:v0.7.15_cv1" + + Int threads = 4 + Int? memoryGb + Int timeMinutes = 10 + ceil(size([read1, read2], "GiB") * 300 / threads) + # Contains bwa 0.7.17 bwakit 0.7.17.dev1 and samtools 1.10. + String dockerImage = "quay.io/biocontainers/mulled-v2-ad317f19f5881324e963f6a6d464d696a2825ab6:c59b7a73c87a9fe81737d5d628e10a3b5807f453-0" } + # Samtools sort may block the pipe while it is writing data to disk. + # This can lead to cpu underutilization. + # 1 thread if threads is 1. For 2-4 threads 2 sort threads. 3 sort threads for 5-8 threads. + Int estimatedSortThreads = if threads == 1 then 1 else 1 + ceil(threads / 4.0) + Int totalSortThreads = select_first([sortThreads, estimatedSortThreads]) + # BWA needs slightly more memory than the size of the index files (~10%). Add a margin for safety here. + Int estimatedMemoryGb = 10 + ceil(size(bwaIndex.indexFiles, "GiB") * 2) + sortMemoryPerThreadGb * totalSortThreads + + # The bwa postalt script is commented out as soon as usePostalt = false. + # This hack was tested with bash, dash and ash. It seems that comments in between pipes work for all of them. 
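+ # For illustration (assumed numbers, not measurements): with the default threads = 4 the estimate is 1 + ceil(4 / 4.0) = 2 sort threads, and with ~5 GiB of index files and sortMemoryPerThreadGb = 2, estimatedMemoryGb = 10 + ceil(5 * 2) + 2 * 2 = 24.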
command { set -e mkdir -p "$(dirname ~{outputPrefix})" bwa mem \ -t ~{threads} \ + ~{if useSoftclippingForSupplementary then "-Y" else ""} \ ~{"-R '" + readgroup}~{true="'" false="" defined(readgroup)} \ ~{bwaIndex.fastaFile} \ ~{read1} \ ~{read2} \ 2> ~{outputPrefix}.log.bwamem | \ - k8 /opt/conda/bin/bwa-postalt.js \ - -p ~{outputPrefix}.hla \ - ~{bwaIndex.fastaFile}~{true=".64.alt" false=".alt" sixtyFour} | \ - samtools sort \ - ~{"-@ " + sortThreads} \ - -m ~{sortMemoryPerThread} \ + ~{true="" false="#" usePostalt} bwa-postalt.js -p ~{outputPrefix}.hla ~{bwaIndex.fastaFile}~{true=".64.alt" false=".alt" sixtyFour} | \ + samtools sort \ + ~{"-@ " + totalSortThreads} \ + -m ~{sortMemoryPerThreadGb}G \ -l ~{compressionLevel} \ - \ -o ~{outputPrefix}.aln.bam - samtools index ~{outputPrefix}.aln.bam ~{outputPrefix}.aln.bai } output { File outputBam = outputPrefix + ".aln.bam" - File outputBamIndex = outputPrefix + ".aln.bai" + File? outputHla = outputPrefix + ".hla" } runtime { - cpu: threads + 1 # One thread for bwa-postalt + samtools. - memory: memory + # One extra thread for bwa-postalt + samtools is not needed. + # These only use 5-10% of compute power and not always simultaneously. + cpu: threads + memory: "~{select_first([memoryGb, estimatedMemoryGb])}GiB" + time_minutes: timeMinutes docker: dockerImage } @@ -137,21 +90,23 @@ task Kit { # inputs read1: {description: "The first-end fastq file.", category: "required"} read2: {description: "The second-end fastq file.", category: "common"} - bwaIndex: {description: "The BWA index, including a .alt file.", category: "required"} + bwaIndex: {description: "The BWA index, including (optionally) a .alt file.", category: "required"} outputPrefix: {description: "The prefix of the output files, including any parent directories.", category: "required"} - readgroup: {description: "A readgroup identifier.", category: "common"} sixtyFour: {description: "Whether or not the index uses the '.64' suffixes.", category: "common"} - threads: {description: "The number of threads to use for alignment.", category: "advanced"} - sortThreads: {description: "The number of additional threads to use for sorting.", category: "advanced"} - sortMemoryPerThread: {description: "The amount of memory for each sorting thread.", category: "advanced"} + usePostalt: {description: "Whether to use the postalt script from bwa kit."} + useSoftclippingForSupplementary: {description: "Use soft-clipping for supplementary alignments instead of hard-clipping.", category: "common"} + sortMemoryPerThreadGb: {description: "The amount of memory for each sorting thread in gigabytes.", category: "advanced"} compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"} - memory: {description: "The amount of memory this job will use.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + readgroup: {description: "A readgroup identifier.", category: "common"} + sortThreads: {description: "The number of threads to use for sorting.", category: "advanced"} + threads: {description: "The number of threads to use for alignment.", category: "advanced"} + memoryGb: {description: "The amount of memory this job will use in gigabytes.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputBam: "The produced BAM file." - outputBamIndex: "The index of the produced BAM file." + outputBam: {description: "The produced BAM file."} + outputHla: {description: "The produced HLA file."} } } @@ -159,3 +114,48 @@ struct BwaIndex { File fastaFile Array[File] indexFiles } + +task Index { + input { + File fasta + String dockerImage = "quay.io/biocontainers/bwa:0.7.17--hed695b0_7" + Int? timeMinutes = 5 + ceil(size(fasta, "G") * 5) + } + String indexedFile = basename(fasta) + + command { + set -e + cp ~{fasta} ~{indexedFile} + bwa index ~{indexedFile} + } + + output { + BwaIndex index = object { + fastaFile: indexedFile, + indexFiles: [ + indexedFile + ".amb", + indexedFile + ".ann", + indexedFile + ".bwt", + indexedFile + ".pac", + indexedFile + ".sa" + ] + } + } + + runtime { + docker: dockerImage + cpu: 1 + memory: "~{size(fasta, 'G') + 1}GiB" + time_minutes: timeMinutes + } + + parameter_meta { + # inputs + fasta: {description: "Reference fasta file.", category: "required"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + index: {description: "The produced BWA index."} + } +}
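The new Index task copies the fasta into its working directory before indexing, so the generated .amb/.ann/.bwt/.pac/.sa files land next to the fasta file and the returned BwaIndex struct points at co-located files, which is what bwa expects at alignment time. A minimal usage sketch (workflow and file names are hypothetical):

    version 1.0

    import "bwa.wdl" as bwa

    workflow Align {
        input {
            File reference
            File fastq
        }

        # Build the index once, then feed the struct into the aligner.
        call bwa.Index {
            input: fasta = reference
        }

        call bwa.Mem {
            input:
                read1 = fastq,
                bwaIndex = Index.index,
                outputPrefix = "aln/sample1"
        }

        output {
            File bam = Mem.outputBam
        }
    }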
diff --git a/ccs.wdl b/ccs.wdl index 39bb0a19..27db15ab 100644 --- a/ccs.wdl +++ b/ccs.wdl @@ -1,6 +1,6 @@ version 1.0 -# Copyright (c) 2020 Sequencing Analysis Support Core - Leiden University Medical Center +# Copyright (c) 2020 Leiden University Medical Center # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -8,10 +8,10 @@ version 1.0 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE @@ -22,18 +22,29 @@ version 1.0 task CCS { input { + File subreadsFile + String outputPrefix + String logLevel = "WARN" Int minPasses = 3 + Int topPasses = 60 Int minLength = 10 Int maxLength = 50000 Boolean byStrand = false + Boolean skipPolish = false + Boolean all = false + Boolean subreadFallback = false + Boolean allKinetics = false + Boolean hifiKinetics = false + Float minSnr = 2.5 Float minReadQuality = 0.99 - String logLevel = "WARN" - File subreadsFile - String outputPrefix - - Int cores = 4 - String memory = "10G" - String dockerImage = "quay.io/biocontainers/pbccs:4.2.0--0" + + File? subreadsIndexFile + String? chunkString + + Int threads = 2 + String memory = "4GiB" + Int timeMinutes = 1440 + String dockerImage = "quay.io/biocontainers/pbccs:6.0.0--h9ee0642_2" } command { @@ -41,49 +52,74 @@ task CCS { mkdir -p "$(dirname ~{outputPrefix})" ccs \ --min-passes ~{minPasses} \ + --min-snr ~{minSnr} \ + --top-passes ~{topPasses} \ --min-length ~{minLength} \ --max-length ~{maxLength} \ ~{true="--by-strand" false="" byStrand} \ + ~{true="--skip-polish" false="" skipPolish} \ + ~{true="--all" false="" all} \ + ~{true="--subread-fallback" false="" subreadFallback} \ + ~{true="--all-kinetics" false="" allKinetics} \ + ~{true="--hifi-kinetics" false="" hifiKinetics} \ --min-rq ~{minReadQuality} \ --log-level ~{logLevel} \ - --num-threads ~{cores} \ - ~{"--report-file " + outputPrefix + ".ccs.report.txt"} \ + --num-threads ~{threads} \ + ~{"--chunk " + chunkString} \ + ~{"--report-file " + outputPrefix + ".ccs_report.txt"} \ + ~{"--report-json " + outputPrefix + ".ccs.report.json"} \ ~{"--log-file " + outputPrefix + ".ccs.stderr.log"} \ + ~{"--metrics-json " + outputPrefix + ".zmw_metrics.json.gz"} \ ~{subreadsFile} \ ~{outputPrefix + ".ccs.bam"} } output { - File outputCCSfile = outputPrefix + ".ccs.bam" - File outputCCSindexFile = outputPrefix + ".ccs.bam.pbi" - File outputReportFile = outputPrefix + ".ccs.report.txt" - File outputSTDERRfile = outputPrefix + ".ccs.stderr.log" + File ccsBam = outputPrefix + ".ccs.bam" + File ccsBamIndex = outputPrefix + ".ccs.bam.pbi" + File ccsReport = outputPrefix + ".ccs_report.txt" + File ccsJsonReport = outputPrefix + ".ccs.report.json" + File ccsStderr = outputPrefix + ".ccs.stderr.log" + File zmwMetrics = outputPrefix + ".zmw_metrics.json.gz" } runtime { - cpu: cores + cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - minPasses: {description: "Minimum number of full-length subreads required to generate CCS for a ZMW.", category: "advanced"} + subreadsFile: {description: "Subreads input file.", category: "required"} + outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} + logLevel: {description: "Set log level. 
Valid choices: (TRACE, DEBUG, INFO, WARN, FATAL).", category: "advanced"} + minPasses: {description: "Minimum number of full-length subreads required to generate ccs for a ZMW.", category: "advanced"} + topPasses: {description: "Pick at maximum the top N passes for each ZMW.", category: "advanced"} minLength: {description: "Minimum draft length before polishing.", category: "advanced"} maxLength: {description: "Maximum draft length before polishing.", category: "advanced"} byStrand: {description: "Generate a consensus for each strand.", category: "advanced"} + skipPolish: {description: "Only output the initial draft template (faster, less accurate).", category: "advanced"} + all: {description: "Emit all ZMWs.", category: "advanced"} + subreadFallback: {description: "Emit a representative subread, instead of the draft consensus, if polishing failed.", category: "advanced"} + allKinetics: {description: "Calculate mean pulse widths (PW) and interpulse durations (IPD) for every ZMW.", category: "advanced"} + hifiKinetics: {description: "Calculate mean pulse widths (PW) and interpulse durations (IPD) for every HiFi read.", category: "advanced"} + minSnr: {description: "Minimum SNR of subreads to use for generating CCS.", category: "advanced"} minReadQuality: {description: "Minimum predicted accuracy in [0, 1].", category: "common"} - logLevel: {description: "Set log level. Valid choices: (TRACE, DEBUG, INFO, WARN, FATAL).", category: "advanced"} - subreadsFile: {description: "Subreads input file.", category: "required"} - outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} - cores: {description: "The number of cores to be used.", category: "advanced"} + subreadsIndexFile: {description: "Index for the subreads input file, required when using chunkString.", category: "advanced"} + chunkString: {description: "Chunk string (e.g. 1/4, 5/5) for CCS.", category: "advanced"} + threads: {description: "The number of threads to be used.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputCCSfile: {description: "Consensus reads output file."} - outputCCSindexFile: {description: "Index of consensus reads output file."} - outputReportFile: {description: "CCS results report file."} - outputSTDERRfile: {description: "CCS STDERR log file."} + ccsBam: {description: "Consensus reads output file."} + ccsBamIndex: {description: "Index of consensus reads output file."} + ccsReport: {description: "CCS report file."} + ccsJsonReport: {description: "CCS results JSON report file."} + ccsStderr: {description: "CCS STDERR log file."} + zmwMetrics: {description: "ZMW metrics JSON file."} } } diff --git a/centrifuge.wdl b/centrifuge.wdl index 1fbc7be1..41a907ae 100644 --- a/centrifuge.wdl +++ b/centrifuge.wdl @@ -36,7 +36,8 @@ task Build { File?
sizeTable Int threads = 5 - String memory = "20G" + String memory = "20GiB" + Int timeMinutes = 2880 String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5" } @@ -58,12 +59,13 @@ task Build { } output { - Array[File] outputIndex = glob(outputPrefix + "/" + indexBasename + "*.cf") + Array[File] index = glob(outputPrefix + "/" + indexBasename + "*.cf") } runtime { cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } @@ -73,7 +75,7 @@ task Build { conversionTable: {description: "List of UIDs (unique ID) and corresponding taxonomic IDs.", category: "required"} taxonomyTree: {description: "Taxonomic tree (e.g. nodes.dmp).", category: "required"} nameTable: {description: "Name table (e.g. names.dmp).", category: "required"} - referenceFile: {description: "A comma-separated list of FASTA files containing the reference sequences to be aligned to.", category: "required"} + referenceFile: {description: "A comma-separated list of fasta files containing the reference sequences to be aligned to.", category: "required"} indexBasename: {description: "The basename of the index files to write.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} offrate: {description: "The number of rows marked by the indexer.", category: "common"} @@ -82,22 +84,23 @@ task Build { sizeTable: {description: "List of taxonomic IDs and lengths of the sequences belonging to the same taxonomic IDs.", category: "common"} threads: {description: "The number of threads to be used.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputIndex: {description: "Generated Centrifuge index."} + index: {description: "Generated centrifuge index."} } } task Classify { input { + Array[File]+ read1 + Array[File] read2 = [] String inputFormat = "fastq" Boolean phred64 = false Int minHitLength = 22 Array[File]+ indexFiles - Array[File]+ read1 String outputPrefix - Array[File] read2 = [] Int? trim5 Int? trim3 @@ -106,7 +109,8 @@ task Classify { String? 
excludeTaxIDs Int threads = 4 - String memory = "16G" + String memory = "16GiB" + Int timeMinutes = 2880 String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5" } @@ -118,13 +122,12 @@ indexBasename="$(basename ~{sub(indexFiles[0], "\.[0-9]\.cf", "")})" for file in ~{sep=" " indexFiles} do - ln ${file} $PWD/"$(basename ${file})" + ln -s ${file} $PWD/"$(basename ${file})" done centrifuge \ ~{inputFormatOptions[inputFormat]} \ ~{true="--phred64" false="--phred33" phred64} \ --min-hitlen ~{minHitLength} \ - ~{"--met-file " + outputPrefix + "_alignment_metrics.tsv"} \ --threads ~{threads} \ ~{"--trim5 " + trim5} \ ~{"--trim3 " + trim3} \ @@ -139,26 +142,26 @@ >>> output { - File outputMetrics = outputPrefix + "_alignment_metrics.tsv" - File outputClassification = outputPrefix + "_classification.tsv" - File outputReport = outputPrefix + "_output_report.tsv" + File classification = outputPrefix + "_classification.tsv" + File report = outputPrefix + "_output_report.tsv" } runtime { cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs + read1: {description: "List of files containing mate 1s, or unpaired reads.", category: "required"} + read2: {description: "List of files containing mate 2s.", category: "common"} inputFormat: {description: "The format of the read file(s).", category: "required"} - phred64: {description: "If set to true, Phred+64 encoding is used.", category: "required"} + phred64: {description: "If set to true, phred+64 encoding is used.", category: "required"} minHitLength: {description: "Minimum length of partial hits.", category: "required"} indexFiles: {description: "The files of the index for the reference genomes.", category: "required"} - read1: {description: "List of files containing mate 1s, or unpaired reads.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} - read2: {description: "List of files containing mate 2s.", category: "common"} trim5: {description: "Trim bases from 5' (left) end of each read before alignment.", category: "common"} trim3: {description: "Trim bases from 3' (right) end of each read before alignment.", category: "common"} reportMaxDistinct: {description: "It searches for at most <int> distinct, primary assignments for each read or pair.", category: "common"} @@ -166,12 +169,12 @@ excludeTaxIDs: {description: "A comma-separated list of taxonomic IDs that will be excluded in classification procedure.", category: "common"} threads: {description: "The number of threads to be used.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputMetrics: {description: "File with Centrifuge metrics."} - outputClassification: {description: "File with the classification results."} - outputReport: {description: "File with a classification summary."} + classification: {description: "File with the classification results."} + report: {description: "File with a classification summary."} } } @@ -183,7 +186,8 @@ task Inspect { Int?
across - String memory = "4G" + String memory = "4GiB" + Int timeMinutes = 1 String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5" } @@ -195,7 +199,7 @@ indexBasename="$(basename ~{sub(indexFiles[0], "\.[0-9]\.cf", "")})" for file in ~{sep=" " indexFiles} do - ln ${file} $PWD/"$(basename ${file})" + ln -s ${file} $PWD/"$(basename ${file})" done centrifuge-inspect \ ~{outputOptions[printOption]} \ @@ -205,11 +209,12 @@ >>> output { - File outputInspect = outputPrefix + "/" + printOption + File inspectResult = outputPrefix + "/" + printOption } runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } @@ -218,85 +223,19 @@ printOption: {description: "The output option for inspect (fasta, summary, conversionTable, taxonomyTree, nameTable, sizeTable)", category: "required"} indexFiles: {description: "The files of the index for the reference genomes.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} - across: {description: "When printing FASTA output, output a newline character every <int> bases.", category: "common"} + across: {description: "When printing fasta output, output a newline character every <int> bases.", category: "common"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputInspect: {description: "Output file according to output option."} + inspectResult: {description: "Output file according to output option."} } } -task Download { - input { - String libraryPath - Array[String]? domain - String executable = "centrifuge-download" - String? preCommand - String? seqTaxMapPath - String database = "refseq" - String? assemblyLevel - String? refseqCategory - Array[String]? taxIds - Boolean filterUnplaced = false - Boolean maskLowComplexRegions = false - Boolean downloadRnaSeqs = false - Boolean modifyHeader = false - Boolean downloadGiMap = false - } - - # This will use centrifuge-download to download. - # The bash statement at the beginning is to make sure - # the directory for the SeqTaxMapPath exists. - command { - set -e -o pipefail - ~{preCommand} - ~{"mkdir -p $(dirname " + seqTaxMapPath + ")"} - ~{executable} \ - -o ~{libraryPath} \ - ~{true='-d ' false='' defined(domain)}~{sep=',' domain} \ - ~{'-a "' + assemblyLevel + '"'} \ - ~{"-c " + refseqCategory} \ - ~{true='-t' false='' defined(taxIds)} '~{sep=',' taxIds}' \ - ~{true='-r' false='' downloadRnaSeqs} \ - ~{true='-u' false='' filterUnplaced} \ - ~{true='-m' false='' maskLowComplexRegions} \ - ~{true='-l' false='' modifyHeader} \ - ~{true='-g' false='' downloadGiMap} \ - ~{database} ~{">> " + seqTaxMapPath} - } - - output { - File seqTaxMap = "~{seqTaxMapPath}" - File library = libraryPath - Array[File] fastaFiles = glob(libraryPath + "/*/*.fna") - } - } - -task DownloadTaxonomy { - input { - String centrifugeTaxonomyDir - String executable = "centrifuge-download" - String?
preCommand - } - - command { - set -e -o pipefail - ~{preCommand} - ~{executable} \ - -o ~{centrifugeTaxonomyDir} \ - taxonomy - } - - output { - File taxonomyTree = centrifugeTaxonomyDir + "/nodes.dmp" - File nameTable = centrifugeTaxonomyDir + "/names.dmp" - } - } - -task Kreport { +task KReport { input { - File centrifugeClassification + File classification String outputPrefix Array[File]+ indexFiles Boolean noLCA = false @@ -306,7 +245,8 @@ Int? minimumScore Int? minimumLength - String memory = "4G" + String memory = "4GiB" + Int timeMinutes = 10 String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5" } @@ -316,7 +256,7 @@ indexBasename="$(basename ~{sub(indexFiles[0], "\.[0-9]\.cf", "")})" for file in ~{sep=" " indexFiles} do - ln ${file} $PWD/"$(basename ${file})" + ln -s ${file} $PWD/"$(basename ${file})" done centrifuge-kreport \ -x $PWD/${indexBasename} \ @@ -325,34 +265,36 @@ ~{true="--is-count-table" false="" isCountTable} \ ~{"--min-score " + minimumScore} \ ~{"--min-length " + minimumLength} \ - ~{centrifugeClassification} \ + ~{classification} \ > ~{outputPrefix + "_kreport.tsv"} >>> output { - File outputKreport = outputPrefix + "_kreport.tsv" + File KrakenReport = outputPrefix + "_kreport.tsv" } runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - centrifugeClassification: {description: "File with Centrifuge classification results.", category: "required"} + classification: {description: "File with centrifuge classification results.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} indexFiles: {description: "The files of the index for the reference genomes.", category: "required"} - noLCA: {description: "Do not report the LCA of multiple assignments, but report count fractions at the taxa.", category: "advanced"} + noLCA: {description: "Do not report the lca of multiple assignments, but report count fractions at the taxa.", category: "advanced"} showZeros: {description: "Show clades that have zero reads.", category: "advanced"} isCountTable: {description: "The format of the file is taxID<tab>COUNT.", category: "advanced"} minimumScore: {description: "Require a minimum score for reads to be counted.", category: "advanced"} minimumLength: {description: "Require a minimum alignment length to the read.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
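The centrifuge tasks all reconstruct the index name the same way: every part of the index is symlinked into the working directory, and sub(indexFiles[0], "\.[0-9]\.cf", "") strips the part suffix so the basename can be handed to the tool. Worked through with a hypothetical index location:

    # indexFiles[0] = "/ref/p_compressed.1.cf"
    # sub(indexFiles[0], "\.[0-9]\.cf", "")  -> "/ref/p_compressed"
    # basename in the shell                  -> "p_compressed"
    ln -s /ref/p_compressed.1.cf "$PWD/p_compressed.1.cf"   # repeated for each part
    centrifuge-kreport -x $PWD/p_compressed ...

The switch from ln to ln -s also means the index files no longer have to live on the same filesystem as the working directory, which hard links would require.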
# outputs - outputKreport: {description: "File with kraken style report."} + KrakenReport: {description: "File with kraken style report."} } } @@ -361,7 +303,8 @@ task KTimportTaxonomy { File inputFile String outputPrefix - String memory = "4G" + String memory = "4GiB" + Int timeMinutes = 1 String dockerImage = "biocontainers/krona:v2.7.1_cv1" } @@ -374,22 +317,24 @@ } output { - File outputKronaPlot = outputPrefix + "_krona.html" + File kronaPlot = outputPrefix + "_krona.html" } runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - inputFile: {description: "File with Centrifuge classification results.", category: "required"} + inputFile: {description: "File with centrifuge classification results.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputKronaPlot: {description: "Krona taxonomy plot html file."} + kronaPlot: {description: "Krona taxonomy plot html file."} } } diff --git a/chunked-scatter.wdl b/chunked-scatter.wdl index 619292d9..af24b139 100644 --- a/chunked-scatter.wdl +++ b/chunked-scatter.wdl @@ -24,40 +24,105 @@ task ChunkedScatter { input { File inputFile String prefix = "./scatter" + Boolean splitContigs = false + Int? chunkSize Int? overlap Int? minimumBasesPerFile - String dockerImage = "quay.io/biocontainers/chunked-scatter:0.1.0--py_0" + String memory = "256MiB" + Int timeMinutes = 2 + String dockerImage = "quay.io/biocontainers/chunked-scatter:1.0.0--py_0" } command { - set -e - mkdir -p ~{prefix} chunked-scatter \ + --print-paths \ -p ~{prefix} \ - -i ~{inputFile} \ ~{"-c " + chunkSize} \ ~{"-o " + overlap} \ - ~{"-m " + minimumBasesPerFile} + ~{"-m " + minimumBasesPerFile} \ + ~{true="--split-contigs " false="" splitContigs} \ + ~{inputFile} } output { - Array[File] scatters = glob(prefix + "*.bed") + Array[File] scatters = read_lines(stdout()) } runtime { - memory: "4G" + cpu: 1 + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs inputFile: {description: "Either a bed file describing regions of interest or a sequence dictionary.", category: "required"} prefix: {description: "The prefix for the output files.", category: "advanced"} + splitContigs: {description: "If set, contigs are allowed to be split up over multiple files.", category: "advanced"} chunkSize: {description: "Equivalent to chunked-scatter's `-c` option.", category: "advanced"} overlap: {description: "Equivalent to chunked-scatter's `-o` option.", category: "advanced"} minimumBasesPerFile: {description: "Equivalent to chunked-scatter's `-m` option.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + scatters: {description: "Overlapping chunks of a given size in new bed files."} + } +}
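The rewritten ChunkedScatter no longer globs for its outputs: with --print-paths, chunked-scatter writes each bed file it created to standard output, one per line, and read_lines(stdout()) turns that listing into the output array. This keeps the files in the tool's own order and cannot pick up unrelated files that a glob might match. A sketch of the mechanism (file names illustrative):

    # stdout of `chunked-scatter --print-paths -p ./scatter ...`:
    #   ./scatter0.bed
    #   ./scatter1.bed
    output {
        Array[File] scatters = read_lines(stdout())
    }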
+ +task ScatterRegions { + input { + File inputFile + String prefix = "scatters/scatter-" + Boolean splitContigs = false + Int scatterSizeMillions = 1000 + + Int? scatterSize + + String memory = "256MiB" + Int timeMinutes = 2 + String dockerImage = "quay.io/biocontainers/chunked-scatter:1.0.0--py_0" + } + + String finalSize = if defined(scatterSize) then "~{scatterSize}" else "~{scatterSizeMillions}000000" + + command { + scatter-regions \ + --print-paths \ + --scatter-size ~{finalSize} \ + ~{true="--split-contigs" false="" splitContigs} \ + ~{"--prefix " + prefix} \ + ~{inputFile} + } + + output { + Array[File] scatters = read_lines(stdout()) + } + + runtime { + cpu: 1 + memory: memory + docker: dockerImage + time_minutes: timeMinutes + } + + parameter_meta { + # inputs + inputFile: {description: "The input file, either a bed file or a sequence dict. Which format is used is detected by the extension: '.bed', '.fai' or '.dict'.", category: "required"} + prefix: {description: "The prefix of the output files. Output will be named like: <prefix><N>.bed, in which N is an incrementing number. Default 'scatter-'.", category: "advanced"} + splitContigs: {description: "If set, contigs are allowed to be split up over multiple files.", category: "advanced"} + scatterSizeMillions: {description: "How many million base pairs should go into each scatter.", category: "common"} + scatterSize: {description: "Overrides scatterSizeMillions with a smaller value if set.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + scatters: {description: "Bed files in which the contigs add up approximately to the given scatter size."} } -} \ No newline at end of file +}
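The finalSize declaration in ScatterRegions converts millions to base pairs by string concatenation rather than arithmetic: appending six zeros to scatterSizeMillions has the same effect as multiplying by 1,000,000 without evaluating a large Int expression. Worked through with the defaults and a hypothetical override:

    # scatterSize unset, scatterSizeMillions = 1000 (default):
    #   finalSize = "~{scatterSizeMillions}000000" = "1000000000"  (1 Gbp)
    # scatterSize = 500000 set explicitly:
    #   finalSize = "~{scatterSize}" = "500000"
    scatter-regions --print-paths --scatter-size 1000000000 --prefix scatters/scatter- reference.dict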
diff --git a/clair3.wdl b/clair3.wdl new file mode 100644 index 00000000..ae54ef40 --- /dev/null +++ b/clair3.wdl @@ -0,0 +1,94 @@ +version 1.0 + +# Copyright (c) 2024 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task Clair3 { + input { + File bam + File bamIndex + File referenceFasta + File referenceFastaFai + String outputPrefix + String? sampleName + File? modelTar + String? builtinModel + String platform + Int threads = 8 + Boolean includeAllCtgs = false + String memory = "~{threads + 16}GiB" + Int timeMinutes = 10 + ceil(size(bam, "G") * 200 / threads) + String dockerImage = "quay.io/biocontainers/clair3:1.1.0--py39hd649744_0" + } + + String modelArg = "~{if defined(modelTar) then basename(select_first([modelTar]), '.tar.gz') else builtinModel}" + + command <<< + set -e + ~{if defined(modelTar) then "tar -xvf " + modelTar else "" } + mkdir -p $(dirname ~{outputPrefix}) + run_clair3.sh \ + --model=~{modelArg} \ + --ref_fn=~{referenceFasta} \ + --bam_fn=~{bam} \ + --output=out \ + --threads=~{threads} \ + --platform=~{platform} \ + ~{"--sample_name=" + sampleName} \ + ~{true="--include_all_ctgs" false="" includeAllCtgs} + mv out/merge_output.vcf.gz ~{outputPrefix}.vcf.gz + mv out/merge_output.vcf.gz.tbi ~{outputPrefix}.vcf.gz.tbi + >>> + + output { + File vcf = "~{outputPrefix}.vcf.gz" + File vcfIndex = "~{outputPrefix}.vcf.gz.tbi" + } + + runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + bam: {description: "The input alignment file.", category: "required"} + bamIndex: {description: "The index for the input alignment file.", category: "required"} + referenceFasta: {description: "The reference fasta file.", category: "required"} + referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + outputPrefix: {description: "The output prefix where the data should be placed.", category: "common"} + modelTar: {description: "The tar file with the model.", category: "common"} + builtinModel: {description: "The builtin model name (in case a tar file is not used).", category: "common"} + sampleName: {description: "The name of the sample in the VCF.", category: "common"} + platform: {description: "The platform setting for clair3.", category: "required"} + includeAllCtgs: {description: "Whether or not to call all contigs in the reference.", category: "advanced"} + threads: {description: "The number of threads to use for variant calling.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + vcf: {description: "Output VCF file."} + vcfIndex: {description: "Output VCF index."} + + } +}
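Clair3 takes its model either as a tarball (modelTar) or as the name of a model shipped inside the container (builtinModel); exactly one of the two should be set. modelArg resolves to the tarball's basename without the .tar.gz extension, which matches the directory name the tar -xvf step unpacks into the working directory. Worked through with a hypothetical model file:

    # modelTar = "r941_prom_sup_g5014.tar.gz":
    #   tar -xvf unpacks ./r941_prom_sup_g5014/
    #   modelArg = basename(modelTar, ".tar.gz") = "r941_prom_sup_g5014"
    run_clair3.sh --model=r941_prom_sup_g5014 --platform=ont ...
    # modelTar unset, builtinModel = "hifi" (also hypothetical):
    run_clair3.sh --model=hifi --platform=hifi ...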
diff --git a/clever.wdl b/clever.wdl index e1dcf5a6..3b819ed2 100644 --- a/clever.wdl +++ b/clever.wdl @@ -1,7 +1,5 @@ version 1.0 -# MIT License -# # Copyright (c) 2018 Leiden University Medical Center # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -36,7 +34,8 @@ task Mateclever { Int maxOffset = 150 Int threads = 10 - String memory = "15G" + String memory = "250GiB" + Int timeMinutes = 2880 String dockerImage = "quay.io/biocontainers/clever-toolkit:2.4--py36hcfe0e84_6" } @@ -63,6 +62,7 @@ runtime { cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } @@ -72,13 +72,17 @@ indexedFiteredBam: {description: "The index of the filtered bam file.", category: "required"} bwaIndex: {description: "The BWA index files.", category: "required"} predictions: {description: "The predicted deletions (VCF) from clever.", category: "required"} - maxOffset: {description: "The maximum center distance between split-read and read-pair deletion to be considered identical.", category: "advanced"} - maxLengthDiff: {description: "The maximum length difference between split-read and read-pair deletion to be considered identical.", category: "advanced"} - cleverMaxDelLength: {description: "The maximum deletion length to look for in Clever predictions.", category: "advanced"} outputPath: {description: "The location the output VCF file should be written.", category: "common"} - threads: {description: "The the number of threads required to run a program", category: "advanced"} - memory: {description: "The memory required to run the programs", category: "advanced"} + cleverMaxDelLength: {description: "The maximum deletion length to look for in Clever predictions.", category: "advanced"} + maxLengthDiff: {description: "The maximum length difference between split-read and read-pair deletion to be considered identical.", category: "advanced"} + maxOffset: {description: "The maximum center distance between split-read and read-pair deletion to be considered identical.", category: "advanced"} + threads: {description: "The number of threads required to run a program.", category: "advanced"} + memory: {description: "The memory required to run the programs.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + matecleverVcf: {description: "VCF with additional mateclever results."} } }
@@ -90,7 +94,8 @@ task Prediction { String outputPath = "./clever" Int threads = 10 - String memory = "15G" + String memory = "80GiB" + Int timeMinutes = 2200 String dockerImage = "quay.io/biocontainers/clever-toolkit:2.4--py36hcfe0e84_6" } @@ -114,6 +119,7 @@ runtime { cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } @@ -123,11 +129,12 @@ bamIndex: {description: "The index bam file.", category: "required"} bwaIndex: {description: "The BWA index files.", category: "required"} outputPath: {description: "The location the output VCF file should be written.", category: "common"} - threads: {description: "The the number of threads required to run a program", category: "advanced"} - memory: {description: "The memory required to run the programs", category: "advanced"} + threads: {description: "The number of threads required to run a program.", category: "advanced"} + memory: {description: "The memory required to run the programs.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - predictions: {description: "The predicted deletions (VCF) from clever.", category: "advanced"} + predictions: {description: "The predicted deletions (VCF) from clever."} } } diff --git a/collect-columns.wdl b/collect-columns.wdl index 8b1fa387..03ccb6f7 100644 --- a/collect-columns.wdl +++ b/collect-columns.wdl @@ -24,16 +24,20 @@ task CollectColumns { input { Array[File]+ inputTables String outputPath + Boolean header = false + Boolean sumOnDuplicateId = false + Int? featureColumn Int? valueColumn Int? separator Array[String]? sampleNames - Boolean header = false Array[String]? additionalAttributes File? referenceGtf String?
featureAttribute - String dockerImage = "quay.io/biocontainers/collect-columns:0.2.0--py_1" + Int memoryGb = 4 + ceil(0.5 * length(inputTables)) + Int timeMinutes = 10 + String dockerImage = "quay.io/biocontainers/collect-columns:1.0.0--py_0" } command { @@ -47,6 +51,7 @@ task CollectColumns { ~{"-s " + separator} \ ~{true="-n" false="" defined(sampleNames)} ~{sep=" " sampleNames} \ ~{true="-H" false="" header} \ + ~{true="-S" false="" sumOnDuplicateId} \ ~{true="-a" false="" defined(additionalAttributes)} ~{sep=" " additionalAttributes} \ ~{"-g " + referenceGtf} \ ~{"-F " + featureAttribute} @@ -56,35 +61,30 @@ task CollectColumns { File outputTable = outputPath } - Int memoryGb = 4 + ceil(0.5 * length(inputTables)) - runtime { - memory: "~{memoryGb}G" + memory: "~{memoryGb}GiB" + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - inputTables: {description: "The tables from which columns should be taken.", - category: "required"} - outputPath: {description: "The path to which the output should be written.", - category: "required"} - featureColumn: {description: "Equivalent to the -f option of collect-columns.", - category: "advanced"} - valueColumn: {description: "Equivalent to the -c option of collect-columns.", - category: "advanced"} - separator: {description: "Equivalent to the -s option of collect-columns.", - category: "advanced"} - sampleNames: {description: "Equivalent to the -n option of collect-columns.", - category: "advanced"} - header: {description: "Equivalent to the -H flag of collect-columns.", - category: "advanced"} - additionalAttributes: {description: "Equivalent to the -a option of collect-columns.", - category: "advanced"} - referenceGtf: {description: "Equivalent to the -g option of collect-columns.", - category: "advanced"} - featureAttribute: {description: "Equivalent to the -F option of collect-columns.", - category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + # inputs + inputTables: {description: "The tables from which columns should be taken.", category: "required"} + outputPath: {description: "The path to which the output should be written.", category: "required"} + header: {description: "Equivalent to the -H flag of collect-columns.", category: "advanced"} + sumOnDuplicateId: {description: "Equivalent to the -S flag of collect-columns.", category: "advanced"} + featureColumn: {description: "Equivalent to the -f option of collect-columns.", category: "advanced"} + valueColumn: {description: "Equivalent to the -c option of collect-columns.", category: "advanced"} + separator: {description: "Equivalent to the -s option of collect-columns.", category: "advanced"} + sampleNames: {description: "Equivalent to the -n option of collect-columns.", category: "advanced"} + additionalAttributes: {description: "Equivalent to the -a option of collect-columns.", category: "advanced"} + referenceGtf: {description: "Equivalent to the -g option of collect-columns.", category: "advanced"} + featureAttribute: {description: "Equivalent to the -F option of collect-columns.", category: "advanced"} + memoryGb: {description: "The maximum amount of memory the job will need in GB.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputTable: {description: "All input columns combined into one table."} } -} \ No newline at end of file +} diff --git a/common.wdl b/common.wdl index ef86abcc..1ce2895f 100644 --- a/common.wdl +++ b/common.wdl @@ -24,6 +24,8 @@ task AppendToStringArray { input { Array[String] array String string + + String memory = "1GiB" } command { @@ -36,7 +38,7 @@ } runtime { - memory: "1G" + memory: memory } } @@ -45,9 +47,11 @@ task CheckFileMD5 { input { File file String md5 - # By default cromwell expects /bin/bash to be present in the container + + # By default cromwell expects /bin/bash to be present in the container. # The 'bash' container does not fill this requirement. (It is in /usr/local/bin/bash) # Use a stable version of debian:stretch-slim for this. (Smaller than ubuntu) + String memory = "1GiB" String dockerImage = "debian@sha256:f05c05a218b7a4a5fe979045b1c8e2a9ec3524e5611ebfdd0ef5b8040f9008fa" } command { @@ -60,6 +64,7 @@ runtime { docker: dockerImage + memory: memory } } @@ -69,9 +74,11 @@ task ConcatenateTextFiles { input { Array[File]+ fileList String combinedFilePath Boolean unzip = false Boolean zip = false + + String memory = "1GiB" } - # When input and output is both compressed decompression is not needed + # When input and output is both compressed decompression is not needed. String cmdPrefix = if (unzip && !zip) then "zcat " else "cat " String cmdSuffix = if (!unzip && zip) then " | gzip -c " else "" command { @@ -86,7 +93,7 @@ } runtime { - memory: "1G" + memory: memory } } @@ -97,6 +104,7 @@ task Copy { input { File inputFile String outputPath Boolean recursive = false # Version not that important as long as it is stable. + String memory = "1GiB" String dockerImage = "debian@sha256:f05c05a218b7a4a5fe979045b1c8e2a9ec3524e5611ebfdd0ef5b8040f9008fa" } command { @@ -112,16 +120,19 @@ runtime { docker: dockerImage + memory: memory } } task CreateLink { - # Making this of type File will create a link to the copy of the file in the execution - # folder, instead of the actual file. + # Making this of type File will create a link to the copy of the file in + # the execution folder, instead of the actual file. # This cannot be properly call-cached or used within a container. input { String inputFile String outputPath + + String memory = "1GiB" } command { @@ -131,12 +142,55 @@ output { File link = outputPath } + + runtime { + memory: memory + } +} + +task GetSamplePositionInArray { + input { + Array[String] sampleIds + String sample + + # python:3.7-slim's sha256 digest. This image is based on debian buster. + String dockerImage = "python@sha256:e0f6a4df17d5707637fa3557ab266f44dddc46ebfc82b0f1dbe725103961da4e" + } + + command <<< + python <<CODE + samples = ['~{sep="','" sampleIds}'] + print(samples.index('~{sample}')) + CODE + >>> + + output { + Int position = read_int(stdout()) + } + + runtime { + # 4 gigs of memory to be able to build the docker image in singularity. + memory: "4GiB" + docker: dockerImage + timeMinutes: 5 + } + + parameter_meta { + # inputs + sampleIds: {description: "A list of sample ids.", category: "required"} + sample: {description: "The sample for which the position is wanted.", category: "required"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + position: {description: "The 0-based index of the sample in the sampleIds array."} + } +}
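GetSamplePositionInArray exists so a workflow can line up parallel per-sample arrays: given a sample id, the returned index selects the matching element from any array ordered the same way. A minimal sketch (names hypothetical):

    call common.GetSamplePositionInArray as getPosition {
        input:
            sampleIds = ["s1", "s2", "s3"],
            sample = "s2"
    }
    # getPosition.position == 1, so bams[getPosition.position]
    # is the bam belonging to "s2" when bams is ordered like sampleIds.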
task MapMd5 { input { Map[String,String] map + String memory = "1GiB" String dockerImage = "debian@sha256:f05c05a218b7a4a5fe979045b1c8e2a9ec3524e5611ebfdd0ef5b8040f9008fa" } command { @@ -150,7 +204,7 @@ } runtime { - memory: "1G" + memory: memory docker: dockerImage } } @@ -160,6 +214,7 @@ task StringArrayMd5 { input { Array[String] stringArray + String memory = "1GiB" String dockerImage = "debian@sha256:f05c05a218b7a4a5fe979045b1c8e2a9ec3524e5611ebfdd0ef5b8040f9008fa" } command { @@ -173,16 +228,18 @@ } runtime { - memory: "1G" + memory: memory docker: dockerImage } } task TextToFile { - input { String text String outputFile = "out.txt" + + String memory = "1GiB" + Int timeMinutes = 1 String dockerImage = "debian@sha256:f05c05a218b7a4a5fe979045b1c8e2a9ec3524e5611ebfdd0ef5b8040f9008fa" } command { @@ -194,25 +251,35 @@ File out = outputFile } - parameter_meta { - text: {description: "The text to print", category: "required"} - outputFile: {description: "The name of the output file", category: "common"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} - } runtime { - memory: "1G" + memory: memory + time_minutes: timeMinutes docker: dockerImage } + + parameter_meta { + # inputs + text: {description: "The text to print.", category: "required"} + outputFile: {description: "The name of the output file.", category: "common"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + out: {description: "File containing input text."} + } } task YamlToJson { input { File yaml String outputJson = basename(yaml, "\.ya?ml$") + ".json" + + String memory = "128MiB" + Int timeMinutes = 1 # biowdl-input-converter has python and pyyaml. - String dockerImage = "quay.io/biocontainers/biowdl-input-converter:0.2.1--py_0" + String dockerImage = "quay.io/biocontainers/biowdl-input-converter:0.3.0--pyhdfd78af_0" } + command { set -e mkdir -p "$(dirname ~{outputJson})" python <<CODE import json import yaml with open("~{yaml}", "r") as input_yaml: content = yaml.safe_load(input_yaml) with open("~{outputJson}", "w") as output_json: json.dump(content, output_json) CODE } + output { File json = outputJson } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs yaml: {description: "The YAML file to convert.", category: "required"} outputJson: {description: "The location the output JSON file should be written to.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + memory: {description: "The maximum amount of memory the job will need.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + json: {description: "JSON file version of input YAML."} } } diff --git a/cutadapt.wdl b/cutadapt.wdl index d04865b6..c695c08e 100644 --- a/cutadapt.wdl +++ b/cutadapt.wdl @@ -32,6 +32,14 @@ task Cutadapt { Array[String] adapterRead2 = [] Array[String] frontRead2 = [] Array[String] anywhereRead2 = [] + String reportPath = "cutadapt_report.txt" + # Cutadapt compresses the zipped output files with a ridiculously + # high compression level (5 or 6). + # This is not the fast compression preset. It takes up to 400% more + # CPU time for a 20% reduction in file size. + # Hence we use compression level 1 here. + Int compressionLevel = 1 # This only affects outputs with the .gz suffix. + Boolean? interleaved String? pairFilter Float? errorRate @@ -52,7 +60,7 @@ task Cutadapt { String? stripSuffix String? prefix String? suffix - Int? minimumLength = 2 # Necessary to prevent creation of empty reads or 1 base reads. + Int? minimumLength = 2 # Necessary to prevent creation of empty reads or 1 base reads. Int? maximumLength Int? maxN Boolean? discardTrimmed @@ -73,14 +81,12 @@ task Cutadapt { Boolean? bwa Boolean? zeroCap Boolean? noZeroCap - String reportPath = "cutadapt_report.txt" - # Cutadapt compresses the zipped output files with a ridiculously high compression level (5 or 6). - # This is not the fast compression preset. It takes up to 400% more CPU time for a 20% reduction in file size. - # Hence we use compression level 1 here. - Int compressionLevel = 1 # This only affects outputs with the .gz suffix. + Boolean revcomp = false + Int cores = 4 - String memory = "4G" - String dockerImage = "quay.io/biocontainers/cutadapt:2.8--py37h516909a_0" + String memory = "5GiB" + Int timeMinutes = 1 + ceil(size([read1, read2], "G") * 12.0 / cores) + String dockerImage = "quay.io/biocontainers/cutadapt:4.4--py310h1425a21_0" } String realRead2output = select_first([read2output, "cut_r2.fq.gz"]) @@ -144,6 +150,7 @@ task Cutadapt { ~{true="--bwa" false="" bwa} \ ~{true="--zero-cap" false="" zeroCap} \ ~{true="--no-zero-cap" false="" noZeroCap} \ + ~{if revcomp then "--revcomp" else ""} \ ~{read1} \ ~{read2} \ ~{"> " + reportPath} @@ -151,8 +158,8 @@ task Cutadapt { output{ File cutRead1 = read1output - File? cutRead2 = read2output File report = reportPath + File? cutRead2 = read2output File? tooLongOutput=tooLongOutputPath File? tooShortOutput=tooShortOutputPath File? 
untrimmedOutput=untrimmedOutputPath @@ -167,231 +174,83 @@ task Cutadapt { runtime { cpu: cores memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - read1: { - description: "The first or single end fastq file to be run through cutadapt.", - category: "required" - } - read2: { - description: "An optional second end fastq file to be run through cutadapt.", - category: "common" - } - read1output: { - description: "The name of the resulting first or single end fastq file.", - category: "common" - } - read2output: { - description: "The name of the resulting second end fastq file.", - category: "common" - } - adapter: { - description: "A list of 3' ligated adapter sequences to be cut from the given first or single end fastq file.", - category: "common" - } - front: { - description: "A list of 5' ligated adapter sequences to be cut from the given first or single end fastq file.", - category: "advanced" - } - anywhere: { - description: "A list of 3' or 5' ligated adapter sequences to be cut from the given first or single end fastq file.", - category: "advanced" - } - adapterRead2: { - description: "A list of 3' ligated adapter sequences to be cut from the given second end fastq file.", - category: "common" - } - frontRead2: { - description: "A list of 5' ligated adapter sequences to be cut from the given second end fastq file.", - category: "advanced" - } - anywhereRead2: { - description: "A list of 3' or 5' ligated adapter sequences to be cut from the given second end fastq file.", - category: "advanced" - } - interleaved: { - description: "Equivalent to cutadapt's --interleaved flag.", - category: "advanced" - } - pairFilter: { - description: "Equivalent to cutadapt's --pair-filter option.", - category: "advanced" - } - errorRate: { - description: "Equivalent to cutadapt's --error-rate option.", - category: "advanced" - } - noIndels: { - description: "Equivalent to cutadapt's --no-indels flag.", - category: "advanced" - } - times: { - description: "Equivalent to cutadapt's --times option.", - category: "advanced" - } - overlap: { - description: "Equivalent to cutadapt's --overlap option.", - category: "advanced" - } - matchReadWildcards: { - description: "Equivalent to cutadapt's --match-read-wildcards flag.", - category: "advanced" - } - noMatchAdapterWildcards: { - description: "Equivalent to cutadapt's --no-match-adapter-wildcards flag.", - category: "advanced" - } - noTrim: { - description: "Equivalent to cutadapt's --no-trim flag.", - category: "advanced" - } - maskAdapter: { - description: "Equivalent to cutadapt's --mask-adapter flag.", - category: "advanced" - } - cut: { - description: "Equivalent to cutadapt's --cut option.", - category: "advanced" - } - nextseqTrim: { - description: "Equivalent to cutadapt's --nextseq-trim option.", - category: "advanced" - } - qualityCutoff: { - description: "Equivalent to cutadapt's --quality-cutoff option.", - category: "advanced" - } - qualityBase: { - description: "Equivalent to cutadapt's --quality-base option.", - category: "advanced" - } - length: { - description: "Equivalent to cutadapt's --length option.", - category: "advanced" - } - trimN: { - description: "Equivalent to cutadapt's --trim-n flag.", - category: "advanced" - } - lengthTag: { - description: "Equivalent to cutadapt's --length-tag option.", - category: "advanced" - } - stripSuffix: { - description: "Equivalent to cutadapt's --strip-suffix option.", - category: "advanced" - } - prefix: { - description: "Equivalent to cutadapt's --prefix option.", - 
category: "advanced" - } - suffix: { - description: "Equivalent to cutadapt's --suffix option.", - category: "advanced" - } - minimumLength: { - description: "Equivalent to cutadapt's --minimum-length option.", - category: "advanced" - } - maximumLength: { - description: "Equivalent to cutadapt's --maximum-length option.", - category: "advanced" - } - maxN: { - description: "Equivalent to cutadapt's --max-n option.", - category: "advanced" - } - discardTrimmed: { - description: "Equivalent to cutadapt's --quality-cutoff option.", - category: "advanced" - } - discardUntrimmed: { - description: "Equivalent to cutadapt's --discard-untrimmed option.", - category: "advanced" - } - infoFilePath: { - description: "Equivalent to cutadapt's --info-file option.", - category: "advanced" - } - restFilePath: { - description: "Equivalent to cutadapt's --rest-file option.", - category: "advanced" - } - wildcardFilePath: { - description: "Equivalent to cutadapt's --wildcard-file option.", - category: "advanced" - } - tooShortOutputPath: { - description: "Equivalent to cutadapt's --too-short-output option.", - category: "advanced" - } - tooLongOutputPath: { - description: "Equivalent to cutadapt's --too-long-output option.", - category: "advanced" - } - untrimmedOutputPath: { - description: "Equivalent to cutadapt's --untrimmed-output option.", - category: "advanced" - } - tooShortPairedOutputPath: { - description: "Equivalent to cutadapt's --too-short-paired-output option.", - category: "advanced" - } - tooLongPairedOutputPath: { - description: "Equivalent to cutadapt's --too-long-paired-output option.", - category: "advanced" - } - untrimmedPairedOutputPath: { - description: "Equivalent to cutadapt's --untrimmed-paired-output option.", - category: "advanced" - } - colorspace: { - description: "Equivalent to cutadapt's --colorspace flag.", - category: "advanced" - } - doubleEncode: { - description: "Equivalent to cutadapt's --double-encode flag.", - category: "advanced" - } - stripF3: { - description: "Equivalent to cutadapt's --strip-f3 flag.", - category: "advanced" - } - maq: { - description: "Equivalent to cutadapt's --maq flag.", - category: "advanced" - } - bwa: { - description: "Equivalent to cutadapt's --bwa flag.", - category: "advanced" - } - zeroCap: { - description: "Equivalent to cutadapt's --zero-cap flag.", - category: "advanced" - } - noZeroCap: { - description: "Equivalent to cutadapt's --no-zero-cap flag.", - category: "advanced" - } - reportPath: { - description: "The name of the file to write cutadapts's stdout to, this contains some metrics.", - category: "common" - } - compressionLevel: {description: "The compression level if gzipped output is used.", - category: "advanced"} - cores: { - description: "The number of cores to use.", - category: "advanced" - } - memory: { - description: "The amount of memory this job will use.", - category: "advanced" - } - dockerImage: { - description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"
-        }
+        # inputs
+        read1: {description: "The first or single end fastq file to be run through cutadapt.", category: "required"}
+        read2: {description: "An optional second end fastq file to be run through cutadapt.", category: "common"}
+        read1output: {description: "The name of the resulting first or single end fastq file.", category: "common"}
+        read2output: {description: "The name of the resulting second end fastq file.", category: "common"}
+        adapter: {description: "A list of 3' ligated adapter sequences to be cut from the given first or single end fastq file.", category: "common"}
+        front: {description: "A list of 5' ligated adapter sequences to be cut from the given first or single end fastq file.", category: "advanced"}
+        anywhere: {description: "A list of 3' or 5' ligated adapter sequences to be cut from the given first or single end fastq file.", category: "advanced"}
+        adapterRead2: {description: "A list of 3' ligated adapter sequences to be cut from the given second end fastq file.", category: "common"}
+        frontRead2: {description: "A list of 5' ligated adapter sequences to be cut from the given second end fastq file.", category: "advanced"}
+        anywhereRead2: {description: "A list of 3' or 5' ligated adapter sequences to be cut from the given second end fastq file.", category: "advanced"}
+        reportPath: {description: "The name of the file to write cutadapt's stdout to; this contains some metrics.", category: "common"}
+        compressionLevel: {description: "The compression level if gzipped output is used.", category: "advanced"}
+        interleaved: {description: "Equivalent to cutadapt's --interleaved flag.", category: "advanced"}
+        pairFilter: {description: "Equivalent to cutadapt's --pair-filter option.", category: "advanced"}
+        errorRate: {description: "Equivalent to cutadapt's --error-rate option.", category: "advanced"}
+        noIndels: {description: "Equivalent to cutadapt's --no-indels flag.", category: "advanced"}
+        times: {description: "Equivalent to cutadapt's --times option.", category: "advanced"}
+        overlap: {description: "Equivalent to cutadapt's --overlap option.", category: "advanced"}
+        matchReadWildcards: {description: "Equivalent to cutadapt's --match-read-wildcards flag.", category: "advanced"}
+        noMatchAdapterWildcards: {description: "Equivalent to cutadapt's --no-match-adapter-wildcards flag.", category: "advanced"}
+        noTrim: {description: "Equivalent to cutadapt's --no-trim flag.", category: "advanced"}
+        maskAdapter: {description: "Equivalent to cutadapt's --mask-adapter flag.", category: "advanced"}
+        cut: {description: "Equivalent to cutadapt's --cut option.", category: "advanced"}
+        nextseqTrim: {description: "Equivalent to cutadapt's --nextseq-trim option.", category: "advanced"}
+        qualityCutoff: {description: "Equivalent to cutadapt's --quality-cutoff option.", category: "advanced"}
+        qualityBase: {description: "Equivalent to cutadapt's --quality-base option.", category: "advanced"}
+        length: {description: "Equivalent to cutadapt's --length option.", category: "advanced"}
+        trimN: {description: "Equivalent to cutadapt's --trim-n flag.", category: "advanced"}
+        lengthTag: {description: "Equivalent to cutadapt's --length-tag option.", category: "advanced"}
+        stripSuffix: {description: "Equivalent to cutadapt's --strip-suffix option.", category: "advanced"}
+        prefix: {description: "Equivalent to cutadapt's --prefix option.", category: "advanced"}
+        suffix: {description: "Equivalent to cutadapt's --suffix option.", category: "advanced"}
+        minimumLength: {description: "Equivalent to cutadapt's --minimum-length option.", category: "advanced"}
+        maximumLength: {description: "Equivalent to cutadapt's --maximum-length option.", category: "advanced"}
+        maxN: {description: "Equivalent to cutadapt's --max-n option.", category: "advanced"}
+        discardTrimmed: {description: "Equivalent to cutadapt's --discard-trimmed flag.", category: "advanced"}
+        discardUntrimmed: {description: "Equivalent to cutadapt's --discard-untrimmed option.", category: "advanced"}
+        infoFilePath: {description: "Equivalent to cutadapt's --info-file option.", category: "advanced"}
+        restFilePath: {description: "Equivalent to cutadapt's --rest-file option.", category: "advanced"}
+        wildcardFilePath: {description: "Equivalent to cutadapt's --wildcard-file option.", category: "advanced"}
+        tooShortOutputPath: {description: "Equivalent to cutadapt's --too-short-output option.", category: "advanced"}
+        tooLongOutputPath: {description: "Equivalent to cutadapt's --too-long-output option.", category: "advanced"}
+        untrimmedOutputPath: {description: "Equivalent to cutadapt's --untrimmed-output option.", category: "advanced"}
+        tooShortPairedOutputPath: {description: "Equivalent to cutadapt's --too-short-paired-output option.", category: "advanced"}
+        tooLongPairedOutputPath: {description: "Equivalent to cutadapt's --too-long-paired-output option.", category: "advanced"}
+        untrimmedPairedOutputPath: {description: "Equivalent to cutadapt's --untrimmed-paired-output option.", category: "advanced"}
+        colorspace: {description: "Equivalent to cutadapt's --colorspace flag.", category: "advanced"}
+        doubleEncode: {description: "Equivalent to cutadapt's --double-encode flag.", category: "advanced"}
+        stripF3: {description: "Equivalent to cutadapt's --strip-f3 flag.", category: "advanced"}
+        maq: {description: "Equivalent to cutadapt's --maq flag.", category: "advanced"}
+        bwa: {description: "Equivalent to cutadapt's --bwa flag.", category: "advanced"}
+        zeroCap: {description: "Equivalent to cutadapt's --zero-cap flag.", category: "advanced"}
+        noZeroCap: {description: "Equivalent to cutadapt's --no-zero-cap flag.", category: "advanced"}
+        revcomp: {description: "Equivalent to cutadapt's --revcomp flag.", category: "advanced"}
+        cores: {description: "The number of cores to use.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        cutRead1: {description: "Trimmed read one."}
+        report: {description: "Per-adapter statistics file."}
+        cutRead2: {description: "Trimmed read two in pair."}
+        tooLongOutput: {description: "Reads that are too long according to -M."}
+        tooShortOutput: {description: "Reads that are too short according to -m."}
+        untrimmedOutput: {description: "All reads without adapters (instead of the regular output file)."}
+        tooLongPairedOutput: {description: "Second reads in pairs that were too long."}
+        tooShortPairedOutput: {description: "Second reads in pairs that were too short."}
+        untrimmedPairedOutput: {description: "The second reads in a pair that were not trimmed."}
+        infoFile: {description: "Detailed information about where adapters were found in each read."}
+        restFile: {description: "The rest file."}
+        wildcardFile: {description: "The wildcard file."}
     }
 }
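
For reference, a minimal, hypothetical (and untested) workflow calling the Cutadapt task documented above might look like the sketch below; the import path, adapter sequence, and output names are illustrative assumptions, not part of this diff:

```wdl
version 1.0

# Assumed import path; point this at the cutadapt.wdl in this repository.
import "cutadapt.wdl" as cutadapt

workflow CutadaptExample {
    input {
        File fastqR1
        File fastqR2
    }

    call cutadapt.Cutadapt {
        input:
            read1 = fastqR1,
            read2 = fastqR2,
            read1output = "cut_R1.fastq.gz",
            read2output = "cut_R2.fastq.gz",
            # Illustrative Illumina TruSeq adapter; substitute your kit's adapters.
            adapter = ["AGATCGGAAGAGC"],
            adapterRead2 = ["AGATCGGAAGAGC"],
            reportPath = "sample.cutadapt_report.txt"
    }

    output {
        File trimmedR1 = Cutadapt.cutRead1
        File? trimmedR2 = Cutadapt.cutRead2
        File report = Cutadapt.report
    }
}
```
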
diff --git a/deconstructsigs.wdl b/deconstructsigs.wdl
new file mode 100644
index 00000000..c44bf9c0
--- /dev/null
+++ b/deconstructsigs.wdl
@@ -0,0 +1,66 @@
+# Copyright (c) 2021 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+version 1.0
+
+task DeconstructSigs {
+    input {
+        File signaturesMatrix
+        File signaturesReference
+        String outputPath = "./signatures.rds"
+
+        Int timeMinutes = 15
+        String memory = "4GiB"
+        String dockerImage = "quay.io/biocontainers/r-deconstructsigs:1.9.0--r41hdfd78af_1"
+    }
+
+    command {
+        R --no-echo << EOF
+        library(deconstructSigs)
+        tumor <- read.table("~{signaturesMatrix}", check.names=F)
+        ref <- data.frame(t(read.table("~{signaturesReference}", check.names=F, header=T, row.names="Type")), check.names=F)
+        tumor <- tumor[,colnames(ref)]
+
+        sigs <- whichSignatures(tumor.ref=tumor, row.names(tumor), signatures.ref=ref, contexts.needed=T)
+        saveRDS(sigs, "~{outputPath}")
+        EOF
+    }
+
+    output {
+        File signatureRDS = outputPath
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        signaturesMatrix: {description: "A table containing columns representing mutation types (matching the types in the signatures reference) and one row with the counts for each of these types for the sample of interest.",
+                           category: "required"}
+        signaturesReference: {description: "A table describing the mutational signatures, formatted like those provided by COSMIC.",
+                              category: "required"}
+        outputPath: {description: "The location the output will be written to.", category: "common"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        memory: {description: "The amount of memory available to the job.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+    }
+}
\ No newline at end of file
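
A minimal, hypothetical sketch of calling the new DeconstructSigs task from a workflow; the import path and file names are illustrative assumptions:

```wdl
version 1.0

# Assumed import path for the new task file.
import "deconstructsigs.wdl" as deconstructsigs

workflow DeconstructSigsExample {
    input {
        File countsPerContext   # Mutation-type counts for one sample.
        File cosmicSignatures   # COSMIC-style signature reference table.
    }

    call deconstructsigs.DeconstructSigs {
        input:
            signaturesMatrix = countsPerContext,
            signaturesReference = cosmicSignatures,
            outputPath = "sample1.signatures.rds"
    }

    output {
        File signatures = DeconstructSigs.signatureRDS
    }
}
```
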
diff --git a/deepvariant.wdl b/deepvariant.wdl
new file mode 100644
index 00000000..b0ed2a19
--- /dev/null
+++ b/deepvariant.wdl
@@ -0,0 +1,109 @@
+version 1.0
+
+# Copyright (c) 2018 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+task RunDeepVariant {
+    input {
+        File referenceFasta
+        File referenceFastaIndex
+        File inputBam
+        File inputBamIndex
+        String modelType
+        String outputVcf = "sample.vcf.gz"
+        String? postprocessVariantsExtraArgs
+        File? customizedModel
+        Int numShards = 8
+        String? outputGVcf
+        String? outputGVcfIndex
+        File? regions
+        String? sampleName
+        Boolean VCFStatsReport = true
+
+        # Most of the memory used is at the end, in the step where the variants
+        # are merged. This is a single-threaded high memory step. The number
+        # of shards does not influence the memory so much.
+        # The provided memory here is enough to merge human chromosome 1.
+        String memory = "48GiB"
+        Int timeMinutes = 5000
+        # Version 1.8.0 has a bug.
+        # https://github.com/google/deepvariant/issues/912
+        String dockerImage = "google/deepvariant:1.6.1"
+    }
+
+    command {
+        set -e
+        /opt/deepvariant/bin/run_deepvariant \
+        --ref ~{referenceFasta} \
+        --reads ~{inputBam} \
+        --model_type ~{modelType} \
+        --output_vcf ~{outputVcf} \
+        ~{"--output_gvcf " + outputGVcf} \
+        ~{"--customized_model " + customizedModel} \
+        ~{"--num_shards " + numShards} \
+        ~{"--regions " + regions} \
+        ~{"--sample_name " + sampleName} \
+        ~{"--postprocess_variants_extra_args " + postprocessVariantsExtraArgs} \
+        ~{true="--vcf_stats_report" false="--novcf_stats_report" VCFStatsReport}
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
+        cpu: numShards
+    }
+
+    output {
+        File outputVCF = outputVcf
+        File outputVCFIndex = outputVcf + ".tbi"
+        Array[File] outputVCFStatsReport = glob("*.visual_report.html")
+        File? outputGVCF = outputGVcf
+        File? outputGVCFIndex = outputGVcfIndex
+    }
+
+    parameter_meta {
+        # inputs
+        referenceFasta: {description: "Genome reference to use.", category: "required"}
+        referenceFastaIndex: {description: "Index for the genome reference file.", category: "required"}
+        inputBam: {description: "Aligned, sorted, indexed BAM file containing the reads we want to call.", category: "required"}
+        inputBamIndex: {description: "Index for the input bam file.", category: "required"}
+        modelType: {description: "Type of model to use for variant calling. Each model_type has an associated default model, which can be overridden by the --customized_model flag.", category: "required"}
+        outputVcf: {description: "Path where we should write the VCF file.", category: "required"}
+        postprocessVariantsExtraArgs: {description: "A comma-separated list of flag_name=flag_value. 'flag_name' has to be valid flags for postprocess_variants.py.", category: "advanced"}
+        customizedModel: {description: "A path to a model checkpoint to load for the `call_variants` step. If not set, the default for each --model_type will be used.", category: "advanced"}
+        numShards: {description: "Number of shards for make_examples step.", category: "common"}
+        outputGVcf: {description: "Path where we should write the gVCF file.", category: "common"}
+        outputGVcfIndex: {description: "Path to where the gVCF index file will be written. This is needed as a workaround, set it to `outputGVcf + '.tbi'`.", category: "common"}
+        regions: {description: "List of regions we want to process, in BED/BEDPE format.", category: "advanced"}
+        sampleName: {description: "Sample name to use instead of the sample name from the input reads BAM (SM tag in the header).", category: "common"}
+        VCFStatsReport: {description: "Output a visual report (HTML) of statistics about the output VCF.", category: "common"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputVCF: {description: "Output VCF file."}
+        outputVCFIndex: {description: "Index of output VCF file."}
+        outputVCFStatsReport: {description: "Statistics file."}
+        outputGVCF: {description: "GVCF version of VCF file(s)."}
+        outputGVCFIndex: {description: "Index of GVCF file(s)."}
+    }
+}
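
A minimal, hypothetical sketch of calling RunDeepVariant; the import path and literal values are illustrative assumptions ("WGS" is one of DeepVariant's standard model types):

```wdl
version 1.0

# Assumed import path for the new task file.
import "deepvariant.wdl" as deepvariant

workflow DeepVariantExample {
    input {
        File reference
        File referenceIndex
        File bam
        File bamIndex
    }

    call deepvariant.RunDeepVariant {
        input:
            referenceFasta = reference,
            referenceFastaIndex = referenceIndex,
            inputBam = bam,
            inputBamIndex = bamIndex,
            modelType = "WGS",
            outputGVcf = "sample.g.vcf.gz",
            # Workaround described in the task's parameter_meta.
            outputGVcfIndex = "sample.g.vcf.gz.tbi"
    }

    output {
        File vcf = RunDeepVariant.outputVCF
        File vcfIndex = RunDeepVariant.outputVCFIndex
    }
}
```
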
diff --git a/delly.wdl b/delly.wdl
index ad8f18d9..b952da7e 100644
--- a/delly.wdl
+++ b/delly.wdl
@@ -1,7 +1,5 @@
 version 1.0
 
-# MIT License
-#
 # Copyright (c) 2018 Leiden University Medical Center
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -24,14 +22,18 @@ version 1.0
 task CallSV {
     input {
-        File bamFile
-        File bamIndex
+        Array[File]+ bamFile
+        Array[File]+ bamIndex
         File referenceFasta
         File referenceFastaFai
-        String outputPath = "./delly/delly.vcf"
+        String outputPath = "./delly/delly.bcf"
+
+        File? genotypeBcf
+        File? genotypeBcfIndex
 
-        String memory = "15G"
-        String dockerImage = "quay.io/biocontainers/delly:0.8.1--h4037b6b_1"
+        String memory = "15GiB"
+        Int timeMinutes = 600
+        String dockerImage = "quay.io/biocontainers/delly:1.1.6--ha41ced6_0"
     }
 
     command {
@@ -40,26 +42,85 @@ task CallSV {
         delly call \
         -o ~{outputPath} \
         -g ~{referenceFasta} \
-        ~{bamFile}
+        ~{"-v " + genotypeBcf} \
+        ~{sep=" " bamFile}
     }
 
     output {
         File dellyBcf = outputPath
+        File dellyBcfIndex = outputPath + ".csi"
     }
 
     runtime {
         memory: memory
+        time_minutes: timeMinutes
         docker: dockerImage
     }
 
     parameter_meta {
         # inputs
-        bamFile: {description: "The bam file to process.", category: "required"}
-        bamIndex: {description: "The index bam file.", category: "required"}
+        bamFile: {description: "The bam files to process.", category: "required"}
+        bamIndex: {description: "The indexes for the bam files.", category: "required"}
         referenceFasta: {description: "The reference fasta file also used for mapping.", category: "required"}
-        referenceFastaFai: {description: "Fasta index (.fai) file of the reference", category: "required" }
-        outputPath: {description: "The location the output VCF file should be written.", category: "common"}
-        memory: {description: "The memory required to run the programs", category: "advanced"}
+        referenceFastaFai: {description: "Fasta index (.fai) file of the reference.", category: "required" }
+        outputPath: {description: "The location the output BCF file should be written.", category: "common"}
+        genotypeBcf: {description: "A BCF with SVs to be genotyped in the samples.", category: "advanced"}
+        genotypeBcfIndex: {description: "The index for the genotype BCF file.", category: "advanced"}
+        memory: {description: "The memory required to run the programs.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
         dockerImage: {description: "The docker image used for this task.
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + dellyBcf: {description: "File containing structural variants."} } } + + +task SomaticFilter { + input { + File dellyBcf + File dellyBcfIndex + Array[String]+ normalSamples + Array[String]+ tumorSamples + String outputPath = "./delly/delly_filter.bcf" + + String memory = "15GiB" + Int timeMinutes = 300 + String dockerImage = "quay.io/biocontainers/delly:1.1.6--ha41ced6_0" + } + + command <<< + set -e + mkdir -p "$(dirname ~{outputPath})" + for SAMPLE in ~{sep=" " normalSamples}; do echo -e "${SAMPLE}\tcontrol" >> samples.tsv; done + for SAMPLE in ~{sep=" " tumorSamples}; do echo -e "${SAMPLE}\ttumor" >> samples.tsv; done + + delly filter \ + -f somatic \ + -o ~{outputPath} \ + -s samples.tsv \ + ~{dellyBcf} + >>> + + output { + File filterBcf = outputPath + File filterBcfIndex = outputPath + ".csi" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + dellyBcf: {description: "The BCF file produced by delly.", category: "required"} + dellyBcfIndex: {description: "The index for the delly BCF file.", category: "required"} + normalSamples: {description: "The names for the normal samples as used in the delly BCF file.", category: "required"} + tumorSamples: {description: "The names for the tumor samples as used in the delly BCF file.", category: "required"} + outputPath: {description: "The location the output BCF file should be written.", category: "common"} + memory: {description: "The memory required to run the programs.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + } +} \ No newline at end of file diff --git a/duphold.wdl b/duphold.wdl new file mode 100644 index 00000000..0426da56 --- /dev/null +++ b/duphold.wdl @@ -0,0 +1,75 @@ +version 1.0 + +# Copyright (c) 2020 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +task Duphold { + input { + File inputVcf + File bamFile + File bamIndex + File referenceFasta + File referenceFastaFai + String sample + String outputPath = "./duphold.vcf" + + String memory = "15GiB" + Int timeMinutes = 1440 + String dockerImage = "quay.io/biocontainers/duphold:0.2.1--h516909a_1" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + export DUPHOLD_SAMPLE_NAME=~{sample} + duphold \ + -v ~{inputVcf} \ + -b ~{bamFile} \ + -f ~{referenceFasta} \ + -o ~{outputPath} + } + + output { + File outputVcf = outputPath + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputVcf: {description: "The VCF file to process.", category: "required"} + bamFile: {description: "The bam file to process.", category: "required"} + bamIndex: {description: "The index of the bam file.", category: "required"} + referenceFasta: {description: "The reference fasta file also used for mapping.", category: "required"} + referenceFastaFai: {description: "Fasta index (.fai) file of the reference.", category: "required" } + sample: {description: "The name of the sample.", category: "required"} + outputPath: {description: "The location the output VCF file should be written.", category: "common"} + memory: {description: "The memory required to run the programs.", category: "advanced"} + timeMinutes: {description: "The maximum duration (in minutes) the tool is allowed to run.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputVcf: {description: "Duphold annotated VCF file."} + } +} diff --git a/extractSigPredictHRD.wdl b/extractSigPredictHRD.wdl new file mode 100644 index 00000000..1520b608 --- /dev/null +++ b/extractSigPredictHRD.wdl @@ -0,0 +1,71 @@ +version 1.0 + +# Copyright (c) 2021 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task ExtractSigPredictHRD { + input { + String outputDir = "." 
+        String sampleName
+        File snvIndelVcf
+        File snvIndelVcfIndex
+        File svVcf
+        File svVcfIndex
+        Boolean hg38 = false
+
+        String memory = "3GiB"
+        Int timeMinutes = 10
+        String dockerImage = "quay.io/biowdl/chord-mutsigextractor:2.00_1.14"
+    }
+
+    command {
+        extractSigPredictHRD.R \
+        ~{outputDir} \
+        ~{sampleName} \
+        ~{snvIndelVcf} \
+        ~{svVcf} \
+        ~{if hg38 then "RG_38" else "RG_37"}
+    }
+
+    output {
+        File chordPrediction = "~{outputDir}/~{sampleName}_chord_prediction.txt"
+        File chordSignatures = "~{outputDir}/~{sampleName}_chord_signatures.txt"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        outputDir: {description: "The directory the output will be written to.", category: "required"}
+        sampleName: {description: "The name of the sample.", category: "required"}
+        snvIndelVcf: {description: "A VCF file with SNVs and indels.", category: "required"}
+        snvIndelVcfIndex: {description: "The index for the SNV/indel VCF file.", category: "required"}
+        svVcf: {description: "A VCF file with SVs.", category: "required"}
+        svVcfIndex: {description: "The index for the SV VCF file.", category: "required"}
+
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
    }
+}
\ No newline at end of file
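
A minimal, hypothetical sketch of calling ExtractSigPredictHRD; the import alias, sample name, and hg38 choice are illustrative assumptions:

```wdl
version 1.0

# Assumed import path for the new task file.
import "extractSigPredictHRD.wdl" as chord

workflow ChordExample {
    input {
        File snvIndelVcf
        File snvIndelVcfIndex
        File svVcf
        File svVcfIndex
    }

    call chord.ExtractSigPredictHRD {
        input:
            sampleName = "sample1",
            snvIndelVcf = snvIndelVcf,
            snvIndelVcfIndex = snvIndelVcfIndex,
            svVcf = svVcf,
            svVcfIndex = svVcfIndex,
            hg38 = true  # Set false for GRCh37-aligned data.
    }

    output {
        File hrdPrediction = ExtractSigPredictHRD.chordPrediction
        File signatures = ExtractSigPredictHRD.chordSignatures
    }
}
```
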
diff --git a/fastp.wdl b/fastp.wdl
new file mode 100644
index 00000000..9849738b
--- /dev/null
+++ b/fastp.wdl
@@ -0,0 +1,124 @@
+version 1.0
+
+# MIT License
+#
+# Copyright (c) 2022 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+task Fastp {
+    input {
+        File read1
+        File read2
+        String outputPathR1
+        String outputPathR2
+        String htmlPath
+        String jsonPath
+
+        Int compressionLevel = 1
+        Boolean correction = false
+        Int lengthRequired = 15
+        Int? split
+        Boolean performAdapterTrimming = true
+        Boolean performQualityFiltering = true
+        Boolean performLengthFiltering = true
+        Boolean? performPolyGTrimming
+
+        Int threads = 4
+        String memory = "50GiB"
+        Int timeMinutes = 1 + ceil(size([read1, read2], "G") * 6.0 / threads)
+        String dockerImage = "quay.io/biocontainers/fastp:0.23.2--h5f740d0_3"
+
+        Int? noneInt
+    }
+
+    String outputDirR1 = sub(outputPathR1, basename(outputPathR1), "")
+    String outputDirR2 = sub(outputPathR2, basename(outputPathR2), "")
+
+    String polyGTrimmingFlag = if defined(performPolyGTrimming)
+        then
+            if select_first([performPolyGTrimming]) then "--trim_poly_g" else "--disable_trim_poly_g"
+        else ""
+
+    Int? effectiveSplit = if select_first([split, 1]) > 1 then split else noneInt
+
+    command <<<
+        set -e
+        mkdir -p $(dirname ~{outputPathR1})
+        mkdir -p $(dirname ~{outputPathR2})
+        mkdir -p $(dirname ~{htmlPath})
+        mkdir -p $(dirname ~{jsonPath})
+
+        # predict output paths
+        seq 1 ~{if defined(effectiveSplit) then effectiveSplit else "2"} | awk '{print "~{outputDirR1}/"$0".~{basename(outputPathR1)}"}' > r1_paths
+        seq 1 ~{if defined(effectiveSplit) then effectiveSplit else "2"} | awk '{print "~{outputDirR2}/"$0".~{basename(outputPathR2)}"}' > r2_paths
+        fastp \
+        -i ~{read1} \
+        ~{"-I " + read2} \
+        -o ~{outputPathR1} \
+        ~{"-O " + outputPathR2} \
+        -h ~{htmlPath} \
+        -j ~{jsonPath} \
+        -z ~{compressionLevel} \
+        ~{if correction then "--correction" else ""} \
+        --length_required ~{lengthRequired} \
+        --thread ~{select_first([effectiveSplit, threads])} \
+        ~{"--split " + effectiveSplit} \
+        ~{if defined(effectiveSplit) then "-d 0" else ""} \
+        ~{if performAdapterTrimming then "" else "--disable_adapter_trimming"} \
+        ~{if performQualityFiltering then "" else "--disable_quality_filtering"} \
+        ~{if performLengthFiltering then "" else "--disable_length_filtering"} \
+        ~{polyGTrimmingFlag}
+    >>>
+
+    output {
+        File htmlReport = htmlPath
+        File jsonReport = jsonPath
+        Array[File] clippedR1 = if defined(effectiveSplit) then read_lines("r1_paths") else [outputPathR1]
+        Array[File] clippedR2 = if defined(effectiveSplit) then read_lines("r2_paths") else [outputPathR2]
+    }
+
+    runtime {
+        cpu: select_first([effectiveSplit, threads])
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        read1: {description: "The R1 fastq file.", category: "required"}
+        read2: {description: "The R2 fastq file.", category: "required"}
+        outputPathR1: {description: "The output path for the R1 file.", category: "required"}
+        outputPathR2: {description: "The output path for the R2 file.", category: "required"}
+        htmlPath: {description: "The path to write the html report to.", category: "required"}
+        jsonPath: {description: "The path to write the json report to.", category: "required"}
+        compressionLevel: {description: "The compression level to use for the output.", category: "advanced"}
+        correction: {description: "Whether or not to apply overlap based correction.", category: "advanced"}
+        lengthRequired: {description: "The minimum read length.", category: "advanced"}
+        split: {description: "The number of chunks to split the files into. Number of threads will be set equal to the amount of splits.", category: "common"}
+        performAdapterTrimming: {description: "Whether adapter trimming should be performed or not.", category: "advanced"}
+        performQualityFiltering: {description: "Whether reads should be filtered based on quality scores.", category: "advanced"}
+        performLengthFiltering: {description: "Whether reads should be filtered based on length.", category: "advanced"}
+        performPolyGTrimming: {description: "Whether or not poly-G-tail trimming should be performed. If undefined fastp's default behaviour will be used, i.e. enabled for NextSeq/NovaSeq data as detected from read headers.", category: "advanced"}
+        threads: {description: "The number of threads to use. Only used if the split input is not set.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+    }
+}
\ No newline at end of file
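
A minimal, hypothetical sketch of calling the new Fastp task; the import path, output paths, and split count are illustrative assumptions:

```wdl
version 1.0

# Assumed import path for the new task file.
import "fastp.wdl" as fastp

workflow FastpExample {
    input {
        File fastqR1
        File fastqR2
    }

    call fastp.Fastp {
        input:
            read1 = fastqR1,
            read2 = fastqR2,
            outputPathR1 = "trimmed/sample_R1.fastq.gz",
            outputPathR2 = "trimmed/sample_R2.fastq.gz",
            htmlPath = "reports/sample_fastp.html",
            jsonPath = "reports/sample_fastp.json",
            # Splitting into 4 chunks also sets the thread count to 4;
            # the chunk files come back in clippedR1/clippedR2.
            split = 4
    }

    output {
        Array[File] trimmedR1 = Fastp.clippedR1
        Array[File] trimmedR2 = Fastp.clippedR2
        File report = Fastp.jsonReport
    }
}
```
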
diff --git a/fastqFilter.wdl b/fastqFilter.wdl
new file mode 100644
index 00000000..3701b8aa
--- /dev/null
+++ b/fastqFilter.wdl
@@ -0,0 +1,66 @@
+version 1.0
+
+# MIT License
+#
+# Copyright (c) 2023 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+task FastqFilter {
+    input {
+        Array[File]+ fastq
+        Array[String]+ outputPaths
+        Int? minLength
+        Int? maxLength
+
+        String memory = "1GiB"
+        Int timeMinutes = 1 + ceil(size(fastq, "G"))
+        String dockerImage = "quay.io/biocontainers/fastq-filter:0.3.0--py39hf95cd2a_1"
+    }
+
+    command {
+        set -e
+        mkdir -p $(dirname ~{sep=" " outputPaths})
+        fastq-filter \
+        -o ~{sep=" -o " outputPaths} \
+        ~{"-l " + minLength} \
+        ~{"-L " + maxLength} \
+        ~{sep=" " fastq}
+    }
+
+    output {
+        Array[File] filtered = outputPaths
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        fastq: {description: "A list of fastq files to filter.", category: "required"}
+        outputPaths: {description: "A list containing the output paths for each input fastq file.", category: "required"}
+        minLength: {description: "Equivalent to fastq-filter's `--min-length` option.", category: "common"}
+        maxLength: {description: "Equivalent to fastq-filter's `--max-length` option.", category: "common"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
    }
+}
\ No newline at end of file
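
A minimal, hypothetical sketch of calling FastqFilter on a read pair; the import path, output paths, and length bounds are illustrative assumptions:

```wdl
version 1.0

# Assumed import path for the new task file.
import "fastqFilter.wdl" as fastqFilter

workflow FastqFilterExample {
    input {
        File fastqR1
        File fastqR2
    }

    call fastqFilter.FastqFilter {
        input:
            fastq = [fastqR1, fastqR2],
            # One output path per input fastq, in the same order.
            outputPaths = ["filtered/R1.fastq.gz", "filtered/R2.fastq.gz"],
            minLength = 50,
            maxLength = 150
    }

    output {
        Array[File] filtered = FastqFilter.filtered
    }
}
```
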
diff --git a/fastqc.wdl b/fastqc.wdl
index 4d10147c..da31882c 100644
--- a/fastqc.wdl
+++ b/fastqc.wdl
@@ -29,6 +29,7 @@ task Fastqc {
         Boolean noFilter = false
         Boolean extract = false
         Boolean nogroup = false
+        Int? minLength
 
         String? format
         File? contaminants
@@ -37,58 +38,76 @@ task Fastqc {
         Int? kmers
         String? dir
 
+        # Set javaXmx a little high. Equal to fastqc default with 7 threads.
+        # This is because some fastq files need more memory. 2G per core
+        # is a nice cluster default, so we use all the rest of the memory for
+        # fastqc so we should have as little OOM crashes as possible even with
+        # weird edge case fastq's.
+        String javaXmx="1750M"
         Int threads = 1
-        String memory = "4G"
-        String dockerImage = "quay.io/biocontainers/fastqc:0.11.9--0"
-        Array[File]? NoneArray
-        File? NoneFile
+        String memory = "2GiB"
+        Int timeMinutes = 1 + ceil(size(seqFile, "G")) * 4
+        String dockerImage = "quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0"
+
+        Array[File]? noneArray
+        File? noneFile
     }
 
     # Chops of the .gz extension if present.
-    # The Basename needs to be taken here. Otherwise paths might differ between similar jobs.
+    # The Basename needs to be taken here. Otherwise paths might differ
+    # between similar jobs.
     String name = basename(sub(seqFile, "\.gz$",""))
-    # This regex chops of the extension and replaces it with _fastqc for the reportdir.
+    # This regex chops off the extension and replaces it with _fastqc for
+    # the reportdir.
     # Just as fastqc does it.
     String reportDir = outdirPath + "/" + sub(name, "\.[^\.]*$", "_fastqc")
 
-    command {
+    # We reimplement the perl wrapper here. This has the advantage that it
+    # gives us more control over the amount of memory used.
+    command <<<
         set -e
-        mkdir -p ~{outdirPath}
-        fastqc \
-        ~{"--outdir " + outdirPath} \
-        ~{true="--casava" false="" casava} \
-        ~{true="--nano" false="" nano} \
-        ~{true="--nofilter" false="" noFilter} \
-        ~{true="--extract" false="" extract} \
-        ~{true="--nogroup" false="" nogroup} \
-        ~{"--min_length " + minLength } \
-        ~{"--format " + format} \
-        ~{"--threads " + threads} \
-        ~{"--contaminants " + contaminants} \
-        ~{"--adapters " + adapters} \
-        ~{"--limits " + limits} \
-        ~{"--kmers " + kmers} \
-        ~{"--dir " + dir} \
+        mkdir -p "~{outdirPath}"
+        FASTQC_DIR="/usr/local/opt/fastqc-0.12.1"
+        export CLASSPATH="$FASTQC_DIR:$FASTQC_DIR/sam-1.103.jar:$FASTQC_DIR/jbzip2-0.9.jar:$FASTQC_DIR/cisd-jhdf5.jar"
+        java -Djava.awt.headless=true -XX:ParallelGCThreads=1 \
+        -Xms200M -Xmx~{javaXmx} \
+        ~{"-Dfastqc.output_dir=" + outdirPath} \
+        ~{true="-Dfastqc.casava=true" false="" casava} \
+        ~{true="-Dfastqc.nano=true" false="" nano} \
+        ~{true="-Dfastqc.nofilter=true" false="" noFilter} \
+        ~{true="-Dfastqc.unzip=true" false="" extract} \
+        ~{true="-Dfastqc.nogroup=true" false="" nogroup} \
+        ~{"-Dfastqc.min_length=" + minLength} \
+        ~{"-Dfastqc.sequence_format=" + format} \
+        ~{"-Dfastqc.threads=" + threads} \
+        ~{"-Dfastqc.contaminant_file=" + contaminants} \
+        ~{"-Dfastqc.adapter_file=" + adapters} \
+        ~{"-Dfastqc.limits_file=" + limits} \
+        ~{"-Dfastqc.kmer_size=" + kmers} \
+        ~{"-Djava.io.tmpdir=" + dir} \
+        uk.ac.babraham.FastQC.FastQCApplication \
         ~{seqFile}
-    }
+    >>>
 
     output {
-        File? rawReport = if extract then reportDir + "/fastqc_data.txt" else NoneFile
         File htmlReport = reportDir + ".html"
         File reportZip = reportDir + ".zip"
-        File? summary = if extract then reportDir + "/summary.txt" else NoneFile
-        Array[File]? images = if extract then glob(reportDir + "/Images/*.png") else NoneArray
+        File? summary = if extract then reportDir + "/summary.txt" else noneFile
+        File? rawReport = if extract then reportDir + "/fastqc_data.txt" else noneFile
+        Array[File]? images = if extract then glob(reportDir + "/Images/*.png") else noneArray
     }
 
     runtime {
         cpu: threads
         memory: memory
+        time_minutes: timeMinutes
         docker: dockerImage
     }
 
     parameter_meta {
+        # inputs
         seqFile: {description: "A fastq file.", category: "required"}
-        outdirPath: {description: "The path to write the output to", catgory: "required"}
+        outdirPath: {description: "The path to write the output to.", category: "required"}
         casava: {description: "Equivalent to fastqc's --casava flag.", category: "advanced"}
         nano: {description: "Equivalent to fastqc's --nano flag.", category: "advanced"}
         noFilter: {description: "Equivalent to fastqc's --nofilter flag.", category: "advanced"}
@@ -101,21 +120,31 @@ task Fastqc {
         limits: {description: "Equivalent to fastqc's --limits option.", category: "advanced"}
         kmers: {description: "Equivalent to fastqc's --kmers option.", category: "advanced"}
         dir: {description: "Equivalent to fastqc's --dir option.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         threads: {description: "The number of cores to use.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-                      category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        htmlReport: {description: "HTML report file."}
+        reportZip: {description: "Source data file."}
+        summary: {description: "Summary file."}
+        rawReport: {description: "Raw report file."}
+        images: {description: "Images in report file."}
     }
 
     meta {
         WDL_AID: {
-            exclude: ["NoneFile", "NoneArray"]
+            exclude: ["noneFile", "noneArray"]
         }
     }
 }
 
 task GetConfiguration {
     input {
+        String memory = "2G" # Needs more than 1 to pull the docker image.
+        Int timeMinutes = 1
         String dockerImage = "quay.io/biocontainers/fastqc:0.11.7--4"
     }
 
@@ -137,14 +166,20 @@ task GetConfiguration {
     }
 
     runtime {
-        memory: "2G" # Needs more than 1 to pull the docker image
+        memory: memory
+        time_minutes: timeMinutes
         docker: dockerImage
     }
 
     parameter_meta {
-        dockerImage: {
-            description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"
-        }
+        # inputs
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        memory: {description: "The amount of memory available to the job.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        adapterList: {description: "List of adapters found."}
+        contaminantList: {description: "List of contaminants found."}
+        limits: {description: "Limits file."}
     }
 }
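
A minimal, hypothetical sketch of calling the reworked Fastqc task; the import path and output directory are illustrative assumptions:

```wdl
version 1.0

# Assumed import path for the task file.
import "fastqc.wdl" as fastqc

workflow FastqcExample {
    input {
        File fastq
    }

    call fastqc.Fastqc {
        input:
            seqFile = fastq,
            outdirPath = "QC",
            # extract = true also populates the optional summary,
            # rawReport and images outputs.
            extract = true
    }

    output {
        File htmlReport = Fastqc.htmlReport
        File reportZip = Fastqc.reportZip
        File? rawReport = Fastqc.rawReport
    }
}
```
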
diff --git a/fastqsplitter.wdl b/fastqsplitter.wdl
index c523cf8a..4a02697c 100644
--- a/fastqsplitter.wdl
+++ b/fastqsplitter.wdl
@@ -1,7 +1,5 @@
 version 1.0
 
-# MIT License
-#
 # Copyright (c) 2019 Leiden University Medical Center
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -26,19 +24,24 @@ task Fastqsplitter {
     input {
         File inputFastq
         Array[String]+ outputPaths
-        String dockerImage = "quay.io/biocontainers/fastqsplitter:1.1.0--py37h516909a_1"
+
         Int? compressionLevel
         Int? threadsPerFile
-        # fastqplitter utilizes one thread per input file and one or more threads per output file + one thread for the application.
-        # Since a compression level of 1 is used, each output file uses approx 0.5 cores.
+
+        # fastqsplitter utilizes one thread per input file and one or
+        # more threads per output file + one thread for the application.
+        # Since a compression level of 1 is used, each output file
+        # uses approx 0.5 cores.
         Int cores = 1 + ceil(0.5 * length(outputPaths))
+        String dockerImage = "quay.io/biocontainers/fastqsplitter:1.1.0--py37h516909a_1"
     }
 
     # Busybox mkdir does not accept multiple paths.
     command <<<
         set -e
         for FILE in ~{sep=' ' outputPaths}
-        do mkdir -p "$(dirname $FILE)"
+        do
+            mkdir -p "$(dirname ${FILE})"
         done
         fastqsplitter \
         ~{"-c " + compressionLevel} \
@@ -51,15 +54,16 @@ task Fastqsplitter {
         Array[File] chunks = outputPaths
     }
 
-    # Using very safe margins here. 10MB/300MB per outputfile is used for single-threaded/multi-threaded compression.
+    # Using very safe margins here. 10MB/300MB per output file is used for
+    # single-threaded/multi-threaded compression.
     Float memoryPerFile = if select_first([threadsPerFile, 1]) > 1 then 0.40 else 0.02
     Int fastqsplitterMemory = ceil(0.100 + memoryPerFile * length(outputPaths))
-    # Make sure a minimum of 2 GB is present to pull the singularity image
+    # Make sure a minimum of 2 GB is present to pull the singularity image.
    Int memory = if fastqsplitterMemory <= 2 then 2 else fastqsplitterMemory
 
     runtime {
-        memory: "~{memory}G"
-        docker: dockerImage
         cpu: cores
+        memory: "~{memory}GiB"
+        docker: dockerImage
     }
 }
diff --git a/fgbio.wdl b/fgbio.wdl
new file mode 100644
index 00000000..15fb0ea4
--- /dev/null
+++ b/fgbio.wdl
@@ -0,0 +1,68 @@
+version 1.0
+
+# Copyright (c) 2017 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task AnnotateBamWithUmis { + input { + File inputBam + File inputUmi + String outputPath + + String memory = "120GiB" + Int timeMinutes = 360 + String javaXmx="100G" + String dockerImage = "quay.io/biocontainers/fgbio:1.4.0--hdfd78af_0" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + fgbio -Xmx~{javaXmx} \ + AnnotateBamWithUmis \ + -i ~{inputBam} \ + -f ~{inputUmi} \ + -o ~{outputPath} + } + + output { + File outputBam = outputPath + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputBam: {description: "The input BAM file.", category: "required"} + inputUmi: {description: "The input fastq file with UMIs.", category: "required"} + outputPath: {description: "Output directory path + output file.", category: "required"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "UMI-annotated output BAM file."} + } +} diff --git a/flash.wdl b/flash.wdl index 6e704921..7b50e0d7 100644 --- a/flash.wdl +++ b/flash.wdl @@ -24,16 +24,17 @@ import "common.wdl" as common task Flash { input { - String? preCommand FastqPair inputFastq String outdirPath String outPrefix = "flash" + Boolean compress = true + + String? preCommand Int? minOverlap Int? maxOverlap - Boolean compress = true Int threads = 2 - String memory = "2G" + String memory = "2GiB" } command { @@ -55,8 +56,8 @@ task Flash { File notCombined1 = outdirPath + "/" + outPrefix + ".notCombined_1.fastq.gz" File notCombined2 = outdirPath + "/" + outPrefix + ".notCombined_2.fastq.gz" FastqPair notCombined = object { - R1: notCombined1, - R2: notCombined2 + R1: notCombined1, + R2: notCombined2 } File hist = outdirPath + "/" + outPrefix + ".hist" File histogram = outdirPath + "/" + outPrefix + ".histogram" @@ -66,5 +67,4 @@ task Flash { cpu: threads memory: memory } - -} \ No newline at end of file +} diff --git a/gatk.wdl b/gatk.wdl index b730cbee..655a0b66 100644 --- a/gatk.wdl +++ b/gatk.wdl @@ -28,19 +28,21 @@ task AnnotateIntervals { String annotatedIntervalsPath = "intervals.annotated.tsv" File intervals String intervalMergingRule = "OVERLAPPING_ONLY" + Int featureQueryLookahead = 1000000 + File? mappabilityTrack File? 
segmentalDuplicationTrack - Int featureQueryLookahead = 1000000 - String memory = "10G" String javaXmx = "2G" - String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + String memory = "3GiB" + Int timeMinutes = 5 + String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0" } command { set -e mkdir -p "$(dirname ~{annotatedIntervalsPath})" - gatk --java-options -Xmx~{javaXmx} \ + gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \ AnnotateIntervals \ -R ~{referenceFasta} \ -L ~{intervals} \ @@ -56,11 +58,13 @@ task AnnotateIntervals { } runtime { - docker: dockerImage memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { + # inputs referenceFasta: {description: "The reference fasta file.", category: "required"} referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} @@ -69,16 +73,18 @@ task AnnotateIntervals { intervalMergingRule: {description: "Equivalent to gatk AnnotateIntervals' `--interval-merging-rule` option.", category: "advanced"} mappabilityTrack: {description: "Equivalent to gatk AnnotateIntervals' `--mappability-track` option.", category: "common"} segmentalDuplicationTrack: {description: "Equivalent to gatk AnnotateIntervals' `--segmenta-duplicarion-track` option.", category: "common"} - featureQueryLookahead: {description: "Equivalent to gatk AnnotateIntervals' `--feature-query-lookahead` option", category: "advanced"} + featureQueryLookahead: {description: "Equivalent to gatk AnnotateIntervals' `--feature-query-lookahead` option.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + annotatedIntervals: {description: "This is a tab-separated values (TSV) file with a SAM-style header containing a sequence dictionary, a row specifying the column headers for the contained annotations, and the corresponding entry rows."} } } -# Apply Base Quality Score Recalibration (BQSR) model +# Apply Base Quality Score Recalibration (BQSR) model. task ApplyBQSR { input { File inputBam @@ -90,15 +96,18 @@ task ApplyBQSR { File referenceFastaDict File referenceFastaFai - String memory = "12G" - String javaXmx = "4G" - String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + Int javaXmxMb = 2048 + Int memoryMb = javaXmxMb + 512 + # This will likely be used with intervals, as such size based + # estimation can't be used. 
+ Int timeMinutes = 120 + String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0" } command { set -e mkdir -p "$(dirname ~{outputBamPath})" - gatk --java-options -Xmx~{javaXmx} \ + gatk --java-options '-Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1' \ ApplyBQSR \ --create-output-bam-md5 \ --add-output-sam-program-record \ @@ -120,31 +129,34 @@ task ApplyBQSR { } runtime { + memory: "~{memoryMb}MiB" + time_minutes: timeMinutes docker: dockerImage - memory: memory } parameter_meta { + # inputs inputBam: {description: "The BAM file which should be recalibrated.", category: "required"} inputBamIndex: {description: "The input BAM file's index.", category: "required"} outputBamPath: {description: "The location the resulting BAM file should be written.", category: "required"} recalibrationReport: {description: "The BQSR report the be used for recalibration.", category: "required"} sequenceGroupInterval: {description: "Bed files describing the regions to operate on.", category: "advanced"} - referenceFasta: {description: "The reference fasta file which was also used for mapping.", - category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", - category: "required"} + referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"} + memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} - memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + # outputs + recalibratedBam: {description: "A BAM file containing the recalibrated read data."} + recalibratedBamIndex: {description: "Index of recalibrated BAM file."} + recalibratedBamMd5: {description: "MD5 of recalibrated BAM file."} } } -# Generate Base Quality Score Recalibration (BQSR) model +# Generate Base Quality Score Recalibration (BQSR) model. task BaseRecalibrator { input { File inputBam @@ -153,21 +165,23 @@ task BaseRecalibrator { Array[File] sequenceGroupInterval = [] Array[File] knownIndelsSitesVCFs = [] Array[File] knownIndelsSitesVCFIndexes = [] - File? dbsnpVCF - File? dbsnpVCFIndex File referenceFasta File referenceFastaDict File referenceFastaFai - String memory = "12G" - String javaXmx = "4G" - String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + File? dbsnpVCF + File? 
dbsnpVCFIndex + + Int javaXmxMb = 1024 + Int memoryMb = javaXmxMb + 512 + Int timeMinutes = 120 # This will likely be used with intervals, as such size based estimation can't be used. + String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0" } command { set -e mkdir -p "$(dirname ~{recalibrationReportPath})" - gatk --java-options -Xmx~{javaXmx} \ + gatk --java-options '-Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1' \ BaseRecalibrator \ -R ~{referenceFasta} \ -I ~{inputBam} \ @@ -183,46 +197,49 @@ task BaseRecalibrator { } runtime { + memory: "~{memoryMb}MiB" + time_minutes: timeMinutes docker: dockerImage - memory: memory } parameter_meta { + # inputs inputBam: {description: "The BAM file to generate a BQSR report for.", category: "required"} inputBamIndex: {description: "The index of the input BAM file.", category: "required"} recalibrationReportPath: {description: "The location to write the BQSR report to.", category: "required"} sequenceGroupInterval: {description: "Bed files describing the regions to operate on.", category: "advanced"} knownIndelsSitesVCFs: {description: "VCF files with known indels.", category: "advanced"} knownIndelsSitesVCFIndexes: {description: "The indexed for the known variant VCFs.", category: "advanced"} + referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} + referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} dbsnpVCF: {description: "A dbSNP VCF.", category: "common"} dbsnpVCFIndex: {description: "The index for the dbSNP VCF.", category: "common"} - referenceFasta: {description: "The reference fasta file which was also used for mapping.", - category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", - category: "required"} - referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"} + memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} - memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + # outputs + recalibrationReport: {description: "A GATK Report file with many tables."} } } task CalculateContamination { input { File tumorPileups + File? 

 task CalculateContamination {
     input {
         File tumorPileups
+
         File? normalPileups

-        String memory = "24G"
         String javaXmx = "12G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1"
+        String memory = "13GiB"
+        Int timeMinutes = 180
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         CalculateContamination \
         -I ~{tumorPileups} \
         ~{"-matched " + normalPileups} \
@@ -236,18 +253,23 @@ task CalculateContamination {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         tumorPileups: {description: "The pileup summary of a tumor/case sample.", category: "required"}
         normalPileups: {description: "The pileup summary of the normal/control sample.", category: "common"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        contaminationTable: {description: "Table with fractions of reads from cross-sample contamination."}
+        mafTumorSegments: {description: "Tumor segments table."}
     }
 }

@@ -256,15 +278,16 @@ task CallCopyRatioSegments {
         String outputPrefix
         File copyRatioSegments

-        String memory = "21G"
-        String javaXmx = "6G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        String javaXmx = "2G"
+        String memory = "3GiB"
+        Int timeMinutes = 2
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{outputPrefix})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         CallCopyRatioSegments \
         -I ~{copyRatioSegments} \
         -O ~{outputPrefix}.called.seg
@@ -276,18 +299,23 @@ task CallCopyRatioSegments {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         outputPrefix: {description: "The prefix for the output files.", category: "required"}
         copyRatioSegments: {description: "The copy ratios file generated by gatk ModelSegments.", category: "required"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        calledSegments: {description: "This is a tab-separated values (TSV) file with a SAM-style header containing a read group sample name, a sequence dictionary, a row specifying the column headers contained in CalledCopyRatioSegmentCollection.CalledCopyRatioSegmentTableColumn, and the corresponding entry rows."}
+        calledSegmentsIgv: {description: "This is a tab-separated values (TSV) file with CBS-format column headers and the corresponding entry rows that can be plotted using IGV."}
     }
 }

@@ -295,21 +323,24 @@ task CollectAllelicCounts {
     input {
         String allelicCountsPath = "allelic_counts.tsv"
         File commonVariantSites
-        File? commonVariantSitesIndex
         File inputBam
         File inputBamIndex
         File referenceFasta
         File referenceFastaDict
         File referenceFastaFai
-        String memory = "90G"
-        String javaXmx = "30G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+
+        File? commonVariantSitesIndex
+
+        String javaXmx = "10G"
+        String memory = "11GiB"
+        Int timeMinutes = 120
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{allelicCountsPath})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         CollectAllelicCounts \
         -L ~{commonVariantSites} \
         -I ~{inputBam} \
@@ -322,24 +353,28 @@ task CollectAllelicCounts {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         allelicCountsPath: {description: "The path the output should be written to.", category: "advanced"}
         commonVariantSites: {description: "Interval list or vcf of common variant sites (to retrieve the allelic counts for).", category: "required"}
-        commonVariantSitesIndex: {description: "The index for commonVariantSites.", category: "common"}
         inputBam: {description: "The BAM file to generate counts for.", category: "required"}
         inputBamIndex: {description: "The index of the input BAM file.", category: "required"}
         referenceFasta: {description: "The reference fasta file.", category: "required"}
         referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+        commonVariantSitesIndex: {description: "The index for commonVariantSites.", category: "common"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        allelicCounts: {description: "This is a tab-separated values (TSV) file with a SAM-style header containing a read group sample name, a sequence dictionary, a row specifying the column headers contained in AllelicCountCollection.AllelicCountTableColumn, and the corresponding entry rows."}
     }
 }

@@ -354,15 +389,16 @@ task CollectReadCounts {
         File referenceFastaFai
         String intervalMergingRule = "OVERLAPPING_ONLY"

-        String memory = "35G"
         String javaXmx = "7G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        String memory = "8GiB"
+        Int timeMinutes = 1 + ceil(size(inputBam, "G") * 5)
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{countsPath})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         CollectReadCounts \
         -L ~{intervals} \
         -I ~{inputBam} \
@@ -377,11 +413,13 @@ task CollectReadCounts {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         countsPath: {description: "The location the output should be written to.", category: "advanced"}
         intervals: {description: "The intervals to collect counts for.", category: "required"}
         inputBam: {description: "The BAM file to determine the coverage for.", category: "required"}
@@ -390,11 +428,13 @@ task CollectReadCounts {
         referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
         intervalMergingRule: {description: "Equivalent to gatk CollectReadCounts' `--interval-merging-rule` option.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        counts: {description: "Read counts at specified intervals."}
     }
 }

@@ -408,15 +448,16 @@ task CombineGVCFs {
         File referenceFastaDict
         File referenceFastaFai

-        String memory = "24G"
-        String javaXmx = "12G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        String javaXmx = "4G"
+        String memory = "5GiB"
+        Int timeMinutes = 1 + ceil(size(gvcfFiles, "G") * 8)
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{outputPath})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         CombineGVCFs \
         -R ~{referenceFasta} \
         -O ~{outputPath} \
@@ -430,26 +471,28 @@ task CombineGVCFs {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         gvcfFiles: {description: "The GVCF files to be combined.", category: "required"}
         gvcfFilesIndex: {description: "The indexes for the GVCF files.", caregory: "required"}
         intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "advanced"}
         outputPath: {description: "The location the combined GVCF should be written to.", category: "required"}
-        referenceFasta: {description: "The reference fasta file which was also used for mapping.",
-            category: "required"}
-        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
-            category: "required"}
+        referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
+        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
-
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputVcf: {description: "A combined multi-sample gVCF."}
+        outputVcfIndex: {description: "Index of the output file."}
     }
 }

@@ -461,31 +504,32 @@ task CombineVariants {
         String genotypeMergeOption = "UNIQUIFY"
         String filteredRecordsMergeType = "KEEP_IF_ANY_UNFILTERED"
         Array[String]+ identifiers
-        Array[File]+ variantVcfs # follow "identifiers" array order
+        Array[File]+ variantVcfs # Follow "identifiers" array order.
         Array[File]+ variantIndexes
         String outputPath

-        String memory = "24G"
         String javaXmx = "12G"
+        String memory = "13GiB"
+        Int timeMinutes = 180
         String dockerImage = "broadinstitute/gatk3:3.8-1"
     }

     command <<<
         set -e
         mkdir -p "$(dirname ~{outputPath})"
-
-        # build "-V: " arguments according to IDs and VCFs to merge
-        # Make sure commands are run in bash
+        # Build "-V: " arguments according to IDs
+        # and VCFs to merge.
+        # Make sure commands are run in bash.
         V_args=$(bash -c '
         set -eu
         ids=(~{sep=" " identifiers})
         vars=(~{sep=" " variantVcfs})
         for (( i = 0; i < ${#ids[@]}; ++i ))
-        do
+            do
             printf -- "-V:%s %s " "${ids[i]}" "${vars[i]}"
-        done
+            done
         ')
-        java -Xmx~{javaXmx} -jar /usr/GenomeAnalysisTK.jar \
+        java -Xmx~{javaXmx} -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar \
         -T CombineVariants \
         -R ~{referenceFasta} \
         --genotypemergeoption ~{genotypeMergeOption} \
@@ -500,11 +544,13 @@ task CombineVariants {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
         referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
@@ -513,13 +559,15 @@ task CombineVariants {
         identifiers: {description: "The sample identifiers in the same order as variantVcfs.", category: "required"}
         variantVcfs: {description: "The input VCF files in the same order as identifiers.", category: "required"}
         variantIndexes: {description: "The indexes of the input VCF files.", category: "required"}
-        outputPath: {description: "The location the output should be written to", category: "required"}
-
+        outputPath: {description: "The location the output should be written to.", category: "required"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        combinedVcf: {description: "Combined VCF file."}
+        combinedVcfIndex: {description: "Index of combined VCF file."}
     }
 }
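+# For example, identifiers ["NORMAL", "TUMOR"] with matching VCFs make the
+# V_args loop above expand to `-V:NORMAL normal.vcf -V:TUMOR tumor.vcf`
+# (file names illustrative), tagging each input VCF with its sample identifier.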

@@ -527,17 +575,20 @@ task CreateReadCountPanelOfNormals {
     input {
         String PONpath = "PON.hdf5"
         Array[File]+ readCountsFiles
+
         File? annotatedIntervals

-        String memory = "21G"
         String javaXmx = "7G"
-        String dockerImage = "broadinstitute/gatk:4.1.4.0" # The biocontainer causes a spark related error for some reason...
+        String memory = "8GiB"
+        Int timeMinutes = 5
+        # The biocontainer causes a Spark-related error for some reason.
+        String dockerImage = "broadinstitute/gatk:4.1.8.0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{PONpath})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         CreateReadCountPanelOfNormals \
         -I ~{sep=" -I " readCountsFiles} \
         ~{"--annotated-intervals " + annotatedIntervals} \
@@ -549,39 +600,44 @@ task CreateReadCountPanelOfNormals {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         PONpath: {description: "The location the PON should be written to.", category: "common"}
         readCountsFiles: {description: "The read counts files as generated by CollectReadCounts.", category: "required"}
-        annotatedIntervals: {description: "An annotation set of intervals as generated by AnnotateIntervals. If provided, explicit GC correction will be performed.",
-            category: "advanced"}
+        annotatedIntervals: {description: "An annotation set of intervals as generated by AnnotateIntervals. If provided, explicit GC correction will be performed.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        PON: {description: "Panel-of-normals file."}
     }
 }

 task DenoiseReadCounts {
     input {
-        File? PON
-        File? annotatedIntervals
         File readCounts
         String outputPrefix

-        String memory = "39G"
-        String javaXmx = "13G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        File? PON
+        File? annotatedIntervals
+
+        String javaXmx = "4G"
+        String memory = "5GiB"
+        Int timeMinutes = 5
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{outputPrefix})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         DenoiseReadCounts \
         -I ~{readCounts} \
         ~{"--count-panel-of-normals " + PON} \
@@ -596,21 +652,25 @@ task DenoiseReadCounts {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
-        PON: {description: "A panel of normals as generated by CreateReadCountPanelOfNormals.", category: "advanced"}
-        annotatedIntervals: {description: "An annotated set of intervals as generated by AnnotateIntervals. Will be ignored if PON is provided.",
-            category: "advanced"}
+        # inputs
         readCounts: {description: "The read counts file as generated by CollectReadCounts.", category: "required"}
         outputPrefix: {description: "The prefix for the output files.", category: "required"}
+        PON: {description: "A panel of normals as generated by CreateReadCountPanelOfNormals.", category: "advanced"}
+        annotatedIntervals: {description: "An annotated set of intervals as generated by AnnotateIntervals. Will be ignored if PON is provided.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        standardizedCopyRatios: {description: "This is a tab-separated values (TSV) file with a SAM-style header containing a read group sample name, a sequence dictionary, a row specifying the column headers contained in CopyRatioCollection.CopyRatioTableColumn, and the corresponding entry rows."}
+        denoisedCopyRatios: {description: "This is a tab-separated values (TSV) file with a SAM-style header containing a read group sample name, a sequence dictionary, a row specifying the column headers contained in CopyRatioCollection.CopyRatioTableColumn, and the corresponding entry rows."}
     }
 }

@@ -622,21 +682,23 @@ task FilterMutectCalls {
         File unfilteredVcf
         File unfilteredVcfIndex
         String outputVcf
+        Int uniqueAltReadCount = 4
+        File mutect2Stats
+
         File? contaminationTable
         File? mafTumorSegments
         File? artifactPriors
-        Int uniqueAltReadCount = 4
-        File mutect2Stats
-        String memory = "24G"
         String javaXmx = "12G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1"
+        String memory = "13GiB"
+        Int timeMinutes = 60
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{outputVcf})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         FilterMutectCalls \
         -R ~{referenceFasta} \
         -V ~{unfilteredVcf} \
@@ -657,46 +719,52 @@ task FilterMutectCalls {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
         referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
         unfilteredVcf: {description: "An unfiltered VCF file as produced by Mutect2.", category: "required"}
         unfilteredVcfIndex: {description: "The index of the unfiltered VCF file.", category: "required"}
         outputVcf: {description: "The location the filtered VCF file should be written.", category: "required"}
+        uniqueAltReadCount: {description: "Equivalent to FilterMutectCalls' `--unique-alt-read-count` option.", category: "advanced"}
+        mutect2Stats: {description: "Equivalent to FilterMutectCalls' `-stats` option.", category: "advanced"}
         contaminationTable: {description: "Equivalent to FilterMutectCalls' `--contamination-table` option.", category: "advanced"}
         mafTumorSegments: {description: "Equivalent to FilterMutectCalls' `--tumor-segmentation` option.", category: "advanced"}
         artifactPriors: {description: "Equivalent to FilterMutectCalls' `--ob-priors` option.", category: "advanced"}
-        uniqueAltReadCount: {description: "Equivalent to FilterMutectCalls' `--unique-alt-read-count` option.", category: "advanced"}
-        mutect2Stats: {description: "Equivalent to FilterMutectCalls' `-stats` option.", category: "advanced"}
-
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        filteredVcf: {description: "VCF file with filtered variants from a Mutect2 VCF callset."}
+        filteredVcfIndex: {description: "Index of output VCF file."}
+        filteringStats: {description: "The output filtering stats file."}
     }
 }
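+# Note: the optional flags above rely on WDL's concatenation rule, e.g.
+# `~{"--contamination-table " + contaminationTable}` evaluates to an empty
+# string when the optional input is not set, so the flag is omitted entirely.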

-# Combine multiple recalibration tables from scattered BaseRecalibrator runs
+# Combine multiple recalibration tables from scattered BaseRecalibrator runs.
 task GatherBqsrReports {
     input {
         Array[File] inputBQSRreports
         String outputReportPath

-        String memory = "12G"
-        String javaXmx = "4G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        Int javaXmxMb = 256
+        Int memoryMb = 256 + javaXmxMb
+        Int timeMinutes = 1
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{outputReportPath})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1' \
         GatherBQSRReports \
         -I ~{sep=' -I ' inputBQSRreports} \
         -O ~{outputReportPath}
@@ -707,19 +775,22 @@ task GatherBqsrReports {
     }

     runtime {
+        memory: "~{memoryMb}MiB"
+        time_minutes: timeMinutes
         docker: dockerImage
-        memory: memory
     }

     parameter_meta {
+        # inputs
         inputBQSRreports: {description: "The BQSR reports to be merged.", category: "required"}
         outputReportPath: {description: "The location of the combined BQSR report.", category: "required"}
+        javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"}
+        memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
-        memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        # outputs
+        outputBQSRreport: {description: "Single file with scattered BQSR recalibration reports gathered into one."}
     }
 }

@@ -730,16 +801,19 @@ task GenomicsDBImport {
         Array[File]+ intervals
         String genomicsDBWorkspacePath = "genomics_db"
         String genomicsDBTarFile = "genomics_db.tar.gz"
+
         String? tmpDir

-        String memory = "12G"
+
         String javaXmx = "4G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        String memory = "5GiB"
+        Int timeMinutes = 180
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{genomicsDBWorkspacePath})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         GenomicsDBImport \
         -V ~{sep=" -V " gvcfFiles} \
         --genomicsdb-workspace-path ~{genomicsDBWorkspacePath} \
@@ -753,23 +827,26 @@ task GenomicsDBImport {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         gvcfFiles: {description: "The gvcfFiles to be merged.", category: "required"}
         gvcfFilesIndex: {description: "Indexes for the gvcfFiles.", category: "required"}
         intervals: {description: "intervals over which to operate.", category: "required"}
-        genomicsDBWorkspacePath: {description: "Where the genomicsDB files should be stored", category: "advanced"}
-        genomicsDBTarFile: {description: "Where the .tar file containing the genomicsDB should be stored", category: "advanced"}
-        tmpDir: {description: "Alternate temporary directory in case there is not enough space. Must be mounted when using containers",
-            category: "advanced"}
+        genomicsDBWorkspacePath: {description: "Where the genomicsDB files should be stored.", category: "advanced"}
+        genomicsDBTarFile: {description: "Where the .tar file containing the genomicsDB should be stored.", category: "advanced"}
+        tmpDir: {description: "Alternate temporary directory in case there is not enough space. Must be mounted when using containers.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        genomicsDbTarArchive: {description: "Imported VCFs to GenomicsDB file."}
    }
 }
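+# Note: the GenomicsDB workspace is a directory, which a WDL task cannot
+# return as a File, hence the tar archive produced by the task above.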

@@ -777,66 +854,70 @@ task GenotypeGVCFs {
     input {
         File gvcfFile
         File gvcfFileIndex
-        Array[File]+ intervals
         String outputPath
         File referenceFasta
         File referenceFastaDict
         File referenceFastaFai
         Array[String] annotationGroups = ["StandardAnnotation"]
+
+        Array[File]? intervals
         File? dbsnpVCF
         File? dbsnpVCFIndex
         File? pedigree

-        String memory = "18G"
         String javaXmx = "6G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        String memory = "7GiB"
+        Int timeMinutes = 120 # This will likely be used with intervals, so size-based estimation can't be used.
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{outputPath})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         GenotypeGVCFs \
         -R ~{referenceFasta} \
         -O ~{outputPath} \
         ~{"-D " + dbsnpVCF} \
         ~{"--pedigree " + pedigree} \
         ~{true="-G" false="" length(annotationGroups) > 0} ~{sep=" -G " annotationGroups} \
-        --only-output-calls-starting-in-intervals \
         -V ~{gvcfFile} \
-        -L ~{sep=' -L ' intervals}
+        ~{true="--only-output-calls-starting-in-intervals" false="" defined(intervals)} \
+        ~{true="-L" false="" defined(intervals)} ~{sep=' -L ' intervals}
     }

     output {
         File outputVCF = outputPath
         File outputVCFIndex = outputPath + ".tbi"
-
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         gvcfFile: {description: "The GVCF file to be genotyped.", category: "required"}
         gvcfFileIndex: {description: "The index of the input GVCF file.", category: "required"}
-        intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "required"}
         outputPath: {description: "The location to write the output VCF file to.", category: "required"}
-        referenceFasta: {description: "The reference fasta file which was also used for mapping.",
-            category: "required"}
-        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
-            category: "required"}
+        referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
+        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
-        annotationGroups: {description: "Which annotation groups will be used for the annotation", category: "advanced"}
+        annotationGroups: {description: "Which annotation groups will be used for the annotation.", category: "advanced"}
+        intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"}
         dbsnpVCF: {description: "A dbSNP VCF.", category: "common"}
         dbsnpVCFIndex: {description: "The index for the dbSNP VCF.", category: "common"}
-        pedigree: {description: "Pedigree file for determining the population \"founders\"", category: "common"}
+        pedigree: {description: "Pedigree file for determining the population \"founders\".", category: "common"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputVCF: {description: "A final VCF in which all samples have been jointly genotyped."}
+        outputVCFIndex: {description: "Index of final VCF file."}
     }
 }
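+# Note: with `intervals` now optional, the placeholders
+# `~{true="-L" false="" defined(intervals)}` above only emit the interval
+# options when intervals were actually provided.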

@@ -850,14 +931,15 @@ task GetPileupSummaries {
         File sitesForContaminationIndex
         String outputPrefix

-        String memory = "24G"
         String javaXmx = "12G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1"
+        String memory = "13GiB"
+        Int timeMinutes = 120
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         GetPileupSummaries \
         -I ~{sampleBam} \
         -V ~{variantsForContamination} \
@@ -870,11 +952,13 @@ task GetPileupSummaries {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         sampleBam: {description: "A BAM file for which a pileup should be created.", category: "required"}
         sampleBamIndex: {description: "The index of the input BAM file.", category: "required"}
         variantsForContamination: {description: "A VCF file with common variants.", category: "required"}
@@ -882,44 +966,50 @@ task GetPileupSummaries {
         sitesForContamination: {description: "A bed file describing regions to operate on.", category: "required"}
         sitesForContaminationIndex: {description: "The index for the bed file.", category: "required"}
         outputPrefix: {description: "The prefix for the ouput.", category: "required"}
-
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        pileups: {description: "Pileup metrics for inferring contamination."}
     }
 }

-# Call variants on a single sample with HaplotypeCaller to produce a GVCF
+# Call variants on a single sample with HaplotypeCaller to produce a GVCF.
 task HaplotypeCaller {
     input {
         Array[File]+ inputBams
         Array[File]+ inputBamsIndex
-        Array[File]+? intervalList
-        Array[File]+? excludeIntervalList
         String outputPath
         File referenceFasta
         File referenceFastaIndex
         File referenceFastaDict
+        Boolean gvcf = false
+        String emitRefConfidence = if gvcf then "GVCF" else "NONE"
+        Boolean dontUseSoftClippedBases = false
+
+        Array[File]+? intervalList
+        Array[File]+? excludeIntervalList
         Float? contamination
         File? dbsnpVCF
         File? dbsnpVCFIndex
         File? pedigree
         Int? ploidy
         String? outputMode
-        Boolean gvcf = false
-        String emitRefConfidence = if gvcf then "GVCF" else "NONE"
+        Float? standardMinConfidenceThresholdForCalling

-        String memory = "12G"
-        String javaXmx = "4G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        Int javaXmxMb = 4096
+        # Memory use increases over time; 4G should cover most use cases.
+        Int memoryMb = javaXmxMb + 512
+        Int timeMinutes = 400 # This will likely be used with intervals, so size-based estimation can't be used.
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{outputPath})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1' \
         HaplotypeCaller \
         -R ~{referenceFasta} \
         -O ~{outputPath} \
@@ -931,7 +1021,9 @@ task HaplotypeCaller {
         ~{"--pedigree " + pedigree} \
         ~{"--contamination-fraction-per-sample-file " + contamination} \
         ~{"--output-mode " + outputMode} \
-        --emit-ref-confidence ~{emitRefConfidence}
+        --emit-ref-confidence ~{emitRefConfidence} \
+        ~{true="--dont-use-soft-clipped-bases" false="" dontUseSoftClippedBases} \
+        ~{"--standard-min-confidence-threshold-for-calling " + standardMinConfidenceThresholdForCalling}
     }

     output {
@@ -940,52 +1032,55 @@ task HaplotypeCaller {
     }

     runtime {
+        memory: "~{memoryMb}MiB"
+        time_minutes: timeMinutes
         docker: dockerImage
-        memory: memory
     }

     parameter_meta {
+        # inputs
         inputBams: {description: "The BAM files on which to perform variant calling.", category: "required"}
         inputBamsIndex: {description: "The indexes for the input BAM files.", category: "required"}
-        intervalList: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"}
-        excludeIntervalList: {description: "Bed files or interval lists describing the regions to NOT operate on.", category: "common"}
         outputPath: {description: "The location to write the output to.", category: "required"}
-        ploidy: {description: "The ploidy with which the variants should be called.", category: "common"}
-        gvcf: {description: "Whether the output should be a gvcf", category: "common"}
-        referenceFasta: {description: "The reference fasta file which was also used for mapping.",
-            category: "required"}
-        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
-            category: "required"}
+        referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
+        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
         referenceFastaIndex: {description: "The index for the reference fasta file.", category: "required"}
+        gvcf: {description: "Whether the output should be a gvcf.", category: "common"}
+        emitRefConfidence: {description: "Whether to include reference calls. Three modes: 'NONE', 'BP_RESOLUTION' and 'GVCF'.", category: "advanced"}
+        dontUseSoftClippedBases: {description: "Do not use soft-clipped bases. Should be 'true' for RNA variant calling.", category: "common"}
+        intervalList: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"}
+        excludeIntervalList: {description: "Bed files or interval lists describing the regions to NOT operate on.", category: "common"}
         contamination: {description: "Equivalent to HaplotypeCaller's `-contamination` option.", category: "advanced"}
-        outputMode: {description: "Specifies which type of calls we should output. Same as HaplotypeCaller's `--output-mode` option.",
-            category: "advanced"}
-        emitRefConfidence: {description: "Whether to include reference calls. Three modes: 'NONE', 'BP_RESOLUTION' and 'GVCF'",
-            category: "advanced"}
         dbsnpVCF: {description: "A dbSNP VCF.", category: "common"}
         dbsnpVCFIndex: {description: "The index for the dbSNP VCF.", category: "common"}
-        pedigree: {description: "Pedigree file for determining the population \"founders\"", category: "common"}
-        memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        pedigree: {description: "Pedigree file for determining the population \"founders\".", category: "common"}
+        ploidy: {description: "The ploidy with which the variants should be called.", category: "common"}
+        outputMode: {description: "Specifies which type of calls we should output. Same as HaplotypeCaller's `--output-mode` option.", category: "advanced"}
+        standardMinConfidenceThresholdForCalling: {description: "Confidence threshold used for calling variants.", category: "advanced"}
+        javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"}
+        memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputVCF: {description: "Raw, unfiltered SNP and indel calls."}
+        outputVCFIndex: {description: "Index of output VCF."}
     }
 }
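+# Note: because `emitRefConfidence` defaults to `if gvcf then "GVCF" else "NONE"`,
+# setting `gvcf = true` is enough to switch the task above to GVCF output, while
+# the mode can still be overridden explicitly.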

-
 task LearnReadOrientationModel {
     input {
         Array[File]+ f1r2TarGz

-        String memory = "24G"
         String javaXmx = "12G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1"
+        String memory = "13GiB"
+        Int timeMinutes = 120
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
    }

     command {
         set -e
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         LearnReadOrientationModel \
         -I ~{sep=" -I " f1r2TarGz} \
         -O "artifact-priors.tar.gz"
@@ -996,17 +1091,21 @@ task LearnReadOrientationModel {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         f1r2TarGz: {description: "A f1r2TarGz file outputed by mutect2.", category: "required"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        artifactPriorsTable: {description: "Maximum likelihood estimates of artifact prior probabilities in the orientation bias mixture model filter."}
     }
 }

@@ -1014,14 +1113,15 @@ task MergeStats {
     input {
         Array[File]+ stats

-        String memory = "28G"
         String javaXmx = "14G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        String memory = "15GiB"
+        Int timeMinutes = 30
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         MergeMutectStats \
         -stats ~{sep=" -stats " stats} \
         -O "merged.stats"
@@ -1032,17 +1132,21 @@ task MergeStats {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         stats: {description: "Statistics files to be merged.", category: "required"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        mergedStats: {description: "Merged stats from scattered Mutect2 runs."}
    }
 }

@@ -1052,21 +1156,21 @@ task ModelSegments {
         String outputPrefix
         File denoisedCopyRatios
         File allelicCounts
-        File? normalAllelicCounts
-        Int minimumTotalAlleleCountCase = if defined(normalAllelicCounts)
-            then 0
-            else 30
+        Int minimumTotalAlleleCountCase = if defined(normalAllelicCounts) then 0 else 30
         Int maximumNumberOfSmoothingIterations = 10

-        String memory = "64G"
+        File? normalAllelicCounts
+
         String javaXmx = "10G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        String memory = "11GiB"
+        Int timeMinutes = 60
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p ~{outputDir}
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         ModelSegments \
         --denoised-copy-ratios ~{denoisedCopyRatios} \
         --allelic-counts ~{allelicCounts} \
@@ -1079,7 +1183,6 @@ task ModelSegments {

     output {
         File hetrozygousAllelicCounts = outputDir + "/" + outputPrefix + ".hets.tsv"
-        File? normalHetrozygousAllelicCounts = outputDir + "/" + outputPrefix + ".hets.normal.tsv"
         File copyRatioSegments = outputDir + "/" + outputPrefix + ".cr.seg"
         File copyRatioCBS = outputDir + "/" + outputPrefix + ".cr.igv.seg"
         File alleleFractionCBS = outputDir + "/" + outputPrefix + ".af.igv.seg"
@@ -1089,27 +1192,41 @@ task ModelSegments {
         File modeledSegments = outputDir + "/" + outputPrefix + ".modelFinal.seg"
         File copyRatioParameters = outputDir + "/" + outputPrefix + ".modelFinal.cr.param"
         File alleleFractionParameters = outputDir + "/" + outputPrefix + ".modelFinal.af.param"
+        File? normalHetrozygousAllelicCounts = outputDir + "/" + outputPrefix + ".hets.normal.tsv"
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         outputDir: {description: "The directory to write the ouput to.", category: "common"}
         outputPrefix: {description: "The prefix of the output files. Should not include directories.", category: "required"}
         denoisedCopyRatios: {description: "The denoised copy ratios as generated by DenoiseReadCounts.", category: "required"}
         allelicCounts: {description: "The allelicCounts as generate by CollectAllelicCounts.", category: "required" }
-        normalAllelicCounts: {description: "The allelicCounts as generate by CollectAllelicCounts for a matched normal.", category: "common"}
         minimumTotalAlleleCountCase: {description: "Equivalent to gatk ModelSeqments' `--minimum-total-allele-count-case` option.", category: "advanced"}
         maximumNumberOfSmoothingIterations: {description: "Equivalent to gatk ModelSeqments' `--maximum-number-of-smoothing-iterations` option.", category: "advanced"}
-
+        normalAllelicCounts: {description: "The allelicCounts as generated by CollectAllelicCounts for a matched normal.", category: "common"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        hetrozygousAllelicCounts: {description: "Allelic-counts file containing the counts at sites genotyped as heterozygous in the case sample."}
+        copyRatioSegments: {description: "The segments from the .modelFinal.seg file, converted to a format suitable for input to CallCopyRatioSegments."}
+        copyRatioCBS: {description: "The posterior medians of the log2 copy ratio."}
+        alleleFractionCBS: {description: "Minor-allele fraction."}
+        unsmoothedModeledSegments: {description: "The initial modeled-segments result before segmentation smoothing."}
+        unsmoothedCopyRatioParameters: {description: "The initial copy-ratio-model global-parameter result before segmentation smoothing."}
+        unsmoothedAlleleFractionParameters: {description: "The initial allele-fraction-model global-parameter result before segmentation smoothing."}
+        modeledSegments: {description: "The final modeled-segments result after segmentation smoothing."}
+        copyRatioParameters: {description: "The final copy-ratio-model global-parameter result after segmentation smoothing."}
+        alleleFractionParameters: {description: "The final allele-fraction-model global-parameter result after segmentation smoothing."}
+        normalHetrozygousAllelicCounts: {description: "Allelic-counts file containing the counts at sites genotyped as heterozygous in the matched-normal sample."}
     }
 }
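+# Note: `minimumTotalAlleleCountCase` above defaults to 0 with a matched normal
+# and 30 without one; when no normal counts are given, heterozygous sites must
+# be genotyped from the case sample itself, which requires more depth.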

@@ -1122,24 +1239,26 @@ task MuTect2 {
         File referenceFastaFai
         String outputVcf
         String tumorSample
+        String f1r2TarGz = "f1r2.tar.gz"
+        Array[File]+ intervals
+        String outputStats = outputVcf + ".stats"
+
         String? normalSample
         File? germlineResource
         File? germlineResourceIndex
         File? panelOfNormals
         File? panelOfNormalsIndex
-        String f1r2TarGz = "f1r2.tar.gz"
-        Array[File]+ intervals
-        String outputStats = outputVcf + ".stats"
-        String memory = "16G"
         String javaXmx = "4G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        String memory = "5GiB"
+        Int timeMinutes = 240
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{outputVcf})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         Mutect2 \
         -R ~{referenceFasta} \
         -I ~{sep=" -I " inputBams} \
@@ -1160,11 +1279,13 @@ task MuTect2 {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         inputBams: {description: "The BAM files on which to perform variant calling.", category: "required"}
         inputBamsIndex: {description: "The indexes for the input BAM files.", category: "required"}
         referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
@@ -1172,19 +1293,24 @@ task MuTect2 {
         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
         outputVcf: {description: "The location to write the output VCF file to.", category: "required"}
         tumorSample: {description: "The name of the tumor/case sample.", category: "required"}
+        f1r2TarGz: {description: "Equivalent to Mutect2's `--f1r2-tar-gz` option.", category: "advanced"}
+        intervals: {description: "Bed files describing the regions to operate on.", category: "required"}
+        outputStats: {description: "The location the output statistics should be written to.", category: "advanced"}
         normalSample: {description: "The name of the normal/control sample.", category: "common"}
         germlineResource: {description: "Equivalent to Mutect2's `--germline-resource` option.", category: "advanced"}
         germlineResourceIndex: {description: "The index for the germline resource.", category: "advanced"}
         panelOfNormals: {description: "Equivalent to Mutect2's `--panel-of-normals` option.", category: "advanced"}
         panelOfNormalsIndex: {description: "The index for the panel of normals.", category: "advanced"}
-        f1r2TarGz: {description: "Equivalent to Mutect2's `--f1r2-tar-gz` option.", category: "advanced"}
-        intervals: {description: "Bed files describing the regiosn to operate on.", category: "required"}
-        outputStats: {description: "The location the output statistics should be written to.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        vcfFile: {description: "Somatic SNVs and indels called via local assembly of haplotypes."}
+        vcfFileIndex: {description: "Index for Mutect2 VCF."}
+        f1r2File: {description: "Contains information that can then be passed to LearnReadOrientationModel, which generates an artifact prior table for each tumor sample for FilterMutectCalls to use."}
+        stats: {description: "Stats file."}
     }
 }
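+# Note: the `f1r2File` output above is meant for LearnReadOrientationModel,
+# whose artifact priors can then be passed to FilterMutectCalls as `artifactPriors`.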

@@ -1195,17 +1321,19 @@ task PlotDenoisedCopyRatios {
         String outputPrefix
         File standardizedCopyRatios
         File denoisedCopyRatios
+
         Int? minimumContigLength

-        String memory = "32G"
-        String javaXmx = "7G"
-        String dockerImage = "broadinstitute/gatk:4.1.4.0" # The biocontainer doesn't seem to contain R.
+        String javaXmx = "3G"
+        String memory = "4GiB"
+        Int timeMinutes = 2
+        String dockerImage = "broadinstitute/gatk:4.1.8.0"
     }

     command {
         set -e
         mkdir -p ~{outputDir}
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         PlotDenoisedCopyRatios \
         --standardized-copy-ratios ~{standardizedCopyRatios} \
         --denoised-copy-ratios ~{denoisedCopyRatios} \
@@ -1217,30 +1345,39 @@ task PlotDenoisedCopyRatios {

     output {
         File denoisedCopyRatiosPlot = outputDir + "/" + outputPrefix + ".denoised.png"
-        File denoisedCopyRatiosLimitedPlot = outputDir + "/" + outputPrefix + ".denoisedLimit4.png"
         File standardizedMedianAbsoluteDeviation = outputDir + "/" + outputPrefix + ".standardizedMAD.txt"
         File denoisedMedianAbsoluteDeviation = outputDir + "/" + outputPrefix + ".denoisedMAD.txt"
         File deltaMedianAbsoluteDeviation = outputDir + "/" + outputPrefix + ".deltaMAD.txt"
         File deltaScaledMedianAbsoluteDeviation = outputDir + "/" + outputPrefix + ".scaledDeltaMAD.txt"
+        File? denoisedCopyRatiosLimitedPlot = outputDir + "/" + outputPrefix + ".denoisedLimit4.png"
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file used for the analyses.", category: "required"}
         outputDir: {description: "The directory to write the ouput to.", category: "common"}
         outputPrefix: {description: "The prefix of the output files. Should not include directories.", category: "required"}
-        denoisedCopyRatios: {description: "The denoised copy ratios as generated by DenoiseReadCounts.", category: "required"}
         standardizedCopyRatios: {description: "The standardized copy ratios as generated by DenoiseReadCounts.", category: "required"}
+        denoisedCopyRatios: {description: "The denoised copy ratios as generated by DenoiseReadCounts.", category: "required"}
         minimumContigLength: {description: "The minimum length for a contig to be included in the plots.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        denoisedCopyRatiosPlot: {description: "Plot showing the entire range of standardized and denoised copy ratios."}
+        standardizedMedianAbsoluteDeviation: {description: "Standardized median absolute deviation copy ratios."}
+        denoisedMedianAbsoluteDeviation: {description: "Denoised median absolute deviation copy ratios."}
+        deltaMedianAbsoluteDeviation: {description: "The change between `standardizedMedianAbsoluteDeviation` and `denoisedMedianAbsoluteDeviation`."}
+        deltaScaledMedianAbsoluteDeviation: {description: "The change between `standardizedMedianAbsoluteDeviation` and `denoisedMedianAbsoluteDeviation` scaled by standardized MAD."}
+        denoisedCopyRatiosLimitedPlot: {description: "Plot showing the standardized and denoised copy ratios limited to ratios within [0, 4]."}
     }
 }

@@ -1252,17 +1389,19 @@ task PlotModeledSegments {
         File denoisedCopyRatios
         File segments
         File allelicCounts
+
         Int? minimumContigLength

-        String memory = "21G"
-        String javaXmx = "7G"
-        String dockerImage = "broadinstitute/gatk:4.1.4.0" # The biocontainer doesn't seem to contain R.
+        String javaXmx = "3G"
+        String memory = "4GiB"
+        Int timeMinutes = 2
+        String dockerImage = "broadinstitute/gatk:4.1.8.0"
     }

     command {
         set -e
         mkdir -p ~{outputDir}
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
         PlotModeledSegments \
         --denoised-copy-ratios ~{denoisedCopyRatios} \
         --allelic-counts ~{allelicCounts} \
@@ -1278,11 +1417,13 @@ task PlotModeledSegments {
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
+        # inputs
         referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file used for the analyses.", category: "required"}
         outputDir: {description: "The directory to write the ouput to.", category: "common"}
         outputPrefix: {description: "The prefix of the output files. Should not include directories.", category: "required"}
@@ -1290,11 +1431,13 @@ task PlotModeledSegments {
         segments: {description: "The modeled segments as generated by ModelSegments.", category: "required"}
         allelicCounts: {description: "The hetrozygous allelic counts as generated by ModelSegments.", category: "required"}
         minimumContigLength: {description: "The minimum length for a contig to be included in the plots.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        modeledSegmentsPlot: {description: "This plot shows the input denoised copy ratios and/or alternate-allele fractions as points, as well as box plots for the available posteriors in each segment."}
     }
 }
intervals
+
+        String javaXmx = "3G"
+        String memory = "4GiB"
+        Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 6)
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
    }

    command {
        set -e
        mkdir -p "$(dirname ~{outputIntervalList})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
        PreprocessIntervals \
        -R ~{referenceFasta} \
        --sequence-dictionary ~{referenceFastaDict} \
@@ -1333,50 +1478,61 @@ task PreprocessIntervals {
    }

    runtime {
-        docker: dockerImage
        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
    }

    parameter_meta {
-        referenceFasta: {description: "The reference fasta file..", category: "required"}
+        # inputs
+        referenceFasta: {description: "The reference fasta file.", category: "required"}
        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
-        intervals: {description: "Bed files describing the regiosn to operate on.", category: "common"}
        outputIntervalList: {description: "The location the output should be written to.", category: "advanced"}
        binLength: {description: "The size of the bins to be created. Should be 0 for targeted/exome sequencing.", category: "advanced"}
        padding: {description: "The padding to be added to the bins. Should be 0 if contiguos binning is used, eg with WGS.", category: "advanced"}
        intervalMergingRule: {description: "Equivalent to gatk PreprocessIntervals' `--interval-merging-rule` option.", category: "advanced"}
+        intervals: {description: "Bed files describing the regions to operate on.", category: "common"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
        memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        intervalList: {description: "Preprocessed Picard interval-list file."}
    }
}

task SelectVariants {
    input {
+        File inputVcf
+        File inputVcfIndex
        File referenceFasta
        File referenceFastaDict
        File referenceFastaFai
-        File inputVcf
-        File inputVcfIndex
        String outputPath = "output.vcf.gz"
-        String? selectTypeToInclude
        Array[File] intervals = []
-        String memory = "16G"
+
+        Boolean excludeFiltered = false
+        String? selectTypeToInclude
+        String?
selectGenotype
+
        String javaXmx = "4G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        String memory = "5GiB"
+        Int timeMinutes = 60
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
    }

    command {
        set -e
        mkdir -p "$(dirname ~{outputPath})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
        SelectVariants \
        -R ~{referenceFasta} \
        -V ~{inputVcf} \
        ~{"--select-type-to-include " + selectTypeToInclude} \
+        ~{"-select-genotype \"" + selectGenotype}~{true="\"" false="" defined(selectGenotype)} \
+        ~{true="--exclude-filtered" false="" excludeFiltered} \
        ~{true="-L" false="" length(intervals) > 0} ~{sep=' -L ' intervals} \
        -O ~{outputPath}
    }
@@ -1387,27 +1543,31 @@ task SelectVariants {
    }

    runtime {
-        docker: dockerImage
        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
    }

    parameter_meta {
+        # inputs
        inputVcf: {description: "The VCF input file.", category: "required"}
        inputVcfIndex: {description: "The input VCF file's index.", category: "required"}
-        referenceFasta: {description: "The reference fasta file which was also used for mapping.",
-            category: "required"}
-        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
-            category: "required"}
+        referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
+        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
-        selectTypeToInclude: {description: "Select only a certain type of variants from the input file", category: "common"}
        outputPath: {description: "The location the output VCF file should be written.", category: "advanced"}
        intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"}
-
+        selectTypeToInclude: {description: "Select only a certain type of variants from the input file.", category: "common"}
+        excludeFiltered: {description: "Remove all variants that do not have a PASS filter.", category: "advanced"}
+        selectGenotype: {description: "The genotype to be selected.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
        memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-            category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputVcf: {description: "A new VCF file containing the selected subset of variants."}
+        outputVcfIndex: {description: "Index of the new output VCF file."}
    }
}

@@ -1421,15 +1581,16 @@ task SplitNCigarReads {
        String outputBam
        Array[File] intervals = []

-        String memory = "16G"
        String javaXmx = "4G"
-        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+        String memory = "5GiB"
+        Int timeMinutes = 120 # This will likely be used with intervals, so size-based estimation can't be used.
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0"
    }

    command {
        set -e
        mkdir -p "$(dirname ~{outputBam})"
-        gatk --java-options -Xmx~{javaXmx} \
+        gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
        SplitNCigarReads \
        -I ~{inputBam} \
        -R ~{referenceFasta} \
@@ -1443,26 +1604,115 @@ task SplitNCigarReads {
    }

    runtime {
-        docker: dockerImage
        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
    }

    parameter_meta {
+        # inputs
        inputBam: {description: "The BAM file for which spliced reads should be split.", category: "required"}
        inputBamIndex: {description: "The input BAM file's index.", category: "required"}
-        referenceFasta: {description: "The reference fasta file which was also used for mapping.",
-            category: "required"}
-        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
-            category: "required"}
+        referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
+        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
        outputBam: {description: "The location the output BAM file should be written.", category: "required"}
        intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}

+        # outputs
+        bam: {description: "BAM file with reads split at N CIGAR elements and CIGAR strings updated."}
+        bamIndex: {description: "Index of output BAM file."}
+    }
+}
+
+task VariantEval {
+    input {
+        Array[File] evalVcfs
+        Array[File] evalVcfsIndex
+        Array[File] comparisonVcfs = []
+        Array[File] comparisonVcfsIndex = []
+        Array[File] intervals = []
+        String outputPath = "eval.table"
+        Boolean doNotUseAllStandardModules = false
+        Boolean doNotUseAllStandardStratifications = false
+        Array[String] evalModules = []
+        Array[String] stratificationModules = []
+        Array[String] samples = []
+        Boolean mergeEvals = false
+
+        File? referenceFasta
+        File? referenceFastaDict
+        File? referenceFastaFai
+        File? dbsnpVCF
+        File? dbsnpVCFIndex
+
+        String javaXmx = "4G"
+        String memory = "5GiB"
+        # TODO: Refine estimate. For now 20 minutes per GB of input.
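+        # For example: 2 GB of eval VCFs, comparison VCFs, reference and dbSNP
+        # combined gives ceil(2 * 20) = 40 minutes.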
+ Int timeMinutes = ceil(size(flatten([evalVcfs, comparisonVcfs, select_all([referenceFasta, dbsnpVCF])]), "G") * 20) + String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \ + VariantEval \ + --output ~{outputPath} \ + ~{true="--eval" false="" length(evalVcfs) > 0} ~{sep=" --eval " evalVcfs} \ + ~{true="--comparison" false="" length(comparisonVcfs) > 0} ~{sep=" --comparison " comparisonVcfs} \ + ~{"-R " + referenceFasta} \ + ~{"--dbsnp " + dbsnpVCF } \ + ~{true="-L" false="" length(intervals) > 0} ~{sep=' -L ' intervals} \ + ~{true="--sample" false="" length(samples) > 0} ~{sep=' --sample ' samples} \ + ~{true="--do-not-use-all-standard-modules" false="" doNotUseAllStandardModules} \ + ~{true="--do-not-use-all-standard-stratifications" false="" doNotUseAllStandardStratifications} \ + ~{true="-EV" false="" length(evalModules) > 0} ~{sep=" -EV " evalModules} \ + ~{true="-ST" false="" length(stratificationModules) > 0} ~{sep=" -ST " stratificationModules} \ + ~{true="--merge-evals" false="" mergeEvals} + } + + output { + File table = outputPath + } + + runtime { + cpu: 1 + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + evalVcfs: {description: "Variant sets to evaluate.", category: "required"} + evalVcfsIndex: {description: "Indexes for the variant sets.", category: "required"} + comparisonVcfs: {description: "Compare set vcfs.", category: "advanced"} + comparisonVcfsIndex: {description: "Indexes for the compare sets.", category: "advanced"} + intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "advanced"} + outputPath: {description: "The location the output table should be written.", category: "advanced"} + doNotUseAllStandardModules: {description: "Do not use the standard modules by default (instead, only those that are specified with the evalModules option).", category: "common"} + doNotUseAllStandardStratifications: {description: "Do not use the standard stratification modules by default (instead, only those that are specified with the stratificationModules option).", category: "common"} + evalModules: {description: "One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless doNotUseAllStandardModules=true).", category: "common"} + stratificationModules: {description: "One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless doNotUseAllStandardStratifications=true).", category: "common"} + samples: {description: "Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context." , category: "advanced"} + mergeEvals: {description: "If provided, all evalVcf tracks will be merged into a single eval track.", category: "common"} + referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "common"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "common"} + referenceFastaFai: {description: "The index for the reference fasta file.", category: "common"} + dbsnpVCF: {description: "A dbSNP VCF.", category: "common"} + dbsnpVCFIndex: {description: "The index for the dbSNP VCF.", category: "common"} + javaXmx: {description: "The maximum memory available to the program. 
Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + table: {description: "Evaluation tables detailing the results of the eval modules which were applied."} } } @@ -1477,15 +1727,16 @@ task VariantFiltration { Array[String]+ filterArguments Array[File] intervals = [] - String memory = "16G" String javaXmx = "4G" - String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + String memory = "5GiB" + Int timeMinutes = 120 + String dockerImage = "quay.io/biocontainers/gatk4:4.1.8.0--py38h37ae868_0" } command { set -e mkdir -p "$(dirname ~{outputPath})" - gatk --java-options -Xmx~{javaXmx} \ + gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \ VariantFiltration \ -I ~{inputVcf} \ -R ~{referenceFasta} \ @@ -1500,27 +1751,28 @@ task VariantFiltration { } runtime { - docker: dockerImage memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { + # inputs inputVcf: {description: "The VCF to be filtered.", category: "required"} inputVcfIndex: {description: "The input VCF file's index.", category: "required"} - referenceFasta: {description: "The reference fasta file which was also used for mapping.", - category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", - category: "required"} + referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} outputPath: {description: "The location the output VCF file should be written.", category: "common"} + filterArguments: {description: "Arguments that should be used for the filter. For example: ['--filter-name', 'my_filter', '--filter-expression', 'AB<0.2'].", category: "required"} intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "advanced"} - filterArguments: {description: "Arguments that should be used for the filter. For example: ['--filter-name', 'my_filter', '--filter-expression', 'AB<0.2']", - category: "required"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} - dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + filteredVcf: {description: "A filtered VCF in which passing variants are annotated as PASS and failing variants are annotated with the name(s) of the filter(s) they failed."} + filteredVcfIndex: {description: "Index of filtered VCF."} } } - diff --git a/gffcompare.wdl b/gffcompare.wdl index ca2b1669..fe1db0a8 100644 --- a/gffcompare.wdl +++ b/gffcompare.wdl @@ -22,16 +22,10 @@ version 1.0 task GffCompare { input { - File? inputGtfList Array[File] inputGtfFiles - File referenceAnnotation - String? outputDir - String outPrefix = "gffcmp" # gffcmp is the default used by the program as well. This - # needs to be defined in order for the output values to be consistent and correct. - File? genomeSequences - Int? maxDistanceFreeEndsTerminalExons - Int? maxDistanceGroupingTranscriptStartSites - String? namePrefix + # gffcmp is the default used by the program as well. This needs to be + # defined in order for the output values to be consistent and correct. + String outPrefix = "gffcmp" Boolean C = false Boolean A = false Boolean X = false @@ -44,14 +38,24 @@ task GffCompare { Boolean verbose = false Boolean debugMode = false + File? inputGtfList + File? referenceAnnotation + String? outputDir + File? genomeSequences + Int? maxDistanceFreeEndsTerminalExons + Int? maxDistanceGroupingTranscriptStartSites + String? namePrefix + + String memory = "4GiB" + Int timeMinutes = 1 + ceil(size(inputGtfFiles, "GiB") * 30) String dockerImage = "quay.io/biocontainers/gffcompare:0.10.6--h2d50403_0" # This workaround only works in the input section. - # Issue addressed at https://github.com/openwdl/wdl/pull/263 + # Issue addressed at https://github.com/openwdl/wdl/pull/263. File? noneFile # This is a wdl workaround. Please do not assign! } - # This allows for the creation of output directories + # This allows for the creation of output directories. String dirPrefix = if defined(outputDir) then select_first([outputDir]) + "/" else "" @@ -61,7 +65,7 @@ task GffCompare { set -e ~{"mkdir -p " + outputDir} gffcompare \ - -r ~{referenceAnnotation} \ + ~{"-r " + referenceAnnotation} \ ~{"-o '" + totalPrefix + "'"} \ ~{"-s " + genomeSequences} \ ~{"-e " + maxDistanceFreeEndsTerminalExons} \ @@ -88,19 +92,20 @@ task GffCompare { else 0 Int noInputFiles = length(inputGtfFiles) Boolean oneFile = (noFilesGtfList + noInputFiles) == 1 - String annotatedName = if oneFile + String annotatedName = if oneFile && defined(referenceAnnotation) then "annotated" else "combined" - # Check if a redundant .gtf will be created + # Check if a redundant .gtf will be created. Boolean createRedundant = C || A || X output { + # noneFile is not stable. Please replace this as soon as wdl spec allows. File annotated = totalPrefix + "." + annotatedName + ".gtf" File loci = totalPrefix + ".loci" File stats = totalPrefix + ".stats" File tracking = totalPrefix + ".tracking" - # noneFile is not stable. Please replace this as soon as wdl spec allows + Array[File] allFiles = select_all([annotated, loci, stats, tracking, redundant, missedIntrons]) File? 
redundant = if createRedundant then totalPrefix + ".redundant.gtf" else noneFile @@ -110,19 +115,16 @@ task GffCompare { } runtime { - docker: dockerImage + memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { - inputGtfList: {description: "Equivalent to gffcompare's `-i` option.", category: "advanced"} + # inputs inputGtfFiles: {description: "The input GTF files.", category: "required"} referenceAnnotation: {description: "The GTF file to compare with.", category: "required"} - outputDir: {description: "The location the output should be written.", category: "common"} outPrefix: {description: "The prefix for the output.", category: "advanced"} - genomeSequences: {description: "Equivalent to gffcompare's `-s` option.", category: "advanced"} - maxDistanceFreeEndsTerminalExons: {description: "Equivalent to gffcompare's `-e` option.", category: "advanced"} - maxDistanceGroupingTranscriptStartSites: {description: "Equivalent to gffcompare's `-d` option.", category: "advanced"} - namePrefix: {description: "Equivalent to gffcompare's `-p` option.", category: "advanced"} C: {description: "Equivalent to gffcompare's `-C` flag.", category: "advanced"} A: {description: "Equivalent to gffcompare's `-A` flag.", category: "advanced"} X: {description: "Equivalent to gffcompare's `-X` flag.", category: "advanced"} @@ -134,8 +136,24 @@ task GffCompare { noTmap: {description: "Equivalent to gffcompare's `-T` flag.", category: "advanced"} verbose: {description: "Equivalent to gffcompare's `-V` flag.", category: "advanced"} debugMode: {description: "Equivalent to gffcompare's `-D` flag.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + inputGtfList: {description: "Equivalent to gffcompare's `-i` option.", category: "advanced"} + outputDir: {description: "The location the output should be written.", category: "common"} + genomeSequences: {description: "Equivalent to gffcompare's `-s` option.", category: "advanced"} + maxDistanceFreeEndsTerminalExons: {description: "Equivalent to gffcompare's `-e` option.", category: "advanced"} + maxDistanceGroupingTranscriptStartSites: {description: "Equivalent to gffcompare's `-d` option.", category: "advanced"} + namePrefix: {description: "Equivalent to gffcompare's `-p` option.", category: "advanced"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        annotated: {description: "Annotated GTF file."}
+        loci: {description: "File describing the processed loci."}
+        stats: {description: "Various statistics related to the “accuracy” (or a measure of agreement) of the input transcripts when compared to reference annotation data."}
+        tracking: {description: "File matching up transcripts between samples."}
+        allFiles: {description: "A collection of all output files."}
+        redundant: {description: "File containing duplicate/redundant transcripts."}
+        missedIntrons: {description: "File denoting missed introns."}
    }

    meta {
@@ -143,4 +161,4 @@
        exclude: ["noneFile"]
    }
}
-}
\ No newline at end of file
+}
diff --git a/gffread.wdl b/gffread.wdl
index 6b23785c..26a2773c 100644
--- a/gffread.wdl
+++ b/gffread.wdl
@@ -24,18 +24,22 @@ task GffRead {
    input {
        File inputGff
        File genomicSequence
+        Boolean outputGtfFormat = false
+
        File? genomicIndex # Optional. GFFRead can create this by itself.
        String? exonsFastaPath
        String? CDSFastaPath
        String? proteinFastaPath
        String? filteredGffPath
-        Boolean outputGtfFormat = false
+
+        String memory = "4GiB"
+        Int timeMinutes = 1 + ceil(size(inputGff, "GiB") * 10)
        String dockerImage = "quay.io/biocontainers/gffread:0.9.12--0"
    }

    # The mkdirs below are hackish. It should be
-    # ~{"mkir -p $(dirname " + somePath + ")"}
-    # but this goes wrong. Cromwell will always use ')' even if somepath is not defined.
+    # ~{"mkdir -p $(dirname " + somePath + ")"} but this goes wrong.
+    # Cromwell will always use ')' even if somePath is not defined.
    # Which leads to crashing.
    command {
        set -e
@@ -61,19 +65,29 @@
    }

    runtime {
+        memory: memory
+        time_minutes: timeMinutes
        docker: dockerImage
    }

    parameter_meta {
+        # inputs
        inputGff: {description: "The input GFF file.", category: "required"}
        genomicSequence: {description: "The genome.", category: "required"}
+        outputGtfFormat: {description: "Equivalent to gffread's `-T` flag.", category: "advanced"}
        genomicIndex: {description: "The genome's index.", category: "advanced"}
        exonsFastaPath: {description: "The location the exons fasta should be written to.", category: "advanced"}
        CDSFastaPath: {description: "The location the CDS fasta should be written to.", category: "advanced"}
        proteinFastaPath: {description: "The location the protein fasta should be written to.", category: "advanced"}
        filteredGffPath: {description: "The location the filtered GFF should be written to.", category: "advanced"}
-        outputGtfFormat: {description: "Equivalent to gffread's `-T` flag.", category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"}
+        memory: {description: "The amount of memory available to the job.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + exonsFasta: {description: "Fasta file containing exons."} + CDSFasta: {description: "Fasta file containing CDS's."} + proteinFasta: {description: "Fasta file containing proteins."} + filteredGff: {description: "Filtered GFF file."} } -} \ No newline at end of file +} diff --git a/gridss.wdl b/gridss.wdl new file mode 100644 index 00000000..5aca3825 --- /dev/null +++ b/gridss.wdl @@ -0,0 +1,496 @@ +version 1.0 + +# Copyright (c) 2020 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import "bwa.wdl" as bwa + +task AnnotateInsertedSequence { + input { + File inputVcf + String outputPath = "gridss.annotated.vcf.gz" + File viralReference + File viralReferenceFai + File viralReferenceDict + File viralReferenceImg + + Int threads = 8 + String javaXmx = "8G" + String memory = "9GiB" + String dockerImage = "quay.io/biowdl/gridss:2.12.2" + Int timeMinutes = 120 + } + + command { + set -e + _JAVA_OPTIONS="$_JAVA_OPTIONS -Xmx~{javaXmx}" + AnnotateInsertedSequence \ + REFERENCE_SEQUENCE=~{viralReference} \ + INPUT=~{inputVcf} \ + OUTPUT=~{outputPath} \ + ALIGNMENT=APPEND \ + WORKING_DIR='.' \ + WORKER_THREADS=~{threads} + } + + output { + File outputVcf = outputPath + File outputVcfIndex = outputPath + ".tbi" + } + + runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes # !UnknownRuntimeKey + docker: dockerImage + } + + parameter_meta { + inputVcf: {description: "The input VCF file.", category: "required"} + outputPath: {description: "The path the output will be written to.", category: "common"} + viralReference: {description: "A fasta file with viral sequences.", category: "required"} + viralReferenceFai: {description: "The index for the viral reference fasta.", category: "required"} + viralReferenceDict: {description: "The dict file for the viral reference.", category: "required"} + viralReferenceImg: {description: "The BWA index image (generated with GATK BwaMemIndexImageCreator) of the viral reference.", category: "required"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. 
Should be lower than `memory` to accommodate JVM overhead.",
+            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+            category: "advanced"}
+    }
+}
+
+task AnnotateSvTypes {
+    input {
+        File gridssVcf
+        File gridssVcfIndex
+        String outputPath = "./gridss.svtyped.vcf.bgz"
+
+        String memory = "32GiB"
+        String dockerImage = "quay.io/biocontainers/bioconductor-structuralvariantannotation:1.10.0--r41hdfd78af_0"
+        Int timeMinutes = 240
+    }
+
+    String effectiveOutputPath = sub(outputPath, "\\.bgz", "")
+    String index = if effectiveOutputPath != outputPath then "T" else "F"
+
+    # Based on https://github.com/PapenfussLab/gridss/issues/74
+    command <<<
+        set -e
+        mkdir -p "$(dirname ~{outputPath})"
+        R --vanilla << "EOF"
+        library(VariantAnnotation)
+        library(StructuralVariantAnnotation)
+
+        vcf_path <- "~{gridssVcf}"
+        out_path <- "~{effectiveOutputPath}"
+
+        # Simple SV type classifier
+        simpleEventType <- function(gr) {
+            return(ifelse(seqnames(gr) != seqnames(partner(gr)), "BND", # inter-chromosomal
+                   ifelse(gr$insLen >= abs(gr$svLen) * 0.7, "INS",
+                   ifelse(strand(gr) == strand(partner(gr)), "INV",
+                   ifelse(xor(start(gr) < start(partner(gr)), strand(gr) == "-"), "DEL",
+                   "DUP")))))
+        }
+
+        header <- scanVcfHeader(vcf_path)
+        vcf <- readVcf(vcf_path, seqinfo(header))
+        gr <- breakpointRanges(vcf)
+        svtype <- simpleEventType(gr)
+        info(vcf[gr$sourceId])$SVTYPE <- svtype
+        # GRIDSS doesn't supply a GT, so simply set it to 0/1.
+        geno(vcf)$GT <- as.matrix(sapply(row.names(vcf), function(x) {"0/1"}))
+        # Select only one breakend per event (also removes single breakends):
+        # sourceId ends with o or h for paired breakends; the first in the pair
+        # ends with o, the second with h. Single breakends end with b; these
+        # will also be removed since we can't determine the SVTYPE.
+        gr2 <- gr[grepl(".*o$", gr$sourceId)]
+        writeVcf(vcf[gr2$sourceId], out_path, index=~{index})
+        EOF
+    >>>
+
+    output {
+        File vcf = outputPath
+        File? vcfIndex = outputPath + ".tbi"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        gridssVcf: {description: "The VCF produced by GRIDSS.", category: "required"}
+        gridssVcfIndex: {description: "The index for the VCF produced by GRIDSS.", category: "required"}
+        outputPath: {description: "The path the output should be written to.", category: "common"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+            category: "advanced"}
+    }
+}
+
+task FilterPon {
+    input {
+        File ponBed
+        File ponBedpe
+        Int minimumScore = 3
+        String outputDir = "."
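+        # For example: with the default minimumScore = 3, the awk commands in
+        # the command section below keep only entries supported by at least
+        # three normal samples (column 5 of the BED, column 8 of the BEDPE).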
+
+        String memory = "1GiB"
+        String dockerImage = "quay.io/biowdl/gridss:2.12.2"
+        Int timeMinutes = 20
+    }
+
+    command <<<
+        set -e
+        mkdir -p ~{outputDir}
+
+        cat ~{ponBed} | awk '{if ($5 >= ~{minimumScore}) print $0}' > ~{outputDir}/gridss_pon_single_breakend.bed
+        cat ~{ponBedpe} | awk '{if ($8 >= ~{minimumScore}) print $0}' > ~{outputDir}/gridss_pon_breakpoint.bedpe
+    >>>
+
+    output {
+        File bedpe = "~{outputDir}/gridss_pon_breakpoint.bedpe"
+        File bed = "~{outputDir}/gridss_pon_single_breakend.bed"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        ponBed: {description: "The PON BED file.", category: "required"}
+        ponBedpe: {description: "The PON BEDPE file.", category: "required"}
+        minimumScore: {description: "The minimum number of normal samples an SV must have been found in to be kept.", category: "advanced"}
+        outputDir: {description: "The directory the output will be written to.", category: "common"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+            category: "advanced"}
+    }
+}
+
+task GeneratePonBedpe {
+    input {
+        Array[File]+ vcfFiles
+        Array[File]+ vcfIndexes
+        File referenceFasta
+        File referenceFastaFai
+        String outputDir = "."
+
+        Int threads = 8
+        String javaXmx = "8G"
+        String memory = "9GiB"
+        String dockerImage = "quay.io/biowdl/gridss:2.12.2"
+        Int timeMinutes = 120
+    }
+
+    command {
+        set -e
+        mkdir -p ~{outputDir}
+        java -Xmx~{javaXmx} \
+            -cp /usr/local/share/gridss-2.12.2-0/gridss.jar \
+            gridss.GeneratePonBedpe \
+            INPUT=~{sep=" INPUT=" vcfFiles} \
+            NO=0 \
+            O=~{outputDir}/gridss_pon_breakpoint.bedpe \
+            SBO=~{outputDir}/gridss_pon_single_breakend.bed \
+            REFERENCE_SEQUENCE=~{referenceFasta} \
+            THREADS=~{threads}
+    }
+
+    output {
+        File bedpe = "~{outputDir}/gridss_pon_breakpoint.bedpe"
+        File bed = "~{outputDir}/gridss_pon_single_breakend.bed"
+    }
+
+    runtime {
+        cpu: threads
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        vcfFiles: {description: "The vcf files with the normals as the first sample.", category: "required"}
+        vcfIndexes: {description: "The indexes for the VCF files.", category: "required"}
+        referenceFasta: {description: "The fasta of the reference genome.", category: "required"}
+        referenceFastaFai: {description: "The index for the reference genome fasta.", category: "required"}
+        outputDir: {description: "The directory the output will be written to.", category: "common"}
+        threads: {description: "The number of the threads to use.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+            category: "advanced"}
+    }
+}
+
+task GRIDSS {
+    input {
+        Array[File]+ tumorBam
+        Array[File]+ tumorBai
+        Array[String]+ tumorLabel
+        BwaIndex reference
+        String outputPrefix = "gridss"
+
+        File? normalBam
+        File? normalBai
+        String?
normalLabel
+        File? blacklistBed
+        File? gridssProperties
+
+        Int jvmHeapSizeGb = 64
+        Int nonJvmMemoryGb = 10
+        Int threads = 12
+        Int timeMinutes = ceil(7200 / threads) + 1800
+        String dockerImage = "quay.io/biowdl/gridss:2.12.2"
+    }
+
+    command {
+        set -e
+        mkdir -p "$(dirname ~{outputPrefix})"
+        gridss \
+            -w . \
+            --reference ~{reference.fastaFile} \
+            --output ~{outputPrefix}.vcf.gz \
+            --assembly ~{outputPrefix}_assembly.bam \
+            ~{"-c " + gridssProperties} \
+            ~{"-t " + threads} \
+            ~{"--jvmheap " + jvmHeapSizeGb + "G"} \
+            --labels ~{normalLabel}~{true="," false="" defined(normalLabel)}~{sep="," tumorLabel} \
+            ~{"--blacklist " + blacklistBed} \
+            ~{normalBam} \
+            ~{sep=" " tumorBam}
+        samtools index ~{outputPrefix}_assembly.bam ~{outputPrefix}_assembly.bai
+
+        # For some reason the VCF index is sometimes missing.
+        if [ ! -e ~{outputPrefix}.vcf.gz.tbi ]
+        then
+            tabix ~{outputPrefix}.vcf.gz
+        fi
+    }
+
+    output {
+        File vcf = outputPrefix + ".vcf.gz"
+        File vcfIndex = outputPrefix + ".vcf.gz.tbi"
+        File assembly = outputPrefix + "_assembly.bam"
+        File assemblyIndex = outputPrefix + "_assembly.bai"
+    }
+
+    runtime {
+        cpu: threads
+        memory: "~{jvmHeapSizeGb + nonJvmMemoryGb}GiB"
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # inputs
+        tumorBam: {description: "The input BAM file. This should be the tumor/case sample in case of a paired analysis.", category: "required"}
+        tumorBai: {description: "The index for tumorBam.", category: "required"}
+        tumorLabel: {description: "The name of the (tumor) sample.", category: "required"}
+        reference: {description: "A BWA index, this should also include the fasta index file (.fai).", category: "required"}
+        outputPrefix: {description: "The prefix for the output files. This may include parent directories.", category: "common"}
+        normalBam: {description: "The BAM file for the normal/control sample.", category: "advanced"}
+        normalBai: {description: "The index for normalBam.", category: "advanced"}
+        normalLabel: {description: "The name of the normal sample.", category: "advanced"}
+        blacklistBed: {description: "A bed file with blacklisted regions.", category: "advanced"}
+        gridssProperties: {description: "A properties file for gridss.", category: "advanced"}
+
+        threads: {description: "The number of the threads to use.", category: "advanced"}
+        jvmHeapSizeGb: {description: "The size of the JVM heap for assembly and variant calling.", category: "advanced"}
+        nonJvmMemoryGb: {description: "The amount of memory in Gb to be requested besides JVM memory.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + vcf: {description: "VCF file including variant allele fractions."} + vcfIndex: {description: "Index of output VCF."} + assembly: {description: "The GRIDSS assembly BAM."} + assemblyIndex: {description: "Index of output BAM file."} + } +} + +task GridssAnnotateVcfRepeatmasker { + input { + File gridssVcf + File gridssVcfIndex + String outputPath = "./gridss.repeatmasker_annotated.vcf.gz" + + String memory = "25GiB" + Int threads = 8 + String dockerImage = "quay.io/biowdl/gridss:2.12.2" + Int timeMinutes = 1440 + } + + command { + gridss_annotate_vcf_repeatmasker \ + --output ~{outputPath} \ + --jar /usr/local/share/gridss-2.12.2-0/gridss.jar \ + -w . \ + -t ~{threads} \ + ~{gridssVcf} + } + + output { + File annotatedVcf = outputPath + File annotatedVcfIndex = "~{outputPath}.tbi" + } + + runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes # !UnknownRuntimeKey + docker: dockerImage + } + + parameter_meta { + gridssVcf: {description: "The GRIDSS output.", category: "required"} + gridssVcfIndex: {description: "The index for the GRIDSS output.", category: "required"} + outputPath: {description: "The path the output should be written to.", category: "common"} + threads: {description: "The number of the threads to use.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task SomaticFilter { + input { + File vcfFile + File vcfIndex + File ponBed + File ponBedpe + String outputPath = "./high_confidence_somatic.vcf" + String fullOutputPath = "./high_and_low_confidence_somatic.vcf" + + String memory = "16GiB" + String dockerImage = "quay.io/biowdl/gridss:2.12.2" + Int timeMinutes = 60 + } + + String ponDir = sub(ponBed, basename(ponBed), "") + + command { + set -e + mkdir -p $(dirname ~{outputPath}) + mkdir -p $(dirname ~{fullOutputPath}) + + gridss_somatic_filter \ + --pondir ~{ponDir} \ + --input ~{vcfFile} \ + --output ~{outputPath} \ + --fulloutput ~{fullOutputPath} + } + + output { + File fullVcf = "~{fullOutputPath}.bgz" + File fullVcfIndex = "~{fullOutputPath}.bgz.tbi" + File highConfidenceVcf = "~{outputPath}.bgz" + File highConfidenceVcfIndex = "~{outputPath}.bgz.tbi" + } + + runtime { + memory: memory + time_minutes: timeMinutes # !UnknownRuntimeKey + docker: dockerImage + } + + parameter_meta { + vcfFile: {description: "The GRIDSS VCF file.", category: "required"} + vcfIndex: {description: "The index for the GRIDSS VCF file.", category: "required"} + ponBed: {description: "The PON BED file.", category: "required"} + ponBedpe: {description: "The PON BEDPE file.", category: "required"} + outputPath: {description: "The path the high confidence output should be written to.", category: "common"} + fullOutputPath: {description: "The path the full output should be written to.", category: "common"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task Virusbreakend { + input { + File bam + File bamIndex + File referenceFasta + File referenceFastaFai + File referenceFastaDict + File referenceImg + File virusbreakendDB + String outputPath = "./virusbreakend.vcf" + + String memory = "75GiB" + Int threads = 12 + String dockerImage = "quay.io/biowdl/gridss:2.12.2" + Int timeMinutes = 320 + } + + command { + set -e + mkdir virusbreakenddb + tar -xzvf ~{virusbreakendDB} -C virusbreakenddb --strip-components 1 + virusbreakend \ + --output ~{outputPath} \ + --workingdir . \ + --reference ~{referenceFasta} \ + --db virusbreakenddb \ + --jar /usr/local/share/gridss-2.12.2-0/gridss.jar \ + -t ~{threads} \ + ~{bam} + } + + output { + File vcf = outputPath + File summary = "~{outputPath}.summary.tsv" + } + + runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes # !UnknownRuntimeKey + docker: dockerImage + } + + parameter_meta { + bam: {description: "A BAM file.", category: "required"} + bamIndex: {description: "The index for the BAM file.", category: "required"} + referenceFasta: {description: "The fasta of the reference genome.", category: "required"} + referenceImg: {description: "The BWA index image (generated with GATK BwaMemIndexImageCreator) of the reference.", category: "required"} + virusbreakendDB: {description: "A .tar.gz containing the virusbreakend database.", category: "required"} + outputPath: {description: "The path the output should be written to.", category: "common"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + threads: {description: "The number of the threads to use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} diff --git a/hisat2.wdl b/hisat2.wdl index bc6be2e8..50fabc9d 100644 --- a/hisat2.wdl +++ b/hisat2.wdl @@ -22,25 +22,35 @@ version 1.0 task Hisat2 { input { - Array[File]+ indexFiles File inputR1 File? inputR2 + Array[File]+ indexFiles String outputBam String sample String library String readgroup String platform = "illumina" Boolean downstreamTranscriptomeAssembly = true + String summaryFilePath = basename(outputBam, ".bam") + ".summary.txt" + Int sortMemoryPerThreadGb = 2 + Int compressionLevel = 1 + + Int? sortThreads - Int threads = 1 - String memory = "48G" + Int threads = 4 + Int? memoryGb + Int timeMinutes = 1 + ceil(size([inputR1, inputR2], "G") * 180 / threads) # quay.io/biocontainers/mulled-v2-a97e90b3b802d1da3d6958e0867610c718cb5eb1 - # is a combination of hisat2 and samtools - # hisat2=2.1.0, samtools=1.8 - String dockerImage = "quay.io/biocontainers/mulled-v2-a97e90b3b802d1da3d6958e0867610c718cb5eb1:2388ff67fc407dad75774291ca5038f40cac4be0-0" + # is a combination of hisat2 and samtools hisat2=2.2.0 & samtools=1.10. + String dockerImage = "quay.io/biocontainers/mulled-v2-a97e90b3b802d1da3d6958e0867610c718cb5eb1:2880dd9d8ad0a7b221d4eacda9a818e92983128d-0" } - String bamIndexPath = sub(outputBam, "\.bam$", ".bai") + # Samtools sort may block the pipe while it is writing data to disk. + # This can lead to cpu underutilization. + # 1 thread if threads is 1. For 2-4 threads 2 sort threads. 3 sort threads for 5-8 threads. 
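+    # Worked examples of the estimate below: threads=1 gives 1 sort thread;
+    # threads=4 gives 1 + ceil(4/4.0) = 2; threads=8 gives 1 + ceil(8/4.0) = 3.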
+ Int estimatedSortThreads = if threads == 1 then 1 else 1 + ceil(threads / 4.0) + Int totalSortThreads = select_first([sortThreads, estimatedSortThreads]) + Int estimatedMemoryGb = 1 + ceil(size(indexFiles, "G") * 1.2) + sortMemoryPerThreadGb * totalSortThreads command { set -e -o pipefail @@ -55,34 +65,50 @@ task Hisat2 { --rg 'LB:~{library}' \ --rg 'PL:~{platform}' \ ~{true="--dta" false="" downstreamTranscriptomeAssembly} \ - | samtools sort > ~{outputBam} - samtools index ~{outputBam} ~{bamIndexPath} + --new-summary \ + --summary-file ~{summaryFilePath} \ + | samtools sort \ + ~{"-@ " + totalSortThreads} \ + -m ~{sortMemoryPerThreadGb}G \ + -l ~{compressionLevel} \ + - \ + -o ~{outputBam} } output { File bamFile = outputBam - File bamIndex = bamIndexPath + File summaryFile = summaryFilePath } runtime { - memory: memory - cpu: threads + 1 + cpu: threads + memory: "~{select_first([memoryGb, estimatedMemoryGb])}GiB" + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - indexFiles: {description: "The hisat2 index files.", category: "required"} + # inputs inputR1: {description: "The first-/single-end FastQ file.", category: "required"} inputR2: {description: "The second-end FastQ file.", category: "common"} + indexFiles: {description: "The hisat2 index files.", category: "required"} outputBam: {description: "The location the output BAM file should be written to.", category: "required"} sample: {description: "The sample id.", category: "required"} library: {description: "The library id.", category: "required"} readgroup: {description: "The readgroup id.", category: "required"} platform: {description: "The platform used for sequencing.", category: "advanced"} downstreamTranscriptomeAssembly: {description: "Equivalent to hisat2's `--dta` flag.", category: "advanced"} + summaryFilePath: {description: "Where the summary file should be written.", category: "advanced"} + sortMemoryPerThreadGb: {description: "The amount of memory for each sorting thread in gigabytes.", category: "advanced"} + compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"} + sortThreads: {description: "The number of threads to use for sorting.", category: "advanced"} threads: {description: "The number of threads to use.", category: "advanced"} - memory: {description: "The amount of memory this job will use.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + memoryGb: {description: "The amount of memory this job will use in gigabytes.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + bamFile: {description: "Output BAM file."} + summaryFile: {description: "Alignment summary file."} } -} \ No newline at end of file +} diff --git a/hmftools.wdl b/hmftools.wdl new file mode 100644 index 00000000..c27630a1 --- /dev/null +++ b/hmftools.wdl @@ -0,0 +1,1368 @@ +version 1.0 + +# Copyright (c) 2020 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task Amber { + input { + String referenceName + File referenceBam + File referenceBamIndex + String tumorName + File tumorBam + File tumorBamIndex + String outputDir = "./amber" + File loci + File referenceFasta + File referenceFastaFai + File referenceFastaDict + + Int threads = 2 + String memory = "85GiB" + String javaXmx = "80G" + Int timeMinutes = 480 + String dockerImage = "quay.io/biocontainers/hmftools-amber:3.5--0" + } + + command { + AMBER -Xmx~{javaXmx} \ + -reference ~{referenceName} \ + -reference_bam ~{referenceBam} \ + -tumor ~{tumorName} \ + -tumor_bam ~{tumorBam} \ + -output_dir ~{outputDir} \ + -threads ~{threads} \ + -ref_genome ~{referenceFasta} \ + -loci ~{loci} + } + + output { + File version = "~{outputDir}/amber.version" + File tumorBafPcf = "~{outputDir}/~{tumorName}.amber.baf.pcf" + File tumorBafTsv = "~{outputDir}/~{tumorName}.amber.baf.tsv" + File tumorBafVcf = "~{outputDir}/~{tumorName}.amber.baf.vcf.gz" + File tumorBafVcfIndex = "~{outputDir}/~{tumorName}.amber.baf.vcf.gz.tbi" + File tumorContaminationVcf = "~{outputDir}/~{tumorName}.amber.contamination.vcf.gz" + File tumorContaminationVcfIndex = "~{outputDir}/~{tumorName}.amber.contamination.vcf.gz.tbi" + File tumorContaminationTsv = "~{outputDir}/~{tumorName}.amber.contamination.tsv" + File tumorQc = "~{outputDir}/~{tumorName}.amber.qc" + File normalSnpVcf = "~{outputDir}/~{referenceName}.amber.snp.vcf.gz" + File normalSnpVcfIndex = "~{outputDir}/~{referenceName}.amber.snp.vcf.gz.tbi" + Array[File] outputs = [version, tumorBafPcf, tumorBafTsv, tumorBafVcf, tumorBafVcfIndex, + tumorContaminationVcf, tumorContaminationVcfIndex, tumorContaminationTsv, tumorQc, + normalSnpVcf, normalSnpVcfIndex] + } + + runtime { + memory: memory + time_minutes: timeMinutes # !UnknownRuntimeKey + docker: dockerImage + cpu: threads + } + + parameter_meta { + referenceName: {description: "the name of the normal sample.", category: "required"} + referenceBam: {description: "The normal BAM file.", category: "required"} 
+        referenceBamIndex: {description: "The index for the normal BAM file.", category: "required"}
+        tumorName: {description: "The name of the tumor sample.", category: "required"}
+        tumorBam: {description: "The tumor BAM file.", category: "required"}
+        tumorBamIndex: {description: "The index for the tumor BAM file.", category: "required"}
+        outputDir: {description: "The path to the output directory.", category: "common"}
+        loci: {description: "A VCF file containing likely heterozygous sites.", category: "required"}
+        referenceFasta: {description: "The reference fasta file.", category: "required"}
+        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
+            category: "required"}
+        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+        threads: {description: "The number of threads the program will use.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+            category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+            category: "advanced"}
+    }
+}
+
+task Cobalt {
+    input {
+        String referenceName
+        File referenceBam
+        File referenceBamIndex
+        String tumorName
+        File tumorBam
+        File tumorBamIndex
+        String outputDir = "./cobalt"
+        File gcProfile
+
+        Int threads = 1
+        String memory = "5GiB"
+        String javaXmx = "4G"
+        Int timeMinutes = 480
+        String dockerImage = "quay.io/biocontainers/hmftools-cobalt:1.11--0"
+    }
+
+    command {
+        COBALT -Xmx~{javaXmx} \
+            -reference ~{referenceName} \
+            -reference_bam ~{referenceBam} \
+            -tumor ~{tumorName} \
+            -tumor_bam ~{tumorBam} \
+            -output_dir ~{outputDir} \
+            -threads ~{threads} \
+            -gc_profile ~{gcProfile}
+    }
+
+    output {
+        File version = "~{outputDir}/cobalt.version"
+        File normalGcMedianTsv = "~{outputDir}/~{referenceName}.cobalt.gc.median.tsv"
+        File normalRatioMedianTsv = "~{outputDir}/~{referenceName}.cobalt.ratio.median.tsv"
+        File normalRatioPcf = "~{outputDir}/~{referenceName}.cobalt.ratio.pcf"
+        File tumorGcMedianTsv = "~{outputDir}/~{tumorName}.cobalt.gc.median.tsv"
+        File tumorRatioPcf = "~{outputDir}/~{tumorName}.cobalt.ratio.pcf"
+        File tumorRatioTsv = "~{outputDir}/~{tumorName}.cobalt.ratio.tsv"
+        File tumorChrLen = "~{outputDir}/~{tumorName}.chr.len"
+        Array[File] outputs = [version, normalGcMedianTsv, normalRatioMedianTsv,
+            normalRatioPcf, tumorGcMedianTsv, tumorRatioPcf, tumorRatioTsv, tumorChrLen]
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+        cpu: threads
+    }
+
+    parameter_meta {
+        referenceName: {description: "The name of the normal sample.", category: "required"}
+        referenceBam: {description: "The normal BAM file.", category: "required"}
+        referenceBamIndex: {description: "The index for the normal BAM file.", category: "required"}
+        tumorName: {description: "The name of the tumor sample.", category: "required"}
+        tumorBam: {description: "The tumor BAM file.", category: "required"}
+        tumorBamIndex: {description: "The index for the tumor BAM file.", category: "required"}
+        outputDir: {description: "The path to the output directory.", category: "common"}
+        gcProfile: {description: 
"A file describing the GC profile of the reference genome.", category: "required"} + threads: {description: "The number of threads the program will use.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task CupGenerateReport { + input { + String sampleName + File cupData + String outputDir = "./cuppa" + + String memory = "5GiB" + Int timeMinutes = 10 + String dockerImage = "quay.io/biowdl/cuppa:1.6" + } + + # This script writes to the directory that the input is located in. + # Giving the input directly will cause the script to write in the + # locallized input dir, which may cause issues with write permissions + # in certain execution engines or backends. We, therefore, make links + # to a working directory, and give that directory as input instead. + # We can't just use the outputDir directly. This could be an + # absolute path in which case the linking might fail due to name + # collisions. Outputs are copied to the given output dir afterwards. + command { + set -e + mkdir -p ./workdir ~{outputDir} + ln -s -t workdir ~{cupData} + CupGenerateReport \ + ~{sampleName} \ + workdir/ + mv -t ~{outputDir} \ + ./workdir/~{sampleName}.cup.report.summary.png \ + ./workdir/~{sampleName}_cup_report.pdf + if [ -f ./workdir/~{sampleName}.cup.report.features.png ] + then + mv -t ~{outputDir} \ + ./workdir/~{sampleName}.cup.report.features.png + fi + } + + output { + File summaryPng = "~{outputDir}/~{sampleName}.cup.report.summary.png" + File? featuresPng = "~{outputDir}/~{sampleName}.cup.report.features.png" + File reportPdf = "~{outputDir}/~{sampleName}_cup_report.pdf" + } + + runtime { + memory: memory + time_minutes: timeMinutes # !UnknownRuntimeKey + docker: dockerImage + } + + parameter_meta { + sampleName: {description: "The sample id.", category: "required"} + cupData: {description: "The output produced by cuppa.", category: "required"} + outputDir: {description: "The directory the ouput will be placed in.", category: "common"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
+
+task Cuppa {
+    input {
+        Array[File]+ linxOutput
+        Array[File]+ purpleOutput
+        String sampleName
+        Array[String]+ categories = ["DNA"]
+        Array[File]+ referenceData
+        File purpleSvVcf
+        File purpleSvVcfIndex
+        File purpleSomaticVcf
+        File purpleSomaticVcfIndex
+        String outputDir = "./cuppa"
+
+        String javaXmx = "4G"
+        String memory = "5GiB"
+        Int timeMinutes = 10
+        String dockerImage = "quay.io/biowdl/cuppa:1.6"
+    }
+
+    command {
+        set -e
+        mkdir -p sampleData ~{outputDir}
+        ln -s -t sampleData ~{sep=" " linxOutput} ~{sep=" " purpleOutput}
+        cuppa -Xmx~{javaXmx} \
+        -output_dir ~{outputDir} \
+        -output_id ~{sampleName} \
+        -categories '~{sep="," categories}' \
+        -ref_data_dir ~{sub(referenceData[0], basename(referenceData[0]), "")} \
+        -sample_data_dir sampleData \
+        -sample_data ~{sampleName} \
+        -sample_sv_file ~{purpleSvVcf} \
+        -sample_somatic_vcf ~{purpleSomaticVcf}
+    }
+
+    output {
+        File cupData = "~{outputDir}/~{sampleName}.cup.data.csv"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        linxOutput: {description: "The files produced by linx.", category: "required"}
+        purpleOutput: {description: "The files produced by purple.", category: "required"}
+        sampleName: {description: "The name of the sample.", category: "required"}
+        categories: {description: "The classifiers to use.", category: "advanced"}
+        referenceData: {description: "The reference data.", category: "required"}
+        purpleSvVcf: {description: "The VCF file produced by purple which contains structural variants.", category: "required"}
+        purpleSvVcfIndex: {description: "The index of the structural variants VCF file produced by purple.", category: "required"}
+        purpleSomaticVcf: {description: "The VCF file produced by purple which contains somatic variants.", category: "required"}
+        purpleSomaticVcfIndex: {description: "The index of the somatic VCF file produced by purple.", category: "required"}
+        outputDir: {description: "The directory the output will be placed in.", category: "common"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                  category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
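+
+# Note: '~{sep="," categories}' in the Cuppa command above renders the array
+# as a single comma-separated argument, e.g. ["DNA", "RNA"] becomes 'DNA,RNA'.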
+
+task CuppaChart {
+    input {
+        String sampleName
+        File cupData
+        String outputDir = "./cuppa"
+
+        String memory = "4GiB"
+        Int timeMinutes = 5
+        String dockerImage = "quay.io/biowdl/cuppa:1.6"
+    }
+
+    command {
+        set -e
+        mkdir -p ~{outputDir}
+        cuppa-chart \
+        -sample ~{sampleName} \
+        -sample_data ~{cupData} \
+        -output_dir ~{outputDir}
+    }
+
+    output {
+        File cuppaChart = "~{outputDir}/~{sampleName}.cuppa.chart.png"
+        File cuppaConclusion = "~{outputDir}/~{sampleName}.cuppa.conclusion.txt"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        sampleName: {description: "The name of the sample.", category: "common"}
+        cupData: {description: "The cuppa output.", category: "required"}
+        outputDir: {description: "The directory the output will be written to.", category: "common"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
+
+task Gripss {
+    input {
+        File referenceFasta
+        File referenceFastaFai
+        File referenceFastaDict
+        File knownFusionPairBedpe
+        File breakendPon
+        File breakpointPon
+        String referenceName
+        String tumorName
+        File vcf
+        File vcfIndex
+        String outputDir = "./"
+
+        String memory = "17GiB"
+        String javaXmx = "16G"
+        Int timeMinutes = 50
+        String dockerImage = "quay.io/biocontainers/hmftools-gripss:2.0--hdfd78af_0"
+    }
+
+    command {
+        set -e
+        mkdir -p ~{outputDir}
+        gripss -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
+        -ref_genome ~{referenceFasta} \
+        -known_hotspot_file ~{knownFusionPairBedpe} \
+        -pon_sgl_file ~{breakendPon} \
+        -pon_sv_file ~{breakpointPon} \
+        -reference ~{referenceName} \
+        -sample ~{tumorName} \
+        -vcf ~{vcf} \
+        -output_dir ~{outputDir} \
+        -output_id somatic
+    }
+
+    output {
+        File fullVcf = "~{outputDir}/~{tumorName}.gripss.somatic.vcf.gz"
+        File fullVcfIndex = "~{outputDir}/~{tumorName}.gripss.somatic.vcf.gz.tbi"
+        File filteredVcf = "~{outputDir}/~{tumorName}.gripss.filtered.somatic.vcf.gz"
+        File filteredVcfIndex = "~{outputDir}/~{tumorName}.gripss.filtered.somatic.vcf.gz.tbi"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        referenceFasta: {description: "The reference fasta file.", category: "required"}
+        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
+                             category: "required"}
+        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+        knownFusionPairBedpe: {description: "Equivalent to the `-known_hotspot_file` option.", category: "required"}
+        breakendPon: {description: "Equivalent to the `-pon_sgl_file` option.", category: "required"}
+        breakpointPon: {description: "Equivalent to the `-pon_sv_file` option.", category: "required"}
+        tumorName: {description: "The name of the tumor sample.", category: "required"}
+        referenceName: {description: "The name of the normal sample.", category: "required"}
+        vcf: {description: "The input VCF.", category: "required"}
+        vcfIndex: {description: "The index for the input VCF.", category: "required"}
+
+        outputDir: {description: "The path the output will be written to.", category: "common"}
+
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                  category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
+
+task GripssApplicationKt {
+    # Obsolete
+    input {
+        File inputVcf
+        String outputPath = "gripss.vcf.gz"
+        String tumorName
+        String referenceName
+        File referenceFasta
+        File referenceFastaFai
+        File referenceFastaDict
+        File breakpointHotspot
+        File breakendPon
+        File breakpointPon
+
+        String memory = "32GiB"
+        String javaXmx = "31G"
+        Int timeMinutes = 45
+        String dockerImage = "quay.io/biocontainers/hmftools-gripss:1.11--hdfd78af_0"
+    }
+
+    command {
+        java -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
+        -cp /usr/local/share/hmftools-gripss-1.11-0/gripss.jar \
+        com.hartwig.hmftools.gripss.GripssApplicationKt \
+        -tumor ~{tumorName} \
+        -reference ~{referenceName} \
+        -ref_genome ~{referenceFasta} \
+        -breakpoint_hotspot ~{breakpointHotspot} \
+        -breakend_pon ~{breakendPon} \
+        -breakpoint_pon ~{breakpointPon} \
+        -input_vcf ~{inputVcf} \
+        -output_vcf ~{outputPath} \
+        -paired_normal_tumor_ordinals
+    }
+
+    output {
+        File outputVcf = outputPath
+        File outputVcfIndex = outputPath + ".tbi"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        inputVcf: {description: "The input VCF.", category: "required"}
+        outputPath: {description: "The path where the output VCF will be written.", category: "common"}
+        referenceName: {description: "The name of the normal sample.", category: "required"}
+        tumorName: {description: "The name of the tumor sample.", category: "required"}
+        referenceFasta: {description: "The reference fasta file.", category: "required"}
+        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
+                             category: "required"}
+        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+        breakpointHotspot: {description: "Equivalent to the `-breakpoint_hotspot` option.", category: "required"}
+        breakendPon: {description: "Equivalent to the `-breakend_pon` option.", category: "required"}
+        breakpointPon: {description: "Equivalent to the `-breakpoint_pon` option.", category: "required"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                  category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
+
+task GripssHardFilterApplicationKt {
+    # Obsolete
+    input {
+        File inputVcf
+        String outputPath = "gripss_hard_filter.vcf.gz"
+
+        String memory = "3GiB"
+        String javaXmx = "2G"
+        Int timeMinutes = 15
+        String dockerImage = "quay.io/biocontainers/hmftools-gripss:1.11--hdfd78af_0"
+    }
+
+    command {
+        java -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
+        -cp /usr/local/share/hmftools-gripss-1.11-0/gripss.jar \
+        com.hartwig.hmftools.gripss.GripssHardFilterApplicationKt \
+        -input_vcf ~{inputVcf} \
+        -output_vcf ~{outputPath}
+    }
+
+    output {
+        File outputVcf = outputPath
+        File outputVcfIndex = outputPath + ".tbi"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        inputVcf: {description: "The input VCF.", category: "required"}
+        outputPath: {description: "The path where the output VCF will be written.", category: "common"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                  category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
+
+task HealthChecker {
+    input {
+        String outputDir = "."
+        String referenceName
+        File referenceFlagstats
+        File referenceMetrics
+        String tumorName
+        File tumorFlagstats
+        File tumorMetrics
+        Array[File]+ purpleOutput
+
+        String javaXmx = "2G"
+        String memory = "3GiB"
+        Int timeMinutes = 1
+        String dockerImage = "quay.io/biowdl/health-checker:3.2"
+    }
+
+    command {
+        set -e
+        mkdir -p ~{outputDir}
+        health-checker -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
+        -reference ~{referenceName} \
+        -ref_flagstat_file ~{referenceFlagstats} \
+        -ref_wgs_metrics_file ~{referenceMetrics} \
+        -tumor ~{tumorName} \
+        -tum_flagstat_file ~{tumorFlagstats} \
+        -tum_wgs_metrics_file ~{tumorMetrics} \
+        -purple_dir ~{sub(purpleOutput[0], basename(purpleOutput[0]), "")} \
+        -output_dir ~{outputDir}
+        if [ -e '~{outputDir}/~{tumorName}.HealthCheckSucceeded' ]
+        then
+            echo 'true' > '~{outputDir}/succeeded'
+        fi
+        if [ -e '~{outputDir}/~{tumorName}.HealthCheckFailed' ]
+        then
+            echo 'false' > '~{outputDir}/succeeded'
+        fi
+    }
+
+    output {
+        Boolean succeeded = read_boolean("~{outputDir}/succeeded")
+        File outputFile = if succeeded
+            then "~{outputDir}/~{tumorName}.HealthCheckSucceeded"
+            else "~{outputDir}/~{tumorName}.HealthCheckFailed"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        outputDir: {description: "The path the output will be written to.", category: "common"}
+        referenceName: {description: "The name of the normal sample.", category: "required"}
+        referenceFlagstats: {description: "The flagstats for the normal sample.", category: "required"}
+        referenceMetrics: {description: "The picard WGS metrics for the normal sample.", category: "required"}
+        tumorName: {description: "The name of the tumor sample.", category: "required"}
+        tumorFlagstats: {description: "The flagstats for the tumor sample.", category: "required"}
+        tumorMetrics: {description: "The picard WGS metrics for the tumor sample.", category: "required"}
+        purpleOutput: {description: "The files from purple's output directory.", category: "required"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                  category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
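+
+# Note: health-checker reports its verdict through marker files rather than
+# its exit code. The command above therefore writes 'true' or 'false' to a
+# 'succeeded' file, which read_boolean() turns into a WDL Boolean that
+# selects the matching marker file as the task's output.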
category: "required"} + purpleOutput: {description: "The files from purple's output directory.", category: "required"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task Linx { + input { + String sampleName + File svVcf + File svVcfIndex + Array[File]+ purpleOutput + String refGenomeVersion + String outputDir = "./linx" + File fragileSiteCsv + File lineElementCsv + File knownFusionCsv + File driverGenePanel + Boolean writeAllVisFusions = false + #The following should be in the same directory. + File geneDataCsv + File proteinFeaturesCsv + File transExonDataCsv + File transSpliceDataCsv + + String memory = "9GiB" + String javaXmx = "8G" + Int timeMinutes = 10 + String dockerImage = "quay.io/biocontainers/hmftools-linx:1.18--hdfd78af_0" + } + + command { + linx -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ + -sample ~{sampleName} \ + -sv_vcf ~{svVcf} \ + -purple_dir ~{sub(purpleOutput[0], basename(purpleOutput[0]), "")} \ + -ref_genome_version ~{refGenomeVersion} \ + -output_dir ~{outputDir} \ + -fragile_site_file ~{fragileSiteCsv} \ + -line_element_file ~{lineElementCsv} \ + -ensembl_data_dir ~{sub(geneDataCsv, basename(geneDataCsv), "")} \ + -check_fusions \ + -known_fusion_file ~{knownFusionCsv} \ + -check_drivers \ + -driver_gene_panel ~{driverGenePanel} \ + -chaining_sv_limit 0 \ + -write_vis_data \ + ~{if writeAllVisFusions then "-write_all_vis_fusions" else ""} + } + + output { + File driverCatalog = "~{outputDir}/~{sampleName}.linx.driver.catalog.tsv" + File linxBreakend = "~{outputDir}/~{sampleName}.linx.breakend.tsv" + File linxClusters = "~{outputDir}/~{sampleName}.linx.clusters.tsv" + File linxDrivers = "~{outputDir}/~{sampleName}.linx.drivers.tsv" + File linxFusion = "~{outputDir}/~{sampleName}.linx.fusion.tsv" + File linxLinks = "~{outputDir}/~{sampleName}.linx.links.tsv" + File linxSvs = "~{outputDir}/~{sampleName}.linx.svs.tsv" + File linxVisCopyNumber = "~{outputDir}/~{sampleName}.linx.vis_copy_number.tsv" + File linxVisFusion = "~{outputDir}/~{sampleName}.linx.vis_fusion.tsv" + File linxVisGeneExon = "~{outputDir}/~{sampleName}.linx.vis_gene_exon.tsv" + File linxVisProteinDomain = "~{outputDir}/~{sampleName}.linx.vis_protein_domain.tsv" + File linxVisSegments = "~{outputDir}/~{sampleName}.linx.vis_segments.tsv" + File linxVisSvData = "~{outputDir}/~{sampleName}.linx.vis_sv_data.tsv" + File linxVersion = "~{outputDir}/linx.version" + Array[File] outputs = [driverCatalog, linxBreakend, linxClusters, linxDrivers, linxFusion, + linxLinks, linxSvs, linxVisCopyNumber, linxVisFusion, + linxVisGeneExon, linxVisProteinDomain, linxVisSegments, linxVisSvData, + linxVersion] + } + + runtime { + time_minutes: timeMinutes # !UnknownRuntimeKey + docker: dockerImage + memory: memory + } + + parameter_meta { + sampleName: {description: "The name of the sample.", category: "required"} + svVcf: {description: "A VCF file containing structural variants, produced using GRIDSS, annotated for viral insertions and postprocessed with GRIPSS.", category: "required"} + svVcfIndex: {description: "Index for the structural 
+        purpleOutput: {description: "The files produced by PURPLE.", category: "required"}
+        refGenomeVersion: {description: "The version of the genome assembly used for alignment. Either \"37\" or \"38\".", category: "required"}
+        outputDir: {description: "The directory the outputs will be written to.", category: "common"}
+        fragileSiteCsv: {description: "A list of known fragile sites.", category: "required"}
+        lineElementCsv: {description: "A list of known LINE source regions.", category: "required"}
+        knownFusionCsv: {description: "A CSV file describing known fusions.", category: "required"}
+        driverGenePanel: {description: "A TSV file describing the driver gene panel.", category: "required"}
+        writeAllVisFusions: {description: "Equivalent to the -write_all_vis_fusions flag.", category: "advanced"}
+        geneDataCsv: {description: "A CSV file containing gene information, must be in the same directory as `proteinFeaturesCsv`, `transExonDataCsv` and `transSpliceDataCsv`.", category: "required"}
+        proteinFeaturesCsv: {description: "A CSV file containing protein feature information, must be in the same directory as `geneDataCsv`, `transExonDataCsv` and `transSpliceDataCsv`.", category: "required"}
+        transExonDataCsv: {description: "A CSV file containing transcript exon information, must be in the same directory as `geneDataCsv`, `proteinFeaturesCsv` and `transSpliceDataCsv`.", category: "required"}
+        transSpliceDataCsv: {description: "A CSV file containing transcript splicing information, must be in the same directory as `geneDataCsv`, `proteinFeaturesCsv` and `transExonDataCsv`.", category: "required"}
+
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                  category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
+
+task LinxVisualisations {
+    input {
+        String outputDir = "./linx_visualisation"
+        String sample
+        String refGenomeVersion
+        Array[File]+ linxOutput
+        Boolean plotReportable = true
+
+        String memory = "9GiB"
+        String javaXmx = "8G"
+        Int timeMinutes = 1440
+        String dockerImage = "quay.io/biocontainers/hmftools-linx:1.18--hdfd78af_0"
+    }
+
+    command {
+        set -e
+        mkdir -p ~{outputDir}
+        java -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
+        -cp /usr/local/share/hmftools-linx-1.18-0/sv-linx.jar \
+        com.hartwig.hmftools.linx.visualiser.SvVisualiser \
+        -sample ~{sample} \
+        -ref_genome_version ~{refGenomeVersion} \
+        -circos /usr/local/bin/circos \
+        -vis_file_dir ~{sub(linxOutput[0], basename(linxOutput[0]), "")} \
+        -data_out ~{outputDir}/circos \
+        -plot_out ~{outputDir}/plots \
+        ~{if plotReportable then "-plot_reportable" else ""}
+    }
+
+    output {
+        Array[File] circos = glob("~{outputDir}/circos/*")
+        Array[File] plots = glob("~{outputDir}/plots/*")
+    }
+
+    runtime {
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+        memory: memory
+    }
+
+    parameter_meta {
+        outputDir: {description: "The directory the outputs will be written to.", category: "common"}
+        sample: {description: "The sample's name.", category: "required"}
+        refGenomeVersion: {description: "The version of the genome assembly used for alignment. Either \"37\" or \"38\".", category: "required"}
Either \"37\" or \"38\".", category: "required"} + linxOutput: {description: "The directory containing the linx output.", category: "required"} + plotReportable: {description: "Equivalent to the -plot_reportable flag.", category: "advanced"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task Orange { + input { + String outputDir = "./orange" + File doidJson + Array[String] sampleDoids + String tumorName + String referenceName + File referenceMetrics + File tumorMetrics + File referenceFlagstats + File tumorFlagstats + File sageGermlineGeneCoverageTsv + File sageSomaticRefSampleBqrPlot + File sageSomaticTumorSampleBqrPlot + File purpleGeneCopyNumberTsv + File purpleGermlineDriverCatalogTsv + File purpleGermlineVariantVcf + File purpleGermlineVariantVcfIndex + Array[File]+ purplePlots + File purplePurityTsv + File purpleQcFile + File purpleSomaticDriverCatalogTsv + File purpleSomaticVariantVcf + File purpleSomaticVariantVcfIndex + File linxFusionTsv + File linxBreakendTsv + File linxDriverCatalogTsv + File linxDriverTsv + Array[File]+ linxPlots + File cuppaResultCsv + File cuppaSummaryPlot + File? cuppaFeaturePlot + File chordPredictionTxt + File peachGenotypeTsv + File protectEvidenceTsv + File annotatedVirusTsv + #File pipelineVersionFile + File cohortMappingTsv + File cohortPercentilesTsv + + String memory = "17GiB" + String javaXmx = "16G" + Int timeMinutes = 10 + String dockerImage = "quay.io/biowdl/orange:v1.6" + } + + command { + set -e + mkdir -p ~{outputDir} + orange -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ + -output_dir ~{outputDir} \ + -doid_json ~{doidJson} \ + -primary_tumor_doids '~{sep=";" sampleDoids}' \ + -max_evidence_level C \ + -tumor_sample_id ~{tumorName} \ + -reference_sample_id ~{referenceName} \ + -ref_sample_wgs_metrics_file ~{referenceMetrics} \ + -tumor_sample_wgs_metrics_file ~{tumorMetrics} \ + -ref_sample_flagstat_file ~{referenceFlagstats} \ + -tumor_sample_flagstat_file ~{tumorFlagstats} \ + -sage_germline_gene_coverage_tsv ~{sageGermlineGeneCoverageTsv} \ + -sage_somatic_ref_sample_bqr_plot ~{sageSomaticRefSampleBqrPlot} \ + -sage_somatic_tumor_sample_bqr_plot ~{sageSomaticTumorSampleBqrPlot} \ + -purple_gene_copy_number_tsv ~{purpleGeneCopyNumberTsv} \ + -purple_germline_driver_catalog_tsv ~{purpleGermlineDriverCatalogTsv} \ + -purple_germline_variant_vcf ~{purpleGermlineVariantVcf} \ + -purple_plot_directory ~{sub(purplePlots[0], basename(purplePlots[0]), "")} \ + -purple_purity_tsv ~{purplePurityTsv} \ + -purple_qc_file ~{purpleQcFile} \ + -purple_somatic_driver_catalog_tsv ~{purpleSomaticDriverCatalogTsv} \ + -purple_somatic_variant_vcf ~{purpleSomaticVariantVcf} \ + -linx_fusion_tsv ~{linxFusionTsv} \ + -linx_breakend_tsv ~{linxBreakendTsv} \ + -linx_driver_catalog_tsv ~{linxDriverCatalogTsv} \ + -linx_driver_tsv ~{linxDriverTsv} \ + -linx_plot_directory ~{sub(linxPlots[0], basename(linxPlots[0]), "")} \ + -cuppa_result_csv ~{cuppaResultCsv} \ + -cuppa_summary_plot ~{cuppaSummaryPlot} \ + ~{"-cuppa_feature_plot " + cuppaFeaturePlot} \ + -chord_prediction_txt 
+        -chord_prediction_txt ~{chordPredictionTxt} \
+        -peach_genotype_tsv ~{peachGenotypeTsv} \
+        -protect_evidence_tsv ~{protectEvidenceTsv} \
+        -annotated_virus_tsv ~{annotatedVirusTsv} \
+        -cohort_mapping_tsv ~{cohortMappingTsv} \
+        -cohort_percentiles_tsv ~{cohortPercentilesTsv}
+    }
+    #TODO may need to be added: -pipeline_version_file ~{pipelineVersionFile}
+
+    output {
+        File orangeJson = "~{outputDir}/~{tumorName}.orange.json"
+        File orangePdf = "~{outputDir}/~{tumorName}.orange.pdf"
+    }
+
+    runtime {
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+        memory: memory
+    }
+
+    parameter_meta {
+        outputDir: {description: "The directory the outputs will be written to.", category: "common"}
+        doidJson: {description: "A json with the DOID (Human Disease Ontology) tree.", category: "required"}
+        sampleDoids: {description: "The DOIDs (Human Disease Ontology) for the primary tumor.", category: "required"}
+        tumorName: {description: "The name of the tumor sample.", category: "required"}
+        referenceName: {description: "The name of the normal sample.", category: "required"}
+        referenceMetrics: {description: "The picard WGS metrics for the normal sample.", category: "required"}
+        tumorMetrics: {description: "The picard WGS metrics for the tumor sample.", category: "required"}
+        referenceFlagstats: {description: "The flagstats for the normal sample.", category: "required"}
+        tumorFlagstats: {description: "The flagstats for the tumor sample.", category: "required"}
+        sageGermlineGeneCoverageTsv: {description: "Gene coverage file produced by the germline sage run.", category: "required"}
+        sageSomaticRefSampleBqrPlot: {description: "The reference bqr plot produced by the somatic sage run.", category: "required"}
+        sageSomaticTumorSampleBqrPlot: {description: "The tumor bqr plot produced by the somatic sage run.", category: "required"}
+        purpleGeneCopyNumberTsv: {description: "Copy number tsv produced by purple.", category: "required"}
+        purpleGermlineDriverCatalogTsv: {description: "Germline driver catalog produced by purple.", category: "required"}
+        purpleGermlineVariantVcf: {description: "Germline variant vcf produced by purple.", category: "required"}
+        purplePlots: {description: "The plots generated by purple.", category: "required"}
+        purplePurityTsv: {description: "The purity file produced by purple.", category: "required"}
+        purpleQcFile: {description: "The qc file produced by purple.", category: "required"}
+        purpleSomaticDriverCatalogTsv: {description: "Somatic driver catalog produced by purple.", category: "required"}
+        purpleSomaticVariantVcf: {description: "Somatic variant vcf produced by purple.", category: "required"}
+        linxFusionTsv: {description: "The fusions tsv produced by linx.", category: "required"}
+        linxBreakendTsv: {description: "The breakend tsv produced by linx.", category: "required"}
+        linxDriverCatalogTsv: {description: "The driver catalog produced by linx.", category: "required"}
+        linxDriverTsv: {description: "The driver tsv produced by linx.", category: "required"}
+        linxPlots: {description: "The plots generated by linx.", category: "required"}
+        cuppaResultCsv: {description: "The cuppa results csv.", category: "required"}
+        cuppaSummaryPlot: {description: "The cuppa summary plot.", category: "required"}
+        cuppaFeaturePlot: {description: "The cuppa feature plot.", category: "common"}
+        chordPredictionTxt: {description: "Chord prediction results.", category: "required"}
+        peachGenotypeTsv: {description: "Genotype tsv produced by peach.", category: "required"}
+        protectEvidenceTsv: {description: "Evidence tsv produced by protect.", category: "required"}
+        annotatedVirusTsv: {description: "Annotated virus tsv produced by virus-interpreter.", category: "required"}
+        #pipelineVersionFile: {description: "", category: "required"}
+        cohortMappingTsv: {description: "Cohort mapping file from the HMFTools resources.", category: "required"}
+        cohortPercentilesTsv: {description: "Cohort percentile file from the HMFTools resources.", category: "required"}
+
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                  category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
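+
+# Note: optional arguments use the ~{"-flag " + optionalValue} pattern (see
+# -cuppa_feature_plot above): when the optional value is undefined the whole
+# concatenation evaluates to empty and the flag is omitted from the command.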
+
+task Pave {
+    input {
+        String outputDir = "./"
+        String sampleName
+        File vcfFile
+        File vcfFileIndex
+        File referenceFasta
+        File referenceFastaFai
+        File referenceFastaDict
+        String refGenomeVersion
+        File driverGenePanel
+        # The following should be in the same directory.
+        File geneDataCsv
+        File proteinFeaturesCsv
+        File transExonDataCsv
+        File transSpliceDataCsv
+
+        Int timeMinutes = 50
+        String javaXmx = "8G"
+        String memory = "9GiB"
+        String dockerImage = "quay.io/biowdl/pave:v1.0"
+    }
+
+    command {
+        set -e
+        mkdir -p ~{outputDir}
+        pave -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
+        -sample ~{sampleName} \
+        -vcf_file ~{vcfFile} \
+        -output_dir ~{outputDir} \
+        -ensembl_data_dir ~{sub(geneDataCsv, basename(geneDataCsv), "")} \
+        -ref_genome ~{referenceFasta} \
+        -ref_genome_version ~{refGenomeVersion} \
+        -driver_gene_panel ~{driverGenePanel}
+    }
+
+    output {
+        File outputVcf = "~{outputDir}/~{sub(basename(vcfFile), 'vcf.gz$', 'pave.vcf.gz')}"
+        File outputVcfIndex = "~{outputDir}/~{sub(basename(vcfFile), 'vcf.gz$', 'pave.vcf.gz.tbi')}"
+    }
+
+    runtime {
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+        memory: memory
+    }
+
+    parameter_meta {
+        outputDir: {description: "The directory the outputs will be written to.", category: "common"}
+        sampleName: {description: "The name of the sample.", category: "required"}
+        vcfFile: {description: "The input VCF file.", category: "required"}
+        vcfFileIndex: {description: "The index for the input VCF file.", category: "required"}
+        referenceFasta: {description: "The reference fasta file.", category: "required"}
+        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
+                             category: "required"}
+        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+        refGenomeVersion: {description: "The version of the genome assembly used for alignment. 
Either \"HG19\" or \"HG38\".", category: "required"} + driverGenePanel: {description: "A TSV file describing the driver gene panel.", category: "required"} + geneDataCsv: {description: "A CSV file containing gene information, must be in the same directory as `proteinFeaturesCsv`, `transExonDataCsv` and `transSpliceDataCsv`.", category: "required"} + proteinFeaturesCsv: {description: "A CSV file containing protein feature information, must be in the same directory as `geneDataCsv`, `transExonDataCsv` and `transSpliceDataCsv`.", category: "required"} + transExonDataCsv: {description: "A CSV file containing transcript exon information, must be in the same directory as `geneDataCsv`, `proteinFeaturesCsv` and `transSpliceDataCsv`.", category: "required"} + transSpliceDataCsv: {description: "A CSV file containing transcript splicing information, must be in the same directory as `geneDataCsv`, `proteinFeaturesCsv` and `transExonDataCsv`.", category: "required"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task Protect { + input { + String refGenomeVersion + String tumorName + String referenceName + Array[String]+ sampleDoids + String outputDir = "." + Array[File]+ serveActionability + File doidJson + File purplePurity + File purpleQc + File purpleDriverCatalogSomatic + File purpleDriverCatalogGermline + File purpleSomaticVariants + File purpleSomaticVariantsIndex + File purpleGermlineVariants + File purpleGermlineVariantsIndex + File purpleGeneCopyNumber + File linxFusion + File linxBreakend + File linxDriversCatalog + File chordPrediction + File annotatedVirus + + String memory = "9GiB" + String javaXmx = "8G" + Int timeMinutes = 60 + String dockerImage = "quay.io/biowdl/protect:v2.0" + } + + command { + protect -Xmx~{javaXmx} \ + -ref_genome_version ~{refGenomeVersion} \ + -tumor_sample_id ~{tumorName} \ + -reference_sample_id ~{referenceName} \ + -primary_tumor_doids '~{sep=";" sampleDoids}' \ + -output_dir ~{outputDir} \ + -serve_actionability_dir ~{sub(serveActionability[0], basename(serveActionability[0]), "")} \ + -doid_json ~{doidJson} \ + -purple_purity_tsv ~{purplePurity} \ + -purple_qc_file ~{purpleQc} \ + -purple_somatic_driver_catalog_tsv ~{purpleDriverCatalogSomatic} \ + -purple_germline_driver_catalog_tsv ~{purpleDriverCatalogGermline} \ + -purple_somatic_variant_vcf ~{purpleSomaticVariants} \ + -purple_germline_variant_vcf ~{purpleGermlineVariants} \ + -purple_gene_copy_number_tsv ~{purpleGeneCopyNumber} \ + -linx_fusion_tsv ~{linxFusion} \ + -linx_breakend_tsv ~{linxBreakend} \ + -linx_driver_catalog_tsv ~{linxDriversCatalog} \ + -chord_prediction_txt ~{chordPrediction} \ + -annotated_virus_tsv ~{annotatedVirus} + } + + output { + File protectTsv = "~{outputDir}/~{tumorName}.protect.tsv" + } + + runtime { + time_minutes: timeMinutes # !UnknownRuntimeKey + docker: dockerImage + memory: memory + } + + parameter_meta { + refGenomeVersion: {description: "The version of the genome assembly used for alignment. 
Either \"37\" or \"38\".", category: "required"} + tumorName: {description: "The name of the tumor sample.", category: "required"} + referenceName: {description: "The name of the normal sample.", category: "required"} + sampleDoids: {description: "The DOIDs (Human Disease Ontology) for the primary tumor.", category: "required"} + outputDir: {description: "The directory the outputs will be written to.", category: "required"} + serveActionability: {description: "The actionability files generated by hmftools' serve.", category: "required"} + doidJson: {description: "A json with the DOID (Human Disease Ontology) tree.", category: "required"} + purplePurity: {description: "The purity file generated by purple.", category: "required"} + purpleQc: {description: "The QC file generated by purple.", category: "required"} + purpleDriverCatalogSomatic: {description: "The somatic driver catalog generated by purple.", category: "required"} + purpleDriverCatalogGermline: {description: "The germline driver catalog generated by purple.", category: "required"} + purpleSomaticVariants: {description: "The somatic VCF generated by purple.", category: "required"} + purpleSomaticVariantsIndex: {description: "The index for the somatic VCF generated by purple.", category: "required"} + purpleGermlineVariants: {description: "The germline VCF generated by purple.", category: "required"} + purpleGermlineVariantsIndex: {description: "The index of the germline VCF generated by purple.", category: "required"} + purpleGeneCopyNumber: {description: "The gene copy number file generated by purple.", category: "required"} + linxFusion: {description: "The fusion file generated by linx.", category: "required"} + linxBreakend: {description: "The breakend file generated by linx.", category: "required"} + linxDriversCatalog: {description: "The driver catalog generated generated by linx.", category: "required"} + chordPrediction: {description: "The chord prediction file.", category: "required"} + annotatedVirus: {description: "The virus-interpreter output.", category: "required"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task Purple { + input { + String referenceName + String tumorName + String outputDir = "./purple" + Array[File]+ amberOutput + Array[File]+ cobaltOutput + File gcProfile + File somaticVcf + File germlineVcf + File filteredSvVcf + File filteredSvVcfIndex + File fullSvVcf + File fullSvVcfIndex + File referenceFasta + File referenceFastaFai + File referenceFastaDict + File driverGenePanel + File somaticHotspots + File germlineHotspots + Float? highlyDiploidPercentage + Float? somaticMinPuritySpread + #The following should be in the same directory. 
+ File geneDataCsv + File proteinFeaturesCsv + File transExonDataCsv + File transSpliceDataCsv + + Int threads = 1 + Int timeMinutes = 30 + String memory = "9GiB" + String javaXmx = "8G" + # clone of quay.io/biocontainers/hmftools-purple:3.2--hdfd78af_0 with 'ln -s /usr/local/lib/libwebp.so.7 /usr/local/lib/libwebp.so.6' + String dockerImage = "quay.io/biowdl/hmftools-purple:3.2" + } + + command { + PURPLE -Xmx~{javaXmx} \ + -reference ~{referenceName} \ + -germline_vcf ~{germlineVcf} \ + -germline_hotspots ~{germlineHotspots} \ + -tumor ~{tumorName} \ + -output_dir ~{outputDir} \ + -amber ~{sub(amberOutput[0], basename(amberOutput[0]), "")} \ + -cobalt ~{sub(cobaltOutput[0], basename(cobaltOutput[0]), "")} \ + -gc_profile ~{gcProfile} \ + -somatic_vcf ~{somaticVcf} \ + -structural_vcf ~{filteredSvVcf} \ + -sv_recovery_vcf ~{fullSvVcf} \ + -circos /usr/local/bin/circos \ + -ref_genome ~{referenceFasta} \ + -ensembl_data_dir ~{sub(geneDataCsv, basename(geneDataCsv), "")} \ + -run_drivers \ + -somatic_hotspots ~{somaticHotspots} \ + -driver_gene_panel ~{driverGenePanel} \ + ~{"-highly_diploid_percentage " + highlyDiploidPercentage} \ + ~{"-somatic_min_purity_spread " + somaticMinPuritySpread} \ + -threads ~{threads} + } + + output { + File driverCatalogGermlineTsv = "~{outputDir}/~{tumorName}.driver.catalog.germline.tsv" + File driverCatalogSomaticTsv = "~{outputDir}/~{tumorName}.driver.catalog.somatic.tsv" + File purpleCnvGeneTsv = "~{outputDir}/~{tumorName}.purple.cnv.gene.tsv" + File purpleCnvSomaticTsv = "~{outputDir}/~{tumorName}.purple.cnv.somatic.tsv" + File purpleGermlineDeletionTsv = "~{outputDir}/~{tumorName}.purple.germline.deletion.tsv" + File purpleGermlineVcf = "~{outputDir}/~{tumorName}.purple.germline.vcf.gz" + File purpleGermlineVcfIndex = "~{outputDir}/~{tumorName}.purple.germline.vcf.gz.tbi" + File purplePurityRangeTsv = "~{outputDir}/~{tumorName}.purple.purity.range.tsv" + File purplePurityTsv = "~{outputDir}/~{tumorName}.purple.purity.tsv" + File purpleQc = "~{outputDir}/~{tumorName}.purple.qc" + File purpleSegmentTsv = "~{outputDir}/~{tumorName}.purple.segment.tsv" + File purpleSomaticClonalityTsv = "~{outputDir}/~{tumorName}.purple.somatic.clonality.tsv" + File purpleSomaticHistTsv = "~{outputDir}/~{tumorName}.purple.somatic.hist.tsv" + File purpleSomaticVcf = "~{outputDir}/~{tumorName}.purple.somatic.vcf.gz" + File purpleSomaticVcfIndex = "~{outputDir}/~{tumorName}.purple.somatic.vcf.gz.tbi" + File purpleSvVcf = "~{outputDir}/~{tumorName}.purple.sv.vcf.gz" + File purpleSvVcfIndex = "~{outputDir}/~{tumorName}.purple.sv.vcf.gz.tbi" + File purpleVersion = "~{outputDir}/purple.version" + File circosPlot = "~{outputDir}/plot/~{tumorName}.circos.png" + File copynumberPlot = "~{outputDir}/plot/~{tumorName}.copynumber.png" + File inputPlot = "~{outputDir}/plot/~{tumorName}.input.png" + File mapPlot = "~{outputDir}/plot/~{tumorName}.map.png" + File purityRangePlot = "~{outputDir}/plot/~{tumorName}.purity.range.png" + File segmentPlot = "~{outputDir}/plot/~{tumorName}.segment.png" + File somaticClonalityPlot = "~{outputDir}/plot/~{tumorName}.somatic.clonality.png" + File somaticPlot = "~{outputDir}/plot/~{tumorName}.somatic.png" + File? 
somaticRainfallPlot = "~{outputDir}/plot/~{tumorName}.somatic.rainfall.png"
+        File circosNormalRatio = "~{outputDir}/circos/~{referenceName}.ratio.circos"
+        File circosBaf = "~{outputDir}/circos/~{tumorName}.baf.circos"
+        File circosConf = "~{outputDir}/circos/~{tumorName}.circos.conf"
+        File circosCnv = "~{outputDir}/circos/~{tumorName}.cnv.circos"
+        File circosIndel = "~{outputDir}/circos/~{tumorName}.indel.circos"
+        File circosInputConf = "~{outputDir}/circos/~{tumorName}.input.conf"
+        File circosLink = "~{outputDir}/circos/~{tumorName}.link.circos"
+        File circosMap = "~{outputDir}/circos/~{tumorName}.map.circos"
+        File circosTumorRatio = "~{outputDir}/circos/~{tumorName}.ratio.circos"
+        File circosSnp = "~{outputDir}/circos/~{tumorName}.snp.circos"
+        File circosGaps = "~{outputDir}/circos/gaps.txt"
+        Array[File] outputs = [driverCatalogSomaticTsv, purpleCnvGeneTsv,
+            purpleCnvSomaticTsv, purplePurityRangeTsv, purplePurityTsv, purpleQc,
+            purpleSegmentTsv, purpleSomaticClonalityTsv, purpleSomaticHistTsv,
+            purpleSomaticVcf, purpleSomaticVcfIndex, purpleSvVcf, purpleSvVcfIndex,
+            purpleVersion, purpleGermlineVcf, purpleGermlineVcfIndex, driverCatalogGermlineTsv]
+        Array[File] plots = select_all([circosPlot, copynumberPlot, inputPlot, mapPlot, purityRangePlot,
+            segmentPlot, somaticClonalityPlot, somaticPlot, somaticRainfallPlot])
+        Array[File] circos = [circosNormalRatio, circosConf, circosIndel, circosLink,
+            circosTumorRatio, circosGaps, circosBaf, circosCnv, circosInputConf, circosMap,
+            circosSnp]
+    }
+
+    runtime {
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        cpu: threads
+        docker: dockerImage
+        memory: memory
+    }
+
+    parameter_meta {
+        referenceName: {description: "The name of the normal sample.", category: "required"}
+        tumorName: {description: "The name of the tumor sample.", category: "required"}
+        outputDir: {description: "The path to the output directory.", category: "common"}
+        amberOutput: {description: "The output files of hmftools amber.", category: "required"}
+        cobaltOutput: {description: "The output files of hmftools cobalt.", category: "required"}
+        gcProfile: {description: "A file describing the GC profile of the reference genome.", category: "required"}
+        somaticVcf: {description: "The somatic variant calling results.", category: "required"}
+        germlineVcf: {description: "The germline variant calling results.", category: "required"}
+        filteredSvVcf: {description: "The filtered structural variant calling results.", category: "required"}
+        filteredSvVcfIndex: {description: "The index for the filtered structural variant VCF.", category: "required"}
+        fullSvVcf: {description: "The unfiltered structural variant calling results.", category: "required"}
+        fullSvVcfIndex: {description: "The index for the unfiltered structural variant VCF.", category: "required"}
+        referenceFasta: {description: "The reference fasta file.", category: "required"}
+        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
+                             category: "required"}
+        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+        driverGenePanel: {description: "A TSV file describing the driver gene panel.", category: "required"}
+        somaticHotspots: {description: "A vcf file with hotspot somatic variant sites.", category: "required"}
+        germlineHotspots: {description: "A vcf file with hotspot germline variant sites.", category: "required"}
+        highlyDiploidPercentage: {description: "Equivalent to PURPLE's `-highly_diploid_percentage` option.", category: "advanced"}
+        somaticMinPuritySpread: {description: "Equivalent to PURPLE's `-somatic_min_purity_spread` option.", category: "advanced"}
+        geneDataCsv: {description: "A CSV file containing gene information, must be in the same directory as `proteinFeaturesCsv`, `transExonDataCsv` and `transSpliceDataCsv`.", category: "required"}
+        proteinFeaturesCsv: {description: "A CSV file containing protein feature information, must be in the same directory as `geneDataCsv`, `transExonDataCsv` and `transSpliceDataCsv`.", category: "required"}
+        transExonDataCsv: {description: "A CSV file containing transcript exon information, must be in the same directory as `geneDataCsv`, `proteinFeaturesCsv` and `transSpliceDataCsv`.", category: "required"}
+        transSpliceDataCsv: {description: "A CSV file containing transcript splicing information, must be in the same directory as `geneDataCsv`, `proteinFeaturesCsv` and `transExonDataCsv`.", category: "required"}
+
+        threads: {description: "The number of threads the program will use.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                  category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
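+
+# Note: the somatic rainfall plot is not always produced by PURPLE, hence the
+# File? declaration above. select_all() drops undefined values, so the plots
+# array only contains files that actually exist.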
+
+task Sage {
+    input {
+        String tumorName
+        File tumorBam
+        File tumorBamIndex
+        File referenceFasta
+        File referenceFastaDict
+        File referenceFastaFai
+        File hotspots
+        File panelBed
+        File highConfidenceBed
+        Boolean hg38 = false
+        Boolean panelOnly = false
+        String outputPath = "./sage.vcf.gz"
+
+        String? referenceName
+        File? referenceBam
+        File? referenceBamIndex
+        Int? hotspotMinTumorQual
+        Int? panelMinTumorQual
+        Int? hotspotMaxGermlineVaf
+        Int? hotspotMaxGermlineRelRawBaseQual
+        Int? panelMaxGermlineVaf
+        Int? panelMaxGermlineRelRawBaseQual
+        String? mnvFilterEnabled
+        File? coverageBed
+
+        Int threads = 32
+        String javaXmx = "16G"
+        String memory = "20GiB"
+        Int timeMinutes = 720
+        String dockerImage = "quay.io/biocontainers/hmftools-sage:2.8--hdfd78af_1"
+    }
+
+    command {
+        SAGE -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
+        -tumor ~{tumorName} \
+        -tumor_bam ~{tumorBam} \
+        ~{"-reference " + referenceName} \
+        ~{"-reference_bam " + referenceBam} \
+        -ref_genome ~{referenceFasta} \
+        -hotspots ~{hotspots} \
+        -panel_bed ~{panelBed} \
+        -high_confidence_bed ~{highConfidenceBed} \
+        -assembly ~{true="hg38" false="hg19" hg38} \
+        ~{"-hotspot_min_tumor_qual " + hotspotMinTumorQual} \
+        ~{"-panel_min_tumor_qual " + panelMinTumorQual} \
+        ~{"-hotspot_max_germline_vaf " + hotspotMaxGermlineVaf} \
+        ~{"-hotspot_max_germline_rel_raw_base_qual " + hotspotMaxGermlineRelRawBaseQual} \
+        ~{"-panel_max_germline_vaf " + panelMaxGermlineVaf} \
+        ~{"-panel_max_germline_rel_raw_base_qual " + panelMaxGermlineRelRawBaseQual} \
+        ~{"-mnv_filter_enabled " + mnvFilterEnabled} \
+        ~{"-coverage_bed " + coverageBed} \
+        ~{true="-panel_only" false="" panelOnly} \
+        -threads ~{threads} \
+        -out ~{outputPath}
+    }
+
+    output {
+        File outputVcf = outputPath
+        File outputVcfIndex = outputPath + ".tbi"
+        File? referenceSageBqrPng = "~{referenceName}.sage.bqr.png"
referenceSageBqrTsv = "~{referenceName}.sage.bqr.tsv" + File tumorSageBqrPng = "~{tumorName}.sage.bqr.png" + File tumorSageBqrTsv = "~{tumorName}.sage.bqr.tsv" + File sageGeneCoverageTsv = "~{tumorName}.sage.gene.coverage.tsv" + } + + runtime { + time_minutes: timeMinutes # !UnknownRuntimeKey + cpu: threads + docker: dockerImage + memory: memory + } + + parameter_meta { + tumorName: {description: "The name of the tumor sample.", category: "required"} + tumorBam: {description: "The BAM file for the tumor sample.", category: "required"} + tumorBamIndex: {description: "The index of the BAM file for the tumor sample.", category: "required"} + referenceName: {description: "The name of the normal/reference sample.", category: "common"} + referenceBam: {description: "The BAM file for the normal sample.", category: "common"} + referenceBamIndex: {description: "The index of the BAM file for the normal sample.", category: "common"} + referenceFasta: {description: "The reference fasta file.", category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", + category: "required"} + referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + hotspots: {description: "A vcf file with hotspot variant sites.", category: "required"} + panelBed: {description: "A bed file describing coding regions to search for in frame indels.", category: "required"} + highConfidenceBed: {description: "A bed files describing high confidence mapping regions.", category: "required"} + hotspotMinTumorQual: {description: "Equivalent to sage's `hotspot_min_tumor_qual` option.", category: "advanced"} + panelMinTumorQual: {description: "Equivalent to sage's `panel_min_tumor_qual` option.", category: "advanced"} + hotspotMaxGermlineVaf: {description: "Equivalent to sage's `hotspot_max_germline_vaf` option.", category: "advanced"} + hotspotMaxGermlineRelRawBaseQual: {description: "Equivalent to sage's `hotspot_max_germline_rel_raw_base_qual` option.", category: "advanced"} + panelMaxGermlineVaf: {description: "Equivalent to sage's `panel_max_germline_vaf` option.", category: "advanced"} + panelMaxGermlineRelRawBaseQual: {description: "Equivalent to sage's `panel_max_germline_vaf` option.", category: "advanced"} + mnvFilterEnabled: {description: "Equivalent to sage's `mnv_filter_enabled` option.", category: "advanced"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task VirusInterpreter { + input { + String sampleId + File purplePurityTsv + File prupleQcFile + File tumorSampleWgsMetricsFile + File virusBreakendTsv + File taxonomyDbTsv + File virusReportingDbTsv + String outputDir = "." 
+
+        String memory = "3GiB"
+        String javaXmx = "2G"
+        Int timeMinutes = 15
+        String dockerImage = "quay.io/biowdl/virus-interpreter:1.2"
+    }
+
+    command {
+        virus-interpreter -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
+        -sample_id ~{sampleId} \
+        -purple_purity_tsv ~{purplePurityTsv} \
+        -purple_qc_file ~{purpleQcFile} \
+        -tumor_sample_wgs_metrics_file ~{tumorSampleWgsMetricsFile} \
+        -virus_breakend_tsv ~{virusBreakendTsv} \
+        -taxonomy_db_tsv ~{taxonomyDbTsv} \
+        -virus_reporting_db_tsv ~{virusReportingDbTsv} \
+        -output_dir ~{outputDir}
+    }
+
+    output {
+        File virusAnnotatedTsv = "~{outputDir}/~{sampleId}.virus.annotated.tsv"
+    }
+
+    runtime {
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        docker: dockerImage
+        memory: memory
+    }
+
+    parameter_meta {
+        sampleId: {description: "The name of the sample.", category: "required"}
+        purplePurityTsv: {description: "The purity file produced by purple.", category: "required"}
+        purpleQcFile: {description: "The QC file produced by purple.", category: "required"}
+        tumorSampleWgsMetricsFile: {description: "The picard WGS metrics file for this sample.", category: "required"}
+        virusBreakendTsv: {description: "The TSV output from virusbreakend.", category: "required"}
+        taxonomyDbTsv: {description: "A taxonomy database tsv.", category: "required"}
+        virusReportingDbTsv: {description: "A virus reporting tsv.", category: "required"}
+        outputDir: {description: "The directory the output will be written to.", category: "common"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                  category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
diff --git a/htseq.wdl b/htseq.wdl
index 900a88a7..92bc4423 100644
--- a/htseq.wdl
+++ b/htseq.wdl
@@ -25,22 +25,24 @@ task HTSeqCount {
         Array[File]+ inputBams
         File gtfFile
         String outputTable = "output.tsv"
-        String format = "bam"
         String order = "pos"
         String stranded = "no"
+        Array[String] additionalAttributes = []
+
         String? featureType
         String? idattr
-        Array[String] additionalAttributes = []
-        String memory = "40G"
-        String dockerImage = "quay.io/biocontainers/htseq:0.11.2--py37h637b7d7_1"
+        Int nprocesses = 1
+        String memory = "8GiB"
+        Int timeMinutes = 1440 #10 + ceil(size(inputBams, "GiB") * 60) FIXME
+        String dockerImage = "quay.io/biocontainers/htseq:0.12.4--py37hb3f55d8_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{outputTable})"
         htseq-count \
-        -f ~{format} \
+        --nprocesses ~{nprocesses} \
         -r ~{order} \
         -s ~{stranded} \
         ~{"--type " + featureType} \
@@ -48,7 +50,7 @@ task HTSeqCount {
         ~{true="--additional-attr " false="" length(additionalAttributes) > 0 }~{sep=" --additional-attr " additionalAttributes} \
         ~{sep=" " inputBams} \
         ~{gtfFile} \
-        > ~{outputTable}
+        -c ~{outputTable}
     }

     output {
@@ -56,54 +58,28 @@ task HTSeqCount {
     }

     runtime {
+        cpu: nprocesses
         memory: memory
+        time_minutes: timeMinutes
         docker: dockerImage
     }

     parameter_meta {
-        inputBams: {
-            description: "The input BAM files.",
-            category: "required"
-        }
-        gtfFile: {
-            description: "A GTF/GFF file containing the features of interest.",
-            category: "required"
-        }
-        outputTable: {
-            description: "The path to which the output table should be written.",
-            category: "common"
-        }
-        format: {
-            description: "Equivalent to the -f option of htseq-count.",
-            category: "advanced"
-        }
-        order: {
-            description: "Equivalent to the -r option of htseq-count.",
-            category: "advanced"
-        }
-        stranded: {
-            description: "Equivalent to the -s option of htseq-count.",
-            category: "common"
-        }
-        featureType: {
-            description: "Equivalent to the --type option of htseq-count.",
-            category: "advanced"
-        }
-        idattr: {
-            description: "Equivalent to the --idattr option of htseq-count.",
-            category: "advanced"
-        }
-        additionalAttributes: {
-            description: "Equivalent to the --additional-attr option of htseq-count.",
-            category: "advanced"
-        }
-        memory: {
-            description: "The amount of memory the job requires in GB.",
-            category: "advanced"
-        }
-        dockerImage: {
-            description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-            category: "advanced"
-        }
+        # inputs
+        inputBams: {description: "The input BAM files.", category: "required"}
+        gtfFile: {description: "A GTF/GFF file containing the features of interest.", category: "required"}
+        outputTable: {description: "The path to which the output table should be written.", category: "common"}
+        order: {description: "Equivalent to the -r option of htseq-count.", category: "advanced"}
+        stranded: {description: "Equivalent to the -s option of htseq-count.", category: "common"}
+        additionalAttributes: {description: "Equivalent to the --additional-attr option of htseq-count.", category: "advanced"}
+        featureType: {description: "Equivalent to the --type option of htseq-count.", category: "advanced"}
+        idattr: {description: "Equivalent to the --idattr option of htseq-count.", category: "advanced"}
+        nprocesses: {description: "Number of processes to run htseq with.", category: "advanced"}
+        memory: {description: "The amount of memory the job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + counts: {description: "Count table based on input BAM file."} } } diff --git a/isoseq3.wdl b/isoseq3.wdl index 8cc0db8f..77f19f80 100644 --- a/isoseq3.wdl +++ b/isoseq3.wdl @@ -1,6 +1,6 @@ version 1.0 -# Copyright (c) 2020 Sequencing Analysis Support Core - Leiden University Medical Center +# Copyright (c) 2020 Leiden University Medical Center # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -8,10 +8,10 @@ version 1.0 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -22,81 +22,72 @@ version 1.0 task Refine { input { - Int minPolyAlength = 20 + Int minPolyALength = 20 Boolean requirePolyA = false String logLevel = "WARN" File inputBamFile + File inputBamIndex File primerFile - String outputPrefix + String outputDir + String outputNamePrefix - Int cores = 4 - String memory = "10G" - String dockerImage = "quay.io/biocontainers/isoseq3:3.3.0--0" + Int threads = 2 + String memory = "2GiB" + Int timeMinutes = 30 + String dockerImage = "quay.io/biocontainers/isoseq3:3.4.0--0" } - command <<< + command { set -e - mkdir -p "$(dirname ~{outputPrefix})" - - # Create a unique output name base on the input bam file. - bamBasename="$(basename ~{inputBamFile})" - bamNewName="${bamBasename/fl/flnc}" - folderDirname="$(dirname ~{outputPrefix})" - combinedOutput="${folderDirname}/${bamNewName}" - + mkdir -p "~{outputDir}" isoseq3 refine \ - --min-polya-length ~{minPolyAlength} \ + --min-polya-length ~{minPolyALength} \ ~{true="--require-polya" false="" requirePolyA} \ --log-level ~{logLevel} \ - --num-threads ~{cores} \ - --log-file "${bamNewName}.stderr.log" \ + --num-threads ~{threads} \ + --log-file "~{outputDir}/~{outputNamePrefix}.stderr.log" \ ~{inputBamFile} \ ~{primerFile} \ - ${bamNewName} - - # Copy commands below are needed because naming schema for Refine output - # can not be correctly handled in the WDL output section. 
- cp "${bamNewName}" "${combinedOutput}" - cp "${bamNewName}.pbi" "${combinedOutput}.pbi" - cp "${bamNewName/bam/consensusreadset}.xml" "${combinedOutput/bam/consensusreadset}.xml" - cp "${bamNewName/bam/filter_summary}.json" "${combinedOutput/bam/filter_summary}.json" - cp "${bamNewName/bam/report}.csv" "${combinedOutput/bam/report}.csv" - cp "${bamNewName}.stderr.log" "${combinedOutput}.stderr.log" - >>> + "~{outputDir}/~{outputNamePrefix}.bam" + } output { - Array[File] outputFLNCfile = glob("*.bam") - Array[File] outputFLNCindexFile = glob("*.bam.pbi") - Array[File] outputConsensusReadsetFile = glob("*.consensusreadset.xml") - Array[File] outputFilterSummaryFile = glob("*.filter_summary.json") - Array[File] outputReportFile = glob("*.report.csv") - Array[File] outputSTDERRfile = glob("*.stderr.log") + File refineBam = outputDir + "/" + outputNamePrefix + ".bam" + File refineBamIndex = outputDir + "/" + outputNamePrefix + ".bam.pbi" + File refineConsensusReadset = outputDir + "/" + outputNamePrefix + ".consensusreadset.xml" + File refineFilterSummary = outputDir + "/" + outputNamePrefix + ".filter_summary.json" + File refineReport = outputDir + "/" + outputNamePrefix + ".report.csv" + File refineStderr = outputDir + "/" + outputNamePrefix + ".stderr.log" } runtime { - cpu: cores + cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - minPolyAlength: {description: "Minimum poly(A) tail length.", category: "advanced"} - requirePolyA: {description: "Require FL reads to have a poly(A) tail and remove it.", category: "common"} + minPolyALength: {description: "Minimum poly(A) tail length.", category: "advanced"} + requirePolyA: {description: "Require fl reads to have a poly(A) tail and remove it.", category: "common"} logLevel: {description: "Set log level. Valid choices: (TRACE, DEBUG, INFO, WARN, FATAL).", category: "advanced"} - inputBamFile: {description: "BAM input file.", category: "required"} + inputBamFile: {description: "Bam input file.", category: "required"} + inputBamIndex: {description: "Index for the Bam input file.", category: "required"} primerFile: {description: "Barcode/primer fasta file.", category: "required"} - outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} - cores: {description: "The number of cores to be used.", category: "advanced"} + outputDir: {description: "Output directory path.", category: "required"} + outputNamePrefix: {description: "Basename of the output files.", category: "required"} + threads: {description: "The number of threads to be used.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputFLNCfile: {description: "Filtered reads output file."} - outputFLNCindexFile: {description: "Index of filtered reads output file."} - outputSTDERRfile: {description: "Refine STDERR log file."} - outputConsensusReadsetFile: {description: "Refine consensus readset XML file."} - outputFilterSummaryFile: {description: "Refine summary file."} - outputReportFile: {description: "Refine report file."} + refineBam: {description: "Filtered reads output file."} + refineBamIndex: {description: "Index of filtered reads output file."} + refineConsensusReadset: {description: "Refine consensus readset xml file."} + refineFilterSummary: {description: "Refine summary file."} + refineReport: {description: "Refine report file."} + refineStderr: {description: "Refine stderr log file."} } } diff --git a/lima.wdl b/lima.wdl index 747959a1..eece2b3f 100644 --- a/lima.wdl +++ b/lima.wdl @@ -1,6 +1,6 @@ version 1.0 -# Copyright (c) 2020 Sequencing Analysis Support Core - Leiden University Medical Center +# Copyright (c) 2020 Leiden University Medical Center # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -8,10 +8,10 @@ version 1.0 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -48,14 +48,15 @@ task Lima { File barcodeFile String outputPrefix - Int cores = 4 - String memory = "10G" - String dockerImage = "quay.io/biocontainers/lima:1.11.0--0" + Int threads = 2 + String memory = "2GiB" + Int timeMinutes = 30 + String dockerImage = "quay.io/biocontainers/lima:2.2.0--h9ee0642_0" } Map[String, String] libraryDesignOptions = {"same": "--same", "different": "--different", "neighbors": "--neighbors"} - command { + command <<< set -e mkdir -p "$(dirname ~{outputPrefix})" lima \ @@ -81,35 +82,33 @@ task Lima { --guess-min-count ~{guessMinCount} \ ~{true="--peek-guess" false="" peekGuess} \ --log-level ~{logLevel} \ - --num-threads ~{cores} \ - ~{"--log-file " + outputPrefix + ".fl.stderr.log"} \ + --num-threads ~{threads} \ + ~{"--log-file " + outputPrefix + ".lima.stderr.log"} \ ~{inputBamFile} \ ~{barcodeFile} \ - ~{basename(outputPrefix) + ".fl.bam"} + ~{outputPrefix + ".bam"} - # Move commands below are needed because glob command does not find - # multiple bam/bam.pbi/subreadset.xml files when not located in working - # directory. 
- mv "~{basename(outputPrefix)}.fl.json" "~{outputPrefix}.fl.json" - mv "~{basename(outputPrefix)}.fl.lima.counts" "~{outputPrefix}.fl.lima.counts" - mv "~{basename(outputPrefix)}.fl.lima.report" "~{outputPrefix}.fl.lima.report" - mv "~{basename(outputPrefix)}.fl.lima.summary" "~{outputPrefix}.fl.lima.summary" - } + dirName="$(dirname ~{outputPrefix})" + find "$(cd ${dirName}; pwd)" -name "*.bam" > bamFiles.txt + find "$(cd ${dirName}; pwd)" -name "*.bam.pbi" > bamIndexes.txt + find "$(cd ${dirName}; pwd)" -name "*.consensusreadset.xml" > consensusreadset.txt + >>> output { - Array[File] outputFLfile = glob("*.bam") - Array[File] outputFLindexFile = glob("*.bam.pbi") - Array[File] outputFLxmlFile = glob("*.subreadset.xml") - File outputSTDERRfile = outputPrefix + ".fl.stderr.log" - File outputJSONfile = outputPrefix + ".fl.json" - File outputCountsFile = outputPrefix + ".fl.lima.counts" - File outputReportFile = outputPrefix + ".fl.lima.report" - File outputSummaryFile = outputPrefix + ".fl.lima.summary" + Array[File] limaBam = read_lines("bamFiles.txt") + Array[File] limaBamIndex = read_lines("bamIndexes.txt") + Array[File] limaXml = read_lines("consensusreadset.txt") + File limaStderr = outputPrefix + ".lima.stderr.log" + File limaJson = outputPrefix + ".json" + File limaCounts = outputPrefix + ".lima.counts" + File limaReport = outputPrefix + ".lima.report" + File limaSummary = outputPrefix + ".lima.summary" } runtime { - cpu: cores + cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } @@ -129,29 +128,30 @@ task Lima { minEndScore: {description: "Minimum end barcode score threshold is applied to the individual leading and trailing ends.", category: "advanced"} minSignalIncrease: {description: "The minimal score difference, between first and combined, required to call a barcode pair different.", category: "advanced"} minScoreLead: {description: "The minimal score lead required to call a barcode pair significant.", category: "common"} - ccsMode: {description: "CCS mode, use optimal alignment options.", category: "common"} - splitBamNamed: {description: "Split BAM output by resolved barcode pair name.", category: "common"} + ccsMode: {description: "Ccs mode, use optimal alignment options.", category: "common"} + splitBamNamed: {description: "Split bam output by resolved barcode pair name.", category: "common"} scoredAdapterRatio: {description: "Minimum ratio of scored vs sequenced adapters.", category: "advanced"} peek: {description: "Demux the first N ZMWs and return the mean score, 0 means peeking deactivated.", category: "advanced"} guess: {description: "Try to guess the used barcodes, using the provided mean score threshold, 0 means guessing deactivated.", category: "advanced"} guessMinCount: {description: "Minimum number of ZMWs observed to whitelist barcodes.", category: "advanced"} peekGuess: {description: "Try to infer the used barcodes subset, by peeking at the first 50,000 ZMWs.", category: "advanced"} logLevel: {description: "Set log level. 
Valid choices: (TRACE, DEBUG, INFO, WARN, FATAL).", category: "advanced"} - inputBamFile: {description: "BAM input file.", category: "required"} + inputBamFile: {description: "Bam input file.", category: "required"} barcodeFile: {description: "Barcode/primer fasta file.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} - cores: {description: "The number of cores to be used.", category: "advanced"} + threads: {description: "The number of threads to be used.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputFLfile: {description: "Demultiplexed reads output file(s)."} - outputFLindexFile: {description: "Index of demultiplexed reads output file(s)."} - outputFLxmlFile: {description: "XML file of the subreadset(s)."} - outputSTDERRfile: {description: "Lima STDERR log file."} - outputJSONfile: {description: "Lima JSON file."} - outputCountsFile: {description: "Lima counts file."} - outputReportFile: {description: "Lima report file."} - outputSummaryFile: {description: "Lima summary file."} + limaBam: {description: "Demultiplexed reads output file(s)."} + limaBamIndex: {description: "Index of demultiplexed reads output file(s)."} + limaXml: {description: "Xml file of the subreadset(s)."} + limaStderr: {description: "Lima stderr log file."} + limaJson: {description: "Lima json file."} + limaCounts: {description: "Lima counts file."} + limaReport: {description: "Lima report file."} + limaSummary: {description: "Lima summary file."} } } diff --git a/macs2.wdl b/macs2.wdl index fad3cb00..5ccc5a5f 100644 --- a/macs2.wdl +++ b/macs2.wdl @@ -24,14 +24,21 @@ task PeakCalling { input { Array[File]+ inputBams Array[File]+ inputBamsIndex - Array[File]+? controlBams - Array[File]+? controlBamsIndex - String outDir + Array[File] controlBams + Array[File] controlBamsIndex + String outDir = "macs2" String sampleName + String format = "AUTO" Boolean nomodel = false - - Int threads = 1 - String memory = "8G" + String? gensz + Int? extsize + Int? shiftsize + Float? pval_thres + Boolean bdg = false + String? 
keepdup + Boolean callsummits = false + Int timeMinutes = 600 # Default to 10 hours + String memory = "8GiB" String dockerImage = "quay.io/biocontainers/macs2:2.1.2--py27r351_0" } @@ -39,10 +46,18 @@ task PeakCalling { set -e macs2 callpeak \ --treatment ~{sep = ' ' inputBams} \ - ~{true="--control" false="" defined(controlBams)} ~{sep = ' ' controlBams} \ + ~{true="--control" false="" length(controlBams) > 0} ~{sep = ' ' controlBams} \ --outdir ~{outDir} \ --name ~{sampleName} \ - ~{true='--nomodel' false='' nomodel} + ~{"-f " + format} \ + ~{"-g " + gensz} \ + ~{"-p " + pval_thres} \ + ~{"--shift " + shiftsize} \ + ~{"--extsize " + extsize} \ + ~{true='--nomodel' false='' nomodel} \ + ~{true='-B' false='' bdg} \ + ~{"--keep-dup " + keepdup} \ + ~{true='--call-summits' false='' callsummits} } output { @@ -50,8 +65,29 @@ task PeakCalling { } runtime { - cpu: threads + cpu: 1 memory: memory docker: dockerImage + time_minutes: timeMinutes + } + parameter_meta { + inputBams: {description: "The BAM files on which to perform peak calling.", category: "required"} + inputBamsIndex: {description: "The indexes for the input BAM files.", category: "required"} + controlBams: {description: "Control BAM files for the input bam files.", category: "common"} + controlBamsIndex: {description: "The indexes for the control BAM files.", category: "common"} + sampleName: {description: "Name of the sample to be analysed", category: "required"} + outDir: {description: "All output files will be written in this directory.", category: "advanced"} + nomodel: {description: "Whether or not to build the shifting model.", category: "advanced"} + gensz: {description: "macs2 argument for setting the mappable genome size or effective genome size which is defined as the genome size which can be sequenced.", category: "advanced"} + pval_thres: {description: "macs2 argument for setting the p-value cutoff. If -p is specified, MACS2 will use p-value instead of q-value.", category: "advanced"} + shiftsize: {description: "macs2 argument to set an arbitrary shift in bp. Can be negative to indicate direction.", category: "advanced"} + extsize: {description: "macs2 argument to extend reads in 5'->3' direction to fix-sized fragments.", category: "advanced"} + bdg: {description: "macs2 argument that enables the storage of the fragment pileup, control lambda in bedGraph files.", category: "advanced"} + keepdup: {description: "macs2 argument that controls the behavior towards duplicate tags at the exact same location.", category: "advanced"} + callsummits: {description: "macs2 argument to reanalyze the shape of signal profile to deconvolve subpeaks within each peak called from the general procedure.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + format: {description: "Which format to use. Use BAMPE for paired-end reads.", category: "common"} } -} \ No newline at end of file +} diff --git a/manta.wdl b/manta.wdl index 5006a01e..fde8c208 100644 --- a/manta.wdl +++ b/manta.wdl @@ -27,12 +27,14 @@ task Germline { File referenceFasta File referenceFastaFai String runDir = "./manta_run" + Boolean exome = false + File? callRegions File? 
callRegionsIndex
-        Boolean exome = false
 
        Int cores = 1
        Int memoryGb = 4
+        Int timeMinutes = 2880
        String dockerImage = "quay.io/biocontainers/manta:1.4.0--py27_1"
    }

@@ -58,8 +60,9 @@ task Germline {

    runtime {
        cpu: cores
-        memory: "~{memoryGb}G"
+        memory: "~{memoryGb}GiB"
        docker: dockerImage
+        time_minutes: timeMinutes
    }

    parameter_meta {
@@ -69,12 +72,17 @@ task Germline {
        referenceFasta: {description: "The reference fasta file also used for mapping.", category: "required"}
        referenceFastaFai: {description: "Fasta index (.fai) file of the reference.", category: "required"}
        runDir: {description: "The directory to use as run/output directory.", category: "common"}
+        exome: {description: "Whether or not the data is from exome sequencing.", category: "common"}
        callRegions: {description: "The bed file which indicates the regions to operate on.", category: "common"}
        callRegionsIndex: {description: "The index of the bed file which indicates the regions to operate on.", category: "common"}
-        exome: {description: "Whether or not the data is from exome sequencing.", category: "common"}
        cores: {description: "The number of cores required to run the program.", category: "required"}
        memoryGb: {description: "The memory required to run Manta, in gigabytes.", category: "required"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        mantaVCF: {description: "SVs and indels scored and genotyped under a diploid model for the set of samples in a joint diploid sample analysis or for the normal sample in a tumor/normal subtraction analysis."}
+        mantaVCFindex: {description: "Index of output mantaVCF."}
    }
}

@@ -82,17 +90,19 @@ task Somatic {
    input {
        File tumorBam
        File tumorBamIndex
-        File? normalBam
-        File? normalBamIndex
        File referenceFasta
        File referenceFastaFai
        String runDir = "./manta_run"
+        Boolean exome = false
+
+        File? normalBam
+        File? normalBamIndex
        File? callRegions
        File? 
callRegionsIndex - Boolean exome = false Int cores = 1 Int memoryGb = 4 + Int timeMinutes = 2880 String dockerImage = "quay.io/biocontainers/manta:1.4.0--py27_1" } @@ -128,23 +138,36 @@ task Somatic { runtime { cpu: cores - memory: "~{memoryGb}G" + memory: "~{memoryGb}GiB" docker: dockerImage + time_minutes: timeMinutes } parameter_meta { + # inputs tumorBam: {description: "The tumor/case sample's BAM file.", category: "required"} tumorBamIndex: {description: "The index for the tumor/case sample's BAM file.", category: "required"} - normalBam: {description: "The normal/control sample's BAM file.", category: "common"} - normalBamIndex: {description: "The index for the normal/control sample's BAM file.", category: "common"} referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} runDir: {description: "The directory to use as run/output directory.", category: "common"} + exome: {description: "Whether or not the data is from exome sequencing.", category: "common"} + normalBam: {description: "The normal/control sample's BAM file.", category: "common"} + normalBamIndex: {description: "The index for the normal/control sample's BAM file.", category: "common"} callRegions: {description: "The bed file which indicates the regions to operate on.", category: "common"} callRegionsIndex: {description: "The index of the bed file which indicates the regions to operate on.", category: "common"} - exome: {description: "Whether or not the data is from exome sequencing.", category: "common"} cores: {description: "The number of cores to use.", category: "advanced"} memoryGb: {description: "The amount of memory this job will use in Gigabytes.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + candidateSmallIndelsVcf: {description: "Subset of the candidateSV.vcf.gz file containing only simple insertion and deletion variants less than the minimum scored variant size."} + candidateSmallIndelsVcfIndex: {description: "Index of output VCF file candidateSmallIndelsVcf."} + candidateSVVcf: {description: "Unscored SV and indel candidates."} + candidatSVVcfIndex: {description: "Index of output VCF file candidateSVVcf."} + tumorSVVcf: {description: "Subset of the candidateSV.vcf.gz file after removing redundant candidates and small indels less than the minimum scored variant size."} + tumorSVVcfIndex: {description: "Index of output VCF file tumorSVVcf."} + diploidSV: {description: "SVs and indels scored and genotyped under a diploid model for the set of samples in a joint diploid sample analysis or for the normal sample in a tumor/normal subtraction analysis."} + diploidSVindex: {description: "Index of output VCF file diploidSV."} } } diff --git a/minimap2.wdl b/minimap2.wdl index fd28d4a9..a7584beb 100644 --- a/minimap2.wdl +++ b/minimap2.wdl @@ -1,6 +1,6 @@ version 1.0 -# Copyright (c) 2019 Sequencing Analysis Support Core - Leiden University Medical Center +# Copyright (c) 2019 Leiden University Medical Center # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -8,10 +8,10 @@ version 1.0 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -31,8 +31,9 @@ task Indexing { Int? 
splitIndex

        Int cores = 1
-        String memory = "4G"
-        String dockerImage = "quay.io/biocontainers/minimap2:2.17--h84994c4_0"
+        String memory = "4GiB"
+        Int timeMinutes = 10
+        String dockerImage = "quay.io/biocontainers/minimap2:2.20--h5bf99c6_0"
    }

    command {
@@ -49,18 +50,19 @@
    }

    output {
-        File outputIndexFile = outputPrefix + ".mmi"
+        File indexFile = outputPrefix + ".mmi"
    }

    runtime {
        cpu: cores
        memory: memory
+        time_minutes: timeMinutes
        docker: dockerImage
    }

    parameter_meta {
-        # input
-        useHomopolymerCompressedKmer: {description: "Use homopolymer-compressed k-mer (preferrable for PacBio).", category: "advanced"}
+        # inputs
+        useHomopolymerCompressedKmer: {description: "Use homopolymer-compressed k-mer (preferable for PacBio).", category: "advanced"}
        kmerSize: {description: "K-mer size (no larger than 28).", category: "advanced"}
        minimizerWindowSize: {description: "Minimizer window size.", category: "advanced"}
        outputPrefix: {description: "Output directory path + output file prefix.", category: "required"}
@@ -68,90 +70,131 @@
        splitIndex: {description: "Split index for every ~NUM input bases.", category: "advanced"}
        cores: {description: "The number of cores to be used.", category: "advanced"}
        memory: {description: "The amount of memory available to the job.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}

-        # output
-        outputIndexFile: {description: "Indexed reference file."}
+        # outputs
+        indexFile: {description: "Indexed reference file."}
    }
}

task Mapping {
    input {
        String presetOption
-        Int kmerSize = 15
-        Boolean skipSelfAndDualMappings = false
-        Boolean outputSAM = false
        String outputPrefix
-        Boolean addMDtagToSAM = false
-        Boolean secondaryAlignment = false
        File referenceFile
        File queryFile
+
+        Int compressionLevel = 1
+        Boolean nameSorted = false
+        # MM, ML, MN -> methylation tags.
+        # Also keep the following tags so Sequali can run on the mapped bam file and get ONT information.
+        # ch -> channel
+        # st -> start time
+        # du -> duration
+        # dx -> Whether read was duplex
+        # pi -> Parent ID for split read
+
+        String tagsToKeep = "MM,ML,MN,ch,st,du,dx,pi"
+        Boolean skipSelfAndDualMappings = false
+        Boolean addMDTagToSam = false
+        Boolean secondaryAlignment = true
+
+        Int? kmerSize
        Int? maxIntronLength
        Int? maxFragmentLength
        Int? retainMaxSecondaryAlignments
        Int? matchingScore
        Int? mismatchPenalty
        String? howToFindGTAG
-
-        Int cores = 4
-        String memory = "30G"
-        String dockerImage = "quay.io/biocontainers/minimap2:2.17--h84994c4_0"
+        String? readgroup
+
+        Int sortThreads = 2
+        Int sortMemoryGb = 1
+        Int cores = 8
+        String memory = "24GiB"
+        Int timeMinutes = 1 + ceil(size(queryFile, "G") * 200 / cores)
+        # Minimap2 2.28, samtools 1.20.
+        String dockerImage = "quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:3161f532a5ea6f1dec9be5667c9efc2afdac6104-0"
    }

-    command {
-        set -e
+    # Always run data through samtools fastq. This supports both FASTQ and uBAM
+    # files. It does remove any existing FASTQ comments, but this should not be
+    # problematic for most files. 
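+
+    # Editorial illustration (not part of the original task): a uBAM record
+    # carrying e.g. MM:Z:C+m,5; is emitted by `samtools fastq -T ...` with the
+    # tag placed in the FASTQ header comment ("@read1 MM:Z:C+m,5; ..."), and
+    # minimap2's -y flag copies that comment back into the output SAM record,
+    # so the methylation and ONT tags listed above survive alignment.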
+ + command <<< + set -e -o pipefail mkdir -p "$(dirname ~{outputPrefix})" + samtools fastq -T "~{tagsToKeep}" ~{queryFile} | \ minimap2 \ + -a \ -x ~{presetOption} \ - -k ~{kmerSize} \ ~{true="-X" false="" skipSelfAndDualMappings} \ - ~{true="-a" false="" outputSAM} \ - -o ~{outputPrefix} \ - ~{true="--MD" false="" addMDtagToSAM} \ + ~{true="--MD" false="" addMDTagToSam} \ --secondary=~{true="yes" false="no" secondaryAlignment} \ + -y \ -t ~{cores} \ + ~{"-k " + kmerSize} \ ~{"-G " + maxIntronLength} \ ~{"-F " + maxFragmentLength} \ ~{"-N " + retainMaxSecondaryAlignments} \ ~{"-A " + matchingScore} \ ~{"-B " + mismatchPenalty} \ ~{"-u " + howToFindGTAG} \ + ~{"-R '" + readgroup}~{false="" true="'" defined(readgroup)} \ ~{referenceFile} \ - ~{queryFile} - } + - \ + | samtools sort \ + ~{true="-N" false="" nameSorted} \ + --threads ~{sortThreads - 1} \ + -l ~{compressionLevel} \ + -m ~{sortMemoryGb}G \ + -o ~{outputPrefix}.bam + samtools index ~{outputPrefix}.bam + >>> output { - File outputAlignmentFile = outputPrefix + File bam = "~{outputPrefix}.bam" + File bamIndex = "~{outputPrefix}.bam.bai" } runtime { cpu: cores memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs presetOption: {description: "This option applies multiple options at the same time.", category: "common"} kmerSize: {description: "K-mer size (no larger than 28).", category: "advanced"} - outputSAM: {description: "Output in the SAM format.", category: "common"} + skipSelfAndDualMappings: {description: "Skip self and dual mappings (for the all-vs-all mode).", category: "advanced"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} + addMDTagToSam: {description: "Adds a MD tag to the sam output file.", category: "common"} + secondaryAlignment: {description: "Whether to output secondary alignments.", category: "advanced"} + referenceFile: {description: "Reference fasta file.", category: "required"} + queryFile: {description: "Input fasta file.", category: "required"} maxIntronLength: {description: "Max intron length (effective with -xsplice; changing -r).", category: "advanced"} maxFragmentLength: {description: "Max fragment length (effective with -xsr or in the fragment mode).", category: "advanced"} - skipSelfAndDualMappings: {description: "Skip self and dual mappings (for the all-vs-all mode).", category: "advanced"} - retainMaxSecondaryAlignments: {description: "Retain at most INT secondary alignments.", category: "advanced"} + retainMaxSecondaryAlignments: {description: "Retain at most N secondary alignments.", category: "advanced"} matchingScore: {description: "Matching score.", category: "advanced"} mismatchPenalty: {description: "Mismatch penalty.", category: "advanced"} + tagsToKeep: {description: "Tags to keep from the input unaligned BAM file.", category: "Advanced"} howToFindGTAG: {description: "How to find GT-AG. 
f:transcript strand, b:both strands, n:don't match GT-AG.", category: "common"} - addMDtagToSAM: {description: "Adds a MD tag to the SAM output file.", category: "common"} - secondaryAlignment: {description: "Whether to output secondary alignments.", category: "advanced"} - referenceFile: {description: "Reference fasta file.", category: "required"} - queryFile: {description: "Input fasta file.", category: "required"} + compressionLevel: {description: "compressionLevel for the output file", category: "advanced"} + sortThreads: {description: "Extra sorting threads used for samtools sort", category: "advanced"} + sortMemoryGb: {description: "Amount of memory set for sorting", category: "advanced"} + nameSorted: {description: "Output a name sorted file instead", category: "common"} + cores: {description: "The number of cores to be used.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} - # output - outputAlignmentFile: {description: "Mapping and alignment between collections of DNA sequences file."} + # outputs + bam: {description: "Mapping and alignment between collections of dna sequences file in BAM format."} + bamIndex: {description: "Accompanying index file for the BAM file."} } } diff --git a/modkit.wdl b/modkit.wdl new file mode 100644 index 00000000..ddf4dbf7 --- /dev/null +++ b/modkit.wdl @@ -0,0 +1,250 @@ +version 1.0 + +# Copyright (c) 2025 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task Pileup { + input { + File bam + File bamIndex + String outputBed = "output.bedMethyl" + String outputBedGraph = "combined.bedgraph" + File referenceFasta + File referenceFastaFai + + Int? intervalSize + File? includeBed + String? filterThreshold + String? filterPercentile + + Boolean cpg = false + Boolean combineMods = false + Boolean combineStrands = false + String? 
ignore
+        String logFilePath = "modkit.log"
+
+        Int threads = 8
+        String memory = "4GiB"
+        Int timeMinutes = 2880 / threads  # 2 days / threads.
+        String dockerImage = "quay.io/biocontainers/ont-modkit:0.4.3--hcdda2d0_0"
+    }
+
+    command <<<
+        set -e
+        mkdir -p "$(dirname ~{outputBed})"
+        mkdir -p "$(dirname ~{logFilePath})"
+        modkit pileup \
+        --threads ~{threads} \
+        ~{"--interval-size " + intervalSize} \
+        ~{"--include-bed " + includeBed} \
+        ~{"--ignore " + ignore} \
+        --ref ~{referenceFasta} \
+        ~{true="--cpg" false="" cpg} \
+        ~{true="--combine-mods" false="" combineMods} \
+        ~{true="--combine-strands" false="" combineStrands} \
+        ~{"--filter-percentile " + filterPercentile} \
+        ~{"--filter-threshold " + filterThreshold} \
+        --log-filepath ~{logFilePath} \
+        ~{bam} \
+        - | tee ~{outputBed} | awk -v OFS="\t" '{print $1, $2, $3, $11, $10 >> "~{outputBedGraph}_"$4"_"$6".bedGraph"}'
+        # Separately generate the combined file as well, so users can have a choice.
+        cat ~{outputBed} | awk -v OFS="\t" '{print $1, $2, $3, $11, $10}' > ~{outputBedGraph}
+    >>>
+
+    # You can use `modkit pileup ${bam_path} - | tee out.bedmethyl | awk -v OFS="\t" '{print $1, $2, $3, $11, $10}' > out.bg`
+    # to get both outputs at once without running anything twice.
+    # https://github.com/nanoporetech/modkit/issues/210#issuecomment-2181706374
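+
+    # Editorial illustration of the awk mapping above: for a bedMethyl record
+    # such as
+    #   chr1  10468  10469  m  23  +  ...  23  91.30
+    # column 10 holds the valid coverage and column 11 the percent modified,
+    # so the combined bedGraph line becomes
+    #   chr1  10468  10469  91.30  23
+    # while, with the default outputBedGraph, the per-modification file for
+    # this record is named "combined.bedgraph_m_+.bedGraph".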
+
+    output {
+        File out = outputBed  # The combined bedMethyl file.
+        File outGraph = outputBedGraph  # The combined bedGraph file.
+        Array[File] outFiles = glob(outputBedGraph + "*.bedGraph")  # Per-modification/strand bedGraph files.
+        File logFile = logFilePath
+    }
+
+    runtime {
+        docker: dockerImage
+        cpu: threads
+        memory: memory
+        time_minutes: timeMinutes
+    }
+
+    parameter_meta {
+        # input
+        bam: {description: "The input alignment file.", category: "required"}
+        bamIndex: {description: "The index for the input alignment file.", category: "required"}
+        referenceFasta: {description: "The reference fasta file.", category: "required"}
+        referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+        outputBed: {description: "The output name where the bedMethyl file should be placed.", category: "common"}
+        outputBedGraph: {description: "The output name where the bedGraph file should be placed.", category: "common"}
+
+        intervalSize: {description: "Sets the interval size.", category: "advanced"}
+        includeBed: {description: "Bed file with regions to include.", category: "advanced"}
+        cpg: {description: "Whether to call only at CpG sites.", category: "advanced"}
+        combineMods: {description: "Whether to combine modifications in the output.", category: "advanced"}
+        combineStrands: {description: "Whether to combine strands in the output.", category: "advanced"}
+        ignore: {description: "Modification type to ignore. For example 'h'.", category: "advanced"}
+        logFilePath: {description: "Path where the log file should be written.", category: "advanced"}
+        filterThreshold: {description: "Global filter threshold can be specified by a decimal number (e.g. 0.75). Otherwise the automatic filter percentile will be used.", category: "advanced"}
+        filterPercentile: {description: "This defaults to 0.1, to remove the lowest 10% confidence modification calls, but can be manually adjusted.", category: "advanced"}
+
+        threads: {description: "The number of threads to use.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # output
+        out: {description: "The combined bedMethyl output file."}
+        outGraph: {description: "The combined bedGraph output file."}
+        outFiles: {description: "The per-modification/strand bedGraph files."}
+        logFile: {description: "The generated log file."}
+    }
+}
+
+task Summary {
+    input {
+        File bam
+        File bamIndex
+
+        String summary = "modkit.summary.txt"
+
+        Boolean sample = true
+        Int? numReads  # = 10042
+        Float? samplingFrac  # = 0.1
+        Int? seed
+
+        Int threads = 4
+        String memory = ceil(size(bam, "GiB") * 0.1) + 5 + "GiB"  # Based on a linear model with some fudge (memory = 0.07540 * file_size - 0.6).
+        Int timeMinutes = 60  # Originally set at "2 days / threads", but with 4 threads and this much RAM it's pretty fast.
+        String dockerImage = "quay.io/biocontainers/ont-modkit:0.4.3--hcdda2d0_0"
+    }
+
+    command <<<
+        set -e
+        mkdir -p "$(dirname ~{summary})"
+
+        modkit summary \
+        --threads ~{threads} \
+        ~{true="" false="--no-sampling" sample} \
+        ~{"--num-reads " + numReads} \
+        ~{"--sampling-frac " + samplingFrac} \
+        ~{"--seed " + seed} \
+        ~{bam} > ~{summary}
+    >>>
+
+    output {
+        File summaryReport = summary
+    }
+
+    runtime {
+        docker: dockerImage
+        cpu: threads
+        memory: memory
+        time_minutes: timeMinutes
+    }
+
+    parameter_meta {
+        # input
+        bam: {description: "The input alignment file.", category: "required"}
+        bamIndex: {description: "The index for the input alignment file.", category: "required"}
+
+        sample: {description: "Allows you to disable sampling and report stats for the whole file.", category: "advanced"}
+        numReads: {description: "By default a fixed number of reads is sampled; you can set this to change the number of reads to sample.", category: "advanced"}
+        samplingFrac: {description: "Use a fixed percentage of reads, rather than a fixed number of reads, for sampling.", category: "advanced"}
+        seed: {description: "A seed can be provided for reproducibility in the sampling fraction case.", category: "advanced"}
+
+        threads: {description: "The number of threads to use.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # output
+        summaryReport: {description: "The output modkit summary."}
+    }
+}
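+
+# Editorial note on the Summary memory model above (not from modkit): for a
+# 100 GiB BAM the default evaluates to ceil(100 * 0.1) + 5 = 15, so the task
+# requests "15GiB"; the constant 5 is the fudge term from the comment.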
+
+task SampleProbs {
+    input {
+        File bam
+        File bamIndex
+
+        String summary = "modkit-sample-probs"
+
+        Boolean sample = true
+        Int? numReads  # = 10042
+        Float? samplingFrac  # = 0.1
+        Int? seed
+
+        Int threads = 4
+        String memory = "32GiB"
+        Int timeMinutes = 60
+        String dockerImage = "quay.io/biocontainers/ont-modkit:0.4.3--hcdda2d0_0"
+    }
+
+    command <<<
+        set -e
+        mkdir -p "~{summary}"
+
+        modkit sample-probs \
+        --threads ~{threads} \
+        --out-dir ~{summary} \
+        ~{true="" false="--no-sampling" sample} \
+        ~{"--num-reads " + numReads} \
+        ~{"--sampling-frac " + samplingFrac} \
+        ~{"--seed " + seed} \
+        --hist \
+        ~{bam}
+    >>>
+
+    output {
+        File reportCounts = "~{summary}/counts.html"
+        File reportProportion = "~{summary}/proportion.html"
+        File reportProbabilitiesTsv = "~{summary}/probabilities.tsv"
+        File reportThresholdsTsv = "~{summary}/thresholds.tsv"
+    }
+
+    runtime {
+        docker: dockerImage
+        cpu: threads
+        memory: memory
+        time_minutes: timeMinutes
+    }
+
+    parameter_meta {
+        # input
+        bam: {description: "The input alignment file.", category: "required"}
+        bamIndex: {description: "The index for the input alignment file.", category: "required"}
+        summary: {description: "A folder for the outputs.", category: "required"}
+
+        sample: {description: "Allows you to disable sampling and report stats for the whole file.", category: "advanced"}
+        numReads: {description: "By default a fixed number of reads is sampled; you can set this to change the number of reads to sample.", category: "advanced"}
+        samplingFrac: {description: "Use a fixed percentage of reads, rather than a fixed number of reads, for sampling.", category: "advanced"}
+        seed: {description: "A seed can be provided for reproducibility in the sampling fraction case.", category: "advanced"}
+
+        threads: {description: "The number of threads to use.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # output
+        reportCounts: {description: "The output HTML report of counts."}
+        reportProportion: {description: "The output HTML report of proportions."}
+        reportProbabilitiesTsv: {description: "The output TSV of probabilities."}
+        reportThresholdsTsv: {description: "The output TSV of thresholds."}
+    }
+}
diff --git a/mosdepth.wdl b/mosdepth.wdl
new file mode 100644
index 00000000..43e95614
--- /dev/null
+++ b/mosdepth.wdl
@@ -0,0 +1,106 @@
+version 1.0
+
+# Copyright (c) 2025 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task Mosdepth { + input { + File bam + File bamIndex + String prefix = "./out" + + String? chrom + # --by flag takes a BED file or an integer. So there need to be two inputs in WDL's typed system. + File? byBed + Int? byWindow + File? fasta + Int? flag + Int? includeFlag + + Boolean noPerBase = false + Boolean d4 = false + Boolean fastMode = false + + Int threads = 1 + String memory = "4GiB" + Int timeMinutes = 10 + ceil(size(bam, "G")) * 4 + String dockerImage = "quay.io/biocontainers/mosdepth:0.3.10--h4e814b3_1" + } + + command <<< + set -e + mkdir -p $(dirname ~{prefix}) + mosdepth \ + --threads ~{threads} \ + ~{"--chrom " + chrom} \ + ~{"--by " + byBed} \ + ~{"--by " + byWindow} \ + ~{"--fasta " + fasta} \ + ~{true="--no-per-base" false="" noPerBase} \ + ~{true="--d4" false="" d4} \ + ~{"--flag " + flag} \ + ~{"--include-flag " + includeFlag} \ + ~{true="--fast-mode" false="" fastMode} \ + ~{prefix} ~{bam} + >>> + + output { + File globalDist = "~{prefix}.mosdepth.global.dist.txt" + File summary = "~{prefix}.mosdepth.summary.txt" + File? perBaseBed = "~{prefix}.per-base.bed.gz" + File? regionsBed = "~{prefix}.regions.bed.gz" + } + + runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + bam: {description: "Input BAM or CRAM file.", category: "required"} + bamIndex: {description: "Index for the input BAM or CRAM file.", category: "required"} + prefix: {description: "Output prefix.", category: "common"} + + chrom: {description: "Chromosome to restrict depth calculation.", category: "advanced"} + byBed: {description: "Bed file with windows to include for the --by flag. Should not be used together with byWindow.", category: "common"} + byWindow: {description: "Integer window size for the --by flag. Should not be used together with byBed.", category: "advanced"} + fasta: {description: "FASTA file, only necessary when CRAM input is used.", category: "advanced"} + flag: {description: "Exclude reads with any of the bits in FLAG set.", category: "advanced"} + includeFlag: {description: "Only include reads with any of the bits in FLAG set.", category: "advanced"} + + noPerBase: {description: "Don't output per-base depth. Skipping this output will speed execution.", category: "common"} + d4: {description: "output per-base depth in d4 format.", category: "advanced"} + fastMode: {description: "Don't look at internal cigar operations or correct mate overlaps (recommended for most use-cases).", category: "common"} + + threads: {description: "How many threads to use.", category: "common"} + memory: {description: "How much memory to allocate.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        globalDist: {description: "Global distribution table file."}
+        summary: {description: "Summary table file."}
+        perBaseBed: {description: "Per base coverage BED file."}
+        regionsBed: {description: "Per region BED file, if byBed or byWindow is used."}
+    }
+}
\ No newline at end of file
diff --git a/multiqc.wdl b/multiqc.wdl
index db1dd21e..db47ac87 100644
--- a/multiqc.wdl
+++ b/multiqc.wdl
@@ -22,46 +22,84 @@ version 1.0
 task MultiQC {
     input {
-        # Use a string here so cromwell does not relocate an entire analysis directory
-        String analysisDirectory
-        Array[File] dependencies = []  # This must be used in order to run multiqc after these tasks.
+        # Pass the reports as individual files so cromwell does not have to
+        # relocate an entire analysis directory.
+        Array[File] reports
        Boolean force = false
        Boolean dirs = false
-        Int? dirsDepth
        Boolean fullNames = false
+        String outDir = "."
+        Boolean dataDir = false
+        Boolean zipDataDir = true
+        Boolean export = false
+        Boolean flat = false
+        Boolean interactive = true
+        Boolean lint = false
+        Boolean pdf = false
+        # This must be actively enabled in my opinion.
+        # The tool's default is to upload.
+        Boolean megaQCUpload = false
+        Boolean enableAi = false
+
+        Int? dirsDepth
        String? title
        String? comment
        String? fileName
-        String outDir = "."
        String? template
        String? tag
        String? ignore
        String? ignoreSamples
-        Boolean ignoreSymlinks = false
        File? sampleNames
        File? fileList
        Array[String]+? exclude
        Array[String]+? module
-        Boolean dataDir = false
-        Boolean noDataDir = false
+        Array[File]+? additionalReports
        String? dataFormat
-        Boolean zipDataDir = false
-        Boolean export = false
-        Boolean flat = false
-        Boolean interactive = true
-        Boolean lint = false
-        Boolean pdf = false
-        Boolean megaQCUpload = false  # This must be actively enabled in my opinion. The tools default is to upload.
        File? config  # A directory
        String? clConfig
-        Array[Boolean] finished = []  # An array of booleans that can be used to let multiqc wait on stuff.

-        String memory = "4G"
-
-        String dockerImage = "quay.io/biocontainers/multiqc:1.7--py_1"
+        String? memory
+        Int timeMinutes = 10 + ceil(size(reports, "GiB") * 8)
+        String dockerImage = "quay.io/biocontainers/multiqc:1.28--pyhdfd78af_0"
    }

+    Int memoryGb = 2 + ceil(size(reports, "GiB"))
+
+    # This is where the reports end up. It does not need to be changed by the
+    # user. It is full of symbolic links, so it is not of any use to the user
+    # anyway.
+    String reportDir = "reports"
+
+    # The code below requires python 3.6 or higher.
+    # It makes sure all report files are in a report directory that
+    # MultiQC can investigate.
+    # It creates files in report_dir / hashed_parent / file basename.
+    # By hashing the parent path we make sure there are no file collisions, as
+    # files from the same directory end up in the same directory, while files
+    # from other directories get their own directory. Cromwell also uses this
+    # strategy. Using python's builtin hash is unique enough
+    # for these purposes. 
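+
+    # Editorial illustration (not in the original): two inputs such as
+    #   /cromwell-executions/wf/A/call-fastqc/report.txt
+    #   /cromwell-executions/wf/B/call-fastqc/report.txt
+    # are linked into reports/<hash of parent dir>/report.txt under two
+    # different hashed directories, so identical basenames cannot collide.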
+ + Array[File] allReports = flatten([reports, flatten(select_all([additionalReports]))]) + command { + python3 < ~{ntFilePath} @@ -132,15 +132,16 @@ task DownloadAccessionToTaxId { command { set -e -o pipefail mkdir -p ~{downloadDir} - rsync -av \ - --partial \ - rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_*.accession2taxid.gz* \ - ~{downloadDir} + rsync \ + -av \ + --partial \ + rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_*.accession2taxid.gz* \ + ~{downloadDir} (cd ~{downloadDir} && md5sum -c *.md5) for file in ~{downloadDir}/nucl_*.accession2taxid.gz do zcat $file | tail -n +2 | cut -f 2,3 ~{true="| gzip" false='' gzip} > \ - $file.seqtaxmap~{true='.gz' false='' gzip} + $file.seqtaxmap~{true='.gz' false='' gzip} done } diff --git a/pacbio.wdl b/pacbio.wdl new file mode 100644 index 00000000..dcf0f69e --- /dev/null +++ b/pacbio.wdl @@ -0,0 +1,95 @@ +version 1.0 + +# Copyright (c) 2020 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task mergePacBio { + input { + Array[File]+ reports + String outputPathMergedReport + + String memory = "4GiB" + String dockerImage = "quay.io/redmar_van_den_berg/pacbio-merge:0.2" + } + + command { + set -e + mkdir -p $(dirname ~{outputPathMergedReport}) + pacbio_merge \ + --reports ~{sep=" " reports} \ + --json-output ~{outputPathMergedReport} + } + + runtime { + memory: memory + docker: dockerImage + } + + output { + File outputMergedReport = outputPathMergedReport + } + + parameter_meta { + # inputs + reports: {description: "The PacBio report files to merge.", category: "required"} + outputPathMergedReport: {description: "The location the merged PacBio report file should be written to.", category: "common"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputMergedReport: {description: "The PacBio reports merged into one."} + } +} + +task ccsChunks { + input { + Int chunkCount + + String memory = "4GiB" + String dockerImage = "python:3.7-slim" + } + + command { + set -e + python <7 days and failing + # Sometimes we wish to supply "null" in order to turn off optical duplicate detection. + # This can be desirable if you don't mind the estimated library size + # being wrong and optical duplicate detection is taking >7 days and failing. String? 
read_name_regex
+
+        # In the GATK best-practices pipeline MarkDuplicates is given a 7G VM.
+        # https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/d2934ed656ade44801f9cfe1c0e78d4f80684b7b/PairedEndSingleSampleWf-fc-hg38.wdl#L1040
+        Int javaXmxMb = 6656  # 6.5G
+        Int memoryMb = javaXmxMb + 512
+
+        Int timeMinutes = 1 + ceil(size(inputBams, "GiB") * 8)
+        String dockerImage = "quay.io/biocontainers/picard:3.3.0--hdfd78af_0"
    }

    # Task is assuming query-sorted input so that the Secondary and Supplementary reads get
    # marked correctly. This works because the output of BWA is query-grouped and therefore,
    # so is the output of MergeBamAlignment. While query-grouped isn't actually query-sorted,
-    # it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname"
-
+    # it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname".
    command {
        set -e
        mkdir -p "$(dirname ~{outputBamPath})"
-        picard -Xmx~{javaXmx} \
+        picard -Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1 \
        MarkDuplicates \
        INPUT=~{sep=' INPUT=' inputBams} \
        OUTPUT=~{outputBamPath} \
        METRICS_FILE=~{metricsPath} \
+        COMPRESSION_LEVEL=~{compressionLevel} \
+        USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \
+        USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \
        VALIDATION_STRINGENCY=SILENT \
        ~{"READ_NAME_REGEX=" + read_name_regex} \
        OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \
        CLEAR_DT="false" \
        CREATE_INDEX=true \
        ADD_PG_TAG_TO_READS=false \
-        CREATE_MD5_FILE=true
+        CREATE_MD5_FILE=~{true="true" false="false" createMd5File}
    }

    output {
        File outputBam = outputBamPath
        File outputBamIndex = sub(outputBamPath, "\.bam$", ".bai")
-        File outputBamMd5 = outputBamPath + ".md5"
+        File? outputBamMd5 = outputBamPath + ".md5"
        File metricsFile = metricsPath
    }

    runtime {
+        memory: "~{memoryMb}MiB"
+        time_minutes: timeMinutes
        docker: dockerImage
-        memory: memory
    }
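
+    # Editorial note, not from the original task: with the defaults the JVM
+    # heap is capped at -Xmx6656M (6.5 GiB) while the container requests
+    # javaXmxMb + 512 = 7168 MiB, i.e. exactly 7 GiB, matching the 7G VM in
+    # the GATK best-practices reference above.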
    parameter_meta {
        # inputs
        inputBams: {description: "The BAM files for which the duplicate reads should be marked.", category: "required"}
-        inputBamIndexes: {description: "Th eindexes for the input BAM files.", category: "required"}
        outputBamPath: {description: "The location where the output BAM file should be written.", category: "required"}
        metricsPath: {description: "The location where the output metrics file should be written.", category: "required"}
+        compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"}
+        useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"}
+        useJdkDeflater: {description: "True, uses the java deflater to compress the BAM files. False, uses the optimized intel deflater.", category: "advanced"}
+        createMd5File: {description: "Whether to create a md5 file for the created BAM file.", category: "advanced"}
        read_name_regex: {description: "Equivalent to the `READ_NAME_REGEX` option of MarkDuplicates.", category: "advanced"}
+        javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"}
+        memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
-        memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-                  category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-                      category: "advanced"}
+
+        # outputs
+        outputBam: {description: "BAM file with the duplicate reads marked."}
+        outputBamIndex: {description: "Index of the output BAM file."}
+        outputBamMd5: {description: "MD5 digest of the output BAM file, if created."}
+        metricsFile: {description: "Metrics file with duplication statistics."}
    }
}

-# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
+# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs.
task MergeVCFs {
    input {
        Array[File]+ inputVCFs
        Array[File]+ inputVCFsIndexes
        String outputVcfPath
+        Int compressionLevel = 1
+        Boolean useJdkInflater = false
+        # Better results for compression level 1 (much smaller).
+        # Higher compression levels similar to intel deflater.
+        # NOTE: this might change in the future when the intel deflater is updated!
+        # Second NOTE: No it did not change. Only the fastest algorithm with
+        # worse compression is wrapped in the intel GKL, instead of one of the
+        # slightly slower but better compressing alternatives from ISA-L
+        # (which are also faster than zlib).
+        Boolean useJdkDeflater = true  # Achieves much better compression rates than the intel deflater.

-        String memory = "24G"
-        String javaXmx = "8G"
-        String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
+        String javaXmx = "4G"
+        String memory = "5GiB"
+        Int timeMinutes = 1 + ceil(size(inputVCFs, "GiB")) * 2
+        String dockerImage = "quay.io/biocontainers/picard:3.3.0--hdfd78af_0"
    }

-    # Using MergeVcfs instead of GatherVcfs so we can create indices
-    # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket
-
+    # Using MergeVcfs instead of GatherVcfs so we can create indices.
+    # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket.
    command {
        set -e
        mkdir -p "$(dirname ~{outputVcfPath})"
-        picard -Xmx~{javaXmx} \
+        picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
        MergeVcfs \
        INPUT=~{sep=' INPUT=' inputVCFs} \
-        OUTPUT=~{outputVcfPath}
+        OUTPUT=~{outputVcfPath} \
+        COMPRESSION_LEVEL=~{compressionLevel} \
+        USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \
+        USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater}
    }

    output {
@@ -503,8 +859,9 @@ task MergeVCFs {
    }

    runtime {
-        docker: dockerImage
        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
    }

    parameter_meta {
@@ -512,12 +869,17 @@ task MergeVCFs {
        inputVCFs: {description: "The VCF files to be merged.", category: "required"}
        inputVCFsIndexes: {description: "The indexes of the VCF files.", category: "required"}
        outputVcfPath: {description: "The location the output VCF file should be written to.", category: "required"}
-
+        compressionLevel: {description: "The compression level at which the VCF files are written.", category: "advanced"}
+        useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"}
+        useJdkDeflater: {description: "True, uses the java deflater to compress the VCF files. False, uses the optimized intel deflater.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. 
Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputVcf: {description: "Multiple variant files combined into a single variant file."} + outputVcfIndex: {description: "Index of `outputVcf`."} } } @@ -527,10 +889,12 @@ task SamToFastq { File inputBamIndex Boolean paired = true - String memory = "48G" String javaXmx = "16G" # High memory default to avoid crashes. - String dockerImage = "quay.io/biocontainers/picard:2.20.5--0" - File? NONE + String memory = "17GiB" + Int timeMinutes = 30 + String dockerImage = "quay.io/biocontainers/picard:3.3.0--hdfd78af_0" + + File? noneFile } String outputRead1 = basename(inputBam, "\.[bs]am") + "_R1.fastq.gz" @@ -539,7 +903,7 @@ task SamToFastq { command { set -e - picard -Xmx~{javaXmx} \ + picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ SamToFastq \ I=~{inputBam} \ ~{"FASTQ=" + outputRead1} \ @@ -549,13 +913,36 @@ task SamToFastq { output { File read1 = outputRead1 - File? read2 = if paired then outputRead2 else NONE - File? unpairedRead = if paired then outputUnpaired else NONE + File? read2 = if paired then outputRead2 else noneFile + File? unpairedRead = if paired then outputUnpaired else noneFile } runtime { - docker: dockerImage memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputBam: {description: "Input BAM file to extract reads from.", category: "required"} + inputBamIndex: {description: "Input BAM index file.", category: "required"} + paired: {description: "Set to false when input data is single-end.", category: "common"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + read1: {description: "Fastq file containing reads from the first pair."} + read2: {description: "Fastq file containing reads from the second pair."} + unpairedRead: {description: "Fastq file containing unpaired reads."} + } + + meta { + WDL_AID: { + exclude: ["noneFile"] + } } } @@ -564,15 +951,15 @@ task ScatterIntervalList { File interval_list Int scatter_count - String memory = "12G" - String javaXmx = "4G" - String dockerImage = "quay.io/biocontainers/picard:2.20.5--0" + String javaXmx = "3G" + String memory = "4GiB" + String dockerImage = "quay.io/biocontainers/picard:3.3.0--hdfd78af_0" } command { set -e mkdir scatter_list - picard -Xmx~{javaXmx} \ + picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ IntervalListTools \ SCATTER_COUNT=~{scatter_count} \ SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ @@ -588,8 +975,76 @@ task ScatterIntervalList { } runtime { - docker: dockerImage memory: memory + docker: dockerImage + } +} + +task SortSam { + input { + File inputBam + String outputPath + Boolean sortByName = false + Boolean createMd5File = false + Int maxRecordsInRam = 500000 + Int compressionLevel = 1 + Boolean useJdkInflater = false + Boolean useJdkDeflater = true # Achieves much better compression rates than the intel deflater + + # Default ram of 4 GB. Using 125001.0 to prevent an answer of + # 4.000000001 which gets rounded to 5. + # GATK Best practices uses 75000 here: https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/d2934ed656ade44801f9cfe1c0e78d4f80684b7b/PairedEndSingleSampleWf-fc-hg38.wdl#L778 + Int XmxGb = ceil(maxRecordsInRam / 125001.0) + Int timeMinutes = 1 + ceil(size(inputBam, "GiB") * 3) + String dockerImage = "quay.io/biocontainers/picard:3.3.0--hdfd78af_0" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + picard -Xmx~{XmxGb}G -XX:ParallelGCThreads=1 SortSam \ + INPUT=~{inputBam} \ + OUTPUT=~{outputPath} \ + MAX_RECORDS_IN_RAM=~{maxRecordsInRam} \ + SORT_ORDER=~{true="queryname" false="coordinate" sortByName} \ + CREATE_INDEX=true \ + COMPRESSION_LEVEL=~{compressionLevel} \ + USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \ + USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \ + VALIDATION_STRINGENCY=SILENT \ + CREATE_MD5_FILE=~{true="true" false="false" createMd5File} + + } + + output { + File outputBam = outputPath + File outputBamIndex = sub(outputPath, "\.bam$", ".bai") + } + + runtime { + cpu: 1 + memory: "~{1 + XmxGb}GiB" + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputBam: {description: "The unsorted input BAM file.", category: "required"} + outputPath: {description: "The location the output BAM file should be written to.", category: "required"} + sortByName: {description: "Sort the output file by name, default is position.", category: "advanced"} + createMd5File: {description: "Whether to create an MD5 digest for any BAM or FASTQ files created.", category: "advanced"} + maxRecordsInRam: {description: "This will specify the number of records stored in RAM before spilling to disk.", category: "advanced"} + compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"} + useJdkInflater: {description: "True, uses the java inflater. 
False, uses the optimized intel inflater.", category: "advanced"}
+        useJdkDeflater: {description: "True, uses the java deflater to compress the BAM files. False, uses the optimized intel deflater.", category: "advanced"}
+        XmxGb: {description: "The maximum memory available to picard SortSam. The job's total memory is set 1 GiB higher to accommodate JVM overhead.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputBam: {description: "Sorted BAM file."}
+        outputBamIndex: {description: "Index of sorted BAM file."}
+    }
+}

@@ -597,18 +1052,20 @@ task SortVcf {
     input {
         Array[File]+ vcfFiles
         String outputVcfPath
+        File? dict
-        String memory = "24G"
         String javaXmx = "8G"
-        String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
+        String memory = "9GiB"
+        Int timeMinutes = 1 + ceil(size(vcfFiles, "GiB") * 5)
+        String dockerImage = "quay.io/biocontainers/picard:3.3.0--hdfd78af_0"
     }

     command {
         set -e
         mkdir -p "$(dirname ~{outputVcfPath})"
-        picard -Xmx~{javaXmx} \
+        picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
         SortVcf \
         I=~{sep=" I=" vcfFiles} \
         ~{"SEQUENCE_DICTIONARY=" + dict} \
@@ -621,8 +1078,9 @@
     }

     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
@@ -630,12 +1088,14 @@
         vcfFiles: {description: "The VCF files to merge and sort.", category: "required"}
         outputVcfPath: {description: "The location the sorted VCF files should be written to.", category: "required"}
         dict: {description: "A sequence dictionary matching the VCF files.", category: "advanced"}
-
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-                  category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-                      category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputVcf: {description: "Sorted VCF file(s)."} + outputVcfIndex: {description: "Index(es) of sort(ed) VCF file(s)."} } } @@ -644,15 +1104,17 @@ task RenameSample { File inputVcf String outputPath = "./picard/renamed.vcf" String newSampleName - String memory = "24G" + String javaXmx = "8G" - String dockerImage = "quay.io/biocontainers/picard:2.19.0--0" + String memory = "9GiB" + Int timeMinutes = 1 + ceil(size(inputVcf, "GiB") * 2) + String dockerImage = "quay.io/biocontainers/picard:3.3.0--hdfd78af_0" } command { set -e mkdir -p "$(dirname ~{outputPath})" - picard -Xmx~{javaXmx} \ + picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ RenameSampleInVcf \ I=~{inputVcf} \ O=~{outputPath} \ @@ -664,8 +1126,9 @@ task RenameSample { } runtime { - docker: dockerImage memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -673,8 +1136,87 @@ task RenameSample { inputVcf: {description: "The VCF file to process.", category: "required"} outputPath: {description: "The location the output VCF file should be written.", category: "common"} newSampleName: {description: "A string to replace the old sample name.", category: "required"} - memory: {description: "The memory required to run the programs", category: "advanced"} - javaXmx: {description: "The max. memory allocated for JAVA", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} + memory: {description: "The memory required to run the programs.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + renamedVcf: {description: "New VCF with renamed sample."} } } + +task UmiAwareMarkDuplicatesWithMateCigar { + input { + Array[File] inputBams + String outputPath + String outputPathMetrics = outputPath + ".metrics" + String outputPathUmiMetrics = outputPath + ".umi-metrics" + Int maxRecordsInRam = 1500000 # Default is 500_000 but that will lead to very small files on disk. + String? 
assumeSortOrder + String tempdir = "temp" + Boolean removeDuplicates = true + String umiTagName = "RX" + Int compressionLevel = 1 + Boolean useJdkInflater = false + Boolean useJdkDeflater = true # Achieves much better compression rates than the intel deflater + String javaXmx = "8G" + String memory = "9GiB" + Int timeMinutes = 360 + String dockerImage = "quay.io/biocontainers/picard:3.3.0--hdfd78af_0" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" ~{tempdir} + picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ + UmiAwareMarkDuplicatesWithMateCigar \ + INPUT=~{sep=' INPUT=' inputBams} \ + O=~{outputPath} \ + M=~{outputPathMetrics} \ + UMI_TAG_NAME=~{umiTagName} \ + UMI_METRICS_FILE=~{outputPathUmiMetrics} \ + TMP_DIR=~{tempdir} \ + REMOVE_DUPLICATES=~{removeDuplicates} \ + MAX_RECORDS_IN_RAM=~{maxRecordsInRam} \ + CREATE_INDEX=true \ + COMPRESSION_LEVEL=~{compressionLevel} \ + USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \ + USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \ + ~{"ASSUME_SORT_ORDER=" + assumeSortOrder} + } + + output { + File outputBam = outputPath + File outputBamIndex = sub(outputPath, "\.bam$", ".bai") + File outputMetrics = outputPathMetrics + File outputUmiMetrics = outputPathUmiMetrics + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputBams: {description: "The BAM files for which the duplicate reads should be marked.", category: "required"} + outputPath: {description: "The location the output BAM file should be written to.", category: "required"} + outputPathMetrics: {description: "The location the output metrics file should be written to.", category: "required"} + outputPathUmiMetrics: {description: "The location the output UMI metrics file should be written to.", category: "required"} + removeDuplicates: {description: "Whether the duplicate reads should be removed instead of marked.", category: "common"} + umiTagName: {description: "Which tag in the BAM file holds the UMI.", category: "common"} + assumeSortOrder: {description: "Assume a certain sort order even though the header might say otherwise.", category: "common"} + tempdir: {description: "Temporary directory.", category: "advanced"} + compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"} + maxRecordsInRam: {description: "This will specify the number of records stored in RAM before spilling to disk.", category: "advanced"} + useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"} + useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + } +} \ No newline at end of file diff --git a/prepareShiny.wdl b/prepareShiny.wdl new file mode 100644 index 00000000..28910743 --- /dev/null +++ b/prepareShiny.wdl @@ -0,0 +1,106 @@ +version 1.0 + +# Copyright (c) 2017 Sequencing Analysis Support Core - Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task CreateDesignMatrix { + input { + File countTable + String shinyDir = "." + + String memory = "5GiB" + Int timeMinutes = 30 + String dockerImage = "quay.io/biocontainers/predex:0.9.2--pyh3252c3a_0" + } + + command { + set -e + mkdir -p ~{shinyDir} + predex design \ + --input ~{countTable} \ + --output ~{shinyDir} + } + + output { + File dgeDesign = shinyDir + "/design_matrix.tsv" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + countTable: {description: "The created count table from HTseq.", category: "required"} + shinyDir: {description: "The directory to write the output to.", category: "required"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + dgeDesign: {description: "Design matrix template to add sample information for DGE analysis."} + } +} + +task CreateAnnotation { + input { + File referenceFasta + File referenceGtfFile + String shinyDir = "." 
+ + String memory = "5GiB" + Int timeMinutes = 30 + String dockerImage = "quay.io/biocontainers/predex:0.9.2--pyh3252c3a_0" + } + + command { + set -e + mkdir -p ~{shinyDir} + predex annotation \ + --fasta ~{referenceFasta} \ + --gtf ~{referenceGtfFile} \ + --output ~{shinyDir} + } + + output { + File dgeAnnotation = shinyDir + "/annotation.tsv" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + referenceFasta: {description: "The reference Fasta file.", category: "required"} + referenceGtfFile: {description: "The reference GTF file.", category: "required"} + shinyDir: {description: "The directory to write the output to.", category: "required"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + dgeAnnotation: {description: "Annotation file for DGE analysis."} + } +} diff --git a/requirements-test.txt b/requirements-test.txt deleted file mode 100644 index f074413b..00000000 --- a/requirements-test.txt +++ /dev/null @@ -1,2 +0,0 @@ -cromwell -miniwdl \ No newline at end of file diff --git a/rtg.wdl b/rtg.wdl index 8fd53ca4..62e1e77f 100644 --- a/rtg.wdl +++ b/rtg.wdl @@ -22,12 +22,13 @@ version 1.0 task Format { input { - String format = "fasta" - String outputPath = "seq_data.sdf" Array[File]+ inputFiles - String dockerImage = "quay.io/biocontainers/rtg-tools:3.10.1--0" + String format = "fasta" + String outputPath = "reference_data" String rtgMem = "8G" - String memory = "16G" + String memory = "9GiB" + Int timeMinutes = 1 + ceil(size(inputFiles, "GiB") * 2) + String dockerImage = "quay.io/biocontainers/rtg-tools:3.10.1--0" } command { @@ -39,23 +40,27 @@ task Format { } output { - File sdf = outputPath + Array[File] referenceFiles = glob("~{outputPath}/*") } runtime { - docker: dockerImage memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { - format: {description: "Format of input. Allowed values are [fasta, fastq, fastq-interleaved, sam-se, sam-pe] (Default is fasta)", - category: "advanced"} + # inputs + inputFiles: {description: "Input sequence files. May be specified 1 or more times.", category: "required"} + format: {description: "Format of input. Allowed values are [fasta, fastq, fastq-interleaved, sam-se, sam-pe].", category: "advanced"} outputPath: {description: "Where the output should be placed.", category: "advanced"} - inputFiles: {description: "input sequence files. May be specified 1 or more times.", category: "required"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + rtgMem: {description: "The amount of memory rtg will allocate to the JVM.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - rtgMem: {description: "The amount of memory rtg will allocate to the JVM", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + referenceFiles: {description: "An array with all the generated reference files"} } } @@ -65,19 +70,22 @@ task VcfEval { File baselineIndex File calls File callsIndex - File? evaluationRegions - File? bedRegions + Boolean squashPloidy = false + String outputMode = "split" String outputDir = "output/" - File template + Array[File] referenceFiles Boolean allRecords = false Boolean decompose = false Boolean refOverlap = false + + File? evaluationRegions + File? bedRegions String? sample - Boolean squashPloidy = false - String outputMode = "split" - Int threads = 1 # tool default is number of cores in the system 😱 + String rtgMem = "8G" - String memory = "16G" + Int threads = 1 # Tool default is number of cores in the system 😱. + String memory = "9GiB" + Int timeMinutes = 1 + ceil(size([baseline, calls], "GiB") * 5) String dockerImage = "quay.io/biocontainers/rtg-tools:3.10.1--0" } @@ -90,7 +98,7 @@ task VcfEval { ~{"--evaluation-regions " + evaluationRegions} \ ~{"--bed-regions " + bedRegions} \ --output ~{outputDir} \ - --template ~{template} \ + --template $(dirname ~{referenceFiles[0]}) \ ~{true="--all-records" false="" allRecords} \ ~{true="--decompose" false="" decompose} \ ~{true="--ref-overlap" false="" refOverlap} \ @@ -128,37 +136,47 @@ task VcfEval { } runtime { - docker: dockerImage cpu: threads memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { - baseline: {description: "VCF file containing baseline variants", category: "required"} - baselineIndex: {description: "The baseline's VCF index", category: "required"} - calls: {description: "VCF file containing called variants", category: "required"} - callsIndex: {description: "The call's VCF index", category: "required"} - outputDir: {description: "Directory for output", category: "advanced"} - bedRegions: {description: "if set, only read VCF records that overlap the ranges contained in the specified BED file", category: "advanced"} - evaluationRegions: {description: "if set, evaluate within regions contained in the supplied BED file, allowing transborder matches. To be used for truth-set high-confidence regions or other regions of interest where region boundary effects should be minimized", - category: "advanced"} - template: {description: "SDF of the reference genome the variants are called against", category: "required"} - allRecords: {description: "use all records regardless of FILTER status (Default is to only process records where FILTER is \".\" or \"PASS\")", - category: "common"} - decompose: {description: "decompose complex variants into smaller constituents to allow partial credit", category: "common"} - refOverlap: {description: "allow alleles to overlap where bases of either allele are same-as-ref (Default is to only allow VCF anchor base overlap)", - category: "common"} - sample: {description: "the name of the sample to select. Use , to select different sample names for baseline and calls. (Required when using multi-sample VCF files)", - category: "common"} - squashPloidy: {description: "treat heterozygous genotypes as homozygous ALT in both baseline and calls, to allow matches that ignore zygosity differences", - category: "common"} - outputMode: {description: "output reporting mode. Allowed values are [split, annotate, combine, ga4gh, roc-only] (Default is split)", - category: "advanced"} - threads: {description: "Number of threads. 
Default is 1", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} - rtgMem: {description: "The amount of memory rtg will allocate to the JVM", category: "advanced"} + # inputs + baseline: {description: "VCF file containing baseline variants.", category: "required"} + baselineIndex: {description: "The baseline's VCF index.", category: "required"} + calls: {description: "VCF file containing called variants.", category: "required"} + callsIndex: {description: "The call's VCF index.", category: "required"} + squashPloidy: {description: "treat heterozygous genotypes as homozygous ALT in both baseline and calls, to allow matches that ignore zygosity differences.", category: "common"} + outputMode: {description: "output reporting mode. Allowed values are [split, annotate, combine, ga4gh, roc-only] (Default is split).", category: "advanced"} + outputDir: {description: "Directory for output.", category: "advanced"} + referenceFiles: {description: "An array of reference Files generated by the Format task.", category: "required"} + allRecords: {description: "use all records regardless of FILTER status (Default is to only process records where FILTER is \".\" or \"PASS\").", category: "common"} + decompose: {description: "decompose complex variants into smaller constituents to allow partial credit.", category: "common"} + refOverlap: {description: "allow alleles to overlap where bases of either allele are same-as-ref (Default is to only allow VCF anchor base overlap).", category: "common"} + sample: {description: "the name of the sample to select. Use , to select different sample names for baseline and calls. (Required when using multi-sample VCF files).", category: "common"} + bedRegions: {description: "if set, only read VCF records that overlap the ranges contained in the specified BED file.", category: "advanced"} + evaluationRegions: {description: "if set, evaluate within regions contained in the supplied BED file, allowing transborder matches. To be used for truth-set high-confidence regions or other regions of interest where region boundary effects should be minimized.", category: "advanced"} + rtgMem: {description: "The amount of memory rtg will allocate to the JVM.", category: "advanced"} + threads: {description: "Number of threads. Default is 1.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        falseNegativesVcf: {description: "Variants from the baseline VCF which were not correctly called."}
+        falseNegativesVcfIndex: {description: "Index of the output VCF file `falseNegativesVcf`."}
+        falsePositivesVcf: {description: "Variants from the calls VCF which do not agree with baseline variants."}
+        falsePositivesVcfIndex: {description: "Index of the output VCF file `falsePositivesVcf`."}
+        summary: {description: "Summary statistic file."}
+        truePositivesBaselineVcf: {description: "Variants from the baseline VCF which agree with variants in the calls VCF."}
+        truePositivesBaselineVcfIndex: {description: "Index of the output VCF file `truePositivesBaselineVcf`."}
+        truePositivesVcf: {description: "Variants from the calls VCF which agree with variants in the baseline VCF."}
+        truePositivesVcfIndex: {description: "Index of the output VCF file `truePositivesVcf`."}
+        nonSnpRoc: {description: "ROC data derived from those variants which were not represented as SNPs."}
+        phasing: {description: "Phasing file."}
+        weightedRoc: {description: "ROC data derived from all analyzed call variants, regardless of their representation."}
+        allStats: {description: "All output files combined in an array."}
     }
 }
-
diff --git a/sambamba.wdl b/sambamba.wdl
new file mode 100644
index 00000000..be347f94
--- /dev/null
+++ b/sambamba.wdl
@@ -0,0 +1,201 @@
+version 1.0
+
+# Copyright (c) 2017 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
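For orientation, the sketch below shows how the new sambamba tasks could be wired together from a calling workflow. It is a minimal illustration, not part of this changeset: the workflow name, import alias, and output paths are assumptions made for the example.

```wdl
version 1.0

import "sambamba.wdl" as sambamba

# Hypothetical example: deduplicate a set of BAMs with sambamba Markdup,
# then collect flagstat metrics on the deduplicated result.
workflow SambambaExample {
    input {
        Array[File] bams
    }

    call sambamba.Markdup as markdup {
        input:
            inputBams = bams,
            outputPath = "dedup/sample.dedup.bam"
    }

    call sambamba.Flagstat as flagstat {
        input:
            inputBam = markdup.outputBam,
            inputBamIndex = markdup.outputBamIndex,
            outputPath = "dedup/sample.dedup.flagstat.txt"
    }

    output {
        File dedupBam = markdup.outputBam
        File dedupBamIndex = markdup.outputBamIndex
        File flagstatReport = flagstat.stats
    }
}
```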
+ +task Flagstat { + input { + File inputBam + File inputBamIndex + String outputPath = "./flagstat.txt" + + Int threads = 2 + String memory = "8GiB" + Int timeMinutes = 320 + String dockerImage = "quay.io/biocontainers/sambamba:0.7.1--h148d290_2" + } + + command { + sambamba flagstat \ + -t ~{threads} \ + ~{inputBam} \ + > ~{outputPath} + } + + output { + File stats = outputPath + } + + runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes # !UnknownRuntimeKey + docker: dockerImage + } + + parameter_meta { + inputBam: {description: "The input BAM file.", category: "required"} + inputBamIndex: {description: "The index for the BAM file.", category: "required"} + outputPath: {description: "The path to write the ouput to.", category: "required"} + + threads: {description: "The number of threads that will be used for this task.", category: "advanced"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + } +} + + +task Markdup { + input { + Array[File] inputBams + String outputPath + Int compressionLevel = 1 + # sortBufferSize and ioBufferSize taken from markdup defaults as of sambamba 0.7.1. + Int sortBufferSize = 4096 + Int ioBufferSize = 128 + Boolean removeDuplicates = false + + Int? hashTableSize + Int? overFlowListSize + + # Sambamba scales like this: 1 thread is fully utilized (1). + # 2 threads 1.8 utilized. 3 -> 2.4, 4-> 2.7. + # 2 threads reduces wall clock time by more than 40%. + Int threads = 2 + # According to the manual sambamba markdup uses the sortbufferSize + 2 times the ioBuffer size. + # Added 8192 mb as a margin of safety. Real life use with this setting uses 2.7 GiB. + Int memoryMb = 8192 + sortBufferSize + 2 * ioBufferSize + # Time minute calculation does not work well for higher number of threads. + Int timeMinutes = 1 + ceil(size(inputBams, "GiB") * 25) / threads + String dockerImage = "quay.io/biocontainers/sambamba:0.7.1--h148d290_2" + } + + String bamIndexPath = sub(outputPath, "\.bam$", ".bai") + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + sambamba markdup \ + --nthreads ~{threads} \ + -l ~{compressionLevel} \ + ~{true="-r" false="" removeDuplicates} \ + ~{"--hash-table-size " + hashTableSize} \ + ~{"--overflow-list-size " + overFlowListSize} \ + ~{"--sort-buffer-size " + sortBufferSize} \ + ~{"--io-buffer-size " + ioBufferSize} \ + ~{sep=' ' inputBams} ~{outputPath} + # sambamba creates an index for us. + mv ~{outputPath}.bai ~{bamIndexPath} + } + + output { + File outputBam = outputPath + File outputBamIndex = bamIndexPath + } + + runtime { + cpu: threads + memory: "~{memoryMb}MiB" + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputBams: {description: "The input BAM files.", category: "required"} + outputPath: {description: "Output directory path + output file.", category: "required"} + compressionLevel: {description: "Compression level from 0 (uncompressed) to 9 (best).", category: "advanced"} + sortBufferSize: {description: "The amount of mb allocated to the sort buffer.", category: "advanced"} + ioBufferSize: {description: "The amount of mb allocated to each IO buffer. 
Sambamba uses two IO buffers.", category: "advanced"} + removeDuplicates: {description: "Whether to remove the duplicates (instead of only marking them).", category: "advanced"} + hashTableSize: {description: "Sets sambamba's hash table size.", category: "advanced"} + overFlowListSize: {description: "Sets sambamba's overflow list size.", category: "advanced"} + threads: {description: "The number of threads that will be used for this task.", category: "advanced"} + memoryMb: {description: "The amount of memory available to the job in megabytes.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "Sorted BAM file."} + outputBamIndex: {description: "Sorted BAM file index."} + } +} + +task Sort { + input { + File inputBam + String outputPath = basename(inputBam, "\.bam") + ".sorted.bam" + Boolean sortByName = false + Int compressionLevel = 1 + + Int memoryPerThreadGb = 4 + Int threads = 1 + Int memoryGb = 1 + threads * memoryPerThreadGb + Int timeMinutes = 1 + ceil(size(inputBam, "GiB") * 3) + String dockerImage = "quay.io/biocontainers/sambamba:0.7.1--h148d290_2" + } + + # Select first needed as outputPath is optional input (bug in cromwell). + String bamIndexPath = sub(select_first([outputPath]), "\.bam$", ".bai") + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + sambamba sort \ + -l ~{compressionLevel} \ + ~{true="-n" false="" sortByName} \ + ~{"--nthreads " + threads} \ + -m ~{memoryPerThreadGb}G \ + -o ~{outputPath} \ + ~{inputBam} + # sambamba creates an index for us. + mv ~{outputPath}.bai ~{bamIndexPath} + } + + output { + File outputBam = outputPath + File outputBamIndex = bamIndexPath + } + + runtime { + cpu: threads + memory: "~{memoryGb}GiB" + docker: dockerImage + time_minutes: timeMinutes + } + + parameter_meta { + # inputs + inputBam: {description: "The input SAM file.", category: "required"} + outputPath: {description: "Output directory path + output file.", category: "required"} + sortByName: {description: "Sort the inputBam by read name instead of position.", category: "advanced"} + compressionLevel: {description: "Compression level from 0 (uncompressed) to 9 (best).", category: "advanced"} + memoryPerThreadGb: {description: "The amount of memory used per sort thread in gigabytes.", category: "advanced"} + threads: {description: "The number of threads that will be used for this task.", category: "advanced"} + memoryGb: {description: "The amount of memory available to the job in gigabytes.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputBam: {description: "Sorted BAM file."}
+        outputBamIndex: {description: "Sorted BAM file index."}
+    }
+}
diff --git a/samtools.wdl b/samtools.wdl
index a4a893a1..711cb906 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -24,9 +24,13 @@ task BgzipAndIndex {
     input {
         File inputFile
         String outputDir
-        String type = "vcf"
+        String preset = "vcf"
-        String dockerImage = "quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
+        Int compressLevel = 1
+        Int threads = 1
+        String memory = "2GiB"
+        Int timeMinutes = 1 + ceil(size(inputFile, "GiB"))
+        String dockerImage = "quay.io/biocontainers/htslib:1.21--h566b1c6_1"
     }

     String outputGz = outputDir + "/" + basename(inputFile) + ".gz"
@@ -34,8 +38,15 @@
     command {
         set -e
         mkdir -p "$(dirname ~{outputGz})"
-        bgzip -c ~{inputFile} > ~{outputGz}
-        tabix ~{outputGz} -p ~{type}
+        bgzip \
+        --threads ~{threads} \
+        --compress-level ~{compressLevel} \
+        -c ~{inputFile} > ~{outputGz}
+
+        tabix \
+        --preset ~{preset} \
+        --threads ~{threads - 1} \
+        ~{outputGz}
     }

     output {
@@ -44,24 +55,293 @@
     }

     runtime {
-        docker: dockerImage
+        cpu: threads
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }

     parameter_meta {
         # inputs
         inputFile: {description: "The file to be compressed and indexed.", category: "required"}
         outputDir: {description: "The directory in which the output will be placed.", category: "required"}
-        type: {description: "The type of file (eg. vcf or bed) to be compressed and indexed.", category: "common"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-                      category: "advanced"}
+        preset: {description: "The preset for the file (e.g. vcf or bed) to be compressed and indexed.", category: "common"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+        compressLevel: {description: "Set compression level.", category: "advanced"}
+        threads: {description: "The number of threads to use.", category: "advanced"}
+
+        # outputs
+        compressed: {description: "Compressed input file."}
+        index: {description: "Index of the compressed input file."}
+    }
+}
+
+task DictAndFaidx {
+    input {
+        File inputFile
+        String memory = "3GiB"
+        Int timeMinutes = 5 + ceil(size(inputFile, "GiB") * 5)
+        String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1"
+    }
+
+    String outputFile = basename(inputFile)
+    # Capture .fa, .fna and .fasta
+    String outputDict = sub(outputFile, "\.fn?as?t?a?$", "") + ".dict"
+    # This executes both dict and faidx, so indexes are co-located in the same folder.
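+    # The fasta is copied into the working directory first, so the .dict and
+    # .fai files are created alongside the copy that is returned as output;
+    # downstream tools generally expect the indexes next to the fasta.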
+ command <<< + set -e + cp ~{inputFile} ~{outputFile} + samtools dict -o ~{outputDict} ~{outputFile} + samtools faidx ~{outputFile} --fai-idx ~{outputFile}.fai + >>> + + output { + File outputFasta = outputFile + File outputFastaDict = outputDict + File outputFastaFai = outputFile + ".fai" + } + + runtime { + memory: memory + docker: dockerImage + time_minutes: timeMinutes + cpu: 1 + } + + parameter_meta { + # inputs + inputFile: {description: "The input fasta file.", category: "required"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + # outputs + outputFasta: {description: "Fasta file that is co-located with the indexes"} + outputFastaFai: {description: "Fasta index file for the outputFasta file."} + outputFastaDict: {description: "Sequence dictionary for the outputFasta file."} + } +} + +task Faidx { + input { + File inputFile + String outputDir + + String memory = "2GiB" + String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1" + } + + command { + set -e + mkdir -p "~{outputDir}" + ln -s ~{inputFile} "~{outputDir}/$(basename ~{inputFile})" + samtools faidx \ + "~{outputDir}/$(basename ~{inputFile})" + } + + output { + File outputIndex = outputDir + "/" + basename(inputFile) + ".fai" + } + + runtime { + memory: memory + docker: dockerImage + } + + parameter_meta { + # inputs + inputFile: {description: "The input fasta file.", category: "required"} + outputDir: {description: "Output directory path.", category: "required"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputIndex: {description: "Index of the input fasta file."} + } +} + +task Fastq { + input { + File inputBam + String outputRead1 + String? outputRead2 + String? outputRead0 + String? outputReadS + Boolean appendReadNumber = false + Boolean outputQuality = false + + Int? includeFilter + Int? excludeFilter + Int? excludeSpecificFilter + Int compressionLevel = 1 + + Int threads = 1 + String memory = "1GiB" + Int timeMinutes = 1 + ceil(size(inputBam, "GiB") * 2) + String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1" + } + + command { + set -e + mkdir -p "$(dirname ~{outputRead1})" + samtools collate -u -O ~{inputBam} | \ + samtools fastq \ + ~{"-1 " + outputRead1} \ + ~{"-2 " + outputRead2} \ + ~{"-0 " + outputRead0} \ + ~{"-s " + outputReadS} \ + ~{"-f " + includeFilter} \ + ~{"-F " + excludeFilter} \ + ~{"-G " + excludeSpecificFilter} \ + ~{true="-N" false="-n" appendReadNumber} \ + ~{true="-O" false="" outputQuality} \ + -c ~{compressionLevel} \ + "--threads " ~{threads - 1} + } + + output { + File read1 = outputRead1 + File? read2 = outputRead2 + File? read0 = outputRead0 + File? 
readS = outputReadS
+    }
+
+    runtime {
+        cpu: threads
+        memory: memory
+        docker: dockerImage
+        time_minutes: timeMinutes
+    }
+
+    parameter_meta {
+        # inputs
+        inputBam: {description: "The bam file to process.", category: "required"}
+        outputRead1: {description: "The location the reads (first reads for pairs, in case of paired-end sequencing) should be written to.", category: "required"}
+        outputRead2: {description: "The location the second reads from pairs should be written to.", category: "common"}
+        outputRead0: {description: "The location the unpaired reads should be written to (in case of paired-end sequencing).", category: "advanced"}
+        outputReadS: {description: "The location singleton reads should be written to.", category: "advanced"}
+        appendReadNumber: {description: "Append /1 and /2 to the read name, or don't. Corresponds to `-n/N`.", category: "advanced"}
+        outputQuality: {description: "Equivalent to samtools fastq's `-O` flag.", category: "advanced"}
+        includeFilter: {description: "Include reads with ALL of these flags. Corresponds to `-f`.", category: "advanced"}
+        excludeFilter: {description: "Exclude reads with ONE OR MORE of these flags. Corresponds to `-F`.", category: "advanced"}
+        excludeSpecificFilter: {description: "Exclude reads with ALL of these flags. Corresponds to `-G`.", category: "advanced"}
+        compressionLevel: {description: "Set compression level when writing gz or bgzf fastq files.", category: "advanced"}
+        threads: {description: "The number of threads to use.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        read1: {description: "Reads with the READ1 FLAG set."}
+        read2: {description: "Reads with the READ2 FLAG set."}
+        read0: {description: "Reads whose READ1 and READ2 FLAGs are either both set or both unset."}
+    }
+}
+
+task FilterShortReadsBam {
+    input {
+        File bamFile
+        String outputPathBam
+
+        String memory = "1GiB"
+        Int timeMinutes = 1 + ceil(size(bamFile, "GiB") * 8)
+        String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1"
+    }
+
+    String outputPathBamIndex = sub(outputPathBam, "\.bam$", ".bai")
+
+    command {
+        set -e
+        mkdir -p "$(dirname ~{outputPathBam})"
+        samtools view -h ~{bamFile} | \
+        awk 'length($10) > 30 || $1 ~/^@/' | \
+        samtools view -bS - > ~{outputPathBam}
+        samtools index ~{outputPathBam} ~{outputPathBamIndex}
+    }
+
+    output {
+        File filteredBam = outputPathBam
+        File filteredBamIndex = outputPathBamIndex
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # inputs
+        bamFile: {description: "The bam file to process.", category: "required"}
+        outputPathBam: {description: "The location the filtered BAM file should be written to.", category: "common"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + filteredBam: {description: "BAM file filtered for short reads."} + filteredBamIndex: {description: "Index of filtered BAM file."} + } +} + +task Flagstat { + input { + File inputBam + String outputPath + + Int threads = 1 + + String memory = "256MiB" # Only 40.5 MiB used for 150G bam file. + Int timeMinutes = 1 + ceil(size(inputBam, "G")) + String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + + samtools flagstat \ + --threads ~{threads - 1} \ + ~{inputBam} > ~{outputPath} + } + + output { + File flagstat = outputPath + } + + runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputBam: {description: "The BAM file for which statistics should be retrieved.", category: "required"} + outputPath: {description: "The location the ouput should be written to.", category: "required"} + memory: {description: "The amount of memory needed for the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + threads: {description: "The number of threads to use.", category: "advanced"} + + # outputs + flagstat: {description: "The number of alignments for each FLAG type."} } } task Index { input { File bamFile + String? outputBamPath - String dockerImage = "quay.io/biocontainers/samtools:1.8--h46bd0b3_5" + + Int threads = 1 + + String memory = "2GiB" + Int timeMinutes = 1 + ceil(size(bamFile, "GiB") * 4) + String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1" } # Select_first is needed, otherwise womtool validate fails. @@ -75,9 +355,11 @@ task Index { if [ ! -f ~{outputPath} ] then mkdir -p "$(dirname ~{outputPath})" - ln ~{bamFile} ~{outputPath} + ln ~{bamFile} ~{outputPath} || cp ~{bamFile} ~{outputPath} fi - samtools index ~{outputPath} ~{bamIndexPath} + samtools index \ + --threads ~{threads -1} \ + ~{outputPath} ~{bamIndexPath} ' } @@ -87,227 +369,310 @@ task Index { } runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs bamFile: {description: "The BAM file for which an index should be made.", category: "required"} - outputBamPath: {description: "The location where the BAM file should be written to. The index will appear alongside this link to the BAM file.", - category: "common"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + outputBamPath: {description: "The location where the BAM file should be written to. The index will appear alongside this link to the BAM file.", category: "common"} + memory: {description: "The amount of memory needed for the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + threads: {description: "The number of threads to use.", category: "advanced"} + + # outputs + indexedBam: {description: "BAM file that was indexed."} + index: {description: "Index of the input BAM file."} } } -task Merge { +task Markdup { input { - Array[File]+ bamFiles - String outputBamPath = "merged.bam" - Boolean force = true + File inputBam + String outputBamPath + Int threads = 1 - String dockerImage = "quay.io/biocontainers/samtools:1.8--h46bd0b3_5" + Int timeMinutes = 1 + ceil(size(inputBam, "GiB") * 2) + String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1" } - String indexPath = sub(outputBamPath, "\.bam$",".bai") command { set -e mkdir -p "$(dirname ~{outputBamPath})" - samtools merge ~{true="-f" false="" force} ~{outputBamPath} ~{sep=' ' bamFiles} - samtools index ~{outputBamPath} ~{indexPath} + samtools markdup \ + --threads ~{threads - 1} \ + ~{inputBam} ~{outputBamPath} } output { File outputBam = outputBamPath - File outputBamIndex = indexPath } runtime { + cpu: threads docker: dockerImage + time_minutes: timeMinutes } parameter_meta { # inputs - bamFiles: {description: "The BAM files to merge.", category: "required"} - outputBamPath: {description: "The location the merged BAM file should be written to.", category: "common"} - force: {description: "Equivalent to samtools merge's `-f` flag.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + inputBam: {description: "The BAM file to be processed.", category: "required"} + outputBamPath: {description: "The location of the output BAM file.", category: "required"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + threads: {description: "The number of threads to use.", category: "advanced"} + + # outputs + outputBam: {description: "BAM file with duplicate alignments marked."} } } -task SortByName { +task Merge { input { - File bamFile - String outputBamPath = "namesorted.bam" + Array[File]+ bamFiles + String outputBamPath = "merged.bam" + Boolean force = true + + Boolean combineRGHeaders = false + Boolean combinePGHeaders = false - String dockerImage = "quay.io/biocontainers/samtools:1.8--h46bd0b3_5" + Int compressionLevel = 1 + # Use one thread per input + one for the output + one for merging + Int threads = length(bamFiles) + 2 + String memory = "4GiB" + Int timeMinutes = 1 + ceil(size(bamFiles, "GiB") * 4) + String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1" } + String indexPath = sub(outputBamPath, "\.bam$",".bai") + + # Samtools uses additional threads for merge. 
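+    # Note: samtools' --threads flag counts *additional* threads on top of the
+    # main thread, which is why ~{threads - 1} is passed below; total usage then
+    # matches the `threads` value reserved in the runtime section.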
command { set -e mkdir -p "$(dirname ~{outputBamPath})" - samtools sort -n ~{bamFile} -o ~{outputBamPath} + samtools merge \ + --threads ~{threads - 1} \ + ~{true="-f" false="" force} \ + -l ~{compressionLevel} \ + ~{true="-c" false="" combineRGHeaders} \ + ~{true="-p" false="" combinePGHeaders} \ + ~{outputBamPath} ~{sep=' ' bamFiles} + samtools index -@ ~{threads - 1} ~{outputBamPath} ~{indexPath} } output { File outputBam = outputBamPath + File outputBamIndex = indexPath } runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - bamFile: {description: "The BAM file to get sorted.", category: "required"} - outputBamPath: {description: "The location the sorted BAM file should be written to.", category: "common"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + bamFiles: {description: "The BAM files to merge.", category: "required"} + outputBamPath: {description: "The location the merged BAM file should be written to.", category: "common"} + force: {description: "Equivalent to samtools merge's `-f` flag.", category: "advanced"} + + combineRGHeaders: {description: "Combine @RG headers with colliding IDs", category: "advanced"} + combinePGHeaders: {description: "Combine @PG headers with colliding IDs", category: "advanced"} + + compressionLevel: {description: "Compression level from 0 (uncompressed) to 9 (best).", category: "advanced"} + + threads: {description: "Number of threads to use.", category: "common"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "Multiple BAM files merged into one."} + outputBamIndex: {description: "Index of the merged BAM file."} } } -task Markdup { +task Quickcheck { input { File inputBam - String outputBamPath - String dockerImage = "quay.io/biocontainers/samtools:1.8--h46bd0b3_5" + String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1" } command { set -e - mkdir -p "$(dirname ~{outputBamPath})" - samtools markdup ~{inputBam} ~{outputBamPath} + samtools quickcheck ~{inputBam} } output { - File outputBam = outputBamPath + File outputBam = inputBam } runtime { + cpu: 1 + time_minutes: 5 + memory: "1GiB" docker: dockerImage } parameter_meta { # inputs - inputBam: {description: "The BAM file to be processed.", category: "required"} - outputBamPath: {description: "The location of the output BAM file.", category: "required"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + inputBam: {description: "The input BAM/SAM/CRAM file.", category: "required"} + + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "The exact same input file, but use this so it is recognised as a dependent task."} } } -task Flagstat { +task Sort { input { File inputBam - String outputPath + String outputPath = basename(inputBam, "\.bam") + ".sorted.bam" + Boolean sortByName = false + Int compressionLevel = 1 - String dockerImage = "quay.io/biocontainers/samtools:1.8--h46bd0b3_5" + Int memoryPerThreadGb = 4 + Int threads = 1 + Int memoryGb = 1 + threads * memoryPerThreadGb + Int timeMinutes = 1 + ceil(size(inputBam, "GiB") * 3) + String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1" } + # Select first needed as outputPath is optional input (bug in cromwell). + String bamIndexPath = sub(select_first([outputPath]), "\.bam$", ".bai") + command { set -e mkdir -p "$(dirname ~{outputPath})" - samtools flagstat ~{inputBam} > ~{outputPath} + samtools sort \ + -l ~{compressionLevel} \ + ~{true="-n" false="" sortByName} \ + ~{"--threads " + threads} \ + -m ~{memoryPerThreadGb}G \ + -o ~{outputPath} \ + ~{inputBam} + samtools index \ + --threads ~{threads - 1} \ + ~{outputPath} ~{bamIndexPath} } output { - File flagstat = outputPath + File outputBam = outputPath + File outputBamIndex = bamIndexPath } runtime { + cpu: threads + memory: "~{memoryGb}GiB" + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - inputBam: {description: "The BAM file for which statistics should be retrieved.", category: "required"} - outputPath: {description: "The location the ouput should be written to.", category: "required"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + inputBam: {description: "The input SAM file.", category: "required"} + outputPath: {description: "Output directory path + output file.", category: "required"} + sortByName: {description: "Sort the inputBam by read name instead of position.", category: "advanced"} + compressionLevel: {description: "Compression level from 0 (uncompressed) to 9 (best).", category: "advanced"} + memoryPerThreadGb: {description: "The amount of memory used per sort thread in gigabytes.", category: "advanced"} + threads: {description: "The number of threads that will be used for this task.", category: "advanced"} + memoryGb: {description: "The amount of memory available to the job in gigabytes.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "Sorted BAM file."} + outputBamIndex: {description: "Sorted BAM file index."} } } -task Fastq { +task Split { input { File inputBam - String outputRead1 - String? outputRead2 - String? outputRead0 - Int? includeFilter - Int? excludeFilter - Int? excludeSpecificFilter - Boolean appendReadNumber = false - Boolean outputQuality = false - Int? compressionLevel + String outputPath + String? unaccountedPath + String filenameFormat = "%!.%." 
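+        # With the default "%!.%." each read group is written to "<RG ID>.bam";
+        # see the filenameFormat entry in parameter_meta below for all tokens.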
+ + Int compressionLevel = 1 Int threads = 1 - String memory = "1G" - String dockerImage = "quay.io/biocontainers/samtools:1.8--h46bd0b3_5" + String memory = "1GiB" + Int timeMinutes = 1 + ceil(size(inputBam, "GiB") * 2) + String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1" } command { - samtools fastq \ - ~{true="-1" false="-s" defined(outputRead2)} ~{outputRead1} \ - ~{"-2 " + outputRead2} \ - ~{"-0 " + outputRead0} \ - ~{"-f " + includeFilter} \ - ~{"-F " + excludeFilter} \ - ~{"-G " + excludeSpecificFilter} \ - ~{true="-N" false="-n" appendReadNumber} \ - ~{true="-O" false="" outputQuality} \ - ~{"-c " + compressionLevel} \ - ~{"--threads " + threads} \ - ~{inputBam} + set -e + mkdir -p "~{outputPath}/rg/" + + samtools split \ + --output-fmt bam \ + --output-fmt-option level=~{compressionLevel} \ + -f "~{outputPath}/rg/~{filenameFormat}" \ + ~{"-u " + unaccountedPath} \ + --threads ~{threads - 1} \ + --write-index \ + ~{inputBam} } output { - File read1 = outputRead1 - File? read2 = outputRead2 - File? read0 = outputRead0 + Array[File] splitBam = glob(outputPath + "/rg/*.bam") + Array[File] splitBamIndex = glob(outputPath + "/rg/*.bam.csi") + File? unaccounted = unaccountedPath } runtime { cpu: threads memory: memory docker: dockerImage + time_minutes: timeMinutes } parameter_meta { # inputs - inputBam: {description: "The bam file to process.", category: "required"} - outputRead1: {description: "The location the reads (first reads for pairs, in case of paired-end sequencing) should be written to.", category: "required"} - outputRead2: {description: "The location the second reads from pairs should be written to.", category: "common"} - outputRead0: {description: "The location the unpaired reads should be written to (in case of paired-end sequenicng).", category: "advanced"} - includeFilter: {description: "Include reads with ALL of these flags. Corresponds to `-f`", category: "advanced"} - excludeFilter: {description: "Exclude reads with ONE OR MORE of these flags. Corresponds to `-F`", category: "advanced"} - excludeSpecificFilter: {description: "Exclude reads with ALL of these flags. Corresponds to `-G`", category: "advanced"} - appendReadNumber: {description: "Append /1 and /2 to the read name, or don't. Corresponds to `-n/N`", category: "advanced"} - outputQuality: {description: "Equivalent to samtools fastq's `-O` flag.", category: "advanced"} - threads: {description: "The number of threads to use.", category: "advanced"} - memory: {description: "The amount of memory this job will use.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + inputBam: {description: "The bam file to split.", category: "required"} + outputPath: {description: "Directory to store the output BAM files.", category: "required"} + + # Optional parameters + unaccountedPath: {description: "The location to write reads with no RG tag or an unrecognised RG tag.", category: "common"} + filenameFormat: {description: "Format of the filename. The following tokens can be used: %% a literal % sign, %* basename, %# @RG index, %! @RG ID, %. 
filename extension for output format.", category: "common"} + compressionLevel: {description: "Compression level for the output BAM files.", category: "advanced"} + + # outputs + splitBam: {description: "BAM files split by read group."} + splitBamIndex: {description: "Indexes of the split BAM files."} + unaccounted: {description: "Reads with no RG tag or an unrecognised RG tag."} } } task Tabix { input { File inputFile - String outputFilePath = "indexed.vcf.gz" - String type = "vcf" - String dockerImage = "quay.io/biocontainers/tabix:0.2.6--ha92aebf_0" + String outputFilePath = basename(inputFile) + String preset = "vcf" + + Int timeMinutes = 1 + ceil(size(inputFile, "GiB") * 2) + String dockerImage = "quay.io/biocontainers/htslib:1.21--h566b1c6_1" } - # FIXME: It is better to do the indexing on VCF creation. Not in a separate task. With file localization this gets hairy fast. + + # FIXME: It is better to do the indexing on VCF creation. + # Not in a separate task. With file localization this gets hairy fast. command { set -e mkdir -p "$(dirname ~{outputFilePath})" if [ ! -f ~{outputFilePath} ] then - ln ~{inputFile} ~{outputFilePath} + ln ~{inputFile} ~{outputFilePath} || cp ~{inputFile} ~{outputFilePath} fi - tabix ~{outputFilePath} -p ~{type} + tabix ~{outputFilePath} -p ~{preset} } output { @@ -316,50 +681,64 @@ task Tabix { } runtime { - docker: dockerImage + memory: "2GiB" + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { # inputs inputFile: {description: "The file to be indexed.", category: "required"} - outputFilePath: {description: "The location where the file should be written to. The index will appear alongside this link to the file.", - category: "common"} - type: {description: "The type of file (eg. vcf or bed) to be indexed.", category: "common"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + outputFilePath: {description: "The location where the file should be written to. The index will appear alongside this link to the file.", category: "common"} + preset: {description: "The preset for the file (e.g. vcf or bed) to be indexed.", category: "common"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + indexedFile: {description: "Indexed input file."} + index: {description: "Index of the input file."} } } task View { input { File inFile - File? referenceFasta String outputFileName = "view.bam" Boolean uncompressedBamOutput = false + + File? referenceFasta Int? includeFilter Int? excludeFilter Int? excludeSpecificFilter Int? MAPQthreshold + File? targetFile + + Boolean fast = true # Sets compression level to 1. Int threads = 1 - String memory = "1G" - String dockerImage = "quay.io/biocontainers/samtools:1.8--h46bd0b3_5" + String memory = "1GiB" + Int timeMinutes = 1 + ceil(size(inFile, "GiB") * 5) + String dockerImage = "quay.io/biocontainers/samtools:1.21--h96c455f_1" } + String outputIndexPath = basename(outputFileName) + ".bai" - # Always output to bam and output header + # Always output to bam and output header. + # -u should be after --fast, and will override it in that case. 
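+ # For example (an illustrative sketch with hypothetical file names; -u simply overrides --fast when both are passed): + # samtools view -b --fast in.bam -o out.bam # BAM at compression level 1 + # samtools view -b --fast -u in.bam -o out.bam # uncompressed BAM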
command { set -e mkdir -p "$(dirname ~{outputFileName})" samtools view -b \ ~{"-T " + referenceFasta} \ ~{"-o " + outputFileName} \ + ~{true="--fast" false="" fast} \ ~{true="-u " false="" uncompressedBamOutput} \ ~{"-f " + includeFilter} \ ~{"-F " + excludeFilter} \ ~{"-G " + excludeSpecificFilter} \ ~{"-q " + MAPQthreshold} \ - ~{"--threads " + (threads - 1)} \ + --threads ~{threads - 1} \ + ~{"--target-file " + targetFile} \ ~{inFile} samtools index ~{outputFileName} ~{outputIndexPath} } @@ -372,57 +751,29 @@ task View { runtime { cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs inFile: {description: "A BAM, SAM or CRAM file.", category: "required"} - referenceFasta: {description: "The reference fasta file also used for mapping.", category: "advanced"} outputFileName: {description: "The location the output BAM file should be written.", category: "common"} + fast: {description: "Sets compression level to 1. Set to true by default.", category: "common"} uncompressedBamOutput: {description: "Equivalent to samtools view's `-u` flag.", category: "advanced"} + referenceFasta: {description: "The reference fasta file also used for mapping.", category: "advanced"} includeFilter: {description: "Equivalent to samtools view's `-f` option.", category: "advanced"} excludeFilter: {description: "Equivalent to samtools view's `-F` option.", category: "advanced"} excludeSpecificFilter: {description: "Equivalent to samtools view's `-G` option.", category: "advanced"} MAPQthreshold: {description: "Equivalent to samtools view's `-q` option.", category: "advanced"} - + targetFile: {description: "A BED file with regions to include.", category: "advanced"} threads: {description: "The number of threads to use.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} - } -} - -task FilterShortReadsBam { - input { - File bamFile - String outputPathBam - String dockerImage = "quay.io/biocontainers/samtools:1.8--h46bd0b3_5" - } - - String outputPathBamIndex = sub(outputPathBam, "\.bam$", ".bai") - - command { - set -e - mkdir -p "$(dirname ~{outputPathBam})" - samtools view -h ~{bamFile} | \ - awk 'length($10) > 30 || $1 ~/^@/' | \ - samtools view -bS -> ~{outputPathBam} - samtools index ~{outputPathBam} ~{outputPathBamIndex} - } - - output { - File filteredBam = outputPathBam - File filteredBamIndex = outputPathBamIndex - } - - runtime { - docker: dockerImage - } - - parameter_meta { - bamFile: {description: "The bam file to process.", category: "required"} - outputPathBam: {description: "The filtered bam file.", category: "common"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "Processed input file."} + outputBamIndex: {description: "Index of the processed input file."} } } diff --git a/scripts b/scripts index dfef7cb2..4142daab 160000 --- a/scripts +++ b/scripts @@ -1 +1 @@ -Subproject commit dfef7cb2555667126dc1751add414527240d71bc +Subproject commit 4142daab81a7d9f28686b6a3299536757d381c81 diff --git a/seqtk.wdl b/seqtk.wdl index 321ab132..f6fa422b 100644 --- a/seqtk.wdl +++ b/seqtk.wdl @@ -24,11 +24,12 @@ task Sample { input { File sequenceFile String outFilePath = "subsampledReads.fq.gz" - String? preCommand - Int? seed Boolean twoPassMode = false - Float fractionOrNumber # when above 1.0 is the number of reads, otherwise it's a fraction + Float fractionOrNumber # When above 1.0 is the number of reads, otherwise it's a fraction. Boolean zip = true + + String? preCommand + Int? seed } command { @@ -47,4 +48,4 @@ task Sample { output { File subsampledReads = outFilePath } -} \ No newline at end of file +} diff --git a/sequali.wdl b/sequali.wdl new file mode 100644 index 00000000..cbca3653 --- /dev/null +++ b/sequali.wdl @@ -0,0 +1,73 @@ +version 1.0 + +# Copyright (c) 2024 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task Sequali { + input { + File reads + File? mate_reads + String outDir = "." 
+ + Int threads = 2 + String memory = "4GiB" + String dockerImage = "quay.io/biocontainers/sequali:0.12.0--py312hf67a6ed_0" + Int timeMinutes = 10 + ceil(size(reads, "GiB") + size(mate_reads, "GiB")) * 4 + } + + command <<< + set -e + mkdir -p ~{outDir} + sequali \ + --outdir ~{outDir} \ + --threads ~{threads} \ + ~{reads} \ + ~{mate_reads} + >>> + + output { + File html = outDir + "/" + basename(reads) + ".html" + File json = outDir + "/" + basename(reads) + ".json" + } + + runtime { + cpu: threads + memory: memory + docker: dockerImage + time_minutes: timeMinutes + } + + parameter_meta { + # inputs + reads: {description: "A FASTQ or BAM file.", category: "required"} + mate_reads: {description: "An optional FASTQ file with mate reads.", category: "common"} + threads: {description: "The number of cores to use.", category: "advanced"} + + outDir: {description: "The path to write the output to.", category: "common"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + html: {description: "HTML report file."} + json: {description: "JSON report file for use with MultiQC."} + } +} \ No newline at end of file diff --git a/smoove.wdl b/smoove.wdl new file mode 100644 index 00000000..7a1ac38b --- /dev/null +++ b/smoove.wdl @@ -0,0 +1,74 @@ +version 1.0 + +# Copyright (c) 2020 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
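+ +# Illustrative sketch of the underlying command the task below runs, using +# hypothetical file and sample names (see the command section): +# smoove call --outdir ./smoove --name sample1 --fasta reference.fa \ +# --removepr --genotype sample1.bam +# which is expected to write ./smoove/sample1-smoove.genotyped.vcf.gz, matching +# the smooveVcf output declared below.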
+ +task Call { + input { + File bamFile + File bamIndex + File referenceFasta + File referenceFastaFai + String sample + String outputDir = "./smoove" + + String memory = "15GiB" + Int timeMinutes = 1440 + String dockerImage = "quay.io/biocontainers/smoove:0.2.5--0" + } + + command { + set -e + mkdir -p ~{outputDir} + smoove call \ + --outdir ~{outputDir} \ + --name ~{sample} \ + --fasta ~{referenceFasta} \ + --removepr \ + --genotype \ + ~{bamFile} + } + + output { + File smooveVcf = outputDir + "/" + sample + "-smoove.genotyped.vcf.gz" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + bamFile: {description: "The bam file to process.", category: "required"} + bamIndex: {description: "The index of the bam file.", category: "required"} + referenceFasta: {description: "The reference fasta file also used for mapping.", category: "required"} + referenceFastaFai: {description: "Fasta index (.fai) file of the reference.", category: "required" } + sample: {description: "The name of the sample.", category: "required"} + outputDir: {description: "The location the output VCF file should be written.", category: "common"} + memory: {description: "The memory required to run the programs.", category: "advanced"} + timeMinutes: {description: "The maximum duration (in minutes) the tool is allowed to run.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + smooveVcf: {description: "Calls of structural variants in VCF file."} + } +} diff --git a/snpeff.wdl b/snpeff.wdl new file mode 100644 index 00000000..b972ab30 --- /dev/null +++ b/snpeff.wdl @@ -0,0 +1,111 @@ +version 1.0 + +# MIT License +# +# Copyright (c) 2020 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task SnpEff { + input { + File vcf + File vcfIndex + String genomeVersion + File datadirZip + String outputPath = "./snpeff.vcf" + Boolean hgvs = true + Boolean lof = true + Boolean noDownstream = false + Boolean noUpstream = false + Boolean noIntergenic = false + Boolean noShiftHgvs = false + Int? 
upDownStreamLen + + String memory = "9GiB" + String javaXmx = "8G" + Int timeMinutes = 60 + # Multicontainer with snpeff 5.2 and bgzip/tabix 1.19.1 + String dockerImage = "quay.io/biocontainers/mulled-v2-2fe536b56916bd1d61a6a1889eb2987d9ea0cd2f:c51b2e46bf63786b2d9a7a7d23680791163ab39a-0" + } + + # The output is compressed when outputPath ends in ".gz": stripping that suffix then changes the basename. + Boolean compressed = basename(outputPath) != basename(outputPath, ".gz") + + command { + set -e + ls ~{vcf} ~{vcfIndex} # dxCompiler localization workaround + mkdir -p "$(dirname ~{outputPath})" + unzip ~{datadirZip} + snpEff -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ + -v \ + ~{genomeVersion} \ + -noDownload \ + -dataDir $PWD/data \ + ~{vcf} \ + ~{true="-hgvs" false="-noHgvs" hgvs} \ + ~{true="-lof" false="-noLof" lof} \ + ~{true="-no-downstream" false="" noDownstream} \ + ~{true="-no-upstream" false="" noUpstream} \ + ~{true="-no-intergenic" false="" noIntergenic} \ + ~{true="-noShiftHgvs" false="" noShiftHgvs} \ + ~{"-upDownStreamLen " + upDownStreamLen} \ + ~{if compressed then "| bgzip " else ""} > ~{outputPath} + + ~{if compressed then "tabix ~{outputPath}" else ""} + rm -r $PWD/data + } + + output { + File outputVcf = outputPath + File? outputVcfIndex = outputPath + ".tbi" + } + + runtime { + docker: dockerImage + time_minutes: timeMinutes # !UnknownRuntimeKey + memory: memory + } + + parameter_meta { + # inputs + vcf: {description: "A VCF file to analyse.", category: "required"} + vcfIndex: {description: "The index for the VCF file.", category: "required"} + genomeVersion: {description: "The version of the genome to be used. The database for this genome must be present in the datadirZip.", category: "required"} + datadirZip: {description: "A zip file containing the directory of databases. This zip file must contain a directory called `data`, with the database mentioned in the genomeVersion input as subdirectory.", + category: "required"} + outputPath: {description: "The path to write the output to.", category: "common"} + hgvs: {description: "Equivalent to `-hgvs` if true or `-noHgvs` if false.", category: "advanced"} + lof: {description: "Equivalent to `-lof` if true or `-noLof` if false.", category: "advanced"} + noDownstream: {description: "Equivalent to the `-no-downstream` flag.", category: "advanced"} + noUpstream: {description: "Equivalent to the `-no-upstream` flag.", category: "advanced"} + noIntergenic: {description: "Equivalent to the `-no-intergenic` flag.", category: "advanced"} + noShiftHgvs: {description: "Equivalent to the `-noShiftHgvs` flag.", category: "advanced"} + upDownStreamLen: {description: "Equivalent to the `-upDownStreamLen` option.", category: "advanced"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + + # outputs + outputVcf: {description: "Annotated VCF file."} + outputVcfIndex: {description: "Index of annotated VCF file."} + } +} diff --git a/snpsift.wdl b/snpsift.wdl new file mode 100644 index 00000000..a62f7295 --- /dev/null +++ b/snpsift.wdl @@ -0,0 +1,84 @@ +version 1.0 + +# MIT License +# +# Copyright (c) 2025 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task SnpSiftFilter { + input { + File vcf + File? vcfIndex + String filterExpression + String outputPath = "./snpsift_filter.vcf" + + String memory = "9GiB" + String javaXmx = "8G" + Int timeMinutes = 60 + # Multicontainer with SnpSift 5.2 and bgzip/tabix 1.22 + String dockerImage = "quay.io/biocontainers/mulled-v2-d4bc0c23eb1d95c7ecff7f0e8b3a4255503fd5d4:c51b2e46bf63786b2d9a7a7d23680791163ab39a-0" + } + + Boolean compressed = basename(outputPath) != basename(outputPath, ".gz") + + command { + set -e + ls ~{vcf} ~{vcfIndex} # dxCompiler localization workaround + + mkdir -p "$(dirname ~{outputPath})" + SnpSift -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ + filter \ + "~{filterExpression}" \ + ~{vcf} \ + ~{if compressed then "| bgzip " else ""} > ~{outputPath} + + ~{if compressed then "tabix ~{outputPath}" else ""} + } + + output { + File outputVcf = outputPath + File? outputVcfIndex = outputPath + ".tbi" + } + + runtime { + docker: dockerImage + time_minutes: timeMinutes # !UnknownRuntimeKey + memory: memory + } + + parameter_meta { + # inputs + vcf: {description: "A VCF file to filter.", category: "required"} + vcfIndex: {description: "The index for the VCF file.", category: "common"} + filterExpression: {description: "The SnpSift filtering expression.", category: "required"} + outputPath: {description: "The path to write the output to.", category: "common"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + + # outputs + outputVcf: {description: "Filtered VCF file."} + outputVcfIndex: {description: "Index of filtered VCF file."} + } +} diff --git a/somaticseq.wdl b/somaticseq.wdl index 49e5c36d..7656d086 100644 --- a/somaticseq.wdl +++ b/somaticseq.wdl @@ -22,17 +22,18 @@ version 1.0 task ParallelPaired { input { - File? classifierSNV - File? classifierIndel String outputDir File referenceFasta File referenceFastaFai - File? inclusionRegion - File? exclusionRegion File tumorBam File tumorBamIndex File normalBam File normalBamIndex + + File? classifierSNV + File? classifierIndel + File? inclusionRegion + File? exclusionRegion File? mutect2VCF File? varscanSNV File? varscanIndel @@ -46,7 +47,9 @@ task ParallelPaired { File? strelkaSNV File? strelkaIndel + String memory = "2GiB" Int threads = 1 + Int timeMinutes = 60 String dockerImage = "lethalfang/somaticseq:3.1.0" } @@ -89,21 +92,24 @@ task ParallelPaired { runtime { cpu: threads + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - classifierSNV: {description: "A somaticseq SNV classifier.", category: "common"} - classifierIndel: {description: "A somaticseq Indel classifier.", category: "common"} + # inputs outputDir: {description: "The directory to write the output to.", category: "common"} referenceFasta: {description: "The reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} - inclusionRegion: {description: "A bed file describing regions to include.", category: "common"} - exclusionRegion: {description: "A bed file describing regions to exclude.", category: "common"} - normalBam: {description: "The normal/control sample's BAM file.", category: "required"} - normalBamIndex: {description: "The index for the normal/control sample's BAM file.", category: "required"} tumorBam: {description: "The tumor/case sample's BAM file.", category: "required"} tumorBamIndex: {description: "The index for the tumor/case sample's BAM file.", category: "required"} + normalBam: {description: "The normal/control sample's BAM file.", category: "required"} + normalBamIndex: {description: "The index for the normal/control sample's BAM file.", category: "required"} + classifierSNV: {description: "A somaticseq SNV classifier.", category: "common"} + classifierIndel: {description: "A somaticseq Indel classifier.", category: "common"} + inclusionRegion: {description: "A bed file describing regions to include.", category: "common"} + exclusionRegion: {description: "A bed file describing regions to exclude.", category: "common"} mutect2VCF: {description: "A VCF as produced by mutect2.", category: "advanced"} varscanSNV: {description: "An SNV VCF as produced by varscan.", category: "advanced"} varscanIndel: {description: "An indel VCF as produced by varscan.", category: "advanced"} @@ -116,10 +122,16 @@ task ParallelPaired { scalpelVCF: {description: "A VCF as produced by scalpel.", category: "advanced"} strelkaSNV: {description: "An SNV VCF as produced by strelka.", category: "advanced"} strelkaIndel: {description: "An indel VCF as produced by somaticsniper.", category: "advanced"} - threads: {description: "The number of threads to use.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + indels: {description: ""} + snvs: {description: ""} + ensembleIndels: {description: ""} + ensembleSNV: {description: ""} } } @@ -130,12 +142,13 @@ task ParallelPairedTrain { String outputDir File referenceFasta File referenceFastaFai - File? inclusionRegion - File? exclusionRegion File tumorBam File tumorBamIndex File normalBam File normalBamIndex + + File? inclusionRegion + File? exclusionRegion File? mutect2VCF File? varscanSNV File? varscanIndel @@ -149,7 +162,9 @@ task ParallelPairedTrain { File? strelkaSNV File? strelkaIndel + String memory = "2GiB" Int threads = 1 + Int timeMinutes = 240 String dockerImage = "lethalfang/somaticseq:3.1.0" } @@ -191,21 +206,24 @@ task ParallelPairedTrain { runtime { cpu: threads + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs truthSNV: {description: "A VCF of true SNVs.", category: "required"} truthIndel: {description: "A VCF of true indels.", category: "required"} outputDir: {description: "The directory to write the output to.", category: "common"} referenceFasta: {description: "The reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} - inclusionRegion: {description: "A bed file describing regions to include.", category: "common"} - exclusionRegion: {description: "A bed file describing regions to exclude.", category: "common"} - normalBam: {description: "The normal/control sample's BAM file.", category: "required"} - normalBamIndex: {description: "The index for the normal/control sample's BAM file.", category: "required"} tumorBam: {description: "The tumor/case sample's BAM file.", category: "required"} tumorBamIndex: {description: "The index for the tumor/case sample's BAM file.", category: "required"} + normalBam: {description: "The normal/control sample's BAM file.", category: "required"} + normalBamIndex: {description: "The index for the normal/control sample's BAM file.", category: "required"} + inclusionRegion: {description: "A bed file describing regions to include.", category: "common"} + exclusionRegion: {description: "A bed file describing regions to exclude.", category: "common"} mutect2VCF: {description: "A VCF as produced by mutect2.", category: "advanced"} varscanSNV: {description: "An SNV VCF as produced by varscan.", category: "advanced"} varscanIndel: {description: "An indel VCF as produced by varscan.", category: "advanced"} @@ -218,24 +236,33 @@ task ParallelPairedTrain { scalpelVCF: {description: "A VCF as produced by scalpel.", category: "advanced"} strelkaSNV: {description: "An SNV VCF as produced by strelka.", category: "advanced"} strelkaIndel: {description: "An indel VCF as produced by somaticsniper.", category: "advanced"} - threads: {description: "The number of threads to use.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + consensusIndels: {description: ""} + consensusSNV: {description: ""} + ensembleIndels: {description: ""} + ensembleSNV: {description: ""} + ensembleIndelsClassifier: {description: ""} + ensembleSNVClassifier: {description: ""} } } task ParallelSingle { input { - File? classifierSNV - File? classifierIndel + File bam + File bamIndex String outputDir File referenceFasta File referenceFastaFai + + File? classifierSNV + File? classifierIndel File? inclusionRegion File? exclusionRegion - File bam - File bamIndex File? mutect2VCF File? varscanVCF File? vardictVCF @@ -243,7 +270,9 @@ task ParallelSingle { File? scalpelVCF File? strelkaVCF + String memory = "2GiB" Int threads = 1 + Int timeMinutes = 60 String dockerImage = "lethalfang/somaticseq:3.1.0" } @@ -279,43 +308,53 @@ task ParallelSingle { runtime { cpu: threads + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - classifierSNV: {description: "A somaticseq SNV classifier.", category: "common"} - classifierIndel: {description: "A somaticseq Indel classifier.", category: "common"} + # inputs + bam: {description: "The input BAM file.", category: "required"} + bamIndex: {description: "The index for the input BAM file.", category: "required"} outputDir: {description: "The directory to write the output to.", category: "common"} referenceFasta: {description: "The reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + classifierSNV: {description: "A somaticseq SNV classifier.", category: "common"} + classifierIndel: {description: "A somaticseq Indel classifier.", category: "common"} inclusionRegion: {description: "A bed file describing regions to include.", category: "common"} exclusionRegion: {description: "A bed file describing regions to exclude.", category: "common"} - bam: {description: "The input BAM file.", category: "required"} - bamIndex: {description: "The index for the input BAM file.", category: "required"} mutect2VCF: {description: "A VCF as produced by mutect2.", category: "advanced"} varscanVCF: {description: "A VCF as produced by varscan.", category: "advanced"} vardictVCF: {description: "A VCF as produced by vardict.", category: "advanced"} lofreqVCF: {description: "A VCF as produced by lofreq.", category: "advanced"} scalpelVCF: {description: "A VCF as produced by scalpel.", category: "advanced"} strelkaVCF: {description: "A VCF as produced by strelka.", category: "advanced"} - threads: {description: "The number of threads to use.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + indels: {description: ""} + snvs: {description: ""} + ensembleIndels: {description: ""} + ensembleSNV: {description: ""} } } task ParallelSingleTrain { input { + File bam + File bamIndex File truthSNV File truthIndel String outputDir File referenceFasta File referenceFastaFai + File? inclusionRegion File? exclusionRegion - File bam - File bamIndex File? mutect2VCF File? varscanVCF File? vardictVCF @@ -323,7 +362,9 @@ task ParallelSingleTrain { File? scalpelVCF File? strelkaVCF + String memory = "2GiB" Int threads = 1 + Int timeMinutes = 240 String dockerImage = "lethalfang/somaticseq:3.1.0" } @@ -358,10 +399,15 @@ task ParallelSingleTrain { runtime { cpu: threads + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs + bam: {description: "The input BAM file.", category: "required"} + bamIndex: {description: "The index for the input BAM file.", category: "required"} truthSNV: {description: "A VCF of true SNVs.", category: "required"} truthIndel: {description: "A VCF of true indels.", category: "required"} outputDir: {description: "The directory to write the output to.", category: "common"} @@ -369,18 +415,24 @@ task ParallelSingleTrain { referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} inclusionRegion: {description: "A bed file describing regions to include.", category: "common"} exclusionRegion: {description: "A bed file describing regions to exclude.", category: "common"} - bam: {description: "The input BAM file.", category: "required"} - bamIndex: {description: "The index for the input BAM file.", category: "required"} mutect2VCF: {description: "A VCF as produced by mutect2.", category: "advanced"} varscanVCF: {description: "A VCF as produced by varscan.", category: "advanced"} vardictVCF: {description: "A VCF as produced by vardict.", category: "advanced"} lofreqVCF: {description: "A VCF as produced by lofreq.", category: "advanced"} scalpelVCF: {description: "A VCF as produced by scalpel.", category: "advanced"} strelkaVCF: {description: "A VCF as produced by strelka.", category: "advanced"} - threads: {description: "The number of threads to use.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + consensusIndels: {description: ""} + consensusSNV: {description: ""} + ensembleIndels: {description: ""} + ensembleSNV: {description: ""} + ensembleIndelsClassifier: {description: ""} + ensembleSNVClassifier: {description: ""} } } @@ -388,16 +440,17 @@ task ModifyStrelka { input { File strelkaVCF String outputVCFName = basename(strelkaVCF, ".gz") + + String memory = "2GiB" + Int timeMinutes = 20 String dockerImage = "lethalfang/somaticseq:3.1.0" } command { set -e - /opt/somaticseq/vcfModifier/modify_Strelka.py \ -infile ~{strelkaVCF} \ -outfile "modified_strelka.vcf" - first_FORMAT_line_num=$(grep -n -m 1 '##FORMAT' "modified_strelka.vcf" | cut -d : -f 1) sed "$first_FORMAT_line_num"'i##FORMAT=' "modified_strelka.vcf" > ~{outputVCFName} } @@ -407,13 +460,20 @@ task ModifyStrelka { } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs strelkaVCF: {description: "A vcf file as produced by strelka.", category: "required"} outputVCFName: {description: "The location the output VCF file should be written to.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputVcf: {description: ""} } } diff --git a/spades.wdl b/spades.wdl index 204dbfea..d717ab28 100644 --- a/spades.wdl +++ b/spades.wdl @@ -22,10 +22,11 @@ version 1.0 task Spades { input { - String outputDir - String? preCommand File read1 File? read2 + String outputDir + + String? preCommand File? interlacedReads File? sangerReads File? pacbioReads @@ -33,23 +34,24 @@ task Spades { File? tslrContigs File? trustedContigs File? untrustedContigs - Boolean? singleCell - Boolean? metagenomic - Boolean? rna - Boolean? plasmid - Boolean? ionTorrent - Boolean? onlyErrorCorrection - Boolean? onlyAssembler - Boolean? careful - Boolean? disableGzipOutput - Boolean? disableRepeatResolution + Boolean singleCell = false + Boolean metagenomic = false + Boolean rna = false + Boolean plasmid = false + Boolean ionTorrent = false + Boolean onlyErrorCorrection = false + Boolean onlyAssembler = false + Boolean careful = false + Boolean disableGzipOutput = false + Boolean disableRepeatResolution = false File? dataset - Int threads = 1 - Int memoryGb = 16 File? tmpDir String? k Float? covCutoff Int? phredOffset + + Int threads = 1 + Int memoryGb = 16 } command { @@ -98,6 +100,6 @@ task Spades { runtime { cpu: threads - memory: "~{memoryGb}G" + memory: "~{memoryGb}GiB" } -} \ No newline at end of file +} diff --git a/star.wdl b/star.wdl index e1e55a26..88d3c838 100644 --- a/star.wdl +++ b/star.wdl @@ -20,14 +20,107 @@ version 1.0 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +task GenomeGenerate { + input { + String genomeDir = "STAR_index" + File referenceFasta + + File? referenceGtf + Int? 
sjdbOverhang + + Int threads = 4 + String memory = "32GiB" + Int timeMinutes = ceil(size(referenceFasta, "GiB") * 240 / threads) + String dockerImage = "quay.io/biocontainers/star:2.7.3a--0" + } + + command { + set -e + mkdir -p ~{genomeDir} + STAR \ + --runMode genomeGenerate \ + --runThreadN ~{threads} \ + --genomeDir ~{genomeDir} \ + --genomeFastaFiles ~{referenceFasta} \ + ~{"--sjdbGTFfile " + referenceGtf} \ + ~{"--sjdbOverhang " + sjdbOverhang} + } + + output { + File chrLength = "~{genomeDir}/chrLength.txt" + File chrNameLength = "~{genomeDir}/chrNameLength.txt" + File chrName = "~{genomeDir}/chrName.txt" + File chrStart = "~{genomeDir}/chrStart.txt" + File genome = "~{genomeDir}/Genome" + File genomeParameters = "~{genomeDir}/genomeParameters.txt" + File sa = "~{genomeDir}/SA" + File saIndex = "~{genomeDir}/SAindex" + File? exonGeTrInfo = "~{genomeDir}/exonGeTrInfo.tab" + File? exonInfo = "~{genomeDir}/exonInfo.tab" + File? geneInfo = "~{genomeDir}/geneInfo.tab" + File? sjdbInfo = "~{genomeDir}/sjdbInfo.txt" + File? sjdbListFromGtfOut = "~{genomeDir}/sjdbList.fromGTF.out.tab" + File? sjdbListOut = "~{genomeDir}/sjdbList.out.tab" + File? transcriptInfo = "~{genomeDir}/transcriptInfo.tab" + Array[File] starIndex = select_all([chrLength, chrNameLength, chrName, + chrStart, genome, genomeParameters, + sa, saIndex, exonGeTrInfo, exonInfo, + geneInfo, sjdbInfo, sjdbListFromGtfOut, + sjdbListOut, transcriptInfo]) + } + + runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + genomeDir: {description:"The directory the STAR index should be written to.", category: "common"} + referenceFasta: {description: "The reference Fasta file.", category: "required"} + referenceGtf: {description: "The reference GTF file.", category: "common"} + sjdbOverhang: {description: "Equivalent to STAR's `--sjdbOverhang` option.", category: "advanced"} + threads: {description: "The number of threads to use.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + chrLength: {description: "Text chromosome lengths file."} + chrNameLength: {description: "Text chromosome name lengths file."} + chrName: {description: "Text chromosome names file."} + chrStart: {description: "Chromosome start sites file."} + genome: {description: "Binary genome sequence file."} + genomeParameters: {description: "Genome parameters file."} + sa: {description: "Suffix arrays file."} + saIndex: {description: "Index file of suffix arrays."} + exonGeTrInfo: {description: "Exon, gene and transcript information file."} + exonInfo: {description: "Exon information file."} + geneInfo: {description: "Gene information file."} + sjdbInfo: {description: "Splice junctions coordinates file."} + sjdbListFromGtfOut: {description: "Splice junctions from input GTF file."} + sjdbListOut: {description: "Splice junction list file."} + transcriptInfo: {description: "Transcripts information file."} + starIndex: {description: "A collection of all STAR index files."} + } +} + task Star { input { Array[File]+ inputR1 - Array[File]? 
inputR2 + Array[File] inputR2 = [] Array[File]+ indexFiles String outFileNamePrefix String outSAMtype = "BAM SortedByCoordinate" String readFilesCommand = "zcat" + Int outBAMcompression = 1 + + Int? outFilterScoreMin + Float? outFilterScoreMinOverLread + Int? outFilterMatchNmin + Float? outFilterMatchNminOverLread String? outStd String? twopassMode = "Basic" Array[String]? outSAMattrRGline @@ -35,11 +128,19 @@ task Star { Int? limitBAMsortRAM Int runThreadN = 4 - String memory = "48G" + String? memory + # 1 minute initialization + time reading in index (1 minute per G) + time aligning data. + Int timeMinutes = 1 + ceil(size(indexFiles, "GiB")) + ceil(size(flatten([inputR1, inputR2]), "GiB") * 300 / runThreadN) String dockerImage = "quay.io/biocontainers/star:2.7.3a--0" } - #TODO Could be extended for all possible output extensions + # Use a margin of 30% index size. Real memory usage is ~30 GiB for a 27 GiB index. + Int memoryGb = 1 + ceil(size(indexFiles, "GiB") * 1.3) + # For some reason doing above calculation inside a string does not work. + # So we solve it with an optional memory string and using select_first + # in the runtime section. + + #TODO: Could be extended for all possible output extensions. Map[String, String] samOutputNames = {"BAM SortedByCoordinate": "sortedByCoord.out.bam"} command { @@ -50,7 +151,12 @@ task Star { --outFileNamePrefix ~{outFileNamePrefix} \ --genomeDir ~{sub(indexFiles[0], basename(indexFiles[0]), "")} \ --outSAMtype ~{outSAMtype} \ + --outBAMcompression ~{outBAMcompression} \ --readFilesCommand ~{readFilesCommand} \ + ~{"--outFilterScoreMin " + outFilterScoreMin} \ + ~{"--outFilterScoreMinOverLread " + outFilterScoreMinOverLread} \ + ~{"--outFilterMatchNmin " + outFilterMatchNmin} \ + ~{"--outFilterMatchNminOverLread " + outFilterMatchNminOverLread} \ ~{"--outSAMunmapped " + outSAMunmapped} \ ~{"--runThreadN " + runThreadN} \ ~{"--outStd " + outStd} \ @@ -61,21 +167,29 @@ task Star { output { File bamFile = outFileNamePrefix + "Aligned." + samOutputNames[outSAMtype] + File logFinalOut = outFileNamePrefix + "Log.final.out" } runtime { cpu: runThreadN - memory: memory + memory: select_first([memory, "~{memoryGb}GiB"]) + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs inputR1: {description: "The first-/single-end FastQ files.", category: "required"} inputR2: {description: "The second-end FastQ files (in the same order as the first-end files).", category: "common"} indexFiles: {description: "The star index files.", category: "required"} outFileNamePrefix: {description: "The prefix for the output files. May include directories.", category: "required"} outSAMtype: {description: "The type of alignment file to be produced. 
Currently only `BAM SortedByCoordinate` is supported.", category: "advanced"} readFilesCommand: {description: "Equivalent to star's `--readFilesCommand` option.", category: "advanced"} + outBAMcompression: {description: "The compression level of the output BAM.", category: "advanced"} + outFilterScoreMin: {description: "Equivalent to star's `--outFilterScoreMin` option.", category: "advanced"} + outFilterScoreMinOverLread: {description: "Equivalent to star's `--outFilterScoreMinOverLread` option.", category: "advanced"} + outFilterMatchNmin: {description: "Equivalent to star's `--outFilterMatchNmin` option.", category: "advanced"} + outFilterMatchNminOverLread: {description: "Equivalent to star's `--outFilterMatchNminOverLread` option.", category: "advanced"} outStd: {description: "Equivalent to star's `--outStd` option.", category: "advanced"} twopassMode: {description: "Equivalent to star's `--twopassMode` option.", category: "advanced"} outSAMattrRGline: {description: "The readgroup lines for the fastq pairs given (in the same order as the fastq files).", category: "common"} @@ -83,8 +197,12 @@ task Star { limitBAMsortRAM: {description: "Equivalent to star's `--limitBAMsortRAM` option.", category: "advanced"} runThreadN: {description: "The number of threads to use.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + bamFile: {description: "Alignment file."} + logFinalOut: {description: "Log information file."} } } diff --git a/strelka.wdl b/strelka.wdl index 826cbd8e..39afe172 100644 --- a/strelka.wdl +++ b/strelka.wdl @@ -29,13 +29,15 @@ task Germline { Array[File]+ indexes File referenceFasta File referenceFastaFai - File? callRegions - File? callRegionsIndex Boolean exome = false Boolean rna = false + File? callRegions + File? 
callRegionsIndex + Int cores = 1 Int memoryGb = 4 + Int timeMinutes = 90 String dockerImage = "quay.io/biocontainers/strelka:2.9.7--0" } @@ -60,26 +62,31 @@ task Germline { } runtime { - docker: dockerImage cpu: cores - memory: "~{memoryGb}G" + memory: "~{memoryGb}GiB" + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { + # inputs runDir: {description: "The directory to use as run/output directory.", category: "common"} bams: {description: "The input BAM files.", category: "required"} indexes: {description: "The indexes for the input BAM files.", category: "required"} referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} - callRegions: {description: "The bed file which indicates the regions to operate on.", category: "common"} - callRegionsIndex: {description: "The index of the bed file which indicates the regions to operate on.", category: "common"} exome: {description: "Whether or not the data is from exome sequencing.", category: "common"} rna: {description: "Whether or not the data is from RNA sequencing.", category: "common"} - + callRegions: {description: "The bed file which indicates the regions to operate on.", category: "common"} + callRegionsIndex: {description: "The index of the bed file which indicates the regions to operate on.", category: "common"} cores: {description: "The number of cores to use.", category: "advanced"} memoryGb: {description: "The amount of memory this job will use in Gigabytes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + variants: {description: "Output VCF file."} + variantsIndex: {description: "Index of output VCF file."} } } @@ -92,14 +99,16 @@ task Somatic { File tumorBamIndex File referenceFasta File referenceFastaFai + Boolean exome = false + File? callRegions File? callRegionsIndex File? indelCandidatesVcf File? indelCandidatesVcfIndex - Boolean exome = false Int cores = 1 Int memoryGb = 4 + Int timeMinutes = 90 String dockerImage = "quay.io/biocontainers/strelka:2.9.7--0" File? 
doNotDefineThis #FIXME @@ -129,12 +138,14 @@ task Somatic { } runtime { - docker: dockerImage cpu: cores - memory: "~{memoryGb}G" + memory: "~{memoryGb}GiB" + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { + # inputs runDir: {description: "The directory to use as run/output directory.", category: "common"} normalBam: {description: "The normal/control sample's BAM file.", category: "required"} normalBamIndex: {description: "The index for the normal/control sample's BAM file.", category: "required"} @@ -142,16 +153,21 @@ task Somatic { tumorBamIndex: {description: "The index for the tumor/case sample's BAM file.", category: "required"} referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + exome: {description: "Whether or not the data is from exome sequencing.", category: "common"} callRegions: {description: "The bed file which indicates the regions to operate on.", category: "common"} callRegionsIndex: {description: "The index of the bed file which indicates the regions to operate on.", category: "common"} indelCandidatesVcf: {description: "An indel candidates VCF file from manta.", category: "advanced"} indelCandidatesVcfIndex: {description: "The index for the indel candidates VCF file.", category: "advanced"} - exome: {description: "Whether or not the data is from exome sequencing.", category: "common"} - cores: {description: "The number of cores to use.", category: "advanced"} memoryGb: {description: "The amount of memory this job will use in Gigabytes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + indelsVcf: {description: "VCF containing INDELS."} + indelsIndex: {description: "Index of output `indelsVcf`."} + variants: {description: "VCF containing variants."} + variantsIndex: {description: "Index of output `variants`."} } meta { @@ -159,4 +175,4 @@ task Somatic { exclude: ["doNotDefineThis"] } } -} \ No newline at end of file +} diff --git a/stringtie.wdl b/stringtie.wdl index cfaccc92..fbe7e442 100644 --- a/stringtie.wdl +++ b/stringtie.wdl @@ -24,16 +24,19 @@ task Stringtie { input { File bam File bamIndex - File? referenceGtf Boolean skipNovelTranscripts = false String assembledTranscriptsFile + + File? referenceGtf Boolean? firstStranded Boolean? secondStranded String? geneAbundanceFile + Float? 
minimumCoverage Int threads = 1 - String memory = "10G" - String dockerImage = "quay.io/biocontainers/stringtie:1.3.4--py35_0" + String memory = "2GiB" + Int timeMinutes = 1 + ceil(size(bam, "G") * 60 / threads) + String dockerImage = "quay.io/biocontainers/stringtie:1.3.6--h92e31bf_0" } command { @@ -45,6 +48,7 @@ task Stringtie { ~{true="-e" false="" skipNovelTranscripts} \ ~{true="--rf" false="" firstStranded} \ ~{true="--fr" false="" secondStranded} \ + ~{"-c " + minimumCoverage} \ -o ~{assembledTranscriptsFile} \ ~{"-A " + geneAbundanceFile} \ ~{bam} @@ -58,54 +62,29 @@ task Stringtie { runtime { cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - bam: { - description: "The input BAM file.", - category: "required" - } - bamIndex: { - description: "The input BAM file's index.", - category: "required" - } - referenceGtf: { - description: "A reference GTF file to be used as guide.", - category: "common" - } - skipNovelTranscripts: { - description: "Whether new transcripts should be assembled or not.", - category: "common" - } - assembledTranscriptsFile: { - description: "Where the output of the assembly should be written.", - category: "required" - } - firstStranded: { - description: "Equivalent to the --rf flag of stringtie.", - category: "required" - } - secondStranded: { - description: "Equivalent to the --fr flag of stringtie.", - category: "required" - } - geneAbundanceFile: { - description: "Where the abundance file should be written.", - category: "common" - } - threads: { - description: "The number of threads to use.", - category: "advanced" - } - memory: { - description: "The amount of memory needed for this task in GB.", - category: "advanced" - } - dockerImage: { - description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced" - } + # inputs + bam: {description: "The input BAM file.", category: "required"} + bamIndex: {description: "The input BAM file's index.", category: "required"} + skipNovelTranscripts: {description: "Whether new transcripts should be assembled or not.", category: "common"} + assembledTranscriptsFile: {description: "Where the output of the assembly should be written.", category: "required"} + referenceGtf: {description: "A reference GTF file to be used as guide.", category: "common"} + firstStranded: {description: "Equivalent to the --rf flag of stringtie.", category: "required"} + secondStranded: {description: "Equivalent to the --fr flag of stringtie.", category: "required"} + geneAbundanceFile: {description: "Where the abundance file should be written.", category: "common"} + minimumCoverage: {description: "The minimum coverage for a transcript to be shown in the output.", category: "advanced"} + threads: {description: "The number of threads to use.", category: "advanced"} + memory: {description: "The amount of memory needed for this task in GB.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + assembledTranscripts: {description: "GTF file containing the assembled transcripts."} + geneAbundance: {description: "Gene abundances in tab-delimited format."} } } @@ -113,17 +92,19 @@ task Merge { input { Array[File]+ gtfFiles String outputGtfPath + Boolean keepMergedTranscriptsWithRetainedIntrons = false + File? guideGtf Int? minimumLength Float? minimumCoverage Float? minimumFPKM Float? minimumTPM Float? minimumIsoformFraction - Boolean keepMergedTranscriptsWithRetainedIntrons = false String? label - String memory = "10G" - String dockerImage = "quay.io/biocontainers/stringtie:1.3.4--py35_0" + String memory = "10GiB" + Int timeMinutes = 1 + ceil(size(gtfFiles, "G") * 20) + String dockerImage = "quay.io/biocontainers/stringtie:2.1.4--h7e0af3c_0" } command { @@ -148,57 +129,27 @@ task Merge { runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { - gtfFiles: { - description: "The GTF files produced by stringtie.", - category: "required" - } - outputGtfPath: { - description: "Where the output should be written.", - category: "required" - } - guideGtf: { - description: "Equivalent to the -G option of 'stringtie --merge'.", - category: "advanced" - } - minimumLength: { - description: "Equivalent to the -m option of 'stringtie --merge'.", - category: "advanced" - } - minimumCoverage: { - description: "Equivalent to the -c option of 'stringtie --merge'.", - category: "advanced" - } - minimumFPKM: { - description: "Equivalent to the -F option of 'stringtie --merge'.", - category: "advanced" - } - minimumTPM: { - description: "Equivalent to the -T option of 'stringtie --merge'.", - category: "advanced" - } - minimumIsoformFraction: { - description: "Equivalent to the -f option of 'stringtie --merge'.", - category: "advanced" - } - keepMergedTranscriptsWithRetainedIntrons: { - description: "Equivalent to the -i flag of 'stringtie --merge'.", - category: "advanced" - } - label: { - description: "Equivalent to the -l option of 'stringtie --merge'.", - category: "advanced" - } - memory: { - description: "The amount of memory needed for this task in GB.", - category: "advanced" - } - dockerImage: { - description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", - category: "advanced" - } + # inputs + gtfFiles: {description: "The GTF files produced by stringtie.", category: "required"} + outputGtfPath: {description: "Where the output should be written.", category: "required"} + keepMergedTranscriptsWithRetainedIntrons: {description: "Equivalent to the -i flag of 'stringtie --merge'.", category: "advanced"} + guideGtf: {description: "Equivalent to the -G option of 'stringtie --merge'.", category: "advanced"} + minimumLength: {description: "Equivalent to the -m option of 'stringtie --merge'.", category: "advanced"} + minimumCoverage: {description: "Equivalent to the -c option of 'stringtie --merge'.", category: "advanced"} + minimumFPKM: {description: "Equivalent to the -F option of 'stringtie --merge'.", category: "advanced"} + minimumTPM: {description: "Equivalent to the -T option of 'stringtie --merge'.", category: "advanced"} + minimumIsoformFraction: {description: "Equivalent to the -f option of 'stringtie --merge'.", category: "advanced"} + label: {description: "Equivalent to the -l option of 'stringtie --merge'.", category: "advanced"} + memory: {description: "The amount of memory needed for this task in GB.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + mergedGtfFile: {description: "A merged GTF file from a set of GTF files."} } } diff --git a/survivor.wdl b/survivor.wdl index ded11d75..ae246f60 100644 --- a/survivor.wdl +++ b/survivor.wdl @@ -1,7 +1,5 @@ version 1.0 -# MIT License -# # Copyright (c) 2018 Leiden University Medical Center # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -27,13 +25,15 @@ task Merge { Array[File] filePaths Int breakpointDistance = 1000 Int suppVecs = 2 - Int svType = 1 - Int strandType = 1 - Int distanceBySvSize = 0 + Boolean svType = true + Boolean strandType = true + Boolean distanceBySvSize = false Int minSize = 30 String outputPath = "./survivor/merged.vcf" - String memory = "24G" - String dockerImage = "quay.io/biocontainers/survivor:1.0.6--h6bb024c_0" + + String memory = "24GiB" + Int timeMinutes = 60 + String dockerImage = "quay.io/biocontainers/survivor:1.0.7--hd03093a_2" } command { @@ -44,9 +44,9 @@ task Merge { fileList \ ~{breakpointDistance} \ ~{suppVecs} \ - ~{svType} \ - ~{strandType} \ - ~{distanceBySvSize} \ + ~{true='1' false='0' svType} \ + ~{true='1' false='0' strandType} \ + ~{true='1' false='0' distanceBySvSize} \ ~{minSize} \ ~{outputPath} } @@ -57,20 +57,25 @@ task Merge { runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - filePaths: {description: "An array of VCF files (predictions) to be merged by SURVIVOR", category: "required"} - breakpointDistance: {description: "The distance between pairwise breakpoints between SVs", category: "advanced"} - suppVecs: {description: "The minimum number of SV callers to support the merging", category: "advanced"} - svType: {description: "A boolean to include the type SV to be merged", category: "advanced"} - strandType: {description: "A boolean to include strand type of an SV to be merged", category: "advanced"} - distanceBySvSize: {description: "A boolean to predict the pairwise distance between the SVs based 
on their size", category: "advanced"}
-        minSize: {description: "The mimimum size of SV to be merged", category: "advanced"}
+        filePaths: {description: "An array of VCF files (predictions) to be merged by SURVIVOR.", category: "required"}
+        breakpointDistance: {description: "The distance between pairwise breakpoints between SVs.", category: "advanced"}
+        suppVecs: {description: "The minimum number of SV callers to support the merging.", category: "advanced"}
+        svType: {description: "Whether the type of the SVs should be taken into account when merging.", category: "advanced"}
+        strandType: {description: "Whether the strand of the SVs should be taken into account when merging.", category: "advanced"}
+        distanceBySvSize: {description: "Whether the pairwise distance between the SVs should be estimated based on their size.", category: "advanced"}
+        minSize: {description: "The minimum size of SVs to be merged.", category: "advanced"}
         outputPath: {description: "The location the output VCF file should be written.", category: "common"}
-        memory: {description: "The memory required to run the programs", category: "advanced"}
+        memory: {description: "The memory required to run the programs.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        mergedVcf: {description: "The merged VCF file, containing the variants from all VCF files in fileList."}
     }
}
diff --git a/talon.wdl b/talon.wdl
index 6ddb841e..2f93e36b 100644
--- a/talon.wdl
+++ b/talon.wdl
@@ -1,6 +1,6 @@
 version 1.0
 
-# Copyright (c) 2019 Sequencing Analysis Support Core - Leiden University Medical Center
+# Copyright (c) 2019 Leiden University Medical Center
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -8,10 +8,10 @@ version 1.0
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -30,8 +30,9 @@ task CreateAbundanceFileFromDatabase {
         File? whitelistFile
         File? 
datasetsFile - String memory = "4G" - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String memory = "4GiB" + Int timeMinutes = 30 + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { @@ -47,27 +48,29 @@ task CreateAbundanceFileFromDatabase { } output { - File outputAbundanceFile = outputPrefix + "_talon_abundance.tsv" + File abundanceFile = outputPrefix + "_talon_abundance.tsv" } runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - databaseFile: {description: "TALON database.", category: "required"} + databaseFile: {description: "Talon database.", category: "required"} annotationVersion: {description: "Which annotation version to use.", category: "required"} genomeBuild: {description: "Genome build to use.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} whitelistFile: {description: "Whitelist file of transcripts to include in the output.", category: "advanced"} datasetsFile: {description: "A file indicating which datasets should be included.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputAbundanceFile: {description: "Abundance for each transcript in the TALON database across datasets."} + abundanceFile: {description: "Abundance for each transcript in the talon database across datasets."} } } @@ -83,8 +86,9 @@ task CreateGtfFromDatabase { File? whitelistFile File? datasetFile - String memory = "4G" - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String memory = "4GiB" + Int timeMinutes = 30 + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { @@ -101,17 +105,18 @@ task CreateGtfFromDatabase { } output { - File outputGTFfile = outputPrefix + "_talon.gtf" + File gtfFile = outputPrefix + "_talon.gtf" } runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - databaseFile: {description: "TALON database.", category: "required"} + databaseFile: {description: "Talon database.", category: "required"} genomeBuild: {description: "Genome build to use.", category: "required"} annotationVersion: {description: "Which annotation version to use.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} @@ -119,10 +124,11 @@ task CreateGtfFromDatabase { whitelistFile: {description: "Whitelist file of transcripts to include in the output.", category: "advanced"} datasetFile: {description: "A file indicating which datasets should be included.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
 
         # outputs
-        outputGTFfile: {description: "The genes, transcripts, and exons stored a TALON database in GTF format."}
+        gtfFile: {description: "The genes, transcripts, and exons stored in a talon database in gtf format."}
     }
 }
 
@@ -131,11 +137,16 @@ task FilterTalonTranscripts {
         File databaseFile
         String annotationVersion
         String outputPrefix
+        Float maxFracA = 0.5
+        Int minCount = 5
+        Boolean allowGenomic = false
 
-        File? pairingsFile
+        File? datasetsFile
+        Int? minDatasets
 
-        String memory = "4G"
-        String dockerImage = "biocontainers/talon:v4.4.2_cv1"
+        String memory = "4GiB"
+        Int timeMinutes = 30
+        String dockerImage = "biocontainers/talon:v5.0_cv1"
     }
 
     command {
@@ -145,29 +156,39 @@
         --db=~{databaseFile} \
         -a ~{annotationVersion} \
         ~{"--o=" + outputPrefix + "_whitelist.csv"} \
-        ~{"-p " + pairingsFile}
+        --maxFracA=~{maxFracA} \
+        --minCount=~{minCount} \
+        ~{true="--allowGenomic" false="" allowGenomic} \
+        ~{"--datasets=" + datasetsFile} \
+        ~{"--minDatasets=" + minDatasets}
     }
 
     output {
-        File outputTranscriptWhitelist = outputPrefix + "_whitelist.csv"
+        File transcriptWhitelist = outputPrefix + "_whitelist.csv"
    }
 
     runtime {
         memory: memory
+        time_minutes: timeMinutes
         docker: dockerImage
     }
 
     parameter_meta {
         # inputs
-        databaseFile: {description: "TALON database.", category: "required"}
+        databaseFile: {description: "Talon database.", category: "required"}
         annotationVersion: {description: "Which annotation version to use.", category: "required"}
         outputPrefix: {description: "Output directory path + output file prefix.", category: "required"}
-        pairingsFile: {description: "A file indicating which datasets should be considered together.", category: "advanced"}
+        maxFracA: {description: "Maximum fraction of As to allow in the window located immediately after any read assigned to a novel transcript.", category: "advanced"}
+        minCount: {description: "Number of minimum occurrences required for a novel transcript per dataset.", category: "advanced"}
+        allowGenomic: {description: "If this option is set, transcripts from the Genomic novelty category will be permitted in the output.", category: "advanced"}
+        datasetsFile: {description: "Datasets to include.", category: "advanced"}
+        minDatasets: {description: "Minimum number of datasets novel transcripts must be found in.", category: "advanced"}
         memory: {description: "The amount of memory available to the job.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
 
         # outputs
-        outputTranscriptWhitelist: {description: "A transcript whitelist produced from the TALON database."}
+        transcriptWhitelist: {description: "Transcript whitelist produced from the talon database."}
     }
 }
 
@@ -179,8 +200,9 @@ task GetReadAnnotations {
         File? datasetFile
 
-        String memory = "4G"
-        String dockerImage = "biocontainers/talon:v4.4.2_cv1"
+        String memory = "4GiB"
+        Int timeMinutes = 30
+        String dockerImage = "biocontainers/talon:v5.0_cv1"
     }
 
     command {
@@ -194,115 +216,233 @@
     }
 
     output {
-        File outputAnnotation = outputPrefix + "_talon_read_annot.tsv"
+        File readAnnotations = outputPrefix + "_talon_read_annot.tsv"
     }
 
     runtime {
         memory: memory
+        time_minutes: timeMinutes
         docker: dockerImage
     }
 
     parameter_meta {
         # inputs
-        databaseFile: { description: "TALON database.", category: "required"}
+        databaseFile: { description: "Talon database.", category: "required"}
         genomeBuild: {description: "Genome build to use.", category: "required"}
         outputPrefix: {description: "Output directory path + output file prefix.", category: "required"}
         datasetFile: {description: "A file indicating which datasets should be included.", category: "advanced"}
         memory: {description: "The amount of memory available to the job.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
 
         # outputs
-        outputAnnotation: {description: "Read-specific annotation information from a TALON database."}
+        readAnnotations: {description: "Read-specific annotation information from a talon database."}
+    }
+}
+
+task GetSpliceJunctions {
+    input {
+        File sjInformationFile
+        String inputFileType = "db"
+        File referenceGtf
+        String runMode = "intron"
+        String outputPrefix
+
+        String memory = "4GiB"
+        Int timeMinutes = 30
+        String dockerImage = "biocontainers/talon:v5.0_cv1"
+    }
+
+    Map[String, String] SJfileType = {"db": "--db", "gtf": "--gtf"}
+
+    command {
+        set -e
+        mkdir -p "$(dirname ~{outputPrefix})"
+        talon_get_sjs \
+        ~{SJfileType[inputFileType]} ~{sjInformationFile} \
+        --ref ~{referenceGtf} \
+        --mode ~{runMode} \
+        --outprefix ~{outputPrefix}
+    }
+
+    output {
+        File spliceJunctions = outputPrefix + "_" + runMode + "s.tsv"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # inputs
+        sjInformationFile: {description: "Talon gtf file or database from which to extract exons/introns.", category: "required"}
+        inputFileType: {description: "The file type of sjInformationFile.", category: "common"}
+        referenceGtf: {description: "Gtf reference file (i.e. gencode).", category: "required"}
+        runMode: {description: "Determines whether to include introns or exons in the output.", category: "common"}
+        outputPrefix: {description: "Output directory path + output file prefix.", category: "required"}
+        memory: {description: "The amount of memory available to the job.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + spliceJunctions: {description: "File containing locations, novelty and transcript assignments of exons/introns."} } } task InitializeTalonDatabase { input { - File GTFfile + File gtfFile String genomeBuild String annotationVersion Int minimumLength = 300 - String novelIDprefix = "TALON" - Int cutoff5p = 500 - Int cutoff3p = 300 + String novelPrefix = "TALON" + Int cutOff5p = 500 + Int cutOff3p = 300 String outputPrefix - String memory = "10G" - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String memory = "10GiB" + Int timeMinutes = 60 + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { set -e mkdir -p "$(dirname ~{outputPrefix})" talon_initialize_database \ - --f=~{GTFfile} \ + --f=~{gtfFile} \ --g=~{genomeBuild} \ --a=~{annotationVersion} \ --l=~{minimumLength} \ - --idprefix=~{novelIDprefix} \ - --5p=~{cutoff5p} \ - --3p=~{cutoff3p} \ + --idprefix=~{novelPrefix} \ + --5p=~{cutOff5p} \ + --3p=~{cutOff3p} \ --o=~{outputPrefix} } output { - File outputDatabase = outputPrefix + ".db" + File databaseFile = outputPrefix + ".db" } runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - GTFfile: {description: "GTF annotation containing genes, transcripts, and edges.", category: "required"} - genomeBuild: {description: "Name of genome build that the GTF file is based on (ie hg38).", category: "required"} + gtfFile: {description: "Gtf annotation containing genes, transcripts, and edges.", category: "required"} + genomeBuild: {description: "Name of genome build that the gtf file is based on (ie hg38).", category: "required"} annotationVersion: {description: "Name of supplied annotation (will be used to label data).", category: "required"} minimumLength: { description: "Minimum required transcript length.", category: "common"} - novelIDprefix: {description: "Prefix for naming novel discoveries in eventual TALON runs.", category: "common"} - cutoff5p: { description: "Maximum allowable distance (bp) at the 5' end during annotation.", category: "advanced"} - cutoff3p: {description: "Maximum allowable distance (bp) at the 3' end during annotation.", category: "advanced"} + novelPrefix: {description: "Prefix for naming novel discoveries in eventual talon runs.", category: "common"} + cutOff5p: { description: "Maximum allowable distance (bp) at the 5' end during annotation.", category: "advanced"} + cutOff3p: {description: "Maximum allowable distance (bp) at the 3' end during annotation.", category: "advanced"} + outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + databaseFile: {description: "Talon database."} + } +} + +task LabelReads { + input { + File inputSam + File referenceGenome + Int fracaRangeSize = 20 + String tmpDir = "./tmp_label_reads" + Boolean deleteTmp = true + String outputPrefix + + Int threads = 4 + String memory = "25GiB" + Int timeMinutes = 2880 + String dockerImage = "biocontainers/talon:v5.0_cv1" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPrefix})" + talon_label_reads \ + --f=~{inputSam} \ + --g=~{referenceGenome} \ + --t=~{threads} \ + --ar=~{fracaRangeSize} \ + --tmpDir=~{tmpDir} \ + ~{true="--deleteTmp" false="" deleteTmp} \ + --o=~{outputPrefix} + } + + output { + File labeledSam = outputPrefix + "_labeled.sam" + File readLabels = outputPrefix + "_read_labels.tsv" + } + + runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputSam: {description: "Sam file of transcripts.", category: "required"} + referenceGenome: {description: "Reference genome fasta file.", category: "required"} + fracaRangeSize: {description: "Size of post-transcript interval to compute fraction.", category: "common"} + tmpDir: {description: "Path to directory for tmp files.", category: "advanced"} + deleteTmp: {description: "If set, tmp dir will be removed.", category: "advanced"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} + threads: {description: "The number of threads to be used.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputDatabase: {description: "TALON database."} + labeledSam: {description: "Sam file with labeled transcripts."} + readLabels: {description: "Tabular file with fraction description per read."} } } task ReformatGtf { input { - File GTFfile + File gtfFile - String memory = "4G" - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String memory = "4GiB" + Int timeMinutes = 30 + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { set -e talon_reformat_gtf \ - -gtf ~{GTFfile} + -gtf ~{gtfFile} } output { - File outputReformattedGTF = GTFfile + File reformattedGtf = gtfFile } runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - GTFfile: {description: "GTF annotation containing genes, transcripts, and edges.", category: "required"} + gtfFile: {description: "Gtf annotation containing genes, transcripts, and edges.", category: "required"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputReformattedGTF: {description: "Reformatted GTF file."} + reformattedGtf: {description: "Reformatted gtf file."} } } @@ -312,10 +452,11 @@ task SummarizeDatasets { Boolean setVerbose = false String outputPrefix - File? 
datasetGroupsCSV + File? datasetGroupsCsv - String memory = "4G" - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String memory = "4GiB" + Int timeMinutes = 50 + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { @@ -325,46 +466,49 @@ task SummarizeDatasets { --db ~{databaseFile} \ ~{true="--verbose" false="" setVerbose} \ --o ~{outputPrefix} \ - ~{"--groups " + datasetGroupsCSV} + ~{"--groups " + datasetGroupsCsv} } output { - File outputSummaryFile = outputPrefix + "_talon_summary.tsv" + File summaryFile = outputPrefix + "_talon_summary.tsv" } runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - databaseFile: {description: "TALON database.", category: "required"} + databaseFile: {description: "Talon database.", category: "required"} setVerbose: {description: "Print out the counts in terminal.", category: "advanced"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} - datasetGroupsCSV: {description: "File of comma-delimited dataset groups to process together.", category: "advanced"} + datasetGroupsCsv: {description: "File of comma-delimited dataset groups to process together.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputSummaryFile: {description: "Tab-delimited file of gene and transcript counts for each dataset."} + summaryFile: {description: "Tab-delimited file of gene and transcript counts for each dataset."} } } task Talon { input { - Array[File] SAMfiles + Array[File] samFiles String organism String sequencingPlatform = "PacBio-RS-II" File databaseFile String genomeBuild Float minimumCoverage = 0.9 - Int minimumIdentity = 0 + Float minimumIdentity = 0.8 String outputPrefix - Int cores = 4 - String memory = "25G" - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + Int threads = 4 + String memory = "25GiB" + Int timeMinutes = 2880 + String dockerImage = "biocontainers/talon:v5.0_cv1" } command <<< @@ -374,7 +518,7 @@ task Talon { ln -s $PWD/tmp /tmp/sqltmp #Multiprocessing will crash if the absolute path is too long. export TMPDIR=/tmp/sqltmp printf "" > ~{outputPrefix}/talonConfigFile.csv #File needs to be emptied when task is rerun. 
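+            # Build the talon config file: one line per input sam file, in the format
+            # "<dataset name>,<organism>,<sequencing platform>,<path to sam file>".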
- for file in ~{sep=" " SAMfiles} + for file in ~{sep=" " samFiles} do configFileLine="$(basename ${file%.*}),~{organism},~{sequencingPlatform},${file}" echo ${configFileLine} >> ~{outputPrefix}/talonConfigFile.csv @@ -383,43 +527,45 @@ task Talon { ~{"--f " + outputPrefix + "/talonConfigFile.csv"} \ --db ~{databaseFile} \ --build ~{genomeBuild} \ - --threads ~{cores} \ + --threads ~{threads} \ --cov ~{minimumCoverage} \ --identity ~{minimumIdentity} \ ~{"--o " + outputPrefix + "/run"} >>> output { - File outputUpdatedDatabase = databaseFile - File outputLog = outputPrefix + "/run_QC.log" - File outputAnnot = outputPrefix + "/run_talon_read_annot.tsv" - File outputConfigFile = outputPrefix + "/talonConfigFile.csv" + File updatedDatabase = databaseFile + File talonLog = outputPrefix + "/run_QC.log" + File talonAnnotation = outputPrefix + "/run_talon_read_annot.tsv" + File talonConfigFile = outputPrefix + "/talonConfigFile.csv" } runtime { - cpu: cores + cpu: threads memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - SAMfiles: {description: "Input SAM files.", category: "required"} + samFiles: {description: "Input sam files.", category: "required"} organism: {description: "The name of the organism from which the samples originated.", category: "required"} sequencingPlatform: {description: "The sequencing platform used to generate long reads.", category: "required"} - databaseFile: {description: "TALON database. Created using initialize_talon_database.py.", category: "required"} + databaseFile: {description: "Talon database. Created using initialize_talon_database.py.", category: "required"} genomeBuild: {description: "Genome build (i.e. hg38) to use.", category: "required"} - minimumCoverage: {description: "Minimum alignment coverage in order to use a SAM entry.", category: "common"} - minimumIdentity: {description: "Minimum alignment identity in order to use a SAM entry.", category: "common" } + minimumCoverage: {description: "Minimum alignment coverage in order to use a sam entry.", category: "common"} + minimumIdentity: {description: "Minimum alignment identity in order to use a sam entry.", category: "common" } outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} - cores: {description: "The number of cores to be used.", category: "advanced"} + threads: {description: "The number of threads to be used.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputUpdatedDatabase: {description: "Updated TALON database."} - outputLog: {description: "Log file from TALON run."} - outputAnnot: {description: "Read annotation file from TALON run."} - outputConfigFile: {description: "The TALON configuration file."} + updatedDatabase: {description: "Updated talon database."} + talonLog: {description: "Log file from talon run."} + talonAnnotation: {description: "Read annotation file from talon run."} + talonConfigFile: {description: "The talon configuration file."} } } diff --git a/transcriptclean.wdl b/transcriptclean.wdl index 68bcbf24..8607a7a3 100644 --- a/transcriptclean.wdl +++ b/transcriptclean.wdl @@ -1,6 +1,6 @@ version 1.0 -# Copyright (c) 2019 Sequencing Analysis Support Core - Leiden University Medical Center +# Copyright (c) 2019 Leiden University Medical Center # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -8,10 +8,10 @@ version 1.0 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -22,12 +22,13 @@ version 1.0 task GetSJsFromGtf { input { - File GTFfile + File gtfFile File genomeFile String outputPrefix Int minIntronSize = 21 - String memory = "8G" + String memory = "8GiB" + Int timeMinutes = 30 String dockerImage = "biocontainers/transcriptclean:v2.0.2_cv1" } @@ -35,41 +36,44 @@ task GetSJsFromGtf { set -e mkdir -p "$(dirname ~{outputPrefix})" get_SJs_from_gtf \ - --f=~{GTFfile} \ + --f=~{gtfFile} \ --g=~{genomeFile} \ --minIntronSize=~{minIntronSize} \ ~{"--o=" + outputPrefix + ".tsv"} } output { - File outputSJsFile = outputPrefix + ".tsv" + File spliceJunctionFile = outputPrefix + ".tsv" } runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - GTFfile: {description: "Input GTF file", category: "required"} - genomeFile: {description: "Reference genome", category: "required"} - minIntronSize: {description: "Minimum size of intron to consider a junction.", category: "advanced"} + gtfFile: {description: "Input gtf file.", category: "required"} + genomeFile: {description: "Reference genome.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} + minIntronSize: {description: "Minimum size of intron to consider a junction.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + # outputs - outputSJsFile: {description: "Extracted splice junctions."} + spliceJunctionFile: {description: "Extracted splice junctions."} } } task GetTranscriptCleanStats { input { - File transcriptCleanSAMfile + File inputSam String outputPrefix - String memory = "4G" + String memory = "4GiB" + Int timeMinutes = 30 String dockerImage = "biocontainers/transcriptclean:v2.0.2_cv1" } @@ -77,38 +81,39 @@ task GetTranscriptCleanStats { set -e mkdir -p "$(dirname ~{outputPrefix})" get_TranscriptClean_stats \ - ~{transcriptCleanSAMfile} \ + ~{inputSam} \ ~{outputPrefix} } output { - File outputStatsFile = stdout() + File statsFile = stdout() } runtime { memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { # inputs - transcriptCleanSAMfile: {description: "Output SAM file from TranscriptClean", category: "required"} + inputSam: {description: "Output sam file from transcriptclean.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} memory: {description: "The amount of memory available to the job.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputStatsFile: {description: "Summary stats from TranscriptClean run."} + statsFile: {description: "Summary stats from transcriptclean run."} } } task TranscriptClean { input { - File SAMfile + File inputSam File referenceGenome Int maxLenIndel = 5 - Int maxSJoffset = 5 + Int maxSJOffset = 5 String outputPrefix Boolean correctMismatches = true Boolean correctIndels = true @@ -123,7 +128,8 @@ task TranscriptClean { File? 
variantFile
 
         Int cores = 1
-        String memory = "25G"
+        String memory = "25GiB"
+        Int timeMinutes = 2880
         String dockerImage = "biocontainers/transcriptclean:v2.0.2_cv1"
     }
 
@@ -131,11 +137,11 @@
         set -e
         mkdir -p "$(dirname ~{outputPrefix})"
         TranscriptClean \
-        -s ~{SAMfile} \
+        -s ~{inputSam} \
         -g ~{referenceGenome} \
         -t ~{cores} \
         --maxLenIndel=~{maxLenIndel} \
-        --maxSJOffset=~{maxSJoffset} \
+        --maxSJOffset=~{maxSJOffset} \
         -o ~{outputPrefix} \
         ~{true="-m true" false="-m false" correctMismatches} \
         ~{true="-i true" false="-i false" correctIndels} \
@@ -150,44 +156,45 @@
     }
 
     output {
-        File outputTranscriptCleanFasta = outputPrefix + "_clean.fa"
-        File outputTranscriptCleanLog = outputPrefix + "_clean.log"
-        File outputTranscriptCleanSAM = outputPrefix + "_clean.sam"
-        File outputTranscriptCleanTElog = outputPrefix + "_clean.TE.log"
+        File fastaFile = outputPrefix + "_clean.fa"
+        File logFile = outputPrefix + "_clean.log"
+        File outputSam = outputPrefix + "_clean.sam"
+        File logFileTE = outputPrefix + "_clean.TE.log"
     }
 
     runtime {
         cpu: cores
         memory: memory
+        time_minutes: timeMinutes
         docker: dockerImage
     }
 
     parameter_meta {
         # inputs
-        SAMfile: {description: "Input SAM file containing transcripts to correct.", category: "required"}
+        inputSam: {description: "Input sam file containing transcripts to correct.", category: "required"}
         referenceGenome: {description: "Reference genome fasta file.", category: "required"}
         maxLenIndel: {description: "Maximum size indel to correct.", category: "advanced"}
-        maxSJoffset: {description: "Maximum distance from annotated splice junction to correct.", category: "advanced"}
+        maxSJOffset: {description: "Maximum distance from annotated splice junction to correct.", category: "advanced"}
         outputPrefix: {description: "Output directory path + output file prefix.", category: "required"}
-        correctMismatches: {description: "Set this to make TranscriptClean correct mismatches.", category: "common"}
-        correctIndels: {description: "Set this to make TranscriptClean correct indels.", category: "common"}
-        correctSJs: {description: "Set this to make TranscriptClean correct splice junctions.", category: "common"}
-        dryRun: {description: "TranscriptClean will read in the data but don't do any correction.", category: "advanced"}
+        correctMismatches: {description: "Set this to make transcriptclean correct mismatches.", category: "common"}
+        correctIndels: {description: "Set this to make transcriptclean correct indels.", category: "common"}
+        correctSJs: {description: "Set this to make transcriptclean correct splice junctions.", category: "common"}
+        dryRun: {description: "Transcriptclean will read in the data but will not perform any correction.", category: "advanced"}
         primaryOnly: {description: "Only output primary mappings of transcripts.", category: "advanced"}
         canonOnly: {description: "Only output canonical transcripts and transcript containing annotated noncanonical junctions.", category: "advanced"}
         bufferSize: {description: "Number of lines to output to file at once by each thread during run.", category: "common"}
-        deleteTmp: {description: "The temporary directory generated by TranscriptClean will be removed.", category: "common"}
+        deleteTmp: {description: "The temporary directory generated by transcriptclean will be removed.", category: "common"}
         spliceJunctionAnnotation: {description: "Splice junction file.", category: "common"}
-        variantFile: {description: "VCF formatted file of variants.", category: "common"}
+        variantFile: 
{description: "Vcf formatted file of variants.", category: "common"} cores: {description: "The number of cores to be used.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - outputTranscriptCleanFasta: {description: "Fasta file containing corrected reads."} - outputTranscriptCleanLog: {description: "Log file of TranscriptClean run."} - outputTranscriptCleanSAM: {description: "SAM file containing corrected aligned reads."} - outputTranscriptCleanTElog: {description: "TE log file of TranscriptClean run."} + fastaFile: {description: "Fasta file containing corrected reads."} + logFile: {description: "Log file of transcriptclean run."} + outputSam: {description: "Sam file containing corrected aligned reads."} + logFileTE: {description: "TE log file of transcriptclean run."} } } diff --git a/umi-tools.wdl b/umi-tools.wdl index 07518e57..d8d17c48 100644 --- a/umi-tools.wdl +++ b/umi-tools.wdl @@ -1,6 +1,6 @@ version 1.0 -# Copyright (c) 2017 Sequencing Analysis Support Core - Leiden University Medical Center +# Copyright (c) 2017 Leiden University Medical Center # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -8,10 +8,10 @@ version 1.0 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -26,11 +26,13 @@ task Extract { File? read2 String bcPattern String? bcPattern2 - Boolean threePrime = false String read1Output = "umi_extracted_R1.fastq.gz" String? read2Output = "umi_extracted_R2.fastq.gz" + Boolean threePrime = false - String dockerImage = "quay.io/biocontainers/mulled-v2-509311a44630c01d9cb7d2ac5727725f51ea43af:6089936aca6219b5bb5f54210ac5eb456c7503f2-0" + String memory = "20GiB" + Int timeMinutes = 1 + ceil(size([read1, read2], "G") * 2) + String dockerImage = "quay.io/biocontainers/mulled-v2-509311a44630c01d9cb7d2ac5727725f51ea43af:3067b520386698317fd507c413baf7f901666fd4-0" } command { @@ -50,19 +52,26 @@ task Extract { } runtime { + memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs read1: {description: "The first/single-end fastq file.", category: "required"} read2: {description: "The second-end fastq file.", category: "common"} bcPattern: {description: "The pattern to be used for UMI extraction. 
See the umi_tools docs for more information.", category: "required"}
         bcPattern2: {description: "The pattern to be used for UMI extraction in the second-end reads. See the umi_tools docs for more information.", category: "advanced"}
-        threePrime: {description: "Whether or not the UMI's are at the reads' 3' end. If false the UMIs are extracted from the 5' end.", category: "advanced"}
         read1Output: {description: "The location to write the first/single-end output fastq file to.", category: "advanced"}
         read2Output: {description: "The location to write the second-end output fastq file to.", category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-                      category: "advanced"}
+        threePrime: {description: "Whether or not the UMIs are at the reads' 3' end. If false the UMIs are extracted from the 5' end.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        extractedRead1: {description: "First read with UMI extracted to read name."}
+        extractedRead2: {description: "Second read with UMI extracted to read name."}
     }
 }
 
@@ -70,28 +79,31 @@ task Dedup {
     input {
         File inputBam
         File inputBamIndex
-        String? umiSeparator
         String outputBamPath
-        String? statsPrefix
+        String tmpDir = "./umiToolsDedupTmpDir"
+
         Boolean paired = true
 
-        String memory = "5G"
+        String? umiSeparator
+        String? statsPrefix
 
-        # Use a multi-package-container which includes umi_tools (0.5.5) and samtools (1.9)
-        String dockerImage = "quay.io/biocontainers/mulled-v2-509311a44630c01d9cb7d2ac5727725f51ea43af:6089936aca6219b5bb5f54210ac5eb456c7503f2-0"
+        String memory = "25GiB"
+        Int timeMinutes = 30 + ceil(size(inputBam, "GiB") * 30)
+        String dockerImage = "quay.io/biocontainers/mulled-v2-509311a44630c01d9cb7d2ac5727725f51ea43af:3067b520386698317fd507c413baf7f901666fd4-0"
     }
 
     String outputBamIndex = sub(outputBamPath, "\.bam$", ".bai")
 
     command {
         set -e
-        mkdir -p "$(dirname ~{outputBamPath})"
+        mkdir -p "$(dirname ~{outputBamPath})" "~{tmpDir}"
         umi_tools dedup \
-        --stdin ~{inputBam} \
-        --stdout ~{outputBamPath} \
+        --stdin=~{inputBam} \
+        --stdout=~{outputBamPath} \
         ~{"--output-stats " + statsPrefix} \
         ~{"--umi-separator=" + umiSeparator} \
-        ~{true="--paired" false="" paired}
+        ~{true="--paired" false="" paired} \
+        --temp-dir=~{tmpDir}
         samtools index ~{outputBamPath} ~{outputBamIndex}
     }
 
@@ -104,19 +116,29 @@
     }
 
     runtime {
-        docker: dockerImage
         memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
     }
 
     parameter_meta {
+        # inputs
         inputBam: {description: "The input BAM file.", category: "required"}
         inputBamIndex: {description: "The index for the input BAM file.", category: "required"}
         outputBamPath: {description: "The location to write the output BAM file to.", category: "required"}
-        statsPrefix: {description: "The prefix for the stats files.", category: "advanced"}
-        umiSeparator: {description: "Seperator used for UMIs in the read names.", category: "advanced"}
+        tmpDir: {description: "Temporary directory.", category: "advanced"}
         paired: {description: "Whether or not the data is paired.", category: "common"}
+        umiSeparator: {description: "Separator used for UMIs in the read names.", category: "advanced"}
+        statsPrefix: {description: "The prefix for 
the stats files.", category: "advanced"} memory: {description: "The amount of memory required for the task.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + deduppedBam: {description: "Deduplicated BAM file."} + deduppedBamIndex: {description: "Index of the deduplicated BAM file."} + editDistance: {description: "Report of the (binned) average edit distance between the UMIs at each position."} + umiStats: {description: "UMI-level summary statistics."} + positionStats: {description: "The counts for unique combinations of UMI and position."} } } diff --git a/umi.wdl b/umi.wdl new file mode 100644 index 00000000..0628783a --- /dev/null +++ b/umi.wdl @@ -0,0 +1,107 @@ +version 1.0 + +# Copyright (c) 2022 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task BamReadNameToUmiTag { + + # This task processes a bam file with reads that have been extracted with + # umi-tools extract. 
The UMI is extracted from the read name again and put
+    # in the bam file again with umiTag (default RX)
+    input {
+        File inputBam
+        String outputPath = "output.bam"
+        String umiTag = "RX"
+        String separatorChar = "_"
+
+        String memory = "2GiB"
+        Int timeMinutes = 1 + ceil(size([inputBam], "GiB") * 10)
+        String dockerImage = "quay.io/biocontainers/pysam:0.17.0--py39h051187c_0"
+    }
+
+    String bamIndexPath = sub(select_first([outputPath]), "\.bam$", ".bai")
+
+    command <<<
+        python <<CODE
+        import os
+        import pysam
+        from typing import Tuple
+
+        def split_umi_from_name(name: str, separator_char: str) -> Tuple[str, str]:
+            id_and_rest = name.split(maxsplit=1)
+            id = id_and_rest[0]
+            # If there was no whitespace id_and_rest will have length 1
+            other_parts = id_and_rest[1] if len(id_and_rest) == 2 else ""
+            underscore_index = id.rfind(separator_char)
+            umi = id[underscore_index + 1:]
+            new_id = id[:underscore_index]
+            if other_parts:
+                return " ".join([new_id, other_parts]), umi
+            return new_id, umi
+
+        def annotate_umis(in_file, out_file, bam_tag="RX", separator_char = "_"):
+            in_bam = pysam.AlignmentFile(in_file, "rb")
+            os.makedirs(os.path.dirname(out_file), exist_ok=True)
+            out_bam = pysam.AlignmentFile(out_file, "wb", template=in_bam)
+            # Encode bam_tag as bytes. Otherwise pysam converts it to bytes anyway.
+            encoded_bam_tag = bam_tag.encode('ascii')
+            for segment in in_bam:  # type: pysam.AlignedSegment
+                new_name, umi = split_umi_from_name(segment.query_name, separator_char)
+                segment.query_name = new_name
+                # Encode umi as ascii. Otherwise pysam encodes it to bytes anyway.
+                # Value type has to be a string though, otherwise pysam crashes.
+                segment.set_tag(encoded_bam_tag, umi.encode('ascii'), value_type="Z")
+                out_bam.write(segment)
+
+        if __name__ == "__main__":
+            annotate_umis("~{inputBam}", "~{outputPath}", "~{umiTag}", "~{separatorChar}")
+            pysam.index("~{outputPath}", "~{bamIndexPath}", b=True)
+        CODE
+    >>>
+
+    output {
+        File outputBam = outputPath
+        File outputBamIndex = bamIndexPath
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # inputs
+        inputBam: {description: "The input BAM file.", category: "required"}
+        outputPath: {description: "Output directory path + output file.", category: "common"}
+        umiTag: {description: "The tag used for UMIs in the output BAM file.", category: "common"}
+        separatorChar: {description: "Character used to separate the UMIs from the read name.", category: "common"}
+
+        memory: {description: "The amount of memory available to the job.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputBam: {description: "Sorted BAM file."}
+        outputBamIndex: {description: "Sorted BAM file index."}
+    }
+}
diff --git a/unicycler.wdl b/unicycler.wdl
index fc393603..d83db3ca 100644
--- a/unicycler.wdl
+++ b/unicycler.wdl
@@ -22,12 +22,13 @@ version 1.0
 
 task Unicycler {
     input {
+        String out
+
         String? preCommand
         File? short1
         File? short2
         File? unpaired
         File? long
-        String out
         Int? verbosity
         Int? minFastaLength
         Int? keep
@@ -65,7 +66,7 @@
         String? 
lowScore Int threads = 1 - String memory = "4G" + String memory = "4GiB" } command { @@ -125,4 +126,4 @@ task Unicycler { cpu: threads memory: memory } -} \ No newline at end of file +} diff --git a/vardict.wdl b/vardict.wdl index 7bfd118e..187b4567 100644 --- a/vardict.wdl +++ b/vardict.wdl @@ -27,35 +27,35 @@ task VarDict { String tumorSampleName File tumorBam File tumorBamIndex - String? normalSampleName - File? normalBam - File? normalBamIndex File referenceFasta File referenceFastaFai File bedFile String outputVcf - - Int chromosomeColumn = 1 - Int startColumn = 2 - Int endColumn = 3 - Int geneColumn = 4 - Boolean outputCandidateSomaticOnly = true Boolean outputAllVariantsAtSamePosition = true Float mappingQuality = 20 Int minimumTotalDepth = 8 Int minimumVariantDepth = 4 Float minimumAlleleFrequency = 0.02 + Int chromosomeColumn = 1 + Int startColumn = 2 + Int endColumn = 3 + Int geneColumn = 4 + + String? normalSampleName + File? normalBam + File? normalBamIndex - Int threads = 1 - String memory = "40G" String javaXmx = "16G" + Int threads = 1 + String memory = "18GiB" + Int timeMinutes = 300 String dockerImage = "quay.io/biocontainers/vardict-java:1.5.8--1" } command { set -e -o pipefail - export JAVA_OPTS="-Xmx~{javaXmx}" + export JAVA_OPTS="-Xmx~{javaXmx} -XX:ParallelGCThreads=1" vardict-java \ ~{"-th " + threads} \ -G ~{referenceFasta} \ @@ -87,36 +87,39 @@ task VarDict { runtime { cpu: threads + 2 memory: memory + time_minutes: timeMinutes docker: dockerImage } parameter_meta { + # inputs tumorSampleName: {description: "The name of the tumor/case sample.", category: "required"} tumorBam: {description: "The tumor/case sample's BAM file.", category: "required"} tumorBamIndex: {description: "The index for the tumor/case sample's BAM file.", category: "required"} - normalSampleName: {description: "The name of the normal/control sample.", category: "common"} - normalBam: {description: "The normal/control sample's BAM file.", category: "common"} - normalBamIndex: {description: "The normal/control sample's BAM file.", category: "common"} referenceFasta: {description: "The reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} bedFile: {description: "A bed file describing the regions to operate on. 
These regions must be below 1e6 bases in size.", category: "required"}
         outputVcf: {description: "The location to write the output VCF file to.", category: "required"}
-        chromosomeColumn: {description: "Equivalent to vardict-java's `-c` option.", category: "advanced"}
-        startColumn: {description: "Equivalent to vardict-java's `-S` option.", category: "advanced"}
-        endColumn: {description: "Equivalent to vardict-java's `-E` option.", category: "advanced"}
-        geneColumn: {description: "Equivalent to vardict-java's `-g` option.", category: "advanced"}
         outputCandidateSomaticOnly: {description: "Equivalent to var2vcf_paired.pl or var2vcf_valid.pl's `-M` flag.", category: "advanced"}
         outputAllVariantsAtSamePosition: {description: "Equivalent to var2vcf_paired.pl or var2vcf_valid.pl's `-A` flag.", category: "advanced"}
         mappingQuality: {description: "Equivalent to var2vcf_paired.pl or var2vcf_valid.pl's `-Q` option.", category: "advanced"}
         minimumTotalDepth: {description: "Equivalent to var2vcf_paired.pl or var2vcf_valid.pl's `-d` option.", category: "advanced"}
         minimumVariantDepth: {description: "Equivalent to var2vcf_paired.pl or var2vcf_valid.pl's `-v` option.", category: "advanced"}
         minimumAlleleFrequency: {description: "Equivalent to var2vcf_paired.pl or var2vcf_valid.pl's `-f` option.", category: "advanced"}
-
+        chromosomeColumn: {description: "Equivalent to vardict-java's `-c` option.", category: "advanced"}
+        startColumn: {description: "Equivalent to vardict-java's `-S` option.", category: "advanced"}
+        endColumn: {description: "Equivalent to vardict-java's `-E` option.", category: "advanced"}
+        geneColumn: {description: "Equivalent to vardict-java's `-g` option.", category: "advanced"}
+        normalSampleName: {description: "The name of the normal/control sample.", category: "common"}
+        normalBam: {description: "The normal/control sample's BAM file.", category: "common"}
+        normalBamIndex: {description: "The index for the normal/control sample's BAM file.", category: "common"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         threads: {description: "The number of threads to use.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
-        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
-                  category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-                      category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        vcfFile: {description: "Output VCF file."}
     }
 }
diff --git a/vep.wdl b/vep.wdl
new file mode 100644
index 00000000..2c1f923b
--- /dev/null
+++ b/vep.wdl
@@ -0,0 +1,110 @@
+version 1.0
+
+# Copyright (c) 2017 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+task Vep {
+    input {
+        File inputFile
+        String outputPath = "vep.annotated.vcf.gz"
+        File cacheTar
+        File? pluginsTar
+        String? species
+        Array[String] plugins = []
+        Boolean refseq = false
+        Boolean merged = false
+
+        Boolean everything = false
+        Boolean symbol = false
+
+        String memory = "8GiB"
+        # Account time for unpacking the cache.
+        Int timeMinutes = 1 + ceil(size(cacheTar, "GiB")) + ceil(size(inputFile, "MiB") * 15)
+        String dockerImage = "quay.io/biocontainers/ensembl-vep:113.3--pl5321h2a3209d_0"
+    }
+
+    command <<<
+        set -eu
+        mkdir vep_cache
+        mkdir -p "$(dirname ~{outputPath})"
+        tar -x --directory vep_cache -f ~{cacheTar}
+        ~{"tar -x --directory vep_cache -f " + pluginsTar}
+
+        # Allow vep to fail without aborting the script, so the cache
+        # cleanup below always runs.
+        set +e
+        # Output all stats files by default for MultiQC integration.
+        vep \
+            --input_file ~{inputFile} \
+            --output_file ~{outputPath} \
+            ~{"--species " + species} \
+            --stats_html --stats_text \
+            --dir vep_cache \
+            --offline \
+            ~{true="--plugin" false="" length(plugins) > 0} ~{sep=" --plugin " plugins} \
+            --vcf \
+            --compress_output bgzip \
+            ~{true="--refseq" false="" refseq} \
+            ~{true="--merged" false="" merged} \
+            ~{true="--everything" false="" everything} \
+            ~{true="--symbol" false="" symbol}
+
+        VEP_EXIT_CODE=$?
+        set -e
+        # Clean up the extracted cache to save filesystem space.
+        rm -rf vep_cache
+
+        exit $VEP_EXIT_CODE
+    >>>
+
+    output {
+        File outputFile = outputPath
+        File statsHtml = outputPath + "_summary.html"
+        File statsTxt = outputPath + "_summary.txt"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # input
+        inputFile: {description: "The VCF to annotate.", category: "required"}
+        outputPath: {description: "Where to put the output file.", category: "advanced"}
+        cacheTar: {description: "A TAR archive containing the cache. 
The TAR archives from the VEP website work (http://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html).", category: "required"}
+        pluginsTar: {description: "A TAR file with custom plugins.", category: "advanced"}
+        species: {description: "Which species cache to use.", category: "common"}
+        plugins: {description: "Which plugins to use.", category: "common"}
+        refseq: {description: "Use the refseq cache.", category: "common"}
+        merged: {description: "Use the merged cache.", category: "common"}
+        everything: {description: "Use all annotation sources bundled with vep.", category: "common"}
+        symbol: {description: "Add the gene symbol to the output where available.", category: "advanced"}
+
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # output
+        outputFile: {description: "The annotated VEP VCF file."}
+        statsHtml: {description: "The VEP summary stats HTML file."}
+        statsTxt: {description: "The VEP summary stats TXT file."}
+    }
+}
diff --git a/vt.wdl b/vt.wdl
index 54599db0..635641e9 100644
--- a/vt.wdl
+++ b/vt.wdl
@@ -1,6 +1,6 @@
 version 1.0
 
-# Copyright (c) 2020 Sequencing Analysis Support Core - Leiden University Medical Center
+# Copyright (c) 2020 Leiden University Medical Center
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -8,10 +8,10 @@ version 1.0
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -26,23 +26,42 @@ task Normalize {
         File inputVCFIndex
         File referenceFasta
         File referenceFastaFai
-        String outputPath = "./vt/normalized_decomposed.vcf"
-        String dockerImage = "quay.io/biocontainers/vt:0.57721--hdf88d34_2"
-        String memory = "4G"
+        Boolean ignoreMaskedRef = false
+        String outputPath = "./vt/normalized_decomposed.vcf.gz"
+        String? 
filterExpression
+
+        Int compressionLevel = 1
+
+        String memory = "4GiB"
+        Int timeMinutes = 10 + ceil(size(inputVCF, "GiB") * 240)
+        String dockerImage = "quay.io/biocontainers/vt:0.57721--h2419454_12"
     }
 
     command {
-        set -e
+        set -eo pipefail
         mkdir -p "$(dirname ~{outputPath})"
-        vt normalize ~{inputVCF} -r ~{referenceFasta} | vt decompose -s - -o ~{outputPath}
+        vt view -h \
+            ~{"-f '" + filterExpression}~{true="'" false="" defined(filterExpression)} \
+            ~{inputVCF} \
+        | vt normalize - \
+            -r ~{referenceFasta} \
+            ~{true="-m " false="" ignoreMaskedRef} \
+        | vt decompose -s - \
+        | vt view - \
+            -c ~{compressionLevel} \
+            -o ~{outputPath}
+        vt index ~{outputPath}
     }
 
     output {
         File outputVcf = outputPath
+        File outputVcfIndex = outputPath + ".tbi"
     }
 
     runtime {
+        cpu: 1
         memory: memory
+        time_minutes: timeMinutes
         docker: dockerImage
     }
 
@@ -50,11 +69,19 @@ task Normalize {
         # inputs
         inputVCF: {description: "The VCF file to process.", category: "required"}
         inputVCFIndex: {description: "The index of the VCF file to be processed.", category: "required"}
-        outputPath: {description: "The location the output VCF file should be written.", category: "common"}
         referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"}
         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
-        memory: {description: "The memory required to run the programs", category: "advanced"}
+        ignoreMaskedRef: {description: "Warns but does not exit when REF is inconsistent with masked reference sequence for non-SNPs.", category: "advanced"}
+        outputPath: {description: "The location the output VCF file should be written.", category: "common"}
+        filterExpression: {description: "See https://genome.sph.umich.edu/wiki/Vt#Filters for valid expressions.", category: "common"}
+        compressionLevel: {description: "Compression level for the output vcf.gz file.", category: "advanced"}
+
+        memory: {description: "The memory required to run the programs.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputVcf: {description: "Normalized and decomposed VCF file."}
+        outputVcfIndex: {description: "Index for normalized and decomposed VCF file."}
     }
 }
-
diff --git a/whatshap.wdl b/whatshap.wdl
new file mode 100644
index 00000000..b491f566
--- /dev/null
+++ b/whatshap.wdl
@@ -0,0 +1,233 @@
+version 1.0
+
+# Copyright (c) 2018 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+task Phase {
+    input {
+        String outputVCF
+        File vcf
+        File vcfIndex
+        File phaseInput
+        File phaseInputIndex
+
+        File? reference
+        File? referenceIndex
+        String? tag
+        String? algorithm
+        Boolean? indels
+        String? sample
+        String? chromosome
+        String? threshold
+        String? ped
+
+        Int memory = 2 + ceil(size(phaseInput, "G") / 20)
+        Int timeMinutes = 400 + ceil(size(phaseInput, "G") * 0.9)
+
+        # Whatshap 1.0, tabix 0.2.5.
+        String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0"
+    }
+
+    command {
+        set -e
+
+        mkdir -p "$(dirname ~{outputVCF})"
+
+        whatshap phase \
+            ~{vcf} \
+            ~{phaseInput} \
+            --output "~{outputVCF}" \
+            ~{if defined(reference) then ("--reference " + '"' + reference + '"') else ""} \
+            ~{if defined(tag) then ("--tag " + '"' + tag + '"') else ""} \
+            ~{if defined(algorithm) then ("--algorithm " + '"' + algorithm + '"') else ""} \
+            ~{true="--indels" false="" indels} \
+            ~{if defined(sample) then ("--sample " + '"' + sample + '"') else ""} \
+            ~{if defined(chromosome) then ("--chromosome " + '"' + chromosome + '"') else ""} \
+            ~{if defined(threshold) then ("--threshold " + '"' + threshold + '"') else ""} \
+            ~{if defined(ped) then ("--ped " + '"' + ped + '"') else ""}
+
+        tabix -p vcf ~{outputVCF}
+    }
+
+    output {
+        File phasedVCF = outputVCF
+        File phasedVCFIndex = outputVCF + ".tbi"
+    }
+
+    runtime {
+        memory: "~{memory}GiB"
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # inputs
+        outputVCF: {description: "The location the phased VCF should be written to. The name must end in .gz, as the output is compressed and then indexed with tabix.", category: "required"}
+        vcf: {description: "VCF or BCF file with variants to be phased (can be gzip-compressed).", category: "required"}
+        vcfIndex: {description: "Index for the VCF or BCF file with variants to be phased.", category: "required"}
+        phaseInput: {description: "BAM, CRAM, VCF or BCF file(s) with phase information, either through sequencing reads (BAM, CRAM) or through phased blocks (VCF, BCF).", category: "required"}
+        phaseInputIndex: {description: "Index of BAM, CRAM, VCF or BCF file(s) with phase information.", category: "required"}
+        reference: {description: "Reference file. Provide this to detect alleles through re-alignment. If no index (.fai) exists, it will be created.", category: "common"}
+        referenceIndex: {description: "Index of reference file.", category: "common"}
+        tag: {description: "Store phasing information with PS tag (standardized) or HP tag (used by GATK ReadBackedPhasing) (default: PS).", category: "common"}
+        algorithm: {description: "Phasing algorithm to use (default: whatshap).", category: "advanced"}
+        indels: {description: "Also phase indels (default: do not phase indels).", category: "common"}
+        sample: {description: "Name of a sample to phase. If not given, all samples in the input VCF are phased. Can be used multiple times.", category: "common"}
+        chromosome: {description: "Name of chromosome to phase. If not given, all chromosomes in the input VCF are phased. 
Can be used multiple times.", category: "common"}
+        threshold: {description: "The threshold of the ratio between the probabilities that a pair of reads come from the same haplotype and different haplotypes in the read merging model (default: 1000000).", category: "advanced"}
+        ped: {description: "Use pedigree information in PED file to improve phasing (switches to PedMEC algorithm). Columns 2, 3, 4 must refer to child, mother, and father sample names as used in the VCF and BAM/CRAM. Other columns are ignored.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use in gigabytes.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        phasedVCF: {description: "VCF file containing phased variants."}
+        phasedVCFIndex: {description: "Index of phased VCF file."}
+    }
+}
+
+task Stats {
+    input {
+        File vcf
+
+        String? gtf
+        String? sample
+        String? tsv
+        String? blockList
+        String? chromosome
+
+        String memory = "4GiB"
+        Int timeMinutes = 30
+        # Whatshap 1.0, tabix 0.2.5.
+        String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0"
+    }
+
+    command {
+        set -e
+
+        mkdir -p "$(dirname ~{tsv})"
+
+        whatshap stats \
+            ~{vcf} \
+            ~{if defined(gtf) then ("--gtf " + '"' + gtf + '"') else ""} \
+            ~{if defined(sample) then ("--sample " + '"' + sample + '"') else ""} \
+            ~{if defined(tsv) then ("--tsv " + '"' + tsv + '"') else ""} \
+            ~{if defined(blockList) then ("--block-list " + '"' + blockList + '"') else ""} \
+            ~{if defined(chromosome) then ("--chromosome " + '"' + chromosome + '"') else ""}
+    }
+
+    output {
+        File? phasedGTF = gtf
+        File? phasedTSV = tsv
+        File? phasedBlockList = blockList
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # inputs
+        vcf: {description: "Phased VCF file.", category: "required"}
+        gtf: {description: "Write phased blocks to GTF file.", category: "common"}
+        sample: {description: "Name of the sample to process. If not given, use first sample found in VCF.", category: "common"}
+        tsv: {description: "Filename to write statistics to (tab-separated).", category: "common"}
+        blockList: {description: "Filename to write list of all blocks to (one block per line).", category: "advanced"}
+        chromosome: {description: "Name of chromosome to process. If not given, all chromosomes in the input VCF are considered.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        phasedGTF: {description: "Phasing statistics for a single VCF file."}
+        phasedTSV: {description: "Statistics in a tab-separated value format."}
+        phasedBlockList: {description: "List of the total number of phase sets/blocks."}
+    }
+}
+
+task Haplotag {
+    input {
+        File vcf
+        File vcfIndex
+        File alignments
+        File alignmentsIndex
+        String outputFile
+
+        File? reference
+        File? referenceFastaIndex
+        String? regions
+        String? 
sample
+
+        Int memory = 2 + ceil(size(alignments, "G") / 50)
+        Int timeMinutes = 50 + ceil(size(alignments, "G") * 2)
+
+        # Whatshap 1.0, tabix 0.2.5.
+        String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0"
+    }
+
+    command {
+        set -e
+
+        mkdir -p "$(dirname ~{outputFile})"
+
+        whatshap haplotag \
+            ~{vcf} \
+            ~{alignments} \
+            --output "~{outputFile}" \
+            ~{if defined(reference) then ("--reference " + '"' + reference + '"') else ""} \
+            ~{if defined(regions) then ("--regions " + '"' + regions + '"') else ""} \
+            ~{if defined(sample) then ("--sample " + '"' + sample + '"') else ""}
+
+        python3 -c "import pysam; pysam.index('~{outputFile}')"
+    }
+
+    output {
+        File bam = outputFile
+        File bamIndex = outputFile + ".bai"
+    }
+
+    runtime {
+        memory: "~{memory}GiB"
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # inputs
+        vcf: {description: "VCF file with phased variants (must be gzip-compressed and indexed).", category: "required"}
+        vcfIndex: {description: "Index for the VCF file with phased variants.", category: "required"}
+        alignments: {description: "File (BAM/CRAM) with read alignments to be tagged by haplotype.", category: "required"}
+        alignmentsIndex: {description: "Index for the alignment file.", category: "required"}
+        outputFile: {description: "The location the tagged BAM file should be written to.", category: "required"}
+        reference: {description: "Reference file. Provide this to detect alleles through re-alignment. If no index (.fai) exists, it will be created.", category: "common"}
+        referenceFastaIndex: {description: "Index for the reference file.", category: "common"}
+        regions: {description: "Specify region(s) of interest to limit the tagging to reads/variants overlapping those regions. You can specify a space-separated list of regions in the form of chrom:start-end, chrom (consider entire chromosome), or chrom:start (consider region from this start to end of chromosome).", category: "advanced"}
+        sample: {description: "Name of a sample to tag. If not given, all samples in the input VCF are used. Can be used multiple times.", category: "common"}
+        memory: {description: "The amount of memory this job will use in gigabytes.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        bam: {description: "BAM file containing tagged reads for haplotype."}
+        bamIndex: {description: "Index of the tagged BAM file."}
+    }
+}
diff --git a/wisestork.wdl b/wisestork.wdl
index 0fd812b1..bef54e27 100644
--- a/wisestork.wdl
+++ b/wisestork.wdl
@@ -22,13 +22,16 @@ version 1.0
 
 task Count {
     input {
-        Int? binSize
-        File reference
-        File referenceIndex
-        File? binFile
         File inputBam
         File inputBamIndex
+        File reference
+        File referenceIndex
         String outputBed = "output.bed"
+
+        Int? binSize
+        File? binFile
+
+        String memory = "2GiB"
         String dockerImage = "quay.io/biocontainers/wisestork:0.1.2--pyh24bf2e0_0"
     }
 
@@ -48,21 +51,25 @@ task Count {
     }
 
     runtime {
+        memory: memory
         docker: dockerImage
     }
 }
 
 task GcCorrect {
     input {
-        Int? binSize
         File reference
         File referenceIndex
-        File? binFile
         File inputBed
         String outputBed = "output.bed"
+
+        Int? binSize
+        File? binFile
         Float? fracN
         Int? iter
         Float? 
fracLowess + + String memory = "2GiB" String dockerImage = "quay.io/biocontainers/wisestork:0.1.2--pyh24bf2e0_0" } @@ -85,19 +92,23 @@ task GcCorrect { } runtime { + memory: memory docker: dockerImage } } task Newref { input { - Int? binSize File reference File referenceIndex - File? binFile Array[File]+ inputBeds String outputBed = "output.bed" + + Int? binSize + File? binFile Int? nBins + + Int memory = 2 + ceil(length(inputBeds) * 0.15) String dockerImage = "quay.io/biocontainers/wisestork:0.1.2--pyh24bf2e0_0" } @@ -106,36 +117,37 @@ task Newref { mkdir -p $(dirname ~{outputBed}) wisestork newref \ ~{"--binsize " + binSize} \ - --reference ~{reference} \ - ~{"--bin-file " + binFile} \ - --output ~{outputBed} \ - -I ~{sep=" -I " inputBeds} \ - ~{"--n-bins " + nBins} + --reference ~{reference} \ + ~{"--bin-file " + binFile} \ + --output ~{outputBed} \ + -I ~{sep=" -I " inputBeds} \ + ~{"--n-bins " + nBins} } output { File bedFile = outputBed } - Int memory = 2 + ceil(length(inputBeds) * 0.15) - runtime { + memory: "~{memory}GiB" docker: dockerImage - memory: "~{memory}G" } } task Zscore { input { - Int? binSize File reference File referenceIndex - File? binFile File inputBed File inputBedIndex File dictionaryFile File dictionaryFileIndex String outputBed = "output.bed" + + Int? binSize + File? binFile + + String memory = "2GiB" String dockerImage = "quay.io/biocontainers/wisestork:0.1.2--pyh24bf2e0_0" } @@ -156,7 +168,7 @@ task Zscore { } runtime { + memory: memory docker: dockerImage } } -
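
For illustration, a minimal sketch of how the new Vep task might be called once this change lands. The workflow name, input names, and species value are hypothetical; only the task interface comes from vep.wdl above.

version 1.0

import "vep.wdl" as vep

# Hypothetical usage sketch for the Vep task added in this change.
# The cache tarball is an archive as downloaded from the VEP website.
workflow AnnotateVcf {
    input {
        File vcf
        File vepCacheTar
    }

    call vep.Vep as annotate {
        input:
            inputFile = vcf,
            cacheTar = vepCacheTar,
            species = "homo_sapiens",  # example value
            everything = true
    }

    output {
        File annotatedVcf = annotate.outputFile
        File vepStatsHtml = annotate.statsHtml
        File vepStatsTxt = annotate.statsTxt
    }
}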
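
Likewise, a sketch of the reworked vt.Normalize task using the new filterExpression input; "PASS" restricts processing to passing variants per the vt filter syntax. All workflow and input names here are hypothetical.

version 1.0

import "vt.wdl" as vt

# Hypothetical example: normalize and decompose only PASS variants.
# The output is bgzip-compressed and tabix-indexed by the task itself.
workflow NormalizeVcf {
    input {
        File vcf
        File vcfIndex
        File referenceFasta
        File referenceFastaFai
    }

    call vt.Normalize as normalize {
        input:
            inputVCF = vcf,
            inputVCFIndex = vcfIndex,
            referenceFasta = referenceFasta,
            referenceFastaFai = referenceFastaFai,
            filterExpression = "PASS"
    }

    output {
        File normalizedVcf = normalize.outputVcf
        File normalizedVcfIndex = normalize.outputVcfIndex
    }
}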
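
Finally, a sketch of how the new whatshap tasks are meant to chain: Phase produces a bgzip-compressed, tabix-indexed VCF whose index Haplotag consumes directly. Workflow and input names are hypothetical; note that outputVCF must end in .gz because the Phase task indexes its output with tabix.

version 1.0

import "whatshap.wdl" as whatshap

# Hypothetical sketch chaining the Phase and Haplotag tasks.
workflow PhaseAndTag {
    input {
        File vcf
        File vcfIndex
        File bam
        File bamIndex
        File reference
        File referenceIndex
    }

    call whatshap.Phase as phase {
        input:
            outputVCF = "phased.vcf.gz",  # must end in .gz
            vcf = vcf,
            vcfIndex = vcfIndex,
            phaseInput = bam,
            phaseInputIndex = bamIndex,
            reference = reference,
            referenceIndex = referenceIndex
    }

    call whatshap.Haplotag as haplotag {
        input:
            vcf = phase.phasedVCF,
            vcfIndex = phase.phasedVCFIndex,
            alignments = bam,
            alignmentsIndex = bamIndex,
            outputFile = "haplotagged.bam",
            reference = reference,
            referenceFastaIndex = referenceIndex
    }

    output {
        File phasedVcf = phase.phasedVCF
        File taggedBam = haplotag.bam
        File taggedBamIndex = haplotag.bamIndex
    }
}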