diff --git a/CHANGELOG.md b/CHANGELOG.md index e0b036ac..c56b124a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,14 @@ that users understand how the changes affect the new version. version 6.0.0-dev --------------------------- ++ Add a task for bcftools norm. ++ Add support for outputting compressed files to snpeff and snpsift. ++ Fixed an issue with the parameter_meta section of bcftools annotate + which caused wdlTools to error on parsing the file. ++ Updated the bcftools view task with an input for an index file. ++ Updated the bcftools view task to allow specifying a region. ++ Added a task for SnpSift filter. ++ Updated the snpEff task to allow setting the `-no-upstream` flag. + Update vt task to allow a filter expression and compress and index the output. + MultiQC image updated to version 1.28 + Samtools merge now has options added for merging RG and PG headers. diff --git a/bcftools.wdl b/bcftools.wdl index 7df8911d..31c7db13 100644 --- a/bcftools.wdl +++ b/bcftools.wdl @@ -111,7 +111,7 @@ task Annotate { collapse: {description: "Treat as identical records with , see man page for details.", category: "advanced"} exclude: {description: "Exclude sites for which the expression is true (see man page for details).", category: "advanced"} headerLines: {description: "Lines to append to the VCF header (see man page for details).", category: "advanced"} - newId: {description: "Assign ID on the fly (e.g. --set-id +'%CHROM\_%POS').", category: "advanced"} + newId: {description: "Assign ID on the fly (e.g. 
--set-id +'%CHROM\\_%POS').", category: "advanced"} include: {description: "Select sites for which the expression is true (see man page for details).", category: "advanced"} markSites: {description: "Annotate sites which are present ('+') or absent ('-') in the -a file with a new INFO/TAG flag.", category: "advanced"} regions: {description: "Restrict to comma-separated list of regions.", category: "advanced"} @@ -180,6 +180,75 @@ task Filter { } }
+# Runs `bcftools norm` on a VCF/BCF file. When outputPath ends in `.gz` the
+# output is written compressed and a tabix (.tbi) index is created next to it.
+task Norm {
+    input {
+        File inputFile
+        File? inputFileIndex
+        String outputPath = "output.vcf.gz"
+
+        File? fasta
+        String? regions
+        Boolean splitMultiallelicSites = false
+
+        String memory = "4GiB"
+        Int timeMinutes = 1 + ceil(size(inputFile, "G"))
+        Int diskGb = ceil(2.1 * size(inputFile, "G") + size(fasta, "G"))
+        String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2"
+    }
+
+    # True when outputPath ends in `.gz`; selects compressed output and indexing.
+    Boolean compressed = basename(outputPath) != basename(outputPath, ".gz")
+
+    command {
+        set -e
+        ls ~{inputFile} ~{inputFileIndex} # dxCompiler localization workaround
+
+        mkdir -p "$(dirname ~{outputPath})"
+        bcftools norm \
+        -o ~{outputPath} \
+        -O ~{true="z" false="v" compressed} \
+        ~{"--regions " + regions} \
+        ~{"--fasta " + fasta} \
+        ~{if splitMultiallelicSites then "--multiallelics -both" else ""} \
+        ~{inputFile}
+
+        ~{if compressed then "bcftools index --tbi ~{outputPath}" else ""}
+    }
+
+    output {
+        File outputVcf = outputPath
+        File? outputVcfIndex = outputPath + ".tbi"
+    }
+
+    runtime {
+        memory: memory
+        time_minutes: timeMinutes
+        docker: dockerImage
+        disks: "local-disk ~{diskGb} SSD" # Based on an example in dxCompiler docs
+    }
+
+    parameter_meta {
+        # inputs
+        inputFile: {description: "A vcf or bcf file.", category: "required"}
+        inputFileIndex: {description: "The index for the input file.", category: "common"}
+        outputPath: {description: "The location the output VCF file should be written.", category: "common"}
+        fasta: {description: "Equivalent to bcftools norm's `--fasta` option.", category: "advanced"}
+        regions: {description: "Equivalent to bcftools norm's `--regions` option.", category: "advanced"}
+        splitMultiallelicSites: {description: "Whether multiallelic lines should be split up.", category: "advanced"}
+
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        diskGb: {description: "The amount of disk space needed for this job in GiB.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputVcf: {description: "Normalized VCF file."}
+        outputVcfIndex: {description: "Index of Normalized VCF file."}
+    }
+}
+
 task Sort { input { File inputFile @@ -344,11 +413,13 @@ task Stats { task View { input { File inputFile + File? inputFileIndex String outputPath = "output.vcf" Boolean excludeUncalled = false String? exclude String? include + String? region Array[String] samples = [] String memory = "256MiB" @@ -360,6 +431,8 @@ task View { command { set -e + ls ~{inputFile} ~{inputFileIndex} # dxCompiler localization workaround + mkdir -p "$(dirname ~{outputPath})" bcftools view \ ~{"--exclude " + exclude} \ @@ -368,7 +441,8 @@ task View { ~{if length(samples) > 0 then "-s" else ""} ~{sep="," samples} \ -o ~{outputPath} \ -O ~{true="z" false="v" compressed} \ - ~{inputFile} + ~{inputFile} \ + ~{region} ~{if compressed then 'bcftools index --tbi ~{outputPath}' else ''} } @@ -387,9 +461,11 @@ task View { parameter_meta { # inputs inputFile: {description: "A vcf or bcf file.", category: "required"} + inputFileIndex: {description: "The index for the input file.", category: "common"} outputPath: {description: "The location the output VCF file should be written.", category: "common"} include: {description: "Select sites for which the expression is true (see man page for details).", category: "advanced"} exclude: {description: "Exclude sites for which the expression is true (see man page for details).", category: "advanced"} + region: {description: "The region to retrieve from the VCF file.", category: "common"} excludeUncalled: {description: "Exclude sites without a called genotype (see man page for details).", category: "advanced"} samples: {description: "A list of sample names to include.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} diff --git a/snpeff.wdl b/snpeff.wdl index 0f14e5b5..b972ab30 100644 --- a/snpeff.wdl +++ b/snpeff.wdl @@ -32,6 +32,7 @@ task SnpEff { Boolean hgvs = true Boolean lof = true Boolean noDownstream = false + Boolean noUpstream = false Boolean noIntergenic = false Boolean noShiftHgvs = false Int? 
upDownStreamLen @@ -39,11 +40,15 @@ task SnpEff { String memory = "9GiB" String javaXmx = "8G" Int timeMinutes = 60 - String dockerImage = "quay.io/biocontainers/snpeff:5.0--0" + # Multicontainer with snpeff 5.2 and bgzip/tabix 1.19.1 + String dockerImage = "quay.io/biocontainers/mulled-v2-2fe536b56916bd1d61a6a1889eb2987d9ea0cd2f:c51b2e46bf63786b2d9a7a7d23680791163ab39a-0" } + Boolean compressed = basename(outputPath) != basename(outputPath, ".gz") + command { - set -e + set -e -o pipefail + ls ~{vcf} ~{vcfIndex} # dxCompiler localization workaround mkdir -p "$(dirname ~{outputPath})" unzip ~{datadirZip} snpEff -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ @@ -55,15 +60,19 @@ task SnpEff { ~{true="-hgvs" false="-noHgvs" hgvs} \ ~{true="-lof" false="-noLof" lof} \ ~{true="-no-downstream" false="" noDownstream} \ + ~{true="-no-upstream" false="" noUpstream} \ ~{true="-no-intergenic" false="" noIntergenic} \ ~{true="-noShiftHgvs" false="" noShiftHgvs} \ ~{"-upDownStreamLen " + upDownStreamLen} \ - > ~{outputPath} + ~{if compressed then "| bgzip " else ""} > ~{outputPath} + + ~{if compressed then "tabix -p vcf ~{outputPath}" else ""} rm -r $PWD/data } output { File outputVcf = outputPath + File? outputVcfIndex = outputPath + ".tbi" } runtime { @@ -73,6 +82,7 @@ task SnpEff { } parameter_meta { + # inputs vcf: {description: "A VCF file to analyse.", category: "required"} vcfIndex: {description: "The index for the VCF file.", category: "required"} genomeVersion: {description: "The version of the genome to be used. 
The database for this genome must be present in the datadirZip.", category: "required"} @@ -82,6 +92,7 @@ task SnpEff { hgvs: {description: "Equivalent to `-hgvs` if true or `-noHgvs` if false.", category: "advanced"} lof: {description: "Equivalent to `-lof` if true or `-noLof` if false.", category: "advanced"} noDownstream: {description: "Equivalent to the `-no-downstream` flag.", category: "advanced"} + noUpstream: {description: "Equivalent to the `-no-upstream` flag.", category: "advanced"} noIntergenic: {description: "Equivalent to the `-no-intergenic` flag.", category: "advanced"} noShiftHgvs: {description: "Equivalent to the `-noShiftHgvs` flag.", category: "advanced"} - upDownStreamLen: {descriptoin: "Equivalent to the `-upDownStreamLen` option.", category: "advanced"} + upDownStreamLen: {description: "Equivalent to the `-upDownStreamLen` option.", category: "advanced"} @@ -92,5 +103,9 @@ task SnpEff { timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputVcf: {description: "Annotated VCF file."} + outputVcfIndex: {description: "Index of annotated VCF file."} } } diff --git a/snpsift.wdl b/snpsift.wdl new file mode 100644 index 00000000..a62f7295 --- /dev/null +++ b/snpsift.wdl @@ -0,0 +1,84 @@ +version 1.0 + +# MIT License +# +# Copyright (c) 2025 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all 
copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Runs `SnpSift filter` on a VCF; output is bgzipped and tabix-indexed when outputPath ends in `.gz`.
+task SnpSiftFilter {
+    input {
+        File vcf
+        File? vcfIndex
+        String filterExpression
+        String outputPath = "./snpsift_filter.vcf"
+
+        String memory = "9GiB"
+        String javaXmx = "8G"
+        Int timeMinutes = 60
+        # Multicontainer with SnpSift 5.2 and bgzip/tabix 1.22
+        String dockerImage = "quay.io/biocontainers/mulled-v2-d4bc0c23eb1d95c7ecff7f0e8b3a4255503fd5d4:c51b2e46bf63786b2d9a7a7d23680791163ab39a-0"
+    }
+
+    Boolean compressed = basename(outputPath) != basename(outputPath, ".gz")
+
+    command {
+        # pipefail: without it a SnpSift failure is masked by the exit status of bgzip.
+        set -e -o pipefail
+        ls ~{vcf} ~{vcfIndex} # dxCompiler localization workaround
+
+        mkdir -p "$(dirname ~{outputPath})"
+        SnpSift -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
+        filter \
+        "~{filterExpression}" \
+        ~{vcf} \
+        ~{if compressed then "| bgzip " else ""} > ~{outputPath}
+
+        ~{if compressed then "tabix -p vcf ~{outputPath}" else ""}
+    }
+
+    output {
+        File outputVcf = outputPath
+        File? outputVcfIndex = outputPath + ".tbi"
+    }
+
+    runtime {
+        docker: dockerImage
+        time_minutes: timeMinutes # !UnknownRuntimeKey
+        memory: memory
+    }
+
+    parameter_meta {
+        # inputs
+        vcf: {description: "A VCF file to filter.", category: "required"}
+        vcfIndex: {description: "The index for the VCF file.", category: "common"}
+        filterExpression: {description: "The SnpSift filtering expression.", category: "required"}
+        outputPath: {description: "The path to write the output to.", category: "common"}
+
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputVcf: {description: "Filtered VCF file."}
+        outputVcfIndex: {description: "Index of filtered VCF file."}
+    }
+}