From 319501e7ebbc0fa76baaac1d48d56294eda4b86c Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 11:21:25 +0100
Subject: [PATCH 01/13] Add a samtools split task

---
NOTE(review): the line structure of this series was restored from the hunk
line counts; indentation of the WDL lines is reconstructed and should be
checked against the repository before applying.
NOTE(review): this patch adds task Split, which runs `samtools split` and
globs the per-read-group files written under <outputPath>/rg/. Here
`mkdir -p` creates only <outputPath>; PATCH 11/13 changes it to create the
rg/ subdirectory as well.

 samtools.wdl | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/samtools.wdl b/samtools.wdl
index fbb445e7..a82bbda1 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -514,6 +514,61 @@ task Sort {
     }
 }
 
+task Split {
+    input {
+        File inputBam
+        Directory outputPath
+        String? unaccountedPath
+        String? filenameFormat = "%!.%."
+        String? outputFormat = "bam"
+        Boolean writeIndex = false
+
+        Int threads = 1
+        String memory = "1GiB"
+        Int timeMinutes = 1 + ceil(size(inputBam, "GiB") * 2)
+        String dockerImage = "quay.io/biocontainers/samtools:1.16.1--h6899075_1"
+    }
+
+    command {
+        set -e
+        mkdir -p "~{outputPath}"
+        samtools split \
+        --output-fmt ~{outputFormat} \
+        -f "~{outputPath}/rg/~{filenameFormat}" \
+        ~{"-u " + unaccountedPath} \
+        ~{true="--write-index" false="" writeIndex} \
+        ~{inputBam}
+    }
+
+    output {
+        Array[File] split = glob(outputPath + "/rg/*." + outputFormat)
+        File? unaccounted = unaccountedPath
+    }
+
+    runtime {
+        cpu: threads
+        memory: memory
+        docker: dockerImage
+        time_minutes: timeMinutes
+    }
+
+    parameter_meta {
+        # inputs
+        inputBam: {description: "The bam file to split.", category: "required"}
+        outputPath: {description: "Directory to store output bams", category: "required"}
+
+        # Optional parameters
+        unaccountedPath: {description: "The location to write reads to which are not detected as being part of an existing read group.", category: "optional"}
+        filenameFormat: {description: "Format of the filename, the following tokens can be used: %% a literal % sign, %* basename, %# @RG index, %! @RG ID, %. filename extension for output format", category: "format"}
+        outputFormat: {description: "Format of output files (SAM, BAM, CRAM)", category: "format"}
+        writeIndex: {description: "Automatically index outputs", category: "indexing"}
+
+        # outputs
+        split: {description: "BAM file split by read groups"}
+        unaccounted: {description: "Reads with no RG tag or an unrecognised RG tag."}
+    }
+}
+
 task Tabix {
     input {
         File inputFile

From 60dcef74f6229d81d19436a361f3e4e6aa41ddd0 Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 11:22:35 +0100
Subject: [PATCH 02/13] Register in changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 378731bd..2993ddc0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ version 6.0.0-dev
 + Use softlinks to localise the database for centrifuge.
 + Added the FastqFilter task.
 + Added a new input `revcomp` to cutadapt to set the `--revcomp` flag, defaults to `false`.
++ New samtools task: split.
 
 version 5.2.0
 ---------------------------

From 4030091ee212be3cc040c69a61834684b8c8be0e Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 11:27:49 +0100
Subject: [PATCH 03/13] Directory not yet available

---
 samtools.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samtools.wdl b/samtools.wdl
index a82bbda1..51230097 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -517,7 +517,7 @@ task Sort {
 task Split {
     input {
         File inputBam
-        Directory outputPath
+        String outputPath
         String? unaccountedPath
         String? filenameFormat = "%!.%."
         String? outputFormat = "bam"

From 8a0de277c0b69a7607757a0c8c102a379e8e444c Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 11:28:19 +0100
Subject: [PATCH 04/13] Must be defined

---
 samtools.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samtools.wdl b/samtools.wdl
index 51230097..a2be09a4 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -519,8 +519,8 @@ task Split {
         File inputBam
         String outputPath
         String? unaccountedPath
-        String? filenameFormat = "%!.%."
-        String? outputFormat = "bam"
+        String filenameFormat = "%!.%."
+        String outputFormat = "bam"
         Boolean writeIndex = false
 
         Int threads = 1

From b70891c3aea7314777aaf5122de3beadf10965e3 Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 12:27:17 +0100
Subject: [PATCH 05/13] noticed in wdl-aid that only these are permitted

---
 samtools.wdl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/samtools.wdl b/samtools.wdl
index a2be09a4..2fe9a9f7 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -558,10 +558,10 @@ task Split {
         outputPath: {description: "Directory to store output bams", category: "required"}
 
         # Optional parameters
-        unaccountedPath: {description: "The location to write reads to which are not detected as being part of an existing read group.", category: "optional"}
-        filenameFormat: {description: "Format of the filename, the following tokens can be used: %% a literal % sign, %* basename, %# @RG index, %! @RG ID, %. filename extension for output format", category: "format"}
-        outputFormat: {description: "Format of output files (SAM, BAM, CRAM)", category: "format"}
-        writeIndex: {description: "Automatically index outputs", category: "indexing"}
+        unaccountedPath: {description: "The location to write reads to which are not detected as being part of an existing read group.", category: "common"}
+        filenameFormat: {description: "Format of the filename, the following tokens can be used: %% a literal % sign, %* basename, %# @RG index, %! @RG ID, %. filename extension for output format", category: "common"}
+        outputFormat: {description: "Format of output files (SAM, BAM, CRAM)", category: "advanced"}
+        writeIndex: {description: "Automatically index outputs", category: "advanced"}
 
         # outputs
         split: {description: "BAM file split by read groups"}

From 1ec88558c5b21cb1362518b2c4af95a865abcc68 Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 13:01:26 +0100
Subject: [PATCH 06/13] Add compression level parameter, defaulting to 1

---
NOTE(review): the parameter_meta text added here for compressionLevel says
"when writing gz or bgzf fastq files", but the command passes the value as
`--output-fmt-option level=` to `samtools split`, which writes alignment
files here. The description looks copied from another task and should
describe the split output instead. The same string is carried as context in
PATCH 08, 09 and 10, so fix it in a follow-up commit rather than in place.

 samtools.wdl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/samtools.wdl b/samtools.wdl
index 2fe9a9f7..c46ea88b 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -523,6 +523,8 @@ task Split {
         String outputFormat = "bam"
         Boolean writeIndex = false
 
+        Int compressionLevel = 1
+
         Int threads = 1
         String memory = "1GiB"
         Int timeMinutes = 1 + ceil(size(inputBam, "GiB") * 2)
@@ -534,6 +536,7 @@ task Split {
         mkdir -p "~{outputPath}"
         samtools split \
         --output-fmt ~{outputFormat} \
+        --output-fmt-option level=~{compressionLevel} \
         -f "~{outputPath}/rg/~{filenameFormat}" \
         ~{"-u " + unaccountedPath} \
         ~{true="--write-index" false="" writeIndex} \
@@ -562,6 +565,7 @@ task Split {
         filenameFormat: {description: "Format of the filename, the following tokens can be used: %% a literal % sign, %* basename, %# @RG index, %! @RG ID, %. filename extension for output format", category: "common"}
         outputFormat: {description: "Format of output files (SAM, BAM, CRAM)", category: "advanced"}
         writeIndex: {description: "Automatically index outputs", category: "advanced"}
+        compressionLevel: {description: "Set compression level when writing gz or bgzf fastq files.", category: "advanced"}
 
         # outputs
         split: {description: "BAM file split by read groups"}

From 153db04100bf78f07b898d523a6da84544d8a02b Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 13:01:37 +0100
Subject: [PATCH 07/13] default to indexing

---
 samtools.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samtools.wdl b/samtools.wdl
index c46ea88b..554d0903 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -521,7 +521,7 @@ task Split {
         String? unaccountedPath
         String filenameFormat = "%!.%."
         String outputFormat = "bam"
-        Boolean writeIndex = false
+        Boolean writeIndex = true
 
         Int compressionLevel = 1
 

From 1522785ae1cec9254e5bf57f942260eab2babfd4 Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 13:08:33 +0100
Subject: [PATCH 08/13] Remove control of output format

---
NOTE(review): from this patch on the output glob is fixed to
<outputPath>/rg/*.bam, while filenameFormat stays caller-settable. A format
without the "%." extension token would presumably produce files the glob
does not match -- confirm, and either document the constraint or derive the
glob from the format.

 samtools.wdl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/samtools.wdl b/samtools.wdl
index 554d0903..7eba529c 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -520,7 +520,6 @@ task Split {
         String outputPath
         String? unaccountedPath
         String filenameFormat = "%!.%."
-        String outputFormat = "bam"
         Boolean writeIndex = true
 
         Int compressionLevel = 1
@@ -535,7 +534,7 @@ task Split {
         set -e
         mkdir -p "~{outputPath}"
         samtools split \
-        --output-fmt ~{outputFormat} \
+        --output-fmt bam \
         --output-fmt-option level=~{compressionLevel} \
         -f "~{outputPath}/rg/~{filenameFormat}" \
         ~{"-u " + unaccountedPath} \
@@ -544,7 +543,7 @@ task Split {
     }
 
     output {
-        Array[File] split = glob(outputPath + "/rg/*." + outputFormat)
+        Array[File] splitBam = glob(outputPath + "/rg/*.bam")
         File? unaccounted = unaccountedPath
     }
 
@@ -563,7 +562,6 @@ task Split {
         # Optional parameters
         unaccountedPath: {description: "The location to write reads to which are not detected as being part of an existing read group.", category: "common"}
         filenameFormat: {description: "Format of the filename, the following tokens can be used: %% a literal % sign, %* basename, %# @RG index, %! @RG ID, %. filename extension for output format", category: "common"}
-        outputFormat: {description: "Format of output files (SAM, BAM, CRAM)", category: "advanced"}
         writeIndex: {description: "Automatically index outputs", category: "advanced"}
         compressionLevel: {description: "Set compression level when writing gz or bgzf fastq files.", category: "advanced"}
 

From 2bba90e99bbc61dc08905a569d8bbb3df285878a Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 13:08:42 +0100
Subject: [PATCH 09/13] include indexes

---
 samtools.wdl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/samtools.wdl b/samtools.wdl
index 7eba529c..bfed7560 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -544,6 +544,7 @@ task Split {
 
     output {
         Array[File] splitBam = glob(outputPath + "/rg/*.bam")
+        Array[File] splitBamIndex = glob(outputPath + "/rg/*.bai")
         File? unaccounted = unaccountedPath
     }
 
@@ -566,7 +567,8 @@ task Split {
         compressionLevel: {description: "Set compression level when writing gz or bgzf fastq files.", category: "advanced"}
 
         # outputs
-        split: {description: "BAM file split by read groups"}
+        splitBam: {description: "BAM file split by read groups"}
+        splitBamIndex: {description: "BAM indexes"}
         unaccounted: {description: "Reads with no RG tag or an unrecognised RG tag."}
     }
 }

From bd4a8567cdedabf6aa1e779fa1af731b09e64b49 Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 15:19:02 +0100
Subject: [PATCH 10/13] write index is non-optional

---
 samtools.wdl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/samtools.wdl b/samtools.wdl
index bfed7560..1660aac3 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -520,7 +520,6 @@ task Split {
         String outputPath
         String? unaccountedPath
         String filenameFormat = "%!.%."
-        Boolean writeIndex = true
 
         Int compressionLevel = 1
 
@@ -538,7 +537,7 @@ task Split {
         --output-fmt-option level=~{compressionLevel} \
         -f "~{outputPath}/rg/~{filenameFormat}" \
         ~{"-u " + unaccountedPath} \
-        ~{true="--write-index" false="" writeIndex} \
+        --write-index \
         ~{inputBam}
     }
 
@@ -563,7 +562,6 @@ task Split {
         # Optional parameters
         unaccountedPath: {description: "The location to write reads to which are not detected as being part of an existing read group.", category: "common"}
         filenameFormat: {description: "Format of the filename, the following tokens can be used: %% a literal % sign, %* basename, %# @RG index, %! @RG ID, %. filename extension for output format", category: "common"}
-        writeIndex: {description: "Automatically index outputs", category: "advanced"}
         compressionLevel: {description: "Set compression level when writing gz or bgzf fastq files.", category: "advanced"}
 
         # outputs

From be0aabe03a8615dad5190b5e4c4c9869bb472c2e Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 15:49:15 +0100
Subject: [PATCH 11/13] make subdirectory as well

---
 samtools.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samtools.wdl b/samtools.wdl
index 1660aac3..c452664c 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -531,7 +531,7 @@ task Split {
 
     command {
         set -e
-        mkdir -p "~{outputPath}"
+        mkdir -p "~{outputPath}/rg/"
         samtools split \
         --output-fmt bam \
         --output-fmt-option level=~{compressionLevel} \

From 10e83c1c116d55d148534c7f9fc56056773aadb7 Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 16:03:06 +0100
Subject: [PATCH 12/13] emits csi extension instead

---
 samtools.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samtools.wdl b/samtools.wdl
index c452664c..191a99a2 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -543,7 +543,7 @@ task Split {
 
     output {
         Array[File] splitBam = glob(outputPath + "/rg/*.bam")
-        Array[File] splitBamIndex = glob(outputPath + "/rg/*.bai")
+        Array[File] splitBamIndex = glob(outputPath + "/rg/*.bam.csi")
         File? unaccounted = unaccountedPath
     }
 

From 6ebf7cd161f15add1c8ed9af8f000ab0952d232c Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Fri, 7 Mar 2025 16:14:42 +0100
Subject: [PATCH 13/13] missing threads

---
NOTE(review): at the end of the series the inputs threads, memory,
timeMinutes and dockerImage still have no parameter_meta entry; no patch in
this series adds one. `~{inputBam}` is also the only path in the command that
is not double-quoted.

 samtools.wdl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/samtools.wdl b/samtools.wdl
index 191a99a2..19ad8dab 100644
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -537,6 +537,7 @@ task Split {
         --output-fmt-option level=~{compressionLevel} \
         -f "~{outputPath}/rg/~{filenameFormat}" \
         ~{"-u " + unaccountedPath} \
+        --threads ~{threads} \
         --write-index \
         ~{inputBam}
     }