From 9e9ae08503c7c2e10c0fe16d018bfb2810c4f3de Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Tue, 22 Jul 2025 11:36:55 +0200 Subject: [PATCH 1/7] Update clair3 image --- CHANGELOG.md | 1 + clair3.wdl | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c56b124a..5cabdece 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,7 @@ version 6.0.0-dev + Add support for filterThreshold/filterPercent for `modkit.Pileup`. + Add `modkit.Summary` task. + Disable the one-click GDPR dataleak button in MultiQC `--no-ai` by default. ++ Update clair3 version from 1.0.11 to 1.1.0 version 5.2.0 --------------------------- diff --git a/clair3.wdl b/clair3.wdl index 5a6154af..ae54ef40 100644 --- a/clair3.wdl +++ b/clair3.wdl @@ -34,8 +34,8 @@ task Clair3 { Int threads = 8 Boolean includeAllCtgs = false String memory = "~{threads + 16}GiB" - Int timeMinutes = 10 + ceil(size(bam, "G") * 400 / threads) - String dockerImage = "quay.io/biocontainers/clair3:1.0.11--py39hd649744_0" + Int timeMinutes = 10 + ceil(size(bam, "G") * 200 / threads) + String dockerImage = "quay.io/biocontainers/clair3:1.1.0--py39hd649744_0" } String modelArg = "~{if defined(modelTar) then basename(select_first([modelTar]), '.tar.gz') else builtinModel}" @@ -91,4 +91,4 @@ task Clair3 { vcfIndex: {description: "Output VCF index."} } -} \ No newline at end of file +} From d648745cfeedbc816081547f9772f0ee2d9f1692 Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Tue, 22 Jul 2025 12:13:43 +0200 Subject: [PATCH 2/7] Improve whatshap runtime/memory estimates --- CHANGELOG.md | 1 + whatshap.wdl | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5cabdece..cfb8f41d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -55,6 +55,7 @@ version 6.0.0-dev + Add `modkit.Summary` task. + Disable the one-click GDPR dataleak button in MultiQC `--no-ai` by default. 
+ Update clair3 version from 1.0.11 to 1.1.0 ++ Improve whatshap runtime/memory usage for our cluster. version 5.2.0 --------------------------- diff --git a/whatshap.wdl b/whatshap.wdl index da86ad82..beef5e99 100644 --- a/whatshap.wdl +++ b/whatshap.wdl @@ -40,12 +40,19 @@ task Phase { String memory = "4GiB" Int timeMinutes = 120 + + String memory = 2 + ceil(size(bam, "G") / 20 ) + Int timeMinutes = 400 + ceil(size(bam, "G") * 0.9 ) + # Whatshap 1.0, tabix 0.2.5. String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0" } command { set -e + + mkdir -p $(dirname ~{outputVCF}) + whatshap phase \ ~{vcf} \ ~{phaseInput} \ @@ -110,12 +117,16 @@ task Stats { String? chromosome String memory = "4GiB" - Int timeMinutes = 120 + Int timeMinutes = 30 # Whatshap 1.0, tabix 0.2.5. String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0" } command { + set -e + + mkdir -p $(dirname ~{tsv}) + whatshap stats \ ~{vcf} \ ~{if defined(gtf) then ("--gtf " + '"' + gtf + '"') else ""} \ @@ -169,7 +180,9 @@ task Haplotag { String? regions String? sample - String memory = "4GiB" + String memory = 2 + ceil(size(bam, "G") / 50 ) + Int timeMinutes = 50 + ceil(size(bam, "G") * 2 ) + Int timeMinutes = 120 # Whatshap 1.0, tabix 0.2.5. 
String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0" @@ -177,6 +190,9 @@ task Haplotag { command { set -e + + mkdir -p $(dirname ~{outputFile}) + whatshap haplotag \ ~{vcf} \ ~{alignments} \ From 7e246b01de31489577c434f69a5adbd2ab7cea2c Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Tue, 22 Jul 2025 12:18:54 +0200 Subject: [PATCH 3/7] Add modkit tasks --- CHANGELOG.md | 3 + modkit.wdl | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 193 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cfb8f41d..bf9d9238 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,6 +56,9 @@ version 6.0.0-dev + Disable the one-click GDPR dataleak button in MultiQC `--no-ai` by default. + Update clair3 version from 1.0.11 to 1.1.0 + Improve whatshap runtime/memory usage for our cluster. ++ Add `Modkit.SampleProbs` ++ Add `Modkit.DmrMulti` ++ Add `Modkit.DmrMultiInputPrep` to construct the command line for `Modkit.DmrMulti` version 5.2.0 --------------------------- diff --git a/modkit.wdl b/modkit.wdl index 678e326a..a35d8ed2 100644 --- a/modkit.wdl +++ b/modkit.wdl @@ -130,7 +130,7 @@ task Summary { Int threads = 4 String memory = ceil(size(bam, "GiB") * 0.1) + 5 # Based on a linear model with some fudge (memory = 0.07540 * file_size - 0.6). - Int timeMinutes = 2880 / threads # 2 Days / threads + Int timeMinutes = 60 # originally this was set at "2 Days / threads" but with 4 threads and that much ram, it's pretty fast. String dockerImage = "quay.io/biocontainers/ont-modkit:0.4.3--hcdda2d0_0" } @@ -177,3 +177,192 @@ task Summary { summaryReport: {description: "The output modkit summary."} } } + +task SampleProbs { + input { + File bam + File bamIndex + + String summary = "modkit-sample-probs" + + Boolean sample = true + Int? numReads # = 10042 + Float? samplingFrac # = 0.1 + Int? 
seed + + Int threads = 4 + String memory = "32G" + Int timeMinutes = 60 + String dockerImage = "quay.io/biocontainers/ont-modkit:0.4.3--hcdda2d0_0" + } + + command <<< + set -e + mkdir -p ~{summary} + + modkit sample-probs \ + --threads ~{threads} \ + --out-dir ~{summary} \ + ~{true="" false="--no-sampling" sample} \ + ~{"--num-reads " + numReads} \ + ~{"--sampling-frac " + samplingFrac} \ + ~{"--seed " + seed} \ + --hist \ + ~{bam} + >>> + + output { + File reportCounts = "~{summary}/counts.html" + File reportProportion = "~{summary}/proportion.html" + File reportProbabilitiesTsv = "~{summary}/probabilities.tsv" + File reportThresholdsTsv = "~{summary}/thresholds.tsv" + } + + runtime { + docker: dockerImage + cpu: threads + memory: memory + time_minutes: timeMinutes + } + + parameter_meta { + # input + bam: {description: "The input alignment file", category: "required"} + bamIndex: {description: "The index for the input alignment file", category: "required"} + + sample: {description: "Allows you to disable sampling and report stats for the whole file.", category: "advanced"} + numReads: {description: "By default a fixed amount of reads are read, you can set this to change the number of reads to sample.", category: "advanced"} + samplingFrac: {description: "Use a fixed percentage of reads, rather than a fixed number of reads, for sampling.", category: "advanced"} + seed: {description: "A seed can be provided for reproducibility in the sampling fraction case.", category: "advanced"} + + threads: {description: "The number of threads to use.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # output + summaryReport: {description: "The output modkit summary."} + } +} + +task DmrMultiInputPrep { + input { + Array[File] control + Array[File] condition + String controlName + String conditionName + + Int threads = 1 + String memory = "1G" + Int timeMinutes = 5 + String dockerImage = "quay.io/biocontainers/multiqc:1.28--pyhdfd78af_0" + } + + command <<< + cat > modkit_dmr.py <<'CODE' + #!/usr/bin/env python3 + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--control_n', type=str, default='control') + parser.add_argument('--control_f', type=str,nargs='+') + parser.add_argument('--condition_n', type=str, default='condition') + parser.add_argument('--condition_f', type=str,nargs='+') + args = parser.parse_args() + modkit = [] + for i, x in enumerate(args.control_f): + modkit.extend(['-s', x, f'{args.control_n}{i}']) + for i, x in enumerate(args.condition_f): + modkit.extend(['-s', x, f'{args.condition_n}{i}']) + print(' '.join(modkit), end='') + CODE + + python modkit_dmr.py \ + --control_n ~{controlName} \ + --control_f ~{sep=" " control} \ + --condition_n ~{conditionName} \ + --condition_f ~{sep=" " condition} + >>> + + output { + String params = select_first(read_lines(stdout())) + } + + runtime { + docker: dockerImage + cpu: threads + memory: memory + time_minutes: timeMinutes + } +} + + +task DmrMulti { + input { + String dmrMultiArguments + Array[File] control + Array[File] condition + + Array[File] controlIndex + Array[File] conditionIndex + + String controlName + String conditionName + + File referenceFasta + File referenceFastaFai + String dmr_dir = "results" + + File? 
cpg_islands + + Int threads = 4 + String memory = "32G" + Int timeMinutes = 600 + String dockerImage = "quay.io/biocontainers/ont-modkit:0.4.3--hcdda2d0_0" + } + + command <<< + set -e + mkdir -p ~{dmr_dir} + + modkit dmr multi \ + ~{dmrMultiArguments} \ + --out-dir ~{dmr_dir} \ + ~{"--regions-bed " + cpg_islands} \ + --ref ~{referenceFasta} \ + --base C \ + --threads ~{threads} \ + --header \ + --log-filepath dmr_multi.log + >>> + + output { + # TODO: other files + File log = "dmr_multi.log" + } + + runtime { + docker: dockerImage + cpu: threads + memory: memory + time_minutes: timeMinutes + } + + parameter_meta { + # input + bam: {description: "The input alignment file", category: "required"} + bamIndex: {description: "The index for the input alignment file", category: "required"} + + sample: {description: "Allows you to disable sampling and report stats for the whole file.", category: "advanced"} + numReads: {description: "By default a fixed amount of reads are read, you can set this to change the number of reads to sample.", category: "advanced"} + samplingFrac: {description: "Use a fixed percentage of reads, rather than a fixed number of reads, for sampling.", category: "advanced"} + seed: {description: "A seed can be provided for reproducibility in the sampling fraction case.", category: "advanced"} + + threads: {description: "The number of threads to use.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # output + summaryReport: {description: "The output modkit summary."} + } +} From 8564f8c0a757cecd00155de011e10a1e51ab32cf Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Tue, 22 Jul 2025 12:29:51 +0200 Subject: [PATCH 4/7] I don't feel like documenting it if it isn't used --- CHANGELOG.md | 2 - modkit.wdl | 122 --------------------------------------------------- 2 files changed, 124 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf9d9238..bb09f4f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,8 +57,6 @@ version 6.0.0-dev + Update clair3 version from 1.0.11 to 1.1.0 + Improve whatshap runtime/memory usage for our cluster. + Add `Modkit.SampleProbs` -+ Add `Modkit.DmrMulti` -+ Add `Modkit.DmrMultiInputPrep` to construct the command line for `Modkit.DmrMulti` version 5.2.0 --------------------------- diff --git a/modkit.wdl b/modkit.wdl index a35d8ed2..b38929f5 100644 --- a/modkit.wdl +++ b/modkit.wdl @@ -244,125 +244,3 @@ task SampleProbs { summaryReport: {description: "The output modkit summary."} } } - -task DmrMultiInputPrep { - input { - Array[File] control - Array[File] condition - String controlName - String conditionName - - Int threads = 1 - String memory = "1G" - Int timeMinutes = 5 - String dockerImage = "quay.io/biocontainers/multiqc:1.28--pyhdfd78af_0" - } - - command <<< - cat > modkit_dmr.py <<'CODE' - #!/usr/bin/env python3 - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('--control_n', type=str, default='control') - parser.add_argument('--control_f', type=str,nargs='+') - parser.add_argument('--condition_n', type=str, default='condition') - parser.add_argument('--condition_f', type=str,nargs='+') - args = parser.parse_args() - modkit = [] - for i, x in enumerate(args.control_f): - modkit.extend(['-s', x, f'{args.control_n}{i}']) - for i, x in enumerate(args.condition_f): - 
modkit.extend(['-s', x, f'{args.condition_n}{i}']) - print(' '.join(modkit), end='') - CODE - - python modkit_dmr.py \ - --control_n ~{controlName} \ - --control_f ~{sep=" " control} \ - --condition_n ~{conditionName} \ - --condition_f ~{sep=" " condition} - >>> - - output { - String params = select_first(read_lines(stdout())) - } - - runtime { - docker: dockerImage - cpu: threads - memory: memory - time_minutes: timeMinutes - } -} - - -task DmrMulti { - input { - String dmrMultiArguments - Array[File] control - Array[File] condition - - Array[File] controlIndex - Array[File] conditionIndex - - String controlName - String conditionName - - File referenceFasta - File referenceFastaFai - String dmr_dir = "results" - - File? cpg_islands - - Int threads = 4 - String memory = "32G" - Int timeMinutes = 600 - String dockerImage = "quay.io/biocontainers/ont-modkit:0.4.3--hcdda2d0_0" - } - - command <<< - set -e - mkdir -p ~{dmr_dir} - - modkit dmr multi \ - ~{dmrMultiArguments} \ - --out-dir ~{dmr_dir} \ - ~{"--regions-bed " + cpg_islands} \ - --ref ~{referenceFasta} \ - --base C \ - --threads ~{threads} \ - --header \ - --log-filepath dmr_multi.log - >>> - - output { - # TODO: other files - File log = "dmr_multi.log" - } - - runtime { - docker: dockerImage - cpu: threads - memory: memory - time_minutes: timeMinutes - } - - parameter_meta { - # input - bam: {description: "The input alignment file", category: "required"} - bamIndex: {description: "The index for the input alignment file", category: "required"} - - sample: {description: "Allows you to disable sampling and report stats for the whole file.", category: "advanced"} - numReads: {description: "By default a fixed amount of reads are read, you can set this to change the number of reads to sample.", category: "advanced"} - samplingFrac: {description: "Use a fixed percentage of reads, rather than a fixed number of reads, for sampling.", category: "advanced"} - seed: {description: "A seed can be provided for 
reproducibility in the sampling fraction case.", category: "advanced"} - - threads: {description: "The number of threads to use.", category: "advanced"} - memory: {description: "The amount of memory this job will use.", category: "advanced"} - timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} - - # output - summaryReport: {description: "The output modkit summary."} - } -} From 040a43e6723d672f97f126a1628e773d75fb6515 Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Tue, 22 Jul 2025 12:29:51 +0200 Subject: [PATCH 5/7] Document the SampleProbs summary input and report outputs --- modkit.wdl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modkit.wdl b/modkit.wdl index b38929f5..ddf4dbf7 100644 --- a/modkit.wdl +++ b/modkit.wdl @@ -229,6 +229,7 @@ task SampleProbs { # input bam: {description: "The input alignment file", category: "required"} bamIndex: {description: "The index for the input alignment file", category: "required"} + summary: {description: "A folder for the outputs", category: "required"} sample: {description: "Allows you to disable sampling and report stats for the whole file.", category: "advanced"} numReads: {description: "By default a fixed amount of reads are read, you can set this to change the number of reads to sample.", category: "advanced"} @@ -241,6 +242,9 @@ task SampleProbs { dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # output - summaryReport: {description: "The output modkit summary."} + reportCounts: {description: "The output html report of counts"} + reportProportion: {description: "The output html report of proportions"} + reportProbabilitiesTsv: {description: "The output TSV of Probabilities"} + reportThresholdsTsv: {description: "The output TSV of thresholds"} } } From bd54aeb0bcdd67db3fb180b890954bca92000287 Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Tue, 22 Jul 2025 13:13:43 +0200 Subject: [PATCH 6/7] incorrect inputs --- whatshap.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/whatshap.wdl b/whatshap.wdl index beef5e99..3b2bd1d3 100644 --- a/whatshap.wdl +++ b/whatshap.wdl @@ -41,8 +41,8 @@ task Phase { String memory = "4GiB" Int timeMinutes = 120 - String memory = 2 + ceil(size(bam, "G") / 20 ) - Int timeMinutes = 400 + ceil(size(bam, "G") * 0.9 ) + String memory = 2 + ceil(size(phaseInput, "G") / 20 ) + Int timeMinutes = 400 + ceil(size(phaseInput, "G") * 0.9 ) # Whatshap 1.0, tabix 0.2.5. String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0" @@ -180,8 +180,8 @@ task Haplotag { String? regions String? sample - String memory = 2 + ceil(size(bam, "G") / 50 ) - Int timeMinutes = 50 + ceil(size(bam, "G") * 2 ) + String memory = 2 + ceil(size(alignments, "G") / 50 ) + Int timeMinutes = 50 + ceil(size(alignments, "G") * 2 ) Int timeMinutes = 120 # Whatshap 1.0, tabix 0.2.5. 
From e7061594546ceac5e7bbcdc48877bc78b5ec795c Mon Sep 17 00:00:00 2001
From: Helena Rasche
Date: Tue, 22 Jul 2025 15:46:06 +0200
Subject: [PATCH 7/7] Fix duplicate declarations

Remove the stale fixed `memory`/`timeMinutes` defaults in the Phase and
Haplotag tasks; they duplicated the new size-based declarations.

Also give the size-based `memory` defaults an explicit GiB unit. The
defaults they replace ("4GiB") carried a unit, whereas the bare expression
evaluates to a unitless number (e.g. "3") and depends on implicit
Int-to-String coercion.
---
Reviewer notes (text in this section is not part of the commit message):
- The blob ids on the `index` line predate the memory-unit change;
  regenerate the patch with `git format-patch` to refresh them.
- NOTE(review): the leading whitespace of the WDL lines below was
  reconstructed (8 spaces inside `input {}`); confirm against whatshap.wdl.

 whatshap.wdl | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/whatshap.wdl b/whatshap.wdl
index 3b2bd1d3..b491f566 100644
--- a/whatshap.wdl
+++ b/whatshap.wdl
@@ -38,9 +38,6 @@ task Phase {
         String? threshold
         String? ped
 
-        String memory = "4GiB"
-        Int timeMinutes = 120
-
-        String memory = 2 + ceil(size(phaseInput, "G") / 20 )
+        String memory = "~{2 + ceil(size(phaseInput, 'G') / 20)}GiB"
         Int timeMinutes = 400 + ceil(size(phaseInput, "G") * 0.9 )
 
@@ -183,7 +180,6 @@ task Haplotag {
-        String memory = 2 + ceil(size(alignments, "G") / 50 )
+        String memory = "~{2 + ceil(size(alignments, 'G') / 50)}GiB"
         Int timeMinutes = 50 + ceil(size(alignments, "G") * 2 )
 
-        Int timeMinutes = 120
         # Whatshap 1.0, tabix 0.2.5.
         String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0"
     }