diff --git a/CHANGELOG.md b/CHANGELOG.md index c56b124a..ff7a99f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,11 @@ that users understand how the changes affect the new version. version 6.0.0-dev --------------------------- ++ Added tasks for GEVA: + + ConvertToBin + + Geva + + EstimateAge ++ Updated bcftools view task with an input to provide a samples file. + Add a task for bcftools norm. + Add support for outputting compressed files to snpeff and snpsift. + Fixed an issue with the parameter_meta section of bcftools annotate diff --git a/bcftools.wdl b/bcftools.wdl index 31c7db13..acd74cdc 100644 --- a/bcftools.wdl +++ b/bcftools.wdl @@ -58,6 +58,7 @@ task Annotate { set -e mkdir -p "$(dirname ~{outputPath})" bcftools annotate \ + --threads ~{threads} \ -o ~{outputPath} \ -O ~{true="z" false="v" compressed} \ ~{"--annotations " + annsFile} \ @@ -89,6 +90,7 @@ task Annotate { } runtime { + cpu: threads + 1 memory: memory time_minutes: timeMinutes docker: dockerImage @@ -118,7 +120,7 @@ task Annotate { regionsFile: {description: "Restrict to regions listed in a file.", category: "advanced"} renameChrs: {description: "rename chromosomes according to the map in file (see man page for details).", category: "advanced"} samplesFile: {description: "File of samples to include.", category: "advanced"} - threads: {description: "Number of extra decompression threads [0].", category: "advanced"} + threads: {description: "Number of extra compression threads.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} @@ -129,6 +131,66 @@ task Annotate { } } +task Concat { + input { + Array[File]+ vcfFiles + Array[File]+ vcfIndexes + String outputPath + Boolean naive = false + + Int threads = 0 + String memory = "4GiB" + Int timeMinutes = 10 + ceil(size(vcfFiles, "G")) + Int diskGb = 1 + ceil(2.1 * size(vcfFiles, "G")) # Never request 0; mirrors Norm and View. + String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + ls ~{sep=" " vcfFiles} ~{sep=" " vcfIndexes} # dxCompiler localization workaround + + bcftools \ + concat \ + --threads ~{threads} \ + -O z \ + -o ~{outputPath} \ + ~{if naive then "--naive" else ""} \ + ~{sep=" " vcfFiles} + bcftools index --tbi ~{outputPath} + } + + output { + File outputVcf = outputPath + File outputVcfIndex = outputPath + ".tbi" + } + + runtime { + cpu: threads + 1 + memory: memory + time_minutes: timeMinutes + disks: "local-disk ~{diskGb} SSD" # Based on an example in dxCompiler docs + docker: dockerImage + } + + parameter_meta { + # inputs + vcfFiles: {description: "A list of vcf files.", category: "required"} + vcfIndexes: {description: "The indexes for the input VCF files.", category: "common"} + outputPath: {description: "The location the output VCF file should be written.", category: "common"} + naive: {description: "Equivalent to bcftools concat's `--naive` flag.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + threads: {description: "Number of extra compression threads.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + diskGb: {description: "The amount of disk space needed for this job in GiB.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputVcf: {description: "VCF file."} + outputVcfIndex: {description: "Index of VCF file."} + } +} + task Filter { input { File vcf @@ -192,7 +254,7 @@ task Norm { String memory = "4GiB" Int timeMinutes = 1 + ceil(size(inputFile, "G")) - Int diskGb = ceil(2.1 * size(inputFile, "G") + size(fasta, "G")) + Int diskGb = 1 + ceil(2.1 * size(inputFile, "G") + size(fasta, "G")) String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2" } @@ -396,7 +458,7 @@ task Stats { targets: {description: "Similar to regions but streams rather than index-jumps.", category: "advanced"} targetsFile: {description: "Similar to regionsFile but streams rather than index-jumps.", category: "advanced"} userTsTv: {description: ". Collect Ts/Tv stats for any tag using the given binning [0:1:100].", category: "advanced"} - threads: {description: "Number of extra decompression threads [0].", category: "advanced"} + threads: {description: "Number of extra compression threads.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} @@ -416,10 +478,14 @@ task View { String? exclude String? include String? region + File? regionsFile Array[String] samples = [] + File? 
samplesFile String memory = "256MiB" Int timeMinutes = 1 + ceil(size(inputFile, "G")) + Int threads = 0 + Int diskGb = 1 + ceil(2.1 * size(inputFile, "G")) String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2" } @@ -431,10 +497,13 @@ task View { mkdir -p "$(dirname ~{outputPath})" bcftools view \ + --threads ~{threads} \ ~{"--exclude " + exclude} \ ~{"--include " + include} \ ~{true="--exclude-uncalled" false="" excludeUncalled} \ ~{if length(samples) > 0 then "-s" else ""} ~{sep="," samples} \ + ~{"--samples-file " + samplesFile} \ + ~{"--regions-file " + regionsFile} \ -o ~{outputPath} \ -O ~{true="z" false="v" compressed} \ ~{inputFile} \ @@ -449,8 +518,10 @@ task View { } runtime { + cpu: threads + 1 memory: memory time_minutes: timeMinutes + disks: "local-disk ~{diskGb} SSD" # Based on an example in dxCompiler docs docker: dockerImage } @@ -462,10 +533,14 @@ task View { include: {description: "Select sites for which the expression is true (see man page for details).", category: "advanced"} exclude: {description: "Exclude sites for which the expression is true (see man page for details).", category: "advanced"} region: {description: "The region to retrieve from the VCF file.", category: "common"} + regionsFile: {description: "File of regions to include.", category: "advanced"} excludeUncalled: {description: "Exclude sites without a called genotype (see man page for details).", category: "advanced"} samples: {description: "A list of sample names to include.", category: "advanced"} + samplesFile: {description: "File of samples to include.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} + threads: {description: "Number of extra compression threads.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + diskGb: {description: "The amount of disk space needed for this job in GiB.", category: "advanced"} 
dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs diff --git a/chunked-scatter.wdl b/chunked-scatter.wdl index af24b139..6b1ac648 100644 --- a/chunked-scatter.wdl +++ b/chunked-scatter.wdl @@ -59,7 +59,7 @@ task ChunkedScatter { parameter_meta { # inputs - inputFile: {description: "Either a bed file describing regiosn of intrest or a sequence dictionary.", category: "required"} + inputFile: {description: "Either a bed file describing regions of interest or a sequence dictionary.", category: "required"} prefix: {description: "The prefix for the output files.", category: "advanced"} splitContigs: {description: "If set, contigs are allowed to be split up over multiple files.", category: "advanced"} chunkSize: {description: "Equivalent to chunked-scatter's `-c` option.", category: "advanced"} diff --git a/geva.wdl b/geva.wdl new file mode 100644 index 00000000..1e2b634e --- /dev/null +++ b/geva.wdl @@ -0,0 +1,147 @@ +version 1.0 + +task ConvertToBin { + input { + File inputVcf + String prefix = "./geva.convert" + + String memory = "4GiB" + Int timeMinutes = 30 + # Disk usage should be about 1 byte, per variant per sample, but this is hard to estimate from compressed file sizes. + # The output file is written to disk in temporary chunks, which then get concatenated so we actually need twice that. 
+ Int diskGb = 400 + String dockerImage = "quay.io/davycats/pkalbers-geva:5363c3db11c6b2ea2e24528affb6b68b0a939df4" + } + + command { + geva_v1beta \ + --vcf ~{inputVcf} \ + --out ~{prefix} + } + + output { + File bin = "~{prefix}.bin" + File sample = "~{prefix}.sample.txt" + File marker = "~{prefix}.marker.txt" + File log = "~{prefix}.log" + } + + runtime { + memory: memory + time_minutes: timeMinutes + disks: "local-disk ~{diskGb} SSD" # Based on an example in dxCompiler docs + docker: dockerImage + } + + parameter_meta { + # inputs + inputVcf: {description: "A VCF file (containing a single chromosome) to be converted into GEVA's binary format.", category: "required"} + prefix: {description: "Prefix (including path) for the output files.", category: "common"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + diskGb: {description: "The amount of disk space needed for this job in GiB.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + bin: {description: "GEVA's binary representation of the VCF file."} + sample: {description: "Sample ID mapping and metadata."} + marker: {description: "Variant ID mapping and metadata."} + log: {description: "GEVA's log."} + } +} + +task EstimateAge { + input { + File pairs + Int effectivePopulation = 10000 + String outputPath = "./geva.estimate.txt" + + String memory = "4GiB" + Int timeMinutes = 30 + Int diskGb = 1 + ceil(2 * size(pairs, "G")) + String dockerImage = "quay.io/davycats/pkalbers-geva:5363c3db11c6b2ea2e24528affb6b68b0a939df4" + } + + command { + set -e + mkdir ./geva_estimate_tmp + cp ~{pairs} ./geva_estimate_tmp/tmp.pairs.txt + Rscript /share/geva/estimate.R ./geva_estimate_tmp/tmp.pairs.txt ~{effectivePopulation} + cp ./geva_estimate_tmp/tmp.sites2.txt ~{outputPath} + } + + output { + File estimates = outputPath + } + + runtime { + memory: memory + time_minutes: timeMinutes + disks: "local-disk ~{diskGb} SSD" # Based on an example in dxCompiler docs + docker: dockerImage + } + + parameter_meta { + # inputs + pairs: {description: "The pairs output file from GEVA.", category: "required"} + effectivePopulation: {description: "Effective population size.", category: "advanced"} + outputPath: {description: "Path for the output file.", category: "common"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + diskGb: {description: "The amount of disk space needed for this job in GiB.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + estimates: {description: "Age estimations."} + } +} + +task Geva { + input { + File bin + Int position + String prefix = "./geva.~{position}" + + String memory = "750GiB" # According to GEVA's README this is dependent on AF, so hard to figure out on the fly. + Int timeMinutes = 120 + Int diskGb = 1 + ceil(2 * size(bin, "G")) + String dockerImage = "quay.io/davycats/pkalbers-geva:5363c3db11c6b2ea2e24528affb6b68b0a939df4" + } + + command { + geva_v1beta \ + -i '~{bin}' \ + -o '~{prefix}' \ + --position '~{position}' \ + --hmm /share/geva/hmm/hmm_initial_probs.txt /share/geva/hmm/hmm_emission_probs.txt + } + + output { + File pairs = "~{prefix}.pairs.txt" + File sites = "~{prefix}.sites.txt" + File log = "~{prefix}.log" + } + + runtime { + memory: memory + time_minutes: timeMinutes + disks: "local-disk ~{diskGb} SSD" # Based on an example in dxCompiler docs + docker: dockerImage + } + + parameter_meta { + # inputs + bin: {description: "The input bin file.", category: "required"} + position: {description: "The position to estimate the age for.", category: "required"} + prefix: {description: "Prefix (including path) for the output files.", category: "common"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + diskGb: {description: "The amount of disk space needed for this job in GiB.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + pairs: {description: "Pairwise analysis results."} + sites: {description: "Age estimations."} + log: {description: "GEVA's log."} + } +}