combine outputs (Answered)

Post author
Sehyun Oh

Hi! I'm trying to combine the outputs from the previous task in my workflow, but it keeps failing. I tried scatter-gather, and I also tried adding an extra task (CreateFoFN, https://gatkforums.broadinstitute.org/wdl/discussion/13737/make-a-list-file-outputs-list-from-the-output-of-previous-task) to make a list of the output files' paths - neither worked. The 'list' always ends up containing only one output. I copied both versions of the WDL below: scatter-gather only, and with CreateFoFN. Could someone help me troubleshoot this? Thanks!

workflow M1_PON {
    # inputs
    Array[File] normal_bams
    Array[File] normal_bais
    File ref_fasta
    File ref_fai
    File ref_dict
    String mutect1_docker
    String gatk_docker

    Array[Pair[File,File]] normal_bam_pairs = zip(normal_bams, normal_bais)

    scatter (normal_bam_pair in normal_bam_pairs) {
        File normal_bam = normal_bam_pair.left
        File normal_bai = normal_bam_pair.right

        call M1 {
            input:
                normal_bam = normal_bam,
                normal_bai = normal_bai,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                mutect1_docker = mutect1_docker
        }

        call SelectVariants {
            input:
                input_vcf = M1.output_pon_vcf,
                input_vcf_idx = M1.output_pon_vcf_index,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                gatk_docker = gatk_docker,
                VCF_pre = M1.fname
        }
    }

    call CombineVariants {
        input:
            filtered_vcfs_list = SelectVariants.filtered_vcf,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            gatk_docker = gatk_docker
    }

    output {
        Array[File] output_pon_vcf = M1.output_pon_vcf
        Array[File] output_pon_vcf_index = M1.output_pon_vcf_index
        Array[File] output_pon_stats_txt = M1.output_pon_stats_txt
        Array[File] filtered_vcf = SelectVariants.filtered_vcf
        File pon = CombineVariants.pon
    }

    meta {
        author: "Sehyun Oh"
        email: "shbrief@gmail.com"
        description: "Build Pool of Normal (PoN) using MuTect v.1.1.7 for PureCN"
    }
}

task M1 {
    # input
    File normal_bam
    File normal_bai
    File dbsnp_vcf
    File dbsnp_vcf_idx
    File cosmicVCF
    File cosmicVCF_idx
    File ref_fasta
    File ref_fai
    File ref_dict

    String BAM_pre = basename(normal_bam, ".bam")

    # runtime
    String mutect1_docker
    Int disk_size = ceil(size(normal_bam, "GB")) + 30

    command <<<
        java -jar -Xmx4g /home/mutect-1.1.7.jar \
        --analysis_type MuTect \
        -R ${ref_fasta} \
        --artifact_detection_mode \
        --dbsnp ${dbsnp_vcf} \
        --cosmic ${cosmicVCF} \
        -dt None \
        -I:tumor ${normal_bam} \
        -o ${BAM_pre}_pon_stats.txt \
        -vcf ${BAM_pre}_pon.vcf
    >>>

    runtime {
        docker: mutect1_docker # jvivian/mutect
        memory: "32 GB"
        disks: "local-disk " + disk_size + " HDD"
    }

    output {
        String fname = "${BAM_pre}"
        File output_pon_vcf = "${BAM_pre}_pon.vcf"
        File output_pon_vcf_index = "${BAM_pre}_pon.vcf.idx"
        File output_pon_stats_txt = "${BAM_pre}_pon_stats.txt"
    }
}

task SelectVariants {
    # input
    File input_vcf
    File input_vcf_idx
    File ref_fasta
    File ref_fai
    File ref_dict
    String VCF_pre

    # runtime
    String gatk_docker

    command <<<
        java -jar -Xmx4g /usr/GenomeAnalysisTK.jar \
        --analysis_type SelectVariants \
        -R ${ref_fasta} \
        -V ${input_vcf} \
        -o ${VCF_pre}_pon.vcf
    >>>

    runtime {
        docker: gatk_docker
        memory: "8 GB"
    }

    output {
        File filtered_vcf = "${VCF_pre}_pon.vcf"
    }
}

task CombineVariants {
    # input
    Array[File] filtered_vcfs_list
    File ref_fasta
    File ref_fai
    File ref_dict

    # runtime
    String gatk_docker

    command <<<
        java -jar -Xmx24g /usr/GenomeAnalysisTK.jar \
        -T CombineVariants \
        -nt 4 --minimumN 5 --genotypemergeoption UNSORTED \
        -R ${ref_fasta} \
        -V ${sep=' -V ' filtered_vcfs_list} \
        -o "normals.merged.min5.vcf"
    >>>

    runtime {
        docker: gatk_docker
        memory: "32 GB"
    }

    output {
        File pon = "normals.merged.min5.vcf"
    }
}
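(For reference, the implicit gather I'm relying on in the version above boils down to the minimal pattern below - task names are made up, not from my workflow. My understanding is that referencing MakeFile.out outside the scatter block yields an Array[File] with one element per shard:)

workflow GatherDemo {
    Array[String] names

    scatter (name in names) {
        call MakeFile { input: fname = name }
    }

    # MakeFile.out, referenced outside the scatter, is implicitly Array[File]
    call MergeFiles { input: files = MakeFile.out }

    output {
        File merged = MergeFiles.merged
    }
}

task MakeFile {
    String fname

    command <<<
        echo ${fname} > ${fname}.txt
    >>>

    runtime {
        docker: "ubuntu:latest"
    }

    output {
        File out = "${fname}.txt"
    }
}

task MergeFiles {
    Array[File] files

    command <<<
        cat ${sep=' ' files} > merged.txt
    >>>

    runtime {
        docker: "ubuntu:latest"
    }

    output {
        File merged = "merged.txt"
    }
}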

 

With CreateFoFN

workflow M1_PON {
    # inputs
    Array[File] normal_bams
    Array[File] normal_bais
    File ref_fasta
    File ref_fai
    File ref_dict
    String mutect1_docker
    String gatk_docker

    Array[Pair[File,File]] normal_bam_pairs = zip(normal_bams, normal_bais)

    scatter (normal_bam_pair in normal_bam_pairs) {
        File normal_bam = normal_bam_pair.left
        File normal_bai = normal_bam_pair.right

        call M1 {
            input:
                normal_bam = normal_bam,
                normal_bai = normal_bai,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                mutect1_docker = mutect1_docker
        }

        call SelectVariants {
            input:
                input_vcf = M1.output_pon_vcf,
                input_vcf_idx = M1.output_pon_vcf_index,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                gatk_docker = gatk_docker,
                VCF_pre = M1.fname
        }
    }

    call CreateFoFN {
        input:
            array_of_files = SelectVariants.filtered_vcf
    }

    call CombineVariants {
        input:
            filtered_vcfs_list = CreateFoFN.fofn_list,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            gatk_docker = gatk_docker
    }

    output {
        Array[File] output_pon_vcf = M1.output_pon_vcf
        Array[File] output_pon_vcf_index = M1.output_pon_vcf_index
        Array[File] output_pon_stats_txt = M1.output_pon_stats_txt
        Array[File] filtered_vcf = SelectVariants.filtered_vcf
        File filtered_vcfs_list = CreateFoFN.fofn_list
        File pon = CombineVariants.pon
    }

    meta {
        author: "Sehyun Oh"
        email: "shbrief@gmail.com"
        description: "Build Pool of Normal (PoN) using MuTect v.1.1.7 for PureCN"
    }
}

task M1 {
    # input
    File normal_bam
    File normal_bai
    File dbsnp_vcf
    File dbsnp_vcf_idx
    File cosmicVCF
    File cosmicVCF_idx
    File ref_fasta
    File ref_fai
    File ref_dict

    String BAM_pre = basename(normal_bam, ".bam")

    # runtime
    String mutect1_docker
    Int disk_size = ceil(size(normal_bam, "GB")) + 30

    command <<<
        java -jar -Xmx4g /home/mutect-1.1.7.jar \
        --analysis_type MuTect \
        -R ${ref_fasta} \
        --artifact_detection_mode \
        --dbsnp ${dbsnp_vcf} \
        --cosmic ${cosmicVCF} \
        -dt None \
        -I:tumor ${normal_bam} \
        -o ${BAM_pre}_pon_stats.txt \
        -vcf ${BAM_pre}_pon.vcf
    >>>

    runtime {
        docker: mutect1_docker # jvivian/mutect
        memory: "32 GB"
        disks: "local-disk " + disk_size + " HDD"
    }

    output {
        String fname = "${BAM_pre}"
        File output_pon_vcf = "${BAM_pre}_pon.vcf"
        File output_pon_vcf_index = "${BAM_pre}_pon.vcf.idx"
        File output_pon_stats_txt = "${BAM_pre}_pon_stats.txt"
    }
}

task SelectVariants {
    # input
    File input_vcf
    File input_vcf_idx
    File ref_fasta
    File ref_fai
    File ref_dict
    String VCF_pre

    # runtime
    String gatk_docker

    command <<<
        java -jar -Xmx4g /usr/GenomeAnalysisTK.jar \
        --analysis_type SelectVariants \
        -R ${ref_fasta} \
        -V ${input_vcf} \
        -o ${VCF_pre}_pon.vcf
    >>>

    runtime {
        docker: gatk_docker
        memory: "8 GB"
    }

    output {
        File filtered_vcf = "${VCF_pre}_pon.vcf"
    }
}

task CreateFoFN {
    # Command parameters
    Array[String] array_of_files
    String fofn_name

    command <<<
        mv ${write_lines(array_of_files)} ${fofn_name}.list
    >>>

    output {
        File fofn_list = "${fofn_name}.list"
    }

    runtime {
        docker: "ubuntu:latest"
    }
}

task CombineVariants {
    # input
    File filtered_vcfs_list
    Array[File] input_vcfs = read_lines(filtered_vcfs_list)
    File ref_fasta
    File ref_fai
    File ref_dict

    # runtime
    String gatk_docker

    command <<<
        java -jar -Xmx24g /usr/GenomeAnalysisTK.jar \
        -T CombineVariants \
        -nt 4 --minimumN 5 --genotypemergeoption UNSORTED \
        -R ${ref_fasta} \
        -V ${sep=' -V ' input_vcfs} \
        -o "normals.merged.min5.vcf"
    >>>

    runtime {
        docker: gatk_docker
        memory: "32 GB"
    }

    output {
        File pon = "normals.merged.min5.vcf"
    }
}
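(My understanding of the FoFN mechanics from the linked post, in case it matters for debugging: write_lines() serializes the gathered array into a text file with one path per line, and read_lines() on the other end turns that file back into an array - read_lines() returns Array[String], and declaring the result as Array[File] coerces each line to a File. A stripped-down sketch of just that round trip, with made-up task names:)

task WriteFofn {
    Array[String] paths

    command <<<
        # write_lines() dumps the array to a temp file, one element per line;
        # mv gives it a stable name so it can be declared as an output
        mv ${write_lines(paths)} files.list
    >>>

    runtime {
        docker: "ubuntu:latest"
    }

    output {
        File fofn = "files.list"
    }
}

task ReadFofn {
    File fofn

    # read_lines() returns Array[String]; the Array[File] declaration
    # coerces each line back into a localizable File
    Array[File] files = read_lines(fofn)

    command <<<
        cat ${sep=' ' files}
    >>>

    runtime {
        docker: "ubuntu:latest"
    }
}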

Comments

13 comments

  • Comment author
    Beri

    Hi Sehyun Oh

    What task is failing and what is the error message? It would be helpful if you could attach the stdout and stderr log files. 

     

  • Comment author
    Sehyun Oh

    Hi Beri,

There is no error - everything runs 'successfully'. But the outputs from SelectVariants are never used as a list of input files (= multiple -V arguments) for CombineVariants. So each sample gives its own output from CombineVariants... without combining... ;(
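Concretely, the rendered CombineVariants command ends up with a single -V argument when I expect one per sample - something like this (file names made up):

    what I get:    ... -T CombineVariants -V sampleA_pon.vcf -o normals.merged.min5.vcf
    what I expect: ... -T CombineVariants -V sampleA_pon.vcf -V sampleB_pon.vcf -V sampleC_pon.vcf -o normals.merged.min5.vcf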

    Let me know if you need any other information to understand/solve this issue. 

    - Sehyun

  • Comment author
    Beri

That's odd, your WDL looks fine at first glance. Mind sharing the workspace you're running this workflow in with GROUP_FireCloud-Support@firecloud.org so that we can view the standard log files?

  • Comment author
    Sehyun Oh

    Ok. I just added the email as a reader of my workspace (TOSC19-CNVWorkflow_master). Below are the submission ids for jobs with or without CreateFoFN.

    Tool with CreateFoFN >>> submission-id: 4ffed12b-3186-4516-b1c4-cd96baba90cf (paused)

    Tool without CreateFoFN >>> submission-id: f4f096fb-7b57-4671-af75-29387e14f05b

     

  • Comment author
    Beri

Let's try running the workflow without "CreateFoFN" again, but instead of running the workflow on the Sample, select Sample_set from the drop-down menu in the Tool configuration.

  • Comment author
    Sehyun Oh

    Ok. I just submitted the job with a sample_set of 3 samples, without CreateFoFN.

  • Comment author
    Beri

After setting the Tool config to sample_set, you'll still need to click on "Select Data" and select the sample set you would like to process. Try `MuTect1_PON_2019-06-26T03-05-56`.

  • Comment author
    Sehyun Oh
    • Edited

I clicked the 'Select Data' button and selected a set with three samples (MuTect1_PON_2019-06-18T01-13-55), which failed (submission ID: 4ffa7acc-b649-4268-bed6-407df436a). I started a job again with the `MuTect1_PON_2019-06-26T03-05-56` dataset. Btw, since last night my job requests have repeatedly gotten stuck at the 'submitted' step... fyi.

  • Comment author
    Beri

Thanks for the info, I've notified the dev team and they're working toward resolving the job manager issue.

For your Tool configuration you'll need to edit the input attributes for "normal_bams" and "normal_bais" so that they work with sample_sets. This means adding "samples" to your argument before the column name. Use the following parameters:

    normal_bams : this.samples.synthExomeBam

    normal_bais : this.samples.synthExomeBamIndex

  • Comment author
    Beri

Looks like your workflow completed, with a single PoN VCF. Are you all set?

  • Comment author
    Sehyun Oh

    Yeah! Thanks a lot for your help, Beri! 

Just one more question: how can I explain "why I should use 'sample_set' instead of 'sample'" for CombineVariants? Why is scatter not enough?

  • Comment author
    Beri

It's not related to the CombineVariants task but to your workflow as a whole. Your workflow expects an array of files as input, but when you select Sample as the "root entity" you're telling Terra to pass one input file at a time to your workflow. So your workflow was given 1 sample bam to process. You were also selecting more than 1 sample while your root entity was set to Sample mode, which tells Terra to run this workflow once for each of the samples you've selected.

On the other hand, if you want to run a workflow once and pass it an array of sample files (e.g. BAM files), then you would choose sample_set as your root entity.
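To make it concrete, here is roughly what your workflow receives in each mode (paths are made up):

    # Root entity = sample, 3 samples selected:
    #   Terra launches 3 separate workflows, each with something like
    #   M1_PON.normal_bams = ["gs://bucket/sampleA.bam"]
    #   so each scatter runs over a single BAM and CombineVariants merges one VCF.

    # Root entity = sample_set, with normal_bams = this.samples.synthExomeBam:
    #   Terra launches 1 workflow with
    #   M1_PON.normal_bams = ["gs://bucket/sampleA.bam",
    #                         "gs://bucket/sampleB.bam",
    #                         "gs://bucket/sampleC.bam"]
    #   so the gathered array handed to CombineVariants holds all three VCFs.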

      

  • Comment author
    Sehyun Oh

    Got it. Thanks!!

