diff --git a/tests/test_data/vep_annotation_reporter/output.preferred_transcript.list.tsv b/tests/test_data/vep_annotation_reporter/output.preferred_transcript.list.tsv new file mode 100644 index 0000000..8b87dc9 --- /dev/null +++ b/tests/test_data/vep_annotation_reporter/output.preferred_transcript.list.tsv @@ -0,0 +1,77 @@ +CHROM POS REF ALT transcript_id Consequence Gene Feature +chr17 7663110 G A ENST00000413465.6 intron_variant ENSG00000141510 ENST00000413465.6 +chr17 7663126 GTATATATATAATATATATAATATAATATAA G ENST00000413465.6 intron_variant ENSG00000141510 ENST00000413465.6 +chr17 7663661 A G ENST00000269305.8 downstream_gene_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7666871 T C ENST00000269305.8 downstream_gene_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7668134 G A ENST00000269305.8 downstream_gene_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7668169 A AGCCGTG ENST00000269305.8 downstream_gene_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7674797 T C ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7675088 C T ENST00000269305.8 missense_variant,missense_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7675327 C T ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7675393 CTTT C ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7675519 A G ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7676154 G C ENST00000269305.8 missense_variant,missense_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7676325 CCCCCAGCCCTCCAGGT C ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7676483 G C ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7684369 A G ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 43047896 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43049347 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43057078 G A ENST00000471181.6 stop_gained ENSG00000012048 ENST00000471181.6 +chr17 43058379 A C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43059469 C CACAACA ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43059636 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43063808 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43064004 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43066555 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43067763 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43067787 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43071077 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43074086 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43074584 G C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43074658 A T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077743 AT A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077746 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077756 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077760 G GT ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077795 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077840 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077891 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43079204 A C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43079499 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43082453 A G ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43087455 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43087474 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43091173 T G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43091983 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43092418 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43092919 G A ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43093220 A G ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43093449 G A ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43093454 C T ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43097077 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43097346 TA T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43099629 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43104083 AAAG A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43121331 T C ENST00000471181.6 upstream_gene_variant,upstream_gene_variant,intron_variant ENSG00000198496,ENSG00000198496,ENSG00000012048 ENST00000356906.7,ENST00000467245.5,ENST00000471181.6 +chr17 43121362 G C ENST00000471181.6 upstream_gene_variant,upstream_gene_variant,intron_variant ENSG00000198496,ENSG00000198496,ENSG00000012048 ENST00000356906.7,ENST00000467245.5,ENST00000471181.6 +chr17 43127281 A G ENST00000471181.6 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant,upstream_gene_variant,upstream_gene_variant ENSG00000198496,ENSG00000198496,ENSG00000012048,ENSG00000198496 ENST00000356906.7,ENST00000467245.5,ENST00000471181.6,ENST00000587322.1 +chr17 43129737 G A ENST00000471181.6 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant,upstream_gene_variant,upstream_gene_variant ENSG00000198496,ENSG00000198496,ENSG00000012048,ENSG00000198496 ENST00000356906.7,ENST00000467245.5,ENST00000471181.6,ENST00000587322.1 +chr17 43131360 C T ENST00000587322.1 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant,downstream_gene_variant ENSG00000198496,ENSG00000198496,ENSG00000198496 ENST00000356906.7,ENST00000467245.5,ENST00000587322.1 +chr17 43131380 AAAT A ENST00000587322.1 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant,downstream_gene_variant ENSG00000198496,ENSG00000198496,ENSG00000198496 ENST00000356906.7,ENST00000467245.5,ENST00000587322.1 +chr17 43135863 A G ENST00000587322.1 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant,downstream_gene_variant ENSG00000198496,ENSG00000198496,ENSG00000198496 ENST00000356906.7,ENST00000467245.5,ENST00000587322.1 +chr17 43138596 G T ENST00000356906.7 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000198496 ENST00000356906.7,ENST00000467245.5 +chr17 43138657 G C ENST00000356906.7 splice_region_variant&non_coding_transcript_exon_variant,splice_region_variant&non_coding_transcript_exon_variant ENSG00000198496,ENSG00000198496 ENST00000356906.7,ENST00000467245.5 +chr17 43140410 C G ENST00000464237.2 downstream_gene_variant,upstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000267681,ENSG00000198496 ENST00000356906.7,ENST00000464237.2,ENST00000467245.5 +chr17 43140722 G A ENST00000464237.2 downstream_gene_variant,upstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000267681,ENSG00000198496 ENST00000356906.7,ENST00000464237.2,ENST00000467245.5 +chr17 43142914 TTG T ENST00000464237.2 downstream_gene_variant,upstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000267681,ENSG00000198496 ENST00000356906.7,ENST00000464237.2,ENST00000467245.5 +chr17 43142940 G A ENST00000464237.2 downstream_gene_variant,upstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000267681,ENSG00000198496 ENST00000356906.7,ENST00000464237.2,ENST00000467245.5 +chr17 43142964 GTGTATATATA G ENST00000464237.2 downstream_gene_variant,upstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000267681,ENSG00000198496 ENST00000356906.7,ENST00000464237.2,ENST00000467245.5 +chr17 43145635 A C ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43145975 G T ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43146482 G A ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43147572 C T ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43147590 A G ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43147814 C CCT ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43147911 G C ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43152446 T C ENST00000467245.5 intron_variant&non_coding_transcript_variant ENSG00000198496 ENST00000467245.5 +chr17 43169893 C T ENST00000341165.10 upstream_gene_variant ENSG00000188554 ENST00000341165.10 diff --git a/tests/test_data/vep_annotation_reporter/output.preferred_transcript.tsv b/tests/test_data/vep_annotation_reporter/output.preferred_transcript.tsv new file mode 100644 index 0000000..9831997 --- /dev/null +++ b/tests/test_data/vep_annotation_reporter/output.preferred_transcript.tsv @@ -0,0 +1,77 @@ +CHROM POS REF ALT transcript_id Consequence Gene Feature +chr17 7663110 G A ENST00000413465.6 intron_variant ENSG00000141510 ENST00000413465.6 +chr17 7663126 GTATATATATAATATATATAATATAATATAA G ENST00000413465.6 intron_variant ENSG00000141510 ENST00000413465.6 +chr17 7663661 A G ENST00000269305.8 downstream_gene_variant ENSG00000141510 ENST00000269305.8 +chr17 7666871 T C ENST00000269305.8 downstream_gene_variant ENSG00000141510 ENST00000269305.8 +chr17 7668134 G A ENST00000269305.8 downstream_gene_variant ENSG00000141510 ENST00000269305.8 +chr17 7668169 A AGCCGTG ENST00000269305.8 downstream_gene_variant ENSG00000141510 ENST00000269305.8 +chr17 7674797 T C ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7675088 C T ENST00000269305.8 missense_variant ENSG00000141510 ENST00000269305.8 +chr17 7675327 C T ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7675393 CTTT C ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7675519 A G ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7676154 G C ENST00000269305.8 missense_variant ENSG00000141510 ENST00000269305.8 +chr17 7676325 CCCCCAGCCCTCCAGGT C ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7676483 G C ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7684369 A G ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 43047896 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43049347 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43057078 G A ENST00000471181.6 stop_gained ENSG00000012048 ENST00000471181.6 +chr17 43058379 A C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43059469 C CACAACA ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43059636 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43063808 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43064004 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43066555 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43067763 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43067787 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43071077 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43074086 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43074584 G C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43074658 A T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077743 AT A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077746 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077756 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077760 G GT ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077795 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077840 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077891 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43079204 A C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43079499 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43082453 A G ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43087455 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43087474 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43091173 T G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43091983 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43092418 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43092919 G A ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43093220 A G ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43093449 G A ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43093454 C T ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43097077 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43097346 TA T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43099629 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43104083 AAAG A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43121331 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43121362 G C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43127281 A G ENST00000471181.6 upstream_gene_variant ENSG00000012048 ENST00000471181.6 +chr17 43129737 G A ENST00000471181.6 upstream_gene_variant ENSG00000012048 ENST00000471181.6 +chr17 43131360 C T ENST00000587322.1 downstream_gene_variant ENSG00000198496 ENST00000587322.1 +chr17 43131380 AAAT A ENST00000587322.1 downstream_gene_variant ENSG00000198496 ENST00000587322.1 +chr17 43135863 A G ENST00000587322.1 downstream_gene_variant ENSG00000198496 ENST00000587322.1 +chr17 43138596 G T ENST00000356906.7 intron_variant&non_coding_transcript_variant ENSG00000198496 ENST00000356906.7 +chr17 43138657 G C ENST00000356906.7 splice_region_variant&non_coding_transcript_exon_variant ENSG00000198496 ENST00000356906.7 +chr17 43140410 C G ENST00000464237.2 upstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43140722 G A ENST00000464237.2 upstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43142914 TTG T ENST00000464237.2 upstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43142940 G A ENST00000464237.2 upstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43142964 GTGTATATATA G ENST00000464237.2 upstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43145635 A C ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43145975 G T ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43146482 G A ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43147572 C T ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43147590 A G ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43147814 C CCT ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43147911 G C ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43152446 T C ENST00000467245.5 intron_variant&non_coding_transcript_variant ENSG00000198496 ENST00000467245.5 +chr17 43169893 C T ENST00000341165.10 upstream_gene_variant ENSG00000188554 ENST00000341165.10 diff --git a/tests/test_data/vep_annotation_reporter/preferred_transcripts.list.tsv b/tests/test_data/vep_annotation_reporter/preferred_transcripts.list.tsv new file mode 100644 index 0000000..8f83ad0 --- /dev/null +++ b/tests/test_data/vep_annotation_reporter/preferred_transcripts.list.tsv @@ -0,0 +1,9 @@ +transcript_id +ENST00000587322.1 +ENST00000471181.6 +ENST00000467245.5 +ENST00000464237.2 +ENST00000413465.6 +ENST00000356906.7 +ENST00000341165.10 +ENST00000269305.8 diff --git a/tests/test_data/vep_annotation_reporter/preferred_transcripts.pos.tsv b/tests/test_data/vep_annotation_reporter/preferred_transcripts.pos.tsv new file mode 100644 index 0000000..8956451 --- /dev/null +++ b/tests/test_data/vep_annotation_reporter/preferred_transcripts.pos.tsv @@ -0,0 +1,77 @@ +CHROM POS REF ALT transcript_id +chr17 7663110 G A ENST00000413465.6 +chr17 7663126 GTATATATATAATATATATAATATAATATAA G ENST00000413465.6 +chr17 7663661 A G ENST00000269305.8 +chr17 7666871 T C ENST00000269305.8 +chr17 7668134 G A ENST00000269305.8 +chr17 7668169 A AGCCGTG ENST00000269305.8 +chr17 7674797 T C ENST00000269305.8 +chr17 7675088 C T ENST00000269305.8 +chr17 7675327 C T ENST00000269305.8 +chr17 7675393 CTTT C ENST00000269305.8 +chr17 7675519 A G ENST00000269305.8 +chr17 7676154 G C ENST00000269305.8 +chr17 7676325 CCCCCAGCCCTCCAGGT C ENST00000269305.8 +chr17 7676483 G C ENST00000269305.8 +chr17 7684369 A G ENST00000269305.8 +chr17 43047896 T C ENST00000471181.6 +chr17 43049347 T C ENST00000471181.6 +chr17 43057078 G A ENST00000471181.6 +chr17 43058379 A C ENST00000471181.6 +chr17 43059469 C CACAACA ENST00000471181.6 +chr17 43059636 A G ENST00000471181.6 +chr17 43063808 C T ENST00000471181.6 +chr17 43064004 G A ENST00000471181.6 +chr17 43066555 T C ENST00000471181.6 +chr17 43067763 T C ENST00000471181.6 +chr17 43067787 T C ENST00000471181.6 +chr17 43071077 T C ENST00000471181.6 +chr17 43074086 C T ENST00000471181.6 +chr17 43074584 G C ENST00000471181.6 +chr17 43074658 A T ENST00000471181.6 +chr17 43077743 AT A ENST00000471181.6 +chr17 43077746 T A ENST00000471181.6 +chr17 43077756 T C ENST00000471181.6 +chr17 43077760 G GT ENST00000471181.6 +chr17 43077795 A G ENST00000471181.6 +chr17 43077840 G A ENST00000471181.6 +chr17 43077891 T A ENST00000471181.6 +chr17 43079204 A C ENST00000471181.6 +chr17 43079499 C T ENST00000471181.6 +chr17 43082453 A G ENST00000471181.6 +chr17 43087455 G A ENST00000471181.6 +chr17 43087474 T C ENST00000471181.6 +chr17 43091173 T G ENST00000471181.6 +chr17 43091983 T C ENST00000471181.6 +chr17 43092418 T C ENST00000471181.6 +chr17 43092919 G A ENST00000471181.6 +chr17 43093220 A G ENST00000471181.6 +chr17 43093449 G A ENST00000471181.6 +chr17 43093454 C T ENST00000471181.6 +chr17 43097077 A G ENST00000471181.6 +chr17 43097346 TA T ENST00000471181.6 +chr17 43099629 T A ENST00000471181.6 +chr17 43104083 AAAG A ENST00000471181.6 +chr17 43121331 T C ENST00000471181.6 +chr17 43121362 G C ENST00000471181.6 +chr17 43127281 A G ENST00000471181.6 +chr17 43129737 G A ENST00000471181.6 +chr17 43131360 C T ENST00000587322.1 +chr17 43131380 AAAT A ENST00000587322.1 +chr17 43135863 A G ENST00000587322.1 +chr17 43138596 G T ENST00000356906.7 +chr17 43138657 G C ENST00000356906.7 +chr17 43140410 C G ENST00000464237.2 +chr17 43140722 G A ENST00000464237.2 +chr17 43142914 TTG T ENST00000464237.2 +chr17 43142940 G A ENST00000464237.2 +chr17 43142964 GTGTATATATA G ENST00000464237.2 +chr17 43145635 A C ENST00000464237.2 +chr17 43145975 G T ENST00000464237.2 +chr17 43146482 G A ENST00000464237.2 +chr17 43147572 C T ENST00000464237.2 +chr17 43147590 A G ENST00000464237.2 +chr17 43147814 C CCT ENST00000464237.2 +chr17 43147911 G C ENST00000464237.2 +chr17 43152446 T C ENST00000467245.5 +chr17 43169893 C T ENST00000341165.10 diff --git a/tests/test_vep_annotation_reporter.py b/tests/test_vep_annotation_reporter.py index e6fd30e..722053f 100644 --- a/tests/test_vep_annotation_reporter.py +++ b/tests/test_vep_annotation_reporter.py @@ -88,6 +88,36 @@ def test_multiple_multiallelic_site(self): self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.multiallelic.tsv'), os.path.join(temp_path.name, 'input.tsv'))) temp_path.cleanup() + def test_preferred_transcripts(self): + temp_path = tempfile.TemporaryDirectory() + os.symlink(os.path.join(self.test_data_dir, 'input.vcf.gz'), os.path.join(temp_path.name, 'input.vcf.gz')) + command = [ + os.path.join(temp_path.name, 'input.vcf.gz'), + 'Consequence', + 'Gene', + 'Feature', + '-t', os.path.join(self.test_data_dir, 'preferred_transcripts.pos.tsv'), + '-p', os.path.join(self.test_data_dir, 'preferred_transcripts.pos.tsv'), + ] + vep_annotation_reporter.main(command) + self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.preferred_transcript.tsv'), os.path.join(temp_path.name, 'input.tsv'))) + temp_path.cleanup() + + def test_preferred_transcripts_list(self): + temp_path = tempfile.TemporaryDirectory() + os.symlink(os.path.join(self.test_data_dir, 'input.vcf.gz'), os.path.join(temp_path.name, 'input.vcf.gz')) + command = [ + os.path.join(temp_path.name, 'input.vcf.gz'), + 'Consequence', + 'Gene', + 'Feature', + '-t', os.path.join(self.test_data_dir, 'preferred_transcripts.pos.tsv'), + '-p', os.path.join(self.test_data_dir, 'preferred_transcripts.list.tsv'), + ] + vep_annotation_reporter.main(command) + self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.preferred_transcript.list.tsv'), os.path.join(temp_path.name, 'input.tsv'))) + temp_path.cleanup() + def test_no_input_tsv(self): temp_path = tempfile.TemporaryDirectory() os.symlink(os.path.join(self.test_data_dir, 'input.vcf.gz'), os.path.join(temp_path.name, 'input.vcf.gz')) diff --git a/vatools/vep_annotation_reporter.py b/vatools/vep_annotation_reporter.py index 995dada..54a5614 100644 --- a/vatools/vep_annotation_reporter.py +++ b/vatools/vep_annotation_reporter.py @@ -31,6 +31,12 @@ def define_parser(): help="A TSV report file to add VEP annotations to. Required columns are CHROM, POS, REF, ALT. " +"These are used to match each TSV entry to a VCF entry. Must be tab-delimited." ) + parser.add_argument( + "-p", "--preferred-transcripts-tsv", + help="A TSV file listing transcript annotations to prioritize. Instead of reporting all transcript annotations " + +"or the ones selected via the VEP --flag_pick option (PICK field), report only the transcripts with the Ensembl transcript IDs listed in this TSV (expected header: transcript_id). " + +"To specify a preferred transcript for each variant, include CHROM, POS, REF, and ALT columns in this file in addition to the transcript_id column." + ) parser.add_argument( "-o", "--output-tsv", help="Path to write the output report TSV file. If not provided, the output TSV will be written " @@ -52,6 +58,33 @@ def create_tsv_reader(input_filehandle): raise Exception("ERROR: Input TSV {} doesn't contain required column '{}'.".format(input_filehandle.name, field)) return tsv_reader +def parse_preferred_transcripts_tsv(preferred_transcripts_tsv): + if preferred_transcripts_tsv is None: + return None + with open(preferred_transcripts_tsv, 'r') as fh: + tsv_reader = csv.DictReader(fh, delimiter="\t") + if 'transcript_id' not in tsv_reader.fieldnames: + raise Exception("ERROR preferred transcripts TSV {} doesn't contain required column 'transcript_id'.".format(preferred_transcripts_tsv)) + if all([header in tsv_reader.fieldnames for header in ['CHROM', 'POS', 'REF', 'ALT']]): + preferred_transcripts = {} + for line in tsv_reader: + if line['CHROM'] not in preferred_transcripts: + preferred_transcripts[line['CHROM']] = {} + + if line['POS'] not in preferred_transcripts[line['CHROM']]: + preferred_transcripts[line['CHROM']][line['POS']] = {} + + if line['REF'] not in preferred_transcripts[line['CHROM']][line['POS']]: + preferred_transcripts[line['CHROM']][line['POS']][line['REF']] = {} + + preferred_transcripts[line['CHROM']][line['POS']][line['REF']][line['ALT']] = line['transcript_id'] + return preferred_transcripts + else: + preferred_transcripts = [] + for line in tsv_reader: + preferred_transcripts.append(line['transcript_id']) + return preferred_transcripts + def parse_csq_header(vcf_reader): format_pattern = re.compile('Format: (.*)') return format_pattern.search(vcf_reader.header.get_info_field_info('CSQ').description).group(1).split('|') @@ -99,25 +132,43 @@ def resolve_alleles(entry, csq_alleles): alleles[alt] = alt return alleles -def transcript_for_alt(transcripts, alt): - no_pick_value = False - for transcript in transcripts[alt]: - if 'PICK' in transcript and transcript['PICK'] == '1': - return transcript, no_pick_value +def transcript_for_alt(transcripts, alt, preferred_transcripts): + no_preferred_transcript = None + if preferred_transcripts is not None: + if type(preferred_transcripts) is list: + transcripts_to_include = [] + for transcript in transcripts[alt]: + if transcript['Feature'] in preferred_transcripts: + transcripts_to_include.append(transcript) + else: + transcripts_to_include = [] + for transcript in transcripts[alt]: + if transcript['Feature'] == preferred_transcripts: + transcripts_to_include.append(transcript) + if len(transcripts_to_include) > 0: + return merge_transcripts(transcripts_to_include), None, False + else: + no_preferred_transcript = True if 'PICK' in transcripts[alt][0]: - no_pick_value = True + for transcript in transcripts[alt]: + if 'PICK' in transcript and transcript['PICK'] == '1': + return transcript, False, no_preferred_transcript + return merge_transcripts(transcripts[alt]), True, no_preferred_transcript + else: + return merge_transcripts(transcripts[alt]), None, no_preferred_transcript +def merge_transcripts(transcripts_to_include): merged_transcripts = {} - for key in transcripts[alt][0].keys(): - merged_transcripts[key] = ",".join([transcript[key] for transcript in transcripts[alt]]) - return merged_transcripts, no_pick_value + for key in transcripts_to_include[0].keys(): + merged_transcripts[key] = ",".join([transcript[key] for transcript in transcripts_to_include]) + return merged_transcripts def decode_hex(match_string): hex_string = match_string.group(0).replace('%', '') return binascii.unhexlify(hex_string).decode('utf-8') -def extract_vep_fields(args): +def extract_vep_fields(args, preferred_transcripts): vcf_reader = create_vcf_reader(args) csq_fields = parse_csq_header(vcf_reader) vep = {} @@ -148,7 +199,13 @@ def extract_vep_fields(args): alt = alt.serialize() if alt not in vep[chr][pos][ref]: if alleles_dict[alt] in transcripts: - values, no_pick_value = transcript_for_alt(transcripts, alleles_dict[alt]) + if type(preferred_transcripts) is dict: + p = preferred_transcripts.get(chr, {}).get(pos, {}).get(ref, {}).get(alt, {}) + else: + p = preferred_transcripts + values, no_pick_value, no_preferred_transcript = transcript_for_alt(transcripts, alleles_dict[alt], p) + if no_preferred_transcript: + logging.warning("Preferred transcripts TSV provided but no matching transcript found for variant {} {} {} {}. Writing value for PICK'ed transcript.".format(chr, pos, ref, alt)) if no_pick_value: logging.warning("VCF is annotated with the PICK flag but no PICK'ed transcript found for variant {} {} {} {}. Writing values for all transcripts.".format(chr, pos, ref, alt)) vep[chr][pos][ref][alt] = values @@ -177,7 +234,8 @@ def main(args_input = sys.argv[1:]): parser = define_parser() args = parser.parse_args(args_input) - vep = extract_vep_fields(args) + preferred_transcripts = parse_preferred_transcripts_tsv(args.preferred_transcripts_tsv) + vep = extract_vep_fields(args, preferred_transcripts) if args.output_tsv: output_file = args.output_tsv