Mercurial > repos > dfornika > ivar_variants_to_vcf
changeset 0:c87f6ad32fd8 draft default tip
"planemo upload for repository https://github.com/dfornika/galaxytools/tree/master/tools/ivar_variants_to_vcf commit 16332019b4aab6af58c74e631f390dfeef23a3dc"
author | dfornika |
---|---|
date | Fri, 05 Jun 2020 05:10:05 +0000 |
parents | |
children | |
files | ivar_variants_to_vcf.py ivar_variants_to_vcf.xml test-data/SAMEA6808195-variants.tsv test-data/SAMEA6808195-variants.vcf |
diffstat | 4 files changed, 176 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ivar_variants_to_vcf.py Fri Jun 05 05:10:05 2020 +0000 @@ -0,0 +1,103 @@ +#!/usr/bin/env python +import os +import sys +import re +import errno +import argparse + +def parse_args(args=None): + Description = 'Convert iVar variants tsv file to vcf format.' + Epilog = """Example usage: python ivar_variants_to_vcf.py <FILE_IN> <FILE_OUT>""" + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument('FILE_IN', help="Input tsv file.") + parser.add_argument('FILE_OUT', help="Full path to output vcf file.") + parser.add_argument('-po', '--pass_only', dest="PASS_ONLY", help="Only output variants that PASS all filters.",action='store_true') + parser.add_argument('-ma', '--min_allele_freq', type=float, dest="MIN_ALLELE_FREQ", default=0, help="Only output variants where allele frequency greater than this number (default: 0).") + + return parser.parse_args(args) + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + +def ivar_variants_to_vcf(FileIn,FileOut,passOnly=False,minAF=0): + filename = os.path.splitext(FileIn)[0] + header = ('##fileformat=VCFv4.2\n' + '##source=iVar\n' + '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n' + '##FILTER=<ID=PASS,Description="Result of p-value <= 0.05">\n' + '##FILTER=<ID=FAIL,Description="Result of p-value > 0.05">\n' + '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n' + '##FORMAT=<ID=REF_DP,Number=1,Type=Integer,Description="Depth of reference base">\n' + '##FORMAT=<ID=REF_RV,Number=1,Type=Integer,Description="Depth of reference base on reverse reads">\n' + '##FORMAT=<ID=REF_QUAL,Number=1,Type=Integer,Description="Mean quality of reference base">\n' + '##FORMAT=<ID=ALT_DP,Number=1,Type=Integer,Description="Depth of alternate base">\n' + '##FORMAT=<ID=ALT_RV,Number=1,Type=Integer,Description="Deapth of alternate base on reverse reads">\n' + '##FORMAT=<ID=ALT_QUAL,Number=1,Type=String,Description="Mean quality of alternate base">\n' + '##FORMAT=<ID=ALT_FREQ,Number=1,Type=String,Description="Frequency of alternate base">\n') + header += '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+filename+'\n' + + varList = [] + varCountDict = {'SNP':0, 'INS':0, 'DEL':0} + OutDir = os.path.dirname(FileOut) + make_dir(OutDir) + fout = open(FileOut,'w') + fout.write(header) + with open(FileIn) as f: + for line in f: + if not re.match("REGION",line): + line = re.split("\t", line) + CHROM=line[0] + POS=line[1] + ID='.' + REF=line[2] + ALT=line[3] + var_type = 'SNP' + if ALT[0] == '+': + ALT = REF + ALT[1:] + var_type = 'INS' + elif ALT[0] == '-': + REF += ALT[1:] + ALT = line[2] + var_type = 'DEL' + QUAL='.' + pass_test=line[13] + if pass_test == 'TRUE': + FILTER='PASS' + else: + FILTER='FAIL' + INFO='DP='+line[11] + FORMAT='GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ' + SAMPLE='1:'+line[4]+':'+line[5]+':'+line[6]+':'+line[7]+':'+line[8]+':'+line[9]+':'+line[10] + oline = CHROM+'\t'+POS+'\t'+ID+'\t'+REF+'\t'+ALT+'\t'+QUAL+'\t'+FILTER+'\t'+INFO+'\t'+FORMAT+'\t'+SAMPLE+'\n' + writeLine = True + if passOnly and FILTER != 'PASS': + writeLine = False + if float(line[10]) < minAF: + writeLine = False + if (CHROM,POS,REF,ALT) in varList: + writeLine = False + else: + varList.append((CHROM,POS,REF,ALT)) + if writeLine: + varCountDict[var_type] += 1 + fout.write(oline) + fout.close() + + ## Print variant counts to pass to MultiQC + varCountList = [(k, str(v)) for k, v in sorted(varCountDict.items())] + print('\t'.join(['sample'] + [x[0] for x in varCountList])) + print('\t'.join([filename] + [x[1] for x in varCountList])) + +def main(args=None): + args = parse_args(args) + ivar_variants_to_vcf(args.FILE_IN,args.FILE_OUT,args.PASS_ONLY,args.MIN_ALLELE_FREQ) + + +if __name__ == '__main__': + sys.exit(main())
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ivar_variants_to_vcf.xml Fri Jun 05 05:10:05 2020 +0000 @@ -0,0 +1,29 @@ +<tool id="ivar_variants_to_vcf" name="iVar Variants to VCF" version="0.1.0"> + <description>Convert iVar tabular variant output to .vcf format</description> + <requirements /> + <command detect_errors="exit_code"><![CDATA[ + '$__tool_directory__/ivar_variants_to_vcf.py' + -ma '${min_allele_freq}' + ${input} + ${output} + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" /> + <param name="min_allele_freq" type="float" min="0.0" value="0.0" max="1.0" /> + </inputs> + <outputs> + <data name="output" label="Variants (VCF)" format="vcf" /> + </outputs> + <tests> + <test> + <param name="input" value="SAMEA6808195-variants.tsv" /> + <param name="min_allele_freq" value="0.5" /> + <output name="output" file="SAMEA6808195-variants.vcf" lines_diff="2"/> + </test> + </tests> + <help><![CDATA[ + ]]></help> + <citations> + <citation type="doi">10.5281/zenodo.3872730</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/SAMEA6808195-variants.tsv Fri Jun 05 05:10:05 2020 +0000 @@ -0,0 +1,21 @@ +REGION POS REF ALT REF_DP REF_RV REF_QUAL ALT_DP ALT_RV ALT_QUAL ALT_FREQ TOTAL_DP PVAL PASS GFF_FEATURE REF_CODON REF_AA ALT_CODON ALT_AA +MN908947.3 30 A T 23 10 38 30 0 33 0.566038 53 1.95736e-12 TRUE NA NA NA NA NA +MN908947.3 228 C T 3 0 47 3929 107 73 0.999237 3932 0 TRUE NA NA NA NA NA +MN908947.3 241 C T 0 0 0 3943 127 72 1 3943 0 TRUE NA NA NA NA NA +MN908947.3 3037 C T 0 0 0 1713 100 74 1 1713 0 TRUE NA NA NA NA NA +MN908947.3 6696 C +T 4306 119 75 266 0 20 0.0313901 8474 1.36673e-56 TRUE NA NA NA NA NA +MN908947.3 6872 G A 4567 4278 35 386 17 74 0.0779011 4955 1.45993e-131 TRUE NA NA NA NA NA +MN908947.3 14408 C T 2 0 53 6478 209 75 0.999691 6480 0 TRUE NA NA NA NA NA +MN908947.3 15168 G A 1792 1774 25 96 96 20 0.0508475 1888 1.18961e-14 TRUE NA NA NA NA NA +MN908947.3 17126 T C 3 0 39 11851 4693 38 0.999747 11854 0 TRUE NA NA NA NA NA +MN908947.3 19298 A T 22 5 37 1 1 34 0.0434783 23 0.5 FALSE NA NA NA NA NA +MN908947.3 19305 T C 18 2 38 1 0 20 0.0526316 19 0.513514 FALSE NA NA NA NA NA +MN908947.3 19345 T C 19 3 38 1 0 37 0.05 20 0.512821 FALSE NA NA NA NA NA +MN908947.3 19516 T A 13 13 38 1 1 20 0.0714286 14 0.518519 FALSE NA NA NA NA NA +MN908947.3 19518 G -TA 14 14 38 1 0 20 0.0714286 14 0.535714 FALSE NA NA NA NA NA +MN908947.3 19548 A T 14 14 38 6 0 33 0.3 20 0.01188 FALSE NA NA NA NA NA +MN908947.3 20268 A G 0 0 0 327 1 38 1 327 4.28941e-196 TRUE NA NA NA NA NA +MN908947.3 21101 G +T 3288 3193 37 110 0 20 0.0331325 3320 5.69779e-11 TRUE NA NA NA NA NA +MN908947.3 23403 A G 1 1 39 5964 5939 38 0.99933 5968 0 TRUE NA NA NA NA NA +MN908947.3 28465 T A 178 13 38 44 44 32 0.198198 222 4.80414e-15 TRUE NA NA NA NA NA +MN908947.3 28770 C T 148 141 36 1513 40 38 0.910897 1661 0 TRUE NA NA NA NA NA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/SAMEA6808195-variants.vcf Fri Jun 05 05:10:05 2020 +0000 @@ -0,0 +1,23 @@ +##fileformat=VCFv4.2 +##source=iVar +##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth"> +##FILTER=<ID=PASS,Description="Result of p-value <= 0.05"> +##FILTER=<ID=FAIL,Description="Result of p-value > 0.05"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=REF_DP,Number=1,Type=Integer,Description="Depth of reference base"> +##FORMAT=<ID=REF_RV,Number=1,Type=Integer,Description="Depth of reference base on reverse reads"> +##FORMAT=<ID=REF_QUAL,Number=1,Type=Integer,Description="Mean quality of reference base"> +##FORMAT=<ID=ALT_DP,Number=1,Type=Integer,Description="Depth of alternate base"> +##FORMAT=<ID=ALT_RV,Number=1,Type=Integer,Description="Deapth of alternate base on reverse reads"> +##FORMAT=<ID=ALT_QUAL,Number=1,Type=String,Description="Mean quality of alternate base"> +##FORMAT=<ID=ALT_FREQ,Number=1,Type=String,Description="Frequency of alternate base"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMEA6808195-variants +MN908947.3 30 . A T . PASS DP=53 GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ 1:23:10:38:30:0:33:0.566038 +MN908947.3 228 . C T . PASS DP=3932 GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ 1:3:0:47:3929:107:73:0.999237 +MN908947.3 241 . C T . PASS DP=3943 GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ 1:0:0:0:3943:127:72:1 +MN908947.3 3037 . C T . PASS DP=1713 GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ 1:0:0:0:1713:100:74:1 +MN908947.3 14408 . C T . PASS DP=6480 GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ 1:2:0:53:6478:209:75:0.999691 +MN908947.3 17126 . T C . PASS DP=11854 GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ 1:3:0:39:11851:4693:38:0.999747 +MN908947.3 20268 . A G . PASS DP=327 GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ 1:0:0:0:327:1:38:1 +MN908947.3 23403 . A G . PASS DP=5968 GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ 1:1:1:39:5964:5939:38:0.99933 +MN908947.3 28770 . C T . PASS DP=1661 GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ 1:148:141:36:1513:40:38:0.910897