changeset 0:c87f6ad32fd8 draft default tip

"planemo upload for repository https://github.com/dfornika/galaxytools/tree/master/tools/ivar_variants_to_vcf commit 16332019b4aab6af58c74e631f390dfeef23a3dc"
author dfornika
date Fri, 05 Jun 2020 05:10:05 +0000
parents
children
files ivar_variants_to_vcf.py ivar_variants_to_vcf.xml test-data/SAMEA6808195-variants.tsv test-data/SAMEA6808195-variants.vcf
diffstat 4 files changed, 176 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ivar_variants_to_vcf.py	Fri Jun 05 05:10:05 2020 +0000
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+import os
+import sys
+import re
+import errno
+import argparse
+
+def parse_args(args=None):
+    Description = 'Convert iVar variants tsv file to vcf format.'
+    Epilog = """Example usage: python ivar_variants_to_vcf.py <FILE_IN> <FILE_OUT>"""
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument('FILE_IN', help="Input tsv file.")
+    parser.add_argument('FILE_OUT', help="Full path to output vcf file.")
+    parser.add_argument('-po', '--pass_only', dest="PASS_ONLY", help="Only output variants that PASS all filters.",action='store_true')
+    parser.add_argument('-ma', '--min_allele_freq', type=float, dest="MIN_ALLELE_FREQ", default=0, help="Only output variants where allele frequency greater than this number (default: 0).")
+
+    return parser.parse_args(args)
+
+def make_dir(path):
+    if not len(path) == 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise
+
+def ivar_variants_to_vcf(FileIn,FileOut,passOnly=False,minAF=0):
+    filename = os.path.splitext(FileIn)[0]
+    header = ('##fileformat=VCFv4.2\n'
+              '##source=iVar\n'
+              '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n'
+              '##FILTER=<ID=PASS,Description="Result of p-value <= 0.05">\n'
+              '##FILTER=<ID=FAIL,Description="Result of p-value > 0.05">\n'
+              '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
+              '##FORMAT=<ID=REF_DP,Number=1,Type=Integer,Description="Depth of reference base">\n'
+              '##FORMAT=<ID=REF_RV,Number=1,Type=Integer,Description="Depth of reference base on reverse reads">\n'
+              '##FORMAT=<ID=REF_QUAL,Number=1,Type=Integer,Description="Mean quality of reference base">\n'
+              '##FORMAT=<ID=ALT_DP,Number=1,Type=Integer,Description="Depth of alternate base">\n'
+              '##FORMAT=<ID=ALT_RV,Number=1,Type=Integer,Description="Deapth of alternate base on reverse reads">\n'
+              '##FORMAT=<ID=ALT_QUAL,Number=1,Type=String,Description="Mean quality of alternate base">\n'
+              '##FORMAT=<ID=ALT_FREQ,Number=1,Type=String,Description="Frequency of alternate base">\n')
+    header += '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+filename+'\n'
+
+    varList = []
+    varCountDict = {'SNP':0, 'INS':0, 'DEL':0}
+    OutDir = os.path.dirname(FileOut)
+    make_dir(OutDir)
+    fout = open(FileOut,'w')
+    fout.write(header)
+    with open(FileIn) as f:
+        for line in f:
+            if not re.match("REGION",line):
+                line = re.split("\t", line)
+                CHROM=line[0]
+                POS=line[1]
+                ID='.'
+                REF=line[2]
+                ALT=line[3]
+                var_type = 'SNP'
+                if ALT[0] == '+':
+                    ALT = REF + ALT[1:]
+                    var_type = 'INS'
+                elif ALT[0] == '-':
+                    REF += ALT[1:]
+                    ALT = line[2]
+                    var_type = 'DEL'
+                QUAL='.'
+                pass_test=line[13]
+                if pass_test == 'TRUE':
+                    FILTER='PASS'
+                else:
+                    FILTER='FAIL'
+                INFO='DP='+line[11]
+                FORMAT='GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ'
+                SAMPLE='1:'+line[4]+':'+line[5]+':'+line[6]+':'+line[7]+':'+line[8]+':'+line[9]+':'+line[10]
+                oline = CHROM+'\t'+POS+'\t'+ID+'\t'+REF+'\t'+ALT+'\t'+QUAL+'\t'+FILTER+'\t'+INFO+'\t'+FORMAT+'\t'+SAMPLE+'\n'
+                writeLine = True
+                if passOnly and FILTER != 'PASS':
+                    writeLine = False
+                if float(line[10]) < minAF:
+                    writeLine = False
+                if (CHROM,POS,REF,ALT) in varList:
+                    writeLine = False
+                else:
+                    varList.append((CHROM,POS,REF,ALT))
+                if writeLine:
+                    varCountDict[var_type] += 1
+                    fout.write(oline)
+    fout.close()
+
+    ## Print variant counts to pass to MultiQC
+    varCountList = [(k, str(v)) for k, v in sorted(varCountDict.items())]
+    print('\t'.join(['sample'] + [x[0] for x in varCountList]))
+    print('\t'.join([filename] + [x[1] for x in varCountList]))
+
+def main(args=None):
+    args = parse_args(args)
+    ivar_variants_to_vcf(args.FILE_IN,args.FILE_OUT,args.PASS_ONLY,args.MIN_ALLELE_FREQ)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ivar_variants_to_vcf.xml	Fri Jun 05 05:10:05 2020 +0000
@@ -0,0 +1,29 @@
+<tool id="ivar_variants_to_vcf" name="iVar Variants to VCF" version="0.1.0">
+    <description>Convert iVar tabular variant output to .vcf format</description>
+    <requirements />
+    <command detect_errors="exit_code"><![CDATA[
+        '$__tool_directory__/ivar_variants_to_vcf.py'
+        -ma '${min_allele_freq}'
+        ${input}
+        ${output}
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="tabular" />
+        <param name="min_allele_freq" type="float" min="0.0" value="0.0" max="1.0" />
+    </inputs>
+    <outputs>
+        <data name="output" label="Variants (VCF)" format="vcf" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="SAMEA6808195-variants.tsv" />
+            <param name="min_allele_freq" value="0.5" />
+            <output name="output" file="SAMEA6808195-variants.vcf" lines_diff="2"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+    ]]></help>
+    <citations>
+        <citation type="doi">10.5281/zenodo.3872730</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/SAMEA6808195-variants.tsv	Fri Jun 05 05:10:05 2020 +0000
@@ -0,0 +1,21 @@
+REGION	POS	REF	ALT	REF_DP	REF_RV	REF_QUAL	ALT_DP	ALT_RV	ALT_QUAL	ALT_FREQ	TOTAL_DP	PVAL	PASS	GFF_FEATURE	REF_CODON	REF_AA	ALT_CODON	ALT_AA
+MN908947.3	30	A	T	23	10	38	30	0	33	0.566038	53	1.95736e-12	TRUE	NA	NA	NA	NA	NA
+MN908947.3	228	C	T	3	0	47	3929	107	73	0.999237	3932	0	TRUE	NA	NA	NA	NA	NA
+MN908947.3	241	C	T	0	0	0	3943	127	72	1	3943	0	TRUE	NA	NA	NA	NA	NA
+MN908947.3	3037	C	T	0	0	0	1713	100	74	1	1713	0	TRUE	NA	NA	NA	NA	NA
+MN908947.3	6696	C	+T	4306	119	75	266	0	20	0.0313901	8474	1.36673e-56	TRUE	NA	NA	NA	NA	NA
+MN908947.3	6872	G	A	4567	4278	35	386	17	74	0.0779011	4955	1.45993e-131	TRUE	NA	NA	NA	NA	NA
+MN908947.3	14408	C	T	2	0	53	6478	209	75	0.999691	6480	0	TRUE	NA	NA	NA	NA	NA
+MN908947.3	15168	G	A	1792	1774	25	96	96	20	0.0508475	1888	1.18961e-14	TRUE	NA	NA	NA	NA	NA
+MN908947.3	17126	T	C	3	0	39	11851	4693	38	0.999747	11854	0	TRUE	NA	NA	NA	NA	NA
+MN908947.3	19298	A	T	22	5	37	1	1	34	0.0434783	23	0.5	FALSE	NA	NA	NA	NA	NA
+MN908947.3	19305	T	C	18	2	38	1	0	20	0.0526316	19	0.513514	FALSE	NA	NA	NA	NA	NA
+MN908947.3	19345	T	C	19	3	38	1	0	37	0.05	20	0.512821	FALSE	NA	NA	NA	NA	NA
+MN908947.3	19516	T	A	13	13	38	1	1	20	0.0714286	14	0.518519	FALSE	NA	NA	NA	NA	NA
+MN908947.3	19518	G	-TA	14	14	38	1	0	20	0.0714286	14	0.535714	FALSE	NA	NA	NA	NA	NA
+MN908947.3	19548	A	T	14	14	38	6	0	33	0.3	20	0.01188	FALSE	NA	NA	NA	NA	NA
+MN908947.3	20268	A	G	0	0	0	327	1	38	1	327	4.28941e-196	TRUE	NA	NA	NA	NA	NA
+MN908947.3	21101	G	+T	3288	3193	37	110	0	20	0.0331325	3320	5.69779e-11	TRUE	NA	NA	NA	NA	NA
+MN908947.3	23403	A	G	1	1	39	5964	5939	38	0.99933	5968	0	TRUE	NA	NA	NA	NA	NA
+MN908947.3	28465	T	A	178	13	38	44	44	32	0.198198	222	4.80414e-15	TRUE	NA	NA	NA	NA	NA
+MN908947.3	28770	C	T	148	141	36	1513	40	38	0.910897	1661	0	TRUE	NA	NA	NA	NA	NA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/SAMEA6808195-variants.vcf	Fri Jun 05 05:10:05 2020 +0000
@@ -0,0 +1,23 @@
+##fileformat=VCFv4.2
+##source=iVar
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##FILTER=<ID=PASS,Description="Result of p-value <= 0.05">
+##FILTER=<ID=FAIL,Description="Result of p-value > 0.05">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=REF_DP,Number=1,Type=Integer,Description="Depth of reference base">
+##FORMAT=<ID=REF_RV,Number=1,Type=Integer,Description="Depth of reference base on reverse reads">
+##FORMAT=<ID=REF_QUAL,Number=1,Type=Integer,Description="Mean quality of reference base">
+##FORMAT=<ID=ALT_DP,Number=1,Type=Integer,Description="Depth of alternate base">
+##FORMAT=<ID=ALT_RV,Number=1,Type=Integer,Description="Deapth of alternate base on reverse reads">
+##FORMAT=<ID=ALT_QUAL,Number=1,Type=String,Description="Mean quality of alternate base">
+##FORMAT=<ID=ALT_FREQ,Number=1,Type=String,Description="Frequency of alternate base">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMEA6808195-variants
+MN908947.3	30	.	A	T	.	PASS	DP=53	GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ	1:23:10:38:30:0:33:0.566038
+MN908947.3	228	.	C	T	.	PASS	DP=3932	GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ	1:3:0:47:3929:107:73:0.999237
+MN908947.3	241	.	C	T	.	PASS	DP=3943	GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ	1:0:0:0:3943:127:72:1
+MN908947.3	3037	.	C	T	.	PASS	DP=1713	GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ	1:0:0:0:1713:100:74:1
+MN908947.3	14408	.	C	T	.	PASS	DP=6480	GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ	1:2:0:53:6478:209:75:0.999691
+MN908947.3	17126	.	T	C	.	PASS	DP=11854	GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ	1:3:0:39:11851:4693:38:0.999747
+MN908947.3	20268	.	A	G	.	PASS	DP=327	GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ	1:0:0:0:327:1:38:1
+MN908947.3	23403	.	A	G	.	PASS	DP=5968	GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ	1:1:1:39:5964:5939:38:0.99933
+MN908947.3	28770	.	C	T	.	PASS	DP=1661	GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ	1:148:141:36:1513:40:38:0.910897