comparison MirParser.py @ 1:101fec3cba04 draft

planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author drosofff
date Thu, 13 Aug 2015 06:16:29 -0400
parents 035df35a257e
children 74394e39ad22
comparison
equal deleted inserted replaced
0:035df35a257e 1:101fec3cba04
3 # version 0.0.9 (1-6-2014) 3 # version 0.0.9 (1-6-2014)
4 # Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> 4 # Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3>
5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> 5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF">
6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> 6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib>
7 7
8 import sys, subprocess 8 import sys
9 import subprocess
10
9 from smRtools import * 11 from smRtools import *
10 12
11 IndexSource = sys.argv[1] 13 IndexSource = sys.argv[1]
12 ExtractionDirective = sys.argv[2] 14 ExtractionDirective = sys.argv[2]
13 if ExtractionDirective == "--do_not_extract_index": 15 if ExtractionDirective == "--do_not_extract_index":
14 genomeRefFormat = "fastaSource" 16 genomeRefFormat = "fastaSource"
15 elif ExtractionDirective == "--extract_index": 17 elif ExtractionDirective == "--extract_index":
16 genomeRefFormat = "bowtieIndex" 18 genomeRefFormat = "bowtieIndex"
17 OutputPre_mirs = sys.argv[3] 19 OutputPre_mirs = sys.argv[3]
18 OutputMature_Mirs = sys.argv[4] 20 OutputMature_Mirs = sys.argv[4]
19 GFF3_file = sys.argv[5] 21 GFF3_file = sys.argv[5]
20 lattice = sys.argv[6] 22 lattice = sys.argv[6]
21 Rcode = sys.argv[7] 23 Rcode = sys.argv[7]
22 latticePDF = sys.argv[8] 24 latticePDF = sys.argv[8]
23 Triplets = [sys.argv[9:][i:i+3] for i in xrange(0, len(sys.argv[9:]), 3)] 25 Triplets = [sys.argv[9:][i:i + 3] for i in xrange(0, len(sys.argv[9:]), 3)]
24 MasterListOfGenomes = {} 26 MasterListOfGenomes = {}
25 27
26 for [filePath, FileExt, FileLabel] in Triplets: 28 for [filePath, FileExt, FileLabel] in Triplets:
27 print FileLabel 29 print FileLabel
28 MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=FileExt, genomeRefFile=IndexSource, genomeRefFormat=genomeRefFormat, biosample=FileLabel) 30 MasterListOfGenomes[FileLabel] = HandleSmRNAwindows(alignmentFile=filePath,
31 alignmentFileFormat=FileExt,
32 genomeRefFile=IndexSource,
33 genomeRefFormat=genomeRefFormat,
34 biosample=FileLabel)
29 35
30 header = ["gene"] 36 header = ["gene"]
31 for [filePath, FileExt, FileLabel] in Triplets: 37 for [filePath, FileExt, FileLabel] in Triplets:
32 header.append(FileLabel) 38 header.append(FileLabel)
33 39
34 hit_table = ["\t".join(header)] # table header: gene, sample1, sample2, sample3, etc. separated by tabulation 40 hit_table = ["\t".join(header)] # table header: gene, sample1, sample2, sample3, etc. separated by tabulation
35 41
36 ## read GFF3 to subinstantiate 42 # read GFF3 to subinstantiate
37 gff3 = open (GFF3_file, "r") 43 gff3 = open(GFF3_file, "r")
38 lattice_dataframe = [] 44 lattice_dataframe = []
39 for line in gff3: 45 for line in gff3:
40 if line[0] == "#": continue 46 if line[0] == "#":
41 gff_fields = line[:-1].split("\t") 47 continue
42 chrom = gff_fields[0] 48 gff_fields = line[:-1].split("\t")
43 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name 49 chrom = gff_fields[0]
44 item_upstream_coordinate = int(gff_fields[3]) 50 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
45 item_downstream_coordinate = int(gff_fields[4]) 51 item_upstream_coordinate = int(gff_fields[3])
46 if gff_fields[6] == "+": 52 item_downstream_coordinate = int(gff_fields[4])
47 item_polarity = "forward" 53 if gff_fields[6] == "+":
48 else: 54 item_polarity = "forward"
49 item_polarity = "reverse" 55 else:
50 item_line = [gff_name] 56 item_polarity = "reverse"
51 for sample in header[1:]: 57 item_line = [gff_name]
52 count = MasterListOfGenomes[sample].instanceDict[chrom].readcount(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, polarity=item_polarity) 58 for sample in header[1:]:
53 item_line.append(str(count)) 59 count = MasterListOfGenomes[sample].instanceDict[chrom].readcount(upstream_coord=item_upstream_coordinate,
54 ## subtreatment for lattice 60 downstream_coord=item_downstream_coordinate,
55 if lattice != "dummy_dataframe_path": 61 polarity=item_polarity)
56 if ("5p" not in gff_name) and ("3p" not in gff_name): 62 item_line.append(str(count))
57 lattice_dataframe.append(MasterListOfGenomes[sample].instanceDict[chrom].readcoverage(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, windowName=gff_name+"_"+sample) ) 63 # subtreatment for lattice
58 ## end of subtreatment for lattice 64 if lattice != "dummy_dataframe_path":
59 hit_table.append("\t".join(item_line) ) 65 if ("5p" not in gff_name) and ("3p" not in gff_name):
66 lattice_dataframe.append(MasterListOfGenomes[sample].instanceDict[chrom].readcoverage(
67 upstream_coord=item_upstream_coordinate,
68 downstream_coord=item_downstream_coordinate,
69 windowName=gff_name + "_" + sample))
70 # end of subtreatment for lattice
71 hit_table.append("\t".join(item_line))
60 gff3.close() 72 gff3.close()
61 73
62 Fpremirs = open (OutputPre_mirs, "w") 74 Fpremirs = open(OutputPre_mirs, "w")
63 print >> Fpremirs, hit_table[0] 75 print >> Fpremirs, hit_table[0]
64 finalPreList = [ i for i in sorted(hit_table[1:]) if ("5p" not in i) and ("3p" not in i)] 76 finalPreList = [i for i in sorted(hit_table[1:]) if ("5p" not in i) and ("3p" not in i)]
65 print >> Fpremirs, "\n".join(finalPreList ) 77 print >> Fpremirs, "\n".join(finalPreList)
66 Fpremirs.close() 78 Fpremirs.close()
67 79
68 Fmaturemires = open (OutputMature_Mirs, "w") 80 Fmaturemires = open(OutputMature_Mirs, "w")
69 print >> Fmaturemires, hit_table[0] 81 print >> Fmaturemires, hit_table[0]
70 finalMatureList = [ i for i in sorted(hit_table[1:]) if ("5p" in i) or ("3p" in i)] 82 finalMatureList = [i for i in sorted(hit_table[1:]) if ("5p" in i) or ("3p" in i)]
71 print >> Fmaturemires, "\n".join(finalMatureList ) 83 print >> Fmaturemires, "\n".join(finalMatureList)
72 Fmaturemires.close() 84 Fmaturemires.close()
73 85
74 if lattice != "dummy_dataframe_path": 86 if lattice != "dummy_dataframe_path":
75 Flattice = open(lattice, "w") 87 Flattice = open(lattice, "w")
76 print >> Flattice, "%s\t%s\t%s\t%s\t%s\t%s\t%s" % ("sample", "mir", "offset", "offsetNorm", "counts","countsNorm", "polarity") 88 print >> Flattice, "%s\t%s\t%s\t%s\t%s\t%s\t%s" % ("sample",
77 print >> Flattice, "\n".join(lattice_dataframe) 89 "mir",
78 Flattice.close() 90 "offset",
79 R_command="Rscript "+ Rcode 91 "offsetNorm",
80 process = subprocess.Popen(R_command.split()) 92 "counts",
81 process.wait() 93 "countsNorm",
94 "polarity")
95 print >> Flattice, "\n".join(lattice_dataframe)
96 Flattice.close()
97 R_command = "Rscript " + Rcode
98 process = subprocess.Popen(R_command.split())
99 process.wait()