annotate MirParser.py @ 7:20b8ff9c1cb9 draft default tip

Uploaded
author drosofff
date Mon, 23 Jun 2014 05:24:28 -0400
parents f6c22925fc3c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
1 #!/usr/bin/python
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
2 # python parser module for pre-mir and mature miRNAs, guided by mirbase.org GFF3
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
3 # version 0.0.9 (1-6-2014)
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
4 # Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3>
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF">
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib>
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
7
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
8 import sys, subprocess
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
9 from smRtools import *
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
10
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
# --- Command-line arguments (positions are listed in the usage header) ---
# Eight fixed arguments are required; alignment triplets follow from argv[9].
if len(sys.argv) < 9:
    sys.exit("MirParser.py: expected at least 8 arguments, got %d"
             % (len(sys.argv) - 1))

IndexSource = sys.argv[1]
ExtractionDirective = sys.argv[2]
# The directive selects the genomeRefFormat value later passed to
# HandleSmRNAwindows.
if ExtractionDirective == "--do_not_extract_index":
    genomeRefFormat = "fastaSource"
elif ExtractionDirective == "--extract_index":
    genomeRefFormat = "bowtieIndex"
else:
    # Fail here with a clear message instead of a NameError further down.
    sys.exit("MirParser.py: unknown extraction directive %r (expected "
             "--extract_index or --do_not_extract_index)" % ExtractionDirective)
OutputPre_mirs = sys.argv[3]
OutputMature_Mirs = sys.argv[4]
GFF3_file = sys.argv[5]
lattice = sys.argv[6]
Rcode = sys.argv[7]
latticePDF = sys.argv[8]

# Remaining arguments come in groups of three: filePath, FileExt, FileLabel.
alignment_args = sys.argv[9:]
if len(alignment_args) % 3:
    sys.exit("MirParser.py: alignment arguments must come in "
             "filePath/FileExt/FileLabel triplets, got %d value(s)"
             % len(alignment_args))
Triplets = [alignment_args[i:i + 3] for i in range(0, len(alignment_args), 3)]
MasterListOfGenomes = {}
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
25
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
# Build one HandleSmRNAwindows result per alignment file, keyed by its label.
# The label is echoed to stdout as each file is processed.
for filePath, FileExt, FileLabel in Triplets:
    print(FileLabel)
    MasterListOfGenomes[FileLabel] = HandleSmRNAwindows(
        alignmentFile=filePath,
        alignmentFileFormat=FileExt,
        genomeRefFile=IndexSource,
        genomeRefFormat=genomeRefFormat,
        biosample=FileLabel
    )
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
29
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
# Column titles: "gene" followed by one column per sample label, in the order
# the triplets were given on the command line.
header = ["gene"] + [FileLabel for filePath, FileExt, FileLabel in Triplets]

# The table starts with the tab-separated header row: gene, sample1, sample2, ...
hit_table = ["\t".join(header)]
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
35
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
## Read the GFF3 file and, for every feature, count reads in every sample.
lattice_dataframe = []
with open(GFF3_file, "r") as gff3:
    for line in gff3:
        # Skip comment/pragma lines, and blank lines (which would otherwise
        # fail on the column lookups below).
        if line.startswith("#") or not line.strip():
            continue
        # Strip only the line terminator: slicing off the last character would
        # eat real data on a final line without a newline and would leave "\r"
        # behind on CRLF files.
        gff_fields = line.rstrip("\r\n").split("\t")
        chrom = gff_fields[0]
        # Isolate the value of the Name= attribute in the last column.
        gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0]
        item_upstream_coordinate = int(gff_fields[3])
        item_downstream_coordinate = int(gff_fields[4])
        # Anything other than "+" in the strand column is treated as reverse.
        item_polarity = "forward" if gff_fields[6] == "+" else "reverse"

        item_line = [gff_name]
        for sample in header[1:]:
            chrom_instance = MasterListOfGenomes[sample].instanceDict[chrom]
            count = chrom_instance.readcount(
                upstream_coord=item_upstream_coordinate,
                downstream_coord=item_downstream_coordinate,
                polarity=item_polarity)
            item_line.append(str(count))
            ## Sub-treatment for lattice: only when a dataframe path was given,
            ## and only for features whose name contains neither "5p" nor "3p".
            # NOTE(review): nested in the sample loop because it uses `sample`;
            # confirm against the original indentation.
            if (lattice != "dummy_dataframe_path"
                    and "5p" not in gff_name and "3p" not in gff_name):
                lattice_dataframe.append(chrom_instance.readcoverage(
                    upstream_coord=item_upstream_coordinate,
                    downstream_coord=item_downstream_coordinate,
                    windowName=gff_name + "_" + sample))
            ## end of sub-treatment for lattice
        hit_table.append("\t".join(item_line))
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
61
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
# Write the two count tables. Data rows are sorted once, then split on whether
# the row text contains "5p" or "3p": rows with neither go to the pre-mir
# table, the rest go to the mature miR table.
sorted_rows = sorted(hit_table[1:])
finalPreList = [row for row in sorted_rows
                if "5p" not in row and "3p" not in row]
finalMatureList = [row for row in sorted_rows
                   if "5p" in row or "3p" in row]

for output_path, rows in ((OutputPre_mirs, finalPreList),
                          (OutputMature_Mirs, finalMatureList)):
    # `with` closes the file even if a write fails.
    with open(output_path, "w") as table_file:
        table_file.write(hit_table[0] + "\n")  # header row
        # Same bytes as before: joined rows followed by one newline
        # (an empty table still yields a single blank line).
        table_file.write("\n".join(rows) + "\n")
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
73
f6c22925fc3c Uploaded
drosofff
parents:
diff changeset
# Optional lattice output: write the coverage dataframe, then run the R script.
# Skipped entirely when the placeholder path "dummy_dataframe_path" was given.
if lattice != "dummy_dataframe_path":
    with open(lattice, "w") as Flattice:
        Flattice.write("\t".join(("sample", "mir", "offset", "offsetNorm",
                                  "counts", "countsNorm", "polarity")) + "\n")
        Flattice.write("\n".join(lattice_dataframe) + "\n")
    # Pass the script path as a single argv element; splitting a command
    # string on whitespace breaks paths that contain spaces.
    process = subprocess.Popen(["Rscript", Rcode])
    # Wait for R to finish; its exit status is not checked, as before.
    process.wait()