Mercurial > repos > drosofff > mir_parser
annotate MirParser.py @ 1:101fec3cba04 draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author | drosofff |
---|---|
date | Thu, 13 Aug 2015 06:16:29 -0400 |
parents | 035df35a257e |
children | 74394e39ad22 |
rev | line source |
---|---|
0
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
1 #!/usr/bin/env python |
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
2 # python parser module for pre-mir and mature miRNAs, guided by mirbase.org GFF3 |
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
3 # version 0.0.9 (1-6-2014) |
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
4 # Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> |
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> |
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> |
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
7 |
1
101fec3cba04
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
0
diff
changeset
|
8 import sys |
101fec3cba04
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
0
diff
changeset
|
9 import subprocess |
101fec3cba04
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
0
diff
changeset
|
10 |
0
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
11 from smRtools import * |
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
12 |
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
# Positional command-line arguments (see the usage header at the top of the file).
IndexSource = sys.argv[1]
ExtractionDirective = sys.argv[2]
# Translate the extraction directive into the genomeRefFormat keyword value
# that is later handed to HandleSmRNAwindows.
if ExtractionDirective == "--do_not_extract_index":
    genomeRefFormat = "fastaSource"
elif ExtractionDirective == "--extract_index":
    genomeRefFormat = "bowtieIndex"
else:
    # An unrecognised directive used to leave genomeRefFormat unbound, so the
    # script only failed later with a NameError; stop here with a clear message.
    sys.exit("MirParser.py: unknown extraction directive %r "
             "(expected --do_not_extract_index or --extract_index)"
             % ExtractionDirective)
OutputPre_mirs = sys.argv[3]
OutputMature_Mirs = sys.argv[4]
GFF3_file = sys.argv[5]
# Path of the lattice dataframe, or "dummy_dataframe_path" to skip that step.
lattice = sys.argv[6]
Rcode = sys.argv[7]
# NOTE(review): latticePDF is not used in the visible part of this script;
# presumably the R code writes to it - confirm.
latticePDF = sys.argv[8]
# The remaining arguments come in groups of three: filePath, FileExt, FileLabel.
_sample_args = sys.argv[9:]
if len(_sample_args) % 3 != 0:
    # An incomplete trailing group would otherwise fail later when the
    # triplets are unpacked.
    sys.exit("MirParser.py: sample arguments must come in "
             "filePath/FileExt/FileLabel triplets (got %d values)"
             % len(_sample_args))
Triplets = [_sample_args[i:i + 3] for i in range(0, len(_sample_args), 3)]
0
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
# One HandleSmRNAwindows instance per sample, keyed by the sample label.
MasterListOfGenomes = {}

for filePath, FileExt, FileLabel in Triplets:
    # Echo the label of the sample being loaded on stdout.
    print(FileLabel)
    window_options = dict(
        alignmentFile=filePath,
        alignmentFileFormat=FileExt,
        genomeRefFile=IndexSource,
        genomeRefFormat=genomeRefFormat,
        biosample=FileLabel,
    )
    MasterListOfGenomes[FileLabel] = HandleSmRNAwindows(**window_options)
0
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
35 |
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
# Column names of the count tables: "gene" followed by one label per sample,
# in the order the samples were given on the command line.
header = ["gene"]
header.extend(triplet[2] for triplet in Triplets)

# hit_table collects the output rows as tab-joined strings; the first entry
# is the header row.
hit_table = ["\t".join(header)]
0
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
41 |
1
101fec3cba04
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
0
diff
changeset
|
# Read the GFF3 annotation. Each feature yields one hit_table row (its Name
# followed by one read count per sample). When a lattice dataframe was
# requested, features whose Name contains neither "5p" nor "3p" also
# contribute one coverage entry per sample to lattice_dataframe.
lattice_dataframe = []
want_lattice = lattice != "dummy_dataframe_path"
with open(GFF3_file, "r") as gff3:
    for line in gff3:
        # Skip comment/pragma lines, and blank lines (which used to crash
        # with an IndexError on gff_fields[3]).
        if line[0] == "#" or not line.strip():
            continue
        # Strip only the line terminator: the former line[:-1] removed a
        # real character when the last line had no newline, and left a
        # stray "\r" on CRLF files.
        gff_fields = line.rstrip("\r\n").split("\t")
        chrom = gff_fields[0]
        # Isolate the value of the Name= attribute in the last GFF column.
        gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0]
        item_upstream_coordinate = int(gff_fields[3])
        item_downstream_coordinate = int(gff_fields[4])
        item_polarity = "forward" if gff_fields[6] == "+" else "reverse"
        # Names without "5p"/"3p" are the ones written to the pre-mir table.
        is_pre_mir = ("5p" not in gff_name) and ("3p" not in gff_name)
        item_line = [gff_name]
        for sample in header[1:]:
            # NOTE(review): raises KeyError if the GFF3 names a chromosome
            # absent from this sample's instanceDict - confirm whether that
            # can happen with the inputs this tool receives.
            windows = MasterListOfGenomes[sample].instanceDict[chrom]
            count = windows.readcount(upstream_coord=item_upstream_coordinate,
                                      downstream_coord=item_downstream_coordinate,
                                      polarity=item_polarity)
            item_line.append(str(count))
            # subtreatment for lattice
            if want_lattice and is_pre_mir:
                lattice_dataframe.append(windows.readcoverage(
                    upstream_coord=item_upstream_coordinate,
                    downstream_coord=item_downstream_coordinate,
                    windowName=gff_name + "_" + sample))
            # end of subtreatment for lattice
        hit_table.append("\t".join(item_line))
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
73 |
1
101fec3cba04
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
0
diff
changeset
|
# Write the two count tables. Both start with the header row; data rows are
# sorted alphabetically and split on whether the row contains "5p" or "3p"
# (mature miRNAs) or not (pre-mirs).
# Sort once and reuse the result for both tables.
sorted_rows = sorted(hit_table[1:])

finalPreList = [row for row in sorted_rows
                if ("5p" not in row) and ("3p" not in row)]
# "with" guarantees the handle is closed even if a write fails.
with open(OutputPre_mirs, "w") as Fpremirs:
    Fpremirs.write(hit_table[0] + "\n")
    # An empty list still emits a single newline, as the original print did.
    Fpremirs.write("\n".join(finalPreList) + "\n")

finalMatureList = [row for row in sorted_rows
                   if ("5p" in row) or ("3p" in row)]
with open(OutputMature_Mirs, "w") as Fmaturemires:
    Fmaturemires.write(hit_table[0] + "\n")
    Fmaturemires.write("\n".join(finalMatureList) + "\n")
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
85 |
035df35a257e
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
# Optional lattice step: dump the coverage dataframe, then run the R script.
if lattice != "dummy_dataframe_path":
    lattice_columns = ("sample", "mir", "offset", "offsetNorm",
                       "counts", "countsNorm", "polarity")
    # The file is closed on leaving the "with" block, i.e. before Rscript
    # is started.
    with open(lattice, "w") as Flattice:
        Flattice.write("\t".join(lattice_columns) + "\n")
        Flattice.write("\n".join(lattice_dataframe) + "\n")
    # Pass the arguments as a list: splitting a command string on whitespace
    # broke R script paths that contain spaces.
    R_command = ["Rscript", Rcode]
    process = subprocess.Popen(R_command)
    # Wait for R to finish; its exit status is not inspected (unchanged).
    process.wait()