Mercurial > repos > drosofff > mir_parser
comparison MirParser.py @ 1:101fec3cba04 draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author | drosofff |
---|---|
date | Thu, 13 Aug 2015 06:16:29 -0400 |
parents | 035df35a257e |
children | 74394e39ad22 |
comparison
equal
deleted
inserted
replaced
0:035df35a257e | 1:101fec3cba04 |
---|---|
3 # version 0.0.9 (1-6-2014) | 3 # version 0.0.9 (1-6-2014) |
4 # Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> | 4 # Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> |
5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> | 5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> |
6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> | 6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> |
7 | 7 |
8 import sys, subprocess | 8 import sys |
9 import subprocess | |
10 | |
9 from smRtools import * | 11 from smRtools import * |
10 | 12 |
11 IndexSource = sys.argv[1] | 13 IndexSource = sys.argv[1] |
12 ExtractionDirective = sys.argv[2] | 14 ExtractionDirective = sys.argv[2] |
13 if ExtractionDirective == "--do_not_extract_index": | 15 if ExtractionDirective == "--do_not_extract_index": |
14 genomeRefFormat = "fastaSource" | 16 genomeRefFormat = "fastaSource" |
15 elif ExtractionDirective == "--extract_index": | 17 elif ExtractionDirective == "--extract_index": |
16 genomeRefFormat = "bowtieIndex" | 18 genomeRefFormat = "bowtieIndex" |
17 OutputPre_mirs = sys.argv[3] | 19 OutputPre_mirs = sys.argv[3] |
18 OutputMature_Mirs = sys.argv[4] | 20 OutputMature_Mirs = sys.argv[4] |
19 GFF3_file = sys.argv[5] | 21 GFF3_file = sys.argv[5] |
20 lattice = sys.argv[6] | 22 lattice = sys.argv[6] |
21 Rcode = sys.argv[7] | 23 Rcode = sys.argv[7] |
22 latticePDF = sys.argv[8] | 24 latticePDF = sys.argv[8] |
23 Triplets = [sys.argv[9:][i:i+3] for i in xrange(0, len(sys.argv[9:]), 3)] | 25 Triplets = [sys.argv[9:][i:i + 3] for i in xrange(0, len(sys.argv[9:]), 3)] |
24 MasterListOfGenomes = {} | 26 MasterListOfGenomes = {} |
25 | 27 |
26 for [filePath, FileExt, FileLabel] in Triplets: | 28 for [filePath, FileExt, FileLabel] in Triplets: |
27 print FileLabel | 29 print FileLabel |
28 MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=FileExt, genomeRefFile=IndexSource, genomeRefFormat=genomeRefFormat, biosample=FileLabel) | 30 MasterListOfGenomes[FileLabel] = HandleSmRNAwindows(alignmentFile=filePath, |
31 alignmentFileFormat=FileExt, | |
32 genomeRefFile=IndexSource, | |
33 genomeRefFormat=genomeRefFormat, | |
34 biosample=FileLabel) | |
29 | 35 |
30 header = ["gene"] | 36 header = ["gene"] |
31 for [filePath, FileExt, FileLabel] in Triplets: | 37 for [filePath, FileExt, FileLabel] in Triplets: |
32 header.append(FileLabel) | 38 header.append(FileLabel) |
33 | 39 |
34 hit_table = ["\t".join(header)] # table header: gene, sample1, sample2, sample3, etc. separated by tabulation | 40 hit_table = ["\t".join(header)] # table header: gene, sample1, sample2, sample3, etc. separated by tabulation |
35 | 41 |
36 ## read GFF3 to subinstantiate | 42 # read GFF3 to subinstantiate |
37 gff3 = open (GFF3_file, "r") | 43 gff3 = open(GFF3_file, "r") |
38 lattice_dataframe = [] | 44 lattice_dataframe = [] |
39 for line in gff3: | 45 for line in gff3: |
40 if line[0] == "#": continue | 46 if line[0] == "#": |
41 gff_fields = line[:-1].split("\t") | 47 continue |
42 chrom = gff_fields[0] | 48 gff_fields = line[:-1].split("\t") |
43 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name | 49 chrom = gff_fields[0] |
44 item_upstream_coordinate = int(gff_fields[3]) | 50 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name |
45 item_downstream_coordinate = int(gff_fields[4]) | 51 item_upstream_coordinate = int(gff_fields[3]) |
46 if gff_fields[6] == "+": | 52 item_downstream_coordinate = int(gff_fields[4]) |
47 item_polarity = "forward" | 53 if gff_fields[6] == "+": |
48 else: | 54 item_polarity = "forward" |
49 item_polarity = "reverse" | 55 else: |
50 item_line = [gff_name] | 56 item_polarity = "reverse" |
51 for sample in header[1:]: | 57 item_line = [gff_name] |
52 count = MasterListOfGenomes[sample].instanceDict[chrom].readcount(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, polarity=item_polarity) | 58 for sample in header[1:]: |
53 item_line.append(str(count)) | 59 count = MasterListOfGenomes[sample].instanceDict[chrom].readcount(upstream_coord=item_upstream_coordinate, |
54 ## subtreatement for lattice | 60 downstream_coord=item_downstream_coordinate, |
55 if lattice != "dummy_dataframe_path": | 61 polarity=item_polarity) |
56 if ("5p" not in gff_name) and ("3p" not in gff_name): | 62 item_line.append(str(count)) |
57 lattice_dataframe.append(MasterListOfGenomes[sample].instanceDict[chrom].readcoverage(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, windowName=gff_name+"_"+sample) ) | 63 # subtreatement for lattice |
58 ## end of subtreatement for lattice | 64 if lattice != "dummy_dataframe_path": |
59 hit_table.append("\t".join(item_line) ) | 65 if ("5p" not in gff_name) and ("3p" not in gff_name): |
66 lattice_dataframe.append(MasterListOfGenomes[sample].instanceDict[chrom].readcoverage( | |
67 upstream_coord=item_upstream_coordinate, | |
68 downstream_coord=item_downstream_coordinate, | |
69 windowName=gff_name + "_" + sample)) | |
70 # end of subtreatement for lattice | |
71 hit_table.append("\t".join(item_line)) | |
60 gff3.close() | 72 gff3.close() |
61 | 73 |
62 Fpremirs = open (OutputPre_mirs, "w") | 74 Fpremirs = open(OutputPre_mirs, "w") |
63 print >> Fpremirs, hit_table[0] | 75 print >> Fpremirs, hit_table[0] |
64 finalPreList = [ i for i in sorted(hit_table[1:]) if ("5p" not in i) and ("3p" not in i)] | 76 finalPreList = [i for i in sorted(hit_table[1:]) if ("5p" not in i) and ("3p" not in i)] |
65 print >> Fpremirs, "\n".join(finalPreList ) | 77 print >> Fpremirs, "\n".join(finalPreList) |
66 Fpremirs.close() | 78 Fpremirs.close() |
67 | 79 |
68 Fmaturemires = open (OutputMature_Mirs, "w") | 80 Fmaturemires = open(OutputMature_Mirs, "w") |
69 print >> Fmaturemires, hit_table[0] | 81 print >> Fmaturemires, hit_table[0] |
70 finalMatureList = [ i for i in sorted(hit_table[1:]) if ("5p" in i) or ("3p" in i)] | 82 finalMatureList = [i for i in sorted(hit_table[1:]) if ("5p" in i) or ("3p" in i)] |
71 print >> Fmaturemires, "\n".join(finalMatureList ) | 83 print >> Fmaturemires, "\n".join(finalMatureList) |
72 Fmaturemires.close() | 84 Fmaturemires.close() |
73 | 85 |
74 if lattice != "dummy_dataframe_path": | 86 if lattice != "dummy_dataframe_path": |
75 Flattice = open(lattice, "w") | 87 Flattice = open(lattice, "w") |
76 print >> Flattice, "%s\t%s\t%s\t%s\t%s\t%s\t%s" % ("sample", "mir", "offset", "offsetNorm", "counts","countsNorm", "polarity") | 88 print >> Flattice, "%s\t%s\t%s\t%s\t%s\t%s\t%s" % ("sample", |
77 print >> Flattice, "\n".join(lattice_dataframe) | 89 "mir", |
78 Flattice.close() | 90 "offset", |
79 R_command="Rscript "+ Rcode | 91 "offsetNorm", |
80 process = subprocess.Popen(R_command.split()) | 92 "counts", |
81 process.wait() | 93 "countsNorm", |
94 "polarity") | |
95 print >> Flattice, "\n".join(lattice_dataframe) | |
96 Flattice.close() | |
97 R_command = "Rscript " + Rcode | |
98 process = subprocess.Popen(R_command.split()) | |
99 process.wait() |