Mercurial > repos > drosofff > msp_sr_readmap_and_size_histograms
annotate readmap.py @ 24:bf7388df53cf draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
author | drosofff |
---|---|
date | Sat, 08 Oct 2016 07:18:45 -0400 |
parents | d6b93af0da55 |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/python |
2 # python parser module for for readmaps and size distributions, guided by GFF3 | |
3 # version 0.9.1 (1-6-2014) | |
4 # Usage readmap.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> | |
5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> | |
6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> | |
7 | |
8 import sys, subprocess, argparse | |
9 from smRtools import * | |
10 from collections import OrderedDict, defaultdict | |
11 import os | |
12 | |
13 def Parser(): | |
14 the_parser = argparse.ArgumentParser() | |
15 the_parser.add_argument('--output_readmap', action="store", type=str, help="readmap dataframe") | |
16 the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe") | |
17 the_parser.add_argument('--reference_fasta', action="store", type=str, help="output file") | |
18 the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references") | |
19 the_parser.add_argument('--input',nargs='+', help="paths to multiple input files") | |
20 the_parser.add_argument('--ext',nargs='+', help="input file type") | |
21 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") | |
22 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file") | |
23 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") | |
24 the_parser.add_argument('--minquery', type=int, help="Minimum readsize") | |
25 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") | |
26 args = the_parser.parse_args() | |
27 return args | |
28 | |
29 args=Parser() | |
30 if args.reference_fasta: | |
31 genomeRefFormat = "fastaSource" | |
32 genomeRefFile = args.reference_fasta | |
33 if args.reference_bowtie_index: | |
34 genomeRefFormat = "bowtieIndex" | |
35 genomeRefFile = args.reference_bowtie_index | |
36 readmap_file=args.output_readmap | |
37 size_distribution_file=args.output_size_distribution | |
38 minquery=args.minquery | |
39 maxquery=args.maxquery | |
40 filePath=args.input | |
41 fileExt=args.ext | |
42 fileLabel=args.label | |
43 normalization_factor=args.normalization_factor | |
44 | |
45 MasterListOfGenomes = OrderedDict() | |
46 | |
47 def process_samples(filePath): | |
48 for i, filePath in enumerate(filePath): | |
49 norm=normalization_factor[i] | |
50 print fileLabel[i] | |
51 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\ | |
52 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) | |
53 return MasterListOfGenomes | |
54 | |
24
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
55 def remove_null_entries(listofdatalines): |
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
56 """ |
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
57 This function removes genes that have no reads aligned. |
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
58 """ |
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
59 Dict = defaultdict(float) |
0 | 60 for line in listofdatalines: |
61 fields= line.split("\t") | |
24
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
62 Dict[fields[0]] += abs(float(fields[2])) |
0 | 63 filtered_list = [] |
64 for line in listofdatalines: | |
65 fields= line.split("\t") | |
66 if Dict[fields[0]] != 0: | |
24
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
67 filtered_list.append(line) |
0 | 68 return filtered_list |
69 | |
16
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
70 |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
71 def listify_plottable_item(item): |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
72 """ |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
73 plottable is a list of strings: |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
74 'FBti0020401\t78\t-1.0\tR' |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
75 split on tab and return gene, coordinate, count and orientation |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
76 """ |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
77 gene, coordinate, count, orientation = item.split("\t") |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
78 return gene, coordinate, count, orientation |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
79 |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
80 def lookup_gene_length(gene, readDict): |
19
f75315939afe
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 9237338d798251fb2667280d597746e852f3ffcc-dirty
drosofff
parents:
18
diff
changeset
|
81 return readDict[readDict.keys()[0]].instanceDict[gene].size |
16
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
82 |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
83 def handle_start_stop_coordinates(plottable, readDict): |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
84 """ |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
85 To ensure that the plot area always includes the correct start and end coordinates, |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
86 we add an entry at start [coordinate 0] and end [last coordinate] of count 0, if these do not exist. |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
87 """ |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
88 first_line = plottable[0] |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
89 last_line = plottable[-1] |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
90 gene, coordinate, count, orientation = listify_plottable_item(first_line) |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
91 if not coordinate == "0": |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
92 new_line = "\t".join([gene, "0", "0", "F"]) |
17
a118d511a27c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 032b3f084f3b2a6d42ba476ef55f4de593b58606-dirty
mvdbeek
parents:
16
diff
changeset
|
93 plottable = [new_line] + plottable |
16
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
94 gene_length = str(lookup_gene_length(gene, readDict)) |
18
893560ece89f
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 9237338d798251fb2667280d597746e852f3ffcc-dirty
mvdbeek
parents:
17
diff
changeset
|
95 gene, coordinate, count, orientation = listify_plottable_item(last_line) |
16
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
96 if not coordinate == gene_length: |
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
97 last_line = "\t".join([gene, gene_length, "0", "F"]) |
17
a118d511a27c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 032b3f084f3b2a6d42ba476ef55f4de593b58606-dirty
mvdbeek
parents:
16
diff
changeset
|
98 plottable = plottable + [last_line] |
18
893560ece89f
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 9237338d798251fb2667280d597746e852f3ffcc-dirty
mvdbeek
parents:
17
diff
changeset
|
99 return plottable |
16
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
100 |
0 | 101 def write_readplot_dataframe(readDict, readmap_file): |
102 listoflines = [] | |
103 with open(readmap_file, 'w') as readmap: | |
104 print >>readmap, "gene\tcoord\tcount\tpolarity\tsample" | |
105 for sample in readDict.keys(): | |
106 if args.gff: | |
107 dict=readDict[sample] | |
108 else: | |
109 dict=readDict[sample].instanceDict | |
110 for gene in dict.keys(): | |
111 plottable = dict[gene].readplot() | |
18
893560ece89f
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 9237338d798251fb2667280d597746e852f3ffcc-dirty
mvdbeek
parents:
17
diff
changeset
|
112 plottable = handle_start_stop_coordinates(plottable, readDict) |
0 | 113 for line in plottable: |
114 listoflines.append ("%s\t%s" % (line, sample)) | |
24
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
115 listoflines = remove_null_entries(listoflines) |
0 | 116 for line in listoflines: |
117 print >>readmap, line | |
118 | |
119 def write_size_distribution_dataframe(readDict, size_distribution_file): | |
120 listoflines = [] | |
121 with open(size_distribution_file, 'w') as size_distrib: | |
122 print >>size_distrib, "gene\tsize\tcount\tpolarity\tsample" # test before was "gene\tpolarity\tsize\tcount\tsample" | |
123 for sample in readDict.keys(): | |
124 if args.gff: | |
125 dict=readDict[sample] | |
126 else: | |
127 dict=readDict[sample].instanceDict | |
128 for gene in dict.keys(): | |
24
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
129 histogram = dict[gene].size_histogram(minquery=minquery, maxquery=maxquery) |
0 | 130 for polarity in histogram.keys(): |
131 if polarity=='both': | |
132 continue | |
133 for size, count in histogram[polarity].iteritems(): | |
134 listoflines.append ("%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) ) | |
24
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
135 listoflines = remove_null_entries(listoflines) |
0 | 136 for line in listoflines: |
24
bf7388df53cf
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 3effd45f45c37a6cdaf9b7b1da1ed4d10d3b0e38
drosofff
parents:
23
diff
changeset
|
137 print >>size_distrib, line |
0 | 138 |
139 def gff_item_subinstances(readDict, gff3): | |
140 GFFinstanceDict=OrderedDict() | |
141 for sample in readDict.keys(): | |
142 GFFinstanceDict[sample]={} # to implement the 2nd level of directionary in an OrderedDict Class object (would not be required with defaultdict Class) | |
143 with open(gff3) as gff: | |
144 for line in gff: | |
145 if line[0] == "#": continue | |
146 gff_fields = line[:-1].split("\t") | |
147 chrom = gff_fields[0] | |
148 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name | |
149 item_upstream_coordinate = int(gff_fields[3]) | |
150 item_downstream_coordinate = int(gff_fields[4]) | |
151 item_polarity = gff_fields[6] | |
152 for sample in readDict.keys(): | |
153 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) | |
154 if item_polarity == '-': | |
155 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} | |
156 subinstance.gene=gff_name | |
157 GFFinstanceDict[sample][gff_name]=subinstance | |
158 return GFFinstanceDict | |
159 | |
160 MasterListOfGenomes=process_samples(filePath) | |
161 | |
162 if args.gff: | |
163 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff) | |
164 | |
165 write_readplot_dataframe(MasterListOfGenomes, readmap_file) | |
166 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file) | |
167 |