|
0
|
1 #!/usr/bin/python
|
|
|
2 # python parser module to analyse Features in sRbowtie alignments (guided by a GFF3 file)
|
|
|
3 # version 0.9
|
|
|
4 # Usage FeaturesParser.py <1:index source> <2:extraction directive> <3:output> <4:GFF3 guide file> <5:6:7 filePath:FileExt:FileLabel> <.. ad lib>
|
|
|
5
|
|
|
6 import sys
|
|
|
7 from smRtools import *
|
|
|
8 from collections import *
|
|
|
9
|
|
|
10 IndexSource = sys.argv[1]
|
|
|
11 ExtractionDirective = sys.argv[2]
|
|
|
12 if ExtractionDirective == "--do_not_extract_index":
|
|
|
13 genomeRefFormat = "fastaSource"
|
|
|
14 elif ExtractionDirective == "--extract_index":
|
|
|
15 genomeRefFormat = "bowtieIndex"
|
|
|
16 Output = sys.argv[3]
|
|
|
17 GFF3_file = sys.argv[4]
|
|
|
18 Triplets = [sys.argv[5:][i:i+3] for i in xrange(0, len(sys.argv[5:]), 3)]
|
|
|
19 MasterListOfGenomes = {}
|
|
|
20 FeatureDict = defaultdict(dict)
|
|
|
21
|
|
|
22 for [filePath, FileExt, FileLabel] in Triplets:
|
|
|
23 MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (filePath, FileExt, IndexSource, genomeRefFormat)
|
|
|
24 FeatureDict[FileLabel] = MasterListOfGenomes[FileLabel].CountFeatures(GFF3=GFF3_file)
|
|
|
25
|
|
|
26 # add some code to pick up the GFF3 features in their order of appearence.
|
|
|
27 F = open(GFF3_file, "r")
|
|
|
28 featureList = []
|
|
|
29 for line in F:
|
|
|
30 if line[0] == "#": continue
|
|
|
31 feature = line.split()[2]
|
|
|
32 if feature not in featureList:
|
|
|
33 featureList.append(feature)
|
|
|
34 F.close()
|
|
|
35
|
|
|
36 header = ["#Feature"]
|
|
|
37 for [filePath, FileExt, FileLabel] in Triplets:
|
|
|
38 header.append(FileLabel)
|
|
|
39
|
|
|
40 F = open (sys.argv[3], "w")
|
|
|
41 print >> F, "\t".join(header)
|
|
|
42 for feature in featureList:
|
|
|
43 line=[feature]
|
|
|
44 for sample in header[1:]:
|
|
|
45 count = str (FeatureDict[sample][feature])
|
|
|
46 # uncomment to get percentage in addition to counts
|
|
|
47 # percent = float(FeatureDict[sample][feature]) / MasterListOfGenomes[sample].alignedReads
|
|
|
48 # value = "%s | %0.2f" % (count, percent)
|
|
|
49 # line.append(value)
|
|
|
50 line.append(count)
|
|
|
51 print >> F, "\t".join(line )
|
|
|
52 line = ["Unfeatured"]
|
|
|
53 for sample in header[1:]:
|
|
|
54 matched = 0
|
|
|
55 for feature in FeatureDict[sample]:
|
|
|
56 matched += FeatureDict[sample][feature]
|
|
|
57 unmatched = MasterListOfGenomes[sample].alignedReads - matched
|
|
|
58 # uncomment to get percentage in addition to counts
|
|
|
59 # percent = float (unmatched) / (matched + unmatched)
|
|
|
60 # value = "%s | %0.2f" % (unmatched, percent)
|
|
|
61 # line.append(value)
|
|
|
62 line.append("%s" % unmatched)
|
|
|
63 print >> F, "\t".join(line)
|
|
|
64 F.close()
|