Mercurial > repos > drosofff > mir_parser
comparison MirParser.py @ 0:035df35a257e draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
| author | drosofff |
|---|---|
| date | Mon, 29 Jun 2015 05:50:44 -0400 |
| parents | |
| children | 101fec3cba04 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:035df35a257e |
|---|---|
#!/usr/bin/env python
# python parser module for pre-mir and mature miRNAs, guided by mirbase.org GFF3
# version 0.0.9 (1-6-2014)
# Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3>
#                     <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF">
#                     <9:10:11 filePath:FileExt:FileLabel> <.. ad lib>
#
# What the script does:
#   1. builds one HandleSmRNAwindows object (from smRtools) per alignment file;
#   2. for every non-comment GFF3 line, asks each sample for its read count over
#      the annotated interval and collects one tab-separated row per GFF3 item;
#   3. writes rows whose text contains neither "5p" nor "3p" to the pre-mir
#      table, and rows containing "5p" or "3p" to the mature-miR table;
#   4. optionally (when argument 6 is not "dummy_dataframe_path") writes a
#      read-coverage dataframe for the pre-mir items and runs the R script
#      given as argument 7 with Rscript.

import subprocess
import sys

from smRtools import *

# Sentinel passed as argument 6 when no lattice dataframe / plot is requested.
NO_LATTICE = "dummy_dataframe_path"

# Extraction directive (argument 2) -> genomeRefFormat handed to HandleSmRNAwindows.
GENOME_REF_FORMATS = {
    "--do_not_extract_index": "fastaSource",
    "--extract_index": "bowtieIndex",
}


def _is_mature(text):
    """Return True when *text* carries a "5p" or "3p" arm tag."""
    return ("5p" in text) or ("3p" in text)


def _write_table(path, header_line, rows):
    """Write *header_line*, then *rows* one per line, to *path*.

    A newline is always written after the joined rows, even when *rows* is
    empty, so the output is byte-identical to the former
    ``print >> handle, "\\n".join(rows)``.
    """
    with open(path, "w") as handle:
        handle.write(header_line + "\n")
        handle.write("\n".join(rows) + "\n")


IndexSource = sys.argv[1]
ExtractionDirective = sys.argv[2]
if ExtractionDirective not in GENOME_REF_FORMATS:
    # Previously genomeRefFormat was simply left undefined here, which only
    # surfaced later as an unrelated NameError.
    sys.exit("MirParser.py: unknown extraction directive %r (expected %s)"
             % (ExtractionDirective, " or ".join(sorted(GENOME_REF_FORMATS))))
genomeRefFormat = GENOME_REF_FORMATS[ExtractionDirective]
OutputPre_mirs = sys.argv[3]
OutputMature_Mirs = sys.argv[4]
GFF3_file = sys.argv[5]
lattice = sys.argv[6]
Rcode = sys.argv[7]
latticePDF = sys.argv[8]  # not used below; the access still enforces the argument count

# Remaining arguments come in groups of three: filePath, FileExt, FileLabel.
sample_args = sys.argv[9:]
if len(sample_args) % 3:
    sys.exit("MirParser.py: alignment arguments must come in "
             "filePath FileExt FileLabel triplets (got %d values)" % len(sample_args))
Triplets = [sample_args[i:i + 3] for i in range(0, len(sample_args), 3)]
MasterListOfGenomes = {}

for filePath, FileExt, FileLabel in Triplets:
    sys.stdout.write(FileLabel + "\n")
    MasterListOfGenomes[FileLabel] = HandleSmRNAwindows(
        alignmentFile=filePath,
        alignmentFileFormat=FileExt,
        genomeRefFile=IndexSource,
        genomeRefFormat=genomeRefFormat,
        biosample=FileLabel,
    )

# table header: gene, sample1, sample2, sample3, etc. separated by tabulation
header = ["gene"] + [FileLabel for filePath, FileExt, FileLabel in Triplets]
hit_table = ["\t".join(header)]

## read GFF3 to subinstantiate
lattice_dataframe = []
with open(GFF3_file, "r") as gff3:
    for line in gff3:
        # Strip only the line terminator: slicing off the last character
        # would eat real data on a final line without a trailing newline.
        line = line.rstrip("\r\n")
        if not line or line[0] == "#":
            continue
        gff_fields = line.split("\t")
        chrom = gff_fields[0]
        gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0]  # to isolate the GFF Name
        item_upstream_coordinate = int(gff_fields[3])
        item_downstream_coordinate = int(gff_fields[4])
        # Anything other than "+" in the strand column is treated as reverse.
        if gff_fields[6] == "+":
            item_polarity = "forward"
        else:
            item_polarity = "reverse"
        item_line = [gff_name]
        for sample in header[1:]:
            # NOTE(review): assumes every GFF3 seqid is a key of instanceDict;
            # a missing one raises KeyError here -- confirm against smRtools.
            window = MasterListOfGenomes[sample].instanceDict[chrom]
            count = window.readcount(
                upstream_coord=item_upstream_coordinate,
                downstream_coord=item_downstream_coordinate,
                polarity=item_polarity,
            )
            item_line.append(str(count))
            ## subtreatment for lattice: coverage of pre-mir items only, per sample
            if lattice != NO_LATTICE and not _is_mature(gff_name):
                lattice_dataframe.append(window.readcoverage(
                    upstream_coord=item_upstream_coordinate,
                    downstream_coord=item_downstream_coordinate,
                    windowName=gff_name + "_" + sample,
                ))
            ## end of subtreatment for lattice
        hit_table.append("\t".join(item_line))

# Sort the data rows once and split them on the row text, as before
# (a row is the GFF name followed by the stringified counts).
sorted_rows = sorted(hit_table[1:])
finalPreList = [row for row in sorted_rows if not _is_mature(row)]
finalMatureList = [row for row in sorted_rows if _is_mature(row)]

_write_table(OutputPre_mirs, hit_table[0], finalPreList)
_write_table(OutputMature_Mirs, hit_table[0], finalMatureList)

if lattice != NO_LATTICE:
    lattice_header = "%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
        "sample", "mir", "offset", "offsetNorm", "counts", "countsNorm", "polarity")
    _write_table(lattice, lattice_header, lattice_dataframe)
    # Pass the script path as a single argv element so that a path containing
    # whitespace is not split. The exit status of Rscript is not checked,
    # exactly as in the previous Popen(...).wait() version.
    subprocess.call(["Rscript", Rcode])
