#!/usr/bin/python
# Python parser module for size distributions, guided by GFF3
# version 0.9.1 (1-6-2014)
# Usage (argparse options, defined in Parser() below):
#   --input <file ...> --ext <format ...> --label <label ...> --normalization_factor <float ...>
#   --reference_fasta <fasta> | --reference_bowtie_index <bowtie index>
#   --output_size_distribution <output dataframe> --minquery <int> --maxquery <int>
#   [--gff <GFF3>] [--global_size] [--collapse] --rcode <R script>

import sys, subprocess, argparse
from smRtools import *
from collections import OrderedDict, defaultdict
import os

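# Workflow: each input alignment file is parsed into a HandleSmRNAwindows object (smRtools),
# optionally restricted to the items of a GFF3 annotation, written out as a long-format
# size-distribution dataframe (gene / polarity / size / count / sample), and finally plotted
# by the R script passed through --rcode.
#
# Illustrative invocation (script, file and format names below are hypothetical):
#   python size_distribution.py \
#       --input sample1.bam sample2.bam --ext bam bam --label wild_type mutant \
#       --normalization_factor 1.0 1.0 \
#       --reference_bowtie_index /path/to/bowtie/index \
#       --output_size_distribution size_distribution.tab \
#       --minquery 18 --maxquery 28 --rcode plot_size_distribution.R
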
def Parser():
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe")
    the_parser.add_argument('--reference_fasta', action="store", type=str, help="reference fasta file")
    the_parser.add_argument('--reference_bowtie_index', action='store', help="path to the indexed or fasta reference")
    the_parser.add_argument('--input', nargs='+', help="paths to multiple input files")
    the_parser.add_argument('--ext', nargs='+', help="types of the input files")
    the_parser.add_argument('--label', nargs='+', help="labels of multiple input files")
    the_parser.add_argument('--normalization_factor', nargs='+', type=float, help="normalization factor for each input file")
    the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest")
    the_parser.add_argument('--minquery', type=int, help="minimum read size")
    the_parser.add_argument('--maxquery', type=int, help="maximum read size")
    the_parser.add_argument('--rcode', type=str, help="R script")
    the_parser.add_argument('--global_size', action="store_true", help="if specified, the size distribution is calculated for the sum of all items")
    the_parser.add_argument('--collapse', action="store_true", help="if specified, forward and reverse reads are collapsed")
    args = the_parser.parse_args()
    return args

31 args=Parser()
|
|
32 if args.reference_fasta:
|
|
33 genomeRefFormat = "fastaSource"
|
|
34 genomeRefFile = args.reference_fasta
|
|
35 if args.reference_bowtie_index:
|
|
36 genomeRefFormat = "bowtieIndex"
|
|
37 genomeRefFile = args.reference_bowtie_index
|
|
38 size_distribution_file=args.output_size_distribution
|
|
39 minquery=args.minquery
|
|
40 maxquery=args.maxquery
|
|
41 Rcode = args.rcode
|
|
42 filePath=args.input
|
|
43 fileExt=args.ext
|
|
44 fileLabel=args.label
|
|
45 normalization_factor=args.normalization_factor
|
|
46 global_size=args.global_size
|
|
47 collapse=args.collapse
|
|
48
|
|
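# Polarities reported in the output dataframe: with --collapse, forward and reverse
# reads are merged under a single "both" key; otherwise "F" and "R" are kept separate.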
if collapse:
    pol = ["both"]
else:
    pol = ["F", "R"]

MasterListOfGenomes = OrderedDict()

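# Parse each input alignment file into a HandleSmRNAwindows object keyed by its label,
# forwarding the per-sample normalization factor and the minquery/maxquery size bounds.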
def process_samples(file_paths):
    for i, alignment_file in enumerate(file_paths):
        norm = normalization_factor[i]
        print fileLabel[i]
        MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows(
            alignmentFile=alignment_file, alignmentFileFormat=fileExt[i],
            genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,
            biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm)
    return MasterListOfGenomes

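# Write one long-format TSV row per (gene, polarity, read size) combination.
# Illustrative output (gene name and values are hypothetical):
#   gene    polarity    size    count   sample
#   mir-1   F           22      153     wild_type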
def write_size_distribution_dataframe(readDict, size_distribution_file, pol=["both"]):
    '''refactored on 7-9-2014'''
    with open(size_distribution_file, 'w') as size_distrib:
        print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
        for sample in readDict.keys():
            if args.gff:
                item_dict = readDict[sample]
            else:
                item_dict = readDict[sample].instanceDict
            for gene in item_dict.keys():
                histogram = item_dict[gene].size_histogram()
                for polarity in pol:
                    for size, count in histogram[polarity].iteritems():
                        print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, size, count, sample)

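# Same output layout as above, but the histogram is computed over each sample as a whole
# (all items summed), so the gene column holds the constant placeholder "sample".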
def write_size_distribution_dataframe_global(readDict, size_distribution_file, pol=["both"]):
    with open(size_distribution_file, 'w') as size_distrib:
        print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
        for sample in readDict.keys():
            histogram = readDict[sample].size_histogram()
            gene = "sample"
            for polarity in pol:
                for size, count in histogram[polarity].iteritems():
                    print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, size, count, sample)

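# For every GFF3 item, extract the corresponding sub-window from each sample and key it
# by the GFF "Name" attribute. For items annotated on the '-' strand, the signed read
# coordinates are inverted (key * -1), presumably so that read polarity is reported
# relative to the item rather than to the reference.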
def gff_item_subinstances(readDict, gff3):
    GFFinstanceDict = OrderedDict()
    with open(gff3) as gff:
        for line in gff:
            if line[0] == "#":
                continue
            gff_fields = line[:-1].split("\t")
            chrom = gff_fields[0]
            gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0]  # isolate the GFF Name attribute
            item_upstream_coordinate = int(gff_fields[3])
            item_downstream_coordinate = int(gff_fields[4])
            item_polarity = gff_fields[6]
            for sample in readDict.keys():
                if sample not in GFFinstanceDict:
                    GFFinstanceDict[sample] = {}
                subinstance = extractsubinstance(item_upstream_coordinate, item_downstream_coordinate,
                                                 readDict[sample].instanceDict[chrom])
                if item_polarity == '-':
                    # invert the signed read coordinates of items annotated on the '-' strand
                    subinstance.readDict = {key * -1: value for key, value in subinstance.readDict.iteritems()}
#                    subinstance.readDict.setdefault(key, [])
                subinstance.gene = gff_name
                GFFinstanceDict[sample][gff_name] = subinstance
    return GFFinstanceDict

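# Main: parse all samples, optionally slice them by GFF3 items, write the size-distribution
# dataframe (per item, or globally per sample with --global_size), then run the R plotting script.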
MasterListOfGenomes = process_samples(filePath)

if args.gff:
    MasterListOfGenomes = gff_item_subinstances(MasterListOfGenomes, args.gff)

if global_size:
    write_size_distribution_dataframe_global(MasterListOfGenomes, size_distribution_file, pol)
else:
    write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file, pol)

R_command = "Rscript " + Rcode
process = subprocess.Popen(R_command.split())
process.wait()