Mercurial > repos > devteam > weightedaverage
diff WeightedAverage.py @ 0:e11314245c2a draft
Imported from capsule None
author | devteam |
---|---|
date | Tue, 01 Apr 2014 09:11:55 -0400 |
parents | |
children | 68a40b074399 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/WeightedAverage.py Tue Apr 01 09:11:55 2014 -0400 @@ -0,0 +1,94 @@ +#!/usr/bin/env python +""" +usage: %prog bed_file_1 bed_file_2 out_file + -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file + -2, --cols2=N,N,N,N,N: Columns for chr, start, end, strand, name/value in second file +""" + +import collections +import sys +#import numpy +from galaxy import eggs +import pkg_resources +pkg_resources.require( "bx-python" ) +from galaxy.tools.util.galaxyops import * +from bx.cookbook import doc_optparse + + +#export PYTHONPATH=~/galaxy/lib/ +#running command python WeightedAverage.py interval_interpolate.bed value_interpolate.bed interpolate_result.bed + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + + +def FindRate(chromosome, start_stop, dictType): + OverlapList = [] + for tempO in dictType[chromosome]: + DatabaseInterval = [tempO[0], tempO[1]] + Overlap = GetOverlap( start_stop, DatabaseInterval ) + if Overlap > 0: + OverlapList.append([Overlap, tempO[2]]) + + if len(OverlapList) > 0: + SumRecomb = 0 + SumOverlap = 0 + for member in OverlapList: + SumRecomb += member[0]*member[1] + SumOverlap += member[0] + averageRate = SumRecomb/SumOverlap + return averageRate + else: + return 'NA' + + +def GetOverlap(a, b): + return min(a[1], b[1])-max(a[0], b[0]) + + +options, args = doc_optparse.parse( __doc__ ) + +try: + chr_col_1, start_col_1, end_col_1, strand_col1 = parse_cols_arg( options.cols1 ) + chr_col_2, start_col_2, end_col_2, strand_col2, name_col_2 = parse_cols_arg( options.cols2 ) + input1, input2, input3 = args +except Exception, eee: + print eee + stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." ) + +fd2 = open(input2) +lines2 = fd2.readlines() +RecombChrDict = collections.defaultdict(list) + +skipped = 0 +for line in lines2: + temp = line.strip().split() + try: + assert float(temp[int(name_col_2)]) + except: + skipped += 1 + continue + tempIndex = [int(temp[int(start_col_2)]), int(temp[int(end_col_2)]), float(temp[int(name_col_2)])] + RecombChrDict[temp[int(chr_col_2)]].append(tempIndex) + +print "Skipped %d features with invalid values" % (skipped) + +fd1 = open(input1) +lines = fd1.readlines() +finalProduct = '' +for line in lines: + temp = line.strip().split('\t') + chromosome = temp[int(chr_col_1)] + start = int(temp[int(start_col_1)]) + stop = int(temp[int(end_col_1)]) + start_stop = [start, stop] + RecombRate = FindRate( chromosome, start_stop, RecombChrDict ) + try: + RecombRate = "%.4f" % (float(RecombRate)) + except: + RecombRate = RecombRate + finalProduct += line.strip()+'\t'+str(RecombRate)+'\n' +fdd = open(input3, 'w') +fdd.writelines(finalProduct) +fdd.close()