diff WeightedAverage.py @ 0:e11314245c2a draft

Imported from capsule None
author devteam
date Tue, 01 Apr 2014 09:11:55 -0400
parents
children 68a40b074399
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/WeightedAverage.py	Tue Apr 01 09:11:55 2014 -0400
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+"""
+usage: %prog bed_file_1 bed_file_2 out_file
+    -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file
+    -2, --cols2=N,N,N,N,N: Columns for chr, start, end, strand, name/value in second file
+"""
+
+import collections
+import sys
+#import numpy
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+from galaxy.tools.util.galaxyops import *
+from bx.cookbook import doc_optparse
+
+
+#export PYTHONPATH=~/galaxy/lib/
+#running command python WeightedAverage.py interval_interpolate.bed value_interpolate.bed interpolate_result.bed
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+def FindRate(chromosome, start_stop, dictType):
+    OverlapList = []
+    for tempO in dictType[chromosome]:
+        DatabaseInterval = [tempO[0], tempO[1]]
+        Overlap = GetOverlap( start_stop, DatabaseInterval )
+        if Overlap > 0:
+            OverlapList.append([Overlap, tempO[2]])
+    
+    if len(OverlapList) > 0:
+        SumRecomb = 0
+        SumOverlap = 0
+        for member in OverlapList:
+            SumRecomb += member[0]*member[1]
+            SumOverlap += member[0]
+        averageRate = SumRecomb/SumOverlap
+        return averageRate
+    else:
+        return 'NA'
+
+
+def GetOverlap(a, b):
+    return min(a[1], b[1])-max(a[0], b[0])
+
+
+options, args = doc_optparse.parse( __doc__ )
+
+try:
+    chr_col_1, start_col_1, end_col_1, strand_col1 = parse_cols_arg( options.cols1 )
+    chr_col_2, start_col_2, end_col_2, strand_col2, name_col_2 = parse_cols_arg( options.cols2 )
+    input1, input2, input3 = args
+except Exception, eee:
+    print eee
+    stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." )
+
+fd2 = open(input2)
+lines2 = fd2.readlines()
+RecombChrDict = collections.defaultdict(list)
+
+skipped = 0
+for line in lines2:
+    temp = line.strip().split()
+    try:
+        assert float(temp[int(name_col_2)])
+    except:
+        skipped += 1
+        continue
+    tempIndex = [int(temp[int(start_col_2)]), int(temp[int(end_col_2)]), float(temp[int(name_col_2)])]
+    RecombChrDict[temp[int(chr_col_2)]].append(tempIndex)
+
+print "Skipped %d features with invalid values" % (skipped)
+
+fd1 = open(input1)
+lines = fd1.readlines()
+finalProduct = ''
+for line in lines:
+    temp = line.strip().split('\t')
+    chromosome = temp[int(chr_col_1)]
+    start = int(temp[int(start_col_1)])
+    stop = int(temp[int(end_col_1)])
+    start_stop = [start, stop]
+    RecombRate = FindRate( chromosome, start_stop, RecombChrDict )
+    try:
+        RecombRate = "%.4f" % (float(RecombRate))
+    except:
+        RecombRate = RecombRate
+    finalProduct += line.strip()+'\t'+str(RecombRate)+'\n'
+fdd = open(input3, 'w')
+fdd.writelines(finalProduct)
+fdd.close()