diff find_in_reference.py @ 2:30975b3ff0dc

Allow the user to append annotation columns from the reference file to input entries that are found in it
author Jim Johnson <jj@umn.edu>
date Thu, 23 Jan 2014 10:52:30 -0600
parents 856033fb26e8
children
line wrap: on
line diff
--- a/find_in_reference.py	Fri Jan 17 14:50:53 2014 -0600
+++ b/find_in_reference.py	Thu Jan 23 10:52:30 2014 -0600
@@ -42,6 +42,9 @@
   parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)')
   parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' )
   parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' )
+  parser.add_option( '-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference' )
+  parser.add_option( '-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines' )
+  parser.add_option( '-S', '--annotation_col_sep', dest='annotation_col_sep', default=',', help='separator character between annotation column from the same line' )
   parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout'  )
   (options, args) = parser.parse_args()
   # Input files
@@ -85,8 +88,16 @@
   refcol = -1
   if options.reference_column and options.reference_column > 0:
     refcol = int(options.reference_column)-1
+  if options.annotation_columns:
+    annotate = True
+    annotation_columns = [int(x) - 1 for x in options.annotation_columns.split(',')]
+  else:
+    annotate = False
   refFile = None
+  num_found = 0
+  num_novel = 0
   for ln,line in enumerate(inputFile):
+    annotations = []
     try:
       found = False
       search_string = line.split('\t')[incol].rstrip('\r\n')
@@ -96,22 +107,31 @@
         print >> sys.stderr, "search: %s" % (search_string)
       refFile = open(options.reference,'r')
       for tn,fline in enumerate(refFile):
-        target_string = fline.split('\t')[refcol]
+        fields = fline.split('\t')
+        target_string =fields[refcol]
         if options.ignore_case:
           target_string = target_string.upper()
         if options.debug: 
           print >> sys.stderr, "in: %s %s %s" % (search_string,search_string in target_string,target_string)
         if search_string in target_string:
           found = True
-          break
+          if annotate:
+            annotation = options.annotation_col_sep.join([fields[i] for i in annotation_columns])
+            annotations.append(annotation)  
+          else:
+            break
       if found:
+        num_found += 1
+        if annotate:
+          line = '%s\t%s\n' % (line.rstrip('\r\n'),options.annotation_separator.join(annotations))
         if options.keep == True:
           if outFile:
-              outFile.write(line)
+            outFile.write(line)
         else:
           if filteredFile:
             filteredFile.write(line)
       else:
+        num_novel += 1
         if options.keep == True:
           if filteredFile:
             filteredFile.write(line)
@@ -123,6 +143,7 @@
     finally:
       if refFile:
         refFile.close()
+  print >> sys.stdout, "found: %d novel: %d" % (num_found,num_novel)
 
 if __name__ == "__main__" : __main__()