comparison find_in_reference.py @ 2:30975b3ff0dc

Allow the user to add annotation columns from the reference file to input entries that are found in it
author Jim Johnson <jj@umn.edu>
date Thu, 23 Jan 2014 10:52:30 -0600
parents 856033fb26e8
children
comparison
equal deleted inserted replaced
1:856033fb26e8 2:30975b3ff0dc
40 parser.add_option( '-f', '--filtered', dest='filtered', help='The output file for input lines not in the output') 40 parser.add_option( '-f', '--filtered', dest='filtered', help='The output file for input lines not in the output')
41 parser.add_option('-c','--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)') 41 parser.add_option('-c','--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)')
42 parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)') 42 parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)')
43 parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' ) 43 parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' )
44 parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' ) 44 parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' )
45 parser.add_option( '-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference' )
46 parser.add_option( '-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines' )
47 parser.add_option( '-S', '--annotation_col_sep', dest='annotation_col_sep', default=',', help='separator character between annotation column from the same line' )
45 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) 48 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
46 (options, args) = parser.parse_args() 49 (options, args) = parser.parse_args()
47 # Input files 50 # Input files
48 if options.input != None: 51 if options.input != None:
49 try: 52 try:
83 if options.input_column and options.input_column > 0: 86 if options.input_column and options.input_column > 0:
84 incol = int(options.input_column)-1 87 incol = int(options.input_column)-1
85 refcol = -1 88 refcol = -1
86 if options.reference_column and options.reference_column > 0: 89 if options.reference_column and options.reference_column > 0:
87 refcol = int(options.reference_column)-1 90 refcol = int(options.reference_column)-1
91 if options.annotation_columns:
92 annotate = True
93 annotation_columns = [int(x) - 1 for x in options.annotation_columns.split(',')]
94 else:
95 annotate = False
88 refFile = None 96 refFile = None
97 num_found = 0
98 num_novel = 0
89 for ln,line in enumerate(inputFile): 99 for ln,line in enumerate(inputFile):
100 annotations = []
90 try: 101 try:
91 found = False 102 found = False
92 search_string = line.split('\t')[incol].rstrip('\r\n') 103 search_string = line.split('\t')[incol].rstrip('\r\n')
93 if options.ignore_case: 104 if options.ignore_case:
94 search_string = search_string.upper() 105 search_string = search_string.upper()
95 if options.debug: 106 if options.debug:
96 print >> sys.stderr, "search: %s" % (search_string) 107 print >> sys.stderr, "search: %s" % (search_string)
97 refFile = open(options.reference,'r') 108 refFile = open(options.reference,'r')
98 for tn,fline in enumerate(refFile): 109 for tn,fline in enumerate(refFile):
99 target_string = fline.split('\t')[refcol] 110 fields = fline.split('\t')
111 target_string =fields[refcol]
100 if options.ignore_case: 112 if options.ignore_case:
101 target_string = target_string.upper() 113 target_string = target_string.upper()
102 if options.debug: 114 if options.debug:
103 print >> sys.stderr, "in: %s %s %s" % (search_string,search_string in target_string,target_string) 115 print >> sys.stderr, "in: %s %s %s" % (search_string,search_string in target_string,target_string)
104 if search_string in target_string: 116 if search_string in target_string:
105 found = True 117 found = True
106 break 118 if annotate:
119 annotation = options.annotation_col_sep.join([fields[i] for i in annotation_columns])
120 annotations.append(annotation)
121 else:
122 break
107 if found: 123 if found:
124 num_found += 1
125 if annotate:
126 line = '%s\t%s\n' % (line.rstrip('\r\n'),options.annotation_separator.join(annotations))
108 if options.keep == True: 127 if options.keep == True:
109 if outFile: 128 if outFile:
110 outFile.write(line) 129 outFile.write(line)
111 else: 130 else:
112 if filteredFile: 131 if filteredFile:
113 filteredFile.write(line) 132 filteredFile.write(line)
114 else: 133 else:
134 num_novel += 1
115 if options.keep == True: 135 if options.keep == True:
116 if filteredFile: 136 if filteredFile:
117 filteredFile.write(line) 137 filteredFile.write(line)
118 else: 138 else:
119 if outFile: 139 if outFile:
121 except Exception, e: 141 except Exception, e:
122 print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference,e) 142 print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference,e)
123 finally: 143 finally:
124 if refFile: 144 if refFile:
125 refFile.close() 145 refFile.close()
146 print >> sys.stdout, "found: %d novel: %d" % (num_found,num_novel)
126 147
127 if __name__ == "__main__" : __main__() 148 if __name__ == "__main__" : __main__()
128 149