Mercurial > repos > jjohnson > find_in_reference
comparison find_in_reference.py @ 2:30975b3ff0dc
Allow user to add annotation columns from reference to found input entries
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Thu, 23 Jan 2014 10:52:30 -0600 |
parents | 856033fb26e8 |
children |
comparison
equal
deleted
inserted
replaced
1:856033fb26e8 | 2:30975b3ff0dc |
---|---|
40 parser.add_option( '-f', '--filtered', dest='filtered', help='The output file for input lines not in the output') | 40 parser.add_option( '-f', '--filtered', dest='filtered', help='The output file for input lines not in the output') |
41 parser.add_option('-c','--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)') | 41 parser.add_option('-c','--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)') |
42 parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)') | 42 parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)') |
43 parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' ) | 43 parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' ) |
44 parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' ) | 44 parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' ) |
| | 45 parser.add_option( '-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference' ) |
| | 46 parser.add_option( '-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines' ) |
| | 47 parser.add_option( '-S', '--annotation_col_sep', dest='annotation_col_sep', default=',', help='separator character between annotation column from the same line' ) |
45 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) | 48 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) |
46 (options, args) = parser.parse_args() | 49 (options, args) = parser.parse_args() |
47 # Input files | 50 # Input files |
48 if options.input != None: | 51 if options.input != None: |
49 try: | 52 try: |
83 if options.input_column and options.input_column > 0: | 86 if options.input_column and options.input_column > 0: |
84 incol = int(options.input_column)-1 | 87 incol = int(options.input_column)-1 |
85 refcol = -1 | 88 refcol = -1 |
86 if options.reference_column and options.reference_column > 0: | 89 if options.reference_column and options.reference_column > 0: |
87 refcol = int(options.reference_column)-1 | 90 refcol = int(options.reference_column)-1 |
| | 91 if options.annotation_columns: |
| | 92 annotate = True |
| | 93 annotation_columns = [int(x) - 1 for x in options.annotation_columns.split(',')] |
| | 94 else: |
| | 95 annotate = False |
88 refFile = None | 96 refFile = None |
| | 97 num_found = 0 |
| | 98 num_novel = 0 |
89 for ln,line in enumerate(inputFile): | 99 for ln,line in enumerate(inputFile): |
| | 100 annotations = [] |
90 try: | 101 try: |
91 found = False | 102 found = False |
92 search_string = line.split('\t')[incol].rstrip('\r\n') | 103 search_string = line.split('\t')[incol].rstrip('\r\n') |
93 if options.ignore_case: | 104 if options.ignore_case: |
94 search_string = search_string.upper() | 105 search_string = search_string.upper() |
95 if options.debug: | 106 if options.debug: |
96 print >> sys.stderr, "search: %s" % (search_string) | 107 print >> sys.stderr, "search: %s" % (search_string) |
97 refFile = open(options.reference,'r') | 108 refFile = open(options.reference,'r') |
98 for tn,fline in enumerate(refFile): | 109 for tn,fline in enumerate(refFile): |
99 target_string = fline.split('\t')[refcol] | 110 fields = fline.split('\t') |
| | 111 target_string =fields[refcol] |
100 if options.ignore_case: | 112 if options.ignore_case: |
101 target_string = target_string.upper() | 113 target_string = target_string.upper() |
102 if options.debug: | 114 if options.debug: |
103 print >> sys.stderr, "in: %s %s %s" % (search_string,search_string in target_string,target_string) | 115 print >> sys.stderr, "in: %s %s %s" % (search_string,search_string in target_string,target_string) |
104 if search_string in target_string: | 116 if search_string in target_string: |
105 found = True | 117 found = True |
106 break | 118 if annotate: |
| | 119 annotation = options.annotation_col_sep.join([fields[i] for i in annotation_columns]) |
| | 120 annotations.append(annotation) |
| | 121 else: |
| | 122 break |
107 if found: | 123 if found: |
| | 124 num_found += 1 |
| | 125 if annotate: |
| | 126 line = '%s\t%s\n' % (line.rstrip('\r\n'),options.annotation_separator.join(annotations)) |
108 if options.keep == True: | 127 if options.keep == True: |
109 if outFile: | 128 if outFile: |
110 outFile.write(line) | 129 outFile.write(line) |
111 else: | 130 else: |
112 if filteredFile: | 131 if filteredFile: |
113 filteredFile.write(line) | 132 filteredFile.write(line) |
114 else: | 133 else: |
| | 134 num_novel += 1 |
115 if options.keep == True: | 135 if options.keep == True: |
116 if filteredFile: | 136 if filteredFile: |
117 filteredFile.write(line) | 137 filteredFile.write(line) |
118 else: | 138 else: |
119 if outFile: | 139 if outFile: |
121 except Exception, e: | 141 except Exception, e: |
122 print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference,e) | 142 print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference,e) |
123 finally: | 143 finally: |
124 if refFile: | 144 if refFile: |
125 refFile.close() | 145 refFile.close() |
| | 146 print >> sys.stdout, "found: %d novel: %d" % (num_found,num_novel) |
126 | 147 |
127 if __name__ == "__main__" : __main__() | 148 if __name__ == "__main__" : __main__() |
128 | 149 |