Mercurial > repos > blankenberg > column_regex_substitution
comparison column_regex_substitution.py @ 0:8d8511030ebf draft default tip
planemo upload for repository https://github.com/blankenberg/tools-blankenberg/tree/master/tools/column_regex_substitution commit 78936dc6be1747303d4cbfd80d09e4cfd1cbf292
| author | blankenberg |
|---|---|
| date | Fri, 07 Sep 2018 10:28:07 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8d8511030ebf |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 #Dan Blankenberg | |
| 3 | |
| 4 """ | |
| 5 A script for using regex substitutions on columns. | |
| 6 """ | |
| 7 | |
| 8 import optparse | |
| 9 import re | |
| 10 import sys | |
| 11 import string | |
| 12 | |
VERSION = "0.0.1"

# Every printable character except the digits and the comma.  Used as the
# strip set when parsing a column specification, so decoration around a column
# number (e.g. the "c" in "c1", or surrounding whitespace) is discarded.
COLUMN_STRIP_VALUES = "".join( set( string.printable ) - set( string.digits ) - set(',') )


def get_provided_columns( provided_value, column_offset ):
    """Parse a comma-separated column specification into sorted indexes.

    Each comma-separated item has leading and trailing non-digit characters
    stripped, is converted to an int, and has ``column_offset`` added to it.

    provided_value -- the column specification string, e.g. "c1,c3"; may be
                      None when no columns were supplied.
    column_offset  -- integer added to every parsed column number.

    Returns the ascending list of adjusted column indexes, or None when the
    specification is missing or cannot be parsed.
    """
    try:
        columns = sorted(
            int( item.strip( COLUMN_STRIP_VALUES ) ) + column_offset
            for item in provided_value.split( ',' )
        )
    except ( AttributeError, TypeError, ValueError ):
        # AttributeError: provided_value is None (no .split).
        # ValueError: an item is empty or still contains non-digit characters.
        # TypeError: column_offset cannot be added to an int.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        return None
    # Keep the original contract: an empty result is reported as None.
    return columns or None
| 25 | |
| 26 | |
def __main__():
    """Command-line entry point: regex-substitute selected columns of a file.

    Reads --input line by line, splits each line on --delimiter (or treats the
    whole line as a single field when no delimiter is given), applies
    re.sub(--pattern, --replacement) to the fields selected by --columns (all
    fields when --columns is missing or unparseable) and writes the re-joined
    line, terminated by a newline, to --output.

    The first --skip input lines are dropped; they are not copied to the
    output.  --pattern, --replacement, --input and --output are required;
    when any is missing the help text is printed and the process exits with
    status 1.  --version prints the tool version and exits with status 0.
    """
    parser = optparse.OptionParser()
    parser.add_option( '--pattern', action='store', default=None,
                       help='pattern string' )
    parser.add_option( '--replacement', action='store', default=None,
                       help='replacement string' )
    parser.add_option( '--input', action='store', default=None,
                       help='Filename of input file' )
    parser.add_option( '--output', action='store', default=None,
                       help='Filename of output file' )
    parser.add_option( '--delimiter', action='store', default=None,
                       help='column delimiter' )
    parser.add_option( '--columns', action='store', default=None,
                       help='columns to operate on' )
    parser.add_option( '--column_offset', action='store', default=0,
                       help='offset to apply to columns index to force to zero-based' )
    parser.add_option( '--skip', action='store', default=0,
                       help='Number of lines to skip' )
    parser.add_option( '--version', action='store_true', default=False,
                       help='Show version' )

    options, _ = parser.parse_args()

    # print is written in the single-argument parenthesised form, which emits
    # the same text as the Python 2 print statement.
    if options.version:
        print( "blankenberg_python_regex_substitution %s" % ( VERSION ) )
        sys.exit(0)

    # --input is required too: without it open() below would be handed None.
    if None in [ options.pattern, options.replacement, options.input, options.output ]:
        parser.print_help()
        sys.exit(1)

    replacement = options.replacement
    delimiter = options.delimiter
    column_offset = int( options.column_offset )
    # optparse returns command-line values as strings.  Comparing the integer
    # line index with a string never behaves as a numeric test, so convert
    # once here, exactly as is done for column_offset.
    skip = int( options.skip )

    print( "Pattern: %s\nReplacement: %s" % ( repr( options.pattern ), repr( replacement ) ) )
    pattern = re.compile( options.pattern )

    provided_columns = get_provided_columns( options.columns, column_offset )
    if provided_columns:
        # Report the columns as the user wrote them, i.e. without the offset.
        column_str = ", ".join( str( col - column_offset ) for col in provided_columns )
    else:
        column_str = 'all'
    print( "With delimiter %s, on columns: %s" % ( repr( delimiter ), column_str ) )

    if delimiter is None:
        # No delimiter: the whole line is one field.
        split_func = lambda x: [ x.rstrip( '\n\r' ) ]
        join_char = ""
    else:
        split_func = lambda x: x.rstrip( '\n\r' ).split( delimiter )
        join_char = delimiter

    # NOTE(review): the input is opened in 'rb' and then handled with str
    # methods, and the output is opened in text mode; this mirrors the
    # original script -- confirm before running under an interpreter where
    # bytes and str are distinct types.
    with open( options.input, 'rb' ) as fin:
        with open( options.output, 'w' ) as fout:
            for i, line in enumerate( fin ):
                if i < skip:
                    continue
                fields = split_func( line )
                field_count = len( fields )
                columns = provided_columns or range( field_count )
                for j in columns:
                    # get_provided_columns returns the indexes in ascending
                    # order, so the first out-of-range index ends the scan.
                    if j >= field_count:
                        break
                    fields[ j ] = pattern.sub( replacement, fields[ j ] )
                # Line endings are normalised: every output line ends in "\n".
                fout.write( "%s\n" % ( join_char.join( fields ) ) )
| 91 | |
# Run the command-line tool only when this file is executed as a script;
# importing it as a module must not trigger argument parsing.
if __name__ == "__main__":
    __main__()
