Mercurial > repos > bgruening > split_file_to_collection
comparison split_file_to_collection.py @ 4:b2ad7eb9bab7 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
| author | bgruening |
|---|---|
| date | Wed, 09 Oct 2019 07:34:19 -0400 |
| parents | 128fb354ed42 |
| children | |
comparison
equal
deleted
inserted
replaced
| 3:128fb354ed42 | 4:b2ad7eb9bab7 |
|---|---|
| 14 """ | 14 """ |
| 15 FILETYPES = {'fasta': '^>', | 15 FILETYPES = {'fasta': '^>', |
| 16 'fastq': '^@', | 16 'fastq': '^@', |
| 17 'tabular': '^.*', | 17 'tabular': '^.*', |
| 18 'txt': '^.*', | 18 'txt': '^.*', |
| 19 'mgf': '^BEGIN IONS'} | 19 'mgf': '^BEGIN IONS', |
| | 20 'sdf': '\$\$\$\$', |
| | 21 } |
| 20 | 22 |
| 21 | 23 |
| 22 def main(): | 24 def main(): |
| 23 ps = parser_cli() | 25 ps = parser_cli() |
| 24 args = vars(ps.parse_args()) | 26 args = vars(ps.parse_args()) |
| 57 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) | 59 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) |
| 58 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") | 60 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") |
| 59 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + | 61 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + |
| 60 " the extension of the new files (without a period)") | 62 " the extension of the new files (without a period)") |
| 61 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, | 63 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, |
| 62 choices=["mgf", "fastq", "fasta", "tabular", "txt", "generic"]) | 64 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) |
| 63 parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False) | 65 parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False) |
| 64 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", | 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", |
| 65 default = "row", choices = ["col", "row"]) | 67 default = "row", choices = ["col", "row"]) |
| 66 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " + | 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " + |
| 67 "(tabular only).") | 69 "(tabular only).") |
| 68 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') | 70 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') |
| 69 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + | 71 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + |
| 70 "If not provided and args[\"rand\"]==True, then date is used", type=int) | 72 "If not provided and args[\"rand\"]==True, then date is used", type=int) |
| 71 parser.add_argument('--numnew', '-n', type=int, default = 1, | 73 parser.add_argument('--numnew', '-n', type=int, default = 1, |
| 72 help="Number of output files desired. Not valid for splitting on a column") | 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") |
| | 75 parser.add_argument('--chunksize', '-k', type=int, default = 0, |
| | 76 help="Number of records by file. Not valid for splitting on a column") |
| 73 parser.add_argument('--batch', action='store_true', | 77 parser.add_argument('--batch', action='store_true', |
| 74 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") | 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") |
| 75 | 79 parser.add_argument('--split_after', '-p', action='store_true', |
| | 80 help="Split between records after separator (default is before)." + |
| | 81 "Only for generic - specific ftypes are always split in the default way") |
| 76 bycol = parser.add_argument_group('If splitting on a column') | 82 bycol = parser.add_argument_group('If splitting on a column') |
| 77 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") | 83 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") |
| 78 bycol.add_argument('--sub', '-s', default = r'\1', | 84 bycol.add_argument('--sub', '-s', default = r'\1', |
| 79 help="The regular expression to substitute in for the matched pattern.") | 85 help="The regular expression to substitute in for the matched pattern.") |
| 80 bycol.add_argument('--id_column', '-c', default="1", | 86 bycol.add_argument('--id_column', '-c', default="1", |
| 100 | 106 |
| 101 def split_by_record(args, in_file, out_dir, top, ftype): | 107 def split_by_record(args, in_file, out_dir, top, ftype): |
| 102 # get record separator for given filetype | 108 # get record separator for given filetype |
| 103 sep = re.compile(FILETYPES.get(ftype, args["generic_re"])) | 109 sep = re.compile(FILETYPES.get(ftype, args["generic_re"])) |
| 104 | 110 |
| | 111 chunksize = args["chunksize"] |
| 105 numnew = args["numnew"] | 112 numnew = args["numnew"] |
| 106 | 113 |
| 107 # random division | 114 # random division |
| 108 rand = args["rand"] | 115 rand = args["rand"] |
| 109 seed = args["seed"] | 116 seed = args["seed"] |
| 112 else: | 119 else: |
| 113 random.seed() | 120 random.seed() |
| 114 | 121 |
| 115 # batched division (maintains order) | 122 # batched division (maintains order) |
| 116 batch = args["batch"] | 123 batch = args["batch"] |
| 117 # define n_per_file so we don't get a warning about ref before assignment | 124 |
| 118 n_per_file = math.inf | 125 |
| 119 if batch: | 126 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected |
| | 127 # define n_per_file so we don't get a warning about ref before assignment |
| | 128 n_per_file = math.inf |
| | 129 |
| 120 # number of records | 130 # number of records |
| 121 with open(in_file) as f: | 131 with open(in_file) as f: |
| 122 i = 0 | 132 i = 0 |
| 123 for line in f: | 133 for line in f: |
| 124 if re.match(sep, line) is not None: | 134 if re.match(sep, line) is not None: |
| 125 i+=1 | 135 i+=1 |
| 126 n_records = i + 1 | 136 n_records = i + 1 |
| 127 if top: | 137 if top: |
| 128 n_records -= top # don't count the top lines | 138 n_records -= top # don't count the top lines |
| 129 | 139 |
| 130 # approx. number of lines per file | 140 if chunksize == 0: # i.e. no chunking |
| 131 n_per_file = n_records // numnew | 141 # approx. number of lines per file |
| | 142 n_per_file = n_records // numnew |
| | 143 else: |
| | 144 # approx. number of lines per file |
| | 145 numnew = n_records // chunksize |
| | 146 n_per_file = chunksize |
| | 147 |
| | 148 |
| | 149 |
| 132 | 150 |
| 133 # make new files | 151 # make new files |
| 134 # strip extension of old file and add number | 152 # strip extension of old file and add number |
| 135 custom_new_file_name = args["file_names"] | 153 custom_new_file_name = args["file_names"] |
| 136 custom_new_file_ext = "." + args["file_ext"] | 154 custom_new_file_ext = "." + args["file_ext"] |
| 177 else: | 195 else: |
| 178 # if is in fresh_files, write header and drop from freshFiles | 196 # if is in fresh_files, write header and drop from freshFiles |
| 179 if new_file_counter in fresh_files: | 197 if new_file_counter in fresh_files: |
| 180 newfiles[new_file_counter].write(header) | 198 newfiles[new_file_counter].write(header) |
| 181 fresh_files.remove(new_file_counter) | 199 fresh_files.remove(new_file_counter) |
| 182 | 200 |
| 183 # write record to file | 201 if ftype != "sdf" and args["split_after"] == False: |
| 184 newfiles[new_file_counter].write(record) | 202 # write record to file |
| 185 | 203 newfiles[new_file_counter].write(record) |
| 186 # if not the first time through, we assign the new record | 204 |
| 187 record = line | 205 # if not the first time through, we assign the new record |
| 188 | 206 record = line |
| | 207 |
| | 208 else: # for sdf we want to write the line to the record before starting a new one |
| | 209 record += line |
| | 210 newfiles[new_file_counter].write(record) |
| | 211 record = "" |
| | 212 |
| 189 # change destination file | 213 # change destination file |
| 190 if rand: | 214 if rand: |
| 191 new_file_counter = int(math.floor(random.random() * numnew)) | 215 new_file_counter = int(math.floor(random.random() * numnew)) |
| 192 elif batch: | 216 elif batch: |
| 193 # number of records read per file | 217 # number of records read per file |
