comparison split_file_to_collection.py @ 0:e9d56b4c3209 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
| | |
|---|---|
| author | bgruening |
| date | Tue, 17 Jul 2018 14:36:55 -0400 |
| parents | |
| children | 0cf37301f754 |
comparison: -1:000000000000 → 0:e9d56b4c3209 (file added in this revision)
```python
#!/usr/bin/env python

import argparse
import math
import os
import random
import re


"""
regexes that indicate the *beginning* of a record
new file types can be added by appending to this dict,
updating the parser, and adding a new type option in the Galaxy wrapper
"""
FILETYPES = {'fasta': '^>',
             'fastq': '^@',
             'tabular': '^.*',
             'mgf': '^BEGIN IONS'}
```
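
These are plain `re` patterns applied with `re.match` to each input line (see `split_by_record` below). A minimal sketch, with made-up example lines, of how they flag record starts:

```python
import re

patterns = {'fasta': '^>', 'fastq': '^@', 'mgf': '^BEGIN IONS'}

# '>seq1' starts a FASTA record, a plain sequence line does not
print(bool(re.match(patterns['fasta'], '>seq1')))      # True
print(bool(re.match(patterns['fasta'], 'ACGT')))        # False
# 'BEGIN IONS' starts an MGF record
print(bool(re.match(patterns['mgf'], 'BEGIN IONS')))    # True
```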
```python
def main():
    ps = parser_cli()
    args = vars(ps.parse_args())

    # get args and validate
    in_file = args["in"]
    if not os.path.isfile(in_file):
        raise FileNotFoundError('Input file does not exist')

    out_dir = args["out_dir"]
    if not os.path.isdir(out_dir):
        raise FileNotFoundError('out_dir is not a directory')

    top = args["top"]
    if top < 0:
        raise ValueError("Number of header lines cannot be negative")

    ftype = args["ftype"]

    if ftype == "tabular" and args["by"] == "col":
        args["match"] = replace_mapped_chars(args["match"])
        args["sub"] = replace_mapped_chars(args["sub"])
        split_by_column(args, in_file, out_dir, top)
    else:
        split_by_record(args, in_file, out_dir, top, ftype)


def parser_cli():
    parser = argparse.ArgumentParser(description="Split a file into multiple files. "
                                     "Can split on a column of a tabular file, "
                                     "naming the output files from the column values.")
    parser.add_argument('--in', '-i', required=True, help="The input file")
    parser.add_argument('--out_dir', '-o', default=os.getcwd(), required=True,
                        help="The output directory")
    parser.add_argument('--file_names', '-a',
                        help="If not splitting by column, the base name of the new files")
    parser.add_argument('--file_ext', '-e',
                        help="If not splitting by column, the extension of the new files (without a period)")
    parser.add_argument('--ftype', '-f', required=True, choices=["mgf", "fastq", "fasta", "tabular"],
                        help="The type of the file to split")
    parser.add_argument('--by', '-b', default="row", choices=["col", "row"],
                        help="Split by row or by column (tabular only)")
    parser.add_argument('--top', '-t', type=int, default=0,
                        help="Number of header lines to carry over to new files (tabular only)")
    parser.add_argument('--rand', '-r', action='store_true',
                        help="Divide records randomly into new files")
    parser.add_argument('--seed', '-x', type=int,
                        help="Seed for the random number generator. "
                             "If not provided and --rand is set, the generator is seeded automatically")
    parser.add_argument('--numnew', '-n', type=int, default=1,
                        help="Number of output files desired. Not valid for splitting on a column")
    parser.add_argument('--batch', action='store_true',
                        help="Distribute records to the output files while maintaining order. "
                             "Ignored if splitting on a column.")

    bycol = parser.add_argument_group('If splitting on a column')
    bycol.add_argument('--match', '-m', default="(.*)",
                       help="The regular expression to match id column entries")
    bycol.add_argument('--sub', '-s', default=r'\1',
                       help="The regular expression to substitute in for the matched pattern.")
    bycol.add_argument('--id_column', '-c', default="1", type=int,
                       help="Column that is used to name output files. Indexed starting from 1.")
    return parser
```
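
As a quick illustration (file names and option values here are made up), the options above can be exercised directly through `parser_cli()`, which is equivalent to passing the same flags on the command line:

```python
# Hypothetical invocation: split input.fasta into 3 FASTA files in ./out,
# keeping record order.
ps = parser_cli()
args = vars(ps.parse_args([
    "--in", "input.fasta",
    "--out_dir", "out",
    "--ftype", "fasta",
    "--numnew", "3",
    "--batch",
]))
print(args["numnew"], args["batch"], args["ftype"])  # 3 True fasta
```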
```python
def close_files(file_list):
    # finally, close all files
    for open_file in file_list:
        open_file.close()


def replace_mapped_chars(pattern):
    """
    handles special escaped characters when coming from galaxy
    """
    mapped_chars = {'\'': '__sq__', '\\': '__backslash__'}
    for key, value in mapped_chars.items():
        pattern = pattern.replace(value, key)
    return pattern
```
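
A small check, with a made-up pattern, of how the Galaxy escape tokens are mapped back to literal characters before the regex is compiled:

```python
# Galaxy passes "__sq__" for a single quote and "__backslash__" for a backslash;
# replace_mapped_chars restores them.
print(replace_mapped_chars(r"(.*)__sq__s__backslash__d+"))  # (.*)'s\d+
```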
```python
def split_by_record(args, in_file, out_dir, top, ftype):
    # get record separator for given filetype
    sep = re.compile(FILETYPES[ftype])

    numnew = args["numnew"]

    # random division
    rand = args["rand"]
    seed = args["seed"]
    if seed:
        random.seed(seed)
    else:
        random.seed()

    # batched division (maintains order)
    batch = args["batch"]
    # define n_per_file so we don't get a warning about ref before assignment
    n_per_file = math.inf
    if batch:
        # number of records
        with open(in_file) as f:
            i = 0
            for line in f:
                if re.match(sep, line) is not None:
                    i += 1
            n_records = i + 1
        if top:
            n_records -= top  # don't count the top (header) lines

        # approx. number of records per file
        n_per_file = n_records // numnew

    # make new files
    # strip extension of old file and add number
    custom_new_file_name = args["file_names"]
    if custom_new_file_name is None:
        new_file_base = os.path.splitext(os.path.basename(in_file))
    else:
        # only build the custom extension when a custom base name is used
        new_file_base = [custom_new_file_name, "." + args["file_ext"]]

    newfiles = [
        open(os.path.join(out_dir, new_file_base[0] + "_" + str(count) + new_file_base[1]), "w")
        for count in range(numnew)
    ]

    # bunch o' counters
    # index to list of new files
    new_file_counter = 0

    # used for top
    # number of lines read so far
    n_read = 0
    # to contain header specified by top
    header = ""
    # keep track of the files that have been opened so far
    fresh_files = set(range(numnew))

    # keep track in loop of number of records in each file
    # only used in batch
    records_in_file = 0

    # open file
    with open(in_file, "r") as file:
        record = ""
        for line in file:
            n_read += 1
            if n_read <= top:
                header += line
                continue
            # check if beginning of line is record sep
            # if beginning of line is record sep, either start record or finish one
            if re.match(sep, line) is not None:
                # this only happens first time through
                if record == "":
                    record += line
                else:
                    # if in fresh_files, write header and drop from fresh_files
                    if new_file_counter in fresh_files:
                        newfiles[new_file_counter].write(header)
                        fresh_files.remove(new_file_counter)

                    # write record to file
                    newfiles[new_file_counter].write(record)

                    # if not the first time through, we assign the new record
                    record = line

                    # change destination file
                    if rand:
                        new_file_counter = int(math.floor(random.random() * numnew))
                    elif batch:
                        # number of records read per file
                        records_in_file += 1
                        # have we reached the max for each file?
                        # if so, switch file
                        if records_in_file >= n_per_file:
                            new_file_counter = (new_file_counter + 1) % numnew
                            records_in_file = 0  # reset to 0
                    else:
                        new_file_counter = (new_file_counter + 1) % numnew
            # if beginning of line is not record sep, we must be inside a record
            # so just append
            else:
                record += line
    # after loop, write final record to file
    newfiles[new_file_counter].write(record)
    # close new files
    close_files(newfiles)
```
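
To make the record loop concrete, here is a minimal sketch that runs `split_by_record` on a tiny FASTA input with batch-style ordered distribution. The paths, sequences, and args values are illustrative, and the snippet assumes the functions above are in scope (e.g. it is run from the same module):

```python
import os
import tempfile

in_dir = tempfile.mkdtemp()
out_dir = tempfile.mkdtemp()
fasta = os.path.join(in_dir, "demo.fasta")
with open(fasta, "w") as f:
    f.write(">seq1\nACGT\n>seq2\nGGCC\n>seq3\nTTAA\n>seq4\nCCGG\n")

# keys mirror the options defined in parser_cli()
args = {"numnew": 2, "rand": False, "seed": None, "batch": True,
        "file_names": None, "file_ext": None}
split_by_record(args, fasta, out_dir, top=0, ftype="fasta")

# expected: demo_0.fasta holds seq1/seq2, demo_1.fasta holds seq3/seq4
print(sorted(os.listdir(out_dir)))
```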
```python
def split_by_column(args, in_file, out_dir, top):

    # shift to 0-based indexing
    id_col = int(args["id_column"]) - 1

    try:
        match = re.compile(args["match"])
    except re.error:
        print("ERROR: Match (-m) supplied is not valid regex.")
        raise

    sub = args["sub"]

    # dict of output file names -> open file handles
    new_files = dict()

    # keep track of how many lines have been read
    n_read = 0
    header = ""
    with open(in_file) as file:
        for line in file:
            # if still in top, save to header
            n_read += 1
            if n_read <= top:
                header += line
                continue
            # split into columns, on tab
            fields = re.split(r'\t', line.strip('\n'))

            # get id column value
            id_col_val = fields[id_col]

            # use regex to get new file name
            out_file_name = re.sub(match, sub, id_col_val)
            out_file_path = os.path.join(out_dir, out_file_name)

            # write
            if out_file_name not in new_files:
                # open file (new, so not already open)
                current_new_file = open(out_file_path, "w")
                current_new_file.write(header)
                current_new_file.write(line)
                # add to dict
                new_files[out_file_name] = current_new_file
            else:
                # file is already open, so just write to it
                new_files[out_file_name].write(line)

    # finally, close all files
    close_files(new_files.values())
```
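
The output file for each row is simply `re.sub(match, sub, id_col_val)`. A minimal sketch with made-up tabular data and a made-up pattern (here stripping a `_R1`/`_R2` suffix so paired entries share one file), again assuming the functions above are in scope:

```python
import os
import tempfile

in_dir = tempfile.mkdtemp()
out_dir = tempfile.mkdtemp()
tab = os.path.join(in_dir, "demo.tabular")
with open(tab, "w") as f:
    f.write("id\tvalue\n"        # one header line, carried over via top=1
            "sampleA_R1\t1\n"
            "sampleA_R2\t2\n"
            "sampleB_R1\t3\n")

# only the keys read by split_by_column are needed
args = {"id_column": 1, "match": r"(.*)_R[12]", "sub": r"\1"}
split_by_column(args, tab, out_dir, top=1)

# expected: 'sampleA' (header + 2 rows) and 'sampleB' (header + 1 row)
print(sorted(os.listdir(out_dir)))
```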
```python
if __name__ == "__main__":
    main()
```
