Mercurial > repos > galaxyp > hirieftools
comparison align_dbspec.py @ 2:c093af6f2a6c draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/pi_db_tools commit 71a4265d11aef48342142b8cf2caa86f79f9a554
| author | galaxyp |
|---|---|
| date | Fri, 01 Sep 2017 03:14:37 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:70757404c4f6 | 2:c093af6f2a6c |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 import os | |
| 4 import argparse | |
| 5 import re | |
| 6 from Bio import SeqIO | |
| 7 | |
| 8 | |
def create_spectra_maps(specfiles, dbfiles, frregex, firstfr):
    """Map DB FASTA files onto spectra files by fraction number.

    Every entry of ``dbfiles`` is treated as one fraction; the entry at
    index ``i`` gets fraction number ``i + firstfr``. Fraction numbers of
    the spectra files are obtained from
    ``get_fn_fractionmap(specfiles, frregex)``.

    Returns a tuple ``(normal_fns, rerun_map, poolmap)``:

    normal_fns
        Set of ``(dbfile, specfile)`` tuples for fractions that have a
        spectra file and do not close a pool. ``specfile`` is the first
        spectra file found for that fraction.
    rerun_map
        List of lists of indices into ``specfiles``, one list for each
        fraction number that has more than one spectra file, ordered by
        fraction number.
    poolmap
        Dict mapping the ``specfiles`` index of a fraction's first spectra
        file to the list of ``dbfiles`` indices pooled into it: the run of
        fractions without a spectra file directly before it, followed by
        the fraction itself.

    Example with ``firstfr=1``, four db files and spectra files for
    fr01, fr03, fr03b and fr04: ``normal_fns`` pairs db 1 with fr01 and
    db 4 with fr04, ``poolmap`` is ``{1: [1, 2]}`` and ``rerun_map`` is
    ``[[1, 2]]``.
    """
    specrange = get_fn_fractionmap(specfiles, frregex)
    to_pool = []
    poolmap, normal_fns = {}, set()
    for i, dbfile in enumerate(dbfiles):
        num = i + firstfr
        if num not in specrange:
            # No spectra for this fraction: hold its db file until the next
            # fraction that does have spectra.
            to_pool.append(i)
        elif to_pool:
            # First fraction with spectra after a gap: it closes the pool.
            to_pool.append(i)
            poolmap[specrange[num][0]] = to_pool
            to_pool = []
        else:
            # Use the db file of this loop position. Indexing dbfiles with
            # ``num - 1`` only hits the right file when firstfr == 1.
            normal_fns.add((dbfile, specfiles[specrange[num][0]]))
    # NOTE(review): db files still held in ``to_pool`` here (fractions after
    # the last one with spectra) are not returned in any of the maps.
    rerun_map = [specrange[num] for num in sorted(specrange)
                 if len(specrange[num]) > 1]
    return normal_fns, rerun_map, poolmap
| 38 | |
| 39 | |
def get_fn_fractionmap(files, frregex):
    """Group the positions of ``files`` by fraction number.

    The fraction number of a file name is the integer value of the string
    left after substituting ``frregex`` with its first capture group, so
    the pattern has to consume everything except the digits of interest.

    Returns a dict mapping fraction number to the list of indices into
    ``files`` that carry that number, in input order.
    """
    fraction_to_indices = {}
    for position, filename in enumerate(files):
        fraction = int(re.sub(frregex, '\\1', filename))
        fraction_to_indices.setdefault(fraction, []).append(position)
    return fraction_to_indices
| 49 | |
| 50 | |
def pool_fasta_files(poolfiles):
    """Yield the FASTA records of all ``poolfiles``, skipping repeats.

    Files are parsed in the given order. A record is skipped when an
    earlier record had both the same id and the same upper-cased sequence;
    records sharing an id but differing in sequence are all yielded.
    """
    seen = set()
    for fasta_fn in poolfiles:
        for record in SeqIO.parse(fasta_fn, 'fasta'):
            key = (record.id, str(record.seq.upper()))
            if key in seen:
                continue
            seen.add(key)
            yield record
| 65 | |
| 66 | |
def write_pooled_fasta(poolmap, specnames, dbfiles):
    """Write one de-duplicated FASTA file per pool in ``poolmap``.

    ``poolmap`` maps an index into ``specnames`` to a list of indices into
    ``dbfiles``. For every entry the listed db files are merged through
    ``pool_fasta_files`` and written to
    ``aligned_out/<basename of the spectra name>``.
    """
    for spec_ix, db_indices in poolmap.items():
        target = os.path.join('aligned_out',
                              os.path.basename(specnames[spec_ix]))
        first_db = dbfiles[db_indices[0]]
        last_db = dbfiles[db_indices[-1]]
        print('Pooling FASTA files {} - {} into: {}'.format(
            first_db, last_db, target))
        members = [dbfiles[ix] for ix in db_indices]
        with open(target, 'w') as handle:
            SeqIO.write(pool_fasta_files(members), handle, 'fasta')
| 77 | |
| 78 | |
def write_nonpooled_fasta(fractions):
    """Symlink each non-pooled db file into ``aligned_out``.

    ``fractions`` is an iterable of ``(dbfile, specfile)`` pairs. For each
    pair a symlink pointing at ``dbfile`` is created at
    ``aligned_out/<basename of specfile>``. The ``aligned_out`` directory
    is not created here.
    """
    # Build every (source, link) pair once so the logged list and the links
    # actually created cannot drift apart.
    links = [(fraction[0],
              os.path.join('aligned_out', os.path.basename(fraction[1])))
             for fraction in fractions]
    print('Symlinking non-pooled non-rerun files', links)
    for source, link in links:
        os.symlink(source, link)
| 86 | |
| 87 | |
def copy_rerun_fasta(rerun_map, specnames):
    """Symlink the aligned db of a fraction to each of its rerun names.

    ``rerun_map`` is a list of lists of indices into ``specnames``. In each
    list the first index names the source and every further index gets a
    symlink in ``aligned_out`` pointing at it.

    Both the link name and the link target are reduced to their basename.
    The target is therefore a relative path that resolves inside
    ``aligned_out``, which is where the other writers in this file place
    ``<basename of the spectra name>``. Without the basename, a spectra
    name containing a directory would give a link in a subdirectory that
    is never created.
    """
    for dst_indices in rerun_map:
        src = os.path.basename(specnames[dst_indices[0]])
        for spec_ix in dst_indices[1:]:
            outfn = os.path.basename(specnames[spec_ix])
            print('Symlinking {} to {}'.format(src, outfn))
            os.symlink(src, os.path.join('aligned_out', outfn))
| 94 | |
| 95 | |
def main():
    """Align db FASTA files to spectra files as given on the command line.

    Reads the spectra names (one per line) from the ``--specnames`` file,
    builds the normal/rerun/pool maps and then writes the pooled files,
    the plain symlinks and the rerun symlinks, in that order.
    """
    args = parse_commandline()
    with open(args.spectranames) as fp:
        contents = fp.read()
    spectranames = [line.strip() for line in contents.strip().split('\n')]
    maps = create_spectra_maps(spectranames, args.dbfiles, args.frspecregex,
                               args.firstfr)
    vanilla_fr, rerun_map, poolmap = maps
    write_pooled_fasta(poolmap, spectranames, args.dbfiles)
    write_nonpooled_fasta(vanilla_fr)
    copy_rerun_fasta(rerun_map, spectranames)
| 107 | |
| 108 | |
def parse_commandline():
    """Parse ``sys.argv`` and return the resulting argparse namespace.

    All four options are required: ``--specnames`` (stored as
    ``spectranames``), ``--dbfiles`` (one or more values), ``--frspec``
    (stored as ``frspecregex``) and ``--firstfr`` (integer).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    # (flag, keyword arguments) in the order the options are registered.
    option_specs = (
        ('--specnames', dict(
            dest='spectranames',
            help='File '
                 'containing spectra filenames with fractions. '
                 'Test data example illustrates reruns (fr03b, 09b) and'
                 ' pooled samples (fr05-09 are inside fr09 and fr09b).',
            required=True)),
        ('--dbfiles', dict(
            dest='dbfiles', help='FASTA db files', nargs='+',
            required=True)),
        ('--frspec', dict(
            dest='frspecregex',
            help='Fraction regex '
                 'to detect spectra fraction numbers',
            required=True)),
        ('--firstfr', dict(
            dest='firstfr', help='First fraction nr', type=int,
            required=True)),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    cli_args = sys.argv[1:]
    return parser.parse_args(cli_args)
| 124 | |
| 125 | |
# Run the alignment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
