Mercurial > repos > galaxyp > hirieftools
changeset 2:c093af6f2a6c draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/pi_db_tools commit 71a4265d11aef48342142b8cf2caa86f79f9a554
| author | galaxyp | 
|---|---|
| date | Fri, 01 Sep 2017 03:14:37 -0400 | 
| parents | 70757404c4f6 | 
| children | a6341e757422 | 
| files | __pycache__/peptide_pi_annotator.cpython-36.pyc align_dbspec.py delta_pi_calc.xml pi_db_split.xml pi_dbspec_align.xml test-data/merged_twice_decoy_fr1-3.fasta test-data/specnames.txt | 
| diffstat | 7 files changed, 225 insertions(+), 5 deletions(-) [+] | 
line wrap: on
 line diff
#!/usr/bin/env python
# Origin: +++ b/align_dbspec.py Fri Sep 01 03:14:37 2017 -0400 (new file, hunk @@ -0,0 +1,127 @@)
"""Align fractionated FASTA databases to a spectra-file fraction scheme.

Database fractions are numbered consecutively starting at ``--firstfr``.
Spectra files carry a fraction number that is extracted with a regex. Where
spectra fractions are missing, the corresponding database fractions are pooled
into the next spectra fraction that is present; where several spectra files
share one fraction number (reruns), the database is made available once per
spectra file. All output is written to the ``aligned_out`` directory, named
after the spectra files.
"""
import sys
import os
import argparse
import re
from Bio import SeqIO

OUTDIR = 'aligned_out'


def create_spectra_maps(specfiles, dbfiles, frregex, firstfr):
    """Work out which db files map 1:1 to spectra, which are pooled or rerun.

    Args:
        specfiles: list of spectra file names.
        dbfiles: list of FASTA db file names; the file at index ``i`` holds
            fraction number ``i + firstfr``.
        frregex: regex passed to get_fn_fractionmap to find fraction numbers.
        firstfr: fraction number of the first db file.

    Returns:
        A tuple ``(normal_fns, rerun_map, poolmap)``:
        - normal_fns: set of ``(dbfile, specfile)`` tuples for fractions that
          need no pooling.
        - rerun_map: list of lists of spectra-file indices, one list for each
          fraction number that occurs in more than one spectra file.
        - poolmap: dict mapping the index of a spectra file to the list of
          db-file indices which are pooled into it.
    """
    specrange = get_fn_fractionmap(specfiles, frregex)
    to_pool = []
    poolmap, normal_fns = {}, set()
    for i, dbfile in enumerate(dbfiles):
        num = i + firstfr
        if num not in specrange:
            # No spectra for this fraction: hold it until the next fraction
            # that does have spectra.
            to_pool.append(i)
            continue
        first_spec = specrange[num][0]
        if to_pool:
            # Preceding fractions are waiting, pool them into this one.
            to_pool.append(i)
            poolmap[first_spec] = to_pool
            to_pool = []
        else:
            # Index with i rather than num - 1, the two only coincide when
            # firstfr == 1.
            normal_fns.add((dbfile, specfiles[first_spec]))
    rerun_map = [specrange[num] for num in sorted(specrange)
                 if len(specrange[num]) > 1]
    return normal_fns, rerun_map, poolmap


def get_fn_fractionmap(files, frregex):
    """Return a dict of fraction number -> list of indices into ``files``.

    The fraction number is whatever remains after substituting ``frregex``
    with its first capture group, so the regex must match the entire name.

    Raises:
        ValueError: if the substitution result is not an integer.
    """
    fnfrmap = {}
    for f_ix, fn in enumerate(files):
        fraction = re.sub(frregex, r'\1', fn)
        try:
            fnum = int(fraction)
        except ValueError:
            raise ValueError(
                'Could not get a fraction number from file name {!r} with '
                'regex {!r} (got {!r})'.format(fn, frregex, fraction))
        fnfrmap.setdefault(fnum, []).append(f_ix)
    return fnfrmap


def pool_fasta_files(poolfiles):
    """Yield records of all FASTA files in ``poolfiles``, skipping any record
    whose (id, upper-cased sequence) combination has been yielded before."""
    acc_seq = {}
    for fr in poolfiles:
        for seq in SeqIO.parse(fr, 'fasta'):
            sequence = str(seq.seq.upper())
            seen = acc_seq.setdefault(seq.id, set())
            if sequence in seen:
                continue
            seen.add(sequence)
            yield seq


def write_pooled_fasta(poolmap, specnames, dbfiles):
    """Runs through poolmap and pools output files, filtering out
    duplicates"""
    for outfr, infrs in poolmap.items():
        outfn = os.path.join(OUTDIR, os.path.basename(specnames[outfr]))
        print('Pooling FASTA files {} - {} into: {}'.format(
            dbfiles[infrs[0]], dbfiles[infrs[-1]], outfn))
        with open(outfn, 'w') as fp:
            SeqIO.write(pool_fasta_files([dbfiles[x] for x in infrs]), fp,
                        'fasta')


def write_nonpooled_fasta(fractions):
    """Symlinks nonpooled db files

    ``fractions`` is an iterable of ``(dbfile, specfile)`` tuples.
    """
    links = [(fr[0], os.path.join(OUTDIR, os.path.basename(fr[1])))
             for fr in fractions]
    print('Symlinking non-pooled non-rerun files', links)
    for src, dst in links:
        # A relative link target is resolved from the directory holding the
        # link, so make the db path absolute to keep the link valid.
        os.symlink(os.path.abspath(src), dst)


def copy_rerun_fasta(rerun_map, specnames):
    """Symlinks the output of the first spectra file of each rerun group to
    the names of the remaining spectra files in that group.

    The link target is a bare file name, so it resolves to the sibling file
    inside the output directory.
    """
    for dst_indices in rerun_map:
        src = os.path.basename(specnames[dst_indices[0]])
        for ix in dst_indices[1:]:
            outfn = os.path.basename(specnames[ix])
            print('Symlinking {} to {}'.format(src, outfn))
            os.symlink(src, os.path.join(OUTDIR, outfn))


def main():
    """Read spectra names, compute the fraction maps and write the output."""
    args = parse_commandline()
    with open(args.spectranames) as fp:
        spectranames = [x.strip() for x in fp.read().strip().split('\n')]
    vanilla_fr, rerun_map, poolmap = create_spectra_maps(spectranames,
                                                         args.dbfiles,
                                                         args.frspecregex,
                                                         args.firstfr)
    # Do not rely on the caller having created the output directory.
    os.makedirs(OUTDIR, exist_ok=True)
    write_pooled_fasta(poolmap, spectranames, args.dbfiles)
    write_nonpooled_fasta(vanilla_fr)
    copy_rerun_fasta(rerun_map, spectranames)


def parse_commandline():
    """Parse and return the command line arguments."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--specnames', dest='spectranames', help='File '
                        'containing spectra filenames with fractions. '
                        'Test data example illustrates reruns (fr03b, 09b) and'
                        ' pooled samples (fr05-09 are inside fr09 and fr09b).',
                        required=True)
    parser.add_argument('--dbfiles', dest='dbfiles', help='FASTA db files',
                        nargs='+', required=True)
    parser.add_argument('--frspec', dest='frspecregex', help='Fraction regex '
                        'to detect spectra fraction numbers', required=True)
    parser.add_argument('--firstfr', dest='firstfr', help='First fraction nr',
                        type=int, required=True)
    return parser.parse_args(sys.argv[1:])


if __name__ == '__main__':
    main()
--- a/delta_pi_calc.xml Mon Jul 24 05:25:05 2017 -0400 +++ b/delta_pi_calc.xml Fri Sep 01 03:14:37 2017 -0400 @@ -1,9 +1,9 @@ -<tool id="calc_delta_pi" name="Add delta pI" version="1.1"> +<tool id="calc_delta_pi" name="Add delta pI" version="1.2"> + <description>to peptide table</description> <requirements> <requirement type="package" version="3.6">python</requirement> </requirements> - <description>to peptide table</description> - <command> + <command detect_errors="exit_code"> python '$__tool_directory__/peptide_pi_annotator.py' -i '$trainingpi' -p '$peptable' --out '$output' #if $stripcol --stripcol $stripcol
--- a/pi_db_split.xml Mon Jul 24 05:25:05 2017 -0400 +++ b/pi_db_split.xml Fri Sep 01 03:14:37 2017 -0400 @@ -1,10 +1,10 @@ -<tool id="pi_db_split" name="Split peptide database" version="1.1"> +<tool id="pi_db_split" name="Split peptide database" version="1.2"> <description>into pI separated fractions</description> <requirements> <requirement type="package">numpy</requirement> <requirement type="package" version="3.6">python</requirement> </requirements> - <command> + <command detect_errors="exit_code"> <![CDATA[ mkdir pi_fr_out && cd pi_fr_out && python '$__tool_directory__/pi_database_splitter.py' -i '$pipeptides' -p '$peptable'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pi_dbspec_align.xml Fri Sep 01 03:14:37 2017 -0400 @@ -0,0 +1,77 @@ +<tool id="pi_dbspec_align" name="Align DB fractions" version="0.3"> + <description>to resemble spectra fraction scheme</description> + <requirements> + <requirement type="package" version="3.6">python</requirement> + <requirement type="package" version="1.62">biopython</requirement> + </requirements> + <command detect_errors="exit_code"> + <![CDATA[ + mkdir aligned_out && + python '$__tool_directory__/align_dbspec.py' + --specnames $specnames + --dbfiles + #for $key in $databases.keys() + '$databases[$key]' + #end for + --frspec '$frspec' + --firstfr $firstfr + + ]]> + </command> + + <inputs> + <param name="specnames" type="data" format="text,tabular" label="Spectra files" /> + <param name="databases" type="data_collection" collection_type="list" format="fasta" label="Fractionated databases" /> + <param name="frspec" type="text" label="Regex to find fraction numbers in spectra file names" help="If spectra file is called myspectra_fr01b.mzML, use .*fr([0-9]+).*" > + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + </sanitizer> + </param> + <param name="firstfr" type="integer" value="1" label="First fraction number in series" /> + </inputs> + + <outputs> + <collection name="aligned_db" type="list" label="spectra-fraction-aligned DB"> + <discover_datasets pattern="__designation__" ext="fasta" directory="aligned_out" /> + </collection> + </outputs> + <tests> + <test> + <param name="specnames" value="specnames.txt" /> + <param name="databases"> + <collection type="list"> + <element name="fr1" value="target_splitdb_fr1.fasta" /> + <element name="fr2" value="target_splitdb_fr2.fasta" /> + <element name="fr3" value="target_splitdb_fr3.fasta" /> + <element name="fr4" value="decoy_splitdb_fr1.fasta" /> + <element name="fr5" value="decoy_splitdb_fr2.fasta" /> + <element name="fr6" value="decoy_splitdb_fr3.fasta" /> + 
<element name="fr7" value="decoy_splitdb_fr1.fasta" /> + <element name="fr8" value="decoy_splitdb_fr2.fasta" /> + <element name="fr9" value="decoy_splitdb_fr3.fasta" /> + </collection> + </param> + <param name="frspec" value=".*c_f([0-9]+).*" /> + <param name="firstfr" value="1" /> + <output_collection name="aligned_db" type="list"> + <element name="spec_f01.mzML" value="target_splitdb_fr1.fasta" /> + <element name="spec_f02.mzML" value="target_splitdb_fr2.fasta" /> + <element name="spec_f03.mzML" value="target_splitdb_fr3.fasta" /> + <element name="spec_f03b.mzML" value="target_splitdb_fr3.fasta" /> + <element name="spec_f09.mzML" value="merged_twice_decoy_fr1-3.fasta" compare="sim_size" /> + <element name="spec_f09b.mzML" value="merged_twice_decoy_fr1-3.fasta" compare="sim_size" /> + </output_collection> + </test> + </tests> + + <help> + Filters, pools and doubles fractionated databases with a set of identically fractionated spectra files which have been + subjected to pooling and contain reruns. + You may have fractions 1-10 in databases but spectra file fractions 4-7 have been pooled before loading to the MS, + and spectra fractions 2 and 8 have been rerun, creating fractions 2 and 2a, and 8, 8a and 8b. + This tool pools FASTA databases and duplicates them where needed to line up the databases to your spectra collections. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/merged_twice_decoy_fr1-3.fasta Fri Sep 01 03:14:37 2017 -0400 @@ -0,0 +1,10 @@ +>decoy_protein1 +TFSLFGCSIPNTNVEFSIKLFDVCLLLCNCLFSLIIMIYVII +>decoy_protein2 +TFSLFGCSIPNTNVEFSI +>decoy_protein1 +LNLSKPILSEST +>decoy_protein3 +LFDVCLLLCNCLFSLIIMIYVIIK +>decoy_protein2 +LFDVCLLLCNCLFSLIIMIYVIIKLWLFK
