Mercurial > repos > iuc > fasta_regex_finder
changeset 0:253438d9ab54 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/bioinformatics-cafe commit c318cd3f894f89c13d7eb257678673da70f72212
| author | iuc |
|---|---|
| date | Wed, 25 Jan 2023 14:03:40 +0000 |
| parents | |
| children | |
| files | fastaregexfinder.py fastaregexfinder.xml test-data/TestSeqGroup-G4-sub.bed test-data/TestSeqGroup-G4.bed test-data/TestSeqGroup-G4.fasta test-data/test-1.bed test-data/test-2.bed test-data/test-3.bed test-data/test-4.bed test-data/test.fas |
| diffstat | 10 files changed, 516 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastaregexfinder.py Wed Jan 25 14:03:40 2023 +0000 @@ -0,0 +1,281 @@ +#!/usr/bin/env python + +import argparse +import operator +import re +import sys + +VERSION = "0.1.1" + +parser = argparse.ArgumentParser( + description=""" + +DESCRIPTION + + Search a fasta file for matches to a regex and return a bed file with the + coordinates of the match and the matched sequence itself. + + Output bed file has columns: + 1. Name of fasta sequence (e.g. chromosome) + 2. Start of the match + 3. End of the match + 4. ID of the match + 5. Length of the match + 6. Strand + 7. Matched sequence as it appears on the forward strand + + For matches on the reverse strand it is reported the start and end position on the + forward strand and the matched string on the forward strand (so the G4 'GGGAGGGT' + present on the reverse strand is reported as ACCCTCCC). + + Note: Fasta sequences (chroms) are read in memory one at a time along with the + matches for that chromosome. + The order of the output is: chroms as they are found in the inut fasta, matches + sorted within chroms by positions. + +EXAMPLE: + ## Test data: + echo '>mychr' > /tmp/mychr.fa + echo 'ACTGnACTGnACTGnTGAC' >> /tmp/mychr.fa + + fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' + mychr 0 4 mychr_0_4_for 4 + ACTG + mychr 5 9 mychr_5_9_for 4 + ACTG + mychr 10 14 mychr_10_14_for 4 + ACTG + + fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' --maxstr 3 + mychr 0 4 mychr_0_4_for 4 + ACT[3,4] + mychr 5 9 mychr_5_9_for 4 + ACT[3,4] + mychr 10 14 mychr_10_14_for 4 + ACT[3,4] + + less /tmp/mychr.fa | fastaRegexFinder.py -f - -r 'A\\w\\wGn' + mychr 0 5 mychr_0_5_for 5 + ACTGn + mychr 5 10 mychr_5_10_for 5 + ACTGn + mychr 10 15 mychr_10_15_for 5 + ACTGn + +DOWNLOAD + fastaRegexFinder.py is hosted at https://github.com/dariober/bioinformatics-cafe/tree/master/fastaRegexFinder + + """, + formatter_class=argparse.RawTextHelpFormatter, +) + +parser.add_argument( + "--fasta", + "-f", + type=str, + help="Input fasta file to search. Use '-' to read the file from stdin.", + required=True, +) + +parser.add_argument( + "--regex", + "-r", + type=str, + help="""Regex to be searched in the fasta input. +Matches to the reverse complement will have - strand. +The default regex is '([gG]{3,}\\w{1,7}){3,}[gG]{3,}' which searches +for G-quadruplexes.""", + default="([gG]{3,}\\w{1,7}){3,}[gG]{3,}", +) + +parser.add_argument( + "--matchcase", + "-m", + action="store_true", + help="""Match case while searching for matches. Default is +to ignore case (I.e. 'ACTG' will match 'actg'). + """, +) + +parser.add_argument( + "--noreverse", + action="store_true", + help="""Do not search the reverse complement of the input fasta. +Use this flag to search protein sequences.""", +) + +parser.add_argument( + "--maxstr", + type=int, + required=False, + default=10000, + help="""Maximum length of the match to report in the 7th column of the output. +Default is to report up to 10000nt. +Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>] + """, +) + +parser.add_argument( + "--seqnames", + "-s", + type=str, + nargs="+", + default=[None], + required=False, + help="""List of fasta sequences in --fasta to +search. E.g. use --seqnames chr1 chr2 chrM to search only these crhomosomes. +Default is to search all the sequences in input. + """, +) +parser.add_argument( + "--quiet", + "-q", + action="store_true", + help="""Do not print progress report (i.e. sequence names as they are scanned). + """, +) + + +parser.add_argument("--version", "-v", action="version", version="%(prog)s " + VERSION) + + +args = parser.parse_args() + +" --------------------------[ Check and parse arguments ]---------------------- " + +if args.matchcase: + flag = 0 +else: + flag = re.IGNORECASE + +" ------------------------------[ Functions ]--------------------------------- " + + +def sort_table(table, cols): + """Code to sort list of lists + see http://www.saltycrane.com/blog/2007/12/how-to-sort-table-by-columns-in-python/ + + sort a table by multiple columns + table: a list of lists (or tuple of tuples) where each inner list + represents a row + cols: a list (or tuple) specifying the column numbers to sort by + e.g. (1,0) would sort by column 1, then by column 0 + """ + for col in reversed(cols): + table = sorted(table, key=operator.itemgetter(col)) + return table + + +def trimMatch(x, n): + """Trim the string x to be at most length n. Trimmed matches will be reported + with the syntax ACTG[a,b] where Ns are the beginning of x, a is the length of + the trimmed strng (e.g 4 here) and b is the full length of the match + EXAMPLE: + trimMatch('ACTGNNNN', 4) + >>>'ACTG[4,8]' + trimMatch('ACTGNNNN', 8) + >>>'ACTGNNNN' + """ + if len(x) > n and n is not None: + m = x[0:n] + "[" + str(n) + "," + str(len(x)) + "]" + else: + m = x + return m + + +def revcomp(x): + """Reverse complement string x. Ambiguity codes are handled and case conserved. + + Test + x= 'ACGTRYSWKMBDHVNacgtryswkmbdhvn' + revcomp(x) + """ + compdict = { + "A": "T", + "C": "G", + "G": "C", + "T": "A", + "R": "Y", + "Y": "R", + "S": "W", + "W": "S", + "K": "M", + "M": "K", + "B": "V", + "D": "H", + "H": "D", + "V": "B", + "N": "N", + "a": "t", + "c": "g", + "g": "c", + "t": "a", + "r": "y", + "y": "r", + "s": "w", + "w": "s", + "k": "m", + "m": "k", + "b": "v", + "d": "h", + "h": "d", + "v": "b", + "n": "n", + } + xrc = [] + for n in x: + xrc.append(compdict[n]) + xrc = "".join(xrc)[::-1] + return xrc + + +# ----------------------------------------------------------------------------- + +psq_re_f = re.compile(args.regex, flags=flag) +# psq_re_r= re.compile(regexrev) + +if args.fasta != "-": + ref_seq_fh = open(args.fasta) +else: + ref_seq_fh = sys.stdin + +ref_seq = [] +line = (ref_seq_fh.readline()).strip() +chr = re.sub("^>", "", line) +line = (ref_seq_fh.readline()).strip() +gquad_list = [] +while True: + if not args.quiet: + sys.stderr.write("Processing %s\n" % (chr)) + while line.startswith(">") is False: + ref_seq.append(line) + line = (ref_seq_fh.readline()).strip() + if line == "": + break + ref_seq = "".join(ref_seq) + if args.seqnames == [None] or chr in args.seqnames: + for m in re.finditer(psq_re_f, ref_seq): + matchstr = trimMatch(m.group(0), args.maxstr) + quad_id = str(chr) + "_" + str(m.start()) + "_" + str(m.end()) + "_for" + gquad_list.append( + [chr, m.start(), m.end(), quad_id, len(m.group(0)), "+", matchstr] + ) + if args.noreverse is False: + ref_seq = revcomp(ref_seq) + seqlen = len(ref_seq) + for m in re.finditer(psq_re_f, ref_seq): + matchstr = trimMatch(revcomp(m.group(0)), args.maxstr) + mstart = seqlen - m.end() + mend = seqlen - m.start() + quad_id = str(chr) + "_" + str(mstart) + "_" + str(mend) + "_rev" + gquad_list.append( + [chr, mstart, mend, quad_id, len(m.group(0)), "-", matchstr] + ) + gquad_sorted = sort_table(gquad_list, (1, 2, 3)) + gquad_list = [] + for xline in gquad_sorted: + xline = "\t".join([str(x) for x in xline]) + print(xline) + chr = re.sub("^>", "", line) + ref_seq = [] + line = (ref_seq_fh.readline()).strip() + if line == "": + break + +# gquad_sorted= sort_table(gquad_list, (0,1,2,3)) +# +# for line in gquad_sorted: +# line= '\t'.join([str(x) for x in line]) +# print(line) +sys.exit()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastaregexfinder.xml Wed Jan 25 14:03:40 2023 +0000 @@ -0,0 +1,162 @@ +<tool id="fasta_regex_finder" name="Fasta regular expression finder" version="0.1.0"> + <description> + Search in fasta for regexp match + </description> + <requirements> + <requirement type="package" version="3.8">python</requirement> + </requirements> + <version_command>python '$__tool_directory__/fastaregexfinder.py' --version</version_command> + <command detect_errors="exit_code"><![CDATA[ +python '$__tool_directory__/fastaregexfinder.py' +--fasta '$input' +--regex '$regex' +#if $settings.advanced == "advanced" + $settings.matchcase + $settings.noreverse + --maxstr $settings.maxstr + #if $settings.seqnames != "" + --seqnames $settings.seqnames + #end if +#end if +--quiet +> '$output' + ]]></command> + <inputs> + <param type="data" name="input" format="fasta" /> + <param name="regex" size="30" type="text" value="([gG]{3,}\w{1,7}){3,}[gG]{3,}" label="Regular expression" help="(--regex)"> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + <mapping initial="none"> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> + <conditional name="settings"> + <param name="advanced" type="select" label="Specify advanced parameters"> + <option value="simple" selected="true">No, use program defaults.</option> + <option value="advanced">Yes, see full parameter list.</option> + </param> + <when value="simple"> + </when> + <when value="advanced"> + <param name="matchcase" type="boolean" label="Match case" truevalue="--matchcase" falsevalue="" help="(--matchcase)" /> + <param name="noreverse" type="boolean" label="Do not search the reverse complement" truevalue="--noreverse" falsevalue="" help="(--noreverse)" /> + <param name="maxstr" type="integer" label="Maximum length of the match to report" value="10000" min="1" help="(--maxstr)" /> + <param name="seqnames" size="30" type="text" value="" label="Space separated list of fasta sequences to search" help="--seqnames"/> + </when> + </conditional> + </inputs> + <outputs> + <data name="output" format="bed" from_work_dir="TestSeqGroup-G4.bed" /> + </outputs> + <tests> + <test> + <param name="input" value="TestSeqGroup-G4.fasta"/> + <output name="output" file="TestSeqGroup-G4.bed"/> + </test> + <test> + <param name="input" value="test.fas"/> + <param name="regex" value="ACTG"/> + <output name="output" file="test-1.bed"/> + </test> + <test> + <param name="input" value="test.fas"/> + <param name="regex" value="ACTG"/> + <param name="advanced" value="advanced"/> + <param name="matchcase" value="--matchcase"/> + <output name="output" file="test-2.bed"/> + </test> + <test> + <param name="input" value="test.fas"/> + <param name="regex" value="ACTG"/> + <param name="advanced" value="advanced"/> + <param name="noreverse" value="--noreverse"/> + <output name="output" file="test-3.bed"/> + </test> + <test> + <param name="input" value="test.fas"/> + <param name="regex" value="ACTG"/> + <param name="advanced" value="advanced"/> + <param name="maxstr" value="3"/> + <output name="output" file="test-4.bed"/> + </test> + <test> + <param name="input" value="TestSeqGroup-G4.fasta"/> + <param name="advanced" value="advanced"/> + <param name="seqnames" value="HJ24-Shp2_oncogenicProtein2 HJ24-Shp2_oncogenicProtein"/> + <output name="output" file="TestSeqGroup-G4-sub.bed"/> + </test> +</tests> + <help><![CDATA[ +DESCRIPTION + +Search a fasta file for matches to a regular expression and return a bed file with the +coordinates of the match and the matched sequence itself. + +Output bed file has columns: + +1. Name of fasta sequence (e.g. chromosome) +2. Start of the match +3. End of the match +4. ID of the match +5. Length of the match +6. Strand +7. Matched sequence as it appears on the forward strand + +For matches on the reverse strand it is reported the start and end position on the +forward strand and the matched string on the forward strand (so the G4 'GGGAGGGT' +present on the reverse strand is reported as ACCCTCCC). + + +Note: Fasta sequences (chroms) are read in memory one at a time along with the +matches for that chromosome. +The order of the output is: chroms as they are found in the inut fasta, matches +sorted within chroms by positions. + +ARGUMENTS: + +- regex Regex to be searched in the fasta input. Matches to the reverse complement will have - strand. The default regex is '([gG]{3,}\w{1,7}){3,}[gG]{3,}' which searches for G-quadruplexes. +- matchcase Match case while searching for matches. Default is to ignore case (I.e. 'ACTG' will match 'actg'). +- noreverse Do not search the reverse complement of the input fasta. Use this flag to search protein sequences. +- maxstr Maximum length of the match to report in the 7th column of the output. Default is to report up to 10000nt. Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>] +- seqnames List of fasta sequences in the input to search. E.g. use --seqnames chr1 chr2 chrM to search only these crhomosomes. Default is to search all the sequences in input. + +EXAMPLE: + +Test data:: +>mychr +ACTGnACTGnACTGnTGAC + +Example1 regex=ACTG:: + + mychr 0 4 mychr_0_4_for 4 + ACTG + mychr 5 9 mychr_5_9_for 4 + ACTG + mychr 10 14 mychr_10_14_for 4 + ACTG + +Example2 regex=ACTG maxstr=3:: + + mychr 0 4 mychr_0_4_for 4 + ACT[3,4] + mychr 5 9 mychr_5_9_for 4 + ACT[3,4] + mychr 10 14 mychr_10_14_for 4 + ACT[3,4] + +Example3 regex=A\w\wG:: + + mychr 0 5 mychr_0_5_for 5 + ACTGn + mychr 5 10 mychr_5_10_for 5 + ACTGn + mychr 10 15 mychr_10_15_for 5 + ACTGn + + ]]></help> + <citations> + <citation type="bibtex"> +@misc{githubfastaRegexFinder, + author = {Dario Beraldi}, + year = {2017}, + title = {fastaRegexFinder}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/dariober/bioinformatics-cafe/tree/master/fastaRegexFinder}, +}</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/TestSeqGroup-G4-sub.bed Wed Jan 25 14:03:40 2023 +0000 @@ -0,0 +1,2 @@ +HJ24-Shp2_oncogenicProtein 17 58 HJ24-Shp2_oncogenicProtein_17_58_for 41 + GGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGG +HJ24-Shp2_oncogenicProtein2 17 58 HJ24-Shp2_oncogenicProtein2_17_58_for 41 + GGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/TestSeqGroup-G4.bed Wed Jan 25 14:03:40 2023 +0000 @@ -0,0 +1,3 @@ +HJ24-Shp2_oncogenicProtein 17 58 HJ24-Shp2_oncogenicProtein_17_58_for 41 + GGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGG +HJ24-Shp2_oncogenicProtein2 17 58 HJ24-Shp2_oncogenicProtein2_17_58_for 41 + GGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGG +HJ24-Shp2_oncogenicProtein3 17 58 HJ24-Shp2_oncogenicProtein3_17_58_for 41 + GGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/TestSeqGroup-G4.fasta Wed Jan 25 14:03:40 2023 +0000 @@ -0,0 +1,56 @@ +>PA#2/8 +ATACCAGCTTATTCAATTAGCAACATGAGGGGGATAGAGGGGGTGGGTTCTCTCGGCTACAATCGTAATCAGTTAG +>PA#14/89 +ATACCAGCTTATTCAATTGGGCACCACGGGAGTCGGCCACATTTGGAGTTGTTTTTGCACAATCGTAATCAGTTAG +>PA-C10 +ATACCAGCTTATTCAATTGCGCACCACGGGAGTTGGCCACATTTGGAGTTGTTTTTGCACAATCGTAATCAGTTAG +>PA#4/22 +ATACCAGCTTATTCAATTGCAGTACTGATGAGTGTAGCCGTATGATTATCGTTTGTGGACAATCGTAATCAGTTAG +>PA#4/34 +ATACCAGCTTATTCAATTCCCCAACGAGTCGATATGTAGCCCACACTCTGATTCGTCCACAATCGTAATCAGTTAG +>PA#2/11 +ATACCAGCTTATTCAATTGGAGACGACAAACTATTACGTACTACGGCATGCACTTGGTACAATCGTAATCAGTTAG +>PA-C8 +ATACCAGCTTATTCAATTAGGCCAGATGAGGGGTGCCCATGGCGGGGTGGCTGCTCCAACAATCGTAATCAGTTAG +>PA#14/82 +ATACCAGCTTATTCAATTCCACAACCGAACTCGTAAGACGTATGTAGCCGCCAACTGTACAATCGTAATCAGTTAG +>PA#2/3 +ATACCAGCTTATTCAATTCGACAAGTGGGCATTACGATTCTAGCCCTGATTATGTTCCACAATCGTAATCAGTTAG +>PA-C9 +ATACCAGCTTATTCAATTACCGAGGAGATAACGTTGTAGCCGTCCATCATCTGATTCGACAATCGTAATCAGTTAG +>PA#2/6 +ATACCAGCTTATTCAATTACCGATCACTAGCCGACTAATTGGTTTCCGATCGCAGTCCACAATCGTAATCAGTTAG +>PA-C11 +ATACCAGCTTATTCAATTCGATGGAGCTGATGATTGTTGCCGATCTGACTGTTGTTCCACAATCGTAATCAGTTAG +>PA-C13 +ATACCAGCTTATTCAATTCCCCTAACGTTACTGGATGTAGTCCGACTAACTTATGCGTACAATCGTAATCAGTTAG +>PA-C42 +ATACCAGCTTATTCAATTGCAGATTACGCCTTGTAGCCCGCACTGATCTCGATGTTTGGACAATCGTAATCAGTTAG +>PA-C16 +ATACCAGCTTATTCAATTCCCACGAGTGTAGCCGATTCTTCTGTACTCTTGTCCTCGTACAATCGTAATCAGTTAG +>PA-C15 +ATACCAGCTTATTCAATTACGTGTTGTAGCCGACCCCTGTTGATTGTTTTCCTGTACCACAATCGTAATCAGTTAG +>EA#14.3 +ATACCAGCTTATTCAATTTGAGGCGGGTGGGTGGGTTGAATACGCTGATTACCCCATCGGAGAACGTTAAGGCGCTTCAGATAGTAAGTGCAATCT +>EA#9.4 +ATACCAGCTTATTCAATTGCTGCGAGGTGGGTGGGTGGGAGCAATTGATCCTCGCTTAGCTTCTACGGTGGGCTATCTAGATAGTAAGTGCAATCT +>EA#5.10 +ATACCAGCTTATTCAATTCACCACACCTGCACCCCTGACTTCCCACTTATATCTACTACTCCGTCTCAAGCCCGTTTGAGATAGTAAGTGCAATCT +>EA#14.5 +ATACCAGCTTATTCAATTCCGAGTTTGGGTGGGAGTGGTGGGTTCGGAATTGTTAGTTATTTGGGTTTATGCGAGGTGAGATAGTAAGTGCAATCT +>EA#14.8 +ATACCAGCTTATTCAATTGACGGGGTGTTGTCGTATGCTGTAGAAGCCGTAATTTTTTTTGTTTTCCCTGCCCACCTAGATAGTAAGTGCAATCT +>EA#14.4 +ATACCAGCTTATTCAATTCCACAGGTTGTATGGGGAATAAGGTGGGTGCGCGAGATAGTAAGTGCAATCT +>EA#11.5 +ATACCAGCTTATTCAATTCCCACACCCTAACCGTAGAGCTAAGCTTTTCTTACTACTGACAGTGCTTTACCGTTTGCAAGATAGTAAGTGCAATCT +>HJ24-Shp2_oncogenicProtein +AGCGTCGAATACCACACGGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGGCTAATGGAGCTCGTGGTCAT +>HJ24-Shp2_oncogenicProtein2 +AGCGTCGAATACCACACGGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGGCTAATGGAGCTCGTGGTCAT +>HJ24-Shp2_oncogenicProtein3 +AGCGTCGAATACCACACGGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGGCTAATGGAGCTCGTGGTCAT +>RT6-HIVRT +ATCCGCCTGATTAGCGATACTCAGGCGTTAGGGAAGGGCGTCGAAAGCAGGGTGGGACTTGAGCAAAATCACCTGCAGGGG +>AptG4-HumanRNaseH1 +CGGTCGCTCCGTGTGGCTTGGGTTGGGTGTGGCAGTGAC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test-1.bed Wed Jan 25 14:03:40 2023 +0000 @@ -0,0 +1,3 @@ +mychr 0 4 mychr_0_4_for 4 + ACTG +mychr 5 9 mychr_5_9_for 4 + actg +mychr 10 14 mychr_10_14_rev 4 - CAGT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test-2.bed Wed Jan 25 14:03:40 2023 +0000 @@ -0,0 +1,2 @@ +mychr 0 4 mychr_0_4_for 4 + ACTG +mychr 10 14 mychr_10_14_rev 4 - CAGT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test-3.bed Wed Jan 25 14:03:40 2023 +0000 @@ -0,0 +1,2 @@ +mychr 0 4 mychr_0_4_for 4 + ACTG +mychr 5 9 mychr_5_9_for 4 + actg
