Mercurial > repos > galaxy-australia > alphafold2
annotate validate_fasta.py @ 0:67c179acafdd draft default tip
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
| author | galaxy-australia |
|---|---|
| date | Thu, 03 Mar 2022 02:54:20 +0000 |
| parents | |
| children |
| rev | line source |
|---|---|
|
0
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
1 """Validate input FASTA sequence.""" |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
2 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
3 import re |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
4 import argparse |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
5 from typing import List, TextIO |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
6 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
7 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
class Fasta:
    """A single FASTA record: one header line and its amino-acid sequence."""

    def __init__(self, header_str: str, seq_str: str):
        """Store the header and sequence exactly as given.

        Args:
            header_str: Header text for the record.
            seq_str: Sequence text for the record.
        """
        # Header line of the record.
        self.header = header_str
        # Sequence of the record as one string.
        self.aa_seq = seq_str

    def __repr__(self) -> str:
        """Return a readable representation for logs and debugging."""
        return (
            f"{type(self).__name__}"
            f"(header={self.header!r}, aa_seq={self.aa_seq!r})"
        )
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
12 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
13 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
class FastaLoader:
    """Load sequences from a FASTA file or from a bare-sequence file.

    ``load`` recognises two input flavours:

    * content containing the token ``__cn__`` is treated as pasted text in
      which line breaks and ``>`` arrive escaped as ``__cn__`` and ``__gt__``;
    * anything else is treated as an ordinary file that uses a real newline
      and a literal ``>``.

    Parsed records are collected in ``self.fastas``.
    """

    def __init__(self, fasta_path: str):
        """Parse ``fasta_path`` and print every record that was loaded."""
        self.fastas = []
        self.load(fasta_path)
        print("Loaded FASTA sequences:")
        for f in self.fastas:
            print(f.header)
            print(f.aa_seq)

    def load(self, fasta_path: str):
        """Load bare or FASTA formatted sequence.

        Reads the whole file into ``self.content``, chooses the line and
        header delimiters, splits the content into ``self.lines`` and passes
        every parsed ``(header, sequence)`` pair to ``update_fastas``.
        """
        with open(fasta_path, 'r') as f:
            self.content = f.read()

        if "__cn__" in self.content:
            # Pasted content with escaped characters
            self.newline = '__cn__'
            self.caret = '__gt__'
        else:
            # Uploaded file with normal content
            self.newline = '\n'
            self.caret = '>'

        self.lines = self.content.split(self.newline)
        for header, sequence in self._iter_records():
            self.update_fastas(header, sequence)

    def _iter_records(self):
        """Yield ``(header, sequence)`` pairs parsed from ``self.lines``.

        Either element may be an empty string; ``update_fastas`` decides
        whether a pair becomes a record.
        """
        header, first_chunk = self.interpret_first_line()
        chunks = [first_chunk]
        # Line 0 has already been consumed by interpret_first_line().
        # Re-reading it here would append a header-less first line to the
        # sequence a second time, so parsing resumes at line 1.
        for line in self.lines[1:]:
            if line.startswith(self.caret):
                # A new header closes the record collected so far.
                yield header, ''.join(chunks)
                header = '>' + self.strip_header(line)
                chunks = []
            else:
                chunks.append(line.strip('\n '))

        # After the last line the buffers may still hold a record.
        yield header, ''.join(chunks)

    def interpret_first_line(self):
        """Classify the first line as a header or as the start of a sequence.

        Returns:
            ``(header, '')`` when the line starts with the header marker,
            otherwise ``('', sequence_chunk)`` with surrounding spaces and
            newlines removed, matching how later sequence lines are handled.
        """
        line = self.lines[0]
        if line.startswith(self.caret):
            return '>' + self.strip_header(line), ''
        return '', line.strip('\n ')

    def strip_header(self, line):
        """Strip characters escaped with underscores from pasted text.

        Removes every ``__xx__`` token (two characters between double
        underscores) and then any leading or trailing ``>``.
        """
        return re.sub(r'\_\_.{2}\_\_', '', line).strip('>')

    def update_fastas(self, header: str, sequence: str):
        """Append a record for ``sequence``; do nothing when it is empty.

        A record without a header gets a generic ``>sequence_<n>`` header,
        where ``n`` is the number of records stored so far.
        """
        if not sequence:
            return

        if not header:
            fasta_count = len(self.fastas)
            header = f'>sequence_{fasta_count}'

        self.fastas.append(Fasta(header, sequence))
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
77 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
78 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
class FastaValidator:
    """Check that the parsed input holds exactly one acceptable sequence.

    Every check raises ``Exception`` with a message starting with
    ``Error encountered validating fasta:`` on the first problem it finds.
    """

    def __init__(self, fasta_list: List["Fasta"]):
        """Store the records to validate and set the acceptance limits.

        Args:
            fasta_list: Parsed records; each must expose ``aa_seq``.
        """
        self.fasta_list = fasta_list
        # Inclusive bounds on sequence length, in residues.
        self.min_length = 30
        self.max_length = 2000
        # Characters accepted in a sequence (compared after upper-casing).
        self.iupac_characters = {
            'A', 'B', 'C', 'D', 'E', 'F', 'G',
            'H', 'I', 'K', 'L', 'M', 'N', 'P',
            'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
            'Y', 'Z', '-',
        }

    def validate(self):
        """Run the sequence-count, length and alphabet checks in that order."""
        self.validate_num_seqs()
        self.validate_length()
        self.validate_alphabet()
        # 'X' residues are deliberately not rejected at the moment;
        # alphafold can throw an error if it doesn't like them.
        # validate_x() stays available for callers that want the check.

    def validate_num_seqs(self) -> None:
        """Raise unless the input contains exactly one sequence."""
        count = len(self.fasta_list)
        if count > 1:
            raise Exception(f'Error encountered validating fasta: More than 1 sequence detected ({count}). Please use single fasta sequence as input')
        if count == 0:
            raise Exception('Error encountered validating fasta: input file has no fasta sequences')

    def validate_length(self):
        """Raise unless ``min_length <= len(sequence) <= max_length``.

        The limits quoted in the error messages come from the instance
        attributes, so they always match what is actually enforced.
        """
        fasta = self.fasta_list[0]
        seq_len = len(fasta.aa_seq)
        if seq_len < self.min_length:
            raise Exception(
                'Error encountered validating fasta: Sequence too short '
                f'({seq_len}aa). Must be >= {self.min_length}aa'
            )
        if seq_len > self.max_length:
            raise Exception(
                'Error encountered validating fasta: Sequence too long '
                f'({seq_len}aa). Must be <= {self.max_length}aa'
            )

    def validate_alphabet(self):
        """
        Confirms whether the sequence conforms to IUPAC codes.
        If not, reports the offending character and its 0-based position.
        """
        fasta = self.fasta_list[0]
        for i, char in enumerate(fasta.aa_seq.upper()):
            if char not in self.iupac_characters:
                raise Exception(f'Error encountered validating fasta: Invalid amino acid found at pos {i}: "{char}"')

    def validate_x(self):
        """Raise if any residue is 'X', reporting its 0-based position.

        Not called by ``validate``.
        TODO check whether alphafold accepts X residues.
        """
        fasta = self.fasta_list[0]
        for i, char in enumerate(fasta.aa_seq.upper()):
            if char == 'X':
                raise Exception(f'Error encountered validating fasta: Unsupported aa code "X" found at pos {i}')
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
130 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
131 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
class FastaWriter:
    """Write a single FASTA record to a file with fixed-width sequence lines."""

    def __init__(
        self,
        outfile: str = 'alphafold.fasta',
        formatted_line_len: int = 60,
    ) -> None:
        """Configure the output path and the sequence line width.

        Args:
            outfile: Path of the file that ``write`` creates or overwrites.
            formatted_line_len: Maximum number of characters per sequence
                line in the output.

        Raises:
            ValueError: If ``formatted_line_len`` is less than 1.
        """
        if formatted_line_len < 1:
            raise ValueError('formatted_line_len must be a positive integer')
        self.outfile = outfile
        self.formatted_line_len = formatted_line_len

    def write(self, fasta: 'Fasta') -> None:
        """Write ``fasta`` to ``self.outfile``, replacing any existing file.

        Args:
            fasta: Record exposing ``header`` and ``aa_seq`` string
                attributes.
        """
        # Read the record before opening the file so that a bad record does
        # not leave a truncated output file behind.
        header = fasta.header
        body = self.format_sequence(fasta.aa_seq)
        with open(self.outfile, 'w') as fp:
            fp.write(header + '\n')
            # format_sequence() already terminates every line; appending
            # another newline here would leave a blank line at end of file.
            fp.write(body)

    def format_sequence(self, aa_seq: str) -> str:
        """Wrap ``aa_seq`` into lines of ``self.formatted_line_len`` characters.

        Args:
            aa_seq: Sequence string to wrap.

        Returns:
            The wrapped sequence with every line (including the last)
            terminated by a newline, or an empty string when ``aa_seq`` is
            empty.
        """
        width = self.formatted_line_len
        return ''.join(
            aa_seq[start:start + width] + '\n'
            for start in range(0, len(aa_seq), width)
        )
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
149 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
150 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
def main():
    """Load the input FASTA, validate it, and write out its first record."""
    cli_args = parse_args()
    loader = FastaLoader(cli_args.input_fasta)

    # Run validation over every loaded record before anything is written.
    FastaValidator(loader.fastas).validate()

    # Only the first loaded record is written to the cleaned output file.
    FastaWriter().write(loader.fastas[0])
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
163 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
164 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the arguments from ``sys.argv``.

    Returns:
        Namespace carrying the ``input_fasta`` path as a string.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_fasta",
        help="input fasta file",
        type=str,
    )
    return parser.parse_args(argv)
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
173 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
174 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
175 |
|
67c179acafdd
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty"
galaxy-australia
parents:
diff
changeset
|
# Run the validator only when this file is executed as a script.
if __name__ == "__main__":
    main()
