# HG changeset patch # User galaxy-australia # Date 1646276060 0 # Node ID 67c179acafdd4c9efafd0400bfbc6e4ae67a32a3 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86-dirty" diff -r 000000000000 -r 67c179acafdd README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Thu Mar 03 02:54:20 2022 +0000 @@ -0,0 +1,164 @@ +Alphafold compute setup +======================= + +Overview +-------- + +Alphafold requires a customised compute environment to run. The machine +needs a GPU, and access to a 2.2 Tb reference data store. + +This document is designed to provide details on the compute environment +required for Alphafold operation, and the Galaxy job destination +settings to run the wrapper. + +For full details on Alphafold requirements, see +https://github.com/deepmind/alphafold. + +HARDWARE +~~~~~~~~ + +The machine is recommended to have the following specs: - 12 cores - 80 +Gb RAM - 2.5 Tb storage - A fast Nvidia GPU. + +As a minimum, the Nvidia GPU must have 8Gb RAM. It also requires +**unified memory** to be switched on. Unified memory is usually enabled +by default, but some HPC systems will turn it off so the GPU can be +shared between multiple jobs concurrently. + +ENVIRONMENT +~~~~~~~~~~~ + +This wrapper runs Alphafold as a singularity container. The following +software are needed: + +- `Singularity `_ +- `NVIDIA Container + Toolkit `_ + +As Alphafold uses an Nvidia GPU, the NVIDIA Container Toolkit is needed. +This makes the GPU available inside the running singularity container. + +To check that everything has been set up correctly, run the following + +:: + + singularity run --nv docker://nvidia/cuda:11.0-base nvidia-smi + +If you can see something similar to this output (details depend on your +GPU), it has been set up correctly. 
+ +:: + + +-----------------------------------------------------------------------------+ + | NVIDIA-SMI 470.57.02 Driver Version: 470.57.02 CUDA Version: 11.4 | + |-------------------------------+----------------------+----------------------+ + | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | + | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | + | | | MIG M. | + |===============================+======================+======================| + | 0 Tesla T4 Off | 00000000:00:05.0 Off | 0 | + | N/A 49C P0 28W / 70W | 0MiB / 15109MiB | 0% Default | + | | | N/A | + +-------------------------------+----------------------+----------------------+ + + +-----------------------------------------------------------------------------+ + | Processes: | + | GPU GI CI PID Type Process name GPU Memory | + | ID ID Usage | + |=============================================================================| + | No running processes found | + +-----------------------------------------------------------------------------+ + +REFERENCE DATA +~~~~~~~~~~~~~~ + +Alphafold needs reference data to run. The wrapper expects this data to +be present at ``/data/alphafold_databases``. To download, run the +following shell script command in the tool directory. + +:: + + # make folders if needed + mkdir /data /data/alphafold_databases + + # download ref data + bash scripts/download_all_data.sh /data/alphafold_databases + +This will install the reference data to ``/data/alphafold_databases``. 
+To check this has worked, ensure the final folder structure is as +follows: + +:: + + data/alphafold_databases + ├── bfd + │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata + │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex + │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata + │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex + │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata + │   └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex + ├── mgnify + │   └── mgy_clusters_2018_12.fa + ├── params + │   ├── LICENSE + │   ├── params_model_1.npz + │   ├── params_model_1_ptm.npz + │   ├── params_model_2.npz + │   ├── params_model_2_ptm.npz + │   ├── params_model_3.npz + │   ├── params_model_3_ptm.npz + │   ├── params_model_4.npz + │   ├── params_model_4_ptm.npz + │   ├── params_model_5.npz + │   └── params_model_5_ptm.npz + ├── pdb70 + │   ├── md5sum + │   ├── pdb70_a3m.ffdata + │   ├── pdb70_a3m.ffindex + │   ├── pdb70_clu.tsv + │   ├── pdb70_cs219.ffdata + │   ├── pdb70_cs219.ffindex + │   ├── pdb70_hhm.ffdata + │   ├── pdb70_hhm.ffindex + │   └── pdb_filter.dat + ├── pdb_mmcif + │   ├── mmcif_files + │   └── obsolete.dat + ├── uniclust30 + │   └── uniclust30_2018_08 + └── uniref90 + └── uniref90.fasta + +JOB DESTINATION +~~~~~~~~~~~~~~~ + +Alphafold needs a custom singularity job destination to run. The +destination needs to be configured for singularity, and some extra +singularity params need to be set as seen below. + +Specify the job runner. For example, a local runner + +:: + + + +Customise the job destination with required singularity settings. The +settings below are mandatory, but you may include other settings as +needed. 
+ +:: + + + 'none' + true + --nv + "$job_directory:ro,$tool_directory:ro,$job_directory/outputs:rw,$working_directory:rw,/data/alphafold_databases:/data:ro" + + +Closing +~~~~~~~ + +If you are experiencing technical issues, feel free to write to +help@genome.edu.au. We may be able to provide advice on setting up +Alphafold on your compute environment. diff -r 000000000000 -r 67c179acafdd alphafold.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alphafold.html Thu Mar 03 02:54:20 2022 +0000 @@ -0,0 +1,656 @@ + + + + + + + + + Alphafold structure prediction + + + + + + + + + + + + + + +

Alphafold structure prediction

+ +
+
+
+ +
+ +
+ Select a representation to display +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+

+ Scroll up/down + to zoom in and out +

+

+ Click + drag + to rotate the structure +

+

+ CTRL + click + drag + to move the structure +

+

+ Click + an atom to bring it into focus +

+
+ +
+
+
+
+
<50
+
70
+
90+
+
+
+ +
+

+ + Alphafold produces a + + per-residue confidence score (pLDDT) + + between 0 and 100. Some regions below 50 pLDDT may be + unstructured in isolation. + +

+
+
+
+
+ +
+
+

Select model

+

The top five structures predicted by Alphafold

+
+ + + + + + + + + +
+
+ +
+

Toggle representations

+
+ + + + + + + +
+
+ +
+

Actions

+
+ + + +
+
+ +
+

Download

+
+ + + +
+
+
+
+ + + + + + diff -r 000000000000 -r 67c179acafdd alphafold.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alphafold.xml Thu Mar 03 02:54:20 2022 +0000 @@ -0,0 +1,250 @@ + + Alphafold v2.0: AI-guided 3D structure prediction of proteins + + 2.0.0 + 0 + + + topic_0082 + + + operation_0474 + + + alphafold_2.0 + + + neoformit/alphafold:latest + + input.fasta && +#end if + +python3 '$__tool_directory__/validate_fasta.py' input.fasta && + +## env vars ------------------------------- +export TF_FORCE_UNIFIED_MEMORY=1 && +export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 && +export DATE=`date +"%Y-%m-%d"` && + +## run alphafold ------------------------- +python /app/alphafold/run_alphafold.py +--fasta_paths alphafold.fasta +--output_dir output +--data_dir \${ALPHAFOLD_DB:-/data} +--uniref90_database_path \${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta +--mgnify_database_path \${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2018_12.fa +--pdb70_database_path \${ALPHAFOLD_DB:-/data}/pdb70/pdb70 +--template_mmcif_dir \${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files +--obsolete_pdbs_path \${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat +--max_template_date=\$DATE +--bfd_database_path \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt +--uniclust30_database_path \${ALPHAFOLD_DB:-/data}/uniclust30/uniclust30_2018_08/uniclust30_2018_08 +--use_gpu_relax=True +&& + +## Uncomment for "dummy run" - skip alphafold run and read output from test-data +## cp -r '$__tool_directory__/output' . 
&& + +## Generate additional outputs ------------ +python3 '$__tool_directory__/gen_extra_outputs.py' output/alphafold $output_plddts && + +## HTML output +mkdir -p '${ html.files_path }' && +cp '$__tool_directory__/alphafold.html' '${html}' && +cp output/alphafold/ranked_*.pdb '${html.files_path}' + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + (output_plddts) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + `_ and `SeeSAR `_, but many `free and open-source options `_ are available such as `AutoDock `_ and `SwissDock `_. + + *Expected run times* + + .. image:: https://github.com/usegalaxy-au/galaxy-local-tools/blob/1a8d3e8daa7ccc5a345ca377697735ab95ed0666/tools/alphafold/static/img/alphafold_runtime_graph.png?raw=true + :height: 520 + :alt: Run time graph + + | + | In general, we observe a quadratic relationship between sequence length and time to fold. + | Once your job begins, a sequence of 50aa will take approximately 1hr to complete, while a sequence of 2000aa will take about 18hrs. + | + + **Input** + + *Amino acid sequence* + + | AlphaFold accepts a **single amino acid sequence** in FASTA format. + | You can choose to input either a file from your Galaxy history or paste a sequence into a text box. + | Please paste only a single sequence - we can only process a single sequence per job. + | Multiple sequences will return an error. + | + + **Outputs** + + *Visualization* + + | An interactive 3D graphic of the best predicted molecular structures. + | This output can be opened in Galaxy to give a visual impression of the results, with different structural representations to choose from. + | Open the "Visualization" history output by clicking on the "view data" icon: + | + + .. 
"""Generate additional outputs from an AlphaFold output directory.

Reads ``ranking_debug.json`` and the ``result_model_N.pkl`` files found in
the given work directory and writes, into that same directory:

* ``model_confidence_scores.tsv`` - one confidence score per ranked model
* ``plddts.tsv`` - per-residue pLDDT scores per ranked model
  (only when ``--plddts`` is given)
"""

import argparse
import json
import pickle
from typing import Any, Dict, List, Optional, Sequence


class Settings:
    """parses then keeps track of program settings"""

    def __init__(self) -> None:
        # alphafold output directory; filled in by parse_settings()
        self.workdir: Optional[str] = None
        # not read anywhere in this module; kept so existing users of the
        # attribute keep working
        self.output_confidence_scores = True
        self.output_residue_scores = False

    def parse_settings(self, argv: Optional[Sequence[str]] = None) -> None:
        """Fill the settings from command-line arguments.

        Args:
            argv: argument list to parse. ``None`` (the default) makes
                argparse read ``sys.argv[1:]``, as before.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "workdir",
            help="alphafold output directory",
            type=str
        )
        parser.add_argument(
            "-p",
            "--plddts",
            help="output per-residue confidence scores (pLDDTs)",
            action="store_true"
        )
        args = parser.parse_args(argv)
        # paths are built as f'{workdir}/name', so drop any trailing slash
        self.workdir = args.workdir.rstrip('/')
        self.output_residue_scores = args.plddts


class ExecutionContext:
    """uses program settings to get paths to files etc"""

    def __init__(self, settings: Settings):
        self.settings = settings

    @property
    def ranking_debug(self) -> str:
        """Path of the ranking_debug.json file inside workdir."""
        return f'{self.settings.workdir}/ranking_debug.json'

    @property
    def model_pkls(self) -> List[str]:
        """Paths of result_model_1.pkl .. result_model_5.pkl inside workdir."""
        return [f'{self.settings.workdir}/result_model_{i}.pkl'
                for i in range(1, 6)]

    @property
    def model_conf_score_output(self) -> str:
        """Path of the model confidence score table to write."""
        return f'{self.settings.workdir}/model_confidence_scores.tsv'

    @property
    def plddt_output(self) -> str:
        """Path of the per-residue pLDDT table to write."""
        return f'{self.settings.workdir}/plddts.tsv'


class FileLoader:
    """loads file data for use by other classes"""

    def __init__(self, context: ExecutionContext):
        self.context = context

    def get_model_mapping(self) -> Dict[str, int]:
        """Map each model name in ranking_debug 'order' to its 1-based rank."""
        data = self.load_ranking_debug()
        return {name: rank
                for rank, name in enumerate(data['order'], start=1)}

    def get_conf_scores(self) -> Dict[str, float]:
        """Return the ranking_debug 'plddts' values rounded to 2 decimals."""
        data = self.load_ranking_debug()
        return {name: float(f'{score:.2f}')
                for name, score in data['plddts'].items()}

    def load_ranking_debug(self) -> Dict[str, Any]:
        """Parse and return the content of ranking_debug.json."""
        with open(self.context.ranking_debug, 'r', encoding='utf-8') as fp:
            return json.load(fp)

    def get_model_plddts(self) -> Dict[str, List[float]]:
        """Return per-residue pLDDTs keyed by 'model_N'.

        N is the 1-based position of the pickle in ``context.model_pkls``;
        values are the pickle's 'plddt' entries rounded to 2 decimals.
        """
        plddts: Dict[str, List[float]] = {}
        # iterate the path list itself rather than a hard-coded range(5)
        for i, pklfile in enumerate(self.context.model_pkls, start=1):
            # NOTE(security): unpickling can execute arbitrary code. Only
            # point this tool at result files from a trusted AlphaFold run.
            with open(pklfile, 'rb') as fp:
                data = pickle.load(fp)
            plddts[f'model_{i}'] = [float(f'{x:.2f}') for x in data['plddt']]
        return plddts


class OutputGenerator:
    """generates the output data we are interested in creating"""

    def __init__(self, loader: FileLoader):
        self.loader = loader

    def gen_conf_scores(self) -> Dict[str, float]:
        """Return confidence scores keyed by ranked model name.

        Keys are 'model_<rank>' (rank taken from ranking_debug 'order');
        entries are inserted from highest to lowest score.
        """
        mapping = self.loader.get_model_mapping()
        scores = self.loader.get_conf_scores()
        ranked = sorted(
            scores.items(), key=lambda item: item[1], reverse=True)
        return {f'model_{mapping[name]}': score
                for name, score in ranked}

    def gen_residue_scores(self) -> Dict[str, List[float]]:
        """Return per-residue pLDDT lists keyed by 'model_<rank>'."""
        mapping = self.loader.get_model_mapping()
        model_plddts = self.loader.get_model_plddts()
        return {f'model_{mapping[name]}': plddts
                for name, plddts in model_plddts.items()}


class OutputWriter:
    """writes generated data to files"""

    def __init__(self, context: ExecutionContext):
        self.context = context

    def write_conf_scores(self, data: Dict[str, float]) -> None:
        """Write one '<model>\\t<score>' line per entry, in dict order."""
        outfile = self.context.model_conf_score_output
        with open(outfile, 'w', encoding='utf-8') as fp:
            fp.writelines(
                f'{model}\t{score}\n' for model, score in data.items())

    def write_residue_scores(self, data: Dict[str, List[float]]) -> None:
        """Write one '<model>\\t<comma-separated pLDDTs>' line per model.

        Lines are ordered by model name.
        """
        outfile = self.context.plddt_output
        with open(outfile, 'w', encoding='utf-8') as fp:
            for model, plddts in sorted(data.items()):
                plddt_str = ','.join(str(x) for x in plddts)
                fp.write(f'{model}\t{plddt_str}\n')


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run the tool.

    Args:
        argv: argument list to parse; ``None`` (default) uses the process
            command line.
    """
    # setup
    settings = Settings()
    settings.parse_settings(argv)
    context = ExecutionContext(settings)
    loader = FileLoader(context)

    # generate & write outputs
    generator = OutputGenerator(loader)
    writer = OutputWriter(context)

    # confidence scores
    conf_scores = generator.gen_conf_scores()
    writer.write_conf_scores(conf_scores)

    # per-residue plddts
    if settings.output_residue_scores:
        residue_scores = generator.gen_residue_scores()
        writer.write_residue_scores(residue_scores)


if __name__ == '__main__':
    main()
"""Validate input FASTA sequence."""

import re
import argparse
from typing import List, Optional, Sequence, TextIO


class Fasta:
    """A single FASTA record: header line plus amino-acid sequence."""

    def __init__(self, header_str: str, seq_str: str):
        self.header = header_str
        self.aa_seq = seq_str


class FastaLoader:
    """Load FASTA records from a file into ``self.fastas``."""

    def __init__(self, fasta_path: str):
        """Initialize from FASTA file."""
        self.fastas: List[Fasta] = []
        self.load(fasta_path)
        print("Loaded FASTA sequences:")
        for f in self.fastas:
            print(f.header)
            print(f.aa_seq)

    def load(self, fasta_path: str):
        """Load bare or FASTA formatted sequence.

        Content containing ``__cn__`` is treated as pasted text in which
        newlines and ``>`` arrive escaped as ``__cn__`` / ``__gt__``;
        anything else is treated as a normal file.
        """
        with open(fasta_path, 'r') as f:
            self.content = f.read()

        if "__cn__" in self.content:
            # Pasted content with escaped characters
            self.newline = '__cn__'
            self.caret = '__gt__'
        else:
            # Uploaded file with normal content
            self.newline = '\n'
            self.caret = '>'

        self.lines = self.content.split(self.newline)

        # Start with empty buffers and visit every line exactly once.
        # Seeding the sequence buffer with the first line and then looping
        # from index 0 counted a bare sequence's first line twice.
        header = ''
        chunks: List[str] = []
        for line in self.lines:
            if line.startswith(self.caret):
                self.update_fastas(header, ''.join(chunks))
                header = '>' + self.strip_header(line)
                chunks = []
            else:
                # strip() also removes '\r' left behind by CRLF line endings
                chunks.append(line.strip())

        # after reading whole file, header & sequence buffers might be full
        self.update_fastas(header, ''.join(chunks))

    def interpret_first_line(self):
        """Return ``(header, sequence)`` as implied by the first line alone.

        No longer called by ``load``; kept so existing callers keep working.
        """
        line = self.lines[0]
        if line.startswith(self.caret):
            header = '>' + self.strip_header(line)
            return header, ''
        return '', line

    def strip_header(self, line):
        """Strip characters escaped with underscores from pasted text."""
        return re.sub(r'\_\_.{2}\_\_', '', line).strip('>')

    def update_fastas(self, header: str, sequence: str):
        """Append a record if ``sequence`` is non-empty.

        A generic ``>sequence_<n>`` header is used when ``header`` is empty,
        where ``n`` is the number of records stored so far.
        """
        # if we have a sequence
        if sequence:
            # create generic header if not exists
            if not header:
                fasta_count = len(self.fastas)
                header = f'>sequence_{fasta_count}'

            # Create new Fasta
            self.fastas.append(Fasta(header, sequence))


class FastaValidator:
    """Check that a list of records is one sequence of valid length/alphabet."""

    def __init__(self, fasta_list: List[Fasta]):
        self.fasta_list = fasta_list
        self.min_length = 30
        self.max_length = 2000
        self.iupac_characters = {
            'A', 'B', 'C', 'D', 'E', 'F', 'G',
            'H', 'I', 'K', 'L', 'M', 'N', 'P',
            'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
            'Y', 'Z', '-'
        }

    def validate(self):
        """performs fasta validation"""
        self.validate_num_seqs()
        self.validate_length()
        self.validate_alphabet()
        # not checking for 'X' residues at the moment.
        # alphafold can throw an error if it doesn't like it.

    def validate_num_seqs(self) -> None:
        """Raise unless exactly one sequence was loaded."""
        if len(self.fasta_list) > 1:
            raise Exception(f'Error encountered validating fasta: More than 1 sequence detected ({len(self.fasta_list)}). Please use single fasta sequence as input')
        elif len(self.fasta_list) == 0:
            raise Exception(f'Error encountered validating fasta: input file has no fasta sequences')

    def validate_length(self):
        """Confirms whether sequence length is valid.

        Lengths from ``min_length`` to ``max_length`` inclusive are accepted.
        """
        fasta = self.fasta_list[0]
        seq_len = len(fasta.aa_seq)
        # messages state the inclusive bounds that are actually enforced
        if seq_len < self.min_length:
            raise Exception(f'Error encountered validating fasta: Sequence too short ({seq_len}aa). Must be >= {self.min_length}aa')
        if seq_len > self.max_length:
            raise Exception(f'Error encountered validating fasta: Sequence too long ({seq_len}aa). Must be <= {self.max_length}aa')

    def validate_alphabet(self):
        """
        Confirms whether the sequence conforms to IUPAC codes.
        If not, reports the offending character and its position.
        """
        fasta = self.fasta_list[0]
        for i, char in enumerate(fasta.aa_seq.upper()):
            if char not in self.iupac_characters:
                raise Exception(f'Error encountered validating fasta: Invalid amino acid found at pos {i}: "{char}"')

    def validate_x(self):
        """checks if any residues are X. TODO check whether alphafold accepts X residues. """
        fasta = self.fasta_list[0]
        for i, char in enumerate(fasta.aa_seq.upper()):
            if char == 'X':
                raise Exception(f'Error encountered validating fasta: Unsupported aa code "X" found at pos {i}')


class FastaWriter:
    """Write a record to ``self.outfile`` with fixed-width sequence lines."""

    def __init__(self) -> None:
        self.outfile = 'alphafold.fasta'
        self.formatted_line_len = 60

    def write(self, fasta: Fasta):
        """Write header and wrapped sequence, each followed by a newline."""
        with open(self.outfile, 'w') as fp:
            header = fasta.header
            seq = self.format_sequence(fasta.aa_seq)
            fp.write(header + '\n')
            fp.write(seq + '\n')

    def format_sequence(self, aa_seq: str):
        """Split ``aa_seq`` into newline-terminated lines of fixed width."""
        width = self.formatted_line_len
        return ''.join(
            aa_seq[i: i + width] + '\n'
            for i in range(0, len(aa_seq), width)
        )


def main(argv: Optional[Sequence[str]] = None):
    """Load, validate and re-write the input FASTA.

    Args:
        argv: argument list to parse; ``None`` (default) uses the process
            command line.
    """
    # load fasta file
    args = parse_args(argv)
    fas = FastaLoader(args.input_fasta)

    # validate
    fv = FastaValidator(fas.fastas)
    fv.validate()

    # write cleaned version
    fw = FastaWriter()
    fw.write(fas.fastas[0])


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments (``argv=None`` reads ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_fasta",
        help="input fasta file",
        type=str
    )
    return parser.parse_args(argv)


if __name__ == '__main__':
    main()