Mercurial > repos > iuc > ena_webin_cli
comparison process_input.py @ 0:7f669682f4ac draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ena_webin_cli commit abb15194a196267142d88b9348facf9e85e601ef
| author | iuc |
|---|---|
| date | Mon, 06 Oct 2025 12:13:07 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7f669682f4ac |
|---|---|
| 1 import json | |
| 2 import os | |
| 3 import sys | |
| 4 | |
| 5 import yaml | |
| 6 | |
| 7 | |
def get_section_string(f, start_line, end_line, return_string=False):
    """Return the lines found between two delimiter lines of an open file.

    Reading starts at the current position of ``f``. Lines are discarded up to
    and including the first one equal to ``start_line``; the lines that follow
    are collected until one equal to ``end_line`` is read. Neither delimiter
    is part of the result.

    Reaching the end of the file ends either phase as well, so a missing
    delimiter yields a shorter (possibly empty) section instead of looping
    forever on the empty strings ``readline`` keeps returning at EOF.

    Args:
        f: object exposing ``readline()`` that returns an empty value at EOF
            (for example a file opened in text mode).
        start_line: complete line, newline included, that opens the section.
        end_line: complete line, newline included, that closes the section.
        return_string: when true, join the section into a single string.

    Returns:
        The section as a list of lines, or as one string when
        ``return_string`` is true.
    """
    # Consume everything up to and including the start delimiter.
    line = f.readline()
    while line and line != start_line:
        line = f.readline()

    # Collect the section body; an empty read means EOF, not a blank line
    # (blank lines are read as '\n').
    section_lines = []
    line = f.readline()
    while line and line != end_line:
        section_lines.append(line)
        line = f.readline()

    if return_string:
        return ''.join(section_lines)
    return section_lines
| 18 | |
| 19 | |
def fill_from_yaml_data(yaml_only_dict, studies_samples_dict):
    """Record the platform of every experiment under its study and sample.

    For each entry of ``yaml_only_dict['ENA_experiment']`` a
    ``{'platform': ...}`` record is appended to
    ``studies_samples_dict[study_alias][sample_alias]['experiments']``; the
    study and sample entries are created when they do not exist yet.

    ``studies_samples_dict`` is modified in place and nothing is returned.
    """
    for experiment in yaml_only_dict['ENA_experiment'].values():
        study_alias = experiment['study_alias']
        sample_alias = experiment['sample_alias']

        # First experiment seen for this study: create study and sample.
        if study_alias not in studies_samples_dict:
            studies_samples_dict[study_alias] = {
                sample_alias: {'experiments': [{'platform': experiment['platform']}]}
            }
            continue

        samples_of_study = studies_samples_dict[study_alias]

        # Known study, first experiment seen for this sample.
        if sample_alias not in samples_of_study:
            samples_of_study[sample_alias] = {'experiments': [{'platform': experiment['platform']}]}
            continue

        # Known study and sample: just add one more experiment.
        samples_of_study[sample_alias]['experiments'].append({'platform': experiment['platform']})
| 34 | |
| 35 | |
def _read_accession_rows(input_file_path, section_header):
    """Return the ``(alias, accession)`` pairs listed under ``section_header``.

    The section runs from the line equal to ``section_header`` up to the next
    blank line. Rows are tab-separated; columns after the second are ignored.
    """
    with open(input_file_path) as input_file:
        section_lines = get_section_string(input_file, start_line=section_header, end_line='\n')

    rows = []
    for line in section_lines:
        # Drop the line ending before splitting: on a row with exactly two
        # columns it would otherwise stay glued to the accession.
        line = line.rstrip('\n')
        if not line:
            continue
        alias, accession, *_ = line.split('\t')
        rows.append((alias, accession))
    return rows


def load_receipt_data(input_file_path):
    """Load studies, samples and experiments from a submission receipt file.

    The file is read three times, once per section of interest:

    * the YAML document enclosed between two ``YAML -------------`` lines,
      whose ``ENA_experiment`` entries give the platform of each experiment;
    * the rows following ``Study accession details:``;
    * the rows following ``Sample accession details:``.

    Args:
        input_file_path: path of the receipt file.

    Returns:
        A dict shaped as ``{study_alias: {'accession': str, sample_alias:
        {'accession': str, 'experiments': [{'platform': ...}, ...]}}}``.
        An ``'accession'`` key is only present when the corresponding alias
        appears in the matching accession section.

    Raises:
        ValueError: if a non-empty accession row has fewer than two
            tab-separated columns.
    """
    # should do some health check of the input file?
    loaded_data = {}

    # load yaml section
    yaml_delimiter = 'YAML -------------\n'
    with open(input_file_path) as input_file:
        yaml_only_section = yaml.safe_load(
            get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True)
        )
    fill_from_yaml_data(yaml_only_section, loaded_data)

    # read study accessions
    for alias, accession in _read_accession_rows(input_file_path, 'Study accession details:\n'):
        try:
            loaded_data[alias]['accession'] = accession
        except KeyError:
            # The alias has no experiment in the YAML section, so there is
            # no entry to attach the accession to.
            print(f"Experiment {alias} has unknown study or sample")

    # read sample accessions; a sample is attached to the first study that
    # lists its alias
    for alias, accession in _read_accession_rows(input_file_path, 'Sample accession details:\n'):
        for samples_of_study in loaded_data.values():
            if alias in samples_of_study:
                samples_of_study[alias]['accession'] = accession
                break

    return loaded_data
| 73 | |
| 74 | |
def main():
    """Write one manifest per FASTA file for which the receipt has metadata.

    Positional command-line arguments:

    1. receipt file, parsed with ``load_receipt_data``;
    2. JSON file holding the list of FASTA file names;
    3. directory in which the ``<sample_alias>.manifest.txt`` files go;
    4. manifest template, copied verbatim at the top of every manifest.

    The path of every manifest written is also appended, one per line, to
    ``submit_list.tab`` in the current working directory. FASTA files whose
    sample alias is unknown to the receipt are reported on stdout and skipped.
    """
    receipt_path = sys.argv[1]
    fasta_list_path = sys.argv[2]
    manifest_dir = sys.argv[3]
    template_path = sys.argv[4]

    # load submitted data from receipt file
    submitted = load_receipt_data(receipt_path)

    with open(fasta_list_path, 'r') as fasta_list_handle:
        fasta_names = json.load(fasta_list_handle)

    with open('submit_list.tab', 'w') as manifest_index:
        for fasta_name in fasta_names:
            # '.fasta.gz' is 9 characters long; any other name simply loses
            # its last 6 characters (the length of '.fasta').
            suffix_length = 9 if fasta_name.endswith('.fasta.gz') else 6
            sample_alias = fasta_name[:-suffix_length]

            print(f'Processing {sample_alias}')

            for study_entry in submitted.values():
                if sample_alias not in study_entry:
                    continue

                sample_entry = study_entry[sample_alias]
                sample_accession = sample_entry['accession']
                study_accession = study_entry['accession']
                # TODO: get a string that concatenates platform information from multiple experiments
                platform = sample_entry['experiments'][0]['platform']
                manifest_path = os.path.join(manifest_dir, sample_alias + '.manifest.txt')

                with open(manifest_path, "w") as manifest:
                    # start with the template, which carries the global vars
                    with open(template_path) as template:
                        manifest.write(template.read())

                    mandatory_fields = (
                        ("ASSEMBLYNAME", "consensus_" + sample_alias),
                        ("PLATFORM", platform),
                        ("STUDY", study_accession),
                        ("SAMPLE", sample_accession),
                        ("FASTA", sample_alias + '.fasta.gz'),
                    )
                    for field, value in mandatory_fields:
                        manifest.write(field + "\t" + value + "\n")

                    # companion files are referenced only when present in ./fasta
                    optional_fields = (
                        ("AGP", ".agp"),
                        ("CHROMOSOME_LIST", ".tsv.gz"),
                    )
                    for field, extension in optional_fields:
                        companion_name = sample_alias + extension
                        if os.path.exists(os.path.join("./fasta", companion_name)):
                            manifest.write(field + "\t" + companion_name + "\n")

                manifest_index.write(manifest_path + '\n')
                # only the first study listing the sample is used
                break
            else:
                # no study listed this sample alias
                print(f'No metadata found for sample {sample_alias}')
| 129 | |
| 130 | |
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
