Mercurial > repos > iuc > ena_webin_cli
diff process_input.py @ 0:7f669682f4ac draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ena_webin_cli commit abb15194a196267142d88b9348facf9e85e601ef
| author | iuc |
|---|---|
| date | Mon, 06 Oct 2025 12:13:07 +0000 |
| parents | |
| children | |
line wrap: on
line diff
"""Write one manifest file per FASTA file from a submission receipt.

Command line arguments (positional):

1. path of the receipt file; it is expected to contain a YAML section
   enclosed between two ``YAML -------------`` lines, plus
   ``Study accession details:`` and ``Sample accession details:`` sections
   made of tab-separated ``alias<TAB>accession[...]`` lines
2. path of a JSON file holding the list of FASTA file names
3. directory in which the ``<sample_alias>.manifest.txt`` files are written
4. path of a manifest template whose content is copied at the top of every
   manifest

The paths of all written manifests are listed in ``submit_list.tab`` in the
current working directory.
"""
import json
import os
import sys


def get_section_string(f, start_line, end_line, return_string=False):
    """Return the lines found between two delimiter lines of an open file.

    Lines are consumed from the current position of ``f`` up to and
    including the first line equal to ``start_line``; the following lines
    are then collected until a line equal to ``end_line`` is read (that
    line is consumed but not returned).

    Reaching the end of the file also ends both phases, so a missing
    delimiter yields a shorter (possibly empty) section instead of looping
    forever on the empty strings that ``readline`` returns at EOF.

    :param f: text file object opened for reading
    :param start_line: exact line (newline included) opening the section
    :param end_line: exact line (newline included) closing the section
    :param return_string: join the collected lines into a single string
    :return: list of lines, or one string when ``return_string`` is true
    """
    # consume starting lines, delimiter included
    for line in f:
        if line == start_line:
            break
    # collect the section lines
    section_lines = []
    for line in f:
        if line == end_line:
            break
        section_lines.append(line)
    if return_string:
        return ''.join(section_lines)
    return section_lines


def fill_from_yaml_data(yaml_only_dict, studies_samples_dict):
    """Register the platform of every experiment under its study and sample.

    ``studies_samples_dict`` is updated in place and ends up shaped as
    ``{study_alias: {sample_alias: {'experiments': [{'platform': ...}]}}}``.

    :param yaml_only_dict: mapping with an ``ENA_experiment`` entry whose
        values each provide ``study_alias``, ``sample_alias`` and
        ``platform``
    :param studies_samples_dict: dictionary to fill
    :raises KeyError: if one of the expected keys is missing
    """
    for exp in yaml_only_dict['ENA_experiment'].values():
        # read every field first so that an incomplete experiment does not
        # leave a half-built entry behind
        study_alias = exp['study_alias']
        sample_alias = exp['sample_alias']
        experiment = {'platform': exp['platform']}
        samples = studies_samples_dict.setdefault(study_alias, {})
        sample_entry = samples.setdefault(sample_alias, {'experiments': []})
        sample_entry['experiments'].append(experiment)


def _split_alias_accession(line):
    """Return the first two tab-separated fields of an accession line.

    The trailing newline is removed before splitting, otherwise it would
    stay attached to the accession of a line made of exactly two fields.
    Raises ValueError when the line holds fewer than two fields.
    """
    alias, accession, *_ = line.rstrip('\n').split('\t')
    return alias, accession


def load_receipt_data(input_file_path):
    """Load studies, samples, platforms and accessions from a receipt file.

    :param input_file_path: path of the receipt file
    :return: ``{study_alias: {'accession': ..., sample_alias:
        {'accession': ..., 'experiments': [{'platform': ...}]}}}``; an
        ``accession`` key is only present when the alias was listed in the
        matching accession section of the receipt
    :raises ValueError: if no YAML section is found, or if an accession line
        holds fewer than two tab-separated fields
    """
    # PyYAML is only needed here; importing it locally keeps the
    # stdlib-only helpers of this file importable without it
    import yaml

    # should do some health check of the input file?
    # load yaml section
    loaded_data = {}
    yaml_delimiter = 'YAML -------------\n'
    with open(input_file_path) as input_file:
        yaml_text = get_section_string(
            input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True
        )
    yaml_only_section = yaml.safe_load(yaml_text)
    if not yaml_only_section:
        raise ValueError(f"No YAML section found in receipt file {input_file_path}")
    fill_from_yaml_data(yaml_only_section, loaded_data)

    # accession sections end at the first empty line (or at end of file)
    end_line = '\n'

    # read study accessions
    study_delimiter = 'Study accession details:\n'
    with open(input_file_path) as input_file:
        studies_accession_lines = get_section_string(input_file, start_line=study_delimiter, end_line=end_line)

    for study_line in studies_accession_lines:
        alias, accession = _split_alias_accession(study_line)
        try:
            loaded_data[alias]['accession'] = accession
        except KeyError:
            print(f"Experiment {alias} has unknown study or sample")

    # read sample accessions
    samples_delimiter = 'Sample accession details:\n'
    with open(input_file_path) as input_file:
        samples_accession_lines = get_section_string(input_file, start_line=samples_delimiter, end_line=end_line)

    for sample_line in samples_accession_lines:
        alias, accession = _split_alias_accession(sample_line)
        # the accession is attached to the first study holding this sample
        for samples in loaded_data.values():
            if alias in samples:
                samples[alias]['accession'] = accession
                break

    return loaded_data


def _sample_alias_from_fasta_name(fasta_file):
    """Derive the sample alias by dropping the FASTA suffix of a file name.

    Names ending in '.fasta.gz' lose those 9 characters; any other name
    loses its last 6 characters (the length of '.fasta').
    """
    if fasta_file.endswith('.fasta.gz'):
        return fasta_file[:-9]
    return fasta_file[:-6]


def _write_manifest(manifest_path, manifest_template, sample_alias, platform, study_accession, sample_accession):
    """Write the template content followed by the sample specific fields."""
    with open(manifest_path, "w") as output_handle:
        # dump the contents of manifest template containing global vars
        with open(manifest_template) as m_template:
            output_handle.write(m_template.read())

        output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n")
        output_handle.write("PLATFORM\t" + platform + "\n")
        output_handle.write("STUDY\t" + study_accession + "\n")
        output_handle.write("SAMPLE\t" + sample_accession + "\n")
        output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n")
        # optional companion files are only declared when present in ./fasta
        agp_path = os.path.join("./fasta", sample_alias + ".agp")
        if os.path.exists(agp_path):
            output_handle.write("AGP\t" + sample_alias + ".agp\n")
        chr_list_path = os.path.join("./fasta", sample_alias + ".tsv.gz")
        if os.path.exists(chr_list_path):
            output_handle.write("CHROMOSOME_LIST\t" + sample_alias + ".tsv.gz\n")


def main():
    """Write a manifest for every listed FASTA file that has receipt metadata.

    Reads its four arguments from ``sys.argv`` (receipt file, JSON list of
    FASTA file names, output directory, manifest template) and records the
    path of each written manifest in ``submit_list.tab``. FASTA files whose
    sample alias is unknown to the receipt are reported and skipped.
    """
    input_file_path = sys.argv[1]
    fasta_names_list_path = sys.argv[2]
    out_manifest_base = sys.argv[3]
    manifest_template = sys.argv[4]

    # load submitted data from receipt file
    data_dict = load_receipt_data(input_file_path)

    # iterate over the list of fasta files
    with open(fasta_names_list_path, 'r') as fasta_files_json_file:
        fasta_files_list = json.load(fasta_files_json_file)

    with open('submit_list.tab', 'w') as written_manifests_out:
        for fasta_file in fasta_files_list:
            sample_alias = _sample_alias_from_fasta_name(fasta_file)
            print(f'Processing {sample_alias}')

            # first study that holds this sample, if any
            study_alias = next(
                (study for study, samples in data_dict.items() if sample_alias in samples),
                None,
            )
            if study_alias is None:
                print(f'No metadata found for sample {sample_alias}')
                continue

            sample_accession = data_dict[study_alias][sample_alias]['accession']
            study_accession = data_dict[study_alias]['accession']
            # TODO: get a string that concatenates platform information from multiple experiments
            platform = data_dict[study_alias][sample_alias]['experiments'][0]['platform']
            manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt')

            _write_manifest(
                manifest_path, manifest_template, sample_alias, platform, study_accession, sample_accession
            )
            written_manifests_out.write(manifest_path + '\n')


if __name__ == '__main__':
    main()
