Mercurial > repos > tduigou > domestication
diff domestication.py @ 0:8fd9af4cb080 draft
planemo upload for repository https://github.com/Edinburgh-Genome-Foundry/Examples/blob/master/templates/template1.ipynb commit 6ae809b563b40bcdb6be2e74fe2a84ddad5484ae
| author | tduigou |
|---|---|
| date | Mon, 12 May 2025 10:21:24 +0000 |
| parents | |
| children | f78651af72e4 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/domestication.py Mon May 12 10:21:24 2025 +0000 @@ -0,0 +1,135 @@ +import os +import argparse +import pandas +import zipfile +import genedom +import dnacauldron +import proglog +import shutil +#proglog.notebook() +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq +from Bio import SeqIO + +def domestication(files_to_domestication, csv_file,file_name_mapping, use_file_names_as_id, + allow_edits, output_dom): + + file_to_domestication = files_to_domestication.split(',') + + records_to_domesticate = dnacauldron.biotools.load_records_from_files( + files=file_to_domestication, + folder=None, + use_file_names_as_ids=use_file_names_as_id + ) + + #refine the real record name dict + if isinstance(file_name_mapping, str): + file_name_mapping = dict( + item.split(":") for item in file_name_mapping.split(",") + ) + real_names = { + os.path.splitext(os.path.basename(k))[0]: v.replace(".gb", "") + for k, v in file_name_mapping.items() + } + updated_records = [] + for record in records_to_domesticate: + original_id = record.id + if original_id in real_names: + new_id = real_names[original_id] + record.id = new_id + updated_records.append(record) + ######################################################## + + df=pandas.read_csv(csv_file) + EMMA_PLUS = genedom.GoldenGateDomesticator.standard_from_spreadsheet(dataframe=df) + genedom.batch_domestication( + records=updated_records, + standard=EMMA_PLUS, + allow_edits=allow_edits, + target=output_dom) + + # Check if any names were truncated: + if isinstance(output_dom, str) and output_dom.endswith(".zip"): + with zipfile.ZipFile(output_dom, "r") as zipf: + with zipf.open("order_ids.csv") as f: + order_ids = pandas.read_csv(f) + else: + order_ids = pandas.read_csv(os.path.join(output_dom, "order_ids.csv")) + + any_truncated = False + for index, row in order_ids.iterrows(): + if row["sequence"] != row["order_id"]: + any_truncated = True + print("Changed names:", end=" ") + print(" --> ".join(row)) + if not any_truncated: + print("Part names were not truncated") + + #zip compressing + if os.path.isdir(output_dom): + zip_path = output_dom.rstrip("/\\") + ".zip" + shutil.make_archive(output_dom, 'zip', output_dom) + shutil.move(zip_path, output_dom) + + return output_dom + +def methylation_protection(domestication_target, output_methprot): + + if domestication_target.endswith(".zip"): + extracted_dir = "extracted_genbanks" + with zipfile.ZipFile(domestication_target, "r") as zipf: + os.makedirs(extracted_dir, exist_ok=True) + for member in zipf.namelist(): + if member.startswith("domesticated_genbanks/") and member.endswith(".gb"): + zipf.extract(member, path=extracted_dir) + gb_folder = os.path.join(extracted_dir, "domesticated_genbanks") + else: + gb_folder = os.path.join(domestication_target, "domesticated_genbanks") + + records_to_protect = dnacauldron.biotools.load_records_from_files(folder=gb_folder, use_file_names_as_ids=True) + + for record in records_to_protect: + new_seqrecord = SeqRecord("TTC") + record + SeqRecord("GAA") # these sequences are designed to prevent Dcm recognition site formation + new_seqrecord.id = record.id + new_seqrecord.name = record.name + new_seqrecord.annotations = {"molecule_type": "DNA"} + + target_file = os.path.join(output_methprot, new_seqrecord.id + ".gb") + with open(target_file, "w") as output_handle: + SeqIO.write(new_seqrecord, output_handle, "genbank") + +def parse_command_line_args(): + parser = argparse.ArgumentParser(description="Domestication") + + parser.add_argument("--files_to_domestication", required=True, + help="List of GenBank files (Comma-separated)") + parser.add_argument("--csv_file", required=True, + help="csv file") + parser.add_argument('--file_name_mapping', type=str, + help='Mapping of Galaxy filenames to original filenames') + parser.add_argument("--use_file_names_as_id", type=lambda x: x.lower() == 'true', default=True, + help="Use file names as IDs (True/False)") + parser.add_argument("--allow_edits", type=lambda x: x.lower() == 'true', default=True, + help="Allow sequence edits") + parser.add_argument("--output_dom", required=True, + help="zip output for domestication results") + parser.add_argument("--output_methprot", required=False, + help="gb output for methylation protection") + parser.add_argument("--methylation_protection", type=lambda x: x.lower() == 'true', default=False, + help="Enable methyl protection (true/false)") + + return parser.parse_args() + +if __name__ == "__main__": + args = parse_command_line_args() + + domestication( + args.files_to_domestication, args.csv_file, + args.file_name_mapping, args.use_file_names_as_id, + args.allow_edits, args.output_dom + ) + + if args.methylation_protection: + methylation_protection( + args.output_dom, args.output_methprot + )
