Mercurial > repos > tduigou > domestication
view domestication.py @ 10:c52967fb95da draft default tip
planemo upload for repository https://github.com/Edinburgh-Genome-Foundry/Examples/blob/master/templates/template1.ipynb commit 6ae809b563b40bcdb6be2e74fe2a84ddad5484ae
| author | tduigou |
|---|---|
| date | Tue, 27 May 2025 08:08:21 +0000 |
| parents | f78651af72e4 |
| children |
line wrap: on
line source
import os import argparse import pandas import zipfile import genedom import dnacauldron import proglog import shutil #proglog.notebook() from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq from Bio import SeqIO def domestication(files_to_domestication, csv_file,file_name_mapping, use_file_names_as_id, allow_edits, output_dom, gb_dom): file_to_domestication = files_to_domestication.split(',') records_to_domesticate = dnacauldron.biotools.load_records_from_files( files=file_to_domestication, folder=None, use_file_names_as_ids=use_file_names_as_id ) #refine the real record name dict if isinstance(file_name_mapping, str): file_name_mapping = dict( item.split(":") for item in file_name_mapping.split(",") ) real_names = { os.path.splitext(os.path.basename(k))[0]: v.replace(".gb", "") for k, v in file_name_mapping.items() } updated_records = [] for record in records_to_domesticate: original_id = record.id if original_id in real_names: new_id = real_names[original_id] record.id = new_id updated_records.append(record) ######################################################## df=pandas.read_csv(csv_file) EMMA_PLUS = genedom.GoldenGateDomesticator.standard_from_spreadsheet(dataframe=df) genedom.batch_domestication( records=updated_records, standard=EMMA_PLUS, allow_edits=allow_edits, target=output_dom) # Check if any names were truncated: if isinstance(output_dom, str) and output_dom.endswith(".zip"): with zipfile.ZipFile(output_dom, "r") as zipf: with zipf.open("order_ids.csv") as f: order_ids = pandas.read_csv(f) else: order_ids = pandas.read_csv(os.path.join(output_dom, "order_ids.csv")) any_truncated = False for index, row in order_ids.iterrows(): if row["sequence"] != row["order_id"]: any_truncated = True print("Changed names:", end=" ") print(" --> ".join(row)) if not any_truncated: print("Part names were not truncated") #zip compressing if os.path.isdir(output_dom): zip_path = output_dom.rstrip("/\\") + ".zip" shutil.make_archive(output_dom, 'zip', output_dom) shutil.move(zip_path, output_dom) temp_extract_dir = 'temp_extracted' os.makedirs(temp_extract_dir, exist_ok=True) with zipfile.ZipFile(output_dom, 'r') as zip_ref: zip_ref.extractall(temp_extract_dir) # Navigate into domesticated_genbanks and copy .gb files dom_gb_dir = os.path.join(temp_extract_dir, 'domesticated_genbanks') if os.path.isdir(dom_gb_dir): for file in os.listdir(dom_gb_dir): if file.endswith('.gb'): shutil.copy(os.path.join(dom_gb_dir, file), gb_dom) #print(f"Copied: {file}") else: print("Folder 'domesticated_genbanks' not found inside zip.") # Optional: clean up shutil.rmtree(temp_extract_dir) return output_dom def methylation_protection(domestication_target, output_methprot): if domestication_target.endswith(".zip"): extracted_dir = "extracted_genbanks" with zipfile.ZipFile(domestication_target, "r") as zipf: os.makedirs(extracted_dir, exist_ok=True) for member in zipf.namelist(): if member.startswith("domesticated_genbanks/") and member.endswith(".gb"): zipf.extract(member, path=extracted_dir) gb_folder = os.path.join(extracted_dir, "domesticated_genbanks") else: gb_folder = os.path.join(domestication_target, "domesticated_genbanks") records_to_protect = dnacauldron.biotools.load_records_from_files(folder=gb_folder, use_file_names_as_ids=True) for record in records_to_protect: new_seqrecord = SeqRecord("TTC") + record + SeqRecord("GAA") # these sequences are designed to prevent Dcm recognition site formation new_seqrecord.id = record.id new_seqrecord.name = record.name new_seqrecord.annotations = {"molecule_type": "DNA"} target_file = os.path.join(output_methprot, new_seqrecord.id + ".gb") with open(target_file, "w") as output_handle: SeqIO.write(new_seqrecord, output_handle, "genbank") def parse_command_line_args(): parser = argparse.ArgumentParser(description="Domestication") parser.add_argument("--files_to_domestication", required=True, help="List of GenBank files (Comma-separated)") parser.add_argument("--csv_file", required=True, help="csv file") parser.add_argument('--file_name_mapping', type=str, help='Mapping of Galaxy filenames to original filenames') parser.add_argument("--use_file_names_as_id", type=lambda x: x.lower() == 'true', default=True, help="Use file names as IDs (True/False)") parser.add_argument("--allow_edits", type=lambda x: x.lower() == 'true', default=True, help="Allow sequence edits") parser.add_argument("--output_dom", required=True, help="zip output for domestication results") parser.add_argument("--output_methprot", required=True, help="gb output for methylation protection") parser.add_argument("--output_gb_dom", required=True, help="gb output for mdomesticated gb files") parser.add_argument("--methylation_protection", type=lambda x: x.lower() == 'true', default=False, help="Enable methyl protection (true/false)") return parser.parse_args() if __name__ == "__main__": args = parse_command_line_args() domestication( args.files_to_domestication, args.csv_file, args.file_name_mapping, args.use_file_names_as_id, args.allow_edits, args.output_dom, args.output_gb_dom ) if args.methylation_protection: methylation_protection( args.output_dom, args.output_methprot )
