Mercurial > repos > proteore > proteore_data_manager
comparison data_manager/resource_building.py @ 31:faeeabb11a4d draft
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
| author | proteore |
|---|---|
| date | Wed, 08 Jan 2020 09:53:34 +0000 |
| parents | a6cabd3ab71f |
| children | ec1febc6672e |
| 30:a6cabd3ab71f | 31:faeeabb11a4d |
|---|---|
| 1 # -*- coding: utf-8 -*- | 1 # -*- coding: utf-8 -*- |
| 2 """ | 2 """ |
| 3 The purpose of this script is to create source files from different databases to be used in other proteore tools | 3 The purpose of this script is to create source files from different databases to be used in other proteore tools |
| 4 """ | 4 """ |
| 5 | 5 |
| 6 import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile | 6 import os, sys, argparse, requests, time, csv, re, json, shutil, subprocess, zipfile |
| 7 from io import BytesIO | 7 from io import BytesIO |
| 8 from zipfile import ZipFile | 8 from zipfile import ZipFile |
| 9 from galaxy.util.json import from_json_string, to_json_string | 9 from galaxy.util.json import from_json_string, to_json_string |
| 10 | 10 |
| 11 ####################################################################################################### | 11 ####################################################################################################### |
| 129 # 3. ID mapping file | 129 # 3. ID mapping file |
| 130 ####################################################################################################### | 130 ####################################################################################################### |
| 131 import ftplib, gzip | 131 import ftplib, gzip |
| 132 csv.field_size_limit(sys.maxsize) # to handle big files | 132 csv.field_size_limit(sys.maxsize) # to handle big files |
| 133 | 133 |
| 134 def id_mapping_sources (data_manager_dict, species, target_directory) : | 134 def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) : |
| 135 | 135 |
| 136 human = species == "Human" | 136 human = species == "Human" |
| 137 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" } | 137 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" } |
| 138 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] | 138 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] |
| | 139 archive = os.path.join(tool_data_path, "ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d"))) |
| | 140 os.mkdir(archive) |
| 139 | 141 |
| 140 #header | 142 #header |
| 141 if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']] | 143 if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']] |
| 142 else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']] | 144 else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']] |
| 143 | 145 |
| 146 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) | 148 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) |
| 147 with gzip.open(tab_path,"rt") as select : | 149 with gzip.open(tab_path,"rt") as select : |
| 148 tab_reader = csv.reader(select,delimiter="\t") | 150 tab_reader = csv.reader(select,delimiter="\t") |
| 149 for line in tab_reader : | 151 for line in tab_reader : |
| 150 tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) | 152 tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) |
| 151 os.remove(tab_path) | 153 shutil.move(tab_path, archive) |
| 152 | |
| 153 #print("selected_tab ok") | 154 #print("selected_tab ok") |
| 154 | 155 |
| 155 #get uniprot-AC reviewed | 156 #get uniprot-AC reviewed |
| 156 organism = species_dict[species].split("_")[1] | 157 organism = species_dict[species].split("_")[1] |
| 157 query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list" | 158 query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list" |
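
The query assembled here targets the legacy uniprot.org query API with `format=list`, which returns one reviewed (Swiss-Prot) accession per line; the next hunk carries out the request with a `requests.Session`. A minimal standalone sketch of that fetch, assuming the same legacy endpoint (the function name and the `raise_for_status` check are additions for illustration):

```python
import requests

def fetch_reviewed_accessions(taxon_id):
    """Return the list of reviewed UniProt accessions for a taxon."""
    # format=list yields one accession per line, reviewed entries only
    query = ("https://www.uniprot.org/uniprot/?query=reviewed:yes"
             "+AND+organism:" + taxon_id + "&format=list")
    with requests.Session() as s:
        response = s.get(query)
        response.raise_for_status()  # fail loudly on HTTP errors
        return response.content.decode("utf-8").splitlines()

# e.g. fetch_reviewed_accessions("9606") for human
```

| 30:a6cabd3ab71f | 31:faeeabb11a4d |
|---|---|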
| 159 with requests.Session() as s: | 160 with requests.Session() as s: |
| 160 download = s.get(query) | 161 download = s.get(query) |
| 161 decoded_content = download.content.decode('utf-8') | 162 decoded_content = download.content.decode('utf-8') |
| 162 uniprot_reviewed_list = decoded_content.splitlines() | 163 uniprot_reviewed_list = decoded_content.splitlines() |
| 163 | 164 |
| | 165 #save reviewed list |
| | 166 reviewed_list_path = os.path.join(target_directory,'uniprot_reviewed_list.txt') |
| | 167 with open(reviewed_list_path,'w') as reviewed_list_file: |
| | 168 for id in uniprot_reviewed_list: |
| | 169 reviewed_list_file.write(id+"\n") |
| | 170 shutil.move(reviewed_list_path, archive) |
| | 171 |
| | 172 #remove unreviewed uniprot-AC |
| 164 for line in tab[1:]: | 173 for line in tab[1:]: |
| 165 UniProtAC = line[1] | 174 UniProtAC = line[1] |
| 166 if UniProtAC not in uniprot_reviewed_list : | 175 if UniProtAC not in uniprot_reviewed_list : |
| 167 line[1]="" | 176 line[1]="" |
| 168 line[2]="" | |
| 169 | 177 |
| 170 """ | 178 """ |
| 171 Supplementary ID to get from HUMAN_9606_idmapping.dat : | 179 Supplementary ID to get from HUMAN_9606_idmapping.dat : |
| 172 -NextProt,BioGrid,STRING,KEGG | 180 -NextProt,BioGrid,STRING,KEGG |
| 173 """ | 181 """ |
| 192 unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionary | 200 unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionary |
| 193 else : | 201 else : |
| 194 unidict[uniprotID].update({ id_type : cor_id }) | 202 unidict[uniprotID].update({ id_type : cor_id }) |
| 195 elif id_type in ids : | 203 elif id_type in ids : |
| 196 unidict[uniprotID]={id_type : cor_id} | 204 unidict[uniprotID]={id_type : cor_id} |
| 197 os.remove(dat_path) | 205 shutil.move(dat_path, archive) |
| 198 | 206 |
| 199 #print("dat_file ok") | 207 #print("dat_file ok") |
| 200 | 208 |
| 201 #add ids from idmapping.dat to the final tab | 209 #add ids from idmapping.dat to the final tab |
| 202 for line in tab[1:] : | 210 for line in tab[1:] : |
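
The `unidict` construction above joins several ids of the same type behind a `;` separator, branching on whether the outer key already exists. An equivalent accumulation pattern with `collections.defaultdict` removes the outer-key branching; `ids` is assumed, from the elided lines, to be the collection of ID types retained from `idmapping.dat`:

```python
from collections import defaultdict

unidict = defaultdict(dict)  # UniProt ID -> {id_type: "id1;id2;..."}

def add_mapping(uniprotID, id_type, cor_id, ids):
    if id_type not in ids:
        return  # keep only the ID types of interest
    if id_type in unidict[uniprotID]:
        # several ids of the same type are joined with ";"
        unidict[uniprotID][id_type] += ";" + cor_id
    else:
        unidict[uniprotID][id_type] = cor_id
```

| 30:a6cabd3ab71f | 31:faeeabb11a4d |
|---|---|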
| 219 #print ("tab ok") | 227 #print ("tab ok") |
| 220 | 228 |
| 221 #add missing nextprot ID for human or replace old ones | 229 #add missing nextprot ID for human or replace old ones |
| 222 if human : | 230 if human : |
| 223 #build next_dict | 231 #build next_dict |
| 224 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) | 232 nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) |
| | 233 with open(nextprot_path,'r') as nextprot_ids : |
| | 234 nextprot_ids = nextprot_ids.read().splitlines() |
| | 235 shutil.move(nextprot_path,archive) |
| 225 next_dict = {} | 236 next_dict = {} |
| 226 for nextid in nextprot_ids : | 237 for nextid in nextprot_ids : |
| 227 next_dict[nextid.replace("NX_","")] = nextid | 238 next_dict[nextid.replace("NX_","")] = nextid |
| 228 os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt")) | |
| 229 | 240 |
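
The neXtProt lookup built here strips the `NX_` prefix so that bare UniProt accessions can be matched back to their prefixed neXtProt ids; the same table as a dict comprehension:

```python
# Map bare accessions to prefixed neXtProt ids, e.g. "P01308" -> "NX_P01308"
next_dict = {nextid.replace("NX_", ""): nextid for nextid in nextprot_ids}
```

| 30:a6cabd3ab71f | 31:faeeabb11a4d |
|---|---|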
| 238 path = os.path.join(target_directory,output_file) | 249 path = os.path.join(target_directory,output_file) |
| 239 | 250 |
| 240 with open(path,"w") as out : | 251 with open(path,"w") as out : |
| 241 w = csv.writer(out,delimiter='\t') | 252 w = csv.writer(out,delimiter='\t') |
| 242 w.writerows(tab) | 253 w.writerows(tab) |
| | 254 |
| | 255 subprocess.call(['tar', '-czf', archive+".tar.gz", archive]) |
| 243 | 256 |
| 244 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} | 257 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} |
| 245 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" | 258 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" |
| 246 release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") | 259 release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") |
| 247 id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be lower than previous ones -> the <filter> sort in the xml is descending only | 260 id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be lower than previous ones -> the <filter> sort in the xml is descending only |
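
The new archiving step shells out to `tar` with an absolute path under `tool_data_path`, which records the full directory tree inside the tarball. A sketch of the same step with the standard-library `tarfile` module, which keeps the archive relative, together with the descending-id computation used for the data table (`make_archive_tarball` is an illustrative name):

```python
import os, tarfile, time

def make_archive_tarball(archive):
    # arcname stores the directory relative to the archive root instead
    # of embedding the absolute tool_data_path in the tarball
    with tarfile.open(archive + ".tar.gz", "w:gz") as tar:
        tar.add(archive, arcname=os.path.basename(archive))

# New ids must be lower than previous ones because the <filter> sort in
# the data manager XML is descending only; subtracting YYYYMMDD from a
# constant makes later build dates yield smaller ids.
data_table_id = str(10000000000 - int(time.strftime("%Y%m%d")))
```

| 30:a6cabd3ab71f | 31:faeeabb11a4d |
|---|---|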
| 265 ftp = ftplib.FTP("ftp.nextprot.org") | 278 ftp = ftplib.FTP("ftp.nextprot.org") |
| 266 ftp.login("anonymous", "anonymous") | 279 ftp.login("anonymous", "anonymous") |
| 267 ftp.cwd(ftp_dir) | 280 ftp.cwd(ftp_dir) |
| 268 ftp.retrbinary("RETR " + file, open(path, 'wb').write) | 281 ftp.retrbinary("RETR " + file, open(path, 'wb').write) |
| 269 ftp.quit() | 282 ftp.quit() |
| 270 with open(path,'r') as nextprot_ids : | 283 |
| 271 nextprot_ids = nextprot_ids.read().splitlines() | 284 return (path) |
| 272 return (nextprot_ids) | |
| 273 | 285 |
| 274 #return '' if there's no value in a dictionary, avoid error | 286 #return '' if there's no value in a dictionary, avoid error |
| 275 def access_dictionary (dico,key1,key2) : | 287 def access_dictionary (dico,key1,key2) : |
| 276 if key1 in dico : | 288 if key1 in dico : |
| 277 if key2 in dico[key1] : | 289 if key2 in dico[key1] : |
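
In the FTP hunk above, `retrbinary` is handed the `write` method of an anonymous `open(path, 'wb')`, so the file handle is never explicitly closed. A sketch of the neXtProt download with deterministic cleanup; the `ftp_dir` default is an assumption for illustration, since its real value is defined in lines elided from this diff:

```python
import ftplib, os

def download_from_nextprot_ftp(file, target_directory,
                               ftp_dir="/pub/current_release/ac_lists/"):
    # ftp_dir above is a hypothetical default; the script sets it elsewhere
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        with open(path, "wb") as out:
            ftp.retrbinary("RETR " + file, out.write)
    finally:
        ftp.quit()  # close the control connection even on failure
    return path
```

| 30:a6cabd3ab71f | 31:faeeabb11a4d |
|---|---|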
| 595 parser.add_argument("--interactome", metavar = ("PPI")) | 607 parser.add_argument("--interactome", metavar = ("PPI")) |
| 596 parser.add_argument("--species") | 608 parser.add_argument("--species") |
| 597 parser.add_argument("--date") | 609 parser.add_argument("--date") |
| 598 parser.add_argument("-o", "--output") | 610 parser.add_argument("-o", "--output") |
| 599 parser.add_argument("--database") | 611 parser.add_argument("--database") |
| | 612 parser.add_argument("--tool_data_path") |
| 600 args = parser.parse_args() | 613 args = parser.parse_args() |
| 601 | 614 |
| 602 data_manager_dict = {} | 615 data_manager_dict = {} |
| 603 # Extract json file params | 616 # Extract json file params |
| 604 filename = args.output | 617 filename = args.output |
| 625 peptide_atlas = None | 638 peptide_atlas = None |
| 626 if peptide_atlas is not None: | 639 if peptide_atlas is not None: |
| 627 #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/" | 640 #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/" |
| 628 peptide_atlas = peptide_atlas.split(",") | 641 peptide_atlas = peptide_atlas.split(",") |
| 629 for pa_tissue in peptide_atlas: | 642 for pa_tissue in peptide_atlas: |
| 630 peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory) | 643 peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory, args.tool_data_path) |
| 631 | 644 |
| 632 ## Download ID_mapping source file from Uniprot | 645 ## Download ID_mapping source file from Uniprot |
| 633 try: | 646 try: |
| 634 id_mapping=args.id_mapping | 647 id_mapping=args.id_mapping |
| 635 except NameError: | 648 except NameError: |
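
For context, a hypothetical sketch of the data-manager plumbing around these arguments, based on the `from_json_string`/`to_json_string` imports at the top of the file and the `filename = args.output` assignment above: Galaxy hands the script a JSON file via `-o/--output`, the script reads its job parameters from it, and on exit overwrites the same file with the new data-table entries:

```python
from galaxy.util.json import from_json_string, to_json_string

def run(filename, data_manager_dict):
    # the output file initially holds the job parameters...
    params = from_json_string(open(filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    # ... build the source files, filling data_manager_dict ...
    # then the same file is overwritten with the data-table entries
    with open(filename, 'w') as out:
        out.write(to_json_string(data_manager_dict))
    return params
```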
