Mercurial > repos > proteore > proteore_data_manager
changeset 6:f281a1eb83d6 draft
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
| author | proteore |
|---|---|
| date | Mon, 11 Mar 2019 04:30:28 -0400 |
| parents | 429e7481c392 |
| children | 77db6c42a212 |
| files | data_manager/resource_building.py data_manager/resource_building.xml data_manager_conf.xml tool-data/proteore_nextprot_ref.loc.sample tool_data_table_conf.xml.sample |
| diffstat | 5 files changed, 104 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/resource_building.py	Fri Mar 08 04:09:43 2019 -0500
+++ b/data_manager/resource_building.py	Mon Mar 11 04:30:28 2019 -0400
@@ -490,6 +490,76 @@
     data_table_entry = dict(id=id, name = name, species = species, value = path)
     _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
 
+#######################################################################################################
+# 5. nextprot (add protein features)
+#######################################################################################################
+
+def _joined_term_names(annotations):
+    """Return the sorted, distinct, non-empty 'cvTermName' values joined by ";" ("NA" if none)."""
+    names = set()
+    for annotation in annotations:
+        term = annotation['cvTermName']
+        if term is not None and term != "":
+            names.add(term)
+    #sorted so that the output does not depend on set iteration order
+    return ";".join(sorted(names)) if names else "NA"
+
+def Build_nextprot_ref_file(target_directory):
+    """Build the neXtProt reference file used to add protein features.
+
+    The accession list comes from id_list_from_nextprot_ftp(); each entry is
+    fetched from https://api.nextprot.org/entry/<id>.json and summarised as
+    one row (first isoform and first chromosomal location only). The table
+    is written as nextprot_ref_<dd-mm-YYYY>.tsv (tab-separated) in
+    target_directory and added to the "proteore_nextprot_ref" data table.
+    """
+    nextprot_ids_file = "nextprot_ac_list_all.txt"
+    ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
+
+    nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
+    for entry_id in ids :
+        query="https://api.nextprot.org/entry/"+entry_id+".json"
+        resp = requests.get(url=query)
+        resp.raise_for_status() #fail with the HTTP error instead of an obscure JSON/key error
+        entry = resp.json()["entry"]
+        annotations = entry["annotationsByCategory"]
+
+        #get info from json dictionary
+        isoform = entry["isoforms"][0]
+        mass_mol = isoform["massAsString"]
+        seq_length = isoform["sequenceLength"]
+        iso_elec_point = isoform["isoelectricPointAsString"]
+        chr_loc = entry["chromosomalLocations"][0]["chromosome"]
+        protein_existence = "PE"+str(entry["overview"]['proteinExistence']['level'])
+
+        #put all subcell loc and all diseases in ";"-separated strings ("NA" if none)
+        all_subcell_locs = _joined_term_names(annotations.get("subcellular-location", []))
+        all_diseases = _joined_term_names(annotations.get("disease", []))
+
+        #count the helical regions (tm domains)
+        nb_domains = 0
+        for tm_domain in annotations.get("domain", []):
+            #the properties belong to the current annotation (tm_domain), not to the tm_domains list
+            for domain in tm_domain.get("properties") or []:
+                if domain["name"]=="region structure" and domain["value"]=="Helical" :
+                    nb_domains+=1
+
+        nextprot_file.append([entry_id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
+
+    #format the date once so that the file name, the id and the name always agree
+    release_date = time.strftime("%d-%m-%Y")
+    ref_id = "nextprot_ref_"+release_date
+    path = os.path.join(target_directory,ref_id+".tsv")
+    name = "neXtProt release "+release_date
+
+    with open(path, 'w') as output:
+        writer = csv.writer(output,delimiter="\t")
+        writer.writerows(nextprot_file)
+
+    #NOTE(review): data_manager_dict is not a parameter or local of this function, so it must
+    #resolve as a module-level name when this runs - confirm against the caller
+    data_table_entry = dict(id=ref_id, name = name, value = path)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
 
 #######################################################################################################
 # Main function
@@ -503,6 +573,7 @@
     parser.add_argument("--species")
     parser.add_argument("--date")
     parser.add_argument("-o", "--output")
+    parser.add_argument("--database")
     args = parser.parse_args()
 
     data_manager_dict = {}
@@ -557,7 +628,15 @@
         species=None
     if interactome is not None and species is not None:
         PPI_ref_files(data_manager_dict, species, interactome, target_directory)
-
+    
+    ## Build nextprot ref file for add protein features
+    try:
+        database=args.database
+    except NameError:
+        database=None
+    if database is not None :
+        Build_nextprot_ref_file(target_directory)
+
     #save info to json file
     filename = args.output
     open(filename, 'wb').write(to_json_string(data_manager_dict))
--- a/data_manager/resource_building.xml Fri Mar 08 04:09:43 2019 -0500 +++ b/data_manager/resource_building.xml Mon Mar 11 04:30:28 2019 -0400 @@ -1,4 +1,4 @@ -<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.03.08.1" tool_type="manage_data"> +<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.03.10" tool_type="manage_data"> <description> to create or update reference files for proteore tools </description> @@ -23,6 +23,8 @@ #if $database.base.interactome == "biogrid" --species="$database.base.species" #end if + #else if $database.database == "nextprot" + --database=$database.database #end if --output "$output" @@ -35,6 +37,7 @@ <option value="peptide_atlas">Peptide Atlas</option> <option value="id_mapping">ID mapping</option> <option value="PPI">Build protein interaction maps</option> + <option value="nextprot">neXtProt</option> </param> <when value="human_protein_atlas"> <param name="tissues" type="select" multiple="false" label="Please select tissue">
--- a/data_manager_conf.xml Fri Mar 08 04:09:43 2019 -0500 +++ b/data_manager_conf.xml Mon Mar 11 04:30:28 2019 -0400 @@ -134,5 +134,19 @@ </column> </output> </data_table> + <data_table name="proteore_nextprot_ref"> + <output> + <column name="id" /> + <column name="name" /> + <column name="value" output_ref="output" > + <move type="file"> + <!--source>${path}</source--> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">proteore_nextprot_ref/</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/proteore_nextprot_ref/${id}.tsv</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> </data_manager> </data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/proteore_nextprot_ref.loc.sample Mon Mar 11 04:30:28 2019 -0400 @@ -0,0 +1,2 @@ +#<id> <name> <value> +#nextprot_ref_09-03-2019 neXtProt release 09-03-2019 tool-data/nextprot_ref_09-03-2019.tsv
--- a/tool_data_table_conf.xml.sample Fri Mar 08 04:09:43 2019 -0500 +++ b/tool_data_table_conf.xml.sample Mon Mar 11 04:30:28 2019 -0400 @@ -36,4 +36,8 @@ <columns>id, name, species, value</columns> <file path="tool-data/proteore_humap_dictionaries.loc" /> </table> + <table name='proteore_nextprot_ref' comment_char="#"> + <columns>id, name, value</columns> + <file path="tool-data/proteore_nextprot_ref.loc"/> + </table> </tables>
