changeset 6:f281a1eb83d6 draft

planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
author proteore
date Mon, 11 Mar 2019 04:30:28 -0400
parents 429e7481c392
children 77db6c42a212
files data_manager/resource_building.py data_manager/resource_building.xml data_manager_conf.xml tool-data/proteore_nextprot_ref.loc.sample tool_data_table_conf.xml.sample
diffstat 5 files changed, 104 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/resource_building.py	Fri Mar 08 04:09:43 2019 -0500
+++ b/data_manager/resource_building.py	Mon Mar 11 04:30:28 2019 -0400
@@ -490,6 +490,76 @@
     data_table_entry = dict(id=id, name = name, species = species, value = path)
     _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
 
+#######################################################################################################
+# 5. nextprot (add protein features)
+#######################################################################################################
+
+def Build_nextprot_ref_file(target_directory):
+    """Build the neXtProt reference TSV and register it as "proteore_nextprot_ref".
+
+    Each accession returned by id_list_from_nextprot_ftp() is fetched from
+    https://api.nextprot.org/entry/<id>.json and contributes one row: mass,
+    sequence length and isoelectric point of the first isoform, chromosome of
+    the first chromosomal location, unique subcellular locations, unique
+    diseases, number of "Helical" region-structure properties found in the
+    "domain" annotations, and protein existence level ("PE<level>").
+
+    target_directory: directory receiving nextprot_ref_<dd-mm-YYYY>.tsv
+    """
+    # NOTE(review): data_manager_dict is not a parameter here, so the last line
+    # only works if that name is visible at module level -- confirm with main().
+    nextprot_ids_file = "nextprot_ac_list_all.txt"
+    ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
+
+    nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
+    for entry_id in ids :
+        query="https://api.nextprot.org/entry/"+entry_id+".json"
+        resp = requests.get(url=query)
+        entry = resp.json()["entry"]
+        annotations = entry["annotationsByCategory"]
+        isoform = entry["isoforms"][0]
+
+        #get info from json dictionary
+        mass_mol = isoform["massAsString"]
+        seq_length = isoform["sequenceLength"]
+        iso_elec_point = isoform["isoelectricPointAsString"]
+        chr_loc = entry["chromosomalLocations"][0]["chromosome"]
+        protein_existence = "PE"+str(entry["overview"]['proteinExistence']['level'])
+
+        #unique subcellular locations and diseases: None/empty terms are dropped
+        #(";".join() fails on None) and terms are sorted for a reproducible file
+        joined_terms = {}
+        for category in ("subcellular-location", "disease") :
+            terms = set(annot['cvTermName'] for annot in annotations.get(category, []))
+            terms -= set([None, ""])
+            joined_terms[category] = ";".join(sorted(terms)) if terms else "NA"
+        all_subcell_locs = joined_terms["subcellular-location"]
+        all_diseases = joined_terms["disease"]
+
+        #count "Helical" region-structure properties over the domain annotations
+        nb_domains = 0
+        for tm_domain in annotations.get("domain", []) :
+            #read the properties of the current annotation: the enclosing list
+            #of annotations cannot be indexed with the "properties" string key
+            for prop in tm_domain.get("properties") or [] :
+                if prop["name"]=="region structure" and prop["value"]=="Helical" :
+                    nb_domains+=1
+
+        nextprot_file.append([entry_id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
+
+    #format the date once so the file name, entry id and entry name always agree
+    release_date = time.strftime("%d-%m-%Y")
+    output_file = 'nextprot_ref_'+ release_date + ".tsv"
+    path = os.path.join(target_directory,output_file)
+    name = "neXtProt release "+release_date
+    table_id = "nextprot_ref_"+release_date
+
+    with open(path, 'w') as output:
+        writer = csv.writer(output,delimiter="\t")
+        writer.writerows(nextprot_file)
+
+    data_table_entry = dict(id=table_id, name = name, value = path)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
 
 #######################################################################################################
 # Main function
@@ -503,6 +573,7 @@
     parser.add_argument("--species")
     parser.add_argument("--date")
     parser.add_argument("-o", "--output")
+    parser.add_argument("--database")
     args = parser.parse_args()
 
     data_manager_dict = {}
@@ -557,7 +628,15 @@
         species=None
     if interactome is not None and species is not None:
         PPI_ref_files(data_manager_dict, species, interactome, target_directory)
- 
+
+    ## Build the neXtProt reference file (used to add protein features)
+    try:
+        database=args.database
+    except NameError:
+        database=None
+    if database is not None :
+        Build_nextprot_ref_file(target_directory)
+
     #save info to json file
     filename = args.output
     open(filename, 'wb').write(to_json_string(data_manager_dict))
--- a/data_manager/resource_building.xml	Fri Mar 08 04:09:43 2019 -0500
+++ b/data_manager/resource_building.xml	Mon Mar 11 04:30:28 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.03.08.1" tool_type="manage_data">
+<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.03.10" tool_type="manage_data">
 <description>
 to create or update reference files for proteore tools
 </description>
@@ -23,6 +23,8 @@
         #if $database.base.interactome == "biogrid"
             --species="$database.base.species"
         #end if
+    #else if $database.database == "nextprot"
+        --database=$database.database
     #end if
     --output "$output"
     
@@ -35,6 +37,7 @@
             <option value="peptide_atlas">Peptide Atlas</option>
             <option value="id_mapping">ID mapping</option>
             <option value="PPI">Build protein interaction maps</option>
+            <option value="nextprot">neXtProt</option>
         </param>
         <when value="human_protein_atlas">
             <param name="tissues" type="select" multiple="false" label="Please select tissue">
--- a/data_manager_conf.xml	Fri Mar 08 04:09:43 2019 -0500
+++ b/data_manager_conf.xml	Mon Mar 11 04:30:28 2019 -0400
@@ -134,5 +134,19 @@
                 </column>
             </output>
         </data_table>
+        <data_table name="proteore_nextprot_ref">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="value" output_ref="output" >
+                    <move type="file">
+                        <!--source>${path}</source-->
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">proteore_nextprot_ref/</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/proteore_nextprot_ref/${id}.tsv</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
     </data_manager>
 </data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_nextprot_ref.loc.sample	Mon Mar 11 04:30:28 2019 -0400
@@ -0,0 +1,2 @@
+#<id>	<name>	<value>
+#nextprot_ref_09-03-2019	neXtProt release 09-03-2019	tool-data/nextprot_ref_09-03-2019.tsv
--- a/tool_data_table_conf.xml.sample	Fri Mar 08 04:09:43 2019 -0500
+++ b/tool_data_table_conf.xml.sample	Mon Mar 11 04:30:28 2019 -0400
@@ -36,4 +36,8 @@
       <columns>id, name, species, value</columns>
       <file path="tool-data/proteore_humap_dictionaries.loc" />
     </table>
+    <table name='proteore_nextprot_ref' comment_char="#">
+      <columns>id, name, value</columns>
+      <file path="tool-data/proteore_nextprot_ref.loc"/>
+    </table>
 </tables>