proteore_protein_interaction_maps: comparison of build_protein_interaction_maps.py @ 0:3fcdc585cd2e (draft)
planemo upload commit 696e05eadff826a980b1150be5db5d1f52d06b1b-dirty
| author | proteore |
|---|---|
| date | Fri, 01 Mar 2019 11:23:22 -0500 |
| parents | |
| children | |
| previous revision | this revision |
|---|---|
| -1:000000000000 | 0:3fcdc585cd2e |
```python
# -*- coding: utf-8 -*-
import csv, json, argparse, re, sys

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--species")
    parser.add_argument("--database", help="Humap, Bioplex or Biogrid", required=True)
    parser.add_argument("--dict_path", required=True)
    parser.add_argument("--input_type", help="type of input (list of id or filename)", required=True)
    parser.add_argument("--input", required=True)
    parser.add_argument("--header")
    parser.add_argument("--ncol")
    parser.add_argument("--id_type")
    parser.add_argument("--network_output")
    parser.add_argument("--nodes_output")
    args = parser.parse_args()

    if args.input_type == "file":
        args.ncol = nb_col_to_int(args.ncol)
        args.header = str2bool(args.header)

    return args

#Turn string into boolean
def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

#return the column number in int format (0-based)
def nb_col_to_int(nb_col):
    try:
        nb_col = int(nb_col.replace("c", "")) - 1
        return nb_col
    except (ValueError, AttributeError):
        sys.exit("Please specify the column where you would like to apply the filter with a valid format")

#return list of (unique) ids from string
def get_input_ids_from_string(input):
    ids_list = list(set(re.split(r'\s+', input.replace("\r", "").replace("\n", " ").replace("\t", " "))))
    if "" in ids_list: ids_list.remove("")
    #if "NA" in ids_list : ids_list.remove("NA")
    return ids_list

#return input_file and list of unique ids from input file path
def get_input_ids_from_file(input, nb_col, header):
    with open(input, "r") as csv_file:
        input_file = list(csv.reader(csv_file, delimiter='\t'))

    input_file, ids_list = one_id_one_line(input_file, nb_col, header)
    if "" in ids_list: ids_list.remove("")
    #if "NA" in ids_list : ids_list.remove("NA")

    return input_file, ids_list

#return input file with added lines when there is more than one id per line
def one_id_one_line(input_file, nb_col, header):

    if header:
        new_file = [input_file[0]]
        input_file = input_file[1:]
    else:
        new_file = []
    ids_list = []

    for line in input_file:
        if line != [] and set(line) != {''}:
            line[nb_col] = re.sub(r"\s+", "", line[nb_col])
            if ";" in line[nb_col]:
                ids = line[nb_col].split(";")
                for id in ids:
                    new_file.append(line[:nb_col] + [id] + line[nb_col+1:])
                    ids_list.append(id)
            else:
                new_file.append(line)
                ids_list.append(line[nb_col])

    ids_list = list(set(ids_list))

    return new_file, ids_list

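#Example of one_id_one_line's splitting behaviour (hypothetical rows), with nb_col=0 and header=False:
#   [["P12345;Q67890", "foo"]] -> new_file == [["P12345", "foo"], ["Q67890", "foo"]]
#   and ids_list == ["P12345", "Q67890"] (deduplicated, order not guaranteed)
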
#replace all blank cells with NA
def blank_to_NA(csv_file):
    tmp = []
    for line in csv_file:
        line = ["NA" if cell in ("", " ", "NaN", "-") else cell for cell in line]
        tmp.append(line)

    return tmp

#build the BioGRID network and nodes output tables for the given ids
def biogrid_output_files(ids, species):
    network_file = [["Entrez Gene Interactor A", "Entrez Gene Interactor B", "Gene symbol Interactor A", "Gene symbol Interactor B", "Experimental System", "Experimental Type", "Pubmed ID", "Interaction Score", "Phenotypes"]]
    ids_set = set(ids)
    ids_not_found = set([])
    for id in ids:
        if id in ppi_dict['network']:
            network_file.extend(ppi_dict['network'][id])
            ids_set.update([interact[1] for interact in ppi_dict['network'][id]])
        else:
            ids_not_found.add(id)

    nodes_file = [["Entrez gene ID", "Official Symbol Interactor", "Present in user input ids", "ID present in Biogrid "+species, "Pathway"]]
    for id in ids_set:
        #get pathway
        if id in ppi_dict['nodes']:
            description_pathway = ";".join(ppi_dict['nodes'][id])
        else:
            description_pathway = "NA"

        #get gene name
        if id in ppi_dict['network']: gene_name = ppi_dict['network'][id][0][2]
        else: gene_name = "NA"

        #make line
        nodes_file.append([id] + [gene_name] + [id in ids] + [id not in ids_not_found] + [description_pathway])

    return network_file, nodes_file

#build the BioPlex network and nodes output tables for the given ids and id type
def bioplex_output_files(ids, id_type, species):
    network_file = [[id_type+" Interactor A", id_type+" Interactor B", "Gene symbol Interactor A", "Gene symbol Interactor B", "Interaction Score"]]
    ids_set = set(ids)
    ids_not_found = set([])
    for id in ids:
        if id in ppi_dict['network'][id_type]:
            network_file.extend(ppi_dict['network'][id_type][id])
            ids_set.update([interact[1] for interact in ppi_dict['network'][id_type][id]])
        else:
            ids_not_found.add(id)

    if id_type == "UniProt-AC": nodes_file = [[id_type, "Present in user input ids", "ID present in Human Bioplex", "Pathway"]]
    else: nodes_file = [[id_type, "Official symbol Interactor", "Present in user input ids", "Present in interactome", "Pathway"]]
    for id in ids_set:

        #get pathway
        if id in ppi_dict['nodes'][id_type]:
            description_pathway = ";".join(ppi_dict['nodes'][id_type][id])
        else:
            description_pathway = "NA"

        #make line
        if id_type == "UniProt-AC":
            nodes_file.append([id] + [id in ids] + [id not in ids_not_found] + [description_pathway])
        elif id_type == "GeneID":
            #get gene_name
            if id in ppi_dict['network'][id_type]: gene_name = ppi_dict['network'][id_type][id][0][2]
            else: gene_name = "NA"
            nodes_file.append([id] + [gene_name] + [id in ids] + [id not in ids_not_found] + [description_pathway])

    return network_file, nodes_file

#build the Hu.MAP network and nodes output tables for the given ids
def humap_output_files(ids, species):
    network_file = [["Entrez Gene Interactor A", "Entrez Gene Interactor B", "Gene symbol Interactor A", "Gene symbol Interactor B", "Interaction Score"]]
    ids_set = set(ids)
    ids_not_found = set([])
    for id in ids:
        if id in ppi_dict['network']:
            network_file.extend(ppi_dict['network'][id])
            ids_set.update([interact[1] for interact in ppi_dict['network'][id]])
        else:
            ids_not_found.add(id)

    nodes_file = [["Entrez gene ID", "Official Symbol Interactor", "Present in user input ids", "ID present in Hu.MAP", "Pathway"]]
    for id in ids_set:
        #get pathway
        if id in ppi_dict['nodes']:
            description_pathway = ";".join(ppi_dict['nodes'][id])
        else:
            description_pathway = "NA"

        #get gene name
        if id in ppi_dict['gene_name']:
            gene_name = ppi_dict['gene_name'][id]
        else:
            gene_name = "NA"

        #make line
        nodes_file.append([id] + [gene_name] + [id in ids] + [id not in ids_not_found] + [description_pathway])

    return network_file, nodes_file

#function to sort the csv_file by value in a specific column
def sort_by_column(tab, sort_col, reverse, header):

    if len(tab) > 1:  #if there's more than just a header or 1 row
        if header:
            head = tab[0]
            tab = tab[1:]

        #list of empty cells in the column to sort
        unsortable_lines = [i for i, line in enumerate(tab) if (line[sort_col] == '' or line[sort_col] == 'NA')]
        unsorted_tab = [tab[i] for i in unsortable_lines]
        tab = [line for i, line in enumerate(tab) if i not in unsortable_lines]

        if only_number(tab, sort_col) and any_float(tab, sort_col):
            tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse)
        elif only_number(tab, sort_col):
            tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)
        else:
            tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse)

        tab.extend(unsorted_tab)
        if header is True: tab = [head] + tab

    return tab

#return True if every cell of the column is a number
def only_number(tab, col):

    for line in tab:
        if not (is_number("float", line[col].replace(",", ".")) or is_number("int", line[col].replace(",", "."))):
            return False
    return True

#Check if a variable is a float or an integer
def is_number(number_format, n):
    float_format = re.compile(r"^[-]?[0-9][0-9]*\.?[0-9]+$")
    int_format = re.compile(r"^[-]?[0-9][0-9]*$")
    test = None
    if number_format == "int":
        test = re.match(int_format, n)
    elif number_format == "float":
        test = re.match(float_format, n)
    return test is not None

#return True if there is at least one float in the column
def any_float(tab, col):

    for line in tab:
        if is_number("float", line[col].replace(",", ".")):
            return True

    return False

def main():

    #Get args from command line
    global args
    args = get_args()

    #get PPI dictionary
    with open(args.dict_path, 'r') as handle:
        global ppi_dict
        ppi_dict = json.load(handle)

    #Get file and/or ids from input
    if args.input_type == "text":
        ids = get_input_ids_from_string(args.input)
    elif args.input_type == "file":
        input_file, ids = get_input_ids_from_file(args.input, args.ncol, args.header)

    #create output files
    if args.database == "biogrid":
        network_file, nodes_file = biogrid_output_files(ids, args.species)
    elif args.database == "bioplex":
        network_file, nodes_file = bioplex_output_files(ids, args.id_type, args.species)
    elif args.database == "humap":
        network_file, nodes_file = humap_output_files(ids, args.species)

    #convert blank to NA and sort files
    network_file = blank_to_NA(network_file)
    network_file = sort_by_column(network_file, 0, False, True)
    nodes_file = sort_by_column(nodes_file, 0, False, True)

    #write output files
    with open(args.network_output, "w") as output:
        writer = csv.writer(output, delimiter="\t")
        writer.writerows(network_file)

    with open(args.nodes_output, "w") as output:
        writer = csv.writer(output, delimiter="\t")
        for row in nodes_file:
            #Python 2 idiom: encode each cell to UTF-8 before writing
            writer.writerow([unicode(s).encode("utf-8") for s in row])

if __name__ == "__main__":
    main()
```
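For context, `--dict_path` must point to a pre-built JSON dictionary whose layout is only implied by the lookups above (`ppi_dict['network']`, `ppi_dict['nodes']` and, for Hu.MAP, `ppi_dict['gene_name']`). The sketch below shows what a minimal Hu.MAP-style dictionary and a matching invocation could look like; all ids, symbols, scores, pathway names and file paths are made-up placeholders, and the five-element row layout is an assumption inferred from the network file header.

```python
import json, subprocess

# Hypothetical, minimal Hu.MAP-style dictionary: keys mirror the lookups in
# humap_output_files(); each network row follows the header order
# (Entrez A, Entrez B, symbol A, symbol B, interaction score).
ppi_dict = {
    "network": {
        "111": [["111", "222", "GENEA", "GENEB", "0.9"]],
    },
    "nodes": {
        "111": ["Example pathway"],
    },
    "gene_name": {
        "111": "GENEA",
        "222": "GENEB",
    },
}

with open("humap_dict.json", "w") as handle:  # placeholder path
    json.dump(ppi_dict, handle)

# The script targets Python 2 (it calls unicode()), so run it with a
# Python 2 interpreter; the flags match the argparse options defined above.
subprocess.check_call([
    "python2", "build_protein_interaction_maps.py",
    "--database", "humap",
    "--species", "Human",
    "--dict_path", "humap_dict.json",
    "--input_type", "text",
    "--input", "111 222",
    "--network_output", "network.tsv",
    "--nodes_output", "nodes.tsv",
])
```

With these placeholders, `network.tsv` should contain the single 111-222 interaction and `nodes.tsv` should flag 111 as present in the dictionary network and 222 as absent from it.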
