Mercurial > repos > iuc > ena_upload
comparison process_xlsx.py @ 1:bd624f0547fc draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ffea061c1ad6e7291abfe220230dbdbe8d19a2bd"
| author | iuc |
|---|---|
| date | Fri, 30 Apr 2021 12:08:48 +0000 |
| parents | b7356ece7ceb |
| children | 982757f4aa8c |
comparison
equal
deleted
inserted
replaced
| 0:b7356ece7ceb | 1:bd624f0547fc |
|---|---|
| 1 import argparse | 1 import argparse |
| 2 import pathlib | 2 import pathlib |
| 3 import sys | 3 import sys |
| 4 | 4 |
| 5 import xlrd | 5 import xlrd |
| 6 | 6 import yaml |
| 7 | 7 |
| 8 FILE_FORMAT = 'fastq' | 8 FILE_FORMAT = 'fastq' |
| 9 | 9 |
| 10 | 10 |
| 11 def extract_data(xl_sheet, expected_columns): | 11 def extract_data(xl_sheet, expected_columns): |
| 34 row_dict = {} | 34 row_dict = {} |
| 35 for col in range(1, len(expected_columns)): | 35 for col in range(1, len(expected_columns)): |
| 36 sheet_col_index = sheet_columns[expected_columns[col]] | 36 sheet_col_index = sheet_columns[expected_columns[col]] |
| 37 row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value | 37 row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value |
| 38 # should check for duplicate alias/ids? | 38 # should check for duplicate alias/ids? |
| 39 data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict | 39 if xl_sheet.cell(row_id, index_col).value in data_dict.keys(): |
| 40 tmp = data_dict[xl_sheet.cell(row_id, index_col).value] | |
| 41 data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp] | |
| 42 data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict) | |
| 43 else: | |
| 44 data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict | |
| 40 return data_dict | 45 return data_dict |
| 46 | |
| 47 | |
| 48 def paste_xls2yaml(xlsx_path): | |
| 49 print('YAML -------------') | |
| 50 xls = xlrd.open_workbook(xlsx_path) | |
| 51 content_dict = {} | |
| 52 for sheet_name in xls.sheet_names(): | |
| 53 if sheet_name == 'controlled_vocabulary': | |
| 54 continue | |
| 55 xls_sheet = xls.sheet_by_name(sheet_name) | |
| 56 sheet_contents_dict = {} | |
| 57 colnames = [] | |
| 58 for col in range(xls_sheet.ncols): | |
| 59 colnames.append(xls_sheet.cell(0, col).value) | |
| 60 # skip first 2 rows (column names and suggestions) | |
| 61 for row_id in range(2, xls_sheet.nrows): | |
| 62 row_dict = {} | |
| 63 for col_id in range(0, xls_sheet.ncols): | |
| 64 row_dict[colnames[col_id]] = xls_sheet.cell(row_id, col_id).value | |
| 65 # should check for duplicate alias/ids? | |
| 66 sheet_contents_dict[row_id] = row_dict | |
| 67 content_dict[sheet_name] = sheet_contents_dict | |
| 68 yaml.dump(content_dict, sys.stdout) | |
| 69 print('YAML -------------') | |
| 41 | 70 |
| 42 | 71 |
| 43 parser = argparse.ArgumentParser() | 72 parser = argparse.ArgumentParser() |
| 44 parser.add_argument('--form', dest='xlsx_path', required=True) | 73 parser.add_argument('--form', dest='xlsx_path', required=True) |
| 45 parser.add_argument('--out_dir', dest='out_path', required=True) | 74 parser.add_argument('--out_dir', dest='out_path', required=True) |
| 46 parser.add_argument('--action', dest='action', required=True) | 75 parser.add_argument('--action', dest='action', required=True) |
| 47 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true') | 76 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true') |
| 77 parser.add_argument('--verbose', dest='verbose', required=False, action='store_true') | |
| 48 args = parser.parse_args() | 78 args = parser.parse_args() |
| 49 | 79 |
| 50 xl_workbook = xlrd.open_workbook(args.xlsx_path) | 80 xl_workbook = xlrd.open_workbook(args.xlsx_path) |
| 51 | 81 |
| 52 # PARSE STUDIES | 82 # PARSE STUDIES |
| 75 # PARSE EXPERIMENTS | 105 # PARSE EXPERIMENTS |
| 76 ################# | 106 ################# |
| 77 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment') | 107 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment') |
| 78 if xl_sheet.nrows < 3: | 108 if xl_sheet.nrows < 3: |
| 79 raise ValueError('No experiments found in experiments sheet') | 109 raise ValueError('No experiments found in experiments sheet') |
| 80 exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', 'library_name', | 110 exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', |
| 81 'library_strategy', 'library_source', 'library_selection', 'library_layout', | 111 'library_name', 'library_strategy', 'library_source', 'library_selection', |
| 82 'insert_size', 'library_construction_protocol', 'platform', 'instrument_model'] | 112 'library_layout', 'insert_size', 'library_construction_protocol', |
| 113 'platform', 'instrument_model'] | |
| 114 | |
| 83 experiments_dict = extract_data(xl_sheet, exp_columns) | 115 experiments_dict = extract_data(xl_sheet, exp_columns) |
| 84 | 116 |
| 85 # PARSE RUNS SHEET | 117 # PARSE RUNS SHEET |
| 86 ################# | 118 ################# |
| 87 xl_sheet = xl_workbook.sheet_by_name('ENA_run') | 119 xl_sheet = xl_workbook.sheet_by_name('ENA_run') |
| 121 # WRITE DICTIONARIES TO TABLE FILES | 153 # WRITE DICTIONARIES TO TABLE FILES |
| 122 | 154 |
| 123 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS? | 155 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS? |
| 124 # dt_oobj = datetime.now(tz=None) | 156 # dt_oobj = datetime.now(tz=None) |
| 125 # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S") | 157 # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S") |
| 158 runs_included = [] | |
| 159 exp_included = [] | |
| 126 for study_alias, study in studies_dict.items(): | 160 for study_alias, study in studies_dict.items(): |
| 127 # study_alias = study_alias + '_' + timestamp | 161 # study_alias = study_alias + '_' + timestamp |
| 128 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], | 162 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], |
| 129 study['study_type'], study['study_abstract'], '', | 163 study['study_type'], study['study_abstract'], '', |
| 130 'ENA_submission_data']) + '\n') # assuming no pubmed_id | 164 'ENA_submission_data']) + '\n') # assuming no pubmed_id |
| 160 exp['library_layout'].lower(), | 194 exp['library_layout'].lower(), |
| 161 str(int(exp['insert_size'])), | 195 str(int(exp['insert_size'])), |
| 162 exp['library_construction_protocol'], | 196 exp['library_construction_protocol'], |
| 163 exp['platform'], exp['instrument_model'], | 197 exp['platform'], exp['instrument_model'], |
| 164 'submission_date_ENA']) + '\n') | 198 'submission_date_ENA']) + '\n') |
| 199 exp_included.append(exp_alias) | |
| 165 for run_alias, run in runs_dict.items(): | 200 for run_alias, run in runs_dict.items(): |
| 166 if run['experiment_alias'] == exp_alias: | 201 # check that the experiments library_layout is set to paired |
| 167 runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', exp_alias, | 202 # when multiple entries are associated with the same run alias |
| 168 run['file_name'], FILE_FORMAT, 'file_checksum', | 203 if not isinstance(run, list): |
| 169 'submission_date_ENA']) + '\n') | 204 runs_list = [run] |
| 205 else: | |
| 206 runs_list = run | |
| 207 for run_entry in runs_list: | |
| 208 if run_entry['experiment_alias'] == exp_alias: | |
| 209 runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', | |
| 210 exp_alias, run_entry['file_name'], | |
| 211 FILE_FORMAT, 'file_checksum', | |
| 212 'submission_date_ENA']) + '\n') | |
| 213 runs_included.append(run_alias) | |
| 214 | |
| 215 # check if any experiment or run was not associated with any sample | |
| 216 for run in runs_dict.keys(): | |
| 217 if run not in runs_included: | |
| 218 print(f'The run {run} is listed in the runs section but not associated with any \ | |
| 219 used experiment') | |
| 220 | |
| 221 for exp in experiments_dict.keys(): | |
| 222 if exp not in exp_included: | |
| 223 print(f'The experiment {exp} is listed in the experiments section but not associated \ | |
| 224 with any used sample') | |
| 225 | |
| 170 studies_table.close() | 226 studies_table.close() |
| 171 samples_table.close() | 227 samples_table.close() |
| 172 experiments_table.close() | 228 experiments_table.close() |
| 173 runs_table.close() | 229 runs_table.close() |
| 230 | |
| 231 if args.verbose: | |
| 232 paste_xls2yaml(args.xlsx_path) |
