comparison uploadzip.py @ 5:a2f200121dda draft default tip
Uploaded
| author | davidvanzessen |
|---|---|
| date | Thu, 09 Apr 2015 04:18:00 -0400 |
| parents | 6f24bce6817e |
| children | |
| 4:fdd4776a434f | 5:a2f200121dda |
|---|---|
| 34 | 34 |
| 35 def stop_err( msg, ret=1 ): | 35 def stop_err( msg, ret=1 ): |
| 36 sys.stderr.write( msg ) | 36 sys.stderr.write( msg ) |
| 37 sys.exit( ret ) | 37 sys.exit( ret ) |
| 38 def file_err( msg, dataset, json_file ): | 38 def file_err( msg, dataset, json_file ): |
| 39 json_file.write( dumps( dict( type = 'dataset', | 39 json_file.write( to_json_string( dict( type = 'dataset', |
| 40 ext = 'data', | 40 ext = 'data', |
| 41 dataset_id = dataset.dataset_id, | 41 dataset_id = dataset.dataset_id, |
| 42 stderr = msg ) ) + "\n" ) | 42 stderr = msg ) ) + "\n" ) |
| 43 # never remove a server-side upload | 43 # never remove a server-side upload |
| 44 if dataset.type in ( 'server_dir', 'path_paste' ): | 44 if dataset.type in ( 'server_dir', 'path_paste' ): |
| 111 elif dataset.is_multi_byte: | 111 elif dataset.is_multi_byte: |
| 112 data_type = 'multi-byte char' | 112 data_type = 'multi-byte char' |
| 113 ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) | 113 ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) |
| 114 # Is dataset content supported sniffable binary? | 114 # Is dataset content supported sniffable binary? |
| 115 else: | 115 else: |
| 116 # FIXME: This ignores the declared sniff order in datatype_conf.xml | |
| 117 # resulting in improper behavior | |
| 118 type_info = Binary.is_sniffable_binary( dataset.path ) | 116 type_info = Binary.is_sniffable_binary( dataset.path ) |
| 119 if type_info: | 117 if type_info: |
| 120 data_type = type_info[0] | 118 data_type = type_info[0] |
| 121 ext = type_info[1] | 119 ext = type_info[1] |
| 122 data_type = 'compressed archive' #upload zip file modification | 120 data_type = "compressed archive" |
| 123 if not data_type: | 121 if not data_type: |
| 124 root_datatype = registry.get_datatype_by_extension( dataset.file_type ) | 122 # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress |
| 125 if getattr( root_datatype, 'compressed', False ): | 123 is_gzipped, is_valid = check_gzip( dataset.path ) |
| 126 data_type = 'compressed archive' | 124 if is_gzipped and not is_valid: |
| 127 ext = dataset.file_type | 125 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) |
| 128 else: | 126 return |
| 129 # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress | 127 elif is_gzipped and is_valid: |
| 130 is_gzipped, is_valid = check_gzip( dataset.path ) | 128 if link_data_only == 'copy_files': |
| 131 if is_gzipped and not is_valid: | 129 # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format |
| | 130 CHUNK_SIZE = 2**20 # 1Mb |
| | 131 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) |
| | 132 gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) |
| | 133 while 1: |
| | 134 try: |
| | 135 chunk = gzipped_file.read( CHUNK_SIZE ) |
| | 136 except IOError: |
| | 137 os.close( fd ) |
| | 138 os.remove( uncompressed ) |
| | 139 file_err( 'Problem decompressing gzipped data', dataset, json_file ) |
| | 140 return |
| | 141 if not chunk: |
| | 142 break |
| | 143 os.write( fd, chunk ) |
| | 144 os.close( fd ) |
| | 145 gzipped_file.close() |
| | 146 # Replace the gzipped file with the decompressed file if it's safe to do so |
| | 147 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: |
| | 148 dataset.path = uncompressed |
| | 149 else: |
| | 150 shutil.move( uncompressed, dataset.path ) |
| | 151 os.chmod(dataset.path, 0644) |
| | 152 dataset.name = dataset.name.rstrip( '.gz' ) |
| | 153 data_type = 'gzip' |
| | 154 if not data_type and bz2 is not None: |
| | 155 # See if we have a bz2 file, much like gzip |
| | 156 is_bzipped, is_valid = check_bz2( dataset.path ) |
| | 157 if is_bzipped and not is_valid: |
| 132 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) | 158 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) |
| 133 return | 159 return |
| 134 elif is_gzipped and is_valid: | 160 elif is_bzipped and is_valid: |
| 135 if link_data_only == 'copy_files': | 161 if link_data_only == 'copy_files': |
| 136 # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format | 162 # We need to uncompress the temp_name file |
| 137 CHUNK_SIZE = 2**20 # 1Mb | 163 CHUNK_SIZE = 2**20 # 1Mb |
| 138 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | 164 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) |
| 139 gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) | 165 bzipped_file = bz2.BZ2File( dataset.path, 'rb' ) |
| 140 while 1: | 166 while 1: |
| 141 try: | 167 try: |
| 142 chunk = gzipped_file.read( CHUNK_SIZE ) | 168 chunk = bzipped_file.read( CHUNK_SIZE ) |
| 143 except IOError: | 169 except IOError: |
| 144 os.close( fd ) | 170 os.close( fd ) |
| 145 os.remove( uncompressed ) | 171 os.remove( uncompressed ) |
| 146 file_err( 'Problem decompressing gzipped data', dataset, json_file ) | 172 file_err( 'Problem decompressing bz2 compressed data', dataset, json_file ) |
| 147 return | 173 return |
| 148 if not chunk: | 174 if not chunk: |
| 149 break | 175 break |
| 150 os.write( fd, chunk ) | 176 os.write( fd, chunk ) |
| 151 os.close( fd ) | 177 os.close( fd ) |
| 152 gzipped_file.close() | 178 bzipped_file.close() |
| 153 # Replace the gzipped file with the decompressed file if it's safe to do so | 179 # Replace the bzipped file with the decompressed file if it's safe to do so |
| 154 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: | 180 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: |
| 155 dataset.path = uncompressed | 181 dataset.path = uncompressed |
| 156 else: | 182 else: |
| 157 shutil.move( uncompressed, dataset.path ) | 183 shutil.move( uncompressed, dataset.path ) |
| 158 os.chmod(dataset.path, 0644) | 184 os.chmod(dataset.path, 0644) |
| 159 dataset.name = dataset.name.rstrip( '.gz' ) | 185 dataset.name = dataset.name.rstrip( '.bz2' ) |
| 160 data_type = 'gzip' | 186 data_type = 'bz2' |
| 161 if not data_type and bz2 is not None: | 187 if not data_type: |
| 162 # See if we have a bz2 file, much like gzip | 188 # See if we have a zip archive |
| 163 is_bzipped, is_valid = check_bz2( dataset.path ) | 189 is_zipped = check_zip( dataset.path ) |
| 164 if is_bzipped and not is_valid: | 190 if is_zipped: |
| 165 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) | 191 if link_data_only == 'copy_files': |
| 166 return | 192 CHUNK_SIZE = 2**20 # 1Mb |
| 167 elif is_bzipped and is_valid: | 193 uncompressed = None |
| 168 if link_data_only == 'copy_files': | 194 uncompressed_name = None |
| 169 # We need to uncompress the temp_name file | 195 unzipped = False |
| 170 CHUNK_SIZE = 2**20 # 1Mb | 196 z = zipfile.ZipFile( dataset.path ) |
| 171 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | 197 for name in z.namelist(): |
| 172 bzipped_file = bz2.BZ2File( dataset.path, 'rb' ) | 198 if name.endswith('/'): |
| 173 while 1: | 199 continue |
| | 200 if unzipped: |
| | 201 stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' |
| | 202 break |
| | 203 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) |
| | 204 if sys.version_info[:2] >= ( 2, 6 ): |
| | 205 zipped_file = z.open( name ) |
| | 206 while 1: |
| | 207 try: |
| | 208 chunk = zipped_file.read( CHUNK_SIZE ) |
| | 209 except IOError: |
| | 210 os.close( fd ) |
| | 211 os.remove( uncompressed ) |
| | 212 file_err( 'Problem decompressing zipped data', dataset, json_file ) |
| | 213 return |
| | 214 if not chunk: |
| | 215 break |
| | 216 os.write( fd, chunk ) |
| | 217 os.close( fd ) |
| | 218 zipped_file.close() |
| | 219 uncompressed_name = name |
| | 220 unzipped = True |
| | 221 else: |
| | 222 # python < 2.5 doesn't have a way to read members in chunks(!) |
| 174 try: | 223 try: |
| 175 chunk = bzipped_file.read( CHUNK_SIZE ) | 224 outfile = open( uncompressed, 'wb' ) |
| | 225 outfile.write( z.read( name ) ) |
| | 226 outfile.close() |
| | 227 uncompressed_name = name |
| | 228 unzipped = True |
| 176 except IOError: | 229 except IOError: |
| 177 os.close( fd ) | 230 os.close( fd ) |
| 178 os.remove( uncompressed ) | 231 os.remove( uncompressed ) |
| 179 file_err( 'Problem decompressing bz2 compressed data', dataset, json_file ) | 232 file_err( 'Problem decompressing zipped data', dataset, json_file ) |
| 180 return | 233 return |
| 181 if not chunk: | 234 z.close() |
| 182 break | 235 # Replace the zipped file with the decompressed file if it's safe to do so |
| 183 os.write( fd, chunk ) | 236 if uncompressed is not None: |
| 184 os.close( fd ) | |
| 185 bzipped_file.close() | |
| 186 # Replace the bzipped file with the decompressed file if it's safe to do so | |
| 187 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: | 237 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: |
| 188 dataset.path = uncompressed | 238 dataset.path = uncompressed |
| 189 else: | 239 else: |
| 190 shutil.move( uncompressed, dataset.path ) | 240 shutil.move( uncompressed, dataset.path ) |
| 191 os.chmod(dataset.path, 0644) | 241 os.chmod(dataset.path, 0644) |
| 192 dataset.name = dataset.name.rstrip( '.bz2' ) | 242 dataset.name = uncompressed_name |
| 193 data_type = 'bz2' | 243 data_type = 'zip' |
| 194 if not data_type: | 244 if not data_type: |
| 195 # See if we have a zip archive | 245 if check_binary( dataset.path ): |
| 196 is_zipped = check_zip( dataset.path ) | 246 # We have a binary dataset, but it is not Bam, Sff or Pdf |
| 197 if is_zipped: | 247 data_type = 'binary' |
| 198 if link_data_only == 'copy_files': | 248 #binary_ok = False |
| 199 CHUNK_SIZE = 2**20 # 1Mb | 249 parts = dataset.name.split( "." ) |
| 200 uncompressed = None | 250 if len( parts ) > 1: |
| 201 uncompressed_name = None | 251 ext = parts[-1].strip().lower() |
| 202 unzipped = False | 252 if not Binary.is_ext_unsniffable(ext): |
| 203 z = zipfile.ZipFile( dataset.path ) | 253 file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) |
| 204 for name in z.namelist(): | 254 return |
| 205 if name.endswith('/'): | 255 elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: |
| 206 continue | 256 err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) |
| 207 if unzipped: | 257 file_err( err_msg, dataset, json_file ) |
| 208 stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' | 258 return |
| 209 break | 259 if not data_type: |
| 210 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | 260 # We must have a text file |
| 211 if sys.version_info[:2] >= ( 2, 6 ): | 261 if check_html( dataset.path ): |
| 212 zipped_file = z.open( name ) | 262 file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) |
| 213 while 1: | 263 return |
| 214 try: | 264 if data_type != 'binary': |
| 215 chunk = zipped_file.read( CHUNK_SIZE ) | 265 if link_data_only == 'copy_files': |
| 216 except IOError: | 266 if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: |
| 217 os.close( fd ) | 267 in_place = False |
| 218 os.remove( uncompressed ) | 268 # Convert universal line endings to Posix line endings, but allow the user to turn it off, |
| 219 file_err( 'Problem decompressing zipped data', dataset, json_file ) | 269 # so that is becomes possible to upload gzip, bz2 or zip files with binary data without |
| 220 return | 270 # corrupting the content of those files. |
| 221 if not chunk: | 271 if dataset.to_posix_lines: |
| 222 break | 272 if dataset.space_to_tab: |
| 223 os.write( fd, chunk ) | 273 line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place ) |
| 224 os.close( fd ) | 274 else: |
| 225 zipped_file.close() | 275 line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place ) |
| 226 uncompressed_name = name | 276 if dataset.file_type == 'auto': |
| 227 unzipped = True | 277 ext = sniff.guess_ext( dataset.path, registry.sniff_order ) |
| 228 else: | 278 else: |
| 229 # python < 2.5 doesn't have a way to read members in chunks(!) | 279 ext = dataset.file_type |
| 230 try: | 280 data_type = ext |
| 231 outfile = open( uncompressed, 'wb' ) | |
| 232 outfile.write( z.read( name ) ) | |
| 233 outfile.close() | |
| 234 uncompressed_name = name | |
| 235 unzipped = True | |
| 236 except IOError: | |
| 237 os.close( fd ) | |
| 238 os.remove( uncompressed ) | |
| 239 file_err( 'Problem decompressing zipped data', dataset, json_file ) | |
| 240 return | |
| 241 z.close() | |
| 242 # Replace the zipped file with the decompressed file if it's safe to do so | |
| 243 if uncompressed is not None: | |
| 244 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: | |
| 245 dataset.path = uncompressed | |
| 246 else: | |
| 247 shutil.move( uncompressed, dataset.path ) | |
| 248 os.chmod(dataset.path, 0644) | |
| 249 dataset.name = uncompressed_name | |
| 250 data_type = 'zip' | |
| 251 if not data_type: | |
| 252 # TODO refactor this logic. check_binary isn't guaranteed to be | |
| 253 # correct since it only looks at whether the first 100 chars are | |
| 254 # printable or not. If someone specifies a known unsniffable | |
| 255 # binary datatype and check_binary fails, the file gets mangled. | |
| 256 if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type): | |
| 257 # We have a binary dataset, but it is not Bam, Sff or Pdf | |
| 258 data_type = 'binary' | |
| 259 #binary_ok = False | |
| 260 parts = dataset.name.split( "." ) | |
| 261 if len( parts ) > 1: | |
| 262 ext = parts[-1].strip().lower() | |
| 263 if not Binary.is_ext_unsniffable(ext): | |
| 264 file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) | |
| 265 return | |
| 266 elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: | |
| 267 err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) | |
| 268 file_err( err_msg, dataset, json_file ) | |
| 269 return | |
| 270 if not data_type: | |
| 271 # We must have a text file | |
| 272 if check_html( dataset.path ): | |
| 273 file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) | |
| 274 return | |
| 275 if data_type != 'binary': | |
| 276 if link_data_only == 'copy_files': | |
| 277 if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: | |
| 278 in_place = False | |
| 279 # Convert universal line endings to Posix line endings, but allow the user to turn it off, | |
| 280 # so that is becomes possible to upload gzip, bz2 or zip files with binary data without | |
| 281 # corrupting the content of those files. | |
| 282 if dataset.to_posix_lines: | |
| 283 tmpdir = output_adjacent_tmpdir( output_path ) | |
| 284 tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id | |
| 285 if dataset.space_to_tab: | |
| 286 line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) | |
| 287 else: | |
| 288 line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) | |
| 289 if dataset.file_type == 'auto': | |
| 290 ext = sniff.guess_ext( dataset.path, registry.sniff_order ) | |
| 291 else: | |
| 292 ext = dataset.file_type | |
| 293 data_type = ext | |
| 294 # Save job info for the framework | 281 # Save job info for the framework |
| 295 if ext == 'auto' and dataset.ext: | 282 if ext == 'auto' and dataset.ext: |
| 296 ext = dataset.ext | 283 ext = dataset.ext |
| 297 if ext == 'auto': | 284 if ext == 'auto': |
| 298 ext = 'data' | 285 ext = 'data' |
| 325 stdout = stdout, | 312 stdout = stdout, |
| 326 name = dataset.name, | 313 name = dataset.name, |
| 327 line_count = line_count ) | 314 line_count = line_count ) |
| 328 if dataset.get('uuid', None) is not None: | 315 if dataset.get('uuid', None) is not None: |
| 329 info['uuid'] = dataset.get('uuid') | 316 info['uuid'] = dataset.get('uuid') |
| 330 json_file.write( dumps( info ) + "\n" ) | 317 json_file.write( to_json_string( info ) + "\n" ) |
| 331 | 318 |
| 332 if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ): | 319 if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ): |
| 333 # Groom the dataset content if necessary | 320 # Groom the dataset content if necessary |
| 334 datatype.groom_dataset_content( output_path ) | 321 datatype.groom_dataset_content( output_path ) |
| 335 | 322 |
| 351 file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file ) | 338 file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file ) |
| 352 return | 339 return |
| 353 dataset.path = temp_name | 340 dataset.path = temp_name |
| 354 dp = temp_name | 341 dp = temp_name |
| 355 if not value.is_binary: | 342 if not value.is_binary: |
| 356 tmpdir = output_adjacent_tmpdir( output_path ) | |
| 357 tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id | |
| 358 if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ): | 343 if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ): |
| 359 sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) | 344 sniff.convert_newlines_sep2tabs( dp ) |
| 360 else: | 345 else: |
| 361 sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) | 346 sniff.convert_newlines( dp ) |
| 362 shutil.move( dp, os.path.join( files_path, name ) ) | 347 shutil.move( dp, os.path.join( files_path, name ) ) |
| 363 # Move the dataset to its "real" path | 348 # Move the dataset to its "real" path |
| 364 shutil.move( dataset.primary_file, output_path ) | 349 shutil.move( dataset.primary_file, output_path ) |
| 365 # Write the job info | 350 # Write the job info |
| 366 info = dict( type = 'dataset', | 351 info = dict( type = 'dataset', |
| 367 dataset_id = dataset.dataset_id, | 352 dataset_id = dataset.dataset_id, |
| 368 stdout = 'uploaded %s file' % dataset.file_type ) | 353 stdout = 'uploaded %s file' % dataset.file_type ) |
| 369 json_file.write( dumps( info ) + "\n" ) | 354 json_file.write( to_json_string( info ) + "\n" ) |
| 370 | |
| 371 | |
| 372 def output_adjacent_tmpdir( output_path ): | |
| 373 """ For temp files that will ultimately be moved to output_path anyway | |
| 374 just create the file directly in output_path's directory so shutil.move | |
| 375 will work optimially. | |
| 376 """ | |
| 377 return os.path.dirname( output_path ) | |
| 378 | |
| 379 | 355 |
| 380 def __main__(): | 356 def __main__(): |
| 381 | 357 |
| 382 if len( sys.argv ) < 4: | 358 if len( sys.argv ) < 4: |
| 383 print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...' | 359 print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...' |
| 388 | 364 |
| 389 registry = Registry() | 365 registry = Registry() |
| 390 registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] ) | 366 registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] ) |
| 391 | 367 |
| 392 for line in open( sys.argv[3], 'r' ): | 368 for line in open( sys.argv[3], 'r' ): |
| 393 dataset = loads( line ) | 369 dataset = from_json_string( line ) |
| 394 dataset = util.bunch.Bunch( **safe_dict( dataset ) ) | 370 dataset = util.bunch.Bunch( **safe_dict( dataset ) ) |
| 395 try: | 371 try: |
| 396 output_path = output_paths[int( dataset.dataset_id )][0] | 372 output_path = output_paths[int( dataset.dataset_id )][0] |
| 397 except: | 373 except: |
| 398 print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id | 374 print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id |
