comparison: uploadzip.py @ 3:6f24bce6817e (draft)

description: Uploaded
author:      davidvanzessen
date:        Mon, 30 Mar 2015 10:13:25 -0400
parents:     30d16d36d536
children:    a2f200121dda
comparing 2:30d16d36d536 (old) with 3:6f24bce6817e (new)
@@ -34,11 +34,11 @@
 
 def stop_err( msg, ret=1 ):
     sys.stderr.write( msg )
     sys.exit( ret )
 def file_err( msg, dataset, json_file ):
-    json_file.write( to_json_string( dict( type = 'dataset',
+    json_file.write( dumps( dict( type = 'dataset',
                                    ext = 'data',
                                    dataset_id = dataset.dataset_id,
                                    stderr = msg ) ) + "\n" )
     # never remove a server-side upload
     if dataset.type in ( 'server_dir', 'path_paste' ):
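
Note on this hunk: the only change is the serializer. Galaxy's old to_json_string helper is
swapped for dumps (and from_json_string for loads further down), presumably tracking the rename
of the helpers in galaxy.util.json to the standard library names; the import hunk itself is not
shown here. The upload tool reports status to the framework as one JSON object per line, which
is what file_err writes. A minimal sketch of that protocol using only the standard library;
json_file and the field values are illustrative, not taken from this changeset:

    from json import dumps

    def report_error( msg, dataset_id, json_file ):
        # One status object per line ("JSON lines"); the framework parses
        # each line to decide how to present the upload result.
        json_file.write( dumps( dict( type='dataset',
                                      ext='data',
                                      dataset_id=dataset_id,
                                      stderr=msg ) ) + "\n" )
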
@@ -111,175 +111,188 @@
     elif dataset.is_multi_byte:
         data_type = 'multi-byte char'
         ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
     # Is dataset content supported sniffable binary?
     else:
+        # FIXME: This ignores the declared sniff order in datatype_conf.xml
+        # resulting in improper behavior
         type_info = Binary.is_sniffable_binary( dataset.path )
         if type_info:
             data_type = type_info[0]
             ext = type_info[1]
-            data_type = "binary"
+            data_type = 'compressed archive' #upload zip file modification
     if not data_type:
-        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
-        is_gzipped, is_valid = check_gzip( dataset.path )
-        if is_gzipped and not is_valid:
-            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
-            return
-        elif is_gzipped and is_valid:
-            if link_data_only == 'copy_files':
-                # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
-                CHUNK_SIZE = 2**20 # 1Mb
-                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
-                while 1:
-                    try:
-                        chunk = gzipped_file.read( CHUNK_SIZE )
-                    except IOError:
-                        os.close( fd )
-                        os.remove( uncompressed )
-                        file_err( 'Problem decompressing gzipped data', dataset, json_file )
-                        return
-                    if not chunk:
-                        break
-                    os.write( fd, chunk )
-                os.close( fd )
-                gzipped_file.close()
-                # Replace the gzipped file with the decompressed file if it's safe to do so
-                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                    dataset.path = uncompressed
-                else:
-                    shutil.move( uncompressed, dataset.path )
-                os.chmod(dataset.path, 0644)
-            dataset.name = dataset.name.rstrip( '.gz' )
-            data_type = 'gzip'
-        if not data_type and bz2 is not None:
-            # See if we have a bz2 file, much like gzip
-            is_bzipped, is_valid = check_bz2( dataset.path )
-            if is_bzipped and not is_valid:
-                file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
-                return
-            elif is_bzipped and is_valid:
-                if link_data_only == 'copy_files':
-                    # We need to uncompress the temp_name file
-                    CHUNK_SIZE = 2**20 # 1Mb
-                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                    bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
-                    while 1:
-                        try:
-                            chunk = bzipped_file.read( CHUNK_SIZE )
-                        except IOError:
-                            os.close( fd )
-                            os.remove( uncompressed )
-                            file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
-                            return
-                        if not chunk:
-                            break
-                        os.write( fd, chunk )
-                    os.close( fd )
-                    bzipped_file.close()
-                    # Replace the bzipped file with the decompressed file if it's safe to do so
-                    if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                        dataset.path = uncompressed
-                    else:
-                        shutil.move( uncompressed, dataset.path )
-                    os.chmod(dataset.path, 0644)
-                dataset.name = dataset.name.rstrip( '.bz2' )
-                data_type = 'bz2'
-        if not data_type:
-            # See if we have a zip archive
-            is_zipped = check_zip( dataset.path )
-            if is_zipped:
-                if link_data_only == 'copy_files':
-                    CHUNK_SIZE = 2**20 # 1Mb
-                    uncompressed = None
-                    uncompressed_name = None
-                    unzipped = False
-                    z = zipfile.ZipFile( dataset.path )
-                    for name in z.namelist():
-                        if name.endswith('/'):
-                            continue
-                        if unzipped:
-                            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
-                            break
-                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                        if sys.version_info[:2] >= ( 2, 6 ):
-                            zipped_file = z.open( name )
-                            while 1:
-                                try:
-                                    chunk = zipped_file.read( CHUNK_SIZE )
-                                except IOError:
-                                    os.close( fd )
-                                    os.remove( uncompressed )
-                                    file_err( 'Problem decompressing zipped data', dataset, json_file )
-                                    return
-                                if not chunk:
-                                    break
-                                os.write( fd, chunk )
-                            os.close( fd )
-                            zipped_file.close()
-                            uncompressed_name = name
-                            unzipped = True
-                        else:
-                            # python < 2.5 doesn't have a way to read members in chunks(!)
-                            try:
-                                outfile = open( uncompressed, 'wb' )
-                                outfile.write( z.read( name ) )
-                                outfile.close()
-                                uncompressed_name = name
-                                unzipped = True
-                            except IOError:
-                                os.close( fd )
-                                os.remove( uncompressed )
-                                file_err( 'Problem decompressing zipped data', dataset, json_file )
-                                return
-                    z.close()
-                    # Replace the zipped file with the decompressed file if it's safe to do so
-                    if uncompressed is not None:
-                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                            dataset.path = uncompressed
-                        else:
-                            shutil.move( uncompressed, dataset.path )
-                        os.chmod(dataset.path, 0644)
-                        dataset.name = uncompressed_name
-                data_type = 'zip'
-        if not data_type:
-            if check_binary( dataset.path ):
-                # We have a binary dataset, but it is not Bam, Sff or Pdf
-                data_type = 'binary'
-                #binary_ok = False
-                parts = dataset.name.split( "." )
-                if len( parts ) > 1:
-                    ext = parts[-1].strip().lower()
-                    if not Binary.is_ext_unsniffable(ext):
-                        file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
-                        return
-                    elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
-                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
-                        file_err( err_msg, dataset, json_file )
-                        return
-        if not data_type:
-            # We must have a text file
-            if check_html( dataset.path ):
-                file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
-                return
+        root_datatype = registry.get_datatype_by_extension( dataset.file_type )
+        if getattr( root_datatype, 'compressed', False ):
+            data_type = 'compressed archive'
+            ext = dataset.file_type
+        else:
+            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+            is_gzipped, is_valid = check_gzip( dataset.path )
+            if is_gzipped and not is_valid:
+                file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
+                return
+            elif is_gzipped and is_valid:
+                if link_data_only == 'copy_files':
+                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
+                    CHUNK_SIZE = 2**20 # 1Mb
+                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                    gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
+                    while 1:
+                        try:
+                            chunk = gzipped_file.read( CHUNK_SIZE )
+                        except IOError:
+                            os.close( fd )
+                            os.remove( uncompressed )
+                            file_err( 'Problem decompressing gzipped data', dataset, json_file )
+                            return
+                        if not chunk:
+                            break
+                        os.write( fd, chunk )
+                    os.close( fd )
+                    gzipped_file.close()
+                    # Replace the gzipped file with the decompressed file if it's safe to do so
+                    if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                        dataset.path = uncompressed
+                    else:
+                        shutil.move( uncompressed, dataset.path )
+                    os.chmod(dataset.path, 0644)
+                dataset.name = dataset.name.rstrip( '.gz' )
+                data_type = 'gzip'
+            if not data_type and bz2 is not None:
+                # See if we have a bz2 file, much like gzip
+                is_bzipped, is_valid = check_bz2( dataset.path )
+                if is_bzipped and not is_valid:
+                    file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
+                    return
+                elif is_bzipped and is_valid:
+                    if link_data_only == 'copy_files':
+                        # We need to uncompress the temp_name file
+                        CHUNK_SIZE = 2**20 # 1Mb
+                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                        bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+                        while 1:
+                            try:
+                                chunk = bzipped_file.read( CHUNK_SIZE )
+                            except IOError:
+                                os.close( fd )
+                                os.remove( uncompressed )
+                                file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                                return
+                            if not chunk:
+                                break
+                            os.write( fd, chunk )
+                        os.close( fd )
+                        bzipped_file.close()
+                        # Replace the bzipped file with the decompressed file if it's safe to do so
+                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                            dataset.path = uncompressed
+                        else:
+                            shutil.move( uncompressed, dataset.path )
+                        os.chmod(dataset.path, 0644)
+                    dataset.name = dataset.name.rstrip( '.bz2' )
+                    data_type = 'bz2'
+            if not data_type:
+                # See if we have a zip archive
+                is_zipped = check_zip( dataset.path )
+                if is_zipped:
+                    if link_data_only == 'copy_files':
+                        CHUNK_SIZE = 2**20 # 1Mb
+                        uncompressed = None
+                        uncompressed_name = None
+                        unzipped = False
+                        z = zipfile.ZipFile( dataset.path )
+                        for name in z.namelist():
+                            if name.endswith('/'):
+                                continue
+                            if unzipped:
+                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+                                break
+                            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                            if sys.version_info[:2] >= ( 2, 6 ):
+                                zipped_file = z.open( name )
+                                while 1:
+                                    try:
+                                        chunk = zipped_file.read( CHUNK_SIZE )
+                                    except IOError:
+                                        os.close( fd )
+                                        os.remove( uncompressed )
+                                        file_err( 'Problem decompressing zipped data', dataset, json_file )
+                                        return
+                                    if not chunk:
+                                        break
+                                    os.write( fd, chunk )
+                                os.close( fd )
+                                zipped_file.close()
+                                uncompressed_name = name
+                                unzipped = True
+                            else:
+                                # python < 2.5 doesn't have a way to read members in chunks(!)
+                                try:
+                                    outfile = open( uncompressed, 'wb' )
+                                    outfile.write( z.read( name ) )
+                                    outfile.close()
+                                    uncompressed_name = name
+                                    unzipped = True
+                                except IOError:
+                                    os.close( fd )
+                                    os.remove( uncompressed )
+                                    file_err( 'Problem decompressing zipped data', dataset, json_file )
+                                    return
+                        z.close()
+                        # Replace the zipped file with the decompressed file if it's safe to do so
+                        if uncompressed is not None:
+                            if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                                dataset.path = uncompressed
+                            else:
+                                shutil.move( uncompressed, dataset.path )
+                            os.chmod(dataset.path, 0644)
+                            dataset.name = uncompressed_name
+                    data_type = 'zip'
+            if not data_type:
+                # TODO refactor this logic. check_binary isn't guaranteed to be
+                # correct since it only looks at whether the first 100 chars are
+                # printable or not. If someone specifies a known unsniffable
+                # binary datatype and check_binary fails, the file gets mangled.
+                if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
+                    # We have a binary dataset, but it is not Bam, Sff or Pdf
+                    data_type = 'binary'
+                    #binary_ok = False
+                    parts = dataset.name.split( "." )
+                    if len( parts ) > 1:
+                        ext = parts[-1].strip().lower()
+                        if not Binary.is_ext_unsniffable(ext):
+                            file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+                            return
+                        elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+                            file_err( err_msg, dataset, json_file )
+                            return
+            if not data_type:
+                # We must have a text file
+                if check_html( dataset.path ):
+                    file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+                    return
     if data_type != 'binary':
         if link_data_only == 'copy_files':
             if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
                 in_place = False
             # Convert universal line endings to Posix line endings, but allow the user to turn it off,
             # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
             # corrupting the content of those files.
             if dataset.to_posix_lines:
+                tmpdir = output_adjacent_tmpdir( output_path )
+                tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                 if dataset.space_to_tab:
-                    line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
+                    line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
                 else:
-                    line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
+                    line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
         if dataset.file_type == 'auto':
             ext = sniff.guess_ext( dataset.path, registry.sniff_order )
         else:
             ext = dataset.file_type
         data_type = ext
     # Save job info for the framework
     if ext == 'auto' and dataset.ext:
         ext = dataset.ext
     if ext == 'auto':
         ext = 'data'
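
The heart of the upload-zip modification lands in this hunk, in two places. First, when the
binary sniffer recognizes the content, data_type is forced to 'compressed archive' instead of
"binary" (the line tagged #upload zip file modification). Second, when the user-selected
file_type resolves to a datatype class declaring compressed = True, the archive is stored
intact rather than routed through the gzip/bz2/zip unpacking branches. A minimal sketch of
that getattr gate; ZipArchive is a hypothetical stand-in for whatever
registry.get_datatype_by_extension returns, not a class from this changeset:

    class ZipArchive( object ):
        # Hypothetical datatype; 'compressed' is the attribute the hunk
        # tests with getattr( root_datatype, 'compressed', False ).
        compressed = True
        file_ext = 'zip'

    def keeps_archive_intact( root_datatype ):
        # Mirrors the gate above: compressed datatypes skip decompression.
        return getattr( root_datatype, 'compressed', False )

    assert keeps_archive_intact( ZipArchive() )  # stored as-is
    assert not keeps_archive_intact( object() )  # falls through to sniffing
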
@@ -312,11 +325,11 @@
                  stdout = stdout,
                  name = dataset.name,
                  line_count = line_count )
     if dataset.get('uuid', None) is not None:
         info['uuid'] = dataset.get('uuid')
-    json_file.write( to_json_string( info ) + "\n" )
+    json_file.write( dumps( info ) + "\n" )
 
     if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ):
         # Groom the dataset content if necessary
         datatype.groom_dataset_content( output_path )
 
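
The grooming calls above form a two-step hook on the datatype class: dataset_content_needs_grooming
makes a cheap check of the written file, and groom_dataset_content rewrites it into canonical form
when needed (Galaxy's Bam datatype, for example, uses this pair to coordinate-sort uploads). A
sketch of the contract with a hypothetical datatype; the two method names come from the hunk
above, the bodies are illustrative only:

    class GroomableDatatype( object ):
        def dataset_content_needs_grooming( self, file_name ):
            # Cheap inspection only, e.g. "is this BAM coordinate-sorted?"
            return False

        def groom_dataset_content( self, file_name ):
            # Rewrite file_name in place into its canonical form.
            pass
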
@@ -338,22 +351,33 @@
                 file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file )
                 return
             dataset.path = temp_name
             dp = temp_name
             if not value.is_binary:
+                tmpdir = output_adjacent_tmpdir( output_path )
+                tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                 if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
-                    sniff.convert_newlines_sep2tabs( dp )
+                    sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
                 else:
-                    sniff.convert_newlines( dp )
+                    sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
             shutil.move( dp, os.path.join( files_path, name ) )
     # Move the dataset to its "real" path
     shutil.move( dataset.primary_file, output_path )
     # Write the job info
     info = dict( type = 'dataset',
                  dataset_id = dataset.dataset_id,
                  stdout = 'uploaded %s file' % dataset.file_type )
-    json_file.write( to_json_string( info ) + "\n" )
+    json_file.write( dumps( info ) + "\n" )
+
+
+def output_adjacent_tmpdir( output_path ):
+    """ For temp files that will ultimately be moved to output_path anyway
+    just create the file directly in output_path's directory so shutil.move
+    will work optimally.
+    """
+    return os.path.dirname( output_path )
+
 
 def __main__():
 
     if len( sys.argv ) < 4:
         print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...'
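
The new output_adjacent_tmpdir helper addresses a filesystem detail: shutil.move is a cheap
rename when source and destination share a filesystem, but degrades to a copy-and-delete across
mount points. Creating scratch files in output_path's own directory keeps the final move a
rename. A small, self-contained demonstration of the pattern; the file names are invented:

    import os
    import shutil
    import tempfile

    def output_adjacent_tmpdir( output_path ):
        # Same directory as the destination, hence the same filesystem.
        return os.path.dirname( output_path )

    dest_dir = tempfile.mkdtemp()  # stand-in for the job's output directory
    output_path = os.path.join( dest_dir, 'dataset_42.dat' )
    fd, scratch = tempfile.mkstemp( dir=output_adjacent_tmpdir( output_path ) )
    os.write( fd, 'payload' )
    os.close( fd )
    shutil.move( scratch, output_path )  # same-filesystem move, i.e. a rename
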
@@ -364,11 +388,11 @@
 
     registry = Registry()
     registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] )
 
     for line in open( sys.argv[3], 'r' ):
-        dataset = from_json_string( line )
+        dataset = loads( line )
         dataset = util.bunch.Bunch( **safe_dict( dataset ) )
         try:
             output_path = output_paths[int( dataset.dataset_id )][0]
         except:
             print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id
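
The driver loop reads the job paramfile with the same line-oriented JSON convention the tool
writes: one dataset description per line, parsed with loads and wrapped in a Bunch so fields
read as attributes (dataset.file_type, dataset.path, and so on, as used throughout the code
above). A hypothetical paramfile line; every value is invented for illustration:

    {"dataset_id": 1, "type": "file", "name": "reads.zip", "file_type": "zip", "path": "/tmp/upload_file_1", "to_posix_lines": false, "space_to_tab": false}
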