changeset 3:6f24bce6817e draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Mon, 30 Mar 2015 10:13:25 -0400 |
| parents | 30d16d36d536 |
| children | fdd4776a434f |
| files | uploadzip.py |
| diffstat | 1 files changed, 165 insertions(+), 141 deletions(-) |
--- a/uploadzip.py Mon Mar 30 07:58:53 2015 -0400
+++ b/uploadzip.py Mon Mar 30 10:13:25 2015 -0400
@@ -36,7 +36,7 @@
     sys.stderr.write( msg )
     sys.exit( ret )
 def file_err( msg, dataset, json_file ):
-    json_file.write( to_json_string( dict( type = 'dataset',
+    json_file.write( dumps( dict( type = 'dataset',
                                   ext = 'data',
                                   dataset_id = dataset.dataset_id,
                                   stderr = msg ) ) + "\n" )
@@ -113,171 +113,184 @@
         ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
     # Is dataset content supported sniffable binary?
     else:
+        # FIXME: This ignores the declared sniff order in datatype_conf.xml
+        # resulting in improper behavior
         type_info = Binary.is_sniffable_binary( dataset.path )
         if type_info:
             data_type = type_info[0]
             ext = type_info[1]
-    data_type="binary"
+    data_type = 'compressed archive' #upload zip file modification
     if not data_type:
-        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
-        is_gzipped, is_valid = check_gzip( dataset.path )
-        if is_gzipped and not is_valid:
-            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
-            return
-        elif is_gzipped and is_valid:
-            if link_data_only == 'copy_files':
-                # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
-                CHUNK_SIZE = 2**20 # 1Mb
-                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
-                while 1:
-                    try:
-                        chunk = gzipped_file.read( CHUNK_SIZE )
-                    except IOError:
-                        os.close( fd )
-                        os.remove( uncompressed )
-                        file_err( 'Problem decompressing gzipped data', dataset, json_file )
-                        return
-                    if not chunk:
-                        break
-                    os.write( fd, chunk )
-                os.close( fd )
-                gzipped_file.close()
-                # Replace the gzipped file with the decompressed file if it's safe to do so
-                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                    dataset.path = uncompressed
-                else:
-                    shutil.move( uncompressed, dataset.path )
-                os.chmod(dataset.path, 0644)
-            dataset.name = dataset.name.rstrip( '.gz' )
-            data_type = 'gzip'
-        if not data_type and bz2 is not None:
-            # See if we have a bz2 file, much like gzip
-            is_bzipped, is_valid = check_bz2( dataset.path )
-            if is_bzipped and not is_valid:
+        root_datatype = registry.get_datatype_by_extension( dataset.file_type )
+        if getattr( root_datatype, 'compressed', False ):
+            data_type = 'compressed archive'
+            ext = dataset.file_type
+        else:
+            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+            is_gzipped, is_valid = check_gzip( dataset.path )
+            if is_gzipped and not is_valid:
                 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
                 return
-            elif is_bzipped and is_valid:
+            elif is_gzipped and is_valid:
                 if link_data_only == 'copy_files':
-                    # We need to uncompress the temp_name file
-                    CHUNK_SIZE = 2**20 # 1Mb
-                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                    bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
+                    CHUNK_SIZE = 2**20 # 1Mb
+                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                    gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
                     while 1:
                         try:
-                            chunk = bzipped_file.read( CHUNK_SIZE )
+                            chunk = gzipped_file.read( CHUNK_SIZE )
                         except IOError:
                             os.close( fd )
                             os.remove( uncompressed )
-                            file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                            file_err( 'Problem decompressing gzipped data', dataset, json_file )
                             return
                         if not chunk:
                             break
                         os.write( fd, chunk )
                     os.close( fd )
-                    bzipped_file.close()
-                    # Replace the bzipped file with the decompressed file if it's safe to do so
+                    gzipped_file.close()
+                    # Replace the gzipped file with the decompressed file if it's safe to do so
                     if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
                         dataset.path = uncompressed
                     else:
                         shutil.move( uncompressed, dataset.path )
                     os.chmod(dataset.path, 0644)
-                dataset.name = dataset.name.rstrip( '.bz2' )
-                data_type = 'bz2'
-        if not data_type:
-            # See if we have a zip archive
-            is_zipped = check_zip( dataset.path )
-            if is_zipped:
-                if link_data_only == 'copy_files':
-                    CHUNK_SIZE = 2**20 # 1Mb
-                    uncompressed = None
-                    uncompressed_name = None
-                    unzipped = False
-                    z = zipfile.ZipFile( dataset.path )
-                    for name in z.namelist():
-                        if name.endswith('/'):
-                            continue
-                        if unzipped:
-                            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
-                            break
-                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                        if sys.version_info[:2] >= ( 2, 6 ):
-                            zipped_file = z.open( name )
-                            while 1:
+                dataset.name = dataset.name.rstrip( '.gz' )
+                data_type = 'gzip'
+            if not data_type and bz2 is not None:
+                # See if we have a bz2 file, much like gzip
+                is_bzipped, is_valid = check_bz2( dataset.path )
+                if is_bzipped and not is_valid:
+                    file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
+                    return
+                elif is_bzipped and is_valid:
+                    if link_data_only == 'copy_files':
+                        # We need to uncompress the temp_name file
+                        CHUNK_SIZE = 2**20 # 1Mb
+                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                        bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+                        while 1:
+                            try:
+                                chunk = bzipped_file.read( CHUNK_SIZE )
+                            except IOError:
+                                os.close( fd )
+                                os.remove( uncompressed )
+                                file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                                return
+                            if not chunk:
+                                break
+                            os.write( fd, chunk )
+                        os.close( fd )
+                        bzipped_file.close()
+                        # Replace the bzipped file with the decompressed file if it's safe to do so
+                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                            dataset.path = uncompressed
+                        else:
+                            shutil.move( uncompressed, dataset.path )
+                        os.chmod(dataset.path, 0644)
+                    dataset.name = dataset.name.rstrip( '.bz2' )
+                    data_type = 'bz2'
+            if not data_type:
+                # See if we have a zip archive
+                is_zipped = check_zip( dataset.path )
+                if is_zipped:
+                    if link_data_only == 'copy_files':
+                        CHUNK_SIZE = 2**20 # 1Mb
+                        uncompressed = None
+                        uncompressed_name = None
+                        unzipped = False
+                        z = zipfile.ZipFile( dataset.path )
+                        for name in z.namelist():
+                            if name.endswith('/'):
+                                continue
+                            if unzipped:
+                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+                                break
+                            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                            if sys.version_info[:2] >= ( 2, 6 ):
+                                zipped_file = z.open( name )
+                                while 1:
+                                    try:
+                                        chunk = zipped_file.read( CHUNK_SIZE )
+                                    except IOError:
+                                        os.close( fd )
+                                        os.remove( uncompressed )
+                                        file_err( 'Problem decompressing zipped data', dataset, json_file )
+                                        return
+                                    if not chunk:
+                                        break
+                                    os.write( fd, chunk )
+                                os.close( fd )
+                                zipped_file.close()
+                                uncompressed_name = name
+                                unzipped = True
+                            else:
+                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                 try:
-                                    chunk = zipped_file.read( CHUNK_SIZE )
+                                    outfile = open( uncompressed, 'wb' )
+                                    outfile.write( z.read( name ) )
+                                    outfile.close()
+                                    uncompressed_name = name
+                                    unzipped = True
                                 except IOError:
                                     os.close( fd )
                                     os.remove( uncompressed )
                                     file_err( 'Problem decompressing zipped data', dataset, json_file )
                                     return
-                                if not chunk:
-                                    break
-                                os.write( fd, chunk )
-                            os.close( fd )
-                            zipped_file.close()
-                            uncompressed_name = name
-                            unzipped = True
-                        else:
-                            # python < 2.5 doesn't have a way to read members in chunks(!)
-                            try:
-                                outfile = open( uncompressed, 'wb' )
-                                outfile.write( z.read( name ) )
-                                outfile.close()
-                                uncompressed_name = name
-                                unzipped = True
-                            except IOError:
-                                os.close( fd )
-                                os.remove( uncompressed )
-                                file_err( 'Problem decompressing zipped data', dataset, json_file )
-                                return
-                    z.close()
-                    # Replace the zipped file with the decompressed file if it's safe to do so
-                    if uncompressed is not None:
-                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                            dataset.path = uncompressed
+                        z.close()
+                        # Replace the zipped file with the decompressed file if it's safe to do so
+                        if uncompressed is not None:
+                            if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                                dataset.path = uncompressed
+                            else:
+                                shutil.move( uncompressed, dataset.path )
+                            os.chmod(dataset.path, 0644)
+                            dataset.name = uncompressed_name
+                            data_type = 'zip'
+            if not data_type:
+                # TODO refactor this logic. check_binary isn't guaranteed to be
+                # correct since it only looks at whether the first 100 chars are
+                # printable or not. If someone specifies a known unsniffable
+                # binary datatype and check_binary fails, the file gets mangled.
+                if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
+                    # We have a binary dataset, but it is not Bam, Sff or Pdf
+                    data_type = 'binary'
+                    #binary_ok = False
+                    parts = dataset.name.split( "." )
+                    if len( parts ) > 1:
+                        ext = parts[-1].strip().lower()
+                        if not Binary.is_ext_unsniffable(ext):
+                            file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+                            return
+                        elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+                            file_err( err_msg, dataset, json_file )
+                            return
+            if not data_type:
+                # We must have a text file
+                if check_html( dataset.path ):
+                    file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+                    return
+            if data_type != 'binary':
+                if link_data_only == 'copy_files':
+                    if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
+                        in_place = False
+                    # Convert universal line endings to Posix line endings, but allow the user to turn it off,
+                    # so that is becomes possible to upload gzip, bz2 or zip files with binary data without
+                    # corrupting the content of those files.
+                    if dataset.to_posix_lines:
+                        tmpdir = output_adjacent_tmpdir( output_path )
+                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
+                        if dataset.space_to_tab:
+                            line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
                         else:
-                            shutil.move( uncompressed, dataset.path )
-                        os.chmod(dataset.path, 0644)
-                        dataset.name = uncompressed_name
-                        data_type = 'zip'
-        if not data_type:
-            if check_binary( dataset.path ):
-                # We have a binary dataset, but it is not Bam, Sff or Pdf
-                data_type = 'binary'
-                #binary_ok = False
-                parts = dataset.name.split( "." )
-                if len( parts ) > 1:
-                    ext = parts[-1].strip().lower()
-                    if not Binary.is_ext_unsniffable(ext):
-                        file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
-                        return
-                    elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
-                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
-                        file_err( err_msg, dataset, json_file )
-                        return
-        if not data_type:
-            # We must have a text file
-            if check_html( dataset.path ):
-                file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
-                return
-        if data_type != 'binary':
-            if link_data_only == 'copy_files':
-                if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
-                    in_place = False
-                # Convert universal line endings to Posix line endings, but allow the user to turn it off,
-                # so that is becomes possible to upload gzip, bz2 or zip files with binary data without
-                # corrupting the content of those files.
-                if dataset.to_posix_lines:
-                    if dataset.space_to_tab:
-                        line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
-                    else:
-                        line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
-            if dataset.file_type == 'auto':
-                ext = sniff.guess_ext( dataset.path, registry.sniff_order )
-            else:
-                ext = dataset.file_type
-            data_type = ext
+                            line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
+            if dataset.file_type == 'auto':
+                ext = sniff.guess_ext( dataset.path, registry.sniff_order )
+            else:
+                ext = dataset.file_type
+            data_type = ext
     # Save job info for the framework
     if ext == 'auto' and dataset.ext:
         ext = dataset.ext
@@ -314,7 +327,7 @@
                  line_count = line_count )
     if dataset.get('uuid', None) is not None:
         info['uuid'] = dataset.get('uuid')
-    json_file.write( to_json_string( info ) + "\n" )
+    json_file.write( dumps( info ) + "\n" )
 
     if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ):
         # Groom the dataset content if necessary
@@ -340,10 +353,12 @@
             dataset.path = temp_name
             dp = temp_name
         if not value.is_binary:
+            tmpdir = output_adjacent_tmpdir( output_path )
+            tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
             if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
-                sniff.convert_newlines_sep2tabs( dp )
+                sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
             else:
-                sniff.convert_newlines( dp )
+                sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
         shutil.move( dp, os.path.join( files_path, name ) )
     # Move the dataset to its "real" path
     shutil.move( dataset.primary_file, output_path )
@@ -351,7 +366,16 @@
     info = dict( type = 'dataset',
                  dataset_id = dataset.dataset_id,
                  stdout = 'uploaded %s file' % dataset.file_type )
-    json_file.write( to_json_string( info ) + "\n" )
+    json_file.write( dumps( info ) + "\n" )
+
+
+def output_adjacent_tmpdir( output_path ):
+    """ For temp files that will ultimately be moved to output_path anyway
+    just create the file directly in output_path's directory so shutil.move
+    will work optimially.
+    """
+    return os.path.dirname( output_path )
+
 
 def __main__():
@@ -366,7 +390,7 @@
     registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] )
 
     for line in open( sys.argv[3], 'r' ):
-        dataset = from_json_string( line )
+        dataset = loads( line )
         dataset = util.bunch.Bunch( **safe_dict( dataset ) )
         try:
            output_path = output_paths[int( dataset.dataset_id )][0]
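The functional heart of this changeset for the uploadzip tool is the single modified line `data_type = 'compressed archive' #upload zip file modification`: because every unpacking branch inside `add_file()` is guarded by `if not data_type:`, presetting the value means the gzip/bz2/zip extraction code above never runs and the uploaded archive reaches the Galaxy history intact. Below is a minimal standalone sketch of that short-circuit; it is toy code, not Galaxy's `add_file()`, and the `classify()` name and return values are illustrative assumptions.

```python
import zipfile


def classify(path, forced_type=None):
    """Toy model of the guard structure in add_file(): every unpacking
    branch checks `if not data_type:` first, so presetting data_type
    disables all of them."""
    data_type = forced_type  # uploadzip.py effectively presets 'compressed archive'
    if not data_type and zipfile.is_zipfile(path):
        data_type = 'zip'  # the stock upload tool would extract the first member here
    if not data_type:
        data_type = 'text'  # fall through to plain-text handling
    return data_type


# classify('archive.zip', forced_type='compressed archive') -> 'compressed archive'
# classify('archive.zip')                                   -> 'zip'
```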
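The other recurring pattern in the new code is decompressing in 1 MB chunks into a temp file created with `tempfile.mkstemp( dir=os.path.dirname( output_path ) )`, which is also the rationale for the new `output_adjacent_tmpdir()` helper: the temp file and the final destination share a directory, and therefore a filesystem, so the closing `shutil.move()` is a cheap rename rather than a cross-device copy. A simplified sketch of the same pattern follows (modern Python; the `gunzip_next_to` name is mine and error handling is reduced to the essentials).

```python
import gzip
import os
import shutil
import tempfile

CHUNK_SIZE = 2 ** 20  # 1 MB, the same chunk size the tool uses


def gunzip_next_to(src_path, output_path):
    """Decompress src_path in CHUNK_SIZE pieces into a temp file created in
    output_path's directory, then move it into place.  Sharing a filesystem
    with the destination keeps shutil.move() a cheap rename."""
    fd, tmp_path = tempfile.mkstemp(prefix='upload_gunzip_',
                                    dir=os.path.dirname(output_path))
    try:
        with gzip.open(src_path, 'rb') as gz:
            while True:
                chunk = gz.read(CHUNK_SIZE)
                if not chunk:
                    break
                os.write(fd, chunk)
    except (OSError, EOFError):
        # On a corrupt or truncated archive, drop the partial temp file.
        os.close(fd)
        os.remove(tmp_path)
        raise
    os.close(fd)
    shutil.move(tmp_path, output_path)
    return output_path
```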