Mercurial > repos > davidvanzessen > upload_zip_file
changeset 5:a2f200121dda draft default tip
Uploaded
| author | davidvanzessen | 
|---|---|
| date | Thu, 09 Apr 2015 04:18:00 -0400 | 
| parents | fdd4776a434f | 
| children | |
| files | uploadzip.py | 
| diffstat | 1 files changed, 138 insertions(+), 162 deletions(-) [+] | 
line wrap: on
 line diff
--- a/uploadzip.py Thu Apr 09 04:08:05 2015 -0400 +++ b/uploadzip.py Thu Apr 09 04:18:00 2015 -0400 @@ -36,7 +36,7 @@ sys.stderr.write( msg ) sys.exit( ret ) def file_err( msg, dataset, json_file ): - json_file.write( dumps( dict( type = 'dataset', + json_file.write( to_json_string( dict( type = 'dataset', ext = 'data', dataset_id = dataset.dataset_id, stderr = msg ) ) + "\n" ) @@ -113,184 +113,171 @@ ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) # Is dataset content supported sniffable binary? else: - # FIXME: This ignores the declared sniff order in datatype_conf.xml - # resulting in improper behavior type_info = Binary.is_sniffable_binary( dataset.path ) if type_info: data_type = type_info[0] ext = type_info[1] - data_type = 'compressed archive' #upload zip file modification + data_type = "compressed archive" if not data_type: - root_datatype = registry.get_datatype_by_extension( dataset.file_type ) - if getattr( root_datatype, 'compressed', False ): - data_type = 'compressed archive' - ext = dataset.file_type - else: - # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress - is_gzipped, is_valid = check_gzip( dataset.path ) - if is_gzipped and not is_valid: + # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress + is_gzipped, is_valid = check_gzip( dataset.path ) + if is_gzipped and not is_valid: + file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) + return + elif is_gzipped and is_valid: + if link_data_only == 'copy_files': + # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format + CHUNK_SIZE = 2**20 # 1Mb + fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) + gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) + while 1: + try: + chunk = gzipped_file.read( CHUNK_SIZE ) + except IOError: + os.close( fd ) + os.remove( uncompressed ) + file_err( 'Problem decompressing gzipped data', dataset, json_file ) + return + if not chunk: + break + os.write( fd, chunk ) + os.close( fd ) + gzipped_file.close() + # Replace the gzipped file with the decompressed file if it's safe to do so + if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: + dataset.path = uncompressed + else: + shutil.move( uncompressed, dataset.path ) + os.chmod(dataset.path, 0644) + dataset.name = dataset.name.rstrip( '.gz' ) + data_type = 'gzip' + if not data_type and bz2 is not None: + # See if we have a bz2 file, much like gzip + is_bzipped, is_valid = check_bz2( dataset.path ) + if is_bzipped and not is_valid: file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) return - elif is_gzipped and is_valid: + elif is_bzipped and is_valid: if link_data_only == 'copy_files': - # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format - CHUNK_SIZE = 2**20 # 1Mb - fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) - gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) + # We need to uncompress the temp_name file + CHUNK_SIZE = 2**20 # 1Mb + fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) + bzipped_file = bz2.BZ2File( dataset.path, 'rb' ) while 1: try: - chunk = gzipped_file.read( CHUNK_SIZE ) + chunk = bzipped_file.read( CHUNK_SIZE ) except IOError: os.close( fd ) os.remove( uncompressed ) - file_err( 'Problem decompressing gzipped data', dataset, json_file ) + file_err( 'Problem decompressing bz2 compressed data', dataset, json_file ) return if not chunk: break os.write( fd, chunk ) os.close( fd ) - gzipped_file.close() - # Replace the gzipped file with the decompressed file if it's safe to do so + bzipped_file.close() + # Replace the bzipped file with the decompressed file if it's safe to do so if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: dataset.path = uncompressed else: shutil.move( uncompressed, dataset.path ) os.chmod(dataset.path, 0644) - dataset.name = dataset.name.rstrip( '.gz' ) - data_type = 'gzip' - if not data_type and bz2 is not None: - # See if we have a bz2 file, much like gzip - is_bzipped, is_valid = check_bz2( dataset.path ) - if is_bzipped and not is_valid: - file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) - return - elif is_bzipped and is_valid: - if link_data_only == 'copy_files': - # We need to uncompress the temp_name file - CHUNK_SIZE = 2**20 # 1Mb - fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) - bzipped_file = bz2.BZ2File( dataset.path, 'rb' ) - while 1: + dataset.name = dataset.name.rstrip( '.bz2' ) + data_type = 'bz2' + if not data_type: + # See if we have a zip archive + is_zipped = check_zip( dataset.path ) + if is_zipped: + if link_data_only == 'copy_files': + CHUNK_SIZE = 2**20 # 1Mb + uncompressed = None + uncompressed_name = None + unzipped = False + z = zipfile.ZipFile( dataset.path ) + for name in z.namelist(): + if name.endswith('/'): + continue + if unzipped: + stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' + break + fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) + if sys.version_info[:2] >= ( 2, 6 ): + zipped_file = z.open( name ) + while 1: + try: + chunk = zipped_file.read( CHUNK_SIZE ) + except IOError: + os.close( fd ) + os.remove( uncompressed ) + file_err( 'Problem decompressing zipped data', dataset, json_file ) + return + if not chunk: + break + os.write( fd, chunk ) + os.close( fd ) + zipped_file.close() + uncompressed_name = name + unzipped = True + else: + # python < 2.5 doesn't have a way to read members in chunks(!) try: - chunk = bzipped_file.read( CHUNK_SIZE ) + outfile = open( uncompressed, 'wb' ) + outfile.write( z.read( name ) ) + outfile.close() + uncompressed_name = name + unzipped = True except IOError: os.close( fd ) os.remove( uncompressed ) - file_err( 'Problem decompressing bz2 compressed data', dataset, json_file ) + file_err( 'Problem decompressing zipped data', dataset, json_file ) return - if not chunk: - break - os.write( fd, chunk ) - os.close( fd ) - bzipped_file.close() - # Replace the bzipped file with the decompressed file if it's safe to do so + z.close() + # Replace the zipped file with the decompressed file if it's safe to do so + if uncompressed is not None: if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: dataset.path = uncompressed else: shutil.move( uncompressed, dataset.path ) os.chmod(dataset.path, 0644) - dataset.name = dataset.name.rstrip( '.bz2' ) - data_type = 'bz2' - if not data_type: - # See if we have a zip archive - is_zipped = check_zip( dataset.path ) - if is_zipped: - if link_data_only == 'copy_files': - CHUNK_SIZE = 2**20 # 1Mb - uncompressed = None - uncompressed_name = None - unzipped = False - z = zipfile.ZipFile( dataset.path ) - for name in z.namelist(): - if name.endswith('/'): - continue - if unzipped: - stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' - break - fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) - if sys.version_info[:2] >= ( 2, 6 ): - zipped_file = z.open( name ) - while 1: - try: - chunk = zipped_file.read( CHUNK_SIZE ) - except IOError: - os.close( fd ) - os.remove( uncompressed ) - file_err( 'Problem decompressing zipped data', dataset, json_file ) - return - if not chunk: - break - os.write( fd, chunk ) - os.close( fd ) - zipped_file.close() - uncompressed_name = name - unzipped = True - else: - # python < 2.5 doesn't have a way to read members in chunks(!) - try: - outfile = open( uncompressed, 'wb' ) - outfile.write( z.read( name ) ) - outfile.close() - uncompressed_name = name - unzipped = True - except IOError: - os.close( fd ) - os.remove( uncompressed ) - file_err( 'Problem decompressing zipped data', dataset, json_file ) - return - z.close() - # Replace the zipped file with the decompressed file if it's safe to do so - if uncompressed is not None: - if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: - dataset.path = uncompressed - else: - shutil.move( uncompressed, dataset.path ) - os.chmod(dataset.path, 0644) - dataset.name = uncompressed_name - data_type = 'zip' - if not data_type: - # TODO refactor this logic. check_binary isn't guaranteed to be - # correct since it only looks at whether the first 100 chars are - # printable or not. If someone specifies a known unsniffable - # binary datatype and check_binary fails, the file gets mangled. - if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type): - # We have a binary dataset, but it is not Bam, Sff or Pdf - data_type = 'binary' - #binary_ok = False - parts = dataset.name.split( "." ) - if len( parts ) > 1: - ext = parts[-1].strip().lower() - if not Binary.is_ext_unsniffable(ext): - file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) - return - elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: - err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) - file_err( err_msg, dataset, json_file ) - return - if not data_type: - # We must have a text file - if check_html( dataset.path ): - file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) - return - if data_type != 'binary': - if link_data_only == 'copy_files': - if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: - in_place = False - # Convert universal line endings to Posix line endings, but allow the user to turn it off, - # so that is becomes possible to upload gzip, bz2 or zip files with binary data without - # corrupting the content of those files. - if dataset.to_posix_lines: - tmpdir = output_adjacent_tmpdir( output_path ) - tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id - if dataset.space_to_tab: - line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) - else: - line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) - if dataset.file_type == 'auto': - ext = sniff.guess_ext( dataset.path, registry.sniff_order ) - else: - ext = dataset.file_type - data_type = ext + dataset.name = uncompressed_name + data_type = 'zip' + if not data_type: + if check_binary( dataset.path ): + # We have a binary dataset, but it is not Bam, Sff or Pdf + data_type = 'binary' + #binary_ok = False + parts = dataset.name.split( "." ) + if len( parts ) > 1: + ext = parts[-1].strip().lower() + if not Binary.is_ext_unsniffable(ext): + file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) + return + elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: + err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) + file_err( err_msg, dataset, json_file ) + return + if not data_type: + # We must have a text file + if check_html( dataset.path ): + file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) + return + if data_type != 'binary': + if link_data_only == 'copy_files': + if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: + in_place = False + # Convert universal line endings to Posix line endings, but allow the user to turn it off, + # so that is becomes possible to upload gzip, bz2 or zip files with binary data without + # corrupting the content of those files. + if dataset.to_posix_lines: + if dataset.space_to_tab: + line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place ) + else: + line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place ) + if dataset.file_type == 'auto': + ext = sniff.guess_ext( dataset.path, registry.sniff_order ) + else: + ext = dataset.file_type + data_type = ext # Save job info for the framework if ext == 'auto' and dataset.ext: ext = dataset.ext @@ -327,7 +314,7 @@ line_count = line_count ) if dataset.get('uuid', None) is not None: info['uuid'] = dataset.get('uuid') - json_file.write( dumps( info ) + "\n" ) + json_file.write( to_json_string( info ) + "\n" ) if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ): # Groom the dataset content if necessary @@ -353,12 +340,10 @@ dataset.path = temp_name dp = temp_name if not value.is_binary: - tmpdir = output_adjacent_tmpdir( output_path ) - tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ): - sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) + sniff.convert_newlines_sep2tabs( dp ) else: - sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) + sniff.convert_newlines( dp ) shutil.move( dp, os.path.join( files_path, name ) ) # Move the dataset to its "real" path shutil.move( dataset.primary_file, output_path ) @@ -366,16 +351,7 @@ info = dict( type = 'dataset', dataset_id = dataset.dataset_id, stdout = 'uploaded %s file' % dataset.file_type ) - json_file.write( dumps( info ) + "\n" ) - - -def output_adjacent_tmpdir( output_path ): - """ For temp files that will ultimately be moved to output_path anyway - just create the file directly in output_path's directory so shutil.move - will work optimially. - """ - return os.path.dirname( output_path ) - + json_file.write( to_json_string( info ) + "\n" ) def __main__(): @@ -390,7 +366,7 @@ registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] ) for line in open( sys.argv[3], 'r' ): - dataset = loads( line ) + dataset = from_json_string( line ) dataset = util.bunch.Bunch( **safe_dict( dataset ) ) try: output_path = output_paths[int( dataset.dataset_id )][0]
