changeset 2:30d16d36d536 draft
Uploaded
author | davidvanzessen
---|---
date | Mon, 30 Mar 2015 07:58:53 -0400
parents | fb547483e7bd
children | 6f24bce6817e
files | uploadzip.py
diffstat | 1 files changed, 161 insertions(+), 4 deletions(-)
--- a/uploadzip.py	Wed Nov 13 10:49:14 2013 -0500
+++ b/uploadzip.py	Mon Mar 30 07:58:53 2015 -0400
@@ -117,9 +117,167 @@
     if type_info:
         data_type = type_info[0]
         ext = type_info[1]
+    data_type="binary"
     if not data_type:
-        shutil.move( dataset.path, output_path )
-        #data_type = "data"
+        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+        is_gzipped, is_valid = check_gzip( dataset.path )
+        if is_gzipped and not is_valid:
+            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
+            return
+        elif is_gzipped and is_valid:
+            if link_data_only == 'copy_files':
+                # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
+                CHUNK_SIZE = 2**20 # 1Mb
+                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
+                while 1:
+                    try:
+                        chunk = gzipped_file.read( CHUNK_SIZE )
+                    except IOError:
+                        os.close( fd )
+                        os.remove( uncompressed )
+                        file_err( 'Problem decompressing gzipped data', dataset, json_file )
+                        return
+                    if not chunk:
+                        break
+                    os.write( fd, chunk )
+                os.close( fd )
+                gzipped_file.close()
+                # Replace the gzipped file with the decompressed file if it's safe to do so
+                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                    dataset.path = uncompressed
+                else:
+                    shutil.move( uncompressed, dataset.path )
+                    os.chmod(dataset.path, 0644)
+            dataset.name = dataset.name.rstrip( '.gz' )
+            data_type = 'gzip'
+    if not data_type and bz2 is not None:
+        # See if we have a bz2 file, much like gzip
+        is_bzipped, is_valid = check_bz2( dataset.path )
+        if is_bzipped and not is_valid:
+            file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
+            return
+        elif is_bzipped and is_valid:
+            if link_data_only == 'copy_files':
+                # We need to uncompress the temp_name file
+                CHUNK_SIZE = 2**20 # 1Mb
+                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+                while 1:
+                    try:
+                        chunk = bzipped_file.read( CHUNK_SIZE )
+                    except IOError:
+                        os.close( fd )
+                        os.remove( uncompressed )
+                        file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                        return
+                    if not chunk:
+                        break
+                    os.write( fd, chunk )
+                os.close( fd )
+                bzipped_file.close()
+                # Replace the bzipped file with the decompressed file if it's safe to do so
+                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                    dataset.path = uncompressed
+                else:
+                    shutil.move( uncompressed, dataset.path )
+                    os.chmod(dataset.path, 0644)
+            dataset.name = dataset.name.rstrip( '.bz2' )
+            data_type = 'bz2'
+    if not data_type:
+        # See if we have a zip archive
+        is_zipped = check_zip( dataset.path )
+        if is_zipped:
+            if link_data_only == 'copy_files':
+                CHUNK_SIZE = 2**20 # 1Mb
+                uncompressed = None
+                uncompressed_name = None
+                unzipped = False
+                z = zipfile.ZipFile( dataset.path )
+                for name in z.namelist():
+                    if name.endswith('/'):
+                        continue
+                    if unzipped:
+                        stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+                        break
+                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                    if sys.version_info[:2] >= ( 2, 6 ):
+                        zipped_file = z.open( name )
+                        while 1:
+                            try:
+                                chunk = zipped_file.read( CHUNK_SIZE )
+                            except IOError:
+                                os.close( fd )
+                                os.remove( uncompressed )
+                                file_err( 'Problem decompressing zipped data', dataset, json_file )
+                                return
+                            if not chunk:
+                                break
+                            os.write( fd, chunk )
+                        os.close( fd )
+                        zipped_file.close()
+                        uncompressed_name = name
+                        unzipped = True
+                    else:
+                        # python < 2.6 doesn't have a way to read members in chunks(!)
+                        try:
+                            outfile = open( uncompressed, 'wb' )
+                            outfile.write( z.read( name ) )
+                            outfile.close()
+                            uncompressed_name = name
+                            unzipped = True
+                        except IOError:
+                            os.close( fd )
+                            os.remove( uncompressed )
+                            file_err( 'Problem decompressing zipped data', dataset, json_file )
+                            return
+                z.close()
+                # Replace the zipped file with the decompressed file if it's safe to do so
+                if uncompressed is not None:
+                    if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                        dataset.path = uncompressed
+                    else:
+                        shutil.move( uncompressed, dataset.path )
+                        os.chmod(dataset.path, 0644)
+                    dataset.name = uncompressed_name
+            data_type = 'zip'
+    if not data_type:
+        if check_binary( dataset.path ):
+            # We have a binary dataset, but it is not Bam, Sff or Pdf
+            data_type = 'binary'
+            #binary_ok = False
+            parts = dataset.name.split( "." )
+            if len( parts ) > 1:
+                ext = parts[-1].strip().lower()
+                if not Binary.is_ext_unsniffable(ext):
+                    file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+                    return
+                elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+                    err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+                    file_err( err_msg, dataset, json_file )
+                    return
+    if not data_type:
+        # We must have a text file
+        if check_html( dataset.path ):
+            file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+            return
+    if data_type != 'binary':
+        if link_data_only == 'copy_files':
+            if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
+                in_place = False
+            # Convert universal line endings to Posix line endings, but allow the user to turn it off,
+            # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
+            # corrupting the content of those files.
+            if dataset.to_posix_lines:
+                if dataset.space_to_tab:
+                    line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
+                else:
+                    line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
+        if dataset.file_type == 'auto':
+            ext = sniff.guess_ext( dataset.path, registry.sniff_order )
+        else:
+            ext = dataset.file_type
+        data_type = ext
     # Save job info for the framework
     if ext == 'auto' and dataset.ext:
         ext = dataset.ext
@@ -145,8 +303,7 @@
             # This should not happen, but it's here just in case
            shutil.copy( dataset.path, output_path )
     elif link_data_only == 'copy_files':
-        if os.path.exists(dataset.path):
-            shutil.move( dataset.path, output_path )
+        shutil.move( dataset.path, output_path )
     # Write the job info
     stdout = stdout or 'uploaded %s file' % data_type
     info = dict( type = 'dataset',
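
All three decompression branches in the first hunk follow one pattern: stream the archive into a file made with tempfile.mkstemp in CHUNK_SIZE pieces, delete the partial file and bail out on a read error, then either repoint dataset.path at the decompressed copy or move it over the original, depending on whether an in-place rewrite is safe. A minimal sketch of that pattern in modern Python, standard library only (the function and argument names here are illustrative, not part of the changeset):

```python
import gzip
import os
import tempfile
import zlib

CHUNK_SIZE = 2 ** 20  # 1 MB, the same chunk size the diff uses

def gunzip_to_tempfile(src_path, dest_dir):
    """Stream-decompress src_path into a temp file under dest_dir.

    Returns the temp file's path; deletes the partial file and
    re-raises if the archive is unreadable partway through.
    """
    fd, uncompressed = tempfile.mkstemp(prefix='upload_gunzip_', dir=dest_dir)
    try:
        with os.fdopen(fd, 'wb') as out, gzip.open(src_path, 'rb') as gz:
            while True:
                chunk = gz.read(CHUNK_SIZE)  # never holds more than 1 MB
                if not chunk:
                    break
                out.write(chunk)
    except (OSError, EOFError, zlib.error):
        os.remove(uncompressed)  # mirror the diff's cleanup on IOError
        raise
    return uncompressed
```

The chunked reads are what keep a multi-gigabyte upload from being pulled into memory at once, which is also why the pre-2.6 zipfile fallback in the diff (a single z.read( name )) carries an exasperated comment.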
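The zip branch layers one policy on top of that pattern: directory entries are skipped, only the first regular member is extracted, and a warning goes to stdout when further members are ignored. A hedged sketch of the same policy using Python's standard zipfile module (names again illustrative):

```python
import os
import shutil
import tempfile
import zipfile

def extract_first_member(zip_path, dest_dir, chunk_size=2 ** 20):
    """Extract only the first file entry of zip_path into dest_dir.

    Returns (extracted_path, member_name, warning); warning is None
    unless the archive held more than one file.
    """
    with zipfile.ZipFile(zip_path) as z:
        names = [n for n in z.namelist() if not n.endswith('/')]  # skip dirs
        if not names:
            raise ValueError('zip archive contains no file entries')
        warning = None
        if len(names) > 1:
            warning = ('ZIP file contained more than one file, '
                       'only the first file was added.')
        fd, out_path = tempfile.mkstemp(prefix='upload_zip_', dir=dest_dir)
        with os.fdopen(fd, 'wb') as out, z.open(names[0]) as member:
            shutil.copyfileobj(member, out, chunk_size)  # chunked copy
    return out_path, names[0], warning
```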
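Finally, for anything that is not binary, the diff normalizes line endings through sniff.convert_newlines (or convert_newlines_sep2tabs when space_to_tab is set) before sniffing the file type, honoring the same in_place rule as the decompressors so linked-in files are never rewritten. Galaxy's helper is not reproduced here; the following is only a rough standard-library stand-in for what such a conversion does:

```python
import os
import shutil
import tempfile

def convert_newlines(path, in_place=True):
    """Rewrite path with POSIX (LF) line endings.

    Returns (line_count, resulting_path). With in_place=False the
    original file is left untouched and the converted copy is
    returned, matching how the diff avoids touching linked datasets.
    """
    fd, converted = tempfile.mkstemp(prefix='newline_convert_')
    line_count = 0
    # newline=None reads in universal-newline mode; newline='\n' writes LF.
    with os.fdopen(fd, 'w', newline='\n') as out, \
            open(path, 'r', newline=None) as src:
        for line in src:
            line_count += 1
            out.write(line)
    if in_place:
        shutil.move(converted, path)
        return line_count, path
    return line_count, converted
```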