comparison: uploadzip.py @ 3:6f24bce6817e (draft)

description: Uploaded
author:      davidvanzessen
date:        Mon, 30 Mar 2015 10:13:25 -0400
parents:     30d16d36d536
children:    a2f200121dda
comparing 2:30d16d36d536 (old) with 3:6f24bce6817e (new)
@@ -34,11 +34,11 @@
 
 def stop_err( msg, ret=1 ):
     sys.stderr.write( msg )
     sys.exit( ret )
 def file_err( msg, dataset, json_file ):
-    json_file.write( to_json_string( dict( type = 'dataset',
+    json_file.write( dumps( dict( type = 'dataset',
                                    ext = 'data',
                                    dataset_id = dataset.dataset_id,
                                    stderr = msg ) ) + "\n" )
     # never remove a server-side upload
     if dataset.type in ( 'server_dir', 'path_paste' ):
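
Note on this hunk: the only change is the serializer. Galaxy's old to_json_string helper is
swapped for dumps (and from_json_string for loads further down), presumably tracking the rename
of the helpers in galaxy.util.json to the standard library names; the import hunk itself is not
shown here. The upload tool reports status to the framework as one JSON object per line, which
is what file_err writes. A minimal sketch of that protocol using only the standard library;
json_file and the field values are illustrative, not taken from this changeset:

    from json import dumps

    def report_error( msg, dataset_id, json_file ):
        # One status object per line ("JSON lines"); the framework parses
        # each line to decide how to present the upload result.
        json_file.write( dumps( dict( type='dataset',
                                      ext='data',
                                      dataset_id=dataset_id,
                                      stderr=msg ) ) + "\n" )
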
@@ -111,175 +111,188 @@
     elif dataset.is_multi_byte:
         data_type = 'multi-byte char'
         ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
     # Is dataset content supported sniffable binary?
     else:
+        # FIXME: This ignores the declared sniff order in datatype_conf.xml
+        # resulting in improper behavior
         type_info = Binary.is_sniffable_binary( dataset.path )
         if type_info:
             data_type = type_info[0]
             ext = type_info[1]
-            data_type = "binary"
+            data_type = 'compressed archive' #upload zip file modification
     if not data_type:
-        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
-        is_gzipped, is_valid = check_gzip( dataset.path )
-        if is_gzipped and not is_valid:
-            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
-            return
-        elif is_gzipped and is_valid:
-            if link_data_only == 'copy_files':
-                # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
-                CHUNK_SIZE = 2**20 # 1Mb
-                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
-                while 1:
-                    try:
-                        chunk = gzipped_file.read( CHUNK_SIZE )
-                    except IOError:
-                        os.close( fd )
-                        os.remove( uncompressed )
-                        file_err( 'Problem decompressing gzipped data', dataset, json_file )
-                        return
-                    if not chunk:
-                        break
-                    os.write( fd, chunk )
-                os.close( fd )
-                gzipped_file.close()
-                # Replace the gzipped file with the decompressed file if it's safe to do so
-                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                    dataset.path = uncompressed
-                else:
-                    shutil.move( uncompressed, dataset.path )
-                os.chmod(dataset.path, 0644)
-            dataset.name = dataset.name.rstrip( '.gz' )
-            data_type = 'gzip'
-        if not data_type and bz2 is not None:
-            # See if we have a bz2 file, much like gzip
-            is_bzipped, is_valid = check_bz2( dataset.path )
-            if is_bzipped and not is_valid:
-                file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
-                return
-            elif is_bzipped and is_valid:
-                if link_data_only == 'copy_files':
-                    # We need to uncompress the temp_name file
-                    CHUNK_SIZE = 2**20 # 1Mb
-                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                    bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
-                    while 1:
-                        try:
-                            chunk = bzipped_file.read( CHUNK_SIZE )
-                        except IOError:
-                            os.close( fd )
-                            os.remove( uncompressed )
-                            file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
-                            return
-                        if not chunk:
-                            break
-                        os.write( fd, chunk )
-                    os.close( fd )
-                    bzipped_file.close()
-                    # Replace the bzipped file with the decompressed file if it's safe to do so
-                    if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                        dataset.path = uncompressed
-                    else:
-                        shutil.move( uncompressed, dataset.path )
-                    os.chmod(dataset.path, 0644)
-                dataset.name = dataset.name.rstrip( '.bz2' )
-                data_type = 'bz2'
-        if not data_type:
-            # See if we have a zip archive
-            is_zipped = check_zip( dataset.path )
-            if is_zipped:
-                if link_data_only == 'copy_files':
-                    CHUNK_SIZE = 2**20 # 1Mb
-                    uncompressed = None
-                    uncompressed_name = None
-                    unzipped = False
-                    z = zipfile.ZipFile( dataset.path )
-                    for name in z.namelist():
-                        if name.endswith('/'):
-                            continue
-                        if unzipped:
-                            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
-                            break
-                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                        if sys.version_info[:2] >= ( 2, 6 ):
-                            zipped_file = z.open( name )
-                            while 1:
-                                try:
-                                    chunk = zipped_file.read( CHUNK_SIZE )
-                                except IOError:
-                                    os.close( fd )
-                                    os.remove( uncompressed )
-                                    file_err( 'Problem decompressing zipped data', dataset, json_file )
-                                    return
-                                if not chunk:
-                                    break
-                                os.write( fd, chunk )
-                            os.close( fd )
-                            zipped_file.close()
-                            uncompressed_name = name
-                            unzipped = True
-                        else:
-                            # python < 2.5 doesn't have a way to read members in chunks(!)
-                            try:
-                                outfile = open( uncompressed, 'wb' )
-                                outfile.write( z.read( name ) )
-                                outfile.close()
-                                uncompressed_name = name
-                                unzipped = True
-                            except IOError:
-                                os.close( fd )
-                                os.remove( uncompressed )
-                                file_err( 'Problem decompressing zipped data', dataset, json_file )
-                                return
-                    z.close()
-                    # Replace the zipped file with the decompressed file if it's safe to do so
-                    if uncompressed is not None:
-                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                            dataset.path = uncompressed
-                        else:
-                            shutil.move( uncompressed, dataset.path )
-                        os.chmod(dataset.path, 0644)
-                        dataset.name = uncompressed_name
-                data_type = 'zip'
-        if not data_type:
-            if check_binary( dataset.path ):
-                # We have a binary dataset, but it is not Bam, Sff or Pdf
-                data_type = 'binary'
-                #binary_ok = False
-                parts = dataset.name.split( "." )
-                if len( parts ) > 1:
-                    ext = parts[-1].strip().lower()
-                    if not Binary.is_ext_unsniffable(ext):
-                        file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
-                        return
-                    elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
-                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
-                        file_err( err_msg, dataset, json_file )
-                        return
-        if not data_type:
-            # We must have a text file
-            if check_html( dataset.path ):
-                file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
-                return
+        root_datatype = registry.get_datatype_by_extension( dataset.file_type )
+        if getattr( root_datatype, 'compressed', False ):
+            data_type = 'compressed archive'
+            ext = dataset.file_type
+        else:
+            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+            is_gzipped, is_valid = check_gzip( dataset.path )
+            if is_gzipped and not is_valid:
+                file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
+                return
+            elif is_gzipped and is_valid:
+                if link_data_only == 'copy_files':
+                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
+                    CHUNK_SIZE = 2**20 # 1Mb
+                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                    gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
+                    while 1:
+                        try:
+                            chunk = gzipped_file.read( CHUNK_SIZE )
+                        except IOError:
+                            os.close( fd )
+                            os.remove( uncompressed )
+                            file_err( 'Problem decompressing gzipped data', dataset, json_file )
+                            return
+                        if not chunk:
+                            break
+                        os.write( fd, chunk )
+                    os.close( fd )
+                    gzipped_file.close()
+                    # Replace the gzipped file with the decompressed file if it's safe to do so
+                    if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                        dataset.path = uncompressed
+                    else:
+                        shutil.move( uncompressed, dataset.path )
+                    os.chmod(dataset.path, 0644)
+                dataset.name = dataset.name.rstrip( '.gz' )
+                data_type = 'gzip'
+            if not data_type and bz2 is not None:
+                # See if we have a bz2 file, much like gzip
+                is_bzipped, is_valid = check_bz2( dataset.path )
+                if is_bzipped and not is_valid:
+                    file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
+                    return
+                elif is_bzipped and is_valid:
+                    if link_data_only == 'copy_files':
+                        # We need to uncompress the temp_name file
+                        CHUNK_SIZE = 2**20 # 1Mb
+                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                        bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+                        while 1:
+                            try:
+                                chunk = bzipped_file.read( CHUNK_SIZE )
+                            except IOError:
+                                os.close( fd )
+                                os.remove( uncompressed )
+                                file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                                return
+                            if not chunk:
+                                break
+                            os.write( fd, chunk )
+                        os.close( fd )
+                        bzipped_file.close()
+                        # Replace the bzipped file with the decompressed file if it's safe to do so
+                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                            dataset.path = uncompressed
+                        else:
+                            shutil.move( uncompressed, dataset.path )
+                        os.chmod(dataset.path, 0644)
+                    dataset.name = dataset.name.rstrip( '.bz2' )
+                    data_type = 'bz2'
+            if not data_type:
+                # See if we have a zip archive
+                is_zipped = check_zip( dataset.path )
+                if is_zipped:
+                    if link_data_only == 'copy_files':
+                        CHUNK_SIZE = 2**20 # 1Mb
+                        uncompressed = None
+                        uncompressed_name = None
+                        unzipped = False
+                        z = zipfile.ZipFile( dataset.path )
+                        for name in z.namelist():
+                            if name.endswith('/'):
+                                continue
+                            if unzipped:
+                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+                                break
+                            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                            if sys.version_info[:2] >= ( 2, 6 ):
+                                zipped_file = z.open( name )
+                                while 1:
+                                    try:
+                                        chunk = zipped_file.read( CHUNK_SIZE )
+                                    except IOError:
+                                        os.close( fd )
+                                        os.remove( uncompressed )
+                                        file_err( 'Problem decompressing zipped data', dataset, json_file )
+                                        return
+                                    if not chunk:
+                                        break
+                                    os.write( fd, chunk )
+                                os.close( fd )
+                                zipped_file.close()
+                                uncompressed_name = name
+                                unzipped = True
+                            else:
+                                # python < 2.5 doesn't have a way to read members in chunks(!)
+                                try:
+                                    outfile = open( uncompressed, 'wb' )
+                                    outfile.write( z.read( name ) )
+                                    outfile.close()
+                                    uncompressed_name = name
+                                    unzipped = True
+                                except IOError:
+                                    os.close( fd )
+                                    os.remove( uncompressed )
+                                    file_err( 'Problem decompressing zipped data', dataset, json_file )
+                                    return
+                        z.close()
+                        # Replace the zipped file with the decompressed file if it's safe to do so
+                        if uncompressed is not None:
+                            if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                                dataset.path = uncompressed
+                            else:
+                                shutil.move( uncompressed, dataset.path )
+                            os.chmod(dataset.path, 0644)
+                            dataset.name = uncompressed_name
+                    data_type = 'zip'
+            if not data_type:
+                # TODO refactor this logic. check_binary isn't guaranteed to be
+                # correct since it only looks at whether the first 100 chars are
+                # printable or not. If someone specifies a known unsniffable
+                # binary datatype and check_binary fails, the file gets mangled.
+                if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
+                    # We have a binary dataset, but it is not Bam, Sff or Pdf
+                    data_type = 'binary'
+                    #binary_ok = False
+                    parts = dataset.name.split( "." )
+                    if len( parts ) > 1:
+                        ext = parts[-1].strip().lower()
+                        if not Binary.is_ext_unsniffable(ext):
+                            file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+                            return
+                        elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+                            file_err( err_msg, dataset, json_file )
+                            return
+            if not data_type:
+                # We must have a text file
+                if check_html( dataset.path ):
+                    file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+                    return
     if data_type != 'binary':
         if link_data_only == 'copy_files':
             if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
                 in_place = False
             # Convert universal line endings to Posix line endings, but allow the user to turn it off,
             # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
             # corrupting the content of those files.
             if dataset.to_posix_lines:
+                tmpdir = output_adjacent_tmpdir( output_path )
+                tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                 if dataset.space_to_tab:
-                    line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
+                    line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
                 else:
-                    line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
+                    line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
         if dataset.file_type == 'auto':
             ext = sniff.guess_ext( dataset.path, registry.sniff_order )
         else:
             ext = dataset.file_type
         data_type = ext
     # Save job info for the framework
     if ext == 'auto' and dataset.ext:
         ext = dataset.ext
     if ext == 'auto':
         ext = 'data'
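
The heart of the upload-zip modification lands in this hunk, in two places. First, when the
binary sniffer recognizes the content, data_type is forced to 'compressed archive' instead of
"binary" (the line tagged #upload zip file modification). Second, when the user-selected
file_type resolves to a datatype class declaring compressed = True, the archive is stored
intact rather than routed through the gzip/bz2/zip unpacking branches. A minimal sketch of
that getattr gate; ZipArchive is a hypothetical stand-in for whatever
registry.get_datatype_by_extension returns, not a class from this changeset:

    class ZipArchive( object ):
        # Hypothetical datatype; 'compressed' is the attribute the hunk
        # tests with getattr( root_datatype, 'compressed', False ).
        compressed = True
        file_ext = 'zip'

    def keeps_archive_intact( root_datatype ):
        # Mirrors the gate above: compressed datatypes skip decompression.
        return getattr( root_datatype, 'compressed', False )

    assert keeps_archive_intact( ZipArchive() )  # stored as-is
    assert not keeps_archive_intact( object() )  # falls through to sniffing
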
@@ -312,11 +325,11 @@
                  stdout = stdout,
                  name = dataset.name,
                  line_count = line_count )
     if dataset.get('uuid', None) is not None:
         info['uuid'] = dataset.get('uuid')
-    json_file.write( to_json_string( info ) + "\n" )
+    json_file.write( dumps( info ) + "\n" )
 
     if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ):
         # Groom the dataset content if necessary
         datatype.groom_dataset_content( output_path )
 
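
The grooming calls above form a two-step hook on the datatype class: dataset_content_needs_grooming
makes a cheap check of the written file, and groom_dataset_content rewrites it into canonical form
when needed (Galaxy's Bam datatype, for example, uses this pair to coordinate-sort uploads). A
sketch of the contract with a hypothetical datatype; the two method names come from the hunk
above, the bodies are illustrative only:

    class GroomableDatatype( object ):
        def dataset_content_needs_grooming( self, file_name ):
            # Cheap inspection only, e.g. "is this BAM coordinate-sorted?"
            return False

        def groom_dataset_content( self, file_name ):
            # Rewrite file_name in place into its canonical form.
            pass
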
@@ -338,22 +351,33 @@
                 file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file )
                 return
             dataset.path = temp_name
             dp = temp_name
             if not value.is_binary:
+                tmpdir = output_adjacent_tmpdir( output_path )
+                tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                 if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
-                    sniff.convert_newlines_sep2tabs( dp )
+                    sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
                 else:
-                    sniff.convert_newlines( dp )
+                    sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
             shutil.move( dp, os.path.join( files_path, name ) )
     # Move the dataset to its "real" path
     shutil.move( dataset.primary_file, output_path )
     # Write the job info
     info = dict( type = 'dataset',
                  dataset_id = dataset.dataset_id,
                  stdout = 'uploaded %s file' % dataset.file_type )
-    json_file.write( to_json_string( info ) + "\n" )
+    json_file.write( dumps( info ) + "\n" )
+
+
+def output_adjacent_tmpdir( output_path ):
+    """ For temp files that will ultimately be moved to output_path anyway
+    just create the file directly in output_path's directory so shutil.move
+    will work optimally.
+    """
+    return os.path.dirname( output_path )
+
 
 def __main__():
 
     if len( sys.argv ) < 4:
         print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...'
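
The new output_adjacent_tmpdir helper addresses a filesystem detail: shutil.move is a cheap
rename when source and destination share a filesystem, but degrades to a copy-and-delete across
mount points. Creating scratch files in output_path's own directory keeps the final move a
rename. A small, self-contained demonstration of the pattern; the file names are invented:

    import os
    import shutil
    import tempfile

    def output_adjacent_tmpdir( output_path ):
        # Same directory as the destination, hence the same filesystem.
        return os.path.dirname( output_path )

    dest_dir = tempfile.mkdtemp()  # stand-in for the job's output directory
    output_path = os.path.join( dest_dir, 'dataset_42.dat' )
    fd, scratch = tempfile.mkstemp( dir=output_adjacent_tmpdir( output_path ) )
    os.write( fd, 'payload' )
    os.close( fd )
    shutil.move( scratch, output_path )  # same-filesystem move, i.e. a rename
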
@@ -364,11 +388,11 @@
 
     registry = Registry()
     registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] )
 
     for line in open( sys.argv[3], 'r' ):
-        dataset = from_json_string( line )
+        dataset = loads( line )
         dataset = util.bunch.Bunch( **safe_dict( dataset ) )
         try:
             output_path = output_paths[int( dataset.dataset_id )][0]
         except:
             print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id
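
The driver loop reads the job paramfile with the same line-oriented JSON convention the tool
writes: one dataset description per line, parsed with loads and wrapped in a Bunch so fields
read as attributes (dataset.file_type, dataset.path, and so on, as used throughout the code
above). A hypothetical paramfile line; every value is invented for illustration:

    {"dataset_id": 1, "type": "file", "name": "reads.zip", "file_type": "zip", "path": "/tmp/upload_file_1", "to_posix_lines": false, "space_to_tab": false}
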