comparison: uploadzip.py @ 5:a2f200121dda (phase: draft, branch: default, tag: tip)

description  Uploaded
author       davidvanzessen
date         Thu, 09 Apr 2015 04:18:00 -0400
parents      6f24bce6817e
children     (none)

--- uploadzip.py (4:fdd4776a434f)
+++ uploadzip.py (5:a2f200121dda)
@@ -34,11 +34,11 @@
 
 def stop_err( msg, ret=1 ):
 sys.stderr.write( msg )
 sys.exit( ret )
 def file_err( msg, dataset, json_file ):
-json_file.write( dumps( dict( type = 'dataset',
+json_file.write( to_json_string( dict( type = 'dataset',
 ext = 'data',
 dataset_id = dataset.dataset_id,
 stderr = msg ) ) + "\n" )
 # never remove a server-side upload
 if dataset.type in ( 'server_dir', 'path_paste' ):
@@ -111,188 +111,175 @@
 elif dataset.is_multi_byte:
 data_type = 'multi-byte char'
 ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
 # Is dataset content supported sniffable binary?
 else:
-# FIXME: This ignores the declared sniff order in datatype_conf.xml
-# resulting in improper behavior
 type_info = Binary.is_sniffable_binary( dataset.path )
 if type_info:
 data_type = type_info[0]
 ext = type_info[1]
-data_type = 'compressed archive' #upload zip file modification
+data_type = "compressed archive"
 if not data_type:
-root_datatype = registry.get_datatype_by_extension( dataset.file_type )
-if getattr( root_datatype, 'compressed', False ):
-data_type = 'compressed archive'
-ext = dataset.file_type
-else:
-# See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
-is_gzipped, is_valid = check_gzip( dataset.path )
-if is_gzipped and not is_valid:
+# See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+is_gzipped, is_valid = check_gzip( dataset.path )
+if is_gzipped and not is_valid:
+file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
+return
+elif is_gzipped and is_valid:
+if link_data_only == 'copy_files':
+# We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
+CHUNK_SIZE = 2**20 # 1Mb
+fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
+while 1:
+try:
+chunk = gzipped_file.read( CHUNK_SIZE )
+except IOError:
+os.close( fd )
+os.remove( uncompressed )
+file_err( 'Problem decompressing gzipped data', dataset, json_file )
+return
+if not chunk:
+break
+os.write( fd, chunk )
+os.close( fd )
+gzipped_file.close()
+# Replace the gzipped file with the decompressed file if it's safe to do so
+if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+dataset.path = uncompressed
+else:
+shutil.move( uncompressed, dataset.path )
+os.chmod(dataset.path, 0644)
+dataset.name = dataset.name.rstrip( '.gz' )
+data_type = 'gzip'
+if not data_type and bz2 is not None:
+# See if we have a bz2 file, much like gzip
+is_bzipped, is_valid = check_bz2( dataset.path )
+if is_bzipped and not is_valid:
 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
 return
-elif is_gzipped and is_valid:
+elif is_bzipped and is_valid:
 if link_data_only == 'copy_files':
-# We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
+# We need to uncompress the temp_name file
 CHUNK_SIZE = 2**20 # 1Mb
-fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
+fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
 while 1:
 try:
-chunk = gzipped_file.read( CHUNK_SIZE )
+chunk = bzipped_file.read( CHUNK_SIZE )
 except IOError:
 os.close( fd )
 os.remove( uncompressed )
-file_err( 'Problem decompressing gzipped data', dataset, json_file )
+file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
 return
 if not chunk:
 break
 os.write( fd, chunk )
 os.close( fd )
-gzipped_file.close()
-# Replace the gzipped file with the decompressed file if it's safe to do so
+bzipped_file.close()
+# Replace the bzipped file with the decompressed file if it's safe to do so
 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
 dataset.path = uncompressed
 else:
 shutil.move( uncompressed, dataset.path )
 os.chmod(dataset.path, 0644)
-dataset.name = dataset.name.rstrip( '.gz' )
-data_type = 'gzip'
-if not data_type and bz2 is not None:
-# See if we have a bz2 file, much like gzip
-is_bzipped, is_valid = check_bz2( dataset.path )
-if is_bzipped and not is_valid:
-file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
-return
-elif is_bzipped and is_valid:
-if link_data_only == 'copy_files':
-# We need to uncompress the temp_name file
-CHUNK_SIZE = 2**20 # 1Mb
-fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
-while 1:
+dataset.name = dataset.name.rstrip( '.bz2' )
+data_type = 'bz2'
+if not data_type:
+# See if we have a zip archive
+is_zipped = check_zip( dataset.path )
+if is_zipped:
+if link_data_only == 'copy_files':
+CHUNK_SIZE = 2**20 # 1Mb
+uncompressed = None
+uncompressed_name = None
+unzipped = False
+z = zipfile.ZipFile( dataset.path )
+for name in z.namelist():
+if name.endswith('/'):
+continue
+if unzipped:
+stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+break
+fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+if sys.version_info[:2] >= ( 2, 6 ):
+zipped_file = z.open( name )
+while 1:
+try:
+chunk = zipped_file.read( CHUNK_SIZE )
+except IOError:
+os.close( fd )
+os.remove( uncompressed )
+file_err( 'Problem decompressing zipped data', dataset, json_file )
+return
+if not chunk:
+break
+os.write( fd, chunk )
+os.close( fd )
+zipped_file.close()
+uncompressed_name = name
+unzipped = True
+else:
+# python < 2.5 doesn't have a way to read members in chunks(!)
 try:
-chunk = bzipped_file.read( CHUNK_SIZE )
+outfile = open( uncompressed, 'wb' )
+outfile.write( z.read( name ) )
+outfile.close()
+uncompressed_name = name
+unzipped = True
 except IOError:
 os.close( fd )
 os.remove( uncompressed )
-file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+file_err( 'Problem decompressing zipped data', dataset, json_file )
 return
-if not chunk:
-break
-os.write( fd, chunk )
-os.close( fd )
-bzipped_file.close()
-# Replace the bzipped file with the decompressed file if it's safe to do so
+z.close()
+# Replace the zipped file with the decompressed file if it's safe to do so
+if uncompressed is not None:
 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
 dataset.path = uncompressed
 else:
 shutil.move( uncompressed, dataset.path )
 os.chmod(dataset.path, 0644)
-dataset.name = dataset.name.rstrip( '.bz2' )
-data_type = 'bz2'
+dataset.name = uncompressed_name
+data_type = 'zip'
 if not data_type:
-# See if we have a zip archive
-is_zipped = check_zip( dataset.path )
-if is_zipped:
-if link_data_only == 'copy_files':
-CHUNK_SIZE = 2**20 # 1Mb
-uncompressed = None
-uncompressed_name = None
-unzipped = False
-z = zipfile.ZipFile( dataset.path )
-for name in z.namelist():
-if name.endswith('/'):
-continue
-if unzipped:
-stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
-break
-fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-if sys.version_info[:2] >= ( 2, 6 ):
-zipped_file = z.open( name )
-while 1:
-try:
-chunk = zipped_file.read( CHUNK_SIZE )
-except IOError:
-os.close( fd )
-os.remove( uncompressed )
-file_err( 'Problem decompressing zipped data', dataset, json_file )
-return
-if not chunk:
-break
-os.write( fd, chunk )
-os.close( fd )
-zipped_file.close()
-uncompressed_name = name
-unzipped = True
-else:
-# python < 2.5 doesn't have a way to read members in chunks(!)
-try:
-outfile = open( uncompressed, 'wb' )
-outfile.write( z.read( name ) )
-outfile.close()
-uncompressed_name = name
-unzipped = True
-except IOError:
-os.close( fd )
-os.remove( uncompressed )
-file_err( 'Problem decompressing zipped data', dataset, json_file )
-return
-z.close()
-# Replace the zipped file with the decompressed file if it's safe to do so
-if uncompressed is not None:
-if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-dataset.path = uncompressed
-else:
-shutil.move( uncompressed, dataset.path )
-os.chmod(dataset.path, 0644)
-dataset.name = uncompressed_name
-data_type = 'zip'
-if not data_type:
-# TODO refactor this logic. check_binary isn't guaranteed to be
-# correct since it only looks at whether the first 100 chars are
-# printable or not. If someone specifies a known unsniffable
-# binary datatype and check_binary fails, the file gets mangled.
-if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
-# We have a binary dataset, but it is not Bam, Sff or Pdf
-data_type = 'binary'
-#binary_ok = False
-parts = dataset.name.split( "." )
-if len( parts ) > 1:
-ext = parts[-1].strip().lower()
-if not Binary.is_ext_unsniffable(ext):
-file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
-return
-elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
-err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
-file_err( err_msg, dataset, json_file )
-return
-if not data_type:
-# We must have a text file
-if check_html( dataset.path ):
-file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
-return
-if data_type != 'binary':
-if link_data_only == 'copy_files':
-if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
-in_place = False
-# Convert universal line endings to Posix line endings, but allow the user to turn it off,
-# so that is becomes possible to upload gzip, bz2 or zip files with binary data without
-# corrupting the content of those files.
-if dataset.to_posix_lines:
-tmpdir = output_adjacent_tmpdir( output_path )
-tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
-if dataset.space_to_tab:
-line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
-else:
-line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
-if dataset.file_type == 'auto':
-ext = sniff.guess_ext( dataset.path, registry.sniff_order )
-else:
-ext = dataset.file_type
-data_type = ext
+if check_binary( dataset.path ):
+# We have a binary dataset, but it is not Bam, Sff or Pdf
+data_type = 'binary'
+#binary_ok = False
+parts = dataset.name.split( "." )
+if len( parts ) > 1:
+ext = parts[-1].strip().lower()
+if not Binary.is_ext_unsniffable(ext):
+file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+return
+elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+file_err( err_msg, dataset, json_file )
+return
+if not data_type:
+# We must have a text file
+if check_html( dataset.path ):
+file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+return
+if data_type != 'binary':
+if link_data_only == 'copy_files':
+if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
+in_place = False
+# Convert universal line endings to Posix line endings, but allow the user to turn it off,
+# so that is becomes possible to upload gzip, bz2 or zip files with binary data without
+# corrupting the content of those files.
+if dataset.to_posix_lines:
+if dataset.space_to_tab:
+line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
+else:
+line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
+if dataset.file_type == 'auto':
+ext = sniff.guess_ext( dataset.path, registry.sniff_order )
+else:
+ext = dataset.file_type
+data_type = ext
 # Save job info for the framework
 if ext == 'auto' and dataset.ext:
 ext = dataset.ext
 if ext == 'auto':
 ext = 'data'
@@ -325,11 +312,11 @@
 stdout = stdout,
 name = dataset.name,
 line_count = line_count )
 if dataset.get('uuid', None) is not None:
 info['uuid'] = dataset.get('uuid')
-json_file.write( dumps( info ) + "\n" )
+json_file.write( to_json_string( info ) + "\n" )
 
 if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ):
 # Groom the dataset content if necessary
 datatype.groom_dataset_content( output_path )
 
@@ -351,33 +338,22 @@
 file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file )
 return
 dataset.path = temp_name
 dp = temp_name
 if not value.is_binary:
-tmpdir = output_adjacent_tmpdir( output_path )
-tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
 if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
-sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
+sniff.convert_newlines_sep2tabs( dp )
 else:
-sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
+sniff.convert_newlines( dp )
 shutil.move( dp, os.path.join( files_path, name ) )
 # Move the dataset to its "real" path
 shutil.move( dataset.primary_file, output_path )
 # Write the job info
 info = dict( type = 'dataset',
 dataset_id = dataset.dataset_id,
 stdout = 'uploaded %s file' % dataset.file_type )
-json_file.write( dumps( info ) + "\n" )
-
-
-def output_adjacent_tmpdir( output_path ):
-""" For temp files that will ultimately be moved to output_path anyway
-just create the file directly in output_path's directory so shutil.move
-will work optimially.
-"""
-return os.path.dirname( output_path )
-
+json_file.write( to_json_string( info ) + "\n" )
 
 def __main__():
 
 if len( sys.argv ) < 4:
 print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...'
@@ -388,11 +364,11 @@
 
 registry = Registry()
 registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] )
 
 for line in open( sys.argv[3], 'r' ):
-dataset = loads( line )
+dataset = from_json_string( line )
 dataset = util.bunch.Bunch( **safe_dict( dataset ) )
 try:
 output_path = output_paths[int( dataset.dataset_id )][0]
 except:
 print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id