comparison uploadzip.py @ 3:6f24bce6817e draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 30 Mar 2015 10:13:25 -0400 |
parents | 30d16d36d536 |
children | a2f200121dda |
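Before the row-by-row comparison, a minimal sketch of the check this revision introduces, assuming a stripped-down stand-in for Galaxy's datatype registry (`FakeRegistry`, `ZipDatatype`, and `resolve_data_type` are hypothetical names, not Galaxy code): when sniffing yields no type and the datatype selected for the upload declares `compressed = True`, the file is labelled a 'compressed archive' and kept packed instead of being unzipped.

```python
# Hypothetical sketch of the compressed-datatype branch added in this revision.
# FakeRegistry / ZipDatatype / resolve_data_type are illustrative stand-ins.

class ZipDatatype(object):
    # Galaxy datatype classes may declare themselves compressed; the upload
    # tool reads this attribute with getattr( root_datatype, 'compressed', False ).
    compressed = True
    file_ext = 'zip'

class FakeRegistry(object):
    def __init__(self):
        self._types = {'zip': ZipDatatype()}

    def get_datatype_by_extension(self, ext):
        # Returns None for unknown extensions, like a missing datatype.
        return self._types.get(ext)

def resolve_data_type(registry, file_type, sniffed_type=None):
    """If sniffing found nothing and the selected datatype is compressed,
    label the upload a 'compressed archive' and keep it packed."""
    data_type = sniffed_type
    ext = None
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = file_type
    return data_type, ext

print(resolve_data_type(FakeRegistry(), 'zip'))      # ('compressed archive', 'zip')
print(resolve_data_type(FakeRegistry(), 'tabular'))  # (None, None)
```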
2:30d16d36d536 | 3:6f24bce6817e |
---|---|
34 | 34 |
35 def stop_err( msg, ret=1 ): | 35 def stop_err( msg, ret=1 ): |
36 sys.stderr.write( msg ) | 36 sys.stderr.write( msg ) |
37 sys.exit( ret ) | 37 sys.exit( ret ) |
38 def file_err( msg, dataset, json_file ): | 38 def file_err( msg, dataset, json_file ): |
39 json_file.write( to_json_string( dict( type = 'dataset', | 39 json_file.write( dumps( dict( type = 'dataset', |
40 ext = 'data', | 40 ext = 'data', |
41 dataset_id = dataset.dataset_id, | 41 dataset_id = dataset.dataset_id, |
42 stderr = msg ) ) + "\n" ) | 42 stderr = msg ) ) + "\n" ) |
43 # never remove a server-side upload | 43 # never remove a server-side upload |
44 if dataset.type in ( 'server_dir', 'path_paste' ): | 44 if dataset.type in ( 'server_dir', 'path_paste' ): |
111 elif dataset.is_multi_byte: | 111 elif dataset.is_multi_byte: |
112 data_type = 'multi-byte char' | 112 data_type = 'multi-byte char' |
113 ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) | 113 ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) |
114 # Is dataset content supported sniffable binary? | 114 # Is dataset content supported sniffable binary? |
115 else: | 115 else: |
| 116 # FIXME: This ignores the declared sniff order in datatype_conf.xml |
| 117 # resulting in improper behavior |
116 type_info = Binary.is_sniffable_binary( dataset.path ) | 118 type_info = Binary.is_sniffable_binary( dataset.path ) |
117 if type_info: | 119 if type_info: |
118 data_type = type_info[0] | 120 data_type = type_info[0] |
119 ext = type_info[1] | 121 ext = type_info[1] |
120 data_type="binary" | 122 data_type = 'compressed archive' #upload zip file modification |
121 if not data_type: | 123 if not data_type: |
122 # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress | 124 root_datatype = registry.get_datatype_by_extension( dataset.file_type ) |
123 is_gzipped, is_valid = check_gzip( dataset.path ) | 125 if getattr( root_datatype, 'compressed', False ): |
124 if is_gzipped and not is_valid: | 126 data_type = 'compressed archive' |
125 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) | 127 ext = dataset.file_type |
126 return | 128 else: |
127 elif is_gzipped and is_valid: | 129 # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress |
128 if link_data_only == 'copy_files': | 130 is_gzipped, is_valid = check_gzip( dataset.path ) |
129 # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format | 131 if is_gzipped and not is_valid: |
130 CHUNK_SIZE = 2**20 # 1Mb | |
131 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | |
132 gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) | |
133 while 1: | |
134 try: | |
135 chunk = gzipped_file.read( CHUNK_SIZE ) | |
136 except IOError: | |
137 os.close( fd ) | |
138 os.remove( uncompressed ) | |
139 file_err( 'Problem decompressing gzipped data', dataset, json_file ) | |
140 return | |
141 if not chunk: | |
142 break | |
143 os.write( fd, chunk ) | |
144 os.close( fd ) | |
145 gzipped_file.close() | |
146 # Replace the gzipped file with the decompressed file if it's safe to do so | |
147 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: | |
148 dataset.path = uncompressed | |
149 else: | |
150 shutil.move( uncompressed, dataset.path ) | |
151 os.chmod(dataset.path, 0644) | |
152 dataset.name = dataset.name.rstrip( '.gz' ) | |
153 data_type = 'gzip' | |
154 if not data_type and bz2 is not None: | |
155 # See if we have a bz2 file, much like gzip | |
156 is_bzipped, is_valid = check_bz2( dataset.path ) | |
157 if is_bzipped and not is_valid: | |
158 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) | 132 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) |
159 return | 133 return |
160 elif is_bzipped and is_valid: | 134 elif is_gzipped and is_valid: |
161 if link_data_only == 'copy_files': | 135 if link_data_only == 'copy_files': |
162 # We need to uncompress the temp_name file | 136 # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format |
163 CHUNK_SIZE = 2**20 # 1Mb | 137 CHUNK_SIZE = 2**20 # 1Mb |
164 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | 138 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) |
165 bzipped_file = bz2.BZ2File( dataset.path, 'rb' ) | 139 gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) |
166 while 1: | 140 while 1: |
167 try: | 141 try: |
168 chunk = bzipped_file.read( CHUNK_SIZE ) | 142 chunk = gzipped_file.read( CHUNK_SIZE ) |
169 except IOError: | 143 except IOError: |
170 os.close( fd ) | 144 os.close( fd ) |
171 os.remove( uncompressed ) | 145 os.remove( uncompressed ) |
172 file_err( 'Problem decompressing bz2 compressed data', dataset, json_file ) | 146 file_err( 'Problem decompressing gzipped data', dataset, json_file ) |
173 return | 147 return |
174 if not chunk: | 148 if not chunk: |
175 break | 149 break |
176 os.write( fd, chunk ) | 150 os.write( fd, chunk ) |
177 os.close( fd ) | 151 os.close( fd ) |
178 bzipped_file.close() | 152 gzipped_file.close() |
179 # Replace the bzipped file with the decompressed file if it's safe to do so | 153 # Replace the gzipped file with the decompressed file if it's safe to do so |
180 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: | 154 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: |
181 dataset.path = uncompressed | 155 dataset.path = uncompressed |
182 else: | 156 else: |
183 shutil.move( uncompressed, dataset.path ) | 157 shutil.move( uncompressed, dataset.path ) |
184 os.chmod(dataset.path, 0644) | 158 os.chmod(dataset.path, 0644) |
185 dataset.name = dataset.name.rstrip( '.bz2' ) | 159 dataset.name = dataset.name.rstrip( '.gz' ) |
186 data_type = 'bz2' | 160 data_type = 'gzip' |
187 if not data_type: | 161 if not data_type and bz2 is not None: |
188 # See if we have a zip archive | 162 # See if we have a bz2 file, much like gzip |
189 is_zipped = check_zip( dataset.path ) | 163 is_bzipped, is_valid = check_bz2( dataset.path ) |
190 if is_zipped: | 164 if is_bzipped and not is_valid: |
191 if link_data_only == 'copy_files': | 165 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) |
192 CHUNK_SIZE = 2**20 # 1Mb | 166 return |
193 uncompressed = None | 167 elif is_bzipped and is_valid: |
194 uncompressed_name = None | 168 if link_data_only == 'copy_files': |
195 unzipped = False | 169 # We need to uncompress the temp_name file |
196 z = zipfile.ZipFile( dataset.path ) | 170 CHUNK_SIZE = 2**20 # 1Mb |
197 for name in z.namelist(): | 171 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) |
198 if name.endswith('/'): | 172 bzipped_file = bz2.BZ2File( dataset.path, 'rb' ) |
199 continue | 173 while 1: |
200 if unzipped: | 174 try: |
201 stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' | 175 chunk = bzipped_file.read( CHUNK_SIZE ) |
202 break | 176 except IOError: |
203 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | 177 os.close( fd ) |
204 if sys.version_info[:2] >= ( 2, 6 ): | 178 os.remove( uncompressed ) |
205 zipped_file = z.open( name ) | 179 file_err( 'Problem decompressing bz2 compressed data', dataset, json_file ) |
206 while 1: | 180 return |
| 181 if not chunk: |
| 182 break |
| 183 os.write( fd, chunk ) |
| 184 os.close( fd ) |
| 185 bzipped_file.close() |
| 186 # Replace the bzipped file with the decompressed file if it's safe to do so |
| 187 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: |
| 188 dataset.path = uncompressed |
| 189 else: |
| 190 shutil.move( uncompressed, dataset.path ) |
| 191 os.chmod(dataset.path, 0644) |
| 192 dataset.name = dataset.name.rstrip( '.bz2' ) |
| 193 data_type = 'bz2' |
| 194 if not data_type: |
| 195 # See if we have a zip archive |
| 196 is_zipped = check_zip( dataset.path ) |
| 197 if is_zipped: |
| 198 if link_data_only == 'copy_files': |
| 199 CHUNK_SIZE = 2**20 # 1Mb |
| 200 uncompressed = None |
| 201 uncompressed_name = None |
| 202 unzipped = False |
| 203 z = zipfile.ZipFile( dataset.path ) |
| 204 for name in z.namelist(): |
| 205 if name.endswith('/'): |
| 206 continue |
| 207 if unzipped: |
| 208 stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' |
| 209 break |
| 210 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) |
| 211 if sys.version_info[:2] >= ( 2, 6 ): |
| 212 zipped_file = z.open( name ) |
| 213 while 1: |
| 214 try: |
| 215 chunk = zipped_file.read( CHUNK_SIZE ) |
| 216 except IOError: |
| 217 os.close( fd ) |
| 218 os.remove( uncompressed ) |
| 219 file_err( 'Problem decompressing zipped data', dataset, json_file ) |
| 220 return |
| 221 if not chunk: |
| 222 break |
| 223 os.write( fd, chunk ) |
| 224 os.close( fd ) |
| 225 zipped_file.close() |
| 226 uncompressed_name = name |
| 227 unzipped = True |
| 228 else: |
| 229 # python < 2.5 doesn't have a way to read members in chunks(!) |
207 try: | 230 try: |
208 chunk = zipped_file.read( CHUNK_SIZE ) | 231 outfile = open( uncompressed, 'wb' ) |
232 outfile.write( z.read( name ) ) | |
233 outfile.close() | |
234 uncompressed_name = name | |
235 unzipped = True | |
209 except IOError: | 236 except IOError: |
210 os.close( fd ) | 237 os.close( fd ) |
211 os.remove( uncompressed ) | 238 os.remove( uncompressed ) |
212 file_err( 'Problem decompressing zipped data', dataset, json_file ) | 239 file_err( 'Problem decompressing zipped data', dataset, json_file ) |
213 return | 240 return |
214 if not chunk: | 241 z.close() |
215 break | 242 # Replace the zipped file with the decompressed file if it's safe to do so |
216 os.write( fd, chunk ) | 243 if uncompressed is not None: |
217 os.close( fd ) | 244 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: |
218 zipped_file.close() | 245 dataset.path = uncompressed |
219 uncompressed_name = name | 246 else: |
220 unzipped = True | 247 shutil.move( uncompressed, dataset.path ) |
| 248 os.chmod(dataset.path, 0644) |
| 249 dataset.name = uncompressed_name |
| 250 data_type = 'zip' |
| 251 if not data_type: |
| 252 # TODO refactor this logic. check_binary isn't guaranteed to be |
| 253 # correct since it only looks at whether the first 100 chars are |
| 254 # printable or not. If someone specifies a known unsniffable |
| 255 # binary datatype and check_binary fails, the file gets mangled. |
| 256 if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type): |
| 257 # We have a binary dataset, but it is not Bam, Sff or Pdf |
| 258 data_type = 'binary' |
| 259 #binary_ok = False |
| 260 parts = dataset.name.split( "." ) |
| 261 if len( parts ) > 1: |
| 262 ext = parts[-1].strip().lower() |
| 263 if not Binary.is_ext_unsniffable(ext): |
| 264 file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) |
| 265 return |
| 266 elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: |
| 267 err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) |
| 268 file_err( err_msg, dataset, json_file ) |
| 269 return |
| 270 if not data_type: |
| 271 # We must have a text file |
| 272 if check_html( dataset.path ): |
| 273 file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) |
| 274 return |
| 275 if data_type != 'binary': |
| 276 if link_data_only == 'copy_files': |
| 277 if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: |
| 278 in_place = False |
| 279 # Convert universal line endings to Posix line endings, but allow the user to turn it off, |
| 280 # so that is becomes possible to upload gzip, bz2 or zip files with binary data without |
| 281 # corrupting the content of those files. |
| 282 if dataset.to_posix_lines: |
| 283 tmpdir = output_adjacent_tmpdir( output_path ) |
| 284 tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id |
| 285 if dataset.space_to_tab: |
| 286 line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) |
221 else: | 287 else: |
222 # python < 2.5 doesn't have a way to read members in chunks(!) | 288 line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) |
223 try: | 289 if dataset.file_type == 'auto': |
224 outfile = open( uncompressed, 'wb' ) | 290 ext = sniff.guess_ext( dataset.path, registry.sniff_order ) |
225 outfile.write( z.read( name ) ) | 291 else: |
226 outfile.close() | 292 ext = dataset.file_type |
227 uncompressed_name = name | 293 data_type = ext |
228 unzipped = True | |
229 except IOError: | |
230 os.close( fd ) | |
231 os.remove( uncompressed ) | |
232 file_err( 'Problem decompressing zipped data', dataset, json_file ) | |
233 return | |
234 z.close() | |
235 # Replace the zipped file with the decompressed file if it's safe to do so | |
236 if uncompressed is not None: | |
237 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: | |
238 dataset.path = uncompressed | |
239 else: | |
240 shutil.move( uncompressed, dataset.path ) | |
241 os.chmod(dataset.path, 0644) | |
242 dataset.name = uncompressed_name | |
243 data_type = 'zip' | |
244 if not data_type: | |
245 if check_binary( dataset.path ): | |
246 # We have a binary dataset, but it is not Bam, Sff or Pdf | |
247 data_type = 'binary' | |
248 #binary_ok = False | |
249 parts = dataset.name.split( "." ) | |
250 if len( parts ) > 1: | |
251 ext = parts[-1].strip().lower() | |
252 if not Binary.is_ext_unsniffable(ext): | |
253 file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) | |
254 return | |
255 elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: | |
256 err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) | |
257 file_err( err_msg, dataset, json_file ) | |
258 return | |
259 if not data_type: | |
260 # We must have a text file | |
261 if check_html( dataset.path ): | |
262 file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) | |
263 return | |
264 if data_type != 'binary': | |
265 if link_data_only == 'copy_files': | |
266 if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: | |
267 in_place = False | |
268 # Convert universal line endings to Posix line endings, but allow the user to turn it off, | |
269 # so that is becomes possible to upload gzip, bz2 or zip files with binary data without | |
270 # corrupting the content of those files. | |
271 if dataset.to_posix_lines: | |
272 if dataset.space_to_tab: | |
273 line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place ) | |
274 else: | |
275 line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place ) | |
276 if dataset.file_type == 'auto': | |
277 ext = sniff.guess_ext( dataset.path, registry.sniff_order ) | |
278 else: | |
279 ext = dataset.file_type | |
280 data_type = ext | |
281 # Save job info for the framework | 294 # Save job info for the framework |
282 if ext == 'auto' and dataset.ext: | 295 if ext == 'auto' and dataset.ext: |
283 ext = dataset.ext | 296 ext = dataset.ext |
284 if ext == 'auto': | 297 if ext == 'auto': |
285 ext = 'data' | 298 ext = 'data' |
312 stdout = stdout, | 325 stdout = stdout, |
313 name = dataset.name, | 326 name = dataset.name, |
314 line_count = line_count ) | 327 line_count = line_count ) |
315 if dataset.get('uuid', None) is not None: | 328 if dataset.get('uuid', None) is not None: |
316 info['uuid'] = dataset.get('uuid') | 329 info['uuid'] = dataset.get('uuid') |
317 json_file.write( to_json_string( info ) + "\n" ) | 330 json_file.write( dumps( info ) + "\n" ) |
318 | 331 |
319 if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ): | 332 if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ): |
320 # Groom the dataset content if necessary | 333 # Groom the dataset content if necessary |
321 datatype.groom_dataset_content( output_path ) | 334 datatype.groom_dataset_content( output_path ) |
322 | 335 |
338 file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file ) | 351 file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file ) |
339 return | 352 return |
340 dataset.path = temp_name | 353 dataset.path = temp_name |
341 dp = temp_name | 354 dp = temp_name |
342 if not value.is_binary: | 355 if not value.is_binary: |
| 356 tmpdir = output_adjacent_tmpdir( output_path ) |
| 357 tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id |
343 if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ): | 358 if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ): |
344 sniff.convert_newlines_sep2tabs( dp ) | 359 sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) |
345 else: | 360 else: |
346 sniff.convert_newlines( dp ) | 361 sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) |
347 shutil.move( dp, os.path.join( files_path, name ) ) | 362 shutil.move( dp, os.path.join( files_path, name ) ) |
348 # Move the dataset to its "real" path | 363 # Move the dataset to its "real" path |
349 shutil.move( dataset.primary_file, output_path ) | 364 shutil.move( dataset.primary_file, output_path ) |
350 # Write the job info | 365 # Write the job info |
351 info = dict( type = 'dataset', | 366 info = dict( type = 'dataset', |
352 dataset_id = dataset.dataset_id, | 367 dataset_id = dataset.dataset_id, |
353 stdout = 'uploaded %s file' % dataset.file_type ) | 368 stdout = 'uploaded %s file' % dataset.file_type ) |
354 json_file.write( to_json_string( info ) + "\n" ) | 369 json_file.write( dumps( info ) + "\n" ) |
| 370 |
| 371 |
| 372 def output_adjacent_tmpdir( output_path ): |
| 373 """ For temp files that will ultimately be moved to output_path anyway |
| 374 just create the file directly in output_path's directory so shutil.move |
| 375 will work optimially. |
| 376 """ |
| 377 return os.path.dirname( output_path ) |
| 378 |
355 | 379 |
356 def __main__(): | 380 def __main__(): |
357 | 381 |
358 if len( sys.argv ) < 4: | 382 if len( sys.argv ) < 4: |
359 print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...' | 383 print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...' |
364 | 388 |
365 registry = Registry() | 389 registry = Registry() |
366 registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] ) | 390 registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] ) |
367 | 391 |
368 for line in open( sys.argv[3], 'r' ): | 392 for line in open( sys.argv[3], 'r' ): |
369 dataset = from_json_string( line ) | 393 dataset = loads( line ) |
370 dataset = util.bunch.Bunch( **safe_dict( dataset ) ) | 394 dataset = util.bunch.Bunch( **safe_dict( dataset ) ) |
371 try: | 395 try: |
372 output_path = output_paths[int( dataset.dataset_id )][0] | 396 output_path = output_paths[int( dataset.dataset_id )][0] |
373 except: | 397 except: |
374 print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id | 398 print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id |