Mercurial repository: davidvanzessen / upload_zip_file
uploadzip.py @ 5:a2f200121dda (draft, default, tip)
Uploaded

author | davidvanzessen
---|---
date | Thu, 09 Apr 2015 04:18:00 -0400
parents | 6f24bce6817e
children | (none)
comparison
4:fdd4776a434f (old) | 5:a2f200121dda (new)
---|---
34 | 34 |
35 def stop_err( msg, ret=1 ): | 35 def stop_err( msg, ret=1 ): |
36 sys.stderr.write( msg ) | 36 sys.stderr.write( msg ) |
37 sys.exit( ret ) | 37 sys.exit( ret ) |
38 def file_err( msg, dataset, json_file ): | 38 def file_err( msg, dataset, json_file ): |
39 json_file.write( dumps( dict( type = 'dataset', | 39 json_file.write( to_json_string( dict( type = 'dataset', |
40 ext = 'data', | 40 ext = 'data', |
41 dataset_id = dataset.dataset_id, | 41 dataset_id = dataset.dataset_id, |
42 stderr = msg ) ) + "\n" ) | 42 stderr = msg ) ) + "\n" ) |
43 # never remove a server-side upload | 43 # never remove a server-side upload |
44 if dataset.type in ( 'server_dir', 'path_paste' ): | 44 if dataset.type in ( 'server_dir', 'path_paste' ): |
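The two revisions differ in this hunk only in which JSON helper they call: `dumps` from the standard `json` module on the left, `to_json_string` (which appears to be Galaxy's older wrapper for the same thing) on the right. Either way, `file_err` appends one JSON object per line to the job's metadata file. A minimal standalone sketch of that pattern follows; the field names mirror the code above, while the output file name and dataset id are illustrative only.

```python
# Minimal sketch of the one-JSON-object-per-line error record written by file_err();
# 'upload_report.jsonl' and the dataset id 42 are illustrative, not Galaxy's.
import json

def report_error(msg, dataset_id, json_file):
    record = dict(type='dataset', ext='data', dataset_id=dataset_id, stderr=msg)
    json_file.write(json.dumps(record) + "\n")

with open('upload_report.jsonl', 'w') as fh:
    report_error('The uploaded file contains inappropriate content', 42, fh)
```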
111 elif dataset.is_multi_byte: | 111 elif dataset.is_multi_byte: |
112 data_type = 'multi-byte char' | 112 data_type = 'multi-byte char' |
113 ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) | 113 ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) |
114 # Is dataset content supported sniffable binary? | 114 # Is dataset content supported sniffable binary? |
115 else: | 115 else: |
116 # FIXME: This ignores the declared sniff order in datatype_conf.xml | |
117 # resulting in improper behavior | |
118 type_info = Binary.is_sniffable_binary( dataset.path ) | 116 type_info = Binary.is_sniffable_binary( dataset.path ) |
119 if type_info: | 117 if type_info: |
120 data_type = type_info[0] | 118 data_type = type_info[0] |
121 ext = type_info[1] | 119 ext = type_info[1] |
122 data_type = 'compressed archive' #upload zip file modification | 120 data_type = "compressed archive" |
123 if not data_type: | 121 if not data_type: |
124 root_datatype = registry.get_datatype_by_extension( dataset.file_type ) | 122 # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress |
125 if getattr( root_datatype, 'compressed', False ): | 123 is_gzipped, is_valid = check_gzip( dataset.path ) |
126 data_type = 'compressed archive' | 124 if is_gzipped and not is_valid: |
127 ext = dataset.file_type | 125 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) |
128 else: | 126 return |
129 # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress | 127 elif is_gzipped and is_valid: |
130 is_gzipped, is_valid = check_gzip( dataset.path ) | 128 if link_data_only == 'copy_files': |
131 if is_gzipped and not is_valid: | 129 # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format |
130 CHUNK_SIZE = 2**20 # 1Mb | |
131 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | |
132 gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) | |
133 while 1: | |
134 try: | |
135 chunk = gzipped_file.read( CHUNK_SIZE ) | |
136 except IOError: | |
137 os.close( fd ) | |
138 os.remove( uncompressed ) | |
139 file_err( 'Problem decompressing gzipped data', dataset, json_file ) | |
140 return | |
141 if not chunk: | |
142 break | |
143 os.write( fd, chunk ) | |
144 os.close( fd ) | |
145 gzipped_file.close() | |
146 # Replace the gzipped file with the decompressed file if it's safe to do so | |
147 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: | |
148 dataset.path = uncompressed | |
149 else: | |
150 shutil.move( uncompressed, dataset.path ) | |
151 os.chmod(dataset.path, 0644) | |
152 dataset.name = dataset.name.rstrip( '.gz' ) | |
153 data_type = 'gzip' | |
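Both revisions handle gzip uploads the same way: read the archive in 1 MiB chunks, write the decompressed bytes to a temp file created in the output directory, then swap it into place. Below is a condensed, self-contained version of that chunked-gunzip step written for current Python rather than the Python 2 the tool targets; it raises instead of calling `file_err`, and the paths are placeholders for `dataset.path` and the output directory.

```python
# Sketch of the chunked gunzip-to-tempfile pattern shown above (modern Python).
import gzip
import os
import tempfile

CHUNK_SIZE = 2 ** 20  # 1 MiB, as in the tool

def gunzip_to_sibling(src_path, dest_dir):
    """Decompress src_path into a temp file inside dest_dir; return the temp path."""
    fd, uncompressed = tempfile.mkstemp(prefix='upload_gunzip_', dir=dest_dir)
    try:
        with gzip.open(src_path, 'rb') as gz:
            while True:
                chunk = gz.read(CHUNK_SIZE)
                if not chunk:
                    break
                os.write(fd, chunk)
    except (OSError, EOFError):
        os.close(fd)
        os.remove(uncompressed)
        raise
    os.close(fd)
    return uncompressed
```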
154 if not data_type and bz2 is not None: | |
155 # See if we have a bz2 file, much like gzip | |
156 is_bzipped, is_valid = check_bz2( dataset.path ) | |
157 if is_bzipped and not is_valid: | |
132 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) | 158 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) |
133 return | 159 return |
134 elif is_gzipped and is_valid: | 160 elif is_bzipped and is_valid: |
135 if link_data_only == 'copy_files': | 161 if link_data_only == 'copy_files': |
136 # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format | 162 # We need to uncompress the temp_name file |
137 CHUNK_SIZE = 2**20 # 1Mb | 163 CHUNK_SIZE = 2**20 # 1Mb |
138 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | 164 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) |
139 gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) | 165 bzipped_file = bz2.BZ2File( dataset.path, 'rb' ) |
140 while 1: | 166 while 1: |
141 try: | 167 try: |
142 chunk = gzipped_file.read( CHUNK_SIZE ) | 168 chunk = bzipped_file.read( CHUNK_SIZE ) |
143 except IOError: | 169 except IOError: |
144 os.close( fd ) | 170 os.close( fd ) |
145 os.remove( uncompressed ) | 171 os.remove( uncompressed ) |
146 file_err( 'Problem decompressing gzipped data', dataset, json_file ) | 172 file_err( 'Problem decompressing bz2 compressed data', dataset, json_file ) |
147 return | 173 return |
148 if not chunk: | 174 if not chunk: |
149 break | 175 break |
150 os.write( fd, chunk ) | 176 os.write( fd, chunk ) |
151 os.close( fd ) | 177 os.close( fd ) |
152 gzipped_file.close() | 178 bzipped_file.close() |
153 # Replace the gzipped file with the decompressed file if it's safe to do so | 179 # Replace the bzipped file with the decompressed file if it's safe to do so |
154 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: | 180 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: |
155 dataset.path = uncompressed | 181 dataset.path = uncompressed |
156 else: | 182 else: |
157 shutil.move( uncompressed, dataset.path ) | 183 shutil.move( uncompressed, dataset.path ) |
158 os.chmod(dataset.path, 0644) | 184 os.chmod(dataset.path, 0644) |
159 dataset.name = dataset.name.rstrip( '.gz' ) | 185 dataset.name = dataset.name.rstrip( '.bz2' ) |
160 data_type = 'gzip' | 186 data_type = 'bz2' |
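The bz2 branch mirrors the gzip one, and both end by trimming the extension from the dataset name with `rstrip('.gz')` / `rstrip('.bz2')`. One caveat worth flagging: `str.rstrip` strips a trailing *set of characters*, not a literal suffix, so names whose stem ends in those letters get over-trimmed. A tiny illustrative helper that removes a suffix safely:

```python
# Illustrative only: strip a literal suffix instead of a trailing character set.
def strip_suffix(name, suffix):
    return name[:-len(suffix)] if name.endswith(suffix) else name

assert strip_suffix('reads.fastq.gz', '.gz') == 'reads.fastq'
assert strip_suffix('log.gz', '.gz') == 'log'
assert 'log.gz'.rstrip('.gz') == 'lo'   # the rstrip behaviour described above
```

On Python 3.9+, `name.removesuffix('.gz')` does the same thing without a helper.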
161 if not data_type and bz2 is not None: | 187 if not data_type: |
162 # See if we have a bz2 file, much like gzip | 188 # See if we have a zip archive |
163 is_bzipped, is_valid = check_bz2( dataset.path ) | 189 is_zipped = check_zip( dataset.path ) |
164 if is_bzipped and not is_valid: | 190 if is_zipped: |
165 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) | 191 if link_data_only == 'copy_files': |
166 return | 192 CHUNK_SIZE = 2**20 # 1Mb |
167 elif is_bzipped and is_valid: | 193 uncompressed = None |
168 if link_data_only == 'copy_files': | 194 uncompressed_name = None |
169 # We need to uncompress the temp_name file | 195 unzipped = False |
170 CHUNK_SIZE = 2**20 # 1Mb | 196 z = zipfile.ZipFile( dataset.path ) |
171 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | 197 for name in z.namelist(): |
172 bzipped_file = bz2.BZ2File( dataset.path, 'rb' ) | 198 if name.endswith('/'): |
173 while 1: | 199 continue |
200 if unzipped: | |
201 stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' | |
202 break | |
203 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | |
204 if sys.version_info[:2] >= ( 2, 6 ): | |
205 zipped_file = z.open( name ) | |
206 while 1: | |
207 try: | |
208 chunk = zipped_file.read( CHUNK_SIZE ) | |
209 except IOError: | |
210 os.close( fd ) | |
211 os.remove( uncompressed ) | |
212 file_err( 'Problem decompressing zipped data', dataset, json_file ) | |
213 return | |
214 if not chunk: | |
215 break | |
216 os.write( fd, chunk ) | |
217 os.close( fd ) | |
218 zipped_file.close() | |
219 uncompressed_name = name | |
220 unzipped = True | |
221 else: | |
222 # python < 2.5 doesn't have a way to read members in chunks(!) | |
174 try: | 223 try: |
175 chunk = bzipped_file.read( CHUNK_SIZE ) | 224 outfile = open( uncompressed, 'wb' ) |
225 outfile.write( z.read( name ) ) | |
226 outfile.close() | |
227 uncompressed_name = name | |
228 unzipped = True | |
176 except IOError: | 229 except IOError: |
177 os.close( fd ) | 230 os.close( fd ) |
178 os.remove( uncompressed ) | 231 os.remove( uncompressed ) |
179 file_err( 'Problem decompressing bz2 compressed data', dataset, json_file ) | 232 file_err( 'Problem decompressing zipped data', dataset, json_file ) |
180 return | 233 return |
181 if not chunk: | 234 z.close() |
182 break | 235 # Replace the zipped file with the decompressed file if it's safe to do so |
183 os.write( fd, chunk ) | 236 if uncompressed is not None: |
184 os.close( fd ) | |
185 bzipped_file.close() | |
186 # Replace the bzipped file with the decompressed file if it's safe to do so | |
187 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: | 237 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: |
188 dataset.path = uncompressed | 238 dataset.path = uncompressed |
189 else: | 239 else: |
190 shutil.move( uncompressed, dataset.path ) | 240 shutil.move( uncompressed, dataset.path ) |
191 os.chmod(dataset.path, 0644) | 241 os.chmod(dataset.path, 0644) |
192 dataset.name = dataset.name.rstrip( '.bz2' ) | 242 dataset.name = uncompressed_name |
193 data_type = 'bz2' | 243 data_type = 'zip' |
194 if not data_type: | 244 if not data_type: |
195 # See if we have a zip archive | 245 if check_binary( dataset.path ): |
196 is_zipped = check_zip( dataset.path ) | 246 # We have a binary dataset, but it is not Bam, Sff or Pdf |
197 if is_zipped: | 247 data_type = 'binary' |
198 if link_data_only == 'copy_files': | 248 #binary_ok = False |
199 CHUNK_SIZE = 2**20 # 1Mb | 249 parts = dataset.name.split( "." ) |
200 uncompressed = None | 250 if len( parts ) > 1: |
201 uncompressed_name = None | 251 ext = parts[-1].strip().lower() |
202 unzipped = False | 252 if not Binary.is_ext_unsniffable(ext): |
203 z = zipfile.ZipFile( dataset.path ) | 253 file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) |
204 for name in z.namelist(): | 254 return |
205 if name.endswith('/'): | 255 elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: |
206 continue | 256 err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) |
207 if unzipped: | 257 file_err( err_msg, dataset, json_file ) |
208 stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' | 258 return |
209 break | 259 if not data_type: |
210 fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) | 260 # We must have a text file |
211 if sys.version_info[:2] >= ( 2, 6 ): | 261 if check_html( dataset.path ): |
212 zipped_file = z.open( name ) | 262 file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) |
213 while 1: | 263 return |
214 try: | 264 if data_type != 'binary': |
215 chunk = zipped_file.read( CHUNK_SIZE ) | 265 if link_data_only == 'copy_files': |
216 except IOError: | 266 if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: |
217 os.close( fd ) | 267 in_place = False |
218 os.remove( uncompressed ) | 268 # Convert universal line endings to Posix line endings, but allow the user to turn it off, |
219 file_err( 'Problem decompressing zipped data', dataset, json_file ) | 269 # so that is becomes possible to upload gzip, bz2 or zip files with binary data without |
220 return | 270 # corrupting the content of those files. |
221 if not chunk: | 271 if dataset.to_posix_lines: |
222 break | 272 if dataset.space_to_tab: |
223 os.write( fd, chunk ) | 273 line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place ) |
224 os.close( fd ) | 274 else: |
225 zipped_file.close() | 275 line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place ) |
226 uncompressed_name = name | 276 if dataset.file_type == 'auto': |
227 unzipped = True | 277 ext = sniff.guess_ext( dataset.path, registry.sniff_order ) |
228 else: | 278 else: |
229 # python < 2.5 doesn't have a way to read members in chunks(!) | 279 ext = dataset.file_type |
230 try: | 280 data_type = ext |
231 outfile = open( uncompressed, 'wb' ) | |
232 outfile.write( z.read( name ) ) | |
233 outfile.close() | |
234 uncompressed_name = name | |
235 unzipped = True | |
236 except IOError: | |
237 os.close( fd ) | |
238 os.remove( uncompressed ) | |
239 file_err( 'Problem decompressing zipped data', dataset, json_file ) | |
240 return | |
241 z.close() | |
242 # Replace the zipped file with the decompressed file if it's safe to do so | |
243 if uncompressed is not None: | |
244 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: | |
245 dataset.path = uncompressed | |
246 else: | |
247 shutil.move( uncompressed, dataset.path ) | |
248 os.chmod(dataset.path, 0644) | |
249 dataset.name = uncompressed_name | |
250 data_type = 'zip' | |
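The zip branch, essentially unchanged between the two revisions apart from its position, extracts only the first regular file in the archive: directory entries are skipped, members are streamed in 1 MiB chunks through `ZipFile.open()` on Python 2.6+, older interpreters fall back to a whole-file `z.read(name)`, and a warning is recorded when further members are ignored. A condensed standalone sketch of the modern path, with placeholder paths and exceptions in place of `file_err`:

```python
# Sketch of "extract the first file member of a zip archive, in chunks" (modern Python).
import os
import tempfile
import zipfile

CHUNK_SIZE = 2 ** 20  # 1 MiB

def extract_first_member(zip_path, dest_dir):
    """Return (temp_path, member_name, had_more_members) for the first non-directory entry."""
    with zipfile.ZipFile(zip_path) as z:
        names = [n for n in z.namelist() if not n.endswith('/')]
        if not names:
            raise ValueError('archive contains no files')
        fd, out_path = tempfile.mkstemp(prefix='upload_zip_', dir=dest_dir)
        with z.open(names[0]) as member:
            while True:
                chunk = member.read(CHUNK_SIZE)
                if not chunk:
                    break
                os.write(fd, chunk)
        os.close(fd)
        return out_path, names[0], len(names) > 1
```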
251 if not data_type: | |
252 # TODO refactor this logic. check_binary isn't guaranteed to be | |
253 # correct since it only looks at whether the first 100 chars are | |
254 # printable or not. If someone specifies a known unsniffable | |
255 # binary datatype and check_binary fails, the file gets mangled. | |
256 if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type): | |
257 # We have a binary dataset, but it is not Bam, Sff or Pdf | |
258 data_type = 'binary' | |
259 #binary_ok = False | |
260 parts = dataset.name.split( "." ) | |
261 if len( parts ) > 1: | |
262 ext = parts[-1].strip().lower() | |
263 if not Binary.is_ext_unsniffable(ext): | |
264 file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) | |
265 return | |
266 elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: | |
267 err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) | |
268 file_err( err_msg, dataset, json_file ) | |
269 return | |
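The TODO removed in the right-hand revision explains the weakness of `check_binary`: it only looks at whether the first ~100 characters are printable, so a file of a known unsniffable binary type that happens to start with printable bytes can be mishandled. A rough standalone rendering of that heuristic, for illustration only (Galaxy's actual `check_binary` lives elsewhere and may differ):

```python
# Rough sketch of the "are the first N bytes printable?" heuristic the TODO describes.
import string

def looks_binary(path, nbytes=100):
    printable = set(string.printable.encode('ascii'))
    with open(path, 'rb') as fh:
        head = fh.read(nbytes)
    return any(byte not in printable for byte in head)
```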
270 if not data_type: | |
271 # We must have a text file | |
272 if check_html( dataset.path ): | |
273 file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) | |
274 return | |
275 if data_type != 'binary': | |
276 if link_data_only == 'copy_files': | |
277 if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: | |
278 in_place = False | |
279 # Convert universal line endings to Posix line endings, but allow the user to turn it off, | |
280 # so that is becomes possible to upload gzip, bz2 or zip files with binary data without | |
281 # corrupting the content of those files. | |
282 if dataset.to_posix_lines: | |
283 tmpdir = output_adjacent_tmpdir( output_path ) | |
284 tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id | |
285 if dataset.space_to_tab: | |
286 line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) | |
287 else: | |
288 line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) | |
289 if dataset.file_type == 'auto': | |
290 ext = sniff.guess_ext( dataset.path, registry.sniff_order ) | |
291 else: | |
292 ext = dataset.file_type | |
293 data_type = ext | |
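For anything that is not binary, both revisions normalise line endings to POSIX and optionally convert runs of spaces to tabs via Galaxy's `sniff.convert_newlines` / `convert_newlines_sep2tabs`; the left-hand revision additionally routes the temp file to the output-adjacent directory through `tmp_dir`/`tmp_prefix`. The sketch below only illustrates the conversion idea in standalone form; the space-to-tab regex is an assumption made to keep it self-contained, not a copy of the sniff helpers.

```python
# Standalone sketch of newline normalisation with optional space-to-tab conversion.
import os
import re
import shutil
import tempfile

def normalize_newlines(path, space_to_tab=False, in_place=True, tmp_dir=None):
    """Rewrite `path` with POSIX newlines; return (line_count, resulting_path)."""
    fd, converted = tempfile.mkstemp(prefix='convert_', dir=tmp_dir)
    line_count = 0
    with open(path, 'r') as src, os.fdopen(fd, 'w', newline='\n') as dst:
        for line in src:                      # text mode translates \r\n and \r to \n
            line = line.rstrip('\r\n')
            if space_to_tab:
                line = re.sub(r' +', '\t', line)
            dst.write(line + '\n')
            line_count += 1
    if in_place:
        shutil.move(converted, path)
        return line_count, path
    return line_count, converted
```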
294 # Save job info for the framework | 281 # Save job info for the framework |
295 if ext == 'auto' and dataset.ext: | 282 if ext == 'auto' and dataset.ext: |
296 ext = dataset.ext | 283 ext = dataset.ext |
297 if ext == 'auto': | 284 if ext == 'auto': |
298 ext = 'data' | 285 ext = 'data' |
325 stdout = stdout, | 312 stdout = stdout, |
326 name = dataset.name, | 313 name = dataset.name, |
327 line_count = line_count ) | 314 line_count = line_count ) |
328 if dataset.get('uuid', None) is not None: | 315 if dataset.get('uuid', None) is not None: |
329 info['uuid'] = dataset.get('uuid') | 316 info['uuid'] = dataset.get('uuid') |
330 json_file.write( dumps( info ) + "\n" ) | 317 json_file.write( to_json_string( info ) + "\n" ) |
331 | 318 |
332 if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ): | 319 if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ): |
333 # Groom the dataset content if necessary | 320 # Groom the dataset content if necessary |
334 datatype.groom_dataset_content( output_path ) | 321 datatype.groom_dataset_content( output_path ) |
335 | 322 |
351 file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file ) | 338 file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file ) |
352 return | 339 return |
353 dataset.path = temp_name | 340 dataset.path = temp_name |
354 dp = temp_name | 341 dp = temp_name |
355 if not value.is_binary: | 342 if not value.is_binary: |
356 tmpdir = output_adjacent_tmpdir( output_path ) | |
357 tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id | |
358 if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ): | 343 if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ): |
359 sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) | 344 sniff.convert_newlines_sep2tabs( dp ) |
360 else: | 345 else: |
361 sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) | 346 sniff.convert_newlines( dp ) |
362 shutil.move( dp, os.path.join( files_path, name ) ) | 347 shutil.move( dp, os.path.join( files_path, name ) ) |
363 # Move the dataset to its "real" path | 348 # Move the dataset to its "real" path |
364 shutil.move( dataset.primary_file, output_path ) | 349 shutil.move( dataset.primary_file, output_path ) |
365 # Write the job info | 350 # Write the job info |
366 info = dict( type = 'dataset', | 351 info = dict( type = 'dataset', |
367 dataset_id = dataset.dataset_id, | 352 dataset_id = dataset.dataset_id, |
368 stdout = 'uploaded %s file' % dataset.file_type ) | 353 stdout = 'uploaded %s file' % dataset.file_type ) |
369 json_file.write( dumps( info ) + "\n" ) | 354 json_file.write( to_json_string( info ) + "\n" ) |
370 | |
371 | |
372 def output_adjacent_tmpdir( output_path ): | |
373 """ For temp files that will ultimately be moved to output_path anyway | |
374 just create the file directly in output_path's directory so shutil.move | |
375 will work optimially. | |
376 """ | |
377 return os.path.dirname( output_path ) | |
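`output_adjacent_tmpdir`, dropped in the right-hand revision, exists so that temp files are created on the same filesystem as the final output; `shutil.move` can then complete as a cheap rename rather than a copy-and-delete. A short usage sketch (directory and file names are made up):

```python
# Illustrative use of the output-adjacent temp directory idea (paths are examples).
import os
import shutil
import tempfile

work_dir = tempfile.mkdtemp()                      # stand-in for the job's output directory
output_path = os.path.join(work_dir, 'dataset_42.dat')
tmp_dir = os.path.dirname(output_path)             # what output_adjacent_tmpdir() returns
fd, tmp_path = tempfile.mkstemp(prefix='data_id_42_convert_', dir=tmp_dir)
os.write(fd, b'converted content\n')
os.close(fd)
shutil.move(tmp_path, output_path)                 # same filesystem, so effectively os.rename
```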
378 | |
379 | 355 |
380 def __main__(): | 356 def __main__(): |
381 | 357 |
382 if len( sys.argv ) < 4: | 358 if len( sys.argv ) < 4: |
383 print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...' | 359 print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...' |
388 | 364 |
389 registry = Registry() | 365 registry = Registry() |
390 registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] ) | 366 registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] ) |
391 | 367 |
392 for line in open( sys.argv[3], 'r' ): | 368 for line in open( sys.argv[3], 'r' ): |
393 dataset = loads( line ) | 369 dataset = from_json_string( line ) |
394 dataset = util.bunch.Bunch( **safe_dict( dataset ) ) | 370 dataset = util.bunch.Bunch( **safe_dict( dataset ) ) |
395 try: | 371 try: |
396 output_path = output_paths[int( dataset.dataset_id )][0] | 372 output_path = output_paths[int( dataset.dataset_id )][0] |
397 except: | 373 except: |
398 print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id | 374 print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id |
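`__main__` drives everything from a JSON-lines parameter file: each line is parsed (again with `loads` on the left and `from_json_string` on the right), wrapped in an attribute-style bunch, and matched to an output path taken from the command line. A trimmed standalone sketch of that loop follows; the `Bunch` class, file names and paths here are simplified stand-ins rather than Galaxy's own.

```python
# Simplified sketch of the JSON-lines driver loop; names and paths are illustrative.
import json
import sys

class Bunch(dict):
    """Tiny attribute-access dict standing in for galaxy.util.bunch.Bunch."""
    __getattr__ = dict.get

def run(param_file, output_paths):
    for line in open(param_file):
        dataset = Bunch(json.loads(line))
        try:
            output_path = output_paths[int(dataset.dataset_id)]
        except (KeyError, TypeError, ValueError):
            sys.stderr.write('Output path for dataset %s not found\n' % dataset.dataset_id)
            continue
        # ...sniff the type, convert newlines, and move the file to output_path here...

if __name__ == '__main__':
    # tiny self-contained demo: write one parameter line, then process it
    with open('params.jsonl', 'w') as fh:
        fh.write(json.dumps({'dataset_id': 0, 'file_type': 'auto'}) + '\n')
    run('params.jsonl', {0: '/tmp/dataset_0.dat'})
```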