comparison: uploadzip.py @ 2:30d16d36d536 (draft)

commit message:  Uploaded
author:          davidvanzessen
date:            Mon, 30 Mar 2015 07:58:53 -0400
parents:         4f3d79062c18
children:        6f24bce6817e
comparing revision 1:fb547483e7bd with revision 2:30d16d36d536

--- uploadzip.py (revision 1:fb547483e7bd)
+++ uploadzip.py (revision 2:30d16d36d536)
@@ -115,13 +115,171 @@
 else:
     type_info = Binary.is_sniffable_binary( dataset.path )
     if type_info:
         data_type = type_info[0]
         ext = type_info[1]
+        data_type="binary"
 if not data_type:
-    shutil.move( dataset.path, output_path )
-    #data_type = "data"
+    # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+    is_gzipped, is_valid = check_gzip( dataset.path )
+    if is_gzipped and not is_valid:
+        file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
+        return
+    elif is_gzipped and is_valid:
+        if link_data_only == 'copy_files':
+            # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
+            CHUNK_SIZE = 2**20 # 1Mb
+            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+            gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
+            while 1:
+                try:
+                    chunk = gzipped_file.read( CHUNK_SIZE )
+                except IOError:
+                    os.close( fd )
+                    os.remove( uncompressed )
+                    file_err( 'Problem decompressing gzipped data', dataset, json_file )
+                    return
+                if not chunk:
+                    break
+                os.write( fd, chunk )
+            os.close( fd )
+            gzipped_file.close()
+            # Replace the gzipped file with the decompressed file if it's safe to do so
+            if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                dataset.path = uncompressed
+            else:
+                shutil.move( uncompressed, dataset.path )
+                os.chmod(dataset.path, 0644)
+        dataset.name = dataset.name.rstrip( '.gz' )
+        data_type = 'gzip'
+if not data_type and bz2 is not None:
+    # See if we have a bz2 file, much like gzip
+    is_bzipped, is_valid = check_bz2( dataset.path )
+    if is_bzipped and not is_valid:
+        file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
+        return
+    elif is_bzipped and is_valid:
+        if link_data_only == 'copy_files':
+            # We need to uncompress the temp_name file
+            CHUNK_SIZE = 2**20 # 1Mb
+            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+            bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+            while 1:
+                try:
+                    chunk = bzipped_file.read( CHUNK_SIZE )
+                except IOError:
+                    os.close( fd )
+                    os.remove( uncompressed )
+                    file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                    return
+                if not chunk:
+                    break
+                os.write( fd, chunk )
+            os.close( fd )
+            bzipped_file.close()
+            # Replace the bzipped file with the decompressed file if it's safe to do so
+            if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                dataset.path = uncompressed
+            else:
+                shutil.move( uncompressed, dataset.path )
+                os.chmod(dataset.path, 0644)
+        dataset.name = dataset.name.rstrip( '.bz2' )
+        data_type = 'bz2'
+if not data_type:
+    # See if we have a zip archive
+    is_zipped = check_zip( dataset.path )
+    if is_zipped:
+        if link_data_only == 'copy_files':
+            CHUNK_SIZE = 2**20 # 1Mb
+            uncompressed = None
+            uncompressed_name = None
+            unzipped = False
+            z = zipfile.ZipFile( dataset.path )
+            for name in z.namelist():
+                if name.endswith('/'):
+                    continue
+                if unzipped:
+                    stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+                    break
+                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                if sys.version_info[:2] >= ( 2, 6 ):
+                    zipped_file = z.open( name )
+                    while 1:
+                        try:
+                            chunk = zipped_file.read( CHUNK_SIZE )
+                        except IOError:
+                            os.close( fd )
+                            os.remove( uncompressed )
+                            file_err( 'Problem decompressing zipped data', dataset, json_file )
+                            return
+                        if not chunk:
+                            break
+                        os.write( fd, chunk )
+                    os.close( fd )
+                    zipped_file.close()
+                    uncompressed_name = name
+                    unzipped = True
+                else:
+                    # python < 2.6 doesn't have a way to read members in chunks(!)
+                    try:
+                        outfile = open( uncompressed, 'wb' )
+                        outfile.write( z.read( name ) )
+                        outfile.close()
+                        uncompressed_name = name
+                        unzipped = True
+                    except IOError:
+                        os.close( fd )
+                        os.remove( uncompressed )
+                        file_err( 'Problem decompressing zipped data', dataset, json_file )
+                        return
+            z.close()
+            # Replace the zipped file with the decompressed file if it's safe to do so
+            if uncompressed is not None:
+                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                    dataset.path = uncompressed
+                else:
+                    shutil.move( uncompressed, dataset.path )
+                    os.chmod(dataset.path, 0644)
+                dataset.name = uncompressed_name
+                data_type = 'zip'
+if not data_type:
+    if check_binary( dataset.path ):
+        # We have a binary dataset, but it is not Bam, Sff or Pdf
+        data_type = 'binary'
+        #binary_ok = False
+        parts = dataset.name.split( "." )
+        if len( parts ) > 1:
+            ext = parts[-1].strip().lower()
+            if not Binary.is_ext_unsniffable(ext):
+                file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+                return
+            elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+                err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+                file_err( err_msg, dataset, json_file )
+                return
+if not data_type:
+    # We must have a text file
+    if check_html( dataset.path ):
+        file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+        return
+if data_type != 'binary':
+    if link_data_only == 'copy_files':
+        if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
+            in_place = False
+        # Convert universal line endings to Posix line endings, but allow the user to turn it off,
+        # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
+        # corrupting the content of those files.
+        if dataset.to_posix_lines:
+            if dataset.space_to_tab:
+                line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
+            else:
+                line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
+    if dataset.file_type == 'auto':
+        ext = sniff.guess_ext( dataset.path, registry.sniff_order )
+    else:
+        ext = dataset.file_type
+    data_type = ext
 # Save job info for the framework
 if ext == 'auto' and dataset.ext:
     ext = dataset.ext
 if ext == 'auto':
     ext = 'data'
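
The hunk above leans on helpers defined elsewhere in the upload script and outside this comparison: check_gzip, check_bz2, check_zip, check_binary, check_html, Binary.is_ext_unsniffable, and the sniff module. The sketches that follow are for orientation only; none of them is the code this changeset ships. First, check_gzip must return an ( is_gzipped, is_valid ) pair. A minimal sketch, assuming detection by the two gzip magic bytes plus a trial decompression (the shipped helper also screens the decompressed head for disallowed content):

import gzip

GZIP_MAGIC = b'\x1f\x8b'    # hypothetical constant; magic-byte detection is an assumption

def check_gzip( temp_name ):
    # Returns ( is_gzipped, is_valid ) as the caller above expects.
    temp = open( temp_name, 'rb' )
    magic_check = temp.read( 2 )
    temp.close()
    if magic_check != GZIP_MAGIC:
        return ( False, False )           # not a gzip stream at all
    gzipped_file = gzip.GzipFile( temp_name, 'rb' )
    try:
        chunk = gzipped_file.read( 2**15 )    # 32Kb trial read: does it decompress?
    except IOError:
        return ( True, False )            # gzip magic, but an unreadable stream
    finally:
        gzipped_file.close()
    # The real helper decides validity by running content checks on `chunk`
    # (e.g. an HTML screen like the one sketched further down).
    return ( True, True )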
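check_bz2, consumed by the bz2 branch, follows the same contract. The `bz2 is not None` guard in the hunk exists because the bz2 module may be unavailable to the interpreter; the sketch below assumes it imported, and the three-byte 'BZh' magic is again hypothetical shorthand for the real check:

import bz2

BZ2_MAGIC = b'BZh'

def check_bz2( temp_name ):
    # Returns ( is_bzipped, is_valid ), mirroring check_gzip.
    temp = open( temp_name, 'rb' )
    magic_check = temp.read( 3 )
    temp.close()
    if magic_check != BZ2_MAGIC:
        return ( False, False )
    bzipped_file = bz2.BZ2File( temp_name, 'rb' )
    try:
        bzipped_file.read( 2**15 )        # trial decompression of a sniffable chunk
    except IOError:
        return ( True, False )
    finally:
        bzipped_file.close()
    return ( True, True )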
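The zip branch only needs a boolean from check_zip, so a sketch can defer entirely to the standard library:

import zipfile

def check_zip( temp_name ):
    # True when the file carries a readable zip central-directory record.
    return zipfile.is_zipfile( temp_name )

Because is_zipfile validates only the archive structure, the branch above still wraps every member read in try/except IOError, and it deliberately keeps just the first real member: directory entries (names ending in '/') are skipped, and any further member only sets the "more than one file" notice in stdout.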
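For the binary branch, check_binary and Binary.is_ext_unsniffable are likewise out of view. The sketch below captures the two contracts the branch relies on: a content test for "looks binary", and an extension registry for binary formats that cannot be recognized from content. Both bodies, and the example registry entries, are assumptions:

class Binary( object ):
    # Hypothetical stand-in for Galaxy's datatype registry hook: binary
    # formats with no sniffable signature are registered by extension.
    unsniffable_binary_formats = [ 'ab1', 'scf' ]   # illustrative entries only

    @staticmethod
    def is_ext_unsniffable( ext ):
        return ext in Binary.unsniffable_binary_formats

def check_binary( temp_name, chunk=None ):
    # Treat the upload as binary when a leading chunk contains a NUL byte.
    if chunk is None:
        f = open( temp_name, 'rb' )
        chunk = f.read( 2**15 )
        f.close()
    return b'\x00' in chunk

Under these contracts the branch rejects binary uploads whose extension is unregistered and, when the extension is registered but differs from dataset.file_type, asks the user to set 'File Format' explicitly.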
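check_html is the last gatekeeper before a file is treated as text: it stops uploads that could later be served back as live HTML. A sketch assuming a single tag pattern (the shipped check presumably tries several):

import re

# Hypothetical pattern; the real helper's patterns are not shown in this hunk.
HTML_PATTERN = re.compile( r'<\s*(html|head|body|a\s[^>]*href)', re.IGNORECASE )

def check_html( temp_name, chunk=None ):
    # True when a leading chunk of the file looks like HTML markup.
    if chunk is None:
        f = open( temp_name, 'r' )
        chunk = f.read( 2**15 )
        f.close()
    return bool( HTML_PATTERN.search( chunk ) )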
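Finally, the call sites show that sniff.convert_newlines and sniff.convert_newlines_sep2tabs return a ( line_count, converted_path ) pair, where converted_path is meaningful when in_place is False; converted_path is consumed by the portion of the file collapsed out of this comparison (old lines 128-142, new 286-300). A sketch of the newline half under that contract:

import os
import shutil
import tempfile

def convert_newlines( fname, in_place=True ):
    # Rewrite any mix of \r\n, \r and \n line endings as plain \n.
    fd, temp_name = tempfile.mkstemp()
    fp = os.fdopen( fd, 'wt' )
    line_count = 0
    for line in open( fname, 'rU' ):      # 'rU' = universal newlines (Python 2)
        fp.write( '%s\n' % line.rstrip( '\r\n' ) )
        line_count += 1
    fp.close()
    if in_place:
        shutil.move( temp_name, fname )
        return ( line_count, None )
    return ( line_count, temp_name )

The second hunk of the comparison, below, is small: it drops an os.path.exists guard in front of the final move into output_path.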
@@ -143,12 +301,11 @@
             pass
     else:
         # This should not happen, but it's here just in case
        shutil.copy( dataset.path, output_path )
 elif link_data_only == 'copy_files':
-    if os.path.exists(dataset.path):
-        shutil.move( dataset.path, output_path )
+    shutil.move( dataset.path, output_path )
 # Write the job info
 stdout = stdout or 'uploaded %s file' % data_type
 info = dict( type = 'dataset',
              dataset_id = dataset.dataset_id,
              ext = ext,