comparison uploadzip.py @ 2:30d16d36d536 draft
Uploaded
author:   davidvanzessen
date:     Mon, 30 Mar 2015 07:58:53 -0400
parents:  4f3d79062c18
children: 6f24bce6817e
comparison: 1:fb547483e7bd -> 2:30d16d36d536
@@ -115 +115 @@
     else:
         type_info = Binary.is_sniffable_binary( dataset.path )
         if type_info:
             data_type = type_info[0]
             ext = type_info[1]
-            data_type="binary"
     if not data_type:
-        shutil.move( dataset.path, output_path )
-        #data_type = "data"
+        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+        is_gzipped, is_valid = check_gzip( dataset.path )
+        if is_gzipped and not is_valid:
+            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
+            return
+        elif is_gzipped and is_valid:
+            if link_data_only == 'copy_files':
+                # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
+                CHUNK_SIZE = 2**20 # 1Mb
+                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
+                while 1:
+                    try:
+                        chunk = gzipped_file.read( CHUNK_SIZE )
+                    except IOError:
+                        os.close( fd )
+                        os.remove( uncompressed )
+                        file_err( 'Problem decompressing gzipped data', dataset, json_file )
+                        return
+                    if not chunk:
+                        break
+                    os.write( fd, chunk )
+                os.close( fd )
+                gzipped_file.close()
+                # Replace the gzipped file with the decompressed file if it's safe to do so
+                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                    dataset.path = uncompressed
+                else:
+                    shutil.move( uncompressed, dataset.path )
+                os.chmod(dataset.path, 0644)
+            dataset.name = dataset.name.rstrip( '.gz' )
+            data_type = 'gzip'
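
The chunked while-loop above decompresses without ever holding the whole archive in memory. On Python 2.7+/3 the same decompress-to-temp-file pattern can be written more compactly with shutil.copyfileobj; a minimal sketch, with illustrative names rather than Galaxy's (gunzip_to_temp is hypothetical):

    import gzip
    import os
    import shutil
    import tempfile

    def gunzip_to_temp(src_path, dest_dir, chunk_size=2**20):
        # Stream-decompress src_path into a temp file in dest_dir, 1 MB at a time.
        fd, out_path = tempfile.mkstemp(prefix='gunzip_', dir=dest_dir)
        try:
            with os.fdopen(fd, 'wb') as out, gzip.open(src_path, 'rb') as gz:
                shutil.copyfileobj(gz, out, chunk_size)
        except (IOError, OSError):
            os.remove(out_path)  # mirror the cleanup in the loop above
            raise
        return out_path
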
+        if not data_type and bz2 is not None:
+            # See if we have a bz2 file, much like gzip
+            is_bzipped, is_valid = check_bz2( dataset.path )
+            if is_bzipped and not is_valid:
+                file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
+                return
+            elif is_bzipped and is_valid:
+                if link_data_only == 'copy_files':
+                    # We need to uncompress the temp_name file
+                    CHUNK_SIZE = 2**20 # 1Mb
+                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                    bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+                    while 1:
+                        try:
+                            chunk = bzipped_file.read( CHUNK_SIZE )
+                        except IOError:
+                            os.close( fd )
+                            os.remove( uncompressed )
+                            file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                            return
+                        if not chunk:
+                            break
+                        os.write( fd, chunk )
+                    os.close( fd )
+                    bzipped_file.close()
+                    # Replace the bzipped file with the decompressed file if it's safe to do so
+                    if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                        dataset.path = uncompressed
+                    else:
+                        shutil.move( uncompressed, dataset.path )
+                    os.chmod(dataset.path, 0644)
+                dataset.name = dataset.name.rstrip( '.bz2' )
+                data_type = 'bz2'
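
One caveat worth flagging in both branches above: str.rstrip() strips a set of characters, not a suffix, so dataset.name.rstrip( '.gz' ) can over-trim names whose stem ends in those characters ('riz.gz'.rstrip('.gz') yields 'ri'). A safer exact-suffix strip, as a hypothetical helper rather than anything this script defines:

    def strip_suffix(name, suffix):
        # Remove an exact trailing suffix; leave the name untouched otherwise.
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    assert strip_suffix('riz.gz', '.gz') == 'riz'
    assert strip_suffix('reads.bz2', '.bz2') == 'reads'
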
+        if not data_type:
+            # See if we have a zip archive
+            is_zipped = check_zip( dataset.path )
+            if is_zipped:
+                if link_data_only == 'copy_files':
+                    CHUNK_SIZE = 2**20 # 1Mb
+                    uncompressed = None
+                    uncompressed_name = None
+                    unzipped = False
+                    z = zipfile.ZipFile( dataset.path )
+                    for name in z.namelist():
+                        if name.endswith('/'):
+                            continue
+                        if unzipped:
+                            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+                            break
+                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                        if sys.version_info[:2] >= ( 2, 6 ):
+                            zipped_file = z.open( name )
+                            while 1:
+                                try:
+                                    chunk = zipped_file.read( CHUNK_SIZE )
+                                except IOError:
+                                    os.close( fd )
+                                    os.remove( uncompressed )
+                                    file_err( 'Problem decompressing zipped data', dataset, json_file )
+                                    return
+                                if not chunk:
+                                    break
+                                os.write( fd, chunk )
+                            os.close( fd )
+                            zipped_file.close()
+                            uncompressed_name = name
+                            unzipped = True
+                        else:
+                            # python < 2.6 doesn't have a way to read zip members in chunks(!)
+                            try:
+                                outfile = open( uncompressed, 'wb' )
+                                outfile.write( z.read( name ) )
+                                outfile.close()
+                                uncompressed_name = name
+                                unzipped = True
+                            except IOError:
+                                os.close( fd )
+                                os.remove( uncompressed )
+                                file_err( 'Problem decompressing zipped data', dataset, json_file )
+                                return
+                    z.close()
+                    # Replace the zipped file with the decompressed file if it's safe to do so
+                    if uncompressed is not None:
+                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                            dataset.path = uncompressed
+                        else:
+                            shutil.move( uncompressed, dataset.path )
+                        os.chmod(dataset.path, 0644)
+                        dataset.name = uncompressed_name
+                data_type = 'zip'
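
The for-loop above adds only the first regular member of the archive and reports multi-file ZIPs via stdout. A compact sketch of that "first member only" extraction for Python 2.6+/3 (function and variable names are illustrative, not Galaxy's):

    import os
    import shutil
    import tempfile
    import zipfile

    def extract_first_member(zip_path, dest_dir, chunk_size=2**20):
        # Return (member_name, extracted_path) for the first non-directory entry, or (None, None).
        z = zipfile.ZipFile(zip_path)
        try:
            for name in z.namelist():
                if name.endswith('/'):
                    continue  # skip directory entries
                fd, out_path = tempfile.mkstemp(prefix='unzip_', dir=dest_dir)
                with os.fdopen(fd, 'wb') as out:
                    member = z.open(name)
                    shutil.copyfileobj(member, out, chunk_size)
                    member.close()
                return name, out_path
        finally:
            z.close()
        return None, None
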
+        if not data_type:
+            if check_binary( dataset.path ):
+                # We have a binary dataset, but it is not Bam, Sff or Pdf
+                data_type = 'binary'
+                #binary_ok = False
+                parts = dataset.name.split( "." )
+                if len( parts ) > 1:
+                    ext = parts[-1].strip().lower()
+                    if not Binary.is_ext_unsniffable(ext):
+                        file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+                        return
+                    elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+                        file_err( err_msg, dataset, json_file )
+                        return
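
In prose, the gate above says: for a binary file that cannot be sniffed, trust the filename extension only if the registry knows it as an unsniffable type, and even then require the user's declared 'File Format' to match. A hypothetical distillation of that decision (is_ext_unsniffable stands in for Binary.is_ext_unsniffable):

    def vet_binary_upload(name, declared_type, is_ext_unsniffable):
        # Returns None when the upload is acceptable, else an error string.
        parts = name.split('.')
        if len(parts) > 1:
            ext = parts[-1].strip().lower()
            if not is_ext_unsniffable(ext):
                return 'The uploaded binary file contains inappropriate content'
            if declared_type != ext:
                return "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext)
        return None
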
+        if not data_type:
+            # We must have a text file
+            if check_html( dataset.path ):
+                file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+                return
+        if data_type != 'binary':
+            if link_data_only == 'copy_files':
+                if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
+                    in_place = False
+                # Convert universal line endings to Posix line endings, but allow the user to turn it off,
+                # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
+                # corrupting the content of those files.
+                if dataset.to_posix_lines:
+                    if dataset.space_to_tab:
+                        line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
+                    else:
+                        line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
+            if dataset.file_type == 'auto':
+                ext = sniff.guess_ext( dataset.path, registry.sniff_order )
+            else:
+                ext = dataset.file_type
+            data_type = ext
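
sniff.convert_newlines and sniff.convert_newlines_sep2tabs come from Galaxy's sniff module and return a (line_count, converted_path) pair. A rough, assumption-level stand-in for the plain newline conversion, written in the script's Python 2 idiom (this is not Galaxy's implementation):

    import os
    import shutil
    import tempfile

    def convert_newlines(path, in_place=True):
        # Rewrite `path` with POSIX '\n' endings; return (line_count, path_used).
        fd, temp_path = tempfile.mkstemp()
        line_count = 0
        src = open(path, 'rU')  # universal-newline read mode (Python 2)
        dst = os.fdopen(fd, 'w')
        for line in src:
            line_count += 1
            dst.write(line.rstrip('\r\n') + '\n')
        src.close()
        dst.close()
        if in_place:
            shutil.move(temp_path, path)
            return line_count, path
        return line_count, temp_path
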
     # Save job info for the framework
     if ext == 'auto' and dataset.ext:
         ext = dataset.ext
     if ext == 'auto':
         ext = 'data'
@@ -143 +301 @@
                 pass
         else:
             # This should not happen, but it's here just in case
             shutil.copy( dataset.path, output_path )
     elif link_data_only == 'copy_files':
-        if os.path.exists(dataset.path):
-            shutil.move( dataset.path, output_path )
+        shutil.move( dataset.path, output_path )
     # Write the job info
     stdout = stdout or 'uploaded %s file' % data_type
     info = dict( type = 'dataset',
                  dataset_id = dataset.dataset_id,
                  ext = ext,
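
The info dict is cut off at this point in the comparison view. For orientation only: Galaxy-style upload tools emit each such record to json_file as one JSON object per line, along the lines of this hedged sketch (field values are placeholders, and json.dumps stands in for however the script serializes):

    import json

    def write_job_info(json_file, info):
        # One JSON object per line, read back by the framework after the job finishes.
        json_file.write(json.dumps(info) + '\n')

    with open('job_info_example.jsonl', 'w') as json_file:
        write_job_info(json_file, dict(type='dataset', dataset_id=1, ext='data'))
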