Mercurial > repos > ecology > xarray_select
comparison xarray_tool.py @ 2:e639f9b79ba1 draft
"planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/data_manipulation/xarray/ commit 57b6d23e3734d883e71081c78e77964d61be82ba"
| author | ecology |
|---|---|
| date | Sun, 06 Jun 2021 08:48:37 +0000 |
| parents | 054c9023586a |
| children | 43434a5c8fb2 |
comparison
equal
deleted
inserted
replaced
| 1:054c9023586a | 2:e639f9b79ba1 |
|---|---|
| 2 # - getting metadata information | 2 # - getting metadata information |
| 3 # - select data and save results in csv file for further post-processing | 3 # - select data and save results in csv file for further post-processing |
| 4 | 4 |
| 5 import argparse | 5 import argparse |
| 6 import csv | 6 import csv |
| 7 import os | |
| 7 import warnings | 8 import warnings |
| 8 | 9 |
| 9 import geopandas as gdp | 10 import geopandas as gdp |
| 10 | 11 |
| 11 import pandas as pd | 12 import pandas as pd |
| 19 class XarrayTool (): | 20 class XarrayTool (): |
| 20 def __init__(self, infile, outfile_info="", outfile_summary="", | 21 def __init__(self, infile, outfile_info="", outfile_summary="", |
| 21 select="", outfile="", outputdir="", latname="", | 22 select="", outfile="", outputdir="", latname="", |
| 22 latvalN="", latvalS="", lonname="", lonvalE="", | 23 latvalN="", latvalS="", lonname="", lonvalE="", |
| 23 lonvalW="", filter_list="", coords="", time="", | 24 lonvalW="", filter_list="", coords="", time="", |
| 24 verbose=False | 25 verbose=False, no_missing=False, coords_info=None, |
| 25 ): | 26 tolerance=None): |
| 26 self.infile = infile | 27 self.infile = infile |
| 27 self.outfile_info = outfile_info | 28 self.outfile_info = outfile_info |
| 28 self.outfile_summary = outfile_summary | 29 self.outfile_summary = outfile_summary |
| 29 self.select = select | 30 self.select = select |
| 30 self.outfile = outfile | 31 self.outfile = outfile |
| 31 self.outputdir = outputdir | 32 self.outputdir = outputdir |
| 32 self.latname = latname | 33 self.latname = latname |
| 34 if tolerance != "" and tolerance is not None: | |
| 35 self.tolerance = float(tolerance) | |
| 36 else: | |
| 37 self.tolerance = -1 | |
| 33 if latvalN != "" and latvalN is not None: | 38 if latvalN != "" and latvalN is not None: |
| 34 self.latvalN = float(latvalN) | 39 self.latvalN = float(latvalN) |
| 35 else: | 40 else: |
| 36 self.latvalN = "" | 41 self.latvalN = "" |
| 37 if latvalS != "" and latvalS is not None: | 42 if latvalS != "" and latvalS is not None: |
| 49 self.lonvalW = "" | 54 self.lonvalW = "" |
| 50 self.filter = filter_list | 55 self.filter = filter_list |
| 51 self.time = time | 56 self.time = time |
| 52 self.coords = coords | 57 self.coords = coords |
| 53 self.verbose = verbose | 58 self.verbose = verbose |
| 59 self.no_missing = no_missing | |
| 54 # initialization | 60 # initialization |
| 55 self.dset = None | 61 self.dset = None |
| 56 self.gset = None | 62 self.gset = None |
| 63 self.coords_info = coords_info | |
| 57 if self.verbose: | 64 if self.verbose: |
| 58 print("infile: ", self.infile) | 65 print("infile: ", self.infile) |
| 59 print("outfile_info: ", self.outfile_info) | 66 print("outfile_info: ", self.outfile_info) |
| 60 print("outfile_summary: ", self.outfile_summary) | 67 print("outfile_summary: ", self.outfile_summary) |
| 61 print("outfile: ", self.outfile) | 68 print("outfile: ", self.outfile) |
| 69 print("lonvalE: ", self.lonvalE) | 76 print("lonvalE: ", self.lonvalE) |
| 70 print("lonvalW: ", self.lonvalW) | 77 print("lonvalW: ", self.lonvalW) |
| 71 print("filter: ", self.filter) | 78 print("filter: ", self.filter) |
| 72 print("time: ", self.time) | 79 print("time: ", self.time) |
| 73 print("coords: ", self.coords) | 80 print("coords: ", self.coords) |
| 81 print("coords_info: ", self.coords_info) | |
| 74 | 82 |
| 75 def info(self): | 83 def info(self): |
| 76 f = open(self.outfile_info, 'w') | 84 f = open(self.outfile_info, 'w') |
| 77 ds = xr.open_dataset(self.infile) | 85 ds = xr.open_dataset(self.infile) |
| 78 ds.info(f) | 86 ds.info(f) |
| 111 if (op == 'bi'): | 119 if (op == 'bi'): |
| 112 rl = float(split_filter[3]) | 120 rl = float(split_filter[3]) |
| 113 if filter_varname == self.select: | 121 if filter_varname == self.select: |
| 114 # filter on values of the selected variable | 122 # filter on values of the selected variable |
| 115 if op == 'bi': | 123 if op == 'bi': |
| 116 self.dset = self.dset.where((self.dset <= rl) & (self.dset >= ll)) | 124 self.dset = self.dset.where( |
| 125 (self.dset <= rl) & (self.dset >= ll) | |
| 126 ) | |
| 117 elif op == 'le': | 127 elif op == 'le': |
| 118 self.dset = self.dset.where(self.dset <= ll) | 128 self.dset = self.dset.where(self.dset <= ll) |
| 119 elif op == 'ge': | 129 elif op == 'ge': |
| 120 self.dset = self.dset.where(self.dset >= ll) | 130 self.dset = self.dset.where(self.dset >= ll) |
| 121 elif op == 'e': | 131 elif op == 'e': |
| 139 self.datetime_selection() | 149 self.datetime_selection() |
| 140 if self.filter: | 150 if self.filter: |
| 141 self.filter_selection() | 151 self.filter_selection() |
| 142 | 152 |
| 143 self.area_selection() | 153 self.area_selection() |
| 144 # convert to dataframe | 154 if self.gset.count() > 1: |
| 145 self.gset = self.gset.to_dataframe().dropna(how='all').reset_index() | 155 # convert to dataframe if several rows and cols |
| 146 self.gset.to_csv(self.outfile, header=True, sep='\t') | 156 self.gset = self.gset.to_dataframe().dropna(how='all'). \ |
| 157 reset_index() | |
| 158 self.gset.to_csv(self.outfile, header=True, sep='\t') | |
| 159 else: | |
| 160 data = { | |
| 161 self.latname: [self.gset[self.latname].values], | |
| 162 self.lonname: [self.gset[self.lonname].values], | |
| 163 self.select: [self.gset.values] | |
| 164 } | |
| 165 | |
| 166 df = pd.DataFrame(data, columns=[self.latname, self.lonname, | |
| 167 self.select]) | |
| 168 df.to_csv(self.outfile, header=True, sep='\t') | |
| 147 | 169 |
| 148 def datetime_selection(self): | 170 def datetime_selection(self): |
| 149 split_filter = self.time.split('#') | 171 split_filter = self.time.split('#') |
| 150 time_varname = split_filter[0] | 172 time_varname = split_filter[0] |
| 151 op = split_filter[1] | 173 op = split_filter[1] |
| 163 def filter_selection(self): | 185 def filter_selection(self): |
| 164 for single_filter in self.filter: | 186 for single_filter in self.filter: |
| 165 self.rowfilter(single_filter) | 187 self.rowfilter(single_filter) |
| 166 | 188 |
| 167 def area_selection(self): | 189 def area_selection(self): |
| 190 | |
| 168 if self.latvalS != "" and self.lonvalW != "": | 191 if self.latvalS != "" and self.lonvalW != "": |
| 169 # Select geographical area | 192 # Select geographical area |
| 170 self.gset = self.dset.sel({self.latname: | 193 self.gset = self.dset.sel({self.latname: |
| 171 slice(self.latvalS, self.latvalN), | 194 slice(self.latvalS, self.latvalN), |
| 172 self.lonname: | 195 self.lonname: |
| 173 slice(self.lonvalW, self.lonvalE)}) | 196 slice(self.lonvalW, self.lonvalE)}) |
| 174 elif self.latvalN != "" and self.lonvalE != "": | 197 elif self.latvalN != "" and self.lonvalE != "": |
| 175 # select nearest location | 198 # select nearest location |
| 176 self.nearest_location() # find nearest location without NaN values | 199 if self.no_missing: |
| 177 self.gset = self.dset.sel({self.latname: self.nearest_latvalN, | 200 self.nearest_latvalN = self.latvalN |
| 178 self.lonname: self.nearest_lonvalE}, | 201 self.nearest_lonvalE = self.lonvalE |
| 179 method='nearest') | 202 else: |
| 203 # find nearest location without NaN values | |
| 204 self.nearest_location() | |
| 205 if self.tolerance > 0: | |
| 206 self.gset = self.dset.sel({self.latname: self.nearest_latvalN, | |
| 207 self.lonname: self.nearest_lonvalE}, | |
| 208 method='nearest', | |
| 209 tolerance=self.tolerance) | |
| 210 else: | |
| 211 self.gset = self.dset.sel({self.latname: self.nearest_latvalN, | |
| 212 self.lonname: self.nearest_lonvalE}, | |
| 213 method='nearest') | |
| 180 else: | 214 else: |
| 181 self.gset = self.dset | 215 self.gset = self.dset |
| 182 | 216 |
| 183 def nearest_location(self): | 217 def nearest_location(self): |
| 184 # Build a geopandas dataframe with all first elements in each dimension | 218 # Build a geopandas dataframe with all first elements in each dimension |
| 204 def selection_from_coords(self): | 238 def selection_from_coords(self): |
| 205 fcoords = pd.read_csv(self.coords, sep='\t') | 239 fcoords = pd.read_csv(self.coords, sep='\t') |
| 206 for row in fcoords.itertuples(): | 240 for row in fcoords.itertuples(): |
| 207 self.latvalN = row[0] | 241 self.latvalN = row[0] |
| 208 self.lonvalE = row[1] | 242 self.lonvalE = row[1] |
| 209 self.outfile = (self.outputdir + '/' + self.select + '_' + str(row.Index) + '.tabular') | 243 self.outfile = (os.path.join(self.outputdir, |
| 244 self.select + '_' + | |
| 245 str(row.Index) + '.tabular')) | |
| 210 self.selection() | 246 self.selection() |
| 247 | |
| 248 def get_coords_info(self): | |
| 249 ds = xr.open_dataset(self.infile) | |
| 250 for c in ds.coords: | |
| 251 filename = os.path.join(self.coords_info, | |
| 252 c.strip() + | |
| 253 '.tabular') | |
| 254 pd = ds.coords[c].to_pandas() | |
| 255 pd.index = range(len(pd)) | |
| 256 pd.to_csv(filename, header=False, sep='\t') | |
| 211 | 257 |
| 212 | 258 |
| 213 if __name__ == '__main__': | 259 if __name__ == '__main__': |
| 214 warnings.filterwarnings("ignore") | 260 warnings.filterwarnings("ignore") |
| 215 parser = argparse.ArgumentParser() | 261 parser = argparse.ArgumentParser() |
| 253 parser.add_argument( | 299 parser.add_argument( |
| 254 '--lonvalW', | 300 '--lonvalW', |
| 255 help='West longitude value' | 301 help='West longitude value' |
| 256 ) | 302 ) |
| 257 parser.add_argument( | 303 parser.add_argument( |
| 304 '--tolerance', | |
| 305 help='Maximum distance between original and selected value for ' | |
| 306 ' inexact matches e.g. abs(index[indexer] - target) <= tolerance' | |
| 307 ) | |
| 308 parser.add_argument( | |
| 258 '--coords', | 309 '--coords', |
| 259 help='Input file containing Latitude and Longitude' | 310 help='Input file containing Latitude and Longitude' |
| 260 'for geographical selection' | 311 'for geographical selection' |
| 261 ) | 312 ) |
| 262 parser.add_argument( | 313 parser.add_argument( |
| 314 '--coords_info', | |
| 315 help='output-folder where for each coordinate, coordinate values ' | |
| 316 ' are being printed in the corresponding outputfile' | |
| 317 ) | |
| 318 parser.add_argument( | |
| 263 '--filter', | 319 '--filter', |
| 264 nargs="*", | 320 nargs="*", |
| 265 help='Filter list variable#operator#value_s#value_e' | 321 help='Filter list variable#operator#value_s#value_e' |
| 266 ) | 322 ) |
| 267 parser.add_argument( | 323 parser.add_argument( |
| 279 '(valid only when --select)' | 335 '(valid only when --select)' |
| 280 ) | 336 ) |
| 281 parser.add_argument( | 337 parser.add_argument( |
| 282 "-v", "--verbose", | 338 "-v", "--verbose", |
| 283 help="switch on verbose mode", | 339 help="switch on verbose mode", |
| 340 action="store_true" | |
| 341 ) | |
| 342 parser.add_argument( | |
| 343 "--no_missing", | |
| 344 help="""Do not take into account possible null/missing values | |
| 345 (only valid for single location)""", | |
| 284 action="store_true" | 346 action="store_true" |
| 285 ) | 347 ) |
| 286 args = parser.parse_args() | 348 args = parser.parse_args() |
| 287 | 349 |
| 288 p = XarrayTool(args.infile, args.info, args.summary, args.select, | 350 p = XarrayTool(args.infile, args.info, args.summary, args.select, |
| 289 args.outfile, args.outputdir, args.latname, | 351 args.outfile, args.outputdir, args.latname, |
| 290 args.latvalN, args.latvalS, args.lonname, | 352 args.latvalN, args.latvalS, args.lonname, |
| 291 args.lonvalE, args.lonvalW, args.filter, | 353 args.lonvalE, args.lonvalW, args.filter, |
| 292 args.coords, args.time, args.verbose) | 354 args.coords, args.time, args.verbose, |
| 355 args.no_missing, args.coords_info, args.tolerance) | |
| 293 if args.info: | 356 if args.info: |
| 294 p.info() | 357 p.info() |
| 295 if args.summary: | 358 if args.summary: |
| 296 p.summary() | 359 p.summary() |
| 297 if args.coords: | 360 if args.coords: |
| 298 p.selection_from_coords() | 361 p.selection_from_coords() |
| 299 elif args.select: | 362 elif args.select: |
| 300 p.selection() | 363 p.selection() |
| 364 elif args.coords_info: | |
| 365 p.get_coords_info() |
