Mercurial > repos > greg > validate_temperature_data
diff validate_temperature_data.py @ 0:4290854f3af5 draft
Uploaded
author | greg |
---|---|
date | Tue, 27 Nov 2018 09:47:31 -0500 |
parents | |
children | cfe1ce427aa7 |
line wrap: on
line diff
#!/usr/bin/env python
"""Validate a temperature data file and copy it to the output when valid.

Two comma-separated layouts are supported, selected with ``--data_type``:

* ``normals`` - 30 year normals: one header line followed by exactly 366
  data lines of 10 columns (see ``NORMALS_HEADER``).
* anything else (``actuals``) - daily actuals: one header line followed by
  at most 366 data lines of 6 columns (see ``ACTUALS_HEADER``).

Recoverable problems are accumulated and reported together; structural
problems (wrong column count, too many lines, non-consecutive DOY) stop the
run immediately.  Line numbers in messages are zero-based file indexes, so
the header is line 0 and the first data row is line 1.
"""
import argparse
import datetime
import decimal
import re
import shutil
import sys

parser = argparse.ArgumentParser()
parser.add_argument('--data_type', dest='data_type', default=None, help='Temperature data type, normals or actuals')
parser.add_argument('--input_actuals', dest='input_actuals', default=None, help='Daily actuals temperature data')
parser.add_argument('--input_normals', dest='input_normals', default=None, help='30 year normals temperature data')
parser.add_argument('--output', dest='output', help='Output dataset')

ACTUALS_HEADER = "LATITUDE,LONGITUDE,DATE,DOY,TMIN,TMAX"
NORMALS_HEADER = "stationid,latitude,longitude,elev_m,name,st,mmdd,doy,tmin,tmax"
ACTUALS_NUM_COLUMNS = 6
NORMALS_NUM_COLUMNS = 10
# A file may hold at most one data row per day of a leap year.
MAX_DATA_ROWS = 366


def add_error_msg(accumulated_msgs, msg):
    """Return accumulated_msgs with msg appended on a new line."""
    return "%s\n%s" % (accumulated_msgs, msg)


def empty_value(line_no, label, accumulated_msgs):
    """Record that the required ``label`` value is missing on ``line_no``."""
    return add_error_msg(accumulated_msgs, "The required %s value is missing on line %d." % (label, line_no))


def stop_error(msg):
    """Terminate the program, writing msg to stderr (raises SystemExit)."""
    sys.exit(msg)


def validate_date_string(line_no, date_string, accumulated_msgs):
    """Append an error to accumulated_msgs unless date_string is YYYY-MM-DD."""
    try:
        datetime.datetime.strptime(date_string, '%Y-%m-%d')
    except ValueError:
        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (%s must be YYYY-MM-DD)." % (line_no, date_string))
    return accumulated_msgs


def validate_decimal(line_no, decimal_string, accumulated_msgs, label):
    """Append an error unless decimal_string parses as a decimal.Decimal."""
    try:
        decimal.Decimal(decimal_string)
    except (decimal.InvalidOperation, TypeError, ValueError):
        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect %s decimal value (%s)." % (line_no, label, decimal_string))
    return accumulated_msgs


def validate_integer(line_no, integer_string, accumulated_msgs, label):
    """Append an error unless integer_string consists only of digits."""
    if integer_string.isdigit():
        return accumulated_msgs
    return add_error_msg(accumulated_msgs, "Line %d contains an incorrect %s integer value (%s)." % (line_no, label, integer_string))


def validate_mmdd(line_no, mmdd, accumulated_msgs):
    """Append an error to accumulated_msgs unless mmdd is a valid mm-dd date."""
    try:
        # Parse against a leap year: a bare '%m-%d' format defaults to the
        # year 1900, which would wrongly reject 02-29.
        datetime.datetime.strptime("2000-%s" % mmdd, '%Y-%m-%d')
    except ValueError:
        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (%s must be mm-dd)." % (line_no, mmdd))
    return accumulated_msgs


def _check_consecutive_doy(line_no, doy, last_doy, accumulated_msgs):
    """Stop if doy does not follow last_doy; return the DOY to remember.

    last_doy may be None, meaning no DOY has been seen yet, in which case any
    integer is accepted.  A non-integer doy leaves last_doy unchanged.
    """
    try:
        doy_value = int(doy)
    except ValueError:
        # The error for an invalid integer was captured by validate_integer.
        return last_doy
    if last_doy is not None and doy_value != last_doy + 1:
        stop_error(add_error_msg(accumulated_msgs, "Line %d contains a DOY (%s) that is not consecutive." % (line_no, doy)))
    return doy_value


def _validate_normals_row(line_no, items, last_doy, accumulated_msgs):
    """Validate one 10-column normals row; return (accumulated_msgs, last_doy)."""
    stationid, latitude, longitude, elev_m, name, st, mmdd, doy, tmin, tmax = items
    if not stationid:
        accumulated_msgs = empty_value(line_no, "stationid", accumulated_msgs)
    accumulated_msgs = validate_decimal(line_no, latitude, accumulated_msgs, "latitude")
    accumulated_msgs = validate_decimal(line_no, longitude, accumulated_msgs, "longitude")
    accumulated_msgs = validate_decimal(line_no, elev_m, accumulated_msgs, "elev_m")
    if not name:
        accumulated_msgs = empty_value(line_no, "name", accumulated_msgs)
    if not st:
        accumulated_msgs = empty_value(line_no, "st", accumulated_msgs)
    accumulated_msgs = validate_mmdd(line_no, mmdd, accumulated_msgs)
    accumulated_msgs = validate_integer(line_no, doy, accumulated_msgs, "doy")
    # Make sure the DOY values are consecutive.
    last_doy = _check_consecutive_doy(line_no, doy, last_doy, accumulated_msgs)
    accumulated_msgs = validate_decimal(line_no, tmin, accumulated_msgs, "tmin")
    accumulated_msgs = validate_decimal(line_no, tmax, accumulated_msgs, "tmax")
    return accumulated_msgs, last_doy


def _validate_actuals_row(line_no, items, last_doy, accumulated_msgs):
    """Validate one 6-column actuals row; return (accumulated_msgs, last_doy)."""
    latitude, longitude, date_string, doy, tmin, tmax = items
    accumulated_msgs = validate_decimal(line_no, latitude, accumulated_msgs, "LATITUDE")
    accumulated_msgs = validate_decimal(line_no, longitude, accumulated_msgs, "LONGITUDE")
    accumulated_msgs = validate_date_string(line_no, date_string, accumulated_msgs)
    accumulated_msgs = validate_integer(line_no, doy, accumulated_msgs, "doy")
    # Make sure the DOY values are consecutive.
    last_doy = _check_consecutive_doy(line_no, doy, last_doy, accumulated_msgs)
    accumulated_msgs = validate_decimal(line_no, tmin, accumulated_msgs, "tmin")
    accumulated_msgs = validate_decimal(line_no, tmax, accumulated_msgs, "tmax")
    return accumulated_msgs, last_doy


def _validate_input(input_file, data_type):
    """Validate input_file and return the accumulated messages ("" if valid).

    Structural problems call stop_error() and therefore raise SystemExit
    instead of returning.
    """
    is_normals = data_type == "normals"
    if is_normals:
        header = NORMALS_HEADER
        num_columns = NORMALS_NUM_COLUMNS
        validate_row = _validate_normals_row
        too_many_lines_msg = "The input file contains more than 367 lines (must be 1 header line and 366 data lines)."
        # Normals must start at DOY 1.
        last_doy = 0
    else:
        header = ACTUALS_HEADER
        num_columns = ACTUALS_NUM_COLUMNS
        validate_row = _validate_actuals_row
        too_many_lines_msg = "The input file contains more than 367 lines (must be 1 header line and no more than 366 data lines)."
        # Actuals may start on any DOY; the first data row sets the baseline.
        last_doy = None
    invalid_header_msg = "The header is invalid, must be %s" % header

    accumulated_msgs = ""
    num_data_rows = 0
    saw_header = False
    with open(input_file, "r") as ih:
        for i, line in enumerate(ih):
            line = line.rstrip("\r\n")
            if i == 0:
                saw_header = True
                if line != header:
                    accumulated_msgs = add_error_msg(accumulated_msgs, invalid_header_msg)
                # The header is not a data row, so skip the row validation.
                continue
            if i > MAX_DATA_ROWS:
                stop_error(add_error_msg(accumulated_msgs, too_many_lines_msg))
            items = line.split(",")
            if len(items) != num_columns:
                stop_error(add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be %d)." % (i, len(items), num_columns)))
            accumulated_msgs, last_doy = validate_row(i, items, last_doy, accumulated_msgs)
            num_data_rows += 1
    if not saw_header:
        # An empty file has no header at all.
        accumulated_msgs = add_error_msg(accumulated_msgs, invalid_header_msg)
    if is_normals and num_data_rows != MAX_DATA_ROWS:
        accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains %d data rows, (must be %d)." % (num_data_rows, MAX_DATA_ROWS))
    return accumulated_msgs


def main(argv=None):
    """Validate the selected input and copy it to --output when it is valid.

    argv is the list of command line arguments; None means sys.argv[1:].
    Exits through stop_error() (SystemExit) with the accumulated messages
    when validation fails.
    """
    args = parser.parse_args(argv)
    if args.data_type == "normals":
        input_file = args.input_normals
    else:
        input_file = args.input_actuals
    if input_file is None:
        stop_error("No input file was specified for the selected data type.")
    accumulated_msgs = _validate_input(input_file, args.data_type)
    if len(accumulated_msgs) > 0:
        stop_error(accumulated_msgs)
    shutil.copyfile(input_file, args.output)


if __name__ == "__main__":
    main()