Mercurial > repos > greg > validate_temperature_data
comparison validate_temperature_data.py @ 0:4290854f3af5 draft
Uploaded
author | greg |
---|---|
date | Tue, 27 Nov 2018 09:47:31 -0500 |
parents | |
children | cfe1ce427aa7 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4290854f3af5 |
---|---|
1 #!/usr/bin/env python | |
2 import argparse | |
3 import datetime | |
4 import decimal | |
5 import re | |
6 import shutil | |
7 import sys | |
8 | |
# Command-line interface: the kind of temperature data being validated, the
# candidate input file for each kind, and the path the validated copy goes to.
# Each option is stored on ``args`` under its own name and is None when omitted.
parser = argparse.ArgumentParser()
parser.add_argument('--data_type', help='Temperature data type, normals or actuals')
parser.add_argument('--input_actuals', help='Daily actuals temperature data')
parser.add_argument('--input_normals', help='30 year normals temperature data')
parser.add_argument('--output', help='Output dataset')
args = parser.parse_args()

# Exact header line that each kind of input file must start with.
ACTUALS_HEADER = ",".join(("LATITUDE", "LONGITUDE", "DATE", "DOY", "TMIN", "TMAX"))
NORMALS_HEADER = ",".join(("stationid", "latitude", "longitude", "elev_m", "name", "st", "mmdd", "doy", "tmin", "tmax"))
18 | |
def add_error_msg(accumulated_msgs, msg):
    """Return the accumulated messages with msg appended on a new line.

    A newline is always inserted between the two parts, so appending to an
    empty accumulator yields a string that starts with a newline.
    """
    return "{}\n{}".format(accumulated_msgs, msg)
21 | |
22 | |
def empty_value(line_no, label, accumulated_msgs):
    """Record that the required column named label is empty on line line_no.

    Returns the accumulated messages with the new error appended.
    """
    missing_msg = "The required %s value is missing on line %d." % (label, line_no)
    return add_error_msg(accumulated_msgs, missing_msg)
25 | |
26 | |
def stop_error(msg):
    """Terminate the script immediately, using msg as the exit message."""
    # Raising SystemExit directly is exactly what sys.exit() does.
    raise SystemExit(msg)
29 | |
30 | |
def validate_date_string(line_no, date_string, accumulated_msgs):
    """Check that date_string is a valid YYYY-MM-DD date.

    Returns accumulated_msgs unchanged when the date parses, otherwise
    returns it with an error message for line line_no appended.
    """
    try:
        datetime.datetime.strptime(date_string, '%Y-%m-%d')
    except ValueError:
        bad_format_msg = "Line %d contains an incorrect date format (%s must be YYYY-MM-DD)." % (line_no, date_string)
        return add_error_msg(accumulated_msgs, bad_format_msg)
    else:
        return accumulated_msgs
37 | |
38 | |
def validate_decimal(line_no, decimal_string, accumulated_msgs, label):
    """Check that decimal_string can be converted by decimal.Decimal.

    Returns accumulated_msgs unchanged when the conversion succeeds,
    otherwise returns it with an error message naming the label column
    and line line_no appended.
    """
    try:
        decimal.Decimal(decimal_string)
    except Exception:
        bad_value_msg = "Line %d contains an incorrect %s decimal value (%s)." % (line_no, label, decimal_string)
        return add_error_msg(accumulated_msgs, bad_value_msg)
    else:
        return accumulated_msgs
45 | |
46 | |
def validate_integer(line_no, integer_string, accumulated_msgs, label):
    """Check that integer_string consists only of digits.

    Signs, whitespace and the empty string are rejected because
    str.isdigit() is False for them.

    Returns accumulated_msgs unchanged when the value is valid, otherwise
    returns it with an error message naming the label column and line
    line_no appended.
    """
    if integer_string.isdigit():
        return accumulated_msgs
    # Report the offending value itself; the previous code referenced an
    # undefined name here and raised NameError instead of recording the error.
    return add_error_msg(accumulated_msgs, "Line %d contains an incorrect %s integer value (%s)." % (line_no, label, integer_string))
51 | |
52 | |
def validate_mmdd(line_no, mmdd, accumulated_msgs):
    """Check that mmdd is a valid month-day string in mm-dd form.

    Returns accumulated_msgs unchanged when the value parses, otherwise
    returns it with an error message for line line_no appended.
    """
    try:
        # Parse against a fixed leap year: with a bare '%m-%d' format the
        # year defaults to 1900, which is not a leap year, so the valid
        # leap day 02-29 would be rejected.
        datetime.datetime.strptime("2000-%s" % mmdd, '%Y-%m-%d')
        return accumulated_msgs
    except ValueError:
        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (%s must be mm-dd)." % (line_no, mmdd))
59 | |
60 | |
def _next_doy(line_no, doy, last_doy, accumulated_msgs):
    """Return the DOY to remember after checking that doy follows last_doy.

    When last_doy is None there is nothing to compare against yet and doy
    simply seeds the sequence.  A doy that is not an integer leaves the
    sequence untouched (that error is reported separately by
    validate_integer).  A doy that breaks the sequence stops the script
    with all messages accumulated so far.
    """
    try:
        doy_value = int(doy)
    except ValueError:
        # The error for an invalid integer was captured by validate_integer.
        return last_doy
    if last_doy is not None and doy_value != last_doy + 1:
        stop_error(add_error_msg(accumulated_msgs, "Line %d contains a DOY (%s) that is not consecutive." % (line_no, doy)))
    return doy_value


accumulated_msgs = ""
# Parse the input file, skipping the header, and validating that each data
# line consists of the expected number of comma-separated items (10 for
# normals, 6 for actuals).  Line numbers in messages are the 0-based index
# of the line in the file, i.e. the header is line 0 and the first data
# row is line 1.
is_normals = args.data_type == "normals"
if is_normals:
    input_file = args.input_normals
    header = NORMALS_HEADER
    # Normals must start at DOY 1, so seed the sequence with 0.
    last_doy = 0
else:
    input_file = args.input_actuals
    header = ACTUALS_HEADER
    # Actuals may start at any DOY; the first data row seeds the sequence.
    last_doy = None
# Total number of lines read, header included.
num_lines = 0
with open(input_file, "r") as ih:
    for i, line in enumerate(ih):
        num_lines = i + 1
        line = line.rstrip("\r\n")
        if i == 0:
            if line != header:
                accumulated_msgs = add_error_msg(accumulated_msgs, "The header is invalid, must be %s" % header)
            # The header is not a data row, so skip the per-row checks.
            continue
        # Index 366 is the last permitted line (1 header + 366 data rows).
        if i > 366:
            if is_normals:
                accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 367 lines (must be 1 header line and 366 data lines).")
            else:
                accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 367 lines (must be 1 header line and no more than 366 data lines).")
            stop_error(accumulated_msgs)
        items = line.split(",")
        if is_normals:
            if len(items) != 10:
                accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 10)." % (i, len(items)))
                stop_error(accumulated_msgs)
            stationid, latitude, longitude, elev_m, name, st, mmdd, doy, tmin, tmax = items
            if len(stationid) == 0:
                accumulated_msgs = empty_value(i, "stationid", accumulated_msgs)
            accumulated_msgs = validate_decimal(i, latitude, accumulated_msgs, "latitude")
            accumulated_msgs = validate_decimal(i, longitude, accumulated_msgs, "longitude")
            accumulated_msgs = validate_decimal(i, elev_m, accumulated_msgs, "elev_m")
            if len(name) == 0:
                accumulated_msgs = empty_value(i, "name", accumulated_msgs)
            if len(st) == 0:
                accumulated_msgs = empty_value(i, "st", accumulated_msgs)
            accumulated_msgs = validate_mmdd(i, mmdd, accumulated_msgs)
            accumulated_msgs = validate_integer(i, doy, accumulated_msgs, "doy")
            # Make sure the DOY values are consecutive.
            last_doy = _next_doy(i, doy, last_doy, accumulated_msgs)
            accumulated_msgs = validate_decimal(i, tmin, accumulated_msgs, "tmin")
            accumulated_msgs = validate_decimal(i, tmax, accumulated_msgs, "tmax")
        else:
            if len(items) != 6:
                accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 6)." % (i, len(items)))
                stop_error(accumulated_msgs)
            latitude, longitude, date_string, doy, tmin, tmax = items
            accumulated_msgs = validate_decimal(i, latitude, accumulated_msgs, "LATITUDE")
            accumulated_msgs = validate_decimal(i, longitude, accumulated_msgs, "LONGITUDE")
            accumulated_msgs = validate_date_string(i, date_string, accumulated_msgs)
            accumulated_msgs = validate_integer(i, doy, accumulated_msgs, "doy")
            # Make sure the DOY values are consecutive.
            last_doy = _next_doy(i, doy, last_doy, accumulated_msgs)
            accumulated_msgs = validate_decimal(i, tmin, accumulated_msgs, "tmin")
            accumulated_msgs = validate_decimal(i, tmax, accumulated_msgs, "tmax")
if num_lines == 0:
    # An empty file never reaches the header check inside the loop.
    accumulated_msgs = add_error_msg(accumulated_msgs, "The header is invalid, must be %s" % header)
if is_normals and num_lines != 367:
    # Normals must contain exactly 1 header line and 366 data lines.
    accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains %d rows, (must be 367)." % num_lines)

if len(accumulated_msgs) > 0:
    stop_error(accumulated_msgs)

# The input passed every check, so hand it through unchanged as the output.
shutil.copyfile(input_file, args.output)