Mercurial > repos > greg > validate_affy_metadata
comparison validate_affy_metadata.py @ 25:d9f3bcfeecfe draft default tip
Uploaded
| author | greg |
|---|---|
| date | Thu, 15 Aug 2019 13:15:22 -0400 |
| parents | 8a826d1afe69 |
| children | |
comparison
equal
deleted
inserted
replaced
| 24:8a826d1afe69 | 25:d9f3bcfeecfe |
|---|---|
| 35 return 'True' | 35 return 'True' |
| 36 else: | 36 else: |
| 37 return 'False' | 37 return 'False' |
| 38 | 38 |
| 39 | 39 |
| 40 def validate_date_string(line_no, date_string, accumulated_msgs): | 40 def validate_date_string(line_no, date_string, column, accumulated_msgs): |
| 41 if len(date_string) == 0: | 41 if len(date_string) == 0: |
| 42 return accumulated_msgs | 42 return accumulated_msgs |
| 43 try: | 43 try: |
| 44 datetime.datetime.strptime(date_string, '%Y-%m-%d') | 44 datetime.datetime.strptime(date_string, '%Y-%m-%d') |
| 45 return accumulated_msgs | 45 return accumulated_msgs |
| 46 except ValueError: | 46 except ValueError: |
| 47 return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (%s must be YYYY-MM-DD)." % (line_no, date_string)) | 47 return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (%s must be YYYY-MM-DD) for column %s." % (line_no, date_string, column)) |
| 48 | 48 |
| 49 | 49 |
| 50 def validate_decimal(line_no, decimal_string, accumulated_msgs): | 50 def validate_decimal(line_no, decimal_string, column, accumulated_msgs): |
| 51 try: | 51 try: |
| 52 decimal.Decimal(decimal_string) | 52 decimal.Decimal(decimal_string) |
| 53 return accumulated_msgs | 53 return accumulated_msgs |
| 54 except Exception: | 54 except Exception: |
| 55 return add_error_msg(accumulated_msgs, "Line %d contains an incorrect decimal value (%s)." % (line_no, decimal_string)) | 55 return add_error_msg(accumulated_msgs, "Line %d contains an incorrect decimal value (%s) for column %s." % (line_no, decimal_string, column)) |
| 56 | 56 |
| 57 | 57 |
| 58 def validate_email(line_no, email, accumulated_msgs): | 58 def validate_email(line_no, email, accumulated_msgs): |
| 59 if not (VALID_EMAIL_RE.match(email)): | 59 if not (VALID_EMAIL_RE.match(email)): |
| 60 return add_error_msg(accumulated_msgs, "Line %d contains an invalid email address (%s). " % (line_no, email)) | 60 return add_error_msg(accumulated_msgs, "Line %d contains an invalid email address (%s). " % (line_no, email)) |
| 69 with open(args.input, "r") as ih: | 69 with open(args.input, "r") as ih: |
| 70 for i, line in enumerate(ih): | 70 for i, line in enumerate(ih): |
| 71 if i == 0: | 71 if i == 0: |
| 72 # Skip the header. | 72 # Skip the header. |
| 73 continue | 73 continue |
| 74 # Keep 1-based line value for error messages. | |
| 75 line_no = i + 1 | |
| 74 line = line.rstrip("\r\n") | 76 line = line.rstrip("\r\n") |
| 75 if i > 97: | 77 if i > 97: |
| 76 accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 97 lines (must be 1 header line and no more than 96 data lines).") | 78 accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 97 lines (must be 1 header line and no more than 96 data lines).") |
| 77 stop_error(accumulated_msgs) | 79 stop_error(accumulated_msgs) |
| 78 items = line.split("\t") | 80 items = line.split("\t") |
| 79 if len(items) != 32: | 81 if len(items) != 32: |
| 80 accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 32)." % (i, len(items))) | 82 accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 32)." % (line_no, len(items))) |
| 81 stop_error(accumulated_msgs) | 83 stop_error(accumulated_msgs) |
| 82 # Required and validated. | 84 # Required and validated. |
| 83 # Required. | 85 # Required. |
| 84 user_specimen_id = items[0] | 86 user_specimen_id = items[0] |
| 85 if len(user_specimen_id) == 0: | 87 if len(user_specimen_id) == 0: |
| 86 accumulated_msgs = empty_value(i, "user_specimen_id", accumulated_msgs) | 88 accumulated_msgs = empty_value(line_no, "user_specimen_id", accumulated_msgs) |
| 87 # Optional. | 89 # Optional. |
| 88 field_call = items[1] | 90 field_call = items[1] |
| 89 # Optional. | 91 # Optional. |
| 90 bcoral_genet_id = items[2] | 92 bcoral_genet_id = items[2] |
| 91 # Optional. | 93 # Optional. |
| 92 bsym_genet_id = items[3] | 94 bsym_genet_id = items[3] |
| 93 # Required. | 95 # Required. |
| 94 reef = items[4] | 96 reef = items[4] |
| 95 if len(reef) == 0: | 97 if len(reef) == 0: |
| 96 accumulated_msgs = empty_value(i, "reef", accumulated_msgs) | 98 accumulated_msgs = empty_value(line_no, "reef", accumulated_msgs) |
| 97 # Required. | 99 # Required. |
| 98 region = items[5] | 100 region = items[5] |
| 99 if len(region) == 0: | 101 if len(region) == 0: |
| 100 accumulated_msgs = empty_value(i, "region", accumulated_msgs) | 102 accumulated_msgs = empty_value(line_no, "region", accumulated_msgs) |
| 101 # Required and validated. | 103 # Required and validated. |
| 102 latitude = items[6] | 104 latitude = items[6] |
| 103 accumulated_msgs = validate_decimal(i, latitude, accumulated_msgs) | 105 accumulated_msgs = validate_decimal(line_no, latitude, "latitude", accumulated_msgs) |
| 104 # Required and validated. | 106 # Required and validated. |
| 105 longitude = items[7] | 107 longitude = items[7] |
| 106 accumulated_msgs = validate_decimal(i, longitude, accumulated_msgs) | 108 accumulated_msgs = validate_decimal(line_no, longitude, "longitude", accumulated_msgs) |
| 107 # Optional. | 109 # Optional. |
| 108 geographic_origin = items[8] | 110 geographic_origin = items[8] |
| 109 # Optional. | 111 # Optional. |
| 110 sample_location = items[9] | 112 colony_location = items[9] |
| 111 # Optional. | 113 # Optional. |
| 112 latitude_outplant = items[10] | 114 depth = items[10] |
| 113 # Optional. | 115 # Optional. |
| 114 longitude_outplant = items[11] | 116 disease_resist = items[11] |
| 115 # Optional. | 117 # Optional. |
| 116 depth = items[12] | 118 bleach_resist = items[12] |
| 117 # Optional. | 119 # Optional. |
| 118 disease_resist = items[13] | 120 mortality = items[13] |
| 119 # Optional. | 121 # Optional. |
| 120 bleach_resist = items[14] | 122 tle = items[14] |
| 121 # Optional. | 123 # Optional. |
| 122 mortality = items[15] | 124 spawning = string_as_boolean_string(items[15]) |
| 125 # Required. | |
| 126 collector_last_name = items[16] | |
| 127 if len(collector_last_name) == 0: | |
| 128 accumulated_msgs = empty_value(line_no, "collector_last_name", accumulated_msgs) | |
| 129 # Required. | |
| 130 collector_first_name = items[17] | |
| 131 if len(collector_first_name) == 0: | |
| 132 accumulated_msgs = empty_value(line_no, "collector_first_name", accumulated_msgs) | |
| 133 # Required. | |
| 134 org = items[18] | |
| 135 if len(org) == 0: | |
| 136 accumulated_msgs = empty_value(line_no, "org", accumulated_msgs) | |
| 137 # Required and validated. | |
| 138 collection_date = items[19] | |
| 139 accumulated_msgs = validate_date_string(line_no, collection_date, "collection_date", accumulated_msgs) | |
| 140 # Required and validated. | |
| 141 contact_email = items[20] | |
| 142 accumulated_msgs = validate_email(line_no, contact_email, accumulated_msgs) | |
| 143 # Required. | |
| 144 seq_facility = items[21] | |
| 145 if len(seq_facility) == 0: | |
| 146 accumulated_msgs = empty_value(line_no, "seq_facility", accumulated_msgs) | |
| 123 # Optional. | 147 # Optional. |
| 124 tle = items[16] | 148 array_version = items[22] |
| 125 # Optional. | 149 # Optional. |
| 126 spawning = string_as_boolean_string(items[17]) | 150 public = string_as_boolean_string(items[23]) |
| 127 # Required. | 151 # Optional. |
| 128 collector_last_name = items[18] | 152 public_after_date = items[24] |
| 129 if len(collector_last_name) == 0: | 153 accumulated_msga = validate_date_string(line_no, public_after_date, "public_after_date", accumulated_msgs) |
| 130 accumulated_msgs = empty_value(i, "collector_last_name", accumulated_msgs) | |
| 131 # Required. | |
| 132 collector_first_name = items[19] | |
| 133 if len(collector_first_name) == 0: | |
| 134 accumulated_msgs = empty_value(i, "collector_first_name", accumulated_msgs) | |
| 135 # Required. | |
| 136 org = items[20] | |
| 137 if len(org) == 0: | |
| 138 accumulated_msgs = empty_value(i, "org", accumulated_msgs) | |
| 139 # Required and validated. | 154 # Required and validated. |
| 140 collection_date = items[21] | 155 sperm_motility = items[25] |
| 141 accumulated_msgs = validate_date_string(i, collection_date, accumulated_msgs) | 156 accumulated_msgs = validate_decimal(line_no, sperm_motility, "sperm_motility", accumulated_msgs) |
| 142 # Required and validated. | 157 # Required and validated. |
| 143 contact_email = items[22] | 158 healing_time = items[26] |
| 144 accumulated_msgs = validate_email(i, contact_email, accumulated_msgs) | 159 accumulated_msgs = validate_decimal(line_no, healing_time, "healing_time", accumulated_msgs) |
| 145 # Required. | |
| 146 seq_facility = items[23] | |
| 147 if len(seq_facility) == 0: | |
| 148 accumulated_msgs = empty_value(i, "seq_facility", accumulated_msgs) | |
| 149 # Optional. | 160 # Optional. |
| 150 array_version = items[24] | 161 dna_extraction_method = items[27] |
| 151 # Optional. | 162 # Optional. |
| 152 public = string_as_boolean_string(items[25]) | 163 dna_concentration = items[28] |
| 153 # Optional. | |
| 154 public_after_date = items[26] | |
| 155 accumulated_msga = validate_date_string(i, public_after_date, accumulated_msgs) | |
| 156 # Required and validated. | |
| 157 sperm_motility = items[27] | |
| 158 accumulated_msgs = validate_decimal(i, sperm_motility, accumulated_msgs) | |
| 159 # Required and validated. | |
| 160 healing_time = items[28] | |
| 161 accumulated_msgs = validate_decimal(i, healing_time, accumulated_msgs) | |
| 162 # Optional. | |
| 163 dna_extraction_method = items[29] | |
| 164 # Optional. | |
| 165 dna_concentration = items[30] | |
| 166 # If dna_concentration has a value, then it must be decimal. | 164 # If dna_concentration has a value, then it must be decimal. |
| 167 if len(dna_concentration) > 0: | 165 if len(dna_concentration) > 0: |
| 168 accumulated_msgs = validate_decimal(i, dna_concentration, accumulated_msgs) | 166 accumulated_msgs = validate_decimal(line_no, dna_concentration, "dna_concentration", accumulated_msgs) |
| 169 # Optional. | 167 # Optional. |
| 170 registry_id = items[31] | 168 registry_id = items[29] |
| 171 | 169 # Optional. |
| 170 result_folder_name = items[30] | |
| 171 # Optional. | |
| 172 plate_barcode = items[31] | |
| 173 | |
| 172 | 174 |
| 173 if len(accumulated_msgs) > 0: | 175 if len(accumulated_msgs) > 0: |
| 174 stop_error(accumulated_msgs) | 176 stop_error(accumulated_msgs) |
| 175 | 177 |
| 176 shutil.copyfile(args.input, args.output) | 178 shutil.copyfile(args.input, args.output) |
