snippetpythonMinor
UK postcode validation and format correction tool
Viewed 0 times
formatcorrectionandvalidationpostcodetool
Problem
I've been working on a regex-based UK postcode validation and format correction tool - with the aim of creating a list of postcodes that can be readily geocoded. The variety of postcode formats is explained here. I have based my regex on a discussion here.
My correction tool is designed to cope with a number of commonly made mistakes when a postcode is inputted in a free text format as such:
I have managed this (somewhat clunky) here:
```
pc<-data.frame(postcode=c("GIR 0AA","M2 0AB","M2 OAB","M2 0ab","M1 1AA","M11AA","M60 1NW","M6O 1NW","M601NW","CR2 6XH","CR26XH","DN55 1PT","DN551PT","W1A 1HQ","W1A1HQ","w1a 1hq","EC1A 1BB","EC1A1BB"), true=c(TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE))
#Identify postcodes not in correct format
pc$original_validate<-grepl('^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y]))) [0-9][ABD-HJLNP-UW-Z]{2}))$',pc$postcode)
pc$non_pc<-ifelse(pc$original_validate ==FALSE,as.character(pc$postcode),"")
#Convert all to upper case
pc$new_pc<-toupper(pc$non_pc)
#Identify postcodes without a space and include a space
pc$non_pc2<-grepl('^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y]))) [0-9][ABD-HJLNP-UW-Z]{2}))$',pc$new_pc)
pc$non_pc2<-ifelse(pc$non_pc2==FALSE & pc$new_pc!="",as.character(pc$new_pc),"")
pc$new_pc2<-gsub("^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y])))))([0-9][ABD-HJLNP-UW-Z]{2})$","\\1 \\9",pc$non_pc2)
pc$non_pc<-NULL
pc$non_pc2<-NULL
#Identify postcodes containing an O instead of 0 in first half
pc$non_pc3<--grepl('^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|(
My correction tool is designed to cope with a number of commonly made mistakes when a postcode is inputted in a free text format as such:
- Postcode entered in lower case
- No space between first and second half of postcode
- O instead of 0 in two key areas e.g. CH6O 8MG -> CH60 8MG & M8 OFG -> M8 0FG
I have managed this (somewhat clunky) here:
```
pc<-data.frame(postcode=c("GIR 0AA","M2 0AB","M2 OAB","M2 0ab","M1 1AA","M11AA","M60 1NW","M6O 1NW","M601NW","CR2 6XH","CR26XH","DN55 1PT","DN551PT","W1A 1HQ","W1A1HQ","w1a 1hq","EC1A 1BB","EC1A1BB"), true=c(TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE))
#Identify postcodes not in correct format
pc$original_validate<-grepl('^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y]))) [0-9][ABD-HJLNP-UW-Z]{2}))$',pc$postcode)
pc$non_pc<-ifelse(pc$original_validate ==FALSE,as.character(pc$postcode),"")
#Convert all to upper case
pc$new_pc<-toupper(pc$non_pc)
#Identify postcodes without a space and include a space
pc$non_pc2<-grepl('^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y]))) [0-9][ABD-HJLNP-UW-Z]{2}))$',pc$new_pc)
pc$non_pc2<-ifelse(pc$non_pc2==FALSE & pc$new_pc!="",as.character(pc$new_pc),"")
pc$new_pc2<-gsub("^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y])))))([0-9][ABD-HJLNP-UW-Z]{2})$","\\1 \\9",pc$non_pc2)
pc$non_pc<-NULL
pc$non_pc2<-NULL
#Identify postcodes containing an O instead of 0 in first half
pc$non_pc3<--grepl('^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|(
Solution
My first suggestion is to write an is_valid function, since you are using this code in quite a few places:
Then, it is also a good idea to write a separate function for each of the fixes you have. It makes your code easier to test and reuse. Also, when using appropriate function names, it makes your code self-explanatory and can replace all the comments:
%%CODEBLOCK_1%%
With these, your code can simplify to:
%%CODEBLOCK_2%%
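Note how I am using vectors everywhere and only in the end putting the results in a data.frame. Indeed, there is arguably no need for a data.frame until you want to see the results in a nice format, at the very end. Keeping things in vectors avoids the repetitive pc$ and the annoying conversion from character vectors to factors (which is what forced you to use as.character).
I have slightly modified your algorithm, where instead of replacing valid postcodes by "" so they won't get affected by fixes, I am propagating a vector of valid or "best guess" codes (bg) via the construct: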
%%CODEBLOCK_3%%
It is shorter this way and removes the need for the error-prone paste() at the end of your code.
is_valid function since you are using this code in quite a few places:
is_valid <- function(x)
grepl('^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y]))) [0-9][ABD-HJLNP-UW-Z]{2}))$', x)
Then, it is also a good idea to write a separate function for each of the fixes you have. It makes your code easier to test and reuse. Also, when using appropriate function names, it makes your code self-explanatory and can replace all the comments:
# Upper-case every element of x; the validation pattern in this file only
# accepts upper-case letters.
fix_case <- function(x) {
  toupper(x)
}
# Insert the missing space between the two halves of a postcode typed
# without one (e.g. "M11AA" becomes "M1 1AA"). Elements that do not match
# the unspaced pattern are returned unchanged.
fix_space <- function(x) {
  # Capture group 1 is the first half, capture group 9 the second half.
  unspaced_pattern <- "^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y])))))([0-9][ABD-HJLNP-UW-Z]{2})$"
  sub(pattern = unspaced_pattern, replacement = "\\1 \\9", x = x)
}
# Replace a letter "O" that directly follows a letter/digit prefix with the
# digit zero, e.g. "M6O 1NW" becomes "M60 1NW". The pattern is unanchored
# and only the first match in each element is rewritten.
fix_zero_1 <- function(x) {
  prefix_then_o <- "([A-Z][0-9]|[A-Z][A-Z]|[A-Z][A-Z][0-9])[O]"
  # "\\10" is back-reference 1 followed by a literal "0".
  sub(pattern = prefix_then_o, replacement = "\\10", x = x)
}
# Replace a letter "O" typed in place of the digit zero at the start of the
# second half of a postcode, e.g. "M2 OAB" becomes "M2 0AB".
#
# x: character vector of candidate postcodes.
# Returns a character vector of the same length. Elements that do not have
# the shape "<first half> O<two letters>" are returned unchanged.
fix_zero_2 <- function(x) {
  # Capture group 1 is the first half; capture group 9 holds only the two
  # letters after the "O", so the zero is written directly and no second
  # clean-up pass over unrelated "0O" sequences is needed.
  sub(
    "^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y]))))) O([ABD-HJLNP-UW-Z]{2})$",
    "\\1 0\\9",
    x
  )
}
With these, your code can simplify to:
# Sample inputs: correctly formatted postcodes mixed with lower-case,
# unspaced and O-for-0 variants.
postcodes <- c(
  "GIR 0AA", "M2 0AB", "M2 OAB", "M2 0ab", "M1 1AA", "M11AA",
  "M60 1NW", "M6O 1NW", "M601NW", "CR2 6XH", "CR26XH", "DN55 1PT",
  "DN551PT", "W1A 1HQ", "W1A1HQ", "w1a 1hq", "EC1A 1BB", "EC1A1BB"
)
# Apply each fix in turn, but only to the codes that are still invalid:
# every step is bg <- ifelse(is_valid(bg), bg, some_fix(bg)).
bg <- Reduce(
  function(codes, fix) ifelse(is_valid(codes), codes, fix(codes)),
  list(fix_case, fix_space, fix_zero_1, fix_zero_2),
  postcodes
)

# Show the original input next to the best guess and its final validity.
data.frame(
  original = postcodes,
  best_guess = bg,
  is_valid = is_valid(bg)
)
Note how I am using vectors everywhere and only in the end putting the results in a data.frame. Indeed, there is arguably no need for a data.frame until you want to see the results in a nice format, at the very end. Keeping things in vectors avoids the repetitive pc$ and the annoying conversion from character vectors to factors (which is what forced you to use as.character).
I have slightly modified your algorithm, where instead of replacing valid postcodes by "" so they won't get affected by fixes, I am propagating a vector of valid or "best guess" codes (bg) via the construct:
bg <- ifelse(is_valid(bg), bg, som_fix(bg))
It is shorter this way and removes the need for the error-prone paste() at the end of your code.
Then, it is also a good idea to write a separate function for each of the fixes you have. It makes your code easier to test and reuse. Also, when using appropriate function names, it makes your code self-explanatory and can replace all the comments:
%%CODEBLOCK_1%%
With these, your code can simplify to:
%%CODEBLOCK_2%%
Note how I am using vectors everywhere and only in the end putting the results in a data.frame. Indeed, there is arguably no need for a data.frame until you want to see the results in a nice format, at the very end. Keeping things in vectors avoids the repetitive pc$ and the annoying conversion from character vectors to factors (which is what forced you to use as.character).
I have slightly modified your algorithm, where instead of replacing valid postcodes by "" so they won't get affected by fixes, I am propagating a vector of valid or "best guess" codes (bg) via the construct:
%%CODEBLOCK_3%%
It is shorter this way and removes the need for the error-prone paste() at the end of your code.
Code Snippets
# Test whether each element of x is a correctly formatted postcode.
#
# x: character vector of candidate postcodes.
# Returns a logical vector: TRUE where the whole element matches the
# pattern below (upper-case letters, a single space between the two
# halves), FALSE otherwise.
is_valid <- function(x) {
  grepl('^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y]))) [0-9][ABD-HJLNP-UW-Z]{2}))$', x)
}

# Upper-case every element of x; is_valid() only accepts upper-case letters.
fix_case <- toupper
# Insert the missing space between the two halves of a postcode typed
# without one (e.g. "M11AA" becomes "M1 1AA"). Elements that do not match
# the unspaced pattern are returned unchanged.
fix_space <- function(x) {
  # Capture group 1 is the first half, capture group 9 the second half.
  unspaced_pattern <- "^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y])))))([0-9][ABD-HJLNP-UW-Z]{2})$"
  sub(pattern = unspaced_pattern, replacement = "\\1 \\9", x = x)
}
# Replace a letter "O" that directly follows a letter/digit prefix with the
# digit zero, e.g. "M6O 1NW" becomes "M60 1NW". The pattern is unanchored
# and only the first match in each element is rewritten.
fix_zero_1 <- function(x) {
  prefix_then_o <- "([A-Z][0-9]|[A-Z][A-Z]|[A-Z][A-Z][0-9])[O]"
  # "\\10" is back-reference 1 followed by a literal "0".
  sub(pattern = prefix_then_o, replacement = "\\10", x = x)
}
# Replace a letter "O" typed in place of the digit zero at the start of the
# second half of a postcode, e.g. "M2 OAB" becomes "M2 0AB".
#
# x: character vector of candidate postcodes.
# Returns a character vector of the same length. Elements that do not have
# the shape "<first half> O<two letters>" are returned unchanged.
fix_zero_2 <- function(x) {
  # Capture group 1 is the first half; capture group 9 holds only the two
  # letters after the "O", so the zero is written directly and no second
  # clean-up pass over unrelated "0O" sequences is needed.
  sub(
    "^((GIR 0AA)|((([A-PR-UWYZ][A-HK-Y]?[0-9][0-9]?)|(([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRV-Y]))))) O([ABD-HJLNP-UW-Z]{2})$",
    "\\1 0\\9",
    x
  )
}

# Sample inputs: correctly formatted postcodes mixed with lower-case,
# unspaced and O-for-0 variants.
postcodes <- c(
  "GIR 0AA",
  "M2 0AB",
  "M2 OAB",
  "M2 0ab",
  "M1 1AA",
  "M11AA",
  "M60 1NW",
  "M6O 1NW",
  "M601NW",
  "CR2 6XH",
  "CR26XH",
  "DN55 1PT",
  "DN551PT",
  "W1A 1HQ",
  "W1A1HQ",
  "w1a 1hq",
  "EC1A 1BB",
  "EC1A1BB"
)
# Apply each fix in turn, but only to the codes that are still invalid:
# every step is bg <- ifelse(is_valid(bg), bg, some_fix(bg)).
bg <- Reduce(
  function(codes, fix) ifelse(is_valid(codes), codes, fix(codes)),
  list(fix_case, fix_space, fix_zero_1, fix_zero_2),
  postcodes
)
data.frame(original = postcodes,
best_guess = bg,
is_valid = is_valid(bg))
bg <- ifelse(is_valid(bg), bg, som_fix(bg))
Context
StackExchange Code Review Q#117801, answer score: 3
Revisions (0)
No revisions yet.