patternpythonMinor
Extract room sizes & room types from an estate description with regex
Viewed 0 times
Tags: with, description, sizes, estate, extract, types, regex, from, room
Problem
I'm trying to extract room types and sizes from a descriptive text utilizing the following scripts as an exercise. Any tips on being more pythonic would be amazing!
The script is based off the help I received in this question.
The idea is that I find anything in the following sense: 16m2, 3.90 x 3,00m, 640x390 and that I at least attempt to return what type of room it is referring to (I'm still working on a more complete list of roomtypes).
```
def extract_rooms(description, printresults=False):
regexes = []
regexes.append(r'(\d{1,4}.{0,2}m[.]?[2|xb2])') # Finds 16m2
regexes.append(r'(\d{1,4}[.,]?\d{0,3}[ ]?[x]?[ ]\d{1,4}[.,]?\d{0,3}[ ?]m)') # Finds 3.90 x 3,00m & 3.90 x 3,00
regexes.append(r'(\d{1,4}[ ]?x[ ]?\d{1,4})') # Finds 640x390
# Split description into sentences
sentences = re.split('\. |! ', description) # Extract sentences
roomtypes =['woonkamer',
'slaapkamer',
'slaapkamers',
'woning',
'Ouderslaapkamer']
rooms = {}
i = 1
for sentence in sentences: # Try regex in the order described above
if printresults:
print('Working on sentence: ' + sentence)
for regex in regexes:
match = re.findall(regex, sentence)
if len(match) > 0:
for result in match:
beforematch = re.split(result, sentence)[0]
roomtype = "Unknown"
for word in beforematch.split(' ')[::-1]: # Search in reverse order of words
if word in roomtypes:
roomtype = word
break
rooms[roomtype + str(i)] = result
i += 1
if printresults:
print(roomtype + ' of ' + result + ' found in sentence: ' + sentence)
sentence = re.sub(resu
The script is based off the help I received in this question.
The idea is that I find anything in the following sense: 16m2, 3.90 x 3,00m, 640x390 and that I at least attempt to return what type of room it is referring to (I'm still working on a more complete list of roomtypes).
```
def extract_rooms(description, printresults=False):
regexes = []
regexes.append(r'(\d{1,4}.{0,2}m[.]?[2|xb2])') # Finds 16m2
regexes.append(r'(\d{1,4}[.,]?\d{0,3}[ ]?[x]?[ ]\d{1,4}[.,]?\d{0,3}[ ?]m)') # Finds 3.90 x 3,00m & 3.90 x 3,00
regexes.append(r'(\d{1,4}[ ]?x[ ]?\d{1,4})') # Finds 640x390
# Split description into sentences
sentences = re.split('\. |! ', description) # Extract sentences
roomtypes =['woonkamer',
'slaapkamer',
'slaapkamers',
'woning',
'Ouderslaapkamer']
rooms = {}
i = 1
for sentence in sentences: # Try regex in the order described above
if printresults:
print('Working on sentence: ' + sentence)
for regex in regexes:
match = re.findall(regex, sentence)
if len(match) > 0:
for result in match:
beforematch = re.split(result, sentence)[0]
roomtype = "Unknown"
for word in beforematch.split(' ')[::-1]: # Search in reverse order of words
if word in roomtypes:
roomtype = word
break
rooms[roomtype + str(i)] = result
i += 1
if printresults:
print(roomtype + ' of ' + result + ' found in sentence: ' + sentence)
sentence = re.sub(resu
Solution
This is not Pythonic:
It would be better to use a list literal, like this:
This is also not Pythonic:
Since `re.findall` will always return a list,
you can simplify the above:
If there are no matches,
then there will be no iterations,
which is exactly the same effect as in your original code,
but shorter, simpler, Pythonic.
Although the code is fairly nicely formatted,
you don't follow strictly PEP8.
For example:
Should be:
The raw string notation `r""` is for regular expressions,
but these are simple strings:
So you should simply use `"..."` instead of `r'...'`.
It's better to move code out of `if __name__ == ...` into a function;
otherwise the variables used there in the global namespace may be shadowed inside the functions. Make it a habit to write this way:
regexes = []
regexes.append(r'(\d{1,4}.{0,2}m[.]?[2|xb2])') # Finds 16m2
regexes.append(r'(\d{1,4}[.,]?\d{0,3}[ ]?[x]?[ ]\d{1,4}[.,]?\d{0,3}[ ?]m)') # Finds 3.90 x 3,00m & 3.90 x 3,00
regexes.append(r'(\d{1,4}[ ]?x[ ]?\d{1,4})') # Finds 640x390
It would be better to use a list literal, like this:
regexes = [
r'(\d{1,4}.{0,2}m[.]?[2|xb2])',
r'(\d{1,4}[.,]?\d{0,3}[ ]?[x]?[ ]\d{1,4}[.,]?\d{0,3}[ ?]m)',
r'(\d{1,4}[ ]?x[ ]?\d{1,4})',
]
This is also not Pythonic:
match = re.findall(regex, sentence)
if len(match) > 0:
for result in match:
# ...
Since `re.findall` will always return a list, you can simplify the above:
for result in re.findall(regex, sentence):
# ...
If there are no matches,
then there will be no iterations,
which is exactly the same effect as in your original code,
but shorter, simpler, Pythonic.
Although the code is fairly nicely formatted,
you don't follow strictly PEP8.
For example:
roomtypes =['woonkamer',
Should be:
# put space around '='
roomtypes = ['woonkamer',
The raw string notation `r""` is for regular expressions, but these are simple strings:
description1 = r"[u'Schrabber....."
description2 = r"[u'Goed onde....."
So you should simply use `"..."` instead of `r'...'`.
It's better to move code out of `if __name__ == ...` into a function; otherwise the variables used there in the global namespace may be shadowed inside the functions. Make it a habit to write this way:
def main():
description1 = r"[u'Schrabber ..."
description2 = r"[u'Goed onderh..."
print(extract_rooms(description1, True))
print(extract_rooms(description2, True))
if __name__ == "__main__":
main()
Code Snippets
regexes = []
regexes.append(r'(\d{1,4}.{0,2}m[.]?[2|xb2])') # Finds 16m2
regexes.append(r'(\d{1,4}[.,]?\d{0,3}[ ]?[x]?[ ]\d{1,4}[.,]?\d{0,3}[ ?]m)') # Finds 3.90 x 3,00m & 3.90 x 3,00
regexes.append(r'(\d{1,4}[ ]?x[ ]?\d{1,4})') # Finds 640x390
regexes = [
r'(\d{1,4}.{0,2}m[.]?[2|xb2])',
r'(\d{1,4}[.,]?\d{0,3}[ ]?[x]?[ ]\d{1,4}[.,]?\d{0,3}[ ?]m)',
r'(\d{1,4}[ ]?x[ ]?\d{1,4})',
]
match = re.findall(regex, sentence)
if len(match) > 0:
for result in match:
# ...
for result in re.findall(regex, sentence):
# ...
roomtypes =['woonkamer',
Context
StackExchange Code Review Q#73871, answer score: 6
Revisions (0)
No revisions yet.