patternpythonMinor
Making the Levenshtein distance code cleaner
Viewed 0 times
thedistancelevenshteincodemakingcleaner
Problem
I was writing an implementation of Levenshtein distance in Python and I found my code really ugly.
Have you any idea how to make it more elegant?
```
import unittest
import enum
def seq_dist(first, second):
Direction = enum.Enum('Direction', 'gap_first gap_second match finish')
def find_dist():
dist_table = [(len(second) + 1) * [0] for i in range(len(first) + 1)]
directions = [(len(second) + 1) * [None] for i in range(len(first) + 1)]
for i in range(len(first) + 1):
for j in range(len(second) + 1):
if (i, j) == (0, 0):
dist_table[i][j] = 0
directions[i][j] = Direction.finish
elif i == 0:
dist_table[i][j] = j
directions[i][j] = Direction.gap_first
elif j == 0:
dist_table[i][j] = i
directions[i][j] = Direction.gap_second
else:
gap_first = dist_table[i - 1][j] + 1
gap_second = dist_table[i][j - 1] + 1
match = dist_table[i - 1][j - 1] + (first[i-1] != second[j-1])
if gap_first <= min(gap_first, match):
dist_table[i][j] = gap_first
directions[i][j] = Direction.gap_second
elif gap_second <= min(gap_second, match):
dist_table[i][j] = gap_second
directions[i][j] = Direction.gap_first
else:
dist_table[i][j] = match
directions[i][j] = Direction.match
return dist_table, directions
def backtrack(directions):
i, j = (len(first), len(second))
seq = []
while directions[i][j] != Direction.finish:
direction = directions[i][j]
if direction == Direction.gap_first:
seq.append((None, second[j-1]))
j -= 1
Have you any idea how to make it more elegant?
```
import unittest
import enum
def seq_dist(first, second):
Direction = enum.Enum('Direction', 'gap_first gap_second match finish')
def find_dist():
dist_table = [(len(second) + 1) * [0] for i in range(len(first) + 1)]
directions = [(len(second) + 1) * [None] for i in range(len(first) + 1)]
for i in range(len(first) + 1):
for j in range(len(second) + 1):
if (i, j) == (0, 0):
dist_table[i][j] = 0
directions[i][j] = Direction.finish
elif i == 0:
dist_table[i][j] = j
directions[i][j] = Direction.gap_first
elif j == 0:
dist_table[i][j] = i
directions[i][j] = Direction.gap_second
else:
gap_first = dist_table[i - 1][j] + 1
gap_second = dist_table[i][j - 1] + 1
match = dist_table[i - 1][j - 1] + (first[i-1] != second[j-1])
if gap_first <= min(gap_first, match):
dist_table[i][j] = gap_first
directions[i][j] = Direction.gap_second
elif gap_second <= min(gap_second, match):
dist_table[i][j] = gap_second
directions[i][j] = Direction.gap_first
else:
dist_table[i][j] = match
directions[i][j] = Direction.match
return dist_table, directions
def backtrack(directions):
i, j = (len(first), len(second))
seq = []
while directions[i][j] != Direction.finish:
direction = directions[i][j]
if direction == Direction.gap_first:
seq.append((None, second[j-1]))
j -= 1
Solution
It would seem that you try to do more than is sufficient for computing the Levenshtein edit distance. You could try this:
Hope that helps.
Edit
The following extension will also give you the edit sequence, and the alignment of the two input words:
def levenshtein_distance(string1, string2):
n = len(string1)
m = len(string2)
d = [[0 for x in range(n + 1)] for y in range(m + 1)]
for i in range(1, m + 1):
d[i][0] = i
for j in range(1, n + 1):
d[0][j] = j
for j in range(1, n + 1):
for i in range(1, m + 1):
if string1[j - 1] is string2[i - 1]:
delta = 0
else:
delta = 1
d[i][j] = min(d[i - 1][j] + 1,
d[i][j - 1] + 1,
d[i - 1][j - 1] + delta)
return d[m][n]
def main():
print(levenshtein_distance("code", "rodde"))
if __name__ == "__main__":
main()Hope that helps.
Edit
The following extension will also give you the edit sequence, and the alignment of the two input words:
def levenshtein_distance(s, z):
s = "\u0000" + s
z = "\u0000" + z
n = len(s)
m = len(z)
d = [[0 for x in range(n + 1)] for y in range(m + 1)]
parent_map = dict()
parent_map[(0, 0)] = None
for i in range(1, m + 1):
d[i][0] = i
for j in range(1, n + 1):
d[0][j] = j
for j in range(1, n + 1):
for i in range(1, m + 1):
if s[j - 1] is z[i - 1]:
delta = 0
else:
delta = 1
tentative_dist = d[i - 1][j] + 1
operation = 1 # Insert
if tentative_dist > d[i][j - 1] + 1:
tentative_dist = d[i][j - 1] + 1
operation = 2 # Delete
if tentative_dist > d[i - 1][j - 1] + delta:
tentative_dist = d[i - 1][j - 1] + delta
operation = 3
if operation is 1: # Insert
parent_map[(i, j)] = (i - 1, j)
elif operation is 2: # Delete
parent_map[(i, j)] = (i, j - 1)
else: # Substitute
parent_map[(i, j)] = (i - 1, j - 1)
d[i][j] = tentative_dist
edit_string_top = ""
edit_string_bottom = ""
edit_sequence = ""
current = (m, n)
while True:
predecessor = parent_map[current]
if not predecessor:
break
if current[0] != predecessor[0] and current[1] != predecessor[1]:
schar = s[predecessor[1]]
zchar = z[predecessor[0]]
edit_string_top += schar
edit_string_bottom += zchar
if schar is zchar:
edit_sequence += "N"
else:
edit_sequence += "S"
elif current[0] != predecessor[0]:
edit_string_top += "-"
edit_string_bottom += z[predecessor[0]]
edit_sequence += "I"
else:
edit_string_top += s[predecessor[1]]
edit_string_bottom += "-"
edit_sequence += "D"
current = predecessor
edit_string_top = edit_string_top[:-1][::-1]
edit_string_bottom = edit_string_bottom[:-1][::-1]
edit_sequence = edit_sequence[:-1][::-1]
return d[m][n], edit_sequence, edit_string_top, edit_string_bottom
def main():
dist, edit_sequence, edit_string_top, edit_string_bottom = levenshtein_distance("handball", "aballad")
print("Distance: ", dist)
print("Edit sequence: ", edit_sequence)
print(edit_string_top)
print(edit_string_bottom)
if __name__ == "__main__":
main()Code Snippets
def levenshtein_distance(string1, string2):
n = len(string1)
m = len(string2)
d = [[0 for x in range(n + 1)] for y in range(m + 1)]
for i in range(1, m + 1):
d[i][0] = i
for j in range(1, n + 1):
d[0][j] = j
for j in range(1, n + 1):
for i in range(1, m + 1):
if string1[j - 1] is string2[i - 1]:
delta = 0
else:
delta = 1
d[i][j] = min(d[i - 1][j] + 1,
d[i][j - 1] + 1,
d[i - 1][j - 1] + delta)
return d[m][n]
def main():
print(levenshtein_distance("code", "rodde"))
if __name__ == "__main__":
main()def levenshtein_distance(s, z):
s = "\u0000" + s
z = "\u0000" + z
n = len(s)
m = len(z)
d = [[0 for x in range(n + 1)] for y in range(m + 1)]
parent_map = dict()
parent_map[(0, 0)] = None
for i in range(1, m + 1):
d[i][0] = i
for j in range(1, n + 1):
d[0][j] = j
for j in range(1, n + 1):
for i in range(1, m + 1):
if s[j - 1] is z[i - 1]:
delta = 0
else:
delta = 1
tentative_dist = d[i - 1][j] + 1
operation = 1 # Insert
if tentative_dist > d[i][j - 1] + 1:
tentative_dist = d[i][j - 1] + 1
operation = 2 # Delete
if tentative_dist > d[i - 1][j - 1] + delta:
tentative_dist = d[i - 1][j - 1] + delta
operation = 3
if operation is 1: # Insert
parent_map[(i, j)] = (i - 1, j)
elif operation is 2: # Delete
parent_map[(i, j)] = (i, j - 1)
else: # Substitute
parent_map[(i, j)] = (i - 1, j - 1)
d[i][j] = tentative_dist
edit_string_top = ""
edit_string_bottom = ""
edit_sequence = ""
current = (m, n)
while True:
predecessor = parent_map[current]
if not predecessor:
break
if current[0] != predecessor[0] and current[1] != predecessor[1]:
schar = s[predecessor[1]]
zchar = z[predecessor[0]]
edit_string_top += schar
edit_string_bottom += zchar
if schar is zchar:
edit_sequence += "N"
else:
edit_sequence += "S"
elif current[0] != predecessor[0]:
edit_string_top += "-"
edit_string_bottom += z[predecessor[0]]
edit_sequence += "I"
else:
edit_string_top += s[predecessor[1]]
edit_string_bottom += "-"
edit_sequence += "D"
current = predecessor
edit_string_top = edit_string_top[:-1][::-1]
edit_string_bottom = edit_string_bottom[:-1][::-1]
edit_sequence = edit_sequence[:-1][::-1]
return d[m][n], edit_sequence, edit_string_top, edit_string_bottom
def main():
dist, edit_sequence, edit_string_top, edit_string_bottom = levenshtein_distance("handball", "aballad")
print("Distance: ", dist)
print("Edit sequence: ", edit_sequence)
print(edit_string_top)
print(edit_string_bottom)
if __name__ == "__main__":
main()Context
StackExchange Code Review Q#126194, answer score: 4
Revisions (0)
No revisions yet.