HiveBrain v1.2.0
Get Started
← Back to all entries
patternpythonMinor

Making the Levenshtein distance code cleaner

Submitted by: @import:stackexchange-codereview··
0
Viewed 0 times
thedistancelevenshteincodemakingcleaner

Problem

I was writing an implementation of Levenshtein distance in Python and I found my code really ugly.
Have you any idea how to make it more elegant?

```
import unittest
import enum

def seq_dist(first, second):
Direction = enum.Enum('Direction', 'gap_first gap_second match finish')

def find_dist():
dist_table = [(len(second) + 1) * [0] for i in range(len(first) + 1)]
directions = [(len(second) + 1) * [None] for i in range(len(first) + 1)]
for i in range(len(first) + 1):
for j in range(len(second) + 1):
if (i, j) == (0, 0):
dist_table[i][j] = 0
directions[i][j] = Direction.finish
elif i == 0:
dist_table[i][j] = j
directions[i][j] = Direction.gap_first
elif j == 0:
dist_table[i][j] = i
directions[i][j] = Direction.gap_second
else:
gap_first = dist_table[i - 1][j] + 1
gap_second = dist_table[i][j - 1] + 1
match = dist_table[i - 1][j - 1] + (first[i-1] != second[j-1])
if gap_first <= min(gap_first, match):
dist_table[i][j] = gap_first
directions[i][j] = Direction.gap_second
elif gap_second <= min(gap_second, match):
dist_table[i][j] = gap_second
directions[i][j] = Direction.gap_first
else:
dist_table[i][j] = match
directions[i][j] = Direction.match
return dist_table, directions

def backtrack(directions):
i, j = (len(first), len(second))
seq = []
while directions[i][j] != Direction.finish:
direction = directions[i][j]
if direction == Direction.gap_first:
seq.append((None, second[j-1]))
j -= 1

Solution

It would seem that you try to do more than is sufficient for computing the Levenshtein edit distance. You could try this:

def levenshtein_distance(string1, string2):
    n = len(string1)
    m = len(string2)
    d = [[0 for x in range(n + 1)] for y in range(m + 1)]

    for i in range(1, m + 1):
        d[i][0] = i

    for j in range(1, n + 1):
        d[0][j] = j

    for j in range(1, n + 1):
        for i in range(1, m + 1):
            if string1[j - 1] is string2[i - 1]:
                delta = 0
            else:
                delta = 1

            d[i][j] = min(d[i - 1][j] + 1,
                          d[i][j - 1] + 1,
                          d[i - 1][j - 1] + delta)

    return d[m][n]

def main():
    print(levenshtein_distance("code", "rodde"))

if __name__ == "__main__":
    main()


Hope that helps.

Edit

The following extension will also give you the edit sequence, and the alignment of the two input words:

def levenshtein_distance(s, z):
    s = "\u0000" + s
    z = "\u0000" + z

    n = len(s)
    m = len(z)
    d = [[0 for x in range(n + 1)] for y in range(m + 1)]
    parent_map = dict()
    parent_map[(0, 0)] = None

    for i in range(1, m + 1):
        d[i][0] = i

    for j in range(1, n + 1):
        d[0][j] = j

    for j in range(1, n + 1):
        for i in range(1, m + 1):
            if s[j - 1] is z[i - 1]:
                delta = 0
            else:
                delta = 1

            tentative_dist = d[i - 1][j] + 1
            operation = 1 # Insert

            if tentative_dist > d[i][j - 1] + 1:
                tentative_dist = d[i][j - 1] + 1
                operation = 2 # Delete

            if tentative_dist > d[i - 1][j - 1] + delta:
                tentative_dist = d[i - 1][j - 1] + delta
                operation = 3

            if operation is 1:   # Insert
                parent_map[(i, j)] = (i - 1, j)
            elif operation is 2: # Delete
                parent_map[(i, j)] = (i, j - 1)
            else:                # Substitute
                parent_map[(i, j)] = (i - 1, j - 1)

            d[i][j] = tentative_dist

    edit_string_top = ""
    edit_string_bottom = ""
    edit_sequence = ""

    current = (m, n)

    while True:
        predecessor = parent_map[current]

        if not predecessor:
            break

        if current[0] != predecessor[0] and current[1] != predecessor[1]:
            schar = s[predecessor[1]]
            zchar = z[predecessor[0]]

            edit_string_top += schar
            edit_string_bottom += zchar

            if schar is zchar:
                edit_sequence += "N"
            else:
                edit_sequence += "S"
        elif current[0] != predecessor[0]:
            edit_string_top += "-"
            edit_string_bottom += z[predecessor[0]]
            edit_sequence += "I"
        else:
            edit_string_top += s[predecessor[1]]
            edit_string_bottom += "-"
            edit_sequence += "D"

        current = predecessor

    edit_string_top = edit_string_top[:-1][::-1]
    edit_string_bottom = edit_string_bottom[:-1][::-1]
    edit_sequence = edit_sequence[:-1][::-1]

    return d[m][n], edit_sequence, edit_string_top, edit_string_bottom

def main():
    dist, edit_sequence, edit_string_top, edit_string_bottom = levenshtein_distance("handball", "aballad")
    print("Distance: ", dist)
    print("Edit sequence: ", edit_sequence)
    print(edit_string_top)
    print(edit_string_bottom)

if __name__ == "__main__":
    main()

Code Snippets

def levenshtein_distance(string1, string2):
    n = len(string1)
    m = len(string2)
    d = [[0 for x in range(n + 1)] for y in range(m + 1)]

    for i in range(1, m + 1):
        d[i][0] = i

    for j in range(1, n + 1):
        d[0][j] = j

    for j in range(1, n + 1):
        for i in range(1, m + 1):
            if string1[j - 1] is string2[i - 1]:
                delta = 0
            else:
                delta = 1

            d[i][j] = min(d[i - 1][j] + 1,
                          d[i][j - 1] + 1,
                          d[i - 1][j - 1] + delta)

    return d[m][n]


def main():
    print(levenshtein_distance("code", "rodde"))


if __name__ == "__main__":
    main()
def levenshtein_distance(s, z):
    s = "\u0000" + s
    z = "\u0000" + z

    n = len(s)
    m = len(z)
    d = [[0 for x in range(n + 1)] for y in range(m + 1)]
    parent_map = dict()
    parent_map[(0, 0)] = None

    for i in range(1, m + 1):
        d[i][0] = i

    for j in range(1, n + 1):
        d[0][j] = j

    for j in range(1, n + 1):
        for i in range(1, m + 1):
            if s[j - 1] is z[i - 1]:
                delta = 0
            else:
                delta = 1

            tentative_dist = d[i - 1][j] + 1
            operation = 1 # Insert

            if tentative_dist > d[i][j - 1] + 1:
                tentative_dist = d[i][j - 1] + 1
                operation = 2 # Delete

            if tentative_dist > d[i - 1][j - 1] + delta:
                tentative_dist = d[i - 1][j - 1] + delta
                operation = 3

            if operation is 1:   # Insert
                parent_map[(i, j)] = (i - 1, j)
            elif operation is 2: # Delete
                parent_map[(i, j)] = (i, j - 1)
            else:                # Substitute
                parent_map[(i, j)] = (i - 1, j - 1)

            d[i][j] = tentative_dist

    edit_string_top = ""
    edit_string_bottom = ""
    edit_sequence = ""

    current = (m, n)

    while True:
        predecessor = parent_map[current]

        if not predecessor:
            break

        if current[0] != predecessor[0] and current[1] != predecessor[1]:
            schar = s[predecessor[1]]
            zchar = z[predecessor[0]]

            edit_string_top += schar
            edit_string_bottom += zchar

            if schar is zchar:
                edit_sequence += "N"
            else:
                edit_sequence += "S"
        elif current[0] != predecessor[0]:
            edit_string_top += "-"
            edit_string_bottom += z[predecessor[0]]
            edit_sequence += "I"
        else:
            edit_string_top += s[predecessor[1]]
            edit_string_bottom += "-"
            edit_sequence += "D"

        current = predecessor

    edit_string_top = edit_string_top[:-1][::-1]
    edit_string_bottom = edit_string_bottom[:-1][::-1]
    edit_sequence = edit_sequence[:-1][::-1]

    return d[m][n], edit_sequence, edit_string_top, edit_string_bottom


def main():
    dist, edit_sequence, edit_string_top, edit_string_bottom = levenshtein_distance("handball", "aballad")
    print("Distance: ", dist)
    print("Edit sequence: ", edit_sequence)
    print(edit_string_top)
    print(edit_string_bottom)

if __name__ == "__main__":
    main()

Context

StackExchange Code Review Q#126194, answer score: 4

Revisions (0)

No revisions yet.