patternpythonMinor
Text search in Python
Viewed 0 times
textpythonsearch
Problem
Whilst self-studying algorithms, I came across the Karp-Rabin rolling hash search algorithm here. I decided to have a go at implementing it in Python:
For ease of reading; the data-structure description and formulas embedded in
```
import random
from time import process_time
def is_prime(n):
if n 100:
interval = int(x / 10)
else:
interval = x + 1
count = 0
found = False
while not found:
if (x % 2) == 0:
start = x + 1
else:
start = x
n = random.randrange(start, start + interval, 2)
if is_prime(n):
found = True
return n
count += 1
if count > (interval / 2):
interval *= 2
count = 0
class hash_div(object):
def __init__(self, size):
if not is_prime(size):
ValueError("Use a prime as the base")
self.mod = size
def h(self, num):
return num % self.mod
class rolling_hash(hash_div):
# built for strings
def __init__(self, base, size):
self.a = base
self.uModP = 0
self.size_x = 0
self.ax = 1
super(rolling_hash, self).__init__(size)
def r(self):
return self.uModP
def update_ax(self, up_down):
if up_down == "up":
self.size_x += 1
elif up_down == "down":
self.size_x -= 1
self.ax = self.h(self.a**(self.size_x - 1))
def append(self, char):
self.uModP = self.h((self.uModP*self.a) + ord(char))
self.update_ax("up")
def skip(self, char):
self.uModP = self.h(self.uModP - (ord(char) * self.h(self.ax)))
self.update_ax("down")
def search_file(fileStr, term):
size = find_prime_above(len(fileStr))
rs = rolling_hash(256, size)
rt = rolling_hash(256, size)
for c in term:
rs.append(c)
for c in fileStr[:len(term)]:
rt.append(c)
for i in range(len(term), len(fileStr)):
For ease of reading; the data-structure description and formulas embedded in
append() and skip() are:```
import random
from time import process_time
def is_prime(n):
if n 100:
interval = int(x / 10)
else:
interval = x + 1
count = 0
found = False
while not found:
if (x % 2) == 0:
start = x + 1
else:
start = x
n = random.randrange(start, start + interval, 2)
if is_prime(n):
found = True
return n
count += 1
if count > (interval / 2):
interval *= 2
count = 0
class hash_div(object):
def __init__(self, size):
if not is_prime(size):
ValueError("Use a prime as the base")
self.mod = size
def h(self, num):
return num % self.mod
class rolling_hash(hash_div):
# built for strings
def __init__(self, base, size):
self.a = base
self.uModP = 0
self.size_x = 0
self.ax = 1
super(rolling_hash, self).__init__(size)
def r(self):
return self.uModP
def update_ax(self, up_down):
if up_down == "up":
self.size_x += 1
elif up_down == "down":
self.size_x -= 1
self.ax = self.h(self.a**(self.size_x - 1))
def append(self, char):
self.uModP = self.h((self.uModP*self.a) + ord(char))
self.update_ax("up")
def skip(self, char):
self.uModP = self.h(self.uModP - (ord(char) * self.h(self.ax)))
self.update_ax("down")
def search_file(fileStr, term):
size = find_prime_above(len(fileStr))
rs = rolling_hash(256, size)
rt = rolling_hash(256, size)
for c in term:
rs.append(c)
for c in fileStr[:len(term)]:
rt.append(c)
for i in range(len(term), len(fileStr)):
Solution
-
Computing hash with Python built-in
is much faster than
-
I don't see the reason of passing string argument to
-
Finding a prime is indeed unconventional. I recommend to run a standard sieve (Chebyshov' theorem aka Bertrand postulate guarantees a prime in
Computing hash with Python built-in
pow(self.a, self_size - 1, self.mod)is much faster than
** and % done separately.-
I don't see the reason of passing string argument to
update_ax. Passing 1 and -1 directly is much cleaner and likely faster:def update_ax(self, up_down):
self.size_x += up_down
self.ax = self.h(....)-
Finding a prime is indeed unconventional. I recommend to run a standard sieve (Chebyshov' theorem aka Bertrand postulate guarantees a prime in
[n, 2n] range).Code Snippets
pow(self.a, self_size - 1, self.mod)def update_ax(self, up_down):
self.size_x += up_down
self.ax = self.h(....)Context
StackExchange Code Review Q#147423, answer score: 3
Revisions (0)
No revisions yet.