patternpythonMinor
Review Request: Python code that searches query words in a given text
Viewed 0 times
requestquerywordsgiventextthatpythoncodesearchesreview
Problem
I have this code which looks for query words in a given text. I preprocess the give text and store it in an inverted-index (dictionary with word as key and position in text as value). Searching for a query words then becomes contant time operation.
The below code also prints the surrounding text which contains the maximum number of queries.
Please help me understand the shortcomings (coding style, naming conventions, performance issues) of the code.
```
# -- coding: cp1252 --
from collections import defaultdict
class c_Highlight():
"""Highlight Class to find and print relevant snippet of a text"""
Grammar_Suffix = ['ing' ,'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']
Punctuation = [',', '!', '.', '?', '"', '\n']
def __init__(self):
self.InvertedIndex = defaultdict( list )
def m_Read_File( self, f_name ):
"""
Function to Read a file, if the text is to be read from file
Args: FileName
Returns: List of words
"""
list_words = []
try:
f = open(f_name)
for line in f:
line = line.strip()
list_words.extend( line.split())
f.close()
except IOError as (errno, strerror):
print "File I/O Error({0}): {1}".format(errno, strerror)
exit(0)
return list_words
def __m_Stem( self, word ):
"""
Function to remove Suffix from the end of a word.
Input: Word
Returns: Cropped Word
"""
word = word.lower()
for suffix in c_Highlight.Grammar_Suffix:
if word.endswith(suffix):
word = word[:-len(suffix)]
break
for punctuation in c_Highlight.Punctuation:
while word.endswith(punctuation):
word = word[:-len(punctuation)]
return word #can be empty list
def m_Create_InvertedIndex( self, words ):
"""
Function to parse the input words and store them in a InvertedIndex
The Key is the word itself and Value is the position of the word in the text
The below code also prints the surrounding text which contains the maximum number of queries.
Please help me understand the shortcomings (coding style, naming conventions, performance issues) of the code.
```
# -- coding: cp1252 --
from collections import defaultdict
class c_Highlight():
"""Highlight Class to find and print relevant snippet of a text"""
Grammar_Suffix = ['ing' ,'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']
Punctuation = [',', '!', '.', '?', '"', '\n']
def __init__(self):
self.InvertedIndex = defaultdict( list )
def m_Read_File( self, f_name ):
"""
Function to Read a file, if the text is to be read from file
Args: FileName
Returns: List of words
"""
list_words = []
try:
f = open(f_name)
for line in f:
line = line.strip()
list_words.extend( line.split())
f.close()
except IOError as (errno, strerror):
print "File I/O Error({0}): {1}".format(errno, strerror)
exit(0)
return list_words
def __m_Stem( self, word ):
"""
Function to remove Suffix from the end of a word.
Input: Word
Returns: Cropped Word
"""
word = word.lower()
for suffix in c_Highlight.Grammar_Suffix:
if word.endswith(suffix):
word = word[:-len(suffix)]
break
for punctuation in c_Highlight.Punctuation:
while word.endswith(punctuation):
word = word[:-len(punctuation)]
return word #can be empty list
def m_Create_InvertedIndex( self, words ):
"""
Function to parse the input words and store them in a InvertedIndex
The Key is the word itself and Value is the position of the word in the text
Solution
# -*- coding: cp1252 -*-
from collections import defaultdict
class c_Highlight():Prefixed like c_ and m_ are frowned upon as useless.
"""Highlight Class to find and print relevant snippet of a text"""
Grammar_Suffix = ['ing' ,'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']
Punctuation = [',', '!', '.', '?', '"', '\n']The official python style guide recommends ALL CAPS for constants like this
def __init__(self):
self.InvertedIndex = defaultdict( list )The official python style guide recommends all_lower_case_with_underscores for variable names
def m_Read_File( self, f_name ):
"""
Function to Read a file, if the text is to be read from file
Args: FileName
Returns: List of words
"""
list_words = []
try:
f = open(f_name)
for line in f:
line = line.strip()
list_words.extend( line.split())
f.close()This can be better if you use with
with open(f_name) as f:
for line in f:
line = line.strip()
list_words.extend(line.split())This will close the file even an exception is thrown.
except IOError as (errno, strerror):
print "File I/O Error({0}): {1}".format(errno, strerror)
exit(0)exit(0) means the program succeeded in this case it didn't.
return list_wordsSince this function doesn't manipulate any object attributes, it might be better as a separate utility function.
def __m_Stem( self, word ):
"""
Function to remove Suffix from the end of a word.
Input: Word
Returns: Cropped Word
"""
word = word.lower()
for suffix in c_Highlight.Grammar_Suffix:
if word.endswith(suffix):
word = word[:-len(suffix)]
breakI recommend use return word[:-len(suffix)] instead.
for punctuation in c_Highlight.Punctuation:
while word.endswith(punctuation):
word = word[:-len(punctuation)]You don't break here like you did before
return word #can be empty listSince your words appear to be strings, this comment is just wrong.
def m_Create_InvertedIndex( self, words ):
"""
Function to parse the input words and store them in a InvertedIndex
The Key is the word itself and Value is the position of the word in the text
Input: List of Words
Return: None
"""
idx = 0
for word in words:
word = self.__m_Stem(word)
self.InvertedIndex[word].append(idx)
idx = idx+1Use
enumeratefor idx, word in enumerate(words):
word = self.__m_Stem(word)
self.InvertedIndex[word].append(index)See? much cleaner.
def m_Search_Query( self, search_query, length ):
"""
Function to search for query words in the InvertedIndex
Input: List of query words, and the number of words in the text
Return: Integer List of length same as the number of words. Each index indicates
if the word is present in the query or not. So if List[i]==0, the word is not present
and if List[i]==2, then the 2nd word in the query is present at location i in the input text.
"""Requiring the user pass in the length of the input text seems strange. The class itself knows the length of the input text and the user probably doesn't.
words_present = []
idx = 1
for x in range(length):
words_present.append(0)Use:
word_present = [0] * length to create words_present, it'll be faster and clearer.for word in search_query:
word = self.__m_Stem(word)
if word in self.InvertedIndex:
for word_index in self.InvertedIndex[word]:
words_present[word_index] = idx
idx = idx + 1You only increment
idx when a match is found. This seems strange as I'd expect the idx to correspond the indexes in search_query. Also returning a list like this seems strange as its pretty much as hard to find values in this list as it is to find the words in the first place.return words_present
def m_Find_Snippet( self, words_present, num_words, MaxWindow ):
"""
Function to find a snippet of input text with has the most number of query
words present.
Input: Integer List, length, and number of words to be present in the snippet
Return: begin, end position of the window and the count of query words present
in the snippet
"""
begin = 0
count = 0
end = 0
max_count = -1
Snippet_End = 0
Snippet_Begin = 0Python style guide recommaeds this_style for local variables.
```
num_words = len(words_present)
try:
while begin < num_words:
if words_present[begin]!=0:
count = 0
end = begin
end_done = min((MaxWindow + begin), num_words)
while end < end_done:
if words_present[end]!=0:
Code Snippets
# -*- coding: cp1252 -*-
from collections import defaultdict
class c_Highlight():"""Highlight Class to find and print relevant snippet of a text"""
Grammar_Suffix = ['ing' ,'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']
Punctuation = [',', '!', '.', '?', '"', '\n']def __init__(self):
self.InvertedIndex = defaultdict( list )def m_Read_File( self, f_name ):
"""
Function to Read a file, if the text is to be read from file
Args: FileName
Returns: List of words
"""
list_words = []
try:
f = open(f_name)
for line in f:
line = line.strip()
list_words.extend( line.split())
f.close()with open(f_name) as f:
for line in f:
line = line.strip()
list_words.extend(line.split())Context
StackExchange Code Review Q#2668, answer score: 4
Revisions (0)
No revisions yet.