patternpythonMinor
Speeding up a Cython program
Viewed 0 times
speedingcythonprogram
Problem
I wrote the following piece of Python as a part of a larger system. Profiling reveals that a large amount of time is spent in DocumentFeature.from_string.
Here's the code and the unit tests I've been using.
```
import logging
from operator import itemgetter
from functools import total_ordering
from unittest import TestCase
import nose
class DocumentFeature(object):
# Initializer: stores the given feature type and tokens on the instance unchanged.
# (from_string below passes a type label string such as 'EMPTY' and a tuple of tokens.)
def __init__(self, type, tokens):
self.type = type
self.tokens = tokens
@classmethod
def from_string(cls, string):
"""
Takes a string representing a DocumentFeature and creates and object out of it. String format is
"word/POS" or "word1/PoS1 word2/PoS2",... The type of the feature will be inferred from the length and
PoS tags of the input string.
:type string: str
"""
try:
token_count = string.count('_') + 1
pos_count = string.count('/')
if token_count != pos_count:
return DocumentFeature('EMPTY', tuple())
tokens = string.strip().split('_')
if len(tokens) > 3:
raise ValueError('Document feature %s is too long' % string)
bits = [x.split('/') for x in tokens]
if not all(map(itemgetter(0), bits)):
# ignore tokens with no text
return DocumentFeature('EMPTY', tuple())
tokens = tuple(Token(word, pos) for (word, pos) in bits)
if len(tokens) == 1:
t = '1-GRAM'
elif ''.join([t.pos for t in tokens]) == 'NVN':
t = 'SVO'
elif ''.join([t.pos for t in tokens]) == 'JN':
t = 'AN'
elif ''.join([t.pos for t in tokens]) == 'VN':
t = 'VO'
elif ''.join([t.pos for t in to
DocumentFeature.from_string. So far I've tried compiling the unmodified code with Cython and got a 33% improvement in running time. Any suggestions as to how this code can be improved further are greatly appreciated.
Here's the code and the unit tests I've been using.
```
import logging
from operator import itemgetter
from functools import total_ordering
from unittest import TestCase
import nose
class DocumentFeature(object):
# Initializer: stores the given feature type and tokens on the instance unchanged.
# (from_string below passes a type label string such as 'EMPTY' and a tuple of tokens.)
def __init__(self, type, tokens):
self.type = type
self.tokens = tokens
@classmethod
def from_string(cls, string):
"""
Takes a string representing a DocumentFeature and creates and object out of it. String format is
"word/POS" or "word1/PoS1 word2/PoS2",... The type of the feature will be inferred from the length and
PoS tags of the input string.
:type string: str
"""
try:
token_count = string.count('_') + 1
pos_count = string.count('/')
if token_count != pos_count:
return DocumentFeature('EMPTY', tuple())
tokens = string.strip().split('_')
if len(tokens) > 3:
raise ValueError('Document feature %s is too long' % string)
bits = [x.split('/') for x in tokens]
if not all(map(itemgetter(0), bits)):
# ignore tokens with no text
return DocumentFeature('EMPTY', tuple())
tokens = tuple(Token(word, pos) for (word, pos) in bits)
if len(tokens) == 1:
t = '1-GRAM'
elif ''.join([t.pos for t in tokens]) == 'NVN':
t = 'SVO'
elif ''.join([t.pos for t in tokens]) == 'JN':
t = 'AN'
elif ''.join([t.pos for t in tokens]) == 'VN':
t = 'VO'
elif ''.join([t.pos for t in to
Solution
I find your error handling a bit counterintuitive: if the input string is invalid, you return an "empty" DocumentFeature, but if there are too many tokens, it raises an exception. I would raise exceptions in both cases, and let the caller decide what to do.
I think that you're being too pessimistic in validating your inputs: the validation repeats some of the real work that would be done anyway. Furthermore, counting slashes and underscores is insufficient validation — for example, "word1_word2//" passes the initial validation, only to fail at tuple(Token(word, pos) for (word, pos) in bits). Instead, I would suggest validating as you perform the transformations.
_TYPES = dict([
('NVN', 'SVO'), ('JN', 'AN'), ('VN', 'VO'), ('NN', 'NN')
])
@classmethod
def from_string(cls, string):
"""
Takes a string representing a DocumentFeature and creates and object out of it. String format is
"word/PoS" or "word1/PoS1_word2/PoS2",... The type of the feature will be inferred from the length and
PoS tags of the input string.
:type string: str
"""
try:
tokens = string.strip().split('_')
if len(tokens) > 3:
raise ValueError('Document feature %s is too long' % string)
tokens = [token.split('/') for token in tokens]
# Check for too many slashes, too few slashes, or empty words
if not all(map(lambda token: len(token) == 2 and token[0], tokens)):
raise ValueError('Invalid document feature %s' % string)
tokens = tuple(Token(word, pos) for (word, pos) in tokens)
type = cls._TYPES.get(''.join([t.pos for t in tokens]),
('EMPTY', '1-GRAM', '2-GRAM', '3-GRAM')[len(tokens)])
except:
logging.error('Cannot create token out of string %s', string)
raise
return DocumentFeature(type, tokens)Code Snippets
# Maps a concatenated PoS-tag pattern to the feature type it is given.
_TYPES = {
    'NVN': 'SVO',
    'JN': 'AN',
    'VN': 'VO',
    'NN': 'NN',
}
@classmethod
def from_string(cls, string):
    """
    Build a DocumentFeature from its string representation.

    The expected format is "word/PoS" for a single token, or up to three
    such tokens joined by underscores, e.g. "word1/PoS1_word2/PoS2". The
    feature type is looked up in ``cls._TYPES`` by the concatenated PoS
    tags and falls back to "1-GRAM", "2-GRAM" or "3-GRAM" according to
    the number of tokens.

    :type string: str
    :return: an instance of ``cls`` holding the inferred type and tokens
    :raises ValueError: if there are more than three tokens, or if any
        token is not exactly "word/PoS" with a non-empty word
    """
    try:
        parts = string.strip().split('_')
        if len(parts) > 3:
            raise ValueError('Document feature %s is too long' % string)
        pairs = [part.split('/') for part in parts]
        # Check for too many slashes, too few slashes, or empty words
        if not all(len(pair) == 2 and pair[0] for pair in pairs):
            raise ValueError('Invalid document feature %s' % string)
        tokens = tuple(Token(word, pos) for (word, pos) in pairs)
        pos_pattern = ''.join([t.pos for t in tokens])
        # Named ``feature_type`` so the builtin ``type`` is not shadowed.
        feature_type = cls._TYPES.get(
            pos_pattern,
            ('EMPTY', '1-GRAM', '2-GRAM', '3-GRAM')[len(tokens)])
    except Exception:
        # Log the offending input, then let the caller decide what to do.
        # ``Exception`` (not a bare except) leaves KeyboardInterrupt and
        # SystemExit alone.
        logging.error('Cannot create token out of string %s', string)
        raise
    # Construct via ``cls`` so subclasses get instances of their own type.
    return cls(feature_type, tokens)
Context
StackExchange Code Review Q#38422, answer score: 4
Revisions (0)
No revisions yet.