patternpythonMinor
Sentence generation using Markov Chains
Viewed 0 times
chainsgenerationusingsentencemarkov
Problem
I have written an implementation for sentence generation using Markov Chains.
Would be great if you could review it.
```
#!/usr/bin/env python
# -- coding: utf-8 --
import os
import random
from markovipy.utils import get_word_list
from markovipy.utils import list_to_tuple
from markovipy.constants import PUNCTUATIONS
class MarkoviPy:
def __init__(self, filename="", markov_length=2):
"""
starting_word: keeps track of the words from which sentences would be starting out.
TODO:
- instead of storing final_mapping and middle_mapping on memory, put them on a tiny DB(Sqlite comes to my mind)
:type markov_length: defaults to 2
:type filename: defaults to an empty string
"""
self.starting_words = []
self.middle_mapping = {}
self.final_mapping = {}
self.words_list = []
self.markov_length = markov_length
if os.path.exists(filename):
self.filename = filename
else:
raise FileNotFoundError("Please enter a valid file name for corpus")
def _normalise_mapping(self):
"""
creates the self.final_mapping with the final probabilities in similar structure
{
...
('with',): {'Till': 0.2,
'a': 0.2,
'darkness': 0.2,
'such': 0.2,
'whispering': 0.2},
('word',): {',': 0.6666666666666666, 'within': 0.3333333333333333},
('word', ','): {'Swaddled': 0.5, 'unable': 0.5},
...
}
:return:
"""
for word_tuple, probable_word in self.middle_mapping.items():
total = sum(probable_word.values())
self.final_mapping[word_tuple] = dict([(k, v / total) for k, v in probable_word.items()])
def _build_middle_mapping(self, word_history, next_word):
"""Adds next_word to the list of possible next words after the sequence presented in word_history
Would be great if you could review it.
```
#!/usr/bin/env python
# -- coding: utf-8 --
import os
import random
from markovipy.utils import get_word_list
from markovipy.utils import list_to_tuple
from markovipy.constants import PUNCTUATIONS
class MarkoviPy:
def __init__(self, filename="", markov_length=2):
"""
starting_word: keeps track of the words from which sentences would be starting out.
TODO:
- instead of storing final_mapping and middle_mapping on memory, put them on a tiny DB(Sqlite comes to my mind)
:type markov_length: defaults to 2
:type filename: defaults to an empty string
"""
self.starting_words = []
self.middle_mapping = {}
self.final_mapping = {}
self.words_list = []
self.markov_length = markov_length
if os.path.exists(filename):
self.filename = filename
else:
raise FileNotFoundError("Please enter a valid file name for corpus")
def _normalise_mapping(self):
"""
creates the self.final_mapping with the final probabilities in similar structure
{
...
('with',): {'Till': 0.2,
'a': 0.2,
'darkness': 0.2,
'such': 0.2,
'whispering': 0.2},
('word',): {',': 0.6666666666666666, 'within': 0.3333333333333333},
('word', ','): {'Swaddled': 0.5, 'unable': 0.5},
...
}
:return:
"""
for word_tuple, probable_word in self.middle_mapping.items():
total = sum(probable_word.values())
self.final_mapping[word_tuple] = dict([(k, v / total) for k, v in probable_word.items()])
def _build_middle_mapping(self, word_history, next_word):
"""Adds next_word to the list of possible next words after the sequence presented in word_history
Solution
generate_sentencecalls_iterate_through_word_listthat seems to perform all sorts of initialization, including reading the words from a file. Making multiple calls to generate multiple sentences causes the same file to be read over again every time. It would be more logical to initiate such initialization from__init__. Also, the name_iterate_through_word_listdoes not convey what it really does.
-
All this
if key not in self.middle_mapping:
self.middle_mapping[key] = {}
self.middle_mapping[key][next_word] = 1.0
else:
if next_word in self.middle_mapping[key]:
self.middle_mapping[key][next_word] += 1
else:
self.middle_mapping[key][next_word] = 1.0could be just
self.middle_mapping[key][next_word] += 1.0by using
self.middle_mapping = collections.defaultdict(collections.Counter)-
self.middle_mapping is apparently only used in _normalise_mapping to create self.final_mapping. Such data would be better passed as a function argument.- Defining your own
list_to_tuplefunction is not necessary as the built-intuple()can be used instead.
Code Snippets
if key not in self.middle_mapping:
self.middle_mapping[key] = {}
self.middle_mapping[key][next_word] = 1.0
else:
if next_word in self.middle_mapping[key]:
self.middle_mapping[key][next_word] += 1
else:
self.middle_mapping[key][next_word] = 1.0self.middle_mapping[key][next_word] += 1.0Context
StackExchange Code Review Q#160484, answer score: 3
Revisions (0)
No revisions yet.