patternpythonMinor
Sampling words from randomly chosen text files
Viewed 0 times
samplingwordstextchosenrandomlyfilesfrom
Problem
I've developed a script to randomly select 50 elements from a list of .txt files. This process is repeated 100 times. The script joins the randomly selected text files to one long string and then filters out the longest substring.
I want to run this script on a Droplet on Digital Ocean. The server, however, kills the script. When I randomly select only 3 elements, it works. Am I running out of memory? How can I tackle this problem?
Here are the files I randomly select items from. And this is my code:
```
# coding: utf-8
import glob
from collections import Counter
import pickle
import random
de_list_soz = pickle.load(open('de_list_soz.p', 'rb'))
str_seq_list = []
for str_seq in range(0,100):
#creating random list
random_list = []
for item in range(0,50):
list_item = random.choice(de_list_soz)
random_list.append(list_item)
#creating long string
long_str = ''
for de in random_list:
input_file = open('txt_sr_de/txt_sr_de/' + de, 'r')
text = input_file.read()
text = text.replace('\n', ' ').replace('\xa0', '').replace(' ', '')
#Removing these automated notifications
text = text.replace('Wichtiger Hinweis:Diese Website wird in älteren Versionen von Netscape ohne graphische Elemente dargestellt. Die Funktionalität der Website ist aber trotzdem gewährleistet. Wenn Sie diese Website regelmässig benutzen, empfehlen wir Ihnen, auf Ihrem Computer einen aktuellen Browser zu installieren.Zurück zur Einstiegsseite Drucken Grössere Schrift', '')
text = text.replace('Vorwärts ähnliche Leitentscheide suchenähnliche Urteile ab 2000 suchen Drucken nach oben', '')
text = text.replace('Bundesgericht Tribunal fédéral Tribunale federale Tribunal federal', '')
text = text.replace('Navigation Neue Suche Zurück zum Suchresultat Rang: Zurück 180', '')
text = text.replace('Navigation Neue Suche Zurück zum Suchresultat Rang:1 ähnliche Leitentscheide suchenähnliche Urteile ab 2
I want to run this script on a Droplet on Digital Ocean. The server, however, kills the script. When I randomly select only 3 elements, it works. Am I running out of memory? How can I tackle this problem?
Here are the files I randomly select items from. And this is my code:
```
# coding: utf-8
import glob
from collections import Counter
import pickle
import random
de_list_soz = pickle.load(open('de_list_soz.p', 'rb'))
str_seq_list = []
for str_seq in range(0,100):
#creating random list
random_list = []
for item in range(0,50):
list_item = random.choice(de_list_soz)
random_list.append(list_item)
#creating long string
long_str = ''
for de in random_list:
input_file = open('txt_sr_de/txt_sr_de/' + de, 'r')
text = input_file.read()
text = text.replace('\n', ' ').replace('\xa0', '').replace(' ', '')
#Removing these automated notifications
text = text.replace('Wichtiger Hinweis:Diese Website wird in älteren Versionen von Netscape ohne graphische Elemente dargestellt. Die Funktionalität der Website ist aber trotzdem gewährleistet. Wenn Sie diese Website regelmässig benutzen, empfehlen wir Ihnen, auf Ihrem Computer einen aktuellen Browser zu installieren.Zurück zur Einstiegsseite Drucken Grössere Schrift', '')
text = text.replace('Vorwärts ähnliche Leitentscheide suchenähnliche Urteile ab 2000 suchen Drucken nach oben', '')
text = text.replace('Bundesgericht Tribunal fédéral Tribunale federale Tribunal federal', '')
text = text.replace('Navigation Neue Suche Zurück zum Suchresultat Rang: Zurück 180', '')
text = text.replace('Navigation Neue Suche Zurück zum Suchresultat Rang:1 ähnliche Leitentscheide suchenähnliche Urteile ab 2
Solution
Since your code performs multiple things at once, you should split it into functions for readability. A first naive rewrite can yield:
Now we can start cleaning things up.
In
You also happen to
Lastly, concatenating long sentences manually is not memory efficient. Let's ask
The same kind of improvements can be made to the main function by replacing the "create empty list + for loop +
The
So using something like:
For Python 2 or:
for Python 3, you can simplify the writing and save a bit on memory usage like so:
```
def extract_most_common_sequence(text, minimum_occurences=3, times=3):
sequence = ''
for n in range(1, int(len(text)/times+1)):
freqs = Counter(tuplewise(text, n))
(most
from collections import Counter
import random
def filter_text(text):
return (text
.replace('\n', ' ')
.replace('\xa0', '')
.replace(' ', '')
#Removing these automated notifications
.replace('Wichtiger Hinweis:Diese Website wird in älteren Versionen von Netscape ohne graphische Elemente dargestellt. Die Funktionalität der Website ist aber trotzdem gewährleistet. Wenn Sie diese Website regelmässig benutzen, empfehlen wir Ihnen, auf Ihrem Computer einen aktuellen Browser zu installieren.Zurück zur Einstiegsseite Drucken Grössere Schrift', '')
.replace('Vorwärts ähnliche Leitentscheide suchenähnliche Urteile ab 2000 suchen Drucken nach oben', '')
.replace('Bundesgericht Tribunal fédéral Tribunale federale Tribunal federal', '')
.replace('Navigation Neue Suche Zurück zum Suchresultat Rang: Zurück 180', '')
.replace('Navigation Neue Suche Zurück zum Suchresultat Rang:1 ähnliche Leitentscheide suchenähnliche Urteile ab 2000 suchen Drucken nach oben', '')
.replace(' ', ' '))
def create_string_from_files(files, root='txt_sr_de/txt_sr_de', sample=50):
#creating random list
random_list = []
for item in range(sample):
list_item = random.choice(files)
random_list.append(list_item)
#creating long string
long_str = ''
for de in random_list:
input_file = open(os.path.join(root, de), 'r')
text = input_file.read()
long_str = long_str + filter_text(text)
return long_str
def extract_most_common_sequence(text, minimum_occurences=3, times=3):
for n in range(1,int(len(text)/times+1)):
substrings=[text[i:i+n] for i in range(len(text)-n+1)]
freqs=Counter(substrings)
if freqs.most_common(1)[0][1] < minimum_occurences:
n-=1
return seq
else:
seq=freqs.most_common(1)[0][0]
def main(files, repeat=100):
str_seq_list = []
for str_seq in range(repeat):
long_str = create_string_from_files(files)
str_seq_list.append(extract_most_common_sequence(long_str))
return str_seq_list
if __name__ == '__main__':
import pickle
de_list_soz = pickle.load(open('de_list_soz.p', 'rb'))
str_seq_list = main(de_list_soz)
pickle.dump(str_seq_list, open('SOZIALRECHT_DE.p', 'wb'))
Now we can start cleaning things up.
In
create_string_from_files, you are basically reinventing random.sample:
>>> import random
>>> random.sample('abcdefghi', 4)
['h', 'e', 'c', 'f']
You also happen to
open files but never close them: use the with statement to automatically handle that.
Lastly, concatenating long strings manually is not memory efficient. Let's ask
str.join to do it for us for now. But in order to do it nicely, we need to split the function further:
def read_and_filter_file(filename):
with open(filename) as input_file:
text = input_file.read()
return filter_text(text)
def create_string_from_files(files, root='txt_sr_de/txt_sr_de', samples=50):
return ''.join(
read_and_filter_file(os.path.join(root, de))
for de in random.sample(files, samples)
)
The same kind of improvements can be made to the main function by replacing the "create empty list + for loop +
append" template by a more efficient list-comprehension:
def main(files, repeat=100):
return [
extract_most_common_sequence(create_string_from_files(files))
for _ in range(repeat)
]
The
extract_most_common_sequence function can also be optimized by using some itertool recipe. I'm thinking about a variation of the pairwise recipe since your list-comprehension, with n being 2, is pretty much it. i.e. using:
>>> long_str = 'This is a test'
>>> n = 2
>>> [long_str[i:i+n] for i in range(len(long_str)-n+1)]
['Th', 'hi', 'is', 's ', ' i', 'is', 's ', ' a', 'a ', ' t', 'te', 'es', 'st']
>>> n = 4
>>> [long_str[i:i+n] for i in range(len(long_str)-n+1)]
['This', 'his ', 'is i', 's is', ' is ', 'is a', 's a ', ' a t', 'a te', ' tes', 'test']
So using something like:
def tuplewise(iterable, length):
tees = itertools.tee(iterable, length)
for i, t in enumerate(tees):
for _ in xrange(i):
next(t, None)
return itertools.izip(*tees)
For Python 2 or:
def tuplewise(iterable, length):
tees = itertools.tee(iterable, length)
for i, t in enumerate(tees):
for _ in range(i):
next(t, None)
return zip(*tees)
for Python 3, you can simplify the writing and save a bit on memory usage like so:
```
def extract_most_common_sequence(text, minimum_occurences=3, times=3):
sequence = ''
for n in range(1, int(len(text)/times+1)):
freqs = Counter(tuplewise(text, n))
(most
Code Snippets
from collections import Counter
import os
import random
def filter_text(text):
    """Flatten the whitespace of a decision text and strip the website's
    recurring boilerplate notices, returning the cleaned string."""
    # (pattern, replacement) pairs applied strictly in this order:
    # whitespace normalisation first, then the boilerplate removals.
    substitutions = (
        ('\n', ' '),
        ('\xa0', ''),
        (' ', ''),
        # Removing these automated notifications
        ('Wichtiger Hinweis:Diese Website wird in älteren Versionen von Netscape ohne graphische Elemente dargestellt. Die Funktionalität der Website ist aber trotzdem gewährleistet. Wenn Sie diese Website regelmässig benutzen, empfehlen wir Ihnen, auf Ihrem Computer einen aktuellen Browser zu installieren.Zurück zur Einstiegsseite Drucken Grössere Schrift', ''),
        ('Vorwärts ähnliche Leitentscheide suchenähnliche Urteile ab 2000 suchen Drucken nach oben', ''),
        ('Bundesgericht Tribunal fédéral Tribunale federale Tribunal federal', ''),
        ('Navigation Neue Suche Zurück zum Suchresultat Rang: Zurück 180', ''),
        ('Navigation Neue Suche Zurück zum Suchresultat Rang:1 ähnliche Leitentscheide suchenähnliche Urteile ab 2000 suchen Drucken nach oben', ''),
        (' ', ' '),
    )
    for pattern, replacement in substitutions:
        text = text.replace(pattern, replacement)
    return text
def create_string_from_files(files, root='txt_sr_de/txt_sr_de', sample=50):
    """Draw ``sample`` file names from ``files`` (with replacement, matching
    the original random.choice loop), read each file under ``root``, filter
    it with filter_text and return the concatenation as one long string.

    files  -- sequence of file names to draw from
    root   -- directory containing the files
    sample -- number of draws (with replacement)
    """
    # random.choice keeps the original draw-with-replacement semantics;
    # random.sample would draw without replacement and change behaviour.
    chosen = [random.choice(files) for _ in range(sample)]
    parts = []
    for name in chosen:
        # 'with' guarantees the handle is closed; the original opened one
        # file per draw and never closed any of them.
        with open(os.path.join(root, name), 'r') as input_file:
            parts.append(filter_text(input_file.read()))
    # ''.join is linear in the total text size; the original repeated
    # 'long_str + ...' concatenation was quadratic.
    return ''.join(parts)
def extract_most_common_sequence(text, minimum_occurences=3, times=3):
    """Return the longest substring of ``text`` that occurs at least
    ``minimum_occurences`` times, trying lengths 1 .. len(text)//times.

    Fixes two bugs in the original:
    - if even single characters repeat fewer than ``minimum_occurences``
      times, the original raised UnboundLocalError on ``seq``; this
      version returns '' instead;
    - if every tried length passed the threshold, the original fell off
      the loop and implicitly returned None; this version returns the
      last accepted sequence.
    (The pointless ``n -= 1`` before returning was also dropped.)
    """
    seq = ''  # best (longest accepted) sequence found so far
    for n in range(1, int(len(text) / times + 1)):
        # Count all substrings of length n; a generator keeps only one
        # substring alive at a time instead of materialising the list.
        freqs = Counter(text[i:i + n] for i in range(len(text) - n + 1))
        candidate, count = freqs.most_common(1)[0]
        if count < minimum_occurences:
            # Longer substrings can only be rarer, so stop searching.
            break
        seq = candidate
    return seq
def main(files, repeat=100):
    """Build ``repeat`` random long strings from ``files`` and collect
    the most common sequence extracted from each one."""
    return [
        extract_most_common_sequence(create_string_from_files(files))
        for _ in range(repeat)
    ]
if __name__ == '__main__':
import pickle
de_list_soz = pickle.load(open('de_list_soz.p', 'rb'))
str_seq_list = main(de_list_soz)
pickle.dump(str_seq_list, open('SOZIALRECHT_DE.p', 'wb'))
>>> import random
>>> random.sample('abcdefghi', 4)
['h', 'e', 'c', 'f']
def read_and_filter_file(filename):
with open(filename) as input_file:
text = input_file.read()
return filter_text(text)
def create_string_from_files(files, root='txt_sr_de/txt_sr_de', samples=50):
return ''.join(
read_and_filter_file(os.path.join(root, de))
for de in random.sample(files, samples)
)
def main(files, repeat=100):
return [
extract_most_common_sequence(create_string_from_files(files))
for _ in range(repeat)
]
>>> long_str = 'This is a test'
>>> n = 2
>>> [long_str[i:i+n] for i in range(len(long_str)-n+1)]
['Th', 'hi', 'is', 's ', ' i', 'is', 's ', ' a', 'a ', ' t', 'te', 'es', 'st']
>>> n = 4
>>> [long_str[i:i+n] for i in range(len(long_str)-n+1)]
['This', 'his ', 'is i', 's is', ' is ', 'is a', 's a ', ' a t', 'a te', ' tes', 'test']
Context
StackExchange Code Review Q#157132, answer score: 2
Revisions (0)
No revisions yet.