patternpythonMinor
Sanitizing words extracted from text files and writing them to a database
Viewed 0 times
extractedwordstextwritingdatabasefilessanitizingandfromthem
Problem
This code reads text files from some path, tokenizes, removes stop words, lowercases, removes punctuation and numbers, then writes the result to a database.
```
# -- coding: utf-8 --
from __future__ import print_function
import os, codecs, re, string, mysql
import mysql.connector
'''Reading files with txt extension'''
y_ = ""
for root, dirs, files in os.walk("/Users/Documents/source-document/part1"):
for file in files:
if file.endswith(".txt"):
x_ = codecs.open(os.path.join(root,file),"r", "utf-8-sig")
for lines in x_.readlines():
y_ = y_ + lines
#print(tokenized_docs)
'''Tokenizing sentences of the text files'''
from nltk.tokenize import sent_tokenize
raw_docs = sent_tokenize(y_)
tokenized_docs = [sent_tokenize(y_) for sent in raw_docs]
'''Removing stop words'''
stopword_removed_sentences = []
from nltk.corpus import stopwords
stopset = stopwords.words("English")
for i in tokenized_docs[0]:
tokenized_docs = ' '.join([word for word in i.split() if word not in stopset])
stopword_removed_sentences.append(tokenized_docs)
''' Removing punctuation marks'''
regex = re.compile('[%s]' % re.escape(string.punctuation))
nw = []
for review in stopword_removed_sentences:
new_review = ''
for token in review:
new_token = regex.sub(u'', token)
if not new_token == u'':
new_review += new_token
nw.append(new_review)
'''Lowercasing letters after removing puctuation marks.'''
lw = [] #lw stands for lowercase word.
for i in nw:
k = i.lower()
lw.append(k)
'''Removing number with a dummy symbol'''
nr = []
for j in lw:
string = j
regex = r'[^\[\]]+(?=\])'
# let "#" be the dummy symbol
output = re.sub(regex,'#',string)
nr.append(output)
nrfinal = []
for j in nr:
rem = 0
outr = ''
for i in j:
if ord(i)>= 48 and ord(i)<=57:
rem += 1
if rem == 1:
outr = outr+ '#'
else:
r
```
# -- coding: utf-8 --
from __future__ import print_function
import os, codecs, re, string, mysql
import mysql.connector
'''Reading files with txt extension'''
y_ = ""
for root, dirs, files in os.walk("/Users/Documents/source-document/part1"):
for file in files:
if file.endswith(".txt"):
x_ = codecs.open(os.path.join(root,file),"r", "utf-8-sig")
for lines in x_.readlines():
y_ = y_ + lines
#print(tokenized_docs)
'''Tokenizing sentences of the text files'''
from nltk.tokenize import sent_tokenize
raw_docs = sent_tokenize(y_)
tokenized_docs = [sent_tokenize(y_) for sent in raw_docs]
'''Removing stop words'''
stopword_removed_sentences = []
from nltk.corpus import stopwords
stopset = stopwords.words("English")
for i in tokenized_docs[0]:
tokenized_docs = ' '.join([word for word in i.split() if word not in stopset])
stopword_removed_sentences.append(tokenized_docs)
''' Removing punctuation marks'''
regex = re.compile('[%s]' % re.escape(string.punctuation))
nw = []
for review in stopword_removed_sentences:
new_review = ''
for token in review:
new_token = regex.sub(u'', token)
if not new_token == u'':
new_review += new_token
nw.append(new_review)
'''Lowercasing letters after removing puctuation marks.'''
lw = [] #lw stands for lowercase word.
for i in nw:
k = i.lower()
lw.append(k)
'''Removing number with a dummy symbol'''
nr = []
for j in lw:
string = j
regex = r'[^\[\]]+(?=\])'
# let "#" be the dummy symbol
output = re.sub(regex,'#',string)
nr.append(output)
nrfinal = []
for j in nr:
rem = 0
outr = ''
for i in j:
if ord(i)>= 48 and ord(i)<=57:
rem += 1
if rem == 1:
outr = outr+ '#'
else:
r
Solution
Memory usage
What I notice most, is that this is bound to use a lot of memory.
First you load all the files (at once) into memory.
Then you copy over (with modifications) the data into another variable. And another. And another. ...
This creates a few lists, all with the same total memory size as the entire documents you have read.
Ex-pen-sive.
Generators/Iterators to the rescue!
In Python, iterators are really nice. They allow something like continuations and lazy (just-in-time) evaluation. This makes memory usage a lot lower, at the (possible) cost of a bit more CPU processing. But I think in this case it will be a net saving overall: with far less memory in use, there will probably also be fewer cache misses.
Let's see what we can do about that.
The last step
The easiest step is to first make it a function.
Now this does not save us memory. But by rewriting it a little bit, it will become a generator.
The downside is that you can only iterate over
The inner loop
One thing that bothers me is the inner loop here.
raw_docs = sen
What I notice most, is that this is bound to use a lot of memory.
First you load all the files (at once) into memory.
Then you copy over (with modifications) the data into another variable. And another. And another. ...
This creates a few lists, all with the same total memory size as the entire documents you have read.
Ex-pen-sive.
Generators/Iterators to the rescue!
In Python, iterators are really nice. They allow something like continuations and lazy (just-in-time) evaluation. This makes memory usage a lot lower, at the (possible) cost of a bit more CPU processing. But I think in this case it will be a net saving overall: with far less memory in use, there will probably also be fewer cache misses.
Let's see what we can do about that.
The last step
nrfinal = []
for j in nr:
rem = 0
outr = ''
for i in j:
if ord(i)>= 48 and ord(i)<=57:
rem += 1
if rem == 1:
outr = outr+ '#'
else:
rem = 0
outr = outr+i
nrfinal.append(outr)
The easiest step is to first make it a function.
def calc_nrfinal(inp):
    """Collapse every run of ASCII digits in each string of *inp* into one '#'.

    Args:
        inp: iterable of strings (one sentence per item).

    Returns:
        A list with one transformed string per input item, in input order.
    """
    retval = []
    # Iterate over the argument, not the module-level ``nr``; otherwise the
    # parameter is silently ignored.
    for j in inp:
        pieces = []
        in_digit_run = False
        for i in j:
            if '0' <= i <= '9':
                # Only the first digit of a run emits the placeholder.
                if not in_digit_run:
                    pieces.append('#')
                in_digit_run = True
            else:
                in_digit_run = False
                pieces.append(i)
        # Join once instead of growing the string character by character.
        retval.append(''.join(pieces))
    return retval
nrfinal = calc_nrfinal(nr)
Now this does not save us memory. But by rewriting it a little bit, it will become a generator.
def calc_nrfinal(inp):
    """Lazily collapse every run of ASCII digits in each string into one '#'.

    Args:
        inp: iterable of strings (one sentence per item).

    Yields:
        One transformed string per input item, in input order. Being a
        generator, the result can be iterated only once.
    """
    # Iterate over the argument, not the module-level ``nr``; otherwise the
    # parameter is silently ignored.
    for j in inp:
        pieces = []
        previous_was_digit = False
        for i in j:
            is_digit = '0' <= i <= '9'
            if not is_digit:
                pieces.append(i)
            elif not previous_was_digit:
                # Only the first digit of a run emits the placeholder.
                pieces.append('#')
            previous_was_digit = is_digit
        yield ''.join(pieces)
nrfinal = calc_nrfinal(nr)
The downside is that you can only iterate over nrfinal once, but you only need to do it once.
The inner loop
One thing that bothers me is the inner loop here.
ord(i)>=49 and ord(i)
- The value of each of the lists is a string (
' '.join(...) must be a list).
Ok, for now I know enough. This tells me:
review is a string.
token is a 1-character substring of review.
Looking back to the troublesome code:
# Build a character class that matches any single character listed in string.punctuation.
regex = re.compile('[%s]' % re.escape(string.punctuation))
# nw collects one punctuation-free string per entry of stopword_removed_sentences.
nw = []
for review in stopword_removed_sentences:
new_review = ''
# Iterating over `review` yields one item at a time, so regex.sub runs once per item.
for token in review:
new_token = regex.sub(u'', token)
# Keep only items that are non-empty after the substitution.
if not new_token == u'':
new_review += new_token
nw.append(new_review)
This means we call regex.sub for all characters of review.
I have three different solutions:
new_review = regex.sub(u'', review)
Which reads very nice! Or
# Keep every character that is NOT punctuation (the filter must be negated to strip it).
new_review = ''.join(token for token in review if token not in string.punctuation)
Which is probably a bit more expensive due to the looping in Python (instead of C). Or,
# The text here is unicode (read via codecs.open), and unicode.translate takes a single
# mapping argument; mapping a code point to None deletes that character.
new_review = review.translate({ord(ch): None for ch in string.punctuation})
Which also saves us pre-compiling a regex. More readable, I think. Any are fine, but I think the review.translate is the easiest to look at.
Using that:
# -*- coding: utf-8 -*-
from __future__ import print_function
import os, codecs, re, string, mysql
import mysql.connector
'''Reading files with txt extension'''
# Collect each file's text and join once at the end instead of growing one
# string line by line.
file_texts = []
for root, dirs, files in os.walk("/Users/Documents/source-document/part1"):
    for file in files:
        if file.endswith(".txt"):
            # The context manager closes the handle even if decoding fails.
            with codecs.open(os.path.join(root, file), "r", "utf-8-sig") as x_:
                file_texts.append(x_.read())
# y_ holds the concatenated text of every .txt file found under the root.
y_ = "".join(file_texts)
'''Tokenizing sentences of the text files'''
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Tokenize the corpus exactly once. The previous list comprehension called
# sent_tokenize(y_) again for every sentence and then used only element [0],
# which was quadratic and raised IndexError when no sentence was found.
raw_docs = sent_tokenize(y_)

'''Removing stop words'''
# The stop-word fileid is lowercase; "English" only resolves on
# case-insensitive filesystems. A set makes each membership test O(1).
stopset = set(stopwords.words("english"))
# One space-joined string per sentence, with stop words dropped.
# NOTE(review): the comparison is case-sensitive and runs before lowercasing,
# so capitalised stop words such as "The" are kept -- confirm this is intended.
stopword_removed_sentences = [
    ' '.join(word for word in sentence.split() if word not in stopset)
    for sentence in raw_docs
]
# Lazy pipeline: each stage is a generator, so only one sentence is held in
# memory at a time and the chain can be consumed exactly once.

# The sentences are unicode (read via codecs.open); unicode.translate takes a
# single mapping argument, so translate(None, chars) would raise TypeError.
# Mapping a code point to None deletes that character.
punctuation_table = {ord(ch): None for ch in string.punctuation}
nw = (j.translate(punctuation_table) for j in stopword_removed_sentences)
# Lowercase after punctuation has been removed.
lw = (i.lower() for i in nw)
# Replace text that is followed by a closing square bracket with the dummy symbol '#'.
nr = (re.sub(r'[^\[\]]+(?=\])', '#', j) for j in lw)
# Collapse every run of digits into a single '#'.
nrfinal = (re.sub('[0-9]+', '#', j) for j in nr)
'''Inserting into database'''
def connect():
    """Insert every sentence from the module-level ``nrfinal`` into MySQL.

    Opens one connection, inserts one row per sentence into
    ``splitted_sentences`` and commits once at the end, so either all rows
    are stored or none are. The connection is closed even if an insert fails.

    Raises:
        mysql.connector.Error: if connecting or inserting fails.
    """
    conn = mysql.connector.connect(
        user = 'root',
        password = '',
        unix_socket = "/tmp/mysql.sock",
        database = 'Thesis',
    )
    try:
        cursor = conn.cursor()
        for j in nrfinal:
            # The old code read lastrowid from a brand-new cursor, which is
            # always None. Pass None explicitly: a reused cursor's lastrowid
            # would be the previous row's id.
            # NOTE(review): assumes sentence_id is AUTO_INCREMENT or nullable -- confirm.
            cursor.execute(
                """INSERT INTO splitted_sentences(sentence_id, splitted_sentences) VALUES(%s, %s)""",
                (None, j),
            )
        conn.commit()
    finally:
        conn.close()
if __name__ == '__main__':
connect()
Quadratic run-time in the number of sentences.
Look at this code:
``raw_docs = sen
Code Snippets
nrfinal = []
for j in nr:
rem = 0
outr = ''
for i in j:
if ord(i)>= 48 and ord(i)<=57:
rem += 1
if rem == 1:
outr = outr+ '#'
else:
rem = 0
outr = outr+i
nrfinal.append(outr)def calc_nrfinal(inp):
retval = []
for j in nr:
rem = 0
outr = ''
for i in j:
if ord(i)>= 48 and ord(i)<=57:
rem += 1
if rem == 1:
outr = outr+ '#'
else:
rem = 0
outr = outr+i
retval.append(outr)
return retval
nrfinal = calc_nrfinal(nr)def calc_nrfinal(inp):
for j in nr:
rem = 0
outr = ''
for i in j:
if ord(i)>= 48 and ord(i)<=57:
rem += 1
if rem == 1:
outr = outr+ '#'
else:
rem = 0
outr = outr+i
yield outr
nrfinal = calc_nrfinal(nr)outr = re.sub(r'[0-9]+', '#', j)def calc_nrfinal(inp):
for j in nr:
yield re.sub(r'[0-9]+', '#', j)
nrfinal = calc_nrfinal(nr)Context
StackExchange Code Review Q#131405, answer score: 4
Revisions (0)
No revisions yet.