patternpythonMinor
Extracting keywords from 3 billion CSV records
Viewed 0 times
billioncsvrecordskeywordsextractingfrom
Problem
I am processing around 3 billion records using this piece of code. It's pretty slow; it would be really helpful if you could suggest better ways to do this.
```
'''
Created on 27-Mar-2015
@author: siddarth
'''
import rake
import multiprocessing as mp
import time
import os
import csv
filename = 'Large_Input_File.csv'
outName = 'Large_Output_File.csv'
BYTES_PER_MB = 1048576
# Wall-clock reference point; elapsed() reports time relative to this.
start = time.time()


def elapsed():
    """Return the number of seconds since the module-level ``start`` timestamp."""
    return time.time() - start
def fileparser_worker(filename, start, end, c):
    """Extract keywords from one slice of the input file.

    Worker function used to call rake. Each worker writes to its own
    output file, named ``outName + str(c)``.

    The input is read one line at a time, starting at offset ``start``,
    until at least ``end - start`` characters have been consumed (whole
    lines only). Streaming avoids holding the entire chunk in memory, which
    ``readlines(end - start)`` did. It also avoids the pitfall that a
    non-positive size hint makes ``readlines`` return the rest of the file.

    For every line, the third tab-separated field is passed to
    ``rake.execute_rake`` and the first element of each returned item is
    written on its own line.

    Args:
        filename: Path of the tab-separated input file.
        start: Offset to seek to before reading.
        end: Offset at which this worker's slice ends.
        c: Chunk counter, appended to ``outName`` to form the output name.
    """
    remaining = end - start
    # Both files are context-managed so the output is flushed and closed
    # even if processing raises part-way through.
    with open(outName + str(c), 'w') as outFile, open(filename) as inFile:
        inFile.seek(start)
        while remaining > 0:
            line = inFile.readline()
            if not line:
                # End of file reached before the slice was exhausted.
                break
            remaining -= len(line)

            fields = line.split('\t')
            if len(fields) < 3:
                # Row has no third column to extract keywords from.
                print('Index out of bound error due to NULL Values')
                continue

            try:
                keywords = rake.execute_rake(fields[2])
            except Exception as err:
                # Stay best-effort per row, but report the real cause and
                # let KeyboardInterrupt/SystemExit propagate.
                print('Keyword extraction failed: %s' % (err,))
                continue

            # One batched write per row instead of one write per keyword.
            outFile.writelines(keyword[0] + '\n' for keyword in keywords)
if __name__ == '__main__':
start = time.time()
chunk_start = 0
chunk_size = 512 * BYTES_PER_MB
chunk_end = 512 * BYTES_PER_MB
filesize = os.path.getsize(filename)
print '\n%.3fs: file has %s rows' % (elapsed(), filesize)
c = 0
pause = 0
iterations = (filesize / chunk_size) + 1
''' Chunk the file '''
print(iterations)
with open(filename) as inFile:
while c filesize:
chunk_end = filesize
else:
chunk_end = chunk_start + chunk_size
inFile.seek(chunk_end)
line = inFile.readline()
if line == '':
continue
else:
chunk_end = inFile.tell()
print("Start chunk",ch
```
'''
Created on 27-Mar-2015
@author: siddarth
'''
import rake
import multiprocessing as mp
import time
import os
import csv
filename = 'Large_Input_File.csv'
outName = 'Large_Output_File.csv'
BYTES_PER_MB = 1048576
# Wall-clock reference point; elapsed() reports time relative to this.
start = time.time()


def elapsed():
    """Return the number of seconds since the module-level ``start`` timestamp."""
    return time.time() - start
def fileparser_worker(filename, start, end, c):
    """Extract keywords from one slice of the input file.

    Worker function used to call rake. Each worker writes to its own
    output file, named ``outName + str(c)``.

    The input is read one line at a time, starting at offset ``start``,
    until at least ``end - start`` characters have been consumed (whole
    lines only). Streaming avoids holding the entire chunk in memory, which
    ``readlines(end - start)`` did. It also avoids the pitfall that a
    non-positive size hint makes ``readlines`` return the rest of the file.

    For every line, the third tab-separated field is passed to
    ``rake.execute_rake`` and the first element of each returned item is
    written on its own line.

    Args:
        filename: Path of the tab-separated input file.
        start: Offset to seek to before reading.
        end: Offset at which this worker's slice ends.
        c: Chunk counter, appended to ``outName`` to form the output name.
    """
    remaining = end - start
    # Both files are context-managed so the output is flushed and closed
    # even if processing raises part-way through.
    with open(outName + str(c), 'w') as outFile, open(filename) as inFile:
        inFile.seek(start)
        while remaining > 0:
            line = inFile.readline()
            if not line:
                # End of file reached before the slice was exhausted.
                break
            remaining -= len(line)

            fields = line.split('\t')
            if len(fields) < 3:
                # Row has no third column to extract keywords from.
                print('Index out of bound error due to NULL Values')
                continue

            try:
                keywords = rake.execute_rake(fields[2])
            except Exception as err:
                # Stay best-effort per row, but report the real cause and
                # let KeyboardInterrupt/SystemExit propagate.
                print('Keyword extraction failed: %s' % (err,))
                continue

            # One batched write per row instead of one write per keyword.
            outFile.writelines(keyword[0] + '\n' for keyword in keywords)
if __name__ == '__main__':
start = time.time()
chunk_start = 0
chunk_size = 512 * BYTES_PER_MB
chunk_end = 512 * BYTES_PER_MB
filesize = os.path.getsize(filename)
print '\n%.3fs: file has %s rows' % (elapsed(), filesize)
c = 0
pause = 0
iterations = (filesize / chunk_size) + 1
''' Chunk the file '''
print(iterations)
with open(filename) as inFile:
while c filesize:
chunk_end = filesize
else:
chunk_end = chunk_start + chunk_size
inFile.seek(chunk_end)
line = inFile.readline()
if line == '':
continue
else:
chunk_end = inFile.tell()
print("Start chunk",ch
Solution
def fileparser_worker(filename, start, end, c):
with open(filename) as inFile, open(outName + str(c),'w') as outFile:
inFile.seek(start)
#lines = inFile.readlines(end-start)
Because `readlines` calls `readline` multiple times, it can be replaced with `read`. If the text should be split by newline and each resulting line should then be split by a tab character, there are two ways to do it, depending on the input text.
for title in (line.split('\t') for line in inFile.read(end-start).splitlines()):
Here `title` is a list of strings. Another way is to call `str.split()` or `str.split(None)`, in which case the string is split on whitespace (space, tab or newline); this is best if it is OK to split on the space character, or if the text contains no space characters.
for title in inFile.read(end-start).split():
Here `title` is a string. If `read().split()` becomes a memory hog, I think a generator will give a better result.
#def get_title():
# yield next((line.split('\t') for line inFile.read(end-start).splitlines()))
for title in (line.split('\t') for line inFile.read(end-start).splitlines()):
try:
keywords = rake.execute_rake(title[2])
except:
print('Index out of bound error due to NULL Values')
continue
If we can build a sequence of strings out of `keywords`, we can call `writelines` directly.
else:
outFile.writelines((keyword[0] + '\n' for keyword in keywords))
Code Snippets
def fileparser_worker(filename, start, end, c):
with open(filename) as inFile, open(outName + str(c),'w') as outFile:
inFile.seek(start)
#lines = inFile.readlines(end-start)
for title in (line.split('\t') for line in inFile.read(end-start).splitlines()):
for title in inFile.read(end-start).split():
#def get_title():
# yield next((line.split('\t') for line inFile.read(end-start).splitlines()))
for title in (line.split('\t') for line inFile.read(end-start).splitlines()):
try:
keywords = rake.execute_rake(title[2])
except:
print('Index out of bound error due to NULL Values')
continue
else:
outFile.writelines((keyword[0] + '\n' for keyword in keywords))
Context
StackExchange Code Review Q#85217, answer score: 3
Revisions (0)
No revisions yet.