pattern · python · Minor

Extracting keywords from 3 billion CSV records

Submitted by: @import:stackexchange-codereview·Mar 10, 2026·

Viewed 0 times

billion csv records keywords extracting from

Problem

I am processing around 3 billion records using this piece of code. It's pretty slow; it would be really helpful if you could suggest better ways to do this.

```
'''
Created on 27-Mar-2015
@author: siddarth
'''

import rake
import multiprocessing as mp
import time
import os
import csv

filename = 'Large_Input_File.csv'
outName = 'Large_Output_File.csv'
BYTES_PER_MB = 1048576

start = time.time()
def elapsed():
    """Return the seconds elapsed since the module-level ``start`` timestamp."""
    return time.time() - start

''' Worker function used to call rake
This method basically calls the rake class which returns a
list of keywords, also each process
writes to a different output file'''

def fileparser_worker(filename, start, end, c):
    """Extract keywords from one slice of the input file into its own output file.

    Seeks to byte offset ``start`` in ``filename``, reads lines using
    ``end - start`` as the size hint, splits each line on tabs and passes the
    third field to ``rake.execute_rake``. The first element of every returned
    keyword entry is written, one per line, to ``outName + str(c)``.

    Args:
        filename: Path of the tab-separated input file.
        start: Byte offset at which this worker starts reading.
        end: Byte offset marking the end of this worker's slice.
        c: Chunk counter, appended to ``outName`` to name the output file.
    """
    # Both files are managed by ``with`` so they are closed even when an
    # unexpected error escapes the loop; the output file is opened first,
    # matching the original order of side effects.
    with open(outName + str(c), 'w') as outFile, open(filename) as inFile:
        inFile.seek(start)
        for line in inFile.readlines(end - start):
            title = line.split('\t')
            try:
                keywords = rake.execute_rake(title[2])
            except Exception:
                # Still best-effort per line, but no longer swallows
                # KeyboardInterrupt/SystemExit like a bare ``except:`` would.
                print('Index out of bound error due to NULL Values')
                continue
            outFile.writelines(keyword[0] + '\n' for keyword in keywords)

if __name__ == '__main__':
start = time.time()
chunk_start = 0
chunk_size = 512 * BYTES_PER_MB
chunk_end = 512 * BYTES_PER_MB

filesize = os.path.getsize(filename)

print '\n%.3fs: file has %s rows' % (elapsed(), filesize)

c = 0
pause = 0
iterations = (filesize / chunk_size) + 1
''' Chunk the file '''
print(iterations)
with open(filename) as inFile:
while c < iterations:
if chunk_start + chunk_size > filesize:
chunk_end = filesize
else:
chunk_end = chunk_start + chunk_size

inFile.seek(chunk_end)
line = inFile.readline()

if line == '':
continue
else:
chunk_end = inFile.tell()

print("Start chunk",ch

Solution

def fileparser_worker(filename, start, end, c):
    with open(filename) as inFile, open(outName + str(c),'w') as outFile:
            inFile.seek(start)
            #lines = inFile.readlines(end-start)

Because readlines calls readline repeatedly, it can be replaced with a
single call to read.

If the text should be split into lines and each line should then be split on the tab character, there are two ways to do it, depending on the input text.

for title in (line.split('\t') for line in inFile.read(end-start).splitlines()):

Here title is a list of strings.

The other way is to call str.split() or str.split(None), in which case the string is split on any whitespace (space, tab or newline). This is best when it is acceptable to split on the space character, or when the text contains no spaces.

for title in inFile.read(end-start).split():

Here title is a string.

If read().split() becomes a memory hog, I think a generator will give
better results.

#def get_title():
    #   yield next((line.split('\t') for line inFile.read(end-start).splitlines()))

    for title in (line.split('\t') for line in inFile.read(end-start).splitlines()):
        try:
            keywords = rake.execute_rake(title[2])
        except:
            print('Index out of bound error due to NULL Values')
            continue

If we build a sequence of strings from the keywords, we can pass it
straight to writelines.

else:
            outFile.writelines((keyword[0] + '\n' for keyword in keywords))

Code Snippets

def fileparser_worker(filename, start, end, c):
    with open(filename) as inFile, open(outName + str(c),'w') as outFile:
            inFile.seek(start)
            #lines = inFile.readlines(end-start)

for title in (line.split('\t') for line in inFile.read(end-start).splitlines()):

for title in inFile.read(end-start).split():

#def get_title():
    #   yield next((line.split('\t') for line inFile.read(end-start).splitlines()))

    for title in (line.split('\t') for line in inFile.read(end-start).splitlines()):
        try:
            keywords = rake.execute_rake(title[2])
        except:
            print('Index out of bound error due to NULL Values')
            continue

else:
            outFile.writelines((keyword[0] + '\n' for keyword in keywords))

Context

StackExchange Code Review Q#85217, answer score: 3

Revisions (0)

No revisions yet.