pattern · python · Minor
Extract text from news articles
Viewed 0 times
Tags: news, text, articles, extract, from
Problem
I have written a file to extract text from news articles. The program works as I want it to except it takes hours for it to write the output files. I have tried different buffering options as well as adding the data I want written to an array before writing that to the file. I have almost no experience programming so I might just be doing something really dumb.
The main issue with the program is when I'm printing ngrams.
```
import os, re
from collections import Counter
from math import sqrt
path = os.getcwd() + "/20_newsgroups"
#print path
def findFiles(path):
    """Recursively collect every file path under *path*, normalised to '/' separators."""
    collected = []
    for root, _dirs, filenames in os.walk(path):
        collected.extend(
            os.path.join(root, name).replace("\\", "/") for name in filenames
        )
    return collected
def cleanData():
    """Return the set of corpus file paths whose Subject header is not a reply.

    Files whose "Subject:" line starts with "Re:" (any case) are excluded.
    Returns a set of path strings (originally 6694 files survive the filter).
    """
    data = findFiles(path)
    remove = []
    for entry in data:
        # `with` guarantees the file handle is closed; the original leaked one
        # open handle per file. Only the Subject header matters, so scanning
        # stops at the first "subject: " line either way.
        with open(entry, 'r') as f:
            for line in f:
                header = line.lower()
                # The original tested the identical condition twice with `or`;
                # a single lowercase prefix test is equivalent.
                if header.startswith("subject: re:"):
                    remove.append(entry)
                    break
                elif header.startswith("subject: "):
                    break
    # Set difference directly; the original's list(set(...)) -> set(...) round
    # trip added nothing.
    return set(data) - set(remove)
d1 = {}
#d1.setdefault('test',[]).append('test2') ##this allows duplicate entries.
SetofUniqueWords = set()
def readInWords():
data = cleanData()
for entry in data:
The main issue with the program is when I'm printing ngrams.
d2 contains all of the items I want to print, and it prints them based on an index in a list. d2 is generated quickly; only the writing is slow.
```
import os, re
from collections import Counter
from math import sqrt
path = os.getcwd() + "/20_newsgroups"
#print path
def findFiles(path):
    """Recursively collect every file path under *path*, normalised to '/' separators."""
    collected = []
    for root, _dirs, filenames in os.walk(path):
        collected.extend(
            os.path.join(root, name).replace("\\", "/") for name in filenames
        )
    return collected
def cleanData():
    """Return the set of corpus file paths whose Subject header is not a reply.

    Files whose "Subject:" line starts with "Re:" (any case) are excluded.
    Returns a set of path strings (originally 6694 files survive the filter).
    """
    data = findFiles(path)
    remove = []
    for entry in data:
        # `with` guarantees the file handle is closed; the original leaked one
        # open handle per file. Only the Subject header matters, so scanning
        # stops at the first "subject: " line either way.
        with open(entry, 'r') as f:
            for line in f:
                header = line.lower()
                # The original tested the identical condition twice with `or`;
                # a single lowercase prefix test is equivalent.
                if header.startswith("subject: re:"):
                    remove.append(entry)
                    break
                elif header.startswith("subject: "):
                    break
    # Set difference directly; the original's list(set(...)) -> set(...) round
    # trip added nothing.
    return set(data) - set(remove)
d1 = {}
#d1.setdefault('test',[]).append('test2') ##this allows duplicate entries.
SetofUniqueWords = set()
def readInWords():
data = cleanData()
for entry in data:
Solution
A likely cause for the performance issues is the line `index = IL.index(item)` in printNGrams, because it does a linear search through the list of all ngrams. Use a dictionary that maps ngrams to indices instead of the list. In fact, an OrderedDict can replace both of the variables SetOfNGram and IL.

Other comments:
- Use `with` to open and automatically close files.
- Use string formatting instead of e.g. 'char' + str(n) + '.csv'.
- Use more descriptive variable names.
- Prefer to pass data in function arguments instead of global variables.
My rewrite of printNGrams follows. I would also change the names of d1 and d2, but I am not sure to what, as I did not study the rest of the code.

def printNGrams(n, d1):
d2 = {}
all_ngrams = collections.OrderedDict()
for key, value in d1.iteritems():
text = " ".join(value)
for i in range(0,len(text)-n+1):
ngram = text[i:i+n]
all_ngrams.setdefault(ngram, len(all_ngrams))
d2.setdefault(key,[]).append(ngram)
with open('char{}.csv'.format(n), 'w') as g:
for key in d1:
obj_id = ObjID[key]
ngrams = d2[key]
freqs = Counter(ngrams)
mag = sqrt(sum(freq**2 for freq in freqs.itervalues()))
for ngram, freq in freqs.iteritems():
index = all_ngrams[ngram]
g.write('{},{},{}\n'.format(obj_id, index, freq/mag))
with open('char{}.clabel'.format(n), 'w') as h:
for ngram in all_ngrams:
h.write(ngram+'\n')

Code Snippets
def printNGrams(n, d1):
    """Write char n-gram frequency vectors to char<n>.csv and the ngram
    labels, one per line in column order, to char<n>.clabel.

    d1 maps a document key to its list of words; ObjID (module global) maps
    the same key to the numeric id written in the csv.
    """
    doc_ngrams = {}
    # Insertion-ordered map: ngram -> column index, assigned on first sighting.
    ngram_index = collections.OrderedDict()
    for doc, words in d1.items():
        text = " ".join(words)
        for start in range(0, len(text) - n + 1):
            gram = text[start:start + n]
            ngram_index.setdefault(gram, len(ngram_index))
            doc_ngrams.setdefault(doc, []).append(gram)
    with open('char{}.csv'.format(n), 'w') as g:
        for doc in d1:
            obj_id = ObjID[doc]
            counts = Counter(doc_ngrams[doc])
            # Euclidean norm of the frequency vector, for length normalisation.
            mag = sqrt(sum(c ** 2 for c in counts.values()))
            for gram, c in counts.items():
                g.write('{},{},{}\n'.format(obj_id, ngram_index[gram], c / mag))
    with open('char{}.clabel'.format(n), 'w') as h:
        for gram in ngram_index:
            h.write(gram + '\n')
StackExchange Code Review Q#111077, answer score: 5
Revisions (0)
No revisions yet.