patternpythonMinor
Process text file of fulltext articles from news publications
Viewed 0 times
newsfulltextfileprocesstextarticlespublicationsfrom
Problem
I'm relatively new to programming and have started writing some code for work. This script is something I wrote that I'd like to have someone comment on. Am I doing anything silly, stupid or totally unnecessary? Am I making things more complicated than they need to be?
This script takes a plain text file downloaded from a publication database (example file below) and processes it to create a csv of the metadata from all the articles and a separate text file for the full text of each article.
```
from sys import argv
import time
import csv
timestamp = time.strftime('%H%M-%Y%m%d')
class Article(object):
    """Holder for the text and metadata of a single article.

    Every attribute starts out as an empty string at class level.
    """

    # Body text of the article.
    fulltxt = ""
    # Metadata values; all blank until assigned.
    Title = Publication = Date = Year = Author = PQID = ""
# All article metadata goes into one CSV file named after the run timestamp.
fields = ['ID', 'Title', 'Author', 'Publication', 'Date', 'Year']
csvName = 'Metadata_' + timestamp + '.csv'
csvFile = open(csvName, 'wb')
csvwriter = csv.DictWriter(csvFile, fieldnames=fields, delimiter=',')
# Header row: every column name mapped to itself.
csvwriter.writerow(dict(zip(fields, fields)))
def abbreviate(pub):
    """Return the short publication code used in output file names.

    Publication titles that have no known code yield "X".
    """
    known_codes = dict([
        ('USA TODAY (pre-1997 Fulltext)', 'USAT'),
        ('USA TODAY', 'USAT'),
        ('New York Times', 'NYT'),
        ('Wall Street Journal', 'WSJ'),
    ])
    try:
        return known_codes[pub]
    except KeyError:
        return "X"
def writeMetadata(doc):
    """Append one CSV row holding the metadata of ``doc``.

    Nothing is written when ``doc.fulltxt`` is empty.

    The row is built from the writer's own column names instead of from
    ``vars(doc)``. The article keeps its identifier in ``PQID`` while the
    CSV column is called ``ID``; passing a ``PQID`` key to
    ``csv.DictWriter.writerow`` raises ``ValueError`` because it is not one
    of the fieldnames, and the ``ID`` column would otherwise stay blank.
    """
    if doc.fulltxt == "":
        return
    row = {}
    for column in csvwriter.fieldnames:
        if column == 'ID':
            # Prefer PQID; fall back to an ``ID`` attribute if one exists.
            row[column] = getattr(doc, 'PQID', getattr(doc, 'ID', ''))
        else:
            # Attributes missing from doc become empty cells.
            row[column] = getattr(doc, column, '')
    csvwriter.writerow(row)
def writeArticle(doc):
    """Append the title and full text of ``doc`` to its own text file.

    The file name is ``<abbreviation>_<Year>_<PQID>.txt``, where the
    abbreviation comes from ``abbreviate(doc.Publication)``. The file is
    opened in append mode, so existing content is kept.
    """
    name = abbreviate(doc.Publication) + '_' + doc.Year + '_' + doc.PQID + '.txt'
    # The with-block closes the file even when a write raises.
    with open(name, 'a') as txtFile:
        txtFile.write(doc.Title)
        txtFile.write(doc.fulltxt)
script, filename = argv
docs = open(filename)
readingtxt = False
for line in docs:
line = line.strip()
if line == '____________________________________________________________':
#write ou
This script takes a plain text file downloaded from a publication database (example file below) and processes it to create a csv of the metadata from all the articles and a separate text file for the full text of each article.
```
from sys import argv
import time
import csv
timestamp = time.strftime('%H%M-%Y%m%d')
class Article(object):
    """Holder for the text and metadata of a single article.

    Every attribute starts out as an empty string at class level.
    """

    # Body text of the article.
    fulltxt = ""
    # Metadata values; all blank until assigned.
    Title = Publication = Date = Year = Author = PQID = ""
# All article metadata goes into one CSV file named after the run timestamp.
fields = ['ID', 'Title', 'Author', 'Publication', 'Date', 'Year']
csvName = 'Metadata_' + timestamp + '.csv'
csvFile = open(csvName, 'wb')
csvwriter = csv.DictWriter(csvFile, fieldnames=fields, delimiter=',')
# Header row: every column name mapped to itself.
csvwriter.writerow(dict(zip(fields, fields)))
def abbreviate(pub):
    """Return the short publication code used in output file names.

    Publication titles that have no known code yield "X".
    """
    known_codes = dict([
        ('USA TODAY (pre-1997 Fulltext)', 'USAT'),
        ('USA TODAY', 'USAT'),
        ('New York Times', 'NYT'),
        ('Wall Street Journal', 'WSJ'),
    ])
    try:
        return known_codes[pub]
    except KeyError:
        return "X"
def writeMetadata(doc):
    """Append one CSV row holding the metadata of ``doc``.

    Nothing is written when ``doc.fulltxt`` is empty.

    The row is built from the writer's own column names instead of from
    ``vars(doc)``. The article keeps its identifier in ``PQID`` while the
    CSV column is called ``ID``; passing a ``PQID`` key to
    ``csv.DictWriter.writerow`` raises ``ValueError`` because it is not one
    of the fieldnames, and the ``ID`` column would otherwise stay blank.
    """
    if doc.fulltxt == "":
        return
    row = {}
    for column in csvwriter.fieldnames:
        if column == 'ID':
            # Prefer PQID; fall back to an ``ID`` attribute if one exists.
            row[column] = getattr(doc, 'PQID', getattr(doc, 'ID', ''))
        else:
            # Attributes missing from doc become empty cells.
            row[column] = getattr(doc, column, '')
    csvwriter.writerow(row)
def writeArticle(doc):
    """Append the title and full text of ``doc`` to its own text file.

    The file name is ``<abbreviation>_<Year>_<PQID>.txt``, where the
    abbreviation comes from ``abbreviate(doc.Publication)``. The file is
    opened in append mode, so existing content is kept.
    """
    name = abbreviate(doc.Publication) + '_' + doc.Year + '_' + doc.PQID + '.txt'
    # The with-block closes the file even when a write raises.
    with open(name, 'a') as txtFile:
        txtFile.write(doc.Title)
        txtFile.write(doc.fulltxt)
script, filename = argv
docs = open(filename)
readingtxt = False
for line in docs:
line = line.strip()
if line == '____________________________________________________________':
#write ou
Solution
Constants are
Passing to more serious comments:
^^ That was a real puzzler; I now understand that the slicing is there to avoid saving things like
The same result as what you did, but more intuitive.
In fact you should use a loop:
^ The loop above may need other modifications to your code before working properly (not tested).
Possibly, you may stop using the
And just go for a plain
Use
Should become
As it automatically closes the file.
Boring == possibly wrong
^^ Counting whether the line above has the correct number of underscores is boring, so I will not do it (which is the correct number, anyway?). You should do:
Constants are CAPITAL as a convention, so:
TIMESTAMP = time.strftime('%H%M-%Y%m%d')
CSV_NAME = 'Metadata_' + timestamp + '.csv' # <- added some spaces
FIELDS = ['ID', 'Title', 'Author', 'Publication', 'Date', 'Year']
Passing to more serious comments:
elif line.startswith("Title:"):
doc.Title = line[6:]
elif line.startswith("Publication title:"):
doc.Publication = line[19:]
elif line.startswith("Author:"):
doc.Author = line[8:]
elif line.startswith("Publication date:"):
doc.Date = line[18:]
elif line.startswith("ProQuest document ID:"):
doc.PQID = line[22:]
elif line.startswith("Publication year:"):
doc.Year = line[18:]
^^ That was a real puzzler; I now understand that the slicing is there to avoid saving things like
"ProQuest document ID:" or "Publication year:". I suggest just using split and [1], like this:
elif line.startswith("Publication year:"):
doc.Year = line.split("Publication year:")[1]
The same result as what you did, but more intuitive.
In fact you should use a loop:
for salient_part in ("Title", "Publication Title:" ...):
if line.startswith(salient_part):
doc[salient_part] = line.split(salient_part)[1]
^ The loop above may need other modifications to your code before working properly (not tested).
Possibly, you may stop using the
class Article(object):
fulltxt = ""
Title = ""
Publication = ""
Date = ""
Year = ""
Author = ""
ID = ""
And just go for a plain dict; that sure would make the loop simple to implement.
Use with:
txtFile = open(name, 'a')
txtFile.write(Article.Title)
txtFile.write(Article.fulltxt)
txtFile.close()
Should become
with open(name, 'a') as f:
f.write(Article.Title)
f.write(Article.fulltxt)
As it automatically closes the file.
Boring == possibly wrong
if line == '____________________________________________________________':
^^ Counting whether the line above has the correct number of underscores is boring, so I will not do it (which is the correct number, anyway?). You should do:
if line == '_' * NUMBER_OF_UNDERSCORES_NEEDED:
Code Snippets
TIMESTAMP = time.strftime('%H%M-%Y%m%d')
CSV_NAME = 'Metadata_' + timestamp + '.csv' # <- added some spaces
FIELDS = ['ID', 'Title', 'Author', 'Publication', 'Date', 'Year']

elif line.startswith("Title:"):
doc.Title = line[6:]
elif line.startswith("Publication title:"):
doc.Publication = line[19:]
elif line.startswith("Author:"):
doc.Author = line[8:]
elif line.startswith("Publication date:"):
doc.Date = line[18:]
elif line.startswith("ProQuest document ID:"):
doc.PQID = line[22:]
elif line.startswith("Publication year:"):
doc.Year = line[18:]

elif line.startswith("Publication year:"):
doc.Year = line.split("Publication year:")[1]

for salient_part in ("Title", "Publication Title:" ...):
if line.startswith(salient_part):
doc[salient_part] = line.split(salient_part)[1]

class Article(object):
fulltxt = ""
Title = ""
Publication = ""
Date = ""
Year = ""
Author = ""
ID = ""
Context
StackExchange Code Review Q#105478, answer score: 5
Revisions (0)
No revisions yet.