patternpythonMinor
Using BeautifulSoup to scrape various tables and combine in a .csv file
Viewed 0 times
Tags: tables, file, combine, csv, scrape, beautifulsoup
Problem
A page contains a table of links, each link contains a table relevant to the link (a subject). Create a list of these links to pass to the function called
I'm looking for some feedback/criticism/improvements to a piece of code I've written.
```
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import glob
import os
def scrapeTable(url):
    """Scrape the 'tablesorter' table from a subject page into <subject>.csv,
    then denormalise that CSV in place with pandas.melt.

    url -- page URL containing an <h4 class="otherTablesSubTitle"> header
           (used as the subject name / file name) and a
           <table class="tablesorter"> element.

    Side effects: writes and rewrites C:/<subject>.csv; no return value.
    """
    # BUG FIX: the original called s.get(url), but the session `s` is never
    # defined in this snippet — fall back to a plain requests.get.
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    # The page header supplies the subject name used for the output file.
    title = soup.find('h4', 'otherTablesSubTitle')
    subject_name = title.get_text(strip=True)
    # The data table is the one carrying the 'tablesorter' class.
    table = soup.find('table', {'class': 'tablesorter'})
    csv_path = os.path.join('C:/', subject_name + '.csv')
    # BUG FIX: open in text mode with newline='' (the csv-module convention)
    # instead of 'ab' — binary append is Python 2 style and also accumulates
    # stale rows from previous runs.
    with open(csv_path, 'w', newline='') as f:
        csvwriter = csv.writer(f)
        # BUG FIX: take <th> cells from THIS table only (the original scanned
        # the entire page), and write the header row exactly once (the
        # original rebuilt and rewrote it for every <tr> in the table).
        headers = [th.get_text(strip=True) for th in table.find_all('th')]
        # Some pages label the column 'Institution' rather than 'University';
        # normalise so every per-subject CSV shares the same schema.
        headers = ['University' if 'Institution' in h else h for h in headers]
        csvwriter.writerow(headers)
        for row in table.find_all('tr'):
            cells = [td.get_text() for td in row.find_all('td')]
            if cells:  # the header <tr> has no <td> cells — skip it
                csvwriter.writerow(cells)
    # Third column is the pd.melt id; every other column becomes a value.
    # BUG FIX: build the value list without mutating `headers` via pop().
    header_id = headers[2]
    value_headers = headers[:2] + headers[3:]
    # Denormalise the table and insert the subject name as the first column.
    df = pd.read_csv(csv_path)
    melted = pd.melt(df, id_vars=header_id, value_vars=value_headers,
                     var_name='Measure', value_name='Value')
    melted.insert(0, 'Subject', subject_name)
    melted.to_csv(csv_path, sep=',', index=False)
#
`scrapeTable`, which then takes the table and stores it in a CSV file. A directory of files is created for each subject; these files are then merged into one master file. I'm looking for some feedback/criticism/improvements to a piece of code I've written.
```
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import glob
import os
def scrapeTable(url):
    """Scrape the 'tablesorter' table from a subject page into <subject>.csv,
    then denormalise that CSV in place with pandas.melt.

    url -- page URL containing an <h4 class="otherTablesSubTitle"> header
           (used as the subject name / file name) and a
           <table class="tablesorter"> element.

    Side effects: writes and rewrites C:/<subject>.csv; no return value.
    """
    # BUG FIX: the original called s.get(url), but the session `s` is never
    # defined in this snippet — fall back to a plain requests.get.
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    # The page header supplies the subject name used for the output file.
    title = soup.find('h4', 'otherTablesSubTitle')
    subject_name = title.get_text(strip=True)
    # The data table is the one carrying the 'tablesorter' class.
    table = soup.find('table', {'class': 'tablesorter'})
    csv_path = os.path.join('C:/', subject_name + '.csv')
    # BUG FIX: open in text mode with newline='' (the csv-module convention)
    # instead of 'ab' — binary append is Python 2 style and also accumulates
    # stale rows from previous runs.
    with open(csv_path, 'w', newline='') as f:
        csvwriter = csv.writer(f)
        # BUG FIX: take <th> cells from THIS table only (the original scanned
        # the entire page), and write the header row exactly once (the
        # original rebuilt and rewrote it for every <tr> in the table).
        headers = [th.get_text(strip=True) for th in table.find_all('th')]
        # Some pages label the column 'Institution' rather than 'University';
        # normalise so every per-subject CSV shares the same schema.
        headers = ['University' if 'Institution' in h else h for h in headers]
        csvwriter.writerow(headers)
        for row in table.find_all('tr'):
            cells = [td.get_text() for td in row.find_all('td')]
            if cells:  # the header <tr> has no <td> cells — skip it
                csvwriter.writerow(cells)
    # Third column is the pd.melt id; every other column becomes a value.
    # BUG FIX: build the value list without mutating `headers` via pop().
    header_id = headers[2]
    value_headers = headers[:2] + headers[3:]
    # Denormalise the table and insert the subject name as the first column.
    df = pd.read_csv(csv_path)
    melted = pd.melt(df, id_vars=header_id, value_vars=value_headers,
                     var_name='Measure', value_name='Value')
    melted.insert(0, 'Subject', subject_name)
    melted.to_csv(csv_path, sep=',', index=False)
#
Solution
Code Style
HTML parsing
-
look into using CSS selectors which is, generally, a more concise way to locate the elements. For example, you can replace
with:
where
- follow the PEP8 style guide, in particular - fix the imports order, use the
`lower_case_with_underscores` variable and function naming style (e.g. `scrapeTable` and `CSVList` are violations)
HTML parsing
- I think you are over-using the
`.contents` attribute. Consider switching to `.get_text()`
-
look into using CSS selectors which is, generally, a more concise way to locate the elements. For example, you can replace
`table = soup.find('table', {'class': 'tablesorter'})` with `table = soup.select_one('table.tablesorter')`. Or, you can replace:
links = []
for anchor in soup.findAll('a', href=True):
if 'imported' in anchor['href']:
links.append('link' + anchor['href'])
with:
links = ['link' + anchor['href'] for anchor in soup.select("a[href*=imported]")]where
`*=` means "contains".
Code Snippets
links = []
for anchor in soup.findAll('a', href=True):
if 'imported' in anchor['href']:
links.append('link' + anchor['href'])
links = ['link' + anchor['href'] for anchor in soup.select("a[href*=imported]")]
Context
StackExchange Code Review Q#89956, answer score: 4
Revisions (0)
No revisions yet.