patternpythonMinor
Scraping columns from SEDE results
Viewed 0 times
sedecolumnsscrapingresultsfrom
Problem
I use the following script to download the result of a SEDE query and scrape a specific column from it using
Unit tests:
```
import os
import unittest
from bs4 import BeautifulSoup
from robosanta.stackexchange.sede import extract_column
BASE_DIR = os.path.dirname(__file__)
SEDE_OUTPUT_HTML = os.path.join(BASE_DIR, 'sede-output.html')
POST_ID_COLUMN = 'Post Link'
DATE_COLUMN = 'CreationDate'
ROW_COUNT = 49
def new_soup():
with open(SEDE_OUTPUT_HTML) as fh:
return BeautifulSoup(fh)
class TestGetColumn(unittest.TestCase):
def extract_column(self, colname):
return extract_column(new_s
BeautifulSoup:import json
def extract_column(soup, colname):
"""
Returns a generator of cell values in selected column.
For simple columns like timestamp, a cell value can be simple,
for example: 1414433013197
For more complex columns like Post Link, a cell value can be an object,
for example:
{
"id": 68102,
"title": "Bash Script - File Comment out & Notate"
}
:param soup: a bs4 (BeautifulSoup) object
:param colname: name of the SEDE column to extract
:return: generator of cell values in selected column
"""
def get_column_index():
for index, info in enumerate(columns):
if info['name'] == colname:
return index
return -1
for script in soup.findAll('script'):
result_sets_col = 'resultSets'
if result_sets_col in script.text:
start = script.text.rindex('{', 0, script.text.index(result_sets_col))
end = script.text.index('}', script.text.index('querySetId')) + 1
data = json.loads(script.text[start:end])
results = data[result_sets_col][0]
columns = results['columns']
rows = results['rows']
column_index = get_column_index()
if column_index > -1:
for row in rows:
yield row[column_index]Unit tests:
```
import os
import unittest
from bs4 import BeautifulSoup
from robosanta.stackexchange.sede import extract_column
BASE_DIR = os.path.dirname(__file__)
SEDE_OUTPUT_HTML = os.path.join(BASE_DIR, 'sede-output.html')
POST_ID_COLUMN = 'Post Link'
DATE_COLUMN = 'CreationDate'
ROW_COUNT = 49
def new_soup():
with open(SEDE_OUTPUT_HTML) as fh:
return BeautifulSoup(fh)
class TestGetColumn(unittest.TestCase):
def extract_column(self, colname):
return extract_column(new_s
Solution
I'd test
if not result_sets_col in script.text and continue in that case, to remove a level of nesting:for script in soup.findAll('script'):
result_sets_col = 'resultSets'
if not result_sets_col in script.text:
continue
start = script.text.rindex('{', 0, script.text.index(result_sets_col))Code Snippets
for script in soup.findAll('script'):
result_sets_col = 'resultSets'
if not result_sets_col in script.text:
continue
start = script.text.rindex('{', 0, script.text.index(result_sets_col))Context
StackExchange Code Review Q#113654, answer score: 3
Revisions (0)
No revisions yet.