patternpythonMinor
Beginner Python OOP web scraper
Viewed 0 times
oopbeginnerpythonwebscraper
Problem
I have started learning OOP in Python. I know the OOP basics from Java, and apparently I have a problem with thinking in OOP in Python (and with using its best features and syntax). I have doubts about my OOP design for a web scraper for a local book review page. The basic idea is to scrape information about a book together with the user reviews of that book.
The structure of the Python package:
parsers.py:
```
from urllib.request import urlopen
from bs4 import BeautifulSoup
from bookscraper.book import Book
from bookscraper.comment import Comment
class Parser:
    """Base class that stores the identifying data of a scraped site.

    Attributes:
        url: Value passed as ``url`` (subclasses pass the site's base URL).
        name: Value passed as ``name`` (the site's display name).
        search_url: Value passed as ``search_url`` (the search URL template).
    """

    def __init__(self, url, name, search_url):
        """Store the three site descriptors unchanged on the instance."""
        self.url = url
        self.name = name
        self.search_url = search_url
class DatabazeKnih(Parser):
PAGE_URL = 'http://www.databazeknih.cz/'
NAME = 'Databáze knih'
SEARCH_URL = 'search?q={}&hledat=&stranka=search'
def __init__(self):
super().__init__(DatabazeKnih.PAGE_URL, DatabazeKnih.NAME, DatabazeKnih.SEARCH_URL)
def search(self, name):
self.search_html = urlopen(DatabazeKnih.PAGE_URL + DatabazeKnih.SEARCH_URL.format(name))
self.book_link = DatabazeKnih.PAGE_URL + self.get_book_link() + '?show=alldesc'
return self.get_book_info()
def get_book_page(self):
return urlopen(self.book_link)
def get_book_link(self):
soup = BeautifulSoup(self.search_html, 'html.parser')
return soup.find(type='book').attrs['href']
@staticmethod
def parse_comment(comment):
user = comment.find('a').string
text = comment.find('p', {'class': 'odtopm new2 justify'}).text
date = comment.find('span', {'class': 'pozn_lightest odleft_pet'}).string
return user, text, date
@staticmethod
def get_comments(soup):
comments_objects = []
comments = soup.find_all('div', {'class': 'komholdu'})
for comment in comments:
user, text, date = DatabazeKnih.parse_com
The structure of the Python package:
/bookscraper
__init__.py
book.py
comment.py
parsers.py
parsers.py:
```
from urllib.request import urlopen
from bs4 import BeautifulSoup
from bookscraper.book import Book
from bookscraper.comment import Comment
class Parser:
    """Base class that stores the identifying data of a scraped site.

    Attributes:
        url: Value passed as ``url`` (subclasses pass the site's base URL).
        name: Value passed as ``name`` (the site's display name).
        search_url: Value passed as ``search_url`` (the search URL template).
    """

    def __init__(self, url, name, search_url):
        """Store the three site descriptors unchanged on the instance."""
        self.url = url
        self.name = name
        self.search_url = search_url
class DatabazeKnih(Parser):
PAGE_URL = 'http://www.databazeknih.cz/'
NAME = 'Databáze knih'
SEARCH_URL = 'search?q={}&hledat=&stranka=search'
def __init__(self):
super().__init__(DatabazeKnih.PAGE_URL, DatabazeKnih.NAME, DatabazeKnih.SEARCH_URL)
def search(self, name):
self.search_html = urlopen(DatabazeKnih.PAGE_URL + DatabazeKnih.SEARCH_URL.format(name))
self.book_link = DatabazeKnih.PAGE_URL + self.get_book_link() + '?show=alldesc'
return self.get_book_info()
def get_book_page(self):
return urlopen(self.book_link)
def get_book_link(self):
soup = BeautifulSoup(self.search_html, 'html.parser')
return soup.find(type='book').attrs['href']
@staticmethod
def parse_comment(comment):
user = comment.find('a').string
text = comment.find('p', {'class': 'odtopm new2 justify'}).text
date = comment.find('span', {'class': 'pozn_lightest odleft_pet'}).string
return user, text, date
@staticmethod
def get_comments(soup):
comments_objects = []
comments = soup.find_all('div', {'class': 'komholdu'})
for comment in comments:
user, text, date = DatabazeKnih.parse_com
Solution
Use list comprehensions instead of creating empty array and appending to it in a for loop. It's both faster and more readable.
But I would argue that you don't need classes here at all.
My suggestions are.
Use the requests library; it's more Pythonic than urllib.
Use namedtuples instead of classes where possible. Easier to read and to write. If you need to add functionality you can inherit from namedtuple.
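Don't write one-liner get_this/get_that functions.
What you're doing with the function name get_authors can be done just as easily with a self-explanatory variable name.
Also, get_book_info suggests that something is being loaded; book_info is a more accurate name.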
@staticmethod
def get_comments(soup):
return [
Comment(*DatabazeKnih.parse_comment(comment))
for comment in soup.find_all('div', {'class': 'komholdu'})
]
But I would argue that you don't need classes here at all.
from bs4 import BeautifulSoup
from collections import namedtuple
import requests as r
# Base address of the scraped site; relative paths are appended to it.
PAGE_URL = 'http://www.databazeknih.cz/'

# Lightweight immutable records for the scraped data. The field order is the
# positional order used when the records are constructed.
Book = namedtuple(
    typename='Book',
    field_names='title rating text authors year comments',
)
Comment = namedtuple(
    typename='Comment',
    field_names='user text date',
)
def search(name, timeout=10):
    """Look up a book by name and return its scraped details.

    Requests the site's search page, takes the ``href`` of the first element
    matched by ``find(type='book')``, downloads that page with
    ``show=alldesc`` and hands its HTML to ``book_info``.

    Args:
        name: Search phrase. It is sent as the ``q`` query parameter, so
            ``requests`` handles the URL-encoding.
        timeout: Seconds to wait for each HTTP request. ``requests`` applies
            no timeout by default, so without one a stalled server would
            block the call indefinitely.

    Returns:
        Whatever ``book_info`` returns for the downloaded book page.

    Raises:
        requests.HTTPError: If either request answers with a 4xx/5xx status.
        AttributeError: If the search page has no matching element
            (``find`` returns ``None`` in that case).
    """
    search_response = r.get(
        PAGE_URL + 'search',
        params=dict(q=name, stranka='search'),
        timeout=timeout,
    )
    # Fail loudly on an error status instead of parsing an error page below.
    search_response.raise_for_status()
    soup = BeautifulSoup(search_response.text, 'html.parser')
    link = soup.find(type='book').attrs['href']

    book_response = r.get(
        PAGE_URL + link,
        params=dict(show='alldesc'),
        timeout=timeout,
    )
    book_response.raise_for_status()
    return book_info(book_response.text)
def book_info(book_page):
    """Build a ``Book`` record from the HTML of a book detail page.

    Args:
        book_page: HTML source of the page.

    Returns:
        ``Book(title, rating, text, authors, year, comments)``. ``authors``
        is a list with the ``.string`` of every link inside the
        ``itemprop='author'`` element, and ``comments`` is a list of
        ``Comment`` records, one per ``div`` with class ``komholdu``. The
        other fields are the ``.string`` of the matched elements.

    Raises:
        IndexError: If the page has no ``a`` element with class ``bpoints``.
        AttributeError: If one of the ``itemprop`` elements is missing
            (``find`` returns ``None`` in that case).
    """
    soup = BeautifulSoup(book_page, 'html.parser')

    title = soup.find(itemprop='name').string
    # ``find_all`` is the current spelling of the legacy ``findAll`` alias
    # and matches the other calls in this function. Indexing with ``[0]``
    # keeps the IndexError when no rating link exists.
    rating = soup.find_all('a', {'class': 'bpoints'})[0].string
    text = soup.find(itemprop='description').string
    authors = [elem.string for elem in soup.find(itemprop='author').find_all('a')]
    year = soup.find(itemprop='datePublished').string
    comments = [
        Comment(*parse_comment(elem))
        for elem in soup.find_all('div', {'class': 'komholdu'})
    ]
    return Book(title, rating, text, authors, year, comments)
def parse_comment(comment):
user = comment.find('a').string
text = comment.find('p', {'class': 'odtopm new2 justify'}).text
date = comment.find('span', {'class': 'pozn_lightest odleft_pet'}).string
return user, text, date
My suggestions are.
Use the requests library; it's more Pythonic than urllib.
Use namedtuples instead of classes where possible. Easier to read and to write. If you need to add functionality you can inherit from namedtuple.
Don't write one-liner get_this/get_that functions. They are useful when you're hiding some implementation, but hiding one or two lines is useless and even a bit less readable (when you need to see that implementation).
What you're doing with the function name get_authors you can do just as easily by giving a variable a self-explanatory name, like authors = .... Everybody will understand that the next one or two lines (if it's a comprehension) get the authors in some way, and they can read those lines if they want the details.
Also, get_book_info suggests that it's loading or getting something from somewhere. In Java it just means that this is some property, but in Python we have the @property decorator and don't need getters and setters, so get is used when you're loading something. Instead, it's more accurate to think of it as a pure function that takes HTML and outputs a Book. You can call it make_book_info, def book_info_from(html): or just book_info.
Code Snippets
@staticmethod
def get_comments(soup):
return [
Comment(*DatabazeKnih.parse_comment(comment))
for comment in soup.find_all('div', {'class': 'komholdu'})
]
from bs4 import BeautifulSoup
from collections import namedtuple
import requests as r
# Base address of the scraped site; relative paths are appended to it.
PAGE_URL = 'http://www.databazeknih.cz/'

# Lightweight immutable records for the scraped data. The field order is the
# positional order used when the records are constructed.
Book = namedtuple(
    typename='Book',
    field_names='title rating text authors year comments',
)
Comment = namedtuple(
    typename='Comment',
    field_names='user text date',
)
def search(name, timeout=10):
    """Look up a book by name and return its scraped details.

    Requests the site's search page, takes the ``href`` of the first element
    matched by ``find(type='book')``, downloads that page with
    ``show=alldesc`` and hands its HTML to ``book_info``.

    Args:
        name: Search phrase. It is sent as the ``q`` query parameter, so
            ``requests`` handles the URL-encoding.
        timeout: Seconds to wait for each HTTP request. ``requests`` applies
            no timeout by default, so without one a stalled server would
            block the call indefinitely.

    Returns:
        Whatever ``book_info`` returns for the downloaded book page.

    Raises:
        requests.HTTPError: If either request answers with a 4xx/5xx status.
        AttributeError: If the search page has no matching element
            (``find`` returns ``None`` in that case).
    """
    search_response = r.get(
        PAGE_URL + 'search',
        params=dict(q=name, stranka='search'),
        timeout=timeout,
    )
    # Fail loudly on an error status instead of parsing an error page below.
    search_response.raise_for_status()
    soup = BeautifulSoup(search_response.text, 'html.parser')
    link = soup.find(type='book').attrs['href']

    book_response = r.get(
        PAGE_URL + link,
        params=dict(show='alldesc'),
        timeout=timeout,
    )
    book_response.raise_for_status()
    return book_info(book_response.text)
def book_info(book_page):
    """Build a ``Book`` record from the HTML of a book detail page.

    Args:
        book_page: HTML source of the page.

    Returns:
        ``Book(title, rating, text, authors, year, comments)``. ``authors``
        is a list with the ``.string`` of every link inside the
        ``itemprop='author'`` element, and ``comments`` is a list of
        ``Comment`` records, one per ``div`` with class ``komholdu``. The
        other fields are the ``.string`` of the matched elements.

    Raises:
        IndexError: If the page has no ``a`` element with class ``bpoints``.
        AttributeError: If one of the ``itemprop`` elements is missing
            (``find`` returns ``None`` in that case).
    """
    soup = BeautifulSoup(book_page, 'html.parser')

    title = soup.find(itemprop='name').string
    # ``find_all`` is the current spelling of the legacy ``findAll`` alias
    # and matches the other calls in this function. Indexing with ``[0]``
    # keeps the IndexError when no rating link exists.
    rating = soup.find_all('a', {'class': 'bpoints'})[0].string
    text = soup.find(itemprop='description').string
    authors = [elem.string for elem in soup.find(itemprop='author').find_all('a')]
    year = soup.find(itemprop='datePublished').string
    comments = [
        Comment(*parse_comment(elem))
        for elem in soup.find_all('div', {'class': 'komholdu'})
    ]
    return Book(title, rating, text, authors, year, comments)
def parse_comment(comment):
user = comment.find('a').string
text = comment.find('p', {'class': 'odtopm new2 justify'}).text
date = comment.find('span', {'class': 'pozn_lightest odleft_pet'}).string
return user, text, date
Context
StackExchange Code Review Q#163274, answer score: 5
Revisions (0)
No revisions yet.