patternpythonMinor
HTML Scraper for Plex downloads page
Viewed 0 times
plexforpagehtmldownloadsscraper
Problem
I have written a scraper in Python 3 using Beautiful Soup 4 to retrieve the latest version of Plex Media Server from https://plex.tv, and I'd like some feedback on how to improve it.
The HTML the parser is to be used on can be found at here (the section of code that downloads the HTML itself is not included in the parser).
This is the first time I've written any type of scraper / parser, and I feel my current code is rather messy with some parts being partially redundant.
parser.py
```
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from os import path
import os
import re
from .version import PlexVersion
class PlexVersionParser(object):
def __init__(self, html):
self._html = html
self._soup = BeautifulSoup(self.html, 'html.parser')
self._versions = []
@property
def html(self):
return self._html
@property
def soup(self):
return self._soup
@property
def versions(self):
return self._versions
def _create_version(self, version_string, platform, name, address):
version_string = 'Unknown' if version_string is None else version_string
platform = 'Unknown' if platform is None else platform
name = 'Unknown' if name is None else name
address = 'Unknown' if address is None else address
version = PlexVersion(version_string, platform, name, address)
self.versions.append(version)
def _parse_download_link(self, platform, name, address):
platform = re.sub(r'^Plex Media Server for ', '', platform)
name = re.sub(r'^Download ?', '', name)
if len(name) == 0:
name = None
url = urlparse(address)
path_pieces = path.normpath(url.path).split(os.sep)
self._create_version(path_pieces[2], platform, name, address)
def _parse_download_links(self, title, links, prefix=None):
for link in links:
name = link.text if prefix is None else prefix + ' ' + link.t
The HTML the parser is to be used on can be found at here (the section of code that downloads the HTML itself is not included in the parser).
This is the first time I've written any type of scraper / parser, and I feel my current code is rather messy with some parts being partially redundant.
parser.py
```
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from os import path
import os
import re
from .version import PlexVersion
class PlexVersionParser(object):
def __init__(self, html):
self._html = html
self._soup = BeautifulSoup(self.html, 'html.parser')
self._versions = []
@property
def html(self):
return self._html
@property
def soup(self):
return self._soup
@property
def versions(self):
return self._versions
def _create_version(self, version_string, platform, name, address):
version_string = 'Unknown' if version_string is None else version_string
platform = 'Unknown' if platform is None else platform
name = 'Unknown' if name is None else name
address = 'Unknown' if address is None else address
version = PlexVersion(version_string, platform, name, address)
self.versions.append(version)
def _parse_download_link(self, platform, name, address):
platform = re.sub(r'^Plex Media Server for ', '', platform)
name = re.sub(r'^Download ?', '', name)
if len(name) == 0:
name = None
url = urlparse(address)
path_pieces = path.normpath(url.path).split(os.sep)
self._create_version(path_pieces[2], platform, name, address)
def _parse_download_links(self, title, links, prefix=None):
for link in links:
name = link.text if prefix is None else prefix + ' ' + link.t
Solution
They might have change the format in the meantime, at least I couldn't
find the correct invocation. Regardless, that doesn't we can't say
something about the code.
For your two irks:
it.
-
dropped. Even then I'd perhaps create a list and
create a helper function, e.g.:
or
Some other remarks:
and stick with it, even if external APIs don't adhere to your style.
constants such that they're only initialised once.
except for the
a class? I get that it might be useful to organise things, but all
this could be plain functions as well.
with the errors, i.e. indicate that a valid one consists of five
parts. Also consider
the code - in many cases some clever unpacking, or a functional
approach can result in drastically more compact code.
this SO post
to reuse some existing ones.
find the correct invocation. Regardless, that doesn't we can't say
something about the code.
For your two irks:
- If the
osclass requires it I don't see a particular problem with
it.
-
... if x is None else x - that's more compactly written as ... or
'Unknown' as long as it's fine that more falsy values will bedropped. Even then I'd perhaps create a list and
map over it, orcreate a helper function, e.g.:
def _create_version(self, version_string, platform, name, address):
self.versions.append(PlexVersion(*['Unknown' if x is None else x for x in [version_string, platform, name, address]))or
def _create_version(self, version_string, platform, name, address):
self.versions.append(PlexVersion(*[x or 'Unknown' for x in [version_string, platform, name, address]])Some other remarks:
- The names mix styles between camel-case and underscores - choose one
and stick with it, even if external APIs don't adhere to your style.
- If you really care enough to call
re.compile, move them into
constants such that they're only initialised once.
- The API is weird to me: Is there ever a reason that you wouldn't call
parse on this parser object? Do you really need access to fieldsexcept for the
versions results? Oh and the last bit: Why is thisa class? I get that it might be useful to organise things, but all
this could be plain functions as well.
- With regards to the
version.pyfile, consider being more descriptive
with the errors, i.e. indicate that a valid one consists of five
parts. Also consider
map(int, pieces[:3]) or something to shortenthe code - in many cases some clever unpacking, or a functional
approach can result in drastically more compact code.
- IMO the version classes are overkill, take a look at
this SO post
to reuse some existing ones.
Code Snippets
def _create_version(self, version_string, platform, name, address):
self.versions.append(PlexVersion(*['Unknown' if x is None else x for x in [version_string, platform, name, address]))def _create_version(self, version_string, platform, name, address):
self.versions.append(PlexVersion(*[x or 'Unknown' for x in [version_string, platform, name, address]])Context
StackExchange Code Review Q#117296, answer score: 2
Revisions (0)
No revisions yet.