pattern · python · Minor
Extract information from product pages
Viewed 0 times
product pages extract from information
Problem
I'm new to Python and have just started to learn the principles of object-oriented programming, so please don't judge too strictly.
This code is fully working, but some elements confuse me:
Initially, I did it this way in order to be able to easily call `Parse.id`, or whichever attribute I want.
But now I need to first create an instance of the class and then call the function `get_parse`.
I tried to do the same when declaring `__init__`.
```
from bs4 import BeautifulSoup
import urllib.request
import csv
class Parse:
k = 1
def __init__(self,k):
pass
def read_csv(k):
with open('/home/narnikgamarnik/PycharmProjects/my_phyton3_projects/products_links2.csv') as f:
r = csv.reader(f)
cont = [row for row in r]
d = (cont[k])[0]
return d
def get_url(d):
try:
url = urllib.request.urlopen(d)
except urllib.error.HTTPError as err:
if err.code == 404:
return False
else:
raise
return url
def get_title(url):
try:
soup = BeautifulSoup(url, 'html.parser')
ol = soup.find('ol', 'breadcrumb')
title = ol.find_all('li')[-1].string
except AttributeError:
return False
return title
def get_gender(url):
try:
soup = BeautifulSoup(url, 'html.parser')
ol = soup.find('ol', 'breadcrumb')
gender = ol.find_all('a')[0].string
except AttributeError:
return False
return gender
def get_cate
```
This code is fully working, but some elements confuse me:
- I don't understand what the `__init__` method needs to return.
- I don't like the `self.` in `get_parse` that much.
Initially, I did it this way in order to be able to easily call `Parse.id`, or whichever attribute I want. But now I need to first create an instance of the class and then call the function `get_parse`, passing `self` and the row number (`k`); only then do attributes such as `Parse.title` become available. I tried to do the same when declaring `__init__`, but nothing came of it, because at that stage the function may not be available yet (is this a peculiarity of the language? It does not seem to happen in IPython, or am I mistaken about that too?)
```
from bs4 import BeautifulSoup
import urllib.request
import csv
class Parse:
k = 1
def __init__(self,k):
pass
def read_csv(k):
with open('/home/narnikgamarnik/PycharmProjects/my_phyton3_projects/products_links2.csv') as f:
r = csv.reader(f)
cont = [row for row in r]
d = (cont[k])[0]
return d
def get_url(d):
try:
url = urllib.request.urlopen(d)
except urllib.error.HTTPError as err:
if err.code == 404:
return False
else:
raise
return url
def get_title(url):
try:
soup = BeautifulSoup(url, 'html.parser')
ol = soup.find('ol', 'breadcrumb')
title = ol.find_all('li')[-1].string
except AttributeError:
return False
return title
def get_gender(url):
try:
soup = BeautifulSoup(url, 'html.parser')
ol = soup.find('ol', 'breadcrumb')
gender = ol.find_all('a')[0].string
except AttributeError:
return False
return gender
def get_cate
```
Solution
Your biggest problem is that you are not using classes properly.
All class methods should have `self` as the first parameter.
Your second biggest problem is redundancy. All your `get_*` methods do something very similar.
You could write a function that does this dynamically, and store the data about what to use for each category (title, category, gender, ...) in some data structure, like a dict of `collections.namedtuple`.
The others might actually need separate methods.
All class methods should have `self` as the first parameter. `__init__` is the function that is called when an instance of a class is created (which you do not do!). In that case you also don't need to constantly reassign class attributes like `self.url`.
class Parser:
def __init__(self, file_name, k):
self.file_name = file_name
self.k = k
self.url = self.get_url(...)
...
def parse(self):
return self.read_csv()
def read_csv(self):
return self.file_name, self.k
parser = Parser('/home/narnikgamarnik/PycharmProjects/my_phyton3_projects/products_links2.csv', 1)
print parser.parse()
Your second biggest problem is redundancy. All your `get_*` methods do something very similar:
- Instantiate a `BeautifulSoup` object
- Find all `x` tags
- (Find all `y` tags)
- Return some part of those tags
You could write a function that does this dynamically, and store the data about what to use for each category (title, category, gender, ...) in some data structure, like a dict of `collections.namedtuple`:
import collections
Category = collections.namedtuple("Category", "pos element1 element2 method")
class Parse:
def __init__(self, k):
self.data = {'title': Category(-1, 'ol', 'li', 'breadcrumb'),
'gender': Category(0, 'ol', 'a', 'breadcrumb'),
'category': Category(1, 'ol', 'a', 'breadcrumb'),
'model': Category(2, 'ol', 'a', 'breadcrumb'),
'article': Category(0, 'p', 'span', 'product_code'),
'article2': Category(0, 'h1', 'span', '')}
def parse(self):
return [self.get(url, *data) for data in self.data]
def get(self, url, pos, element1, element2, method):
try:
soup = BeautifulSoup(url, 'html.parser')
if method:
elements = soup.find(element1, method)
else:
elements = soup.find(element1)
return elements.find_all(element2)[pos].string
except AttributeError:
return False
The others might actually need separate methods.
Code Snippets
class Parser:
    """Skeleton showing how instance state replaces re-passed arguments.

    This is an illustrative outline: ``get_url`` is called but not defined
    in this class as shown, and the ``...`` lines mark elided code.
    """

    def __init__(self, file_name, k):
        """Store the CSV path and the row index on the instance.

        Args:
            file_name: Path of the CSV file holding the product links.
            k: Row index within that file.
        """
        self.file_name, self.k = file_name, k
        # Placeholder call: the literal Ellipsis stands in for the real
        # argument(s) of ``get_url``.
        self.url = self.get_url(...)
        ...

    def parse(self):
        """Return the result of ``read_csv``."""
        result = self.read_csv()
        return result

    def read_csv(self):
        """Return the stored ``(file_name, k)`` pair."""
        return (self.file_name, self.k)
parser = Parser('/home/narnikgamarnik/PycharmProjects/my_phyton3_projects/products_links2.csv', 1)
print parser.parse()
import collections
Category = collections.namedtuple("Category", "pos element1 element2 method")
class Parse:
def __init__(self, k):
    """Build the lookup table that drives the generic ``get`` method.

    ``self.data`` maps a field name to a ``Category`` of
    ``(pos, element1, element2, method)``.

    Args:
        k: Accepted but not used by this constructor.
    """
    # One row per field: name, then the Category fields in declaration order.
    rows = (
        ('title', -1, 'ol', 'li', 'breadcrumb'),
        ('gender', 0, 'ol', 'a', 'breadcrumb'),
        ('category', 1, 'ol', 'a', 'breadcrumb'),
        ('model', 2, 'ol', 'a', 'breadcrumb'),
        ('article', 0, 'p', 'span', 'product_code'),
        ('article2', 0, 'h1', 'span', ''),
    )
    self.data = {name: Category(*spec) for name, *spec in rows}
def parse(self):
return [self.get(url, *data) for data in self.data]
def get(self, url, pos, element1, element2, method):
try:
soup = BeautifulSoup(url, 'html.parser')
if method:
elements = soup.find(element1, method)
else:
elements = soup.find(element1)
return elements.find_all(element2)[pos].string
except AttributeError:
return False
Context
StackExchange Code Review Q#140730, answer score: 5
Revisions (0)
No revisions yet.