patternpythonMinor
Python automate job ads search
Viewed 0 times
searchautomatepythonjobads
Problem
I recently started working with Python and came up with the idea of automating the process of looking for new jobs. There is a nice website and I managed to get its content by rendering the page first, because they use Angular to render the webpage and using just `requests` gives no results.
The code below is functional, but a bit slow: it takes about 10-15 seconds to get the rendered page. The full process includes going through all job ads, getting the title, description, link, etc., and checking a local database to see whether the ad already exists. If not, a new mail is sent through Mailgun. Any suggestion is welcome.
```
#!/usr/bin/python
# -- coding: utf-8 --
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from bs4 import BeautifulSoup
import pdfkit
import requests
from db import DB
# Variables
base_url = "https://www.helloworld.rs"
links = []
def send_mail(title, body, attach):
    """Send a "new job" notification e-mail through the Mailgun HTTP API.

    Args:
        title: Job title; appended to the "New job: " subject prefix.
        body: Plain-text body of the message.
        attach: Path of the file uploaded as the message attachment.

    Returns:
        The response object returned by ``requests.post``.
    """
    # Open the attachment in binary mode and close it as soon as the upload
    # has finished; the handle was previously left open.
    with open(attach, "rb") as attachment:
        return requests.post(
            "https://api.mailgun.net/v3/MY_URL/messages",
            auth=("api", "MY-API-KEY"),
            files=[("attachment", attachment)],
            data={"from": 'Job Bot ',
                  "to": ["another@mail"],
                  "subject": "New job: " + title,
                  "text": body})
class Render(QWebPage):
    """Load ``url`` in a WebKit page and block until loading has finished.

    Constructing an instance creates a ``QApplication``, starts loading the
    URL and runs the Qt event loop.  When the ``loadFinished`` signal fires,
    the main frame is stored in ``self.frame`` and the event loop is stopped,
    so the constructor returns with the frame available to the caller.
    """

    def __init__(self, url):
        # Create the application object first, then initialise the page.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished() calls self.app.quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and stop the loop.

        ``result`` is the value passed by the signal; it is not inspected.
        """
        self.frame = self.mainFrame()
        self.app.quit()
# Full URL of the job listing of interest.
url = 'https://www.helloworld.rs/oglasi-za-posao/#/page=0&tag=&cat=&show_more=0&senioritet=1&vreme_postavljanja=&rok_konkursa=&jezik='
try:
    r = Render(url)
except Exception:
    # Catch only real errors (not KeyboardInterrupt/SystemExit) and report
    # the failure to the caller with a non-zero exit status.
    print("[!] Connection error!")
    sys.exit(1)
# HTML of the main frame as it stands once loading has finished.
result = r.frame.toHtml()
# NOTE(review): toAscii() presumably loses non-ASCII characters from the
# page text - confirm whether a UTF-8 conversion is wanted here instead.
formatted_result = str(result.toAscii())
bs = BeautifulSoup(formatted_result, 'lxml')
jobs = []
for job in bs.find_all('div', class_="job-item"):
_company
Using just `requests` gives no results, because the page is rendered using JavaScript. The code below is functional, but a bit slow: it takes about 10-15 seconds to get the rendered page. The full process includes going through all job ads, getting the title, description, link, etc., and checking a local database to see whether the ad already exists. If not, a new mail is sent through Mailgun. Any suggestion is welcome.
```
#!/usr/bin/python
# -- coding: utf-8 --
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from bs4 import BeautifulSoup
import pdfkit
import requests
from db import DB
# Variables
base_url = "https://www.helloworld.rs"
links = []
def send_mail(title, body, attach):
    """Send a "new job" notification e-mail through the Mailgun HTTP API.

    Args:
        title: Job title; appended to the "New job: " subject prefix.
        body: Plain-text body of the message.
        attach: Path of the file uploaded as the message attachment.

    Returns:
        The response object returned by ``requests.post``.
    """
    # Open the attachment in binary mode and close it as soon as the upload
    # has finished; the handle was previously left open.
    with open(attach, "rb") as attachment:
        return requests.post(
            "https://api.mailgun.net/v3/MY_URL/messages",
            auth=("api", "MY-API-KEY"),
            files=[("attachment", attachment)],
            data={"from": 'Job Bot ',
                  "to": ["another@mail"],
                  "subject": "New job: " + title,
                  "text": body})
class Render(QWebPage):
    """Load ``url`` in a WebKit page and block until loading has finished.

    Constructing an instance creates a ``QApplication``, starts loading the
    URL and runs the Qt event loop.  When the ``loadFinished`` signal fires,
    the main frame is stored in ``self.frame`` and the event loop is stopped,
    so the constructor returns with the frame available to the caller.
    """

    def __init__(self, url):
        # Create the application object first, then initialise the page.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished() calls self.app.quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and stop the loop.

        ``result`` is the value passed by the signal; it is not inspected.
        """
        self.frame = self.mainFrame()
        self.app.quit()
# Full URL of the job listing of interest.
url = 'https://www.helloworld.rs/oglasi-za-posao/#/page=0&tag=&cat=&show_more=0&senioritet=1&vreme_postavljanja=&rok_konkursa=&jezik='
try:
    r = Render(url)
except Exception:
    # Catch only real errors (not KeyboardInterrupt/SystemExit) and report
    # the failure to the caller with a non-zero exit status.
    print("[!] Connection error!")
    sys.exit(1)
# HTML of the main frame as it stands once loading has finished.
result = r.frame.toHtml()
# NOTE(review): toAscii() presumably loses non-ASCII characters from the
# page text - confirm whether a UTF-8 conversion is wanted here instead.
formatted_result = str(result.toAscii())
bs = BeautifulSoup(formatted_result, 'lxml')
jobs = []
for job in bs.find_all('div', class_="job-item"):
_company
Solution
Speeding up HTML Parsing
You can make some quick wins on the HTML parsing side by applying a SoupStrainer.
Don't forget to import SoupStrainer from bs4.
Speeding up Database Interaction
First of all, you should not be using str.format() to parameterize your query.
As far as improving the speed - you can actually use executemany().
Other performance improvements
I would probably enforce the uniqueness of a link on the database level.
You can make some quick wins on the HTML parsing side by applying a SoupStrainer class, which allows parsing only the part of the document containing the desired jobs:
parse_only = SoupStrainer('div', class_='job-item')
bs = BeautifulSoup(formatted_result, 'lxml', parse_only=parse_only)
jobs = []
for job in bs.select("div.job-item"):
    _company = job.strong
    try:
        company = _company.a.get_text()
    except AttributeError:
        company = _company.get_text()
    company = company.replace('\n', '').replace('\r', '').replace('\t', '')
    title = job.h3.a.string
    description = job.select_one("p.description").contents[0]
    link = base_url + job.select_one("a.job-link")["href"]
    jobs.append({
        "title": title,
        "description": description,
        "company": company,
        "link": link
    })
Don't forget to import SoupStrainer from bs4.
Speeding up Database Interaction
First of all, you should not be using str.format() to parameterize your query - it is highly discouraged. You are opening your code to SQL injections and have to deal with type conversions and quote balancing and escaping manually. Instead, properly parameterize your query via your database driver.
As far as improving the speed - you can actually use executemany() to insert jobs into the database in one go:
query = """
INSERT INTO
jobs (title, description, company, link)
VALUES
(%(title)s, %(description)s, %(company)s, %(link)s)
"""
db.executemany(query, jobs)
Other performance improvements
I would probably enforce the uniqueness of a link on the database level and handle/skip constraint violations on insert instead of doing it manually by grabbing the existing links first. If you are still going to continue with this approach, make sure to make links a set to improve the lookup speed.
Code Snippets
# Restrict parsing to the <div class="job-item"> elements of the page.
parse_only = SoupStrainer('div', class_='job-item')
bs = BeautifulSoup(formatted_result, 'lxml', parse_only=parse_only)

# Collect one dict per job ad; the keys match the placeholders in `query`.
jobs = []
for job in bs.select("div.job-item"):
    _company = job.strong
    try:
        company = _company.a.get_text()
    except AttributeError:
        # Raised when the attribute lookup above hits None (e.g. no <a>
        # inside the <strong> element): fall back to the element's own text.
        company = _company.get_text()
    company = company.replace('\n', '').replace('\r', '').replace('\t', '')

    title = job.h3.a.string
    description = job.select_one("p.description").contents[0]
    link = base_url + job.select_one("a.job-link")["href"]

    jobs.append({
        "title": title,
        "description": description,
        "company": company,
        "link": link
    })

# Parameterized INSERT; the values are supplied by the database driver.
query = """
INSERT INTO
jobs (title, description, company, link)
VALUES
(%(title)s, %(description)s, %(company)s, %(link)s)
"""
db.executemany(query, jobs)
Context
StackExchange Code Review Q#153871, answer score: 2
Revisions (0)
No revisions yet.