patternpythonMinor
Basic IMDb scraper and movie generator
Viewed 0 times
movieimdbgeneratorandscraperbasic
Problem
I just built my first scraper and I'd like to get your thoughts on the structure and the way I went about it. The basic premise of the script:
- Get a random movie from IMDb's Top 250
- Ask the user whether they've seen it before. If yes, print another random movie. If no, tell them to enjoy the movie and the script ends.
from lxml import html
import requests
def moviePicker():
hasSeen = False
while hasSeen == False:
import random
page = requests.get('http://www.imdb.com/chart/top')
tree = html.fromstring(page.content)
random = random.randint(1, 250)
title = tree.xpath('//*[@id="main"]/div/span/div/div/div[3]/table/tbody/tr[' + str(random) +']/td[2]/a/text()')
yearReleased = tree.xpath('//*[@id="main"]/div/span/div/div/div[3]/table/tbody/tr[' + str(random) + ']/td[2]/span/text()')
# Parse the movie's url and pull the summary from the details page
movieUrl = str(tree.xpath('//*[@id="main"]/div/span/div/div/div[3]/table/tbody/tr[' + str(random) + ']/td[2]/a/@href'))
removeFront = movieUrl.replace("['", "")
cleanUrl = 'http://www.imdb.com' + removeFront.replace("']", "")
moviepage = requests.get(cleanUrl)
details = html.fromstring(moviepage.content)
movieSummary = details.xpath("normalize-space(//div[@class='summary_text']/text())")
print(title)
print(yearReleased)
# print(cleanUrl)
print(movieSummary)
while True:
answer = str.upper((input("Have you already watched this movie? Enter Y or N. : ")))
if answer in ['Y', 'N']:
break
else:
print("Invalid input. Please enter a Y or a N. ")
continue
if answer == 'Y':
hasSeen = False
elif answer == 'N':
print("Enjoy the movie!")
hasSeen = True
moviePicker()Solution
The first thing that comes to attention is that you are repeating stuff unnecessarily.
Every loop, you try to
Importing
Downloading the charts again and again depends highly on the speed of your internet.
It is usually also good to separate your concerns into modular functions. This way you can easily swap out implementations or reuse them later.
Python has an official style-guide, PEP8, which programmers are encouraged to adhere to. It recommends using
I would also use
Doing this, your code becomes:
With this done, next I will focus on the actual implementation of these functions.
The parser you use is very manual. You have to pass the whole path to the element you are looking for. A more user-friendly parser is
It has a method called
I would shuffle the movies list and then iterate over the shuffled list, guaranteeing that a movie will not be picked twice (during one run of the script).
Every loop, you try to
import random and download the movie charts.Importing
random has only a small performance penalty, since Python will quickly notice that it is already imported, but there is some penalty nevertheless.Downloading the charts again and again depends highly on the speed of your internet.
It is usually also good to separate your concerns into modular functions. This way you can easily swap out implementations or reuse them later.
Python has an official style-guide, PEP8, which programmers are encouraged to adhere to. It recommends using
lower_case_with_underscores for variables and functions, instead of camelCase.I would also use
str.format to put the n into the path string and make the path strings themselves constants, which makes it easier to change them if IMDB ever changes their website.Doing this, your code becomes:
from lxml import html
import requests
import random
TITLE ='//*[@id="main"]/div/span/div/div/div[3]/table/tbody/tr[{}]/td[2]/a/text()'
YEAR = '//*[@id="main"]/div/span/div/div/div[3]/table/tbody/tr[{}]/td[2]/span/text()'
URL = '//*[@id="main"]/div/span/div/div/div[3]/table/tbody/tr[{}]/td[2]/a/@href'
def get_summary(url):
moviepage = requests.get(url)
details = html.fromstring(moviepage.content)
return details.xpath("normalize-space(//div[@class='summary_text']/text())")
def get_movie(tree, n):
title = tree.xpath(TITLE.format(n))
year = tree.xpath(YEAR.format(n))
url = str(tree.xpath(URL.format(n))).replace("['", "").replace("']", "")
url = 'http://www.imdb.com' + url
return title, year, url
def get_user_yn(message):
while True:
answer = input(message).upper()
if answer in ('Y', 'N'):
return answer == 'Y'
print("Invalid input. Please enter a Y or a N. ")
def movie_picker():
page = requests.get('http://www.imdb.com/chart/top')
tree = html.fromstring(page.content)
while True:
# Parse the movie's url and pull the summary from the details page
n = random.randint(1, 250)
title, year, url = get_movie(tree, n)
summary = get_summary(url)
print(title)
print(year)
print(summary)
if not get_user_yn("Have you already watched this movie? Enter Y or N. : "):
print("Enjoy the movie!")
break
if __name__ == '__main__':
movie_picker()With this done, next I will focus on the actual implementation of these functions.
The parser you use is very manual. You have to pass the whole path to the element you are looking for. A more user-friendly parser is
BeautifulSoup from the module bs4. You can install it on Ubuntu/other Debian derivatives via sudo apt install python-beautifulsoup.It has a method called
find which returns the first tag matching some criteria. So to find the title, we can just do soup.find("td", class_="titleColumn"). Or use the find_all command to get a list of all matching tags. You will see that the parsing is a lot easier to follow with this, compared to lxml.I would shuffle the movies list and then iterate over the shuffled list, guaranteeing that a movie will not be picked twice (during one run of the script).
from bs4 import BeautifulSoup
import requests
import random
def get_movies(url):
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
movies = soup.find_all("td", class_="titleColumn")
random.shuffle(movies)
return movies
def get_summary(url):
movie_page = requests.get(url)
soup = BeautifulSoup(movie_page.text, 'html.parser')
return soup.find("div", class_="summary_text").contents[0].strip()
def get_movie_info(movie):
title = movie.a.contents[0]
year = movie.span.contents[0]
url = 'http://www.imdb.com' + movie.a['href']
return title, year, url
def get_user_yn(message):
while True:
answer = input(message).upper()
if answer in ('Y', 'N'):
return answer == 'Y'
print("Invalid input. Please enter a Y or a N. ")
def movie_picker():
for movie in get_movies('http://www.imdb.com/chart/top'):
# Parse the movie's url and pull the summary from the details page
title, year, url = get_movie_info(movie)
summary = get_summary(url)
print(title, year)
print(summary)
if not get_user_yn("Have you already watched this movie? Enter Y or N. : "):
print("Enjoy the movie!")
break
if __name__ == '__main__':
movie_picker()Code Snippets
from lxml import html
import requests
import random
TITLE ='//*[@id="main"]/div/span/div/div/div[3]/table/tbody/tr[{}]/td[2]/a/text()'
YEAR = '//*[@id="main"]/div/span/div/div/div[3]/table/tbody/tr[{}]/td[2]/span/text()'
URL = '//*[@id="main"]/div/span/div/div/div[3]/table/tbody/tr[{}]/td[2]/a/@href'
def get_summary(url):
moviepage = requests.get(url)
details = html.fromstring(moviepage.content)
return details.xpath("normalize-space(//div[@class='summary_text']/text())")
def get_movie(tree, n):
title = tree.xpath(TITLE.format(n))
year = tree.xpath(YEAR.format(n))
url = str(tree.xpath(URL.format(n))).replace("['", "").replace("']", "")
url = 'http://www.imdb.com' + url
return title, year, url
def get_user_yn(message):
while True:
answer = input(message).upper()
if answer in ('Y', 'N'):
return answer == 'Y'
print("Invalid input. Please enter a Y or a N. ")
def movie_picker():
page = requests.get('http://www.imdb.com/chart/top')
tree = html.fromstring(page.content)
while True:
# Parse the movie's url and pull the summary from the details page
n = random.randint(1, 250)
title, year, url = get_movie(tree, n)
summary = get_summary(url)
print(title)
print(year)
print(summary)
if not get_user_yn("Have you already watched this movie? Enter Y or N. : "):
print("Enjoy the movie!")
break
if __name__ == '__main__':
movie_picker()from bs4 import BeautifulSoup
import requests
import random
def get_movies(url):
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
movies = soup.find_all("td", class_="titleColumn")
random.shuffle(movies)
return movies
def get_summary(url):
movie_page = requests.get(url)
soup = BeautifulSoup(movie_page.text, 'html.parser')
return soup.find("div", class_="summary_text").contents[0].strip()
def get_movie_info(movie):
title = movie.a.contents[0]
year = movie.span.contents[0]
url = 'http://www.imdb.com' + movie.a['href']
return title, year, url
def get_user_yn(message):
while True:
answer = input(message).upper()
if answer in ('Y', 'N'):
return answer == 'Y'
print("Invalid input. Please enter a Y or a N. ")
def movie_picker():
for movie in get_movies('http://www.imdb.com/chart/top'):
# Parse the movie's url and pull the summary from the details page
title, year, url = get_movie_info(movie)
summary = get_summary(url)
print(title, year)
print(summary)
if not get_user_yn("Have you already watched this movie? Enter Y or N. : "):
print("Enjoy the movie!")
break
if __name__ == '__main__':
movie_picker()Context
StackExchange Code Review Q#153942, answer score: 4
Revisions (0)
No revisions yet.