patternpythonMinor
Scrape points from CTF sites
Viewed 0 times
ctfpointssitesscrapefrom
Problem
I am relatively new to classes. This one uses one.
I am pretty sure this is not the correct way to do this. But I also do not know the correct way.
You can use the
Do I have too many comments? Are they too verbose?
I am pretty sure this is not the correct way to do this. But I also do not know the correct way.
You can use the `certified_secure` function when you make an instance of crawler that has the URL of hackthissite.org.
Do I have too many comments? Are they too verbose?
#!/usr/bin/env python
import bs4
import requests
users = ['user1', 'user2']
certified_secure_url = 'https://www.certifiedsecure.com/profile?alias='
hack_this_site_url = 'https://www.hackthissite.org/user/view/'
def get_num(string):
    """Return every run of consecutive digits in ``string`` as a list of ints.

    Any character that is not a digit acts as a separator, so
    ``"lvl 3, 120 xp"`` gives ``[3, 120]``. Signs and decimal points are not
    interpreted (``"-1.5"`` gives ``[1, 5]``), and a string without digits
    gives an empty list.
    """
    # Blank out everything that is not a digit, then split on whitespace so
    # each remaining token is one contiguous run of digits.
    digit_runs = ''.join(ch if ch.isdigit() else ' ' for ch in string).split()
    return [int(run) for run in digit_runs]
class Crawler(object):
    """Fetch a user's profile page from a CTF site and extract their points.

    ``url`` is a profile URL prefix; the user name is appended to it to build
    the page address. The instance does not check which site ``url`` points
    at, so the caller has to pick the extraction method matching the site.
    """

    def __init__(self, url, timeout=None):
        """Store the profile URL prefix and an optional request timeout.

        ``timeout`` is handed unchanged to ``requests.get``. The default of
        ``None`` keeps the previous behaviour (no timeout is applied).
        """
        self.url = url
        self.timeout = timeout

    def get_site_data(self, user):
        """Download the page at ``self.url + user`` and return it parsed with lxml."""
        response = requests.get(self.url + user, timeout=self.timeout)
        return bs4.BeautifulSoup(response.text, 'lxml')

    def certified_secure(self, user):
        """Return the points of ``user`` from a certifiedsecure.com profile page.

        Reads the text of the first ``.level_progress_details`` element and
        returns the second number found in it. Raises ``IndexError`` when the
        text holds fewer than two numbers; a page without a matching element
        is expected to fail the same way on the ``[0]`` lookup.
        """
        page = self.get_site_data(user)
        experience = page.select('.level_progress_details')[0].getText()
        # get the points from the string
        return get_num(experience)[1]

    def hack_this_site(self, user):
        """Return the points of ``user`` from a hackthissite.org profile page.

        Reads the text of the second ``.blight-td`` element and returns the
        first number found in it. Raises ``IndexError`` when that text holds
        no number; fewer than two matching elements is expected to fail the
        same way on the ``[1]`` lookup.
        """
        page = self.get_site_data(user)
        experience = page.select('.blight-td')[1].getText()
        return get_num(experience)[0]
# make two instances to crawl, one per site
cs = Crawler(certified_secure_url)
hts = Crawler(hack_this_site_url)

# Print each user's points: certifiedsecure first, then hackthissite.
# print(...) with a single argument behaves the same on Python 2 and 3.
for user in users:
    print(cs.certified_secure(user))
    print(hts.hack_this_site(user))
# Solution
You could use a list comprehension in
Also note that
You should also give your functions (and classes) a
It also seems a bit too manual to have to know which method to call depending on the url. Your crawler could decide that on its own:
# get_num:
def get_num(string):
    """Return all integers found in ``string`` as a list of ints.

    Every character that is not a digit separates two numbers, so
    ``"a1b22"`` gives ``[1, 22]``; a string without digits gives ``[]``.
    """
    # get the numbers from string: non-digits become spaces, then split
    numbers = ''.join(x if x.isdigit() else ' ' for x in string).split()
    # change to list of ints instead of strings
    return [int(number) for number in numbers]
# return map(int, numbers) # Alternative
Also note that `join` can take a generator expression, so there is no need to convert to a list first. I also chose more descriptive variable names here.
You should also give your functions (and classes) a docstring (by putting a triple-quoted `"""` string as the first line of the function body, as I did above), which you can access interactively via `help(function_name)` and which is used by many documentation-building tools.
It also seems a bit too manual to have to know which method to call depending on the URL. Your crawler could decide that on its own:
class Crawler(object):
    """Scrape a user's points from whichever known CTF site ``url`` belongs to.

    ``sites`` maps a domain to ``(css_selector, element_index, number_index)``:
    the selector to run on the profile page, which matched element to read,
    and which number inside that element's text to return.
    """

    sites = {"hackthissite.org": ('.blight-td', 1, 0),
             "certifiedsecure.com": ('.level_progress_details', 0, 1)}

    def __init__(self, url):
        """Store the profile URL prefix and look up the options for its site."""
        self.url = url
        self.options = self.get_sites_options(Crawler.sites)

    def get_sites_options(self, sites):
        """Return the options of the first key in ``sites`` contained in the URL.

        Returns ``None`` when no key occurs in ``self.url``.
        """
        for site, options in sites.items():
            # The key is a bare domain and the URL is the longer string, so
            # the test has to be "domain in URL"; the reverse never matches.
            if site in self.url:
                return options
        return None

    def get_site_data(self, user):
        """Download the page at ``self.url + user`` and return it parsed with lxml."""
        request = requests.get(self.url + user)
        return bs4.BeautifulSoup(request.text, 'lxml')

    def get_experience(self, user):
        """Return the points of ``user`` using the options selected for this site."""
        select_str, index, out_index = self.options
        experience = self.get_site_data(user).select(select_str)[index].getText()
        return get_num(experience)[out_index]
# Code Snippets
def get_num(string):
    """Return all integers found in ``string`` as a list of ints.

    Every character that is not a digit separates two numbers, so
    ``"a1b22"`` gives ``[1, 22]``; a string without digits gives ``[]``.
    """
    # get the numbers from string: non-digits become spaces, then split
    numbers = ''.join(x if x.isdigit() else ' ' for x in string).split()
    # change to list of ints instead of strings
    return [int(number) for number in numbers]
# return map(int, numbers) # Alternative
class Crawler(object):
    """Scrape a user's points from whichever known CTF site ``url`` belongs to.

    ``sites`` maps a domain to ``(css_selector, element_index, number_index)``:
    the selector to run on the profile page, which matched element to read,
    and which number inside that element's text to return.
    """

    sites = {"hackthissite.org": ('.blight-td', 1, 0),
             "certifiedsecure.com": ('.level_progress_details', 0, 1)}

    def __init__(self, url):
        """Store the profile URL prefix and look up the options for its site."""
        self.url = url
        self.options = self.get_sites_options(Crawler.sites)

    def get_sites_options(self, sites):
        """Return the options of the first key in ``sites`` contained in the URL.

        Returns ``None`` when no key occurs in ``self.url``.
        """
        for site, options in sites.items():
            # The key is a bare domain and the URL is the longer string, so
            # the test has to be "domain in URL"; the reverse never matches.
            if site in self.url:
                return options
        return None

    def get_site_data(self, user):
        """Download the page at ``self.url + user`` and return it parsed with lxml."""
        request = requests.get(self.url + user)
        return bs4.BeautifulSoup(request.text, 'lxml')

    def get_experience(self, user):
        """Return the points of ``user`` using the options selected for this site."""
        select_str, index, out_index = self.options
        experience = self.get_site_data(user).select(select_str)[index].getText()
        return get_num(experience)[out_index]
# Context
StackExchange Code Review Q#140753, answer score: 3
Revisions (0)
No revisions yet.