patternpythonModerate
Simple Python username scraper
Viewed 0 times
scraper simple python username
Problem
I started learning Python recently and I really like it, so I decided to share one of my first projects mainly in hopes of someone telling me what I can do to make it run faster (threading/multiprocessing?).
from requests import get
from bs4 import BeautifulSoup
from time import time
from re import compile
# Start-up banner: both lines are emitted by a single print call.
print(
    '***PYTHON LEAGUE OF LEGENDS USERNAME SCRAPER***',
    'This script scrapes usernames from lolprofile.net',
    sep='\n',
)

# Interactive configuration; the answers are kept as the raw strings typed in.
region = input('Enter the region for scraping(eune/euw/na/br/tr/kr/jp/lan/las/oce/ru)\n')
numStart = input('What page to start on? Min 0\n')
numEnd = input('What page to end on? Min starting page + 1\n')

# One placeholder entry is appended per saved username, so len(size) is the count.
size = list()
# count = -1  # for logging (disabled debug counter)
def setUrl(pageNum, region):
    """Point the module-level ``url`` at one leaderboard page.

    The address is stored in the global ``url`` rather than returned; the
    scraping loop fetches whatever ``url`` currently holds.

    Args:
        pageNum: Page number, as a string or an int (converted with ``str``).
        region: Region code that forms the path segment before the page.
    """
    global url
    url = 'http://lolprofile.net/leaderboards/' + region + '/' + str(pageNum)
def is_ascii(i):
    """Return True when every character of ``i`` has a code point below 128.

    An empty string counts as ASCII because there is no character to reject.
    """
    return all(ord(c) < 128 for c in i)
setUrl(numStart, region)
start = time()
# "<" rather than "!=": an end page at or below the start page now ends the
# run immediately instead of counting upwards forever.
while int(numStart) < int(numEnd):
    print(len(size))  # progress: usernames saved so far
    page = get(url)
    soup = BeautifulSoup(page.text, "lxml")
    # a.string is None when a link does not wrap exactly one string; skip
    # those so is_ascii() is never handed None.
    userName = [a.string for a in soup.findAll(href=compile('http://lolprofile.net/summoner/*'))
                if a.string is not None]
    with open('usernames1.txt', 'a') as file:
        for i in userName:
            # Keep only pure-ASCII names that contain no space.
            if is_ascii(i) and (' ' in i) == False:
                file.write('%s\n' % i.lower())
                size.append('0')
    # Advance to the next page and refresh the global url for the next fetch.
    numStart = int(numStart)
    numStart += 1
    setUrl(str(numStart), region)
    #count += 1
    #if count % 250 == 0: #every n iterations print progress
    #    print(len(size))
end = time()
print(len(size),'usernames scraped in a total of',end-start,'seconds')
Solution
If you're after speed, I'd suggest scrapy. I was looking for an excuse to try it out and saw your question. When I ran your code on the first 10 pages of the NA leaderboard, it took a little over 4 seconds. Running the below takes about 0.3 seconds, presumably due to initiating all the HTTP requests in parallel:
test.py:
class LolSpider(scrapy.Spider):
    """Spider that yields the link text of summoner-profile links.

    It visits pages 0 through 9 of the NA leaderboard.

    NOTE(review): this snippet uses ``scrapy`` without importing it, so
    test.py presumably needs ``import scrapy`` at the top - confirm.
    """

    name = 'lolspider'
    # Leaderboard pages 0 through 9 of the NA region.
    start_urls = ['http://lolprofile.net/leaderboards/na/{}'.format(page) for page in range(10)]

    def parse(self, response):
        """Yield one ``{'name': ...}`` item per text node under a summoner link."""
        # Anchors whose href matches the summoner-profile URL, then every
        # text node beneath them.
        for name in response.xpath('//a[re:test(@href, "http://lolprofile.net/summoner/")]//text()').extract():
            yield { 'name': name }


# Running:
$ scrapy runspider test.py -o names.json
names.json:
[
{"name": "<first name here>"},
{"name": "<second name here>"},
...
]
To actually provide some code review feedback:
import requests # I prefer this and then requests.get over "from requests import get", since "get" is too common a word
from bs4 import BeautifulSoup
import time # ditto here
import re # and here
# Start-up banner: both lines are emitted by a single print call.
print(
    '***PYTHON LEAGUE OF LEGENDS USERNAME SCRAPER***',
    'This script scrapes usernames from lolprofile.net',
    sep='\n',
)

region = input('Enter the region for scraping(eune/euw/na/br/tr/kr/jp/lan/las/oce/ru)\n')

# cast to int once here, straight after each answer is read
num_start = input('What page to start on? Min 0\n')
num_start = int(num_start)
num_end = input('What page to end on? Min starting page + 1\n')
num_end = int(num_end)  # ditto

size = 0  # use a simple count rather than a list
# Python style dictates snake case
# get the URL rather than set a global variable
def get_url(page_num, region):
    """Return the leaderboard URL for ``region`` at page ``page_num``.

    Args:
        page_num: Page number; any value ``str.format`` can render.
        region: Region code that forms the path segment before the page.
    """
    # use string formatting rather than concatenation
    return 'http://lolprofile.net/leaderboards/{}/{}'.format(region, page_num)
def is_ascii(i):
    """Return True when every character of ``i`` has a code point below 128.

    An empty string counts as ASCII because there is no character to reject.
    """
    return all(ord(c) < 128 for c in i)
start = time.time()
# Compile the link pattern once instead of on every page.
# /.* (slash and then anything) rather than /* (any number of slashes) in the regular expression;
# the dots in the host name are escaped so they only match a literal ".".
summoner_link = re.compile(r'http://lolprofile\.net/summoner/.*')
# for loop instead of while avoids the need to increment by hand.
# The end page is exclusive, in line with the "Min starting page + 1" prompt.
for page_num in range(num_start, num_end):
    url = get_url(page_num, region)
    print(size)  # progress: usernames saved so far
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser") # html.parser
    # a.string is None when a link does not wrap exactly one string; skip
    # those so is_ascii() is never handed None.
    user_names = [a.string for a in soup.findAll(href=summoner_link) if a.string is not None]
    with open('usernames1.txt', 'a') as file:
        for i in user_names:
            if is_ascii(i) and ' ' not in i: # not in
                file.write('%s\n' % i.lower())
                size += 1
end = time.time()
print('{} usernames scraped in a total of {} seconds.'.format(size, end-start))
Code Snippets
class LolSpider(scrapy.Spider):
    """Spider that yields the link text of summoner-profile links.

    It visits pages 0 through 9 of the NA leaderboard.

    NOTE(review): this snippet uses ``scrapy`` without importing it, so
    test.py presumably needs ``import scrapy`` at the top - confirm.
    """

    name = 'lolspider'
    # Leaderboard pages 0 through 9 of the NA region.
    start_urls = ['http://lolprofile.net/leaderboards/na/{}'.format(page) for page in range(10)]

    def parse(self, response):
        """Yield one ``{'name': ...}`` item per text node under a summoner link."""
        # Anchors whose href matches the summoner-profile URL, then every
        # text node beneath them.
        for name in response.xpath('//a[re:test(@href, "http://lolprofile.net/summoner/")]//text()').extract():
            yield { 'name': name }


# Shell command snippet:
#     $ scrapy runspider test.py -o names.json
#
# JSON output snippet:
#     [
#     {"name": "<first name here>"},
#     {"name": "<second name here>"},
#     ...
#     ]

import requests # I prefer this and then requests.get over "from requests import get", since "get" is too common a word
from bs4 import BeautifulSoup
import time # ditto here
import re # and here
# Start-up banner: both lines are emitted by a single print call.
print(
    '***PYTHON LEAGUE OF LEGENDS USERNAME SCRAPER***',
    'This script scrapes usernames from lolprofile.net',
    sep='\n',
)

region = input('Enter the region for scraping(eune/euw/na/br/tr/kr/jp/lan/las/oce/ru)\n')

# cast to int once here, straight after each answer is read
num_start = input('What page to start on? Min 0\n')
num_start = int(num_start)
num_end = input('What page to end on? Min starting page + 1\n')
num_end = int(num_end)  # ditto

size = 0  # use a simple count rather than a list
# Python style dictates snake case
# get the URL rather than set a global variable
def get_url(page_num, region):
    """Return the leaderboard URL for ``region`` at page ``page_num``.

    Args:
        page_num: Page number; any value ``str.format`` can render.
        region: Region code that forms the path segment before the page.
    """
    # use string formatting rather than concatenation
    return 'http://lolprofile.net/leaderboards/{}/{}'.format(region, page_num)
def is_ascii(i):
    """Return True when every character of ``i`` has a code point below 128.

    An empty string counts as ASCII because there is no character to reject.
    """
    return all(ord(c) < 128 for c in i)
start = time.time()
# Compile the link pattern once instead of on every page.
# /.* (slash and then anything) rather than /* (any number of slashes) in the regular expression;
# the dots in the host name are escaped so they only match a literal ".".
summoner_link = re.compile(r'http://lolprofile\.net/summoner/.*')
# for loop instead of while avoids the need to increment by hand.
# The end page is exclusive, in line with the "Min starting page + 1" prompt.
for page_num in range(num_start, num_end):
    url = get_url(page_num, region)
    print(size)  # progress: usernames saved so far
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser") # html.parser
    # a.string is None when a link does not wrap exactly one string; skip
    # those so is_ascii() is never handed None.
    user_names = [a.string for a in soup.findAll(href=summoner_link) if a.string is not None]
    with open('usernames1.txt', 'a') as file:
        for i in user_names:
            if is_ascii(i) and ' ' not in i: # not in
                file.write('%s\n' % i.lower())
                size += 1
end = time.time()
print('{} usernames scraped in a total of {} seconds.'.format(size, end-start))
Context
StackExchange Code Review Q#139663, answer score: 11
Revisions (0)
No revisions yet.