patternpythonMinor
First program with scraping, lists, string manipulation
Viewed 0 times
Keywords: first program, scraping, lists, string manipulation
Problem
I wanted to find out which states and cities the USA hockey team was from, but I didn't want to manually count from the roster site here.
I'm really interested to see if someone has a more elegant way to do what I've done (which feels like glue and duct tape) for future purposes. I read about 12 different Stack Overflow questions to get here.
I essentially did a three tier loop for
I'm really interested to see if someone has a more elegant way to do what I've done (which feels like glue and duct tape) for future purposes. I read about 12 different Stack Overflow questions to get here.
from bs4 import BeautifulSoup
from collections import Counter
import urllib2
url='http://olympics.usahockey.com/page/show/1067902-roster'
page=urllib2.urlopen(url)
soup = BeautifulSoup(page.read())
locations = []
city = []
state = []
counter = 0
tables = soup.findAll("table", { "class" : "dataTable" })
for table in tables:
rows = table.findAll("tr")
for row in rows:
entries = row.findAll("td")
for entry in entries:
counter = counter + 1
if counter == 7:
locations.append(entry.get_text().encode('ascii'))
counter = 0
for i in locations:
splitter = i.split(", ")
city.append(splitter[0])
state.append(splitter[1])
print Counter(state)
print Counter(city)
I essentially did a three tier loop for
table->tr->td, and then used a counter to grab the 7th column and added it to a list. Then I iterated through the list splitting the first word to one list, and the second word to a second list. Then ran it through Counter to print the cities and states. I get a hunch this could be done a lot simpler, curious for opinions.

Solution
-
You can use enumerate in order not to play with
-
-
You can use unpacking when splitting the string
For instance :
This is pretty much what my code is like at this stage :
Now, because of the way American cities are named, it might be worth counting the cities while keeping their state into account (because Portland, OR and Portland, ME are not quite the same city). Thus, it might be worth storing information about city and state as a tuple.
This is how I'd do it :
Also, something I have forgotten but might be useful to you is to use defaultdict if all you want is to count elements.
You can use enumerate in order not to play with
counter.

for counter,entry in enumerate(entries):
if counter == 6:
locations.append(entry.get_text().encode('ascii'))
-
states and cities would probably be a better name for collections of states/cities.
-
You can use unpacking when splitting the string
For instance :
for i in locations:
city, state = i.split(", ")
cities.append(city)
states.append(state)
- You don't need to store locations in a list and then iterate over the locations, you can handle them directly without storing them.
This is pretty much what my code is like at this stage :
#!/usr/bin/python
from bs4 import BeautifulSoup
from collections import Counter
import urllib2
url='http://olympics.usahockey.com/page/show/1067902-roster'
soup = BeautifulSoup(urllib2.urlopen(url).read())
cities = []
states = []
for table in soup.findAll("table", { "class" : "dataTable" }):
for row in table.findAll("tr"):
for counter,entry in enumerate(row.findAll("td")):
if counter == 6:
city, state = entry.get_text().encode('ascii').split(", ")
cities.append(city)
states.append(state)
print Counter(states)
print Counter(cities)
Now, because of the way American cities are named, it might be worth counting the cities while keeping their state into account (because Portland, OR and Portland, ME are not quite the same city). Thus, it might be worth storing information about city and state as a tuple.
This is how I'd do it :
#!/usr/bin/python
from bs4 import BeautifulSoup
from collections import Counter
import urllib2
url='http://olympics.usahockey.com/page/show/1067902-roster'
soup = BeautifulSoup(urllib2.urlopen(url).read())
cities = []
for table in soup.findAll("table", { "class" : "dataTable" }):
for row in table.findAll("tr"):
for counter,entry in enumerate(row.findAll("td")):
if counter == 6:
city, state = entry.get_text().encode('ascii').split(", ")
cities.append((state,city))
print Counter(cities)
print Counter(state for state,city in cities)
Also, something I have forgotten but might be useful to you is to use defaultdict if all you want is to count elements.
from collections import defaultdict
url='http://olympics.usahockey.com/page/show/1067902-roster'
soup = BeautifulSoup(urllib2.urlopen(url).read())
cities = defaultdict(int)
for table in soup.findAll("table", { "class" : "dataTable" }):
for row in table.findAll("tr"):
for counter,entry in enumerate(row.findAll("td")):
if counter == 6:
city, state = entry.get_text().encode('ascii').split(", ")
cities[state,city] += 1Code Snippets
for counter,entry in enumerate(entries):
if counter == 6:
locations.append(entry.get_text().encode('ascii'))for i in locations:
city, state = i.split(", ")
cities.append(city)
states.append(state)#!/usr/bin/python
from bs4 import BeautifulSoup
from collections import Counter
import urllib2
url='http://olympics.usahockey.com/page/show/1067902-roster'
soup = BeautifulSoup(urllib2.urlopen(url).read())
cities = []
states = []
for table in soup.findAll("table", { "class" : "dataTable" }):
for row in table.findAll("tr"):
for counter,entry in enumerate(row.findAll("td")):
if counter == 6:
city, state = entry.get_text().encode('ascii').split(", ")
cities.append(city)
states.append(state)
print Counter(states)
print Counter(cities)#!/usr/bin/python
from bs4 import BeautifulSoup
from collections import Counter
import urllib2
url='http://olympics.usahockey.com/page/show/1067902-roster'
soup = BeautifulSoup(urllib2.urlopen(url).read())
cities = []
for table in soup.findAll("table", { "class" : "dataTable" }):
for row in table.findAll("tr"):
for counter,entry in enumerate(row.findAll("td")):
if counter == 6:
city, state = entry.get_text().encode('ascii').split(", ")
cities.append((state,city))
print Counter(cities)
print Counter(state for state,city in cities)from collections import defaultdict
url='http://olympics.usahockey.com/page/show/1067902-roster'
soup = BeautifulSoup(urllib2.urlopen(url).read())
cities = defaultdict(int)
for table in soup.findAll("table", { "class" : "dataTable" }):
for row in table.findAll("tr"):
for counter,entry in enumerate(row.findAll("td")):
if counter == 6:
city, state = entry.get_text().encode('ascii').split(", ")
cities[state,city] += 1Context
StackExchange Code Review Q#42727, answer score: 3
Revisions (0)
No revisions yet.