patternMinor
Web Scraping with VBA
Viewed 0 times
withscrapingwebvba
Problem
I wrote this to scrape album review data from AOTY into a spreadsheet. Check it out and let me know what I could've done better.
```
Option Explicit
Const classAlbum As String = "listLargeTitle"
Const classScore As String = "listScoreContainer"
Const classRating As String = "listScoreValueContainer"
Const classReview As String = "listScoreText"
Const classNext As String = "pageSelect"
Sub ScrapeAOTY()
Dim IE As MSXML2.XMLHTTP60
Dim HTMLDoc As MSHTML.HTMLDocument
Dim HTMLBody As MSHTML.HTMLBody
Dim Albums As Object
Dim Ratings As Object
Dim Reviews As Object
Dim MetaData As Object
Dim InnerMeta As Object
Dim nextPage As Object
Dim selection As Object
Dim sheet As Worksheet
Dim aotyYear As String
Dim url As String
Dim bookmark As String
Dim album As String
Dim review As String
Dim releaseDate As String
Dim genre As String
Dim test As String
Dim rating As Double
Dim index As Long
Dim row As Long
Dim page As Long
Dim pageLoading As Boolean
Dim exitFlag As Boolean
Application.ScreenUpdating = False
Application.Calculation = xlCalculationManual
aotyYear = InputBox("Input the year you're scraping")
If aotyYear 4
DoEvents
Loop
Set HTMLDoc = New MSHTML.HTMLDocument
Set HTMLBody = HTMLDoc.body
HTMLBody.innerHTML = IE.responseText
Set Albums = HTMLDoc.getElementsByClassName(classAlbum)
Set Ratings = HTMLDoc.getElementsByClassName(classRating)
Set Reviews = HTMLDoc.getElementsByClassName(classReview)
Set MetaData = HTMLDoc.getElementsByClassName(classScore)
For index = 0 To Albums.Length - 1
Set InnerMeta = MetaData(index).parentElement.parentElement
Set InnerMeta = InnerMeta.getElementsByTagName("div")
album = Albums(index).innerText
rating = CDbl(Ratings(index).getAttribute("title"))
review = Reviews(index).inn
```
Option Explicit
Const classAlbum As String = "listLargeTitle"
Const classScore As String = "listScoreContainer"
Const classRating As String = "listScoreValueContainer"
Const classReview As String = "listScoreText"
Const classNext As String = "pageSelect"
Sub ScrapeAOTY()
Dim IE As MSXML2.XMLHTTP60
Dim HTMLDoc As MSHTML.HTMLDocument
Dim HTMLBody As MSHTML.HTMLBody
Dim Albums As Object
Dim Ratings As Object
Dim Reviews As Object
Dim MetaData As Object
Dim InnerMeta As Object
Dim nextPage As Object
Dim selection As Object
Dim sheet As Worksheet
Dim aotyYear As String
Dim url As String
Dim bookmark As String
Dim album As String
Dim review As String
Dim releaseDate As String
Dim genre As String
Dim test As String
Dim rating As Double
Dim index As Long
Dim row As Long
Dim page As Long
Dim pageLoading As Boolean
Dim exitFlag As Boolean
Application.ScreenUpdating = False
Application.Calculation = xlCalculationManual
aotyYear = InputBox("Input the year you're scraping")
If aotyYear 4
DoEvents
Loop
Set HTMLDoc = New MSHTML.HTMLDocument
Set HTMLBody = HTMLDoc.body
HTMLBody.innerHTML = IE.responseText
Set Albums = HTMLDoc.getElementsByClassName(classAlbum)
Set Ratings = HTMLDoc.getElementsByClassName(classRating)
Set Reviews = HTMLDoc.getElementsByClassName(classReview)
Set MetaData = HTMLDoc.getElementsByClassName(classScore)
For index = 0 To Albums.Length - 1
Set InnerMeta = MetaData(index).parentElement.parentElement
Set InnerMeta = InnerMeta.getElementsByTagName("div")
album = Albums(index).innerText
rating = CDbl(Ratings(index).getAttribute("title"))
review = Reviews(index).inn
Solution
I can only point out cosmetic issues (I don't know sqat about VBA),
I hope another reviewer will pass by and cover other aspects.
Unused variables
These variables seem to be unused, so why not remove them:
Magic string
This string appears twice:
It would be better to define it one place, so you can change it one place if needed.
Readability
This block:
Would be more readable if you break a line after the early return:
It might be a matter of taste,
but I think this would be more readable if the condition was not split,
on a single line.
I hope another reviewer will pass by and cover other aspects.
Unused variables
These variables seem to be unused, so why not remove them:
Dim selection As Object
Dim bookmark As String
Dim test As String
Dim pageLoading As BooleanMagic string
This string appears twice:
"http://www.albumoftheyear.org/ratings/6-highest-rated/"It would be better to define it one place, so you can change it one place if needed.
Readability
This block:
aotyYear = InputBox("Input the year you're scraping")
If aotyYear < 1900 Then Exit Sub
url = "http://www.albumoftheyear.org/ratings/6-highest-rated/" & aotyYear & "/1"
Set sheet = ThisWorkbook.Worksheets("ScrapeSheet")
row = 2
exitFlag = False
page = 1Would be more readable if you break a line after the early return:
aotyYear = InputBox("Input the year you're scraping")
If aotyYear < 1900 Then Exit Sub
url = "http://www.albumoftheyear.org/ratings/6-highest-rated/" & aotyYear & "/1"
Set sheet = ThisWorkbook.Worksheets("ScrapeSheet")
row = 2
exitFlag = False
page = 1It might be a matter of taste,
but I think this would be more readable if the condition was not split,
on a single line.
If nextPage.Length = 2 _
Or nextPage(0).innerText = "Next >" Then
page = page + 1
Else
exitFlag = True
End IfCode Snippets
Dim selection As Object
Dim bookmark As String
Dim test As String
Dim pageLoading As Boolean"http://www.albumoftheyear.org/ratings/6-highest-rated/"aotyYear = InputBox("Input the year you're scraping")
If aotyYear < 1900 Then Exit Sub
url = "http://www.albumoftheyear.org/ratings/6-highest-rated/" & aotyYear & "/1"
Set sheet = ThisWorkbook.Worksheets("ScrapeSheet")
row = 2
exitFlag = False
page = 1aotyYear = InputBox("Input the year you're scraping")
If aotyYear < 1900 Then Exit Sub
url = "http://www.albumoftheyear.org/ratings/6-highest-rated/" & aotyYear & "/1"
Set sheet = ThisWorkbook.Worksheets("ScrapeSheet")
row = 2
exitFlag = False
page = 1If nextPage.Length = 2 _
Or nextPage(0).innerText = "Next >" Then
page = page + 1
Else
exitFlag = True
End IfContext
StackExchange Code Review Q#112610, answer score: 8
Revisions (0)
No revisions yet.