patternpythonMinor
Scraping the date of most recent post from various social media services
Viewed 0 times
socialtherecentscrapingdatepostservicesfromvariousmost
Problem
Task
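I have a large spreadsheet where each line should include:
- The URL of a social media account
- A field indicating whether the account is "active"
- A name and UID number for each account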
I have to find if the activity field is accurate for every account on the list, and fix any mismatches.
Code
```
##### IMPORTS
import requests
from bs4 import BeautifulSoup
import datetime
import re
from selenium import webdriver
import tkinter
import csv
from time import sleep
##### METHODS TO SCRAPE DATE OF MOST RECENT ACTIVITY FROM SOCIAL PAGES
# A method to get the date of last activity from a given Twitter feed.
def getMostRecentActivityTwitter(url):
    """Return the date of the most recent tweet shown on a Twitter feed page.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.  The
    first two ``div.tweet`` elements are inspected (when only one exists, just
    that one) because a pinned tweet, if present, is listed first and may be
    older than the tweet that follows it.

    Args:
        url: Address of the Twitter feed to scrape.

    Returns:
        A ``datetime.datetime`` for the newer of the inspected tweets, parsed
        from the part of the timestamp link's ``title`` attribute that follows
        the first ``-`` (expected to look like `` 12 Jun 2016``).

    Raises:
        Exception: ``"Couldn't find page!"`` when the HTTP status code is 400
            or above; ``'Parsing failed!'`` when no tweet date could be
            extracted from the page.
    """
    # Grab the page, raising an exception if it doesn't work.
    response = requests.get(url)
    # Compare the numeric status code instead of searching the response's
    # string form for '40', which missed 410+ and every 5xx error.
    if response.status_code >= 400:
        raise Exception("Couldn't find page!")

    # Parse the response into Beautiful Soup and grab all the tweets.
    soup = BeautifulSoup(response.text, 'html.parser')
    tweets = soup.find_all('div', class_='tweet')

    try:
        # Look at up to the first two tweets (a pinned tweet may come first).
        dates = [
            datetime.datetime.strptime(
                tweet.find('a', class_='tweet-timestamp')
                .attrs['title']
                .split('-')[1],
                ' %d %b %Y',
            )
            for tweet in tweets[:2]
        ]
        # max() raises ValueError on an empty list, so a page without tweets
        # is still reported as a parsing failure below.
        return max(dates)
    except Exception as err:
        # Keep the original cause attached for debugging.
        raise Exception('Parsing failed!') from err
# A method to get the date of last activity from a given Facebook page.
def getMostRecentActivityFB(url):
# Grab the page, raising an exception if it doesn't work
response = requests.get(url)
if '4
I have a large spreadsheet where each line should include:
- The URL of a social media account
- A field indicating whether the account is "active"
- A name and UID number for each account
I have to find if the activity field is accurate for every account on the list, and fix any mismatches.
Code
```
##### IMPORTS
import requests
from bs4 import BeautifulSoup
import datetime
import re
from selenium import webdriver
import tkinter
import csv
from time import sleep
##### METHODS TO SCRAPE DATE OF MOST RECENT ACTIVITY FROM SOCIAL PAGES
# A method to get the date of last activity from a given Twitter feed.
def getMostRecentActivityTwitter(url):
    """Return the date of the most recent tweet shown on a Twitter feed page.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.  The
    first two ``div.tweet`` elements are inspected (when only one exists, just
    that one) because a pinned tweet, if present, is listed first and may be
    older than the tweet that follows it.

    Args:
        url: Address of the Twitter feed to scrape.

    Returns:
        A ``datetime.datetime`` for the newer of the inspected tweets, parsed
        from the part of the timestamp link's ``title`` attribute that follows
        the first ``-`` (expected to look like `` 12 Jun 2016``).

    Raises:
        Exception: ``"Couldn't find page!"`` when the HTTP status code is 400
            or above; ``'Parsing failed!'`` when no tweet date could be
            extracted from the page.
    """
    # Grab the page, raising an exception if it doesn't work.
    response = requests.get(url)
    # Compare the numeric status code instead of searching the response's
    # string form for '40', which missed 410+ and every 5xx error.
    if response.status_code >= 400:
        raise Exception("Couldn't find page!")

    # Parse the response into Beautiful Soup and grab all the tweets.
    soup = BeautifulSoup(response.text, 'html.parser')
    tweets = soup.find_all('div', class_='tweet')

    try:
        # Look at up to the first two tweets (a pinned tweet may come first).
        dates = [
            datetime.datetime.strptime(
                tweet.find('a', class_='tweet-timestamp')
                .attrs['title']
                .split('-')[1],
                ' %d %b %Y',
            )
            for tweet in tweets[:2]
        ]
        # max() raises ValueError on an empty list, so a page without tweets
        # is still reported as a parsing failure below.
        return max(dates)
    except Exception as err:
        # Keep the original cause attached for debugging.
        raise Exception('Parsing failed!') from err
# A method to get the date of last activity from a given Facebook page.
def getMostRecentActivityFB(url):
# Grab the page, raising an exception if it doesn't work
response = requests.get(url)
if '4
Solution
Style
Case:
Python's official style guide, PEP8, states you should use snake_case naming for your functions and variables instead of camelCase.
Magic numbers:
You should avoid unexplained numbers through your code, like this:
this is confusing to read and would be better explained through a simple temporary variable.
Spacing
You should be using spacing between binary operators, like here:
Spaces after commas in a parameter list:
Miscellaneous
You can use the `+=` operator when appending to a string.
You can just return the computed value; you don't need to assign it to a temporary `date` variable first.
There's a lot of these in the code:
Seeing as they all have the same error handling, it would be easier to wrap the entire `if` conditions block in one big `try`/`except`.
You've got some duplicate code here too:
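Seeing as they're identical besides one string, it would be easier to have the `if`/`else` only deal with the value of the string and have the rest of the function call separate.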
Case:
Python's official style guide, PEP8, states you should use snake_case naming for your variables instead of camelCase:
```
getMostRecentActivityTwitter
```
Magic numbers:
You should avoid unexplained numbers throughout your code, like this:
```
if '40'
```
This is confusing to read and would be better explained through a simple temporary variable.
Spacing
You should be using spacing between binary operators, like here:
```
dateList[1]+' '+dateList[2]+' '+dateList[3]
```
Spaces after commas in a parameter list:
```
date = (todaysDate - howRecent).replace(hour=0,minute=0,second=0,microsecond=0)
```
Miscellaneous
You can use the `+=` operator here:
```
unit = unit + 's'
```
You can just return this; you don't need to assign it to a temporary `date` variable:
```
date = (todaysDate - howRecent).replace(hour=0,minute=0,second=0,microsecond=0)
```
There's a lot of these in the code:
```
except Exception as errorMessage:
    writer.writerow([rowID, accountName, url, 'Not Checked', errorMessage, ''])
    return
```
Seeing as they all have the same error handling, it would be easier to wrap the entire `if` conditions block in one big `try`/`except`.
You've got some duplicate code here too:
```
if putativeActivity == 'Active':
    writer.writerow([rowID, accountName, url, 'Inactive', '', 'Yes'])
    return
else:
    writer.writerow([rowID, accountName, url, 'Inactive', '', 'No'])
    return
```
Seeing as they're identical besides one string, it would be easier to have the `if`/`else` only deal with the value of the string and have the rest of the function call separate.
```
if putativeActivity == 'Active':
    active = 'Yes'
else:
    active = 'No'
writer.writerow([rowID, accountName, url, 'Inactive', '', active])
return
```
Code Snippets
```
getMostRecentActivityTwitter
```
```
dateList[1]+' '+dateList[2]+' '+dateList[3]
```
```
date = (todaysDate - howRecent).replace(hour=0,minute=0,second=0,microsecond=0)
```
```
unit = unit + 's'
```
```
date = (todaysDate - howRecent).replace(hour=0,minute=0,second=0,microsecond=0)
```
Context
StackExchange Code Review Q#129899, answer score: 6
Revisions (0)
No revisions yet.