patternpythonMinor
Get all followers and friends of a Twitter user
Viewed 0 times
alluserfollowersgetandtwitterfriends
Problem
I'm trying to find my bug, or any potential bottleneck that causes my program to be really slow. The script gets all the followers and friends of a user and saves them in MongoDB.
```
import pymongo
import tweepy
from pymongo import MongoClient

# Twitter OAuth credentials. They are blank here and have to be filled in
# before the script can authenticate.
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

# MongoClient() is called without arguments, so pymongo's defaults apply.
client = MongoClient()

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# NOTE(review): wait_on_rate_limit=True presumably makes calls block until
# Twitter's rate-limit window resets, which may account for much of the
# perceived slowness -- confirm against the tweepy documentation.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
    retry_count=3,
    retry_delay=60,
)

# Database and collection handles shared by the functions below.
db = client.tweets
raw_tweets = db.raw_tweets
users = db.users
def is_user_in_db(user_id):
    """Return True when ``get_user_from_db`` finds something for ``user_id``.

    The lookup itself is delegated to ``get_user_from_db``; this function
    only checks whether that result is ``None``.
    """
    # Identity comparison is the idiomatic None test (PEP 8) and cannot be
    # affected by a custom ``__eq__``/``__ne__`` on the returned object.
    return get_user_from_db(user_id) is not None
def get_user_from_db(user_id):
    """Look up ``user_id`` in the module-level ``users`` collection.

    The query matches on the ``'user.id'`` key, and whatever ``find_one``
    returns is handed back unchanged.
    """
    query = {'user.id': user_id}
    return users.find_one(query)
def get_user_from_twitter(user_id):
    """Fetch ``user_id`` through the shared tweepy ``api`` client."""
    twitter_user = api.get_user(user_id)
    return twitter_user
def get_followers(user_id):
    """Collect every follower of ``user_id`` into a single list.

    Walks ``tweepy.Cursor(api.followers, ...)`` page by page, requesting
    ``count=200`` per call, and prints a 1-based progress line per page.

    Returns:
        A list holding the items of all pages, in the order received.
    """
    # Named ``followers`` so the module-level ``users`` collection is not
    # shadowed inside this function.
    followers = []
    pages = tweepy.Cursor(api.followers, id=user_id, count=200).pages()
    for page_number, page in enumerate(pages, 1):
        # Parenthesised single argument: same output on Python 2 and 3.
        print('Getting page {} for followers'.format(page_number))
        followers.extend(page)
    return followers
def get_friends(user_id):
    """Collect every friend of ``user_id`` into a single list.

    Walks ``tweepy.Cursor(api.friends, ...)`` page by page, requesting
    ``count=200`` per call, and prints a 1-based progress line per page.

    Returns:
        A list holding the items of all pages, in the order received.
    """
    # Named ``friends`` so the module-level ``users`` collection is not
    # shadowed inside this function.
    friends = []
    pages = tweepy.Cursor(api.friends, id=user_id, count=200).pages()
    for page_number, page in enumerate(pages, 1):
        # Parenthesised single argument: same output on Python 2 and 3.
        print('Getting page {} for friends'.format(page_number))
        friends.extend(page)
    return friends
def get_followers_ids(user_id):
    """Gather the follower ids of ``user_id`` from ``api.followers_ids``.

    Pages are requested with ``count=5000``; one progress line is printed
    per page, numbered from 1, and all pages are merged into one list.
    """
    collected = []
    cursor = tweepy.Cursor(api.followers_ids, id=user_id, count=5000)
    for number, chunk in enumerate(cursor.pages(), 1):
        # Parenthesised single argument: same output on Python 2 and 3.
        print('Getting page {} for followers ids'.format(number))
        collected.extend(chunk)
    return collected
def get_friends_ids(user_id):
    """Gather the friend ids of ``user_id`` from ``api.friends_ids``.

    Pages are requested with ``count=5000``; one progress line is printed
    per page, numbered from 1, and all pages are merged into one list.
    """
    collected = []
    cursor = tweepy.Cursor(api.friends_ids, id=user_id, count=5000)
    for number, chunk in enumerate(cursor.pages(), 1):
        # Parenthesised single argument: same output on Python 2 and 3.
        print('Getting page {} for friends ids'.format(number))
        collected.extend(chunk)
    return collected
def process_user(user):
user_id = user
```
import pymongo
import tweepy
from pymongo import MongoClient

# Twitter OAuth credentials. They are blank here and have to be filled in
# before the script can authenticate.
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

# MongoClient() is called without arguments, so pymongo's defaults apply.
client = MongoClient()

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# NOTE(review): wait_on_rate_limit=True presumably makes calls block until
# Twitter's rate-limit window resets, which may account for much of the
# perceived slowness -- confirm against the tweepy documentation.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
    retry_count=3,
    retry_delay=60,
)

# Database and collection handles shared by the functions below.
db = client.tweets
raw_tweets = db.raw_tweets
users = db.users
def is_user_in_db(user_id):
    """Return True when ``get_user_from_db`` finds something for ``user_id``.

    The lookup itself is delegated to ``get_user_from_db``; this function
    only checks whether that result is ``None``.
    """
    # Identity comparison is the idiomatic None test (PEP 8) and cannot be
    # affected by a custom ``__eq__``/``__ne__`` on the returned object.
    return get_user_from_db(user_id) is not None
def get_user_from_db(user_id):
    """Look up ``user_id`` in the module-level ``users`` collection.

    The query matches on the ``'user.id'`` key, and whatever ``find_one``
    returns is handed back unchanged.
    """
    query = {'user.id': user_id}
    return users.find_one(query)
def get_user_from_twitter(user_id):
    """Fetch ``user_id`` through the shared tweepy ``api`` client."""
    twitter_user = api.get_user(user_id)
    return twitter_user
def get_followers(user_id):
    """Collect every follower of ``user_id`` into a single list.

    Walks ``tweepy.Cursor(api.followers, ...)`` page by page, requesting
    ``count=200`` per call, and prints a 1-based progress line per page.

    Returns:
        A list holding the items of all pages, in the order received.
    """
    # Named ``followers`` so the module-level ``users`` collection is not
    # shadowed inside this function.
    followers = []
    pages = tweepy.Cursor(api.followers, id=user_id, count=200).pages()
    for page_number, page in enumerate(pages, 1):
        # Parenthesised single argument: same output on Python 2 and 3.
        print('Getting page {} for followers'.format(page_number))
        followers.extend(page)
    return followers
def get_friends(user_id):
    """Collect every friend of ``user_id`` into a single list.

    Walks ``tweepy.Cursor(api.friends, ...)`` page by page, requesting
    ``count=200`` per call, and prints a 1-based progress line per page.

    Returns:
        A list holding the items of all pages, in the order received.
    """
    # Named ``friends`` so the module-level ``users`` collection is not
    # shadowed inside this function.
    friends = []
    pages = tweepy.Cursor(api.friends, id=user_id, count=200).pages()
    for page_number, page in enumerate(pages, 1):
        # Parenthesised single argument: same output on Python 2 and 3.
        print('Getting page {} for friends'.format(page_number))
        friends.extend(page)
    return friends
def get_followers_ids(user_id):
    """Gather the follower ids of ``user_id`` from ``api.followers_ids``.

    Pages are requested with ``count=5000``; one progress line is printed
    per page, numbered from 1, and all pages are merged into one list.
    """
    collected = []
    cursor = tweepy.Cursor(api.followers_ids, id=user_id, count=5000)
    for number, chunk in enumerate(cursor.pages(), 1):
        # Parenthesised single argument: same output on Python 2 and 3.
        print('Getting page {} for followers ids'.format(number))
        collected.extend(chunk)
    return collected
def get_friends_ids(user_id):
    """Gather the friend ids of ``user_id`` from ``api.friends_ids``.

    Pages are requested with ``count=5000``; one progress line is printed
    per page, numbered from 1, and all pages are merged into one list.
    """
    collected = []
    cursor = tweepy.Cursor(api.friends_ids, id=user_id, count=5000)
    for number, chunk in enumerate(cursor.pages(), 1):
        # Parenthesised single argument: same output on Python 2 and 3.
        print('Getting page {} for friends ids'.format(number))
        collected.extend(chunk)
    return collected
def process_user(user):
user_id = user
Solution
You should use `is` to compare to None.
You can also use list concatenation rather than `list.extend`.
When you have a local variable here, use it instead of a dictionary call. It's quicker to access than a key.
You can also collapse these lines by assigning the results of the function calls directly.
You can make a list comprehension out of `users_to_add`.
The two loops can be turned into two comprehensions.
Also, it's more efficient to look up dictionary keys assuming they're there, and handle the exception if they're not.
You could replace `pass` with something else if you'd like.
One final note, you'd be surprised how expensive frequent `print` calls are.
You should use `is` to compare to None, as that's faster; the original `!= None` test becomes `is not None`.
```
def is_user_in_db(user_id):
    return get_user_from_db(user_id) is not None
```
You can also use list concatenation rather than
`list.extend`, as it's slightly faster and there's no benefit to `extend` in this context. I also second the recommendation to use `enumerate` rather than having a manually incremented number in a for loop. Start `enumerate` at 1 so the printed page numbers stay the same as before.
```
def get_followers(user_id):
    users = []
    for i, user in enumerate(tweepy.Cursor(api.followers, id=user_id, count=200).pages(), 1):
        print 'Getting page {} for followers'.format(i)
        users += user
    return users
```
When you have a local variable here, use it instead of a dictionary call. It's quicker to access than a key.
```
screen_name = user['screen_name']
print 'Processing user : {}'.format(screen_name)
```
You can also collapse these lines by assigning the results of the function calls directly.
```
user['followers_ids'] = get_followers_ids(screen_name)
user['friends_ids'] = get_friends_ids(screen_name)
```
You can make a list comprehension out of
`users_to_add`; it's a single line of code that more efficiently creates a list based on a for loop-like construct. So this:
```
users_to_add = []
for follower in get_followers(screen_name):
    if not is_user_in_db(follower.id):
        users_to_add.append(follower._json)
for friend in get_friends(screen_name):
    if not is_user_in_db(friend.id):
        users_to_add.append(friend._json)
```
Can be turned into this:
```
users_to_add = [follower._json for follower in
                get_followers(screen_name) if not is_user_in_db(follower.id)]
users_to_add += [friend._json for friend in
                 get_friends(screen_name) if not is_user_in_db(friend.id)]
```
Also, it's more efficient to look up dictionary keys assuming they're there, and handle the exception if they're not, like so:
```
try:
    process_user(doc['user'])
except KeyError:
    pass
try:
    process_user(doc['retweeted_status']['user'])
except KeyError:
    pass
```
You could replace
`pass` with something else if you'd like, but this is more efficient, as the if statement in your script checks the dictionary to see whether a key exists and then checks it again to get the actual value attached to the key. The try/except way only looks the key up once, and moves on if it gets nothing.

One final note, you'd be surprised how expensive frequent
`print` calls are. You have some running in loops. I don't know how often the loops run, but if you are experiencing sluggishness, try removing them to see what difference it makes. Feedback is obviously important, but speed is also important.

Code Snippets
```
def is_user_in_db(user_id):
    return get_user_from_db(user_id) is not None
```
```
def get_followers(user_id):
    users = []
    for i, user in enumerate(tweepy.Cursor(api.followers, id=user_id, count=200).pages(), 1):
        print 'Getting page {} for followers'.format(i)
        users += user
    return users
```
```
screen_name = user['screen_name']
print 'Processing user : {}'.format(screen_name)
```
```
user['followers_ids'] = get_followers_ids(screen_name)
user['friends_ids'] = get_friends_ids(screen_name)
```
```
users_to_add = []
for follower in get_followers(screen_name):
    if not is_user_in_db(follower.id):
        users_to_add.append(follower._json)
for friend in get_friends(screen_name):
    if not is_user_in_db(friend.id):
        users_to_add.append(friend._json)
```
Context
StackExchange Code Review Q#101905, answer score: 5
Revisions (0)
No revisions yet.