patternpythonMinor
Scraping GitHub for security vulnerabilities
Viewed 0 times
vulnerabilitiesgithubscrapingsecurityfor
Problem
I haven't done much Python programming, but I really like the language so I've been using it for side projects that I work on. The problem with this is that I don't have the opportunity to have my peers review my code and give suggestions.
I've been working on a project that will be scraping GitHub looking for security vulnerabilities. I've created a separate file in the project that contains all functions that interact with GitHub's API.
The code is run from here:
```
import github
import json
def processRepoContents(repoContents):
# for each entry in the repo
for tree in repoC
I've been working on a project that will be scraping GitHub looking for security vulnerabilities. I've created a separate file in the project that contains all functions that interact with GitHub's API.
import requests
import re
import base64
import os
def getRepos(since=0):
url = 'http://api.github.com/repositories'
data = """{
since: %s
}""" % since
response = requests.get(url, data=data)
if response.status_code == 403:
print "Problem making request!", response.status_code
print response.headers
matches = re.match(r'', response.headers['Link'])
next = matches.group(0)[1:-1]
return response.json(), next
def getRepo(url):
response = requests.get(url)
return response.json()
def getReadMe(url):
url = url + "/readme"
response = requests.get(url)
return response.json()
# todo: return array of all commits so we can examine each one
def getRepoSHA(url):
# /repos/:owner/:repo/commits
commits = requests.get(url + "/commits").json()
return commits[0]['sha']
def getFileContent(item):
ignoreExtensions = ['jpg']
filename, extension = os.path.splitext(item['path'])
if extension in ignoreExtensions:
return []
content = requests.get(item['url']).json()
lines = content['content'].split('\n')
lines = map(base64.b64decode, lines)
print 'path', item['path']
print 'lines', "".join(lines[:5])
return "".join(lines)
def getRepoContents(url, sha):
# /repos/:owner/:repo/git/trees/:sha?recursive=1
url = url + ('/git/trees/%s?recursive=1' % sha)
# print 'url', url
response = requests.get(url)
return response.json()The code is run from here:
```
import github
import json
def processRepoContents(repoContents):
# for each entry in the repo
for tree in repoC
Solution
PEP8 - click, read, apply
PEP8 gives coding conventions for the Python code comprising the
standard library in the main Python distribution
Code:
For the second
Code
Pay more attention to variable names and data types
PEP8 gives coding conventions for the Python code comprising the
standard library in the main Python distribution
- you should use 4 spaces per indentation level
- function names should be lowercase, with words separated by underscores as necessary to improve readability. mixedCase is allowed only in contexts where that's already the prevailing style (e.g. threading.py), to retain backwards compatibility
- variable names should also stick to the above rule
- there should be two newlines between each method
- use augmented assigments where possible (
url = url + ('/git/trees/%s?recursive=1' % sha)->url += '/git/trees/%s?recursive=1' % sha)
- you're sometimes using double-quotes around strings, and sometimes single-quotes. Choose one and stick to that.
- use
format()instead of the old%
- I'd rather suggest you use the
print()function even if you're using python 2.7. It will make you code easily portable.
- you're lacking docstrings. Write docstrings for all public modules, functions, classes, and methods. They are not necessary for non-public methods, but you should have a comment that describes what the method does. This comment should appear after the
defline.)
Code:
import base64
import os
import re
import requests
def get_repos(since=0):
"""
Left as an exercise for OP
"""
url = 'http://api.github.com/repositories'
data = '{{since: {}}}'.format(since)
response = requests.get(url, data=data)
if response.status_code == 403:
print('Problem making request! {}'.format(response.status_code))
print(response.headers)
matches = re.match(r'', response.headers['Link'])
next = matches.group(0)[1:-1]
return response.json(), next
def get_repo(url):
"""
Left as an exercise for OP
"""
return requests.get(url).json()
def get_readme(url):
"""
Left as an exercise for OP
"""
url += '/readme'
return requests.get(url).json()
# todo: return array of all commits so we can examine each one
def get_repo_sha(url):
"""
Left as an exercise for OP
"""
commits = requests.get(url + '/commits').json()
return commits[0]['sha']
def get_file_content(item):
"""
Left as an exercise for OP
"""
ignore_extensions = ['jpg']
filename, extension = os.path.splitext(item['path'])
if extension in ignore_extensions:
return []
content = requests.get(item['url']).json()
lines = content['content'].split('\n')
lines = map(base64.b64decode, lines)
print('Path: '.format(item['path']))
print('Lines: '.format(''.join(lines[:5])))
return ''.join(lines)
def get_repo_contents(url, sha):
"""
Left as an exercise for OP
"""
url += '/git/trees/{}?recursive=1'.format(sha)
return requests.get(url).json()For the second
.py file:- the above rules apply
- don't import modules you're not using (
import json)
Code
import github
def process_repo_contents(repo_contents):
"""
Left as an exercise for OP
"""
for tree in repo_contents['tree']:
content_type = tree['type']
print('content_type --- {}'.format(content_type))
if content_type == 'blob':
github.get_file_content(tree)
print('***blob***')
elif content_type == 'tree':
print('***tree***')
if __name__ == '__main__':
repos, next = github.get_repos()
for repo in repos[0:10]:
sha = github.get_repo_sha(repo['url'])
repo_json = github.get_repo_contents(repo['url'], sha)
process_repo_contents(repo_json)Pay more attention to variable names and data types
nextis a builtin keyword so I'd recommend you to change it to something else
ignore_extensionsis a list in your case. And it only keeps one string. Don't you think it would be more appropriate to make it a string from the beginning ?
Code Snippets
import base64
import os
import re
import requests
def get_repos(since=0):
"""
Left as an exercise for OP
"""
url = 'http://api.github.com/repositories'
data = '{{since: {}}}'.format(since)
response = requests.get(url, data=data)
if response.status_code == 403:
print('Problem making request! {}'.format(response.status_code))
print(response.headers)
matches = re.match(r'<.+?>', response.headers['Link'])
next = matches.group(0)[1:-1]
return response.json(), next
def get_repo(url):
"""
Left as an exercise for OP
"""
return requests.get(url).json()
def get_readme(url):
"""
Left as an exercise for OP
"""
url += '/readme'
return requests.get(url).json()
# todo: return array of all commits so we can examine each one
def get_repo_sha(url):
"""
Left as an exercise for OP
"""
commits = requests.get(url + '/commits').json()
return commits[0]['sha']
def get_file_content(item):
"""
Left as an exercise for OP
"""
ignore_extensions = ['jpg']
filename, extension = os.path.splitext(item['path'])
if extension in ignore_extensions:
return []
content = requests.get(item['url']).json()
lines = content['content'].split('\n')
lines = map(base64.b64decode, lines)
print('Path: '.format(item['path']))
print('Lines: '.format(''.join(lines[:5])))
return ''.join(lines)
def get_repo_contents(url, sha):
"""
Left as an exercise for OP
"""
url += '/git/trees/{}?recursive=1'.format(sha)
return requests.get(url).json()import github
def process_repo_contents(repo_contents):
"""
Left as an exercise for OP
"""
for tree in repo_contents['tree']:
content_type = tree['type']
print('content_type --- {}'.format(content_type))
if content_type == 'blob':
github.get_file_content(tree)
print('***blob***')
elif content_type == 'tree':
print('***tree***')
if __name__ == '__main__':
repos, next = github.get_repos()
for repo in repos[0:10]:
sha = github.get_repo_sha(repo['url'])
repo_json = github.get_repo_contents(repo['url'], sha)
process_repo_contents(repo_json)Context
StackExchange Code Review Q#151444, answer score: 7
Revisions (0)
No revisions yet.