patternpythonMinor
Pagination algorithm based on scores
Viewed 0 times
paginationbasedalgorithmscores
Problem
I need to divide
Here is my working code; I am seeking advice, especially of smarter and more efficient solutions.
scoreResult into pages (which is already ordered by score column from highest to lowest), and each page cannot contain too many records (e.g. 12 is a limitation in my example), and each page cannot contain duplicate id1, and each page needs to be ordered by score from highest to lowest.Here is my working code; I am seeking advice, especially of smarter and more efficient solutions.
scoreResult = [
#id1,id2,score",
"1,28,300.1",
"4,5,209.1",
"20,7,208.1",
"23,8,207.1",
"16,10,206.1",
"1,16,205.1",
"1,31,204.6",
"6,29,204.1",
"7,20,203.1",
"8,21,202.1",
"2,18,201.1",
"2,30,200.1",
"15,27,109.1",
"10,13,108.1",
"11,26,107.1",
"12,9,106.1",
"13,1,105.1",
"22,17,104.1",
"1,2,103.1",
"28,24,102.1",
"18,14,11.1",
"6,25,10.1",
"19,15,9.1",
"3,19,8.1",
"3,11,7.1",
"27,12,6.1",
"1,3,5.1",
"25,4,4.1",
"5,6,3.1",
"29,22,2.1",
"30,23,1.1"
]
PageLimit = 12
def pageResult():
#store final resutls
pages = {}
# store dictionary to see if page exists
# key = host_id+#+page_id, value = item
pageDict = {}
# key: page ID, value: # of items
pageCount = {}
for item in scoreResult:
page= 0
host_id=item.split(',')[0]
while pageDict.has_key(str(host_id)+'#'+str(page)) or (pageCount.has_key(page) and pageCount[page] >= PageLimit):
#print page
page += 1
pageDict[str(host_id)+'#'+str(page)] = True
if pages.has_key(page):
pages[page].append(item)
else:
pages[page]=[]
pages[page].append(item)
if pageCount.has_key(page):
pageCount[page] += 1
else:
pageCount[page] = 1
for k,v in pages.items():
print k
print v
if __name__=="__main__":
pageResult()Solution
If you look at your code you have a lot of
This can be replaced with a
And
As a simple example the following is how you could use default-dicts:
I think changing
I would use a tuple as
This is as
Python also has a style guide called PEP8, which basically means that your variables should be
Using the above should get you:
You could make another default-dict to store the latest page for a
This limits the amount of times the while loop runs. This dramatically reduces the amount of time it takes on larger data-sets. But has little diffrence on small data sets, and can perform slower!
I use the same data as given in the question, but timesed by 100.
This gave me the following times, over 1000 tries:
This shows that there may be little speed-up from using a dictionary rather than
I also removed the prints.
I would also recommend:
if pageCount.has_key(page): else:.This can be replaced with a
defaultdict.pages defaults to a list, as if pages[page] does not exist you manually default it to a list.And
page_count defaults to zero.As a simple example the following is how you could use default-dicts:
from collections import defaultdict
pages = defaultdict(list)
pages[0].append('Page 1')I think changing
page_dict to a set would be better, as they index the same way, but using a set will use less memory.I would use a tuple as
page_dict's key, rather than a string.This is as
(a, b) is easier to read than str(a) + "#" + str(b).Python also has a style guide called PEP8, which basically means that your variables should be
snake_case, and PageLimit should be UPPER_SNAKE_CASE as it's a constant.Using the above should get you:
def page_result(score_result):
pages = defaultdict(list)
page_dict = set()
page_count = defaultdict(int)
for item in score_result:
page = 0
host_id = item.split(',')[0]
while (host_id, page) in page_dict or page_count[page] >= PAGE_LIMIT:
page += 1
pages[page].append(item)
page_dict.add((host_id, page))
page_count[page] += 1
for k, v in pages.items():
print k
print vYou could make another default-dict to store the latest page for a
host_id.This limits the amount of times the while loop runs. This dramatically reduces the amount of time it takes on larger data-sets. But has little diffrence on small data sets, and can perform slower!
I use the same data as given in the question, but timesed by 100.
scoreResult *= 100.This gave me the following times, over 1000 tries:
- Original: 225.99s
- My original: 53.14s
- My updatet: 2.97s
- Mathias': 58.83s
This shows that there may be little speed-up from using a dictionary rather than
len.I also removed the prints.
def page_result(score_result):
pages = defaultdict(list)
page_dict = set()
page_count = defaultdict(int)
host_page = defaultdict(int)
for item in score_result:
host_id = item.split(',')[0]
page = host_page[host_id]
while page_count[page] >= PAGE_LIMIT:
page += 1
pages[page].append(item)
page_dict.add((host_id, page))
page_count[page] += 1
host_page[host_id] = page + 1
for k, v in pages.items():
print k
print vI would also recommend:
- Moving the
printpart of the function into it's own function.
- Read the data from a file. Saving it as a JSON file and using Python's JSON library is a simple way to do this.
Code Snippets
from collections import defaultdict
pages = defaultdict(list)
pages[0].append('Page 1')def page_result(score_result):
pages = defaultdict(list)
page_dict = set()
page_count = defaultdict(int)
for item in score_result:
page = 0
host_id = item.split(',')[0]
while (host_id, page) in page_dict or page_count[page] >= PAGE_LIMIT:
page += 1
pages[page].append(item)
page_dict.add((host_id, page))
page_count[page] += 1
for k, v in pages.items():
print k
print vdef page_result(score_result):
pages = defaultdict(list)
page_dict = set()
page_count = defaultdict(int)
host_page = defaultdict(int)
for item in score_result:
host_id = item.split(',')[0]
page = host_page[host_id]
while page_count[page] >= PAGE_LIMIT:
page += 1
pages[page].append(item)
page_dict.add((host_id, page))
page_count[page] += 1
host_page[host_id] = page + 1
for k, v in pages.items():
print k
print vContext
StackExchange Code Review Q#116845, answer score: 4
Revisions (0)
No revisions yet.