patternpythonMinor
Displaying sorted results of a web crawl
Viewed 0 times
crawldisplayingwebsortedresults
Problem
The issue I have with this class is that most of the methods are almost the same. I would like for this code to be more pythonic.
Note: I plan on replacing all the
```
class Sorter(BaseCrawler):
def __init__(self, base_dir):
self.base_dir = base_dir
def by_size(self, *options):
'''
Sort files by size
'''
file_list = []
for fn in self.files(*options):
file_list.append((fn, float(os.path.getsize(fn))))
file_list.sort(key=lambda fn: fn[1])
return file_list
def by_mtime(self, *options):
'''
Sort files by modified time.
'''
file_list = []
for fn in self.files(*options):
file_list.append((fn, os.path.getmtime(fn)))
file_list.sort(key=lambda fn: fn[1])
return file_list
def by_ctime(self, *options):
'''
Sort files by creation time.
'''
file_list = []
for fn in self.files(*options):
file_list.append((fn, os.path.getctime(fn)))
file_list.sort(key=lambda fn: fn[1])
return file_list
def by_atime(self, *options):
'''
Sort files by access time.
'''
file_list = []
for fn in self.files(*options):
file_list.append((fn, os.path.getatime(fn)))
file_list.sort(key=lambda fn: fn[1])
return file_list
def last_modified(self):
'''
Return the last modified file.
'''
return self.by_mtime()[0]
def last_accessed(self):
'''
Return the last accessed file.
'''
return self.by_atime()[0]
def last_created(self):
'''
Return the last created file.
'''
return self.by_ctime()[0]
def average_fsize(self):
'''
Return the average file size.
'''
sizes = []
for name, size in c.by_size():
Note: I plan on replacing all the
for loop constructs with list comprehensions.```
class Sorter(BaseCrawler):
def __init__(self, base_dir):
self.base_dir = base_dir
def by_size(self, *options):
'''
Sort files by size
'''
file_list = []
for fn in self.files(*options):
file_list.append((fn, float(os.path.getsize(fn))))
file_list.sort(key=lambda fn: fn[1])
return file_list
def by_mtime(self, *options):
'''
Sort files by modified time.
'''
file_list = []
for fn in self.files(*options):
file_list.append((fn, os.path.getmtime(fn)))
file_list.sort(key=lambda fn: fn[1])
return file_list
def by_ctime(self, *options):
'''
Sort files by creation time.
'''
file_list = []
for fn in self.files(*options):
file_list.append((fn, os.path.getctime(fn)))
file_list.sort(key=lambda fn: fn[1])
return file_list
def by_atime(self, *options):
'''
Sort files by access time.
'''
file_list = []
for fn in self.files(*options):
file_list.append((fn, os.path.getatime(fn)))
file_list.sort(key=lambda fn: fn[1])
return file_list
def last_modified(self):
'''
Return the last modified file.
'''
return self.by_mtime()[0]
def last_accessed(self):
'''
Return the last accessed file.
'''
return self.by_atime()[0]
def last_created(self):
'''
Return the last created file.
'''
return self.by_ctime()[0]
def average_fsize(self):
'''
Return the average file size.
'''
sizes = []
for name, size in c.by_size():
Solution
Removing duplication is a very worthy goal. Just "factor out" the feature-extraction functions ("f-e-f"s):
etc, etc.
Not sure why
Other improvements are feasible which don't deal with duplication but with efficiency. For example, consider:
this does a lot of work (list building and sorting) that it doesn't need to do (and actually does not meet its name and docstring: it returns the file modified earliest, because the sort in
(this one fixes the bug; to maintain the bug, use
import operator
class Sorter(BaseCrawler):
def __init__(self, base_dir):
self.base_dir = base_dir
def _sort(self, fef, *options):
file_list = [(fn, fef(fn)) for fn in self.files(*options)]
return sorted(file_list, key=operator.itemgetter(1))
def by_size(self, *options):
def fef(fn): return float(os.path.getsize(fn))
return self._sort(fef, *options)
def by_mtime(self, *options):
return self._sort(os.path.getmtime, *options)etc, etc.
Not sure why
by_size needs that float rather than the natural int returned by os.path.getsize (I'd rather do the float cast, if needed, downstream), but I've kept that dubious choice to show how best to deal with a f-e-f that's not already a plain existing function (hint: not with a lambda:-).Other improvements are feasible which don't deal with duplication but with efficiency. For example, consider:
def last_modified(self):
'''
Return the last modified file.
'''
return self.by_mtime()[0]this does a lot of work (list building and sorting) that it doesn't need to do (and actually does not meet its name and docstring: it returns the file modified earliest, because the sort in
by_mtime is in ascending order). How much simpler to do...:def last_modified(self):
'''
Return the last modified file.
'''
return max(self.files(), key=os.path.getmtime)(this one fixes the bug; to maintain the bug, use
min in lieu of max).Code Snippets
import operator
class Sorter(BaseCrawler):
def __init__(self, base_dir):
self.base_dir = base_dir
def _sort(self, fef, *options):
file_list = [(fn, fef(fn)) for fn in self.files(*options)]
return sorted(file_list, key=operator.itemgetter(1))
def by_size(self, *options):
def fef(fn): return float(os.path.getsize(fn))
return self._sort(fef, *options)
def by_mtime(self, *options):
return self._sort(os.path.getmtime, *options)def last_modified(self):
'''
Return the last modified file.
'''
return self.by_mtime()[0]def last_modified(self):
'''
Return the last modified file.
'''
return max(self.files(), key=os.path.getmtime)Context
StackExchange Code Review Q#77804, answer score: 5
Revisions (0)
No revisions yet.