principlepythonMinor
Compare files by name in two folders A and B and delete duplicates from folder A
Viewed 0 times
Tags: python, files, folders, compare, duplicates, delete
Problem
I wrote my very first Python program to answer a Super User question. I'm wondering how the application can be rewritten to make it as pythonic as possible.
Unit test
```
import unittest
import os
import shutil
from DeleteDuplicateInFolderA import DeleteDuplicateInFolderA
class DeleteDuplicateInFolderATest(unittest.TestCase):
def __init__(self, *args, **kwargs):
super(DeleteDuplicateInFolderATest, self).__init__(*args, **kwargs)
self._base_directory = r"c:\temp\test"
self._path_A = self._base_directory + r"\A"
self._path_B = self._base_directory + r"\B"
def create_folder_and_create_some_files(self, path, filename_list):
if os.path.exists(path):
shutil.rmtree(path)
os.makedirs(path)
for filename in filename_list:
open(os.path.join(path, filename), "w+").close()
def setUp(self):
# Create folders and files for testing
self.create_folder_and_create_some_files(self._path_A, ["1.txt", "2.txt"])
self.create_folder_and_create_some_files(self._path_B, ["2.txt", "3.txt"])
def tearDown(self):
for path in [self._path_A, self._path_B, self._base_directory]:
if os.path.exists(path):
shutil.rmtree(path)
def test_duplicate_file_gets_deleted(self):
# Arrange
app = DeleteDuplicateInFolderA(self._path_A, self._path_B, is_dry_run=False)
# Act
app.delete_duplicates_in_folder_A()
# Assert
self.assertFalse(os.path.isfile(self._path_A + r"\2.txt"), "File 2.txt has not been deleted.")
def test_duplicate_file_gets_not_deleted_in_mode_dryrun(self):
# Arrange
app = DeleteDuplicateInFolderA(self._path_A, self._path_B, is_dry_run=True)
# Act
app.delete_duplicates_in_folder_A()
# Assert
self.assertTrue(os.path.isfile(self._path_A + r"\2.txt"), "File 2.txt should not have been deleted in mode '--dryrun'")
def main():
    # Scrape truncated this line ("unittes"); restored to the standard
    # unittest entry point, which discovers and runs the TestCase above.
    unittest.main()
Unit test
```
import unittest
import os
import shutil
from DeleteDuplicateInFolderA import DeleteDuplicateInFolderA
class DeleteDuplicateInFolderATest(unittest.TestCase):
def __init__(self, *args, **kwargs):
super(DeleteDuplicateInFolderATest, self).__init__(*args, **kwargs)
self._base_directory = r"c:\temp\test"
self._path_A = self._base_directory + r"\A"
self._path_B = self._base_directory + r"\B"
def create_folder_and_create_some_files(self, path, filename_list):
if os.path.exists(path):
shutil.rmtree(path)
os.makedirs(path)
for filename in filename_list:
open(os.path.join(path, filename), "w+").close()
def setUp(self):
# Create folders and files for testing
self.create_folder_and_create_some_files(self._path_A, ["1.txt", "2.txt"])
self.create_folder_and_create_some_files(self._path_B, ["2.txt", "3.txt"])
def tearDown(self):
for path in [self._path_A, self._path_B, self._base_directory]:
if os.path.exists(path):
shutil.rmtree(path)
def test_duplicate_file_gets_deleted(self):
# Arrange
app = DeleteDuplicateInFolderA(self._path_A, self._path_B, is_dry_run=False)
# Act
app.delete_duplicates_in_folder_A()
# Assert
self.assertFalse(os.path.isfile(self._path_A + r"\2.txt"), "File 2.txt has not been deleted.")
def test_duplicate_file_gets_not_deleted_in_mode_dryrun(self):
# Arrange
app = DeleteDuplicateInFolderA(self._path_A, self._path_B, is_dry_run=True)
# Act
app.delete_duplicates_in_folder_A()
# Assert
self.assertTrue(os.path.isfile(self._path_A + r"\2.txt"), "File 2.txt should not have been deleted in mode '--dryrun'")
def main():
    # Scrape truncated this line ("unittes"); restored to the standard
    # unittest entry point, which discovers and runs the TestCase above.
    unittest.main()
Solution
To make things more Pythonic you should follow PEP8, as it is the standard for Pythonic code.
I personally would change
And so I'd make it a generator that yields files with paths relative to its folder.
I.e
To achieve this you just need to add
This results in:
Your way of finding the
I'd go for an easier to read way of making sets and getting the unison.
Nice and easy to read.
I don't like that you print different things with and without
Instead I'd make it print duplicates if there are some, otherwise nothing.
And I also don't like that any file that has the same name is a duplicate,
I can't think of good names when I want a temporary python file, and so I do make all:
Sometimes I even do
I know that those files are not duplicates, but have the same name.
So I'd use
And so I'd use the following function to do the same as you:
Which uses very little actual code.
After this, I'd change how do your
Instead of rolling your own argument passer, you can just use
Trust me it's really simple to make and use.
Instead of what you're doing, you can just do:
And you don't have to care about wrong input, as it'll never get to you.
The only thing that I think is ugly is the print I'm using, but since the alternate is slow or a lot longer I'm fine with it.
And possibly the
And so it could be:
I personally would change
get_filenames_in_folder, as it gets all child files, even ones in folders. And so I'd make it a generator that yields files with paths relative to its folder.
I.e
.\spam.py and foo\bar.py.To achieve this you just need to add
os.path.relpath and then os.path.join the result with the file name.This results in:
def sub_files(folder):
relpath = os.path.relpath
join = os.path.join
for path, _, files in os.walk(folder):
relative = relpath(path, folder)
for file in files:
yield join(relative, file)

Your way of finding the
duplicates, or as you call them files_of_A_that_are_in_B, is ok.I'd go for an easier to read way of making sets and getting the unison.
set(files_a) & set(files_b).Nice and easy to read.
I don't like that you print different things with and without
is_dry_run.Instead I'd make it print duplicates if there are some, otherwise nothing.
And I also don't like that any file that has the same name is a duplicate,
I can't think of good names when I want a temporary python file, and so I do make all:
tmp, temp, tmp.py, temp.py.Sometimes I even do
temp.temp.py!I know that those files are not duplicates, but have the same name.
So I'd use
filecmp.And so I'd use the following function to do the same as you:
def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
folders = [folder_a, folder_b]
files = [set(sub_files(folder)) for folder in folders]
duplicates, *_ = groups = [files[0] & files[1]]
if not shallow:
duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates)
print('\n\n'.join(
'{}:\n{}'.format(name, '\n'.join(' {}'.format(file) for file in files))
for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
if files
))
if not dry_run:
join = os.path.join
remove = os.remove
for file in duplicates:
remove(join(folder_a, file))

Which uses very little actual code.
After this, I'd change how do your
__main__.Instead of rolling your own argument passer, you can just use
argparse.Trust me it's really simple to make and use.
Instead of what you're doing, you can just do:
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('folder_a', help='folder to delete from')
parser.add_argument('folder_b', help='folder to find duplicates in')
parser.add_argument('-d', '--dry', help='dry-run the program', action="store_true")
parser.add_argument('-s', '--shallow', help='only check file names for duplicates',
action="store_true")
args = parser.parse_args()
remove_duplicates(args.folder_a, args.folder_b, shallow=args.shallow, dry_run=args.dry)

And you don't have to care about wrong input, as it'll never get to you.
The only thing that I think is ugly is the print I'm using, but since the alternate is slow or a lot longer I'm fine with it.
And possibly the
duplicates, *_ hack.And so it could be:
import os
import filecmp
import argparse
def sub_files(folder):
    """Yield every file below *folder* as a path relative to *folder*.

    Walks the tree recursively, so nested files come out as e.g.
    ``sub/file.txt`` and top-level files as ``./file.txt``.
    """
    relpath = os.path.relpath
    join = os.path.join
    for path, _, files in os.walk(folder):
        relative = relpath(path, folder)
        for file in files:
            yield join(relative, file)


def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
    """Delete files from *folder_a* that are duplicated in *folder_b*.

    A duplicate is a file with the same relative path in both folders;
    unless *shallow* is true the file contents must match as well.

    :param folder_a: folder that duplicates are deleted from.
    :param folder_b: folder that is searched for duplicates.
    :param shallow: if true, a matching name alone marks a duplicate;
        otherwise the contents are compared too.
    :param dry_run: if true, only report duplicates; delete nothing.
    """
    folders = [folder_a, folder_b]
    files = [set(sub_files(folder)) for folder in folders]
    duplicates, *_ = groups = [files[0] & files[1]]
    if not shallow:
        # BUG FIX: cmpfiles() defaults to shallow=True, which compares only
        # os.stat() signatures (size/mtime) -- same-size, same-mtime files
        # with different contents would be wrongly reported as duplicates
        # and deleted.  Force a byte-by-byte comparison for the deep check.
        duplicates, *_ = groups = filecmp.cmpfiles(folder_a, folder_b,
                                                   duplicates, shallow=False)
    # Print each non-empty group; cmpfiles returns (match, mismatch, errors).
    print('\n\n'.join(
        '{}:\n{}'.format(name, '\n'.join(' {}'.format(file) for file in files))
        for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
        if files
    ))
    if not dry_run:
        join = os.path.join
        remove = os.remove
        for file in duplicates:
            remove(join(folder_a, file))
if __name__ == '__main__':
    # Command-line interface: two positional folders plus two switches.
    parser = argparse.ArgumentParser()
    parser.add_argument('folder_a', help='folder to delete from')
    parser.add_argument('folder_b', help='folder to find duplicates in')
    parser.add_argument('-d', '--dry', action="store_true",
                        help='dry-run the program')
    parser.add_argument('-s', '--shallow', action="store_true",
                        help='only check file names for duplicates')
    args = parser.parse_args()
remove_duplicates(args.folder_a, args.folder_b, shallow=args.shallow, dry_run=args.dry)

Code Snippets
def sub_files(folder):
relpath = os.path.relpath
join = os.path.join
for path, _, files in os.walk(folder):
relative = relpath(path, folder)
for file in files:
yield join(relative, file)

def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
folders = [folder_a, folder_b]
files = [set(sub_files(folder)) for folder in folders]
duplicates, *_ = groups = [files[0] & files[1]]
if not shallow:
duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates)
print('\n\n'.join(
'{}:\n{}'.format(name, '\n'.join(' {}'.format(file) for file in files))
for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
if files
))
if not dry_run:
join = os.path.join
remove = os.remove
for file in duplicates:
remove(join(folder_a, file))

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('folder_a', help='folder to delete from')
parser.add_argument('folder_b', help='folder to find duplicates in')
parser.add_argument('-d', '--dry', help='dry-run the program', action="store_true")
parser.add_argument('-s', '--shallow', help='only check file names for duplicates',
action="store_true")
args = parser.parse_args()
remove_duplicates(args.folder_a, args.folder_b, shallow=args.shallow, dry_run=args.dry)

import os
import filecmp
import argparse
def sub_files(folder):
    """Yield each file under *folder* (recursively) relative to *folder*."""
    relpath = os.path.relpath
    join = os.path.join
    for path, _, files in os.walk(folder):
        relative = relpath(path, folder)
        for file in files:
            yield join(relative, file)


def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
    """Delete files from *folder_a* whose duplicates exist in *folder_b*.

    Duplicates share a relative path in both folders and, unless
    *shallow* is true, identical contents.  With *dry_run* the
    duplicates are only reported, never deleted.
    """
    folders = [folder_a, folder_b]
    files = [set(sub_files(folder)) for folder in folders]
    duplicates, *_ = groups = [files[0] & files[1]]
    if not shallow:
        # BUG FIX: cmpfiles() defaults to a shallow os.stat() comparison;
        # pass shallow=False so a deep check really compares file contents.
        duplicates, *_ = groups = filecmp.cmpfiles(folder_a, folder_b,
                                                   duplicates, shallow=False)
    # Report the non-empty groups returned by cmpfiles (match/mismatch/errors).
    print('\n\n'.join(
        '{}:\n{}'.format(name, '\n'.join(' {}'.format(file) for file in files))
        for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
        if files
    ))
    if not dry_run:
        join = os.path.join
        remove = os.remove
        for file in duplicates:
            remove(join(folder_a, file))
if __name__ == '__main__':
    # Build the CLI: folder_a / folder_b positionals, --dry and --shallow flags.
    parser = argparse.ArgumentParser()
    parser.add_argument('folder_a', help='folder to delete from')
    parser.add_argument('folder_b', help='folder to find duplicates in')
    parser.add_argument('-d', '--dry', help='dry-run the program',
                        action="store_true")
    parser.add_argument('-s', '--shallow',
                        help='only check file names for duplicates',
                        action="store_true")
    args = parser.parse_args()
remove_duplicates(args.folder_a, args.folder_b, shallow=args.shallow, dry_run=args.dry)

Context
StackExchange Code Review Q#133333, answer score: 3
Revisions (0)
No revisions yet.