HiveBrain v1.2.0
Get Started
← Back to all entries
principlepythonMinor

Compare files by name in two folders A and B and delete duplicates from folder A

Submitted by: @import:stackexchange-codereview
0
Viewed 0 times
Tags: python, files, folders, compare, duplicates, delete

Problem

I wrote my very first Python program to answer a Super User question. I'm wondering how the application can be rewritten to make it as pythonic as possible.

Unit test

```
import unittest
import os
import shutil
from DeleteDuplicateInFolderA import DeleteDuplicateInFolderA

class DeleteDuplicateInFolderATest(unittest.TestCase):

def __init__(self, *args, **kwargs):
super(DeleteDuplicateInFolderATest, self).__init__(*args, **kwargs)
self._base_directory = r"c:\temp\test"
self._path_A = self._base_directory + r"\A"
self._path_B = self._base_directory + r"\B"

def create_folder_and_create_some_files(self, path, filename_list):
if os.path.exists(path):
shutil.rmtree(path)
os.makedirs(path)
for filename in filename_list:
open(os.path.join(path, filename), "w+").close()

def setUp(self):
# Create folders and files for testing
self.create_folder_and_create_some_files(self._path_A, ["1.txt", "2.txt"])
self.create_folder_and_create_some_files(self._path_B, ["2.txt", "3.txt"])

def tearDown(self):
for path in [self._path_A, self._path_B, self._base_directory]:
if os.path.exists(path):
shutil.rmtree(path)

def test_duplicate_file_gets_deleted(self):
# Arrange
app = DeleteDuplicateInFolderA(self._path_A, self._path_B, is_dry_run=False)

# Act
app.delete_duplicates_in_folder_A()

# Assert
self.assertFalse(os.path.isfile(self._path_A + r"\2.txt"), "File 2.txt has not been deleted.")

def test_duplicate_file_gets_not_deleted_in_mode_dryrun(self):
# Arrange
app = DeleteDuplicateInFolderA(self._path_A, self._path_B, is_dry_run=True)

# Act
app.delete_duplicates_in_folder_A()

# Assert
self.assertTrue(os.path.isfile(self._path_A + r"\2.txt"), "File 2.txt should not have been deleted in mode '--dryrun'")

def main():
unittes

Solution

To make things more Pythonic you should follow PEP8, as it is the standard for Pythonic code.

I personally would change get_filenames_in_folder, as it gets all child files, even ones in folders.
And so I'd make it a generator that yields files with paths relative to its folder.
I.e. `.\spam.py` and `foo\bar.py`.
To achieve this you just need to add os.path.relpath and then os.path.join the result with the file name.
This results in:

def sub_files(folder):
    """Yield every file below *folder* as a path relative to *folder*.

    Walks the tree recursively; top-level entries come out prefixed with
    ``.`` (e.g. ``./spam.py``), nested ones as ``sub/file.py``.
    """
    for dirpath, _, filenames in os.walk(folder):
        relative_dir = os.path.relpath(dirpath, folder)
        for filename in filenames:
            yield os.path.join(relative_dir, filename)


Your way of finding the duplicates, or as you call them files_of_A_that_are_in_B, is ok.
I'd go for an easier-to-read way of making sets and taking the intersection: set(files_a) & set(files_b).
Nice and easy to read.

I don't like that you print different things with and without is_dry_run.
Instead I'd make it print duplicates if there are some, otherwise nothing.
And I also don't like that any file that has the same name is a duplicate,
I can't think of good names when I want a temporary Python file, and so I end up making all of: tmp, temp, tmp.py, temp.py.
Sometimes I even do temp.temp.py!
I know that those files are not duplicates, but have the same name.
So I'd use filecmp.

And so I'd use the following function to do the same as you:

def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
    """Delete every file in *folder_a* that duplicates a file in *folder_b*.

    Files are matched by their path relative to each folder.  Unless
    *shallow* is true the contents are compared as well, so same-named
    but different files are reported as "Non-Duplicates" and kept.

    :param folder_a: folder that duplicates are deleted from.
    :param folder_b: folder searched for matching files.
    :param shallow: if true, a matching name alone marks a duplicate.
    :param dry_run: if true, only report what would be deleted.
    """
    folders = [folder_a, folder_b]
    files = [set(sub_files(folder)) for folder in folders]
    # Names present in both folders are the duplicate candidates.
    duplicates, *_ = groups = [files[0] & files[1]]
    if not shallow:
        # Force a byte-by-byte comparison: filecmp's own default
        # (shallow=True) trusts os.stat signatures, so two different
        # files with equal size and mtime could be wrongly deleted.
        duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates,
                                                   shallow=False)
    print('\n\n'.join(
        '{}:\n{}'.format(name, '\n'.join('    {}'.format(file) for file in files))
        for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
        if files
    ))
    if not dry_run:
        for file in duplicates:
            os.remove(os.path.join(folder_a, file))


Which uses very little actual code.

After this, I'd change how you do your __main__.
Instead of rolling your own argument parser, you can just use argparse.
Trust me it's really simple to make and use.
Instead of what you're doing, you can just do:

if __name__ == '__main__':
    # Command-line interface: two positional folders plus two boolean flags.
    cli = argparse.ArgumentParser()
    cli.add_argument('folder_a', help='folder to delete from')
    cli.add_argument('folder_b', help='folder to find duplicates in')
    cli.add_argument('-d', '--dry', action="store_true",
                     help='dry-run the program')
    cli.add_argument('-s', '--shallow', action="store_true",
                     help='only check file names for duplicates')
    options = cli.parse_args()

    remove_duplicates(options.folder_a, options.folder_b,
                      shallow=options.shallow, dry_run=options.dry)


And you don't have to care about wrong input, as it'll never get to you.

The only thing that I think is ugly is the print I'm using, but since the alternative is slower or a lot longer, I'm fine with it.
And possibly the duplicates, *_ hack.
And so it could be:

import os
import filecmp
import argparse

def sub_files(folder):
    """Yield every file below *folder* as a path relative to *folder*.

    Walks the tree recursively; top-level entries come out prefixed with
    ``.`` (e.g. ``./spam.py``), nested ones as ``sub/file.py``.
    """
    for dirpath, _, filenames in os.walk(folder):
        relative_dir = os.path.relpath(dirpath, folder)
        for filename in filenames:
            yield os.path.join(relative_dir, filename)

def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
    """Delete every file in *folder_a* that duplicates a file in *folder_b*.

    Files are matched by their path relative to each folder.  Unless
    *shallow* is true the contents are compared as well, so same-named
    but different files are reported as "Non-Duplicates" and kept.

    :param folder_a: folder that duplicates are deleted from.
    :param folder_b: folder searched for matching files.
    :param shallow: if true, a matching name alone marks a duplicate.
    :param dry_run: if true, only report what would be deleted.
    """
    folders = [folder_a, folder_b]
    files = [set(sub_files(folder)) for folder in folders]
    # Names present in both folders are the duplicate candidates.
    duplicates, *_ = groups = [files[0] & files[1]]
    if not shallow:
        # Force a byte-by-byte comparison: filecmp's own default
        # (shallow=True) trusts os.stat signatures, so two different
        # files with equal size and mtime could be wrongly deleted.
        duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates,
                                                   shallow=False)
    print('\n\n'.join(
        '{}:\n{}'.format(name, '\n'.join('    {}'.format(file) for file in files))
        for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
        if files
    ))
    if not dry_run:
        for file in duplicates:
            os.remove(os.path.join(folder_a, file))

if __name__ == '__main__':
    # Command-line interface: two positional folders plus two boolean flags.
    cli = argparse.ArgumentParser()
    cli.add_argument('folder_a', help='folder to delete from')
    cli.add_argument('folder_b', help='folder to find duplicates in')
    cli.add_argument('-d', '--dry', action="store_true",
                     help='dry-run the program')
    cli.add_argument('-s', '--shallow', action="store_true",
                     help='only check file names for duplicates')
    options = cli.parse_args()

    remove_duplicates(options.folder_a, options.folder_b,
                      shallow=options.shallow, dry_run=options.dry)

Code Snippets

def sub_files(folder):
    """Yield every file below *folder* as a path relative to *folder*.

    Walks the tree recursively; top-level entries come out prefixed with
    ``.`` (e.g. ``./spam.py``), nested ones as ``sub/file.py``.
    """
    for dirpath, _, filenames in os.walk(folder):
        relative_dir = os.path.relpath(dirpath, folder)
        for filename in filenames:
            yield os.path.join(relative_dir, filename)
def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
    """Delete every file in *folder_a* that duplicates a file in *folder_b*.

    Files are matched by their path relative to each folder.  Unless
    *shallow* is true the contents are compared as well, so same-named
    but different files are reported as "Non-Duplicates" and kept.

    :param folder_a: folder that duplicates are deleted from.
    :param folder_b: folder searched for matching files.
    :param shallow: if true, a matching name alone marks a duplicate.
    :param dry_run: if true, only report what would be deleted.
    """
    folders = [folder_a, folder_b]
    files = [set(sub_files(folder)) for folder in folders]
    # Names present in both folders are the duplicate candidates.
    duplicates, *_ = groups = [files[0] & files[1]]
    if not shallow:
        # Force a byte-by-byte comparison: filecmp's own default
        # (shallow=True) trusts os.stat signatures, so two different
        # files with equal size and mtime could be wrongly deleted.
        duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates,
                                                   shallow=False)
    print('\n\n'.join(
        '{}:\n{}'.format(name, '\n'.join('    {}'.format(file) for file in files))
        for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
        if files
    ))
    if not dry_run:
        for file in duplicates:
            os.remove(os.path.join(folder_a, file))
if __name__ == '__main__':
    # Command-line interface: two positional folders plus two boolean flags.
    cli = argparse.ArgumentParser()
    cli.add_argument('folder_a', help='folder to delete from')
    cli.add_argument('folder_b', help='folder to find duplicates in')
    cli.add_argument('-d', '--dry', action="store_true",
                     help='dry-run the program')
    cli.add_argument('-s', '--shallow', action="store_true",
                     help='only check file names for duplicates')
    options = cli.parse_args()

    remove_duplicates(options.folder_a, options.folder_b,
                      shallow=options.shallow, dry_run=options.dry)
import os
import filecmp
import argparse


def sub_files(folder):
    """Yield every file below *folder* as a path relative to *folder*.

    Walks the tree recursively; top-level entries come out prefixed with
    ``.`` (e.g. ``./spam.py``), nested ones as ``sub/file.py``.
    """
    for dirpath, _, filenames in os.walk(folder):
        relative_dir = os.path.relpath(dirpath, folder)
        for filename in filenames:
            yield os.path.join(relative_dir, filename)


def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
    folders = [folder_a, folder_b]
    files = [set(sub_files(folder)) for folder in folders]
    duplicates, *_ = groups = [files[0] & files[1]]
    if not shallow:
        duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates)
    print('\n\n'.join(
        '{}:\n{}'.format(name, '\n'.join('    {}'.format(file) for file in files))
        for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
        if files
    ))
    if not dry_run:
        join = os.path.join
        remove = os.remove
        for file in duplicates:
            remove(join(folder_a, file))


if __name__ == '__main__':
    # Command-line interface: two positional folders plus two boolean flags.
    cli = argparse.ArgumentParser()
    cli.add_argument('folder_a', help='folder to delete from')
    cli.add_argument('folder_b', help='folder to find duplicates in')
    cli.add_argument('-d', '--dry', action="store_true",
                     help='dry-run the program')
    cli.add_argument('-s', '--shallow', action="store_true",
                     help='only check file names for duplicates')
    options = cli.parse_args()

    remove_duplicates(options.folder_a, options.folder_b,
                      shallow=options.shallow, dry_run=options.dry)

Context

StackExchange Code Review Q#133333, answer score: 3

Revisions (0)

No revisions yet.