HiveBrain v1.2.0
Get Started
← Back to all entries
patternpythonMinor

Removing duplicate files in a given directory

Submitted by: @import:stackexchange-codereview··
0
Viewed 0 times
directoryremovingduplicatefilesgiven

Problem

The following code is a python script that removes duplicate files in a given directory. At first I considered the most basic thing: relying on identical names, but you might have 2 different files with the same name in 2 different directories. So I decided to rely on md5checksum, since any 2 files that yield the same md5checksum almost invariably have the same content.

```
#A simple Python script to remove duplicate files...Coded by MCoury AKA python-scripter
import hashlib
import os

#define a function to calculate md5checksum for a given file:
def md5(f):
"""takes one file f as an argument and generates an md5checksum for that file"""
return hashlib.md5(open(f,'rb').read()).hexdigest()

#define our main function:
def rm_dup(path):
"""relies on the md5 function above to remove duplicate files"""
if not os.path.isdir(path):#make sure the given directory exists
print('specified directory does not exist!')
else:
md5_dict={}
for root, dirs, files in os.walk(path):#the os.walk function allows checking subdirectories too...
for f in files:
if not md5(os.path.join(root,f)) in md5_dict:
md5_dict.update({md5(os.path.join(root,f)):[os.path.join(root,f)]})
else:
md5_dict[md5(os.path.join(root,f))].append(os.path.join(root,f))
for key in md5_dict:
while len(md5_dict[key])>1:
for item in md5_dict[key]:
os.remove(item)
md5_dict[key].remove(item)
print('Done!')

if __name__=='__main__':
print('=======A simple Python script to remove duplicate files===========')
print()
print('============Coded by MCoury AKA python-scripter===================')
print()
print('===========The script counts on the fact the fact=================')
print('=========that if 2 files have the same md5checksum================')
print('==========they most likely have t

Solution

To get md5 of large files you can use something like this:

def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and read 4096 bytes at a time, so
    the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(fname, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:  # an empty bytes object marks end of file
                break
            digest.update(block)
    return digest.hexdigest()


Now here

for f in files:
    if not md5(os.path.join(root,f)) in md5_dict:
        md5_dict.update({md5(os.path.join(root,f)):[os.path.join(root,f)]})
    else:
        md5_dict[md5(os.path.join(root,f))].append(os.path.join(root,f))


I see two things:

  • You don't need to check whether the key is already in your dict; you can use a defaultdict instead.



  • The md5 is calculated twice for each file: first to check whether it's in the dict, and a second time to actually add it to the dict.



Here:

while len(md5_dict[key])>1:
    for item in md5_dict[key]:
        os.remove(item)
        md5_dict[key].remove(item)


You can just use list.pop()

So in the end your code should look like this:

import hashlib
import os
from collections import defaultdict

def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and read 4096 bytes at a time, so
    the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(fname, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:  # an empty bytes object marks end of file
                break
            digest.update(block)
    return digest.hexdigest()


def rm_dup(path):
    """Delete duplicate files under ``path``, keeping one copy of each.

    Every file found by walking ``path`` (subdirectories included) is
    grouped by its MD5 digest. Within each group the first path
    encountered is kept and all the others are removed with
    ``os.remove``.

    If ``path`` is not an existing directory, a message is printed and
    nothing is deleted. The function always returns ``None``.
    """
    if not os.path.isdir(path):  # make sure the given directory exists
        print('specified directory does not exist!')
        return

    md5_dict = defaultdict(list)
    # os.walk descends into subdirectories too; the directory list is unused.
    for root, _dirs, files in os.walk(path):
        for filename in files:
            filepath = os.path.join(root, filename)
            # Hash the full path: a bare file name would be resolved
            # against the current working directory, not ``root``.
            md5_dict[md5(filepath)].append(filepath)

    for file_list in md5_dict.values():
        # Keep the first file of each group and delete the rest.
        for duplicate in file_list[1:]:
            os.remove(duplicate)
    print('Done!')

if __name__ == '__main__':
    # Intro banner; the empty entries print as blank separator lines.
    banner = (
        '=======A simple Python script to remove duplicate files===========',
        '',
        '============Coded by codereview.stackexchange.com AKA python-scripter===================',
        '',
        '===========The script counts on the fact the fact=================',
        '=========that if 2 files have the same md5checksum================',
        '==========they most likely have the same content==================',
        '',
    )
    print('\n'.join(banner))
    # Ask for the directory to scan, then remove its duplicate files.
    path = input(r'Please provide the target path\directory... for example: c: or c:\directory...')
    print()
    rm_dup(path)


However, there is one more thing. In general it is right to first list all the files and then delete them one by one, because in the future you may want to add an option for the user to select which files to delete. In the current case, though, you can simply keep the md5 checksums in a set and delete a file as soon as you find one whose checksum is already in the set.

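P.S. Two files with the same checksum are not 100% guaranteed to be identical. Consider adding an additional check before deleting, for example comparing the file sizes or the contents byte by byte (e.g. with filecmp.cmp(a, b, shallow=False)).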

Code Snippets

def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and read 4096 bytes at a time, so
    the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(fname, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:  # an empty bytes object marks end of file
                break
            digest.update(block)
    return digest.hexdigest()
for f in files:
    if not md5(os.path.join(root,f)) in md5_dict:
        md5_dict.update({md5(os.path.join(root,f)):[os.path.join(root,f)]})
    else:
        md5_dict[md5(os.path.join(root,f))].append(os.path.join(root,f))
while len(md5_dict[key])>1:
    for item in md5_dict[key]:
        os.remove(item)
        md5_dict[key].remove(item)
import hashlib
import os
from collections import defaultdict


def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and read 4096 bytes at a time, so
    the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(fname, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:  # an empty bytes object marks end of file
                break
            digest.update(block)
    return digest.hexdigest()


def rm_dup(path):
    """Delete duplicate files under ``path``, keeping one copy of each.

    Every file found by walking ``path`` (subdirectories included) is
    grouped by its MD5 digest. Within each group the first path
    encountered is kept and all the others are removed with
    ``os.remove``.

    If ``path`` is not an existing directory, a message is printed and
    nothing is deleted. The function always returns ``None``.
    """
    if not os.path.isdir(path):  # make sure the given directory exists
        print('specified directory does not exist!')
        return

    md5_dict = defaultdict(list)
    # os.walk descends into subdirectories too; the directory list is unused.
    for root, _dirs, files in os.walk(path):
        for filename in files:
            filepath = os.path.join(root, filename)
            # Hash the full path: a bare file name would be resolved
            # against the current working directory, not ``root``.
            md5_dict[md5(filepath)].append(filepath)

    for file_list in md5_dict.values():
        # Keep the first file of each group and delete the rest.
        for duplicate in file_list[1:]:
            os.remove(duplicate)
    print('Done!')

if __name__ == '__main__':
    # Intro banner; the empty entries print as blank separator lines.
    banner = (
        '=======A simple Python script to remove duplicate files===========',
        '',
        '============Coded by codereview.stackexchange.com AKA python-scripter===================',
        '',
        '===========The script counts on the fact the fact=================',
        '=========that if 2 files have the same md5checksum================',
        '==========they most likely have the same content==================',
        '',
    )
    print('\n'.join(banner))
    # Ask for the directory to scan, then remove its duplicate files.
    path = input(r'Please provide the target path\directory... for example: c: or c:\directory...')
    print()
    rm_dup(path)

Context

StackExchange Code Review Q#148099, answer score: 5

Revisions (0)

No revisions yet.