#
##
## SPDX-FileCopyrightText: © 2007-2024 Benedict Verhegghe <bverheg@gmail.com>
## SPDX-License-Identifier: GPL-3.0-or-later
##
## This file is part of pyFormex 3.5 (Thu Feb 8 19:11:13 CET 2024)
## pyFormex is a tool for generating, manipulating and transforming 3D
## geometrical models by sequences of mathematical operations.
## Home page: https://pyformex.org
## Project page: https://savannah.nongnu.org/projects/pyformex/
## Development: https://gitlab.com/bverheg/pyformex
## Distributed under the GNU General Public License version 3 or later.
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see http://www.gnu.org/licenses/.
##
"""Find (and delete) duplicate files.
This pyFormex app finds duplicate files on your file system(s). When running
the app, it shows a FileDialog to let you select a directory path. The it
finds all the duplicate files under that directory tree. The sets of identical
files are then presented with the option to delete one or more of the duplicate
files.
The functions in the app's module can be used separately to present other
use cases. This can conveniently be done as follows::
from pyformex.apps.FileDedup import identical_files
for resolved, size, files in identical_files(dir1, dir2, dir3):
if resolved:
do_something_with_duplicate_files(files)
This module uses some ideas from
https://discuss.python.org/t/identifying-duplicate-files-where-speed-is-a-concern/44534/15
"""
import os
import sys
import hashlib
from collections import defaultdict
import pyformex as pf
from pyformex.gui.guicore import _I

# TODO: remove when we make 3.11 the minimum
if sys.hexversion < 0x030B0000:
    def file_digest(fileobj, hash_algorithm='sha256', chunk_size=65536):
        """Compute the digest of an open binary file.

        This is a stand-in for :func:`hashlib.file_digest`, which is only
        available from Python 3.11 onwards.

        Args:
            fileobj: A file object opened in binary mode for reading.
            hash_algorithm (str): Hash algorithm to use, default is 'sha256'.
            chunk_size (int): Size of the chunks read from the file,
                default is 64 kB.

        Returns:
            A hash object holding the digest of the file contents.
        """
        hash_func = hashlib.new(hash_algorithm)
        while True:
            chunk = fileobj.read(chunk_size)
            if not chunk:
                break
            hash_func.update(chunk)
        return hash_func
else:
    file_digest = hashlib.file_digest


def classify_by_size(files):
    """Classify files by their size.

    Parameters
    ----------
    files: list of :term:`path_like`
        The list of file paths to be classified.

    Returns
    -------
    dict
        A dict with the file size as key and the list of files with that
        size as value.
    """
    e = defaultdict(list)
    for fn in files:
        e[os.path.getsize(fn)].append(fn)
    return e
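

# A minimal usage sketch (not part of the original module; the paths are
# hypothetical): only size classes with more than one member can contain
# duplicates, so callers typically filter on the length of the file lists.
def _example_size_classes(paths):
    """Return only the size classes that may contain duplicates."""
    return {size: fns for size, fns in classify_by_size(paths).items()
            if len(fns) > 1}

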
CHUNKSIZE = 1 << 6


def refine_chunk3(size, files):
    """Refine lists of equal sized files by comparing small chunks.

    Generate subsets of files that are candidates for being identical.

    Parameters
    ----------
    size: int
        The size in bytes of all the **files**.
    files: list of :term:`path_like`
        A list of file paths all having the same **size**.

    Returns
    -------
    resolved: bool
        If True, the returned files are definitely identical.
    size: int
        The file size.
    files: list of :term:`path_like`
        A subset of the input files. All files in the subset have exactly
        the same bytes in some small chunks. These files are thus candidates
        for being identical.

    Notes
    -----
    This is a generator function. It can be iterated until all input files
    have been processed.

    The current implementation reads three chunks of at most CHUNKSIZE
    bytes at the start, middle and end of the file. Files with a size
    not larger than three times the CHUNKSIZE are read completely, so
    matching files of that size are necessarily identical.
    """
    e = defaultdict(list)
    if size <= 3 * CHUNKSIZE:
        # small files are read and compared completely
        chunksize = size
        resolved = True
    else:
        chunksize = CHUNKSIZE
        resolved = False
        start1 = max(0, (size-CHUNKSIZE) // 2)
        start2 = size - CHUNKSIZE
    for fn in files:
        with open(fn, "rb") as f:
            if resolved:
                chunk = f.read()
            else:
                chunk0 = f.read(chunksize)
                f.seek(start1)
                chunk1 = f.read(chunksize)
                f.seek(start2)
                chunk2 = f.read(chunksize)
                chunk = chunk0 + chunk1 + chunk2
        e[chunk].append(fn)
    for chunk, cands in e.items():
        if len(cands) > 1:
            yield resolved, size, cands


DIGEST = 'sha256'


def refine_hash(size, files, digest=DIGEST):
    """Generate subsets of equally sized files that are identical.

    Parameters
    ----------
    size: int
        The size in bytes of all the **files**.
    files: list of :term:`path_like`
        A list of file paths all having the same **size**.
    digest: str
        The hash algorithm to use. The default is 'sha256'.

    Returns
    -------
    resolved: bool
        Always True.
    size: int
        The file size.
    files: list of :term:`path_like`
        A subset of the input files that have the same hash for their full
        contents, and can safely be considered identical.

    Notes
    -----
    This is a generator function. It can be iterated until all input files
    have been processed.

    The current implementation uses the 'SHA256' hash, which has practically
    zero chance of collisions. The number of different hashes is 2**256, or
    more than 10**77. Having two files with the same hash is, while
    theoretically possible, extremely improbable.
    """
    e = defaultdict(list)
    for fn in files:
        with open(fn, 'rb') as f:
            # hashlib.file_digest requires 3.11; a fallback is defined above
            hash = file_digest(f, digest).digest()
        e[hash].append(fn)
    for hash, cands in e.items():
        if len(cands) > 1:
            yield True, size, cands
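

# A minimal sketch (not part of the original module) of how the two refine
# steps can be chained for a single class of equal-size files: compare small
# chunks first, then hash only the still unresolved candidates.
def _example_refine_class(size, candidates):
    """Yield the confirmed duplicate sets of one equal-size class."""
    for resolved, sz, cands in refine_chunk3(size, candidates):
        if resolved:
            yield sz, cands
        else:
            for _, sz2, sure in refine_hash(sz, cands):
                yield sz2, sure

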
def print_classes(classes):
    nclass = len(classes)
    nfiles = sum([len(c[1]) for c in classes])
    print(" leaves ", nfiles, "files in", nclass, "classes")


def identical_files(files, quick=False, verbose=False):
    """Generate lists of identical files.

    Parameters
    ----------
    files: list of :term:`path_like`
        The list of files to search for identical ones.
    quick: bool
        If True, skip the hash refinement. This runs a lot faster, but
        not all results will be confirmed as identical: some may just be
        candidates for being identical files. The default (False)
        resolves all identical files.
    verbose: bool
        If True, print progress information to stdout.

    Returns
    -------
    resolved: bool
        If True, the returned files are definitely identical.
    size: int
        The file size.
    files: list of :term:`path_like`
        A subset of the input files. All files in the subset have exactly
        the same bytes in some small chunks and are thus at least candidates
        for being identical.

    Notes
    -----
    This is a generator function. It can be iterated until all input files
    have been processed.
    """
    times = pf.Timings()
    nfiles = len(files)
    files = classify_by_size(files)
    nclass = len(files)
    nbytes = sum([k * len(v) for k, v in files.items()])
    if verbose:
        print(f"Starting with {nfiles} files in {nclass} classes for "
              f"{round(nbytes / (1<<30), 3)} GiB")
    with pf.Timing("Classify by size", reg=times):
        classes = []
        for size, cands in files.items():
            if len(cands) > 1:
                if size == 0:
                    # empty files are all identical
                    yield True, size, cands
                else:
                    # unresolved
                    classes.append((size, cands))
    if verbose:
        print("refine_size")
        print_classes(classes)
    if quick:
        refines = (refine_chunk3,)
    else:
        refines = (refine_chunk3, refine_hash)
    for refine in refines:
        if verbose:
            print(refine.__name__)
        with pf.Timing(refine.__name__, reg=times):
            newclasses = []
            for size, cands in classes:
                for resolved, size, files in refine(size, cands):
                    if resolved:
                        yield resolved, size, files
                    else:
                        newclasses.append((size, files))
            classes = newclasses
        if verbose:
            print_classes(classes)
        if not classes:
            break
    if quick:
        for size, files in classes:
            yield False, size, files
    else:
        assert not classes  # everything should be resolved
    if verbose:
        print(times)
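

# A minimal sketch (not part of the original module) of using the quick mode:
# with quick=True the generator may also yield unresolved candidate sets, so
# the caller has to inspect the resolved flag.
def _example_quick_scan(files):
    """Split quick-mode results into confirmed duplicates and candidates."""
    confirmed, candidates = [], []
    for resolved, size, eqfiles in identical_files(files, quick=True):
        (confirmed if resolved else candidates).append((size, eqfiles))
    return confirmed, candidates

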
def listDuplicates(files):
    """Print the sets of identical files to stdout.

    Parameters
    ----------
    files: list of :term:`path_like`
        The list of file paths to be searched for duplicates.
    """
    for resolved, size, eqfiles in identical_files(files):
        print(len(eqfiles), [str(f) for f in eqfiles])


def collectFiles(*paths):
    """Collect all the files along the provided paths.

    Parameters
    ----------
    paths: list of :term:`path_like`
        One or more paths to collect files for deduplication. If a path is
        a directory, all files below it are added to the list of files.
        If it is a file, it is added as such.
        If no path is provided, the current directory is used.

    Returns
    -------
    list of :class:`~path.Path`
        A list of all files under the provided ``paths``.
    """
    if len(paths) == 0:
        paths = ['.']
    files = []
    for p in paths:
        if isinstance(p, str):
            p = pf.Path(p)
        if not isinstance(p, pf.Path):
            raise ValueError(f"Expected str or Path, got {type(p)}")
        if p.is_dir():
            files.extend(p.listTree())
        else:
            files.append(p)
    return files


def deduplicate(*paths, quick=False):
    """Deduplicate identical files.

    Search for identical files and offer the option to delete duplicates.

    Parameters
    ----------
    paths: list of :term:`path_like`
        One or more paths to collect files for deduplication. If a path is
        a directory, all files below it are added to the list of files.
        If it is a file, it is added as such. Symlinks are not followed.
    quick: bool
        If True, skip the hash refinement (see :func:`identical_files`),
        so some reported sets may only be candidates for being identical.
    """
    files = collectFiles(*paths)
    for resolved, size, eqfiles in identical_files(files, quick=quick):
        items = [_I(n, False) for n in eqfiles]
        info = (f"Number of files: {len(eqfiles)}; "
                f"File size: {size}; "
                f"Identical: {'Yes' if resolved else 'Maybe'}")
        items.append(_I('_info_', info, itemtype='label', text=''))
        ok = False
        while not ok:
            res = pf.askItems(
                items=items,
                caption="FileDedup", size=None,
                message="Check the files that you want to delete",
            )
            ok = (
                not res  # canceled
                or any(not i for i in res.values())  # kept at least one file
                or pf.ack("Do you really want to delete ALL the duplicate files?")
            )
        if not res:
            break  # a cancel stops everything
        for k in res:
            if res[k]:
                print(f"Deleting '{k}'")
                pf.Path(k).remove()
    print("Done")


def run():
    parent = pf.askDirname()
    if parent:
        quick = False
        deduplicate(parent, quick=quick)


if __name__ == '__draw__':
    run()

elif __name__ == '__main__':
    import sys
    listDuplicates(collectFiles(*sys.argv[1:]))

# End