237 lines
8.5 KiB
Python
237 lines
8.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Clean duplicates script
|
|
|
|
This script parses a duplicates list (output by a specialized detection tool) in
|
|
`results.txt` (the DUPLICATES_FILE setting below) and moves duplicate files that live under a specific
|
|
directory (VAR_DIRECTORY) to the trash (recycle bin) instead of deleting them
|
|
permanently.
|
|
|
|
Behavior:
|
|
- The script parses blocks like:
|
|
- 2 equal files of size 5256842
|
|
"I:\\01_AI\\01_IMAGES\\00_Input\\old\\file.png"
|
|
"I:\\01_AI\\01_IMAGES\\55_Img2Img\\other\\file.png"
|
|
- If any files in the block live under `VAR_DIRECTORY` they are candidates for
|
|
removal; they are moved to trash. (They are not removed permanently.)
|
|
- If all files in a block are inside `VAR_DIRECTORY` the script will skip
|
|
deletion for that block to avoid losing every copy.
|
|
|
|
The script supports a DRY_RUN mode (no changes are made, only logged). It uses
|
|
send2trash if available, otherwise falls back to moving files to a local
|
|
`.recycle_bin` directory in the project.
|
|
|
|
This script takes no command-line arguments; change the variables near the
|
|
top of the file to configure behavior.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import sys
|
|
import shutil
|
|
from typing import List
|
|
|
|
# Prefer send2trash for true recycle-bin support; record whether it imported
# so the rest of the script can fall back to a local trash directory.
try:
    from send2trash import send2trash
    SEND2TRASH_AVAILABLE = True
except Exception:
    # ImportError is the expected failure, but any import-time error is
    # treated as "not available" rather than crashing the script.
    SEND2TRASH_AVAILABLE = False
|
|
|
|
|
|
# Configuration variables (script takes no CLI args - edit here):
# Path (relative to the working directory) of the duplicate-finder report.
DUPLICATES_FILE = "results.txt"
# Duplicates located under this directory are the copies moved to trash.
VAR_DIRECTORY = r"I:\01_AI\01_IMAGES\00_Input"
# When DRY_RUN is True, the script logs actions but does not move files
DRY_RUN = False
# Default logging level (DEBUG/INFO/WARNING/ERROR)
LOG_LEVEL = logging.INFO

# Fallback trash directory used when send2trash is unavailable or fails.
LOCAL_TRASH_DIR = ".recycle_bin"
|
|
|
|
|
|
def setup_logger(level: int = logging.INFO) -> logging.Logger:
    """Create (or reuse) the script's stdout logger.

    Args:
        level: Logging level to apply (e.g. ``logging.INFO``).

    Returns:
        The configured ``"cleanDupli"`` logger. Repeated calls return the
        same logger without attaching duplicate handlers.
    """
    logger = logging.getLogger("cleanDupli")
    logger.setLevel(level)
    # Only build and attach the handler the first time: the original created
    # a new handler + formatter on every call and then discarded them when
    # the logger was already configured.
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        )
        logger.addHandler(handler)
    return logger
|
|
|
|
|
|
# Module-level logger shared by all functions below.
logger = setup_logger(LOG_LEVEL)
|
|
|
|
|
|
def normalize_path(p: str) -> str:
    """Canonicalize a path string for reliable comparison.

    Strips surrounding whitespace and quote characters, collapses redundant
    separators with ``os.path.normpath``, and folds case with
    ``os.path.normcase`` (a no-op on case-sensitive filesystems).
    """
    cleaned = p.strip().strip('"').strip("'")
    return os.path.normcase(os.path.normpath(cleaned))
|
|
|
|
|
|
def parse_duplicates_file(path: str) -> List[List[str]]:
    """Parse the duplicate-finder report into groups of file paths.

    The report consists of header lines like ``- 2 equal files of size N``
    followed by one quoted path per line. Blank lines and anything that is
    neither a header nor a quoted path are ignored.

    Args:
        path: Location of the report file.

    Returns:
        One list of path strings per duplicate group.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if not os.path.exists(path):
        logger.error("Duplicates file '%s' does not exist.", path)
        raise FileNotFoundError(path)

    groups: List[List[str]] = []
    group: List[str] = []
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        for raw_line in handle:
            text = raw_line.rstrip("\n").strip()
            if not text:
                continue
            # A header ("- N equal files ...") starts a new group; flush the
            # previous one if it collected any paths.
            if text.startswith("-") and "equal file" in text:
                if group:
                    groups.append(group)
                    group = []
                continue
            # Path lines are quoted; everything else is ignored.
            if text[0] in ('"', "'"):
                entry = text.strip('"').strip("'")
                # The report may use either slash direction; also collapse
                # doubled backslashes to single ones.
                entry = entry.replace("/", os.sep).replace("\\\\", "\\")
                group.append(entry)
    # Flush the trailing group (files after the last header).
    if group:
        groups.append(group)
    return groups
|
|
|
|
|
|
def ensure_local_trash() -> str:
    """Create the fallback trash directory if needed and return its absolute path."""
    trash_path = os.path.abspath(LOCAL_TRASH_DIR)
    os.makedirs(trash_path, exist_ok=True)
    return trash_path
|
|
|
|
|
|
def move_to_trash(fp: str) -> None:
    """Move *fp* to the trash, or only log the action when DRY_RUN is set.

    Prefers the system recycle bin via send2trash; when that is unavailable
    or fails, falls back to moving the file into the local `.recycle_bin`
    directory created by ensure_local_trash().

    Raises:
        Exception: re-raises whatever the local-trash fallback raised when
        it too fails (e.g. from shutil.move).
    """
    if DRY_RUN:
        logger.info("[DRY RUN] Would move to trash: %s", fp)
        return

    if SEND2TRASH_AVAILABLE:
        try:
            send2trash(fp)
            logger.info("Moved to system trash: %s", fp)
            return
        except Exception as ex:
            logger.warning("send2trash failed for %s: %s", fp, ex)
            # fallback to local trash

    # fallback: move the file into the local recycle bin directory
    try:
        trash_dir = ensure_local_trash()
        base = os.path.basename(fp)
        dst = os.path.join(trash_dir, base)
        # avoid accidental overwrite of an earlier trashed file with the same name
        if os.path.exists(dst):
            # append a unique suffix
            import uuid
            dst = os.path.join(trash_dir, f"{uuid.uuid4().hex[:8]}-{base}")
        shutil.move(fp, dst)
        logger.info("Moved to local trash (%s): %s", dst, fp)
    except Exception as ex:
        logger.error("Failed to move to local trash %s: %s", fp, ex)
        raise
|
|
|
|
|
|
def process_blocks(blocks: List[List[str]], var_dir: str) -> dict:
    """Walk duplicate groups and trash the copies that live under *var_dir*.

    Args:
        blocks: Groups of duplicate file paths, as returned by
            parse_duplicates_file().
        var_dir: Directory whose copies are considered expendable.

    Returns:
        A summary dict with keys ``total_blocks``, ``candidate_files``,
        ``skipped_blocks``, ``moved_files``, ``errors`` and (only during dry
        runs) ``would_move_files``.

    A group is skipped when it has a single entry, or when every copy lives
    under *var_dir* (removing them all would lose the data entirely).
    """
    results = {
        "total_blocks": len(blocks),
        "candidate_files": 0,
        "skipped_blocks": 0,
        "moved_files": 0,
        "errors": 0,
    }
    var_dir_norm = normalize_path(var_dir)
    # Match var_dir itself or paths strictly below it. A bare startswith()
    # check would wrongly match sibling directories that merely share the
    # prefix, e.g. "I:\00_Input_backup" vs "I:\00_Input".
    var_dir_prefix = var_dir_norm.rstrip(os.sep) + os.sep

    def _inside_var_dir(p: str) -> bool:
        # True when p is var_dir itself or lies under it.
        return p == var_dir_norm or p.startswith(var_dir_prefix)

    processed_count = 0
    log_every_n = 200  # periodic progress logging for large lists
    for block in blocks:
        # skip single-file blocks: nothing is actually duplicated
        if len(block) <= 1:
            logger.debug("Skipping single-entry block: %s", block)
            results["skipped_blocks"] += 1
            continue

        normalized_paths = [normalize_path(p) for p in block]
        in_var = [p for p in normalized_paths if _inside_var_dir(p)]
        out_var = [p for p in normalized_paths if not _inside_var_dir(p)]

        logger.debug(
            "Block has %d entries (%d in var_dir, %d out): %s",
            len(block), len(in_var), len(out_var), block,
        )

        # if there are no files outside var_dir we avoid deleting everything
        if not out_var and in_var:
            logger.warning("Skipping block because all copies are in VAR_DIRECTORY (not deleting all): %s", block)
            results["skipped_blocks"] += 1
            continue

        # otherwise, any files inside VAR_DIRECTORY are safe to remove
        for p in in_var:
            results["candidate_files"] += 1
            try:
                if not os.path.exists(p):
                    logger.warning("File not found: %s (skipping)", p)
                    results["errors"] += 1
                    continue
                move_to_trash(p)
                # update counters
                if DRY_RUN:
                    # the file would be moved in a real run
                    results.setdefault("would_move_files", 0)
                    results["would_move_files"] += 1
                else:
                    results["moved_files"] += 1
                processed_count += 1
                # periodic progress logging
                if processed_count % log_every_n == 0:
                    if DRY_RUN:
                        logger.info("Dry-run progress: %d files would be moved so far...", processed_count)
                    else:
                        logger.info("Progress: %d files moved so far...", processed_count)
            except Exception as ex:
                logger.error("Error moving to trash: %s -> %s", p, ex)
                results["errors"] += 1

    return results
|
|
|
|
|
|
def main() -> int:
    """Entry point: parse the duplicates report and process every group.

    Returns:
        Process exit code: 0 on success, 2 when the duplicates file is
        missing, 3 when it cannot be parsed.
    """
    logger.setLevel(LOG_LEVEL)
    logger.info("Starting clean duplicates script")
    logger.debug(
        "Configuration: DUPLICATES_FILE=%s VAR_DIRECTORY=%s DRY_RUN=%s LOG_LEVEL=%s",
        DUPLICATES_FILE, VAR_DIRECTORY, DRY_RUN, LOG_LEVEL,
    )

    # Guard clause: bail out early with a distinct exit code if the report
    # file is absent from the working directory.
    if not os.path.exists(DUPLICATES_FILE):
        logger.error("File '%s' not found in working dir %s", DUPLICATES_FILE, os.getcwd())
        return 2

    try:
        groups = parse_duplicates_file(DUPLICATES_FILE)
    except Exception as ex:
        logger.error("Failed to parse duplicates file: %s", ex)
        return 3

    logger.info("Parsed %d duplicate groups", len(groups))
    summary = process_blocks(groups, VAR_DIRECTORY)

    logger.info(
        "Done. Results: total_blocks=%d candidate_files=%d moved_files=%d would_move_files=%d skipped_blocks=%d errors=%d",
        summary["total_blocks"],
        summary["candidate_files"],
        summary.get("moved_files", 0),
        summary.get("would_move_files", 0),
        summary["skipped_blocks"],
        summary["errors"],
    )
    # if any local trash used and not dry run, show message
    if not SEND2TRASH_AVAILABLE and not DRY_RUN:
        logger.warning("send2trash not available; files were moved to %s", os.path.abspath(LOCAL_TRASH_DIR))

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())