#!/usr/bin/env python3
"""
Clean duplicates script

This script parses a duplicates list (output by a specialized detection tool)
from the file named by `DUPLICATES_FILE` and moves duplicate files that live
under a specific directory (VAR_DIRECTORY) to the trash (recycle bin) instead
of deleting them permanently.

Behavior:
- The script parses blocks like:

    - 2 equal files of size 5256842
      "I:\\01_AI\\01_IMAGES\\00_Input\\old\\file.png"
      "I:\\01_AI\\01_IMAGES\\55_Img2Img\\other\\file.png"

- If any files in the block live under `VAR_DIRECTORY` they are candidates for
  removal; they are moved to trash. (They are not removed permanently.)
- If all files in a block are inside `VAR_DIRECTORY` the script will skip
  deletion for that block to avoid losing every copy.

The script supports a DRY_RUN mode (no changes are made, only logged).
It uses send2trash if available, otherwise falls back to moving files to a
local `.recycle_bin` directory (resolved against the current working
directory).

This script takes no command-line arguments; change the variables near the top
of the file to configure behavior.
"""
from __future__ import annotations

import logging
import os
import shutil
import sys
import uuid
from typing import Dict, List

try:
    from send2trash import send2trash

    SEND2TRASH_AVAILABLE = True
except Exception:
    # Deliberately broad: any failure while importing the optional dependency
    # simply selects the local-trash fallback.
    SEND2TRASH_AVAILABLE = False

# Configuration variables (script takes no CLI args - edit here):
DUPLICATES_FILE = "results.txt"
VAR_DIRECTORY = r"I:\01_AI\01_IMAGES\00_Input"
# When DRY_RUN is True, the script logs actions but does not move files
DRY_RUN = False
# Default logging level (DEBUG/INFO/WARNING/ERROR)
LOG_LEVEL = logging.INFO
LOCAL_TRASH_DIR = ".recycle_bin"


def setup_logger(level: int = logging.INFO) -> logging.Logger:
    """Return the "cleanDupli" logger, writing to stdout at *level*.

    A stream handler is attached only when the logger has none yet, so calling
    this function repeatedly does not duplicate output.
    """
    logger = logging.getLogger("cleanDupli")
    logger.setLevel(level)
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    if not logger.handlers:
        logger.addHandler(handler)
    return logger


logger = setup_logger(LOG_LEVEL)


def _clean_path(p: str) -> str:
    """Strip surrounding spaces/quotes and normalize separators, keeping case."""
    return os.path.normpath(p.strip().strip('"').strip("'"))


def normalize_path(p: str) -> str:
    """Return *p* in a canonical form suitable for comparisons.

    Surrounding quotes/spaces are removed, then the path goes through
    ``os.path.normpath`` and ``os.path.normcase``.
    """
    return os.path.normcase(_clean_path(p))


def _is_inside(path_norm: str, dir_norm: str) -> bool:
    """Tell whether *path_norm* is *dir_norm* itself or located beneath it.

    Both arguments must already be normalized with ``normalize_path``. The
    comparison is made on a directory boundary so that a sibling such as
    ``Input_backup`` is not mistaken for a child of ``Input``.
    """
    if path_norm == dir_norm:
        return True
    prefix = dir_norm if dir_norm.endswith(os.sep) else dir_norm + os.sep
    return path_norm.startswith(prefix)


def parse_duplicates_file(path: str) -> List[List[str]]:
    """Parse the duplicates report at *path* into groups of file paths.

    A line starting with ``-`` and containing ``equal file`` opens a new
    group; every following quoted line is a member of that group. Any other
    line is ignored.

    Returns:
        One list of paths per non-empty group, in file order.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    if not os.path.exists(path):
        logger.error("Duplicates file '%s' does not exist.", path)
        raise FileNotFoundError(path)

    blocks: List[List[str]] = []
    current_block: List[str] = []
    with open(path, "r", encoding="utf-8", errors="replace") as fh:
        for raw in fh:
            stripped = raw.strip()
            if not stripped:
                continue
            # block header begins with '- ' (like '- 2 equal files')
            if stripped.startswith("-") and "equal file" in stripped:
                # flush the group collected so far; the header itself is skipped
                if current_block:
                    blocks.append(current_block)
                    current_block = []
                continue
            # lines with paths are indented and quoted; everything else that is
            # neither a header nor a quoted path is ignored
            if stripped.startswith(('"', "'")):
                p = stripped.strip('"').strip("'")
                # Windows paths in the file may use either forward/back slashes
                p = p.replace("/", os.sep)
                p = p.replace("\\\\", "\\")
                current_block.append(p)
    if current_block:
        blocks.append(current_block)
    return blocks


def ensure_local_trash() -> str:
    """Create `LOCAL_TRASH_DIR` if needed and return its absolute path."""
    os.makedirs(LOCAL_TRASH_DIR, exist_ok=True)
    return os.path.abspath(LOCAL_TRASH_DIR)


def move_to_trash(fp: str) -> None:
    """Move *fp* to the trash, or only log the action when DRY_RUN is set.

    send2trash is tried first when it is available; if it is missing or fails,
    the file is moved into the local trash directory instead.

    Raises:
        Exception: whatever the local-trash move raised, after logging it.
    """
    if DRY_RUN:
        logger.info("[DRY RUN] Would move to trash: %s", fp)
        return

    if SEND2TRASH_AVAILABLE:
        try:
            send2trash(fp)
            logger.info("Moved to system trash: %s", fp)
            return
        except Exception as ex:
            # fall through to the local trash below
            logger.warning("send2trash failed for %s: %s", fp, ex)

    # fallback: move the file into the local recycle bin
    try:
        trash_dir = ensure_local_trash()
        base = os.path.basename(fp)
        dst = os.path.join(trash_dir, base)
        # avoid accidental overwrite by prepending a unique token
        if os.path.exists(dst):
            dst = os.path.join(trash_dir, f"{uuid.uuid4().hex[:8]}-{base}")
        shutil.move(fp, dst)
        logger.info("Moved to local trash (%s): %s", dst, fp)
    except Exception as ex:
        logger.error("Failed to move to local trash %s: %s", fp, ex)
        raise


def process_blocks(blocks: List[List[str]], var_dir: str) -> Dict[str, int]:
    """Trash the members of each duplicate group that live under *var_dir*.

    Groups with at most one entry, and groups whose entries are all inside
    *var_dir*, are skipped so that at least one copy always survives.

    Args:
        blocks: groups of duplicate file paths, as returned by
            ``parse_duplicates_file``.
        var_dir: directory whose duplicates may be removed.

    Returns:
        Counters keyed by ``total_blocks``, ``candidate_files``,
        ``skipped_blocks``, ``moved_files`` and ``errors``; in DRY_RUN mode a
        ``would_move_files`` key is added once a file would have been moved.
    """
    results: Dict[str, int] = {
        "total_blocks": len(blocks),
        "candidate_files": 0,
        "skipped_blocks": 0,
        "moved_files": 0,
        "errors": 0,
    }
    var_dir_norm = normalize_path(var_dir)
    processed_count = 0
    log_every_n = 200  # periodic progress logging for large lists

    for block in blocks:
        # skip single-file blocks
        if len(block) <= 1:
            logger.debug("Skipping single-entry block: %s", block)
            results["skipped_blocks"] += 1
            continue

        # Case-folded paths are used only for the comparison; the move itself
        # uses the case-preserving path so names in the trash stay intact.
        in_var: List[str] = []
        out_var: List[str] = []
        for entry in block:
            if _is_inside(normalize_path(entry), var_dir_norm):
                in_var.append(_clean_path(entry))
            else:
                out_var.append(_clean_path(entry))
        logger.debug(
            "Block has %d entries (%d in var_dir, %d out): %s",
            len(block), len(in_var), len(out_var), block,
        )

        # if there are no files outside var_dir we avoid deleting everything
        if not out_var and in_var:
            logger.warning(
                "Skipping block because all copies are in VAR_DIRECTORY (not deleting all): %s",
                block,
            )
            results["skipped_blocks"] += 1
            continue

        # otherwise, any files inside VAR_DIRECTORY are safe to remove
        for p in in_var:
            results["candidate_files"] += 1
            try:
                if not os.path.exists(p):
                    logger.warning("File not found: %s (skipping)", p)
                    results["errors"] += 1
                    continue
                move_to_trash(p)
                if DRY_RUN:
                    # the file would be moved in a real run
                    results["would_move_files"] = results.get("would_move_files", 0) + 1
                else:
                    results["moved_files"] += 1
                processed_count += 1
                # periodic progress logging
                if processed_count % log_every_n == 0:
                    if DRY_RUN:
                        logger.info("Dry-run progress: %d files would be moved so far...", processed_count)
                    else:
                        logger.info("Progress: %d files moved so far...", processed_count)
            except Exception as ex:
                # one failing file must not abort the whole run
                logger.error("Error moving to trash: %s -> %s", p, ex)
                results["errors"] += 1
    return results


def main() -> int:
    """Run the clean-up using the module-level configuration.

    Returns:
        0 on success, 2 when the duplicates file is missing, 3 when it cannot
        be parsed.
    """
    logger.setLevel(LOG_LEVEL)
    logger.info("Starting clean duplicates script")
    logger.debug(
        "Configuration: DUPLICATES_FILE=%s VAR_DIRECTORY=%s DRY_RUN=%s LOG_LEVEL=%s",
        DUPLICATES_FILE, VAR_DIRECTORY, DRY_RUN, LOG_LEVEL,
    )

    if not os.path.exists(DUPLICATES_FILE):
        logger.error("File '%s' not found in working dir %s", DUPLICATES_FILE, os.getcwd())
        return 2

    try:
        blocks = parse_duplicates_file(DUPLICATES_FILE)
    except Exception as ex:
        logger.error("Failed to parse duplicates file: %s", ex)
        return 3

    logger.info("Parsed %d duplicate groups", len(blocks))
    results = process_blocks(blocks, VAR_DIRECTORY)
    logger.info(
        "Done. Results: total_blocks=%d candidate_files=%d moved_files=%d would_move_files=%d skipped_blocks=%d errors=%d",
        results["total_blocks"],
        results["candidate_files"],
        results.get("moved_files", 0),
        results.get("would_move_files", 0),
        results["skipped_blocks"],
        results["errors"],
    )

    # Mention the local trash only when files were actually moved there.
    if not SEND2TRASH_AVAILABLE and not DRY_RUN and results.get("moved_files", 0):
        logger.warning("send2trash not available; files were moved to %s", os.path.abspath(LOCAL_TRASH_DIR))
    return 0


if __name__ == "__main__":
    rc = main()
    sys.exit(rc)