Version 2.1

2026-03-16 09:34:32 +01:00 · 2026-03-16 09:34:32 +01:00 · 81639a87b5
parent b90b7bf3e2
commit 81639a87b5
10 changed files with 540 additions and 36 deletions
--- a/.env_example
+++ b/.env_example
@ -0,0 +1,52 @@
 # AWS credentials (replace with your own credentials)
 AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY_ID
 AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_ACCESS_KEY
 AWS_REGION=us-east-1
 AWS_ENDPOINT_URL=https://s3.your-provider.example.com
 BUCKET_NAME=your-bucket-name
 # Database credentials (replace with your own database info)
 #DB_HOST=your-db-host
 #DB_NAME=your_db_name
 #DB_USER=your_db_user
 #DB_PASSWORD=your_db_password
 #DB_PORT=5432
 # Example local development database
 DB_HOST=127.0.0.1
 DB_NAME=your_local_db_name
 # DB_NAME=artchive_production
 DB_USER=your_local_db_user
 DB_PASSWORD=your_local_db_password
 DB_PORT=5432
 # LOGS FILE
 LOG_FILE_PATH="./logs/ACH_media_import_errors.log"
 ERROR_LOG_FILE_PATH="./logs/ACH_media_import_critical_errors.log"
 WARING_LOG_FILE_PATH="./logs/ACH_media_import_warnings.log"
 INFO_LOG_FILE_PATH="./logs/ACH_media_import_info.log"
 # Email configuration (replace with your SMTP settings)
 SMTP_SERVER=smtp.example.com
 SMTP_PORT=587
 SMTP_USER=your-smtp-user
 SMTP_PASSWORD="your-smtp-password"
 SENDER_EMAIL=sender@example.com
 EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
 ERROR_EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
 SUCCESS_EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
 # ACH configuration
 ACH_ENV="development" # "production" or "development"
 ACH_DRY_RUN="true"
 ACH_SAFE_RUN="true"
 ACH_CACHE_S3_LIST="true"
 ACH_SYNC_CHUNK_SIZE=10 # percentage of the files in media_files_to_process to import in each batch
 ACH_EDITOR_ID=1
 ACH_APPROVER_ID=1
 ACH_NOTES="Imported automatically from the S3 bucket"
 ACH_STORAGE_LOCATION='{"storage_type": "lto", "storage_location_id": 6}'
 ACH_FILE_TYPE='{"type": "video/mov"}'
--- a/.gitignore
+++ b/.gitignore
@ -6,4 +6,226 @@ __pycache__/
 *.pyo
 logs/
 *.logs
-*.log
+*.log
 output/
 TODO-mime.md
 .github/copilot-instructions.md
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[codz]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #   Usually these files are written by a python script from a template
 #   before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py.cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 # Pipfile.lock
 # UV
 #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 # uv.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 # poetry.lock
 # poetry.toml
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
 #   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
 # pdm.lock
 # pdm.toml
 .pdm-python
 .pdm-build/
 # pixi
 #   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
 # pixi.lock
 #   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
 #   in the .venv directory. It is recommended not to include this directory in version control.
 .pixi
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # Redis
 *.rdb
 *.aof
 *.pid
 # RabbitMQ
 mnesia/
 rabbitmq/
 rabbitmq-data/
 # ActiveMQ
 activemq-data/
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .envrc
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #   and can be added to the global gitignore or merged into this file.  For a more nuclear
 #   option (not recommended) you can uncomment the following to ignore the entire idea folder.
 # .idea/
 # Abstra
 #   Abstra is an AI-powered process automation framework.
 #   Ignore directories containing user credentials, local state, and settings.
 #   Learn more at https://abstra.io/docs
 .abstra/
 # Visual Studio Code
 #   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
 #   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
 #   and can be added to the global gitignore or merged into this file. However, if you prefer, 
 #   you could uncomment the following to ignore the entire vscode folder
 # .vscode/
 # Ruff stuff:
 .ruff_cache/
 # PyPI configuration file
 .pypirc
 # Marimo
 marimo/_static/
 marimo/_lsp/
 __marimo__/
 # Streamlit
 .streamlit/secrets.toml
--- a/README.md
+++ b/README.md
@ -95,6 +95,18 @@ docker compose up -d --build
 docker compose logs -f app
 ```
 ### Run inside the container (from the host)
 If you want to execute the importer manually inside the running container (for debugging or one-off runs), you can use either of the following:
 ```bash
 # Using docker compose (recommended)
 docker compose exec app python /app/main.py
 # Or using docker exec with the container name
 docker exec -it ACH_server_media_importer python /app/main.py
 ```
 ### Stop
 ```bash
--- a/db_utils.py
+++ b/db_utils.py
@ -30,7 +30,8 @@ from psycopg2 import sql
 import logging
 from datetime import datetime
 import re
-from email_utils import handle_error
+from error_handler import notify_error
 from utils import check_audio_info
 import json
 import os
 import config
@ -73,8 +74,24 @@ def get_mime_from_mediainfo(ach_variables: dict) -> str:
    mediainfo = ach_variables.get('custom_data_in', {}).get('mediainfo', {})
    tracks = mediainfo.get('media', {}).get('track', [])
-    # ---- Master (outside FILE/) must be ProRes ----
+    # ---- Master (outside FILE/) ----
    if not is_file_folder:
        # Determine if this is an audio-type inventory code (OA4/MCC/DAT) or video master.
        inventory_code = ach_variables.get('inventory_code', '') or ''
        inventory_type = inventory_code[3:6] if len(inventory_code) >= 6 else ''
        audio_inventory_types = {'OA4', 'MCC', 'DAT'}
        if inventory_type in audio_inventory_types:
            # For audio masters we validate the audio metadata (not ProRes video tracks).
            result, message = check_audio_info(mediainfo)
            if not result:
                raise ValueError(f"Audio validation failed: {message}")
            # Derive MIME from extension; fall back to the configured mapping.
            extension = os.path.splitext(file_fullpath_norm)[1].lower()
            return get_mime_for_extension(extension or ach_variables.get('extension'))
        # Otherwise, enforce a ProRes video track for video masters.
        # Find the video track
        video_track = None
        for t in tracks:
@ -250,16 +267,15 @@ def check_inventory_in_db(s3_client, cur, base_name):
        if result:
            logging.info(f"Inventory code {truncated_base_name[:12]} found in the database.")
            # Call the function to retrieve digital file names, assuming this function is implemented
            return True, truncated_base_name
        else:
            logging.info(f"Inventory code {truncated_base_name} not found in the database.")
-            handle_error(f"Inventory code {truncated_base_name} not found in the database.")
+            notify_error(f"Inventory code {truncated_base_name} not found in the database.")
            #raise ValueError(f"Inventory code {truncated_base_name} not found in the database.")
            return False, None
    except Exception as e:
-        logging.error(f'Error checking inventory code {base_name}:', {e})
+        notify_error(f'Error checking inventory code {base_name}', e)
        raise e
 # Function to check if the object key exists in the database
@ -296,7 +312,7 @@ def check_objkey_in_file_db(cur, base_name):
            return False
    except Exception as e:
-        logging.error(f"Error checking inventory code {base_name}: {e}")
+        notify_error(f"Error checking inventory code {base_name}", e)
        raise e
 # Function to add a file record and its relationship to the support record
@ -333,7 +349,7 @@ def add_file_record_and_relationship(s3_client, cur, base_name,ach_variables):
    notes = f"{ach_config.get('ach_notes','') } {date_part} {time_part}"
    ach_variables['file_copia_conservativa'] = ach_variables['custom_data_in'].get('mediainfo', {}).get("media", {}).get("@ref", "")
-    logging.info(f"ach_variables['file_copia_conservativa']a: {ach_variables['file_copia_conservativa']}")
+    logging.info(f"ach_variables['file_copia_conservativa']: {ach_variables['file_copia_conservativa']}")
    logging.debug("Executing add_file_record_and_relationship")
@ -441,7 +457,7 @@ def add_file_record_and_relationship(s3_client, cur, base_name,ach_variables):
            return False
    except Exception as e:
-        logging.error(f'Error adding file record and relationship: {e}')
+        notify_error(f"Error adding file record and relationship: {base_name}", e)
        raise e
 # Function to add a file record
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -1,7 +1,7 @@
 services:
  app:
    build: .
-    container_name: ACH_server_media_importer
+    container_name: ACH_server_media_importer02
    volumes:
      - logs:/app/logs  # Add this line to map the logs volume
    env_file:
--- a/error_handler.py
+++ b/error_handler.py
@ -17,6 +17,18 @@ def handle_value_error(e):
 def handle_error(error_message):
    """Log ``error_message`` at ERROR level, prefixed with ``Error: ``."""
    prefixed = "Error: {}".format(error_message)
    logging.error(prefixed)
 def notify_error(error_message, e=None):
    """
    Centralized error reporting: log the error and trigger an email notification.

    Args:
        error_message: Human-readable description of what failed.
        e: Optional exception (or other detail object) describing the cause.
            When supplied, its text is appended to the logged message and the
            object itself is handed to the notifier.

    Returns:
        None. A failure while notifying is logged and never propagated.
    """
    # Decide once whether a cause was supplied. Comparing against None (rather
    # than relying on truthiness) keeps the logged message and the notification
    # payload in agreement even when the cause object evaluates as false.
    has_cause = e is not None
    full_message = f"{error_message}: {e}" if has_cause else error_message
    logging.error(full_message)
    try:
        # Imported inside the guarded block so that an import failure is
        # reported like any other notification failure.
        from email_utils import handle_error as send_notification
        send_notification(e if has_cause else Exception(full_message))
    except Exception as notify_err:
        logging.error(f"Failed to trigger email notification: {notify_err}")
 class ClientError(Exception):
    """Custom exception type representing client errors."""
--- a/file_utils.py
+++ b/file_utils.py
@ -72,7 +72,7 @@ def retrieve_file_contents(s3, base_name):
        logging.error(f'Error formatting file contents as JSON: {e}', exc_info=True)
        raise e
-def check_related_files(s3, file_name_with_path, file, bucket_name):
+def check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=None):
    """
    Check for related files in S3 based on the given file type.
    Parameters:
@ -80,6 +80,7 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
    - file_name_with_path: The name of the file with its path.
    - file: The file name.
    - bucket_name: The name of the S3 bucket.
    - s3_listing_cache: Optional mapping of S3 keys to listing objects.
    Returns:
    None
    Raises:
@ -111,7 +112,14 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
        logging.info(f"Checking for related file: {related_file}")
        try:
-            if not check_file_exists_in_s3(s3, related_file,bucket_name):
+            # Optimized existence check
            exists = False
            if s3_listing_cache:
                exists = related_file in s3_listing_cache
            else:
                exists = check_file_exists_in_s3(s3, related_file, bucket_name)
            if not exists:
                error_message = f"Required file {related_file} not found in S3."
                logging.error(error_message)
                raise FileNotFoundError(error_message)
@ -126,10 +134,15 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
        # Check the size of the related file
        try:
-            if ext in ['json', 'md5', 'pdf']: 
+            if ext in ['json', 'md5', 'pdf']:
-                file_size = get_file_size(s3, bucket_name, related_file)
+                # Optimized size check
-                if file_size == 0:
+                if s3_listing_cache and related_file in s3_listing_cache:
-                    error_message = f"File {related_file} has zero size."
+                    file_size = s3_listing_cache[related_file].get('Size')
                else:
                    file_size = get_file_size(s3, bucket_name, related_file)
                if file_size == 0 or file_size is None:
                    error_message = f"File {related_file} has zero size or missing."
                    logging.error(error_message)
                    raise ValueError(error_message)
                else:
@ -143,8 +156,17 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
        # If the required file is a .pdf, get its size and update ach_pdf_disk_size
        if ext =='pdf': 
            pdf_file = f"{file_name_with_path}.pdf"
-            if  check_file_exists_in_s3(s3, pdf_file,bucket_name):
+            pdf_exists = False
-                pdf_file_size = get_file_size(s3, bucket_name, pdf_file)
+            if s3_listing_cache:
                pdf_exists = pdf_file in s3_listing_cache
            else:
                pdf_exists = check_file_exists_in_s3(s3, pdf_file, bucket_name)
            if pdf_exists:
                if s3_listing_cache and pdf_file in s3_listing_cache:
                    pdf_file_size = s3_listing_cache[pdf_file].get('Size')
                else:
                    pdf_file_size = get_file_size(s3, bucket_name, pdf_file)
                ach_pdf_disk_size = pdf_file_size
                # logging.info(f"PDF disk size: {ach_pdf_disk_size}")
            else:
--- a/main.py
+++ b/main.py
@ -5,6 +5,7 @@ from datetime import datetime
 import pytz
 import os
 import re
 import math
 from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
 from email_utils import handle_error, send_email_with_attachment
 from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
@ -90,17 +91,24 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
        # List S3 bucket s3_validated_contents
        list_s3_files = list_s3_bucket(s3_client, bucket_name)
        # Build a quick in-memory map (cache) of the bucket listing.
        # This will be used exclusively for metadata lookups (size/existence)
        # to avoid redundant S3 network calls, without changing the main logic.
        s3_listing_cache = {obj['Key']: obj for obj in list_s3_files}
        # Define valid extensions and excluded folders
        # NOTE: This list is used only for the initial S3 filtering step (Phase 1).
        # It determines which object keys are considered for further processing.
-        valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'}
+        valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'} # dont like this 
        # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/'}
        # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/', 'DVD/', 'UMT/'}
-        excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'UMT/'}
+        excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/' ,'MCC/'}
        # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/',}
        # included_folders = {'FILE/'} # uncomment this to NOT use excluded folders 
        # included_folders = {'TEST-FOLDER-DEV/'} # uncomment this to NOT use excluded folders 
        # TODO: add a distinct list and count of the extensions before any filtering
        # Extract and filter file names
        # s3_file_names: include only files that match valid extensions and
@ -112,7 +120,26 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
        except NameError:
            use_included = False
        logging.info(f"Filtering S3 objects with valid extensions: {valid_extensions}")
        if use_included:
            logging.info(f"Using include-folder filter: {included_folders}")
            filter_mode = "include"
        else:
            logging.info(f"Using exclusion-folder filter: {excluded_folders}")
            filter_mode = "exclude"
        # Ask for confirmation before proceeding (y/N). If user declines, exit cleanly.
        try:
            answer = input(f"Proceed using '{filter_mode}' filter mode? (y/N): ").strip().lower()
        except Exception:
            answer = 'n'
        if answer != 'y':
            logging.info("User chose not to proceed with the current filter mode. Exiting.")
            return
        if use_included:
            s3_file_names = [
                content['Key'] for content in list_s3_files
                if any(content['Key'].endswith(ext) for ext in valid_extensions)
@ -127,6 +154,19 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
            ]
            logging.info("Using excluded_folders filter")
        # Count file extensions that survived the initial filtering.
        # This provides a summary of what is being considered
        # for the rest of the workflow.
        from collections import Counter
        extension_counts = Counter(
            os.path.splitext(f)[1].lower() or "(no_ext)" for f in s3_file_names
        )
        # Log a user-friendly multi-line summary instead of a single dict dump
        extension_summary = "\n".join(
            f"  {ext or '(no_ext)'}: {count}" for ext, count in sorted(extension_counts.items())
        )
        logging.info("Extension counts after initial filtering:\n%s", extension_summary)
        # check inventory code syntax
        # first check s3_file_names if the file base name and folder name match  pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$'
        s3_validated_contents = []
@ -207,11 +247,16 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
        filtered_file_names=list_s3_not_in_db(s3_validated_contents, db_file_names, db_sidecar_basenames)           
        # Produces only media files for Phase 3 parsing.
        # Sidecars (.json, .md5, .pdf) are validated as dependencies of the media files.
        media_files_to_process = [f for f in filtered_file_names if f.lower().endswith(('.mp4', '.mp3'))]
        # Print the total number of files
        total_files_s3 = len(s3_validated_contents)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
        total_files = len(filtered_file_names)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")
        logging.info(f"Total media files (.mp4, .mp3) to process in Phase 3: {len(media_files_to_process)}")
        # Log the files that need to be updated (those not yet in DB)
        if total_files > 0:
@ -329,11 +374,44 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
        # ---------------------------------------------------------------------
        logging.info("PHASE 3: Parse S3 objects and insert new records into the database")
        # Implement ACH_SYNC_CHUNK_SIZE (dev/testing only).
        # Production always processes 100%.
        # ach_sync_pct = 100
        # if ach_env != 'production':
        #     try:
        #         ach_sync_pct = int(os.getenv('ACH_SYNC_CHUNK_SIZE', '100'))
        #     except Exception:
        #         ach_sync_pct = 100
        # ach_sync_pct = max(0, min(100, ach_sync_pct))
        # total_media_files = len(media_files_to_process)
        # if total_media_files > 0 and ach_sync_pct < 100:
        #     sync_limit = max(1, math.ceil(total_media_files * ach_sync_pct / 100))
        #     logging.info(
        #         "ACH_SYNC_CHUNK_SIZE enabled: processing %s/%s media files (%s%%)",
        #         sync_limit,
        #         total_media_files,
        #         ach_sync_pct,
        #     )
        #     media_files_to_process = media_files_to_process[:sync_limit]
        # else:
        #     logging.info(
        #         "Processing all %s media files (ACH_SYNC_CHUNK_SIZE=%s%%)",
        #         total_media_files,
        #         ach_sync_pct,
        #     )
        # Try to parse S3 files
        try:
            # If DRY RUN is set to True, the files will not be uploaded to the database
            if os.getenv('ACH_DRY_RUN', 'true') == 'false':
-                uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(s3_client, filtered_file_names, ach_variables, excluded_folders)
+                uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(
                    s3_client, 
                    media_files_to_process, 
                    ach_variables, 
                    excluded_folders,
                    s3_listing_cache=s3_listing_cache
                )
            else:
                logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
                # set the tuples to zero
--- a/s3_cache.json
+++ b/s3_cache.json
--- a/s3_utils.py
+++ b/s3_utils.py
@ -7,20 +7,21 @@ import psycopg2 # for PostgreSQL
 # Import custom modules
 from file_utils import retrieve_file_contents, check_related_files, extract_and_validate_file_info # for file operations
-from email_utils import handle_error # for error handling depecradted?
+from error_handler import notify_error
 from db_utils import get_db_connection, check_inventory_in_db, check_objkey_in_file_db, add_file_record_and_relationship, retrieve_digital_file_names # for database operations
 import config
 # Function to check the existence of related files and validate in PostgreSQL 
-def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
+def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[], s3_listing_cache=None):
    """
    Parses the S3 files and performs various operations on them.
    Args:
        s3 (S3): The S3 object for accessing S3 services.
        s3_files (list): The list of S3 files to be processed.
        s3_listing_cache (dict, optional): Mapping of S3 keys to listing objects.
    Returns:
-        None
+        tuple: (uploaded_files_count, warning_files_count, error_files_count)
    Raises:
        FileNotFoundError: If a required file is not found in S3.
        ValueError: If a file has zero size or if the file type is unsupported.
@ -69,9 +70,42 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
            # Display progress to console only (not written to log files)
            print(f"--------------\n---  file {idx} of {total_files}  ---\n--------------", flush=True)
-            # Use a savepoint per file to allow rollback on individual failures
+            # Ensure we start each file with a clean transaction state.
-            # without aborting the full batch.
+            try:
-            cur.execute("SAVEPOINT file_save")
+                conn.rollback()
            except Exception as e:
                logging.error(f"Rollback failed before processing {file}: {e}")
            try:
                cur.close()
            except Exception:
                pass
            cur = conn.cursor()
            try:
                # Use a savepoint per file to allow rollback on individual failures
                # without aborting the full batch.
                cur.execute("SAVEPOINT file_save")
            except Exception as e:
                # If the transaction is aborted, log and retry once.
                import traceback
                logging.error(f"Transaction aborted before processing {file}, retrying after reset: {e}")
                logging.error(traceback.format_exc())
                try:
                    conn.rollback()
                except Exception as rollback_err:
                    logging.error(f"Rollback failed while recovering from aborted transaction: {rollback_err}")
                try:
                    cur.close()
                except Exception:
                    pass
                cur = conn.cursor()
                # Retry savepoint once after recovery
                cur.execute("SAVEPOINT file_save")
            try:
                if file.endswith(('.mp4', '.mp3')):  # Check for both .mp4 and .mp3
                    logging.info("Processing file: %s in the bucket: %s", file, bucket_name)
@ -130,7 +164,13 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                try:
                    # Retrieve and log the file size
-                    file_size = get_file_size(s3, bucket_name, file)
+                    # Optimized: Check cache first
                    if s3_listing_cache and file in s3_listing_cache:
                        file_size = s3_listing_cache[file].get('Size')
                        logging.info(f"Retrieved file size from cache for: {file}")
                    else:
                        file_size = get_file_size(s3, bucket_name, file)
                    # TODO: get_file_size could raise an error on failure, to be caught here
                    if file_size is not None:
                        ach_variables['media_disk_size'] = file_size
@ -142,12 +182,12 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                            logging.error("ACH_SAFE_RUN=true: aborting Phase 3 due to warnings (missing file size): %s", file)
                            raise ValueError("ACH_SAFE_RUN=true: aborting due to warnings in Phase 3")
                        continue  # Skip to the next file in the loop
-                    
+
                    logging.info("Start Validating files for %s...", base_name)
                    # Check that the related files exist and retrieve the .pdf file size
                    try:
                        # Check if the required files exist in S3
-                        ach_variables['pdf_disk_size'] = check_related_files(s3, file_name_with_path, file, bucket_name)
+                        ach_variables['pdf_disk_size'] = check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=s3_listing_cache)
                        logging.info(f"PDF disk size: {ach_variables['pdf_disk_size']}")
                    except FileNotFoundError as e:
                        # Handle case where the file is not found
@ -166,7 +206,10 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                        continue  # Move on to the next file in the loop      
                    except Exception as e:
                        # Handle any other exceptions
-                        logging.error(f"An error occurred: {e}")
+                        logging.error(f"Validation step failed for {file}: {e}")
                        cur.execute("ROLLBACK TO SAVEPOINT file_save")
                        error_files_count += 1
                        continue
                    # Retrieve the file contents for related files: .md5, .json
                    try:
@ -176,6 +219,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                    except Exception as e:
                        # Log the error 
                        logging.error(f"Error retrieving file contents for {file_name_with_path}: {e}")
                        cur.execute("ROLLBACK TO SAVEPOINT file_save")
                        file_contents = None  # Set file_contents to None or handle it as needed
                        error_files_count +=1
                        continue  # Move on to the next file in the loop
@ -183,6 +227,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                    # If the contents do not exist
                    if file_contents is None:
                        logging.error(f"Error retrieving file contents for {file}.")
                        cur.execute("ROLLBACK TO SAVEPOINT file_save")
                        error_files_count +=1
                        continue  # Move on to the next file in the loop
@ -222,14 +267,17 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                            add_file_record_and_relationship(s3, cur, base_name, ach_variables)
                        else:
                            logging.warning(f"File record already exists for {base_name}.")
                            cur.execute("ROLLBACK TO SAVEPOINT file_save")
                            warning_files_count +=1
                            continue        
                    else:
                        logging.error(f"Inventory code {base_name} not found in the database.")
                        cur.execute("ROLLBACK TO SAVEPOINT file_save")
                        error_files_count +=1
                        continue    
-                except ValueError as e:
+                except Exception as e:
-                    logging.error(f"An error occurred: {e}")
+                    logging.error(f"DB operation failed for {base_name}: {e}")
                    cur.execute("ROLLBACK TO SAVEPOINT file_save")
                    error_files_count +=1
                    continue 
@ -241,7 +289,9 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                uploaded_files_count +=1          
            except Exception as e:
                # Roll back the changes done for this file only and continue processing others
                import traceback
                logging.error(f"Error processing {file}: {e}. Rolling back this file's changes.")
                logging.error(traceback.format_exc())
                try:
                    cur.execute("ROLLBACK TO SAVEPOINT file_save")
                except Exception as rollback_err:
@ -264,8 +314,8 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
        raise e  # Raise the exception to the calling function
    except Exception as e:
        # Handle any other unexpected errors
-        logging.error(f"Unexpected error: {e}")
+        import traceback
-        #handle_error(e)  # Pass unexpected errors to handle_error
+        notify_error("FATAL ERROR in Phase 3 process", e)
        raise e  # Raise the exception to the calling function
    # return the file saved
@ -294,15 +344,54 @@ def create_s3_client(aws_config):
 # Function to list the contents of an S3 bucket
 def list_s3_bucket(s3_client, bucket_name):
    """
    Lists S3 bucket contents with optional local JSON caching.
    Uses ACH_CACHE_S3_LIST from .env to decide if it should cache/read from 's3_cache.json'.
    """
    cache_enabled = os.getenv('ACH_CACHE_S3_LIST', 'false').lower() == 'true'
    cache_file = 's3_cache.json'
    if cache_enabled and os.path.exists(cache_file):
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached_data = json.load(f)
                logging.info(f"Loaded {len(cached_data)} items from local cache: {cache_file}")
                return cached_data
        except Exception as e:
            logging.warning(f"Failed to read S3 cache file: {e}. Falling back to S3 listing.")
    try:
        logging.info(f"Listing all objects in bucket: {bucket_name}...")
        paginator = s3_client.get_paginator('list_objects_v2')
        bucket_contents = []
        # Convert datetime objects to string for JSON serialization
        def _serialize_datetime(obj):
            if isinstance(obj, datetime):
                return obj.isoformat()
            return obj
        from datetime import datetime
        for page in paginator.paginate(Bucket=bucket_name):
            if 'Contents' in page:
-                bucket_contents.extend(page['Contents'])
+                for obj in page['Contents']:
                    # Normalize dates for JSON compatibility
                    if 'LastModified' in obj:
                        obj['LastModified'] = obj['LastModified'].isoformat()
                    bucket_contents.extend([obj])
        logging.info(f"Retrieved {len(bucket_contents)} items from the bucket.")
        # Save to cache if enabled
        if cache_enabled:
            try:
                with open(cache_file, 'w', encoding='utf-8') as f:
                    json.dump(bucket_contents, f)
                logging.info(f"S3 bucket listing saved to local cache: {cache_file}")
            except Exception as e:
                logging.warning(f"Failed to write S3 cache file: {e}")
        return bucket_contents
    except ClientError as e:
        logging.error(f'Error listing bucket contents: {e}')