Version 2.1

2026-03-16 09:34:32 +01:00 · 2026-03-16 09:34:32 +01:00 · 81639a87b5
parent b90b7bf3e2
commit 81639a87b5
10 changed files with 540 additions and 36 deletions
--- a/.env_example
+++ b/.env_example
@ -0,0 +1,52 @@
+# AWS credentials (replace with your own credentials)
+AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY_ID
+AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_ACCESS_KEY
+AWS_REGION=us-east-1
+AWS_ENDPOINT_URL=https://s3.your-provider.example.com
+
+BUCKET_NAME=your-bucket-name
+
+# Database credentials (replace with your own database info)
+#DB_HOST=your-db-host
+#DB_NAME=your_db_name
+#DB_USER=your_db_user
+#DB_PASSWORD=your_db_password
+#DB_PORT=5432
+
+# Example local development database
+DB_HOST=127.0.0.1
+DB_NAME=your_local_db_name
+# DB_NAME=artchive_production
+DB_USER=your_local_db_user
+DB_PASSWORD=your_local_db_password
+DB_PORT=5432
+
+# LOGS FILE
+LOG_FILE_PATH="./logs/ACH_media_import_errors.log"
+ERROR_LOG_FILE_PATH="./logs/ACH_media_import_critical_errors.log"
+WARING_LOG_FILE_PATH="./logs/ACH_media_import_warnings.log"
+INFO_LOG_FILE_PATH="./logs/ACH_media_import_info.log"
+
+# Email configuration (replace with your SMTP settings)
+SMTP_SERVER=smtp.example.com
+SMTP_PORT=587
+SMTP_USER=your-smtp-user
+SMTP_PASSWORD="your-smtp-password"
+SENDER_EMAIL=sender@example.com
+EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
+ERROR_EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
+SUCCESS_EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
+
+# ACH configuration
+ACH_ENV="development" # "production" or "development"
+ACH_DRY_RUN="true"
+ACH_SAFE_RUN="true"
+ACH_CACHE_S3_LIST="true"
+
+ACH_SYNC_CHUNK_SIZE=10 # percentage (%) of the files in media_files_to_process; determines how many files are processed in each batch
+
+ACH_EDITOR_ID=1
+ACH_APPROVER_ID=1
+ACH_NOTES="Imported automatically from the S3 bucket"
+ACH_STORAGE_LOCATION='{"storage_type": "lto", "storage_location_id": 6}'
+ACH_FILE_TYPE='{"type": "video/mov"}'
--- a/.gitignore
+++ b/.gitignore
@ -6,4 +6,226 @@ __pycache__/
 *.pyo
 logs/
 *.logs
-*.log
+*.log
+
+output/
+
+TODO-mime.md
+.github/copilot-instructions.md
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#   Usually these files are written by a python script from a template
+#   before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+# Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+# uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# poetry.lock
+# poetry.toml
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+# pdm.lock
+# pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+# pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# Redis
+*.rdb
+*.aof
+*.pid
+
+# RabbitMQ
+mnesia/
+rabbitmq/
+rabbitmq-data/
+
+# ActiveMQ
+activemq-data/
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#   and can be added to the global gitignore or merged into this file.  For a more nuclear
+#   option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# .idea/
+
+# Abstra
+#   Abstra is an AI-powered process automation framework.
+#   Ignore directories containing user credentials, local state, and settings.
+#   Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+#   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
+#   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#   and can be added to the global gitignore or merged into this file. However, if you prefer, 
+#   you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+# Streamlit
+.streamlit/secrets.toml
--- a/README.md
+++ b/README.md
@ -95,6 +95,18 @@ docker compose up -d --build
 docker compose logs -f app
 ```

+### Run inside the container (from the host)
+
+If you want to execute the importer manually inside the running container (for debugging or one-off runs), you can use either of the following:
+
+```bash
+# Using docker compose (recommended)
+docker compose exec app python /app/main.py
+
+# Or using docker exec with the container name
+docker exec -it ACH_server_media_importer python /app/main.py
+```
+
 ### Stop

 ```bash
--- a/db_utils.py
+++ b/db_utils.py
@ -30,7 +30,8 @@ from psycopg2 import sql
 import logging
 from datetime import datetime
 import re
-from email_utils import handle_error
+from error_handler import notify_error
+from utils import check_audio_info
 import json
 import os
 import config
@ -73,8 +74,24 @@ def get_mime_from_mediainfo(ach_variables: dict) -> str:
    mediainfo = ach_variables.get('custom_data_in', {}).get('mediainfo', {})
    tracks = mediainfo.get('media', {}).get('track', [])

-    # ---- Master (outside FILE/) must be ProRes ----
+    # ---- Master (outside FILE/) ----
    if not is_file_folder:
+        # Determine if this is an audio-type inventory code (OA4/MCC/DAT) or video master.
+        inventory_code = ach_variables.get('inventory_code', '') or ''
+        inventory_type = inventory_code[3:6] if len(inventory_code) >= 6 else ''
+        audio_inventory_types = {'OA4', 'MCC', 'DAT'}
+
+        if inventory_type in audio_inventory_types:
+            # For audio masters we validate the audio metadata (not ProRes video tracks).
+            result, message = check_audio_info(mediainfo)
+            if not result:
+                raise ValueError(f"Audio validation failed: {message}")
+
+            # Derive MIME from extension; fall back to the configured mapping.
+            extension = os.path.splitext(file_fullpath_norm)[1].lower()
+            return get_mime_for_extension(extension or ach_variables.get('extension'))
+
+        # Otherwise, enforce a ProRes video track for video masters.
        # Find the video track
        video_track = None
        for t in tracks:
@ -250,16 +267,15 @@ def check_inventory_in_db(s3_client, cur, base_name):

        if result:
            logging.info(f"Inventory code {truncated_base_name[:12]} found in the database.")
-            # Call the function to retrieve digital file names, assuming this function is implemented
            return True, truncated_base_name
        else:
            logging.info(f"Inventory code {truncated_base_name} not found in the database.")
-            handle_error(f"Inventory code {truncated_base_name} not found in the database.")
+            notify_error(f"Inventory code {truncated_base_name} not found in the database.")
            #raise ValueError(f"Inventory code {truncated_base_name} not found in the database.")
            return False, None

    except Exception as e:
-        logging.error(f'Error checking inventory code {base_name}:', {e})
+        notify_error(f'Error checking inventory code {base_name}', e)
        raise e

 # Function to check if the object key exists in the database
@ -296,7 +312,7 @@ def check_objkey_in_file_db(cur, base_name):
            return False

    except Exception as e:
-        logging.error(f"Error checking inventory code {base_name}: {e}")
+        notify_error(f"Error checking inventory code {base_name}", e)
        raise e

 # Function to add a file record and its relationship to the support record
@ -333,7 +349,7 @@ def add_file_record_and_relationship(s3_client, cur, base_name,ach_variables):
    notes = f"{ach_config.get('ach_notes','') } {date_part} {time_part}"

    ach_variables['file_copia_conservativa'] = ach_variables['custom_data_in'].get('mediainfo', {}).get("media", {}).get("@ref", "")
-    logging.info(f"ach_variables['file_copia_conservativa']a: {ach_variables['file_copia_conservativa']}")
+    logging.info(f"ach_variables['file_copia_conservativa']: {ach_variables['file_copia_conservativa']}")

    logging.debug("Executing add_file_record_and_relationship")

@ -441,7 +457,7 @@ def add_file_record_and_relationship(s3_client, cur, base_name,ach_variables):
            return False

    except Exception as e:
-        logging.error(f'Error adding file record and relationship: {e}')
+        notify_error(f"Error adding file record and relationship: {base_name}", e)
        raise e

 # Function to add a file record 
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -1,7 +1,7 @@
 services:
  app:
    build: .
-    container_name: ACH_server_media_importer
+    container_name: ACH_server_media_importer02
    volumes:
      - logs:/app/logs  # Add this line to map the logs volume
    env_file:
--- a/error_handler.py
+++ b/error_handler.py
@ -17,6 +17,18 @@ def handle_value_error(e):
 def handle_error(error_message):
    logging.error(f"Error: {error_message}")

+def notify_error(error_message, e=None):
+    """
+    Centralized error reporting: logs the error and triggers email notification.
+    """
+    full_message = f"{error_message}: {e}" if e else error_message
+    logging.error(full_message)
+    try:
+        from email_utils import handle_error as send_notification
+        send_notification(Exception(full_message) if e is None else e)
+    except Exception as notify_err:
+        logging.error(f"Failed to trigger email notification: {notify_err}")
+
 class ClientError(Exception):
    """Custom exception class for client errors."""
    pass
--- a/file_utils.py
+++ b/file_utils.py
@ -72,7 +72,7 @@ def retrieve_file_contents(s3, base_name):
        logging.error(f'Error formatting file contents as JSON: {e}', exc_info=True)
        raise e

-def check_related_files(s3, file_name_with_path, file, bucket_name):
+def check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=None):
    """
    Check for related files in S3 based on the given file type.
    Parameters:
@ -80,6 +80,7 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
    - file_name_with_path: The name of the file with its path.
    - file: The file name.
    - bucket_name: The name of the S3 bucket.
+    - s3_listing_cache: Optional mapping of S3 keys to listing objects.
    Returns:
    None
    Raises:
@ -111,7 +112,14 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
        logging.info(f"Checking for related file: {related_file}")

        try:
-            if not check_file_exists_in_s3(s3, related_file,bucket_name):
+            # Optimized existence check
+            exists = False
+            if s3_listing_cache:
+                exists = related_file in s3_listing_cache
+            else:
+                exists = check_file_exists_in_s3(s3, related_file, bucket_name)
+
+            if not exists:
                error_message = f"Required file {related_file} not found in S3."
                logging.error(error_message)
                raise FileNotFoundError(error_message)
@ -126,10 +134,15 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):

        # Check the size of the related file
        try:
-            if ext in ['json', 'md5', 'pdf']: 
-                file_size = get_file_size(s3, bucket_name, related_file)
-                if file_size == 0:
-                    error_message = f"File {related_file} has zero size."
+            if ext in ['json', 'md5', 'pdf']:
+                # Optimized size check
+                if s3_listing_cache and related_file in s3_listing_cache:
+                    file_size = s3_listing_cache[related_file].get('Size')
+                else:
+                    file_size = get_file_size(s3, bucket_name, related_file)
+
+                if file_size == 0 or file_size is None:
+                    error_message = f"File {related_file} has zero size or missing."
                    logging.error(error_message)
                    raise ValueError(error_message)
                else:
@ -143,8 +156,17 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
        # If the required file is a .pdf, get its size and update ach_pdf_disk_size
        if ext =='pdf': 
            pdf_file = f"{file_name_with_path}.pdf"
-            if  check_file_exists_in_s3(s3, pdf_file,bucket_name):
-                pdf_file_size = get_file_size(s3, bucket_name, pdf_file)
+            pdf_exists = False
+            if s3_listing_cache:
+                pdf_exists = pdf_file in s3_listing_cache
+            else:
+                pdf_exists = check_file_exists_in_s3(s3, pdf_file, bucket_name)
+
+            if pdf_exists:
+                if s3_listing_cache and pdf_file in s3_listing_cache:
+                    pdf_file_size = s3_listing_cache[pdf_file].get('Size')
+                else:
+                    pdf_file_size = get_file_size(s3, bucket_name, pdf_file)
                ach_pdf_disk_size = pdf_file_size
                # logging.info(f"PDF disk size: {ach_pdf_disk_size}")
            else:
--- a/main.py
+++ b/main.py
@ -5,6 +5,7 @@ from datetime import datetime
 import pytz
 import os
 import re
+import math
 from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
 from email_utils import handle_error, send_email_with_attachment
 from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
@ -90,17 +91,24 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
        # List S3 bucket s3_validated_contents
        list_s3_files = list_s3_bucket(s3_client, bucket_name)

+        # Build a quick in-memory map (cache) of the bucket listing.
+        # This will be used exclusively for metadata lookups (size/existence)
+        # to avoid redundant S3 network calls, without changing the main logic.
+        s3_listing_cache = {obj['Key']: obj for obj in list_s3_files}
+
        # Define valid extensions and excluded folders
        # NOTE: This list is used only for the initial S3 filtering step (Phase 1).
        # It determines which object keys are considered for further processing.
-        valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'}
+        valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'} # NOTE: don't like this hard-coded set
        # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/'}
        # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/', 'DVD/', 'UMT/'}
-        excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'UMT/'}
+        excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/' ,'MCC/'}
        # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/',}
        # included_folders = {'FILE/'} # uncomment this to NOT use excluded folders 
        # included_folders = {'TEST-FOLDER-DEV/'} # uncomment this to NOT use excluded folders 

+        # TODO: add a distinct list and count of the file extensions before any filter is applied
+
        # Extract and filter file names
    
        # s3_file_names: include only files that match valid extensions and
@ -112,7 +120,26 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
        except NameError:
            use_included = False

+        logging.info(f"Filtering S3 objects with valid extensions: {valid_extensions}")
        if use_included:
+            logging.info(f"Using include-folder filter: {included_folders}")
+            filter_mode = "include"
+        else:
+            logging.info(f"Using exclusion-folder filter: {excluded_folders}")
+            filter_mode = "exclude"
+
+        # Ask for confirmation before proceeding (y/N). If user declines, exit cleanly.
+        try:
+            answer = input(f"Proceed using '{filter_mode}' filter mode? (y/N): ").strip().lower()
+        except Exception:
+            answer = 'n'
+
+        if answer != 'y':
+            logging.info("User chose not to proceed with the current filter mode. Exiting.")
+            return
+
+        if use_included:
+
            s3_file_names = [
                content['Key'] for content in list_s3_files
                if any(content['Key'].endswith(ext) for ext in valid_extensions)
@ -127,6 +154,19 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
            ]
            logging.info("Using excluded_folders filter")

+        # Count file extensions that survived the initial filtering.
+        # This provides a stable summary of what is being considered
+        # for the rest of the workflow.
+        from collections import Counter
+        extension_counts = Counter(
+            os.path.splitext(f)[1].lower() or "(no_ext)" for f in s3_file_names
+        )
+        # Log a user-friendly multi-line summary instead of a single dict dump
+        extension_summary = "\n".join(
+            f"  {ext or '(no_ext)'}: {count}" for ext, count in sorted(extension_counts.items())
+        )
+        logging.info("Extension counts after initial filtering:\n%s", extension_summary)
+
        # check inventory code syntax
        # first check s3_file_names if the file base name and folder name match  pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$'
        s3_validated_contents = []
@ -207,11 +247,16 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
        filtered_file_names=list_s3_not_in_db(s3_validated_contents, db_file_names, db_sidecar_basenames)           


+        # Produces only media files for Phase 3 parsing.
+        # Sidecars (.json, .md5, .pdf) are validated as dependencies of the media files.
+        media_files_to_process = [f for f in filtered_file_names if f.lower().endswith(('.mp4', '.mp3'))]
+
        # Print the total number of files
        total_files_s3 = len(s3_validated_contents)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
        total_files = len(filtered_file_names)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")
+        logging.info(f"Total media files (.mp4, .mp3) to process in Phase 3: {len(media_files_to_process)}")
        
        # Log the files that need to be updated (those not yet in DB)
        if total_files > 0:
@ -329,11 +374,44 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
        # ---------------------------------------------------------------------
        logging.info("PHASE 3: Parse S3 objects and insert new records into the database")

+        # Implement ACH_SYNC_CHUNK_SIZE (dev/testing only).
+        # Production always processes 100%.
+        # ach_sync_pct = 100
+        # if ach_env != 'production':
+        #     try:
+        #         ach_sync_pct = int(os.getenv('ACH_SYNC_CHUNK_SIZE', '100'))
+        #     except Exception:
+        #         ach_sync_pct = 100
+        # ach_sync_pct = max(0, min(100, ach_sync_pct))
+
+        # total_media_files = len(media_files_to_process)
+        # if total_media_files > 0 and ach_sync_pct < 100:
+        #     sync_limit = max(1, math.ceil(total_media_files * ach_sync_pct / 100))
+        #     logging.info(
+        #         "ACH_SYNC_CHUNK_SIZE enabled: processing %s/%s media files (%s%%)",
+        #         sync_limit,
+        #         total_media_files,
+        #         ach_sync_pct,
+        #     )
+        #     media_files_to_process = media_files_to_process[:sync_limit]
+        # else:
+        #     logging.info(
+        #         "Processing all %s media files (ACH_SYNC_CHUNK_SIZE=%s%%)",
+        #         total_media_files,
+        #         ach_sync_pct,
+        #     )
+
        # Try to parse S3 files
        try:
            # If DRY RUN is set to True, the files will not be uploaded to the database
            if os.getenv('ACH_DRY_RUN', 'true') == 'false':
-                uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(s3_client, filtered_file_names, ach_variables, excluded_folders)
+                uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(
+                    s3_client, 
+                    media_files_to_process, 
+                    ach_variables, 
+                    excluded_folders,
+                    s3_listing_cache=s3_listing_cache
+                )
            else:
                logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
                # set the tuples to zero
--- a/s3_cache.json
+++ b/s3_cache.json
--- a/s3_utils.py
+++ b/s3_utils.py
@ -7,20 +7,21 @@ import psycopg2 # for PostgreSQL

 # Import custom modules
 from file_utils import retrieve_file_contents, check_related_files, extract_and_validate_file_info # for file operations
-from email_utils import handle_error # for error handling depecradted?
+from error_handler import notify_error
 from db_utils import get_db_connection, check_inventory_in_db, check_objkey_in_file_db, add_file_record_and_relationship, retrieve_digital_file_names # for database operations

 import config

 # Function to check the existence of related files and validate in PostgreSQL 
-def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
+def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[], s3_listing_cache=None):
    """
    Parses the S3 files and performs various operations on them.
    Args:
        s3 (S3): The S3 object for accessing S3 services.
        s3_files (list): The list of S3 files to be processed.
+        s3_listing_cache (dict, optional): Mapping of S3 keys to listing objects.
    Returns:
-        None
+        tuple: (uploaded_files_count, warning_files_count, error_files_count)
    Raises:
        FileNotFoundError: If a required file is not found in S3.
        ValueError: If a file has zero size or if the file type is unsupported.
@ -69,9 +70,42 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
            # Display progress to console only (not written to log files)
            print(f"--------------\n---  file {idx} of {total_files}  ---\n--------------", flush=True)

-            # Use a savepoint per file to allow rollback on individual failures
-            # without aborting the full batch.
-            cur.execute("SAVEPOINT file_save")
+            # Ensure we start each file with a clean transaction state.
+            try:
+                conn.rollback()
+            except Exception as e:
+                logging.error(f"Rollback failed before processing {file}: {e}")
+
+            try:
+                cur.close()
+            except Exception:
+                pass
+            cur = conn.cursor()
+
+            try:
+                # Use a savepoint per file to allow rollback on individual failures
+                # without aborting the full batch.
+                cur.execute("SAVEPOINT file_save")
+            except Exception as e:
+                # If the transaction is aborted, log and retry once.
+                import traceback
+                logging.error(f"Transaction aborted before processing {file}, retrying after reset: {e}")
+                logging.error(traceback.format_exc())
+
+                try:
+                    conn.rollback()
+                except Exception as rollback_err:
+                    logging.error(f"Rollback failed while recovering from aborted transaction: {rollback_err}")
+
+                try:
+                    cur.close()
+                except Exception:
+                    pass
+                cur = conn.cursor()
+
+                # Retry savepoint once after recovery
+                cur.execute("SAVEPOINT file_save")
+
            try:
                if file.endswith(('.mp4', '.mp3')):  # Check for both .mp4 and .mp3
                    logging.info("Processing file: %s in the bucket: %s", file, bucket_name)
@ -130,7 +164,13 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):

                try:
                    # Retrieve and log the file size
-                    file_size = get_file_size(s3, bucket_name, file)
+                    # Optimized: Check cache first
+                    if s3_listing_cache and file in s3_listing_cache:
+                        file_size = s3_listing_cache[file].get('Size')
+                        logging.info(f"Retrieved file size from cache for: {file}")
+                    else:
+                        file_size = get_file_size(s3, bucket_name, file)
+                    
                     # maybe get_file_size could throw an error that is caught here
                    if file_size is not None:
                        ach_variables['media_disk_size'] = file_size
@ -142,12 +182,12 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                            logging.error("ACH_SAFE_RUN=true: aborting Phase 3 due to warnings (missing file size): %s", file)
                            raise ValueError("ACH_SAFE_RUN=true: aborting due to warnings in Phase 3")
                        continue  # Skip to the next file in the loop
-                    
+
                    logging.info("Start Validating files for %s...", base_name)
                     # Check that the related files exist and retrieve the .pdf file size
                    try:
                        # Check if the required files exist in S3
-                        ach_variables['pdf_disk_size'] = check_related_files(s3, file_name_with_path, file, bucket_name)
+                        ach_variables['pdf_disk_size'] = check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=s3_listing_cache)
                        logging.info(f"PDF disk size: {ach_variables['pdf_disk_size']}")
                    except FileNotFoundError as e:
                        # Handle case where the file is not found
@ -166,7 +206,10 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                        continue  # Move on to the next file in the loop      
                    except Exception as e:
                        # Handle any other exceptions
-                        logging.error(f"An error occurred: {e}")
+                        logging.error(f"Validation step failed for {file}: {e}")
+                        cur.execute("ROLLBACK TO SAVEPOINT file_save")
+                        error_files_count += 1
+                        continue
                    
                    # Retrieve the file contents for related files: .md5, .json
                    try:
@ -176,6 +219,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                    except Exception as e:
                        # Log the error 
                        logging.error(f"Error retrieving file contents for {file_name_with_path}: {e}")
+                        cur.execute("ROLLBACK TO SAVEPOINT file_save")
                        file_contents = None  # Set file_contents to None or handle it as needed
                        error_files_count +=1
                        continue  # Move on to the next file in the loop
@ -183,6 +227,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                     # if the contents do not exist
                    if file_contents is None:
                        logging.error(f"Error retrieving file contents for {file}.")
+                        cur.execute("ROLLBACK TO SAVEPOINT file_save")
                        error_files_count +=1
                        continue  # Move on to the next file in the loop

@ -222,14 +267,17 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                            add_file_record_and_relationship(s3, cur, base_name, ach_variables)
                        else:
                            logging.warning(f"File record already exists for {base_name}.")
+                            cur.execute("ROLLBACK TO SAVEPOINT file_save")
                            warning_files_count +=1
                            continue        
                    else:
                        logging.error(f"Inventory code {base_name} not found in the database.")
+                        cur.execute("ROLLBACK TO SAVEPOINT file_save")
                        error_files_count +=1
                        continue    
-                except ValueError as e:
-                    logging.error(f"An error occurred: {e}")
+                except Exception as e:
+                    logging.error(f"DB operation failed for {base_name}: {e}")
+                    cur.execute("ROLLBACK TO SAVEPOINT file_save")
                    error_files_count +=1
                    continue 
                
@ -241,7 +289,9 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
                uploaded_files_count +=1          
            except Exception as e:
                # Roll back the changes done for this file only and continue processing others
+                import traceback
                logging.error(f"Error processing {file}: {e}. Rolling back this file's changes.")
+                logging.error(traceback.format_exc())
                try:
                    cur.execute("ROLLBACK TO SAVEPOINT file_save")
                except Exception as rollback_err:
@ -264,8 +314,8 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
        raise e  # Raise the exception to the calling function
    except Exception as e:
        # Handle any other unexpected errors
-        logging.error(f"Unexpected error: {e}")
-        #handle_error(e)  # Pass unexpected errors to handle_error
+        import traceback
+        notify_error("FATAL ERROR in Phase 3 process", e)
        raise e  # Raise the exception to the calling function
    
    # return the file saved
@ -294,15 +344,54 @@ def create_s3_client(aws_config):

 # Function to list the contents of an S3 bucket
 def list_s3_bucket(s3_client, bucket_name):
+    """
+    Lists S3 bucket contents with optional local JSON caching.
+    Uses ACH_CACHE_S3_LIST from .env to decide if it should cache/read from 's3_cache.json'.
+    """
+    cache_enabled = os.getenv('ACH_CACHE_S3_LIST', 'false').lower() == 'true'
+    cache_file = 's3_cache.json'
+
+    if cache_enabled and os.path.exists(cache_file):
+        try:
+            with open(cache_file, 'r', encoding='utf-8') as f:
+                cached_data = json.load(f)
+                logging.info(f"Loaded {len(cached_data)} items from local cache: {cache_file}")
+                return cached_data
+        except Exception as e:
+            logging.warning(f"Failed to read S3 cache file: {e}. Falling back to S3 listing.")
+
    try:
+        logging.info(f"Listing all objects in bucket: {bucket_name}...")
        paginator = s3_client.get_paginator('list_objects_v2')
        bucket_contents = []

+        # Convert datetime objects to string for JSON serialization
+        def _serialize_datetime(obj):
+            if isinstance(obj, datetime):
+                return obj.isoformat()
+            return obj
+
+        from datetime import datetime
+
        for page in paginator.paginate(Bucket=bucket_name):
            if 'Contents' in page:
-                bucket_contents.extend(page['Contents'])
+                for obj in page['Contents']:
+                    # Normalize dates for JSON compatibility
+                    if 'LastModified' in obj:
+                        obj['LastModified'] = obj['LastModified'].isoformat()
+                    bucket_contents.extend([obj])

        logging.info(f"Retrieved {len(bucket_contents)} items from the bucket.")
+
+        # Save to cache if enabled
+        if cache_enabled:
+            try:
+                with open(cache_file, 'w', encoding='utf-8') as f:
+                    json.dump(bucket_contents, f)
+                logging.info(f"S3 bucket listing saved to local cache: {cache_file}")
+            except Exception as e:
+                logging.warning(f"Failed to write S3 cache file: {e}")
+
        return bucket_contents
    except ClientError as e:
        logging.error(f'Error listing bucket contents: {e}')