Version 2.1

This commit is contained in:
MSVstudios 2026-03-16 09:34:32 +01:00
parent b90b7bf3e2
commit 81639a87b5
10 changed files with 540 additions and 36 deletions

52
.env_example Normal file
View File

@ -0,0 +1,52 @@
# AWS credentials (replace with your own credentials)
AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_ACCESS_KEY
AWS_REGION=us-east-1
AWS_ENDPOINT_URL=https://s3.your-provider.example.com
BUCKET_NAME=your-bucket-name
# Database credentials (replace with your own database info)
#DB_HOST=your-db-host
#DB_NAME=your_db_name
#DB_USER=your_db_user
#DB_PASSWORD=your_db_password
#DB_PORT=5432
# Example local development database
DB_HOST=127.0.0.1
DB_NAME=your_local_db_name
# DB_NAME=artchive_production
DB_USER=your_local_db_user
DB_PASSWORD=your_local_db_password
DB_PORT=5432
# LOGS FILE
LOG_FILE_PATH="./logs/ACH_media_import_errors.log"
ERROR_LOG_FILE_PATH="./logs/ACH_media_import_critical_errors.log"
WARING_LOG_FILE_PATH="./logs/ACH_media_import_warnings.log"
INFO_LOG_FILE_PATH="./logs/ACH_media_import_info.log"
# Email configuration (replace with your SMTP settings)
SMTP_SERVER=smtp.example.com
SMTP_PORT=587
SMTP_USER=your-smtp-user
SMTP_PASSWORD="your-smtp-password"
SENDER_EMAIL=sender@example.com
EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
ERROR_EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
SUCCESS_EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
# ACH configuration
ACH_ENV="development" # "production" or "development"
ACH_DRY_RUN="true"
ACH_SAFE_RUN="true"
ACH_CACHE_S3_LIST="true"
ACH_SYNC_CHUNK_SIZE=10 # in % of total files to import, from media_files_to_process used to determine the number of files to process in each batch
ACH_EDITOR_ID=1
ACH_APPROVER_ID=1
ACH_NOTES="Imported automatically from the S3 bucket"
ACH_STORAGE_LOCATION='{"storage_type": "lto", "storage_location_id": 6}'
ACH_FILE_TYPE='{"type": "video/mov"}'

222
.gitignore vendored
View File

@ -7,3 +7,225 @@ __pycache__/
logs/ logs/
*.logs *.logs
*.log *.log
output/
TODO-mime.md
.github/copilot-instructions.md
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
# poetry.lock
# poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
# pdm.lock
# pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
# pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# Redis
*.rdb
*.aof
*.pid
# RabbitMQ
mnesia/
rabbitmq/
rabbitmq-data/
# ActiveMQ
activemq-data/
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
# .idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# Streamlit
.streamlit/secrets.toml

View File

@ -95,6 +95,18 @@ docker compose up -d --build
docker compose logs -f app docker compose logs -f app
``` ```
### Run inside the container (from the host)
If you want to execute the importer manually inside the running container (for debugging or one-off runs), you can use either of the following:
```bash
# Using docker compose (recommended)
docker compose exec app python /app/main.py
# Or using docker exec with the container name
docker exec -it ACH_server_media_importer python /app/main.py
```
### Stop ### Stop
```bash ```bash

View File

@ -30,7 +30,8 @@ from psycopg2 import sql
import logging import logging
from datetime import datetime from datetime import datetime
import re import re
from email_utils import handle_error from error_handler import notify_error
from utils import check_audio_info
import json import json
import os import os
import config import config
@ -73,8 +74,24 @@ def get_mime_from_mediainfo(ach_variables: dict) -> str:
mediainfo = ach_variables.get('custom_data_in', {}).get('mediainfo', {}) mediainfo = ach_variables.get('custom_data_in', {}).get('mediainfo', {})
tracks = mediainfo.get('media', {}).get('track', []) tracks = mediainfo.get('media', {}).get('track', [])
# ---- Master (outside FILE/) must be ProRes ---- # ---- Master (outside FILE/) ----
if not is_file_folder: if not is_file_folder:
# Determine if this is an audio-type inventory code (OA4/MCC/DAT) or video master.
inventory_code = ach_variables.get('inventory_code', '') or ''
inventory_type = inventory_code[3:6] if len(inventory_code) >= 6 else ''
audio_inventory_types = {'OA4', 'MCC', 'DAT'}
if inventory_type in audio_inventory_types:
# For audio masters we validate the audio metadata (not ProRes video tracks).
result, message = check_audio_info(mediainfo)
if not result:
raise ValueError(f"Audio validation failed: {message}")
# Derive MIME from extension; fall back to the configured mapping.
extension = os.path.splitext(file_fullpath_norm)[1].lower()
return get_mime_for_extension(extension or ach_variables.get('extension'))
# Otherwise, enforce a ProRes video track for video masters.
# Find the video track # Find the video track
video_track = None video_track = None
for t in tracks: for t in tracks:
@ -250,16 +267,15 @@ def check_inventory_in_db(s3_client, cur, base_name):
if result: if result:
logging.info(f"Inventory code {truncated_base_name[:12]} found in the database.") logging.info(f"Inventory code {truncated_base_name[:12]} found in the database.")
# Call the function to retrieve digital file names, assuming this function is implemented
return True, truncated_base_name return True, truncated_base_name
else: else:
logging.info(f"Inventory code {truncated_base_name} not found in the database.") logging.info(f"Inventory code {truncated_base_name} not found in the database.")
handle_error(f"Inventory code {truncated_base_name} not found in the database.") notify_error(f"Inventory code {truncated_base_name} not found in the database.")
#raise ValueError(f"Inventory code {truncated_base_name} not found in the database.") #raise ValueError(f"Inventory code {truncated_base_name} not found in the database.")
return False, None return False, None
except Exception as e: except Exception as e:
logging.error(f'Error checking inventory code {base_name}:', {e}) notify_error(f'Error checking inventory code {base_name}', e)
raise e raise e
# Function to check if the object key exists in the database # Function to check if the object key exists in the database
@ -296,7 +312,7 @@ def check_objkey_in_file_db(cur, base_name):
return False return False
except Exception as e: except Exception as e:
logging.error(f"Error checking inventory code {base_name}: {e}") notify_error(f"Error checking inventory code {base_name}", e)
raise e raise e
# Function to add a file record and its relationship to the support record # Function to add a file record and its relationship to the support record
@ -333,7 +349,7 @@ def add_file_record_and_relationship(s3_client, cur, base_name,ach_variables):
notes = f"{ach_config.get('ach_notes','') } {date_part} {time_part}" notes = f"{ach_config.get('ach_notes','') } {date_part} {time_part}"
ach_variables['file_copia_conservativa'] = ach_variables['custom_data_in'].get('mediainfo', {}).get("media", {}).get("@ref", "") ach_variables['file_copia_conservativa'] = ach_variables['custom_data_in'].get('mediainfo', {}).get("media", {}).get("@ref", "")
logging.info(f"ach_variables['file_copia_conservativa']a: {ach_variables['file_copia_conservativa']}") logging.info(f"ach_variables['file_copia_conservativa']: {ach_variables['file_copia_conservativa']}")
logging.debug("Executing add_file_record_and_relationship") logging.debug("Executing add_file_record_and_relationship")
@ -441,7 +457,7 @@ def add_file_record_and_relationship(s3_client, cur, base_name,ach_variables):
return False return False
except Exception as e: except Exception as e:
logging.error(f'Error adding file record and relationship: {e}') notify_error(f"Error adding file record and relationship: {base_name}", e)
raise e raise e
# Function to add a file record # Function to add a file record

View File

@ -1,7 +1,7 @@
services: services:
app: app:
build: . build: .
container_name: ACH_server_media_importer container_name: ACH_server_media_importer02
volumes: volumes:
- logs:/app/logs # Add this line to map the logs volume - logs:/app/logs # Add this line to map the logs volume
env_file: env_file:

View File

@ -17,6 +17,18 @@ def handle_value_error(e):
def handle_error(error_message): def handle_error(error_message):
logging.error(f"Error: {error_message}") logging.error(f"Error: {error_message}")
def notify_error(error_message, e=None):
"""
Centralized error reporting: logs the error and triggers email notification.
"""
full_message = f"{error_message}: {e}" if e else error_message
logging.error(full_message)
try:
from email_utils import handle_error as send_notification
send_notification(Exception(full_message) if e is None else e)
except Exception as notify_err:
logging.error(f"Failed to trigger email notification: {notify_err}")
class ClientError(Exception): class ClientError(Exception):
"""Custom exception class for client errors.""" """Custom exception class for client errors."""
pass pass

View File

@ -72,7 +72,7 @@ def retrieve_file_contents(s3, base_name):
logging.error(f'Error formatting file contents as JSON: {e}', exc_info=True) logging.error(f'Error formatting file contents as JSON: {e}', exc_info=True)
raise e raise e
def check_related_files(s3, file_name_with_path, file, bucket_name): def check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=None):
""" """
Check for related files in S3 based on the given file type. Check for related files in S3 based on the given file type.
Parameters: Parameters:
@ -80,6 +80,7 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
- file_name_with_path: The name of the file with its path. - file_name_with_path: The name of the file with its path.
- file: The file name. - file: The file name.
- bucket_name: The name of the S3 bucket. - bucket_name: The name of the S3 bucket.
- s3_listing_cache: Optional mapping of S3 keys to listing objects.
Returns: Returns:
None None
Raises: Raises:
@ -111,7 +112,14 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
logging.info(f"Checking for related file: {related_file}") logging.info(f"Checking for related file: {related_file}")
try: try:
if not check_file_exists_in_s3(s3, related_file,bucket_name): # Optimized existence check
exists = False
if s3_listing_cache:
exists = related_file in s3_listing_cache
else:
exists = check_file_exists_in_s3(s3, related_file, bucket_name)
if not exists:
error_message = f"Required file {related_file} not found in S3." error_message = f"Required file {related_file} not found in S3."
logging.error(error_message) logging.error(error_message)
raise FileNotFoundError(error_message) raise FileNotFoundError(error_message)
@ -127,9 +135,14 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
# Check the size of the related file # Check the size of the related file
try: try:
if ext in ['json', 'md5', 'pdf']: if ext in ['json', 'md5', 'pdf']:
# Optimized size check
if s3_listing_cache and related_file in s3_listing_cache:
file_size = s3_listing_cache[related_file].get('Size')
else:
file_size = get_file_size(s3, bucket_name, related_file) file_size = get_file_size(s3, bucket_name, related_file)
if file_size == 0:
error_message = f"File {related_file} has zero size." if file_size == 0 or file_size is None:
error_message = f"File {related_file} has zero size or missing."
logging.error(error_message) logging.error(error_message)
raise ValueError(error_message) raise ValueError(error_message)
else: else:
@ -143,7 +156,16 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
# If the required file is a .pdf, get its size and update ach_pdf_disk_size # If the required file is a .pdf, get its size and update ach_pdf_disk_size
if ext =='pdf': if ext =='pdf':
pdf_file = f"{file_name_with_path}.pdf" pdf_file = f"{file_name_with_path}.pdf"
if check_file_exists_in_s3(s3, pdf_file,bucket_name): pdf_exists = False
if s3_listing_cache:
pdf_exists = pdf_file in s3_listing_cache
else:
pdf_exists = check_file_exists_in_s3(s3, pdf_file, bucket_name)
if pdf_exists:
if s3_listing_cache and pdf_file in s3_listing_cache:
pdf_file_size = s3_listing_cache[pdf_file].get('Size')
else:
pdf_file_size = get_file_size(s3, bucket_name, pdf_file) pdf_file_size = get_file_size(s3, bucket_name, pdf_file)
ach_pdf_disk_size = pdf_file_size ach_pdf_disk_size = pdf_file_size
# logging.info(f"PDF disk size: {ach_pdf_disk_size}") # logging.info(f"PDF disk size: {ach_pdf_disk_size}")

84
main.py
View File

@ -5,6 +5,7 @@ from datetime import datetime
import pytz import pytz
import os import os
import re import re
import math
from logging_config import setup_logging, CUSTOM_ERROR_LEVEL from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
from email_utils import handle_error, send_email_with_attachment from email_utils import handle_error, send_email_with_attachment
from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
@ -90,17 +91,24 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
# List S3 bucket s3_validated_contents # List S3 bucket s3_validated_contents
list_s3_files = list_s3_bucket(s3_client, bucket_name) list_s3_files = list_s3_bucket(s3_client, bucket_name)
# Build a quick in-memory map (cache) of the bucket listing.
# This will be used exclusively for metadata lookups (size/existence)
# to avoid redundant S3 network calls, without changing the main logic.
s3_listing_cache = {obj['Key']: obj for obj in list_s3_files}
# Define valid extensions and excluded folders # Define valid extensions and excluded folders
# NOTE: This list is used only for the initial S3 filtering step (Phase 1). # NOTE: This list is used only for the initial S3 filtering step (Phase 1).
# It determines which object keys are considered for further processing. # It determines which object keys are considered for further processing.
valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'} valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'} # dont like this
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/'} # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/'}
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/', 'DVD/', 'UMT/'} # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/', 'DVD/', 'UMT/'}
excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'UMT/'} excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/' ,'MCC/'}
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/',} # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/',}
# included_folders = {'FILE/'} # uncomment this to NOT use excluded folders # included_folders = {'FILE/'} # uncomment this to NOT use excluded folders
# included_folders = {'TEST-FOLDER-DEV/'} # uncomment this to NOT use excluded folders # included_folders = {'TEST-FOLDER-DEV/'} # uncomment this to NOT use excluded folders
# aggiungere un distinct e count delle estensioni prima di qualsiasi filtro
# Extract and filter file names # Extract and filter file names
# s3_file_names: include only files that match valid extensions and # s3_file_names: include only files that match valid extensions and
@ -112,7 +120,26 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
except NameError: except NameError:
use_included = False use_included = False
logging.info(f"Filtering S3 objects with valid extensions: {valid_extensions}")
if use_included: if use_included:
logging.info(f"Using include-folder filter: {included_folders}")
filter_mode = "include"
else:
logging.info(f"Using exclusion-folder filter: {excluded_folders}")
filter_mode = "exclude"
# Ask for confirmation before proceeding (y/N). If user declines, exit cleanly.
try:
answer = input(f"Proceed using '{filter_mode}' filter mode? (y/N): ").strip().lower()
except Exception:
answer = 'n'
if answer != 'y':
logging.info("User chose not to proceed with the current filter mode. Exiting.")
return
if use_included:
s3_file_names = [ s3_file_names = [
content['Key'] for content in list_s3_files content['Key'] for content in list_s3_files
if any(content['Key'].endswith(ext) for ext in valid_extensions) if any(content['Key'].endswith(ext) for ext in valid_extensions)
@ -127,6 +154,19 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
] ]
logging.info("Using excluded_folders filter") logging.info("Using excluded_folders filter")
# Count file extensions that survived the initial filtering.
# This provides a stable "perma" summary of what is being considered
# for the rest of the workflow.
from collections import Counter
extension_counts = Counter(
os.path.splitext(f)[1].lower() or "(no_ext)" for f in s3_file_names
)
# Log a user-friendly multi-line summary instead of a single dict dump
extension_summary = "\n".join(
f" {ext or '(no_ext)'}: {count}" for ext, count in sorted(extension_counts.items())
)
logging.info("Extension counts after initial filtering:\n%s", extension_summary)
# check inventory code syntax # check inventory code syntax
# first check s3_file_names if the file base name and folder name match pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$' # first check s3_file_names if the file base name and folder name match pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$'
s3_validated_contents = [] s3_validated_contents = []
@ -207,11 +247,16 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
filtered_file_names=list_s3_not_in_db(s3_validated_contents, db_file_names, db_sidecar_basenames) filtered_file_names=list_s3_not_in_db(s3_validated_contents, db_file_names, db_sidecar_basenames)
# Produces only media files for Phase 3 parsing.
# Sidecars (.json, .md5, .pdf) are validated as dependencies of the media files.
media_files_to_process = [f for f in filtered_file_names if f.lower().endswith(('.mp4', '.mp3'))]
# Print the total number of files # Print the total number of files
total_files_s3 = len(s3_validated_contents) total_files_s3 = len(s3_validated_contents)
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}") logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
total_files = len(filtered_file_names) total_files = len(filtered_file_names)
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}") logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")
logging.info(f"Total media files (.mp4, .mp3) to process in Phase 3: {len(media_files_to_process)}")
# Log the files that need to be updated (those not yet in DB) # Log the files that need to be updated (those not yet in DB)
if total_files > 0: if total_files > 0:
@ -329,11 +374,44 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
logging.info("PHASE 3: Parse S3 objects and insert new records into the database") logging.info("PHASE 3: Parse S3 objects and insert new records into the database")
# Implement ACH_SYNC_CHUNK_SIZE (dev/testing only).
# Production always processes 100%.
# ach_sync_pct = 100
# if ach_env != 'production':
# try:
# ach_sync_pct = int(os.getenv('ACH_SYNC_CHUNK_SIZE', '100'))
# except Exception:
# ach_sync_pct = 100
# ach_sync_pct = max(0, min(100, ach_sync_pct))
# total_media_files = len(media_files_to_process)
# if total_media_files > 0 and ach_sync_pct < 100:
# sync_limit = max(1, math.ceil(total_media_files * ach_sync_pct / 100))
# logging.info(
# "ACH_SYNC_CHUNK_SIZE enabled: processing %s/%s media files (%s%%)",
# sync_limit,
# total_media_files,
# ach_sync_pct,
# )
# media_files_to_process = media_files_to_process[:sync_limit]
# else:
# logging.info(
# "Processing all %s media files (ACH_SYNC_CHUNK_SIZE=%s%%)",
# total_media_files,
# ach_sync_pct,
# )
# Try to parse S3 files # Try to parse S3 files
try: try:
# If DRY RUN is set to True, the files will not be uploaded to the database # If DRY RUN is set to True, the files will not be uploaded to the database
if os.getenv('ACH_DRY_RUN', 'true') == 'false': if os.getenv('ACH_DRY_RUN', 'true') == 'false':
uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(s3_client, filtered_file_names, ach_variables, excluded_folders) uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(
s3_client,
media_files_to_process,
ach_variables,
excluded_folders,
s3_listing_cache=s3_listing_cache
)
else: else:
logging.warning("DRY RUN is set to TRUE - No files will be added to the database") logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
# set the tuples to zero # set the tuples to zero

1
s3_cache.json Normal file

File diff suppressed because one or more lines are too long

View File

@ -7,20 +7,21 @@ import psycopg2 # for PostgreSQL
# Import custom modules # Import custom modules
from file_utils import retrieve_file_contents, check_related_files, extract_and_validate_file_info # for file operations from file_utils import retrieve_file_contents, check_related_files, extract_and_validate_file_info # for file operations
from email_utils import handle_error # for error handling deprecated? from error_handler import notify_error
from db_utils import get_db_connection, check_inventory_in_db, check_objkey_in_file_db, add_file_record_and_relationship, retrieve_digital_file_names # for database operations from db_utils import get_db_connection, check_inventory_in_db, check_objkey_in_file_db, add_file_record_and_relationship, retrieve_digital_file_names # for database operations
import config import config
# Function to check the existence of related files and validate in PostgreSQL # Function to check the existence of related files and validate in PostgreSQL
def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]): def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[], s3_listing_cache=None):
""" """
Parses the S3 files and performs various operations on them. Parses the S3 files and performs various operations on them.
Args: Args:
s3 (S3): The S3 object for accessing S3 services. s3 (S3): The S3 object for accessing S3 services.
s3_files (list): The list of S3 files to be processed. s3_files (list): The list of S3 files to be processed.
s3_listing_cache (dict, optional): Mapping of S3 keys to listing objects.
Returns: Returns:
None tuple: (uploaded_files_count, warning_files_count, error_files_count)
Raises: Raises:
FileNotFoundError: If a required file is not found in S3. FileNotFoundError: If a required file is not found in S3.
ValueError: If a file has zero size or if the file type is unsupported. ValueError: If a file has zero size or if the file type is unsupported.
@ -69,9 +70,42 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
# Display progress to console only (not written to log files) # Display progress to console only (not written to log files)
print(f"--------------\n--- file {idx} of {total_files} ---\n--------------", flush=True) print(f"--------------\n--- file {idx} of {total_files} ---\n--------------", flush=True)
# Ensure we start each file with a clean transaction state.
try:
conn.rollback()
except Exception as e:
logging.error(f"Rollback failed before processing {file}: {e}")
try:
cur.close()
except Exception:
pass
cur = conn.cursor()
try:
# Use a savepoint per file to allow rollback on individual failures # Use a savepoint per file to allow rollback on individual failures
# without aborting the full batch. # without aborting the full batch.
cur.execute("SAVEPOINT file_save") cur.execute("SAVEPOINT file_save")
except Exception as e:
# If the transaction is aborted, log and retry once.
import traceback
logging.error(f"Transaction aborted before processing {file}, retrying after reset: {e}")
logging.error(traceback.format_exc())
try:
conn.rollback()
except Exception as rollback_err:
logging.error(f"Rollback failed while recovering from aborted transaction: {rollback_err}")
try:
cur.close()
except Exception:
pass
cur = conn.cursor()
# Retry savepoint once after recovery
cur.execute("SAVEPOINT file_save")
try: try:
if file.endswith(('.mp4', '.mp3')): # Check for both .mp4 and .mp3 if file.endswith(('.mp4', '.mp3')): # Check for both .mp4 and .mp3
logging.info("Processing file: %s in the bucket: %s", file, bucket_name) logging.info("Processing file: %s in the bucket: %s", file, bucket_name)
@ -130,7 +164,13 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
try: try:
# Retrieve and log the file size # Retrieve and log the file size
# Optimized: Check cache first
if s3_listing_cache and file in s3_listing_cache:
file_size = s3_listing_cache[file].get('Size')
logging.info(f"Retrieved file size from cache for: {file}")
else:
file_size = get_file_size(s3, bucket_name, file) file_size = get_file_size(s3, bucket_name, file)
# maybe can throw an error inside the get_file_size function and catch it here # maybe can throw an error inside the get_file_size function and catch it here
if file_size is not None: if file_size is not None:
ach_variables['media_disk_size'] = file_size ach_variables['media_disk_size'] = file_size
@ -147,7 +187,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
# Check if related file exist and retreive .pdf file size # Check if related file exist and retreive .pdf file size
try: try:
# Check if the required files exist in S3 # Check if the required files exist in S3
ach_variables['pdf_disk_size'] = check_related_files(s3, file_name_with_path, file, bucket_name) ach_variables['pdf_disk_size'] = check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=s3_listing_cache)
logging.info(f"PDF disk size: {ach_variables['pdf_disk_size']}") logging.info(f"PDF disk size: {ach_variables['pdf_disk_size']}")
except FileNotFoundError as e: except FileNotFoundError as e:
# Handle case where the file is not found # Handle case where the file is not found
@ -166,7 +206,10 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
continue # Move on to the next file in the loop continue # Move on to the next file in the loop
except Exception as e: except Exception as e:
# Handle any other exceptions # Handle any other exceptions
logging.error(f"An error occurred: {e}") logging.error(f"Validation step failed for {file}: {e}")
cur.execute("ROLLBACK TO SAVEPOINT file_save")
error_files_count += 1
continue
# Retrieve the file contents for related files: .md5, .json # Retrieve the file contents for related files: .md5, .json
try: try:
@ -176,6 +219,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
except Exception as e: except Exception as e:
# Log the error # Log the error
logging.error(f"Error retrieving file contents for {file_name_with_path}: {e}") logging.error(f"Error retrieving file contents for {file_name_with_path}: {e}")
cur.execute("ROLLBACK TO SAVEPOINT file_save")
file_contents = None # Set file_contents to None or handle it as needed file_contents = None # Set file_contents to None or handle it as needed
error_files_count +=1 error_files_count +=1
continue # Move on to the next file in the loop continue # Move on to the next file in the loop
@ -183,6 +227,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
# if contents dont exists # if contents dont exists
if file_contents is None: if file_contents is None:
logging.error(f"Error retrieving file contents for {file}.") logging.error(f"Error retrieving file contents for {file}.")
cur.execute("ROLLBACK TO SAVEPOINT file_save")
error_files_count +=1 error_files_count +=1
continue # Move on to the next file in the loop continue # Move on to the next file in the loop
@ -222,14 +267,17 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
add_file_record_and_relationship(s3, cur, base_name, ach_variables) add_file_record_and_relationship(s3, cur, base_name, ach_variables)
else: else:
logging.warning(f"File record already exists for {base_name}.") logging.warning(f"File record already exists for {base_name}.")
cur.execute("ROLLBACK TO SAVEPOINT file_save")
warning_files_count +=1 warning_files_count +=1
continue continue
else: else:
logging.error(f"Inventory code {base_name} not found in the database.") logging.error(f"Inventory code {base_name} not found in the database.")
cur.execute("ROLLBACK TO SAVEPOINT file_save")
error_files_count +=1 error_files_count +=1
continue continue
except ValueError as e: except Exception as e:
logging.error(f"An error occurred: {e}") logging.error(f"DB operation failed for {base_name}: {e}")
cur.execute("ROLLBACK TO SAVEPOINT file_save")
error_files_count +=1 error_files_count +=1
continue continue
@ -241,7 +289,9 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
uploaded_files_count +=1 uploaded_files_count +=1
except Exception as e: except Exception as e:
# Roll back the changes done for this file only and continue processing others # Roll back the changes done for this file only and continue processing others
import traceback
logging.error(f"Error processing {file}: {e}. Rolling back this file's changes.") logging.error(f"Error processing {file}: {e}. Rolling back this file's changes.")
logging.error(traceback.format_exc())
try: try:
cur.execute("ROLLBACK TO SAVEPOINT file_save") cur.execute("ROLLBACK TO SAVEPOINT file_save")
except Exception as rollback_err: except Exception as rollback_err:
@ -264,8 +314,8 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
raise e # Raise the exception to the calling function raise e # Raise the exception to the calling function
except Exception as e: except Exception as e:
# Handle any other unexpected errors # Handle any other unexpected errors
logging.error(f"Unexpected error: {e}") import traceback
#handle_error(e) # Pass unexpected errors to handle_error notify_error("FATAL ERROR in Phase 3 process", e)
raise e # Raise the exception to the calling function raise e # Raise the exception to the calling function
# return the file saved # return the file saved
@ -294,15 +344,54 @@ def create_s3_client(aws_config):
# Function to list the contents of an S3 bucket # Function to list the contents of an S3 bucket
def list_s3_bucket(s3_client, bucket_name): def list_s3_bucket(s3_client, bucket_name):
"""
Lists S3 bucket contents with optional local JSON caching.
Uses ACH_CACHE_S3_LIST from .env to decide if it should cache/read from 's3_cache.json'.
"""
cache_enabled = os.getenv('ACH_CACHE_S3_LIST', 'false').lower() == 'true'
cache_file = 's3_cache.json'
if cache_enabled and os.path.exists(cache_file):
try: try:
with open(cache_file, 'r', encoding='utf-8') as f:
cached_data = json.load(f)
logging.info(f"Loaded {len(cached_data)} items from local cache: {cache_file}")
return cached_data
except Exception as e:
logging.warning(f"Failed to read S3 cache file: {e}. Falling back to S3 listing.")
try:
logging.info(f"Listing all objects in bucket: {bucket_name}...")
paginator = s3_client.get_paginator('list_objects_v2') paginator = s3_client.get_paginator('list_objects_v2')
bucket_contents = [] bucket_contents = []
# Convert datetime objects to string for JSON serialization
def _serialize_datetime(obj):
if isinstance(obj, datetime):
return obj.isoformat()
return obj
from datetime import datetime
for page in paginator.paginate(Bucket=bucket_name): for page in paginator.paginate(Bucket=bucket_name):
if 'Contents' in page: if 'Contents' in page:
bucket_contents.extend(page['Contents']) for obj in page['Contents']:
# Normalize dates for JSON compatibility
if 'LastModified' in obj:
obj['LastModified'] = obj['LastModified'].isoformat()
bucket_contents.extend([obj])
logging.info(f"Retrieved {len(bucket_contents)} items from the bucket.") logging.info(f"Retrieved {len(bucket_contents)} items from the bucket.")
# Save to cache if enabled
if cache_enabled:
try:
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump(bucket_contents, f)
logging.info(f"S3 bucket listing saved to local cache: {cache_file}")
except Exception as e:
logging.warning(f"Failed to write S3 cache file: {e}")
return bucket_contents return bucket_contents
except ClientError as e: except ClientError as e:
logging.error(f'Error listing bucket contents: {e}') logging.error(f'Error listing bucket contents: {e}')