Version 2.1
This commit is contained in:
parent
b90b7bf3e2
commit
81639a87b5
|
|
@ -0,0 +1,52 @@
|
||||||
|
# AWS credentials (replace with your own credentials)
|
||||||
|
AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY_ID
|
||||||
|
AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_ACCESS_KEY
|
||||||
|
AWS_REGION=us-east-1
|
||||||
|
AWS_ENDPOINT_URL=https://s3.your-provider.example.com
|
||||||
|
|
||||||
|
BUCKET_NAME=your-bucket-name
|
||||||
|
|
||||||
|
# Database credentials (replace with your own database info)
|
||||||
|
#DB_HOST=your-db-host
|
||||||
|
#DB_NAME=your_db_name
|
||||||
|
#DB_USER=your_db_user
|
||||||
|
#DB_PASSWORD=your_db_password
|
||||||
|
#DB_PORT=5432
|
||||||
|
|
||||||
|
# Example local development database
|
||||||
|
DB_HOST=127.0.0.1
|
||||||
|
DB_NAME=your_local_db_name
|
||||||
|
# DB_NAME=artchive_production
|
||||||
|
DB_USER=your_local_db_user
|
||||||
|
DB_PASSWORD=your_local_db_password
|
||||||
|
DB_PORT=5432
|
||||||
|
|
||||||
|
# LOGS FILE
|
||||||
|
LOG_FILE_PATH="./logs/ACH_media_import_errors.log"
|
||||||
|
ERROR_LOG_FILE_PATH="./logs/ACH_media_import_critical_errors.log"
|
||||||
|
WARING_LOG_FILE_PATH="./logs/ACH_media_import_warnings.log"
|
||||||
|
INFO_LOG_FILE_PATH="./logs/ACH_media_import_info.log"
|
||||||
|
|
||||||
|
# Email configuration (replace with your SMTP settings)
|
||||||
|
SMTP_SERVER=smtp.example.com
|
||||||
|
SMTP_PORT=587
|
||||||
|
SMTP_USER=your-smtp-user
|
||||||
|
SMTP_PASSWORD="your-smtp-password"
|
||||||
|
SENDER_EMAIL=sender@example.com
|
||||||
|
EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
|
||||||
|
ERROR_EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
|
||||||
|
SUCCESS_EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
|
||||||
|
|
||||||
|
# ACH configuration
|
||||||
|
ACH_ENV="development" # "production" or "development"
|
||||||
|
ACH_DRY_RUN="true"
|
||||||
|
ACH_SAFE_RUN="true"
|
||||||
|
ACH_CACHE_S3_LIST="true"
|
||||||
|
|
||||||
|
ACH_SYNC_CHUNK_SIZE=10 # percentage of the total media files (media_files_to_process) to process in each batch
|
||||||
|
|
||||||
|
ACH_EDITOR_ID=1
|
||||||
|
ACH_APPROVER_ID=1
|
||||||
|
ACH_NOTES="Imported automatically from the S3 bucket"
|
||||||
|
ACH_STORAGE_LOCATION='{"storage_type": "lto", "storage_location_id": 6}'
|
||||||
|
ACH_FILE_TYPE='{"type": "video/mov"}'
|
||||||
|
|
@ -6,4 +6,226 @@ __pycache__/
|
||||||
*.pyo
|
*.pyo
|
||||||
logs/
|
logs/
|
||||||
*.logs
|
*.logs
|
||||||
*.log
|
*.log
|
||||||
|
|
||||||
|
output/
|
||||||
|
|
||||||
|
TODO-mime.md
|
||||||
|
.github/copilot-instructions.md
|
||||||
|
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[codz]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py.cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
# Pipfile.lock
|
||||||
|
|
||||||
|
# UV
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# uv.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
# poetry.lock
|
||||||
|
# poetry.toml
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
||||||
|
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
||||||
|
# pdm.lock
|
||||||
|
# pdm.toml
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build/
|
||||||
|
|
||||||
|
# pixi
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
||||||
|
# pixi.lock
|
||||||
|
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
||||||
|
# in the .venv directory. It is recommended not to include this directory in version control.
|
||||||
|
.pixi
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# Redis
|
||||||
|
*.rdb
|
||||||
|
*.aof
|
||||||
|
*.pid
|
||||||
|
|
||||||
|
# RabbitMQ
|
||||||
|
mnesia/
|
||||||
|
rabbitmq/
|
||||||
|
rabbitmq-data/
|
||||||
|
|
||||||
|
# ActiveMQ
|
||||||
|
activemq-data/
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.envrc
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
# .idea/
|
||||||
|
|
||||||
|
# Abstra
|
||||||
|
# Abstra is an AI-powered process automation framework.
|
||||||
|
# Ignore directories containing user credentials, local state, and settings.
|
||||||
|
# Learn more at https://abstra.io/docs
|
||||||
|
.abstra/
|
||||||
|
|
||||||
|
# Visual Studio Code
|
||||||
|
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
||||||
|
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
||||||
|
# you could uncomment the following to ignore the entire vscode folder
|
||||||
|
# .vscode/
|
||||||
|
|
||||||
|
# Ruff stuff:
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# PyPI configuration file
|
||||||
|
.pypirc
|
||||||
|
|
||||||
|
# Marimo
|
||||||
|
marimo/_static/
|
||||||
|
marimo/_lsp/
|
||||||
|
__marimo__/
|
||||||
|
|
||||||
|
# Streamlit
|
||||||
|
.streamlit/secrets.toml
|
||||||
12
README.md
12
README.md
|
|
@ -95,6 +95,18 @@ docker compose up -d --build
|
||||||
docker compose logs -f app
|
docker compose logs -f app
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Run inside the container (from the host)
|
||||||
|
|
||||||
|
If you want to execute the importer manually inside the running container (for debugging or one-off runs), you can use either of the following:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Using docker compose (recommended)
|
||||||
|
docker compose exec app python /app/main.py
|
||||||
|
|
||||||
|
# Or using docker exec with the container name
|
||||||
|
docker exec -it ACH_server_media_importer02 python /app/main.py
|
||||||
|
```
|
||||||
|
|
||||||
### Stop
|
### Stop
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|
|
||||||
32
db_utils.py
32
db_utils.py
|
|
@ -30,7 +30,8 @@ from psycopg2 import sql
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import re
|
import re
|
||||||
from email_utils import handle_error
|
from error_handler import notify_error
|
||||||
|
from utils import check_audio_info
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import config
|
import config
|
||||||
|
|
@ -73,8 +74,24 @@ def get_mime_from_mediainfo(ach_variables: dict) -> str:
|
||||||
mediainfo = ach_variables.get('custom_data_in', {}).get('mediainfo', {})
|
mediainfo = ach_variables.get('custom_data_in', {}).get('mediainfo', {})
|
||||||
tracks = mediainfo.get('media', {}).get('track', [])
|
tracks = mediainfo.get('media', {}).get('track', [])
|
||||||
|
|
||||||
# ---- Master (outside FILE/) must be ProRes ----
|
# ---- Master (outside FILE/) ----
|
||||||
if not is_file_folder:
|
if not is_file_folder:
|
||||||
|
# Determine if this is an audio-type inventory code (OA4/MCC/DAT) or video master.
|
||||||
|
inventory_code = ach_variables.get('inventory_code', '') or ''
|
||||||
|
inventory_type = inventory_code[3:6] if len(inventory_code) >= 6 else ''
|
||||||
|
audio_inventory_types = {'OA4', 'MCC', 'DAT'}
|
||||||
|
|
||||||
|
if inventory_type in audio_inventory_types:
|
||||||
|
# For audio masters we validate the audio metadata (not ProRes video tracks).
|
||||||
|
result, message = check_audio_info(mediainfo)
|
||||||
|
if not result:
|
||||||
|
raise ValueError(f"Audio validation failed: {message}")
|
||||||
|
|
||||||
|
# Derive MIME from extension; fall back to the configured mapping.
|
||||||
|
extension = os.path.splitext(file_fullpath_norm)[1].lower()
|
||||||
|
return get_mime_for_extension(extension or ach_variables.get('extension'))
|
||||||
|
|
||||||
|
# Otherwise, enforce a ProRes video track for video masters.
|
||||||
# Find the video track
|
# Find the video track
|
||||||
video_track = None
|
video_track = None
|
||||||
for t in tracks:
|
for t in tracks:
|
||||||
|
|
@ -250,16 +267,15 @@ def check_inventory_in_db(s3_client, cur, base_name):
|
||||||
|
|
||||||
if result:
|
if result:
|
||||||
logging.info(f"Inventory code {truncated_base_name[:12]} found in the database.")
|
logging.info(f"Inventory code {truncated_base_name[:12]} found in the database.")
|
||||||
# Call the function to retrieve digital file names, assuming this function is implemented
|
|
||||||
return True, truncated_base_name
|
return True, truncated_base_name
|
||||||
else:
|
else:
|
||||||
logging.info(f"Inventory code {truncated_base_name} not found in the database.")
|
logging.info(f"Inventory code {truncated_base_name} not found in the database.")
|
||||||
handle_error(f"Inventory code {truncated_base_name} not found in the database.")
|
notify_error(f"Inventory code {truncated_base_name} not found in the database.")
|
||||||
#raise ValueError(f"Inventory code {truncated_base_name} not found in the database.")
|
#raise ValueError(f"Inventory code {truncated_base_name} not found in the database.")
|
||||||
return False, None
|
return False, None
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f'Error checking inventory code {base_name}:', {e})
|
notify_error(f'Error checking inventory code {base_name}', e)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
# Function to check if the object key exists in the database
|
# Function to check if the object key exists in the database
|
||||||
|
|
@ -296,7 +312,7 @@ def check_objkey_in_file_db(cur, base_name):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Error checking inventory code {base_name}: {e}")
|
notify_error(f"Error checking inventory code {base_name}", e)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
# Function to add a file record and its relationship to the support record
|
# Function to add a file record and its relationship to the support record
|
||||||
|
|
@ -333,7 +349,7 @@ def add_file_record_and_relationship(s3_client, cur, base_name,ach_variables):
|
||||||
notes = f"{ach_config.get('ach_notes','') } {date_part} {time_part}"
|
notes = f"{ach_config.get('ach_notes','') } {date_part} {time_part}"
|
||||||
|
|
||||||
ach_variables['file_copia_conservativa'] = ach_variables['custom_data_in'].get('mediainfo', {}).get("media", {}).get("@ref", "")
|
ach_variables['file_copia_conservativa'] = ach_variables['custom_data_in'].get('mediainfo', {}).get("media", {}).get("@ref", "")
|
||||||
logging.info(f"ach_variables['file_copia_conservativa']a: {ach_variables['file_copia_conservativa']}")
|
logging.info(f"ach_variables['file_copia_conservativa']: {ach_variables['file_copia_conservativa']}")
|
||||||
|
|
||||||
logging.debug("Executing add_file_record_and_relationship")
|
logging.debug("Executing add_file_record_and_relationship")
|
||||||
|
|
||||||
|
|
@ -441,7 +457,7 @@ def add_file_record_and_relationship(s3_client, cur, base_name,ach_variables):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f'Error adding file record and relationship: {e}')
|
notify_error(f"Error adding file record and relationship: {base_name}", e)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
# Function to add a file record
|
# Function to add a file record
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
services:
|
services:
|
||||||
app:
|
app:
|
||||||
build: .
|
build: .
|
||||||
container_name: ACH_server_media_importer
|
container_name: ACH_server_media_importer02
|
||||||
volumes:
|
volumes:
|
||||||
- logs:/app/logs # Add this line to map the logs volume
|
- logs:/app/logs # Add this line to map the logs volume
|
||||||
env_file:
|
env_file:
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,18 @@ def handle_value_error(e):
|
||||||
def handle_error(error_message):
|
def handle_error(error_message):
|
||||||
logging.error(f"Error: {error_message}")
|
logging.error(f"Error: {error_message}")
|
||||||
|
|
||||||
|
def notify_error(error_message, e=None):
    """
    Centralized error reporting: logs the error and triggers email notification.

    Args:
        error_message: Human-readable description of what failed.
        e: Optional exception carrying the underlying cause. When given, its
            string form is appended to the logged message and the object
            itself is forwarded to the notification handler.

    Returns:
        None. Failures while importing or calling the notification handler
        are logged and swallowed, so reporting an error never raises a
        second one.
    """
    # Use one explicit None check for both the log line and the notification
    # payload, so a falsy-but-present `e` is treated the same way in both.
    has_detail = e is not None
    full_message = f"{error_message}: {e}" if has_detail else error_message
    logging.error(full_message)

    try:
        # Imported inside the try block so that an import failure is caught
        # and logged below rather than propagating to the caller.
        # NOTE(review): presumably this also avoids a circular import with
        # email_utils -- confirm.
        from email_utils import handle_error as send_notification

        # Forward the original exception when there is one; otherwise wrap
        # the message so the handler always receives an exception object.
        send_notification(e if has_detail else Exception(full_message))
    except Exception as notify_err:
        logging.error(f"Failed to trigger email notification: {notify_err}")
|
||||||
|
|
||||||
class ClientError(Exception):
|
class ClientError(Exception):
|
||||||
"""Custom exception class for client errors."""
|
"""Custom exception class for client errors."""
|
||||||
pass
|
pass
|
||||||
|
|
@ -72,7 +72,7 @@ def retrieve_file_contents(s3, base_name):
|
||||||
logging.error(f'Error formatting file contents as JSON: {e}', exc_info=True)
|
logging.error(f'Error formatting file contents as JSON: {e}', exc_info=True)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
def check_related_files(s3, file_name_with_path, file, bucket_name):
|
def check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=None):
|
||||||
"""
|
"""
|
||||||
Check for related files in S3 based on the given file type.
|
Check for related files in S3 based on the given file type.
|
||||||
Parameters:
|
Parameters:
|
||||||
|
|
@ -80,6 +80,7 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
|
||||||
- file_name_with_path: The name of the file with its path.
|
- file_name_with_path: The name of the file with its path.
|
||||||
- file: The file name.
|
- file: The file name.
|
||||||
- bucket_name: The name of the S3 bucket.
|
- bucket_name: The name of the S3 bucket.
|
||||||
|
- s3_listing_cache: Optional mapping of S3 keys to listing objects.
|
||||||
Returns:
|
Returns:
|
||||||
None
|
None
|
||||||
Raises:
|
Raises:
|
||||||
|
|
@ -111,7 +112,14 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
|
||||||
logging.info(f"Checking for related file: {related_file}")
|
logging.info(f"Checking for related file: {related_file}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not check_file_exists_in_s3(s3, related_file,bucket_name):
|
# Optimized existence check
|
||||||
|
exists = False
|
||||||
|
if s3_listing_cache:
|
||||||
|
exists = related_file in s3_listing_cache
|
||||||
|
else:
|
||||||
|
exists = check_file_exists_in_s3(s3, related_file, bucket_name)
|
||||||
|
|
||||||
|
if not exists:
|
||||||
error_message = f"Required file {related_file} not found in S3."
|
error_message = f"Required file {related_file} not found in S3."
|
||||||
logging.error(error_message)
|
logging.error(error_message)
|
||||||
raise FileNotFoundError(error_message)
|
raise FileNotFoundError(error_message)
|
||||||
|
|
@ -126,10 +134,15 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
|
||||||
|
|
||||||
# Check the size of the related file
|
# Check the size of the related file
|
||||||
try:
|
try:
|
||||||
if ext in ['json', 'md5', 'pdf']:
|
if ext in ['json', 'md5', 'pdf']:
|
||||||
file_size = get_file_size(s3, bucket_name, related_file)
|
# Optimized size check
|
||||||
if file_size == 0:
|
if s3_listing_cache and related_file in s3_listing_cache:
|
||||||
error_message = f"File {related_file} has zero size."
|
file_size = s3_listing_cache[related_file].get('Size')
|
||||||
|
else:
|
||||||
|
file_size = get_file_size(s3, bucket_name, related_file)
|
||||||
|
|
||||||
|
if file_size == 0 or file_size is None:
|
||||||
|
error_message = f"File {related_file} has zero size or missing."
|
||||||
logging.error(error_message)
|
logging.error(error_message)
|
||||||
raise ValueError(error_message)
|
raise ValueError(error_message)
|
||||||
else:
|
else:
|
||||||
|
|
@ -143,8 +156,17 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
|
||||||
# If the required file is a .pdf, get its size and update ach_pdf_disk_size
|
# If the required file is a .pdf, get its size and update ach_pdf_disk_size
|
||||||
if ext =='pdf':
|
if ext =='pdf':
|
||||||
pdf_file = f"{file_name_with_path}.pdf"
|
pdf_file = f"{file_name_with_path}.pdf"
|
||||||
if check_file_exists_in_s3(s3, pdf_file,bucket_name):
|
pdf_exists = False
|
||||||
pdf_file_size = get_file_size(s3, bucket_name, pdf_file)
|
if s3_listing_cache:
|
||||||
|
pdf_exists = pdf_file in s3_listing_cache
|
||||||
|
else:
|
||||||
|
pdf_exists = check_file_exists_in_s3(s3, pdf_file, bucket_name)
|
||||||
|
|
||||||
|
if pdf_exists:
|
||||||
|
if s3_listing_cache and pdf_file in s3_listing_cache:
|
||||||
|
pdf_file_size = s3_listing_cache[pdf_file].get('Size')
|
||||||
|
else:
|
||||||
|
pdf_file_size = get_file_size(s3, bucket_name, pdf_file)
|
||||||
ach_pdf_disk_size = pdf_file_size
|
ach_pdf_disk_size = pdf_file_size
|
||||||
# logging.info(f"PDF disk size: {ach_pdf_disk_size}")
|
# logging.info(f"PDF disk size: {ach_pdf_disk_size}")
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
84
main.py
84
main.py
|
|
@ -5,6 +5,7 @@ from datetime import datetime
|
||||||
import pytz
|
import pytz
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import math
|
||||||
from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
|
from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
|
||||||
from email_utils import handle_error, send_email_with_attachment
|
from email_utils import handle_error, send_email_with_attachment
|
||||||
from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
|
from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
|
||||||
|
|
@ -90,17 +91,24 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
# List S3 bucket s3_validated_contents
|
# List S3 bucket s3_validated_contents
|
||||||
list_s3_files = list_s3_bucket(s3_client, bucket_name)
|
list_s3_files = list_s3_bucket(s3_client, bucket_name)
|
||||||
|
|
||||||
|
# Build a quick in-memory map (cache) of the bucket listing.
|
||||||
|
# This will be used exclusively for metadata lookups (size/existence)
|
||||||
|
# to avoid redundant S3 network calls, without changing the main logic.
|
||||||
|
s3_listing_cache = {obj['Key']: obj for obj in list_s3_files}
|
||||||
|
|
||||||
# Define valid extensions and excluded folders
|
# Define valid extensions and excluded folders
|
||||||
# NOTE: This list is used only for the initial S3 filtering step (Phase 1).
|
# NOTE: This list is used only for the initial S3 filtering step (Phase 1).
|
||||||
# It determines which object keys are considered for further processing.
|
# It determines which object keys are considered for further processing.
|
||||||
valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'}
|
valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'} # dont like this
|
||||||
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/'}
|
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/'}
|
||||||
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/', 'DVD/', 'UMT/'}
|
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/', 'DVD/', 'UMT/'}
|
||||||
excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'UMT/'}
|
excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/' ,'MCC/'}
|
||||||
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/',}
|
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/',}
|
||||||
# included_folders = {'FILE/'} # uncomment this to NOT use excluded folders
|
# included_folders = {'FILE/'} # uncomment this to NOT use excluded folders
|
||||||
# included_folders = {'TEST-FOLDER-DEV/'} # uncomment this to NOT use excluded folders
|
# included_folders = {'TEST-FOLDER-DEV/'} # uncomment this to NOT use excluded folders
|
||||||
|
|
||||||
|
# TODO: add a distinct list and count of the file extensions before any filter is applied
|
||||||
|
|
||||||
# Extract and filter file names
|
# Extract and filter file names
|
||||||
|
|
||||||
# s3_file_names: include only files that match valid extensions and
|
# s3_file_names: include only files that match valid extensions and
|
||||||
|
|
@ -112,7 +120,26 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
except NameError:
|
except NameError:
|
||||||
use_included = False
|
use_included = False
|
||||||
|
|
||||||
|
logging.info(f"Filtering S3 objects with valid extensions: {valid_extensions}")
|
||||||
if use_included:
|
if use_included:
|
||||||
|
logging.info(f"Using include-folder filter: {included_folders}")
|
||||||
|
filter_mode = "include"
|
||||||
|
else:
|
||||||
|
logging.info(f"Using exclusion-folder filter: {excluded_folders}")
|
||||||
|
filter_mode = "exclude"
|
||||||
|
|
||||||
|
# Ask for confirmation before proceeding (y/N). If user declines, exit cleanly.
|
||||||
|
try:
|
||||||
|
answer = input(f"Proceed using '{filter_mode}' filter mode? (y/N): ").strip().lower()
|
||||||
|
except Exception:
|
||||||
|
answer = 'n'
|
||||||
|
|
||||||
|
if answer != 'y':
|
||||||
|
logging.info("User chose not to proceed with the current filter mode. Exiting.")
|
||||||
|
return
|
||||||
|
|
||||||
|
if use_included:
|
||||||
|
|
||||||
s3_file_names = [
|
s3_file_names = [
|
||||||
content['Key'] for content in list_s3_files
|
content['Key'] for content in list_s3_files
|
||||||
if any(content['Key'].endswith(ext) for ext in valid_extensions)
|
if any(content['Key'].endswith(ext) for ext in valid_extensions)
|
||||||
|
|
@ -127,6 +154,19 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
]
|
]
|
||||||
logging.info("Using excluded_folders filter")
|
logging.info("Using excluded_folders filter")
|
||||||
|
|
||||||
|
# Count file extensions that survived the initial filtering.
|
||||||
|
# This provides a summary of which files are being considered
|
||||||
|
# for the rest of the workflow.
|
||||||
|
from collections import Counter
|
||||||
|
extension_counts = Counter(
|
||||||
|
os.path.splitext(f)[1].lower() or "(no_ext)" for f in s3_file_names
|
||||||
|
)
|
||||||
|
# Log a user-friendly multi-line summary instead of a single dict dump
|
||||||
|
extension_summary = "\n".join(
|
||||||
|
f" {ext or '(no_ext)'}: {count}" for ext, count in sorted(extension_counts.items())
|
||||||
|
)
|
||||||
|
logging.info("Extension counts after initial filtering:\n%s", extension_summary)
|
||||||
|
|
||||||
# check inventory code syntax
|
# check inventory code syntax
|
||||||
# first check s3_file_names if the file base name and folder name match pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$'
|
# first check s3_file_names if the file base name and folder name match pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$'
|
||||||
s3_validated_contents = []
|
s3_validated_contents = []
|
||||||
|
|
@ -207,11 +247,16 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
filtered_file_names=list_s3_not_in_db(s3_validated_contents, db_file_names, db_sidecar_basenames)
|
filtered_file_names=list_s3_not_in_db(s3_validated_contents, db_file_names, db_sidecar_basenames)
|
||||||
|
|
||||||
|
|
||||||
|
# Produces only media files for Phase 3 parsing.
|
||||||
|
# Sidecars (.json, .md5, .pdf) are validated as dependencies of the media files.
|
||||||
|
media_files_to_process = [f for f in filtered_file_names if f.lower().endswith(('.mp4', '.mp3'))]
|
||||||
|
|
||||||
# Print the total number of files
|
# Print the total number of files
|
||||||
total_files_s3 = len(s3_validated_contents)
|
total_files_s3 = len(s3_validated_contents)
|
||||||
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
|
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
|
||||||
total_files = len(filtered_file_names)
|
total_files = len(filtered_file_names)
|
||||||
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")
|
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")
|
||||||
|
logging.info(f"Total media files (.mp4, .mp3) to process in Phase 3: {len(media_files_to_process)}")
|
||||||
|
|
||||||
# Log the files that need to be updated (those not yet in DB)
|
# Log the files that need to be updated (those not yet in DB)
|
||||||
if total_files > 0:
|
if total_files > 0:
|
||||||
|
|
@ -329,11 +374,44 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
# ---------------------------------------------------------------------
|
# ---------------------------------------------------------------------
|
||||||
logging.info("PHASE 3: Parse S3 objects and insert new records into the database")
|
logging.info("PHASE 3: Parse S3 objects and insert new records into the database")
|
||||||
|
|
||||||
|
# Implement ACH_SYNC_CHUNK_SIZE (dev/testing only).
|
||||||
|
# Production always processes 100%.
|
||||||
|
# ach_sync_pct = 100
|
||||||
|
# if ach_env != 'production':
|
||||||
|
# try:
|
||||||
|
# ach_sync_pct = int(os.getenv('ACH_SYNC_CHUNK_SIZE', '100'))
|
||||||
|
# except Exception:
|
||||||
|
# ach_sync_pct = 100
|
||||||
|
# ach_sync_pct = max(0, min(100, ach_sync_pct))
|
||||||
|
|
||||||
|
# total_media_files = len(media_files_to_process)
|
||||||
|
# if total_media_files > 0 and ach_sync_pct < 100:
|
||||||
|
# sync_limit = max(1, math.ceil(total_media_files * ach_sync_pct / 100))
|
||||||
|
# logging.info(
|
||||||
|
# "ACH_SYNC_CHUNK_SIZE enabled: processing %s/%s media files (%s%%)",
|
||||||
|
# sync_limit,
|
||||||
|
# total_media_files,
|
||||||
|
# ach_sync_pct,
|
||||||
|
# )
|
||||||
|
# media_files_to_process = media_files_to_process[:sync_limit]
|
||||||
|
# else:
|
||||||
|
# logging.info(
|
||||||
|
# "Processing all %s media files (ACH_SYNC_CHUNK_SIZE=%s%%)",
|
||||||
|
# total_media_files,
|
||||||
|
# ach_sync_pct,
|
||||||
|
# )
|
||||||
|
|
||||||
# Try to parse S3 files
|
# Try to parse S3 files
|
||||||
try:
|
try:
|
||||||
# If DRY RUN is set to True, the files will not be uploaded to the database
|
# If DRY RUN is set to True, the files will not be uploaded to the database
|
||||||
if os.getenv('ACH_DRY_RUN', 'true') == 'false':
|
if os.getenv('ACH_DRY_RUN', 'true') == 'false':
|
||||||
uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(s3_client, filtered_file_names, ach_variables, excluded_folders)
|
uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(
|
||||||
|
s3_client,
|
||||||
|
media_files_to_process,
|
||||||
|
ach_variables,
|
||||||
|
excluded_folders,
|
||||||
|
s3_listing_cache=s3_listing_cache
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
|
logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
|
||||||
# set the tuples to zero
|
# set the tuples to zero
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
119
s3_utils.py
119
s3_utils.py
|
|
@ -7,20 +7,21 @@ import psycopg2 # for PostgreSQL
|
||||||
|
|
||||||
# Import custom modules
|
# Import custom modules
|
||||||
from file_utils import retrieve_file_contents, check_related_files, extract_and_validate_file_info # for file operations
|
from file_utils import retrieve_file_contents, check_related_files, extract_and_validate_file_info # for file operations
|
||||||
from email_utils import handle_error # for error handling (deprecated?)
|
from error_handler import notify_error
|
||||||
from db_utils import get_db_connection, check_inventory_in_db, check_objkey_in_file_db, add_file_record_and_relationship, retrieve_digital_file_names # for database operations
|
from db_utils import get_db_connection, check_inventory_in_db, check_objkey_in_file_db, add_file_record_and_relationship, retrieve_digital_file_names # for database operations
|
||||||
|
|
||||||
import config
|
import config
|
||||||
|
|
||||||
# Function to check the existence of related files and validate in PostgreSQL
|
# Function to check the existence of related files and validate in PostgreSQL
|
||||||
def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[], s3_listing_cache=None):
|
||||||
"""
|
"""
|
||||||
Parses the S3 files and performs various operations on them.
|
Parses the S3 files and performs various operations on them.
|
||||||
Args:
|
Args:
|
||||||
s3 (S3): The S3 object for accessing S3 services.
|
s3 (S3): The S3 object for accessing S3 services.
|
||||||
s3_files (list): The list of S3 files to be processed.
|
s3_files (list): The list of S3 files to be processed.
|
||||||
|
s3_listing_cache (dict, optional): Mapping of S3 keys to listing objects.
|
||||||
Returns:
|
Returns:
|
||||||
None
|
tuple: (uploaded_files_count, warning_files_count, error_files_count)
|
||||||
Raises:
|
Raises:
|
||||||
FileNotFoundError: If a required file is not found in S3.
|
FileNotFoundError: If a required file is not found in S3.
|
||||||
ValueError: If a file has zero size or if the file type is unsupported.
|
ValueError: If a file has zero size or if the file type is unsupported.
|
||||||
|
|
@ -69,9 +70,42 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
||||||
# Display progress to console only (not written to log files)
|
# Display progress to console only (not written to log files)
|
||||||
print(f"--------------\n--- file {idx} of {total_files} ---\n--------------", flush=True)
|
print(f"--------------\n--- file {idx} of {total_files} ---\n--------------", flush=True)
|
||||||
|
|
||||||
# Use a savepoint per file to allow rollback on individual failures
|
# Ensure we start each file with a clean transaction state.
|
||||||
# without aborting the full batch.
|
try:
|
||||||
cur.execute("SAVEPOINT file_save")
|
conn.rollback()
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Rollback failed before processing {file}: {e}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
cur.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
cur = conn.cursor()
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Use a savepoint per file to allow rollback on individual failures
|
||||||
|
# without aborting the full batch.
|
||||||
|
cur.execute("SAVEPOINT file_save")
|
||||||
|
except Exception as e:
|
||||||
|
# If the transaction is aborted, log and retry once.
|
||||||
|
import traceback
|
||||||
|
logging.error(f"Transaction aborted before processing {file}, retrying after reset: {e}")
|
||||||
|
logging.error(traceback.format_exc())
|
||||||
|
|
||||||
|
try:
|
||||||
|
conn.rollback()
|
||||||
|
except Exception as rollback_err:
|
||||||
|
logging.error(f"Rollback failed while recovering from aborted transaction: {rollback_err}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
cur.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
cur = conn.cursor()
|
||||||
|
|
||||||
|
# Retry savepoint once after recovery
|
||||||
|
cur.execute("SAVEPOINT file_save")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if file.endswith(('.mp4', '.mp3')): # Check for both .mp4 and .mp3
|
if file.endswith(('.mp4', '.mp3')): # Check for both .mp4 and .mp3
|
||||||
logging.info("Processing file: %s in the bucket: %s", file, bucket_name)
|
logging.info("Processing file: %s in the bucket: %s", file, bucket_name)
|
||||||
|
|
@ -130,7 +164,13 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Retrieve and log the file size
|
# Retrieve and log the file size
|
||||||
file_size = get_file_size(s3, bucket_name, file)
|
# Optimized: Check cache first
|
||||||
|
if s3_listing_cache and file in s3_listing_cache:
|
||||||
|
file_size = s3_listing_cache[file].get('Size')
|
||||||
|
logging.info(f"Retrieved file size from cache for: {file}")
|
||||||
|
else:
|
||||||
|
file_size = get_file_size(s3, bucket_name, file)
|
||||||
|
|
||||||
# Maybe we can throw an error inside the get_file_size function and catch it here
|
# Maybe we can throw an error inside the get_file_size function and catch it here
|
||||||
if file_size is not None:
|
if file_size is not None:
|
||||||
ach_variables['media_disk_size'] = file_size
|
ach_variables['media_disk_size'] = file_size
|
||||||
|
|
@ -142,12 +182,12 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
||||||
logging.error("ACH_SAFE_RUN=true: aborting Phase 3 due to warnings (missing file size): %s", file)
|
logging.error("ACH_SAFE_RUN=true: aborting Phase 3 due to warnings (missing file size): %s", file)
|
||||||
raise ValueError("ACH_SAFE_RUN=true: aborting due to warnings in Phase 3")
|
raise ValueError("ACH_SAFE_RUN=true: aborting due to warnings in Phase 3")
|
||||||
continue # Skip to the next file in the loop
|
continue # Skip to the next file in the loop
|
||||||
|
|
||||||
logging.info("Start Validating files for %s...", base_name)
|
logging.info("Start Validating files for %s...", base_name)
|
||||||
# Check if the related files exist and retrieve the .pdf file size
|
# Check if the related files exist and retrieve the .pdf file size
|
||||||
try:
|
try:
|
||||||
# Check if the required files exist in S3
|
# Check if the required files exist in S3
|
||||||
ach_variables['pdf_disk_size'] = check_related_files(s3, file_name_with_path, file, bucket_name)
|
ach_variables['pdf_disk_size'] = check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=s3_listing_cache)
|
||||||
logging.info(f"PDF disk size: {ach_variables['pdf_disk_size']}")
|
logging.info(f"PDF disk size: {ach_variables['pdf_disk_size']}")
|
||||||
except FileNotFoundError as e:
|
except FileNotFoundError as e:
|
||||||
# Handle case where the file is not found
|
# Handle case where the file is not found
|
||||||
|
|
@ -166,7 +206,10 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
||||||
continue # Move on to the next file in the loop
|
continue # Move on to the next file in the loop
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Handle any other exceptions
|
# Handle any other exceptions
|
||||||
logging.error(f"An error occurred: {e}")
|
logging.error(f"Validation step failed for {file}: {e}")
|
||||||
|
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||||
|
error_files_count += 1
|
||||||
|
continue
|
||||||
|
|
||||||
# Retrieve the file contents for related files: .md5, .json
|
# Retrieve the file contents for related files: .md5, .json
|
||||||
try:
|
try:
|
||||||
|
|
@ -176,6 +219,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Log the error
|
# Log the error
|
||||||
logging.error(f"Error retrieving file contents for {file_name_with_path}: {e}")
|
logging.error(f"Error retrieving file contents for {file_name_with_path}: {e}")
|
||||||
|
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||||
file_contents = None # Set file_contents to None or handle it as needed
|
file_contents = None # Set file_contents to None or handle it as needed
|
||||||
error_files_count +=1
|
error_files_count +=1
|
||||||
continue # Move on to the next file in the loop
|
continue # Move on to the next file in the loop
|
||||||
|
|
@ -183,6 +227,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
||||||
# If the contents don't exist
|
# If the contents don't exist
|
||||||
if file_contents is None:
|
if file_contents is None:
|
||||||
logging.error(f"Error retrieving file contents for {file}.")
|
logging.error(f"Error retrieving file contents for {file}.")
|
||||||
|
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||||
error_files_count +=1
|
error_files_count +=1
|
||||||
continue # Move on to the next file in the loop
|
continue # Move on to the next file in the loop
|
||||||
|
|
||||||
|
|
@ -222,14 +267,17 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
||||||
add_file_record_and_relationship(s3, cur, base_name, ach_variables)
|
add_file_record_and_relationship(s3, cur, base_name, ach_variables)
|
||||||
else:
|
else:
|
||||||
logging.warning(f"File record already exists for {base_name}.")
|
logging.warning(f"File record already exists for {base_name}.")
|
||||||
|
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||||
warning_files_count +=1
|
warning_files_count +=1
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
logging.error(f"Inventory code {base_name} not found in the database.")
|
logging.error(f"Inventory code {base_name} not found in the database.")
|
||||||
|
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||||
error_files_count +=1
|
error_files_count +=1
|
||||||
continue
|
continue
|
||||||
except ValueError as e:
|
except Exception as e:
|
||||||
logging.error(f"An error occurred: {e}")
|
logging.error(f"DB operation failed for {base_name}: {e}")
|
||||||
|
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||||
error_files_count +=1
|
error_files_count +=1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
@ -241,7 +289,9 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
||||||
uploaded_files_count +=1
|
uploaded_files_count +=1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Roll back the changes done for this file only and continue processing others
|
# Roll back the changes done for this file only and continue processing others
|
||||||
|
import traceback
|
||||||
logging.error(f"Error processing {file}: {e}. Rolling back this file's changes.")
|
logging.error(f"Error processing {file}: {e}. Rolling back this file's changes.")
|
||||||
|
logging.error(traceback.format_exc())
|
||||||
try:
|
try:
|
||||||
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||||
except Exception as rollback_err:
|
except Exception as rollback_err:
|
||||||
|
|
@ -264,8 +314,8 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
||||||
raise e # Raise the exception to the calling function
|
raise e # Raise the exception to the calling function
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Handle any other unexpected errors
|
# Handle any other unexpected errors
|
||||||
logging.error(f"Unexpected error: {e}")
|
import traceback
|
||||||
#handle_error(e) # Pass unexpected errors to handle_error
|
notify_error("FATAL ERROR in Phase 3 process", e)
|
||||||
raise e # Raise the exception to the calling function
|
raise e # Raise the exception to the calling function
|
||||||
|
|
||||||
# return the file saved
|
# return the file saved
|
||||||
|
|
@ -294,15 +344,54 @@ def create_s3_client(aws_config):
|
||||||
|
|
||||||
# Function to list the contents of an S3 bucket
|
# Function to list the contents of an S3 bucket
|
||||||
def list_s3_bucket(s3_client, bucket_name):
|
def list_s3_bucket(s3_client, bucket_name):
|
||||||
|
"""
|
||||||
|
Lists S3 bucket contents with optional local JSON caching.
|
||||||
|
Uses ACH_CACHE_S3_LIST from .env to decide if it should cache/read from 's3_cache.json'.
|
||||||
|
"""
|
||||||
|
cache_enabled = os.getenv('ACH_CACHE_S3_LIST', 'false').lower() == 'true'
|
||||||
|
cache_file = 's3_cache.json'
|
||||||
|
|
||||||
|
if cache_enabled and os.path.exists(cache_file):
|
||||||
|
try:
|
||||||
|
with open(cache_file, 'r', encoding='utf-8') as f:
|
||||||
|
cached_data = json.load(f)
|
||||||
|
logging.info(f"Loaded {len(cached_data)} items from local cache: {cache_file}")
|
||||||
|
return cached_data
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"Failed to read S3 cache file: {e}. Falling back to S3 listing.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
logging.info(f"Listing all objects in bucket: {bucket_name}...")
|
||||||
paginator = s3_client.get_paginator('list_objects_v2')
|
paginator = s3_client.get_paginator('list_objects_v2')
|
||||||
bucket_contents = []
|
bucket_contents = []
|
||||||
|
|
||||||
|
# Convert datetime objects to string for JSON serialization
|
||||||
|
def _serialize_datetime(obj):
|
||||||
|
if isinstance(obj, datetime):
|
||||||
|
return obj.isoformat()
|
||||||
|
return obj
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
for page in paginator.paginate(Bucket=bucket_name):
|
for page in paginator.paginate(Bucket=bucket_name):
|
||||||
if 'Contents' in page:
|
if 'Contents' in page:
|
||||||
bucket_contents.extend(page['Contents'])
|
for obj in page['Contents']:
|
||||||
|
# Normalize dates for JSON compatibility
|
||||||
|
if 'LastModified' in obj:
|
||||||
|
obj['LastModified'] = obj['LastModified'].isoformat()
|
||||||
|
bucket_contents.extend([obj])
|
||||||
|
|
||||||
logging.info(f"Retrieved {len(bucket_contents)} items from the bucket.")
|
logging.info(f"Retrieved {len(bucket_contents)} items from the bucket.")
|
||||||
|
|
||||||
|
# Save to cache if enabled
|
||||||
|
if cache_enabled:
|
||||||
|
try:
|
||||||
|
with open(cache_file, 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(bucket_contents, f)
|
||||||
|
logging.info(f"S3 bucket listing saved to local cache: {cache_file}")
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"Failed to write S3 cache file: {e}")
|
||||||
|
|
||||||
return bucket_contents
|
return bucket_contents
|
||||||
except ClientError as e:
|
except ClientError as e:
|
||||||
logging.error(f'Error listing bucket contents: {e}')
|
logging.error(f'Error listing bucket contents: {e}')
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue