Version 2.1
This commit is contained in:
parent
b90b7bf3e2
commit
81639a87b5
|
|
@ -0,0 +1,52 @@
|
|||
# AWS credentials (replace with your own credentials)
|
||||
AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY_ID
|
||||
AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_ACCESS_KEY
|
||||
AWS_REGION=us-east-1
|
||||
AWS_ENDPOINT_URL=https://s3.your-provider.example.com
|
||||
|
||||
BUCKET_NAME=your-bucket-name
|
||||
|
||||
# Database credentials (replace with your own database info)
|
||||
#DB_HOST=your-db-host
|
||||
#DB_NAME=your_db_name
|
||||
#DB_USER=your_db_user
|
||||
#DB_PASSWORD=your_db_password
|
||||
#DB_PORT=5432
|
||||
|
||||
# Example local development database
|
||||
DB_HOST=127.0.0.1
|
||||
DB_NAME=your_local_db_name
|
||||
# DB_NAME=artchive_production
|
||||
DB_USER=your_local_db_user
|
||||
DB_PASSWORD=your_local_db_password
|
||||
DB_PORT=5432
|
||||
|
||||
# LOGS FILE
|
||||
LOG_FILE_PATH="./logs/ACH_media_import_errors.log"
|
||||
ERROR_LOG_FILE_PATH="./logs/ACH_media_import_critical_errors.log"
|
||||
WARING_LOG_FILE_PATH="./logs/ACH_media_import_warnings.log"
|
||||
INFO_LOG_FILE_PATH="./logs/ACH_media_import_info.log"
|
||||
|
||||
# Email configuration (replace with your SMTP settings)
|
||||
SMTP_SERVER=smtp.example.com
|
||||
SMTP_PORT=587
|
||||
SMTP_USER=your-smtp-user
|
||||
SMTP_PASSWORD="your-smtp-password"
|
||||
SENDER_EMAIL=sender@example.com
|
||||
EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
|
||||
ERROR_EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
|
||||
SUCCESS_EMAIL_RECIPIENTS="recipient1@example.com,recipient2@example.com"
|
||||
|
||||
# ACH configuration
|
||||
ACH_ENV="development" # "production" or "development"
|
||||
ACH_DRY_RUN="true"
|
||||
ACH_SAFE_RUN="true"
|
||||
ACH_CACHE_S3_LIST="true"
|
||||
|
||||
ACH_SYNC_CHUNK_SIZE=10 # in % of total files to import, from media_files_to_process used to determine the number of files to process in each batch
|
||||
|
||||
ACH_EDITOR_ID=1
|
||||
ACH_APPROVER_ID=1
|
||||
ACH_NOTES="Imported automatically from the S3 bucket"
|
||||
ACH_STORAGE_LOCATION='{"storage_type": "lto", "storage_location_id": 6}'
|
||||
ACH_FILE_TYPE='{"type": "video/mov"}'
|
||||
|
|
@ -6,4 +6,226 @@ __pycache__/
|
|||
*.pyo
|
||||
logs/
|
||||
*.logs
|
||||
*.log
|
||||
*.log
|
||||
|
||||
output/
|
||||
|
||||
TODO-mime.md
|
||||
.github/copilot-instructions.md
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[codz]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py.cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
# Pipfile.lock
|
||||
|
||||
# UV
|
||||
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# uv.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
# poetry.lock
|
||||
# poetry.toml
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
||||
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
||||
# pdm.lock
|
||||
# pdm.toml
|
||||
.pdm-python
|
||||
.pdm-build/
|
||||
|
||||
# pixi
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
||||
# pixi.lock
|
||||
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
||||
# in the .venv directory. It is recommended not to include this directory in version control.
|
||||
.pixi
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# Redis
|
||||
*.rdb
|
||||
*.aof
|
||||
*.pid
|
||||
|
||||
# RabbitMQ
|
||||
mnesia/
|
||||
rabbitmq/
|
||||
rabbitmq-data/
|
||||
|
||||
# ActiveMQ
|
||||
activemq-data/
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.envrc
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
# .idea/
|
||||
|
||||
# Abstra
|
||||
# Abstra is an AI-powered process automation framework.
|
||||
# Ignore directories containing user credentials, local state, and settings.
|
||||
# Learn more at https://abstra.io/docs
|
||||
.abstra/
|
||||
|
||||
# Visual Studio Code
|
||||
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
||||
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
||||
# you could uncomment the following to ignore the entire vscode folder
|
||||
# .vscode/
|
||||
|
||||
# Ruff stuff:
|
||||
.ruff_cache/
|
||||
|
||||
# PyPI configuration file
|
||||
.pypirc
|
||||
|
||||
# Marimo
|
||||
marimo/_static/
|
||||
marimo/_lsp/
|
||||
__marimo__/
|
||||
|
||||
# Streamlit
|
||||
.streamlit/secrets.toml
|
||||
12
README.md
12
README.md
|
|
@ -95,6 +95,18 @@ docker compose up -d --build
|
|||
docker compose logs -f app
|
||||
```
|
||||
|
||||
### Run inside the container (from the host)
|
||||
|
||||
If you want to execute the importer manually inside the running container (for debugging or one-off runs), you can use either of the following:
|
||||
|
||||
```bash
|
||||
# Using docker compose (recommended)
|
||||
docker compose exec app python /app/main.py
|
||||
|
||||
# Or using docker exec with the container name
|
||||
docker exec -it ACH_server_media_importer python /app/main.py
|
||||
```
|
||||
|
||||
### Stop
|
||||
|
||||
```bash
|
||||
|
|
|
|||
32
db_utils.py
32
db_utils.py
|
|
@ -30,7 +30,8 @@ from psycopg2 import sql
|
|||
import logging
|
||||
from datetime import datetime
|
||||
import re
|
||||
from email_utils import handle_error
|
||||
from error_handler import notify_error
|
||||
from utils import check_audio_info
|
||||
import json
|
||||
import os
|
||||
import config
|
||||
|
|
@ -73,8 +74,24 @@ def get_mime_from_mediainfo(ach_variables: dict) -> str:
|
|||
mediainfo = ach_variables.get('custom_data_in', {}).get('mediainfo', {})
|
||||
tracks = mediainfo.get('media', {}).get('track', [])
|
||||
|
||||
# ---- Master (outside FILE/) must be ProRes ----
|
||||
# ---- Master (outside FILE/) ----
|
||||
if not is_file_folder:
|
||||
# Determine if this is an audio-type inventory code (OA4/MCC/DAT) or video master.
|
||||
inventory_code = ach_variables.get('inventory_code', '') or ''
|
||||
inventory_type = inventory_code[3:6] if len(inventory_code) >= 6 else ''
|
||||
audio_inventory_types = {'OA4', 'MCC', 'DAT'}
|
||||
|
||||
if inventory_type in audio_inventory_types:
|
||||
# For audio masters we validate the audio metadata (not ProRes video tracks).
|
||||
result, message = check_audio_info(mediainfo)
|
||||
if not result:
|
||||
raise ValueError(f"Audio validation failed: {message}")
|
||||
|
||||
# Derive MIME from extension; fall back to the configured mapping.
|
||||
extension = os.path.splitext(file_fullpath_norm)[1].lower()
|
||||
return get_mime_for_extension(extension or ach_variables.get('extension'))
|
||||
|
||||
# Otherwise, enforce a ProRes video track for video masters.
|
||||
# Find the video track
|
||||
video_track = None
|
||||
for t in tracks:
|
||||
|
|
@ -250,16 +267,15 @@ def check_inventory_in_db(s3_client, cur, base_name):
|
|||
|
||||
if result:
|
||||
logging.info(f"Inventory code {truncated_base_name[:12]} found in the database.")
|
||||
# Call the function to retrieve digital file names, assuming this function is implemented
|
||||
return True, truncated_base_name
|
||||
else:
|
||||
logging.info(f"Inventory code {truncated_base_name} not found in the database.")
|
||||
handle_error(f"Inventory code {truncated_base_name} not found in the database.")
|
||||
notify_error(f"Inventory code {truncated_base_name} not found in the database.")
|
||||
#raise ValueError(f"Inventory code {truncated_base_name} not found in the database.")
|
||||
return False, None
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f'Error checking inventory code {base_name}:', {e})
|
||||
notify_error(f'Error checking inventory code {base_name}', e)
|
||||
raise e
|
||||
|
||||
# Function to check if the object key exists in the database
|
||||
|
|
@ -296,7 +312,7 @@ def check_objkey_in_file_db(cur, base_name):
|
|||
return False
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Error checking inventory code {base_name}: {e}")
|
||||
notify_error(f"Error checking inventory code {base_name}", e)
|
||||
raise e
|
||||
|
||||
# Function to add a file record and its relationship to the support record
|
||||
|
|
@ -333,7 +349,7 @@ def add_file_record_and_relationship(s3_client, cur, base_name,ach_variables):
|
|||
notes = f"{ach_config.get('ach_notes','') } {date_part} {time_part}"
|
||||
|
||||
ach_variables['file_copia_conservativa'] = ach_variables['custom_data_in'].get('mediainfo', {}).get("media", {}).get("@ref", "")
|
||||
logging.info(f"ach_variables['file_copia_conservativa']a: {ach_variables['file_copia_conservativa']}")
|
||||
logging.info(f"ach_variables['file_copia_conservativa']: {ach_variables['file_copia_conservativa']}")
|
||||
|
||||
logging.debug("Executing add_file_record_and_relationship")
|
||||
|
||||
|
|
@ -441,7 +457,7 @@ def add_file_record_and_relationship(s3_client, cur, base_name,ach_variables):
|
|||
return False
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f'Error adding file record and relationship: {e}')
|
||||
notify_error(f"Error adding file record and relationship: {base_name}", e)
|
||||
raise e
|
||||
|
||||
# Function to add a file record
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
services:
|
||||
app:
|
||||
build: .
|
||||
container_name: ACH_server_media_importer
|
||||
container_name: ACH_server_media_importer02
|
||||
volumes:
|
||||
- logs:/app/logs # Add this line to map the logs volume
|
||||
env_file:
|
||||
|
|
|
|||
|
|
@ -17,6 +17,18 @@ def handle_value_error(e):
|
|||
def handle_error(error_message):
    """
    Log an error message at ERROR level with an "Error: " prefix.

    Args:
        error_message: Text (or any object with a string form) describing
            the failure.

    Returns:
        None.
    """
    # Build the final text first so the logging call receives a plain string.
    prefixed_message = f"Error: {error_message}"
    logging.error(prefixed_message)
|
||||
|
||||
def notify_error(error_message, e=None):
    """
    Centralized error reporting: logs the error and triggers email notification.

    Args:
        error_message: Human-readable description of what failed.
        e: Optional exception (or other detail) associated with the failure.
            When given, its string form is appended to the logged message and
            the object itself is handed to the email notifier; otherwise a
            plain Exception wrapping the message is sent.

    Returns:
        None. A failure while sending the notification is logged, not raised.
    """
    # Test against None explicitly (not truthiness) so a present-but-falsy
    # detail is still included in the log line. This keeps the log consistent
    # with the `e is None` check that selects what is sent to the notifier.
    full_message = error_message if e is None else f"{error_message}: {e}"
    logging.error(full_message)
    try:
        # Imported at call time rather than at module level; presumably to
        # avoid a circular import with email_utils -- TODO confirm.
        from email_utils import handle_error as send_notification
        send_notification(Exception(full_message) if e is None else e)
    except Exception as notify_err:
        # Best effort: reporting an error must not raise a new one.
        logging.error(f"Failed to trigger email notification: {notify_err}")
|
||||
|
||||
class ClientError(Exception):
|
||||
"""Custom exception class for client errors."""
|
||||
pass
|
||||
|
|
@ -72,7 +72,7 @@ def retrieve_file_contents(s3, base_name):
|
|||
logging.error(f'Error formatting file contents as JSON: {e}', exc_info=True)
|
||||
raise e
|
||||
|
||||
def check_related_files(s3, file_name_with_path, file, bucket_name):
|
||||
def check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=None):
|
||||
"""
|
||||
Check for related files in S3 based on the given file type.
|
||||
Parameters:
|
||||
|
|
@ -80,6 +80,7 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
|
|||
- file_name_with_path: The name of the file with its path.
|
||||
- file: The file name.
|
||||
- bucket_name: The name of the S3 bucket.
|
||||
- s3_listing_cache: Optional mapping of S3 keys to listing objects.
|
||||
Returns:
|
||||
None
|
||||
Raises:
|
||||
|
|
@ -111,7 +112,14 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
|
|||
logging.info(f"Checking for related file: {related_file}")
|
||||
|
||||
try:
|
||||
if not check_file_exists_in_s3(s3, related_file,bucket_name):
|
||||
# Optimized existence check
|
||||
exists = False
|
||||
if s3_listing_cache:
|
||||
exists = related_file in s3_listing_cache
|
||||
else:
|
||||
exists = check_file_exists_in_s3(s3, related_file, bucket_name)
|
||||
|
||||
if not exists:
|
||||
error_message = f"Required file {related_file} not found in S3."
|
||||
logging.error(error_message)
|
||||
raise FileNotFoundError(error_message)
|
||||
|
|
@ -126,10 +134,15 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
|
|||
|
||||
# Check the size of the related file
|
||||
try:
|
||||
if ext in ['json', 'md5', 'pdf']:
|
||||
file_size = get_file_size(s3, bucket_name, related_file)
|
||||
if file_size == 0:
|
||||
error_message = f"File {related_file} has zero size."
|
||||
if ext in ['json', 'md5', 'pdf']:
|
||||
# Optimized size check
|
||||
if s3_listing_cache and related_file in s3_listing_cache:
|
||||
file_size = s3_listing_cache[related_file].get('Size')
|
||||
else:
|
||||
file_size = get_file_size(s3, bucket_name, related_file)
|
||||
|
||||
if file_size == 0 or file_size is None:
|
||||
error_message = f"File {related_file} has zero size or missing."
|
||||
logging.error(error_message)
|
||||
raise ValueError(error_message)
|
||||
else:
|
||||
|
|
@ -143,8 +156,17 @@ def check_related_files(s3, file_name_with_path, file, bucket_name):
|
|||
# If the required file is a .pdf, get its size and update ach_pdf_disk_size
|
||||
if ext =='pdf':
|
||||
pdf_file = f"{file_name_with_path}.pdf"
|
||||
if check_file_exists_in_s3(s3, pdf_file,bucket_name):
|
||||
pdf_file_size = get_file_size(s3, bucket_name, pdf_file)
|
||||
pdf_exists = False
|
||||
if s3_listing_cache:
|
||||
pdf_exists = pdf_file in s3_listing_cache
|
||||
else:
|
||||
pdf_exists = check_file_exists_in_s3(s3, pdf_file, bucket_name)
|
||||
|
||||
if pdf_exists:
|
||||
if s3_listing_cache and pdf_file in s3_listing_cache:
|
||||
pdf_file_size = s3_listing_cache[pdf_file].get('Size')
|
||||
else:
|
||||
pdf_file_size = get_file_size(s3, bucket_name, pdf_file)
|
||||
ach_pdf_disk_size = pdf_file_size
|
||||
# logging.info(f"PDF disk size: {ach_pdf_disk_size}")
|
||||
else:
|
||||
|
|
|
|||
84
main.py
84
main.py
|
|
@ -5,6 +5,7 @@ from datetime import datetime
|
|||
import pytz
|
||||
import os
|
||||
import re
|
||||
import math
|
||||
from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
|
||||
from email_utils import handle_error, send_email_with_attachment
|
||||
from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
|
||||
|
|
@ -90,17 +91,24 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
|||
# List S3 bucket s3_validated_contents
|
||||
list_s3_files = list_s3_bucket(s3_client, bucket_name)
|
||||
|
||||
# Build a quick in-memory map (cache) of the bucket listing.
|
||||
# This will be used exclusively for metadata lookups (size/existence)
|
||||
# to avoid redundant S3 network calls, without changing the main logic.
|
||||
s3_listing_cache = {obj['Key']: obj for obj in list_s3_files}
|
||||
|
||||
# Define valid extensions and excluded folders
|
||||
# NOTE: This list is used only for the initial S3 filtering step (Phase 1).
|
||||
# It determines which object keys are considered for further processing.
|
||||
valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'}
|
||||
valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'} # dont like this
|
||||
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/'}
|
||||
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/', 'DVD/', 'UMT/'}
|
||||
excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'UMT/'}
|
||||
excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/', 'FILE/' ,'MCC/'}
|
||||
# excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'TST/',}
|
||||
# included_folders = {'FILE/'} # uncomment this to NOT use excluded folders
|
||||
# included_folders = {'TEST-FOLDER-DEV/'} # uncomment this to NOT use excluded folders
|
||||
|
||||
# TODO: add a distinct list and count of the file extensions before any filter is applied
|
||||
|
||||
# Extract and filter file names
|
||||
|
||||
# s3_file_names: include only files that match valid extensions and
|
||||
|
|
@ -112,7 +120,26 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
|||
except NameError:
|
||||
use_included = False
|
||||
|
||||
logging.info(f"Filtering S3 objects with valid extensions: {valid_extensions}")
|
||||
if use_included:
|
||||
logging.info(f"Using include-folder filter: {included_folders}")
|
||||
filter_mode = "include"
|
||||
else:
|
||||
logging.info(f"Using exclusion-folder filter: {excluded_folders}")
|
||||
filter_mode = "exclude"
|
||||
|
||||
# Ask for confirmation before proceeding (y/N). If user declines, exit cleanly.
|
||||
try:
|
||||
answer = input(f"Proceed using '{filter_mode}' filter mode? (y/N): ").strip().lower()
|
||||
except Exception:
|
||||
answer = 'n'
|
||||
|
||||
if answer != 'y':
|
||||
logging.info("User chose not to proceed with the current filter mode. Exiting.")
|
||||
return
|
||||
|
||||
if use_included:
|
||||
|
||||
s3_file_names = [
|
||||
content['Key'] for content in list_s3_files
|
||||
if any(content['Key'].endswith(ext) for ext in valid_extensions)
|
||||
|
|
@ -127,6 +154,19 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
|||
]
|
||||
logging.info("Using excluded_folders filter")
|
||||
|
||||
# Count file extensions that survived the initial filtering.
|
||||
# This provides a stable "perma" summary of what is being considered
|
||||
# for the rest of the workflow.
|
||||
from collections import Counter
|
||||
extension_counts = Counter(
|
||||
os.path.splitext(f)[1].lower() or "(no_ext)" for f in s3_file_names
|
||||
)
|
||||
# Log a user-friendly multi-line summary instead of a single dict dump
|
||||
extension_summary = "\n".join(
|
||||
f" {ext or '(no_ext)'}: {count}" for ext, count in sorted(extension_counts.items())
|
||||
)
|
||||
logging.info("Extension counts after initial filtering:\n%s", extension_summary)
|
||||
|
||||
# check inventory code syntax
|
||||
# first check s3_file_names if the file base name and folder name match pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$'
|
||||
s3_validated_contents = []
|
||||
|
|
@ -207,11 +247,16 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
|||
filtered_file_names=list_s3_not_in_db(s3_validated_contents, db_file_names, db_sidecar_basenames)
|
||||
|
||||
|
||||
# Produces only media files for Phase 3 parsing.
|
||||
# Sidecars (.json, .md5, .pdf) are validated as dependencies of the media files.
|
||||
media_files_to_process = [f for f in filtered_file_names if f.lower().endswith(('.mp4', '.mp3'))]
|
||||
|
||||
# Print the total number of files
|
||||
total_files_s3 = len(s3_validated_contents)
|
||||
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
|
||||
total_files = len(filtered_file_names)
|
||||
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")
|
||||
logging.info(f"Total media files (.mp4, .mp3) to process in Phase 3: {len(media_files_to_process)}")
|
||||
|
||||
# Log the files that need to be updated (those not yet in DB)
|
||||
if total_files > 0:
|
||||
|
|
@ -329,11 +374,44 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
|||
# ---------------------------------------------------------------------
|
||||
logging.info("PHASE 3: Parse S3 objects and insert new records into the database")
|
||||
|
||||
# Implement ACH_SYNC_CHUNK_SIZE (dev/testing only).
|
||||
# Production always processes 100%.
|
||||
# ach_sync_pct = 100
|
||||
# if ach_env != 'production':
|
||||
# try:
|
||||
# ach_sync_pct = int(os.getenv('ACH_SYNC_CHUNK_SIZE', '100'))
|
||||
# except Exception:
|
||||
# ach_sync_pct = 100
|
||||
# ach_sync_pct = max(0, min(100, ach_sync_pct))
|
||||
|
||||
# total_media_files = len(media_files_to_process)
|
||||
# if total_media_files > 0 and ach_sync_pct < 100:
|
||||
# sync_limit = max(1, math.ceil(total_media_files * ach_sync_pct / 100))
|
||||
# logging.info(
|
||||
# "ACH_SYNC_CHUNK_SIZE enabled: processing %s/%s media files (%s%%)",
|
||||
# sync_limit,
|
||||
# total_media_files,
|
||||
# ach_sync_pct,
|
||||
# )
|
||||
# media_files_to_process = media_files_to_process[:sync_limit]
|
||||
# else:
|
||||
# logging.info(
|
||||
# "Processing all %s media files (ACH_SYNC_CHUNK_SIZE=%s%%)",
|
||||
# total_media_files,
|
||||
# ach_sync_pct,
|
||||
# )
|
||||
|
||||
# Try to parse S3 files
|
||||
try:
|
||||
# If DRY RUN is set to True, the files will not be uploaded to the database
|
||||
if os.getenv('ACH_DRY_RUN', 'true') == 'false':
|
||||
uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(s3_client, filtered_file_names, ach_variables, excluded_folders)
|
||||
uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(
|
||||
s3_client,
|
||||
media_files_to_process,
|
||||
ach_variables,
|
||||
excluded_folders,
|
||||
s3_listing_cache=s3_listing_cache
|
||||
)
|
||||
else:
|
||||
logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
|
||||
# set the tuples to zero
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
119
s3_utils.py
119
s3_utils.py
|
|
@ -7,20 +7,21 @@ import psycopg2 # for PostgreSQL
|
|||
|
||||
# Import custom modules
|
||||
from file_utils import retrieve_file_contents, check_related_files, extract_and_validate_file_info # for file operations
|
||||
from email_utils import handle_error # for error handling (deprecated?)
|
||||
from error_handler import notify_error
|
||||
from db_utils import get_db_connection, check_inventory_in_db, check_objkey_in_file_db, add_file_record_and_relationship, retrieve_digital_file_names # for database operations
|
||||
|
||||
import config
|
||||
|
||||
# Function to check the existence of related files and validate in PostgreSQL
|
||||
def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
||||
def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[], s3_listing_cache=None):
|
||||
"""
|
||||
Parses the S3 files and performs various operations on them.
|
||||
Args:
|
||||
s3 (S3): The S3 object for accessing S3 services.
|
||||
s3_files (list): The list of S3 files to be processed.
|
||||
s3_listing_cache (dict, optional): Mapping of S3 keys to listing objects.
|
||||
Returns:
|
||||
None
|
||||
tuple: (uploaded_files_count, warning_files_count, error_files_count)
|
||||
Raises:
|
||||
FileNotFoundError: If a required file is not found in S3.
|
||||
ValueError: If a file has zero size or if the file type is unsupported.
|
||||
|
|
@ -69,9 +70,42 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
|||
# Display progress to console only (not written to log files)
|
||||
print(f"--------------\n--- file {idx} of {total_files} ---\n--------------", flush=True)
|
||||
|
||||
# Use a savepoint per file to allow rollback on individual failures
|
||||
# without aborting the full batch.
|
||||
cur.execute("SAVEPOINT file_save")
|
||||
# Ensure we start each file with a clean transaction state.
|
||||
try:
|
||||
conn.rollback()
|
||||
except Exception as e:
|
||||
logging.error(f"Rollback failed before processing {file}: {e}")
|
||||
|
||||
try:
|
||||
cur.close()
|
||||
except Exception:
|
||||
pass
|
||||
cur = conn.cursor()
|
||||
|
||||
try:
|
||||
# Use a savepoint per file to allow rollback on individual failures
|
||||
# without aborting the full batch.
|
||||
cur.execute("SAVEPOINT file_save")
|
||||
except Exception as e:
|
||||
# If the transaction is aborted, log and retry once.
|
||||
import traceback
|
||||
logging.error(f"Transaction aborted before processing {file}, retrying after reset: {e}")
|
||||
logging.error(traceback.format_exc())
|
||||
|
||||
try:
|
||||
conn.rollback()
|
||||
except Exception as rollback_err:
|
||||
logging.error(f"Rollback failed while recovering from aborted transaction: {rollback_err}")
|
||||
|
||||
try:
|
||||
cur.close()
|
||||
except Exception:
|
||||
pass
|
||||
cur = conn.cursor()
|
||||
|
||||
# Retry savepoint once after recovery
|
||||
cur.execute("SAVEPOINT file_save")
|
||||
|
||||
try:
|
||||
if file.endswith(('.mp4', '.mp3')): # Check for both .mp4 and .mp3
|
||||
logging.info("Processing file: %s in the bucket: %s", file, bucket_name)
|
||||
|
|
@ -130,7 +164,13 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
|||
|
||||
try:
|
||||
# Retrieve and log the file size
|
||||
file_size = get_file_size(s3, bucket_name, file)
|
||||
# Optimized: Check cache first
|
||||
if s3_listing_cache and file in s3_listing_cache:
|
||||
file_size = s3_listing_cache[file].get('Size')
|
||||
logging.info(f"Retrieved file size from cache for: {file}")
|
||||
else:
|
||||
file_size = get_file_size(s3, bucket_name, file)
|
||||
|
||||
# TODO: consider raising an error inside get_file_size and catching it here
|
||||
if file_size is not None:
|
||||
ach_variables['media_disk_size'] = file_size
|
||||
|
|
@ -142,12 +182,12 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
|||
logging.error("ACH_SAFE_RUN=true: aborting Phase 3 due to warnings (missing file size): %s", file)
|
||||
raise ValueError("ACH_SAFE_RUN=true: aborting due to warnings in Phase 3")
|
||||
continue # Skip to the next file in the loop
|
||||
|
||||
|
||||
logging.info("Start Validating files for %s...", base_name)
|
||||
# Check that the related files exist and retrieve the .pdf file size
|
||||
try:
|
||||
# Check if the required files exist in S3
|
||||
ach_variables['pdf_disk_size'] = check_related_files(s3, file_name_with_path, file, bucket_name)
|
||||
ach_variables['pdf_disk_size'] = check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=s3_listing_cache)
|
||||
logging.info(f"PDF disk size: {ach_variables['pdf_disk_size']}")
|
||||
except FileNotFoundError as e:
|
||||
# Handle case where the file is not found
|
||||
|
|
@ -166,7 +206,10 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
|||
continue # Move on to the next file in the loop
|
||||
except Exception as e:
|
||||
# Handle any other exceptions
|
||||
logging.error(f"An error occurred: {e}")
|
||||
logging.error(f"Validation step failed for {file}: {e}")
|
||||
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||
error_files_count += 1
|
||||
continue
|
||||
|
||||
# Retrieve the file contents for related files: .md5, .json
|
||||
try:
|
||||
|
|
@ -176,6 +219,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
|||
except Exception as e:
|
||||
# Log the error
|
||||
logging.error(f"Error retrieving file contents for {file_name_with_path}: {e}")
|
||||
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||
file_contents = None # Set file_contents to None or handle it as needed
|
||||
error_files_count +=1
|
||||
continue # Move on to the next file in the loop
|
||||
|
|
@ -183,6 +227,7 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
|||
# If the contents do not exist
|
||||
if file_contents is None:
|
||||
logging.error(f"Error retrieving file contents for {file}.")
|
||||
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||
error_files_count +=1
|
||||
continue # Move on to the next file in the loop
|
||||
|
||||
|
|
@ -222,14 +267,17 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
|||
add_file_record_and_relationship(s3, cur, base_name, ach_variables)
|
||||
else:
|
||||
logging.warning(f"File record already exists for {base_name}.")
|
||||
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||
warning_files_count +=1
|
||||
continue
|
||||
else:
|
||||
logging.error(f"Inventory code {base_name} not found in the database.")
|
||||
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||
error_files_count +=1
|
||||
continue
|
||||
except ValueError as e:
|
||||
logging.error(f"An error occurred: {e}")
|
||||
except Exception as e:
|
||||
logging.error(f"DB operation failed for {base_name}: {e}")
|
||||
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||
error_files_count +=1
|
||||
continue
|
||||
|
||||
|
|
@ -241,7 +289,9 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
|||
uploaded_files_count +=1
|
||||
except Exception as e:
|
||||
# Roll back the changes done for this file only and continue processing others
|
||||
import traceback
|
||||
logging.error(f"Error processing {file}: {e}. Rolling back this file's changes.")
|
||||
logging.error(traceback.format_exc())
|
||||
try:
|
||||
cur.execute("ROLLBACK TO SAVEPOINT file_save")
|
||||
except Exception as rollback_err:
|
||||
|
|
@ -264,8 +314,8 @@ def parse_s3_files( s3, s3_files, ach_variables, excluded_folders=[]):
|
|||
raise e # Raise the exception to the calling function
|
||||
except Exception as e:
|
||||
# Handle any other unexpected errors
|
||||
logging.error(f"Unexpected error: {e}")
|
||||
#handle_error(e) # Pass unexpected errors to handle_error
|
||||
import traceback
|
||||
notify_error("FATAL ERROR in Phase 3 process", e)
|
||||
raise e # Raise the exception to the calling function
|
||||
|
||||
# return the file saved
|
||||
|
|
@ -294,15 +344,54 @@ def create_s3_client(aws_config):
|
|||
|
||||
# Function to list the contents of an S3 bucket
|
||||
def list_s3_bucket(s3_client, bucket_name):
|
||||
"""
|
||||
Lists S3 bucket contents with optional local JSON caching.
|
||||
Uses ACH_CACHE_S3_LIST from .env to decide if it should cache/read from 's3_cache.json'.
|
||||
"""
|
||||
cache_enabled = os.getenv('ACH_CACHE_S3_LIST', 'false').lower() == 'true'
|
||||
cache_file = 's3_cache.json'
|
||||
|
||||
if cache_enabled and os.path.exists(cache_file):
|
||||
try:
|
||||
with open(cache_file, 'r', encoding='utf-8') as f:
|
||||
cached_data = json.load(f)
|
||||
logging.info(f"Loaded {len(cached_data)} items from local cache: {cache_file}")
|
||||
return cached_data
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to read S3 cache file: {e}. Falling back to S3 listing.")
|
||||
|
||||
try:
|
||||
logging.info(f"Listing all objects in bucket: {bucket_name}...")
|
||||
paginator = s3_client.get_paginator('list_objects_v2')
|
||||
bucket_contents = []
|
||||
|
||||
# Convert datetime objects to string for JSON serialization
|
||||
def _serialize_datetime(obj):
|
||||
if isinstance(obj, datetime):
|
||||
return obj.isoformat()
|
||||
return obj
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
for page in paginator.paginate(Bucket=bucket_name):
|
||||
if 'Contents' in page:
|
||||
bucket_contents.extend(page['Contents'])
|
||||
for obj in page['Contents']:
|
||||
# Normalize dates for JSON compatibility
|
||||
if 'LastModified' in obj:
|
||||
obj['LastModified'] = obj['LastModified'].isoformat()
|
||||
bucket_contents.extend([obj])
|
||||
|
||||
logging.info(f"Retrieved {len(bucket_contents)} items from the bucket.")
|
||||
|
||||
# Save to cache if enabled
|
||||
if cache_enabled:
|
||||
try:
|
||||
with open(cache_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(bucket_contents, f)
|
||||
logging.info(f"S3 bucket listing saved to local cache: {cache_file}")
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to write S3 cache file: {e}")
|
||||
|
||||
return bucket_contents
|
||||
except ClientError as e:
|
||||
logging.error(f'Error listing bucket contents: {e}')
|
||||
|
|
|
|||
Loading…
Reference in New Issue