# v20251103 - Main script to import media files from S3 to the database
import logging
import time
from datetime import datetime
import pytz
import os
from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
from email_utils import handle_error, send_email_with_attachment
from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
from error_handler import handle_general_error, handle_file_not_found_error, handle_value_error
from file_utils import is_file_empty
from db_utils import count_files, get_distinct_filenames_from_db
from dotenv import load_dotenv
import config
import psycopg2

load_dotenv()


# MAIN PROCESS
def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
    """Scan an S3 bucket and reconcile its media files against the database.

    Lists the bucket, keeps only media files with valid extensions outside
    the excluded folders, compares that set with filenames already stored in
    the database, and logs per-extension counts on both sides.

    NOTE: this currently runs as a DRY RUN — ``parse_s3_files`` is imported
    but never invoked, so no files are actually added to the database.

    Args:
        aws_config: Settings passed to ``create_s3_client``.
        db_config: Keyword arguments for ``psycopg2.connect``.
        ach_config: ACH settings (not used in this function).
        bucket_name: Name of the S3 bucket to scan.
        ach_variables: Extra ACH variables (not used in this function).
    """
    logging.info(f"bucket_name: {bucket_name}")

    # Ensure timing variables are always defined so later error-email logic
    # won't fail if an exception is raised before end_time/elapsed_time is set.
    start_time = time.time()
    end_time = start_time
    elapsed_time = 0.0

    # Track the DB handles so they can be released even on failure.
    conn = None
    cur = None
    try:
        logging.info("Starting the main process...")

        # Create the S3 client and list the bucket contents
        s3_client = create_s3_client(aws_config)
        contents = list_s3_bucket(s3_client, bucket_name)

        # Valid extensions and excluded folders.
        # Tuples so they can be passed directly to endswith()/startswith().
        valid_extensions = ('.mp3', '.mp4', '.md5', '.json', '.pdf')
        excluded_folders = ('DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/')

        # Extract and filter file names
        s3_file_names = [
            content['Key'] for content in contents
            if content['Key'].endswith(valid_extensions)
            and not content['Key'].startswith(excluded_folders)
        ]
        s3_only_mp4_file_names = [
            content['Key'] for content in contents
            if content['Key'].endswith('.mp4')
            and not content['Key'].startswith(excluded_folders)
        ]
        total_file_s3mp4 = len(s3_only_mp4_file_names)
        logging.info(f"Total number of distinct .mp4 files in the S3 bucket before import: {total_file_s3mp4}")

        # --- Get all DB filenames in one call ---
        db_file_names = get_distinct_filenames_from_db()
        # Build the membership set once: O(1) lookups instead of an O(n)
        # list scan per S3 key.
        db_file_name_set = set(db_file_names)

        # --- Keep only those not in DB ---
        file_names = [f for f in s3_file_names if f not in db_file_name_set]

        # Print the total number of files
        total_file_db = len(db_file_names)
        logging.info(f"Total number of distinct files in the database before import: {total_file_db}")
        total_files_s3 = len(s3_file_names)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
        total_files = len(file_names)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")

        # Count files per extension on the S3 side.
        # NOTE(review): s3_file_names only contains the valid extensions
        # above, so mov/avi/m4v counts are always 0 here — kept for the log
        # format; confirm whether they should count from `contents` instead.
        mp4_count = sum(1 for file in s3_file_names if file.endswith('.mp4'))
        mp3_count = sum(1 for file in s3_file_names if file.endswith('.mp3'))
        md5_count = sum(1 for file in s3_file_names if file.endswith('.md5'))
        pdf_count = sum(1 for file in s3_file_names if file.endswith('.pdf'))
        json_count = sum(1 for file in s3_file_names if file.endswith('.json'))
        mov_count = sum(1 for file in s3_file_names if file.endswith('.mov'))
        avi_count = sum(1 for file in s3_file_names if file.endswith('.avi'))
        m4v_count = sum(1 for file in s3_file_names if file.endswith('.m4v'))

        # Log the counts (lazy %-style args: formatting is skipped if the
        # level is filtered out).
        logging.warning("Number of .mp4 files on S3 bucket (%s): %s", bucket_name, mp4_count)
        logging.warning("Number of .mp3 files on S3 bucket (%s): %s", bucket_name, mp3_count)
        logging.warning("Number of .md5 files on S3 bucket (%s): %s", bucket_name, md5_count)
        logging.warning("Number of .pdf files on S3 bucket (%s): %s", bucket_name, pdf_count)
        logging.warning("Number of .json files on S3 bucket (%s): %s", bucket_name, json_count)
        logging.warning("Number of .mov files on S3 bucket (%s): %s", bucket_name, mov_count)

        # Sanity checks: every video should have a matching PDF, and every
        # audio/video file a matching .json and .md5 sidecar. These only log
        # today — the aborting `return` is intentionally disabled.
        if mp4_count != pdf_count:
            logging.error("Number of .mp4 files is not equal to number of .pdf files")
            logging.error("Abort Import Process due to missing files")
            # return
        if mp3_count + mp4_count != json_count:
            logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .json files")
            logging.error("Abort Import Process due to missing files")
            # return
        if mp3_count + mp4_count != md5_count:
            logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .md5 files")
            logging.error("Abort Import Process due to missing files")
            # return

        # Try to parse S3 files
        try:
            # DRY RUN: parse_s3_files is not called, so nothing is uploaded
            # and all result counters stay at zero.
            logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
            uploaded_files_count, warning_files_count, error_files_count = (0, 0, 0)
            logging.warning("Total number of files (mp3+mp4) with warnings: %s. (Probably already existing in the DB)", warning_files_count)
            logging.warning("Total number of files with errors: %s", error_files_count)
            logging.warning("Total number of files uploaded: %s", uploaded_files_count)
            logging.warning("All files parsed")
        except Exception as e:
            logging.error(f"An error occurred while parsing S3 files: {e}")
            handle_general_error(e)

        # Check results: connect to the database and count stored files
        # per mime type / extension.
        conn = psycopg2.connect(**db_config)
        cur = conn.cursor()

        # All relevant MediaInfo/MIME values used to count "any video/audio"
        mime_type = [
            'video/x-msvideo',  # .avi
            'video/mov',        # .mov
            'audio/wav',        # .wav
            'video/mp4',        # .mp4, .m4v
            'audio/mp3',        # .mp3
            'application/mxf',  # .mxf
            'video/mpeg',       # .mpg
        ]
        logging.info(f"Mime types for counting files: {mime_type}")

        all_files_on_db = count_files(cur, mime_type, '*', False)
        mov_files_on_db = count_files(cur, ['video/mov'], '.mov', False)
        mxf_files_on_db = count_files(cur, ['application/mxf'], '.mxf', False)
        mpg_files_on_db = count_files(cur, ['video/mpeg'], '.mpg', False)
        avi_files_on_db = count_files(cur, ['video/x-msvideo'], '.avi', False)
        m4v_files_on_db = count_files(cur, ['video/mp4'], '.m4v', False)
        mp4_files_on_db = count_files(cur, ['video/mp4'], '.mp4', False)
        wav_files_on_db = count_files(cur, ['audio/wav'], '.wav', False)
        mp3_files_on_db = count_files(cur, ['audio/mp3'], '.mp3', False)

        # mov + m4v + avi + mxf + mpg
        logging.warning(f"Number of all video files in the database: {all_files_on_db}")
        logging.warning(f"Number of .mov files in the database: {mov_files_on_db} and S3: {mov_count} ")
        logging.warning(f"Number of .mp4 files in the database: {mp4_files_on_db} and S3: {mp4_count}")

        # Report .mp4 files present on S3 but missing from the DB.
        # file_names is already DB-filtered, so no second membership check
        # is needed.
        missing_mp4s = [f for f in file_names if f.endswith('.mp4')]
        logging.warning(f"Missing .mp4 files in DB compared to S3: {missing_mp4s}")
        logging.warning(f"Number of .wav files in the database: {wav_files_on_db} ")
        logging.warning(f"Number of .mp3 files in the database: {mp3_files_on_db} and S3: {mp3_count}")
        logging.warning(f"Number of .avi files in the database: {avi_files_on_db} ")
        logging.warning(f"Number of .m4v files in the database: {m4v_files_on_db} ")
        logging.warning(f"Number of .mxf files in the database: {mxf_files_on_db} ")
        logging.warning(f"Number of .mpg files in the database: {mpg_files_on_db} ")
        logging.warning(f"Total file in s3 before import {total_files}")

        # time elapsed
        end_time = time.time()  # Record end time
        elapsed_time = end_time - start_time
        logging.warning(f"Processing completed. Time taken: {elapsed_time:.2f} seconds")
    # Specific handlers MUST precede the generic one — with `except
    # Exception` first (as before), these two were unreachable dead code.
    except FileNotFoundError as e:
        handle_file_not_found_error(e)
    except ValueError as e:
        handle_value_error(e)
    except Exception as e:
        handle_general_error(e)
    finally:
        # Always release DB resources, even when an exception was raised
        # (the connection was previously leaked on any failure).
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    try:
        # Setup logging using standard TimedRotatingFileHandler handlers.
        # Rely on the handler's built-in rotation; don't call doRollover manually.
        logger, rotating_handler, error_handler, warning_handler = setup_logging()

        # Load configuration settings
        aws_config, db_config, ach_config, bucket_name, ach_variables = config.load_config()
        logging.info("Config loaded, and logging setup")

        # Run the main process
        main_process(aws_config, db_config, ach_config, bucket_name, ach_variables)
    except Exception as e:
        logging.error(f"An error occurred: {e}")