219 lines
10 KiB
Python
219 lines
10 KiB
Python
# v20251103 - Main script to import media files from S3 to the database
|
|
import logging
|
|
import time
|
|
from datetime import datetime
|
|
import pytz
|
|
import os
|
|
from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
|
|
from email_utils import handle_error, send_email_with_attachment
|
|
from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
|
|
from error_handler import handle_general_error, handle_file_not_found_error, handle_value_error
|
|
from file_utils import is_file_empty
|
|
from db_utils import count_files, get_distinct_filenames_from_db
|
|
from dotenv import load_dotenv
|
|
import config
|
|
import psycopg2
|
|
|
|
load_dotenv()
|
|
|
|
# MAIN PROCESS
|
|
def _select_s3_keys(contents, suffixes, excluded_prefixes):
    """Return the 'Key' of each entry ending in a suffix and outside the excluded prefixes."""
    suffixes = tuple(suffixes)
    excluded_prefixes = tuple(excluded_prefixes)
    return [
        entry['Key'] for entry in contents
        if entry['Key'].endswith(suffixes)
        and not entry['Key'].startswith(excluded_prefixes)
    ]


def _count_with_suffix(names, suffix):
    """Return how many of *names* end with *suffix*."""
    return sum(1 for name in names if name.endswith(suffix))


def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
    """Compare the media files in an S3 bucket with the files known to the database.

    Lists the bucket, keeps the keys that have an importable extension and are
    outside the excluded folders, works out which of them are not yet in the
    database, and logs per-extension counts for both S3 and the database
    together with the elapsed time.

    Nothing is written to the database here: the upload counters are fixed at
    zero and only the dry-run summary is logged.

    Args:
        aws_config: Passed to ``create_s3_client``.
        db_config: Unpacked as keyword arguments for ``psycopg2.connect``.
        ach_config: Not used by this function; kept for interface compatibility.
        bucket_name: Name of the bucket passed to ``list_s3_bucket``.
        ach_variables: Not used by this function; kept for interface compatibility.

    Returns:
        None. Exceptions are routed to ``handle_file_not_found_error``,
        ``handle_value_error`` or ``handle_general_error``.
    """
    logging.info("bucket_name: %s", bucket_name)

    start_time = time.time()

    try:
        logging.info("Starting the main process...")

        s3_client = create_s3_client(aws_config)
        contents = list_s3_bucket(s3_client, bucket_name)

        # Importable extensions and the top-level folders that are skipped.
        valid_extensions = ('.mp3', '.mp4', '.md5', '.json', '.pdf')
        excluded_folders = ('DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/')

        s3_file_names = _select_s3_keys(contents, valid_extensions, excluded_folders)

        # '.mp4' is one of the valid extensions, so counting within the
        # filtered list equals filtering the raw listing for '.mp4' again.
        total_file_s3mp4 = _count_with_suffix(s3_file_names, '.mp4')
        logging.info(
            "Total number of distinct .mp4 files in the S3 bucket before import: %s",
            total_file_s3mp4,
        )

        # Fetch all DB filenames in one call, then keep only the S3 keys that
        # are not in the database yet. The set gives O(1) membership tests.
        # Assumes the helper returns hashable filename values (e.g. str) -
        # TODO confirm against db_utils.
        db_file_names = get_distinct_filenames_from_db()
        known_in_db = set(db_file_names)
        file_names = [name for name in s3_file_names if name not in known_in_db]

        total_file_db = len(db_file_names)
        logging.info(
            "Total number of distinct files in the database before import: %s",
            total_file_db,
        )
        total_files_s3 = len(s3_file_names)
        logging.info(
            "Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: %s",
            total_files_s3,
        )
        total_files = len(file_names)
        logging.info(
            "Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: %s",
            total_files,
        )

        # Per-extension counts over the filtered S3 keys.
        # NOTE(review): '.mov' is not among valid_extensions, so its S3 count
        # is always 0 as written - confirm whether it should be counted from
        # the unfiltered listing instead.
        reported_extensions = ('.mp4', '.mp3', '.md5', '.pdf', '.json', '.mov')
        s3_counts = {
            ext: _count_with_suffix(s3_file_names, ext) for ext in reported_extensions
        }
        for ext in reported_extensions:
            logging.warning(
                "Number of %s files on S3 bucket (%s): %s", ext, bucket_name, s3_counts[ext]
            )

        mp4_count = s3_counts['.mp4']
        mp3_count = s3_counts['.mp3']
        md5_count = s3_counts['.md5']
        pdf_count = s3_counts['.pdf']
        json_count = s3_counts['.json']
        mov_count = s3_counts['.mov']

        # Sidecar consistency checks.
        # NOTE(review): the messages say the import is aborted, but execution
        # continues (there is no early return) - confirm this is intended.
        if mp4_count != pdf_count:
            logging.error("Number of .mp4 files is not equal to number of .pdf files")
            logging.error("Abort Import Process due to missing files")
        if mp3_count + mp4_count != json_count:
            logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .json files")
            logging.error("Abort Import Process due to missing files")
        if mp3_count + mp4_count != md5_count:
            logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .md5 files")
            logging.error("Abort Import Process due to missing files")

        # Dry-run summary: no parsing/upload happens in this function, so the
        # counters are fixed at zero.
        try:
            logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
            uploaded_files_count, warning_files_count, error_files_count = (0, 0, 0)

            logging.warning(
                "Total number of files (mp3+mp4) with warnings: %s. (Probably already existing in the DB)",
                warning_files_count,
            )
            logging.warning("Total number of files with errors: %s", error_files_count)
            logging.warning("Total number of files uploaded: %s", uploaded_files_count)
            logging.warning("All files parsed")
        except Exception as e:
            logging.error("An error occurred while parsing S3 files: %s", e)
            handle_general_error(e)

        # File extensions (with leading dot) mapped to the mime types passed
        # to count_files.
        extension_mime_map = {
            '.avi': 'video/x-msvideo',
            '.mov': 'video/mov',
            '.wav': 'audio/wav',
            '.mp4': 'video/mp4',
            '.m4v': 'video/mp4',
            '.mp3': 'audio/mp3',
            '.mxf': 'application/mxf',
            '.mpg': 'video/mpeg',
        }
        # Unique mime types in first-seen order ('.mp4' and '.m4v' share one).
        mime_type = list(dict.fromkeys(extension_mime_map.values()))

        # Check results against the database. The connection and cursor are
        # always released, even if one of the count queries fails.
        conn = psycopg2.connect(**db_config)
        try:
            with conn.cursor() as cur:
                logging.info("Mime types for counting files: %s", mime_type)

                all_files_on_db = count_files(cur, mime_type, '*', False)
                db_counts = {
                    ext: count_files(cur, [extension_mime_map[ext]], ext, False)
                    for ext in ('.mov', '.mxf', '.mpg', '.avi', '.m4v', '.mp4', '.wav', '.mp3')
                }
        finally:
            conn.close()

        logging.warning("Number of all video files in the database: %s", all_files_on_db)
        logging.warning(
            "Number of .mov files in the database: %s and S3: %s ", db_counts['.mov'], mov_count
        )
        logging.warning(
            "Number of .mp4 files in the database: %s and S3: %s", db_counts['.mp4'], mp4_count
        )

        # file_names already excludes everything known to the DB, so the
        # remaining .mp4 keys are exactly the ones missing from it.
        missing_mp4s = [name for name in file_names if name.endswith('.mp4')]
        logging.warning("Missing .mp4 files in DB compared to S3: %s", missing_mp4s)

        logging.warning("Number of .wav files in the database: %s ", db_counts['.wav'])
        logging.warning(
            "Number of .mp3 files in the database: %s and S3: %s", db_counts['.mp3'], mp3_count
        )
        logging.warning("Number of .avi files in the database: %s ", db_counts['.avi'])
        logging.warning("Number of .m4v files in the database: %s ", db_counts['.m4v'])
        logging.warning("Number of .mxf files in the database: %s ", db_counts['.mxf'])
        logging.warning("Number of .mpg files in the database: %s ", db_counts['.mpg'])

        logging.warning("Total file in s3 before import %s", total_files)

        elapsed_time = time.time() - start_time
        logging.warning("Processing completed. Time taken: %.2f seconds", elapsed_time)

    # Specific handlers must come before the generic one, otherwise
    # `except Exception` captures everything and they never run.
    except FileNotFoundError as e:
        handle_file_not_found_error(e)
    except ValueError as e:
        handle_value_error(e)
    except Exception as e:
        handle_general_error(e)
|
|
|
|
|
|
if __name__ == "__main__":
    try:
        # Set up logging using standard TimedRotatingFileHandler handlers.
        # Rely on the handlers' built-in rotation; don't call doRollover manually.
        logger, rotating_handler, error_handler, warning_handler = setup_logging()

        # Load configuration settings.
        aws_config, db_config, ach_config, bucket_name, ach_variables = config.load_config()

        logging.info("Config loaded, and logging setup")

        # Run the main process.
        main_process(aws_config, db_config, ach_config, bucket_name, ach_variables)

    except Exception as e:
        # Top-level boundary: log the full traceback, not only the message,
        # so failures in setup or in the main process can be diagnosed.
        logging.exception("An error occurred: %s", e)