# ACH-ARKIVO-ImportMedia/countfiles.py
# (file-listing metadata from the original export — not code: 199 lines, 9.4 KiB, Python)
# v20251103 - Main script to import media files from S3 to the database
import logging
import time
from datetime import datetime
import pytz
import os
from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
from email_utils import handle_error, send_email_with_attachment
from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
from error_handler import handle_general_error, handle_file_not_found_error, handle_value_error
from file_utils import is_file_empty
from db_utils import count_files, get_distinct_filenames_from_db
from dotenv import load_dotenv
import config
import psycopg2
# Populate os.environ from a local .env file before any configuration is read.
load_dotenv()
# MAIN PROCESS
def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
    """Reconcile media files between the S3 bucket and the database (dry run).

    Lists the bucket contents, filters them down to the importable
    extensions, compares the result with the filenames already stored in
    the database, and logs per-extension counts on both sides.  The import
    itself is stubbed out (DRY RUN): no rows are ever written.

    Args:
        aws_config: settings forwarded to create_s3_client().
        db_config: keyword arguments for psycopg2.connect().
        ach_config: ACH settings (unused here; kept for interface parity
            with config.load_config()).
        bucket_name: name of the S3 bucket to inspect.
        ach_variables: extra ACH variables (unused here).
    """
    logging.info(f"bucket_name: {bucket_name}")
    # Ensure timing variables are always defined so later error-email logic
    # won't fail if an exception is raised before end_time/elapsed_time is set.
    start_time = time.time()
    end_time = start_time
    elapsed_time = 0.0
    conn = None
    cur = None

    def count_ext(keys, ext):
        # Number of keys ending with the given extension.
        return sum(1 for key in keys if key.endswith(ext))

    try:
        logging.info("Starting the main process...")
        # Create the S3 client and list the bucket contents.
        s3_client = create_s3_client(aws_config)
        contents = list_s3_bucket(s3_client, bucket_name)
        # Only these extensions are importable; these folders are never scanned.
        # str.startswith/endswith accept a tuple of alternatives directly.
        valid_extensions = ('.mp3', '.mp4', '.md5', '.json', '.pdf')
        excluded_folders = ('DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/')
        # All keys outside the excluded folders (any extension); needed below
        # to count non-importable formats such as .mov/.avi/.m4v.
        all_keys = [
            content['Key'] for content in contents
            if not content['Key'].startswith(excluded_folders)
        ]
        # Keys with an importable extension.
        s3_file_names = [key for key in all_keys if key.endswith(valid_extensions)]
        s3_only_mp4_file_names = [key for key in all_keys if key.endswith('.mp4')]
        total_file_s3mp4 = len(s3_only_mp4_file_names)
        logging.info(f"Total number of distinct .mp4 files in the S3 bucket before import: {total_file_s3mp4}")
        # --- Get all DB filenames in one call (a set for O(1) membership) ---
        db_file_names = set(get_distinct_filenames_from_db())
        # --- Keep only S3 keys not already in the DB ---
        file_names = [f for f in s3_file_names if f not in db_file_names]
        total_file_db = len(db_file_names)
        logging.info(f"Total number of distinct files in the database before import: {total_file_db}")
        total_files_s3 = len(s3_file_names)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
        total_files = len(file_names)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")
        # Per-extension counts among the importable keys.
        mp4_count = count_ext(s3_file_names, '.mp4')
        mp3_count = count_ext(s3_file_names, '.mp3')
        md5_count = count_ext(s3_file_names, '.md5')
        pdf_count = count_ext(s3_file_names, '.pdf')
        json_count = count_ext(s3_file_names, '.json')
        # BUGFIX: .mov/.avi/.m4v are not in valid_extensions, so counting them
        # in the filtered list always yielded 0; count over all bucket keys.
        mov_count = count_ext(all_keys, '.mov')
        avi_count = count_ext(all_keys, '.avi')
        m4v_count = count_ext(all_keys, '.m4v')
        # Log the counts.
        logging.warning("Number of .mp4 files on S3 bucket (%s): %s", bucket_name, mp4_count)
        logging.warning("Number of .mp3 files on S3 bucket (%s): %s", bucket_name, mp3_count)
        logging.warning("Number of .md5 files on S3 bucket (%s): %s", bucket_name, md5_count)
        logging.warning("Number of .pdf files on S3 bucket (%s): %s", bucket_name, pdf_count)
        logging.warning("Number of .json files on S3 bucket (%s): %s", bucket_name, json_count)
        logging.warning("Number of .mov files on S3 bucket (%s): %s", bucket_name, mov_count)
        # Sanity checks: every media file should come with its .pdf/.json/.md5
        # companions.  Mismatches are logged but deliberately do not abort
        # (the returns were commented out in the original).
        if mp4_count != pdf_count:
            logging.error("Number of .mp4 files is not equal to number of .pdf files")
            logging.error("Abort Import Process due to missing files")
        if mp3_count + mp4_count != json_count:
            logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .json files")
            logging.error("Abort Import Process due to missing files")
        if mp3_count + mp4_count != md5_count:
            logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .md5 files")
            logging.error("Abort Import Process due to missing files")
        # Try to parse S3 files.
        try:
            # DRY RUN: the upload to the database is stubbed out.
            logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
            # Set the result tuple to zero.
            uploaded_files_count, warning_files_count, error_files_count = (0, 0, 0)
            logging.warning("Total number of files (mp3+mp4) with warnings: %s. (Probably already existing in the DB)", warning_files_count)
            logging.warning("Total number of files with errors: %s", error_files_count)
            logging.warning("Total number of files uploaded: %s", uploaded_files_count)
            logging.warning("All files parsed")
        except Exception as e:
            logging.error(f"An error occurred while parsing S3 files: {e}")
            handle_general_error(e)
        # Check results: connect to the database and count per mime type.
        conn = psycopg2.connect(**db_config)
        cur = conn.cursor()
        # Use centralized mime types from config (EXTENSION_MIME_MAP was
        # imported here before but never used, so it was dropped).
        from config import MIME_TYPES
        logging.info(f"Mime types for counting files: {MIME_TYPES}")
        all_files_on_db = count_files(cur, MIME_TYPES, '*', False)
        mov_files_on_db = count_files(cur, ['video/mov'], '.mov', False)
        mxf_files_on_db = count_files(cur, ['application/mxf'], '.mxf', False)
        mpg_files_on_db = count_files(cur, ['video/mpeg'], '.mpg', False)
        avi_files_on_db = count_files(cur, ['video/x-msvideo'], '.avi', False)
        m4v_files_on_db = count_files(cur, ['video/mp4'], '.m4v', False)
        mp4_files_on_db = count_files(cur, ['video/mp4'], '.mp4', False)
        wav_files_on_db = count_files(cur, ['audio/wav'], '.wav', False)
        mp3_files_on_db = count_files(cur, ['audio/mp3'], '.mp3', False)
        # mov + m4v + avi + mxf + mpg
        logging.warning(f"Number of all video files in the database: {all_files_on_db}")
        logging.warning(f"Number of .mov files in the database: {mov_files_on_db} and S3: {mov_count} ")
        logging.warning(f"Number of .mp4 files in the database: {mp4_files_on_db} and S3: {mp4_count}")
        # file_names is already S3-minus-DB, so every .mp4 in it is missing
        # from the database.
        missing_mp4s = [f for f in file_names if f.endswith('.mp4')]
        logging.warning(f"Missing .mp4 files in DB compared to S3: {missing_mp4s}")
        logging.warning(f"Number of .wav files in the database: {wav_files_on_db} ")
        logging.warning(f"Number of .mp3 files in the database: {mp3_files_on_db} and S3: {mp3_count}")
        logging.warning(f"Number of .avi files in the database: {avi_files_on_db} ")
        logging.warning(f"Number of .m4v files in the database: {m4v_files_on_db} ")
        logging.warning(f"Number of .mxf files in the database: {mxf_files_on_db} ")
        logging.warning(f"Number of .mpg files in the database: {mpg_files_on_db} ")
        logging.warning(f"Total file in s3 before import {total_files}")
        # Time elapsed.
        end_time = time.time()  # Record end time
        elapsed_time = end_time - start_time
        logging.warning(f"Processing completed. Time taken: {elapsed_time:.2f} seconds")
    # BUGFIX: specific handlers must precede the generic Exception handler —
    # in the original they came after it and were unreachable.
    except FileNotFoundError as e:
        handle_file_not_found_error(e)
    except ValueError as e:
        handle_value_error(e)
    except Exception as e:
        handle_general_error(e)
    finally:
        # BUGFIX: always release DB resources (the original never closed them).
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
if __name__ == "__main__":
    try:
        # Setup logging using standard TimedRotatingFileHandler handlers.
        # Rely on the handler's built-in rotation; don't call doRollover manually.
        logger, rotating_handler, error_handler, warning_handler = setup_logging()
        # Load configuration settings.
        aws_config, db_config, ach_config, bucket_name, ach_variables = config.load_config()
        logging.info("Config loaded, and logging setup")
        # Run the main process.
        main_process(aws_config, db_config, ach_config, bucket_name, ach_variables)
    except Exception as e:
        # BUGFIX: logging.exception records the full traceback, which
        # logging.error alone discarded at this top-level boundary.
        logging.exception(f"An error occurred: {e}")