# v20251103 - Main script to import media files from S3 to the database
import logging
import time
from datetime import datetime
import pytz
import os
import re
from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
from email_utils import handle_error, send_email_with_attachment
from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
from error_handler import handle_general_error, handle_file_not_found_error, handle_value_error
from file_utils import is_file_empty
from db_utils import count_files, get_distinct_filenames_from_db
from dotenv import load_dotenv
import config
import psycopg2

# Load environment variables from a local .env file before anything reads os.getenv().
load_dotenv()

# NOTE: duplicate `import re` / `import logging` / `import os` statements that
# previously followed load_dotenv() were removed - all three modules are
# already imported above, so the repeats were no-ops.


def analyze_pattern_match(text, description):
    r"""Explain which parts of the 12-char inventory-code pattern a string violates.

    The caller truncates base/folder names to their first 12 characters and
    matches them against r'^[VA][OC]-[A-Z0-9]{3}-\d{5}$', which describes a
    string of exactly 12 characters. This helper re-checks that same pattern
    position by position so the log can say precisely where a mismatch is,
    and it never indexes past the validated length.

    Args:
        text: Candidate string (expected to be exactly 12 characters).
        description: Label used in the issue messages (e.g. "Base name").

    Returns:
        list[str]: Human-readable issue messages; empty list if `text` matches.
    """
    if not text:
        return [f"{description}: Empty or None text"]
    issues = []
    expected_length = 12  # Pattern: [VA][OC]-[3chars]-[5digits]
    # Length gate first: the positional checks below index up to text[11].
    if len(text) != expected_length:
        issues.append(f"Length mismatch: expected {expected_length}, got {len(text)}")
        return issues
    # Step 1: Check 1st character - V or A
    if text[0] not in ['V', 'A']:
        issues.append(f"Position 1: Expected [V,A], got '{text[0]}'")
    # Step 2: Check 2nd character - O or C
    if text[1] not in ['O', 'C']:
        issues.append(f"Position 2: Expected [O,C], got '{text[1]}'")
    # Step 3: Check 3rd character - dash
    if text[2] != '-':
        issues.append(f"Position 3: Expected '-', got '{text[2]}'")
    # Step 4: Check positions 4,5,6 - [A-Z0-9]
    for i in range(3, 6):
        if not re.match(r'^[A-Z0-9]$', text[i]):
            issues.append(f"Position {i+1}: Expected [A-Z0-9], got '{text[i]}'")
    # Step 5: Check 7th character - dash
    if text[6] != '-':
        issues.append(f"Position 7: Expected '-', got '{text[6]}'")
    # Step 6: Check positions 8-12 - digits
    for i in range(7, 12):
        if not text[i].isdigit():
            issues.append(f"Position {i+1}: Expected digit, got '{text[i]}'")
    return issues


# MAIN PROCESS
def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
    """Import media files from the S3 bucket into the database and email a summary.

    Lists the bucket, filters and validates object keys, skips files already in
    the DB, optionally enforces count-parity checks, parses/uploads files,
    cross-checks DB counts, then sends a success/warning/error summary email.
    All configuration objects come from config.load_config().
    """
    logging.info(f"bucket_name: {bucket_name}")
    # Ensure timing variables are always defined so later error-email logic
    # won't fail if an exception is raised before end_time/elapsed_time is set.
    # Wall-clock start; end_time/elapsed_time get safe defaults so the email
    # section at the bottom can report timings even if we bail out early.
    start_time = time.time()
    # IN HUMAN READABLE FORMAT
    logging.info(f"Process started at {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
    end_time = start_time
    elapsed_time = 0.0
    try:
        logging.info("Starting the main process...")

        # Helper to make spaces visible in filenames for logging (replace ' ' with open-box char)
        def _visible_spaces(name: str) -> str:
            try:
                return name.replace(' ', '\u2423')
            except Exception:
                # Best effort only: fall back to the raw name on any failure.
                return name

        # Create the S3 client
        s3_client = create_s3_client(aws_config)
        # List S3 bucket contents
        list_s3_files = list_s3_bucket(s3_client, bucket_name)

        # Define valid extensions and excluded folders
        valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'}
        # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/'}
        excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/'}
        # included_folders = {'FILE/'} # uncomment this to NOT use excluded folders
        # included_folders = {'TEST-FOLDER-DEV/'} # uncomment this to NOT use excluded folders

        # Extract and filter file names
        # s3_file_names: include only files that match valid extensions and
        # (if configured) whose key starts with one of the included_folders.
        # We still skip any explicitly excluded_folders. Guard against the
        # case where `included_folders` isn't defined to avoid NameError.
        try:
            use_included = bool(included_folders)
        except NameError:
            # included_folders is normally commented out above, so this is the default path.
            use_included = False
        if use_included:
            s3_file_names = [
                content['Key'] for content in list_s3_files
                if any(content['Key'].endswith(ext) for ext in valid_extensions)
                and any(content['Key'].startswith(folder) for folder in included_folders)
            ]
            logging.info(f"Using included_folders filter: {included_folders}")
        else:
            s3_file_names = [
                content['Key'] for content in list_s3_files
                if any(content['Key'].endswith(ext) for ext in valid_extensions)
                and not any(content['Key'].startswith(folder) for folder in excluded_folders)
            ]
            logging.info("Using excluded_folders filter")

        # check inventory code syntax
        # first check s3_file_names if the file base name and folder name match
        # NOTE(review): the first `pattern` assignment (with the `_\d{2}` suffix)
        # is dead code - it is immediately overwritten by the second one.
        pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$'
        pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}$'
        contents = []  # keys whose (truncated) base name matches the inventory-code pattern
        for s3file in s3_file_names:
            # s3_file_names contains the object keys (strings), not dicts.
            base_name = os.path.basename(s3file)
            # keep only first 12 chars (the pattern describes exactly 12 characters)
            base_name = base_name[:12]
            logging.info(f"Base name: {base_name}")
            folder_name = os.path.dirname(s3file)
            # keep only first 12 chars of folder name as well
            # folder_name = folder_name[:12]
            # logging.info(f"Folder name: {folder_name}")
            if re.match(pattern, base_name): # and re.match(pattern, folder_name):
                logging.info(f"File {base_name} matches pattern.")
                contents.append(s3file)
            else:
                # Check base name and log a per-position diagnosis of the mismatch.
                if not re.match(pattern, base_name):
                    base_issues = analyze_pattern_match(base_name, "Base name")
                    logging.warning(f"Base name '{base_name}' does not match pattern. Issues: {base_issues}")
                logging.warning(f"File {base_name} in folder {folder_name} does not match pattern.")

        # filter_s3_files_not_in_db
        # --- Get all DB filenames in one call ---
        db_file_names = get_distinct_filenames_from_db()
        # --- Keep only those not in DB ---
        # Additionally, if the DB already contains a sidecar record for the
        # same basename (for extensions .md5, .json, .pdf), skip the S3 file
        # since the asset is already represented in the DB via those sidecars.
        sidecar_exts = ('.md5', '.json', '.pdf')
        db_sidecar_basenames = set()
        for dbf in db_file_names:
            for ext in sidecar_exts:
                if dbf.endswith(ext):
                    db_sidecar_basenames.add(dbf[:-len(ext)])
                    break
        file_names = []  # pattern-valid keys not yet represented in the DB
        for f in contents:
            # exact key present in DB -> skip
            if f in db_file_names:
                continue
            # strip extension to get basename and skip if DB has sidecar for it
            base = os.path.splitext(f)[0]
            if base in db_sidecar_basenames:
                # logging.info("Skipping %s because DB already contains sidecar for basename %s", _visible_spaces(f), _visible_spaces(base))
                continue
            file_names.append(f)

        # Print the total number of files
        total_files_s3 = len(contents)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
        total_files = len(file_names)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")

        # Count files with .mp4 and .mp3 extensions
        # NOTE: counts are taken over s3_file_names, i.e. BEFORE the pattern/DB filters.
        mp4_count = sum(1 for file in s3_file_names if file.endswith('.mp4'))
        mp3_count = sum(1 for file in s3_file_names if file.endswith('.mp3'))
        md5_count = sum(1 for file in s3_file_names if file.endswith('.md5'))
        pdf_count = sum(1 for file in s3_file_names if file.endswith('.pdf'))
        json_count = sum(1 for file in s3_file_names if file.endswith('.json'))
        mov_count = sum(1 for file in s3_file_names if file.endswith('.mov'))
        # jpg_count = sum(1 for file in file_names if file.endswith('.jpg')) # file directory
        avi_count = sum(1 for file in s3_file_names if file.endswith('.avi'))
m4v_count = sum(1 for file in s3_file_names if file.endswith('.m4v')) # Log the counts # Get the logger instance logger = logging.getLogger() # Use the logger instance to log custom info logging.info("Number of .mp4 files on S3 bucket (%s): %s", bucket_name, mp4_count) logging.info("Number of .mp3 files on S3 bucket (%s): %s", bucket_name, mp3_count) logging.info("Number of .md5 files on S3 bucket (%s): %s", bucket_name, md5_count) logging.info("Number of .pdf files on S3 bucket (%s): %s", bucket_name, pdf_count) logging.info("Number of .json files on S3 bucket (%s): %s", bucket_name, json_count) logging.info("Number of .mov files on S3 bucket (%s): %s", bucket_name, mov_count) # logging.info(f"Number of .jpg files: {jpg_count}") # If ACH_SAFE_RUN is 'false' we enforce strict mp4/pdf parity and abort # when mismatched. Default is 'true' which skips this abort to allow # safer runs during testing or manual reconciliation. if os.getenv('ACH_SAFE_RUN', 'true') == 'true': if mp4_count != pdf_count: logging.error("Number of .mp4 files is not equal to number of .pdf files") # MOD 20251103 # add a check to find the missing pdf or mp4 files and report them # use file_names to find missing files # store tuples (source_file, expected_counterpart) for clearer logging missing_pdfs = [] # list of (mp4_file, expected_pdf) missing_mp4s = [] # list of (pdf_file, expected_mp4) for file in file_names: if file.endswith('.mp4'): # remove extension base_name = file[:-4] # keeps any path prefix # if the mp4 is an H264 variant (e.g. name_H264.mp4) remove the suffix if base_name.endswith('_H264'): base_name = base_name[:-5] expected_pdf = base_name + '.pdf' if expected_pdf not in file_names: missing_pdfs.append((file, expected_pdf)) elif file.endswith('.pdf'): # Normalize base name and accept either the regular mp4 or the _H264 variant. 
                        base_name = file[:-4]
                        expected_mp4 = base_name + '.mp4'
                        h264_variant = base_name + '_H264.mp4'
                        # If neither the regular mp4 nor the H264 variant exists, report missing.
                        if expected_mp4 not in file_names and h264_variant not in file_names:
                            missing_mp4s.append((file, expected_mp4))
                # report missing files
                if missing_pdfs:
                    logging.error("Missing .pdf files (mp4 -> expected pdf):")
                    for mp4_file, expected_pdf in missing_pdfs:
                        # _visible_spaces makes embedded blanks visible in the log output
                        logging.error("%s -> %s", _visible_spaces(mp4_file), _visible_spaces(expected_pdf))
                if missing_mp4s:
                    logging.error("Missing .mp4 files (pdf -> expected mp4):")
                    for pdf_file, expected_mp4 in missing_mp4s:
                        logging.error("%s -> %s", _visible_spaces(pdf_file), _visible_spaces(expected_mp4))
                logging.error("Abort Import Process due to missing files")
                raise ValueError("Inconsistent file counts mp4 vs pdf")
            if mp3_count + mp4_count != json_count:
                logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .json files")
                logging.error("Abort Import Process due to missing files")
                # TODO: identify which specific files don't match
                raise ValueError("Inconsistent file counts mp3+mp4 vs json")
            if mp3_count + mp4_count != md5_count:
                logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .md5 files")
                logging.error("Abort Import Process due to missing files")
                # TODO: identify which specific files don't match
                raise ValueError("Inconsistent file counts mp3+mp4 vs md5")

        # Try to parse S3 files
        try:
            # If DRY RUN is set to True, the files will not be uploaded to the database
            if os.getenv('ACH_DRY_RUN', 'true') == 'false':
                uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(s3_client, file_names, ach_variables, excluded_folders)
            else:
                logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
                # set the tuples to zero
                uploaded_files_count, warning_files_count, error_files_count = (0, 0, 0)
            logging.info("Total number of files (mp3+mp4) with warnings: %s. (Probably already existing in the DB)", warning_files_count)
            logging.info("Total number of files with errors: %s", error_files_count)
            logging.info("Total number of files uploaded: %s", uploaded_files_count)
            logging.info("All files parsed")
        except Exception as e:
            # Parsing failures are reported/handled but do not abort the run.
            logging.error(f"An error occurred while parsing S3 files: {e}")
            handle_general_error(e)

        # Check results
        # connect to database
        # NOTE(review): conn/cur are never closed in this function - consider a
        # `with` block or try/finally so the connection is released.
        conn = psycopg2.connect(**db_config)
        cur = conn.cursor()
        # function count_files that are wav and mov in db
        # Map file extensions (include leading dot) to mime types
        EXTENSION_MIME_MAP = {
            '.avi': 'video/x-msvideo',
            '.mov': 'video/mov',
            '.wav': 'audio/wav',
            '.mp4': 'video/mp4',
            '.m4v': 'video/mp4',
            '.mp3': 'audio/mp3',
            '.mxf': 'application/mxf',
            '.mpg': 'video/mpeg',
        }
        # populate mime_type list with all relevant MediaInfo/MIME values
        mime_type = [
            'video/x-msvideo', # .avi
            'video/mov', # .mov
            'audio/wav', # .wav
            'video/mp4', # .mp4, .m4v
            'audio/mp3', # .mp3
            'application/mxf', # .mxf
            'video/mpeg', # .mpg
        ]
        logging.info(f"Mime types for counting files: {mime_type}")
        all_files_on_db = count_files(cur, mime_type,'*', False)
        mov_files_on_db = count_files(cur,['video/mov'],'.mov', False )
        mxf_files_on_db = count_files(cur,['application/mxf'],'.mxf', False )
        mpg_files_on_db = count_files(cur,['video/mpeg'],'.mpg', False )
        avi_files_on_db = count_files(cur,['video/x-msvideo'],'.avi', False )
        m4v_files_on_db = count_files(cur,['video/mp4'],'.m4v', False )
        mp4_files_on_db = count_files(cur,['video/mp4'],'.mp4', False )
        wav_files_on_db = count_files(cur,['audio/wav'],'.wav', False )
        mp3_files_on_db = count_files(cur,['audio/mp3'],'.mp3', False )
        # mov + m4v + avi + mxf + mpg
        logging.info(f"Number of all video files in the database: {all_files_on_db}")
        logging.info(f"Number of .mov files in the database: {mov_files_on_db} and S3: {mov_count} ")
        logging.info(f"Number of .mp4 files in the database: {mp4_files_on_db} and S3: {mp4_count}")
        # compare the mp4 names against the DB names and print the list of missing files
        missing_mp4s = [f for f in file_names if f.endswith('.mp4') and f not in db_file_names]
        # if missing_mp4s empty do not return a warning
        if missing_mp4s:
            logging.warning(f"Missing .mp4 files in DB compared to S3: {missing_mp4s}")
        logging.info(f"Number of .wav files in the database: {wav_files_on_db} ")
        logging.info(f"Number of .mp3 files in the database: {mp3_files_on_db} and S3: {mp3_count}")
        missing_mp3s = [f for f in file_names if f.endswith('.mp3') and f not in db_file_names]
        # if missing_mp3s empty do not return a warning
        if missing_mp3s:
            logging.warning(f"Missing .mp3 files in DB compared to S3: {missing_mp3s}")
        logging.info(f"Number of .avi files in the database: {avi_files_on_db} ")
        logging.info(f"Number of .m4v files in the database: {m4v_files_on_db} ")
        logging.info(f"Number of .mxf files in the database: {mxf_files_on_db} ")
        logging.info(f"Number of .mpg files in the database: {mpg_files_on_db} ")
        logging.info(f"Total file in s3 before import {total_files}")

        # time elapsed
        end_time = time.time() # Record end time
        elapsed_time = end_time - start_time
        logging.info(f"Processing completed. Time taken: {elapsed_time:.2f} seconds")
    except FileNotFoundError as e:
        handle_file_not_found_error(e)
    except ValueError as e:
        # Raised above by the parity checks; handler decides reporting.
        handle_value_error(e)
    except Exception as e:
        handle_general_error(e)

    # Send Email with logs if success or failure
    # Define the CET timezone
    cet = pytz.timezone('CET')
    # Helper to rename a log file by appending a timestamp and return the new path.
    def _rename_log_if_nonempty(path):
        """Rename `path` by appending a CET timestamp; return the new path.

        Returns None (and leaves the file alone) when the file is missing or
        empty, or when renaming fails; errors are logged, never raised.
        """
        try:
            if not path or not os.path.exists(path):
                return None
            # If file is empty, don't attach/rename it
            if os.path.getsize(path) == 0:
                return None
            dir_name = os.path.dirname(path)
            base_name = os.path.splitext(os.path.basename(path))[0]
            timestamp = datetime.now(cet).strftime("%Y%m%d_%H%M%S")
            new_log_path = os.path.join(dir_name, f"{base_name}_{timestamp}.log")
            # Attempt to move/replace atomically where possible
            try:
                os.replace(path, new_log_path)
            except Exception:
                # Fallback to rename (may raise on Windows if target exists)
                os.rename(path, new_log_path)
            return new_log_path
        except Exception as e:
            logging.error("Failed to rename log %s: %s", path, e)
            return None

    # close logging to flush handlers before moving files
    # NOTE(review): the logging calls below run AFTER shutdown(); they rely on
    # the logging module's fallback handling - confirm this is intentional.
    logging.shutdown()
    logging.info("Preparing summary email")
    error_log = './logs/ACH_media_import_errors.log'
    warning_log = './logs/ACH_media_import_warning.log'
    # Determine presence of errors/warnings by inspecting the log files on disk
    has_errors = False
    has_warnings = False
    try:
        if os.path.exists(error_log) and os.path.getsize(error_log) > 0:
            with open(error_log, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                if 'ERROR' in content or len(content.strip()) > 0:
                    has_errors = True
        if os.path.exists(warning_log) and os.path.getsize(warning_log) > 0:
            with open(warning_log, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                if 'WARNING' in content or len(content.strip()) > 0:
                    has_warnings = True
    except Exception as e:
        logging.error("Error while reading log files: %s", e)

    # from env - split safely and strip whitespace
    def _split_env_list(name):
        # Comma-separated env var -> list of non-empty, stripped strings.
        raw = os.getenv(name, '')
        return [s.strip() for s in raw.split(',') if s.strip()]

    EMAIL_RECIPIENTS = _split_env_list('EMAIL_RECIPIENTS')
    # Error/success lists fall back to the general recipient list when unset.
    ERROR_EMAIL_RECIPIENTS = _split_env_list('ERROR_EMAIL_RECIPIENTS') or EMAIL_RECIPIENTS
    SUCCESS_EMAIL_RECIPIENTS = _split_env_list('SUCCESS_EMAIL_RECIPIENTS') or EMAIL_RECIPIENTS

    # Choose subject and attachment based on severity
    if has_errors:
        subject = "ARKIVO Import of Video/Audio Ran with Errors"
        # Attach the (timestamped) error log; fall back to the original path.
        attachment_to_send = _rename_log_if_nonempty(error_log) or error_log
        body = "Please find the attached error log file. Job started at %s and ended at %s, taking %.2f seconds." % (
            datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S'),
            elapsed_time
        )
        email_recipients=ERROR_EMAIL_RECIPIENTS
    elif has_warnings:
        subject = "ARKIVO Import of Video/Audio Completed with Warnings"
        # Attach the warnings log for investigation
        attachment_to_send = _rename_log_if_nonempty(warning_log) or warning_log
        body = "The import completed with warnings. Please find the attached warning log. Job started at %s and ended at %s, taking %.2f seconds." % (
            datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S'),
            elapsed_time
        )
        # Warnings are routed to the error recipients list as well.
        email_recipients=ERROR_EMAIL_RECIPIENTS
    else:
        subject = "ARKIVO Video/Audio Import Completed Successfully"
        # No attachment for clean success
        attachment_to_send = None
        body = "The import of media (video/audio) completed successfully without any errors or warnings. Job started at %s and ended at %s, taking %.2f seconds." % (
            datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S'),
            elapsed_time
        )
        email_recipients=SUCCESS_EMAIL_RECIPIENTS

    logging.info("Sending summary email: %s (attach: %s)", subject, bool(attachment_to_send))
    # Send email
    try:
        send_email_with_attachment(
            subject=subject,
            body=body,
            attachment_path=attachment_to_send,
            email_recipients=email_recipients
        )
    except Exception as e:
        # Email failure is logged but must not crash the import run.
        logging.error("Failed to send summary email: %s", e)
    return


if __name__ == "__main__":
    try:
        # Setup logging using standard TimedRotatingFileHandler handlers.
        # No manual doRollover calls — rely on the handler's built-in rotation.
logger, rotating_handler, error_handler, warning_handler = setup_logging() # Load configuration settings aws_config, db_config, ach_config, bucket_name, ach_variables = config.load_config() logging.info("Config loaded, logging setup done") # Run the main process main_process(aws_config, db_config, ach_config, bucket_name, ach_variables) logging.info("Main process completed at: %s", datetime.now().strftime('%Y-%m-%d %H:%M:%S')) except Exception as e: logging.error(f"An error occurred: {e}")