# v20251103 - Main script to import media files from S3 to the database
import logging
import time
from datetime import datetime
import pytz
import os
import re
from logging_config import setup_logging, CUSTOM_ERROR_LEVEL
from email_utils import handle_error, send_email_with_attachment
from s3_utils import create_s3_client, list_s3_bucket, parse_s3_files
from error_handler import handle_general_error, handle_file_not_found_error, handle_value_error
from file_utils import is_file_empty
from db_utils import count_files, get_distinct_filenames_from_db
from dotenv import load_dotenv
import config
import psycopg2

# Load environment variables from a local .env file before anything reads os.getenv().
load_dotenv()

# NOTE: duplicate `import re` / `import logging` / `import os` statements that
# previously followed load_dotenv() were removed - all three modules are
# already imported above, so the repeats were no-ops.


def analyze_pattern_match(text, description):
    r"""Explain which parts of the 12-char inventory-code pattern a string violates.

    The caller truncates base/folder names to their first 12 characters and
    matches them against r'^[VA][OC]-[A-Z0-9]{3}-\d{5}$', which describes a
    string of exactly 12 characters. This helper re-checks that same pattern
    position by position so the log can say precisely where a mismatch is,
    and it never indexes past the validated length.

    Args:
        text: Candidate string (expected to be exactly 12 characters).
        description: Label used in the issue messages (e.g. "Base name").

    Returns:
        list[str]: Human-readable issue messages; empty list if `text` matches.
    """
    if not text:
        return [f"{description}: Empty or None text"]
    issues = []
    expected_length = 12  # Pattern: [VA][OC]-[3chars]-[5digits]
    # Length gate first: the positional checks below index up to text[11].
    if len(text) != expected_length:
        issues.append(f"Length mismatch: expected {expected_length}, got {len(text)}")
        return issues
    # Step 1: Check 1st character - V or A
    if text[0] not in ['V', 'A']:
        issues.append(f"Position 1: Expected [V,A], got '{text[0]}'")
    # Step 2: Check 2nd character - O or C
    if text[1] not in ['O', 'C']:
        issues.append(f"Position 2: Expected [O,C], got '{text[1]}'")
    # Step 3: Check 3rd character - dash
    if text[2] != '-':
        issues.append(f"Position 3: Expected '-', got '{text[2]}'")
    # Step 4: Check positions 4,5,6 - [A-Z0-9]
    for i in range(3, 6):
        if not re.match(r'^[A-Z0-9]$', text[i]):
            issues.append(f"Position {i+1}: Expected [A-Z0-9], got '{text[i]}'")
    # Step 5: Check 7th character - dash
    if text[6] != '-':
        issues.append(f"Position 7: Expected '-', got '{text[6]}'")
    # Step 6: Check positions 8-12 - digits
    for i in range(7, 12):
        if not text[i].isdigit():
            issues.append(f"Position {i+1}: Expected digit, got '{text[i]}'")
    return issues


# MAIN PROCESS
def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
    """Import media files from the S3 bucket into the database and email a summary.

    Lists the bucket, filters and validates object keys, skips files already in
    the DB, optionally enforces count-parity checks, parses/uploads files,
    cross-checks DB counts, then sends a success/warning/error summary email.
    All configuration objects come from config.load_config().
    """
    logging.info(f"bucket_name: {bucket_name}")
    # Ensure timing variables are always defined so later error-email logic
    # won't fail if an exception is raised before end_time/elapsed_time is set.
    # Wall-clock start; end_time/elapsed_time get safe defaults so the email
    # section at the bottom can report timings even if we bail out early.
    start_time = time.time()
    # IN HUMAN READABLE FORMAT
    logging.info(f"Process started at {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
    end_time = start_time
    elapsed_time = 0.0
    try:
        logging.info("Starting the main process...")

        # Helper to make spaces visible in filenames for logging (replace ' ' with open-box char)
        def _visible_spaces(name: str) -> str:
            try:
                return name.replace(' ', '\u2423')
            except Exception:
                # Best effort only: fall back to the raw name on any failure.
                return name

        # Create the S3 client
        s3_client = create_s3_client(aws_config)
        # List S3 bucket contents
        list_s3_files = list_s3_bucket(s3_client, bucket_name)

        # Define valid extensions and excluded folders
        valid_extensions = {'.mp3', '.mp4', '.md5', '.json', '.pdf'}
        # excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/', 'FILE/'}
        excluded_folders = {'DOCUMENTAZIONE_FOTOGRAFICA/', 'TEST-FOLDER-DEV/'}
        # included_folders = {'FILE/'} # uncomment this to NOT use excluded folders
        # included_folders = {'TEST-FOLDER-DEV/'} # uncomment this to NOT use excluded folders

        # Extract and filter file names
        # s3_file_names: include only files that match valid extensions and
        # (if configured) whose key starts with one of the included_folders.
        # We still skip any explicitly excluded_folders. Guard against the
        # case where `included_folders` isn't defined to avoid NameError.
        try:
            use_included = bool(included_folders)
        except NameError:
            # included_folders is normally commented out above, so this is the default path.
            use_included = False
        if use_included:
            s3_file_names = [
                content['Key'] for content in list_s3_files
                if any(content['Key'].endswith(ext) for ext in valid_extensions)
                and any(content['Key'].startswith(folder) for folder in included_folders)
            ]
            logging.info(f"Using included_folders filter: {included_folders}")
        else:
            s3_file_names = [
                content['Key'] for content in list_s3_files
                if any(content['Key'].endswith(ext) for ext in valid_extensions)
                and not any(content['Key'].startswith(folder) for folder in excluded_folders)
            ]
            logging.info("Using excluded_folders filter")

        # check inventory code syntax
        # first check s3_file_names if the file base name and folder name match
        # NOTE(review): the first `pattern` assignment (with the `_\d{2}` suffix)
        # is dead code - it is immediately overwritten by the second one.
        pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$'
        pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}$'
        contents = []  # keys whose (truncated) base name matches the inventory-code pattern
        for s3file in s3_file_names:
            # s3_file_names contains the object keys (strings), not dicts.
            base_name = os.path.basename(s3file)
            # keep only first 12 chars (the pattern describes exactly 12 characters)
            base_name = base_name[:12]
            logging.info(f"Base name: {base_name}")
            folder_name = os.path.dirname(s3file)
            # keep only first 12 chars of folder name as well
            # folder_name = folder_name[:12]
            # logging.info(f"Folder name: {folder_name}")
            if re.match(pattern, base_name): # and re.match(pattern, folder_name):
                logging.info(f"File {base_name} matches pattern.")
                contents.append(s3file)
            else:
                # Check base name and log a per-position diagnosis of the mismatch.
                if not re.match(pattern, base_name):
                    base_issues = analyze_pattern_match(base_name, "Base name")
                    logging.warning(f"Base name '{base_name}' does not match pattern. Issues: {base_issues}")
                logging.warning(f"File {base_name} in folder {folder_name} does not match pattern.")

        # filter_s3_files_not_in_db
        # --- Get all DB filenames in one call ---
        db_file_names = get_distinct_filenames_from_db()
        # --- Keep only those not in DB ---
        # Additionally, if the DB already contains a sidecar record for the
        # same basename (for extensions .md5, .json, .pdf), skip the S3 file
        # since the asset is already represented in the DB via those sidecars.
        sidecar_exts = ('.md5', '.json', '.pdf')
        db_sidecar_basenames = set()
        for dbf in db_file_names:
            for ext in sidecar_exts:
                if dbf.endswith(ext):
                    db_sidecar_basenames.add(dbf[:-len(ext)])
                    break
        file_names = []  # pattern-valid keys not yet represented in the DB
        for f in contents:
            # exact key present in DB -> skip
            if f in db_file_names:
                continue
            # strip extension to get basename and skip if DB has sidecar for it
            base = os.path.splitext(f)[0]
            if base in db_sidecar_basenames:
                # logging.info("Skipping %s because DB already contains sidecar for basename %s", _visible_spaces(f), _visible_spaces(base))
                continue
            file_names.append(f)

        # Print the total number of files
        total_files_s3 = len(contents)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
        total_files = len(file_names)
        logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")

        # Count files with .mp4 and .mp3 extensions
        # NOTE: counts are taken over s3_file_names, i.e. BEFORE the pattern/DB filters.
        mp4_count = sum(1 for file in s3_file_names if file.endswith('.mp4'))
        mp3_count = sum(1 for file in s3_file_names if file.endswith('.mp3'))
        md5_count = sum(1 for file in s3_file_names if file.endswith('.md5'))
        pdf_count = sum(1 for file in s3_file_names if file.endswith('.pdf'))
        json_count = sum(1 for file in s3_file_names if file.endswith('.json'))
        mov_count = sum(1 for file in s3_file_names if file.endswith('.mov'))
        # jpg_count = sum(1 for file in file_names if file.endswith('.jpg')) # file directory
        avi_count = sum(1 for file in s3_file_names if file.endswith('.avi'))
m4v_count = sum(1 for file in s3_file_names if file.endswith('.m4v')) # Log the counts # Get the logger instance logger = logging.getLogger() # Use the logger instance to log custom info logging.info("Number of .mp4 files on S3 bucket (%s): %s", bucket_name, mp4_count) logging.info("Number of .mp3 files on S3 bucket (%s): %s", bucket_name, mp3_count) logging.info("Number of .md5 files on S3 bucket (%s): %s", bucket_name, md5_count) logging.info("Number of .pdf files on S3 bucket (%s): %s", bucket_name, pdf_count) logging.info("Number of .json files on S3 bucket (%s): %s", bucket_name, json_count) logging.info("Number of .mov files on S3 bucket (%s): %s", bucket_name, mov_count) # logging.info(f"Number of .jpg files: {jpg_count}") # If ACH_SAFE_RUN is 'false' we enforce strict mp4/pdf parity and abort # when mismatched. Default is 'true' which skips this abort to allow # safer runs during testing or manual reconciliation. if os.getenv('ACH_SAFE_RUN', 'true') == 'true': if mp4_count != pdf_count: logging.error("Number of .mp4 files is not equal to number of .pdf files") # MOD 20251103 # add a check to find the missing pdf or mp4 files and report them # use file_names to find missing files # store tuples (source_file, expected_counterpart) for clearer logging missing_pdfs = [] # list of (mp4_file, expected_pdf) missing_mp4s = [] # list of (pdf_file, expected_mp4) for file in file_names: if file.endswith('.mp4'): # remove extension base_name = file[:-4] # keeps any path prefix # if the mp4 is an H264 variant (e.g. name_H264.mp4) remove the suffix if base_name.endswith('_H264'): base_name = base_name[:-5] expected_pdf = base_name + '.pdf' if expected_pdf not in file_names: missing_pdfs.append((file, expected_pdf)) elif file.endswith('.pdf'): # Normalize base name and accept either the regular mp4 or the _H264 variant. 
                        base_name = file[:-4]
                        expected_mp4 = base_name + '.mp4'
                        h264_variant = base_name + '_H264.mp4'
                        # If neither the regular mp4 nor the H264 variant exists, report missing.
                        if expected_mp4 not in file_names and h264_variant not in file_names:
                            missing_mp4s.append((file, expected_mp4))
                # report missing files
                if missing_pdfs:
                    logging.error("Missing .pdf files (mp4 -> expected pdf):")
                    for mp4_file, expected_pdf in missing_pdfs:
                        # _visible_spaces makes embedded blanks visible in the log output
                        logging.error("%s -> %s", _visible_spaces(mp4_file), _visible_spaces(expected_pdf))
                if missing_mp4s:
                    logging.error("Missing .mp4 files (pdf -> expected mp4):")
                    for pdf_file, expected_mp4 in missing_mp4s:
                        logging.error("%s -> %s", _visible_spaces(pdf_file), _visible_spaces(expected_mp4))
                logging.error("Abort Import Process due to missing files")
                raise ValueError("Inconsistent file counts mp4 vs pdf")
            if mp3_count + mp4_count != json_count:
                logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .json files")
                logging.error("Abort Import Process due to missing files")
                # TODO: identify which specific files don't match
                raise ValueError("Inconsistent file counts mp3+mp4 vs json")
            if mp3_count + mp4_count != md5_count:
                logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .md5 files")
                logging.error("Abort Import Process due to missing files")
                # TODO: identify which specific files don't match
                raise ValueError("Inconsistent file counts mp3+mp4 vs md5")

        # Try to parse S3 files
        try:
            # If DRY RUN is set to True, the files will not be uploaded to the database
            if os.getenv('ACH_DRY_RUN', 'true') == 'false':
                uploaded_files_count, warning_files_count, error_files_count = parse_s3_files(s3_client, file_names, ach_variables, excluded_folders)
            else:
                logging.warning("DRY RUN is set to TRUE - No files will be added to the database")
                # set the tuples to zero
                uploaded_files_count, warning_files_count, error_files_count = (0, 0, 0)
            logging.info("Total number of files (mp3+mp4) with warnings: %s. (Probably already existing in the DB)", warning_files_count)
            logging.info("Total number of files with errors: %s", error_files_count)
            logging.info("Total number of files uploaded: %s", uploaded_files_count)
            logging.info("All files parsed")
        except Exception as e:
            # Parsing failures are reported/handled but do not abort the run.
            logging.error(f"An error occurred while parsing S3 files: {e}")
            handle_general_error(e)

        # Check results
        # connect to database
        # NOTE(review): conn/cur are never closed in this function - consider a
        # `with` block or try/finally so the connection is released.
        conn = psycopg2.connect(**db_config)
        cur = conn.cursor()
        # function count_files that are wav and mov in db
        # Map file extensions (include leading dot) to mime types
        EXTENSION_MIME_MAP = {
            '.avi': 'video/x-msvideo',
            '.mov': 'video/mov',
            '.wav': 'audio/wav',
            '.mp4': 'video/mp4',
            '.m4v': 'video/mp4',
            '.mp3': 'audio/mp3',
            '.mxf': 'application/mxf',
            '.mpg': 'video/mpeg',
        }
        # populate mime_type list with all relevant MediaInfo/MIME values
        mime_type = [
            'video/x-msvideo', # .avi
            'video/mov', # .mov
            'audio/wav', # .wav
            'video/mp4', # .mp4, .m4v
            'audio/mp3', # .mp3
            'application/mxf', # .mxf
            'video/mpeg', # .mpg
        ]
        logging.info(f"Mime types for counting files: {mime_type}")
        all_files_on_db = count_files(cur, mime_type,'*', False)
        mov_files_on_db = count_files(cur,['video/mov'],'.mov', False )
        mxf_files_on_db = count_files(cur,['application/mxf'],'.mxf', False )
        mpg_files_on_db = count_files(cur,['video/mpeg'],'.mpg', False )
        avi_files_on_db = count_files(cur,['video/x-msvideo'],'.avi', False )
        m4v_files_on_db = count_files(cur,['video/mp4'],'.m4v', False )
        mp4_files_on_db = count_files(cur,['video/mp4'],'.mp4', False )
        wav_files_on_db = count_files(cur,['audio/wav'],'.wav', False )
        mp3_files_on_db = count_files(cur,['audio/mp3'],'.mp3', False )
        # mov + m4v + avi + mxf + mpg
        logging.info(f"Number of all video files in the database: {all_files_on_db}")
        logging.info(f"Number of .mov files in the database: {mov_files_on_db} and S3: {mov_count} ")
        logging.info(f"Number of .mp4 files in the database: {mp4_files_on_db} and S3: {mp4_count}")
        # compare the mp4 names against the DB names and print the list of missing files
        missing_mp4s = [f for f in file_names if f.endswith('.mp4') and f not in db_file_names]
        # if missing_mp4s empty do not return a warning
        if missing_mp4s:
            logging.warning(f"Missing .mp4 files in DB compared to S3: {missing_mp4s}")
        logging.info(f"Number of .wav files in the database: {wav_files_on_db} ")
        logging.info(f"Number of .mp3 files in the database: {mp3_files_on_db} and S3: {mp3_count}")
        missing_mp3s = [f for f in file_names if f.endswith('.mp3') and f not in db_file_names]
        # if missing_mp3s empty do not return a warning
        if missing_mp3s:
            logging.warning(f"Missing .mp3 files in DB compared to S3: {missing_mp3s}")
        logging.info(f"Number of .avi files in the database: {avi_files_on_db} ")
        logging.info(f"Number of .m4v files in the database: {m4v_files_on_db} ")
        logging.info(f"Number of .mxf files in the database: {mxf_files_on_db} ")
        logging.info(f"Number of .mpg files in the database: {mpg_files_on_db} ")
        logging.info(f"Total file in s3 before import {total_files}")

        # time elapsed
        end_time = time.time() # Record end time
        elapsed_time = end_time - start_time
        logging.info(f"Processing completed. Time taken: {elapsed_time:.2f} seconds")
    except FileNotFoundError as e:
        handle_file_not_found_error(e)
    except ValueError as e:
        # Raised above by the parity checks; handler decides reporting.
        handle_value_error(e)
    except Exception as e:
        handle_general_error(e)

    # Send Email with logs if success or failure
    # Define the CET timezone
    cet = pytz.timezone('CET')
    # Helper to rename a log file by appending a timestamp and return the new path.
    def _rename_log_if_nonempty(path):
        """Rename `path` by appending a CET timestamp; return the new path.

        Returns None (and leaves the file alone) when the file is missing or
        empty, or when renaming fails; errors are logged, never raised.
        """
        try:
            if not path or not os.path.exists(path):
                return None
            # If file is empty, don't attach/rename it
            if os.path.getsize(path) == 0:
                return None
            dir_name = os.path.dirname(path)
            base_name = os.path.splitext(os.path.basename(path))[0]
            timestamp = datetime.now(cet).strftime("%Y%m%d_%H%M%S")
            new_log_path = os.path.join(dir_name, f"{base_name}_{timestamp}.log")
            # Attempt to move/replace atomically where possible
            try:
                os.replace(path, new_log_path)
            except Exception:
                # Fallback to rename (may raise on Windows if target exists)
                os.rename(path, new_log_path)
            return new_log_path
        except Exception as e:
            logging.error("Failed to rename log %s: %s", path, e)
            return None

    # close logging to flush handlers before moving files
    # NOTE(review): the logging calls below run AFTER shutdown(); they rely on
    # the logging module's fallback handling - confirm this is intentional.
    logging.shutdown()
    logging.info("Preparing summary email")
    error_log = './logs/ACH_media_import_errors.log'
    warning_log = './logs/ACH_media_import_warning.log'
    # Determine presence of errors/warnings by inspecting the log files on disk
    has_errors = False
    has_warnings = False
    try:
        if os.path.exists(error_log) and os.path.getsize(error_log) > 0:
            with open(error_log, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                if 'ERROR' in content or len(content.strip()) > 0:
                    has_errors = True
        if os.path.exists(warning_log) and os.path.getsize(warning_log) > 0:
            with open(warning_log, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                if 'WARNING' in content or len(content.strip()) > 0:
                    has_warnings = True
    except Exception as e:
        logging.error("Error while reading log files: %s", e)

    # from env - split safely and strip whitespace
    def _split_env_list(name):
        # Comma-separated env var -> list of non-empty, stripped strings.
        raw = os.getenv(name, '')
        return [s.strip() for s in raw.split(',') if s.strip()]

    EMAIL_RECIPIENTS = _split_env_list('EMAIL_RECIPIENTS')
    # Error/success lists fall back to the general recipient list when unset.
    ERROR_EMAIL_RECIPIENTS = _split_env_list('ERROR_EMAIL_RECIPIENTS') or EMAIL_RECIPIENTS
    SUCCESS_EMAIL_RECIPIENTS = _split_env_list('SUCCESS_EMAIL_RECIPIENTS') or EMAIL_RECIPIENTS

    # Choose subject and attachment based on severity
    if has_errors:
        subject = "ARKIVO Import of Video/Audio Ran with Errors"
        # Attach the (timestamped) error log; fall back to the original path.
        attachment_to_send = _rename_log_if_nonempty(error_log) or error_log
        body = "Please find the attached error log file. Job started at %s and ended at %s, taking %.2f seconds." % (
            datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S'),
            elapsed_time
        )
        email_recipients=ERROR_EMAIL_RECIPIENTS
    elif has_warnings:
        subject = "ARKIVO Import of Video/Audio Completed with Warnings"
        # Attach the warnings log for investigation
        attachment_to_send = _rename_log_if_nonempty(warning_log) or warning_log
        body = "The import completed with warnings. Please find the attached warning log. Job started at %s and ended at %s, taking %.2f seconds." % (
            datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S'),
            elapsed_time
        )
        # Warnings are routed to the error recipients list as well.
        email_recipients=ERROR_EMAIL_RECIPIENTS
    else:
        subject = "ARKIVO Video/Audio Import Completed Successfully"
        # No attachment for clean success
        attachment_to_send = None
        body = "The import of media (video/audio) completed successfully without any errors or warnings. Job started at %s and ended at %s, taking %.2f seconds." % (
            datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S'),
            elapsed_time
        )
        email_recipients=SUCCESS_EMAIL_RECIPIENTS

    logging.info("Sending summary email: %s (attach: %s)", subject, bool(attachment_to_send))
    # Send email
    try:
        send_email_with_attachment(
            subject=subject,
            body=body,
            attachment_path=attachment_to_send,
            email_recipients=email_recipients
        )
    except Exception as e:
        # Email failure is logged but must not crash the import run.
        logging.error("Failed to send summary email: %s", e)
    return


if __name__ == "__main__":
    try:
        # Setup logging using standard TimedRotatingFileHandler handlers.
        # No manual doRollover calls — rely on the handler's built-in rotation.
logger, rotating_handler, error_handler, warning_handler = setup_logging() # Load configuration settings aws_config, db_config, ach_config, bucket_name, ach_variables = config.load_config() logging.info("Config loaded, logging setup done") # Run the main process main_process(aws_config, db_config, ach_config, bucket_name, ach_variables) logging.info("Main process completed at: %s", datetime.now().strftime('%Y-%m-%d %H:%M:%S')) except Exception as e: logging.error(f"An error occurred: {e}")