import os
import logging
from logging.handlers import RotatingFileHandler
import json

from utils import check_video_info, check_audio_info
from error_handler import handle_error
from botocore.exceptions import ClientError
import config


def retrieve_file_contents(s3, base_name):
    """Fetch the .json and .md5 sidecar objects for *base_name* from S3.

    Parameters:
    - s3: boto3 S3 client.
    - base_name: S3 key without extension; '<base_name>.json' and
      '<base_name>.md5' are fetched.

    Returns:
    A JSON string of the form '{"json": <parsed-or-raw>, "md5": <raw>}'.
    Missing sidecars are skipped (non-fatal). Any other failure returns '{}'
    instead of raising, so callers never see a traceback from here.
    """
    file_contents = {}
    # Only the bucket name is needed from the loaded configuration.
    _, _, _, bucket_name, _ = config.load_config()
    try:
        # (extension on disk, key in the result dict)
        for ext, result_key in (('json', 'json'), ('md5', 'md5')):
            file_name = f"{base_name}.{ext}"
            try:
                response = s3.get_object(Bucket=bucket_name, Key=file_name)
                file_contents[result_key] = response['Body'].read().decode('utf-8')
                logging.info(f"Retrieved {result_key} file content for base_name {base_name}.")
            except ClientError as e:
                # S3 returns NoSuchKey for a missing key (404/NotFound via HEAD).
                code = e.response.get('Error', {}).get('Code', '')
                if code in ('NoSuchKey', '404', 'NotFound'):
                    logging.warning(f"{file_name} not found in S3 (code={code}).")
                    continue  # missing sidecars are non-fatal
                logging.error(f"Error retrieving {file_name}: {e}", exc_info=True)
                raise  # propagate any other S3 client error
    except Exception as e:
        logging.error(f'Error retrieving file contents for {base_name}: {e}', exc_info=True)
        # Return an empty JSON object instead of raising, to avoid tracebacks
        # in callers. (json.dumps({}) is just '{}' — no try/except needed.)
        return '{}'

    # Clean and format file_contents as proper JSON.
    try:
        cleaned_contents = {}
        for key, value in file_contents.items():
            if isinstance(value, str):
                # Strip trailing newlines / surrounding whitespace, then try
                # to parse the payload as JSON; fall back to the raw string.
                cleaned_value = value.strip()
                try:
                    cleaned_contents[key] = json.loads(cleaned_value)
                except json.JSONDecodeError:
                    cleaned_contents[key] = cleaned_value
            else:
                cleaned_contents[key] = value
        return json.dumps(cleaned_contents, indent=4)
    except (TypeError, ValueError) as e:
        logging.error(f'Error formatting file contents as JSON: {e}', exc_info=True)
        raise


def check_related_files(s3, file_name_with_path, file, bucket_name, s3_listing_cache=None):
    """
    Check that the sidecar files required for *file* exist in S3 and are
    non-empty, and return the size of the .pdf sidecar when one is required.

    Parameters:
    - s3: The S3 client object.
    - file_name_with_path: The name of the file with its path (S3 key prefix).
    - file: The file name (only its extension is inspected).
    - bucket_name: NOTE(review): this argument is overwritten below by the
      value loaded from config — the caller's value is ignored. Preserved
      for signature compatibility.
    - s3_listing_cache: Optional mapping of S3 keys to listing objects;
      when provided, existence/size are read from it instead of S3 calls.

    Returns:
    The .pdf sidecar size in bytes, or 0 when no .pdf is required.

    Raises:
    - FileNotFoundError: If a required .pdf file is not found in S3.
    - ValueError: If a related file has zero size.
    (A missing .json/.md5 is logged but deliberately swallowed by the
    existence check; the size check then reports it as zero/missing.)
    """
    from s3_utils import check_file_exists_in_s3, get_file_size  # avoid circular import
    import config

    # Load the configuration; only the bucket name is used here.
    _, _, _, bucket_name, _ = config.load_config()

    ach_pdf_disk_size = 0

    # Required sidecar extensions depend on the media type.
    if file.endswith('.mp4'):
        required_extensions = ['json', 'md5', 'pdf']
    elif file.endswith('.mp3'):
        required_extensions = ['json', 'md5']
    else:
        required_extensions = []
    logging.info(f"Required extensions: {required_extensions}")

    def _exists(key):
        # Prefer the pre-fetched listing to avoid a per-key S3 round-trip.
        if s3_listing_cache:
            return key in s3_listing_cache
        return check_file_exists_in_s3(s3, key, bucket_name)

    def _size(key):
        # Prefer the pre-fetched listing to avoid a per-key S3 round-trip.
        if s3_listing_cache and key in s3_listing_cache:
            return s3_listing_cache[key].get('Size')
        return get_file_size(s3, bucket_name, key)

    for ext in required_extensions:
        related_file = f"{file_name_with_path}.{ext}"
        logging.info(f"Checking for related file: {related_file}")

        # Existence check. The FileNotFoundError is caught immediately below
        # (original behavior): a missing file is logged here and surfaces via
        # the size check instead.
        try:
            if not _exists(related_file):
                error_message = f"Required file {related_file} not found in S3."
                logging.error(error_message)
                raise FileNotFoundError(error_message)
            logging.info(f"Found related file: {related_file}")
        except FileNotFoundError as e:
            logging.error(f"Caught a FileNotFoundError: {e}")
        except Exception as e:
            logging.error(f"Caught an unexpected exception: {e}")

        # Size check: a zero-size or missing sidecar is fatal.
        try:
            file_size = _size(related_file)
            if file_size == 0 or file_size is None:
                error_message = f"File {related_file} has zero size or missing."
                logging.error(error_message)
                raise ValueError(error_message)
            logging.info(f"File {related_file} size: {file_size}")
        except ValueError as e:
            logging.error(f"Caught a ValueError file Size is zero: {e}")
            raise ValueError(f"File {related_file} has zero size.")
        except Exception as e:
            logging.error(f"Caught an unexpected exception: {e}")

        # If the required file is a .pdf, record its size for the caller.
        if ext == 'pdf':
            pdf_file = f"{file_name_with_path}.pdf"
            if _exists(pdf_file):
                ach_pdf_disk_size = _size(pdf_file)
            else:
                logging.error(f"PDF file {pdf_file} not found.")
                raise FileNotFoundError(f"PDF file {pdf_file} not found.")

    return ach_pdf_disk_size


def extract_and_validate_file_info(file_contents, file, ach_variables):
    """
    Classify the sidecar JSON (mediainfo / ffprobe / both), truncate the media
    path references inside it to their last two components, cross-check them
    against ach_variables['file_fullpath'], and validate the media metadata.

    Parameters:
    - file_contents: dict with key 'json' (parsed sidecar JSON) and
      optionally 'md5' (raw checksum string).
    - file: the media file name (only its extension is inspected).
    - ach_variables: mutable dict; 'custom_data_in' is written here and
      'file_fullpath' is read.

    Returns:
    (ach_custom_data_in, ach_disk_size, ach_conservative_copy_extension) —
    the last two are None when the mediainfo 'General' track is absent or
    lacks FileSize / FileExtension.

    Raises:
    - ValueError: if the JSON contains no recognizable metadata, or the file
      base name matches neither the mediainfo nor the ffprobe reference.
    - KeyError: if file_contents has no 'json' key.
    """
    ach_custom_data_in = file_contents
    sidecar_json = ach_custom_data_in['json']
    logging.info(f"Extracted JSON contents: {ach_custom_data_in['json']}")

    # Classify the payload: combined, raw MediaInfo, or raw ffprobe output.
    if 'mediainfo' in sidecar_json and 'ffprobe' in sidecar_json:
        ach_variables['custom_data_in'] = {
            "mediainfo": sidecar_json.get('mediainfo', {}),
            "ffprobe": sidecar_json.get('ffprobe', {}),
            "filename": sidecar_json.get('filename', ''),
            "md5": ach_custom_data_in.get('md5', '')
        }
        logging.info("mediainfo and ffprobe metadata found in JSON file.")
    elif ('creatingLibrary' in sidecar_json
          and isinstance(sidecar_json.get('creatingLibrary'), dict)
          and sidecar_json['creatingLibrary'].get('name', '') == 'MediaInfoLib'):
        # Raw MediaInfo document: the whole JSON is the mediainfo payload.
        # (isinstance guard fixes an AttributeError when creatingLibrary is
        # present but not a dict.)
        ach_variables['custom_data_in'] = {
            "mediainfo": sidecar_json,
            "md5": ach_custom_data_in.get('md5', '')
        }
        logging.info("mediainfo metadata found in JSON file.")
    elif 'streams' in sidecar_json:
        # Raw ffprobe document.
        ach_variables['custom_data_in'] = {
            "ffprobe": sidecar_json,
            "md5": ach_custom_data_in.get('md5', '')
        }
        logging.info("ffprobe metadata found in JSON file.")
    else:
        ach_variables['custom_data_in'] = {
            "md5": ach_custom_data_in.get('md5', '')
        }
        logging.error(f"No recognized data found in JSON file.{ach_custom_data_in} - {file_contents}")
        raise ValueError("No recognized data found in JSON file.")

    logging.info(f"Extracted JSON contents: {ach_variables['custom_data_in']}")

    # Extract FileSize and FileExtension from the mediainfo 'General' track.
    ach_disk_size = None
    # Fix: initialize here — the original only assigned this inside the loop,
    # raising NameError at return when no 'General' track existed.
    ach_conservative_copy_extension = None
    tracks = ach_variables['custom_data_in'].get('mediainfo', {}).get('media', {}).get('track', [])
    if isinstance(tracks, dict):
        # Presumably MediaInfo emits a bare object for a single track —
        # normalize so the loop below sees dicts, not dict keys.
        tracks = [tracks]
    for track in tracks:
        if track.get('@type') == 'General':
            ach_disk_size = track.get('FileSize', None)
            logging.info(f"Disk size from JSON media.track.General: {ach_disk_size}")
            file_extension = track.get('FileExtension')
            # Fix: original did '.' + track.get('FileExtension', None),
            # a TypeError when the key is absent.
            if file_extension is not None:
                ach_conservative_copy_extension = '.' + file_extension
            logging.info(f"FileExtension JSON media.track.General: {ach_conservative_copy_extension}")
            break  # only the General track matters

    if ach_disk_size is not None:
        ach_disk_size = int(ach_disk_size)

    def _truncate_ref(path):
        """Return the last two components of *path* ('dir/file').

        Fix: the original indexed split('/')[-2] unconditionally, an
        IndexError for single-component paths; those now pass through as-is.
        """
        parts = path.split('/')
        if len(parts) >= 2:
            return parts[-2] + '/' + parts[-1]
        return path

    # MEDIAINFO: truncate media.@ref to its last two path components.
    if "mediainfo" in ach_variables['custom_data_in'] and "media" in ach_variables['custom_data_in'].get("mediainfo"):
        media_ref = ach_variables['custom_data_in'].get('mediainfo', {}).get("media", {}).get("@ref", "")
        # Normalize Windows-style backslashes before splitting.
        media_ref = media_ref.replace("\\", "/")
        logging.info(f"Media ref medianfo: {media_ref}")
        file_name = _truncate_ref(media_ref)
        logging.info(f"Media file name (copia conservativa): {file_name}")
        ach_variables['custom_data_in']["mediainfo"]["media"]["@ref"] = file_name
        logging.info(f"Updated the truncated file_name at mediainfo.media.@ref {ach_variables['custom_data_in']['mediainfo']['media']['@ref']}")
    else:
        logging.warning("mediainfo.media.@ref not found in JSON file.")

    # FFPROBE: truncate format.filename the same way.
    if "ffprobe" in ach_variables['custom_data_in'] and "format" in ach_variables['custom_data_in'].get("ffprobe"):
        media_ref = ach_variables['custom_data_in'].get('ffprobe', {}).get("format", {}).get("filename", "")
        media_ref = media_ref.replace("\\", "/")
        # Fix: log label said 'medianfo' (copy-paste from the branch above).
        logging.info(f"Media ref ffprobe: {media_ref}")
        file_name = _truncate_ref(media_ref)
        logging.info(f"Media file name (copia conservativa): {file_name}")
        ach_variables['custom_data_in']["ffprobe"]["format"]["filename"] = file_name
        # Fix: the original logged mediainfo.media.@ref here (copy-paste),
        # a KeyError for ffprobe-only sidecars.
        logging.info(f"Updated the truncated file_name at ffprobe.format.filename {ach_variables['custom_data_in']['ffprobe']['format']['filename']}")
    else:
        logging.warning("ffprobe.format.filename not found in JSON file.")
    # (A stray log line here used 'file_name', a NameError when neither
    # branch ran — removed.)

    logging.info(f"JSON contents: {ach_variables['custom_data_in']}")

    # custom_data_in is normally a dict, but tolerate a JSON string.
    if isinstance(ach_variables['custom_data_in'], str):
        ach_custom_data_in = json.loads(ach_variables['custom_data_in'])
    else:
        ach_custom_data_in = ach_variables['custom_data_in']

    # Cross-check the S3 file name against the names recorded in the JSON.
    json_ref_mediainfo_path = ach_custom_data_in.get('mediainfo', {}).get("media", {}).get("@ref", "")
    json_ref_ffprobe_path = ach_custom_data_in.get('ffprobe', {}).get("format", {}).get("filename", "")
    logging.info(f"JSON file names: mediainfo: '{json_ref_mediainfo_path}', ffprobe: '{json_ref_ffprobe_path}', ach_file_fullpath: '{ach_variables['file_fullpath']}'")

    basename_fullpath = os.path.splitext(os.path.basename(ach_variables['file_fullpath']))[0]
    # '_H264' presumably marks the transcoded copy — stripped for comparison.
    basename_fullpath = basename_fullpath.replace('_H264', '')
    basename_mediainfo = os.path.splitext(os.path.basename(json_ref_mediainfo_path))[0]
    basename_ffprobe = os.path.splitext(os.path.basename(json_ref_ffprobe_path))[0]

    if basename_fullpath != basename_mediainfo:
        logging.warning(f"ach_file_fullpath '{basename_fullpath}' does not match JSON mediainfo file name '{basename_mediainfo}'.")
    else:
        logging.info(f"ach_file_fullpath '{basename_fullpath}' matches JSON mediainfo file name '{basename_mediainfo}'.")

    if basename_fullpath != basename_ffprobe:
        logging.warning(f"ach_file_fullpath '{basename_fullpath}' does not match JSON ffprobe file name '{basename_ffprobe}'.")
    else:
        logging.info(f"ach_file_fullpath '{basename_fullpath}' matches JSON ffprobe file name '{basename_ffprobe}'.")

    # Check folder prefixes (e.g. FILE/ vs DBT/) match between S3 file and JSON refs.
    def _extract_prefix(path: str) -> str:
        if not path:
            return ''
        # Normalize separators to '/' so we can reliably split on a single character.
        normalized = path.replace('\\', '/').lstrip('/')
        return normalized.split('/', 1)[0] if '/' in normalized else normalized

    prefix_fullpath = _extract_prefix(ach_variables['file_fullpath'])
    prefix_mediainfo = _extract_prefix(json_ref_mediainfo_path)
    prefix_ffprobe = _extract_prefix(json_ref_ffprobe_path)
    if prefix_fullpath != prefix_mediainfo or prefix_fullpath != prefix_ffprobe:
        logging.warning(
            "Prefix mismatch for S3 file '%s': S3 prefix='%s' (fullpath='%s') vs JSON prefixes (mediainfo='%s' [%s], ffprobe='%s' [%s]).",
            ach_variables.get('file_fullpath'),
            prefix_fullpath,
            ach_variables.get('file_fullpath'),
            prefix_mediainfo,
            json_ref_mediainfo_path,
            prefix_ffprobe,
            json_ref_ffprobe_path,
        )

    # Matching at least one of the two references is mandatory.
    if basename_fullpath != basename_mediainfo and basename_fullpath != basename_ffprobe:
        logging.error(f"ach_file_fullpath '{basename_fullpath}' does not match either JSON file name '{basename_mediainfo}' or '{basename_ffprobe}'.")
        raise ValueError(f"ach_file_fullpath '{basename_fullpath}' does not match either JSON file name '{basename_mediainfo}' or '{basename_ffprobe}'.")

    # Validate the media metadata with the checker matching the file type.
    # Validation failures are logged, never raised (handle_error kept disabled).
    try:
        if file.endswith('.mp4'):
            result, message = check_video_info(ach_custom_data_in.get('mediainfo', {}))
            logging.info(f"Validation result for {file}: {message}")
        elif file.endswith('.mp3'):
            result, message = check_audio_info(ach_custom_data_in.get('mediainfo', {}))
            logging.info(f"Validation result for {file}: {message}")
        else:
            raise ValueError(f"Unsupported file type: {file}")
        if not result:
            error_message = f"Validation failed for {file}: {message}"
            logging.error(error_message)
            # handle_error(ValueError(error_message))
    except ValueError as e:
        logging.error(f"Caught a ValueError: {e}")
        # handle_error(e)
    except Exception as e:
        logging.error(f"Caught an unexpected exception: {e}")
        # handle_error(e)

    # 'filename' is internal bookkeeping only; drop it before returning.
    ach_custom_data_in.pop('filename', None)
    return ach_custom_data_in, ach_disk_size, ach_conservative_copy_extension


def is_file_empty(file_path):
    """Return True if *file_path* exists on local disk and has zero size."""
    return os.path.exists(file_path) and os.path.getsize(file_path) == 0


# unused function
def read_file(file_path):
    """Read and return the whole text content of *file_path*.

    Raises FileNotFoundError / IOError after logging them.
    """
    try:
        with open(file_path, 'r') as fh:
            return fh.read()
    except FileNotFoundError as e:
        logging.error(f"File not found: {e}")
        raise
    except IOError as e:
        logging.error(f"IO error: {e}")
        raise


def write_file(file_path, content):
    """Write *content* to *file_path*, replacing any existing content.

    Raises IOError after logging it.
    """
    try:
        with open(file_path, 'w') as fh:
            fh.write(content)
    except IOError as e:
        logging.error(f"IO error: {e}")
        raise