# NOTE(review): removed a stray paste artifact ("349 lines / 16 KiB / Python",
# repeated) that was not valid Python source.
import json
import logging
import os
from logging.handlers import RotatingFileHandler

from botocore.exceptions import ClientError

# from config import load_config, aws_config, bucket_name
import config
from error_handler import handle_error
from utils import check_video_info, check_audio_info
def retrieve_file_contents(s3, base_name):
    """Retrieve the .json and .md5 sidecar files for *base_name* from S3.

    Parameters:
    - s3: boto3 S3 client.
    - base_name: S3 object key without extension; ``.json`` / ``.md5`` are appended.

    Returns:
    A JSON-formatted string mapping 'json' and 'md5' to the contents of the
    corresponding sidecar objects (parsed as JSON where possible, raw string
    otherwise). Missing sidecars are simply omitted. On an unexpected
    retrieval error an empty JSON object ('{}') is returned instead of
    raising, so callers never see a traceback from this function.

    Raises:
    - TypeError / ValueError: if the collected contents cannot be serialized.
    """
    file_contents = {}

    # Only the bucket name is needed from the project configuration.
    # aws_config, db_config, ach_config, bucket_name, ach_variables = config.load_config()
    _, _, _, bucket_name, _ = config.load_config()

    try:
        # (S3 extension, result key) pairs for the sidecar files we expect.
        file_extensions = [['json', 'json'], ['md5', 'md5']]

        for ext_pair in file_extensions:
            file_name = f"{base_name}.{ext_pair[0]}"

            try:
                response = s3.get_object(Bucket=bucket_name, Key=file_name)
                file_contents[ext_pair[1]] = response['Body'].read().decode('utf-8')
                logging.info(f"Retrieved {ext_pair[1]} file content for base_name {base_name}.")
            except ClientError as e:
                # S3 returns a NoSuchKey error code when the key is missing.
                code = e.response.get('Error', {}).get('Code', '')
                if code in ('NoSuchKey', '404', 'NotFound'):
                    # Treat missing sidecars as non-fatal; continue with the rest.
                    logging.warning(f"{file_name} not found in S3 (code={code}).")
                    continue
                logging.error(f"Error retrieving {file_name}: {e}", exc_info=True)
                # Re-raise other ClientError types; the outer handler turns
                # them into the '{}' fallback below.
                raise
    except Exception as e:
        logging.error(f'Error retrieving file contents for {base_name}: {e}', exc_info=True)
        # Return an empty JSON structure instead of raising to avoid tracebacks
        # in callers. (The previous try/except around json.dumps({}) was dead
        # code: serializing an empty dict cannot fail.)
        return '{}'

    # Clean and format file_contents as proper JSON.
    try:
        cleaned_contents = {}

        for key, value in file_contents.items():
            if isinstance(value, str):
                # Remove trailing newlines or any other stray whitespace.
                cleaned_value = value.strip()

                # Parse embedded JSON where possible; keep the raw string otherwise.
                try:
                    cleaned_contents[key] = json.loads(cleaned_value)
                except json.JSONDecodeError:
                    cleaned_contents[key] = cleaned_value
            else:
                cleaned_contents[key] = value

        # Return the cleaned and formatted JSON.
        return json.dumps(cleaned_contents, indent=4)
    except (TypeError, ValueError) as e:
        logging.error(f'Error formatting file contents as JSON: {e}', exc_info=True)
        # Bare raise preserves the original traceback (was `raise e`).
        raise
def check_related_files(s3, file_name_with_path, file, bucket_name):
    """
    Check for related sidecar files in S3 based on the given file type.

    Parameters:
    - s3: The S3 client object.
    - file_name_with_path: The name of the file with its path (no extension).
    - file: The file name (its extension selects the required sidecars).
    - bucket_name: The name of the S3 bucket. NOTE: this parameter is
      overridden by the configured bucket name below — TODO confirm whether
      callers rely on passing a different bucket.

    Returns:
    The size of the related .pdf file in bytes, or 0 when no PDF is required.

    Raises:
    - FileNotFoundError: If a required file is not found in S3.
    - ValueError: If a required file has zero size.
    """
    from s3_utils import check_file_exists_in_s3, get_file_size  # avoid circular import

    import config

    # Load the configuration from the .env file; only the bucket name is used.
    # aws_config, db_config, ach_config, bucket_name, ach_variables = config.load_config()
    _, _, _, bucket_name, _ = config.load_config()

    ach_pdf_disk_size = 0

    # Set required sidecar extensions based on the media type.
    if file.endswith('.mp4'):
        required_extensions = ['json', 'md5', 'pdf']
    elif file.endswith('.mp3'):
        required_extensions = ['json', 'md5']
    else:
        required_extensions = []

    logging.info(f"Required extensions: {required_extensions}")
    for ext in required_extensions:
        related_file = f"{file_name_with_path}.{ext}"
        logging.info(f"Checking for related file: {related_file}")

        try:
            if not check_file_exists_in_s3(s3, related_file, bucket_name):
                error_message = f"Required file {related_file} not found in S3."
                logging.error(error_message)
                raise FileNotFoundError(error_message)
            logging.info(f"Found related file: {related_file}")
        except FileNotFoundError as e:
            logging.error(f"Caught a FileNotFoundError: {e}")
            # Fixed: previously this handler swallowed the error it had just
            # raised, so a missing required file never reached the caller
            # despite being documented as fatal. Propagate it.
            raise
        except Exception as e:
            # Best-effort: an unexpected existence-check failure is logged but
            # does not abort the remaining checks.
            logging.error(f"Caught an unexpected exception: {e}", exc_info=True)

        # Check the size of the related file.
        try:
            if ext in ['json', 'md5', 'pdf']:
                file_size = get_file_size(s3, bucket_name, related_file)
                if file_size == 0:
                    error_message = f"File {related_file} has zero size."
                    logging.error(error_message)
                    raise ValueError(error_message)
                logging.info(f"File {related_file} size: {file_size}")
        except ValueError as e:
            logging.error(f"Caught a ValueError file Size is zero: {e}")
            # Re-raise the original exception instead of constructing a new
            # one (the old code discarded the original context).
            raise
        except Exception as e:
            logging.error(f"Caught an unexpected exception: {e}", exc_info=True)

        # If the required file is a .pdf, record its size for the caller.
        if ext == 'pdf':
            pdf_file = f"{file_name_with_path}.pdf"
            if check_file_exists_in_s3(s3, pdf_file, bucket_name):
                pdf_file_size = get_file_size(s3, bucket_name, pdf_file)
                ach_pdf_disk_size = pdf_file_size
                logging.info(f"PDF disk size: {ach_pdf_disk_size}")
            else:
                logging.error(f"PDF file {pdf_file} not found.")
                raise FileNotFoundError(f"PDF file {pdf_file} not found.")

    return ach_pdf_disk_size
def extract_and_validate_file_info(file_contents, file, ach_variables):
    """Extract media metadata from sidecar JSON contents and validate it.

    Parameters:
    - file_contents: dict with key 'json' (parsed sidecar metadata — mediainfo,
      ffprobe, or a combined document) and optionally 'md5'.
    - file: the media file name; its extension (.mp4/.mp3) selects the
      video/audio validation path.
    - ach_variables: mutable dict; 'custom_data_in' is written here and
      'file_fullpath' is read for basename cross-checking.

    Returns:
    (ach_custom_data_in, ach_disk_size, ach_conservative_copy_extension):
    the normalized metadata dict, the file size from the mediainfo "General"
    track (int, or None when absent), and its file extension as '.ext'
    (or None when absent).

    Raises:
    - ValueError: if the JSON contains no recognized metadata, or the media
      basename matches neither the mediainfo nor the ffprobe reference.
    """
    # Extract relevant information from the nested JSON.
    ach_custom_data_in = file_contents

    # Check if the JSON contains mediainfo metadata, ffprobe metadata, or both.
    logging.info(f"Extracted JSON contents: {ach_custom_data_in['json']}")
    # Check for keys at the first level.
    if 'mediainfo' in ach_custom_data_in['json'] and 'ffprobe' in ach_custom_data_in['json']:
        ach_variables['custom_data_in'] = {
            "mediainfo": ach_custom_data_in['json'].get('mediainfo', {}),
            "ffprobe": ach_custom_data_in['json'].get('ffprobe', {}),
            "filename": ach_custom_data_in['json'].get('filename', ''),
            "md5": ach_custom_data_in.get('md5', '')
        }
        logging.info("mediainfo and ffprobe metadata found in JSON file.")
    # Check for keys at the second level if it is not already ordered.
    # Fixed: the default for the chained .get() was '' (a str), which would
    # have raised AttributeError; use {} so the chain is safe.
    elif 'creatingLibrary' in ach_custom_data_in['json'] and ach_custom_data_in['json'].get('creatingLibrary', {}).get('name', '') == 'MediaInfoLib':
        ach_variables['custom_data_in'] = {
            "mediainfo": ach_custom_data_in['json'],
            "md5": ach_custom_data_in.get('md5', '')
        }
        logging.info("mediainfo metadata found in JSON file.")
    elif 'streams' in ach_custom_data_in['json']:
        ach_variables['custom_data_in'] = {
            "ffprobe": ach_custom_data_in['json'],
            "md5": ach_custom_data_in.get('md5', '')
        }
        logging.info("ffprobe metadata found in JSON file.")
    else:
        ach_variables['custom_data_in'] = {
            "md5": ach_custom_data_in.get('md5', '')
        }
        logging.error(f"No recognized data found in JSON file.{ach_custom_data_in} - {file_contents}")
        # Unrecognized metadata is fatal.
        raise ValueError("No recognized data found in JSON file.")

    logging.info(f"Extracted JSON contents: {ach_variables['custom_data_in']}")

    # Extract FileExtension and FileSize from the track whose "@type" is "General".
    ach_disk_size = None
    # Fixed: previously unbound when no "General" track existed, producing a
    # NameError at the return statement.
    ach_conservative_copy_extension = None
    tracks = ach_variables['custom_data_in'].get('mediainfo', {}).get('media', {}).get('track', [])

    for track in tracks:
        if track.get('@type') == 'General':
            # Retrieve the disk size from the General track.
            ach_disk_size = track.get('FileSize', None)
            logging.info(f"Disk size from JSON media.track.General: {ach_disk_size}")

            # Retrieve the file extension from the General track.
            # Fixed: '.' + None raised TypeError when FileExtension was missing.
            file_extension = track.get('FileExtension')
            if file_extension is not None:
                ach_conservative_copy_extension = '.' + file_extension
            logging.info(f"FileExtension JSON media.track.General: {ach_conservative_copy_extension}")

            break  # Exit the loop after finding the General track

    # Convert ach_disk_size to an integer if found.
    if ach_disk_size is not None:
        ach_disk_size = int(ach_disk_size)

    # Fixed: file_name was unbound below when neither branch ran.
    file_name = None

    # MEDIAINFO: truncate media.@ref to its last two path components.
    if "mediainfo" in ach_variables['custom_data_in'] and "media" in ach_variables['custom_data_in'].get("mediainfo"):
        media_ref = ach_variables['custom_data_in'].get('mediainfo', {}).get("media", {}).get("@ref", "")

        # Normalize backslash path separators.
        media_ref = media_ref.replace("\\", "/")
        logging.info(f"Media ref mediainfo: {media_ref}")
        # Keep only the last directory + file name. join(parts[-2:]) equals
        # parts[-2] + '/' + parts[-1] for full paths, but does not raise
        # IndexError for a bare file name.
        parts = media_ref.split('/')
        file_name = '/'.join(parts[-2:])
        logging.info(f"Media file name (copia conservativa): {file_name}")

        # Update the @ref field with the truncated file name.
        ach_variables['custom_data_in']["mediainfo"]["media"]["@ref"] = file_name
        logging.info(f"Updated the truncated file_name at mediainfo.media.@ref {ach_variables['custom_data_in']['mediainfo']['media']['@ref']}")
    else:
        logging.warning("mediainfo.media.@ref not found in JSON file.")

    # FFPROBE: same truncation for format.filename.
    if "ffprobe" in ach_variables['custom_data_in'] and "format" in ach_variables['custom_data_in'].get("ffprobe"):
        media_ref = ach_variables['custom_data_in'].get('ffprobe', {}).get("format", {}).get("filename", "")

        # Normalize backslash path separators.
        media_ref = media_ref.replace("\\", "/")
        logging.info(f"Media ref ffprobe: {media_ref}")
        parts = media_ref.split('/')
        file_name = '/'.join(parts[-2:])
        logging.info(f"Media file name (copia conservativa): {file_name}")

        # Update the filename field with the truncated file name.
        ach_variables['custom_data_in']["ffprobe"]["format"]["filename"] = file_name
        # Fixed: this log previously dereferenced mediainfo.media.@ref, which
        # raised KeyError when only ffprobe metadata was present.
        logging.info(f"Updated the truncated file_name at ffprobe.format.filename {ach_variables['custom_data_in']['ffprobe']['format']['filename']}")
    else:
        logging.warning("ffprobe.format.filename not found in JSON file.")

    if file_name is not None:
        logging.info(f"Updated the truncated file_name at mediainfo.media.@ref {file_name}")
    logging.info(f"JSON contents: {ach_variables['custom_data_in']}")

    # Normalize custom_data_in to a dict if it is a JSON string.
    if isinstance(ach_variables['custom_data_in'], str):
        ach_custom_data_in = json.loads(ach_variables['custom_data_in'])
    else:
        ach_custom_data_in = ach_variables['custom_data_in']

    # Cross-check the media basename against the names inside the JSON.
    json_ref_mediainfo_path = ach_custom_data_in.get('mediainfo', {}).get("media", {}).get("@ref", "")
    json_ref_ffprobe_path = ach_custom_data_in.get('ffprobe', {}).get("format", {}).get("filename", "")
    logging.info(f"JSON file names: mediainfo: '{json_ref_mediainfo_path}', ffprobe: '{json_ref_ffprobe_path}', ach_file_fullpath: '{ach_variables['file_fullpath']}'")

    # Extract base names; the '_H264' proxy suffix is not part of the
    # conservative-copy name, so strip it before comparing.
    basename_fullpath = os.path.splitext(os.path.basename(ach_variables['file_fullpath']))[0]
    basename_fullpath = basename_fullpath.replace('_H264', '')
    basename_mediainfo = os.path.splitext(os.path.basename(json_ref_mediainfo_path))[0]
    basename_ffprobe = os.path.splitext(os.path.basename(json_ref_ffprobe_path))[0]

    # A mismatch with one source is only a warning ...
    if basename_fullpath != basename_mediainfo:
        logging.warning(f"ach_file_fullpath '{basename_fullpath}' does not match JSON mediainfo file name '{basename_mediainfo}'.")
    else:
        logging.info(f"ach_file_fullpath '{basename_fullpath}' matches JSON mediainfo file name '{basename_mediainfo}'.")

    if basename_fullpath != basename_ffprobe:
        logging.warning(f"ach_file_fullpath '{basename_fullpath}' does not match JSON ffprobe file name '{basename_ffprobe}'.")
    else:
        logging.info(f"ach_file_fullpath '{basename_fullpath}' matches JSON ffprobe file name '{basename_ffprobe}'.")

    # ... but a mismatch with both sources is fatal.
    if basename_fullpath != basename_mediainfo and basename_fullpath != basename_ffprobe:
        logging.error(f"ach_file_fullpath '{basename_fullpath}' does not match either JSON file name '{basename_mediainfo}' or '{basename_ffprobe}'.")
        raise ValueError(f"ach_file_fullpath '{basename_fullpath}' does not match either JSON file name '{basename_mediainfo}' or '{basename_ffprobe}'.")

    # Validate the metadata for the given media type. Validation problems are
    # logged but deliberately not propagated (handle_error is disabled).
    try:
        if file.endswith('.mp4'):
            result, message = check_video_info(ach_custom_data_in.get('mediainfo', {}))
            logging.info(f"Validation result for {file}: {message}")
        elif file.endswith('.mp3'):
            result, message = check_audio_info(ach_custom_data_in.get('mediainfo', {}))
            logging.info(f"Validation result for {file}: {message}")
        else:
            # Handle cases where the file type is not supported.
            raise ValueError(f"Unsupported file type: {file}")

        # Record a validation failure.
        if not result:
            error_message = f"Validation failed for {file}: {message}"
            logging.error(error_message)
            # handle_error(ValueError(error_message))  # deliberately disabled
    except ValueError as e:
        logging.error(f"Caught a ValueError: {e}")
        # handle_error(e)  # deliberately disabled
    except Exception as e:
        logging.error(f"Caught an unexpected exception: {e}", exc_info=True)
        # handle_error(e)  # deliberately disabled

    # 'filename' is internal bookkeeping; do not hand it back to callers.
    ach_custom_data_in.pop('filename', None)
    return ach_custom_data_in, ach_disk_size, ach_conservative_copy_extension
def is_file_empty(file_path):
    """Return True if *file_path* exists and has a size of zero bytes."""
    if not os.path.exists(file_path):
        return False
    return os.path.getsize(file_path) == 0
# NOTE(review): read_file below appears to be unused within this module —
# confirm against callers before removing.
def read_file(file_path):
    """Read and return the entire contents of the text file at *file_path*.

    Raises:
    - FileNotFoundError: if the file does not exist (logged first).
    - IOError: on any other I/O failure (logged first).
    """
    try:
        with open(file_path, 'r') as handle:
            contents = handle.read()
    except FileNotFoundError as err:
        logging.error(f"File not found: {err}")
        raise err
    except IOError as err:
        logging.error(f"IO error: {err}")
        raise err
    return contents
def write_file(file_path, content):
    """Write *content* to *file_path*, replacing any existing contents.

    Raises:
    - IOError: on any I/O failure (logged first).
    """
    try:
        with open(file_path, 'w') as handle:
            handle.write(content)
    except IOError as err:
        logging.error(f"IO error: {err}")
        raise err