before -refactor
This commit is contained in:
parent
74afbad9a8
commit
6030cc3f84
139
main.py
139
main.py
|
|
@ -12,6 +12,7 @@ from error_handler import handle_general_error, handle_file_not_found_error, han
|
||||||
from file_utils import is_file_empty
|
from file_utils import is_file_empty
|
||||||
from db_utils import count_files, get_distinct_filenames_from_db
|
from db_utils import count_files, get_distinct_filenames_from_db
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
from validation_utils import validate_inventory_code, analyze_pattern_match, validate_icode_extension
|
||||||
import config
|
import config
|
||||||
import psycopg2
|
import psycopg2
|
||||||
|
|
||||||
|
|
@ -21,52 +22,6 @@ import re
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
||||||
def analyze_pattern_match(text, description):
    """Report which positions of a 12-character inventory code are malformed.

    The expected layout is ``[VA][OC]-XXX-NNNNN`` where ``X`` is an upper-case
    letter or digit and ``N`` is a digit (the same shape as the regex
    ``^[VA][OC]-[A-Z0-9]{3}-\\d{5}$``).

    Args:
        text: Candidate code. Callers are expected to pass a value already
            truncated to its first 12 characters.
        description: Label used only in the message for empty/None input.

    Returns:
        A list of human-readable issue strings; an empty list means every
        position matched. When the length is not 12 a single length-mismatch
        message is returned and no per-position checks are run.
    """
    if not text:
        return [f"{description}: Empty or None text"]

    expected_length = 12  # [VA][OC]-[3 chars]-[5 digits]
    if len(text) != expected_length:
        return [f"Length mismatch: expected {expected_length}, got {len(text)}"]

    def one_of(*allowed):
        return lambda ch: ch in allowed

    def is_code_char(ch):
        return bool(re.match(r'^[A-Z0-9]$', ch))

    def is_digit(ch):
        return ch.isdigit()

    # One (predicate, label) pair per character slot, in positional order.
    slot_rules = (
        [(one_of('V', 'A'), "[V,A]"), (one_of('O', 'C'), "[O,C]"), (one_of('-'), "'-'")]
        + [(is_code_char, "[A-Z0-9]")] * 3
        + [(one_of('-'), "'-'")]
        + [(is_digit, "digit")] * 5
    )

    problems = []
    for position, (accepts, label) in enumerate(slot_rules, start=1):
        ch = text[position - 1]
        if not accepts(ch):
            problems.append(f"Position {position}: Expected {label}, got '{ch}'")
    return problems
|
|
||||||
|
|
||||||
# MAIN PROCESS
|
# MAIN PROCESS
|
||||||
def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
|
|
@ -99,7 +54,7 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
# Create the S3 client
|
# Create the S3 client
|
||||||
s3_client = create_s3_client(aws_config)
|
s3_client = create_s3_client(aws_config)
|
||||||
|
|
||||||
# List S3 bucket contents
|
# List S3 bucket contents
|
||||||
list_s3_files = list_s3_bucket(s3_client, bucket_name)
|
list_s3_files = list_s3_bucket(s3_client, bucket_name)
|
||||||
|
|
||||||
# Define valid extensions and excluded folders
|
# Define valid extensions and excluded folders
|
||||||
|
|
@ -137,31 +92,25 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
|
|
||||||
# check inventory code syntax
|
# check inventory code syntax
|
||||||
# first check s3_file_names if the file base name and folder name match pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$'
|
# first check s3_file_names if the file base name and folder name match pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}_\d{2}$'
|
||||||
pattern = r'^[VA][OC]-[A-Z0-9]{3}-\d{5}$'
|
s3_validated_contents = []
|
||||||
contents = []
|
|
||||||
|
|
||||||
for s3file in s3_file_names:
|
for s3file in s3_file_names:
|
||||||
# s3_file_names contains the object keys (strings), not dicts.
|
# s3_file_names contains the object keys (strings), not dicts.
|
||||||
base_name = os.path.basename(s3file)
|
base_name = os.path.basename(s3file)
|
||||||
# keep only first 12 chars
|
logging.info(f"S3 Base name: {base_name}")
|
||||||
base_name = base_name[:12]
|
if validate_inventory_code(base_name): # truncated to first 12 char in the function
|
||||||
logging.info(f"Base name: {base_name}")
|
|
||||||
folder_name = os.path.dirname(s3file)
|
|
||||||
# keep only first 12 chars of folder name as well
|
|
||||||
# folder_name = folder_name[:12]
|
|
||||||
# logging.info(f"Folder name: {folder_name}")
|
|
||||||
|
|
||||||
if re.match(pattern, base_name): # and re.match(pattern, folder_name):
|
|
||||||
logging.info(f"File {base_name} matches pattern.")
|
logging.info(f"File {base_name} matches pattern.")
|
||||||
contents.append(s3file)
|
# if valid check extension too
|
||||||
|
if not validate_icode_extension(s3file):
|
||||||
|
logging.warning(f"File {s3file} has invalid extension for its inventory code.")
|
||||||
|
continue # skip adding this file to validated contents
|
||||||
|
s3_validated_contents.append(s3file)
|
||||||
else:
|
else:
|
||||||
# Check base name
|
# Check base name in case of error
|
||||||
if not re.match(pattern, base_name):
|
base_issues = analyze_pattern_match(base_name, "Base name")
|
||||||
base_issues = analyze_pattern_match(base_name, "Base name")
|
logging.warning(f"Base name '{base_name}' does not match pattern. Issues: {base_issues}")
|
||||||
logging.warning(f"Base name '{base_name}' does not match pattern. Issues: {base_issues}")
|
folder_name = os.path.dirname(s3file)
|
||||||
|
logging.warning(f"File {s3file} in folder {folder_name} does not match pattern.")
|
||||||
logging.warning(f"File {base_name} in folder {folder_name} does not match pattern.")
|
|
||||||
|
|
||||||
|
|
||||||
# filter_s3_files_not_in_db
|
# filter_s3_files_not_in_db
|
||||||
# --- Get all DB filenames in one call ---
|
# --- Get all DB filenames in one call ---
|
||||||
|
|
@ -179,22 +128,14 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
db_sidecar_basenames.add(dbf[:-len(ext)])
|
db_sidecar_basenames.add(dbf[:-len(ext)])
|
||||||
break
|
break
|
||||||
|
|
||||||
file_names = []
|
|
||||||
for f in contents:
|
filtered_file_names=list_s3_not_in_db(s3_validated_contents, db_file_names, db_sidecar_basenames)
|
||||||
# exact key present in DB -> skip
|
|
||||||
if f in db_file_names:
|
|
||||||
continue
|
|
||||||
# strip extension to get basename and skip if DB has sidecar for it
|
|
||||||
base = os.path.splitext(f)[0]
|
|
||||||
if base in db_sidecar_basenames:
|
|
||||||
# logging.info("Skipping %s because DB already contains sidecar for basename %s", _visible_spaces(f), _visible_spaces(base))
|
|
||||||
continue
|
|
||||||
file_names.append(f)
|
|
||||||
|
|
||||||
# Print the total number of files
|
# Print the total number of files
|
||||||
total_files_s3 = len(contents)
|
total_files_s3 = len(s3_validated_contents)
|
||||||
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
|
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files in the S3 bucket before DB filter: {total_files_s3}")
|
||||||
total_files = len(file_names)
|
total_files = len(filtered_file_names)
|
||||||
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")
|
logging.info(f"Total number of the valid (mp3,mp4,md5,json,pdf) files after DB filter: {total_files}")
|
||||||
|
|
||||||
# Count files with .mp4 and .mp3 extensions
|
# Count files with .mp4 and .mp3 extensions
|
||||||
|
|
@ -222,9 +163,15 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
logging.info("Number of .mov files on S3 bucket (%s): %s", bucket_name, mov_count)
|
logging.info("Number of .mov files on S3 bucket (%s): %s", bucket_name, mov_count)
|
||||||
# logging.info(f"Number of .jpg files: {jpg_count}")
|
# logging.info(f"Number of .jpg files: {jpg_count}")
|
||||||
|
|
||||||
|
# should check all .mp4 should have base_name.endswith('_H264'):
|
||||||
|
for file in s3_file_names:
|
||||||
|
if file.endswith('.mp4'):
|
||||||
|
validate_mp4_file(file) # validation_utils.py - check also _H264 at the end
|
||||||
|
|
||||||
# If ACH_SAFE_RUN is 'false' we enforce strict mp4/pdf parity and abort
|
# If ACH_SAFE_RUN is 'false' we enforce strict mp4/pdf parity and abort
|
||||||
# when mismatched. Default is 'true' which skips this abort to allow
|
# when mismatched. Default is 'true' which skips this abort to allow
|
||||||
# safer runs during testing or manual reconciliation.
|
# safer runs during testing or manual reconciliation.
|
||||||
|
s3_files_filtered= []
|
||||||
if os.getenv('ACH_SAFE_RUN', 'true') == 'true':
|
if os.getenv('ACH_SAFE_RUN', 'true') == 'true':
|
||||||
if mp4_count != pdf_count:
|
if mp4_count != pdf_count:
|
||||||
logging.error("Number of .mp4 files is not equal to number of .pdf files")
|
logging.error("Number of .mp4 files is not equal to number of .pdf files")
|
||||||
|
|
@ -234,30 +181,35 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
# store tuples (source_file, expected_counterpart) for clearer logging
|
# store tuples (source_file, expected_counterpart) for clearer logging
|
||||||
missing_pdfs = [] # list of (mp4_file, expected_pdf)
|
missing_pdfs = [] # list of (mp4_file, expected_pdf)
|
||||||
missing_mp4s = [] # list of (pdf_file, expected_mp4)
|
missing_mp4s = [] # list of (pdf_file, expected_mp4)
|
||||||
for file in file_names:
|
for file in filtered_file_names:
|
||||||
if file.endswith('.mp4'):
|
if file.endswith('.mp4'):
|
||||||
# remove extension
|
# remove extension robustly using os.path.splitext to preserve any path prefix
|
||||||
base_name = file[:-4] # keeps any path prefix
|
base_name = os.path.splitext(file)[0]
|
||||||
# if the mp4 is an H264 variant (e.g. name_H264.mp4) remove the suffix
|
# if the mp4 is an H264 variant (e.g. name_H264.mp4) remove the suffix
|
||||||
if base_name.endswith('_H264'):
|
if base_name.endswith('_H264'):
|
||||||
|
# must check if it has an extra number for DBT and DVD and [FILE]
|
||||||
base_name = base_name[:-5]
|
base_name = base_name[:-5]
|
||||||
expected_pdf = base_name + '.pdf'
|
expected_pdf = base_name + '.pdf'
|
||||||
if expected_pdf not in file_names:
|
if expected_pdf not in filtered_file_names:
|
||||||
missing_pdfs.append((file, expected_pdf))
|
missing_pdfs.append((file, expected_pdf))
|
||||||
elif file.endswith('.pdf'):
|
elif file.endswith('.pdf'): # check if pdf as no .mp4
|
||||||
# Normalize base name and accept either the regular mp4 or the _H264 variant.
|
# Normalize base name and accept either the regular mp4 or the _H264 variant.
|
||||||
base_name = file[:-4]
|
# remove extension robustly using os.path.splitext
|
||||||
expected_mp4 = base_name + '.mp4'
|
base_name = os.path.splitext(file)[0]
|
||||||
h264_variant = base_name + '_H264.mp4'
|
expected_mp4 = base_name + '_H264.mp4'
|
||||||
# If neither the regular mp4 nor the H264 variant exists, report missing.
|
# If neither the regular mp4 nor the H264 variant exists, report missing.
|
||||||
if expected_mp4 not in file_names and h264_variant not in file_names:
|
if expected_mp4 not in filtered_file_names:
|
||||||
missing_mp4s.append((file, expected_mp4))
|
missing_mp4s.append((file, expected_mp4))
|
||||||
# report missing files
|
else:
|
||||||
|
# append to s3_files_filtered
|
||||||
|
s3_files_filtered.append(file)
|
||||||
|
continue
|
||||||
|
# report missing .pdf files
|
||||||
if missing_pdfs:
|
if missing_pdfs:
|
||||||
logging.error("Missing .pdf files (mp4 -> expected pdf):")
|
logging.error("Missing .pdf files (mp4 -> expected pdf):")
|
||||||
for mp4_file, expected_pdf in missing_pdfs:
|
for mp4_file, expected_pdf in missing_pdfs:
|
||||||
logging.error("%s -> %s", _visible_spaces(mp4_file), _visible_spaces(expected_pdf))
|
logging.error("%s -> %s", _visible_spaces(mp4_file), _visible_spaces(expected_pdf))
|
||||||
|
# report missing .mp4 files
|
||||||
if missing_mp4s:
|
if missing_mp4s:
|
||||||
logging.error("Missing .mp4 files (pdf -> expected mp4):")
|
logging.error("Missing .mp4 files (pdf -> expected mp4):")
|
||||||
for pdf_file, expected_mp4 in missing_mp4s:
|
for pdf_file, expected_mp4 in missing_mp4s:
|
||||||
|
|
@ -268,7 +220,14 @@ def main_process(aws_config, db_config, ach_config, bucket_name, ach_variables):
|
||||||
|
|
||||||
if mp3_count + mp4_count != json_count:
|
if mp3_count + mp4_count != json_count:
|
||||||
logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .json files")
|
logging.error("Number of .mp3 files + number of .mp4 files is not equal to number of .json files")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# add a check of mp3 + mp4 vs json and md5 files, like the mp4/pdf check above
|
||||||
logging.error("Abort Import Process due to missing files")
|
logging.error("Abort Import Process due to missing files")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# search wich file dont match TODO
|
# TODO: search which files don't match
|
||||||
raise ValueError("Inconsistent file counts mp3+mp4 vs json")
|
raise ValueError("Inconsistent file counts mp3+mp4 vs json")
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,410 @@
|
||||||
|
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
def analyze_pattern_match(text, description):
    """Explain why a 12-character inventory code base does not match.

    The base layout is ``[VA][OC]-XXX-NNNNN`` (upper-case letter or digit for
    ``X``, digit for ``N``), i.e. the shape of
    ``^[VA][OC]-[A-Z0-9]{3}-\\d{5}$``.

    Args:
        text: Candidate 12-character base (already truncated by the caller).
        description: Label used only in the message for empty/None input.

    Returns:
        list[str]: One message per offending position, in positional order.
        Empty when everything matches. If the length is not 12, only a single
        length-mismatch message is returned.
    """
    if not text:
        return [f"{description}: Empty or None text"]

    expected_length = 12  # Pattern: [VA][OC]-[3chars]-[5digits]
    actual_length = len(text)
    # This helper only diagnoses the 12-char base; bail out on any other length.
    if actual_length != expected_length:
        return [f"Length mismatch: expected {expected_length}, got {actual_length}"]

    def is_dash(ch):
        return ch == '-'

    def is_code_char(ch):
        return re.match(r'^[A-Z0-9]$', ch) is not None

    def is_digit(ch):
        return ch.isdigit()

    # (zero-based index, predicate, label shown in the message)
    checks = [
        (0, lambda ch: ch in ('V', 'A'), "[V,A]"),
        (1, lambda ch: ch in ('O', 'C'), "[O,C]"),
        (2, is_dash, "'-'"),
        (3, is_code_char, "[A-Z0-9]"),
        (4, is_code_char, "[A-Z0-9]"),
        (5, is_code_char, "[A-Z0-9]"),
        (6, is_dash, "'-'"),
        (7, is_digit, "digit"),
        (8, is_digit, "digit"),
        (9, is_digit, "digit"),
        (10, is_digit, "digit"),
        (11, is_digit, "digit"),
    ]

    return [
        f"Position {index + 1}: Expected {label}, got '{text[index]}'"
        for index, accepts, label in checks
        if not accepts(text[index])
    ]
|
||||||
|
|
||||||
|
def validate_inventory_code(inventory_code):
    """Check that a string starts with a well-formed 12-character inventory code.

    Only the first 12 characters are examined, so any trailing suffix
    (extension, file suffix, ...) is ignored by this check.

    Args:
        inventory_code (str): The code, or a longer name that begins with it.

    Returns:
        bool: True when the first 12 characters match
        ``[VA][OC]-[A-Z0-9]{3}-\\d{5}``; False for empty or non-string input or
        when the base does not match (per-position issues are logged as
        warnings in that case).
    """
    if not (isinstance(inventory_code, str) and inventory_code != ''):
        logging.warning("Empty or non-string inventory code provided to validate_inventory_code")
        return False

    # Only the 12-character base takes part in the match.
    base_code = inventory_code[:12]
    logging.info("Inventory Code: %s", base_code)

    if re.fullmatch(r'^[VA][OC]-[A-Z0-9]{3}-\d{5}$', base_code) is None:
        # Log a position-by-position diagnosis to make the rejection actionable.
        for problem in analyze_pattern_match(base_code, "Base name"):
            logging.warning("Inventory code base issue: %s", problem)
        logging.warning("Inventory code '%s' is invalid.", inventory_code)
        return False

    logging.info("Inventory code '%s' is valid.", inventory_code)
    return True
|
||||||
|
|
||||||
|
def validate_icode_extension(file_inventory_code):
    """Validate an inventory code together with its optional extension.

    Rules:
        - The base is the first 12 characters and must be accepted by
          :func:`validate_inventory_code`.
        - Total length must be <= 17 characters (12 base + up to 5 extension
          characters).
        - The extension (everything after position 12) is optional. When
          present it must fully match the pattern registered for the support
          type found in positions 4-6 of the base ('BTC', 'OA4', 'DVD', 'MCC').
          An extension on any other support type is rejected.

    Args:
        file_inventory_code (str): Inventory code (base + optional extension).
            NOTE(review): this is the code itself, not a file name or S3 key;
            anything longer than 17 characters raises.

    Returns:
        bool: True when the base is valid and the extension is absent or valid
        for the detected support type; False otherwise (including empty or
        non-string input).

    Raises:
        ValueError: If the provided string is longer than 17 characters.
    """
    # Mapping from 3-char support type to its extension pattern
    # (the pattern includes the leading underscore).
    file_type_to_regex = {
        'BTC': r'_\d{4}',
        'OA4': r'_\d{2}',
        'DVD': r'_\d{2}',
        'MCC': r'_[AB]',
    }

    if not isinstance(file_inventory_code, str) or file_inventory_code == '':
        logging.warning("Empty or non-string inventory code provided to validate_icode_extension")
        return False

    # Enforce maximum length (12 base + up to 5 extension chars).
    if len(file_inventory_code) > 17:
        logging.warning("Inventory code '%s' exceeds maximum allowed length (17).", file_inventory_code)
        raise ValueError("Inventory code with extension exceeds maximum length of 17 characters.")

    # Validate the base first (first 12 chars). If the base is invalid -> reject.
    base = file_inventory_code[:12]
    if not validate_inventory_code(base):
        logging.warning("Base inventory code '%s' is invalid.", base)
        return False

    # Support type lives in positions 4-6 of the base; the extension may be empty.
    support_type = base[3:6]
    extension = file_inventory_code[12:]

    # The extension is optional: a bare, valid base is accepted as-is.
    if not extension:
        return True

    expected_ext = file_type_to_regex.get(support_type)
    if expected_ext is None:
        logging.warning(
            "Inventory code '%s': support type '%s' does not allow an extension.",
            file_inventory_code, support_type,
        )
        return False

    # fullmatch so that trailing garbage after a valid extension is rejected.
    if re.fullmatch(expected_ext, extension) is None:
        logging.warning(
            "Inventory code '%s': extension '%s' does not match pattern '%s' for type '%s'.",
            file_inventory_code, extension, expected_ext, support_type,
        )
        return False

    return True
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_extension_pattern(file_inventory_code):
    """Diagnose base and extension problems of a full inventory code.

    The first 12 characters are analysed with :func:`analyze_pattern_match`;
    the remainder is treated as the extension and checked against the pattern
    registered for the support type (characters 4-6 of the code).

    Args:
        file_inventory_code (str): Inventory code (base + optional extension).

    Returns:
        list[str]: Human-readable issues; an empty list means none were found.
        An absent extension is reported as "No extension present". This
        function does not raise; it only returns diagnostics.
    """
    if not isinstance(file_inventory_code, str) or file_inventory_code == '':
        return ["Empty or non-string inventory code"]

    # Extension pattern per support type (same table as validate_icode_extension).
    # Example extensions: BTC '_1234', OA4 '_01', DVD '_01', MCC '_A'.
    # A 5-character 'DBT' extension (e.g. '_01223') has been mentioned as a
    # possible future type but has no pattern registered here.
    file_type_to_regex = {
        'BTC': r'_\d{4}',
        'OA4': r'_\d{2}',
        'DVD': r'_\d{2}',
        'MCC': r'_[AB]',
    }

    issues = []

    # Split into the 12-char base and whatever follows it.
    base_part = file_inventory_code[:12]
    extension_part = file_inventory_code[12:]

    base_issues = analyze_pattern_match(base_part, "Base12")
    issues.extend(f"Base12: {it}" for it in base_issues)

    support_type = file_inventory_code[3:6] if len(file_inventory_code) >= 6 else ''

    if extension_part == '':
        issues.append("No extension present")
        return issues

    expected_ext = file_type_to_regex.get(support_type)
    if expected_ext is None:
        issues.append(f"Unsupported type '{support_type}' for extension validation")
        return issues

    if not re.fullmatch(expected_ext, extension_part):
        issues.append(
            f"Extension '{extension_part}' does not match expected pattern "
            f"'{expected_ext}' for type '{support_type}'"
        )

    return issues
|
||||||
|
|
||||||
|
def validate_mp4_file(file_list):
    """Return the entries of *file_list* whose name ends with '.mp4'.

    This is intentionally lightweight: it only filters by extension
    (case-insensitively). Additional checks (metadata, codecs, naming
    suffixes) should be implemented by callers if needed.

    Args:
        file_list: An iterable of file paths. A single path given as a bare
            ``str`` is treated as a one-element list instead of being iterated
            character by character.

    Returns:
        list[str]: The accepted paths, in input order. Non-string entries are
        skipped. An empty list is returned when nothing matches or when
        *file_list* cannot be iterated (the error is logged).
    """
    # A bare string would otherwise be walked one character at a time and
    # never match, silently reporting "no mp4 files".
    if isinstance(file_list, str):
        file_list = [file_list]

    valid_files = []
    try:
        for f in file_list:
            if not isinstance(f, str):
                logging.debug("Skipping non-string entry in file_list: %r", f)
                continue
            if f.lower().endswith('.mp4'):
                valid_files.append(f)
        return valid_files
    except Exception as e:
        # Deliberately best-effort: bad input (e.g. None) yields an empty result.
        logging.error("Error validating MP4 files: %s", e)
        return []
|
||||||
|
|
||||||
|
def validate_mp3_file(file_list):
    """Return the entries of *file_list* whose name ends with '.mp3'.

    Lightweight, case-insensitive filter by extension; further validation may
    be done elsewhere.

    Args:
        file_list: An iterable of file paths. A single path given as a bare
            ``str`` is treated as a one-element list instead of being iterated
            character by character.

    Returns:
        list[str]: The accepted paths, in input order. Non-string entries are
        skipped. An empty list is returned when nothing matches or when
        *file_list* cannot be iterated (the error is logged).
    """
    # A bare string would otherwise be walked one character at a time and
    # never match.
    if isinstance(file_list, str):
        file_list = [file_list]

    valid_files = []
    try:
        for f in file_list:
            if not isinstance(f, str):
                logging.debug("Skipping non-string entry in file_list: %r", f)
                continue
            if f.lower().endswith('.mp3'):
                valid_files.append(f)
        return valid_files
    except Exception as e:
        # Deliberately best-effort: bad input (e.g. None) yields an empty result.
        logging.error("Error validating MP3 files: %s", e)
        return []
|
||||||
|
|
||||||
|
def list_s3_not_in_db(s3_list, db_list, db_sidecar_basenames):
    """Return the S3 keys that are not yet represented in the database.

    A key is dropped when it appears verbatim in *db_list*, or when its
    basename (the key without its final extension) appears in
    *db_sidecar_basenames*.

    Args:
        s3_list (Iterable[str]): S3 object keys to check. Order and duplicates
            are preserved in the result.
        db_list (Iterable[str]): Keys known to exist in the database.
        db_sidecar_basenames (Iterable[str]): Basenames for which the database
            already contains a sidecar.

    Returns:
        list[str]: Keys from *s3_list* that pass both filters. Matching is
        exact (case-sensitive) on the raw strings; no path normalisation or
        filesystem I/O is performed.

    Notes:
        - Basename extraction uses ``os.path.splitext``, which removes only the
          final extension: ``"foo.tar.gz"`` yields ``"foo.tar"``.
        - Both lookup collections are materialised as sets once, so each
          membership test is O(1) and one-shot iterators are handled correctly.
          Their items must therefore be hashable.
        - The input iterables are not modified.

    Examples:
        >>> list_s3_not_in_db(["a.jpg", "b.jpg", "c.txt"], ["b.jpg"], {"c"})
        ['a.jpg']
    """
    set_types = (set, frozenset)
    # Build the lookups once; reuse the caller's set when one was passed.
    known_keys = db_list if isinstance(db_list, set_types) else set(db_list)
    known_basenames = (
        db_sidecar_basenames
        if isinstance(db_sidecar_basenames, set_types)
        else set(db_sidecar_basenames)
    )

    return [
        key
        for key in s3_list
        # exact key present in DB -> skip
        if key not in known_keys
        # DB already holds a sidecar for this basename -> skip
        and os.path.splitext(key)[0] not in known_basenames
    ]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Self-test harness for the validation functions in this module.
    # - Structured test cases with expected outcomes (value or exception)
    # - Prints PASS/FAIL per case and a short summary at the end
    # Note: We avoid calling analyze_extension_pattern() from tests because
    # that helper currently contains a runtime bug in this file. If you want
    # to enable those diagnostics, fix analyze_extension_pattern first and
    # then add tests that exercise it.
    logging.basicConfig(level=logging.INFO)
    print("Running validation_utils structured tests...\n")

    # Small helper to wrap callables that may raise.
    # Returns (raised, payload): payload is the exception when raised is True,
    # otherwise the callable's return value.
    def call_fn(fn):
        try:
            return (False, fn())
        except Exception as e:
            return (True, e)

    # Each case is a dict with:
    #   "name"             - label printed in the report
    #   "fn"               - zero-argument callable to run
    #   "expect"           - exact expected return value (optional)
    #   "expect_non_empty" - True when any truthy result is acceptable (optional)
    #   "expect_exception" - exception type that must be raised (optional)
    cases = [
        # analyze_pattern_match -> expect lists (empty == no issues)
        {"name": "analyze_pattern_match valid", "fn": lambda: analyze_pattern_match("VO-DVD-12345", "Base"), "expect": []},
        {"name": "analyze_pattern_match invalid length", "fn": lambda: analyze_pattern_match("VO-DVD-1234", "Base"), "expect_non_empty": True},
        {"name": "analyze_pattern_match invalid chars", "fn": lambda: analyze_pattern_match("XO-DVD-12345", "Base"), "expect_non_empty": True},

        # validate_inventory_code
        {"name": "validate_inventory_code valid", "fn": lambda: validate_inventory_code("VO-DVD-12345"), "expect": True},
        {"name": "validate_inventory_code invalid prefix", "fn": lambda: validate_inventory_code("XO-DVD-12345"), "expect": False},
        {"name": "validate_inventory_code invalid format", "fn": lambda: validate_inventory_code("VO-DV-12345"), "expect": False},

        # validate_icode_extension (valid and invalid)
        {"name": "validate_icode_extension no ext", "fn": lambda: validate_icode_extension("VO-DVD-12345"), "expect": True},
        {"name": "validate_icode_extension BTC valid", "fn": lambda: validate_icode_extension("VO-BTC-12345_1234"), "expect": True},
        {"name": "validate_icode_extension DVD valid", "fn": lambda: validate_icode_extension("VO-DVD-12345_12"), "expect": True},
        {"name": "validate_icode_extension MCC valid", "fn": lambda: validate_icode_extension("VO-MCC-12345_A"), "expect": True},
        {"name": "validate_icode_extension unsupported type", "fn": lambda: validate_icode_extension("VO-XYZ-12345_12"), "expect": False},
        {"name": "validate_icode_extension too long extension (raises)", "fn": lambda: validate_icode_extension("VO-DVD-12345_12345"), "expect_exception": ValueError},
        {"name": "validate_icode_extension BTC invalid pattern", "fn": lambda: validate_icode_extension("VO-BTC-12345_12"), "expect": False},

        # mp4/mp3 validators - return lists
        {"name": "validate_mp4_file valid list", "fn": lambda: validate_mp4_file(["folder/VO-DVD-12345.mp4", "folder/movie_H264.mp4"]), "expect": ["folder/VO-DVD-12345.mp4", "folder/movie_H264.mp4"]},
        {"name": "validate_mp4_file no mp4s", "fn": lambda: validate_mp4_file(["folder/readme.txt", "folder/image.jpg"]), "expect": []},
        {"name": "validate_mp3_file valid list", "fn": lambda: validate_mp3_file(["audio/VO-DVD-12345.mp3"]), "expect": ["audio/VO-DVD-12345.mp3"]},
        {"name": "validate_mp3_file no mp3s", "fn": lambda: validate_mp3_file(["audio/readme.md"]), "expect": []},

        # list_s3_not_in_db
        {"name": "list_s3_not_in_db some available", "fn": lambda: list_s3_not_in_db(["a.mp4", "b.mp3", "c.pdf"], ["b.mp3"], {"a"}), "expect": ["c.pdf"]},
        {"name": "list_s3_not_in_db all filtered by sidecar", "fn": lambda: list_s3_not_in_db(["a.mp4"], [], {"a"}), "expect": []},
        {"name": "list_s3_not_in_db all filtered by exact db match", "fn": lambda: list_s3_not_in_db(["b.mp3"], ["b.mp3"], set()), "expect": []},
    ]

    passed = 0
    failed = 0

    for case in cases:
        name = case["name"]
        expect = case.get("expect", None)
        expect_non_empty = case.get("expect_non_empty", False)
        expect_exception = case.get("expect_exception", None)

        raised, result = call_fn(case["fn"])

        ok = False
        details = None

        # Expectations are checked in priority order:
        # exception type, then non-empty result, then exact value, then truthiness.
        if expect_exception is not None:
            # We expect an exception of a given type
            if raised and isinstance(result, expect_exception):
                ok = True
            else:
                ok = False
                details = f"expected exception {expect_exception.__name__}, got {'no exception' if not raised else type(result).__name__}"
        else:
            # No exception expected
            if raised:
                ok = False
                details = f"raised unexpected {type(result).__name__}: {result}"
            else:
                # compare results
                if expect_non_empty:
                    ok = bool(result)
                    if not ok:
                        details = f"expected non-empty result, got {result!r}"
                elif expect is not None:
                    # Note: an "expect" of [] or False is still "not None",
                    # so those cases are compared for equality here.
                    ok = (result == expect)
                    if not ok:
                        details = f"expected {expect!r}, got {result!r}"
                else:
                    # fallback: treat truthiness as success
                    ok = bool(result)

        if ok:
            print(f"PASS: {name}")
            passed += 1
        else:
            print(f"FAIL: {name} -> {details if details else result}")
            failed += 1

    print(f"\nTest summary: passed={passed}, failed={failed}, total={passed+failed}")

    # Mark remaining todo: run syntax check
    # (We don't call analyze_extension_pattern here to avoid hitting the current bug.)
|
||||||
Loading…
Reference in New Issue