# MemeDb/utils/dateExtractor.py

import re
import os
from datetime import datetime
from typing import Optional, Union # For type hinting

# Filename timestamp patterns, applied in order with ``search`` (unanchored).
# Each captures (YYYY, MM, DD, HH, MM, SS) from a ``YYYYMMDD_HHMMSS`` stamp that
# is followed by an optional ``_suffix`` and a file extension, e.g.
# IMG_20210811_141036.jpg, 20210509_005303.jpg, IMG_20190723_211320_065.jpg.
_FILENAME_TIMESTAMP_PATTERNS = (
    # The greedy optional prefix makes this prefer the right-most stamp that
    # directly follows an underscore.
    re.compile(r"(?:.*_)?(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})(?:_.*)?\..+"),
    # Left-most stamp anywhere in the name; only differs from the pattern above
    # when the filename contains more than one stamp.
    re.compile(r"(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})(?:_.*)?\..+"),
)


def extract_date_from_path(file_path: str) -> Optional[datetime]:
    """
    Extracts a datetime object from a file path.

    The date and time are first looked up in the filename
    (``YYYYMMDD_HHMMSS`` with an optional prefix/suffix). If no valid
    timestamp is found there, the file's last modification time is used.

    Problems are reported with ``print`` and never raised for a string path:
    an invalid timestamp in the name, a missing file, or an unreadable file
    all end in either the fallback or ``None``.

    Args:
        file_path (str): The absolute or relative path to the file.

    Returns:
        Optional[datetime]: A naive datetime (the modification-time fallback
                            is expressed in local time) if a date could be
                            extracted, otherwise None.
    """
    filename = os.path.basename(file_path)

    # The patterns usually pick the same stamp; remember what was already
    # tried so an invalid stamp is parsed (and warned about) only once.
    attempted = set()
    for pattern in _FILENAME_TIMESTAMP_PATTERNS:
        match = pattern.search(filename)
        if match is None:
            continue
        groups = match.groups()
        if groups in attempted:
            continue
        attempted.add(groups)
        try:
            year, month, day, hour, minute, second = map(int, groups)
            return datetime(year, month, day, hour, minute, second)
        except ValueError as e:
            # Digits matched but do not form a real date (e.g. month 13);
            # move on to the next pattern or the mtime fallback.
            print(f"Warning: Could not parse date from filename groups {groups} for {filename}: {e}")

    # Fallback: last modification time. Attempt the call directly instead of
    # checking existence first, so a file removed in between cannot slip past.
    try:
        return datetime.fromtimestamp(os.path.getmtime(file_path))
    except FileNotFoundError:
        print(f"Warning: File not found at '{file_path}'. Cannot get metadata.")
    except (OSError, OverflowError, ValueError) as e:
        # OSError: permission/IO problems; OverflowError/ValueError: an
        # out-of-range timestamp or a path the OS cannot represent.
        print(f"Error: Could not get metadata for {file_path}: {e}")
    return None