MemeDb/utils/dateExtractor.py
Djalim Simaila 03a41b6996 feat: add initial implementation of the shitpost scanning application with database integration and various scanners for music, text, speech, and tags extraction
- Introduce `app.py` as the main application file to handle shitpost scanning.
- Create `config.py` for configuration settings including scan paths and file types.
- Implement database models in `Models.py` for shitposts, songs, speech outputs, and tags.
- Add database creation logic in `db.py`.
- Develop various scanners (`OcrScanner.py`, `SongScanner.py`, `SpeechScanner.py`, `TagScanner.py`) for extracting information from shitposts.
- Implement utility functions in `dateExtractor.py` and `shitpostFactory.py` for handling file metadata and creating shitpost objects.
- Include a `pyproject.toml` for project dependencies and configuration.
2025-06-23 23:28:04 +02:00

62 lines
2.7 KiB
Python

import re
import os
from datetime import datetime
from typing import Optional, Union # For type hinting
# Filename timestamp patterns, tried in order by extract_date_from_path().
# Both capture (year, month, day, hour, minute, second) from a
# "YYYYMMDD_HHMMSS" run that is followed by an optional "_suffix" and a file
# extension, e.g. IMG_20210811_141036.jpg, 20210509_005303.jpg or
# IMG_20190723_211320_065.jpg.
_FILENAME_TIMESTAMP_PATTERNS = (
    # The greedy "<anything>_" prefix makes search() prefer the LAST
    # underscore-preceded timestamp in the name.
    re.compile(
        r"(?:.*_)?(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})(?:_.*)?\..+"
    ),
    # Without the prefix, search() settles on the LEFTMOST timestamp. This
    # only yields something new when the first pattern picked a later
    # digit run that is not a valid calendar date.
    re.compile(r"(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})(?:_.*)?\..+"),
)


def extract_date_from_path(file_path: str) -> Optional[datetime]:
    """
    Extract a datetime for a file, preferring a timestamp in its filename.

    The filename (last path component) is searched for a
    ``YYYYMMDD_HHMMSS`` timestamp, optionally preceded by ``<prefix>_`` and
    optionally followed by ``_<suffix>``, then a file extension. If no
    valid timestamp is found, the file's last modification time is used
    instead.

    Warnings and errors are reported with ``print``; the function itself
    does not raise for unparsable names or missing files.

    Args:
        file_path (str): The absolute or relative path to the file. The
            file only needs to exist when the filename carries no valid
            timestamp.

    Returns:
        Optional[datetime]: A naive datetime taken from the filename, or
        from the file's modification time (local time) as a fallback.
        None if neither source is available.
    """
    filename = os.path.basename(file_path)

    # Both patterns frequently match the very same digits; remember what was
    # already rejected so an invalid date is reported once, not per pattern.
    rejected = set()
    for pattern in _FILENAME_TIMESTAMP_PATTERNS:
        match = pattern.search(filename)
        if match is None:
            continue
        groups = match.groups()
        if groups in rejected:
            continue
        try:
            year, month, day, hour, minute, second = map(int, groups)
            return datetime(year, month, day, hour, minute, second)
        except ValueError as e:
            # Digits had the right shape but are not a real date/time
            # (e.g. month 13); fall through to the next pattern or to mtime.
            rejected.add(groups)
            print(
                f"Warning: Could not parse date from filename groups {groups} "
                f"for {filename}: {e}"
            )

    # Fallback: last modification time. Ask for it directly instead of
    # checking os.path.exists() first, which could race with file removal.
    try:
        mtime = os.path.getmtime(file_path)
    except (FileNotFoundError, NotADirectoryError):
        print(f"Warning: File not found at '{file_path}'. Cannot get metadata.")
        return None
    except (OSError, ValueError) as e:
        # ValueError covers paths the OS layer rejects outright, such as
        # ones containing an embedded NUL character.
        print(f"Error: Could not get metadata for {file_path}: {e}")
        return None

    try:
        return datetime.fromtimestamp(mtime)
    except (OverflowError, OSError, ValueError) as e:
        # The timestamp is outside the range the platform can convert.
        print(f"Unexpected error getting metadata for {file_path}: {e}")
        return None