feat: add initial implementation of the shitpost scanning application with database integration and various scanners for music, text, speech, and tags extraction

- Introduce `app.py` as the main application file to handle shitpost scanning. - Create `config.py` for configuration settings including scan paths and file types. - Implement database models in `Models.py` for shitposts, songs, speech outputs, and tags. - Add database creation logic in `db.py`. - Develop various scanners (`OcrScanner.py`, `SongScanner.py`, `SpeechScanner.py`, `TagScanner.py`) for extracting information from shitposts. - Implement utility functions in `dateExtractor.py` and `shitpostFactory.py` for handling file metadata and creating shitpost objects. - Include a `pyproject.toml` for project dependencies and configuration.
2025-06-23 23:28:04 +02:00 · 2025-06-23 23:28:04 +02:00 · 03a41b6996
commit 03a41b6996
13 changed files with 3050 additions and 0 deletions
--- a/README.md
+++ b/README.md
--- a/app.py
+++ b/app.py
@ -0,0 +1,79 @@
+import os
+import time
+from typing import List, Tuple
+import warnings
+import asyncio
+import threading
+
+from sqlalchemy import create_engine
+from sqlalchemy.orm import Session
+
+from config import SCAN_PATHS
+from db.Models import Shitpost
+from db.db import create_db
+from scanners.OcrScanner import extractText
+from scanners.SongScanner import extractSong
+from scanners.SpeechScanner import extractSpeech
+from utils.shitpostFactory import ShitpostFactory
+
+# Silence every warning category for the whole process.
+# NOTE(review): presumably meant to hide noisy third-party library warnings;
+# it also hides warnings raised by this project's own code — confirm intended.
+warnings.filterwarnings("ignore")
+
+
+def scanMusics(shitposts:List[Tuple[Shitpost,threading.Lock]]):
+    """Run song extraction on every (shitpost, lock) pair, in list order."""
+    for post, post_lock in shitposts:
+        extractSong(post, post_lock)
+
+def scanText(shitposts:List[Tuple[Shitpost,threading.Lock]]):
+    """Run OCR text extraction on every (shitpost, lock) pair, in list order."""
+    for post, post_lock in shitposts:
+        extractText(post, post_lock)
+
+def scanSpeech(shitposts:List[Tuple[Shitpost,threading.Lock]]):
+    """Run speech extraction on every (shitpost, lock) pair, in list order."""
+    for post, post_lock in shitposts:
+        extractSpeech(post, post_lock)
+
+
+async def scanShitposts():
+    """Index new files under SCAN_PATHS and run every scanner over all shitposts.
+
+    Loads the shitposts already stored in ``Shitpost.db``, walks each directory
+    of ``SCAN_PATHS`` and builds a ``Shitpost`` for every file whose path is not
+    stored yet. The speech, text and song scanners then run in three worker
+    threads, and everything is committed in a single transaction.
+
+    Files that ``ShitpostFactory`` cannot process are appended to ``failed.txt``
+    (one file name per line) and skipped. Files whose content hash is already
+    known are skipped as well, because the hash is the table's primary key and
+    a duplicate would make the final commit fail.
+    """
+    engine = create_engine("sqlite:///Shitpost.db", future=True)
+    # The context manager closes the session even when a scanner raises.
+    with Session(engine) as session:
+        shitposts = []
+        known_paths = set()
+        known_hashes = set()
+        for shitpost in session.query(Shitpost).all():
+            shitposts.append((shitpost, threading.Lock()))
+            known_paths.add(shitpost.path)
+            known_hashes.add(shitpost.hash)
+
+        for scan_path in SCAN_PATHS:
+            for root, _dirs, files in os.walk(scan_path):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    if file_path in known_paths:
+                        continue
+                    print(file)
+                    try:
+                        shitpost = ShitpostFactory(file_path)
+                    except Exception:
+                        # Best effort: remember the file and keep scanning.
+                        # KeyboardInterrupt/SystemExit are no longer swallowed.
+                        with open("failed.txt", "a") as failed:
+                            failed.write(file + "\n")
+                        continue
+                    if shitpost.hash in known_hashes:
+                        print(f"\tskipping {file_path}: same content as an already indexed file")
+                        continue
+                    known_hashes.add(shitpost.hash)
+                    shitposts.append((shitpost, threading.Lock()))
+
+        # The three scanners write to different attributes of the same objects;
+        # each shitpost carries its own lock to serialise those assignments.
+        # NOTE(review): persisted shitposts may lazy-load relationships through
+        # this session from the worker threads — confirm that is safe here.
+        await asyncio.gather(
+            asyncio.to_thread(scanSpeech, shitposts),
+            asyncio.to_thread(scanText, shitposts),
+            asyncio.to_thread(scanMusics, shitposts),
+        )
+        for shitpost, _lock in shitposts:
+            session.add(shitpost)
+        session.commit()
+
+
+if __name__ == "__main__":
+    # Create any missing tables before the scan opens its own session.
+    create_db()
+    # Run the async scan to completion on a fresh event loop.
+    asyncio.run(scanShitposts())
--- a/config.py
+++ b/config.py
@ -0,0 +1,11 @@
+# Root directories that app.py walks recursively (os.walk) looking for files.
+SCAN_PATHS = [
+        #"./testMedia/",
+        "/home/djalim/Vidéos/Shitpost/",
+        "/home/djalim/Images/Shitpost et Art/"
+        ]
+
+# Lower-case extensions (without the dot) that the scanners treat as videos.
+VIDEO_FILETYPES = [
+        "mp4","webm","mkv"
+        ]
+
+# Lower-case extensions (without the dot) that the OCR scanner treats as images.
+IMAGE_FILETYPES = ["jpg", "jpeg", "png", "webp"]
--- a/db/Models.py
+++ b/db/Models.py
@ -0,0 +1,92 @@
+from sqlalchemy import Column, ForeignKey, LargeBinary, Table, String, Integer, Boolean # Added String, Integer, Boolean for Table definition
+from sqlalchemy.orm import declarative_base, Mapped, mapped_column, relationship
+from typing import List
+
+Base = declarative_base()
+
+# Association table for the many-to-many link between shitposts and tags.
+# Both columns form a composite primary key, so a given (shitpost, tag) pair
+# can be stored only once.
+shitposts_tags = Table(
+    "shitposts_tags",
+    Base.metadata,
+    Column("left_id", String, ForeignKey("shitposts.hash"), primary_key=True),
+    Column("right_id", Integer, ForeignKey("tags.id"), primary_key=True),
+)
+
+class Shitpost(Base):
+    """A media file indexed by the scanner, keyed by the hash of its content."""
+    __tablename__ = 'shitposts'
+    # Primary key; ShitpostFactory stores the file's SHA-256 hex digest here.
+    hash: Mapped[str] = mapped_column(primary_key=True)
+    path: Mapped[str] = mapped_column()
+    # NOTE(review): declared as text, but ShitpostFactory assigns a float POSIX
+    # timestamp; consider a numeric or sqlalchemy.types.DateTime column.
+    date: Mapped[str] = mapped_column()
+    # Lower-cased file extension without the leading dot.
+    file_type: Mapped[str] = mapped_column()
+    # Raw thumbnail image bytes; deferred so listing queries do not load them.
+    thumbnail: Mapped[bytes] = mapped_column(LargeBinary,deferred=True)
+    correct_song_match: Mapped[bool] = mapped_column()
+
+    # Scalar relationship to the OCR result. The scanners test it against None
+    # to decide whether OCR still has to run.
+    # NOTE(review): for a true one-to-one, OcrOutput.shitpost_id should be unique.
+    ocr_output: Mapped["OcrOutput"] = relationship(back_populates="shitpost")
+
+    # Scalar relationship to the speech transcription, None until it is scanned.
+    # NOTE(review): for a true one-to-one, SpeechOutput.shitpost_id should be unique.
+    speech_output: Mapped["SpeechOutput"] = relationship(back_populates="shitpost")
+
+    # Many-to-many relationship with Tags through the shitposts_tags table.
+    tags: Mapped[List["Tags"]] = relationship(
+        secondary=shitposts_tags,
+        back_populates="shitposts"
+    )
+
+    # Optional foreign key to the matched song; NULL when no song was matched.
+    song_id: Mapped[int | None] = mapped_column(ForeignKey('song_match.id'), nullable=True)
+    # Many-to-one relationship with SongMatch.
+    song: Mapped["SongMatch"] = relationship(back_populates="shitposts")
+
+class SongMatch(Base):
+    # A recognised song (title and artist) that shitposts can point to.
+    __tablename__ = 'song_match'
+    id: Mapped[int] = mapped_column(primary_key=True)
+    song_name: Mapped[str] = mapped_column()
+    artist_name: Mapped[str] = mapped_column()
+
+    # One-to-many: every shitpost whose song_id references this row.
+    shitposts: Mapped[List["Shitpost"]] = relationship(back_populates="song")
+
+class SpeechOutput(Base):
+    # Speech transcription attached to one shitpost.
+    __tablename__ = 'speech_output'
+    id: Mapped[int] = mapped_column(primary_key=True)
+    # The speech scanner stores a JSON document here (see SpeechScanner).
+    text: Mapped[str] = mapped_column()
+
+    # Foreign key to Shitpost.hash, hence a string column.
+    # NOTE(review): unlike OcrOutput.shitpost_id this column sets neither
+    # nullable=False nor index=True — confirm whether the difference is intended.
+    shitpost_id: Mapped[str] = mapped_column(ForeignKey('shitposts.hash'))
+
+    # Scalar relationship back to the owning shitpost.
+    shitpost: Mapped['Shitpost'] = relationship(back_populates='speech_output')
+
+class Tags(Base):
+    # A single tag that can be attached to any number of shitposts.
+    __tablename__ = "tags"
+    id: Mapped[int] = mapped_column(primary_key=True)
+    name: Mapped[str] = mapped_column()
+
+    # Many-to-many relationship with Shitpost through the shitposts_tags table.
+    shitposts: Mapped[List["Shitpost"]] = relationship(
+        secondary=shitposts_tags,
+        back_populates="tags"
+    )
+
+class OcrOutput(Base):
+    # OCR result attached to one shitpost.
+    __tablename__ = 'ocr_output'
+    id: Mapped[int] = mapped_column(primary_key=True)
+    # The OCR scanner stores a JSON document here (see OcrScanner).
+    text: Mapped[str] = mapped_column()
+
+    # Foreign key to Shitpost.hash, hence a string column; required and indexed.
+    shitpost_id: Mapped[str] = mapped_column(
+        ForeignKey('shitposts.hash'),
+        nullable=False,
+        index=True
+    )
+    # Scalar relationship back to the owning shitpost.
+    shitpost: Mapped['Shitpost'] = relationship(back_populates='ocr_output')
--- a/db/db.py
+++ b/db/db.py
@ -0,0 +1,7 @@
+from sqlalchemy import create_engine
+from db.Models import *
+
+
+def create_db(db_url: str = "sqlite:///Shitpost.db"):
+    """Create every table declared on ``Base.metadata`` that does not exist yet.
+
+    Args:
+        db_url: SQLAlchemy database URL. Defaults to the SQLite file the
+            application uses, so existing ``create_db()`` calls are unchanged.
+    """
+    engine = create_engine(db_url, future=True)
+    Base.metadata.create_all(engine)
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,33 @@
+[project]
+name = "memedb"
+version = "0.1.0"
+description = ""
+authors = [
+    {name = "Djalim Simaila",email = "DjalimS.pro@outlook.fr"}
+]
+readme = "README.md"
+requires-python = ">=3.13,<4.0"
+dependencies = [
+    "torch (>=2.7.0,<3.0.0)",
+    "sqlalchemy (>=2.0.41,<3.0.0)",
+    "pillow (>=11.2.1,<12.0.0)",
+    "opencv-python (>=4.11.0.86,<5.0.0.0)",
+    "faster-whisper (>=1.1.1,<2.0.0)",
+    "easyocr (>=1.7.2,<2.0.0)",
+    "thumbnail (>=1.5,<2.0)",
+    "openai (>=1.84.0,<2.0.0)",
+    "lmstudio (>=1.3.1,<2.0.0)"
+]
+
+# Poetry-specific setting: it belongs in [tool.poetry], not in the PEP 621
+# [project] table.
+[tool.poetry]
+package-mode = false
+
+
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pyright]
+venvPath = "."
+venv = ".venv"
+
+# NOTE(review): Poetry presumably reads `virtualenvs.in-project` from its own
+# configuration (poetry.toml or `poetry config --local`), not from a top-level
+# pyproject.toml table — confirm and move if needed.
+[virtualenvs]
+in-project = true
--- a/scanners/OcrScanner.py
+++ b/scanners/OcrScanner.py
@ -0,0 +1,72 @@
+from json import dumps
+import os
+import subprocess
+import tempfile
+import threading
+import lmstudio as lms
+from config import IMAGE_FILETYPES, VIDEO_FILETYPES
+from db.Models import OcrOutput, Shitpost
+
+import time
+import base64
+import easyocr
+# Shared OCR reader for French and English, built once when this module is
+# imported and reused by extractImage.
+reader = easyocr.Reader(['fr', 'en'])
+
+
+def get_filename_from_path(path):
+    """Return the last "/"-separated component of *path*, cut at its first dot."""
+    last_component = path.rsplit("/", 1)[-1]
+    stem, _dot, _rest = last_component.partition(".")
+    return stem
+
+def extractImage(filepath:str)->str:
+    """Run the shared OCR reader on one image.
+
+    Returns the recognised text fragments in the order the reader reports
+    them, each one followed by a newline.
+    """
+    detections = reader.readtext(filepath)
+    return "".join(fragment + "\n" for (_box, fragment, _score) in detections)
+
+def extractImageLlm(filepath:str)->str:
+    """Ask the vision model to transcribe the text visible in an image.
+
+    The model is told to wrap its answer between two "%" characters. The text
+    between the first and the last "%" of the reply is returned, so a "%"
+    inside the extracted text no longer truncates it.
+
+    Returns:
+        The extracted text, or "" when the reply contains no "%" at all
+        (previously this raised IndexError).
+    """
+    prompt = """extract the text in the image, put it between two "%" exemple :  %extracted \ntext%, if there is nothing just say %%"""
+
+    image = lms.prepare_image(filepath)
+    chat = lms.Chat()
+    model = lms.llm("qwen/qwen2.5-vl-7b")
+    chat.add_user_message(prompt, images=[image])
+    prediction = model.respond(chat)
+
+    content = prediction.content
+    start = content.find("%")
+    if start == -1:
+        # The model ignored the requested format; treat it as "no text".
+        return ""
+    end = content.rfind("%")
+    if end == start:
+        # Only an opening delimiter: keep everything that follows it.
+        return content[start + 1:]
+    return content[start + 1:end]
+
+
+def scanImage(shitpost: Shitpost, lock=None):
+    """OCR a still image and attach the result to the shitpost.
+
+    The stored JSON has the same shape as for videos, with a single frame 0.
+
+    Args:
+        shitpost: the shitpost whose ``path`` points to the image.
+        lock: optional per-shitpost lock held while ``ocr_output`` is assigned;
+            omitted by callers that do not share the object between threads.
+    """
+    text = extractImage(shitpost.path)
+    ocr_output = OcrOutput(text=dumps({"frames": {0: text}}))
+    if lock is None:
+        shitpost.ocr_output = ocr_output
+    else:
+        with lock:
+            shitpost.ocr_output = ocr_output
+
+def scanVideo(shitpost:Shitpost,lock:threading.Lock):
+    """OCR one frame per second of a video and attach the result to the shitpost.
+
+    Frames are extracted with ffmpeg into a temporary directory, and the text
+    found in each frame is stored as JSON under ``frames[<frame number>]``.
+    """
+    dico = {"frames" : {}}
+    with tempfile.TemporaryDirectory() as tmp_path:
+        # Argument list instead of a shell string: paths containing quotes or
+        # other shell metacharacters are passed to ffmpeg untouched.
+        cmd = [
+            "ffmpeg", "-loglevel", "quiet",
+            "-i", shitpost.path,
+            "-r", "1",
+            "-f", "image2",
+            os.path.join(tmp_path, "frame-%04d.jpg"),
+        ]
+        # Best effort, as before: a failing ffmpeg simply yields no frames.
+        subprocess.run(cmd)
+        for r,d,f in os.walk(tmp_path):
+            for file in sorted(f):
+                if file.endswith(".jpg"):
+                    # File names look like frame-0001.jpg; keep the number.
+                    frame_number = int(file.split("-")[1].split(".")[0])
+                    dico['frames'][frame_number] = extractImage(os.path.join(r,file))
+    with lock:
+        shitpost.ocr_output = OcrOutput(text=dumps(dico))
+
+
+def extractText(shitpost:Shitpost,lock):
+    """Run OCR on a shitpost that has no OCR output yet.
+
+    Videos and images are dispatched on ``shitpost.file_type``; other file
+    types are left untouched. ``lock`` guards the assignment of
+    ``shitpost.ocr_output`` for both kinds of file.
+    """
+    t1 = time.time()
+    print(f"\tstarting to extract text for {shitpost.hash[:4]} aka {shitpost.path}")
+    if shitpost.ocr_output is None:
+        if shitpost.file_type in VIDEO_FILETYPES:
+            scanVideo(shitpost, lock)
+        elif shitpost.file_type in IMAGE_FILETYPES:
+            scanImage(shitpost, lock)
+    print(f"\ttext extraced for {shitpost.hash[:4]} in :{time.time()-t1} ")
--- a/scanners/SongScanner.py
+++ b/scanners/SongScanner.py
@ -0,0 +1,21 @@
+from time import time
+from config import VIDEO_FILETYPES
+from db.Models import Shitpost, SongMatch
+import subprocess
+import threading
+
+def extractSong(shitpost: Shitpost,lock:threading.Lock):
+    """Identify the song playing in a video shitpost with the ``songrec`` CLI.
+
+    Only runs for video files that have no song attached yet. The command
+    output is expected to look like ``Artist - Title``; when it does not
+    (no match, or songrec failed), the shitpost is left without a song instead
+    of raising IndexError.
+
+    Raises:
+        FileNotFoundError: if the ``songrec`` executable is not installed.
+    """
+    t1 = time()
+    print(f"\tstarting to extract song for {shitpost.hash[:4]} aka {shitpost.path}")
+
+    if shitpost.song is None and shitpost.file_type in VIDEO_FILETYPES:
+        result = subprocess.run(['songrec', 'recognize', shitpost.path], capture_output=True, text=True)
+        # Split on the first " - " only, so hyphenated names such as "Jay-Z"
+        # stay intact; strip the trailing newline of the command output.
+        artist, separator, songname = result.stdout.strip().partition(" - ")
+        artist = artist.strip()
+        songname = songname.strip()
+        if separator and artist and songname:
+            songmatch = SongMatch(
+                    song_name=songname,
+                    artist_name=artist
+            )
+            with lock:
+                shitpost.song = songmatch
+    print(f"\tsong extraced for {shitpost.hash[:4]} in :{time()-t1} ")
--- a/scanners/SpeechScanner.py
+++ b/scanners/SpeechScanner.py
@ -0,0 +1,51 @@
+import tempfile
+from time import time
+
+from config import VIDEO_FILETYPES
+from db.Models import Shitpost, SpeechOutput
+from faster_whisper import WhisperModel
+
+
+import threading
+import subprocess
+import json
+
+def extractSpeech(shitpost: Shitpost,lock:threading.Lock):
+    """Transcribe the speech of a video shitpost that has no transcription yet.
+
+    The file-type test used to be inverted (``not in VIDEO_FILETYPES``), which
+    skipped every video and sent images to the speech model instead. It now
+    matches the condition used by the song scanner.
+    """
+    t1 = time()
+    print(f"\tstarting to extract speech for {shitpost.hash[:4]} aka {shitpost.path}")
+    if shitpost.speech_output is None and shitpost.file_type in VIDEO_FILETYPES:
+        fastWhisper(shitpost, lock)
+    print(f"\tspeech extracted for {shitpost.hash[:4]} in :{time()-t1} ")
+
+
+
+def whisper(shitpost: Shitpost):
+    """Transcribe a shitpost with the ``whisper`` command-line tool.
+
+    The tool writes ``<stem>.json`` into a temporary directory; the whole JSON
+    document is stored as the shitpost's speech output.
+    """
+    import os  # local import: this module does not import os at file level
+
+    # NOTE(review): presumably whisper names its output after the input's base
+    # name minus the last extension — confirm. The previous code cut at the
+    # first dot, which pointed to the wrong file for names like "a.b.mp4".
+    stem = os.path.splitext(os.path.basename(shitpost.path))[0]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Argument list instead of a shell string: paths containing quotes or
+        # other shell metacharacters are passed through untouched.
+        cmd = ["whisper", "--verbose", "False", "-f", "json", "-o", tmpdir, shitpost.path]
+        subprocess.run(cmd)
+        with open(os.path.join(tmpdir, f"{stem}.json"), "r") as file:
+            data = json.load(file)
+    print("extracted speech :", data["text"] )
+    shitpost.speech_output = SpeechOutput(
+        text=json.dumps(data),
+    )
+
+_whisper_model = None
+
+
+def _get_whisper_model():
+    """Return the shared WhisperModel, loading it on first use."""
+    global _whisper_model
+    if _whisper_model is None:
+        _whisper_model = WhisperModel("turbo", device="cpu", compute_type="int8")
+    return _whisper_model
+
+
+def fastWhisper(shitpost: Shitpost,lock:threading.Lock):
+    """Transcribe a shitpost with faster-whisper and attach the result.
+
+    The stored JSON maps each segment's start time (rounded to 2 decimals) to
+    the segment's text. The model is loaded once and reused, instead of being
+    rebuilt for every file.
+    """
+    model = _get_whisper_model()
+    segments, _info = model.transcribe(shitpost.path,beam_size=5,language_detection_segments=2)
+    dico = {round(segment.start,2): segment.text for segment in segments}
+
+    with lock:
+        shitpost.speech_output = SpeechOutput(
+                text=json.dumps(dico),
+        )
--- a/scanners/TagScanner.py
+++ b/scanners/TagScanner.py
@ -0,0 +1,15 @@
+import base64
+import threading
+import lmstudio as lms
+
+from db.Models import Shitpost
+
+def extractTags(shitpost:Shitpost, lock:threading.Lock):
+    """Ask the vision model for tags describing the image at ``shitpost.path``.
+
+    The model's reply is not used yet: nothing is returned or stored, and
+    ``lock`` is not touched.
+    """
+    instructions = """I will give you an image and a list of tags, you'll have to return a list of tags that accurately describe the image in a json format, you'll try to match any existing tags, before trying to add new one
+tags=[]"""
+
+    picture = lms.prepare_image(shitpost.path)
+    conversation = lms.Chat()
+    llm = lms.llm("qwen/qwen2.5-vl-7b")
+    conversation.add_user_message(instructions, images=[picture])
+    llm.respond(conversation)
--- a/utils/dateExtractor.py
+++ b/utils/dateExtractor.py
@ -0,0 +1,61 @@
+import re
+import os
+from datetime import datetime
+from typing import Optional, Union # For type hinting
+
+def extract_date_from_path(file_path: str) -> Optional[datetime]:
+    """
+    Work out a datetime for a file, preferring the one encoded in its name.
+
+    The file name is searched for a ``YYYYMMDD_HHMMSS`` stamp. When no pattern
+    yields a valid date, the file's last modification time is used instead.
+
+    Args:
+        file_path (str): The absolute or relative path to the file.
+
+    Returns:
+        Optional[datetime]: The extracted datetime, or None when the name holds
+                            no valid date and the file's metadata cannot be
+                            read (for example because the file does not exist).
+    """
+    filename = os.path.basename(file_path)
+
+    # Tried in order; a later pattern can still succeed after an earlier one
+    # captured digits that do not form a valid date.
+    # e.g., IMG_20210811_141036.jpg, 20210509_005303.jpg, IMG_20190723_211320_065.jpg
+    candidates = (
+        re.compile(r"(?:.*_)?(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})(?:_.*)?\..+"), # optional prefix, YYYYMMDD_HHMMSS
+        re.compile(r"(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})(?:_.*)?\..+"),       # YYYYMMDD_HHMMSS without prefix
+        re.compile(r"(\d{8})_(\d{6})(?:_.*)?\..+")                                 # YYYYMMDD_HHMMSS (compact groups)
+    )
+
+    for regex in candidates:
+        found = regex.search(filename)
+        if found is None:
+            continue
+        groups = found.groups()
+        try:
+            if len(groups) == 6:
+                # year, month, day, hour, minute, second
+                return datetime(*map(int, groups))
+            if len(groups) == 2:
+                # date digits followed by time digits
+                return datetime.strptime("".join(groups), "%Y%m%d%H%M%S")
+        except ValueError as e:
+            # Not a real calendar date: report it and try the next pattern.
+            print(f"Warning: Could not parse date from filename groups {groups} for {filename}: {e}")
+
+    if not os.path.exists(file_path):
+        print(f"Warning: File not found at '{file_path}'. Cannot get metadata.")
+        return None
+
+    try:
+        # Fall back to the last modification time.
+        modified_at = os.path.getmtime(file_path)
+        return datetime.fromtimestamp(modified_at)
+    except OSError as e:
+        print(f"Error: Could not get metadata for {file_path}: {e}")
+        return None
+    except Exception as e:
+        # Any other unexpected failure while reading the metadata.
+        print(f"Unexpected error getting metadata for {file_path}: {e}")
+        return None
--- a/utils/shitpostFactory.py
+++ b/utils/shitpostFactory.py
@ -0,0 +1,55 @@
+import hashlib
+import os
+import shutil
+import tempfile
+
+from thumbnail import generate_thumbnail
+
+from db.Models import Shitpost
+from utils.dateExtractor import extract_date_from_path
+
+# Settings handed to thumbnail.generate_thumbnail for every file processed by
+# ShitpostFactory.
+options = {
+	'trim': False,
+	'height': 300,
+	'width': 300,
+	'quality': 85,
+	'type': 'thumbnail'
+}
+
+
+def hashfile(file_path:str)->str:
+    """Return the hex-encoded SHA-256 digest of the file at *file_path*."""
+    with open(file_path, 'rb', buffering=0) as stream:
+        digest = hashlib.file_digest(stream, 'sha256')
+    return digest.hexdigest()
+
+def ShitpostFactory(file_path:str):
+    """Build a ``Shitpost`` (hash, date, file type, thumbnail) for one file.
+
+    Args:
+        file_path: path of the media file to index.
+
+    Returns:
+        A new, unsaved ``Shitpost`` with ``correct_song_match`` set to False.
+
+    Raises:
+        ValueError: if no date can be determined for the file (previously an
+            AttributeError on ``None``).
+        OSError: if the file cannot be read or copied.
+    """
+    filename = os.path.basename(file_path)
+    filetype = os.path.splitext(filename)[1].lower()[1:]
+    shitpost_hash = hashfile(file_path)
+
+    extracted_date = extract_date_from_path(file_path)
+    if extracted_date is None:
+        raise ValueError(f"could not determine a date for {file_path}")
+
+    shitpost = Shitpost(
+             hash=shitpost_hash,
+             path=file_path,
+             date=extracted_date.timestamp(),
+             file_type=filetype
+    )
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # The thumbnail is generated from a copy named after the hash.
+        # NOTE(review): presumably to keep odd characters of the original file
+        # name away from the thumbnail library — confirm.
+        shitpost_cpy = os.path.join(tmpdir, f"{shitpost_hash}.{filetype}")
+        shutil.copyfile(file_path, shitpost_cpy)
+
+        # Distinct output name: for .png inputs the old "<hash>.png" output
+        # path was the same file as the input copy.
+        thumpath = os.path.join(tmpdir, f"{shitpost_hash}_thumb.png")
+        generate_thumbnail(shitpost_cpy,thumpath, options)
+
+        # The context manager closes the file even if reading fails.
+        with open(thumpath, "rb") as thumb:
+            shitpost.thumbnail = thumb.read()
+
+    # Song match default value.
+    shitpost.correct_song_match = False
+
+    return shitpost