# MemeDb/scanners/OcrScanner.py

from json import dumps
import os
import subprocess
import tempfile
import threading
import lmstudio as lms
from config import IMAGE_FILETYPES, VIDEO_FILETYPES
from db.Models import OcrOutput, Shitpost

import time
import base64
import easyocr
# Shared EasyOCR reader, built once at import time and used by extractImage;
# configured with the 'fr' and 'en' language codes.
reader = easyocr.Reader(['fr', 'en'])


def get_filename_from_path(path):
    """Return the last ``/``-separated component of ``path`` cut at its first dot.

    Only ``/`` is treated as a separator, and everything from the first
    ``.`` of the last component onward is dropped, so ``"a/b.tar.gz"``
    yields ``"b"`` and ``".hidden"`` yields ``""``.
    """
    _, _, last_component = path.rpartition("/")
    stem, _, _ = last_component.partition(".")
    return stem

def extractImage(filepath:str)->str:
    """Run the module-level OCR ``reader`` on an image and return its text.

    Each detection returned by ``reader.readtext`` is unpacked as a
    3-item tuple and only its middle item (the text) is kept. The
    texts are concatenated in detection order, each followed by a newline;
    the result is an empty string when nothing is detected.
    """
    detections = reader.readtext(filepath)
    return "".join(text + "\n" for _bbox, text, _prob in detections)

def extractImageLlm(filepath:str)->str:
    """Extract the text of an image with a vision model served by LM Studio.

    The model is asked to wrap the extracted text between two ``%``
    characters; the text found between the first and the second ``%`` of
    its reply is returned.

    Args:
        filepath: Path of the image to send to the model.

    Returns:
        The delimited text, or an empty string when the reply contains no
        ``%`` at all (the model ignored the requested format).
    """
    prompt = """extract the text in the image, put it between two "%" exemple :  %extracted \ntext%, if there is nothing just say %%"""

    image = lms.prepare_image(filepath)
    chat = lms.Chat()
    model = lms.llm("qwen/qwen2.5-vl-7b")
    chat.add_user_message(prompt, images=[image])
    prediction = model.respond(chat)

    parts = prediction.content.split("%")
    # Without any "%" the split yields a single element; indexing [1] used to
    # raise IndexError, so treat an unformatted reply as "no text found".
    if len(parts) < 2:
        return ""
    return parts[1]


def scanImage(shitpost: Shitpost):
    """OCR a still image and attach the result to ``shitpost``.

    The text read from ``shitpost.path`` is stored as a JSON document with a
    single frame keyed ``0`` under ``"frames"``, wrapped in an ``OcrOutput``
    that is assigned to ``shitpost.ocr_output``.
    """
    frames = {0: extractImage(shitpost.path)}
    payload = dumps({"frames": frames})
    shitpost.ocr_output = OcrOutput(text=payload)

def scanVideo(shitpost:Shitpost,lock:threading.Lock):
    """OCR a video frame by frame and attach the result to ``shitpost``.

    ffmpeg writes one JPEG per second of video into a temporary directory,
    each frame is passed to ``extractImage``, and the texts are stored as
    JSON under ``"frames"``, keyed by the frame number parsed from the file
    name. The JSON is wrapped in an ``OcrOutput`` and assigned to
    ``shitpost.ocr_output`` while holding ``lock``.

    Args:
        shitpost: Post whose ``path`` points at the video file.
        lock: Lock held while ``shitpost.ocr_output`` is assigned.

    A failed or missing ffmpeg does not raise: the stored result then simply
    has no frames.
    """
    dico = {"frames" : {}}
    with tempfile.TemporaryDirectory() as tmpdir:
        frame_pattern = os.path.join(tmpdir, "frame-%04d.jpg")
        # Argument list instead of a shell string: the path is passed to
        # ffmpeg verbatim, so quotes or shell metacharacters in it can neither
        # break the command nor be executed.
        cmd = [
            "ffmpeg", "-loglevel", "quiet",
            "-i", shitpost.path,
            "-r", "1",  # sample one frame per second
            "-f", "image2",
            frame_pattern,
        ]
        try:
            subprocess.run(cmd)
        except OSError as err:
            # Without a shell a missing ffmpeg binary raises instead of
            # returning a non-zero status; keep the scan best-effort.
            print(f"\tcould not run ffmpeg for {shitpost.path}: {err}")
        # apply OCR to each frame
        for root, _dirs, files in os.walk(tmpdir):
            # Sorted so frames are inserted in a deterministic order.
            for file in sorted(files):
                if file.endswith(".jpg"):
                    # "frame-0001.jpg" -> 1
                    frame_number = int(file.split("-")[1].split(".")[0])
                    text = extractImage(os.path.join(root, file))
                    dico['frames'][frame_number] = text
        with lock:
            shitpost.ocr_output = OcrOutput(text=dumps(dico))


def extractText(shitpost:Shitpost,lock):
    """Run OCR on ``shitpost`` unless it already has an ``ocr_output``.

    Posts whose ``file_type`` is in ``VIDEO_FILETYPES`` go through
    ``scanVideo`` (with ``lock``); those in ``IMAGE_FILETYPES`` go through
    ``scanImage``. A start line and a line with the elapsed wall-clock time
    are printed in every case, even when nothing is scanned.
    """
    started_at = time.time()
    short_hash = shitpost.hash[:4]
    print(f"\tstarting to extract text for {short_hash} aka {shitpost.path}")

    # Decided once up front: a video scan filling ocr_output must not
    # prevent the image check from still being evaluated.
    needs_scan = shitpost.ocr_output is None
    if needs_scan and shitpost.file_type in VIDEO_FILETYPES:
        scanVideo(shitpost, lock)
    if needs_scan and shitpost.file_type in IMAGE_FILETYPES:
        scanImage(shitpost)

    elapsed = time.time() - started_at
    print(f"\ttext extraced for {short_hash} in :{elapsed} ")