MemeDb/scanners/OcrScanner.py
Djalim Simaila 03a41b6996 feat: add initial implementation of the shitpost scanning application with database integration and various scanners for music, text, speech, and tags extraction
- Introduce `app.py` as the main application file to handle shitpost scanning.
- Create `config.py` for configuration settings including scan paths and file types.
- Implement database models in `Models.py` for shitposts, songs, speech outputs, and tags.
- Add database creation logic in `db.py`.
- Develop various scanners (`OcrScanner.py`, `SongScanner.py`, `SpeechScanner.py`, `TagScanner.py`) for extracting information from shitposts.
- Implement utility functions in `dateExtractor.py` and `shitpostFactory.py` for handling file metadata and creating shitpost objects.
- Include a `pyproject.toml` for project dependencies and configuration.
2025-06-23 23:28:04 +02:00

73 lines
2.3 KiB
Python

from json import dumps
import os
import subprocess
import tempfile
import threading
import lmstudio as lms
from config import IMAGE_FILETYPES, VIDEO_FILETYPES
from db.Models import OcrOutput, Shitpost
import time
import base64
import easyocr
# Shared EasyOCR reader, created once when this module is imported and
# configured for French and English; extractImage() uses it for every image.
reader = easyocr.Reader(['fr', 'en'])
def get_filename_from_path(path):
    """Return the last "/"-separated component of *path*, cut at its first dot.

    For example ``"/memes/cat.tar.gz"`` gives ``"cat"``. A component with no
    dot is returned unchanged.
    """
    basename = path.rsplit("/", 1)[-1]
    stem, _dot, _extensions = basename.partition(".")
    return stem
def extractImage(filepath: str) -> str:
    """Run the module-level EasyOCR ``reader`` on an image and return its text.

    Args:
        filepath: Path of the image, passed straight to ``reader.readtext``.

    Returns:
        Every text fragment the reader returned, in the order returned, each
        followed by a newline. An empty string when there are no results.
    """
    detections = reader.readtext(filepath)
    # Each detection unpacks as (bbox, text, prob); only the text is kept.
    return "".join(fragment + "\n" for _bbox, fragment, _prob in detections)
def extractImageLlm(filepath: str) -> str:
    """Ask a vision LLM (via LM Studio) to transcribe the text in an image.

    The model is prompted to wrap the transcription between two "%" signs;
    this function returns what lies between the first and the last "%" of
    the reply.

    Args:
        filepath: Path of the image, handed to ``lms.prepare_image``.

    Returns:
        The text between the outermost "%" delimiters of the reply. If the
        reply has a single "%", everything after it. If the reply has no "%"
        at all, an empty string.
    """
    prompt = """extract the text in the image, put it between two "%" exemple : %extracted \ntext%, if there is nothing just say %%"""
    image = lms.prepare_image(filepath)
    chat = lms.Chat()
    model = lms.llm("qwen/qwen2.5-vl-7b")
    chat.add_user_message(prompt, images=[image])
    prediction = model.respond(chat)

    reply = prediction.content
    opening = reply.find("%")
    if opening == -1:
        # The model ignored the requested format; previously this raised
        # IndexError. Treat it as "no text found".
        return ""
    closing = reply.rfind("%")
    if closing == opening:
        # Only one delimiter: keep everything after it (unchanged behaviour).
        return reply[opening + 1:]
    # Use the outermost pair so a literal "%" inside the transcription
    # (e.g. "50% off") does not cut the result short.
    return reply[opening + 1:closing]
def scanImage(shitpost: Shitpost):
    """OCR a still image and attach the result to *shitpost*.

    The text returned by ``extractImage(shitpost.path)`` is stored as the
    single frame ``0`` in a JSON document of the form
    ``{"frames": {"0": text}}``, wrapped in a new ``OcrOutput`` that is
    assigned to ``shitpost.ocr_output``.
    """
    # A still image is represented as a one-frame "video" (frame index 0).
    frames = {0: extractImage(shitpost.path)}
    payload = dumps({"frames": frames})
    shitpost.ocr_output = OcrOutput(text=payload)
def scanVideo(shitpost: Shitpost, lock: threading.Lock):
    """OCR a video at one frame per second and attach the result to *shitpost*.

    ffmpeg writes frames (``-r 1``) as ``frame-NNNN.jpg`` into a temporary
    directory. Each frame is passed to ``extractImage``, and the texts are
    stored as JSON ``{"frames": {frame_number: text}}`` in a new
    ``OcrOutput`` assigned to ``shitpost.ocr_output``.

    Args:
        shitpost: Object whose ``path`` is the video to read and whose
            ``ocr_output`` attribute receives the result.
        lock: Held only while ``shitpost.ocr_output`` is assigned.
    """
    frames = {}
    with tempfile.TemporaryDirectory() as tmpdir:
        # Pass an argument list and no shell: the path is handed to ffmpeg
        # verbatim, so quotes, spaces or shell metacharacters in the file
        # name can neither break the command nor be executed.
        cmd = [
            "ffmpeg",
            "-loglevel", "quiet",
            "-i", shitpost.path,
            "-r", "1",
            "-f", "image2",
            os.path.join(tmpdir, "frame-%04d.jpg"),
        ]
        # The exit status is intentionally not checked (as before): if ffmpeg
        # fails, no frames exist and an empty mapping is stored.
        subprocess.run(cmd)

        # OCR must happen while the temporary directory still exists.
        # Sorted so frames are inserted in a deterministic order.
        for name in sorted(os.listdir(tmpdir)):
            if not name.endswith(".jpg"):
                continue
            # "frame-0007.jpg" -> 7
            frame_number = int(name.split("-")[1].split(".")[0])
            frames[frame_number] = extractImage(os.path.join(tmpdir, name))

    with lock:
        shitpost.ocr_output = OcrOutput(text=dumps({"frames": frames}))
def extractText(shitpost: Shitpost, lock):
    """Fill ``shitpost.ocr_output`` if it is still ``None`` and log the timing.

    Videos (``file_type`` in ``VIDEO_FILETYPES``) go through ``scanVideo``,
    which receives *lock*; images (``file_type`` in ``IMAGE_FILETYPES``) go
    through ``scanImage``. A start line and an elapsed-time line are printed
    whether or not any scan was needed.
    """
    started_at = time.time()
    short_hash = shitpost.hash[:4]
    print(f"\tstarting to extract text for {short_hash} aka {shitpost.path}")

    # Decided once, up front: a video scan that fills ocr_output must not
    # change whether the image check below is evaluated.
    needs_scan = shitpost.ocr_output is None
    if needs_scan and shitpost.file_type in VIDEO_FILETYPES:
        scanVideo(shitpost, lock)
    if needs_scan and shitpost.file_type in IMAGE_FILETYPES:
        scanImage(shitpost)

    print(f"\ttext extraced for {short_hash} in :{time.time()-started_at} ")