- Introduce `app.py` as the main application file to handle shitpost scanning. - Create `config.py` for configuration settings including scan paths and file types. - Implement database models in `Models.py` for shitposts, songs, speech outputs, and tags. - Add database creation logic in `db.py`. - Develop various scanners (`OcrScanner.py`, `SongScanner.py`, `SpeechScanner.py`, `TagScanner.py`) for extracting information from shitposts. - Implement utility functions in `dateExtractor.py` and `shitpostFactory.py` for handling file metadata and creating shitpost objects. - Include a `pyproject.toml` for project dependencies and configuration.
73 lines
2.3 KiB
Python
73 lines
2.3 KiB
Python
from json import dumps
|
|
import os
|
|
import subprocess
|
|
import tempfile
|
|
import threading
|
|
import lmstudio as lms
|
|
from config import IMAGE_FILETYPES, VIDEO_FILETYPES
|
|
from db.Models import OcrOutput, Shitpost
|
|
|
|
import time
|
|
import base64
|
|
import easyocr
|
|
# Single EasyOCR reader for the 'fr' and 'en' language codes, created once at
# import time and reused by every extractImage call.
reader = easyocr.Reader(['fr', 'en'])
|
|
|
|
|
|
def get_filename_from_path(path):
    """Return the last ``/``-separated component of ``path``, cut at its first dot.

    Only ``/`` is treated as a separator, and everything from the first ``.``
    of the last component onwards is dropped, so ``"dir/a.tar.gz"`` gives
    ``"a"`` and ``"dir/.hidden"`` gives ``""``.
    """
    _, _, last_component = path.rpartition("/")
    stem, _, _ = last_component.partition(".")
    return stem
|
|
|
|
def extractImage(filepath: str) -> str:
    """Run the module-level EasyOCR ``reader`` on an image and return its text.

    Args:
        filepath: Path of the image file handed to ``reader.readtext``.

    Returns:
        Every detected text fragment followed by a newline, in the order the
        reader reported them; an empty string when nothing was detected.
    """
    # Each result is a (bounding box, text, probability) triple; only the
    # text is kept. Joining once avoids rebuilding the string per fragment.
    return "".join(
        text + "\n" for _bbox, text, _prob in reader.readtext(filepath)
    )
|
|
|
|
def extractImageLlm(filepath: str) -> str:
    """Ask the ``qwen/qwen2.5-vl-7b`` model, via ``lmstudio``, for the text in an image.

    The prompt asks the model to wrap the transcription between two ``%``
    characters; the part of the reply between the first two ``%`` is returned.

    Args:
        filepath: Path of the image passed to ``lms.prepare_image``.

    Returns:
        The text between the first pair of ``%`` delimiters (everything after
        the first ``%`` if the closing one is missing), or an empty string
        when the reply contains no ``%`` at all.
    """
    prompt = """extract the text in the image, put it between two "%" exemple : %extracted \ntext%, if there is nothing just say %%"""

    image = lms.prepare_image(filepath)
    chat = lms.Chat()
    model = lms.llm("qwen/qwen2.5-vl-7b")
    chat.add_user_message(prompt, images=[image])
    prediction = model.respond(chat)

    # A reply is not guaranteed to contain the delimiters: indexing
    # ``split("%")[1]`` raised IndexError in that case. Such a reply is now
    # handled like the "nothing found" answer (``%%``).
    _before, delimiter, after = prediction.content.partition("%")
    if not delimiter:
        return ""
    return after.partition("%")[0]
|
|
|
|
|
|
def scanImage(shitpost: Shitpost):
    """OCR a still image and attach the result to ``shitpost.ocr_output``.

    The text returned by ``extractImage`` is stored as the JSON document
    ``{"frames": {"0": <text>}}``: an image is recorded as a single frame 0.
    """
    frames = {0: extractImage(shitpost.path)}
    payload = dumps({"frames": frames})
    shitpost.ocr_output = OcrOutput(text=payload)
|
|
|
|
def scanVideo(shitpost: Shitpost, lock: threading.Lock):
    """OCR a video by sampling frames with ffmpeg and reading each frame.

    Frames are written to a temporary directory at one frame per second,
    each one is passed to ``extractImage``, and the texts are stored on
    ``shitpost.ocr_output`` as JSON of the form
    ``{"frames": {"<frame number>": <text>}}``.

    Args:
        shitpost: Post whose ``path`` points at the video; its ``ocr_output``
            is replaced.
        lock: Held only while ``shitpost.ocr_output`` is assigned.
    """
    dico = {"frames": {}}
    with tempfile.TemporaryDirectory() as tmp_path:
        # Sample one frame per second (``-r 1`` as an output option) into
        # numbered JPEG files. The arguments are passed as a list, without a
        # shell, so a path containing quotes, spaces or shell metacharacters
        # can neither break nor alter the command.
        cmd = [
            "ffmpeg",
            "-loglevel", "quiet",
            "-i", str(shitpost.path),
            "-r", "1",
            "-f", "image2",
            os.path.join(tmp_path, "frame-%04d.jpg"),
        ]
        # The exit status is not checked: a failed extraction simply yields
        # no frames, as it did before.
        subprocess.run(cmd, check=False)

        # Collect the extracted frames, keyed by the number in their name
        # ("frame-0001.jpg" -> 1).
        frame_files = {}
        for root, _dirs, files in os.walk(tmp_path):
            for file in files:
                if file.endswith(".jpg"):
                    frame_number = int(file.split("-")[1].split(".")[0])
                    frame_files[frame_number] = os.path.join(root, file)

        # OCR in frame order so the stored JSON is deterministic; os.walk
        # lists files in arbitrary order.
        for frame_number in sorted(frame_files):
            dico["frames"][frame_number] = extractImage(frame_files[frame_number])

    with lock:
        shitpost.ocr_output = OcrOutput(text=dumps(dico))
|
|
|
|
|
|
def extractText(shitpost: Shitpost, lock):
    """Run OCR on a shitpost that has no OCR output yet and log the timing.

    A start message and an elapsed-time message are always printed. The scan
    itself only happens when ``shitpost.ocr_output`` is ``None``: videos go
    through ``scanVideo`` (which receives ``lock``) and images through
    ``scanImage``, depending on ``shitpost.file_type``.
    """
    started = time.time()
    print(f"\tstarting to extract text for {shitpost.hash[:4]} aka {shitpost.path}")

    # Decided once up front, so a scan that fills ocr_output does not affect
    # the second file-type check.
    needs_scan = shitpost.ocr_output is None
    if needs_scan and shitpost.file_type in VIDEO_FILETYPES:
        scanVideo(shitpost, lock)
    if needs_scan and shitpost.file_type in IMAGE_FILETYPES:
        scanImage(shitpost)

    print(f"\ttext extraced for {shitpost.hash[:4]} in :{time.time()-started} ")
|