# MemeDb/scanners/SpeechScanner.py

import json
import os
import subprocess
import tempfile
import threading
from time import time

from faster_whisper import WhisperModel

from config import VIDEO_FILETYPES
from db.Models import Shitpost, SpeechOutput


def extractSpeech(shitpost: Shitpost, lock: threading.Lock):
    """Run speech extraction for one post and print how long it took.

    Transcription is delegated to ``fastWhisper`` only when the post has no
    ``speech_output`` yet and its ``file_type`` is not listed in
    ``VIDEO_FILETYPES``. The start and end messages are printed whether or
    not any transcription actually happened.

    Args:
        shitpost: Post to process; its ``hash``, ``path``, ``speech_output``
            and ``file_type`` attributes are read.
        lock: Forwarded unchanged to ``fastWhisper``.
    """
    started_at = time()
    print(f"\tstarting to extract speech for {shitpost.hash[:4]} aka {shitpost.path}")

    # NOTE(review): speech is extracted only for file types *not* in
    # VIDEO_FILETYPES -- confirm this is intended and not an inverted check.
    if shitpost.speech_output is None:
        if shitpost.file_type not in VIDEO_FILETYPES:
            # The CLI-based ``whisper`` function in this module is the
            # alternative backend; the faster-whisper one is used here.
            fastWhisper(shitpost, lock)

    elapsed = time() - started_at
    print(f"\tspeech extracted for {shitpost.hash[:4]} in :{elapsed} ")


def whisper(shitpost: Shitpost):
    """Transcribe ``shitpost.path`` by invoking the ``whisper`` command-line tool.

    The CLI is asked to write a JSON result into a temporary directory. That
    file is parsed, its ``"text"`` field is printed, and the whole result is
    re-serialized into a new ``SpeechOutput`` stored on
    ``shitpost.speech_output``.

    Args:
        shitpost: Post whose ``path`` names the media file to transcribe. Its
            ``speech_output`` attribute is overwritten.

    Raises:
        subprocess.CalledProcessError: If the ``whisper`` command exits with a
            non-zero status.
        FileNotFoundError: If the ``whisper`` executable cannot be found, or
            if the expected JSON output file was not produced.
    """
    # Strip only the final extension (and use the platform's path rules) so
    # that a name like "my.video.mp4" maps to "my.video.json".
    # NOTE(review): this assumes the whisper CLI names its output
    # "<input basename without last extension>.json" -- confirm against the
    # installed whisper version.
    stem = os.path.splitext(os.path.basename(shitpost.path))[0]

    with tempfile.TemporaryDirectory() as tmpdir:
        # Pass arguments as a list without a shell so that quotes, spaces or
        # shell metacharacters in the path cannot alter the command.
        cmd = [
            "whisper",
            "--verbose", "False",
            "-f", "json",
            "-o", tmpdir,
            shitpost.path,
        ]
        subprocess.run(cmd, check=True)

        # Read the result before the temporary directory is removed.
        with open(os.path.join(tmpdir, f"{stem}.json"), "r", encoding="utf-8") as file:
            data = json.load(file)

    print("extracted speech :", data["text"] )
    shitpost.speech_output = SpeechOutput(
        text=json.dumps(data),
    )

def fastWhisper(shitpost: Shitpost, lock: threading.Lock):
    """Transcribe ``shitpost.path`` with faster-whisper and store the result.

    The transcript is stored as a JSON object that maps each segment's start
    time (seconds, rounded to two decimals) to that segment's text. It is
    wrapped in a new ``SpeechOutput`` and assigned to
    ``shitpost.speech_output``.

    Args:
        shitpost: Post whose ``path`` names the media file to transcribe. Its
            ``speech_output`` attribute is overwritten.
        lock: Held only while ``shitpost.speech_output`` is assigned; the
            transcription itself runs outside the lock.
    """
    # NOTE(review): a new model is constructed on every call. If this turns
    # out to be expensive, consider reusing one instance across calls.
    model = WhisperModel("turbo", device="cpu", compute_type="int8")
    segments, _info = model.transcribe(
        shitpost.path,
        beam_size=5,
        language_detection_segments=2,
    )

    # Segments whose start times round to the same value overwrite each
    # other; the last one wins.
    transcript = {round(segment.start, 2): segment.text for segment in segments}

    with lock:
        shitpost.speech_output = SpeechOutput(
            text=json.dumps(transcript),
        )