"""Transcribe the audio track of a media file into an SRT subtitle file.

Usage: python <this script> <media-file>

ffmpeg decodes the input to raw 16 kHz mono 16-bit PCM, which is streamed into
a Vosk recognizer. The recognized words are grouped into short subtitle lines
and written to "<media-file>.srt".
"""
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import subprocess
import codecs
import datetime
import json


def format_sec(sec):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        sec: Offset from the start of the media, in seconds (int or float).

    Returns:
        The timestamp string; the part after the comma is milliseconds, as the
        SRT format requires.
    """
    # Work in whole milliseconds so that rounding (e.g. 59.9996 s) carries
    # correctly into the seconds/minutes/hours fields.
    total_ms = int(round(sec * 1000))
    total_sec, ms = divmod(total_ms, 1000)
    total_min, s = divmod(total_sec, 60)
    h, m = divmod(total_min, 60)
    return "{0:02d}:{1:02d}:{2:02d},{3:03d}".format(h, m, s, ms)


class Stt:
    """Speech-to-text driver: ffmpeg -> Vosk recognizer -> SRT file.

    Constructing an instance loads the Vosk model and immediately starts the
    ffmpeg decoder process; call :meth:`transcribe` to consume its output.
    """

    # Model used when the caller does not pass ``model_path``.
    DEFAULT_MODEL_PATH = r'/path/to/vosk-model-en-us-0.42-gigaspeech'
    # Target number of words per subtitle entry.
    WORDS_PER_LINE = 7
    # Number of bytes read from ffmpeg per recognizer call.
    CHUNK_BYTES = 4000

    def __init__(self, mp4_path, model_path=None):
        """Load the model and start decoding ``mp4_path`` with ffmpeg.

        Args:
            mp4_path: Path of the media file to transcribe.
            model_path: Directory of the Vosk model; defaults to
                ``DEFAULT_MODEL_PATH``.
        """
        self.index = 0  # running SRT entry number (1-based once written)
        if model_path is None:
            model_path = self.DEFAULT_MODEL_PATH
        self.model_path = model_path
        self.mp4_path = mp4_path
        self.ffmpeg_path = 'ffmpeg'
        self.sample_rate = 16000

        # Silence Vosk logging *before* loading the model, otherwise the
        # model-loading messages are still printed.
        SetLogLevel(-1)
        self.model = Model(self.model_path)
        self.rec = KaldiRecognizer(self.model, self.sample_rate)
        # Ask for per-word entries ("result") in the recognizer output.
        self.rec.SetWords(True)

        # Decode to raw signed 16-bit little-endian mono PCM on stdout.
        self.process = subprocess.Popen(
            [self.ffmpeg_path, '-loglevel', 'quiet',
             '-i', self.mp4_path,
             '-ar', str(self.sample_rate),
             '-ac', '1',
             '-f', 's16le', '-'],
            stdout=subprocess.PIPE)

    def output_one_line(self, result, frm, to, f):
        """Write the words ``result[frm:to]`` as one SRT entry.

        Args:
            result: List of word dicts with "word", "start" and "end" keys.
            frm: Index of the first word of the entry.
            to: Index one past the last word of the entry.
            f: Writable text file receiving the SRT entry.

        The entry spans from the earliest word start to the latest word end.
        The end timestamp is also echoed to stdout as a progress indicator.
        An empty range writes nothing.
        """
        words = result[frm:to]
        if not words:
            return

        start_sec = min(float(word["start"]) for word in words)
        end_sec = max(float(word["end"]) for word in words)
        sentence = " ".join(word["word"] for word in words)

        self.index += 1
        f.write(str(self.index) + "\n")
        f.write(format_sec(start_sec) + " --> " + format_sec(end_sec) + "\n")
        f.write(sentence + "\n")
        f.write("\n")
        f.flush()
        sys.stdout.write(format_sec(end_sec) + "\n")

    def recv_text(self, subtitle, f):
        """Split one recognizer result into subtitle entries and write them.

        Args:
            subtitle: JSON text (str or UTF-8 bytes) returned by the
                recognizer.
            f: Writable text file receiving the SRT entries.

        Words are emitted ``WORDS_PER_LINE`` at a time; when fewer than
        ``WORDS_PER_LINE`` words would be left over after a group, they are
        merged into that group instead of forming a very short final entry.
        """
        js = json.loads(subtitle)
        if "result" not in js:
            sys.stdout.write("no result in json.\n")
            return

        result = js["result"]
        length = len(result)
        frm = 0
        while frm < length:
            to = frm + self.WORDS_PER_LINE
            # Absorb a short tail (this also clamps `to` to `length`).
            if length - to < self.WORDS_PER_LINE:
                to = length
            self.output_one_line(result, frm, to, f)
            frm = to
        sys.stdout.flush()

    def transcribe(self):
        """Run recognition over the whole ffmpeg stream and write the SRT file.

        The output path is ``self.mp4_path + ".srt"``. The ffmpeg pipe is
        closed and the process is reaped when this method returns or raises.
        """
        try:
            # UTF-8 so recognized text is written regardless of the locale.
            with open(self.mp4_path + ".srt", 'w', encoding='utf-8') as f:
                while True:
                    data = self.process.stdout.read(self.CHUNK_BYTES)
                    if not data:
                        # End of audio: flush whatever the recognizer holds.
                        self.recv_text(self.rec.FinalResult(), f)
                        break
                    if self.rec.AcceptWaveform(data):
                        self.recv_text(self.rec.Result(), f)
        finally:
            self.process.stdout.close()
            returncode = self.process.wait()

        # ffmpeg runs with "-loglevel quiet", so a failure would otherwise
        # only show up as an empty subtitle file.
        if returncode != 0:
            sys.stderr.write(
                "ffmpeg exited with status {0}\n".format(returncode))


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: {0} <media-file>".format(sys.argv[0]))
    stt = Stt(sys.argv[1])
    stt.transcribe()
refer to:
kdenlive-23.04.1/bin/data/kdenlive/scripts/speechtotext.py
https://docs.kdenlive.org/en/effects_and_compositions/subtitles.html
https://realpython.com/python-formatted-output/