from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import subprocess
import codecs
import datetime
import json
def format_sec(sec):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        sec: Non-negative offset from the start of the media, in seconds
            (int or float).

    Returns:
        The timestamp string. The field after the comma is milliseconds,
        which is what the SubRip format expects (not a frame number).
    """
    # Work in whole milliseconds so that rounding carries into the seconds
    # field (59.9996 s becomes 00:01:00,000 rather than an invalid ",1000").
    total_ms = int(round(sec * 1000))
    total_sec, ms = divmod(total_ms, 1000)
    m, s = divmod(total_sec, 60)
    h, m = divmod(m, 60)
    return "{0:02d}:{1:02d}:{2:02d},{3:03d}".format(h, m, s, ms)
class Stt:
    """Transcribe the audio track of a media file into an SRT subtitle file.

    ffmpeg is launched to decode the input into raw mono signed 16-bit
    little-endian PCM at ``sample_rate`` Hz on its stdout. That stream is fed
    to a Vosk ``KaldiRecognizer`` and the recognized words are grouped into
    numbered subtitle cues written to ``<mp4_path>.srt``.
    """

    # Model directory used when the caller does not supply one.
    DEFAULT_MODEL_PATH = r'/path/to/vosk-model-en-us-0.42-gigaspeech'
    # Nominal number of words per subtitle cue.
    WORDS_PER_LINE = 7

    def __init__(self, mp4_path, model_path=None):
        """Load the Vosk model and start the ffmpeg decoder.

        Args:
            mp4_path: Path of the media file to transcribe.
            model_path: Directory of the Vosk model. Defaults to
                ``DEFAULT_MODEL_PATH`` when omitted.
        """
        self.index = 0  # running cue number; incremented before each cue is written
        self.model_path = self.DEFAULT_MODEL_PATH if model_path is None else model_path
        self.mp4_path = mp4_path
        self.ffmpeg_path = 'ffmpeg'
        self.sample_rate = 16000
        # Set the log level before the model is constructed so that it is
        # already in effect while the model loads.
        SetLogLevel(-1)
        self.model = Model(self.model_path)
        self.rec = KaldiRecognizer(self.model, self.sample_rate)
        # Ask for per-word entries (the "result" list consumed by recv_text).
        self.rec.SetWords(True)
        self.process = subprocess.Popen(
            [self.ffmpeg_path, '-loglevel', 'quiet', '-i', self.mp4_path,
             '-ar', str(self.sample_rate), '-ac', '1', '-f', 's16le', '-'],
            stdout=subprocess.PIPE)

    def output_one_line(self, result, frm, to, f):
        """Write one SRT cue built from ``result[frm:to]``.

        Args:
            result: List of word dicts, each with "start", "end" and "word".
            frm: Index of the first word of the cue (inclusive).
            to: Index one past the last word of the cue (exclusive).
            f: Writable text file receiving the cue.
        """
        words = result[frm:to]
        if not words:
            # Nothing to show; avoid emitting an empty, mistimed cue.
            return
        start_sec = min(float(w["start"]) for w in words)
        end_sec = max(float(w["end"]) for w in words)
        sentence = " ".join(w["word"] for w in words)
        self.index += 1
        f.write(str(self.index) + "\n")
        f.write(format_sec(start_sec) + " --> " + format_sec(end_sec) + "\n")
        f.write(sentence + "\n")
        f.write("\n")
        # Flush per cue so the partial .srt is usable while a long run is going.
        f.flush()
        # Progress indicator: timestamp reached so far.
        sys.stdout.write(format_sec(end_sec) + "\n")

    def recv_text(self, subtitle, f):
        """Split one recognizer result into cues and write them to ``f``.

        Args:
            subtitle: JSON document (str or UTF-8 bytes) from the recognizer.
            f: Writable text file receiving the cues.
        """
        js = json.loads(subtitle)
        if "result" not in js:
            sys.stdout.write("no result in json.\n")
            return
        result = js["result"]
        frm = 0
        length = len(result)
        while frm < length:
            to = frm + self.WORDS_PER_LINE
            # If what would remain after this cue is shorter than a full cue,
            # fold it into this one instead of leaving a short trailing cue.
            # This also covers the case where `to` overshoots the list.
            if length - to < self.WORDS_PER_LINE:
                to = length
            self.output_one_line(result, frm, to, f)
            frm = to
        sys.stdout.flush()

    def transcribe(self):
        """Run recognition over the whole ffmpeg stream and write the SRT file.

        The output path is ``self.mp4_path + ".srt"``, written as UTF-8.
        The ffmpeg pipe is closed and the child process reaped even when
        recognition fails part-way.
        """
        try:
            with open(self.mp4_path + ".srt", 'w', encoding='utf-8') as f:
                while True:
                    data = self.process.stdout.read(4000)
                    if not data:
                        # End of audio: collect whatever is still buffered.
                        self.recv_text(self.rec.FinalResult(), f)
                        break
                    if self.rec.AcceptWaveform(data):
                        self.recv_text(self.rec.Result(), f)
        finally:
            # Closing our end first lets ffmpeg exit if we stopped reading
            # early; wait() then reaps it so no zombie process is left.
            self.process.stdout.close()
            returncode = self.process.wait()
        if returncode != 0:
            # ffmpeg runs with '-loglevel quiet', so surface failures here.
            sys.stderr.write("ffmpeg exited with status {0}\n".format(returncode))
if __name__ == "__main__":
    # Script entry point: the single required argument is the media file to
    # transcribe; the subtitles are written next to it as "<file>.srt".
    if len(sys.argv) < 2:
        sys.exit("usage: {0} <media-file>".format(sys.argv[0]))
    stt = Stt(sys.argv[1])
    stt.transcribe()