vosk – 优海

from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import subprocess
import codecs
import datetime
import json
 
 
def format_sec(sec):
    """Format a time offset in seconds as an SRT timestamp.

    SRT timestamps have the form ``HH:MM:SS,mmm`` where the last field is
    milliseconds. The value is rounded to the nearest millisecond once, up
    front, so that a carry (e.g. 59.9996 s) propagates into the seconds,
    minutes and hours fields consistently.

    Args:
        sec: Non-negative offset in seconds (int or float).

    Returns:
        The timestamp string, e.g. ``"00:01:02,500"`` for ``62.5``.
    """
    total_ms = int(round(sec * 1000))
    total_sec, ms = divmod(total_ms, 1000)
    total_min, s = divmod(total_sec, 60)
    h, m = divmod(total_min, 60)
    return "{0:02d}:{1:02d}:{2:02d},{3:03d}".format(h, m, s, ms)
 
 
class Stt:
    """Transcribe the audio of a media file into an SRT subtitle file.

    The input is decoded by an ``ffmpeg`` child process into raw mono
    16-bit PCM, which is streamed into a Vosk ``KaldiRecognizer``. Each
    recognised utterance is split into cues of roughly ``WORDS_PER_LINE``
    words and written to ``<mp4_path>.srt``.
    """

    # Model used when no ``model_path`` is passed to the constructor.
    DEFAULT_MODEL_PATH = r'/path/to/vosk-model-en-us-0.42-gigaspeech'
    # Target number of words per subtitle cue.
    WORDS_PER_LINE = 7

    def __init__(self, mp4_path, model_path=None):
        """Load the recognition model and start decoding ``mp4_path``.

        Args:
            mp4_path: Path of the media file to transcribe.
            model_path: Directory of the Vosk model; defaults to
                ``DEFAULT_MODEL_PATH`` when omitted.
        """
        # Running cue number; SRT cues are numbered from 1.
        self.index = 0

        if model_path is None:
            model_path = self.DEFAULT_MODEL_PATH
        self.model_path = model_path
        self.mp4_path = mp4_path

        self.ffmpeg_path = 'ffmpeg'
        self.sample_rate = 16000

        # Lower the log level before the model is loaded so that the
        # model-loading messages are silenced as well.
        SetLogLevel(-1)
        self.model = Model(self.model_path)
        self.rec = KaldiRecognizer(self.model, self.sample_rate)
        # Ask for per-word entries ("result") in the recogniser's JSON.
        self.rec.SetWords(True)

        # ffmpeg resamples to mono s16le at `sample_rate` and writes the
        # raw samples to its stdout, which transcribe() reads.
        self.process = subprocess.Popen(
            [self.ffmpeg_path, '-loglevel', 'quiet', '-i', self.mp4_path,
             '-ar', str(self.sample_rate), '-ac', '1', '-f', 's16le', '-'],
            stdout=subprocess.PIPE)

    def output_one_line(self, result, frm, to, f):
        """Write the words ``result[frm:to]`` to ``f`` as one SRT cue.

        The cue spans from the earliest word start to the latest word end.
        The cue's end timestamp is also echoed to stdout as progress.

        Args:
            result: List of word dicts with "word", "start" and "end" keys.
            frm: Index of the first word of the cue.
            to: Index one past the last word of the cue.
            f: Writable text file receiving the SRT output.
        """
        words = result[frm:to]
        if not words:
            # Nothing to show; do not emit (or number) an empty cue.
            return

        start_sec = min(float(w["start"]) for w in words)
        end_sec = max(float(w["end"]) for w in words)
        # Every word is followed by a single space, as before.
        sentence = "".join(w["word"] + " " for w in words)

        self.index += 1
        end_stamp = format_sec(end_sec)

        f.write("{0}\n{1} --> {2}\n{3}\n\n".format(
            self.index, format_sec(start_sec), end_stamp, sentence))
        f.flush()

        sys.stdout.write(end_stamp + "\n")

    @staticmethod
    def _chunk_bounds(length, size=7):
        """Return ``(frm, to)`` index pairs that split ``length`` words into cues.

        Each cue takes ``size`` words, except that when fewer than ``size``
        words would remain after a cue they are folded into it, so the last
        cue of an utterance may hold up to ``2 * size - 1`` words.
        """
        bounds = []
        frm = 0
        while frm < length:
            to = frm + size
            if length - to < size:
                to = length
            bounds.append((frm, to))
            frm = to
        return bounds

    def recv_text(self, subtitle, f):
        """Parse one recogniser JSON result and write its words as SRT cues.

        Args:
            subtitle: JSON document (``str`` or ``bytes``) as returned by
                the recogniser's ``Result()`` / ``FinalResult()``.
            f: Writable text file receiving the SRT output.
        """
        js = json.loads(subtitle)
        if "result" not in js:
            sys.stdout.write("no result in json.\n")
            return
        result = js["result"]

        for frm, to in self._chunk_bounds(len(result), self.WORDS_PER_LINE):
            self.output_one_line(result, frm, to, f)

        sys.stdout.flush()

    def transcribe(self):
        """Stream the decoded audio through the recogniser and write the SRT.

        The output is written to ``<mp4_path>.srt`` as UTF-8. The ffmpeg
        pipe is closed and the child process is reaped when done, even if
        recognition raises.
        """
        stream = self.process.stdout
        try:
            # Explicit UTF-8 so non-ASCII words do not depend on the locale.
            with open(self.mp4_path + ".srt", 'w', encoding='utf-8') as f:
                # Read fixed-size chunks until ffmpeg closes its stdout.
                for data in iter(lambda: stream.read(4000), b''):
                    # A true return means a result is ready to be fetched.
                    if self.rec.AcceptWaveform(data):
                        self.recv_text(self.rec.Result(), f)
                # Flush whatever the recogniser still holds at end of input.
                self.recv_text(self.rec.FinalResult(), f)
        finally:
            stream.close()
            self.process.wait()
 
# Script entry point: transcribe the media file named on the command line.
# Guarded so that importing this module does not start a transcription.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: {0} <media-file>".format(sys.argv[0]))
    stt = Stt(sys.argv[1])
    stt.transcribe()

from vosk import Model, KaldiRecognizer, SetLogLevel import sys import os import wave import subprocess import codecs import datetime import json def format_sec(sec): total_sec = int(sec) s = total_sec % 60 m = total_sec // 60 h = m // 60 m = m % 60 frame = int(sec * 1000) % 1000 frame = frame // (1000 // 25) return "{0:02d}:{1:02d}:{2:02d},{3:03d}".format(h, m, s, frame) class Stt: def __init__(self, mp4_path): self.index = 0 self.model_path = r'/path/to/vosk-model-en-us-0.42-gigaspeech' self.mp4_path = mp4_path self.ffmpeg_path = 'ffmpeg' self.sample_rate = 16000 self.model = Model(self.model_path) self.rec = KaldiRecognizer(self.model, self.sample_rate) self.rec.SetWords(True) SetLogLevel(-1) self.process = subprocess.Popen([self.ffmpeg_path, '-loglevel', 'quiet', '-i', self.mp4_path, '-ar', str(self.sample_rate), '-ac', '1', '-f', 's16le', '-'], stdout=subprocess.PIPE) def output_one_line(self, result, frm, to, f): sentence = "" start_sec = 60 * 60 * 24 end_sec = 0 for i in range(frm, to): float_value = float(result[i]["start"]) if start_sec > float_value: start_sec = float_value float_value = float(result[i]["end"]) if end_sec < float_value: end_sec = float_value sentence += result[i]["word"] + " " self.index += 1 f.write(str(self.index) + "\n") f.write(format_sec(start_sec) + " --> " + format_sec(end_sec) + "\n") f.write(sentence + "\n") f.write("\n") f.flush() sys.stdout.write(format_sec(end_sec) + "\n") def recv_text(self, subtitle, f): js = json.loads(subtitle) if not "result" in js: sys.stdout.write("no result in json.\n") return result = js["result"] frm = 0 length = len(result) while frm < length: to = frm + 7 if length - to < 7: to = length if to > length: to = length self.output_one_line(result, frm, to, f) frm = to # sys.stdout.buffer.write(subtitle) sys.stdout.flush() def transcribe(self): with open(self.mp4_path + ".srt", 'w') as f: while True: data = self.process.stdout.read(4000) if len(data) == 0: subtitle = 
self.rec.FinalResult().encode('utf-8') self.recv_text(subtitle, f) break if self.rec.AcceptWaveform(data): subtitle = self.rec.Result().encode('utf-8') self.recv_text(subtitle, f) f.close() stt = Stt(sys.argv[1]) stt.transcribe()

References:
kdenlive-23.04.1/bin/data/kdenlive/scripts/speechtotext.py
https://docs.kdenlive.org/en/effects_and_compositions/subtitles.html
https://realpython.com/python-formatted-output/

M	T	W	T	F	S	S
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30	31