using Microsoft.Extensions.Options;
using SherpaOnnx;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using VideoAnalysisCore.AICore.Whisper;
using VideoAnalysisCore.Common;

namespace VideoAnalysisCore.AICore.SherpaOnnx
{
    /// <summary>
    /// Offline speech-to-caption pipeline built on the sherpa-onnx SenseVoice
    /// recognizer and the Silero voice-activity detector (VAD).
    /// </summary>
    public class SenseVoice
    {
        // Shared recognizer/VAD state, created once by Init().
        // NOTE(review): static mutable state — concurrent RunTask calls would share
        // one VAD instance; confirm this is only used single-threaded.
        static OfflineRecognizer OR = default!;
        static VoiceActivityDetector VAD = default!;
        static VadModelConfig VADModelConfig = default!;

        /// <summary>
        /// Initializes the SenseVoice offline recognizer and the Silero VAD.
        /// Must be called once before <see cref="RunTask(string)"/>.
        /// </summary>
        /// <param name="speakerNumber">Currently unused; kept for backward compatibility.</param>
        /// <param name="threshold">Currently unused; kept for backward compatibility.</param>
        public static void Init(int speakerNumber = 0, double threshold = 0.6)
        {
            Console.WriteLine("初始化 SenseVoice");

            OfflineRecognizerConfig config = new OfflineRecognizerConfig();
            // Sample rate the model was trained with.
            config.FeatConfig.SampleRate = 16000;
            // Feature dimension the model was trained with.
            config.FeatConfig.FeatureDim = 80;
            // Path to tokens.txt.
            config.ModelConfig.Tokens = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "tokens.txt");
            // SenseVoice model file.
            config.ModelConfig.SenseVoice.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "model.onnx");
            // 1 = apply inverse text normalization to the SenseVoice output.
            config.ModelConfig.SenseVoice.UseInverseTextNormalization = 1;
            // Model type left empty: sherpa-onnx infers it from the config.
            config.ModelConfig.ModelType = string.Empty;

            #region Decoding method
            // Valid decoding methods: greedy_search, modified_beam_search.
            config.DecodingMethod = "greedy_search";
            //// Modified beam search:
            //config.DecodingMethod = "modified_beam_search";
            //// Only used when DecodingMethod is modified_beam_search;
            //// number of active paths kept during the search.
            //config.MaxActivePaths = 4;
            #endregion

            // Hotwords file (none configured).
            config.HotwordsFile = string.Empty;
            // Hotwords boosting score.
            config.HotwordsScore = 1.5f;
            // Path to inverse-text-normalization rule FSTs (none configured).
            config.RuleFsts = string.Empty;
            config.ModelConfig.Debug = 0;
            OR = new OfflineRecognizer(config);

            VADModelConfig = new VadModelConfig();
            VADModelConfig.SileroVad.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "silero_VAD.onnx");
            VADModelConfig.Debug = 0;
            // 60 = VAD buffer size (seconds, per sherpa-onnx API) — confirm against the binding docs.
            VAD = new VoiceActivityDetector(VADModelConfig, 60);
        }

        /// <summary>
        /// Transcribes the task's WAV file into timed captions, stores them in Redis
        /// under the "Captions" field and notifies the ParsingSpeaker channel.
        /// </summary>
        /// <param name="task">Task id; audio is expected at {task.LocalPath()}/{task}.wav.</param>
        /// <exception cref="Exception">Thrown when the audio file cannot be found.</exception>
        public static async Task RunTask(string task)
        {
            var filePath = Path.Combine(task.LocalPath(), task + ".wav");
            if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath)) throw new Exception("task 音频路径未找到");

            WaveReader reader = new WaveReader(filePath);
            int numSamples = reader.Samples.Length;
            int windowSize = VADModelConfig.SileroVad.WindowSize;
            int sampleRate = VADModelConfig.SampleRate;
            // Trailing samples (numSamples % windowSize) are dropped before the flush,
            // matching the original behavior.
            int numIter = numSamples / windowSize;

            // NOTE(review): the generic type argument was lost in extraction;
            // "Caption" is assumed from the Redis field name — confirm against the project.
            var res = new List<Caption>(500);

            // Feed the audio to the VAD window by window and decode segments as they complete.
            for (int i = 0; i != numIter; ++i)
            {
                float[] samples = new float[windowSize];
                Array.Copy(reader.Samples, i * windowSize, samples, 0, windowSize);
                VAD.AcceptWaveform(samples);
                if (VAD.IsSpeechDetected())
                {
                    DrainSegments(sampleRate, res);
                }
            }

            // Flush so the final, possibly unfinished segment is emitted, then drain it.
            VAD.Flush();
            DrainSegments(sampleRate, res);

            await RedisExpand.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);
            RedisExpand.InsertChannel(Enum.RedisChannelEnum.ParsingSpeaker, task);
        }

        /// <summary>
        /// Pops every pending speech segment from the VAD, decodes it with the
        /// SenseVoice recognizer and appends non-empty results to <paramref name="res"/>.
        /// </summary>
        /// <param name="sampleRate">Sample rate used to convert sample offsets to seconds.</param>
        /// <param name="res">Caption list to append to.</param>
        static void DrainSegments(int sampleRate, List<Caption> res)
        {
            while (!VAD.IsEmpty())
            {
                SpeechSegment segment = VAD.Front();
                float startTime = segment.Start / (float)sampleRate;
                float duration = segment.Samples.Length / (float)sampleRate;
                // Fix: dispose the stream — the original leaked one OfflineStream per segment.
                using OfflineStream stream = OR.CreateStream();
                stream.AcceptWaveform(sampleRate, segment.Samples);
                OR.Decode(stream);
                if (!string.IsNullOrEmpty(stream.Result.Text))
                {
                    res.Add(new() { Text = stream.Result.Text, Start = startTime, End = startTime + duration });
                }
                VAD.Pop();
            }
        }
    }
}