using Microsoft.Extensions.Options;
using SherpaOnnx;
using SqlSugar.IOC;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using VideoAnalysisCore.Common;
using VideoAnalysisCore.Model;
using VideoAnalysisCore.Model.Enum;
using static System.Runtime.InteropServices.JavaScript.JSType;

namespace VideoAnalysisCore.AICore.SherpaOnnx
{
    /// <summary>
    /// Offline speech-to-caption pipeline: a Silero VAD cuts the audio into speech
    /// segments and each segment is decoded by a SenseVoice offline recognizer.
    /// </summary>
    public static class SenseVoice
    {
        // NOTE(review): several generic type arguments are absent from the text this
        // edit was prepared from (the "Task>" return types, "List", "Updateable()").
        // They are reproduced as found; restore the real type arguments before compiling.

        // Model directory used by the (currently disabled) hot-word transducer setup.
        const string TransducerStr = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20";

        // Shared recognizer, created by Init().
        static OfflineRecognizer OR = default!;

        // Shared VAD configuration; a detector instance is created per TaskHandle call.
        static VadModelConfig VADModelConfig = default!;

        /// <summary>
        /// Initializes the SenseVoice recognizer and the VAD configuration.
        /// </summary>
        /// <param name="numThreads">Number of threads handed to the model (default 6).</param>
        /// <param name="useGPU">
        /// When true the "cuda" provider is selected, otherwise "cpu".
        /// If CUDA fails to load, check the CUDA environment installation.
        /// </param>
        /// <param name="useHotwords">Currently unused; the hot-word setup is disabled.</param>
        public static void Init(int numThreads = 6, bool useGPU = false, bool useHotwords = false)
        {
            Console.WriteLine("初始化 SenseVoice");

            var modelDir = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17");

            OfflineRecognizerConfig config = new OfflineRecognizerConfig();

            // Sample rate.
            config.FeatConfig.SampleRate = 16000;
            // Feature dimension the model was trained with.
            config.FeatConfig.FeatureDim = 80;

            // Path to tokens.txt.
            config.ModelConfig.Tokens = Path.Combine(modelDir, "tokens.txt");
            // SenseVoice model.
            config.ModelConfig.SenseVoice.Model = Path.Combine(modelDir, "model.onnx");
            // 1 = apply inverse text normalization to the SenseVoice output.
            config.ModelConfig.SenseVoice.UseInverseTextNormalization = 1;
            config.ModelConfig.SenseVoice.Language = "zh";
            // Model type.
            config.ModelConfig.ModelType = string.Empty;
            config.ModelConfig.NumThreads = numThreads;

            // Use CUDA only when the caller asked for the GPU. The condition used to be
            // negated, which selected CUDA for useGPU == false and CPU for useGPU == true.
            config.ModelConfig.Provider = useGPU ? "cuda" : "cpu";

            // Decoding method: "greedy_search" or "modified_beam_search".
            // MaxActivePaths (number of active paths kept during the search) only
            // applies to modified_beam_search and is therefore not set here.
            config.DecodingMethod = "greedy_search";

            // Hot words are disabled (marked as not working). The earlier attempt set
            // HotwordsFile/HotwordsScore, switched to modified_beam_search and pointed
            // the transducer encoder/decoder/joiner at the zipformer bilingual model.

            // Path of the inverse-text-normalization rule FST.
            config.RuleFsts = Path.Combine(AppCommon.AIModelFile, "itn_subject_sx.fst");
#if DEBUG
            config.ModelConfig.Debug = 1;
#endif
            OR = new OfflineRecognizer(config);

            VADModelConfig = new VadModelConfig();
            VADModelConfig.SileroVad.Model = Path.Combine(modelDir, "silero_vad.onnx");
            VADModelConfig.Debug = 0;
        }

        /// <summary>
        /// Produces captions for the wave audio contained in a stream.
        /// </summary>
        /// <param name="s">Stream holding the wave audio.</param>
        /// <returns>The captions collected by <c>TaskHandle</c>.</returns>
        /// <exception cref="Exception">Thrown when <paramref name="s"/> is null.</exception>
        public static async Task> RunTask(Stream s)
        {
            if (s is null)
            {
                throw new Exception("音频路径 is null");
            }

            return await TaskHandle(new WaveReader(s));
        }

        /// <summary>
        /// Produces captions for a task; the audio is read from "task.wav" inside the
        /// directory returned by <c>task.LocalPath()</c>.
        /// </summary>
        /// <param name="task">Task id.</param>
        /// <exception cref="Exception">Thrown when the wave file does not exist.</exception>
        public static async Task RunTask(string task)
        {
            var filePath = Path.Combine(task.LocalPath(), "task.wav");
            if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
            {
                throw new Exception("task 音频路径未找到");
            }

            await TaskHandle(new WaveReader(filePath), task);
        }

        /// <summary>
        /// Runs VAD over the whole wave, decodes every detected speech segment and,
        /// when a task id is given, stores the captions in the database and in Redis.
        /// </summary>
        /// <param name="reader">Decoded wave audio.</param>
        /// <param name="task">Task id, or null when no persistence is wanted.</param>
        /// <returns>The captions in the order the segments were read.</returns>
        public static async Task> TaskHandle(WaveReader reader, string? task = null)
        {
            // Lazily initialize with the default settings.
            if (OR is null)
            {
                Init();
            }

            int numSamples = reader.Samples.Length;
            int windowSize = VADModelConfig.SileroVad.WindowSize;
            int sampleRate = VADModelConfig.SampleRate;
            // Only whole windows are fed to the detector; a trailing partial window is skipped.
            int numIter = numSamples / windowSize;
            // NOTE(review): the duration is derived from the VAD sample rate, which
            // assumes the wave was recorded at that rate — confirm.
            var totalSecond = numSamples / (float)sampleRate;
            var res = new List(500);

            // NOTE(review): the second argument is presumably the detector buffer size in seconds — confirm.
            using var VAD = new VoiceActivityDetector(VADModelConfig, 30);

            for (int i = 0; i != numIter; ++i)
            {
                int start = i * windowSize;
                float[] samples = new float[windowSize];
                Array.Copy(reader.Samples, start, samples, 0, windowSize);
                VAD.AcceptWaveform(samples);

                // While speech is being detected, drain the segments that are ready.
                if (VAD.IsSpeechDetected())
                {
                    while (!VAD.IsEmpty())
                    {
                        await VAD.ReadNext(res, totalSecond, task);
                    }
                }
            }

            // Flush the detector and drain whatever is still queued.
            VAD.Flush();
            while (!VAD.IsEmpty())
            {
                await VAD.ReadNext(res, totalSecond, task);
            }

            // Persist only when a task id was supplied.
            if (!string.IsNullOrEmpty(task))
            {
                Console.WriteLine($"{DateTime.Now}=> SenseVoice 字幕数量{res.Count}");

                var captionsStr = res.ToJson();
                await DbScoped.Sugar
                    .Updateable()
                    .SetColumns(it => it.Captions == captionsStr)
                    .Where(it => it.Id == long.Parse(task))
                    .ExecuteCommandAsync();
                await RedisExpand.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);

                // Captions are done; continue accepting tasks.
                RedisExpand.NewTask();
            }

            return res;
        }

        /// <summary>
        /// Decodes the segment at the front of the detector queue, appends a caption
        /// for it (unless the text is empty or just "。") and pops the segment.
        /// </summary>
        /// <remarks>The body contains no awaits, so it runs synchronously.</remarks>
        /// <param name="VAD">Detector whose front segment is processed.</param>
        /// <param name="res">List the caption is appended to.</param>
        /// <param name="totalSecond">Total audio duration in seconds, used for the progress percentage.</param>
        /// <param name="task">Task id; when set, the task progress is updated.</param>
        public static async Task ReadNext(this VoiceActivityDetector VAD, List res, float totalSecond, string? task = null)
        {
            var segment = VAD.Front();
            int sampleRate = VADModelConfig.SampleRate;
            float startTime = segment.Start / (float)sampleRate;
            float duration = segment.Samples.Length / (float)sampleRate;
            float endTime = startTime + duration;

            using var stream = OR.CreateStream();
            stream.AcceptWaveform(sampleRate, segment.Samples);
            OR.Decode(stream);

            // Read the decoded text once and reuse it.
            var recognized = stream.Result.Text;

            // Skip empty results and segments that consist of a lone full stop.
            if (!string.IsNullOrEmpty(recognized) && recognized.Trim() != "。")
            {
                res.Add(new()
                {
                    Text = recognized,
                    Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
                    End = (float)Math.Round(endTime, 2, MidpointRounding.AwayFromZero),
                });

                if (!string.IsNullOrEmpty(task))
                {
                    // Progress = end of this segment relative to the total duration.
                    var percent = Math.Round((double)endTime / totalSecond * 100, 2);
                    RedisExpand.SetTaskProgress(task, $"{percent}%");
                }
            }

            // The segment is consumed whether or not it produced a caption.
            VAD.Pop();
        }
    }
}