using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Options;
using SherpaOnnx;
using SqlSugar;
using SqlSugar.IOC;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using VideoAnalysisCore.Common;
using VideoAnalysisCore.Model;
using VideoAnalysisCore.Model.Enum;
using static System.Net.WebRequestMethods;

// NOTE(review): in this copy of the file several generic type arguments are absent
// (AddTransient, Func, List, ReadOnlySpan, GetRequiredService). They are left exactly
// as found here; confirm them against the original source before compiling.
namespace VideoAnalysisCore.AICore.SherpaOnnx
{
    public static class SherpaVadExpand
    {
        /// <summary>
        /// Adds the VAD speech-slicing service to the container with a transient lifetime.
        /// </summary>
        /// <param name="services">Service collection to register into.</param>
        public static void AddSherpaVadExpand(this IServiceCollection services)
        {
            services.AddTransient();
        }
    }

    /// <summary>
    /// Model file names of the supported speech-slicing (VAD) models.
    /// </summary>
    public class SherpaVadVersion
    {
        public const string silero_vad_v4 = "silero_vad_v4.onnx";
        public const string silero_vad_v5 = "silero_vad_v5.onnx";

        /// <summary>
        /// ten_vad (324 KB version).
        /// </summary>
        public const string ten_vad_324 = "ten-vad.onnx";
    }

    /// <summary>
    /// Speech-slicing service: feeds wave samples through a voice activity detector and
    /// hands every detected segment to a caller-supplied recognition callback.
    /// </summary>
    public class SherpaVad
    {
        private VadModelConfig VADModelConfig;
        private readonly RedisManager redisManager;
        private int WindowSize = 512;
        private readonly IServiceProvider serviceProvider;
        private Func Callback;

        public SherpaVad(RedisManager redisManager, IServiceProvider serviceProvider)
        {
            this.redisManager = redisManager;
            this.serviceProvider = serviceProvider;
            VADModelConfig = new VadModelConfig();
#if DEBUG
            VADModelConfig.Debug = 1;
#endif
        }

        /// <summary>
        /// Configures the VAD model and stores the recognition callback.
        /// </summary>
        /// <param name="func">Callback invoked for every segment the VAD detects.</param>
        /// <param name="vadVersion">Model to use; one of the <see cref="SherpaVadVersion"/> constants.</param>
        /// <param name="numThreads">Number of threads, 1 by default.</param>
        /// <param name="useGPU">Whether to run on the GPU ("cuda" provider); on errors check the CUDA installation.</param>
        /// <exception cref="ArgumentNullException"><paramref name="func"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="vadVersion"/> is not a supported model.</exception>
        private void Init(Func func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
        {
            // Fail before any native resource is created rather than inside ReadNext.
            Callback = func ?? throw new ArgumentNullException(nameof(func));

            VADModelConfig.NumThreads = numThreads;
            VADModelConfig.Provider = useGPU ? "cuda" : "cpu";
            var path = Path.Combine(AppCommon.AIModelFile, "vad", vadVersion);

            switch (vadVersion)
            {
                case SherpaVadVersion.silero_vad_v4:
                case SherpaVadVersion.silero_vad_v5:
                    VADModelConfig.SileroVad = new SileroVadModelConfig();
                    VADModelConfig.SileroVad.Model = path;
                    // Threshold / sensitivity: confidence needed to classify audio as speech, usually between 0 and 1.
                    VADModelConfig.SileroVad.Threshold = 0.25f;
                    // Minimum silence duration (seconds): how long silence must last before the utterance counts as finished.
                    VADModelConfig.SileroVad.MinSilenceDuration = 0.2f;
                    // Minimum speech duration (seconds): how long a sound must last to count as valid speech.
                    VADModelConfig.SileroVad.MinSpeechDuration = 0.2f;
                    // Maximum speech duration (seconds).
                    VADModelConfig.SileroVad.MaxSpeechDuration = 3.5f;
                    WindowSize = VADModelConfig.SileroVad.WindowSize;
                    break;

                case SherpaVadVersion.ten_vad_324:
                    VADModelConfig.TenVad = new TenVadModelConfig();
                    VADModelConfig.TenVad.Model = path;
                    // Threshold / sensitivity: confidence needed to classify audio as speech, usually between 0 and 1.
                    VADModelConfig.TenVad.Threshold = 0.3f;
                    // Minimum silence duration (seconds): how long silence must last before the utterance counts as finished.
                    VADModelConfig.TenVad.MinSilenceDuration = 0.2f;
                    // Minimum speech duration (seconds): how long a sound must last to count as valid speech.
                    VADModelConfig.TenVad.MinSpeechDuration = 0.2f;
                    // Maximum speech duration (seconds).
                    VADModelConfig.TenVad.MaxSpeechDuration = 3.5f;
                    VADModelConfig.TenVad.WindowSize = 256;
                    WindowSize = VADModelConfig.TenVad.WindowSize;
                    break;

                default:
                    // Without this, no model path would be configured and the detector
                    // would be created from an incomplete configuration.
                    throw new ArgumentException($"Unsupported VAD model version: {vadVersion}", nameof(vadVersion));
            }
        }

        /// <summary>
        /// Runs VAD over all samples of <paramref name="reader"/>, passes every detected segment
        /// to <paramref name="func"/> and collects the non-empty results.
        /// When a task id is supplied, progress is reported through Redis and the final captions
        /// are written to the database and to Redis.
        /// </summary>
        /// <param name="reader">Wave data to analyse.</param>
        /// <param name="task">Task id; may be null or empty, in which case nothing is reported or persisted.</param>
        /// <param name="func">Callback invoked for every segment the VAD detects.</param>
        /// <param name="vadVersion">Model to use; one of the <see cref="SherpaVadVersion"/> constants.</param>
        /// <param name="numThreads">Number of threads, 1 by default.</param>
        /// <param name="useGPU">Whether to run on the GPU; on errors check the CUDA installation.</param>
        /// <returns>The collected caption entries in detection order.</returns>
        public List TaskHandle(WaveReader reader, string? task, Func func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
        {
            Init(func, vadVersion, numThreads, useGPU);

            // Work on the raw sample data through a span.
            ReadOnlySpan allSamples = reader.Samples.AsSpan();
            int numSamples = allSamples.Length;
            VADModelConfig.SampleRate = reader.SampleRate;
            int sampleRate = VADModelConfig.SampleRate;
            // Integer division: samples after the last full window are not fed to the detector.
            int numIter = numSamples / WindowSize;
            var totalSecond = numSamples / (float)sampleRate;
            var res = new List(500);

            var vad = new VoiceActivityDetector(VADModelConfig, bufferSizeInSeconds: 20);

            // Pops every queued segment, recognises it and reports progress when a task id is present.
            void DrainSegments()
            {
                while (!vad.IsEmpty())
                {
                    var p = ReadNext(vad, res, totalSecond);
                    if (p != null && !string.IsNullOrEmpty(task))
                    {
                        redisManager.SetTaskProgress(task, p + "%");
                    }
                }
            }

            try
            {
                // One buffer is reused for every window to avoid allocating inside the loop.
                float[] buffer = new float[WindowSize];
                for (int i = 0; i != numIter; ++i)
                {
                    allSamples.Slice(i * WindowSize, WindowSize).CopyTo(buffer);
                    vad.AcceptWaveform(buffer);

                    // Only drain while speech is being detected.
                    if (vad.IsSpeechDetected())
                    {
                        DrainSegments();
                    }
                }

                vad.Flush();
                DrainSegments();
            }
            finally
            {
                // Release the detector even when the callback or Redis throws.
                vad.Dispose();
            }

            // Persist the result only when a task id was supplied.
            if (!string.IsNullOrEmpty(task))
            {
                _ = redisManager.AddTaskLog(task, "==>字幕数量" + res.Count);
                var captionsStr = res.ToJson();
                _ = serviceProvider.GetRequiredService>()
                    .AsUpdateable()
                    .SetColumns(it => it.Captions == captionsStr)
                    .Where(it => it.Id == long.Parse(task))
                    .ExecuteCommandAsync();
                _ = redisManager.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);
                // Continue accepting tasks once the video captions have been analysed (currently disabled):
                //redisManager.NewTask();
            }

            return res;
        }

        /// <summary>
        /// Processes the segment at the front of the VAD queue: runs the recognition callback,
        /// appends a caption entry when the recognised text is non-empty and not just "。",
        /// and removes the segment from the queue.
        /// </summary>
        /// <param name="VAD">Detector whose front segment is processed and popped.</param>
        /// <param name="res">List that receives the caption entry.</param>
        /// <param name="totalSecond">Total audio length in seconds, used for the progress percentage.</param>
        /// <returns>
        /// Progress in percent (segment end time relative to <paramref name="totalSecond"/>, rounded to
        /// two decimals), or null when no caption was added.
        /// </returns>
        public double? ReadNext(VoiceActivityDetector VAD, List res, float totalSecond)
        {
            var segment = VAD.Front();
            var sampleRate = VADModelConfig.SampleRate;
            var sampleRateF = (float)sampleRate;
            var samples = segment.Samples;
            float startTime = segment.Start / sampleRateF;
            float duration = samples.Length / sampleRateF;

            using var stream = Callback(sampleRate, samples);
            var recognizedText = stream.Result.Text;

            double? progress = null;
            // Skip empty results and results that consist of a single full stop.
            if (!string.IsNullOrEmpty(recognizedText) && recognizedText.Trim() != "。")
            {
                res.Add(new()
                {
                    Text = recognizedText,
                    Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
                    End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
                });
                progress = Math.Round((double)(startTime + duration) / totalSecond * 100, 2);
            }

            VAD.Pop();
            return progress;
        }
    }
}