using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Options;
using SherpaOnnx;
using SqlSugar;
using SqlSugar.IOC;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using VideoAnalysisCore.Common;
using VideoAnalysisCore.Model;
using VideoAnalysisCore.Model.Enum;
using static System.Net.WebRequestMethods;

// NOTE(review): the generic type arguments in this file (AddTransient, Func, List,
// ReadOnlySpan, GetRequiredService) are missing from the text this edit was based on.
// They are reproduced exactly as found and must be restored from the real source;
// they were deliberately not guessed.
namespace VideoAnalysisCore.AICore.SherpaOnnx
{
    /// <summary>
    /// Dependency-injection registration helpers for the VAD slicing service.
    /// </summary>
    public static class SherpaVadExpand
    {
        /// <summary>
        /// Adds the VAD speech-slicing registration to the service collection.
        /// </summary>
        /// <param name="services">Service collection that receives the transient registration.</param>
        public static void AddSherpaVadExpand(this IServiceCollection services)
        {
            // NOTE(review): type argument missing in source (presumably the VAD service) - confirm.
            services.AddTransient();
        }
    }

    /// <summary>
    /// File names of the VAD models that <see cref="SherpaVad"/> can be asked to load.
    /// </summary>
    public class SherpaVadVersion
    {
        public const string silero_vad_v4 = "silero_vad_v4.onnx";
        public const string silero_vad_v5 = "silero_vad_v5.onnx";

        /// <summary>
        /// ten_vad (324 kb version).
        /// </summary>
        public const string ten_vad_324 = "ten-vad.onnx";
    }

    /// <summary>
    /// Speech-slicing service: feeds wave samples to a voice activity detector, hands every
    /// detected segment to a recognition callback and collects the recognised captions.
    /// </summary>
    public class SherpaVad
    {
        // Per-instance detector configuration. This used to be a static field that every
        // constructor call replaced, so instances used at the same time overwrote each
        // other's model, thread and provider settings.
        private VadModelConfig VADModelConfig;
        private readonly RedisManager redisManager;
        private readonly IServiceProvider serviceProvider;

        // Recognition callback for the current TaskHandle call; assigned by Init.
        private Func Callback;

        /// <summary>
        /// Creates the service with a default configuration: 16 kHz, 1 thread, CPU provider.
        /// </summary>
        /// <param name="redisManager">Used for task progress, task logs and the caption hash entry.</param>
        /// <param name="serviceProvider">Used to resolve the database service that stores the captions.</param>
        public SherpaVad(RedisManager redisManager, IServiceProvider serviceProvider)
        {
            this.redisManager = redisManager;
            this.serviceProvider = serviceProvider;

            VADModelConfig = new VadModelConfig();
            VADModelConfig.SampleRate = 16000;
            VADModelConfig.NumThreads = 1;
            VADModelConfig.Provider = "cpu";
#if DEBUG
            VADModelConfig.Debug = 1;
#endif
            VADModelConfig.SileroVad = new SileroVadModelConfig();
            VADModelConfig.TenVad = new TenVadModelConfig();
        }

        /// <summary>
        /// Applies the per-call settings to the detector configuration and stores the callback.
        /// </summary>
        /// <param name="func">Callback invoked for every detected speech segment.</param>
        /// <param name="vadVersion">Model file name, one of the <see cref="SherpaVadVersion"/> constants.</param>
        /// <param name="numThreads">Thread count for the model, 1 by default.</param>
        /// <param name="useGPU">Selects the "cuda" provider instead of "cpu"; if this fails, check the CUDA installation.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="vadVersion"/> is not a known model file name.</exception>
        private void Init(Func func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
        {
            VADModelConfig.NumThreads = numThreads;
            VADModelConfig.Provider = useGPU ? "cuda" : "cpu";

            // Build the path from the requested version. It was previously hard-coded to
            // silero_vad_v5, so asking for v4 or ten-vad still loaded the v5 file.
            var path = Path.Combine(AppCommon.AIModelFile, "vad", vadVersion);
            switch (vadVersion)
            {
                case SherpaVadVersion.silero_vad_v4:
                case SherpaVadVersion.silero_vad_v5:
                    VADModelConfig.SileroVad.Model = path;
                    // Clear the other slot so an earlier call on this instance cannot
                    // leave both models configured at once.
                    VADModelConfig.TenVad.Model = string.Empty;
                    break;
                case SherpaVadVersion.ten_vad_324:
                    VADModelConfig.TenVad.Model = path;
                    VADModelConfig.SileroVad.Model = string.Empty;
                    break;
                default:
                    // Previously ignored silently, which left the detector without any model.
                    throw new ArgumentOutOfRangeException(nameof(vadVersion), vadVersion, "Unsupported VAD model version.");
            }

            Callback = func;
        }

        /// <summary>
        /// Runs voice activity detection over a wave file, recognises every detected segment
        /// through <paramref name="func"/> and returns the captions. When a task id is given,
        /// progress is reported and the captions are stored in the database and in Redis.
        /// </summary>
        /// <param name="reader">Wave data whose samples are analysed.</param>
        /// <param name="task">Task id; may be null or empty, in which case nothing is reported or stored.</param>
        /// <param name="func">Callback invoked for every detected speech segment.</param>
        /// <param name="vadVersion">Model file name, one of the <see cref="SherpaVadVersion"/> constants.</param>
        /// <param name="numThreads">Thread count for the model, 1 by default.</param>
        /// <param name="useGPU">Selects the "cuda" provider instead of "cpu"; if this fails, check the CUDA installation.</param>
        /// <returns>The captions produced for the detected segments, in detection order.</returns>
        public List TaskHandle(WaveReader reader, string? task, Func func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
        {
            Init(func, vadVersion, numThreads, useGPU);

            // Work on the raw samples through a span so slicing does not allocate.
            ReadOnlySpan allSamples = reader.Samples.AsSpan();
            int numSamples = allSamples.Length;
            // NOTE(review): the chunk size is always read from the Silero settings,
            // even when the ten-vad model is selected - confirm this is intended.
            int windowSize = VADModelConfig.SileroVad.WindowSize;
            int sampleRate = VADModelConfig.SampleRate;
            // Integer division: samples beyond the last full window are not fed to the detector.
            int numIter = numSamples / windowSize;
            var totalSecond = numSamples / (float)sampleRate;
            var res = new List(500);

            using var VAD = new VoiceActivityDetector(VADModelConfig, bufferSizeInSeconds: 30);

            // One buffer reused for every window instead of allocating inside the loop.
            float[] buffer = new float[windowSize];
            for (int i = 0; i != numIter; ++i)
            {
                allSamples.Slice(i * windowSize, windowSize).CopyTo(buffer);
                VAD.AcceptWaveform(buffer);

                // Only drain the queue while speech is being detected.
                if (VAD.IsSpeechDetected())
                {
                    DrainSegments(VAD, res, totalSecond, task);
                }
            }

            // Flush the detector, then collect whatever segments it still holds.
            VAD.Flush();
            DrainSegments(VAD, res, totalSecond, task);

            // Persist only when a task id was supplied.
            if (!string.IsNullOrEmpty(task))
            {
                // The three calls below are fire-and-forget: their tasks are discarded,
                // so this method returns without waiting for them.
                _ = redisManager.AddTaskLog(task, "==>字幕数量" + res.Count);

                var captionsStr = res.ToJson();
                // Parsed once here rather than inside the query expression.
                long taskId = long.Parse(task);
                // NOTE(review): generic type argument of GetRequiredService is missing in source.
                _ = serviceProvider.GetRequiredService>()
                    .AsUpdateable()
                    .SetColumns(it => it.Captions == captionsStr)
                    .Where(it => it.Id == taskId)
                    .ExecuteCommandAsync();

                _ = redisManager.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);

                // Continue accepting tasks once the captions are analysed (currently disabled):
                //redisManager.NewTask();
            }

            return res;
        }

        /// <summary>
        /// Processes every segment currently queued in the detector and, when a task id is
        /// present, publishes the progress reported for each recognised segment.
        /// </summary>
        private void DrainSegments(VoiceActivityDetector VAD, List res, float totalSecond, string? task)
        {
            while (!VAD.IsEmpty())
            {
                var p = ReadNext(VAD, res, totalSecond);

                // Progress is skipped without a task id, matching the guard used when
                // the captions are persisted.
                if (p != null && !string.IsNullOrEmpty(task))
                {
                    redisManager.SetTaskProgress(task, p + "%");
                }
            }
        }

        /// <summary>
        /// Takes the front segment of the detector, runs the recognition callback on it,
        /// appends a caption when usable text was recognised and removes the segment.
        /// </summary>
        /// <param name="VAD">Detector whose front segment is processed and popped.</param>
        /// <param name="res">List that receives the caption.</param>
        /// <param name="totalSecond">Total audio length in seconds, used for the progress value.</param>
        /// <returns>
        /// Progress in percent (segment end divided by total length, rounded to two decimals),
        /// or null when no caption was added.
        /// </returns>
        public double? ReadNext(VoiceActivityDetector VAD, List res, float totalSecond)
        {
            var segment = VAD.Front();
            var sampleRate = VADModelConfig.SampleRate;
            var sampleRateF = (float)sampleRate;
            float startTime = segment.Start / sampleRateF;
            float duration = segment.Samples.Length / sampleRateF;

            using var stream = Callback(sampleRate, segment.Samples);

            // Read the recognised text once instead of re-reading stream.Result repeatedly.
            var recognisedText = stream.Result.Text;

            double? resP = null;
            // Skip empty results and results that are only a full stop ("。").
            if (!string.IsNullOrEmpty(recognisedText) && recognisedText.Trim() != "。")
            {
                res.Add(new()
                {
                    Text = recognisedText,
                    Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
                    End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
                });
                resP = Math.Round((double)(startTime + duration) / (totalSecond) * 100, 2);
            }

            // The segment is consumed whether or not a caption was produced.
            VAD.Pop();
            return resP;
        }
    }
}