diff --git a/VideoAnalysis/Program.cs b/VideoAnalysis/Program.cs
index bddd625..a383bad 100644
--- a/VideoAnalysis/Program.cs
+++ b/VideoAnalysis/Program.cs
@@ -53,6 +53,7 @@ namespace Learn.VideoAnalysis
             builder.Services.AddAlibabaCloudVod();
             builder.Services.AddAliyunOSS();
             builder.Services.AddSenseVoiceExpand();
+            builder.Services.AddSherpaVadExpand();
             //builder.Services.AddSpeakerAI();
 
             builder.Services.AddCoravel();
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/FunASRNano.cs b/VideoAnalysisCore/AICore/SherpaOnnx/FunASRNano.cs
new file mode 100644
index 0000000..89d41da
--- /dev/null
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/FunASRNano.cs
@@ -0,0 +1,132 @@
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Options;
+using SherpaOnnx;
+using SqlSugar.IOC;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.Json;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using VideoAnalysisCore.Common;
+using VideoAnalysisCore.Model;
+using VideoAnalysisCore.Model.Enum;
+
+namespace VideoAnalysisCore.AICore.SherpaOnnx
+{
+    public static class FunASRNanoExpand
+    {
+        /// <summary>
+        /// Adds the Fun-ASR-Nano speech-to-text singleton registration.
+        /// </summary>
+        /// <param name="services">Service collection to register into.</param>
+        public static void AddFunASRNanoExpand(this IServiceCollection services)
+        {
+            services.AddSingleton();
+        }
+    }
+
+    /// <summary>
+    /// Fun-ASR-Nano-2512 speech recognizer accessed through sherpa-onnx.
+    /// Source: https://github.com/modelscope/FunASR/blob/main/README_zh.md
+    /// </summary>
+    public class FunASRNano
+    {
+        // Shared by all instances; assigned by Init (SoundHandle calls Init on first use if needed).
+        static OfflineRecognizer OR = default!;
+        private readonly IServiceProvider serviceProvider;
+
+        public FunASRNano( RedisManager redisManager, IServiceProvider serviceProvider)
+        {
+            this.serviceProvider = serviceProvider;
+        }
+
+        /// <summary>
+        /// Creates the shared offline recognizer from the Fun-ASR-Nano model files.
+        /// </summary>
+        /// <param name="numThreads">Number of inference threads (default 6).</param>
+        /// <param name="useGPU">true selects the "cuda" provider, false selects "cpu"; if CUDA fails, check the CUDA installation.</param>
+        /// <param name="useHotwords">Not read by this method.</param>
+        public void Init(int numThreads = 6, bool useGPU = false, bool useHotwords = false)
+        {
+            Console.WriteLine("初始化 FunASRNano");
+            OfflineRecognizerConfig config = new OfflineRecognizerConfig();
+            // Sample rate
+            config.FeatConfig.SampleRate = 16000;
+            // Feature dimension used when training the model
+            config.FeatConfig.FeatureDim = 80;
+            var topFolder = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-funasr-nano-fp16-2025-12-30");
+
+            // Model files: encoder adaptor, LLM and embedding
+            config.ModelConfig.FunAsrNano.EncoderAdaptor = Path.Combine(topFolder, "encoder_adaptor.int8.onnx");
+            config.ModelConfig.FunAsrNano.LLM = Path.Combine(topFolder, "llm.fp16.onnx");
+            config.ModelConfig.FunAsrNano.Embedding = Path.Combine(topFolder, "embedding.int8.onnx");
+            // Tokenizer directory
+            config.ModelConfig.FunAsrNano.Tokenizer = Path.Combine(topFolder, "Qwen3-0.6B");
+            // Prompts and generation settings
+            config.ModelConfig.FunAsrNano.SystemPrompt = "You are a professional video audio transcription assistant.";
+            config.ModelConfig.FunAsrNano.UserPrompt = "这是一趟中国的课堂视频音频,请你帮我分析出它讲述的内容!";
+            config.ModelConfig.FunAsrNano.MaxNewTokens = 512;
+            config.ModelConfig.FunAsrNano.Temperature = 1E-06f;
+            config.ModelConfig.FunAsrNano.TopP = 0.8f;
+            config.ModelConfig.FunAsrNano.Seed = 42;
+
+            // Model type
+            config.ModelConfig.ModelType = string.Empty;
+            config.ModelConfig.NumThreads = numThreads;
+            // Use CUDA only when the caller asked for it (the condition used to be inverted).
+            config.ModelConfig.Provider = useGPU ? "cuda" : "cpu";
+#if DEBUG
+            config.ModelConfig.Debug = 1;
+#endif
+            OR = new OfflineRecognizer(config);
+        }
+
+        /// <summary>
+        /// Transcribes a WAV stream; no task id is attached, so nothing is persisted by the VAD service.
+        /// </summary>
+        /// <param name="s">WAV audio stream.</param>
+        /// <returns>The list returned by the VAD service's TaskHandle.</returns>
+        public List RunTask(Stream s)
+        {
+            if (s is null) throw new Exception("音频路径 is null");
+            return serviceProvider.GetRequiredService()
+                .TaskHandle(new WaveReader(s), null, SoundHandle, SherpaVadVersion.silero_vad_v5);
+        }
+
+        /// <summary>
+        /// Transcribes the file "task.wav" found under <c>task.LocalPath()</c>.
+        /// </summary>
+        /// <param name="task">Task id; forwarded to TaskHandle so progress and captions are stored for it.</param>
+        /// <returns>An already-completed task; the work itself runs synchronously.</returns>
+        public Task RunTask(string task)
+        {
+            var filePath = Path.Combine(task.LocalPath(), "task.wav");
+            if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
+                throw new Exception("task 音频路径未找到");
+            serviceProvider.GetRequiredService()
+                .TaskHandle(new WaveReader(filePath), task, SoundHandle, SherpaVadVersion.silero_vad_v5);
+
+            return Task.CompletedTask;
+        }
+
+        /// <summary>
+        /// Decodes one block of samples with the shared recognizer.
+        /// </summary>
+        /// <param name="sampleRate">Sample rate of <paramref name="samples"/>.</param>
+        /// <param name="samples">Audio samples to decode.</param>
+        /// <returns>The decoded stream; the caller is responsible for disposing it.</returns>
+        public OfflineStream SoundHandle(int sampleRate, float[] samples)
+        {
+            // Same lazy initialisation the previous SenseVoice.TaskHandle performed.
+            if (OR is null) Init();
+            var stream = OR.CreateStream();
+            stream.AcceptWaveform(sampleRate, samples);
+            OR.Decode(stream);
+            return stream;
+        }
+    }
+}
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
index 199534c..cbe1d0a 100644
--- a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
@@ -14,7 +14,6 @@ using System.Threading.Tasks;
 using VideoAnalysisCore.Common;
 using VideoAnalysisCore.Model;
 using VideoAnalysisCore.Model.Enum;
-using static System.Runtime.InteropServices.JavaScript.JSType;
 
 namespace VideoAnalysisCore.AICore.SherpaOnnx
 {
@@ -32,22 +31,18 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
     }
     public class SenseVoice
     {
-        //const string TransducerStr = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20";
-        static OfflineRecognizer OR = default!;
-        static OfflineRecognizer OR_old = default!;
-        static VadModelConfig VADModelConfig = default!;
-        public Repository videoTaskDB { get; set; }
+        static OfflineRecognizer OR = default!;
 
-        private readonly RedisManager redisManager;
+        private readonly IServiceProvider serviceProvider;
 
-        public SenseVoice(Repository videoTaskDB, RedisManager redisManager)
+
+        public SenseVoice(RedisManager redisManager, IServiceProvider serviceProvider)
         {
-            this.videoTaskDB = videoTaskDB;
-            this.redisManager = redisManager;
+            this.serviceProvider = serviceProvider;
         }
 
         /// 
-        /// 初始化 SenseVoice 
+        /// Initializes the SenseVoice recognizer.
         /// 
         /// 默认6线程
         /// 是否使用gpu 报错请看安装CUDA环境
@@ -61,10 +56,9 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
             config.FeatConfig.FeatureDim = 80;
             // Path to tokens.txt
             var AIModelVersion_270717 = "sherpa-onnx-sense-voice-24-07-17";
-            var AIModelVersion_251217 = "sherpa-onnx-sense-voice-funasr-nano-2025-12-17";
-            config.ModelConfig.Tokens = Path.Combine(AppCommon.AIModelFile, AIModelVersion_251217, "tokens.txt");
+            config.ModelConfig.Tokens = Path.Combine(AppCommon.AIModelFile, AIModelVersion_270717, "tokens.txt");
             //SenseVoice 模型
-            config.ModelConfig.SenseVoice.Model = Path.Combine(AppCommon.AIModelFile, AIModelVersion_251217, "model.onnx");
+            config.ModelConfig.SenseVoice.Model = Path.Combine(AppCommon.AIModelFile, AIModelVersion_270717, "model.onnx");
             //1 使用逆文本规范化处理感官语音 [控制标点符号生成]。
             config.ModelConfig.SenseVoice.UseInverseTextNormalization = 1;
             //反转文本规范化规则 fst 的路径
@@ -91,54 +85,11 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
             //config.MaxActivePaths =4;
             #endregion
 
-            #region 热词功能[无效]
-            //if (false)
-            //{
-            //    //热词目录
-            //    config.HotwordsFile = Path.Combine(AppCommon.AIModelFile, "Hotwords.txt");
-            //    config.DecodingMethod = "modified_beam_search";
-            //    //热词得分
-            //    config.HotwordsScore = 1.5f;
-
-            //    config.ModelConfig.ModelingUnit = "cjkchar+bpe";
-            //    config.ModelConfig.BpeVocab = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "bpe.model");
-            //    config.ModelConfig.Transducer = new OfflineTransducerModelConfig()
-            //    {
-            //        Decoder = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "decoder-epoch-99-avg-1.onnx"),
-            //        Encoder = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "encoder-epoch-99-avg-1.onnx"),
-            //        Joiner = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "joiner-epoch-99-avg-1.onnx"),
-            //    };
-            //}
-            #endregion
-
 #if DEBUG
             config.ModelConfig.Debug = 1;
 #endif
-
             OR = new OfflineRecognizer(config);
-
-
-            OfflineRecognizerConfig oldConfig = new OfflineRecognizerConfig();
-            //采样率
-            oldConfig.FeatConfig.SampleRate = 16000;
-            oldConfig.FeatConfig.FeatureDim = 80;
-            oldConfig.ModelConfig.Tokens = Path.Combine(AppCommon.AIModelFile, AIModelVersion_270717, "tokens.txt");
-            oldConfig.ModelConfig.SenseVoice.Model = Path.Combine(AppCommon.AIModelFile, AIModelVersion_270717, "model.onnx");
-            oldConfig.ModelConfig.SenseVoice.UseInverseTextNormalization = 1;
-            //反转文本规范化规则 fst 的路径
-            //config.RuleFsts = Path.Combine(AppCommon.AIModelFile, "itn_subject_sx.fst");
-
-            oldConfig.ModelConfig.SenseVoice.Language = "zh";
-            //模型类型
-            oldConfig.ModelConfig.ModelType = string.Empty;
-            oldConfig.ModelConfig.NumThreads = numThreads;
-            oldConfig.ModelConfig.Provider = "cpu";
-            OR_old = new OfflineRecognizer(oldConfig);
-
-            VADModelConfig = new VadModelConfig();
-            VADModelConfig.SileroVad.Model = Path.Combine(AppCommon.AIModelFile, AIModelVersion_270717, "silero_vad.onnx");
-            VADModelConfig.Debug = 0;
         }
 
         /// 
@@ -146,137 +97,42 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
         /// 
         /// 
         /// 
-        public async Task> RunTask(Stream s)
+        public List RunTask(Stream s)
         {
-            if (s is null)
-                throw new Exception("音频路径 is null");
-            return await TaskHandle(new WaveReader(s), null);
+            if (s is null) throw new Exception("音频路径 is null");
+            return serviceProvider.GetRequiredService()
+                .TaskHandle(new WaveReader(s), null, SoundHandle, SherpaVadVersion.silero_vad_v5);
         }
-
         /// 
         /// 获取语音字幕
         /// 
         /// 
         /// 
-        public async Task RunTask(string task)
+        public Task RunTask(string task)
         {
             var filePath = Path.Combine(task.LocalPath(), "task.wav");
             if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
                 throw new Exception("task 音频路径未找到");
-            await TaskHandle(new WaveReader(filePath), task);
-        }
+            serviceProvider.GetRequiredService()
+                .TaskHandle(new WaveReader(filePath), task, SoundHandle, SherpaVadVersion.silero_vad_v5);
 
-        /// 
-        /// 任务处理
-        /// 
-        /// Wave
-        /// 任务id [默认Null]
-        /// 
-        /// 
-        public async Task> TaskHandle(WaveReader reader, string? task )
-        {
-            if (OR is null)
-                Init();
-            int numSamples = reader.Samples.Length;
-            int windowSize = VADModelConfig.SileroVad.WindowSize;
-            int sampleRate = VADModelConfig.SampleRate;
-            int numIter = numSamples / windowSize;
-            var totalSecond = numSamples / (float)sampleRate;
-            var res = new List(500);
-            using var VAD = new VoiceActivityDetector(VADModelConfig, bufferSizeInSeconds: 20);
-            for (int i = 0; i != numIter; ++i)
-            {
-                int start = i * windowSize;
-                float[] samples = new float[windowSize];
-                Array.Copy(reader.Samples, start, samples, 0, windowSize);
-                VAD.AcceptWaveform(samples);
-
-                //Memory samples = new float[windowSize];
-                //Memory sourceSpan = reader.Samples.AsMemory(start, windowSize);
-                //sourceSpan.CopyTo(samples);
-                //VAD.AcceptWaveform(samples.ToArray());
-
-                //是否检测到语音
-                if (VAD.IsSpeechDetected())
-                {
-                    //获取最新的发言片段
-                    while (!VAD.IsEmpty())
-                    {
-                        var p = await ReadNext(VAD,res, totalSecond);
-                        if (p != null) redisManager.SetTaskProgress(task, p + "%");
-                    }
-                }
-            }
-            VAD.Flush();
-            while (!VAD.IsEmpty())
-            {
-                var p = await ReadNext(VAD, res, totalSecond);
-                if(p!= null) redisManager.SetTaskProgress(task, p + "%");
-            }
-            //如果携带任务ID
-            if (!string.IsNullOrEmpty(task))
-            {
-                await redisManager.AddTaskLog(task, "==> SenseVoice 字幕数量" + res.Count);
-                var captionsStr = res.ToJson();
-                await videoTaskDB.AsUpdateable()
-                    .SetColumns(it => it.Captions == captionsStr)
-                    .Where(it => it.Id == long.Parse(task))
-                    .ExecuteCommandAsync();
-                await redisManager.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);
-                //分析完成视频字幕后继续接收任务
-                //redisManager.NewTask();
-            }
-            return res;
+            return Task.CompletedTask;
         }
 
         /// 
-        /// 处理vad 下一个切片
+        /// Decodes one block of samples with the shared recognizer (initialising it on first use).
         /// 
-        /// 
-        /// 字幕处理后写入数组
-        /// 总时长
-        /// 任务回调
-        /// 
-        public async Task ReadNext(VoiceActivityDetector VAD, List res, float totalSecond)
+        /// <param name="sampleRate">Sample rate of the samples.</param>
+        /// <param name="samples">Audio samples to decode.</param>
+        /// <returns>The decoded stream; the caller is responsible for disposing it.</returns>
+        public OfflineStream SoundHandle(int sampleRate, float[] samples)
         {
-            var segment = VAD.Front();
-            var sampleRate = VADModelConfig.SampleRate;
-            var sampleRateF = (float)VADModelConfig.SampleRate;
-            float startTime = segment.Start / sampleRateF;
-            float duration = segment.Samples.Length / sampleRateF;
-            using var stream = OR.CreateStream();
-            stream.AcceptWaveform(sampleRate, segment.Samples);
+            if (OR is null) Init();
+            var stream = OR.CreateStream();
+            stream.AcceptWaveform(sampleRate, samples);
             OR.Decode(stream);
-
-            //old
-            using var stream1 = OR_old.CreateStream();
-            stream1.AcceptWaveform(sampleRate, segment.Samples);
-            OR.Decode(stream1);
-            if (stream.Result.Text != stream1.Result.Text)
-            {
-                Console.WriteLine("=>" + (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero));
-                Console.WriteLine("新=>" + stream.Result.Text);
-                Console.WriteLine("旧=>" + stream1.Result.Text);
-            }
-            Console.WriteLine();
-            double? resP =null;
-            if (!string.IsNullOrEmpty(stream.Result.Text))
-            {
-                var text = stream.Result.Text.Trim();
-                if (text.Length == 1 && text == "。")// 检查字符是否只有一个句号
-                {
-                    VAD.Pop();
-                    return resP;
-                }
-                res.Add(new()
-                {
-                    Text = stream.Result.Text,
-                    Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
-                    End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
-                });
-                resP = Math.Round((double)(startTime + duration) / (totalSecond) * 100, 2);
-            }
-            VAD.Pop();
-            return resP;
+            return stream;
         }
+
+
     }
 }
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/SherpaVad.cs b/VideoAnalysisCore/AICore/SherpaOnnx/SherpaVad.cs
new file mode 100644
index 0000000..7175942
--- /dev/null
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/SherpaVad.cs
@@ -0,0 +1,224 @@
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Options;
+using SherpaOnnx;
+using SqlSugar;
+using SqlSugar.IOC;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.Json;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using VideoAnalysisCore.Common;
+using VideoAnalysisCore.Model;
+using VideoAnalysisCore.Model.Enum;
+using static System.Net.WebRequestMethods;
+
+namespace VideoAnalysisCore.AICore.SherpaOnnx
+{
+    public static class SherpaVadExpand
+    {
+        /// <summary>
+        /// Adds the transient VAD (voice activity detection) slicing service registration.
+        /// </summary>
+        /// <param name="services">Service collection to register into.</param>
+        public static void AddSherpaVadExpand(this IServiceCollection services)
+        {
+            services.AddTransient();
+        }
+    }
+
+    /// <summary>
+    /// Model file names of the supported VAD versions.
+    /// </summary>
+    public class SherpaVadVersion
+    {
+        public const string silero_vad_v4 = "silero_vad_v4.onnx";
+        public const string silero_vad_v5 = "silero_vad_v5.onnx";
+        /// <summary>
+        /// ten_vad (324 KB build)
+        /// </summary>
+        public const string ten_vad_324 = "ten-vad.onnx";
+    }
+
+    /// <summary>
+    /// Cuts audio into speech segments with a VAD model and passes each segment to a recognizer callback.
+    /// </summary>
+    public class SherpaVad
+    {
+        // Instance state: the service is registered as transient, so a static field
+        // would be overwritten by every new instance while another one is still running.
+        private VadModelConfig VADModelConfig;
+        // Window size of the model selected by Init.
+        private int vadWindowSize;
+
+        private readonly RedisManager redisManager;
+        private readonly IServiceProvider serviceProvider;
+        private Func Callback;
+
+        public SherpaVad(RedisManager redisManager, IServiceProvider serviceProvider)
+        {
+            this.redisManager = redisManager;
+            this.serviceProvider = serviceProvider;
+            VADModelConfig = new VadModelConfig();
+
+            VADModelConfig.SampleRate = 16000;
+            VADModelConfig.NumThreads = 1;
+            VADModelConfig.Provider = "cpu";
+#if DEBUG
+            VADModelConfig.Debug = 1;
+#endif
+            VADModelConfig.SileroVad = new SileroVadModelConfig();
+            VADModelConfig.TenVad = new TenVadModelConfig();
+        }
+
+        /// <summary>
+        /// Selects the VAD model and stores the recognizer callback.
+        /// </summary>
+        /// <param name="func">Callback invoked with (sampleRate, samples) for every detected segment.</param>
+        /// <param name="vadVersion">One of the <see cref="SherpaVadVersion"/> constants.</param>
+        /// <param name="numThreads">Number of VAD threads (default 1).</param>
+        /// <param name="useGPU">true selects the "cuda" provider, false selects "cpu".</param>
+        /// <exception cref="ArgumentOutOfRangeException">The version is not a known <see cref="SherpaVadVersion"/> constant.</exception>
+        private void Init(Func func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
+        {
+            VADModelConfig.NumThreads = numThreads;
+            VADModelConfig.Provider = useGPU ? "cuda" : "cpu";
+
+            bool useTenVad;
+            switch (vadVersion)
+            {
+                case SherpaVadVersion.silero_vad_v4:
+                case SherpaVadVersion.silero_vad_v5:
+                    useTenVad = false;
+                    break;
+                case SherpaVadVersion.ten_vad_324:
+                    useTenVad = true;
+                    break;
+                default:
+                    throw new ArgumentOutOfRangeException(nameof(vadVersion), vadVersion, "Unsupported VAD model version.");
+            }
+
+            // Build the path from the requested version (it used to be hard-coded to silero_vad_v5).
+            var path = Path.Combine(AppCommon.AIModelFile, "vad", vadVersion);
+            // Clear the model that is not selected so an earlier call cannot leave both set.
+            VADModelConfig.SileroVad.Model = useTenVad ? string.Empty : path;
+            VADModelConfig.TenVad.Model = useTenVad ? path : string.Empty;
+            vadWindowSize = useTenVad
+                ? VADModelConfig.TenVad.WindowSize
+                : VADModelConfig.SileroVad.WindowSize;
+            Callback = func;
+        }
+
+        /// <summary>
+        /// Runs VAD over the whole wave, decodes every detected segment through <paramref name="func"/>
+        /// and, when a task id is given, stores progress and the resulting captions for that task.
+        /// </summary>
+        /// <param name="reader">Wave data to process.</param>
+        /// <param name="task">Task id, or null/empty to skip progress reporting and persistence.</param>
+        /// <param name="func">Callback invoked with (sampleRate, samples) for every detected segment.</param>
+        /// <param name="vadVersion">One of the <see cref="SherpaVadVersion"/> constants.</param>
+        /// <param name="numThreads">Number of VAD threads (default 1).</param>
+        /// <param name="useGPU">true selects the "cuda" provider; if CUDA fails, check the CUDA installation.</param>
+        /// <returns>The captions collected from all segments.</returns>
+        public List TaskHandle(WaveReader reader, string? task,Func func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false )
+        {
+            Init(func, vadVersion, numThreads, useGPU);
+            ReadOnlySpan allSamples = reader.Samples.AsSpan();
+            int numSamples = allSamples.Length;
+            int windowSize = vadWindowSize;
+            int sampleRate = VADModelConfig.SampleRate;
+            int numIter = numSamples / windowSize;
+            var totalSecond = numSamples / (float)sampleRate;
+            var res = new List(500);
+            // Progress and persistence only make sense when a task id was supplied.
+            bool hasTask = !string.IsNullOrEmpty(task);
+
+            using var VAD = new VoiceActivityDetector(VADModelConfig, bufferSizeInSeconds: 30);
+
+            // One reusable window buffer instead of an allocation per iteration.
+            float[] buffer = new float[windowSize];
+
+            for (int i = 0; i != numIter; ++i)
+            {
+                int start = i * windowSize;
+                allSamples.Slice(start, windowSize).CopyTo(buffer);
+                VAD.AcceptWaveform(buffer);
+
+                // Drain finished segments as soon as speech has been detected.
+                if (VAD.IsSpeechDetected())
+                {
+                    while (!VAD.IsEmpty())
+                    {
+                        var p = ReadNext(VAD, res, totalSecond);
+                        if (p != null && hasTask) redisManager.SetTaskProgress(task, p + "%");
+                    }
+                }
+            }
+
+            // Flush whatever the detector still buffers, then drain it.
+            VAD.Flush();
+            while (!VAD.IsEmpty())
+            {
+                var p = ReadNext(VAD, res, totalSecond);
+                if (p != null && hasTask) redisManager.SetTaskProgress(task, p + "%");
+            }
+
+            if (hasTask)
+            {
+                // Wait for the writes: callers signal completion as soon as this method returns,
+                // and discarding the tasks would also hide any storage error.
+                redisManager.AddTaskLog(task, "==> SenseVoice 字幕数量" + res.Count).GetAwaiter().GetResult();
+                var captionsStr = res.ToJson();
+                serviceProvider.GetRequiredService>()
+                    .AsUpdateable()
+                    .SetColumns(it => it.Captions == captionsStr)
+                    .Where(it => it.Id == long.Parse(task))
+                    .ExecuteCommandAsync()
+                    .GetAwaiter().GetResult();
+                redisManager.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res).GetAwaiter().GetResult();
+                // Once the captions are analysed, continue accepting tasks
+                //redisManager.NewTask();
+            }
+            return res;
+        }
+
+        /// <summary>
+        /// Decodes the front VAD segment, appends a caption for it and pops the segment.
+        /// </summary>
+        /// <param name="VAD">Detector whose front segment is processed.</param>
+        /// <param name="res">List the caption is appended to.</param>
+        /// <param name="totalSecond">Total audio length in seconds, used for the percentage.</param>
+        /// <returns>Progress in percent after this segment, or null when no caption was added.</returns>
+        public double? ReadNext(VoiceActivityDetector VAD, List res, float totalSecond)
+        {
+            var segment = VAD.Front();
+            var sampleRate = VADModelConfig.SampleRate;
+            var sampleRateF = (float)sampleRate;
+            float startTime = segment.Start / sampleRateF;
+            float duration = segment.Samples.Length / sampleRateF;
+            double? resP = null;
+            using (var stream = Callback(sampleRate, segment.Samples))
+            {
+                // Read the result once instead of re-evaluating stream.Result for every use.
+                var text = stream.Result.Text;
+                // Skip empty results and results that consist only of a single full stop.
+                if (!string.IsNullOrEmpty(text) && text.Trim() != "。")
+                {
+                    res.Add(new()
+                    {
+                        Text = text,
+                        Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
+                        End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
+                    });
+                    resP = Math.Round((double)(startTime + duration) / (totalSecond) * 100, 2);
+                }
+            }
+            VAD.Pop();
+            return resP;
+        }
+    }
+}
diff --git a/VideoAnalysisCore/Controllers/VideoTaskController.cs b/VideoAnalysisCore/Controllers/VideoTaskController.cs
index 473e3dc..614c208 100644
--- a/VideoAnalysisCore/Controllers/VideoTaskController.cs
+++ b/VideoAnalysisCore/Controllers/VideoTaskController.cs
@@ -129,7 +129,7 @@ namespace VideoAnalysisCore.Controllers
                 using HttpClient client = new HttpClient();
                 // 发送GET请求获取网络文件流
                 using var networkStream = await client.GetStreamAsync(url);
-                var res = await senseVoice.RunTask(networkStream);
+                var res = senseVoice.RunTask(networkStream);
                 return Ok(res);
             }
             catch (Exception ex)
@@ -143,11 +143,11 @@ namespace VideoAnalysisCore.Controllers
         /// 文件流
         /// 
         [HttpPost(Name = "AudioRecognition")]
-        public async Task AudioRecognition(IFormFile file)
+        public IActionResult AudioRecognition(IFormFile file)
         {
             using var s = file.OpenReadStream();
-            var res = await senseVoice.RunTask(s);
-            return Ok(res);
+            var res = senseVoice.RunTask(s);
+            return Ok(res);
         }
 
 
diff --git a/VideoAnalysisCore/VideoAnalysisCore.csproj b/VideoAnalysisCore/VideoAnalysisCore.csproj
index d7b668b..ccfab38 100644
--- a/VideoAnalysisCore/VideoAnalysisCore.csproj
+++ b/VideoAnalysisCore/VideoAnalysisCore.csproj
@@ -71,7 +71,7 @@
 
 
 
-
+
 
 
 