using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Options;
using SherpaOnnx;
using SqlSugar.IOC;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using VideoAnalysisCore.Common;
using VideoAnalysisCore.Model;
using VideoAnalysisCore.Model.Enum;

namespace VideoAnalysisCore.AICore.SherpaOnnx
{
    /// <summary>
    /// Dependency-injection registration helpers for the Fun-ASR-Nano recognizer.
    /// </summary>
    public static class FunASRNanoExpand
    {
        /// <summary>
        /// Adds the Fun-ASR-Nano speech-to-text service to the container as a singleton.
        /// </summary>
        /// <param name="services">Service collection to register into.</param>
        public static void AddFunASRNanoExpand(this IServiceCollection services)
        {
            // NOTE(review): the generic type argument of AddSingleton is not visible in this
            // view of the file (presumably FunASRNano) - confirm against the original.
            services.AddSingleton();
        }
    }

    /// <summary>
    /// Fun-ASR-Nano-2512 speech recognition, integrated through the sherpa-onnx platform.
    /// Model version: Fun-ASR-Nano-2512.
    /// Source: https://github.com/modelscope/FunASR/blob/main/README_zh.md
    /// </summary>
    public class FunASRNano
    {
        /// <summary>
        /// Recognizer shared by every instance. Created by <see cref="Init"/>; the
        /// <c>RunTask</c> overloads call <see cref="Init"/> with its defaults when it is
        /// still null.
        /// </summary>
        public static OfflineRecognizer OR = default!;

        private readonly IServiceProvider serviceProvider;

        /// <summary>
        /// Creates the service.
        /// </summary>
        /// <param name="redisManager">Not used by this class; kept so the constructor signature stays unchanged.</param>
        /// <param name="serviceProvider">Provider used to resolve the service whose <c>TaskHandle</c> method processes the audio.</param>
        public FunASRNano(
            RedisManager redisManager,
            IServiceProvider serviceProvider)
        {
            this.serviceProvider = serviceProvider;
        }

        /// <summary>
        /// Builds the recognizer configuration and assigns a new recognizer to <see cref="OR"/>.
        /// </summary>
        /// <param name="numThreads">Value assigned to <c>ModelConfig.NumThreads</c>. Defaults to 6.</param>
        /// <param name="useGPU">
        /// When true the "cuda" provider is selected, otherwise "cpu".
        /// If this fails, check that the CUDA environment is installed.
        /// </param>
        /// <param name="useHotwords">Currently not used by this method.</param>
        public void Init(int numThreads = 6, bool useGPU = false, bool useHotwords = false)
        {
            Console.WriteLine("初始化 FunASRNano");

            OfflineRecognizerConfig config = new OfflineRecognizerConfig();

            // Sample rate.
            config.FeatConfig.SampleRate = 16000;
            // Feature dimension used when the model was trained.
            config.FeatConfig.FeatureDim = 80;

            var topFolder = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-funasr-nano-fp16-2025-12-30");

            // Model files, all resolved relative to the model folder.
            config.ModelConfig.FunAsrNano.EncoderAdaptor = Path.Combine(topFolder, "encoder_adaptor.int8.onnx");
            config.ModelConfig.FunAsrNano.LLM = Path.Combine(topFolder, "llm.fp16.onnx");
            config.ModelConfig.FunAsrNano.Embedding = Path.Combine(topFolder, "embedding.int8.onnx");
            // Tokenizer.
            config.ModelConfig.FunAsrNano.Tokenizer = Path.Combine(topFolder, "Qwen3-0.6B");

            // Prompts.
            config.ModelConfig.FunAsrNano.SystemPrompt = "You are a professional video audio transcription assistant.";
            config.ModelConfig.FunAsrNano.UserPrompt = "这是一堂中国的课堂视频音频,请你帮我分析出它讲述的内容!";

            // Generation settings.
            config.ModelConfig.FunAsrNano.MaxNewTokens = 512;
            config.ModelConfig.FunAsrNano.Temperature = 1E-06f;
            config.ModelConfig.FunAsrNano.TopP = 0.8f;
            config.ModelConfig.FunAsrNano.Seed = 42;

            // Model type.
            config.ModelConfig.ModelType = string.Empty;
            config.ModelConfig.NumThreads = numThreads;

            // Use CUDA only when the caller asked for the GPU. The condition used to be
            // inverted, which selected "cuda" whenever useGPU was false.
            config.ModelConfig.Provider = useGPU ? "cuda" : "cpu";

#if DEBUG
            config.ModelConfig.Debug = 1;
#endif

            OR = new OfflineRecognizer(config);
        }

        /// <summary>
        /// Produces subtitles for the WAV audio in the given stream.
        /// </summary>
        /// <param name="s">Stream containing the WAV audio.</param>
        /// <returns>The list returned by the resolved service's <c>TaskHandle</c> method.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
        public List RunTask(Stream s)
        {
            if (s is null)
            {
                throw new ArgumentNullException(nameof(s), "音频路径 is null");
            }

            if (OR is null)
            {
                Init();
            }

            // NOTE(review): the generic type argument of GetRequiredService is not visible
            // in this view of the file - confirm against the original.
            return serviceProvider.GetRequiredService()
                .TaskHandle(new WaveReader(s), null, SoundHandle, SherpaVadVersion.ten_vad_324);
        }

        /// <summary>
        /// Produces subtitles for the <c>task.wav</c> file of the given task.
        /// </summary>
        /// <param name="task">Task identifier; <c>task.LocalPath()</c> supplies the directory that holds <c>task.wav</c>.</param>
        /// <returns>An already completed task; the result of <c>TaskHandle</c> is not returned.</returns>
        /// <exception cref="FileNotFoundException"><c>task.wav</c> does not exist in the task directory.</exception>
        public Task RunTask(string task)
        {
            var filePath = Path.Combine(task.LocalPath(), "task.wav");

            // File.Exists already returns false for a null or empty path, so no separate
            // null/empty check is needed.
            if (!File.Exists(filePath))
            {
                throw new FileNotFoundException("task 音频路径未找到", filePath);
            }

            if (OR is null)
            {
                Init();
            }

            serviceProvider.GetRequiredService()
                .TaskHandle(new WaveReader(filePath), null, SoundHandle, SherpaVadVersion.ten_vad_324);

            return Task.CompletedTask;
        }

        /// <summary>
        /// Decodes one block of samples with the shared recognizer.
        /// </summary>
        /// <param name="sampleRate">Sample rate of <paramref name="samples"/>.</param>
        /// <param name="samples">Audio sample values.</param>
        /// <returns>The stream that was created, fed with the samples and decoded.</returns>
        public OfflineStream SoundHandle(int sampleRate, float[] samples)
        {
            var stream = OR.CreateStream();
            stream.AcceptWaveform(sampleRate, samples);
            OR.Decode(stream);
            return stream;
        }
    }
}