新增 封装vad,接入 FunASRNano
This commit is contained in:
parent
de1cdcf32c
commit
d52504a3a0
|
|
@ -53,6 +53,7 @@ namespace Learn.VideoAnalysis
|
||||||
builder.Services.AddAlibabaCloudVod();
|
builder.Services.AddAlibabaCloudVod();
|
||||||
builder.Services.AddAliyunOSS();
|
builder.Services.AddAliyunOSS();
|
||||||
builder.Services.AddSenseVoiceExpand();
|
builder.Services.AddSenseVoiceExpand();
|
||||||
|
builder.Services.AddSherpaVadExpand();
|
||||||
//builder.Services.AddSpeakerAI();
|
//builder.Services.AddSpeakerAI();
|
||||||
builder.Services.AddCoravel();
|
builder.Services.AddCoravel();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,132 @@
|
||||||
|
using Microsoft.Extensions.DependencyInjection;
|
||||||
|
using Microsoft.Extensions.Options;
|
||||||
|
using SherpaOnnx;
|
||||||
|
using SqlSugar.IOC;
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Diagnostics;
|
||||||
|
using System.IO;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Text;
|
||||||
|
using System.Text.Json;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using VideoAnalysisCore.Common;
|
||||||
|
using VideoAnalysisCore.Model;
|
||||||
|
using VideoAnalysisCore.Model.Enum;
|
||||||
|
|
||||||
|
namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
|
{
|
||||||
|
public static class FunASRNanoExpand
{
    /// <summary>
    /// Registers the <see cref="FunASRNano"/> speech-to-text service as a singleton.
    /// </summary>
    /// <param name="services">Service collection that receives the registration.</param>
    public static void AddFunASRNanoExpand(this IServiceCollection services)
    {
        // This extension previously registered SenseVoice (copy-paste from the
        // SenseVoice extension), which left FunASRNano unresolvable from DI.
        services.AddSingleton<FunASRNano>();
    }
}
|
||||||
|
/// <summary>
/// Speech recognizer backed by the Fun-ASR-Nano-2512 model, loaded through sherpa-onnx.
/// <para>Model version: Fun-ASR-Nano-2512</para>
/// <para>Source: https://github.com/modelscope/FunASR/blob/main/README_zh.md</para>
/// </summary>
public class FunASRNano
{
    // Recognizer shared by every instance; created by Init (lazily from SoundHandle if needed).
    static OfflineRecognizer OR = default!;
    private readonly IServiceProvider serviceProvider;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="redisManager">Not used by this class; kept so the constructor signature stays unchanged.</param>
    /// <param name="serviceProvider">Provider used to resolve <see cref="SherpaVad"/> for each run.</param>
    public FunASRNano(RedisManager redisManager, IServiceProvider serviceProvider)
    {
        this.serviceProvider = serviceProvider;
    }

    /// <summary>
    /// Builds the FunASRNano offline recognizer and stores it in the shared static field.
    /// </summary>
    /// <param name="numThreads">Number of inference threads (default 6).</param>
    /// <param name="useGPU">
    /// When true the "cuda" provider is selected, otherwise "cpu". If CUDA fails to load see
    /// <see href="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/>.
    /// </param>
    /// <param name="useHotwords">Currently unused by this method.</param>
    public void Init(int numThreads = 6, bool useGPU = false, bool useHotwords = false)
    {
        Console.WriteLine("初始化 FunASRNano");
        OfflineRecognizerConfig config = new OfflineRecognizerConfig();

        // Feature extraction: sample rate and feature dimension.
        config.FeatConfig.SampleRate = 16000;
        config.FeatConfig.FeatureDim = 80;

        var topFolder = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-funasr-nano-fp16-2025-12-30");

        // Model files, all located under the model folder.
        config.ModelConfig.FunAsrNano.EncoderAdaptor = Path.Combine(topFolder, "encoder_adaptor.int8.onnx");
        config.ModelConfig.FunAsrNano.LLM = Path.Combine(topFolder, "llm.fp16.onnx");
        config.ModelConfig.FunAsrNano.Embedding = Path.Combine(topFolder, "embedding.int8.onnx");
        config.ModelConfig.FunAsrNano.Tokenizer = Path.Combine(topFolder, "Qwen3-0.6B");

        // Prompts and sampling settings.
        config.ModelConfig.FunAsrNano.SystemPrompt = "You are a professional video audio transcription assistant.";
        config.ModelConfig.FunAsrNano.UserPrompt = "这是一趟中国的课堂视频音频,请你帮我分析出它讲述的内容!";
        config.ModelConfig.FunAsrNano.MaxNewTokens = 512;
        config.ModelConfig.FunAsrNano.Temperature = 1E-06f;
        config.ModelConfig.FunAsrNano.TopP = 0.8f;
        config.ModelConfig.FunAsrNano.Seed = 42;

        config.ModelConfig.ModelType = string.Empty;
        config.ModelConfig.NumThreads = numThreads;

        // The original test was inverted (`if (!useGPU) -> "cuda"`), which selected
        // CUDA exactly when the caller asked for CPU.
        config.ModelConfig.Provider = useGPU ? "cuda" : "cpu";
#if DEBUG
        config.ModelConfig.Debug = 1;
#endif
        OR = new OfflineRecognizer(config);
    }

    /// <summary>
    /// Transcribes a WAV stream and returns the recognized segments.
    /// </summary>
    /// <param name="s">Stream containing WAV audio.</param>
    /// <returns>The list of recognized segments produced by <see cref="SherpaVad.TaskHandle"/>.</returns>
    /// <exception cref="Exception">Thrown when <paramref name="s"/> is null.</exception>
    public List<SenseVoiceRes> RunTask(Stream s)
    {
        if (s is null)
        {
            throw new Exception("音频路径 is null");
        }

        var vad = serviceProvider.GetRequiredService<SherpaVad>();
        // No task id: the result is only returned to the caller.
        return vad.TaskHandle(new WaveReader(s), null, SoundHandle, SherpaVadVersion.silero_vad_v5);
    }

    /// <summary>
    /// Transcribes the "task.wav" file found in the task's local folder.
    /// </summary>
    /// <param name="task">Task id; also forwarded to <see cref="SherpaVad.TaskHandle"/>.</param>
    /// <returns>An already completed task; the work itself runs synchronously.</returns>
    /// <exception cref="Exception">Thrown when the audio file does not exist.</exception>
    public Task RunTask(string task)
    {
        var filePath = Path.Combine(task.LocalPath(), "task.wav");
        if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
        {
            throw new Exception("task 音频路径未找到");
        }

        // Forward the task id. Passing null here (as before) made TaskHandle skip its
        // task-specific branch, and since this overload discards the returned list the
        // whole transcription was thrown away.
        serviceProvider.GetRequiredService<SherpaVad>()
            .TaskHandle(new WaveReader(filePath), task, SoundHandle, SherpaVadVersion.silero_vad_v5);

        return Task.CompletedTask;
    }

    /// <summary>
    /// Decodes one chunk of audio with the shared recognizer.
    /// </summary>
    /// <param name="sampleRate">Sample rate of <paramref name="samples"/>.</param>
    /// <param name="samples">Audio samples to decode.</param>
    /// <returns>The decoded stream; the caller is responsible for disposing it.</returns>
    public OfflineStream SoundHandle(int sampleRate, float[] samples)
    {
        // Nothing else guarantees Init ran before the first decode, so create the
        // recognizer with default settings on first use.
        if (OR is null)
        {
            Init();
        }

        var stream = OR.CreateStream();
        try
        {
            stream.AcceptWaveform(sampleRate, samples);
            OR.Decode(stream);
            return stream;
        }
        catch
        {
            // Do not leak the stream when decoding fails.
            stream.Dispose();
            throw;
        }
    }
}
|
||||||
|
}
|
||||||
|
|
@ -14,7 +14,6 @@ using System.Threading.Tasks;
|
||||||
using VideoAnalysisCore.Common;
|
using VideoAnalysisCore.Common;
|
||||||
using VideoAnalysisCore.Model;
|
using VideoAnalysisCore.Model;
|
||||||
using VideoAnalysisCore.Model.Enum;
|
using VideoAnalysisCore.Model.Enum;
|
||||||
using static System.Runtime.InteropServices.JavaScript.JSType;
|
|
||||||
|
|
||||||
namespace VideoAnalysisCore.AICore.SherpaOnnx
|
namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
{
|
{
|
||||||
|
|
@ -32,22 +31,18 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
}
|
}
|
||||||
public class SenseVoice
|
public class SenseVoice
|
||||||
{
|
{
|
||||||
//const string TransducerStr = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20";
|
static OfflineRecognizer OR = default!;
|
||||||
static OfflineRecognizer OR = default!;
|
|
||||||
static OfflineRecognizer OR_old = default!;
|
|
||||||
static VadModelConfig VADModelConfig = default!;
|
|
||||||
public Repository<VideoTask> videoTaskDB { get; set; }
|
|
||||||
|
|
||||||
private readonly RedisManager redisManager;
|
private readonly IServiceProvider serviceProvider;
|
||||||
|
|
||||||
public SenseVoice(Repository<VideoTask> videoTaskDB, RedisManager redisManager)
|
|
||||||
|
public SenseVoice(RedisManager redisManager, IServiceProvider serviceProvider)
|
||||||
{
|
{
|
||||||
this.videoTaskDB = videoTaskDB;
|
this.serviceProvider = serviceProvider;
|
||||||
this.redisManager = redisManager;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 初始化 SenseVoice
|
/// 初始化 SenseVoice
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="numThreads">默认6线程</param>
|
/// <param name="numThreads">默认6线程</param>
|
||||||
/// <param name="useGPU">是否使用gpu 报错请看安装CUDA环境<see cref="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/></param>
|
/// <param name="useGPU">是否使用gpu 报错请看安装CUDA环境<see cref="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/></param>
|
||||||
|
|
@ -61,10 +56,9 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
config.FeatConfig.FeatureDim = 80;
|
config.FeatConfig.FeatureDim = 80;
|
||||||
// Path to tokens.txt
|
// Path to tokens.txt
|
||||||
var AIModelVersion_270717 = "sherpa-onnx-sense-voice-24-07-17";
|
var AIModelVersion_270717 = "sherpa-onnx-sense-voice-24-07-17";
|
||||||
var AIModelVersion_251217 = "sherpa-onnx-sense-voice-funasr-nano-2025-12-17";
|
config.ModelConfig.Tokens = Path.Combine(AppCommon.AIModelFile, AIModelVersion_270717, "tokens.txt");
|
||||||
config.ModelConfig.Tokens = Path.Combine(AppCommon.AIModelFile, AIModelVersion_251217, "tokens.txt");
|
|
||||||
//SenseVoice 模型
|
//SenseVoice 模型
|
||||||
config.ModelConfig.SenseVoice.Model = Path.Combine(AppCommon.AIModelFile, AIModelVersion_251217, "model.onnx");
|
config.ModelConfig.SenseVoice.Model = Path.Combine(AppCommon.AIModelFile, AIModelVersion_270717, "model.onnx");
|
||||||
//1 使用逆文本规范化处理感官语音 [控制标点符号生成]。
|
//1 使用逆文本规范化处理感官语音 [控制标点符号生成]。
|
||||||
config.ModelConfig.SenseVoice.UseInverseTextNormalization = 1;
|
config.ModelConfig.SenseVoice.UseInverseTextNormalization = 1;
|
||||||
//反转文本规范化规则 fst 的路径
|
//反转文本规范化规则 fst 的路径
|
||||||
|
|
@ -91,54 +85,11 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
//config.MaxActivePaths =4;
|
//config.MaxActivePaths =4;
|
||||||
#endregion
|
#endregion
|
||||||
|
|
||||||
#region 热词功能[无效]
|
|
||||||
//if (false)
|
|
||||||
//{
|
|
||||||
// //热词目录
|
|
||||||
// config.HotwordsFile = Path.Combine(AppCommon.AIModelFile, "Hotwords.txt");
|
|
||||||
// config.DecodingMethod = "modified_beam_search";
|
|
||||||
// //热词得分
|
|
||||||
// config.HotwordsScore = 1.5f;
|
|
||||||
|
|
||||||
// config.ModelConfig.ModelingUnit = "cjkchar+bpe";
|
|
||||||
// config.ModelConfig.BpeVocab = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "bpe.model");
|
|
||||||
// config.ModelConfig.Transducer = new OfflineTransducerModelConfig()
|
|
||||||
// {
|
|
||||||
// Decoder = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "decoder-epoch-99-avg-1.onnx"),
|
|
||||||
// Encoder = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "encoder-epoch-99-avg-1.onnx"),
|
|
||||||
// Joiner = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "joiner-epoch-99-avg-1.onnx"),
|
|
||||||
// };
|
|
||||||
//}
|
|
||||||
#endregion
|
|
||||||
|
|
||||||
|
|
||||||
#if DEBUG
|
#if DEBUG
|
||||||
config.ModelConfig.Debug = 1;
|
config.ModelConfig.Debug = 1;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
OR = new OfflineRecognizer(config);
|
OR = new OfflineRecognizer(config);
|
||||||
|
|
||||||
|
|
||||||
OfflineRecognizerConfig oldConfig = new OfflineRecognizerConfig();
|
|
||||||
//采样率
|
|
||||||
oldConfig.FeatConfig.SampleRate = 16000;
|
|
||||||
oldConfig.FeatConfig.FeatureDim = 80;
|
|
||||||
oldConfig.ModelConfig.Tokens = Path.Combine(AppCommon.AIModelFile, AIModelVersion_270717, "tokens.txt");
|
|
||||||
oldConfig.ModelConfig.SenseVoice.Model = Path.Combine(AppCommon.AIModelFile, AIModelVersion_270717, "model.onnx");
|
|
||||||
oldConfig.ModelConfig.SenseVoice.UseInverseTextNormalization = 1;
|
|
||||||
//反转文本规范化规则 fst 的路径
|
|
||||||
//config.RuleFsts = Path.Combine(AppCommon.AIModelFile, "itn_subject_sx.fst");
|
|
||||||
|
|
||||||
oldConfig.ModelConfig.SenseVoice.Language = "zh";
|
|
||||||
//模型类型
|
|
||||||
oldConfig.ModelConfig.ModelType = string.Empty;
|
|
||||||
oldConfig.ModelConfig.NumThreads = numThreads;
|
|
||||||
oldConfig.ModelConfig.Provider = "cpu";
|
|
||||||
OR_old = new OfflineRecognizer(oldConfig);
|
|
||||||
|
|
||||||
VADModelConfig = new VadModelConfig();
|
|
||||||
VADModelConfig.SileroVad.Model = Path.Combine(AppCommon.AIModelFile, AIModelVersion_270717, "silero_vad.onnx");
|
|
||||||
VADModelConfig.Debug = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
|
@ -146,137 +97,42 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="s"></param>
|
/// <param name="s"></param>
|
||||||
/// <returns></returns>
|
/// <returns></returns>
|
||||||
public async Task<List<SenseVoiceRes>> RunTask(Stream s)
|
public List<SenseVoiceRes> RunTask(Stream s)
|
||||||
{
|
{
|
||||||
if (s is null)
|
if (s is null) throw new Exception("音频路径 is null");
|
||||||
throw new Exception("音频路径 is null");
|
return serviceProvider.GetRequiredService<SherpaVad>()
|
||||||
return await TaskHandle(new WaveReader(s), null);
|
.TaskHandle(new WaveReader(s), null, SoundHandle, SherpaVadVersion.silero_vad_v5);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 获取语音字幕
|
/// 获取语音字幕
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="task"></param>
|
/// <param name="task"></param>
|
||||||
/// <returns></returns>
|
/// <returns></returns>
|
||||||
public async Task RunTask(string task)
|
public Task RunTask(string task)
|
||||||
{
|
{
|
||||||
var filePath = Path.Combine(task.LocalPath(), "task.wav");
|
var filePath = Path.Combine(task.LocalPath(), "task.wav");
|
||||||
if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
|
if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
|
||||||
throw new Exception("task 音频路径未找到");
|
throw new Exception("task 音频路径未找到");
|
||||||
await TaskHandle(new WaveReader(filePath), task);
|
serviceProvider.GetRequiredService<SherpaVad>()
|
||||||
}
|
.TaskHandle(new WaveReader(filePath), null, SoundHandle, SherpaVadVersion.silero_vad_v5);
|
||||||
|
|
||||||
/// <summary>
|
return Task.CompletedTask;
|
||||||
/// 任务处理
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="reader">Wave</param>
|
|
||||||
/// <param name="task">任务id [默认Null]</param>
|
|
||||||
/// <returns></returns>
|
|
||||||
/// <exception cref="Exception"></exception>
|
|
||||||
public async Task<List<SenseVoiceRes>> TaskHandle(WaveReader reader, string? task )
|
|
||||||
{
|
|
||||||
if (OR is null)
|
|
||||||
Init();
|
|
||||||
int numSamples = reader.Samples.Length;
|
|
||||||
int windowSize = VADModelConfig.SileroVad.WindowSize;
|
|
||||||
int sampleRate = VADModelConfig.SampleRate;
|
|
||||||
int numIter = numSamples / windowSize;
|
|
||||||
var totalSecond = numSamples / (float)sampleRate;
|
|
||||||
var res = new List<SenseVoiceRes>(500);
|
|
||||||
using var VAD = new VoiceActivityDetector(VADModelConfig, bufferSizeInSeconds: 20);
|
|
||||||
for (int i = 0; i != numIter; ++i)
|
|
||||||
{
|
|
||||||
int start = i * windowSize;
|
|
||||||
float[] samples = new float[windowSize];
|
|
||||||
Array.Copy(reader.Samples, start, samples, 0, windowSize);
|
|
||||||
VAD.AcceptWaveform(samples);
|
|
||||||
|
|
||||||
//Memory<float> samples = new float[windowSize];
|
|
||||||
//Memory<float> sourceSpan = reader.Samples.AsMemory(start, windowSize);
|
|
||||||
//sourceSpan.CopyTo(samples);
|
|
||||||
//VAD.AcceptWaveform(samples.ToArray());
|
|
||||||
|
|
||||||
//是否检测到语音
|
|
||||||
if (VAD.IsSpeechDetected())
|
|
||||||
{
|
|
||||||
//获取最新的发言片段
|
|
||||||
while (!VAD.IsEmpty())
|
|
||||||
{
|
|
||||||
var p = await ReadNext(VAD,res, totalSecond);
|
|
||||||
if (p != null) redisManager.SetTaskProgress(task, p + "%");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
VAD.Flush();
|
|
||||||
while (!VAD.IsEmpty())
|
|
||||||
{
|
|
||||||
var p = await ReadNext(VAD, res, totalSecond);
|
|
||||||
if(p!= null) redisManager.SetTaskProgress(task, p + "%");
|
|
||||||
}
|
|
||||||
//如果携带任务ID
|
|
||||||
if (!string.IsNullOrEmpty(task))
|
|
||||||
{
|
|
||||||
await redisManager.AddTaskLog(task, "==> SenseVoice 字幕数量" + res.Count);
|
|
||||||
var captionsStr = res.ToJson();
|
|
||||||
await videoTaskDB.AsUpdateable()
|
|
||||||
.SetColumns(it => it.Captions == captionsStr)
|
|
||||||
.Where(it => it.Id == long.Parse(task))
|
|
||||||
.ExecuteCommandAsync();
|
|
||||||
await redisManager.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);
|
|
||||||
//分析完成视频字幕后继续接收任务
|
|
||||||
//redisManager.NewTask();
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
}
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 处理vad 下一个切片
|
/// 获取语音字幕
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="VAD"></param>
|
/// <param name="sampleRate">采样率</param>
|
||||||
/// <param name="res">字幕处理后写入数组</param>
|
/// <param name="samples">采样值(样品)</param>
|
||||||
/// <param name="totalSecond">总时长</param>
|
/// <returns>结果流</returns>
|
||||||
/// <param name="progressCallback">任务回调</param>
|
public OfflineStream SoundHandle(int sampleRate, float[] samples)
|
||||||
/// <returns></returns>
|
|
||||||
public async Task<double?> ReadNext(VoiceActivityDetector VAD, List<SenseVoiceRes> res, float totalSecond)
|
|
||||||
{
|
{
|
||||||
var segment = VAD.Front();
|
var stream = OR.CreateStream();
|
||||||
var sampleRate = VADModelConfig.SampleRate;
|
stream.AcceptWaveform(sampleRate, samples);
|
||||||
var sampleRateF = (float)VADModelConfig.SampleRate;
|
|
||||||
float startTime = segment.Start / sampleRateF;
|
|
||||||
float duration = segment.Samples.Length / sampleRateF;
|
|
||||||
using var stream = OR.CreateStream();
|
|
||||||
stream.AcceptWaveform(sampleRate, segment.Samples);
|
|
||||||
OR.Decode(stream);
|
OR.Decode(stream);
|
||||||
|
return stream;
|
||||||
//old
|
|
||||||
using var stream1 = OR_old.CreateStream();
|
|
||||||
stream1.AcceptWaveform(sampleRate, segment.Samples);
|
|
||||||
OR.Decode(stream1);
|
|
||||||
if (stream.Result.Text != stream1.Result.Text)
|
|
||||||
{
|
|
||||||
Console.WriteLine("=>" + (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero));
|
|
||||||
Console.WriteLine("新=>" + stream.Result.Text);
|
|
||||||
Console.WriteLine("旧=>" + stream1.Result.Text);
|
|
||||||
}
|
|
||||||
Console.WriteLine();
|
|
||||||
double? resP =null;
|
|
||||||
if (!string.IsNullOrEmpty(stream.Result.Text))
|
|
||||||
{
|
|
||||||
var text = stream.Result.Text.Trim();
|
|
||||||
if (text.Length == 1 && text == "。")// 检查字符是否只有一个句号
|
|
||||||
{
|
|
||||||
VAD.Pop();
|
|
||||||
return resP;
|
|
||||||
}
|
|
||||||
res.Add(new()
|
|
||||||
{
|
|
||||||
Text = stream.Result.Text,
|
|
||||||
Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
|
|
||||||
End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
|
|
||||||
});
|
|
||||||
resP = Math.Round((double)(startTime + duration) / (totalSecond) * 100, 2);
|
|
||||||
}
|
|
||||||
VAD.Pop();
|
|
||||||
return resP;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,210 @@
|
||||||
|
using Microsoft.Extensions.DependencyInjection;
|
||||||
|
using Microsoft.Extensions.Options;
|
||||||
|
using SherpaOnnx;
|
||||||
|
using SqlSugar;
|
||||||
|
using SqlSugar.IOC;
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Diagnostics;
|
||||||
|
using System.IO;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Text;
|
||||||
|
using System.Text.Json;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using VideoAnalysisCore.Common;
|
||||||
|
using VideoAnalysisCore.Model;
|
||||||
|
using VideoAnalysisCore.Model.Enum;
|
||||||
|
using static System.Net.WebRequestMethods;
|
||||||
|
|
||||||
|
namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
|
{
|
||||||
|
public static class SherpaVadExpand
{
    /// <summary>
    /// Adds the <see cref="SherpaVad"/> voice-activity-detection slicing service
    /// with a transient lifetime.
    /// </summary>
    /// <param name="services">Service collection that receives the registration.</param>
    public static void AddSherpaVadExpand(this IServiceCollection services)
        => services.AddTransient<SherpaVad>();
}
|
||||||
|
|
||||||
|
/// <summary>
/// File names of the VAD models that the slicing service can be asked to use.
/// </summary>
public class SherpaVadVersion
{
    /// <summary>
    /// ten_vad model file (324 kb version).
    /// </summary>
    public const string ten_vad_324 = "ten-vad.onnx";

    /// <summary>
    /// Silero VAD v5 model file.
    /// </summary>
    public const string silero_vad_v5 = "silero_vad_v5.onnx";

    /// <summary>
    /// Silero VAD v4 model file.
    /// </summary>
    public const string silero_vad_v4 = "silero_vad_v4.onnx";
}
|
||||||
|
/// <summary>
/// Voice-activity-detection (VAD) slicing service. It feeds audio to a sherpa-onnx
/// <see cref="VoiceActivityDetector"/>, hands every detected speech segment to a
/// recognizer callback and collects the recognized text with start/end times.
/// </summary>
public class SherpaVad
{
    // Per-instance configuration. This used to be a static that every (transient)
    // instance overwrote in its constructor, so parallel runs clobbered each other.
    private VadModelConfig VADModelConfig;

    private readonly RedisManager redisManager;
    private readonly IServiceProvider serviceProvider;

    // Recognizer callback for the current run; assigned by Init.
    private Func<int, float[], OfflineStream> Callback = default!;

    /// <summary>
    /// Creates the service with a default VAD configuration (16 kHz, 1 thread, CPU).
    /// </summary>
    /// <param name="redisManager">Used to report progress and write task logs/results.</param>
    /// <param name="serviceProvider">Used to resolve the <c>VideoTask</c> repository when a task id is supplied.</param>
    public SherpaVad(RedisManager redisManager, IServiceProvider serviceProvider)
    {
        this.redisManager = redisManager;
        this.serviceProvider = serviceProvider;

        VADModelConfig = new VadModelConfig();
        VADModelConfig.SampleRate = 16000;
        VADModelConfig.NumThreads = 1;
        VADModelConfig.Provider = "cpu";
#if DEBUG
        VADModelConfig.Debug = 1;
#endif
        VADModelConfig.SileroVad = new SileroVadModelConfig();
        VADModelConfig.TenVad = new TenVadModelConfig();
    }

    /// <summary>
    /// Applies the per-run settings to the VAD configuration and stores the callback.
    /// </summary>
    /// <param name="func">Callback invoked with (sampleRate, samples) for every detected segment.</param>
    /// <param name="vadVersion">Model file name, one of the <see cref="SherpaVadVersion"/> constants.</param>
    /// <param name="numThreads">Number of threads (default 1).</param>
    /// <param name="useGPU">
    /// When true the "cuda" provider is selected, otherwise "cpu". If CUDA fails to load see
    /// <see href="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/>.
    /// </param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="vadVersion"/> is not a supported model.</exception>
    private void Init(Func<int, float[], OfflineStream> func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
    {
        VADModelConfig.NumThreads = numThreads;
        VADModelConfig.Provider = useGPU ? "cuda" : "cpu";

        // Clear both slots so that a previous run on this instance cannot leave a
        // second model configured next to the one selected now.
        VADModelConfig.SileroVad.Model = string.Empty;
        VADModelConfig.TenVad.Model = string.Empty;

        switch (vadVersion)
        {
            case SherpaVadVersion.silero_vad_v4:
            case SherpaVadVersion.silero_vad_v5:
                // The path is built from the requested version; it used to be
                // hard-coded to silero_vad_v5 for every version.
                VADModelConfig.SileroVad.Model = Path.Combine(AppCommon.AIModelFile, "vad", vadVersion);
                break;
            case SherpaVadVersion.ten_vad_324:
                VADModelConfig.TenVad.Model = Path.Combine(AppCommon.AIModelFile, "vad", vadVersion);
                break;
            default:
                // Previously fell through silently and left no model configured.
                throw new ArgumentException("Unsupported VAD model: " + vadVersion, nameof(vadVersion));
        }

        Callback = func;
    }

    /// <summary>
    /// Runs VAD over the whole recording and recognizes every detected speech segment.
    /// </summary>
    /// <param name="reader">Decoded wave data.</param>
    /// <param name="task">
    /// Task id, or null. When it is not empty, a log entry is written and the result is
    /// stored in the <c>VideoTask</c> table and in Redis.
    /// </param>
    /// <param name="func">Callback invoked with (sampleRate, samples) for every detected segment.</param>
    /// <param name="vadVersion">Model file name, one of the <see cref="SherpaVadVersion"/> constants.</param>
    /// <param name="numThreads">Number of threads (default 1).</param>
    /// <param name="useGPU">
    /// When true the "cuda" provider is selected, otherwise "cpu". If CUDA fails to load see
    /// <see href="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/>.
    /// </param>
    /// <returns>Recognized segments in detection order.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="reader"/> or <paramref name="func"/> is null.</exception>
    /// <exception cref="ArgumentException">Thrown when <paramref name="vadVersion"/> is not a supported model.</exception>
    public List<SenseVoiceRes> TaskHandle(WaveReader reader, string? task, Func<int, float[], OfflineStream> func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
    {
        if (reader is null)
        {
            throw new ArgumentNullException(nameof(reader));
        }
        if (func is null)
        {
            throw new ArgumentNullException(nameof(func));
        }

        Init(func, vadVersion, numThreads, useGPU);

        ReadOnlySpan<float> allSamples = reader.Samples.AsSpan();
        int numSamples = allSamples.Length;

        // Use the window size of the model family that was actually selected.
        int windowSize = vadVersion == SherpaVadVersion.ten_vad_324
            ? VADModelConfig.TenVad.WindowSize
            : VADModelConfig.SileroVad.WindowSize;
        int sampleRate = VADModelConfig.SampleRate;

        // Integer division: a trailing partial window is not fed to the detector.
        int numIter = numSamples / windowSize;
        var totalSecond = numSamples / (float)sampleRate;
        var res = new List<SenseVoiceRes>(500);

        using var VAD = new VoiceActivityDetector(VADModelConfig, bufferSizeInSeconds: 30);

        // Recognizes every queued segment and reports progress after each one.
        void DrainSegments()
        {
            while (!VAD.IsEmpty())
            {
                var p = ReadNext(VAD, res, totalSecond);
                if (p != null)
                {
                    redisManager.SetTaskProgress(task, p + "%");
                }
            }
        }

        // One buffer reused for every window instead of allocating inside the loop.
        float[] buffer = new float[windowSize];
        for (int i = 0; i != numIter; ++i)
        {
            allSamples.Slice(i * windowSize, windowSize).CopyTo(buffer);
            VAD.AcceptWaveform(buffer);

            if (VAD.IsSpeechDetected())
            {
                DrainSegments();
            }
        }

        // Flush and then process whatever the detector still has queued.
        VAD.Flush();
        DrainSegments();

        if (!string.IsNullOrEmpty(task))
        {
            // NOTE(review): the three calls below are fire-and-forget; their results
            // are discarded, so failures are not observed by this method.
            _ = redisManager.AddTaskLog(task, "==> SenseVoice 字幕数量" + res.Count);
            var captionsStr = res.ToJson();
            _ = serviceProvider.GetRequiredService<Repository<VideoTask>>()
                .AsUpdateable()
                .SetColumns(it => it.Captions == captionsStr)
                .Where(it => it.Id == long.Parse(task))
                .ExecuteCommandAsync();
            _ = redisManager.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);
        }

        return res;
    }

    /// <summary>
    /// Recognizes the segment at the front of the detector queue, appends it to
    /// <paramref name="res"/> when it contains text, and removes it from the queue.
    /// </summary>
    /// <param name="VAD">Detector whose front segment is processed and popped.</param>
    /// <param name="res">List that receives the recognized segment.</param>
    /// <param name="totalSecond">Total duration of the recording in seconds.</param>
    /// <returns>
    /// Progress in percent (two decimals) when a segment was added; null when the
    /// recognized text was empty or only a single "。".
    /// </returns>
    /// <exception cref="InvalidOperationException">Thrown when no recognizer callback has been set yet.</exception>
    public double? ReadNext(VoiceActivityDetector VAD, List<SenseVoiceRes> res, float totalSecond)
    {
        if (Callback is null)
        {
            throw new InvalidOperationException("No recognizer callback set; call TaskHandle first.");
        }

        var segment = VAD.Front();
        int sampleRate = VADModelConfig.SampleRate;
        float startTime = segment.Start / (float)sampleRate;
        float duration = segment.Samples.Length / (float)sampleRate;

        double? progress = null;
        using (var stream = Callback(sampleRate, segment.Samples))
        {
            var text = stream.Result.Text;

            // Skip empty results and results that are nothing but a full stop.
            if (!string.IsNullOrEmpty(text) && text.Trim() != "。")
            {
                res.Add(new()
                {
                    Text = text,
                    Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
                    End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
                });
                progress = Math.Round((double)(startTime + duration) / totalSecond * 100, 2);
            }
        }

        VAD.Pop();
        return progress;
    }
}
|
||||||
|
}
|
||||||
|
|
@ -129,7 +129,7 @@ namespace VideoAnalysisCore.Controllers
|
||||||
using HttpClient client = new HttpClient();
|
using HttpClient client = new HttpClient();
|
||||||
// 发送GET请求获取网络文件流
|
// 发送GET请求获取网络文件流
|
||||||
using var networkStream = await client.GetStreamAsync(url);
|
using var networkStream = await client.GetStreamAsync(url);
|
||||||
var res = await senseVoice.RunTask(networkStream);
|
var res = senseVoice.RunTask(networkStream);
|
||||||
return Ok(res);
|
return Ok(res);
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
|
|
@ -143,11 +143,11 @@ namespace VideoAnalysisCore.Controllers
|
||||||
/// <param name="file">文件流</param>
|
/// <param name="file">文件流</param>
|
||||||
/// <returns></returns>
|
/// <returns></returns>
|
||||||
[HttpPost(Name = "AudioRecognition")]
|
[HttpPost(Name = "AudioRecognition")]
|
||||||
public async Task<IActionResult> AudioRecognition(IFormFile file)
|
public IActionResult AudioRecognition(IFormFile file)
|
||||||
{
|
{
|
||||||
using var s = file.OpenReadStream();
|
using var s = file.OpenReadStream();
|
||||||
var res = await senseVoice.RunTask(s);
|
var res = senseVoice.RunTask(s);
|
||||||
return Ok(res);
|
return Ok(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,7 @@
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyModel" Version="7.0.0" />
|
<PackageReference Include="Microsoft.Extensions.DependencyModel" Version="7.0.0" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Http" Version="8.0.0" />
|
<PackageReference Include="Microsoft.Extensions.Http" Version="8.0.0" />
|
||||||
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
|
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
|
||||||
<PackageReference Include="org.k2fsa.sherpa.onnx" Version="1.12.20" />
|
<PackageReference Include="org.k2fsa.sherpa.onnx" Version="1.12.21" />
|
||||||
<PackageReference Include="SixLabors.ImageSharp" Version="3.1.7" />
|
<PackageReference Include="SixLabors.ImageSharp" Version="3.1.7" />
|
||||||
<PackageReference Include="SqlSugar.IOC" Version="2.0.0" />
|
<PackageReference Include="SqlSugar.IOC" Version="2.0.0" />
|
||||||
<PackageReference Include="SqlSugarCore" Version="5.1.4.205" />
|
<PackageReference Include="SqlSugarCore" Version="5.1.4.205" />
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue