211 lines
8.4 KiB
C#
211 lines
8.4 KiB
C#
using Microsoft.Extensions.DependencyInjection;
|
|
using Microsoft.Extensions.Options;
|
|
using SherpaOnnx;
|
|
using SqlSugar;
|
|
using SqlSugar.IOC;
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Diagnostics;
|
|
using System.IO;
|
|
using System.Linq;
|
|
using System.Text;
|
|
using System.Text.Json;
|
|
using System.Text.RegularExpressions;
|
|
using System.Threading.Tasks;
|
|
using VideoAnalysisCore.Common;
|
|
using VideoAnalysisCore.Model;
|
|
using VideoAnalysisCore.Model.Enum;
|
|
using static System.Net.WebRequestMethods;
|
|
|
|
namespace VideoAnalysisCore.AICore.SherpaOnnx
|
|
{
|
|
public static class SherpaVadExpand
{
    /// <summary>
    /// Registers the VAD speech-slicing service (<see cref="SherpaVad"/>) in the container
    /// with a transient lifetime.
    /// </summary>
    /// <param name="services">The service collection the registration is added to.</param>
    public static void AddSherpaVadExpand(this IServiceCollection services)
        => _ = services.AddTransient<SherpaVad>();
}
|
|
|
|
/// <summary>
/// Version identifiers accepted by the speech-slicing (VAD) service.
/// Each value is the file name of the corresponding ONNX model.
/// </summary>
public class SherpaVadVersion
{
    /// <summary>
    /// Silero VAD, version 4.
    /// </summary>
    public const string silero_vad_v4 = "silero_vad_v4.onnx";

    /// <summary>
    /// Silero VAD, version 5.
    /// </summary>
    public const string silero_vad_v5 = "silero_vad_v5.onnx";

    /// <summary>
    /// ten_vad (the 324 KB build).
    /// </summary>
    public const string ten_vad_324 = "ten-vad.onnx";
}
|
|
/// <summary>
/// Speech-slicing service: runs a waveform through a sherpa-onnx
/// <see cref="VoiceActivityDetector"/> and passes every detected speech segment to a
/// caller-supplied recognition callback, collecting the recognized text as captions.
/// </summary>
public class SherpaVad
{
    // Per-instance VAD configuration. This used to be a static field that every new
    // instance re-created and every call mutated, so concurrently running instances
    // overwrote each other's thread count, provider and model path.
    private VadModelConfig vadConfig;

    private readonly RedisManager redisManager;
    private readonly IServiceProvider serviceProvider;

    // Recognition callback for the current TaskHandle call: (sampleRate, samples) => stream.
    private Func<int, float[], OfflineStream>? Callback;

    /// <summary>
    /// Creates the service with a default configuration: 16 kHz sample rate, one thread, CPU provider.
    /// </summary>
    /// <param name="redisManager">Used for task progress, task logs and the caption cache.</param>
    /// <param name="serviceProvider">Used to resolve the <c>Repository&lt;VideoTask&gt;</c> when captions are persisted.</param>
    public SherpaVad(RedisManager redisManager, IServiceProvider serviceProvider)
    {
        this.redisManager = redisManager;
        this.serviceProvider = serviceProvider;

        vadConfig = new VadModelConfig();
        vadConfig.SampleRate = 16000;
        vadConfig.NumThreads = 1;
        vadConfig.Provider = "cpu";
#if DEBUG
        vadConfig.Debug = 1;
#endif
        vadConfig.SileroVad = new SileroVadModelConfig();
        vadConfig.TenVad = new TenVadModelConfig();
    }

    /// <summary>
    /// Applies the per-call VAD settings (threads, provider, model file) and stores the
    /// recognition callback.
    /// </summary>
    /// <param name="func">Callback invoked for every detected speech segment.</param>
    /// <param name="vadVersion">Model version, one of the <see cref="SherpaVadVersion"/> constants.</param>
    /// <param name="numThreads">Number of threads; defaults to 1.</param>
    /// <param name="useGPU">
    /// Whether to use the "cuda" provider instead of "cpu". If this fails, check the CUDA setup:
    /// <see href="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/>
    /// </param>
    /// <exception cref="ArgumentException"><paramref name="vadVersion"/> is not a known version.</exception>
    private void Init(Func<int, float[], OfflineStream> func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
    {
        bool useTenVad;
        switch (vadVersion)
        {
            case SherpaVadVersion.silero_vad_v4:
            case SherpaVadVersion.silero_vad_v5:
                useTenVad = false;
                break;
            case SherpaVadVersion.ten_vad_324:
                useTenVad = true;
                break;
            default:
                // Previously an unknown version silently left no model configured.
                throw new ArgumentException($"Unsupported VAD model version: '{vadVersion}'.", nameof(vadVersion));
        }

        vadConfig.NumThreads = numThreads;
        vadConfig.Provider = useGPU ? "cuda" : "cpu";

        // The path is built from the requested version. The old code always pointed at
        // silero_vad_v5 regardless of vadVersion.
        var modelPath = Path.Combine(AppCommon.AIModelFile, "vad", vadVersion);

        // Set both families on every call so a path left over from an earlier call with a
        // different version cannot linger.
        // NOTE(review): presumably the native side selects the model family by which path is
        // non-empty; confirm against the sherpa-onnx VAD implementation.
        vadConfig.SileroVad.Model = useTenVad ? string.Empty : modelPath;
        vadConfig.TenVad.Model = useTenVad ? modelPath : string.Empty;

        Callback = func;
    }

    /// <summary>
    /// Processes a waveform: slices it into speech segments, recognizes each segment through
    /// <paramref name="func"/>, and, when a task id is given, reports progress and stores the
    /// captions.
    /// </summary>
    /// <param name="reader">Wave source; its <c>Samples</c> are fed to the detector.</param>
    /// <param name="task">Task id, may be null. Progress reporting and caption persistence are skipped when it is null or empty.</param>
    /// <param name="func">Callback invoked for every detected speech segment.</param>
    /// <param name="vadVersion">Model version, one of the <see cref="SherpaVadVersion"/> constants.</param>
    /// <param name="numThreads">Number of threads; defaults to 1.</param>
    /// <param name="useGPU">
    /// Whether to use the "cuda" provider instead of "cpu". If this fails, check the CUDA setup:
    /// <see href="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/>
    /// </param>
    /// <returns>The recognized captions in the order the segments were processed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> or <paramref name="func"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="vadVersion"/> is not a known version.</exception>
    public List<SenseVoiceRes> TaskHandle(WaveReader reader, string? task, Func<int, float[], OfflineStream> func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
    {
        if (reader is null) throw new ArgumentNullException(nameof(reader));
        if (func is null) throw new ArgumentNullException(nameof(func));

        Init(func, vadVersion, numThreads, useGPU);

        // Work on the raw samples through a span to avoid copying the whole waveform.
        ReadOnlySpan<float> allSamples = reader.Samples.AsSpan();
        int numSamples = allSamples.Length;

        // Use the window size of the model family that was actually selected.
        // The old code always read the Silero window size, even for ten-vad.
        int windowSize = vadVersion == SherpaVadVersion.ten_vad_324
            ? vadConfig.TenVad.WindowSize
            : vadConfig.SileroVad.WindowSize;

        int numIter = numSamples / windowSize;
        float totalSecond = numSamples / (float)vadConfig.SampleRate;
        var res = new List<SenseVoiceRes>(500);

        using var detector = new VoiceActivityDetector(vadConfig, bufferSizeInSeconds: 30);

        // One buffer reused for every window instead of allocating inside the loop.
        float[] buffer = new float[windowSize];

        for (int i = 0; i != numIter; ++i)
        {
            allSamples.Slice(i * windowSize, windowSize).CopyTo(buffer);
            detector.AcceptWaveform(buffer);

            // Only drain finished segments while speech is being detected.
            if (detector.IsSpeechDetected())
            {
                DrainSegments(detector, res, totalSecond, task);
            }
        }

        // Flush the detector and drain the segments that remain after the last window.
        detector.Flush();
        DrainSegments(detector, res, totalSecond, task);

        if (!string.IsNullOrEmpty(task))
        {
            SaveCaptions(task, res);
        }

        return res;
    }

    /// <summary>
    /// Processes the next VAD segment: recognizes the front segment through the callback,
    /// appends a caption when text was recognized, and pops the segment from the detector.
    /// </summary>
    /// <param name="VAD">Detector whose front segment is processed and then popped.</param>
    /// <param name="res">List the resulting caption is appended to.</param>
    /// <param name="totalSecond">Total duration of the audio in seconds, used for the progress value.</param>
    /// <returns>
    /// Progress in percent (segment end time relative to <paramref name="totalSecond"/>, rounded
    /// to two decimals), or null when the segment produced no caption.
    /// </returns>
    /// <exception cref="InvalidOperationException">No recognition callback has been configured yet.</exception>
    public double? ReadNext(VoiceActivityDetector VAD, List<SenseVoiceRes> res, float totalSecond)
    {
        var recognize = Callback
            ?? throw new InvalidOperationException("The recognition callback is not set; call TaskHandle first.");

        var segment = VAD.Front();
        var samples = segment.Samples;
        int sampleRate = vadConfig.SampleRate;

        float startTime = segment.Start / (float)sampleRate;
        float endTime = startTime + samples.Length / (float)sampleRate;

        double? progress = null;
        using (var stream = recognize(sampleRate, samples))
        {
            // Read the result once instead of going through stream.Result repeatedly.
            var recognizedText = stream.Result.Text;

            // Skip segments with no usable text: empty or blank, or a lone full stop "。".
            bool hasCaption = !string.IsNullOrWhiteSpace(recognizedText)
                              && recognizedText.Trim() != "。";
            if (hasCaption)
            {
                res.Add(new()
                {
                    Text = recognizedText,
                    Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
                    End = (float)Math.Round(endTime, 2, MidpointRounding.AwayFromZero),
                });
                progress = Math.Round((double)endTime / totalSecond * 100, 2);
            }
        }

        // The segment is consumed whether or not it produced a caption.
        VAD.Pop();
        return progress;
    }

    /// <summary>
    /// Drains every segment currently queued in the detector and reports progress for the
    /// task when a task id is present.
    /// </summary>
    private void DrainSegments(VoiceActivityDetector detector, List<SenseVoiceRes> res, float totalSecond, string? task)
    {
        while (!detector.IsEmpty())
        {
            var progress = ReadNext(detector, res, totalSecond);

            // The task id is optional, so progress is only reported when there is a task.
            if (progress != null && !string.IsNullOrEmpty(task))
            {
                redisManager.SetTaskProgress(task, progress + "%");
            }
        }
    }

    /// <summary>
    /// Logs the caption count and stores the captions for the task in the database and in Redis.
    /// </summary>
    private void SaveCaptions(string task, List<SenseVoiceRes> res)
    {
        // These calls are fire-and-forget: the returned tasks are discarded, so failures
        // are not observed here.
        _ = redisManager.AddTaskLog(task, "==> SenseVoice 字幕数量" + res.Count);

        var captionsStr = res.ToJson();
        _ = serviceProvider.GetRequiredService<Repository<VideoTask>>()
            .AsUpdateable()
            .SetColumns(it => it.Captions == captionsStr)
            .Where(it => it.Id == long.Parse(task))
            .ExecuteCommandAsync();

        _ = redisManager.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);

        // Resuming task intake after the captions are done (redisManager.NewTask()) is
        // currently disabled.
    }
}
|
|
}
|