238 lines
10 KiB
C#
238 lines
10 KiB
C#
using Microsoft.Extensions.DependencyInjection;
|
||
using Microsoft.Extensions.Options;
|
||
using SherpaOnnx;
|
||
using SqlSugar;
|
||
using SqlSugar.IOC;
|
||
using System;
|
||
using System.Collections.Generic;
|
||
using System.Diagnostics;
|
||
using System.IO;
|
||
using System.Linq;
|
||
using System.Text;
|
||
using System.Text.Json;
|
||
using System.Text.RegularExpressions;
|
||
using System.Threading.Tasks;
|
||
using VideoAnalysisCore.Common;
|
||
using VideoAnalysisCore.Model;
|
||
using VideoAnalysisCore.Model.Enum;
|
||
using static System.Net.WebRequestMethods;
|
||
|
||
namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||
{
|
||
/// <summary>
/// Dependency-injection registration helpers for the VAD (voice activity detection) slicing service.
/// </summary>
public static class SherpaVadExpand
{
    /// <summary>
    /// Registers <see cref="SherpaVad"/> in the container with a transient lifetime.
    /// </summary>
    /// <param name="services">Service collection that receives the registration.</param>
    public static void AddSherpaVadExpand(this IServiceCollection services)
        => services.AddTransient<SherpaVad>();
}
|
||
|
||
/// <summary>
/// Model versions supported by the voice slicing service.
/// Each constant holds the file name of the corresponding ONNX model.
/// </summary>
public class SherpaVadVersion
{
    /// <summary>
    /// Silero VAD, version 4.
    /// </summary>
    public const string silero_vad_v4 = "silero_vad_v4.onnx";

    /// <summary>
    /// Silero VAD, version 5.
    /// </summary>
    public const string silero_vad_v5 = "silero_vad_v5.onnx";

    /// <summary>
    /// TEN VAD (the 324 KB build).
    /// </summary>
    public const string ten_vad_324 = "ten-vad.onnx";
}
|
||
/// <summary>
/// Voice slicing service. Feeds the samples of a wave file through a voice activity detector and
/// passes every detected speech segment to a recognition callback, collecting the recognised text
/// together with its start/end time.
/// </summary>
public class SherpaVad
{
    // Init and TaskHandle assign members of this config on every run, so it cannot be readonly.
    private VadModelConfig _vadModelConfig;

    private readonly VideoSliceWorkflowManager _workflowManager;
    private readonly IServiceProvider _serviceProvider;

    // Number of samples handed to the detector per AcceptWaveform call; overwritten by Init.
    private int _windowSize = 512;

    // Recognition callback (sample rate, segment samples) -> decoded stream; assigned by Init.
    private Func<int, float[], OfflineStream>? _callback;

    /// <summary>
    /// Creates the service with an empty VAD configuration. In DEBUG builds the
    /// configuration's <c>Debug</c> flag is set to 1.
    /// </summary>
    /// <param name="workflowManager">Receives task progress and task log entries, and exposes the Redis client used to store captions.</param>
    /// <param name="serviceProvider">Used to resolve the <c>Repository&lt;VideoTask&gt;</c> when captions are persisted.</param>
    public SherpaVad(VideoSliceWorkflowManager workflowManager, IServiceProvider serviceProvider)
    {
        _workflowManager = workflowManager;
        _serviceProvider = serviceProvider;
        _vadModelConfig = new VadModelConfig();
#if DEBUG
        _vadModelConfig.Debug = 1;
#endif
    }

    /// <summary>
    /// Configures the VAD model (model path, thresholds, window size) and stores the recognition callback.
    /// </summary>
    /// <param name="func">Callback invoked for every detected speech segment.</param>
    /// <param name="vadVersion">Model to use; one of the <see cref="SherpaVadVersion"/> constants. Defaults to <see cref="SherpaVadVersion.silero_vad_v5"/>.</param>
    /// <param name="numThreads">Number of threads for the model; defaults to 1.</param>
    /// <param name="useGPU">Selects the "cuda" provider instead of "cpu". If this fails, check the CUDA setup: <see href="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/></param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="vadVersion"/> is not one of the supported models.</exception>
    private void Init(Func<int, float[], OfflineStream> func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
    {
        _vadModelConfig.NumThreads = numThreads;
        _vadModelConfig.Provider = useGPU ? "cuda" : "cpu";
        var modelPath = Path.Combine(AppCommon.AIModelFile, "vad", vadVersion);

        switch (vadVersion)
        {
            case SherpaVadVersion.silero_vad_v4:
            case SherpaVadVersion.silero_vad_v5:
                _vadModelConfig.SileroVad = new SileroVadModelConfig
                {
                    Model = modelPath,
                    // Threshold / sensitivity: confidence needed to classify audio as speech (usually 0..1).
                    Threshold = 0.25f,
                    // Minimum silence (seconds) before the current utterance is considered finished.
                    MinSilenceDuration = 0.2f,
                    // Minimum length (seconds) a sound must have to count as speech.
                    MinSpeechDuration = 0.2f,
                    // Maximum length (seconds) of a single speech segment.
                    MaxSpeechDuration = 3.5f,
                };
                // Drop any TEN VAD model left over from an earlier run on this instance,
                // so that only one model family is configured.
                _vadModelConfig.TenVad = new TenVadModelConfig();
                _windowSize = _vadModelConfig.SileroVad.WindowSize;
                break;

            case SherpaVadVersion.ten_vad_324:
                _vadModelConfig.TenVad = new TenVadModelConfig
                {
                    Model = modelPath,
                    // Threshold / sensitivity: confidence needed to classify audio as speech (usually 0..1).
                    Threshold = 0.3f,
                    // Minimum silence (seconds) before the current utterance is considered finished.
                    MinSilenceDuration = 0.2f,
                    // Minimum length (seconds) a sound must have to count as speech.
                    MinSpeechDuration = 0.2f,
                    // Maximum length (seconds) of a single speech segment.
                    MaxSpeechDuration = 3.5f,
                    WindowSize = 256,
                };
                // Drop any Silero model left over from an earlier run on this instance.
                _vadModelConfig.SileroVad = new SileroVadModelConfig();
                _windowSize = _vadModelConfig.TenVad.WindowSize;
                break;

            default:
                // Previously this fell through silently, leaving no model configured and the
                // window size from an earlier run; fail early with a clear message instead.
                throw new ArgumentOutOfRangeException(nameof(vadVersion), vadVersion, "Unsupported VAD model version.");
        }

        _callback = func;
    }

    /// <summary>
    /// Runs voice activity detection over the whole wave file, invokes <paramref name="func"/> for
    /// every detected segment and returns the collected captions. When <paramref name="task"/> is
    /// supplied, progress is reported while processing and the captions are written to the task's
    /// database row and Redis entry afterwards.
    /// </summary>
    /// <param name="reader">Wave file whose samples and sample rate are processed.</param>
    /// <param name="task">Task id, or null/empty to run without progress reporting and persistence.</param>
    /// <param name="func">Callback invoked for every detected speech segment.</param>
    /// <param name="vadVersion">Model to use; one of the <see cref="SherpaVadVersion"/> constants. Defaults to <see cref="SherpaVadVersion.silero_vad_v5"/>.</param>
    /// <param name="numThreads">Number of threads for the model; defaults to 1.</param>
    /// <param name="useGPU">Selects the "cuda" provider instead of "cpu". If this fails, check the CUDA setup: <see href="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/></param>
    /// <returns>The captions (text with start/end time in seconds) in detection order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> or <paramref name="func"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="vadVersion"/> is not one of the supported models.</exception>
    public List<SenseVoiceRes> TaskHandle(WaveReader reader, string? task, Func<int, float[], OfflineStream> func, string vadVersion = SherpaVadVersion.silero_vad_v5, int numThreads = 1, bool useGPU = false)
    {
        if (reader is null)
        {
            throw new ArgumentNullException(nameof(reader));
        }
        if (func is null)
        {
            throw new ArgumentNullException(nameof(func));
        }

        Init(func, vadVersion, numThreads, useGPU);

        // Work on a span so each window can be sliced without allocating.
        ReadOnlySpan<float> allSamples = reader.Samples.AsSpan();
        _vadModelConfig.SampleRate = reader.SampleRate;

        // Only whole windows are fed to the detector; a trailing partial window is not submitted.
        int windowCount = allSamples.Length / _windowSize;
        float totalSecond = allSamples.Length / (float)_vadModelConfig.SampleRate;
        var res = new List<SenseVoiceRes>(500);

        // A using declaration releases the detector even when the callback or detector throws.
        using var vad = new VoiceActivityDetector(_vadModelConfig, bufferSizeInSeconds: 20);

        // One buffer is reused for every window instead of allocating inside the loop.
        float[] window = new float[_windowSize];
        for (int i = 0; i < windowCount; ++i)
        {
            allSamples.Slice(i * _windowSize, _windowSize).CopyTo(window);
            vad.AcceptWaveform(window);

            if (vad.IsSpeechDetected())
            {
                DrainSegments(vad, res, totalSecond, task);
            }
        }

        // Flush, then drain whatever segments the detector still holds.
        vad.Flush();
        DrainSegments(vad, res, totalSecond, task);

        if (!string.IsNullOrEmpty(task))
        {
            // The three operations below are started and deliberately not awaited.
            _ = _workflowManager.AddTaskLog(task, "==>字幕数量" + res.Count);
            var captionsStr = res.ToJson();
            _ = _serviceProvider.GetRequiredService<Repository<VideoTask>>()
                .AsUpdateable()
                .SetColumns(it => it.Captions == captionsStr)
                .Where(it => it.Id == long.Parse(task))
                .ExecuteCommandAsync();
            _ = _workflowManager.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);
        }

        return res;
    }

    /// <summary>
    /// Processes every segment currently queued in the detector and, when a task id is present,
    /// reports the progress returned for each segment.
    /// </summary>
    private void DrainSegments(VoiceActivityDetector vad, List<SenseVoiceRes> res, float totalSecond, string? task)
    {
        while (!vad.IsEmpty())
        {
            double? percent = ReadNext(vad, res, totalSecond);

            // Without a task id there is nothing to attach the progress to.
            if (percent is not null && !string.IsNullOrEmpty(task))
            {
                _workflowManager.SetTaskProgress(task, percent + "%");
            }
        }
    }

    /// <summary>
    /// Processes the segment at the front of the detector queue: runs the recognition callback on
    /// it, appends a caption to <paramref name="res"/> when text was recognised, and pops the segment.
    /// </summary>
    /// <param name="VAD">Detector whose front segment is processed and removed.</param>
    /// <param name="res">List that receives the caption for this segment.</param>
    /// <param name="totalSecond">Total audio length in seconds, used to compute the progress percentage.</param>
    /// <returns>
    /// Progress in percent (segment end time relative to <paramref name="totalSecond"/>, rounded to
    /// two decimals), or null when the segment produced no text or only a single "。".
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="VAD"/> or <paramref name="res"/> is null.</exception>
    /// <exception cref="InvalidOperationException">No recognition callback has been configured yet.</exception>
    public double? ReadNext(VoiceActivityDetector VAD, List<SenseVoiceRes> res, float totalSecond)
    {
        if (VAD is null)
        {
            throw new ArgumentNullException(nameof(VAD));
        }
        if (res is null)
        {
            throw new ArgumentNullException(nameof(res));
        }
        if (_callback is null)
        {
            // The callback is only assigned by Init, which runs at the start of TaskHandle.
            throw new InvalidOperationException("No recognition callback configured; call TaskHandle first.");
        }

        var segment = VAD.Front();
        float[] samples = segment.Samples;
        int sampleRate = _vadModelConfig.SampleRate;
        float startTime = segment.Start / (float)sampleRate;
        float duration = samples.Length / (float)sampleRate;

        using var stream = _callback(sampleRate, samples);

        // Read the result once instead of going through stream.Result for every use.
        string recognized = stream.Result.Text;

        double? progress = null;

        // Empty results and results consisting solely of a full stop are not captions.
        if (!string.IsNullOrEmpty(recognized) && recognized.Trim() != "。")
        {
            res.Add(new()
            {
                Text = recognized,
                Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
                End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
            });
            progress = Math.Round((double)(startTime + duration) / totalSecond * 100, 2);
        }

        VAD.Pop();
        return progress;
    }
}
|
||
}
|