Learn.VideoAnalysis/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs

226 lines
9.6 KiB
C#

using Microsoft.Extensions.Options;
using SherpaOnnx;
using SqlSugar.IOC;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using VideoAnalysisCore.Common;
using VideoAnalysisCore.Model;
using VideoAnalysisCore.Model.Enum;
using static System.Runtime.InteropServices.JavaScript.JSType;
namespace VideoAnalysisCore.AICore.SherpaOnnx
{
public static class SenseVoice
{
// Directory name of the streaming zipformer transducer model. Not referenced in the visible code;
// the same literal appears in the commented-out hot-words configuration inside Init.
const string TransducerStr = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20";
// Shared offline recognizer; assigned by Init and used by ReadNext to decode each speech segment.
static OfflineRecognizer OR = default!;
//static VoiceActivityDetector VAD = default!;
// VAD model configuration; assigned by Init and used by TaskHandle to create a detector per call.
static VadModelConfig VADModelConfig = default!;
/// <summary>
/// Initializes the shared SenseVoice offline recognizer and the Silero VAD model configuration.
/// </summary>
/// <param name="numThreads">Number of threads passed to the model configuration (default 6).</param>
/// <param name="useGPU">
/// <c>true</c> selects the "cuda" provider, <c>false</c> (default) selects "cpu".
/// If the CUDA provider fails to load, see the CUDA setup notes:
/// <see href="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/>.
/// </param>
/// <param name="useHotwords">Currently unused; the hot-words configuration in the body is commented out.</param>
public static void Init(int numThreads = 6, bool useGPU = false, bool useHotwords = false)
{
    Console.WriteLine("初始化 SenseVoice");

    OfflineRecognizerConfig config = new OfflineRecognizerConfig();

    // Sample rate.
    config.FeatConfig.SampleRate = 16000;
    // Feature dimension the model was trained with.
    config.FeatConfig.FeatureDim = 80;

    // Path to tokens.txt.
    config.ModelConfig.Tokens = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "tokens.txt");
    // SenseVoice model.
    config.ModelConfig.SenseVoice.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "model.onnx");
    // 1 = apply inverse text normalization to the SenseVoice output.
    config.ModelConfig.SenseVoice.UseInverseTextNormalization = 1;
    config.ModelConfig.SenseVoice.Language = "zh";

    // Model type.
    config.ModelConfig.ModelType = string.Empty;
    config.ModelConfig.NumThreads = numThreads;

    // Use CUDA only when the caller asked for the GPU. The previous condition was inverted
    // (`!useGPU`), which selected "cuda" for CPU requests and "cpu" for GPU requests.
    config.ModelConfig.Provider = useGPU ? "cuda" : "cpu";

    // Decoding method: "greedy_search" or "modified_beam_search".
    config.DecodingMethod = "greedy_search";
    //// Modified beam search:
    //config.DecodingMethod = "modified_beam_search";
    //// Only used when the decoding method is modified_beam_search:
    //// number of active paths kept during the search.
    //config.MaxActivePaths =4;

    // Hot-words configuration (disabled).
    //if (false)
    //{
    //    // Hot-words file
    //    config.HotwordsFile = Path.Combine(AppCommon.AIModelFile, "Hotwords.txt");
    //    config.DecodingMethod = "modified_beam_search";
    //    // Hot-words score
    //    config.HotwordsScore = 1.5f;
    //    config.ModelConfig.ModelingUnit = "cjkchar+bpe";
    //    config.ModelConfig.BpeVocab = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "bpe.model");
    //    config.ModelConfig.Transducer = new OfflineTransducerModelConfig()
    //    {
    //        Decoder = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "decoder-epoch-99-avg-1.onnx"),
    //        Encoder = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "encoder-epoch-99-avg-1.onnx"),
    //        Joiner = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", "joiner-epoch-99-avg-1.onnx"),
    //    };
    //}

    // Path to the inverse-text-normalization rule FST.
    config.RuleFsts = Path.Combine(AppCommon.AIModelFile, "itn_subject_sx.fst");
#if DEBUG
    config.ModelConfig.Debug = 1;
#endif
    OR = new OfflineRecognizer(config);

    VADModelConfig = new VadModelConfig();
    VADModelConfig.SileroVad.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "silero_vad.onnx");
    VADModelConfig.Debug = 0;
}
/// <summary>
/// Produces captions for the wave audio contained in a stream.
/// </summary>
/// <param name="s">Stream holding the wave data; must not be null.</param>
/// <returns>The captions returned by <c>TaskHandle</c>.</returns>
/// <exception cref="Exception">Thrown when <paramref name="s"/> is null.</exception>
public static async Task<List<SenseVoiceRes>> RunTask(Stream s)
{
    if (s is not null)
    {
        var waveReader = new WaveReader(s);
        var captions = await TaskHandle(waveReader);
        return captions;
    }

    throw new Exception("音频路径 is null");
}
/// <summary>
/// Produces captions for a task's audio file. The file is expected at "task.wav" inside the
/// directory returned by <c>task.LocalPath()</c>; the captions themselves are not returned
/// by this overload.
/// </summary>
/// <param name="task">Task id; also forwarded to <c>TaskHandle</c>.</param>
/// <returns>A task that completes when <c>TaskHandle</c> has finished.</returns>
/// <exception cref="Exception">Thrown when the audio file does not exist.</exception>
public static async Task RunTask(string task)
{
    string wavPath = Path.Combine(task.LocalPath(), "task.wav");

    bool wavAvailable = !string.IsNullOrEmpty(wavPath) && File.Exists(wavPath);
    if (!wavAvailable)
    {
        throw new Exception("task 音频路径未找到");
    }

    var waveReader = new WaveReader(wavPath);
    await TaskHandle(waveReader, task);
}
/// <summary>
/// Feeds the audio to a voice-activity detector window by window, recognizes every detected
/// speech segment through <c>ReadNext</c>, and collects the captions.
/// </summary>
/// <param name="reader">Wave reader supplying the samples.</param>
/// <param name="task">
/// Optional task id (default null). When provided, the captions are additionally saved to the
/// <c>VideoTask</c> row with that id and to the task's Redis entry under "Captions", and
/// <c>RedisExpand.NewTaskAsync</c> is invoked afterwards.
/// </param>
/// <returns>The captions collected for the audio.</returns>
public static async Task<List<SenseVoiceRes>> TaskHandle(WaveReader reader, string? task = null)
{
    // Build the recognizer and VAD configuration on first use.
    if (OR is null)
    {
        Init();
    }

    float[] pcm = reader.Samples;
    int totalSamples = pcm.Length;
    int chunkSize = VADModelConfig.SileroVad.WindowSize;
    // Integer division: samples after the last full window are not fed to the detector.
    int chunkCount = totalSamples / chunkSize;
    float totalSecond = totalSamples / (float)VADModelConfig.SampleRate;

    var captions = new List<SenseVoiceRes>(500);
    using var detector = new VoiceActivityDetector(VADModelConfig, 30);

    // Recognizes every segment currently queued in the detector.
    async Task DrainSegmentsAsync()
    {
        while (!detector.IsEmpty())
        {
            await detector.ReadNext(captions, totalSecond, task);
        }
    }

    for (int chunk = 0, offset = 0; chunk != chunkCount; ++chunk, offset += chunkSize)
    {
        var window = new float[chunkSize];
        Array.Copy(pcm, offset, window, 0, chunkSize);
        detector.AcceptWaveform(window);

        // Only drain while speech is being detected.
        if (detector.IsSpeechDetected())
        {
            await DrainSegmentsAsync();
        }
    }

    // Flush the detector and process whatever segments remain.
    detector.Flush();
    await DrainSegmentsAsync();

    // Without a task id there is nothing to persist.
    if (string.IsNullOrEmpty(task))
    {
        return captions;
    }

    Console.WriteLine($"{DateTime.Now}=> SenseVoice 字幕数量{captions.Count}");

    var captionsStr = captions.ToJson();
    var update = DbScoped.Sugar
        .Updateable<VideoTask>()
        .SetColumns(it => it.Captions == captionsStr)
        .Where(it => it.Id == long.Parse(task));
    await update.ExecuteCommandAsync();

    await RedisExpand.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", captions);

    // Captions are done; continue receiving tasks.
    RedisExpand.NewTaskAsync();

    return captions;
}
/// <summary>
/// Recognizes the segment at the front of the detector, appends a caption for it when the
/// recognizer produced text other than a lone "。", and pops the segment from the detector.
/// </summary>
/// <param name="VAD">Detector whose front segment is processed and then popped.</param>
/// <param name="res">List that receives the caption.</param>
/// <param name="totalSecond">Total audio length in seconds; used to compute the progress percentage.</param>
/// <param name="task">Optional task id; when set, progress is reported via <c>RedisExpand.SetTaskProgress</c>.</param>
/// <returns>A task representing the operation. The body contains no awaits; the method stays
/// <c>async</c> so exceptions keep surfacing through the returned task.</returns>
public static async Task ReadNext(this VoiceActivityDetector VAD, List<SenseVoiceRes> res, float totalSecond, string? task = null)
{
    var segment = VAD.Front();
    float[] samples = segment.Samples;

    int sampleRate = VADModelConfig.SampleRate;
    float sampleRateF = sampleRate;
    float startTime = segment.Start / sampleRateF;
    float duration = samples.Length / sampleRateF;

    using var stream = OR.CreateStream();
    stream.AcceptWaveform(sampleRate, samples);
    OR.Decode(stream);

    // Read the recognition result once instead of querying the stream for every access.
    string recognized = stream.Result.Text;

    // Skip empty results and segments that consist of a single full stop.
    if (!string.IsNullOrEmpty(recognized) && recognized.Trim() != "。")
    {
        // The untrimmed recognizer text is stored, as before.
        res.Add(new()
        {
            Text = recognized,
            Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
            End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
        });

        if (!string.IsNullOrEmpty(task))
        {
            // Progress = end of this segment relative to the total duration, as a percentage.
            RedisExpand.SetTaskProgress(task, Math.Round((double)(startTime + duration) / (totalSecond) * 100, 2) + "%");
        }
    }

    // Single exit path: the processed segment is always removed from the detector.
    VAD.Pop();
}
}
}