265 lines
12 KiB
C#
265 lines
12 KiB
C#
using Microsoft.Extensions.Options;
|
|
using SherpaOnnx;
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Diagnostics;
|
|
using System.IO;
|
|
using System.Linq;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
using System.Threading.Tasks;
|
|
using VideoAnalysisCore.Common;
|
|
using static System.Runtime.InteropServices.JavaScript.JSType;
|
|
|
|
namespace VideoAnalysisCore.AICore.SherpaOnnx
|
|
{
|
|
public class SenseVoice
{
    /// <summary>
    /// Directory name of the streaming zipformer transducer model.
    /// Only referenced by the (currently disabled) hotwords setup described in <see cref="Init"/>.
    /// </summary>
    const string TransducerStr = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20";

    /// <summary>Directory (under <c>AppCommon.AIModelFile</c>) holding the SenseVoice and Silero VAD model files.</summary>
    const string SenseVoiceModelDir = "sherpa-onnx-sense-voice-24-07-17";

    /// <summary>Buffer size, in seconds, handed to every <see cref="VoiceActivityDetector"/>.</summary>
    const float VadBufferSizeInSeconds = 60;

    /// <summary>Shared recognizer; stays <c>null</c> until <see cref="Init"/> has run.</summary>
    static OfflineRecognizer OR = default!;

    /// <summary>VAD configuration created by <see cref="Init"/>; a fresh detector is built from it for every run.</summary>
    static VadModelConfig VADModelConfig = default!;

    /// <summary>
    /// Initializes the shared SenseVoice recognizer and the VAD configuration.
    /// </summary>
    /// <param name="numThreads">Number of threads used by the recognizer model (default 6).</param>
    /// <param name="useGPU">
    /// When <c>true</c> the "cuda" provider is selected, otherwise the library default is kept.
    /// If this fails, check the CUDA setup described at
    /// <see href="https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/large-v3.html#run-with-gpu-float32"/>.
    /// </param>
    /// <param name="useHotwords">Currently ignored: the hotwords feature is disabled.</param>
    public static void Init(int numThreads = 6, bool useGPU = false, bool useHotwords = false)
    {
        Console.WriteLine("初始化 SenseVoice");

        OfflineRecognizerConfig config = new OfflineRecognizerConfig();
        // Sample rate of the input features.
        config.FeatConfig.SampleRate = 16000;
        // Feature dimension the model was trained with.
        config.FeatConfig.FeatureDim = 80;
        // Path to tokens.txt.
        config.ModelConfig.Tokens = Path.Combine(AppCommon.AIModelFile, SenseVoiceModelDir, "tokens.txt");
        // SenseVoice model.
        config.ModelConfig.SenseVoice.Model = Path.Combine(AppCommon.AIModelFile, SenseVoiceModelDir, "model.onnx");
        // 1 = apply inverse text normalization to the recognized text.
        config.ModelConfig.SenseVoice.UseInverseTextNormalization = 1;
        config.ModelConfig.SenseVoice.Language = "zh";
        // Model type is left empty.
        config.ModelConfig.ModelType = string.Empty;
        config.ModelConfig.NumThreads = numThreads;

        // Fix: the flag used to be inverted ("cuda" was selected when useGPU was false).
        if (useGPU)
        {
            config.ModelConfig.Provider = "cuda";
        }

        // Decoding method: "greedy_search" or "modified_beam_search".
        // "modified_beam_search" additionally uses config.MaxActivePaths
        // (the number of active paths kept during the search).
        config.DecodingMethod = "greedy_search";

        // Hotwords are disabled (the previous setup was marked as not working).
        // That setup switched to "modified_beam_search" and set HotwordsFile ("Hotwords.txt"),
        // HotwordsScore = 1.5f, ModelingUnit = "cjkchar+bpe", BpeVocab ("bpe.model") and a
        // Transducer encoder/decoder/joiner taken from the TransducerStr model directory.

        // Path to the inverse-text-normalization rule FST.
        config.RuleFsts = Path.Combine(AppCommon.AIModelFile, "itn_subject_sx.fst");

#if DEBUG
        config.ModelConfig.Debug = 1;
#endif

        // Build the VAD configuration first and publish the recognizer last:
        // the RunTask overloads use "OR is null" as their "not initialized yet" check.
        VadModelConfig vadConfig = new VadModelConfig();
        vadConfig.SileroVad.Model = Path.Combine(AppCommon.AIModelFile, SenseVoiceModelDir, "silero_vad.onnx");
        vadConfig.Debug = 0;
        VADModelConfig = vadConfig;

        OR = new OfflineRecognizer(config);
    }

    /// <summary>
    /// Produces captions for the wave audio contained in a stream.
    /// </summary>
    /// <param name="s">Stream handed to <c>WaveReader</c>.</param>
    /// <returns>One entry per speech segment whose recognized text is not empty.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
    public static async Task<List<SenseVoiceRes>> RunTask(Stream s)
    {
        // NOTE: there is nothing to await here; the work runs synchronously and
        // "async" only keeps exceptions flowing through the returned task as before.
        EnsureInitialized();
        if (s is null)
        {
            throw new ArgumentNullException(nameof(s), "音频路径 is null");
        }

        return Transcribe(new WaveReader(s), null);
    }

    /// <summary>
    /// Produces captions for a task's wave file, stores them in the task's Redis hash
    /// under "Captions" and queues the task on the ChatModelAnalysis channel.
    /// </summary>
    /// <param name="task">Task id; the audio is read from <c>task.LocalPath()</c>/<c>{task}.wav</c>.</param>
    /// <returns>A task that completes once the captions are stored and the task is queued.</returns>
    /// <exception cref="FileNotFoundException">The task's wave file does not exist.</exception>
    public static async Task RunTask(string task)
    {
        EnsureInitialized();

        var filePath = Path.Combine(task.LocalPath(), task + ".wav");
        if (!File.Exists(filePath))
        {
            throw new FileNotFoundException("task 音频路径未找到", filePath);
        }

        List<SenseVoiceRes> captions = Transcribe(
            new WaveReader(filePath),
            progress => RedisExpand.SetTaskProgress(task, progress));

        await RedisExpand.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", captions);
        // The ParsingSpeaker channel is currently skipped; the task goes straight to ChatModelAnalysis.
        RedisExpand.InsertChannel(Enum.RedisChannelEnum.ChatModelAnalysis, task);
    }

    /// <summary>Runs <see cref="Init"/> with its defaults when no recognizer has been created yet.</summary>
    private static void EnsureInitialized()
    {
        if (OR is null)
        {
            Init();
        }
    }

    /// <summary>
    /// Feeds the audio through a voice activity detector window by window and decodes
    /// every detected speech segment with the shared recognizer.
    /// </summary>
    /// <param name="reader">Source of the audio samples.</param>
    /// <param name="onProgress">
    /// Optional callback receiving the progress in percent (segment end time relative to the
    /// total audio length). It is only invoked for segments decoded while audio is still being fed.
    /// </param>
    /// <returns>Captions in the order the segments were produced.</returns>
    private static List<SenseVoiceRes> Transcribe(WaveReader reader, Action<float>? onProgress)
    {
        float[] audio = reader.Samples;
        int windowSize = VADModelConfig.SileroVad.WindowSize;
        int sampleRate = VADModelConfig.SampleRate;
        // Only whole windows are fed; a trailing partial window is not processed.
        int windowCount = audio.Length / windowSize;
        float totalSeconds = audio.Length / (float)sampleRate;
        var captions = new List<SenseVoiceRes>(500);

        // The detector is disposed when this method exits, including on exceptions.
        using var vad = new VoiceActivityDetector(VADModelConfig, VadBufferSizeInSeconds);

        for (int i = 0; i != windowCount; ++i)
        {
            var window = new float[windowSize];
            Array.Copy(audio, i * windowSize, window, 0, windowSize);
            vad.AcceptWaveform(window);

            // Finished segments are only drained while speech is being detected.
            if (vad.IsSpeechDetected())
            {
                DecodePendingSegments(vad, sampleRate, totalSeconds, captions, onProgress);
            }
        }

        // Flush the detector and decode whatever segments are still queued.
        // Progress is intentionally not reported here (same as before).
        vad.Flush();
        DecodePendingSegments(vad, sampleRate, totalSeconds, captions, null);

        return captions;
    }

    /// <summary>
    /// Decodes and pops every segment currently queued in <paramref name="vad"/>,
    /// appending a caption for each segment with non-empty text.
    /// </summary>
    private static void DecodePendingSegments(
        VoiceActivityDetector vad,
        int sampleRate,
        float totalSeconds,
        List<SenseVoiceRes> captions,
        Action<float>? onProgress)
    {
        while (!vad.IsEmpty())
        {
            // Oldest queued speech segment.
            SpeechSegment segment = vad.Front();
            float[] segmentSamples = segment.Samples;
            float startTime = segment.Start / (float)sampleRate;
            float endTime = startTime + segmentSamples.Length / (float)sampleRate;

            // Each stream is disposed at the end of its iteration.
            using OfflineStream stream = OR.CreateStream();
            stream.AcceptWaveform(sampleRate, segmentSamples);
            OR.Decode(stream);

            string text = stream.Result.Text;
            if (!string.IsNullOrEmpty(text))
            {
                captions.Add(new SenseVoiceRes
                {
                    Text = text,
                    Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
                    End = (float)Math.Round(endTime, 2, MidpointRounding.AwayFromZero),
                });
                onProgress?.Invoke(endTime / totalSeconds * 100);
            }

            vad.Pop();
        }
    }
}
|
|
}
|