From 9e0dca0283972baca7ab8889a3a9b69e34db97a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E8=82=A5=E7=BE=8A?= <1048382248@qq.com> Date: Mon, 4 Nov 2024 18:29:58 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8E=A5=E5=85=A5=20SenseVoice=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- VideoAnalysis/Program.cs | 1 + .../AICore/SherpaOnnx/SenseVoice.cs | 88 +++++++++++++++---- .../AICore/SherpaOnnx/SherpaOnnxDto.cs | 29 ++++++ .../AICore/SherpaOnnx/Speaker.cs | 11 +-- .../AICore/Whisper/WhisperDto.cs | 10 ++- VideoAnalysisCore/Common/RedisExpand.cs | 5 +- 6 files changed, 118 insertions(+), 26 deletions(-) create mode 100644 VideoAnalysisCore/AICore/SherpaOnnx/SherpaOnnxDto.cs diff --git a/VideoAnalysis/Program.cs b/VideoAnalysis/Program.cs index f6efdb5..cf4e036 100644 --- a/VideoAnalysis/Program.cs +++ b/VideoAnalysis/Program.cs @@ -49,6 +49,7 @@ namespace Learn.VideoAnalysis //ʼ Speaker.Init(); RedisExpand.Init(); + SenseVoice.Init(); builder.Services.AddScoped(sp => diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs index 8067b92..1ac7864 100644 --- a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs +++ b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs @@ -5,6 +5,7 @@ using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; +using VideoAnalysisCore.AICore.Whisper; using VideoAnalysisCore.Common; namespace VideoAnalysisCore.AICore.SherpaOnnx @@ -12,6 +13,8 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx public class SenseVoice { static OfflineRecognizer OR =default!; + static VoiceActivityDetector VAD =default!; + static VadModelConfig VADModelConfig = default!; /// /// 初始化 SenseVoice /// @@ -19,6 +22,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx /// public static void Init(int speakerNumber = 0, double threshold = 0.6) { + Console.WriteLine("初始化 SenseVoice"); OfflineRecognizerConfig config = new OfflineRecognizerConfig(); //采样率 config.FeatConfig.SampleRate = 16000; @@ -55,6 +59,12 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx config.ModelConfig.Debug = 0; OR = new OfflineRecognizer(config); + + VADModelConfig = new VadModelConfig(); + VADModelConfig.SileroVad.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "silero_VAD.onnx"); + VADModelConfig.Debug = 0; + //缓冲区大小 + VAD = new VoiceActivityDetector(VADModelConfig, 60); } /// /// 获取语音字幕 @@ -66,27 +76,73 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx var filePath = Path.Combine(task.LocalPath(), task + ".wav"); if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath)) throw new Exception("task 音频路径未找到"); - OfflineStream stream = OR.CreateStream(); - WaveReader waveReader = new WaveReader(filePath); - stream.AcceptWaveform(waveReader.SampleRate, waveReader.Samples); - OR.Decode(stream); - var r = stream.Result; - Console.WriteLine("--------------------"); - Console.WriteLine("Text: {0}", r.Text); - Console.WriteLine("Tokens: [{0}]", string.Join(", ", r.Tokens)); - if (r.Timestamps != null && r.Timestamps.Length > 0) + string testWaveFilename = filePath; + WaveReader reader = new WaveReader(testWaveFilename); + + int numSamples = reader.Samples.Length; + int windowSize = VADModelConfig.SileroVad.WindowSize; + int sampleRate = VADModelConfig.SampleRate; + int numIter = numSamples / windowSize; + + var res = new List(500); + for (int i = 0; i != numIter; ++i) { - Console.Write("Timestamps: ["); - var sep = ""; - for (int k = 0; k != r.Timestamps.Length; ++k) + int start = i * windowSize; + float[] samples = new float[windowSize]; + Array.Copy(reader.Samples, start, samples, 0, windowSize); + VAD.AcceptWaveform(samples); + //是否检测到语音 + if (VAD.IsSpeechDetected()) { - Console.Write("{0}{1}", sep, r.Timestamps[k].ToString("0.00")); - sep = ", "; + while (!VAD.IsEmpty()) + { + //获取最新的发言片段 + SpeechSegment segment = VAD.Front(); + float startTime = segment.Start / (float)sampleRate; + float duration = segment.Samples.Length / (float)sampleRate; + OfflineStream stream = OR.CreateStream(); + stream.AcceptWaveform(sampleRate, segment.Samples); + OR.Decode(stream); + if (!string.IsNullOrEmpty(stream.Result.Text)) + { + res.Add(new() + { + Text = stream.Result.Text, + Start= startTime, + End = startTime + duration }); + } + VAD.Pop(); + } } - Console.WriteLine("]"); } - await Task.CompletedTask; + VAD.Flush(); + + while (!VAD.IsEmpty()) + { + SpeechSegment segment = VAD.Front(); + float startTime = segment.Start / (float)sampleRate; + float duration = segment.Samples.Length / (float)sampleRate; + + OfflineStream stream = OR.CreateStream(); + stream.AcceptWaveform(sampleRate, segment.Samples); + OR.Decode(stream); + if (!string.IsNullOrEmpty(stream.Result.Text)) + { + res.Add(new() + { + Text = stream.Result.Text, + Start = startTime, + End = startTime + duration + }); + } + + VAD.Pop(); + } + + await RedisExpand.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res); + RedisExpand.InsertChannel(Enum.RedisChannelEnum.ParsingSpeaker, task); + } } } diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/SherpaOnnxDto.cs b/VideoAnalysisCore/AICore/SherpaOnnx/SherpaOnnxDto.cs new file mode 100644 index 0000000..0b4ab60 --- /dev/null +++ b/VideoAnalysisCore/AICore/SherpaOnnx/SherpaOnnxDto.cs @@ -0,0 +1,29 @@ +using Whisper.net; + +namespace VideoAnalysisCore.AICore.Whisper +{ + /// + /// 字幕识别 结果 + /// + public class SenseVoiceRes + { + public SenseVoiceRes() + { + + } + /// + /// 文本 + /// + public string Text { get; set; } = string.Empty; + /// + /// 开始时间 + /// + + public float Start { get; set; } + /// + /// 结束时间 + /// + + public float End { get; set; } + } +} diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs b/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs index 02b68ba..203ce1f 100644 --- a/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs +++ b/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs @@ -22,6 +22,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx /// public static void Init(int speakerNumber = 0, double threshold = 0.6) { + Console.WriteLine("初始化 Speaker"); var config = new OfflineSpeakerDiarizationConfig(); //Pyannote模型地址 config.Segmentation.Pyannote.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-pyannote-segmentation-3-0", "model.onnx"); @@ -72,15 +73,15 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx /// 总持续时间 /// [JsonIgnore] - public decimal Total => End - Start; + public float Total => End - Start; /// /// 开始时间 /// - public decimal Start { get; set; } + public float Start { get; set; } /// /// 结束时间 /// - public decimal End { get; set; } + public float End { get; set; } /// /// 讲话人索引 /// @@ -99,8 +100,8 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx /// public OfflineSpeakerRes(OfflineSpeakerDiarizationSegment sds) { - Start = (decimal)sds.Start; - End = (decimal)sds.End; + Start = sds.Start; + End =sds.End; SpeakerIndex = sds.Speaker; } } diff --git a/VideoAnalysisCore/AICore/Whisper/WhisperDto.cs b/VideoAnalysisCore/AICore/Whisper/WhisperDto.cs index 0524c39..3bd5b05 100644 --- a/VideoAnalysisCore/AICore/Whisper/WhisperDto.cs +++ b/VideoAnalysisCore/AICore/Whisper/WhisperDto.cs @@ -7,6 +7,10 @@ namespace VideoAnalysisCore.AICore.Whisper /// public class WhisperResDto { + public WhisperResDto() + { + + } /// /// /// @@ -20,16 +24,16 @@ namespace VideoAnalysisCore.AICore.Whisper /// /// 文本 /// - public string Text { get; } = string.Empty; + public string Text { get; set; } = string.Empty; /// /// 开始时间 /// - public TimeSpan Start { get; } + public TimeSpan Start { get; set; } /// /// 结束时间 /// - public TimeSpan End { get; } + public TimeSpan End { get; set; } } } diff --git a/VideoAnalysisCore/Common/RedisExpand.cs b/VideoAnalysisCore/Common/RedisExpand.cs index d7e6a36..a6ff907 100644 --- a/VideoAnalysisCore/Common/RedisExpand.cs +++ b/VideoAnalysisCore/Common/RedisExpand.cs @@ -79,11 +79,12 @@ namespace VideoAnalysisCore.Common /// public static RedisClient Redis = new RedisClient(AppCommon.Config.Redis.ConnectionString); /// - /// 初始化redis + /// 初始化 redis /// 需要在初始化配置文件时候调用 /// public static void Init() { + Console.WriteLine("初始化 redis"); Redis.Serialize = obj => System.Text.Json.JsonSerializer.Serialize(obj); Redis.Deserialize = (json, type) => System.Text.Json.JsonSerializer.Deserialize(json, type); InitChannel(); @@ -125,7 +126,7 @@ namespace VideoAnalysisCore.Common (msg) => { TouchChannel(RedisChannelEnum.SeparateAudio, msg, FFMPGEHandle.Audio2WAV16KAsync); }); Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingCaptions), - (msg) => { TouchChannel(RedisChannelEnum.ParsingCaptions, msg, WhisperHandle.RunTask); }); + (msg) => { TouchChannel(RedisChannelEnum.ParsingCaptions, msg, SenseVoice.RunTask); }); Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingSpeaker), (msg) => { TouchChannel(RedisChannelEnum.ParsingSpeaker, msg, Speaker.Run); }); Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ChatModelAnalysis),