接入 SenseVoice模型

2024-11-04 18:29:58 +08:00 · 2024-11-04 18:29:58 +08:00 · 9e0dca0283
parent b1cbedb9e8
commit 9e0dca0283
6 changed files with 118 additions and 26 deletions
--- a/VideoAnalysis/Program.cs
+++ b/VideoAnalysis/Program.cs
@ -49,6 +49,7 @@ namespace Learn.VideoAnalysis
            //初始化 插件
            Speaker.Init();
            RedisExpand.Init();
+            SenseVoice.Init();


            builder.Services.AddScoped(sp =>
--- a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
@ -5,6 +5,7 @@ using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
+using VideoAnalysisCore.AICore.Whisper;
 using VideoAnalysisCore.Common;

 namespace VideoAnalysisCore.AICore.SherpaOnnx
@ -12,6 +13,8 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
    public  class SenseVoice
    {
        static OfflineRecognizer OR =default!;
+        static VoiceActivityDetector VAD =default!;
+        static VadModelConfig VADModelConfig = default!;
        /// <summary>
        /// 初始化 SenseVoice
        /// </summary>
@ -19,6 +22,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
        /// <param name="threshold"></param>
        public static void Init(int speakerNumber = 0, double threshold = 0.6)
        {
+            Console.WriteLine("初始化 SenseVoice");
            OfflineRecognizerConfig config = new OfflineRecognizerConfig();
            //采样率
            config.FeatConfig.SampleRate = 16000;
@ -55,6 +59,12 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
            config.ModelConfig.Debug = 0;

            OR = new OfflineRecognizer(config);
+
+            VADModelConfig = new VadModelConfig();
+            VADModelConfig.SileroVad.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "silero_VAD.onnx");
+            VADModelConfig.Debug = 0;
+            // Second constructor argument is the VAD buffer size, in seconds
+            VAD = new VoiceActivityDetector(VADModelConfig, 60);
        }
        /// <summary>
        /// 获取语音字幕
@ -66,27 +76,73 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
            var filePath = Path.Combine(task.LocalPath(), task + ".wav");
            if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
                throw new Exception("task 音频路径未找到");
-            OfflineStream stream = OR.CreateStream();
-            WaveReader waveReader = new WaveReader(filePath);
-            stream.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
-            OR.Decode(stream);

-            var r = stream.Result;
-            Console.WriteLine("--------------------");
-            Console.WriteLine("Text: {0}", r.Text);
-            Console.WriteLine("Tokens: [{0}]", string.Join(", ", r.Tokens));
-            if (r.Timestamps != null && r.Timestamps.Length > 0)
+            string testWaveFilename = filePath;
+            WaveReader reader = new WaveReader(testWaveFilename);
+
+            int numSamples = reader.Samples.Length;
+            int windowSize = VADModelConfig.SileroVad.WindowSize;
+            int sampleRate = VADModelConfig.SampleRate;
+            int numIter = numSamples / windowSize;
+
+            var res = new List<SenseVoiceRes>(500);
+            for (int i = 0; i != numIter; ++i)
            {
-                Console.Write("Timestamps: [");
-                var sep = "";
-                for (int k = 0; k != r.Timestamps.Length; ++k)
+                int start = i * windowSize;
+                float[] samples = new float[windowSize];
+                Array.Copy(reader.Samples, start, samples, 0, windowSize);
+                VAD.AcceptWaveform(samples);
+                //是否检测到语音
+                if (VAD.IsSpeechDetected())
                {
-                    Console.Write("{0}{1}", sep, r.Timestamps[k].ToString("0.00"));
-                    sep = ", ";
+                    while (!VAD.IsEmpty())
+                    {
+                        // Take the segment at the front of the VAD queue (oldest pending one); Pop() below removes it
+                        SpeechSegment segment = VAD.Front();
+                        float startTime = segment.Start / (float)sampleRate;
+                        float duration = segment.Samples.Length / (float)sampleRate;
+                        OfflineStream stream = OR.CreateStream();
+                        stream.AcceptWaveform(sampleRate, segment.Samples);
+                        OR.Decode(stream);
+                        if (!string.IsNullOrEmpty(stream.Result.Text))
+                        {
+                            res.Add(new() 
+                            { 
+                                Text = stream.Result.Text,
+                                Start= startTime,
+                                End = startTime + duration });
                        }
-                Console.WriteLine("]");
-            }
-            await Task.CompletedTask;
+                        VAD.Pop();
+                    }
+                }
+            }
+            VAD.Flush();
+
+            while (!VAD.IsEmpty())
+            {
+                SpeechSegment segment = VAD.Front();
+                float startTime = segment.Start / (float)sampleRate;
+                float duration = segment.Samples.Length / (float)sampleRate;
+
+                OfflineStream stream = OR.CreateStream();
+                stream.AcceptWaveform(sampleRate, segment.Samples);
+                OR.Decode(stream);
+                if (!string.IsNullOrEmpty(stream.Result.Text))
+                {
+                    res.Add(new()
+                    {
+                        Text = stream.Result.Text,
+                        Start = startTime,
+                        End = startTime + duration
+                    });
+                }
+
+                VAD.Pop();
+            }
+
+            await RedisExpand.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);
+            RedisExpand.InsertChannel(Enum.RedisChannelEnum.ParsingSpeaker, task);
+
        }
    }
 }
--- a/VideoAnalysisCore/AICore/SherpaOnnx/SherpaOnnxDto.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/SherpaOnnxDto.cs
@ -0,0 +1,29 @@
+using Whisper.net;
+
+namespace VideoAnalysisCore.AICore.Whisper
+{
+    /// <summary>
+    /// Caption (subtitle) recognition result: the text and time span of one recognized speech segment.
+    /// </summary>
+    public class SenseVoiceRes
+    {
+        public SenseVoiceRes()
+        {
+                
+        }
+        /// <summary>
+        /// Recognized text of the segment; defaults to the empty string.
+        /// </summary>
+        public string Text { get; set; } = string.Empty;
+        /// <summary>
+        /// Start time; the producer in this commit computes it as segment start sample / sample rate (i.e. seconds).
+        /// </summary>
+
+        public float Start { get; set; }
+        /// <summary>
+        /// End time; the producer in this commit sets it to Start plus the segment duration (same unit as Start).
+        /// </summary>
+
+        public float End { get; set; }
+    }
+}
--- a/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
@ -22,6 +22,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
        /// <param name="threshold"></param>
        public static void Init(int speakerNumber = 0, double threshold = 0.6)
        {
+            Console.WriteLine("初始化 Speaker");
            var config = new OfflineSpeakerDiarizationConfig();
            //Pyannote模型地址
            config.Segmentation.Pyannote.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-pyannote-segmentation-3-0", "model.onnx");
@ -72,15 +73,15 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
        /// 总持续时间
        /// </summary>
        [JsonIgnore]
-        public decimal Total => End - Start;
+        public float Total => End - Start;
        /// <summary>
        /// 开始时间
        /// </summary>
-        public decimal Start { get; set; }
+        public float Start { get; set; }
        /// <summary>
        /// 结束时间
        /// </summary>
-        public decimal End { get; set; }
+        public float End { get; set; }
        /// <summary>
        /// 讲话人索引
        /// </summary>
@ -99,8 +100,8 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
        /// <param name="sds"></param>
        public OfflineSpeakerRes(OfflineSpeakerDiarizationSegment sds)
        {
-            Start = (decimal)sds.Start;
-            End = (decimal)sds.End;
+            Start = sds.Start;
+            End =sds.End;
            SpeakerIndex = sds.Speaker;
        }
    }
--- a/VideoAnalysisCore/AICore/Whisper/WhisperDto.cs
+++ b/VideoAnalysisCore/AICore/Whisper/WhisperDto.cs
@ -7,6 +7,10 @@ namespace VideoAnalysisCore.AICore.Whisper
    /// </summary>
    public class WhisperResDto
    {
+        public WhisperResDto()
+        {
+                
+        }
        /// <summary>
        /// 
        /// </summary>
@ -20,16 +24,16 @@ namespace VideoAnalysisCore.AICore.Whisper
        /// <summary>
        /// 文本
        /// </summary>
-        public string Text { get; } = string.Empty;
+        public string Text { get; set; } = string.Empty;
        /// <summary>
        /// 开始时间
        /// </summary>

-        public TimeSpan Start { get; }
+        public TimeSpan Start { get; set; }
        /// <summary>
        /// 结束时间
        /// </summary>

-        public TimeSpan End { get; }
+        public TimeSpan End { get; set; }
    }
 }
--- a/VideoAnalysisCore/Common/RedisExpand.cs
+++ b/VideoAnalysisCore/Common/RedisExpand.cs
@ -84,6 +84,7 @@ namespace VideoAnalysisCore.Common
        /// </summary>
        public static void Init()
        {
+            Console.WriteLine("初始化 redis");
            Redis.Serialize = obj => System.Text.Json.JsonSerializer.Serialize(obj);
            Redis.Deserialize = (json, type) => System.Text.Json.JsonSerializer.Deserialize(json, type);
            InitChannel();
@ -125,7 +126,7 @@ namespace VideoAnalysisCore.Common
                (msg) => { TouchChannel(RedisChannelEnum.SeparateAudio, msg, FFMPGEHandle.Audio2WAV16KAsync); });

            Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingCaptions),
-                (msg) => { TouchChannel(RedisChannelEnum.ParsingCaptions, msg, WhisperHandle.RunTask); });
+                (msg) => { TouchChannel(RedisChannelEnum.ParsingCaptions, msg, SenseVoice.RunTask); });
            Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingSpeaker),
                (msg) => { TouchChannel(RedisChannelEnum.ParsingSpeaker, msg, Speaker.Run); });
            Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ChatModelAnalysis),