优化语音识别vad 进度重置

2024-11-20 14:58:00 +08:00 · 2024-11-20 14:58:00 +08:00 · b9563452c3
parent d477051c06
commit b9563452c3
4 changed files with 174 additions and 6 deletions
--- a/VideoAnalysis/Controllers/ApiController.cs
+++ b/VideoAnalysis/Controllers/ApiController.cs
@ -13,6 +13,10 @@ using System.Threading.Tasks;
 using FFmpeg.NET.Services;
 using MapsterMapper;
 using Mapster;
+using VideoAnalysisCore.AICore.SherpaOnnx;
+using System.Net;
+using System.Security.Policy;
+using System.IO;

 namespace Learn.VideoAnalysis.Controllers
 {
@ -42,13 +46,49 @@ namespace Learn.VideoAnalysis.Controllers
        }

        /// <summary>
-        /// 获取视频信息<para>taskId/tagId二选一</para>
+        /// 语音识别
        /// </summary>
-        /// <param name="taskId"></param>
-        /// <param name="tagId">自定义id</param>
-        /// <param name="needSubtitle">是否附加字幕</param>
+        /// <param name="url">URL of the remote audio file to download and recognize</param>
        /// <returns></returns>
-        [HttpGet(Name = "TaskInfo")]
+        [HttpGet(Name = "AudioRecognitionUrl")]
+        public async Task<IActionResult> AudioRecognitionUrl(string url)
+        {
+            // Reject anything that is not an absolute http/https URL before a request is made.
+            // Uri.TryCreate returns false for null/empty input, so no separate null check is needed.
+            if (!Uri.TryCreate(url, UriKind.Absolute, out var audioUri)
+                || (audioUri.Scheme != Uri.UriSchemeHttp && audioUri.Scheme != Uri.UriSchemeHttps))
+            {
+                return BadRequest("url 必须是有效的 http/https 地址");
+            }
+
+            try
+            {
+                // NOTE(review): the URL is caller-supplied, so this endpoint can be used to make the
+                // server fetch internal addresses (SSRF). Consider an allow-list of hosts.
+                // The response body is consumed as a forward-only stream; it is not buffered to disk.
+                using var networkStream = await s_audioHttpClient.GetStreamAsync(audioUri);
+                var res = await SenseVoice.RunTask(networkStream);
+                return Ok(res);
+            }
+            catch (Exception ex)
+            {
+                // Download and decode failures are reported to the caller as 400 with the message.
+                return BadRequest(ex.Message);
+            }
+        }
+
+        // Shared client for AudioRecognitionUrl: creating an HttpClient per request leaves sockets
+        // in TIME_WAIT and can exhaust them under load.
+        private static readonly HttpClient s_audioHttpClient = new HttpClient();
+        /// <summary>
+        /// 语音识别
+        /// </summary>
+        /// <param name="file">文件流</param>
+        /// <returns></returns>
+        [HttpPost(Name = "AudioRecognition")]
+        public async Task<IActionResult> AudioRecognition(IFormFile file)
+        {
+            // A missing or empty upload would otherwise surface as a NullReferenceException (HTTP 500).
+            if (file is null || file.Length == 0)
+                return BadRequest("file is empty");
+
+            try
+            {
+                using var audioStream = file.OpenReadStream();
+                var res = await SenseVoice.RunTask(audioStream);
+                return Ok(res);
+            }
+            catch (Exception ex)
+            {
+                // Same error contract as AudioRecognitionUrl: recognition failures are returned as 400.
+                return BadRequest(ex.Message);
+            }
+        }
+
+
+
+            /// <summary>
+            /// 获取视频信息<para>taskId/tagId二选一</para>
+            /// </summary>
+            /// <param name="taskId"></param>
+            /// <param name="tagId">自定义id</param>
+            /// <param name="needSubtitle">是否附加字幕</param>
+            /// <returns></returns>
+            [HttpGet(Name = "TaskInfo")]
        public async Task<IActionResult> TaskInfo(long taskId,string? tagId,bool needSubtitle=false)
        {
            var  task = await videoTaskDB.AsQueryable()
--- a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
@ -2,6 +2,7 @@
 using SherpaOnnx;
 using System;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
@ -13,7 +14,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
    public  class SenseVoice
    {
        static OfflineRecognizer OR =default!;
-        static VoiceActivityDetector VAD =default!;
+        static VoiceActivityDetector VAD = default!;
        static VadModelConfig VADModelConfig = default!;
        /// <summary>
        /// 初始化 SenseVoice
@ -61,7 +62,9 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
            //反转文本规范化规则 fst 的路径
            config.RuleFsts =  string.Empty;

+#if DEBUG
            config.ModelConfig.Debug = 1;
+#endif

            OR = new OfflineRecognizer(config);

@ -71,6 +74,85 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
            //缓冲区大小
            VAD = new VoiceActivityDetector(VADModelConfig, 60);
        }
+
+        /// <summary>
+        /// 获取语音字幕
+        /// </summary>
+        /// <param name="s"></param>
+        /// <returns></returns>
+        public static async Task<List<SenseVoiceRes>> RunTask(Stream s)
+        {
+            // Contract: reads a WAV stream, feeds it to the shared VAD window by window, and returns
+            // one SenseVoiceRes (text + start/end in seconds, rounded to 2 decimals) per speech
+            // segment with non-empty text. The method contains no awaits, so it runs synchronously
+            // on the calling thread.
+            if (s is null)
+                throw new ArgumentNullException(nameof(s), "音频流 is null");
+            if (OR is null)
+                Init();
+
+            WaveReader reader = new WaveReader(s);
+            int windowSize = VADModelConfig.SileroVad.WindowSize;
+            int sampleRate = VADModelConfig.SampleRate;
+            // Integer division: a trailing partial window (fewer than windowSize samples) is not fed to the VAD.
+            int numIter = reader.Samples.Length / windowSize;
+            var res = new List<SenseVoiceRes>(500);
+
+            try
+            {
+                for (int i = 0; i != numIter; ++i)
+                {
+                    float[] samples = new float[windowSize];
+                    Array.Copy(reader.Samples, i * windowSize, samples, 0, windowSize);
+                    VAD.AcceptWaveform(samples);
+                    // Only drain the segment queue while speech is being detected.
+                    if (VAD.IsSpeechDetected())
+                        DrainVadSegments(res, sampleRate);
+                }
+
+                // Flush, then collect whatever segments are queued afterwards.
+                VAD.Flush();
+                DrainVadSegments(res, sampleRate);
+            }
+            finally
+            {
+                // VAD is a shared static instance: always reset it, even when decoding throws,
+                // so a failed call cannot leave stale state behind for the next one.
+                VAD.Reset();
+            }
+
+            return res;
+        }
+
+        /// <summary>
+        /// Decodes every speech segment currently queued in the shared VAD and appends the
+        /// non-empty results to <paramref name="res"/>, popping each segment once handled.
+        /// </summary>
+        /// <param name="res">Result list to append to.</param>
+        /// <param name="sampleRate">Sample rate used to convert sample offsets to seconds.</param>
+        private static void DrainVadSegments(List<SenseVoiceRes> res, int sampleRate)
+        {
+            while (!VAD.IsEmpty())
+            {
+                SpeechSegment segment = VAD.Front();
+                float startTime = segment.Start / (float)sampleRate;
+                float duration = segment.Samples.Length / (float)sampleRate;
+
+                // Dispose the recognizer stream once this segment has been decoded.
+                using (OfflineStream stream = OR.CreateStream())
+                {
+                    stream.AcceptWaveform(sampleRate, segment.Samples);
+                    OR.Decode(stream);
+
+                    // Read the result once instead of querying it separately for the check and the copy.
+                    string text = stream.Result.Text;
+                    if (!string.IsNullOrEmpty(text))
+                    {
+                        res.Add(new()
+                        {
+                            Text = text,
+                            Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
+                            End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
+                        });
+                    }
+                }
+
+                VAD.Pop();
+            }
+        }
        /// <summary>
        /// 获取语音字幕
        /// </summary>
@ -91,6 +173,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
            int numIter = numSamples / windowSize;
            var totalSecond = numSamples / (float)sampleRate;
            var res = new List<SenseVoiceRes>(500);
+            //var VAD = new VoiceActivityDetector(VADModelConfig, 60);
            for (int i = 0; i != numIter; ++i)
            {
                int start = i * windowSize;
--- a/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
@ -35,6 +35,10 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
            //需要使用GPU
            if (!useGPU)
                config.Embedding.Provider = "cuda";
+#if DEBUG
+            config.Embedding.Debug = 1;
+#endif
+
            //说话人判定阈值
            config.Clustering.Threshold = threshold;
            SD = new OfflineSpeakerDiarization(config);
--- a/VideoAnalysisCore/AICore/SherpaOnnx/WaveHeader.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/WaveHeader.cs
@ -90,6 +90,29 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
    // The sample rate can be any value.
    public class WaveReader
    {
+        public WaveReader(Stream stream)
+        {
+            // Reads a WAV file from an arbitrary stream: header, optional metadata chunks,
+            // then the sample data, which is assumed to be 16-bit PCM and is normalised to floats.
+            if (stream is null)
+                throw new ArgumentNullException(nameof(stream));
+
+            // leaveOpen: the caller owns the stream and disposes it; the reader must not close it.
+            using (var reader = new BinaryReader(stream, System.Text.Encoding.UTF8, leaveOpen: true))
+            {
+                _header = ReadHeader(reader);
+                if (!_header.Validate())
+                    throw new ApplicationException("无效的音频文件");
+                SkipMetaData(reader);
+
+                // _header.SubChunk2Size is the declared byte count of the data chunk.
+                // ReadBytes may return fewer bytes when the stream is truncated, so size the
+                // sample arrays from what was actually read. Dropping a dangling odd byte also
+                // keeps BlockCopy from overrunning the destination.
+                byte[] buffer = reader.ReadBytes(_header.SubChunk2Size);
+                int sampleCount = buffer.Length / 2;
+                short[] pcm = new short[sampleCount];
+                Buffer.BlockCopy(buffer, 0, pcm, 0, sampleCount * 2);
+
+                _samples = new float[sampleCount];
+                for (var i = 0; i < sampleCount; ++i)
+                {
+                    // Scale int16 to a float by dividing by 32768.
+                    _samples[i] = pcm[i] / 32768.0F;
+                }
+            }
+        }
        public WaveReader(string fileName)
        {
            if (!File.Exists(fileName))
@ -139,6 +162,24 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
        }

        private void SkipMetaData(BinaryReader reader)
+        {
+            // Walks forward over chunks until the "data" chunk header is found, then stores that
+            // chunk's id and size in _header. Works on forward-only streams: it only reads and
+            // never touches Position or Seek.
+            const int DataChunkId = 0x61746164; // ASCII "data" read as a little-endian Int32
+
+            int subChunk2ID = _header.SubChunk2ID;
+            int subChunk2Size = _header.SubChunk2Size;
+            while (subChunk2ID != DataChunkId)
+            {
+                // ReadBytes keeps reading until the requested count is reached or the stream ends.
+                // A single Read call can return a short count on network streams, which would
+                // leave the reader positioned in the middle of the chunk.
+                byte[] skipped = reader.ReadBytes(subChunk2Size);
+                if (skipped.Length < subChunk2Size)
+                    throw new EndOfStreamException("音频文件在找到 data 块之前已结束");
+
+                // Header of the next chunk: 4-byte id followed by 4-byte size.
+                subChunk2ID = reader.ReadInt32();
+                subChunk2Size = reader.ReadInt32();
+            }
+            _header.SubChunk2ID = subChunk2ID;
+            _header.SubChunk2Size = subChunk2Size;
+        }
+        private void SkipMetaData_old(BinaryReader reader)
        {
            var bs = reader.BaseStream;