优化 语音识别vad 进度重置

This commit is contained in:
小肥羊 2024-11-20 14:58:00 +08:00
parent d477051c06
commit b9563452c3
4 changed files with 174 additions and 6 deletions

View File

@ -13,6 +13,10 @@ using System.Threading.Tasks;
using FFmpeg.NET.Services;
using MapsterMapper;
using Mapster;
using VideoAnalysisCore.AICore.SherpaOnnx;
using System.Net;
using System.Security.Policy;
using System.IO;
namespace Learn.VideoAnalysis.Controllers
{
@ -42,13 +46,49 @@ namespace Learn.VideoAnalysis.Controllers
}
/// <summary>
/// 获取视频信息<para>taskId/tagId二选一</para>
/// 语音识别
/// </summary>
/// <param name="taskId"></param>
/// <param name="tagId">自定义id</param>
/// <param name="needSubtitle">是否附加字幕</param>
/// <param name="url">URL of the remote audio file to download and run speech recognition on</param>
/// <returns></returns>
[HttpGet(Name = "TaskInfo")]
[HttpGet(Name = "AudioRecognitionUrl")]
public async Task<IActionResult> AudioRecognitionUrl(string url)
{
    // Downloads the audio located at `url` and returns the result of SenseVoice.RunTask
    // for the downloaded stream.
    // Responds with 400 and a message when `url` is not an absolute http/https address,
    // or when the download or the recognition throws.
    // NOTE(review): the server fetches a caller-supplied address, which is an SSRF surface;
    // consider restricting the allowed hosts if this endpoint is reachable by untrusted clients.
    if (!Uri.TryCreate(url, UriKind.Absolute, out var audioUri)
        || (audioUri.Scheme != Uri.UriSchemeHttp && audioUri.Scheme != Uri.UriSchemeHttps))
    {
        return BadRequest("url must be an absolute http or https address");
    }

    try
    {
        // The response stream is consumed directly; it is disposed when this scope ends.
        using var networkStream = await s_audioHttpClient.GetStreamAsync(audioUri);
        var res = await SenseVoice.RunTask(networkStream);
        return Ok(res);
    }
    catch (Exception ex)
    {
        return BadRequest(ex.Message);
    }
}

// One HttpClient shared by every request: creating and disposing a client per call
// leaves sockets in TIME_WAIT and can exhaust them under load.
private static readonly HttpClient s_audioHttpClient = new HttpClient();
/// <summary>
/// Runs speech recognition on an uploaded audio file.
/// </summary>
/// <param name="file">Uploaded audio file; its content stream is passed to SenseVoice.RunTask.</param>
/// <returns>
/// 200 with the recognition result, or 400 with a message when no file content was
/// uploaded or when reading/recognition throws.
/// </returns>
[HttpPost(Name = "AudioRecognition")]
public async Task<IActionResult> AudioRecognition(IFormFile file)
{
    // A missing form field binds to null; reject it (and empty uploads) up front
    // instead of failing later with a NullReferenceException.
    if (file is null || file.Length == 0)
    {
        return BadRequest("file is required and must not be empty");
    }

    try
    {
        using var uploadStream = file.OpenReadStream();
        var res = await SenseVoice.RunTask(uploadStream);
        return Ok(res);
    }
    catch (Exception ex)
    {
        // Same error contract as the URL-based recognition endpoint.
        return BadRequest(ex.Message);
    }
}
/// <summary>
/// 获取视频信息<para>taskId/tagId二选一</para>
/// </summary>
/// <param name="taskId"></param>
/// <param name="tagId">自定义id</param>
/// <param name="needSubtitle">是否附加字幕</param>
/// <returns></returns>
[HttpGet(Name = "TaskInfo")]
public async Task<IActionResult> TaskInfo(long taskId,string? tagId,bool needSubtitle=false)
{
var task = await videoTaskDB.AsQueryable()

View File

@ -2,6 +2,7 @@
using SherpaOnnx;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
@ -13,7 +14,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
public class SenseVoice
{
static OfflineRecognizer OR =default!;
static VoiceActivityDetector VAD =default!;
static VoiceActivityDetector VAD = default!;
static VadModelConfig VADModelConfig = default!;
/// <summary>
/// 初始化 SenseVoice
@ -61,7 +62,9 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
//反转文本规范化规则 fst 的路径
config.RuleFsts = string.Empty;
#if DEBUG
config.ModelConfig.Debug = 1;
#endif
OR = new OfflineRecognizer(config);
@ -71,6 +74,85 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
//缓冲区大小
VAD = new VoiceActivityDetector(VADModelConfig, 60);
}
/// <summary>
/// Recognizes speech in a WAV stream and returns the recognized text segments with timings.
/// </summary>
/// <remarks>
/// The samples are fed to the shared VAD in blocks of <c>VADModelConfig.SileroVad.WindowSize</c>;
/// every segment the VAD produces is decoded with the shared offline recognizer. Trailing samples
/// that do not fill a whole window are not fed to the VAD. The VAD is reset when the method
/// exits, including when it exits with an exception.
/// NOTE(review): OR and VAD are static and shared by all callers, so overlapping calls would
/// mix their state — confirm that callers serialize access.
/// NOTE(review): the stream's own sample rate is not compared with VADModelConfig.SampleRate;
/// this presumably expects audio already at that rate — confirm.
/// </remarks>
/// <param name="s">Stream containing the WAV data; it is read through <c>WaveReader</c>.</param>
/// <returns>
/// Non-empty recognition results; Start and End are in seconds, rounded to two decimals.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
public static async Task<List<SenseVoiceRes>> RunTask(Stream s)
{
    // Validate before triggering the lazy initialization.
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s), "音频路径 is null");
    }
    if (OR is null)
    {
        Init();
    }

    WaveReader reader = new WaveReader(s);
    float[] allSamples = reader.Samples;
    int windowSize = VADModelConfig.SileroVad.WindowSize;
    int sampleRate = VADModelConfig.SampleRate;
    int numIter = allSamples.Length / windowSize;
    var res = new List<SenseVoiceRes>(500);

    try
    {
        for (int i = 0; i != numIter; ++i)
        {
            float[] window = new float[windowSize];
            Array.Copy(allSamples, i * windowSize, window, 0, windowSize);
            VAD.AcceptWaveform(window);

            // Only drain while the detector reports speech, as the original loop did.
            if (VAD.IsSpeechDetected())
            {
                DecodePendingSegments(sampleRate, res);
            }
        }

        // Flush so the detector emits whatever it is still buffering, then drain it.
        VAD.Flush();
        DecodePendingSegments(sampleRate, res);
    }
    finally
    {
        // Always clear the shared detector so a failed run cannot leak state into the next one.
        VAD.Reset();
    }

    return res;
}

/// <summary>
/// Decodes every segment currently queued in the shared VAD and appends the non-empty results.
/// </summary>
/// <param name="sampleRate">Sample rate used to convert sample offsets to seconds.</param>
/// <param name="results">List that receives the recognized segments.</param>
private static void DecodePendingSegments(int sampleRate, List<SenseVoiceRes> results)
{
    while (!VAD.IsEmpty())
    {
        SpeechSegment segment = VAD.Front();
        float[] segmentSamples = segment.Samples;
        float startTime = segment.Start / (float)sampleRate;
        float duration = segmentSamples.Length / (float)sampleRate;

        // Dispose the recognizer stream once its text has been read.
        using (OfflineStream stream = OR.CreateStream())
        {
            stream.AcceptWaveform(sampleRate, segmentSamples);
            OR.Decode(stream);

            // Read the result once instead of once per use.
            string text = stream.Result.Text;
            if (!string.IsNullOrEmpty(text))
            {
                results.Add(new()
                {
                    Text = text,
                    Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
                    End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
                });
            }
        }

        VAD.Pop();
    }
}
/// <summary>
/// 获取语音字幕
/// </summary>
@ -91,6 +173,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
int numIter = numSamples / windowSize;
var totalSecond = numSamples / (float)sampleRate;
var res = new List<SenseVoiceRes>(500);
//var VAD = new VoiceActivityDetector(VADModelConfig, 60);
for (int i = 0; i != numIter; ++i)
{
int start = i * windowSize;

View File

@ -35,6 +35,10 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
//需要使用GPU
if (!useGPU)
config.Embedding.Provider = "cuda";
#if DEBUG
config.Embedding.Debug = 1;
#endif
//说话人判定阈值
config.Clustering.Threshold = threshold;
SD = new OfflineSpeakerDiarization(config);

View File

@ -90,6 +90,29 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
// The sample rate can be any value.
public class WaveReader
{
/// <summary>
/// Reads WAV data from a stream: parses and validates the header, skips chunks that precede
/// the data chunk, and converts the samples to floats by dividing each 16-bit value by 32768.
/// </summary>
/// <remarks>
/// The stream is wrapped in a <see cref="BinaryReader"/> that is disposed before the
/// constructor returns, which also closes <paramref name="stream"/>.
/// Only forward reads are issued here. Every sample is assumed to be a 16-bit integer.
/// If the stream ends before the declared data size, only the samples actually present are kept.
/// </remarks>
/// <param name="stream">Readable stream positioned at the start of the WAV data.</param>
/// <exception cref="ApplicationException">
/// The header fails validation, or the data chunk declares a negative size.
/// </exception>
public WaveReader(Stream stream)
{
    using (var reader = new BinaryReader(stream))
    {
        _header = ReadHeader(reader);
        if (!_header.Validate())
            throw new ApplicationException($"无效的音频文件");

        SkipMetaData(reader);

        // _header.SubChunk2Size now holds the byte count declared by the chunk we stopped on.
        int declaredBytes = _header.SubChunk2Size;
        if (declaredBytes < 0)
            throw new ApplicationException($"Invalid WAV data chunk size: {declaredBytes}");

        // ReadBytes returns fewer bytes than requested when the stream ends early.
        byte[] rawBytes = reader.ReadBytes(declaredBytes);

        // Size by whole samples actually read: an odd byte count must not overflow the
        // short[] in BlockCopy, and a truncated file must not be padded with zero samples.
        int sampleCount = rawBytes.Length / sizeof(short);
        short[] pcm = new short[sampleCount];
        Buffer.BlockCopy(rawBytes, 0, pcm, 0, sampleCount * sizeof(short));

        _samples = new float[sampleCount];
        for (var i = 0; i < sampleCount; ++i)
        {
            _samples[i] = pcm[i] / 32768.0F;
        }
    }
}
public WaveReader(string fileName)
{
if (!File.Exists(fileName))
@ -139,6 +162,24 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
}
/// <summary>
/// Advances the reader past every chunk that precedes the "data" chunk and stores the
/// data chunk's id and size in <c>_header.SubChunk2ID</c> / <c>_header.SubChunk2Size</c>.
/// </summary>
/// <remarks>
/// Uses forward reads only and never touches the stream position.
/// NOTE(review): odd-sized chunks are not followed by a pad-byte skip here; RIFF normally
/// word-aligns chunks — confirm whether inputs with odd-sized metadata chunks occur.
/// </remarks>
/// <param name="reader">Reader positioned immediately after the header that was parsed.</param>
/// <exception cref="ApplicationException">A chunk declares a negative size.</exception>
/// <exception cref="EndOfStreamException">The stream ends before a data chunk is found.</exception>
private void SkipMetaData(BinaryReader reader)
{
    // The four bytes 'd','a','t','a' read as a little-endian Int32.
    const int DataChunkId = 0x61746164;

    int subChunk2ID = _header.SubChunk2ID;
    int subChunk2Size = _header.SubChunk2Size;

    // Small fixed scratch buffer: skipping must not allocate an array as large as the chunk.
    byte[] scratch = new byte[4096];

    while (subChunk2ID != DataChunkId)
    {
        if (subChunk2Size < 0)
            throw new ApplicationException($"Invalid WAV chunk size: {subChunk2Size}");

        // A single Read may return fewer bytes than requested, so loop until the whole
        // chunk body is consumed; otherwise the next chunk header would be read from
        // the middle of this chunk.
        int remaining = subChunk2Size;
        while (remaining > 0)
        {
            int read = reader.Read(scratch, 0, Math.Min(scratch.Length, remaining));
            if (read == 0)
                throw new EndOfStreamException("WAV stream ended before a data chunk was found");
            remaining -= read;
        }

        // Header of the next chunk; ReadInt32 throws EndOfStreamException at end of stream.
        subChunk2ID = reader.ReadInt32();
        subChunk2Size = reader.ReadInt32();
    }

    _header.SubChunk2ID = subChunk2ID;
    _header.SubChunk2Size = subChunk2Size;
}
private void SkipMetaData_old(BinaryReader reader)
{
var bs = reader.BaseStream;