优化 语音识别vad 进度重置
This commit is contained in:
parent
d477051c06
commit
b9563452c3
|
|
@ -13,6 +13,10 @@ using System.Threading.Tasks;
|
|||
using FFmpeg.NET.Services;
|
||||
using MapsterMapper;
|
||||
using Mapster;
|
||||
using VideoAnalysisCore.AICore.SherpaOnnx;
|
||||
using System.Net;
|
||||
using System.Security.Policy;
|
||||
using System.IO;
|
||||
|
||||
namespace Learn.VideoAnalysis.Controllers
|
||||
{
|
||||
|
|
@ -42,13 +46,49 @@ namespace Learn.VideoAnalysis.Controllers
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// 获取视频信息<para>taskId/tagId二选一</para>
|
||||
/// 语音识别
|
||||
/// </summary>
|
||||
/// <param name="taskId"></param>
|
||||
/// <param name="tagId">自定义id</param>
|
||||
/// <param name="needSubtitle">是否附加字幕</param>
|
||||
/// <param name="url">文件流</param>
|
||||
/// <returns></returns>
|
||||
[HttpGet(Name = "TaskInfo")]
|
||||
[HttpGet(Name = "AudioRecognitionUrl")]
|
||||
public async Task<IActionResult> AudioRecognitionUrl(string url)
{
    // Only absolute http/https addresses are accepted; anything else is rejected up front
    // instead of surfacing as an opaque HttpClient exception message.
    // NOTE(review): the URL is caller-controlled, so this endpoint can be used to make the
    // server fetch arbitrary hosts (SSRF) — consider an allow-list if it is exposed publicly.
    if (!Uri.TryCreate(url, UriKind.Absolute, out var audioUri)
        || (audioUri.Scheme != Uri.UriSchemeHttp && audioUri.Scheme != Uri.UriSchemeHttps))
    {
        return BadRequest("url must be an absolute http/https address");
    }

    try
    {
        // Download the remote audio as a stream and hand it straight to the recognizer.
        using var networkStream = await AudioDownloadClient.GetStreamAsync(audioUri);
        var res = await SenseVoice.RunTask(networkStream);
        return Ok(res);
    }
    catch (Exception ex)
    {
        // Download and decoding failures are reported to the caller as 400 with the message.
        return BadRequest(ex.Message);
    }
}

// One HttpClient for the whole process: creating a client per request leaves sockets in
// TIME_WAIT and can exhaust ports under load. The pooled-connection lifetime makes the
// long-lived client re-resolve DNS periodically.
private static readonly HttpClient AudioDownloadClient = new HttpClient(new SocketsHttpHandler
{
    PooledConnectionLifetime = TimeSpan.FromMinutes(5),
});
|
||||
/// <summary>
/// Runs speech recognition on an uploaded audio file.
/// </summary>
/// <param name="file">Uploaded audio file; its content stream is passed to <c>SenseVoice.RunTask</c>.</param>
/// <returns>
/// 200 with the recognition result, or 400 when no file was uploaded or recognition failed.
/// </returns>
[HttpPost(Name = "AudioRecognition")]
public async Task<IActionResult> AudioRecognition(IFormFile file)
{
    // A missing or empty upload would otherwise fail with a NullReferenceException / 500.
    if (file is null || file.Length == 0)
    {
        return BadRequest("音频文件 is null");
    }

    try
    {
        using var audioStream = file.OpenReadStream();
        var res = await SenseVoice.RunTask(audioStream);
        return Ok(res);
    }
    catch (Exception ex)
    {
        // Same error contract as the URL-based endpoint: report the failure as 400.
        return BadRequest(ex.Message);
    }
}
|
||||
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// 获取视频信息<para>taskId/tagId二选一</para>
|
||||
/// </summary>
|
||||
/// <param name="taskId"></param>
|
||||
/// <param name="tagId">自定义id</param>
|
||||
/// <param name="needSubtitle">是否附加字幕</param>
|
||||
/// <returns></returns>
|
||||
[HttpGet(Name = "TaskInfo")]
|
||||
public async Task<IActionResult> TaskInfo(long taskId,string? tagId,bool needSubtitle=false)
|
||||
{
|
||||
var task = await videoTaskDB.AsQueryable()
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
using SherpaOnnx;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
|
|
@ -13,7 +14,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
|||
public class SenseVoice
|
||||
{
|
||||
static OfflineRecognizer OR =default!;
|
||||
static VoiceActivityDetector VAD =default!;
|
||||
static VoiceActivityDetector VAD = default!;
|
||||
static VadModelConfig VADModelConfig = default!;
|
||||
/// <summary>
|
||||
/// 初始化 SenseVoice
|
||||
|
|
@ -61,7 +62,9 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
|||
//反转文本规范化规则 fst 的路径
|
||||
config.RuleFsts = string.Empty;
|
||||
|
||||
#if DEBUG
|
||||
config.ModelConfig.Debug = 1;
|
||||
#endif
|
||||
|
||||
OR = new OfflineRecognizer(config);
|
||||
|
||||
|
|
@ -71,6 +74,85 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
|||
//缓冲区大小
|
||||
VAD = new VoiceActivityDetector(VADModelConfig, 60);
|
||||
}
|
||||
|
||||
/// <summary>
/// Produces timed subtitle entries for a WAV audio stream: the samples are fed through the
/// shared voice-activity detector window by window, and every detected speech segment is
/// decoded with the offline recognizer.
/// </summary>
/// <param name="s">Stream containing WAV audio; it is read via <c>WaveReader</c>.</param>
/// <returns>One entry per segment with non-empty text; start/end are seconds rounded to 2 decimals.</returns>
/// <exception cref="Exception">Thrown when <paramref name="s"/> is null.</exception>
public static async Task<List<SenseVoiceRes>> RunTask(Stream s)
{
    if (OR is null)
    {
        Init();
    }

    if (s is null)
    {
        throw new Exception("音频路径 is null");
    }

    WaveReader reader = new WaveReader(s);
    int windowSize = VADModelConfig.SileroVad.WindowSize;
    int sampleRate = VADModelConfig.SampleRate;
    // Samples beyond the last full window are not fed to the detector.
    int numIter = reader.Samples.Length / windowSize;
    var res = new List<SenseVoiceRes>(500);

    // NOTE(review): VAD is a static detector shared by every caller; nothing visible here
    // serializes concurrent requests — confirm calls cannot overlap, or use a per-call detector.
    try
    {
        for (int i = 0; i != numIter; ++i)
        {
            float[] window = new float[windowSize];
            Array.Copy(reader.Samples, i * windowSize, window, 0, windowSize);
            VAD.AcceptWaveform(window);

            // Drain finished segments only while speech is being detected.
            if (VAD.IsSpeechDetected())
            {
                DecodePendingSegments(sampleRate, res);
            }
        }

        // Flush the detector, then decode whatever is still queued.
        VAD.Flush();
        DecodePendingSegments(sampleRate, res);
    }
    finally
    {
        // Reset even when decoding throws, so a failed request does not leave the shared
        // detector with state from this audio for the next call.
        VAD.Reset();
    }

    return res;
}

/// <summary>
/// Decodes every segment currently queued in the detector and appends the non-empty
/// recognition results to <paramref name="results"/>.
/// </summary>
private static void DecodePendingSegments(int sampleRate, List<SenseVoiceRes> results)
{
    while (!VAD.IsEmpty())
    {
        SpeechSegment segment = VAD.Front();
        float startTime = segment.Start / (float)sampleRate;
        float duration = segment.Samples.Length / (float)sampleRate;

        // The stream is disposed per segment instead of being left to finalization.
        using OfflineStream stream = OR.CreateStream();
        stream.AcceptWaveform(sampleRate, segment.Samples);
        OR.Decode(stream);

        // Read the result once and reuse it for both the check and the entry.
        string text = stream.Result.Text;
        if (!string.IsNullOrEmpty(text))
        {
            results.Add(new()
            {
                Text = text,
                Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
                End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
            });
        }

        VAD.Pop();
    }
}
|
||||
/// <summary>
|
||||
/// 获取语音字幕
|
||||
/// </summary>
|
||||
|
|
@ -91,6 +173,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
|||
int numIter = numSamples / windowSize;
|
||||
var totalSecond = numSamples / (float)sampleRate;
|
||||
var res = new List<SenseVoiceRes>(500);
|
||||
//var VAD = new VoiceActivityDetector(VADModelConfig, 60);
|
||||
for (int i = 0; i != numIter; ++i)
|
||||
{
|
||||
int start = i * windowSize;
|
||||
|
|
|
|||
|
|
@ -35,6 +35,10 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
|||
//需要使用GPU
|
||||
if (!useGPU)
|
||||
config.Embedding.Provider = "cuda";
|
||||
#if DEBUG
|
||||
config.Embedding.Debug = 1;
|
||||
#endif
|
||||
|
||||
//说话人判定阈值
|
||||
config.Clustering.Threshold = threshold;
|
||||
SD = new OfflineSpeakerDiarization(config);
|
||||
|
|
|
|||
|
|
@ -90,6 +90,29 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
|||
// The sample rate can be any value.
|
||||
public class WaveReader
|
||||
{
|
||||
/// <summary>
/// Reads a WAV file from a stream: parses and validates the header, skips metadata chunks,
/// then converts the 16-bit PCM payload to floats by dividing each sample by 32768.
/// </summary>
/// <param name="stream">Stream positioned at the start of the WAV data; it is disposed when reading finishes.</param>
/// <exception cref="ApplicationException">Thrown when the header is invalid or declares a negative data size.</exception>
public WaveReader(Stream stream)
{
    using (var reader = new BinaryReader(stream))
    {
        _header = ReadHeader(reader);
        if (!_header.Validate())
        {
            throw new ApplicationException($"无效的音频文件");
        }

        SkipMetaData(reader);

        // A negative declared size cannot be read; report it as an invalid file instead of
        // letting ReadBytes fail with an unrelated argument exception.
        if (_header.SubChunk2Size < 0)
        {
            throw new ApplicationException($"无效的音频文件");
        }

        // _header.SubChunk2Size is the declared payload size in bytes; each sample is assumed
        // to be int16. ReadBytes returns fewer bytes when the stream ends early, so the sample
        // count comes from what was actually read, not from the header.
        byte[] buffer = reader.ReadBytes(_header.SubChunk2Size);
        int sampleCount = buffer.Length / 2;

        // Copy whole samples only: a dangling odd byte used to make BlockCopy throw.
        short[] pcm = new short[sampleCount];
        Buffer.BlockCopy(buffer, 0, pcm, 0, sampleCount * 2);

        _samples = new float[sampleCount];
        for (var i = 0; i < sampleCount; ++i)
        {
            _samples[i] = pcm[i] / 32768.0F;
        }
    }
}
|
||||
public WaveReader(string fileName)
|
||||
{
|
||||
if (!File.Exists(fileName))
|
||||
|
|
@ -139,6 +162,24 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
|||
}
|
||||
|
||||
/// <summary>
/// Advances the reader past chunks that precede the audio payload until the chunk whose id
/// is 0x61746164 ("data") is reached, then stores that chunk's id and size in the header.
/// Works on forward-only streams because it never seeks.
/// </summary>
/// <param name="reader">Reader positioned right after the header that was just parsed.</param>
private void SkipMetaData(BinaryReader reader)
{
    const int DataChunkId = 0x61746164; // the bytes "data" read as a little-endian int32

    int chunkId = _header.SubChunk2ID;
    int chunkSize = _header.SubChunk2Size;

    // Fixed scratch buffer: the skipped bytes are discarded, so there is no reason to
    // allocate a buffer as large as the chunk size declared in the file.
    byte[] scratch = new byte[4096];

    while (chunkId != DataChunkId)
    {
        // Read may return fewer bytes than requested before the end of the stream
        // (e.g. a network stream), so keep reading until the whole chunk is consumed.
        int remaining = chunkSize;
        while (remaining > 0)
        {
            int got = reader.Read(scratch, 0, Math.Min(scratch.Length, remaining));
            if (got == 0)
            {
                break; // end of stream
            }

            remaining -= got;
        }

        // The stream ended inside this chunk (or the size is negative): stop before trying
        // to read another chunk header. The check uses the size of the chunk just skipped.
        if (remaining != 0)
        {
            break;
        }

        // NOTE(review): RIFF pads odd-sized chunks with one extra byte; that pad byte is not
        // skipped here, same as before — confirm whether such files need to be supported.
        chunkId = reader.ReadInt32();
        chunkSize = reader.ReadInt32();
    }

    _header.SubChunk2ID = chunkId;
    _header.SubChunk2Size = chunkSize;
}
|
||||
private void SkipMetaData_old(BinaryReader reader)
|
||||
{
|
||||
var bs = reader.BaseStream;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue