优化 语音识别vad 进度重置
This commit is contained in:
parent
d477051c06
commit
b9563452c3
|
|
@ -13,6 +13,10 @@ using System.Threading.Tasks;
|
||||||
using FFmpeg.NET.Services;
|
using FFmpeg.NET.Services;
|
||||||
using MapsterMapper;
|
using MapsterMapper;
|
||||||
using Mapster;
|
using Mapster;
|
||||||
|
using VideoAnalysisCore.AICore.SherpaOnnx;
|
||||||
|
using System.Net;
|
||||||
|
using System.Security.Policy;
|
||||||
|
using System.IO;
|
||||||
|
|
||||||
namespace Learn.VideoAnalysis.Controllers
|
namespace Learn.VideoAnalysis.Controllers
|
||||||
{
|
{
|
||||||
|
|
@ -42,13 +46,49 @@ namespace Learn.VideoAnalysis.Controllers
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 获取视频信息<para>taskId/tagId二选一</para>
|
/// 语音识别
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="taskId"></param>
|
/// <param name="url">文件流</param>
|
||||||
/// <param name="tagId">自定义id</param>
|
|
||||||
/// <param name="needSubtitle">是否附加字幕</param>
|
|
||||||
/// <returns></returns>
|
/// <returns></returns>
|
||||||
[HttpGet(Name = "TaskInfo")]
|
[HttpGet(Name = "AudioRecognitionUrl")]
|
||||||
|
public async Task<IActionResult> AudioRecognitionUrl(string url)
{
    // Purpose: download the audio at `url` and return the SenseVoice recognition result.
    // Responses: 200 with the result list, 400 with a message on any failure.

    // Reject anything that is not an absolute http(s) address before touching the network.
    // (Previously a malformed value only failed inside GetStreamAsync and was reported via the catch block.)
    if (!Uri.TryCreate(url, UriKind.Absolute, out var audioUri)
        || (audioUri.Scheme != Uri.UriSchemeHttp && audioUri.Scheme != Uri.UriSchemeHttps))
    {
        return BadRequest("url must be an absolute http/https address");
    }

    // NOTE(review): the URL is caller-supplied, so this endpoint can be used to make the server
    // fetch arbitrary (including internal) addresses. Consider a host allow-list — confirm with owners.
    try
    {
        // Fetch the remote file as a stream and hand it straight to the recognizer.
        using var networkStream = await s_audioDownloadClient.GetStreamAsync(audioUri);
        var res = await SenseVoice.RunTask(networkStream);
        return Ok(res);
    }
    catch (Exception ex)
    {
        return BadRequest(ex.Message);
    }
}

// One shared client for all requests: creating and disposing an HttpClient per call
// leaves sockets in TIME_WAIT and can exhaust ports under load.
private static readonly HttpClient s_audioDownloadClient = new HttpClient();
|
||||||
|
/// <summary>
/// Speech recognition for an uploaded audio file.
/// </summary>
/// <param name="file">Uploaded audio file; its content stream is passed to SenseVoice.RunTask.</param>
/// <returns>
/// 200 with the recognition result, or 400 with a message when the upload is
/// missing/empty or recognition throws.
/// </returns>
[HttpPost(Name = "AudioRecognition")]
public async Task<IActionResult> AudioRecognition(IFormFile file)
{
    // Guard against a request without a file part; previously this dereferenced null.
    if (file is null || file.Length == 0)
    {
        return BadRequest("file is required");
    }

    try
    {
        using var upload = file.OpenReadStream();
        var res = await SenseVoice.RunTask(upload);
        return Ok(res);
    }
    catch (Exception ex)
    {
        // Same failure contract as the URL-based endpoint: report the message as a 400.
        return BadRequest(ex.Message);
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// 获取视频信息<para>taskId/tagId二选一</para>
|
||||||
|
/// </summary>
|
||||||
|
/// <param name="taskId"></param>
|
||||||
|
/// <param name="tagId">自定义id</param>
|
||||||
|
/// <param name="needSubtitle">是否附加字幕</param>
|
||||||
|
/// <returns></returns>
|
||||||
|
[HttpGet(Name = "TaskInfo")]
|
||||||
public async Task<IActionResult> TaskInfo(long taskId,string? tagId,bool needSubtitle=false)
|
public async Task<IActionResult> TaskInfo(long taskId,string? tagId,bool needSubtitle=false)
|
||||||
{
|
{
|
||||||
var task = await videoTaskDB.AsQueryable()
|
var task = await videoTaskDB.AsQueryable()
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@
|
||||||
using SherpaOnnx;
|
using SherpaOnnx;
|
||||||
using System;
|
using System;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
|
using System.Diagnostics;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
|
|
@ -13,7 +14,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
public class SenseVoice
|
public class SenseVoice
|
||||||
{
|
{
|
||||||
static OfflineRecognizer OR =default!;
|
static OfflineRecognizer OR =default!;
|
||||||
static VoiceActivityDetector VAD =default!;
|
static VoiceActivityDetector VAD = default!;
|
||||||
static VadModelConfig VADModelConfig = default!;
|
static VadModelConfig VADModelConfig = default!;
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 初始化 SenseVoice
|
/// 初始化 SenseVoice
|
||||||
|
|
@ -61,7 +62,9 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
//反转文本规范化规则 fst 的路径
|
//反转文本规范化规则 fst 的路径
|
||||||
config.RuleFsts = string.Empty;
|
config.RuleFsts = string.Empty;
|
||||||
|
|
||||||
|
#if DEBUG
|
||||||
config.ModelConfig.Debug = 1;
|
config.ModelConfig.Debug = 1;
|
||||||
|
#endif
|
||||||
|
|
||||||
OR = new OfflineRecognizer(config);
|
OR = new OfflineRecognizer(config);
|
||||||
|
|
||||||
|
|
@ -71,6 +74,85 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
//缓冲区大小
|
//缓冲区大小
|
||||||
VAD = new VoiceActivityDetector(VADModelConfig, 60);
|
VAD = new VoiceActivityDetector(VADModelConfig, 60);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Gets speech subtitles: runs voice-activity detection over the audio in the stream
/// and transcribes each detected speech segment.
/// </summary>
/// <param name="s">Stream containing the audio; it is parsed by WaveReader.</param>
/// <returns>
/// One entry per segment whose transcription is non-empty, with start/end times in
/// seconds rounded to two decimals.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
public static async Task<List<SenseVoiceRes>> RunTask(Stream s)
{
    // Validate first so a bad call does not trigger model initialization.
    if (s is null)
        throw new ArgumentNullException(nameof(s), "音频路径 is null");
    if (OR is null)
        Init();

    WaveReader reader = new WaveReader(s);
    int windowSize = VADModelConfig.SileroVad.WindowSize;
    int sampleRate = VADModelConfig.SampleRate;
    // Only whole windows are fed to the detector; a trailing partial window is ignored.
    int numIter = reader.Samples.Length / windowSize;
    var res = new List<SenseVoiceRes>(500);

    // NOTE(review): VAD and OR are shared static instances, so concurrent calls would
    // interleave their audio in the same detector — confirm callers are serialized.
    try
    {
        for (int i = 0; i != numIter; ++i)
        {
            float[] window = new float[windowSize];
            Array.Copy(reader.Samples, i * windowSize, window, 0, windowSize);
            VAD.AcceptWaveform(window);

            // Completed segments are drained only while speech is being detected.
            if (VAD.IsSpeechDetected())
                DrainVadSegments(sampleRate, res);
        }

        // Flush the detector, then collect whatever segments it still holds.
        VAD.Flush();
        DrainVadSegments(sampleRate, res);
        return res;
    }
    finally
    {
        // Always reset the shared detector, including when decoding throws, so the
        // next call does not start from this call's leftover state.
        VAD.Reset();
    }
}

/// <summary>
/// Pops every queued VAD segment, decodes it, and appends non-empty transcriptions to <paramref name="results"/>.
/// </summary>
private static void DrainVadSegments(int sampleRate, List<SenseVoiceRes> results)
{
    while (!VAD.IsEmpty())
    {
        SpeechSegment segment = VAD.Front();
        float startTime = segment.Start / (float)sampleRate;
        float duration = segment.Samples.Length / (float)sampleRate;

        // Dispose the per-segment stream promptly instead of leaving it to the finalizer.
        using (OfflineStream stream = OR.CreateStream())
        {
            stream.AcceptWaveform(sampleRate, segment.Samples);
            OR.Decode(stream);

            // Read the result once; it was previously fetched twice per segment.
            string text = stream.Result.Text;
            if (!string.IsNullOrEmpty(text))
            {
                results.Add(new SenseVoiceRes
                {
                    Text = text,
                    Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
                    End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
                });
            }
        }

        VAD.Pop();
    }
}
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 获取语音字幕
|
/// 获取语音字幕
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|
@ -91,6 +173,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
int numIter = numSamples / windowSize;
|
int numIter = numSamples / windowSize;
|
||||||
var totalSecond = numSamples / (float)sampleRate;
|
var totalSecond = numSamples / (float)sampleRate;
|
||||||
var res = new List<SenseVoiceRes>(500);
|
var res = new List<SenseVoiceRes>(500);
|
||||||
|
//var VAD = new VoiceActivityDetector(VADModelConfig, 60);
|
||||||
for (int i = 0; i != numIter; ++i)
|
for (int i = 0; i != numIter; ++i)
|
||||||
{
|
{
|
||||||
int start = i * windowSize;
|
int start = i * windowSize;
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,10 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
//需要使用GPU
|
//需要使用GPU
|
||||||
if (!useGPU)
|
if (!useGPU)
|
||||||
config.Embedding.Provider = "cuda";
|
config.Embedding.Provider = "cuda";
|
||||||
|
#if DEBUG
|
||||||
|
config.Embedding.Debug = 1;
|
||||||
|
#endif
|
||||||
|
|
||||||
//说话人判定阈值
|
//说话人判定阈值
|
||||||
config.Clustering.Threshold = threshold;
|
config.Clustering.Threshold = threshold;
|
||||||
SD = new OfflineSpeakerDiarization(config);
|
SD = new OfflineSpeakerDiarization(config);
|
||||||
|
|
|
||||||
|
|
@ -90,6 +90,29 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
// The sample rate can be any value.
|
// The sample rate can be any value.
|
||||||
public class WaveReader
|
public class WaveReader
|
||||||
{
|
{
|
||||||
|
/// <summary>
/// Reads a wave file from a stream: parses and validates the header, skips
/// non-data chunks, then converts the sample payload to floats.
/// The stream is disposed when reading finishes (the BinaryReader owns it).
/// </summary>
/// <param name="stream">Stream positioned at the start of the wave header.</param>
/// <exception cref="ApplicationException">The header is invalid or declares a negative data size.</exception>
public WaveReader(Stream stream)
{
    using (var reader = new BinaryReader(stream))
    {
        _header = ReadHeader(reader);
        if (!_header.Validate())
            throw new ApplicationException($"无效的音频文件");
        SkipMetaData(reader);

        // _header.SubChunk2Size is the declared payload size in bytes.
        int declaredBytes = _header.SubChunk2Size;
        if (declaredBytes < 0)
            throw new ApplicationException($"无效的音频文件");

        // ReadBytes returns fewer bytes when the stream ends early, so size the
        // output from what was actually read rather than from the header.
        byte[] payload = reader.ReadBytes(declaredBytes);

        // Samples are assumed to be 16-bit; a dangling odd byte is ignored
        // instead of overflowing the destination array.
        int sampleCount = payload.Length / 2;
        _samples = new float[sampleCount];

        for (var i = 0; i < sampleCount; ++i)
        {
            // Decode little-endian int16 explicitly so the result does not depend on host byte order.
            short pcm = (short)(payload[2 * i] | (payload[2 * i + 1] << 8));
            _samples[i] = pcm / 32768.0F;
        }
    }
}
|
||||||
public WaveReader(string fileName)
|
public WaveReader(string fileName)
|
||||||
{
|
{
|
||||||
if (!File.Exists(fileName))
|
if (!File.Exists(fileName))
|
||||||
|
|
@ -139,6 +162,24 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
}
|
}
|
||||||
|
|
||||||
private void SkipMetaData(BinaryReader reader)
|
/// <summary>
/// Advances the reader past any chunks that precede the "data" chunk and stores
/// the data chunk's id and size in the header. Works on non-seekable streams
/// because it reads forward instead of using Position/Seek.
/// </summary>
/// <param name="reader">Reader positioned just after the header that ReadHeader consumed.</param>
/// <exception cref="ApplicationException">A chunk declares a negative size.</exception>
/// <exception cref="EndOfStreamException">The stream ends before a "data" chunk is found.</exception>
private void SkipMetaData(BinaryReader reader)
{
    const int DataChunkId = 0x61746164; // the bytes 'd','a','t','a' read as a little-endian int32

    int chunkId = _header.SubChunk2ID;
    int chunkSize = _header.SubChunk2Size;
    byte[] scratch = new byte[4096];

    while (chunkId != DataChunkId)
    {
        if (chunkSize < 0)
            throw new ApplicationException("无效的音频文件");

        // Read may return fewer bytes than requested (typical for network streams),
        // so keep reading until the whole chunk body has been consumed.
        int remaining = chunkSize;
        while (remaining > 0)
        {
            int read = reader.Read(scratch, 0, Math.Min(remaining, scratch.Length));
            if (read == 0)
                throw new EndOfStreamException();
            remaining -= read;
        }

        // NOTE(review): RIFF pads odd-sized chunks with one byte; that pad is not
        // skipped here (same as before) — confirm whether inputs can contain odd-sized chunks.
        chunkId = reader.ReadInt32();
        chunkSize = reader.ReadInt32();
    }

    _header.SubChunk2ID = chunkId;
    _header.SubChunk2Size = chunkSize;
}
|
||||||
|
private void SkipMetaData_old(BinaryReader reader)
|
||||||
{
|
{
|
||||||
var bs = reader.BaseStream;
|
var bs = reader.BaseStream;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue