diff --git a/VideoAnalysis/Controllers/ApiController.cs b/VideoAnalysis/Controllers/ApiController.cs
index 1be35f5..bfc9f8b 100644
--- a/VideoAnalysis/Controllers/ApiController.cs
+++ b/VideoAnalysis/Controllers/ApiController.cs
@@ -13,6 +13,10 @@ using System.Threading.Tasks;
using FFmpeg.NET.Services;
using MapsterMapper;
using Mapster;
+using VideoAnalysisCore.AICore.SherpaOnnx;
+using System.Net;
+using System.Security.Policy;
+using System.IO;
namespace Learn.VideoAnalysis.Controllers
{
@@ -42,13 +46,49 @@ namespace Learn.VideoAnalysis.Controllers
}
///
- /// 获取视频信息taskId/tagId二选一
+ /// 语音识别
///
- ///
- /// 自定义id
- /// 是否附加字幕
+ /// URL of the audio file to download and recognize
///
- [HttpGet(Name = "TaskInfo")]
+ [HttpGet(Name = "AudioRecognitionUrl")]
+ public async Task AudioRecognitionUrl(string url)
+ {
+     // Downloads the audio at `url`, runs SenseVoice.RunTask on the stream and returns the result via Ok(); any exception is answered with BadRequest(ex.Message).
+     // NOTE(review): `url` is fetched server-side exactly as the caller supplied it — confirm whether host/scheme restrictions are needed.
+     try
+     {
+         using var networkStream = await AudioDownloadClient.GetStreamAsync(url);
+         return Ok(await SenseVoice.RunTask(networkStream));
+     }
+     catch (Exception ex)
+     {
+         return BadRequest(ex.Message);
+     }
+ }
+ private static readonly HttpClient AudioDownloadClient = new HttpClient(); // shared by all requests; a new client per call would open and tear down its own connections each time
+ ///
+ /// 语音识别
+ ///
+ /// 文件流
+ ///
+ [HttpPost(Name = "AudioRecognition")]
+ public async Task AudioRecognition(IFormFile file)
+ {
+     if (file is null) return BadRequest("file is required"); // a missing upload would otherwise throw NullReferenceException on the next line
+     using var upload = file.OpenReadStream(); // disposed when the action returns
+     return Ok(await SenseVoice.RunTask(upload)); // run recognition on the uploaded stream and wrap the result in an OK response
+ }
+
+
+
+ ///
+ /// 获取视频信息taskId/tagId二选一
+ ///
+ ///
+ /// 自定义id
+ /// 是否附加字幕
+ ///
+ [HttpGet(Name = "TaskInfo")]
public async Task TaskInfo(long taskId,string? tagId,bool needSubtitle=false)
{
var task = await videoTaskDB.AsQueryable()
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
index a6842af..4ed95e7 100644
--- a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
@@ -2,6 +2,7 @@
using SherpaOnnx;
using System;
using System.Collections.Generic;
+using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
@@ -13,7 +14,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
public class SenseVoice
{
static OfflineRecognizer OR =default!;
- static VoiceActivityDetector VAD =default!;
+ static VoiceActivityDetector VAD = default!;
static VadModelConfig VADModelConfig = default!;
///
/// 鍒濆鍖 SenseVoice
@@ -61,7 +62,9 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
//鍙嶈浆鏂囨湰瑙勮寖鍖栬鍒 fst 鐨勮矾寰
config.RuleFsts = string.Empty;
+#if DEBUG
config.ModelConfig.Debug = 1;
+#endif
OR = new OfflineRecognizer(config);
@@ -71,6 +74,85 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
//缂撳啿鍖哄ぇ灏
VAD = new VoiceActivityDetector(VADModelConfig, 60);
}
+
+ ///
+ /// 获取语音字幕 (get speech subtitles from an audio stream)
+ ///
+ ///
+ ///
+ public static async Task> RunTask(Stream s)
+ {
+     // Transcribes the WAV audio read from `s`. The samples are fed to the voice-activity detector one
+     // window at a time, and each speech segment it queues is decoded by the offline recognizer.
+     // Returns one entry per segment whose decoded text is non-empty: Text plus Start/End in seconds,
+     // rounded to two decimals. Throws Exception when `s` is null; WaveReader errors propagate.
+     // Samples beyond the last whole window (numSamples % windowSize) are never fed to the detector.
+     // The body contains no await, so the work runs synchronously on the caller's thread.
+     // NOTE(review): VAD and OR are static fields shared by every caller and nothing in this method
+     // serializes access to them — confirm that concurrent calls cannot interleave their audio.
+     if (s is null)
+         throw new Exception("闊抽璺緞 is null");
+     if (OR is null)
+         Init();
+
+     WaveReader reader = new WaveReader(s);
+     int windowSize = VADModelConfig.SileroVad.WindowSize;
+     int sampleRate = VADModelConfig.SampleRate;
+     int numIter = reader.Samples.Length / windowSize;
+     var res = new List(500);
+
+     // Decodes every segment currently queued in the detector and appends the non-empty results to res.
+     void DecodeQueuedSegments()
+     {
+         while (!VAD.IsEmpty())
+         {
+             SpeechSegment segment = VAD.Front();
+             float startTime = segment.Start / (float)sampleRate;
+             float duration = segment.Samples.Length / (float)sampleRate;
+             // Release the per-segment stream as soon as its result has been read.
+             using (OfflineStream stream = OR.CreateStream())
+             {
+                 stream.AcceptWaveform(sampleRate, segment.Samples);
+                 OR.Decode(stream);
+                 var text = stream.Result.Text;
+                 if (!string.IsNullOrEmpty(text))
+                 {
+                     res.Add(new()
+                     {
+                         Text = text,
+                         Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
+                         End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
+                     });
+                 }
+             }
+             VAD.Pop();
+         }
+     }
+
+     try
+     {
+         for (int i = 0; i < numIter; ++i)
+         {
+             float[] samples = new float[windowSize];
+             Array.Copy(reader.Samples, i * windowSize, samples, 0, windowSize);
+             VAD.AcceptWaveform(samples);
+             // Queued segments are decoded only while the detector reports speech; the rest are picked up after Flush().
+             if (VAD.IsSpeechDetected())
+                 DecodeQueuedSegments();
+         }
+
+         // After Flush() the queue is drained once more to pick up any segments it produced.
+         VAD.Flush();
+         DecodeQueuedSegments();
+     }
+     finally
+     {
+         // Reset even when decoding throws, so a failed call does not leave its
+         // detector state behind for the next caller of the shared detector.
+         VAD.Reset();
+     }
+     return res;
+ }
///
/// 鑾峰彇璇煶瀛楀箷
///
@@ -91,6 +173,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
int numIter = numSamples / windowSize;
var totalSecond = numSamples / (float)sampleRate;
var res = new List(500);
+ //var VAD = new VoiceActivityDetector(VADModelConfig, 60);
for (int i = 0; i != numIter; ++i)
{
int start = i * windowSize;
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs b/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
index 1a8151c..fd55497 100644
--- a/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
@@ -35,6 +35,10 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
//闇瑕佷娇鐢℅PU
if (!useGPU)
config.Embedding.Provider = "cuda";
+#if DEBUG
+ config.Embedding.Debug = 1;
+#endif
+
//璇磋瘽浜哄垽瀹氶槇鍊
config.Clustering.Threshold = threshold;
SD = new OfflineSpeakerDiarization(config);
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/WaveHeader.cs b/VideoAnalysisCore/AICore/SherpaOnnx/WaveHeader.cs
index 24084c0..d3688c7 100644
--- a/VideoAnalysisCore/AICore/SherpaOnnx/WaveHeader.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/WaveHeader.cs
@@ -90,6 +90,29 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
// The sample rate can be any value.
public class WaveReader
{
+ public WaveReader(Stream stream)
+ {
+     // Parses a WAV stream: reads and validates the header, skips the chunks that precede the sample
+     // data, then converts the samples (assumed to be 16-bit integers) to floats by dividing by 32768.
+     // Throws ApplicationException when the header fails validation.
+     // Disposing the BinaryReader at the end of the block also closes `stream`.
+     using (var reader = new BinaryReader(stream))
+     {
+         _header = ReadHeader(reader);
+         if (!_header.Validate())
+             throw new ApplicationException($"鏃犳晥鐨勯煶棰戞枃浠");
+         SkipMetaData(reader);
+         // SubChunk2Size is the byte count the header declares for the sample data.
+         byte[] buffer = reader.ReadBytes(_header.SubChunk2Size);
+         // Size the result from the bytes actually read: ReadBytes returns fewer when the stream ends early,
+         // and an odd declared size would otherwise make BlockCopy overrun the short[] and throw.
+         short[] pcm = new short[buffer.Length / 2];
+         Buffer.BlockCopy(buffer, 0, pcm, 0, pcm.Length * sizeof(short));
+         _samples = new float[pcm.Length];
+         for (var i = 0; i < pcm.Length; ++i)
+             _samples[i] = pcm[i] / 32768.0F;
+     }
+ }
public WaveReader(string fileName)
{
if (!File.Exists(fileName))
@@ -139,6 +162,24 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
}
private void SkipMetaData(BinaryReader reader)
+ {
+     // Advances `reader` past every chunk that precedes the "data" chunk (id 0x61746164) by reading
+     // and discarding each chunk body (no Position/Seek is used), then stores the data chunk's id and
+     // size in _header. Throws EndOfStreamException when the stream ends before the data chunk is found.
+     int chunkId = _header.SubChunk2ID;
+     int chunkSize = _header.SubChunk2Size;
+     while (chunkId != 0x61746164)
+     {
+         // ReadBytes keeps reading until chunkSize bytes arrive or the stream ends, unlike a single Read call.
+         if (chunkSize < 0 || reader.ReadBytes(chunkSize).Length < chunkSize)
+             throw new EndOfStreamException("WAV stream is truncated or corrupt before the data chunk");
+         chunkId = reader.ReadInt32();
+         chunkSize = reader.ReadInt32();
+     }
+     _header.SubChunk2ID = chunkId;
+     _header.SubChunk2Size = chunkSize;
+ }
+ private void SkipMetaData_old(BinaryReader reader)
{
var bs = reader.BaseStream;