diff --git a/VideoAnalysis/Controllers/ApiController.cs b/VideoAnalysis/Controllers/ApiController.cs
index 1be35f5..bfc9f8b 100644
--- a/VideoAnalysis/Controllers/ApiController.cs
+++ b/VideoAnalysis/Controllers/ApiController.cs
@@ -13,6 +13,10 @@ using System.Threading.Tasks;
 using FFmpeg.NET.Services;
 using MapsterMapper;
 using Mapster;
+using VideoAnalysisCore.AICore.SherpaOnnx;
+using System.Net;
+using System.Security.Policy;
+using System.IO;
 
 namespace Learn.VideoAnalysis.Controllers
 {
@@ -42,13 +46,71 @@ namespace Learn.VideoAnalysis.Controllers
         }
 
+        /// <summary>
+        /// HTTP client shared by every <see cref="AudioRecognitionUrl"/> call; creating a new
+        /// <see cref="HttpClient"/> per request can exhaust sockets under load.
+        /// </summary>
+        private static readonly HttpClient AudioDownloadClient = new HttpClient();
+
+        /// <summary>
+        /// Speech recognition for remote audio: downloads <paramref name="url"/> and transcribes it.
+        /// </summary>
+        /// <param name="url">Absolute http/https address of the audio to transcribe.</param>
+        /// <returns>
+        /// 200 with the output of <c>SenseVoice.RunTask</c>; 400 when the address is not an absolute
+        /// http/https URL, or with the exception message when download or recognition throws.
+        /// </returns>
+        [HttpGet(Name = "AudioRecognitionUrl")]
+        public async Task AudioRecognitionUrl(string url)
+        {
+            // Reject missing, relative and non-HTTP(S) addresses before any network access.
+            // NOTE(review): the server still fetches a caller-chosen URL (SSRF exposure);
+            // consider restricting the hosts that may be requested.
+            if (!Uri.TryCreate(url, UriKind.Absolute, out var audioUri)
+                || (audioUri.Scheme != Uri.UriSchemeHttp && audioUri.Scheme != Uri.UriSchemeHttps))
+            {
+                return BadRequest("url must be an absolute http or https address");
+            }
+
+            try
+            {
+                // Fetch the remote file as a stream with a GET request.
+                using var networkStream = await AudioDownloadClient.GetStreamAsync(audioUri);
+                var res = await SenseVoice.RunTask(networkStream);
+                return Ok(res);
+            }
+            catch (Exception ex)
+            {
+                return BadRequest(ex.Message);
+            }
+        }
+
+        /// <summary>
+        /// Speech recognition for an uploaded audio file.
+        /// </summary>
+        /// <param name="file">Uploaded audio file; its content is read as a stream.</param>
+        /// <returns>200 with the output of <c>SenseVoice.RunTask</c>; 400 when no file content was sent.</returns>
+        [HttpPost(Name = "AudioRecognition")]
+        public async Task AudioRecognition(IFormFile file)
+        {
+            // Without this guard a missing upload fails with a NullReferenceException below.
+            if (file is null || file.Length == 0)
+            {
+                return BadRequest("file is required");
+            }
+
+            using var s = file.OpenReadStream();
+            var res = await SenseVoice.RunTask(s);
+            return Ok(res);
+        }
+
         ///
         /// 获取视频信息taskId/tagId二选一
         ///
         ///
         /// 自定义id
         /// 是否附加字幕
         ///
         [HttpGet(Name = "TaskInfo")]
         public async Task TaskInfo(long taskId,string? 
tagId,bool needSubtitle=false)
         {
             var task = await videoTaskDB.AsQueryable()
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
index a6842af..4ed95e7 100644
--- a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
@@ -2,6 +2,7 @@
 using SherpaOnnx;
 using System;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
@@ -13,7 +14,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
     public class SenseVoice
     {
         static OfflineRecognizer OR =default!;
-        static VoiceActivityDetector VAD =default!;
+        static VoiceActivityDetector VAD = default!;
         static VadModelConfig VADModelConfig = default!;
         ///
         /// 鍒濆鍖 SenseVoice
@@ -61,7 +62,9 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
             //鍙嶈浆鏂囨湰瑙勮寖鍖栬鍒 fst 鐨勮矾寰
             config.RuleFsts = string.Empty;
 
+#if DEBUG
             config.ModelConfig.Debug = 1;
+#endif
 
             OR = new OfflineRecognizer(config);
 
@@ -71,6 +74,87 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
             //缂撳啿鍖哄ぇ灏
             VAD = new VoiceActivityDetector(VADModelConfig, 60);
         }
+
+        /// <summary>
+        /// Builds subtitles from WAV audio in a stream: the samples are pushed through the shared
+        /// voice-activity detector and each detected speech segment is decoded by the shared recognizer.
+        /// </summary>
+        /// <param name="s">Stream containing the WAV audio; it is read from its current position.</param>
+        /// <returns>
+        /// One entry per segment with non-empty text. Start and End are the segment's sample offsets
+        /// divided by the VAD sample rate, rounded to two decimals.
+        /// </returns>
+        /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
+        public static async Task> RunTask(Stream s)
+        {
+            // Validate first so a bad call fails before Init() builds the recognizer and detector.
+            if (s is null)
+                throw new ArgumentNullException(nameof(s));
+            if (OR is null)
+                Init();
+
+            WaveReader reader = new WaveReader(s);
+            int windowSize = VADModelConfig.SileroVad.WindowSize;
+            int sampleRate = VADModelConfig.SampleRate;
+            // Only whole windows are fed to the detector; a trailing partial window is ignored.
+            int numIter = reader.Samples.Length / windowSize;
+            var res = new List(500);
+
+            // Decodes every segment currently queued in the detector and removes it from the queue.
+            void DrainSegments()
+            {
+                while (!VAD.IsEmpty())
+                {
+                    // Take the segment at the front of the detector's queue.
+                    SpeechSegment segment = VAD.Front();
+                    float startTime = segment.Start / (float)sampleRate;
+                    float duration = segment.Samples.Length / (float)sampleRate;
+
+                    // Release the per-segment recognizer stream as soon as its text has been read.
+                    using OfflineStream stream = OR.CreateStream();
+                    stream.AcceptWaveform(sampleRate, segment.Samples);
+                    OR.Decode(stream);
+                    var text = stream.Result.Text;
+                    if (!string.IsNullOrEmpty(text))
+                    {
+                        res.Add(new()
+                        {
+                            Text = text,
+                            Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
+                            End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
+                        });
+                    }
+
+                    VAD.Pop();
+                }
+            }
+
+            // NOTE(review): VAD is static state shared by all callers; overlapping calls would mix
+            // their audio in the same detector. Confirm that callers serialize access.
+            try
+            {
+                for (int i = 0; i != numIter; ++i)
+                {
+                    float[] samples = new float[windowSize];
+                    Array.Copy(reader.Samples, i * windowSize, samples, 0, windowSize);
+                    VAD.AcceptWaveform(samples);
+                    // Has speech been detected?
+                    if (VAD.IsSpeechDetected())
+                        DrainSegments();
+                }
+
+                // Flush, then decode whatever segments the detector still holds.
+                VAD.Flush();
+                DrainSegments();
+            }
+            finally
+            {
+                // Reset even on failure so leftover detector state cannot affect the next call.
+                VAD.Reset();
+            }
+
+            return res;
+        }
         ///
         /// 鑾峰彇璇煶瀛楀箷
         ///
@@ -91,6 +175,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
             int numIter = numSamples / windowSize;
             var totalSecond = numSamples / (float)sampleRate;
             var res = new List(500);
+            //var VAD = new VoiceActivityDetector(VADModelConfig, 60);
             for (int i = 0; i != numIter; ++i)
             {
                 int start = i * windowSize;
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs b/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
index 1a8151c..fd55497 100644
--- a/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
@@ -35,6 +35,10 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
             //闇瑕佷娇鐢℅PU
             if (!useGPU)
                 config.Embedding.Provider = "cuda";
+#if DEBUG
+            config.Embedding.Debug = 1;
+#endif
+
             //璇磋瘽浜哄垽瀹氶槇鍊 
config.Clustering.Threshold = threshold;
             SD = new OfflineSpeakerDiarization(config);
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/WaveHeader.cs b/VideoAnalysisCore/AICore/SherpaOnnx/WaveHeader.cs
index 24084c0..d3688c7 100644
--- a/VideoAnalysisCore/AICore/SherpaOnnx/WaveHeader.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/WaveHeader.cs
@@ -90,6 +90,40 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
     // The sample rate can be any value.
     public class WaveReader
     {
+        /// <summary>
+        /// Reads WAV audio from a stream and converts its samples, assumed to be int16, to floats
+        /// by dividing each one by 32768.
+        /// </summary>
+        /// <param name="stream">
+        /// Stream containing the WAV data. It is wrapped in a <see cref="BinaryReader"/> that is
+        /// disposed before the constructor returns, which also closes the stream.
+        /// </param>
+        /// <exception cref="ApplicationException">The header does not pass validation.</exception>
+        public WaveReader(Stream stream)
+        {
+            using (var reader = new BinaryReader(stream))
+            {
+                _header = ReadHeader(reader);
+                if (!_header.Validate())
+                    throw new ApplicationException($"鏃犳晥鐨勯煶棰戞枃浠");
+                SkipMetaData(reader);
+
+                // _header.SubChunk2Size is the declared number of sample bytes.
+                byte[] buffer = reader.ReadBytes(_header.SubChunk2Size);
+
+                // Size the sample array from the bytes actually read: the stream may end before the
+                // declared size, and an odd trailing byte cannot form an int16 sample. Copying
+                // buffer.Length bytes into an array sized from the header could overrun it.
+                short[] samples_int16 = new short[buffer.Length / sizeof(short)];
+                Buffer.BlockCopy(buffer, 0, samples_int16, 0, samples_int16.Length * sizeof(short));
+
+                _samples = new float[samples_int16.Length];
+                for (var i = 0; i < samples_int16.Length; ++i)
+                {
+                    _samples[i] = samples_int16[i] / 32768.0F;
+                }
+            }
+        }
         public WaveReader(string fileName)
         {
             if (!File.Exists(fileName))
@@ -139,6 +173,33 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
         }
 
+        /// <summary>
+        /// Skips the chunks in front of the "data" chunk, then stores the id and size of the chunk
+        /// it stopped at in the header. Reads forward only, without using the stream position.
+        /// </summary>
+        /// <param name="reader">Reader positioned just after the header that was already read.</param>
+        /// <exception cref="EndOfStreamException">The stream ends before a "data" chunk is found.</exception>
         private void SkipMetaData(BinaryReader reader)
+        {
+            // The ASCII bytes "data" read as a little-endian Int32.
+            const int dataChunkId = 0x61746164;
+            int subChunk2ID = _header.SubChunk2ID;
+            int subChunk2Size = _header.SubChunk2Size;
+            while (subChunk2ID != dataChunkId)
+            {
+                // ReadBytes keeps reading until the requested count or the end of the stream, whereas
+                // a single Read call may return fewer bytes and leave the reader inside the chunk.
+                int skipped = reader.ReadBytes(subChunk2Size).Length;
+                if (skipped < subChunk2Size)
+                    throw new EndOfStreamException("WAV stream ended before the data chunk was found");
+
+                subChunk2ID = reader.ReadInt32();
+                subChunk2Size = reader.ReadInt32();
+            }
+
+            _header.SubChunk2ID = subChunk2ID;
+            _header.SubChunk2Size = subChunk2Size;
+        }
+        private void SkipMetaData_old(BinaryReader reader)
         {
             var bs = reader.BaseStream;