diff --git a/VideoAnalysis/Program.cs b/VideoAnalysis/Program.cs
index f6efdb5..cf4e036 100644
--- a/VideoAnalysis/Program.cs
+++ b/VideoAnalysis/Program.cs
@@ -49,6 +49,7 @@ namespace Learn.VideoAnalysis
//ʼ
Speaker.Init();
RedisExpand.Init();
+ SenseVoice.Init();
builder.Services.AddScoped(sp =>
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
index 8067b92..1ac7864 100644
--- a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs
@@ -5,6 +5,7 @@ using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
+using VideoAnalysisCore.AICore.Whisper;
using VideoAnalysisCore.Common;
namespace VideoAnalysisCore.AICore.SherpaOnnx
@@ -12,6 +13,8 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
public class SenseVoice
{
static OfflineRecognizer OR =default!;
+ static VoiceActivityDetector VAD =default!;
+ static VadModelConfig VADModelConfig = default!;
///
/// 初始化 SenseVoice
///
@@ -19,6 +22,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
///
public static void Init(int speakerNumber = 0, double threshold = 0.6)
{
+ Console.WriteLine("初始化 SenseVoice");
OfflineRecognizerConfig config = new OfflineRecognizerConfig();
//采样率
config.FeatConfig.SampleRate = 16000;
@@ -55,6 +59,12 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
config.ModelConfig.Debug = 0;
OR = new OfflineRecognizer(config);
+
+ VADModelConfig = new VadModelConfig();
+ VADModelConfig.SileroVad.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "silero_VAD.onnx");
+ VADModelConfig.Debug = 0;
+ //缓冲区大小
+ VAD = new VoiceActivityDetector(VADModelConfig, 60);
}
///
/// 获取语音字幕
@@ -66,27 +76,73 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
var filePath = Path.Combine(task.LocalPath(), task + ".wav");
if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
throw new Exception("task 音频路径未找到");
- OfflineStream stream = OR.CreateStream();
- WaveReader waveReader = new WaveReader(filePath);
- stream.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
- OR.Decode(stream);
- var r = stream.Result;
- Console.WriteLine("--------------------");
- Console.WriteLine("Text: {0}", r.Text);
- Console.WriteLine("Tokens: [{0}]", string.Join(", ", r.Tokens));
- if (r.Timestamps != null && r.Timestamps.Length > 0)
+ string testWaveFilename = filePath;
+ WaveReader reader = new WaveReader(testWaveFilename);
+
+ int numSamples = reader.Samples.Length;
+ int windowSize = VADModelConfig.SileroVad.WindowSize;
+ int sampleRate = VADModelConfig.SampleRate;
+ int numIter = numSamples / windowSize;
+
+ var res = new List(500);
+ for (int i = 0; i != numIter; ++i)
{
- Console.Write("Timestamps: [");
- var sep = "";
- for (int k = 0; k != r.Timestamps.Length; ++k)
+ int start = i * windowSize;
+ float[] samples = new float[windowSize];
+ Array.Copy(reader.Samples, start, samples, 0, windowSize);
+ VAD.AcceptWaveform(samples);
+ //是否检测到语音
+ if (VAD.IsSpeechDetected())
{
- Console.Write("{0}{1}", sep, r.Timestamps[k].ToString("0.00"));
- sep = ", ";
+ while (!VAD.IsEmpty())
+ {
+ //获取最新的发言片段
+ SpeechSegment segment = VAD.Front();
+ float startTime = segment.Start / (float)sampleRate;
+ float duration = segment.Samples.Length / (float)sampleRate;
+ OfflineStream stream = OR.CreateStream();
+ stream.AcceptWaveform(sampleRate, segment.Samples);
+ OR.Decode(stream);
+ if (!string.IsNullOrEmpty(stream.Result.Text))
+ {
+ res.Add(new()
+ {
+ Text = stream.Result.Text,
+ Start= startTime,
+ End = startTime + duration });
+ }
+ VAD.Pop();
+ }
}
- Console.WriteLine("]");
}
- await Task.CompletedTask;
+ VAD.Flush();
+
+ while (!VAD.IsEmpty())
+ {
+ SpeechSegment segment = VAD.Front();
+ float startTime = segment.Start / (float)sampleRate;
+ float duration = segment.Samples.Length / (float)sampleRate;
+
+ OfflineStream stream = OR.CreateStream();
+ stream.AcceptWaveform(sampleRate, segment.Samples);
+ OR.Decode(stream);
+ if (!string.IsNullOrEmpty(stream.Result.Text))
+ {
+ res.Add(new()
+ {
+ Text = stream.Result.Text,
+ Start = startTime,
+ End = startTime + duration
+ });
+ }
+
+ VAD.Pop();
+ }
+
+ await RedisExpand.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);
+ RedisExpand.InsertChannel(Enum.RedisChannelEnum.ParsingSpeaker, task);
+
}
}
}
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/SherpaOnnxDto.cs b/VideoAnalysisCore/AICore/SherpaOnnx/SherpaOnnxDto.cs
new file mode 100644
index 0000000..0b4ab60
--- /dev/null
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/SherpaOnnxDto.cs
@@ -0,0 +1,29 @@
+using Whisper.net;
+
+namespace VideoAnalysisCore.AICore.Whisper
+{
+ /// <summary>
+ /// Caption (subtitle) recognition result: one recognized text span with its start and end time.
+ /// </summary>
+ public class SenseVoiceRes
+ {
+ public SenseVoiceRes()
+ {
+
+ }
+ /// <summary>
+ /// Recognized text; defaults to an empty string.
+ /// </summary>
+ public string Text { get; set; } = string.Empty;
+ /// <summary>
+ /// Start time of the span.
+ /// </summary>
+
+ public float Start { get; set; }
+ /// <summary>
+ /// End time of the span.
+ /// </summary>
+
+ public float End { get; set; }
+ }
+}
diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs b/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
index 02b68ba..203ce1f 100644
--- a/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
+++ b/VideoAnalysisCore/AICore/SherpaOnnx/Speaker.cs
@@ -22,6 +22,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
///
public static void Init(int speakerNumber = 0, double threshold = 0.6)
{
+ Console.WriteLine("初始化 Speaker");
var config = new OfflineSpeakerDiarizationConfig();
//Pyannote模型地址
config.Segmentation.Pyannote.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-pyannote-segmentation-3-0", "model.onnx");
@@ -72,15 +73,15 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
/// 总持续时间
///
[JsonIgnore]
- public decimal Total => End - Start;
+ public float Total => End - Start;
///
/// 开始时间
///
- public decimal Start { get; set; }
+ public float Start { get; set; }
///
/// 结束时间
///
- public decimal End { get; set; }
+ public float End { get; set; }
///
/// 讲话人索引
///
@@ -99,8 +100,8 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
///
public OfflineSpeakerRes(OfflineSpeakerDiarizationSegment sds)
{
- Start = (decimal)sds.Start;
- End = (decimal)sds.End;
+ Start = sds.Start;
+ End =sds.End;
SpeakerIndex = sds.Speaker;
}
}
diff --git a/VideoAnalysisCore/AICore/Whisper/WhisperDto.cs b/VideoAnalysisCore/AICore/Whisper/WhisperDto.cs
index 0524c39..3bd5b05 100644
--- a/VideoAnalysisCore/AICore/Whisper/WhisperDto.cs
+++ b/VideoAnalysisCore/AICore/Whisper/WhisperDto.cs
@@ -7,6 +7,10 @@ namespace VideoAnalysisCore.AICore.Whisper
///
public class WhisperResDto
{
+ public WhisperResDto()
+ {
+
+ }
///
///
///
@@ -20,16 +24,16 @@ namespace VideoAnalysisCore.AICore.Whisper
///
/// 文本
///
- public string Text { get; } = string.Empty;
+ public string Text { get; set; } = string.Empty;
///
/// 开始时间
///
- public TimeSpan Start { get; }
+ public TimeSpan Start { get; set; }
///
/// 结束时间
///
- public TimeSpan End { get; }
+ public TimeSpan End { get; set; }
}
}
diff --git a/VideoAnalysisCore/Common/RedisExpand.cs b/VideoAnalysisCore/Common/RedisExpand.cs
index d7e6a36..a6ff907 100644
--- a/VideoAnalysisCore/Common/RedisExpand.cs
+++ b/VideoAnalysisCore/Common/RedisExpand.cs
@@ -79,11 +79,12 @@ namespace VideoAnalysisCore.Common
///
public static RedisClient Redis = new RedisClient(AppCommon.Config.Redis.ConnectionString);
///
- /// 初始化redis
+ /// 初始化 redis
/// 需要在初始化配置文件时候调用
///
public static void Init()
{
+ Console.WriteLine("初始化 redis");
Redis.Serialize = obj => System.Text.Json.JsonSerializer.Serialize(obj);
Redis.Deserialize = (json, type) => System.Text.Json.JsonSerializer.Deserialize(json, type);
InitChannel();
@@ -125,7 +126,7 @@ namespace VideoAnalysisCore.Common
(msg) => { TouchChannel(RedisChannelEnum.SeparateAudio, msg, FFMPGEHandle.Audio2WAV16KAsync); });
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingCaptions),
- (msg) => { TouchChannel(RedisChannelEnum.ParsingCaptions, msg, WhisperHandle.RunTask); });
+ (msg) => { TouchChannel(RedisChannelEnum.ParsingCaptions, msg, SenseVoice.RunTask); });
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingSpeaker),
(msg) => { TouchChannel(RedisChannelEnum.ParsingSpeaker, msg, Speaker.Run); });
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ChatModelAnalysis),