接入 SenseVoice模型
This commit is contained in:
parent
b1cbedb9e8
commit
9e0dca0283
|
|
@ -49,6 +49,7 @@ namespace Learn.VideoAnalysis
|
||||||
//初始化 插件
|
//初始化 插件
|
||||||
Speaker.Init();
|
Speaker.Init();
|
||||||
RedisExpand.Init();
|
RedisExpand.Init();
|
||||||
|
SenseVoice.Init();
|
||||||
|
|
||||||
|
|
||||||
builder.Services.AddScoped(sp =>
|
builder.Services.AddScoped(sp =>
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ using System.Collections.Generic;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
|
using VideoAnalysisCore.AICore.Whisper;
|
||||||
using VideoAnalysisCore.Common;
|
using VideoAnalysisCore.Common;
|
||||||
|
|
||||||
namespace VideoAnalysisCore.AICore.SherpaOnnx
|
namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
|
|
@ -12,6 +13,8 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
public class SenseVoice
|
public class SenseVoice
|
||||||
{
|
{
|
||||||
static OfflineRecognizer OR =default!;
|
static OfflineRecognizer OR =default!;
|
||||||
|
static VoiceActivityDetector VAD =default!;
|
||||||
|
static VadModelConfig VADModelConfig = default!;
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 初始化 SenseVoice
|
/// 初始化 SenseVoice
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|
@ -19,6 +22,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
/// <param name="threshold"></param>
|
/// <param name="threshold"></param>
|
||||||
public static void Init(int speakerNumber = 0, double threshold = 0.6)
|
public static void Init(int speakerNumber = 0, double threshold = 0.6)
|
||||||
{
|
{
|
||||||
|
Console.WriteLine("初始化 SenseVoice");
|
||||||
OfflineRecognizerConfig config = new OfflineRecognizerConfig();
|
OfflineRecognizerConfig config = new OfflineRecognizerConfig();
|
||||||
//采样率
|
//采样率
|
||||||
config.FeatConfig.SampleRate = 16000;
|
config.FeatConfig.SampleRate = 16000;
|
||||||
|
|
@ -55,6 +59,12 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
config.ModelConfig.Debug = 0;
|
config.ModelConfig.Debug = 0;
|
||||||
|
|
||||||
OR = new OfflineRecognizer(config);
|
OR = new OfflineRecognizer(config);
|
||||||
|
|
||||||
|
VADModelConfig = new VadModelConfig();
|
||||||
|
VADModelConfig.SileroVad.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "silero_VAD.onnx");
|
||||||
|
VADModelConfig.Debug = 0;
|
||||||
|
//缓冲区大小
|
||||||
|
VAD = new VoiceActivityDetector(VADModelConfig, 60);
|
||||||
}
|
}
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 获取语音字幕
|
/// 获取语音字幕
|
||||||
|
|
@ -66,27 +76,73 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
var filePath = Path.Combine(task.LocalPath(), task + ".wav");
|
var filePath = Path.Combine(task.LocalPath(), task + ".wav");
|
||||||
if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
|
if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
|
||||||
throw new Exception("task 音频路径未找到");
|
throw new Exception("task 音频路径未找到");
|
||||||
OfflineStream stream = OR.CreateStream();
|
|
||||||
WaveReader waveReader = new WaveReader(filePath);
|
|
||||||
stream.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
|
|
||||||
OR.Decode(stream);
|
|
||||||
|
|
||||||
var r = stream.Result;
|
string testWaveFilename = filePath;
|
||||||
Console.WriteLine("--------------------");
|
WaveReader reader = new WaveReader(testWaveFilename);
|
||||||
Console.WriteLine("Text: {0}", r.Text);
|
|
||||||
Console.WriteLine("Tokens: [{0}]", string.Join(", ", r.Tokens));
|
int numSamples = reader.Samples.Length;
|
||||||
if (r.Timestamps != null && r.Timestamps.Length > 0)
|
int windowSize = VADModelConfig.SileroVad.WindowSize;
|
||||||
|
int sampleRate = VADModelConfig.SampleRate;
|
||||||
|
int numIter = numSamples / windowSize;
|
||||||
|
|
||||||
|
var res = new List<SenseVoiceRes>(500);
|
||||||
|
for (int i = 0; i != numIter; ++i)
|
||||||
{
|
{
|
||||||
Console.Write("Timestamps: [");
|
int start = i * windowSize;
|
||||||
var sep = "";
|
float[] samples = new float[windowSize];
|
||||||
for (int k = 0; k != r.Timestamps.Length; ++k)
|
Array.Copy(reader.Samples, start, samples, 0, windowSize);
|
||||||
|
VAD.AcceptWaveform(samples);
|
||||||
|
//是否检测到语音
|
||||||
|
if (VAD.IsSpeechDetected())
|
||||||
{
|
{
|
||||||
Console.Write("{0}{1}", sep, r.Timestamps[k].ToString("0.00"));
|
while (!VAD.IsEmpty())
|
||||||
sep = ", ";
|
{
|
||||||
|
//获取最新的发言片段
|
||||||
|
SpeechSegment segment = VAD.Front();
|
||||||
|
float startTime = segment.Start / (float)sampleRate;
|
||||||
|
float duration = segment.Samples.Length / (float)sampleRate;
|
||||||
|
OfflineStream stream = OR.CreateStream();
|
||||||
|
stream.AcceptWaveform(sampleRate, segment.Samples);
|
||||||
|
OR.Decode(stream);
|
||||||
|
if (!string.IsNullOrEmpty(stream.Result.Text))
|
||||||
|
{
|
||||||
|
res.Add(new()
|
||||||
|
{
|
||||||
|
Text = stream.Result.Text,
|
||||||
|
Start= startTime,
|
||||||
|
End = startTime + duration });
|
||||||
|
}
|
||||||
|
VAD.Pop();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Console.WriteLine("]");
|
|
||||||
}
|
}
|
||||||
await Task.CompletedTask;
|
VAD.Flush();
|
||||||
|
|
||||||
|
while (!VAD.IsEmpty())
|
||||||
|
{
|
||||||
|
SpeechSegment segment = VAD.Front();
|
||||||
|
float startTime = segment.Start / (float)sampleRate;
|
||||||
|
float duration = segment.Samples.Length / (float)sampleRate;
|
||||||
|
|
||||||
|
OfflineStream stream = OR.CreateStream();
|
||||||
|
stream.AcceptWaveform(sampleRate, segment.Samples);
|
||||||
|
OR.Decode(stream);
|
||||||
|
if (!string.IsNullOrEmpty(stream.Result.Text))
|
||||||
|
{
|
||||||
|
res.Add(new()
|
||||||
|
{
|
||||||
|
Text = stream.Result.Text,
|
||||||
|
Start = startTime,
|
||||||
|
End = startTime + duration
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
VAD.Pop();
|
||||||
|
}
|
||||||
|
|
||||||
|
await RedisExpand.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);
|
||||||
|
RedisExpand.InsertChannel(Enum.RedisChannelEnum.ParsingSpeaker, task);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,29 @@
|
||||||
|
using Whisper.net;
|
||||||
|
|
||||||
|
namespace VideoAnalysisCore.AICore.Whisper
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// 字幕识别 结果
|
||||||
|
/// </summary>
|
||||||
|
public class SenseVoiceRes
|
||||||
|
{
|
||||||
|
public SenseVoiceRes()
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
/// <summary>
|
||||||
|
/// 文本
|
||||||
|
/// </summary>
|
||||||
|
public string Text { get; set; } = string.Empty;
|
||||||
|
/// <summary>
|
||||||
|
/// 开始时间
|
||||||
|
/// </summary>
|
||||||
|
|
||||||
|
public float Start { get; set; }
|
||||||
|
/// <summary>
|
||||||
|
/// 结束时间
|
||||||
|
/// </summary>
|
||||||
|
|
||||||
|
public float End { get; set; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -22,6 +22,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
/// <param name="threshold"></param>
|
/// <param name="threshold"></param>
|
||||||
public static void Init(int speakerNumber = 0, double threshold = 0.6)
|
public static void Init(int speakerNumber = 0, double threshold = 0.6)
|
||||||
{
|
{
|
||||||
|
Console.WriteLine("初始化 Speaker");
|
||||||
var config = new OfflineSpeakerDiarizationConfig();
|
var config = new OfflineSpeakerDiarizationConfig();
|
||||||
//Pyannote模型地址
|
//Pyannote模型地址
|
||||||
config.Segmentation.Pyannote.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-pyannote-segmentation-3-0", "model.onnx");
|
config.Segmentation.Pyannote.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-pyannote-segmentation-3-0", "model.onnx");
|
||||||
|
|
@ -72,15 +73,15 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
/// 总持续时间
|
/// 总持续时间
|
||||||
/// </summary>
|
/// </summary>
|
||||||
[JsonIgnore]
|
[JsonIgnore]
|
||||||
public decimal Total => End - Start;
|
public float Total => End - Start;
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 开始时间
|
/// 开始时间
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public decimal Start { get; set; }
|
public float Start { get; set; }
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 结束时间
|
/// 结束时间
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public decimal End { get; set; }
|
public float End { get; set; }
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 讲话人索引
|
/// 讲话人索引
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|
@ -99,8 +100,8 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
|
||||||
/// <param name="sds"></param>
|
/// <param name="sds"></param>
|
||||||
public OfflineSpeakerRes(OfflineSpeakerDiarizationSegment sds)
|
public OfflineSpeakerRes(OfflineSpeakerDiarizationSegment sds)
|
||||||
{
|
{
|
||||||
Start = (decimal)sds.Start;
|
Start = sds.Start;
|
||||||
End = (decimal)sds.End;
|
End =sds.End;
|
||||||
SpeakerIndex = sds.Speaker;
|
SpeakerIndex = sds.Speaker;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,10 @@ namespace VideoAnalysisCore.AICore.Whisper
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public class WhisperResDto
|
public class WhisperResDto
|
||||||
{
|
{
|
||||||
|
public WhisperResDto()
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
/// <summary>
|
/// <summary>
|
||||||
///
|
///
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|
@ -20,16 +24,16 @@ namespace VideoAnalysisCore.AICore.Whisper
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 文本
|
/// 文本
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public string Text { get; } = string.Empty;
|
public string Text { get; set; } = string.Empty;
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 开始时间
|
/// 开始时间
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|
||||||
public TimeSpan Start { get; }
|
public TimeSpan Start { get; set; }
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 结束时间
|
/// 结束时间
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|
||||||
public TimeSpan End { get; }
|
public TimeSpan End { get; set; }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -79,11 +79,12 @@ namespace VideoAnalysisCore.Common
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public static RedisClient Redis = new RedisClient(AppCommon.Config.Redis.ConnectionString);
|
public static RedisClient Redis = new RedisClient(AppCommon.Config.Redis.ConnectionString);
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// 初始化redis
|
/// 初始化 redis
|
||||||
/// <para>需要在初始化配置文件时候调用</para>
|
/// <para>需要在初始化配置文件时候调用</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public static void Init()
|
public static void Init()
|
||||||
{
|
{
|
||||||
|
Console.WriteLine("初始化 redis");
|
||||||
Redis.Serialize = obj => System.Text.Json.JsonSerializer.Serialize(obj);
|
Redis.Serialize = obj => System.Text.Json.JsonSerializer.Serialize(obj);
|
||||||
Redis.Deserialize = (json, type) => System.Text.Json.JsonSerializer.Deserialize(json, type);
|
Redis.Deserialize = (json, type) => System.Text.Json.JsonSerializer.Deserialize(json, type);
|
||||||
InitChannel();
|
InitChannel();
|
||||||
|
|
@ -125,7 +126,7 @@ namespace VideoAnalysisCore.Common
|
||||||
(msg) => { TouchChannel(RedisChannelEnum.SeparateAudio, msg, FFMPGEHandle.Audio2WAV16KAsync); });
|
(msg) => { TouchChannel(RedisChannelEnum.SeparateAudio, msg, FFMPGEHandle.Audio2WAV16KAsync); });
|
||||||
|
|
||||||
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingCaptions),
|
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingCaptions),
|
||||||
(msg) => { TouchChannel(RedisChannelEnum.ParsingCaptions, msg, WhisperHandle.RunTask); });
|
(msg) => { TouchChannel(RedisChannelEnum.ParsingCaptions, msg, SenseVoice.RunTask); });
|
||||||
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingSpeaker),
|
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingSpeaker),
|
||||||
(msg) => { TouchChannel(RedisChannelEnum.ParsingSpeaker, msg, Speaker.Run); });
|
(msg) => { TouchChannel(RedisChannelEnum.ParsingSpeaker, msg, Speaker.Run); });
|
||||||
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ChatModelAnalysis),
|
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ChatModelAnalysis),
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue