Compare commits

...

2 Commits

Author SHA1 Message Date
小肥羊 9e0dca0283 接入 SenseVoice模型 2024-11-04 18:29:58 +08:00
小肥羊 b1cbedb9e8 新增 SenseVoice 字幕识别AI 2024-11-04 17:13:56 +08:00
9 changed files with 200 additions and 16 deletions

View File

@ -49,6 +49,7 @@ namespace Learn.VideoAnalysis
//初始化 插件 //初始化 插件
Speaker.Init(); Speaker.Init();
RedisExpand.Init(); RedisExpand.Init();
SenseVoice.Init();
builder.Services.AddScoped(sp => builder.Services.AddScoped(sp =>

View File

@ -19,7 +19,8 @@
"ChatGpt": { "ChatGpt": {
"KIMI": { "KIMI": {
"Host": "https://api.moonshot.cn", "Host": "https://api.moonshot.cn",
"ApiKey": "sk-CNYJdRHgJsgtgw1Q8GhQ5ayXuFPVLSk5bduOF4l2FMvI5lUo" //"ApiKey": "sk-CNYJdRHgJsgtgw1Q8GhQ5ayXuFPVLSk5bduOF4l2FMvI5lUo"
"ApiKey": "sk-8BvvhESZIkgUbiaaJhglPxFa4o2X9H3xEv9lXELrWWwGxHWY"
} }
}, },
"DB": { "DB": {

View File

@ -67,7 +67,7 @@ namespace VideoAnalysisCore.AICore.ChatGPT.KIMI
criteriaBuilder.Append("|"); criteriaBuilder.Append("|");
} }
var resFormat = "问题编号:int,结果:array|bool,问题解释:string"; var resFormat = "[{问题编号:int,结果:array|bool,问题解释:string}]";
var postMessages = var postMessages =
$"以下是一段音频的字幕,分析这段字幕(格式 说话人:开始秒:结束秒:内容|下一段字幕)." + $"以下是一段音频的字幕,分析这段字幕(格式 说话人:开始秒:结束秒:内容|下一段字幕)." +
$"来简明的回答提出的问题 问题列表 {criteriaBuilder} " + $"来简明的回答提出的问题 问题列表 {criteriaBuilder} " +
@ -80,15 +80,14 @@ namespace VideoAnalysisCore.AICore.ChatGPT.KIMI
var modelId = reqTokenCount > 32 * 1000 ? "moonshot-v1-128k" : "moonshot-v1-32k"; var modelId = reqTokenCount > 32 * 1000 ? "moonshot-v1-128k" : "moonshot-v1-32k";
var chatRep = new ChatReq var chatRep = new ChatReq
{ {
max_tokens =1000 * 31, max_tokens = reqTokenCount * 2,
temperature = 0.3, temperature = 0.3,
frequency_penalty = 0, frequency_penalty = 0,
presence_penalty = 0, presence_penalty = 0,
model = modelId, model = modelId,
messages = new List<MessagesItem>(){ messages = new List<MessagesItem>(){
new MessagesItem(postMessages,"system"), new MessagesItem(postMessages,"system"),
//todo 规定返回json格式 new MessagesItem(postMessages,"assistant"),
//new MessagesItem(postMessages,"assistant"),
} }
}; };
var chatResp = await moonshotClient.Chat(chatRep); var chatResp = await moonshotClient.Chat(chatRep);

View File

@ -0,0 +1,148 @@
using Microsoft.Extensions.Options;
using SherpaOnnx;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using VideoAnalysisCore.AICore.Whisper;
using VideoAnalysisCore.Common;
namespace VideoAnalysisCore.AICore.SherpaOnnx
{
/// <summary>
/// Subtitle (caption) recognition built on sherpa-onnx: a Silero voice-activity detector
/// cuts the audio into speech segments and a SenseVoice offline recognizer transcribes
/// each segment.
/// </summary>
/// <remarks>
/// The recognizer and the detector are process-wide statics created by <see cref="Init"/>.
/// NOTE(review): <see cref="RunTask"/> mutates the shared detector, so two tasks running
/// at the same time would interfere with each other — confirm that the caller serializes
/// caption tasks.
/// </remarks>
public class SenseVoice
{
    /// <summary>Folder below <c>AppCommon.AIModelFile</c> that holds the model files.</summary>
    const string ModelFolder = "sherpa-onnx-sense-voice-24-07-17";

    static OfflineRecognizer OR = default!;
    static VoiceActivityDetector VAD = default!;
    static VadModelConfig VADModelConfig = default!;

    /// <summary>
    /// Creates the shared SenseVoice recognizer and the shared Silero voice-activity detector.
    /// Must run before <see cref="RunTask"/>.
    /// </summary>
    /// <param name="speakerNumber">Not used by this method; kept so existing callers keep compiling.</param>
    /// <param name="threshold">Not used by this method; kept so existing callers keep compiling.</param>
    public static void Init(int speakerNumber = 0, double threshold = 0.6)
    {
        Console.WriteLine("初始化 SenseVoice");

        string modelDir = Path.Combine(AppCommon.AIModelFile, ModelFolder);

        OfflineRecognizerConfig config = new OfflineRecognizerConfig();
        // Input sample rate and the feature dimension the model was trained with.
        config.FeatConfig.SampleRate = 16000;
        config.FeatConfig.FeatureDim = 80;
        config.ModelConfig.Tokens = Path.Combine(modelDir, "tokens.txt");
        config.ModelConfig.SenseVoice.Model = Path.Combine(modelDir, "model.onnx");
        // 1 = apply inverse text normalization to the recognized text.
        config.ModelConfig.SenseVoice.UseInverseTextNormalization = 1;
        config.ModelConfig.ModelType = string.Empty;
        // Greedy search. The alternative "modified_beam_search" would additionally need
        // config.MaxActivePaths (number of active paths kept during the search).
        config.DecodingMethod = "greedy_search";
        // No hot-word file and no extra rule FSTs are configured.
        config.HotwordsFile = string.Empty;
        config.HotwordsScore = 1.5f;
        config.RuleFsts = string.Empty;
        config.ModelConfig.Debug = 0;
        OR = new OfflineRecognizer(config);

        VADModelConfig = new VadModelConfig();
        VADModelConfig.SileroVad.Model = Path.Combine(modelDir, "silero_VAD.onnx");
        VADModelConfig.Debug = 0;
        // Second argument is the detector's buffer size.
        VAD = new VoiceActivityDetector(VADModelConfig, 60);
    }

    /// <summary>
    /// Transcribes the task's WAV file into timed captions, stores them in the task's Redis
    /// hash under the "Captions" field and then queues the task on the ParsingSpeaker channel.
    /// </summary>
    /// <param name="task">Task identifier; the audio is read from <c>task.LocalPath()/{task}.wav</c>.</param>
    /// <returns>A task that completes once the captions are stored and the next stage is queued.</returns>
    /// <exception cref="FileNotFoundException">The WAV file of the task does not exist.</exception>
    public static async Task RunTask(string task)
    {
        var filePath = Path.Combine(task.LocalPath(), task + ".wav");
        if (!File.Exists(filePath))
        {
            throw new FileNotFoundException("task 音频路径未找到", filePath);
        }

        WaveReader reader = new WaveReader(filePath);
        float[] samples = reader.Samples;
        int windowSize = VADModelConfig.SileroVad.WindowSize;
        int sampleRate = VADModelConfig.SampleRate;
        // Samples past the last full window are not fed to the detector.
        int numIter = samples.Length / windowSize;

        // The detector is shared by all tasks: drop segments and offsets left over from a
        // previous (possibly failed) run so Start/End are relative to this file.
        VAD.Reset();

        var res = new List<SenseVoiceRes>(500);
        // One window buffer for the whole run; it is refilled on every iteration.
        float[] window = new float[windowSize];
        for (int i = 0; i < numIter; ++i)
        {
            Array.Copy(samples, i * windowSize, window, 0, windowSize);
            VAD.AcceptWaveform(window);
            if (VAD.IsSpeechDetected())
            {
                DrainSegments(sampleRate, res);
            }
        }

        // Flush, then transcribe whatever segments the detector still holds.
        VAD.Flush();
        DrainSegments(sampleRate, res);

        await RedisExpand.Redis.HMSetAsync(RedisExpandKey.Task(task), "Captions", res);
        RedisExpand.InsertChannel(Enum.RedisChannelEnum.ParsingSpeaker, task);
    }

    /// <summary>
    /// Transcribes and removes every segment currently queued in the shared detector,
    /// appending one caption per segment that produced non-empty text.
    /// </summary>
    /// <param name="sampleRate">Sample rate used to convert sample counts to seconds.</param>
    /// <param name="captions">List that receives the recognized captions.</param>
    static void DrainSegments(int sampleRate, List<SenseVoiceRes> captions)
    {
        while (!VAD.IsEmpty())
        {
            SpeechSegment segment = VAD.Front();
            float[] segmentSamples = segment.Samples;
            float startTime = segment.Start / (float)sampleRate;
            float duration = segmentSamples.Length / (float)sampleRate;

            // Dispose each stream as soon as its segment is decoded instead of waiting
            // for finalization.
            using (OfflineStream stream = OR.CreateStream())
            {
                stream.AcceptWaveform(sampleRate, segmentSamples);
                OR.Decode(stream);
                string text = stream.Result.Text;
                if (!string.IsNullOrEmpty(text))
                {
                    captions.Add(new SenseVoiceRes
                    {
                        Text = text,
                        Start = startTime,
                        End = startTime + duration
                    });
                }
            }

            VAD.Pop();
        }
    }
}
}

View File

@ -0,0 +1,29 @@
using Whisper.net;
namespace VideoAnalysisCore.AICore.Whisper
{
/// <summary>
/// One recognized caption: the text plus its start and end time.
/// </summary>
/// <remarks>
/// The public parameterless constructor is the compiler-generated one.
/// </remarks>
public class SenseVoiceRes
{
    /// <summary>
    /// Recognized text; an empty string until assigned.
    /// </summary>
    public string Text { get; set; } = "";

    /// <summary>
    /// Start time of the caption.
    /// </summary>
    public float Start { get; set; }

    /// <summary>
    /// End time of the caption.
    /// </summary>
    public float End { get; set; }
}
}

View File

@ -22,6 +22,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
/// <param name="threshold"></param> /// <param name="threshold"></param>
public static void Init(int speakerNumber = 0, double threshold = 0.6) public static void Init(int speakerNumber = 0, double threshold = 0.6)
{ {
Console.WriteLine("初始化 Speaker");
var config = new OfflineSpeakerDiarizationConfig(); var config = new OfflineSpeakerDiarizationConfig();
//Pyannote模型地址 //Pyannote模型地址
config.Segmentation.Pyannote.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-pyannote-segmentation-3-0", "model.onnx"); config.Segmentation.Pyannote.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-pyannote-segmentation-3-0", "model.onnx");
@ -72,15 +73,15 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
/// 总持续时间 /// 总持续时间
/// </summary> /// </summary>
[JsonIgnore] [JsonIgnore]
public decimal Total => End - Start; public float Total => End - Start;
/// <summary> /// <summary>
/// 开始时间 /// 开始时间
/// </summary> /// </summary>
public decimal Start { get; set; } public float Start { get; set; }
/// <summary> /// <summary>
/// 结束时间 /// 结束时间
/// </summary> /// </summary>
public decimal End { get; set; } public float End { get; set; }
/// <summary> /// <summary>
/// 讲话人索引 /// 讲话人索引
/// </summary> /// </summary>
@ -99,8 +100,8 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
/// <param name="sds"></param> /// <param name="sds"></param>
public OfflineSpeakerRes(OfflineSpeakerDiarizationSegment sds) public OfflineSpeakerRes(OfflineSpeakerDiarizationSegment sds)
{ {
Start = (decimal)sds.Start; Start = sds.Start;
End = (decimal)sds.End; End =sds.End;
SpeakerIndex = sds.Speaker; SpeakerIndex = sds.Speaker;
} }
} }

View File

@ -7,6 +7,10 @@ namespace VideoAnalysisCore.AICore.Whisper
/// </summary> /// </summary>
public class WhisperResDto public class WhisperResDto
{ {
public WhisperResDto()
{
}
/// <summary> /// <summary>
/// ///
/// </summary> /// </summary>
@ -20,16 +24,16 @@ namespace VideoAnalysisCore.AICore.Whisper
/// <summary> /// <summary>
/// 文本 /// 文本
/// </summary> /// </summary>
public string Text { get; } = string.Empty; public string Text { get; set; } = string.Empty;
/// <summary> /// <summary>
/// 开始时间 /// 开始时间
/// </summary> /// </summary>
public TimeSpan Start { get; } public TimeSpan Start { get; set; }
/// <summary> /// <summary>
/// 结束时间 /// 结束时间
/// </summary> /// </summary>
public TimeSpan End { get; } public TimeSpan End { get; set; }
} }
} }

View File

@ -84,6 +84,7 @@ namespace VideoAnalysisCore.Common
/// </summary> /// </summary>
public static void Init() public static void Init()
{ {
Console.WriteLine("初始化 redis");
Redis.Serialize = obj => System.Text.Json.JsonSerializer.Serialize(obj); Redis.Serialize = obj => System.Text.Json.JsonSerializer.Serialize(obj);
Redis.Deserialize = (json, type) => System.Text.Json.JsonSerializer.Deserialize(json, type); Redis.Deserialize = (json, type) => System.Text.Json.JsonSerializer.Deserialize(json, type);
InitChannel(); InitChannel();
@ -125,7 +126,7 @@ namespace VideoAnalysisCore.Common
(msg) => { TouchChannel(RedisChannelEnum.SeparateAudio, msg, FFMPGEHandle.Audio2WAV16KAsync); }); (msg) => { TouchChannel(RedisChannelEnum.SeparateAudio, msg, FFMPGEHandle.Audio2WAV16KAsync); });
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingCaptions), Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingCaptions),
(msg) => { TouchChannel(RedisChannelEnum.ParsingCaptions, msg, WhisperHandle.RunTask); }); (msg) => { TouchChannel(RedisChannelEnum.ParsingCaptions, msg, SenseVoice.RunTask); });
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingSpeaker), Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ParsingSpeaker),
(msg) => { TouchChannel(RedisChannelEnum.ParsingSpeaker, msg, Speaker.Run); }); (msg) => { TouchChannel(RedisChannelEnum.ParsingSpeaker, msg, Speaker.Run); });
Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ChatModelAnalysis), Redis.SubscribeList(RedisExpandKey.EnumKey(RedisChannelEnum.ChatModelAnalysis),

View File

@ -49,7 +49,7 @@
<PackageReference Include="Microsoft.Extensions.DependencyModel" Version="7.0.0" /> <PackageReference Include="Microsoft.Extensions.DependencyModel" Version="7.0.0" />
<PackageReference Include="Microsoft.Extensions.Http" Version="8.0.0" /> <PackageReference Include="Microsoft.Extensions.Http" Version="8.0.0" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" /> <PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
<PackageReference Include="org.k2fsa.sherpa.onnx" Version="1.10.28" /> <PackageReference Include="org.k2fsa.sherpa.onnx" Version="1.10.30" />
<PackageReference Include="SqlSugar.IOC" Version="2.0.0" /> <PackageReference Include="SqlSugar.IOC" Version="2.0.0" />
<PackageReference Include="SqlSugarCore" Version="5.1.4.170" /> <PackageReference Include="SqlSugarCore" Version="5.1.4.170" />
<PackageReference Include="Whisper.net" Version="1.5.0" /> <PackageReference Include="Whisper.net" Version="1.5.0" />