优化 sv语言转录函数[待测试]

This commit is contained in:
小肥羊 2025-03-18 12:01:03 +08:00
parent b5e174e683
commit 18af52484a
3 changed files with 97 additions and 143 deletions

View File

@ -69,16 +69,24 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek
|| s.Depth == 2)) || s.Depth == 2))
.Select(s => s.Name).ToArrayAsync(); .Select(s => s.Name).ToArrayAsync();
string title = taskInfo.MediaName; string title = taskInfo.MediaName;
var speakerArr = JsonSerializer.Deserialize<OfflineSpeakerRes[]>(taskInfo.Speaker);
var captionsArr = JsonSerializer.Deserialize<SenseVoiceRes[]>(taskInfo.Captions);
var fileNameResFormat = "{授课章节: string|null}"; var fileNameResFormat = "{授课章节: string|null}";
//var fileNamePostMessages = title +
// " 这是一堂课的标题,请你基于标题帮我分析出这堂课所讲授的内容与最恰当的授课章节(关联最贴切的章节,保留一个章节!)." +
// $"章节范围限定在[{string.Join(',', xkwKnows)}]范围内." +
// $"输出格式 json字符串 对象格式{fileNameResFormat}";
var fileNamePostMessages = title + var fileNamePostMessages = title +
" 这是一堂课的标题,请你基于标题帮我分析出这堂课所讲授的内容与最恰当的授课章节(关联最贴切的章节,保留一个章节!)." + " 这是一堂课的部分授课字幕,请你基于字幕内容帮我分析出这堂课所讲授的内容与最恰当的授课章节(关联最贴切的章节,保留一个章节!)." +
$"章节范围限定在[{string.Join(',', xkwKnows)}]范围内." + $"章节范围限定在[{string.Join(',', xkwKnows)}]范围内." +
$"输出格式 json字符串 对象格式{fileNameResFormat}"; $"输出格式 json字符串 对象格式{fileNameResFormat}";
var fileNameInfoRes = await ChatAsync<FileNameInfo> var fileNameInfoRes = await ChatAsync<FileNameInfo>
(task, fileNamePostMessages, null);//, "deepseek-chat"); (task, fileNamePostMessages, null);//, "deepseek-chat");
var speakerArr = JsonSerializer.Deserialize<OfflineSpeakerRes[]>(taskInfo.Speaker);
var captionsArr = JsonSerializer.Deserialize<SenseVoiceRes[]>(taskInfo.Captions);
var captions = ExpandFunction.GetSpeakerCaptions(captionsArr, speakerArr); var captions = ExpandFunction.GetSpeakerCaptions(captionsArr, speakerArr);
var maxVideoTime = captions?.TimeBase?.LastOrDefault()?.End ?? 0; var maxVideoTime = captions?.TimeBase?.LastOrDefault()?.End ?? 0;
var criteriaBuilder = new StringBuilder(); var criteriaBuilder = new StringBuilder();

View File

@ -17,7 +17,7 @@ using static System.Runtime.InteropServices.JavaScript.JSType;
namespace VideoAnalysisCore.AICore.SherpaOnnx namespace VideoAnalysisCore.AICore.SherpaOnnx
{ {
public class SenseVoice public static class SenseVoice
{ {
const string TransducerStr = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20"; const string TransducerStr = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20";
static OfflineRecognizer OR = default!; static OfflineRecognizer OR = default!;
@ -104,78 +104,10 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
/// <returns></returns> /// <returns></returns>
public static async Task<List<SenseVoiceRes>> RunTask(Stream s) public static async Task<List<SenseVoiceRes>> RunTask(Stream s)
{ {
if (OR is null)
Init();
if (s is null) if (s is null)
throw new Exception("音频路径 is null"); throw new Exception("音频路径 is null");
return await TaskHandle(new WaveReader(s));
WaveReader reader = new WaveReader(s);
int numSamples = reader.Samples.Length;
int windowSize = VADModelConfig.SileroVad.WindowSize;
int sampleRate = VADModelConfig.SampleRate;
int numIter = numSamples / windowSize;
var totalSecond = numSamples / (float)sampleRate;
var res = new List<SenseVoiceRes>(500);
//缓冲区大小
var VAD = new VoiceActivityDetector(VADModelConfig, 60);
//var VAD = new VoiceActivityDetector(VADModelConfig, 60);
for (int i = 0; i != numIter; ++i)
{
int start = i * windowSize;
float[] samples = new float[windowSize];
Array.Copy(reader.Samples, start, samples, 0, windowSize);
VAD.AcceptWaveform(samples);
//是否检测到语音
if (VAD.IsSpeechDetected())
{
while (!VAD.IsEmpty())
{
//获取最新的发言片段
SpeechSegment segment = VAD.Front();
float startTime = segment.Start / (float)sampleRate;
float duration = segment.Samples.Length / (float)sampleRate;
using OfflineStream stream = OR.CreateStream();
stream.AcceptWaveform(sampleRate, segment.Samples);
OR.Decode(stream);
if (!string.IsNullOrEmpty(stream.Result.Text))
{
res.Add(new()
{
Text = stream.Result.Text,
Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
});
}
VAD.Pop();
}
}
}
VAD.Flush();
while (!VAD.IsEmpty())
{
SpeechSegment segment = VAD.Front();
float startTime = segment.Start / (float)sampleRate;
float duration = segment.Samples.Length / (float)sampleRate;
OfflineStream stream = OR.CreateStream();
stream.AcceptWaveform(sampleRate, segment.Samples);
OR.Decode(stream);
if (!string.IsNullOrEmpty(stream.Result.Text))
{
res.Add(new()
{
Text = stream.Result.Text,
Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
});
}
VAD.Pop();
}
VAD.Reset();
return res;
} }
/// <summary> /// <summary>
@ -185,82 +117,57 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
/// <returns></returns> /// <returns></returns>
public static async Task RunTask(string task)
{
    // The audio for a task is expected at <task local path>/<task>.wav.
    var filePath = Path.Combine(task.LocalPath(), task + ".wav");
    if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
        throw new Exception("task 音频路径未找到");

    // The task id must be forwarded: TaskHandle only serialises the captions,
    // updates the VideoTask row and queues the ChatModelAnalysis channel when it
    // receives a non-empty task id, and ReadNext only reports progress for a
    // non-empty task id. Without it the transcription result is discarded.
    // TaskHandle initialises the recognizer itself, so no Init() call is needed here.
    await TaskHandle(new WaveReader(filePath), task);
}
WaveReader reader = new WaveReader(filePath); /// <summary>
/// 任务处理
/// </summary>
/// <param name="reader">Wave</param>
/// <param name="task">任务id [默认Null]</param>
/// <returns></returns>
/// <exception cref="Exception"></exception>
public static async Task<List<SenseVoiceRes>> TaskHandle(WaveReader reader, string? task = null)
{
if (OR is null)
Init();
int numSamples = reader.Samples.Length; int numSamples = reader.Samples.Length;
int windowSize = VADModelConfig.SileroVad.WindowSize; int windowSize = VADModelConfig.SileroVad.WindowSize;
int sampleRate = VADModelConfig.SampleRate; int sampleRate = VADModelConfig.SampleRate;
int numIter = numSamples / windowSize; int numIter = numSamples / windowSize;
var totalSecond = numSamples / (float)sampleRate; var totalSecond = numSamples / (float)sampleRate;
var res = new List<SenseVoiceRes>(500); var res = new List<SenseVoiceRes>(500);
var VAD = new VoiceActivityDetector(VADModelConfig, 60); using var VAD = new VoiceActivityDetector(VADModelConfig, 30);
for (int i = 0; i != numIter; ++i) for (int i = 0; i != numIter; ++i)
{ {
int start = i * windowSize; int start = i * windowSize;
float[] samples = new float[windowSize]; //float[] samples = new float[windowSize];
Array.Copy(reader.Samples, start, samples, 0, windowSize); //Array.Copy(reader.Samples, start, samples, 0, windowSize);
VAD.AcceptWaveform(samples); //VAD.AcceptWaveform(samples);
Memory<float> samples = new float[windowSize];
Memory<float> sourceSpan = reader.Samples.AsMemory(start, windowSize);
sourceSpan.CopyTo(samples);
VAD.AcceptWaveform(samples.ToArray());
//是否检测到语音 //是否检测到语音
if (VAD.IsSpeechDetected()) if (VAD.IsSpeechDetected())
{
while (!VAD.IsEmpty())
{ {
//获取最新的发言片段 //获取最新的发言片段
SpeechSegment segment = VAD.Front();
float startTime = segment.Start / (float)sampleRate;
float duration = segment.Samples.Length / (float)sampleRate;
OfflineStream stream = OR.CreateStream();
stream.AcceptWaveform(sampleRate, segment.Samples);
OR.Decode(stream);
if (!string.IsNullOrEmpty(stream.Result.Text))
{
res.Add(new()
{
Text = stream.Result.Text,
//Text = ExpandFunction.HandleFormula(stream.Result.Text),
Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
});
var progress = (float)(startTime + duration) / (totalSecond) * 100;
RedisExpand.SetTaskProgress(task, progress);
}
VAD.Pop();
}
}
}
VAD.Flush();
while (!VAD.IsEmpty()) while (!VAD.IsEmpty())
{ await VAD.ReadNext(res, totalSecond, task);
SpeechSegment segment = VAD.Front();
float startTime = segment.Start / (float)sampleRate;
float duration = segment.Samples.Length / (float)sampleRate;
OfflineStream stream = OR.CreateStream();
stream.AcceptWaveform(sampleRate, segment.Samples);
OR.Decode(stream);
if (!string.IsNullOrEmpty(stream.Result.Text))
{
res.Add(new()
{
Text = stream.Result.Text,
//Text = ExpandFunction.HandleFormula(stream.Result.Text),
Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
End = (float)Math.Round(startTime + duration, 2, MidpointRounding.AwayFromZero),
});
} }
VAD.Pop();
} }
// Flush BEFORE the final drain: Flush() is what turns the audio still buffered
// in the detector into a queued segment, so draining first and flushing last
// would leave the trailing utterance undecoded.
VAD.Flush();
while (!VAD.IsEmpty())
    await VAD.ReadNext(res, totalSecond, task);
//如果携带任务ID
if (!string.IsNullOrEmpty(task))
{
Console.WriteLine(DateTime.Now + "=> SenseVoice 字幕数量" + res.Count); Console.WriteLine(DateTime.Now + "=> SenseVoice 字幕数量" + res.Count);
var captionsStr = JsonSerializer.Serialize(res); var captionsStr = JsonSerializer.Serialize(res);
await DbScoped.Sugar await DbScoped.Sugar
.Updateable<VideoTask>() .Updateable<VideoTask>()
@ -273,7 +180,45 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx
RedisExpand.NewTask(); RedisExpand.NewTask();
RedisExpand.InsertChannel(RedisChannelEnum.ChatModelAnalysis, task); RedisExpand.InsertChannel(RedisChannelEnum.ChatModelAnalysis, task);
}
return res;
}
/// <summary>
/// Decodes the speech segment at the front of the VAD queue, appends the recognised
/// caption to <paramref name="res"/>, and pops that segment from the queue.
/// </summary>
/// <remarks>
/// The method body contains no awaits, so all work runs synchronously; the
/// <see cref="Task"/> return type is kept so existing callers can keep awaiting it.
/// Callers in this file only invoke it while <c>VAD.IsEmpty()</c> is false.
/// </remarks>
/// <param name="VAD">Detector whose front segment is consumed.</param>
/// <param name="res">List that receives the caption produced from the segment.</param>
/// <param name="totalSecond">Total audio length in seconds; used only for the progress percentage.</param>
/// <param name="task">Optional task id; when non-empty, progress is reported through <c>RedisExpand.SetTaskProgress</c>.</param>
/// <returns>A task representing the (synchronously executed) operation.</returns>
/// <exception cref="ArgumentNullException"><paramref name="VAD"/> or <paramref name="res"/> is null.</exception>
public static async Task ReadNext(this VoiceActivityDetector VAD, List<SenseVoiceRes> res, float totalSecond, string? task = null)
{
    ArgumentNullException.ThrowIfNull(VAD);
    ArgumentNullException.ThrowIfNull(res);

    // This is a public extension method, so it may be reached before any entry
    // point has initialised the shared recognizer.
    if (OR is null)
        Init();

    var segment = VAD.Front();
    var sampleRate = VADModelConfig.SampleRate;
    var sampleRateF = (float)sampleRate;
    float startTime = segment.Start / sampleRateF;
    float duration = segment.Samples.Length / sampleRateF;

    using var stream = OR.CreateStream();
    stream.AcceptWaveform(sampleRate, segment.Samples);
    OR.Decode(stream);

    // Read the result once instead of on every use.
    var rawText = stream.Result.Text;
    var trimmed = rawText?.Trim();

    // Skip results that carry no speech content:
    //  - null / empty / whitespace-only text;
    //  - a single character that is either in the Halfwidth and Fullwidth Forms
    //    block (the original filter) or any other punctuation mark, e.g. '。'
    //    (U+3002), which lies outside that block.
    bool isBlank = string.IsNullOrEmpty(trimmed);
    bool isLoneMark = trimmed is { Length: 1 }
        && (trimmed[0] is >= '\uFF00' and <= '\uFFEF' || char.IsPunctuation(trimmed[0]));

    if (!isBlank && !isLoneMark)
    {
        float endTime = startTime + duration;
        res.Add(new()
        {
            Text = rawText!,
            Start = (float)Math.Round(startTime, 2, MidpointRounding.AwayFromZero),
            End = (float)Math.Round(endTime, 2, MidpointRounding.AwayFromZero),
        });

        // Guard the division so a zero-length total never yields NaN/Infinity.
        if (!string.IsNullOrEmpty(task) && totalSecond > 0)
            RedisExpand.SetTaskProgress(task, (double)(startTime + duration) / (totalSecond) * 100);
    }

    // The segment is consumed whether or not it produced a caption.
    VAD.Pop();
}
} }
} }

View File

@ -9,6 +9,7 @@ using VideoAnalysisCore.Model;
using System.Text.Json; using System.Text.Json;
using VideoAnalysisCore.Model.Enum; using VideoAnalysisCore.Model.Enum;
using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.DependencyInjection;
using UserCenter.Model.Enum;
namespace VideoAnalysisCore.AICore.SherpaOnnx namespace VideoAnalysisCore.AICore.SherpaOnnx
{ {