diff --git a/VideoAnalysis/WebUI/src/api/videoTask.ts b/VideoAnalysis/WebUI/src/api/videoTask.ts index 6a8bdf3..293756b 100644 --- a/VideoAnalysis/WebUI/src/api/videoTask.ts +++ b/VideoAnalysis/WebUI/src/api/videoTask.ts @@ -17,7 +17,7 @@ export interface VideoKnowRes { startTime: number; } -export interface SenseVoiceRes { + export interface SenseVoiceRes { text: string; start: number; end: number; @@ -72,4 +72,4 @@ export const ErrorTaskList = (data: any) => { return http.request("post", "/api/VideoTask/ErrorTaskList", { data }); -}; \ No newline at end of file +}; diff --git a/VideoAnalysisCore/AICore/GPT/ChatGPT/ChatGPTClient.cs b/VideoAnalysisCore/AICore/GPT/ChatGPT/ChatGPTClient.cs index dd646eb..d79afde 100644 --- a/VideoAnalysisCore/AICore/GPT/ChatGPT/ChatGPTClient.cs +++ b/VideoAnalysisCore/AICore/GPT/ChatGPT/ChatGPTClient.cs @@ -44,7 +44,7 @@ namespace VideoAnalysisCore.AICore.GPT.ChatGPT /// 最大token 不设置默认最大值 16000/8000 /// /// - public async Task ChatAsync(string task, string postMessages, string title, string model = null, int max_tokens = 8000) + public override async Task ChatAsync(string task, string postMessages, string title, string model = null, int max_tokens = 16000) { Message[] messageArr = [ new Message(postMessages,"user"), @@ -56,7 +56,7 @@ namespace VideoAnalysisCore.AICore.GPT.ChatGPT taskId = task, model = model, title = title, - max_tokens =8000, + max_tokens = max_tokens, stream = true, temperature = 0.2f, messages = messageArr diff --git a/VideoAnalysisCore/AICore/GPT/DeepSeek/DeepSeekGPTClient.cs b/VideoAnalysisCore/AICore/GPT/DeepSeek/DeepSeekGPTClient.cs index 5be5b2d..4795118 100644 --- a/VideoAnalysisCore/AICore/GPT/DeepSeek/DeepSeekGPTClient.cs +++ b/VideoAnalysisCore/AICore/GPT/DeepSeek/DeepSeekGPTClient.cs @@ -42,7 +42,7 @@ namespace VideoAnalysisCore.AICore.GPT.ChatGPT /// 最大token 不设置默认最大值 16000/8000 /// /// - public async Task ChatAsync(string task, string postMessages, string title, string model =null, int max_tokens = 8000) + public override async Task ChatAsync(string task, string postMessages, string title, string model =null, int max_tokens = 16000) { Message[] messageArr = [ new Message(postMessages,"user"), diff --git a/VideoAnalysisCore/AICore/GPT/GPTClient.cs b/VideoAnalysisCore/AICore/GPT/GPTClient.cs index 2b1946d..b2ac97e 100644 --- a/VideoAnalysisCore/AICore/GPT/GPTClient.cs +++ b/VideoAnalysisCore/AICore/GPT/GPTClient.cs @@ -127,9 +127,12 @@ namespace VideoAnalysisCore.AICore.GPT messageBuilder.Append(str); if (!string.IsNullOrEmpty(strReasoning)) messageBuilder1.Append(strReasoning); - var steamCount = messageBuilder.Length + messageBuilder1.Length; - if (++threshold % 30 == 0) - redisManager.SetTaskProgress(chatReq.taskId, "steam=>" + steamCount); + if (chatReq.title != "优化字幕") + { + var steamCount = messageBuilder.Length + messageBuilder1.Length; + if (++threshold % 30 == 0) + redisManager.SetTaskProgress(chatReq.taskId, "steam=>" + steamCount); + } } catch (Exception e) { @@ -172,13 +175,14 @@ namespace VideoAnalysisCore.AICore.GPT redisCached[1] = new object[] { chatResp.Value.res, chatResp.Value.u, chatResp.Value.reasoning }; redisManager.SetTaskGPTCached(chatRep.taskId, time, redisCached); } - chatResContent = chatResContent?.ExtractJsonStrings()?.FirstOrDefault(); chatResContent = chatResContent?.Replace("\n", ""); chatResContent = chatResContent?.Replace("```json", ""); chatResContent = chatResContent?.Replace("```", ""); chatResContent = chatResContent?.Replace("}{", "},{"); chatResContent = chatResContent?.Replace("}|{", "},{"); - chatResContent = chatResContent?.Trim(); + chatResContent = chatResContent?.Trim(); + chatResContent = chatResContent?.ExtractJsonStrings()?.FirstOrDefault(); + if (string.IsNullOrEmpty(chatResContent)) throw new Exception($"GPT返回结果无有效JSON =>{chatResp?.res}"); @@ -251,5 +255,25 @@ namespace VideoAnalysisCore.AICore.GPT } throw errorMSG.Last(s => s != null); } + + + + + /// + /// 请求AI + /// + /// 返回JSON类型 + /// 任务id + /// 提示词 + /// 任务类型 + /// GPT版本 + /// 最大token 不设置默认最大值 16000 + /// + /// + public virtual Task ChatAsync(string task, string postMessages, string title, + string model = null, int max_tokens = 16000) + { + throw new Exception("需要实现"); + } } } \ No newline at end of file diff --git a/VideoAnalysisCore/AICore/GPT/GTP_Analysis_1.cs b/VideoAnalysisCore/AICore/GPT/GTP_Analysis_1.cs index cad5718..e313d6d 100644 --- a/VideoAnalysisCore/AICore/GPT/GTP_Analysis_1.cs +++ b/VideoAnalysisCore/AICore/GPT/GTP_Analysis_1.cs @@ -81,21 +81,22 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek var thems = questionRes.Adapt().ToJson(); var checkResFormat1 = """[{"StartTime":开始秒(number),"KnowPoint":知识点名称(string),"KnowPointId":知识点Id(string)}]"""; var knowMessages = - $"我针对{taskInfo.Subject}课堂授课视频分析出了视频的授课阶段片段。" + - $"现在需要你通过每个片段的内容总结来分配正确的知识点(单个片段允许多个知识点用逗号','分割)。" + - $"这是我的分段 {thems}。" + - $"课堂内容与{sections}章节相关" + - $"最后请确保分配的知识点是用户提供的,并且一定正确合理!" + - $"返回的片段数量与传入片段数量一致!" + - $"输出内容只返回json格式({checkResFormat1})" + - $" 格式 (方法点Id|方法点名称) " + - $"提供的知识点名称({knows})。"; + $"我针对{taskInfo.Subject}课堂授课视频分析出了视频的授课阶段片段。\n" + + $"现在需要你通过每个片段的内容总结来分配正确的知识点(单个片段允许多个知识点用逗号','分割)。\n" + + $"这是我的分段 {thems}。\n" + + $"课堂内容与{sections}章节相关\n" + + $"最后请确保分配的知识点是用户提供的,并且一定正确合理!\n" + + $"返回的片段数量与传入片段数量一致(硬性条件)!\n" + + $"输出内容只返回json格式({checkResFormat1})\n" + + $" 格式 (方法点Id|方法点名称) \n" + + $"提供的`知识点名称({knows})。\n"; await redisManager.AddTaskLog(taskInfo.Id, "=>2.开始分析视频内容知识点"); VideoKnowRes[] konwRes; - var knowOK = false; - for (int i = 0; i < 4; i++) + var knowOK = false; + var chatClentArr = new GPTClient[] { chatGPTClient, geminiClient,deepSeekClient }; + for (int i = 0; i < 3; i++) { - konwRes = await chatGPTClient.ChatAsync(taskInfo.Id.ToString(), knowMessages, "知识点"); + konwRes = await chatClentArr[i].ChatAsync(taskInfo.Id.ToString(), knowMessages, "知识点"); // 分析结果的片段数量与预期不匹配 if (questionRes.Length != konwRes.Length) continue; for (int xi = 0; xi < konwRes.Count(); xi++) @@ -103,14 +104,12 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek knowOK = true; break; } - if (!knowOK) + if (!knowOK) { - await redisManager.AddTaskLog(taskInfo.Id,"GPT未能分析出有效的分段的知识点"); + await redisManager.AddTaskLog(taskInfo.Id, "GPT未能分析出有效的分段的知识点"); throw new Exception("GPT未能分析出有效的分段的知识点"); } - - //todo 未包含的知识点片段 如何处理 return questionRes .Where(s => !string.IsNullOrEmpty(s.KnowPoint)) .SelectMany( @@ -190,7 +189,7 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek var pptFormat = taskInfo.VideoType == AttachmentsInfoType.复习 ? "这堂课是习题课,所讲解内容几乎都是试题。" : string.Empty; - var checkResFormat = """{"Score":打分(number),"Evaluation":评价以及扣分原因(string)""";//,"Data":优化后的分段(array)}"""; + var checkResFormat = """{"Score":打分(number),"Evaluation":扣分原因/改进建议(string)""";//,"Data":优化后的分段(array)}"""; var checkMessage = $""" 请你担任一位专业的视频内容分析教研老师,擅长评估视频内容的结构和逻辑流暢度。 @@ -228,9 +227,11 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek var newCaptionsList = new List(captionsArr.Length); var spanCount = 75; var totalCount = captionsArr.Length / spanCount + 1; + await redisManager.AddTaskLog(taskInfo.Id, $"=>字幕优化"); + var chatClentArr = new GPTClient[] { deepSeekClient,chatGPTClient, geminiClient }; await Parallel.ForAsync(0, totalCount, - new ParallelOptions() { MaxDegreeOfParallelism =15 }, + new ParallelOptions() { MaxDegreeOfParallelism = 1 }, async (s, c) => { var cArr = captionsArr @@ -242,24 +243,33 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek var nowCaptionStr = cStrArr.ToJson(); var resFormat = """[string(修改结果)]"""; var postMessages = - $"角色设定:你是一位专业的中国{subject}学科专家,负责校对关于{sections}内容的课堂教学字幕。\n\n" + + $"角色设定:你是一位专业的中国{subject}学科专家,负责校对关于{sections}内容的课堂教学字幕。\n" + $"任务描述:\n" + $"请根据上下文逻辑,对输入的语音识别(STT)字幕进行深度优化。具体要求如下:\n" + $"1. 逻辑纠错:结合{subject}学科背景,利用前后文语义修正所有错误词汇。不仅要修正同音错别词(如:树列改为数列),还要修正因识别模糊导致的语义断裂或学科术语错误。\n" + $"2. 断句与标点:优化字幕的标点符号,并根据老师说话的语感和学科逻辑重新调整断句位置。确保每一条字幕在学术表达上自然、通顺,修复由于语音停顿造成的断句不当。\n" + - $"3. 公式规范:将字幕中提到的数学或科学公式统一转化为规范的 LaTeX 格式。\n\n" + + $"3. 公式规范:将字幕中提到的数学或科学公式统一转化为规范的 LaTeX 格式(使用$包裹公式,注意严格遵守Json格式的转义符号)。\n" + $"强制约束:\n" + $"- 数量对齐:输出的字幕条数(Array Length)必须与输入的字幕条数完全一致,严禁合并、拆分或删除任何条目。\n" + $"- 纯净返回:只允许返回 JSON 格式的字符串,严禁包含任何前言、后缀或解释性文字。\n" + - $"- 数据格式:JSON 结构必须严格符合:{resFormat}\n\n" + + $"- 数据格式:JSON 结构必须严格符合:{resFormat}\n" + $"待优化字幕内容:\n" + - $"{nowCaptionStr}\n\n" + + $"{nowCaptionStr}\n" + $"最终核对:请确保输出 JSON 中包含的字幕条数与输入的字幕条数完全对应。"; - var resData = await deepSeekClient.ChatAsync(taskInfo.Id.ToString(), postMessages, "优化字幕"); + string[]? resData = null; + for (int i = 0; i < 3; i++) + { + resData = await chatClentArr[i].ChatAsync(taskInfo.Id.ToString(), postMessages, "优化字幕", ChatGPTType.Deepseek_Chat, 8000); + if (resData.Count() == cArr.Count()) + break; + else + await redisManager.AddTaskLog(taskInfo.Id, $"=>字幕优化 分段{s} AI结果数量不匹配 重试{i}"); + } + if (resData.Count() != cArr.Count()) { resData = cStrArr.ToArray(); - await redisManager.AddTaskLog(taskInfo.Id, $"=>字幕优化 分段{s} AI结果数量不匹配,采用原始值"); + await redisManager.AddTaskLog(taskInfo.Id, $"=>字幕优化 分段{s} AI结果数量不匹配 采用原始值"); } newCaptionsList.AddRange(resData.Select((text, i) => new SenseVoiceRes() { @@ -290,33 +300,34 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek { try { - var keyFrameArr = string.IsNullOrEmpty(taskInfo?.PPTVideoCode) || string.IsNullOrEmpty(taskInfo?.PPTKeyFrame) - ? $"" - : $"初步划分阶段:授课PPT发生了变化的时间是{taskInfo.PPTKeyFrame},基于PPT变化时间点,将字幕内容分割成时间段。每个时间段的起始和结束应接近这些时间点(例如,以时间点为中心,扩展至内容自然过渡处)。"; + //分段超长问题,评分优化如何处理 + var keyFrameStr = string.IsNullOrEmpty(taskInfo?.PPTVideoCode) || string.IsNullOrEmpty(taskInfo?.PPTKeyFrame) + ? $"请分析授课中字幕描述的知识内容,然后基于视频整体知识点讲解提炼出不同的阶段以便对老师上课内容切片提取为知识库,所以请确保阶段的内容准确性" + : $"授课中老师的PPT在这些时间段内进行了切换{taskInfo.PPTKeyFrame},理应这些时间段内的讲述内容也发生了变化,请你基于PPT变化时间点结合字幕描述的知识内容提炼出不同的切片。每个阶段的起始和结束应接近这些时间点(例如,以时间点为中心,扩展至内容自然过渡处)。"; var resFormat = """[{"StartTime":开始秒(number),"EndTime":结束秒(number),"Stage":阶段(string),"Theme":主题(string),"Content":内容总结(string)}]"""; - + var reviewStr = taskInfo?.VideoType == AttachmentsInfoType.复习 + ? $"但本堂课是习题课,所以大部分阶段是不同的例题讲解内容。\n" + : string.Empty; var postMessages = string.Empty; - postMessages = - $"请通过视频字幕内容分析出视频中课堂的授课阶段。" + - $"课堂内容与{taskInfo.Subject}学科下的{sections}章节相关。" + - $"完整的课堂标准流程包含以下5个阶段:课程引入/新知讲解/例题精讲/课堂练习/知识总结。" + - (taskInfo?.VideoType == AttachmentsInfoType.复习 - ? $"但本堂课是习题课,所以大部分阶段是不同的例题讲解内容。" - : string.Empty) + - $"{keyFrameArr}" + - $"内容分析:对每个时间段,提取主要讲解内容:识别关键词(如“例题”“证明”“练习”“总结”)和内容结构。" + - $"判断阶段类型:如果内容以解题为主,归类为“例题精讲”;如果涉及新知识讲解,归类为“新知讲解”;以此类推。" + - $"内容总结:简述该阶段的核心讲解内容70~200字,确保内容与阶段时间内授课内容符合。" + - $"阶段主题:基于内容总结,提炼一个恰当的主题(例如,“柯西不等式的基本应用”)。" + - $"输出要求:确保阶段划分合理、无重叠,且时长符合要求" + - $"输出格式要求:内容只返回json格式({resFormat})" + - $"字幕格式(开始秒:内容|下一段字幕).以下是包含时间的视频字幕文本。" + + $"请通过视频字幕内容分析出视频中课堂的授课知识点切片\n" + + $"阶段的细分程度到某个知识点的讲解/认识/例题/总结\n" + + $"课堂内容与{taskInfo.Subject}学科下的{sections}章节相关。\n" + + $"完整的课堂标准流程包含以下5个阶段:课程引入/新知讲解/例题精讲/课堂练习/知识总结。\n" + + reviewStr + + $"初步划分阶段:{keyFrameStr}\n" + + $"内容分析:对每个时间段,提取主要讲解内容:识别关键词(如“例题”“证明”“练习”“总结”)和内容结构。\n" + + $"判断阶段类型:如果内容以解题为主,归类为“例题精讲”;如果涉及新知识讲解,归类为“新知讲解”;以此类推。\n" + + $"内容总结:简述该阶段的核心讲解内容70~200字,确保内容与阶段时间内授课内容符合。\n" + + $"阶段主题:基于内容总结,提炼一个恰当的主题(例如,“柯西不等式的基本应用”)。\n" + + $"输出要求:确保阶段划分合理、无` 重叠,且时长符合要求\n" + + $"输出格式要求:内容只返回json格式({resFormat})\n" + + $"字幕格式(开始秒:内容|下一段字幕).以下是包含时间的视频字幕文本。\n" + $"字幕列表 {captions.Captions} 字幕结束!"; - - await redisManager.AddTaskLog(taskInfo.Id, $"开始分析视频内容 {tryCount}"); + + await redisManager.AddTaskLog(taskInfo.Id, $"开始分析视频内容 {tryCount}"); //return await chatGPTClient.ChatAsync(taskInfo.Id.ToString(), postMessages, "分析字幕"); - var res = await geminiClient.ChatAsync(taskInfo.Id.ToString(), postMessages, "分析字幕"); + var res = await geminiClient.ChatAsync(taskInfo.Id.ToString(), postMessages, "分析字幕"); //var r2 = await chatClient.ChatAsync(taskInfo.Id.ToString(), postMessages, "分析字幕"); return res; } @@ -328,7 +339,7 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek return null; } - + @@ -402,7 +413,7 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek if (resData is null || resData.Count() == 0) break; foreach (var q in resData) - { + { var TopicId = YitIdHelper.NextId(); foreach (var qt in q.QuestionArr) { @@ -442,7 +453,7 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek } } } - insertData = insertData.GroupBy(s => string.Join("", Regex.Matches(s.StartTime+s.TopicStem+s.Question, "[\u4e00-\u9fa5a-zA-Z0-9]+"))) + insertData = insertData.GroupBy(s => string.Join("", Regex.Matches(s.StartTime + s.TopicStem + s.Question, "[\u4e00-\u9fa5a-zA-Z0-9]+"))) .Select(s => s.First()).ToList(); if (insertData == null || insertData.Count == 0 || insertQuestionKonw.Count == 0) return null; @@ -503,7 +514,7 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek } catch (Exception) { - throw new Exception("没有对应的子知识点=>" + sections+" "+ kInfo?.Name); + throw new Exception("没有对应的子知识点=>" + sections + " " + kInfo?.Name); } //AI优化字幕 @@ -523,17 +534,18 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek var insertData = await GetVideoKnow(questionRes, taskInfo, sections, knowledgeInfos);//ChatGPT //校验结果质量 var checkRes = await VerifySpanQuality(questionRes, taskInfo, captions, sections, Course_Id); - - await redisManager.AddTaskLog(taskInfo.Id, $"=>课堂内容AI分析结果 得分=>{checkRes.Score} "); - await redisManager.AddTaskLog(taskInfo.Id, checkRes.Evaluation); - if (checkRes != null && checkRes.Score >= 80) + await redisManager.AddTaskLog(taskInfo.Id, $"=>课堂内容AI分析结果 得分=>{checkRes.Score} "); + await redisManager.AddTaskLog(taskInfo.Id, checkRes.Evaluation); + + if (checkRes != null && checkRes.Score >= 85) { //写入知识点 await videoKonwPointDB.DeleteAsync(s => s.VideoTaskId == taskInfo.Id); await videoKonwPointDB.InsertRangeAsync(insertData); break; - }else + } + else await redisManager.AddTaskLog(taskInfo.Id, $"=>课堂内容AI分析结果不合格!即将重试 剩余次数{tryCount}"); if (questionRes.Any(s => s.KeepTime < 30)) { @@ -596,7 +608,7 @@ namespace VideoAnalysisCore.AICore.GPT.DeepSeek var taskInfo = await videoTaskDB.CopyNew().AsQueryable() .Where(s => s.Id == taskId) .FirstAsync(); - if (taskInfo.VideoType != null&& taskInfo.VideoType!=AttachmentsInfoType.无) + if (taskInfo.VideoType != null && taskInfo.VideoType != AttachmentsInfoType.无) return; var subject = taskInfo.Subject.ToString(); var Course_Id = taskInfo.CourseId; diff --git a/VideoAnalysisCore/AICore/GPT/Gemini/GeminiGPTClient.cs b/VideoAnalysisCore/AICore/GPT/Gemini/GeminiGPTClient.cs index 2dd8c7f..1c50699 100644 --- a/VideoAnalysisCore/AICore/GPT/Gemini/GeminiGPTClient.cs +++ b/VideoAnalysisCore/AICore/GPT/Gemini/GeminiGPTClient.cs @@ -41,7 +41,8 @@ namespace VideoAnalysisCore.AICore.GPT.ChatGPT /// 最大token 不设置默认最大值 16000/8000 /// /// - public async Task ChatAsync(string task, string postMessages, string title, string model = null, int max_tokens = 8000) + public override async Task ChatAsync(string task, string postMessages, string title, + string model = null, int max_tokens = 16000) { Message[] messageArr = [ new Message(postMessages,"user"), @@ -53,7 +54,7 @@ namespace VideoAnalysisCore.AICore.GPT.ChatGPT taskId = task, title=title, model = model, - max_tokens =12000, + max_tokens = max_tokens, stream = true, temperature = 0.2f, messages = messageArr diff --git a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs index 49daf94..14f74e8 100644 --- a/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs +++ b/VideoAnalysisCore/AICore/SherpaOnnx/SenseVoice.cs @@ -63,8 +63,11 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx config.ModelConfig.Tokens = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "tokens.txt"); //SenseVoice 模型 config.ModelConfig.SenseVoice.Model = Path.Combine(AppCommon.AIModelFile, "sherpa-onnx-sense-voice-24-07-17", "model.onnx"); - //1 使用逆文本规范化处理感官语音。 + //1 使用逆文本规范化处理感官语音 [控制标点符号生成]。 config.ModelConfig.SenseVoice.UseInverseTextNormalization = 1; + //反转文本规范化规则 fst 的路径 + //config.RuleFsts = Path.Combine(AppCommon.AIModelFile, "itn_subject_sx.fst"); + config.ModelConfig.SenseVoice.Language = "zh"; //模型类型 config.ModelConfig.ModelType = string.Empty; @@ -106,8 +109,6 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx //} #endregion - //反转文本规范化规则 fst 的路径 - config.RuleFsts = Path.Combine(AppCommon.AIModelFile, "itn_subject_sx.fst"); #if DEBUG config.ModelConfig.Debug = 1; @@ -162,7 +163,7 @@ namespace VideoAnalysisCore.AICore.SherpaOnnx int numIter = numSamples / windowSize; var totalSecond = numSamples / (float)sampleRate; var res = new List(500); - using var VAD = new VoiceActivityDetector(VADModelConfig, 30); + using var VAD = new VoiceActivityDetector(VADModelConfig, bufferSizeInSeconds: 20); for (int i = 0; i != numIter; ++i) { int start = i * windowSize; diff --git a/VideoAnalysisCore/Common/JsonExtractor.cs b/VideoAnalysisCore/Common/JsonExtractor.cs index f0bd2aa..4c2d483 100644 --- a/VideoAnalysisCore/Common/JsonExtractor.cs +++ b/VideoAnalysisCore/Common/JsonExtractor.cs @@ -1,84 +1,167 @@ using System; using System.Collections.Generic; +using System.Text; using System.Text.Json; +using System.Text.RegularExpressions; using static System.Runtime.InteropServices.JavaScript.JSType; namespace VideoAnalysisCore.Common { public static class JsonExtractor { + /// + /// 修复字符串中不规范的反斜杠转义,使其符合 JSON 规范。 + /// 特别适用于包含 LaTeX 公式(如 \overrightarrow, \unit)的非标准 JSON 数据。 + /// + public static string ToSafeJsonString(this string json) + { + if (string.IsNullOrEmpty(json)) return json; + + // 预分配稍大一点的空间,避免频繁扩容 + StringBuilder sb = new StringBuilder(json.Length + (json.Length / 10)); + int i = 0; + int len = json.Length; + + while (i < len) + { + char c = json[i]; + + if (c == '\\') + { + int start = i; + while (i < len && json[i] == '\\') + { + i++; + } + int count = i - start; + + // 只有奇数个反斜杠才需要检查“尾巴”是否合法 + if (count % 2 != 0) + { + // 检查这最后一个反斜杠后面跟的是不是合法的 JSON 转义字符 + if (i >= len || !IsValidJsonEscape(json, i)) + { + // 非法转义,补齐它 + count++; + } + } + + // 性能优化:直接添加指定数量的字符 + sb.Append('\\', count); + } + else + { + sb.Append(c); + i++; + } + } + + return sb.ToString(); + } + + private static bool IsValidJsonEscape(string text, int nextCharIndex) + { + char nextChar = text[nextCharIndex]; + + // 标准 JSON 简单转义 + if (nextChar == '"' || nextChar == '\\' || nextChar == '/' || + nextChar == 'b' || nextChar == 'f' || nextChar == 'n' || + nextChar == 'r' || nextChar == 't') + return true; + + // Unicode 转义检查: \uXXXX + if (nextChar == 'u') + { + if (nextCharIndex + 4 < text.Length) + { + for (int k = 1; k <= 4; k++) + { + char hex = text[nextCharIndex + k]; + bool isHex = (hex >= '0' && hex <= '9') || + (hex >= 'a' && hex <= 'f') || + (hex >= 'A' && hex <= 'F'); + if (!isHex) return false; + } + return true; + } + return false; + } + + return false; + } /// /// 提取json字符串 /// /// /// - public static List ExtractJsonStrings(this string input) - { - var results = new List(); - if (string.IsNullOrWhiteSpace(input)) return results; - - int braceCount = 0; - int bracketCount = 0; - int startIndex = -1; - bool inString = false; - bool isEscaped = false; - - for (int i = 0; i ExtractJsonStrings(this string input) { - char c = input[i]; + input = input.ToSafeJsonString(); + var results = new List(); + if (string.IsNullOrWhiteSpace(input)) return results; - // 1. 处理转义字符 (例如 \") - if (isEscaped) + int braceCount = 0; + int bracketCount = 0; + int startIndex = -1; + bool inString = false; + bool isEscaped = false; + + for (int i = 0; i < input.Length; i++) { - isEscaped = false; - continue; + char c = input[i]; + + // 1. 处理转义字符 (例如 \") + if (isEscaped) + { + isEscaped = false; + continue; + } + + if (c == '\\') + { + isEscaped = true; + continue; + } + + // 2. 处理字符串边界 + if (c == '"') + { + inString = !inString; + continue; + } + + // 3. 如果在字符串内,忽略括号逻辑 + if (inString) continue; + + // 4. 处理 JSON 对象和数组的开始 + if (c == '{' || c == '[') + { + if (braceCount == 0 && bracketCount == 0) + { + startIndex = i; + } + if (c == '{') braceCount++; + else bracketCount++; + } + // 5. 处理 JSON 对象和数组的结束 + else if (c == '}' || c == ']') + { + if (c == '}') braceCount--; + else bracketCount--; + + if (braceCount == 0 && bracketCount == 0 && startIndex != -1) + { + string potentialJson = input.Substring(startIndex, i - startIndex + 1); + if (IsValidJson(potentialJson)) + { + results.Add(potentialJson); + } + startIndex = -1; + } + } } - - if (c == '\\') - { - isEscaped = true; - continue; - } - -// 2. 处理字符串边界 -if (c == '"') -{ - inString = !inString; - continue; -} - -// 3. 如果在字符串内,忽略括号逻辑 -if (inString) continue; - -// 4. 处理 JSON 对象和数组的开始 -if (c == '{' || c == '[') -{ - if (braceCount == 0 && bracketCount == 0) - { - startIndex = i; - } - if (c == '{') braceCount++; - else bracketCount++; -} -// 5. 处理 JSON 对象和数组的结束 -else if (c == '}' || c == ']') -{ - if (c == '}') braceCount--; - else bracketCount--; - - if (braceCount == 0 && bracketCount == 0 && startIndex != -1) - { - string potentialJson = input.Substring(startIndex, i - startIndex + 1); - if (IsValidJson(potentialJson)) - { - results.Add(potentialJson); + return results; } - startIndex = -1; - } -} - } - return results; - } public static bool IsValidJson(string candidate) { @@ -89,7 +172,7 @@ else if (c == '}' || c == ']') JsonDocument.Parse(candidate); return true; } - catch + catch( Exception e) { return false; }