PhysicsCorrection/scripts/test_urls.py

148 lines
6.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
诊断脚本:验证图片 URL 和 Word 文档 URL 是否可访问
"""
import sys
import urllib.error
import urllib.request
from typing import Dict, List, Optional, Tuple

import requests
def check_url(
    url: str,
    headers: Optional[Dict[str, str]] = None,
    timeout: int = 10,
) -> Tuple[bool, str, int]:
    """Fetch a URL and report whether it serves usable content.

    The URL is opened with ``urllib.request.urlopen`` and only the first
    100 bytes of the body are read. A response whose preview looks like an
    HTML page is treated as a failure (an error page served in place of
    the expected image or document); anything else is classified by its
    ``Content-Type`` header.

    Args:
        url: URL to check.
        headers: HTTP request headers. When omitted, a browser-like
            ``User-Agent`` and ``Accept: */*`` are sent.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        A ``(is_valid, message, status_code)`` tuple. ``message`` names the
        detected content type on success and the failure reason otherwise.
        ``status_code`` is 0 whenever no HTTP status is available (network
        error, unexpected exception, or a non-HTTP scheme).
    """
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': '*/*',
        }

    # Only the network I/O lives inside the try block; classification of the
    # response happens afterwards on plain local values.
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            # getcode() is None for non-HTTP schemes; use 0 so the return
            # value stays an int, matching the error paths below.
            status_code = response.getcode() or 0
            content_type = response.headers.get('Content-Type', '')
            # Lower-case once: the first 100 bytes are enough to spot an
            # HTML error page.
            preview = response.read(100).lower()
    except urllib.error.HTTPError as e:
        reasons = {
            404: "404 Not Found - 资源不存在",
            403: "403 Forbidden - 禁止访问",
            401: "401 Unauthorized - 需要认证",
        }
        return (False, reasons.get(e.code, f"HTTP 错误 {e.code}"), e.code)
    except urllib.error.URLError as e:
        return (False, f"网络错误: {str(e)[:100]}", 0)
    except Exception as e:
        # Top-level boundary of a diagnostic tool: report instead of crashing.
        return (False, f"未知错误: {str(e)[:100]}", 0)

    # An HTML body where an image/document was expected is an error page.
    if b'<html' in preview or b'<!doctype' in preview:
        if b'404' in preview:
            return (False, "返回 404 错误页面", status_code)
        if b'403' in preview:
            return (False, "返回 403 禁止访问页面", status_code)
        return (False, f"返回 HTML 页面(状态码: {status_code})", status_code)

    # MIME types are case-insensitive, so compare on a lower-cased copy but
    # report the header exactly as the server sent it.
    media_type = content_type.lower()
    word_types = (
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/msword',
    )
    if 'image' in media_type:
        return (True, f"图片 - {content_type}", status_code)
    if any(word_type in media_type for word_type in word_types):
        return (True, f"Word 文档 - {content_type}", status_code)
    return (True, f"其他类型 - {content_type}", status_code)
def main():
    """Check the hard-coded image and Word document URLs and print a report.

    Each URL is probed with ``check_url``. The process exits with status 1
    when any URL (image or Word document) is not accessible, and with
    status 0 when all of them are.
    """
    # URLs under test
    image_urls = [
        "https://dpcclass.oss-cn-beijing.aliyuncs.com/umsupload/2026/03/18/69baa4f5-4826-4901-00e1-c6e66f02947f.jpg?x-oss-process=image/resize,w_1000",
        "https://dpcclass.oss-cn-beijing.aliyuncs.com/umsupload/2026/03/25/69c344ba-4826-4901-00e1-c6ff235a12b2.jpg?x-oss-process=image/resize,w_1000",
        "https://dpcclass.oss-cn-beijing.aliyuncs.com/umsupload/2026/03/25/69c344ba-4826-4901-00e1-c7055a396422.jpg?x-oss-process=image/resize,w_1000",
        "https://dpcclass.oss-cn-beijing.aliyuncs.com/umsupload/2026/03/25/69c344ba-4826-4901-00e1-c6fe7f9c07ef.jpg?x-oss-process=image/resize,w_1000",
        "https://dpcclass.oss-cn-beijing.aliyuncs.com/umsupload/2026/03/25/69c344ba-4826-4901-00e1-c70052d895e3.jpg?x-oss-process=image/resize,w_1000",
        "https://dpcclass.oss-cn-beijing.aliyuncs.com/umsupload/2026/03/25/69c344ba-4826-4901-00e1-c6fc6ca7c4bf.png?x-oss-process=image/resize,w_1000",
        "https://dpcclass.oss-cn-beijing.aliyuncs.com/umsupload/2026/03/25/69c33f31-4826-4901-00e1-c6fb50697e06.png?x-oss-process=image/resize,w_1000",
        "https://dpcclass.oss-cn-beijing.aliyuncs.com/umsupload/2026/03/23/69c1029b-4826-4901-00e1-c6f614bc06d9.jpg?x-oss-process=image/resize,w_1000",
        "https://dpcclass.oss-cn-beijing.aliyuncs.com/umsupload/2026/03/25/69c344ba-4826-4901-00e1-c6fd479e0a0e.jpg?x-oss-process=image/resize,w_1000",
        "https://dpcclass.oss-cn-beijing.aliyuncs.com/umsupload/2026/03/23/69c1029b-4826-4901-00e1-c6f569b31a14.jpeg?x-oss-process=image/resize,w_1000",
    ]
    doc_url = "https://dpc-oss.23544.com/umsupload/2026/03/25/69c353d0-4826-4901-00e1-c7081bcab988.docx"

    # --- Image URLs ---
    print("=" * 80)
    print("图片 URL 诊断")
    print("=" * 80)
    valid_count = 0
    invalid_count = 0
    for i, url in enumerate(image_urls, 1):
        is_valid, msg, status_code = check_url(url)
        status = "✅ 有效" if is_valid else "❌ 无效"
        print(f"{i}. {status} - {msg}")
        print(f" URL: {url}")
        print(f" 状态码: {status_code}")
        print()
        if is_valid:
            valid_count += 1
        else:
            invalid_count += 1
    print("=" * 80)
    print(f"汇总: {valid_count} 个有效,{invalid_count} 个无效")
    print("=" * 80)
    print()

    # --- Word document URL ---
    print("=" * 80)
    print("Word 文档 URL 诊断")
    print("=" * 80)
    doc_valid, msg, status_code = check_url(doc_url)
    status = "✅ 有效" if doc_valid else "❌ 无效"
    print(f"{status} - {msg}")
    print(f"URL: {doc_url}")
    print(f"状态码: {status_code}")
    print()

    # The document result must count too: otherwise an unreachable document
    # with all images reachable would be reported as overall success.
    if invalid_count > 0 or not doc_valid:
        print("=" * 80)
        print("⚠️ 警告:部分 URL 无法访问")
        print("=" * 80)
        print("可能的原因:")
        print("1. URL 已过期(阿里云 OSS 签名 URL 有时效性)")
        print("2. 访问权限不足(需要认证或 IP 白名单)")
        print("3. 网络连接问题(Docker 容器网络配置)")
        print("4. 文件已被删除或移动")
        print()
        print("建议:")
        print("- 检查这些 URL 是否在浏览器中可以访问")
        print("- 如果使用签名 URL,确保 URL 未过期")
        print("- 检查 Docker 容器的网络配置和 DNS 设置")
        print("- 考虑使用公开可访问的 URL 或配置 OSS 访问权限")
        sys.exit(1)
    else:
        print("✅ 所有 URL 均可访问")
        sys.exit(0)
# Run the diagnostics only when executed as a script, not when imported.
if __name__ == "__main__":
    main()