图片本地化

记一次 PHP 解析 HTML

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Parse an HTML fragment with DOMDocument, look up nodes in it, and serialize
# the <body> contents back to a UTF-8 string.
#
# Inputs expected from the surrounding code: $html (UTF-8 markup), $class
# (class name to search for) and $id (element id to search for).
# Outputs: $dom, $images, $nodes, $element and $newHtml.

# Turn only the non-ASCII characters into numeric entities so the parser does
# not misread UTF-8 text as a single-byte encoding. ASCII markup and entities
# that are already present in $html are left untouched. This replaces
# mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'), which is deprecated
# since PHP 8.2.
$nonAsciiMap = [0x80, 0x10FFFF, 0, 0x1FFFFF];
$html = mb_encode_numericentity($html, $nonAsciiMap, 'UTF-8');

# Load the HTML. Parser warnings about sloppy real-world markup are collected
# by libxml and discarded instead of being silenced with the @ operator.
# NOTE: loadHTML() throws a ValueError on an empty string in PHP 8.
$dom = new \DOMDocument('1.0', 'UTF-8');
$previousLibxmlMode = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlMode);

# All <img> elements of the document.
$images = $dom->getElementsByTagName('img');

# All elements whose class attribute contains $class as a whole word.
# $nodes is a DOMNodeList. $class is interpolated into the XPath expression,
# so it must not contain quote characters.
$finder = new \DOMXPath($dom);
$nodes = $finder->query("//*[contains(concat(' ', normalize-space(@class), ' '), ' $class ')]");

# The element with the given id (null when there is none).
$element = $dom->getElementById($id);

#### Saving the content ####

# Serialize only <body>; dumping the whole document would add the doctype
# and <html>/<head> wrappers that loadHTML() inserted.
$body = $dom->getElementsByTagName('body')->item(0);
$newHtml = $dom->saveHTML($body);

# Strip the surrounding <body> tag pair.
$newHtml = preg_replace('/^<body[^>]*>|<\/body>$/i', '', $newHtml);

# Undo the first step: decode numeric entities for non-ASCII characters only.
# A blanket html_entity_decode() would also un-escape &lt; &gt; &amp; &quot;
# that belong to the document, turning escaped text into live markup.
$newHtml = mb_decode_numericentity($newHtml, $nonAsciiMap, 'UTF-8');

图片本地化

基于 Laravel

主要用到了 Laravel 的 Http 门面(HTTP 客户端)

use Illuminate\Support\Facades\DB;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
use Illuminate\Support\Facades\Http;
# Download every remote <img> of the document, store it locally and point the
# tag at the local copy.
#
# Reads from the surrounding code: $images, $domain, $hostname, $strMain,
# $statusCodeMap, $fullDir, $saveDir, $time.
# Writes: $results['success'], $results['failed'], $insert, and the src
# attribute of each successfully localized image.

# MIME types accepted as images, keyed by the extension used for the saved file.
$allowedTypes = [
    'jpg' => 'image/jpeg',
    'jpeg' => 'image/jpeg',
    'png' => 'image/png',
    'gif' => 'image/gif',
    'webp' => 'image/webp',
];
$finfo = new \finfo(FILEINFO_MIME_TYPE);

foreach ($images as $img) {
    $src = $img->getAttribute('src');

    # Already served from our own domain: nothing to localize.
    if (str_contains($src, $domain)) {
        continue;
    }

    try {
        # TLS verification is skipped when $hostname does not contain $strMain.
        # The same options are used for the probe AND the download, so a host
        # that passes the HEAD check can also be fetched.
        $httpOptions = str_contains($hostname, $strMain) ? [] : ['verify' => false];

        $response = Http::withOptions($httpOptions)->head($src);
        if (!$response->ok()) {
            $statusCode = $response->status();
            $errorMessage = "远程图片资源不存在 (状态码: $statusCode)";
            if (isset($statusCodeMap[$statusCode])) {
                $errorMessage .= " - " . $statusCodeMap[$statusCode];
            }
            throw new \Exception($errorMessage);
        }

        # The URL path must end in a file name.
        $parsed = parse_url($src);
        $path = $parsed['path'] ?? '';
        if (basename($path) === '') {
            throw new \Exception('获取图片名错误');
        }

        # Download the image through the HTTP client.
        $download = Http::withOptions($httpOptions)->get($src);
        if (!$download->successful()) {
            throw new \Exception('下载失败');
        }
        $imageData = $download->body();
        if ($imageData === '') {
            throw new \Exception('下载失败');
        }

        # Validate the content itself: the sniffed MIME type must be whitelisted.
        $detectedMime = $finfo->buffer($imageData);
        if (!in_array($detectedMime, $allowedTypes, true)) {
            throw new \Exception("非图片类型");
        }

        # The extension in the URL must be whitelisted too. This is checked
        # BEFORE indexing $allowedTypes with it, so an unknown or missing
        # extension yields the intended error rather than "Undefined array key".
        $originalExt = strtolower(pathinfo($path, PATHINFO_EXTENSION));
        if (!array_key_exists($originalExt, $allowedTypes)) {
            throw new \Exception("非图片类型");
        }

        # Keep the original extension when it agrees with the content;
        # otherwise use the extension that matches the detected MIME type.
        $ext = $allowedTypes[$originalExt] === $detectedMime
            ? $originalExt
            : array_search($detectedMime, $allowedTypes, true);

        # Random 32-hex-character name; uniqid() is time-based and can collide.
        $filename = bin2hex(random_bytes(16)) . "." . $ext;

        # Save the file and refuse to rewrite the tag if nothing was written.
        $localPath = $fullDir . $filename;
        if (file_put_contents($localPath, $imageData) === false) {
            throw new \Exception('保存失败');
        }

        # Build the public URL: https in production, http elsewhere.
        $scheme = isProduction() ? "https://" : "http://";
        $webPath = $scheme . $domain . $saveDir . $filename;

        $img->setAttribute('src', $webPath);
        $results['success'][] = [
            "remote" => $src,
            "local" => $webPath,
        ];
        # Row collected for a later insert by the surrounding code.
        $insert[] = [
            "url" => $webPath,
            "remote_url" => $src,
            "path" => $saveDir . $filename,
            "admin_id" => auth("plat")->user()->id,
            "created_at" => $time,
            "updated_at" => $time,
        ];
    } catch (\Exception $e) {
        # A failure on one image is recorded and the loop moves on.
        $results['failed'][] = [
            "message" => $e->getMessage(),
            "remote" => $src,
        ];
    }
}