Files
Train/apps/api/scripts/fetch-images.mjs
T
2026-06-16 00:55:20 +08:00

256 lines
9.7 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* 批量取图脚本(在有网络的本机运行):
* 按 Wikidata P18 → Commons 同名分类 → Commons 关键词搜索 的顺序,
* 为每个车型下载若干张真实照片到「本地共享图库」(uploads + photos 表),
* 并记录署名(作者/许可证/来源)。下载后应用即从本地提供图片,运行时不再访问 Commons。
*
* 用法(在 apps/api 目录):
* node scripts/fetch-images.mjs # 全部缺图车型,每个 1 张
* node scripts/fetch-images.mjs --per 3 # 每个车型最多 3 张
* node scripts/fetch-images.mjs --limit 20 # 仅处理前 20 个缺图车型
* node scripts/fetch-images.mjs --category 电力机车
* node scripts/fetch-images.mjs --dry # 只检索不下载不入库
*
* 所有下载图片一律入库为「候选(candidate)」,进入管理员「候选审图」队列,
* 由人工确认后才会作为封面/图册展示(候选 + 人工确认 双闸门,保证准确与合规)。
*
* 注意:需联网访问 wikidata.org / commons.wikimedia.org(自由授权,已记录署名)。
* 下载完成后图片保存在本地 app/data/uploads/,库表记录在 app/data/app.db。
*/
import Database from 'better-sqlite3';
import { mkdirSync } from 'fs';
import { writeFile } from 'fs/promises';
import { join, resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
import { randomBytes } from 'crypto';
// ESM modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));

// Repository root: three directory levels above the folder holding this script.
const ROOT = resolve(__dirname, '..', '..', '..');

/** Builds an absolute path to an entry inside the shared `app/data` directory. */
const dataPath = (name) => join(ROOT, 'app', 'data', name);

// Vehicle catalogue database (opened read-only by main()).
const MACHINES_DB = dataPath('machines.db');

// Application database and upload directory; each can be overridden through
// the environment variable of the same purpose, otherwise defaults under app/data.
const APP_DB = process.env.APP_DB_PATH || dataPath('app.db');
const UPLOAD_DIR = process.env.UPLOAD_DIR || dataPath('uploads');

// User-Agent header sent with every outgoing HTTP request made by this script.
const UA = 'ChinaLocoAtlas/0.1 (image fetcher; contact admin)';
// Command-line tokens that follow `node <script>`.
const args = process.argv.slice(2);

/**
 * Returns the token that follows flag `k` on the command line.
 * @param {string} k - Flag name, e.g. '--per'.
 * @returns {string|undefined} The following token, or undefined when the flag
 *   is absent or is the last token.
 */
const opt = (k) => {
  const i = args.indexOf(k);
  return i >= 0 ? args[i + 1] : undefined;
};

/**
 * Converts a raw flag value to a number, falling back to a default when the
 * value is missing or not numeric.
 *
 * Without this guard a command such as `--per --dry` produced NaN, and every
 * `length < NaN` / `length >= NaN` comparison downstream is false, which
 * silently disabled the fallback image sources.
 *
 * @param {string|undefined} raw - Token read from the command line.
 * @param {number} fallback - Value used when `raw` is missing or invalid.
 * @param {string} [flag] - Flag name, used only in the warning text.
 * @returns {number}
 */
const parseCount = (raw, fallback, flag = '') => {
  if (!raw) return fallback;
  const n = Number(raw);
  if (Number.isNaN(n)) {
    console.warn(`参数 ${flag} 的值无效:${raw},已改用默认值 ${fallback}`);
    return fallback;
  }
  return n;
};

// Maximum number of photo-less models to process (unbounded by default).
const LIMIT = parseCount(opt('--limit'), Infinity, '--limit');
// Maximum images per model; never below 1.
const PER = Math.max(1, parseCount(opt('--per'), 1, '--per'));
// Optional category name used to filter models.
const CATEGORY = opt('--category');
// Dry run: search only, no download and no database insert.
const DRY = args.includes('--dry');
// Every fetched image is stored with this status so it lands in the review queue.
const STATUS = 'candidate';
/**
 * Maps a category name from the models database to the English keyword that
 * is appended to search queries for that category.
 */
const CAT_HINT = Object.fromEntries([
  ['蒸汽机车', 'steam locomotive'],
  ['电力机车', 'electric locomotive'],
  ['内燃机车', 'diesel locomotive'],
  ['动车组', 'EMU'],
  ['客车', 'passenger car'],
  ['货车', 'freight car'],
  ['检测车', 'inspection car'],
  ['旅游列车', 'tourist train'],
]);
/** Resolves (with no value) after roughly `ms` milliseconds. */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Keeps only the ASCII letter / digit / hyphen runs of `s`, concatenated
 * without separators. Returns '' when there are none.
 */
function latin(s) {
  const runs = s.match(/[A-Za-z0-9-]+/g);
  return runs === null ? '' : runs.join('');
}

/**
 * Removes HTML tags from `h`, collapses each whitespace run to one space and
 * trims the ends. A falsy input yields ''.
 */
function strip(h) {
  const text = h || '';
  const withoutTags = text.replace(/<[^>]*>/g, '');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}

/** True when `s` ends in .jpg, .jpeg or .png (case-insensitive). */
function isImg(s) {
  const candidate = s || '';
  return /\.(jpe?g|png)$/i.test(candidate);
}
/**
 * Performs a GET request carrying the script's User-Agent header and returns
 * the response body parsed as JSON.
 *
 * @param {string} url - Fully built request URL.
 * @returns {Promise<any>} Parsed JSON body.
 * @throws {Error} With message `HTTP <status>` when the response is not ok.
 */
async function api(url) {
  const headers = { 'User-Agent': UA };
  const response = await fetch(url, { headers });
  if (response.ok) {
    return response.json();
  }
  throw new Error(`HTTP ${response.status}`);
}
/**
 * Looks up a Commons file page and returns its 1280px-wide thumbnail URL
 * together with attribution data.
 *
 * @param {string} title - Page title including the `File:` prefix.
 * @returns {Promise<{url: string, descriptionUrl: string, author: string, license: string}|null>}
 *   null when the first returned page has no thumbnail URL.
 */
async function commonsFileInfo(title) {
  const endpoint = 'https://commons.wikimedia.org/w/api.php?action=query&format=json&origin=*';
  const params = `&titles=${encodeURIComponent(title)}&prop=imageinfo&iiprop=url|extmetadata&iiurlwidth=1280`;
  const data = await api(endpoint + params);

  // Only one title is requested, so only the first page entry is inspected.
  const [page] = Object.values(data?.query?.pages || {});
  const info = page?.imageinfo?.[0];
  if (!info?.thumburl) {
    return null;
  }

  const meta = info.extmetadata || {};
  // Metadata values may contain HTML; strip() reduces them to plain text.
  const author = strip(meta.Artist?.value) || '未署名';
  const license = strip(meta.LicenseShortName?.value) || '见来源';
  return {
    url: info.thumburl,
    descriptionUrl: info.descriptionurl || '',
    author,
    license,
  };
}
/**
 * Source 1: the Wikidata P18 (image) claim. Yields at most one image.
 *
 * Searches Wikidata for up to three items matching "<model code> <category
 * hint>" and returns the first one whose first P18 claim resolves to a
 * Commons file with a thumbnail.
 *
 * @param {{model_code: string, category: string}} model
 * @returns {Promise<Array<object>>} Zero or one image record tagged `via: 'wikidata'`.
 */
async function viaWikidata(model) {
  const endpoint = 'https://www.wikidata.org/w/api.php';
  // Prefer the Latin part of the model code; fall back to the raw code.
  const code = latin(model.model_code) || model.model_code;
  const hint = CAT_HINT[model.category] || 'train';
  const term = encodeURIComponent(`${code} ${hint}`);

  const found = await api(
    `${endpoint}?action=wbsearchentities&format=json&origin=*&language=en&type=item&limit=3&search=${term}`,
  );
  const candidates = found?.search || [];

  for (const { id } of candidates) {
    const entity = await api(
      `${endpoint}?action=wbgetentities&format=json&origin=*&props=claims&ids=${id}`,
    );
    const fileName = entity?.entities?.[id]?.claims?.P18?.[0]?.mainsnak?.datavalue?.value;
    if (!fileName) continue;

    const info = await commonsFileInfo('File:' + fileName);
    if (info) {
      return [{ ...info, via: 'wikidata' }];
    }
  }
  return [];
}
/**
 * Source 2: Commons categories named after the model. May yield several images.
 *
 * Tries `Category:China Railways <code>` and then `Category:<code>`, taking
 * JPEG/PNG members (judged by title) until `want` images have been gathered.
 *
 * @param {{model_code: string}} model
 * @param {number} want - Number of images the caller still needs.
 * @returns {Promise<Array<object>>} Image records tagged `via: 'category'`.
 */
async function viaCategory(model, want) {
  const endpoint = 'https://commons.wikimedia.org/w/api.php?action=query&format=json&origin=*';
  const code = latin(model.model_code) || model.model_code;
  const categories = [`Category:China Railways ${code}`, `Category:${code}`];
  const found = [];

  for (const category of categories) {
    const listing = await api(
      `${endpoint}&list=categorymembers&cmtype=file&cmlimit=20&cmtitle=${encodeURIComponent(category)}`,
    );
    const members = listing?.query?.categorymembers || [];

    for (const { title } of members) {
      if (!isImg(title)) continue;
      const info = await commonsFileInfo(title);
      if (info) {
        found.push({ ...info, via: 'category' });
      }
      // Stop as soon as the quota is met so no further requests are issued.
      if (found.length >= want) {
        return found;
      }
    }
  }
  return found;
}
/**
 * Source 3: Commons full-text search in the File namespace. This is the least
 * precise source and serves as the last fallback. May yield several images.
 *
 * The API returns `query.pages` as an object keyed by page id, so plain
 * iteration follows page-id order rather than search relevance. Each page
 * carries an `index` field holding its rank; results are sorted by it so the
 * best matches are taken first.
 *
 * @param {{model_code: string, category: string}} model
 * @param {number} want - Number of images the caller still needs.
 * @returns {Promise<Array<object>>} Up to `want` image records tagged
 *   `via: 'search'`, best match first.
 */
async function viaSearch(model, want) {
  const q = `${latin(model.model_code) || model.model_code} ${CAT_HINT[model.category] || 'train'} China Railway`;
  const d = await api(
    'https://commons.wikimedia.org/w/api.php?action=query&format=json&origin=*' +
      `&generator=search&gsrnamespace=6&gsrlimit=${Math.max(3, want)}&gsrsearch=${encodeURIComponent(q)}` +
      '&prop=imageinfo&iiprop=url|extmetadata&iiurlwidth=1280',
  );

  // Pages lacking an index sort last; Array#sort is stable, so they keep
  // their relative order.
  const rankOf = (page) => page?.index ?? Number.MAX_SAFE_INTEGER;
  const ranked = Object.values(d?.query?.pages || {}).sort((a, b) => rankOf(a) - rankOf(b));

  const out = [];
  for (const p of ranked) {
    const ii = p?.imageinfo?.[0];
    // Skip entries without a thumbnail or whose thumbnail is not JPEG/PNG.
    if (!ii?.thumburl || !isImg(ii.thumburl)) continue;
    const em = ii.extmetadata || {};
    out.push({
      url: ii.thumburl,
      descriptionUrl: ii.descriptionurl || '',
      author: strip(em.Artist?.value) || '未署名',
      license: strip(em.LicenseShortName?.value) || '见来源',
      via: 'search',
    });
    if (out.length >= want) break;
  }
  return out;
}
/**
 * Gathers images for one model from all sources, de-duplicated by URL.
 *
 * Sources are consulted in order (Wikidata, then Commons category, then
 * Commons search); a later source is queried only while fewer than `want`
 * images have been collected, and is asked only for the shortfall.
 *
 * @param {object} model - Model row passed through to each source.
 * @param {number} want - Maximum number of images to return.
 * @returns {Promise<Array<object>>} At most `want` image records.
 */
async function collectImages(model, want) {
  const seenUrls = new Set();
  const picked = [];

  // Adds unseen items from a batch until the quota is reached.
  const absorb = (batch) => {
    for (const item of batch) {
      if (picked.length >= want) return;
      if (!seenUrls.has(item.url)) {
        seenUrls.add(item.url);
        picked.push(item);
      }
    }
  };

  absorb(await viaWikidata(model));
  for (const fallback of [viaCategory, viaSearch]) {
    if (picked.length < want) {
      absorb(await fallback(model, want - picked.length));
    }
  }
  return picked;
}
/**
 * Downloads one image and returns its bytes.
 * Rejects on a non-2xx response so that an HTTP error page is never written
 * to disk and registered as a photo.
 *
 * @param {string} url - Image URL.
 * @returns {Promise<Buffer>}
 * @throws {Error} With message `HTTP <status>` when the response is not ok.
 */
async function downloadImage(url) {
  const res = await fetch(url, { headers: { 'User-Agent': UA } });
  if (!res.ok) throw new Error('HTTP ' + res.status);
  return Buffer.from(await res.arrayBuffer());
}

/**
 * Script entry point.
 *
 * For every model that has no row in `photos` (optionally filtered by
 * category and capped by LIMIT) it collects up to PER images, saves each into
 * UPLOAD_DIR and inserts a `photos` row with status STATUS, attributed to the
 * first admin user (or the first user when there is no admin). In dry-run
 * mode it only reports what was found. A per-model report and a summary are
 * printed at the end.
 *
 * Sets `process.exitCode = 1` and returns early when the users table is empty.
 * Both database handles are closed on every exit path.
 *
 * @returns {Promise<void>}
 */
async function main() {
  mkdirSync(UPLOAD_DIR, { recursive: true });
  const mdb = new Database(MACHINES_DB, { readonly: true });
  let adb;
  try {
    adb = new Database(APP_DB);
    const admin =
      adb.prepare("SELECT id FROM users WHERE role='admin' ORDER BY id LIMIT 1").get() ||
      adb.prepare('SELECT id FROM users ORDER BY id LIMIT 1').get();
    if (!admin) {
      console.error('找不到用户:请先启动后端注册一个管理员(或设 ADMIN_EMAIL)。');
      // Use exitCode instead of process.exit() so the finally block still runs.
      process.exitCode = 1;
      return;
    }

    const where = CATEGORY ? 'WHERE c.name = ?' : '';
    const models = mdb
      .prepare(
        `SELECT m.id, m.model_code, c.name AS category
         FROM model m JOIN category c ON c.id = m.category_id ${where}
         ORDER BY m.id`,
      )
      .all(...(CATEGORY ? [CATEGORY] : []));
    const hasPhoto = adb.prepare('SELECT 1 FROM photos WHERE model_id = ? LIMIT 1');
    const insert = adb.prepare(
      `INSERT INTO photos (model_id, uploader_id, filename, caption, status, source_url, author, license)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
    );

    console.log(
      `取图:每车型最多 ${PER} 张,入库状态=候选(candidate)${DRY ? ' · dry-run' : ''};保存到 ${UPLOAD_DIR}`,
    );

    const report = [];
    let done = 0; // models processed (photo-less models only)
    let saved = 0; // image files written and inserted
    // Outcome counters, tracked explicitly rather than by parsing report lines.
    let ok = 0;
    let miss = 0;
    let err = 0;

    for (const m of models) {
      if (done >= LIMIT) break;
      if (hasPhoto.get(m.id)) continue;
      done++;

      let hits = [];
      try {
        hits = await collectImages(m, PER);
      } catch (e) {
        err++;
        report.push(`✗ ${m.model_code}:检索出错 ${e.message}`);
        await sleep(300);
        continue;
      }
      if (hits.length === 0) {
        miss++;
        report.push(`— ${m.model_code}:未找到`);
        await sleep(200);
        continue;
      }
      if (DRY) {
        ok++;
        report.push(`✓ ${m.model_code} [dry] ${hits.map((h) => h.via).join(',')} (${hits.length} 张)`);
        await sleep(200);
        continue;
      }

      let n = 0;
      for (const hit of hits) {
        try {
          const buf = await downloadImage(hit.url);
          const ext = (hit.url.match(/\.(jpe?g|png)/i) || ['.jpg'])[0].toLowerCase();
          // Timestamp plus random suffix keeps file names unique.
          const fn = `${Date.now()}-${randomBytes(6).toString('hex')}${ext}`;
          await writeFile(join(UPLOAD_DIR, fn), buf);
          insert.run(
            m.id,
            admin.id,
            fn,
            `${m.model_code}(${hit.via})`,
            STATUS,
            hit.descriptionUrl,
            hit.author,
            hit.license,
          );
          n++;
          saved++;
        } catch (e) {
          err++;
          report.push(`✗ ${m.model_code}:下载失败 ${e.message}`);
        }
        await sleep(300); // polite rate limiting between downloads
      }
      if (n > 0) {
        ok++;
        report.push(`✓ ${m.model_code}:入库 ${n} 张(${STATUS})`);
      }
    }

    console.log(report.join('\n'));
    console.log(`\n命中 ${ok} · 未找到 ${miss} · 出错 ${err}(处理 ${done} 个车型,保存 ${saved} 张)`);
    // Only point the admin at the review queue when something was actually stored.
    if (saved > 0) {
      console.log('候选图已入库,请用管理员在网页"候选审图 / 图册"逐个确认后作为封面展示。');
    }
  } finally {
    adb?.close();
    mdb.close();
  }
}
// Run the script. The returned promise is handled explicitly so an unexpected
// failure is logged and reflected in the exit code rather than left floating.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});