256 lines
9.7 KiB
JavaScript
256 lines
9.7 KiB
JavaScript
/**
 * 批量取图脚本(在有网络的本机运行):
 * 按 Wikidata P18 → Commons 同名分类 → Commons 关键词搜索 的顺序,
 * 为每个车型下载若干张真实照片到「本地共享图库」(uploads + photos 表),
 * 并记录署名(作者/许可证/来源)。下载后应用即从本地提供图片,运行时不再访问 Commons。
 *
 * 用法(在 apps/api 目录):
 *   node scripts/fetch-images.mjs                    # 全部缺图车型,每个 1 张
 *   node scripts/fetch-images.mjs --per 3            # 每个车型最多 3 张
 *   node scripts/fetch-images.mjs --limit 20         # 仅处理前 20 个缺图车型
 *   node scripts/fetch-images.mjs --category 电力机车
 *   node scripts/fetch-images.mjs --dry              # 只检索不下载不入库
 *
 * 所有下载图片一律入库为「候选(candidate)」,进入管理员「候选审图」队列,
 * 由人工确认后才会作为封面/图册展示(候选 + 人工确认 双闸门,保证准确与合规)。
 *
 * 注意:需联网访问 wikidata.org / commons.wikimedia.org(自由授权,已记录署名)。
 * 下载完成后图片保存在本地 app/data/uploads/,库表记录在 app/data/app.db。
 */
import Database from 'better-sqlite3';
import { mkdirSync } from 'fs';
import { unlink, writeFile } from 'fs/promises';
import { join, resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
import { randomBytes } from 'crypto';

// Directory containing this script (ES modules do not provide __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));
// Three directories above this script's directory (Train/).
const ROOT = resolve(__dirname, '..', '..', '..');
// Database that main() opens read-only to list models and their categories.
const MACHINES_DB = join(ROOT, 'app', 'data', 'machines.db');
// Application database (users + photos tables); overridable via APP_DB_PATH.
const APP_DB = process.env.APP_DB_PATH || join(ROOT, 'app', 'data', 'app.db');
// Directory downloaded images are written to; overridable via UPLOAD_DIR.
const UPLOAD_DIR = process.env.UPLOAD_DIR || join(ROOT, 'app', 'data', 'uploads');
// User-Agent header sent with every outgoing HTTP request.
const UA = 'ChinaLocoAtlas/0.1 (image fetcher; contact admin)';

// Command-line tokens after `node <script>`.
const args = process.argv.slice(2);

/**
 * Returns the token that follows flag `k` on the command line.
 * @param {string} k - Flag name, e.g. '--per'.
 * @returns {string|undefined} The next token, or undefined when the flag is
 *   absent or is the last token.
 */
const opt = (k) => {
  const i = args.indexOf(k);
  return i >= 0 ? args[i + 1] : undefined;
};

/**
 * Parses the value of a numeric flag.
 *
 * A missing, empty or non-numeric value (for example `--per --dry`) yields
 * `fallback` instead of NaN. NaN would make every `>= limit` comparison false
 * and silently remove the cap it was supposed to impose.
 *
 * @param {string|undefined} raw - Raw token from the command line.
 * @param {number} fallback - Value used when `raw` is absent or not a number.
 * @param {number} [min=-Infinity] - Lower bound applied to a parsed number.
 * @returns {number} The parsed (and clamped) number, or `fallback`.
 */
const toCount = (raw, fallback, min = -Infinity) => {
  if (!raw) return fallback;
  const parsed = Number(raw);
  return Number.isNaN(parsed) ? fallback : Math.max(min, parsed);
};

// Maximum number of photo-less models to process (unlimited by default).
const LIMIT = toCount(opt('--limit'), Infinity);
// Maximum number of images per model (at least 1).
const PER = toCount(opt('--per'), 1, 1);
// Optional category name used to filter models.
const CATEGORY = opt('--category');
// Dry run: search only, no download and no database insert.
const DRY = args.includes('--dry');
// Every stored photo gets this status so it goes through the admin review queue.
const STATUS = 'candidate';

// Maps a category name (as stored in the `category` table) to the English
// phrase appended to search queries. Callers fall back to 'train' for
// categories that are not listed here. Frozen because it is shared, read-only data.
const CAT_HINT = Object.freeze({
  蒸汽机车: 'steam locomotive',
  电力机车: 'electric locomotive',
  内燃机车: 'diesel locomotive',
  动车组: 'EMU',
  客车: 'passenger car',
  货车: 'freight car',
  检测车: 'inspection car',
  旅游列车: 'tourist train',
});

/**
 * Resolves after `ms` milliseconds; used to rate-limit outgoing requests.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((r) => setTimeout(r, ms));

/**
 * Keeps only the ASCII letters, digits and hyphens of `s`, concatenated in
 * order (e.g. '东风4B' -> '4B'). Returns '' when there are none.
 * @param {string} s - Source text.
 * @returns {string}
 */
const latin = (s) => (s.match(/[A-Za-z0-9-]+/g) || []).join('');

/**
 * Removes HTML tags from `h`, collapses whitespace runs to single spaces and
 * trims the result. Nullish or empty input yields ''.
 * @param {string|null|undefined} h - HTML fragment.
 * @returns {string}
 */
const strip = (h) => (h || '').replace(/<[^>]*>/g, '').replace(/\s+/g, ' ').trim();

/**
 * Tells whether `s` ends with a .jpg / .jpeg / .png extension (case-insensitive).
 * @param {string|null|undefined} s - File title or URL.
 * @returns {boolean}
 */
const isImg = (s) => /\.(jpe?g|png)$/i.test(s || '');

/**
 * GETs `url` with the script's User-Agent and returns the parsed JSON body.
 *
 * @param {string} url - Fully built request URL.
 * @returns {Promise<any>} Parsed JSON response.
 * @throws {Error} 'HTTP <status>' for a non-2xx response, 'API <code>: <info>'
 *   when the body carries a top-level `error` object, or the abort/timeout
 *   error when no response arrives within 30 seconds.
 */
async function api(url) {
  const res = await fetch(url, {
    headers: { 'User-Agent': UA },
    // Without a deadline a single stalled connection would hang the whole batch.
    signal: AbortSignal.timeout(30_000),
  });
  if (!res.ok) throw new Error('HTTP ' + res.status);
  const data = await res.json();
  // MediaWiki reports request failures (rate limiting, bad parameters, ...)
  // with HTTP 200 and an `error` object; surface them rather than letting
  // callers mistake them for an empty result.
  if (data?.error) {
    throw new Error(`API ${data.error.code ?? 'error'}: ${data.error.info ?? ''}`.trim());
  }
  return data;
}

/**
 * Looks up a Commons file page and returns its thumbnail URL (requested at
 * 1280 px width) together with attribution data.
 *
 * @param {string} title - Page title including the namespace, e.g. 'File:Foo.jpg'.
 * @returns {Promise<{url: string, descriptionUrl: string, author: string, license: string} | null>}
 *   The image details, or null when the response carries no thumbnail URL.
 */
async function commonsFileInfo(title) {
  const endpoint =
    'https://commons.wikimedia.org/w/api.php?action=query&format=json&origin=*' +
    `&titles=${encodeURIComponent(title)}&prop=imageinfo&iiprop=url|extmetadata&iiurlwidth=1280`;
  const data = await api(endpoint);

  // A single title was requested, so only the first page entry is relevant.
  const [page] = Object.values(data?.query?.pages || {});
  const imageInfo = page?.imageinfo?.[0];
  if (!imageInfo?.thumburl) return null;

  const meta = imageInfo.extmetadata || {};
  return {
    url: imageInfo.thumburl,
    descriptionUrl: imageInfo.descriptionurl || '',
    // Metadata values are HTML fragments; store plain text, with placeholders when absent.
    author: strip(meta.Artist?.value) || '未署名',
    license: strip(meta.LicenseShortName?.value) || '见来源',
  };
}

/**
 * Source 1: the Wikidata P18 (image) claim of the first matching entity.
 * Returns at most one image.
 *
 * Searches Wikidata for the model code plus an English category hint, then
 * checks up to three hits, in search order, for a P18 claim that resolves to
 * a Commons thumbnail.
 *
 * @param {{model_code: string, category: string}} model - Model row.
 * @returns {Promise<Array<object>>} Zero or one image record tagged `via: 'wikidata'`.
 */
async function viaWikidata(model) {
  const code = latin(model.model_code) || model.model_code;
  const q = `${code} ${CAT_HINT[model.category] || 'train'}`;
  const found = await api(
    'https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json&origin=*' +
      `&language=en&type=item&limit=3&search=${encodeURIComponent(q)}`,
  );
  const ids = (found?.search || []).map((hit) => hit.id).filter(Boolean);
  if (ids.length === 0) return [];

  // One batched request for all hits instead of one request per hit.
  const detail = await api(
    'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&origin=*' +
      `&props=claims&ids=${encodeURIComponent(ids.join('|'))}`,
  );
  for (const id of ids) {
    const p18 = detail?.entities?.[id]?.claims?.P18?.[0]?.mainsnak?.datavalue?.value;
    if (!p18) continue;
    const info = await commonsFileInfo('File:' + p18);
    if (info) return [{ ...info, via: 'wikidata' }];
  }
  return [];
}

/**
 * Source 2: files listed in the Commons categories named after the model
 * ('Category:China Railways <code>' first, then 'Category:<code>').
 * May return several images.
 *
 * @param {{model_code: string}} model - Model row.
 * @param {number} want - Stop as soon as this many images were collected.
 * @returns {Promise<Array<object>>} Image records tagged `via: 'category'`.
 */
async function viaCategory(model, want) {
  const code = latin(model.model_code) || model.model_code;
  const out = [];
  // A file can be a member of both categories; look each title up only once.
  const seenTitles = new Set();
  for (const cat of [`Category:China Railways ${code}`, `Category:${code}`]) {
    const listing = await api(
      'https://commons.wikimedia.org/w/api.php?action=query&format=json&origin=*' +
        `&list=categorymembers&cmtype=file&cmlimit=20&cmtitle=${encodeURIComponent(cat)}`,
    );
    for (const member of listing?.query?.categorymembers || []) {
      if (!isImg(member.title) || seenTitles.has(member.title)) continue;
      seenTitles.add(member.title);
      const info = await commonsFileInfo(member.title);
      if (info) out.push({ ...info, via: 'category' });
      if (out.length >= want) return out;
    }
  }
  return out;
}

/**
 * Source 3: Commons full-text search in the File namespace. This is the
 * least precise source and serves as the last resort. May return several images.
 *
 * @param {{model_code: string, category: string}} model - Model row.
 * @param {number} want - Maximum number of images to return.
 * @returns {Promise<Array<object>>} Image records tagged `via: 'search'`,
 *   best search match first.
 */
async function viaSearch(model, want) {
  const code = latin(model.model_code) || model.model_code;
  const q = `${code} ${CAT_HINT[model.category] || 'train'} China Railway`;
  const data = await api(
    'https://commons.wikimedia.org/w/api.php?action=query&format=json&origin=*' +
      `&generator=search&gsrnamespace=6&gsrlimit=${Math.max(3, want)}&gsrsearch=${encodeURIComponent(q)}` +
      '&prop=imageinfo&iiprop=url|extmetadata&iiurlwidth=1280',
  );

  // `pages` is keyed by page id, so plain object iteration follows page-id
  // order. The search generator reports each hit's rank in `index`; sort by
  // it so the most relevant hits are taken first. Pages without an index
  // keep their relative order at the end (Array.prototype.sort is stable).
  const rankOf = (page) => page?.index ?? Number.MAX_SAFE_INTEGER;
  const ranked = Object.values(data?.query?.pages || {}).sort((a, b) => rankOf(a) - rankOf(b));

  const out = [];
  for (const page of ranked) {
    const imageInfo = page?.imageinfo?.[0];
    if (!imageInfo?.thumburl || !isImg(imageInfo.thumburl)) continue;
    const meta = imageInfo.extmetadata || {};
    out.push({
      url: imageInfo.thumburl,
      descriptionUrl: imageInfo.descriptionurl || '',
      author: strip(meta.Artist?.value) || '未署名',
      license: strip(meta.LicenseShortName?.value) || '见来源',
      via: 'search',
    });
    if (out.length >= want) break;
  }
  return out;
}

/**
 * Gathers images for one model from all sources, de-duplicated by URL.
 *
 * Sources are queried in order of decreasing precision (Wikidata, Commons
 * category, Commons search). A later source is only queried while fewer than
 * `want` images were found, and is asked only for the number still missing.
 *
 * @param {object} model - Model row passed through to the source functions.
 * @param {number} want - Maximum number of images to return.
 * @returns {Promise<Array<object>>} At most `want` image records.
 */
async function collectImages(model, want) {
  const seenUrls = new Set();
  const picked = [];
  const take = (candidates) => {
    for (const candidate of candidates) {
      if (picked.length >= want) break;
      if (seenUrls.has(candidate.url)) continue;
      seenUrls.add(candidate.url);
      picked.push(candidate);
    }
  };

  take(await viaWikidata(model));
  if (picked.length < want) take(await viaCategory(model, want - picked.length));
  if (picked.length < want) take(await viaSearch(model, want - picked.length));
  return picked;
}

/**
 * Picks the file extension to store a downloaded image under.
 *
 * Only an extension at the very end of the URL (optionally followed by a
 * query string or fragment) counts, so a '.png' in the middle of a file name
 * is not mistaken for the file type.
 *
 * @param {string} url - Download URL.
 * @returns {string} '.jpg', '.jpeg' or '.png' in lower case; '.jpg' when the
 *   URL ends with none of them.
 */
function imageExt(url) {
  const match = /\.(jpe?g|png)(?=$|[?#])/i.exec(url);
  return match ? match[0].toLowerCase() : '.jpg';
}

/**
 * Downloads `url` with the script's User-Agent and returns the body bytes.
 *
 * @param {string} url - Image URL.
 * @returns {Promise<Buffer>} Response body.
 * @throws {Error} 'HTTP <status>' for a non-2xx response, so that an error
 *   page is never saved to disk as if it were an image.
 */
async function fetchImageBuffer(url) {
  const res = await fetch(url, { headers: { 'User-Agent': UA } });
  if (!res.ok) throw new Error('HTTP ' + res.status);
  return Buffer.from(await res.arrayBuffer());
}

/**
 * Script entry point.
 *
 * For every model without a row in `photos` (optionally restricted to
 * CATEGORY, at most LIMIT models) it collects up to PER images, writes them
 * to UPLOAD_DIR and inserts one `photos` row per image with status STATUS and
 * the image's attribution. With DRY set it only searches and reports.
 * A per-model report and a summary line are printed at the end.
 *
 * Sets a non-zero exit code when the users table is empty.
 *
 * @returns {Promise<void>}
 */
async function main() {
  mkdirSync(UPLOAD_DIR, { recursive: true });
  const mdb = new Database(MACHINES_DB, { readonly: true });
  let adb;
  try {
    adb = new Database(APP_DB);

    // Photos need an uploader: prefer the first admin, else the first user.
    const admin =
      adb.prepare("SELECT id FROM users WHERE role='admin' ORDER BY id LIMIT 1").get() ||
      adb.prepare('SELECT id FROM users ORDER BY id LIMIT 1').get();
    if (!admin) {
      console.error('找不到用户:请先启动后端注册一个管理员(或设 ADMIN_EMAIL)。');
      // Set the exit code and return (instead of process.exit) so the
      // `finally` block below still closes both databases.
      process.exitCode = 1;
      return;
    }

    const where = CATEGORY ? 'WHERE c.name = ?' : '';
    const models = mdb
      .prepare(
        `SELECT m.id, m.model_code, c.name AS category
         FROM model m JOIN category c ON c.id = m.category_id ${where}
         ORDER BY m.id`,
      )
      .all(...(CATEGORY ? [CATEGORY] : []));

    const hasPhoto = adb.prepare('SELECT 1 FROM photos WHERE model_id = ? LIMIT 1');
    const insert = adb.prepare(
      `INSERT INTO photos (model_id, uploader_id, filename, caption, status, source_url, author, license)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
    );

    console.log(
      `取图:每车型最多 ${PER} 张,入库状态=候选(candidate)${DRY ? '(dry-run)' : ''};保存到 ${UPLOAD_DIR}`,
    );

    const report = [];
    let done = 0;
    let saved = 0;
    for (const m of models) {
      if (done >= LIMIT) break;
      // Models that already have any photo are skipped and do not count towards LIMIT.
      if (hasPhoto.get(m.id)) continue;
      done++;

      let hits = [];
      try {
        hits = await collectImages(m, PER);
      } catch (e) {
        report.push(`✗ ${m.model_code}:检索出错 ${e.message}`);
        await sleep(300);
        continue;
      }
      if (hits.length === 0) {
        report.push(`— ${m.model_code}:未找到`);
        await sleep(200);
        continue;
      }
      if (DRY) {
        report.push(`✓ ${m.model_code}:[dry] ${hits.map((h) => h.via).join(',')} (${hits.length} 张)`);
        await sleep(200);
        continue;
      }

      let n = 0;
      for (const hit of hits) {
        try {
          const buf = await fetchImageBuffer(hit.url);
          const fn = `${Date.now()}-${randomBytes(6).toString('hex')}${imageExt(hit.url)}`;
          const dest = join(UPLOAD_DIR, fn);
          await writeFile(dest, buf);
          try {
            insert.run(
              m.id,
              admin.id,
              fn,
              `${m.model_code}(${hit.via})`,
              STATUS,
              hit.descriptionUrl,
              hit.author,
              hit.license,
            );
          } catch (dbErr) {
            // Do not leave a file on disk that no photos row refers to.
            // Removal is best-effort; the insert error is the one reported.
            await unlink(dest).catch(() => {});
            throw dbErr;
          }
          n++;
          saved++;
        } catch (e) {
          report.push(`✗ ${m.model_code}:下载失败 ${e.message}`);
        }
        await sleep(300); // polite rate limiting between downloads
      }
      if (n > 0) report.push(`✓ ${m.model_code}:入库 ${n} 张(${STATUS})`);
    }

    // Summary counts are derived from the leading marker of each report line.
    const ok = report.filter((r) => r.startsWith('✓')).length;
    const miss = report.filter((r) => r.startsWith('—')).length;
    const err = report.filter((r) => r.startsWith('✗')).length;
    console.log(report.join('\n'));
    console.log(`\n命中 ${ok} · 未找到 ${miss} · 出错 ${err}(处理 ${done} 个车型,保存 ${saved} 张)`);
    console.log('候选图已入库,请用管理员在网页"候选审图 / 图册"逐个确认后作为封面展示。');
  } finally {
    adb?.close();
    mdb.close();
  }
}

// Run the script. Handle a rejection explicitly instead of leaving the
// promise floating: print the error and make the process exit non-zero.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});