ba2a67290d
e2e / e2e (linux, xvfb) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.11) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.12) (push) Waiting to run
tests / pytest (windows-latest, py3.11) (push) Waiting to run
tests / pytest (windows-latest, py3.12) (push) Waiting to run
- scrape_zhaopin_ai.py: lightweight no-login list scraper
- scrape_zhaopin_full.py: SQLite storage, resumable crawl, detail-page JD
- analyze_zhaopin.py: stats by city/salary/education/experience/skills
- scrape_zhipin_ai.py: BOSS Zhipin variant (login-based, fallback)
- gitignore scraper data artifacts and browser profile
278 lines
11 KiB
Python
278 lines
11 KiB
Python
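"""Zhaopin (智联招聘) AI job scraper, full version: no login, SQLite, resumable, full JD text.

Features:
  * Writes to SQLite (INSERT OR IGNORE de-duplication, one commit per row,
    so an interrupted run loses no data)
  * Resumable crawl:
      - List phase: records the highest page fetched for each (keyword, jl);
        a re-run continues from the next page
      - Detail phase: only fetches jobs that have no JD yet; a re-run fills
        in the missing ones automatically
  * Detail pages: opens each job page and grabs the full job description (JD)
    plus the skill tags

Compliance note: only public job fields are scraped, never recruiters'
personal contact details; low-frequency personal use only; follow the site's
terms of service.

Usage:
    # Crawl the list (first 5 pages), then visit detail pages for the JD
    python examples/scrape_zhaopin_full.py --keyword AI --pages 5

    # Crawl the list only, skip detail pages
    python examples/scrape_zhaopin_full.py --keyword 大模型 --pages 5 --no-detail

    # After an interruption, re-run the same command => resumes automatically
    python examples/scrape_zhaopin_full.py --keyword AI --pages 5

    # Export the stored data to CSV
    python examples/scrape_zhaopin_full.py --export ai_jobs.csv

    # Reset the list progress of a keyword (crawl again from page 1)
    python examples/scrape_zhaopin_full.py --keyword AI --reset
"""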
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import json
|
|
import random
|
|
import re
|
|
import sqlite3
|
|
import time
|
|
from contextlib import closing
|
|
from urllib.parse import quote
|
|
|
|
from invisible_playwright import InvisiblePlaywright
|
|
|
|
# Default SQLite database file, used when --db is not given on the command line.
DB_DEFAULT = "zhaopin_jobs.db"
|
|
|
|
# JavaScript arrow function evaluated in a search-result page. It returns one
# plain object per ".joblist-box__item" card with the keys: title, link,
# salary, job_tags, location, experience, education, company, company_url,
# company_tags. Missing elements yield "" (or an empty array for tag lists).
# The literal must contain nothing but JavaScript: any stray line inside the
# raw string is sent to the browser verbatim and breaks evaluation.
_LIST_JS = r"""
() => {
    const txt = (el) => el ? el.innerText.trim() : "";
    return Array.from(document.querySelectorAll(".joblist-box__item")).map(card => {
        const nameA = card.querySelector("a.jobinfo__name");
        const info = Array.from(card.querySelectorAll(".jobinfo__other-info-item"))
            .map(e => e.innerText.trim());
        const locSpan = card.querySelector(".jobinfo__other-info-item span");
        const companyA = card.querySelector("a.companyinfo__name");
        return {
            title: txt(nameA),
            link: nameA ? nameA.href : "",
            salary: txt(card.querySelector(".jobinfo__salary")),
            job_tags: Array.from(card.querySelectorAll(".jobinfo__tag .joblist-box__item-tag"))
                .map(e => e.innerText.trim()),
            location: locSpan ? locSpan.innerText.trim() : (info[0] || ""),
            experience: info[1] || "",
            education: info[2] || "",
            company: companyA ? (companyA.getAttribute("title") || companyA.innerText).trim() : "",
            company_url: companyA ? companyA.href : "",
            company_tags: Array.from(card.querySelectorAll(".companyinfo__tag .joblist-box__item-tag"))
                .map(e => e.innerText.trim()),
        };
    });
}
"""
|
|
|
|
# JavaScript arrow function evaluated in a job detail page. It returns
# {jd, skills}: the trimmed text of ".describtion-card__detail-content"
# ("" when the element is absent) and the trimmed text of every
# ".describtion-card__skills-item". The selector spelling "describtion" is
# intentional here: it is what the code queries on the page.
# The literal must contain nothing but JavaScript: any stray line inside the
# raw string is sent to the browser verbatim and breaks evaluation.
_DETAIL_JS = r"""
() => {
    const c = document.querySelector(".describtion-card__detail-content");
    const skills = Array.from(document.querySelectorAll(".describtion-card__skills-item"))
        .map(e => e.innerText.trim());
    return { jd: c ? c.innerText.trim() : "", skills };
}
"""
|
|
|
|
|
|
# ── DB ──────────────────────────────────────────────────────────────────
|
|
def init_db(path: str) -> sqlite3.Connection:
    """Open the SQLite database at *path* and make sure the schema exists.

    Two tables are created when missing:

    * ``jobs`` -- one row per posting, keyed by ``job_id``. ``created_at``
      defaults to the local time of insertion; ``jd_text``, ``skills`` and
      ``detail_at`` have no default and stay NULL until they are written.
    * ``progress`` -- one row per crawl scope with the last list page fetched
      (``last_page`` defaults to 0).

    Args:
        path: Database file path (anything ``sqlite3.connect`` accepts).

    Returns:
        The open connection; the caller is responsible for closing it.

    Raises:
        sqlite3.Error: If the schema cannot be created. The connection opened
            here is closed before the error propagates.
    """
    conn = sqlite3.connect(path)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS jobs (
                job_id        TEXT PRIMARY KEY,
                title         TEXT,
                salary        TEXT,
                experience    TEXT,
                education     TEXT,
                location      TEXT,
                company       TEXT,
                company_url   TEXT,
                job_tags      TEXT,
                company_tags  TEXT,
                link          TEXT,
                jd_text       TEXT,
                skills        TEXT,
                keyword       TEXT,
                created_at    TEXT DEFAULT (datetime('now','localtime')),
                detail_at     TEXT
            )
        """)
        conn.execute("""
            CREATE TABLE IF NOT EXISTS progress (
                scope      TEXT PRIMARY KEY,   -- f"{keyword}|{jl}"
                last_page  INTEGER DEFAULT 0
            )
        """)
        conn.commit()
    except Exception:
        # Do not leak the handle when the file is not a usable database.
        conn.close()
        raise
    return conn
|
|
|
|
|
|
def job_id_from_link(link: str) -> str:
    """Extract the job id from a detail-page URL.

    The id is the text between ``/jobdetail/`` and the following ``.htm``
    (it may not contain ``.``, ``?`` or ``/``). When *link* does not have
    that shape, *link* itself is returned unchanged.
    """
    match = re.search(r"/jobdetail/([^.?/]+)\.htm", link)
    if match is None:
        return link
    return match.group(1)
|
|
|
|
|
|
def get_last_page(conn: sqlite3.Connection, scope: str) -> int:
    """Return the last list page recorded for *scope*, or 0 if none is stored."""
    cursor = conn.execute("SELECT last_page FROM progress WHERE scope=?", (scope,))
    found = cursor.fetchone()
    if not found:
        # No progress row yet for this scope.
        return 0
    return found[0]
|
|
|
|
|
|
def set_last_page(conn: sqlite3.Connection, scope: str, page: int) -> None:
    """Store *page* as the last list page fetched for *scope* and commit.

    A new ``progress`` row is inserted, or ``last_page`` is overwritten when
    the scope already has a row.
    """
    upsert_sql = (
        "INSERT INTO progress(scope,last_page) VALUES(?,?) "
        "ON CONFLICT(scope) DO UPDATE SET last_page=excluded.last_page"
    )
    conn.execute(upsert_sql, (scope, page))
    conn.commit()
|
|
|
|
|
|
def upsert_job(conn: sqlite3.Connection, r: dict, keyword: str) -> bool:
    """Insert one list-page record into ``jobs`` unless its id is already stored.

    Args:
        conn: Open database connection.
        r: Record with the list-page fields; values are read with
            ``dict.get``, so absent keys are stored as NULL (the two tag
            lists fall back to ``[]`` and are stored as JSON text).
        keyword: Search keyword saved alongside the row.

    Returns:
        True when a new row was inserted; False when the derived job id is
        empty or a row with that id already existed.
    """
    job_id = job_id_from_link(r.get("link", ""))
    if not job_id:
        return False

    def as_json(key: str) -> str:
        # ensure_ascii=False keeps non-ASCII tag text readable in the database.
        return json.dumps(r.get(key) or [], ensure_ascii=False)

    plain_fields = ("title", "salary", "experience", "education",
                    "location", "company", "company_url")
    values = (
        job_id,
        *(r.get(field) for field in plain_fields),
        as_json("job_tags"),
        as_json("company_tags"),
        r.get("link"),
        keyword,
    )
    cursor = conn.execute(
        """INSERT OR IGNORE INTO jobs
           (job_id,title,salary,experience,education,location,company,
            company_url,job_tags,company_tags,link,keyword)
           VALUES(?,?,?,?,?,?,?,?,?,?,?,?)""",
        values,
    )
    conn.commit()  # one commit per row, so an interrupted run keeps what it has
    # rowcount is 0 when INSERT OR IGNORE skipped an existing job_id.
    return cursor.rowcount > 0
|
|
|
|
|
|
# ── 抓取 ────────────────────────────────────────────────────────────────
|
|
def crawl_list(conn, page, keyword: str, pages: int, jl: str | None,
               *, delay: tuple[float, float] = (3.0, 7.0)) -> None:
    """Crawl search-result pages for *keyword* and store every job card found.

    The crawl resumes after the last page recorded for the scope
    ``f"{keyword}|{jl or ''}"`` and stops at *pages*, or at the first page
    whose job list fails to load. Progress is saved after each page, so a
    re-run continues from the page that failed.

    Args:
        conn: Open SQLite connection holding the ``jobs`` / ``progress`` tables.
        page: Browser page object; this function calls ``goto``,
            ``wait_for_selector``, ``wait_for_timeout`` and ``evaluate`` on it
            (presumably a Playwright page -- confirm against the caller).
        keyword: Search keyword; percent-encoded into the ``kw`` parameter.
        pages: Highest page number to fetch (1-based, inclusive).
        jl: Optional city code; percent-encoded into the ``jl`` parameter.
        delay: ``(min, max)`` seconds of random sleep after each page.
    """
    scope = f"{keyword}|{jl or ''}"
    start = get_last_page(conn, scope) + 1
    if start > pages:
        print(f"[列表] '{keyword}' 已抓到第 {start-1} 页, 目标 {pages} 页, 无需续抓。")
        return
    print(f"[列表] '{keyword}' 从第 {start} 页抓到第 {pages} 页")

    # Encode jl like the keyword, so characters such as "&" or "#" cannot
    # corrupt the query string.
    jl_part = f"&jl={quote(str(jl), safe='')}" if jl else ""
    for n in range(start, pages + 1):
        url = f"https://sou.zhaopin.com/?kw={quote(keyword)}&p={n}{jl_part}"
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=60000)
            page.wait_for_selector(".joblist-box__item", timeout=20000)
        except Exception:
            # Best effort: stop this round without recording page n, so the
            # next run retries it.
            print(f"  第 {n} 页未加载出列表, 停止本轮 (重跑可从此页续)。")
            break
        page.wait_for_timeout(1200)  # short extra wait before reading the cards
        rows = page.evaluate(_LIST_JS)
        new = sum(upsert_job(conn, r, keyword) for r in rows)
        set_last_page(conn, scope, n)
        print(f"  第 {n} 页: {len(rows)} 条 (新增 {new})")
        time.sleep(random.uniform(*delay))
|
|
|
|
|
|
def crawl_details(conn, page, keyword: str | None, limit: int | None) -> None:
|
|
q = "SELECT job_id,link FROM jobs WHERE (jd_text IS NULL OR jd_text='') AND link!=''"
|
|
params: list = []
|
|
if keyword:
|
|
q += " AND keyword=?"
|
|
params.append(keyword)
|
|
q += " ORDER BY created_at"
|
|
if limit:
|
|
q += f" LIMIT {int(limit)}"
|
|
todo = conn.execute(q, params).fetchall()
|
|
if not todo:
|
|
print("[详情] 没有待补充 JD 的岗位。")
|
|
return
|
|
print(f"[详情] 待抓 JD: {len(todo)} 条")
|
|
for i, (jid, link) in enumerate(todo, 1):
|
|
try:
|
|
page.goto(link, wait_until="domcontentloaded", timeout=60000)
|
|
page.wait_for_selector(".describtion-card__detail-content", timeout=15000)
|
|
page.wait_for_timeout(800)
|
|
data = page.evaluate(_DETAIL_JS)
|
|
except Exception as e:
|
|
print(f" [{i}/{len(todo)}] {jid} 抓取失败: {str(e)[:60]}")
|
|
continue
|
|
conn.execute(
|
|
"UPDATE jobs SET jd_text=?, skills=?, detail_at=datetime('now','localtime') "
|
|
"WHERE job_id=?",
|
|
(data.get("jd", ""),
|
|
json.dumps(data.get("skills") or [], ensure_ascii=False), jid),
|
|
)
|
|
conn.commit() # 逐条提交 => 中断安全
|
|
jd_len = len(data.get("jd", ""))
|
|
print(f" [{i}/{len(todo)}] {jid} JD {jd_len} 字")
|
|
time.sleep(random.uniform(2, 5))
|
|
|
|
|
|
# ── 导出 ────────────────────────────────────────────────────────────────
|
|
def export_csv(conn, path: str, keyword: str | None) -> None:
|
|
q = ("SELECT title,salary,experience,education,location,company,"
|
|
"job_tags,company_tags,skills,jd_text,link FROM jobs")
|
|
params: list = []
|
|
if keyword:
|
|
q += " WHERE keyword=?"
|
|
params.append(keyword)
|
|
rows = conn.execute(q, params).fetchall()
|
|
cols = ["title", "salary", "experience", "education", "location", "company",
|
|
"job_tags", "company_tags", "skills", "jd_text", "link"]
|
|
with open(path, "w", newline="", encoding="utf-8-sig") as f:
|
|
w = csv.writer(f)
|
|
w.writerow(cols)
|
|
for row in rows:
|
|
row = list(row)
|
|
for idx in (6, 7, 8): # json 数组列 -> 用 / 连接
|
|
try:
|
|
row[idx] = " / ".join(json.loads(row[idx] or "[]"))
|
|
except Exception:
|
|
pass
|
|
w.writerow(row)
|
|
print(f"已导出 {len(rows)} 条 -> {path}")
|
|
|
|
|
|
# ── main ────────────────────────────────────────────────────────────────
|
|
def main() -> None:
    """Command-line entry point.

    Parses the arguments, opens (and on exit closes) the database, then:

    * with ``--export``: writes the CSV and returns without crawling. Rows are
      filtered by ``--keyword`` only when that option was given explicitly;
      otherwise every stored row is exported.
    * otherwise: optionally clears the list progress of the current scope
      (``--reset``), crawls the list pages, crawls the detail pages unless
      ``--no-detail`` is set, and prints the totals stored in the database.
    """
    ap = argparse.ArgumentParser()
    # default=None rather than "AI": --export must be able to tell "no keyword
    # given" (export everything) apart from an explicit "--keyword AI".
    ap.add_argument("--keyword", default=None, help="搜索关键词")
    ap.add_argument("--pages", type=int, default=3, help="列表抓取页数")
    ap.add_argument("--jl", default=None, help="城市编码, 不填=全国")
    ap.add_argument("--db", default=DB_DEFAULT, help="SQLite 文件路径")
    ap.add_argument("--no-detail", action="store_true", help="只抓列表, 不进详情页")
    ap.add_argument("--detail-limit", type=int, default=None, help="本轮最多抓多少条 JD")
    ap.add_argument("--headful", action="store_true", help="显示浏览器窗口")
    ap.add_argument("--reset", action="store_true", help="重置该关键词的列表进度")
    ap.add_argument("--export", metavar="CSV", help="导出已抓数据到 CSV 后退出")
    args = ap.parse_args()

    # Crawling always needs a keyword; fall back to the historical default.
    keyword = "AI" if args.keyword is None else args.keyword

    with closing(init_db(args.db)) as conn:
        if args.export:
            export_csv(conn, args.export, args.keyword)
            return

        if args.reset:
            scope = f"{keyword}|{args.jl or ''}"
            conn.execute("DELETE FROM progress WHERE scope=?", (scope,))
            conn.commit()
            print(f"已重置进度: {scope}")

        with InvisiblePlaywright(seed=42, headless=not args.headful) as browser:
            page = browser.new_page()
            crawl_list(conn, page, keyword, args.pages, args.jl)
            if not args.no_detail:
                crawl_details(conn, page, keyword, args.detail_limit)

        total = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0]
        with_jd = conn.execute(
            "SELECT COUNT(*) FROM jobs WHERE jd_text IS NOT NULL AND jd_text!=''"
        ).fetchone()[0]
        print(f"\n库内合计 {total} 条, 其中含 JD 全文 {with_jd} 条 -> {args.db}")
|
|
|
|
|
|
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|