"""智联招聘 AI 岗位爬虫(完整版)—— 免登录 + SQLite + 断点续抓 + 详情页 JD 全文。 特性: * 写入 SQLite (INSERT OR IGNORE 去重, 每条提交, 中断不丢数据) * 断点续抓: - 列表阶段: 记录每个 (keyword, jl) 已抓到的最大页码, 重跑从下一页继续 - 详情阶段: 只抓还没有 JD 的岗位, 重跑自动补齐 * 详情页: 进入每个岗位页面抓 职位描述(JD)全文 + 技能标签 合规提醒: 仅抓公开岗位字段, 不抓招聘者个人联系方式; 低频自用, 遵守站点条款。 用法: # 抓列表(前5页) + 进详情页抓 JD python examples/scrape_zhaopin_full.py --keyword AI --pages 5 # 只抓列表, 不进详情 python examples/scrape_zhaopin_full.py --keyword 大模型 --pages 5 --no-detail # 中断后直接重跑同一命令 => 自动从断点继续 python examples/scrape_zhaopin_full.py --keyword AI --pages 5 # 导出已抓数据到 CSV python examples/scrape_zhaopin_full.py --export ai_jobs.csv # 重置某关键词的列表进度(重新从第1页抓) python examples/scrape_zhaopin_full.py --keyword AI --reset """ from __future__ import annotations import argparse import csv import json import random import re import sqlite3 import time from contextlib import closing from urllib.parse import quote from invisible_playwright import InvisiblePlaywright DB_DEFAULT = "zhaopin_jobs.db" _LIST_JS = r""" () => { const txt = (el) => el ? el.innerText.trim() : ""; return Array.from(document.querySelectorAll(".joblist-box__item")).map(card => { const nameA = card.querySelector("a.jobinfo__name"); const info = Array.from(card.querySelectorAll(".jobinfo__other-info-item")) .map(e => e.innerText.trim()); const locSpan = card.querySelector(".jobinfo__other-info-item span"); const companyA = card.querySelector("a.companyinfo__name"); return { title: txt(nameA), link: nameA ? nameA.href : "", salary: txt(card.querySelector(".jobinfo__salary")), job_tags: Array.from(card.querySelectorAll(".jobinfo__tag .joblist-box__item-tag")) .map(e => e.innerText.trim()), location: locSpan ? locSpan.innerText.trim() : (info[0] || ""), experience: info[1] || "", education: info[2] || "", company: companyA ? (companyA.getAttribute("title") || companyA.innerText).trim() : "", company_url: companyA ? companyA.href : "", company_tags: Array.from(card.querySelectorAll(".companyinfo__tag .joblist-box__item-tag")) .map(e => e.innerText.trim()), }; }); } """ _DETAIL_JS = r""" () => { const c = document.querySelector(".describtion-card__detail-content"); const skills = Array.from(document.querySelectorAll(".describtion-card__skills-item")) .map(e => e.innerText.trim()); return { jd: c ? c.innerText.trim() : "", skills }; } """ # ── DB ────────────────────────────────────────────────────────────────── def init_db(path: str) -> sqlite3.Connection: conn = sqlite3.connect(path) conn.execute(""" CREATE TABLE IF NOT EXISTS jobs ( job_id TEXT PRIMARY KEY, title TEXT, salary TEXT, experience TEXT, education TEXT, location TEXT, company TEXT, company_url TEXT, job_tags TEXT, company_tags TEXT, link TEXT, jd_text TEXT, skills TEXT, keyword TEXT, created_at TEXT DEFAULT (datetime('now','localtime')), detail_at TEXT ) """) conn.execute(""" CREATE TABLE IF NOT EXISTS progress ( scope TEXT PRIMARY KEY, -- f"{keyword}|{jl}" last_page INTEGER DEFAULT 0 ) """) conn.commit() return conn def job_id_from_link(link: str) -> str: m = re.search(r"/jobdetail/([^.?/]+)\.htm", link) return m.group(1) if m else link def get_last_page(conn: sqlite3.Connection, scope: str) -> int: row = conn.execute("SELECT last_page FROM progress WHERE scope=?", (scope,)).fetchone() return row[0] if row else 0 def set_last_page(conn: sqlite3.Connection, scope: str, page: int) -> None: conn.execute( "INSERT INTO progress(scope,last_page) VALUES(?,?) " "ON CONFLICT(scope) DO UPDATE SET last_page=excluded.last_page", (scope, page), ) conn.commit() def upsert_job(conn: sqlite3.Connection, r: dict, keyword: str) -> bool: jid = job_id_from_link(r.get("link", "")) if not jid: return False cur = conn.execute( """INSERT OR IGNORE INTO jobs (job_id,title,salary,experience,education,location,company, company_url,job_tags,company_tags,link,keyword) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)""", (jid, r.get("title"), r.get("salary"), r.get("experience"), r.get("education"), r.get("location"), r.get("company"), r.get("company_url"), json.dumps(r.get("job_tags") or [], ensure_ascii=False), json.dumps(r.get("company_tags") or [], ensure_ascii=False), r.get("link"), keyword), ) conn.commit() return cur.rowcount > 0 # ── 抓取 ──────────────────────────────────────────────────────────────── def crawl_list(conn, page, keyword: str, pages: int, jl: str | None) -> None: scope = f"{keyword}|{jl or ''}" start = get_last_page(conn, scope) + 1 if start > pages: print(f"[列表] '{keyword}' 已抓到第 {start-1} 页, 目标 {pages} 页, 无需续抓。") return print(f"[列表] '{keyword}' 从第 {start} 页抓到第 {pages} 页") for n in range(start, pages + 1): url = f"https://sou.zhaopin.com/?kw={quote(keyword)}&p={n}" if jl: url += f"&jl={jl}" try: page.goto(url, wait_until="domcontentloaded", timeout=60000) page.wait_for_selector(".joblist-box__item", timeout=20000) except Exception: print(f" 第 {n} 页未加载出列表, 停止本轮 (重跑可从此页续)。") break page.wait_for_timeout(1200) rows = page.evaluate(_LIST_JS) new = sum(upsert_job(conn, r, keyword) for r in rows) set_last_page(conn, scope, n) print(f" 第 {n} 页: {len(rows)} 条 (新增 {new})") time.sleep(random.uniform(3, 7)) def crawl_details(conn, page, keyword: str | None, limit: int | None) -> None: q = "SELECT job_id,link FROM jobs WHERE (jd_text IS NULL OR jd_text='') AND link!=''" params: list = [] if keyword: q += " AND keyword=?" params.append(keyword) q += " ORDER BY created_at" if limit: q += f" LIMIT {int(limit)}" todo = conn.execute(q, params).fetchall() if not todo: print("[详情] 没有待补充 JD 的岗位。") return print(f"[详情] 待抓 JD: {len(todo)} 条") for i, (jid, link) in enumerate(todo, 1): try: page.goto(link, wait_until="domcontentloaded", timeout=60000) page.wait_for_selector(".describtion-card__detail-content", timeout=15000) page.wait_for_timeout(800) data = page.evaluate(_DETAIL_JS) except Exception as e: print(f" [{i}/{len(todo)}] {jid} 抓取失败: {str(e)[:60]}") continue conn.execute( "UPDATE jobs SET jd_text=?, skills=?, detail_at=datetime('now','localtime') " "WHERE job_id=?", (data.get("jd", ""), json.dumps(data.get("skills") or [], ensure_ascii=False), jid), ) conn.commit() # 逐条提交 => 中断安全 jd_len = len(data.get("jd", "")) print(f" [{i}/{len(todo)}] {jid} JD {jd_len} 字") time.sleep(random.uniform(2, 5)) # ── 导出 ──────────────────────────────────────────────────────────────── def export_csv(conn, path: str, keyword: str | None) -> None: q = ("SELECT title,salary,experience,education,location,company," "job_tags,company_tags,skills,jd_text,link FROM jobs") params: list = [] if keyword: q += " WHERE keyword=?" params.append(keyword) rows = conn.execute(q, params).fetchall() cols = ["title", "salary", "experience", "education", "location", "company", "job_tags", "company_tags", "skills", "jd_text", "link"] with open(path, "w", newline="", encoding="utf-8-sig") as f: w = csv.writer(f) w.writerow(cols) for row in rows: row = list(row) for idx in (6, 7, 8): # json 数组列 -> 用 / 连接 try: row[idx] = " / ".join(json.loads(row[idx] or "[]")) except Exception: pass w.writerow(row) print(f"已导出 {len(rows)} 条 -> {path}") # ── main ──────────────────────────────────────────────────────────────── def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--keyword", default="AI", help="搜索关键词") ap.add_argument("--pages", type=int, default=3, help="列表抓取页数") ap.add_argument("--jl", default=None, help="城市编码, 不填=全国") ap.add_argument("--db", default=DB_DEFAULT, help="SQLite 文件路径") ap.add_argument("--no-detail", action="store_true", help="只抓列表, 不进详情页") ap.add_argument("--detail-limit", type=int, default=None, help="本轮最多抓多少条 JD") ap.add_argument("--headful", action="store_true", help="显示浏览器窗口") ap.add_argument("--reset", action="store_true", help="重置该关键词的列表进度") ap.add_argument("--export", metavar="CSV", help="导出已抓数据到 CSV 后退出") args = ap.parse_args() with closing(init_db(args.db)) as conn: if args.export: export_csv(conn, args.export, args.keyword if args.keyword != "AI" else None) return if args.reset: scope = f"{args.keyword}|{args.jl or ''}" conn.execute("DELETE FROM progress WHERE scope=?", (scope,)) conn.commit() print(f"已重置进度: {scope}") with InvisiblePlaywright(seed=42, headless=not args.headful) as browser: page = browser.new_page() crawl_list(conn, page, args.keyword, args.pages, args.jl) if not args.no_detail: crawl_details(conn, page, args.keyword, args.detail_limit) total = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0] with_jd = conn.execute( "SELECT COUNT(*) FROM jobs WHERE jd_text IS NOT NULL AND jd_text!=''" ).fetchone()[0] print(f"\n库内合计 {total} 条, 其中含 JD 全文 {with_jd} 条 -> {args.db}") if __name__ == "__main__": main()