"""Scrape "AI-related" job postings from Zhaopin (智联招聘).

No login is used: the rendered search-results page is parsed directly.

Compliance notes:
- Only publicly displayed fields are collected (job title, salary, company,
  tags, and similar).
- Recruiters' personal contact details are not collected; requests are kept
  low-frequency. Comply with the target site's terms of service; use at your
  own risk.

Usage:
    python examples/scrape_zhaopin_ai.py                    # default: "AI", first 3 pages
    python examples/scrape_zhaopin_ai.py --keyword 大模型 --pages 5
    python examples/scrape_zhaopin_ai.py --keyword AI --jl 530   # 530 = Beijing
    python examples/scrape_zhaopin_ai.py --headful          # show the browser window

City codes (jl):
    Beijing 530, Shanghai 538, Guangzhou 763, Shenzhen 765,
    Hangzhou 653, Chengdu 801, Wuhan 736, Nanjing 635
"""

from __future__ import annotations

import argparse
import csv
import json
import random
import time
from urllib.parse import quote

from invisible_playwright import InvisiblePlaywright

# Extract every card in a single pass inside the rendered page; this is faster
# and more robust than querying elements one by one from Python.
# The selectors come from probing the real page (see _probe_zhaopin.py).
_EXTRACT_JS = r"""
() => {
  const txt = (el) => el ? el.innerText.trim() : "";
  const cards = Array.from(document.querySelectorAll(".joblist-box__item"));
  return cards.map(card => {
    const nameA = card.querySelector("a.jobinfo__name");
    const info = Array.from(card.querySelectorAll(".jobinfo__other-info-item"))
      .map(e => e.innerText.trim());
    // 第一个 other-info 是地点(带图标), 取其 span; 其余按顺序是经验/学历
    const locSpan = card.querySelector(".jobinfo__other-info-item span");
    const companyA = card.querySelector("a.companyinfo__name");
    return {
      title: txt(nameA),
      link: nameA ? nameA.href : "",
      salary: txt(card.querySelector(".jobinfo__salary")),
      job_tags: Array.from(card.querySelectorAll(".jobinfo__tag .joblist-box__item-tag"))
        .map(e => e.innerText.trim()),
      location: locSpan ? locSpan.innerText.trim() : (info[0] || ""),
      experience: info[1] || "",
      education: info[2] || "",
      company: companyA ? (companyA.getAttribute("title") || companyA.innerText).trim() : "",
      company_url: companyA ? companyA.href : "",
      company_tags: Array.from(card.querySelectorAll(".companyinfo__tag .joblist-box__item-tag"))
        .map(e => e.innerText.trim()),
    };
  });
}
"""

# Column order of the CSV export; keys match the objects built by _EXTRACT_JS.
_CSV_FIELDS = (
    "title",
    "salary",
    "experience",
    "education",
    "location",
    "company",
    "job_tags",
    "company_tags",
    "link",
    "company_url",
)

# Characters that are path separators or are rejected in file names on common
# platforms (notably Windows); they are replaced when building output names.
_UNSAFE_FILENAME_CHARS = frozenset('\\/:*?"<>|')


def _safe_stem(keyword: str) -> str:
    """Return the output file stem for *keyword*, safe to use as a file name.

    Path separators, characters from ``_UNSAFE_FILENAME_CHARS`` and
    non-printable characters are replaced with ``_`` so that a keyword such
    as ``C/C++`` cannot make ``open()`` fail (or point into another
    directory) after the scrape has already completed. Ordinary keywords
    are left untouched, e.g. ``AI`` -> ``zhaopin_AI``.
    """
    cleaned = "".join(
        "_" if (ch in _UNSAFE_FILENAME_CHARS or not ch.isprintable()) else ch
        for ch in keyword
    )
    return f"zhaopin_{cleaned}"


def scrape(keyword: str, pages: int, jl: str | None, headful: bool) -> list[dict]:
    """Collect de-duplicated job cards from the first *pages* result pages.

    Args:
        keyword: Search keyword; URL-quoted into the ``kw`` query parameter.
        pages: Number of result pages to request (pages ``1..pages``).
        jl: Optional city code appended as the ``jl`` query parameter;
            a falsy value means no city filter is sent.
        headful: When true the browser is started with ``headless=False``.

    Returns:
        One dict per unique card, in page order, with the keys produced by
        ``_EXTRACT_JS``. Cards are de-duplicated by ``link``, falling back to
        ``title + company`` when the link is empty.

    Pages whose job list does not appear in time are reported and skipped
    rather than aborting the run.
    """
    results: list[dict] = []
    seen: set[str] = set()
    # NOTE(review): seed=42 presumably pins the browser fingerprint so runs
    # are reproducible — confirm against invisible_playwright's docs.
    with InvisiblePlaywright(seed=42, headless=not headful) as browser:
        page = browser.new_page()
        for n in range(1, pages + 1):
            if n > 1:
                # Low request rate to reduce the chance of anti-bot blocking.
                # Sleeping *before* each follow-up request means the delay is
                # also honoured after a skipped page, and no time is wasted
                # after the final page.
                time.sleep(random.uniform(3, 7))

            url = f"https://sou.zhaopin.com/?kw={quote(keyword)}&p={n}"
            if jl:
                url += f"&jl={jl}"
            try:
                page.goto(url, wait_until="domcontentloaded", timeout=60000)
                page.wait_for_selector(".joblist-box__item", timeout=20000)
            except Exception as exc:  # best-effort: report and move on
                print(
                    f"第 {n} 页未加载出岗位列表, 跳过 (可能触发风控)。"
                    f"[{type(exc).__name__}]"
                )
                continue

            page.wait_for_timeout(1500)  # let lazily loaded content fill in
            rows = page.evaluate(_EXTRACT_JS)

            new = 0
            for r in rows:
                key = r.get("link") or (r.get("title", "") + r.get("company", ""))
                if key and key not in seen:
                    seen.add(key)
                    results.append(r)
                    new += 1
            print(f"第 {n} 页: 抓到 {len(rows)} 条 (新增 {new})")
    return results


def save(rows: list[dict], stem: str) -> None:
    """Write *rows* to ``{stem}.json`` and, when non-empty, ``{stem}.csv``.

    The JSON file is always written (an empty list yields ``[]``). The CSV is
    written only when there is at least one row; it uses a UTF-8 BOM
    (``utf-8-sig``), and the list-valued ``job_tags`` / ``company_tags``
    fields are flattened to ``" / "``-joined strings. Keys not listed in
    ``_CSV_FIELDS`` are ignored.

    The final status line lists only the files that were actually written.
    """
    json_path = f"{stem}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)
    written = [json_path]

    if rows:
        csv_path = f"{stem}.csv"
        with open(csv_path, "w", newline="", encoding="utf-8-sig") as f:
            w = csv.DictWriter(f, fieldnames=_CSV_FIELDS, extrasaction="ignore")
            w.writeheader()
            for r in rows:
                row = dict(r)  # copy so the caller's rows keep their lists
                row["job_tags"] = " / ".join(row.get("job_tags") or [])
                row["company_tags"] = " / ".join(row.get("company_tags") or [])
                w.writerow(row)
        written.append(csv_path)

    print(f"\n已保存 {len(rows)} 条 → {' / '.join(written)}")


def main() -> None:
    """Parse command-line options, run the scrape and save the results."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--keyword", default="AI", help="搜索关键词")
    ap.add_argument("--pages", type=int, default=3, help="抓取页数")
    ap.add_argument("--jl", default=None, help="城市编码, 不填=全国")
    ap.add_argument("--headful", action="store_true", help="显示浏览器窗口")
    args = ap.parse_args()

    rows = scrape(args.keyword, args.pages, args.jl, args.headful)
    # The keyword is user input and may contain path separators or other
    # characters that are invalid in file names; sanitize before saving.
    save(rows, stem=_safe_stem(args.keyword))


if __name__ == "__main__":
    main()