"""抓取 BOSS 直聘「AI 相关」岗位(自用 / 学习用途)。 合规提醒: - 仅抓取公开展示的岗位标题/薪资/公司等字段, 不抓取招聘者个人联系方式。 - 低频请求, 遵守目标站点服务条款; 风险自负。 首次使用: python scrape_zhipin_ai.py --login # 打开浏览器, 手动扫码登录一次 之后: python scrape_zhipin_ai.py --keyword AI --city 101010100 --pages 3 """ from __future__ import annotations import argparse import csv import json import random import time from pathlib import Path from urllib.parse import quote from invisible_playwright import InvisiblePlaywright PROFILE_DIR = Path(__file__).parent / ".zhipin_profile" # 持久化登录态 SEED = 20240614 # 固定 seed → 跨会话指纹一致, 配合持久化 profile def login_flow() -> None: """首次手动登录: 打开页面, 你扫码, 登录态写入 PROFILE_DIR。""" with InvisiblePlaywright(seed=SEED, profile_dir=PROFILE_DIR) as ctx: page = ctx.new_page() page.goto("https://www.zhipin.com/web/user/?ka=header-login", wait_until="domcontentloaded") print("请在打开的浏览器中扫码登录, 登录完成后回到终端按回车...") input() # 等你登录完成 print("登录态已保存到", PROFILE_DIR) def scrape(keyword: str, city: str, pages: int) -> list[dict]: results: list[dict] = [] with InvisiblePlaywright(seed=SEED, profile_dir=PROFILE_DIR) as ctx: page = ctx.new_page() for n in range(1, pages + 1): url = ( "https://www.zhipin.com/web/geek/job" f"?query={quote(keyword)}&city={city}&page={n}" ) page.goto(url, wait_until="domcontentloaded") # 等列表渲染; 选择器需按实际页面结构核对/调整 try: page.wait_for_selector("li.job-card-wrapper", timeout=15000) except Exception: print(f"第 {n} 页未出现岗位列表, 可能需要登录或触发了验证码。") # 给你时间手动过验证码 input("处理完页面后按回车继续...") rows = page.eval_on_selector_all( "li.job-card-wrapper", """els => els.map(e => ({ title: e.querySelector('.job-name')?.innerText?.trim(), salary: e.querySelector('.salary')?.innerText?.trim(), company: e.querySelector('.company-name')?.innerText?.trim(), tags: Array.from(e.querySelectorAll('.tag-list li')) .map(t => t.innerText.trim()), area: e.querySelector('.job-area')?.innerText?.trim(), link: e.querySelector('a.job-card-left')?.href || e.querySelector('a')?.href, }))""", ) print(f"第 {n} 页抓到 {len(rows)} 条") results.extend(rows) # 低频: 随机停顿, 降低风控触发概率 time.sleep(random.uniform(4, 9)) return results def save(rows: list[dict], stem: str) -> None: Path(f"{stem}.json").write_text( json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8" ) if rows: keys = ["title", "salary", "company", "area", "tags", "link"] with open(f"{stem}.csv", "w", newline="", encoding="utf-8-sig") as f: w = csv.DictWriter(f, fieldnames=keys, extrasaction="ignore") w.writeheader() for r in rows: r = dict(r) r["tags"] = " / ".join(r.get("tags") or []) w.writerow(r) print(f"已保存 {len(rows)} 条 → {stem}.json / {stem}.csv") def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--login", action="store_true", help="首次手动登录") ap.add_argument("--keyword", default="AI", help="搜索关键词") ap.add_argument("--city", default="101010100", help="城市编码 (101010100=北京)") ap.add_argument("--pages", type=int, default=3, help="抓取页数") args = ap.parse_args() if args.login: login_flow() return rows = scrape(args.keyword, args.city, args.pages) save(rows, stem=f"zhipin_{args.keyword}") if __name__ == "__main__": main()