ba2a67290d
e2e / e2e (linux, xvfb) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.11) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.12) (push) Waiting to run
tests / pytest (windows-latest, py3.11) (push) Waiting to run
tests / pytest (windows-latest, py3.12) (push) Waiting to run
- scrape_zhaopin_ai.py: lightweight no-login list scraper - scrape_zhaopin_full.py: SQLite storage, resumable crawl, detail-page JD - analyze_zhaopin.py: stats by city/salary/education/experience/skills - scrape_zhipin_ai.py: BOSS Zhipin variant (login-based, fallback) - gitignore scraper data artifacts and browser profile
113 lines
4.3 KiB
Python
113 lines
4.3 KiB
Python
"""抓取 BOSS 直聘「AI 相关」岗位(自用 / 学习用途)。
|
|
|
|
合规提醒:
|
|
- 仅抓取公开展示的岗位标题/薪资/公司等字段, 不抓取招聘者个人联系方式。
|
|
- 低频请求, 遵守目标站点服务条款; 风险自负。
|
|
|
|
首次使用:
|
|
python scrape_zhipin_ai.py --login # 打开浏览器, 手动扫码登录一次
|
|
之后:
|
|
python scrape_zhipin_ai.py --keyword AI --city 101010100 --pages 3
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import json
|
|
import random
|
|
import time
|
|
from pathlib import Path
|
|
from urllib.parse import quote
|
|
|
|
from invisible_playwright import InvisiblePlaywright
|
|
|
|
# Browser profile directory stored next to this script; keeps the login state
# between runs.
PROFILE_DIR = Path(__file__).parent / ".zhipin_profile"
# Fixed seed so the browser fingerprint stays the same across sessions; meant
# to be used together with the persistent profile.
SEED = 20240614
|
|
|
|
|
|
def login_flow() -> None:
    """Run the one-time manual login.

    Opens the BOSS Zhipin login page in a browser that uses ``PROFILE_DIR``
    as its profile directory, then blocks until the user presses Enter in the
    terminal to confirm that the QR-code login is done.
    """
    login_url = "https://www.zhipin.com/web/user/?ka=header-login"
    with InvisiblePlaywright(seed=SEED, profile_dir=PROFILE_DIR) as browser:
        tab = browser.new_page()
        tab.goto(login_url, wait_until="domcontentloaded")
        print("请在打开的浏览器中扫码登录, 登录完成后回到终端按回车...")
        # Block here until the user has finished logging in.
        input()
    print("登录态已保存到", PROFILE_DIR)
|
|
|
|
|
|
def scrape(keyword: str, city: str, pages: int) -> list[dict]:
    """Collect job cards from the BOSS Zhipin search result pages.

    Args:
        keyword: Search keyword; percent-encoded before it is put in the URL.
        city: City code understood by the site (e.g. ``101010100``); also
            percent-encoded so stray characters cannot alter the query string.
        pages: Number of result pages to visit, starting at page 1. A value
            below 1 returns an empty list without launching a browser.

    Returns:
        One dict per job card, in page order, with the fields extracted by
        the in-page script: ``title``, ``salary``, ``company``, ``tags``,
        ``area`` and ``link``.
    """
    # Nothing to fetch: avoid starting a browser session at all.
    if pages < 1:
        return []

    # NOTE: the selectors must be checked against the live page structure and
    # adjusted when the site changes its markup.
    card_selector = "li.job-card-wrapper"
    extract_cards_js = """els => els.map(e => ({
        title: e.querySelector('.job-name')?.innerText?.trim(),
        salary: e.querySelector('.salary')?.innerText?.trim(),
        company: e.querySelector('.company-name')?.innerText?.trim(),
        tags: Array.from(e.querySelectorAll('.tag-list li'))
            .map(t => t.innerText.trim()),
        area: e.querySelector('.job-area')?.innerText?.trim(),
        link: e.querySelector('a.job-card-left')?.href
            || e.querySelector('a')?.href,
    }))"""

    results: list[dict] = []
    with InvisiblePlaywright(seed=SEED, profile_dir=PROFILE_DIR) as ctx:
        page = ctx.new_page()
        for n in range(1, pages + 1):
            url = (
                "https://www.zhipin.com/web/geek/job"
                f"?query={quote(keyword)}&city={quote(city)}&page={n}"
            )
            page.goto(url, wait_until="domcontentloaded")

            # Wait for the list to render. Any failure here (typically a
            # timeout) is treated as "login or captcha required": the user
            # gets a chance to fix the page by hand, then extraction proceeds.
            try:
                page.wait_for_selector(card_selector, timeout=15000)
            except Exception:
                print(f"第 {n} 页未出现岗位列表, 可能需要登录或触发了验证码。")
                input("处理完页面后按回车继续...")

            rows = page.eval_on_selector_all(card_selector, extract_cards_js)
            print(f"第 {n} 页抓到 {len(rows)} 条")
            results.extend(rows)

            # Low request rate: random pause between pages to reduce the
            # chance of triggering risk control. No pause after the last page.
            if n < pages:
                time.sleep(random.uniform(4, 9))
    return results
|
|
|
|
|
|
def save(rows: list[dict], stem: str) -> None:
    """Write the scraped rows to ``<stem>.json`` and ``<stem>.csv``.

    The JSON file holds ``rows`` unchanged. The CSV file has the fixed columns
    ``title, salary, company, area, tags, link``; other keys are ignored and
    the ``tags`` list is flattened into one ``" / "``-separated cell.

    Both files are written on every call. With no rows the CSV contains only
    the header, so a CSV left over from an earlier run can never sit next to
    a fresh JSON file, and the final message always names files that exist.

    Args:
        rows: Job records as returned by the scraper.
        stem: Output path without extension.
    """
    json_path = Path(f"{stem}.json")
    csv_path = Path(f"{stem}.csv")

    json_path.write_text(
        json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    keys = ["title", "salary", "company", "area", "tags", "link"]
    # utf-8-sig writes a BOM, presumably so spreadsheet tools detect UTF-8.
    with csv_path.open("w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=keys, extrasaction="ignore")
        writer.writeheader()
        for row in rows:
            # Build a new dict so the caller's row is left untouched.
            flat = dict(row)
            flat["tags"] = " / ".join(flat.get("tags") or [])
            writer.writerow(flat)

    print(f"已保存 {len(rows)} 条 → {stem}.json / {stem}.csv")
|
|
|
|
|
|
def main() -> None:
    """Parse the command line and run either the login flow or a scrape.

    With ``--login`` only the manual login flow runs. Otherwise the search
    results for ``--keyword`` / ``--city`` are scraped for ``--pages`` pages
    and saved under a ``zhipin_<keyword>`` file stem in the current directory.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--login", action="store_true", help="首次手动登录")
    ap.add_argument("--keyword", default="AI", help="搜索关键词")
    ap.add_argument("--city", default="101010100", help="城市编码 (101010100=北京)")
    ap.add_argument("--pages", type=int, default=3, help="抓取页数")
    args = ap.parse_args()

    if args.login:
        login_flow()
        return

    # Reject a page count that cannot produce any result instead of silently
    # writing empty output files.
    if args.pages < 1:
        ap.error("--pages 必须是大于等于 1 的整数")

    # The keyword becomes part of the output file names, so replace anything
    # that is not a letter, digit, "-" or "_" (path separators, spaces, and
    # characters Windows forbids) with "_". Plain keywords such as the
    # default "AI" are unchanged.
    safe_keyword = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in args.keyword
    )

    rows = scrape(args.keyword, args.city, args.pages)
    save(rows, stem=f"zhipin_{safe_keyword}")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|