ba2a67290d
e2e / e2e (linux, xvfb) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.11) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.12) (push) Waiting to run
tests / pytest (windows-latest, py3.11) (push) Waiting to run
tests / pytest (windows-latest, py3.12) (push) Waiting to run
- scrape_zhaopin_ai.py: lightweight no-login list scraper - scrape_zhaopin_full.py: SQLite storage, resumable crawl, detail-page JD - analyze_zhaopin.py: stats by city/salary/education/experience/skills - scrape_zhipin_ai.py: BOSS Zhipin variant (login-based, fallback) - gitignore scraper data artifacts and browser profile
121 lines
5.0 KiB
Python
121 lines
5.0 KiB
Python
"""抓取智联招聘「AI 相关」岗位 —— 免登录, 渲染搜索结果页直接解析。

合规提醒:
- 仅抓取公开展示的岗位标题/薪资/公司/标签等字段。
- 不抓取招聘者个人联系方式; 低频请求, 遵守目标站点服务条款, 风险自负。

用法:
    python examples/scrape_zhaopin_ai.py                      # 默认抓 "AI" 前 3 页
    python examples/scrape_zhaopin_ai.py --keyword 大模型 --pages 5
    python examples/scrape_zhaopin_ai.py --keyword AI --jl 530   # 530=北京
    python examples/scrape_zhaopin_ai.py --headful            # 显示浏览器窗口

城市编码(jl): 北京530 上海538 广州763 深圳765 杭州653 成都801 武汉736 南京635
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import json
|
|
import random
|
|
import time
|
|
from urllib.parse import quote
|
|
|
|
from invisible_playwright import InvisiblePlaywright
|
|
|
|
# Pull every job card out of the rendered page in a single evaluate() call;
# this is faster and more stable than querying each card one by one.
# The selectors come from probing the real page (see _probe_zhaopin.py).
#
# The script returns one object per ".joblist-box__item" card with the keys
# title, link, salary, job_tags, location, experience, education, company,
# company_url and company_tags. Missing elements yield "" (or an empty list
# for the tag fields) rather than raising inside the page.
_EXTRACT_JS = r"""
() => {
  const txt = (el) => el ? el.innerText.trim() : "";
  const cards = Array.from(document.querySelectorAll(".joblist-box__item"));
  return cards.map(card => {
    const nameA = card.querySelector("a.jobinfo__name");
    const info = Array.from(card.querySelectorAll(".jobinfo__other-info-item"))
      .map(e => e.innerText.trim());
    // 第一个 other-info 是地点(带图标), 取其 span; 其余按顺序是经验/学历
    const locSpan = card.querySelector(".jobinfo__other-info-item span");
    const companyA = card.querySelector("a.companyinfo__name");
    return {
      title: txt(nameA),
      link: nameA ? nameA.href : "",
      salary: txt(card.querySelector(".jobinfo__salary")),
      job_tags: Array.from(card.querySelectorAll(".jobinfo__tag .joblist-box__item-tag"))
        .map(e => e.innerText.trim()),
      location: locSpan ? locSpan.innerText.trim() : (info[0] || ""),
      experience: info[1] || "",
      education: info[2] || "",
      company: companyA ? (companyA.getAttribute("title") || companyA.innerText).trim()
                        : "",
      company_url: companyA ? companyA.href : "",
      company_tags: Array.from(card.querySelectorAll(".companyinfo__tag .joblist-box__item-tag"))
        .map(e => e.innerText.trim()),
    };
  });
}
"""
|
|
|
|
|
|
def scrape(keyword: str, pages: int, jl: str | None, headful: bool) -> list[dict]:
|
|
results: list[dict] = []
|
|
seen: set[str] = set()
|
|
with InvisiblePlaywright(seed=42, headless=not headful) as browser:
|
|
page = browser.new_page()
|
|
for n in range(1, pages + 1):
|
|
url = f"https://sou.zhaopin.com/?kw={quote(keyword)}&p={n}"
|
|
if jl:
|
|
url += f"&jl={jl}"
|
|
try:
|
|
page.goto(url, wait_until="domcontentloaded", timeout=60000)
|
|
page.wait_for_selector(".joblist-box__item", timeout=20000)
|
|
except Exception:
|
|
print(f"第 {n} 页未加载出岗位列表, 跳过 (可能触发风控)。")
|
|
continue
|
|
page.wait_for_timeout(1500) # 让懒加载内容补齐
|
|
|
|
rows = page.evaluate(_EXTRACT_JS)
|
|
new = 0
|
|
for r in rows:
|
|
key = r.get("link") or (r.get("title", "") + r.get("company", ""))
|
|
if key and key not in seen:
|
|
seen.add(key)
|
|
results.append(r)
|
|
new += 1
|
|
print(f"第 {n} 页: 抓到 {len(rows)} 条 (新增 {new})")
|
|
|
|
time.sleep(random.uniform(3, 7)) # 低频, 降低风控
|
|
return results
|
|
|
|
|
|
def save(rows: list[dict], stem: str) -> None:
    """Write *rows* to ``{stem}.json`` and ``{stem}.csv``.

    The JSON file holds the rows unchanged. The CSV file holds a fixed set of
    columns; keys outside that set are ignored, missing keys become empty
    cells, and the ``job_tags`` / ``company_tags`` lists are flattened to a
    single ``" / "``-separated cell.

    Both files are always (re)written, so an empty result produces ``[]`` plus
    a header-only CSV instead of leaving a stale CSV from a previous run.

    Args:
        rows: Scraped records, one dict per job.
        stem: Output path without extension.
    """
    with open(f"{stem}.json", "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)

    fields = [
        "title", "salary", "experience", "education", "location",
        "company", "job_tags", "company_tags", "link", "company_url",
    ]
    # utf-8-sig prepends a BOM, which spreadsheet tools typically use to
    # recognise the file as UTF-8.
    with open(f"{stem}.csv", "w", newline="", encoding="utf-8-sig") as f:
        w = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
        w.writeheader()
        for r in rows:
            row = dict(r)  # copy so the caller's record is not mutated
            for col in ("job_tags", "company_tags"):
                row[col] = " / ".join(map(str, row.get(col) or []))
            w.writerow(row)

    print(f"\n已保存 {len(rows)} 条 → {stem}.json / {stem}.csv")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse options, scrape, and save the results.

    Output goes to ``zhaopin_<keyword>.json`` / ``.csv`` in the current
    directory. Characters in the keyword that are not usable in a file name
    (path separators and the characters Windows reserves) are replaced with
    ``_`` for the file name only; the search itself uses the keyword as given.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--keyword", default="AI", help="搜索关键词")
    ap.add_argument("--pages", type=int, default=3, help="抓取页数")
    ap.add_argument("--jl", default=None, help="城市编码, 不填=全国")
    ap.add_argument("--headful", action="store_true", help="显示浏览器窗口")
    args = ap.parse_args()

    if args.pages < 1:
        # Fail fast with a usage error instead of launching a browser for nothing.
        ap.error("--pages 必须 >= 1")

    rows = scrape(args.keyword, args.pages, args.jl, args.headful)

    # A keyword such as "AI/ML" would otherwise be treated as a path.
    unsafe = set('\\/:*?"<>|')
    safe_keyword = "".join(
        "_" if (ch in unsafe or not ch.isprintable()) else ch
        for ch in args.keyword
    ).strip(" .") or "keyword"
    save(rows, stem=f"zhaopin_{safe_keyword}")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|