examples: add zhaopin AI job scrapers + SQLite analysis
e2e / e2e (linux, xvfb) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.11) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.12) (push) Waiting to run
tests / pytest (windows-latest, py3.11) (push) Waiting to run
tests / pytest (windows-latest, py3.12) (push) Waiting to run

- scrape_zhaopin_ai.py: lightweight no-login list scraper
- scrape_zhaopin_full.py: SQLite storage, resumable crawl, detail-page JD
- analyze_zhaopin.py: stats by city/salary/education/experience/skills
- scrape_zhipin_ai.py: BOSS Zhipin variant (login-based, fallback)
- gitignore scraper data artifacts and browser profile
This commit is contained in:
freedakgmail
2026-06-14 23:18:51 +08:00
parent 29262a644e
commit ba2a67290d
5 changed files with 684 additions and 0 deletions
+6
View File
@@ -6,3 +6,9 @@ build/
.pytest_cache/
.venv/
firefox-source/
# scraper runtime artifacts (examples/) — data outputs & browser profiles
*.db
zhaopin_*.json
zhaopin_*.csv
ai_jobs.csv
examples/.zhipin_profile/
+169
View File
@@ -0,0 +1,169 @@
"""智联招聘抓取数据分析 —— 读取 SQLite, 按城市/薪资/学历/经验/技能做统计。
纯标准库, 无需 pandas。
用法:
python examples/analyze_zhaopin.py # 全量分析
python examples/analyze_zhaopin.py --keyword AI # 只看某关键词
python examples/analyze_zhaopin.py --city 北京 # 只看某城市
python examples/analyze_zhaopin.py --top 15 # 排行榜取前 15
python examples/analyze_zhaopin.py --db zhaopin_jobs.db
"""
from __future__ import annotations
import argparse
import json
import re
import sqlite3
import statistics
from collections import Counter
WORKDAYS_PER_MONTH = 21.75 # 日薪 -> 月薪折算
def parse_salary(s: str) -> tuple[int, int] | None:
"""把薪资字符串解析成 (月薪下限, 月薪上限) 元。无法解析返回 None。"""
if not s or "面议" in s:
return None
s = s.strip()
is_daily = "/天" in s or "元/天" in s
is_wan = "" in s
nums = re.findall(r"\d+(?:\.\d+)?", s)
if not nums:
return None
vals = [float(x) for x in nums[:2]]
if len(vals) == 1:
vals = [vals[0], vals[0]]
lo, hi = vals[0], vals[1]
if is_wan:
lo, hi = lo * 10000, hi * 10000
if is_daily:
lo, hi = lo * WORKDAYS_PER_MONTH, hi * WORKDAYS_PER_MONTH
return int(lo), int(hi)
def city_of(location: str) -> str:
    """Return the city part of a location, i.e. the text before the first "·".

    Falls back to "未知" when the location is empty/None or the city part is blank.
    """
    city, _sep, _rest = (location or "").partition("·")
    city = city.strip()
    if city:
        return city
    return "未知"
# (inclusive lower bound, exclusive upper bound, label) for each monthly-salary bucket.
SALARY_BUCKETS = [
    (0, 5000, "<5k"),
    (5000, 8000, "5-8k"),
    (8000, 12000, "8-12k"),
    (12000, 18000, "12-18k"),
    (18000, 25000, "18-25k"),
    (25000, 10**9, ">=25k"),
]


def bucket_of(mid: float) -> str:
    """Return the label of the salary bucket containing ``mid``.

    Buckets are half-open (``lower <= mid < upper``); "?" is returned when no
    bucket matches.
    """
    matching = (
        label
        for lower, upper, label in SALARY_BUCKETS
        if lower <= mid < upper
    )
    return next(matching, "?")
def bar(n: int, maxn: int, width: int = 30) -> str:
    """Render a horizontal bar for ``n``, scaled so that ``maxn`` fills ``width`` cells.

    Returns an empty string when ``maxn`` is not positive.  Otherwise the bar
    is at least one cell long so that small counts stay visible.
    """
    if maxn <= 0:
        return ""
    # A visible fill glyph is required: repeating an empty string always
    # yields an empty bar, whatever the computed length.
    return "█" * max(1, round(n / maxn * width))
def section(title: str) -> None:
    """Print a blank line, then ``title`` framed by two 56-character "=" rules."""
    rule = "=" * 56
    print("", rule, title, rule, sep="\n")
def rank_table(counter: Counter, top: int, label: str) -> None:
    """Print the ``top`` most common entries of ``counter`` as a ranked bar table.

    Each row shows the name (cut and padded to 18 characters), the count and a
    bar scaled against the largest count.  Prints a placeholder line when the
    counter is empty.  ``label`` is accepted but not used.
    """
    if len(counter) == 0:
        print(" (无数据)")
        return
    peak = max(counter.values())
    for name, count in counter.most_common(top):
        print(f" {name[:18]:<18} {count:>4} {bar(count, peak)}")
def main() -> None:
    """Command-line entry point: read scraped jobs from SQLite and print statistics.

    Options: ``--db`` (database path), ``--keyword`` / ``--city`` (filters) and
    ``--top`` (rows per ranking).  Prints the distribution by city, salary
    bucket, education and experience, plus the skill/tag frequency ranking.
    """
    from contextlib import closing  # local import: the block brings its own dependency

    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default="zhaopin_jobs.db")
    ap.add_argument("--keyword", default=None, help="按关键词过滤")
    ap.add_argument("--city", default=None, help="按城市过滤")
    ap.add_argument("--top", type=int, default=12, help="排行榜条数")
    args = ap.parse_args()

    q = ("SELECT title,salary,experience,education,location,company,"
         "job_tags,company_tags,skills FROM jobs")
    params: list = []
    if args.keyword:
        q += " WHERE keyword=?"
        params.append(args.keyword)
    # All rows are fetched up front, so the connection is closed right away
    # instead of being left open for the rest of the run.
    with closing(sqlite3.connect(args.db)) as conn:
        rows = conn.execute(q, params).fetchall()
    if args.city:
        rows = [r for r in rows if city_of(r[4]) == args.city]
    if not rows:
        print("没有匹配的数据。先用 scrape_zhaopin_full.py 抓一些, 或检查过滤条件。")
        return

    n_total = len(rows)
    print(f"分析样本: {n_total}"
          + (f" | 关键词={args.keyword}" if args.keyword else "")
          + (f" | 城市={args.city}" if args.city else ""))

    cities, edus, exps = Counter(), Counter(), Counter()
    skill_freq, salary_buckets = Counter(), Counter()
    mids: list[float] = []
    n_salary_parsed = 0
    for (_title, salary, exp, edu, loc, _comp, jtags, _ctags, skills) in rows:
        cities[city_of(loc)] += 1
        edus[(edu or "未知").strip() or "未知"] += 1
        exps[(exp or "未知").strip() or "未知"] += 1
        # Skills: merge the detail-page skills with the list-page job tags.
        for src in (skills, jtags):
            try:
                tags = json.loads(src or "[]")
            except (TypeError, ValueError):
                continue  # malformed JSON column: nothing to count
            if not isinstance(tags, list):
                continue
            for t in tags:
                if not isinstance(t, str):
                    continue
                t = t.strip()
                if t:
                    skill_freq[t] += 1
        rng = parse_salary(salary)
        if rng:
            n_salary_parsed += 1
            mid = (rng[0] + rng[1]) / 2
            mids.append(mid)
            salary_buckets[bucket_of(mid)] += 1

    section("城市分布 (Top)")
    rank_table(cities, args.top, "城市")

    section("薪资区间分布 (按月薪中位点)")
    if mids:
        # Show buckets in their declared order, not by frequency.
        order = {label: i for i, (_, _, label) in enumerate(SALARY_BUCKETS)}
        maxn = max(salary_buckets.values())
        for label in sorted(salary_buckets, key=lambda x: order.get(x, 99)):
            n = salary_buckets[label]
            print(f" {label:<8} {n:>4} {bar(n, maxn)}")
        print(f"\n 可解析薪资: {n_salary_parsed}/{n_total} 条 (面议等已排除)")
        print(f" 月薪中位点 平均: {statistics.mean(mids):>8,.0f}")
        print(f" 中位数: {statistics.median(mids):>8,.0f}")
        print(f" 最低/最高: {min(mids):,.0f} / {max(mids):,.0f}")
    else:
        print(" (无可解析薪资)")

    section("学历要求")
    rank_table(edus, args.top, "学历")
    section("经验要求")
    rank_table(exps, args.top, "经验")
    section("技能/标签词频 (Top)")
    rank_table(skill_freq, args.top, "技能")


if __name__ == "__main__":
    main()
+120
View File
@@ -0,0 +1,120 @@
"""抓取智联招聘「AI 相关」岗位 —— 免登录, 渲染搜索结果页直接解析。
合规提醒:
- 仅抓取公开展示的岗位标题/薪资/公司/标签等字段。
- 不抓取招聘者个人联系方式; 低频请求, 遵守目标站点服务条款, 风险自负。
用法:
python examples/scrape_zhaopin_ai.py # 默认抓 "AI" 前 3 页
python examples/scrape_zhaopin_ai.py --keyword 大模型 --pages 5
python examples/scrape_zhaopin_ai.py --keyword AI --jl 530 # 530=北京
python examples/scrape_zhaopin_ai.py --headful # 显示浏览器窗口
城市编码(jl): 北京530 上海538 广州763 深圳765 杭州653 成都801 武汉736 南京635
"""
from __future__ import annotations
import argparse
import csv
import json
import random
import time
from urllib.parse import quote
from invisible_playwright import InvisiblePlaywright
# Extract every job card in one pass inside the rendered page; this is faster
# and more stable than querying the cards one by one.
# The selectors come from probing the real page (see _probe_zhaopin.py).
# Each card becomes an object with title, link, salary, job_tags, location,
# experience, education, company, company_url and company_tags. The location
# is read from the span of the first ".jobinfo__other-info-item"; experience
# and education are the second and third info items.
_EXTRACT_JS = r"""
() => {
const txt = (el) => el ? el.innerText.trim() : "";
const cards = Array.from(document.querySelectorAll(".joblist-box__item"));
return cards.map(card => {
const nameA = card.querySelector("a.jobinfo__name");
const info = Array.from(card.querySelectorAll(".jobinfo__other-info-item"))
.map(e => e.innerText.trim());
// 第一个 other-info 是地点(带图标), 取其 span; 其余按顺序是经验/学历
const locSpan = card.querySelector(".jobinfo__other-info-item span");
const companyA = card.querySelector("a.companyinfo__name");
return {
title: txt(nameA),
link: nameA ? nameA.href : "",
salary: txt(card.querySelector(".jobinfo__salary")),
job_tags: Array.from(card.querySelectorAll(".jobinfo__tag .joblist-box__item-tag"))
.map(e => e.innerText.trim()),
location: locSpan ? locSpan.innerText.trim() : (info[0] || ""),
experience: info[1] || "",
education: info[2] || "",
company: companyA ? (companyA.getAttribute("title") || companyA.innerText).trim()
: "",
company_url: companyA ? companyA.href : "",
company_tags: Array.from(card.querySelectorAll(".companyinfo__tag .joblist-box__item-tag"))
.map(e => e.innerText.trim()),
};
});
}
"""
def scrape(keyword: str, pages: int, jl: str | None, headful: bool) -> list[dict]:
    """Scrape ``pages`` search-result pages for ``keyword`` and return the job cards.

    Args:
        keyword: search keyword (URL-quoted into the ``kw`` parameter).
        pages: number of result pages to request, starting at page 1.
        jl: optional city code appended as the ``jl`` parameter.
        headful: show the browser window instead of running headless.

    Returns:
        One dict per job card as produced by ``_EXTRACT_JS``, de-duplicated by
        link (or by title + company when a card has no link).  Pages that fail
        to load are skipped.
    """
    results: list[dict] = []
    seen: set[str] = set()
    with InvisiblePlaywright(seed=42, headless=not headful) as browser:
        page = browser.new_page()
        for n in range(1, pages + 1):
            if n > 1:
                # Throttle before every request after the first one.  Sleeping
                # here rather than at the end of the loop body keeps the delay
                # on the failure path too and avoids a useless wait after the
                # last page.
                time.sleep(random.uniform(3, 7))
            url = f"https://sou.zhaopin.com/?kw={quote(keyword)}&p={n}"
            if jl:
                url += f"&jl={jl}"
            try:
                page.goto(url, wait_until="domcontentloaded", timeout=60000)
                page.wait_for_selector(".joblist-box__item", timeout=20000)
            except Exception:
                # Best effort at the browser boundary: any navigation or
                # timeout error just skips this page.
                print(f"{n} 页未加载出岗位列表, 跳过 (可能触发风控)。")
                continue
            page.wait_for_timeout(1500)  # give lazily loaded content time to fill in
            rows = page.evaluate(_EXTRACT_JS)
            new = 0
            for r in rows:
                key = r.get("link") or (r.get("title", "") + r.get("company", ""))
                if key and key not in seen:
                    seen.add(key)
                    results.append(r)
                    new += 1
            print(f"{n} 页: 抓到 {len(rows)} 条 (新增 {new})")
    return results
def save(rows: list[dict], stem: str) -> None:
    """Write ``rows`` to ``<stem>.json`` and, when non-empty, to ``<stem>.csv``.

    The CSV keeps a fixed column set (extra keys are ignored) and flattens the
    ``job_tags`` / ``company_tags`` lists into " / "-separated strings.
    """
    json_path = f"{stem}.json"
    csv_path = f"{stem}.csv"

    with open(json_path, "w", encoding="utf-8") as out:
        json.dump(rows, out, ensure_ascii=False, indent=2)

    if rows:
        columns = [
            "title", "salary", "experience", "education", "location",
            "company", "job_tags", "company_tags", "link", "company_url",
        ]
        with open(csv_path, "w", newline="", encoding="utf-8-sig") as out:
            writer = csv.DictWriter(out, fieldnames=columns, extrasaction="ignore")
            writer.writeheader()
            for record in rows:
                flat = {
                    **record,
                    "job_tags": " / ".join(record.get("job_tags") or []),
                    "company_tags": " / ".join(record.get("company_tags") or []),
                }
                writer.writerow(flat)

    print(f"\n已保存 {len(rows)} 条 → {stem}.json / {stem}.csv")
def main() -> None:
    """Command-line entry point: scrape the requested pages and save JSON + CSV.

    The output files are named ``zhaopin_<keyword>.json`` / ``.csv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--keyword", default="AI", help="搜索关键词")
    parser.add_argument("--pages", type=int, default=3, help="抓取页数")
    parser.add_argument("--jl", default=None, help="城市编码, 不填=全国")
    parser.add_argument("--headful", action="store_true", help="显示浏览器窗口")
    opts = parser.parse_args()

    scraped = scrape(opts.keyword, opts.pages, opts.jl, opts.headful)
    save(scraped, stem=f"zhaopin_{opts.keyword}")


if __name__ == "__main__":
    main()
+277
View File
@@ -0,0 +1,277 @@
"""智联招聘 AI 岗位爬虫(完整版)—— 免登录 + SQLite + 断点续抓 + 详情页 JD 全文。
特性:
* 写入 SQLite (INSERT OR IGNORE 去重, 每条提交, 中断不丢数据)
* 断点续抓:
- 列表阶段: 记录每个 (keyword, jl) 已抓到的最大页码, 重跑从下一页继续
- 详情阶段: 只抓还没有 JD 的岗位, 重跑自动补齐
* 详情页: 进入每个岗位页面抓 职位描述(JD)全文 + 技能标签
合规提醒: 仅抓公开岗位字段, 不抓招聘者个人联系方式; 低频自用, 遵守站点条款。
用法:
# 抓列表(前5页) + 进详情页抓 JD
python examples/scrape_zhaopin_full.py --keyword AI --pages 5
# 只抓列表, 不进详情
python examples/scrape_zhaopin_full.py --keyword 大模型 --pages 5 --no-detail
# 中断后直接重跑同一命令 => 自动从断点继续
python examples/scrape_zhaopin_full.py --keyword AI --pages 5
# 导出已抓数据到 CSV
python examples/scrape_zhaopin_full.py --export ai_jobs.csv
# 重置某关键词的列表进度(重新从第1页抓)
python examples/scrape_zhaopin_full.py --keyword AI --reset
"""
from __future__ import annotations
import argparse
import csv
import json
import random
import re
import sqlite3
import time
from contextlib import closing
from urllib.parse import quote
from invisible_playwright import InvisiblePlaywright
# Default SQLite file path; used as the default of the --db option.
DB_DEFAULT = "zhaopin_jobs.db"
# Evaluated in the search-result page: maps every ".joblist-box__item" card to
# an object with title, link, salary, job_tags, location, experience,
# education, company, company_url and company_tags.
_LIST_JS = r"""
() => {
const txt = (el) => el ? el.innerText.trim() : "";
return Array.from(document.querySelectorAll(".joblist-box__item")).map(card => {
const nameA = card.querySelector("a.jobinfo__name");
const info = Array.from(card.querySelectorAll(".jobinfo__other-info-item"))
.map(e => e.innerText.trim());
const locSpan = card.querySelector(".jobinfo__other-info-item span");
const companyA = card.querySelector("a.companyinfo__name");
return {
title: txt(nameA),
link: nameA ? nameA.href : "",
salary: txt(card.querySelector(".jobinfo__salary")),
job_tags: Array.from(card.querySelectorAll(".jobinfo__tag .joblist-box__item-tag"))
.map(e => e.innerText.trim()),
location: locSpan ? locSpan.innerText.trim() : (info[0] || ""),
experience: info[1] || "",
education: info[2] || "",
company: companyA ? (companyA.getAttribute("title") || companyA.innerText).trim() : "",
company_url: companyA ? companyA.href : "",
company_tags: Array.from(card.querySelectorAll(".companyinfo__tag .joblist-box__item-tag"))
.map(e => e.innerText.trim()),
};
});
}
"""
# Evaluated in a job detail page: returns {jd, skills}, where jd is the text of
# ".describtion-card__detail-content" (empty string when the element is
# missing) and skills is the list of ".describtion-card__skills-item" texts.
_DETAIL_JS = r"""
() => {
const c = document.querySelector(".describtion-card__detail-content");
const skills = Array.from(document.querySelectorAll(".describtion-card__skills-item"))
.map(e => e.innerText.trim());
return { jd: c ? c.innerText.trim() : "", skills };
}
"""
# ── DB ──────────────────────────────────────────────────────────────────
def init_db(path: str) -> sqlite3.Connection:
    """Open the SQLite database at ``path`` and make sure its schema exists.

    Creates the ``jobs`` and ``progress`` tables when they are missing,
    commits, and returns the open connection (the caller closes it).
    """
    schema = (
        """
CREATE TABLE IF NOT EXISTS jobs (
job_id TEXT PRIMARY KEY,
title TEXT,
salary TEXT,
experience TEXT,
education TEXT,
location TEXT,
company TEXT,
company_url TEXT,
job_tags TEXT,
company_tags TEXT,
link TEXT,
jd_text TEXT,
skills TEXT,
keyword TEXT,
created_at TEXT DEFAULT (datetime('now','localtime')),
detail_at TEXT
)
""",
        """
CREATE TABLE IF NOT EXISTS progress (
scope TEXT PRIMARY KEY, -- f"{keyword}|{jl}"
last_page INTEGER DEFAULT 0
)
""",
    )
    db = sqlite3.connect(path)
    for ddl in schema:
        db.execute(ddl)
    db.commit()
    return db
def job_id_from_link(link: str) -> str:
    """Return the id embedded in a ``/jobdetail/<id>.htm`` link.

    When the link does not match that pattern, the link itself is returned.
    """
    found = re.search(r"/jobdetail/([^.?/]+)\.htm", link)
    if found is None:
        return link
    return found.group(1)
def get_last_page(conn: sqlite3.Connection, scope: str) -> int:
    """Return the ``last_page`` stored for ``scope`` in ``progress``, or 0 when absent."""
    cursor = conn.execute("SELECT last_page FROM progress WHERE scope=?", (scope,))
    record = cursor.fetchone()
    if record is None:
        return 0
    return record[0]
def set_last_page(conn: sqlite3.Connection, scope: str, page: int) -> None:
    """Store ``page`` as the ``last_page`` of ``scope`` (insert or overwrite) and commit."""
    statement = (
        "INSERT INTO progress(scope,last_page) VALUES(?,?) "
        "ON CONFLICT(scope) DO UPDATE SET last_page=excluded.last_page"
    )
    bindings = (scope, page)
    conn.execute(statement, bindings)
    conn.commit()
def upsert_job(conn: sqlite3.Connection, r: dict, keyword: str) -> bool:
    """Insert one list-page record into ``jobs`` unless its job id already exists.

    The job id is derived from the record's link; records without one are
    skipped.  Tag lists are stored as JSON text.  The insert is committed
    immediately.

    Returns:
        True when a new row was inserted, False when the record was skipped
        or the job id was already present.
    """
    jid = job_id_from_link(r.get("link", ""))
    if not jid:
        return False

    plain_columns = ("title", "salary", "experience", "education",
                     "location", "company", "company_url")
    tag_columns = ("job_tags", "company_tags")
    values = (
        jid,
        *(r.get(col) for col in plain_columns),
        *(json.dumps(r.get(col) or [], ensure_ascii=False) for col in tag_columns),
        r.get("link"),
        keyword,
    )
    cursor = conn.execute(
        """INSERT OR IGNORE INTO jobs
(job_id,title,salary,experience,education,location,company,
company_url,job_tags,company_tags,link,keyword)
VALUES(?,?,?,?,?,?,?,?,?,?,?,?)""",
        values,
    )
    conn.commit()
    inserted = cursor.rowcount > 0
    return inserted
# ── 抓取 ────────────────────────────────────────────────────────────────
def crawl_list(conn, page, keyword: str, pages: int, jl: str | None) -> None:
    """Crawl search-result pages for ``keyword`` and store the job cards, resumably.

    Resumes after the last page recorded for the ``keyword|jl`` scope and goes
    up to page ``pages``.  Every card is stored through ``upsert_job`` and the
    progress is recorded after each page.  The run stops at the first page
    whose job list fails to load.
    """
    scope = f"{keyword}|{jl or ''}"
    done = get_last_page(conn, scope)
    if done >= pages:
        print(f"[列表] '{keyword}' 已抓到第 {done} 页, 目标 {pages} 页, 无需续抓。")
        return

    first = done + 1
    print(f"[列表] '{keyword}' 从第 {first} 页抓到第 {pages}")
    city_suffix = f"&jl={jl}" if jl else ""
    for page_no in range(first, pages + 1):
        url = f"https://sou.zhaopin.com/?kw={quote(keyword)}&p={page_no}" + city_suffix
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=60000)
            page.wait_for_selector(".joblist-box__item", timeout=20000)
        except Exception:
            print(f"{page_no} 页未加载出列表, 停止本轮 (重跑可从此页续)。")
            break
        page.wait_for_timeout(1200)
        cards = page.evaluate(_LIST_JS)
        added = 0
        for card in cards:
            if upsert_job(conn, card, keyword):
                added += 1
        set_last_page(conn, scope, page_no)
        print(f"{page_no} 页: {len(cards)} 条 (新增 {added})")
        time.sleep(random.uniform(3, 7))
def crawl_details(conn, page, keyword: str | None, limit: int | None) -> None:
    """Visit the detail page of every job that has no JD yet and store JD + skills.

    Args:
        conn: open SQLite connection containing the ``jobs`` table.
        page: browser page used for navigation and evaluation.
        keyword: when given, only jobs stored under this keyword are visited.
        limit: when given (and non-zero), the maximum number of jobs to visit.

    Each result is committed on its own, so an interrupted run keeps what it
    already fetched.  Jobs whose page fails to load are reported and skipped.
    """
    q = "SELECT job_id,link FROM jobs WHERE (jd_text IS NULL OR jd_text='') AND link!=''"
    params: list = []
    if keyword:
        q += " AND keyword=?"
        params.append(keyword)
    q += " ORDER BY created_at"
    if limit:
        # Bind the limit as a parameter instead of formatting it into the SQL.
        q += " LIMIT ?"
        params.append(int(limit))
    todo = conn.execute(q, params).fetchall()
    if not todo:
        print("[详情] 没有待补充 JD 的岗位。")
        return
    print(f"[详情] 待抓 JD: {len(todo)}")
    for i, (jid, link) in enumerate(todo, 1):
        if i > 1:
            # Throttle before every request after the first one, so the delay
            # also applies after a failed fetch and not only after a success.
            time.sleep(random.uniform(2, 5))
        try:
            page.goto(link, wait_until="domcontentloaded", timeout=60000)
            page.wait_for_selector(".describtion-card__detail-content", timeout=15000)
            page.wait_for_timeout(800)
            data = page.evaluate(_DETAIL_JS)
        except Exception as e:
            # Best effort at the browser boundary: report and move on; the job
            # still has no JD, so a later run picks it up again.
            print(f" [{i}/{len(todo)}] {jid} 抓取失败: {str(e)[:60]}")
            continue
        conn.execute(
            "UPDATE jobs SET jd_text=?, skills=?, detail_at=datetime('now','localtime') "
            "WHERE job_id=?",
            (data.get("jd", ""),
             json.dumps(data.get("skills") or [], ensure_ascii=False), jid),
        )
        conn.commit()  # commit per job so an interruption loses nothing
        jd_len = len(data.get("jd", ""))
        print(f" [{i}/{len(todo)}] {jid} JD {jd_len}")
# ── 导出 ────────────────────────────────────────────────────────────────
def export_csv(conn, path: str, keyword: str | None) -> None:
q = ("SELECT title,salary,experience,education,location,company,"
"job_tags,company_tags,skills,jd_text,link FROM jobs")
params: list = []
if keyword:
q += " WHERE keyword=?"
params.append(keyword)
rows = conn.execute(q, params).fetchall()
cols = ["title", "salary", "experience", "education", "location", "company",
"job_tags", "company_tags", "skills", "jd_text", "link"]
with open(path, "w", newline="", encoding="utf-8-sig") as f:
w = csv.writer(f)
w.writerow(cols)
for row in rows:
row = list(row)
for idx in (6, 7, 8): # json 数组列 -> 用 / 连接
try:
row[idx] = " / ".join(json.loads(row[idx] or "[]"))
except Exception:
pass
w.writerow(row)
print(f"已导出 {len(rows)} 条 -> {path}")
# ── main ────────────────────────────────────────────────────────────────
def main() -> None:
    """Command-line entry point: crawl lists and details into SQLite, or export to CSV.

    With ``--export`` the stored jobs are written to CSV (filtered by
    ``--keyword`` only when that option is given explicitly) and the program
    exits.  Otherwise it optionally resets the list progress of the scope,
    crawls the list pages, crawls detail pages unless ``--no-detail`` is set,
    and prints the totals stored in the database.
    """
    ap = argparse.ArgumentParser()
    # default=None keeps "option not given" distinguishable from an explicit
    # "--keyword AI"; the crawl default "AI" is applied below.
    ap.add_argument("--keyword", default=None, help="搜索关键词")
    ap.add_argument("--pages", type=int, default=3, help="列表抓取页数")
    ap.add_argument("--jl", default=None, help="城市编码, 不填=全国")
    ap.add_argument("--db", default=DB_DEFAULT, help="SQLite 文件路径")
    ap.add_argument("--no-detail", action="store_true", help="只抓列表, 不进详情页")
    ap.add_argument("--detail-limit", type=int, default=None, help="本轮最多抓多少条 JD")
    ap.add_argument("--headful", action="store_true", help="显示浏览器窗口")
    ap.add_argument("--reset", action="store_true", help="重置该关键词的列表进度")
    ap.add_argument("--export", metavar="CSV", help="导出已抓数据到 CSV 后退出")
    args = ap.parse_args()
    keyword = "AI" if args.keyword is None else args.keyword

    with closing(init_db(args.db)) as conn:
        if args.export:
            # No --keyword exports everything; an explicit one (including
            # "AI") filters the export.
            export_csv(conn, args.export, args.keyword)
            return
        if args.reset:
            scope = f"{keyword}|{args.jl or ''}"
            conn.execute("DELETE FROM progress WHERE scope=?", (scope,))
            conn.commit()
            print(f"已重置进度: {scope}")
        with InvisiblePlaywright(seed=42, headless=not args.headful) as browser:
            page = browser.new_page()
            crawl_list(conn, page, keyword, args.pages, args.jl)
            if not args.no_detail:
                crawl_details(conn, page, keyword, args.detail_limit)
        total = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0]
        with_jd = conn.execute(
            "SELECT COUNT(*) FROM jobs WHERE jd_text IS NOT NULL AND jd_text!=''"
        ).fetchone()[0]
        print(f"\n库内合计 {total} 条, 其中含 JD 全文 {with_jd} 条 -> {args.db}")


if __name__ == "__main__":
    main()
+112
View File
@@ -0,0 +1,112 @@
"""抓取 BOSS 直聘「AI 相关」岗位(自用 / 学习用途)。
合规提醒:
- 仅抓取公开展示的岗位标题/薪资/公司等字段, 不抓取招聘者个人联系方式。
- 低频请求, 遵守目标站点服务条款; 风险自负。
首次使用:
python scrape_zhipin_ai.py --login # 打开浏览器, 手动扫码登录一次
之后:
python scrape_zhipin_ai.py --keyword AI --city 101010100 --pages 3
"""
from __future__ import annotations
import argparse
import csv
import json
import random
import time
from pathlib import Path
from urllib.parse import quote
from invisible_playwright import InvisiblePlaywright
# Browser profile directory next to this script; it persists the login state.
PROFILE_DIR = Path(__file__).parent / ".zhipin_profile"
# Fixed seed -> the same fingerprint across sessions, matching the persistent profile.
SEED = 20240614


def login_flow() -> None:
    """Run the first-time manual login.

    Opens the login page with the persistent profile, waits until the user has
    scanned the QR code and pressed Enter in the terminal, then reports where
    the login state is kept.
    """
    login_url = "https://www.zhipin.com/web/user/?ka=header-login"
    with InvisiblePlaywright(seed=SEED, profile_dir=PROFILE_DIR) as ctx:
        tab = ctx.new_page()
        tab.goto(login_url, wait_until="domcontentloaded")
        print("请在打开的浏览器中扫码登录, 登录完成后回到终端按回车...")
        input()  # block until the user confirms the login is done
    print("登录态已保存到", PROFILE_DIR)
def scrape(keyword: str, city: str, pages: int) -> list[dict]:
    """Scrape ``pages`` result pages for ``keyword`` in ``city`` and return all cards.

    Uses the persistent profile so the saved login is reused.  When the job
    list does not appear on a page, the user is asked to deal with the page
    (login or captcha) and press Enter before extraction continues.  Results
    are not de-duplicated.
    """
    card_selector = "li.job-card-wrapper"  # selector may need adjusting to the live page
    extract_js = """els => els.map(e => ({
title: e.querySelector('.job-name')?.innerText?.trim(),
salary: e.querySelector('.salary')?.innerText?.trim(),
company: e.querySelector('.company-name')?.innerText?.trim(),
tags: Array.from(e.querySelectorAll('.tag-list li'))
.map(t => t.innerText.trim()),
area: e.querySelector('.job-area')?.innerText?.trim(),
link: e.querySelector('a.job-card-left')?.href
|| e.querySelector('a')?.href,
}))"""
    collected: list[dict] = []
    with InvisiblePlaywright(seed=SEED, profile_dir=PROFILE_DIR) as ctx:
        page = ctx.new_page()
        for page_no in range(1, pages + 1):
            base = "https://www.zhipin.com/web/geek/job"
            url = base + f"?query={quote(keyword)}&city={city}&page={page_no}"
            page.goto(url, wait_until="domcontentloaded")
            # Wait for the list to render.
            try:
                page.wait_for_selector(card_selector, timeout=15000)
            except Exception:
                print(f"{page_no} 页未出现岗位列表, 可能需要登录或触发了验证码。")
                # Give the user time to pass the verification by hand.
                input("处理完页面后按回车继续...")
            cards = page.eval_on_selector_all(card_selector, extract_js)
            print(f"{page_no} 页抓到 {len(cards)}")
            collected.extend(cards)
            # Low request rate: a random pause lowers the chance of triggering risk control.
            time.sleep(random.uniform(4, 9))
    return collected
def save(rows: list[dict], stem: str) -> None:
    """Write ``rows`` to ``<stem>.json`` and, when non-empty, to ``<stem>.csv``.

    The CSV keeps a fixed column set (extra keys are ignored) and flattens the
    ``tags`` list into a " / "-separated string.
    """
    json_file = Path(f"{stem}.json")
    payload = json.dumps(rows, ensure_ascii=False, indent=2)
    json_file.write_text(payload, encoding="utf-8")

    if rows:
        columns = ["title", "salary", "company", "area", "tags", "link"]
        with open(f"{stem}.csv", "w", newline="", encoding="utf-8-sig") as out:
            writer = csv.DictWriter(out, fieldnames=columns, extrasaction="ignore")
            writer.writeheader()
            for record in rows:
                flat = {**record, "tags": " / ".join(record.get("tags") or [])}
                writer.writerow(flat)

    print(f"已保存 {len(rows)} 条 → {stem}.json / {stem}.csv")
def main() -> None:
    """Command-line entry point: run the manual login, or scrape and save results.

    With ``--login`` only the login flow runs.  Otherwise the requested pages
    are scraped and written to ``zhipin_<keyword>.json`` / ``.csv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--login", action="store_true", help="首次手动登录")
    parser.add_argument("--keyword", default="AI", help="搜索关键词")
    parser.add_argument("--city", default="101010100", help="城市编码 (101010100=北京)")
    parser.add_argument("--pages", type=int, default=3, help="抓取页数")
    opts = parser.parse_args()

    if not opts.login:
        scraped = scrape(opts.keyword, opts.city, opts.pages)
        save(scraped, stem=f"zhipin_{opts.keyword}")
        return
    login_flow()


if __name__ == "__main__":
    main()