diff --git a/.gitignore b/.gitignore index e6136de..ffe0cf7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,9 @@ build/ .pytest_cache/ .venv/ firefox-source/ +# scraper runtime artifacts (examples/) — data outputs & browser profiles +*.db +zhaopin_*.json +zhaopin_*.csv +ai_jobs.csv +examples/.zhipin_profile/ diff --git a/examples/analyze_zhaopin.py b/examples/analyze_zhaopin.py new file mode 100644 index 0000000..7d3e199 --- /dev/null +++ b/examples/analyze_zhaopin.py @@ -0,0 +1,169 @@ +"""智联招聘抓取数据分析 —— 读取 SQLite, 按城市/薪资/学历/经验/技能做统计。 + +纯标准库, 无需 pandas。 + +用法: + python examples/analyze_zhaopin.py # 全量分析 + python examples/analyze_zhaopin.py --keyword AI # 只看某关键词 + python examples/analyze_zhaopin.py --city 北京 # 只看某城市 + python examples/analyze_zhaopin.py --top 15 # 排行榜取前 15 + python examples/analyze_zhaopin.py --db zhaopin_jobs.db +""" +from __future__ import annotations + +import argparse +import json +import re +import sqlite3 +import statistics +from collections import Counter + +WORKDAYS_PER_MONTH = 21.75 # 日薪 -> 月薪折算 + + +def parse_salary(s: str) -> tuple[int, int] | None: + """把薪资字符串解析成 (月薪下限, 月薪上限) 元。无法解析返回 None。""" + if not s or "面议" in s: + return None + s = s.strip() + is_daily = "/天" in s or "元/天" in s + is_wan = "万" in s + nums = re.findall(r"\d+(?:\.\d+)?", s) + if not nums: + return None + vals = [float(x) for x in nums[:2]] + if len(vals) == 1: + vals = [vals[0], vals[0]] + lo, hi = vals[0], vals[1] + if is_wan: + lo, hi = lo * 10000, hi * 10000 + if is_daily: + lo, hi = lo * WORKDAYS_PER_MONTH, hi * WORKDAYS_PER_MONTH + return int(lo), int(hi) + + +def city_of(location: str) -> str: + return (location or "").split("·")[0].strip() or "未知" + + +SALARY_BUCKETS = [ + (0, 5000, "<5k"), + (5000, 8000, "5-8k"), + (8000, 12000, "8-12k"), + (12000, 18000, "12-18k"), + (18000, 25000, "18-25k"), + (25000, 10**9, ">=25k"), +] + + +def bucket_of(mid: float) -> str: + for lo, hi, label in SALARY_BUCKETS: + if lo <= mid < hi: + return label + return "?" 
def _parse_tags(raw) -> set[str]:
    """Decode one JSON-array column into a set of non-empty, stripped tags."""
    try:
        decoded = json.loads(raw or "[]")
    except (TypeError, ValueError):
        # Malformed or non-text cell: treat as "no tags" instead of aborting.
        return set()
    if not isinstance(decoded, list):
        return set()
    return {t.strip() for t in decoded if isinstance(t, str) and t.strip()}


def main() -> None:
    """Print city / salary / education / experience / tag statistics.

    Reads the ``jobs`` table of the SQLite file given by ``--db``, optionally
    filtered by ``--keyword`` (SQL filter) and ``--city`` (filter on the city
    part of the location), and prints ranked tables limited to ``--top`` rows.
    """
    import os
    from contextlib import closing

    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default="zhaopin_jobs.db")
    ap.add_argument("--keyword", default=None, help="按关键词过滤")
    ap.add_argument("--city", default=None, help="按城市过滤")
    ap.add_argument("--top", type=int, default=12, help="排行榜条数")
    args = ap.parse_args()

    # sqlite3.connect() silently creates a missing file, which would leave a
    # stray empty database behind and then fail with "no such table".
    if not os.path.isfile(args.db):
        print(f"找不到数据库文件: {args.db}。先用 scrape_zhaopin_full.py 抓一些。")
        return

    q = ("SELECT title,salary,experience,education,location,company,"
         "job_tags,company_tags,skills FROM jobs")
    params: list = []
    if args.keyword:
        q += " WHERE keyword=?"
        params.append(args.keyword)

    # closing() guarantees the connection is released on every exit path.
    with closing(sqlite3.connect(args.db)) as conn:
        try:
            rows = conn.execute(q, params).fetchall()
        except sqlite3.DatabaseError as e:
            print(f"读取数据库失败 ({e})。请确认 {args.db} 由 scrape_zhaopin_full.py 生成。")
            return

    if args.city:
        rows = [r for r in rows if city_of(r[4]) == args.city]

    if not rows:
        print("没有匹配的数据。先用 scrape_zhaopin_full.py 抓一些, 或检查过滤条件。")
        return

    n_total = len(rows)
    print(f"分析样本: {n_total} 条"
          + (f" | 关键词={args.keyword}" if args.keyword else "")
          + (f" | 城市={args.city}" if args.city else ""))

    cities, edus, exps = Counter(), Counter(), Counter()
    skill_freq, salary_buckets = Counter(), Counter()
    mids: list[float] = []

    for (_title, salary, exp, edu, loc, _comp, jtags, _ctags, skills) in rows:
        cities[city_of(loc)] += 1
        edus[(edu or "未知").strip() or "未知"] += 1
        exps[(exp or "未知").strip() or "未知"] += 1
        # Merge skills + job_tags as a set so a tag listed in both columns of
        # the same posting is counted once.
        skill_freq.update(_parse_tags(skills) | _parse_tags(jtags))
        rng = parse_salary(salary)
        if rng:
            mid = (rng[0] + rng[1]) / 2
            mids.append(mid)
            salary_buckets[bucket_of(mid)] += 1

    section("城市分布 (Top)")
    rank_table(cities, args.top, "城市")

    section("薪资区间分布 (按月薪中位点)")
    if mids:
        # Print buckets in their declared order, not by frequency.
        order = {label: i for i, (_, _, label) in enumerate(SALARY_BUCKETS)}
        maxn = max(salary_buckets.values())
        for label in sorted(salary_buckets, key=lambda x: order.get(x, 99)):
            n = salary_buckets[label]
            print(f" {label:<8} {n:>4} {bar(n, maxn)}")
        print(f"\n 可解析薪资: {len(mids)}/{n_total} 条 (面议等已排除)")
        print(f" 月薪中位点 平均: {statistics.mean(mids):>8,.0f} 元")
        print(f" 中位数: {statistics.median(mids):>8,.0f} 元")
        print(f" 最低/最高: {min(mids):,.0f} / {max(mids):,.0f} 元")
    else:
        print(" (无可解析薪资)")

    section("学历要求")
    rank_table(edus, args.top, "学历")

    section("经验要求")
    rank_table(exps, args.top, "经验")

    section("技能/标签词频 (Top)")
    rank_table(skill_freq, args.top, "技能")
def scrape(keyword: str, pages: int, jl: str | None, headful: bool) -> list[dict]:
    """Collect job cards from the first ``pages`` search-result pages.

    Args:
        keyword: Search keyword, URL-quoted into the ``kw`` query parameter.
        pages: Number of result pages to visit, starting at page 1.
        jl: Optional city code appended as the ``jl`` query parameter.
        headful: When true the browser is started with ``headless=False``.

    Returns:
        The card dicts produced by ``_EXTRACT_JS``, de-duplicated across pages
        by link (falling back to title + company when the link is empty).
    """
    results: list[dict] = []
    seen: set[str] = set()
    with InvisiblePlaywright(seed=42, headless=not headful) as browser:
        page = browser.new_page()
        for n in range(1, pages + 1):
            # Throttle BEFORE every page except the first: this keeps the
            # delay in place after a failed page (when backing off matters
            # most) and avoids a pointless sleep after the last page.
            if n > 1:
                time.sleep(random.uniform(3, 7))

            url = f"https://sou.zhaopin.com/?kw={quote(keyword)}&p={n}"
            if jl:
                url += f"&jl={jl}"
            try:
                page.goto(url, wait_until="domcontentloaded", timeout=60000)
                page.wait_for_selector(".joblist-box__item", timeout=20000)
            except Exception as e:
                # Best-effort crawl: report the reason and move on.
                print(f"第 {n} 页未加载出岗位列表, 跳过 (可能触发风控)。 {str(e)[:60]}")
                continue
            page.wait_for_timeout(1500)  # give lazily loaded content time to fill in

            rows = page.evaluate(_EXTRACT_JS)
            new = 0
            for r in rows:
                key = r.get("link") or (r.get("title", "") + r.get("company", ""))
                if key and key not in seen:
                    seen.add(key)
                    results.append(r)
                    new += 1
            print(f"第 {n} 页: 抓到 {len(rows)} 条 (新增 {new})")
    return results
--pages 5 + + # 导出已抓数据到 CSV + python examples/scrape_zhaopin_full.py --export ai_jobs.csv + + # 重置某关键词的列表进度(重新从第1页抓) + python examples/scrape_zhaopin_full.py --keyword AI --reset +""" +from __future__ import annotations + +import argparse +import csv +import json +import random +import re +import sqlite3 +import time +from contextlib import closing +from urllib.parse import quote + +from invisible_playwright import InvisiblePlaywright + +DB_DEFAULT = "zhaopin_jobs.db" + +_LIST_JS = r""" +() => { + const txt = (el) => el ? el.innerText.trim() : ""; + return Array.from(document.querySelectorAll(".joblist-box__item")).map(card => { + const nameA = card.querySelector("a.jobinfo__name"); + const info = Array.from(card.querySelectorAll(".jobinfo__other-info-item")) + .map(e => e.innerText.trim()); + const locSpan = card.querySelector(".jobinfo__other-info-item span"); + const companyA = card.querySelector("a.companyinfo__name"); + return { + title: txt(nameA), + link: nameA ? nameA.href : "", + salary: txt(card.querySelector(".jobinfo__salary")), + job_tags: Array.from(card.querySelectorAll(".jobinfo__tag .joblist-box__item-tag")) + .map(e => e.innerText.trim()), + location: locSpan ? locSpan.innerText.trim() : (info[0] || ""), + experience: info[1] || "", + education: info[2] || "", + company: companyA ? (companyA.getAttribute("title") || companyA.innerText).trim() : "", + company_url: companyA ? companyA.href : "", + company_tags: Array.from(card.querySelectorAll(".companyinfo__tag .joblist-box__item-tag")) + .map(e => e.innerText.trim()), + }; + }); +} +""" + +_DETAIL_JS = r""" +() => { + const c = document.querySelector(".describtion-card__detail-content"); + const skills = Array.from(document.querySelectorAll(".describtion-card__skills-item")) + .map(e => e.innerText.trim()); + return { jd: c ? 
def job_id_from_link(link: str) -> str:
    """Return the job id embedded in a ``/jobdetail/<id>.htm`` URL.

    When the link does not contain that pattern, the link itself is returned
    unchanged so it can still serve as a unique key.
    """
    found = re.search(r"/jobdetail/([^.?/]+)\.htm", link)
    if found is None:
        return link
    return found.group(1)
def crawl_details(conn, page, keyword: str | None, limit: int | None) -> None:
    """Fetch the job description and skill tags for jobs that lack a JD.

    Selects rows from ``jobs`` whose ``jd_text`` is NULL or empty and whose
    link is non-empty, oldest first, visits each link with ``page`` and stores
    the result of ``_DETAIL_JS``. Each row is committed on its own, so an
    interrupted run keeps everything fetched so far.

    Args:
        conn: Open SQLite connection holding the ``jobs`` table.
        page: Browser page used for navigation and evaluation.
        keyword: When truthy, only jobs stored under this keyword are fetched.
        limit: Maximum number of jobs to fetch in this run. None means no
            limit; zero or a negative value fetches nothing.
    """
    q = "SELECT job_id,link FROM jobs WHERE (jd_text IS NULL OR jd_text='') AND link!=''"
    params: list = []
    if keyword:
        q += " AND keyword=?"
        params.append(keyword)
    q += " ORDER BY created_at"
    if limit is not None:
        # Bound parameter instead of string formatting. Clamp at 0 because
        # SQLite treats a negative LIMIT as "no limit".
        q += " LIMIT ?"
        params.append(max(0, int(limit)))
    todo = conn.execute(q, params).fetchall()
    if not todo:
        print("[详情] 没有待补充 JD 的岗位。")
        return
    total = len(todo)
    print(f"[详情] 待抓 JD: {total} 条")
    for i, (jid, link) in enumerate(todo, 1):
        # Throttle before every request except the first, so the pause also
        # applies after a failed page and is not wasted after the last one.
        if i > 1:
            time.sleep(random.uniform(2, 5))
        try:
            page.goto(link, wait_until="domcontentloaded", timeout=60000)
            page.wait_for_selector(".describtion-card__detail-content", timeout=15000)
            page.wait_for_timeout(800)
            data = page.evaluate(_DETAIL_JS)
        except Exception as e:
            # Best-effort: the row stays without a JD and is retried next run.
            print(f" [{i}/{total}] {jid} 抓取失败: {str(e)[:60]}")
            continue
        jd = data.get("jd", "")
        conn.execute(
            "UPDATE jobs SET jd_text=?, skills=?, detail_at=datetime('now','localtime') "
            "WHERE job_id=?",
            (jd, json.dumps(data.get("skills") or [], ensure_ascii=False), jid),
        )
        conn.commit()  # commit per row so an interruption loses nothing
        print(f" [{i}/{total}] {jid} JD {len(jd)} 字")
def main() -> None:
    """Command-line entry point: crawl list pages, then detail pages, or export.

    With ``--export`` the stored rows are written to CSV and the program
    exits; the export is filtered by ``--keyword`` only when that option was
    given explicitly. Otherwise the list pages for the keyword (default "AI")
    are crawled, followed by the detail pages unless ``--no-detail`` is set,
    and a summary of the database contents is printed.
    """
    ap = argparse.ArgumentParser()
    # default=None (not "AI") so that an explicit "--keyword AI" can be told
    # apart from "no keyword given"; the crawl default is applied below.
    ap.add_argument("--keyword", default=None, help="搜索关键词 (默认 AI)")
    ap.add_argument("--pages", type=int, default=3, help="列表抓取页数")
    ap.add_argument("--jl", default=None, help="城市编码, 不填=全国")
    ap.add_argument("--db", default=DB_DEFAULT, help="SQLite 文件路径")
    ap.add_argument("--no-detail", action="store_true", help="只抓列表, 不进详情页")
    ap.add_argument("--detail-limit", type=int, default=None, help="本轮最多抓多少条 JD")
    ap.add_argument("--headful", action="store_true", help="显示浏览器窗口")
    ap.add_argument("--reset", action="store_true", help="重置该关键词的列表进度")
    ap.add_argument("--export", metavar="CSV", help="导出已抓数据到 CSV 后退出")
    args = ap.parse_args()

    keyword = args.keyword or "AI"

    with closing(init_db(args.db)) as conn:
        if args.export:
            # No keyword given => export everything; explicit keyword
            # (including "AI") => export only that keyword.
            export_csv(conn, args.export, args.keyword or None)
            return
        if args.reset:
            scope = f"{keyword}|{args.jl or ''}"
            conn.execute("DELETE FROM progress WHERE scope=?", (scope,))
            conn.commit()
            print(f"已重置进度: {scope}")

        with InvisiblePlaywright(seed=42, headless=not args.headful) as browser:
            page = browser.new_page()
            crawl_list(conn, page, keyword, args.pages, args.jl)
            if not args.no_detail:
                crawl_details(conn, page, keyword, args.detail_limit)

        total = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0]
        with_jd = conn.execute(
            "SELECT COUNT(*) FROM jobs WHERE jd_text IS NOT NULL AND jd_text!=''"
        ).fetchone()[0]
        print(f"\n库内合计 {total} 条, 其中含 JD 全文 {with_jd} 条 -> {args.db}")
def main() -> None:
    """Command-line entry point: one-time login, or scrape and save results.

    With ``--login`` only the manual login flow runs. Otherwise the search
    results for ``--keyword`` / ``--city`` / ``--pages`` are scraped and
    written to ``zhipin_<keyword>.json`` / ``.csv``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--login", action="store_true", help="首次手动登录")
    ap.add_argument("--keyword", default="AI", help="搜索关键词")
    ap.add_argument("--city", default="101010100", help="城市编码 (101010100=北京)")
    ap.add_argument("--pages", type=int, default=3, help="抓取页数")
    args = ap.parse_args()

    if args.login:
        login_flow()
        return

    # Build the output name before scraping. A keyword such as "C/C++" would
    # otherwise become a path with a missing directory, and the write would
    # fail only after the whole scrape had finished, losing the data.
    # Path separators, characters Windows forbids in file names, and control
    # characters are replaced with "_".
    unsafe = '\\/:*?"<>|'
    safe_keyword = "".join(
        "_" if ch in unsafe or ch < " " else ch for ch in args.keyword
    )

    rows = scrape(args.keyword, args.city, args.pages)
    save(rows, stem=f"zhipin_{safe_keyword}")