examples: add zhaopin AI job scrapers + SQLite analysis
e2e / e2e (linux, xvfb) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.11) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.12) (push) Waiting to run
tests / pytest (windows-latest, py3.11) (push) Waiting to run
tests / pytest (windows-latest, py3.12) (push) Waiting to run
e2e / e2e (linux, xvfb) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.11) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.12) (push) Waiting to run
tests / pytest (windows-latest, py3.11) (push) Waiting to run
tests / pytest (windows-latest, py3.12) (push) Waiting to run
- scrape_zhaopin_ai.py: lightweight no-login list scraper
- scrape_zhaopin_full.py: SQLite storage, resumable crawl, detail-page JD
- analyze_zhaopin.py: stats by city/salary/education/experience/skills
- scrape_zhipin_ai.py: BOSS Zhipin variant (login-based, fallback)
- gitignore scraper data artifacts and browser profile
This commit is contained in:
@@ -6,3 +6,9 @@ build/
|
||||
.pytest_cache/
|
||||
.venv/
|
||||
firefox-source/
|
||||
# scraper runtime artifacts (examples/) — data outputs & browser profiles
|
||||
*.db
|
||||
zhaopin_*.json
|
||||
zhaopin_*.csv
|
||||
ai_jobs.csv
|
||||
examples/.zhipin_profile/
|
||||
|
||||
@@ -0,0 +1,169 @@
|
||||
"""智联招聘抓取数据分析 —— 读取 SQLite, 按城市/薪资/学历/经验/技能做统计。
|
||||
|
||||
纯标准库, 无需 pandas。
|
||||
|
||||
用法:
|
||||
python examples/analyze_zhaopin.py # 全量分析
|
||||
python examples/analyze_zhaopin.py --keyword AI # 只看某关键词
|
||||
python examples/analyze_zhaopin.py --city 北京 # 只看某城市
|
||||
python examples/analyze_zhaopin.py --top 15 # 排行榜取前 15
|
||||
python examples/analyze_zhaopin.py --db zhaopin_jobs.db
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import statistics
|
||||
from collections import Counter
|
||||
|
||||
WORKDAYS_PER_MONTH = 21.75  # working days per month; converts a daily wage to a monthly one

# Multiplier for the unit suffix that may directly follow an amount.
_SALARY_UNITS = {"万": 10_000, "千": 1_000, "k": 1_000, "K": 1_000}
# One amount plus its optional unit suffix, e.g. "1.5万", "8千", "15K", "300".
_SALARY_AMOUNT_RE = re.compile(r"(\d+(?:\.\d+)?)\s*([万千kK]?)")


def parse_salary(s: str) -> tuple[int, int] | None:
    """Parse a salary string into ``(monthly_low, monthly_high)`` in yuan.

    Each bound is scaled by its own unit suffix ("万", "千", "k"/"K"); a bound
    written without a unit borrows the unit of the other bound, so "1-1.5万"
    and "8千-1.2万" are both handled. Strings containing "/天" are treated as
    daily wages and multiplied by ``WORKDAYS_PER_MONTH``. A single amount is
    used for both bounds. Only the first two amounts are considered, so a
    trailing "·13薪" is ignored.

    Returns ``None`` for empty input, "面议" (negotiable), or text without digits.
    """
    if not s or "面议" in s:
        return None
    amounts = _SALARY_AMOUNT_RE.findall(s.strip())[:2]
    if not amounts:
        return None
    if len(amounts) == 1:
        amounts = amounts * 2
    (lo_text, lo_unit), (hi_text, hi_unit) = amounts
    # A range usually states its unit once ("1-1.5万"): share it between bounds.
    lo_unit = lo_unit or hi_unit
    hi_unit = hi_unit or lo_unit
    period = WORKDAYS_PER_MONTH if "/天" in s else 1
    lo = float(lo_text) * _SALARY_UNITS.get(lo_unit, 1) * period
    hi = float(hi_text) * _SALARY_UNITS.get(hi_unit, 1) * period
    # round() rather than int(): float products can land just below the integer.
    return round(lo), round(hi)
|
||||
|
||||
|
||||
def city_of(location: str) -> str:
    """Return the text before the first "·" in *location*, or "未知" when that is empty."""
    head, _, _ = (location or "").partition("·")
    city = head.strip()
    return city if city else "未知"
|
||||
|
||||
|
||||
# (inclusive lower bound, exclusive upper bound, display label), in ascending order.
SALARY_BUCKETS = [
    (0, 5000, "<5k"),
    (5000, 8000, "5-8k"),
    (8000, 12000, "8-12k"),
    (12000, 18000, "12-18k"),
    (18000, 25000, "18-25k"),
    (25000, 10**9, ">=25k"),
]


def bucket_of(mid: float) -> str:
    """Return the label of the first bucket with ``lo <= mid < hi``, or "?" if none matches."""
    matching = (label for lo, hi, label in SALARY_BUCKETS if lo <= mid < hi)
    return next(matching, "?")
|
||||
|
||||
|
||||
def bar(n: int, maxn: int, width: int = 30) -> str:
    """Render *n* as a run of "█" scaled so that *maxn* spans *width* cells.

    At least one cell is always drawn; a non-positive *maxn* yields an empty string.
    """
    if maxn > 0:
        cells = round(n / maxn * width)
        return "█" * (cells if cells > 1 else 1)
    return ""
|
||||
|
||||
|
||||
def section(title: str) -> None:
    """Print a blank line, then *title* framed above and below by a 56-character "=" rule."""
    rule = "=" * 56
    print("\n" + rule, title, rule, sep="\n")
|
||||
|
||||
|
||||
def rank_table(counter: Counter, top: int, label: str) -> None:
    """Print the *top* most common entries of *counter*, each with its count and a bar.

    Bars are scaled against the largest count in the whole counter. Names are
    cut to 18 characters. *label* is accepted but not used by this function.
    """
    if not counter:
        print(" (无数据)")
        return
    peak = max(counter.values())
    for name, count in counter.most_common(top):
        shown = name[:18]
        print(f" {shown:<18} {count:>4} {bar(count, peak)}")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: read scraped jobs from SQLite and print a statistics report.

    Options: ``--db`` (SQLite file), ``--keyword`` and ``--city`` (filters),
    ``--top`` (rows per ranking table). The report covers city distribution,
    salary buckets, education, experience and skill/tag frequency.
    """
    # Function-scope imports: only this entry point needs them.
    from contextlib import closing
    from pathlib import Path

    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default="zhaopin_jobs.db")
    ap.add_argument("--keyword", default=None, help="按关键词过滤")
    ap.add_argument("--city", default=None, help="按城市过滤")
    ap.add_argument("--top", type=int, default=12, help="排行榜条数")
    args = ap.parse_args()

    # sqlite3.connect() would silently create an empty database for a wrong
    # path, so check first instead of leaving a stray file behind.
    if not Path(args.db).is_file():
        print(f"找不到数据库文件: {args.db}")
        return

    q = ("SELECT title,salary,experience,education,location,company,"
         "job_tags,company_tags,skills FROM jobs")
    params: list = []
    if args.keyword:
        q += " WHERE keyword=?"
        params.append(args.keyword)
    try:
        # closing() guarantees the connection is released even if the query fails.
        with closing(sqlite3.connect(args.db)) as conn:
            rows = conn.execute(q, params).fetchall()
    except sqlite3.Error as e:
        # e.g. the file is not a database, or the jobs table does not exist yet.
        print(f"读取数据库失败: {e}")
        return

    if args.city:
        rows = [r for r in rows if city_of(r[4]) == args.city]

    if not rows:
        print("没有匹配的数据。先用 scrape_zhaopin_full.py 抓一些, 或检查过滤条件。")
        return

    n_total = len(rows)
    print(f"分析样本: {n_total} 条"
          + (f" | 关键词={args.keyword}" if args.keyword else "")
          + (f" | 城市={args.city}" if args.city else ""))

    cities, edus, exps = Counter(), Counter(), Counter()
    skill_freq, salary_buckets = Counter(), Counter()
    mids: list[float] = []

    for (_title, salary, exp, edu, loc, _comp, jtags, _ctags, skills) in rows:
        cities[city_of(loc)] += 1
        edus[(edu or "未知").strip() or "未知"] += 1
        exps[(exp or "未知").strip() or "未知"] += 1
        # Skills: merge the detail-page skills with the list-page job tags.
        for src in (skills, jtags):
            try:
                tags = json.loads(src or "[]")
            except (TypeError, ValueError):
                continue  # malformed JSON in this column: skip it, keep the row
            if not isinstance(tags, list):
                continue
            for t in tags:
                if isinstance(t, str) and t.strip():
                    skill_freq[t.strip()] += 1
        rng = parse_salary(salary)
        if rng:
            mid = (rng[0] + rng[1]) / 2
            mids.append(mid)
            salary_buckets[bucket_of(mid)] += 1

    section("城市分布 (Top)")
    rank_table(cities, args.top, "城市")

    section("薪资区间分布 (按月薪中位点)")
    if mids:
        # Show buckets in their defined order rather than by frequency.
        order = {label: i for i, (_, _, label) in enumerate(SALARY_BUCKETS)}
        maxn = max(salary_buckets.values())
        for label in sorted(salary_buckets, key=lambda x: order.get(x, 99)):
            n = salary_buckets[label]
            print(f" {label:<8} {n:>4} {bar(n, maxn)}")
        print(f"\n 可解析薪资: {len(mids)}/{n_total} 条 (面议等已排除)")
        print(f" 月薪中位点 平均: {statistics.mean(mids):>8,.0f} 元")
        print(f" 中位数: {statistics.median(mids):>8,.0f} 元")
        print(f" 最低/最高: {min(mids):,.0f} / {max(mids):,.0f} 元")
    else:
        print(" (无可解析薪资)")

    section("学历要求")
    rank_table(edus, args.top, "学历")

    section("经验要求")
    rank_table(exps, args.top, "经验")

    section("技能/标签词频 (Top)")
    rank_table(skill_freq, args.top, "技能")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,120 @@
|
||||
"""抓取智联招聘「AI 相关」岗位 —— 免登录, 渲染搜索结果页直接解析。
|
||||
|
||||
合规提醒:
|
||||
- 仅抓取公开展示的岗位标题/薪资/公司/标签等字段。
|
||||
- 不抓取招聘者个人联系方式; 低频请求, 遵守目标站点服务条款, 风险自负。
|
||||
|
||||
用法:
|
||||
python examples/scrape_zhaopin_ai.py # 默认抓 "AI" 前 3 页
|
||||
python examples/scrape_zhaopin_ai.py --keyword 大模型 --pages 5
|
||||
python examples/scrape_zhaopin_ai.py --keyword AI --jl 530 # 530=北京
|
||||
python examples/scrape_zhaopin_ai.py --headful # 显示浏览器窗口
|
||||
|
||||
城市编码(jl): 北京530 上海538 广州763 深圳765 杭州653 成都801 武汉736 南京635
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from urllib.parse import quote
|
||||
|
||||
from invisible_playwright import InvisiblePlaywright
|
||||
|
||||
# Extract every job card from the rendered page in a single evaluate() call,
# which is faster and more stable than querying the cards one by one.
# The selectors come from probing the real page (see _probe_zhaopin.py).
_EXTRACT_JS = r"""
() => {
  const txt = (el) => el ? el.innerText.trim() : "";
  const cards = Array.from(document.querySelectorAll(".joblist-box__item"));
  return cards.map(card => {
    const nameA = card.querySelector("a.jobinfo__name");
    const info = Array.from(card.querySelectorAll(".jobinfo__other-info-item"))
      .map(e => e.innerText.trim());
    // 第一个 other-info 是地点(带图标), 取其 span; 其余按顺序是经验/学历
    const locSpan = card.querySelector(".jobinfo__other-info-item span");
    const companyA = card.querySelector("a.companyinfo__name");
    return {
      title: txt(nameA),
      link: nameA ? nameA.href : "",
      salary: txt(card.querySelector(".jobinfo__salary")),
      job_tags: Array.from(card.querySelectorAll(".jobinfo__tag .joblist-box__item-tag"))
        .map(e => e.innerText.trim()),
      location: locSpan ? locSpan.innerText.trim() : (info[0] || ""),
      experience: info[1] || "",
      education: info[2] || "",
      company: companyA ? (companyA.getAttribute("title") || companyA.innerText).trim()
        : "",
      company_url: companyA ? companyA.href : "",
      company_tags: Array.from(card.querySelectorAll(".companyinfo__tag .joblist-box__item-tag"))
        .map(e => e.innerText.trim()),
    };
  });
}
"""


def scrape(keyword: str, pages: int, jl: str | None, headful: bool) -> list[dict]:
    """Scrape up to *pages* search-result pages for *keyword* and return the job cards.

    Args:
        keyword: Search keyword; URL-quoted into the ``kw`` query parameter.
        pages: Number of result pages to request, starting from page 1.
        jl: Optional city code appended as ``jl``; omitted when falsy.
        headful: When true the browser is started with ``headless=False``.

    Returns:
        The dicts produced by ``_EXTRACT_JS``, de-duplicated across pages by
        ``link`` (falling back to title + company when the link is empty).
        Pages whose job list does not appear in time are skipped.
    """
    results: list[dict] = []
    seen: set[str] = set()
    with InvisiblePlaywright(seed=42, headless=not headful) as browser:
        page = browser.new_page()
        for n in range(1, pages + 1):
            if n > 1:
                # Pause before every request after the first one. Doing it here
                # (not at the end of the loop body) means a failed page is also
                # followed by a delay, and no time is wasted after the last page.
                time.sleep(random.uniform(3, 7))
            url = f"https://sou.zhaopin.com/?kw={quote(keyword)}&p={n}"
            if jl:
                url += f"&jl={jl}"
            try:
                page.goto(url, wait_until="domcontentloaded", timeout=60000)
                page.wait_for_selector(".joblist-box__item", timeout=20000)
            except Exception:
                # Broad on purpose: the browser library's error types are not
                # imported in this file, and a bad page should only be skipped.
                print(f"第 {n} 页未加载出岗位列表, 跳过 (可能触发风控)。")
                continue
            page.wait_for_timeout(1500)  # let lazily loaded content fill in

            rows = page.evaluate(_EXTRACT_JS)
            new = 0
            for r in rows:
                key = r.get("link") or (r.get("title", "") + r.get("company", ""))
                if key and key not in seen:
                    seen.add(key)
                    results.append(r)
                    new += 1
            print(f"第 {n} 页: 抓到 {len(rows)} 条 (新增 {new})")
    return results
|
||||
|
||||
|
||||
def save(rows: list[dict], stem: str) -> None:
    """Write *rows* to ``{stem}.json`` and, when non-empty, to ``{stem}.csv``.

    The JSON file keeps the tag lists as arrays. In the CSV the ``job_tags``
    and ``company_tags`` lists are joined with " / ", keys outside the fixed
    column list are dropped, and the file starts with a UTF-8 BOM.
    """
    json_path = f"{stem}.json"
    csv_path = f"{stem}.csv"

    with open(json_path, "w", encoding="utf-8") as out:
        out.write(json.dumps(rows, ensure_ascii=False, indent=2))

    if rows:
        columns = ["title", "salary", "experience", "education", "location",
                   "company", "job_tags", "company_tags", "link", "company_url"]

        def flatten(record: dict) -> dict:
            # Copy so the caller's dicts keep their list-valued tags.
            flat = dict(record)
            for tag_key in ("job_tags", "company_tags"):
                flat[tag_key] = " / ".join(flat.get(tag_key) or [])
            return flat

        with open(csv_path, "w", newline="", encoding="utf-8-sig") as out:
            writer = csv.DictWriter(out, fieldnames=columns, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(flatten(r) for r in rows)
    print(f"\n已保存 {len(rows)} 条 → {stem}.json / {stem}.csv")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: scrape the result pages and save them as JSON and CSV.

    Output files are named ``zhaopin_<keyword>.json`` / ``.csv``. Characters
    that are not valid in file names are replaced with "_" in the keyword.
    """
    import re  # function-scope: only needed to build a safe output file name

    ap = argparse.ArgumentParser()
    ap.add_argument("--keyword", default="AI", help="搜索关键词")
    ap.add_argument("--pages", type=int, default=3, help="抓取页数")
    ap.add_argument("--jl", default=None, help="城市编码, 不填=全国")
    ap.add_argument("--headful", action="store_true", help="显示浏览器窗口")
    args = ap.parse_args()

    rows = scrape(args.keyword, args.pages, args.jl, args.headful)
    # The keyword is free-form user input. A path separator or another
    # character that is illegal in file names (e.g. "C/C++") would make open()
    # fail after the crawl has finished and throw the results away.
    safe_keyword = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", args.keyword)
    save(rows, stem=f"zhaopin_{safe_keyword}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,277 @@
|
||||
"""智联招聘 AI 岗位爬虫(完整版)—— 免登录 + SQLite + 断点续抓 + 详情页 JD 全文。
|
||||
|
||||
特性:
|
||||
* 写入 SQLite (INSERT OR IGNORE 去重, 每条提交, 中断不丢数据)
|
||||
* 断点续抓:
|
||||
- 列表阶段: 记录每个 (keyword, jl) 已抓到的最大页码, 重跑从下一页继续
|
||||
- 详情阶段: 只抓还没有 JD 的岗位, 重跑自动补齐
|
||||
* 详情页: 进入每个岗位页面抓 职位描述(JD)全文 + 技能标签
|
||||
|
||||
合规提醒: 仅抓公开岗位字段, 不抓招聘者个人联系方式; 低频自用, 遵守站点条款。
|
||||
|
||||
用法:
|
||||
# 抓列表(前5页) + 进详情页抓 JD
|
||||
python examples/scrape_zhaopin_full.py --keyword AI --pages 5
|
||||
|
||||
# 只抓列表, 不进详情
|
||||
python examples/scrape_zhaopin_full.py --keyword 大模型 --pages 5 --no-detail
|
||||
|
||||
# 中断后直接重跑同一命令 => 自动从断点继续
|
||||
python examples/scrape_zhaopin_full.py --keyword AI --pages 5
|
||||
|
||||
# 导出已抓数据到 CSV
|
||||
python examples/scrape_zhaopin_full.py --export ai_jobs.csv
|
||||
|
||||
# 重置某关键词的列表进度(重新从第1页抓)
|
||||
python examples/scrape_zhaopin_full.py --keyword AI --reset
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sqlite3
|
||||
import time
|
||||
from contextlib import closing
|
||||
from urllib.parse import quote
|
||||
|
||||
from invisible_playwright import InvisiblePlaywright
|
||||
|
||||
# Default SQLite file used when --db is not given.
DB_DEFAULT = "zhaopin_jobs.db"

# Page-side function for a search-result page: returns one object per
# ".joblist-box__item" card with title, link, salary, tags, location,
# experience, education and company fields.
_LIST_JS = r"""
() => {
  const txt = (el) => el ? el.innerText.trim() : "";
  return Array.from(document.querySelectorAll(".joblist-box__item")).map(card => {
    const nameA = card.querySelector("a.jobinfo__name");
    const info = Array.from(card.querySelectorAll(".jobinfo__other-info-item"))
      .map(e => e.innerText.trim());
    const locSpan = card.querySelector(".jobinfo__other-info-item span");
    const companyA = card.querySelector("a.companyinfo__name");
    return {
      title: txt(nameA),
      link: nameA ? nameA.href : "",
      salary: txt(card.querySelector(".jobinfo__salary")),
      job_tags: Array.from(card.querySelectorAll(".jobinfo__tag .joblist-box__item-tag"))
        .map(e => e.innerText.trim()),
      location: locSpan ? locSpan.innerText.trim() : (info[0] || ""),
      experience: info[1] || "",
      education: info[2] || "",
      company: companyA ? (companyA.getAttribute("title") || companyA.innerText).trim() : "",
      company_url: companyA ? companyA.href : "",
      company_tags: Array.from(card.querySelectorAll(".companyinfo__tag .joblist-box__item-tag"))
        .map(e => e.innerText.trim()),
    };
  });
}
"""

# Page-side function for a job detail page: returns the description text
# ("jd", empty when the container is missing) and the list of skill labels.
_DETAIL_JS = r"""
() => {
  const c = document.querySelector(".describtion-card__detail-content");
  const skills = Array.from(document.querySelectorAll(".describtion-card__skills-item"))
    .map(e => e.innerText.trim());
  return { jd: c ? c.innerText.trim() : "", skills };
}
"""
|
||||
|
||||
|
||||
# ── DB ──────────────────────────────────────────────────────────────────
|
||||
def init_db(path: str) -> sqlite3.Connection:
    """Open the SQLite database at *path* and make sure both tables exist.

    Creates ``jobs`` (one row per job, keyed by ``job_id``) and ``progress``
    (last list page crawled per scope) if they are missing, commits, and
    returns the open connection; the caller is responsible for closing it.
    """
    jobs_ddl = """
        CREATE TABLE IF NOT EXISTS jobs (
            job_id TEXT PRIMARY KEY,
            title TEXT,
            salary TEXT,
            experience TEXT,
            education TEXT,
            location TEXT,
            company TEXT,
            company_url TEXT,
            job_tags TEXT,
            company_tags TEXT,
            link TEXT,
            jd_text TEXT,
            skills TEXT,
            keyword TEXT,
            created_at TEXT DEFAULT (datetime('now','localtime')),
            detail_at TEXT
        )
    """
    progress_ddl = """
        CREATE TABLE IF NOT EXISTS progress (
            scope TEXT PRIMARY KEY, -- f"{keyword}|{jl}"
            last_page INTEGER DEFAULT 0
        )
    """
    db = sqlite3.connect(path)
    for statement in (jobs_ddl, progress_ddl):
        db.execute(statement)
    db.commit()
    return db
|
||||
|
||||
|
||||
def job_id_from_link(link: str) -> str:
    """Return the id between "/jobdetail/" and ".htm" in *link*, or *link* itself if absent."""
    found = re.search(r"/jobdetail/([^.?/]+)\.htm", link)
    if found is None:
        return link
    return found.group(1)
|
||||
|
||||
|
||||
def get_last_page(conn: sqlite3.Connection, scope: str) -> int:
    """Return the ``last_page`` stored for *scope* in ``progress``, or 0 when there is no row."""
    cursor = conn.execute("SELECT last_page FROM progress WHERE scope=?", (scope,))
    hit = cursor.fetchone()
    if hit:
        return hit[0]
    return 0
|
||||
|
||||
|
||||
def set_last_page(conn: sqlite3.Connection, scope: str, page: int) -> None:
    """Insert or overwrite the ``last_page`` recorded for *scope*, then commit."""
    upsert_sql = (
        "INSERT INTO progress(scope,last_page) VALUES(?,?) "
        "ON CONFLICT(scope) DO UPDATE SET last_page=excluded.last_page"
    )
    conn.execute(upsert_sql, (scope, page))
    conn.commit()
|
||||
|
||||
|
||||
def upsert_job(conn: sqlite3.Connection, r: dict, keyword: str) -> bool:
    """Insert one list-page record into ``jobs`` unless its id is already stored.

    The id is derived from ``r["link"]``; a record without one is ignored.
    Tag lists are stored as JSON text. The insert is committed immediately.

    Returns:
        True when a new row was inserted, False when it was skipped.
    """
    jid = job_id_from_link(r.get("link", ""))
    if not jid:
        return False

    def as_json(key: str) -> str:
        return json.dumps(r.get(key) or [], ensure_ascii=False)

    # Values in the same order as the column list of the INSERT below.
    plain_keys = ("title", "salary", "experience", "education",
                  "location", "company", "company_url")
    values = (
        jid,
        *(r.get(k) for k in plain_keys),
        as_json("job_tags"),
        as_json("company_tags"),
        r.get("link"),
        keyword,
    )
    cursor = conn.execute(
        """INSERT OR IGNORE INTO jobs
        (job_id,title,salary,experience,education,location,company,
        company_url,job_tags,company_tags,link,keyword)
        VALUES(?,?,?,?,?,?,?,?,?,?,?,?)""",
        values,
    )
    conn.commit()
    inserted = cursor.rowcount > 0
    return inserted
|
||||
|
||||
|
||||
# ── 抓取 ────────────────────────────────────────────────────────────────
|
||||
def crawl_list(conn, page, keyword: str, pages: int, jl: str | None) -> None:
    """Crawl search-result pages for *keyword*, resuming after the last recorded page.

    Progress is tracked per ``"{keyword}|{jl}"`` scope. Every page's cards are
    stored through ``upsert_job`` and the page number is recorded afterwards.
    The loop stops at the first page whose job list fails to load, so a later
    run resumes from that page.
    """
    scope = f"{keyword}|{jl or ''}"
    first = get_last_page(conn, scope) + 1
    if first > pages:
        print(f"[列表] '{keyword}' 已抓到第 {first - 1} 页, 目标 {pages} 页, 无需续抓。")
        return
    print(f"[列表] '{keyword}' 从第 {first} 页抓到第 {pages} 页")

    city_suffix = f"&jl={jl}" if jl else ""
    for n in range(first, pages + 1):
        url = f"https://sou.zhaopin.com/?kw={quote(keyword)}&p={n}" + city_suffix
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=60000)
            page.wait_for_selector(".joblist-box__item", timeout=20000)
        except Exception:
            print(f" 第 {n} 页未加载出列表, 停止本轮 (重跑可从此页续)。")
            break
        page.wait_for_timeout(1200)
        rows = page.evaluate(_LIST_JS)
        new = 0
        for record in rows:
            if upsert_job(conn, record, keyword):
                new += 1
        # Record progress only after the page's rows have been stored.
        set_last_page(conn, scope, n)
        print(f" 第 {n} 页: {len(rows)} 条 (新增 {new})")
        time.sleep(random.uniform(3, 7))
|
||||
|
||||
|
||||
def crawl_details(conn, page, keyword: str | None, limit: int | None) -> None:
    """Visit the detail page of every stored job that has no JD text yet and save it.

    Args:
        conn: Open SQLite connection holding the ``jobs`` table.
        page: Browser page used for navigation and evaluation.
        keyword: When truthy, only jobs stored under this keyword are processed.
        limit: When truthy, the maximum number of jobs handled in this run;
            ``None`` or 0 means no limit.

    Each successful job is committed on its own, so an interrupted run keeps
    everything fetched so far. A job that fails is reported and left for a
    later run.
    """
    q = "SELECT job_id,link FROM jobs WHERE (jd_text IS NULL OR jd_text='') AND link!=''"
    params: list = []
    if keyword:
        q += " AND keyword=?"
        params.append(keyword)
    q += " ORDER BY created_at"
    if limit:
        # Bound as a parameter instead of being formatted into the SQL text.
        q += " LIMIT ?"
        params.append(int(limit))
    todo = conn.execute(q, params).fetchall()
    if not todo:
        print("[详情] 没有待补充 JD 的岗位。")
        return
    total = len(todo)
    print(f"[详情] 待抓 JD: {total} 条")
    for i, (jid, link) in enumerate(todo, 1):
        try:
            page.goto(link, wait_until="domcontentloaded", timeout=60000)
            page.wait_for_selector(".describtion-card__detail-content", timeout=15000)
            page.wait_for_timeout(800)
            data = page.evaluate(_DETAIL_JS)
        except Exception as e:
            # Broad on purpose: any navigation or evaluation failure should
            # skip this job only, not abort the whole run.
            print(f" [{i}/{total}] {jid} 抓取失败: {str(e)[:60]}")
        else:
            conn.execute(
                "UPDATE jobs SET jd_text=?, skills=?, detail_at=datetime('now','localtime') "
                "WHERE job_id=?",
                (data.get("jd", ""),
                 json.dumps(data.get("skills") or [], ensure_ascii=False), jid),
            )
            conn.commit()  # commit per job so an interruption loses nothing
            jd_len = len(data.get("jd", ""))
            print(f" [{i}/{total}] {jid} JD {jd_len} 字")
        if i < total:
            # Pause after failures too: a failure is the most likely sign of
            # throttling, so the next request must not follow immediately.
            time.sleep(random.uniform(2, 5))
|
||||
|
||||
|
||||
# ── 导出 ────────────────────────────────────────────────────────────────
|
||||
def export_csv(conn, path: str, keyword: str | None) -> None:
    """Export stored jobs to a CSV file at *path* (UTF-8 with BOM).

    Args:
        conn: Open SQLite connection holding the ``jobs`` table.
        path: Destination CSV file; overwritten if it exists.
        keyword: When truthy, only rows stored under this keyword are exported.

    The ``job_tags``, ``company_tags`` and ``skills`` columns hold JSON arrays
    in the database and are written joined with " / ". A value that is not a
    valid JSON array of strings is written unchanged.
    """
    # Single source of truth for both the SELECT list and the CSV header.
    cols = ["title", "salary", "experience", "education", "location", "company",
            "job_tags", "company_tags", "skills", "jd_text", "link"]
    json_cols = [cols.index(name) for name in ("job_tags", "company_tags", "skills")]

    q = f"SELECT {','.join(cols)} FROM jobs"
    params: list = []
    if keyword:
        q += " WHERE keyword=?"
        params.append(keyword)
    rows = conn.execute(q, params).fetchall()

    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        w = csv.writer(f)
        w.writerow(cols)
        for row in rows:
            row = list(row)
            for idx in json_cols:
                try:
                    parsed = json.loads(row[idx] or "[]")
                    # Only join real lists: joining a JSON string would
                    # interleave " / " between its characters.
                    if isinstance(parsed, list):
                        row[idx] = " / ".join(parsed)
                except (TypeError, ValueError):
                    pass  # not JSON, or non-string items: keep the raw value
            w.writerow(row)
    print(f"已导出 {len(rows)} 条 -> {path}")
|
||||
|
||||
|
||||
# ── main ────────────────────────────────────────────────────────────────
|
||||
def main() -> None:
    """Command-line entry point: crawl list pages, optionally detail pages, or export to CSV.

    * ``--export CSV`` writes the stored jobs to a CSV file and exits. Only
      rows of ``--keyword`` are exported when it is given explicitly;
      otherwise every row is exported.
    * ``--reset`` deletes the stored list progress for the keyword/city scope
      before crawling.
    * Otherwise the list pages are crawled (resuming from stored progress) and,
      unless ``--no-detail`` is set, the missing detail pages are fetched.
    """
    ap = argparse.ArgumentParser()
    # Default is None so an explicit "--keyword AI" can be told apart from
    # "not given"; crawling still falls back to "AI" below.
    ap.add_argument("--keyword", default=None,
                    help="搜索关键词 (抓取时默认 AI; 导出时不填=全部)")
    ap.add_argument("--pages", type=int, default=3, help="列表抓取页数")
    ap.add_argument("--jl", default=None, help="城市编码, 不填=全国")
    ap.add_argument("--db", default=DB_DEFAULT, help="SQLite 文件路径")
    ap.add_argument("--no-detail", action="store_true", help="只抓列表, 不进详情页")
    ap.add_argument("--detail-limit", type=int, default=None, help="本轮最多抓多少条 JD")
    ap.add_argument("--headful", action="store_true", help="显示浏览器窗口")
    ap.add_argument("--reset", action="store_true", help="重置该关键词的列表进度")
    ap.add_argument("--export", metavar="CSV", help="导出已抓数据到 CSV 后退出")
    args = ap.parse_args()

    keyword = args.keyword or "AI"

    with closing(init_db(args.db)) as conn:
        if args.export:
            # Filter only when the user actually asked for a keyword.
            export_csv(conn, args.export, args.keyword)
            return
        if args.reset:
            scope = f"{keyword}|{args.jl or ''}"
            conn.execute("DELETE FROM progress WHERE scope=?", (scope,))
            conn.commit()
            print(f"已重置进度: {scope}")

        with InvisiblePlaywright(seed=42, headless=not args.headful) as browser:
            page = browser.new_page()
            crawl_list(conn, page, keyword, args.pages, args.jl)
            if not args.no_detail:
                crawl_details(conn, page, keyword, args.detail_limit)

        total = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0]
        with_jd = conn.execute(
            "SELECT COUNT(*) FROM jobs WHERE jd_text IS NOT NULL AND jd_text!=''"
        ).fetchone()[0]
        print(f"\n库内合计 {total} 条, 其中含 JD 全文 {with_jd} 条 -> {args.db}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,112 @@
|
||||
"""抓取 BOSS 直聘「AI 相关」岗位(自用 / 学习用途)。
|
||||
|
||||
合规提醒:
|
||||
- 仅抓取公开展示的岗位标题/薪资/公司等字段, 不抓取招聘者个人联系方式。
|
||||
- 低频请求, 遵守目标站点服务条款; 风险自负。
|
||||
|
||||
首次使用:
|
||||
python scrape_zhipin_ai.py --login # 打开浏览器, 手动扫码登录一次
|
||||
之后:
|
||||
python scrape_zhipin_ai.py --keyword AI --city 101010100 --pages 3
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
from invisible_playwright import InvisiblePlaywright
|
||||
|
||||
# Browser profile directory next to this script; keeps the login session between runs.
PROFILE_DIR = Path(__file__).parent / ".zhipin_profile"
# Fixed seed so the fingerprint stays the same across sessions, matching the persistent profile.
SEED = 20240614


def login_flow() -> None:
    """Open the login page with the persistent profile and wait for a manual login.

    Blocks on ``input()`` until the user presses Enter in the terminal, then
    reports where the profile is stored.
    """
    login_url = "https://www.zhipin.com/web/user/?ka=header-login"
    # NOTE(review): no headless= argument is passed here, unlike the zhaopin
    # scripts; confirm the library default opens a visible window.
    with InvisiblePlaywright(seed=SEED, profile_dir=PROFILE_DIR) as ctx:
        tab = ctx.new_page()
        tab.goto(login_url, wait_until="domcontentloaded")
        print("请在打开的浏览器中扫码登录, 登录完成后回到终端按回车...")
        input()  # wait for the user to finish logging in
    print("登录态已保存到", PROFILE_DIR)
|
||||
|
||||
|
||||
def scrape(keyword: str, city: str, pages: int) -> list[dict]:
    """Scrape up to *pages* search-result pages and return the job cards as dicts.

    Args:
        keyword: Search keyword; URL-quoted into the ``query`` parameter.
        city: City code placed in the ``city`` parameter.
        pages: Number of result pages to request, starting from page 1.

    Returns:
        One dict per job card with ``title``, ``salary``, ``company``,
        ``tags``, ``area`` and ``link``. If a page cannot be opened the crawl
        stops early and the rows collected so far are returned.

    When the job list does not appear, the function waits on ``input()`` so a
    captcha or login can be handled by hand before the page is read.
    """
    # Selectors need to be checked against the live page structure.
    card_selector = "li.job-card-wrapper"
    extract_js = """els => els.map(e => ({
        title: e.querySelector('.job-name')?.innerText?.trim(),
        salary: e.querySelector('.salary')?.innerText?.trim(),
        company: e.querySelector('.company-name')?.innerText?.trim(),
        tags: Array.from(e.querySelectorAll('.tag-list li'))
            .map(t => t.innerText.trim()),
        area: e.querySelector('.job-area')?.innerText?.trim(),
        link: e.querySelector('a.job-card-left')?.href
            || e.querySelector('a')?.href,
    }))"""

    results: list[dict] = []
    with InvisiblePlaywright(seed=SEED, profile_dir=PROFILE_DIR) as ctx:
        page = ctx.new_page()
        for n in range(1, pages + 1):
            url = (
                "https://www.zhipin.com/web/geek/job"
                f"?query={quote(keyword)}&city={city}&page={n}"
            )
            try:
                page.goto(url, wait_until="domcontentloaded")
            except Exception as e:
                # A navigation error must not discard earlier pages: stop here
                # and return what has been collected so the caller can save it.
                print(f"第 {n} 页打开失败, 提前结束: {str(e)[:60]}")
                break

            try:
                page.wait_for_selector(card_selector, timeout=15000)
            except Exception:
                print(f"第 {n} 页未出现岗位列表, 可能需要登录或触发了验证码。")
                # Give the user time to pass the captcha by hand.
                input("处理完页面后按回车继续...")

            rows = page.eval_on_selector_all(card_selector, extract_js)
            print(f"第 {n} 页抓到 {len(rows)} 条")
            results.extend(rows)

            # Low request rate: random pause to reduce the chance of throttling.
            time.sleep(random.uniform(4, 9))
    return results
|
||||
|
||||
|
||||
def save(rows: list[dict], stem: str) -> None:
    """Write *rows* to ``{stem}.json`` and, when non-empty, to ``{stem}.csv``.

    The JSON file keeps ``tags`` as a list. In the CSV the tags are joined
    with " / ", keys outside the fixed column list are dropped, and the file
    starts with a UTF-8 BOM.
    """
    with open(f"{stem}.json", "w", encoding="utf-8") as out:
        out.write(json.dumps(rows, ensure_ascii=False, indent=2))

    if rows:
        columns = ["title", "salary", "company", "area", "tags", "link"]

        def flatten(record: dict) -> dict:
            # Copy so the caller's dicts keep their list-valued tags.
            flat = dict(record)
            flat["tags"] = " / ".join(flat.get("tags") or [])
            return flat

        with open(f"{stem}.csv", "w", newline="", encoding="utf-8-sig") as out:
            writer = csv.DictWriter(out, fieldnames=columns, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(flatten(r) for r in rows)
    print(f"已保存 {len(rows)} 条 → {stem}.json / {stem}.csv")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: run the one-time login, or scrape and save the results.

    With ``--login`` only the manual login flow runs. Otherwise the result
    pages are scraped and written to ``zhipin_<keyword>.json`` / ``.csv``.
    Characters that are not valid in file names are replaced with "_" in the
    keyword.
    """
    import re  # function-scope: only needed to build a safe output file name

    ap = argparse.ArgumentParser()
    ap.add_argument("--login", action="store_true", help="首次手动登录")
    ap.add_argument("--keyword", default="AI", help="搜索关键词")
    ap.add_argument("--city", default="101010100", help="城市编码 (101010100=北京)")
    ap.add_argument("--pages", type=int, default=3, help="抓取页数")
    args = ap.parse_args()

    if args.login:
        login_flow()
        return

    rows = scrape(args.keyword, args.city, args.pages)
    # The keyword is free-form user input. A path separator or another
    # character that is illegal in file names would make the write fail after
    # the crawl has finished and throw the results away.
    safe_keyword = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", args.keyword)
    save(rows, stem=f"zhipin_{safe_keyword}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user