Files
invisible_playwright/examples/analyze_zhaopin.py
T
freedakgmail ba2a67290d
e2e / e2e (linux, xvfb) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.11) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.12) (push) Waiting to run
tests / pytest (windows-latest, py3.11) (push) Waiting to run
tests / pytest (windows-latest, py3.12) (push) Waiting to run
examples: add zhaopin AI job scrapers + SQLite analysis
- scrape_zhaopin_ai.py: lightweight no-login list scraper
- scrape_zhaopin_full.py: SQLite storage, resumable crawl, detail-page JD
- analyze_zhaopin.py: stats by city/salary/education/experience/skills
- scrape_zhipin_ai.py: BOSS Zhipin variant (login-based, fallback)
- gitignore scraper data artifacts and browser profile
2026-06-14 23:18:51 +08:00

170 lines
5.3 KiB
Python

"""智联招聘抓取数据分析 —— 读取 SQLite, 按城市/薪资/学历/经验/技能做统计。
纯标准库, 无需 pandas。
用法:
python examples/analyze_zhaopin.py # 全量分析
python examples/analyze_zhaopin.py --keyword AI # 只看某关键词
python examples/analyze_zhaopin.py --city 北京 # 只看某城市
python examples/analyze_zhaopin.py --top 15 # 排行榜取前 15
python examples/analyze_zhaopin.py --db zhaopin_jobs.db
"""
from __future__ import annotations
import argparse
import json
import re
import sqlite3
import statistics
from collections import Counter
WORKDAYS_PER_MONTH = 21.75  # Average paid workdays per month; converts daily pay to monthly pay.

# Multiplier applied to an amount for each unit suffix that can follow it.
_SALARY_UNITS = {"万": 10_000, "千": 1_000, "k": 1_000}

# One amount plus its optional unit suffix, e.g. "1.2万", "8千", "25K", "8000".
_SALARY_AMOUNT = re.compile(r"(\d+(?:\.\d+)?)\s*([万千kK]?)")

# Bonus-month suffix such as "13薪"; it is a month count, not a salary amount.
_BONUS_MONTHS = re.compile(r"\d+(?:\.\d+)?\s*薪")


def parse_salary(s: str) -> tuple[int, int] | None:
    """Parse a salary string into a ``(monthly_min, monthly_max)`` pair in yuan.

    Handled forms include ``"5000-8000元"``, ``"1.5万-2.5万"``, ``"1-2万"``,
    ``"8千-1.2万"``, ``"15-25K"`` and daily rates such as ``"200元/天"``.

    Args:
        s: Raw salary text. May be empty or ``None``.

    Returns:
        The lower and upper monthly salary as integers, or ``None`` when the
        text is empty, marked negotiable ("面议"), or contains no number.
        A single amount is returned as both bounds.
    """
    if not s or "面议" in s:
        return None

    is_daily = "/天" in s
    # Remove "13薪"-style suffixes so the month count is never read as money.
    text = _BONUS_MONTHS.sub("", s.strip())

    amounts = _SALARY_AMOUNT.findall(text)[:2]
    if not amounts:
        return None
    if len(amounts) == 1:
        amounts = amounts * 2

    (lo_num, lo_unit), (hi_num, hi_unit) = amounts
    # "1-2万" writes the unit once; a bare amount borrows its sibling's unit.
    lo_unit = lo_unit or hi_unit
    hi_unit = hi_unit or lo_unit

    period_scale = WORKDAYS_PER_MONTH if is_daily else 1

    def to_monthly(number: str, unit: str) -> int:
        value = float(number) * _SALARY_UNITS.get(unit.lower(), 1) * period_scale
        # Round away binary float noise (e.g. 22999.999999999996) before
        # truncating, so "x.y万" never lands one yuan short.
        return int(round(value, 6))

    return to_monthly(lo_num, lo_unit), to_monthly(hi_num, hi_unit)
def city_of(location: str) -> str:
    """Return the city part of a ``city·district`` location string.

    The city is the text before the first "·", stripped of surrounding
    whitespace. A missing, empty, or blank city yields "未知".
    """
    if not location:
        return "未知"
    head, _sep, _rest = location.partition("·")
    city = head.strip()
    return city if city else "未知"
# Monthly-salary bands as (inclusive lower bound, exclusive upper bound, label).
SALARY_BUCKETS = [
    (0, 5000, "<5k"),
    (5000, 8000, "5-8k"),
    (8000, 12000, "8-12k"),
    (12000, 18000, "12-18k"),
    (18000, 25000, "18-25k"),
    (25000, 10**9, ">=25k"),
]


def bucket_of(mid: float) -> str:
    """Return the label of the salary band that contains ``mid``.

    Bands are half-open (``floor <= mid < ceiling``). A value that falls in
    no band yields "?".
    """
    matching = (
        name
        for floor, ceiling, name in SALARY_BUCKETS
        if floor <= mid < ceiling
    )
    return next(matching, "?")
def bar(n: int, maxn: int, width: int = 30) -> str:
    """Render ``n`` as a horizontal text bar scaled against ``maxn``.

    Args:
        n: Value to draw.
        maxn: Value that corresponds to a full-width bar.
        width: Number of cells in a full-width bar.

    Returns:
        A run of block characters whose length is proportional to
        ``n / maxn``, never shorter than one cell so small counts stay
        visible. Returns an empty string when ``maxn`` is not positive.
    """
    if maxn <= 0:
        return ""
    # The fill glyph must be non-empty: repeating "" always yields "".
    return "█" * max(1, round(n / maxn * width))
def section(title: str) -> None:
    """Print ``title`` as a section heading between two 56-character rules.

    A blank line is emitted before the top rule to separate sections.
    """
    rule = "=" * 56
    print(f"\n{rule}", title, rule, sep="\n")
def rank_table(counter: Counter, top: int, label: str) -> None:
    """Print the ``top`` most common entries of ``counter`` with scaled bars.

    Each row shows the entry name (cut to 18 characters), its count, and a
    bar scaled against the largest count in ``counter``. An empty counter
    prints a "no data" placeholder instead. ``label`` is accepted but not
    used by the output.
    """
    if counter:
        peak = max(counter.values())
        for entry, count in counter.most_common(top):
            shown = entry[:18]
            print(f" {shown:<18} {count:>4} {bar(count, peak)}")
    else:
        print(" (无数据)")
def _load_tags(raw: str | None) -> list[str]:
    """Return the non-empty, stripped string items of a JSON-array cell.

    The analysis is best-effort: a cell that is empty, is not valid JSON, or
    does not hold a JSON array contributes no tags, and non-string items
    inside an array are skipped rather than discarding the whole cell.
    """
    try:
        parsed = json.loads(raw or "[]")
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text cells.
        return []
    if not isinstance(parsed, list):
        return []
    return [item.strip() for item in parsed if isinstance(item, str) and item.strip()]


def main() -> None:
    """Read scraped jobs from SQLite and print summary statistics.

    Command-line options:
        --db       Path of the SQLite database (default ``zhaopin_jobs.db``).
        --keyword  Only analyse rows whose ``keyword`` column equals this value.
        --city     Only analyse rows whose city (see ``city_of``) equals this value.
        --top      Number of rows shown in each ranking (default 12).

    Prints the city distribution, salary-band distribution with summary
    figures, education and experience requirements, and skill/tag
    frequencies. Prints a hint and returns early when no row matches.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default="zhaopin_jobs.db")
    ap.add_argument("--keyword", default=None, help="按关键词过滤")
    ap.add_argument("--city", default=None, help="按城市过滤")
    ap.add_argument("--top", type=int, default=12, help="排行榜条数")
    args = ap.parse_args()

    q = ("SELECT title,salary,experience,education,location,company,"
         "job_tags,company_tags,skills FROM jobs")
    params: list = []
    if args.keyword:
        q += " WHERE keyword=?"
        params.append(args.keyword)

    # Close the connection explicitly: all rows are fetched up front, and a
    # sqlite3 connection is not closed by garbage collection in a timely way.
    conn = sqlite3.connect(args.db)
    try:
        rows = conn.execute(q, params).fetchall()
    finally:
        conn.close()

    # The city is derived from the location text, so it is filtered here
    # rather than in SQL.
    if args.city:
        rows = [r for r in rows if city_of(r[4]) == args.city]
    if not rows:
        print("没有匹配的数据。先用 scrape_zhaopin_full.py 抓一些, 或检查过滤条件。")
        return

    n_total = len(rows)
    print(f"分析样本: {n_total}"
          + (f" | 关键词={args.keyword}" if args.keyword else "")
          + (f" | 城市={args.city}" if args.city else ""))

    cities, edus, exps = Counter(), Counter(), Counter()
    skill_freq, salary_buckets = Counter(), Counter()
    mids: list[float] = []  # midpoint of every salary range that could be parsed

    for (_title, salary, exp, edu, loc, _comp, jtags, _ctags, skills) in rows:
        cities[city_of(loc)] += 1
        edus[(edu or "未知").strip() or "未知"] += 1
        exps[(exp or "未知").strip() or "未知"] += 1

        # Skill frequencies merge the skills and job_tags columns.
        for src in (skills, jtags):
            skill_freq.update(_load_tags(src))

        rng = parse_salary(salary)
        if rng is not None:
            mid = (rng[0] + rng[1]) / 2
            mids.append(mid)
            salary_buckets[bucket_of(mid)] += 1

    section("城市分布 (Top)")
    rank_table(cities, args.top, "城市")

    section("薪资区间分布 (按月薪中位点)")
    if mids:
        # Show bands in their declared order; unknown labels sort last.
        order = {label: i for i, (_, _, label) in enumerate(SALARY_BUCKETS)}
        maxn = max(salary_buckets.values())
        for label in sorted(salary_buckets, key=lambda x: order.get(x, 99)):
            n = salary_buckets[label]
            print(f" {label:<8} {n:>4} {bar(n, maxn)}")
        print(f"\n 可解析薪资: {len(mids)}/{n_total} 条 (面议等已排除)")
        print(f" 月薪中位点 平均: {statistics.mean(mids):>8,.0f}")
        print(f" 中位数: {statistics.median(mids):>8,.0f}")
        print(f" 最低/最高: {min(mids):,.0f} / {max(mids):,.0f}")
    else:
        print(" (无可解析薪资)")

    section("学历要求")
    rank_table(edus, args.top, "学历")

    section("经验要求")
    rank_table(exps, args.top, "经验")

    section("技能/标签词频 (Top)")
    rank_table(skill_freq, args.top, "技能")


if __name__ == "__main__":
    main()