ba2a67290d
e2e / e2e (linux, xvfb) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.11) (push) Waiting to run
tests / pytest (ubuntu-latest, py3.12) (push) Waiting to run
tests / pytest (windows-latest, py3.11) (push) Waiting to run
tests / pytest (windows-latest, py3.12) (push) Waiting to run
- scrape_zhaopin_ai.py: lightweight no-login list scraper
- scrape_zhaopin_full.py: SQLite storage, resumable crawl, detail-page JD
- analyze_zhaopin.py: stats by city/salary/education/experience/skills
- scrape_zhipin_ai.py: BOSS Zhipin variant (login-based, fallback)
- gitignore scraper data artifacts and browser profile
170 lines
5.3 KiB
Python
170 lines
5.3 KiB
Python
"""智联招聘抓取数据分析 —— 读取 SQLite, 按城市/薪资/学历/经验/技能做统计。
|
|
|
|
纯标准库, 无需 pandas。
|
|
|
|
用法:
|
|
python examples/analyze_zhaopin.py # 全量分析
|
|
python examples/analyze_zhaopin.py --keyword AI # 只看某关键词
|
|
python examples/analyze_zhaopin.py --city 北京 # 只看某城市
|
|
python examples/analyze_zhaopin.py --top 15 # 排行榜取前 15
|
|
python examples/analyze_zhaopin.py --db zhaopin_jobs.db
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sqlite3
|
|
import statistics
|
|
from collections import Counter
|
|
|
|
WORKDAYS_PER_MONTH = 21.75 # 日薪 -> 月薪折算
|
|
|
|
|
|
def parse_salary(s: str) -> tuple[int, int] | None:
|
|
"""把薪资字符串解析成 (月薪下限, 月薪上限) 元。无法解析返回 None。"""
|
|
if not s or "面议" in s:
|
|
return None
|
|
s = s.strip()
|
|
is_daily = "/天" in s or "元/天" in s
|
|
is_wan = "万" in s
|
|
nums = re.findall(r"\d+(?:\.\d+)?", s)
|
|
if not nums:
|
|
return None
|
|
vals = [float(x) for x in nums[:2]]
|
|
if len(vals) == 1:
|
|
vals = [vals[0], vals[0]]
|
|
lo, hi = vals[0], vals[1]
|
|
if is_wan:
|
|
lo, hi = lo * 10000, hi * 10000
|
|
if is_daily:
|
|
lo, hi = lo * WORKDAYS_PER_MONTH, hi * WORKDAYS_PER_MONTH
|
|
return int(lo), int(hi)
|
|
|
|
|
|
def city_of(location: str) -> str:
    """Return the part of ``location`` before the first "·", stripped.

    Falls back to "未知" when ``location`` is empty/``None`` or when that
    leading part is blank.
    """
    if not location:
        return "未知"
    head, _, _ = location.partition("·")
    head = head.strip()
    return head if head else "未知"
|
|
|
|
|
|
# Monthly-salary histogram bins as (lower bound, upper bound, label).
# bucket_of treats the lower bound as inclusive and the upper bound as exclusive.
SALARY_BUCKETS = [
    (0, 5000, "<5k"),
    (5000, 8000, "5-8k"),
    (8000, 12000, "8-12k"),
    (12000, 18000, "12-18k"),
    (18000, 25000, "18-25k"),
    (25000, 10**9, ">=25k"),
]


def bucket_of(mid: float) -> str:
    """Return the label of the first bucket containing ``mid``, or "?" if none does."""
    candidates = (
        tag for floor, ceiling, tag in SALARY_BUCKETS if floor <= mid < ceiling
    )
    return next(candidates, "?")
|
|
|
|
|
|
def bar(n: int, maxn: int, width: int = 30) -> str:
    """Render ``n`` as a run of "█" scaled so that ``maxn`` spans ``width`` cells.

    Returns an empty string when ``maxn`` is not positive; otherwise the bar
    is always at least one cell long.
    """
    if maxn > 0:
        cells = round(n / maxn * width)
        if cells < 1:
            cells = 1
        return "█" * cells
    return ""
|
|
|
|
|
|
def section(title: str) -> None:
    """Print a blank line, then ``title`` framed by two 56-character "=" rules."""
    rule = "=" * 56
    print("\n" + rule, title, rule, sep="\n")
|
|
|
|
|
|
def rank_table(counter: Counter, top: int, label: str) -> None:
    """Print the ``top`` most common entries of ``counter`` with counts and bars.

    Bars are scaled against the largest count in ``counter``; names are cut to
    18 characters. An empty counter prints a "no data" placeholder instead.
    ``label`` is accepted but not used by the body.
    """
    if not counter:
        print(" (无数据)")
        return
    peak = max(counter.values())
    for key, count in counter.most_common(top):
        print(f" {key[:18]:<18} {count:>4} {bar(count, peak)}")
|
|
|
|
|
|
def _load_rows(db_path: str, keyword: str | None) -> list[tuple]:
    """Read the analysed columns of the ``jobs`` table, optionally for one keyword.

    The connection is closed before returning, including when the query raises.
    """
    query = ("SELECT title,salary,experience,education,location,company,"
             "job_tags,company_tags,skills FROM jobs")
    params: list = []
    if keyword:
        query += " WHERE keyword=?"
        params.append(keyword)
    conn = sqlite3.connect(db_path)
    try:
        return conn.execute(query, params).fetchall()
    finally:
        # Using the connection as a context manager would only commit or roll
        # back; the handle has to be closed explicitly.
        conn.close()


def _decode_tags(raw: str | None) -> list[str]:
    """Decode one JSON-array column into its stripped, non-empty string items.

    Empty values, undecodable JSON and JSON that is not an array yield an
    empty list; non-string array items are skipped individually.
    """
    if not raw:
        return []
    try:
        decoded = json.loads(raw)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return []
    if not isinstance(decoded, list):
        return []
    return [item.strip() for item in decoded if isinstance(item, str) and item.strip()]


def main() -> None:
    """Parse the CLI options, load matching jobs and print the statistics report.

    Options: ``--db`` (SQLite path), ``--keyword`` (exact match on the keyword
    column), ``--city`` (exact match on the city derived from the location
    column) and ``--top`` (number of rows per ranking). The report has five
    sections: city distribution, salary buckets with summary statistics,
    education, experience, and skill/tag frequency.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default="zhaopin_jobs.db")
    ap.add_argument("--keyword", default=None, help="按关键词过滤")
    ap.add_argument("--city", default=None, help="按城市过滤")
    ap.add_argument("--top", type=int, default=12, help="排行榜条数")
    args = ap.parse_args()

    rows = _load_rows(args.db, args.keyword)

    # The city filter runs in Python because the city is derived from the
    # location column by city_of rather than stored on its own.
    if args.city:
        rows = [r for r in rows if city_of(r[4]) == args.city]

    if not rows:
        print("没有匹配的数据。先用 scrape_zhaopin_full.py 抓一些, 或检查过滤条件。")
        return

    n_total = len(rows)
    print(f"分析样本: {n_total} 条"
          + (f" | 关键词={args.keyword}" if args.keyword else "")
          + (f" | 城市={args.city}" if args.city else ""))

    cities, edus, exps = Counter(), Counter(), Counter()
    skill_freq, salary_buckets = Counter(), Counter()
    mids: list[float] = []  # midpoint of each parseable monthly salary range

    for _title, salary, exp, edu, loc, _comp, jtags, _ctags, skills in rows:
        cities[city_of(loc)] += 1
        edus[(edu or "未知").strip() or "未知"] += 1
        exps[(exp or "未知").strip() or "未知"] += 1
        # Skill frequency merges the skills and job_tags columns.
        for column in (skills, jtags):
            skill_freq.update(_decode_tags(column))
        rng = parse_salary(salary)
        if rng:
            mid = (rng[0] + rng[1]) / 2
            mids.append(mid)
            salary_buckets[bucket_of(mid)] += 1

    section("城市分布 (Top)")
    rank_table(cities, args.top, "城市")

    section("薪资区间分布 (按月薪中位点)")
    if mids:
        # List buckets in their declared order rather than by frequency.
        order = {label: i for i, (_, _, label) in enumerate(SALARY_BUCKETS)}
        maxn = max(salary_buckets.values())
        for label in sorted(salary_buckets, key=lambda x: order.get(x, 99)):
            n = salary_buckets[label]
            print(f" {label:<8} {n:>4} {bar(n, maxn)}")
        print(f"\n 可解析薪资: {len(mids)}/{n_total} 条 (面议等已排除)")
        print(f" 月薪中位点 平均: {statistics.mean(mids):>8,.0f} 元")
        print(f" 中位数: {statistics.median(mids):>8,.0f} 元")
        print(f" 最低/最高: {min(mids):,.0f} / {max(mids):,.0f} 元")
    else:
        print(" (无可解析薪资)")

    section("学历要求")
    rank_table(edus, args.top, "学历")

    section("经验要求")
    rank_table(exps, args.top, "经验")

    section("技能/标签词频 (Top)")
    rank_table(skill_freq, args.top, "技能")
|
|
|
|
|
|
# Run the report only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|