"""智联招聘抓取数据分析 —— 读取 SQLite, 按城市/薪资/学历/经验/技能做统计。 纯标准库, 无需 pandas。 用法: python examples/analyze_zhaopin.py # 全量分析 python examples/analyze_zhaopin.py --keyword AI # 只看某关键词 python examples/analyze_zhaopin.py --city 北京 # 只看某城市 python examples/analyze_zhaopin.py --top 15 # 排行榜取前 15 python examples/analyze_zhaopin.py --db zhaopin_jobs.db """ from __future__ import annotations import argparse import json import re import sqlite3 import statistics from collections import Counter WORKDAYS_PER_MONTH = 21.75 # 日薪 -> 月薪折算 def parse_salary(s: str) -> tuple[int, int] | None: """把薪资字符串解析成 (月薪下限, 月薪上限) 元。无法解析返回 None。""" if not s or "面议" in s: return None s = s.strip() is_daily = "/天" in s or "元/天" in s is_wan = "万" in s nums = re.findall(r"\d+(?:\.\d+)?", s) if not nums: return None vals = [float(x) for x in nums[:2]] if len(vals) == 1: vals = [vals[0], vals[0]] lo, hi = vals[0], vals[1] if is_wan: lo, hi = lo * 10000, hi * 10000 if is_daily: lo, hi = lo * WORKDAYS_PER_MONTH, hi * WORKDAYS_PER_MONTH return int(lo), int(hi) def city_of(location: str) -> str: return (location or "").split("·")[0].strip() or "未知" SALARY_BUCKETS = [ (0, 5000, "<5k"), (5000, 8000, "5-8k"), (8000, 12000, "8-12k"), (12000, 18000, "12-18k"), (18000, 25000, "18-25k"), (25000, 10**9, ">=25k"), ] def bucket_of(mid: float) -> str: for lo, hi, label in SALARY_BUCKETS: if lo <= mid < hi: return label return "?" 
def _display_width(text: str) -> int:
    """Return the number of terminal columns *text* occupies.

    East Asian wide and fullwidth characters count as two columns.
    """
    import unicodedata  # local import: not part of the module's import header

    return sum(
        2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
        for ch in text
    )


def bar(n: int, maxn: int, width: int = 30) -> str:
    """Return a bar of "█" whose length is ``n / maxn`` scaled to *width*.

    Any positive count gets at least one block so small values stay visible;
    the bar never exceeds *width*. Returns "" when *n* or *maxn* is not
    positive.
    """
    if maxn <= 0 or n <= 0:
        return ""
    return "█" * min(width, max(1, round(n / maxn * width)))


def section(title: str) -> None:
    """Print *title* framed by two 56-character rules, after a blank line."""
    print("\n" + "=" * 56)
    print(title)
    print("=" * 56)


def rank_table(counter: Counter, top: int, label: str) -> None:
    """Print the *top* most common entries of *counter* with a bar per row.

    Names are cut to 18 characters and padded by display width, so rows with
    double-width CJK names line up with ASCII ones.

    Args:
        counter: Name -> count mapping to rank.
        top: Maximum number of rows to print.
        label: Kept for interface compatibility; currently unused.
    """
    if not counter:
        print(" (无数据)")
        return
    ranked = counter.most_common(top)
    if not ranked:
        return
    maxn = ranked[0][1]
    shown = [(str(name)[:18], n) for name, n in ranked]
    # Never narrower than the historical 18 columns; wider only when a
    # truncated CJK name needs more room.
    col = max(18, max(_display_width(text) for text, _ in shown))
    for text, n in shown:
        pad = " " * (col - _display_width(text))
        print(f" {text}{pad} {n:>4} {bar(n, maxn)}")


def _load_rows(db_path: str, keyword) -> list:
    """Fetch the analysed columns of the ``jobs`` table and close the connection."""
    query = ("SELECT title,salary,experience,education,location,company,"
             "job_tags,company_tags,skills FROM jobs")
    params: list = []
    if keyword:
        query += " WHERE keyword=?"
        params.append(keyword)
    conn = sqlite3.connect(db_path)
    try:
        return conn.execute(query, params).fetchall()
    finally:
        conn.close()


def _job_tags(*sources) -> list:
    """Return the distinct non-empty tags of one posting, in first-seen order.

    Each source is expected to hold a JSON array of strings. Empty, malformed
    or non-array sources are skipped on their own, and non-string items are
    ignored, so one bad value cannot discard the remaining tags.
    """
    seen: dict = {}  # dict as an ordered set keeps the output deterministic
    for raw in sources:
        if not raw:
            continue
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError):
            continue
        if not isinstance(parsed, list):
            continue
        for item in parsed:
            if isinstance(item, str):
                tag = item.strip()
                if tag:
                    seen.setdefault(tag)
    return list(seen)


def _label(value) -> str:
    """Return *value* stripped, or "未知" when it is missing or blank."""
    return (value or "未知").strip() or "未知"


def _print_salary_section(mids: list, n_total: int) -> None:
    """Print the salary histogram and summary statistics for *mids*."""
    section("薪资区间分布 (按月薪中位点)")
    if not mids:
        print(" (无可解析薪资)")
        return
    buckets = Counter(bucket_of(mid) for mid in mids)
    # Show buckets in their declared order rather than by frequency.
    order = {label: i for i, (_, _, label) in enumerate(SALARY_BUCKETS)}
    maxn = max(buckets.values())
    for label in sorted(buckets, key=lambda x: order.get(x, 99)):
        n = buckets[label]
        print(f" {label:<8} {n:>4} {bar(n, maxn)}")
    print(f"\n 可解析薪资: {len(mids)}/{n_total} 条 (面议等已排除)")
    print(f" 月薪中位点 平均: {statistics.mean(mids):>8,.0f} 元")
    print(f" 中位数: {statistics.median(mids):>8,.0f} 元")
    print(f" 最低/最高: {min(mids):,.0f} / {max(mids):,.0f} 元")


def main() -> None:
    """Parse CLI options, load matching postings and print every report section."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default="zhaopin_jobs.db")
    ap.add_argument("--keyword", default=None, help="按关键词过滤")
    ap.add_argument("--city", default=None, help="按城市过滤")
    ap.add_argument("--top", type=int, default=12, help="排行榜条数")
    args = ap.parse_args()

    rows = _load_rows(args.db, args.keyword)
    if args.city:
        # The city is derived from the location text, so filter in Python.
        rows = [r for r in rows if city_of(r[4]) == args.city]

    if not rows:
        print("没有匹配的数据。先用 scrape_zhaopin_full.py 抓一些, 或检查过滤条件。")
        return

    n_total = len(rows)
    print(f"分析样本: {n_total} 条"
          + (f" | 关键词={args.keyword}" if args.keyword else "")
          + (f" | 城市={args.city}" if args.city else ""))

    cities, edus, exps = Counter(), Counter(), Counter()
    skill_freq = Counter()
    mids: list[float] = []

    for (_title, salary, exp, edu, loc, _comp, jtags, _ctags, skills) in rows:
        cities[city_of(loc)] += 1
        edus[_label(edu)] += 1
        exps[_label(exp)] += 1

        # Skills and job tags are merged; a tag counts once per posting even
        # when it appears in both columns.
        skill_freq.update(_job_tags(skills, jtags))

        rng = parse_salary(salary)
        if rng:
            mids.append((rng[0] + rng[1]) / 2)

    section("城市分布 (Top)")
    rank_table(cities, args.top, "城市")

    _print_salary_section(mids, n_total)

    section("学历要求")
    rank_table(edus, args.top, "学历")

    section("经验要求")
    rank_table(exps, args.top, "经验")

    section("技能/标签词频 (Top)")
    rank_table(skill_freq, args.top, "技能")


if __name__ == "__main__":
    main()