"""统一字段字典与枚举。 将各 sheet 中形态各异的表头映射到规范字段名(canonical), 并声明哪些字段是"数值 + 单位"型(需要拆分)。 对应任务 T-1.1(字段字典)/ T-1.2(字段映射)。 """ # 规范字段 -> 该字段的同义表头集合(去空格后匹配) # 注意:表头里常含空格/换行,匹配前统一做 normalize_header() 处理。 FIELD_SYNONYMS = { "series": ["系列"], "model_code": ["型号", "型号(代号)", "车型", "车型代码", "小表编号车型"], "full_name": ["车型全称", "车型名称"], "first_year": ["首产时间", "制造时间", "生产时间"], "last_year": ["停产时间"], "manufacturer": ["生产商", "主要生产厂商", "生产厂商", "制造商"], "length": ["车体长度/mm", "机车长度/mm", "车辆全长/mm", "车体全长/mm"], "width": ["车体宽度/mm", "机车宽度/mm"], "height": ["车体高度/mm"], "wheelbase": ["轴距/mm", "车辆定距/mm"], "weight": ["整备重量/t", "自重/t", "整备重量/轴重/t", "整备重量/节数"], "axle_load": ["轴重/t"], "axle_arrangement": ["轴列式", "轴式", "轴列式(标准编组)"], "tractive_start": ["牵引力(起动)/kN"], "tractive_cont": ["牵引力(持续)/kN"], "power_kw": ["牵引力/kW"], "efficiency": ["传动效率", "机械效率"], "drive": ["传动方式/动力来源", "供电方式(传动方式)", "供电方式", "制动方式"], "max_speed": ["最高运行时速", "最高运营时速", "构造速度/km/h", "运行时速", "最高速度", "构造速度"], "usage": ["用途", "主要用途", "核心运行区段/主题"], "production_count": ["产量", "累计产量", "生产台数", "生产数量", "累计产量/配属"], "capacity": ["容积/m³", "定员/人"], "load": ["载重/t"], "bogie": ["转向架型号"], "coupler": ["车钩类型"], # 个体(Unit)粒度字段 "car_number": ["车号", "编号"], "function": ["功能"], "depot": ["配属", "所属路局"], "livery": ["颜色", "涂装"], "status": ["状态"], "side_mark": ["侧标"], "note": ["备注"], "location": ["存放位置"], "formation": ["动力车、拖车", "动力车拖车"], "predecessor": ["前身(部分含)", "前身"], "lifespan": ["最大使用寿命(结合实际)"], "tour_name": ["旅游列车名称"], "tractor_models": ["牵引机车常用型号"], } # "数值 + 单位"型字段:导入时拆出 _value(float) 与 _unit(text) NUMERIC_UNIT_FIELDS = { "length": "mm", "width": "mm", "height": "mm", "wheelbase": "mm", "weight": "t", "axle_load": "t", "load": "t", "tractive_start": "kN", "tractive_cont": "kN", "power_kw": "kW", "max_speed": "km/h", "capacity": "", } # 状态枚举(规范化目标) STATUS_ENUM = ["现役", "封存", "半封存", "报废", "退役", "保存", "试验", "未知"] # 国别属性枚举 COUNTRY_TYPE_ENUM = ["国产", "进口", "引进仿制", "中外合资", "未知"] # sheet 文件名 -> 分类配置 # grain: "model" 车型粒度 / "unit" 个体粒度 CATEGORY_CONFIG = { "中国蒸汽汽车统计表(建国前)": {"category": "蒸汽机车", "subcat": "建国前", "grain": "model"}, "中国蒸汽机车型号表(建国后)": {"category": "蒸汽机车", "subcat": "建国后", "grain": "model"}, "北京表(蒸汽机车)": {"category": "蒸汽机车", "subcat": "北京现存", "grain": "unit"}, "全国普速客车型号统计表": {"category": "客车", "subcat": "普速客车", "grain": "model"}, "全国电力机车型号表": {"category": "电力机车", "subcat": "", "grain": "model", "col_override": {0: "series", 1: "model_code"}}, "全国内燃机车型号表": {"category": "内燃机车", "subcat": "", "grain": "model"}, "CJ型动车组、早期动车组及和谐号统计表": {"category": "动车组", "subcat": "CJ/早期/和谐号", "grain": "model"}, "全国复兴号统计表": {"category": "动车组", "subcat": "复兴号", "grain": "model"}, "中国铁路货车统计表": {"category": "货车", "subcat": "", "grain": "model"}, "中国铁路普速检测车统计表": {"category": "检测车", "subcat": "普速检测", "grain": "unit"}, "中国高速综合检测列车统计表": {"category": "检测车", "subcat": "高速综合检测", "grain": "unit"}, "中国旅游列车统计表": {"category": "旅游列车", "subcat": "", "grain": "model"}, } # 用于识别"表头行"的关键 token(normalize 后) HEADER_TOKENS = {"型号", "车号", "车型代码", "车型", "旅游列车名称", "小表编号", "型号(代号)", "系列", "车型全称"} # 反向索引:normalized header -> canonical field _REVERSE = {} for _canon, _syns in FIELD_SYNONYMS.items(): for _s in _syns: _REVERSE[_s] = _canon def normalize_header(h: str) -> str: """规范化表头:去除空白、换行、全角空格,并将全角括号转半角。""" if h is None: return "" s = (h.replace("\n", "").replace("\r", "") .replace(" ", "").replace("\u3000", "").strip()) s = s.replace("(", "(").replace(")", ")") return s def map_header(h: str): """把原始表头映射到 canonical 字段名,未知返回 None。""" return _REVERSE.get(normalize_header(h))