Files
Train/app/etl/field_dict.py
T
2026-06-16 00:55:20 +08:00

110 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""统一字段字典与枚举。
将各 sheet 中形态各异的表头映射到规范字段名(canonical),
并声明哪些字段是"数值 + 单位"型(需要拆分)。
对应任务 T-1.1(字段字典)/ T-1.2(字段映射)。
"""
# 规范字段 -> 该字段的同义表头集合(去空格后匹配)
# 注意:表头里常含空格/换行,匹配前统一做 normalize_header() 处理。
FIELD_SYNONYMS = {
"series": ["系列"],
"model_code": ["型号", "型号(代号)", "车型", "车型代码", "小表编号车型"],
"full_name": ["车型全称", "车型名称"],
"first_year": ["首产时间", "制造时间", "生产时间"],
"last_year": ["停产时间"],
"manufacturer": ["生产商", "主要生产厂商", "生产厂商", "制造商"],
"length": ["车体长度/mm", "机车长度/mm", "车辆全长/mm", "车体全长/mm"],
"width": ["车体宽度/mm", "机车宽度/mm"],
"height": ["车体高度/mm"],
"wheelbase": ["轴距/mm", "车辆定距/mm"],
"weight": ["整备重量/t", "自重/t", "整备重量/轴重/t", "整备重量/节数"],
"axle_load": ["轴重/t"],
"axle_arrangement": ["轴列式", "轴式", "轴列式(标准编组)"],
"tractive_start": ["牵引力(起动)/kN"],
"tractive_cont": ["牵引力(持续)/kN"],
"power_kw": ["牵引力/kW"],
"efficiency": ["传动效率", "机械效率"],
"drive": ["传动方式/动力来源", "供电方式(传动方式)", "供电方式", "制动方式"],
"max_speed": ["最高运行时速", "最高运营时速", "构造速度/km/h", "运行时速",
"最高速度", "构造速度"],
"usage": ["用途", "主要用途", "核心运行区段/主题"],
"production_count": ["产量", "累计产量", "生产台数", "生产数量",
"累计产量/配属"],
"capacity": ["容积/m³", "定员/人"],
"load": ["载重/t"],
"bogie": ["转向架型号"],
"coupler": ["车钩类型"],
# 个体(Unit)粒度字段
"car_number": ["车号", "编号"],
"function": ["功能"],
"depot": ["配属", "所属路局"],
"livery": ["颜色", "涂装"],
"status": ["状态"],
"side_mark": ["侧标"],
"note": ["备注"],
"location": ["存放位置"],
"formation": ["动力车、拖车", "动力车拖车"],
"predecessor": ["前身(部分含)", "前身"],
"lifespan": ["最大使用寿命(结合实际)"],
"tour_name": ["旅游列车名称"],
"tractor_models": ["牵引机车常用型号"],
}
# "Value + unit" fields: on import each one is split into {field}_value
# (float) and {field}_unit (text). The mapped string is the unit recorded
# here for the field; "capacity" has an empty unit string.
NUMERIC_UNIT_FIELDS = {
    "length": "mm",
    "width": "mm",
    "height": "mm",
    "wheelbase": "mm",
    "weight": "t",
    "axle_load": "t",
    "load": "t",
    "tractive_start": "kN",
    "tractive_cont": "kN",
    "power_kw": "kW",
    "max_speed": "km/h",
    "capacity": "",
}
# Status enumeration: the target values that status text is normalized to.
STATUS_ENUM = [
    "现役",
    "封存",
    "半封存",
    "报废",
    "退役",
    "保存",
    "试验",
    "未知",
]
# Country-of-origin attribute enumeration.
COUNTRY_TYPE_ENUM = [
    "国产",
    "进口",
    "引进仿制",
    "中外合资",
    "未知",
]
# Sheet file name -> classification config.
# "grain" is either "model" (one row per model) or "unit" (one row per
# individual vehicle). An optional "col_override" maps a column index to a
# canonical field name.
# NOTE(review): the first key reads "蒸汽汽车" where the other entries use
# "蒸汽机车" -- possibly a typo, but it may mirror the real file name; confirm
# against the source files before changing it.
CATEGORY_CONFIG = {
    "中国蒸汽汽车统计表(建国前)": {
        "category": "蒸汽机车",
        "subcat": "建国前",
        "grain": "model",
    },
    "中国蒸汽机车型号表(建国后)": {
        "category": "蒸汽机车",
        "subcat": "建国后",
        "grain": "model",
    },
    "北京表(蒸汽机车)": {
        "category": "蒸汽机车",
        "subcat": "北京现存",
        "grain": "unit",
    },
    "全国普速客车型号统计表": {
        "category": "客车",
        "subcat": "普速客车",
        "grain": "model",
    },
    "全国电力机车型号表": {
        "category": "电力机车",
        "subcat": "",
        "grain": "model",
        "col_override": {0: "series", 1: "model_code"},
    },
    "全国内燃机车型号表": {
        "category": "内燃机车",
        "subcat": "",
        "grain": "model",
    },
    "CJ型动车组、早期动车组及和谐号统计表": {
        "category": "动车组",
        "subcat": "CJ/早期/和谐号",
        "grain": "model",
    },
    "全国复兴号统计表": {
        "category": "动车组",
        "subcat": "复兴号",
        "grain": "model",
    },
    "中国铁路货车统计表": {
        "category": "货车",
        "subcat": "",
        "grain": "model",
    },
    "中国铁路普速检测车统计表": {
        "category": "检测车",
        "subcat": "普速检测",
        "grain": "unit",
    },
    "中国高速综合检测列车统计表": {
        "category": "检测车",
        "subcat": "高速综合检测",
        "grain": "unit",
    },
    "中国旅游列车统计表": {
        "category": "旅游列车",
        "subcat": "",
        "grain": "model",
    },
}
# Key tokens (in normalized form) used to recognize a "header row".
HEADER_TOKENS = {
    "型号",
    "车号",
    "车型代码",
    "车型",
    "旅游列车名称",
    "小表编号",
    "型号(代号)",
    "系列",
    "车型全称",
}
# Reverse index: header synonym -> canonical field name.
# Keys are the synonym strings exactly as listed in FIELD_SYNONYMS; when the
# same synonym appears under more than one field, the later field wins.
_REVERSE = {
    _s: _canon
    for _canon, _syns in FIELD_SYNONYMS.items()
    for _s in _syns
}
def normalize_header(h: str) -> str:
    """Normalize a raw sheet header so it can be matched against synonyms.

    All whitespace is removed (spaces, tabs, line breaks, the full-width
    space U+3000, ...) and full-width parentheses are converted to their
    ASCII counterparts.

    Args:
        h: The raw header cell. ``None`` is accepted, and a non-string cell
            value is converted with ``str()`` first.

    Returns:
        The normalized header text, or ``""`` when ``h`` is ``None``.
    """
    if h is None:
        return ""
    # str.split() without a separator splits on every run of Unicode
    # whitespace, so joining the pieces drops all of it in one pass.
    compact = "".join(str(h).split())
    # The full-width parentheses are written as escapes on purpose: the
    # literal characters are easy to lose or confuse with the ASCII ones, and
    # an empty search string would insert "(" between every character.
    return compact.replace("\uff08", "(").replace("\uff09", ")")
def map_header(h: str):
    """Translate a raw sheet header into its canonical field name.

    The header is first passed through normalize_header() and the result is
    looked up in the _REVERSE index. Returns None when there is no entry.
    """
    key = normalize_header(h)
    return _REVERSE.get(key)