"""清洗规则:单元格清理、数值+单位拆分、年代解析、状态/国别规范化。 对应任务 T-1.2(清洗)。所有函数均为纯函数,便于单元测试(T-1.2 UT)。 """ import re # 占位/空值标记 _EMPTY_MARKERS = {"", "-", "——", "—", "/", "N/A", "n/a", "无", "未知"} def clean_cell(value) -> str: """清理单元格:去换行、压缩空白、trim。返回字符串。""" if value is None: return "" s = str(value).replace("\n", "").replace("\r", "") s = s.replace("\u3000", " ") s = re.sub(r"\s+", " ", s).strip() return s def is_empty(value) -> bool: """判断是否为空/占位值。""" return clean_cell(value) in _EMPTY_MARKERS # 匹配前导数字(含小数、千分位、负号),后跟可选单位 _NUM_RE = re.compile(r"-?\d+(?:\.\d+)?") def parse_value_unit(raw, default_unit: str = ""): """从形如 '400km/h(试验)'、'17t'、'2×92(100)'、'126.0'、'——' 中拆出 (数值: float|None, 单位: str, 原文: str)。 规则:取第一个出现的数值作为主数值;单位优先取 default_unit, 否则尝试从数值后紧跟的字母/单位片段提取;无数值则返回 (None, '', 原文)。 """ text = clean_cell(raw) if text in _EMPTY_MARKERS: return None, "", text m = _NUM_RE.search(text) if not m: return None, "", text value = float(m.group()) unit = default_unit if not unit: # 取数值之后紧邻的单位片段(字母 / 常见单位字符) rest = text[m.end():] um = re.match(r"\s*([A-Za-z%/·³²]+(?:/[A-Za-z]+)?)", rest) if um: unit = um.group(1) return value, unit, text def parse_year(raw): """解析年份:'1971 年'、'2006'、'2007-12-22 00:00:00' -> int|None。""" text = clean_cell(raw) if text in _EMPTY_MARKERS: return None m = re.search(r"(1[89]\d{2}|20\d{2})", text) return int(m.group(1)) if m else None def normalize_status(raw) -> str: """把自由文本状态规范到枚举之一。""" text = clean_cell(raw) if not text: return "未知" mapping = [ ("半封存", "半封存"), ("封存", "封存"), ("现役", "现役"), ("在役", "现役"), ("退役", "退役"), ("淘汰", "退役"), ("报废", "报废"), ("保存", "保存"), ("试验", "试验"), ("样车", "试验"), ] for kw, val in mapping: if kw in text: return val return "未知" # 国别属性的简单启发式推断(默认国产;后续众包修订可覆盖) _IMPORT_HINTS = ["进口", "苏联", "罗马尼亚", "法国", "日本", "德国", "美国", "捷克"] def infer_country_type(*texts) -> str: blob = " ".join(clean_cell(t) for t in texts) if "合资" in blob: return "中外合资" if "仿制" in blob or "引进" in blob: return "引进仿制" for h in _IMPORT_HINTS: if h in blob: return "进口" return "国产" def forward_fill(values): """对一列做向前填充(处理合并单元格被拆空的 系列 列)。""" out, last = [], "" for v in values: c = clean_cell(v) if c: last = c out.append(last) return out