Files
Train/app/etl/clean.py
T
2026-06-16 00:55:20 +08:00

111 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""清洗规则:单元格清理、数值+单位拆分、年代解析、状态/国别规范化。
对应任务 T-1.2(清洗)。所有函数均为纯函数,便于单元测试(T-1.2 UT)。
"""
import re
# Placeholder / empty-value markers: a cleaned cell equal to one of these
# strings is treated as carrying no data.
# NOTE(review): "" appears three times in this literal. Duplicates are no-ops
# in a set, but two of them may originally have been distinct invisible or
# look-alike characters that were lost in copying — confirm against the
# original file before deduplicating.
_EMPTY_MARKERS = {"", "-", "——", "", "/", "N/A", "n/a", "", "未知"}
def clean_cell(value) -> str:
    """Normalize one cell value into a tidy single-line string.

    ``None`` becomes ``""``. Any other value is converted with ``str()``.
    Line breaks (LF and CR) are deleted outright rather than replaced by a
    space, the ideographic space U+3000 becomes an ordinary space, every
    run of whitespace collapses to a single space, and both ends are
    trimmed.
    """
    if value is None:
        return ""
    # Single pass: drop LF/CR, map the full-width space to an ASCII space.
    text = str(value).translate({0x0A: None, 0x0D: None, 0x3000: " "})
    return re.sub(r"\s+", " ", text).strip()
def is_empty(value) -> bool:
    """Tell whether *value* is blank or a known placeholder marker.

    The value is normalized with ``clean_cell`` and the result is looked up
    in ``_EMPTY_MARKERS``.
    """
    normalized = clean_cell(value)
    return normalized in _EMPTY_MARKERS
# Leading number: optional minus sign, an integer part that is either
# comma-grouped thousands ("1,200") or a plain digit run, then an optional
# decimal fraction. The (?!\d) lookahead stops malformed groupings such as
# "1,2345" from being read as thousands; those fall back to the plain run.
_NUM_RE = re.compile(r"-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?")
# Unit fragment directly after the number: letters and common unit symbols.
_UNIT_RE = re.compile(r"\s*([A-Za-z%/·³²]+(?:/[A-Za-z]+)?)")


def parse_value_unit(raw, default_unit: str = ""):
    """Split a cell such as '400km/h(试验)', '17t', '2×92(100)' or '126.0'.

    Args:
        raw: Cell value of any type; it is normalized with ``clean_cell``.
        default_unit: Unit to report when non-empty. When it is empty the
            unit is taken from the text immediately following the number.

    Returns:
        A ``(value, unit, text)`` tuple where ``value`` is a ``float`` or
        ``None``, ``unit`` is a ``str`` (possibly empty) and ``text`` is the
        cleaned original text. Placeholder cells and cells without any
        number yield ``(None, "", text)``.

    The first number found in the text is the main value. Comma thousands
    separators are accepted, so '1,200 kW' yields ``1200.0``.
    """
    text = clean_cell(raw)
    if text in _EMPTY_MARKERS:
        return None, "", text
    m = _NUM_RE.search(text)
    if m is None:
        return None, "", text
    # float() rejects grouping commas, so strip them before converting.
    value = float(m.group().replace(",", ""))
    unit = default_unit
    if not unit:
        um = _UNIT_RE.match(text, m.end())
        # A fragment made only of "/" (the separator in "100/200") is not a
        # unit; keep real units such as "km/h" or "/h".
        if um and um.group(1).strip("/"):
            unit = um.group(1)
    return value, unit, text
def parse_year(raw):
    """Extract a four-digit year (1800-2099) from a cell value.

    Handles inputs such as '1971 年', '2006' and '2007-12-22 00:00:00'.

    Args:
        raw: Cell value of any type; it is normalized with ``clean_cell``.

    Returns:
        The first matching year as an ``int``, or ``None`` when the cell is
        a placeholder or contains no year.
    """
    text = clean_cell(raw)
    if text in _EMPTY_MARKERS:
        return None
    # (?<!\d) accepts a year only at the start of a digit run, so the tail
    # of a longer number ("12006") is not mistaken for a year. Compact dates
    # like "20071222" still yield 2007 because no trailing boundary is
    # required.
    m = re.search(r"(?<!\d)(1[89]\d{2}|20\d{2})", text)
    return int(m.group(1)) if m else None
def normalize_status(raw) -> str:
    """Map free-text status onto one of the canonical status labels.

    Keywords are tried in the listed order and the first one contained in
    the cleaned text decides the result, which is why "半封存" is listed
    before its substring "封存". Blank input, or text containing none of
    the keywords, yields "未知".
    """
    cleaned = clean_cell(raw)
    if cleaned == "":
        return "未知"
    keyword_to_status = (
        ("半封存", "半封存"),
        ("封存", "封存"),
        ("现役", "现役"),
        ("在役", "现役"),
        ("退役", "退役"),
        ("淘汰", "退役"),
        ("报废", "报废"),
        ("保存", "保存"),
        ("试验", "试验"),
        ("样车", "试验"),
    )
    return next(
        (status for keyword, status in keyword_to_status if keyword in cleaned),
        "未知",
    )
# 国别属性的简单启发式推断(默认国产;后续众包修订可覆盖)
_IMPORT_HINTS = ["进口", "苏联", "罗马尼亚", "法国", "日本", "德国", "美国", "捷克"]
def infer_country_type(*texts) -> str:
    """Guess the country-of-origin category from any number of text fragments.

    Every fragment is normalized with ``clean_cell`` and the results are
    joined with spaces. Checks run in priority order:

    1. contains "合资" -> "中外合资"
    2. contains "仿制" or "引进" -> "引进仿制"
    3. contains any entry of ``_IMPORT_HINTS`` -> "进口"
    4. otherwise -> "国产"
    """
    blob = " ".join(map(clean_cell, texts))
    if "合资" in blob:
        return "中外合资"
    if any(marker in blob for marker in ("仿制", "引进")):
        return "引进仿制"
    if any(hint in blob for hint in _IMPORT_HINTS):
        return "进口"
    return "国产"
def forward_fill(values):
    """Forward-fill a column whose merged cells were split into blanks.

    Each value is normalized with ``clean_cell``. A non-empty result becomes
    the new carried value; an empty one is replaced by the most recent
    non-empty value, or by ``""`` if none has been seen yet. Only the empty
    string counts as blank here, so placeholder markers such as "-" are
    carried like any other value.

    Returns a new list with one cleaned string per input item.
    """
    filled = []
    carried = ""
    for raw in values:
        # Keep the previous value whenever the cleaned cell is empty.
        carried = clean_cell(raw) or carried
        filled.append(carried)
    return filled