init: AI培训与智能巡检系统

This commit is contained in:
selfrelease
2026-06-16 00:55:20 +08:00
commit c55598494b
201 changed files with 53131 additions and 0 deletions
+110
View File
@@ -0,0 +1,110 @@
"""清洗规则:单元格清理、数值+单位拆分、年代解析、状态/国别规范化。
对应任务 T-1.2(清洗)。所有函数均为纯函数,便于单元测试(T-1.2 UT)。
"""
import re
# Placeholder / empty-value markers: cleaned cell text is compared against this set.
# NOTE(review): the empty string "" appears three times in this literal, which is
# redundant in a set — possibly other placeholder characters were lost here; confirm
# against the intended marker list.
_EMPTY_MARKERS = {"", "-", "——", "", "/", "N/A", "n/a", "", "未知"}
def clean_cell(value) -> str:
    """Normalize a raw cell value into a tidy single-line string.

    ``None`` becomes ``""``. Any other value is converted with ``str()``;
    line feeds and carriage returns are deleted outright (not replaced by a
    space), ideographic spaces (U+3000) become ordinary spaces, every run of
    whitespace collapses to one space, and the result is stripped.
    """
    if value is None:
        return ""
    # Single pass: drop CR/LF and map the full-width space to an ASCII space.
    table = {ord("\n"): None, ord("\r"): None, ord("\u3000"): " "}
    flattened = str(value).translate(table)
    collapsed = re.sub(r"\s+", " ", flattened)
    return collapsed.strip()
def is_empty(value) -> bool:
    """Report whether a cell is blank or holds a placeholder marker.

    The value is normalized with ``clean_cell`` and the result is looked up
    in ``_EMPTY_MARKERS``.
    """
    normalized = clean_cell(value)
    return normalized in _EMPTY_MARKERS
# Matches the first number in a cell: an optional minus sign, then either a
# comma-grouped integer part ("1,435") or a plain digit run, then an optional
# decimal fraction. The grouped alternative is tried first and must not be
# followed by another digit, so malformed groupings such as "12,3456" fall
# back to the plain digit run ("12").
_NUM_RE = re.compile(r"-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?")


def parse_value_unit(raw, default_unit: str = ""):
    """Split a cell such as '400km/h(试验)', '17t', '2×92(100)', '126.0' or '——'.

    Returns a 3-tuple ``(value, unit, text)``:

    * ``value`` -- the first number found in the cleaned text as a ``float``
      (thousands separators are accepted, e.g. '1,435mm' -> 1435.0), or
      ``None`` when the cell is a placeholder or contains no number.
    * ``unit`` -- ``default_unit`` when it is non-empty; otherwise the run of
      letters / unit characters immediately following the number (optionally
      after whitespace); ``""`` whenever ``value`` is ``None``.
    * ``text`` -- the cleaned original text.
    """
    text = clean_cell(raw)
    if text in _EMPTY_MARKERS:
        return None, "", text

    m = _NUM_RE.search(text)
    if m is None:
        return None, "", text

    # Commas can only be thousands separators here; float() rejects them.
    value = float(m.group().replace(",", ""))

    if default_unit:
        return value, default_unit, text

    # No unit supplied by the caller: take the unit fragment directly after
    # the number (letters and a few common unit symbols).
    um = re.match(r"\s*([A-Za-z%/·³²]+(?:/[A-Za-z]+)?)", text[m.end():])
    unit = um.group(1) if um else default_unit
    return value, unit, text
def parse_year(raw):
    """Extract a four-digit year (1800-2099) from a cell, or return ``None``.

    Accepts inputs such as '1971 年', '2006', '2007-12-22 00:00:00' and
    compact dates like '20071222'. Placeholder cells and text without a
    recognizable year yield ``None``.

    The year must start a run of digits and be followed by a non-digit, the
    end of the text, or a valid two-digit month. This keeps digits buried in
    a longer number (e.g. the spreadsheet serial '41985') from being
    misread as a year.
    """
    text = clean_cell(raw)
    if text in _EMPTY_MARKERS:
        return None
    m = re.search(
        r"(?<!\d)(1[89]\d{2}|20\d{2})(?=\D|\Z|0[1-9]|1[0-2])",
        text,
    )
    return int(m.group(1)) if m else None
def normalize_status(raw) -> str:
    """Map free-text status onto one of the canonical status labels.

    The cleaned text is checked against an ordered list of keywords; the
    first keyword (in rule order, not text order) contained in the text
    decides the label. Empty text or text with no known keyword yields
    "未知".
    """
    text = clean_cell(raw)
    if not text:
        return "未知"
    # Order matters: "半封存" must be tested before its substring "封存".
    rules = (
        ("半封存", "半封存"),
        ("封存", "封存"),
        ("现役", "现役"),
        ("在役", "现役"),
        ("退役", "退役"),
        ("淘汰", "退役"),
        ("报废", "报废"),
        ("保存", "保存"),
        ("试验", "试验"),
        ("样车", "试验"),
    )
    return next((label for keyword, label in rules if keyword in text), "未知")
# Simple heuristic hints for inferring the country-of-origin attribute (the default is
# domestic; later crowd-sourced revisions may override it).
_IMPORT_HINTS = ["进口", "苏联", "罗马尼亚", "法国", "日本", "德国", "美国", "捷克"]
def infer_country_type(*texts) -> str:
    """Guess the country-of-origin category from any number of text fields.

    All arguments are cleaned with ``clean_cell`` and joined with spaces,
    then checked in priority order:

    1. contains "合资"              -> "中外合资"
    2. contains "仿制" or "引进"    -> "引进仿制"
    3. contains any ``_IMPORT_HINTS`` entry -> "进口"
    4. otherwise                    -> "国产"
    """
    blob = " ".join(map(clean_cell, texts))
    if "合资" in blob:
        return "中外合资"
    if any(marker in blob for marker in ("仿制", "引进")):
        return "引进仿制"
    if any(hint in blob for hint in _IMPORT_HINTS):
        return "进口"
    return "国产"
def forward_fill(values):
    """Forward-fill a column: each blank cell takes the last non-blank value.

    Every item is cleaned with ``clean_cell``; cells that clean to an empty
    string are replaced by the most recent non-empty cleaned value (or ""
    if none has been seen yet). Returns a new list of the same length.
    Intended for columns left blank where merged cells were split apart.
    """
    filled = []
    current = ""
    for raw in values:
        # Keep the previous value whenever this cell cleans to "".
        current = clean_cell(raw) or current
        filled.append(current)
    return filled