Initial commit: InternalAuditInterprise

This commit is contained in:
freedakgmail
2026-06-16 00:38:57 +08:00
commit 7b1e2b10a8
57 changed files with 4622 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
"""审计数据中台模块:本体/知识图谱、双时态、时序、数据版本。"""
+83
View File
@@ -0,0 +1,83 @@
"""双时态事实仓储:写入与"按历史时点回放"查询。
对应需求 R3 / ADR-0002
- 业务有效期 valid_from/valid_to(应用时间)
- 系统记录期 system_from/system_to(事务时间)
回放 = 给定 (as_of_valid, as_of_system) 在两条时间线上各取"包含该时点"的记录。
"""
from __future__ import annotations
import datetime as dt
import uuid
from sqlalchemy import or_
from sqlalchemy.orm import Session
from app.datahub.models import BitemporalFact
def record_fact(
session: Session,
entity_id: uuid.UUID,
attr_name: str,
attr_value: dict,
valid_from: dt.datetime,
valid_to: dt.datetime | None = None,
data_version_id: uuid.UUID | None = None,
) -> BitemporalFact:
"""记录一条双时态事实(system_from 自动取当前事务时间)。"""
fact = BitemporalFact(
entity_id=entity_id,
attr_name=attr_name,
attr_value=attr_value,
valid_from=valid_from,
valid_to=valid_to,
data_version_id=data_version_id,
)
session.add(fact)
session.flush()
return fact
def as_of(
session: Session,
entity_id: uuid.UUID,
attr_name: str,
as_of_valid: dt.datetime,
as_of_system: dt.datetime | None = None,
) -> BitemporalFact | None:
"""回放:返回在给定业务时点且按给定系统时点可见的事实。
- 业务时间线:valid_from <= as_of_valid < valid_to(或为空表示至今)
- 系统时间线:system_from <= as_of_system < system_to(或为空表示当前可见)
"""
as_of_system = as_of_system or dt.datetime.now(dt.UTC)
q = (
session.query(BitemporalFact)
.filter(BitemporalFact.entity_id == entity_id)
.filter(BitemporalFact.attr_name == attr_name)
.filter(BitemporalFact.valid_from <= as_of_valid)
.filter(
or_(BitemporalFact.valid_to.is_(None), BitemporalFact.valid_to > as_of_valid)
)
.filter(BitemporalFact.system_from <= as_of_system)
.filter(
or_(
BitemporalFact.system_to.is_(None),
BitemporalFact.system_to > as_of_system,
)
)
.order_by(BitemporalFact.system_from.desc())
)
return q.first()
def close_fact(
session: Session, fact: BitemporalFact, system_to: dt.datetime | None = None
) -> None:
"""逻辑关闭一条事实的系统可见期(用于更正/失效,而非物理删除)。"""
fact.system_to = system_to or dt.datetime.now(dt.UTC)
session.add(fact)
session.flush()
+58
View File
@@ -0,0 +1,58 @@
"""数据中台 schema 初始化。
MVP 阶段以 SQLAlchemy metadata 建表(后续可迁移到 Alembic)。
扩展按可用性可选启用:
- btree_gist / vector:若可用则创建。
- timescaledb:若可用则把 metric_event 转为超表;不可用则保持普通表(带时间索引)。
"""
from __future__ import annotations
from sqlalchemy import text
from sqlalchemy.engine import Engine
from app.datahub import models # noqa: F401 确保模型注册到 metadata
from app.db import Base, get_engine
def _extension_available(engine: Engine, name: str) -> bool:
with engine.connect() as conn:
row = conn.execute(
text("SELECT 1 FROM pg_available_extensions WHERE name = :n"), {"n": name}
).first()
return row is not None
def init_extensions(engine: Engine) -> dict[str, bool]:
"""按可用性创建扩展,返回各扩展启用状态。"""
status: dict[str, bool] = {}
for ext in ("btree_gist", "vector", "timescaledb"):
available = _extension_available(engine, ext)
status[ext] = available
if available:
with engine.begin() as conn:
conn.execute(text(f"CREATE EXTENSION IF NOT EXISTS {ext}"))
return status
def create_schema(engine: Engine | None = None) -> dict[str, bool]:
"""创建数据中台全部表,并按需启用时序超表。返回扩展状态。"""
engine = engine or get_engine()
status = init_extensions(engine)
Base.metadata.create_all(engine)
# 若 TimescaleDB 可用,将时序事件表转为超表(幂等)
if status.get("timescaledb"):
with engine.begin() as conn:
conn.execute(
text(
"SELECT create_hypertable('metric_event', 'event_time', "
"if_not_exists => TRUE, migrate_data => TRUE)"
)
)
return status
if __name__ == "__main__":
st = create_schema()
print("数据中台 schema 初始化完成。扩展状态:", st)
+118
View File
@@ -0,0 +1,118 @@
"""知识图谱仓储:实体/关系写入与多跳穿透(递归 CTE)。
对应需求 R2:支撑隐性实控人、关联方网络、"马甲"供应商等穿透分析。
统一穿透查询服务(P1.2.5)在此之上封装对外 API,对上层屏蔽底层是关系表还是图库。
"""
from __future__ import annotations
import uuid
from sqlalchemy import text
from sqlalchemy.orm import Session
from app.datahub.models import Entity, EntityRelationship
from app.datahub.ontology import EntityType, RelationshipType, is_valid_relationship
class OntologyViolationError(ValueError):
"""关系不符合本体约束。"""
def upsert_entity(
session: Session,
entity_type: EntityType,
business_key: str,
display_name: str | None = None,
attributes: dict | None = None,
data_version_id: uuid.UUID | None = None,
) -> Entity:
"""按 (类型, 业务主键) 幂等写入实体(主数据对齐的归一锚点)。"""
existing = (
session.query(Entity)
.filter(Entity.entity_type == entity_type.value, Entity.business_key == business_key)
.one_or_none()
)
if existing is not None:
if display_name is not None:
existing.display_name = display_name
if attributes:
existing.attributes = {**(existing.attributes or {}), **attributes}
return existing
entity = Entity(
entity_type=entity_type.value,
business_key=business_key,
display_name=display_name,
attributes=attributes or {},
data_version_id=data_version_id,
)
session.add(entity)
session.flush()
return entity
def add_relationship(
session: Session,
rel_type: RelationshipType,
source: Entity,
target: Entity,
attributes: dict | None = None,
data_version_id: uuid.UUID | None = None,
) -> EntityRelationship:
"""新增一条关系边,写入前校验本体约束。"""
src_type = EntityType(source.entity_type)
tgt_type = EntityType(target.entity_type)
if not is_valid_relationship(rel_type, src_type, tgt_type):
raise OntologyViolationError(
f"关系 {rel_type.value} 不允许从 {src_type.value} 指向 {tgt_type.value}"
)
rel = EntityRelationship(
rel_type=rel_type.value,
source_id=source.id,
target_id=target.id,
attributes=attributes or {},
data_version_id=data_version_id,
)
session.add(rel)
session.flush()
return rel
# 多跳穿透:以无向方式遍历关系边,返回与起点在 max_depth 跳内连通的实体集合。
# 用于"疑似同一实控人/关联方网络"识别。
_TRAVERSE_SQL = text(
"""
WITH RECURSIVE reachable(entity_id, depth, path) AS (
SELECT :start_id, 0, ARRAY[:start_id]
UNION ALL
SELECT
CASE WHEN r.source_id = rc.entity_id THEN r.target_id ELSE r.source_id END,
rc.depth + 1,
rc.path || CASE WHEN r.source_id = rc.entity_id THEN r.target_id ELSE r.source_id END
FROM reachable rc
JOIN entity_relationship r
ON (r.source_id = rc.entity_id OR r.target_id = rc.entity_id)
WHERE rc.depth < :max_depth
AND NOT (
CASE WHEN r.source_id = rc.entity_id THEN r.target_id ELSE r.source_id END
= ANY(rc.path)
)
)
SELECT DISTINCT entity_id, MIN(depth) AS depth
FROM reachable
WHERE entity_id <> :start_id
GROUP BY entity_id
ORDER BY depth;
"""
)
def find_related_entities(
session: Session, start_id: uuid.UUID, max_depth: int = 3
) -> list[tuple[uuid.UUID, int]]:
"""返回与起点实体在 max_depth 跳内连通的实体 (id, 最短跳数) 列表。"""
rows = session.execute(
_TRAVERSE_SQL, {"start_id": start_id, "max_depth": max_depth}
).all()
return [(r[0], r[1]) for r in rows]
+157
View File
@@ -0,0 +1,157 @@
"""审计数据中台 ORM 模型。
涵盖:数据版本、本体实体、知识图谱关系边、双时态属性、时序事件。
对应需求 R2 / R3,建模决策见 ADR-0002。
"""
from __future__ import annotations
import datetime as dt
import uuid
from sqlalchemy import (
DateTime,
Float,
ForeignKey,
Index,
Integer,
String,
Text,
UniqueConstraint,
)
from sqlalchemy.dialects.postgresql import JSONB, UUID
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db import Base
def _uuid() -> uuid.UUID:
return uuid.uuid4()
class DataVersion(Base):
"""数据版本登记:每批接入数据的来源/批次/时间/行数,支撑结论可追溯(R3)。"""
__tablename__ = "data_version"
id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=_uuid)
source_system: Mapped[str] = mapped_column(String(64), nullable=False)
batch_label: Mapped[str] = mapped_column(String(128), nullable=False)
row_count: Mapped[int] = mapped_column(Integer, default=0)
ingested_at: Mapped[dt.datetime] = mapped_column(
DateTime(timezone=True), default=lambda: dt.datetime.now(dt.UTC)
)
note: Mapped[str | None] = mapped_column(Text, nullable=True)
class Entity(Base):
"""本体实体节点(知识图谱顶点)。
business_key 是源系统中的业务主键,用于主数据对齐(同一实体跨系统归一)。
"""
__tablename__ = "entity"
__table_args__ = (
UniqueConstraint("entity_type", "business_key", name="uq_entity_type_bizkey"),
Index("ix_entity_type", "entity_type"),
)
id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=_uuid)
entity_type: Mapped[str] = mapped_column(String(32), nullable=False)
business_key: Mapped[str] = mapped_column(String(128), nullable=False)
display_name: Mapped[str | None] = mapped_column(String(256), nullable=True)
attributes: Mapped[dict] = mapped_column(JSONB, default=dict)
# 主数据对齐:被归并到的"金主"实体(同一实控人/同一主体)。NULL 表示自身即主实体。
canonical_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True), ForeignKey("entity.id"), nullable=True
)
data_version_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True), ForeignKey("data_version.id"), nullable=True
)
class EntityRelationship(Base):
"""知识图谱关系边(有向)。多跳穿透用递归 CTE 遍历本表。"""
__tablename__ = "entity_relationship"
__table_args__ = (
Index("ix_rel_source", "source_id"),
Index("ix_rel_target", "target_id"),
Index("ix_rel_type", "rel_type"),
)
id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=_uuid)
rel_type: Mapped[str] = mapped_column(String(32), nullable=False)
source_id: Mapped[uuid.UUID] = mapped_column(
UUID(as_uuid=True), ForeignKey("entity.id"), nullable=False
)
target_id: Mapped[uuid.UUID] = mapped_column(
UUID(as_uuid=True), ForeignKey("entity.id"), nullable=False
)
attributes: Mapped[dict] = mapped_column(JSONB, default=dict)
data_version_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True), ForeignKey("data_version.id"), nullable=True
)
source: Mapped[Entity] = relationship(foreign_keys=[source_id])
target: Mapped[Entity] = relationship(foreign_keys=[target_id])
class BitemporalFact(Base):
"""双时态事实:实体的某个属性/状态随时间变化的记录。
- 业务有效期 valid_from/valid_to(应用时间)
- 系统记录期 system_from/system_to(事务时间)
回放历史 = 给定 (as_of_valid, as_of_system) 过滤两条时间线(见 repository)。
"""
__tablename__ = "bitemporal_fact"
__table_args__ = (
Index("ix_btf_entity_attr", "entity_id", "attr_name"),
)
id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=_uuid)
entity_id: Mapped[uuid.UUID] = mapped_column(
UUID(as_uuid=True), ForeignKey("entity.id"), nullable=False
)
attr_name: Mapped[str] = mapped_column(String(64), nullable=False)
attr_value: Mapped[dict] = mapped_column(JSONB, default=dict)
valid_from: Mapped[dt.datetime] = mapped_column(DateTime(timezone=True), nullable=False)
valid_to: Mapped[dt.datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
system_from: Mapped[dt.datetime] = mapped_column(
DateTime(timezone=True), default=lambda: dt.datetime.now(dt.UTC)
)
system_to: Mapped[dt.datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
data_version_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True), ForeignKey("data_version.id"), nullable=True
)
class MetricEvent(Base):
"""时序事件:行为/指标类数据(用户生命周期、回款、话务、佣金、资源使用)。
部署后通过 TimescaleDB create_hypertable('metric_event', 'event_time') 转为超表。
"""
__tablename__ = "metric_event"
__table_args__ = (
Index("ix_metric_subject_time", "subject_type", "subject_key", "event_time"),
Index("ix_metric_name_time", "metric_name", "event_time"),
)
id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=_uuid)
event_time: Mapped[dt.datetime] = mapped_column(DateTime(timezone=True), nullable=False)
subject_type: Mapped[str] = mapped_column(String(32), nullable=False) # 如 msisdn/channel
subject_key: Mapped[str] = mapped_column(String(128), nullable=False)
metric_name: Mapped[str] = mapped_column(String(64), nullable=False) # 如 traffic_mb/commission
metric_value: Mapped[float] = mapped_column(Float, default=0.0)
attributes: Mapped[dict] = mapped_column(JSONB, default=dict)
data_version_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True), ForeignKey("data_version.id"), nullable=True
)
+86
View File
@@ -0,0 +1,86 @@
"""审计本体(Ontology)定义。
定义电信内审域的核心实体类型与关系类型,作为知识图谱与主数据对齐的基准。
对应需求 R2。
"""
from __future__ import annotations
from enum import Enum
class EntityType(str, Enum):
"""审计本体核心实体类型。"""
CUSTOMER = "customer" # 客户(含政企)
CONTRACT = "contract" # 合同
MSISDN = "msisdn" # 号码
IMEI = "imei" # 终端设备
ACCOUNT = "account" # 账户(付款/收款)
WORK_ORDER = "work_order" # 工单
SUPPLIER = "supplier" # 供应商
SETTLEMENT = "settlement" # 结算单
EMPLOYEE = "employee" # 员工
CHANNEL = "channel" # 渠道/代理商
LEGAL_PERSON = "legal_person" # 法人/自然人
ADDRESS = "address" # 地址
class RelationshipType(str, Enum):
"""审计本体核心关系类型(有向)。"""
SIGNED = "signed" # 客户 —签约→ 合同
PAID_BY = "paid_by" # 合同 —回款账户→ 账户
OWNS_ACCOUNT = "owns_account" # 客户/供应商 —拥有→ 账户
REGISTERED_AT = "registered_at" # 客户/供应商 —注册地址→ 地址
LEGAL_REP_OF = "legal_rep_of" # 法人 —法定代表人→ 客户/供应商
RELATED_TO = "related_to" # 法人 —亲属/关联→ 法人
HOLDS_MSISDN = "holds_msisdn" # 客户 —持有→ 号码
BOUND_DEVICE = "bound_device" # 号码 —绑定→ IMEI
BELONGS_TO_CHANNEL = "belongs_to_channel" # 号码/合同 —归属→ 渠道
SUPPLIES = "supplies" # 供应商 —供货→ 合同/工单
HANDLED_BY = "handled_by" # 工单 —处理人→ 员工
SETTLES = "settles" # 结算单 —结算→ 合同
# 关系的合法 (源实体类型, 目标实体类型) 约束,用于校验图谱写入
RELATIONSHIP_DOMAIN: dict[RelationshipType, tuple[set[EntityType], set[EntityType]]] = {
RelationshipType.SIGNED: ({EntityType.CUSTOMER}, {EntityType.CONTRACT}),
RelationshipType.PAID_BY: ({EntityType.CONTRACT}, {EntityType.ACCOUNT}),
RelationshipType.OWNS_ACCOUNT: (
{EntityType.CUSTOMER, EntityType.SUPPLIER, EntityType.LEGAL_PERSON},
{EntityType.ACCOUNT},
),
RelationshipType.REGISTERED_AT: (
{EntityType.CUSTOMER, EntityType.SUPPLIER},
{EntityType.ADDRESS},
),
RelationshipType.LEGAL_REP_OF: (
{EntityType.LEGAL_PERSON},
{EntityType.CUSTOMER, EntityType.SUPPLIER},
),
RelationshipType.RELATED_TO: ({EntityType.LEGAL_PERSON}, {EntityType.LEGAL_PERSON}),
RelationshipType.HOLDS_MSISDN: ({EntityType.CUSTOMER}, {EntityType.MSISDN}),
RelationshipType.BOUND_DEVICE: ({EntityType.MSISDN}, {EntityType.IMEI}),
RelationshipType.BELONGS_TO_CHANNEL: (
{EntityType.MSISDN, EntityType.CONTRACT},
{EntityType.CHANNEL},
),
RelationshipType.SUPPLIES: (
{EntityType.SUPPLIER},
{EntityType.CONTRACT, EntityType.WORK_ORDER},
),
RelationshipType.HANDLED_BY: ({EntityType.WORK_ORDER}, {EntityType.EMPLOYEE}),
RelationshipType.SETTLES: ({EntityType.SETTLEMENT}, {EntityType.CONTRACT}),
}
def is_valid_relationship(
rel: RelationshipType, source: EntityType, target: EntityType
) -> bool:
"""校验一条关系的源/目标实体类型是否符合本体约束。"""
domain = RELATIONSHIP_DOMAIN.get(rel)
if domain is None:
return False
sources, targets = domain
return source in sources and target in targets