Files
my_git_project/CardioAI/module2_predictor/train_and_save.py
zhenchuan199 b5b299a682 Add CardioAI project with usage instructions
- Add comprehensive README.md with setup and usage instructions
- Add .env.example template (sanitized, no real API keys)
- Add root-level .gitignore to exclude .env and generated files
- Add all project modules (dashboard, predictor)
- Add data file and requirements.txt
2026-04-02 20:11:24 +08:00

200 lines
6.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
CardioAI 模块2: 模型训练脚本
心血管疾病风险预测模型 - 训练与保存
"""
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from xgboost import XGBClassifier
# ==================== Constants ====================
# NOTE(review): hard-coded absolute Windows path — this breaks on any other
# machine/checkout; consider deriving from __file__ (left as-is in review).
CODE_ROOT = Path(r"F:\My_Git_Project\CardioAI")
DATA_PATH = CODE_ROOT / "data" / "心血管疾病.xlsx"
MODEL_PATH = CODE_ROOT / "module2_predictor" / "cardio_predictor_model.pkl"
# Feature column groups.
# NOTE(review): neither constant is referenced in this file — build_pipeline()
# redefines its own lists and uses the derived 'age_years' instead of the raw
# 'age' listed here; confirm which set is canonical before reusing these.
CONTINUOUS_FEATURES = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
CATEGORICAL_FEATURES = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
def load_and_clean_data(file_path: Path) -> pd.DataFrame:
    """Load the cardiovascular dataset from Excel and apply cleaning rules.

    The function is split into I/O (here) and a pure, testable cleaning
    helper (``_clean_cardio_data``) so the transformation logic can be
    exercised without a file on disk.

    Args:
        file_path: Path to the source ``.xlsx`` data file.

    Returns:
        Cleaned DataFrame with added ``age_years`` and ``bmi`` columns.
    """
    print(f"📂 正在加载数据: {file_path}")
    df = pd.read_excel(file_path, engine='openpyxl')
    print(f"✅ 数据加载成功,共 {len(df)} 条记录")
    return _clean_cardio_data(df)


def _clean_cardio_data(df: pd.DataFrame) -> pd.DataFrame:
    """Feature engineering and outlier removal (pure helper, no I/O)."""
    df = df.copy()
    # 'age' is stored in days in this dataset -> convert to whole years.
    df['age_years'] = (df['age'] / 365).round().astype(int)
    # BMI from height (cm) and weight (kg).
    df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)
    # Drop physically impossible readings: diastolic >= systolic.
    initial_count = len(df)
    df = df[df['ap_lo'] < df['ap_hi']]
    print(f"🗑️ 删除舒张压>=收缩压的记录: {initial_count - len(df)}")
    # Systolic pressure must lie in [90, 250].
    initial_count = len(df)
    df = df[(df['ap_hi'] >= 90) & (df['ap_hi'] <= 250)]
    removed_hy = initial_count - len(df)
    # Diastolic pressure must lie in [60, 150].
    initial_count = len(df)
    df = df[(df['ap_lo'] >= 60) & (df['ap_lo'] <= 150)]
    removed_lo = initial_count - len(df)
    print(f"🗑️ 删除血压异常值: 收缩压 {removed_hy} 条, 舒张压 {removed_lo}")
    print(f"✅ 数据清洗完成,剩余 {len(df)} 条记录")
    return df
def prepare_features(df: pd.DataFrame) -> tuple:
    """Split the cleaned frame into model inputs and target.

    Drops 'id' and the raw day-based 'age' column, keeping the derived
    'age_years' instead.

    Returns:
        (X, y, feature_columns): feature frame, 'cardio' target series,
        and the ordered list of feature names.
    """
    feature_columns = [
        'age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
        'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'bmi',
    ]
    X, y = df[feature_columns].copy(), df['cardio'].copy()
    print(f"📊 特征数量: {len(feature_columns)}")
    print(f"📊 特征列: {feature_columns}")
    return X, y, feature_columns
def build_pipeline() -> Pipeline:
    """Build the preprocessing + XGBoost classification pipeline.

    Returns:
        sklearn Pipeline: a ColumnTransformer (standardize continuous
        features, one-hot encode categorical ones) followed by an
        XGBClassifier.
    """
    print("🔧 正在构建Pipeline...")
    # Continuous features are standardized; note the derived 'age_years'
    # (not the raw 'age' column) is used here.
    continuous_cols = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
    # Categorical features are one-hot encoded.
    categorical_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), continuous_cols),
            # handle_unknown='ignore' keeps inference robust to categories
            # unseen during training.
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
        ],
        remainder='drop'  # discard any extra columns not listed above
    )
    # Full pipeline: preprocessing + XGBoost classifier.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            # 'use_label_encoder' was deprecated in XGBoost 1.6 and removed
            # in 2.x (passing it warns or raises TypeError), so it is dropped.
            eval_metric='logloss',
            n_jobs=-1
        ))
    ])
    print("✅ Pipeline构建完成")
    return pipeline
def train_and_evaluate(X: pd.DataFrame, y: pd.Series, pipeline: Pipeline):
    """Fit the pipeline on a stratified 80/20 split and print test metrics.

    Returns:
        The fitted pipeline.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("📈 开始模型训练...")
    print(banner)
    # Stratified split keeps the class ratio identical in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"📊 训练集大小: {len(X_train)}")
    print(f"📊 测试集大小: {len(X_test)}")
    print("🏋️ 正在训练XGBoost模型...")
    pipeline.fit(X_train, y_train)
    # Hard labels for accuracy/report, positive-class scores for ROC-AUC.
    predictions = pipeline.predict(X_test)
    positive_scores = pipeline.predict_proba(X_test)[:, 1]
    print("\n" + banner)
    print("📊 模型评估结果:")
    print(banner)
    print(f"✅ 准确率 (Accuracy): {accuracy_score(y_test, predictions):.4f}")
    print(f"✅ ROC-AUC 分数: {roc_auc_score(y_test, positive_scores):.4f}")
    print("\n📋 分类报告:")
    print(classification_report(y_test, predictions, target_names=['健康', '有风险']))
    return pipeline
def save_model(pipeline: Pipeline, model_path: Path):
    """Serialize the trained pipeline to disk and report the file size."""
    print(f"\n💾 正在保存模型到: {model_path}")
    joblib.dump(pipeline, model_path)
    print(f"✅ 模型保存成功!")
    # Sanity check: confirm the artifact exists and report its size in MB.
    size_mb = model_path.stat().st_size / (1024 * 1024)
    print(f"📦 模型文件大小: {size_mb:.2f} MB")
def main():
    """End-to-end workflow: load -> features -> pipeline -> train -> save."""
    header = "=" * 60
    print("\n" + header)
    print("❤️ CardioAI 模块2: 心血管疾病风险预测模型训练")
    print(header + "\n")
    # 1) Load and clean the raw dataset.
    df = load_and_clean_data(DATA_PATH)
    # 2) Select features / target.
    X, y, feature_columns = prepare_features(df)
    # 3) Build the preprocessing + classifier pipeline.
    pipeline = build_pipeline()
    # 4) Train and report evaluation metrics.
    trained_pipeline = train_and_evaluate(X, y, pipeline)
    # 5) Persist the fitted pipeline.
    save_model(trained_pipeline, MODEL_PATH)
    print("\n" + header)
    print("🎉 模型训练完成!")
    print(header)
    print(f"\n📌 模型使用说明:")
    print(f" 1. 启动Flask API: python app.py")
    print(f" 2. 访问 http://localhost:5001 查看预测界面")
    print(f" 3. 输入11个特征值进行预测")


if __name__ == "__main__":
    main()