- Add comprehensive README.md with setup and usage instructions
- Add .env.example template (sanitized, no real API keys)
- Add root-level .gitignore to exclude .env and generated files
- Add all project modules (dashboard, predictor)
- Add data file and requirements.txt
200 lines
6.0 KiB
Python
200 lines
6.0 KiB
Python
"""
|
||
CardioAI 模块2: 模型训练脚本
|
||
心血管疾病风险预测模型 - 训练与保存
|
||
"""
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
import joblib
|
||
from pathlib import Path
|
||
from sklearn.pipeline import Pipeline
|
||
from sklearn.compose import ColumnTransformer
|
||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||
from sklearn.model_selection import train_test_split
|
||
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
|
||
from xgboost import XGBClassifier
|
||
|
||
# ==================== Constants ====================
# Absolute project root (Windows-specific path; change when relocating the repo).
CODE_ROOT = Path(r"F:\My_Git_Project\CardioAI")
# Source Excel dataset ("心血管疾病" = "cardiovascular disease").
DATA_PATH = CODE_ROOT / "data" / "心血管疾病.xlsx"
# Destination file for the serialized (joblib) preprocessing + model pipeline.
MODEL_PATH = CODE_ROOT / "module2_predictor" / "cardio_predictor_model.pkl"

# Feature column groups.
# NOTE(review): these module-level constants list the raw 'age' column, but the
# pipeline builder defines its own local lists using the engineered 'age_years'
# instead — these constants appear unused here; confirm before relying on them.
CONTINUOUS_FEATURES = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
CATEGORICAL_FEATURES = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
||
|
||
def load_and_clean_data(file_path: Path) -> pd.DataFrame:
    """Load the cardiovascular dataset from Excel and clean it.

    Steps:
      1. Derive ``age_years`` from the raw ``age`` column (stored in days).
      2. Derive ``bmi`` from weight (kg) and height (cm).
      3. Drop rows where diastolic pressure >= systolic pressure.
      4. Drop blood-pressure outliers: systolic outside [90, 250] or
         diastolic outside [60, 150].

    Args:
        file_path: Path to the ``.xlsx`` data file (read via openpyxl).

    Returns:
        Cleaned DataFrame with the derived columns and a fresh 0..n-1 index.
    """
    print(f"📂 正在加载数据: {file_path}")

    # read_excel already returns a fresh DataFrame; no defensive copy needed.
    df = pd.read_excel(file_path, engine='openpyxl')
    print(f"✅ 数据加载成功,共 {len(df)} 条记录")

    # Feature engineering: 'age' is given in days -> convert to whole years.
    df['age_years'] = (df['age'] / 365).round().astype(int)

    # Body-mass index; height is in centimetres, hence the /100.
    df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

    # Physiologically impossible records: diastolic must be below systolic.
    initial_count = len(df)
    df = df[df['ap_lo'] < df['ap_hi']]
    print(f"🗑️ 删除舒张压>=收缩压的记录: {initial_count - len(df)} 条")

    # Extreme outliers — systolic pressure must lie in [90, 250].
    initial_count = len(df)
    df = df[(df['ap_hi'] >= 90) & (df['ap_hi'] <= 250)]
    removed_hy = initial_count - len(df)

    # Diastolic pressure must lie in [60, 150].
    initial_count = len(df)
    df = df[(df['ap_lo'] >= 60) & (df['ap_lo'] <= 150)]
    removed_lo = initial_count - len(df)
    print(f"🗑️ 删除血压异常值: 收缩压 {removed_hy} 条, 舒张压 {removed_lo} 条")

    print(f"✅ 数据清洗完成,剩余 {len(df)} 条记录")
    # Row filtering leaves gaps in the index; reset so downstream positional
    # operations (e.g. train/test iloc) behave predictably.
    return df.reset_index(drop=True)
|
||
|
||
|
||
def prepare_features(df: pd.DataFrame) -> tuple:
    """Split the cleaned DataFrame into model inputs, labels and column names.

    The raw ``id`` and ``age`` columns are excluded; the engineered
    ``age_years`` column is used instead.

    Returns:
        (features DataFrame, 'cardio' label Series, list of feature columns)
    """
    # Columns fed to the model — note 'age_years' in place of raw 'age'.
    feature_columns = [
        'age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
        'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'bmi',
    ]

    features = df.loc[:, feature_columns].copy()
    labels = df['cardio'].copy()

    print(f"📊 特征数量: {len(feature_columns)}")
    print(f"📊 特征列: {feature_columns}")

    return features, labels, feature_columns
|
||
|
||
|
||
def build_pipeline() -> Pipeline:
    """Build the preprocessing + XGBoost classification pipeline.

    Continuous features are standardized (zero mean, unit variance) and
    categorical features are one-hot encoded; anything else is dropped.

    Returns:
        An unfitted sklearn ``Pipeline``: ColumnTransformer -> XGBClassifier.
    """
    print("🔧 正在构建Pipeline...")

    # Continuous feature columns (uses engineered 'age_years', not raw 'age').
    continuous_cols = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']

    # Categorical feature columns.
    categorical_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), continuous_cols),
            # handle_unknown='ignore' keeps predict-time inputs with unseen
            # category values from raising.
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
        ],
        remainder='drop'  # silently discard any column not listed above
    )

    # Full pipeline: preprocessing + XGBoost classifier.
    # NOTE: `use_label_encoder=False` was deprecated in XGBoost 1.3 and the
    # parameter was removed in XGBoost 2.0 — passing it breaks on modern
    # versions, so it has been dropped here.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            eval_metric='logloss',
            n_jobs=-1
        ))
    ])

    print("✅ Pipeline构建完成")
    return pipeline
|
||
|
||
|
||
def train_and_evaluate(X: pd.DataFrame, y: pd.Series, pipeline: Pipeline):
    """Fit the pipeline on a stratified 80/20 split and report metrics.

    Prints accuracy, ROC-AUC and a per-class classification report on the
    held-out split, then returns the fitted pipeline.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("📈 开始模型训练...")
    print(banner)

    # Stratified split preserves the class balance in both subsets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"📊 训练集大小: {len(X_train)}")
    print(f"📊 测试集大小: {len(X_test)}")

    print("🏋️ 正在训练XGBoost模型...")
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)
    # Probability of the positive class (column 1) feeds the ROC-AUC score.
    positive_proba = pipeline.predict_proba(X_test)[:, 1]

    print("\n" + banner)
    print("📊 模型评估结果:")
    print(banner)
    print(f"✅ 准确率 (Accuracy): {accuracy_score(y_test, predictions):.4f}")
    print(f"✅ ROC-AUC 分数: {roc_auc_score(y_test, positive_proba):.4f}")
    print("\n📋 分类报告:")
    print(classification_report(y_test, predictions, target_names=['健康', '有风险']))

    return pipeline
|
||
|
||
|
||
def save_model(pipeline: Pipeline, model_path: Path):
    """Serialize the fitted pipeline to *model_path* using joblib.

    Creates the destination directory if it does not exist, then prints
    the size of the written file as a sanity check.

    Args:
        pipeline: The fitted sklearn Pipeline to persist.
        model_path: Target ``.pkl`` file path.
    """
    print(f"\n💾 正在保存模型到: {model_path}")
    # joblib.dump does not create missing directories — ensure the parent
    # exists so a fresh checkout doesn't crash here.
    model_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipeline, model_path)
    print(f"✅ 模型保存成功!")

    # Report the on-disk size in MB as a quick verification of the artifact.
    file_size = model_path.stat().st_size / (1024 * 1024)
    print(f"📦 模型文件大小: {file_size:.2f} MB")
|
||
|
||
|
||
def main():
    """Run the full training workflow: load, featurize, train, save."""
    border = "=" * 60
    print("\n" + border)
    print("❤️ CardioAI 模块2: 心血管疾病风险预测模型训练")
    print(border + "\n")

    # Step 1: load and clean the raw Excel data.
    df = load_and_clean_data(DATA_PATH)

    # Step 2: split into model inputs and labels.
    X, y, feature_columns = prepare_features(df)

    # Step 3: assemble the preprocessing + classifier pipeline.
    pipeline = build_pipeline()

    # Step 4: fit on a train split and report held-out metrics.
    trained_pipeline = train_and_evaluate(X, y, pipeline)

    # Step 5: persist the fitted pipeline for the Flask prediction service.
    save_model(trained_pipeline, MODEL_PATH)

    print("\n" + border)
    print("🎉 模型训练完成!")
    print(border)
    print(f"\n📌 模型使用说明:")
    print(f"    1. 启动Flask API: python app.py")
    print(f"    2. 访问 http://localhost:5001 查看预测界面")
    print(f"    3. 输入11个特征值进行预测")


if __name__ == "__main__":
    main()
|