Files
itcast_01/module2_predictor/train_and_save.py
Zane Xu 88021e4a4b 添加Module2心血管疾病预测模型和Flask API
Part A: 模型训练与保存
- train_and_save.py: 一次性脚本,训练XGBoost模型并保存完整Pipeline
- cardio_predictor_model.pkl: 包含预处理器和分类器的完整Pipeline

Part B: Flask API部署
- app.py: 提供/predict_cardio接口,接收11个特征值并返回预测结果
- 包含输入验证、数据处理和模型加载功能

Part C: 前端交互界面
- templates/index.html: 响应式HTML表单,集成JavaScript Fetch API
- 提供示例数据填充和实时预测结果显示

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 22:53:47 +08:00

260 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/opt/anaconda3/envs/cardioenv/bin/python
"""
CardioAI - cardiovascular disease prediction model training script.

One-off script that trains an XGBoost model and saves the Pipeline.
"""
import os
# Set the environment variable so that XGBoost can find the OpenMP library.
# This must stay above the xgboost import further down.
# NOTE(review): the dynamic loader may only read DYLD_* variables at process
# start, in which case this in-process assignment would have no effect on the
# import below - confirm on the target macOS setup.
os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/opt/libomp/lib:' + os.environ.get('DYLD_LIBRARY_PATH', '')
import pandas as pd
import numpy as np
import warnings
# Silences every warning for the whole process (no category/module filter).
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import joblib
# Relative path to the dataset; the loader below also tries several
# absolute variants of it.
DATA_PATH = "../data/心血管疾病.xlsx"
def _locate_data_file():
    """Return the first existing candidate path of the dataset, or None."""
    script_dir = os.path.dirname(__file__)
    file_name = "心血管疾病.xlsx"
    # Same candidates, in the same order, as before: relative to the current
    # working directory first, then relative to this script's location.
    candidates = [
        DATA_PATH,
        os.path.abspath(DATA_PATH),
        os.path.abspath(os.path.join(script_dir, DATA_PATH)),
        os.path.abspath(os.path.join(script_dir, "..", "data", file_name)),
        os.path.abspath(os.path.join(script_dir, "..", "..", "data", file_name)),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            print(f"找到数据文件: {candidate}")
            return candidate
    print(f"未找到数据文件,尝试过的路径: {candidates}")
    return None


def _engineer_features(df):
    """Add derived columns and drop rows with implausible blood pressure."""
    # 1. Feature engineering.
    # age is divided by 365.25 (days -> years) and rounded to whole years.
    df['age_years'] = (df['age'] / 365.25).round().astype(int)
    # BMI = weight / (height / 100) ** 2
    df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

    # 2. Outlier handling: keep rows where diastolic < systolic, with
    # systolic in [90, 250] and diastolic in [60, 150] (bounds inclusive).
    plausible = (
        (df['ap_lo'] < df['ap_hi'])
        & df['ap_hi'].between(90, 250)
        & df['ap_lo'].between(60, 150)
    )
    df = df[plausible].copy()

    # 3. Categorical encodings. cholesterol and gluc share the same
    # three-level coding; codes outside 1-3 map to NaN.
    level_names = {1: 'normal', 2: 'above_normal', 3: 'well_above_normal'}
    df['cholesterol_cat'] = df['cholesterol'].map(level_names)
    df['gluc_cat'] = df['gluc'].map(level_names)

    # BMI bands, evaluated column-wise instead of one Python call per row.
    # The first matching condition wins, so the thresholds act as upper
    # bounds; anything that matches none (>= 30, or NaN) is 'obese', exactly
    # as the former if/elif/else chain behaved.
    bmi = df['bmi']
    df['bmi_category'] = np.select(
        [bmi < 18.5, bmi < 25, bmi < 30],
        ['underweight', 'normal', 'overweight'],
        default='obese',
    )
    return df


def load_and_process_data():
    """
    Load the cardiovascular dataset and derive the modelling columns.

    The Excel file is searched for in several candidate locations. The
    loaded frame must provide the columns ``age``, ``height``, ``weight``,
    ``ap_hi``, ``ap_lo``, ``cholesterol`` and ``gluc``. The returned frame
    additionally contains ``age_years``, ``bmi``, ``cholesterol_cat``,
    ``gluc_cat`` and ``bmi_category`` and excludes rows whose blood-pressure
    readings fall outside the accepted ranges.

    Returns:
        The processed DataFrame, or an empty DataFrame when the file cannot
        be found or any loading/processing step fails (the error is printed,
        never raised), so callers can simply test ``df.empty``.
    """
    try:
        data_path = _locate_data_file()
        if data_path is None:
            return pd.DataFrame()
        return _engineer_features(pd.read_excel(data_path))
    except Exception as e:
        # Deliberate best-effort boundary: report and signal failure with an
        # empty frame instead of propagating.
        print(f"数据加载失败: {e}")
        return pd.DataFrame()
def prepare_features_target(df):
    """
    Split the processed frame into model inputs and the label column.

    The ``id``, raw ``age`` and ``cardio`` columns are removed from the
    inputs; ``cardio`` is returned separately as the target.

    Returns:
        A ``(features, target)`` tuple.
    """
    excluded_columns = ['id', 'age', 'cardio']
    inputs = df.drop(columns=excluded_columns)
    labels = df['cardio']
    return inputs, labels
def build_pipeline():
    """
    Build the unfitted preprocessing + XGBoost classification Pipeline.

    Numeric columns are standardised and categorical columns are one-hot
    encoded (binary categories collapse to a single column; categories not
    seen during fitting are ignored at prediction time). Only the columns
    listed below are handed to the transformers.

    Returns:
        A ``Pipeline`` with the steps ``'preprocessor'`` and ``'classifier'``.
    """
    # Column groups expected in the feature frame.
    numeric_features = ['height', 'weight', 'ap_hi', 'ap_lo', 'bmi', 'age_years']
    categorical_features = [
        'gender', 'cholesterol_cat', 'gluc_cat',
        'smoke', 'alco', 'active', 'bmi_category',
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            (
                'cat',
                OneHotEncoder(drop='if_binary', sparse_output=False, handle_unknown='ignore'),
                categorical_features,
            ),
        ]
    )

    # `use_label_encoder` is intentionally not passed: XGBoost deprecated it
    # and current releases ignore it, emitting a warning when it is supplied.
    classifier = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
    )

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on held-out data and print a short report.

    Accuracy, precision, recall and F1 are computed from ``model.predict``;
    ROC AUC is computed from the second column of ``model.predict_proba``.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall``,
        ``f1`` and ``roc_auc``.
    """
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    # All metrics are computed before anything is printed.
    results = {
        'accuracy': accuracy_score(y_test, predicted_labels),
        'precision': precision_score(y_test, predicted_labels),
        'recall': recall_score(y_test, predicted_labels),
        'f1': f1_score(y_test, predicted_labels),
        'roc_auc': roc_auc_score(y_test, positive_scores),
    }

    # Display label for each metric, in report order.
    report_rows = (
        ('准确率', 'accuracy'),
        ('精确率', 'precision'),
        ('召回率', 'recall'),
        ('F1分数', 'f1'),
        ('ROC AUC', 'roc_auc'),
    )
    print("模型评估结果:")
    for label, key in report_rows:
        print(f" {label}: {results[key]:.4f}")

    return results
def main():
    """
    Run the end-to-end training workflow.

    Steps: load and process the data, split features/target, make a
    stratified 80/20 train/test split, fit the pipeline, evaluate it, save
    it next to this script as ``cardio_predictor_model.pkl`` and reload the
    saved file for a one-row smoke prediction. Returns early (after printing
    a message) when no data could be loaded.
    """
    banner = "=" * 60
    print(banner)
    print("CardioAI - 心血管疾病预测模型训练")
    print(banner)

    # 1. Load and process the data.
    print("\n1. 加载和处理数据...")
    data = load_and_process_data()
    if data.empty:
        print("❌ 数据加载失败,请检查数据文件路径")
        return
    print(f" 处理后的数据形状: {data.shape}")
    print(f" 阳性样本比例: {data['cardio'].mean():.2%}")

    # 2. Separate features from the target.
    print("\n2. 准备特征和目标变量...")
    X, y = prepare_features_target(data)
    print(f" 特征数量: {X.shape[1]}")
    print(f" 样本数量: {X.shape[0]}")

    # 3. Stratified train/test split.
    print("\n3. 划分训练集和测试集...")
    split_parts = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    X_train, X_test, y_train, y_test = split_parts
    print(f" 训练集大小: {X_train.shape[0]}")
    print(f" 测试集大小: {X_test.shape[0]}")

    # 4. Build and fit the pipeline.
    print("\n4. 构建和训练Pipeline...")
    model = build_pipeline()
    print(" 开始训练模型...")
    model.fit(X_train, y_train)
    print(" 模型训练完成!")

    # 5. Evaluate on the held-out split (the report is printed by the helper).
    print("\n5. 评估模型性能...")
    evaluate_model(model, X_test, y_test)

    # 6. Persist the fitted pipeline next to this script.
    print("\n6. 保存模型...")
    model_path = os.path.join(os.path.dirname(__file__), "cardio_predictor_model.pkl")
    target_dir = os.path.dirname(model_path)
    # An empty directory name means "current directory"; nothing to create.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    joblib.dump(model, model_path)
    print(f" 模型已保存到: {model_path}")

    # 7. Summarise the feature groups.
    print("\n7. 特征信息:")
    for summary_line in (
        " 连续特征: height, weight, ap_hi, ap_lo, bmi, age_years",
        " 分类特征: gender, cholesterol_cat, gluc_cat, smoke, alco, active, bmi_category",
        " 总特征数: 13个原始特征 → 预处理后更多",
    ):
        print(summary_line)

    # 8. Reload the saved file and predict one row as a smoke test; a failure
    # here is reported but does not abort the script.
    print("\n8. 验证模型加载...")
    try:
        reloaded = joblib.load(model_path)
        sample_prediction = reloaded.predict(X_test.iloc[:1])
        print(f" 模型加载成功! 测试预测: {sample_prediction[0]}")
    except Exception as e:
        print(f" 模型加载失败: {e}")

    print("\n" + banner)
    print("✅ 模型训练和保存完成!")
    print(banner)
# Run the training workflow only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()