Part A: 模型训练与保存 - train_and_save.py: 一次性脚本,训练XGBoost模型并保存完整Pipeline - cardio_predictor_model.pkl: 包含预处理器和分类器的完整Pipeline Part B: Flask API部署 - app.py: 提供/predict_cardio接口,接收11个特征值并返回预测结果 - 包含输入验证、数据处理和模型加载功能 Part C: 前端交互界面 - templates/index.html: 响应式HTML表单,集成JavaScript Fetch API - 提供示例数据填充和实时预测结果显示 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
260 lines
7.7 KiB
Python
260 lines
7.7 KiB
Python
#!/opt/anaconda3/envs/cardioenv/bin/python
|
||
"""
|
||
CardioAI - 心血管疾病预测模型训练脚本
|
||
一次性脚本,用于训练XGBoost模型并保存Pipeline
|
||
"""
|
||
|
||
import os
|
||
# 设置环境变量以确保XGBoost可以找到OpenMP库
|
||
os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/opt/libomp/lib:' + os.environ.get('DYLD_LIBRARY_PATH', '')
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
import warnings
|
||
warnings.filterwarnings('ignore')
|
||
|
||
from sklearn.model_selection import train_test_split
|
||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||
from sklearn.compose import ColumnTransformer
|
||
from sklearn.pipeline import Pipeline
|
||
from xgboost import XGBClassifier
|
||
import joblib
|
||
|
||
# 数据路径
|
||
DATA_PATH = "../data/心血管疾病.xlsx"
|
||
|
||
def load_and_process_data():
    """
    Locate the cardiovascular dataset, load it and derive the model features.

    A list of candidate locations (the raw ``DATA_PATH``, its absolute form,
    and several paths relative to this script) is probed in order; the first
    one that exists is read with ``pandas.read_excel``.

    Processing steps applied to the loaded frame:
      1. ``age_years``: ``age`` divided by 365.25, rounded, cast to int.
      2. ``bmi``: ``weight / (height / 100) ** 2``.
      3. Rows are kept only when ``ap_lo < ap_hi``, ``90 <= ap_hi <= 250``
         and ``60 <= ap_lo <= 150``.
      4. ``cholesterol_cat`` / ``gluc_cat``: codes 1/2/3 mapped to text labels.
      5. ``bmi_category``: BMI binned into four text labels.

    NOTE(review): the original author states these steps mirror "Module1";
    that cannot be verified from this file.

    Returns:
        The processed DataFrame, or an empty DataFrame when no data file is
        found or when any step raises (the error is printed, not re-raised).
    """
    try:
        script_dir = os.path.dirname(__file__)
        # Candidate locations, probed in this exact order.
        possible_paths = [
            DATA_PATH,
            os.path.abspath(DATA_PATH),
            os.path.abspath(os.path.join(script_dir, DATA_PATH)),
            os.path.abspath(os.path.join(script_dir, "..", "data", "心血管疾病.xlsx")),
            os.path.abspath(os.path.join(script_dir, "..", "..", "data", "心血管疾病.xlsx")),
        ]

        data_path = next(
            (candidate for candidate in possible_paths if os.path.exists(candidate)),
            None,
        )
        if data_path is None:
            print(f"未找到数据文件,尝试过的路径: {possible_paths}")
            return pd.DataFrame()
        print(f"找到数据文件: {data_path}")

        raw = pd.read_excel(data_path)

        # Feature engineering: age in whole years and body-mass index.
        raw['age_years'] = (raw['age'] / 365.25).round().astype(int)
        height_m = raw['height'] / 100
        raw['bmi'] = raw['weight'] / (height_m ** 2)

        # Outlier removal: diastolic must be below systolic, and both
        # pressures must fall inside the accepted (inclusive) ranges.
        plausible_pressure = (
            (raw['ap_lo'] < raw['ap_hi'])
            & raw['ap_hi'].between(90, 250)
            & raw['ap_lo'].between(60, 150)
        )
        df = raw[plausible_pressure].copy()

        # Cholesterol and glucose share the same 1/2/3 -> label mapping.
        level_labels = {
            1: 'normal',
            2: 'above_normal',
            3: 'well_above_normal',
        }
        for source_col, label_col in (
            ('cholesterol', 'cholesterol_cat'),
            ('gluc', 'gluc_cat'),
        ):
            df[label_col] = df[source_col].map(level_labels)

        def bmi_label(bmi):
            # Thresholds are checked in ascending order; any value that
            # fails every comparison falls through to 'obese'.
            if bmi < 18.5:
                return 'underweight'
            if bmi < 25:
                return 'normal'
            if bmi < 30:
                return 'overweight'
            return 'obese'

        df['bmi_category'] = df['bmi'].apply(bmi_label)

        return df

    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty
        # frame so the caller can detect failure via ``df.empty``.
        print(f"数据加载失败: {e}")
        return pd.DataFrame()
|
||
|
||
def prepare_features_target(df):
    """
    Split the processed frame into model inputs and the label column.

    Args:
        df: DataFrame containing at least the ``id``, ``age`` and ``cardio``
            columns.

    Returns:
        A ``(features, target)`` tuple: ``features`` is ``df`` without the
        ``id``, ``age`` and ``cardio`` columns, ``target`` is ``df['cardio']``.
    """
    # Identifier, raw age and the label itself are excluded from the inputs.
    excluded_columns = ['id', 'age', 'cardio']
    features = df.drop(columns=excluded_columns)
    target = df['cardio']
    return features, target
|
||
|
||
def build_pipeline():
    """
    Assemble the (unfitted) preprocessing + XGBoost classification Pipeline.

    The preprocessing stage is a ColumnTransformer that applies
    ``StandardScaler`` to the numeric columns and ``OneHotEncoder`` to the
    categorical columns; the final stage is an ``XGBClassifier`` configured
    with fixed hyper-parameters.

    Returns:
        A scikit-learn ``Pipeline`` with steps ``'preprocessor'`` and
        ``'classifier'``.
    """
    # Column groups routed to each transformer.
    numeric_features = ['height', 'weight', 'ap_hi', 'ap_lo', 'bmi', 'age_years']
    categorical_features = ['gender', 'cholesterol_cat', 'gluc_cat', 'smoke', 'alco', 'active', 'bmi_category']

    scaling_step = ('num', StandardScaler(), numeric_features)
    encoder = OneHotEncoder(
        drop='if_binary',
        sparse_output=False,
        handle_unknown='ignore',
    )
    encoding_step = ('cat', encoder, categorical_features)
    preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

    # Fixed hyper-parameters for the gradient-boosted classifier.
    classifier_params = {
        'n_estimators': 100,
        'max_depth': 5,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'eval_metric': 'logloss',
        'use_label_encoder': False,
    }
    classifier = XGBClassifier(**classifier_params)

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
|
||
|
||
def evaluate_model(model, X_test, y_test):
    """
    Score a fitted classifier on held-out data and print a short report.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature rows to score.
        y_test: True labels for ``X_test``.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``
        and ``roc_auc``.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the ROC AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    results = {
        'accuracy': accuracy_score(y_test, predicted_labels),
        'precision': precision_score(y_test, predicted_labels),
        'recall': recall_score(y_test, predicted_labels),
        'f1': f1_score(y_test, predicted_labels),
        'roc_auc': roc_auc_score(y_test, positive_scores),
    }

    print("模型评估结果:")
    report_rows = (
        ("准确率", 'accuracy'),
        ("精确率", 'precision'),
        ("召回率", 'recall'),
        ("F1分数", 'f1'),
        ("ROC AUC", 'roc_auc'),
    )
    for label, key in report_rows:
        print(f" {label}: {results[key]:.4f}")

    return results
|
||
|
||
def main():
    """
    Run the end-to-end training workflow.

    Steps: load and process the data, split features/target, create a
    stratified 80/20 train/test split, fit the Pipeline, print evaluation
    metrics, save the fitted Pipeline next to this script as
    ``cardio_predictor_model.pkl``, and finally reload it to confirm that a
    prediction can be made. Returns early (with a message) when no data
    could be loaded.
    """
    banner = "=" * 60
    print(banner)
    print("CardioAI - 心血管疾病预测模型训练")
    print(banner)

    # 1. Load and process the data.
    print("\n1. 加载和处理数据...")
    data = load_and_process_data()
    if data.empty:
        print("❌ 数据加载失败,请检查数据文件路径")
        return

    print(f" 处理后的数据形状: {data.shape}")
    print(f" 阳性样本比例: {data['cardio'].mean():.2%}")

    # 2. Separate model inputs from the label.
    print("\n2. 准备特征和目标变量...")
    X, y = prepare_features_target(data)
    print(f" 特征数量: {X.shape[1]}")
    print(f" 样本数量: {X.shape[0]}")

    # 3. Stratified hold-out split with a fixed seed.
    print("\n3. 划分训练集和测试集...")
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )
    print(f" 训练集大小: {X_train.shape[0]}")
    print(f" 测试集大小: {X_test.shape[0]}")

    # 4. Build and fit the Pipeline.
    print("\n4. 构建和训练Pipeline...")
    model = build_pipeline()
    print(" 开始训练模型...")
    model.fit(X_train, y_train)
    print(" 模型训练完成!")

    # 5. Evaluate on the held-out set (the metrics are printed by the helper).
    print("\n5. 评估模型性能...")
    evaluate_model(model, X_test, y_test)

    # 6. Persist the fitted Pipeline alongside this script.
    print("\n6. 保存模型...")
    model_path = os.path.join(os.path.dirname(__file__), "cardio_predictor_model.pkl")
    target_dir = os.path.dirname(model_path)
    # An empty directory name is skipped rather than passed to makedirs.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    joblib.dump(model, model_path)
    print(f" 模型已保存到: {model_path}")

    # 7. Summarise the feature groups used by the Pipeline.
    print("\n7. 特征信息:")
    for info_line in (
        " 连续特征: height, weight, ap_hi, ap_lo, bmi, age_years",
        " 分类特征: gender, cholesterol_cat, gluc_cat, smoke, alco, active, bmi_category",
        " 总特征数: 13个原始特征 → 预处理后更多",
    ):
        print(info_line)

    # 8. Reload the saved file and predict on one test row as a smoke check.
    print("\n8. 验证模型加载...")
    try:
        test_pred = joblib.load(model_path).predict(X_test.iloc[:1])
        print(f" 模型加载成功! 测试预测: {test_pred[0]}")
    except Exception as e:
        print(f" 模型加载失败: {e}")

    print("\n" + banner)
    print("✅ 模型训练和保存完成!")
    print(banner)
|
||
|
||
# Run the training workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()