#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
CardioAI - 心血管疾病预测模型训练脚本

功能:
1. 加载和清洗数据(与模块1相同的流程)
2. 特征工程:年龄转换、BMI计算、异常值处理
3. 构建机器学习Pipeline
4. 训练XGBoost分类器
5. 保存完整Pipeline到文件

注意:此脚本为一次性训练脚本,生成模型文件供Flask应用使用。
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import joblib
import warnings
import sys
import os
from pathlib import Path
# 忽略警告
|
||
warnings.filterwarnings('ignore')
|
||
|
||
# 添加项目根目录到Python路径
|
||
project_root = Path(__file__).parent.parent
|
||
sys.path.append(str(project_root))
|
||
|
||
def load_and_preprocess_data():
|
||
"""
|
||
加载数据并进行预处理(与模块1相同的清洗和特征工程)
|
||
|
||
返回:
|
||
pd.DataFrame: 预处理后的数据框
|
||
"""
|
||
print("开始加载和预处理数据...")
|
||
|
||
# 数据文件路径
|
||
data_path = project_root / "data" / "心血管疾病.xlsx"
|
||
|
||
try:
|
||
# 加载数据
|
||
df = pd.read_excel(data_path)
|
||
print(f"原始数据形状: {df.shape}")
|
||
|
||
# 检查必要列
|
||
required_columns = ['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
|
||
'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
|
||
missing_columns = [col for col in required_columns if col not in df.columns]
|
||
if missing_columns:
|
||
raise ValueError(f"数据文件中缺少必要列: {missing_columns}")
|
||
|
||
# 创建数据副本
|
||
df_processed = df.copy()
|
||
|
||
# 1. 年龄转换:从天转换为年(四舍五入)
|
||
df_processed['age_years'] = (df_processed['age'] / 365.25).round().astype(int)
|
||
|
||
# 2. 计算BMI: BMI = weight(kg) / (height(m)^2)
|
||
df_processed['bmi'] = df_processed['weight'] / ((df_processed['height'] / 100) ** 2)
|
||
df_processed['bmi'] = df_processed['bmi'].round(2)
|
||
|
||
# 3. 异常值处理
|
||
# 删除舒张压 >= 收缩压的记录
|
||
invalid_bp = df_processed['ap_lo'] >= df_processed['ap_hi']
|
||
if invalid_bp.any():
|
||
print(f"删除 {invalid_bp.sum()} 条舒张压 >= 收缩压的异常记录")
|
||
df_processed = df_processed[~invalid_bp].copy()
|
||
|
||
# 删除血压极端异常值
|
||
# 收缩压 ∈ [90, 250], 舒张压 ∈ [60, 150]
|
||
bp_outliers = ~((df_processed['ap_hi'] >= 90) & (df_processed['ap_hi'] <= 250) &
|
||
(df_processed['ap_lo'] >= 60) & (df_processed['ap_lo'] <= 150))
|
||
if bp_outliers.any():
|
||
print(f"删除 {bp_outliers.sum()} 条血压极端异常值记录")
|
||
df_processed = df_processed[~bp_outliers].copy()
|
||
|
||
# 4. 删除不需要的列
|
||
# 删除id和原始age字段(使用转换后的age_years)
|
||
df_processed = df_processed.drop(['id', 'age'], axis=1)
|
||
|
||
print(f"预处理后数据形状: {df_processed.shape}")
|
||
print("数据预处理完成!")
|
||
|
||
return df_processed
|
||
|
||
except Exception as e:
|
||
print(f"数据加载和预处理失败: {str(e)}")
|
||
raise
|
||
|
||
def prepare_features_and_target(df):
|
||
"""
|
||
准备特征矩阵X和目标向量y
|
||
|
||
参数:
|
||
df: 预处理后的数据框
|
||
|
||
返回:
|
||
X: 特征矩阵
|
||
y: 目标向量
|
||
feature_names: 特征名称列表
|
||
"""
|
||
print("准备特征和目标变量...")
|
||
|
||
# 目标变量
|
||
y = df['cardio'].values
|
||
|
||
# 特征矩阵 - 删除目标变量
|
||
X = df.drop('cardio', axis=1)
|
||
|
||
print(f"特征矩阵形状: {X.shape}")
|
||
print(f"目标变量分布: 0={sum(y==0)}, 1={sum(y==1)}")
|
||
|
||
return X, y, X.columns.tolist()
|
||
|
||
def build_pipeline():
|
||
"""
|
||
构建机器学习Pipeline
|
||
|
||
返回:
|
||
Pipeline: 包含预处理和分类器的完整Pipeline
|
||
"""
|
||
print("构建机器学习Pipeline...")
|
||
|
||
# 定义特征类型
|
||
# 连续特征:需要标准化
|
||
numerical_features = ['age_years', 'bmi', 'ap_hi', 'ap_lo']
|
||
|
||
# 分类特征:需要独热编码
|
||
categorical_features = ['gender', 'cholesterol', 'gluc']
|
||
|
||
# 二元特征:直接使用(不需要编码)
|
||
binary_features = ['smoke', 'alco', 'active']
|
||
|
||
# 所有特征顺序
|
||
all_features = numerical_features + categorical_features + binary_features
|
||
|
||
# 创建列转换器
|
||
preprocessor = ColumnTransformer(
|
||
transformers=[
|
||
('num', StandardScaler(), numerical_features),
|
||
('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
|
||
categorical_features),
|
||
# 二元特征直接通过(不进行变换)
|
||
('binary', 'passthrough', binary_features)
|
||
],
|
||
remainder='drop' # 丢弃其他列
|
||
)
|
||
|
||
# 创建完整Pipeline
|
||
pipeline = Pipeline([
|
||
('preprocessor', preprocessor),
|
||
('classifier', XGBClassifier(
|
||
n_estimators=100,
|
||
max_depth=5,
|
||
learning_rate=0.1,
|
||
subsample=0.8,
|
||
colsample_bytree=0.8,
|
||
random_state=42,
|
||
eval_metric='logloss',
|
||
use_label_encoder=False
|
||
))
|
||
])
|
||
|
||
print("Pipeline构建完成!")
|
||
return pipeline, all_features
|
||
|
||
def train_model(X, y, pipeline):
|
||
"""
|
||
训练模型
|
||
|
||
参数:
|
||
X: 特征矩阵
|
||
y: 目标向量
|
||
pipeline: 机器学习Pipeline
|
||
|
||
返回:
|
||
训练好的Pipeline
|
||
"""
|
||
print("开始训练模型...")
|
||
|
||
# 划分训练集和测试集
|
||
X_train, X_test, y_train, y_test = train_test_split(
|
||
X, y, test_size=0.2, random_state=42, stratify=y
|
||
)
|
||
|
||
print(f"训练集大小: {X_train.shape}")
|
||
print(f"测试集大小: {X_test.shape}")
|
||
|
||
# 训练模型
|
||
pipeline.fit(X_train, y_train)
|
||
|
||
# 评估模型
|
||
train_score = pipeline.score(X_train, y_train)
|
||
test_score = pipeline.score(X_test, y_test)
|
||
|
||
print(f"训练集准确率: {train_score:.4f}")
|
||
print(f"测试集准确率: {test_score:.4f}")
|
||
|
||
# 特征重要性(如果可用)
|
||
if hasattr(pipeline.named_steps['classifier'], 'feature_importances_'):
|
||
importances = pipeline.named_steps['classifier'].feature_importances_
|
||
print(f"特征重要性数量: {len(importances)}")
|
||
|
||
# 获取特征名称(需要从预处理器中提取)
|
||
preprocessor = pipeline.named_steps['preprocessor']
|
||
|
||
# 获取转换后的特征名称
|
||
feature_names = []
|
||
|
||
# 数值特征名称
|
||
feature_names.extend(preprocessor.transformers_[0][2])
|
||
|
||
# 分类特征名称(独热编码后)
|
||
if len(preprocessor.transformers_) > 1:
|
||
cat_encoder = preprocessor.transformers_[1][1]
|
||
if hasattr(cat_encoder, 'get_feature_names_out'):
|
||
cat_features = cat_encoder.get_feature_names_out(
|
||
preprocessor.transformers_[1][2]
|
||
)
|
||
feature_names.extend(cat_features)
|
||
|
||
# 二元特征名称
|
||
if len(preprocessor.transformers_) > 2:
|
||
feature_names.extend(preprocessor.transformers_[2][2])
|
||
|
||
# 打印最重要的特征
|
||
if len(feature_names) == len(importances):
|
||
print("\nTop 10 特征重要性:")
|
||
indices = np.argsort(importances)[::-1]
|
||
for i in range(min(10, len(importances))):
|
||
print(f" {feature_names[indices[i]]}: {importances[indices[i]]:.4f}")
|
||
|
||
return pipeline
|
||
|
||
def save_pipeline(pipeline, all_features):
|
||
"""
|
||
保存Pipeline到文件
|
||
|
||
参数:
|
||
pipeline: 训练好的Pipeline
|
||
all_features: 特征名称列表
|
||
"""
|
||
print("保存模型和特征信息...")
|
||
|
||
# 创建模型保存目录
|
||
model_dir = Path(__file__).parent / "models"
|
||
model_dir.mkdir(exist_ok=True)
|
||
|
||
# 模型文件路径
|
||
model_path = model_dir / "cardio_predictor_model.pkl"
|
||
|
||
# 保存Pipeline对象
|
||
model_data = {
|
||
'pipeline': pipeline,
|
||
'feature_names': all_features,
|
||
'model_version': '1.0.0',
|
||
'description': 'CardioAI心血管疾病预测模型'
|
||
}
|
||
|
||
joblib.dump(model_data, model_path)
|
||
print(f"模型已保存到: {model_path}")
|
||
|
||
# 保存特征信息到单独文件(可选)
|
||
features_path = model_dir / "feature_info.txt"
|
||
with open(features_path, 'w', encoding='utf-8') as f:
|
||
f.write("CardioAI模型特征信息\n")
|
||
f.write("=" * 50 + "\n\n")
|
||
f.write("特征列表(按输入顺序):\n")
|
||
for i, feature in enumerate(all_features, 1):
|
||
f.write(f"{i:2d}. {feature}\n")
|
||
|
||
f.write("\n\n特征说明:\n")
|
||
f.write("- age_years: 年龄(岁),由原始天数转换而来\n")
|
||
f.write("- bmi: 身体质量指数,计算公式:体重(kg) / (身高(m)^2)\n")
|
||
f.write("- ap_hi: 收缩压(mmHg)\n")
|
||
f.write("- ap_lo: 舒张压(mmHg)\n")
|
||
f.write("- gender: 性别(1=女性,2=男性)\n")
|
||
f.write("- cholesterol: 胆固醇水平(1=正常,2=高于正常,3=极高)\n")
|
||
f.write("- gluc: 血糖水平(1=正常,2=高于正常,3=极高)\n")
|
||
f.write("- smoke: 吸烟(0=否,1=是)\n")
|
||
f.write("- alco: 饮酒(0=否,1=是)\n")
|
||
f.write("- active: 体育活动(0=否,1=是)\n")
|
||
|
||
print(f"特征信息已保存到: {features_path}")
|
||
|
||
return model_path
|
||
|
||
def main():
|
||
"""主函数"""
|
||
print("=" * 60)
|
||
print("CardioAI - 心血管疾病预测模型训练")
|
||
print("=" * 60)
|
||
|
||
try:
|
||
# 1. 加载和预处理数据
|
||
df = load_and_preprocess_data()
|
||
|
||
# 2. 准备特征和目标
|
||
X, y, original_features = prepare_features_and_target(df)
|
||
|
||
# 3. 构建Pipeline
|
||
pipeline, all_features = build_pipeline()
|
||
|
||
# 4. 训练模型
|
||
trained_pipeline = train_model(X, y, pipeline)
|
||
|
||
# 5. 保存模型
|
||
model_path = save_pipeline(trained_pipeline, all_features)
|
||
|
||
print("\n" + "=" * 60)
|
||
print("模型训练完成!")
|
||
print(f"模型文件: {model_path}")
|
||
print("下一步:使用Flask应用部署模型")
|
||
print("=" * 60)
|
||
|
||
except Exception as e:
|
||
print(f"\n训练过程出现错误: {str(e)}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
sys.exit(1)
|
||
|
||
if __name__ == "__main__":
|
||
main() |