Files
AIcode/test/module2_predictor/train_and_save.py

331 lines
10 KiB
Python
Raw Normal View History

2026-04-02 19:52:38 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
CardioAI - cardiovascular disease prediction model training script.

Workflow:
1. Load and clean the data (same flow as module 1).
2. Feature engineering (age conversion, BMI computation, outlier handling).
3. Build a machine-learning Pipeline.
4. Train an XGBoost classifier.
5. Save the complete Pipeline to a file.

Note: this is a one-off training script; the generated model file is
consumed by the Flask application.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import joblib
import warnings
import sys
import os
from pathlib import Path

# Suppress library warnings (sklearn / xgboost deprecation noise).
warnings.filterwarnings('ignore')

# Put the project root on sys.path so sibling modules can be imported.
# NOTE(review): Path(__file__).parent.parent resolves two directories up from
# this script — confirm that is really the project root containing "data/".
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
def load_and_preprocess_data():
    """
    Load the raw Excel data and apply the same cleaning / feature
    engineering flow used by module 1.

    Returns:
        pd.DataFrame: the preprocessed data frame.

    Raises:
        ValueError: if required columns are missing from the file.
        Exception: any load/cleaning failure is logged and re-raised.
    """
    print("开始加载和预处理数据...")
    # Data file path (relative to the project root computed at import time).
    data_path = project_root / "data" / "心血管疾病.xlsx"
    try:
        df = pd.read_excel(data_path)
        print(f"原始数据形状: {df.shape}")
        # Pure in-memory cleaning/feature-engineering is factored out so it
        # can be exercised without touching the file system.
        df_processed = _clean_dataframe(df)
        print(f"预处理后数据形状: {df_processed.shape}")
        print("数据预处理完成!")
        return df_processed
    except Exception as e:
        # Log and re-raise so the caller (main) can abort with a traceback.
        print(f"数据加载和预处理失败: {str(e)}")
        raise


def _clean_dataframe(df):
    """
    Validate, feature-engineer and de-outlier a raw cardio data frame.

    Args:
        df: raw frame with the original dataset columns (age in days).

    Returns:
        pd.DataFrame: cleaned copy with 'age_years' and 'bmi' added and
        'id' / raw 'age' removed.

    Raises:
        ValueError: if any required column is absent.
    """
    # Fail fast if the file does not have the expected schema.
    required_columns = ['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                        'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"数据文件中缺少必要列: {missing_columns}")
    df_processed = df.copy()
    # 1. Age: the source stores age in days; convert to whole years.
    df_processed['age_years'] = (df_processed['age'] / 365.25).round().astype(int)
    # 2. BMI = weight(kg) / height(m)^2, rounded to two decimals.
    df_processed['bmi'] = df_processed['weight'] / ((df_processed['height'] / 100) ** 2)
    df_processed['bmi'] = df_processed['bmi'].round(2)
    # 3a. Drop rows where diastolic >= systolic (physiologically invalid).
    invalid_bp = df_processed['ap_lo'] >= df_processed['ap_hi']
    if invalid_bp.any():
        print(f"删除 {invalid_bp.sum()} 条舒张压 >= 收缩压的异常记录")
        df_processed = df_processed[~invalid_bp].copy()
    # 3b. Drop extreme blood-pressure outliers:
    #     systolic kept in [90, 250], diastolic kept in [60, 150].
    bp_outliers = ~((df_processed['ap_hi'] >= 90) & (df_processed['ap_hi'] <= 250) &
                    (df_processed['ap_lo'] >= 60) & (df_processed['ap_lo'] <= 150))
    if bp_outliers.any():
        print(f"删除 {bp_outliers.sum()} 条血压极端异常值记录")
        df_processed = df_processed[~bp_outliers].copy()
    # 4. Drop 'id' and the raw day-based 'age' ('age_years' replaces it).
    return df_processed.drop(['id', 'age'], axis=1)
def prepare_features_and_target(df):
    """
    Split a preprocessed frame into predictors and target.

    Args:
        df: preprocessed data frame containing a 'cardio' target column.

    Returns:
        (X, y, feature_names): X is the predictor DataFrame, y a numpy
        vector of 0/1 labels, feature_names the ordered columns of X.
    """
    print("准备特征和目标变量...")
    # Target: the binary cardio label as a plain numpy vector.
    target = df['cardio'].to_numpy()
    # Predictors: every remaining column.
    features = df.drop(columns='cardio')
    print(f"特征矩阵形状: {features.shape}")
    negatives = (target == 0).sum()
    positives = (target == 1).sum()
    print(f"目标变量分布: 0={negatives}, 1={positives}")
    return features, target, list(features.columns)
def build_pipeline():
    """
    Build the complete preprocessing + classification Pipeline.

    Returns:
        (pipeline, all_features): the untrained Pipeline and the ordered
        list of raw input feature names it expects.
    """
    print("构建机器学习Pipeline...")
    # Continuous features: standardized.
    numerical_features = ['age_years', 'bmi', 'ap_hi', 'ap_lo']
    # Categorical features: one-hot encoded (first level dropped).
    categorical_features = ['gender', 'cholesterol', 'gluc']
    # Binary 0/1 features: passed through unchanged.
    binary_features = ['smoke', 'alco', 'active']
    # Raw input column order the fitted pipeline will expect.
    all_features = numerical_features + categorical_features + binary_features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            # handle_unknown='ignore' keeps prediction from failing on
            # category levels that were unseen during training.
            ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
             categorical_features),
            # Binary features pass through with no transformation.
            ('binary', 'passthrough', binary_features)
        ],
        remainder='drop'  # silently drop any extra columns
    )
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            eval_metric='logloss'
            # NOTE: use_label_encoder was removed here — the flag is
            # deprecated since XGBoost 1.6 and unused (warning) in >= 2.0.
        ))
    ])
    print("Pipeline构建完成")
    return pipeline, all_features
def train_model(X, y, pipeline):
    """
    Fit the pipeline on a stratified 80/20 split and report accuracy and
    the top feature importances.

    Args:
        X: feature matrix (DataFrame).
        y: 0/1 target vector.
        pipeline: untrained preprocessing + classifier Pipeline.

    Returns:
        The fitted Pipeline.
    """
    print("开始训练模型...")
    # Stratify so the class balance is preserved in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"训练集大小: {X_train.shape}")
    print(f"测试集大小: {X_test.shape}")
    pipeline.fit(X_train, y_train)
    # Accuracy on both splits; a large gap would indicate overfitting.
    train_score = pipeline.score(X_train, y_train)
    test_score = pipeline.score(X_test, y_test)
    print(f"训练集准确率: {train_score:.4f}")
    print(f"测试集准确率: {test_score:.4f}")
    _report_feature_importances(pipeline)
    return pipeline


def _report_feature_importances(pipeline):
    """Print the classifier's top-10 feature importances, when available."""
    classifier = pipeline.named_steps['classifier']
    if not hasattr(classifier, 'feature_importances_'):
        return
    importances = classifier.feature_importances_
    print(f"特征重要性数量: {len(importances)}")
    try:
        # Use the supported API instead of poking at transformers_ by index;
        # strip the ColumnTransformer's "step__" prefixes so the printed
        # names match the raw / one-hot-encoded feature names.
        raw_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
        feature_names = [name.split('__', 1)[-1] for name in raw_names]
    except Exception:
        # Preprocessor cannot report names (e.g. very old sklearn): skip.
        return
    if len(feature_names) == len(importances):
        print("\nTop 10 特征重要性:")
        indices = np.argsort(importances)[::-1]
        for i in range(min(10, len(importances))):
            print(f" {feature_names[indices[i]]}: {importances[indices[i]]:.4f}")
def save_pipeline(pipeline, all_features):
    """
    Persist the trained Pipeline plus feature metadata under ./models.

    Args:
        pipeline: the fitted Pipeline to serialize.
        all_features: ordered list of raw input feature names.

    Returns:
        Path of the saved model file.
    """
    print("保存模型和特征信息...")
    # Model artifacts live next to this script, in a "models" subdirectory.
    output_dir = Path(__file__).parent / "models"
    output_dir.mkdir(exist_ok=True)
    model_file = output_dir / "cardio_predictor_model.pkl"
    # Bundle the pipeline with metadata so the Flask app can introspect it.
    bundle = {
        'pipeline': pipeline,
        'feature_names': all_features,
        'model_version': '1.0.0',
        'description': 'CardioAI心血管疾病预测模型'
    }
    joblib.dump(bundle, model_file)
    print(f"模型已保存到: {model_file}")
    # Also write a human-readable feature reference next to the model.
    info_file = output_dir / "feature_info.txt"
    lines = [
        "CardioAI模型特征信息\n",
        "=" * 50 + "\n\n",
        "特征列表(按输入顺序):\n",
    ]
    lines.extend(f"{idx:2d}. {name}\n" for idx, name in enumerate(all_features, 1))
    lines.append("\n\n特征说明:\n")
    lines.extend([
        "- age_years: 年龄(岁),由原始天数转换而来\n",
        "- bmi: 身体质量指数,计算公式:体重(kg) / (身高(m)^2)\n",
        "- ap_hi: 收缩压mmHg\n",
        "- ap_lo: 舒张压mmHg\n",
        "- gender: 性别1=女性2=男性)\n",
        "- cholesterol: 胆固醇水平1=正常2=高于正常3=极高)\n",
        "- gluc: 血糖水平1=正常2=高于正常3=极高)\n",
        "- smoke: 吸烟0=否1=是)\n",
        "- alco: 饮酒0=否1=是)\n",
        "- active: 体育活动0=否1=是)\n",
    ])
    with open(info_file, 'w', encoding='utf-8') as handle:
        handle.writelines(lines)
    print(f"特征信息已保存到: {info_file}")
    return model_file
def main():
    """Run the full workflow: load -> prepare -> build -> train -> save."""
    banner = "=" * 60
    print(banner)
    print("CardioAI - 心血管疾病预测模型训练")
    print(banner)
    try:
        # 1. Load and clean the raw data.
        frame = load_and_preprocess_data()
        # 2. Split into predictors and target.
        X, y, original_features = prepare_features_and_target(frame)
        # 3. Assemble the untrained pipeline.
        pipeline, all_features = build_pipeline()
        # 4. Fit and evaluate.
        fitted_pipeline = train_model(X, y, pipeline)
        # 5. Serialize for the Flask application.
        saved_path = save_pipeline(fitted_pipeline, all_features)
        print("\n" + banner)
        print("模型训练完成!")
        print(f"模型文件: {saved_path}")
        print("下一步使用Flask应用部署模型")
        print(banner)
    except Exception as e:
        print(f"\n训练过程出现错误: {str(e)}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()