Files
AIcode/test/module2_predictor/train_and_save.py
2026-04-02 19:52:38 +08:00

331 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
CardioAI - 心血管疾病预测模型训练脚本
功能:
1. 加载和清洗数据与模块1相同的流程
2. 特征工程年龄转换、BMI计算、异常值处理
3. 构建机器学习Pipeline
4. 训练XGBoost分类器
5. 保存完整Pipeline到文件
注意此脚本为一次性训练脚本生成模型文件供Flask应用使用。
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import joblib
import warnings
import sys
import os
from pathlib import Path
# Silence library warnings — sklearn/xgboost emit many deprecation notices during training.
warnings.filterwarnings('ignore')
# Put the project root on sys.path so sibling project modules can be imported.
# NOTE(review): parent.parent resolves two directories above this file — assumed
# to be the project root (the data/ folder is looked up under it); verify
# against the actual repository layout.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
def load_and_preprocess_data(data_path=None):
    """
    Load the raw dataset and apply the same cleaning and feature engineering
    as module 1.

    Parameters:
        data_path: optional path to the Excel data file. When omitted it
            defaults to ``<project_root>/data/心血管疾病.xlsx``, which keeps
            the original zero-argument call working unchanged.

    Returns:
        pd.DataFrame: the preprocessed data frame.

    Raises:
        ValueError: if the file is missing any required column.
        Exception: any pandas read error is logged and re-raised.
    """
    print("开始加载和预处理数据...")
    # Default to the project's bundled dataset (backward-compatible behavior).
    if data_path is None:
        data_path = project_root / "data" / "心血管疾病.xlsx"
    try:
        df = pd.read_excel(data_path)
        print(f"原始数据形状: {df.shape}")
        # Fail fast if the spreadsheet does not match the expected schema.
        required_columns = ['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                            'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"数据文件中缺少必要列: {missing_columns}")
        # Work on a copy so the caller's frame is never mutated.
        df_processed = df.copy()
        # 1. Age: the source stores age in days; convert to whole years.
        df_processed['age_years'] = (df_processed['age'] / 365.25).round().astype(int)
        # 2. BMI = weight(kg) / height(m)^2, rounded to 2 decimals.
        df_processed['bmi'] = df_processed['weight'] / ((df_processed['height'] / 100) ** 2)
        df_processed['bmi'] = df_processed['bmi'].round(2)
        # 3a. Drop rows where diastolic >= systolic (physiologically invalid).
        invalid_bp = df_processed['ap_lo'] >= df_processed['ap_hi']
        if invalid_bp.any():
            print(f"删除 {invalid_bp.sum()} 条舒张压 >= 收缩压的异常记录")
            df_processed = df_processed[~invalid_bp].copy()
        # 3b. Drop extreme blood-pressure outliers:
        #     systolic kept in [90, 250], diastolic kept in [60, 150].
        bp_outliers = ~((df_processed['ap_hi'] >= 90) & (df_processed['ap_hi'] <= 250) &
                        (df_processed['ap_lo'] >= 60) & (df_processed['ap_lo'] <= 150))
        if bp_outliers.any():
            print(f"删除 {bp_outliers.sum()} 条血压极端异常值记录")
            df_processed = df_processed[~bp_outliers].copy()
        # 4. Drop the id column and the raw day-based age (age_years replaces it).
        df_processed = df_processed.drop(['id', 'age'], axis=1)
        print(f"预处理后数据形状: {df_processed.shape}")
        print("数据预处理完成!")
        return df_processed
    except Exception as e:
        # Log before re-raising so the failure reason appears in script output.
        print(f"数据加载和预处理失败: {str(e)}")
        raise
def prepare_features_and_target(df):
    """
    Split a preprocessed data frame into the feature matrix X and the
    target vector y.

    Parameters:
        df: preprocessed data frame containing a 'cardio' target column.

    Returns:
        X: feature matrix (DataFrame, every column except the target)
        y: target vector (array of 0/1 labels)
        feature_names: ordered list of the feature column names
    """
    print("准备特征和目标变量...")
    # Pull out the labels, then drop that column to form the feature matrix.
    y = df['cardio'].values
    X = df.drop(columns='cardio')
    print(f"特征矩阵形状: {X.shape}")
    print(f"目标变量分布: 0={sum(y==0)}, 1={sum(y==1)}")
    return X, y, list(X.columns)
def build_pipeline():
    """
    Build the machine-learning pipeline: preprocessing + classifier.

    Returns:
        tuple:
            Pipeline: ColumnTransformer preprocessing followed by an
                XGBoost classifier.
            list[str]: the input feature names in the column order the
                pipeline expects.
    """
    print("构建机器学习Pipeline...")
    # Continuous features: standardized.
    numerical_features = ['age_years', 'bmi', 'ap_hi', 'ap_lo']
    # Categorical features: one-hot encoded.
    categorical_features = ['gender', 'cholesterol', 'gluc']
    # Binary 0/1 features: passed through untransformed.
    binary_features = ['smoke', 'alco', 'active']
    # Full feature order expected by the pipeline's input.
    all_features = numerical_features + categorical_features + binary_features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            # drop='first' avoids the redundant reference level;
            # handle_unknown='ignore' keeps inference from failing on unseen categories.
            ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
             categorical_features),
            ('binary', 'passthrough', binary_features)
        ],
        remainder='drop'  # silently discard any extra input columns
    )
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            eval_metric='logloss'
            # use_label_encoder removed: deprecated since XGBoost 1.6 and
            # dropped in 2.x — passing it only produced warnings.
        ))
    ])
    print("Pipeline构建完成")
    return pipeline, all_features
def train_model(X, y, pipeline):
    """
    Train the pipeline on a stratified split and report its accuracy.

    Parameters:
        X: feature matrix
        y: target vector
        pipeline: untrained pipeline from build_pipeline()

    Returns:
        the fitted pipeline
    """
    print("开始训练模型...")
    # Stratified 80/20 split keeps the class ratio identical in both sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"训练集大小: {X_train.shape}")
    print(f"测试集大小: {X_test.shape}")
    pipeline.fit(X_train, y_train)
    # Pipeline.score delegates to the classifier, i.e. accuracy here.
    train_score = pipeline.score(X_train, y_train)
    test_score = pipeline.score(X_test, y_test)
    print(f"训练集准确率: {train_score:.4f}")
    print(f"测试集准确率: {test_score:.4f}")
    _report_feature_importance(pipeline)
    return pipeline

def _report_feature_importance(pipeline):
    """Print the top-10 feature importances of a fitted pipeline (best effort)."""
    classifier = pipeline.named_steps['classifier']
    if not hasattr(classifier, 'feature_importances_'):
        return
    importances = classifier.feature_importances_
    print(f"特征重要性数量: {len(importances)}")
    feature_names = _transformed_feature_names(pipeline.named_steps['preprocessor'])
    # Only print when the reconstructed names line up 1:1 with the importances.
    if len(feature_names) == len(importances):
        print("\nTop 10 特征重要性:")
        indices = np.argsort(importances)[::-1]
        for i in range(min(10, len(importances))):
            print(f" {feature_names[indices[i]]}: {importances[indices[i]]:.4f}")

def _transformed_feature_names(preprocessor):
    """Reconstruct post-transform feature names from a fitted ColumnTransformer."""
    feature_names = []
    # Numerical features keep their original names after scaling.
    feature_names.extend(preprocessor.transformers_[0][2])
    # Categorical features expand to one column per encoded level.
    if len(preprocessor.transformers_) > 1:
        cat_encoder = preprocessor.transformers_[1][1]
        if hasattr(cat_encoder, 'get_feature_names_out'):
            feature_names.extend(cat_encoder.get_feature_names_out(
                preprocessor.transformers_[1][2]
            ))
    # Binary passthrough features also keep their original names.
    if len(preprocessor.transformers_) > 2:
        feature_names.extend(preprocessor.transformers_[2][2])
    return feature_names
def save_pipeline(pipeline, all_features):
    """
    Persist the trained pipeline and its metadata to disk.

    Parameters:
        pipeline: fitted pipeline to serialize
        all_features: ordered list of input feature names

    Returns:
        Path: location of the saved model file.
    """
    print("保存模型和特征信息...")
    # Models live in a "models" folder next to this script.
    out_dir = Path(__file__).parent / "models"
    out_dir.mkdir(exist_ok=True)
    model_path = out_dir / "cardio_predictor_model.pkl"
    # Bundle the pipeline with its metadata into a single artifact.
    joblib.dump(
        {
            'pipeline': pipeline,
            'feature_names': all_features,
            'model_version': '1.0.0',
            'description': 'CardioAI心血管疾病预测模型'
        },
        model_path,
    )
    print(f"模型已保存到: {model_path}")
    # Human-readable feature documentation, written next to the model file.
    descriptions = [
        "- age_years: 年龄(岁),由原始天数转换而来\n",
        "- bmi: 身体质量指数,计算公式:体重(kg) / (身高(m)^2)\n",
        "- ap_hi: 收缩压mmHg\n",
        "- ap_lo: 舒张压mmHg\n",
        "- gender: 性别1=女性2=男性)\n",
        "- cholesterol: 胆固醇水平1=正常2=高于正常3=极高)\n",
        "- gluc: 血糖水平1=正常2=高于正常3=极高)\n",
        "- smoke: 吸烟0=否1=是)\n",
        "- alco: 饮酒0=否1=是)\n",
        "- active: 体育活动0=否1=是)\n",
    ]
    features_path = out_dir / "feature_info.txt"
    with open(features_path, 'w', encoding='utf-8') as f:
        f.write("CardioAI模型特征信息\n")
        f.write("=" * 50 + "\n\n")
        f.write("特征列表(按输入顺序):\n")
        f.writelines(f"{idx:2d}. {name}\n" for idx, name in enumerate(all_features, 1))
        f.write("\n\n特征说明:\n")
        f.writelines(descriptions)
    print(f"特征信息已保存到: {features_path}")
    return model_path
def main():
    """Entry point: run the complete training workflow and persist the model."""
    banner = "=" * 60
    print(banner)
    print("CardioAI - 心血管疾病预测模型训练")
    print(banner)
    try:
        # Steps 1-2: load the cleaned dataset and split it into X / y.
        dataset = load_and_preprocess_data()
        features, labels, _ = prepare_features_and_target(dataset)
        # Steps 3-4: assemble the pipeline and fit it.
        model, feature_order = build_pipeline()
        fitted = train_model(features, labels, model)
        # Step 5: write the fitted pipeline and feature metadata to disk.
        saved_path = save_pipeline(fitted, feature_order)
        print("\n" + banner)
        print("模型训练完成!")
        print(f"模型文件: {saved_path}")
        print("下一步使用Flask应用部署模型")
        print(banner)
    except Exception as e:
        # Surface the full traceback, then exit non-zero so callers see failure.
        print(f"\n训练过程出现错误: {str(e)}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    main()