Files
ai_coding/module2_predictor/train_and_save.py

71 lines
1.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
# Location of the input Excel workbook (raw string, so the Windows
# backslashes need no escaping).
DATA_PATH = r"D:\AI_Coding\data\心血管疾病.xlsx"
# Load and preprocess the data.
def load_and_preprocess_data(data_path=None):
    """Read the dataset workbook and return a cleaned feature table.

    Steps performed:
      1. Read the Excel workbook.
      2. Add ``age_years`` (``age / 365.25``, rounded to a whole number;
         presumably ``age`` is stored in days -- confirm against the data
         dictionary) and ``bmi`` (``weight / (height / 100) ** 2``;
         presumably kg and cm -- confirm).
      3. Drop rows with implausible measurements: non-positive height,
         diastolic pressure not below systolic pressure, ``ap_hi``
         outside [90, 250] or ``ap_lo`` outside [60, 150].
      4. Drop the ``id`` and raw ``age`` columns.

    Args:
        data_path: Path of the workbook to read. When ``None`` (the
            default) the module-level ``DATA_PATH`` is used.

    Returns:
        A ``pandas.DataFrame`` holding the remaining rows, with the
        engineered columns added and ``id``/``age`` removed.

    Raises:
        KeyError: If a column needed for preprocessing is missing.
        OSError: If the workbook cannot be opened (raised by pandas).
    """
    if data_path is None:
        data_path = DATA_PATH
    df = pd.read_excel(data_path)

    # Fail early with one message naming every missing column instead of
    # a bare KeyError on whichever lookup happens to run first.
    required = ('id', 'age', 'height', 'weight', 'ap_hi', 'ap_lo')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Feature engineering.
    df['age_years'] = (df['age'] / 365.25).round()
    df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

    # Outlier handling, expressed as a single row mask.
    # A zero height would make BMI infinite, which the downstream scaler
    # cannot handle, so such rows are removed together with the
    # blood-pressure outliers. Comparisons against NaN are False, so rows
    # with missing values in these columns are removed as well.
    valid_rows = (
        (df['height'] > 0)
        & (df['ap_lo'] < df['ap_hi'])
        & df['ap_hi'].between(90, 250)
        & df['ap_lo'].between(60, 150)
    )

    # The identifier and the raw age are not used as model features.
    return df.loc[valid_rows].drop(columns=['id', 'age'])
# Load the cleaned dataset.
df = load_and_preprocess_data()

# Separate the feature matrix from the target column.
X = df.drop('cardio', axis=1)
y = df['cardio']

# Feature groups: the first is standardized, the second one-hot encoded.
continuous_features = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
categorical_features = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

# Preprocessor applied inside the pipeline. Columns listed in neither
# group are dropped (ColumnTransformer's default ``remainder``).
# NOTE(review): OneHotEncoder keeps its default handle_unknown='error', so
# the saved model raises on category values that were not seen during
# training -- confirm this is the intended behaviour for the predictor.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Full pipeline: preprocessing followed by the gradient-boosted classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42))
])

# Resolve the output location before training. os.path.join uses the
# platform's separator (the result is unchanged on Windows), and creating
# the directory up front means a missing folder or a permission problem
# surfaces before the training run rather than after it.
model_path = os.path.join("module2_predictor", "cardio_predictor_model.pkl")
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Train on every remaining row; no hold-out split or evaluation is done here.
pipeline.fit(X, y)

# Persist the fitted pipeline (preprocessing + model) as a single artifact.
joblib.dump(pipeline, model_path)
print(f"模型已保存到: {model_path}")
print("训练完成!")