71 lines
1.9 KiB
Python
71 lines
1.9 KiB
Python
|
|
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
|
|||
|
|
|
|||
|
|
# Path to the source dataset (Excel workbook)
|
|||
|
|
DATA_PATH = "D:\\AI_Coding\\data\\心血管疾病.xlsx"
|
|||
|
|
|
|||
|
|
# Load and preprocess the data
|
|||
|
|
def load_and_preprocess_data(data_path=None):
    """Load the dataset workbook and return a cleaned feature table.

    Processing steps, in order:

    1. Read the Excel workbook into a DataFrame.
    2. Add ``age_years``: ``age`` divided by 365.25 and rounded to a whole
       number (the original comment states ``age`` is stored in days).
    3. Add ``bmi``: ``weight / (height / 100) ** 2``
       (presumably weight in kg and height in cm -- confirm against the
       dataset description).
    4. Drop rows whose ``ap_lo`` is not strictly below ``ap_hi``.
    5. Keep only rows with ``ap_hi`` in [90, 250] and ``ap_lo`` in [60, 150].
    6. Drop the ``id`` and raw ``age`` columns.

    Args:
        data_path: Path of the workbook to read. When ``None`` (the default)
            the module-level ``DATA_PATH`` is used, so existing zero-argument
            calls behave as before.

    Returns:
        pandas.DataFrame: the filtered rows with the derived ``age_years``
        and ``bmi`` columns and without ``id`` / ``age``.
    """
    # Resolve the default lazily so the constant is only needed when the
    # caller did not supply an explicit path.
    if data_path is None:
        data_path = DATA_PATH

    df = pd.read_excel(data_path)

    # Feature engineering: age as whole years, and body-mass index.
    df['age_years'] = (df['age'] / 365.25).round(0)
    df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

    # Outlier handling: a single mask combining the three row filters.
    # ``between`` is inclusive on both ends, matching ``>=`` / ``<=``.
    plausible_pressure = (
        (df['ap_lo'] < df['ap_hi'])
        & df['ap_hi'].between(90, 250)
        & df['ap_lo'].between(60, 150)
    )
    df = df[plausible_pressure]

    # The identifier and the raw age column are not returned.
    return df.drop(['id', 'age'], axis=1)
|
|||
|
|
|
|||
|
|
# Load the cleaned dataset
df = load_and_preprocess_data()

# Split the table into the feature matrix and the target column
X = df.drop('cardio', axis=1)
y = df['cardio']

# Columns routed to the scaler vs. the one-hot encoder
continuous_features = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
categorical_features = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

# Preprocessor: standardise the continuous columns, one-hot encode the
# categorical ones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Full pipeline: preprocessing followed by the XGBoost classifier, so the
# saved artifact carries its own preprocessing
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42))
])

# Train on every row returned by the loader (no hold-out split is made here)
pipeline.fit(X, y)

# Save the fitted pipeline. The path is built with os.path.join so the
# separator is correct on every platform, and the target directory is
# created first so joblib.dump does not fail when it is missing.
model_path = os.path.join("module2_predictor", "cardio_predictor_model.pkl")
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)

print(f"模型已保存到: {model_path}")
print("训练完成!")
|