71 lines
1.9 KiB
Python
71 lines
1.9 KiB
Python
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
|
||
|
||
# Dataset path
|
||
# Absolute Windows-style path of the Excel workbook that
# load_and_preprocess_data() reads with pd.read_excel. The file name is
# Chinese for "cardiovascular disease".
DATA_PATH = "D:\\AI_Coding\\data\\心血管疾病.xlsx"
|
||
|
||
def load_and_preprocess_data(data_path=None):
    """Load the cardiovascular dataset and return a cleaned DataFrame.

    Processing steps, in order:

    1. Read the Excel workbook.
    2. Add ``age_years``: the ``age`` column (recorded in days) divided by
       365.25 and rounded to a whole number.
    3. Add ``bmi``: ``weight / (height / 100) ** 2``. The division by 100
       presumably converts centimetres to metres -- confirm against the
       dataset description.
    4. Keep only rows where ``ap_lo < ap_hi``, ``90 <= ap_hi <= 250`` and
       ``60 <= ap_lo <= 150``. Rows with missing blood-pressure values fail
       these comparisons and are therefore dropped too.
    5. Drop the ``id`` column and the original ``age`` column.

    Args:
        data_path: Optional location of the Excel workbook. When omitted
            (or ``None``) the module-level ``DATA_PATH`` is used, which is
            the original behaviour.

    Returns:
        pandas.DataFrame: the filtered rows with the two derived columns
        added and ``id`` / ``age`` removed.

    Raises:
        KeyError: if an expected column is missing from the workbook.
        Whatever ``pandas.read_excel`` raises when the file cannot be read.
    """
    # Resolve the default at call time so a caller may rebind DATA_PATH
    # before invoking the function.
    if data_path is None:
        data_path = DATA_PATH
    df = pd.read_excel(data_path)

    # Feature engineering.
    df['age_years'] = (df['age'] / 365.25).round(0)
    df['bmi'] = df['weight'] / (df['height'] / 100) ** 2

    # Outlier handling: one combined mask instead of three successive
    # filters. `between` is inclusive on both ends, matching >= / <=.
    plausible_pressure = (
        (df['ap_lo'] < df['ap_hi'])
        & df['ap_hi'].between(90, 250)
        & df['ap_lo'].between(60, 150)
    )
    df = df[plausible_pressure]

    # The identifier and the raw age-in-days column are not used as features.
    return df.drop(columns=['id', 'age'])
|
||
|
||
# Load the cleaned dataset.
df = load_and_preprocess_data()

# Separate the feature matrix from the target column.
X = df.drop('cardio', axis=1)
y = df['cardio']

# Columns that are standard-scaled vs. one-hot encoded.
continuous_features = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
categorical_features = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

# Preprocessing: scale the continuous columns, one-hot encode the
# categorical ones.
# NOTE(review): OneHotEncoder() is left at its default
# handle_unknown='error', so predicting on a category value not seen during
# fit raises -- confirm that this is the desired behaviour.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Full pipeline: preprocessing followed by the gradient-boosted classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42))
])

# Train the model.
# NOTE(review): the pipeline is fit on every row; no hold-out split is made
# here, so this script produces no evaluation metric.
pipeline.fit(X, y)

# Save the model.
model_path = "module2_predictor\\cardio_predictor_model.pkl"

# joblib.dump does not create missing directories; without this the script
# would fail only after the whole fit has completed. On platforms where the
# backslash is not a path separator, dirname() is empty and nothing is created.
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(pipeline, model_path)

print(f"模型已保存到: {model_path}")
print("训练完成!")