"""Train an XGBoost cardiovascular-disease classifier and save it to disk.

Running this module reads the Excel dataset at ``DATA_PATH``, derives the
``age_years`` and ``bmi`` features, drops rows with implausible blood
pressure readings, fits a preprocessing + XGBoost pipeline on all remaining
rows, and serialises the fitted pipeline with joblib.
"""

import os

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import joblib

# Location of the raw dataset (Excel workbook).
DATA_PATH = "D:\\AI_Coding\\data\\心血管疾病.xlsx"


def load_and_preprocess_data():
    """Load the dataset from ``DATA_PATH`` and return a cleaned DataFrame.

    Steps performed:
      * add ``age_years``: ``age`` (in days) divided by 365.25 and rounded
        to the nearest whole number;
      * add ``bmi``: ``weight / (height / 100) ** 2``
        (assumes height in cm and weight in kg -- TODO confirm);
      * keep only rows where diastolic pressure (``ap_lo``) is strictly
        below systolic pressure (``ap_hi``), ``ap_hi`` is within [90, 250]
        and ``ap_lo`` is within [60, 150];
      * drop the ``id`` and original ``age`` columns.

    Returns:
        pandas.DataFrame: the filtered frame with the engineered features.
    """
    df = pd.read_excel(DATA_PATH)

    # Feature engineering.
    df['age_years'] = (df['age'] / 365.25).round(0)
    df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

    # Outlier handling: one combined mask instead of three successive
    # filters. ``between`` is inclusive on both ends, matching >= / <=.
    plausible_pressure = (
        (df['ap_lo'] < df['ap_hi'])
        & df['ap_hi'].between(90, 250)
        & df['ap_lo'].between(60, 150)
    )
    df = df[plausible_pressure]

    # The row identifier and the raw age (superseded by age_years) are not
    # used as model inputs.
    df = df.drop(['id', 'age'], axis=1)

    return df


# Load the cleaned data.
df = load_and_preprocess_data()

# Split into feature matrix and target column.
X = df.drop('cardio', axis=1)
y = df['cardio']

# Columns that get standard-scaled vs. one-hot encoded.
continuous_features = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
categorical_features = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

# Per-column-group preprocessing.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Full pipeline: preprocessing followed by the classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42))
])

# Fit on every cleaned row (no hold-out split is made here).
pipeline.fit(X, y)

# Persist the fitted pipeline.
model_path = "module2_predictor\\cardio_predictor_model.pkl"
# Make sure the target directory exists so that a missing folder does not
# discard a finished training run with FileNotFoundError.
_model_dir = os.path.dirname(model_path)
if _model_dir:
    os.makedirs(_model_dir, exist_ok=True)
joblib.dump(pipeline, model_path)

print(f"模型已保存到: {model_path}")
print("训练完成!")