#!/usr/bin/env python3
"""
CardioAI - Model Training and Saving Script

Train an XGBoost classifier for cardiovascular disease prediction and save
the fitted preprocessing + model pipeline (plus a small JSON metadata file)
to disk.
"""
import json
import sys
import warnings

import pandas as pd
import numpy as np

warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
from xgboost import XGBClassifier
import joblib

# Constants
DATA_PATH = "/Users/anthony/ai_lesson/ai_zhangzhongshan/心血管疾病.xlsx"
# NOTE(review): the space after "PycharmProjects/" looks like a typo in this
# path — confirm against the actual project layout before relying on it.
MODEL_PATH = "/Users/anthony/PycharmProjects/ sad_test01/aicodes/module2_predictor/cardio_predictor_model.pkl"


def load_and_preprocess_data():
    """
    Load and preprocess cardiovascular disease data.

    Steps: read the Excel file, normalize column names, derive ``age_years``
    (from age in days) and ``bmi``, remove physiologically implausible blood
    pressure rows, and derive a coarse ``bmi_category``.

    Returns:
        pd.DataFrame: cleaned DataFrame with engineered feature columns and
        the ``cardio`` target (when present in the source data).
    """
    print("📊 Step 1: Loading data...")

    # Load data from the source Excel workbook.
    df = pd.read_excel(DATA_PATH, engine='openpyxl')
    print(f" Original data shape: {df.shape}")

    # Map messy source column names onto canonical names via case-insensitive
    # substring match. Includes the OCR-style typo "cholestero1" (digit one).
    column_mapping = {
        'age(天)': 'age',
        'cholestero1': 'cholesterol',
        'cholesterol': 'cholesterol',
        'gluc': 'gluc',
        'ap_hi': 'ap_hi',
        'ap_lo': 'ap_lo',
        'cardio': 'cardio',
        'gender': 'gender',
        'height': 'height',
        'weight': 'weight',
        'smoke': 'smoke',
        'alco': 'alco',
        'active': 'active',
        'id': 'id',
    }
    # Iterate over a snapshot of the column list because we rename in place.
    for col in df.columns.tolist():
        for key, value in column_mapping.items():
            if key.lower() in col.lower():
                df.rename(columns={col: value}, inplace=True)
                break

    print("📊 Step 2: Performing feature engineering...")

    # 1. Convert age (stored in days) to whole years.
    if 'age' in df.columns:
        df['age_years'] = np.round(df['age'] / 365.25).astype(int)

    # 2. Body-mass index from height (cm) and weight (kg).
    if 'height' in df.columns and 'weight' in df.columns:
        df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)
        df['bmi'] = df['bmi'].round(2)

    # 3. Handle blood pressure outliers:
    #    - drop rows where diastolic (ap_lo) >= systolic (ap_hi)
    #    - drop rows with values outside a plausible clinical range
    if 'ap_hi' in df.columns and 'ap_lo' in df.columns:
        initial_count = len(df)
        df = df[df['ap_lo'] < df['ap_hi']]
        systolic_mask = (df['ap_hi'] >= 90) & (df['ap_hi'] <= 250)
        diastolic_mask = (df['ap_lo'] >= 60) & (df['ap_lo'] <= 150)
        df = df[systolic_mask & diastolic_mask]
        print(f" Removed {initial_count - len(df)} blood pressure outliers")

    # 4. Cholesterol / glucose stay numeric ordinal codes; XGBoost handles
    #    numeric categorical features directly.

    # 5. Coarse WHO-style BMI buckets (kept alongside continuous BMI).
    if 'bmi' in df.columns:
        def categorize_bmi(bmi):
            if bmi < 18.5:
                return 0  # Underweight
            elif 18.5 <= bmi < 25:
                return 1  # Normal
            elif 25 <= bmi < 30:
                return 2  # Overweight
            else:
                return 3  # Obese

        df['bmi_category'] = df['bmi'].apply(categorize_bmi)

    # Warn (but do not fail) when expected columns are absent — downstream
    # code filters to available columns anyway.
    required_cols = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                     'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"⚠️ Warning: Missing columns: {missing_cols}")

    return df


def prepare_features_target(df):
    """
    Prepare features (X) and target (y) for model training.

    Args:
        df (pd.DataFrame): output of :func:`load_and_preprocess_data`.

    Returns:
        tuple: ``(X, y, available_features)`` where ``y`` is ``None`` when
        the ``cardio`` target column is missing.
    """
    print("📊 Step 3: Preparing features and target...")

    # The 11 canonical features from the task requirements.
    feature_cols = [
        'age_years',    # Age in years (converted from days)
        'gender',       # Gender (1: female, 2: male)
        'height',       # Height in cm
        'weight',       # Weight in kg
        'ap_hi',        # Systolic blood pressure
        'ap_lo',        # Diastolic blood pressure
        'cholesterol',  # Cholesterol level (1: normal, 2: above normal, 3: well above normal)
        'gluc',         # Glucose level (1: normal, 2: above normal, 3: well above normal)
        'smoke',        # Smoking (0: no, 1: yes)
        'alco',         # Alcohol intake (0: no, 1: yes)
        'active',       # Physical activity (0: no, 1: yes)
    ]

    # Keep only the features that actually survived preprocessing.
    available_features = [col for col in feature_cols if col in df.columns]
    print(f" Available features: {available_features}")

    # Add the engineered BMI as an extra continuous feature when present.
    if 'bmi' in df.columns and 'bmi' not in available_features:
        available_features.append('bmi')

    # Drop identifier and raw age-in-days columns — never model inputs.
    if 'id' in df.columns:
        df = df.drop(columns=['id'])
    if 'age' in df.columns:
        df = df.drop(columns=['age'])

    X = df[available_features].copy()
    y = df['cardio'].copy() if 'cardio' in df.columns else None

    print(f" Features shape: {X.shape}")
    print(f" Target shape: {y.shape if y is not None else 'N/A'}")

    return X, y, available_features


def create_preprocessing_pipeline(feature_cols):
    """
    Create a preprocessing ColumnTransformer: standard-scale the numeric
    features and one-hot encode the categorical ones.

    Args:
        feature_cols (list[str]): features actually present in the data.

    Returns:
        ColumnTransformer: unfitted preprocessor (fit inside the pipeline).
    """
    print("📊 Step 4: Creating preprocessing pipeline...")

    numeric_features = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo']
    if 'bmi' in feature_cols:
        numeric_features.append('bmi')
    categorical_features = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

    # Restrict both lists to columns that exist in this dataset.
    numeric_features = [feat for feat in numeric_features if feat in feature_cols]
    categorical_features = [feat for feat in categorical_features if feat in feature_cols]

    print(f" Numeric features: {numeric_features}")
    print(f" Categorical features: {categorical_features}")

    # drop='first' avoids the dummy-variable trap; handle_unknown='ignore'
    # keeps inference robust to category codes unseen during training.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(drop='first', sparse_output=False,
                                  handle_unknown='ignore'), categorical_features),
        ],
        remainder='drop',  # Drop any columns not specified
    )

    return preprocessor


def train_model(X, y, preprocessor):
    """
    Train an XGBoost classifier wrapped in a preprocessing pipeline and
    print evaluation metrics on a held-out 20% test split.

    Args:
        X (pd.DataFrame): feature matrix.
        y (pd.Series): binary target (0/1 cardio label).
        preprocessor (ColumnTransformer): unfitted preprocessing step.

    Returns:
        Pipeline: the fitted preprocessing + classifier pipeline.
    """
    print("📊 Step 5: Training XGBoost model...")

    # Stratified split preserves the class balance in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f" Training set: {X_train.shape}")
    print(f" Test set: {X_test.shape}")

    # NOTE: `use_label_encoder` was removed here — it was deprecated in
    # XGBoost 1.6 and removed in 2.0; passing it breaks on current versions.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            eval_metric='logloss',
        )),
    ])

    pipeline.fit(X_train, y_train)

    print("📊 Step 6: Evaluating model performance...")
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    print(f" Accuracy: {accuracy:.4f}")
    print("\n Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Disease', 'Disease']))

    cm = confusion_matrix(y_test, y_pred)
    print(f" Confusion Matrix:")
    print(f" [[{cm[0,0]:4d} {cm[0,1]:4d}]")
    print(f" [{cm[1,0]:4d} {cm[1,1]:4d}]]")

    # Feature importance is best-effort: name reconstruction depends on
    # sklearn internals, so failures are reported but never fatal.
    try:
        # get_feature_names_out (sklearn >= 1.0; the file already requires
        # >= 1.2 via sparse_output=) yields names aligned with the
        # transformed feature order, replacing the old truncation hack.
        feature_names = list(
            pipeline.named_steps['preprocessor'].get_feature_names_out()
        )
        importances = pipeline.named_steps['classifier'].feature_importances_
        importance_df = pd.DataFrame({
            'feature': feature_names[:len(importances)],
            'importance': importances,
        }).sort_values('importance', ascending=False)
        print("\n Top 10 Feature Importances:")
        print(importance_df.head(10).to_string(index=False))
    except Exception as e:
        print(f" Could not extract feature importance: {e}")

    return pipeline


def save_model(pipeline, model_path):
    """
    Save the trained pipeline to disk, plus a JSON metadata sidecar
    describing the expected input features.

    Args:
        pipeline (Pipeline): fitted preprocessing + classifier pipeline.
        model_path (str): destination ``.pkl`` path.
    """
    print("📊 Step 7: Saving model...")

    joblib.dump(pipeline, model_path)
    print(f" Model saved to: {model_path}")

    # Record the raw input column names the fitted preprocessor expects
    # (numeric list + categorical list, in transformer order).
    metadata = {
        'model_version': '1.0',
        'description': 'CardioAI Cardiovascular Disease Prediction Model',
        'features': pipeline.named_steps['preprocessor'].transformers_[0][2]
                    + pipeline.named_steps['preprocessor'].transformers_[1][2],
        'creation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    }

    metadata_path = model_path.replace('.pkl', '_metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f" Metadata saved to: {metadata_path}")


def main():
    """
    Main training workflow: load → feature prep → pipeline → train → save.

    Returns:
        int: process exit status (0 on success, 1 on failure).
    """
    print("=" * 60)
    print("🚀 CardioAI - Model Training Script")
    print("=" * 60)

    try:
        df = load_and_preprocess_data()

        X, y, feature_cols = prepare_features_target(df)
        if y is None:
            print("❌ Error: Target column 'cardio' not found in data!")
            return 1

        preprocessor = create_preprocessing_pipeline(feature_cols)
        pipeline = train_model(X, y, preprocessor)
        save_model(pipeline, MODEL_PATH)

        print("\n✅ Model training completed successfully!")
        print("\n📋 Model Summary:")
        print(f" - Model type: XGBoost Classifier")
        print(f" - Features used: {len(feature_cols)}")
        print(f" - Training samples: {len(X)}")
        print(f" - Model saved to: {MODEL_PATH}")
    except Exception as e:
        print(f"\n❌ Error during model training: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    # sys.exit (not the site-module `exit`) is the reliable way to propagate
    # the status code when run as a script.
    sys.exit(main())