- Module 1: Dashboard for cardiovascular disease data visualization - Module 2: Machine learning predictor with Flask API - Module 3: Voice assistant with DeepSeek and CosyVoice integration - Add .gitignore for proper file exclusion - Update requirements and documentation Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
348 lines
12 KiB
Python
348 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
CardioAI - Model Training and Saving Script
|
|
Train XGBoost classifier for cardiovascular disease prediction and save the model.
|
|
"""
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
import warnings
|
|
warnings.filterwarnings('ignore')
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
from sklearn.compose import ColumnTransformer
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
|
|
import xgboost as xgb
|
|
from xgboost import XGBClassifier
|
|
import joblib
|
|
|
|
# Constants
# Source spreadsheet with the raw cardiovascular records
# (Chinese filename "心血管疾病.xlsx" = "cardiovascular disease").
DATA_PATH = "/Users/anthony/ai_lesson/ai_zhangzhongshan/心血管疾病.xlsx"
# Destination for the trained joblib pipeline.
# NOTE(review): the path contains a space after "PycharmProjects/" — looks
# accidental; confirm it matches the actual project directory name.
MODEL_PATH = "/Users/anthony/PycharmProjects/ sad_test01/aicodes/module2_predictor/cardio_predictor_model.pkl"
|
|
|
|
def load_and_preprocess_data():
|
|
"""
|
|
Load and preprocess cardiovascular disease data.
|
|
Returns cleaned DataFrame with features and target.
|
|
"""
|
|
print("📊 Step 1: Loading data...")
|
|
# Load data
|
|
df = pd.read_excel(DATA_PATH, engine='openpyxl')
|
|
print(f" Original data shape: {df.shape}")
|
|
|
|
# Standardize column names (similar to module1)
|
|
column_mapping = {
|
|
'age(天)': 'age',
|
|
'cholestero1': 'cholesterol',
|
|
'cholesterol': 'cholesterol',
|
|
'gluc': 'gluc',
|
|
'ap_hi': 'ap_hi',
|
|
'ap_lo': 'ap_lo',
|
|
'cardio': 'cardio',
|
|
'gender': 'gender',
|
|
'height': 'height',
|
|
'weight': 'weight',
|
|
'smoke': 'smoke',
|
|
'alco': 'alco',
|
|
'active': 'active',
|
|
'id': 'id'
|
|
}
|
|
|
|
# Rename columns to standardized names
|
|
df_columns = df.columns.tolist()
|
|
for col in df_columns:
|
|
for key, value in column_mapping.items():
|
|
if key.lower() in col.lower():
|
|
df.rename(columns={col: value}, inplace=True)
|
|
break
|
|
|
|
print("📊 Step 2: Performing feature engineering...")
|
|
|
|
# 1. Convert age (in days) to years (age_years)
|
|
if 'age' in df.columns:
|
|
df['age_years'] = np.round(df['age'] / 365.25).astype(int)
|
|
|
|
# 2. Calculate BMI
|
|
if 'height' in df.columns and 'weight' in df.columns:
|
|
df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)
|
|
df['bmi'] = df['bmi'].round(2)
|
|
|
|
# 3. Handle blood pressure outliers
|
|
# Remove records where diastolic (ap_lo) >= systolic (ap_hi)
|
|
if 'ap_hi' in df.columns and 'ap_lo' in df.columns:
|
|
initial_count = len(df)
|
|
df = df[df['ap_lo'] < df['ap_hi']]
|
|
|
|
# Remove extreme blood pressure values
|
|
systolic_mask = (df['ap_hi'] >= 90) & (df['ap_hi'] <= 250)
|
|
diastolic_mask = (df['ap_lo'] >= 60) & (df['ap_lo'] <= 150)
|
|
df = df[systolic_mask & diastolic_mask]
|
|
|
|
print(f" Removed {initial_count - len(df)} blood pressure outliers")
|
|
|
|
# 4. Convert cholesterol and glucose to categorical (keep as numeric for model)
|
|
# XGBoost can handle numeric categorical features directly
|
|
|
|
# 5. Create BMI categories for potential use (but we'll use continuous BMI for model)
|
|
if 'bmi' in df.columns:
|
|
def categorize_bmi(bmi):
|
|
if bmi < 18.5:
|
|
return 0 # Underweight
|
|
elif 18.5 <= bmi < 25:
|
|
return 1 # Normal
|
|
elif 25 <= bmi < 30:
|
|
return 2 # Overweight
|
|
else:
|
|
return 3 # Obese
|
|
|
|
df['bmi_category'] = df['bmi'].apply(categorize_bmi)
|
|
|
|
# Check required columns
|
|
required_cols = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
|
|
'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
|
|
|
|
missing_cols = [col for col in required_cols if col not in df.columns]
|
|
if missing_cols:
|
|
print(f"⚠️ Warning: Missing columns: {missing_cols}")
|
|
|
|
return df
|
|
|
|
def prepare_features_target(df):
    """
    Prepare features (X) and target (y) for model training.

    Args:
        df: cleaned DataFrame from load_and_preprocess_data().

    Returns:
        Tuple of (X, y, available_features):
        - X: copy of the selected feature columns,
        - y: the 'cardio' Series, or None if that column is absent,
        - available_features: the feature column names actually used.
    """
    print("📊 Step 3: Preparing features and target...")

    # The 11 canonical features for this dataset.
    feature_cols = [
        'age_years',    # Age in years (converted from days)
        'gender',       # Gender (1: female, 2: male)
        'height',       # Height in cm
        'weight',       # Weight in kg
        'ap_hi',        # Systolic blood pressure
        'ap_lo',        # Diastolic blood pressure
        'cholesterol',  # Cholesterol level (1: normal, 2: above, 3: well above)
        'gluc',         # Glucose level (1: normal, 2: above, 3: well above)
        'smoke',        # Smoking (0: no, 1: yes)
        'alco',         # Alcohol intake (0: no, 1: yes)
        'active'        # Physical activity (0: no, 1: yes)
    ]

    # Keep only the features actually present in this frame.
    available_features = [col for col in feature_cols if col in df.columns]
    print(f" Available features: {available_features}")

    # Engineered BMI is appended as an extra feature when present.
    if 'bmi' in df.columns and 'bmi' not in available_features:
        available_features.append('bmi')

    # NOTE: the original code also dropped 'id'/'age' from df here, but those
    # columns are never part of available_features, so the drops were dead
    # code and have been removed.

    X = df[available_features].copy()
    y = df['cardio'].copy() if 'cardio' in df.columns else None

    print(f" Features shape: {X.shape}")
    print(f" Target shape: {y.shape if y is not None else 'N/A'}")

    return X, y, available_features
|
|
|
|
def create_preprocessing_pipeline(feature_cols):
    """
    Build the ColumnTransformer applied ahead of the classifier.

    Numeric columns are standardized; categorical columns are one-hot
    encoded with the first level dropped and unknown levels ignored at
    predict time. Columns not listed are discarded.
    """
    print("📊 Step 4: Creating preprocessing pipeline...")

    # Candidate columns for each transformer branch.
    numeric_candidates = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo']
    if 'bmi' in feature_cols:
        numeric_candidates.append('bmi')
    categorical_candidates = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

    # Restrict to the columns actually present in our data.
    numeric_features = [feat for feat in numeric_candidates if feat in feature_cols]
    categorical_features = [feat for feat in categorical_candidates if feat in feature_cols]

    print(f" Numeric features: {numeric_features}")
    print(f" Categorical features: {categorical_features}")

    scaler = StandardScaler()
    encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

    return ColumnTransformer(
        transformers=[
            ('num', scaler, numeric_features),
            ('cat', encoder, categorical_features)
        ],
        remainder='drop'  # Drop any columns not specified
    )
|
|
|
|
def train_model(X, y, preprocessor):
    """
    Train XGBoost classifier with preprocessing pipeline.

    Args:
        X: feature DataFrame.
        y: binary target Series (0 = no disease, 1 = disease).
        preprocessor: unfitted ColumnTransformer; fitted as part of the
            pipeline on the training split.

    Returns:
        The fitted sklearn Pipeline (preprocessor + XGBClassifier).
    """
    print("📊 Step 5: Training XGBoost model...")

    # Stratified split preserves the class balance in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f" Training set: {X_train.shape}")
    print(f" Test set: {X_test.shape}")

    # Create full pipeline. The deprecated use_label_encoder kwarg was
    # removed: it has had no effect since xgboost 1.7 and is gone in 2.x,
    # where passing it only produces warnings.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            eval_metric='logloss'
        ))
    ])

    # Train the model, then report held-out performance.
    pipeline.fit(X_train, y_train)
    _evaluate_model(pipeline, X_test, y_test)
    _report_feature_importance(pipeline)

    return pipeline


def _evaluate_model(pipeline, X_test, y_test):
    """Print accuracy, classification report and confusion matrix."""
    print("📊 Step 6: Evaluating model performance...")

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f" Accuracy: {accuracy:.4f}")

    print("\n Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Disease', 'Disease']))

    # Confusion matrix (rows: true class, cols: predicted class).
    cm = confusion_matrix(y_test, y_pred)
    print(" Confusion Matrix:")  # was a placeholder-free f-string
    print(f" [[{cm[0,0]:4d} {cm[0,1]:4d}]")
    print(f" [{cm[1,0]:4d} {cm[1,1]:4d}]]")


def _report_feature_importance(pipeline):
    """Best-effort dump of the top-10 XGBoost feature importances."""
    try:
        feature_names = []
        preproc = pipeline.named_steps['preprocessor']

        # Numeric columns keep their original names.
        feature_names.extend(preproc.transformers_[0][2])

        # One-hot encoded categoricals expand to one name per retained level.
        cat_transformer = preproc.transformers_[1][1]
        cat_features = preproc.transformers_[1][2]
        if hasattr(cat_transformer, 'get_feature_names_out'):
            feature_names.extend(cat_transformer.get_feature_names_out(cat_features))
        else:
            # Fallback: just use the raw categorical feature names.
            feature_names.extend(cat_features)

        importances = pipeline.named_steps['classifier'].feature_importances_

        # Truncation guards against a name/importance length mismatch.
        importance_df = pd.DataFrame({
            'feature': feature_names[:len(importances)],
            'importance': importances
        }).sort_values('importance', ascending=False)

        print("\n Top 10 Feature Importances:")
        print(importance_df.head(10).to_string(index=False))

    except Exception as e:
        # Importance reporting is non-essential; never fail training over it.
        print(f" Could not extract feature importance: {e}")
|
|
|
|
def save_model(pipeline, model_path):
    """
    Save the trained pipeline to disk, plus a JSON metadata sidecar.

    Args:
        pipeline: fitted sklearn Pipeline to serialize with joblib.
        model_path: destination path for the .pkl file; the metadata file is
            written next to it with a '_metadata.json' suffix.
    """
    import json
    import os

    print("📊 Step 7: Saving model...")

    # Save the model
    joblib.dump(pipeline, model_path)
    print(f" Model saved to: {model_path}")

    # Record which input columns the model expects, for the serving side.
    # list() makes the concatenation JSON-serializable even if the
    # transformer column specs are tuples/arrays rather than lists.
    preproc = pipeline.named_steps['preprocessor']
    metadata = {
        'model_version': '1.0',
        'description': 'CardioAI Cardiovascular Disease Prediction Model',
        'features': list(preproc.transformers_[0][2]) +
                    list(preproc.transformers_[1][2]),
        'creation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    # splitext is robust to where '.pkl' appears in the path, unlike
    # str.replace which would substitute the first occurrence anywhere.
    metadata_path = os.path.splitext(model_path)[0] + '_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f" Metadata saved to: {metadata_path}")
|
|
|
|
def main():
    """
    Main training workflow.

    Returns:
        int: process exit code — 0 on success, 1 on any failure.
    """
    print("=" * 60)
    print("🚀 CardioAI - Model Training Script")
    print("=" * 60)

    try:
        # Load and preprocess data
        df = load_and_preprocess_data()

        # Prepare features and target
        X, y, feature_cols = prepare_features_target(df)

        if y is None:
            print("❌ Error: Target column 'cardio' not found in data!")
            # Bug fix: the original returned None here, which exit() maps to
            # status 0 (success) even though training failed.
            return 1

        # Create preprocessing pipeline
        preprocessor = create_preprocessing_pipeline(feature_cols)

        # Train model
        pipeline = train_model(X, y, preprocessor)

        # Save model
        save_model(pipeline, MODEL_PATH)

        print("\n✅ Model training completed successfully!")
        print("\n📋 Model Summary:")
        print(f" - Model type: XGBoost Classifier")
        print(f" - Features used: {len(feature_cols)}")
        print(f" - Training samples: {len(X)}")
        print(f" - Model saved to: {MODEL_PATH}")

    except Exception as e:
        print(f"\n❌ Error during model training: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0
|
|
|
|
if __name__ == "__main__":
    # raise SystemExit instead of the site-provided exit() helper: exit()
    # is meant for interactive sessions and is absent under `python -S`.
    raise SystemExit(main())