Add CardioAI project with three modules

- Module 1: Dashboard for cardiovascular disease data visualization
- Module 2: Machine learning predictor with Flask API
- Module 3: Voice assistant with DeepSeek and CosyVoice integration
- Add .gitignore for proper file exclusion
- Update requirements and documentation

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
348
aicodes/module2_predictor/train_and_save.py
Normal file
348
aicodes/module2_predictor/train_and_save.py
Normal file
@@ -0,0 +1,348 @@
|
||||
#!/usr/bin/env python3
"""
CardioAI - Model Training and Saving Script
Train XGBoost classifier for cardiovascular disease prediction and save the model.
"""

import pandas as pd
import numpy as np
import warnings
# Silence third-party deprecation/convergence chatter during training runs.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
from xgboost import XGBClassifier
import joblib

# Constants
# Both paths are absolute and machine-specific; a portable deployment would
# read them from config or CLI arguments instead.
DATA_PATH = "/Users/anthony/ai_lesson/ai_zhangzhongshan/心血管疾病.xlsx"
# NOTE(review): the space after "PycharmProjects/" looks like a typo in the
# directory name — confirm the path actually exists before relying on it.
MODEL_PATH = "/Users/anthony/PycharmProjects/ sad_test01/aicodes/module2_predictor/cardio_predictor_model.pkl"
def load_and_preprocess_data():
    """
    Load and preprocess cardiovascular disease data.

    Reads the Excel file at DATA_PATH, normalizes column names, derives
    age-in-years, BMI and a BMI category, and filters blood-pressure
    outliers.

    Returns:
        pd.DataFrame: cleaned frame containing the engineered features and
        (when present in the source file) the 'cardio' target column.
    """
    print("📊 Step 1: Loading data...")
    # Load data
    df = pd.read_excel(DATA_PATH, engine='openpyxl')
    print(f" Original data shape: {df.shape}")

    # Standardize column names (similar to module1)
    # NOTE: 'cholestero1' (digit one) apparently covers a typo seen in the
    # source spreadsheet — presumably; verify against the actual file.
    column_mapping = {
        'age(天)': 'age',
        'cholestero1': 'cholesterol',
        'cholesterol': 'cholesterol',
        'gluc': 'gluc',
        'ap_hi': 'ap_hi',
        'ap_lo': 'ap_lo',
        'cardio': 'cardio',
        'gender': 'gender',
        'height': 'height',
        'weight': 'weight',
        'smoke': 'smoke',
        'alco': 'alco',
        'active': 'active',
        'id': 'id'
    }

    # Rename columns to standardized names.
    # WARNING(review): this is case-insensitive SUBSTRING matching, and the
    # first dict entry that matches wins — a column like 'smoker' would also
    # be renamed to 'smoke'. Order of column_mapping is therefore load-bearing.
    df_columns = df.columns.tolist()
    for col in df_columns:
        for key, value in column_mapping.items():
            if key.lower() in col.lower():
                df.rename(columns={col: value}, inplace=True)
                break

    print("📊 Step 2: Performing feature engineering...")

    # 1. Convert age (in days) to years (age_years)
    if 'age' in df.columns:
        df['age_years'] = np.round(df['age'] / 365.25).astype(int)

    # 2. Calculate BMI (weight kg / height m squared), rounded to 2 decimals
    if 'height' in df.columns and 'weight' in df.columns:
        df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)
        df['bmi'] = df['bmi'].round(2)

    # 3. Handle blood pressure outliers
    # Remove records where diastolic (ap_lo) >= systolic (ap_hi)
    if 'ap_hi' in df.columns and 'ap_lo' in df.columns:
        initial_count = len(df)
        df = df[df['ap_lo'] < df['ap_hi']]

        # Remove extreme blood pressure values (plausibility window:
        # systolic 90-250, diastolic 60-150)
        systolic_mask = (df['ap_hi'] >= 90) & (df['ap_hi'] <= 250)
        diastolic_mask = (df['ap_lo'] >= 60) & (df['ap_lo'] <= 150)
        df = df[systolic_mask & diastolic_mask]

        print(f" Removed {initial_count - len(df)} blood pressure outliers")

    # 4. Convert cholesterol and glucose to categorical (keep as numeric for model)
    # XGBoost can handle numeric categorical features directly

    # 5. Create BMI categories for potential use (but we'll use continuous BMI for model)
    if 'bmi' in df.columns:
        def categorize_bmi(bmi):
            # Standard WHO-style cut points mapped to ordinal codes 0-3.
            if bmi < 18.5:
                return 0  # Underweight
            elif 18.5 <= bmi < 25:
                return 1  # Normal
            elif 25 <= bmi < 30:
                return 2  # Overweight
            else:
                return 3  # Obese

        df['bmi_category'] = df['bmi'].apply(categorize_bmi)

    # Check required columns (warn only — missing columns are tolerated here
    # and handled downstream by prepare_features_target).
    required_cols = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                     'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"⚠️ Warning: Missing columns: {missing_cols}")

    return df
|
||||
def prepare_features_target(df):
    """
    Split the preprocessed frame into model inputs and label.

    Args:
        df: output of load_and_preprocess_data().

    Returns:
        tuple: (X, y, available_features) where X is the feature DataFrame,
        y is the 'cardio' Series (or None when that column is absent), and
        available_features is the ordered list of columns used in X.
    """
    print("📊 Step 3: Preparing features and target...")

    # The 11 canonical features for this dataset, in fixed order.
    candidate_features = (
        'age_years',    # Age in years (converted from days)
        'gender',       # Gender (1: female, 2: male)
        'height',       # Height in cm
        'weight',       # Weight in kg
        'ap_hi',        # Systolic blood pressure
        'ap_lo',        # Diastolic blood pressure
        'cholesterol',  # Cholesterol level (1: normal, 2: above, 3: well above)
        'gluc',         # Glucose level (1: normal, 2: above, 3: well above)
        'smoke',        # Smoking (0: no, 1: yes)
        'alco',         # Alcohol intake (0: no, 1: yes)
        'active',       # Physical activity (0: no, 1: yes)
    )

    selected = [name for name in candidate_features if name in df.columns]
    print(f" Available features: {selected}")

    # Derived BMI is appended as an extra feature whenever it exists.
    if 'bmi' in df.columns and 'bmi' not in selected:
        selected.append('bmi')

    # Identifier and raw day-count age carry no predictive value.
    for bookkeeping_col in ('id', 'age'):
        if bookkeeping_col in df.columns:
            df = df.drop(columns=[bookkeeping_col])

    X = df[selected].copy()
    y = df['cardio'].copy() if 'cardio' in df.columns else None

    print(f" Features shape: {X.shape}")
    print(f" Target shape: {y.shape if y is not None else 'N/A'}")

    return X, y, selected
|
||||
def create_preprocessing_pipeline(feature_cols):
    """
    Build the ColumnTransformer used ahead of the classifier.

    Numeric columns are standardized, categorical columns one-hot encoded
    (first level dropped, unknown levels ignored), and any other column is
    discarded.

    Args:
        feature_cols: list of feature names actually present in X.

    Returns:
        ColumnTransformer: unfitted preprocessing step.
    """
    print("📊 Step 4: Creating preprocessing pipeline...")

    # Candidate assignment of features to transformer branches.
    numeric_candidates = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo']
    if 'bmi' in feature_cols:
        numeric_candidates.append('bmi')
    categorical_candidates = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

    # Keep only candidates that are really among the selected features.
    numeric_features = [name for name in numeric_candidates if name in feature_cols]
    categorical_features = [name for name in categorical_candidates if name in feature_cols]

    print(f" Numeric features: {numeric_features}")
    print(f" Categorical features: {categorical_features}")

    encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', encoder, categorical_features),
        ],
        remainder='drop'  # Drop any columns not specified
    )

    return preprocessor
|
||||
def train_model(X, y, preprocessor):
    """
    Train an XGBoost classifier wrapped in a preprocessing pipeline.

    Args:
        X: feature DataFrame from prepare_features_target().
        y: binary 'cardio' target Series aligned with X.
        preprocessor: unfitted ColumnTransformer for X's columns.

    Returns:
        Pipeline: fitted sklearn Pipeline ('preprocessor' + 'classifier').
    """
    print("📊 Step 5: Training XGBoost model...")

    # Hold out 20% for evaluation; stratify preserves the class balance.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f" Training set: {X_train.shape}")
    print(f" Test set: {X_test.shape}")

    # Create full pipeline.
    # FIX: dropped `use_label_encoder=False` — the parameter was deprecated
    # in xgboost 1.6 and removed in 2.x, where passing it only produces a
    # "Parameters: { use_label_encoder } are not used" warning.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            eval_metric='logloss'
        ))
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split (report side effects only).
    _evaluate_model(pipeline, X_test, y_test)

    return pipeline


def _evaluate_model(pipeline, X_test, y_test):
    """Print accuracy, classification report, confusion matrix and feature importances."""
    print("📊 Step 6: Evaluating model performance...")

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f" Accuracy: {accuracy:.4f}")

    print("\n Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Disease', 'Disease']))

    # Confusion matrix, printed as [[TN FP] [FN TP]].
    cm = confusion_matrix(y_test, y_pred)
    print(" Confusion Matrix:")
    print(f" [[{cm[0,0]:4d} {cm[0,1]:4d}]")
    print(f" [{cm[1,0]:4d} {cm[1,1]:4d}]]")

    _print_feature_importance(pipeline)


def _print_feature_importance(pipeline):
    """Best-effort dump of the top-10 XGBoost feature importances (never raises)."""
    try:
        # Reconstruct post-preprocessing feature names:
        # numeric columns keep their names through StandardScaler...
        feature_names = []
        num_features = pipeline.named_steps['preprocessor'].transformers_[0][2]
        feature_names.extend(num_features)

        # ...while categorical columns expand via one-hot encoding.
        cat_transformer = pipeline.named_steps['preprocessor'].transformers_[1][1]
        cat_features = pipeline.named_steps['preprocessor'].transformers_[1][2]
        if hasattr(cat_transformer, 'get_feature_names_out'):
            feature_names.extend(cat_transformer.get_feature_names_out(cat_features))
        else:
            # Fallback: just use categorical feature names
            feature_names.extend(cat_features)

        importances = pipeline.named_steps['classifier'].feature_importances_

        # Truncate defensively in case name reconstruction over-counts.
        importance_df = pd.DataFrame({
            'feature': feature_names[:len(importances)],
            'importance': importances
        }).sort_values('importance', ascending=False)

        print("\n Top 10 Feature Importances:")
        print(importance_df.head(10).to_string(index=False))

    except Exception as e:
        # Diagnostics only — never let reporting break training.
        print(f" Could not extract feature importance: {e}")
|
||||
def save_model(pipeline, model_path):
    """
    Persist the fitted pipeline plus a small JSON metadata sidecar.

    Args:
        pipeline: fitted sklearn Pipeline from train_model().
        model_path: destination '.pkl' path; the metadata file is written
            next to it with a '_metadata.json' suffix.
    """
    import json

    print("📊 Step 7: Saving model...")

    # Serialize the whole pipeline (preprocessing + classifier) in one file.
    joblib.dump(pipeline, model_path)
    print(f" Model saved to: {model_path}")

    # Record which input columns the model expects, for consumers of the pickle.
    fitted_transformers = pipeline.named_steps['preprocessor'].transformers_
    metadata = {
        'model_version': '1.0',
        'description': 'CardioAI Cardiovascular Disease Prediction Model',
        'features': fitted_transformers[0][2] + fitted_transformers[1][2],
        'creation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    }

    metadata_path = model_path.replace('.pkl', '_metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f" Metadata saved to: {metadata_path}")
|
||||
def main():
    """
    Run the end-to-end training workflow: load and clean the data, build the
    feature matrix, fit the preprocessing/XGBoost pipeline, and persist it.

    Returns:
        int: process exit code — 0 on success, 1 on any failure.
    """
    print("=" * 60)
    print("🚀 CardioAI - Model Training Script")
    print("=" * 60)

    try:
        # Load and preprocess data
        df = load_and_preprocess_data()

        # Prepare features and target
        X, y, feature_cols = prepare_features_target(df)

        if y is None:
            print("❌ Error: Target column 'cardio' not found in data!")
            # BUG FIX: this was a bare `return` (None), which exit() maps to
            # status 0 — a missing target must be reported as a failure.
            return 1

        # Create preprocessing pipeline
        preprocessor = create_preprocessing_pipeline(feature_cols)

        # Train model
        pipeline = train_model(X, y, preprocessor)

        # Save model
        save_model(pipeline, MODEL_PATH)

        print("\n✅ Model training completed successfully!")
        print("\n📋 Model Summary:")
        print(" - Model type: XGBoost Classifier")
        print(f" - Features used: {len(feature_cols)}")
        print(f" - Training samples: {len(X)}")
        print(f" - Model saved to: {MODEL_PATH}")

    except Exception as e:
        # Top-level boundary: report, dump the traceback, signal failure.
        print(f"\n❌ Error during model training: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0
|
||||
if __name__ == "__main__":
    # `exit()` is injected by the `site` module and is not guaranteed to
    # exist in every runtime; raising SystemExit is the portable way to
    # propagate main()'s return value as the process exit code.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user