Files
sad_test01/aicodes/module2_predictor/train_and_save.py
anthony-s526 ffc4192ff0 Add CardioAI project with three modules
- Module 1: Dashboard for cardiovascular disease data visualization
- Module 2: Machine learning predictor with Flask API
- Module 3: Voice assistant with DeepSeek and CosyVoice integration
- Add .gitignore for proper file exclusion
- Update requirements and documentation

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 21:26:10 +08:00

348 lines
12 KiB
Python

#!/usr/bin/env python3
"""
CardioAI - Model Training and Saving Script
Train XGBoost classifier for cardiovascular disease prediction and save the model.
"""
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
from xgboost import XGBClassifier
import joblib
# Constants
# Source spreadsheet with the raw cardiovascular-disease records.
DATA_PATH = "/Users/anthony/ai_lesson/ai_zhangzhongshan/心血管疾病.xlsx"
# Destination for the trained pipeline.
# FIX: the original path contained a stray space after "PycharmProjects/"
# ("/ sad_test01/"), which points at a non-existent directory; the repo
# layout confirms the directory is "sad_test01" with no leading space.
MODEL_PATH = "/Users/anthony/PycharmProjects/sad_test01/aicodes/module2_predictor/cardio_predictor_model.pkl"
def load_and_preprocess_data():
    """
    Load and preprocess cardiovascular disease data.

    Reads the Excel workbook at DATA_PATH, normalizes column names,
    derives age_years / bmi / bmi_category, and filters blood-pressure
    outliers.

    Returns cleaned DataFrame with features and target.
    """
    print("📊 Step 1: Loading data...")
    # Load data
    df = pd.read_excel(DATA_PATH, engine='openpyxl')
    print(f" Original data shape: {df.shape}")
    # Standardize column names (similar to module1).
    # Maps raw / misspelled source headers to canonical names.
    column_mapping = {
        'age(天)': 'age',
        'cholestero1': 'cholesterol',  # known typo variant (digit 1)
        'cholesterol': 'cholesterol',
        'gluc': 'gluc',
        'ap_hi': 'ap_hi',
        'ap_lo': 'ap_lo',
        'cardio': 'cardio',
        'gender': 'gender',
        'height': 'height',
        'weight': 'weight',
        'smoke': 'smoke',
        'alco': 'alco',
        'active': 'active',
        'id': 'id'
    }
    # Rename columns to standardized names.
    # NOTE(review): matching is case-insensitive SUBSTRING matching and stops
    # at the first hit, so the dict's insertion order matters (e.g. the
    # 'cholestero1' typo key must precede 'cholesterol'). A column whose name
    # merely contains a key (e.g. anything containing 'id') will also be
    # renamed — confirm this is intended for this dataset's headers.
    df_columns = df.columns.tolist()
    for col in df_columns:
        for key, value in column_mapping.items():
            if key.lower() in col.lower():
                df.rename(columns={col: value}, inplace=True)
                break  # first matching key wins for this column
    print("📊 Step 2: Performing feature engineering...")
    # 1. Convert age (in days) to years (age_years); 365.25 accounts for
    #    leap years, rounded to the nearest whole year.
    if 'age' in df.columns:
        df['age_years'] = np.round(df['age'] / 365.25).astype(int)
    # 2. Calculate BMI = weight(kg) / height(m)^2 (height arrives in cm)
    if 'height' in df.columns and 'weight' in df.columns:
        df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)
        df['bmi'] = df['bmi'].round(2)
    # 3. Handle blood pressure outliers
    # Remove records where diastolic (ap_lo) >= systolic (ap_hi)
    if 'ap_hi' in df.columns and 'ap_lo' in df.columns:
        initial_count = len(df)
        df = df[df['ap_lo'] < df['ap_hi']]
        # Remove extreme blood pressure values (outside plausible ranges:
        # systolic 90-250, diastolic 60-150)
        systolic_mask = (df['ap_hi'] >= 90) & (df['ap_hi'] <= 250)
        diastolic_mask = (df['ap_lo'] >= 60) & (df['ap_lo'] <= 150)
        df = df[systolic_mask & diastolic_mask]
        print(f" Removed {initial_count - len(df)} blood pressure outliers")
    # 4. Convert cholesterol and glucose to categorical (keep as numeric for model)
    # XGBoost can handle numeric categorical features directly
    # 5. Create BMI categories for potential use (but we'll use continuous BMI for model)
    if 'bmi' in df.columns:
        def categorize_bmi(bmi):
            # Ordinal WHO-style BMI buckets
            if bmi < 18.5:
                return 0  # Underweight
            elif 18.5 <= bmi < 25:
                return 1  # Normal
            elif 25 <= bmi < 30:
                return 2  # Overweight
            else:
                return 3  # Obese
        df['bmi_category'] = df['bmi'].apply(categorize_bmi)
    # Check required columns; missing ones are only warned about, not fatal —
    # downstream feature selection copes with absent columns.
    required_cols = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                     'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"⚠️ Warning: Missing columns: {missing_cols}")
    return df
def prepare_features_target(df):
    """
    Prepare features (X) and target (y) for model training.

    Selects the 11 standard cardiovascular features (plus derived bmi when
    present) and splits off the 'cardio' target column. Returns
    (X, y, feature_names); y is None when 'cardio' is absent.
    """
    print("📊 Step 3: Preparing features and target...")
    # Candidate model inputs, matching the dataset's documented schema.
    candidate_features = (
        'age_years',    # Age in years (converted from days)
        'gender',       # Gender (1: female, 2: male)
        'height',       # Height in cm
        'weight',       # Weight in kg
        'ap_hi',        # Systolic blood pressure
        'ap_lo',        # Diastolic blood pressure
        'cholesterol',  # 1: normal, 2: above normal, 3: well above normal
        'gluc',         # 1: normal, 2: above normal, 3: well above normal
        'smoke',        # Smoking (0: no, 1: yes)
        'alco',         # Alcohol intake (0: no, 1: yes)
        'active',       # Physical activity (0: no, 1: yes)
    )
    # Keep only the candidates this DataFrame actually carries.
    available_features = [name for name in candidate_features if name in df.columns]
    print(f" Available features: {available_features}")
    # Derived bmi joins the feature set whenever the frame provides it.
    if 'bmi' in df.columns and 'bmi' not in available_features:
        available_features.append('bmi')
    # Drop bookkeeping columns that must never leak into the features.
    for bookkeeping_col in ('id', 'age'):
        if bookkeeping_col in df.columns:
            df = df.drop(columns=[bookkeeping_col])
    X = df[available_features].copy()
    y = df['cardio'].copy() if 'cardio' in df.columns else None
    print(f" Features shape: {X.shape}")
    print(f" Target shape: {y.shape if y is not None else 'N/A'}")
    return X, y, available_features
def create_preprocessing_pipeline(feature_cols):
    """
    Create a preprocessing pipeline with ColumnTransformer.

    Continuous columns are standardized; discrete-coded columns are one-hot
    encoded with the first level dropped. Columns not listed in either
    group are discarded by the transformer.
    """
    print("📊 Step 4: Creating preprocessing pipeline...")
    # Columns treated as continuous measurements.
    scaled_candidates = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo']
    if 'bmi' in feature_cols:
        scaled_candidates.append('bmi')
    # Columns treated as discrete category codes.
    encoded_candidates = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
    # Restrict each group to columns actually present in the feature set.
    numeric_features = [name for name in scaled_candidates if name in feature_cols]
    categorical_features = [name for name in encoded_candidates if name in feature_cols]
    print(f" Numeric features: {numeric_features}")
    print(f" Categorical features: {categorical_features}")
    # handle_unknown='ignore' keeps inference robust to unseen codes;
    # remainder='drop' discards any column not assigned above.
    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
             categorical_features),
        ],
        remainder='drop'
    )
def train_model(X, y, preprocessor):
    """
    Train an XGBoost classifier wrapped in a preprocessing pipeline.

    Splits X/y 80/20 (stratified), fits the pipeline on the training split,
    prints accuracy, classification report, confusion matrix, and a
    best-effort feature-importance table for the held-out split.

    Parameters:
        X: feature DataFrame.
        y: binary target series (0 = no disease, 1 = disease).
        preprocessor: unfitted ColumnTransformer used as the first stage.

    Returns:
        The fitted sklearn Pipeline (preprocessor + XGBClassifier).
    """
    print("📊 Step 5: Training XGBoost model...")
    # Stratified split keeps the class balance identical across splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f" Training set: {X_train.shape}")
    print(f" Test set: {X_test.shape}")
    # Create full pipeline.
    # FIX: removed use_label_encoder=False — the parameter was deprecated in
    # XGBoost 1.3 and removed in 2.0, where passing it only emits an
    # "unused parameters" warning.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            eval_metric='logloss'
        ))
    ])
    # Train the model
    pipeline.fit(X_train, y_train)
    # Evaluate on the held-out split
    print("📊 Step 6: Evaluating model performance...")
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f" Accuracy: {accuracy:.4f}")
    print("\n Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Disease', 'Disease']))
    # Confusion matrix (rows: actual, cols: predicted)
    cm = confusion_matrix(y_test, y_pred)
    print(" Confusion Matrix:")
    print(f" [[{cm[0,0]:4d} {cm[0,1]:4d}]")
    print(f" [{cm[1,0]:4d} {cm[1,1]:4d}]]")
    # Feature importance — best effort: the transformers_ layout can differ
    # across sklearn versions, so failures are reported rather than raised.
    try:
        feature_names = []
        # Numeric feature names pass through StandardScaler unchanged
        num_features = pipeline.named_steps['preprocessor'].transformers_[0][2]
        feature_names.extend(num_features)
        # Categorical names expand during one-hot encoding
        cat_transformer = pipeline.named_steps['preprocessor'].transformers_[1][1]
        cat_features = pipeline.named_steps['preprocessor'].transformers_[1][2]
        if hasattr(cat_transformer, 'get_feature_names_out'):
            feature_names.extend(cat_transformer.get_feature_names_out(cat_features))
        else:
            # Fallback: just use the raw categorical column names
            feature_names.extend(cat_features)
        importances = pipeline.named_steps['classifier'].feature_importances_
        importance_df = pd.DataFrame({
            'feature': feature_names[:len(importances)],
            'importance': importances
        }).sort_values('importance', ascending=False)
        print("\n Top 10 Feature Importances:")
        print(importance_df.head(10).to_string(index=False))
    except Exception as e:
        print(f" Could not extract feature importance: {e}")
    return pipeline
def save_model(pipeline, model_path):
    """
    Save the trained pipeline to disk.

    Persists the pipeline with joblib and writes a sibling JSON metadata
    file describing the model version and its expected input columns.
    """
    import json

    print("📊 Step 7: Saving model...")
    # Persist the fitted pipeline itself
    joblib.dump(pipeline, model_path)
    print(f" Model saved to: {model_path}")
    # Record the raw input columns the pipeline expects (numeric + categorical),
    # so the serving side can validate incoming payloads.
    fitted_transformers = pipeline.named_steps['preprocessor'].transformers_
    expected_features = fitted_transformers[0][2] + fitted_transformers[1][2]
    metadata = {
        'model_version': '1.0',
        'description': 'CardioAI Cardiovascular Disease Prediction Model',
        'features': expected_features,
        'creation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }
    metadata_path = model_path.replace('.pkl', '_metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f" Metadata saved to: {metadata_path}")
def main():
    """
    Main training workflow: load data, prepare features, train, and persist.

    Returns 0 on success and 1 on failure; returns None (treated as success
    by exit codes) when the target column is missing.
    """
    banner = "=" * 60
    print(banner)
    print("🚀 CardioAI - Model Training Script")
    print(banner)
    try:
        # Pipeline of steps: load → features → preprocessor → train → save
        df = load_and_preprocess_data()
        X, y, feature_cols = prepare_features_target(df)
        if y is None:
            print("❌ Error: Target column 'cardio' not found in data!")
            return
        preprocessor = create_preprocessing_pipeline(feature_cols)
        pipeline = train_model(X, y, preprocessor)
        save_model(pipeline, MODEL_PATH)
        print("\n✅ Model training completed successfully!")
        print("\n📋 Model Summary:")
        print(" - Model type: XGBoost Classifier")
        print(f" - Features used: {len(feature_cols)}")
        print(f" - Training samples: {len(X)}")
        print(f" - Model saved to: {MODEL_PATH}")
    except Exception as e:
        # Surface the full traceback but keep the process exit orderly.
        print(f"\n❌ Error during model training: {e}")
        import traceback
        traceback.print_exc()
        return 1
    return 0
if __name__ == "__main__":
    # FIX: use sys.exit instead of the exit() builtin — exit() is a
    # site-module convenience intended for interactive use and is absent
    # when Python runs with site initialization disabled (python -S).
    import sys
    sys.exit(main())