- Module 1: Dashboard for cardiovascular disease data visualization - Module 2: Machine learning predictor with Flask API - Module 3: Voice assistant with DeepSeek and CosyVoice integration - Add .gitignore for proper file exclusion - Update requirements and documentation Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
348 lines
12 KiB
Python
348 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
CardioAI - Model Training and Saving Script
|
|
Train XGBoost classifier for cardiovascular disease prediction and save the model.
|
|
"""
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
import warnings
|
|
warnings.filterwarnings('ignore')
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
from sklearn.compose import ColumnTransformer
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
|
|
import xgboost as xgb
|
|
from xgboost import XGBClassifier
|
|
import joblib
|
|
|
|
# Constants
# Source spreadsheet with the raw cardiovascular records
# (Chinese filename "心血管疾病.xlsx" = "cardiovascular disease").
DATA_PATH = "/Users/anthony/ai_lesson/ai_zhangzhongshan/心血管疾病.xlsx"
# Destination for the trained joblib pipeline.
# NOTE(review): the path contains a space after "PycharmProjects/" — looks
# accidental; confirm it matches the actual project directory name.
MODEL_PATH = "/Users/anthony/PycharmProjects/ sad_test01/aicodes/module2_predictor/cardio_predictor_model.pkl"
|
|
|
|
def load_and_preprocess_data():
|
|
"""
|
|
Load and preprocess cardiovascular disease data.
|
|
Returns cleaned DataFrame with features and target.
|
|
"""
|
|
print("📊 Step 1: Loading data...")
|
|
# Load data
|
|
df = pd.read_excel(DATA_PATH, engine='openpyxl')
|
|
print(f" Original data shape: {df.shape}")
|
|
|
|
# Standardize column names (similar to module1)
|
|
column_mapping = {
|
|
'age(天)': 'age',
|
|
'cholestero1': 'cholesterol',
|
|
'cholesterol': 'cholesterol',
|
|
'gluc': 'gluc',
|
|
'ap_hi': 'ap_hi',
|
|
'ap_lo': 'ap_lo',
|
|
'cardio': 'cardio',
|
|
'gender': 'gender',
|
|
'height': 'height',
|
|
'weight': 'weight',
|
|
'smoke': 'smoke',
|
|
'alco': 'alco',
|
|
'active': 'active',
|
|
'id': 'id'
|
|
}
|
|
|
|
# Rename columns to standardized names
|
|
df_columns = df.columns.tolist()
|
|
for col in df_columns:
|
|
for key, value in column_mapping.items():
|
|
if key.lower() in col.lower():
|
|
df.rename(columns={col: value}, inplace=True)
|
|
break
|
|
|
|
print("📊 Step 2: Performing feature engineering...")
|
|
|
|
# 1. Convert age (in days) to years (age_years)
|
|
if 'age' in df.columns:
|
|
df['age_years'] = np.round(df['age'] / 365.25).astype(int)
|
|
|
|
# 2. Calculate BMI
|
|
if 'height' in df.columns and 'weight' in df.columns:
|
|
df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)
|
|
df['bmi'] = df['bmi'].round(2)
|
|
|
|
# 3. Handle blood pressure outliers
|
|
# Remove records where diastolic (ap_lo) >= systolic (ap_hi)
|
|
if 'ap_hi' in df.columns and 'ap_lo' in df.columns:
|
|
initial_count = len(df)
|
|
df = df[df['ap_lo'] < df['ap_hi']]
|
|
|
|
# Remove extreme blood pressure values
|
|
systolic_mask = (df['ap_hi'] >= 90) & (df['ap_hi'] <= 250)
|
|
diastolic_mask = (df['ap_lo'] >= 60) & (df['ap_lo'] <= 150)
|
|
df = df[systolic_mask & diastolic_mask]
|
|
|
|
print(f" Removed {initial_count - len(df)} blood pressure outliers")
|
|
|
|
# 4. Convert cholesterol and glucose to categorical (keep as numeric for model)
|
|
# XGBoost can handle numeric categorical features directly
|
|
|
|
# 5. Create BMI categories for potential use (but we'll use continuous BMI for model)
|
|
if 'bmi' in df.columns:
|
|
def categorize_bmi(bmi):
|
|
if bmi < 18.5:
|
|
return 0 # Underweight
|
|
elif 18.5 <= bmi < 25:
|
|
return 1 # Normal
|
|
elif 25 <= bmi < 30:
|
|
return 2 # Overweight
|
|
else:
|
|
return 3 # Obese
|
|
|
|
df['bmi_category'] = df['bmi'].apply(categorize_bmi)
|
|
|
|
# Check required columns
|
|
required_cols = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
|
|
'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
|
|
|
|
missing_cols = [col for col in required_cols if col not in df.columns]
|
|
if missing_cols:
|
|
print(f"⚠️ Warning: Missing columns: {missing_cols}")
|
|
|
|
return df
|
|
|
|
def prepare_features_target(df):
    """
    Prepare features (X) and target (y) for model training.

    Args:
        df: cleaned DataFrame from load_and_preprocess_data().

    Returns:
        Tuple of (X, y, available_features):
        - X: copy of the selected feature columns,
        - y: the 'cardio' Series, or None if that column is absent,
        - available_features: the feature column names actually used.
    """
    print("📊 Step 3: Preparing features and target...")

    # The 11 canonical features for this dataset.
    feature_cols = [
        'age_years',    # Age in years (converted from days)
        'gender',       # Gender (1: female, 2: male)
        'height',       # Height in cm
        'weight',       # Weight in kg
        'ap_hi',        # Systolic blood pressure
        'ap_lo',        # Diastolic blood pressure
        'cholesterol',  # Cholesterol level (1: normal, 2: above, 3: well above)
        'gluc',         # Glucose level (1: normal, 2: above, 3: well above)
        'smoke',        # Smoking (0: no, 1: yes)
        'alco',         # Alcohol intake (0: no, 1: yes)
        'active'        # Physical activity (0: no, 1: yes)
    ]

    # Keep only the features actually present in this frame.
    available_features = [col for col in feature_cols if col in df.columns]
    print(f" Available features: {available_features}")

    # Engineered BMI is appended as an extra feature when present.
    if 'bmi' in df.columns and 'bmi' not in available_features:
        available_features.append('bmi')

    # NOTE: the original code also dropped 'id'/'age' from df here, but those
    # columns are never part of available_features, so the drops were dead
    # code and have been removed.

    X = df[available_features].copy()
    y = df['cardio'].copy() if 'cardio' in df.columns else None

    print(f" Features shape: {X.shape}")
    print(f" Target shape: {y.shape if y is not None else 'N/A'}")

    return X, y, available_features
|
|
|
|
def create_preprocessing_pipeline(feature_cols):
    """
    Build the ColumnTransformer applied ahead of the classifier.

    Numeric columns are standardized; categorical columns are one-hot
    encoded with the first level dropped and unknown levels ignored at
    predict time. Columns not listed are discarded.
    """
    print("📊 Step 4: Creating preprocessing pipeline...")

    # Candidate columns for each transformer branch.
    numeric_candidates = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo']
    if 'bmi' in feature_cols:
        numeric_candidates.append('bmi')
    categorical_candidates = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

    # Restrict to the columns actually present in our data.
    numeric_features = [feat for feat in numeric_candidates if feat in feature_cols]
    categorical_features = [feat for feat in categorical_candidates if feat in feature_cols]

    print(f" Numeric features: {numeric_features}")
    print(f" Categorical features: {categorical_features}")

    scaler = StandardScaler()
    encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

    return ColumnTransformer(
        transformers=[
            ('num', scaler, numeric_features),
            ('cat', encoder, categorical_features)
        ],
        remainder='drop'  # Drop any columns not specified
    )
|
|
|
|
def train_model(X, y, preprocessor):
    """
    Train XGBoost classifier with preprocessing pipeline.

    Args:
        X: feature DataFrame.
        y: binary target Series (0 = no disease, 1 = disease).
        preprocessor: unfitted ColumnTransformer; fitted as part of the
            pipeline on the training split.

    Returns:
        The fitted sklearn Pipeline (preprocessor + XGBClassifier).
    """
    print("📊 Step 5: Training XGBoost model...")

    # Stratified split preserves the class balance in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f" Training set: {X_train.shape}")
    print(f" Test set: {X_test.shape}")

    # Create full pipeline. The deprecated use_label_encoder kwarg was
    # removed: it has had no effect since xgboost 1.7 and is gone in 2.x,
    # where passing it only produces warnings.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            eval_metric='logloss'
        ))
    ])

    # Train the model, then report held-out performance.
    pipeline.fit(X_train, y_train)
    _evaluate_model(pipeline, X_test, y_test)
    _report_feature_importance(pipeline)

    return pipeline


def _evaluate_model(pipeline, X_test, y_test):
    """Print accuracy, classification report and confusion matrix."""
    print("📊 Step 6: Evaluating model performance...")

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f" Accuracy: {accuracy:.4f}")

    print("\n Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Disease', 'Disease']))

    # Confusion matrix (rows: true class, cols: predicted class).
    cm = confusion_matrix(y_test, y_pred)
    print(" Confusion Matrix:")  # was a placeholder-free f-string
    print(f" [[{cm[0,0]:4d} {cm[0,1]:4d}]")
    print(f" [{cm[1,0]:4d} {cm[1,1]:4d}]]")


def _report_feature_importance(pipeline):
    """Best-effort dump of the top-10 XGBoost feature importances."""
    try:
        feature_names = []
        preproc = pipeline.named_steps['preprocessor']

        # Numeric columns keep their original names.
        feature_names.extend(preproc.transformers_[0][2])

        # One-hot encoded categoricals expand to one name per retained level.
        cat_transformer = preproc.transformers_[1][1]
        cat_features = preproc.transformers_[1][2]
        if hasattr(cat_transformer, 'get_feature_names_out'):
            feature_names.extend(cat_transformer.get_feature_names_out(cat_features))
        else:
            # Fallback: just use the raw categorical feature names.
            feature_names.extend(cat_features)

        importances = pipeline.named_steps['classifier'].feature_importances_

        # Truncation guards against a name/importance length mismatch.
        importance_df = pd.DataFrame({
            'feature': feature_names[:len(importances)],
            'importance': importances
        }).sort_values('importance', ascending=False)

        print("\n Top 10 Feature Importances:")
        print(importance_df.head(10).to_string(index=False))

    except Exception as e:
        # Importance reporting is non-essential; never fail training over it.
        print(f" Could not extract feature importance: {e}")
|
|
|
|
def save_model(pipeline, model_path):
    """
    Save the trained pipeline to disk, plus a JSON metadata sidecar.

    Args:
        pipeline: fitted sklearn Pipeline to serialize with joblib.
        model_path: destination path for the .pkl file; the metadata file is
            written next to it with a '_metadata.json' suffix.
    """
    import json
    import os

    print("📊 Step 7: Saving model...")

    # Save the model
    joblib.dump(pipeline, model_path)
    print(f" Model saved to: {model_path}")

    # Record which input columns the model expects, for the serving side.
    # list() makes the concatenation JSON-serializable even if the
    # transformer column specs are tuples/arrays rather than lists.
    preproc = pipeline.named_steps['preprocessor']
    metadata = {
        'model_version': '1.0',
        'description': 'CardioAI Cardiovascular Disease Prediction Model',
        'features': list(preproc.transformers_[0][2]) +
                    list(preproc.transformers_[1][2]),
        'creation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    # splitext is robust to where '.pkl' appears in the path, unlike
    # str.replace which would substitute the first occurrence anywhere.
    metadata_path = os.path.splitext(model_path)[0] + '_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f" Metadata saved to: {metadata_path}")
|
|
|
|
def main():
    """
    Main training workflow.

    Returns:
        int: process exit code — 0 on success, 1 on any failure.
    """
    print("=" * 60)
    print("🚀 CardioAI - Model Training Script")
    print("=" * 60)

    try:
        # Load and preprocess data
        df = load_and_preprocess_data()

        # Prepare features and target
        X, y, feature_cols = prepare_features_target(df)

        if y is None:
            print("❌ Error: Target column 'cardio' not found in data!")
            # Bug fix: the original returned None here, which exit() maps to
            # status 0 (success) even though training failed.
            return 1

        # Create preprocessing pipeline
        preprocessor = create_preprocessing_pipeline(feature_cols)

        # Train model
        pipeline = train_model(X, y, preprocessor)

        # Save model
        save_model(pipeline, MODEL_PATH)

        print("\n✅ Model training completed successfully!")
        print("\n📋 Model Summary:")
        print(f" - Model type: XGBoost Classifier")
        print(f" - Features used: {len(feature_cols)}")
        print(f" - Training samples: {len(X)}")
        print(f" - Model saved to: {MODEL_PATH}")

    except Exception as e:
        print(f"\n❌ Error during model training: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0
|
|
|
|
if __name__ == "__main__":
    # raise SystemExit instead of the site-provided exit() helper: exit()
    # is meant for interactive sessions and is absent under `python -S`.
    raise SystemExit(main())