Add CardioAI project with three modules

- Module 1: Dashboard for cardiovascular disease data visualization
- Module 2: Machine learning predictor with Flask API
- Module 3: Voice assistant with DeepSeek and CosyVoice integration
- Add .gitignore for proper file exclusion
- Update requirements and documentation

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
348
aicodes/module2_predictor/train_and_save.py
Normal file
348
aicodes/module2_predictor/train_and_save.py
Normal file
@@ -0,0 +1,348 @@
|
||||
#!/usr/bin/env python3
"""
CardioAI - Model Training and Saving Script
Train XGBoost classifier for cardiovascular disease prediction and save the model.
"""

import pandas as pd
import numpy as np
import warnings
# Silence third-party deprecation/convergence chatter during training runs.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
from xgboost import XGBClassifier
import joblib

# Constants
# Both paths are absolute and machine-specific; a portable deployment would
# read them from config or CLI arguments instead.
DATA_PATH = "/Users/anthony/ai_lesson/ai_zhangzhongshan/心血管疾病.xlsx"
# NOTE(review): the space after "PycharmProjects/" looks like a typo in the
# directory name — confirm the path actually exists before relying on it.
MODEL_PATH = "/Users/anthony/PycharmProjects/ sad_test01/aicodes/module2_predictor/cardio_predictor_model.pkl"
def load_and_preprocess_data():
    """
    Load and preprocess cardiovascular disease data.

    Reads the Excel file at DATA_PATH, normalizes column names, derives
    age-in-years, BMI and a BMI category, and filters blood-pressure
    outliers.

    Returns:
        pd.DataFrame: cleaned frame containing the engineered features and
        (when present in the source file) the 'cardio' target column.
    """
    print("📊 Step 1: Loading data...")
    # Load data
    df = pd.read_excel(DATA_PATH, engine='openpyxl')
    print(f" Original data shape: {df.shape}")

    # Standardize column names (similar to module1)
    # NOTE: 'cholestero1' (digit one) apparently covers a typo seen in the
    # source spreadsheet — presumably; verify against the actual file.
    column_mapping = {
        'age(天)': 'age',
        'cholestero1': 'cholesterol',
        'cholesterol': 'cholesterol',
        'gluc': 'gluc',
        'ap_hi': 'ap_hi',
        'ap_lo': 'ap_lo',
        'cardio': 'cardio',
        'gender': 'gender',
        'height': 'height',
        'weight': 'weight',
        'smoke': 'smoke',
        'alco': 'alco',
        'active': 'active',
        'id': 'id'
    }

    # Rename columns to standardized names.
    # WARNING(review): this is case-insensitive SUBSTRING matching, and the
    # first dict entry that matches wins — a column like 'smoker' would also
    # be renamed to 'smoke'. Order of column_mapping is therefore load-bearing.
    df_columns = df.columns.tolist()
    for col in df_columns:
        for key, value in column_mapping.items():
            if key.lower() in col.lower():
                df.rename(columns={col: value}, inplace=True)
                break

    print("📊 Step 2: Performing feature engineering...")

    # 1. Convert age (in days) to years (age_years)
    if 'age' in df.columns:
        df['age_years'] = np.round(df['age'] / 365.25).astype(int)

    # 2. Calculate BMI (weight kg / height m squared), rounded to 2 decimals
    if 'height' in df.columns and 'weight' in df.columns:
        df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)
        df['bmi'] = df['bmi'].round(2)

    # 3. Handle blood pressure outliers
    # Remove records where diastolic (ap_lo) >= systolic (ap_hi)
    if 'ap_hi' in df.columns and 'ap_lo' in df.columns:
        initial_count = len(df)
        df = df[df['ap_lo'] < df['ap_hi']]

        # Remove extreme blood pressure values (plausibility window:
        # systolic 90-250, diastolic 60-150)
        systolic_mask = (df['ap_hi'] >= 90) & (df['ap_hi'] <= 250)
        diastolic_mask = (df['ap_lo'] >= 60) & (df['ap_lo'] <= 150)
        df = df[systolic_mask & diastolic_mask]

        print(f" Removed {initial_count - len(df)} blood pressure outliers")

    # 4. Convert cholesterol and glucose to categorical (keep as numeric for model)
    # XGBoost can handle numeric categorical features directly

    # 5. Create BMI categories for potential use (but we'll use continuous BMI for model)
    if 'bmi' in df.columns:
        def categorize_bmi(bmi):
            # Standard WHO-style cut points mapped to ordinal codes 0-3.
            if bmi < 18.5:
                return 0  # Underweight
            elif 18.5 <= bmi < 25:
                return 1  # Normal
            elif 25 <= bmi < 30:
                return 2  # Overweight
            else:
                return 3  # Obese

        df['bmi_category'] = df['bmi'].apply(categorize_bmi)

    # Check required columns (warn only — missing columns are tolerated here
    # and handled downstream by prepare_features_target).
    required_cols = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                     'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"⚠️ Warning: Missing columns: {missing_cols}")

    return df
|
||||
def prepare_features_target(df):
    """
    Split the preprocessed frame into model inputs and label.

    Args:
        df: output of load_and_preprocess_data().

    Returns:
        tuple: (X, y, available_features) where X is the feature DataFrame,
        y is the 'cardio' Series (or None when that column is absent), and
        available_features is the ordered list of columns used in X.
    """
    print("📊 Step 3: Preparing features and target...")

    # The 11 canonical features for this dataset, in fixed order.
    candidate_features = (
        'age_years',    # Age in years (converted from days)
        'gender',       # Gender (1: female, 2: male)
        'height',       # Height in cm
        'weight',       # Weight in kg
        'ap_hi',        # Systolic blood pressure
        'ap_lo',        # Diastolic blood pressure
        'cholesterol',  # Cholesterol level (1: normal, 2: above, 3: well above)
        'gluc',         # Glucose level (1: normal, 2: above, 3: well above)
        'smoke',        # Smoking (0: no, 1: yes)
        'alco',         # Alcohol intake (0: no, 1: yes)
        'active',       # Physical activity (0: no, 1: yes)
    )

    selected = [name for name in candidate_features if name in df.columns]
    print(f" Available features: {selected}")

    # Derived BMI is appended as an extra feature whenever it exists.
    if 'bmi' in df.columns and 'bmi' not in selected:
        selected.append('bmi')

    # Identifier and raw day-count age carry no predictive value.
    for bookkeeping_col in ('id', 'age'):
        if bookkeeping_col in df.columns:
            df = df.drop(columns=[bookkeeping_col])

    X = df[selected].copy()
    y = df['cardio'].copy() if 'cardio' in df.columns else None

    print(f" Features shape: {X.shape}")
    print(f" Target shape: {y.shape if y is not None else 'N/A'}")

    return X, y, selected
|
||||
def create_preprocessing_pipeline(feature_cols):
    """
    Build the ColumnTransformer used ahead of the classifier.

    Numeric columns are standardized, categorical columns one-hot encoded
    (first level dropped, unknown levels ignored), and any other column is
    discarded.

    Args:
        feature_cols: list of feature names actually present in X.

    Returns:
        ColumnTransformer: unfitted preprocessing step.
    """
    print("📊 Step 4: Creating preprocessing pipeline...")

    # Candidate assignment of features to transformer branches.
    numeric_candidates = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo']
    if 'bmi' in feature_cols:
        numeric_candidates.append('bmi')
    categorical_candidates = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

    # Keep only candidates that are really among the selected features.
    numeric_features = [name for name in numeric_candidates if name in feature_cols]
    categorical_features = [name for name in categorical_candidates if name in feature_cols]

    print(f" Numeric features: {numeric_features}")
    print(f" Categorical features: {categorical_features}")

    encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', encoder, categorical_features),
        ],
        remainder='drop'  # Drop any columns not specified
    )

    return preprocessor
|
||||
def train_model(X, y, preprocessor):
    """
    Train an XGBoost classifier wrapped in a preprocessing pipeline.

    Args:
        X: feature DataFrame from prepare_features_target().
        y: binary 'cardio' target Series aligned with X.
        preprocessor: unfitted ColumnTransformer for X's columns.

    Returns:
        Pipeline: fitted sklearn Pipeline ('preprocessor' + 'classifier').
    """
    print("📊 Step 5: Training XGBoost model...")

    # Hold out 20% for evaluation; stratify preserves the class balance.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f" Training set: {X_train.shape}")
    print(f" Test set: {X_test.shape}")

    # Create full pipeline.
    # FIX: dropped `use_label_encoder=False` — the parameter was deprecated
    # in xgboost 1.6 and removed in 2.x, where passing it only produces a
    # "Parameters: { use_label_encoder } are not used" warning.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            eval_metric='logloss'
        ))
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split (report side effects only).
    _evaluate_model(pipeline, X_test, y_test)

    return pipeline


def _evaluate_model(pipeline, X_test, y_test):
    """Print accuracy, classification report, confusion matrix and feature importances."""
    print("📊 Step 6: Evaluating model performance...")

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f" Accuracy: {accuracy:.4f}")

    print("\n Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Disease', 'Disease']))

    # Confusion matrix, printed as [[TN FP] [FN TP]].
    cm = confusion_matrix(y_test, y_pred)
    print(" Confusion Matrix:")
    print(f" [[{cm[0,0]:4d} {cm[0,1]:4d}]")
    print(f" [{cm[1,0]:4d} {cm[1,1]:4d}]]")

    _print_feature_importance(pipeline)


def _print_feature_importance(pipeline):
    """Best-effort dump of the top-10 XGBoost feature importances (never raises)."""
    try:
        # Reconstruct post-preprocessing feature names:
        # numeric columns keep their names through StandardScaler...
        feature_names = []
        num_features = pipeline.named_steps['preprocessor'].transformers_[0][2]
        feature_names.extend(num_features)

        # ...while categorical columns expand via one-hot encoding.
        cat_transformer = pipeline.named_steps['preprocessor'].transformers_[1][1]
        cat_features = pipeline.named_steps['preprocessor'].transformers_[1][2]
        if hasattr(cat_transformer, 'get_feature_names_out'):
            feature_names.extend(cat_transformer.get_feature_names_out(cat_features))
        else:
            # Fallback: just use categorical feature names
            feature_names.extend(cat_features)

        importances = pipeline.named_steps['classifier'].feature_importances_

        # Truncate defensively in case name reconstruction over-counts.
        importance_df = pd.DataFrame({
            'feature': feature_names[:len(importances)],
            'importance': importances
        }).sort_values('importance', ascending=False)

        print("\n Top 10 Feature Importances:")
        print(importance_df.head(10).to_string(index=False))

    except Exception as e:
        # Diagnostics only — never let reporting break training.
        print(f" Could not extract feature importance: {e}")
|
||||
def save_model(pipeline, model_path):
    """
    Persist the fitted pipeline plus a small JSON metadata sidecar.

    Args:
        pipeline: fitted sklearn Pipeline from train_model().
        model_path: destination '.pkl' path; the metadata file is written
            next to it with a '_metadata.json' suffix.
    """
    import json

    print("📊 Step 7: Saving model...")

    # Serialize the whole pipeline (preprocessing + classifier) in one file.
    joblib.dump(pipeline, model_path)
    print(f" Model saved to: {model_path}")

    # Record which input columns the model expects, for consumers of the pickle.
    fitted_transformers = pipeline.named_steps['preprocessor'].transformers_
    metadata = {
        'model_version': '1.0',
        'description': 'CardioAI Cardiovascular Disease Prediction Model',
        'features': fitted_transformers[0][2] + fitted_transformers[1][2],
        'creation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    }

    metadata_path = model_path.replace('.pkl', '_metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f" Metadata saved to: {metadata_path}")
|
||||
def main():
    """
    Run the end-to-end training workflow: load and clean the data, build the
    feature matrix, fit the preprocessing/XGBoost pipeline, and persist it.

    Returns:
        int: process exit code — 0 on success, 1 on any failure.
    """
    print("=" * 60)
    print("🚀 CardioAI - Model Training Script")
    print("=" * 60)

    try:
        # Load and preprocess data
        df = load_and_preprocess_data()

        # Prepare features and target
        X, y, feature_cols = prepare_features_target(df)

        if y is None:
            print("❌ Error: Target column 'cardio' not found in data!")
            # BUG FIX: this was a bare `return` (None), which exit() maps to
            # status 0 — a missing target must be reported as a failure.
            return 1

        # Create preprocessing pipeline
        preprocessor = create_preprocessing_pipeline(feature_cols)

        # Train model
        pipeline = train_model(X, y, preprocessor)

        # Save model
        save_model(pipeline, MODEL_PATH)

        print("\n✅ Model training completed successfully!")
        print("\n📋 Model Summary:")
        print(" - Model type: XGBoost Classifier")
        print(f" - Features used: {len(feature_cols)}")
        print(f" - Training samples: {len(X)}")
        print(f" - Model saved to: {MODEL_PATH}")

    except Exception as e:
        # Top-level boundary: report, dump the traceback, signal failure.
        print(f"\n❌ Error during model training: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0
|
||||
if __name__ == "__main__":
    # `exit()` is injected by the `site` module and is not guaranteed to
    # exist in every runtime; raising SystemExit is the portable way to
    # propagate main()'s return value as the process exit code.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user