Add CardioAI project with usage instructions

- Add comprehensive README.md with setup and usage instructions
- Add .env.example template (sanitized, no real API keys)
- Add root-level .gitignore to exclude .env and generated files
- Add all project modules (dashboard, predictor)
- Add data file and requirements.txt
2026-04-02 20:11:24 +08:00
parent 9133925f0a
commit b5b299a682
9 changed files with 2277 additions and 0 deletions
--- a/CardioAI/module2_predictor/app.py
+++ b/CardioAI/module2_predictor/app.py
@@ -0,0 +1,187 @@
+"""
+CardioAI 模块2: Flask API服务
+心血管疾病风险预测 - 后端接口
+"""
+
+import joblib
+import numpy as np
+import pandas as pd
+from flask import Flask, request, jsonify, render_template
+from pathlib import Path
+
+# ==================== Constants ====================
+# Derive the project root from this file's location
+# (<root>/module2_predictor/app.py) rather than a machine-specific absolute
+# path, so the service works from any checkout directory and on any OS.
+CODE_ROOT = Path(__file__).resolve().parents[1]
+MODEL_PATH = CODE_ROOT / "module2_predictor" / "cardio_predictor_model.pkl"
+
+# ==================== Flask application ====================
+app = Flask(__name__,
+            template_folder='templates',
+            static_folder='static')
+
+# Module-level cache for the loaded model; filled lazily by load_model().
+model = None
+
+
+def load_model():
+    """Return the predictor, reading it from MODEL_PATH on first use.
+
+    The loaded object is cached in the module-level ``model`` variable, so
+    subsequent calls return the cached instance without reloading the file.
+    """
+    global model
+    if model is not None:
+        return model
+
+    print("📂 正在加载模型...")
+    model = joblib.load(MODEL_PATH)
+    print("✅ 模型加载成功!")
+    return model
+
+
+# ==================== 路由定义 ====================
+@app.route('/')
+def index():
+    """Serve the front-end page rendered from the ``index.html`` template."""
+    page = render_template('index.html')
+    return page
+
+
+# Raw inputs expected in the request body, in the column order used at
+# training time. The derived 'bmi' column is appended after these.
+_RAW_FEATURE_NAMES = (
+    'age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
+    'cholesterol', 'gluc', 'smoke', 'alco', 'active',
+)
+
+
+def _build_feature_row(data):
+    """Convert a request payload into the model's 12-value feature row.
+
+    Args:
+        data: Mapping that contains every name in ``_RAW_FEATURE_NAMES``.
+
+    Returns:
+        A list with the 11 raw values as floats (training order) followed by
+        the computed BMI.
+
+    Raises:
+        ValueError: If a value is not a finite number, or ``height`` is not
+            positive (BMI would be undefined).
+    """
+    row = []
+    for name in _RAW_FEATURE_NAMES:
+        try:
+            value = float(data[name])
+        except (TypeError, ValueError):
+            # float() raises TypeError for null/list/dict values and
+            # ValueError for malformed strings; report both as bad input.
+            raise ValueError(f'字段 {name} 不是有效数字') from None
+        if not np.isfinite(value):
+            # "nan" / "inf" parse successfully but are meaningless here.
+            raise ValueError(f'字段 {name} 必须是有限数值')
+        row.append(value)
+
+    height = row[_RAW_FEATURE_NAMES.index('height')]
+    weight = row[_RAW_FEATURE_NAMES.index('weight')]
+    if height <= 0:
+        # Reject instead of silently feeding a fake BMI of 0 to the model.
+        raise ValueError('height 必须大于 0')
+
+    # Same formula as the training script: weight / (height / 100) ** 2.
+    row.append(weight / ((height / 100) ** 2))
+    return row
+
+
+@app.route('/predict_cardio', methods=['POST'])
+def predict_cardio():
+    """Predict cardiovascular disease risk from a JSON POST body.
+
+    The body must be a JSON object holding the 11 raw features listed in
+    ``_RAW_FEATURE_NAMES``; BMI is derived from ``weight`` and ``height``.
+
+    Returns:
+        A JSON response. On success: ``success``, ``prediction`` (int),
+        ``prediction_label``, ``probability`` (percentages rounded to two
+        decimals), ``risk_level`` and ``recommendation``. On failure:
+        ``success`` = False and ``error``, with status 400 for missing or
+        invalid input and 500 for unexpected errors.
+    """
+    try:
+        # silent=True makes a non-JSON body or wrong Content-Type yield None
+        # instead of raising, so the client gets a 400 rather than a 500.
+        data = request.get_json(silent=True)
+
+        if not data:
+            return jsonify({
+                'success': False,
+                'error': '未收到数据'
+            }), 400
+
+        if not isinstance(data, dict):
+            # e.g. a JSON array or scalar: field lookup would be meaningless.
+            return jsonify({
+                'success': False,
+                'error': '请求体必须是JSON对象'
+            }), 400
+
+        missing_fields = [col for col in _RAW_FEATURE_NAMES if col not in data]
+        if missing_fields:
+            return jsonify({
+                'success': False,
+                'error': f'缺少必要字段: {", ".join(missing_fields)}'
+            }), 400
+
+        # Raises ValueError on bad values; handled below as a 400.
+        features = _build_feature_row(data)
+
+        # Column names and order must match those used at training time.
+        feature_columns = list(_RAW_FEATURE_NAMES) + ['bmi']
+        X_input = pd.DataFrame([features], columns=feature_columns)
+
+        # Load the model on first use.
+        predictor = load_model()
+
+        prediction = int(predictor.predict(X_input)[0])
+        # Run predict_proba once and read both class probabilities from it.
+        probabilities = predictor.predict_proba(X_input)[0]
+        prob_healthy = float(probabilities[0])
+        prob_risk = float(probabilities[1])
+
+        result = {
+            'success': True,
+            'prediction': prediction,
+            'prediction_label': '有风险' if prediction == 1 else '健康',
+            'probability': {
+                '健康': round(prob_healthy * 100, 2),
+                '有风险': round(prob_risk * 100, 2)
+            },
+            'risk_level': get_risk_level(prob_risk),
+            'recommendation': get_recommendation(prob_risk, data)
+        }
+
+        return jsonify(result)
+
+    except ValueError as e:
+        return jsonify({
+            'success': False,
+            'error': f'数据格式错误: {str(e)}'
+        }), 400
+
+    except Exception as e:
+        # Top-level boundary: keep the traceback in the server log.
+        # NOTE(review): the response still echoes the exception text to the
+        # client; consider a generic message if internals must not leak.
+        app.logger.exception('predict_cardio failed')
+        return jsonify({
+            'success': False,
+            'error': f'预测失败: {str(e)}'
+        }), 500
+
+
+def get_risk_level(probability: float) -> str:
+    """Map a risk probability to one of four labelled risk tiers.
+
+    Tiers are half-open: below 0.3, below 0.5, below 0.7, and everything
+    else (the highest tier).
+    """
+    tiers = (
+        (0.3, '🟢 低风险'),
+        (0.5, '🟡 中低风险'),
+        (0.7, '🟠 中高风险'),
+    )
+    for upper_bound, label in tiers:
+        if probability < upper_bound:
+            return label
+    return '🔴 高风险'
+
+
+def get_recommendation(probability: float, data: dict) -> str:
+    """Return the advice text for a risk probability.
+
+    Uses the same thresholds as the risk tiers (0.3 / 0.5 / 0.7). ``data``
+    is accepted for interface compatibility but is not consulted.
+    """
+    advice_by_ceiling = (
+        (0.3, '继续保持健康的生活方式，定期体检。'),
+        (0.5, '建议适当增加运动，注意饮食均衡。'),
+        (0.7, '建议咨询医生，制定健康管理计划。'),
+    )
+    matching = (text for ceiling, text in advice_by_ceiling if probability < ceiling)
+    return next(matching, '⚠️ 风险较高，请尽快就医检查。')
+
+
+@app.route('/health', methods=['GET'])
+def health_check():
+    """Report service status, name and version as JSON."""
+    payload = {
+        'status': 'healthy',
+        'service': 'CardioAI Cardiovascular Prediction API',
+        'version': '1.0.0',
+    }
+    return jsonify(payload)
+
+
+# ==================== 启动应用 ====================
+if __name__ == '__main__':
+    import os
+
+    # The Werkzeug interactive debugger lets anyone who can reach the server
+    # execute arbitrary Python, so it must never be combined with a public
+    # bind address. Debug mode is opt-in (FLASK_DEBUG=1) and, when enabled,
+    # the server listens on loopback only.
+    debug_mode = os.environ.get('FLASK_DEBUG', '0') == '1'
+    host = '127.0.0.1' if debug_mode else '0.0.0.0'
+
+    print("\n" + "="*60)
+    print("❤️ CardioAI 心血管疾病风险预测 API")
+    print("="*60)
+    print(f"📂 模型路径: {MODEL_PATH}")
+    print(f"🌐 启动地址: http://localhost:5001")
+    print("="*60 + "\n")
+
+    # Load the model up front so the first request does not pay the cost.
+    load_model()
+
+    # Start the Flask development server.
+    app.run(
+        host=host,
+        port=5001,
+        debug=debug_mode
+    )
--- a/CardioAI/module2_predictor/templates/index.html
+++ b/CardioAI/module2_predictor/templates/index.html
--- a/CardioAI/module2_predictor/train_and_save.py
+++ b/CardioAI/module2_predictor/train_and_save.py
@@ -0,0 +1,199 @@
+"""
+CardioAI 模块2: 模型训练脚本
+心血管疾病风险预测模型 - 训练与保存
+"""
+
+import pandas as pd
+import numpy as np
+import joblib
+from pathlib import Path
+from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
+from xgboost import XGBClassifier
+
+# ==================== Constants ====================
+# Derive the project root from this file's location
+# (<root>/module2_predictor/train_and_save.py) rather than a machine-specific
+# absolute path, so training works from any checkout directory and on any OS.
+CODE_ROOT = Path(__file__).resolve().parents[1]
+DATA_PATH = CODE_ROOT / "data" / "心血管疾病.xlsx"
+MODEL_PATH = CODE_ROOT / "module2_predictor" / "cardio_predictor_model.pkl"
+
+# Feature column groups. The model is trained on the derived 'age_years'
+# column (the raw 'age' column is not among the training features), so that
+# is the name listed here.
+CONTINUOUS_FEATURES = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
+CATEGORICAL_FEATURES = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
+
+
+def load_and_clean_data(file_path: Path) -> pd.DataFrame:
+    """Read the Excel dataset, derive features and drop implausible rows.
+
+    Steps, in order:
+      1. Add ``age_years`` (``age`` / 365, rounded, as int) and ``bmi``
+         (``weight`` / (``height`` / 100) ** 2).
+      2. Keep only rows where ``ap_lo`` < ``ap_hi``.
+      3. Keep only rows with ``ap_hi`` in [90, 250].
+      4. Keep only rows with ``ap_lo`` in [60, 150].
+
+    Progress and the number of rows removed at each step are printed.
+
+    Args:
+        file_path: Location of the ``.xlsx`` file (read with openpyxl).
+
+    Returns:
+        The filtered DataFrame including the two derived columns.
+    """
+    print(f"📂 正在加载数据: {file_path}")
+
+    records = pd.read_excel(file_path, engine='openpyxl')
+    print(f"✅ 数据加载成功，共 {len(records)} 条记录")
+
+    # Work on a private copy of the loaded frame.
+    records = records.copy()
+
+    # Derived features.
+    records['age_years'] = (records['age'] / 365).round().astype(int)
+    height_in_metres = records['height'] / 100
+    records['bmi'] = records['weight'] / (height_in_metres ** 2)
+
+    # Rows whose second pressure reading is not below the first are invalid.
+    rows_before = len(records)
+    records = records[records['ap_lo'] < records['ap_hi']]
+    print(f"🗑️ 删除舒张压>=收缩压的记录: {rows_before - len(records)} 条")
+
+    # ap_hi must lie within [90, 250] (bounds inclusive).
+    rows_before = len(records)
+    records = records[records['ap_hi'].between(90, 250)]
+    dropped_ap_hi = rows_before - len(records)
+
+    # ap_lo must lie within [60, 150] (bounds inclusive).
+    rows_before = len(records)
+    records = records[records['ap_lo'].between(60, 150)]
+    dropped_ap_lo = rows_before - len(records)
+    print(f"🗑️ 删除血压异常值: 收缩压 {dropped_ap_hi} 条, 舒张压 {dropped_ap_lo} 条")
+
+    print(f"✅ 数据清洗完成，剩余 {len(records)} 条记录")
+    return records
+
+
+def prepare_features(df: pd.DataFrame) -> tuple:
+    """Split a cleaned frame into the feature matrix and the target.
+
+    Only the 12 model features are kept; other columns such as ``id`` and
+    the raw ``age`` are left out.
+
+    Args:
+        df: Frame containing the 12 feature columns and ``cardio``.
+
+    Returns:
+        ``(X, y, feature_columns)``: a copy of the feature columns, a copy
+        of the ``cardio`` column, and the ordered list of feature names.
+    """
+    feature_columns = [
+        'age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
+        'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'bmi',
+    ]
+
+    # Copies keep later modifications from touching the caller's frame.
+    features = df.loc[:, feature_columns].copy()
+    target = df['cardio'].copy()
+
+    print(f"📊 特征数量: {len(feature_columns)}")
+    print(f"📊 特征列: {feature_columns}")
+
+    return features, target, feature_columns
+
+
+def build_pipeline() -> Pipeline:
+    """Build the unfitted preprocessing + XGBoost classification pipeline.
+
+    Continuous columns are standardised and categorical columns are one-hot
+    encoded (categories unseen during fitting are ignored at predict time);
+    any other column is dropped before reaching the classifier.
+
+    Returns:
+        A ``Pipeline`` with steps ``'preprocessor'`` and ``'classifier'``.
+    """
+    print("🔧 正在构建Pipeline...")
+
+    # Continuous feature columns.
+    continuous_cols = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
+
+    # Categorical feature columns.
+    categorical_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
+
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ('num', StandardScaler(), continuous_cols),
+            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
+        ],
+        remainder='drop'
+    )
+
+    # `use_label_encoder` is intentionally not passed: it is deprecated in
+    # XGBoost and newer releases only emit an "unused parameter" warning
+    # for it.
+    pipeline = Pipeline([
+        ('preprocessor', preprocessor),
+        ('classifier', XGBClassifier(
+            n_estimators=100,
+            max_depth=6,
+            learning_rate=0.1,
+            random_state=42,
+            eval_metric='logloss',
+            n_jobs=-1
+        ))
+    ])
+
+    print("✅ Pipeline构建完成")
+    return pipeline
+
+
+def train_and_evaluate(X: pd.DataFrame, y: pd.Series, pipeline: Pipeline):
+    """Fit ``pipeline`` on a stratified 80/20 split and print test metrics.
+
+    Args:
+        X: Feature matrix.
+        y: Target labels, also used to stratify the split.
+        pipeline: Unfitted pipeline; it is fitted in place.
+
+    Returns:
+        The same ``pipeline`` object, fitted on the training portion.
+    """
+    divider = "=" * 50
+    print("\n" + divider)
+    print("📈 开始模型训练...")
+    print(divider)
+
+    # Fixed seed keeps the split reproducible between runs.
+    split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
+    X_train, X_test, y_train, y_test = split
+
+    print(f"📊 训练集大小: {len(X_train)}")
+    print(f"📊 测试集大小: {len(X_test)}")
+
+    print("🏋️ 正在训练XGBoost模型...")
+    pipeline.fit(X_train, y_train)
+
+    # Hard labels for accuracy/report, class-1 scores for ROC-AUC.
+    predicted_labels = pipeline.predict(X_test)
+    positive_scores = pipeline.predict_proba(X_test)[:, 1]
+
+    accuracy = accuracy_score(y_test, predicted_labels)
+    roc_auc = roc_auc_score(y_test, positive_scores)
+
+    print("\n" + divider)
+    print("📊 模型评估结果:")
+    print(divider)
+    print(f"✅ 准确率 (Accuracy): {accuracy:.4f}")
+    print(f"✅ ROC-AUC 分数: {roc_auc:.4f}")
+    print("\n📋 分类报告:")
+    report = classification_report(y_test, predicted_labels, target_names=['健康', '有风险'])
+    print(report)
+
+    return pipeline
+
+
+def save_model(pipeline: Pipeline, model_path: Path):
+    """Serialise ``pipeline`` to ``model_path`` with joblib and print its size.
+
+    Args:
+        pipeline: The fitted pipeline to persist.
+        model_path: Destination file path.
+    """
+    print(f"\n💾 正在保存模型到: {model_path}")
+    joblib.dump(pipeline, model_path)
+    print("✅ 模型保存成功!")
+
+    # Stat the written file to confirm it exists and report its size in MB.
+    size_in_bytes = model_path.stat().st_size
+    file_size = size_in_bytes / (1024 * 1024)
+    print(f"📦 模型文件大小: {file_size:.2f} MB")
+
+
+def main():
+    """Run the training workflow: load, prepare, build, train, save."""
+    rule = "=" * 60
+    print("\n" + rule)
+    print("❤️ CardioAI 模块2: 心血管疾病风险预测模型训练")
+    print(rule + "\n")
+
+    # 1. Load and clean the dataset.
+    cleaned = load_and_clean_data(DATA_PATH)
+
+    # 2. Split into features and target (the column list is not needed here).
+    X, y, _feature_columns = prepare_features(cleaned)
+
+    # 3. Build the unfitted pipeline.
+    untrained = build_pipeline()
+
+    # 4. Train and evaluate.
+    fitted = train_and_evaluate(X, y, untrained)
+
+    # 5. Persist the fitted pipeline.
+    save_model(fitted, MODEL_PATH)
+
+    print("\n" + rule)
+    print("🎉 模型训练完成!")
+    print(rule)
+    usage_lines = (
+        "\n📌 模型使用说明:",
+        "   1. 启动Flask API: python app.py",
+        "   2. 访问 http://localhost:5001 查看预测界面",
+        "   3. 输入11个特征值进行预测",
+    )
+    for line in usage_lines:
+        print(line)
+
+
+if __name__ == "__main__":
+    main()