diff --git a/CardioAI/.env.example b/CardioAI/.env.example new file mode 100644 index 0000000..a0184ca --- /dev/null +++ b/CardioAI/.env.example @@ -0,0 +1,11 @@ +# DeepSeek API 配置 +BASE_URL=https://api.deepseek.com/v1 +DEEPSEEK_API_KEY=your_api_key_here +MODEL_NAME=deepseek-reasoner + +# Flask 配置 +FLASK_ENV=development +FLASK_DEBUG=True + +# 数据文件路径 +DATA_PATH=./data/心血管疾病.xlsx diff --git a/CardioAI/.gitignore b/CardioAI/.gitignore new file mode 100644 index 0000000..11d1421 --- /dev/null +++ b/CardioAI/.gitignore @@ -0,0 +1,13 @@ +# Environment +.env +*.pyc +__pycache__/ + +# IDE +.idea/ + +# Model files (generated) +*.pkl + +# OS +.DS_Store diff --git a/CardioAI/README.md b/CardioAI/README.md new file mode 100644 index 0000000..2aefdbf --- /dev/null +++ b/CardioAI/README.md @@ -0,0 +1,102 @@ +# CardioAI - 心血管疾病智能辅助系统 + +多模块应用,集成了数据可视化、机器学习预测和AI问答功能。 + +## 项目结构 + +``` +CardioAI/ +├── data/ # 数据文件目录 +│ └── 心血管疾病.xlsx # 心血管疾病数据集 +├── module1_dashboard/ # 模块1: Streamlit 交互式仪表盘 +│ └── cardio_dashboard.py +├── module2_predictor/ # 模块2: XGBoost 风险预测模型 +│ ├── train_and_save.py # 模型训练脚本 +│ ├── cardio_predictor_model.pkl # 训练好的模型 +│ ├── app.py # Flask API 服务 +│ └── templates/ +│ └── index.html # 预测前端页面 +├── requirements.txt # 项目依赖 +├── .env.example # 环境变量模板 +└── .gitignore # Git 忽略文件 +``` + +## 环境配置 + +### 1. 创建 conda 环境 + +```bash +conda create -n cardioenv python=3.10 +conda activate cardioenv +``` + +### 2. 安装依赖 + +```bash +cd F:\My_Git_Project\CardioAI +pip install -r requirements.txt +``` + +### 3. 
配置环境变量 + +复制 `.env.example` 为 `.env`,并填入您的 API Key: + +```bash +copy .env.example .env +``` + +编辑 `.env` 文件,填入您的 DeepSeek API Key。 + +## 模块说明 + +### 模块1: 交互式仪表盘 (Streamlit) + +心血管数据的交互式可视化界面。 + +**启动命令:** +```bash +cd F:\My_Git_Project\CardioAI +streamlit run module1_dashboard/cardio_dashboard.py +``` + +**功能特性:** +- 年龄范围筛选 +- 性别和心血管疾病状态筛选 +- 统计数据展示 +- BMI分布可视化 + +### 模块2: 心血管风险预测模型 (Flask + XGBoost) + +基于 XGBoost 的心血管疾病风险预测 API。 + +**训练模型:** +```bash +cd F:\My_Git_Project\CardioAI +python module2_predictor/train_and_save.py +``` + +**启动预测服务:** +```bash +cd F:\My_Git_Project\CardioAI\module2_predictor +set FLASK_APP=app.py +flask run --host=0.0.0.0 --port=5000 +``` + +**API 接口:** +- `POST /predict_cardio` - 提交11个特征值,返回预测概率和结果 + +## 依赖说明 + +- pandas, openpyxl - 数据处理 +- numpy, scikit-learn - 数值计算 +- xgboost, joblib - 机器学习 +- streamlit, plotly - 数据可视化 +- Flask - Web 服务 +- python-dotenv - 环境变量 +- langchain-openai, dashscope, requests - AI 集成 + +## 注意事项 + +1. `.env` 中的 `DATA_PATH` 目前不会被代码读取:各脚本通过硬编码的 `CODE_ROOT`(`F:\My_Git_Project\CardioAI`)定位数据与模型文件,项目位于其他目录时需同步修改各脚本中的 `CODE_ROOT` +2. 确保 `.env` 文件不提交到版本库(已加入 .gitignore) +3. 
使用前请确保已正确配置 DeepSeek API Key diff --git a/CardioAI/data/心血管疾病.xlsx b/CardioAI/data/心血管疾病.xlsx new file mode 100644 index 0000000..6a4700e Binary files /dev/null and b/CardioAI/data/心血管疾病.xlsx differ diff --git a/CardioAI/module1_dashboard/cardio_dashboard.py b/CardioAI/module1_dashboard/cardio_dashboard.py new file mode 100644 index 0000000..f901fe3 --- /dev/null +++ b/CardioAI/module1_dashboard/cardio_dashboard.py @@ -0,0 +1,684 @@ +""" +CardioAI 模块1: 交互式仪表盘 +心血管疾病数据可视化系统 - 美化版 +""" + +import streamlit as st +import pandas as pd +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from pathlib import Path + +# ==================== 页面配置 ==================== +st.set_page_config( + page_title="CardioAI 心血管疾病分析", + page_icon="❤️", + layout="wide", + initial_sidebar_state="expanded" +) + +# ==================== 自定义CSS样式 ==================== +st.markdown(""" + +""", unsafe_allow_html=True) + +# ==================== 常量定义 ==================== +CODE_ROOT = Path(r"F:\My_Git_Project\CardioAI") +DATA_PATH = CODE_ROOT / "data" / "心血管疾病.xlsx" + +# 配色方案 +COLORS = { + 'primary': ['#667eea', '#764ba2', '#f093fb', '#f5576c'], + 'safe': '#2ecc71', + 'risk': '#e74c3c', + 'gradient': ['#667eea', '#764ba2'], + 'bmi': ['#3498db', '#2ecc71', '#f39c12', '#e74c3c'] +} + + +# ==================== 数据加载函数 ==================== +@st.cache_data(show_spinner=False) +def load_data(file_path: Path) -> pd.DataFrame: + """加载数据,支持Excel格式""" + try: + df = pd.read_excel(file_path, engine='openpyxl') + return df + except Exception as e: + st.error(f"数据加载失败: {e}") + return pd.DataFrame() + + +@st.cache_data(show_spinner=False) +def clean_and_engineer_features(df: pd.DataFrame) -> pd.DataFrame: + """数据清洗和特征工程""" + df = df.copy() + + # 1. 特征工程 + df['age_years'] = (df['age'] / 365).round().astype(int) + df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2) + + # 2. 
异常值处理 + df = df[df['ap_lo'] < df['ap_hi']] + df = df[(df['ap_hi'] >= 90) & (df['ap_hi'] <= 250)] + df = df[(df['ap_lo'] >= 60) & (df['ap_lo'] <= 150)] + + # 3. 类别转换 + cholesterol_map = {1: '正常', 2: '偏高', 3: '非常高'} + gluc_map = {1: '正常', 2: '偏高', 3: '非常高'} + df['cholesterol_cat'] = df['cholesterol'].map(cholesterol_map) + df['gluc_cat'] = df['gluc'].map(gluc_map) + + # 4. BMI分类 + def categorize_bmi(bmi): + if bmi < 18.5: + return '体重过低' + elif bmi < 25: + return '体重正常' + elif bmi < 30: + return '超重' + else: + return '肥胖' + + df['bmi_category'] = df['bmi'].apply(categorize_bmi) + + return df + + +# ==================== UI组件 ==================== +def render_header(): + """渲染页面头部""" + st.markdown('

❤️ CardioAI

', unsafe_allow_html=True) + st.markdown('

心血管疾病智能分析系统 | 数据驱动的健康洞察

', unsafe_allow_html=True) + st.markdown("
", unsafe_allow_html=True) + + +def create_sidebar(df: pd.DataFrame) -> dict: + """创建美观的侧边栏""" + with st.sidebar: + st.markdown("### 🎛️ 数据筛选器") + st.markdown("---") + + # 年龄范围 + with st.container(): + st.markdown("**📅 年龄范围**") + age_range = st.slider( + "", + min_value=int(df['age_years'].min()), + max_value=int(df['age_years'].max()), + value=(int(df['age_years'].min()), int(df['age_years'].max())), + key="age_slider" + ) + + st.markdown("
", unsafe_allow_html=True) + + # 性别选择 + with st.container(): + st.markdown("**👤 性别**") + gender_options = st.multiselect( + "", + options=[1, 2], + default=[1, 2], + format_func=lambda x: "👩 女性" if x == 1 else "👨 男性", + key="gender_select" + ) + + st.markdown("
", unsafe_allow_html=True) + + # 心血管疾病状态 + with st.container(): + st.markdown("**🏥 心血管健康状态**") + cardio_options = st.multiselect( + "", + options=[0, 1], + default=[0, 1], + format_func=lambda x: "✅ 健康" if x == 0 else "⚠️ 有风险", + key="cardio_select" + ) + + st.markdown("---") + + # 数据统计 + st.markdown("### 📊 数据概览") + st.metric("总记录数", f"{len(df):,}") + st.metric("平均BMI", f"{df['bmi'].mean():.1f}") + st.metric("平均年龄", f"{df['age_years'].mean():.1f} 岁") + + return { + 'age_range': age_range, + 'gender': gender_options, + 'cardio': cardio_options + } + + +def apply_filters(df: pd.DataFrame, filters: dict) -> pd.DataFrame: + """应用筛选条件""" + return df[ + (df['age_years'] >= filters['age_range'][0]) & + (df['age_years'] <= filters['age_range'][1]) & + (df['gender'].isin(filters['gender'])) & + (df['cardio'].isin(filters['cardio'])) + ] + + +def render_metrics(filtered_df: pd.DataFrame, total_count: int): + """渲染指标卡片""" + st.markdown('
📊 关键指标
', unsafe_allow_html=True) + + col1, col2, col3, col4 = st.columns(4) + + with col1: + st.markdown(f""" +
+
📋 筛选记录数
+
{len(filtered_df):,}
+
占比 {(len(filtered_df)/total_count*100):.1f}%
+
+ """, unsafe_allow_html=True) + + with col2: + risk_rate = (filtered_df['cardio'].sum() / len(filtered_df) * 100) if len(filtered_df) > 0 else 0 + st.markdown(f""" +
+
⚠️ 风险率
+
{risk_rate:.1f}%
+
心血管疾病患者占比
+
+ """, unsafe_allow_html=True) + + with col3: + avg_age = filtered_df['age_years'].mean() if len(filtered_df) > 0 else 0 + st.markdown(f""" +
+
📅 平均年龄
+
{avg_age:.1f}
+
+
+ """, unsafe_allow_html=True) + + with col4: + avg_bmi = filtered_df['bmi'].mean() if len(filtered_df) > 0 else 0 + st.markdown(f""" +
+
⚖️ 平均BMI
+
{avg_bmi:.1f}
+
{get_bmi_status(avg_bmi)}
+
+ """, unsafe_allow_html=True) + + +def get_bmi_status(bmi: float) -> str: + """获取BMI状态""" + if bmi < 18.5: + return "体重过低" + elif bmi < 25: + return "体重正常" + elif bmi < 30: + return "超重" + return "肥胖" + + +def plot_age_distribution(df: pd.DataFrame): + """年龄分布图 - 美化版""" + fig = px.histogram( + df, + x='age_years', + color='cardio', + nbins=30, + title="年龄分布趋势", + labels={'age_years': '年龄', 'count': '人数'}, + color_discrete_map={0: '#2ecc71', 1: '#e74c3c'}, + barmode='overlay', + opacity=0.8 + ) + + fig.update_layout( + template='plotly_white', + title_font_size=18, + title_x=0.5, + legend_title_text="", + legend=dict( + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1 + ), + plot_bgcolor='rgba(0,0,0,0)', + paper_bgcolor='rgba(0,0,0,0)', + font=dict(family="Arial, sans-serif", size=12), + margin=dict(t=60, b=40, l=40, r=40) + ) + + fig.for_each_trace(lambda t: t.update( + name="✅ 健康" if t.name == "0" else "⚠️ 有风险", + marker_line_width=1, + marker_line_color='white' + )) + + return fig + + +def plot_bmi_distribution(df: pd.DataFrame): + """BMI分布饼图""" + bmi_counts = df['bmi_category'].value_counts().reindex(['体重过低', '体重正常', '超重', '肥胖']) + + fig = go.Figure(data=[go.Pie( + labels=bmi_counts.index, + values=bmi_counts.values, + hole=0.5, + marker=dict( + colors=COLORS['bmi'], + line=dict(color='white', width=2) + ), + textinfo='label+percent', + textposition='outside', + textfont=dict(size=12) + )]) + + fig.update_layout( + title=dict(text="BMI分布", font=dict(size=18), x=0.5), + template='plotly_white', + showlegend=False, + plot_bgcolor='rgba(0,0,0,0)', + paper_bgcolor='rgba(0,0,0,0)', + margin=dict(t=60, b=40, l=40, r=40), + annotations=[dict(text='BMI', x=0.5, y=0.5, font_size=20, showarrow=False)] + ) + + return fig + + +def plot_bmi_cardio_relation(df: pd.DataFrame): + """BMI与心血管疾病关系 - 堆叠柱状图""" + bmi_cardio = df.groupby(['bmi_category', 'cardio']).size().unstack(fill_value=0) + bmi_order = ['体重过低', '体重正常', '超重', '肥胖'] + bmi_cardio = 
bmi_cardio.reindex(bmi_order) + + fig = go.Figure() + + fig.add_trace(go.Bar( + name='✅ 健康', + x=bmi_cardio.index, + y=bmi_cardio[0], + marker_color='#2ecc71', + marker_line=dict(color='white', width=1) + )) + + fig.add_trace(go.Bar( + name='⚠️ 有风险', + x=bmi_cardio.index, + y=bmi_cardio[1], + marker_color='#e74c3c', + marker_line=dict(color='white', width=1) + )) + + fig.update_layout( + title=dict(text="BMI与心血管疾病关联分析", font=dict(size=18), x=0.5), + xaxis_title="BMI类别", + yaxis_title="人数", + barmode='stack', + template='plotly_white', + legend=dict( + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1 + ), + plot_bgcolor='rgba(0,0,0,0)', + paper_bgcolor='rgba(0,0,0,0)', + margin=dict(t=60, b=40, l=40, r=40) + ) + + return fig + + +def plot_blood_pressure_scatter(df: pd.DataFrame): + """血压散点图""" + sample_df = df.sample(min(2000, len(df))) # 采样避免过多点 + + fig = px.scatter( + sample_df, + x='ap_lo', + y='ap_hi', + color='cardio', + color_discrete_map={0: '#2ecc71', 1: '#e74c3c'}, + opacity=0.6, + title="血压分布散点图", + labels={'ap_lo': '舒张压 (mmHg)', 'ap_hi': '收缩压 (mmHg)'} + ) + + fig.update_layout( + template='plotly_white', + title_font_size=18, + title_x=0.5, + legend_title_text="", + legend=dict( + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1 + ), + plot_bgcolor='rgba(0,0,0,0)', + paper_bgcolor='rgba(0,0,0,0)', + margin=dict(t=60, b=40, l=40, r=40) + ) + + fig.for_each_trace(lambda t: t.update( + name="✅ 健康" if t.name == "0" else "⚠️ 有风险" + )) + + return fig + + +def plot_cholesterol_analysis(df: pd.DataFrame): + """胆固醇与心血管疾病关系""" + chol_cardio = df.groupby(['cholesterol_cat', 'cardio']).size().unstack(fill_value=0) + chol_order = ['正常', '偏高', '非常高'] + chol_cardio = chol_cardio.reindex(chol_order) + + fig = go.Figure() + + fig.add_trace(go.Bar( + name='✅ 健康', + x=chol_cardio.index, + y=chol_cardio[0] if 0 in chol_cardio.columns else [0, 0, 0], + marker_color='#2ecc71' + )) + + fig.add_trace(go.Bar( + name='⚠️ 有风险', + 
x=chol_cardio.index, + y=chol_cardio[1] if 1 in chol_cardio.columns else [0, 0, 0], + marker_color='#e74c3c' + )) + + fig.update_layout( + title=dict(text="胆固醇水平与心血管疾病", font=dict(size=18), x=0.5), + xaxis_title="胆固醇水平", + yaxis_title="人数", + barmode='group', + template='plotly_white', + legend=dict( + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1 + ), + plot_bgcolor='rgba(0,0,0,0)', + paper_bgcolor='rgba(0,0,0,0)', + margin=dict(t=60, b=40, l=40, r=40) + ) + + return fig + + +def render_data_table(df: pd.DataFrame): + """渲染数据表格""" + st.markdown('
📋 数据明细
', unsafe_allow_html=True) + + display_cols = ['id', 'age_years', 'gender', 'height', 'weight', 'bmi', 'bmi_category', + 'ap_hi', 'ap_lo', 'cholesterol_cat', 'gluc_cat', 'cardio'] + + display_df = df[display_cols].copy() + display_df['gender'] = display_df['gender'].map({1: '女性', 2: '男性'}) + display_df['cardio'] = display_df['cardio'].map({0: '✅ 健康', 1: '⚠️ 有风险'}) + + display_df.columns = ['ID', '年龄', '性别', '身高(cm)', '体重(kg)', 'BMI', 'BMI分类', + '收缩压', '舒张压', '胆固醇', '血糖', '心血管状态'] + + st.dataframe( + display_df, + use_container_width=True, + height=400, + column_config={ + "ID": st.column_config.NumberColumn(width="small"), + "年龄": st.column_config.NumberColumn(width="small"), + "心血管状态": st.column_config.TextColumn(width="medium") + } + ) + + +def main(): + """主函数""" + render_header() + + # 加载数据 + with st.spinner("正在加载数据..."): + raw_df = load_data(DATA_PATH) + + if raw_df.empty: + st.error("❌ 数据加载失败,请检查数据文件路径!") + return + + # 数据处理 + with st.spinner("正在进行数据清洗..."): + df = clean_and_engineer_features(raw_df) + + total_count = len(df) + + # 侧边栏筛选 + filters = create_sidebar(df) + + # 应用筛选 + filtered_df = apply_filters(df, filters) + + # 指标卡片 + render_metrics(filtered_df, total_count) + + st.markdown("
", unsafe_allow_html=True) + + # 图表区域 + st.markdown('
📈 可视化分析
', unsafe_allow_html=True) + + # 第一行图表 + col1, col2 = st.columns(2) + with col1: + fig1 = plot_age_distribution(filtered_df) + st.plotly_chart(fig1, use_container_width=True, key="age_chart") + + with col2: + fig2 = plot_bmi_distribution(filtered_df) + st.plotly_chart(fig2, use_container_width=True, key="bmi_pie") + + # 第二行图表 + col3, col4 = st.columns(2) + with col3: + fig3 = plot_bmi_cardio_relation(filtered_df) + st.plotly_chart(fig3, use_container_width=True, key="bmi_cardio") + + with col4: + fig4 = plot_cholesterol_analysis(filtered_df) + st.plotly_chart(fig4, use_container_width=True, key="chol_chart") + + # 第三行图表 + col5, _ = st.columns([1, 1]) + with col5: + fig5 = plot_blood_pressure_scatter(filtered_df) + st.plotly_chart(fig5, use_container_width=True, key="bp_scatter") + + st.markdown("
", unsafe_allow_html=True) + + # 数据表格 + render_data_table(filtered_df) + + # 页脚 + st.markdown("
", unsafe_allow_html=True) + st.markdown( + "

❤️ CardioAI © 2024 | 心血管疾病智能分析系统

", + unsafe_allow_html=True + ) + + +if __name__ == "__main__": + main() diff --git a/CardioAI/module2_predictor/app.py b/CardioAI/module2_predictor/app.py new file mode 100644 index 0000000..e2f0d1d --- /dev/null +++ b/CardioAI/module2_predictor/app.py @@ -0,0 +1,187 @@ +""" +CardioAI 模块2: Flask API服务 +心血管疾病风险预测 - 后端接口 +""" + +import joblib +import numpy as np +import pandas as pd +from flask import Flask, request, jsonify, render_template +from pathlib import Path + +# ==================== 常量定义 ==================== +CODE_ROOT = Path(r"F:\My_Git_Project\CardioAI") +MODEL_PATH = CODE_ROOT / "module2_predictor" / "cardio_predictor_model.pkl" + +# ==================== Flask应用 ==================== +app = Flask(__name__, + template_folder='templates', + static_folder='static') + +# 全局变量存储模型 +model = None + + +def load_model(): + """加载模型""" + global model + if model is None: + print("📂 正在加载模型...") + model = joblib.load(MODEL_PATH) + print("✅ 模型加载成功!") + return model + + +# ==================== 路由定义 ==================== +@app.route('/') +def index(): + """渲染前端页面""" + return render_template('index.html') + + +@app.route('/predict_cardio', methods=['POST']) +def predict_cardio(): + """ + 心血管疾病风险预测接口 + 接收11个原始特征值的JSON POST请求 + 返回预测概率和结果 + """ + try: + # 获取JSON数据 + data = request.get_json() + + if not data: + return jsonify({ + 'success': False, + 'error': '未收到数据' + }), 400 + + # 定义特征列顺序(与训练时一致) + feature_names = [ + 'age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', + 'cholesterol', 'gluc', 'smoke', 'alco', 'active' + ] + + # 从请求中提取特征值 + features = [] + missing_fields = [] + + for col in feature_names: + if col in data: + features.append(float(data[col])) + else: + missing_fields.append(col) + features.append(0.0) # 默认值 + + # 计算BMI: weight / (height/100)^2 + weight = float(data.get('weight', 0)) + height = float(data.get('height', 0)) + if height > 0: + bmi = weight / ((height / 100) ** 2) + features.append(bmi) + else: + features.append(0.0) + + if 
missing_fields: + return jsonify({ + 'success': False, + 'error': f'缺少必要字段: {", ".join(missing_fields)}' + }), 400 + + # 定义特征列名(与训练时一致) + feature_columns = [ + 'age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', + 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'bmi' + ] + + # 转换为DataFrame格式 + X_input = pd.DataFrame([features], columns=feature_columns) + + # 加载模型(如果尚未加载) + predictor = load_model() + + # 预测 + prediction = int(predictor.predict(X_input)[0]) + prob_risk = float(predictor.predict_proba(X_input)[0][1]) + prob_healthy = float(predictor.predict_proba(X_input)[0][0]) + + # 构建响应 + result = { + 'success': True, + 'prediction': prediction, + 'prediction_label': '有风险' if prediction == 1 else '健康', + 'probability': { + '健康': round(prob_healthy * 100, 2), + '有风险': round(prob_risk * 100, 2) + }, + 'risk_level': get_risk_level(prob_risk), + 'recommendation': get_recommendation(prob_risk, data) + } + + return jsonify(result) + + except ValueError as e: + return jsonify({ + 'success': False, + 'error': f'数据格式错误: {str(e)}' + }), 400 + + except Exception as e: + return jsonify({ + 'success': False, + 'error': f'预测失败: {str(e)}' + }), 500 + + +def get_risk_level(probability: float) -> str: + """根据概率返回风险等级""" + if probability < 0.3: + return '🟢 低风险' + elif probability < 0.5: + return '🟡 中低风险' + elif probability < 0.7: + return '🟠 中高风险' + else: + return '🔴 高风险' + + +def get_recommendation(probability: float, data: dict) -> str: + """根据预测结果给出建议""" + if probability < 0.3: + return '继续保持健康的生活方式,定期体检。' + elif probability < 0.5: + return '建议适当增加运动,注意饮食均衡。' + elif probability < 0.7: + return '建议咨询医生,制定健康管理计划。' + else: + return '⚠️ 风险较高,请尽快就医检查。' + + +@app.route('/health', methods=['GET']) +def health_check(): + """健康检查接口""" + return jsonify({ + 'status': 'healthy', + 'service': 'CardioAI Cardiovascular Prediction API', + 'version': '1.0.0' + }) + + +# ==================== 启动应用 ==================== +if __name__ == '__main__': + print("\n" + "="*60) + print("❤️ 
CardioAI 心血管疾病风险预测 API") + print("="*60) + print(f"📂 模型路径: {MODEL_PATH}") + print(f"🌐 启动地址: http://localhost:5001") + print("="*60 + "\n") + + # 预加载模型 + load_model() + + # 启动Flask应用 + app.run( + host='0.0.0.0', + port=5001, + debug=True + ) diff --git a/CardioAI/module2_predictor/templates/index.html b/CardioAI/module2_predictor/templates/index.html new file mode 100644 index 0000000..5a2f3b0 --- /dev/null +++ b/CardioAI/module2_predictor/templates/index.html @@ -0,0 +1,1060 @@ + + + + + + CardioAI 心血管疾病风险预测 + + + + + +
+ +
+
+
❤️
+

CardioAI

+
+

智能心血管疾病风险预测系统

+
+ + +
+ +
+
+ + AI 分析完成 +
+ +
+ +
+ +
健康状态良好
+ +
+
0%
+
心血管疾病风险概率
+
+ + +
+
+
+
+ 低风险 + 中风险 + 高风险 +
+ + +
+
+
BMI 指数
+
--
+
+
+
血压状态
+
--
+
+
+
健康评分
+
--
+
+
+ + +
+
+ + 健康建议 +
+
+ 继续保持健康的生活方式 +
+
+ + +
+ +
+
+ + +
+
+
+
🫀
+
+
+ AI 正在分析您的健康数据 +
+

请稍候

+
+ + +
+
+ + +
+ +
+

+ + 健康信息录入 +

+

请填写以下信息,AI 将为您评估心血管疾病风险

+
+ +
+ +
+
+
+
基本信息
+
+
+
+ +
+ + +
+
年龄范围 18-120 岁
+
+
+ +
+ + +
+
+
+ +
+ + +
+
+
+ +
+ + +
+
+
+ +
+ BMI 指数 + -- +
+
+ + +
+
+
+
血压信息
+
+
+
+ +
+ + +
+
正常范围:90-140 mmHg
+
+
+ +
+ + +
+
正常范围:60-90 mmHg
+
+
+
+ + +
+
+
+
生化指标
+
+
+
+ +
+ + +
+
+
+ +
+ + +
+
+
+
+ + +
+
+
+
生活习惯
+
+
+
+ +
+ + +
+
+
+ +
+ + +
+
+
+ +
+ + +
+
+
+
+ + +
+ + +
+
+
+
+ + + +
+ + + + diff --git a/CardioAI/module2_predictor/train_and_save.py b/CardioAI/module2_predictor/train_and_save.py new file mode 100644 index 0000000..cecc105 --- /dev/null +++ b/CardioAI/module2_predictor/train_and_save.py @@ -0,0 +1,199 @@ +""" +CardioAI 模块2: 模型训练脚本 +心血管疾病风险预测模型 - 训练与保存 +""" + +import pandas as pd +import numpy as np +import joblib +from pathlib import Path +from sklearn.pipeline import Pipeline +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report, roc_auc_score, accuracy_score +from xgboost import XGBClassifier + +# ==================== 常量定义 ==================== +CODE_ROOT = Path(r"F:\My_Git_Project\CardioAI") +DATA_PATH = CODE_ROOT / "data" / "心血管疾病.xlsx" +MODEL_PATH = CODE_ROOT / "module2_predictor" / "cardio_predictor_model.pkl" + +# 特征列定义 +CONTINUOUS_FEATURES = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi'] +CATEGORICAL_FEATURES = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active'] + + +def load_and_clean_data(file_path: Path) -> pd.DataFrame: + """加载并清洗数据""" + print(f"📂 正在加载数据: {file_path}") + + # 加载Excel数据 + df = pd.read_excel(file_path, engine='openpyxl') + print(f"✅ 数据加载成功,共 {len(df)} 条记录") + + # 复制数据 + df = df.copy() + + # 特征工程: age(天) -> age_years + df['age_years'] = (df['age'] / 365).round().astype(int) + + # 计算BMI + df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2) + + # 异常值处理: 删除舒张压 >= 收缩压的记录 + initial_count = len(df) + df = df[df['ap_lo'] < df['ap_hi']] + print(f"🗑️ 删除舒张压>=收缩压的记录: {initial_count - len(df)} 条") + + # 删除血压极端异常值 + # 收缩压 ∈ [90, 250] + initial_count = len(df) + df = df[(df['ap_hi'] >= 90) & (df['ap_hi'] <= 250)] + removed_hy = initial_count - len(df) + + # 舒张压 ∈ [60, 150] + initial_count = len(df) + df = df[(df['ap_lo'] >= 60) & (df['ap_lo'] <= 150)] + removed_lo = initial_count - len(df) + print(f"🗑️ 删除血压异常值: 收缩压 {removed_hy} 条, 舒张压 
{removed_lo} 条") + + print(f"✅ 数据清洗完成,剩余 {len(df)} 条记录") + return df + + +def prepare_features(df: pd.DataFrame) -> tuple: + """ + 准备特征和标签 + 删除id和原始age字段,保留处理后的特征 + """ + # 定义要使用的特征(删除id和原始age,保留age_years) + feature_columns = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', + 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'bmi'] + + X = df[feature_columns].copy() + y = df['cardio'].copy() + + print(f"📊 特征数量: {len(feature_columns)}") + print(f"📊 特征列: {feature_columns}") + + return X, y, feature_columns + + +def build_pipeline() -> Pipeline: + """构建包含预处理器和分类器的Pipeline""" + print("🔧 正在构建Pipeline...") + + # 连续特征列 + continuous_cols = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi'] + + # 分类特征列 + categorical_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active'] + + # 预处理器 + preprocessor = ColumnTransformer( + transformers=[ + ('num', StandardScaler(), continuous_cols), + ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols) + ], + remainder='drop' + ) + + # 完整Pipeline: 预处理 + XGBoost分类器 + pipeline = Pipeline([ + ('preprocessor', preprocessor), + ('classifier', XGBClassifier( + n_estimators=100, + max_depth=6, + learning_rate=0.1, + random_state=42, + use_label_encoder=False, + eval_metric='logloss', + n_jobs=-1 + )) + ]) + + print("✅ Pipeline构建完成") + return pipeline + + +def train_and_evaluate(X: pd.DataFrame, y: pd.Series, pipeline: Pipeline): + """训练模型并评估""" + print("\n" + "="*50) + print("📈 开始模型训练...") + print("="*50) + + # 划分训练集和测试集 + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42, stratify=y + ) + + print(f"📊 训练集大小: {len(X_train)}") + print(f"📊 测试集大小: {len(X_test)}") + + # 训练模型 + print("🏋️ 正在训练XGBoost模型...") + pipeline.fit(X_train, y_train) + + # 预测 + y_pred = pipeline.predict(X_test) + y_pred_proba = pipeline.predict_proba(X_test)[:, 1] + + # 评估指标 + accuracy = accuracy_score(y_test, y_pred) + roc_auc = roc_auc_score(y_test, y_pred_proba) + + 
print("\n" + "="*50) + print("📊 模型评估结果:") + print("="*50) + print(f"✅ 准确率 (Accuracy): {accuracy:.4f}") + print(f"✅ ROC-AUC 分数: {roc_auc:.4f}") + print("\n📋 分类报告:") + print(classification_report(y_test, y_pred, target_names=['健康', '有风险'])) + + return pipeline + + +def save_model(pipeline: Pipeline, model_path: Path): + """保存模型""" + print(f"\n💾 正在保存模型到: {model_path}") + joblib.dump(pipeline, model_path) + print(f"✅ 模型保存成功!") + + # 验证模型文件 + file_size = model_path.stat().st_size / (1024 * 1024) + print(f"📦 模型文件大小: {file_size:.2f} MB") + + +def main(): + """主函数""" + print("\n" + "="*60) + print("❤️ CardioAI 模块2: 心血管疾病风险预测模型训练") + print("="*60 + "\n") + + # 1. 加载并清洗数据 + df = load_and_clean_data(DATA_PATH) + + # 2. 准备特征 + X, y, feature_columns = prepare_features(df) + + # 3. 构建Pipeline + pipeline = build_pipeline() + + # 4. 训练并评估模型 + trained_pipeline = train_and_evaluate(X, y, pipeline) + + # 5. 保存模型 + save_model(trained_pipeline, MODEL_PATH) + + print("\n" + "="*60) + print("🎉 模型训练完成!") + print("="*60) + print(f"\n📌 模型使用说明:") + print(f" 1. 启动Flask API: python app.py") + print(f" 2. 访问 http://localhost:5001 查看预测界面") + print(f" 3. 输入11个特征值进行预测") + + +if __name__ == "__main__": + main() diff --git a/CardioAI/requirements.txt b/CardioAI/requirements.txt new file mode 100644 index 0000000..9149356 --- /dev/null +++ b/CardioAI/requirements.txt @@ -0,0 +1,21 @@ +# CardioAI 项目依赖 +# 创建并激活 conda 环境: +# conda create -n cardioenv python=3.10 +# conda activate cardioenv + +# 然后安装依赖: +# pip install -r requirements.txt + +pandas +openpyxl +numpy +scikit-learn +xgboost +joblib +streamlit +plotly +Flask +python-dotenv +langchain-openai +dashscope +requests