A
This commit is contained in:
10
test/.claude/settings.local.json
Normal file
10
test/.claude/settings.local.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Bash(python -c \"import pandas as pd; df = pd.read_excel\\('data/心血管疾病.xlsx', nrows=5\\); print\\('Columns:', df.columns.tolist\\(\\)\\); print\\('Data types:', df.dtypes\\); print\\('Sample data:'\\); print\\(df.head\\(\\)\\)\")",
|
||||
"Bash(\"D:\\\\software\\\\anaconda\\\\Scripts\\\\conda.exe\" run:*)",
|
||||
"Bash(\"D:\\\\software\\\\anaconda\\\\envs\\\\cardioenv\\\\python.exe\" module1_dashboard/test_data.py)",
|
||||
"Bash(\"D:\\\\software\\\\anaconda\\\\envs\\\\cardioenv\\\\python.exe\" -m streamlit run module1_dashboard/cardio_dashboard.py --help)"
|
||||
]
|
||||
}
|
||||
}
|
||||
36
test/.env
Normal file
36
test/.env
Normal file
@@ -0,0 +1,36 @@
|
||||
# CardioAI Configuration
|
||||
# Environment variables for the cardiovascular disease intelligent assistant system
|
||||
|
||||
# Data file path (relative to project root)
|
||||
DATA_PATH=./data/心血管疾病.xlsx
|
||||
|
||||
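# Flask server configuration (local development only: FLASK_DEBUG=True combined with FLASK_HOST=0.0.0.0 exposes the interactive debugger on every network interface; set FLASK_DEBUG=False or FLASK_HOST=127.0.0.1 anywhere else)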
|
||||
FLASK_APP=module2_predictor/app.py
|
||||
FLASK_ENV=development
|
||||
FLASK_DEBUG=True
|
||||
FLASK_HOST=0.0.0.0
|
||||
FLASK_PORT=5000
|
||||
|
||||
# Streamlit dashboard configuration
|
||||
STREAMLIT_PORT=8501
|
||||
STREAMLIT_THEME=light
|
||||
|
||||
# AI Model API Keys (replace with your actual keys)
|
||||
# DeepSeek API (from dashscope)
|
||||
DASHSCOPE_API_KEY=your_dashscope_api_key_here
|
||||
|
||||
# OpenAI API (for langchain-openai, if used)
|
||||
OPENAI_API_KEY=your_openai_api_key_here
|
||||
|
||||
# Voice assistant configuration
|
||||
VOICE_ASSISTANT_PORT=5001
|
||||
VOICE_ASSISTANT_HOST=0.0.0.0
|
||||
|
||||
# Model file paths (will be generated during training)
|
||||
MODEL_PATH=./module2_predictor/models/xgb_model.pkl
|
||||
SCALER_PATH=./module2_predictor/models/scaler.pkl
|
||||
ENCODER_PATH=./module2_predictor/models/encoder.pkl
|
||||
|
||||
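# Feature configuration (NOTE: these names do not match the columns the dashboard requires from DATA_PATH: age, gender, height, weight, ap_hi, ap_lo, cholesterol, gluc, cardio. Confirm them against the dataset before Module 2 relies on them)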
|
||||
NUMERICAL_FEATURES=age,trestbps,chol,thalach,oldpeak
|
||||
CATEGORICAL_FEATURES=sex,cp,fbs,restecg,exang,slope,ca,thal
|
||||
8
test/.idea/.gitignore
generated
vendored
Normal file
8
test/.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
# 默认忽略的文件
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# 基于编辑器的 HTTP 客户端请求
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
6
test/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
test/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
4
test/.idea/misc.xml
generated
Normal file
4
test/.idea/misc.xml
generated
Normal file
@@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="cardioenv" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
8
test/.idea/modules.xml
generated
Normal file
8
test/.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/test.iml" filepath="$PROJECT_DIR$/.idea/test.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
16
test/.idea/test.iml
generated
Normal file
16
test/.idea/test.iml
generated
Normal file
@@ -0,0 +1,16 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="cardioenv" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="TemplatesService">
|
||||
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
|
||||
<option name="TEMPLATE_FOLDERS">
|
||||
<list>
|
||||
<option value="$MODULE_DIR$/module2_predictor/templates" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
</module>
|
||||
6
test/.idea/vcs.xml
generated
Normal file
6
test/.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
264
test/README.md
Normal file
264
test/README.md
Normal file
@@ -0,0 +1,264 @@
|
||||
# CardioAI - 心血管疾病智能辅助系统
|
||||
|
||||
## 项目概述
|
||||
|
||||
CardioAI是一个多模块应用系统,集成了数据可视化、机器学习预测和AI语音问答功能,用于心血管疾病的智能辅助分析和诊断。
|
||||
|
||||
### 系统模块
|
||||
|
||||
1. **Module 1: 数据可视化仪表板** (Streamlit) - 本模块
|
||||
- 数据清洗与特征工程
|
||||
- 交互式数据筛选
|
||||
- 可视化分析图表
|
||||
|
||||
2. **Module 2: 机器学习预测器** (Flask + XGBoost)
|
||||
- 心血管疾病风险预测模型
|
||||
- RESTful API接口
|
||||
- 实时预测服务
|
||||
|
||||
3. **Module 3: AI语音助手** (DeepSeek + CosyVoice)
|
||||
- 自然语言问答
|
||||
- 语音交互界面
|
||||
- 疾病知识查询
|
||||
|
||||
## Module 1: 数据可视化仪表板
|
||||
|
||||
### 功能特性
|
||||
|
||||
- ✅ **数据加载与清洗**: 自动处理异常值和缺失数据
|
||||
- ✅ **特征工程**: 年龄转换、BMI计算、类别编码
|
||||
- ✅ **交互式筛选**: 侧边栏多维度数据筛选
|
||||
- ✅ **可视化分析**: Plotly交互式图表
|
||||
- ✅ **性能优化**: 使用缓存加速数据加载
|
||||
|
||||
### 数据处理流程
|
||||
|
||||
1. **数据加载**: 从Excel文件加载原始数据
|
||||
2. **年龄转换**: 将天数转换为年数(四舍五入)
|
||||
3. **BMI计算**: `BMI = 体重(kg) / (身高(m)^2)`
|
||||
4. **异常值处理**:
|
||||
- 删除舒张压 ≥ 收缩压的记录
|
||||
- 删除收缩压不在[90, 250] mmHg范围的记录
|
||||
- 删除舒张压不在[60, 150] mmHg范围的记录
|
||||
5. **类别转换**:
|
||||
- 胆固醇水平: 1=正常, 2=高于正常, 3=极高
|
||||
- 血糖水平: 1=正常, 2=高于正常, 3=极高
|
||||
- 性别: 1=女性, 2=男性
|
||||
- BMI分类: <18.5=偏瘦, 18.5-24.9=正常, 25-29.9=超重, ≥30=肥胖
|
||||
|
||||
### 快速开始
|
||||
|
||||
#### 1. 环境配置
|
||||
|
||||
```bash
|
||||
# 创建并激活conda虚拟环境
|
||||
conda create -n cardioenv python=3.10
|
||||
conda activate cardioenv
|
||||
|
||||
# 安装依赖包
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
#### 2. 数据准备
|
||||
|
||||
确保数据文件位于正确路径:
|
||||
```
|
||||
项目根目录/
|
||||
├── data/
|
||||
│ └── 心血管疾病.xlsx
|
||||
└── module1_dashboard/
|
||||
└── cardio_dashboard.py
|
||||
```
|
||||
|
||||
#### 3. 启动仪表板
|
||||
|
||||
```bash
|
||||
# 进入项目根目录
|
||||
cd D:\Project\PythonProject\AIcode\test
|
||||
|
||||
# 激活conda环境
|
||||
conda activate cardioenv
|
||||
|
||||
# 启动Streamlit应用程序
|
||||
streamlit run module1_dashboard/cardio_dashboard.py
|
||||
```
|
||||
|
||||
或者使用conda直接运行:
|
||||
|
||||
```bash
|
||||
"D:\software\anaconda\Scripts\conda.exe" run -n cardioenv streamlit run module1_dashboard/cardio_dashboard.py
|
||||
```
|
||||
|
||||
#### 4. 访问应用
|
||||
|
||||
打开浏览器,访问: [http://localhost:8501](http://localhost:8501)
|
||||
|
||||
### 界面说明
|
||||
|
||||
#### 侧边栏筛选器
|
||||
- **年龄范围**: 滑动选择器,筛选指定年龄范围的记录
|
||||
- **性别**: 多选框,选择要分析的性别(女性/男性)
|
||||
- **心血管疾病状态**: 多选框,选择疾病状态(有/无)
|
||||
- **BMI分类**: 多选框,选择BMI分类(偏瘦/正常/超重/肥胖)
|
||||
- **胆固醇水平**: 多选框,选择胆固醇水平
|
||||
- **血糖水平**: 多选框,选择血糖水平
|
||||
|
||||
#### 主界面区域
|
||||
|
||||
1. **关键指标面板**
|
||||
- 筛选后记录数
|
||||
- 心血管疾病风险率
|
||||
- 平均年龄
|
||||
- 平均BMI
|
||||
|
||||
2. **数据可视化图表**
|
||||
- 年龄分布与心血管疾病关系直方图
|
||||
- BMI分类对心血管疾病影响的堆叠柱状图
|
||||
- 血压关系散点图
|
||||
- 胆固醇水平分布饼图
|
||||
- 血糖水平分布饼图
|
||||
|
||||
3. **数据预览**
|
||||
- 数据摘要(形状、类型、缺失值)
|
||||
- 原始数据表格(可自定义显示的列)
|
||||
|
||||
### 配置文件说明
|
||||
|
||||
`.env` 文件包含以下配置项:
|
||||
|
||||
```ini
|
||||
# 数据文件路径
|
||||
DATA_PATH=./data/心血管疾病.xlsx
|
||||
|
||||
# Flask服务器配置
|
||||
FLASK_APP=module2_predictor/app.py
|
||||
FLASK_ENV=development
|
||||
|
||||
# Streamlit配置
|
||||
STREAMLIT_PORT=8501
|
||||
|
||||
# AI模型API密钥(需要替换为实际值)
|
||||
DASHSCOPE_API_KEY=your_dashscope_api_key_here
|
||||
OPENAI_API_KEY=your_openai_api_key_here
|
||||
|
||||
# 模型文件路径
|
||||
MODEL_PATH=./module2_predictor/models/xgb_model.pkl
|
||||
```
|
||||
|
||||
### 依赖包说明
|
||||
|
||||
详细依赖见 `requirements.txt`:
|
||||
|
||||
- **数据处理**: pandas, numpy, openpyxl
|
||||
- **机器学习**: scikit-learn, xgboost, joblib
|
||||
- **可视化**: streamlit, plotly
|
||||
- **Web服务**: Flask
|
||||
- **环境管理**: python-dotenv
|
||||
- **AI集成**: langchain-openai, dashscope, requests
|
||||
|
||||
### 常见问题
|
||||
|
||||
#### Q1: 数据加载失败
|
||||
**症状**: 应用程序无法启动,提示文件找不到或格式错误
|
||||
**解决**:
|
||||
1. 检查 `data/心血管疾病.xlsx` 文件是否存在
|
||||
2. 确认文件格式为Excel 2007+ (.xlsx)
|
||||
3. 确认文件未损坏(可尝试用Excel打开该文件并重新另存为 .xlsx)
|
||||
|
||||
#### Q2: 图表显示异常
|
||||
**症状**: 图表不显示或显示错误
|
||||
**解决**:
|
||||
1. 检查Plotly是否正确安装:`pip install plotly`
|
||||
2. 确保数据经过正确清洗,没有无限值或NaN
|
||||
3. 检查筛选条件是否过于严格导致无数据
|
||||
|
||||
#### Q3: 应用程序运行缓慢
|
||||
**症状**: 页面响应慢,筛选操作延迟
|
||||
**解决**:
|
||||
1. 利用 `@st.cache_data` 装饰器的缓存功能
|
||||
2. 减少一次性加载的数据量
|
||||
3. 优化图表复杂度,减少数据点数量
|
||||
|
||||
#### Q4: 中文显示乱码
|
||||
**症状**: 中文文字显示为乱码
|
||||
**解决**:
|
||||
1. 确保系统支持中文字体
|
||||
2. Streamlit默认支持UTF-8编码,检查源代码文件保存为UTF-8
|
||||
3. 在Windows系统上,设置控制台编码为UTF-8
|
||||
|
||||
### 开发说明
|
||||
|
||||
#### 项目结构
|
||||
```
|
||||
D:\Project\PythonProject\AIcode\test\
|
||||
├── data\ # 数据文件目录
|
||||
│ └── 心血管疾病.xlsx
|
||||
├── module1_dashboard\ # 数据可视化模块
|
||||
│ ├── cardio_dashboard.py
|
||||
│ └── test_data.py
|
||||
├── module2_predictor\ # 机器学习预测模块
|
||||
│ └── templates\
|
||||
├── module3_voice_assistant\ # AI语音助手模块
|
||||
│ └── templates\
|
||||
├── requirements.txt # Python依赖包
|
||||
├── .env # 环境配置
|
||||
└── README.md # 项目文档
|
||||
```
|
||||
|
||||
#### 代码结构
|
||||
- `load_and_process_data()`: 数据加载和清洗主函数,使用 `@st.cache_data` 缓存
|
||||
- `create_filters()`: 创建侧边栏筛选器组件
|
||||
- `apply_filters()`: 应用筛选条件到数据框
|
||||
- `display_metrics()`: 显示关键指标卡片
|
||||
- `create_visualizations()`: 创建所有可视化图表
|
||||
- `display_data_preview()`: 显示数据预览和摘要
|
||||
|
||||
#### 扩展开发
|
||||
|
||||
1. **添加新图表**
|
||||
```python
|
||||
def create_new_chart(df):
|
||||
fig = px.scatter(df, x='column1', y='column2', color='cardio_str')
|
||||
st.plotly_chart(fig, use_container_width=True)
|
||||
```
|
||||
|
||||
2. **添加新筛选器**
|
||||
```python
|
||||
# 在create_filters函数中添加
|
||||
new_filter = st.sidebar.selectbox("新筛选器", options=['选项1', '选项2'])
|
||||
```
|
||||
|
||||
3. **自定义样式**
|
||||
```python
|
||||
st.markdown("""
|
||||
<style>
|
||||
.custom-class { color: blue; }
|
||||
</style>
|
||||
""", unsafe_allow_html=True)
|
||||
```
|
||||
|
||||
### 性能优化建议
|
||||
|
||||
1. **数据缓存**: 数据加载与清洗函数 `load_and_process_data()` 使用 `@st.cache_data` 装饰器缓存结果
|
||||
2. **增量加载**: 对于大型数据集,考虑分页或懒加载
|
||||
3. **图表优化**: 使用采样或聚合减少数据点数量
|
||||
4. **异步处理**: 长时间操作使用异步函数避免阻塞UI
|
||||
|
||||
### 下一步计划
|
||||
|
||||
1. **Module 2开发**: 实现XGBoost预测模型和Flask API
|
||||
2. **Module 3开发**: 集成DeepSeek和CosyVoice语音助手
|
||||
3. **功能增强**: 添加数据导出、报告生成功能
|
||||
4. **部署优化**: Docker容器化,云平台部署
|
||||
|
||||
### 技术支持
|
||||
|
||||
- **问题反馈**: 检查GitHub Issues或联系开发团队
|
||||
- **文档更新**: 随着功能迭代保持文档同步
|
||||
- **版本管理**: 使用Git进行版本控制,定期发布稳定版本
|
||||
|
||||
---
|
||||
|
||||
**版权所有 © 2024 CardioAI项目组**
|
||||
**版本**: 1.0.0
|
||||
**最后更新**: 2024-04-02
|
||||
BIN
test/data/心血管疾病.xlsx
Normal file
BIN
test/data/心血管疾病.xlsx
Normal file
Binary file not shown.
Binary file not shown.
581
test/module1_dashboard/cardio_dashboard.py
Normal file
581
test/module1_dashboard/cardio_dashboard.py
Normal file
@@ -0,0 +1,581 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
CardioAI - 心血管疾病智能辅助系统
|
||||
数据可视化仪表板模块
|
||||
|
||||
功能:
|
||||
1. 数据加载与清洗
|
||||
2. 特征工程(年龄转换、BMI计算、类别转换)
|
||||
3. 交互式数据筛选
|
||||
4. 可视化分析(Plotly图表)
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import plotly.express as px
|
||||
import plotly.graph_objects as go
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Configure the browser tab (title and icon) and use the wide layout with the
# sidebar expanded on first load.
st.set_page_config(
    initial_sidebar_state="expanded",
    layout="wide",
    page_icon="❤️",
    page_title="CardioAI - 心血管疾病分析仪表板",
)

# Append the project root (the parent of this module's directory) to sys.path
# so that other project modules can be imported.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))

# Inject the page's custom CSS: header classes, the metric card and the
# button styling.
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
color: #e63946;
text-align: center;
margin-bottom: 2rem;
font-weight: bold;
}
.sub-header {
font-size: 1.5rem;
color: #457b9d;
margin-top: 1.5rem;
margin-bottom: 1rem;
font-weight: bold;
}
.metric-card {
background-color: #f1faee;
padding: 1.5rem;
border-radius: 10px;
border-left: 5px solid #1d3557;
margin-bottom: 1rem;
}
.stButton>button {
background-color: #1d3557;
color: white;
border: none;
padding: 0.5rem 1rem;
border-radius: 5px;
}
</style>
""", unsafe_allow_html=True)

# Location of the Excel dataset: <project root>/data/心血管疾病.xlsx
DATA_PATH = project_root / "data" / "心血管疾病.xlsx"
|
||||
|
||||
@st.cache_data(show_spinner="正在加载和清洗数据...")
def load_and_process_data():
    """Load the Excel dataset, clean it and derive the dashboard's feature columns.

    Processing steps:
        1. Read ``DATA_PATH`` with ``pd.read_excel`` and verify the required
           columns are present.
        2. Derive ``age_years`` from ``age`` (days / 365.25, rounded).
        3. Derive ``bmi`` = weight / (height / 100) ** 2, rounded to 2 decimals.
        4. Drop rows where ``ap_lo >= ap_hi`` and rows whose blood pressure is
           outside ``ap_hi`` in [90, 250] or ``ap_lo`` in [60, 150].
        5. Map the coded columns to display labels (``cholesterol_str``,
           ``gluc_str``, ``gender_str``, ``cardio_str``).
        6. Bucket ``bmi`` into ``bmi_category``.

    Returns:
        pd.DataFrame: the processed frame, or an empty DataFrame when required
        columns are missing or any exception is raised while loading/processing
        (the error is reported through ``st.error``).
    """
    try:
        # Load the raw data.
        st.info(f"正在从 {DATA_PATH} 加载数据...")
        df = pd.read_excel(DATA_PATH)

        # Bail out early if any column needed below is absent.
        required_columns = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                            'cholesterol', 'gluc', 'cardio']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            st.error(f"数据文件中缺少必要列: {missing_columns}")
            return pd.DataFrame()

        # Work on a copy so the frame returned by read_excel stays untouched.
        df_processed = df.copy()

        # 1. Age: convert days to (rounded) years.
        df_processed['age_years'] = (df_processed['age'] / 365.25).round().astype(int)

        # 2. BMI = weight(kg) / height(m)^2; height is divided by 100 (cm -> m).
        df_processed['bmi'] = df_processed['weight'] / ((df_processed['height'] / 100) ** 2)
        df_processed['bmi'] = df_processed['bmi'].round(2)

        # 3. Outlier handling.
        # Drop records whose diastolic pressure is >= the systolic pressure.
        invalid_bp = df_processed['ap_lo'] >= df_processed['ap_hi']
        if invalid_bp.any():
            st.warning(f"删除 {invalid_bp.sum()} 条舒张压 >= 收缩压的异常记录")
            df_processed = df_processed[~invalid_bp].copy()

        # Drop extreme blood-pressure values:
        # keep systolic in [90, 250] and diastolic in [60, 150].
        bp_outliers = ~((df_processed['ap_hi'] >= 90) & (df_processed['ap_hi'] <= 250) &
                        (df_processed['ap_lo'] >= 60) & (df_processed['ap_lo'] <= 150))
        if bp_outliers.any():
            st.warning(f"删除 {bp_outliers.sum()} 条血压极端异常值记录")
            df_processed = df_processed[~bp_outliers].copy()

        # 4. Map coded columns to display labels.
        cholesterol_map = {
            1: "正常",
            2: "高于正常",
            3: "极高"
        }
        df_processed['cholesterol_str'] = df_processed['cholesterol'].map(cholesterol_map)

        gluc_map = {
            1: "正常",
            2: "高于正常",
            3: "极高"
        }
        df_processed['gluc_str'] = df_processed['gluc'].map(gluc_map)

        gender_map = {
            1: "女性",
            2: "男性"
        }
        df_processed['gender_str'] = df_processed['gender'].map(gender_map)

        cardio_map = {
            0: "无心血管疾病",
            1: "有心血管疾病"
        }
        df_processed['cardio_str'] = df_processed['cardio'].map(cardio_map)

        # 5. BMI category.
        def categorize_bmi(bmi):
            """Return the BMI label using contiguous cut-offs at 18.5, 25 and 30."""
            # The ranges must be contiguous: the previous upper bounds of 24.9
            # and 29.9 sent values in [24.9, 25) and [29.9, 30) to the final
            # branch, labelling e.g. a BMI of 24.95 as "肥胖".
            # NaN compares False in every test and therefore still lands in the
            # final branch, exactly as before.
            if bmi < 18.5:
                return "偏瘦"
            elif bmi < 25:
                return "正常"
            elif bmi < 30:
                return "超重"
            else:
                return "肥胖"

        df_processed['bmi_category'] = df_processed['bmi'].apply(categorize_bmi)

        # Report the outcome of the cleaning pass.
        st.success(f"数据加载和清洗完成!共处理 {len(df_processed)} 条记录")
        st.info(f"原始数据: {len(df)} 条记录, 清洗后: {len(df_processed)} 条记录")

        return df_processed

    except Exception as e:
        # Any failure is surfaced in the UI and reported to the caller as an
        # empty frame.
        st.error(f"数据加载失败: {str(e)}")
        return pd.DataFrame()
|
||||
|
||||
def create_filters(df):
    """Render the sidebar filter widgets and return the current selections.

    Args:
        df: processed data frame; its ``age_years``, ``gender_str``,
            ``cardio_str``, ``bmi_category``, ``cholesterol_str`` and
            ``gluc_str`` columns supply the widget ranges and options.

    Returns:
        dict: ``age_range`` (the slider's value) plus the selected values of
        each multiselect under ``genders``, ``cardio``, ``bmi_categories``,
        ``cholesterol`` and ``gluc``.
    """
    sidebar = st.sidebar
    sidebar.markdown("## 🔍 数据筛选")

    # Age slider spanning the full range found in the data.
    youngest = int(df['age_years'].min())
    oldest = int(df['age_years'].max())
    age_range = sidebar.slider(
        "选择年龄范围:",
        min_value=youngest,
        max_value=oldest,
        value=(youngest, oldest),
        help="筛选指定年龄范围内的记录"
    )

    def pick_from_column(label, column, help_text):
        # Multiselect over the column's distinct values, all preselected.
        choices = df[column].unique().tolist()
        return sidebar.multiselect(
            label,
            options=choices,
            default=choices,
            help=help_text
        )

    # (result key, widget label, source column, help text) in display order.
    widget_specs = (
        ('genders', "选择性别:", 'gender_str', "选择要分析的性别"),
        ('cardio', "选择心血管疾病状态:", 'cardio_str', "选择要分析的心血管疾病状态"),
        ('bmi_categories', "选择BMI分类:", 'bmi_category', "选择要分析的BMI分类"),
        ('cholesterol', "选择胆固醇水平:", 'cholesterol_str', "选择要分析的胆固醇水平"),
        ('gluc', "选择血糖水平:", 'gluc_str', "选择要分析的血糖水平"),
    )

    selections = {'age_range': age_range}
    for key, label, column, help_text in widget_specs:
        selections[key] = pick_from_column(label, column, help_text)
    return selections
|
||||
|
||||
def apply_filters(df, filters):
    """Return the rows of ``df`` that satisfy the given filter selections.

    Args:
        df: data frame to filter; it is not modified.
        filters: dict with ``age_range`` (inclusive lower/upper bounds at
            positions 0 and 1) and the lists ``genders``, ``cardio``,
            ``bmi_categories``, ``cholesterol`` and ``gluc``. An empty list
            means that filter is not applied.

    Returns:
        pd.DataFrame: the filtered rows.
    """
    result = df.copy()

    # Age bounds are inclusive on both ends.
    lower = filters['age_range'][0]
    upper = filters['age_range'][1]
    old_enough = result['age_years'] >= lower
    young_enough = result['age_years'] <= upper
    result = result[old_enough & young_enough]

    # (data column, filters key) pairs, applied in this order.
    categorical_filters = (
        ('gender_str', 'genders'),
        ('cardio_str', 'cardio'),
        ('bmi_category', 'bmi_categories'),
        ('cholesterol_str', 'cholesterol'),
        ('gluc_str', 'gluc'),
    )
    for column, key in categorical_filters:
        chosen = filters[key]
        if not chosen:
            # Nothing selected: leave this dimension unfiltered.
            continue
        result = result[result[column].isin(chosen)]

    return result
|
||||
|
||||
def display_metrics(filtered_df, original_df):
    """Render the four headline metrics in a single row of columns.

    Args:
        filtered_df: data frame after the sidebar filters were applied.
        original_df: unfiltered data frame; only its length is used, for the
            record-count delta.
    """
    row_count = len(filtered_df)
    has_rows = row_count > 0
    count_col, risk_col, age_col, bmi_col = st.columns(4)

    # Record count, with the difference to the unfiltered frame as delta.
    count_col.metric(
        label="筛选后记录数",
        value=f"{row_count:,}",
        delta=f"{row_count - len(original_df):+,}"
    )

    # Share of rows with cardio == 1, as a percentage.
    if not has_rows:
        risk_col.metric(label="心血管疾病风险率", value="N/A")
    else:
        risk_rate = (filtered_df['cardio'].sum() / row_count * 100).round(2)
        risk_col.metric(
            label="心血管疾病风险率",
            value=f"{risk_rate}%",
            help="当前筛选条件下心血管疾病患者比例"
        )

    # Mean age.
    if not has_rows:
        age_col.metric(label="平均年龄", value="N/A")
    else:
        avg_age = filtered_df['age_years'].mean().round(1)
        age_col.metric(
            label="平均年龄",
            value=f"{avg_age} 岁",
            help="当前筛选条件下的平均年龄"
        )

    # Mean BMI.
    if not has_rows:
        bmi_col.metric(label="平均BMI", value="N/A")
    else:
        avg_bmi = filtered_df['bmi'].mean().round(1)
        bmi_col.metric(
            label="平均BMI",
            value=str(avg_bmi),
            help="当前筛选条件下的平均身体质量指数"
        )
|
||||
|
||||
def create_visualizations(df):
    """Render the dashboard charts for the given (already filtered) frame.

    Charts, in order: age histogram by disease status, stacked bar of disease
    proportion per BMI category, systolic/diastolic scatter plot, and two pie
    charts for cholesterol and glucose levels. Shows a warning and returns
    without drawing anything when ``df`` has no rows.

    Args:
        df: data frame to visualise.
    """
    if len(df) == 0:
        st.warning("没有可用的数据进行可视化")
        return

    st.markdown("## 📊 数据可视化分析")

    # Colours keyed by disease-status label; each chart receives its own copy.
    status_palette = {
        "有心血管疾病": "#e63946",
        "无心血管疾病": "#457b9d"
    }

    left, right = st.columns(2)

    # Chart 1: age histogram, one overlaid series per disease status.
    left.markdown("### 年龄分布分析")
    age_labels = {
        'age_years': '年龄(岁)',
        'cardio_str': '心血管疾病状态',
        'count': '人数'
    }
    fig_age = px.histogram(
        df,
        x='age_years',
        color='cardio_str',
        nbins=30,
        barmode='overlay',
        opacity=0.7,
        labels=age_labels,
        title="年龄分布与心血管疾病关系",
        color_discrete_map=dict(status_palette)
    )
    fig_age.update_layout(
        legend_title="疾病状态",
        hovermode='x unified'
    )
    left.plotly_chart(fig_age, use_container_width=True)

    # Chart 2: per BMI category, the proportion of each disease status.
    right.markdown("### BMI分类与心血管疾病关系")
    # Row-normalised crosstab: each BMI category sums to 1.
    proportions_wide = pd.crosstab(
        df['bmi_category'],
        df['cardio_str'],
        normalize='index'
    )
    proportions_wide = proportions_wide.reset_index()
    # Long format: one row per (BMI category, disease status).
    proportions_long = proportions_wide.melt(
        id_vars='bmi_category',
        var_name='cardio_status',
        value_name='proportion'
    )
    bmi_labels = {
        'bmi_category': 'BMI分类',
        'proportion': '比例',
        'cardio_status': '心血管疾病状态'
    }
    fig_bmi = px.bar(
        proportions_long,
        x='bmi_category',
        y='proportion',
        color='cardio_status',
        barmode='stack',
        labels=bmi_labels,
        title="BMI分类对心血管疾病的影响",
        color_discrete_map=dict(status_palette)
    )
    fig_bmi.update_layout(
        yaxis_tickformat='.1%',
        legend_title="疾病状态"
    )
    right.plotly_chart(fig_bmi, use_container_width=True)

    # Chart 3: systolic vs diastolic pressure, marker size taken from BMI.
    st.markdown("### 血压关系分析")
    bp_labels = {
        'ap_hi': '收缩压 (mmHg)',
        'ap_lo': '舒张压 (mmHg)',
        'cardio_str': '心血管疾病状态',
        'bmi': 'BMI'
    }
    fig_bp = px.scatter(
        df,
        x='ap_hi',
        y='ap_lo',
        color='cardio_str',
        size='bmi',
        hover_data=['age_years', 'gender_str', 'cholesterol_str'],
        labels=bp_labels,
        title="血压关系散点图",
        color_discrete_map=dict(status_palette)
    )
    fig_bp.update_layout(legend_title="疾病状态")
    st.plotly_chart(fig_bp, use_container_width=True)

    # Chart 4: distribution pies for cholesterol and glucose levels.
    def draw_level_pie(container, heading, title, column, level_name, colors):
        # Count each level of `column` and draw the counts as a pie chart.
        container.markdown(heading)
        level_counts = df[column].value_counts().reset_index()
        level_counts.columns = [level_name, 'count']
        fig = px.pie(
            level_counts,
            values='count',
            names=level_name,
            title=title,
            color_discrete_sequence=colors
        )
        fig.update_traces(textposition='inside', textinfo='percent+label')
        container.plotly_chart(fig, use_container_width=True)

    chol_col, gluc_col = st.columns(2)
    draw_level_pie(
        chol_col,
        "### 胆固醇水平分布",
        "胆固醇水平分布",
        'cholesterol_str',
        'cholesterol_level',
        px.colors.sequential.RdBu
    )
    draw_level_pie(
        gluc_col,
        "### 血糖水平分布",
        "血糖水平分布",
        'gluc_str',
        'gluc_level',
        px.colors.sequential.Blues
    )
|
||||
|
||||
def display_data_preview(df):
    """Render two collapsed expanders: a data summary and a table preview.

    The summary shows the frame's shape, per-column dtypes and the columns
    that contain missing values. The table shows the first 100 rows of the
    columns chosen in a multiselect.

    Args:
        df: data frame to preview.
    """
    st.markdown("## 📋 数据预览")

    # Summary: shape and dtypes on the left, missing values on the right.
    with st.expander("数据摘要", expanded=False):
        shape_col, missing_col = st.columns(2)

        with shape_col:
            st.write("**数据形状:**", df.shape)
            st.write("**数据类型:**")
            dtype_table = df.dtypes.astype(str).reset_index()
            dtype_table = dtype_table.rename(columns={0: '类型', 'index': '列名'})
            st.write(dtype_table)

        with missing_col:
            st.write("**缺失值统计:**")
            null_counts = df.isnull().sum().reset_index()
            null_counts.columns = ['列名', '缺失值数量']
            # Keep only the columns that actually have missing values.
            null_counts = null_counts[null_counts['缺失值数量'] > 0]
            if len(null_counts) == 0:
                st.write("无缺失值")
            else:
                st.write(null_counts)

    # Table preview limited to the user's column selection.
    with st.expander("查看原始数据", expanded=False):
        preselected = ['age_years', 'gender_str', 'bmi', 'bmi_category',
                       'ap_hi', 'ap_lo', 'cholesterol_str', 'gluc_str', 'cardio_str']
        chosen_columns = st.multiselect(
            "选择要显示的列:",
            options=df.columns.tolist(),
            default=preselected
        )

        if not chosen_columns:
            st.info("请选择要显示的列")
        else:
            preview = df[chosen_columns].copy()
            st.dataframe(preview.head(100), use_container_width=True)
            st.caption(f"显示前 100 行(共 {len(df)} 行)")
|
||||
|
||||
def main():
    """Entry point: load the data, then render filters, metrics, preview and charts."""
    # Page title followed by a horizontal rule.
    st.markdown('<h1 class="main-header">❤️ CardioAI - 心血管疾病智能分析仪表板</h1>', unsafe_allow_html=True)
    st.markdown("---")

    # Load the processed data set.
    with st.spinner("正在加载数据,请稍候..."):
        data = load_and_process_data()

    # The loader returns an empty frame on failure; stop rendering in that case.
    if data.empty:
        st.error("数据加载失败,请检查数据文件路径和格式")
        return

    # Sidebar filters, then the filtered subset they select.
    selections = create_filters(data)
    subset = apply_filters(data, selections)

    # Headline metrics.
    st.markdown("## 📈 关键指标")
    display_metrics(subset, data)

    # Data preview, then the charts.
    display_data_preview(subset)
    create_visualizations(subset)

    # Sidebar help panels. The text is kept flush-left so the string contents
    # match the source as shown.
    sidebar = st.sidebar
    sidebar.markdown("---")
    sidebar.markdown("## ℹ️ 使用说明")
    usage_text = """
1. 使用左侧筛选器选择要分析的数据子集
2. 查看上方的关键指标了解数据概况
3. 探索下方的可视化图表分析趋势和关系
4. 点击数据预览查看详细数据
"""
    sidebar.info(usage_text)

    sidebar.markdown("## 📊 数据说明")
    data_notes_text = """
- **年龄**: 原始数据为天数,已转换为岁数
- **BMI**: 身体质量指数,计算公式:体重(kg)/身高(m)²
- **血压**: 收缩压(ap_hi)和舒张压(ap_lo)
- **胆固醇/血糖**: 1=正常, 2=高于正常, 3=极高
- **心血管疾病**: 0=无, 1=有
"""
    sidebar.info(data_notes_text)


if __name__ == "__main__":
    main()
|
||||
84
test/module1_dashboard/test_data.py
Normal file
84
test/module1_dashboard/test_data.py
Normal file
@@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
测试数据加载和处理的脚本
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# 添加父目录到路径
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
# 导入数据处理函数
|
||||
from cardio_dashboard import load_and_process_data
|
||||
|
||||
def test_data_loading():
    """Smoke-test the dashboard's load/clean pipeline and print a report.

    Checks that the processed frame is non-empty, contains the derived
    columns, has no row with ``ap_lo >= ap_hi`` and keeps blood pressure
    within ``ap_hi`` in [90, 250] and ``ap_lo`` in [60, 150].

    Returns:
        bool: True when every check passes, False on the first failed check
        or when an exception is raised (the traceback is printed).
    """
    print("开始测试数据加载和清洗...")

    try:
        # Load the data through the dashboard's own loader.
        df = load_and_process_data()
        if df.empty:
            print("❌ 数据加载失败:返回空数据框")
            return False
        print(f"✅ 数据加载成功!共 {len(df)} 条记录")

        # Derived and raw columns the dashboard relies on.
        expected_columns = ['age_years', 'bmi', 'bmi_category', 'cholesterol_str',
                            'gluc_str', 'gender_str', 'cardio_str', 'ap_hi', 'ap_lo']
        missing_columns = [name for name in expected_columns if name not in df.columns]
        if missing_columns:
            print(f"❌ 缺少必要的列: {missing_columns}")
            return False
        print("✅ 所有必要的列都存在")

        # Value ranges of the numeric columns.
        age, bmi = df['age_years'], df['bmi']
        systolic, diastolic = df['ap_hi'], df['ap_lo']
        print("\n数据摘要:")
        print(f"- 年龄范围: {age.min()} ~ {age.max()} 岁")
        print(f"- BMI范围: {bmi.min():.1f} ~ {bmi.max():.1f}")
        print(f"- 收缩压范围: {systolic.min()} ~ {systolic.max()} mmHg")
        print(f"- 舒张压范围: {diastolic.min()} ~ {diastolic.max()} mmHg")

        # Distribution of each label column.
        print("\n类别分布:")
        label_columns = (
            ("性别", 'gender_str'),
            ("心血管疾病", 'cardio_str'),
            ("BMI分类", 'bmi_category'),
            ("胆固醇水平", 'cholesterol_str'),
            ("血糖水平", 'gluc_str'),
        )
        for title, column in label_columns:
            print(f"- {title}: {df[column].value_counts().to_dict()}")

        # No row may have diastolic >= systolic after cleaning.
        invalid_bp = diastolic >= systolic
        if invalid_bp.any():
            print(f"❌ 仍然存在舒张压 >= 收缩压的记录: {invalid_bp.sum()} 条")
            return False
        print("✅ 已成功删除舒张压 >= 收缩压的记录")

        # Every row must be inside the accepted blood-pressure ranges.
        bp_in_range = ((systolic >= 90) & (systolic <= 250) &
                       (diastolic >= 60) & (diastolic <= 150))
        if not bp_in_range.all():
            print(f"❌ 仍然存在血压异常值: {(~bp_in_range).sum()} 条")
            return False
        print("✅ 所有血压值都在正常范围内")

        print("\n🎉 所有测试通过!")
        return True

    except Exception as e:
        print(f"❌ 测试过程中发生错误: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    # Exit status 0 on success, 1 on failure.
    passed = test_data_loading()
    sys.exit(1 if not passed else 0)
|
||||
33
test/requirements.txt
Normal file
33
test/requirements.txt
Normal file
@@ -0,0 +1,33 @@
|
||||
# CardioAI - Cardiovascular Disease Intelligent Assistant System
|
||||
# Python dependencies for the multi-module application
|
||||
|
||||
# Conda Environment Setup Instructions:
|
||||
# 1. Create a new conda environment named 'cardioenv' with Python 3.10:
|
||||
# conda create -n cardioenv python=3.10
|
||||
# 2. Activate the environment:
|
||||
# conda activate cardioenv
|
||||
# 3. Install dependencies from this file:
|
||||
# pip install -r requirements.txt
|
||||
|
||||
# Core data processing and machine learning
|
||||
pandas>=2.0.0
|
||||
openpyxl>=3.1.0
|
||||
numpy>=1.24.0
|
||||
scikit-learn>=1.3.0
|
||||
xgboost>=2.0.0
|
||||
joblib>=1.3.0
|
||||
|
||||
# Data visualization and dashboard
|
||||
streamlit>=1.28.0
|
||||
plotly>=5.18.0
|
||||
|
||||
# Web API and prediction server
|
||||
Flask>=3.0.0
|
||||
|
||||
# Environment configuration
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# AI and language model integration
|
||||
langchain-openai>=0.0.5
|
||||
dashscope>=1.14.0
|
||||
requests>=2.31.0
|
||||
1
zzs_test
Submodule
1
zzs_test
Submodule
Submodule zzs_test added at bf30e493dd
1
zzs_test4
Submodule
1
zzs_test4
Submodule
Submodule zzs_test4 added at 39b398e159
Reference in New Issue
Block a user