Files
AIcode/test/module1_dashboard/cardio_dashboard.py
2026-04-02 17:16:19 +08:00

581 lines
17 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
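CardioAI - Intelligent assistant system for cardiovascular disease
Data visualization dashboard module
Features:
1. Data loading and cleaning
2. Feature engineering (age conversion, BMI calculation, category mapping)
3. Interactive data filtering
4. Visual analysis (Plotly charts)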
"""
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import sys
import os
# Page configuration. This is the first Streamlit call in the file; keep it
# ahead of every other Streamlit command.
st.set_page_config(
page_title="CardioAI - 心血管疾病分析仪表板",
page_icon="❤️",
layout="wide",
initial_sidebar_state="expanded"
)
# Append the directory two levels above this file to sys.path so that other
# project modules can be imported.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
# Inject the dashboard's custom CSS (main header, sub header, metric card and
# button styles). unsafe_allow_html is required for the <style> tag to be
# rendered as markup.
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
color: #e63946;
text-align: center;
margin-bottom: 2rem;
font-weight: bold;
}
.sub-header {
font-size: 1.5rem;
color: #457b9d;
margin-top: 1.5rem;
margin-bottom: 1rem;
font-weight: bold;
}
.metric-card {
background-color: #f1faee;
padding: 1.5rem;
border-radius: 10px;
border-left: 5px solid #1d3557;
margin-bottom: 1rem;
}
.stButton>button {
background-color: #1d3557;
color: white;
border: none;
padding: 0.5rem 1rem;
border-radius: 5px;
}
</style>
""", unsafe_allow_html=True)
# Path of the Excel data file: <two levels above this file>/data/心血管疾病.xlsx
DATA_PATH = Path(__file__).parent.parent / "data" / "心血管疾病.xlsx"
@st.cache_data(show_spinner="正在加载和清洗数据...")
def load_and_process_data():
    """Load the Excel dataset, clean it and derive the dashboard features.

    Processing steps, in order:

    1. Read ``DATA_PATH`` with ``pd.read_excel`` and check that every
       required column is present.
    2. Derive ``age_years`` (``age`` in days / 365.25, rounded to an int) and
       ``bmi`` (``weight`` / (``height`` / 100) ** 2, rounded to 2 decimals).
    3. Drop rows where ``ap_lo >= ap_hi`` and rows outside
       ``ap_hi`` in [90, 250] or ``ap_lo`` in [60, 150].
    4. Map the coded columns ``cholesterol``, ``gluc``, ``gender`` and
       ``cardio`` to display labels stored in ``*_str`` columns.
    5. Bucket ``bmi`` into ``bmi_category`` using contiguous bands
       (< 18.5, < 25, < 30, otherwise the top band).

    Progress, warnings and errors are reported through Streamlit messages.

    Returns:
        pd.DataFrame: The processed frame. An empty frame is returned when a
        required column is missing or when any exception is raised while
        loading or processing.

    NOTE(review): the function is wrapped in ``st.cache_data``, so the empty
    frame returned on failure is presumably cached too; confirm whether a
    failed load should be retried on the next rerun.
    """
    try:
        # Load the raw workbook.
        st.info(f"正在从 {DATA_PATH} 加载数据...")
        df = pd.read_excel(DATA_PATH)

        # Stop early when a column the pipeline relies on is absent.
        required_columns = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                            'cholesterol', 'gluc', 'cardio']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            st.error(f"数据文件中缺少必要列: {missing_columns}")
            return pd.DataFrame()

        # Work on a copy so the raw row count stays available for the summary.
        df_processed = df.copy()

        # 1. Age: convert days to whole years (rounded).
        df_processed['age_years'] = (df_processed['age'] / 365.25).round().astype(int)

        # 2. BMI = weight(kg) / height(m)^2; height is stored in centimetres.
        df_processed['bmi'] = df_processed['weight'] / ((df_processed['height'] / 100) ** 2)
        df_processed['bmi'] = df_processed['bmi'].round(2)

        # 3. Outlier handling.
        # Drop records whose diastolic pressure is not below the systolic one.
        invalid_bp = df_processed['ap_lo'] >= df_processed['ap_hi']
        if invalid_bp.any():
            st.warning(f"删除 {invalid_bp.sum()} 条舒张压 >= 收缩压的异常记录")
            df_processed = df_processed[~invalid_bp].copy()

        # Drop extreme blood-pressure values:
        # systolic must lie in [90, 250], diastolic in [60, 150].
        bp_outliers = ~((df_processed['ap_hi'] >= 90) & (df_processed['ap_hi'] <= 250) &
                        (df_processed['ap_lo'] >= 60) & (df_processed['ap_lo'] <= 150))
        if bp_outliers.any():
            st.warning(f"删除 {bp_outliers.sum()} 条血压极端异常值记录")
            df_processed = df_processed[~bp_outliers].copy()

        # 4. Map coded columns to display labels. Codes missing from a map
        # become NaN in the corresponding *_str column.
        cholesterol_map = {
            1: "正常",
            2: "高于正常",
            3: "极高",
        }
        df_processed['cholesterol_str'] = df_processed['cholesterol'].map(cholesterol_map)

        gluc_map = {
            1: "正常",
            2: "高于正常",
            3: "极高",
        }
        df_processed['gluc_str'] = df_processed['gluc'].map(gluc_map)

        gender_map = {
            1: "女性",
            2: "男性",
        }
        df_processed['gender_str'] = df_processed['gender'].map(gender_map)

        cardio_map = {
            0: "无心血管疾病",
            1: "有心血管疾病",
        }
        df_processed['cardio_str'] = df_processed['cardio'].map(cardio_map)

        # 5. BMI category. The bands are contiguous and half-open
        # ([.., 18.5), [18.5, 25), [25, 30), [30, ..)) so that values such as
        # 24.95 or 29.95 land in "正常" / "超重" instead of slipping through a
        # gap into the top band. np.select takes the first matching
        # condition; rows matching none (including NaN BMI) get the default.
        bmi = df_processed['bmi']
        df_processed['bmi_category'] = np.select(
            [bmi < 18.5, bmi < 25, bmi < 30],
            ["偏瘦", "正常", "超重"],
            default="肥胖",
        )

        # Report the outcome of the cleaning pass.
        st.success(f"数据加载和清洗完成!共处理 {len(df_processed)} 条记录")
        st.info(f"原始数据: {len(df)} 条记录, 清洗后: {len(df_processed)} 条记录")
        return df_processed
    except Exception as e:
        # Top-level boundary for the loader: surface the error in the UI and
        # hand back an empty frame, which callers treat as "load failed".
        st.error(f"数据加载失败: {str(e)}")
        return pd.DataFrame()
def create_filters(df):
    """Render the sidebar filter widgets and collect the current selections.

    Args:
        df: Processed data frame. It must provide the columns ``age_years``,
            ``gender_str``, ``cardio_str``, ``bmi_category``,
            ``cholesterol_str`` and ``gluc_str``.

    Returns:
        dict: Filter settings keyed by ``age_range`` (value of the range
        slider), ``genders``, ``cardio``, ``bmi_categories``, ``cholesterol``
        and ``gluc`` (values of the respective multiselect widgets).
    """
    sidebar = st.sidebar
    sidebar.markdown("## 🔍 数据筛选")

    # Age range slider spanning the full range present in the data.
    ages = df['age_years']
    youngest = int(ages.min())
    oldest = int(ages.max())
    selections = {
        'age_range': sidebar.slider(
            "选择年龄范围:",
            min_value=youngest,
            max_value=oldest,
            value=(youngest, oldest),
            help="筛选指定年龄范围内的记录",
        ),
    }

    # One multiselect per categorical column, listed in display order as
    # (result key, source column, widget label, help text). Every widget
    # starts with all of its options selected.
    categorical_filters = (
        ('genders', 'gender_str', "选择性别:", "选择要分析的性别"),
        ('cardio', 'cardio_str', "选择心血管疾病状态:", "选择要分析的心血管疾病状态"),
        ('bmi_categories', 'bmi_category', "选择BMI分类:", "选择要分析的BMI分类"),
        ('cholesterol', 'cholesterol_str', "选择胆固醇水平:", "选择要分析的胆固醇水平"),
        ('gluc', 'gluc_str', "选择血糖水平:", "选择要分析的血糖水平"),
    )
    for key, column, label, tip in categorical_filters:
        choices = df[column].unique().tolist()
        selections[key] = sidebar.multiselect(
            label,
            options=choices,
            default=choices,
            help=tip,
        )

    return selections
def apply_filters(df, filters):
    """Return the rows of ``df`` that satisfy the given filter settings.

    Args:
        df: Data frame providing ``age_years`` and the categorical columns
            ``gender_str``, ``cardio_str``, ``bmi_category``,
            ``cholesterol_str`` and ``gluc_str``.
        filters: Mapping with ``age_range`` (lower and upper bound, both
            inclusive) and the selection lists ``genders``, ``cardio``,
            ``bmi_categories``, ``cholesterol`` and ``gluc``. An empty
            selection list leaves its column unfiltered.

    Returns:
        pd.DataFrame: A new frame with the matching rows; ``df`` itself is
        not modified.
    """
    bounds = filters['age_range']
    # The age filter is always active; between() includes both bounds.
    keep = df['age_years'].between(bounds[0], bounds[1])

    # (filter key, column it constrains)
    categorical = (
        ('genders', 'gender_str'),
        ('cardio', 'cardio_str'),
        ('bmi_categories', 'bmi_category'),
        ('cholesterol', 'cholesterol_str'),
        ('gluc', 'gluc_str'),
    )
    for key, column in categorical:
        wanted = filters[key]
        if wanted:
            keep &= df[column].isin(wanted)

    return df[keep]
def display_metrics(filtered_df, original_df):
    """Show four headline metrics for the current selection in one row.

    The metrics are: number of filtered records (with the difference to the
    unfiltered row count as delta), share of records with ``cardio`` set,
    mean ``age_years`` and mean ``bmi``. When the selection is empty the last
    three metrics display "N/A".

    Args:
        filtered_df: Data frame after the sidebar filters were applied.
        original_df: Unfiltered data frame, used only for the delta.
    """
    row_count = len(filtered_df)
    has_rows = row_count > 0
    count_col, risk_col, age_col, bmi_col = st.columns(4)

    with count_col:
        st.metric(
            label="筛选后记录数",
            value=f"{row_count:,}",
            delta=f"{row_count - len(original_df):+,}",
        )

    with risk_col:
        # Percentage of selected records flagged as cardiovascular disease.
        if not has_rows:
            st.metric(label="心血管疾病风险率", value="N/A")
        else:
            positives = filtered_df['cardio'].sum()
            risk_rate = (positives / row_count * 100).round(2)
            st.metric(
                label="心血管疾病风险率",
                value=f"{risk_rate}%",
                help="当前筛选条件下心血管疾病患者比例",
            )

    with age_col:
        # Mean age of the selection, one decimal.
        if not has_rows:
            st.metric(label="平均年龄", value="N/A")
        else:
            avg_age = filtered_df['age_years'].mean().round(1)
            st.metric(
                label="平均年龄",
                value=f"{avg_age}",
                help="当前筛选条件下的平均年龄",
            )

    with bmi_col:
        # Mean BMI of the selection, one decimal.
        if not has_rows:
            st.metric(label="平均BMI", value="N/A")
        else:
            avg_bmi = filtered_df['bmi'].mean().round(1)
            st.metric(
                label="平均BMI",
                value=str(avg_bmi),
                help="当前筛选条件下的平均身体质量指数",
            )
def create_visualizations(df):
    """Render the dashboard charts for the given data frame.

    Charts, in display order: an age histogram split by disease status, a
    stacked bar chart of disease share per BMI category, a systolic/diastolic
    scatter plot sized by BMI, and two pie charts for the cholesterol and
    glucose level distributions. A warning is shown instead when ``df`` has
    no rows.

    Args:
        df: Data frame to visualise; it must provide ``age_years``, ``bmi``,
            ``bmi_category``, ``ap_hi``, ``ap_lo`` and the ``*_str`` label
            columns used below.
    """
    if len(df) == 0:
        st.warning("没有可用的数据进行可视化")
        return

    # Colours shared by every chart that is split by disease status.
    status_colors = {
        "有心血管疾病": "#e63946",
        "无心血管疾病": "#457b9d",
    }

    def _level_counts(column, level_name):
        # Frequency table of one label column as (level_name, 'count') columns.
        counts = df[column].value_counts().reset_index()
        counts.columns = [level_name, 'count']
        return counts

    def _show_level_pie(column, level_name, title, palette):
        # Build and display a pie chart of the level distribution of `column`.
        pie = px.pie(
            _level_counts(column, level_name),
            values='count',
            names=level_name,
            title=title,
            color_discrete_sequence=palette,
        )
        pie.update_traces(textposition='inside', textinfo='percent+label')
        st.plotly_chart(pie, use_container_width=True)

    st.markdown("## 📊 数据可视化分析")

    # Row 1: age histogram (left) and BMI category share (right).
    left, right = st.columns(2)

    with left:
        st.markdown("### 年龄分布分析")
        age_labels = {
            'age_years': '年龄(岁)',
            'cardio_str': '心血管疾病状态',
            'count': '人数',
        }
        fig_age = px.histogram(
            df,
            x='age_years',
            color='cardio_str',
            nbins=30,
            barmode='overlay',
            opacity=0.7,
            labels=age_labels,
            title="年龄分布与心血管疾病关系",
            # Pass a copy so every figure owns its colour mapping.
            color_discrete_map=dict(status_colors),
        )
        fig_age.update_layout(legend_title="疾病状态", hovermode='x unified')
        st.plotly_chart(fig_age, use_container_width=True)

    with right:
        st.markdown("### BMI分类与心血管疾病关系")
        # Row-normalised crosstab: share of each disease status per BMI category.
        share_wide = pd.crosstab(
            df['bmi_category'],
            df['cardio_str'],
            normalize='index',
        ).reset_index()
        # Long format, one row per (BMI category, disease status) pair.
        share_long = share_wide.melt(
            id_vars='bmi_category',
            var_name='cardio_status',
            value_name='proportion',
        )
        bmi_labels = {
            'bmi_category': 'BMI分类',
            'proportion': '比例',
            'cardio_status': '心血管疾病状态',
        }
        fig_bmi = px.bar(
            share_long,
            x='bmi_category',
            y='proportion',
            color='cardio_status',
            barmode='stack',
            labels=bmi_labels,
            title="BMI分类对心血管疾病的影响",
            color_discrete_map=dict(status_colors),
        )
        fig_bmi.update_layout(yaxis_tickformat='.1%', legend_title="疾病状态")
        st.plotly_chart(fig_bmi, use_container_width=True)

    # Row 2: blood-pressure scatter plot, marker size driven by BMI.
    st.markdown("### 血压关系分析")
    bp_labels = {
        'ap_hi': '收缩压 (mmHg)',
        'ap_lo': '舒张压 (mmHg)',
        'cardio_str': '心血管疾病状态',
        'bmi': 'BMI',
    }
    fig_bp = px.scatter(
        df,
        x='ap_hi',
        y='ap_lo',
        color='cardio_str',
        size='bmi',
        hover_data=['age_years', 'gender_str', 'cholesterol_str'],
        labels=bp_labels,
        title="血压关系散点图",
        color_discrete_map=dict(status_colors),
    )
    fig_bp.update_layout(legend_title="疾病状态")
    st.plotly_chart(fig_bp, use_container_width=True)

    # Row 3: cholesterol (left) and glucose (right) level distributions.
    chol_col, gluc_col = st.columns(2)

    with chol_col:
        st.markdown("### 胆固醇水平分布")
        _show_level_pie(
            'cholesterol_str',
            'cholesterol_level',
            "胆固醇水平分布",
            px.colors.sequential.RdBu,
        )

    with gluc_col:
        st.markdown("### 血糖水平分布")
        _show_level_pie(
            'gluc_str',
            'gluc_level',
            "血糖水平分布",
            px.colors.sequential.Blues,
        )
def display_data_preview(df):
    """Render two collapsed expanders: a data summary and a table preview.

    The summary shows the frame's shape, the dtype of every column and the
    columns that contain missing values. The preview lets the user pick the
    columns to show and displays the first 100 rows of that selection.

    Args:
        df: Data frame to preview.
    """
    st.markdown("## 📋 数据预览")

    # Summary: shape and dtypes on the left, missing-value counts on the right.
    with st.expander("数据摘要", expanded=False):
        shape_col, missing_col = st.columns(2)

        with shape_col:
            st.write("**数据形状:**", df.shape)
            st.write("**数据类型:**")
            dtype_table = df.dtypes.astype(str).reset_index()
            st.write(dtype_table.rename(columns={0: '类型', 'index': '列名'}))

        with missing_col:
            st.write("**缺失值统计:**")
            null_counts = df.isnull().sum().reset_index()
            null_counts.columns = ['列名', '缺失值数量']
            # Only columns that actually have missing values are listed.
            with_gaps = null_counts[null_counts['缺失值数量'] > 0]
            if len(with_gaps) > 0:
                st.write(with_gaps)
            else:
                st.write("无缺失值")

    # Table preview with a user-selectable set of columns.
    with st.expander("查看原始数据", expanded=False):
        preselected = ['age_years', 'gender_str', 'bmi', 'bmi_category',
                       'ap_hi', 'ap_lo', 'cholesterol_str', 'gluc_str', 'cardio_str']
        chosen = st.multiselect(
            "选择要显示的列:",
            options=df.columns.tolist(),
            default=preselected,
        )
        if not chosen:
            st.info("请选择要显示的列")
        else:
            preview = df[chosen].copy()
            st.dataframe(preview.head(100), use_container_width=True)
            st.caption(f"显示前 100 行(共 {len(df)} 行)")
def main():
    """Build the dashboard page.

    Renders the title, loads the data, draws the sidebar filters, then shows
    the key metrics, the data preview and the charts for the filtered data,
    and finally adds usage and data notes to the sidebar. The function
    returns early with an error message when the loaded frame is empty.
    """
    # Page title.
    st.markdown('<h1 class="main-header">❤️ CardioAI - 心血管疾病智能分析仪表板</h1>', unsafe_allow_html=True)
    st.markdown("---")

    # Load the processed data set; an empty frame signals a failed load.
    with st.spinner("正在加载数据,请稍候..."):
        dataset = load_and_process_data()
    if dataset.empty:
        st.error("数据加载失败,请检查数据文件路径和格式")
        return

    # Sidebar filters and the resulting subset.
    subset = apply_filters(dataset, create_filters(dataset))

    # Key metrics.
    st.markdown("## 📈 关键指标")
    display_metrics(subset, dataset)

    # Data preview followed by the charts.
    display_data_preview(subset)
    create_visualizations(subset)

    # Sidebar notes: how to use the page and what the fields mean.
    sidebar = st.sidebar
    sidebar.markdown("---")
    sidebar.markdown("## 使用说明")
    sidebar.info("""
1. 使用左侧筛选器选择要分析的数据子集
2. 查看上方的关键指标了解数据概况
3. 探索下方的可视化图表分析趋势和关系
4. 点击数据预览查看详细数据
""")
    sidebar.markdown("## 📊 数据说明")
    sidebar.info("""
- **年龄**: 原始数据为天数,已转换为岁数
- **BMI**: 身体质量指数,计算公式:体重(kg)/身高(m)²
- **血压**: 收缩压(ap_hi)和舒张压(ap_lo)
- **胆固醇/血糖**: 1=正常, 2=高于正常, 3=极高
- **心血管疾病**: 0=无, 1=有
""")


if __name__ == "__main__":
    main()