#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
CardioAI - intelligent cardiovascular disease assistance system.
Data visualization dashboard module.

Features:
1. Data loading and cleaning
2. Feature engineering (age conversion, BMI calculation, category conversion)
3. Interactive data filtering
4. Visual analysis (Plotly charts)
"""
|
||
|
||
import streamlit as st
|
||
import pandas as pd
|
||
import numpy as np
|
||
import plotly.express as px
|
||
import plotly.graph_objects as go
|
||
from pathlib import Path
|
||
import sys
|
||
import os
|
||
|
||
# Page configuration: browser tab title and icon, wide layout, and a sidebar
# that starts expanded. This is the first Streamlit call made by the script.
st.set_page_config(
    page_title="CardioAI - 心血管疾病分析仪表板",
    page_icon="❤️",
    layout="wide",
    initial_sidebar_state="expanded"
)
|
||
|
||
# Put the project root (the directory two levels above this file) on the
# import path so that other project modules can be imported.
project_root = Path(__file__).parent.parent
# The script body runs again on every Streamlit rerun inside the same process,
# so only append the entry when it is not already present; otherwise sys.path
# would gain one duplicate per rerun.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
|
||
|
||
# Inject custom CSS into the page: styles for the .main-header, .sub-header
# and .metric-card classes plus the look of Streamlit buttons.
# unsafe_allow_html=True is what lets the raw <style> element through.
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
color: #e63946;
text-align: center;
margin-bottom: 2rem;
font-weight: bold;
}
.sub-header {
font-size: 1.5rem;
color: #457b9d;
margin-top: 1.5rem;
margin-bottom: 1rem;
font-weight: bold;
}
.metric-card {
background-color: #f1faee;
padding: 1.5rem;
border-radius: 10px;
border-left: 5px solid #1d3557;
margin-bottom: 1rem;
}
.stButton>button {
background-color: #1d3557;
color: white;
border: none;
padding: 0.5rem 1rem;
border-radius: 5px;
}
</style>
""", unsafe_allow_html=True)
|
||
|
||
# Location of the Excel workbook holding the cardiovascular dataset:
# <directory two levels above this file>/data/心血管疾病.xlsx
DATA_PATH = Path(__file__).parent.parent.joinpath("data", "心血管疾病.xlsx")
|
||
|
||
def _categorize_bmi(bmi):
    """Return the category label for a single BMI value.

    The bands are contiguous half-open intervals, so every number falls into
    exactly one of them:

        bmi < 18.5        -> "偏瘦"
        18.5 <= bmi < 25  -> "正常"
        25 <= bmi < 30    -> "超重"
        otherwise         -> "肥胖"

    A NaN fails every comparison and therefore ends up in the last band.
    """
    if bmi < 18.5:
        return "偏瘦"
    if bmi < 25:
        return "正常"
    if bmi < 30:
        return "超重"
    return "肥胖"


@st.cache_data(show_spinner="正在加载和清洗数据...")
def load_and_process_data():
    """Load the Excel dataset at DATA_PATH, clean it and add derived columns.

    Steps:
      1. Read the workbook and verify that all required columns exist.
      2. Add ``age_years`` (``age`` / 365.25, rounded to an integer) and
         ``bmi`` (weight / (height / 100) ** 2, rounded to 2 decimals).
      3. Drop rows where ``ap_lo >= ap_hi``, then rows whose ``ap_hi`` is
         outside [90, 250] or whose ``ap_lo`` is outside [60, 150].
      4. Add readable label columns ``cholesterol_str``, ``gluc_str``,
         ``gender_str`` and ``cardio_str``; codes without a label become NaN.
      5. Add ``bmi_category`` via ``_categorize_bmi``.

    Progress, warnings and errors are reported through Streamlit messages.

    Returns:
        pd.DataFrame: the processed frame, or an empty DataFrame when the
        file cannot be read, a required column is missing, or any other
        exception occurs during processing.
    """
    # NOTE(review): the empty frame returned on failure is an ordinary return
    # value of a cached function, so it is presumably cached as well — confirm
    # whether a failed load should be retried without clearing the cache.
    try:
        st.info(f"正在从 {DATA_PATH} 加载数据...")
        df = pd.read_excel(DATA_PATH)

        # Refuse to continue when any column used below is absent.
        required_columns = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                            'cholesterol', 'gluc', 'cardio']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            st.error(f"数据文件中缺少必要列: {missing_columns}")
            return pd.DataFrame()

        # Work on a copy so that `df` keeps the raw row count for the summary.
        df_processed = df.copy()

        # 1. Age: the raw value is divided by 365.25 (days -> years) and rounded.
        df_processed['age_years'] = (df_processed['age'] / 365.25).round().astype(int)

        # 2. BMI = weight / height_in_metres ** 2; height is divided by 100
        #    (centimetres -> metres) first.
        height_m = df_processed['height'] / 100
        df_processed['bmi'] = (df_processed['weight'] / height_m ** 2).round(2)

        # 3a. Remove records whose diastolic value is not below the systolic one.
        invalid_bp = df_processed['ap_lo'] >= df_processed['ap_hi']
        if invalid_bp.any():
            st.warning(f"删除 {invalid_bp.sum()} 条舒张压 >= 收缩压的异常记录")
            df_processed = df_processed[~invalid_bp].copy()

        # 3b. Remove extreme readings: keep ap_hi in [90, 250] and ap_lo in [60, 150].
        bp_in_range = (
            df_processed['ap_hi'].between(90, 250)
            & df_processed['ap_lo'].between(60, 150)
        )
        bp_outliers = ~bp_in_range
        if bp_outliers.any():
            st.warning(f"删除 {bp_outliers.sum()} 条血压极端异常值记录")
            df_processed = df_processed[~bp_outliers].copy()

        # 4. Readable labels for the coded columns. Cholesterol and glucose
        #    share the same three-level scale. Insertion order fixes the order
        #    in which the new columns are appended.
        level_labels = {1: "正常", 2: "高于正常", 3: "极高"}
        label_maps = {
            'cholesterol': level_labels,
            'gluc': level_labels,
            'gender': {1: "女性", 2: "男性"},
            'cardio': {0: "无心血管疾病", 1: "有心血管疾病"},
        }
        for column, labels in label_maps.items():
            df_processed[f'{column}_str'] = df_processed[column].map(labels)

        # 5. BMI category.
        df_processed['bmi_category'] = df_processed['bmi'].apply(_categorize_bmi)

        # Report how many records survived the cleaning.
        st.success(f"数据加载和清洗完成!共处理 {len(df_processed)} 条记录")
        st.info(f"原始数据: {len(df)} 条记录, 清洗后: {len(df_processed)} 条记录")

        return df_processed

    except Exception as e:
        # Boundary handler: surface the problem in the UI and hand the caller
        # an empty frame, which it treats as "load failed".
        st.error(f"数据加载失败: {str(e)}")
        return pd.DataFrame()
|
||
|
||
def create_filters(df):
    """Build the sidebar filter widgets and return the user's selections.

    Args:
        df: processed DataFrame; must contain ``age_years``, ``gender_str``,
            ``cardio_str``, ``bmi_category``, ``cholesterol_str`` and
            ``gluc_str``.

    Returns:
        dict: ``age_range`` (the slider's (low, high) pair) plus one list of
        selected values under each of ``genders``, ``cardio``,
        ``bmi_categories``, ``cholesterol`` and ``gluc``.
    """
    st.sidebar.markdown("## 🔍 数据筛选")

    # Age slider spans the full range present in the data and starts fully open.
    youngest = int(df['age_years'].min())
    oldest = int(df['age_years'].max())
    selections = {
        'age_range': st.sidebar.slider(
            "选择年龄范围:",
            min_value=youngest,
            max_value=oldest,
            value=(youngest, oldest),
            help="筛选指定年龄范围内的记录",
        )
    }

    # (result key, source column, widget label, tooltip) — listed in the order
    # the widgets appear in the sidebar.
    multiselect_specs = (
        ('genders', 'gender_str', "选择性别:", "选择要分析的性别"),
        ('cardio', 'cardio_str', "选择心血管疾病状态:", "选择要分析的心血管疾病状态"),
        ('bmi_categories', 'bmi_category', "选择BMI分类:", "选择要分析的BMI分类"),
        ('cholesterol', 'cholesterol_str', "选择胆固醇水平:", "选择要分析的胆固醇水平"),
        ('gluc', 'gluc_str', "选择血糖水平:", "选择要分析的血糖水平"),
    )
    for result_key, column, label, tooltip in multiselect_specs:
        # Every distinct value is offered and pre-selected.
        choices = df[column].unique().tolist()
        selections[result_key] = st.sidebar.multiselect(
            label,
            options=choices,
            default=choices,
            help=tooltip,
        )

    return selections
|
||
|
||
def apply_filters(df, filters):
    """Return the rows of ``df`` that satisfy every active filter.

    Args:
        df: DataFrame containing ``age_years`` and the label columns
            ``gender_str``, ``cardio_str``, ``bmi_category``,
            ``cholesterol_str`` and ``gluc_str`` (only the columns of active
            filters are actually read).
        filters: dict as produced by ``create_filters``. ``age_range`` is an
            inclusive (low, high) pair; ``genders``, ``cardio``,
            ``bmi_categories``, ``cholesterol`` and ``gluc`` are lists of
            accepted labels. An empty list, or a missing key, means
            "no restriction" for that field.

    Returns:
        pd.DataFrame: a new frame with the matching rows; ``df`` itself is
        not modified and the original index labels are kept.
    """
    # One boolean mask is accumulated and applied once, instead of copying
    # the whole frame and materialising an intermediate frame per filter.
    keep = pd.Series(True, index=df.index, dtype=bool)

    age_range = filters.get('age_range')
    if age_range is not None:
        # Both bounds are inclusive.
        keep &= df['age_years'].between(age_range[0], age_range[1])

    # filter key -> column holding the labels that key restricts
    label_filters = (
        ('genders', 'gender_str'),
        ('cardio', 'cardio_str'),
        ('bmi_categories', 'bmi_category'),
        ('cholesterol', 'cholesterol_str'),
        ('gluc', 'gluc_str'),
    )
    for key, column in label_filters:
        accepted = filters.get(key)
        if accepted:
            keep &= df[column].isin(accepted)

    return df[keep]
|
||
|
||
def display_metrics(filtered_df, original_df):
    """Show four headline metrics side by side.

    The metrics are: number of filtered rows (with the difference to the
    unfiltered row count as delta), the sum of ``cardio`` divided by the row
    count as a percentage, mean ``age_years`` and mean ``bmi``. The last
    three show "N/A" when the filtered frame has no rows.

    Args:
        filtered_df: DataFrame after filtering.
        original_df: DataFrame before filtering; only its length is used.
    """
    row_count = len(filtered_df)
    has_rows = row_count > 0
    count_slot, risk_slot, age_slot, bmi_slot = st.columns(4)

    with count_slot:
        st.metric(
            label="筛选后记录数",
            value=f"{row_count:,}",
            delta=f"{row_count - len(original_df):+,}",
        )

    with risk_slot:
        if not has_rows:
            st.metric(label="心血管疾病风险率", value="N/A")
        else:
            # Share of rows counted by the `cardio` column, in percent.
            risk_rate = (filtered_df['cardio'].sum() / row_count * 100).round(2)
            st.metric(
                label="心血管疾病风险率",
                value=f"{risk_rate}%",
                help="当前筛选条件下心血管疾病患者比例",
            )

    with age_slot:
        if not has_rows:
            st.metric(label="平均年龄", value="N/A")
        else:
            avg_age = filtered_df['age_years'].mean().round(1)
            st.metric(
                label="平均年龄",
                value=f"{avg_age} 岁",
                help="当前筛选条件下的平均年龄",
            )

    with bmi_slot:
        if not has_rows:
            st.metric(label="平均BMI", value="N/A")
        else:
            avg_bmi = filtered_df['bmi'].mean().round(1)
            st.metric(
                label="平均BMI",
                value=str(avg_bmi),
                help="当前筛选条件下的平均身体质量指数",
            )
|
||
|
||
def create_visualizations(df):
    """Render the chart section of the dashboard.

    Charts, in display order:
      * histogram of ``age_years`` coloured by ``cardio_str`` (left column)
      * stacked bar of the per-``bmi_category`` share of each ``cardio_str``
        value (right column)
      * scatter of ``ap_hi`` against ``ap_lo``, coloured by ``cardio_str``
        and sized by ``bmi`` (full width)
      * pie charts of ``cholesterol_str`` and ``gluc_str`` counts (two columns)

    When ``df`` has no rows a warning is shown and nothing is drawn.

    Args:
        df: DataFrame to visualise.
    """
    if not len(df):
        st.warning("没有可用的数据进行可视化")
        return

    # Colour per disease-status label, shared by the first three charts.
    # Each chart receives its own copy of the mapping.
    status_colors = {
        "有心血管疾病": "#e63946",
        "无心血管疾病": "#457b9d",
    }

    st.markdown("## 📊 数据可视化分析")

    age_panel, bmi_panel = st.columns(2)

    # Chart 1: age distribution split by disease status.
    with age_panel:
        st.markdown("### 年龄分布分析")
        age_labels = {
            'age_years': '年龄(岁)',
            'cardio_str': '心血管疾病状态',
            'count': '人数',
        }
        fig_age = px.histogram(
            df,
            x='age_years',
            color='cardio_str',
            nbins=30,
            barmode='overlay',
            opacity=0.7,
            labels=age_labels,
            title="年龄分布与心血管疾病关系",
            color_discrete_map=dict(status_colors),
        )
        fig_age.update_layout(legend_title="疾病状态", hovermode='x unified')
        st.plotly_chart(fig_age, use_container_width=True)

    # Chart 2: share of each disease status within every BMI category.
    with bmi_panel:
        st.markdown("### BMI分类与心血管疾病关系")
        # Row-normalised cross table: each BMI category's row sums to 1.
        share_table = pd.crosstab(
            df['bmi_category'],
            df['cardio_str'],
            normalize='index',
        )
        share_table = share_table.reset_index()
        # Long format: one row per (BMI category, disease status) pair.
        share_long = share_table.melt(
            id_vars='bmi_category',
            var_name='cardio_status',
            value_name='proportion',
        )
        bmi_labels = {
            'bmi_category': 'BMI分类',
            'proportion': '比例',
            'cardio_status': '心血管疾病状态',
        }
        fig_bmi = px.bar(
            share_long,
            x='bmi_category',
            y='proportion',
            color='cardio_status',
            barmode='stack',
            labels=bmi_labels,
            title="BMI分类对心血管疾病的影响",
            color_discrete_map=dict(status_colors),
        )
        fig_bmi.update_layout(yaxis_tickformat='.1%', legend_title="疾病状态")
        st.plotly_chart(fig_bmi, use_container_width=True)

    # Chart 3: systolic against diastolic pressure, marker size driven by BMI.
    st.markdown("### 血压关系分析")
    bp_labels = {
        'ap_hi': '收缩压 (mmHg)',
        'ap_lo': '舒张压 (mmHg)',
        'cardio_str': '心血管疾病状态',
        'bmi': 'BMI',
    }
    fig_bp = px.scatter(
        df,
        x='ap_hi',
        y='ap_lo',
        color='cardio_str',
        size='bmi',
        hover_data=['age_years', 'gender_str', 'cholesterol_str'],
        labels=bp_labels,
        title="血压关系散点图",
        color_discrete_map=dict(status_colors),
    )
    fig_bp.update_layout(legend_title="疾病状态")
    st.plotly_chart(fig_bp, use_container_width=True)

    # Chart 4: cholesterol and glucose level distributions as pie charts.
    # (heading / title text, source column, name of the level column, palette)
    pie_specs = (
        ("胆固醇水平分布", 'cholesterol_str', 'cholesterol_level',
         px.colors.sequential.RdBu),
        ("血糖水平分布", 'gluc_str', 'gluc_level',
         px.colors.sequential.Blues),
    )
    for panel, spec in zip(st.columns(2), pie_specs):
        caption, column, level_column, palette = spec
        with panel:
            st.markdown(f"### {caption}")
            level_counts = df[column].value_counts().reset_index()
            level_counts.columns = [level_column, 'count']
            fig_pie = px.pie(
                level_counts,
                values='count',
                names=level_column,
                title=caption,
                color_discrete_sequence=palette,
            )
            fig_pie.update_traces(textposition='inside', textinfo='percent+label')
            st.plotly_chart(fig_pie, use_container_width=True)
|
||
|
||
def display_data_preview(df):
    """Render the data-preview section: a summary and a column-selectable table.

    The first expander shows the frame's shape, the dtype of every column and
    the columns that contain missing values. The second expander lets the
    user pick columns and shows at most the first 100 rows of them.

    Args:
        df: DataFrame to preview.
    """
    st.markdown("## 📋 数据预览")

    # Summary: shape and dtypes on the left, missing-value counts on the right.
    with st.expander("数据摘要", expanded=False):
        col1, col2 = st.columns(2)
        with col1:
            st.write("**数据形状:**", df.shape)
            st.write("**数据类型:**")
            st.write(df.dtypes.astype(str).reset_index().rename(columns={0: '类型', 'index': '列名'}))

        with col2:
            st.write("**缺失值统计:**")
            missing = df.isnull().sum().reset_index()
            missing.columns = ['列名', '缺失值数量']
            # Only list columns that actually have missing values.
            missing = missing[missing['缺失值数量'] > 0]
            if len(missing) > 0:
                st.write(missing)
            else:
                st.write("无缺失值")

    # Table view of the data.
    with st.expander("查看原始数据", expanded=False):
        available_columns = df.columns.tolist()
        preferred_columns = ['age_years', 'gender_str', 'bmi', 'bmi_category',
                             'ap_hi', 'ap_lo', 'cholesterol_str', 'gluc_str', 'cardio_str']
        # Pre-select only the preferred columns that exist in this frame, so
        # the widget is never handed a default that is not among its options.
        default_columns = [col for col in preferred_columns if col in available_columns]

        selected_columns = st.multiselect(
            "选择要显示的列:",
            options=available_columns,
            default=default_columns
        )

        if selected_columns:
            preview_rows = df[selected_columns].head(100)
            st.dataframe(preview_rows, use_container_width=True)
            # Report the number of rows really shown; it is below 100 for
            # small selections.
            st.caption(f"显示前 {len(preview_rows)} 行(共 {len(df)} 行)")
        else:
            st.info("请选择要显示的列")
|
||
|
||
def main():
    """Run the dashboard page from top to bottom.

    Draws the title, loads the data (stopping with an error message when the
    loader returns an empty frame), builds the sidebar filters, applies them,
    then renders the key metrics, the data preview, the charts and finally
    the two help boxes in the sidebar.
    """
    # Page title, styled by the .main-header CSS class.
    st.markdown(
        '<h1 class="main-header">❤️ CardioAI - 心血管疾病智能分析仪表板</h1>',
        unsafe_allow_html=True,
    )
    st.markdown("---")

    with st.spinner("正在加载数据,请稍候..."):
        dataset = load_and_process_data()

    # The loader signals failure by returning an empty frame.
    if dataset.empty:
        st.error("数据加载失败,请检查数据文件路径和格式")
        return

    selection = create_filters(dataset)
    subset = apply_filters(dataset, selection)

    st.markdown("## 📈 关键指标")
    display_metrics(subset, dataset)

    display_data_preview(subset)

    create_visualizations(subset)

    # Sidebar help text: how to use the page, then what the fields mean.
    usage_notes = (
        "\n"
        "1. 使用左侧筛选器选择要分析的数据子集\n"
        "2. 查看上方的关键指标了解数据概况\n"
        "3. 探索下方的可视化图表分析趋势和关系\n"
        "4. 点击数据预览查看详细数据\n"
    )
    field_notes = (
        "\n"
        "- **年龄**: 原始数据为天数,已转换为岁数\n"
        "- **BMI**: 身体质量指数,计算公式:体重(kg)/身高(m)²\n"
        "- **血压**: 收缩压(ap_hi)和舒张压(ap_lo)\n"
        "- **胆固醇/血糖**: 1=正常, 2=高于正常, 3=极高\n"
        "- **心血管疾病**: 0=无, 1=有\n"
    )

    st.sidebar.markdown("---")
    st.sidebar.markdown("## ℹ️ 使用说明")
    st.sidebar.info(usage_notes)

    st.sidebar.markdown("## 📊 数据说明")
    st.sidebar.info(field_notes)
|
||
|
||
# Script entry point: build the page when the file is executed directly
# (which is how `streamlit run` executes it).
if __name__ == "__main__":
    main()