feat: 添加 Streamlit 心血管疾病数据可视化仪表板

- 实现数据加载和清洗函数 (带 @st.cache_data 装饰器)
- 年龄转换: 天 -> 年
- BMI 计算和分类
- 异常值处理: 血压范围过滤
- 类别转换: cholesterol, gluc, gender, cardio
- Streamlit 交互界面: 侧边栏筛选器
- 可视化图表: 年龄分布直方图、BMI与心血管疾病堆叠柱状图

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
code_sam
2026-03-15 10:24:49 +08:00
parent d74774ef0b
commit b83677d19f

View File

@@ -0,0 +1,297 @@
"""
CardioAI 心血管疾病数据可视化仪表板
使用 Streamlit 构建交互式数据分析和可视化应用
"""
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
# ============================================
# 数据加载函数 (带缓存)
# ============================================
@st.cache_data
def load_data(file_path: str) -> pd.DataFrame:
    """Read an Excel workbook into a DataFrame.

    The function is wrapped in ``st.cache_data``, so the read goes through
    Streamlit's data cache.

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        The DataFrame produced by ``pd.read_excel(file_path)``.
    """
    return pd.read_excel(file_path)
# ============================================
# 数据清洗函数 (带缓存)
# ============================================
@st.cache_data
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw dataset and derive the columns used by the dashboard.

    Processing steps:
        1. ``age_years``: ``age`` divided by 365 and rounded to an integer
           (the source data stores age in days).
        2. ``bmi``: ``weight / (height / 100) ** 2``
           (presumably weight in kg and height in cm - confirm against the data).
        3. Outlier removal: keep only rows where ``ap_hi > ap_lo``,
           ``90 <= ap_hi <= 250`` and ``60 <= ap_lo <= 150``.
        4. Descriptive ``*_category`` columns for cholesterol, gluc, BMI,
           gender and cardio.

    Args:
        df: Raw DataFrame. It must provide the columns ``age``, ``height``,
            ``weight``, ``ap_hi``, ``ap_lo``, ``cholesterol``, ``gluc``,
            ``gender`` and ``cardio``.

    Returns:
        A new DataFrame (the input is not modified) containing the surviving
        rows plus the derived columns. Codes missing from a mapping, and BMI
        values that are missing or infinite, yield NaN in the corresponding
        category column.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    # Work on a copy so the caller's frame is left untouched.
    df_clean = df.copy()

    # 1. Age: days -> rounded whole years.
    df_clean['age_years'] = (df_clean['age'] / 365).round().astype(int)

    # 2. BMI = weight / (height / 100)^2
    df_clean['bmi'] = df_clean['weight'] / ((df_clean['height'] / 100) ** 2)

    # 3. Blood-pressure sanity filter. Rows with NaN pressures compare as
    #    False and are dropped, exactly as with chained boolean indexing.
    ap_hi = df_clean['ap_hi']
    ap_lo = df_clean['ap_lo']
    valid_bp = (
        (ap_hi > ap_lo)
        & ap_hi.between(90, 250)   # inclusive on both ends
        & ap_lo.between(60, 150)   # inclusive on both ends
    )
    # Explicit copy: the columns added below must not be written onto a
    # slice of the unfiltered frame (avoids chained-assignment warnings).
    df_clean = df_clean.loc[valid_bp].copy()

    # 4. Category columns.
    # cholesterol and gluc share the same three-level coding.
    level_map = {
        1: '正常',
        2: '高于正常',
        3: '远高于正常',
    }
    df_clean['cholesterol_category'] = df_clean['cholesterol'].map(level_map)
    df_clean['gluc_category'] = df_clean['gluc'].map(level_map)

    # BMI bands are left-closed: [.., 18.5), [18.5, 24), [24, 28), [28, ..).
    # pd.cut leaves NaN/inf BMI uncategorised instead of letting it fall
    # through to the last band as a plain if/elif/else chain would.
    df_clean['bmi_category'] = pd.cut(
        df_clean['bmi'],
        bins=[-np.inf, 18.5, 24, 28, np.inf],
        labels=['偏瘦', '正常', '超重', '肥胖'],
        right=False,
    ).astype(object)  # plain labels, not a Categorical, like the other columns

    gender_map = {1: '女性', 2: '男性'}
    df_clean['gender_category'] = df_clean['gender'].map(gender_map)

    cardio_map = {0: '无心血管疾病', 1: '有心血管疾病'}
    df_clean['cardio_category'] = df_clean['cardio'].map(cardio_map)

    return df_clean
# ============================================
# Streamlit 页面配置
# ============================================
# Page-level Streamlit settings: browser tab title/icon and wide layout.
st.set_page_config(
    layout="wide",
    page_icon="❤️",
    page_title="CardioAI 心血管疾病分析",
)
# ============================================
# 主程序
# ============================================
def main(data_path: "str | None" = None):
    """Render the CardioAI dashboard page.

    Loads the Excel dataset, cleans it, shows the sidebar filters, the
    summary metrics, the two charts and a preview of the cleaned data.

    Args:
        data_path: Path of the Excel file to load. When omitted, the
            original hard-coded location is used, so ``main()`` behaves as
            before.
    """
    st.title("❤️ CardioAI 心血管疾病数据分析")
    st.markdown("---")

    if data_path is None:
        data_path = "C:/Users/SAM/Desktop/sam_test/ai_code/aicodes/data/心血管疾病.xlsx"

    # Keep the try body to the call that can actually fail, so an error
    # elsewhere is not misreported as a loading failure.
    try:
        df = load_data(data_path)
    except Exception as e:  # top-level UI boundary: report and stop
        st.error(f"❌ 数据加载失败: {e}")
        return
    else:
        st.success(f"✅ 成功加载数据,共 {len(df)} 条记录")

    df_clean = clean_data(df)
    st.info(f"📊 数据清洗后剩余 {len(df_clean)} 条记录 (剔除了异常值)")
    st.markdown("---")

    # With no rows left, min()/max() of age_years are NaN and int(NaN)
    # would raise while building the slider.
    if df_clean.empty:
        st.warning("⚠️ 清洗后没有可用数据")
        return

    df_filtered = _apply_sidebar_filters(df_clean)
    st.markdown("---")

    _render_overview(df_filtered)
    st.markdown("---")

    _render_charts(df_filtered)
    st.markdown("---")

    # Preview of the cleaned (unfiltered) data.
    with st.expander("📋 查看清洗后的数据"):
        st.dataframe(df_clean.head(100))
        st.caption(f"显示前 100 条记录,共 {len(df_clean)}")


def _apply_sidebar_filters(df_clean: pd.DataFrame) -> pd.DataFrame:
    """Draw the sidebar filter widgets and return the rows matching them."""
    st.sidebar.header("🔍 筛选条件")

    # Age range slider spanning the ages present in the data.
    age_min = int(df_clean['age_years'].min())
    age_max = int(df_clean['age_years'].max())
    age_range = st.sidebar.slider(
        "年龄范围 (岁)",
        min_value=age_min,
        max_value=age_max,
        value=(age_min, age_max),
    )

    # Gender multiselect, everything selected by default.
    gender_options = df_clean['gender_category'].unique().tolist()
    selected_genders = st.sidebar.multiselect(
        "性别",
        options=gender_options,
        default=gender_options,
    )

    # Disease-status multiselect, everything selected by default.
    cardio_options = df_clean['cardio_category'].unique().tolist()
    selected_cardios = st.sidebar.multiselect(
        "心血管疾病状态",
        options=cardio_options,
        default=cardio_options,
    )

    keep = (
        df_clean['age_years'].between(age_range[0], age_range[1])
        & df_clean['gender_category'].isin(selected_genders)
        & df_clean['cardio_category'].isin(selected_cardios)
    )
    return df_clean[keep]


def _render_overview(df_filtered: pd.DataFrame) -> None:
    """Show record count, disease rate and mean age for the filtered rows."""
    st.header("📈 数据概览")

    total_count = len(df_filtered)
    disease_count = int((df_filtered['cardio'] == 1).sum())
    risk_rate = (disease_count / total_count * 100) if total_count > 0 else 0

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric(
            label="筛选后记录数",
            value=f"{total_count:,}",
        )
    with col2:
        st.metric(
            label="心血管疾病风险率",
            value=f"{risk_rate:.1f}%",
            delta=f"{disease_count} / {total_count}",
        )
    with col3:
        # The mean of an empty selection is NaN; show a placeholder instead
        # of the literal text "nan".
        if total_count > 0:
            avg_age_text = f"{df_filtered['age_years'].mean():.1f}"
        else:
            avg_age_text = "N/A"
        st.metric(
            label="平均年龄",
            value=avg_age_text,
        )


def _render_charts(df_filtered: pd.DataFrame) -> None:
    """Draw the age histogram and the BMI/disease stacked bar chart."""
    st.header("📊 可视化分析")

    # Nothing to plot when the filters exclude every row.
    if df_filtered.empty:
        st.warning("⚠️ 当前筛选条件下没有数据可供绘图")
        return

    # Colour scheme shared by both charts.
    status_colors = {
        '无心血管疾病': '#2ecc71',
        '有心血管疾病': '#e74c3c',
    }

    tab1, tab2 = st.tabs(["年龄分布", "BMI与心血管疾病关系"])

    with tab1:
        st.subheader("年龄分布直方图 (按心血管疾病状态区分)")
        fig_age = px.histogram(
            df_filtered,
            x='age_years',
            color='cardio_category',
            nbins=20,
            title="年龄分布 (按心血管疾病状态)",
            labels={
                'age_years': '年龄 (岁)',
                'count': '人数',
                'cardio_category': '心血管疾病状态',
            },
            color_discrete_map=status_colors,
            barmode='overlay',
        )
        fig_age.update_layout(bargap=0.1)
        st.plotly_chart(fig_age, use_container_width=True)

    with tab2:
        st.subheader("BMI分类对心血管疾病的影响")
        # Row counts per (BMI band, disease status) pair.
        bmi_cardio = (
            df_filtered
            .groupby(['bmi_category', 'cardio_category'])
            .size()
            .reset_index(name='count')
        )
        # Fixed display order of the BMI bands on the x axis.
        bmi_order = ['偏瘦', '正常', '超重', '肥胖']
        fig_bmi = px.bar(
            bmi_cardio,
            x='bmi_category',
            y='count',
            color='cardio_category',
            title="BMI分类与心血管疾病关系",
            labels={
                'bmi_category': 'BMI分类',
                'count': '人数',
                'cardio_category': '心血管疾病状态',
            },
            color_discrete_map=status_colors,
            category_orders={'bmi_category': bmi_order},
        )
        fig_bmi.update_layout(bargap=0.1)
        st.plotly_chart(fig_bmi, use_container_width=True)
# Script entry point: build the dashboard when this file is executed as the
# main module.
if __name__ == "__main__":
    main()