Files
itcast_01/module1_dashboard/cardio_dashboard.py
Zane Xu 3a3394da49 添加Module1数据可视化仪表板和心血管疾病数据
1. 创建module1_dashboard/cardio_dashboard.py
   - Streamlit应用程序,提供交互式数据可视化
   - 包含数据清洗、特征工程、异常值处理
   - 集成年龄转换、BMI计算、类别转换
   - 侧边栏筛选器、Plotly图表展示
   - 智能路径解析,支持多种运行环境

2. 添加data/心血管疾病.xlsx数据文件
   - 心血管疾病原始数据集
   - 用于模型训练和可视化分析

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 17:59:10 +08:00

395 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/opt/anaconda3/envs/cardioenv/bin/python
"""
CardioAI - cardiovascular disease data visualization dashboard.

Streamlit application that cleans the cardiovascular disease dataset, derives
extra features from it and renders interactive visualizations.
"""
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import os
# Suppress every Python warning for the lifetime of the process.
warnings.filterwarnings('ignore')
# Page configuration: browser-tab title and icon, wide layout, and the
# sidebar expanded when the page first loads.
st.set_page_config(
page_title="CardioAI - 心血管疾病智能分析",
page_icon="❤️",
layout="wide",
initial_sidebar_state="expanded"
)
# Path of the dataset workbook, written relative to the working directory.
DATA_PATH = "./data/心血管疾病.xlsx"
def _engineer_features(raw_df):
    """Return a cleaned copy of the raw dataset with derived columns added.

    Added columns: ``age_years``, ``bmi``, ``cholesterol_str``, ``gluc_str``,
    ``bmi_category`` and ``bp_category``. Rows whose blood pressure readings
    are inconsistent or outside the accepted ranges are removed.
    """
    df = raw_df.copy()

    # 1. Feature engineering
    # ``age`` is divided by 365.25 (days -> years) and rounded to whole years.
    df['age_years'] = (df['age'] / 365.25).round().astype(int)
    # BMI = weight / (height / 100) ** 2. Non-positive heights become NaN so
    # they cannot produce an infinite BMI that would later be binned as obese.
    height_m = df['height'].where(df['height'] > 0) / 100
    df['bmi'] = df['weight'] / height_m ** 2

    # 2. Outlier handling
    # Keep rows where diastolic < systolic, systolic is within [90, 250] and
    # diastolic is within [60, 150] (both bounds inclusive).
    plausible_bp = (
        (df['ap_lo'] < df['ap_hi'])
        & df['ap_hi'].between(90, 250)
        & df['ap_lo'].between(60, 150)
    )
    df = df[plausible_bp].copy()

    # 3. Categorical conversions
    # Cholesterol and glucose share the same 1/2/3 -> label mapping.
    level_labels = {1: '正常', 2: '高于正常', 3: '很高'}
    df['cholesterol_str'] = df['cholesterol'].map(level_labels)
    df['gluc_str'] = df['gluc'].map(level_labels)

    # Left-closed bins reproduce the thresholds <18.5, [18.5, 25), [25, 30)
    # and >=30. A missing BMI stays missing instead of falling into the last
    # bin; the cast keeps plain string labels rather than a Categorical.
    df['bmi_category'] = pd.cut(
        df['bmi'],
        bins=[-np.inf, 18.5, 25, 30, np.inf],
        labels=['偏瘦', '正常', '超重', '肥胖'],
        right=False,
    ).astype(object)

    # Blood-pressure class derived from the systolic reading.
    df['bp_category'] = pd.cut(
        df['ap_hi'],
        bins=[0, 120, 140, 160, 180, 250],
        labels=['正常', '偏高', '高血压1级', '高血压2级', '高血压3级'],
    )
    return df


@st.cache_data(ttl=3600)
def _read_and_process(data_path):
    """Read the workbook at *data_path* and return the processed DataFrame.

    Only this step is cached. Any failure propagates as an exception, so an
    unsuccessful load is never stored in the cache.
    """
    return _engineer_features(pd.read_excel(data_path))


def load_and_process_data():
    """Locate, load and process the cardiovascular disease dataset.

    Several candidate locations are probed so the app works regardless of the
    directory it is launched from. Diagnostic messages about the lookup are
    written to the sidebar.

    Returns:
        The processed DataFrame, or an empty DataFrame when the file cannot
        be found or loading fails (the error is reported in the UI).
    """
    try:
        # Debug information about where the file is being looked for.
        current_dir = os.getcwd()
        script_dir = os.path.dirname(os.path.abspath(__file__))
        st.sidebar.info(f"当前工作目录: {current_dir}")
        st.sidebar.info(f"脚本目录: {script_dir}")
        st.sidebar.info(f"数据路径: {DATA_PATH}")

        # Candidate locations, tried in order.
        possible_paths = [
            os.path.abspath(DATA_PATH),  # absolute form of the configured path
            DATA_PATH,  # configured relative path as-is
            os.path.join(current_dir, DATA_PATH),
            os.path.join(current_dir, "data", "心血管疾病.xlsx"),
            os.path.join(script_dir, "..", DATA_PATH),
            os.path.join(script_dir, "..", "data", "心血管疾病.xlsx"),
            os.path.join(script_dir, "data", "心血管疾病.xlsx"),
        ]
        data_path = next(
            (path for path in possible_paths if os.path.exists(path)),
            None,
        )
        if data_path is None:
            st.sidebar.error("未找到数据文件,请检查路径")
            st.sidebar.info(f"尝试过的路径: {possible_paths}")
            return pd.DataFrame()
        st.sidebar.success(f"找到数据文件: {data_path}")

        return _read_and_process(data_path)
    except Exception as e:
        # UI boundary: report the problem and fall back to an empty frame,
        # which callers treat as "no data".
        st.error(f"数据加载失败: {e}")
        return pd.DataFrame()
def create_filters(df):
    """Render the sidebar filter widgets and apply the current selection.

    Args:
        df: Processed dataset; the ``age_years``, ``gender`` and ``cardio``
            columns are read.

    Returns:
        A tuple ``(filtered_df, age_range, selected_genders, selected_cardio)``
        holding the filtered copy of *df* and the raw widget values.
    """
    sidebar = st.sidebar
    sidebar.header("🔍 数据筛选")

    # Age range slider spanning the ages present in the data.
    ages = df['age_years']
    youngest, oldest = int(ages.min()), int(ages.max())
    age_range = sidebar.slider(
        "选择年龄范围",
        min_value=youngest,
        max_value=oldest,
        value=(youngest, oldest),
        step=1
    )

    # Gender filter (multi-select), everything selected by default.
    gender_labels = {1: '女性', 2: '男性'}
    gender_options = sorted(df['gender'].unique())

    def describe_gender(code):
        return gender_labels.get(code, f"性别{code}")

    selected_genders = sidebar.multiselect(
        "选择性别",
        options=gender_options,
        default=gender_options,
        format_func=describe_gender
    )

    # Disease status filter (multi-select), everything selected by default.
    cardio_labels = {0: '无疾病', 1: '有疾病'}
    cardio_options = sorted(df['cardio'].unique())

    def describe_cardio(code):
        return cardio_labels.get(code, f"状态{code}")

    selected_cardio = sidebar.multiselect(
        "选择心血管疾病状态",
        options=cardio_options,
        default=cardio_options,
        format_func=describe_cardio
    )

    # Combine the three conditions into one row mask and apply it.
    lower_bound, upper_bound = age_range[0], age_range[1]
    keep_rows = (
        ages.between(lower_bound, upper_bound)
        & df['gender'].isin(selected_genders)
        & df['cardio'].isin(selected_cardio)
    )
    filtered_df = df[keep_rows].copy()
    return filtered_df, age_range, selected_genders, selected_cardio
def _format_mean(series):
    """Format the mean of *series* with one decimal, or a dash if undefined."""
    mean_value = series.mean()
    # The mean of an empty (or all-missing) column is NaN; show a placeholder
    # instead of the literal text "nan".
    return "—" if pd.isna(mean_value) else f"{mean_value:.1f}"


def display_metrics(filtered_df, original_df):
    """Show four headline metrics for the current selection.

    Args:
        filtered_df: Rows remaining after the sidebar filters.
        original_df: Unfiltered dataset, used for the record-count delta.
    """
    total_rows = len(filtered_df)
    row_diff = total_rows - len(original_df)

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        # The delta is only shown when the filters actually removed rows.
        st.metric(
            label="总记录数",
            value=f"{total_rows:,}",
            delta=f"{row_diff:+,}" if row_diff != 0 else None
        )
    with col2:
        disease_count = filtered_df['cardio'].sum()
        # Guard against division by zero when nothing is selected.
        disease_rate = (disease_count / total_rows * 100) if total_rows > 0 else 0
        # The delta slot carries the absolute number of positive cases.
        st.metric(
            label="心血管疾病风险率",
            value=f"{disease_rate:.1f}%",
            delta=f"{disease_count:,}"
        )
    with col3:
        st.metric(
            label="平均年龄",
            value=_format_mean(filtered_df['age_years'])
        )
    with col4:
        st.metric(
            label="平均BMI",
            value=_format_mean(filtered_df['bmi'])
        )
def _bmi_cardio_proportions(filtered_df):
    """Return per-BMI-category disease shares in long format.

    The result has the columns ``bmi_category``, ``cardio``, ``proportion``
    and ``cardio_label``; proportions are normalized within each BMI category.
    """
    shares = pd.crosstab(
        filtered_df['bmi_category'],
        filtered_df['cardio'],
        normalize='index'
    )
    # When the selection contains a single disease status the crosstab has
    # only one column; add the missing one as zeros so melting cannot fail.
    shares = shares.reindex(columns=[0, 1], fill_value=0.0)
    long_df = shares.reset_index().melt(
        id_vars=['bmi_category'],
        value_vars=[0, 1],
        var_name='cardio',
        value_name='proportion'
    )
    long_df['cardio_label'] = long_df['cardio'].map({0: '无疾病', 1: '有疾病'})
    return long_df


def _render_level_pie(filtered_df, column, palette):
    """Draw a pie chart of the value shares in *column*, or an empty notice."""
    if filtered_df.empty:
        st.info("没有数据可显示")
        return
    fig = px.pie(
        filtered_df,
        names=column,
        color=column,
        color_discrete_sequence=palette
    )
    fig.update_traces(textposition='inside', textinfo='percent+label')
    st.plotly_chart(fig, use_container_width=True)


def create_visualizations(filtered_df):
    """Render the dashboard charts for the current selection.

    Draws an age histogram split by disease status, a stacked bar chart of
    disease share per BMI category, and pie charts of the cholesterol and
    glucose levels. Each chart is replaced by a notice when *filtered_df*
    is empty.
    """
    st.subheader("📊 数据分析")
    has_data = not filtered_df.empty

    # First row: two charts side by side.
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("##### 年龄分布(按心血管疾病状态)")
        if has_data:
            fig1 = px.histogram(
                filtered_df,
                x='age_years',
                color='cardio',
                nbins=30,
                barmode='overlay',
                color_discrete_map={0: '#636EFA', 1: '#EF553B'},
                labels={
                    'age_years': '年龄(岁)',
                    'cardio': '心血管疾病',
                    'count': '人数'
                },
                category_orders={'cardio': [0, 1]},
                opacity=0.7
            )
            # Horizontal legend placed above the plot, right-aligned.
            fig1.update_layout(
                legend_title_text='疾病状态',
                legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02,
                    xanchor="right",
                    x=1
                )
            )
            st.plotly_chart(fig1, use_container_width=True)
        else:
            st.info("没有数据可显示")
    with col2:
        st.markdown("##### BMI分类与心血管疾病关系")
        if has_data:
            fig2 = px.bar(
                _bmi_cardio_proportions(filtered_df),
                x='bmi_category',
                y='proportion',
                color='cardio_label',
                barmode='stack',
                color_discrete_map={'无疾病': '#00CC96', '有疾病': '#AB63FA'},
                labels={
                    'bmi_category': 'BMI分类',
                    'proportion': '比例',
                    'cardio_label': '疾病状态'
                },
                category_orders={
                    'bmi_category': ['偏瘦', '正常', '超重', '肥胖']
                }
            )
            # Show the y axis as whole percentages.
            fig2.update_layout(
                yaxis_tickformat=',.0%',
                legend_title_text='疾病状态'
            )
            st.plotly_chart(fig2, use_container_width=True)
        else:
            st.info("没有数据可显示")

    # Second row: distribution pies.
    st.subheader("🔍 详细分析")
    col3, col4 = st.columns(2)
    with col3:
        st.markdown("##### 胆固醇水平分布")
        _render_level_pie(filtered_df, 'cholesterol_str', px.colors.sequential.RdBu)
    with col4:
        st.markdown("##### 血糖水平分布")
        _render_level_pie(filtered_df, 'gluc_str', px.colors.sequential.Bluyl)
def display_data_summary(filtered_df):
    """Show descriptive statistics and the first ten rows in an expander."""
    summary_panel = st.expander("📋 数据摘要(点击展开)")
    with summary_panel:
        # Descriptive statistics rounded to two decimals.
        statistics = filtered_df.describe().round(2)
        st.dataframe(statistics, use_container_width=True)

        st.markdown("##### 数据样本")
        # Preview of the first ten rows of the selection.
        preview = filtered_df.head(10)
        st.dataframe(preview, use_container_width=True)
def main():
    """Build the dashboard page: load data, filter it, then render all views."""
    # Title and introduction.
    st.title("❤️ CardioAI - 心血管疾病智能分析系统")
    intro_text = "\n本仪表板提供心血管疾病数据的交互式可视化分析。使用侧边栏筛选器探索数据模式。\n"
    st.markdown(intro_text)

    # Load the data behind a spinner.
    with st.spinner('正在加载和处理数据...'):
        df = load_and_process_data()

    # An empty frame signals that loading failed; stop here.
    if df.empty:
        st.error("无法加载数据。请检查数据文件路径和格式。")
        return

    # Build the sidebar filters; only the filtered frame is needed here.
    filtered_df, *_ = create_filters(df)

    # Report how many records match the filters.
    st.sidebar.markdown("---")
    match_count = len(filtered_df)
    st.sidebar.markdown(f"**筛选结果:** {match_count:,} 条记录")

    # Headline metrics, charts, then the tabular summary.
    display_metrics(filtered_df, df)
    create_visualizations(filtered_df)
    display_data_summary(filtered_df)

    # Footer.
    st.markdown("---")
    st.caption("CardioAI - 心血管疾病智能辅助系统 | Module 1: 数据可视化仪表板")


if __name__ == "__main__":
    main()