395 lines
12 KiB
Python
395 lines
12 KiB
Python
|
|
#!/opt/anaconda3/envs/cardioenv/bin/python
|
|||
|
|
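"""
CardioAI - cardiovascular disease data visualization dashboard.

Streamlit application that cleans the cardiovascular disease dataset, derives
additional features from it, and presents interactive visualizations.
"""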
|
|||
|
|
|
|||
|
|
import streamlit as st
|
|||
|
|
import pandas as pd
|
|||
|
|
import numpy as np
|
|||
|
|
import plotly.express as px
|
|||
|
|
import plotly.graph_objects as go
|
|||
|
|
from plotly.subplots import make_subplots
|
|||
|
|
import warnings
|
|||
|
|
import os
|
|||
|
|
|
|||
|
|
# Suppress every warning for the rest of the process.
warnings.filterwarnings(action='ignore')

# Page-level Streamlit settings (browser tab title/icon, layout, sidebar state).
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="CardioAI - 心血管疾病智能分析",
    page_icon="❤️",
)

# Location of the dataset workbook, relative to the working directory.
DATA_PATH = "./data/心血管疾病.xlsx"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _candidate_data_paths(current_dir, script_dir):
    """Return the locations probed for the workbook, in priority order.

    Args:
        current_dir: Current working directory of the process.
        script_dir: Directory that contains this script.

    Returns:
        list[str]: Candidate file paths; the first one that is a file is used.
    """
    return [
        os.path.abspath(DATA_PATH),  # absolute form of the configured path
        DATA_PATH,  # configured relative path, unchanged
        os.path.join(current_dir, DATA_PATH),
        os.path.join(current_dir, "data", "心血管疾病.xlsx"),
        os.path.join(script_dir, "..", DATA_PATH),
        os.path.join(script_dir, "..", "data", "心血管疾病.xlsx"),
        os.path.join(script_dir, "data", "心血管疾病.xlsx"),
    ]


@st.cache_data(ttl=3600)
def _read_and_prepare(data_path):
    """Read the workbook at ``data_path`` and return the cleaned, enriched frame.

    Only this step is cached. Errors are deliberately allowed to propagate so
    that a failed attempt is never stored as a cached "result".

    Args:
        data_path: Path of the Excel workbook to read.

    Returns:
        pandas.DataFrame: The input rows that pass the blood-pressure sanity
        checks, plus the derived columns ``age_years``, ``bmi``,
        ``cholesterol_str``, ``gluc_str``, ``bmi_category`` and
        ``bp_category``.

    Raises:
        Exception: Whatever ``pandas.read_excel`` or the column operations
            raise (unreadable file, missing column, non-numeric data, ...).
    """
    df = pd.read_excel(data_path)

    # 1. Feature engineering.
    # ``age`` is stored in days; convert it to whole years (rounded).
    df['age_years'] = (df['age'] / 365.25).round().astype(int)
    # BMI = weight / (height / 100) ** 2
    df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

    # 2. Outlier removal: diastolic must be below systolic, systolic must lie
    # in [90, 250] and diastolic in [60, 150] (both ranges inclusive).
    plausible = (
        (df['ap_lo'] < df['ap_hi'])
        & df['ap_hi'].between(90, 250)
        & df['ap_lo'].between(60, 150)
    )
    df = df[plausible].copy()

    # 3. Human-readable categories. Cholesterol and glucose share one coding.
    level_labels = {1: '正常', 2: '高于正常', 3: '很高'}
    df['cholesterol_str'] = df['cholesterol'].map(level_labels)
    df['gluc_str'] = df['gluc'].map(level_labels)

    # BMI classes use left-closed intervals: <18.5, [18.5, 25), [25, 30), >=30.
    # Rows whose BMI is undefined (NaN) or infinite stay unclassified instead
    # of silently falling into the top class.
    df['bmi_category'] = pd.cut(
        df['bmi'],
        bins=[-np.inf, 18.5, 25, 30, np.inf],
        right=False,
        labels=['偏瘦', '正常', '超重', '肥胖'],
    ).astype(object)

    # Blood-pressure grade from systolic pressure. Intervals are left-closed
    # so that a reading of exactly 120/140/160/180 belongs to the grade that
    # starts at that threshold rather than to the one below it.
    df['bp_category'] = pd.cut(
        df['ap_hi'],
        bins=[0, 120, 140, 160, 180, np.inf],
        right=False,
        labels=['正常', '偏高', '高血压1级', '高血压2级', '高血压3级'],
    )

    return df


def load_and_process_data():
    """Locate, load and preprocess the cardiovascular disease dataset.

    The file lookup and all UI messages run on every call; only the expensive
    read-and-transform step is cached (see ``_read_and_prepare``). A missing
    file or a failed load is therefore retried on the next rerun instead of
    being served from the cache until the TTL expires.

    Returns:
        pandas.DataFrame: The processed data, or an empty DataFrame when the
        file cannot be found or loading fails (the problem is reported in the
        Streamlit UI).
    """
    try:
        # Debug information about where the file is being searched for.
        current_dir = os.getcwd()
        script_dir = os.path.dirname(os.path.abspath(__file__))
        st.sidebar.info(f"当前工作目录: {current_dir}")
        st.sidebar.info(f"脚本目录: {script_dir}")
        st.sidebar.info(f"数据路径: {DATA_PATH}")

        possible_paths = _candidate_data_paths(current_dir, script_dir)
        # isfile (not exists) so that a directory with a matching name is skipped.
        data_path = next(
            (path for path in possible_paths if os.path.isfile(path)),
            None,
        )

        if data_path is None:
            st.sidebar.error("未找到数据文件,请检查路径")
            st.sidebar.info(f"尝试过的路径: {possible_paths}")
            return pd.DataFrame()

        st.sidebar.success(f"找到数据文件: {data_path}")
        return _read_and_prepare(data_path)

    except Exception as e:
        # Top-level UI boundary: report the problem and let the caller handle
        # the empty frame instead of crashing the page.
        st.error(f"数据加载失败: {e}")
        return pd.DataFrame()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _code_multiselect(title, codes, code_labels, fallback_prefix):
    """Render a sidebar multiselect over the distinct values of ``codes``.

    Args:
        title: Widget label.
        codes: Series whose distinct values become the options (sorted).
        code_labels: Mapping from code to display text.
        fallback_prefix: Text placed before a code that has no display label.

    Returns:
        list: The codes currently selected; all codes are selected by default.
    """
    options = sorted(codes.unique())
    return st.sidebar.multiselect(
        title,
        options=options,
        default=options,
        format_func=lambda code: code_labels.get(code, f"{fallback_prefix}{code}"),
    )


def create_filters(df):
    """Render the sidebar filters and apply them to ``df``.

    Args:
        df: Processed dataset; must contain the columns ``age_years``,
            ``gender`` and ``cardio``.

    Returns:
        tuple: ``(filtered_df, age_range, selected_genders, selected_cardio)``
        where ``filtered_df`` is a copy of the matching rows, ``age_range`` is
        the inclusive ``(low, high)`` age pair and the two lists hold the
        selected codes.
    """
    st.sidebar.header("🔍 数据筛选")

    # Age range slider.
    min_age = int(df['age_years'].min())
    max_age = int(df['age_years'].max())
    if min_age < max_age:
        age_range = st.sidebar.slider(
            "选择年龄范围",
            min_value=min_age,
            max_value=max_age,
            value=(min_age, max_age),
            step=1,
        )
    else:
        # A single distinct age leaves nothing to choose, and a slider whose
        # bounds coincide is not a usable widget, so skip it.
        age_range = (min_age, max_age)

    # Gender and disease-status filters (multi-select, everything selected).
    selected_genders = _code_multiselect(
        "选择性别", df['gender'], {1: '女性', 2: '男性'}, "性别"
    )
    selected_cardio = _code_multiselect(
        "选择心血管疾病状态", df['cardio'], {0: '无疾病', 1: '有疾病'}, "状态"
    )

    # Apply all filters at once.
    keep = (
        df['age_years'].between(age_range[0], age_range[1])
        & df['gender'].isin(selected_genders)
        & df['cardio'].isin(selected_cardio)
    )
    filtered_df = df[keep].copy()

    return filtered_df, age_range, selected_genders, selected_cardio
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _format_mean(values, suffix=""):
    """Format the mean of ``values`` with one decimal, or "N/A" if undefined."""
    mean = values.mean()
    if pd.isna(mean):
        # Empty selection (or no usable values): avoid rendering "nan".
        return "N/A"
    return f"{mean:.1f}{suffix}"


def display_metrics(filtered_df, original_df):
    """Show the four headline metric tiles for the current selection.

    Args:
        filtered_df: Rows remaining after the sidebar filters; must contain
            the columns ``cardio``, ``age_years`` and ``bmi``.
        original_df: The unfiltered dataset, used only for the record-count
            delta.

    Returns:
        None. The metrics are rendered into the Streamlit page.
    """
    record_count = len(filtered_df)
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        # The delta is shown only when the filters actually removed rows.
        count_diff = record_count - len(original_df)
        st.metric(
            label="总记录数",
            value=f"{record_count:,}",
            delta=f"{count_diff:+,}" if count_diff != 0 else None,
        )

    with col2:
        disease_count = filtered_df['cardio'].sum()
        # Guard against division by zero for an empty selection.
        disease_rate = (disease_count / record_count * 100) if record_count > 0 else 0
        st.metric(
            label="心血管疾病风险率",
            value=f"{disease_rate:.1f}%",
            delta=f"{disease_count:,} 例",
        )

    with col3:
        st.metric(
            label="平均年龄",
            value=_format_mean(filtered_df['age_years'], " 岁"),
        )

    with col4:
        st.metric(
            label="平均BMI",
            value=_format_mean(filtered_df['bmi']),
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _age_histogram(filtered_df):
    """Build the overlaid age histogram, one trace per disease status."""
    fig = px.histogram(
        filtered_df,
        x='age_years',
        color='cardio',
        nbins=30,
        barmode='overlay',
        color_discrete_map={0: '#636EFA', 1: '#EF553B'},
        labels={
            'age_years': '年龄(岁)',
            'cardio': '心血管疾病',
            'count': '人数',
        },
        category_orders={'cardio': [0, 1]},
        opacity=0.7,
    )
    fig.update_layout(
        legend_title_text='疾病状态',
        # Horizontal legend placed just above the plot, right-aligned.
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
        ),
    )
    return fig


def _bmi_cardio_bar(filtered_df):
    """Build the stacked bar of disease-status share within each BMI class."""
    # Row-normalised cross table: share of each status inside a BMI class.
    # Reindexing guarantees both status columns exist even when the current
    # filter leaves only one status in the data; otherwise the melt below
    # fails on the missing column.
    bmi_cardio_cross = (
        pd.crosstab(
            filtered_df['bmi_category'],
            filtered_df['cardio'],
            normalize='index',
        )
        .reindex(columns=[0, 1], fill_value=0.0)
        .reset_index()
    )

    # Long format: one row per (BMI class, status) pair.
    bmi_cardio_long = pd.melt(
        bmi_cardio_cross,
        id_vars=['bmi_category'],
        value_vars=[0, 1],
        var_name='cardio',
        value_name='proportion',
    )
    bmi_cardio_long['cardio_label'] = bmi_cardio_long['cardio'].map(
        {0: '无疾病', 1: '有疾病'}
    )

    fig = px.bar(
        bmi_cardio_long,
        x='bmi_category',
        y='proportion',
        color='cardio_label',
        barmode='stack',
        color_discrete_map={'无疾病': '#00CC96', '有疾病': '#AB63FA'},
        labels={
            'bmi_category': 'BMI分类',
            'proportion': '比例',
            'cardio_label': '疾病状态',
        },
        category_orders={
            'bmi_category': ['偏瘦', '正常', '超重', '肥胖'],
        },
    )
    fig.update_layout(
        yaxis_tickformat=',.0%',
        legend_title_text='疾病状态',
    )
    return fig


def _level_pie(filtered_df, column, palette):
    """Build a pie chart of the value counts of ``column``."""
    fig = px.pie(
        filtered_df,
        names=column,
        color=column,
        color_discrete_sequence=palette,
    )
    fig.update_traces(textposition='inside', textinfo='percent+label')
    return fig


def _render_chart(filtered_df, build_figure):
    """Render ``build_figure(filtered_df)``, or a notice when there is no data."""
    if filtered_df.empty:
        st.info("没有数据可显示")
        return
    st.plotly_chart(build_figure(filtered_df), use_container_width=True)


def create_visualizations(filtered_df):
    """Render the four analysis charts for the current selection.

    Layout: two rows of two columns. First row: age histogram by disease
    status and disease share per BMI class. Second row: cholesterol and
    glucose level pies. Every chart is replaced by an informational notice
    when ``filtered_df`` is empty.

    Args:
        filtered_df: Rows remaining after the sidebar filters; must contain
            ``age_years``, ``cardio``, ``bmi_category``, ``cholesterol_str``
            and ``gluc_str``.

    Returns:
        None. The charts are rendered into the Streamlit page.
    """
    st.subheader("📊 数据分析")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("##### 年龄分布(按心血管疾病状态)")
        _render_chart(filtered_df, _age_histogram)

    with col2:
        st.markdown("##### BMI分类与心血管疾病关系")
        _render_chart(filtered_df, _bmi_cardio_bar)

    # Additional breakdowns.
    st.subheader("🔍 详细分析")

    col3, col4 = st.columns(2)

    with col3:
        st.markdown("##### 胆固醇水平分布")
        _render_chart(
            filtered_df,
            lambda data: _level_pie(
                data, 'cholesterol_str', px.colors.sequential.RdBu
            ),
        )

    with col4:
        st.markdown("##### 血糖水平分布")
        _render_chart(
            filtered_df,
            lambda data: _level_pie(
                data, 'gluc_str', px.colors.sequential.Bluyl
            ),
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def display_data_summary(filtered_df):
    """Render a collapsible panel with summary statistics and a row preview.

    Args:
        filtered_df: Rows remaining after the sidebar filters.

    Returns:
        None. The tables are rendered into the Streamlit page.
    """
    summary_panel = st.expander("📋 数据摘要(点击展开)")
    with summary_panel:
        # Descriptive statistics, rounded to two decimals.
        stats_table = filtered_df.describe().round(2)
        st.dataframe(stats_table, use_container_width=True)

        # First ten rows of the current selection.
        st.markdown("##### 数据样本")
        preview_rows = filtered_df.head(10)
        st.dataframe(preview_rows, use_container_width=True)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Assemble the dashboard page.

    Loads the dataset, renders the sidebar filters, then shows the metric
    tiles, the charts and the data summary for the filtered rows. Stops after
    an error message when no data could be loaded.
    """
    # Title and introduction.
    st.title("❤️ CardioAI - 心血管疾病智能分析系统")
    st.markdown("""
    本仪表板提供心血管疾病数据的交互式可视化分析。使用侧边栏筛选器探索数据模式。
    """)

    # Load the dataset behind a progress spinner.
    with st.spinner('正在加载和处理数据...'):
        dataset = load_and_process_data()

    # Nothing to show without data.
    if dataset.empty:
        st.error("无法加载数据。请检查数据文件路径和格式。")
        return

    # Sidebar filters; only the filtered rows are needed here.
    selection, *_ = create_filters(dataset)

    # Size of the current selection, shown under the filters.
    sidebar = st.sidebar
    sidebar.markdown("---")
    sidebar.markdown(f"**筛选结果:** {len(selection):,} 条记录")

    # Headline metrics, then charts, then the data summary.
    display_metrics(selection, dataset)
    for render_section in (create_visualizations, display_data_summary):
        render_section(selection)

    # Footer.
    st.markdown("---")
    st.caption("CardioAI - 心血管疾病智能辅助系统 | Module 1: 数据可视化仪表板")


if __name__ == "__main__":
    main()
|