#!/opt/anaconda3/envs/cardioenv/bin/python
# File: module1_dashboard/cardio_dashboard.py
# Data: data/心血管疾病.xlsx (binary workbook shipped alongside this module)
"""
CardioAI - cardiovascular disease data visualisation dashboard.

Streamlit application that cleans the cardiovascular dataset, derives a few
features (age in years, BMI, BMI / blood-pressure categories) and renders
interactive charts driven by sidebar filters.
"""

import os
import warnings
from typing import List, Tuple

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')

# Page configuration (must run before any other Streamlit element is drawn).
st.set_page_config(
    page_title="CardioAI - 心血管疾病智能分析",
    page_icon="❤️",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Location of the workbook, relative to the working / script directory.
DATA_PATH = "./data/心血管疾病.xlsx"

# Columns the processing pipeline and the charts read from the workbook.
REQUIRED_COLUMNS = (
    'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
    'cholesterol', 'gluc', 'cardio',
)

# Display labels shared by the filters and the charts.
CARDIO_LABELS = {0: '无疾病', 1: '有疾病'}
GENDER_LABELS = {1: '女性', 2: '男性'}
LEVEL_LABELS = {1: '正常', 2: '高于正常', 3: '很高'}
NO_DATA_MESSAGE = "没有数据可显示"


def _candidate_data_paths() -> List[str]:
    """Return the de-duplicated list of places the workbook may live.

    The workbook is looked up relative to the current working directory, the
    parent of the script directory and the script directory itself, in that
    order.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    bases = [os.getcwd(), os.path.join(script_dir, os.pardir), script_dir]
    paths = [os.path.normpath(os.path.join(base, DATA_PATH)) for base in bases]
    # dict.fromkeys keeps the first occurrence of each path, preserving order.
    return list(dict.fromkeys(paths))


def _process_raw_data(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw records and add the derived columns used by the charts.

    Args:
        raw_df: Records as read from the workbook.

    Returns:
        A new DataFrame with invalid rows removed and the columns
        ``age_years``, ``bmi``, ``cholesterol_str``, ``gluc_str``,
        ``bmi_category`` and ``bp_category`` added.

    Raises:
        ValueError: If any column in ``REQUIRED_COLUMNS`` is missing.
    """
    missing = [col for col in REQUIRED_COLUMNS if col not in raw_df.columns]
    if missing:
        raise ValueError(f"数据文件缺少必需的列: {missing}")

    # Rows without age/height/weight cannot yield an age or a BMI, and a
    # non-positive height would produce an infinite BMI; drop them up front
    # so they are not silently mis-classified later.
    df = raw_df.dropna(subset=['age', 'height', 'weight'])
    df = df[df['height'] > 0].copy()

    # 1. Feature engineering
    # age is stored in days; convert to whole years (rounded).
    df['age_years'] = (df['age'] / 365.25).round().astype(int)
    # BMI = weight / (height / 100)^2
    df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

    # 2. Outlier handling
    # Keep only records where diastolic < systolic and both pressures are in
    # a plausible range: systolic in [90, 250], diastolic in [60, 150].
    plausible_bp = (
        (df['ap_lo'] < df['ap_hi'])
        & df['ap_hi'].between(90, 250)
        & df['ap_lo'].between(60, 150)
    )
    df = df[plausible_bp].copy()

    # 3. Categorical conversions
    df['cholesterol_str'] = df['cholesterol'].map(LEVEL_LABELS)
    df['gluc_str'] = df['gluc'].map(LEVEL_LABELS)

    # BMI classes are left-closed: <18.5, [18.5, 25), [25, 30), >=30.
    # Stored as plain strings so downstream cross-tabs only show observed
    # classes.
    df['bmi_category'] = pd.cut(
        df['bmi'],
        bins=[-np.inf, 18.5, 25, 30, np.inf],
        labels=['偏瘦', '正常', '超重', '肥胖'],
        right=False
    ).astype(object)

    # Blood-pressure grade from systolic pressure. Bins are left-closed so a
    # reading of exactly 140 / 160 / 180 falls into the grade that starts at
    # that value, and the top bin is open-ended so 250 is still classified.
    df['bp_category'] = pd.cut(
        df['ap_hi'],
        bins=[0, 120, 140, 160, 180, np.inf],
        labels=['正常', '偏高', '高血压1级', '高血压2级', '高血压3级'],
        right=False
    )

    return df


@st.cache_data(ttl=3600)
def _load_processed_data(data_path: str, file_mtime: float) -> pd.DataFrame:
    """Read and process the workbook; cached per (path, modification time).

    ``file_mtime`` is not used in the body: it is part of the cache key so an
    edited workbook is reloaded instead of served from the cache. Failures
    raise instead of returning a value, so they are never cached.
    """
    return _process_raw_data(pd.read_excel(data_path))


def load_and_process_data() -> pd.DataFrame:
    """Locate, load and process the cardiovascular dataset.

    Returns:
        The processed DataFrame, or an empty DataFrame when the workbook
        cannot be found or loading fails (the problem is reported in the UI).
    """
    # Debug information about where the file is being looked up.
    st.sidebar.info(f"当前工作目录: {os.getcwd()}")
    st.sidebar.info(f"脚本目录: {os.path.dirname(os.path.abspath(__file__))}")
    st.sidebar.info(f"数据路径: {DATA_PATH}")

    candidates = _candidate_data_paths()
    data_path = next((path for path in candidates if os.path.exists(path)), None)
    if data_path is None:
        st.sidebar.error("未找到数据文件,请检查路径")
        st.sidebar.info(f"尝试过的路径: {candidates}")
        return pd.DataFrame()
    st.sidebar.success(f"找到数据文件: {data_path}")

    # UI boundary: report any load/processing failure instead of crashing.
    try:
        return _load_processed_data(data_path, os.path.getmtime(data_path))
    except Exception as e:
        st.error(f"数据加载失败: {e}")
        return pd.DataFrame()


def create_filters(df: pd.DataFrame) -> Tuple[pd.DataFrame, Tuple[int, int], list, list]:
    """Render the sidebar filters and apply them.

    Args:
        df: Processed, non-empty dataset.

    Returns:
        ``(filtered_df, age_range, selected_genders, selected_cardio)``.
    """
    st.sidebar.header("🔍 数据筛选")

    # Age range slider
    min_age = int(df['age_years'].min())
    max_age = int(df['age_years'].max())
    if min_age < max_age:
        age_range = st.sidebar.slider(
            "选择年龄范围",
            min_value=min_age,
            max_value=max_age,
            value=(min_age, max_age),
            step=1
        )
    else:
        # A single distinct age leaves nothing to slide over; skip the widget
        # rather than hand the slider a degenerate min == max range.
        age_range = (min_age, max_age)
        st.sidebar.caption(f"年龄范围: {min_age} 岁")

    # Gender filter (multi-select); options are plain Python values.
    gender_options = sorted(df['gender'].dropna().unique().tolist())
    selected_genders = st.sidebar.multiselect(
        "选择性别",
        options=gender_options,
        default=gender_options,
        format_func=lambda x: GENDER_LABELS.get(x, f"性别{x}")
    )

    # Cardiovascular disease status filter (multi-select)
    cardio_options = sorted(df['cardio'].dropna().unique().tolist())
    selected_cardio = st.sidebar.multiselect(
        "选择心血管疾病状态",
        options=cardio_options,
        default=cardio_options,
        format_func=lambda x: CARDIO_LABELS.get(x, f"状态{x}")
    )

    # Apply all filters together.
    keep = (
        df['age_years'].between(age_range[0], age_range[1])
        & df['gender'].isin(selected_genders)
        & df['cardio'].isin(selected_cardio)
    )
    filtered_df = df[keep].copy()

    return filtered_df, age_range, selected_genders, selected_cardio


def display_metrics(filtered_df: pd.DataFrame, original_df: pd.DataFrame) -> None:
    """Show the headline metrics for the current selection.

    Args:
        filtered_df: Rows left after the sidebar filters.
        original_df: The full processed dataset, used for the row-count delta.
    """
    total = len(filtered_df)
    row_delta = total - len(original_df)
    has_rows = total > 0

    disease_count = int(filtered_df['cardio'].sum()) if has_rows else 0
    disease_rate = disease_count / total * 100 if has_rows else 0
    # Means of an empty selection are NaN; show a placeholder instead.
    avg_age_text = f"{filtered_df['age_years'].mean():.1f} 岁" if has_rows else "N/A"
    avg_bmi_text = f"{filtered_df['bmi'].mean():.1f}" if has_rows else "N/A"

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric(
            label="总记录数",
            value=f"{total:,}",
            delta=f"{row_delta:+,}" if row_delta != 0 else None
        )

    with col2:
        st.metric(
            label="心血管疾病风险率",
            value=f"{disease_rate:.1f}%",
            delta=f"{disease_count:,} 例",
            # The delta slot carries a case count, not a trend, so it must
            # not be coloured as an increase.
            delta_color="off"
        )

    with col3:
        st.metric(label="平均年龄", value=avg_age_text)

    with col4:
        st.metric(label="平均BMI", value=avg_bmi_text)


def _plot_age_histogram(filtered_df: pd.DataFrame) -> None:
    """Draw the age histogram split by cardiovascular disease status."""
    hist_df = filtered_df.assign(
        cardio_label=filtered_df['cardio'].map(CARDIO_LABELS)
    )
    fig = px.histogram(
        hist_df,
        x='age_years',
        color='cardio_label',
        nbins=30,
        barmode='overlay',
        color_discrete_map={'无疾病': '#636EFA', '有疾病': '#EF553B'},
        labels={
            'age_years': '年龄(岁)',
            'cardio_label': '心血管疾病',
            'count': '人数'
        },
        category_orders={'cardio_label': ['无疾病', '有疾病']},
        opacity=0.7
    )
    fig.update_layout(
        legend_title_text='疾病状态',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )
    st.plotly_chart(fig, use_container_width=True)


def _plot_bmi_vs_cardio(filtered_df: pd.DataFrame) -> None:
    """Draw the stacked share of disease status within each BMI class."""
    shares = pd.crosstab(
        filtered_df['bmi_category'],
        filtered_df['cardio'],
        normalize='index'
    )
    # When the selection holds a single disease status the cross-tab has only
    # one column; add the missing one as 0 so the reshape below always works.
    shares = shares.reindex(columns=[0, 1], fill_value=0.0).reset_index()

    # Long format: one row per (BMI class, disease status).
    shares_long = pd.melt(
        shares,
        id_vars=['bmi_category'],
        value_vars=[0, 1],
        var_name='cardio',
        value_name='proportion'
    )
    shares_long['cardio_label'] = shares_long['cardio'].map(CARDIO_LABELS)

    fig = px.bar(
        shares_long,
        x='bmi_category',
        y='proportion',
        color='cardio_label',
        barmode='stack',
        color_discrete_map={'无疾病': '#00CC96', '有疾病': '#AB63FA'},
        labels={
            'bmi_category': 'BMI分类',
            'proportion': '比例',
            'cardio_label': '疾病状态'
        },
        category_orders={
            'bmi_category': ['偏瘦', '正常', '超重', '肥胖']
        }
    )
    fig.update_layout(
        yaxis_tickformat=',.0%',
        legend_title_text='疾病状态'
    )
    st.plotly_chart(fig, use_container_width=True)


def _plot_level_pie(filtered_df: pd.DataFrame, column: str, palette: list) -> None:
    """Draw a pie chart of the value counts of ``column``."""
    fig = px.pie(
        filtered_df,
        names=column,
        color=column,
        color_discrete_sequence=palette
    )
    fig.update_traces(textposition='inside', textinfo='percent+label')
    st.plotly_chart(fig, use_container_width=True)


def create_visualizations(filtered_df: pd.DataFrame) -> None:
    """Render all charts for the current selection.

    Every panel shows a short notice instead of a chart when the selection is
    empty.
    """
    has_rows = not filtered_df.empty

    st.subheader("📊 数据分析")
    col1, col2 = st.columns(2)

    with col1:
        st.markdown("##### 年龄分布(按心血管疾病状态)")
        if has_rows:
            _plot_age_histogram(filtered_df)
        else:
            st.info(NO_DATA_MESSAGE)

    with col2:
        st.markdown("##### BMI分类与心血管疾病关系")
        if has_rows:
            _plot_bmi_vs_cardio(filtered_df)
        else:
            st.info(NO_DATA_MESSAGE)

    # Additional breakdowns
    st.subheader("🔍 详细分析")
    col3, col4 = st.columns(2)

    with col3:
        st.markdown("##### 胆固醇水平分布")
        if has_rows:
            _plot_level_pie(filtered_df, 'cholesterol_str', px.colors.sequential.RdBu)
        else:
            st.info(NO_DATA_MESSAGE)

    with col4:
        st.markdown("##### 血糖水平分布")
        if has_rows:
            _plot_level_pie(filtered_df, 'gluc_str', px.colors.sequential.Bluyl)
        else:
            st.info(NO_DATA_MESSAGE)


def display_data_summary(filtered_df: pd.DataFrame) -> None:
    """Show descriptive statistics and the first ten rows in an expander."""
    with st.expander("📋 数据摘要(点击展开)"):
        st.dataframe(
            filtered_df.describe().round(2),
            use_container_width=True
        )

        st.markdown("##### 数据样本")
        st.dataframe(
            filtered_df.head(10),
            use_container_width=True
        )


def main() -> None:
    """Entry point: load the data, draw the filters and render the page."""
    # Title and introduction
    st.title("❤️ CardioAI - 心血管疾病智能分析系统")
    st.markdown("""
    本仪表板提供心血管疾病数据的交互式可视化分析。使用侧边栏筛选器探索数据模式。
    """)

    # Load data
    with st.spinner('正在加载和处理数据...'):
        df = load_and_process_data()

    if df.empty:
        st.error("无法加载数据。请检查数据文件路径和格式。")
        return

    # Build the filters and get the filtered data.
    filtered_df, age_range, selected_genders, selected_cardio = create_filters(df)

    # Show how many rows match the filters.
    st.sidebar.markdown("---")
    st.sidebar.markdown(f"**筛选结果:** {len(filtered_df):,} 条记录")

    display_metrics(filtered_df, df)
    create_visualizations(filtered_df)
    display_data_summary(filtered_df)

    # Footer
    st.markdown("---")
    st.caption("CardioAI - 心血管疾病智能辅助系统 | Module 1: 数据可视化仪表板")


if __name__ == "__main__":
    main()