#!/opt/anaconda3/envs/cardioenv/bin/python
"""CardioAI - cardiovascular disease data visualization dashboard.

Streamlit application that cleans the cardiovascular disease dataset, derives
extra features and renders interactive visualizations.
"""
import os
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')

# Page configuration (must be the first Streamlit command of the script).
st.set_page_config(
    page_title="CardioAI - 心血管疾病智能分析",
    page_icon="❤️",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Location of the dataset, relative to the working directory.
DATA_PATH = "./data/心血管疾病.xlsx"

# Display labels for the integer-coded cholesterol / glucose levels.
_LEVEL_LABELS = {1: '正常', 2: '高于正常', 3: '很高'}
# Display labels for the two values of the `cardio` column.
_CARDIO_LABELS = {0: '无疾病', 1: '有疾病'}


def _candidate_data_paths(current_dir, script_dir):
    """Return the locations probed for the dataset, in priority order."""
    return [
        os.path.abspath(DATA_PATH),  # absolute form of the configured path
        DATA_PATH,  # configured relative path as-is
        os.path.join(current_dir, DATA_PATH),
        os.path.join(current_dir, "data", "心血管疾病.xlsx"),
        os.path.join(script_dir, "..", DATA_PATH),
        os.path.join(script_dir, "..", "data", "心血管疾病.xlsx"),
        os.path.join(script_dir, "data", "心血管疾病.xlsx"),
    ]


def _categorize_bmi(bmi):
    """Map a BMI value to its category label; missing values stay missing."""
    if pd.isna(bmi):
        # Every comparison with NaN is False, so without this guard a missing
        # BMI would fall through to the last branch and be labelled obese.
        return np.nan
    if bmi < 18.5:
        return '偏瘦'
    if bmi < 25:
        return '正常'
    if bmi < 30:
        return '超重'
    return '肥胖'


def _clean_and_engineer(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Derive features, drop implausible blood-pressure rows and add labels.

    Reads the columns `age`, `weight`, `height`, `ap_hi`, `ap_lo`,
    `cholesterol` and `gluc`. The input frame is not modified.
    """
    df = raw_df.copy()

    # 1. Feature engineering.
    # `age` is divided by 365.25 and rounded, i.e. treated as a number of days.
    df['age_years'] = (df['age'] / 365.25).round().astype(int)
    # BMI = weight / (height / 100) ** 2
    df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

    # 2. Outlier handling: diastolic must be below systolic, and both must lie
    #    inside the accepted ranges (systolic 90-250, diastolic 60-150).
    plausible = (
        (df['ap_lo'] < df['ap_hi'])
        & df['ap_hi'].between(90, 250)
        & df['ap_lo'].between(60, 150)
    )
    df = df[plausible].copy()

    # 3. Human-readable category columns.
    df['cholesterol_str'] = df['cholesterol'].map(_LEVEL_LABELS)
    df['gluc_str'] = df['gluc'].map(_LEVEL_LABELS)
    df['bmi_category'] = df['bmi'].apply(_categorize_bmi)
    df['bp_category'] = pd.cut(
        df['ap_hi'],
        bins=[0, 120, 140, 160, 180, 250],
        labels=['正常', '偏高', '高血压1级', '高血压2级', '高血压3级']
    )
    return df


@st.cache_data(ttl=3600)
def _read_and_process(data_path):
    """Read the Excel file at `data_path` and return the processed frame.

    Errors propagate to the caller on purpose: only successful results are
    stored in the Streamlit cache, so a failed load is retried on the next
    rerun instead of being served from cache until the TTL expires.
    """
    return _clean_and_engineer(pd.read_excel(data_path))


def load_and_process_data():
    """Locate, load and process the cardiovascular disease dataset.

    Returns:
        The processed DataFrame, or an empty DataFrame when the file cannot
        be found or loading/processing fails (the problem is reported in the
        Streamlit UI).
    """
    try:
        # Debug information about where the file is being looked for.
        current_dir = os.getcwd()
        script_dir = os.path.dirname(os.path.abspath(__file__))
        st.sidebar.info(f"当前工作目录: {current_dir}")
        st.sidebar.info(f"脚本目录: {script_dir}")
        st.sidebar.info(f"数据路径: {DATA_PATH}")

        possible_paths = _candidate_data_paths(current_dir, script_dir)
        data_path = next(
            (path for path in possible_paths if os.path.exists(path)),
            None
        )
        if data_path is None:
            st.sidebar.error("未找到数据文件,请检查路径")
            st.sidebar.info(f"尝试过的路径: {possible_paths}")
            return pd.DataFrame()

        st.sidebar.success(f"找到数据文件: {data_path}")
        return _read_and_process(data_path)
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        st.error(f"数据加载失败: {e}")
        return pd.DataFrame()


def create_filters(df):
    """Render the sidebar filters and apply them to `df`.

    Returns:
        A tuple `(filtered_df, age_range, selected_genders, selected_cardio)`
        holding the filtered copy of `df` and the current widget selections.
    """
    st.sidebar.header("🔍 数据筛选")

    # Age range slider spanning the ages present in the data.
    min_age = int(df['age_years'].min())
    max_age = int(df['age_years'].max())
    age_range = st.sidebar.slider(
        "选择年龄范围",
        min_value=min_age,
        max_value=max_age,
        value=(min_age, max_age),
        step=1
    )

    # Gender filter (multi-select), everything selected by default.
    gender_options = sorted(df['gender'].unique())
    gender_labels = {1: '女性', 2: '男性'}
    selected_genders = st.sidebar.multiselect(
        "选择性别",
        options=gender_options,
        default=gender_options,
        format_func=lambda x: gender_labels.get(x, f"性别{x}")
    )

    # Disease status filter (multi-select), everything selected by default.
    cardio_options = sorted(df['cardio'].unique())
    selected_cardio = st.sidebar.multiselect(
        "选择心血管疾病状态",
        options=cardio_options,
        default=cardio_options,
        format_func=lambda x: _CARDIO_LABELS.get(x, f"状态{x}")
    )

    # Apply all filters at once.
    keep = (
        df['age_years'].between(age_range[0], age_range[1])
        & df['gender'].isin(selected_genders)
        & df['cardio'].isin(selected_cardio)
    )
    filtered_df = df[keep].copy()

    return filtered_df, age_range, selected_genders, selected_cardio


def _format_mean(series, suffix=""):
    """Format the mean of `series` with one decimal, or "N/A" if undefined."""
    mean_value = series.mean()
    if pd.isna(mean_value):
        # The mean of an empty selection is NaN; avoid rendering "nan".
        return "N/A"
    return f"{mean_value:.1f}{suffix}"


def display_metrics(filtered_df, original_df):
    """Show the headline metrics of the current selection in four columns.

    `original_df` is only used to compute the record-count delta.
    """
    col1, col2, col3, col4 = st.columns(4)
    n_filtered = len(filtered_df)
    n_original = len(original_df)

    with col1:
        st.metric(
            label="总记录数",
            value=f"{n_filtered:,}",
            delta=f"{n_filtered - n_original:+,}" if n_filtered != n_original else None
        )

    with col2:
        disease_count = filtered_df['cardio'].sum()
        disease_rate = (disease_count / n_filtered * 100) if n_filtered > 0 else 0
        st.metric(
            label="心血管疾病风险率",
            value=f"{disease_rate:.1f}%",
            delta=f"{disease_count:,} 例"
        )

    with col3:
        st.metric(
            label="平均年龄",
            value=_format_mean(filtered_df['age_years'], " 岁")
        )

    with col4:
        st.metric(
            label="平均BMI",
            value=_format_mean(filtered_df['bmi'])
        )


def _bmi_cardio_proportions(df):
    """Return, in long format, the share of each cardio status per BMI class.

    The result has the columns `bmi_category`, `cardio`, `proportion` and
    `cardio_label`.
    """
    shares = (
        pd.crosstab(df['bmi_category'], df['cardio'], normalize='index')
        # When the selection holds a single cardio status the crosstab has
        # only that column; without this reindex the melt below raises
        # KeyError for the missing one.
        .reindex(columns=[0, 1], fill_value=0.0)
        .reset_index()
    )
    long_df = shares.melt(
        id_vars=['bmi_category'],
        value_vars=[0, 1],
        var_name='cardio',
        value_name='proportion'
    )
    long_df['cardio_label'] = long_df['cardio'].map(_CARDIO_LABELS)
    return long_df


def create_visualizations(filtered_df):
    """Render the charts for the current selection.

    Draws an age histogram, a stacked BMI/disease bar chart and two pie
    charts; each chart is replaced by an info message when there is no data.
    """
    st.subheader("📊 数据分析")
    has_data = not filtered_df.empty

    # Two-column layout for the main charts.
    col1, col2 = st.columns(2)

    with col1:
        st.markdown("##### 年龄分布(按心血管疾病状态)")
        if has_data:
            fig1 = px.histogram(
                filtered_df,
                x='age_years',
                color='cardio',
                nbins=30,
                barmode='overlay',
                color_discrete_map={0: '#636EFA', 1: '#EF553B'},
                labels={
                    'age_years': '年龄(岁)',
                    'cardio': '心血管疾病',
                    'count': '人数'
                },
                category_orders={'cardio': [0, 1]},
                opacity=0.7
            )
            fig1.update_layout(
                legend_title_text='疾病状态',
                legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02,
                    xanchor="right",
                    x=1
                )
            )
            st.plotly_chart(fig1, use_container_width=True)
        else:
            st.info("没有数据可显示")

    with col2:
        st.markdown("##### BMI分类与心血管疾病关系")
        if has_data:
            fig2 = px.bar(
                _bmi_cardio_proportions(filtered_df),
                x='bmi_category',
                y='proportion',
                color='cardio_label',
                barmode='stack',
                color_discrete_map={'无疾病': '#00CC96', '有疾病': '#AB63FA'},
                labels={
                    'bmi_category': 'BMI分类',
                    'proportion': '比例',
                    'cardio_label': '疾病状态'
                },
                category_orders={
                    'bmi_category': ['偏瘦', '正常', '超重', '肥胖']
                }
            )
            fig2.update_layout(
                yaxis_tickformat=',.0%',
                legend_title_text='疾病状态'
            )
            st.plotly_chart(fig2, use_container_width=True)
        else:
            st.info("没有数据可显示")

    # Additional breakdowns.
    st.subheader("🔍 详细分析")

    col3, col4 = st.columns(2)

    with col3:
        st.markdown("##### 胆固醇水平分布")
        if has_data:
            fig3 = px.pie(
                filtered_df,
                names='cholesterol_str',
                color='cholesterol_str',
                color_discrete_sequence=px.colors.sequential.RdBu
            )
            fig3.update_traces(textposition='inside', textinfo='percent+label')
            st.plotly_chart(fig3, use_container_width=True)
        else:
            # Same empty-state message as the charts above.
            st.info("没有数据可显示")

    with col4:
        st.markdown("##### 血糖水平分布")
        if has_data:
            fig4 = px.pie(
                filtered_df,
                names='gluc_str',
                color='gluc_str',
                color_discrete_sequence=px.colors.sequential.Bluyl
            )
            fig4.update_traces(textposition='inside', textinfo='percent+label')
            st.plotly_chart(fig4, use_container_width=True)
        else:
            st.info("没有数据可显示")


def display_data_summary(filtered_df):
    """Show descriptive statistics and the first ten rows in an expander."""
    with st.expander("📋 数据摘要(点击展开)"):
        st.dataframe(
            filtered_df.describe().round(2),
            use_container_width=True
        )

        st.markdown("##### 数据样本")
        st.dataframe(
            filtered_df.head(10),
            use_container_width=True
        )


def main():
    """Entry point: load the data, render filters, metrics, charts, summary."""
    # Title and introduction.
    st.title("❤️ CardioAI - 心血管疾病智能分析系统")
    st.markdown("""
    本仪表板提供心血管疾病数据的交互式可视化分析。使用侧边栏筛选器探索数据模式。
    """)

    # Load the data.
    with st.spinner('正在加载和处理数据...'):
        df = load_and_process_data()

    if df.empty:
        st.error("无法加载数据。请检查数据文件路径和格式。")
        return

    # Build the filters and get the filtered data.
    filtered_df, age_range, selected_genders, selected_cardio = create_filters(df)

    # Show how many records survive the filters.
    st.sidebar.markdown("---")
    st.sidebar.markdown(f"**筛选结果:** {len(filtered_df):,} 条记录")

    # Headline metrics.
    display_metrics(filtered_df, df)

    # Charts.
    create_visualizations(filtered_df)

    # Data summary.
    display_data_summary(filtered_df)

    # Footer.
    st.markdown("---")
    st.caption("CardioAI - 心血管疾病智能辅助系统 | Module 1: 数据可视化仪表板")


if __name__ == "__main__":
    main()