r"""Cardiovascular disease data dashboard (aiheart/cardio_dashboard.py).

Streamlit application that cleans the cardiovascular dataset, derives a few
features (age in years, BMI, human-readable category labels) and renders
interactive charts for it.

Launch from a terminal with:

    streamlit run D:\AI_Class\PyCharm\Work_Space\CardAI\module1_dashboard\cardio_dashboard.py

The docstring is a raw string on purpose: the Windows path above contains
backslashes that would otherwise be parsed as (invalid) escape sequences.
"""

import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st

# Kept from the original script: silences every warning process-wide.
warnings.filterwarnings('ignore')

# Page configuration.
st.set_page_config(
    page_title="心血管疾病数据分析仪表板",
    page_icon="❤️",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Constants - the dataset is addressed through an absolute path.
DATA_PATH = Path("D:/AI_Class/PyCharm/Work_Space/CardAI/data/心血管疾病.xlsx")

# BMI categories (Chinese standard); each range is [low, high).
BMI_CATEGORIES = {
    "偏瘦": (0, 18.5),
    "正常": (18.5, 24),
    "超重": (24, 28),
    "肥胖": (28, float('inf'))
}
CHOLESTEROL_MAP = {1: "正常", 2: "高于正常", 3: "远高于正常"}
GLUC_MAP = {1: "正常", 2: "高于正常", 3: "远高于正常"}
GENDER_MAP = {1: "女性", 2: "男性"}
CARDIO_MAP = {0: "无心血管疾病", 1: "有心血管疾病"}

# Colours shared by both charts so a status always has the same colour.
CARDIO_COLORS = {"有心血管疾病": "#EF553B", "无心血管疾病": "#636EFA"}

# Session-state keys of the sidebar widgets; the reset button clears them.
AGE_FILTER_KEY = "age_range_filter"
GENDER_FILTER_KEY = "gender_filter"
CARDIO_FILTER_KEY = "cardio_filter"
FILTER_KEYS = (AGE_FILTER_KEY, GENDER_FILTER_KEY, CARDIO_FILTER_KEY)


def _categorize_bmi(bmi):
    """Return the BMI_CATEGORIES label whose [low, high) range holds *bmi*.

    Values that fall in no range (NaN, negative, infinite) yield "未知".
    """
    for category, (low, high) in BMI_CATEGORIES.items():
        if low <= bmi < high:
            return category
    return "未知"


@st.cache_data
def _read_and_clean(file_path):
    """Read the Excel file and return the cleaned, feature-enriched frame.

    Exceptions propagate to the caller on purpose: ``st.cache_data`` only
    stores return values, so a failed load is retried on the next rerun
    instead of a cached empty frame being served forever.
    """
    raw = pd.read_excel(file_path)

    # Outlier handling: diastolic must be below systolic, systolic must be
    # within [90, 250] and diastolic within [60, 150].
    plausible = (
        (raw['ap_lo'] < raw['ap_hi'])
        & raw['ap_hi'].between(90, 250)
        & raw['ap_lo'].between(60, 150)
    )
    # Explicit copy so the column assignments below never write to a view.
    df = raw[plausible].copy()

    # Feature engineering: age is stored in days; BMI = kg / m^2 with the
    # height given in centimetres.
    df['age_years'] = (df['age'] / 365.25).round().astype(int)
    df['BMI'] = df['weight'] / ((df['height'] / 100) ** 2)

    # Human-readable labels for the coded columns.
    df['cholesterol_str'] = df['cholesterol'].map(CHOLESTEROL_MAP)
    df['gluc_str'] = df['gluc'].map(GLUC_MAP)
    df['gender_str'] = df['gender'].map(GENDER_MAP)
    df['cardio_str'] = df['cardio'].map(CARDIO_MAP)
    df['bmi_category'] = df['BMI'].apply(_categorize_bmi)

    return df.reset_index(drop=True)


def load_and_clean_data(file_path):
    """Load and clean the cardiovascular dataset.

    Args:
        file_path: Path of the Excel file.

    Returns:
        pandas.DataFrame: The cleaned data, or an empty frame when loading
        failed (an error message is shown in the app in that case).
    """
    try:
        return _read_and_clean(file_path)
    except FileNotFoundError:
        st.error(f"数据文件未找到: {file_path}")
    except Exception as e:
        # Top-level boundary of the app: report the problem, do not crash.
        st.error(f"数据加载失败: {str(e)}")
    return pd.DataFrame()


@st.cache_data
def filter_data(df, age_range, gender_filter, cardio_filter):
    """Return the rows of *df* that match the sidebar filters.

    Args:
        df: Cleaned data frame.
        age_range: Inclusive age range ``(min, max)`` in years.
        gender_filter: Selected gender labels; "全部" disables the filter.
        cardio_filter: Selected disease-status labels; "全部" disables the
            filter.

    Returns:
        pandas.DataFrame: The matching rows (a new frame).
    """
    mask = df['age_years'].between(age_range[0], age_range[1])

    if "全部" not in gender_filter:
        gender_codes = [
            code for code, label in GENDER_MAP.items() if label in gender_filter
        ]
        mask &= df['gender'].isin(gender_codes)

    if "全部" not in cardio_filter:
        cardio_codes = [
            code for code, label in CARDIO_MAP.items() if label in cardio_filter
        ]
        mask &= df['cardio'].isin(cardio_codes)

    return df[mask]


@st.cache_data
def convert_df_to_csv(df):
    """Serialise *df* to UTF-8 encoded CSV bytes (index column omitted)."""
    return df.to_csv(index=False).encode('utf-8')


def create_age_distribution_chart(df) -> go.Figure:
    """Build the age histogram, coloured by disease status.

    Args:
        df: Data frame with ``age_years`` and ``cardio_str`` columns.

    Returns:
        plotly.graph_objects.Figure: The age distribution chart.
    """
    fig = px.histogram(
        df,
        x='age_years',
        color='cardio_str',
        nbins=30,
        title='年龄分布(按心血管疾病状态)',
        labels={'age_years': '年龄(岁)', 'count': '人数', 'cardio_str': '心血管疾病状态'},
        color_discrete_map=CARDIO_COLORS,
        opacity=0.7
    )
    fig.update_layout(
        bargap=0.1,
        xaxis_title="年龄(岁)",
        yaxis_title="人数",
        legend_title="心血管疾病状态",
        hovermode='x unified'
    )
    return fig


def create_bmi_cardio_chart(df) -> go.Figure:
    """Build the stacked bar chart of disease share per BMI category.

    Args:
        df: Data frame with ``bmi_category`` and ``cardio_str`` columns.

    Returns:
        plotly.graph_objects.Figure: The BMI category chart.
    """
    # Row-normalised crosstab: share of each status within a BMI category.
    shares = pd.crosstab(
        df['bmi_category'],
        df['cardio_str'],
        normalize='index'
    ).reset_index()

    # Long format, one row per (category, status) pair, as px.bar expects.
    shares_long = shares.melt(
        id_vars='bmi_category',
        var_name='cardio_status',
        value_name='percentage'
    )

    fig = px.bar(
        shares_long,
        x='bmi_category',
        y='percentage',
        color='cardio_status',
        title='BMI分类对心血管疾病的影响',
        labels={
            'bmi_category': 'BMI分类',
            'percentage': '比例',
            'cardio_status': '心血管疾病状态'
        },
        color_discrete_map=CARDIO_COLORS,
        # Show the bars from underweight to obese instead of in the
        # crosstab's label sort order.
        category_orders={'bmi_category': list(BMI_CATEGORIES)},
        text_auto='.1%'
    )
    fig.update_layout(
        xaxis_title="BMI分类",
        yaxis_title="比例",
        legend_title="心血管疾病状态",
        yaxis_tickformat=',.0%',
        hovermode='x unified'
    )
    return fig


def display_summary_stats(df):
    """Render record count, case count and case rate as three metrics.

    Args:
        df: Data frame with a 0/1 ``cardio`` column.
    """
    total_records = len(df)
    cardio_cases = int(df['cardio'].sum())
    # Guard against division by zero for an empty selection.
    cardio_rate = (cardio_cases / total_records * 100) if total_records > 0 else 0

    metrics = (
        ("总记录数", f"{total_records:,}"),
        ("心血管疾病病例数", f"{cardio_cases:,}"),
        ("心血管疾病风险率", f"{cardio_rate:.2f}%"),
    )
    for column, (label, value) in zip(st.columns(3), metrics):
        with column:
            st.metric(label=label, value=value)


def _reset_filters():
    """Button callback: drop the stored filter values.

    Removing the keys makes each sidebar widget fall back to its default on
    the rerun that follows the click; a bare rerun would keep the current
    selections.
    """
    for key in FILTER_KEYS:
        if key in st.session_state:
            del st.session_state[key]


def main():
    """Entry point of the Streamlit application."""
    # Title
    st.title("❤️ 心血管疾病数据分析仪表板")
    st.markdown("---")

    # Load data
    with st.spinner("正在加载数据..."):
        df = load_and_clean_data(DATA_PATH)

    if df.empty:
        st.error("数据加载失败,请检查数据文件路径和格式。")
        return

    # Sidebar - filters
    st.sidebar.header("🔍 数据筛选")

    # Age range slider; it defaults to the full range present in the data.
    age_min = int(df['age_years'].min())
    age_max = int(df['age_years'].max())
    if age_min < age_max:
        age_range = st.sidebar.slider(
            "选择年龄范围(岁)",
            min_value=age_min,
            max_value=age_max,
            value=(age_min, age_max),
            step=1,
            key=AGE_FILTER_KEY
        )
    else:
        # A single distinct age leaves nothing to select with a slider.
        age_range = (age_min, age_max)

    # Gender filter
    gender_filter = st.sidebar.multiselect(
        "选择性别",
        options=["女性", "男性", "全部"],
        default=["全部"],
        key=GENDER_FILTER_KEY
    )

    # Disease status filter
    cardio_filter = st.sidebar.multiselect(
        "选择心血管疾病状态",
        options=["有心血管疾病", "无心血管疾病", "全部"],
        default=["全部"],
        key=CARDIO_FILTER_KEY
    )

    # Apply filters
    filtered_df = filter_data(df, age_range, gender_filter, cardio_filter)

    # Filter summary
    st.sidebar.markdown("---")
    st.sidebar.info(f"**筛选结果**: {len(filtered_df):,} 条记录")

    if filtered_df.empty:
        # Nothing to plot; avoid "nan" metrics and empty charts.
        st.warning("当前筛选条件下没有数据,请调整筛选条件。")
        return

    # Main page
    # 1. Summary statistics
    st.header("📊 数据摘要")
    display_summary_stats(filtered_df)

    st.markdown("---")

    # 2. Age distribution
    st.header("📈 年龄分布分析")
    chart_col, stats_col = st.columns([3, 1])

    with chart_col:
        st.plotly_chart(
            create_age_distribution_chart(filtered_df),
            use_container_width=True
        )

    with stats_col:
        ages = filtered_df['age_years']
        st.markdown("### 年龄统计")
        st.metric("平均年龄", f"{ages.mean():.1f} 岁")
        st.metric("年龄中位数", f"{ages.median():.1f} 岁")
        st.metric("年龄标准差", f"{ages.std():.1f} 岁")

    st.markdown("---")

    # 3. BMI category analysis
    st.header("⚖️ BMI分类分析")
    chart_col, stats_col = st.columns([3, 1])

    with chart_col:
        st.plotly_chart(
            create_bmi_cardio_chart(filtered_df),
            use_container_width=True
        )

    with stats_col:
        st.markdown("### BMI统计")
        st.metric("平均BMI", f"{filtered_df['BMI'].mean():.1f}")
        st.metric("BMI中位数", f"{filtered_df['BMI'].median():.1f}")

        # Distribution over the BMI categories
        st.markdown("### BMI分类分布")
        total = len(filtered_df)
        for category, count in filtered_df['bmi_category'].value_counts().items():
            percentage = (count / total) * 100
            st.markdown(f"**{category}**: {count:,} ({percentage:.1f}%)")

    st.markdown("---")

    # 4. Data preview
    st.header("🔍 数据预览")

    with st.expander("查看筛选后的数据"):
        st.dataframe(
            filtered_df[
                ['id', 'age_years', 'gender_str', 'height', 'weight', 'BMI',
                 'bmi_category', 'ap_hi', 'ap_lo', 'cholesterol_str',
                 'gluc_str', 'smoke', 'alco', 'active', 'cardio_str']
            ].head(100),
            use_container_width=True
        )

    # 5. Data export
    st.header("📥 数据导出")

    csv_data = convert_df_to_csv(filtered_df)

    download_col, reset_col = st.columns(2)

    with download_col:
        st.download_button(
            label="📥 下载筛选数据 (CSV)",
            data=csv_data,
            file_name="filtered_cardio_data.csv",
            mime="text/csv",
            help="下载当前筛选条件下的数据"
        )

    with reset_col:
        # The callback runs before the rerun triggered by the click, so the
        # widgets are rebuilt with their defaults.
        st.button("🔄 重置筛选", on_click=_reset_filters)

    # Footer; the record count reflects the data actually loaded.
    st.markdown("---")
    st.markdown(
        f"心血管疾病数据分析仪表板 | 数据来源: 心血管疾病.xlsx | 总记录数: {len(df):,}",
        unsafe_allow_html=True
    )


if __name__ == "__main__":
    main()