# -*- coding: utf-8 -*-
"""
CardioAI - Module 1: data visualization dashboard.

Cleans the cardiovascular disease dataset, derives a few features and renders
an interactive Streamlit dashboard. User-facing strings are kept in Chinese
on purpose; only comments and docstrings are in English.
"""

import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st

# ============================================
# Configuration and constants
# ============================================

CODE_ROOT = Path(r"E:\project_ai\claude_project1\aicodes")
DATA_PATH = Path(r"E:\project_ai\claude_project1\data\心血管疾病.xlsx")

# Mappings from the numeric codes in the dataset to display labels.
CHOLESTEROL_MAP = {
    1: "正常",
    2: "高于正常",
    3: "远高于正常",
}

GLUC_MAP = {
    1: "正常",
    2: "高于正常",
    3: "远高于正常",
}

GENDER_MAP = {
    1: "女性",
    2: "男性",
}

CARDIO_MAP = {
    0: "无疾病",
    1: "有疾病",
}

# Display order of the derived BMI categories (lowest to highest).
BMI_ORDER = ["体重过轻", "正常体重", "超重", "肥胖"]

# Display order shared by the cholesterol and glucose level labels.
LEVEL_ORDER = ["正常", "高于正常", "远高于正常"]

# Colours used for the disease status in every chart.
CARDIO_COLORS = {'无疾病': '#2ecc71', '有疾病': '#e74c3c'}


# ============================================
# Data loading and cleaning
# ============================================

@st.cache_data
def _read_dataset():
    """Read the Excel workbook at DATA_PATH.

    Kept separate from the error handling in ``load_data`` so that a failed
    read raises out of the cached function instead of being stored as an
    empty result.
    """
    return pd.read_excel(DATA_PATH, engine='openpyxl')


def load_data():
    """
    Load the cardiovascular disease dataset.

    Returns:
        pd.DataFrame: The raw data, or an empty DataFrame when reading fails
        (the failure is reported to the user through ``st.error``).
    """
    try:
        return _read_dataset()
    except Exception as e:
        # Top-level boundary: report the problem in the UI and let the
        # caller decide how to proceed with an empty frame.
        st.error(f"数据加载失败: {str(e)}")
        return pd.DataFrame()


def _categorize_bmi(bmi):
    """Map a BMI value to its category label; a missing BMI stays missing."""
    # NaN compares False against every threshold, so without this guard a
    # missing BMI would fall through to the last branch and count as obese.
    if pd.isna(bmi):
        return None
    if bmi < 18.5:
        return "体重过轻"
    if bmi < 24:
        return "正常体重"
    if bmi < 28:
        return "超重"
    return "肥胖"


@st.cache_data
def clean_and_process_data(df):
    """
    Clean the raw data and derive the features used by the dashboard.

    Steps:
        1. Convert ``age`` from days to whole years (``age_years``).
        2. Compute ``bmi`` from ``weight`` and ``height``.
        3. Drop rows with implausible blood pressure.
        4. Map the coded columns to descriptive labels.
        5. Derive ``bmi_category`` from ``bmi``.

    Args:
        df (pd.DataFrame): Raw data.

    Returns:
        pd.DataFrame: The processed data. An empty input is returned as is.
    """
    if df.empty:
        return df

    # Rows without an age cannot be converted to an integer number of years,
    # so they are removed up front instead of crashing the cast below.
    df_clean = df.dropna(subset=['age']).copy()

    # 1. Age: days -> years, rounded to the nearest integer.
    df_clean['age_years'] = (df_clean['age'] / 365.25).round().astype(int)

    # 2. BMI: weight / (height / 100) ** 2. A non-positive height becomes
    #    NaN so that the result is a missing BMI rather than inf.
    height_m = df_clean['height'] / 100
    df_clean['bmi'] = df_clean['weight'] / (height_m.where(height_m > 0) ** 2)

    # 3. Blood-pressure sanity filters, applied as one mask:
    #    - ap_lo must be strictly below ap_hi
    #    - ap_hi within [90, 250], ap_lo within [60, 150]
    valid_bp = (
        (df_clean['ap_lo'] < df_clean['ap_hi'])
        & df_clean['ap_hi'].between(90, 250)
        & df_clean['ap_lo'].between(60, 150)
    )
    # Explicit copy so the column assignments below write to an independent
    # frame rather than to a slice of the previous one.
    df_clean = df_clean[valid_bp].copy()

    # 4. Coded columns -> descriptive labels.
    df_clean['cholesterol_desc'] = df_clean['cholesterol'].map(CHOLESTEROL_MAP)
    df_clean['gluc_desc'] = df_clean['gluc'].map(GLUC_MAP)
    df_clean['gender_desc'] = df_clean['gender'].map(GENDER_MAP)
    df_clean['cardio_desc'] = df_clean['cardio'].map(CARDIO_MAP)

    # 5. BMI category.
    df_clean['bmi_category'] = df_clean['bmi'].apply(_categorize_bmi)

    return df_clean


# ============================================
# Filtering
# ============================================

def apply_filters(df, age_range, gender_filter, cardio_filter):
    """
    Filter the processed data by the user's sidebar selections.

    Args:
        df (pd.DataFrame): Processed data.
        age_range (tuple): Inclusive age range ``(min, max)`` in years.
        gender_filter (list): Gender labels to keep.
        cardio_filter (list): Disease-status labels to keep.

    Returns:
        pd.DataFrame: The filtered data.
    """
    if df.empty:
        return df

    min_age, max_age = age_range[0], age_range[1]
    df_filtered = df[df['age_years'].between(min_age, max_age)].copy()

    # An empty selection is treated as "no restriction" for that field.
    if gender_filter:
        df_filtered = df_filtered[df_filtered['gender_desc'].isin(gender_filter)]

    if cardio_filter:
        df_filtered = df_filtered[df_filtered['cardio_desc'].isin(cardio_filter)]

    return df_filtered


# ============================================
# Summary statistics
# ============================================

def calculate_statistics(df):
    """
    Compute the summary figures shown above the charts.

    Args:
        df (pd.DataFrame): Filtered data.

    Returns:
        dict: ``total_records`` (row count) and ``risk_rate`` (percentage of
        rows whose ``cardio`` flag is set).
    """
    if df.empty:
        return {"total_records": 0, "risk_rate": 0.0}

    total_records = len(df)
    disease_count = df['cardio'].sum()
    risk_rate = (disease_count / total_records * 100) if total_records > 0 else 0

    return {
        "total_records": total_records,
        "risk_rate": risk_rate,
    }


# ============================================
# Chart builders
# ============================================

def plot_age_distribution(df):
    """
    Build the age histogram, split by disease status.

    Args:
        df (pd.DataFrame): Data to plot.

    Returns:
        plotly.graph_objects.Figure: The histogram, or ``None`` for empty
        input.
    """
    if df.empty:
        return None

    fig = px.histogram(
        df,
        x='age_years',
        color='cardio_desc',
        nbins=30,
        title='年龄分布 (按心血管疾病状态)',
        labels={'age_years': '年龄 (岁)', 'count': '人数'},
        color_discrete_map=CARDIO_COLORS,
        barmode='overlay',
    )
    fig.update_layout(
        xaxis_title="年龄 (岁)",
        yaxis_title="人数",
        legend_title="疾病状态",
        hovermode='x unified',
    )
    return fig


def plot_bmi_vs_cardio(df):
    """
    Build the stacked bar chart of BMI category versus disease status.

    Args:
        df (pd.DataFrame): Data to plot.

    Returns:
        plotly.graph_objects.Figure: The bar chart, or ``None`` for empty
        input.
    """
    if df.empty:
        return None

    # Number of records per (BMI category, disease status) pair.
    bmi_cardio = (
        df.groupby(['bmi_category', 'cardio_desc'])
        .size()
        .reset_index(name='count')
    )

    # Sort by the logical category order instead of the label text.
    bmi_cardio['bmi_category'] = pd.Categorical(
        bmi_cardio['bmi_category'],
        categories=BMI_ORDER,
        ordered=True,
    )
    bmi_cardio = bmi_cardio.sort_values('bmi_category')

    fig = px.bar(
        bmi_cardio,
        x='bmi_category',
        y='count',
        color='cardio_desc',
        title='BMI分类与心血管疾病关系',
        labels={'bmi_category': 'BMI分类', 'count': '人数'},
        color_discrete_map=CARDIO_COLORS,
        category_orders={'bmi_category': BMI_ORDER},
    )
    fig.update_layout(
        xaxis_title="BMI分类",
        yaxis_title="人数",
        legend_title="疾病状态",
        barmode='stack',
    )
    return fig


def _plot_level_vs_cardio(df, column, title, axis_label):
    """Build a bar chart of a three-level label column versus disease status.

    Shared by the cholesterol and glucose charts so both use LEVEL_ORDER for
    the x axis.
    """
    counts = df.groupby([column, 'cardio_desc']).size().reset_index(name='count')
    counts[column] = pd.Categorical(
        counts[column],
        categories=LEVEL_ORDER,
        ordered=True,
    )
    counts = counts.sort_values(column)

    return px.bar(
        counts,
        x=column,
        y='count',
        color='cardio_desc',
        title=title,
        labels={column: axis_label, 'count': '人数'},
        category_orders={column: LEVEL_ORDER},
        color_discrete_map=CARDIO_COLORS,
    )


# ============================================
# Page sections
# ============================================

def _render_sidebar_filters(processed_data, raw_data):
    """Draw the sidebar widgets and overview; return the three selections."""
    st.sidebar.header("🔍 数据筛选")

    # Age range slider, bounded by the ages present in the data.
    age_min = int(processed_data['age_years'].min())
    age_max = int(processed_data['age_years'].max())
    age_range = st.sidebar.slider(
        "年龄范围 (岁)",
        min_value=age_min,
        max_value=age_max,
        value=(age_min, age_max),
        step=1,
    )

    # Gender multiselect, everything selected by default.
    gender_options = processed_data['gender_desc'].unique().tolist()
    gender_filter = st.sidebar.multiselect(
        "性别",
        options=gender_options,
        default=gender_options,
    )

    # Disease-status multiselect, everything selected by default.
    cardio_options = processed_data['cardio_desc'].unique().tolist()
    cardio_filter = st.sidebar.multiselect(
        "心血管疾病状态",
        options=cardio_options,
        default=cardio_options,
    )

    # Overview: row counts before/after cleaning and the share removed.
    removed_pct = (1 - len(processed_data) / len(raw_data)) * 100
    st.sidebar.markdown("---")
    st.sidebar.markdown("### 📊 数据概览")
    st.sidebar.markdown(f"- 原始记录数: **{len(raw_data):,}**")
    st.sidebar.markdown(f"- 清洗后记录数: **{len(processed_data):,}**")
    st.sidebar.markdown(f"- 数据清洗率: **{removed_pct:.2f}%**")

    return age_range, gender_filter, cardio_filter


def _render_summary_metrics(filtered_data):
    """Draw the record-count and risk-rate metrics for the filtered data."""
    st.subheader("📈 筛选结果统计")

    stats = calculate_statistics(filtered_data)

    col1, col2 = st.columns(2)

    with col1:
        st.metric(
            label="筛选后总记录数",
            value=f"{stats['total_records']:,}",
            delta=None,
        )

    with col2:
        risk_rate = stats['risk_rate']
        # NOTE(review): delta is None here, so delta_color presumably has no
        # visible effect - confirm against the st.metric documentation.
        risk_color = "normal" if risk_rate < 50 else "inverse"
        st.metric(
            label="心血管疾病风险率",
            value=f"{risk_rate:.2f}%",
            delta=None,
            delta_color=risk_color,
        )

    st.markdown("---")


def _render_charts(filtered_data):
    """Draw every chart of the dashboard for the filtered data."""
    st.subheader("📊 数据可视化")

    # Row 1: age distribution and gender share.
    col1, col2 = st.columns(2)

    with col1:
        age_fig = plot_age_distribution(filtered_data)
        if age_fig:
            st.plotly_chart(age_fig, use_container_width=True)

    with col2:
        gender_dist = filtered_data['gender_desc'].value_counts().reset_index()
        gender_dist.columns = ['性别', '人数']
        gender_pie = px.pie(
            gender_dist,
            values='人数',
            names='性别',
            title='性别分布',
            color_discrete_sequence=['#3498db', '#e91e63'],
        )
        st.plotly_chart(gender_pie, use_container_width=True)

    # Row 2: BMI category versus disease status.
    st.markdown("### BMI分类与心血管疾病风险")
    bmi_fig = plot_bmi_vs_cardio(filtered_data)
    if bmi_fig:
        st.plotly_chart(bmi_fig, use_container_width=True)

    # Row 3: cholesterol and glucose levels versus disease status.
    col1, col2 = st.columns(2)

    with col1:
        chol_fig = _plot_level_vs_cardio(
            filtered_data,
            'cholesterol_desc',
            '胆固醇水平与心血管疾病',
            '胆固醇水平',
        )
        st.plotly_chart(chol_fig, use_container_width=True)

    with col2:
        gluc_fig = _plot_level_vs_cardio(
            filtered_data,
            'gluc_desc',
            '血糖水平与心血管疾病',
            '血糖水平',
        )
        st.plotly_chart(gluc_fig, use_container_width=True)


def _render_data_preview(filtered_data):
    """Draw a table with the first 100 filtered rows under display names."""
    st.markdown("---")
    st.subheader("📋 数据预览")

    display_columns = [
        'age_years', 'gender_desc', 'height', 'weight', 'bmi',
        'bmi_category', 'ap_hi', 'ap_lo', 'cholesterol_desc',
        'gluc_desc', 'smoke', 'alco', 'active', 'cardio_desc',
    ]

    column_rename = {
        'age_years': '年龄(岁)',
        'gender_desc': '性别',
        'height': '身高(cm)',
        'weight': '体重(kg)',
        'bmi': 'BMI',
        'bmi_category': 'BMI分类',
        'ap_hi': '收缩压',
        'ap_lo': '舒张压',
        'cholesterol_desc': '胆固醇',
        'gluc_desc': '血糖',
        'smoke': '吸烟',
        'alco': '饮酒',
        'active': '运动',
        'cardio_desc': '心血管疾病',
    }

    if not filtered_data.empty:
        display_df = filtered_data[display_columns].copy()
        display_df = display_df.rename(columns=column_rename)
        st.dataframe(display_df.head(100), use_container_width=True, height=400)


# ============================================
# Main application
# ============================================

def main():
    """Run the Streamlit application."""
    # Page configuration.
    st.set_page_config(
        page_title="CardioAI - 心血管疾病数据分析仪表板",
        page_icon="❤️",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    # Title and description.
    st.title("❤️ CardioAI - 心血管疾病智能辅助系统")
    st.markdown("### Module 1: 数据可视化仪表板")
    st.markdown("---")

    # Load the data; stop the script run when nothing could be read.
    with st.spinner("正在加载数据..."):
        raw_data = load_data()

    if raw_data.empty:
        st.error("无法加载数据,请检查数据路径是否正确。")
        st.stop()

    # Clean the data; stop when no row survives the cleaning.
    processed_data = clean_and_process_data(raw_data)

    if processed_data.empty:
        st.warning("数据清洗后无有效记录,请检查数据质量。")
        st.stop()

    age_range, gender_filter, cardio_filter = _render_sidebar_filters(
        processed_data, raw_data
    )

    filtered_data = apply_filters(
        processed_data,
        age_range,
        gender_filter,
        cardio_filter,
    )

    _render_summary_metrics(filtered_data)
    _render_charts(filtered_data)
    _render_data_preview(filtered_data)


# ============================================
# Entry point
# ============================================

if __name__ == "__main__":
    main()